Example #1
    def run(self):

        base_url = "https://www.yelp.com/search?find_desc=" \
                   "Dry+Cleaners&find_loc=New+York%2C+NY&start="

        for j in range(1, 1000, 10):
            try:
                url = base_url + str(j)
                print '[YelpScraper] :: fetching data from url: ', url
                r = requests.get(url, headers=get_request_headers())

                if not r.status_code == 200:
                    print "Failed to get content of url: %s" % url
                    return

                html_doc = r.content
                soup = BeautifulSoup(html_doc, 'html.parser')

                li_class = "regular-search-result"

                # parsing html content to get information about dry cleaners
                for li in soup.find_all('li', class_=li_class):
                    self.scrap_row_yelp(li)
                    # break  # just use it for testing only
                sleep_scrapper('YelpScraper')
            except Exception as exp:
                print '[YelpScraper] :: run() :: Got exception: %s' % exp
                print(traceback.format_exc())
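
Every snippet in these examples calls two project helpers that are not shown here: get_request_headers() and sleep_scrapper(). The sketch below is only an assumption about what they might look like: browser-like request headers to reduce the chance of being blocked, and a randomized pause between page fetches; the exact header values and delay range are illustrative.

import random
import time


def get_request_headers():
    # Assumed helper: return browser-like headers so target sites are less
    # likely to reject the scraper's requests outright.
    return {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/91.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    }


def sleep_scrapper(scrapper_name):
    # Assumed helper: pause for a random interval between requests so the
    # target site is not hit with back-to-back fetches.
    delay = random.randint(5, 15)
    print("[%s] :: sleeping for %d seconds" % (scrapper_name, delay))
    time.sleep(delay)
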
    def run(self):

        try:
            base_url = 'https://www.flipkart.com/search?as=off&as-show=' \
                       'on&otracker=start&page='
            sufix = '&q=%s&viewType=list' % self.product

            for i in range(1, 100, 1):
                url = base_url + str(i) + sufix
                print '[FlipkartScraper] :: fetching data from url: ', url

                r = requests.get(url, headers=get_request_headers())
                if not r.status_code == 200:
                    print '[FlipkartScraper] :: Failed to get the content ' \
                          'of url: %s' % url
                    return
                html_doc = r.content

                soup = BeautifulSoup(html_doc, 'html.parser')
                # for div in soup.find_all('div', class_='col col-7-12'):
                for div in soup.find_all('div', class_='_1-2Iqu row'):
                    # print '---------------------div', div
                    self.scrap_result_row(div)
                sleep_scrapper('FlipkartScraper')
        except Exception as exp:
            print '[FlipkartScraper] :: run() :: Got exception: %s' % exp
            print(traceback.format_exc())
    def run(self):
        url = ''
        try:
            base_url = 'https://www.overstock.com/Home-Garden/%s/%s/' \
                       % (self.product_category, self.product_code)
            sufix = 'subcat.html?page='
            for j in range(1, 100, 1):
                url = base_url + sufix + str(j)
                print '[OverStockScraper] :: fetching data from url:', url
                r = requests.get(url, headers=get_request_headers())

                if not r.status_code == 200:
                    print "[OverStockScraper] :: Failed to " \
                          "get content of url: %s" % url
                    return

                html_doc = r.content

                soup = BeautifulSoup(html_doc, 'html.parser')

                for div in soup.find_all('div', class_='product-tile'):
                    # print '---------div', div
                    self.scrap_result_row(div)
                    # break
                sleep_scrapper('OverStockScraper')
        except Exception as exp:
            print '[OverStockScraper] :: run() :: Got exception : ' \
                  '%s and fetching data from url: %s' % (exp, url)
            print(traceback.format_exc())
    def run(self):
        try:

            url = 'https://www.bedbathandbeyond.com/store/category' \
                  '/%s/%s/%s/%s/' \
                  % (self.product_category, self.product_subcategory,
                     self.product_title, self.product_code)

            print '[BedBathAndBeyondScraper] :: fetching data from url: ', url
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print "[BedBathAndBeyondScraper] :: Failed to get " \
                        "content of url: %s" % url
                return
            html_doc = r.content

            soup = BeautifulSoup(html_doc, 'html.parser')

            for div in soup.find_all('div',
                                     class_='productContent ec_listing'):
                self.scrap_result_row(div)
            sleep_scrapper('BedBathAndBeyondScraper')
        except Exception as exp:
            print '[BedBathAndBeyondScraper] :: run() :: Got exception: %s'\
                  % exp
            print(traceback.format_exc())
Example #5
    def run(self):

        base_url = 'https://www.homedepot.com/b/' \
                   '%s/N-5yc1vZbm79?Nao=' % (self.product)
        sufix = '&Ns=None'

        for j in range(0, 1000, 12):
            url = ''
            try:
                url = base_url + str(j) + sufix
                print '[HomeDepot] :: fetching data from url: ', url
                r = requests.get(url, headers=get_request_headers())
                if not r.status_code == 200:
                    print "[HomeDepot] :: Failed to get " \
                          "content of url: %s" % url
                    return
                html_doc = r.content

                soup = BeautifulSoup(html_doc, 'html.parser')

                for div in soup.find_all('div', class_='pod-inner'):
                    self.scrap_result_row(div)
                sleep_scrapper('HomeDepot')
            except Exception as exp:
                print '[HomeDepot] :: run() :: Got exception : ' \
                      '%s and fetching data from url: %s' % (exp, url)
    def run(self):

        base_url = 'https://www.indeed.co.in/jobs?q=' \
                   '%s&l=%s&start=' % (self.post, self.location)
        for j in range(0, 1000, 10):
            url = ''
            try:
                url = base_url + str(j)
                print '[IndeedScrapper] :: fetching data from url:', url
                r = requests.get(url, headers=get_request_headers())

                if not r.status_code == 200:
                    print "[IndeedScrapper] :: Failed to " \
                          "get content of url: %s" % url
                    return

                html_doc = r.content

                soup = BeautifulSoup(html_doc, 'html.parser')

                for div in soup.find_all('div'):
                    # skip divs that have no class attribute
                    if 'class' not in div.attrs:
                        continue

                    cls = div.attrs['class']
                    if 'row' in cls and 'result' in cls:
                        self.scrap_result_row(div)
                        # break
                sleep_scrapper('IndeedScraper')
            except Exception as exp:
                print '[IndeedScraper] :: run() :: Got exception : ' \
                      '%s and fetching data from url: %s' % (exp, url)
Example #7
    def scrap_android_developer(self):
        dir_path = os.path.dirname(os.path.realpath(__file__))
        file_data = "%s/%s" % (dir_path, PROGRESS_ANDROID)

        if not os.path.isfile(file_data):
            print("Error: %s file not found" % file_data)
            txt_write(PROGRESS_ANDROID, "10")

        print("file exists: %s ..." % file_data)
        f = open(PROGRESS_ANDROID, "r")
        i = int(f.readline())
        base_url = "https://www.indeed.co.in/jobs?q=android+developer&l=Mohali%2C+Punjab&start="

        for j in range(i, 100, 10):
            try:
                android_url = base_url + str(j)
                self.android.scrap_android_developer(android_url)

                # update scrapping progress in Android progress file
                txt_write(PROGRESS_ANDROID, str(j))

                # sleep scrapper for a while
                sleep_scrapper("Android-Scrapper")
            except Exception as exp:
                print "scrap_android_developer() :: Got exception: %s" % exp
                print(traceback.format_exc())
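
The progress-file pattern above (and in the examples that follow) relies on a txt_write() helper that is not shown. A minimal sketch, assuming it simply overwrites a small text file with the latest offset so that an interrupted run can resume where it left off:

def txt_write(file_name, value):
    # Assumed helper: persist the current page/offset for resuming later.
    with open(file_name, "w") as f:
        f.write(value)
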
    def scrap_yellowpages_dry_cleaner(self):
        """
        Scrape dry cleaner data from Yellow Pages
        """

        dir_path = os.path.dirname(os.path.realpath(__file__))
        file_data = "%s/%s" % (dir_path, PROGRESS_YELLOWPAGES_FILE)

        # if progress file is not found,
        # create a new one with default value 1
        if not os.path.isfile(file_data):
            logging.error("Error: %s file not found" % file_data)
            print("Error: %s file not found" % file_data)
            txt_write(PROGRESS_YELLOWPAGES_FILE, "1")

        logging.info("file exists: %s ..." % file_data)
        print("file exists: %s ..." % file_data)
        f = open(PROGRESS_YELLOWPAGES_FILE, "r")
        i = int(f.readline())

        logging.info(
            "\n\nYellowPages-Dry-Cleaner-Scrapper started at i: %d \n\n" % i
        )
        print("\n\nYellowPages-Dry-Cleaner-Scrapper started at i: %d \n\n" % i)

        base_url = "https://www.yellowpages.com/" \
                   "new-york-ny/dry-cleaners-laundries?page="

        for j in range(i, 86, 1):
            try:
                yellowpages_url = base_url + str(j)
                self.yellowpages.scrap_yellowpages(yellowpages_url,
                                                   ST_DRY_CLEANERS)

                # progress % of yellowpages_dry_cleaner scrapping
                percentage = float(j * 100 / 86)
                logging.info(
                    "\n\nYellowPages-Dry-Cleaner-Scrapper %f "
                    "percent completed" % percentage
                )
                print(
                    "\n\nYellowPages-Dry-Cleaner-Scrapper %f "
                    "percent completed" % percentage
                )

                # update scrapping progress in
                # yellowpages_dry_cleaner progress file
                txt_write(PROGRESS_YELLOWPAGES_FILE, str(j))

                # sleep scrapper for a while
                sleep_scrapper("YellowPages-Dry-Cleaner-Scrapper")
            except Exception as exp:
                logging.error(
                    "scrap_yellowpages_dry_cleaner() :: "
                    "Got Exception : %s" % exp
                )
                logging.error(
                    traceback.format_exc()
                )
    def scrap_yelp_dry_cleaners(self):
        """
        Scrape dry cleaner data from Yelp
        """

        dir_path = os.path.dirname(os.path.realpath(__file__))
        file_data = "%s/%s" % (dir_path, PROGRESS_YELP_FILE)

        # if progress file is not found,
        # create a new one with default value 10
        if not os.path.isfile(file_data):
            logging.error("Error: %s file not found" % file_data)
            print("Error: %s file not found" % file_data)
            txt_write(PROGRESS_YELP_FILE, "10")

        logging.info("file exists: %s ..." % file_data)
        print("file exists: %s ..." % file_data)
        f = open(PROGRESS_YELP_FILE, "r")
        i = int(f.readline())

        logging.info(
            "\n\nYelp-Dry-Cleaner-Scrapper started at i: %d \n\n" % i
        )
        print("\n\nYelp-Dry-Cleaner-Scrapper started at i: %d \n\n" % i)

        base_url = "https://www.yelp.com/search?find_desc" \
                   "=Dry+Cleaners&find_loc=New+York%2C+NY&start="

        for j in range(i, 1000, 10):
            try:
                yelp_url = base_url + str(j)
                self.yelp.scrap_yelp(yelp_url, ST_DRY_CLEANERS)

                # progress % of yelp_dry_cleaner scrapping
                percentage = float(j * 100 / 1000)
                logging.info(
                    "\n\nYelp-Dry-Cleaner-Scrapper %f "
                    "percent completed" % percentage
                )
                print(
                    "\n\nYelp-Dry-Cleaner-Scrapper %f "
                    "percent completed" % percentage
                )
                # update scrapping progress in yelp_dry_cleaner progress file
                txt_write(PROGRESS_YELP_FILE, str(j))

                # sleep scrapper for a while
                sleep_scrapper("Yelp-Dry-Cleaner-Scrapper")
            except Exception as exp:
                logging.error(
                    "scrap_yelp_dry_cleaners() :: Got Exception : %s" % exp
                )
                logging.error(
                    traceback.format_exc()
                )
Example #10
    def scrap_yellowpages_shoe_repair(self):
        """
        Scrape shoe repair data from Yellow Pages
        """

        dir_path = os.path.dirname(os.path.realpath(__file__))
        file_data = "%s/%s" % (
            dir_path,
            PROGRESS_YELLOWPAGES_SHOE_REPAIR_FILE
        )

        if not os.path.isfile(file_data):
            logging.error("Error: %s file not found" % file_data)
            txt_write(PROGRESS_YELLOWPAGES_SHOE_REPAIR_FILE, "1")

        logging.info("file exists: %s ..." % file_data)
        f = open(PROGRESS_YELLOWPAGES_SHOE_REPAIR_FILE, "r")
        i = int(f.readline())

        logging.info(
            "\n\nYellowpages-Shoe-Repair-Scrapper started at i: %d \n\n" % i
        )

        base_url = "https://www.yellowpages.com/search?" \
                   "search_terms=Shoe%20Repair" \
                   "&geo_location_terms=New%20York%2C%20NY&page="

        for j in range(i, 30, 1):
            try:
                yellowpages_url = base_url + str(j)
                self.yellowpages.scrap_yellowpages(yellowpages_url,
                                                   ST_SHOE_REPAIR)

                # progress of yellowpages_shoe_repair scrapping %
                percentage = float(j * 100 / 30)

                logging.info(
                    "\n\nYellowPages-Shoe-Repair-Scrapper %f percent "
                    "completed" % percentage
                )

                # update scrapping progress in
                # yellowpages_shoe_repair progress file
                txt_write(PROGRESS_YELLOWPAGES_SHOE_REPAIR_FILE, str(j))

                # sleep scrapper for a while
                sleep_scrapper("YellowPages-Shoe-Repair-Scrapper")
            except Exception as exp:
                logging.error(
                    "scrap_yellowpages_shoe_repair() :: "
                    "Got Exception : %s" % exp
                )
                logging.error(traceback.format_exc())
    def run(self):
        try:

            url = 'https://www.countryliving.com/food-drinks/' #input("Enter url to be scrapped: ")
            # url1 = url.split('/')
            # print(url1)
            # #url2 = url1.split('/').pop()
            # url2 = url1[3]
            
            

            print ('[AnyWebsiteScraper] :: fetching data from url: ', url)
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print ("[AnyWebsiteScraper] :: Failed to get " \
                        "content of url: %s" % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            div22 = soup.find('div',class_='site-content')
            div11= soup.find('div',class_='feed feed-grid')
            for div33 in div11.find_all('div',class_='simple-item'):
                self.scrap_result_row(div33)
            sleep_scrapper('AnyWebsiteScraper')
            # infinite scrolling logic start
            # infinite scroll URL (input here)
            base_url = 'https://www.countryliving.com/ajax/infiniteload/?id=34aae02d-c035-47e5-95c5-b87ba30c1dd8&class=CoreModels%5Csections%5CSectionModel&viewset=section&cachebuster=&page='

            for i in range(2, 100, 1):
                url = base_url + str(i)
                
                print ('[AnyWebsiteScraper] :: fetching data from url: ', url)
                r = requests.get(url, headers=get_request_headers())
                if not r.status_code == 200:
                    print ("[AnyWebsiteScraper] :: Failed to get " \
                        "content of url: %s" % url)
                    return
                html_doc = r.content
                soup = BeautifulSoup(html_doc, 'html.parser')
                div22 = soup.find('div',class_='site-content')
                div11= soup.find('div',class_='feed feed-grid')
                for div33 in div11.find_all('div',class_='simple-item'):
                    self.scrap_result_row(div33)
                sleep_scrapper('AnyWebsiteScraper')

            # infinite scroll logic ends
        except Exception as exp:
            print ('[AnyWebsiteScraper] :: run() :: Got exception: %s'\
                  % exp)
            print(traceback.format_exc())        
Example #12
    def scrap_cragilist_shoe_repair(self):
        """
        Scrape shoe repair data from Craigslist
        """

        dir_path = os.path.dirname(os.path.realpath(__file__))
        file_data = "%s/%s" % (dir_path, PROGRESS_CRAIGLIST_SHOE_REPAIR_FILE)

        # if progress file is not found,
        # create a new one with default value 1
        if not os.path.isfile(file_data):
            logging.error("Error: %s file is not found" % file_data)
            txt_write(PROGRESS_CRAIGLIST_SHOE_REPAIR_FILE, "1")

        logging.info("file exists: %s ..." % file_data)
        f = open(PROGRESS_CRAIGLIST_SHOE_REPAIR_FILE, "r")
        i = int(f.readline())

        logging.info(
            "\n\nCraiglist-Shoe-Repair-Scrapper started at i: %d \n\n" % i
        )

        base_url = "https://newyork.craigslist.org/search" \
                   "/sss?query=shoe+repair&sort=rel"

        for j in range(i, 90, 1):
            try:
                craiglist_url = base_url + str(j)
                self.craiglist.scrap_craiglist(craiglist_url, ST_SHOE_REPAIR)

                # progress of craiglist_shoe_repair scrapping %
                percentage = float(j * 100 / 90)
                logging.info(
                    "\n\nCraiglist-Shoe-Repair-Scrapper %f "
                    "percent completed" % percentage
                )

                # update scrapping progress in
                # craiglist_shoe_repair progress file
                txt_write(PROGRESS_CRAIGLIST_SHOE_REPAIR_FILE, str(j))

                # sleep scrapper for a while
                sleep_scrapper("Craiglist-Shoe-Repair-Scrapper")
            except Exception as exp:
                logging.error(
                    "scrap_cragilist_shoe_repair() :: "
                    "Got Exception : %s" % exp
                )
                logging.error(
                    traceback.format_exc()
                )
Example #13
    def scrap_groupon_shoe_repair(self):
        """
        Scrape shoe repair data from Groupon
        """

        dir_path = os.path.dirname(os.path.realpath(__file__))
        file_data = "%s/%s" % (dir_path, PROGRESS_GROUPON_SHOE_REPAIR_FILE)

        # if progress file is not found,
        # create a new one with default value 1
        if not os.path.isfile(file_data):
            logging.error("Error: %s file not found" % file_data)
            txt_write(PROGRESS_GROUPON_SHOE_REPAIR_FILE, "1")

        logging.info("file exists: %s ..." % file_data)
        f = open(PROGRESS_GROUPON_SHOE_REPAIR_FILE, "r")
        i = int(f.readline())

        logging.info(
            "\n\nGroupon-Shoe-Repair-Scrapper started at i: %d \n\n" % i
        )

        base_url = "https://www.groupon.com/browse/chicago?" \
                   "lat=41.8795&lng=-87.6243&address=Chicago&query=" \
                   "new+york+shoe+repair&locale=en_US&page="

        for j in range(i, 400, 1):
            try:
                groupon_url = base_url + str(j)
                self.groupon.scrap_groupon(groupon_url, ST_SHOE_REPAIR)

                # progress of groupon_shoe_repair scrapping %
                percentage = float(j * 100 / 400)
                logging.info(
                    "\n\nGroupon-Shoe-Repair-Scrapper %f percent "
                    "completed" % percentage
                )

                # update scrapping progress in
                # groupon_shoe_repair progress file
                txt_write(PROGRESS_GROUPON_SHOE_REPAIR_FILE, str(j))

                # sleep scrapper for a while
                sleep_scrapper("Groupon-Shoe-Repair-Scrapper")
            except Exception as exp:
                logging.error(
                    "scrap_groupon_shoe_repair() :: Got Exception : %s" % exp
                )
                logging.error(traceback.format_exc())
Example #14
    def srap_yelp_shoe_repair(self):
        """
        Scrape shoe repair data from Yelp
        """

        dir_path = os.path.dirname(os.path.realpath(__file__))
        file_data = "%s/%s" % (dir_path, PROGRESS_YELP_SHOE_REPAIR_FILE)

        if not os.path.isfile(file_data):
            logging.error("Error: %s file not found" % file_data)
            txt_write(PROGRESS_YELP_SHOE_REPAIR_FILE, "1")

        logging.info("file exists: %s ..." % file_data)
        f = open(PROGRESS_YELP_SHOE_REPAIR_FILE, "r")
        i = int(f.readline())

        logging.info(
            "\n\nYelp-Shoe-Repair-Scrapper started at i: %d \n\n" % i
        )

        base_url = "https://www.yelp.com/search?find_desc" \
                   "=Shoe+repair&find_loc=New+York,+NY&start="

        for j in range(i, 820, 10):
            try:
                yelp_url = base_url + str(j)
                self.yelp.scrap_yelp(yelp_url, ST_SHOE_REPAIR)

                # progress of yelp_shoe_repair scrapping %
                percentage = float(j * 100 / 820)
                logging.info(
                    "\n\nYelp-Shoe-Repair-Scrapper %f percent "
                    "completed" % percentage
                )

                # update scrapping progress in yelp_shoe_repair progress file
                txt_write(PROGRESS_YELP_SHOE_REPAIR_FILE, str(j))

                # sleep scrapper for a while
                sleep_scrapper("Yelp-Shoe-Repair-Scrapper")
            except Exception as exp:
                logging.error(
                    "srap_yelp_shoe_repair() :: Got exception: %s" % exp
                )
                logging.error(traceback.format_exc())
    def run(self):
        base_url = 'https://www.yellowpages.com/search?search_terms=software+company&geo_location_terms=New+York%2C+NY&page='

        for j in range(27, 100, 1):
            try:
                url = base_url + str(j)
                print ('[YellowPagesScraper] :: fetching data from url: ', url)

                r = requests.get(url, headers=get_request_headers())
                if not r.status_code == 200:
                    print ('[YellowPagesScraper] :: Failed to get the content ' \
                          'of url: %s' % url)
                    return
                html_doc = r.content

                soup = BeautifulSoup(html_doc, 'html.parser')
                for div in soup.find_all('div', class_='info'):
                    self.scrap_result_row(div)
                sleep_scrapper('YellowPagesScraper')
            except Exception as exp:
                print ('[YellowPagesScraper] :: run() :: Got exception: %s' % exp)
                print(traceback.format_exc())
    def run(self):
        try:
            base_url = 'https://inc42.com/buzz/'
            r = requests.get(base_url, headers=get_request_headers())
            print('[IncScrapper] :: fetching data from url: ', base_url)
            if not r.status_code == 200:
                print ("[IncScrapper] :: Failed to get " \
                    "content of url: %s" % base_url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            div1 = soup.find('div', class_="site-content")
            inc_news = div1.find_all("div",
                                     {"class": "card-wrapper horizontal-card"})
            for news in inc_news:
                self.scrape_home(news)
            sleep_scrapper('Inc42scrapper')
            # next pages data
            for i in range(2, 100, 1):
                page = 'page'
                url = base_url + page + str(i)
                r = requests.get(url, headers=get_request_headers())
                print('[IncScrapper] :: fetching data from url: ', url)
                if not r.status_code == 200:
                    print ("[IncScrapper] :: Failed to get " \
                        "content of url: %s" % url)
                    return
                html_doc = r.content
                soup = BeautifulSoup(html_doc, 'html.parser')
                div1 = soup.find('div', class_="site-content")
                inc_news = div1.find_all(
                    "div", {"class": "card-wrapper horizontal-card"})
                for news in inc_news:
                    self.scrape_home(news)
                sleep_scrapper('Inc42scrapper')
        except Exception as exp:
            print ('[IncScrapper] :: run() :: Got exception while fetching '
                   'data from the Inc42 homepage: %s' % exp)
            print(traceback.format_exc())
Example #17
    def run(self):

        base_url = "https://www.yelp.com/search?find_desc=%s&find_loc=%s,+NY&start=" % (
            self.product, self.location)

        for j in range(1, 1000, 10):
            try:
                url = base_url + str(j)
                print '[YelpScraper] :: fetching data from url: ', url
                r = requests.get(url, headers=get_request_headers())

                if not r.status_code == 200:
                    print '[YelpScraper] :: Failed to get content of url: %s' % url
                    return

                html_doc = r.content
                soup = BeautifulSoup(html_doc, 'html.parser')

                for li in soup.find_all('li', class_='regular-search-result'):
                    self.scrap_row_yelp(li)
                sleep_scrapper('YelpScraper')
            except Exception as exp:
                print '[YelpScraper] :: run() :: Got exception: %s' % exp
                print(traceback.format_exc())
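
Example #17's run() reads self.product and self.location, which implies a constructor that stores them. A hypothetical usage sketch follows; the class name, constructor signature, and the plain-class (non-threaded) design are assumptions, not confirmed by the source.

class YelpScraper(object):
    def __init__(self, product, location):
        # assumed constructor matching the attributes used in run()
        self.product = product
        self.location = location

    def run(self):
        # the real method body is shown in Example #17; this stub only
        # demonstrates how the attributes feed into the search URL
        base_url = "https://www.yelp.com/search?find_desc=%s&find_loc=%s,+NY&start=" % (
            self.product, self.location)
        print(base_url)


if __name__ == '__main__':
    YelpScraper('Dry Cleaners', 'New York').run()
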
Example #18
    def run(self):
        try:

            url = 'https://news.google.com/news/headlines/section/topic' \
                  '/NATION.en_in/India?ned=in&hl=en-IN&gl=IN'

            print '[GoogleNewsScraper] :: fetching data from url: ', url
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print "[GoogleNewsScraper] :: Failed to get " \
                        "content of url: %s" % url
                return
            html_doc = r.content

            soup = BeautifulSoup(html_doc, 'html.parser')
            # print '------soup', soup
            for div in soup.find_all('div', class_='v4IxVd'):
                # print '-----div', div
                self.scrap_result_row(div)
            sleep_scrapper('GoogleNewsScraper')
        except Exception as exp:
            print '[GoogleNewsScraper] :: run() :: Got exception: %s'\
                  % exp
            print(traceback.format_exc())
    def run(self):
        try:

            options = Options()
            options.add_argument("window-size=1400,600")
            ua = UserAgent()
            user_agent = ua.random

            options.add_argument(f'user-agent={user_agent}')
            driver = webdriver.Chrome(
                "C:/Users/Dell/Downloads/chromedriver_win32/chromedriver.exe",
                options=options)
            html = driver.page_source

            for i in range(0, 60, 10):
                suffix = '&q=%s' % self.keyword
                url = 'https://www.google.com/search?client=firefox-b-d&biw=1366&bih=654&sa=N&ved=0ahUKEwjfy8ugnZHkAhVLro8KHXq1Ar0Q8tMDCJMC&ei=zslbXd-sHcvcvgT66oroCw&start=' + str(
                    i) + suffix
                driver.get(url)
                html = driver.page_source

                # scrap_data(website_link,website_title,website_snippet)
                soup = BeautifulSoup(html, 'html.parser')
                print('...........soupppppppppp', soup.encode('utf-8'))
                for div in soup.find_all('div', class_='g'):
                    print('---------------------div', div)
                    self.scrap_result_row(div)
                time.sleep(15)
                sleep_scrapper('GoogleSearchListingsScraper')

        except Exception as exp:
            print(
                '[GoogleSearchListingsScraper] :: run() :: Got exception: %s' %
                exp)
            print(traceback.format_exc())
Example #20
    def run(self):

        for j in range(0, 2, 1):
            try:
                # url = base_url + str(j) + sufix
                url = 'https://www.samsclub.com/sams/coffee-tea-cocoa/1493.cp?xid=cat_sub&navAction=jump'
                print '[Samsclub] :: fetching data from url: ', url
                r = requests.get(url, headers=get_request_headers())

                if not r.status_code == 200:
                    print "[Samsclub] :: Failed to get content of url: %s" % url
                    return

                html_doc = r.content

                soup = BeautifulSoup(html_doc, 'html.parser')

                # parsing html content to get information about the listed products
                for div in soup.find_all('div', class_='products-card'):
                    self.scrap_result_row(div)
                sleep_scrapper('Samsclub')

            except Exception as exp:
                print '[Samsclub] :: run() :: Got exception : %s' % exp
Example #21
    def scrap_craiglist_dry_cleaner(self):
        """
        Scrape dry cleaner data from Craigslist
        :return:
        """

        dir_path = os.path.dirname(os.path.realpath(__file__))
        file_data = "%s/%s" % (dir_path, PROGRESS_CRAIGLIST_FILE)

        # if progress file is not found,
        # create a new one with default value 1
        if not os.path.isfile(file_data):
            logging.error("Error: %s file not found" % file_data)
            print("Error: %s file not found" % file_data)
            txt_write(PROGRESS_CRAIGLIST_FILE, "1")

        logging.info("file exists: %s ..." % file_data)
        print("file exists: %s ..." % file_data)
        f = open(PROGRESS_CRAIGLIST_FILE, "r")
        i = int(f.readline())

        logging.info(
            "\n\nCraiglist-Dry-Cleaner started at i: %d \n\n" % i
        )
        print(
            "\n\nCraiglist-Dry-Cleaner started at i: %d \n\n" % i
        )

        base_url = "https://newyork.craigslist.org/" \
                   "search/sss?query=dry%20cleaner&sort=rel"

        for j in range(i, 89, 1):
            try:
                craiglist_url = base_url + str(j)
                self.craiglist.scrap_craiglist(craiglist_url, ST_DRY_CLEANERS)

                # progress % of craiglist_dry_cleaner scrapping
                percentage = float(j * 100 / 89)
                logging.info(
                    "\n\nCraiglist-Dry-Cleaner-Scrapper %f "
                    "percent completed" % percentage
                )
                print(
                    "\n\nCraiglist-Dry-Cleaner-Scrapper %f "
                    "percent completed" % percentage
                )

                # update scrapping progress in
                # craiglist_dry_cleaner progress file
                txt_write(PROGRESS_CRAIGLIST_FILE, str(j))

                # sleep scrapper for a while
                sleep_scrapper("Craiglist-Dry-Cleaner-Scrapper")
            except Exception as exp:
                logging.error(
                    "scrap_craiglist_dry_cleaner() :: "
                    "Got Exception : %s" % exp
                )
                logging.error(
                    traceback.format_exc()
                )
Example #22
    def scrap_groupon_dry_cleaner(self):
        """
        Scrape dry cleaner data from Groupon
        """

        dir_path = os.path.dirname(os.path.realpath(__file__))
        file_data = "%s/%s" % (dir_path, PROGRESS_GROUPON_FILE)

        # if progress file is not found,
        # create a new one with default value 1
        if not os.path.isfile(file_data):
            logging.error("Error: %s file not found" % file_data)
            print(
                "Error: %s file not found" % file_data
            )
            txt_write(PROGRESS_GROUPON_FILE, "1")

        logging.info("file exists: %s ..." % file_data)
        print("file exists: %s ..." % file_data)
        f = open(PROGRESS_GROUPON_FILE, "r")
        i = int(f.readline())

        logging.info(
            "\n\nGroupon-Dry-Cleaner-Scrapper started at i: %d \n\n" % i
        )
        print(
            "\n\nGroupon-Dry-Cleaner-Scrapper started at i: %d \n\n" % i
        )

        base_url = "https://www.groupon.com/browse/chicago?" \
                   "lat=41.8795&lng=-87.6243&address=Chicago&query=" \
                   "dry+cleaners&locale=en_US&page="

        for j in range(i, 16, 1):
            try:
                groupon_url = base_url + str(j)
                self.groupon.scrap_groupon(groupon_url, ST_DRY_CLEANERS)

                # progress % of groupon_dry_cleaner scrapping
                percentage = float(j * 100 / 15)
                logging.info(
                    "\n\nGroupon-Dry-Cleaner-Scrapper %f "
                    "percent completed" % percentage
                )
                print(
                    "\n\nGroupon-Dry-Cleaner-Scrapper %f "
                    "percent completed" % percentage
                )

                # update scrapping progress in
                # groupon_dry_cleaner progress file
                txt_write(PROGRESS_GROUPON_FILE, str(j))

                # sleep scrapper for a while
                sleep_scrapper("Groupon-Dry-Cleaner-Scrapper")
            except Exception as exp:
                logging.error(
                    "scrap_groupon_dry_cleaner() :: Got Exception : %s" % exp
                )
                logging.error(
                    traceback.format_exc()
                )
    def run(self):
        try:
            #Teams news scrapping 'https://www.espn.in/cricket/team/_/id/1/england',
            #   'https://www.espn.in/cricket/team/_/id/2/australia',
            #   'https://www.espn.in/cricket/team/_/id/3/south-africa',
            #   'https://www.espn.in/cricket/team/_/id/4/west-indies',
            #   'https://www.espn.in/cricket/team/_/id/5/new-zealand',
            #   'https://www.espn.in/cricket/team/_/id/6/india',
            #   'https://www.espn.in/cricket/team/_/id/7/pakistan',
            #   'https://www.espn.in/cricket/team/_/id/8/sri-lanka',
            #   'https://www.espn.in/cricket/team/_/id/9/zimbabwe',
            #   'https://www.espn.in/cricket/team/_/id/25/bangladesh/',
            cricketTeams_URLs = [
                'http://www.espn.in/cricket/team/_/id/1/england',
                'http://www.espn.in/cricket/team/_/id/2/australia',
                'http://www.espn.in/cricket/team/_/id/3/south-africa',
                'http://www.espn.in/cricket/team/_/id/4/west-indies',
                'http://www.espn.in/cricket/team/_/id/5/new-zealand',
                'http://www.espn.in/cricket/team/_/id/6/india',
                'http://www.espn.in/cricket/team/_/id/7/pakistan',
                'http://www.espn.in/cricket/team/_/id/8/sri-lanka',
                'http://www.espn.in/cricket/team/_/id/9/zimbabwe',
                'http://www.espn.in/cricket/team/_/id/25/bangladesh/'
            ]

            for url in cricketTeams_URLs:
                print("PART_________________________________A")
                print('[ESPNScrapper] :: fetching data from TEAMS url: ', url)
                r = requests.get(url, headers=get_request_headers())
                if not r.status_code == 200:
                    print ("[ESPNScrapper] :: Failed to get " \
                        "content of url: %s" % url)
                    return
                html_doc = r.content
                soup = BeautifulSoup(html_doc, 'html.parser')
                sect1 = soup.find('section', id='pane-main')
                sect11 = sect1.find('section', id='main-container')
                sect2 = sect11.find('section', id='news-feed')
                news_feed_list = sect2.find('div', class_='container')
                section_news = news_feed_list.find_all(
                    "article",
                    {"class": "news-feed-item news-feed-story-package"})
                Headlines = news_feed_list.find_all("article", {
                    "class":
                    "news-feed-item news-feed-story-package is-headline"
                })
                team_parsed_urls = []
                for news in Headlines:
                    print("PART______________________________A.A")
                    news_ = news.find("a", {"class": "story-link"})
                    if news_:
                        news_url = (news_)['data-popup-href']
                        #self.scrape_post_content(news_url)
                        team_parsed_urls.append(news_url)

                for news in section_news:
                    print("PART______________________________A.B")
                    print(
                        'You are now scraping regular news from the webpage')
                    news_ = news.find("a", {"class": "story-link"})
                    if news_:
                        news_url = (news_)['data-popup-href']
                        print('[ESPNScrapper] :: section news URL: ', news_url)
                        #self.scrape_post_content(news_url)
                        team_parsed_urls.append(news_url)

                infnite_scroll_url = 'https://secure.espn.com/core/minifeed?render=true&partial=newsfeed&limit=20&xhr=1&template=clubhouse&headlinestack=true&site=espn&lang=en&region=in&sport=cricket&pubkey=cricket-clubhouse&insider=false&device=desktop&country=in&lang=en&region=in&site=espn&edition-host=espn.in&site-type=full&userab=0&offset='
                ur_l = url.split('/')
                print('url to be scrolled infinite', ur_l)
                team = '&team=' + ur_l[7]
                for i in range(0, 100, 25):
                    scroll_url = infnite_scroll_url + str(i) + team
                    print(
                        "PART____________AUTO-SCROLL______________________A.C")
                    print(
                        '[ESPNScrapper] :: fetching data from infinite-url: ',
                        scroll_url)
                    try:
                        raw_json = requests.get(scroll_url).text
                        data = json.loads(raw_json)
                        qw = (data['content']['html']['items'][0]['html'])
                        for data in data['content']['html']['items']:
                            qw = data['html']
                            try:

                                qwe = json.dumps(qw)
                                soup = BeautifulSoup(qwe, 'html.parser')
                                section_ = soup.find("a")['data-popup-href']
                                print(section_)
                                sect = section_.replace('\\"', '')
                                if re.search('clip', sect):
                                    print(
                                        "NEWS only contains video, no text , no image , so skipping this News"
                                    )
                                else:
                                    team_parsed_urls.append(sect)
                                    self.scrape_post_content(sect)

                            except Exception as exp:
                                print ('[ESPNscrapper] :: run() :: Got exception while parsing scroll item: %s'\
                                    % exp)
                                print(traceback.format_exc())

                    except Exception as exp:
                        print ('[ESPNscrapper] :: run() :: Got exception while fetching data from scroll url: %s'\
                            % exp)
                        print(traceback.format_exc())
                    sleep_scrapper('ESPNScrapper')
                sleep_scrapper('ESPNScrapper')

                base_url = 'https://www.espn.in/cricket/'
                parent_fol = base_url.split('/')
                print(parent_fol)
                print("PART_______________________________B")
                print('[ESPNScrapper] :: fetching data from BASE Url: ',
                      base_url)
                ree = requests.get(base_url, headers=get_request_headers())
                if not ree.status_code == 200:
                    print ("[ESPNScrapper] :: Failed to get " \
                        "content of url: %s" % base_url)
                    return
                html_doc = ree.content
                soup = BeautifulSoup(html_doc, 'html.parser')
                col2_feed = soup.find('section', class_='col-two contentFeed')
                contentf = col2_feed.find_all("section",
                                              {"class": "contentItem"})
                for content in contentf:
                    print('contentfeedsections')
                    self.scrape_sports(content, cricketTeams_URLs)
                contentf = col2_feed.find_all("article",
                                              {"class": "contentItem"})
                for contentfeed in contentf:
                    print('contentfeedARTICLES')
                    self.scrape_sports(contentfeed, cricketTeams_URLs)
                    #scrape only 26 posts of the starting webpage

                #Infinite loading URL
                infinite_url = 'https://onefeed.fan.api.espn.com/apis/v3/cached/contentEngine/oneFeed/leagues/cricket?source=ESPN.com%2B-%2BFAM&showfc=true&region=in&limit=15&lang=en&authorizedNetworks=espn_free&editionKey=espnin-en&device=desktop&pubkey=espncricinfo-en-in-cricket-index&isPremium=true&locale=in&featureFlags=expandAthlete&featureFlags=mmaGB&offset='
                #for 10 times scrolling
                for i in range(10, 100, 15):
                    scroll_url = infinite_url + str(i)
                    print(
                        '[ESPNScrapper] :: fetching data from infinite-url: ',
                        scroll_url)

                    r = requests.get(scroll_url, headers=get_request_headers())
                    try:
                        raw_json = r.text
                        dataa = json.loads(raw_json)
                        for data in dataa['feed']:
                            qw = (data['data']['now'][0])
                            try:
                                keys = sorted(qw.items())
                                result = [(key, value) for key, value in keys
                                          if key.startswith("links")]
                                result11 = result[0]
                                result33 = list(result11)
                                reo = result33[1]
                                scroll_news_url = reo['web']['href']
                                print('FETCHING post from scroll URL')
                                self.scrape_post_content(scroll_news_url)

                            except Exception as exp:
                                print ('[ESPNscrapper] :: run() :: Got exception while parsing feed item: %s'\
                                    % exp)
                                print(traceback.format_exc())

                    except Exception as exp:
                        print ('[ESPNscrapper] :: run() :: Got exception while fetching data from scroll url: %s'\
                            % exp)
                        print(traceback.format_exc())

                sleep_scrapper('ESPNScrapper')

        except Exception as exp:
            print ('[ESPNscrapper] :: run() :: Got exception: %s'\
                  % exp)
            print(traceback.format_exc())
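
The ESPN example interleaves three steps: parsing the static team pages, walking AJAX "infinite scroll" endpoints whose JSON items embed HTML fragments, and handing each discovered story URL to scrape_post_content(). The sketch below isolates just the scroll step, reusing the field names the example reads (content -> html -> items -> html, plus the data-popup-href attribute); the function name and the idea of returning the collected links are assumptions.

import json

import requests
from bs4 import BeautifulSoup


def fetch_scroll_links(endpoint, offset):
    # Assumed helper: fetch one batch of the infinite-scroll feed and pull
    # story links out of the embedded HTML fragments.
    raw_json = requests.get(endpoint + str(offset)).text
    data = json.loads(raw_json)
    links = []
    for item in data['content']['html']['items']:
        fragment = BeautifulSoup(item['html'], 'html.parser')
        anchor = fragment.find('a')
        if anchor and anchor.has_attr('data-popup-href'):
            links.append(anchor['data-popup-href'])
    return links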