import random
import re

from bs4 import BeautifulSoup

# base_util, settings, Logger, and MongoDB are project-local modules assumed
# importable from the surrounding package.


def scrape_link_and_child(self, parent_url):
    parent_url = base_util.replace_dot_url(parent_url)
    webpage_body, parent_url = self.scrape(base_util.unreplace_dot_url(parent_url), None)

    # exit if we failed to scrape the website
    if webpage_body is None:
        return

    Logger.debug('Saving Parent')
    MongoDB.save_page(url=parent_url, page=webpage_body)
    Logger.info('Completed page: ' + parent_url)

    # now grab the children of this webpage
    all_ahref = [base_util.combine_parent_rel_link(parent_url, a.attrs['href'])
                 for a in BeautifulSoup(webpage_body, 'html.parser', from_encoding='utf-8').find_all('a')
                 if 'href' in a.attrs]

    child_urls = random.sample(all_ahref, settings.GET_X_CHILD) \
        if len(all_ahref) >= settings.GET_X_CHILD else all_ahref

    # get rid of bad normalization
    if not re.match('^www[.].*$', parent_url):
        Logger.info('Updating bad url for {}'.format(parent_url))
        MongoDB.update_url(base_util.normalize_url(parent_url), parent_url)

    if len(child_urls) > 0:
        # scrape the children; child_urls is a subset of all_ahref
        for child_url in child_urls:
            Logger.debug('Get Child {}'.format(child_url))

            # scrape() is assumed to return a (body, resolved_url) pair, as in
            # the parent call above, so unpack it here as well
            child_page, child_url = self.scrape(child_url, parent_url)

            if child_page is None:
                # fall back to other links on the page until one scrapes successfully
                exploredset = set()
                tries = 0
                for url in set(all_ahref) - exploredset:
                    if tries == settings.MAX_RETRIES:
                        Logger.info('Max retry count exceeded')
                        break
                    Logger.info('trying new url: ' + url)
                    child_page, child_url = self.scrape(url, parent_url)
                    if child_page is not None:
                        break
                    exploredset.add(url)
                    tries += 1

            if child_page is not None:
                Logger.debug('Saving Child {}'.format(child_url))
                MongoDB.save_modify_url(url=base_util.replace_dot_url(child_url),
                                        parent=[MongoDB.get_url_object(parent_url)],
                                        genre=[], page=child_page)
                Logger.info('Completed page: ' + child_url)
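
# A minimal sketch of what base_util.replace_dot_url/unreplace_dot_url are
# assumed to do here: MongoDB keys cannot contain '.', so URLs used as keys
# get their dots swapped for a sentinel before storage. The '[dot]' sentinel
# is an assumption for illustration, not the project's actual choice.
def replace_dot_url(url):
    """Make a URL safe for use as a MongoDB key ('.' is not allowed)."""
    return url.replace('.', '[dot]')


def unreplace_dot_url(url):
    """Invert replace_dot_url before the URL is fetched again."""
    return url.replace('[dot]', '.')


assert unreplace_dot_url(replace_dot_url('www.example.com')) == 'www.example.com'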
def __get_url_properties_and_sanitize(boxer_line):
    split_elements = boxer_line.split(settings.BOXER_DELIMITER)

    url = base_util.replace_dot_url(split_elements[settings.INDEX_OF_URL])
    genre = split_elements[settings.INDEX_OF_GENRE].replace('_', '/', 1)
    desc = split_elements[settings.INDEX_OF_DESC]

    return {'url': url, 'genre': [{'genre': genre, 'alexa': {}}], 'desc': desc}
def get_dictionary_from_top_250_line(boxer_line):
    split_elements = boxer_line.split(settings.BOXER_DELIMITER)

    # top-250 lines prefix the URL with a numeric rank; strip it off
    url = util.replace_dot_url(re.sub('^[0-9]+ ', '', split_elements[settings.INDEX_OF_URL])).strip()
    # lowercase only the first character so the rest of the URL is untouched
    url = url[0].lower() + url[1:]

    # the genre field encodes its hierarchy as Top_Subgenre; restore the first '/'
    genre = split_elements[settings.INDEX_OF_GENRE].replace('_', '/', 1)
    desc = split_elements[settings.INDEX_OF_DESC]

    return {'url': url, 'genre': [{'genre': genre, 'alexa': {}}], 'desc': desc}
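
# Usage sketch for the two line parsers above. The concrete settings values
# are assumptions for illustration (the real ones live in settings):
# BOXER_DELIMITER=':::', INDEX_OF_URL=0, INDEX_OF_GENRE=1, INDEX_OF_DESC=2.
sample_line = '4 www.example.com:::Arts_Music:::A sample music site'
# Under those assumptions, get_dictionary_from_top_250_line(sample_line) yields:
# {'url': 'www[dot]example[dot]com',
#  'genre': [{'genre': 'Arts/Music', 'alexa': {}}],
#  'desc': 'A sample music site'}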
import itertools
import time

# webscraper_logger, replace_dot_url/unreplace_dot_url, WebPageInfo, Genres,
# EmbeddedGenre, GenreMetaData, and mongoengine's DoesNotExist are assumed to
# be imported from the surrounding project.


def scrape_pipeline(self, webpageinfo_iterable, output_collection_cls):
    """
    Iterate over a WebPageInfo named tuple iterable. For each URL, fetch the
    page, grab its genres from Alexa and DMOZ, and save the result.
    """
    webscraper_logger.debug("Starting webscraper, input from iterable {}, output to {}"
                            .format(str(webpageinfo_iterable), output_collection_cls))

    # resume from the last recorded queue position
    for rank, webpageinfo_obj in itertools.islice(enumerate(webpageinfo_iterable),
                                                  self.queue.get_location(), None):
        assert isinstance(webpageinfo_obj, WebPageInfo)

        webscraper_logger.debug("Currently on rank number {}".format(rank))
        url = unreplace_dot_url(webpageinfo_obj.url)

        try:
            # first get the webpage
            page = self.get_page(url)

            if page is None:
                raise AssertionError("Skipping rank {} due to empty page".format(rank))

            webscraper_logger.debug("Found page of length {}".format(len(page)))

            dot_replaced_url = replace_dot_url(url)

            try:
                alexa_genre_strings = self.alexa_scraper.query_url(url)
                dmoz_genre_strings = list(set(self.dmoz_scraper.query_url(url)) - set(alexa_genre_strings))
            except DoesNotExist:
                # back off for 200 seconds, then retry both queries: if the
                # Alexa query itself raised, alexa_genre_strings is unbound
                time.sleep(200)
                alexa_genre_strings = self.alexa_scraper.query_url(url)
                dmoz_genre_strings = list(set(self.dmoz_scraper.query_url(url)) - set(alexa_genre_strings))

            if len(alexa_genre_strings) + len(dmoz_genre_strings) == 0:
                raise AssertionError("Skipping rank {} due to no genres".format(rank))

            webscraper_logger.debug("Found {} alexa genres".format(len(alexa_genre_strings)))
            webscraper_logger.debug("Found {} dmoz genres".format(len(dmoz_genre_strings)))

            # get the URL's genres from Alexa and DMOZ that are EXACT matches
            # and convert from string -> genre collection objects
            alexa_genre_refs = Genres.create_genres(alexa_genre_strings, dot_replaced_url)
            dmoz_genre_refs = Genres.create_genres(dmoz_genre_strings, dot_replaced_url)

            # convert from genres -> embedded genres for more info and storage in genre_metadata
            alexa_embedded_ref_list = (EmbeddedGenre(type="url", genre=g_ref, result_type="alexa")
                                       for g_ref in alexa_genre_refs)
            dmoz_embedded_ref_list = (EmbeddedGenre(type="url", genre=g_ref, result_type="dmoz")
                                      for g_ref in dmoz_genre_refs)

            # create the genre metadata
            genre_metadata = GenreMetaData.create_genremetadata(
                list(itertools.chain(alexa_embedded_ref_list, dmoz_embedded_ref_list)),
                dot_replaced_url)

            # finally, put the page in the output collection
            output_collection_cls(genres_data=genre_metadata, url=dot_replaced_url,
                                  original=True, page=page, ranking=rank).save()

            webscraper_logger.debug("Done, committed to {}, there are now {} objects"
                                    .format(output_collection_cls.__name__,
                                            output_collection_cls.objects.count()))

        except Exception as ex:
            webscraper_logger.info("Exception occurred: {}".format(str(ex)))

        # advance the queue position so we don't revisit this entry on restart
        self.queue.increment_location()
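
# self.queue above is assumed to expose just get_location()/increment_location()
# so a crashed run can resume where it left off. A minimal file-backed sketch of
# that contract (the real project class is likely Mongo-backed; this is only an
# illustration):
class ResumeQueue(object):
    def __init__(self, path='scrape_position.txt'):
        self.path = path

    def get_location(self):
        """Return the last saved position, or 0 on a fresh run."""
        try:
            with open(self.path) as f:
                return int(f.read().strip() or 0)
        except IOError:
            return 0

    def increment_location(self):
        """Persist position + 1; read before opening for write truncates the file."""
        location = self.get_location()
        with open(self.path, 'w') as f:
            f.write(str(location + 1))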
def scrape_pipeline(self, webpageinfo_iterable, input_collection_cls, start=0):
    """
    Iterate over a WebPageInfo named tuple iterable. For each URL already in
    input_collection_cls, refresh its genres from Alexa and DMOZ.
    """
    webscraper_logger.debug("Starting webscraper, input from iterable {}, output to {}"
                            .format(str(webpageinfo_iterable), input_collection_cls))

    for count, webpageinfo_obj in enumerate(webpageinfo_iterable, start=start):
        assert isinstance(webpageinfo_obj, WebPageInfo)

        url = unreplace_dot_url(webpageinfo_obj.url)

        try:
            dot_replaced_url = replace_dot_url(url)
            url_obj = input_collection_cls.objects.get(url=dot_replaced_url)

            # only refresh pages that were scraped directly, not child pages
            if not hasattr(url_obj, "original") or not url_obj.original:
                self.queue.increment_location()
                continue

            webscraper_logger.debug("Currently on count number {}".format(count))

            alexa_genre_strings = self.alexa_scraper.query_url(url)
            dmoz_genre_strings = list(set(self.dmoz_scraper.query_url(url)) - set(alexa_genre_strings))

            if len(alexa_genre_strings) + len(dmoz_genre_strings) == 0:
                raise AssertionError("Skipping count {} due to no genres".format(count))

            webscraper_logger.debug("Found {} alexa genres".format(len(alexa_genre_strings)))
            webscraper_logger.debug("Found {} dmoz genres".format(len(dmoz_genre_strings)))

            # get the URL's genres from Alexa and DMOZ that are EXACT matches
            # and convert from string -> genre collection objects
            alexa_genre_refs = Genres.create_genres(alexa_genre_strings, dot_replaced_url)
            dmoz_genre_refs = Genres.create_genres(dmoz_genre_strings, dot_replaced_url)

            # convert from genres -> embedded genres for more info and storage in genre_metadata
            alexa_embedded_ref_list = (EmbeddedGenre(type="url", genre=g_ref, result_type="alexa")
                                       for g_ref in alexa_genre_refs)
            dmoz_embedded_ref_list = (EmbeddedGenre(type="url", genre=g_ref, result_type="dmoz")
                                      for g_ref in dmoz_genre_refs)

            # drop any stale metadata, then create the fresh genre metadata
            # (metadata is stored under the dot-replaced form, so that is the
            # key deleted here; the original keyed the delete on the raw url)
            models.GenreMetaData.objects(url=dot_replaced_url).delete()
            genre_metadata = GenreMetaData.create_genremetadata(
                list(itertools.chain(alexa_embedded_ref_list, dmoz_embedded_ref_list)),
                dot_replaced_url)

            # finally, attach the metadata to the existing collection object
            url_obj.update(genres_data=genre_metadata)

            webscraper_logger.debug("Done, validating")
            # something is very wrong with mongoengine, references no longer
            # dereference; validate by re-fetching the metadata directly,
            # which raises DoesNotExist if the write did not land
            fetched_genre_data = models.GenreMetaData.objects.get(url=dot_replaced_url).genres

        except (AssertionError, DoesNotExist) as ex:
            webscraper_logger.info("AssertException occurred: {}".format(str(ex)))

        # advance the queue position so we don't revisit this entry on restart
        self.queue.increment_location()
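
# The pipelines above assume a WebPageInfo named tuple carrying at least the
# dot-replaced url. A minimal sketch of that tuple and of driving the refresh
# pipeline; the field names beyond 'url' mirror the parser dicts above and are
# assumptions for illustration:
from collections import namedtuple

WebPageInfo = namedtuple('WebPageInfo', ['url', 'genre', 'desc'])

# e.g. feed the parser output from get_dictionary_from_top_250_line straight in,
# with URlToGenreAlexa300k as the collection class:
# infos = (WebPageInfo(**get_dictionary_from_top_250_line(line)) for line in top_250_file)
# scraper.scrape_pipeline(infos, URlToGenreAlexa300k, start=scraper.queue.get_location())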