def scrape_links(self, pos):
    doc_object = MongoDB.get(URLQueue, 'document', number=pos)

    while doc_object is not None:
        self.scrape_link_and_child(doc_object['url'])

        pos = MongoDB.increment_url_counter()
        doc_object = MongoDB.get(URLQueue, 'document', number=pos)
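# The MongoDB queue helpers used above are defined elsewhere in the project. Below is a
# minimal, illustrative pymongo-style sketch (an assumption, not the project's actual
# implementation) of how such a queue cursor could be backed, assuming a metadata document
# of the form {'type': 'queue', 'position': <int>} and queue entries keyed by 'number'.
from pymongo import MongoClient, ReturnDocument

_db = MongoClient()['scraper']  # hypothetical database name

def get_queue_document_sketch(number):
    # Fetch the queue entry at the given position; returns None once the queue is exhausted.
    return _db['url_queue'].find_one({'number': number})

def increment_url_counter_sketch():
    # Atomically advance the shared queue position and return the new value.
    meta = _db['meta_data'].find_one_and_update(
        {'type': 'queue'},
        {'$inc': {'position': 1}},
        return_document=ReturnDocument.AFTER)
    return meta['position']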
def save_top_urls_to_mongo(cls):
    with open(settings.OUTPUT_FILE, encoding='ISO-8859-1') as input_file:
        for webpage in input_file:
            # dictionary of webpage properties
            webpage_dict = cls.__get_url_properties_and_sanitize(webpage)
            MongoDB.save_modify_url(**webpage_dict)

    return cls
def scrape_urls(cls):
    position = MongoDB.get(MetaData, 'position', type='queue')
    WebScraper().scrape_links(position)

    return cls
def scrape(self):
    home = self.http.get(dmoz_home)
    home_page_links = self._scrapeHomeAndGetLinks(home.data)

    # visit randomly chosen homepage links and dig down until enough new pages are saved
    i = 0
    while i < settings.NUM_RANDOM_WEBPAGE:
        result = self._scrapPage(home_page_links[random.randint(0, len(home_page_links) - 1)])

        # only count pages we have not stored yet
        if result is not None and MongoDB.get_url_object(result['url']) is None:
            i += 1
            try:
                page = utf_8_safe_decode(self.http.get(result['url']).data)
                MongoDB.save_modify_url(page=page, **result)
                Logger.info("Completed: " + result['url'])
            except Exception as ex:
                Logger.error(ex)
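# utf_8_safe_decode is a project helper that is not shown in this section. A minimal
# sketch of the behaviour its name suggests -- decode raw response bytes as UTF-8 without
# raising on malformed sequences; the real helper may behave differently.
def utf_8_safe_decode_sketch(raw_bytes):
    if isinstance(raw_bytes, bytes):
        # 'replace' swaps undecodable bytes for U+FFFD instead of raising UnicodeDecodeError
        return raw_bytes.decode('utf-8', errors='replace')
    return raw_bytes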
def scrape_link_and_child(self, parent_url):
    parent_url = base_util.replace_dot_url(parent_url)
    webpage_body, parent_url = self.scrape(base_util.unreplace_dot_url(parent_url), None)

    # exit if we failed to scrape the website
    if webpage_body is None:
        return

    MongoDB.save_page(url=parent_url, page=webpage_body)
    Logger.info('Completed page: ' + parent_url)

    # now grab the children of this webpage
    all_ahref = [base_util.combine_parent_rel_link(parent_url, a.attrs['href'])
                 for a in BeautifulSoup(webpage_body, 'html.parser', from_encoding="utf-8").find_all('a')
                 if 'href' in a.attrs]

    child_urls = random.sample(all_ahref, settings.GET_X_CHILD) if len(all_ahref) >= settings.GET_X_CHILD else all_ahref

    # get rid of bad normalization
    if not re.match('^www[.].*$', parent_url):
        Logger.info('Updating bad url for {}'.format(parent_url))
        MongoDB.update_url(base_util.normalize_url(parent_url), parent_url)

    if len(child_urls) > 0:
        parent_genres = MongoDB.get_genre(parent_url)

        # scrape each child
        for child_url in child_urls:
            child_page = self.scrape(child_url, parent_url)

            # if the child failed, retry with other links from the same page
            if child_page is None:
                exploredset = set()
                tries = 0
                for url in set(all_ahref) ^ exploredset:
                    if tries == settings.MAX_RETRIES:
                        Logger.info('Max retry number exceeded')
                        break
                    Logger.info("trying new url: " + url)
                    child_page = self.scrape(url, parent_url)
                    if child_page is not None:
                        break
                    exploredset.add(url)
                    tries += 1

            if child_page is not None:
                MongoDB.save_modify_url(url=base_util.replace_dot_url(child_url),
                                        parent=[MongoDB.get_url_object(parent_url)],
                                        genre=parent_genres,
                                        page=child_page)
                Logger.info('Completed page: ' + child_url)
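# The base_util helpers above are defined elsewhere in the project. Two illustrative
# sketches of what they plausibly do (assumptions, not the project's code): relative
# child hrefs are resolved against the parent URL, and dots are swapped out of URLs,
# e.g. because MongoDB field names may not contain '.'.
from urllib.parse import urljoin

def combine_parent_rel_link_sketch(parent_url, href):
    # urljoin handles absolute hrefs, root-relative paths and relative paths alike;
    # a scheme is prepended because parent URLs in this project appear to start with 'www.'
    base = parent_url if parent_url.startswith('http') else 'http://' + parent_url
    return urljoin(base, href)

def replace_dot_url_sketch(url):
    return url.replace('.', ',')  # the actual substitute character is an assumption

def unreplace_dot_url_sketch(url):
    return url.replace(',', '.')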
    edit_distance = sys.maxsize
    alexa_genre_length += len(alexa_genre_name)

    # compare this Alexa genre against every DMOZ genre, keeping the closest match
    for dmoz_genre_name, dmoz_genre_info in dmoz_dict.items():
        if edit_distance == 0:
            break
        if alexa_genre_name == dmoz_genre_name:
            exact_match += 1
            edit_distance = 0
        else:
            edit_distance = min(edit_distance, levenshtein(dmoz_genre_name, alexa_genre_name))

    if edit_distance != sys.maxsize:
        total_edit_distance += edit_distance
        edit_distance_count += 1

return {
    "alexa_total": len(alexa_dict),
    "edit_distance_count": edit_distance_count,
    "total_edit_distance": total_edit_distance,
    "alexa_match": exact_match,
    "alexa_genre_length": alexa_genre_length,
}

# accumulate stats and update the collection
MongoDB.connect(settings.HOST_NAME, settings.PORT)
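# levenshtein() is called above but not shown in this section. A standard
# dynamic-programming edit distance works as a drop-in sketch; the project's own
# implementation may differ in name or signature.
def levenshtein_sketch(a, b):
    # previous[j] holds the edit distance between the processed prefix of a and b[:j]
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        current = [i]
        for j, cb in enumerate(b, 1):
            current.append(min(previous[j] + 1,                 # deletion
                               current[j - 1] + 1,              # insertion
                               previous[j - 1] + (ca != cb)))   # substitution
        previous = current
    return previous[len(b)]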
def start(cls):
    MongoDB.connect(settings.HOST_NAME, settings.PORT)

    return cls
def create_url_queue(cls):
    for num, URL_document in enumerate(URLToGenre.objects):
        MongoDB.push_to_queue(num, URL_document)

    return cls
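# Because each of these steps returns cls, the whole pipeline can be driven as a chain of
# calls on the driver class. The class name below is an assumption for illustration; only
# the method names come from this section.
#
#   ScraperDriver.start() \
#       .save_top_urls_to_mongo() \
#       .create_url_queue() \
#       .scrape_urls()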