Exemplo n.º 1
0
    def scrape_link_and_child(self,parent_url):
        """
        Scrape parent_url, save it, then scrape and save a sample of the pages it links to.

        At most settings.GET_X_CHILD links are sampled at random from the anchors of
        the parent page (all of them when there are fewer). When a sampled child
        cannot be scraped, up to settings.MAX_RETRIES other links of the same page
        are tried in its place, and the page is saved under the url that was
        actually scraped.

        :param parent_url: url of the page whose links are followed
        :return: None; returns early, saving nothing, when the parent page cannot be scraped
        """
        parent_url=base_util.replace_dot_url(parent_url)
        # NOTE(review): the result of self.scrape is unpacked as a (body, url) pair here but
        # compared to None as a whole for child pages below - confirm its return contract.
        webpage_body,parent_url=self.scrape(base_util.unreplace_dot_url(parent_url),None)

        #exit if failed to scrape the website
        if webpage_body is None:
            return

        Logger.debug('Saving Parent')
        MongoDB.save_page(url=parent_url,page=webpage_body)
        Logger.info('Completed page: '+parent_url)

        #Now, we grab the children of this webpage: every anchor that carries an href
        soup=BeautifulSoup(webpage_body,'html.parser', from_encoding="utf-8")
        all_ahref=[base_util.combine_parent_rel_link(parent_url,a.attrs['href']) for a in soup.find_all('a') if 'href' in a.attrs]

        child_urls=random.sample(all_ahref,settings.GET_X_CHILD) if len(all_ahref)>=settings.GET_X_CHILD else all_ahref

        #get rid of bad normalization
        if not re.match('^www[.].*$',parent_url):
            Logger.info('Updating bad url for {}'.format(parent_url))
            MongoDB.update_url(base_util.normalize_url(parent_url),parent_url)

        #get the children, child urls is a subset of all urls
        for child_url in child_urls:
            Logger.debug('Get Child {}'.format(child_url))
            #url whose content ends up in child_page; changes if a fallback link is used
            scraped_url=child_url
            child_page=self.scrape(child_url,parent_url)

            if child_page is None:
                tries=0
                #fall back to the other links of the page, never the one that just failed
                for url in set(all_ahref)-set([child_url]):
                    if tries==settings.MAX_RETRIES:
                        Logger.info('Max retrie number exceeded')
                        break

                    Logger.info("trying new url: "+url)

                    child_page=self.scrape(url,parent_url)

                    if child_page is not None:
                        scraped_url=url
                        break

                    tries+=1

            if child_page is not None:
                #save under the url the page really came from, not the failed sample
                Logger.debug('Saving Child {}'.format(scraped_url))
                MongoDB.save_modify_url(url=base_util.replace_dot_url(scraped_url),parent=[MongoDB.get_url_object(parent_url)],genre=[],page=child_page)
                Logger.info('Completed page: '+scraped_url)
Exemplo n.º 2
0
    def __get_url_properties_and_sanitize(boxer_line):
        """
        Turn one delimited boxer line into a dictionary holding its url, genre and description.

        The line is split on settings.BOXER_DELIMITER and the fields are picked by the
        settings.INDEX_OF_URL / INDEX_OF_GENRE / INDEX_OF_DESC positions.

        :param boxer_line: a single line of delimited text
        :return: {'url': ..., 'genre': [{'genre': ..., 'alexa': {}}], 'desc': ...}
        """
        fields=boxer_line.split(settings.BOXER_DELIMITER)

        sanitized_url=base_util.replace_dot_url(fields[settings.INDEX_OF_URL])

        #only the first underscore of the genre field becomes a slash
        genre_entry={'genre':fields[settings.INDEX_OF_GENRE].replace('_','/',1),'alexa':{}}

        return {'url':sanitized_url,'genre':[genre_entry],'desc':fields[settings.INDEX_OF_DESC]}
Exemplo n.º 3
0
def get_dictionary_from_top_250_line(boxer_line):
    """
    Turn one delimited line into a dictionary holding its url, genre and description.

    The line is split on settings.BOXER_DELIMITER and the fields are picked by the
    settings.INDEX_OF_URL / INDEX_OF_GENRE / INDEX_OF_DESC positions. A leading
    "<digits> " prefix is removed from the url field and the first character of
    the resulting url is lower-cased.

    :param boxer_line: a single line of delimited text
    :return: {'url': ..., 'genre': [{'genre': ..., 'alexa': {}}], 'desc': ...}
    """
    fields=boxer_line.split(settings.BOXER_DELIMITER)

    #drop the leading "<digits> " prefix, then dot-replace and strip the url
    unprefixed_url=re.sub('^[0-9]+ ','',fields[settings.INDEX_OF_URL])
    clean_url=util.replace_dot_url(unprefixed_url).strip()
    #lower-case the first character only, the rest is kept as is
    clean_url=clean_url[0].lower()+clean_url[1:]

    #only the first underscore of the genre field becomes a slash
    genre_entry={'genre':fields[settings.INDEX_OF_GENRE].replace('_','/',1),'alexa':{}}

    return {'url':clean_url,'genre':[genre_entry],'desc':fields[settings.INDEX_OF_DESC]}
Exemplo n.º 4
0
    def scrape_pipeline(self,webpageinfo_iterable,output_collection_cls):
        """
        Iterate over an iterable of WebPageInfo objects. For each one, get the page of its
        url, grab its alexa and dmoz genres and save everything through output_collection_cls.

        Iteration starts at the position returned by self.queue.get_location(), and
        the queue location is incremented after every item, successful or not.
        Any exception raised while handling an item is logged and the item is skipped.

        :param webpageinfo_iterable: iterable of WebPageInfo objects
        :param output_collection_cls: document class used to save the scraped page
        :return: None
        """
        webscraper_logger.debug("Starting webscraper, input from iterable {}, output to {}".format(str(webpageinfo_iterable)
                                                                                                  , output_collection_cls))

        for rank,webpageinfo_obj in itertools.islice(enumerate(webpageinfo_iterable),self.queue.get_location(),None):
            assert isinstance(webpageinfo_obj,WebPageInfo)
            webscraper_logger.debug("Current on rank number {}".format(rank))

            url=unreplace_dot_url(webpageinfo_obj.url)

            try:
                #first get the webpage
                page=self.get_page(url)

                if page is None:
                    raise AssertionError("Skippin rank {} due to empty page".format(rank))

                webscraper_logger.debug("Found page of length {}".format(len(page)))

                dot_replaced_url=replace_dot_url(url)

                #None marks "alexa not queried successfully yet" for the retry path below,
                #so a value left over from the previous iteration can never be reused
                alexa_genre_strings=None
                try:
                    alexa_genre_strings=self.alexa_scraper.query_url(url)
                    dmoz_raw_strings=self.dmoz_scraper.query_url(url)
                except DoesNotExist:
                    #sleep for 200 seconds and then try again
                    time.sleep(200)
                    if alexa_genre_strings is None:
                        #the alexa query itself was the one that failed
                        alexa_genre_strings=self.alexa_scraper.query_url(url)
                    dmoz_raw_strings=self.dmoz_scraper.query_url(url)

                #keep only the dmoz genres alexa did not already report
                dmoz_genre_strings=list(set(dmoz_raw_strings)-set(alexa_genre_strings))

                if len(alexa_genre_strings)+len(dmoz_genre_strings)==0:
                    raise AssertionError("Skippin rank {} due to no genres".format(rank))

                webscraper_logger.debug("Found {} alexa genres ".format(len(alexa_genre_strings)))
                webscraper_logger.debug("Found {} dmoz genres".format(len(dmoz_genre_strings)))

                #then get the urls's genres from alexa and dmoz that are EXACT matches and convert from string -> genre coll objects
                alexa_genre_refs=Genres.create_genres(alexa_genre_strings,dot_replaced_url)
                dmoz_genre_refs=Genres.create_genres(dmoz_genre_strings,dot_replaced_url)

                #convert from genres -> embedded genres for more info and storage in genre_metadata
                alexa_embedded_ref_list=(EmbeddedGenre(type="url",genre=g_ref,result_type="alexa") for g_ref in alexa_genre_refs)
                dmoz_embedded_ref_list=(EmbeddedGenre(type="url",genre=g_ref,result_type="dmoz") for g_ref in dmoz_genre_refs)

                #Create the genre metadata
                genre_metadata=GenreMetaData.create_genremetadata(list(itertools.chain(alexa_embedded_ref_list,dmoz_embedded_ref_list)),dot_replaced_url)


                #finally put page in collection
                output_collection_cls(genres_data=genre_metadata,url=dot_replaced_url,original=True,page=page,ranking=rank).save()
                webscraper_logger.debug("Done, commited to URlToGenreAlexa300k, there are now {} objects"
                                       .format(output_collection_cls.objects.count()))

            except Exception as ex:
                #deliberately best-effort: one failing url must not stop the whole run
                webscraper_logger.info("Exception occured: {}".format(str(ex)))

            #update reference so we don't go over the same again
            self.queue.increment_location()
Exemplo n.º 5
0
    def scrape_pipeline(self,webpageinfo_iterable,input_collection_cls,start=0):
        """
        Iterate over an iterable of WebPageInfo objects and refresh the genre metadata of
        the matching documents in input_collection_cls.

        For each item the document with the dot-replaced url is fetched. Documents
        without a truthy "original" attribute are skipped. Otherwise the alexa and
        dmoz genres are queried again, the existing GenreMetaData of the url is
        deleted and recreated, and the document is updated with the new metadata.
        AssertionError and DoesNotExist raised while handling an item are logged
        and the item is skipped. The queue location is incremented once per item.

        :param webpageinfo_iterable: iterable of WebPageInfo objects
        :param input_collection_cls: document class holding the urls to update
        :param start: first value of the counter used in log messages; it does not skip items
        :return: None
        """
        webscraper_logger.debug("Starting webscraper, input from iterable {}, output to {}".format(str(webpageinfo_iterable)
                                                                                                  , input_collection_cls))

        for count,webpageinfo_obj in enumerate(webpageinfo_iterable,start=start):
            assert isinstance(webpageinfo_obj,WebPageInfo)

            url=unreplace_dot_url(webpageinfo_obj.url)

            try:

                dot_replaced_url=replace_dot_url(url)

                url_obj=input_collection_cls.objects.get(url=dot_replaced_url)

                if not hasattr(url_obj,"original") or not url_obj.original:
                    #increment here because continue skips the increment at the end of the loop
                    self.queue.increment_location()
                    continue


                webscraper_logger.debug("Current on count number {}".format(count))

                alexa_genre_strings=self.alexa_scraper.query_url(url)
                #keep only the dmoz genres alexa did not already report
                dmoz_genre_strings=list(set(self.dmoz_scraper.query_url(url))-set(alexa_genre_strings))

                if len(alexa_genre_strings)+len(dmoz_genre_strings)==0:
                    raise AssertionError("Skippin count {} due to no genres".format(count))

                webscraper_logger.debug("Found {} alexa genres ".format(len(alexa_genre_strings)))
                webscraper_logger.debug("Found {} dmoz genres".format(len(dmoz_genre_strings)))

                #then get the urls's genres from alexa and dmoz that are EXACT matches and convert from string -> genre coll objects
                alexa_genre_refs=Genres.create_genres(alexa_genre_strings,dot_replaced_url)
                dmoz_genre_refs=Genres.create_genres(dmoz_genre_strings,dot_replaced_url)

                #convert from genres -> embedded genres for more info and storage in genre_metadata
                alexa_embedded_ref_list=(EmbeddedGenre(type="url",genre=g_ref,result_type="alexa") for g_ref in alexa_genre_refs)
                dmoz_embedded_ref_list=(EmbeddedGenre(type="url",genre=g_ref,result_type="dmoz") for g_ref in dmoz_genre_refs)

                #Drop the old genre metadata. It is created and looked up by the
                #dot-replaced url, so it has to be deleted by that same key.
                models.GenreMetaData.objects(url=dot_replaced_url).delete()

                #Create the genre metadata
                genre_metadata=GenreMetaData.create_genremetadata(list(itertools.chain(alexa_embedded_ref_list,dmoz_embedded_ref_list)),dot_replaced_url)


                #finally put page in collection
                url_obj.update(genres_data=genre_metadata)
                #the two lookups below only validate: they raise if a document is missing
                input_collection_cls.objects.get(url=dot_replaced_url)
                webscraper_logger.debug("Done, validating")

                #something is very wrong with mongoengine, references do not work any longer
                fetched_genre_data=models.GenreMetaData.objects.get(url=dot_replaced_url).genres


            except (AssertionError,DoesNotExist) as ex:
                webscraper_logger.info("AssertException occured: {}".format(str(ex)))

            #update reference so we don't go over the same again
            self.queue.increment_location()