Example #1
    def scrape_pipeline(self, webpageinfo_iterable, output_collection_cls):
        """
        Iterate over an iterable of WebPageInfo named tuples: fetch each URL's page,
        look up its genres, and store the result in output_collection_cls.
        """
        webscraper_logger.debug("Starting webscraper, input from iterable {}, output to {}"
                                .format(webpageinfo_iterable, output_collection_cls))

        # resume from the last persisted position so a restart does not redo completed work
        for rank, webpageinfo_obj in itertools.islice(enumerate(webpageinfo_iterable),
                                                      self.queue.get_location(), None):
            assert isinstance(webpageinfo_obj, WebPageInfo)
            webscraper_logger.debug("Currently on rank number {}".format(rank))

            url = unreplace_dot_url(webpageinfo_obj.url)

            try:
                # first fetch the webpage
                page = self.get_page(url)

                if page is None:
                    raise AssertionError("Skipping rank {} due to empty page".format(rank))

                webscraper_logger.debug("Found page of length {}".format(len(page)))

                dot_replaced_url = replace_dot_url(url)

                try:
                    alexa_genre_strings = self.alexa_scraper.query_url(url)
                    dmoz_genre_strings = list(set(self.dmoz_scraper.query_url(url)) - set(alexa_genre_strings))
                except DoesNotExist:
                    # back off for 200 seconds and retry both lookups; retrying only the
                    # dmoz query would leave alexa_genre_strings unbound if alexa raised
                    time.sleep(200)
                    alexa_genre_strings = self.alexa_scraper.query_url(url)
                    dmoz_genre_strings = list(set(self.dmoz_scraper.query_url(url)) - set(alexa_genre_strings))


                if len(alexa_genre_strings) + len(dmoz_genre_strings) == 0:
                    raise AssertionError("Skipping rank {} due to no genres".format(rank))

                webscraper_logger.debug("Found {} alexa genres".format(len(alexa_genre_strings)))
                webscraper_logger.debug("Found {} dmoz genres".format(len(dmoz_genre_strings)))

                # get the URL's genres from alexa and dmoz that are EXACT matches and
                # convert from strings -> Genres collection objects
                alexa_genre_refs = Genres.create_genres(alexa_genre_strings, dot_replaced_url)
                dmoz_genre_refs = Genres.create_genres(dmoz_genre_strings, dot_replaced_url)

                # convert from Genres -> EmbeddedGenres for more info and storage in genre_metadata
                alexa_embedded_ref_list = (EmbeddedGenre(type="url", genre=g_ref, result_type="alexa")
                                           for g_ref in alexa_genre_refs)
                dmoz_embedded_ref_list = (EmbeddedGenre(type="url", genre=g_ref, result_type="dmoz")
                                          for g_ref in dmoz_genre_refs)

                # create the genre metadata from both embedded genre streams
                genre_metadata = GenreMetaData.create_genremetadata(
                    list(itertools.chain(alexa_embedded_ref_list, dmoz_embedded_ref_list)),
                    dot_replaced_url)


                # finally put the page in the output collection
                output_collection_cls(genres_data=genre_metadata, url=dot_replaced_url,
                                      original=True, page=page, ranking=rank).save()
                webscraper_logger.debug("Done, committed to {}, there are now {} objects"
                                        .format(output_collection_cls.__name__,
                                                output_collection_cls.objects.count()))

            except Exception as ex:
                webscraper_logger.info("Exception occurred: {}".format(ex))

            # advance the persisted position so we do not process the same entry again
            self.queue.increment_location()
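
For context, here is a minimal sketch of how this pipeline might be driven. The WebScraper class name, the single-field WebPageInfo tuple, and the URLToGenre output Document are assumptions for illustration, not taken from the original code.

from collections import namedtuple

# hypothetical input record; the original only shows that it carries a .url field
WebPageInfo = namedtuple("WebPageInfo", ["url"])

# build an iterable of records from a plain text file of URLs (assumed format)
ranked_pages = [WebPageInfo(url=line.strip()) for line in open("top_sites.txt")]

scraper = WebScraper()  # assumed class that owns scrape_pipeline and self.queue
scraper.scrape_pipeline(ranked_pages, URLToGenre)  # URLToGenre: assumed mongoengine Document
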
Example #2
    def scrape_pipeline(self, webpageinfo_iterable, input_collection_cls, start=0):
        """
        Iterate over an iterable of WebPageInfo named tuples: look up each URL's genres
        and update the matching document already stored in input_collection_cls.
        """
        webscraper_logger.debug("Starting webscraper, input from iterable {}, output to {}"
                                .format(webpageinfo_iterable, input_collection_cls))

        for count, webpageinfo_obj in enumerate(webpageinfo_iterable, start=start):
            assert isinstance(webpageinfo_obj, WebPageInfo)

            url = unreplace_dot_url(webpageinfo_obj.url)

            try:

                dot_replaced_url = replace_dot_url(url)

                url_obj = input_collection_cls.objects.get(url=dot_replaced_url)

                # only re-scrape documents that were stored as originals
                if not hasattr(url_obj, "original") or not url_obj.original:
                    self.queue.increment_location()
                    continue

                webscraper_logger.debug("Currently on count number {}".format(count))

                alexa_genre_strings = self.alexa_scraper.query_url(url)
                dmoz_genre_strings = list(set(self.dmoz_scraper.query_url(url)) - set(alexa_genre_strings))

                if len(alexa_genre_strings) + len(dmoz_genre_strings) == 0:
                    raise AssertionError("Skipping count {} due to no genres".format(count))

                webscraper_logger.debug("Found {} alexa genres".format(len(alexa_genre_strings)))
                webscraper_logger.debug("Found {} dmoz genres".format(len(dmoz_genre_strings)))

                # get the URL's genres from alexa and dmoz that are EXACT matches and
                # convert from strings -> Genres collection objects
                alexa_genre_refs = Genres.create_genres(alexa_genre_strings, dot_replaced_url)
                dmoz_genre_refs = Genres.create_genres(dmoz_genre_strings, dot_replaced_url)

                # convert from Genres -> EmbeddedGenres for more info and storage in genre_metadata
                alexa_embedded_ref_list = (EmbeddedGenre(type="url", genre=g_ref, result_type="alexa")
                                           for g_ref in alexa_genre_refs)
                dmoz_embedded_ref_list = (EmbeddedGenre(type="url", genre=g_ref, result_type="dmoz")
                                          for g_ref in dmoz_genre_refs)

                # drop any stale metadata before recreating it; query by the dot-replaced
                # url, since that is the key the metadata is created and fetched under
                models.GenreMetaData.objects(url=dot_replaced_url).delete()

                genre_metadata = GenreMetaData.create_genremetadata(
                    list(itertools.chain(alexa_embedded_ref_list, dmoz_embedded_ref_list)),
                    dot_replaced_url)


                # finally attach the metadata to the existing document
                url_obj.update(genres_data=genre_metadata)
                # re-fetch to confirm the updated document is readable
                input_collection_cls.objects.get(url=dot_replaced_url)
                webscraper_logger.debug("Done, validating")

                # mongoengine dereferencing is unreliable here, so validate by fetching
                # the genres directly from the GenreMetaData collection
                fetched_genre_data = models.GenreMetaData.objects.get(url=dot_replaced_url).genres
                webscraper_logger.debug("Validated {} stored genres".format(len(fetched_genre_data)))


            except (AssertionError, DoesNotExist) as ex:
                webscraper_logger.info("Exception occurred: {}".format(ex))

            # advance the persisted position so we do not process the same entry again
            self.queue.increment_location()
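
Both examples depend on self.queue to persist how far the pipeline has advanced, via get_location() and increment_location(). The original queue implementation is not shown; below is a minimal file-backed sketch of that interface, purely an assumption about how it could work.

import os

class LocationQueue(object):
    """Hypothetical position tracker so a restarted pipeline can resume.

    Only the get_location()/increment_location() interface is taken from the
    examples; the file-backed storage here is an illustrative assumption.
    """

    def __init__(self, path="scrape_position.txt"):
        self.path = path

    def get_location(self):
        # a missing or empty file means "start from the beginning"
        if not os.path.exists(self.path):
            return 0
        with open(self.path) as f:
            return int(f.read().strip() or 0)

    def increment_location(self):
        position = self.get_location() + 1
        with open(self.path, "w") as f:
            f.write(str(position))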