def __init__(self, config, fetcher=None):
    """Build the parser, article, extractors, cleaner, formatter and fetcher.

    Args:
        config: configuration object; it must provide ``get_parser()`` and
            is handed to ``NetworkFetcher`` when a fetcher has to be built.
        fetcher: optional ``NetworkFetcher`` instance to reuse. Any other
            value (including ``None``) is ignored and a new
            ``NetworkFetcher(self.config)`` is created instead.
    """
    # --- core state -------------------------------------------------------
    self.config = config
    self.parser = self.config.get_parser()
    self.article = Article()

    # --- content pipeline: extract, clean, format -------------------------
    self.extractor = self.get_extractor()
    self.cleaner = self.get_cleaner()
    self.formatter = self.get_formatter()

    # --- specialised extractors (creation order kept as-is) ---------------
    self.metas_extractor = self.get_metas_extractor()
    self.opengraph_extractor = self.get_opengraph_extractor()
    # schema.org news article data
    self.schema_extractor = self.get_schema_extractor()
    self.publishdate_extractor = self.get_publishdate_extractor()
    self.tags_extractor = self.get_tags_extractor()
    self.authors_extractor = self.get_authors_extractor()
    self.tweets_extractor = self.get_tweets_extractor()
    self.links_extractor = self.get_links_extractor()
    self.video_extractor = self.get_video_extractor()
    self.title_extractor = self.get_title_extractor()

    # --- html fetcher: reuse the injected one only if it is the right type -
    self.fetcher = (
        fetcher
        if isinstance(fetcher, NetworkFetcher)
        else NetworkFetcher(self.config)
    )

    # Created after the fetcher is set; presumably the image extractor
    # relies on it -- confirm before reordering.
    self.image_extractor = self.get_image_extractor()

    # TODO: use the log prefix
    self.log_prefix = "crawler: "
def __init__(self, config):
    """Build the parser, article, extractors, cleaner, formatter and fetcher.

    Args:
        config: configuration object; it must provide ``get_parser()`` and
            is passed to ``NetworkFetcher`` to build the HTML fetcher.
    """
    # --- core state -------------------------------------------------------
    self.config = config
    self.parser = self.config.get_parser()
    self.article = Article()

    # --- content pipeline: extract, clean, format -------------------------
    self.extractor = self.get_extractor()
    self.cleaner = self.get_cleaner()
    self.formatter = self.get_formatter()

    # --- specialised extractors (creation order kept as-is) ---------------
    self.metas_extractor = self.get_metas_extractor()
    self.publishdate_extractor = self.get_publishdate_extractor()
    self.opengraph_extractor = self.get_opengraph_extractor()
    self.tags_extractor = self.get_tags_extractor()
    self.authors_extractor = self.get_authors_extractor()
    self.tweets_extractor = self.get_tweets_extractor()
    self.links_extractor = self.get_links_extractor()
    self.video_extractor = self.get_video_extractor()
    self.title_extractor = self.get_title_extractor()

    # --- html fetcher: always a fresh one built from the config -----------
    self.fetcher = NetworkFetcher(self.config)

    # Created after the fetcher is set; presumably the image extractor
    # relies on it -- confirm before reordering.
    self.image_extractor = self.get_image_extractor()

    # TODO: actually use the log prefix
    self.logPrefix = "crawler:"
def test_instance(self):
    """Constructing ``Article()`` with no arguments yields an ``Article``."""
    article = Article()
    # assertIsInstance reports the object and the expected type on failure,
    # unlike comparing the isinstance() result against True.
    self.assertIsInstance(article, Article)