class TopicScrapingTask:
    """Scrapes a topic page and runs article scraping for every article it lists."""

    def __init__(self, topic_scraper, article_scraping_task):
        """
        @type topic_scraper: Scraper
        @type article_scraping_task: ArticleScrapingTask
        """
        self._topic_scraper = topic_scraper
        self._article_scraping_task = article_scraping_task
        self._logger = Logger(self.__class__.__name__)

    def run(self, topic_url):
        """Scrape the topic at topic_url and return the list of articles whose
        bodies were successfully parsed.

        Only the first parser produced by the scraper is consumed (matching the
        original control flow). Returns an empty list when the scraper yields
        no parser for the URL (previously this returned None implicitly).
        """
        for parser in self._topic_scraper.scrape(topic_url):
            assert isinstance(parser, TopicParser)
            self._logger.info('Scraping topic at %s.', topic_url)
            articles = []
            for article in parser.get_articles():
                try:
                    if self._article_scraping_task.run(article):
                        articles.append(article)
                    else:
                        self._logger.warn('Could not parse article body at %s', article.url)
                # Fixed: the original used `except IOError, e`, which is a
                # SyntaxError on Python 3.
                except IOError as e:
                    self._logger.error('Failed scraping article: %s', e)
                    continue
            return articles
        # No parser for this URL: return an empty result rather than None so
        # callers can always iterate the return value.
        return []
class ArticleScrapingTask:
    """Scrapes a single article page and fills in the Article's metadata and body."""

    def __init__(self, scraper, min_word_count_heuristic=100):
        """
        @type scraper: Scraper
        @type min_word_count_heuristic: int
        """
        self._scraper = scraper
        # Extracted text at or below this many words is assumed not to be a
        # real article body.
        self._min_word_count_heuristic = min_word_count_heuristic
        self._logger = Logger(self.__class__.__name__)
        # Fixed: removed a stray dead `pass` statement that ended __init__.

    def run(self, article):
        """Scrape article.url in place, updating title, publish date, preview
        image and body on the article.

        Returns True when a plausible body was found (from the page, or by
        falling back to the article's own description); False when the article
        has no URL or no plausible body was found.

        @type article: Article
        """
        if article.url:
            self._logger.info('Scraping %s.', article.url)
            for parser in self._scraper.scrape(article.url):
                assert isinstance(parser, ArticleParser)
                # The final URL of the article may differ (e.g. after
                # redirects); during scraping, the scraper passes the final
                # URL to each constructed parser.
                article.url = parser.url
                title = parser.get_title()
                publish_date = parser.get_publish_date()
                preview_image_url = parser.get_preview_image_url()
                body = parser.get_body()
                if title:
                    article.title = title
                if publish_date:
                    article.publish_date = publish_date
                if preview_image_url:
                    article.preview_image_url = preview_image_url
                if body and self._is_article_body(body):
                    article.body = body
                elif article.description and self._is_article_body(article.description):
                    # Fall back to the feed-provided description when the
                    # scraped body fails the word-count heuristic.
                    article.body = article.description
                else:
                    # Neither the page body nor the description looks like a
                    # real article; give up on this URL.
                    break
                return True
        return False

    def _is_article_body(self, body):
        # Heuristic: real article bodies exceed the configured word count.
        return self._count_words(body) > self._min_word_count_heuristic

    @staticmethod
    def _count_words(s):
        return len(s.split())
class KeywordAlgorithm(Algorithm):
    """Scores articles with a linear regression over keyword features
    extracted from article titles."""

    name = 'keyword'

    def __init__(self):
        Algorithm.__init__(self)
        self._score_mapper = ScoreMapper()
        self._logger = Logger(self.__class__.__name__)

    def train(self, articles, states):
        """Fit the classifier on keyword features of the articles' titles
        against scores derived from the given states."""
        self._params.extractor = KeywordFeatureExtractor(finder=KeywordFinder(),
                                                         text_key=lambda a: a.title)
        features = np.array(self._params.extractor.train_extract(articles))
        scores = np.array(self._score_mapper.map_batch_score(states))
        regression = LinearRegression(fit_intercept=True)
        n_features = features.shape[1]
        self._logger.info('Feature space uses %d keywords', n_features)
        if n_features >= 100:
            # Large keyword vocabulary: reduce dimensionality with PCA and
            # grid-search over the number of retained components.
            param_grid = {
                'pca__n_components': range(50, n_features, 50)
            }
            # n_components here is a placeholder; the grid search overrides it.
            pca = PCA(n_components=100)
            pipeline = Pipeline([('pca', pca), ('regression', regression)])
            # NOTE(review): `score_func` was removed from GridSearchCV in
            # modern scikit-learn in favor of `scoring=`; update this call
            # when upgrading the scikit-learn dependency.
            clf = GridSearchCV(pipeline, param_grid, n_jobs=1, verbose=0, cv=3,
                               score_func=top_item_scorer)
        else:
            # Small feature space: a plain regression, no dimensionality
            # reduction needed.
            clf = regression
        self._params.classifier = clf
        self._params.classifier.fit(features, scores)

    def score(self, articles):
        """Return predicted scores for the given articles using the trained
        extractor and classifier."""
        self._logger.info('Feature space uses %d keywords',
                          self._params.extractor.keyword_count())
        features = self._params.extractor.extract(articles)
        return self._params.classifier.predict(np.array(features))
class WorkerThread(threading.Thread):
    """Base class for queue-driven worker threads.

    Subclasses implement work(); run() invokes it repeatedly until stop()
    is called, then logs and exits.
    """

    def __init__(self, worker_id, task_queue, completed_queue):
        """
        @type worker_id: int
        @type task_queue: Queue.Queue
        @type completed_queue: Queue.Queue
        """
        self._name = '%s-%d' % (self.__class__.__name__, worker_id)
        threading.Thread.__init__(self, name=self._name)
        self._id = worker_id
        self._logger = Logger(self._name)
        self._continue = True
        self._task_queue = task_queue
        self._completed_queue = completed_queue

    def stop(self):
        """Ask the run() loop to finish after the current work() call."""
        self._continue = False

    def run(self):
        # Keep pulling work until stop() flips the flag, then clean up.
        while self._continue:
            self.work()
        self.exit()

    def work(self):
        """One unit of work; must be provided by the subclass."""
        raise NotImplementedError

    def exit(self):
        """Hook invoked once the run() loop terminates."""
        self._logger.info('Exiting.')

    @classmethod
    def initializer(cls, *extra, **extra_kw):
        """Return a subclass whose constructor pre-binds the given extra
        positional and keyword arguments."""
        class _WorkerThread(cls):
            def __init__(self, worker_id, task_queue, completed_queue):
                cls.__init__(self, worker_id, task_queue, completed_queue,
                             *extra, **extra_kw)
        return _WorkerThread