def test_simplify_headline(self):
    """Check `simplify_headline` against three representative headlines.

    The cases cover a headline with leading `<...>` source tags, a headline
    joined by ideographic spaces that is expected to come back unchanged,
    and a headline with a leading `【...】` label.
    """
    # This headline is expected to pass through untouched, so the same
    # string serves as both input and expected output.
    unchanged = IDEOGRAPHIC_SPACE.join(
        ['日経平均、一時8600円下回る',
         'TOPIXは年初来安値下回る',
         '円高とアジア株安で'])

    cases = [
        ('<NQN>◇<東証>三菱UFJが続伸 株高受けた買い戻し優勢',
         '三菱UFJが続伸 株高受けた買い戻し優勢'),
        (unchanged, unchanged),
        ('【要チェック画面】日銀追加緩和見送り',
         '日銀追加緩和見送り'),
    ]

    for raw, expected in cases:
        self.assertEqual(simplify_headline(raw), expected)
def test_simplify_headline():
    """Check `simplify_headline` against three representative headlines.

    The cases cover a headline with leading `<...>` source tags, a headline
    joined by ideographic spaces that is expected to come back unchanged,
    and a headline with a leading `【...】` label.
    """
    # This headline is expected to pass through untouched, so the same
    # string serves as both input and expected output.
    unchanged = IDEOGRAPHIC_SPACE.join(
        ['日経平均、一時8600円下回る',
         'TOPIXは年初来安値下回る',
         '円高とアジア株安で'])

    cases = (
        ('<NQN>◇<東証>三菱UFJが続伸 株高受けた買い戻し優勢',
         '三菱UFJが続伸 株高受けた買い戻し優勢'),
        (unchanged, unchanged),
        ('【要チェック画面】日銀追加緩和見送り',
         '日銀追加緩和見送り'),
    )

    for raw, expected in cases:
        actual = simplify_headline(raw)
        assert actual == expected
def update_headlines(session: Session, user_dict: Path, logger: Logger) -> None:
    """Fill in the derived columns of every headline whose `is_used` is unset.

    Loads the `Headline` rows for which `is_used` is `None`, simplifies each
    headline text, and records one update mapping per row:

    * rows that are not interesting, or whose categories do not contain
      `DOMESTIC_INDEX`, are marked `is_used = False`;
    * all other rows are tokenized and stored with `simple_headline`,
      `tokens`, `tag_tokens` and `is_used = True`.

    All mappings are written with one bulk update followed by a commit.
    The function returns without doing anything when no rows match.

    Args:
        session: Database session used for the query, the bulk update and
            the commit.
        user_dict: Path passed (as a string) to `Tokenizer`.
        logger: Receives the start/end progress messages.
    """
    pending = list(
        session.query(Headline).filter(Headline.is_used.is_(None)).all())
    if not pending:
        return

    tokenizer = Tokenizer(str(user_dict))
    updates = []

    logger.info('start updating headlines')
    for row in tqdm(pending):
        simplified = simplify_headline(row.headline)
        about_domestic_index = (
            row.categories is not None
            and DOMESTIC_INDEX in row.categories)

        # The `is_template` filter is intentionally not applied: using it
        # shrank the dataset and made the results worse.
        if is_interesting(simplified) and about_domestic_index:
            surfaces = [tok.surface for tok in tokenizer.tokenize(simplified)]
            tokens = kansuuzi2number(surfaces)
            tag_tokens = replace_prices_with_tags(tokens)
            updates.append({
                'article_id': row.article_id,
                'simple_headline': simplified,
                'tokens': tokens,
                'tag_tokens': tag_tokens,
                'is_used': True,
            })
        else:
            updates.append({
                'article_id': row.article_id,
                'is_used': False,
            })

    session.bulk_update_mappings(Headline, updates)
    session.commit()
    logger.info('end updating headlines')