def schedule_special_hibor():
    """Fetch Hibor (慧博) research news and insert it into the database."""
    session = Session(**database)
    news = api.special_hibor()
    for n in news:
        session.insert_one(n)
    logs.info("Hibor news imported into the database")
    print("Hibor news imported into the database")
    session.close()
def simple(_date):
    """Sample set: every news item saved on or after `_date`.

    :return: List[str] of concatenated title + abstract
    """
    session = Session(**database).session
    ret = session.query(
        News.title, News.abstract).filter(News.savedate >= _date).all()
    ret = [x[0] + x[1] for x in ret]
    session.close()
    return ret
from datetime import timedelta


def simple(_date):
    """Sample set: recent news titles and abstracts, reduced for comparison.

    :return: List[str]
    """
    session = Session(**database).session
    # Query the last day first; only widen the window to five days when
    # the narrow query comes back empty.
    ret = session.query(News.title, News.abstract).filter(
        News.savedate >= _date - timedelta(days=1)).all()
    if not ret:
        ret = session.query(News.title, News.abstract).filter(
            News.savedate >= _date - timedelta(days=5)).all()
    session.close()
    # Similarity.reduce is assumed to be the app's text-normalisation helper.
    ret = [Similarity.reduce(x[0] + x[1]) for x in ret]
    return ret
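# A minimal usage sketch (an assumption; the caller is not shown in this
# excerpt): build today's sample set for the similarity step.
from datetime import datetime

if __name__ == "__main__":
    samples = simple(datetime.now())
    print(f"{len(samples)} reduced samples loaded")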
from datetime import datetime


def schedule(website_name):
    session = Session(**database)
    web = website[website_name]
    for k, v in web.items():
        for section in v:
            logs.info(
                f"{datetime.now().strftime('%Y-%m-%d %H:%M')} "
                f"running task <{website_name} {section['section']}>"
            )
            # Crawl the first two pages of every section.
            for i in range(1, 3):
                section["page"] = i
                try:
                    news = getattr(api, website_name)(**section)
                except Exception as e:
                    logs.error(e)
                    break
                for n in news:
                    n = api.revise(n)
                    if n:
                        session.insert_one(n)
    session.close()
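# The `website` mapping is defined elsewhere in the app; a hypothetical
# shape, inferred only from how schedule() walks it (every name below is
# illustrative, not from the source):
#
#     website = {
#         "eastmoney": {                  # website_name
#             "news": [                   # k -> v: a list of section configs
#                 {"section": "stock"},   # "page" is injected per request
#                 {"section": "fund"},
#             ],
#         },
#     }
#
# schedule("eastmoney") would then call api.eastmoney(section="stock", page=1),
# api.eastmoney(section="stock", page=2), and so on for each section.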
def schedule_special_search_api():
    session = Session(**database)
    news = api.special_eastmoney_search_api()
    for n in news:
        session.insert_one(n)
    session.close()
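# Hedged driver sketch (an assumption: the actual scheduler/cron wiring is
# not shown in the source): refresh both special sources on a fixed interval.
import time


def run_special_jobs_forever(interval_seconds=1800):
    """Hypothetical loop calling the two jobs above every 30 minutes."""
    while True:
        schedule_special_hibor()
        schedule_special_search_api()
        time.sleep(interval_seconds)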
from jieba import lcut
from gensim.similarities import SparseMatrixSimilarity
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from app.config import database_remote as database
from app.database.insert import Session
from app.database.model import News

session = Session(**database).session
ret = session.query(News).filter(News.savedate >= "2019-10-28").all()
session.close()


def similar(aim):
    """Score `aim` against the corpus with TF-IDF cosine similarity."""
    aim_text = aim.title + aim.abstract
    simple = [x.title + x.abstract for x in ret[0:-10]]
    # Tokenise every document; the dictionary must be built from
    # per-document token lists, not a flattened set of tokens.
    texts = [lcut(x) for x in simple]
    dictionary = Dictionary(texts)
    length = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(tokens) for tokens in texts]
    tfidf = TfidfModel(corpus)
    tf_texts = tfidf[corpus]
    sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features=length)
    vector = dictionary.doc2bow(lcut(aim_text))
    tf_kw = tfidf[vector]
    similarities = sparse_matrix.get_similarities(tf_kw)
    return similarities
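# Minimal usage sketch (assumptions: `ret` holds more than ten rows, and
# numpy is available since gensim depends on it): rank the corpus against
# the most recent item and print the five closest matches.
import numpy as np

if ret:
    scores = similar(ret[-1])   # one score per document in ret[0:-10]
    for idx in np.argsort(scores)[::-1][:5]:
        print(round(float(scores[idx]), 3), ret[idx].title)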