def post_article():
    """Analyse a newly posted article and store it in the database.

    Called right after a new article is published. Expects a JSON request
    body with ``media_id``, ``items`` and ``update_time``; each item carries
    the per-article fields (title, content, thumb id, url, post user).

    :return: Flask response — ``{'code': 0, 'msg': 'success'}`` (HTTP 200)
        on success, or ``{'code': 103, ...}`` (HTTP 500) when a required
        request key is missing.
    """
    try:
        jdata = json.loads(request.data)
        media_id = jdata['media_id']
        items = jdata['items']
        update_time = jdata['update_time']
        # admin_id = jdata['admin_id']
        for item in items:
            article_title = item['article_title']
            article_content = item['article_content']
            article_thumb_id = item['article_thumb_id']
            # Encode before hashing: md5 requires bytes, and strings coming
            # out of json.loads are unicode.
            article_id = hashlib.md5(
                (media_id + article_thumb_id).encode('utf-8')).hexdigest()
            article_url = item['article_url']
            article_post_user = item['article_post_user']
            article_post_date = update_time
            a_topiclist = tagging_utils.passage_second_level_classify(article_content)
            # tag -> probability mapping for this article
            atags = {topic['topic_tag']: topic['topic_prob']
                     for topic in a_topiclist}
            article = Article(a_id=article_id, a_title=article_title,
                              post_user=article_post_user,
                              post_date=article_post_date, a_tags=atags,
                              a_url=article_url, a_content=article_content)
            DAO_utils.mongo_insert_article(article)
        resp = make_response(json.dumps({'code': 0, 'msg': 'success'}), 200)
    except KeyError as ke:  # 'except E, v' comma syntax is removed in Python 3
        print(ke)
        resp = make_response(
            json.dumps({'code': 103,
                        'msg': 'request key error, details=%s' % str(ke)}),
            500)
    # BUG FIX: the response was built but never returned, so the Flask view
    # would fail with "view function did not return a valid response".
    return resp
def tag_article_and_save(root_path):
    """Tag every article file under ``root_path`` and store it in MongoDB.

    Walks the immediate sub-directories of ``root_path``; each regular file
    inside them is expected to be JSON with ``post_content``, ``post_date``
    (``YYYY-MM-DD``), ``post_title`` and ``post_user`` keys.

    :param root_path: directory whose sub-directories hold article JSON files
    """
    for d in os.listdir(root_path):
        d_path = os.path.join(root_path, d)
        if not os.path.isdir(d_path):
            continue
        for f in os.listdir(d_path):
            f_path = os.path.join(d_path, f)
            if not os.path.isfile(f_path):
                continue
            print('processing %s ...' % f)
            # BUG FIX: use a context manager so the file handle is closed;
            # the original leaked it via open(...).read().
            with open(f_path) as fh:
                fjson = json.loads(fh.read())
            content = fjson['post_content']
            post_date = datetime.datetime.strptime(fjson['post_date'],
                                                   '%Y-%m-%d')
            post_title = fjson['post_title']
            post_user = fjson['post_user']
            # tag -> probability mapping for this article
            a_tags = {topic['topic_tag']: topic['topic_prob']
                      for topic in passage_second_level_classify(content)}
            # Encode before hashing: md5 requires bytes, and strings coming
            # out of json.loads are unicode.
            a_id = hashlib.md5(
                (post_title + fjson['post_date']).encode('utf-8')).hexdigest()
            article = Article.Article(a_id, post_title, post_user, a_tags,
                                      content, post_date)
            wechat_analyzer.DAO_utils.mongo_insert_article(article)
def classify_passage_boson_url():
    """Classify and keyword-tag the article behind a submitted URL.

    Reads ``url`` from the POSTed form, fetches the page content, then
    aggregates: BosonNLP category + keywords, jieba textrank and TF-IDF
    keywords, and the local second-level topic classifier.

    :return: Flask response with the combined classification JSON
        (``ensure_ascii=False`` so the Chinese labels stay readable).
    """
    url = request.form['url']
    content = get_content(url)
    classify_result = requests.post('http://bosonnlp.com/analysis/category',
                                    {'data': content}).content
    keyword_result = json.loads(
        requests.post('http://bosonnlp.com/analysis/key?top_k=%s' % 100,
                      {'data': content}).content)
    # BUG FIX: entry 8 was the only value missing the u'' prefix, mixing a
    # byte string into unicode values that are later dumped with
    # ensure_ascii=False (encode error risk on Python 2).
    class_dict = {0: u'体育', 1: u'教育', 2: u'财经', 3: u'社会', 4: u'娱乐',
                  5: u'军事', 6: u'国内', 7: u'科技', 8: u'互联网',
                  9: u'房产', 10: u'国际', 11: u'女人', 12: u'汽车',
                  13: u'游戏'}
    print(classify_result)
    # Raw string for the regex: '\d' as a plain literal is an invalid escape
    # (SyntaxWarning on modern Python).
    classify_result = int(re.compile(r'\d+').findall(classify_result)[0])
    jieba_textrank = jieba.analyse.textrank(content, topK=15)
    jieba_keywords = jieba.analyse.extract_tags(
        content, allowPOS=['n', 'vn', 'ns', 'v'], topK=15)
    topic_list = tagging_utils.passage_second_level_classify(content)
    resp = make_response(
        json.dumps({'code': 0, 'class': class_dict[classify_result],
                    'keyword': keyword_result,
                    'jieba_textrank': jieba_textrank,
                    'jieba_keywords': jieba_keywords,
                    'topic_list': topic_list}, ensure_ascii=False),
        200)
    return resp