def tencent_classify_rawtext_files(files_root_path, result_path, pass_num=-1): count = 0 flist = os.listdir(files_root_path) for f in flist: print '%s:%s' % (count, f) count += 1 if count < pass_num: continue ftext = codecs.open(os.path.join(files_root_path, f), 'r', encoding='utf8').read() try: # json_obj = json.loads(ftext) ftext = ftext.replace('\n', '') ftext = ftext.replace(' ', '') refined_text = wenzhi_utils.remove_illegal_characters(ftext) result = wenzhi_utils.wenzhi_analysis(refined_text) # result = tencent_classify(ftext) except Exception, e: # 懒得差各种异常了,直接重复 print e continue if result['code'] == 0: for class_type in result['classes']: if class_type['conf'] > 0.5: try: fout = codecs.open(os.path.join(result_path, class_type['class'], f + '.txt'), 'w') except IOError, e: print e os.mkdir(os.path.join(result_path, class_type['class'])) fout = codecs.open(os.path.join(result_path, class_type['class'], f + ".txt"), 'w') except KeyError, ke: print ke continue fout.write(refined_text)
def analyzse_article():
    """Standalone article-analysis endpoint.

    Reads a JSON request body of the form {'article_content': <text>},
    runs the Wenzhi classifier on the content, and returns a JSON string
    {'code': 0, 'tag_result': [{'tag': <class>, 'prob': <conf>}, ...]}.
    When the classifier reports a non-zero code, tag_result is empty.

    :return: JSON-encoded response string (non-ASCII preserved)
    """
    req_data = json.loads(request.data)
    # fix: dropped the duplicate, unused `content_list` assignment that
    # read the same key a second time.
    article_content = req_data.get('article_content')
    result = wenzhi_utils.wenzhi_analysis(article_content)
    tag_result = []
    if result['code'] == 0:
        # Map the API's class/conf pairs into our tag/prob schema.
        tag_result = [{'tag': item['class'], 'prob': item['conf']}
                      for item in result['classes']]
    return json.dumps({'code': 0, 'tag_result': tag_result}, ensure_ascii=False)