def tGrocery(): outFile = open('testResult.tmp', 'w') [trainingSet, benchmark] = pickle.load(open('SampleSeg.pk')) testingSet = [] correctLabel = [] for i in xrange(len(benchmark)): print '%d out of %d' % (i, len(benchmark)) testingSet.append(benchmark[i][1]) correctLabel.append(benchmark[i][0]) grocery = Grocery('test') grocery.train(trainingSet) grocery.save() # load new_grocery = Grocery('test') new_grocery.load() Prediction = [] for i in xrange(len(testingSet)): print '%d out of %d' % (i, len(testingSet)) prediction = new_grocery.predict(testingSet[i]) Prediction.append(prediction) temp = correctLabel[i] + '<-->' + prediction + ' /x01' + testingSet[i] + '\n' outFile.write(temp) correct = 0 for i in xrange(len(Prediction)): print Prediction[i], correctLabel[i], if Prediction[i] == correctLabel[i]: correct += 1 print 'Correct' else: print 'False' print 'Correct Count:', correct print 'Accuracy: %f' % (1.0 * correct / len(Prediction))
def test_grocery(): grocery = Grocery('model_redian') grocery.train('trdata_4.txt') grocery.save() new_grocery = Grocery('model_redian') new_grocery.load() test_result = new_grocery.test('tedata_4.txt') print test_result.accuracy_labels print test_result.recall_labels test_result.show_result()
def test_main(self):
    """End-to-end check: train, save, reload, predict, then clean up."""
    grocery = Grocery(self.grocery_name)
    grocery.train(self.train_src)
    grocery.save()
    new_grocery = Grocery('test')
    new_grocery.load()
    # Bug fix: the assertions previously ran against the *trained*
    # instance, so the reloaded model was never actually exercised.
    assert new_grocery.get_load_status()
    assert new_grocery.predict('考生必读:新托福写作考试评分标准') == 'education'
    # cleanup
    if self.grocery_name and os.path.exists(self.grocery_name):
        shutil.rmtree(self.grocery_name)
def predict_test(model_path, data):
    """Classify each ';'-separated sentence in *data* with a saved model.

    Returns {'IsErr': bool, 'ErrDesc': unicode} on failure, or
    {'IsErr': False, 'ErrDesc': ..., 'data': [{'tag', 'sentence'}, ...]}.
    """
    # Load the model.
    try:
        model_path = os.path.join(BASE_DIR, 'learn', model_path)
        new_grocery = Grocery(model_path.encode('utf-8'))
        new_grocery.load()
    except Exception as e:
        return {'IsErr': True, 'ErrDesc': u'学习模型加载不成功,请检查路径'}
    # Normalize the input: drop a trailing empty piece from a final ';'.
    result = list()
    sentences = data.split(';')
    if sentences[-1] == '':
        sentences.pop()
    if len(sentences) == 0:
        return {'IsErr': True, 'ErrDesc': u'输入的句子结构有错误或没有数据'}
    # Tokenize, remove stop words, then classify each sentence.
    stop_words = read_lines(os.path.join(BASE_DIR, 'learn', 's_w.txt'))
    for s in sentences:
        # str.join avoids the quadratic cost of repeated += concatenation.
        tmp_s = ' '.join(word for word in jieba.cut(s)
                         if word not in stop_words)
        result.append({
            'tag': str(new_grocery.predict(tmp_s.strip().encode('utf-8'))),
            'sentence': s,
        })
    return {'IsErr': False, 'ErrDesc': u'成功', 'data': result}
def get_data(ids, b_date, end_data, log, stop_word):
    """Fetch comments in [b_date, end_data], run NLP on each, return dicts."""
    b_date = b_date.strftime('%Y-%m-%d')
    end_data = end_data.strftime('%Y-%m-%d')
    # Pull the comment frame from the configured data source.
    df = load_data(ids, b_date, end_data)
    df['RateDate'] = pd.to_datetime(df['RateDate'])
    res = list()
    log.info('Have %d comments need to process' % len(df))
    # Load the classification model once, outside the per-row loop.
    new_grocery = Grocery('sample2')
    new_grocery.load()
    for idx in range(len(df)):
        row = df.iloc[idx]
        # NLP pipeline: tokenized content, sentiment level and tag.
        content_sw, level, tag = nlp_process_with_sw(row, new_grocery, stop_word)
        res.append({
            'RateContent': json.dumps(content_sw, ensure_ascii=False),
            'RateDate': row['RateDate'],
            'TreasureID': row['TreasureID'],
            'Level': level,
            'Tag': tag,
            'Sentence': row['RateContent'],
        })
    return res
def train(): print 'train start '+'.'*30 #grocery=Grocery('sample') grocery=Grocery('version1.0') grocery.train(trainlist) grocery.save() print 'train end '+'.'*30
def test(test_path): new_grocery = Grocery('cv_' + str(fold) + '_model') #, custom_tokenize=segment) new_grocery.load() test_src = [] with open(test_path) as f: for line in f: label, text = line.strip().split("|text|") label = yiji_label[classify_dict[label]] test_src.append((label, text)) test_result = new_grocery.test(test_src) #print test_result #print test_result.accuracy_overall #accs = test_result.accuracy_labels recalls = test_result.recall_labels #print "Recall for each class: ", recalls predictlabels = test_result.predicted_y truelabels = test_result.true_y acc = accuracy_score(truelabels, predictlabels) macro_precision, macro_recall, macro_fscore, _ = precision_recall_fscore_support( truelabels, predictlabels, average='macro') print "Accuracy: ", acc, "Macro-average Precision:", macro_precision, "Macro-average Recall:", macro_recall, "Macro-average Fscore:", macro_fscore labellist = [ 'safe_and_stable', 'industrial_information', 'politics', 'culture_health', 'social_livelihood', 'economic_and_financial' ] precision, recall, fscore, _ = precision_recall_fscore_support( truelabels, predictlabels, average=None, labels=labellist) precisions = dict() recalls = dict() for idx, p in enumerate(precision): precisions[labellist[idx]] = p for idx, c in enumerate(recall): recalls[labellist[idx]] = c
def __init__(self, keyword): print '进行新闻分类' (db, cursor) = connectdb() cursor.execute("update task set status=1 where keyword=%s", [keyword]) cursor.execute("select id, title from news where keyword=%s", [keyword]) news = cursor.fetchall() new_grocery = Grocery('static/paris') new_grocery.load() for item in news: tag = new_grocery.predict(item['title']) if tag == '新闻背景': tag = 1 elif tag == '事实陈述': tag = 2 elif tag == '事件演化': tag = 3 elif tag == '各方态度': tag = 4 elif tag == '直接关联': tag = 6 elif tag == '暂无关联': tag = 7 cursor.execute("update news set tag=%s where id=%s", [tag, item['id']]) closedb(db, cursor) return
def train_compare_result(train_src, test_src): grocery = Grocery('test') grocery.train(train_src) print grocery.get_load_status() len_test = len(test_src) print len_test Predict_num = 0 History = [] for test in test_src: Predict_result = { 'predict_title': test[1], 'predict_class': None, 'true_class': None } predict_title = Predict_result['predict_title'] predict_result = grocery.predict(predict_title) Predict_result['predict_class'], Predict_result['true_class'] = test[ 0], predict_result if str(predict_result) == str(test[0]): # print 'prediction is True' Predict_num += 1 History.append(Predict_result) # print 'prediction is False' predict_precision = float(Predict_num) / len_test return predict_precision, History
def test_main(self):
    """Train, save, reload and verify predictions; remove artifacts after."""
    grocery = Grocery(self.grocery_name)
    grocery.train(self.train_src)
    grocery.save()
    new_grocery = Grocery('test')
    new_grocery.load()
    # Bug fix: the checks below previously ran on the trained instance,
    # so the reloaded model was never actually exercised.
    assert new_grocery.get_load_status()
    result = new_grocery.predict('just a testing')
    print(result)
    result = new_grocery.predict('考生必读:新托福写作考试评分标准')
    print(result)
    print("type of result is :", type(result))
    assert str(new_grocery.predict('考生必读:新托福写作考试评分标准')) == 'education'
    assert str(new_grocery.predict('法网')) == 'sports'
    # cleanup
    if self.grocery_name and os.path.exists(self.grocery_name):
        shutil.rmtree(self.grocery_name)
def labelmaker(self):
    """Return [best_label, confidence] for self.shorttext."""
    classifier = Grocery('11c_20k_20171226')
    classifier.load()
    # Pick the (label, decision value) pair with the highest confidence;
    # max() returns the first maximum, matching the old stable descending
    # sort followed by [0].
    scores = classifier.predict(self.shorttext).dec_values.items()
    best_label, confidence = max(scores, key=lambda pair: pair[1])
    return [best_label, confidence]
def __load_model__():
    # Lazily initialize the shared classifier: train it on first use,
    # otherwise re-create the 'Classify' Grocery and load the saved model.
    if not Classify.__MODEL_LOADED__:
        Classify.__MODEL_LOADED__ = True
        Classify.__train__model__()
    else:
        # NOTE(review): this only reloads when __MODEL__ is already truthy;
        # 'if not Classify.__MODEL__' looks like the intended guard (load
        # when the model is missing) — confirm before changing.
        if Classify.__MODEL__:
            Classify.__MODEL__ = Grocery('Classify')
            Classify.__MODEL__.load()
def __init__(self, *args, **kwargs):
    """Build a Grocery classifier configured via keyword arguments.

    Expected kwargs: grocery_name, method ('normal' | 'jieba' |
    'processed'), train_src.
    """
    self.grocery_name = str(kwargs["grocery_name"])
    method = str(kwargs["method"])
    train_src = str(kwargs["train_src"])
    self.PREFIX = conf.load("predict_label")["prefix"]
    self.MODEL_DIR = conf.load("predict_label")["model_dir"]
    self.kwargs = kwargs
    if method == "jieba":
        # Default tokenizer.
        self.grocery = Grocery(self.grocery_name)
    elif method == "normal":
        # Keyword extractor plus the custom tokenizer.
        self.key_ext = keyExt()
        self.grocery = Grocery(self.grocery_name,
                               custom_tokenize=self._custom_tokenize)
    elif method == "processed":
        # Same custom tokenizer, but no keyword extractor.
        self.grocery = Grocery(self.grocery_name,
                               custom_tokenize=self._custom_tokenize)
def phgrocery(text):
    """Predict *text* with 'model_redian_5' and return the label as an int."""
    classifier = Grocery('model_redian_5')
    classifier.load()
    prediction = classifier.predict(text)
    return int(prediction.predicted_y)
def tgrocery_train(train_data, test_data):
    """Train a TextGrocery model, reload it and predict the test corpus.

    Returns (test_corpus, test_label, predict_label).
    """
    print("训练语料总数为: " + str(len(train_data)))
    test_corpus, test_label = test_split(test_data)
    model = Grocery('TextGrocery')
    print("start training......")
    model.train(train_data)
    model.save()
    # Reload the persisted model before predicting.
    loaded = Grocery('TextGrocery')
    loaded.load()
    # Stringify each prediction result object.
    predict_label = [str(loaded.predict(sample)) for sample in test_corpus]
    return test_corpus, test_label, predict_label
def __train__model__():
    """Train the 'Classify' model from the Excel corpus and cache it."""
    sheet = pd.read_excel(Classify.__FILE_PATH__)
    labeled = sheet[[u'类型', u'释义']]
    # Each row becomes a (label, text) training pair.
    samples = [(row[0], row[1]) for row in labeled.values]
    model = Grocery('Classify')
    model.train(samples)
    model.save()
    Classify.__MODEL__ = model
def predict_phrasing(self, text=u'曾被年轻人嫌弃,如今却媲美Zara'):
    """Load self.model_name and return the 'postive' decision value for *text*."""
    classifier = Grocery(self.model_name)
    classifier.load()
    prediction = classifier.predict(text)
    # Key spelling matches the label used at training time; do not "fix" it.
    return prediction.dec_values[u'postive']
def __init__(self):
    """Load vocabularies, the sentence classifier and the regexes used to
    parse job-description (JD) text into demand/duty/other sections."""
    self.degreedic = set( line.strip() for line in codecs.open('./data/degrees.txt','rb','utf-8'))  # degree vocabulary
    self.majordic =set( line.strip() for line in codecs.open('./data/majordic.txt','rb','utf-8'))  # major vocabulary
    self.citydic = set( line.strip() for line in codecs.open("./data/citydic.txt",'rb','utf-8'))  # city vocabulary
    self.firmnames =set( line.strip() for line in codecs.open('./data/firm.txt','rb','utf-8'))  # company-abbreviation vocabulary
    self.jobdic = set(line.strip() for line in codecs.open('./data/jobposition.txt','rb','utf-8') )  # job-title vocabulary
    self.skills = set( line.strip() for line in codecs.open('./data/skills.txt','rb','utf-8'))
    # self.wordlisttf = pickle.load(open('./data/wordlist.pkl'))  # 2000 most frequent words
    # self.w2vdict = json.load(open('./data/word2vec_50.json'))  # word2vec vectors for those 2000 words
    self.clf = Grocery("jdclf")  # sentence classifier: demand / duty / other
    self.clf.load()
    # Field-extraction regexes (gender, age, degree, major, experience...).
    self.SEX = re.compile(u"性别不限|性别|男|女")
    self.AGE = re.compile(u"\d+周?岁|年龄")
    self.DEGREE = re.compile(u"(全日制)?(初中|高中|中专|大专|专科|大学专科|中职|本科|大学本科|硕士|研究生|博士|博士后)(.?以上)?")
    self.MAJOR = re.compile(u"\S+(相关专业|专业优先|及其.专业|[类等]专业[优先]?)")
    self.EXP = re.compile(u"工作经验:|工作经[历验]|工作年限|年.{0,4}经[历验]|经[历验].{1,6}年")
    self.PUB_TIME = re.compile(u"(\d+)(天前发布)")
    # Company-name detection plus a blacklist of non-company phrases.
    self.INCNAME = re.compile(u"\S+(有限公司|酒店|银行|集团|研究中心|研究所|学校|旅行社|分?公司|研发中心|技术部|事.部|招聘)")
    self.NOT_INC = re.compile(u"职位|描述|收藏|推荐|地址|邮箱|主页|介绍|欢迎|加入|要求|简介|险一金|奖金|包吃住|社区|厂房|人员|职责")
    self.INCTAG = re.compile(u"大公司|五百强|全球500强|小公司|成长型公司|创业公司|私有经济|集体经济|集团|外企|已上市|稳定性高|平均年龄\d+岁|妹纸多|学历高|福利待遇好|晋升机会大|民营公司|民营企业|互联网|创业型|国企|央企")
    self.JOBNAME = re.compile(u'\S*(研发工程师|工程师|经理|助理|顾问|前台|秘书|主管|研究员|实习生|操作员|专员|教学人员|技术人员|管理员|业务员|公关|程序员|教师|老师|培训生|\ 文员|研究员|策划|主任|总监|设计师|分析师|架构师|摄影师|编辑|BD|游戏UI|Android(开发)?|PHP(开发)?|Python(开发)?|.?(急招|急聘|初级|中级|高级|方向).?[\s)】\)])|\ |行政人事|网店设计|客服|会计|电话销售|外贸跟单|web前端|游戏UI|后.开发|产品运营|商业数据分析')
    # Section-boundary regexes: where the "requirements" block starts ...
    self.START_DEMAND = re.compile(u"(岗位要求|应聘条件|任职要求|岗位资格|任职资格|岗位条件|工作要求|任职条件|人员条件|职位.求|职位条件|职位描述|岗位资格|职位资格|具备条件)[::\s]\ |如果你.{0,10}[::\s]|我们希望你.{0,12}[::\s]|(要求|条件)[::\s]|你需要?具备什么.+[?\?::\s]|任职资格[::\s]")
    # ... cue words inside requirement sentences ...
    self.DEMAND = re.compile(u"熟悉|熟练|具有|善于|懂得|掌握|具备|能够|优先|不少于|不超过|至少|团队.作|良好的|工作经验|开发经验|实习经历|能力强|富有|以上学历|经验|喜欢|\ 较强的.{2,8}能力|相关专业|相关学历|者优先|精通|了解|及以上|技术全面|.强的责任心|[能有]独立|英文流利")
    # ... cue words inside duty sentences and where the duty block starts.
    self.DUTY = re.compile(u"跟进|协助|负责|配合|其他工作|领导交办的|对.+提供|审核|参与|提出|跟踪|报告|为.+提出|日常.+工作|指导|对.+进行|为.+提供|跟进|拓展|运营|用户|客户|协调|拟写|通过|协同|完成|沟通|需求|秘书.{2,5}翻译")
    self.START_DUTY = re.compile(u"(岗位职责|岗位描述|职位描述|职责描述|任职描述|职位职责|工作职责|工作职能|职位职能|工作内容|实习内容|职位内容)[::\s]|做这样的事[::\s]|职责.{0,5}[::\s]")
    # Salary and benefit phrases.
    self.PAY = re.compile(u"薪酬|待遇|月薪|薪资|年薪|底薪|\d+k|\d+万|\d+元|工资|报酬|薪水|福利")
    self.BENEFIT = re.compile(u"周休|补助|补贴|假日|餐补|提成|交通补助|食宿|加班工资|期权|年假|领导|扁平化|管理|氛围|空间|休假|月假|带薪|全休|晋升|培训|舒适的|旅游|奖励|过节费|五险一金|奖金|\ |弹性工作|氛围|成长空间|实训|培训|高薪|前景|旅游|活动|分红")
    # Splitting a multi-position JD and cleaning list markers/punctuation.
    self.SPLIT_JD = re.compile(u"岗位[【(]?[一二三四五六七八九][】)][::\s]|(^招聘岗位\S+|岗位\d|岗位[一二三四五六])[::\s]")
    self.CLEAR_NUM = re.compile(u"^\d[\.: :。、]|^[\((【]?\d[\))】\.]")
    self.CLEAR_COLO = re.compile(u"^[\s\.。)(【】,,]|[。;,\.;,]$|^\d[\.]")
    self.SKILL = re.compile(u"精通|了解|熟练|熟悉|掌握|懂得|优先|具备|具有|者优先|擅长|善于|较强的.{2,6}能力|良好的|有.+经验|能力|极强的")
    # Teach jieba the domain vocabularies so segmentation keeps them intact.
    jieba.load_userdict('./data/majordic.txt')
    jieba.load_userdict('./data/skills.txt')
    jieba.load_userdict('./data/firm.txt')
    jieba.load_userdict('./data/degrees.txt')
    jieba.load_userdict('./data/benefits.txt')
    self.jdStr = ""  # raw JD text currently being parsed
    self.linelist = []  # JD split into individual lines
    self.lineindex = defaultdict(int)  # per-line bookkeeping counters
    self.result = OrderedDict()  # extracted fields, in insertion order
def sentiment_train(gro_name, train_set):
    """Train a tgrocery SVM classifier and persist it.

    :param gro_name: model name handed to Grocery
    :param train_set: training samples accepted by Grocery.train
    :return: None
    """
    classifier = Grocery(gro_name)
    classifier.train(train_set)
    print("Is trained? ", classifier.get_load_status())
    classifier.save()
def train(train_origin_path, fold):
    """Train this fold's classifier from a "<label>|text|<content>" file."""
    model = Grocery('cv_' + str(fold) + '_model')
    # Map each raw label to its level-1 label before training.
    samples = []
    with open(train_origin_path) as fin:
        for raw in fin:
            label, text = raw.strip().split("|text|")
            samples.append((yiji_label[classify_dict[label]], text))
    model.train(samples)
    model.save()
def train_phrasing_and_save(self, trainsets=all):
    """Train self.model_name on *trainsets* and persist it.

    :param trainsets: training samples accepted by Grocery.train
    :return: True on success, False on any training/saving failure.

    NOTE(review): the default ``trainsets=all`` binds the *builtin*
    ``all`` function — almost certainly unintended; kept only for
    interface compatibility.  Callers should pass an explicit set.
    """
    try:
        grocery = Grocery(self.model_name)
        grocery.train(trainsets)
        grocery.save()
        return True
    except Exception:
        # Bug fix: the bare ``except:`` also swallowed SystemExit and
        # KeyboardInterrupt; catch only ordinary exceptions.
        return False
def __init__(self):
    """Prepare the JD parser: sentence classifier plus extraction regexes."""
    self.data = []
    self.clf = Grocery("jdclf")  # sentence classifier: demand / duty / other
    self.clf.load()
    # Field-extraction regexes (gender, age, degree, major, experience...).
    self.SEX = re.compile(u"性别不限|性别|男|女")
    self.AGE = re.compile(u"\d+周?岁|年龄")
    self.DEGREE = re.compile( u"(全日制)?(初中|高中|中专|大专|专科|大学专科|中职|本科|大学本科|硕士|研究生|博士|博士后)(.?以上)?")
    self.MAJOR = re.compile(u"\S+(相关专业|专业优先|及其.专业|[类等]专业[优先]?)")
    self.EXP = re.compile(u"工作经验:|工作经[历验]|工作年限|年.{0,4}经[历验]|经[历验].{1,6}年")
    self.PUB_TIME = re.compile(u"(\d+)(天前发布)")
    # Company-name detection plus a blacklist of non-company phrases.
    self.INCNAME = re.compile( u"\S+(有限公司|酒店|银行|集团|研究中心|研究所|学校|旅行社|分?公司|研发中心|技术部|事.部|招聘)")
    self.NOT_INC = re.compile( u"职位|描述|收藏|推荐|地址|邮箱|主页|介绍|欢迎|加入|要求|简介|险一金|奖金|包吃住|社区|厂房|人员|职责")
    self.INCTAG = re.compile( u"大公司|五百强|全球500强|小公司|成长型公司|创业公司|私有经济|集体经济|集团|外企|已上市|稳定性高|平均年龄\d岁|妹纸多|学历高|福利待遇好|晋升机会大|民营公司|民营企业\ |互联网|创业型|国企|央企")
    self.JOBNAME = re.compile( u'\S*(研发工程师|工程师|经理|助理|顾问|前台|秘书|主管|研究员|实习生|操作员|专员|教学人员|技术人员|管理员|业务员|公关|程序员|教师|老师|培训生|\ 文员|研究员|策划|主任|总监|设计师|分析师|架构师|摄影师|编辑|BD|游戏UI|Android(开发)?|PHP(开发)?|Python(开发)?|.?(急招|急聘|初级|中级|高级|方向).?[\s)】\)])|\ |行政人事|网店设计|客服|会计|电话销售|外贸跟单|web前端|游戏UI|后.开发|产品运营|商业数据分析' )
    # Section-boundary regexes and cue words for requirement/duty sentences.
    self.START_DEMAND = re.compile( u"(岗位要求|应聘条件|任职要求|岗位资格|任职资格|岗位条件|工作要求|任职条件|人员条件|职位.求|职位条件|职位描述|岗位资格|职位资格|具备条件)[::\s]\ |如果你.{0,10}[::\s]|我们希望你.{0,12}[::\s]|(要求|条件)[::\s]|你需要?具备什么.+[?\?::\s]|任职资格[::\s]" )
    self.DEMAND = re.compile( u"熟悉|熟练|具有|善于|懂得|掌握|具备|能够|优先|不少于|不超过|至少|团队.作|良好的|工作经验|开发经验|实习经历|能力强|富有|以上学历|经验|喜欢|\ 较强的.{2,8}能力|相关专业|相关学历|者优先|精通|了解|及以上|技术全面|.强的责任心|[能有]独立|英文流利" )
    self.DUTY = re.compile( u"跟进|协助|负责|配合|其他工作|领导交办的|对.+提供|审核|参与|提出|跟踪|报告|为.+提出|日常.+工作|指导|跟进|拓展|运营|用户|客户|协调|拟写|通过|协同\ |完成|沟通|需求|秘书.{2,5}翻译")
    self.START_DUTY = re.compile( u"(岗位职责|岗位描述|职位描述|职责描述|任职描述|职位职责|工作职责|工作职能|职位职能|工作内容|实习内容|职位内容)[::\s]|做这样的事[::\s]|职责.{0,5}[::\s]" )
    # Salary and benefit phrases.
    self.PAY = re.compile(u"薪酬|待遇|月薪|薪资|年薪|底薪|\d+k|\d+万|\d+元|工资|报酬|薪水|福利")
    self.BENEFIT = re.compile( u"周休|补助|补贴|假日|餐补|提成|交通补助|食宿|加班工资|期权|年假|领导|扁平化|管理|氛围|空间|休假|月假|带薪|全休|晋升|培训|舒适的|旅游|奖励|过节费|五险一金|奖金|\ |弹性工作|氛围|成长空间|实训|培训|高薪|前景|旅游|活动|分红")
def demo_flask(image_file):
    """OCR a business-licence image and extract the company name and id.

    Returns (output_file, id_total, name_total) where output_file is the
    path of the annotated image written under ./result.
    """
    # result_text = []
    # Build the text classifier by wiring a GroceryTextModel into a
    # Grocery instance manually (instead of Grocery.load()).
    grocery = Grocery('NameIdAdd_NLP')
    model_name = grocery.name
    text_converter = None
    tgm = GroceryTextModel(text_converter, model_name)
    tgm.load(model_name)
    grocery.model = tgm
    t = time.time()
    result_dir = './result'
    # Run the OCR pipeline and save the annotated image.
    image = np.array(Image.open(image_file).convert('RGB'))
    result, image_framed = ocr_whole.model(image)
    output_file = os.path.join(result_dir, image_file.split('/')[-1])
    Image.fromarray(image_framed).save(output_file)
    name_total = ''
    id_total = ''
    for key in result:
        string1 = result[key][1]
        # Very short fragments carry no useful field information.
        if len(string1) <= 8:
            continue
        # Strip whitespace and (CJK) punctuation before classification.
        string2 = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*{}[]+", "", string1)
        # Digit count uses the GBK encoding of the string — presumably to
        # count ASCII digit bytes only; TODO confirm.
        no_digit = len(list(filter(str.isdigit, string2.encode('gbk'))))
        no_alpha = len(list(filter(is_alphabet, string2)))
        # Rule-based shortcuts first; fall back to the classifier.
        if len(set('法定代表人') & set(string2)) >= 2 or len(set('经营范围') & set(string2)) >= 2 or '资本' in string2 or '类型' in string2 or len(set('年月日') & set(string2)) >= 2 or len(set('登记机关') & set(string2)) >= 2 or '电话' in string2:
            predict_result = 'others'
        elif len(set('经营场所') & set(string2)) >= 3 or '住所' in string2 or len(set('营业场所') & set(string2)) >= 3:
            predict_result = 'company-address'
        elif len(set('统一社会信用代码') & set(string2)) >= 2 or ((no_digit+no_alpha) / len(string2) > 0.5 and no_digit > 8):
            predict_result = 'company-id'
        elif '名称' in string2:
            predict_result = 'company-name'
        else:
            predict_result = grocery.predict(string2)
        # First company-name hit ends the scan; ids accumulate.
        if str(predict_result) == 'company-name':
            name_total += string1
            break
        elif str(predict_result) == 'company-id':
            id_total += string1
        else:
            continue
    # Keep only word characters in the id; post-process the name.
    id_total = re.sub(r'\W', '', id_total)
    name_total = stupid_revise(name_total)
    print("Mission complete, it took {:.3f}s".format(time.time() - t))
    print('\nRecongition Result:\n')
    print(id_total)
    print(name_total)
    return output_file, id_total, name_total
def learn_model(file_name):
    """Train a model from an Excel corpus and report held-out accuracy.

    The rows are split 3:2 into train/test files, a Grocery model is
    trained and saved, then evaluated via test_sample.
    """
    path = os.path.join(BASE_DIR, 'learn', file_name)
    try:
        df = pd.read_excel(path)
    except Exception as e:
        return {'IsErr': True, 'ErrDesc': u'找不到文档或者读取文档出错'}
    try:
        # Drop rows with missing values, then normalize each comment row.
        df = df.dropna(axis=0)
        df = df.apply(split_comment, axis=1)
    except Exception as e:
        return {'IsErr': True, 'ErrDesc': u'文档格式有误,应包含Tag(标签名字),Comment(评价内容)'}
    try:
        # 3:2 train/test split (Python 2 integer division is intentional).
        len_learn = len(df) / 5 * 3
        # Write out the train and test files.
        learn_file_name, test_file_name = output_file(file_name, df, len_learn)
        tmp_learn_name = os.path.join(BASE_DIR, 'learn', 'model_' + learn_file_name.split('.')[0])
        model = Grocery(tmp_learn_name.encode('utf-8'))
        train_path = os.path.join(BASE_DIR, 'learn', learn_file_name)
        model.train(train_path.encode('utf-8'))
        model.save()
    except Exception as e:
        return {'IsErr': True, 'ErrDesc': u'学习不成功,没有生产新的模型,请再次尝试。'}
    # Evaluate on the held-out file.
    res = test_sample(tmp_learn_name, test_file_name)
    return {
        'IsErr': False,
        'ErrDesc': u'成功生产新的模型,测试验证的正确率为%s, 模型保存为:%s' % (res, os.path.split(tmp_learn_name)[1])
    }
# coding=utf-8
from tgrocery import Grocery

# Train a demo classifier on a small in-memory corpus.
sample_grocery = Grocery('sample')
train_src = [
    ('education', '名师指导托福语法技巧:名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与'),
]
sample_grocery.train(train_src)
sample_grocery.save()

# Reload the persisted model under the same name and exercise it.
new_grocery = Grocery('sample')
new_grocery.load()
print(
    new_grocery.predict(
        'Abbott government spends $8 million on higher education media blitz'))

test_src = [
    ('education', '福建春季公务员考试报名18日截止 2月6日考试'),
    ('sports', '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'),
]
print("start test..................")
print(new_grocery.test(test_src))
# coding: utf-8
from tgrocery import Grocery

# Train from an in-memory list of (label, text) pairs.
list_grocery = Grocery('test')
train_src = [
    ('education', '名师指导托福语法技巧:名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与'),
]
list_grocery.train(train_src)
print(list_grocery.get_load_status())
predict_result = list_grocery.predict('考生必读:新托福写作考试评分标准')
print(predict_result)
print(predict_result.dec_values)

# Train a second model directly from a text file.
file_grocery = Grocery('read_text')
file_grocery.train('../text_src/train_ch.txt')
print(file_grocery.get_load_status())
predict_result = file_grocery.predict('考生必读:新托福写作考试评分标准')
print(predict_result)
print(predict_result.dec_values)
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.path.append('../../')
from config import *
from tgrocery import Grocery

STOP_WORDS_FILE = 'stopwords.txt'
USER_DICT_FILE = 'user_dict.txt'

# Text classifier used downstream for financial-text tagging.
model_fintext = Grocery('model_fintext')
model_fintext.load()

sys.path.append('../')
from get_es import *

es = Elasticsearch([{'host': ES_HOST, 'port': ES_PORT}])


def search(index_name):
    """Run the preset ES query against *index_name* and return the raw hits."""
    options = set_search_optional()
    return get_search_result(options, index=index_name)


def get_result_list(es_result):
    """Extract the _source payload from every ES hit."""
    return [item['_source'] for item in es_result]
#!/usr/bin/env python # coding=utf-8 from tgrocery import Grocery #grocery = Grocery('age56') #grocery.train('train4_age_56', ' ') #grocery.save() new_grocery = Grocery("age") new_grocery.load() predict_result = new_grocery.test('test4_age', ' ') #print len(predict_result.true_y) #for i in range(len(predict_result.predicted_y)): #print predict_result.predicted_y[i] print predict_result predict_result.show_result()
# -*- coding:utf-8 -*- import sys reload(sys) sys.path.append('../../') from config import * from es import es214 as es from elasticsearch import Elasticsearch from elasticsearch.exceptions import TransportError from elasticsearch.helpers import bulk from tgrocery import Grocery model_fintext = Grocery('../fintext_classify/model_fintext') model_fintext.load() def match_topic_kw(news_id, keywords_list, source, doc_type, size=10000): result = [] keyword_str = ''.join(keywords_list) # 通过一组关键词查找相关文本 query_body = { "query": { "match": { "content": keyword_str #这个可能还得改,争取用一个list } }, "size": size } # print keyword_str es_result = es.search(index=source, doc_type=doc_type, body=query_body, request_timeout=400)
# coding: utf-8 from tgrocery import Grocery # save grocery = Grocery('test') train_src = [('education', '名师指导托福语法技巧:名词的复数形式'), ('education', '中国高考成绩海外认可 是“狼来了”吗?'), ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'), ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')] grocery.train(train_src) grocery.save() # load # grocery name must be the same as the previous one new_grocery = Grocery('test') new_grocery.load() print new_grocery.predict('考生必读:新托福写作考试评分标准')