def __init__(self, keyword): print '进行新闻分类' (db, cursor) = connectdb() cursor.execute("update task set status=1 where keyword=%s", [keyword]) cursor.execute("select id, title from news where keyword=%s", [keyword]) news = cursor.fetchall() new_grocery = Grocery('static/paris') new_grocery.load() for item in news: tag = new_grocery.predict(item['title']) if tag == '新闻背景': tag = 1 elif tag == '事实陈述': tag = 2 elif tag == '事件演化': tag = 3 elif tag == '各方态度': tag = 4 elif tag == '直接关联': tag = 6 elif tag == '暂无关联': tag = 7 cursor.execute("update news set tag=%s where id=%s", [tag, item['id']]) closedb(db, cursor) return
def get_data(ids, b_date, end_data, log, stop_word):
    """Fetch comments between the two dates, run the NLP step on each row,
    and return a list of result dicts ready for persistence."""
    b_date = b_date.strftime('%Y-%m-%d')
    end_data = end_data.strftime('%Y-%m-%d')
    # Data source: the database loader (Excel-based loaders existed upstream).
    df = load_data(ids, b_date, end_data)
    df['RateDate'] = pd.to_datetime(df['RateDate'])
    results = list()
    log.info('Have %d comments need to process' % len(df))
    # Classification model consumed by the NLP step.
    classifier = Grocery('sample2')
    classifier.load()
    for idx in range(len(df)):
        row = df.iloc[idx]
        # Tokenize/score the comment with stop words removed.
        content_sw, level, tag = nlp_process_with_sw(row, classifier, stop_word)
        results.append({
            'RateContent': json.dumps(content_sw, ensure_ascii=False),
            'RateDate': row['RateDate'],
            'TreasureID': row['TreasureID'],
            'Level': level,
            'Tag': tag,
            'Sentence': row['RateContent'],
        })
    return results
def predict_test(model_path, data):
    """Predict a tag for each ';'-separated sentence in *data* using the
    saved model under learn/<model_path>.

    Returns {'IsErr': bool, 'ErrDesc': unicode[, 'data': list]} where each
    data item is {'tag': str, 'sentence': original sentence}.
    """
    # Load the trained model; any failure is reported, never raised.
    try:
        model_path = os.path.join(BASE_DIR, 'learn', model_path)
        new_grocery = Grocery(model_path.encode('utf-8'))
        new_grocery.load()
    except Exception:
        return {'IsErr': True, 'ErrDesc': u'学习模型加载不成功,请检查路径'}
    # Split the input; a trailing ';' yields one empty sentence, dropped here.
    sentences = data.split(';')
    if sentences and sentences[-1] == '':
        sentences.pop()
    if not sentences:
        return {'IsErr': True, 'ErrDesc': u'输入的句子结构有错误或没有数据'}
    # A set gives O(1) stop-word membership instead of scanning a list per word.
    stop_words = set(read_lines(os.path.join(BASE_DIR, 'learn', 's_w.txt')))
    result = []
    for s in sentences:
        # Segment, drop stop words, feed space-joined tokens to the model
        # (join replaces the original quadratic '+=' concatenation).
        kept = [word for word in jieba.cut(s) if word not in stop_words]
        tmp_s = ' '.join(kept)
        result.append({
            'tag': str(new_grocery.predict(tmp_s.encode('utf-8'))),
            'sentence': s,
        })
    return {'IsErr': False, 'ErrDesc': u'成功', 'data': result}
def test(test_path): new_grocery = Grocery('cv_' + str(fold) + '_model') #, custom_tokenize=segment) new_grocery.load() test_src = [] with open(test_path) as f: for line in f: label, text = line.strip().split("|text|") label = yiji_label[classify_dict[label]] test_src.append((label, text)) test_result = new_grocery.test(test_src) #print test_result #print test_result.accuracy_overall #accs = test_result.accuracy_labels recalls = test_result.recall_labels #print "Recall for each class: ", recalls predictlabels = test_result.predicted_y truelabels = test_result.true_y acc = accuracy_score(truelabels, predictlabels) macro_precision, macro_recall, macro_fscore, _ = precision_recall_fscore_support( truelabels, predictlabels, average='macro') print "Accuracy: ", acc, "Macro-average Precision:", macro_precision, "Macro-average Recall:", macro_recall, "Macro-average Fscore:", macro_fscore labellist = [ 'safe_and_stable', 'industrial_information', 'politics', 'culture_health', 'social_livelihood', 'economic_and_financial' ] precision, recall, fscore, _ = precision_recall_fscore_support( truelabels, predictlabels, average=None, labels=labellist) precisions = dict() recalls = dict() for idx, p in enumerate(precision): precisions[labellist[idx]] = p for idx, c in enumerate(recall): recalls[labellist[idx]] = c
def __init__(self, keyword): print '进行新闻分类' (db, cursor) = connectdb() cursor.execute("update task set status=1 where keyword=%s", [keyword]) cursor.execute("select id, title from news where keyword=%s",[keyword]) news = cursor.fetchall() new_grocery = Grocery('static/paris') new_grocery.load() for item in news: tag = new_grocery.predict(item['title']) if tag == '新闻背景': tag = 1 elif tag == '事实陈述': tag = 2 elif tag == '事件演化': tag = 3 elif tag == '各方态度': tag = 4 elif tag == '直接关联': tag = 6 elif tag == '暂无关联': tag = 7 cursor.execute("update news set tag=%s where id=%s", [tag, item['id']]) closedb(db, cursor) return
def labelmaker(self):
    """Predict the best label for self.shorttext.

    Returns [label, confidence] for the highest-confidence class.
    """
    grocery = Grocery('11c_20k_20171226')
    grocery.load()
    dec_values = grocery.predict(self.shorttext).dec_values
    # max(key=...) replaces the Py2-only sorted(cmp=..., reverse=True)[0]
    # idiom: no full sort, and no removed-in-Py3 `cmp` builtin.
    label, confidence = max(dec_values.items(), key=lambda kv: kv[1])
    return [label, confidence]
def GET(self, name):
    """Classify the request path segment *name* and return the prediction."""
    # An earlier revision fetched the page and classified its <title>;
    # now the raw path segment itself is classified.
    headline = name.encode('utf-8')
    model = Grocery('sample')
    model.load()
    return model.predict(headline)
def __train__model__():
    """Train the 'Classify' model from the Excel corpus and cache it on the class."""
    frame = pd.read_excel(Classify.__FILE_PATH__)
    subset = frame[[u'类型', u'释义']]
    # (label, text) pairs for the trainer.
    samples = [(row[0], row[1]) for row in subset.values]
    model = Grocery('Classify')
    model.train(samples)
    model.save()
    Classify.__MODEL__ = model
def phgrocery(text):
    """Classify *text* with model_redian_5 and return the label as an int
    (the model's labels are numeric strings)."""
    classifier = Grocery('model_redian_5')
    classifier.load()
    return int(classifier.predict(text).predicted_y)
def tGrocery(): outFile = open('testResult.tmp', 'w') [trainingSet, benchmark] = pickle.load(open('SampleSeg.pk')) testingSet = [] correctLabel = [] for i in xrange(len(benchmark)): print '%d out of %d' % (i, len(benchmark)) testingSet.append(benchmark[i][1]) correctLabel.append(benchmark[i][0]) grocery = Grocery('test') grocery.train(trainingSet) grocery.save() # load new_grocery = Grocery('test') new_grocery.load() Prediction = [] for i in xrange(len(testingSet)): print '%d out of %d' % (i, len(testingSet)) prediction = new_grocery.predict(testingSet[i]) Prediction.append(prediction) temp = correctLabel[i] + '<-->' + prediction + ' /x01' + testingSet[i] + '\n' outFile.write(temp) correct = 0 for i in xrange(len(Prediction)): print Prediction[i], correctLabel[i], if Prediction[i] == correctLabel[i]: correct += 1 print 'Correct' else: print 'False' print 'Correct Count:', correct print 'Accuracy: %f' % (1.0 * correct / len(Prediction))
class GroceryModel(object): def __init__(self): self.grocery = Grocery('TextClassify') def train(self,train_file): f = open(train_file,'r') line = f.readline().decode('utf8') dataset = [] while line: tmp = line.split('\t') dataset.append((tmp[0],''.join(tmp[1:]))) line = f.readline().decode('utf8') f.close() self.grocery.train(dataset) self.grocery.save() def load_model(self): self.grocery.load() def test(self,test_src): self.load_model() f = open(test_src,'r') line = f.readline().decode('utf8') dataset = [] while line: tmp = line.split('\t') dataset.append((tmp[0],''.join(tmp[1:]))) line = f.readline().decode('utf8') f.close() result = self.grocery.test(dataset) print result def predict(self,text): print self.grocery.predict(text)
def predict_phrasing(self, text=u'曾被年轻人嫌弃,如今却媲美Zara'):
    """Score *text* with the phrasing model.

    :param text: sentence to score
    :return: decision value of the u'postive' class (label spelled as in
             the trained model)
    """
    classifier = Grocery(self.model_name)
    classifier.load()
    prediction = classifier.predict(text)
    return prediction.dec_values[u'postive']
def __init__(self):
    """Build the JD parser: load vocabularies, the jdclf sentence
    classifier (demand / duty / other), and the regexes used to segment
    and label job-description text."""
    self.degreedic = set( line.strip() for line in codecs.open('./data/degrees.txt','rb','utf-8'))  # degree vocabulary
    self.majordic =set( line.strip() for line in codecs.open('./data/majordic.txt','rb','utf-8'))  # major (field of study) vocabulary
    self.citydic = set( line.strip() for line in codecs.open("./data/citydic.txt",'rb','utf-8'))  # city vocabulary
    self.firmnames =set( line.strip() for line in codecs.open('./data/firm.txt','rb','utf-8'))  # company-abbreviation vocabulary
    self.jobdic = set(line.strip() for line in codecs.open('./data/jobposition.txt','rb','utf-8') )  # job-title vocabulary
    self.skills = set( line.strip() for line in codecs.open('./data/skills.txt','rb','utf-8'))  # skill vocabulary
    # self.wordlisttf = pickle.load(open('./data/wordlist.pkl'))  # 2000 most frequent words
    # self.w2vdict = json.load(open('./data/word2vec_50.json'))  # word2vec vectors for those words
    self.clf = Grocery("jdclf")  # sentence classifier: demand / duty / other
    self.clf.load()
    # --- attribute-detection regexes (candidate/job metadata) ---
    self.SEX = re.compile(u"性别不限|性别|男|女")
    self.AGE = re.compile(u"\d+周?岁|年龄")
    self.DEGREE = re.compile(u"(全日制)?(初中|高中|中专|大专|专科|大学专科|中职|本科|大学本科|硕士|研究生|博士|博士后)(.?以上)?")
    self.MAJOR = re.compile(u"\S+(相关专业|专业优先|及其.专业|[类等]专业[优先]?)")
    self.EXP = re.compile(u"工作经验:|工作经[历验]|工作年限|年.{0,4}经[历验]|经[历验].{1,6}年")
    self.PUB_TIME = re.compile(u"(\d+)(天前发布)")
    self.INCNAME = re.compile(u"\S+(有限公司|酒店|银行|集团|研究中心|研究所|学校|旅行社|分?公司|研发中心|技术部|事.部|招聘)")
    self.NOT_INC = re.compile(u"职位|描述|收藏|推荐|地址|邮箱|主页|介绍|欢迎|加入|要求|简介|险一金|奖金|包吃住|社区|厂房|人员|职责")
    self.INCTAG = re.compile(u"大公司|五百强|全球500强|小公司|成长型公司|创业公司|私有经济|集体经济|集团|外企|已上市|稳定性高|平均年龄\d+岁|妹纸多|学历高|福利待遇好|晋升机会大|民营公司|民营企业|互联网|创业型|国企|央企")
    self.JOBNAME = re.compile(u'\S*(研发工程师|工程师|经理|助理|顾问|前台|秘书|主管|研究员|实习生|操作员|专员|教学人员|技术人员|管理员|业务员|公关|程序员|教师|老师|培训生|\ 文员|研究员|策划|主任|总监|设计师|分析师|架构师|摄影师|编辑|BD|游戏UI|Android(开发)?|PHP(开发)?|Python(开发)?|.?(急招|急聘|初级|中级|高级|方向).?[\s)】\)])|\ |行政人事|网店设计|客服|会计|电话销售|外贸跟单|web前端|游戏UI|后.开发|产品运营|商业数据分析')
    # --- section-start / sentence-type regexes ---
    self.START_DEMAND = re.compile(u"(岗位要求|应聘条件|任职要求|岗位资格|任职资格|岗位条件|工作要求|任职条件|人员条件|职位.求|职位条件|职位描述|岗位资格|职位资格|具备条件)[::\s]\ |如果你.{0,10}[::\s]|我们希望你.{0,12}[::\s]|(要求|条件)[::\s]|你需要?具备什么.+[?\?::\s]|任职资格[::\s]")
    self.DEMAND = re.compile(u"熟悉|熟练|具有|善于|懂得|掌握|具备|能够|优先|不少于|不超过|至少|团队.作|良好的|工作经验|开发经验|实习经历|能力强|富有|以上学历|经验|喜欢|\ 较强的.{2,8}能力|相关专业|相关学历|者优先|精通|了解|及以上|技术全面|.强的责任心|[能有]独立|英文流利")
    self.DUTY = re.compile(u"跟进|协助|负责|配合|其他工作|领导交办的|对.+提供|审核|参与|提出|跟踪|报告|为.+提出|日常.+工作|指导|对.+进行|为.+提供|跟进|拓展|运营|用户|客户|协调|拟写|通过|协同|完成|沟通|需求|秘书.{2,5}翻译")
    self.START_DUTY = re.compile(u"(岗位职责|岗位描述|职位描述|职责描述|任职描述|职位职责|工作职责|工作职能|职位职能|工作内容|实习内容|职位内容)[::\s]|做这样的事[::\s]|职责.{0,5}[::\s]")
    self.PAY = re.compile(u"薪酬|待遇|月薪|薪资|年薪|底薪|\d+k|\d+万|\d+元|工资|报酬|薪水|福利")
    self.BENEFIT = re.compile(u"周休|补助|补贴|假日|餐补|提成|交通补助|食宿|加班工资|期权|年假|领导|扁平化|管理|氛围|空间|休假|月假|带薪|全休|晋升|培训|舒适的|旅游|奖励|过节费|五险一金|奖金|\ |弹性工作|氛围|成长空间|实训|培训|高薪|前景|旅游|活动|分红")
    # --- line splitting / cleaning helpers ---
    self.SPLIT_JD = re.compile(u"岗位[【(]?[一二三四五六七八九][】)][::\s]|(^招聘岗位\S+|岗位\d|岗位[一二三四五六])[::\s]")
    self.CLEAR_NUM = re.compile(u"^\d[\.: :。、]|^[\((【]?\d[\))】\.]")
    self.CLEAR_COLO = re.compile(u"^[\s\.。)(【】,,]|[。;,\.;,]$|^\d[\.]")
    self.SKILL = re.compile(u"精通|了解|熟练|熟悉|掌握|懂得|优先|具备|具有|者优先|擅长|善于|较强的.{2,6}能力|良好的|有.+经验|能力|极强的")
    # Extend jieba's dictionary so domain terms segment as single tokens.
    jieba.load_userdict('./data/majordic.txt')
    jieba.load_userdict('./data/skills.txt')
    jieba.load_userdict('./data/firm.txt')
    jieba.load_userdict('./data/degrees.txt')
    jieba.load_userdict('./data/benefits.txt')
    # Per-document parse state.
    self.jdStr = ""                      # raw JD text being parsed
    self.linelist = []                   # JD split into cleaned lines
    self.lineindex = defaultdict(int)    # line -> position index
    self.result = OrderedDict()          # extracted fields, insertion-ordered
def test_main(self):
    """Train, persist, reload, and sanity-check a model end to end."""
    trained = Grocery(self.grocery_name)
    trained.train(self.train_src)
    trained.save()
    # Reload from disk under the fixed name 'test'.
    reloaded = Grocery('test')
    reloaded.load()
    # NOTE(review): the asserts run against the freshly-trained instance,
    # not the reloaded one — confirm that is intentional.
    assert trained.get_load_status()
    assert trained.predict('考生必读:新托福写作考试评分标准') == 'education'
    # Remove the saved model directory.
    if self.grocery_name and os.path.exists(self.grocery_name):
        shutil.rmtree(self.grocery_name)
def test_grocery(): grocery = Grocery('model_redian') grocery.train('trdata_4.txt') grocery.save() new_grocery = Grocery('model_redian') new_grocery.load() test_result = new_grocery.test('tedata_4.txt') print test_result.accuracy_labels print test_result.recall_labels test_result.show_result()
class AutoGrocery(object):
    """Grocery wrapper that lazily loads — or trains on demand — its model."""

    def __init__(self, name, train_data):
        self._train_data = train_data
        self._grocery = Grocery(project_dir + '/models/model_data/' + name)

    def train(self):
        self._grocery.train(self._train_data)

    def save(self):
        self._grocery.save()

    def load(self):
        self._grocery.load()

    def predicate(self, src):
        """Classify *src* and return (label, decision_value).

        If no model is loaded yet, try loading a saved one; when that fails
        with ValueError, train from the bundled data and persist it.
        """
        if not self._grocery.get_load_status():
            try:
                self.load()
            except ValueError:
                self.train()
                self.save()
        prediction = self._grocery.predict(src)
        label = prediction.predicted_y
        return label, prediction.dec_values[label]
def __init__(self):
    """Load the jdclf sentence classifier and compile the regexes used to
    detect JD attributes and section boundaries."""
    self.data = []          # parsed records accumulated by callers
    self.clf = Grocery("jdclf")  # sentence classifier
    self.clf.load()
    # --- attribute-detection regexes ---
    self.SEX = re.compile(u"性别不限|性别|男|女")
    self.AGE = re.compile(u"\d+周?岁|年龄")
    self.DEGREE = re.compile(u"(全日制)?(初中|高中|中专|大专|专科|大学专科|中职|本科|大学本科|硕士|研究生|博士|博士后)(.?以上)?")
    self.MAJOR = re.compile(u"\S+(相关专业|专业优先|及其.专业|[类等]专业[优先]?)")
    self.EXP = re.compile(u"工作经验:|工作经[历验]|工作年限|年.{0,4}经[历验]|经[历验].{1,6}年")
    self.PUB_TIME = re.compile(u"(\d+)(天前发布)")
    self.INCNAME = re.compile(u"\S+(有限公司|酒店|银行|集团|研究中心|研究所|学校|旅行社|分?公司|研发中心|技术部|事.部|招聘)")
    self.NOT_INC = re.compile(u"职位|描述|收藏|推荐|地址|邮箱|主页|介绍|欢迎|加入|要求|简介|险一金|奖金|包吃住|社区|厂房|人员|职责")
    self.INCTAG = re.compile(u"大公司|五百强|全球500强|小公司|成长型公司|创业公司|私有经济|集体经济|集团|外企|已上市|稳定性高|平均年龄\d岁|妹纸多|学历高|福利待遇好|晋升机会大|民营公司|民营企业\ |互联网|创业型|国企|央企")
    self.JOBNAME = re.compile(u'\S*(研发工程师|工程师|经理|助理|顾问|前台|秘书|主管|研究员|实习生|操作员|专员|教学人员|技术人员|管理员|业务员|公关|程序员|教师|老师|培训生|\ 文员|研究员|策划|主任|总监|设计师|分析师|架构师|摄影师|编辑|BD|游戏UI|Android(开发)?|PHP(开发)?|Python(开发)?|.?(急招|急聘|初级|中级|高级|方向).?[\s)】\)])|\ |行政人事|网店设计|客服|会计|电话销售|外贸跟单|web前端|游戏UI|后.开发|产品运营|商业数据分析')
    # --- section-start / sentence-type regexes ---
    self.START_DEMAND = re.compile(u"(岗位要求|应聘条件|任职要求|岗位资格|任职资格|岗位条件|工作要求|任职条件|人员条件|职位.求|职位条件|职位描述|岗位资格|职位资格|具备条件)[::\s]\ |如果你.{0,10}[::\s]|我们希望你.{0,12}[::\s]|(要求|条件)[::\s]|你需要?具备什么.+[?\?::\s]|任职资格[::\s]")
    self.DEMAND = re.compile(u"熟悉|熟练|具有|善于|懂得|掌握|具备|能够|优先|不少于|不超过|至少|团队.作|良好的|工作经验|开发经验|实习经历|能力强|富有|以上学历|经验|喜欢|\ 较强的.{2,8}能力|相关专业|相关学历|者优先|精通|了解|及以上|技术全面|.强的责任心|[能有]独立|英文流利")
    self.DUTY = re.compile(u"跟进|协助|负责|配合|其他工作|领导交办的|对.+提供|审核|参与|提出|跟踪|报告|为.+提出|日常.+工作|指导|跟进|拓展|运营|用户|客户|协调|拟写|通过|协同\ |完成|沟通|需求|秘书.{2,5}翻译")
    self.START_DUTY = re.compile(u"(岗位职责|岗位描述|职位描述|职责描述|任职描述|职位职责|工作职责|工作职能|职位职能|工作内容|实习内容|职位内容)[::\s]|做这样的事[::\s]|职责.{0,5}[::\s]")
    self.PAY = re.compile(u"薪酬|待遇|月薪|薪资|年薪|底薪|\d+k|\d+万|\d+元|工资|报酬|薪水|福利")
    self.BENEFIT = re.compile(u"周休|补助|补贴|假日|餐补|提成|交通补助|食宿|加班工资|期权|年假|领导|扁平化|管理|氛围|空间|休假|月假|带薪|全休|晋升|培训|舒适的|旅游|奖励|过节费|五险一金|奖金|\ |弹性工作|氛围|成长空间|实训|培训|高薪|前景|旅游|活动|分红")
def demo_flask(image_file):
    """OCR a business-licence image, then classify each recognised line to
    extract the company name and credit code.

    Returns (output_file, id_total, name_total) where output_file is the
    framed-image path written under ./result.
    """
    # Build the text classifier by wiring a GroceryTextModel in manually
    # instead of calling grocery.load().
    grocery = Grocery('NameIdAdd_NLP')
    model_name = grocery.name
    text_converter = None
    tgm = GroceryTextModel(text_converter, model_name)
    tgm.load(model_name)
    grocery.model = tgm
    t = time.time()
    result_dir = './result'
    image = np.array(Image.open(image_file).convert('RGB'))
    # OCR: returns per-line results and the image with boxes drawn.
    result, image_framed = ocr_whole.model(image)
    output_file = os.path.join(result_dir, image_file.split('/')[-1])
    Image.fromarray(image_framed).save(output_file)
    name_total = ''
    id_total = ''
    for key in result:
        string1 = result[key][1]
        # Skip short fragments (8 chars or fewer).
        if len(string1) <= 8:
            continue
        # Strip whitespace/punctuation before applying the heuristics.
        string2 = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*{}[]+", "", string1)
        # NOTE(review): on Python 3, .encode('gbk') yields bytes and
        # filter(str.isdigit, bytes) iterates ints — confirm the intended
        # runtime; this looks written for a str-of-bytes model.
        no_digit = len(list(filter(str.isdigit, string2.encode('gbk'))))
        no_alpha = len(list(filter(is_alphabet, string2)))
        # Character-overlap heuristics pick the field type before falling
        # back to the trained classifier.
        if len(set('法定代表人') & set(string2)) >= 2 or len(set('经营范围') & set(string2)) >= 2 or '资本' in string2 or '类型' in string2 or len(set('年月日') & set(string2)) >= 2 or len(set('登记机关') & set(string2)) >= 2 or '电话' in string2:
            predict_result = 'others'
        elif len(set('经营场所') & set(string2)) >= 3 or '住所' in string2 or len(set('营业场所') & set(string2)) >= 3:
            predict_result = 'company-address'
        elif len(set('统一社会信用代码') & set(string2)) >= 2 or ((no_digit+no_alpha) / len(string2) > 0.5 and no_digit > 8):
            predict_result = 'company-id'
        elif '名称' in string2:
            predict_result = 'company-name'
        else:
            predict_result = grocery.predict(string2)
        # First company-name hit ends the scan; id lines keep accumulating.
        if str(predict_result) == 'company-name':
            name_total += string1
            break
        elif str(predict_result) == 'company-id':
            id_total += string1
        else:
            continue
    # Keep only word characters in the id; post-process the name.
    id_total = re.sub(r'\W', '', id_total)
    name_total = stupid_revise(name_total)
    print("Mission complete, it took {:.3f}s".format(time.time() - t))
    print('\nRecongition Result:\n')
    print(id_total)
    print(name_total)
    return output_file, id_total, name_total
def __load_model__():
    """Ensure Classify.__MODEL__ is ready: train on first use, otherwise
    load the persisted model only when it is missing."""
    if not Classify.__MODEL_LOADED__:
        Classify.__MODEL_LOADED__ = True
        Classify.__train__model__()
    elif Classify.__MODEL__ is None:
        # BUG FIX: the original condition was `if Classify.__MODEL__:`,
        # which re-created and re-loaded the model on every call after the
        # first (and never loaded it when it was actually absent).
        Classify.__MODEL__ = Grocery('Classify')
        Classify.__MODEL__.load()
def train_compare_result(train_src, test_src): grocery = Grocery('test') grocery.train(train_src) print grocery.get_load_status() len_test = len(test_src) print len_test Predict_num = 0 History = [] for test in test_src: Predict_result = { 'predict_title': test[1], 'predict_class': None, 'true_class': None } predict_title = Predict_result['predict_title'] predict_result = grocery.predict(predict_title) Predict_result['predict_class'], Predict_result['true_class'] = test[ 0], predict_result if str(predict_result) == str(test[0]): # print 'prediction is True' Predict_num += 1 History.append(Predict_result) # print 'prediction is False' predict_precision = float(Predict_num) / len_test return predict_precision, History
def predict_corpus(input_file, output_csv):
    """Classify column 2 of Sheet1 in the xls *input_file* and write
    (sentence, prediction) rows to *output_csv*."""
    import csv
    book = xlrd.open_workbook(input_file)
    table = book.sheet_by_name('Sheet1')
    corpus = []
    for rownum in range(table.nrows):
        row = table.row_values(rownum)
        # BUG FIX: the original called row[2].strip() and discarded the
        # result (str.strip returns a new string), so untrimmed text was
        # classified.
        corpus.append(row[2].strip())
    corpus_grocery = Grocery(project_name)
    corpus_grocery.load()
    output = [(sentence, corpus_grocery.predict(sentence)) for sentence in corpus]
    # `with open` replaces the Py2-only file() builtin and guarantees the
    # handle is closed even if writing fails.
    with open(output_csv, 'wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(output)
    print('Done!')
def train(): print 'train start '+'.'*30 #grocery=Grocery('sample') grocery=Grocery('version1.0') grocery.train(trainlist) grocery.save() print 'train end '+'.'*30
def sentiment_train(gro_name, train_set):
    """Train a tgrocery SVM classifier and persist it to disk.

    :param gro_name: model name passed to Grocery
    :param train_set: iterable of (label, text) samples
    """
    classifier = Grocery(gro_name)
    classifier.train(train_set)
    print("Is trained? ", classifier.get_load_status())
    classifier.save()
class jdParser(object):
    """Splits a job description into 'demand' and 'duty' sentences using
    the pre-trained jdclf sentence classifier."""

    def __init__(self):
        self.clf = Grocery("./jdclf")
        self.clf.load()
        # Sentence delimiters: Chinese/ASCII semicolons, full stop, newline.
        self.LINE_SPLIT = re.compile(u"[;。;\n]")

    def get_demand_and_duty(self, jdstr):
        """Return {'demand': ..., 'duty': ...}, each a newline-joined string.

        BUG FIXES vs. the original: it wrote len(line.strip() > 4) — len()
        of a comparison, a TypeError — and it built `result` without ever
        returning it.
        """
        linelist = [
            line.strip() for line in self.LINE_SPLIT.split(jdstr)
            if len(line.strip()) > 4
        ]
        demand = []
        duty = []
        for line in linelist:
            pred = str(self.clf.predict(line))
            if pred == "demand":
                demand.append(line)
            elif pred == "duty":
                duty.append(line)
        return {'demand': '\n'.join(demand), 'duty': '\n'.join(duty)}
class jdParser(object):
    """Classifies each sentence of a JD as a requirement ('demand') or a
    responsibility ('duty') via the saved jdclf model."""

    def __init__(self):
        self.clf = Grocery("./jdclf")
        self.clf.load()
        self.LINE_SPLIT = re.compile(u"[;。;\n]")  # sentence delimiters

    def get_demand_and_duty(self, jdstr):
        """Split *jdstr* into sentences, classify each, and return the two
        groups as {'demand': str, 'duty': str} (newline-joined).

        BUG FIXES: the original's filter read len(line.strip()>4) — len()
        applied to a bool comparison (TypeError) — and the populated
        `result` dict was never returned.
        """
        demand, duty = [], []
        for raw in self.LINE_SPLIT.split(jdstr):
            sentence = raw.strip()
            # Ignore fragments of 4 characters or fewer.
            if len(sentence) <= 4:
                continue
            label = str(self.clf.predict(sentence))
            if label == "demand":
                demand.append(sentence)
            elif label == "duty":
                duty.append(sentence)
        return {'demand': '\n'.join(demand), 'duty': '\n'.join(duty)}
def __train__model__():
    """Build and persist the 'Classify' model from the Excel training file,
    then cache it on the Classify class."""
    raw = pd.read_excel(Classify.__FILE_PATH__)
    pairs = raw[[u'类型', u'释义']].values
    # Each row is (label, text).
    training_samples = [(label, text) for label, text in pairs]
    clf = Grocery('Classify')
    clf.train(training_samples)
    clf.save()
    Classify.__MODEL__ = clf
def sentiment_train(gro_name, train_set):
    """Fit a Grocery classifier named *gro_name* on *train_set* and save it.

    :param gro_name: model name
    :param train_set: training samples accepted by Grocery.train
    """
    gro_ins = Grocery(gro_name)
    gro_ins.train(train_set)
    # Report whether training left the model in a usable (loaded) state.
    status = gro_ins.get_load_status()
    print("Is trained? ", status)
    gro_ins.save()
def train(train_origin_path, fold):
    """Train and save the cross-validation model for *fold* from a
    '|text|'-delimited training file."""
    samples = []
    with open(train_origin_path) as fh:
        for raw in fh:
            # Each line: "<raw-label>|text|<text>"; map to top-level class.
            label, text = raw.strip().split("|text|")
            samples.append((yiji_label[classify_dict[label]], text))
    model = Grocery('cv_' + str(fold) + '_model')
    model.train(samples)
    model.save()
def train_phrasing_and_save(self, trainsets=all):
    """Train self.model_name on *trainsets* and persist it.

    :param trainsets: training samples for Grocery.train.
        NOTE(review): the default `all` is the builtin function — almost
        certainly a placeholder; callers should always pass real data.
    :return: True on success, False on any failure.
    """
    try:
        grocery = Grocery(self.model_name)
        grocery.train(trainsets)
        grocery.save()
        return True
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        return False
class MyGrocery(object): def __init__(self, name): super(MyGrocery, self).__init__() self.grocery = Grocery(name) self.loaded = False self.correct = 1.0 def train(self, src): lines = [] for line in csv.reader(open(src)): label, s = line[0],line[1] text = s.decode('utf8') lines.append((label, text)) self.grocery.train(lines) def save_model(self): self.grocery.save() def train_and_save(self, src): self.train(src) self.save_model() def load_model(self): if not self.loaded: self.grocery.load() self.loaded = True def predict(self, text): self.load_model() return self.grocery.predict(text) def test(self, src): self.load_model() total, wrong_num = 0.0, 0.0 for line in csv.reader(open(src)): total += 1 if line[0] != self.predict(line[1]): wrong_num += 1 print "load test file from " + src correct = (total - wrong_num ) / total self.correct = correct print "total: %d , wrong_num: %d, success percentage: %f" %(total, wrong_num, correct) result = dict(type="test", total=total, wrong_num=wrong_num, correct=correct) return json.dumps(result)
def __init__(self):
    """Set up the text-cleaning and section-detection regexes plus the
    domain vocabularies used when parsing job descriptions."""
    # Whitelist filter: anything outside CJK/word/digit/listed punctuation
    # is stripped by substituting matches of this pattern.
    self.CLEAN_TEXT = re.compile(u"[^\u4e00-\u9fa5\w\d;::;,。、\.,。!!@()\r\n\(\)\-\+ - ]")
    self.clf = Grocery(base_dir+"/jdclf")  # sentence classifier
    self.clf.load()
    # --- line splitting and cleanup ---
    self.SPLIT_LINE = re.compile(u"[\r\n;::。!?;]|[ \s \xa0\u724b]{4,}")
    self.CLEAR_LINE = self.CLEAN_LINE = re.compile(u"^[\u2022(【\[\s\t\r\n\(\- ]?[\da-z12345789]{1,2}[\.,。、,::)】\]\)\s]|^[!@#¥%……&×()\(\){}:“|、-\-,。::\.]|^[一二三四五六七八九123456789\d]{0,2}[\.、\s:: ]|[,;。、\s \.]$|^[\s \u2022 \uff0d \u25cf]")
    self.CLEAN_JOBNAME = re.compile(u"急聘|诚聘|高薪|包[食住宿餐]|.险一金|待遇|^急?招|职位编号\s?[\s\d::]")
    # --- attribute detectors ---
    self.PAY = re.compile("(\d{3,}\-)?\d{3,}元")
    self.SEX = re.compile(u"性别|男|女")
    self.AGE = re.compile(u"\d+周?岁|年龄")
    self.JOB_TAG = re.compile(u"全职|实习")
    self.DEGREE = re.compile(u"小学|初中|高中|职技|本科|研究生|硕士|博士|教授|专科|大专|中专|无要求|不限|无限")
    # --- section-start markers ---
    self.START_DEMAND = re.compile(u"(任职资格|岗位要求|工作要求|任职条件|任职要求|职位要求)[::\s】\n ]?")
    self.START_DUTY = re.compile(u"(工作内容|岗位职责|工作职责|职位描述|工作描述|职位介绍|职位职责|岗位描述)[::\s 】\n ]")
    self.START_BENEFIT = re.compile(u"(福利待遇|待遇|福利)[::\s\n】]")
    self.INC_URL = re.compile(u"(主页|网站|网址|官网).{0,3}[\w\d_/\.:\-]+")
    # --- sentence-type cues ---
    self.DEMAND = re.compile(u"精通|熟悉|熟练|有.+经验")
    self.DUTY = re.compile(u"负责|促成|为客户|安排的其.工作")
    self.BENEFIT = re.compile(u".险一金|福利|晋身|休假|带薪|补助|补贴")
    self.CERT = re.compile(u"(\S{2,8}证书|CET-\d|普通话|英语|口语|.语|日文|雅思|托福|托业)(至少)?(通过)?[\d一二三四五六七八九]级[及或]?(以上)?|(英语)?CET-\d级?(以上)?|\ 医学.{0,3}证|会计.{0,3}证|律师.{0,3}证|有.{1,8}证书")
    # --- domain vocabularies ---
    self.degreedic = set([line.strip() for line in codecs.open(base_dir+'/data/degrees.txt','rb','utf-8')])   # degrees
    self.majordic = set([line.strip() for line in codecs.open(base_dir+'/data/majordic.txt','rb','utf-8')])   # majors
    self.skilldic = set([line.strip() for line in codecs.open(base_dir+'/data/skills.txt','rb','utf-8')])     # skills
    self.jobdic = set([line.strip() for line in codecs.open(base_dir+'/data/jobnames.txt','rb','utf-8')])     # job titles
    # Extend jieba's dictionary so domain terms segment as single tokens.
    jieba.load_userdict(base_dir+'/data/majordic.txt')
    jieba.load_userdict(base_dir+'/data/skills.txt')
    jieba.load_userdict(base_dir+'/data/firm.txt')
    jieba.load_userdict(base_dir+'/data/degrees.txt')
    jieba.load_userdict(base_dir+'/data/benefits.txt')
def __init__(self, *args, **kwargs):
    """Construct the underlying Grocery according to kwargs['method'].

    'jieba' uses the default tokenizer; 'normal' and 'processed' use the
    custom tokenizer ('normal' additionally instantiates the keyword
    extractor). Required kwargs: grocery_name, method, train_src.
    """
    self.grocery_name = str(kwargs["grocery_name"])
    method = str(kwargs["method"])
    train_src = str(kwargs["train_src"])
    self.PREFIX = conf.load("predict_label")["prefix"]
    self.MODEL_DIR = conf.load("predict_label")["model_dir"]
    self.kwargs = kwargs
    if method == "jieba":
        self.grocery = Grocery(self.grocery_name)
    elif method == "normal" or method == "processed":
        if method == "normal":
            self.key_ext = keyExt()
        self.grocery = Grocery(self.grocery_name, custom_tokenize=self._custom_tokenize)
def __init__(self):
    """Load the jdclf sentence classifier and compile the JD-parsing
    regexes (attributes, section starts, sentence-type cues)."""
    self.data = []               # parsed records accumulated by callers
    self.clf = Grocery("jdclf")  # sentence classifier
    self.clf.load()
    # --- attribute-detection regexes ---
    self.SEX = re.compile(u"性别不限|性别|男|女")
    self.AGE = re.compile(u"\d+周?岁|年龄")
    self.DEGREE = re.compile(u"(全日制)?(初中|高中|中专|大专|专科|大学专科|中职|本科|大学本科|硕士|研究生|博士|博士后)(.?以上)?")
    self.MAJOR = re.compile(u"\S+(相关专业|专业优先|及其.专业|[类等]专业[优先]?)")
    self.EXP = re.compile(u"工作经验:|工作经[历验]|工作年限|年.{0,4}经[历验]|经[历验].{1,6}年")
    self.PUB_TIME = re.compile(u"(\d+)(天前发布)")
    self.INCNAME = re.compile(u"\S+(有限公司|酒店|银行|集团|研究中心|研究所|学校|旅行社|分?公司|研发中心|技术部|事.部|招聘)")
    self.NOT_INC = re.compile(u"职位|描述|收藏|推荐|地址|邮箱|主页|介绍|欢迎|加入|要求|简介|险一金|奖金|包吃住|社区|厂房|人员|职责")
    self.INCTAG = re.compile(u"大公司|五百强|全球500强|小公司|成长型公司|创业公司|私有经济|集体经济|集团|外企|已上市|稳定性高|平均年龄\d岁|妹纸多|学历高|福利待遇好|晋升机会大|民营公司|民营企业\ |互联网|创业型|国企|央企")
    self.JOBNAME = re.compile(u'\S*(研发工程师|工程师|经理|助理|顾问|前台|秘书|主管|研究员|实习生|操作员|专员|教学人员|技术人员|管理员|业务员|公关|程序员|教师|老师|培训生|\ 文员|研究员|策划|主任|总监|设计师|分析师|架构师|摄影师|编辑|BD|游戏UI|Android(开发)?|PHP(开发)?|Python(开发)?|.?(急招|急聘|初级|中级|高级|方向).?[\s)】\)])|\ |行政人事|网店设计|客服|会计|电话销售|外贸跟单|web前端|游戏UI|后.开发|产品运营|商业数据分析')
    # --- section-start / sentence-type regexes ---
    self.START_DEMAND = re.compile(u"(岗位要求|应聘条件|任职要求|岗位资格|任职资格|岗位条件|工作要求|任职条件|人员条件|职位.求|职位条件|职位描述|岗位资格|职位资格|具备条件)[::\s]\ |如果你.{0,10}[::\s]|我们希望你.{0,12}[::\s]|(要求|条件)[::\s]|你需要?具备什么.+[?\?::\s]|任职资格[::\s]")
    self.DEMAND = re.compile(u"熟悉|熟练|具有|善于|懂得|掌握|具备|能够|优先|不少于|不超过|至少|团队.作|良好的|工作经验|开发经验|实习经历|能力强|富有|以上学历|经验|喜欢|\ 较强的.{2,8}能力|相关专业|相关学历|者优先|精通|了解|及以上|技术全面|.强的责任心|[能有]独立|英文流利")
    self.DUTY = re.compile(u"跟进|协助|负责|配合|其他工作|领导交办的|对.+提供|审核|参与|提出|跟踪|报告|为.+提出|日常.+工作|指导|跟进|拓展|运营|用户|客户|协调|拟写|通过|协同\ |完成|沟通|需求|秘书.{2,5}翻译")
    self.START_DUTY = re.compile(u"(岗位职责|岗位描述|职位描述|职责描述|任职描述|职位职责|工作职责|工作职能|职位职能|工作内容|实习内容|职位内容)[::\s]|做这样的事[::\s]|职责.{0,5}[::\s]")
    self.PAY = re.compile(u"薪酬|待遇|月薪|薪资|年薪|底薪|\d+k|\d+万|\d+元|工资|报酬|薪水|福利")
    self.BENEFIT = re.compile(u"周休|补助|补贴|假日|餐补|提成|交通补助|食宿|加班工资|期权|年假|领导|扁平化|管理|氛围|空间|休假|月假|带薪|全休|晋升|培训|舒适的|旅游|奖励|过节费|五险一金|奖金|\ |弹性工作|氛围|成长空间|实训|培训|高薪|前景|旅游|活动|分红")
def learn_model(file_name):
    """Train a new model from an uploaded Excel file under learn/.

    The sheet must have Tag (label) and Comment (text) columns. Rows are
    split 3:2 into train/test files, a Grocery model is trained and saved,
    then validated with test_sample. Always returns a dict with IsErr and
    a user-facing ErrDesc message.
    """
    path = os.path.join(BASE_DIR, 'learn', file_name)
    try:
        df = pd.read_excel(path)
    except Exception as e:
        return {'IsErr': True, 'ErrDesc': u'找不到文档或者读取文档出错'}
    try:
        # Drop rows with missing values, then tokenize each comment.
        df = df.dropna(axis=0)
        df = df.apply(split_comment, axis=1)
    except Exception as e:
        return {'IsErr': True, 'ErrDesc': u'文档格式有误,应包含Tag(标签名字),Comment(评价内容)'}
    try:
        # 3:2 train/test split (Py2 integer division is intentional here).
        len_learn = len(df) / 5 * 3
        # Write the train and test files to disk.
        learn_file_name, test_file_name = output_file(file_name, df, len_learn)
        tmp_learn_name = os.path.join(BASE_DIR, 'learn', 'model_' + learn_file_name.split('.')[0])
        # Grocery expects byte-string paths (Py2).
        grocery = Grocery(tmp_learn_name.encode('utf-8'))
        path = os.path.join(BASE_DIR, 'learn', learn_file_name)
        grocery.train(path.encode('utf-8'))
        grocery.save()
    except Exception as e:
        return {'IsErr': True, 'ErrDesc': u'学习不成功,没有生产新的模型,请再次尝试。'}
    # Validate on the held-out file; res is the measured accuracy.
    res = test_sample(tmp_learn_name, test_file_name)
    return {
        'IsErr': False,
        'ErrDesc': u'成功生产新的模型,测试验证的正确率为%s, 模型保存为:%s' % (res, os.path.split(tmp_learn_name)[1])
    }
# -*- coding: utf-8 -*- import csv,codecs from tgrocery import Grocery import preprocessing as pp trainFileName='../data/train.txt' validateFileName='../data/validate.txt' outputFileName='../output/result.txt' # validate ################################## #grocery=Grocery('sample') grocery=Grocery('version1.0') grocery.load() print 'start test' TP=0.0 TN=0.0 FP=0.0 FN=0.0 fileValidate=codecs.open(validateFileName,'r','utf-8') validate_reader=fileValidate.readlines() fileOutput=codecs.open(outputFileName,'w','utf-8') resultlist=[] i=0 for line in validate_reader: content=pp.getcontent(validate_reader,i) i=i+1 if(i%5000==0):
# NOTE(review): `class JdCRF` collapsed onto three physical lines and split
# mid-statement (line 1 ends with `fw = ` and line 2 begins `codecs.open(...)`).
# It wraps a pre-trained Grocery classifier plus hand-written keyword regexes,
# generates/loads CRF training data, builds per-token feature vectors, and
# trains/tests a pycrfsuite model ('jd_skill.crfsuite').  The logic is kept
# byte-identical: the regex literals contain line-continuation artifacts, and
# the feature strings rely on Python operator precedence quirks —
# `'demand=%s'% '1' if cond else '0'` formats FIRST and then applies the
# conditional, so the whole feature degenerates to 'demand=1' or '0'
# (presumably unintended — TODO confirm before restyling).  Also note
# `word1 = sent[i+1][1]` in the +1 context uses the POS column where the -1
# context uses the token column — looks like a copy-paste slip; verify.
class JdCRF(object): def __init__(self): self.data = [] self.clf = Grocery("jdclf") self.clf.load() self.SEX = re.compile(u"性别不限|性别|男|女") self.AGE = re.compile(u"\d+周?岁|年龄") self.DEGREE = re.compile(u"(全日制)?(初中|高中|中专|大专|专科|大学专科|中职|本科|大学本科|硕士|研究生|博士|博士后)(.?以上)?") self.MAJOR = re.compile(u"\S+(相关专业|专业优先|及其.专业|[类等]专业[优先]?)") self.EXP = re.compile(u"工作经验:|工作经[历验]|工作年限|年.{0,4}经[历验]|经[历验].{1,6}年") self.PUB_TIME = re.compile(u"(\d+)(天前发布)") self.INCNAME = re.compile(u"\S+(有限公司|酒店|银行|集团|研究中心|研究所|学校|旅行社|分?公司|研发中心|技术部|事.部|招聘)") self.NOT_INC = re.compile(u"职位|描述|收藏|推荐|地址|邮箱|主页|介绍|欢迎|加入|要求|简介|险一金|奖金|包吃住|社区|厂房|人员|职责") self.INCTAG = re.compile(u"大公司|五百强|全球500强|小公司|成长型公司|创业公司|私有经济|集体经济|集团|外企|已上市|稳定性高|平均年龄\d岁|妹纸多|学历高|福利待遇好|晋升机会大|民营公司|民营企业\ |互联网|创业型|国企|央企") self.JOBNAME = re.compile(u'\S*(研发工程师|工程师|经理|助理|顾问|前台|秘书|主管|研究员|实习生|操作员|专员|教学人员|技术人员|管理员|业务员|公关|程序员|教师|老师|培训生|\ 文员|研究员|策划|主任|总监|设计师|分析师|架构师|摄影师|编辑|BD|游戏UI|Android(开发)?|PHP(开发)?|Python(开发)?|.?(急招|急聘|初级|中级|高级|方向).?[\s)】\)])|\ |行政人事|网店设计|客服|会计|电话销售|外贸跟单|web前端|游戏UI|后.开发|产品运营|商业数据分析') self.START_DEMAND = re.compile(u"(岗位要求|应聘条件|任职要求|岗位资格|任职资格|岗位条件|工作要求|任职条件|人员条件|职位.求|职位条件|职位描述|岗位资格|职位资格|具备条件)[::\s]\ |如果你.{0,10}[::\s]|我们希望你.{0,12}[::\s]|(要求|条件)[::\s]|你需要?具备什么.+[?\?::\s]|任职资格[::\s]") self.DEMAND = re.compile(u"熟悉|熟练|具有|善于|懂得|掌握|具备|能够|优先|不少于|不超过|至少|团队.作|良好的|工作经验|开发经验|实习经历|能力强|富有|以上学历|经验|喜欢|\ 较强的.{2,8}能力|相关专业|相关学历|者优先|精通|了解|及以上|技术全面|.强的责任心|[能有]独立|英文流利") self.DUTY = re.compile(u"跟进|协助|负责|配合|其他工作|领导交办的|对.+提供|审核|参与|提出|跟踪|报告|为.+提出|日常.+工作|指导|跟进|拓展|运营|用户|客户|协调|拟写|通过|协同\ |完成|沟通|需求|秘书.{2,5}翻译") self.START_DUTY = re.compile(u"(岗位职责|岗位描述|职位描述|职责描述|任职描述|职位职责|工作职责|工作职能|职位职能|工作内容|实习内容|职位内容)[::\s]|做这样的事[::\s]|职责.{0,5}[::\s]") self.PAY = re.compile(u"薪酬|待遇|月薪|薪资|年薪|底薪|\d+k|\d+万|\d+元|工资|报酬|薪水|福利") self.BENEFIT = re.compile(u"周休|补助|补贴|假日|餐补|提成|交通补助|食宿|加班工资|期权|年假|领导|扁平化|管理|氛围|空间|休假|月假|带薪|全休|晋升|培训|舒适的|旅游|奖励|过节费|五险一金|奖金|\ |弹性工作|氛围|成长空间|实训|培训|高薪|前景|旅游|活动|分红") def gen_data(self,fname='./data/lagou_train.txt'): fw = 
codecs.open('./data/jd_train_crf.txt','wb','utf-8') cnt = 1 for line in codecs.open(fname,'rb','utf-8'): if line.startswith(u"====="): fw.write(line) continue cnt +=1 if len(line.strip())>1: pred = self.clf.predict(line) newline = pred+'\t\t'+line.strip()+'\t\t'+str(len(line))+"\n" fw.write(newline) print cnt print 'done' def load_data(self,fname="./data/jd_train_crf.txt"): data = [] tmp = [] for line in codecs.open(fname,'rb','utf-8'): if line.startswith(u"===="): data.append(tmp) tmp = [] continue else: tag_data = line.strip().split('\t\t') if len(tag_data)==3: tmp.append(tuple(tag_data)) else: print '\t '.join(tag_data) n = len(data)/2 print 'train data',n print 'test data',len(data)-n return data[n:],data[:n] def word2features(self,sent,i): word = sent[i][0] postag = sent[i][1] features = [ 'bias', 'word.lower=' + word.lower(), 'word[:2]=' +word[:2], 'word.isdigit=%s'%word.isdigit(), 'postag='+postag, 'demand=%s'% '1' if self.DEMAND.search(word) else '0', 'start_demand=%s'% '1' if self.START_DEMAND.search(word) else '0', 'start_duty=%s'% '1' if self.START_DUTY.search(word) else '0', 'duty=%s'% '1' if self.DUTY.search(word) else '0', 'jobname=%s'% '1' if self.JOBNAME.search(word) else '0', 'incname=%s'% '1' if self.INCNAME.search(word) else '0', 'benefit = %s'% '1' if self.BENEFIT.search(word) else '0', 'pred=%s' % self.clf.predict(word) ] if i>0: word1 = sent[i-1][0] postag1 = sent[i-1][1] features.extend([ '-1:postag='+postag1, '-1:word.islower='+word1[:3].lower(), '-1:start_demand=%s'% '1' if self.START_DEMAND.search(word) else '1', '-1:start_duty=%s'% '1' if self.START_DUTY.search(word) else '0', '-1:demand=%s'% '1' if self.DEMAND.search(word1) else '0', '-1:duty=%s'% '1' if self.DUTY.search(word1) else '0', '-1:jobname=%s'% '1' if self.JOBNAME.search(word1) else '0', '-1:incname=%s'% '1' if self.INCNAME.search(word1) else '0', '-1:benefit = %s'% '1' if self.BENEFIT.search(word) else '0', '-1:pred=%s' % self.clf.predict(word), ]) else: features.append('BOS') 
if i<len(sent)-1: word1 = sent[i+1][1] postag1 = sent[i+1][1] features.extend([ '+1:word.lower=' + word1[:3].lower(), '+1:word.istitle=%s' % word1.istitle(), '+1:word.isupper=%s' % word1.isupper(), '+1:postag=' + postag1, '+1:postag[:2]=' + postag1[:2], '+1:start_demand=%s'% '1' if self.START_DEMAND.search(word) else '0', '+1:start_duty=%s'% '1' if self.START_DUTY.search(word) else '0', '+1:demand=%s'% '1' if self.DEMAND.search(word1) else '0', '+1:duty=%s'% '1' if self.DUTY.search(word1) else '0', '+1:jobname=%s'% '1' if self.JOBNAME.search(word1) else '0', '+1:incname=%s'% '1' if self.INCNAME.search(word1) else '0', '+1:benefit = %s'% '1' if self.BENEFIT.search(word) else '0', '+1:pred=%s' % self.clf.predict(word), ]) else: features.append('EOS') return features def sent2features(self,sent): return [self.word2features(sent,i) for i in range(len(sent))] def sent2labels(self,sent): return [label for (label,token,postag) in sent] def sent2tokens(self,sent): return [token for (label,token,postag) in sent] def train(self,x_train,y_train): assert len(x_train)==len(y_train),"not the same %d %d"%(len(x_train),len(y_train)) trainer = pycrfsuite.Trainer(verbose=False) for xseq,yseq in zip(x_train,y_train): trainer.append(xseq,yseq) trainer.set_params({ 'c1':1.0, 'c2':1e-3, 'max_iterations':50, 'feature.possible_transitions':True }) trainer.train('jd_skill.crfsuite') def test(self,sent): tagger = pycrfsuite.Tagger() tagger.open('./jd_skill.crfsuite') print 'tokens ','\n '.join(self.sent2tokens(sent)) print 'Predicted','\t '.join(tagger.tag(self.sent2features(sent))) print 'Correct ','\t '.join(self.sent2labels(sent))
# coding: utf-8
from tgrocery import Grocery

# Train an in-memory classifier from (label, text) pairs and predict once.
grocery = Grocery('test')
train_src = [
    ('education', '名师指导托福语法技巧:名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')
]
grocery.train(train_src)
print(grocery.get_load_status())
predict_result = grocery.predict('考生必读:新托福写作考试评分标准')
print(predict_result)
print(predict_result.dec_values)

# Same flow, but training from a tab-separated text file instead of a list.
grocery = Grocery('read_text')
train_src = '../text_src/train_ch.txt'
grocery.train(train_src)
print(grocery.get_load_status())
predict_result = grocery.predict('考生必读:新托福写作考试评分标准')
print(predict_result)
print(predict_result.dec_values)
# NOTE(review): collapsed one-line fragment — module prologue (loads the
# 'model_fintext' Grocery classifier at import time) plus the start of
# match_topic_kw(), which joins a keyword list into one string and issues an
# Elasticsearch match query on `content`.  The function is truncated right
# after the es.search call (`result` is built but never filled/returned in
# this view), so the code is kept byte-identical rather than reformatted.
# -*- coding:utf-8 -*- import sys reload(sys) sys.path.append('../../') from config import * from es import es214 as es from elasticsearch import Elasticsearch from elasticsearch.exceptions import TransportError from elasticsearch.helpers import bulk from tgrocery import Grocery model_fintext = Grocery('../fintext_classify/model_fintext') model_fintext.load() def match_topic_kw(news_id, keywords_list, source, doc_type, size=10000): result = [] keyword_str = ''.join(keywords_list) # Find related documents for a group of keywords (joined into one string; a list query may be preferable — TODO confirm) query_body = { "query": { "match": { "content": keyword_str } }, "size": size } # print keyword_str es_result = es.search(index=source, doc_type=doc_type, body=query_body, request_timeout=400)
# coding: utf-8
from tgrocery import Grocery

# --- save: train a classifier and persist it to disk ---
grocery = Grocery('test')
train_src = [
    ('education', '名师指导托福语法技巧:名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')
]
grocery.train(train_src)
grocery.save()

# --- load: the grocery name must be the same as the previous one ---
new_grocery = Grocery('test')
new_grocery.load()
print(new_grocery.predict('考生必读:新托福写作考试评分标准'))
# NOTE(review): collapsed fragment starting mid-loop (the enclosing `for` and
# the definitions of `i`, `line`, `trainlist`, `filein` are outside this view).
# It parses comma-separated lines into (label, text) pairs, trains and saves a
# Grocery model named 'sample', then begins a test phase that is truncated at
# the end of the fragment.  Review flag: `str=line.split(u',')` shadows the
# `str` builtin — rename when this code is next touched.  Kept byte-identical.
i=i+1 if(i%5000==0): print ("%d "%(i))+'#'*30 str=line.split(u',') count=str.__len__() if(count<2): print 'error happen'+"#"*30 continue #print count #print str trainstr=(str[0],str[1]) trainlist.append(trainstr) #print str[1]+u','+str[2] grocery=Grocery('sample') grocery.train(trainlist) grocery.save() filein.close() # test ################################## print 'start test' TP=0.0 TN=0.0 FP=0.0 FN=0.0 filetest=codecs.open(validateFileName,'r','utf-8') test_reader=filetest.readlines()
def __init__(self):
    """Load the saved './jdclf' classifier and build the sentence splitter."""
    self.clf = Grocery("./jdclf")
    self.clf.load()
    # Sentence boundaries: ASCII/Chinese semicolons, Chinese full stop, newline.
    self.LINE_SPLIT = re.compile(u"[;。;\n]")
# coding=utf-8
from tgrocery import Grocery

# Train the 'sample' model from in-memory (label, text) pairs.
grocery = Grocery('sample')
train_src = [
    ('education', '名师指导托福语法技巧:名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')
]
grocery.train(train_src)
#grocery.train('/home/wangjianfei/git/data/train_ch.txt')
# grocery.train('train_ch.txt')
grocery.save()

# Reload under the same name and exercise predict/test.
new_grocery = Grocery('sample')
new_grocery.load()
print(
    new_grocery.predict(
        'Abbott government spends $8 million on higher education media blitz'))
test_src = [
    ('education', '福建春季公务员考试报名18日截止 2月6日考试'),
    ('sports', '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'),
]
print("start test..................")
#grocery.test('/home/wangjianfei/git/data/test.txt')
# grocery.train('train_ch.txt'))
# custom_grocery = Grocery('custom', custom_tokenize=list)
print(new_grocery.test(test_src))
# NOTE(review): collapsed fragment starting mid-function (`break` belongs to a
# loop whose head is outside this view; `grocery_in`/`t_text` are defined
# elsewhere).  The visible tail predicts sentiment for one text, then the
# __main__ block trains a sentiment model from neg/pos .xls corpora, times the
# training, reloads the saved model and runs an interactive prediction helper.
# Kept byte-identical because the fragment is truncated at the top.
break t_pre_result = grocery_in.predict(t_text) t_label = t_pre_result.predicted_y # if max(pre_result.dec_values) < 0.03: # label = "neutral" print("Sentiment: ", t_label) print("How much: ", max(t_pre_result.dec_values)) ######################################################## # main if __name__ == "__main__": import time grocery_name = "./meter" corpus_path = "./Corpus/" max_line_num_once = 1000000 # maximum number of lines to read from each file print("Elapsed...")  # noqa placeholder? -- see below
tic = time.time() file_list = [corpus_path + "neg.xls", corpus_path + "pos.xls"] train_src = get_xls_train_set(file_list, max_line_num_once) sentiment_train(grocery_name, train_src) toc = time.time() print("Elapsed time of training is: ", toc - tic) grocery = Grocery(grocery_name) grocery.load() predict_for_one(grocery)
class Cat:
    """Minimal wrapper holding a tgrocery model named 'autohome'."""

    def __init__(self):
        self.grocery = Grocery('autohome')

    def test(self):
        # Report whether the underlying model has been loaded/trained.
        print(self.grocery.get_load_status())
from tgrocery import Grocery

data_dir = "../data/"
src_fn = data_dir + 'train_set_100.txt'

# Train the backout-reason classifier from the tab-separated sample file.
grocery = Grocery('backout_reason')
grocery.train(src_fn)

# One counter per known type; the type name is the second column of type.txt.
tp_cnt = {}
fh = open(data_dir + 'type.txt')
for row in fh:
    fields = row.split()
    tp_cnt[fields[1]] = 0
fh.close()

# Classify every merged line and tally predictions per type.
fh = open(data_dir + 'bcmtmoz.merge')
for row in fh:
    predicted = grocery.predict(row)
    tp_cnt[predicted] += 1
print(tp_cnt)
# coding: utf-8
from tgrocery import Grocery

# Train from in-memory pairs, then evaluate on a small labelled set.
grocery = Grocery('test')
train_src = [
    ('education', '名师指导托福语法技巧:名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')
]
grocery.train(train_src)
print(grocery.get_load_status())
test_src = [
    ('education', '福建春季公务员考试报名18日截止 2月6日考试'),
    ('sports', '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'),
]
test_result = grocery.test(test_src)
print(test_result.accuracy_labels)
print(test_result.recall_labels)

# Same flow, training and testing from text files instead of lists.
grocery = Grocery('text_src')
train_src = '../text_src/train_ch.txt'
grocery.train(train_src)
print(grocery.get_load_status())
test_src = '../text_src/test_ch.txt'
test_result = grocery.test(test_src)
print(test_result.accuracy_labels)
def test_main(self):
    """Train, save, reload and exercise a Grocery model, then clean up.

    Bug fixed: the reload used a hard-coded ``Grocery('test')`` and the
    assertions ran against the stale pre-reload ``grocery`` object.
    tgrocery resolves the model directory from the instance name, so the
    reload must use ``self.grocery_name`` — otherwise this test only
    works when ``grocery_name == 'test'`` and never verifies the
    save/load round-trip at all.
    """
    grocery = Grocery(self.grocery_name)
    grocery.train(self.train_src)
    grocery.save()
    # Reload under the SAME name and verify the round-trip on the new object.
    new_grocery = Grocery(self.grocery_name)
    new_grocery.load()
    print(new_grocery.predict('考生必读:新托福写作考试评分标准'))
    assert new_grocery.get_load_status()
    assert new_grocery.predict('考生必读:新托福写作考试评分标准') == 'education'
    # cleanup: remove the saved model directory
    if self.grocery_name and os.path.exists(self.grocery_name):
        shutil.rmtree(self.grocery_name)
def unzip(seq, L=None):
    """Recursively flatten nested lists in *seq* into a single flat list.

    Only ``list`` instances are descended into; every other element
    (tuples and strings included) is kept as an atom.  *L*, when given,
    is the accumulator list that leaves are appended to (and is also the
    return value); when omitted a fresh list is created per call, so the
    default never leaks state between calls.

    >>> unzip([1, [2, [3]], 4])
    [1, 2, 3, 4]
    """
    if L is None:
        L = []
    for item in seq:
        if isinstance(item, list):   # idiom: single type needs no tuple
            L.extend(unzip(item))
        else:
            L.append(item)
    return L


if __name__ == "__main__":
    grocery = Grocery('sample')
    grocery.train(train_src)
    grocery.save()
    new_grocery = Grocery('sample')
    new_grocery.load()
    # Read the first CSV column into L1.
    L1 = []
    with open('/home/mouse/Downloads/female.csv', 'r') as f1:
        f1_csv = csv.reader(f1)
        for row in f1_csv:
            L1.append(row[0])
    # print(len(L1))
    cate = category(L1)
    i = 1
    with open('/home/mouse/infoss.csv', 'w') as f:
        pass  # NOTE(review): loop body truncated in the source chunk
#!/usr/bin/env python
# coding=utf-8
from tgrocery import Grocery

# Training already done; kept commented for reference.
#grocery = Grocery('age56')
#grocery.train('train4_age_56', ' ')
#grocery.save()

# Evaluate the saved "age" model on a space-delimited test file.
new_grocery = Grocery("age")
new_grocery.load()
predict_result = new_grocery.test('test4_age', ' ')
#print len(predict_result.true_y)
#for i in range(len(predict_result.predicted_y)):
#print predict_result.predicted_y[i]
print(predict_result)
predict_result.show_result()
##########################################
# init: pick the backend and load the input document
model_choose = "svm"  # svm, lda, rnn
grocery_name = "./SVM_models/svm_for_news"
corpus_path = "./Corpus/NewsClassCorpus/"
file_path = "./"
file_name = "post.txt"
t_text = delete_stop_words(codecs.open(file_path + file_name, encoding="UTF-8").read())
###########################################
# Classify with the chosen model (only the SVM path is implemented).
if model_choose == "svm":
    start = time.time()
    grocery = Grocery(grocery_name)
    grocery.load()
    prediction = grocery.predict(delete_stop_words(t_text))
    elapsed = time.time() - start
    t_label = prediction.predicted_y
    print("Sentiment: ", t_label)
    print("How much: ", prediction.dec_values[t_label])
    print("Elapsed time of predict is: %s s" % elapsed)
elif model_choose == "lda":
    pass
elif model_choose == "rnn":
    pass
else:
    print("")
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.path.append('../../')
from config import *
from tgrocery import Grocery

STOP_WORDS_FILE = 'stopwords.txt'
USER_DICT_FILE = 'user_dict.txt'

# Load the financial-text classifier once at import time.
model_fintext = Grocery('model_fintext')
model_fintext.load()

sys.path.append('../')
from get_es import *

es = Elasticsearch([{'host': ES_HOST, 'port': ES_PORT}])


def search(index_name):
    """Run the configured search against *index_name*; return the raw ES result."""
    es_search_options = set_search_optional()
    es_result = get_search_result(es_search_options, index=index_name)
    # final_result = get_result_list(es_result)
    # return final_result
    return es_result


def get_result_list(es_result):
    """Collect the '_source' payload of every hit into a plain list."""
    return [item['_source'] for item in es_result]
# coding:utf-8
#!/usr/bin/evn python
from tgrocery import Grocery

# Path of the saved classification model.
copy_grocery = Grocery('./classfynews_instance')
copy_grocery.load()
#copy_grocery = grocery

samples = ['我是中国人', '台北*****']
result = copy_grocery.predict(samples)
print(result.predicted_y)
#test_result = copy_grocery.test(test_in)
#print test_result.show_result()
# NOTE(review): collapsed fragment starting mid-loop (the definitions of `dic`,
# `tdic`, `_id`, `_type`, `contents`, `i` are outside this view).  It splits
# records into train/test dicts, builds pandas DataFrames, trains a Grocery
# model saved under 'classfynews_instance', reloads it, and predicts on the
# test contents.  Truncated at both ends, so kept byte-identical.
dic['id'].append(_id) dic['type'].append(_type) dic['contents'].append(contents) else : tdic['id'].append(_id) tdic['type'].append(_type) tdic['contents'].append(contents) i +=1 #train = pd.read_csv( train_file, header = 0, delimiter = "\t", quoting = 3 ) #test = pd.read_csv( test_file, header = 1, delimiter = "\t", quoting = 3 ) train = DataFrame(dic) test = DataFrame(tdic) # #classfynews_instance is the model save path grocery = Grocery('classfynews_instance') train_in = [train['contents'],train['type']] grocery.train(train_in) print grocery.get_load_status() #grocery.save() copy_grocery = Grocery('classfynews_instance') copy_grocery.load() #copy_grocery = grocery test_in = [test['contents'],test['type']] # input like ['我是中国人','台北*****'] # output like [11,12] test_result = copy_grocery.predict(test['contents']) print test_result.predicted_y #test_result = copy_grocery.test(test_in)
def __init__(self):
    """Hold a tgrocery model named 'autohome' (not yet loaded or trained)."""
    self.grocery = Grocery('autohome')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from tgrocery import Grocery

# Open a new "grocery" classifier (the name also becomes its save path).
grocery = Grocery('sample')
# Training data can be passed as a list of (label, text) pairs...
train_src = [
    ('education', '名师指导托福语法技巧:名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')
]
grocery.train(train_src)
# ...or as a file path (tab-separated by default; custom delimiters supported).
#grocery.train('train_ch.txt')
# Persist the trained model to disk.
grocery.save()
# Reload it — the name must match the one used when saving.
new_grocery = Grocery('sample')
new_grocery.load()
# Predict a single title.
new_grocery.predict('考生必读:新托福写作考试评分标准')
#education
# Evaluate on labelled pairs.
test_src = [
    ('education', '福建春季公务员考试报名18日截止 2月6日考试'),
    ('sports', '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'),
]
new_grocery.test(test_src)
def __init__(self, name):
    """Wrap a tgrocery model called *name*.

    The model is created but not loaded here; `loaded` starts False and
    `correct` starts at 1.0 (presumably updated after training/testing
    elsewhere in the class — confirm against the other methods).
    """
    super(MyGrocery, self).__init__()
    self.grocery = Grocery(name)
    self.loaded = False
    self.correct = 1.0