class GroceryModel(object):
    """Thin wrapper around a tgrocery ``Grocery`` text classifier.

    Training/testing files are tab-separated: label, TAB, text (the text may
    itself contain tabs; everything after the first tab is joined back).
    """

    def __init__(self):
        self.grocery = Grocery('TextClassify')

    @staticmethod
    def _read_dataset(path):
        """Parse a tab-separated file into a list of (label, text) pairs."""
        dataset = []
        # Context manager closes the file even on error (the original leaked it).
        with open(path, 'r') as f:
            for raw in f:
                line = raw.decode('utf8')  # Python 2: bytes -> unicode
                tmp = line.split('\t')
                dataset.append((tmp[0], ''.join(tmp[1:])))
        return dataset

    def train(self, train_file):
        """Train on *train_file* and persist the model."""
        self.grocery.train(self._read_dataset(train_file))
        self.grocery.save()

    def load_model(self):
        """Load the previously saved model."""
        self.grocery.load()

    def test(self, test_src):
        """Evaluate the saved model against *test_src* and print the result."""
        self.load_model()
        result = self.grocery.test(self._read_dataset(test_src))
        print(result)

    def predict(self, text):
        """Print the predicted label for *text*."""
        print(self.grocery.predict(text))
def test_main(self):
    """Train, persist, reload and exercise a Grocery model end-to-end."""
    grocery = Grocery(self.grocery_name)
    grocery.train(self.train_src)
    grocery.save()
    # Assert against the reloaded instance: the original loaded `new_grocery`
    # but then predicted/asserted on the in-memory training instance, so the
    # persisted model was never actually tested.
    new_grocery = Grocery('test')
    new_grocery.load()
    assert new_grocery.get_load_status()
    print(new_grocery.predict('考生必读:新托福写作考试评分标准'))
    assert new_grocery.predict('考生必读:新托福写作考试评分标准') == 'education'
    # cleanup
    if self.grocery_name and os.path.exists(self.grocery_name):
        shutil.rmtree(self.grocery_name)
class AutoGrocery(object):
    """Grocery model stored under ``<project_dir>/models/model_data/<name>``.

    Lazily loads the saved model on first prediction, training a fresh one
    from the supplied data if no saved model exists yet.
    """

    def __init__(self, name, train_data):
        self._train_data = train_data
        self._grocery = Grocery(project_dir + '/models/model_data/' + name)

    def train(self):
        """Fit the model on the training data given at construction time."""
        self._grocery.train(self._train_data)

    def save(self):
        """Persist the fitted model to disk."""
        self._grocery.save()

    def load(self):
        """Load the persisted model from disk."""
        self._grocery.load()

    def predicate(self, src):
        """Classify *src*; returns ``(label, decision_value)``."""
        if not self._grocery.get_load_status():
            try:
                self.load()
            except ValueError:
                # No saved model available yet -- build and persist one now.
                self.train()
                self.save()
        prediction = self._grocery.predict(src)
        best_label = prediction.predicted_y
        return best_label, prediction.dec_values[best_label]
def predict_test(model_path, data):
    """Classify each ';'-separated sentence of *data* with a saved Grocery model.

    Returns a dict: ``{'IsErr': bool, 'ErrDesc': unicode, 'data': [...]}`` where
    each data entry is ``{'tag': predicted label, 'sentence': original text}``.
    """
    # 加载模型
    try:
        model_path = os.path.join(BASE_DIR, 'learn', model_path)
        new_grocery = Grocery(model_path.encode('utf-8'))
        new_grocery.load()
    except Exception:
        return {'IsErr': True, 'ErrDesc': u'学习模型加载不成功,请检查路径'}
    # 整理输入数据 -- sentences are ';'-separated; a trailing ';' is allowed.
    sentences = data.split(';')
    if sentences[-1] == '':
        sentences.pop()
    if not sentences:
        return {'IsErr': True, 'ErrDesc': u'输入的句子结构有错误或没有数据'}
    # 分词,再判断 -- a set gives O(1) stop-word membership tests.
    stop_words = set(read_lines(os.path.join(BASE_DIR, 'learn', 's_w.txt')))
    result = []
    for s in sentences:
        # ' '.join replaces the original quadratic `tmp_s += word + ' '` loop
        # (followed by .strip()) with the same final string.
        kept = [word for word in jieba.cut(s) if word not in stop_words]
        tokenised = ' '.join(kept)
        result.append({
            'tag': str(new_grocery.predict(tokenised.encode('utf-8'))),
            'sentence': s,
        })
    return {'IsErr': False, 'ErrDesc': u'成功', 'data': result}
def tGrocery():
    """Train on the pickled sample set, then score the benchmark split.

    Writes one ``gold<-->prediction /x01 text`` line per benchmark sample to
    ``testResult.tmp`` and prints a final accuracy figure.
    """
    # Context managers close both files even on error (the original leaked them).
    with open('SampleSeg.pk') as pk:
        trainingSet, benchmark = pickle.load(pk)
    # benchmark rows are (gold_label, text) pairs.
    correctLabel = [row[0] for row in benchmark]
    testingSet = [row[1] for row in benchmark]
    grocery = Grocery('test')
    grocery.train(trainingSet)
    grocery.save()
    # Reload from disk so the persisted model (not the in-memory one) is tested.
    new_grocery = Grocery('test')
    new_grocery.load()
    Prediction = []
    with open('testResult.tmp', 'w') as outFile:
        for i, sample in enumerate(testingSet):
            print('%d out of %d' % (i, len(testingSet)))
            prediction = new_grocery.predict(sample)
            Prediction.append(prediction)
            # NOTE(review): ' /x01' looks like it was meant to be the '\x01'
            # separator -- kept verbatim to preserve the existing file format.
            outFile.write(correctLabel[i] + '<-->' + prediction + ' /x01' + sample + '\n')
    correct = 0
    for pred, truth in zip(Prediction, correctLabel):
        if pred == truth:
            correct += 1
            print(pred, truth, 'Correct')
        else:
            print(pred, truth, 'False')
    print('Correct Count:', correct)
    print('Accuracy: %f' % (1.0 * correct / len(Prediction)))
def __init__(self, keyword):
    """Tag every news row matching *keyword* with a numeric category id."""
    print('进行新闻分类')
    db, cursor = connectdb()
    # Mark the task as in-progress, then pull the rows to classify.
    cursor.execute("update task set status=1 where keyword=%s", [keyword])
    cursor.execute("select id, title from news where keyword=%s", [keyword])
    rows = cursor.fetchall()
    classifier = Grocery('static/paris')
    classifier.load()
    # Chinese label -> numeric id stored in the DB (5 is unused upstream).
    label_ids = [(u'新闻背景', 1), (u'事实陈述', 2), (u'事件演化', 3),
                 (u'各方态度', 4), (u'直接关联', 6), (u'暂无关联', 7)]
    for row in rows:
        tag = classifier.predict(row['title'])
        # Compare with == (not a dict lookup) to keep the prediction object's
        # string-equality semantics; unmatched labels fall through unchanged.
        for label, numeric in label_ids:
            if tag == label:
                tag = numeric
                break
        cursor.execute("update news set tag=%s where id=%s", [tag, row['id']])
    closedb(db, cursor)
    return
def train_compare_result(train_src, test_src):
    """Train on *train_src* and measure prediction precision on *test_src*.

    *test_src* is an iterable of ``(true_class, title)`` pairs.
    Returns ``(precision, history)`` where history holds one dict per sample.
    """
    grocery = Grocery('test')
    grocery.train(train_src)
    print(grocery.get_load_status())
    len_test = len(test_src)
    print(len_test)
    correct_count = 0
    history = []
    for true_class, title in test_src:
        prediction = grocery.predict(title)
        # BUG FIX: the original stored the true label under 'predict_class'
        # and the prediction under 'true_class' (the assignment was swapped).
        record = {
            'predict_title': title,
            'predict_class': prediction,
            'true_class': true_class,
        }
        if str(prediction) == str(true_class):
            correct_count += 1
        history.append(record)
    predict_precision = float(correct_count) / len_test
    return predict_precision, history
def __init__(self, keyword):
    """Classify the title of every news row matching *keyword* and persist
    a numeric tag back into the ``news`` table.
    """
    print '进行新闻分类'
    (db, cursor) = connectdb()
    # Flag the task as running before doing the slow classification pass.
    cursor.execute("update task set status=1 where keyword=%s", [keyword])
    cursor.execute("select id, title from news where keyword=%s",[keyword])
    news = cursor.fetchall()
    # Pre-trained model stored under static/paris.
    new_grocery = Grocery('static/paris')
    new_grocery.load()
    for item in news:
        tag = new_grocery.predict(item['title'])
        # Map the Chinese label to a numeric id (note: 5 is never assigned,
        # and an unmatched label is written to the DB as-is).
        if tag == '新闻背景':
            tag = 1
        elif tag == '事实陈述':
            tag = 2
        elif tag == '事件演化':
            tag = 3
        elif tag == '各方态度':
            tag = 4
        elif tag == '直接关联':
            tag = 6
        elif tag == '暂无关联':
            tag = 7
        cursor.execute("update news set tag=%s where id=%s", [tag, item['id']])
    closedb(db, cursor)
    return
def test_main(self):
    """Train, save, reload and validate a Grocery classifier."""
    grocery = Grocery(self.grocery_name)
    grocery.train(self.train_src)
    grocery.save()
    # Exercise the reloaded model rather than the in-memory training instance
    # (the original created and loaded `new_grocery` but never used it).
    new_grocery = Grocery('test')
    new_grocery.load()
    assert new_grocery.get_load_status()
    result = new_grocery.predict('just a testing')
    print(result)
    result = new_grocery.predict('考生必读:新托福写作考试评分标准')
    print(result)
    print("type of result is :", type(result))
    assert str(new_grocery.predict('考生必读:新托福写作考试评分标准')) == 'education'
    assert str(new_grocery.predict('法网')) == 'sports'
    # cleanup
    if self.grocery_name and os.path.exists(self.grocery_name):
        shutil.rmtree(self.grocery_name)
def labelmaker(self):
    """Classify ``self.shorttext``; return ``[best_label, confidence]``."""
    grocery = Grocery('11c_20k_20171226')
    grocery.load()
    # Pick the (label, score) pair with the highest decision value.
    # FIX: the original used sorted(..., cmp-lambda, reverse=True)[0], which is
    # Python-3 incompatible; max(key=...) selects the same pair.
    dec_values = grocery.predict(self.shorttext).dec_values
    best_label, confidence = max(dec_values.items(), key=lambda kv: kv[1])
    result = [best_label,   # highest-confidence class
              confidence]   # its decision value
    return result
def phgrocery(text):
    """Return the saved model's predicted class for *text* as an int."""
    classifier = Grocery('model_redian_5')
    classifier.load()
    prediction = classifier.predict(text)
    return int(prediction.predicted_y)
def GET(self, name):
    """Classify *name* (UTF-8 encoded) with the saved 'sample' model."""
    title = name.encode('utf-8')
    classifier = Grocery('sample')
    classifier.load()
    return classifier.predict(title)
def test_main(self):
    """End-to-end check: train, persist, reload, predict."""
    grocery = Grocery(self.grocery_name)
    grocery.train(self.train_src)
    grocery.save()
    # Assert against the reloaded model; the original loaded `new_grocery`
    # but then only exercised the in-memory training instance.
    new_grocery = Grocery('test')
    new_grocery.load()
    assert new_grocery.get_load_status()
    assert new_grocery.predict('考生必读:新托福写作考试评分标准') == 'education'
    # cleanup
    if self.grocery_name and os.path.exists(self.grocery_name):
        shutil.rmtree(self.grocery_name)
def predict_phrasing(self, text=u'曾被年轻人嫌弃,如今却媲美Zara'):
    """Score *text* with the saved model.

    Returns the decision value for the u'postive' class (the label spelling
    matches whatever the model was trained with).
    """
    classifier = Grocery(self.model_name)
    classifier.load()
    prediction = classifier.predict(text)
    return prediction.dec_values[u'postive']
def demo_flask(image_file):
    """OCR a business-licence image and extract the company name and id.

    Runs the OCR pipeline over *image_file*, then routes each recognised text
    line through hand-written keyword heuristics, falling back to the Grocery
    classifier, to find the 'company-name' and 'company-id' lines.
    Returns ``(output_image_path, id_total, name_total)``.
    """
    # Build a Grocery instance whose model object is loaded manually so we can
    # skip the usual save/load cycle.
    grocery = Grocery('NameIdAdd_NLP')
    model_name = grocery.name
    text_converter = None
    tgm = GroceryTextModel(text_converter, model_name)
    tgm.load(model_name)
    grocery.model = tgm
    t = time.time()
    result_dir = './result'
    image = np.array(Image.open(image_file).convert('RGB'))
    # ocr_whole.model returns per-line results plus the image with boxes drawn.
    result, image_framed = ocr_whole.model(image)
    output_file = os.path.join(result_dir, image_file.split('/')[-1])
    Image.fromarray(image_framed).save(output_file)
    name_total = ''
    id_total = ''
    for key in result:
        string1 = result[key][1]
        # Very short lines carry no useful field information.
        if len(string1) <= 8:
            continue
        # Strip punctuation/symbols (ASCII and full-width) before matching.
        string2 = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*{}[]+", "", string1)
        # assumes string2 survives a GBK round-trip -- TODO confirm for rare chars
        no_digit = len(list(filter(str.isdigit, string2.encode('gbk'))))
        no_alpha = len(list(filter(is_alphabet, string2)))
        # Keyword heuristics first; only unclassified lines hit the model.
        if len(set('法定代表人') & set(string2)) >= 2 or len(set('经营范围') & set(string2)) >= 2 or '资本' in string2 or '类型' in string2 or len(set('年月日') & set(string2)) >= 2 or len(set('登记机关') & set(string2)) >= 2 or '电话' in string2:
            predict_result = 'others'
        elif len(set('经营场所') & set(string2)) >= 3 or '住所' in string2 or len(set('营业场所') & set(string2)) >= 3:
            predict_result = 'company-address'
        elif len(set('统一社会信用代码') & set(string2)) >= 2 or ((no_digit+no_alpha) / len(string2) > 0.5 and no_digit > 8):
            predict_result = 'company-id'
        elif '名称' in string2:
            predict_result = 'company-name'
        else:
            predict_result = grocery.predict(string2)
        # First company-name hit ends the scan; id lines keep accumulating.
        if str(predict_result) == 'company-name':
            name_total += string1
            break
        elif str(predict_result) == 'company-id':
            id_total += string1
        else:
            continue
    # Keep only word characters in the id; clean up the name heuristically.
    id_total = re.sub(r'\W', '', id_total)
    name_total = stupid_revise(name_total)
    print("Mission complete, it took {:.3f}s".format(time.time() - t))
    print('\nRecongition Result:\n')
    print(id_total)
    print(name_total)
    return output_file, id_total, name_total
class MyGrocery(object):
    """Convenience wrapper around Grocery with lazy loading and CSV I/O.

    CSV rows are ``label,text``; text is decoded from UTF-8 (Python 2).
    """

    def __init__(self, name):
        super(MyGrocery, self).__init__()
        self.grocery = Grocery(name)
        self.loaded = False   # lazy-load guard for load_model()
        self.correct = 1.0    # accuracy of the most recent test() run

    def train(self, src):
        """Train from a CSV file of (label, text) rows."""
        lines = []
        # Context manager closes the file (the original leaked the handle).
        with open(src) as fp:
            for row in csv.reader(fp):
                label, text = row[0], row[1].decode('utf8')
                lines.append((label, text))
        self.grocery.train(lines)

    def save_model(self):
        """Persist the trained model."""
        self.grocery.save()

    def train_and_save(self, src):
        """Train from *src* and immediately persist the model."""
        self.train(src)
        self.save_model()

    def load_model(self):
        """Load the saved model once; later calls are no-ops."""
        if not self.loaded:
            self.grocery.load()
            self.loaded = True

    def predict(self, text):
        """Return the model's prediction for *text* (loads lazily)."""
        self.load_model()
        return self.grocery.predict(text)

    def test(self, src):
        """Evaluate on a CSV of (label, text) rows; return a JSON summary."""
        self.load_model()
        total, wrong_num = 0.0, 0.0
        with open(src) as fp:
            for row in csv.reader(fp):
                total += 1
                if row[0] != self.predict(row[1]):
                    wrong_num += 1
        print("load test file from " + src)
        correct = (total - wrong_num) / total
        self.correct = correct
        print("total: %d , wrong_num: %d, success percentage: %f" % (total, wrong_num, correct))
        result = dict(type="test", total=total, wrong_num=wrong_num, correct=correct)
        return json.dumps(result)
def tgrocery_train(train_data, test_data):
    '''Train a TextGrocery model, reload it, and label the test corpus.

    Returns (test_corpus, test_label, predict_label).
    '''
    print("训练语料总数为: " + str(len(train_data)))
    test_corpus, test_label = test_split(test_data)
    grocery = Grocery('TextGrocery')
    print("start training......")
    grocery.train(train_data)
    grocery.save()
    # Reload under the same name so the persisted model is the one evaluated.
    new_grocery = Grocery('TextGrocery')
    new_grocery.load()
    predict_label = [str(new_grocery.predict(sample)) for sample in test_corpus]
    return test_corpus, test_label, predict_label
def predict_corpus(input_file, output_csv):
    """Classify column 2 of Sheet1 in *input_file* (.xls) and write
    ``(sentence, predicted_label)`` rows to *output_csv*.
    """
    import csv
    corpus = []
    workbook = xlrd.open_workbook(input_file)
    table = workbook.sheet_by_name('Sheet1')
    for rownum in range(table.nrows):
        row = table.row_values(rownum)
        # NOTE(review): the original called row[2].strip() but discarded the
        # result; behaviour (unstripped text) is preserved here -- confirm intent.
        corpus.append(row[2])
    corpus_grocery = Grocery(project_name)
    corpus_grocery.load()
    output = [(sentence, corpus_grocery.predict(sentence)) for sentence in corpus]
    # FIX: replaced the Python-2-only file() builtin with open() in a context
    # manager so the CSV is always flushed and closed.
    with open(output_csv, 'wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(output)
    print('Done!')
class jdParser(object):
    """Splits a job description into 'demand' and 'duty' lines using a saved
    Grocery classifier."""

    def __init__(self):
        self.clf = Grocery("./jdclf")
        self.clf.load()
        # Sentence delimiters: ASCII/full-width semicolons, 。, and newlines.
        self.LINE_SPLIT = re.compile(u"[;。;\n]")

    def get_demand_and_duty(self, jdstr):
        """Classify each line of *jdstr*; return {'demand': ..., 'duty': ...}."""
        # BUG FIX: the original wrote len(line.strip() > 4), which raises
        # TypeError (len of a bool); the length test belongs on the string.
        linelist = [line.strip() for line in self.LINE_SPLIT.split(jdstr)
                    if len(line.strip()) > 4]
        demand = []
        duty = []
        for line in linelist:
            pred = str(self.clf.predict(line))
            if pred == "demand":
                demand.append(line)
            elif pred == "duty":
                duty.append(line)
        result = {'demand': '\n'.join(demand), 'duty': '\n'.join(duty)}
        # BUG FIX: the original built `result` but never returned it.
        return result
class jdParser(object):
    """Classifies job-description lines into requirements vs responsibilities."""

    def __init__(self):
        self.clf = Grocery("./jdclf")
        self.clf.load()
        # Sentence delimiters: ASCII/full-width semicolons, 。, and newlines.
        self.LINE_SPLIT = re.compile(u"[;。;\n]")

    def get_demand_and_duty(self, jdstr):
        """Classify each line of *jdstr*; return {'demand': ..., 'duty': ...}."""
        linelist = [line.strip() for line in self.LINE_SPLIT.split(jdstr)
                    if len(line.strip()) > 4]
        demand = []
        duty = []
        for line in linelist:
            pred = str(self.clf.predict(line))
            if pred == "demand":
                demand.append(line)
            elif pred == "duty":
                duty.append(line)
        result = {'demand': '\n'.join(demand), 'duty': '\n'.join(duty)}
        # BUG FIX: the original built `result` but never returned it.
        return result
from tgrocery import Grocery # 新开张一个杂货铺,别忘了取名! grocery = Grocery('sample') # 训练文本可以用列表传入 train_src = [ #('education', '名师指导托福语法技巧:名词的复数形式'), ('tuofu', '名师指导托福语法技巧:名词的复数形式'), ('education', '中国高考成绩海外认可 是“狼来了”吗?'), ('tuofu', '中国高考成绩海外认可 是“狼来了”吗?'), ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'), ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与') ] grocery.train(train_src) # 也可以用文件传入 #grocery.train('train_ch.txt') # 保存模型 grocery.save() # 加载模型(名字和保存的一样) new_grocery = Grocery('sample') new_grocery.load() # 预测 print "category:", new_grocery.predict('考生必读:新托福写作考试评分标准') #education # 测试 test_src = [ ('education', '福建春季公务员考试报名18日截止 2月6日考试'), ('sports', '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'), ] new_grocery.test(test_src)
class JdCRF(object):
    """CRF-based tagger for job-description lines.

    A pre-trained Grocery classifier supplies a per-line label feature; a
    bank of hand-written regexes over Chinese JD text supplies the rest.
    Training/tagging is delegated to python-crfsuite.
    """
    def __init__(self):
        self.data = []
        # Line-level classifier used both for data generation and as a feature.
        self.clf = Grocery("jdclf")
        self.clf.load()
        # --- regex feature bank (matched against raw JD lines) ---
        self.SEX = re.compile(u"性别不限|性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.DEGREE = re.compile(u"(全日制)?(初中|高中|中专|大专|专科|大学专科|中职|本科|大学本科|硕士|研究生|博士|博士后)(.?以上)?")
        self.MAJOR = re.compile(u"\S+(相关专业|专业优先|及其.专业|[类等]专业[优先]?)")
        self.EXP = re.compile(u"工作经验:|工作经[历验]|工作年限|年.{0,4}经[历验]|经[历验].{1,6}年")
        self.PUB_TIME = re.compile(u"(\d+)(天前发布)")
        self.INCNAME = re.compile(u"\S+(有限公司|酒店|银行|集团|研究中心|研究所|学校|旅行社|分?公司|研发中心|技术部|事.部|招聘)")
        self.NOT_INC = re.compile(u"职位|描述|收藏|推荐|地址|邮箱|主页|介绍|欢迎|加入|要求|简介|险一金|奖金|包吃住|社区|厂房|人员|职责")
        self.INCTAG = re.compile(u"大公司|五百强|全球500强|小公司|成长型公司|创业公司|私有经济|集体经济|集团|外企|已上市|稳定性高|平均年龄\d岁|妹纸多|学历高|福利待遇好|晋升机会大|民营公司|民营企业\
|互联网|创业型|国企|央企")
        self.JOBNAME = re.compile(u'\S*(研发工程师|工程师|经理|助理|顾问|前台|秘书|主管|研究员|实习生|操作员|专员|教学人员|技术人员|管理员|业务员|公关|程序员|教师|老师|培训生|\
文员|研究员|策划|主任|总监|设计师|分析师|架构师|摄影师|编辑|BD|游戏UI|Android(开发)?|PHP(开发)?|Python(开发)?|.?(急招|急聘|初级|中级|高级|方向).?[\s)】\)])|\
|行政人事|网店设计|客服|会计|电话销售|外贸跟单|web前端|游戏UI|后.开发|产品运营|商业数据分析')
        self.START_DEMAND = re.compile(u"(岗位要求|应聘条件|任职要求|岗位资格|任职资格|岗位条件|工作要求|任职条件|人员条件|职位.求|职位条件|职位描述|岗位资格|职位资格|具备条件)[::\s]\
|如果你.{0,10}[::\s]|我们希望你.{0,12}[::\s]|(要求|条件)[::\s]|你需要?具备什么.+[?\?::\s]|任职资格[::\s]")
        self.DEMAND = re.compile(u"熟悉|熟练|具有|善于|懂得|掌握|具备|能够|优先|不少于|不超过|至少|团队.作|良好的|工作经验|开发经验|实习经历|能力强|富有|以上学历|经验|喜欢|\
较强的.{2,8}能力|相关专业|相关学历|者优先|精通|了解|及以上|技术全面|.强的责任心|[能有]独立|英文流利")
        self.DUTY = re.compile(u"跟进|协助|负责|配合|其他工作|领导交办的|对.+提供|审核|参与|提出|跟踪|报告|为.+提出|日常.+工作|指导|跟进|拓展|运营|用户|客户|协调|拟写|通过|协同\
|完成|沟通|需求|秘书.{2,5}翻译")
        self.START_DUTY = re.compile(u"(岗位职责|岗位描述|职位描述|职责描述|任职描述|职位职责|工作职责|工作职能|职位职能|工作内容|实习内容|职位内容)[::\s]|做这样的事[::\s]|职责.{0,5}[::\s]")
        self.PAY = re.compile(u"薪酬|待遇|月薪|薪资|年薪|底薪|\d+k|\d+万|\d+元|工资|报酬|薪水|福利")
        self.BENEFIT = re.compile(u"周休|补助|补贴|假日|餐补|提成|交通补助|食宿|加班工资|期权|年假|领导|扁平化|管理|氛围|空间|休假|月假|带薪|全休|晋升|培训|舒适的|旅游|奖励|过节费|五险一金|奖金|\
|弹性工作|氛围|成长空间|实训|培训|高薪|前景|旅游|活动|分红")

    def gen_data(self,fname='./data/lagou_train.txt'):
        """Label every non-separator line of *fname* with the Grocery
        classifier and write 'pred\\t\\tline\\t\\tlen' records for CRF training."""
        fw = codecs.open('./data/jd_train_crf.txt','wb','utf-8')
        cnt = 1
        for line in codecs.open(fname,'rb','utf-8'):
            # "=====" lines separate documents; copy them through untouched.
            if line.startswith(u"====="):
                fw.write(line)
                continue
            cnt +=1
            if len(line.strip())>1:
                pred = self.clf.predict(line)
                newline = pred+'\t\t'+line.strip()+'\t\t'+str(len(line))+"\n"
                fw.write(newline)
        print cnt
        print 'done'

    def load_data(self,fname="./data/jd_train_crf.txt"):
        """Read the generated file back into per-document tuple lists and
        split it in half: returns (train_docs, test_docs)."""
        data = []
        tmp = []
        for line in codecs.open(fname,'rb','utf-8'):
            if line.startswith(u"===="):
                data.append(tmp)
                tmp = []
                continue
            else:
                tag_data = line.strip().split('\t\t')
                if len(tag_data)==3:
                    tmp.append(tuple(tag_data))
                else:
                    print '\t '.join(tag_data)
        # Python 2 integer division: floor of half the document count.
        n = len(data)/2
        print 'train data',n
        print 'test data',len(data)-n
        return data[n:],data[:n]

    def word2features(self,sent,i):
        """Build the CRF feature list for token *i* of *sent*
        (sent rows are (label, token, postag) tuples)."""
        word = sent[i][0]
        postag = sent[i][1]
        # NOTE(review): '%' binds tighter than the conditional expression, so
        # entries like 'demand=%s'% '1' if ... else '0' evaluate to 'demand=1'
        # on a match but to the bare string '0' otherwise -- the feature name
        # is lost on the no-match branch. Confirm whether this is intended.
        features = [
            'bias',
            'word.lower=' + word.lower(),
            'word[:2]=' +word[:2],
            'word.isdigit=%s'%word.isdigit(),
            'postag='+postag,
            'demand=%s'% '1' if self.DEMAND.search(word) else '0',
            'start_demand=%s'% '1' if self.START_DEMAND.search(word) else '0',
            'start_duty=%s'% '1' if self.START_DUTY.search(word) else '0',
            'duty=%s'% '1' if self.DUTY.search(word) else '0',
            'jobname=%s'% '1' if self.JOBNAME.search(word) else '0',
            'incname=%s'% '1' if self.INCNAME.search(word) else '0',
            'benefit = %s'% '1' if self.BENEFIT.search(word) else '0',
            'pred=%s' % self.clf.predict(word)
        ]
        if i>0:
            # Features from the previous token.
            # NOTE(review): several '-1:' entries test `word` rather than
            # `word1`, and the start_demand else-branch yields '1' -- verify.
            word1 = sent[i-1][0]
            postag1 = sent[i-1][1]
            features.extend([
                '-1:postag='+postag1,
                '-1:word.islower='+word1[:3].lower(),
                '-1:start_demand=%s'% '1' if self.START_DEMAND.search(word) else '1',
                '-1:start_duty=%s'% '1' if self.START_DUTY.search(word) else '0',
                '-1:demand=%s'% '1' if self.DEMAND.search(word1) else '0',
                '-1:duty=%s'% '1' if self.DUTY.search(word1) else '0',
                '-1:jobname=%s'% '1' if self.JOBNAME.search(word1) else '0',
                '-1:incname=%s'% '1' if self.INCNAME.search(word1) else '0',
                '-1:benefit = %s'% '1' if self.BENEFIT.search(word) else '0',
                '-1:pred=%s' % self.clf.predict(word),
            ])
        else:
            features.append('BOS')
        if i<len(sent)-1:
            # Features from the next token.
            # NOTE(review): word1 is taken from sent[i+1][1] (same index as
            # postag1), unlike the i-1 branch which uses [0] -- verify.
            word1 = sent[i+1][1]
            postag1 = sent[i+1][1]
            features.extend([
                '+1:word.lower=' + word1[:3].lower(),
                '+1:word.istitle=%s' % word1.istitle(),
                '+1:word.isupper=%s' % word1.isupper(),
                '+1:postag=' + postag1,
                '+1:postag[:2]=' + postag1[:2],
                '+1:start_demand=%s'% '1' if self.START_DEMAND.search(word) else '0',
                '+1:start_duty=%s'% '1' if self.START_DUTY.search(word) else '0',
                '+1:demand=%s'% '1' if self.DEMAND.search(word1) else '0',
                '+1:duty=%s'% '1' if self.DUTY.search(word1) else '0',
                '+1:jobname=%s'% '1' if self.JOBNAME.search(word1) else '0',
                '+1:incname=%s'% '1' if self.INCNAME.search(word1) else '0',
                '+1:benefit = %s'% '1' if self.BENEFIT.search(word) else '0',
                '+1:pred=%s' % self.clf.predict(word),
            ])
        else:
            features.append('EOS')
        return features

    def sent2features(self,sent):
        """Feature lists for every token of a sentence."""
        return [self.word2features(sent,i) for i in range(len(sent))]

    def sent2labels(self,sent):
        """Gold labels of a sentence."""
        return [label for (label,token,postag) in sent]

    def sent2tokens(self,sent):
        """Raw tokens of a sentence."""
        return [token for (label,token,postag) in sent]

    def train(self,x_train,y_train):
        """Train a crfsuite model on feature/label sequences; writes
        jd_skill.crfsuite to the working directory."""
        assert len(x_train)==len(y_train),"not the same %d %d"%(len(x_train),len(y_train))
        trainer = pycrfsuite.Trainer(verbose=False)
        for xseq,yseq in zip(x_train,y_train):
            trainer.append(xseq,yseq)
        trainer.set_params({
            'c1':1.0,
            'c2':1e-3,
            'max_iterations':50,
            'feature.possible_transitions':True
        })
        trainer.train('jd_skill.crfsuite')

    def test(self,sent):
        """Tag one sentence with the saved model and print tokens,
        predictions and gold labels side by side."""
        tagger = pycrfsuite.Tagger()
        tagger.open('./jd_skill.crfsuite')
        print 'tokens ','\n '.join(self.sent2tokens(sent))
        print 'Predicted','\t '.join(tagger.tag(self.sent2features(sent)))
        print 'Correct ','\t '.join(self.sent2labels(sent))
('normal', 'Fw:关于ICP备案申请审核通过的通知'), ('normal', '技术部-SSL数字加密证书') ] # 创建一个 grocery,'mail_class'为模型名称 grocery = Grocery('mail_class') grocery.train(train_src) grocery.save() # Load model(和之前设的名字一样) new_grocery = Grocery('mail_class') new_grocery.load() # 预测 print new_grocery.predict('关于神经网络与深度学习一书源码') # education # Test from list # test_src = [ # ('education', 'Abbott government spends $8 million on higher education media blitz'), # ('sports', 'Middle East and Asia boost investment in top level sports'), # ] # new_grocery.test(test_src) # # Return Accuracy # # 1.0 # # Or test from file # new_grocery.test('test_ch.txt') # # Custom tokenize # custom_grocery = Grocery('custom', custom_tokenize=list)
# NOTE(review): fragment of a larger script -- `mycontent`, `mysign`,
# `mytitle`, `i`, `train_listc`, `train_list`, `grocery` and `message`
# are all defined upstream of this view.
q=0
# Pair each non-empty content item with its label from mysign.
# NOTE(review): indentation reconstructed -- q is assumed to advance every
# iteration (not only for truthy items) so labels stay aligned; confirm.
for c in mycontent:
    if c:
        k=mysign[q]
        p=[k,c]
        train_listc.append(p)
    q=q+1
# Pair each title with its label.
for t in mytitle:
    m=mysign[i]
    n=[m,t]
    train_list.append(n)
    i=i+1
# Train on both content pairs and title pairs, then persist.
grocery.train(train_listc)
grocery.train(train_list)
grocery.save()
new_grocery=Grocery('trydb')
new_grocery.load()
pc=message.getContent1()
pt=message.getTitle1()
# Classify each article: content+title when content exists, title alone
# otherwise; g is the 1-based article index.
g=1
for newscontent in pc:
    if newscontent:
        num=new_grocery.predict(newscontent+pt[g-1])
        message.saveContent(g,num)
    else:
        num=new_grocery.predict(pt[g-1])
        message.saveContent(g,num)
    g=g+1
# NOTE(review): `CochraneAnalysis` (a MongoDB database handle) is defined
# upstream of this view.
Train = CochraneAnalysis["Train"]
Data = CochraneAnalysis["Data"]
PredictedLabels = CochraneAnalysis["PredictedLabels"]
# Full tag vocabulary; unused in the code below.
x = ["Y", "YU", "U", "UN", "N", "YR", "YUR", "UR", "UNR", "NR"]

def getTag(tag):
    # Collapse multi-character tags to binary: anything starting with
    # "Y" counts as "Y", everything else as "N".
    if (tag[0] == "Y"):
        return "Y"
    return "N"

grocery = Grocery('sample')
train_src = []
for data in Train.find():
    label = getTag(data["Tag"])
    text = data["Authors' conclusions"]
    train_src.append((label, text))
# Preparing training data from hand labeled classfiers
grocery.train(train_src)
grocery.save()
# Predict a label for every unlabelled article and store the annotated copy.
for data in Data.find():
    pred_label = grocery.predict(data["Authors' conclusions"])
    data["PredictedLabel"] = pred_label
    PredictedLabels.insert_one(data)
# ======= Predict And result.csv ======= with open( "result.csv", "w") as outputfile: outputfile.write("NewsId,Agency") outputfile.write("\n") for fname in os.listdir(queryDir): if fname == ".DS_Store": continue with open(os.path.join(queryDir, fname)) as f: data = f.read().splitlines() nameOfdata = data[0] print nameOfdata content = "" for x in xrange(2,len(data)): # print data[x] content += data[x] # print content result = 0 # if grocery.predict(simplify(content)) == 2: # else: result = corporaMgr.getClassifiyPublisherName(content, grocery.predict(simplify(content)),sentimentResult) outputfile.write(nameOfdata) outputfile.write(",") outputfile.write(result) outputfile.write("\n") else: print "Please use like 'python sim.py [originDocs_Dir] [outputDocs_Dir] [WithCatagoryAndPublisher] [train.csv]'" # python categorydocs.py news_story_dataset/ preprocessingData/simplify/ preprocessingData/withCategory/ train.csv p2data/phase2_test_dataset/
# Collections of the Cochrane analysis database (client `c` defined upstream).
CochraneAnalysis = c["CochraneAnalysisArticle"]
Train = CochraneAnalysis["Train"]
Data = CochraneAnalysis["Data"]
PredictedLabels = CochraneAnalysis["PredictedLabels"]

# Full tag vocabulary; not referenced below.
x = ["Y", "YU", "U", "UN", "N", "YR", "YUR", "UR", "UNR", "NR"]


def getTag(tag):
    """Collapse a detailed tag to binary: 'Y' if it starts with Y, else 'N'."""
    return "Y" if (tag[0] == "Y") else "N"


# Build (label, text) training pairs from the hand-labelled collection.
grocery = Grocery('sample')
train_src = [(getTag(doc["Tag"]), doc["Authors' conclusions"]) for doc in Train.find()]
grocery.train(train_src)
grocery.save()

# Predict a label for every article and store the annotated document.
for doc in Data.find():
    doc["PredictedLabel"] = grocery.predict(doc["Authors' conclusions"])
    PredictedLabels.insert_one(doc)
def main(): # Get market_sentiment of word from NTUSD-Fin train_t = [] train_s = [] targetIn = {} targetDict = dict() with open('NTUSD-Fin/NTUSD_Fin_hashtag_v1.0.json', 'r', encoding='utf-8') as f: targetIn = json.load(f) N = len(targetIn) for i in range(N): word = "#" + targetIn[i]['token'] targetDict[word] = targetIn[i]['market_sentiment'] sg = str(GroupValue_s(str(targetDict[word] / 3.5))) train_s.append((sg, word)) with open('NTUSD-Fin/NTUSD_Fin_word_v1.0.json', 'r', encoding='utf-8') as f: targetIn = json.load(f) N = len(targetIn) for i in range(N): word = targetIn[i]['token'] targetDict[word] = targetIn[i]['market_sentiment'] sg = str(GroupValue_s(str(targetDict[word] / 3.5))) train_s.append((sg, word)) # Training File: Load data & Use tgrocery to train classification model TrainingFile = open('training_set.json', 'r') TrainingData = json.load(TrainingFile) TrainingFile.close() DataList = [] grocery_t = Grocery("tweet") grocery_s = Grocery("snippet") for DataElement in TrainingData: tempt = DataManager() tempt.insertData(DataElement) tempt.group_t = GroupValue_t(tempt.sentiment) tempt.group_s = GroupValue_s(tempt.sentiment) line = re.sub("https?://[\w\-]+(\.[\w\-]+)+\S*", " ", DataElement["tweet"]) train_t.append((str(tempt.group_t), line)) if isinstance(DataElement["snippet"], list): for line in DataElement["snippet"]: train_s.append((str(tempt.group_s), line)) elif DataElement["snippet"] != "": train_s.append((str(tempt.group_s), DataElement["snippet"])) else: tempt.group_s = 0.0 DataList.append(tempt) grocery_t.train(train_t + train_s) grocery_t.save() grocery_s.train(train_s) grocery_s.save() # Save training data created by WordScore() and GroupValue_*() # Data will be uesd for LinearRegression() in BOTH.py outfile = open('TG_train.txt', 'w', encoding='utf-8') dataScore = [] dataSentiment = [] for row in DataList: dataSentiment.append([float(row.sentiment)]) a = WordScore(row.tweet, targetDict) b = WordScore(row.snippet, targetDict) c = row.group_t d = 
row.group_s dataScore.append([a, b, c, d]) print(a, b, c, d, file=outfile) outfile.close() ''' # Train linear regression model model = LinearRegression() model.fit(dataScore, dataSentiment) # Test for training data print('(train)R-squared: %.3f' % model.score(dataScore, dataSentiment)) #0.915 predictions = model.predict(dataScore) rms = mean_squared_error(dataSentiment,predictions) print('RMSE: %.3f' % sqrt(rms)) #0.110 print('MSE: %.3f' % rms) #0.012 ''' # Testing File: Load data & Use tgrocery classification model to predict TestingFile = open('test_set.json', 'r') TestingData = json.load(TestingFile) TestingFile.close() DataList = [] new_grocery_t = Grocery('tweet') new_grocery_t.load() new_grocery_s = Grocery('snippet') new_grocery_s.load() for DataElement in TestingData: tempt = DataManager() tempt.insertData(DataElement) line = re.sub("https?://[\w\-]+(\.[\w\-]+)+\S*", " ", DataElement["tweet"]) tempt.group_t = float('{0}'.format(new_grocery_t.predict(line))) value = 0.0 if isinstance(DataElement["snippet"], list): for line in DataElement["snippet"]: value = value + float('{0}'.format( new_grocery_s.predict(line))) value = value / len(DataElement["snippet"]) elif DataElement["snippet"] != "": value = float('{0}'.format( new_grocery_s.predict(DataElement["snippet"]))) tempt.group_s = value DataList.append(tempt) # Save testing data created by WordScore() and classification prediction # Data will be uesd for LinearRegression() in BOTH.py outfile = open('TG_test.txt', 'w', encoding='utf-8') dataScore = [] dataSentiment = [] for row in DataList: dataSentiment.append([float(row.sentiment)]) a = WordScore(row.tweet, targetDict) b = WordScore(row.snippet, targetDict) c = row.group_t d = row.group_s dataScore.append([a, b, c, d]) print(a, b, c, d, file=outfile) outfile.close() '''
# coding: utf-8 from tgrocery import Grocery # save grocery = Grocery('test') train_src = [('education', '名师指导托福语法技巧:名词的复数形式'), ('education', '中国高考成绩海外认可 是“狼来了”吗?'), ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'), ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')] grocery.train(train_src) grocery.save() # load # grocery name must be the same as the previous one new_grocery = Grocery('test') new_grocery.load() print new_grocery.predict('考生必读:新托福写作考试评分标准')
# test ##################################
# Evaluate `grocery` (trained upstream) on a tab-separated test file and
# accumulate a binary confusion matrix; label u'0' is the negative class.
print('start test')
TP = 0.0
TN = 0.0
FP = 0.0
FN = 0.0
# Context manager closes the file (the original handle was never closed).
with codecs.open(testFileName, 'r', 'utf-8') as filetest:
    test_reader = filetest.readlines()
resultlist = []
for line in test_reader:
    # Renamed from `str`, which shadowed the builtin.
    # Columns: presumably id, gold label, text -- TODO confirm file layout.
    fields = line.split(u'\t')
    result = grocery.predict(fields[2])
    resultlist.append((fields[0], result))
    if (result == fields[1]):
        if (fields[1] == u'0'):
            TN = TN + 1
        else:
            TP = TP + 1
    else:
        if (fields[1] == u'0'):
            FP = FP + 1
        else:
            FN = FN + 1
precision = TP / (TP + FP)
concat = ','.join(no) + '$%^' + ','.join(qz) print concat clientSender.publish('similarResult', reqParamList[0] + '!@#' + concat) elif item['channel'] == 'abstract': # 文本抽取 text = reqParamList[1] tr4s.train(text=text, speech_tag_filter=True, lower=True, source = 'all_filters') # 使用词性过滤,文本小写,使用words_all_filters生成句子之间的相似性 # abstractResult = '\n'.join(tr4s.get_key_sentences(num=1+len(text)/350)) abstractResult = tr4s.get_key_sentences(num=1+len(text)/250) re = '' '$%^'.join(abstractResult) clientSender.publish('abstractResult', reqParamList[0] + '!@#' + '$%^'.join(abstractResult)) elif item['channel'] == 'classification': doc = reqParamList[1] #data, data_vec = ldaModel.file_to_vec(doc) #out_put, out_put_class = ldaModel.pre(data_vec) t_pre_result = grocery.predict(delete_stop_words(doc)) out_put_class = t_pre_result.predicted_y clientSender.publish('classificationResult', reqParamList[0] + '!@#' + out_put_class)
# Evaluate `grocery` on the validation reader, updating the confusion-matrix
# counters (TP/TN/FP/FN defined upstream) and logging misclassified lines.
resultlist = []
i = 0
for line in validate_reader:
    content = pp.getcontent(validate_reader, i)
    i = i + 1
    if (i % 5000 == 0):
        # Progress marker every 5000 lines.
        print(("%d " % (i)) + '#' * 30)
    if (content == ''):
        print(line)
    else:
        # FIX: renamed locals `str` and `len`, which shadowed the builtins.
        fields = content.split('\t')
        label_len = len(fields[0])
        # Skip the label prefix plus separator when predicting on the text.
        result = grocery.predict(content[label_len + 3:])
        if (result == fields[1]):
            if (fields[1] == u'0'):
                TN = TN + 1
            else:
                TP = TP + 1
        else:
            if (fields[1] == u'0'):
                FP = FP + 1
                fileOutput.write('FP: ' + line + ' \n')
            else:
                FN = FN + 1
                fileOutput.write('FN: ' + line + ' \n')
precision = TP / (TP + FP)
recall = TP / (TP + FN)
train_set = '/home/hntea/RobotWorkSpace/SpeechSystem/speech_system/src/nlu/script/training/dataset/train.dat'
text_set = '/home/hntea/RobotWorkSpace/SpeechSystem/speech_system/src/nlu/script/training/dataset/test.dat'
modelsave = 'model'

# Train the intent classifier and persist it.
grocery = Grocery(modelsave)
grocery.train(train_set)
grocery.save()

# 加载模型(名字和保存的一样) -- reload under the same name.
new_grocery = Grocery(modelsave)
new_grocery.load()
# ret = new_grocery.predict('放一首歌来听').predicted_y
# Spot-check a batch of utterances; the copy-pasted print lines were folded
# into one data-driven loop (same utterances, same order, same output).
queries = (
    '你叫什么名字', '吃饱没有', '周杰伦', '黑色衣服好看', '王力宏', '波哥',
    '播歌', '我要听张含韵的歌', '放一首:富士山下', '点播:兄弟', '听歌',
    '听歌。', '我要听歌', '我要听音乐。', '播放歌曲。', '音乐播放。',
)
for query in queries:
    print(new_grocery.predict(query))
import sys
reload(sys)
sys.setdefaultencoding('utf8')  # Python 2: force utf-8 as the default codec
# from pyspark import SparkContext
# from pyspark.sql import *
# from pyspark.sql.types import *
# import time
# import rapidjson as json
#
# sc = SparkContext(appName="cmt")
# sqlContext = SQLContext(sc)
# hiveContext = HiveContext(sc)
from tgrocery import Grocery

# Build a short-text classifier named 'sample'.
grocery = Grocery('sample')
# NOTE(review): this in-memory sample is never used — training below reads
# from the tab-separated file instead. Kept for reference.
train_src = [
    ('education', 'Student debt to cost Britain billions within decades'),
    ('education', 'Chinese education for TV experiment'),
    ('sports', 'Middle East and Asia boost investment in top level sports'),
    ('sports', 'Summit Series look launches HBO Canada sports doc series: Mudhar')
]
# Train from a label<TAB>text file and classify one sample headline.
grocery.train('/home/hadoop/tmp/tgrocery/train_file.txt')
print grocery.predict("7款清爽眼部卸妆液 卸掉残妆不留暗沉")
#!/usr/bin/env python # -*- coding: utf-8 -*- import MySQLdb from tgrocery import Grocery import sys reload(sys) sys.setdefaultencoding('utf8') grocery = Grocery('sample') dict_list = list() conn = MySQLdb.connect(host = 'localhost', db = 'newsdata', user = '******', passwd = 'root', charset = 'utf8', use_unicode = False) cur = conn.cursor() cur.execute('select com_new_type_id, com_new_name from tbl_new where com_new_type_id is not null') for row in cur.fetchall(): dict_list.append(row) grocery.train(dict_list) grocery.save() news_grocery = Grocery('sample') news_grocery.load() while True: result = news_grocery.predict(raw_input('please input title:' )) print result
########################################## # init model_choose = "svm" # svm, lda, rnn grocery_name = "./SVM_models/svm_for_news" corpus_path = "./Corpus/NewsClassCorpus/" file_path = "./" file_name = "post.txt" t_text = delete_stop_words(codecs.open(file_path + file_name, encoding="UTF-8").read()) ########################################### # 调用 SVM 模型分类 if model_choose == "svm": tic = time.time() grocery = Grocery(grocery_name) grocery.load() t_pre_result = grocery.predict(delete_stop_words(t_text)) toc = time.time() t_label = t_pre_result.predicted_y print("Sentiment: ", t_label) print("How much: ", t_pre_result.dec_values[t_label]) print("Elapsed time of predict is: %s s" % (toc - tic)) elif model_choose == "lda": pass elif model_choose == "rnn": pass else: print("")
# coding: utf-8
from tgrocery import Grocery


def _demo(model_name, source):
    # Train a model from `source`, then classify one sample headline and
    # dump the prediction plus its per-label decision values.
    g = Grocery(model_name)
    g.train(source)
    print(g.get_load_status())
    outcome = g.predict('考生必读:新托福写作考试评分标准')
    print(outcome)
    print(outcome.dec_values)


# Demo 1: in-memory (label, text) pairs.
_demo('test', [
    ('education', '名师指导托福语法技巧:名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与'),
])
# Demo 2: tab-separated training file.
_demo('read_text', '../text_src/train_ch.txt')
# test ################################## print 'start test' TP=0.0 TN=0.0 FP=0.0 FN=0.0 filetest=codecs.open(validateFileName,'r','utf-8') test_reader=filetest.readlines() resultlist=[] for line in test_reader: str=line.split(u',') #import pdb; pdb.set_trace() #print line result=grocery.predict(str[1]) #print result #import pdb; pdb.set_trace() if(result==str[0]): if(str[0]==u'0'): TN=TN+1 else: TP=TP+1 else: if(str[0]==u'0'): FP=FP+1 else: FN=FN+1 precision=TP/(TP+FP) recall=TP/(TP+FN)
# coding: utf-8 from tgrocery import Grocery # pass a tokenizer, must be a python func custom_grocery = Grocery('custom', custom_tokenize=list) train_src = [ ('education', '名师指导托福语法技巧:名词的复数形式'), ('education', '中国高考成绩海外认可 是“狼来了”吗?'), ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'), ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与') ] custom_grocery.train(train_src) print custom_grocery.get_load_status() print custom_grocery.predict('考生必读:新托福写作考试评分标准')
# coding: utf-8
from tgrocery import Grocery

SAMPLE = '考生必读:新托福写作考试评分标准'

# Run the same demo twice: once from in-memory (label, text) pairs and once
# from a tab-separated training file.
for model_name, source in (
    ('test', [('education', '名师指导托福语法技巧:名词的复数形式'),
              ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
              ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
              ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')]),
    ('read_text', '../text_src/train_ch.txt'),
):
    grocery = Grocery(model_name)
    grocery.train(source)
    print(grocery.get_load_status())
    predict_result = grocery.predict(SAMPLE)
    print(predict_result)
    print(predict_result.dec_values)
# Train a k-mer based token classifier, report its accuracy, then split
# tokens.txt into word vs pinyin files using the trained model.
grocery = Grocery('sample')
train_src = r'E:\classify\plan2\train_kmer.txt'
grocery.train(train_src, delimiter=',')
print('Training finished! Time consumption:')
mid = time.process_time()
print(str(mid - start))
grocery.save()
grocery.load()

test_src = r'E:\classify\plan2\test_kmer.txt'
print('Classification accuracy:')
print(grocery.test(test_src, delimiter=','))

# fix: use `with` so all three handles are closed even on error.
with open(r'E:\classify\tokens.txt', mode='r', encoding='utf-8') as classifile, \
     open(r'E:\classify\pinyin_grocery.txt', mode='w', encoding='utf-8') as pinyin, \
     open(r'E:\classify\words_grocery.txt', mode='w', encoding='utf-8') as words:
    for line in classifile:
        # fix: predict once per line — it was called twice, doubling the work.
        label = grocery.predict(getkmer(line, 2))
        if label == 'word':
            words.write(line)
        elif label == 'pinyin':
            pinyin.write(line)

print('Program running time:')
end = time.process_time()
print(str(end - start))
# coding:utf-8 #!/usr/bin/evn python from tgrocery import Grocery copy_grocery = Grocery('./classfynews_instance')#模型所在路径 copy_grocery.load() #copy_grocery = grocery test = ['我是中国人','台北*****'] test_result = copy_grocery.predict(test) print test_result.predicted_y #test_result = copy_grocery.test(test_in) #print test_result.show_result()
# coding=utf-8
from tgrocery import Grocery

TRAIN_PAIRS = [
    ('education', '名师指导托福语法技巧:名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与'),
]

# Train on the in-memory pairs and persist the model.
grocery = Grocery('sample')
grocery.train(TRAIN_PAIRS)
grocery.save()

# Reload into a fresh instance and exercise predict + test.
new_grocery = Grocery('sample')
new_grocery.load()
print(
    new_grocery.predict(
        'Abbott government spends $8 million on higher education media blitz'))

test_pairs = [
    ('education', '福建春季公务员考试报名18日截止 2月6日考试'),
    ('sports', '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'),
]
print("start test..................")
print(new_grocery.test(test_pairs))
class TagPredictor(object):
    """Label classifier built on tgrocery.

    Configured entirely through keyword args: `grocery_name` (model name),
    `method` ("normal" | "jieba" | "processed"), `train_src` (TSV file of
    label<TAB>text lines).  Models are archived under PREFIX+MODEL_DIR and
    temporarily moved back next to the working dir for load (see
    saveTrainModel/loadTrainModel).
    """

    def _custom_tokenize(self, line, **kwargs):
        """Tokenize `line` per the configured method.

        "normal" delegates to the keyword extractor; "processed" assumes the
        line is already a comma-joined token list.  Falls back to the
        constructor-time method when none is passed per call.
        """
        try:
            kwargs["method"]
        except:  # NOTE(review): bare except also hides non-KeyError failures
            method = str(self.kwargs["method"])
        else:
            method = str(kwargs["method"])
        if method == "normal":
            tokens = self.key_ext.calculateTokens(line,
                                                  doc_len_lower_bound=5,
                                                  doc_len_upper_bound=500,
                                                  method="normal")
        elif method == "processed":
            tokens = line.split(',')
        # NOTE(review): `tokens` is unbound for any other method value.
        return tokens

    def __init__(self, *args, **kwargs):
        self.grocery_name = str(kwargs["grocery_name"])
        method = str(kwargs["method"])
        train_src = str(kwargs["train_src"])  # NOTE(review): unused local
        self.PREFIX = conf.load("predict_label")["prefix"]
        self.MODEL_DIR = conf.load("predict_label")["model_dir"]
        self.kwargs = kwargs
        # Tokenizer choice: "jieba" uses tgrocery's default; the other two
        # plug in _custom_tokenize.
        if method == "normal":
            self.key_ext = keyExt()
            self.grocery = Grocery(self.grocery_name,
                                   custom_tokenize=self._custom_tokenize)
        elif method == "jieba":
            self.grocery = Grocery(self.grocery_name)
        elif method == "processed":
            self.grocery = Grocery(self.grocery_name,
                                   custom_tokenize=self._custom_tokenize)
        pass

    def trainFromDocs(self, *args, **kwargs):
        """Train from the configured train_src file and return the model."""
        model = self.grocery.train(self.kwargs["train_src"])
        return model

    def autoEvaluation(self, *args, **kwargs):
        """Train/evaluate, then retrain with low-recall labels pruned.

        kwargs: `threshold` (recall cut-off), `excluded_labels`,
        `excluded_docs`.  Returns (grocery, post-pruning test result).
        """
        prune_threshold = float(kwargs["threshold"])
        excluded_labels = kwargs["excluded_labels"]
        excluded_docs = kwargs["excluded_docs"]
        train_data = []
        with open(self.kwargs["train_src"], 'rb') as f:
            for line in f:
                # Skip lines without a tab-separated text column.
                try:
                    line.split('\t', 1)[1]
                except:
                    continue
                else:
                    train_data.append(
                        (line.split('\t', 1)[0],
                         line.split('\t', 1)[1].split('\n', 1)[0]))
            f.close()  # redundant: `with` already closes the file
        print "#items before filtering:", len(train_data)
        print "-- Now we filter out the excluded docs --"
        train_data = [i for i in train_data if i[1] not in excluded_docs]
        print "#items after filtering:", len(train_data)
        print "-- Now we filter out the excluded labels --"
        train_data = [i for i in train_data if i[0] not in excluded_labels]
        print "#items after filtering:", len(train_data)
        n = len(train_data)  #number of rows in your dataset
        indices = range(n)
        indices = shuffle(indices)
        # NOTE(review): train and test both take the full shuffled dataset
        # (n*10//10 == n), so this evaluates on the training set — confirm
        # that is intended.
        train_set = map(lambda x: train_data[x], indices[:n * 10 // 10])
        test_set = map(lambda x: train_data[x], indices[:n * 10 // 10])
        self.grocery.train(train_set)
        test_result = self.grocery.test(test_set)
        print '-- Accuracy after training --'
        print 'Accuracy, A-0:', test_result
        # Collect labels whose recall falls below the pruning threshold.
        low_recall_label = []
        for item in test_result.recall_labels.items():
            if item[1] < prune_threshold:
                low_recall_label.append(item[0])
        new_train_set = [
            item for item in train_set if item[0] not in low_recall_label
        ]
        new_test_set = [
            item for item in train_set if item[0] not in low_recall_label
        ]
        self.grocery.train(new_train_set)
        new_test_result = self.grocery.test(new_test_set)
        print '-- Accuracy after training, with low-recall labels (less than', str(
            prune_threshold * 100), '%) pruned --'
        print 'Accuracy, A-1:', new_test_result
        return self.grocery, new_test_result

    def manualEvaluation(self, *args, **kwargs):
        """Test the archived model on `n_docs` random docs from train_src.

        Returns (test_set, test_result) for manual inspection.
        """
        n_docs = int(kwargs["n_docs"])
        excluded_labels = kwargs["excluded_labels"]
        excluded_docs = kwargs["excluded_docs"]
        train_data = []
        with open(self.kwargs["train_src"], 'rb') as f:
            for line in f:
                try:
                    line.split('\t', 1)[1]
                except:
                    continue
                else:
                    train_data.append(
                        (line.split('\t', 1)[0],
                         line.split('\t', 1)[1].split('\n', 1)[0]))
            f.close()  # redundant: `with` already closes the file
        train_data = [
            item for item in train_data if item[0] not in excluded_labels
        ]
        train_data = [i for i in train_data if i[1] not in excluded_docs]
        n = len(train_data)  #number of rows in your dataset
        indices = range(n)
        indices = shuffle(indices)
        test_set = map(lambda x: train_data[x], indices[0:n_docs])
        g = self.loadTrainModel()
        test_result = g.test(test_set)
        return test_set, test_result

    def saveTrainModel(self, *args, **kwargs):
        """Save the model, then archive the .svm file under MODEL_DIR."""
        self.grocery.save()
        os.rename(
            self.PREFIX + self.grocery_name + '_train.svm',
            self.PREFIX + self.MODEL_DIR + self.grocery_name + '_train.svm')
        return

    def loadTrainModel(self, *args, **kwargs):
        """Move the archived model in, load it, and move it back.

        NOTE(review): not safe under concurrent callers — the file is
        temporarily absent from MODEL_DIR while loading.
        """
        os.rename(
            self.PREFIX + self.MODEL_DIR + self.grocery_name + '_train.svm',
            self.PREFIX + self.grocery_name + '_train.svm')
        self.grocery.load()
        os.rename(
            self.PREFIX + self.grocery_name + '_train.svm',
            self.PREFIX + self.MODEL_DIR + self.grocery_name + '_train.svm')
        return self.grocery

    def predict(self, line, **kwargs):
        """Return the model's prediction for one line of text."""
        tag = self.grocery.predict(line)
        return tag

    def test(self, *args, **kwargs):
        """Evaluate on `test_src` (file path) and print total accuracy."""
        test_src = str(kwargs["test_src"])
        test_result = self.grocery.test(test_src)
        print "Total Accuracy", test_result
        return test_result
ftest = open(path2, 'w') for line in open(path): if random.random() < theta: ftest.write(line) else: ftrain.write(line) ftrain.close() ftest.close() def train(path,name): grocery = Grocery(name) grocery.train(path) grocery.save() if __name__ == "__main__": data2tt(sys.argv[3], sys.argv[1], sys.argv[2], 0.02) train(sys.argv[1], "music") new_grocey = Grocery("music") new_grocey.load() n = 0 for line in open(sys.argv[2],"r"): ls = line.strip().split("\t") predict = new_grocey.predict(ls[1]) test = ls[0] result = 0 if test == str(predict): result = 1 n += result print predict,test,result print n
# test ################################## #grocery=Grocery('sample') grocery = Grocery('version1.0') grocery.load() print 'start test' filetest = codecs.open(testFileName, 'r', 'utf-8') test_reader = filetest.readlines() fileOutput = codecs.open(outputFileName, 'w', 'utf-8') i = 0 for line in test_reader: content = pp.getcontent(test_reader, i) i = i + 1 #if(i>10): #break if (i % 5000 == 0): print("%d " % (i)) + '#' * 30 if (content == ''): print "test.py#" * 3 + line else: str = content.split('\t') len = str[0].__len__() result = grocery.predict(content[len + 1:]) fileOutput.write(str[0] + ',' + result + '\n') filetest.close() fileOutput.close()
('english', u'新托福考试官方指南'), ('extra', u'牛奶可乐经济学'), ('course', u'自动控制理论与设计'), ('course', u'电力电子技术'), ('course', u'数字图像处理'), ('course', u'自动控制原理习题精解与考研指导'), ('course', u'现代检测技术'), ('extra', u'忒修斯之船'), ('professional', u'Arduino程序设计基础'), ('professional', u'机器学习导论'), ('professional', u'TensorFlow实战'), ('professional', u'Effective Modern C++'), ('extra', u'重新发现社会'), ('extra', u'Letter from an Unknown Woman') ] # create a model named 'book_class' grocery = Grocery('book_class') grocery.train(train_src) grocery.save() # load the model 'book_class' new_grocery = Grocery('book_class') new_grocery.load() # make predictions str = raw_input('bookname: ') while str.strip(): print('category: ', new_grocery.predict(str)) str = raw_input('bookname: ')
class JdParser(object):
    """Rule-based job-description (JD) parser.

    Extracts structured fields (sex, age, major, degree, experience, job
    tags, workplace, certificates, skills, duty, demand, job name, pay,
    benefits) from free-text Chinese job postings into `self.result`, using
    regex rules, word lexicons, jieba segmentation and a tgrocery sentence
    classifier ("jdclf": demand / duty / other).

    NOTE(review): this source arrived with its original line structure
    collapsed; the indentation below is a best-effort reconstruction.
    """

    def __init__(self):
        # Lexicons, one entry per line.
        self.degreedic = set(line.strip() for line in codecs.open(
            './data/degrees.txt', 'rb', 'utf-8'))       # degrees
        self.majordic = set(line.strip() for line in codecs.open(
            './data/majordic.txt', 'rb', 'utf-8'))      # majors
        self.citydic = set(line.strip() for line in codecs.open(
            "./data/citydic.txt", 'rb', 'utf-8'))       # cities
        self.firmnames = set(line.strip() for line in codecs.open(
            './data/firm.txt', 'rb', 'utf-8'))          # company short names
        self.jobdic = set(line.strip() for line in codecs.open(
            './data/jobposition.txt', 'rb', 'utf-8'))   # job titles
        self.skills = set(
            line.strip()
            for line in codecs.open('./data/skills.txt', 'rb', 'utf-8'))
        # self.wordlisttf = pickle.load(open('./data/wordlist.pkl'))  # top-2000 words
        # self.w2vdict = json.load(open('./data/word2vec_50.json'))   # their word2vec
        # Sentence classifier: labels lines as demand / duty / other.
        self.clf = Grocery("jdclf")
        self.clf.load()

        # Field-extraction regexes (Unicode patterns, Python 2).
        self.SEX = re.compile(u"性别不限|性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.DEGREE = re.compile(
            u"(全日制)?(初中|高中|中专|大专|专科|大学专科|中职|本科|大学本科|硕士|研究生|博士|博士后)(.?以上)?")
        self.MAJOR = re.compile(u"\S+(相关专业|专业优先|及其.专业|[类等]专业[优先]?)")
        self.EXP = re.compile(u"工作经验|工作年限|工作经历|项目经[历验]|\d年经[历验]|.{1,2}年相关工作经验")
        self.PUB_TIME = re.compile(u"(\d+)(天前发布)")
        self.INCNAME = re.compile(
            u"\S+(有限公司|酒店|银行|集团|厂|研究中心|研究所|学校|旅行社|中心/s|分?公司|研发中心|技术部|事.部|招聘|商务平台)"
        )
        self.INCTAG = re.compile(
            u"大公司|五百强|全球500强|小公司|成长型公司|创业公司|私有经济|集体经济|集团|外企|已上市|稳定性高|平均年龄\d+岁|妹纸多|学历高|福利待遇好|晋升机会大|民营公司|民营企业|互联网|创业型|国企|央企"
        )
        self.JOBNAME = re.compile(
            u'\S*(研发工程师|工程师|经理|助理|顾问|前台|秘书|主管|研究员|实习生|操作员|专员|教学人员|技术人员|管理员|业务员|公关|程序员|教师|老师|培训生|\
            文员|研究员|策划|主任|总监|设计师|分析师|架构师|摄影师|编辑|BD|游戏UI|Android(开发)?|PHP(开发)?|Python(开发)?|.?(急招|急聘|初级|中级|高级|方向).?[\s)】\)])|\
            |行政人事|网店设计|客服|会计|电话销售|外贸跟单|web前端|游戏UI|后.开发|产品运营|商业数据分析'
        )
        self.START_DEMAND = re.compile(
            u"(岗位要求|应聘条件|任职要求|岗位资格|任职资格|岗位条件|工作要求|任职条件|人员条件|职位.求|职位条件|职位描述|岗位资格|职位资格|具备条件)[::\s]\
            |如果你.{0,10}[::\s]|我们希望你.{0,12}[::\s]|(要求|条件)[::\s]|你需要?具备什么.+[?\?::\s]|任职资格[::\s]"
        )
        self.DEMAND = re.compile(
            u"熟悉|熟练|具有|善于|懂得|掌握|具备|能够|优先|不少于|不超过|至少|团队.作|良好的|工作经验|开发经验|实习经历|能力强|富有|学历|经验|喜欢|较强的.{2,8}能力|相关专业|相关学历|者优先|精通|了解|及以上|技术全面|.强的责任心|[能有]独立|英文流利"
        )
        self.DUTY = re.compile(
            u"跟进|协助|负责|配合|其他工作|领导交办的|对.+提供|审核|参与|为.+提出|日常.+工作|指导|对.+进行|为.+提供|跟进|拓展|运营|用户|客户|协调|拟写|通过|协同|完成|沟通|需求|秘书.{2,5}翻译"
        )
        self.START_DUTY = re.compile(
            u"(岗位职责|岗位描述|职位描述|职责描述|任职描述|职位职责|工作职责|工作职能|职位职能|工作内容|实习内容|职位内容)[::\s]|做这样的事[::\s]|职责.{0,5}[::\s]"
        )
        self.PAY = re.compile(u"薪酬|待遇|月薪|薪资|年薪|底薪|\d+k|\d+万|\d+元|工资|报酬|薪水|福利")
        self.BENEFIT = re.compile(
            u"周休|补助|补贴|假日|餐补|提成|交通补助|食宿|加班工资|期权|年假|领导|扁平化|管理|氛围|空间|休假|月假|带薪|全休|晋升|培训|舒适的|旅游|奖励|过节费|五险一金|奖金|\
            |弹性工作|氛围|成长空间|实训|培训|高薪|前景|旅游|活动")
        self.SPLIT_JD = re.compile(
            u"岗位[【(]?[一二三四五六七八九][】)][::\s]|(^招聘岗位\S+|岗位\d|岗位[一二三四五六])[::\s]")
        self.CLEAR_NUM = re.compile(u"^\d[\.: :。、]|^[\((【]?\d[\))】\.]")
        self.CLEAR_COLO = re.compile(u"^[\s\.。)(【】,,]|[。;,\.;,]$|^\d[\.]")
        self.SKILL = re.compile(
            u"精通|了解|熟练|熟悉|掌握|懂得|优先|具备|具有|者优先|擅长|善于|较强的.{2,6}能力|良好的|有.+经验|能力|极强的"
        )

        # Extend jieba's dictionary with the domain lexicons.
        jieba.load_userdict('./data/majordic.txt')
        jieba.load_userdict('./data/skills.txt')
        jieba.load_userdict('./data/firm.txt')
        jieba.load_userdict('./data/degrees.txt')
        jieba.load_userdict('./data/benefits.txt')

        self.jdStr = ""                    # full normalised JD text
        self.linelist = []                 # JD split into stripped lines
        self.lineindex = defaultdict(int)  # 1 when a line was consumed by an extractor
        self.result = OrderedDict()        # extracted fields

    # Split into lines / normalise the raw JD text.
    def preprocess(self, jdstr):
        self.result.clear()
        # Strip decorative bullets/brackets (●◆★ etc.).
        jdstr = re.sub(u"[【】◆ \u25cf\u25c6\u2605]", "", jdstr.decode('utf-8'))
        self.linelist = [
            line.strip() for line in jdstr.split('\n') if len(line) > 1
        ]
        self.jdStr = '\n'.join(self.linelist)
        for line in self.linelist:
            # print self.clf.predict(line),'\t',line
            self.lineindex[re.sub(u"[\s ]+", " ", line)] = 0

    def line2vec(self, line):
        # Average word2vec over the line's tokens (w2vdict load is commented
        # out in __init__, so this is only usable when that is re-enabled).
        vec = np.zeros(50)
        cnt = 1
        for word in jieba.cut(line):
            if word in self.w2vdict:
                vec += self.w2vdict[word]
                cnt += 1
        vec = vec / cnt
        return vec

    # Extract the gender requirement.
    def regular_sex(self):
        res = set()
        for line in self.linelist:
            if self.clf.predict(line) == 'demand' or self.DEMAND.search(line):
                findsex = self.SEX.search(line)
                if findsex:
                    getsex = re.search(u"性别不限|男|女",
                                       line.replace(u"男女不限", u"性别不限"))
                    if getsex:
                        res.add(getsex.group())
                        break
        if res:
            self.result['sex'] = ' / '.join(res)
        else:
            self.result['sex'] = u'性别不限'  # default: no gender restriction

    # Extract the age requirement.
    def regular_age(self):
        res = ''
        for line in self.linelist:
            # Skip generation tags like "90后".
            if re.search(u'\d{2}后', line):
                continue
            findage = self.AGE.search(line)
            if findage:
                age = re.findall(u'\d{2}', line)
                if len(age) >= 2:
                    res = '-'.join(age)  # age range
                elif len(age) == 1:
                    if re.search(u'以上|不低于', line):
                        res = age[0] + u'以上'
                    if re.search(u"不超过|不高于|以下", line):
                        res = age[0] + '以下'
                    if re.search(u"左右|大约|大概", line):
                        res = age[0] + '左右'
                break
        if len(res) < 2:
            res = u'年龄不限'  # default: no age restriction
        self.result['age'] = res
        return res

    # Extract the major (field-of-study) requirement.
    def regular_major(self):
        res = []
        # Pass 1: an explicit "专业要求:" (major requirement) header.
        for line in self.linelist:
            findmajor = re.search(u"专业要求[::\s]", line)
            if findmajor:
                print 'major demand', line
                items = self.clean_line(line[findmajor.span()[1]:]).split()
                items = filter(
                    lambda x: x not in self.degreedic and not re.search(
                        u"薪酬|经验|元|\d+|月", x), items)
                res.append(' / '.join(items))
                break
        # Pass 2: "any major" phrasing, else the MAJOR pattern per line.
        if not res:
            for line in self.linelist:
                if re.search(u"专业.限|.限专业", line) and not re.search(u"专业优先", line):
                    res.append(u"专业不限")
                    print 'major demand', line
                    break
                else:
                    findmajor = self.MAJOR.search(line)
                    if findmajor:
                        majoritem = re.split(u'[\s,,;; ]', findmajor.group())
                        for item in majoritem:
                            if re.search(
                                    u'学历|年龄|岁|学校|公司|性格|具有|具备|能够|经验|有|毕业|性别|男|女',
                                    item):
                                continue
                            print 'major item', item
                            if self.BENEFIT.search(line):
                                continue
                            print 'major item', item
                            if re.search(u"专业", item) and len(item) < 3:
                                continue
                            res.append(self.clean_line(item))
                        break
                # Pass 3 (fallback): lexicon lookup over the segmented line.
                if not res:
                    for majorword in jieba.cut(line):
                        if majorword in self.majordic or majorword[:-2] in self.majordic:
                            res.append(majorword)
        if re.search(u"[等及类]?相关专业", self.jdStr) and len(res) == 1:
            res[0] += u"等相关专业"
        if not res:
            res.append(u"专业不限")  # default: any major
        self.result['major'] = res

    # Extract the degree requirement.
    def regular_degree(self):
        """Extract degree info: look for headers first, then fall back to
        segmentation plus lexicon matching."""
        degree = [
            u'小学', u'初中', u'中专', u'中技', u'高中', u'专科', u'大专', u'本科', u'硕士',
            u'博士', u'博士后'
        ]
        res = set()
        for line in self.linelist:
            finddegree = re.search(u"学历要求[::\s]", line)
            if finddegree:
                items = self.clean_line(line[finddegree.span()[1]:]).split()
                items = filter(lambda x: not re.search(u"薪酬|经验|元|月|年|\d+", x),
                               items)
                res.add(' / '.join(items))
                break
        if not res:
            for line in self.linelist:
                if re.search(u"学历不限|学历要求不限|不限学历", line):
                    res.add(u"学历不限")
                    break
                else:
                    finddegree = self.DEGREE.search(line)
                    if finddegree:
                        res.add(finddegree.group())
                        break
        # No explicit requirement found: segment the whole JD and match the
        # degree lexicon.
        if len(res) == 0:
            for word in jieba.cut(self.jdStr):
                if word in self.degreedic:
                    res.add(word)
        res = list(res)
        # "X 及以上" → expand to every degree at or above X.
        if len(res) == 1 and re.search(u'[及或]?以上', res[0]):
            tmp = res[0][:2]
            if tmp == u'全日':
                tmp = u'本科'
            elif tmp == u'研究':
                tmp = u'硕士'
            if tmp in degree:
                idx = degree.index(tmp)
                res = degree[idx:]
        self.result['degree'] = ' / '.join(res)

    # Extract the required years of work experience.
    def regular_exp(self):
        cnyear = u'[半一二三四五六七八九十两]年|\d-\d{1,2}年|\d年及?以上|不少于\d年|\d年'
        res = set()
        jdStr = self.jdStr
        findexp = re.search(u'经验不限|(经验)?\d{1,2}年及以上|经验\d-\d{1,2}年', jdStr)
        if findexp:
            res = findexp.group()  # NOTE: res is rebound to a string here
            self.result['exp'] = res.replace(u"经验", "")
            return res
        findexp = self.EXP.search(jdStr)
        if findexp:
            # Search a window around the experience keyword.
            pos = findexp.span()[1]
            jdStr = jdStr[max(0, pos - 25):min(pos + 15, len(jdStr))]
            exp = re.search(cnyear, jdStr)
            if exp:
                res.add(exp.group())
        if not res:
            # Fall back to the job-tag lines.
            exp = re.search(
                u"(\d-)?\d{1,2}年(工作|开发|项目)?经[历验]|(不少于)?([半\d]年)及?(以上)?经[历验]|经[历验]\s?(\d-)?\d{1,2}年",
                ' '.join(self.regular_jobtag()))
            if exp:
                res.add(exp.group())
            else:
                exp = re.search(cnyear, ' '.join(self.regular_jobtag()))
                if exp:
                    res.add(exp.group())
        self.result["exp"] = "-".join(res)
        self.result["exp"] = self.result['exp'].replace(u'经验', "").replace(
            u"经历", "")
        return res

    def regular_jobtag(self):
        """Extract job tag lines (employment type, headcount, labels)."""
        res = []
        job_tag = re.search(u"应届生|全职|兼职|实习生|应届毕业生|社招|急招|急聘", self.jdStr)
        if job_tag:
            res.append(job_tag.group())
        # Headcount after a "招聘人数" (hiring count) header.
        job_tag = re.search(u"招聘人数[::]?|招聘[::\s]|人数[::\s]", self.jdStr)
        if job_tag:
            jdstr = self.jdStr[job_tag.span()[1]:]
            for line in jdstr.split():
                if len(line.strip()) < 1:
                    continue
                else:
                    num = re.search(u"(\d+\-)?\d+人?|若干|\d+位", line)
                    if num:
                        res.append(u"招聘人数:" + num.group())
                    break
        # Explicit "职能类别/职位标签" (category/label) header.
        job_tag = re.search(u"(职能类别|职位标签)[:: ]?", self.jdStr)
        if job_tag:
            jdstr = self.jdStr[job_tag.span()[1]:]
            for line in jdstr.split('\n'):
                if len(line.strip()) < 3:
                    continue
                else:
                    res.append("职业标签:" + line.strip())
                    break
                    # NOTE(review): the following length guard is unreachable
                    # in this reconstruction — it sat after a break upstream.
                if len(line) > 25:
                    break
        # Per product requirements: also keep short clauses mentioning
        # experience, finely re-split on punctuation.
        linelist = [
            line for line in re.split(u"[,。;\s]", self.jdStr)
            if 5 < len(line) < 15
        ]
        for line in linelist:
            if re.search(u"经验", line) and not re.search(u"月薪|地点|日期", line):
                if re.search(u"\d+k|[。?)\)\]]", line):
                    continue
                res.append(self.clean_line(line))
                break
        self.result["job_tag"] = res
        return res

    # Strip leading enumeration digits and stray punctuation from a line.
    def clean_line(self, line):
        line = self.CLEAR_NUM.sub("", line.strip())
        line = self.CLEAR_COLO.sub("", line)
        return line

    # Extract the workplace / city.
    def regular_workplace(self):
        res = set()
        jdstr = self.jdStr
        pos = list(re.finditer(u"(工作地.|上班地.|实习地.|地址|地点)[::\s]", jdstr))
        if pos:
            jdstr = jdstr[pos[0].span()[1]:]
            for line in jdstr.split():
                if len(line.strip()) < 2:
                    continue
                if len(line) < 26:
                    res.add(line.strip().replace(":", "").replace(":", ""))
                else:
                    # Long line: fall back to city-lexicon matching.
                    for city in jieba.cut(line):
                        if city in self.citydic and city[:-1] not in res:
                            res.add(city)
                break
        if not res:
            # No address header found: scan the whole JD for a city name.
            for city in jieba.cut(jdstr):
                if city in self.citydic and city[:-1] not in res and u"国" not in city:
                    res.add(city)
                    break
        self.result["workplace"] = " / ".join(res)
        return res

    # Extract certificates / language-level requirements.
    def regular_cert(self):
        res = set()
        linelist = [
            line for line in re.split(u"[\s ,。;,]", self.jdStr) if len(line) > 3
        ]
        for line in linelist:
            findcert = re.search(
                u"(\S+证书|CET-\d|普通话|英语|口语|.语|日文|雅思|托福|托业)(至少)?(通过)?[\d一二三四五六七八九]级[及或]?(以上)?|(英语)?CET-\d级?(以上)?|职业资格|律师证|会计证",
                line)
            if findcert:
                res.add(findcert.group())
            else:
                findcert = re.search(u"有(.+证)书?", line)
                if findcert:
                    res.add(findcert.group(1))
                else:
                    findcert = re.search(u"有.+资格", line)
                    if findcert:
                        res.add(findcert.group())
        self.result['cert'] = re.sub(u"[或及以上]", "", ' / '.join(res))
        if self.result['cert']:
            self.result['cert'] = self.result['cert'].split(' / ')
        else:
            self.result['cert'] = []

    # Extract up to `num` skills using the skills lexicon.
    def regular_skill(self, num=6):
        res = []
        for line in self.linelist:
            if self.DEMAND.search(line) or self.clf.predict(line) == 'demand':
                for word in jieba.cut(line):
                    word = strQ2B(word).lower()
                    if word in self.skills:
                        res.append(word)
        # Rank by frequency, then force skills found in the job name to the
        # front; keep the top 3 fixed and shuffle the rest.
        sorted_words = [w[0] for w in Counter(res).most_common(2 * num)]
        for word in jieba.cut(self.result['job_name']):
            word = strQ2B(word).lower()
            if word in self.skills and word not in sorted_words:
                sorted_words.insert(0, word)
        after_top3 = sorted_words[3:]
        np.random.shuffle(after_top3)
        self.result['skill'] = sorted_words[:3] + after_top3[:num - 3]

    # Extract the duty (job responsibility) section.
    def regular_duty(self):
        res = []
        jdStr = self.jdStr
        pos = list(self.START_DUTY.finditer(jdStr))
        if len(pos) > 0:
            # Take the lines after the last duty header.
            linelist = [
                re.sub("[\s ]+", " ", line)
                for line in jdStr[pos[-1].span()[1]:].split("\n")
                if len(line) > 2
            ]
            for i in xrange(len(linelist)):
                line = linelist[i]
                if self.START_DUTY.search(
                        line) or self.lineindex[line] == 1 or (
                            re.search(u".年来|谢谢|请在|公司介绍|举报|收藏|岗位职责", line)
                            and not re.search(u"了解", line)):
                    continue
                if re.search(u"要求[::\s]?|岗位要求", line) and len(line) < 6:
                    break
                if re.match(u"\d{1,2}|\u25cf|[\uff0d(\(\-\+]|[a-z][\.、\s]",
                            line.strip()) or self.DUTY.search(
                                line) or self.clf.predict(line) == 'duty':
                    res.append(line.strip())
                elif i < len(linelist) - 1 and self.clf.predict(
                        linelist[i + 1]) == 'duty':
                    res.append(line)
                else:
                    break
        if not res:
            # No header: classify line by line.
            for line in self.linelist:
                if re.search(u"粉丝团", line) and len(line) < 12:
                    continue
                if self.DUTY.search(line) and self.clf.predict(line) == "duty":
                    if self.lineindex[line] != 1:
                        res.append(line)
        self.result["duty"] = "\n".join(res)
        # Mark consumed lines so other extractors skip them.
        for line in res:
            self.lineindex[line] = 1
        return res

    # Extract the demand (job requirement) section.
    def regular_demand(self):
        res = []
        jdStr = self.jdStr
        pos = list(self.START_DEMAND.finditer(jdStr))
        if len(pos) > 0:
            tmppos = pos[-1].span()[0]
            # Discard a header that is really part of a sentence about
            # certificates ("具有/具备…证书").
            if re.search(u"具有|具备", jdStr[tmppos - 5:tmppos + 5]) or re.search(
                    u"证书|证", jdStr[tmppos:tmppos + 8]):
                pos.pop()
            if pos:
                linelist = [
                    re.sub("[\s ]+", " ", line)
                    for line in jdStr[pos[-1].span()[1]:].split("\n")
                    if len(line) > 2
                ]
            else:
                linelist = []
            for i in xrange(len(linelist)):
                line = linelist[i]
                if self.START_DEMAND.search(linelist[i]) or re.search(
                        u"谢谢|请在|公司介绍|举报|收藏|\d+k?元|加分", line):
                    continue
                if re.match(u"\d{1,2}|\u25cf|[\uff0d(\(\-\+]|[a-z][\.、\s]",
                            line) or self.DEMAND.search(
                                line) or self.clf.predict(line) == 'demand':
                    res.append(line)
                elif i < len(linelist) - 1 and self.clf.predict(
                        linelist[i + 1]) == 'demand':
                    res.append(line)
                else:
                    break
        if not res:
            for line in self.linelist:
                # Skip lines already consumed by another extractor.
                if self.lineindex[line] == 1 or len(line.split()) > 6:
                    continue
                if self.clf.predict(line) == 'demand' or self.DEMAND.search(
                        line):
                    res.append(line.strip())
        self.result['demand'] = '\n'.join(res)
        for line in res:
            self.lineindex[line] = 1
        return res

    # Extract the advertised job title.
    def regular_jobname(self):
        res = set()
        jdStr = self.jdStr
        findpos = re.search(u"(招聘岗位|招聘职位|职位名称|岗位名称|岗位[一二三四五六七八九])[:、:\s ]",
                            jdStr)
        # if not findpos:
        #     findpos = re.search(u"(职位类别|职位职能)[::\s ]",jdStr)
        if findpos:
            pos = findpos.span()[1]
            linelist = jdStr[pos:].split("\n")
            for line in linelist:
                if len(line) < 2:
                    continue
                if len(line) >= 2 and len(line) < 20:
                    if re.search(u"职位描述|查看|地址|工作|分享|举报|下一条|时间|福利|待遇|周末|双休",
                                 line):
                        continue
                    res.add(re.sub(u"聘请|高薪诚聘|诚聘|[,。、\d!]+", "", line.strip()))
                    break
        # No explicit header: try the title pattern / classifier per line.
        if not res:
            for line in self.linelist:
                if re.search(u"招聘|高薪|诚聘", line):
                    continue
                if len(line) < 6 and not re.search(
                        u'岗位|岗位内容|工作内容|职责|任职|资格',
                        line) and self.clf.predict(line) == 'job_name':
                    res.add(line)
                    break
                findPos = self.JOBNAME.search(line)
                if findPos and len(findPos.group()) < 20 and not re.match(
                        u'\d', findPos.group()):
                    jobname = findPos.group()
                    res.add(re.sub(u"聘请|高薪诚聘|诚聘|急.|[,。、!]+", "", jobname))
                    break
                # res.add(re.sub(u"\(.+\)|(.+)|【.+】|[,。、\s\d]+|聘请|高薪诚聘|诚聘|急招|","",line.strip()))
        # Last resort: match segmented words against the job-title lexicon.
        if not res:
            for line in self.linelist:
                for word in jieba.cut(line.lower()):
                    if word in self.jobdic:
                        res.add(word)
            self.result["job_name"] = " / ".join(res)
            return res
            # NOTE(review): the internship/part-time fallback below is
            # unreachable in this reconstruction.
        if not res:
            tag = re.search(u"实习生|兼职", self.jdStr)
            if tag:
                res.add(tag.group())
        self.result["job_name"] = strQ2B(" / ".join(res)).lower()
        return res

    # Extract the pay / salary.
    def regular_pay(self):
        pay = ""
        # Lagou-style postings carry a bare salary range with no keyword.
        lagoup = re.search(
            u"(\d+[kK][-——]\d+[kK])|(\d{3,5}-\d{3,5}元?/[月日天])|(\d{3,5}-\d{3,5}元)|((\d+[-~]\d+)万.[年月])|底薪\d+(-\d+)?元?|\d{3,5}元(左右|以上)?|年薪\d+万?元(左右|以上)?",
            self.jdStr)
        if lagoup:
            pay = lagoup.group()
            self.result["pay"] = pay
            self.result["pay"] = pay.replace(u'k', '000').replace(u'K', '000')
            return pay
        findpay = self.PAY.search(self.jdStr)
        if findpay:
            # Search a small window around the pay keyword.
            pos = findpay.span()[1]
            jdstr = self.jdStr[max(0, pos - 5):min(pos + 10, len(self.jdStr))]
            if re.search(u"面议", jdstr):
                pay = u"面议"  # "negotiable"
            else:
                findpay = re.findall(u"\d{3,7}", jdstr)
                pay = "-".join(findpay)
        self.result["pay"] = pay.replace(u'k', '000').replace(u'K', '000')
        return pay

    # Extract the benefits section.
    def regular_benefits(self):
        res = []
        jdStr = self.jdStr
        findpos = list(re.finditer(u"薪酬福利[::\s]|(福利|待遇)\s?[::]", jdStr))
        if not findpos:
            findpos = list(
                re.finditer(u"(晋升制度|工作环境|职位诱惑|你会获得什么)\s?[?\?::]", jdStr))
        if findpos:
            # Take consecutive benefit-looking lines after the last header.
            pos = findpos[-1].span()[1]
            linelist = jdStr[pos:].split('\n')
            for line in linelist:
                print 'benefits', line
                if len(line.strip()) < 3:
                    continue
                if re.match(ur"[((]?\d+", line) or self.BENEFIT.search(line):
                    res.append(line.strip())
                    self.lineindex[line.strip()] = 1
                else:
                    break
        if not res:
            # No header: keyword-scan every whitespace-separated chunk.
            for line in jdStr.split():
                if len(line) > 1 and re.search(
                        u"带薪|双休|股票期权|五险一金|发展空间|福利|诱惑|休假|薪酬|补助|年假|弹性工作", line):
                    if re.search(u"福利|待遇|诱惑", line) and len(line.strip()) < 6:
                        continue
                    res.append(line.strip())
        # A single compact "五险一金…" token: re-segment it for readability.
        if len(res) == 1 and re.search(
                u"险一金", res[0]) and not re.search(u"[,、]", res[0]):
            res[0] = self.clean_line(' '.join(jieba.cut(res[0])))
        self.result["benefits"] = "\n".join(res)
        return res
for i in acc: print i, acc[i] file = open('valid-sent.txt') result = open('result.txt', 'w+') DICT_res_stat = dict() mapping_dict = {'1': 'province', '2': 'city', '3': 'address', '4': 'town', '5': 'name', '6': 'shouji', '7': 'dianhua', '8': 'number', '9': 'leibie'} total_corr = 0 total_count = 0 with open('result.txt', 'w') as o: for line in file: line = line.strip() line_label = line.split(',')[0] text = line.split(',')[1] c = new_grocery.predict(text) #d是对text的每个类别预测的权重,对d进行排序 d = c.dec_values s = sorted(d.items(),key = lambda x:x[1],reverse = True) #若排序后第二的label是leibie,且值与第一相差不超过cha的话,就讲label改为leibie if s[1][0] == '9' and s[0][1] - s[1][1] < cha: c = '9' #print mapping_dict[str(c)], text print >> o, '%s, %s, %s' % (c, mapping_dict[str(c)], line) if DICT_res_stat.has_key(line_label) == False: DICT_res_stat[line_label] = {'error' : [], 'correct': []} if str(c) == line_label: DICT_res_stat[line_label]['correct'].append(line) total_corr += 1 else: DICT_res_stat[line_label]['error'].append(line)
# -*- coding: utf-8 -*-
# tgrocery test script
# Author: Alex
# Created Time: 2017-06-02 (Fri) 11:15:12
from tgrocery import Grocery

SEPARATOR = "*" * 40

corpus = [
    ('education', '名师指导托福语法技巧:名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与'),
]

# Train and persist the sample model.
grocery = Grocery('sample')
grocery.train(corpus)
grocery.save()
print(SEPARATOR)

# Reload into a fresh instance and evaluate on the training pairs.
new_grocery = Grocery('sample')
new_grocery.load()
res = new_grocery.test(corpus)
print(type(res))
print(res)
print(res.accuracy_labels)
print(res.show_result())
print(SEPARATOR)

# Single prediction with per-label decision values.
res = new_grocery.predict("考生必读:新托福写作考试评分标准")
print(res)
print(res.dec_values)
from tgrocery import Grocery data_dir = "../data/" src_fn = data_dir + 'train_set_100.txt' grocery = Grocery('backout_reason') grocery.train(src_fn) tp_cnt = {} f = open(data_dir + 'type.txt') for line in f: tps = line.split() tp_cnt[tps[1]] = 0 f.close() f = open(data_dir + 'bcmtmoz.merge') for line in f: tp = grocery.predict(line) tp_cnt[tp] += 1 print tp_cnt
tdic['id'].append(_id) tdic['type'].append(_type) tdic['contents'].append(contents) i +=1 #train = pd.read_csv( train_file, header = 0, delimiter = "\t", quoting = 3 ) #test = pd.read_csv( test_file, header = 1, delimiter = "\t", quoting = 3 ) train = DataFrame(dic) test = DataFrame(tdic) # #classfynews_instance 是模型保存路径 grocery = Grocery('classfynews_instance') train_in = [train['contents'],train['type']] grocery.train(train_in) print grocery.get_load_status() #grocery.save() copy_grocery = Grocery('classfynews_instance') copy_grocery.load() #copy_grocery = grocery test_in = [test['contents'],test['type']] #输入类似 ['我是中国人','台北*****'] #输出 [11,12] test_result = copy_grocery.predict(test['contents']) print test_result.predicted_y #test_result = copy_grocery.test(test_in) #print test_result.show_result()
# -*- coding: utf-8 -*-
# tgrocery test script
# Author: Alex
# Created Time: 2017-06-02 (Fri) 11:15:12
from tgrocery import Grocery

BANNER = "*" * 40

# Train from a ';'-delimited file and persist the model.
trainer = Grocery('sample')
trainer.train('train_data.txt', delimiter=';')
trainer.save()
print(BANNER)

# Reload into a fresh instance, evaluate on the same file, then run one
# prediction.
new_grocery = Grocery('sample')
new_grocery.load()
print(new_grocery.test('train_data.txt', delimiter=';'))
print(BANNER)
print(new_grocery.predict("考生必读:新托福写作考试评分标准"))
# In-memory training set: (label, text) pairs.
train_src = [
    ('education', '名师指导托福语法技巧:名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')
]
grocery.train(train_src)
# Training can also read from a file (tab-separated by default; the
# delimiter is configurable).
#grocery.train('train_ch.txt')
# Persist the model.
grocery.save()
# Load the model back (the name must match the one it was saved under).
new_grocery = Grocery('sample')
new_grocery.load()
# Predict a single text.
new_grocery.predict('考生必读:新托福写作考试评分标准')
#education
# Evaluate on labelled pairs; yields the test accuracy.
test_src = [
    ('education', '福建春季公务员考试报名18日截止 2月6日考试'),
    ('sports', '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'),
]
new_grocery.test(test_src)
#0.5
# A file path is accepted here as well.
#new_grocery.test('test_ch.txt')
# A custom tokenizer (any callable) can be plugged in.
#custom_grocery = Grocery('custom', custom_tokenize=list)
#test file #print "test's file len",len(test_file) # test_src = test_file["String"].tolist() # print "test_src",test_src[0] # print "test's len",len(test_src) #tgrocery classify grocery =Grocery('sample') grocery.train(train_src) grocery.save() new_grocery = Grocery('sample') new_grocery.load() #predict print new_grocery.predict("当你坐上飞机在上面俯视它的时候") #load the submit standard file submit = pd.read_csv("sample_submit.csv",names=["id","value"]) print "the submit file len is ",len(submit) t = [] f = open("test.txt") for line in f.readlines(): #print line.split("\t")[1] #content t.append(new_grocery.predict(line.split("\t")[1])) submit["value"] = t print submit submit.to_csv("submit.csv",sep=",") t1 = time.time()