Example #1
class GroceryModel(object):
    def __init__(self):
        self.grocery = Grocery('TextClassify')
    
    def train(self,train_file):
        f = open(train_file,'r')
        line = f.readline().decode('utf8')
        dataset = []
        while line:
            tmp = line.split('\t')
            dataset.append((tmp[0],''.join(tmp[1:])))
            line = f.readline().decode('utf8')
        f.close()
        self.grocery.train(dataset)
        self.grocery.save()
    
    def load_model(self):
        self.grocery.load()
    
    def test(self,test_src):
        self.load_model()
        f = open(test_src,'r')
        line = f.readline().decode('utf8')
        dataset = []
        while line:
            tmp = line.split('\t')
            dataset.append((tmp[0],''.join(tmp[1:])))
            line = f.readline().decode('utf8')
        f.close()
        result = self.grocery.test(dataset)
        print result
    
    def predict(self,text):
        print self.grocery.predict(text)
Example #2
 def test_main(self):
     grocery = Grocery(self.grocery_name)
     grocery.train(self.train_src)
     grocery.save()
     new_grocery = Grocery('test')
     new_grocery.load()
     print(new_grocery.predict('考生必读:新托福写作考试评分标准'))
     assert new_grocery.get_load_status()
     assert new_grocery.predict('考生必读:新托福写作考试评分标准') == 'education'
     # cleanup
     if self.grocery_name and os.path.exists(self.grocery_name):
         shutil.rmtree(self.grocery_name)
Example #3
class AutoGrocery(object):
    """

    """
    def __init__(self, name, train_data):
        self._train_data = train_data
        self._grocery = Grocery(project_dir + '/models/model_data/' + name)

    def train(self):
        self._grocery.train(self._train_data)

    def save(self):
        self._grocery.save()

    def load(self):
        self._grocery.load()

    def predicate(self, src):
        if not self._grocery.get_load_status():
            try:
                self.load()
            except ValueError:
                self.train()
                self.save()
        pr = self._grocery.predict(src)
        label = pr.predicted_y
        return label, pr.dec_values[label]
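A minimal usage sketch for the class above, assuming `project_dir` points at an existing directory and the training data uses the (label, text) pairs seen throughout these examples; `predicate` loads the saved model lazily and falls back to training, so no explicit train() call is needed:

# Hypothetical usage of AutoGrocery; the name and training pairs are made-up examples.
ag = AutoGrocery('news', [('sports', u'图文:法网孟菲尔斯苦战进16强'),
                          ('education', u'名师指导托福语法技巧')])
label, confidence = ag.predicate(u'考生必读:新托福写作考试评分标准')
print label, confidence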
Example #4
def predict_test(model_path, data):
    # load the model
    try:
        model_path = os.path.join(BASE_DIR, 'learn', model_path)
        new_grocery = Grocery(model_path.encode('utf-8'))
        new_grocery.load()
    except Exception as e:
        return {'IsErr': True, 'ErrDesc': u'学习模型加载不成功,请检查路径'}
    # normalize the input data
    result = list()
    sentences = data.split(';')
    if sentences[-1] == '':
        sentences.pop()
    if len(sentences) == 0:
        return {'IsErr': True, 'ErrDesc': u'输入的句子结构有错误或没有数据'}

    # tokenize, then classify
    stop_words = read_lines(os.path.join(BASE_DIR, 'learn', 's_w.txt'))
    for s in sentences:
        tmp_s = ''
        words = jieba.cut(s)
        for word in words:
            if word in stop_words:
                continue
            else:
                tmp_s += word + ' '
        result.append({
            'tag':
            str(new_grocery.predict(tmp_s.strip().encode('utf-8'))),
            'sentence':
            s,
        })
    return {'IsErr': False, 'ErrDesc': u'成功', 'data': result}
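A hedged call sketch for predict_test above (the model path and the semicolon-separated input are made-up examples; BASE_DIR, jieba and read_lines come from the surrounding module):

res = predict_test(u'mail_model', u'第一句话;第二句话;')
if not res['IsErr']:
    for item in res['data']:
        print item['tag'], item['sentence']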
Example #5
def tGrocery():
    outFile = open('testResult.tmp', 'w')
    [trainingSet, benchmark] = pickle.load(open('SampleSeg.pk', 'rb'))  # binary mode for pickle
    testingSet = []
    correctLabel = []
    for i in xrange(len(benchmark)):
        print '%d out of %d' % (i, len(benchmark))
        testingSet.append(benchmark[i][1])
        correctLabel.append(benchmark[i][0]) 
    grocery = Grocery('test')
    grocery.train(trainingSet)
    grocery.save()
    # load
    new_grocery = Grocery('test')
    new_grocery.load()
    Prediction = []
    for i in xrange(len(testingSet)):
        print '%d out of %d' % (i, len(testingSet))
        prediction = new_grocery.predict(testingSet[i])
        Prediction.append(prediction)
        temp = correctLabel[i] + '<-->' + prediction + '  /x01' + testingSet[i] + '\n'
        outFile.write(temp)
    correct = 0
    for i in xrange(len(Prediction)):
        print Prediction[i], correctLabel[i],
        if Prediction[i] == correctLabel[i]:
            correct += 1
            print 'Correct'
        else:
            print 'False'
    print 'Correct Count:', correct
    print 'Accuracy: %f' % (1.0 * correct / len(Prediction))
Example #6
    def __init__(self, keyword):
        print '进行新闻分类'
        (db, cursor) = connectdb()
        cursor.execute("update task set status=1 where keyword=%s", [keyword])
        cursor.execute("select id, title from news where keyword=%s",
                       [keyword])
        news = cursor.fetchall()
        new_grocery = Grocery('static/paris')
        new_grocery.load()

        for item in news:
            tag = new_grocery.predict(item['title'])
            if tag == '新闻背景':
                tag = 1
            elif tag == '事实陈述':
                tag = 2
            elif tag == '事件演化':
                tag = 3
            elif tag == '各方态度':
                tag = 4
            elif tag == '直接关联':
                tag = 6
            elif tag == '暂无关联':
                tag = 7
            cursor.execute("update news set tag=%s where id=%s",
                           [tag, item['id']])
        closedb(db, cursor)
        return
Example #7
def train_compare_result(train_src, test_src):
    grocery = Grocery('test')
    grocery.train(train_src)
    print grocery.get_load_status()
    len_test = len(test_src)
    print len_test
    Predict_num = 0
    History = []
    for test in test_src:
        Predict_result = {
            'predict_title': test[1],
            'predict_class': None,
            'true_class': None
        }
        predict_title = Predict_result['predict_title']
        predict_result = grocery.predict(predict_title)
        Predict_result['predict_class'], Predict_result['true_class'] = predict_result, test[0]
        if str(predict_result) == str(test[0]):
            # print 'prediction is True'
            Predict_num += 1
        History.append(Predict_result)
        # print 'prediction is False'
    predict_precision = float(Predict_num) / len_test
    return predict_precision, History
Example #9
 def test_main(self):
     grocery = Grocery(self.grocery_name)
     grocery.train(self.train_src)
     grocery.save()
     new_grocery = Grocery('test')
     new_grocery.load()
     assert new_grocery.get_load_status()
     result = new_grocery.predict('just a testing')
     print(result)
     result = new_grocery.predict('考生必读:新托福写作考试评分标准')
     print(result)
     print("type of result is:", type(result))
     assert str(new_grocery.predict('考生必读:新托福写作考试评分标准')) == 'education'
     assert str(new_grocery.predict('法网')) == 'sports'
     # cleanup
     if self.grocery_name and os.path.exists(self.grocery_name):
         shutil.rmtree(self.grocery_name)
Example #10
	def labelmaker(self):
		result = []
		grocery = Grocery('11c_20k_20171226')
		grocery.load()
		label_confidence = sorted(grocery.predict(self.shorttext).dec_values.items(), key=lambda x: x[1], reverse=True)[0]
		result.append(label_confidence[0])  # label with the highest confidence
		result.append(label_confidence[1])  # its confidence value
		return result
Example #11
def phgrocery(text):
    # result_text = []
    model_grocery = Grocery('model_redian_5')
    model_grocery.load()

    result = int(model_grocery.predict(text).predicted_y)
    # print result
    # if result == 1:
    #     result_text.append(text)
    return result
Example #12
	def GET(self,name):
		#i = web.input(name=None)	
		#url = "http://"+name
		#html = urllib2.urlopen(url).read()
		#soup = BeautifulSoup(html)
		#title =  soup.html.head.title.contents.pop().encode('utf-8')
		title = name.encode('utf-8')
		new_grocery = Grocery('sample')
		new_grocery.load()
		return new_grocery.predict(title)
Example #14
    def predict_phrasing(self, text=u'曾被年轻人嫌弃,如今却媲美Zara'):
        '''
        Predict the class of `text` with the model named by `self.model_name`.

        :param text: text to classify
        :return: decision value for the u'postive' label (spelling follows the training labels)
        '''
        new_grocery = Grocery(self.model_name)
        new_grocery.load()
        result = new_grocery.predict(text)
        return result.dec_values[u'postive']
Example #15
def demo_flask(image_file):
    grocery = Grocery('NameIdAdd_NLP')
    model_name = grocery.name
    text_converter = None
    tgm = GroceryTextModel(text_converter, model_name)
    tgm.load(model_name)
    grocery.model = tgm

    t = time.time()
    result_dir = './result'
    image = np.array(Image.open(image_file).convert('RGB'))
    result, image_framed = ocr_whole.model(image)
    output_file = os.path.join(result_dir, image_file.split('/')[-1])
    Image.fromarray(image_framed).save(output_file)
    name_total = ''
    id_total = ''
    for key in result:
        string1 = result[key][1]
        if len(string1) <= 8:
            continue
        string2 = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*{}[]+", "", string1)
        no_digit = len(list(filter(str.isdigit, string2.encode('gbk'))))
        no_alpha = len(list(filter(is_alphabet, string2)))
        if len(set('法定代表人') & set(string2)) >= 2 or len(set('经营范围') & set(string2)) >= 2 or '资本' in string2 or '类型' in string2 or len(set('年月日') & set(string2)) >= 2 or len(set('登记机关') & set(string2)) >= 2 or '电话' in string2:
            predict_result = 'others'
        elif len(set('经营场所') & set(string2)) >= 3 or '住所' in string2 or len(set('营业场所') & set(string2)) >= 3:
            predict_result = 'company-address'
        elif len(set('统一社会信用代码') & set(string2)) >= 2 or (float(no_digit + no_alpha) / len(string2) > 0.5 and no_digit > 8):
            predict_result = 'company-id'
        elif '名称' in string2:
            predict_result = 'company-name'
        else:
            predict_result = grocery.predict(string2)
        if str(predict_result) == 'company-name':
            name_total += string1
            break
        elif str(predict_result) == 'company-id':
            id_total += string1
        else:
            continue
    id_total = re.sub(r'\W', '', id_total)
    name_total = stupid_revise(name_total)
    print("Mission complete, it took {:.3f}s".format(time.time() - t))
    print('\nRecognition Result:\n')
    print(id_total)
    print(name_total)
    return output_file, id_total, name_total
Example #16
class MyGrocery(object):
  def __init__(self, name):
    super(MyGrocery, self).__init__()
    self.grocery = Grocery(name)
    self.loaded = False
    self.correct = 1.0

  def train(self, src):
    lines = []
    for line in csv.reader(open(src)):
      label, s = line[0],line[1]
      text = s.decode('utf8')
      lines.append((label, text))
    self.grocery.train(lines)

  def save_model(self):
    self.grocery.save()

  def train_and_save(self, src):
    self.train(src)
    self.save_model()

  def load_model(self):
    if not self.loaded:
      self.grocery.load()
      self.loaded = True

  def predict(self, text):
    self.load_model()
    return self.grocery.predict(text)

  def test(self, src):
    self.load_model()
    total, wrong_num = 0.0, 0.0
    for line in csv.reader(open(src)):
      total += 1
      if line[0] != self.predict(line[1]):
        wrong_num += 1

    print "load test file from " + src
    correct = (total - wrong_num ) / total
    self.correct = correct
    print "total: %d , wrong_num: %d, success percentage: %f" %(total, wrong_num, correct)
    result = dict(type="test", total=total, wrong_num=wrong_num, correct=correct)
    return json.dumps(result)
Example #17
def tgrocery_train(train_data,test_data):
        '''model prediction'''
        print("训练语料总数为: " + str(len(train_data)))
        test_corpus, test_label = test_split(test_data)

        grocery = Grocery('TextGrocery')
        print("start training......")
        grocery.train(train_data)
        grocery.save()
        new_grocery = Grocery('TextGrocery')
        new_grocery.load()

        predict_label = []
        for sample in test_corpus:
                label = new_grocery.predict(sample)

                predict_label.append(str(label))
        # print(predict_label)
        return test_corpus,test_label,predict_label
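A short follow-on sketch scoring the three lists returned above (train_data/test_data are whatever was passed in; this snippet is not part of the original code):

corpus, gold, pred = tgrocery_train(train_data, test_data)
hits = sum(1 for g, p in zip(gold, pred) if str(g) == p)
print("accuracy: %.3f" % (float(hits) / len(gold)))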
Example #18
def predict_corpus(input_file, output_csv):
    import csv
    csvfile = open(output_csv, 'wb')
    writer = csv.writer(csvfile)
    corpus = []
    f = xlrd.open_workbook(input_file)
    table = f.sheet_by_name('Sheet1')
    nrows = table.nrows  # number of rows
    for rownum in range(0, nrows):
        row = table.row_values(rownum)
        corpus.append(row[2].strip())
    corpus_grocery = Grocery(project_name)
    corpus_grocery.load()
    output = []
    for sentence in corpus:
        predict = corpus_grocery.predict(sentence)
        output.append((sentence,predict))
    writer.writerows(output)
    print('Done!')
    csvfile.close()
Example #19
class jdParser(object):
    def __init__(self):
        self.clf = Grocery("./jdclf")
        self.clf.load()
        self.LINE_SPLIT = re.compile(u"[;。;\n]")

    def get_demand_and_duty(self, jdstr):
        linelist = [
            line.strip() for line in self.LINE_SPLIT.split(jdstr)
            if len(line.strip()) > 4
        ]

        result = {}
        demand = []
        duty = []
        for line in linelist:
            pred = str(self.clf.predict(line))
            if pred == "demand":
                demand.append(line)
            elif pred == "duty":
                duty.append(line)

        result['demand'] = '\n'.join(demand)
        result['duty'] = '\n'.join(duty)
        return result
Example #21
from tgrocery import Grocery
# open a new grocery; don't forget to give it a name!
grocery = Grocery('sample')
# training text can be passed in as a list
train_src = [
    #('education', '名师指导托福语法技巧:名词的复数形式'),
    ('tuofu', '名师指导托福语法技巧:名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('tuofu', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')
]
grocery.train(train_src)
# or train from a file instead
#grocery.train('train_ch.txt')
# save the model
grocery.save()
# load the model (the name must match the saved one)
new_grocery = Grocery('sample')
new_grocery.load()
# predict
print "category:", new_grocery.predict('考生必读:新托福写作考试评分标准')
#education
# test
test_src = [
    ('education', '福建春季公务员考试报名18日截止 2月6日考试'),
    ('sports', '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'),
]
new_grocery.test(test_src)
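As later examples rely on it, note that the object returned by test prints as the overall accuracy and exposes per-label recall (used via recall_labels in Example #44); a brief sketch:

test_result = new_grocery.test(test_src)
print test_result                # overall accuracy, e.g. 1.0
print test_result.recall_labels  # per-label recall, as used in Example #44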
Example #22
class JdCRF(object):
    def __init__(self):
        self.data = []
        self.clf = Grocery("jdclf")
        self.clf.load()
        
        self.SEX = re.compile(u"性别不限|性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.DEGREE = re.compile(u"(全日制)?(初中|高中|中专|大专|专科|大学专科|中职|本科|大学本科|硕士|研究生|博士|博士后)(.?以上)?")
        self.MAJOR = re.compile(u"\S+(相关专业|专业优先|及其.专业|[类等]专业[优先]?)")
        self.EXP = re.compile(u"工作经验:|工作经[历验]|工作年限|年.{0,4}经[历验]|经[历验].{1,6}年")
        self.PUB_TIME = re.compile(u"(\d+)(天前发布)")
        
        self.INCNAME = re.compile(u"\S+(有限公司|酒店|银行|集团|研究中心|研究所|学校|旅行社|分?公司|研发中心|技术部|事.部|招聘)") 
        self.NOT_INC = re.compile(u"职位|描述|收藏|推荐|地址|邮箱|主页|介绍|欢迎|加入|要求|简介|险一金|奖金|包吃住|社区|厂房|人员|职责") 
        self.INCTAG = re.compile(u"大公司|五百强|全球500强|小公司|成长型公司|创业公司|私有经济|集体经济|集团|外企|已上市|稳定性高|平均年龄\d岁|妹纸多|学历高|福利待遇好|晋升机会大|民营公司|民营企业\
                                 |互联网|创业型|国企|央企")

        self.JOBNAME = re.compile(u'\S*(研发工程师|工程师|经理|助理|顾问|前台|秘书|主管|研究员|实习生|操作员|专员|教学人员|技术人员|管理员|业务员|公关|程序员|教师|老师|培训生|\
                                  文员|研究员|策划|主任|总监|设计师|分析师|架构师|摄影师|编辑|BD|游戏UI|Android(开发)?|PHP(开发)?|Python(开发)?|.?(急招|急聘|初级|中级|高级|方向).?[\s)】\)])|\
                                  |行政人事|网店设计|客服|会计|电话销售|外贸跟单|web前端|游戏UI|后.开发|产品运营|商业数据分析')

        self.START_DEMAND = re.compile(u"(岗位要求|应聘条件|任职要求|岗位资格|任职资格|岗位条件|工作要求|任职条件|人员条件|职位.求|职位条件|职位描述|岗位资格|职位资格|具备条件)[::\s]\
                                       |如果你.{0,10}[::\s]|我们希望你.{0,12}[::\s]|(要求|条件)[::\s]|你需要?具备什么.+[?\?::\s]|任职资格[::\s]")

        self.DEMAND = re.compile(u"熟悉|熟练|具有|善于|懂得|掌握|具备|能够|优先|不少于|不超过|至少|团队.作|良好的|工作经验|开发经验|实习经历|能力强|富有|以上学历|经验|喜欢|\
                                 较强的.{2,8}能力|相关专业|相关学历|者优先|精通|了解|及以上|技术全面|.强的责任心|[能有]独立|英文流利")

        self.DUTY = re.compile(u"跟进|协助|负责|配合|其他工作|领导交办的|对.+提供|审核|参与|提出|跟踪|报告|为.+提出|日常.+工作|指导|跟进|拓展|运营|用户|客户|协调|拟写|通过|协同\
                               |完成|沟通|需求|秘书.{2,5}翻译")

        self.START_DUTY = re.compile(u"(岗位职责|岗位描述|职位描述|职责描述|任职描述|职位职责|工作职责|工作职能|职位职能|工作内容|实习内容|职位内容)[::\s]|做这样的事[::\s]|职责.{0,5}[::\s]")

        self.PAY = re.compile(u"薪酬|待遇|月薪|薪资|年薪|底薪|\d+k|\d+万|\d+元|工资|报酬|薪水|福利")

        self.BENEFIT = re.compile(u"周休|补助|补贴|假日|餐补|提成|交通补助|食宿|加班工资|期权|年假|领导|扁平化|管理|氛围|空间|休假|月假|带薪|全休|晋升|培训|舒适的|旅游|奖励|过节费|五险一金|奖金|\
                                  |弹性工作|氛围|成长空间|实训|培训|高薪|前景|旅游|活动|分红")
        
    


    def gen_data(self,fname='./data/lagou_train.txt'):
        fw = codecs.open('./data/jd_train_crf.txt','wb','utf-8')
        cnt = 1
        for line in codecs.open(fname,'rb','utf-8'):
            if line.startswith(u"====="):
                fw.write(line)
                continue

            cnt +=1
            if len(line.strip())>1:
                    pred = self.clf.predict(line)
                    newline = pred+'\t\t'+line.strip()+'\t\t'+str(len(line))+"\n"
                    fw.write(newline)
        print cnt
        print 'done'


    def load_data(self,fname="./data/jd_train_crf.txt"):
        data = []
        tmp = []
        for line in codecs.open(fname,'rb','utf-8'):
            if line.startswith(u"===="):
                data.append(tmp)
                tmp = []
                continue
            else:
                tag_data = line.strip().split('\t\t')
                if len(tag_data)==3:
                    tmp.append(tuple(tag_data))
                else:
                    print '\t  '.join(tag_data)

        
        n = len(data)/2
        print 'train data',n
        print 'test data',len(data)-n
        return data[n:],data[:n]
    

    def word2features(self,sent,i):
        word = sent[i][0]
        postag = sent[i][1]

        features = [
            'bias',
            'word.lower=' + word.lower(),
            'word[:2]=' +word[:2],
            'word.isdigit=%s'%word.isdigit(),
            'postag='+postag,
            'demand=%s' % ('1' if self.DEMAND.search(word) else '0'),
            'start_demand=%s' % ('1' if self.START_DEMAND.search(word) else '0'),
            'start_duty=%s' % ('1' if self.START_DUTY.search(word) else '0'),
            'duty=%s' % ('1' if self.DUTY.search(word) else '0'),
            'jobname=%s' % ('1' if self.JOBNAME.search(word) else '0'),
            'incname=%s' % ('1' if self.INCNAME.search(word) else '0'),
            'benefit=%s' % ('1' if self.BENEFIT.search(word) else '0'),
            'pred=%s' % self.clf.predict(word)
        ]

        if i>0:
            word1 = sent[i-1][0]
            postag1 = sent[i-1][1]

            features.extend([
                '-1:postag='+postag1,
                '-1:word.islower='+word1[:3].lower(),
                '-1:start_demand=%s' % ('1' if self.START_DEMAND.search(word1) else '0'),
                '-1:start_duty=%s' % ('1' if self.START_DUTY.search(word1) else '0'),
                '-1:demand=%s' % ('1' if self.DEMAND.search(word1) else '0'),
                '-1:duty=%s' % ('1' if self.DUTY.search(word1) else '0'),
                '-1:jobname=%s' % ('1' if self.JOBNAME.search(word1) else '0'),
                '-1:incname=%s' % ('1' if self.INCNAME.search(word1) else '0'),
                '-1:benefit=%s' % ('1' if self.BENEFIT.search(word1) else '0'),
                '-1:pred=%s' % self.clf.predict(word1),
            ])

        else:
            features.append('BOS')


        if i<len(sent)-1:
            word1 = sent[i + 1][0]
            postag1 = sent[i + 1][1]
            features.extend([
                '+1:word.lower=' + word1[:3].lower(),
                '+1:word.istitle=%s' % word1.istitle(),
                '+1:word.isupper=%s' % word1.isupper(),
                '+1:postag=' + postag1,
                '+1:postag[:2]=' + postag1[:2],
                '+1:start_demand=%s' % ('1' if self.START_DEMAND.search(word1) else '0'),
                '+1:start_duty=%s' % ('1' if self.START_DUTY.search(word1) else '0'),
                '+1:demand=%s' % ('1' if self.DEMAND.search(word1) else '0'),
                '+1:duty=%s' % ('1' if self.DUTY.search(word1) else '0'),
                '+1:jobname=%s' % ('1' if self.JOBNAME.search(word1) else '0'),
                '+1:incname=%s' % ('1' if self.INCNAME.search(word1) else '0'),
                '+1:benefit=%s' % ('1' if self.BENEFIT.search(word1) else '0'),
                '+1:pred=%s' % self.clf.predict(word1),
            ])
        else:
            features.append('EOS')


        return features




    def sent2features(self,sent):
        return [self.word2features(sent,i) for i in range(len(sent))]

    def sent2labels(self,sent):
        return [label for (label,token,postag) in sent]

    def sent2tokens(self,sent):
        return [token for (label,token,postag) in sent]
    

    def train(self,x_train,y_train):
        
        assert len(x_train)==len(y_train),"not the same %d  %d"%(len(x_train),len(y_train))

        trainer = pycrfsuite.Trainer(verbose=False)

        for xseq,yseq in zip(x_train,y_train):
            trainer.append(xseq,yseq)

        trainer.set_params({
            'c1':1.0,
            'c2':1e-3,
            'max_iterations':50,
            'feature.possible_transitions':True
        })

        trainer.train('jd_skill.crfsuite')

    
    def test(self,sent):
        tagger = pycrfsuite.Tagger()
        tagger.open('./jd_skill.crfsuite')
        
        print 'tokens   ','\n '.join(self.sent2tokens(sent))
        print 'Predicted','\t '.join(tagger.tag(self.sent2features(sent)))
        print 'Correct  ','\t '.join(self.sent2labels(sent))
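A hedged end-to-end sketch of the class above (file paths follow the defaults in the method signatures; nothing here is from the original project):

jd = JdCRF()
jd.gen_data()               # tag raw JD lines with the sentence classifier
train, test = jd.load_data()
x_train = [jd.sent2features(s) for s in train]
y_train = [jd.sent2labels(s) for s in train]
jd.train(x_train, y_train)  # writes jd_skill.crfsuite
jd.test(test[0])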
Example #23
train_src = [  # earlier (label, subject) pairs truncated in the source
    ('normal', 'Fw:关于ICP备案申请审核通过的通知'),
    ('normal', '技术部-SSL数字加密证书')
]


# create a grocery; 'mail_class' is the model name
grocery = Grocery('mail_class')

grocery.train(train_src)

grocery.save()

# load the model (same name as before)
new_grocery = Grocery('mail_class')

new_grocery.load()
# predict
print new_grocery.predict('关于神经网络与深度学习一书源码')
# education
# Test from list
# test_src = [
#     ('education', 'Abbott government spends $8 million on higher education media blitz'),
#     ('sports', 'Middle East and Asia boost investment in top level sports'),
# ]
# new_grocery.test(test_src)
# # Return Accuracy
# # 1.0
# # Or test from file
# new_grocery.test('test_ch.txt')
# # Custom tokenize
# custom_grocery = Grocery('custom', custom_tokenize=list)
Example #24
File: trydb.py  Project: LiaoPan/MyGit
q=0
for c in mycontent:
    if c:
        k=mysign[q]
        p=[k,c]
        train_listc.append(p)
        q=q+1

for t in mytitle:
    m=mysign[i]
    n=[m,t]
    train_list.append(n)
    i=i+1
grocery.train(train_listc)
grocery.train(train_list)
grocery.save()
new_grocery=Grocery('trydb')
new_grocery.load()
pc=message.getContent1()
pt=message.getTitle1()
g=1
for newscontent in pc:
    if newscontent:
        num=new_grocery.predict(newscontent+pt[g-1])
        message.saveContent(g,num)
    else:
        num=new_grocery.predict(pt[g-1])
        message.saveContent(g,num)
   
    g=g+1
Example #25
Train = CochraneAnalysis["Train"]
Data = CochraneAnalysis["Data"]
PredictedLabels = CochraneAnalysis["PredictedLabels"]

x = ["Y", "YU", "U", "UN", "N", "YR", "YUR", "UR", "UNR", "NR"]


def getTag(tag):
    if (tag[0] == "Y"):
        return "Y"
    return "N"


grocery = Grocery('sample')

train_src = []

for data in Train.find():
    label = getTag(data["Tag"])
    text = data["Authors' conclusions"]
    train_src.append((label, text))

# Preparing training data from hand-labeled classifiers
grocery.train(train_src)
grocery.save()

for data in Data.find():
    pred_label = grocery.predict(data["Authors' conclusions"])
    data["PredictedLabel"] = pred_label
    PredictedLabels.insert_one(data)
Example #26

		# ======= Predict And result.csv =======
		with open( "result.csv", "w") as outputfile:
			outputfile.write("NewsId,Agency")
			outputfile.write("\n")
			for fname in os.listdir(queryDir):
				if fname == ".DS_Store":
					continue
				with open(os.path.join(queryDir, fname)) as f:
					data = f.read().splitlines()
					nameOfdata = data[0]
					print nameOfdata
					content = ""
					for x in xrange(2,len(data)):
						# print data[x]
						content += data[x]
					# print content
					result = 0
					# if grocery.predict(simplify(content)) == 2:
						
					# else:
					result = corporaMgr.getClassifiyPublisherName(content, grocery.predict(simplify(content)),sentimentResult)
					outputfile.write(nameOfdata)
					outputfile.write(",")
					outputfile.write(result)
					outputfile.write("\n")

	else:
		print "Please use like 'python sim.py [originDocs_Dir] [outputDocs_Dir] [WithCatagoryAndPublisher] [train.csv]'"
	# python categorydocs.py news_story_dataset/ preprocessingData/simplify/ preprocessingData/withCategory/ train.csv p2data/phase2_test_dataset/
Example #28
def main():
    # Get market_sentiment of word from NTUSD-Fin
    train_t = []
    train_s = []
    targetIn = {}
    targetDict = dict()
    with open('NTUSD-Fin/NTUSD_Fin_hashtag_v1.0.json', 'r',
              encoding='utf-8') as f:
        targetIn = json.load(f)
    N = len(targetIn)
    for i in range(N):
        word = "#" + targetIn[i]['token']
        targetDict[word] = targetIn[i]['market_sentiment']
        sg = str(GroupValue_s(str(targetDict[word] / 3.5)))
        train_s.append((sg, word))
    with open('NTUSD-Fin/NTUSD_Fin_word_v1.0.json', 'r',
              encoding='utf-8') as f:
        targetIn = json.load(f)
    N = len(targetIn)
    for i in range(N):
        word = targetIn[i]['token']
        targetDict[word] = targetIn[i]['market_sentiment']
        sg = str(GroupValue_s(str(targetDict[word] / 3.5)))
        train_s.append((sg, word))

    # Training File: Load data & Use tgrocery to train classification model
    TrainingFile = open('training_set.json', 'r')
    TrainingData = json.load(TrainingFile)
    TrainingFile.close()
    DataList = []
    grocery_t = Grocery("tweet")
    grocery_s = Grocery("snippet")
    for DataElement in TrainingData:
        tempt = DataManager()
        tempt.insertData(DataElement)
        tempt.group_t = GroupValue_t(tempt.sentiment)
        tempt.group_s = GroupValue_s(tempt.sentiment)
        line = re.sub("https?://[\w\-]+(\.[\w\-]+)+\S*", " ",
                      DataElement["tweet"])
        train_t.append((str(tempt.group_t), line))
        if isinstance(DataElement["snippet"], list):
            for line in DataElement["snippet"]:
                train_s.append((str(tempt.group_s), line))
        elif DataElement["snippet"] != "":
            train_s.append((str(tempt.group_s), DataElement["snippet"]))
        else:
            tempt.group_s = 0.0
        DataList.append(tempt)
    grocery_t.train(train_t + train_s)
    grocery_t.save()
    grocery_s.train(train_s)
    grocery_s.save()

    # Save training data created by WordScore() and GroupValue_*()
    # Data will be used for LinearRegression() in BOTH.py
    outfile = open('TG_train.txt', 'w', encoding='utf-8')
    dataScore = []
    dataSentiment = []
    for row in DataList:
        dataSentiment.append([float(row.sentiment)])
        a = WordScore(row.tweet, targetDict)
        b = WordScore(row.snippet, targetDict)
        c = row.group_t
        d = row.group_s
        dataScore.append([a, b, c, d])
        print(a, b, c, d, file=outfile)
    outfile.close()
    '''
	# Train linear regression model
	model = LinearRegression()
	model.fit(dataScore, dataSentiment)

	# Test for training data
	print('(train)R-squared: %.3f' % model.score(dataScore, dataSentiment)) #0.915
	predictions = model.predict(dataScore)
	rms = mean_squared_error(dataSentiment,predictions)
	print('RMSE: %.3f' % sqrt(rms)) #0.110
	print('MSE: %.3f' % rms) #0.012
	'''

    # Testing File: Load data & Use tgrocery classification model to predict
    TestingFile = open('test_set.json', 'r')
    TestingData = json.load(TestingFile)
    TestingFile.close()
    DataList = []
    new_grocery_t = Grocery('tweet')
    new_grocery_t.load()
    new_grocery_s = Grocery('snippet')
    new_grocery_s.load()
    for DataElement in TestingData:
        tempt = DataManager()
        tempt.insertData(DataElement)
        line = re.sub("https?://[\w\-]+(\.[\w\-]+)+\S*", " ",
                      DataElement["tweet"])
        tempt.group_t = float('{0}'.format(new_grocery_t.predict(line)))
        value = 0.0
        if isinstance(DataElement["snippet"], list):
            for line in DataElement["snippet"]:
                value = value + float('{0}'.format(
                    new_grocery_s.predict(line)))
            value = value / len(DataElement["snippet"])
        elif DataElement["snippet"] != "":
            value = float('{0}'.format(
                new_grocery_s.predict(DataElement["snippet"])))
        tempt.group_s = value
        DataList.append(tempt)

    # Save testing data created by WordScore() and classification prediction
    # Data will be used for LinearRegression() in BOTH.py
    outfile = open('TG_test.txt', 'w', encoding='utf-8')
    dataScore = []
    dataSentiment = []
    for row in DataList:
        dataSentiment.append([float(row.sentiment)])
        a = WordScore(row.tweet, targetDict)
        b = WordScore(row.snippet, targetDict)
        c = row.group_t
        d = row.group_s
        dataScore.append([a, b, c, d])
        print(a, b, c, d, file=outfile)
    outfile.close()
    '''
Example #29
# coding: utf-8

from tgrocery import Grocery

# save
grocery = Grocery('test')
train_src = [('education', '名师指导托福语法技巧:名词的复数形式'),
             ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
             ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
             ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')]
grocery.train(train_src)
grocery.save()

# load
# grocery name must be the same as the previous one
new_grocery = Grocery('test')
new_grocery.load()
print new_grocery.predict('考生必读:新托福写作考试评分标准')
Example #30
# test ##################################
print 'start test'
TP = 0.0
TN = 0.0
FP = 0.0
FN = 0.0

filetest = codecs.open(testFileName, 'r', 'utf-8')
test_reader = filetest.readlines()

resultlist = []
for line in test_reader:
    fields = line.split(u'\t')
    #import pdb; pdb.set_trace()
    #print line
    result = grocery.predict(fields[2])
    resultlist.append((fields[0], result))
    #print result
    #import pdb; pdb.set_trace()
    if result == fields[1]:
        if fields[1] == u'0':
            TN = TN + 1
        else:
            TP = TP + 1
    else:
        if fields[1] == u'0':
            FP = FP + 1
        else:
            FN = FN + 1

precision = TP / (TP + FP)
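The same four counters give the remaining metrics; a small follow-on sketch using the names above:

recall = TP / (TP + FN)
accuracy = (TP + TN) / (TP + TN + FP + FN)
f1 = 2 * precision * recall / (precision + recall)
print 'precision %.3f  recall %.3f  f1 %.3f' % (precision, recall, f1)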
Example #31
            concat = ','.join(no) + '$%^' + ','.join(qz)
            print concat
            clientSender.publish('similarResult', reqParamList[0] + '!@#' + concat)

        elif item['channel'] == 'abstract':

            # text extraction
            text = reqParamList[1]
            tr4s.train(text=text, speech_tag_filter=True, lower=True, source = 'all_filters')

            # filter by POS tag, lowercase the text, and use words_all_filters to score sentence similarity
            # abstractResult = '\n'.join(tr4s.get_key_sentences(num=1+len(text)/350))
            abstractResult = tr4s.get_key_sentences(num=1 + len(text) / 250)



            clientSender.publish('abstractResult', reqParamList[0] + '!@#' + '$%^'.join(abstractResult))

        elif item['channel'] == 'classification':
            doc = reqParamList[1]
            #data, data_vec = ldaModel.file_to_vec(doc)
            #out_put, out_put_class = ldaModel.pre(data_vec)
            t_pre_result = grocery.predict(delete_stop_words(doc))
            out_put_class = t_pre_result.predicted_y
            clientSender.publish('classificationResult', reqParamList[0] + '!@#' + out_put_class)



Example #32
resultlist = []
i=0
for line in validate_reader:
    content=pp.getcontent(validate_reader,i)
    i=i+1
    if(i%5000==0):
        print ("%d "%(i))+'#'*30
    #if(i>10):
        #break
    if(content==''):
        print line
    else:
        fields = content.split('\t')
        label_len = len(fields[0])
        result = grocery.predict(content[label_len + 3:])
        if result == fields[1]:
            if fields[1] == u'0':
                TN = TN + 1
            else:
                TP = TP + 1
        else:
            if fields[1] == u'0':
                FP = FP + 1
                fileOutput.write('FP: ' + line + ' \n')
            else:
                FN = FN + 1
                fileOutput.write('FN: ' + line + ' \n')

precision=TP/(TP+FP)
recall=TP/(TP+FN)
Example #33
train_set = '/home/hntea/RobotWorkSpace/SpeechSystem/speech_system/src/nlu/script/training/dataset/train.dat'
text_set = '/home/hntea/RobotWorkSpace/SpeechSystem/speech_system/src/nlu/script/training/dataset/test.dat'

modelsave = 'model'
grocery = Grocery(modelsave)
grocery.train(train_set)
# save the model
grocery.save()
# load the model (the name must match the saved one)
new_grocery = Grocery(modelsave)
new_grocery.load()
# predict
# ret = new_grocery.predict('放一首歌来听').predicted_y
# print "放一首歌吧"+str(new_grocery.predict('放一首歌来听').predicted_y)
print new_grocery.predict('你叫什么名字')
print new_grocery.predict('吃饱没有')
print new_grocery.predict('周杰伦')
print new_grocery.predict('黑色衣服好看')
print new_grocery.predict('王力宏')
print new_grocery.predict('波哥')
print new_grocery.predict('播歌')
print new_grocery.predict('我要听张含韵的歌')
print new_grocery.predict('放一首:富士山下')
print new_grocery.predict('点播:兄弟')
print new_grocery.predict('听歌')
print new_grocery.predict('听歌。')
print new_grocery.predict('我要听歌')
print new_grocery.predict('我要听音乐。')
print new_grocery.predict('播放歌曲。')
print new_grocery.predict('音乐播放。')
Example #34
import sys

reload(sys)
sys.setdefaultencoding('utf8')

# from pyspark import SparkContext
# from pyspark.sql import *
# from pyspark.sql.types import *
# import time
# import rapidjson as json
#
# sc = SparkContext(appName="cmt")
# sqlContext = SQLContext(sc)
# hiveContext = HiveContext(sc)

from tgrocery import Grocery

grocery = Grocery('sample')

train_src = [
    ('education', 'Student debt to cost Britain billions within decades'),
    ('education', 'Chinese education for TV experiment'),
    ('sports', 'Middle East and Asia boost investment in top level sports'),
    ('sports',
     'Summit Series look launches HBO Canada sports doc series: Mudhar')
]

grocery.train('/home/hadoop/tmp/tgrocery/train_file.txt')

print grocery.predict("7款清爽眼部卸妆液 卸掉残妆不留暗沉")
Example #35
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import MySQLdb
from tgrocery import Grocery
import sys
reload(sys)
sys.setdefaultencoding('utf8')

grocery = Grocery('sample')
dict_list = list()

conn = MySQLdb.connect(host = 'localhost', db = 'newsdata', user = '******', passwd = 'root', charset = 'utf8', use_unicode = False)
cur = conn.cursor()
cur.execute('select com_new_type_id, com_new_name from tbl_new where com_new_type_id is not null')
for row in cur.fetchall():
    dict_list.append(row)


grocery.train(dict_list)
grocery.save()

news_grocery = Grocery('sample')
news_grocery.load()
while True:
    result = news_grocery.predict(raw_input('please input title:' ))
    print result

Example #36
##########################################
# init
model_choose = "svm"  # svm, lda, rnn
grocery_name = "./SVM_models/svm_for_news"
corpus_path = "./Corpus/NewsClassCorpus/"
file_path = "./"
file_name = "post.txt"

t_text = delete_stop_words(codecs.open(file_path + file_name, encoding="UTF-8").read())

###########################################
# classify with the SVM model
if model_choose == "svm":
    tic = time.time()
    grocery = Grocery(grocery_name)
    grocery.load()
    t_pre_result = grocery.predict(t_text)  # stop words were already removed when t_text was built
    toc = time.time()

    t_label = t_pre_result.predicted_y
    print("Sentiment: ", t_label)
    print("How much: ", t_pre_result.dec_values[t_label])
    print("Elapsed time of predict is: %s s" % (toc - tic))
elif model_choose == "lda":
    pass
elif model_choose == "rnn":
    pass
else:
    print("")
Example #37
# coding: utf-8

from tgrocery import Grocery


grocery = Grocery('test')
train_src = [
    ('education', '名师指导托福语法技巧:名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')
]
grocery.train(train_src)
print(grocery.get_load_status())
predict_result = grocery.predict('考生必读:新托福写作考试评分标准')
print(predict_result)
print(predict_result.dec_values)

grocery = Grocery('read_text')
train_src = '../text_src/train_ch.txt'
grocery.train(train_src)
print(grocery.get_load_status())
predict_result = grocery.predict('考生必读:新托福写作考试评分标准')
print(predict_result)
print(predict_result.dec_values)
Example #38
# test ##################################
print 'start test'
TP=0.0
TN=0.0
FP=0.0
FN=0.0

filetest=codecs.open(validateFileName,'r','utf-8')
test_reader=filetest.readlines()

resultlist=[]
for line in test_reader:
    fields = line.split(u',')
    #import pdb; pdb.set_trace()
    #print line
    result = grocery.predict(fields[1])
    #print result
    #import pdb; pdb.set_trace()
    if result == fields[0]:
        if fields[0] == u'0':
            TN = TN + 1
        else:
            TP = TP + 1
    else:
        if fields[0] == u'0':
            FP = FP + 1
        else:
            FN = FN + 1

precision=TP/(TP+FP)
recall=TP/(TP+FN)
Example #39
# coding: utf-8

from tgrocery import Grocery

# pass a tokenizer, must be a python func
custom_grocery = Grocery('custom', custom_tokenize=list)
train_src = [
    ('education', '名师指导托福语法技巧:名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')
]
custom_grocery.train(train_src)
print custom_grocery.get_load_status()
print custom_grocery.predict('考生必读:新托福写作考试评分标准')
Example #41
grocery = Grocery('sample')
train_src = r'E:\classify\plan2\train_kmer.txt'
grocery.train(train_src, delimiter=',')
print('Training finished! Time consumption:')
mid = time.process_time()
print(str(mid - start))
grocery.save()

grocery.load()
test_src = r'E:\classify\plan2\test_kmer.txt'
print('Classification accuracy:')
print(grocery.test(test_src, delimiter=','))

classifile = open(r'E:\classify\tokens.txt', mode='r', encoding='utf-8')
pinyin = open(r'E:\classify\pinyin_grocery.txt', mode='w', encoding='utf-8')
words = open(r'E:\classify\words_grocery.txt', mode='w', encoding='utf-8')

for line in classifile.readlines():
    kind = grocery.predict(getkmer(line, 2))  # classify each token line once
    if kind == 'word':
        words.write(line)
    elif kind == 'pinyin':
        pinyin.write(line)

classifile.close()
pinyin.close()
words.close()

print('Program running time:')
end = time.process_time()
print(str(end - start))
Example #42
# coding:utf-8
#!/usr/bin/evn python
from tgrocery import Grocery 


copy_grocery = Grocery('./classfynews_instance')  # path to the model
copy_grocery.load()
#copy_grocery = grocery
test = ['我是中国人','台北*****']
test_result = copy_grocery.predict(test)
print test_result.predicted_y
#test_result = copy_grocery.test(test_in)
#print test_result.show_result()


Example #43
# coding=utf-8
from tgrocery import Grocery

grocery = Grocery('sample')

train_src = [('education', '名师指导托福语法技巧:名词的复数形式'),
             ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
             ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
             ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')]
grocery.train(train_src)
#grocery.train('/home/wangjianfei/git/data/train_ch.txt')
# grocery.train('train_ch.txt')
grocery.save()
new_grocery = Grocery('sample')
new_grocery.load()
print(
    new_grocery.predict(
        'Abbott government spends $8 million on higher education media blitz'))
test_src = [
    ('education', '福建春季公务员考试报名18日截止 2月6日考试'),
    ('sports', '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'),
]
print("start test..................")
#grocery.test('/home/wangjianfei/git/data/test.txt')
# grocery.train('train_ch.txt'))
# custom_grocery = Grocery('custom', custom_tokenize=list)
print(new_grocery.test(test_src))
Example #44
class TagPredictor(object):
    def _custom_tokenize(self, line, **kwargs):
        # fall back to the method given at construction time
        method = str(kwargs.get("method", self.kwargs["method"]))
        if method == "normal":
            tokens = self.key_ext.calculateTokens(line,
                                                  doc_len_lower_bound=5,
                                                  doc_len_upper_bound=500,
                                                  method="normal")
        elif method == "processed":
            tokens = line.split(',')
        return tokens

    def __init__(self, *args, **kwargs):
        self.grocery_name = str(kwargs["grocery_name"])
        method = str(kwargs["method"])
        train_src = str(kwargs["train_src"])
        self.PREFIX = conf.load("predict_label")["prefix"]
        self.MODEL_DIR = conf.load("predict_label")["model_dir"]

        self.kwargs = kwargs
        if method == "normal":
            self.key_ext = keyExt()
            self.grocery = Grocery(self.grocery_name,
                                   custom_tokenize=self._custom_tokenize)
        elif method == "jieba":
            self.grocery = Grocery(self.grocery_name)
        elif method == "processed":
            self.grocery = Grocery(self.grocery_name,
                                   custom_tokenize=self._custom_tokenize)
        pass

    def trainFromDocs(self, *args, **kwargs):
        model = self.grocery.train(self.kwargs["train_src"])
        return model

    def autoEvaluation(self, *args, **kwargs):
        prune_threshold = float(kwargs["threshold"])
        excluded_labels = kwargs["excluded_labels"]
        excluded_docs = kwargs["excluded_docs"]

        train_data = []
        with open(self.kwargs["train_src"], 'rb') as f:
            for line in f:
                try:
                    line.split('\t', 1)[1]
                except:
                    continue
                else:
                    train_data.append(
                        (line.split('\t',
                                    1)[0], line.split('\t',
                                                      1)[1].split('\n', 1)[0]))
        f.close()

        print "#items before filtering:", len(train_data)
        print "-- Now we filter out the excluded docs --"
        train_data = [i for i in train_data if i[1] not in excluded_docs]
        print "#items after filtering:", len(train_data)
        print "-- Now we filter out the excluded labels --"
        train_data = [i for i in train_data if i[0] not in excluded_labels]
        print "#items after filtering:", len(train_data)

        n = len(train_data)  #number of rows in your dataset
        indices = range(n)
        indices = shuffle(indices)
        train_set = map(lambda x: train_data[x], indices[:n * 10 // 10])
        test_set = map(lambda x: train_data[x], indices[:n * 10 // 10])

        self.grocery.train(train_set)
        test_result = self.grocery.test(test_set)
        print '-- Accuracy after training --'
        print 'Accuracy, A-0:', test_result

        low_recall_label = []
        for item in test_result.recall_labels.items():
            if item[1] < prune_threshold:
                low_recall_label.append(item[0])
        new_train_set = [
            item for item in train_set if item[0] not in low_recall_label
        ]
        new_test_set = [
            item for item in train_set if item[0] not in low_recall_label
        ]

        self.grocery.train(new_train_set)
        new_test_result = self.grocery.test(new_test_set)

        print '-- Accuracy after training, with low-recall labels (less than', str(
            prune_threshold * 100), '%) pruned --'
        print 'Accuracy, A-1:', new_test_result

        return self.grocery, new_test_result

    def manualEvaluation(self, *args, **kwargs):
        n_docs = int(kwargs["n_docs"])
        excluded_labels = kwargs["excluded_labels"]
        excluded_docs = kwargs["excluded_docs"]

        train_data = []
        with open(self.kwargs["train_src"], 'rb') as f:
            for line in f:
                try:
                    line.split('\t', 1)[1]
                except:
                    continue
                else:
                    train_data.append(
                        (line.split('\t',
                                    1)[0], line.split('\t',
                                                      1)[1].split('\n', 1)[0]))
        f.close()

        train_data = [
            item for item in train_data if item[0] not in excluded_labels
        ]
        train_data = [i for i in train_data if i[1] not in excluded_docs]

        n = len(train_data)  #number of rows in your dataset
        indices = range(n)
        indices = shuffle(indices)
        test_set = map(lambda x: train_data[x], indices[0:n_docs])
        g = self.loadTrainModel()
        test_result = g.test(test_set)
        return test_set, test_result

    def saveTrainModel(self, *args, **kwargs):
        self.grocery.save()
        os.rename(
            self.PREFIX + self.grocery_name + '_train.svm',
            self.PREFIX + self.MODEL_DIR + self.grocery_name + '_train.svm')
        return

    def loadTrainModel(self, *args, **kwargs):
        os.rename(
            self.PREFIX + self.MODEL_DIR + self.grocery_name + '_train.svm',
            self.PREFIX + self.grocery_name + '_train.svm')
        self.grocery.load()
        os.rename(
            self.PREFIX + self.grocery_name + '_train.svm',
            self.PREFIX + self.MODEL_DIR + self.grocery_name + '_train.svm')
        return self.grocery

    def predict(self, line, **kwargs):
        tag = self.grocery.predict(line)
        return tag

    def test(self, *args, **kwargs):
        test_src = str(kwargs["test_src"])
        test_result = self.grocery.test(test_src)
        print "Total Accuracy", test_result

        return test_result
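A hedged construction sketch for TagPredictor (the kwargs follow __init__ above; the file path is hypothetical, and the conf module it reads prefix/model_dir from must be importable):

tp = TagPredictor(grocery_name='tags', method='jieba', train_src='./tags_train.txt')
tp.trainFromDocs()
tp.saveTrainModel()
print tp.predict(u'考生必读:新托福写作考试评分标准')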
Example #45
    ftest = open(path2, 'w')
    for line in open(path):
        if random.random() < theta:
            ftest.write(line)
        else:
            ftrain.write(line)
    ftrain.close()
    ftest.close()

def train(path,name):
    grocery = Grocery(name)   
    grocery.train(path)
    grocery.save()

if __name__ == "__main__":
    data2tt(sys.argv[3], sys.argv[1], sys.argv[2], 0.02)
    train(sys.argv[1], "music")
    new_grocery = Grocery("music")
    new_grocery.load()
    n = 0
    for line in open(sys.argv[2], "r"):
        ls = line.strip().split("\t")
        predict = new_grocery.predict(ls[1])
        test = ls[0]
        result = 0
        if test == str(predict):
            result = 1
        n += result
        print predict,test,result
    print n
Example #46
# test ##################################
#grocery=Grocery('sample')
grocery = Grocery('version1.0')
grocery.load()

print 'start test'

filetest = codecs.open(testFileName, 'r', 'utf-8')
test_reader = filetest.readlines()

fileOutput = codecs.open(outputFileName, 'w', 'utf-8')

i = 0
for line in test_reader:
    content = pp.getcontent(test_reader, i)
    i = i + 1
    #if(i>10):
    #break
    if (i % 5000 == 0):
        print("%d " % (i)) + '#' * 30

    if (content == ''):
        print "test.py#" * 3 + line
    else:
        fields = content.split('\t')
        label_len = len(fields[0])
        result = grocery.predict(content[label_len + 1:])
        fileOutput.write(fields[0] + ',' + result + '\n')

filetest.close()
fileOutput.close()
Example #47
train_src = [  # earlier (label, title) pairs truncated in the source
    ('english', u'新托福考试官方指南'),
    ('extra', u'牛奶可乐经济学'),
    ('course', u'自动控制理论与设计'),
    ('course', u'电力电子技术'),
    ('course', u'数字图像处理'),
    ('course', u'自动控制原理习题精解与考研指导'),
    ('course', u'现代检测技术'),
    ('extra', u'忒修斯之船'),
    ('professional', u'Arduino程序设计基础'),
    ('professional', u'机器学习导论'),
    ('professional', u'TensorFlow实战'),
    ('professional', u'Effective Modern C++'),
    ('extra', u'重新发现社会'),
    ('extra', u'Letter from an Unknown Woman')
]

# create a model named 'book_class'
grocery = Grocery('book_class')
grocery.train(train_src)
grocery.save()

# load the model 'book_class'
new_grocery = Grocery('book_class')
new_grocery.load()

# make predictions
book = raw_input('bookname:  ')
while book.strip():
    print('category: ', new_grocery.predict(book))
    book = raw_input('bookname:  ')
Example #48
class JdParser(object):
    def __init__(self):
        self.degreedic = set(line.strip() for line in codecs.open(
            './data/degrees.txt', 'rb', 'utf-8'))  # load the degree lexicon
        self.majordic = set(line.strip() for line in codecs.open(
            './data/majordic.txt', 'rb', 'utf-8'))  # load the major lexicon
        self.citydic = set(line.strip() for line in codecs.open(
            "./data/citydic.txt", 'rb', 'utf-8'))  # load the city lexicon
        self.firmnames = set(line.strip() for line in codecs.open(
            './data/firm.txt', 'rb', 'utf-8'))  # load the company-abbreviation lexicon
        self.jobdic = set(line.strip() for line in codecs.open(
            './data/jobposition.txt', 'rb', 'utf-8'))  # load the job-title lexicon
        self.skills = set(
            line.strip()
            for line in codecs.open('./data/skills.txt', 'rb', 'utf-8'))
        #        self.wordlisttf = pickle.load(open('./data/wordlist.pkl'))  # 2000 most frequent words
        # self.w2vdict = json.load(open('./data/word2vec_50.json'))  # word2vec vectors for those 2000 words
        self.clf = Grocery("jdclf")  # sentence classifier: demand / duty / other
        self.clf.load()

        self.SEX = re.compile(u"性别不限|性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.DEGREE = re.compile(
            u"(全日制)?(初中|高中|中专|大专|专科|大学专科|中职|本科|大学本科|硕士|研究生|博士|博士后)(.?以上)?")
        self.MAJOR = re.compile(u"\S+(相关专业|专业优先|及其.专业|[类等]专业[优先]?)")
        self.EXP = re.compile(u"工作经验|工作年限|工作经历|项目经[历验]|\d年经[历验]|.{1,2}年相关工作经验")
        self.PUB_TIME = re.compile(u"(\d+)(天前发布)")

        self.INCNAME = re.compile(
            u"\S+(有限公司|酒店|银行|集团|厂|研究中心|研究所|学校|旅行社|中心/s|分?公司|研发中心|技术部|事.部|招聘|商务平台)"
        )
        self.INCTAG = re.compile(
            u"大公司|五百强|全球500强|小公司|成长型公司|创业公司|私有经济|集体经济|集团|外企|已上市|稳定性高|平均年龄\d+岁|妹纸多|学历高|福利待遇好|晋升机会大|民营公司|民营企业|互联网|创业型|国企|央企"
        )

        self.JOBNAME = re.compile(
            u'\S*(研发工程师|工程师|经理|助理|顾问|前台|秘书|主管|研究员|实习生|操作员|专员|教学人员|技术人员|管理员|业务员|公关|程序员|教师|老师|培训生|\
                                  文员|研究员|策划|主任|总监|设计师|分析师|架构师|摄影师|编辑|BD|游戏UI|Android(开发)?|PHP(开发)?|Python(开发)?|.?(急招|急聘|初级|中级|高级|方向).?[\s)】\)])|\
                                  |行政人事|网店设计|客服|会计|电话销售|外贸跟单|web前端|游戏UI|后.开发|产品运营|商业数据分析'
        )

        self.START_DEMAND = re.compile(
            u"(岗位要求|应聘条件|任职要求|岗位资格|任职资格|岗位条件|工作要求|任职条件|人员条件|职位.求|职位条件|职位描述|岗位资格|职位资格|具备条件)[::\s]\
                                       |如果你.{0,10}[::\s]|我们希望你.{0,12}[::\s]|(要求|条件)[::\s]|你需要?具备什么.+[?\?::\s]|任职资格[::\s]"
        )
        self.DEMAND = re.compile(
            u"熟悉|熟练|具有|善于|懂得|掌握|具备|能够|优先|不少于|不超过|至少|团队.作|良好的|工作经验|开发经验|实习经历|能力强|富有|学历|经验|喜欢|较强的.{2,8}能力|相关专业|相关学历|者优先|精通|了解|及以上|技术全面|.强的责任心|[能有]独立|英文流利"
        )
        self.DUTY = re.compile(
            u"跟进|协助|负责|配合|其他工作|领导交办的|对.+提供|审核|参与|为.+提出|日常.+工作|指导|对.+进行|为.+提供|跟进|拓展|运营|用户|客户|协调|拟写|通过|协同|完成|沟通|需求|秘书.{2,5}翻译"
        )
        self.START_DUTY = re.compile(
            u"(岗位职责|岗位描述|职位描述|职责描述|任职描述|职位职责|工作职责|工作职能|职位职能|工作内容|实习内容|职位内容)[::\s]|做这样的事[::\s]|职责.{0,5}[::\s]"
        )
        self.PAY = re.compile(u"薪酬|待遇|月薪|薪资|年薪|底薪|\d+k|\d+万|\d+元|工资|报酬|薪水|福利")
        self.BENEFIT = re.compile(
            u"周休|补助|补贴|假日|餐补|提成|交通补助|食宿|加班工资|期权|年假|领导|扁平化|管理|氛围|空间|休假|月假|带薪|全休|晋升|培训|舒适的|旅游|奖励|过节费|五险一金|奖金|\
        |弹性工作|氛围|成长空间|实训|培训|高薪|前景|旅游|活动")

        self.SPLIT_JD = re.compile(
            u"岗位[【(]?[一二三四五六七八九][】)][::\s]|(^招聘岗位\S+|岗位\d|岗位[一二三四五六])[::\s]")
        self.CLEAR_NUM = re.compile(u"^\d[\.: :。、]|^[\((【]?\d[\))】\.]")
        self.CLEAR_COLO = re.compile(u"^[\s\.。)(【】,,]|[。;,\.;,]$|^\d[\.]")
        self.SKILL = re.compile(
            u"精通|了解|熟练|熟悉|掌握|懂得|优先|具备|具有|者优先|擅长|善于|较强的.{2,6}能力|良好的|有.+经验|能力|极强的"
        )

        jieba.load_userdict('./data/majordic.txt')
        jieba.load_userdict('./data/skills.txt')
        jieba.load_userdict('./data/firm.txt')
        jieba.load_userdict('./data/degrees.txt')
        jieba.load_userdict('./data/benefits.txt')

        self.jdStr = ""
        self.linelist = []
        self.lineindex = defaultdict(int)
        self.result = OrderedDict()

    # split into lines and preprocess
    def preprocess(self, jdstr):
        self.result.clear()
        jdstr = re.sub(u"[【】◆ \u25cf\u25c6\u2605]", "", jdstr.decode('utf-8'))
        self.linelist = [
            line.strip() for line in jdstr.split('\n') if len(line) > 1
        ]
        self.jdStr = '\n'.join(self.linelist)
        for line in self.linelist:
            # print self.clf.predict(line),'\t',line
            self.lineindex[re.sub(u"[\s ]+", " ", line)] = 0

    def line2vec(self, line):
        vec = np.zeros(50)
        cnt = 1
        for word in jieba.cut(line):
            if word in self.w2vdict:
                vec += self.w2vdict[word]
                cnt += 1
        vec = vec / cnt
        return vec

    # extract the gender requirement
    def regular_sex(self):
        res = set()
        for line in self.linelist:
            if self.clf.predict(line) == 'demand' or self.DEMAND.search(line):
                findsex = self.SEX.search(line)
                if findsex:
                    getsex = re.search(u"性别不限|男|女",
                                       line.replace(u"男女不限", u"性别不限"))
                    if getsex:
                        res.add(getsex.group())
                        break
        if res:
            self.result['sex'] = ' / '.join(res)
        else:
            self.result['sex'] = u'性别不限'

    # extract the age requirement
    def regular_age(self):
        res = ''
        for line in self.linelist:
            if re.search(u'\d{2}后', line): continue
            findage = self.AGE.search(line)
            if findage:
                age = re.findall(u'\d{2}', line)
                if len(age) >= 2:
                    res = '-'.join(age)
                elif len(age) == 1:
                    if re.search(u'以上|不低于', line):
                        res = age[0] + u'以上'
                    if re.search(u"不超过|不高于|以下", line):
                        res = age[0] + '以下'
                    if re.search(u"左右|大约|大概", line):
                        res = age[0] + '左右'
                break
        if len(res) < 2:
            res = u'年龄不限'
        self.result['age'] = res
        return res

    # Extract the major (field-of-study) requirement
    def regular_major(self):
        res = []

        for line in self.linelist:
            findmajor = re.search(u"专业要求[::\s]", line)
            if findmajor:
                print 'major demand', line
                items = self.clean_line(line[findmajor.span()[1]:]).split()
                items = filter(
                    lambda x: x not in self.degreedic and not re.search(
                        u"薪酬|经验|元|\d+|月", x), items)
                res.append(' / '.join(items))
                break

        if not res:
            for line in self.linelist:
                if re.search(u"专业.限|.限专业",
                             line) and not re.search(u"专业优先", line):
                    res.append(u"专业不限")
                    print 'major demand', line
                    break
                else:
                    findmajor = self.MAJOR.search(line)
                    if findmajor:
                        majoritem = re.split(u'[\s,,;; ]', findmajor.group())
                        for item in majoritem:
                            if re.search(
                                    u'学历|年龄|岁|学校|公司|性格|具有|具备|能够|经验|有|毕业|性别|男|女',
                                    item):
                                continue
                            if self.BENEFIT.search(line):
                                continue
                            print 'major item', item
                            if re.search(u"专业", item) and len(item) < 3:
                                continue
                            res.append(self.clean_line(item))
                        # Fallback: segment the line and look majors up in the dictionary
                        if not res:
                            for majorword in jieba.cut(line):
                                if majorword in self.majordic or majorword[:-2] in self.majordic:
                                    res.append(majorword)
                            if re.search(u"[等及类]?相关专业", self.jdStr) and len(res) == 1:
                                res[0] += u"等相关专业"
                        break

        if not res:
            res.append(u"专业不限")

        self.result['major'] = res

    # Extract the education (degree) requirement
    def regular_degree(self):
        """
        抽查学历信息,先整找关键字,而后再切词,用词典匹配
        """
        degree = [
            u'小学', u'初中', u'中专', u'中技', u'高中', u'专科', u'大专', u'本科', u'硕士',
            u'博士', u'博士后'
        ]
        res = set()
        for line in self.linelist:
            finddegree = re.search(u"学历要求[::\s]", line)
            if finddegree:
                items = self.clean_line(line[finddegree.span()[1]:]).split()
                items = filter(lambda x: not re.search(u"薪酬|经验|元|月|年|\d+", x),
                               items)
                res.add(' / '.join(items))
                break

        if not res:
            for line in self.linelist:
                if re.search(u"学历不限|学历要求不限|不限学历", line):
                    res.add(u"学历不限")
                    break
                else:
                    finddegree = self.DEGREE.search(line)
                    if finddegree:
                        res.add(finddegree.group())
                        break

        # If no explicit degree requirement matched, segment the whole text and match against the dictionary
        if len(res) == 0:
            for word in jieba.cut(self.jdStr):
                if word in self.degreedic:
                    res.add(word)
        res = list(res)
        if len(res) == 1 and re.search(u'[及或]?以上', res[0]):
            tmp = res[0][:2]
            if tmp == u'全日':
                tmp = u'本科'
            elif tmp == u'研究':
                tmp = u'硕士'
            if tmp in degree:
                idx = degree.index(tmp)
                res = degree[idx:]

        self.result['degree'] = ' / '.join(res)

    # Extract the required years of work experience
    def regular_exp(self):

        cnyear = u'[半一二三四五六七八九十两]年|\d-\d{1,2}年|\d年及?以上|不少于\d年|\d年'
        res = set()
        jdStr = self.jdStr

        findexp = re.search(u'经验不限|(经验)?\d{1,2}年及以上|经验\d-\d{1,2}年', jdStr)
        if findexp:
            res = findexp.group()
            self.result['exp'] = res.replace(u"经验", "")
            return res

        findexp = self.EXP.search(jdStr)
        if findexp:
            pos = findexp.span()[1]
            jdStr = jdStr[max(0, pos - 25):min(pos + 15, len(jdStr))]
            exp = re.search(cnyear, jdStr)
            if exp:
                res.add(exp.group())

        if not res:
            exp = re.search(
                u"(\d-)?\d{1,2}年(工作|开发|项目)?经[历验]|(不少于)?([半\d]年)及?(以上)?经[历验]|经[历验]\s?(\d-)?\d{1,2}年",
                ' '.join(self.regular_jobtag()))
            if exp:
                res.add(exp.group())
            else:
                exp = re.search(cnyear, ' '.join(self.regular_jobtag()))
                if exp:
                    res.add(exp.group())

        self.result["exp"] = "-".join(res)
        self.result["exp"] = self.result['exp'].replace(u'经验',
                                                        "").replace(u"经历", "")
        return res

    def regular_jobtag(self):
        """
        有关职位标签信息
        """
        res = []
        job_tag = re.search(u"应届生|全职|兼职|实习生|应届毕业生|社招|急招|急聘", self.jdStr)
        if job_tag:
            res.append(job_tag.group())

        job_tag = re.search(u"招聘人数[::]?|招聘[::\s]|人数[::\s]", self.jdStr)
        if job_tag:
            jdstr = self.jdStr[job_tag.span()[1]:]
            for line in jdstr.split():
                if len(line.strip()) < 1: continue
                else:
                    num = re.search(u"(\d+\-)?\d+人?|若干|\d+位", line)
                    if num:
                        res.append(u"招聘人数:" + num.group())
                    break

        job_tag = re.search(u"(职能类别|职位标签)[:: ]?", self.jdStr)
        if job_tag:
            jdstr = self.jdStr[job_tag.span()[1]:]
            for line in jdstr.split('\n'):
                if len(line.strip()) < 3: continue
                else:
                    res.append("职业标签:" + line.strip())
                    break
                if len(line) > 25: break

        #  Per product-team request, also cut out short sentences that mention experience
        #  and other job-tag information, using finer-grained splitting
        linelist = [
            line for line in re.split(u"[,。;\s]", self.jdStr)
            if 5 < len(line) < 15
        ]
        for line in linelist:
            if re.search(u"经验", line) and not re.search(u"月薪|地点|日期", line):
                if re.search(u"\d+k|[。?)\)\]]", line): continue
                res.append(self.clean_line(line))
                break

        self.result["job_tag"] = res
        return res

    # Strip leading numbers and punctuation from a line
    def clean_line(self, line):
        line = self.CLEAR_NUM.sub("", line.strip())
        line = self.CLEAR_COLO.sub("", line)
        return line

    # Extract the work location
    def regular_workplace(self):
        res = set()
        jdstr = self.jdStr
        pos = list(re.finditer(u"(工作地.|上班地.|实习地.|地址|地点)[::\s]", jdstr))
        if pos:
            jdstr = jdstr[pos[0].span()[1]:]

            for line in jdstr.split():
                if len(line.strip()) < 2: continue
                if len(line) < 26:
                    res.add(line.strip().replace(":", "").replace(":", ""))
                else:
                    for city in jieba.cut(line):
                        if city in self.citydic and city[:-1] not in res:
                            res.add(city)
                break
        if not res:
            for city in jieba.cut(jdstr):
                if city in self.citydic and city[:-1] not in res and u"国" not in city:
                    res.add(city)
                    break
        self.result["workplace"] = " / ".join(res)
        return res

    # Extract certificates, awards, and similar extra requirements
    def regular_cert(self):
        res = set()
        linelist = [
            line for line in re.split(u"[\s ,。;,]", self.jdStr)
            if len(line) > 3
        ]
        for line in linelist:
            findcert = re.search(
                u"(\S+证书|CET-\d|普通话|英语|口语|.语|日文|雅思|托福|托业)(至少)?(通过)?[\d一二三四五六七八九]级[及或]?(以上)?|(英语)?CET-\d级?(以上)?|职业资格|律师证|会计证",
                line)
            if findcert:
                res.add(findcert.group())
            else:
                findcert = re.search(u"有(.+证)书?", line)
                if findcert:
                    res.add(findcert.group(1))
                else:
                    findcert = re.search(u"有.+资格", line)
                    if findcert:
                        res.add(findcert.group())

        self.result['cert'] = re.sub(u"[或及以上]", "", ' / '.join(res))
        if self.result['cert']:
            self.result['cert'] = self.result['cert'].split(' / ')
        else:
            self.result['cert'] = []

    # Extract relevant skills using the skill lexicon
    def regular_skill(self, num=6):
        res = []
        for line in self.linelist:
            if self.DEMAND.search(line) or self.clf.predict(line) == 'demand':
                for word in jieba.cut(line):
                    word = strQ2B(word).lower()
                    if word in self.skills:
                        res.append(word)

        sorted_words = [w[0] for w in Counter(res).most_common(2 * num)]

        # assumes regular_jobname() has already filled result['job_name']
        for word in jieba.cut(self.result['job_name']):
            word = strQ2B(word).lower()
            if word in self.skills and word not in sorted_words:
                sorted_words.insert(0, word)

        after_top3 = sorted_words[3:]
        np.random.shuffle(after_top3)

        self.result['skill'] = sorted_words[:3] + after_top3[:num - 3]

    # Extract job duties
    def regular_duty(self):
        res = []
        jdStr = self.jdStr
        pos = list(self.START_DUTY.finditer(jdStr))
        if len(pos) > 0:
            linelist = [
                re.sub("[\s ]+", " ", line)
                for line in jdStr[pos[-1].span()[1]:].split("\n")
                if len(line) > 2
            ]
            for i in xrange(len(linelist)):
                line = linelist[i]
                if self.START_DUTY.search(
                        line) or self.lineindex[line] == 1 or (
                            re.search(u".年来|谢谢|请在|公司介绍|举报|收藏|岗位职责", line)
                            and not re.search(u"了解", line)):
                    continue
                if re.search(u"要求[::\s]?|岗位要求", line) and len(line) < 6: break
                if re.match(u"\d{1,2}|\u25cf|[\uff0d(\(\-\+]|[a-z][\.、\s]",
                            line.strip()) or self.DUTY.search(
                                line) or self.clf.predict(line) == 'duty':
                    res.append(line.strip())
                elif i < len(linelist) - 1 and self.clf.predict(
                        linelist[i + 1]) == 'duty':
                    res.append(line)
                else:
                    break
        if not res:
            for line in self.linelist:
                if re.search(u"粉丝团", line) and len(line) < 12: continue
                if self.DUTY.search(line) and self.clf.predict(line) == "duty":
                    if self.lineindex[line] != 1:
                        res.append(line)

        self.result["duty"] = "\n".join(res)
        for line in res:
            self.lineindex[line] = 1

        return res

    # Extract job requirements
    def regular_demand(self):
        res = []
        jdStr = self.jdStr
        pos = list(self.START_DEMAND.finditer(jdStr))
        if len(pos) > 0:
            tmppos = pos[-1].span()[0]
            if re.search(u"具有|具备", jdStr[tmppos - 5:tmppos + 5]) or re.search(
                    u"证书|证", jdStr[tmppos:tmppos + 8]):
                pos.pop()
            if pos:
                linelist = [
                    re.sub("[\s ]+", " ", line)
                    for line in jdStr[pos[-1].span()[1]:].split("\n")
                    if len(line) > 2
                ]
            else:
                linelist = []
            for i in xrange(len(linelist)):
                line = linelist[i]
                if self.START_DEMAND.search(linelist[i]) or re.search(
                        u"谢谢|请在|公司介绍|举报|收藏|\d+k?元|加分", line):
                    continue
                if re.match(u"\d{1,2}|\u25cf|[\uff0d(\(\-\+]|[a-z][\.、\s]",
                            line) or self.DEMAND.search(
                                line) or self.clf.predict(line) == 'demand':
                    res.append(line)
                elif i < len(linelist) - 1 and self.clf.predict(
                        linelist[i + 1]) == 'demand':
                    res.append(line)
                else:
                    break
        if not res:
            for line in self.linelist:
                if self.lineindex[line] == 1 or len(line.split()) > 6:
                    continue  # skip lines that have already been consumed elsewhere
                if self.clf.predict(line) == 'demand' or self.DEMAND.search(
                        line):
                    res.append(line.strip())

        self.result['demand'] = '\n'.join(res)
        for line in res:
            self.lineindex[line] = 1

        return res

    # Extract the advertised job title
    def regular_jobname(self):
        res = set()
        jdStr = self.jdStr
        findpos = re.search(u"(招聘岗位|招聘职位|职位名称|岗位名称|岗位[一二三四五六七八九])[:、:\s ]",
                            jdStr)
        #        if not findpos:
        #            findpos = re.search(u"(职位类别|职位职能)[::\s ]",jdStr)

        if findpos:
            pos = findpos.span()[1]
            linelist = jdStr[pos:].split("\n")
            for line in linelist:
                if len(line) < 2: continue
                if len(line) >= 2 and len(line) < 20:
                    if re.search(u"职位描述|查看|地址|工作|分享|举报|下一条|时间|福利|待遇|周末|双休",
                                 line):
                        continue
                    res.add(re.sub(u"聘请|高薪诚聘|诚聘|[,。、\d!]+", "", line.strip()))
                    break

        # If no explicit job title matched, segment each line and look words up in the job-title dictionary
        if not res:
            for line in self.linelist:
                if re.search(u"招聘|高薪|诚聘", line): continue
                if len(line) < 6 and not re.search(
                        u'岗位|岗位内容|工作内容|职责|任职|资格',
                        line) and self.clf.predict(line) == 'job_name':
                    res.add(line)
                    break
                findPos = self.JOBNAME.search(line)
                if findPos and len(findPos.group()) < 20 and not re.match(
                        u'\d', findPos.group()):
                    jobname = findPos.group()
                    res.add(re.sub(u"聘请|高薪诚聘|诚聘|急.|[,。、!]+", "", jobname))
                    break
                #   res.add(re.sub(u"\(.+\)|(.+)|【.+】|[,。、\s\d]+|聘请|高薪诚聘|诚聘|急招|","",line.strip()))

        if not res:
            for line in self.linelist:
                for word in jieba.cut(line.lower()):
                    if word in self.jobdic:
                        res.add(word)
                        self.result["job_name"] = " / ".join(res)
                        return res
        if not res:
            tag = re.search(u"实习生|兼职", self.jdStr)
            if tag:
                res.add(tag.group())
        self.result["job_name"] = strQ2B(" / ".join(res)).lower()
        return res

    # Extract pay
    def regular_pay(self):
        pay = ""
        lagoup = re.search(
            u"(\d+[kK][-——]\d+[kK])|(\d{3,5}-\d{3,5}元?/[月日天])|(\d{3,5}-\d{3,5}元)|((\d+[-~]\d+)万.[年月])|底薪\d+(-\d+)?元?|\d{3,5}元(左右|以上)?|年薪\d+万?元(左右|以上)?",
            self.jdStr)  # for lagou.com postings, which carry no "待遇"-style keywords
        if lagoup:
            pay = lagoup.group()
            self.result["pay"] = pay.replace(u'k', '000').replace(u'K', '000')
            return pay

        findpay = self.PAY.search(self.jdStr)
        if findpay:
            pos = findpay.span()[1]

            jdstr = self.jdStr[max(0, pos - 5):min(pos + 10, len(self.jdStr))]
            if re.search(u"面议", jdstr):
                pay = u"面议"
            else:
                findpay = re.findall(u"\d{3,7}", jdstr)
                pay = "-".join(findpay)
        self.result["pay"] = pay.replace(u'k', '000').replace(u'K', '000')
        return pay

    # Extract pay and benefits
    def regular_benefits(self):
        res = []
        jdStr = self.jdStr
        findpos = list(re.finditer(u"薪酬福利[::\s]|(福利|待遇)\s?[::]", jdStr))
        if not findpos:
            findpos = list(
                re.finditer(u"(晋升制度|工作环境|职位诱惑|你会获得什么)\s?[?\?::]", jdStr))
        if findpos:
            pos = findpos[-1].span()[1]
            linelist = jdStr[pos:].split('\n')
            for line in linelist:
                print 'benefits', line
                if len(line.strip()) < 3: continue
                if re.match(ur"[((]?\d+", line) or self.BENEFIT.search(line):
                    res.append(line.strip())
                    self.lineindex[line.strip()] = 1
                else:
                    break

        if not res:
            for line in jdStr.split():
                if len(line) > 1 and re.search(
                        u"带薪|双休|股票期权|五险一金|发展空间|福利|诱惑|休假|薪酬|补助|年假|弹性工作", line):
                    if re.search(u"福利|待遇|诱惑", line) and len(line.strip()) < 6:
                        continue
                    res.append(line.strip())

        if len(res) == 1 and re.search(
                u"险一金", res[0]) and not re.search(u"[,、]", res[0]):
            res[0] = self.clean_line(' '.join(jieba.cut(res[0])))

        self.result["benefits"] = "\n".join(res)
        return res
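
A minimal driver sketch for the extractor above (the class name and constructor, which load the classifier, word2vec dict, and lexicons, live earlier in the file, so `JDExtractor` here is a stand-in): `preprocess` must run first, `regular_jobname` must precede `regular_skill` (which reads `result['job_name']`), and `regular_duty`/`regular_demand` mark consumed lines in `lineindex` so they are not reported twice.

extractor = JDExtractor()                 # stand-in name for the class above
with open('jd_sample.txt') as f:          # hypothetical input file
    extractor.preprocess(f.read())        # must run first: fills linelist / jdStr
extractor.regular_jobname()               # fills result['job_name'] ...
extractor.regular_skill()                 # ... which regular_skill reads
extractor.regular_duty()                  # marks consumed lines in lineindex ...
extractor.regular_demand()                # ... so demand lines are not repeated
for key in extractor.result:
    print key, ':', extractor.result[key]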
示例#49
0
# `acc`, `new_grocery` (a loaded Grocery model) and the margin threshold `cha`
# are defined earlier in the original script; this excerpt starts mid-file.
for i in acc:
    print i, acc[i]
fin = open('valid-sent.txt')

DICT_res_stat = dict()
mapping_dict = {'1': 'province', '2': 'city', '3': 'address', '4': 'town', '5': 'name', '6': 'shouji', '7': 'dianhua', '8': 'number', '9': 'leibie'}

total_corr = 0
total_count = 0
with open('result.txt', 'w') as o:
    for line in fin:
        line = line.strip()
        line_label, text = line.split(',', 1)
        c = new_grocery.predict(text)
        # d maps each candidate label to its decision weight; sort descending
        d = c.dec_values
        s = sorted(d.items(), key=lambda x: x[1], reverse=True)
        # if the runner-up label is "leibie" ('9') and trails the winner by less
        # than the threshold `cha`, relabel the prediction as leibie
        if s[1][0] == '9' and s[0][1] - s[1][1] < cha:
            c = '9'
        #print mapping_dict[str(c)], text

        print >> o, '%s, %s, %s' % (c, mapping_dict[str(c)], line)
        if line_label not in DICT_res_stat:
            DICT_res_stat[line_label] = {'error': [], 'correct': []}
        total_count += 1
        if str(c) == line_label:
            DICT_res_stat[line_label]['correct'].append(line)
            total_corr += 1
        else:
            DICT_res_stat[line_label]['error'].append(line)
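
The loop above fills DICT_res_stat and the totals but never summarizes them; a small follow-up sketch (using only the names defined above) could report per-label and overall accuracy:

# Summarize the per-label correct/error lists collected above.
for label in sorted(DICT_res_stat):
    n_corr = len(DICT_res_stat[label]['correct'])
    n_all = n_corr + len(DICT_res_stat[label]['error'])
    print '%s (%s): %d/%d' % (label, mapping_dict.get(label, '?'), n_corr, n_all)
if total_count:
    print 'overall accuracy: %.4f' % (float(total_corr) / total_count)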
示例#50
0
# -*- coding: utf-8 -*-

# Test file
# Author: Alex
# Created Time: Friday, 2017-06-02 11:15:12
from tgrocery import Grocery

train_src = [('education', '名师指导托福语法技巧:名词的复数形式'),
             ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
             ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
             ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')]

grocery = Grocery('sample')
grocery.train(train_src)
grocery.save()

print("*" * 40)
new_grocery = Grocery('sample')
new_grocery.load()
res = new_grocery.test(train_src)
print(type(res))
print(res)
print(res.accuracy_labels)
print(res.show_result())

print("*" * 40)
res = new_grocery.predict("考生必读:新托福写作考试评分标准")
print(res)
print(res.dec_values)
示例#51
0
from tgrocery import Grocery
data_dir = "../data/"
src_fn = data_dir + 'train_set_100.txt'
grocery = Grocery('backout_reason')
grocery.train(src_fn)

tp_cnt = {}
f = open(data_dir + 'type.txt')
for line in f:
	tps = line.split()
	tp_cnt[tps[1]] = 0

f.close()

f = open(data_dir + 'bcmtmoz.merge')
for line in f:
	tp = grocery.predict(line)
	tp_cnt[str(tp)] += 1

print tp_cnt
示例#52
0
		tdic['id'].append(_id)
		tdic['type'].append(_type)
		tdic['contents'].append(contents)
	i += 1
	
#train = pd.read_csv( train_file, header = 0, delimiter = "\t", quoting = 3 )
#test = pd.read_csv( test_file, header = 1, delimiter = "\t", quoting = 3 )
# `dic`/`tdic` are built above; DataFrame comes from pandas in the original file
train = DataFrame(dic)
test = DataFrame(tdic)
#
# 'classfynews_instance' is the model save path
grocery = Grocery('classfynews_instance')

# NOTE: stock tgrocery trains on (label, text) pairs, e.g. zip(train['type'], train['contents']);
# this snippet appears to rely on a fork that accepts column lists and batch predict().
train_in = [train['contents'], train['type']]
grocery.train(train_in)
print grocery.get_load_status()
#grocery.save()

copy_grocery = Grocery('classfynews_instance')
copy_grocery.load()
#copy_grocery = grocery
test_in = [test['contents'],test['type']]
# Input like: ['我是中国人', '台北*****']
# Output: [11, 12]
test_result = copy_grocery.predict(test['contents'])
print test_result.predicted_y
#test_result = copy_grocery.test(test_in)
#print test_result.show_result()


示例#53
0
# -*- coding: utf-8 -*-

# Test file
# Author: Alex
# Created Time: Friday, 2017-06-02 11:15:12
from tgrocery import Grocery

grocery = Grocery('sample')
grocery.train('train_data.txt', delimiter=';')
grocery.save()

print("*" * 40)
new_grocery = Grocery('sample')
new_grocery.load()
print(new_grocery.test('train_data.txt', delimiter=';'))

print("*" * 40)
print(new_grocery.predict("考生必读:新托福写作考试评分标准"))
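
For reference, the train_data.txt assumed by this example holds one label;text pair per line (matching delimiter=';'), e.g.:

education;名师指导托福语法技巧:名词的复数形式
sports;四川丹棱举行全国长距登山挑战赛 近万人参与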
示例#54
0
from tgrocery import Grocery

grocery = Grocery('sample')
train_src = [
    ('education', '名师指导托福语法技巧:名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')
]
grocery.train(train_src)
# Training can also read from a file (tab-delimited by default; custom delimiters are supported)
#grocery.train('train_ch.txt')
# Save the model
grocery.save()
# Load the model (use the same name it was saved under)
new_grocery = Grocery('sample')
new_grocery.load()
# Predict
new_grocery.predict('考生必读:新托福写作考试评分标准')
#education

# Test
test_src = [
    ('education', '福建春季公务员考试报名18日截止 2月6日考试'),
    ('sports', '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'),
]
new_grocery.test(test_src)
# Prints the test accuracy
#0.5

# Files are supported here as well
#new_grocery.test('test_ch.txt')
# Custom tokenizer (must be a function); see the sketch below
#custom_grocery = Grocery('custom', custom_tokenize=list)
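
A sketch of plugging in a custom tokenizer, assuming jieba is installed; jieba_tokenize and 'custom_jieba' are illustrative names, not part of the original example:

from tgrocery import Grocery
import jieba

def jieba_tokenize(text):
    # custom_tokenize must be a function: one text string in, a token list out
    return jieba.lcut(text)

custom_grocery = Grocery('custom_jieba', custom_tokenize=jieba_tokenize)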
示例#55
0
# Imports assumed by this excerpt (train_src is built earlier in the original script)
import time
import pandas as pd
from tgrocery import Grocery

#test file
#print "test's file len",len(test_file)
# test_src = test_file["String"].tolist()
# print "test_src",test_src[0]
# print "test's len",len(test_src)

#tgrocery classify
grocery = Grocery('sample')
grocery.train(train_src)
grocery.save()
new_grocery = Grocery('sample')
new_grocery.load()


#predict
print new_grocery.predict("当你坐上飞机在上面俯视它的时候")
#load the submit standard file
submit = pd.read_csv("sample_submit.csv",names=["id","value"])
print "the submit file len is ",len(submit)

t = []
f = open("test.txt")
for line in f.readlines():
    #print line.split("\t")[1]  #content
    t.append(new_grocery.predict(line.split("\t")[1]))
submit["value"] = t
print submit


submit.to_csv("submit.csv",sep=",")
t1 = time.time()