예제 #1
0
파일: classify.py 프로젝트: Honlan/CleverTL
    def __init__(self, keyword):
        print '进行新闻分类'
        (db, cursor) = connectdb()
        cursor.execute("update task set status=1 where keyword=%s", [keyword])
        cursor.execute("select id, title from news where keyword=%s",
                       [keyword])
        news = cursor.fetchall()
        new_grocery = Grocery('static/paris')
        new_grocery.load()

        for item in news:
            tag = new_grocery.predict(item['title'])
            if tag == '新闻背景':
                tag = 1
            elif tag == '事实陈述':
                tag = 2
            elif tag == '事件演化':
                tag = 3
            elif tag == '各方态度':
                tag = 4
            elif tag == '直接关联':
                tag = 6
            elif tag == '暂无关联':
                tag = 7
            cursor.execute("update news set tag=%s where id=%s",
                           [tag, item['id']])
        closedb(db, cursor)
        return
예제 #2
0
def get_data(ids, b_date, end_data, log, stop_word):
    """Load comments for *ids* between the two dates and run NLP on each row.

    Returns a list of dicts, one per comment, holding the processed content
    (JSON), the original sentence, and the predicted level/tag.
    """
    b_date = b_date.strftime('%Y-%m-%d')
    end_data = end_data.strftime('%Y-%m-%d')
    # Data source selection (DB loader; Excel loaders were alternatives).
    df = load_data(ids, b_date, end_data)
    df['RateDate'] = pd.to_datetime(df['RateDate'])
    log.info('Have %d comments need to process' % len(df))
    # Load the pre-trained classification model.
    classifier = Grocery('sample2')
    classifier.load()
    results = []
    for idx in range(len(df)):
        row = df.iloc[idx]
        # NLP with stop-word filtering: returns tokens, level and tag.
        content_sw, level, tag = nlp_process_with_sw(row, classifier,
                                                     stop_word)
        results.append({
            'RateContent': json.dumps(content_sw, ensure_ascii=False),
            'RateDate': row['RateDate'],
            'TreasureID': row['TreasureID'],
            'Level': level,
            'Tag': tag,
            'Sentence': row['RateContent'],
        })
    return results
예제 #3
0
def predict_test(model_path, data):
    """Predict a tag for each ';'-separated sentence in *data*.

    Loads the tgrocery model under BASE_DIR/learn/*model_path*, segments each
    sentence with jieba, removes stop words and classifies the remainder.
    Returns {'IsErr': bool, 'ErrDesc': unicode, 'data': [...]}.
    """
    # Load the trained model; any failure is reported, not raised.
    try:
        model_path = os.path.join(BASE_DIR, 'learn', model_path)
        new_grocery = Grocery(model_path.encode('utf-8'))
        new_grocery.load()
    except Exception:
        return {'IsErr': True, 'ErrDesc': u'学习模型加载不成功,请检查路径'}

    # Normalize the input: drop the empty tail left by a trailing ';'.
    sentences = data.split(';')
    if sentences[-1] == '':
        sentences.pop()
    if len(sentences) == 0:
        return {'IsErr': True, 'ErrDesc': u'输入的句子结构有错误或没有数据'}

    # Stop words as a set: O(1) membership instead of a list scan per token.
    stop_words = set(read_lines(os.path.join(BASE_DIR, 'learn', 's_w.txt')))
    result = []
    for s in sentences:
        # Segment, filter stop words and join with single spaces (the
        # original built the string with += and a trailing strip()).
        kept = ' '.join(w for w in jieba.cut(s)
                        if w not in stop_words).strip()
        result.append({
            'tag': str(new_grocery.predict(kept.encode('utf-8'))),
            'sentence': s,
        })
    return {'IsErr': False, 'ErrDesc': u'成功', 'data': result}
예제 #4
0
def test(test_path):
    new_grocery = Grocery('cv_' + str(fold) +
                          '_model')  #, custom_tokenize=segment)
    new_grocery.load()
    test_src = []
    with open(test_path) as f:
        for line in f:
            label, text = line.strip().split("|text|")
            label = yiji_label[classify_dict[label]]
            test_src.append((label, text))
    test_result = new_grocery.test(test_src)
    #print test_result
    #print test_result.accuracy_overall
    #accs = test_result.accuracy_labels
    recalls = test_result.recall_labels
    #print "Recall for each class: ", recalls
    predictlabels = test_result.predicted_y
    truelabels = test_result.true_y
    acc = accuracy_score(truelabels, predictlabels)
    macro_precision, macro_recall, macro_fscore, _ = precision_recall_fscore_support(
        truelabels, predictlabels, average='macro')
    print "Accuracy: ", acc, "Macro-average Precision:", macro_precision, "Macro-average Recall:", macro_recall, "Macro-average Fscore:", macro_fscore
    labellist = [
        'safe_and_stable', 'industrial_information', 'politics',
        'culture_health', 'social_livelihood', 'economic_and_financial'
    ]
    precision, recall, fscore, _ = precision_recall_fscore_support(
        truelabels, predictlabels, average=None, labels=labellist)
    precisions = dict()
    recalls = dict()
    for idx, p in enumerate(precision):
        precisions[labellist[idx]] = p
    for idx, c in enumerate(recall):
        recalls[labellist[idx]] = c
예제 #5
0
파일: classify.py 프로젝트: Honlan/CleverTL
	def __init__(self, keyword):
		"""Classify every news row for *keyword* and write numeric tags back.

		Marks the task row as in-progress (status=1), predicts a category
		for each news title with the 'static/paris' model, and stores the
		mapped numeric tag; unrecognized labels are stored as predicted.
		"""
		print '进行新闻分类'
		(db, cursor) = connectdb()
		# Flag the task as being processed.
		cursor.execute("update task set status=1 where keyword=%s", [keyword])
		cursor.execute("select id, title from news where keyword=%s",[keyword])
		news = cursor.fetchall()
		new_grocery = Grocery('static/paris')
		new_grocery.load()

		for item in news:
			tag = new_grocery.predict(item['title'])
			# Map predicted label to its numeric code (code 5 is unused here).
			if tag == '新闻背景':
				tag = 1
			elif tag == '事实陈述':
				tag = 2
			elif tag == '事件演化':
				tag = 3
			elif tag == '各方态度':
				tag = 4
			elif tag == '直接关联':
				tag = 6
			elif tag == '暂无关联':
				tag = 7
			cursor.execute("update news set tag=%s where id=%s", [tag, item['id']])
		closedb(db, cursor)
		return
	def labelmaker(self):
		"""Return [best_label, confidence] for self.shorttext.

		Uses the '11c_20k_20171226' model and picks the label with the
		highest decision value.
		"""
		clf = Grocery('11c_20k_20171226')
		clf.load()
		scores = clf.predict(self.shorttext).dec_values.items()
		# Sort by confidence, highest first (key-based sort is equivalent
		# to the original cmp-based sort).
		best = sorted(scores, key=lambda kv: kv[1], reverse=True)[0]
		return [best[0], best[1]]
예제 #7
0
	def GET(self, name):
		"""Classify *name* (a headline) with the 'sample' model and return
		the prediction."""
		headline = name.encode('utf-8')
		classifier = Grocery('sample')
		classifier.load()
		return classifier.predict(headline)
예제 #8
0
파일: Classify.py 프로젝트: TimePi/Python
 def __train__model__():
     """Train the 'Classify' grocery model from the configured Excel file
     and cache the trained instance on the Classify class."""
     frame = pd.read_excel(Classify.__FILE_PATH__)
     pairs = frame[[u'类型', u'释义']]
     samples = [(row[0], row[1]) for row in pairs.values]
     grocery = Grocery('Classify')
     grocery.train(samples)
     grocery.save()
     Classify.__MODEL__ = grocery
예제 #9
0
def phgrocery(text):
    """Classify *text* with the 'model_redian_5' model and return the
    predicted label as an int."""
    classifier = Grocery('model_redian_5')
    classifier.load()
    return int(classifier.predict(text).predicted_y)
예제 #10
0
def tGrocery():
    outFile = open('testResult.tmp', 'w')
    [trainingSet, benchmark] = pickle.load(open('SampleSeg.pk'))
    testingSet = []
    correctLabel = []
    for i in xrange(len(benchmark)):
        print '%d out of %d' % (i, len(benchmark))
        testingSet.append(benchmark[i][1])
        correctLabel.append(benchmark[i][0]) 
    grocery = Grocery('test')
    grocery.train(trainingSet)
    grocery.save()
    # load
    new_grocery = Grocery('test')
    new_grocery.load()
    Prediction = []
    for i in xrange(len(testingSet)):
        print '%d out of %d' % (i, len(testingSet))
        prediction = new_grocery.predict(testingSet[i])
        Prediction.append(prediction)
        temp = correctLabel[i] + '<-->' + prediction + '  /x01' + testingSet[i] + '\n'
        outFile.write(temp)
    correct = 0
    for i in xrange(len(Prediction)):
        print Prediction[i], correctLabel[i],
        if Prediction[i] == correctLabel[i]:
            correct += 1
            print 'Correct'
        else:
            print 'False'
    print 'Correct Count:', correct
    print 'Accuracy: %f' % (1.0 * correct / len(Prediction))
예제 #11
0
class GroceryModel(object):
    def __init__(self):
        self.grocery = Grocery('TextClassify')
    
    def train(self,train_file):
        f = open(train_file,'r')
        line = f.readline().decode('utf8')
        dataset = []
        while line:
            tmp = line.split('\t')
            dataset.append((tmp[0],''.join(tmp[1:])))
            line = f.readline().decode('utf8')
        f.close()
        self.grocery.train(dataset)
        self.grocery.save()
    
    def load_model(self):
        self.grocery.load()
    
    def test(self,test_src):
        self.load_model()
        f = open(test_src,'r')
        line = f.readline().decode('utf8')
        dataset = []
        while line:
            tmp = line.split('\t')
            dataset.append((tmp[0],''.join(tmp[1:])))
            line = f.readline().decode('utf8')
        f.close()
        result = self.grocery.test(dataset)
        print result
    
    def predict(self,text):
        print self.grocery.predict(text)
예제 #12
0
    def predict_phrasing(self, text=u'曾被年轻人嫌弃,如今却媲美Zara'):
        """Score *text* with the saved model (self.model_name) and return the
        decision value for the u'postive' label.

        The key is spelled 'postive' because that is the label name used by
        the trained model.
        """
        clf = Grocery(self.model_name)
        clf.load()
        return clf.predict(text).dec_values[u'postive']
예제 #13
0
    def __init__(self):
        """Load vocabularies, the sentence classifier and the regex rule set
        used to parse Chinese job descriptions (JDs)."""
        self.degreedic = set( line.strip() for line in codecs.open('./data/degrees.txt','rb','utf-8')) # degree vocabulary
        self.majordic =set( line.strip() for line in codecs.open('./data/majordic.txt','rb','utf-8')) # major vocabulary
        self.citydic = set( line.strip() for line in codecs.open("./data/citydic.txt",'rb','utf-8'))   # city vocabulary
        self.firmnames =set( line.strip() for line in codecs.open('./data/firm.txt','rb','utf-8'))    # company-abbreviation vocabulary
        self.jobdic = set(line.strip() for line in codecs.open('./data/jobposition.txt','rb','utf-8') ) # job-title vocabulary
        self.skills = set( line.strip() for line in codecs.open('./data/skills.txt','rb','utf-8'))
#        self.wordlisttf = pickle.load(open('./data/wordlist.pkl'))  # 2000 most frequent words
        # self.w2vdict = json.load(open('./data/word2vec_50.json')) # word2vec vectors for those 2000 words
        self.clf = Grocery("jdclf")        # sentence classifier: demand / duty / other
        self.clf.load()

        # Candidate-attribute patterns (sex, age, degree, major, experience).
        self.SEX = re.compile(u"性别不限|性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.DEGREE = re.compile(u"(全日制)?(初中|高中|中专|大专|专科|大学专科|中职|本科|大学本科|硕士|研究生|博士|博士后)(.?以上)?")
        self.MAJOR = re.compile(u"\S+(相关专业|专业优先|及其.专业|[类等]专业[优先]?)")
        self.EXP = re.compile(u"工作经验:|工作经[历验]|工作年限|年.{0,4}经[历验]|经[历验].{1,6}年")
        self.PUB_TIME = re.compile(u"(\d+)(天前发布)")

        # Company-name recognition; NOT_INC filters false positives.
        self.INCNAME = re.compile(u"\S+(有限公司|酒店|银行|集团|研究中心|研究所|学校|旅行社|分?公司|研发中心|技术部|事.部|招聘)")
        self.NOT_INC = re.compile(u"职位|描述|收藏|推荐|地址|邮箱|主页|介绍|欢迎|加入|要求|简介|险一金|奖金|包吃住|社区|厂房|人员|职责")
        self.INCTAG = re.compile(u"大公司|五百强|全球500强|小公司|成长型公司|创业公司|私有经济|集体经济|集团|外企|已上市|稳定性高|平均年龄\d+岁|妹纸多|学历高|福利待遇好|晋升机会大|民营公司|民营企业|互联网|创业型|国企|央企")

        # Job-title recognition (backslashes continue a single pattern string).
        self.JOBNAME = re.compile(u'\S*(研发工程师|工程师|经理|助理|顾问|前台|秘书|主管|研究员|实习生|操作员|专员|教学人员|技术人员|管理员|业务员|公关|程序员|教师|老师|培训生|\
                                  文员|研究员|策划|主任|总监|设计师|分析师|架构师|摄影师|编辑|BD|游戏UI|Android(开发)?|PHP(开发)?|Python(开发)?|.?(急招|急聘|初级|中级|高级|方向).?[\s)】\)])|\
                                  |行政人事|网店设计|客服|会计|电话销售|外贸跟单|web前端|游戏UI|后.开发|产品运营|商业数据分析')

        # Requirement ("demand") section: heading detector + line classifier.
        self.START_DEMAND = re.compile(u"(岗位要求|应聘条件|任职要求|岗位资格|任职资格|岗位条件|工作要求|任职条件|人员条件|职位.求|职位条件|职位描述|岗位资格|职位资格|具备条件)[::\s]\
                                       |如果你.{0,10}[::\s]|我们希望你.{0,12}[::\s]|(要求|条件)[::\s]|你需要?具备什么.+[?\?::\s]|任职资格[::\s]")
        self.DEMAND = re.compile(u"熟悉|熟练|具有|善于|懂得|掌握|具备|能够|优先|不少于|不超过|至少|团队.作|良好的|工作经验|开发经验|实习经历|能力强|富有|以上学历|经验|喜欢|\
                                 较强的.{2,8}能力|相关专业|相关学历|者优先|精通|了解|及以上|技术全面|.强的责任心|[能有]独立|英文流利")

        # Responsibility ("duty") section: line classifier + heading detector.
        self.DUTY = re.compile(u"跟进|协助|负责|配合|其他工作|领导交办的|对.+提供|审核|参与|提出|跟踪|报告|为.+提出|日常.+工作|指导|对.+进行|为.+提供|跟进|拓展|运营|用户|客户|协调|拟写|通过|协同|完成|沟通|需求|秘书.{2,5}翻译")
        self.START_DUTY = re.compile(u"(岗位职责|岗位描述|职位描述|职责描述|任职描述|职位职责|工作职责|工作职能|职位职能|工作内容|实习内容|职位内容)[::\s]|做这样的事[::\s]|职责.{0,5}[::\s]")
        # Pay and benefit keywords.
        self.PAY = re.compile(u"薪酬|待遇|月薪|薪资|年薪|底薪|\d+k|\d+万|\d+元|工资|报酬|薪水|福利")
        self.BENEFIT = re.compile(u"周休|补助|补贴|假日|餐补|提成|交通补助|食宿|加班工资|期权|年假|领导|扁平化|管理|氛围|空间|休假|月假|带薪|全休|晋升|培训|舒适的|旅游|奖励|过节费|五险一金|奖金|\
        |弹性工作|氛围|成长空间|实训|培训|高薪|前景|旅游|活动|分红")

        # Splitting multi-position JDs and cleaning numbering/punctuation noise.
        self.SPLIT_JD = re.compile(u"岗位[【(]?[一二三四五六七八九][】)][::\s]|(^招聘岗位\S+|岗位\d|岗位[一二三四五六])[::\s]")
        self.CLEAR_NUM = re.compile(u"^\d[\.: :。、]|^[\((【]?\d[\))】\.]")
        self.CLEAR_COLO = re.compile(u"^[\s\.。)(【】,,]|[。;,\.;,]$|^\d[\.]")
        self.SKILL = re.compile(u"精通|了解|熟练|熟悉|掌握|懂得|优先|具备|具有|者优先|擅长|善于|较强的.{2,6}能力|良好的|有.+经验|能力|极强的")

        # Custom dictionaries improve jieba segmentation on JD text.
        jieba.load_userdict('./data/majordic.txt')
        jieba.load_userdict('./data/skills.txt')
        jieba.load_userdict('./data/firm.txt')
        jieba.load_userdict('./data/degrees.txt')
        jieba.load_userdict('./data/benefits.txt')


        self.jdStr = ""                    # raw JD text currently being parsed
        self.linelist = []                 # JD split into cleaned lines
        self.lineindex = defaultdict(int)  # line -> index bookkeeping
        self.result = OrderedDict()
예제 #14
0
 def test_main(self):
     """End-to-end check: train, save, reload from disk, then predict.

     FIX: the original asserted on the freshly *trained* instance, so the
     load path of the saved model was never actually verified; assert on
     the reloaded instance instead.
     """
     grocery = Grocery(self.grocery_name)
     grocery.train(self.train_src)
     grocery.save()
     new_grocery = Grocery('test')
     new_grocery.load()
     assert new_grocery.get_load_status()
     assert new_grocery.predict('考生必读:新托福写作考试评分标准') == 'education'
     # cleanup: remove the saved model directory
     if self.grocery_name and os.path.exists(self.grocery_name):
         shutil.rmtree(self.grocery_name)
예제 #15
0
파일: grocery.py 프로젝트: SwoJa/ruman
def test_grocery():
    grocery = Grocery('model_redian')
    grocery.train('trdata_4.txt')
    grocery.save()
    new_grocery = Grocery('model_redian')
    new_grocery.load()
    test_result = new_grocery.test('tedata_4.txt')
    print test_result.accuracy_labels
    print test_result.recall_labels
    test_result.show_result()
예제 #16
0
class AutoGrocery(object):
    """Lazy wrapper around a tgrocery model stored under models/model_data.

    The model is loaded on first prediction; if no saved model exists it is
    trained from the data supplied at construction time and persisted.
    """

    def __init__(self, name, train_data):
        self._train_data = train_data
        self._grocery = Grocery(project_dir + '/models/model_data/' + name)

    def train(self):
        """Train the underlying model on the stored training data."""
        self._grocery.train(self._train_data)

    def save(self):
        """Persist the trained model to disk."""
        self._grocery.save()

    def load(self):
        """Load the persisted model from disk."""
        self._grocery.load()

    def predicate(self, src):
        """Classify *src*; returns (label, decision_value_for_label)."""
        if not self._grocery.get_load_status():
            try:
                self.load()
            except ValueError:
                # No saved model on disk yet — train and persist one.
                self.train()
                self.save()
        outcome = self._grocery.predict(src)
        best = outcome.predicted_y
        return best, outcome.dec_values[best]
예제 #17
0
    def __init__(self):
        """Load the jdclf sentence classifier and compile the regex rule set
        used to parse Chinese job descriptions."""
        self.data = []
        self.clf = Grocery("jdclf")  # sentence classifier (demand/duty/other)
        self.clf.load()

        # Candidate-attribute patterns.
        self.SEX = re.compile(u"性别不限|性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.DEGREE = re.compile(
            u"(全日制)?(初中|高中|中专|大专|专科|大学专科|中职|本科|大学本科|硕士|研究生|博士|博士后)(.?以上)?")
        self.MAJOR = re.compile(u"\S+(相关专业|专业优先|及其.专业|[类等]专业[优先]?)")
        self.EXP = re.compile(u"工作经验:|工作经[历验]|工作年限|年.{0,4}经[历验]|经[历验].{1,6}年")
        self.PUB_TIME = re.compile(u"(\d+)(天前发布)")

        # Company-name recognition; NOT_INC filters false positives.
        self.INCNAME = re.compile(
            u"\S+(有限公司|酒店|银行|集团|研究中心|研究所|学校|旅行社|分?公司|研发中心|技术部|事.部|招聘)")
        self.NOT_INC = re.compile(
            u"职位|描述|收藏|推荐|地址|邮箱|主页|介绍|欢迎|加入|要求|简介|险一金|奖金|包吃住|社区|厂房|人员|职责")
        self.INCTAG = re.compile(
            u"大公司|五百强|全球500强|小公司|成长型公司|创业公司|私有经济|集体经济|集团|外企|已上市|稳定性高|平均年龄\d岁|妹纸多|学历高|福利待遇好|晋升机会大|民营公司|民营企业\
                                 |互联网|创业型|国企|央企")

        # Job-title recognition (backslashes continue one pattern string).
        self.JOBNAME = re.compile(
            u'\S*(研发工程师|工程师|经理|助理|顾问|前台|秘书|主管|研究员|实习生|操作员|专员|教学人员|技术人员|管理员|业务员|公关|程序员|教师|老师|培训生|\
                                  文员|研究员|策划|主任|总监|设计师|分析师|架构师|摄影师|编辑|BD|游戏UI|Android(开发)?|PHP(开发)?|Python(开发)?|.?(急招|急聘|初级|中级|高级|方向).?[\s)】\)])|\
                                  |行政人事|网店设计|客服|会计|电话销售|外贸跟单|web前端|游戏UI|后.开发|产品运营|商业数据分析'
        )

        # Requirement ("demand") heading detector and line classifier.
        self.START_DEMAND = re.compile(
            u"(岗位要求|应聘条件|任职要求|岗位资格|任职资格|岗位条件|工作要求|任职条件|人员条件|职位.求|职位条件|职位描述|岗位资格|职位资格|具备条件)[::\s]\
                                       |如果你.{0,10}[::\s]|我们希望你.{0,12}[::\s]|(要求|条件)[::\s]|你需要?具备什么.+[?\?::\s]|任职资格[::\s]"
        )

        self.DEMAND = re.compile(
            u"熟悉|熟练|具有|善于|懂得|掌握|具备|能够|优先|不少于|不超过|至少|团队.作|良好的|工作经验|开发经验|实习经历|能力强|富有|以上学历|经验|喜欢|\
                                 较强的.{2,8}能力|相关专业|相关学历|者优先|精通|了解|及以上|技术全面|.强的责任心|[能有]独立|英文流利"
        )

        # Responsibility ("duty") line classifier and heading detector.
        self.DUTY = re.compile(
            u"跟进|协助|负责|配合|其他工作|领导交办的|对.+提供|审核|参与|提出|跟踪|报告|为.+提出|日常.+工作|指导|跟进|拓展|运营|用户|客户|协调|拟写|通过|协同\
                               |完成|沟通|需求|秘书.{2,5}翻译")

        self.START_DUTY = re.compile(
            u"(岗位职责|岗位描述|职位描述|职责描述|任职描述|职位职责|工作职责|工作职能|职位职能|工作内容|实习内容|职位内容)[::\s]|做这样的事[::\s]|职责.{0,5}[::\s]"
        )

        # Pay and benefit keywords.
        self.PAY = re.compile(u"薪酬|待遇|月薪|薪资|年薪|底薪|\d+k|\d+万|\d+元|工资|报酬|薪水|福利")

        self.BENEFIT = re.compile(
            u"周休|补助|补贴|假日|餐补|提成|交通补助|食宿|加班工资|期权|年假|领导|扁平化|管理|氛围|空间|休假|月假|带薪|全休|晋升|培训|舒适的|旅游|奖励|过节费|五险一金|奖金|\
                                  |弹性工作|氛围|成长空间|实训|培训|高薪|前景|旅游|活动|分红")
예제 #18
0
def demo_flask(image_file):
    """OCR a business-license image and extract the company name and ID.

    Runs the OCR model over the image, then classifies each recognized line
    with heuristics first and the 'NameIdAdd_NLP' tgrocery model as a
    fallback. Returns (annotated_image_path, company_id, company_name).
    """
    # Build the classifier with its text model loaded manually.
    grocery = Grocery('NameIdAdd_NLP')
    model_name = grocery.name
    text_converter = None
    tgm = GroceryTextModel(text_converter, model_name)
    tgm.load(model_name)
    grocery.model = tgm

    t = time.time()
    result_dir = './result'
    image = np.array(Image.open(image_file).convert('RGB'))
    result, image_framed = ocr_whole.model(image)
    output_file = os.path.join(result_dir, image_file.split('/')[-1])
    Image.fromarray(image_framed).save(output_file)
    name_total = ''
    id_total = ''
    for key in result:
        string1 = result[key][1]
        if len(string1) <= 8:
            # Too short to be a useful field.
            continue
        # Strip punctuation/whitespace before applying the heuristics.
        string2 = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*{}[]+", "", string1)
        no_digit = len(list(filter(str.isdigit, string2.encode('gbk'))))
        no_alpha = len(list(filter(is_alphabet, string2)))
        if len(set('法定代表人') & set(string2)) >= 2 or len(set('经营范围') & set(string2)) >= 2 or '资本' in string2 or '类型' in string2 or len(set('年月日') & set(string2)) >= 2 or len(set('登记机关') & set(string2)) >= 2 or '电话' in string2:
            predict_result = 'others'
        elif len(set('经营场所') & set(string2)) >= 3 or '住所' in string2 or len(set('营业场所') & set(string2)) >= 3:
            predict_result = 'company-address'
        # BUG FIX: the original used (no_digit+no_alpha) / len(string2), which
        # is *integer* division under Python 2 — the > 0.5 ratio test could
        # only ever pass when every character was a digit or letter.
        elif len(set('统一社会信用代码') & set(string2)) >= 2 or (float(no_digit + no_alpha) / len(string2) > 0.5 and no_digit > 8):
            predict_result = 'company-id'
        elif '名称' in string2:
            predict_result = 'company-name'
        else:
            # No heuristic matched — fall back to the trained classifier.
            predict_result = grocery.predict(string2)
        if str(predict_result) == 'company-name':
            name_total += string1
            break
        elif str(predict_result) == 'company-id':
            id_total += string1
        else:
            continue
    id_total = re.sub(r'\W', '', id_total)
    name_total = stupid_revise(name_total)
    print("Mission complete, it took {:.3f}s".format(time.time() - t))
    print('\nRecongition Result:\n')
    print(id_total)
    print(name_total)
    return output_file, id_total, name_total
예제 #19
0
 def __load_model__():
     """Ensure Classify.__MODEL__ is available.

     First call trains a fresh model; later calls load the saved model from
     disk only if it is not already held in memory.
     """
     if not Classify.__MODEL_LOADED__:
         Classify.__MODEL_LOADED__ = True
         Classify.__train__model__()
     else:
         # BUG FIX: the original condition was `if Classify.__MODEL__:`,
         # which re-created and re-loaded the model from disk on *every*
         # call once one was already cached.
         if not Classify.__MODEL__:
             Classify.__MODEL__ = Grocery('Classify')
             Classify.__MODEL__.load()
def train_compare_result(train_src, test_src):
    grocery = Grocery('test')
    grocery.train(train_src)
    print grocery.get_load_status()
    len_test = len(test_src)
    print len_test
    Predict_num = 0
    History = []
    for test in test_src:
        Predict_result = {
            'predict_title': test[1],
            'predict_class': None,
            'true_class': None
        }
        predict_title = Predict_result['predict_title']
        predict_result = grocery.predict(predict_title)
        Predict_result['predict_class'], Predict_result['true_class'] = test[
            0], predict_result
        if str(predict_result) == str(test[0]):
            # print 'prediction is True'
            Predict_num += 1
        History.append(Predict_result)
        # print 'prediction is False'
    predict_precision = float(Predict_num) / len_test
    return predict_precision, History
def predict_corpus(input_file, output_csv):
    """Read column 3 of Sheet1 in the Excel file *input_file*, classify each
    sentence with the project model, and write (sentence, prediction) rows
    to *output_csv*."""
    import csv
    corpus = []
    workbook = xlrd.open_workbook(input_file)
    table = workbook.sheet_by_name('Sheet1')
    for rownum in range(table.nrows):
        row = table.row_values(rownum)
        # BUG FIX: the original called row[2].strip() and discarded the
        # result, appending the unstripped value.
        corpus.append(row[2].strip())
    corpus_grocery = Grocery(project_name)
    corpus_grocery.load()
    output = [(sentence, corpus_grocery.predict(sentence))
              for sentence in corpus]
    # FIX: `file(...)` (Py2-only) replaced by a `with open(...)` block so the
    # CSV handle is closed even on error.
    with open(output_csv, 'wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(output)
    print('Done!')
def train():
    print 'train start '+'.'*30
    #grocery=Grocery('sample')
    grocery=Grocery('version1.0')
    grocery.train(trainlist)
    grocery.save()
    print 'train end '+'.'*30
예제 #23
0
def sentiment_train(gro_name, train_set):
    """Train a tgrocery SVM classifier named *gro_name* on *train_set*
    and persist it to disk."""
    classifier = Grocery(gro_name)
    classifier.train(train_set)
    print("Is trained? ", classifier.get_load_status())
    classifier.save()
예제 #24
0
class jdParser(object):
    """Split a job description into 'demand' and 'duty' sentences using the
    pre-trained ./jdclf sentence classifier."""

    def __init__(self):
        self.clf = Grocery("./jdclf")
        self.clf.load()
        # Sentence delimiters: ASCII/fullwidth semicolons, 。 and newlines.
        self.LINE_SPLIT = re.compile(u"[;。;\n]")

    def get_demand_and_duty(self, jdstr):
        """Classify each sentence of *jdstr*; return a dict with newline-joined
        'demand' and 'duty' strings.

        BUG FIXES vs. original: `len(line.strip() > 4)` took len() of a bool
        (TypeError) — intended `len(line.strip()) > 4`; and the result dict
        was built but never returned.
        """
        linelist = [
            line.strip() for line in self.LINE_SPLIT.split(jdstr)
            if len(line.strip()) > 4
        ]

        demand = []
        duty = []
        for line in linelist:
            pred = str(self.clf.predict(line))
            if pred == "demand":
                demand.append(line)
            elif pred == "duty":
                duty.append(line)

        result = {}
        result['demand'] = '\n'.join(demand)
        result['duty'] = '\n'.join(duty)
        return result
예제 #25
0
class jdParser(object):
    """Split a job description into 'demand' and 'duty' sentences with the
    pre-trained ./jdclf sentence classifier."""

    def __init__(self):
        self.clf = Grocery("./jdclf")
        self.clf.load()
        # Sentence delimiters: ASCII/fullwidth semicolons, 。 and newlines.
        self.LINE_SPLIT = re.compile(u"[;。;\n]")

    def get_demand_and_duty(self, jdstr):
        """Classify each sentence of *jdstr*; return {'demand': ..., 'duty': ...}.

        BUG FIXES vs. original: `len(line.strip()>4)` computed len() of a bool
        (TypeError) — intended `len(line.strip()) > 4`; and the result dict
        was never returned.
        """
        linelist = [line.strip() for line in self.LINE_SPLIT.split(jdstr)
                    if len(line.strip()) > 4]

        demand = []
        duty = []
        for line in linelist:
            pred = str(self.clf.predict(line))
            if pred == "demand":
                demand.append(line)
            elif pred == "duty":
                duty.append(line)

        result = {}
        result['demand'] = '\n'.join(demand)
        result['duty'] = '\n'.join(duty)
        return result
예제 #26
0
    def __train__model__():
        """Train the 'Classify' grocery model from the configured Excel file
        and cache the trained instance on the Classify class."""
        frame = pd.read_excel(Classify.__FILE_PATH__)
        labeled = frame[[u'类型', u'释义']]
        samples = [tuple(row) for row in labeled.values]

        grocery = Grocery('Classify')
        grocery.train(samples)
        grocery.save()
        Classify.__MODEL__ = grocery
예제 #27
0
def sentiment_train(gro_name, train_set):
    """Train and persist a tgrocery model named *gro_name* using *train_set*."""
    model = Grocery(gro_name)
    model.train(train_set)
    print("Is trained? ", model.get_load_status())
    model.save()
예제 #28
0
def train(train_origin_path, fold):
    """Train and save the model for cross-validation fold *fold*.

    Input lines look like '<label>|text|<text>'; labels are remapped via the
    module-level classify_dict and yiji_label tables.
    """
    model = Grocery('cv_' + str(fold) +
                    '_model')  #, custom_tokenize=segment)

    samples = []
    with open(train_origin_path) as fh:
        for raw in fh:
            label, text = raw.strip().split("|text|")
            samples.append((yiji_label[classify_dict[label]], text))

    model.train(samples)
    model.save()
예제 #29
0
    def train_phrasing_and_save(self, trainsets=all):
        """Train the model named self.model_name on *trainsets* and save it.

        Returns True on success, False on any failure.

        NOTE(review): the default `trainsets=all` binds the *builtin* `all`
        function — almost certainly unintended; confirm the intended default.
        """
        try:
            grocery = Grocery(self.model_name)
            grocery.train(trainsets)
            grocery.save()
            return True
        except:
            # NOTE(review): bare except deliberately swallows every error and
            # signals failure via the False return value.
            return False
예제 #30
0
class MyGrocery(object):
  """Wrapper around tgrocery with lazy loading and CSV-based train/test.

  CSV rows are (label, text). FIXES vs. original: CSV file handles are now
  closed deterministically via `with` (they were leaked), and prints use the
  single-argument parenthesized form (identical output under Python 2).
  """

  def __init__(self, name):
    super(MyGrocery, self).__init__()
    self.grocery = Grocery(name)
    self.loaded = False
    self.correct = 1.0  # last measured accuracy; 1.0 until test() runs

  def train(self, src):
    """Train from the CSV file at *src*."""
    lines = []
    with open(src) as fh:
      for line in csv.reader(fh):
        label, s = line[0], line[1]
        lines.append((label, s.decode('utf8')))
    self.grocery.train(lines)

  def save_model(self):
    """Persist the trained model."""
    self.grocery.save()

  def train_and_save(self, src):
    self.train(src)
    self.save_model()

  def load_model(self):
    """Load the saved model once; later calls are no-ops."""
    if not self.loaded:
      self.grocery.load()
      self.loaded = True

  def predict(self, text):
    """Return the predicted label for *text* (loads the model on demand)."""
    self.load_model()
    return self.grocery.predict(text)

  def test(self, src):
    """Evaluate on the CSV at *src*; stores accuracy in self.correct and
    returns a JSON summary string."""
    self.load_model()
    total, wrong_num = 0.0, 0.0
    with open(src) as fh:
      for line in csv.reader(fh):
        total += 1
        if line[0] != self.predict(line[1]):
          wrong_num += 1

    print("load test file from " + src)
    correct = (total - wrong_num) / total
    self.correct = correct
    print("total: %d , wrong_num: %d, success percentage: %f" % (total, wrong_num, correct))
    result = dict(type="test", total=total, wrong_num=wrong_num, correct=correct)
    return json.dumps(result)
예제 #31
0
파일: base.py 프로젝트: jkmiao/ipin2015
    def __init__(self):
        """Load the jdclf classifier, compile the JD-parsing regex rule set
        and register custom jieba dictionaries."""
        # Characters to strip from raw JD text before parsing.
        self.CLEAN_TEXT = re.compile(u"[^\u4e00-\u9fa5\w\d;::;,。、\.,。!!@()\r\n\(\)\-\+ - ]")

        self.clf = Grocery(base_dir+"/jdclf")  # sentence classifier
        self.clf.load()

        # Line splitting and per-line cleanup of numbering/punctuation noise.
        self.SPLIT_LINE = re.compile(u"[\r\n;::。!?;]|[ \s \xa0\u724b]{4,}")
        self.CLEAN_LINE = re.compile(u"^[\u2022(【\[\s\t\r\n\(\-  ]?[\da-z12345789]{1,2}[\.,。、,::)】\]\)\s]|^[!@#¥%……&×()\(\){}:“|、-\-,。::\.]|^[一二三四五六七八九123456789\d]{0,2}[\.、\s:: ]|[,;。、\s \.]$|^[\s \u2022 \uff0d \u25cf]")
        self.CLEAN_JOBNAME = re.compile(u"急聘|诚聘|高薪|包[食住宿餐]|.险一金|待遇|^急?招|职位编号\s?[\s\d::]")

        # Candidate-attribute patterns.
        self.PAY = re.compile("(\d{3,}\-)?\d{3,}元")
        self.SEX = re.compile(u"性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.JOB_TAG = re.compile(u"全职|实习")
        self.DEGREE = re.compile(u"小学|初中|高中|职技|本科|研究生|硕士|博士|教授|专科|大专|中专|无要求|不限|无限")

        # Section-heading detectors for requirements / duties / benefits.
        self.START_DEMAND = re.compile(u"(任职资格|岗位要求|工作要求|任职条件|任职要求|职位要求)[::\s】\n ]?")
        self.START_DUTY = re.compile(u"(工作内容|岗位职责|工作职责|职位描述|工作描述|职位介绍|职位职责|岗位描述)[::\s 】\n ]")
        self.START_BENEFIT = re.compile(u"(福利待遇|待遇|福利)[::\s\n】]")

        # Line-level classifiers for URLs, demands, duties, benefits, certs.
        self.INC_URL = re.compile(u"(主页|网站|网址|官网).{0,3}[\w\d_/\.:\-]+")
        self.DEMAND = re.compile(u"精通|熟悉|熟练|有.+经验")
        self.DUTY = re.compile(u"负责|促成|为客户|安排的其.工作")
        self.BENEFIT = re.compile(u".险一金|福利|晋身|休假|带薪|补助|补贴")
        self.CERT = re.compile(u"(\S{2,8}证书|CET-\d|普通话|英语|口语|.语|日文|雅思|托福|托业)(至少)?(通过)?[\d一二三四五六七八九]级[及或]?(以上)?|(英语)?CET-\d级?(以上)?|\
                                 医学.{0,3}证|会计.{0,3}证|律师.{0,3}证|有.{1,8}证书")


        # Vocabulary files: degrees, majors, skills and job names.
        self.degreedic = set([line.strip() for line in codecs.open(base_dir+'/data/degrees.txt','rb','utf-8')])
        self.majordic = set([line.strip() for line in codecs.open(base_dir+'/data/majordic.txt','rb','utf-8')])
        self.skilldic = set([line.strip() for line in codecs.open(base_dir+'/data/skills.txt','rb','utf-8')])
        self.jobdic = set([line.strip() for line in codecs.open(base_dir+'/data/jobnames.txt','rb','utf-8')])

        # Custom dictionaries improve jieba segmentation on JD text.
        jieba.load_userdict(base_dir+'/data/majordic.txt')
        jieba.load_userdict(base_dir+'/data/skills.txt')
        jieba.load_userdict(base_dir+'/data/firm.txt')
        jieba.load_userdict(base_dir+'/data/degrees.txt')
        jieba.load_userdict(base_dir+'/data/benefits.txt')
예제 #32
0
    def __init__(self, *args, **kwargs):
        """Build the grocery classifier selected by kwargs["method"].

        Required kwargs: grocery_name, method ("normal" | "jieba" |
        "processed") and train_src.
        """
        self.grocery_name = str(kwargs["grocery_name"])
        method = str(kwargs["method"])
        # NOTE(review): train_src is extracted but never used in this method;
        # the lookup does still raise KeyError when the kwarg is missing.
        train_src = str(kwargs["train_src"])
        self.PREFIX = conf.load("predict_label")["prefix"]
        self.MODEL_DIR = conf.load("predict_label")["model_dir"]

        self.kwargs = kwargs
        if method == "normal":
            # "normal" plugs the key-extraction tokenizer into tgrocery.
            self.key_ext = keyExt()
            self.grocery = Grocery(self.grocery_name,
                                   custom_tokenize=self._custom_tokenize)
        elif method == "jieba":
            # "jieba" relies on tgrocery's default tokenizer.
            self.grocery = Grocery(self.grocery_name)
        elif method == "processed":
            # Pre-tokenized input; same custom tokenizer hook as "normal".
            self.grocery = Grocery(self.grocery_name,
                                   custom_tokenize=self._custom_tokenize)
        # NOTE(review): an unrecognized method leaves self.grocery unset —
        # confirm whether that case should raise instead.
        pass
예제 #33
0
    def __init__(self):
        """Load the jdclf sentence classifier and compile the regex rules
        used to parse Chinese job descriptions."""
        self.data = []
        self.clf = Grocery("jdclf")  # sentence classifier (demand/duty/other)
        self.clf.load()

        # Candidate-attribute patterns.
        self.SEX = re.compile(u"性别不限|性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.DEGREE = re.compile(u"(全日制)?(初中|高中|中专|大专|专科|大学专科|中职|本科|大学本科|硕士|研究生|博士|博士后)(.?以上)?")
        self.MAJOR = re.compile(u"\S+(相关专业|专业优先|及其.专业|[类等]专业[优先]?)")
        self.EXP = re.compile(u"工作经验:|工作经[历验]|工作年限|年.{0,4}经[历验]|经[历验].{1,6}年")
        self.PUB_TIME = re.compile(u"(\d+)(天前发布)")

        # Company-name recognition; NOT_INC filters false positives.
        self.INCNAME = re.compile(u"\S+(有限公司|酒店|银行|集团|研究中心|研究所|学校|旅行社|分?公司|研发中心|技术部|事.部|招聘)")
        self.NOT_INC = re.compile(u"职位|描述|收藏|推荐|地址|邮箱|主页|介绍|欢迎|加入|要求|简介|险一金|奖金|包吃住|社区|厂房|人员|职责")
        self.INCTAG = re.compile(u"大公司|五百强|全球500强|小公司|成长型公司|创业公司|私有经济|集体经济|集团|外企|已上市|稳定性高|平均年龄\d岁|妹纸多|学历高|福利待遇好|晋升机会大|民营公司|民营企业\
                                 |互联网|创业型|国企|央企")

        # Job-title recognition (backslashes continue one pattern string).
        self.JOBNAME = re.compile(u'\S*(研发工程师|工程师|经理|助理|顾问|前台|秘书|主管|研究员|实习生|操作员|专员|教学人员|技术人员|管理员|业务员|公关|程序员|教师|老师|培训生|\
                                  文员|研究员|策划|主任|总监|设计师|分析师|架构师|摄影师|编辑|BD|游戏UI|Android(开发)?|PHP(开发)?|Python(开发)?|.?(急招|急聘|初级|中级|高级|方向).?[\s)】\)])|\
                                  |行政人事|网店设计|客服|会计|电话销售|外贸跟单|web前端|游戏UI|后.开发|产品运营|商业数据分析')

        # Requirement ("demand") heading detector and line classifier.
        self.START_DEMAND = re.compile(u"(岗位要求|应聘条件|任职要求|岗位资格|任职资格|岗位条件|工作要求|任职条件|人员条件|职位.求|职位条件|职位描述|岗位资格|职位资格|具备条件)[::\s]\
                                       |如果你.{0,10}[::\s]|我们希望你.{0,12}[::\s]|(要求|条件)[::\s]|你需要?具备什么.+[?\?::\s]|任职资格[::\s]")

        self.DEMAND = re.compile(u"熟悉|熟练|具有|善于|懂得|掌握|具备|能够|优先|不少于|不超过|至少|团队.作|良好的|工作经验|开发经验|实习经历|能力强|富有|以上学历|经验|喜欢|\
                                 较强的.{2,8}能力|相关专业|相关学历|者优先|精通|了解|及以上|技术全面|.强的责任心|[能有]独立|英文流利")

        # Responsibility ("duty") line classifier and heading detector.
        self.DUTY = re.compile(u"跟进|协助|负责|配合|其他工作|领导交办的|对.+提供|审核|参与|提出|跟踪|报告|为.+提出|日常.+工作|指导|跟进|拓展|运营|用户|客户|协调|拟写|通过|协同\
                               |完成|沟通|需求|秘书.{2,5}翻译")

        self.START_DUTY = re.compile(u"(岗位职责|岗位描述|职位描述|职责描述|任职描述|职位职责|工作职责|工作职能|职位职能|工作内容|实习内容|职位内容)[::\s]|做这样的事[::\s]|职责.{0,5}[::\s]")

        # Pay and benefit keywords.
        self.PAY = re.compile(u"薪酬|待遇|月薪|薪资|年薪|底薪|\d+k|\d+万|\d+元|工资|报酬|薪水|福利")

        self.BENEFIT = re.compile(u"周休|补助|补贴|假日|餐补|提成|交通补助|食宿|加班工资|期权|年假|领导|扁平化|管理|氛围|空间|休假|月假|带薪|全休|晋升|培训|舒适的|旅游|奖励|过节费|五险一金|奖金|\
                                  |弹性工作|氛围|成长空间|实训|培训|高薪|前景|旅游|活动|分红")
예제 #34
0
def learn_model(file_name):
    """Build and evaluate a tgrocery model from an Excel file of labeled
    comments (columns Tag, Comment).

    Splits the data 3:2 into learn/test files, trains a model named
    'model_<stem>', evaluates it, and returns {'IsErr': bool, 'ErrDesc': ...}.
    """
    path = os.path.join(BASE_DIR, 'learn', file_name)
    try:
        df = pd.read_excel(path)
    except Exception:
        return {'IsErr': True, 'ErrDesc': u'找不到文档或者读取文档出错'}
    try:
        # Drop rows with missing values, then normalize each comment row.
        df = df.dropna(axis=0)
        df = df.apply(split_comment, axis=1)
    except Exception:
        return {'IsErr': True, 'ErrDesc': u'文档格式有误,应包含Tag(标签名字),Comment(评价内容)'}

    try:
        # 3:2 learn/test split. FIX: use floor division so the row count
        # stays an integer under Python 3 as well (plain / relied on Py2
        # integer division).
        len_learn = len(df) // 5 * 3
        # Write the learn and test documents to disk.
        learn_file_name, test_file_name = output_file(file_name, df, len_learn)
        tmp_learn_name = os.path.join(BASE_DIR, 'learn',
                                      'model_' + learn_file_name.split('.')[0])
        grocery = Grocery(tmp_learn_name.encode('utf-8'))
        path = os.path.join(BASE_DIR, 'learn', learn_file_name)
        grocery.train(path.encode('utf-8'))
        grocery.save()
    except Exception:
        return {'IsErr': True, 'ErrDesc': u'学习不成功,没有生产新的模型,请再次尝试。'}

    # Evaluate the saved model on the held-out file.
    res = test_sample(tmp_learn_name, test_file_name)

    return {
        'IsErr':
        False,
        'ErrDesc':
        u'成功生产新的模型,测试验证的正确率为%s, 模型保存为:%s' %
        (res, os.path.split(tmp_learn_name)[1])
    }
# -*- coding: utf-8 -*-
import csv,codecs
from tgrocery import Grocery
import preprocessing as pp

# Input corpora and the file the evaluation results are written to.
trainFileName='../data/train.txt'
validateFileName='../data/validate.txt'
outputFileName='../output/result.txt'

# validate ##################################
# Load the previously trained model named 'version1.0' from disk.
#grocery=Grocery('sample')
grocery=Grocery('version1.0')
grocery.load()

print 'start test'
# Binary-classification counters, initialised as floats.
TP=0.0
TN=0.0
FP=0.0
FN=0.0

fileValidate=codecs.open(validateFileName,'r','utf-8')
validate_reader=fileValidate.readlines()

fileOutput=codecs.open(outputFileName,'w','utf-8')

resultlist=[]
i=0
# Walk the validation lines; `i` counts progress.  The loop body
# continues beyond this excerpt (progress print every 5000 lines).
for line in validate_reader:
    content=pp.getcontent(validate_reader,i)
    i=i+1
    if(i%5000==0):
예제 #36
0
class JdCRF(object):
    def __init__(self):
        self.data = []
        self.clf = Grocery("jdclf")
        self.clf.load()
        
        self.SEX = re.compile(u"性别不限|性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.DEGREE = re.compile(u"(全日制)?(初中|高中|中专|大专|专科|大学专科|中职|本科|大学本科|硕士|研究生|博士|博士后)(.?以上)?")
        self.MAJOR = re.compile(u"\S+(相关专业|专业优先|及其.专业|[类等]专业[优先]?)")
        self.EXP = re.compile(u"工作经验:|工作经[历验]|工作年限|年.{0,4}经[历验]|经[历验].{1,6}年")
        self.PUB_TIME = re.compile(u"(\d+)(天前发布)")
        
        self.INCNAME = re.compile(u"\S+(有限公司|酒店|银行|集团|研究中心|研究所|学校|旅行社|分?公司|研发中心|技术部|事.部|招聘)") 
        self.NOT_INC = re.compile(u"职位|描述|收藏|推荐|地址|邮箱|主页|介绍|欢迎|加入|要求|简介|险一金|奖金|包吃住|社区|厂房|人员|职责") 
        self.INCTAG = re.compile(u"大公司|五百强|全球500强|小公司|成长型公司|创业公司|私有经济|集体经济|集团|外企|已上市|稳定性高|平均年龄\d岁|妹纸多|学历高|福利待遇好|晋升机会大|民营公司|民营企业\
                                 |互联网|创业型|国企|央企")

        self.JOBNAME = re.compile(u'\S*(研发工程师|工程师|经理|助理|顾问|前台|秘书|主管|研究员|实习生|操作员|专员|教学人员|技术人员|管理员|业务员|公关|程序员|教师|老师|培训生|\
                                  文员|研究员|策划|主任|总监|设计师|分析师|架构师|摄影师|编辑|BD|游戏UI|Android(开发)?|PHP(开发)?|Python(开发)?|.?(急招|急聘|初级|中级|高级|方向).?[\s)】\)])|\
                                  |行政人事|网店设计|客服|会计|电话销售|外贸跟单|web前端|游戏UI|后.开发|产品运营|商业数据分析')

        self.START_DEMAND = re.compile(u"(岗位要求|应聘条件|任职要求|岗位资格|任职资格|岗位条件|工作要求|任职条件|人员条件|职位.求|职位条件|职位描述|岗位资格|职位资格|具备条件)[::\s]\
                                       |如果你.{0,10}[::\s]|我们希望你.{0,12}[::\s]|(要求|条件)[::\s]|你需要?具备什么.+[?\?::\s]|任职资格[::\s]")

        self.DEMAND = re.compile(u"熟悉|熟练|具有|善于|懂得|掌握|具备|能够|优先|不少于|不超过|至少|团队.作|良好的|工作经验|开发经验|实习经历|能力强|富有|以上学历|经验|喜欢|\
                                 较强的.{2,8}能力|相关专业|相关学历|者优先|精通|了解|及以上|技术全面|.强的责任心|[能有]独立|英文流利")

        self.DUTY = re.compile(u"跟进|协助|负责|配合|其他工作|领导交办的|对.+提供|审核|参与|提出|跟踪|报告|为.+提出|日常.+工作|指导|跟进|拓展|运营|用户|客户|协调|拟写|通过|协同\
                               |完成|沟通|需求|秘书.{2,5}翻译")

        self.START_DUTY = re.compile(u"(岗位职责|岗位描述|职位描述|职责描述|任职描述|职位职责|工作职责|工作职能|职位职能|工作内容|实习内容|职位内容)[::\s]|做这样的事[::\s]|职责.{0,5}[::\s]")

        self.PAY = re.compile(u"薪酬|待遇|月薪|薪资|年薪|底薪|\d+k|\d+万|\d+元|工资|报酬|薪水|福利")

        self.BENEFIT = re.compile(u"周休|补助|补贴|假日|餐补|提成|交通补助|食宿|加班工资|期权|年假|领导|扁平化|管理|氛围|空间|休假|月假|带薪|全休|晋升|培训|舒适的|旅游|奖励|过节费|五险一金|奖金|\
                                  |弹性工作|氛围|成长空间|实训|培训|高薪|前景|旅游|活动|分红")
        
    


    def gen_data(self,fname='./data/lagou_train.txt'):
        fw = codecs.open('./data/jd_train_crf.txt','wb','utf-8')
        cnt = 1
        for line in codecs.open(fname,'rb','utf-8'):
            if line.startswith(u"====="):
                fw.write(line)
                continue

            cnt +=1
            if len(line.strip())>1:
                    pred = self.clf.predict(line)
                    newline = pred+'\t\t'+line.strip()+'\t\t'+str(len(line))+"\n"
                    fw.write(newline)
        print cnt
        print 'done'


    def load_data(self,fname="./data/jd_train_crf.txt"):
        data = []
        tmp = []
        for line in codecs.open(fname,'rb','utf-8'):
            if line.startswith(u"===="):
                data.append(tmp)
                tmp = []
                continue
            else:
                tag_data = line.strip().split('\t\t')
                if len(tag_data)==3:
                    tmp.append(tuple(tag_data))
                else:
                    print '\t  '.join(tag_data)

        
        n = len(data)/2
        print 'train data',n
        print 'test data',len(data)-n
        return data[n:],data[:n]
    

    def word2features(self,sent,i):
        word = sent[i][0]
        postag = sent[i][1]

        features = [
            'bias',
            'word.lower=' + word.lower(),
            'word[:2]=' +word[:2],
            'word.isdigit=%s'%word.isdigit(),
            'postag='+postag,
            'demand=%s'% '1' if self.DEMAND.search(word) else '0',
            'start_demand=%s'% '1' if self.START_DEMAND.search(word) else '0',
            'start_duty=%s'% '1' if self.START_DUTY.search(word) else '0',
            'duty=%s'% '1' if self.DUTY.search(word) else '0',
            'jobname=%s'% '1' if self.JOBNAME.search(word) else '0',
            'incname=%s'% '1' if self.INCNAME.search(word) else '0',
            'benefit = %s'% '1' if self.BENEFIT.search(word) else '0',
            'pred=%s' % self.clf.predict(word)
        ]

        if i>0:
            word1 = sent[i-1][0]
            postag1 = sent[i-1][1]

            features.extend([
                '-1:postag='+postag1,
                '-1:word.islower='+word1[:3].lower(),
                '-1:start_demand=%s'% '1' if self.START_DEMAND.search(word) else '1',
                '-1:start_duty=%s'% '1' if self.START_DUTY.search(word) else '0',
                '-1:demand=%s'% '1' if self.DEMAND.search(word1) else '0',
                '-1:duty=%s'% '1' if self.DUTY.search(word1) else '0',
                '-1:jobname=%s'% '1' if self.JOBNAME.search(word1) else '0',
                '-1:incname=%s'% '1' if self.INCNAME.search(word1) else '0',
                '-1:benefit = %s'% '1' if self.BENEFIT.search(word) else '0',
                '-1:pred=%s' % self.clf.predict(word),
            ])

        else:
            features.append('BOS')


        if i<len(sent)-1:
            word1 = sent[i+1][1]
            postag1 = sent[i+1][1]
            features.extend([
                '+1:word.lower=' + word1[:3].lower(),
                '+1:word.istitle=%s' % word1.istitle(),
                '+1:word.isupper=%s' % word1.isupper(),
                '+1:postag=' + postag1,
                '+1:postag[:2]=' + postag1[:2],
                '+1:start_demand=%s'% '1' if self.START_DEMAND.search(word) else '0',
                '+1:start_duty=%s'% '1' if self.START_DUTY.search(word) else '0',
                '+1:demand=%s'% '1' if self.DEMAND.search(word1) else '0',
                '+1:duty=%s'% '1' if self.DUTY.search(word1) else '0',
                '+1:jobname=%s'% '1' if self.JOBNAME.search(word1) else '0',
                '+1:incname=%s'% '1' if self.INCNAME.search(word1) else '0',
                '+1:benefit = %s'% '1' if self.BENEFIT.search(word) else '0',
                '+1:pred=%s' % self.clf.predict(word),
            ])
        else:
            features.append('EOS')


        return features




    def sent2features(self,sent):
        return [self.word2features(sent,i) for i in range(len(sent))]

    def sent2labels(self,sent):
        return [label for (label,token,postag) in sent]

    def sent2tokens(self,sent):
        return [token for (label,token,postag) in sent]
    

    def train(self,x_train,y_train):
        
        assert len(x_train)==len(y_train),"not the same %d  %d"%(len(x_train),len(y_train))

        trainer = pycrfsuite.Trainer(verbose=False)

        for xseq,yseq in zip(x_train,y_train):
            trainer.append(xseq,yseq)

        trainer.set_params({
            'c1':1.0,
            'c2':1e-3,
            'max_iterations':50,
            'feature.possible_transitions':True
        })

        trainer.train('jd_skill.crfsuite')

    
    def test(self,sent):
        tagger = pycrfsuite.Tagger()
        tagger.open('./jd_skill.crfsuite')
        
        print 'tokens   ','\n '.join(self.sent2tokens(sent))
        print 'Predicted','\t '.join(tagger.tag(self.sent2features(sent)))
        print 'Correct  ','\t '.join(self.sent2labels(sent))
예제 #37
0
# coding: utf-8

from tgrocery import Grocery

# Train a classifier from an in-memory (label, text) list and classify a title.
clf = Grocery('test')
labelled = [('education', '名师指导托福语法技巧:名词的复数形式'),
            ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
            ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
            ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')]
clf.train(labelled)
print(clf.get_load_status())
result = clf.predict('考生必读:新托福写作考试评分标准')
print(result)
print(result.dec_values)

# Same flow, but the training data comes from a text file on disk.
clf = Grocery('read_text')
clf.train('../text_src/train_ch.txt')
print(clf.get_load_status())
result = clf.predict('考生必读:新托福写作考试评分标准')
print(result)
print(result.dec_values)
예제 #38
0
def test_grocery():
    grocery = Grocery('model_redian')
    grocery.train('trdata_4.txt')
    grocery.save()
    new_grocery = Grocery('model_redian')
    new_grocery.load()
    test_result = new_grocery.test('tedata_4.txt')
    print test_result.accuracy_labels
    print test_result.recall_labels
    test_result.show_result()
예제 #39
0
# -*- coding:utf-8 -*-
import sys
reload(sys)
sys.path.append('../../')
from config import *
from es import es214 as es
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import TransportError
from elasticsearch.helpers import bulk
from tgrocery import Grocery
model_fintext = Grocery('../fintext_classify/model_fintext')
model_fintext.load()


def match_topic_kw(news_id, keywords_list, source, doc_type, size=10000):
    result = []
    keyword_str = ''.join(keywords_list)
    # 通过一组关键词查找相关文本
    query_body = {
        "query": {
            "match": {
                "content": keyword_str  #这个可能还得改,争取用一个list
            }
        },
        "size": size
    }
    # print keyword_str
    es_result = es.search(index=source,
                          doc_type=doc_type,
                          body=query_body,
                          request_timeout=400)
예제 #40
0
# coding: utf-8

from tgrocery import Grocery

# save
grocery = Grocery('test')
train_src = [('education', '名师指导托福语法技巧:名词的复数形式'),
             ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
             ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
             ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')]
grocery.train(train_src)
grocery.save()

# load
# grocery name must be the same as the previous one
new_grocery = Grocery('test')
new_grocery.load()
print new_grocery.predict('考生必读:新托福写作考试评分标准')
    i=i+1
    if(i%5000==0):
        print ("%d "%(i))+'#'*30
    str=line.split(u',')
    count=str.__len__()
    if(count<2):
        print 'error happen'+"#"*30
        continue

    #print count
    #print str
    trainstr=(str[0],str[1])
    trainlist.append(trainstr)
    #print str[1]+u','+str[2]

# Train on the (label, text) pairs accumulated above and persist the model.
grocery=Grocery('sample')
grocery.train(trainlist)
grocery.save()
filein.close()


# test ##################################
print 'start test'
# Binary-classification counters, initialised as floats.
TP=0.0
TN=0.0
FP=0.0
FN=0.0

# Re-read the validation file for the evaluation pass below.
filetest=codecs.open(validateFileName,'r','utf-8')
test_reader=filetest.readlines()
예제 #42
0
 def __init__(self):
     """Load the pre-trained ./jdclf model and prepare the sentence splitter."""
     # Splits on Chinese/ASCII semicolons, the full stop, and newlines.
     self.LINE_SPLIT = re.compile(u"[;。;\n]")
     self.clf = Grocery("./jdclf")
     self.clf.load()
예제 #43
0
# coding=utf-8
from tgrocery import Grocery

# Fit a small demo classifier on four labelled headlines.
clf = Grocery('sample')

samples = [('education', '名师指导托福语法技巧:名词的复数形式'),
           ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
           ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
           ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')]
clf.train(samples)
clf.save()

# Reload from disk under the same name and classify an unseen headline.
reloaded = Grocery('sample')
reloaded.load()
print(
    reloaded.predict(
        'Abbott government spends $8 million on higher education media blitz'))

# Score the reloaded model on two held-out labelled headlines.
held_out = [
    ('education', '福建春季公务员考试报名18日截止 2月6日考试'),
    ('sports', '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'),
]
print("start test..................")
print(reloaded.test(held_out))
예제 #44
0
            break
        t_pre_result = grocery_in.predict(t_text)
        t_label = t_pre_result.predicted_y
        # if max(pre_result.dec_values) < 0.03:
        #     label = "neutral"
        print("Sentiment: ", t_label)
        print("How much: ", max(t_pre_result.dec_values))


########################################################
# main
if __name__ == "__main__":
    import time

    model_name = "./meter"
    corpus_dir = "./Corpus/"
    line_cap = 1000000  # upper bound on rows read from each workbook

    # Build the training set from the negative/positive spreadsheets and fit.
    t_start = time.time()
    sources = [corpus_dir + "neg.xls", corpus_dir + "pos.xls"]
    training_set = get_xls_train_set(sources, line_cap)
    sentiment_train(model_name, training_set)
    t_end = time.time()
    print("Elapsed time of training is: ", t_end - t_start)

    # Reload the persisted model and run the interactive prediction loop.
    sentiment_model = Grocery(model_name)
    sentiment_model.load()
    predict_for_one(sentiment_model)
예제 #45
0
파일: cat.py 프로젝트: FREEWING-JP/autohome
class Cat:
    def __init__(self):
        self.grocery = Grocery('autohome')

    def test(self):
        print self.grocery.get_load_status()
예제 #46
0
from tgrocery import Grocery
data_dir = "../data/"
src_fn = data_dir + 'train_set_100.txt'
grocery = Grocery('backout_reason')
grocery.train(src_fn)

tp_cnt = {}
f = open(data_dir + 'type.txt')
for line in f:
	tps = line.split()
	tp_cnt[tps[1]] = 0

f.close()

f = open(data_dir + 'bcmtmoz.merge')
for line in f:
	tp = grocery.predict(line)
	tp_cnt[tp] += 1

print tp_cnt
예제 #47
0
# coding: utf-8

from tgrocery import Grocery


grocery = Grocery('test')
train_src = [
    ('education', '名师指导托福语法技巧:名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')
]
grocery.train(train_src)
print grocery.get_load_status()

test_src = [
    ('education', '福建春季公务员考试报名18日截止 2月6日考试'),
    ('sports', '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'),
]
test_result = grocery.test(test_src)
print test_result.accuracy_labels
print test_result.recall_labels

grocery = Grocery('text_src')
train_src = '../text_src/train_ch.txt'
grocery.train(train_src)
print grocery.get_load_status()

test_src = '../text_src/test_ch.txt'
test_result = grocery.test(test_src)
print test_result.accuracy_labels
예제 #48
0
 def test_main(self):
     """End-to-end check: train, save, reload, and predict one headline."""
     clf = Grocery(self.grocery_name)
     clf.train(self.train_src)
     clf.save()
     # Reload under the fixed name 'test' to exercise load() after save().
     reloaded = Grocery('test')
     reloaded.load()
     print(clf.predict('考生必读:新托福写作考试评分标准'))
     assert clf.get_load_status()
     assert clf.predict('考生必读:新托福写作考试评分标准') == 'education'
     # Remove the on-disk model directory created by save().
     if self.grocery_name and os.path.exists(self.grocery_name):
         shutil.rmtree(self.grocery_name)
예제 #49
0

def unzip(seq, L=None):
    """Flatten arbitrarily nested lists into a single flat list.

    Non-list elements are appended in encounter order.  *L*, when given,
    is extended in place (and returned); by default a fresh list is
    created, so repeated calls never share state.
    """
    if L is None:
        L = []
    for s in seq:
        if isinstance(s, list):
            # Flatten the sublist first, then splice its items in.
            L.extend(unzip(s))
        else:
            L.append(s)
    return L


if __name__ == "__main__":

    grocery = Grocery('sample')
    grocery.train(train_src)
    grocery.save()
    new_grocery = Grocery('sample')
    new_grocery.load()

    L1 = []
    with open('/home/mouse/Downloads/female.csv', 'r') as f1:
        f1_csv = csv.reader(f1)
        for row in f1_csv:
            L1.append(row[0])
    # print(len(L1))

    cate = category(L1)
    i = 1
    with open('/home/mouse/infoss.csv', 'w') as f:
#!/usr/bin/env python
# coding=utf-8

from tgrocery import Grocery

#grocery = Grocery('age56')
#grocery.train('train4_age_56', ' ')
#grocery.save()

new_grocery = Grocery("age")
new_grocery.load()
predict_result = new_grocery.test('test4_age', ' ')
#print len(predict_result.true_y)
#for i in range(len(predict_result.predicted_y)):
#print predict_result.predicted_y[i]
print predict_result
predict_result.show_result()
##########################################
# Configuration
model_choose = "svm"  # one of: svm, lda, rnn
grocery_name = "./SVM_models/svm_for_news"
corpus_path = "./Corpus/NewsClassCorpus/"
file_path = "./"
file_name = "post.txt"

# Read the document and strip stop words up front.
t_text = delete_stop_words(codecs.open(file_path + file_name, encoding="UTF-8").read())

###########################################
# Dispatch on the configured model; only the SVM path is implemented.
if model_choose == "svm":
    t0 = time.time()
    svm_model = Grocery(grocery_name)
    svm_model.load()
    # NOTE(review): stop words are stripped a second time here; presumably
    # the filter is idempotent -- confirm before simplifying.
    t_pre_result = svm_model.predict(delete_stop_words(t_text))
    t1 = time.time()

    t_label = t_pre_result.predicted_y
    print("Sentiment: ", t_label)
    print("How much: ", t_pre_result.dec_values[t_label])
    print("Elapsed time of predict is: %s s" % (t1 - t0))
elif model_choose == "lda":
    pass
elif model_choose == "rnn":
    pass
else:
    print("")
예제 #52
0
파일: fin_text.py 프로젝트: lvleilei/screen
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.path.append('../../')
# Star import from config -- presumably defines ES_HOST/ES_PORT used below.
from config import *

from tgrocery import Grocery
STOP_WORDS_FILE = 'stopwords.txt'
USER_DICT_FILE = 'user_dict.txt'

# Load the pre-trained financial-text classifier from ./model_fintext.
model_fintext = Grocery('model_fintext')
model_fintext.load()
sys.path.append('../')
from get_es import *
# Elasticsearch client; host/port come from the config star import above.
es = Elasticsearch([{'host':ES_HOST,'port':ES_PORT}])

def search(index_name):
    """Run the shared search options against *index_name*; return raw ES hits."""
    opts = set_search_optional()
    return get_search_result(opts, index=index_name)


def get_result_list(es_result):
    """Extract the '_source' payload from each Elasticsearch hit.

    *es_result* is an iterable of hit dicts; payloads are returned in order.
    """
    # Comprehension replaces the manual append loop (same output, idiomatic).
    return [item['_source'] for item in es_result]

예제 #53
0
# coding:utf-8
#!/usr/bin/evn python
from tgrocery import Grocery 


copy_grocery = Grocery('./classfynews_instance')#模型所在路径
copy_grocery.load()
#copy_grocery = grocery
test = ['我是中国人','台北*****']
test_result = copy_grocery.predict(test)
print test_result.predicted_y
#test_result = copy_grocery.test(test_in)
#print test_result.show_result()


예제 #54
0
		dic['id'].append(_id)
		dic['type'].append(_type)
		dic['contents'].append(contents)
	else :
		tdic['id'].append(_id)
		tdic['type'].append(_type)
		tdic['contents'].append(contents)
	i +=1
	
#train = pd.read_csv( train_file, header = 0, delimiter = "\t", quoting = 3 )
#test = pd.read_csv( test_file, header = 1, delimiter = "\t", quoting = 3 )
# Build DataFrames from the dicts accumulated above (id / type / contents).
train = DataFrame(dic)
test = DataFrame(tdic)
#
# 'classfynews_instance' is the path the model is saved under.
grocery = Grocery('classfynews_instance')

train_in = [train['contents'],train['type']]
grocery.train(train_in)
print grocery.get_load_status()
#grocery.save()

# Reload the model under the same name and classify the test contents.
copy_grocery = Grocery('classfynews_instance')
copy_grocery.load()
#copy_grocery = grocery
test_in = [test['contents'],test['type']]
# Input resembles ['我是中国人','台北*****']
# Output resembles [11,12]
test_result = copy_grocery.predict(test['contents'])
print test_result.predicted_y
#test_result = copy_grocery.test(test_in)
예제 #55
0
파일: cat.py 프로젝트: FREEWING-JP/autohome
 def __init__(self):
     """Create the 'autohome' tgrocery wrapper (model not loaded from disk here)."""
     self.grocery = Grocery('autohome')
예제 #56
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from tgrocery import Grocery

# Create a classifier instance (it needs a name for its on-disk model).
clf = Grocery('sample')
# Training data can be passed as a list of (label, text) pairs...
samples = [
    ('education', '名师指导托福语法技巧:名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')
]
clf.train(samples)
# ...or as a file path (tab-separated by default; custom delimiters supported).
# Persist the trained model to disk.
clf.save()

# Load it back -- the name must match the one used when saving.
reloaded = Grocery('sample')
reloaded.load()
# Predict: returns the label, e.g. 'education'.
reloaded.predict('考生必读:新托福写作考试评分标准')

# Evaluate on held-out labelled samples.
held_out = [
    ('education', '福建春季公务员考试报名18日截止 2月6日考试'),
    ('sports', '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'),
]
reloaded.test(held_out)
예제 #57
0
 def __init__(self, name):
   super(MyGrocery, self).__init__()
   self.grocery = Grocery(name)
   self.loaded = False
   self.correct = 1.0