Пример #1
0
def tGrocery():
    outFile = open('testResult.tmp', 'w')
    [trainingSet, benchmark] = pickle.load(open('SampleSeg.pk'))
    testingSet = []
    correctLabel = []
    for i in xrange(len(benchmark)):
        print '%d out of %d' % (i, len(benchmark))
        testingSet.append(benchmark[i][1])
        correctLabel.append(benchmark[i][0]) 
    grocery = Grocery('test')
    grocery.train(trainingSet)
    grocery.save()
    # load
    new_grocery = Grocery('test')
    new_grocery.load()
    Prediction = []
    for i in xrange(len(testingSet)):
        print '%d out of %d' % (i, len(testingSet))
        prediction = new_grocery.predict(testingSet[i])
        Prediction.append(prediction)
        temp = correctLabel[i] + '<-->' + prediction + '  /x01' + testingSet[i] + '\n'
        outFile.write(temp)
    correct = 0
    for i in xrange(len(Prediction)):
        print Prediction[i], correctLabel[i],
        if Prediction[i] == correctLabel[i]:
            correct += 1
            print 'Correct'
        else:
            print 'False'
    print 'Correct Count:', correct
    print 'Accuracy: %f' % (1.0 * correct / len(Prediction))
Пример #2
0
def test_grocery():
    grocery = Grocery('model_redian')
    grocery.train('trdata_4.txt')
    grocery.save()
    new_grocery = Grocery('model_redian')
    new_grocery.load()
    test_result = new_grocery.test('tedata_4.txt')
    print test_result.accuracy_labels
    print test_result.recall_labels
    test_result.show_result()
Пример #3
0
 def test_main(self):
     """End-to-end check: train on self.train_src, persist, reload, and
     verify a known education headline is classified correctly."""
     grocery = Grocery(self.grocery_name)
     grocery.train(self.train_src)
     grocery.save()
     new_grocery = Grocery('test')
     new_grocery.load()
     # NOTE(review): the assertions below use `grocery`, not the reloaded
     # `new_grocery` — presumably the reloaded model was meant; confirm.
     assert grocery.get_load_status()
     assert grocery.predict('考生必读:新托福写作考试评分标准') == 'education'
     # cleanup
     if self.grocery_name and os.path.exists(self.grocery_name):
         shutil.rmtree(self.grocery_name)
Пример #4
0
def predict_test(model_path, data):
    """Classify each ';'-separated sentence in `data` with the model stored
    under BASE_DIR/learn/`model_path`.

    Returns a dict with 'IsErr'/'ErrDesc' and, on success, 'data': a list of
    {'tag': predicted label, 'sentence': original sentence}.
    """
    # Load the model; any failure is reported to the caller, not raised.
    try:
        model_path = os.path.join(BASE_DIR, 'learn', model_path)
        new_grocery = Grocery(model_path.encode('utf-8'))
        new_grocery.load()
    except Exception:
        return {'IsErr': True, 'ErrDesc': u'学习模型加载不成功,请检查路径'}
    # Normalize input: drop a trailing empty segment left by a terminal ';'.
    result = list()
    sentences = data.split(';')
    if sentences[-1] == '':
        sentences.pop()
    if len(sentences) == 0:
        return {'IsErr': True, 'ErrDesc': u'输入的句子结构有错误或没有数据'}

    # Tokenize, drop stop words, then classify. A set makes the per-word
    # stop-word membership test O(1) instead of scanning a list each time.
    stop_words = set(read_lines(os.path.join(BASE_DIR, 'learn', 's_w.txt')))
    for s in sentences:
        kept = [word for word in jieba.cut(s) if word not in stop_words]
        tmp_s = ' '.join(kept).strip()
        result.append({
            'tag': str(new_grocery.predict(tmp_s.encode('utf-8'))),
            'sentence': s,
        })
    return {'IsErr': False, 'ErrDesc': u'成功', 'data': result}
Пример #5
0
def get_data(ids, b_date, end_data, log, stop_word):
    """Fetch comments for `ids` between b_date and end_data, run NLP on each
    record with the pre-trained 'sample2' classifier, and return a list of
    per-comment result dicts (content, date, treasure id, level, tag)."""
    b_date = b_date.strftime('%Y-%m-%d')
    end_data = end_data.strftime('%Y-%m-%d')
    # Choose the data source (DB query; Excel alternatives kept for reference).
    df = load_data(ids, b_date, end_data)
    # df = load_data_excel()
    # df = pd.read_excel('data_treasure.xls')
    df['RateDate'] = pd.to_datetime(df['RateDate'])
    # df_group = df['RateDate'].groupby([df.RateDate.values.astype('datetime64[D]')]).size()
    res = list()
    log.info('Have %d comments need to process' % len(df))
    # Load the classification model once, outside the per-record loop.
    new_grocery = Grocery('sample2')
    new_grocery.load()
    for record_data in range(0, len(df)):
        # Per-date grouping of content (disabled):
        # tmp_df = df[df['RateDate'] > df_group.index[record_data]][df['RateDate'] < df_group.index[record_data + 1]]
        # NLP: stop-word-filtered content plus a (level, tag) classification.
        content_sw, level, tag = nlp_process_with_sw(df.iloc[record_data],
                                                     new_grocery, stop_word)
        # Record one result row per comment.
        res.append({
            'RateContent': json.dumps(content_sw, ensure_ascii=False),
            'RateDate': df.iloc[record_data]['RateDate'],
            'TreasureID': df.iloc[record_data]['TreasureID'],
            'Level': level,
            'Tag': tag,
            'Sentence': df.iloc[record_data]['RateContent'],
        })
    return res
def train():
    print 'train start '+'.'*30
    #grocery=Grocery('sample')
    grocery=Grocery('version1.0')
    grocery.train(trainlist)
    grocery.save()
    print 'train end '+'.'*30
Пример #7
0
def test(test_path):
    new_grocery = Grocery('cv_' + str(fold) +
                          '_model')  #, custom_tokenize=segment)
    new_grocery.load()
    test_src = []
    with open(test_path) as f:
        for line in f:
            label, text = line.strip().split("|text|")
            label = yiji_label[classify_dict[label]]
            test_src.append((label, text))
    test_result = new_grocery.test(test_src)
    #print test_result
    #print test_result.accuracy_overall
    #accs = test_result.accuracy_labels
    recalls = test_result.recall_labels
    #print "Recall for each class: ", recalls
    predictlabels = test_result.predicted_y
    truelabels = test_result.true_y
    acc = accuracy_score(truelabels, predictlabels)
    macro_precision, macro_recall, macro_fscore, _ = precision_recall_fscore_support(
        truelabels, predictlabels, average='macro')
    print "Accuracy: ", acc, "Macro-average Precision:", macro_precision, "Macro-average Recall:", macro_recall, "Macro-average Fscore:", macro_fscore
    labellist = [
        'safe_and_stable', 'industrial_information', 'politics',
        'culture_health', 'social_livelihood', 'economic_and_financial'
    ]
    precision, recall, fscore, _ = precision_recall_fscore_support(
        truelabels, predictlabels, average=None, labels=labellist)
    precisions = dict()
    recalls = dict()
    for idx, p in enumerate(precision):
        precisions[labellist[idx]] = p
    for idx, c in enumerate(recall):
        recalls[labellist[idx]] = c
Пример #8
0
    def __init__(self, keyword):
        """Classify every stored news title for `keyword` with the
        'static/paris' model and write the numeric tag back to the DB,
        marking the task row as in-progress (status=1)."""
        print '进行新闻分类'
        (db, cursor) = connectdb()
        cursor.execute("update task set status=1 where keyword=%s", [keyword])
        cursor.execute("select id, title from news where keyword=%s",
                       [keyword])
        news = cursor.fetchall()
        new_grocery = Grocery('static/paris')
        new_grocery.load()

        # Map each predicted category name to its numeric DB tag.
        # NOTE(review): tag 5 is unused, and an unmatched prediction is
        # written back unconverted — confirm both are intentional.
        for item in news:
            tag = new_grocery.predict(item['title'])
            if tag == '新闻背景':
                tag = 1
            elif tag == '事实陈述':
                tag = 2
            elif tag == '事件演化':
                tag = 3
            elif tag == '各方态度':
                tag = 4
            elif tag == '直接关联':
                tag = 6
            elif tag == '暂无关联':
                tag = 7
            cursor.execute("update news set tag=%s where id=%s",
                           [tag, item['id']])
        closedb(db, cursor)
        return
def train_compare_result(train_src, test_src):
    grocery = Grocery('test')
    grocery.train(train_src)
    print grocery.get_load_status()
    len_test = len(test_src)
    print len_test
    Predict_num = 0
    History = []
    for test in test_src:
        Predict_result = {
            'predict_title': test[1],
            'predict_class': None,
            'true_class': None
        }
        predict_title = Predict_result['predict_title']
        predict_result = grocery.predict(predict_title)
        Predict_result['predict_class'], Predict_result['true_class'] = test[
            0], predict_result
        if str(predict_result) == str(test[0]):
            # print 'prediction is True'
            Predict_num += 1
        History.append(Predict_result)
        # print 'prediction is False'
    predict_precision = float(Predict_num) / len_test
    return predict_precision, History
Пример #10
0
 def test_main(self):
     """End-to-end check: train, persist, reload, and verify predictions for
     an English title, an education headline and a sports headline."""
     grocery = Grocery(self.grocery_name)
     grocery.train(self.train_src)
     grocery.save()
     new_grocery = Grocery('test')
     new_grocery.load()
     # NOTE(review): assertions below exercise `grocery`, not the reloaded
     # `new_grocery` — presumably the reloaded model was meant; confirm.
     assert grocery.get_load_status()
     result = grocery.predict('just a testing')
     print(result)
     result = grocery.predict('考生必读:新托福写作考试评分标准')
     print(result)
     print("type of result is :",type(result))
     # predict() returns a result object; compare via str().
     assert str(grocery.predict('考生必读:新托福写作考试评分标准')) == 'education'
     assert str(grocery.predict('法网')) == 'sports'
     # cleanup
     if self.grocery_name and os.path.exists(self.grocery_name):
         shutil.rmtree(self.grocery_name)
	def labelmaker(self):
		"""Classify self.shorttext with the '11c_20k_20171226' model and
		return [best_label, its_confidence], taken from the highest-scoring
		entry of the decision values (Python 2 cmp-style sort)."""
		result=[]
		grocery = Grocery('11c_20k_20171226')
		grocery.load()
		label_confidence=sorted(grocery.predict(self.shorttext).dec_values.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)[0]
		result.append(label_confidence[0])# highest-confidence predicted label
		result.append(label_confidence[1])# its confidence score
		return result
Пример #12
0
 def __load_model__():
     """Ensure the shared Classify model is available: train it on the first
     call, otherwise re-load the persisted 'Classify' model from disk.

     NOTE(review): the `if Classify.__MODEL__:` guard looks inverted — it
     replaces an already-present model but does nothing when __MODEL__ is
     unset; confirm the intended semantics.
     """
     if not Classify.__MODEL_LOADED__:
         Classify.__MODEL_LOADED__ = True
         Classify.__train__model__()
     else:
         if Classify.__MODEL__:
             Classify.__MODEL__ = Grocery('Classify')
             Classify.__MODEL__.load()
Пример #13
0
    def __init__(self, *args, **kwargs):
        """Build a Grocery classifier configured by keyword arguments.

        Required kwargs: grocery_name, method ('normal' | 'jieba' |
        'processed'), train_src.
        """
        self.grocery_name = str(kwargs["grocery_name"])
        method = str(kwargs["method"])
        # NOTE(review): train_src is read (raising KeyError if missing) but
        # never used here — possibly intentional validation; confirm.
        train_src = str(kwargs["train_src"])
        self.PREFIX = conf.load("predict_label")["prefix"]
        self.MODEL_DIR = conf.load("predict_label")["model_dir"]

        self.kwargs = kwargs
        # 'normal' and 'processed' both use the custom tokenizer; 'normal'
        # additionally builds a keyword extractor, 'jieba' uses the default.
        if method == "normal":
            self.key_ext = keyExt()
            self.grocery = Grocery(self.grocery_name,
                                   custom_tokenize=self._custom_tokenize)
        elif method == "jieba":
            self.grocery = Grocery(self.grocery_name)
        elif method == "processed":
            self.grocery = Grocery(self.grocery_name,
                                   custom_tokenize=self._custom_tokenize)
        pass
Пример #14
0
def phgrocery(text):
    """Classify `text` with the 'model_redian_5' model and return the
    predicted label as an int."""
    classifier = Grocery('model_redian_5')
    classifier.load()

    prediction = classifier.predict(text)
    return int(prediction.predicted_y)
Пример #15
0
def tgrocery_train(train_data, test_data):
    """Train a TextGrocery model on train_data, reload the persisted model,
    and predict a label for every test sample.

    Returns (test_corpus, test_label, predict_label).
    """
    print("训练语料总数为: " + str(len(train_data)))
    test_corpus, test_label = test_split(test_data)

    grocery =Ocery = Grocery('TextGrocery')
    print("start training......")
    grocery.train(train_data)
    grocery.save()

    # Reload from disk so predictions come from the saved model.
    new_grocery = Grocery('TextGrocery')
    new_grocery.load()

    predict_label = [str(new_grocery.predict(sample)) for sample in test_corpus]
    return test_corpus, test_label, predict_label
Пример #16
0
    def __train__model__():
        """Train the shared 'Classify' model from the configured Excel file
        (columns 类型 = label, 释义 = text) and cache it on the class."""
        frame = pd.read_excel(Classify.__FILE_PATH__)
        pairs = frame[[u'类型', u'释义']]
        samples = [(row[0], row[1]) for row in pairs.values]

        model = Grocery('Classify')
        model.train(samples)
        model.save()
        Classify.__MODEL__ = model
Пример #17
0
    def predict_phrasing(self, text=u'曾被年轻人嫌弃,如今却媲美Zara'):
        """Load the saved model and return the decision value for the
        'postive' class (label name spelled as stored in the model).

        :param text: sentence to score.
        :return: decision value (float-like) for u'postive'.
        """
        classifier = Grocery(self.model_name)
        classifier.load()
        prediction = classifier.predict(text)
        return prediction.dec_values[u'postive']
Пример #18
0
    def __init__(self):
        """Load lexicons, the jdclf sentence classifier, and the regex bank
        used to parse job-description text into structured fields."""
        self.degreedic = set( line.strip() for line in codecs.open('./data/degrees.txt','rb','utf-8')) # load degree lexicon
        self.majordic =set( line.strip() for line in codecs.open('./data/majordic.txt','rb','utf-8')) # load major lexicon
        self.citydic = set( line.strip() for line in codecs.open("./data/citydic.txt",'rb','utf-8'))   # load city lexicon
        self.firmnames =set( line.strip() for line in codecs.open('./data/firm.txt','rb','utf-8'))    # load company-abbreviation lexicon
        self.jobdic = set(line.strip() for line in codecs.open('./data/jobposition.txt','rb','utf-8') ) # load job-position lexicon
        self.skills = set( line.strip() for line in codecs.open('./data/skills.txt','rb','utf-8'))
#        self.wordlisttf = pickle.load(open('./data/wordlist.pkl'))  # 2000 most frequent words
        # self.w2vdict = json.load(open('./data/word2vec_50.json')) # word2vec vectors for the 2000 words
        self.clf = Grocery("jdclf")        # sentence classifier: demand / duty / other
        self.clf.load()
        
        # Candidate-attribute patterns (sex, age, degree, major, experience).
        self.SEX = re.compile(u"性别不限|性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.DEGREE = re.compile(u"(全日制)?(初中|高中|中专|大专|专科|大学专科|中职|本科|大学本科|硕士|研究生|博士|博士后)(.?以上)?")
        self.MAJOR = re.compile(u"\S+(相关专业|专业优先|及其.专业|[类等]专业[优先]?)")
        self.EXP = re.compile(u"工作经验:|工作经[历验]|工作年限|年.{0,4}经[历验]|经[历验].{1,6}年")
        self.PUB_TIME = re.compile(u"(\d+)(天前发布)")
        
        # Company-name detection, with a negative pattern to reject lines
        # that merely mention company-adjacent vocabulary.
        self.INCNAME = re.compile(u"\S+(有限公司|酒店|银行|集团|研究中心|研究所|学校|旅行社|分?公司|研发中心|技术部|事.部|招聘)") 
        self.NOT_INC = re.compile(u"职位|描述|收藏|推荐|地址|邮箱|主页|介绍|欢迎|加入|要求|简介|险一金|奖金|包吃住|社区|厂房|人员|职责") 
        self.INCTAG = re.compile(u"大公司|五百强|全球500强|小公司|成长型公司|创业公司|私有经济|集体经济|集团|外企|已上市|稳定性高|平均年龄\d+岁|妹纸多|学历高|福利待遇好|晋升机会大|民营公司|民营企业|互联网|创业型|国企|央企")

        # Job-title patterns (continued across lines with backslash joins).
        self.JOBNAME = re.compile(u'\S*(研发工程师|工程师|经理|助理|顾问|前台|秘书|主管|研究员|实习生|操作员|专员|教学人员|技术人员|管理员|业务员|公关|程序员|教师|老师|培训生|\
                                  文员|研究员|策划|主任|总监|设计师|分析师|架构师|摄影师|编辑|BD|游戏UI|Android(开发)?|PHP(开发)?|Python(开发)?|.?(急招|急聘|初级|中级|高级|方向).?[\s)】\)])|\
                                  |行政人事|网店设计|客服|会计|电话销售|外贸跟单|web前端|游戏UI|后.开发|产品运营|商业数据分析')

        # Requirements ("demand") section: header pattern plus content cues.
        self.START_DEMAND = re.compile(u"(岗位要求|应聘条件|任职要求|岗位资格|任职资格|岗位条件|工作要求|任职条件|人员条件|职位.求|职位条件|职位描述|岗位资格|职位资格|具备条件)[::\s]\
                                       |如果你.{0,10}[::\s]|我们希望你.{0,12}[::\s]|(要求|条件)[::\s]|你需要?具备什么.+[?\?::\s]|任职资格[::\s]")
        self.DEMAND = re.compile(u"熟悉|熟练|具有|善于|懂得|掌握|具备|能够|优先|不少于|不超过|至少|团队.作|良好的|工作经验|开发经验|实习经历|能力强|富有|以上学历|经验|喜欢|\
                                 较强的.{2,8}能力|相关专业|相关学历|者优先|精通|了解|及以上|技术全面|.强的责任心|[能有]独立|英文流利")

        # Responsibilities ("duty") section: header pattern plus content cues.
        self.DUTY = re.compile(u"跟进|协助|负责|配合|其他工作|领导交办的|对.+提供|审核|参与|提出|跟踪|报告|为.+提出|日常.+工作|指导|对.+进行|为.+提供|跟进|拓展|运营|用户|客户|协调|拟写|通过|协同|完成|沟通|需求|秘书.{2,5}翻译")
        self.START_DUTY = re.compile(u"(岗位职责|岗位描述|职位描述|职责描述|任职描述|职位职责|工作职责|工作职能|职位职能|工作内容|实习内容|职位内容)[::\s]|做这样的事[::\s]|职责.{0,5}[::\s]")
        # Compensation and benefits cues.
        self.PAY = re.compile(u"薪酬|待遇|月薪|薪资|年薪|底薪|\d+k|\d+万|\d+元|工资|报酬|薪水|福利")
        self.BENEFIT = re.compile(u"周休|补助|补贴|假日|餐补|提成|交通补助|食宿|加班工资|期权|年假|领导|扁平化|管理|氛围|空间|休假|月假|带薪|全休|晋升|培训|舒适的|旅游|奖励|过节费|五险一金|奖金|\
        |弹性工作|氛围|成长空间|实训|培训|高薪|前景|旅游|活动|分红")
        
        # Multi-position splitting and line-cleanup helpers.
        self.SPLIT_JD = re.compile(u"岗位[【(]?[一二三四五六七八九][】)][::\s]|(^招聘岗位\S+|岗位\d|岗位[一二三四五六])[::\s]")
        self.CLEAR_NUM = re.compile(u"^\d[\.: :。、]|^[\((【]?\d[\))】\.]")
        self.CLEAR_COLO = re.compile(u"^[\s\.。)(【】,,]|[。;,\.;,]$|^\d[\.]")
        self.SKILL = re.compile(u"精通|了解|熟练|熟悉|掌握|懂得|优先|具备|具有|者优先|擅长|善于|较强的.{2,6}能力|良好的|有.+经验|能力|极强的")
        
        # Teach jieba the domain vocabulary so tokenization respects it.
        jieba.load_userdict('./data/majordic.txt')
        jieba.load_userdict('./data/skills.txt')
        jieba.load_userdict('./data/firm.txt')
        jieba.load_userdict('./data/degrees.txt')
        jieba.load_userdict('./data/benefits.txt')


        # Per-document parsing state, reset for each JD processed.
        self.jdStr = ""
        self.linelist = []
        self.lineindex = defaultdict(int)
        self.result = OrderedDict() 
Пример #19
0
def sentiment_train(gro_name, train_set):
    """Train an SVM sentiment classifier named `gro_name` on `train_set`
    and persist it to disk.

    :param gro_name: name under which the model is created and saved.
    :param train_set: training samples accepted by Grocery.train().
    :return: None
    """
    classifier = Grocery(gro_name)
    # classifier.load()
    classifier.train(train_set)
    print("Is trained? ", classifier.get_load_status())
    classifier.save()
Пример #20
0
def train(train_origin_path, fold):
    """Train the model for one cross-validation fold and save it.

    Each line of `train_origin_path` is '<label>|text|<text>'; labels are
    mapped through classify_dict and yiji_label before training.
    """
    grocery = Grocery('cv_' + str(fold) +
                      '_model')  #, custom_tokenize=segment)

    samples = []
    with open(train_origin_path) as f:
        for raw_line in f:
            label, text = raw_line.strip().split("|text|")
            samples.append((yiji_label[classify_dict[label]], text))

    grocery.train(samples)
    grocery.save()
Пример #21
0
    def train_phrasing_and_save(self, trainsets=all):
        """Train the model named self.model_name on `trainsets` and save it.

        :param trainsets: training samples accepted by Grocery.train().
            NOTE(review): the default is the *builtin* `all` function —
            almost certainly unintended, but kept for interface
            compatibility.
        :return: True on success, False on any failure.
        """
        try:
            grocery = Grocery(self.model_name)
            grocery.train(trainsets)
            grocery.save()
            return True
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            return False
Пример #22
0
    def __init__(self):
        """Load the jdclf sentence classifier and the regex bank used to
        parse job-description text into structured fields."""
        self.data = []
        self.clf = Grocery("jdclf")  # sentence classifier: demand / duty / other
        self.clf.load()

        # Candidate-attribute patterns (sex, age, degree, major, experience).
        self.SEX = re.compile(u"性别不限|性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.DEGREE = re.compile(
            u"(全日制)?(初中|高中|中专|大专|专科|大学专科|中职|本科|大学本科|硕士|研究生|博士|博士后)(.?以上)?")
        self.MAJOR = re.compile(u"\S+(相关专业|专业优先|及其.专业|[类等]专业[优先]?)")
        self.EXP = re.compile(u"工作经验:|工作经[历验]|工作年限|年.{0,4}经[历验]|经[历验].{1,6}年")
        self.PUB_TIME = re.compile(u"(\d+)(天前发布)")

        # Company-name detection, with a negative pattern to reject lines
        # that merely mention company-adjacent vocabulary.
        self.INCNAME = re.compile(
            u"\S+(有限公司|酒店|银行|集团|研究中心|研究所|学校|旅行社|分?公司|研发中心|技术部|事.部|招聘)")
        self.NOT_INC = re.compile(
            u"职位|描述|收藏|推荐|地址|邮箱|主页|介绍|欢迎|加入|要求|简介|险一金|奖金|包吃住|社区|厂房|人员|职责")
        self.INCTAG = re.compile(
            u"大公司|五百强|全球500强|小公司|成长型公司|创业公司|私有经济|集体经济|集团|外企|已上市|稳定性高|平均年龄\d岁|妹纸多|学历高|福利待遇好|晋升机会大|民营公司|民营企业\
                                 |互联网|创业型|国企|央企")

        # Job-title pattern (continued across lines with backslash joins).
        self.JOBNAME = re.compile(
            u'\S*(研发工程师|工程师|经理|助理|顾问|前台|秘书|主管|研究员|实习生|操作员|专员|教学人员|技术人员|管理员|业务员|公关|程序员|教师|老师|培训生|\
                                  文员|研究员|策划|主任|总监|设计师|分析师|架构师|摄影师|编辑|BD|游戏UI|Android(开发)?|PHP(开发)?|Python(开发)?|.?(急招|急聘|初级|中级|高级|方向).?[\s)】\)])|\
                                  |行政人事|网店设计|客服|会计|电话销售|外贸跟单|web前端|游戏UI|后.开发|产品运营|商业数据分析'
        )

        # Requirements ("demand") section: header pattern plus content cues.
        self.START_DEMAND = re.compile(
            u"(岗位要求|应聘条件|任职要求|岗位资格|任职资格|岗位条件|工作要求|任职条件|人员条件|职位.求|职位条件|职位描述|岗位资格|职位资格|具备条件)[::\s]\
                                       |如果你.{0,10}[::\s]|我们希望你.{0,12}[::\s]|(要求|条件)[::\s]|你需要?具备什么.+[?\?::\s]|任职资格[::\s]"
        )

        self.DEMAND = re.compile(
            u"熟悉|熟练|具有|善于|懂得|掌握|具备|能够|优先|不少于|不超过|至少|团队.作|良好的|工作经验|开发经验|实习经历|能力强|富有|以上学历|经验|喜欢|\
                                 较强的.{2,8}能力|相关专业|相关学历|者优先|精通|了解|及以上|技术全面|.强的责任心|[能有]独立|英文流利"
        )

        # Responsibilities ("duty") section: header pattern plus content cues.
        self.DUTY = re.compile(
            u"跟进|协助|负责|配合|其他工作|领导交办的|对.+提供|审核|参与|提出|跟踪|报告|为.+提出|日常.+工作|指导|跟进|拓展|运营|用户|客户|协调|拟写|通过|协同\
                               |完成|沟通|需求|秘书.{2,5}翻译")

        self.START_DUTY = re.compile(
            u"(岗位职责|岗位描述|职位描述|职责描述|任职描述|职位职责|工作职责|工作职能|职位职能|工作内容|实习内容|职位内容)[::\s]|做这样的事[::\s]|职责.{0,5}[::\s]"
        )

        # Compensation and benefits cues.
        self.PAY = re.compile(u"薪酬|待遇|月薪|薪资|年薪|底薪|\d+k|\d+万|\d+元|工资|报酬|薪水|福利")

        self.BENEFIT = re.compile(
            u"周休|补助|补贴|假日|餐补|提成|交通补助|食宿|加班工资|期权|年假|领导|扁平化|管理|氛围|空间|休假|月假|带薪|全休|晋升|培训|舒适的|旅游|奖励|过节费|五险一金|奖金|\
                                  |弹性工作|氛围|成长空间|实训|培训|高薪|前景|旅游|活动|分红")
Пример #23
0
def demo_flask(image_file):
    """OCR a business-licence image and extract the company name and ID.

    Loads the 'NameIdAdd_NLP' text classifier by wiring a GroceryTextModel
    directly onto a Grocery instance, runs whole-image OCR, then routes each
    recognised line either by hard-coded keyword rules or, as a fallback, by
    the classifier. Returns (annotated_image_path, company_id, company_name).
    """
    grocery = Grocery('NameIdAdd_NLP')
    model_name = grocery.name
    text_converter = None
    # Attach the persisted text model manually instead of grocery.load().
    tgm = GroceryTextModel(text_converter, model_name)
    tgm.load(model_name)
    grocery.model = tgm

    t = time.time()
    result_dir = './result'
    image = np.array(Image.open(image_file).convert('RGB'))
    result, image_framed = ocr_whole.model(image)
    output_file = os.path.join(result_dir, image_file.split('/')[-1])
    Image.fromarray(image_framed).save(output_file)
    name_total = ''
    id_total = ''
    for key in result:
        string1 = result[key][1]
        # Very short lines carry no useful field information; skip them.
        if len(string1) <= 8:
            continue
        # Strip punctuation and whitespace before rule matching.
        string2 = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*{}[]+", "", string1)
        # Digit count (via gbk bytes) and letter count help spot the
        # unified social-credit code line.
        no_digit = len(list(filter(str.isdigit, string2.encode('gbk'))))
        no_alpha = len(list(filter(is_alphabet, string2)))
        # Keyword rules first; the classifier is only consulted when no
        # rule fires.
        if len(set('法定代表人') & set(string2)) >= 2 or len(set('经营范围') & set(string2)) >= 2 or '资本' in string2 or '类型' in string2 or len(set('年月日') & set(string2)) >= 2 or len(set('登记机关') & set(string2)) >= 2 or '电话' in string2:
            predict_result = 'others'
        elif len(set('经营场所') & set(string2)) >= 3 or '住所' in string2 or len(set('营业场所') & set(string2)) >= 3:
            predict_result = 'company-address'
        elif len(set('统一社会信用代码') & set(string2)) >= 2 or ((no_digit+no_alpha) / len(string2) > 0.5 and no_digit > 8):
            predict_result = 'company-id'
        elif '名称' in string2:
            predict_result = 'company-name'
        else:
            predict_result = grocery.predict(string2)
        # First company-name hit wins; ID lines are accumulated.
        if str(predict_result) == 'company-name':
            name_total += string1
            break
        elif str(predict_result) == 'company-id':
            id_total += string1
        else:
            continue
    # Keep only word characters in the ID and post-correct the name.
    id_total = re.sub(r'\W', '', id_total)
    name_total = stupid_revise(name_total)
    print("Mission complete, it took {:.3f}s".format(time.time() - t))
    print('\nRecongition Result:\n')
    print(id_total)
    print(name_total)
    return output_file, id_total, name_total
Пример #24
0
def learn_model(file_name):
    """Train a classifier from an uploaded Excel file and report accuracy.

    The file must contain Tag (label) and Comment (text) columns. The rows
    are split 3:2 into a training and a test document, a model is trained
    and saved, and the test accuracy is returned in the result message.
    Returns an {'IsErr', 'ErrDesc'} dict in all cases.
    """
    path = os.path.join(BASE_DIR, 'learn', file_name)
    try:
        df = pd.read_excel(path)
    except Exception as e:
        return {'IsErr': True, 'ErrDesc': u'找不到文档或者读取文档出错'}
    try:
        # Drop rows with missing values, then normalise each comment.
        df = df.dropna(axis=0)
        df = df.apply(split_comment, axis=1)
    except Exception as e:
        return {'IsErr': True, 'ErrDesc': u'文档格式有误,应包含Tag(标签名字),Comment(评价内容)'}

    try:
        # Split into training and test groups at a 3:2 ratio.
        # NOTE(review): relies on Python 2 integer division here.
        len_learn = len(df) / 5 * 3
        # Generate the training and test documents on disk.
        learn_file_name, test_file_name = output_file(file_name, df, len_learn)
        tmp_learn_name = os.path.join(BASE_DIR, 'learn',
                                      'model_' + learn_file_name.split('.')[0])
        grocery = Grocery(tmp_learn_name.encode('utf-8'))
        path = os.path.join(BASE_DIR, 'learn', learn_file_name)
        grocery.train(path.encode('utf-8'))
        grocery.save()
    except Exception as e:
        return {'IsErr': True, 'ErrDesc': u'学习不成功,没有生产新的模型,请再次尝试。'}

    # Evaluate the saved model on the held-out test document.
    res = test_sample(tmp_learn_name, test_file_name)

    return {
        'IsErr':
        False,
        'ErrDesc':
        u'成功生产新的模型,测试验证的正确率为%s, 模型保存为:%s' %
        (res, os.path.split(tmp_learn_name)[1])
    }
Пример #25
0
# coding=utf-8
# Demo script: train a tiny two-class (education/sports) classifier,
# persist it, reload it, and run a prediction plus a small test set.
from tgrocery import Grocery

grocery = Grocery('sample')

# (label, text) training pairs: two education and two sports headlines.
train_src = [('education', '名师指导托福语法技巧:名词的复数形式'),
             ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
             ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
             ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')]
grocery.train(train_src)
#grocery.train('/home/wangjianfei/git/data/train_ch.txt')
# grocery.train('train_ch.txt')
grocery.save()
# Reload the persisted model under the same name before predicting.
new_grocery = Grocery('sample')
new_grocery.load()
print(
    new_grocery.predict(
        'Abbott government spends $8 million on higher education media blitz'))
test_src = [
    ('education', '福建春季公务员考试报名18日截止 2月6日考试'),
    ('sports', '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'),
]
print("start test..................")
#grocery.test('/home/wangjianfei/git/data/test.txt')
# grocery.train('train_ch.txt'))
# custom_grocery = Grocery('custom', custom_tokenize=list)
print(new_grocery.test(test_src))
Пример #26
0
# coding: utf-8
# Demo script: train from an in-memory list, then from a labelled text
# file, printing load status and decision values each time.
from tgrocery import Grocery

# Train from an in-memory list of (label, text) pairs.
grocery = Grocery('test')
train_src = [('education', '名师指导托福语法技巧:名词的复数形式'),
             ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
             ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
             ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')]
grocery.train(train_src)
print(grocery.get_load_status())
predict_result = grocery.predict('考生必读:新托福写作考试评分标准')
print(predict_result)
print(predict_result.dec_values)

# Same flow, but training from a labelled text file on disk.
grocery = Grocery('read_text')
train_src = '../text_src/train_ch.txt'
grocery.train(train_src)
print(grocery.get_load_status())
predict_result = grocery.predict('考生必读:新托福写作考试评分标准')
print(predict_result)
print(predict_result.dec_values)
Пример #27
0
# -*- coding: utf-8 -*-
# Module preamble: load the financial-text classifier and set up the
# Elasticsearch client used by the search helpers below.
import sys
reload(sys)
sys.path.append('../../')
from config import *

from tgrocery import Grocery
STOP_WORDS_FILE = 'stopwords.txt'
USER_DICT_FILE = 'user_dict.txt'

# Load the classifier once at module import time.
model_fintext = Grocery('model_fintext')
model_fintext.load()
sys.path.append('../')
from get_es import *
es = Elasticsearch([{'host':ES_HOST,'port':ES_PORT}])

def search(index_name):
    """Run the preconfigured ES query against `index_name` and return the
    raw search result."""
    # final_result = get_result_list(es_result)
    options = set_search_optional()
    return get_search_result(options, index=index_name)


def get_result_list(es_result):
    """Extract the '_source' document from each Elasticsearch hit."""
    return [hit['_source'] for hit in es_result]

#!/usr/bin/env python
# coding=utf-8
# Demo script: evaluate the saved 'age' model on a space-delimited test
# file (training was done once and is left commented out).
from tgrocery import Grocery

#grocery = Grocery('age56')
#grocery.train('train4_age_56', ' ')
#grocery.save()

new_grocery = Grocery("age")
new_grocery.load()
# Evaluate on 'test4_age'; the second argument is the field delimiter.
predict_result = new_grocery.test('test4_age', ' ')
#print len(predict_result.true_y)
#for i in range(len(predict_result.predicted_y)):
#print predict_result.predicted_y[i]
print predict_result
predict_result.show_result()
Пример #29
0
# -*- coding:utf-8 -*-
# Module preamble: imports, ES client, and the shared financial-text
# classifier loaded once at import time.
import sys
reload(sys)
sys.path.append('../../')
from config import *
from es import es214 as es
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import TransportError
from elasticsearch.helpers import bulk
from tgrocery import Grocery
model_fintext = Grocery('../fintext_classify/model_fintext')
model_fintext.load()

def match_topic_kw(news_id, keywords_list, source, doc_type, size=10000):
    """Search `source`/`doc_type` for documents whose content matches the
    concatenated keywords.

    NOTE(review): `result` and `es_result` are assigned but never used or
    returned in this excerpt — the function appears truncated; confirm
    against the original source.
    """
    result = []
    keyword_str = ''.join(keywords_list)
    # Look up related texts via one concatenated keyword string.
    query_body = {
        "query": {
            "match": {
                "content": keyword_str  # may still need changing — ideally pass a list here
            }
        },
        "size": size
    }
    # print keyword_str
    es_result = es.search(index=source,
                          doc_type=doc_type,
                          body=query_body,
                          request_timeout=400)
Пример #30
0
# coding: utf-8
# Demo script: the canonical tgrocery save/load round trip.
from tgrocery import Grocery

# save
grocery = Grocery('test')
# (label, text) pairs: two education and two sports headlines.
train_src = [('education', '名师指导托福语法技巧:名词的复数形式'),
             ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
             ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
             ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')]
grocery.train(train_src)
grocery.save()

# load
# grocery name must be the same as the previous one
new_grocery = Grocery('test')
new_grocery.load()
print new_grocery.predict('考生必读:新托福写作考试评分标准')