Example #1
class AutoGrocery(object):
    """

    """
    def __init__(self, name, train_data):
        self._train_data = train_data
        self._grocery = Grocery(project_dir + '/models/model_data/' + name)

    def train(self):
        self._grocery.train(self._train_data)

    def save(self):
        self._grocery.save()

    def load(self):
        self._grocery.load()

    def predicate(self, src):
        if not self._grocery.get_load_status():
            try:
                self.load()
            except ValueError:
                self.train()
                self.save()
        pr = self._grocery.predict(src)
        label = pr.predicted_y
        return label, pr.dec_values[label]
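A minimal usage sketch for the wrapper above; project_dir and the sample labels are assumptions borrowed from other examples on this page, not part of the original snippet:
# Hypothetical usage of AutoGrocery; assumes project_dir is defined and
# train_data is a list of (label, text) pairs as tgrocery expects.
train_data = [
    ('education', '考生必读:新托福写作考试评分标准'),
    ('sports', '法网'),
]
clf = AutoGrocery('demo', train_data)
label, dec_value = clf.predicate('新托福写作技巧')  # loads a saved model, or trains and saves one
print label, dec_value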
Example #2
def tGrocery():
    outFile = open('testResult.tmp', 'w')
    [trainingSet, benchmark] = pickle.load(open('SampleSeg.pk', 'rb'))  # pickles should be opened in binary mode
    testingSet = []
    correctLabel = []
    for i in xrange(len(benchmark)):
        print '%d out of %d' % (i, len(benchmark))
        testingSet.append(benchmark[i][1])
        correctLabel.append(benchmark[i][0]) 
    grocery = Grocery('test')
    grocery.train(trainingSet)
    grocery.save()
    # load
    new_grocery = Grocery('test')
    new_grocery.load()
    Prediction = []
    for i in xrange(len(testingSet)):
        print '%d out of %d' % (i, len(testingSet))
        prediction = new_grocery.predict(testingSet[i])
        Prediction.append(prediction)
        temp = correctLabel[i] + '<-->' + prediction + '\x01' + testingSet[i] + '\n'
        outFile.write(temp)
    correct = 0
    for i in xrange(len(Prediction)):
        print Prediction[i], correctLabel[i],
        if Prediction[i] == correctLabel[i]:
            correct += 1
            print 'Correct'
        else:
            print 'False'
    print 'Correct Count:', correct
    print 'Accuracy: %f' % (1.0 * correct / len(Prediction))
Example #3
def test(test_path):
    new_grocery = Grocery('cv_' + str(fold) +
                          '_model')  #, custom_tokenize=segment)
    new_grocery.load()
    test_src = []
    with open(test_path) as f:
        for line in f:
            label, text = line.strip().split("|text|")
            label = yiji_label[classify_dict[label]]
            test_src.append((label, text))
    test_result = new_grocery.test(test_src)
    #print test_result
    #print test_result.accuracy_overall
    #accs = test_result.accuracy_labels
    recalls = test_result.recall_labels
    #print "Recall for each class: ", recalls
    predictlabels = test_result.predicted_y
    truelabels = test_result.true_y
    acc = accuracy_score(truelabels, predictlabels)
    macro_precision, macro_recall, macro_fscore, _ = precision_recall_fscore_support(
        truelabels, predictlabels, average='macro')
    print "Accuracy: ", acc, "Macro-average Precision:", macro_precision, "Macro-average Recall:", macro_recall, "Macro-average Fscore:", macro_fscore
    labellist = [
        'safe_and_stable', 'industrial_information', 'politics',
        'culture_health', 'social_livelihood', 'economic_and_financial'
    ]
    precision, recall, fscore, _ = precision_recall_fscore_support(
        truelabels, predictlabels, average=None, labels=labellist)
    precisions = dict()
    recalls = dict()
    for idx, p in enumerate(precision):
        precisions[labellist[idx]] = p
    for idx, c in enumerate(recall):
        recalls[labellist[idx]] = c
Example #4
File: classify.py Project: Honlan/CleverTL
    def __init__(self, keyword):
        print '进行新闻分类'  # "classifying news"
        (db, cursor) = connectdb()
        cursor.execute("update task set status=1 where keyword=%s", [keyword])
        cursor.execute("select id, title from news where keyword=%s",
                       [keyword])
        news = cursor.fetchall()
        new_grocery = Grocery('static/paris')
        new_grocery.load()

        for item in news:
            tag = new_grocery.predict(item['title'])
            if tag == '新闻背景':
                tag = 1
            elif tag == '事实陈述':
                tag = 2
            elif tag == '事件演化':
                tag = 3
            elif tag == '各方态度':
                tag = 4
            elif tag == '直接关联':
                tag = 6
            elif tag == '暂无关联':
                tag = 7
            cursor.execute("update news set tag=%s where id=%s",
                           [tag, item['id']])
        closedb(db, cursor)
        return
Example #5
def predict_test(model_path, data):
    # Load the model
    try:
        model_path = os.path.join(BASE_DIR, 'learn', model_path)
        new_grocery = Grocery(model_path.encode('utf-8'))
        new_grocery.load()
    except Exception as e:
        return {'IsErr': True, 'ErrDesc': u'学习模型加载不成功,请检查路径'}  # "failed to load the model; check the path"
    # Normalize the input data
    result = list()
    sentences = data.split(';')
    if sentences[-1] == '':
        sentences.pop()
    if len(sentences) == 0:
        return {'IsErr': True, 'ErrDesc': u'输入的句子结构有错误或没有数据'}  # "malformed input sentences or no data"

    # Tokenize, then classify
    stop_words = read_lines(os.path.join(BASE_DIR, 'learn', 's_w.txt'))
    for s in sentences:
        tmp_s = ''
        words = jieba.cut(s)
        for word in words:
            if word in stop_words:
                continue
            else:
                tmp_s += word + ' '
        result.append({
            'tag': str(new_grocery.predict(tmp_s.strip().encode('utf-8'))),
            'sentence': s,
        })
    return {'IsErr': False, 'ErrDesc': u'成功', 'data': result}  # ErrDesc: "success"
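A hedged call sketch for predict_test; the model name and sentences are hypothetical, while BASE_DIR and read_lines come from the source project:
# Hypothetical call; input is semicolon-separated sentences, as the
# function above expects.
res = predict_test(u'my_model', u'质量很好,物流也快;包装一般')
if not res['IsErr']:
    for item in res['data']:
        print item['tag'], item['sentence']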
Example #6
class GroceryModel(object):
    def __init__(self):
        self.grocery = Grocery('TextClassify')
    
    def train(self,train_file):
        f = open(train_file,'r')
        line = f.readline().decode('utf8')
        dataset = []
        while line:
            tmp = line.split('\t')
            dataset.append((tmp[0],''.join(tmp[1:])))
            line = f.readline().decode('utf8')
        f.close()
        self.grocery.train(dataset)
        self.grocery.save()
    
    def load_model(self):
        self.grocery.load()
    
    def test(self,test_src):
        self.load_model()
        f = open(test_src,'r')
        line = f.readline().decode('utf8')
        dataset = []
        while line:
            tmp = line.split('\t')
            dataset.append((tmp[0],''.join(tmp[1:])))
            line = f.readline().decode('utf8')
        f.close()
        result = self.grocery.test(dataset)
        print result
    
    def predict(self,text):
        print self.grocery.predict(text)
Example #7
def get_data(ids, b_date, end_date, log, stop_word):
    b_date = b_date.strftime('%Y-%m-%d')
    end_date = end_date.strftime('%Y-%m-%d')
    # Choose the data source
    df = load_data(ids, b_date, end_date)
    # df = load_data_excel()
    # df = pd.read_excel('data_treasure.xls')
    df['RateDate'] = pd.to_datetime(df['RateDate'])
    # df_group = df['RateDate'].groupby([df.RateDate.values.astype('datetime64[D]')]).size()
    res = list()
    log.info('Have %d comments to process' % len(df))
    # Load the classification model
    new_grocery = Grocery('sample2')
    new_grocery.load()
    for record_data in range(0, len(df)):
        # slice content by date
        # tmp_df = df[df['RateDate'] > df_group.index[record_data]][df['RateDate'] < df_group.index[record_data + 1]]
        # natural-language processing
        content_sw, level, tag = nlp_process_with_sw(df.iloc[record_data],
                                                     new_grocery, stop_word)
        # record the result
        res.append({
            'RateContent': json.dumps(content_sw, ensure_ascii=False),
            'RateDate': df.iloc[record_data]['RateDate'],
            'TreasureID': df.iloc[record_data]['TreasureID'],
            'Level': level,
            'Tag': tag,
            'Sentence': df.iloc[record_data]['RateContent'],
        })
    return res
Example #9
	def labelmaker(self):
		result = []
		grocery = Grocery('11c_20k_20171226')
		grocery.load()
		label_confidence = sorted(grocery.predict(self.shorttext).dec_values.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)[0]
		result.append(label_confidence[0])  # label with the highest confidence
		result.append(label_confidence[1])  # its decision value (confidence)
		return result
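The cmp-style sorted call above is Python 2 only; under Python 3 the same top label can be taken with max and a key function (a sketch, assuming dec_values maps label to decision value as above):
# Python 3 sketch: pick the (label, confidence) pair with the highest
# decision value from dec_values.
pr = grocery.predict(short_text)  # short_text: the text to classify
label, confidence = max(pr.dec_values.items(), key=lambda kv: kv[1])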
Example #10
파일: grocery.py 프로젝트: SwoJa/ruman
def test_grocery():
    grocery = Grocery('model_redian')
    grocery.train('trdata_4.txt')
    grocery.save()
    new_grocery = Grocery('model_redian')
    new_grocery.load()
    test_result = new_grocery.test('tedata_4.txt')
    print test_result.accuracy_labels
    print test_result.recall_labels
    test_result.show_result()
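When train/test are given a file path, as above, tgrocery reads one sample per line with the label and text separated by a tab by default; a sketch of what trdata_4.txt might contain (labels invented for illustration):
# Hypothetical contents of trdata_4.txt, one tab-separated sample per line:
#   hot	某条热点新闻的文本
#   normal	某条普通新闻的文本
grocery.train('trdata_4.txt', delimiter='\t')  # delimiter shown explicitly for clarity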
Example #11
def phgrocery(text):
    # result_text = []
    model_grocery = Grocery('model_redian_5')
    model_grocery.load()

    result = int(model_grocery.predict(text).predicted_y)
    # print result
    # if result == 1:
    #     result_text.append(text)
    return result
Example #13
	def GET(self,name):
		#i = web.input(name=None)	
		#url = "http://"+name
		#html = urllib2.urlopen(url).read()
		#soup = BeautifulSoup(html)
		#title =  soup.html.head.title.contents.pop().encode('utf-8')
		title = name.encode('utf-8')
		new_grocery = Grocery('sample')
		new_grocery.load()
		return new_grocery.predict(title)
Example #14
    def test_main(self):
        grocery = Grocery(self.grocery_name)
        grocery.train(self.train_src)
        grocery.save()
        new_grocery = Grocery('test')
        new_grocery.load()
        assert grocery.get_load_status()
        assert grocery.predict('考生必读:新托福写作考试评分标准') == 'education'
        # cleanup
        if self.grocery_name and os.path.exists(self.grocery_name):
            shutil.rmtree(self.grocery_name)
Example #15
    def predict_phrasing(self, text=u'曾被年轻人嫌弃,如今却媲美Zara'):
        '''
        Score the text against the loaded model and return the decision
        value for the u'postive' label (spelling as in the training data).

        :param text: text to classify
        :return: decision value for the u'postive' label
        '''
        new_grocery = Grocery(self.model_name)
        new_grocery.load()
        result = new_grocery.predict(text)
        return result.dec_values[u'postive']
Example #17
    def test_main(self):
        grocery = Grocery(self.grocery_name)
        grocery.train(self.train_src)
        grocery.save()
        new_grocery = Grocery('test')
        new_grocery.load()
        assert grocery.get_load_status()
        result = grocery.predict('just a testing')
        print(result)
        result = grocery.predict('考生必读:新托福写作考试评分标准')
        print(result)
        print("type of result is:", type(result))
        assert str(grocery.predict('考生必读:新托福写作考试评分标准')) == 'education'
        assert str(grocery.predict('法网')) == 'sports'
        # cleanup
        if self.grocery_name and os.path.exists(self.grocery_name):
            shutil.rmtree(self.grocery_name)
Example #18
class MyGrocery(object):
  def __init__(self, name):
    super(MyGrocery, self).__init__()
    self.grocery = Grocery(name)
    self.loaded = False
    self.correct = 1.0

  def train(self, src):
    lines = []
    for line in csv.reader(open(src)):
      label, s = line[0],line[1]
      text = s.decode('utf8')
      lines.append((label, text))
    self.grocery.train(lines)

  def save_model(self):
    self.grocery.save()

  def train_and_save(self, src):
    self.train(src)
    self.save_model()

  def load_model(self):
    if not self.loaded:
      self.grocery.load()
      self.loaded = True

  def predict(self, text):
    self.load_model()
    return self.grocery.predict(text)

  def test(self, src):
    self.load_model()
    total, wrong_num = 0.0, 0.0
    for line in csv.reader(open(src)):
      total += 1
      if line[0] != self.predict(line[1]):
        wrong_num += 1

    print "load test file from " + src
    correct = (total - wrong_num ) / total
    self.correct = correct
    print "total: %d , wrong_num: %d, success percentage: %f" %(total, wrong_num, correct)
    result = dict(type="test", total=total, wrong_num=wrong_num, correct=correct)
    return json.dumps(result)
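A hedged usage sketch for MyGrocery; the CSV paths are hypothetical, and each row must be label,text as train/test above assume:
# Hypothetical usage; train.csv / test.csv hold label,text rows.
clf = MyGrocery('demo_model')
clf.train_and_save('train.csv')
print clf.predict(u'考生必读:新托福写作考试评分标准')
print clf.test('test.csv')  # JSON string with total, wrong_num and correct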
Example #19
def tgrocery_train(train_data, test_data):
    '''Train a TextGrocery model, then predict labels for the test corpus.'''
    print("训练语料总数为: " + str(len(train_data)))  # total number of training samples
    test_corpus, test_label = test_split(test_data)

    grocery = Grocery('TextGrocery')
    print("start training......")
    grocery.train(train_data)
    grocery.save()
    new_grocery = Grocery('TextGrocery')
    new_grocery.load()

    predict_label = []
    for sample in test_corpus:
        label = new_grocery.predict(sample)
        predict_label.append(str(label))
    # print(predict_label)
    return test_corpus, test_label, predict_label
Example #20
def predict_corpus(input_file, output_csv):
    import csv
    csvfile = file(output_csv, 'wb')
    writer = csv.writer(csvfile)
    corpus = []
    f = xlrd.open_workbook(input_file)
    table = f.sheet_by_name('Sheet1')
    nrows = table.nrows  # number of rows
    for rownum in range(0, nrows):
        row = table.row_values(rownum)
        corpus.append(row[2].strip())
    corpus_grocery = Grocery(project_name)
    corpus_grocery.load()
    output = []
    for sentence in corpus:
        predict = corpus_grocery.predict(sentence)
        output.append((sentence,predict))
    writer.writerows(output)
    print('Done!')
    csvfile.close()
Example #21
class jdParser(object):

    def __init__(self):
        self.clf = Grocery("./jdclf")
        self.clf.load()
        self.LINE_SPLIT = re.compile(u"[;。;\n]")

    def get_demand_and_duty(self, jdstr):
        linelist = [line.strip() for line in self.LINE_SPLIT.split(jdstr)
                    if len(line.strip()) > 4]

        result = {}
        demand = []
        duty = []
        for line in linelist:
            pred = str(self.clf.predict(line))
            if pred == "demand":
                demand.append(line)
            elif pred == "duty":
                duty.append(line)

        result['demand'] = '\n'.join(demand)
        result['duty'] = '\n'.join(duty)
        return result
Example #23
class TagPredictor(object):
    def _custom_tokenize(self, line, **kwargs):
        try:
            kwargs["method"]
        except:
            method = str(self.kwargs["method"])
        else:
            method = str(kwargs["method"])
        if method == "normal":
            tokens = self.key_ext.calculateTokens(line,
                                                  doc_len_lower_bound=5,
                                                  doc_len_upper_bound=500,
                                                  method="normal")
        elif method == "processed":
            tokens = line.split(',')
        return tokens

    def __init__(self, *args, **kwargs):
        self.grocery_name = str(kwargs["grocery_name"])
        method = str(kwargs["method"])
        train_src = str(kwargs["train_src"])
        self.PREFIX = conf.load("predict_label")["prefix"]
        self.MODEL_DIR = conf.load("predict_label")["model_dir"]

        self.kwargs = kwargs
        if method == "normal":
            self.key_ext = keyExt()
            self.grocery = Grocery(self.grocery_name,
                                   custom_tokenize=self._custom_tokenize)
        elif method == "jieba":
            self.grocery = Grocery(self.grocery_name)
        elif method == "processed":
            self.grocery = Grocery(self.grocery_name,
                                   custom_tokenize=self._custom_tokenize)
        pass

    def trainFromDocs(self, *args, **kwargs):
        model = self.grocery.train(self.kwargs["train_src"])
        return model

    def autoEvaluation(self, *args, **kwargs):
        prune_threshold = float(kwargs["threshold"])
        excluded_labels = kwargs["excluded_labels"]
        excluded_docs = kwargs["excluded_docs"]

        train_data = []
        with open(self.kwargs["train_src"], 'rb') as f:
            for line in f:
                try:
                    line.split('\t', 1)[1]
                except:
                    continue
                else:
                    train_data.append(
                        (line.split('\t', 1)[0],
                         line.split('\t', 1)[1].split('\n', 1)[0]))
        f.close()

        print "#items before filtering:", len(train_data)
        print "-- Now we filter out the excluded docs --"
        train_data = [i for i in train_data if i[1] not in excluded_docs]
        print "#items after filtering:", len(train_data)
        print "-- Now we filter out the excluded labels --"
        train_data = [i for i in train_data if i[0] not in excluded_labels]
        print "#items after filtering:", len(train_data)

        n = len(train_data)  #number of rows in your dataset
        indices = range(n)
        indices = shuffle(indices)
        train_set = map(lambda x: train_data[x], indices[:n * 10 // 10])
        test_set = map(lambda x: train_data[x], indices[:n * 10 // 10])

        self.grocery.train(train_set)
        test_result = self.grocery.test(test_set)
        print '-- Accuracy after training --'
        print 'Accuracy, A-0:', test_result

        low_recall_label = []
        for item in test_result.recall_labels.items():
            if item[1] < prune_threshold:
                low_recall_label.append(item[0])
        new_train_set = [
            item for item in train_set if item[0] not in low_recall_label
        ]
        new_test_set = [
            item for item in train_set if item[0] not in low_recall_label
        ]

        self.grocery.train(new_train_set)
        new_test_result = self.grocery.test(new_test_set)

        print '-- Accuracy after training, with low-recall labels (less than', str(
            prune_threshold * 100), '%) pruned --'
        print 'Accuracy, A-1:', new_test_result

        return self.grocery, new_test_result

    def manualEvaluation(self, *args, **kwargs):
        n_docs = int(kwargs["n_docs"])
        excluded_labels = kwargs["excluded_labels"]
        excluded_docs = kwargs["excluded_docs"]

        train_data = []
        with open(self.kwargs["train_src"], 'rb') as f:
            for line in f:
                try:
                    line.split('\t', 1)[1]
                except:
                    continue
                else:
                    train_data.append(
                        (line.split('\t', 1)[0],
                         line.split('\t', 1)[1].split('\n', 1)[0]))
        f.close()

        train_data = [
            item for item in train_data if item[0] not in excluded_labels
        ]
        train_data = [i for i in train_data if i[1] not in excluded_docs]

        n = len(train_data)  #number of rows in your dataset
        indices = range(n)
        indices = shuffle(indices)
        test_set = map(lambda x: train_data[x], indices[0:n_docs])
        g = self.loadTrainModel()
        test_result = g.test(test_set)
        return test_set, test_result

    def saveTrainModel(self, *args, **kwargs):
        self.grocery.save()
        os.rename(
            self.PREFIX + self.grocery_name + '_train.svm',
            self.PREFIX + self.MODEL_DIR + self.grocery_name + '_train.svm')
        return

    def loadTrainModel(self, *args, **kwargs):
        os.rename(
            self.PREFIX + self.MODEL_DIR + self.grocery_name + '_train.svm',
            self.PREFIX + self.grocery_name + '_train.svm')
        self.grocery.load()
        os.rename(
            self.PREFIX + self.grocery_name + '_train.svm',
            self.PREFIX + self.MODEL_DIR + self.grocery_name + '_train.svm')
        return self.grocery

    def predict(self, line, **kwargs):
        tag = self.grocery.predict(line)
        return tag

    def test(self, *args, **kwargs):
        test_src = str(kwargs["test_src"])
        test_result = self.grocery.test(test_src)
        print "Total Accuracy", test_result

        return test_result
Example #24
class JdParserTop(object):
    def __init__(self):
        self.CLEAN_TEXT = re.compile(
            u"[^\u4e00-\u9fa5\w\d;::;,。、\.,。!!@()\r\n\(\)\-\+ - ]")

        self.clf = Grocery(base_dir + "/jdclf")
        self.clf.load()

        self.SPLIT_LINE = re.compile(u"[\r\n;::。!?;]|[ \s \xa0\u724b]{4,}")
        self.CLEAN_LINE = re.compile(
            u"^[\u2022(【\[\s\t\r\n\(\-  ]?[\da-z12345789]{1,2}[\.,。、,::)】\]\)\s]|^[!@#¥%……&×()\(\){}:“|、-\-,。::\.]|^[一二三四五六七八九123456789\d]{0,2}[\.、\s:: ]|[,;。、\s \.]$|^[\s \u2022 \uff0d \u25cf]"
        )
        self.CLEAN_JOBNAME = re.compile(
            u"急聘|诚聘|高薪|包[食住宿餐]|.险一金|待遇|^急?招|职位编号\s?[\s\d::]")

        self.PAY = re.compile("(\d{3,}\-)?\d{3,}元")
        self.SEX = re.compile(u"性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.JOB_TAG = re.compile(u"全职|实习")
        self.DEGREE = re.compile(
            u"小学|初中|高中|职技|本科|研究生|硕士|博士|教授|专科|大专|中专|无要求|不限|无限")

        self.START_DEMAND = re.compile(
            u"(任职资格|岗位要求|工作要求|任职条件|任职要求|职位要求)[::\s】\n ]?")
        self.START_DUTY = re.compile(
            u"(工作内容|岗位职责|工作职责|职位描述|工作描述|职位介绍|职位职责|岗位描述)[::\s 】\n ]")
        self.START_BENEFIT = re.compile(u"(福利待遇|待遇|福利)[::\s\n】]")

        self.INC_URL = re.compile(u"(主页|网站|网址|官网).{0,3}[\w\d_/\.:\-]+")
        self.DEMAND = re.compile(u"精通|熟悉|熟练|有.+经验")
        self.DUTY = re.compile(u"负责|促成|为客户|安排的其.工作")
        self.BENEFIT = re.compile(u".险一金|福利|晋身|休假|带薪|补助|补贴")
        self.CERT = re.compile(
            u"(\S{2,8}证书|CET-\d|普通话|英语|口语|.语|日文|雅思|托福|托业)(至少)?(通过)?[\d一二三四五六七八九]级[及或]?(以上)?|(英语)?CET-\d级?(以上)?|\
                                 医学.{0,3}证|会计.{0,3}证|律师.{0,3}证|有.{1,8}证书")

        self.degreedic = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/degrees.txt', 'rb', 'utf-8')
        ])
        self.majordic = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/majordic.txt', 'rb', 'utf-8')
        ])
        self.skilldic = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/skills.txt', 'rb', 'utf-8')
        ])
        self.jobdic = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/jobnames.txt', 'rb', 'utf-8')
        ])

        jieba.load_userdict(base_dir + '/data/majordic.txt')
        jieba.load_userdict(base_dir + '/data/skills.txt')
        jieba.load_userdict(base_dir + '/data/firm.txt')
        jieba.load_userdict(base_dir + '/data/degrees.txt')
        jieba.load_userdict(base_dir + '/data/benefits.txt')

    def clean_line(self, line):
        """
        Strip punctuation from the start and end of a sentence.
        """
        line = self.CLEAN_LINE.sub("", line).strip()
        line = re.sub("\s+|^/d+[;’、,/。\.]", "", line)
        return line

    def clean_cnNum(self, line):
        """
        Convert Chinese numerals (一, 二, 三, ...) to digits when extracting
        years of experience.
        """
        line = unicode(line)
        a = [u"一", u"二", u"三", u"四", u"五", u"六", u"七", u"八", u"九", u"十", u"两"]
        b = range(1, 11) + [2]
        # unicode.translate() maps ordinals to replacement strings, so the
        # digits must be unicode strings rather than ints
        table = dict((ord(aa), unicode(bb)) for aa, bb in zip(a, b))
        return line.translate(table)

    def line2vec(self, line):
        """
        Convert a sentence to a vector; self.w2v is a word2vec model loaded
        elsewhere in the source project.
        """
        vec = np.zeros(50)
        for word in jieba.cut(line):
            if word in self.w2v.vocab:
                vec += self.w2v[word]

        return vec

    def clean_jobname(self, jobname):
        """
        Normalize the job title.
        """
        if jobname.lower() in self.jobdic:
            return jobname.lower()
        else:
            # fall back to the dictionary entry with the longest common subsequence
            res = [(lcs_len(jobname, job), job) for job in self.jobdic]
            res.sort()
            return res[-1][1]
Example #25
##########################################
# init
model_choose = "svm"  # svm, lda, rnn
grocery_name = "./SVM_models/svm_for_news"
corpus_path = "./Corpus/NewsClassCorpus/"
file_path = "./"
file_name = "post.txt"

t_text = delete_stop_words(codecs.open(file_path + file_name, encoding="UTF-8").read())

###########################################
# Classify with the SVM model
if model_choose == "svm":
    tic = time.time()
    grocery = Grocery(grocery_name)
    grocery.load()
    t_pre_result = grocery.predict(delete_stop_words(t_text))
    toc = time.time()

    t_label = t_pre_result.predicted_y
    print("Sentiment: ", t_label)
    print("How much: ", t_pre_result.dec_values[t_label])
    print("Elapsed time of predict is: %s s" % (toc - tic))
elif model_choose == "lda":
    pass
elif model_choose == "rnn":
    pass
else:
    print("")
Example #26
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import MySQLdb
from tgrocery import Grocery
import sys
reload(sys)
sys.setdefaultencoding('utf8')

grocery = Grocery('sample')
dict_list = list()

conn = MySQLdb.connect(host = 'localhost', db = 'newsdata', user = '******', passwd = 'root', charset = 'utf8', use_unicode = False)
cur = conn.cursor()
cur.execute('select com_new_type_id, com_new_name from tbl_new where com_new_type_id is not null')
for row in cur.fetchall():
    dict_list.append(row)


grocery.train(dict_list)
grocery.save()

news_grocery = Grocery('sample')
news_grocery.load()
while True:
    result = news_grocery.predict(raw_input('please input title:' ))
    print result

Example #27
def load_first_classifier(model_path):
    new_grocery = Grocery(model_path)
    new_grocery.load()
    return new_grocery
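Usage sketch; the directory name is hypothetical and must point at a previously saved model:
# Load a saved model once, then reuse it for predictions.
clf = load_first_classifier('models/first_level')
print clf.predict(u'考生必读:新托福写作考试评分标准')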
Example #29
class JdParserTop(object):
    def __init__(self):

        self.result = OrderedDict()
        self.result["jdFrom"] = ""
        self.result["pubTime"] = ""
        inc_keys = [
            "incName", "incScale", "incType", "incIndustry", "incLocation",
            "incUrl", "incStage", "incAliasName", "investIns",
            "incContactInfo", "incCity", "incZipCode", "incContactName",
            "incIntro"
        ]
        job_keys = [
            "jobType", "jobPosition", "jobCate", "jobSalary", "jobWorkAge",
            "jobDiploma", "jobNum", "jobWorkCity", "jobWorkLoc", "jobWelfare",
            "age", "jobEndTime", "email", "gender", "jobMajorList", "jobDesc"
        ]
        others_keys = [
            "keyWords", "isFullTime", "jdRemedy", "posType", "urgent",
            "holidayWelfare", "livingWelfare", "salaryCombine",
            "socialWelfare", "trafficWelfare", "jobDepartment", "jobReport",
            "jobReportDetail", "jobSubSize", "language", "overSea"
        ]
        jdInc = OrderedDict()
        for k in inc_keys:
            jdInc[k] = ""
        self.result["jdInc"] = jdInc
        jdJob = OrderedDict()
        for k in job_keys:
            jdJob[k] = ""
        self.result["jdJob"] = jdJob
        others = OrderedDict()
        for k in others_keys:
            others[k] = ""
        self.result["others"] = others

        self.CLEAN_TEXT = re.compile(
            u"[^\u4e00-\u9fa5\w\d;::;,。、%\.,/。!!@()\r\n\(\)\-\+ - `]")

        self.clf = Grocery(base_dir + "/jdclf")
        self.clf.load()

        self.SPLIT_LINE = re.compile(u"[\r\n;::。!?;]|[ \s \xa0\u724b]{4,}")
        self.CLEAN_LINE = re.compile(
            u"^[\u2022(【\[\s\t\r\n\(\-  ]?[\da-z12345789]{1,2}[\.,。、,::)】\]\)\s]|^[!@#¥%……&×()\(\){}:“|、-\-,。::\.]|^[一二三四五六七八九123456789\d]{0,2}[\.、\s:: ]|[,;。、\s \.]$|^[\s \u2022 \uff0d \u25cf]"
        )
        self.CLEAN_JOBNAME = re.compile(
            u"急聘|诚聘|高薪|包[食住宿餐]|.险一金|待遇|^急?招|职位编号\s?[\s\d::]")

        self.PAY = re.compile("(\d{3,}\-)?\d{3,}元")
        self.SEX = re.compile(u"性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.JOB_TAG = re.compile(u"全职|实习")
        self.DEGREE = re.compile(
            u"小学|初中|高中|职技|本科|研究生|硕士|博士|教授|专科|大专|中专|无要求|不限|无限")
        self.MAIL = re.compile(u"\w+@[\w\.]+")

        self.START_DEMAND = re.compile(
            u"(任职资格|岗位要求|工作要求|任职条件|任职要求|职位要求)[::\s】\n ]?")
        self.START_DUTY = re.compile(
            u"(工作内容|岗位职责|工作职责|职位描述|工作描述|职位介绍|职位职责|岗位描述)[::\s 】\n ]")
        self.START_BENEFIT = re.compile(u"(福利待遇|待遇|福利)[::\s\n】]")

        self.INC_URL = re.compile(u"(主页|网站|网址|官网).{0,3}[\w\d_/\.:\-]+")
        self.DEMAND = re.compile(u"精通|熟悉|熟练|有.+经验")
        self.DUTY = re.compile(u"负责|促成|为客户|安排的其.工作")
        self.BENEFIT = re.compile(u".险一金|福利|晋身|休假|带薪|补助|补贴")
        self.CERT = re.compile(
            u"(\S{2,8}证书|CET-\d|普通话|英语|口语|.语|日文|雅思|托福|托业)(至少)?(通过)?[\d一二三四五六七八九]级[及或]?(以上)?|(英语)?CET-\d级?(以上)?|\
                                 医学.{0,3}证|会计.{0,3}证|律师.{0,3}证|有.{1,8}证书")

        self.degreedic = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/degrees.txt', 'rb', 'utf-8')
        ])
        self.majordic = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/majordic.txt', 'rb', 'utf-8')
        ])
        self.skilldic = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/skills.txt', 'rb', 'utf-8')
        ])
        self.jobdic = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/jobnames.txt', 'rb', 'utf-8')
        ])
        self.citydic = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/citydic.txt', 'rb', 'utf-8')
        ])
        self.province_city = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/province_city.txt', 'rb', 'utf-8')
        ])

        self.SALARY = re.compile(u'万')

        jieba.load_userdict(base_dir + '/data/majordic.txt')
        jieba.load_userdict(base_dir + '/data/skills.txt')
        jieba.load_userdict(base_dir + '/data/firm.txt')
        jieba.load_userdict(base_dir + '/data/degrees.txt')
        jieba.load_userdict(base_dir + '/data/benefits.txt')
        jieba.load_userdict(base_dir + '/data/citydic.txt')
        jieba.load_userdict(base_dir + '/data/province_city.txt')

    def refresh(self):
        self.result = OrderedDict()
        self.result["jdFrom"] = ""
        self.result["pubTime"] = ""
        inc_keys = [
            "incName", "incScale", "incType", "incIndustry", "incLocation",
            "incUrl", "incStage", "incAliasName", "investIns",
            "incContactInfo", "incCity", "incZipCode", "incContactName",
            "incIntro"
        ]
        job_keys = [
            "jobType", "jobPosition", "jobCate", "jobSalary", "jobWorkAge",
            "jobDiploma", "jobNum", "jobWorkCity", "jobWorkLoc", "jobWelfare",
            "age", "jobEndTime", "email", "gender", "jobMajorList", "jobDesc"
        ]
        others_keys = [
            "keyWords", "isFullTime", "jdRemedy", "posType", "urgent",
            "holidayWelfare", "livingWelfare", "salaryCombine",
            "socialWelfare", "trafficWelfare", "jobDepartment", "jobReport",
            "jobReportDetail", "jobSubSize", "language", "overSea"
        ]
        jdInc = OrderedDict()
        for k in inc_keys:
            jdInc[k] = ""
        self.result["jdInc"] = jdInc
        jdJob = OrderedDict()
        for k in job_keys:
            jdJob[k] = ""
        self.result["jdJob"] = jdJob
        others = OrderedDict()
        for k in others_keys:
            others[k] = ""
        self.result["others"] = others

    def clean_line(self, line):
        """
        Strip punctuation from the start and end of a sentence.
        """
        line = self.CLEAN_LINE.sub("", line).strip()
        line = re.sub("\s+|^/d+[;’、,/。\.]", "", line)
        return line

    def clean_cnNum(self, line):
        """
        Convert Chinese numerals (一, 二, 三, ...) to digits when extracting
        years of experience.
        """
        line = unicode(line)
        a = [u"一", u"二", u"三", u"四", u"五", u"六", u"七", u"八", u"九", u"十", u"两"]
        b = range(1, 11) + [2]
        # unicode.translate() maps ordinals to replacement strings, so the
        # digits must be unicode strings rather than ints
        table = dict((ord(aa), unicode(bb)) for aa, bb in zip(a, b))
        return line.translate(table)

    def line2vec(self, line):
        """
        Convert a sentence to a vector; self.w2v is a word2vec model loaded
        elsewhere in the source project.
        """
        vec = np.zeros(50)
        for word in jieba.cut(line):
            if word in self.w2v.vocab:
                vec += self.w2v[word]

        return vec

    def clean_jobname(self, jobname):
        """
        Normalize the job title.
        """
        print jobname
        if jobname.lower() in self.jobdic:
            return jobname
        else:
            # fall back to the dictionary entry with the longest common subsequence
            res = [(lcs_len(jobname, job), job) for job in self.jobdic]
            res.sort()
            return res[-1][1]

    def desc_extract(self, soup):
        line_list = soup.find_all("p")
        return '\n'.join([line.get_text() for line in line_list])

    # remove <img> tags, runs of 1-7 spaces, and &nbsp;
    removeImg = re.compile('<img.*?>| {1,7}|&nbsp;')
    # remove hyperlink tags
    removeAddr = re.compile('<a.*?>|</a>')
    # replace line-breaking tags with \n
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # replace table cells <td> with \t
    replaceTD = re.compile('<td>')
    # replace single or double <br> with \n
    replaceBR = re.compile('<br><br>|<br>')
    # strip all remaining tags
    removeExtraTag = re.compile('<.*?>')
    # collapse consecutive blank lines
    removeNoneLine = re.compile('\n+')

    def replace(self, x):
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        x = re.sub(self.removeNoneLine, "\n", x)
        # strip() removes extra leading/trailing whitespace
        return x.strip()
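A quick sketch of what the tag-stripping chain in replace() does to a made-up HTML fragment (instantiating JdParserTop assumes the dictionaries under base_dir exist):
# Hypothetical input and expected output for replace().
parser = JdParserTop()
html = '<div>岗位职责:<br><br>负责数据分析 <img src="x.png"> &nbsp;</div>'
print parser.replace(html)
# prints:
# 岗位职责:
# 负责数据分析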
Example #30
class JdParser(object):
    def __init__(self):
        self.degreedic = set(line.strip() for line in codecs.open(
            './data/degrees.txt', 'rb', 'utf-8'))  # load the degree dictionary
        self.majordic = set(line.strip() for line in codecs.open(
            './data/majordic.txt', 'rb', 'utf-8'))  # load the major dictionary
        self.citydic = set(line.strip() for line in codecs.open(
            "./data/citydic.txt", 'rb', 'utf-8'))  # load the city dictionary
        self.firmnames = set(line.strip() for line in codecs.open(
            './data/firm.txt', 'rb', 'utf-8'))  # load company-name abbreviations
        self.jobdic = set(line.strip() for line in codecs.open(
            './data/jobposition.txt', 'rb', 'utf-8'))  # load the job-title dictionary
        self.skills = set(
            line.strip()
            for line in codecs.open('./data/skills.txt', 'rb', 'utf-8'))
        #        self.wordlisttf = pickle.load(open('./data/wordlist.pkl'))  # 2000 most frequent words
        # self.w2vdict = json.load(open('./data/word2vec_50.json'))  # word2vec vectors for those 2000 words
        self.clf = Grocery("jdclf")  # sentence classifier: demand / duty / other
        self.clf.load()

        self.SEX = re.compile(u"性别不限|性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.DEGREE = re.compile(
            u"(全日制)?(初中|高中|中专|大专|专科|大学专科|中职|本科|大学本科|硕士|研究生|博士|博士后)(.?以上)?")
        self.MAJOR = re.compile(u"\S+(相关专业|专业优先|及其.专业|[类等]专业[优先]?)")
        self.EXP = re.compile(u"工作经验|工作年限|工作经历|项目经[历验]|\d年经[历验]|.{1,2}年相关工作经验")
        self.PUB_TIME = re.compile(u"(\d+)(天前发布)")

        self.INCNAME = re.compile(
            u"\S+(有限公司|酒店|银行|集团|厂|研究中心|研究所|学校|旅行社|中心/s|分?公司|研发中心|技术部|事.部|招聘|商务平台)"
        )
        self.INCTAG = re.compile(
            u"大公司|五百强|全球500强|小公司|成长型公司|创业公司|私有经济|集体经济|集团|外企|已上市|稳定性高|平均年龄\d+岁|妹纸多|学历高|福利待遇好|晋升机会大|民营公司|民营企业|互联网|创业型|国企|央企"
        )

        self.JOBNAME = re.compile(
            u'\S*(研发工程师|工程师|经理|助理|顾问|前台|秘书|主管|研究员|实习生|操作员|专员|教学人员|技术人员|管理员|业务员|公关|程序员|教师|老师|培训生|\
                                  文员|研究员|策划|主任|总监|设计师|分析师|架构师|摄影师|编辑|BD|游戏UI|Android(开发)?|PHP(开发)?|Python(开发)?|.?(急招|急聘|初级|中级|高级|方向).?[\s)】\)])|\
                                  |行政人事|网店设计|客服|会计|电话销售|外贸跟单|web前端|游戏UI|后.开发|产品运营|商业数据分析'
        )

        self.START_DEMAND = re.compile(
            u"(岗位要求|应聘条件|任职要求|岗位资格|任职资格|岗位条件|工作要求|任职条件|人员条件|职位.求|职位条件|职位描述|岗位资格|职位资格|具备条件)[::\s]\
                                       |如果你.{0,10}[::\s]|我们希望你.{0,12}[::\s]|(要求|条件)[::\s]|你需要?具备什么.+[?\?::\s]|任职资格[::\s]"
        )
        self.DEMAND = re.compile(
            u"熟悉|熟练|具有|善于|懂得|掌握|具备|能够|优先|不少于|不超过|至少|团队.作|良好的|工作经验|开发经验|实习经历|能力强|富有|学历|经验|喜欢|较强的.{2,8}能力|相关专业|相关学历|者优先|精通|了解|及以上|技术全面|.强的责任心|[能有]独立|英文流利"
        )
        self.DUTY = re.compile(
            u"跟进|协助|负责|配合|其他工作|领导交办的|对.+提供|审核|参与|为.+提出|日常.+工作|指导|对.+进行|为.+提供|跟进|拓展|运营|用户|客户|协调|拟写|通过|协同|完成|沟通|需求|秘书.{2,5}翻译"
        )
        self.START_DUTY = re.compile(
            u"(岗位职责|岗位描述|职位描述|职责描述|任职描述|职位职责|工作职责|工作职能|职位职能|工作内容|实习内容|职位内容)[::\s]|做这样的事[::\s]|职责.{0,5}[::\s]"
        )
        self.PAY = re.compile(u"薪酬|待遇|月薪|薪资|年薪|底薪|\d+k|\d+万|\d+元|工资|报酬|薪水|福利")
        self.BENEFIT = re.compile(
            u"周休|补助|补贴|假日|餐补|提成|交通补助|食宿|加班工资|期权|年假|领导|扁平化|管理|氛围|空间|休假|月假|带薪|全休|晋升|培训|舒适的|旅游|奖励|过节费|五险一金|奖金|\
        |弹性工作|氛围|成长空间|实训|培训|高薪|前景|旅游|活动")

        self.SPLIT_JD = re.compile(
            u"岗位[【(]?[一二三四五六七八九][】)][::\s]|(^招聘岗位\S+|岗位\d|岗位[一二三四五六])[::\s]")
        self.CLEAR_NUM = re.compile(u"^\d[\.: :。、]|^[\((【]?\d[\))】\.]")
        self.CLEAR_COLO = re.compile(u"^[\s\.。)(【】,,]|[。;,\.;,]$|^\d[\.]")
        self.SKILL = re.compile(
            u"精通|了解|熟练|熟悉|掌握|懂得|优先|具备|具有|者优先|擅长|善于|较强的.{2,6}能力|良好的|有.+经验|能力|极强的"
        )

        jieba.load_userdict('./data/majordic.txt')
        jieba.load_userdict('./data/skills.txt')
        jieba.load_userdict('./data/firm.txt')
        jieba.load_userdict('./data/degrees.txt')
        jieba.load_userdict('./data/benefits.txt')

        self.jdStr = ""
        self.linelist = []
        self.lineindex = defaultdict(int)
        self.result = OrderedDict()

    # Split into lines and preprocess
    def preprocess(self, jdstr):
        self.result.clear()
        jdstr = re.sub(u"[【】◆ \u25cf\u25c6\u2605]", "", jdstr.decode('utf-8'))
        self.linelist = [
            line.strip() for line in jdstr.split('\n') if len(line) > 1
        ]
        self.jdStr = '\n'.join(self.linelist)
        for line in self.linelist:
            # print self.clf.predict(line),'\t',line
            self.lineindex[re.sub(u"[\s ]+", " ", line)] = 0

    def line2vec(self, line):
        vec = np.zeros(50)
        cnt = 1
        for word in jieba.cut(line):
            if word in self.w2vdict:
                vec += self.w2vdict[word]
                cnt += 1
        vec = vec / cnt
        return vec

    # Extract the gender requirement
    def regular_sex(self):
        res = set()
        for line in self.linelist:
            if self.clf.predict(line) == 'demand' or self.DEMAND.search(line):
                findsex = self.SEX.search(line)
                if findsex:
                    getsex = re.search(u"性别不限|男|女",
                                       line.replace(u"男女不限", u"性别不限"))
                    if getsex:
                        res.add(getsex.group())
                        break
        if res:
            self.result['sex'] = ' / '.join(res)
        else:
            self.result['sex'] = u'性别不限'

    # Extract the age requirement
    def regular_age(self):
        res = ''
        for line in self.linelist:
            if re.search(u'\d{2}后', line): continue
            findage = self.AGE.search(line)
            if findage:
                age = re.findall(u'\d{2}', line)
                if len(age) >= 2:
                    res = '-'.join(age)
                elif len(age) == 1:
                    if re.search(u'以上|不低于', line):
                        res = age[0] + u'以上'
                    if re.search(u"不超过|不高于|以下", line):
                        res = age[0] + '以下'
                    if re.search(u"左右|大约|大概", line):
                        res = age[0] + '左右'
                break
        if len(res) < 2:
            res = u'年龄不限'
        self.result['age'] = res
        return res

    # Extract the major requirement
    def regular_major(self):
        res = []

        for line in self.linelist:
            findmajor = re.search(u"专业要求[::\s]", line)
            if findmajor:
                print 'major demand', line
                items = self.clean_line(line[findmajor.span()[1]:]).split()
                items = filter(
                    lambda x: x not in self.degreedic and not re.search(
                        u"薪酬|经验|元|\d+|月", x), items)
                res.append(' / '.join(items))
                break

        if not res:
            for line in self.linelist:
                if re.search(u"专业.限|.限专业",
                             line) and not re.search(u"专业优先", line):
                    res.append(u"专业不限")
                    print 'major demand', line
                    break
                else:
                    findmajor = self.MAJOR.search(line)
                    if findmajor:
                        majoritem = re.split(u'[\s,,;; ]', findmajor.group())
                        for item in majoritem:
                            if re.search(
                                    u'学历|年龄|岁|学校|公司|性格|具有|具备|能够|经验|有|毕业|性别|男|女',
                                    item):
                                continue
                            print 'major item', item
                            if self.BENEFIT.search(line): continue
                            print 'major item', item
                            if re.search(u"专业", item) and len(item) < 3:
                                continue
                            res.append(self.clean_line(item))
                        break
                        if not res:
                            for majorword in jieba.cut(line):
                                if majorword in self.majordic or majorword[:-2] in self.majordic:
                                    res.append(majorword)

                            if re.search(u"[等及类]?相关专业",
                                         self.jdStr) and len(res) == 1:
                                res[0] += u"等相关专业"

        if not res:
            res.append(u"专业不限")

        self.result['major'] = res

    # Extract the degree requirement
    def regular_degree(self):
        """
        Extract degree info: look for keywords first, then tokenize and
        match against the degree dictionary.
        """
        degree = [
            u'小学', u'初中', u'中专', u'中技', u'高中', u'专科', u'大专', u'本科', u'硕士',
            u'博士', u'博士后'
        ]
        res = set()
        for line in self.linelist:
            finddegree = re.search(u"学历要求[::\s]", line)
            if finddegree:
                items = self.clean_line(line[finddegree.span()[1]:]).split()
                items = filter(lambda x: not re.search(u"薪酬|经验|元|月|年|\d+", x),
                               items)
                res.add(' / '.join(items))
                break

        if not res:
            for line in self.linelist:
                if re.search(u"学历不限|学历要求不限|不限学历", line):
                    res.add(u"学历不限")
                    break
                else:
                    finddegree = self.DEGREE.search(line)
                    if finddegree:
                        res.add(finddegree.group())
                        break

        # if no degree requirement was matched, tokenize the whole text and look up each word
        if len(res) == 0:
            for word in jieba.cut(self.jdStr):
                if word in self.degreedic:
                    res.add(word)
        res = list(res)
        if len(res) == 1 and re.search(u'[及或]?以上', res[0]):
            tmp = res[0][:2]
            if tmp == u'全日':
                tmp = u'本科'
            elif tmp == u'研究':
                tmp = u'硕士'
            if tmp in degree:
                idx = degree.index(tmp)
                res = degree[idx:]

        self.result['degree'] = ' / '.join(res)

    # Extract the years-of-experience requirement
    def regular_exp(self):

        cnyear = u'[半一二三四五六七八九十两]年|\d-\d{1,2}年|\d年及?以上|不少于\d年|\d年'
        res = set()
        jdStr = self.jdStr

        findexp = re.search(u'经验不限|(经验)?\d{1,2}年及以上|经验\d-\d{1,2}年', jdStr)
        if findexp:
            res = findexp.group()
            self.result['exp'] = res.replace(u"经验", "")
            return res

        findexp = self.EXP.search(jdStr)
        if findexp:
            pos = findexp.span()[1]
            jdStr = jdStr[max(0, pos - 25):min(pos + 15, len(jdStr))]
            exp = re.search(cnyear, jdStr)
            if exp:
                res.add(exp.group())

        if not res:
            exp = re.search(
                u"(\d-)?\d{1,2}年(工作|开发|项目)?经[历验]|(不少于)?([半\d]年)及?(以上)?经[历验]|经[历验]\s?(\d-)?\d{1,2}年",
                ' '.join(self.regular_jobtag()))
            if exp:
                res.add(exp.group())
            else:
                exp = re.search(cnyear, ' '.join(self.regular_jobtag()))
                if exp:
                    res.add(exp.group())

        self.result["exp"] = "-".join(res)
        self.result["exp"] = self.result['exp'].replace(u'经验',
                                                        "").replace(u"经历", "")
        return res

    def regular_jobtag(self):
        """
        Job-tag information (full-time / intern, headcount, and so on).
        """
        res = []
        job_tag = re.search(u"应届生|全职|兼职|实习生|应届毕业生|社招|急招|急聘", self.jdStr)
        if job_tag:
            res.append(job_tag.group())

        job_tag = re.search(u"招聘人数[::]?|招聘[::\s]|人数[::\s]", self.jdStr)
        if job_tag:
            jdstr = self.jdStr[job_tag.span()[1]:]
            for line in jdstr.split():
                if len(line.strip()) < 1: continue
                else:
                    num = re.search(u"(\d+\-)?\d+人?|若干|\d+位", line)
                    if num:
                        res.append(u"招聘人数:" + num.group())
                    break

        job_tag = re.search(u"(职能类别|职位标签)[:: ]?", self.jdStr)
        if job_tag:
            jdstr = self.jdStr[job_tag.span()[1]:]
            for line in jdstr.split('\n'):
                if len(line.strip()) < 3: continue
                else:
                    res.append("职业标签:" + line.strip())
                    break
                if len(line) > 25: break

        # per product requirements, split out short sentences that mention experience for finer-grained tagging
        linelist = [
            line for line in re.split(u"[,。;\s]", self.jdStr)
            if 5 < len(line) < 15
        ]
        for line in linelist:
            if re.search(u"经验", line) and not re.search(u"月薪|地点|日期", line):
                if re.search(u"\d+k|[。?)\)\]]", line): continue
                res.append(self.clean_line(line))
                break

        self.result["job_tag"] = res
        return res

    # Strip leading numbers and punctuation from a line
    def clean_line(self, line):
        line = self.CLEAR_NUM.sub("", line.strip())
        line = self.CLEAR_COLO.sub("", line)
        return line

    # Extract the work location
    def regular_workplace(self):
        res = set()
        jdstr = self.jdStr
        pos = list(re.finditer(u"(工作地.|上班地.|实习地.|地址|地点)[::\s]", jdstr))
        if pos:
            jdstr = jdstr[pos[0].span()[1]:]

            for line in jdstr.split():
                if len(line.strip()) < 2: continue
                if len(line) < 26:
                    res.add(line.strip().replace(":", "").replace(":", ""))
                else:
                    for city in jieba.cut(line):
                        if city in self.citydic and city[:-1] not in res:
                            res.add(city)
                break
        if not res:
            for city in jieba.cut(jdstr):
                if city in self.citydic and city[:-1] not in res and u"国" not in city:
                    res.add(city)
                    break
        self.result["workplace"] = " / ".join(res)
        return res

    # Extract certificates, awards, and other requirements
    def regular_cert(self):
        res = set()
        linelist = [
            line for line in re.split(u"[\s ,。;,]", self.jdStr)
            if len(line) > 3
        ]
        for line in linelist:
            findcert = re.search(
                u"(\S+证书|CET-\d|普通话|英语|口语|.语|日文|雅思|托福|托业)(至少)?(通过)?[\d一二三四五六七八九]级[及或]?(以上)?|(英语)?CET-\d级?(以上)?|职业资格|律师证|会计证",
                line)
            if findcert:
                res.add(findcert.group())
            else:
                findcert = re.search(u"有(.+证)书?", line)
                if findcert:
                    res.add(findcert.group(1))
                else:
                    findcert = re.search(u"有.+资格", line)
                    if findcert:
                        res.add(findcert.group())

        self.result['cert'] = re.sub(u"[或及以上]", "", ' / '.join(res))
        if self.result['cert']:
            self.result['cert'] = self.result['cert'].split(' / ')
        else:
            self.result['cert'] = []

    # Extract relevant skills using the skill dictionary
    def regular_skill(self, num=6):
        res = []
        for line in self.linelist:
            if self.DEMAND.search(line) or self.clf.predict(line) == 'demand':
                for word in jieba.cut(line):
                    word = strQ2B(word).lower()
                    if word in self.skills:
                        res.append(word)

        sorted_words = [w[0] for w in Counter(res).most_common(2 * num)]

        for word in jieba.cut(self.result['job_name']):
            word = strQ2B(word).lower()
            if word in self.skills and word not in sorted_words:
                sorted_words.insert(0, word)

        after_top3 = sorted_words[3:]
        np.random.shuffle(after_top3)

        self.result['skill'] = sorted_words[:3] + after_top3[:num - 3]

    # Extract job duties
    def regular_duty(self):
        res = []
        jdStr = self.jdStr
        pos = list(self.START_DUTY.finditer(jdStr))
        if len(pos) > 0:
            linelist = [
                re.sub("[\s ]+", " ", line)
                for line in jdStr[pos[-1].span()[1]:].split("\n")
                if len(line) > 2
            ]
            for i in xrange(len(linelist)):
                line = linelist[i]
                if self.START_DUTY.search(
                        line) or self.lineindex[line] == 1 or (
                            re.search(u".年来|谢谢|请在|公司介绍|举报|收藏|岗位职责", line)
                            and not re.search(u"了解", line)):
                    continue
                if re.search(u"要求[::\s]?|岗位要求", line) and len(line) < 6: break
                if re.match(u"\d{1,2}|\u25cf|[\uff0d(\(\-\+]|[a-z][\.、\s]",
                            line.strip()) or self.DUTY.search(
                                line) or self.clf.predict(line) == 'duty':
                    res.append(line.strip())
                elif i < len(linelist) - 1 and self.clf.predict(
                        linelist[i + 1]) == 'duty':
                    res.append(line)
                else:
                    break
        if not res:
            for line in self.linelist:
                if re.search(u"粉丝团", line) and len(line) < 12: continue
                if self.DUTY.search(line) and self.clf.predict(line) == "duty":
                    if self.lineindex[line] != 1:
                        res.append(line)

        self.result["duty"] = "\n".join(res)
        for line in res:
            self.lineindex[line] = 1

        return res

    # Extract job requirements
    def regular_demand(self):
        res = []
        jdStr = self.jdStr
        pos = list(self.START_DEMAND.finditer(jdStr))
        if len(pos) > 0:
            tmppos = pos[-1].span()[0]
            if re.search(u"具有|具备", jdStr[tmppos - 5:tmppos + 5]) or re.search(
                    u"证书|证", jdStr[tmppos:tmppos + 8]):
                pos.pop()
            if pos:
                linelist = [
                    re.sub("[\s ]+", " ", line)
                    for line in jdStr[pos[-1].span()[1]:].split("\n")
                    if len(line) > 2
                ]
            else:
                linelist = []
            for i in xrange(len(linelist)):
                line = linelist[i]
                if self.START_DEMAND.search(linelist[i]) or re.search(
                        u"谢谢|请在|公司介绍|举报|收藏|\d+k?元|加分", line):
                    continue
                if re.match(u"\d{1,2}|\u25cf|[\uff0d(\(\-\+]|[a-z][\.、\s]",
                            line) or self.DEMAND.search(
                                line) or self.clf.predict(line) == 'demand':
                    res.append(line)
                elif i < len(linelist) - 1 and self.clf.predict(
                        linelist[i + 1]) == 'demand':
                    res.append(line)
                else:
                    break
        if not res:
            for line in self.linelist:
                if self.lineindex[line] == 1 or len(line.split()) > 6:
                    continue  # skip lines already consumed by another field
                if self.clf.predict(line) == 'demand' or self.DEMAND.search(
                        line):
                    res.append(line.strip())

        self.result['demand'] = '\n'.join(res)
        for line in res:
            self.lineindex[line] = 1

        return res

    # The job title being recruited
    def regular_jobname(self):
        res = set()
        jdStr = self.jdStr
        findpos = re.search(u"(招聘岗位|招聘职位|职位名称|岗位名称|岗位[一二三四五六七八九])[:、:\s ]",
                            jdStr)
        #        if not findpos:
        #            findpos = re.search(u"(职位类别|职位职能)[::\s ]",jdStr)

        if findpos:
            pos = findpos.span()[1]
            linelist = jdStr[pos:].split("\n")
            for line in linelist:
                if len(line) < 2: continue
                if len(line) >= 2 and len(line) < 20:
                    if re.search(u"职位描述|查看|地址|工作|分享|举报|下一条|时间|福利|待遇|周末|双休",
                                 line):
                        continue
                    res.add(re.sub(u"聘请|高薪诚聘|诚聘|[,。、\d!]+", "", line.strip()))
                    break

        # If no explicit job title was matched, tokenize each line and look it up in the job-title lexicon
        if not res:
            for line in self.linelist:
                if re.search(u"招聘|高薪|诚聘", line): continue
                if len(line) < 6 and not re.search(
                        u'岗位|岗位内容|工作内容|职责|任职|资格',
                        line) and self.clf.predict(line) == 'job_name':
                    res.add(line)
                    break
                findPos = self.JOBNAME.search(line)
                if findPos and len(findPos.group()) < 20 and not re.match(
                        u'\d', findPos.group()):
                    jobname = findPos.group()
                    res.add(re.sub(u"聘请|高薪诚聘|诚聘|急.|[,。、!]+", "", jobname))
                    break
                #   res.add(re.sub(u"\(.+\)|(.+)|【.+】|[,。、\s\d]+|聘请|高薪诚聘|诚聘|急招|","",line.strip()))

        if not res:
            for line in self.linelist:
                for word in jieba.cut(line.lower()):
                    if word in self.jobdic:
                        res.add(word)
                        self.result["job_name"] = " / ".join(res)
                        return res
        if not res:
            tag = re.search(u"实习生|兼职", self.jdStr)
            if tag:
                res.add(tag.group())
        self.result["job_name"] = strQ2B(" / ".join(res)).lower()
        return res

    # Salary
    def regular_pay(self):
        pay = ""
        lagoup = re.search(
            u"(\d+[kK][-——]\d+[kK])|(\d{3,5}-\d{3,5}元?/[月日天])|(\d{3,5}-\d{3,5}元)|((\d+[-~]\d+)万.[年月])|底薪\d+(-\d+)?元?|\d{3,5}元(左右|以上)?|年薪\d+万?元(左右|以上)?",
            self.jdStr)  # Lagou postings lack marker words such as 待遇, so match the salary pattern directly
        if lagoup:
            pay = lagoup.group()
            self.result["pay"] = pay.replace(u'k', '000').replace(u'K', '000')
            return pay

        findpay = self.PAY.search(self.jdStr)
        if findpay:
            pos = findpay.span()[1]

            jdstr = self.jdStr[max(0, pos - 5):min(pos + 10, len(self.jdStr))]
            if re.search(u"面议", jdstr):
                pay = u"面议"
            else:
                findpay = re.findall(u"\d{3,7}", jdstr)
                pay = "-".join(findpay)
        self.result["pay"] = pay.replace(u'k', '000').replace(u'K', '000')
        return pay

    # Extract salary and benefits
    def regular_benefits(self):
        res = []
        jdStr = self.jdStr
        findpos = list(re.finditer(u"薪酬福利[::\s]|(福利|待遇)\s?[::]", jdStr))
        if not findpos:
            findpos = list(
                re.finditer(u"(晋升制度|工作环境|职位诱惑|你会获得什么)\s?[?\?::]", jdStr))
        if findpos:
            pos = findpos[-1].span()[1]
            linelist = jdStr[pos:].split('\n')
            for line in linelist:
                print 'benefits', line
                if len(line.strip()) < 3: continue
                if re.match(ur"[((]?\d+", line) or self.BENEFIT.search(line):
                    res.append(line.strip())
                    self.lineindex[line.strip()] = 1
                else:
                    break

        if not res:
            for line in jdStr.split():
                if len(line) > 1 and re.search(
                        u"带薪|双休|股票期权|五险一金|发展空间|福利|诱惑|休假|薪酬|补助|年假|弹性工作", line):
                    if re.search(u"福利|待遇|诱惑", line) and len(line.strip()) < 6:
                        continue
                    res.append(line.strip())

        if len(res) == 1 and re.search(
                u"险一金", res[0]) and not re.search(u"[,、]", res[0]):
            res[0] = self.clean_line(' '.join(jieba.cut(res[0])))

        self.result["benefits"] = "\n".join(res)
        return res
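The methods above normalize tokens with strQ2B, which is not defined in this excerpt. A minimal sketch of the conventional full-width-to-half-width conversion it presumably performs (an assumption, not code from this project):

def strQ2B(ustring):
    # map full-width characters onto their half-width (ASCII) equivalents
    res = []
    for ch in ustring:
        code = ord(ch)
        if code == 0x3000:  # full-width space
            code = 0x20
        elif 0xFF01 <= code <= 0xFF5E:  # full-width ASCII block
            code -= 0xFEE0
        res.append(unichr(code))
    return ''.join(res)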
Example #31
File: fin_text.py  Project: lvleilei/screen
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # the usual reason for reload(sys) in Python 2
sys.path.append('../../')
from config import *

from tgrocery import Grocery
STOP_WORDS_FILE = 'stopwords.txt'
USER_DICT_FILE = 'user_dict.txt'

model_fintext = Grocery('model_fintext')
model_fintext.load()
sys.path.append('../')
from get_es import *
es = Elasticsearch([{'host':ES_HOST,'port':ES_PORT}])

def search(index_name):
    es_search_options = set_search_optional()
    es_result = get_search_result(es_search_options,index=index_name)
    # final_result = get_result_list(es_result)
    # return final_result
    return es_result


def get_result_list(es_result):
    final_result = []
    for item in es_result:
        final_result.append(item['_source'])
    return final_result
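A hedged sketch of how this module's pieces could be combined: fetch documents from the index and classify each one with model_fintext (the '_source' field comes from the code above; 'title' is an assumed document field):

def classify_index(index_name):
    # sketch only: classify every hit returned by the ES search
    for doc in get_result_list(search(index_name)):
        text = doc.get('title', '')
        if text:
            print model_fintext.predict(text), text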

Example #32
def test_sample(path, test_path):
    new_grocery = Grocery(path.encode('utf-8'))
    new_grocery.load()
    test_path = os.path.join(BASE_DIR, 'learn', test_path)
    res = new_grocery.test(test_path.encode('utf-8'))
    return str(res)
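A hypothetical invocation of the helper above (both paths are placeholders, not from the source):

print test_sample(u'cv_model', u'data/test.txt')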
Example #33
import rospy
from tgrocery import Grocery
from nlu.srv import *
import jieba
import jieba.posseg as pseg

model = '/home/hntea/ros-speech/nlu-model/model'
new_grocery = Grocery(model)


def handle_is_music(req):
    # print "Request = [%s]"%(req.topic)
    label = str(new_grocery.predict(req.topic))
    if label == 'music':
        ret = 1
    else:
        ret = 0
    return ret


def is_music_server():
    rospy.init_node('nlu_is_music')
    s = rospy.Service('is_music', IsMusic, handle_is_music)
    print "Servic Is Ready"
    rospy.spin()


if __name__ == "__main__":
    print "Servic Load Model..."
    new_grocery.load()  # 加载模型
    is_music_server()
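On the client side, this service would be called with the standard rospy service-proxy pattern; a sketch, assuming the IsMusic srv defines a request field `topic` (as the handler above reads) and a single integer response:

import rospy
from nlu.srv import IsMusic

def ask_is_music(text):
    # block until the nlu_is_music node is up, then query it
    rospy.wait_for_service('is_music')
    proxy = rospy.ServiceProxy('is_music', IsMusic)
    return proxy(text)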
Example #34
# -*- coding: utf-8 -*-

# Classification model
# Author: Alex
# Created Time: 2016-12-30 Friday 14:18:58

from tgrocery import Grocery

gr = Grocery("test")
gr.load()

lessVal = 0.2

# counters
statDict = {
    "total": 0,  # total number of items
    "notEq": 0,  # title and content predictions disagree
    "less": 0,  # predictions below the threshold
    "title": 0,  # times the title prediction was used
    "content": 0  # times the content prediction was used
}


def getStatDict():
    return statDict


def classify(title, content):
    """
    分类器
    :return cat 预测的分类
# -*- coding: utf-8 -*-
import csv, codecs
from tgrocery import Grocery
import preprocessing as pp

testFileName = '../data/test.txt'
outputFileName = '../output/upload.csv'

# test ##################################
#grocery=Grocery('sample')
grocery = Grocery('version1.0')
grocery.load()

print 'start test'

filetest = codecs.open(testFileName, 'r', 'utf-8')
test_reader = filetest.readlines()

fileOutput = codecs.open(outputFileName, 'w', 'utf-8')

i = 0
for line in test_reader:
    content = pp.getcontent(test_reader, i)
    i = i + 1
    #if(i>10):
    #break
    if (i % 5000 == 0):
        print("%d " % (i)) + '#' * 30

    if (content == ''):
        print "test.py#" * 3 + line
Example #36
# -*- coding: utf-8 -*-
# imports needed by this excerpt (not shown in the original)
import re
import codecs

import pycrfsuite
from tgrocery import Grocery


class JdCRF(object):
    def __init__(self):
        self.data = []
        self.clf = Grocery("jdclf")
        self.clf.load()
        
        self.SEX = re.compile(u"性别不限|性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.DEGREE = re.compile(u"(全日制)?(初中|高中|中专|大专|专科|大学专科|中职|本科|大学本科|硕士|研究生|博士|博士后)(.?以上)?")
        self.MAJOR = re.compile(u"\S+(相关专业|专业优先|及其.专业|[类等]专业[优先]?)")
        self.EXP = re.compile(u"工作经验:|工作经[历验]|工作年限|年.{0,4}经[历验]|经[历验].{1,6}年")
        self.PUB_TIME = re.compile(u"(\d+)(天前发布)")
        
        self.INCNAME = re.compile(u"\S+(有限公司|酒店|银行|集团|研究中心|研究所|学校|旅行社|分?公司|研发中心|技术部|事.部|招聘)") 
        self.NOT_INC = re.compile(u"职位|描述|收藏|推荐|地址|邮箱|主页|介绍|欢迎|加入|要求|简介|险一金|奖金|包吃住|社区|厂房|人员|职责") 
        self.INCTAG = re.compile(u"大公司|五百强|全球500强|小公司|成长型公司|创业公司|私有经济|集体经济|集团|外企|已上市|稳定性高|平均年龄\d岁|妹纸多|学历高|福利待遇好|晋升机会大|民营公司|民营企业\
                                 |互联网|创业型|国企|央企")

        self.JOBNAME = re.compile(u'\S*(研发工程师|工程师|经理|助理|顾问|前台|秘书|主管|研究员|实习生|操作员|专员|教学人员|技术人员|管理员|业务员|公关|程序员|教师|老师|培训生|\
                                  文员|研究员|策划|主任|总监|设计师|分析师|架构师|摄影师|编辑|BD|游戏UI|Android(开发)?|PHP(开发)?|Python(开发)?|.?(急招|急聘|初级|中级|高级|方向).?[\s)】\)])|\
                                  |行政人事|网店设计|客服|会计|电话销售|外贸跟单|web前端|游戏UI|后.开发|产品运营|商业数据分析')

        self.START_DEMAND = re.compile(u"(岗位要求|应聘条件|任职要求|岗位资格|任职资格|岗位条件|工作要求|任职条件|人员条件|职位.求|职位条件|职位描述|岗位资格|职位资格|具备条件)[::\s]\
                                       |如果你.{0,10}[::\s]|我们希望你.{0,12}[::\s]|(要求|条件)[::\s]|你需要?具备什么.+[?\?::\s]|任职资格[::\s]")

        self.DEMAND = re.compile(u"熟悉|熟练|具有|善于|懂得|掌握|具备|能够|优先|不少于|不超过|至少|团队.作|良好的|工作经验|开发经验|实习经历|能力强|富有|以上学历|经验|喜欢|\
                                 较强的.{2,8}能力|相关专业|相关学历|者优先|精通|了解|及以上|技术全面|.强的责任心|[能有]独立|英文流利")

        self.DUTY = re.compile(u"跟进|协助|负责|配合|其他工作|领导交办的|对.+提供|审核|参与|提出|跟踪|报告|为.+提出|日常.+工作|指导|跟进|拓展|运营|用户|客户|协调|拟写|通过|协同\
                               |完成|沟通|需求|秘书.{2,5}翻译")

        self.START_DUTY = re.compile(u"(岗位职责|岗位描述|职位描述|职责描述|任职描述|职位职责|工作职责|工作职能|职位职能|工作内容|实习内容|职位内容)[::\s]|做这样的事[::\s]|职责.{0,5}[::\s]")

        self.PAY = re.compile(u"薪酬|待遇|月薪|薪资|年薪|底薪|\d+k|\d+万|\d+元|工资|报酬|薪水|福利")

        self.BENEFIT = re.compile(u"周休|补助|补贴|假日|餐补|提成|交通补助|食宿|加班工资|期权|年假|领导|扁平化|管理|氛围|空间|休假|月假|带薪|全休|晋升|培训|舒适的|旅游|奖励|过节费|五险一金|奖金|\
                                  |弹性工作|氛围|成长空间|实训|培训|高薪|前景|旅游|活动|分红")
        
    


    def gen_data(self, fname='./data/lagou_train.txt'):
        fw = codecs.open('./data/jd_train_crf.txt', 'wb', 'utf-8')
        cnt = 1
        for line in codecs.open(fname, 'rb', 'utf-8'):
            if line.startswith(u"====="):
                fw.write(line)
                continue

            cnt += 1
            if len(line.strip()) > 1:
                pred = self.clf.predict(line)
                newline = pred + '\t\t' + line.strip() + '\t\t' + str(len(line)) + "\n"
                fw.write(newline)
        print cnt
        print 'done'


    def load_data(self,fname="./data/jd_train_crf.txt"):
        data = []
        tmp = []
        for line in codecs.open(fname,'rb','utf-8'):
            if line.startswith(u"===="):
                data.append(tmp)
                tmp = []
                continue
            else:
                tag_data = line.strip().split('\t\t')
                if len(tag_data)==3:
                    tmp.append(tuple(tag_data))
                else:
                    print '\t  '.join(tag_data)

        
        n = len(data)/2
        print 'train data',n
        print 'test data',len(data)-n
        return data[n:],data[:n]
    

    def word2features(self,sent,i):
        word = sent[i][0]
        postag = sent[i][1]

        features = [
            'bias',
            'word.lower=' + word.lower(),
            'word[:2]=' + word[:2],
            'word.isdigit=%s' % word.isdigit(),
            'postag=' + postag,
            # each indicator is '1'/'0' depending on whether the pattern fires;
            # the ternary must be parenthesized so it only selects the flag
            'demand=%s' % ('1' if self.DEMAND.search(word) else '0'),
            'start_demand=%s' % ('1' if self.START_DEMAND.search(word) else '0'),
            'start_duty=%s' % ('1' if self.START_DUTY.search(word) else '0'),
            'duty=%s' % ('1' if self.DUTY.search(word) else '0'),
            'jobname=%s' % ('1' if self.JOBNAME.search(word) else '0'),
            'incname=%s' % ('1' if self.INCNAME.search(word) else '0'),
            'benefit=%s' % ('1' if self.BENEFIT.search(word) else '0'),
            'pred=%s' % self.clf.predict(word)
        ]

        if i > 0:
            word1 = sent[i - 1][0]
            postag1 = sent[i - 1][1]

            # context features are computed on the previous token (word1)
            features.extend([
                '-1:postag=' + postag1,
                '-1:word.lower=' + word1[:3].lower(),
                '-1:start_demand=%s' % ('1' if self.START_DEMAND.search(word1) else '0'),
                '-1:start_duty=%s' % ('1' if self.START_DUTY.search(word1) else '0'),
                '-1:demand=%s' % ('1' if self.DEMAND.search(word1) else '0'),
                '-1:duty=%s' % ('1' if self.DUTY.search(word1) else '0'),
                '-1:jobname=%s' % ('1' if self.JOBNAME.search(word1) else '0'),
                '-1:incname=%s' % ('1' if self.INCNAME.search(word1) else '0'),
                '-1:benefit=%s' % ('1' if self.BENEFIT.search(word1) else '0'),
                '-1:pred=%s' % self.clf.predict(word1),
            ])

        else:
            features.append('BOS')


        if i < len(sent) - 1:
            word1 = sent[i + 1][0]  # the token is at index 0, the postag at index 1
            postag1 = sent[i + 1][1]
            # context features are computed on the next token (word1)
            features.extend([
                '+1:word.lower=' + word1[:3].lower(),
                '+1:word.istitle=%s' % word1.istitle(),
                '+1:word.isupper=%s' % word1.isupper(),
                '+1:postag=' + postag1,
                '+1:postag[:2]=' + postag1[:2],
                '+1:start_demand=%s' % ('1' if self.START_DEMAND.search(word1) else '0'),
                '+1:start_duty=%s' % ('1' if self.START_DUTY.search(word1) else '0'),
                '+1:demand=%s' % ('1' if self.DEMAND.search(word1) else '0'),
                '+1:duty=%s' % ('1' if self.DUTY.search(word1) else '0'),
                '+1:jobname=%s' % ('1' if self.JOBNAME.search(word1) else '0'),
                '+1:incname=%s' % ('1' if self.INCNAME.search(word1) else '0'),
                '+1:benefit=%s' % ('1' if self.BENEFIT.search(word1) else '0'),
                '+1:pred=%s' % self.clf.predict(word1),
            ])
        else:
            features.append('EOS')


        return features




    def sent2features(self,sent):
        return [self.word2features(sent,i) for i in range(len(sent))]

    def sent2labels(self,sent):
        return [label for (label,token,postag) in sent]

    def sent2tokens(self,sent):
        return [token for (label,token,postag) in sent]
    

    def train(self,x_train,y_train):
        
        assert len(x_train)==len(y_train),"not the same %d  %d"%(len(x_train),len(y_train))

        trainer = pycrfsuite.Trainer(verbose=False)

        for xseq,yseq in zip(x_train,y_train):
            trainer.append(xseq,yseq)

        trainer.set_params({
            'c1':1.0,
            'c2':1e-3,
            'max_iterations':50,
            'feature.possible_transitions':True
        })

        trainer.train('jd_skill.crfsuite')

    
    def test(self,sent):
        tagger = pycrfsuite.Tagger()
        tagger.open('./jd_skill.crfsuite')
        
        print 'tokens   ','\n '.join(self.sent2tokens(sent))
        print 'Predicted','\t '.join(tagger.tag(self.sent2features(sent)))
        print 'Correct  ','\t '.join(self.sent2labels(sent))
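A minimal driver sketch tying the JdCRF pieces together, assuming the tagged file ./data/jd_train_crf.txt already exists (e.g. produced by gen_data()):

crf = JdCRF()
train_sents, test_sents = crf.load_data()

# build feature/label sequences and fit the CRF; train() writes jd_skill.crfsuite
x_train = [crf.sent2features(s) for s in train_sents]
y_train = [crf.sent2labels(s) for s in train_sents]
crf.train(x_train, y_train)

# tag one held-out sentence and compare against the gold labels
crf.test(test_sents[0])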
Example #37
# -*- coding: utf-8 -*-
# imports needed by this excerpt (not shown in the original); base_dir and
# lcs_len are assumed to come from the project's config and utilities
import re
from collections import OrderedDict, defaultdict
from copy import deepcopy

import codecs
import jieba
import numpy as np
from tgrocery import Grocery


class XzParserTop(object):
    def __init__(self):

        self.result = OrderedDict()
        inc_keys = [
            "jdFrom", "incName", "incAliasName", "incLogo", "incScale",
            "incType", "incIndustry", "incIntro", "incCity", "incLocation",
            "incZipCode", "incContactName", "incContactInfo", "incUrl"
        ]
        # job_keys = ["pub_time", "jobEndTime", "jobPosition", "jobCate", "jobSalary", "jobWorkAge","jobDiploma", "jobDesc",
        #             "jobType","jobNum", "jobWorkCity","jobWorkLoc","jobWelfare", "jobMajorList", "age", "gender", "email",
        #             "jobCVformat", "jobMinimumDays","jobSkill","jobCertificate"]

        jdInc = OrderedDict()
        for k in inc_keys:
            jdInc[k] = ""
        self.result["jdInc"] = jdInc
        jdJob = OrderedDict()
        # for k in job_keys:
        #     jdJob[k] = ""
        self.result["jdJob"] = jdJob

        self.CLEAN_TEXT = re.compile(
            u"[^\u4e00-\u9fa5\w\d;::;,。、%\.,/。!!@()\r\n\(\)\-\+ - `]")

        self.clf = Grocery(base_dir + "/jdclf")
        self.clf.load()

        self.SPLIT_LINE = re.compile(u"[\r\n;::。!?;]|[ \s \xa0\u724b]{4,}")
        self.CLEAN_LINE = re.compile(
            u"^[\u2022(【\[\s\t\r\n\(\-  ]?[\da-z12345789]{1,2}[\.,。、,::)】\]\)\s]|^[!@#¥%……&×()\(\){}:“|、-\-,。::\.]|^[一二三四五六七八九123456789\d]{0,2}[\.、\s:: ]|[,;。、\s \.]$|^[\s \u2022 \uff0d \u25cf]"
        )
        self.CLEAN_JOBNAME = re.compile(
            u"急聘|诚聘|高薪|包[食住宿餐]|.险一金|待遇|^急?招|职位编号\s?[\s\d::]")

        self.SEX = re.compile(u"性别|男|女")
        self.JOB_TAG = re.compile(u"全职|实习")

        self.START_DEMAND = re.compile(
            u"(任职资格|岗位要求|工作要求|任职条件|任职要求|职位要求)[::\s】\n ]?")
        self.START_DUTY = re.compile(
            u"(工作内容|岗位职责|工作职责|职位描述|工作描述|职位介绍|职位职责|岗位描述)[::\s 】\n ]")
        self.START_BENEFIT = re.compile(u"(福利待遇|待遇|福利)[::\s\n】]")

        self.DEMAND = re.compile(u"精通|掌握|熟悉|熟练|有.+经验")
        self.DUTY = re.compile(u"负责|促成|为客户|安排的其.工作")
        self.BENEFIT = re.compile(u".险一金|福利|晋身|休假|带薪|补助|补贴")
        self.CERT = re.compile(
            u"(\S{2,8}证书|CET-\d|普通话|英语|口语|.语|日文|雅思|托福|托业)(至少)?(通过)?[\d一二三四五六七八九]级[及或]?(以上)?|(英语)?CET-\d级?(以上)?|\
                                 医学.{0,3}证|会计.{0,3}证|律师.{0,3}证|有.{1,8}证书")

        self.degreedic = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/degrees.txt', 'rb', 'utf-8')
        ])
        self.majordic = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/majordic_new.txt', 'rb', 'utf-8')
        ])
        self.skilldic = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/skills.txt', 'rb', 'utf-8')
        ])
        self.jobdic = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/jobnames.txt', 'rb', 'utf-8')
        ])
        self.position = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/jobposition_new.txt', 'rb', 'utf-8')
        ])
        self.position_prefix = set([
            line.strip() for line in codecs.open(
                base_dir + '/data/jobposition_prefix.txt', 'rb', 'utf-8')
        ])
        self.position_postfix = set([
            line.strip() for line in codecs.open(
                base_dir + '/data/jobposition_postfix.txt', 'rb', 'utf-8')
        ])
        self.citydic = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/citydic.txt', 'rb', 'utf-8')
        ])
        self.province_city = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/province_city.txt', 'rb', 'utf-8')
        ])
        self.city_area = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/city_area.txt', 'rb', 'utf-8')
        ])

        jieba.load_userdict(base_dir + '/data/majordic_new.txt')
        jieba.load_userdict(base_dir + '/data/skills.txt')
        jieba.load_userdict(base_dir + '/data/jobposition.txt')
        jieba.load_userdict(base_dir + '/data/firm.txt')
        jieba.load_userdict(base_dir + '/data/degrees.txt')
        jieba.load_userdict(base_dir + '/data/benefits.txt')
        jieba.load_userdict(base_dir + '/data/citydic.txt')
        jieba.load_userdict(base_dir + '/data/province_city.txt')

        #new

        self.INTRO = re.compile(u"公司介绍|公司简介|企业简介|企业介绍|关于我们|单位简介|关于")
        self.JOBNAME_LINE = re.compile(
            u"岗位:|招聘岗位|实习生招聘|职位名称|招聘职位|实习岗位|岗位方向|定向岗位|岗位$|岗位名称")
        self.JOBNAME = re.compile(
            u".*?工程师|\S{2}专员$|\S{4,}岗$|工程师$|\S{4,}实习生招聘$|职位\d.*?分析|[^招聘]实习生$|研究员$\
            |经理|.*?实习生[((].*?[))|培训生$]")
        self.CONTACTINFO = re.compile(
            u'联络方式|联络电话|固定电话|固话|电话|联系电话|QQ|联系方式|传真|Tel')
        self.CONTACTNAME = re.compile(u'联络|联系人$|联络人')
        self.NUMBER = re.compile(
            u'(?<=[^-Q/——_ 0123456789])([-/_ ——0123456789]{7,})')
        self.QQ = re.compile(u'QQ\d{6,}|QQ|qq')
        self.PUNCTION = re.compile(u'[~\],.;:: ,、。《》【】!#……<>;“”]')
        self.INC_URL = re.compile(u"(主页|网站|网址|官网):(.{0,5}[\w\d_/\.:\-]+)")
        self.MAIL = re.compile(u"\w+@[\w\.]+|\w+\.\w+@[\w\.]+|\w.*?at.*?com")
        self.FORMART = re.compile(
            u'(邮件|简历)名称?以?(主题|格式|标题){0,}为?[~\],.;::,、。()《》【】!#……()-<>;“”](.*)|\
        ("|“){1,}(.*?姓名.*?)["”]|(“|"){1,}(.*?姓名.*?学校.*?)[”"]|("|“){1,}(\S{2,}-\S{2,}-\S{2,}.*?)[”"]|([ 姓名年级学校职位可入职时间-]{5,})'
        )

        self.JOBLOC = re.compile(u"工作地址|上班地点|公司地址|工作地点|实习地点|总部地址|[^邮件](地址|地点)")
        self.MAJOR = re.compile(u"相关专业|等?专业|专业优先|以上学历|优先考虑|专业背景")
        self.AGE_LINE = re.compile(u"\d+周?岁|年龄|\d{2}岁")
        self.AGE = re.compile(
            u"\d{2}?\s?[\- -~到至]?\s?\d{2}周?岁|(至少|不低于|不超过|不大于|大概|大约|不少于|大于)\d+周?岁|\d+周?岁(以上|以下|左右|上下)"
        )
        self.WEEKTIME = re.compile(
            u"(每|一)周(最好)?(至少|最少|保证|起码)?(实习|工作)?[\d一二三四五六七八九十].*?(天|日)(以上)?|实习天数[\d一二三四五六七](天|日)|\
(每|一)周.*?(最好|可到岗)?(至少|最少|保证|起码)?(实习|工作)?[\d一二三四五六七八九十].*?(天|日)(以上)?")
        self.JOBLENGTH = re.compile(
            u"(实习期)?(至少|保证|起码)?(工作)?[\d一二三四五六七八九十]个.*?月(或)?(以上)|\
周期?.*?(\d个月[-―]{1,2}\d个月)|(实习期)?(至少|保证|起码)(工作)?[\d一二三四五六七八九十]个.*?月(以上)?|至少.{1,5}年(以上)?|(实习).{1,5}年以上"
        )
        self.XUHAO = re.compile(u"[0123456789一二三四五六七八九]")
        self.JOBNUM_LINE = re.compile(u"招?(\d{1,}人)|招聘人数:|岗位人数|人数:")
        self.JOBNUM = re.compile(
            u"(\(| )?(\d{1,}[人名]?)|(\d{1,}[-_~]?\d{1,}[人名])")
        self.PHONE = re.compile(
            u'(?<!\d)\d{11,12}(?!\d)|\d{3,4} \d{7,8}|\d{3,4}-\d{7,8}|\d{3,4}—\d{7,8}|\d{3}-\d{3}-\d{4}|(?<!\d)\d{8}(?!\d) '
        )
        self.ENDTIME = re.compile(
            u"(\d.*?[号|日])之?.{0,5}前|截止时间(\d.*日)|截止日期(\d.*日)|(\d.*日)之前")
        self.DEGREE_LINE = re.compile(u"(最低)?学历要求|(以上)?学历")
        self.DEGREE = re.compile(
            u"(小学|初中|高中|职技|本科|研究生|硕士|博士|教授|专科|大专|中专|无要求|不限|Master)(?!优先)|(小学|初中|高中|职技|本科|研究生|硕士|博士|教授|专科|大专|中专|无要求|不限|无限|Master)$"
        )

        self.SALARY = re.compile(
            u"\d{3,}元?\-?\d{3,}元|(本科|研究生|硕士)?\d{2,4}(元|RMB)/?(天|周|月|day|Day)|\dk[--]\dk"
        )
        # rules for locating section headings
        self.FIRST = re.compile(u"一、|二、|三、|四、|五、|六、|七、|八、|九、")
        self.THIRD = re.compile(u"[\[【]\S{3,6}[\[】]|^\S{4,5}:$")

        # CNN model (disabled)
        # self.model,self.word_idx_map,self.config = load_model()

        # build an Aho-Corasick automaton for keyword matching (disabled)
        # builder = AcoraBuilder(list(self.position))
        # self.ac = builder.build(builder)

    '''
        Scan the text line by line for information.
        self.flag handles fields whose value spills onto the following line, e.g.:
        工作地址:
        广东省深圳市XX路

        self.extra_info takes four parameters:
        line: one line of text (without label information)
        idx: the index of that line
        add_desc: whether to append the line to the job description
        clean_major: whether to clear the majors and skills collected so far
    '''

    def extra_info(self, line, idx=None, add_desc=True, clean_major=False):
        '''
        Args:
            line (): one line of text
            idx (): the index of this line within the paragraph
            add_desc (): whether to append this line to the job description, default True
            clean_major (): whether to clear the previously collected majors and skill requirements

        Returns:

        '''
        if self.jdType == "regular":
            jobName = deepcopy(self.jobName)
        elif self.jdType == "special":
            jobName = deepcopy(self.jobNameList[idx])
        # append this line to the job description
        if add_desc:
            if self.jdJob[jobName].has_key("jobDesc"):
                self.jdJob[jobName]["jobDesc"] += line + u'\n'
            else:
                self.jdJob[jobName]["jobDesc"] = line + u'\n'
        if clean_major:
            self.majorList = []
            self.skillList = []

        # if the job title contains a headcount, split it off
        if self.JOBNUM.search(jobName):
            m = re.search(u"\d{1,}人", jobName)
            if m:  # JOBNUM also matches bare digits, so guard against a miss here
                jobNum = m.group(0)
                jobName = jobName.split(jobNum)[0]
                self.jdJob[jobName]["jobNum"] = jobNum

        if self.flag == "workloc":
            self.jdJob[jobName]["jobWorkLoc"] = line
            self.flag = None

        if self.JOBLOC.search(line):
            print 'workloc'
            if (len(line) < 20
                    or len(line) > 100) and not re.search(u"[^城]市|路", line):
                pass
            elif line.count(u":") == 1:
                workloc = line.split(":")[1]
                if len(workloc) > 60:
                    workloc = re.split(self.PUNCTION, workloc)[0]
                self.jdJob[jobName]["jobWorkLoc"] = workloc
                if not line.split(":")[1].strip():
                    self.flag = "workloc"
            elif line.count(u":") > 1:
                for tag in filter(None, line.split(" ")):
                    if self.JOBLOC.search(tag):
                        if tag.count(u":") == 1:
                            self.jdJob[jobName]["jobWorkLoc"] = tag.split(
                                u":")[1]
            #
            elif len(filter(None, self.PUNCTION.split(line))) > 1:
                self.jdJob[jobName]["jobWorkLoc"] = filter(
                    None, self.PUNCTION.split(line))[-1]

            else:
                self.flag = "workloc"

        # match majors and skills against the lexicons
        if self.DEMAND.search(line):
            word_split = jieba.cut(line, cut_all=True)
            # tokenize, then look each token up in the skill lexicon
            print "demand"
            for word in word_split:
                word = word.lower()
                if word in self.skilldic:
                    self.skillList.append(word)

        if self.MAJOR.search(line):
            word_split = jieba.cut(line)
            # tokenize, then look each token up in the major lexicon
            print "major"
            for word in word_split:
                word = word.lower()
                # print word
                word = re.sub(u"相关|学校|专业", u"", word)
                if word in self.majordic:
                    self.majorList.append(word)

        if self.FORMART.search(line):
            print "format"
            if line.count(u":") == 1 and len(line) < 30:
                self.jdJob[jobName]["jobCVformat"] = line.split(u":")[1]
            else:
                groups = filter(None, self.FORMART.search(line).groups())
                format = groups[np.argmax(map(lambda x: len(x), groups))]
                self.jdJob[jobName]["jobCVformat"] = format
        if self.ENDTIME.search(line):
            print "endtime"
            self.jdJob[jobName]["jobEndTime"] = self.ENDTIME.search(
                line).group()
        if self.MAIL.search(line):
            print "email"
            if len(self.MAIL.search(line).group()) > 8:
                self.jdJob[jobName]["email"] = self.MAIL.search(line).group()
            else:
                if line.count(u":") == 1:
                    self.jdJob[jobName]["email"] = line.split(u":")[1]
        if self.SALARY.search(line):
            print "salary"
            for item in re.split(u" |;|,|,", line):
                if self.SALARY.search(item):
                    if self.jdJob[jobName].has_key("jobSalary"):
                        self.jdJob[jobName][
                            "jobSalary"] += u" " + self.SALARY.search(
                                item).group()
                    else:
                        self.jdJob[jobName]["jobSalary"] = self.SALARY.search(
                            item).group()

        if self.JOBNUM_LINE.search(line):
            print "jobnum"
            self.jdJob[jobName]["jobNum"] = self.JOBNUM.search(line).group()
        if self.WEEKTIME.search(line):
            print "weektime"
            self.jdJob[jobName]["jobMinimumDays"] = self.WEEKTIME.search(
                line).group()
        if self.JOBLENGTH.search(line):
            print "jobLength"
            self.jdJob[jobName]["jobLength"] = self.JOBLENGTH.search(
                line).group()
        if self.DEGREE.search(line):
            print "degree"
            line = re.sub(u"士研究生", u"士", line)
            print filter(lambda x: len(x) > 1, self.DEGREE.findall(line))
            self.jdJob[jobName]["jobDiploma"] = list(
                set(filter(None,
                           self.DEGREE.findall(line)[0])))
        if self.AGE_LINE.search(line):
            print "age"
            findage = self.AGE.search(line)
            if findage:
                self.jdJob[jobName]["age"] = findage.group()

        if len(self.majorList) > 0:
            self.jdJob[jobName]["jobMajorList"] = list(set(self.majorList))
        if len(self.skillList) > 0:
            self.jdJob[jobName]["jobSkill"] = list(set(self.skillList))

            # elif self.WEEKTIME.search(line):
            #     print "worktime"
            #     if line.count(u":") == 2:
            #         worktime = self.CLEAN_TEXT.sub(u"", line.split(u":")[1])
            #         self.jdJob[jobName]["jobMinimumDays"] = worktime
            #         if not worktime:
            #             self.flag = "worktime"
            #     else:
            #
            #         if self.flag == "worktime":
            #             self.jdJob[jobName]["jobMinimumDays"] += self.WORKTIME.search(line).group(0)
            #         else:
            #             if self.WEEKTIME.search(line).group(0).find(u":")<2:
            #                 self.jdJob[jobName]["jobMinimumDays"] = self.WORKTIME.search(line).group(0)

    def refresh(self):
        self.result = OrderedDict()
        inc_keys = [
            "jdFrom", "incName", "incAliasName", "incLogo", "incScale",
            "incType", "incIndustry", "incIntro", "incCity", "incLocation",
            "incZipCode", "incContactName", "incContactInfo", "incUrl"
        ]
        # job_keys = ["pub_time", "jobEndTime", "jobPosition", "jobCate", "jobSalary", "jobWorkAge", "jobDiploma",
        #             "jobDesc",
        #             "jobType", "jobNum", "jobWorkCity", "jobWorkLoc", "jobWelfare", "jobMajorList", "age", "gender",
        #             "email",
        #             "jobCVformat", "jobMinimumDays", "jobSkill", "jobCertificate"]

        jdInc = OrderedDict()
        for k in inc_keys:
            jdInc[k] = ""
        self.result["jdInc"] = jdInc
        jdJob = OrderedDict()
        # for k in job_keys:
        #     jdJob[k] = ""
        self.result["jdJob"] = jdJob

        self.first = []
        self.second = []
        self.third = []
        # record the JD type: whether it is a featured-company page and whether it contains a table
        self.jdType = None
        # whether it contains a table
        self.has_table = False
        # list of job titles
        self.jobNameList = []
        self.jobNameLine = []
        self.jobType = []
        # major list and skill list
        self.majorList = []
        self.skillList = []
        self.intro_range = []
        # holds per-job information
        self.jdJob = defaultdict(lambda: defaultdict(unicode))

    def replace_space(self, line):
        '''
        Input: one line of text
        Output: the same line with runs of spaces collapsed into a single space
        '''
        regex = re.compile(u' +')
        line = re.sub(regex, u' ', line)
        return line

    def judge_eng(self, word):
        '''
        Input: a single word
        Purpose: decide whether the word is English
        '''
        if len(re.split(u"\w", word.lower())) > 4:
            return True
        else:
            return False

    def clean_line(self, line):
        """
        清除一个句子首尾的标点符号
        """
        line = self.CLEAN_LINE.sub("", line).strip()
        line = re.sub("\s+|^/d+[;’、,/。\.]", "", line)
        return line

    def clean_cnNum(self, line):
        """
        When extracting years of experience, map Chinese numerals (一, 二, ...) to digits
        """
        line = unicode(line)
        a = [u"一", u"二", u"三", u"四", u"五", u"六", u"七", u"八", u"九", u"十", u"两"]
        b = range(1, 11) + [2]
        # unicode.translate() treats int values as code points, so map to strings
        table = dict((ord(aa), unicode(bb)) for aa, bb in zip(a, b))
        return line.translate(table)

    def line2vec(self, line):
        """
        Convert a sentence into a vector by summing word vectors
        (assumes self.w2v is a loaded word2vec model; not shown in this excerpt)
        """
        vec = np.zeros(50)
        for word in jieba.cut(line):
            if word in self.w2v.vocab:
                vec += self.w2v[word]

        return vec

    def clean_jobname(self, jobname):
        """
        职位名清洗
        """
        print jobname
        if jobname.lower() in self.jobdic:
            return jobname
        else:
            res = [(lcs_len(jobname, job), job) for job in self.jobdic]
            res.sort()
            return res[-1][1]

    def desc_extract(self, soup):
        line_list = soup.find_all("p")
        return '\n'.join([line.get_text() for line in line_list])

    # remove <img> tags, runs of 1-7 spaces, and &nbsp;
    removeImg = re.compile('<img.*?>| {1,7}|&nbsp;')
    # remove hyperlink tags
    removeAddr = re.compile('<a.*?>|</a>')
    # turn line-breaking tags into \n
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # turn table cells <td> into \t
    replaceTD = re.compile('<td>')
    # turn <br> or <br><br> into \n
    replaceBR = re.compile('<br><br>|<br>')
    # strip all remaining tags
    removeExtraTag = re.compile('<.*?>')
    # collapse runs of blank lines
    removeNoneLine = re.compile('\n+')

    def replace(self, x):
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        x = re.sub(self.removeNoneLine, "\n", x)
        # strip() removes leading and trailing whitespace
        return x.strip()
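A standalone sketch of the same tag-stripping pipeline applied to a hypothetical snippet (it avoids constructing the full parser, whose __init__ loads the model and lexicon files):

import re

raw = u'<div>岗位职责:<br>1、负责数据抓取</div><img src="logo.png">'
for pattern, repl in [('<img.*?>| {1,7}|&nbsp;', ''), ('<a.*?>|</a>', ''),
                      ('<tr>|<div>|</div>|</p>', '\n'), ('<td>', '\t'),
                      ('<br><br>|<br>', '\n'), ('<.*?>', ''), ('\n+', '\n')]:
    raw = re.sub(pattern, repl, raw)
print raw.strip()  # -> 岗位职责: / 1、负责数据抓取 on two lines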
Example #38
# coding=utf-8
from tgrocery import Grocery

grocery = Grocery('sample')

train_src = [('education', '名师指导托福语法技巧:名词的复数形式'),
             ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
             ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
             ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')]
grocery.train(train_src)
#grocery.train('/home/wangjianfei/git/data/train_ch.txt')
# grocery.train('train_ch.txt')
grocery.save()
new_grocery = Grocery('sample')
new_grocery.load()
print(
    new_grocery.predict(
        'Abbott government spends $8 million on higher education media blitz'))
test_src = [
    ('education', '福建春季公务员考试报名18日截止 2月6日考试'),
    ('sports', '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'),
]
print("start test..................")
#grocery.test('/home/wangjianfei/git/data/test.txt')
# grocery.train('train_ch.txt'))
# custom_grocery = Grocery('custom', custom_tokenize=list)
print(new_grocery.test(test_src))
Example #39
# -*- coding: utf-8 -*-
# imports needed by this excerpt (not shown in the original); base_dir is
# assumed to come from the project's config
import re
from collections import OrderedDict

import codecs
import jieba
from tgrocery import Grocery


class CoParserTop(object):
    def __init__(self):

        self.result = OrderedDict()
        co_keys = [
            'incName', 'incAliasName', 'incScale', 'incType', 'incIndustry',
            'incSubIndustry', 'incIntro', 'incIntroShort', 'incCity',
            'incLocation', 'locationInfo', 'incZipCode', 'incContactName',
            'incContactEmail', 'incContactPhone', 'incContactQQ', 'incUrl',
            'investIns', 'incStage', 'incLogo', 'incPhoto', 'incLabel',
            'prdInfo', 'leaderInfo', 'developInfo', 'incWeiboName',
            'incWeiboUrl', 'incWechatName', 'incWechatUrl', 'incWechatCode'
        ]

        coInc = OrderedDict()
        for k in co_keys:
            coInc[k] = ""
        self.result["coInc"] = coInc

        self.CLEAN_TEXT = re.compile(
            u"[^\u4e00-\u9fa5\w\d;::;,。、%\.,/。!!@()\r\n\(\)\-\+ - `]")

        self.clf = Grocery(base_dir + "/jdclf")
        self.clf.load()

        self.SPLIT_LINE = re.compile(u"[\r\n;::。!?;]|[ \s \xa0\u724b]{4,}")
        self.CLEAN_LINE = re.compile(
            u"^[\u2022(【\[\s\t\r\n\(\-  ]?[\da-z12345789]{1,2}[\.,。、,::)】\]\)\s]|^[!@#¥%……&×()\(\){}:“|、-\-,。::\.]|^[一二三四五六七八九123456789\d]{0,2}[\.、\s:: ]|[,;。、\s \.]$|^[\s \u2022 \uff0d \u25cf]"
        )
        self.CLEAN_JOBNAME = re.compile(
            u"急聘|诚聘|高薪|包[食住宿餐]|.险一金|待遇|^急?招|职位编号\s?[\s\d::]")

        self.PAY = re.compile("(\d{3,}\-)?\d{3,}元")
        self.SEX = re.compile(u"性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.JOB_TAG = re.compile(u"全职|实习")
        self.DEGREE = re.compile(
            u"小学|初中|高中|职技|本科|研究生|硕士|博士|教授|专科|大专|中专|无要求|不限|无限")
        self.MAIL = re.compile(u"\w+@[\w\.]+")
        self.ZIP = re.compile(u"(\d{6})")
        self.QQ = re.compile(u"\d{6,10}")
        self.PHONE = re.compile(
            u"1\d{10}|0\d{11}|\d{3,4}-\d{3,4}-\d{3,4}|\d{3,4}-\d{7,8}-\d{7,8}-\d{7,8}|\d{3,4}-\d{7,8}-\d{7,8}|\d{3,4}-\d{7,8}|\d{3,4}-\d{7,8}"
        )

        self.START_DEMAND = re.compile(
            u"(任职资格|岗位要求|工作要求|任职条件|任职要求|职位要求)[::\s】\n ]?")
        self.START_DUTY = re.compile(
            u"(工作内容|岗位职责|工作职责|职位描述|工作描述|职位介绍|职位职责|岗位描述)[::\s 】\n ]")
        self.START_BENEFIT = re.compile(u"(福利待遇|待遇|福利)[::\s\n】]")

        self.INC_URL = re.compile(u"(主页|网站|网址|官网).{0,3}[\w\d_/\.:\-]+")

        self.DEMAND = re.compile(u"精通|熟悉|熟练|有.+经验")
        self.DUTY = re.compile(u"负责|促成|为客户|安排的其.工作")
        self.BENEFIT = re.compile(u".险一金|福利|晋身|休假|带薪|补助|补贴")
        self.CERT = re.compile(
            u"(\S{2,8}证书|CET-\d|普通话|英语|口语|.语|日文|雅思|托福|托业)(至少)?(通过)?[\d一二三四五六七八九]级[及或]?(以上)?|(英语)?CET-\d级?(以上)?|\
                                     医学.{0,3}证|会计.{0,3}证|律师.{0,3}证|有.{1,8}证书")

        self.degreedic = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/degrees.txt', 'rb', 'utf-8')
        ])
        self.majordic = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/majordic.txt', 'rb', 'utf-8')
        ])
        self.skilldic = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/skills.txt', 'rb', 'utf-8')
        ])
        self.jobdic = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/jobnames.txt', 'rb', 'utf-8')
        ])
        self.citydic = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/citydic.txt', 'rb', 'utf-8')
        ])
        self.province_city = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/province_city.txt', 'rb', 'utf-8')
        ])
        self.city_area = set([
            line.strip()
            for line in codecs.open(base_dir +
                                    '/data/city_area.txt', 'rb', 'utf-8')
        ])
        self.SALARY = re.compile(u'万')

        jieba.load_userdict(base_dir + '/data/majordic.txt')
        jieba.load_userdict(base_dir + '/data/skills.txt')
        jieba.load_userdict(base_dir + '/data/firm.txt')
        jieba.load_userdict(base_dir + '/data/degrees.txt')
        jieba.load_userdict(base_dir + '/data/benefits.txt')
        jieba.load_userdict(base_dir + '/data/citydic.txt')
        jieba.load_userdict(base_dir + '/data/province_city.txt')

    def refresh(self):
        self.result = OrderedDict()
        co_keys = [
            'incName', 'incAliasName', 'incScale', 'incType', 'incIndustry',
            'incSubIndustry', 'incIntro', 'incIntroShort', 'incCity',
            'incLocation', 'locationInfo', 'incZipCode', 'incContactName',
            'incContactEmail', 'incContactPhone', 'incContactQQ', 'incUrl',
            'investIns', 'incStage', 'incLogo', 'incPhoto', 'incLabel',
            'prdInfo', 'leaderInfo', 'developInfo', 'incWeiboName',
            'incWeiboUrl', 'incWechatName', 'incWechatUrl', 'incWechatCode'
        ]

        coInc = OrderedDict()
        for k in co_keys:
            coInc[k] = ""
        self.result["coInc"] = coInc
Example #40
#coding=utf-8
from tgrocery import Grocery

text_model = Grocery('all_no_town')
text_model.load()


# given a text, predict its class_name and class_prob
def predict(text):
    c = text_model.predict(' '.join(list(text)))
    class_name = str(c)
    class_prob = c.dec_values[class_name]
    return class_name, class_prob


print predict(u'100')
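A hedged usage sketch: reject low-confidence predictions by thresholding the decision value (the 0.2 cutoff and the input text are illustrative only):

name, prob = predict(u'全年营业收入同比增长100')
if prob < 0.2:
    name = 'unknown'  # confidence too low to trust the label
print name, prob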
Example #41
		tdic['id'].append(_id)
		tdic['type'].append(_type)
		tdic['contents'].append(contents)
	i +=1
	
#train = pd.read_csv( train_file, header = 0, delimiter = "\t", quoting = 3 )
#test = pd.read_csv( test_file, header = 1, delimiter = "\t", quoting = 3 )
train = DataFrame(dic)
test = DataFrame(tdic)
# classfynews_instance is the path where the model is saved
grocery = Grocery('classfynews_instance')

train_in = zip(train['type'], train['contents'])  # tgrocery expects (label, text) pairs
grocery.train(train_in)
print grocery.get_load_status()
#grocery.save()

copy_grocery = Grocery('classfynews_instance')
copy_grocery.load()
#copy_grocery = grocery
test_in = zip(test['type'], test['contents'])  # (label, text) pairs for the commented-out test() call below
# input like [u'我是中国人', u'台北*****']
# output like [11, 12]
test_result = copy_grocery.predict(test['contents'])
print test_result.predicted_y
#test_result = copy_grocery.test(test_in)
#print test_result.show_result()


Example #42
def main():
    # Get market_sentiment of word from NTUSD-Fin
    train_t = []
    train_s = []
    targetIn = {}
    targetDict = dict()
    with open('NTUSD-Fin/NTUSD_Fin_hashtag_v1.0.json', 'r',
              encoding='utf-8') as f:
        targetIn = json.load(f)
    N = len(targetIn)
    for i in range(N):
        word = "#" + targetIn[i]['token']
        targetDict[word] = targetIn[i]['market_sentiment']
        sg = str(GroupValue_s(str(targetDict[word] / 3.5)))
        train_s.append((sg, word))
    with open('NTUSD-Fin/NTUSD_Fin_word_v1.0.json', 'r',
              encoding='utf-8') as f:
        targetIn = json.load(f)
    N = len(targetIn)
    for i in range(N):
        word = targetIn[i]['token']
        targetDict[word] = targetIn[i]['market_sentiment']
        sg = str(GroupValue_s(str(targetDict[word] / 3.5)))
        train_s.append((sg, word))

    # Training File: Load data & Use tgrocery to train classification model
    TrainingFile = open('training_set.json', 'r')
    TrainingData = json.load(TrainingFile)
    TrainingFile.close()
    DataList = []
    grocery_t = Grocery("tweet")
    grocery_s = Grocery("snippet")
    for DataElement in TrainingData:
        tempt = DataManager()
        tempt.insertData(DataElement)
        tempt.group_t = GroupValue_t(tempt.sentiment)
        tempt.group_s = GroupValue_s(tempt.sentiment)
        line = re.sub("https?://[\w\-]+(\.[\w\-]+)+\S*", " ",
                      DataElement["tweet"])
        train_t.append((str(tempt.group_t), line))
        if isinstance(DataElement["snippet"], list):
            for line in DataElement["snippet"]:
                train_s.append((str(tempt.group_s), line))
        elif DataElement["snippet"] != "":
            train_s.append((str(tempt.group_s), DataElement["snippet"]))
        else:
            tempt.group_s = 0.0
        DataList.append(tempt)
    grocery_t.train(train_t + train_s)
    grocery_t.save()
    grocery_s.train(train_s)
    grocery_s.save()

    # Save training data created by WordScore() and GroupValue_*()
    # Data will be used for LinearRegression() in BOTH.py
    outfile = open('TG_train.txt', 'w', encoding='utf-8')
    dataScore = []
    dataSentiment = []
    for row in DataList:
        dataSentiment.append([float(row.sentiment)])
        a = WordScore(row.tweet, targetDict)
        b = WordScore(row.snippet, targetDict)
        c = row.group_t
        d = row.group_s
        dataScore.append([a, b, c, d])
        print(a, b, c, d, file=outfile)
    outfile.close()
    '''
	# Train linear regression model
	model = LinearRegression()
	model.fit(dataScore, dataSentiment)

	# Test for training data
	print('(train)R-squared: %.3f' % model.score(dataScore, dataSentiment)) #0.915
	predictions = model.predict(dataScore)
	rms = mean_squared_error(dataSentiment,predictions)
	print('RMSE: %.3f' % sqrt(rms)) #0.110
	print('MSE: %.3f' % rms) #0.012
	'''

    # Testing File: Load data & Use tgrocery classification model to predict
    TestingFile = open('test_set.json', 'r')
    TestingData = json.load(TestingFile)
    TestingFile.close()
    DataList = []
    new_grocery_t = Grocery('tweet')
    new_grocery_t.load()
    new_grocery_s = Grocery('snippet')
    new_grocery_s.load()
    for DataElement in TestingData:
        tempt = DataManager()
        tempt.insertData(DataElement)
        line = re.sub("https?://[\w\-]+(\.[\w\-]+)+\S*", " ",
                      DataElement["tweet"])
        tempt.group_t = float('{0}'.format(new_grocery_t.predict(line)))
        value = 0.0
        if isinstance(DataElement["snippet"], list):
            for line in DataElement["snippet"]:
                value = value + float('{0}'.format(
                    new_grocery_s.predict(line)))
            value = value / len(DataElement["snippet"])
        elif DataElement["snippet"] != "":
            value = float('{0}'.format(
                new_grocery_s.predict(DataElement["snippet"])))
        tempt.group_s = value
        DataList.append(tempt)

    # Save testing data created by WordScore() and classification prediction
    # Data will be used for LinearRegression() in BOTH.py
    outfile = open('TG_test.txt', 'w', encoding='utf-8')
    dataScore = []
    dataSentiment = []
    for row in DataList:
        dataSentiment.append([float(row.sentiment)])
        a = WordScore(row.tweet, targetDict)
        b = WordScore(row.snippet, targetDict)
        c = row.group_t
        d = row.group_s
        dataScore.append([a, b, c, d])
        print(a, b, c, d, file=outfile)
    outfile.close()
    '''
Example #43
grocery = Grocery('sample')
# Training texts can be passed in as a list
train_src = [
    ('education', '名师指导托福语法技巧:名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')
]
grocery.train(train_src)
# ...or loaded from a file (tab-separated by default; custom delimiters are supported)
#grocery.train('train_ch.txt')
# Save the model
grocery.save()
# Load the model (use the same name it was saved with)
new_grocery = Grocery('sample')
new_grocery.load()
# Predict
new_grocery.predict('考生必读:新托福写作考试评分标准')
#education

# Test
test_src = [
    ('education', '福建春季公务员考试报名18日截止 2月6日考试'),
    ('sports', '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'),
]
new_grocery.test(test_src)
# Prints the test accuracy
#0.5

# A test file can be passed in the same way
#new_grocery.test('test_ch.txt')
Example #44
import random
import sys

from tgrocery import Grocery


def data2tt(path, path1, path2, theta):
    # signature and the ftrain line are reconstructed from the __main__ call
    # below; the original excerpt begins mid-function
    ftrain = open(path1, 'w')
    ftest = open(path2, 'w')
    for line in open(path):
        if random.random() < theta:
            ftest.write(line)
        else:
            ftrain.write(line)
    ftrain.close()
    ftest.close()

def train(path,name):
    grocery = Grocery(name)   
    grocery.train(path)
    grocery.save()

if __name__ == "__main__":
    data2tt(sys.argv[3], sys.argv[1], sys.argv[2], 0.02)
    train(sys.argv[1], "music")
    new_grocey = Grocery("music")
    new_grocey.load()
    n = 0
    for line in open(sys.argv[2],"r"):
        ls = line.strip().split("\t")
        predict = new_grocey.predict(ls[1])
        test = ls[0]
        result = 0
        if test == str(predict):
            result = 1
        n += result
        print predict, test, result
    print n
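A short follow-up sketch (not in the original): turning the correct-count printed above into an accuracy by also counting the test lines.

    total = sum(1 for _ in open(sys.argv[2], "r"))
    print "Accuracy: %.4f" % (1.0 * n / total)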