def topic_feature_process(x_train, x_test, y_train, y_test):
    x_train_msg = []
    x_test_msg = []
    x_corpus = []
    crf = CRFWordSegment()
    for x_msg in x_train:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_train_msg.append(data_msg)
    for x_msg in x_test:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_test_msg.append(data_msg)
    x_corpus.extend(x_train_msg)
    x_corpus.extend(x_test_msg)
    vectorizer = TfidfVectorizer()
    tfidf_corpus = vectorizer.fit_transform(x_corpus).toarray()
    tfidf_train = tfidf_corpus[0:len(x_train_msg)]
    tfidf_test = tfidf_corpus[len(x_train_msg):len(tfidf_corpus)]
    y_pred = cls_cos_sim(tfidf_test, tfidf_train, y_train)
    f1 = f1_score(y_test, y_pred)
    return f1
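# topic_feature_process above relies on cls_cos_sim, which is defined elsewhere
# in the project. A minimal sketch of a cosine-similarity nearest-neighbour
# classifier with that interface (an assumption about the real helper, not its
# actual implementation):
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def cls_cos_sim_sketch(tfidf_test, tfidf_train, y_train):
    # For each test row, predict the label of its most similar training row.
    sim = cosine_similarity(tfidf_test, tfidf_train)  # shape: (n_test, n_train)
    nearest = np.argmax(sim, axis=1)
    return np.asarray(y_train)[nearest].tolist()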
def process(self, word_list):
    ret = []
    lst = self.remove_dup_sentense(word_list)
    crf = CRFWordSegment()
    for l in lst:
        ret.append(crf.crfpp(unicode(l, 'utf8'))[0])
    return ret
def text_mining(self, x_train, x_test, y_train, y_test):
    x_train_msg = []
    x_test_msg = []
    crf = CRFWordSegment()
    for x_msg in x_train:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_train_msg.append(data_msg)
    for x_msg in x_test:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_test_msg.append(data_msg)
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', RandomForestClassifier())])
    start_time = time.time()
    text_clf = text_clf.fit(x_train_msg, y_train)
    total_time = time.time() - start_time
    self.time_train_text.append(total_time / len(y_train))
    start_time = time.time()
    y_pred = text_clf.predict(x_test_msg)
    total_time = time.time() - start_time
    self.time_predict_text.append(total_time / len(y_pred))
    f1 = f1_score(y_test, y_pred)
    return f1
def load_data():
    print('start...')
    nlp = CRFWordSegment()
    with codecs.open('data/db/filterel4000.json', 'r', 'utf-8') as f:
        lines = f.readlines()
    data_obj = []
    for data in lines:
        json_data = json.loads(data)
        if json_data['cred_value'] == 'maybe' or json_data['tag_with'] == 'NaN':
            continue
        mapping = NewDataMapping()
        message = json_data['message']
        mapping.message = message
        if json_data['cred_value'] == 'no':
            mapping.prediction_result = 0
        else:
            mapping.prediction_result = 1
        social_features = [
            int(json_data['likes']),
            int(json_data['shares']),
            int(json_data['comments']),
            int(json_data['url']),
            int(json_data['hashtag']),
            int(json_data['images']),
            int(json_data['vdo']),
            int(json_data['location']),
            int(json_data['non_location']),
            int(json_data['share_only_friend']),
            int(json_data['is_public']),
            int(json_data['feeling_status']),
            int(json_data['tag_with']),
        ]
        mapping.social_features = social_features
        # dict_list is assumed to be a module-level set of dictionary words.
        message_lst = nlp.crfpp(message)
        number_in_dict = dict_list & set(message_lst)
        out_side_dict = len(message_lst) - len(number_in_dict)
        text_features = [
            len(message),
            message.count('?'),
            message.count('!'),
            len(message_lst),
            len(number_in_dict),
            out_side_dict,
        ]
        mapping.text_features = text_features
        social_and_text_features = []
        social_and_text_features.extend(social_features)
        social_and_text_features.extend(text_features)
        mapping.social_and_text_features = social_and_text_features
        data_obj.append(mapping)
    pickle.dump(data_obj, open('data/newresult/data/data_obj.obj', 'wb'))
    return data_obj
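# Example usage (a sketch): build and cache the feature objects once, then
# reload the pickle on later runs instead of re-running word segmentation.
if __name__ == '__main__':
    data_obj = load_data()
    # Subsequent runs can skip segmentation entirely:
    # with open('data/newresult/data/data_obj.obj', 'rb') as f:
    #     data_obj = pickle.load(f)
    print('loaded {} posts'.format(len(data_obj)))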
def process(self, word_list):
    ret = []
    lst = self.remove_dup_sentense(word_list)
    crf = CRFWordSegment()
    for l in lst:
        try:
            ret.append(crf.crfpp(unicode(l, 'utf8')))
        except Exception:
            pass
    return ret
def process(self, word_list):
    ret = []
    lst = self.remove_dup_sentense(word_list)
    crf = CRFWordSegment()
    for l in lst:
        try:
            ret.append(crf.crfpp(l))
        except Exception:
            pass
    return ret
def load_data():
    print('start...')
    nlp = CRFWordSegment()
    with open('data/db/filterel4000.json') as f:
        lines = f.readlines()
    data_obj = []
    for data in lines:
        json_data = json.loads(data)
        if json_data['cred_value'] == 'maybe' or json_data['tag_with'] == 'NaN':
            continue
        mapping = MappingData()
        message = json_data['message']
        mapping.message = message
        if json_data['cred_value'] == 'no':
            mapping.prediction_result = 0
        else:
            mapping.prediction_result = 1
        feature_data = [
            int(json_data['likes']),
            int(json_data['shares']),
            int(json_data['comments']),
            int(json_data['url']),
            int(json_data['hashtag']),
            int(json_data['images']),
            int(json_data['vdo']),
            int(json_data['location']),
            int(json_data['non_location']),
            int(json_data['share_only_friend']),
            int(json_data['is_public']),
            int(json_data['feeling_status']),
            int(json_data['tag_with']),
        ]
        mapping.feature_list = feature_data
        # Text-based features are appended to a copy of the social features.
        feature_and_word_data = feature_data[:]
        message_lst = nlp.crfpp(message)
        number_in_dict = dict_list & set(message_lst)
        out_side_dict = len(message_lst) - len(number_in_dict)
        feature_and_word_data.extend([
            len(message),
            message.count('?'),
            message.count('!'),
            len(message_lst),
            len(number_in_dict),
            out_side_dict,
        ])
        mapping.feature_and_word_list = feature_and_word_data
        data_obj.append(mapping)
    pickle.dump(data_obj, open('data/data/data4000.data', 'wb'))
    print('end load...')
def topic_text_social(x_train, x_test, y_train, y_test):
    from sklearn.feature_extraction.text import TfidfVectorizer
    x_train_msg = []
    x_test_msg = []
    crf = CRFWordSegment()
    x_corpus = []
    for x_msg in x_train:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_train_msg.append(data_msg)
    x_corpus.extend(x_train_msg)
    for x_msg in x_test:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_test_msg.append(data_msg)
    x_corpus.extend(x_test_msg)
    tf = TfidfVectorizer()
    tf_id = tf.fit_transform(x_corpus)
    x_all = []
    x_all.extend(x_train)
    x_all.extend(x_test)
    tf_id = tf_id.toarray()
    # Concatenate each dense TF-IDF row with the post's social and text features.
    tf_and_feature = []
    for i in range(0, len(tf_id)):
        all_data = []
        all_data.extend(tf_id[i])
        all_data.extend(x_all[i].social_features)
        all_data.extend(x_all[i].text_features)
        tf_and_feature.append(all_data)
    x_tf_and_feature_train = tf_and_feature[0:len(x_train_msg)]
    x_tf_and_feature_test = tf_and_feature[len(x_train_msg):len(tf_id)]
    y_pred = cls_cos_sim(x_tf_and_feature_test, x_tf_and_feature_train, y_train)
    f1 = f1_score(y_test, y_pred)
    return f1
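# In topic_text_social, the dense TF-IDF rows are concatenated with the social
# and text features in a plain Python loop. A sketch of the same concatenation
# step with NumPy (hypothetical helper name; assumes tf_id is the dense TF-IDF
# matrix and x_all the combined list of post objects):
import numpy as np

def concat_tfidf_and_features(tf_id, x_all):
    # One row of hand-crafted features per post, stacked next to the TF-IDF row.
    extra = np.array([x.social_features + x.text_features for x in x_all])
    return np.hstack([tf_id, extra])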
def topic_and_text(x_train, x_test, y_train, y_test):
    from sklearn.feature_extraction.text import TfidfVectorizer
    x_train_msg = []
    x_test_msg = []
    crf = CRFWordSegment()
    x_corpus = []
    for x_msg in x_train:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_train_msg.append(data_msg)
    x_corpus.extend(x_train_msg)
    for x_msg in x_test:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_test_msg.append(data_msg)
    x_corpus.extend(x_test_msg)
    tf = TfidfVectorizer()
    tf_id = tf.fit_transform(x_corpus)
    x_all = []
    x_all.extend(x_train)
    x_all.extend(x_test)
    tf_id = tf_id.toarray()
    tf_and_feature = []
    for i in range(0, len(tf_id)):
        all_data = []
        all_data.extend(tf_id[i])
        all_data.extend(x_all[i].text_features)
        tf_and_feature.append(all_data)
    x_tf_and_feature_train = tf_and_feature[0:len(x_train_msg)]
    x_tf_and_feature_test = tf_and_feature[len(x_train_msg):len(tf_id)]
    clf = RandomForestClassifier()
    clf.fit(x_tf_and_feature_train, y_train)
    y_pred = clf.predict(x_tf_and_feature_test)
    return get_result(y_test, y_pred)
def topic_feature_process(x_train, x_test, y_train, y_test):
    x_train_msg = []
    x_test_msg = []
    crf = CRFWordSegment()
    for x_msg in x_train:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_train_msg.append(data_msg)
    for x_msg in x_test:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_test_msg.append(data_msg)
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', RandomForestClassifier())])
    text_clf = text_clf.fit(x_train_msg, y_train)
    y_pred = text_clf.predict(x_test_msg)
    return get_result(y_test, y_pred)
class MainCompare():
    crf = CRFWordSegment()
    time_train_ml = []
    time_predict_ml = []
    time_train_ml_word = []
    time_predict_ml_word = []
    time_train_topic = []
    time_predict_topic = []
    time_train_text = []
    time_predict_text = []
    repeating_time = 10

    log = logging.getLogger('resize')
    log.setLevel(logging.INFO)
    format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    ch = logging.StreamHandler(sys.stdout)
    ch.setFormatter(format)
    log.addHandler(ch)
    fh = logging.FileHandler("resize.log")
    fh.setFormatter(format)
    log.addHandler(fh)

    def ml_prediction(self, x_train, x_test, y_train, y_test):
        clf = RandomForestClassifier()
        x_train = [np.array(x.feature_list) for x in x_train]
        x_test = [np.array(x.feature_list) for x in x_test]
        y_train = [np.array(x) for x in y_train]
        y_test = [np.array(x) for x in y_test]
        start_time = time.time()
        clf.fit(x_train, y_train)
        total_time = time.time() - start_time
        self.time_train_ml.append(total_time / len(y_train))
        start_time = time.time()
        y_pred = clf.predict(x_test)
        total_time = time.time() - start_time
        self.time_predict_ml.append(total_time)
        f1 = f1_score(y_test, y_pred)
        return f1

    def ml_word_prediction(self, x_train, x_test, y_train, y_test):
        clf = RandomForestClassifier()
        x_train = [np.array(x.feature_and_word_list) for x in x_train]
        x_test = [np.array(x.feature_and_word_list) for x in x_test]
        y_train = [np.array(x) for x in y_train]
        y_test = [np.array(x) for x in y_test]
        start_time = time.time()
        clf.fit(x_train, y_train)
        total_time = time.time() - start_time
        self.time_train_ml_word.append(total_time / len(y_train))
        start_time = time.time()
        y_pred = clf.predict(x_test)
        total_time = time.time() - start_time
        self.time_predict_ml_word.append(total_time / len(y_pred))
        f1 = f1_score(y_test, y_pred)
        return f1

    def to_message_lst(self, msg_obj):
        msg_seg = self.crf.crfpp(msg_obj.message)
        msg_data = ' '.join(msg_seg)
        return msg_data

    def topic_detection(self, x_train, x_test, y_train, y_test):
        tfidf_vectorizer = TfidfVectorizer()
        # Hold out part of the training data to tune the cosine-similarity threshold.
        x_train_inner, x_test_inner, y_train_inner, y_test_inner = train_test_split(
            x_train, y_train, test_size=0.2, random_state=random.randrange(1000))
        x_train_msg_inner = []
        for x_msg in x_train_inner:
            data_lst = self.crf.crfpp(x_msg.message)
            data_msg = ' '.join(data_lst)
            x_train_msg_inner.append(data_msg)
        cosin_lst = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        f1_lst = []
        start_time = time.time()
        for cosin in cosin_lst:
            self.log.info('****** {} *******'.format(cosin))
            y_pred_lst = []
            for x_inner in x_test_inner:
                test_message = self.to_message_lst(x_inner)
                x_train_msg_inner.append(test_message)
                tfidf_matrix = tfidf_vectorizer.fit_transform(x_train_msg_inner)
                cos_lst = np.sort(cosine_similarity(tfidf_matrix[-1:], tfidf_matrix))[0]
                # The largest value is the message compared with itself, so take the second largest.
                sim_max = cos_lst[len(cos_lst) - 2]
                if sim_max > cosin:
                    y_pred_lst.append(1)
                else:
                    y_pred_lst.append(0)
                del x_train_msg_inner[-1]
            f1 = f1_score(y_test_inner, y_pred_lst)
            f1_lst.append(f1)
        f1_lst = np.array(f1_lst)
        f1_max_idx = f1_lst.argmax()
        cosin_max = cosin_lst[f1_max_idx]
        total_time = time.time() - start_time
        self.time_train_topic.append(total_time / len(y_train_inner))

        # Predict on the real test set using the best threshold found above.
        x_test_corpus = []
        per_y_pred = []
        start_time = time.time()
        for x_data in x_test:
            data_seg = self.crf.crfpp(x_data.message)
            data = ' '.join(data_seg)
            x_test_corpus.append(data)
        for x in x_test:
            test_message = self.to_message_lst(x)
            x_test_corpus.append(test_message)
            tfidf_test = tfidf_vectorizer.fit_transform(x_test_corpus)
            cos_lst = np.sort(cosine_similarity(tfidf_test[-1:], tfidf_test))[0]
            sim_max = cos_lst[len(cos_lst) - 2]
            if sim_max > cosin_max:
                per_y_pred.append(1)
            else:
                per_y_pred.append(0)
        total_time = time.time() - start_time
        self.time_predict_topic.append(total_time / len(y_test))
        f1 = f1_score(y_true=y_test, y_pred=per_y_pred)
        return f1

    def text_mining(self, x_train, x_test, y_train, y_test):
        x_train_msg = []
        x_test_msg = []
        crf = CRFWordSegment()
        for x_msg in x_train:
            data_lst = crf.crfpp(x_msg.message)
            data_msg = ' '.join(data_lst)
            x_train_msg.append(data_msg)
        for x_msg in x_test:
            data_lst = crf.crfpp(x_msg.message)
            data_msg = ' '.join(data_lst)
            x_test_msg.append(data_msg)
        text_clf = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('clf', RandomForestClassifier())])
        start_time = time.time()
        text_clf = text_clf.fit(x_train_msg, y_train)
        total_time = time.time() - start_time
        self.time_train_text.append(total_time / len(y_train))
        start_time = time.time()
        y_pred = text_clf.predict(x_test_msg)
        total_time = time.time() - start_time
        self.time_predict_text.append(total_time / len(y_pred))
        f1 = f1_score(y_test, y_pred)
        return f1

    def print_all_result(self, all_result):
        print('********** performance result')
        for ml, ml_word, topic, perf_text in zip(all_result['perf_ml'],
                                                 all_result['perf_ml_word'],
                                                 all_result['perf_topic'],
                                                 all_result['perf_text']):
            self.log.info('{},{},{},{}'.format(ml, ml_word, topic, perf_text))
        print('********** training time')
        for t_ml, t_ml_word, t_topic, t_text in zip(all_result['time_train_ml'],
                                                    all_result['time_train_ml_word'],
                                                    all_result['time_train_topic'],
                                                    all_result['time_train_text']):
            self.log.info('{},{},{},{}'.format(t_ml, t_ml_word, t_topic, t_text))
        for p_ml, p_ml_word, p_topic, p_text in zip(all_result['time_predict_ml'],
                                                    all_result['time_predict_ml_word'],
                                                    all_result['time_predict_topic'],
                                                    all_result['time_predict_text']):
            self.log.info('{},{},{},{}'.format(p_ml, p_ml_word, p_topic, p_text))

    def main_process(self, test_size):
        mapping_lst = pickle.load(open('data/data/data4000.data', 'rb'))
        x = []
        y = []
        for mapping in mapping_lst:
            x.append(mapping)
            y.append(mapping.prediction_result)
        ml_lst = []
        ml_word_lst = []
        topic_lst = []
        text_lst = []
        for i in range(0, self.repeating_time):
            self.log.info('****** start loop {} '.format(i))
            x_train, x_test, y_train, y_test = train_test_split(
                x, y, test_size=test_size, random_state=random.randrange(1000))
            ml_result = self.ml_prediction(x_train, x_test, y_train, y_test)
            ml_word_result = self.ml_word_prediction(x_train, x_test, y_train, y_test)
            topic_result = self.topic_detection(x_train, x_test, y_train, y_test)
            text_result = self.text_mining(x_train, x_test, y_train, y_test)
            ml_lst.append(ml_result)
            ml_word_lst.append(ml_word_result)
            topic_lst.append(topic_result)
            text_lst.append(text_result)
            self.log.info('[ml : {}, text : {}, ml word : {}, topic : {}]'.format(
                ml_result, text_result, ml_word_result, topic_result))
            self.log.info('****** end loop {} '.format(i))
        all_result = {}
        all_result['perf_ml'] = ml_lst
        all_result['perf_ml_word'] = ml_word_lst
        all_result['perf_topic'] = topic_lst
        all_result['perf_text'] = text_lst
        all_result['time_train_ml'] = self.time_train_ml
        all_result['time_predict_ml'] = self.time_predict_ml
        all_result['time_train_ml_word'] = self.time_train_ml_word
        all_result['time_predict_ml_word'] = self.time_predict_ml_word
        all_result['time_train_topic'] = self.time_train_topic
        all_result['time_predict_topic'] = self.time_predict_topic
        all_result['time_train_text'] = self.time_train_text
        all_result['time_predict_text'] = self.time_predict_text
        self.print_all_result(all_result)
        return all_result
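# Example driver (a sketch, assuming the imports used by MainCompare are in
# scope): repeat the four comparisons with a 20% held-out test set.
if __name__ == '__main__':
    compare = MainCompare()
    all_result = compare.main_process(test_size=0.2)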