def text_mining(self, x_train, x_test, y_train, y_test):
    """Segment messages, train a CountVectorizer/TF-IDF/RandomForest pipeline,
    record per-sample train and predict times, and return the F1 score."""
    crf = CRFWordSegment()

    def _segmented(samples):
        # One space-joined, word-segmented document per sample message.
        return [' '.join(crf.crfpp(sample.message)) for sample in samples]

    train_docs = _segmented(x_train)
    test_docs = _segmented(x_test)

    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier()),
    ])

    t0 = time.time()
    pipeline = pipeline.fit(train_docs, y_train)
    # Average wall-clock training time per training sample.
    self.time_train_text.append((time.time() - t0) / len(y_train))

    t0 = time.time()
    y_pred = pipeline.predict(test_docs)
    # Average wall-clock prediction time per predicted sample.
    self.time_predict_text.append((time.time() - t0) / len(y_pred))

    return f1_score(y_test, y_pred)
def topic_feature_process(x_train, x_test, y_train, y_test):
    """Build TF-IDF vectors over the segmented train+test corpus, classify
    the test split by cosine similarity to the train split, and return F1."""
    crf = CRFWordSegment()
    train_docs = [' '.join(crf.crfpp(item.message)) for item in x_train]
    test_docs = [' '.join(crf.crfpp(item.message)) for item in x_test]

    # Fit the vectorizer on the combined corpus so both splits share a vocabulary.
    corpus = train_docs + test_docs
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(corpus).toarray()

    split = len(train_docs)
    tfidf_train = tfidf[:split]
    tfidf_test = tfidf[split:]

    y_pred = cls_cos_sim(tfidf_test, tfidf_train, y_train)
    return f1_score(y_test, y_pred)
def process(self, word_list):
    """Deduplicate the given sentences, word-segment each one, and return
    the first token of every segmentation."""
    crf = CRFWordSegment()
    deduped = self.remove_dup_sentense(word_list)
    # NOTE(review): relies on the Python 2 ``unicode`` builtin — confirm runtime.
    return [crf.crfpp(unicode(sentence, 'utf8'))[0] for sentence in deduped]
def load_data():
    """Load labelled posts from data/db/filterel4000.json and build
    NewDataMapping objects carrying social, text, and combined feature vectors.

    Rows whose cred_value is 'maybe' or whose tag_with is 'NaN' are skipped.
    The resulting list is pickled to data/newresult/data/data_obj.obj and
    returned.
    """
    print('start...')
    nlp = CRFWordSegment()

    # Social/engagement fields, in the fixed order downstream models expect.
    social_keys = (
        'likes', 'shares', 'comments', 'url', 'hashtag', 'images', 'vdo',
        'location', 'non_location', 'share_only_friend', 'is_public',
        'feeling_status', 'tag_with',
    )

    with codecs.open('data/db/filterel4000.json', 'r', 'utf-8') as f:
        lines = f.readlines()

    data_obj = []
    for data in lines:
        json_data = json.loads(data)
        # Drop unlabelled / ambiguous rows.
        if json_data['cred_value'] == 'maybe' or json_data['tag_with'] == 'NaN':
            continue

        mapping = NewDataMapping()
        message = json_data['message']
        mapping.message = message
        # Binary credibility label: 'no' -> 0, anything else -> 1.
        mapping.prediction_result = 0 if json_data['cred_value'] == 'no' else 1

        social_features = [int(json_data[key]) for key in social_keys]
        mapping.social_features = social_features

        # Text features: raw length, punctuation counts, and dictionary
        # coverage of the word-segmented message.
        message_lst = nlp.crfpp(message)
        number_in_dict = dict_list & set(message_lst)
        text_features = [
            len(message),
            message.count('?'),
            message.count('!'),
            len(message_lst),
            len(number_in_dict),
            len(message_lst) - len(number_in_dict),
        ]
        mapping.text_features = text_features

        mapping.social_and_text_features = social_features + text_features
        data_obj.append(mapping)

    # 'with' closes the pickle file handle (the original leaked it).
    with open('data/newresult/data/data_obj.obj', 'wb') as out:
        pickle.dump(data_obj, out)
    return data_obj
def topic_text_social(x_train, x_test, y_train, y_test):
    """Concatenate TF-IDF topic vectors with each sample's social and text
    features, classify the test split by cosine similarity, and return F1."""
    from sklearn.feature_extraction.text import TfidfVectorizer

    crf = CRFWordSegment()
    train_docs = [' '.join(crf.crfpp(item.message)) for item in x_train]
    test_docs = [' '.join(crf.crfpp(item.message)) for item in x_test]

    # Shared vocabulary: vectorize the combined corpus in one pass.
    tfidf = TfidfVectorizer().fit_transform(train_docs + test_docs).toarray()

    samples = list(x_train) + list(x_test)
    combined = []
    for row, sample in zip(tfidf, samples):
        features = list(row)
        features.extend(sample.social_features)
        features.extend(sample.text_features)
        combined.append(features)

    split = len(train_docs)
    y_pred = cls_cos_sim(combined[split:], combined[:split], y_train)
    return f1_score(y_test, y_pred)
def process(self, word_list):
    """Deduplicate the given sentences and word-segment each one; sentences
    that fail decoding or segmentation are skipped (best-effort)."""
    crf = CRFWordSegment()
    segmented = []
    for sentence in self.remove_dup_sentense(word_list):
        # NOTE(review): relies on the Python 2 ``unicode`` builtin — confirm runtime.
        try:
            segmented.append(crf.crfpp(unicode(sentence, 'utf8')))
        except Exception:
            # Deliberate best-effort: a bad sentence must not abort the batch.
            continue
    return segmented
def process(self, word_list):
    """Deduplicate the given sentences and word-segment each one; sentences
    whose segmentation raises are skipped (best-effort)."""
    crf = CRFWordSegment()
    segmented = []
    for sentence in self.remove_dup_sentense(word_list):
        try:
            segmented.append(crf.crfpp(sentence))
        except Exception:
            # Deliberate best-effort: a bad sentence must not abort the batch.
            continue
    return segmented
def topic_and_text(x_train, x_test, y_train, y_test):
    """Concatenate TF-IDF topic vectors with each sample's text features,
    classify with a RandomForest, and return get_result(y_test, y_pred)."""
    from sklearn.feature_extraction.text import TfidfVectorizer

    crf = CRFWordSegment()
    train_docs = [' '.join(crf.crfpp(item.message)) for item in x_train]
    test_docs = [' '.join(crf.crfpp(item.message)) for item in x_test]

    # Shared vocabulary: vectorize the combined corpus in one pass.
    tfidf = TfidfVectorizer().fit_transform(train_docs + test_docs).toarray()

    samples = list(x_train) + list(x_test)
    combined = []
    for row, sample in zip(tfidf, samples):
        features = list(row)
        features.extend(sample.text_features)
        combined.append(features)

    split = len(train_docs)
    clf = RandomForestClassifier()
    clf.fit(combined[:split], y_train)
    y_pred = clf.predict(combined[split:])
    return get_result(y_test, y_pred)
def topic_feature_process(x_train, x_test, y_train, y_test):
    """Train a CountVectorizer/TF-IDF/RandomForest pipeline on segmented
    messages and return get_result(y_test, y_pred)."""
    crf = CRFWordSegment()
    train_docs = [' '.join(crf.crfpp(item.message)) for item in x_train]
    test_docs = [' '.join(crf.crfpp(item.message)) for item in x_test]

    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier()),
    ])
    model = pipeline.fit(train_docs, y_train)
    y_pred = model.predict(test_docs)
    return get_result(y_test, y_pred)
def load_data():
    """Load labelled posts from data/db/filterel4000.json into MappingData
    objects carrying a social-feature vector and a combined feature+word vector.

    Rows whose cred_value is 'maybe' or whose tag_with is 'NaN' are skipped.
    The list is pickled to data/data/data4000.data and returned (the original
    returned None; returning the data is backward-compatible and matches the
    sibling load_data that returns its list).
    """
    print('start...')
    nlp = CRFWordSegment()

    # Social/engagement fields, in the fixed order downstream models expect.
    social_keys = (
        'likes', 'shares', 'comments', 'url', 'hashtag', 'images', 'vdo',
        'location', 'non_location', 'share_only_friend', 'is_public',
        'feeling_status', 'tag_with',
    )

    with open('data/db/filterel4000.json') as f:
        lines = f.readlines()

    data_obj = []
    for data in lines:
        json_data = json.loads(data)
        # Drop unlabelled / ambiguous rows.
        if json_data['cred_value'] == 'maybe' or json_data['tag_with'] == 'NaN':
            continue

        mapping = MappingData()
        message = json_data['message']
        mapping.message = message
        # Binary credibility label: 'no' -> 0, anything else -> 1.
        mapping.prediction_result = 0 if json_data['cred_value'] == 'no' else 1

        feature_data = [int(json_data[key]) for key in social_keys]
        mapping.feature_list = feature_data

        # Word-level features appended after the social features: raw length,
        # punctuation counts, and dictionary coverage of the segmented message.
        message_lst = nlp.crfpp(message)
        number_in_dict = dict_list & set(message_lst)
        feature_and_word_data = feature_data + [
            len(message),
            message.count('?'),
            message.count('!'),
            len(message_lst),
            len(number_in_dict),
            len(message_lst) - len(number_in_dict),
        ]
        mapping.feature_and_word_list = feature_and_word_data
        data_obj.append(mapping)

    # 'with' closes the pickle file handle (the original leaked it).
    with open('data/data/data4000.data', 'wb') as out:
        pickle.dump(data_obj, out)
    print('end load...')
    return data_obj