def topic_feature_process(x_train, x_test, y_train, y_test):
    x_train_msg = []
    x_test_msg = []
    x_corpus = []
    crf = CRFWordSegment()
    for x_msg in x_train:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_train_msg.append(data_msg)
    for x_msg in x_test:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_test_msg.append(data_msg)
    x_corpus.extend(x_train_msg)
    x_corpus.extend(x_test_msg)
    vectorizer = TfidfVectorizer()
    tfidf_corpus = vectorizer.fit_transform(x_corpus).toarray()
    tfidf_train = tfidf_corpus[0:len(x_train_msg)]
    tfidf_test = tfidf_corpus[len(x_train_msg):len(tfidf_corpus)]
    y_pred = cls_cos_sim(tfidf_test, tfidf_train, y_train)
    f1 = f1_score(y_test, y_pred)
    return f1
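# topic_feature_process above relies on cls_cos_sim, which is defined elsewhere
# in the project. A minimal sketch of a cosine-similarity nearest-neighbour
# classifier with that interface (an assumption about the real helper, not its
# actual implementation):
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def cls_cos_sim_sketch(tfidf_test, tfidf_train, y_train):
    # For each test row, predict the label of its most similar training row.
    sim = cosine_similarity(tfidf_test, tfidf_train)  # shape: (n_test, n_train)
    nearest = np.argmax(sim, axis=1)
    return np.asarray(y_train)[nearest].tolist()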
def process(self, word_list):
    ret = []
    lst = self.remove_dup_sentense(word_list)
    crf = CRFWordSegment()
    for l in lst:
        ret.append(crf.crfpp(unicode(l, 'utf8'))[0])
    return ret
def text_mining(self, x_train, x_test, y_train, y_test):
    x_train_msg = []
    x_test_msg = []
    crf = CRFWordSegment()
    for x_msg in x_train:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_train_msg.append(data_msg)
    for x_msg in x_test:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_test_msg.append(data_msg)
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', RandomForestClassifier())])
    start_time = time.time()
    text_clf = text_clf.fit(x_train_msg, y_train)
    total_time = time.time() - start_time
    self.time_train_text.append(total_time / len(y_train))
    start_time = time.time()
    y_pred = text_clf.predict(x_test_msg)
    total_time = time.time() - start_time
    self.time_predict_text.append(total_time / len(y_pred))
    f1 = f1_score(y_test, y_pred)
    return f1
def load_data():
    print('start...')
    nlp = CRFWordSegment()
    with codecs.open('data/db/filterel4000.json', 'r', 'utf-8') as f:
        lines = f.readlines()
    data_obj = []
    for data in lines:
        json_data = json.loads(data)
        if json_data['cred_value'] == 'maybe' or json_data['tag_with'] == 'NaN':
            continue
        mapping = NewDataMapping()
        message = json_data['message']
        mapping.message = message
        if json_data['cred_value'] == 'no':
            mapping.prediction_result = 0
        else:
            mapping.prediction_result = 1
        social_features = [
            int(json_data['likes']),
            int(json_data['shares']),
            int(json_data['comments']),
            int(json_data['url']),
            int(json_data['hashtag']),
            int(json_data['images']),
            int(json_data['vdo']),
            int(json_data['location']),
            int(json_data['non_location']),
            int(json_data['share_only_friend']),
            int(json_data['is_public']),
            int(json_data['feeling_status']),
            int(json_data['tag_with']),
        ]
        mapping.social_features = social_features
        # dict_list is assumed to be a module-level set of dictionary words.
        message_lst = nlp.crfpp(message)
        number_in_dict = dict_list & set(message_lst)
        out_side_dict = len(message_lst) - len(number_in_dict)
        text_features = [
            len(message),
            message.count('?'),
            message.count('!'),
            len(message_lst),
            len(number_in_dict),
            out_side_dict,
        ]
        mapping.text_features = text_features
        social_and_text_features = []
        social_and_text_features.extend(social_features)
        social_and_text_features.extend(text_features)
        mapping.social_and_text_features = social_and_text_features
        data_obj.append(mapping)
    pickle.dump(data_obj, open('data/newresult/data/data_obj.obj', 'wb'))
    return data_obj
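# Example usage (a sketch): build and cache the feature objects once, then
# reload the pickle on later runs instead of re-running word segmentation.
if __name__ == '__main__':
    data_obj = load_data()
    # Subsequent runs can skip segmentation entirely:
    # with open('data/newresult/data/data_obj.obj', 'rb') as f:
    #     data_obj = pickle.load(f)
    print('loaded {} posts'.format(len(data_obj)))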
def process(self, word_list):
    ret = []
    lst = self.remove_dup_sentense(word_list)
    crf = CRFWordSegment()
    for l in lst:
        try:
            ret.append(crf.crfpp(unicode(l, 'utf8')))
        except Exception:
            pass
    return ret
def process(self, word_list):
    ret = []
    lst = self.remove_dup_sentense(word_list)
    crf = CRFWordSegment()
    for l in lst:
        try:
            ret.append(crf.crfpp(l))
        except Exception:
            pass
    return ret
def load_data():
    print('start...')
    nlp = CRFWordSegment()
    with open('data/db/filterel4000.json') as f:
        lines = f.readlines()
    data_obj = []
    for data in lines:
        json_data = json.loads(data)
        if json_data['cred_value'] == 'maybe' or json_data['tag_with'] == 'NaN':
            continue
        mapping = MappingData()
        message = json_data['message']
        mapping.message = message
        if json_data['cred_value'] == 'no':
            mapping.prediction_result = 0
        else:
            mapping.prediction_result = 1
        feature_data = [
            int(json_data['likes']),
            int(json_data['shares']),
            int(json_data['comments']),
            int(json_data['url']),
            int(json_data['hashtag']),
            int(json_data['images']),
            int(json_data['vdo']),
            int(json_data['location']),
            int(json_data['non_location']),
            int(json_data['share_only_friend']),
            int(json_data['is_public']),
            int(json_data['feeling_status']),
            int(json_data['tag_with']),
        ]
        mapping.feature_list = feature_data
        # Text-based features are appended to a copy of the social features.
        feature_and_word_data = feature_data[:]
        message_lst = nlp.crfpp(message)
        number_in_dict = dict_list & set(message_lst)
        out_side_dict = len(message_lst) - len(number_in_dict)
        feature_and_word_data.extend([
            len(message),
            message.count('?'),
            message.count('!'),
            len(message_lst),
            len(number_in_dict),
            out_side_dict,
        ])
        mapping.feature_and_word_list = feature_and_word_data
        data_obj.append(mapping)
    pickle.dump(data_obj, open('data/data/data4000.data', 'wb'))
    print('end load...')
def topic_text_social(x_train, x_test, y_train, y_test):
    from sklearn.feature_extraction.text import TfidfVectorizer
    x_train_msg = []
    x_test_msg = []
    crf = CRFWordSegment()
    x_corpus = []
    for x_msg in x_train:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_train_msg.append(data_msg)
    x_corpus.extend(x_train_msg)
    for x_msg in x_test:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_test_msg.append(data_msg)
    x_corpus.extend(x_test_msg)
    tf = TfidfVectorizer()
    tf_id = tf.fit_transform(x_corpus)
    x_all = []
    x_all.extend(x_train)
    x_all.extend(x_test)
    tf_id = tf_id.toarray()
    # Concatenate each dense TF-IDF row with the post's social and text features.
    tf_and_feature = []
    for i in range(0, len(tf_id)):
        all_data = []
        all_data.extend(tf_id[i])
        all_data.extend(x_all[i].social_features)
        all_data.extend(x_all[i].text_features)
        tf_and_feature.append(all_data)
    x_tf_and_feature_train = tf_and_feature[0:len(x_train_msg)]
    x_tf_and_feature_test = tf_and_feature[len(x_train_msg):len(tf_id)]
    y_pred = cls_cos_sim(x_tf_and_feature_test, x_tf_and_feature_train, y_train)
    f1 = f1_score(y_test, y_pred)
    return f1
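# In topic_text_social, the dense TF-IDF rows are concatenated with the social
# and text features in a plain Python loop. A sketch of the same concatenation
# step with NumPy (hypothetical helper name; assumes tf_id is the dense TF-IDF
# matrix and x_all the combined list of post objects):
import numpy as np

def concat_tfidf_and_features(tf_id, x_all):
    # One row of hand-crafted features per post, stacked next to the TF-IDF row.
    extra = np.array([x.social_features + x.text_features for x in x_all])
    return np.hstack([tf_id, extra])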
def topic_and_text(x_train, x_test, y_train, y_test):
    from sklearn.feature_extraction.text import TfidfVectorizer
    x_train_msg = []
    x_test_msg = []
    crf = CRFWordSegment()
    x_corpus = []
    for x_msg in x_train:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_train_msg.append(data_msg)
    x_corpus.extend(x_train_msg)
    for x_msg in x_test:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_test_msg.append(data_msg)
    x_corpus.extend(x_test_msg)
    tf = TfidfVectorizer()
    tf_id = tf.fit_transform(x_corpus)
    x_all = []
    x_all.extend(x_train)
    x_all.extend(x_test)
    tf_id = tf_id.toarray()
    tf_and_feature = []
    for i in range(0, len(tf_id)):
        all_data = []
        all_data.extend(tf_id[i])
        all_data.extend(x_all[i].text_features)
        tf_and_feature.append(all_data)
    x_tf_and_feature_train = tf_and_feature[0:len(x_train_msg)]
    x_tf_and_feature_test = tf_and_feature[len(x_train_msg):len(tf_id)]
    clf = RandomForestClassifier()
    clf.fit(x_tf_and_feature_train, y_train)
    y_pred = clf.predict(x_tf_and_feature_test)
    return get_result(y_test, y_pred)
def topic_feature_process(x_train, x_test, y_train, y_test):
    x_train_msg = []
    x_test_msg = []
    crf = CRFWordSegment()
    for x_msg in x_train:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_train_msg.append(data_msg)
    for x_msg in x_test:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_test_msg.append(data_msg)
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', RandomForestClassifier())])
    text_clf = text_clf.fit(x_train_msg, y_train)
    y_pred = text_clf.predict(x_test_msg)
    return get_result(y_test, y_pred)
class MainCompare():
    crf = CRFWordSegment()
    time_train_ml = []
    time_predict_ml = []
    time_train_ml_word = []
    time_predict_ml_word = []
    time_train_topic = []
    time_predict_topic = []
    time_train_text = []
    time_predict_text = []
    repeating_time = 10

    log = logging.getLogger('resize')
    log.setLevel(logging.INFO)
    format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    ch = logging.StreamHandler(sys.stdout)
    ch.setFormatter(format)
    log.addHandler(ch)
    fh = logging.FileHandler("resize.log")
    fh.setFormatter(format)
    log.addHandler(fh)

    def ml_prediction(self, x_train, x_test, y_train, y_test):
        clf = RandomForestClassifier()
        x_train = [np.array(x.feature_list) for x in x_train]
        x_test = [np.array(x.feature_list) for x in x_test]
        y_train = [np.array(x) for x in y_train]
        y_test = [np.array(x) for x in y_test]
        start_time = time.time()
        clf.fit(x_train, y_train)
        total_time = time.time() - start_time
        self.time_train_ml.append(total_time / len(y_train))
        start_time = time.time()
        y_pred = clf.predict(x_test)
        total_time = time.time() - start_time
        self.time_predict_ml.append(total_time)
        f1 = f1_score(y_test, y_pred)
        return f1

    def ml_word_prediction(self, x_train, x_test, y_train, y_test):
        clf = RandomForestClassifier()
        x_train = [np.array(x.feature_and_word_list) for x in x_train]
        x_test = [np.array(x.feature_and_word_list) for x in x_test]
        y_train = [np.array(x) for x in y_train]
        y_test = [np.array(x) for x in y_test]
        start_time = time.time()
        clf.fit(x_train, y_train)
        total_time = time.time() - start_time
        self.time_train_ml_word.append(total_time / len(y_train))
        start_time = time.time()
        y_pred = clf.predict(x_test)
        total_time = time.time() - start_time
        self.time_predict_ml_word.append(total_time / len(y_pred))
        f1 = f1_score(y_test, y_pred)
        return f1

    def to_message_lst(self, msg_obj):
        msg_seg = self.crf.crfpp(msg_obj.message)
        msg_data = ' '.join(msg_seg)
        return msg_data

    def topic_detection(self, x_train, x_test, y_train, y_test):
        tfidf_vectorizer = TfidfVectorizer()
        # Hold out part of the training data to tune the cosine-similarity threshold.
        x_train_inner, x_test_inner, y_train_inner, y_test_inner = train_test_split(
            x_train, y_train, test_size=0.2, random_state=random.randrange(1000))
        x_train_msg_inner = []
        for x_msg in x_train_inner:
            data_lst = self.crf.crfpp(x_msg.message)
            data_msg = ' '.join(data_lst)
            x_train_msg_inner.append(data_msg)
        cosin_lst = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        f1_lst = []
        start_time = time.time()
        for cosin in cosin_lst:
            self.log.info('****** {} *******'.format(cosin))
            y_pred_lst = []
            for x_inner in x_test_inner:
                test_message = self.to_message_lst(x_inner)
                x_train_msg_inner.append(test_message)
                tfidf_matrix = tfidf_vectorizer.fit_transform(x_train_msg_inner)
                cos_lst = np.sort(cosine_similarity(tfidf_matrix[-1:], tfidf_matrix))[0]
                # The largest value is the message compared with itself, so take the second largest.
                sim_max = cos_lst[len(cos_lst) - 2]
                if sim_max > cosin:
                    y_pred_lst.append(1)
                else:
                    y_pred_lst.append(0)
                del x_train_msg_inner[-1]
            f1 = f1_score(y_test_inner, y_pred_lst)
            f1_lst.append(f1)
        f1_lst = np.array(f1_lst)
        f1_max_idx = f1_lst.argmax()
        cosin_max = cosin_lst[f1_max_idx]
        total_time = time.time() - start_time
        self.time_train_topic.append(total_time / len(y_train_inner))

        # Predict on the real test set using the best threshold found above.
        x_test_corpus = []
        per_y_pred = []
        start_time = time.time()
        for x_data in x_test:
            data_seg = self.crf.crfpp(x_data.message)
            data = ' '.join(data_seg)
            x_test_corpus.append(data)
        for x in x_test:
            test_message = self.to_message_lst(x)
            x_test_corpus.append(test_message)
            tfidf_test = tfidf_vectorizer.fit_transform(x_test_corpus)
            cos_lst = np.sort(cosine_similarity(tfidf_test[-1:], tfidf_test))[0]
            sim_max = cos_lst[len(cos_lst) - 2]
            if sim_max > cosin_max:
                per_y_pred.append(1)
            else:
                per_y_pred.append(0)
        total_time = time.time() - start_time
        self.time_predict_topic.append(total_time / len(y_test))
        f1 = f1_score(y_true=y_test, y_pred=per_y_pred)
        return f1

    def text_mining(self, x_train, x_test, y_train, y_test):
        x_train_msg = []
        x_test_msg = []
        crf = CRFWordSegment()
        for x_msg in x_train:
            data_lst = crf.crfpp(x_msg.message)
            data_msg = ' '.join(data_lst)
            x_train_msg.append(data_msg)
        for x_msg in x_test:
            data_lst = crf.crfpp(x_msg.message)
            data_msg = ' '.join(data_lst)
            x_test_msg.append(data_msg)
        text_clf = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('clf', RandomForestClassifier())])
        start_time = time.time()
        text_clf = text_clf.fit(x_train_msg, y_train)
        total_time = time.time() - start_time
        self.time_train_text.append(total_time / len(y_train))
        start_time = time.time()
        y_pred = text_clf.predict(x_test_msg)
        total_time = time.time() - start_time
        self.time_predict_text.append(total_time / len(y_pred))
        f1 = f1_score(y_test, y_pred)
        return f1

    def print_all_result(self, all_result):
        print('********** performance result')
        for ml, ml_word, topic, perf_text in zip(all_result['perf_ml'],
                                                 all_result['perf_ml_word'],
                                                 all_result['perf_topic'],
                                                 all_result['perf_text']):
            self.log.info('{},{},{},{}'.format(ml, ml_word, topic, perf_text))
        print('********** training time')
        for t_ml, t_ml_word, t_topic, t_text in zip(all_result['time_train_ml'],
                                                    all_result['time_train_ml_word'],
                                                    all_result['time_train_topic'],
                                                    all_result['time_train_text']):
            self.log.info('{},{},{},{}'.format(t_ml, t_ml_word, t_topic, t_text))
        for p_ml, p_ml_word, p_topic, p_text in zip(all_result['time_predict_ml'],
                                                    all_result['time_predict_ml_word'],
                                                    all_result['time_predict_topic'],
                                                    all_result['time_predict_text']):
            self.log.info('{},{},{},{}'.format(p_ml, p_ml_word, p_topic, p_text))

    def main_process(self, test_size):
        mapping_lst = pickle.load(open('data/data/data4000.data', 'rb'))
        x = []
        y = []
        for mapping in mapping_lst:
            x.append(mapping)
            y.append(mapping.prediction_result)
        ml_lst = []
        ml_word_lst = []
        topic_lst = []
        text_lst = []
        for i in range(0, self.repeating_time):
            self.log.info('****** start loop {} '.format(i))
            x_train, x_test, y_train, y_test = train_test_split(
                x, y, test_size=test_size, random_state=random.randrange(1000))
            ml_result = self.ml_prediction(x_train, x_test, y_train, y_test)
            ml_word_result = self.ml_word_prediction(x_train, x_test, y_train, y_test)
            topic_result = self.topic_detection(x_train, x_test, y_train, y_test)
            text_result = self.text_mining(x_train, x_test, y_train, y_test)
            ml_lst.append(ml_result)
            ml_word_lst.append(ml_word_result)
            topic_lst.append(topic_result)
            text_lst.append(text_result)
            self.log.info('[ml : {}, text : {}, ml word : {}, topic : {}]'.format(
                ml_result, text_result, ml_word_result, topic_result))
            self.log.info('****** end loop {} '.format(i))
        all_result = {}
        all_result['perf_ml'] = ml_lst
        all_result['perf_ml_word'] = ml_word_lst
        all_result['perf_topic'] = topic_lst
        all_result['perf_text'] = text_lst
        all_result['time_train_ml'] = self.time_train_ml
        all_result['time_predict_ml'] = self.time_predict_ml
        all_result['time_train_ml_word'] = self.time_train_ml_word
        all_result['time_predict_ml_word'] = self.time_predict_ml_word
        all_result['time_train_topic'] = self.time_train_topic
        all_result['time_predict_topic'] = self.time_predict_topic
        all_result['time_train_text'] = self.time_train_text
        all_result['time_predict_text'] = self.time_predict_text
        self.print_all_result(all_result)
        return all_result
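# Example driver (a sketch, assuming the imports used by MainCompare are in
# scope): repeat the four comparisons with a 20% held-out test set.
if __name__ == '__main__':
    compare = MainCompare()
    all_result = compare.main_process(test_size=0.2)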