def load_dict(dic_path):
    """Read the word-vector dictionary stored as JSON at ``dic_path``.

    Args:
        dic_path: Path of a JSON file; per the original documentation it
            holds pre-trained word2vec/char2vec vectors.

    Returns:
        The object decoded from the JSON file.
    """
    log.info("Start loading dict")
    with open(dic_path, "r") as dict_file:
        # Decode from the full text; same result as json.load on the handle.
        loaded_dic = json.loads(dict_file.read())
    log.info("End loading dict")
    return loaded_dic
def read_origin_data_file(data_file_path):
    """Read a text file into a list with one entry per line.

    Args:
        data_file_path: Path of the text file to read.

    Returns:
        A list of strings, one per line of the file, with every ``"\\n"``
        character removed. Blank lines are kept as empty strings.
    """
    log.info("Start reading the origin data file")
    with open(data_file_path, "r") as data_file:
        # Iterating the handle yields exactly the lines readline() would.
        return [raw_line.replace("\n", "") for raw_line in data_file]
def calculate_tf_idf_matrix(file_origin_list):
    """Compute the tf-idf matrix of a list of documents.

    Args:
        file_origin_list: List of document strings; each one is tokenized
            by ``CountVectorizer`` with its default settings.

    Returns:
        A tuple ``(word_list, tfidf_sparse_matrix)``. ``word_list`` is the
        list of vocabulary terms reported by the vectorizer and
        ``tfidf_sparse_matrix`` is the tf-idf result wrapped in a
        ``csr_matrix`` (one row per input document).
    """
    log.info("Start calculating the tf idf matrix")
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    # The inner fit_transform turns the texts into a term-count matrix;
    # the outer fit_transform converts those counts into tf-idf weights.
    tfidf = transformer.fit_transform(
        vectorizer.fit_transform(file_origin_list))
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back to the old name on releases that
    # predate the new one. list() keeps the return type a plain list.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    word_list = list(get_names())
    # convert tfidf to csr_matrix
    tfidf_sparse_matrix = csr_matrix(tfidf)
    return word_list, tfidf_sparse_matrix
def main():
    """Train and evaluate the baseline linear SVM for several C values.

    Builds the model input data from hard-coded file locations, then for
    each C in (1, 10, 50, 100) fits a model, saves it under a C-specific
    path, and logs micro-averaged precision (reported as "accuracy") and
    weighted F1 for both the training data and the cross-validation data.
    """
    # Hard-coded locations of the input data and the embedding dictionary.
    train_word_file = "/home/chenyu/daguan/data/train_word"
    test_word_file = "/home/chenyu/daguan/data/test_word"
    train_label_file = "/home/chenyu/daguan/data/train_label"
    dict_file = "/home/chenyu/daguan/output/word_dic_64.json"

    basic_model = BaselineModel(64)
    model_inputs = generate_model_input_data(
        train_word_file, train_label_file, test_word_file, dict_file,
        basic_model.embedding_size)
    # test_vector is produced by the pipeline but not used in this function.
    (train_vector, train_label, train_cv_vector, train_cv_label,
     test_vector) = model_inputs

    for c in (1, 10, 50, 100):
        model_save_path = "/home/chenyu/daguan/model/basic_svm_" + str(c)
        basic_model.fit(c, model_save_path, train_vector, train_label)

        # Predict on both sets before any metric is computed or logged.
        train_predictions = basic_model.predict(model_save_path, train_vector)
        cv_predictions = basic_model.predict(model_save_path, train_cv_vector)

        train_precision = metrics.precision_score(
            train_label, train_predictions, average='micro')
        train_f1 = metrics.f1_score(
            train_label, train_predictions, average='weighted')
        log.info("The following is the result for c value " + str(c))
        log.info("The accuracy for train data is " + str(train_precision))
        log.info("The f1 score for train data is " + str(train_f1))

        cv_precision = metrics.precision_score(
            train_cv_label, cv_predictions, average='micro')
        cv_f1 = metrics.f1_score(
            train_cv_label, cv_predictions, average='weighted')
        log.info("The accuracy for cv data is " + str(cv_precision))
        log.info("The f1 score for cv data is " + str(cv_f1))
def create_file_vector(train_file_path, test_file_path, dic_path, embedding_size):
    """Build one vector representation per document of the train/test files.

    Steps visible in this function:
        1. Read the train and test files and concatenate their lines.
        2. Compute the tf-idf matrix over all documents together.
        3. Load the word-vector dictionary and build a matrix with one row
           per vocabulary word (a zero vector when the word is missing).
        4. For every document, pass the word-vector matrix and that
           document's dense tf-idf row to ``calculate_file_vector``.

    NOTE(review): the original documentation describes the combination
    step as "average vector of the Top 128 tf-idf words"; that logic
    lives in ``calculate_file_vector`` and is not visible here.

    Args:
        train_file_path: Path of the training text file, one document per line.
        test_file_path: Path of the test text file, one document per line.
        dic_path: Path of the JSON word-vector dictionary.
        embedding_size: Length of the zero vector used for vocabulary words
            that are absent from the dictionary; presumably equal to the
            dimension of the stored vectors.

    Returns:
        A tuple ``(train_vectors, test_vectors)`` of lists, split at the
        number of training documents.
    """
    # Read train and test documents; tf-idf is fitted on both together.
    train_origin_list = read_origin_data_file(train_file_path)
    test_origin_list = read_origin_data_file(test_file_path)
    file_origin_list = train_origin_list + test_origin_list
    word_list, tf_idf_sparse_matrix = calculate_tf_idf_matrix(file_origin_list)
    word_dic = load_dict(dic_path)

    # Create word vector array. dict.get replaces dict.has_key, which was
    # removed in Python 3, and the padding length now follows
    # embedding_size instead of a hard-coded 64.
    zero_vector = [0.0] * embedding_size
    word_vector = np.array(
        [word_dic.get(word, zero_vector) for word in word_list])

    # Loop all file to create file vector representation
    num_files = len(file_origin_list)
    log.info("The number of file is %d" % (num_files))
    log.info("The number of word list is %d" % (len(word_list)))
    file_vector_list = []
    for i in range(num_files):
        if i % 1000 == 0:
            log.info("Now has processed %d file" % (i))
        # Densify only the current row to keep memory bounded.
        normal_array = tf_idf_sparse_matrix[i].toarray()
        file_vector_list.append(
            calculate_file_vector(word_vector, normal_array))

    num_train = len(train_origin_list)
    return file_vector_list[:num_train], file_vector_list[num_train:]
def predict(self, model_save_path, file_vector_list):
    """Load a saved classifier and run it on the given vectors.

    Args:
        model_save_path: Path the classifier is loaded from via joblib.
        file_vector_list: Input passed unchanged to the classifier's
            ``predict`` method.

    Returns:
        Whatever the loaded classifier's ``predict`` returns.
    """
    log.info("Start predicting the model")
    # The model is reloaded from disk on every call.
    loaded_model = joblib.load(model_save_path)
    return loaded_model.predict(file_vector_list)
def fit(self, value_c, model_save_path, train_vector, train_label):
    """Train a linear SVM and persist it to disk.

    Args:
        value_c: Value passed as the ``C`` argument of ``svm.LinearSVC``.
        model_save_path: Path the fitted classifier is dumped to.
        train_vector: Training inputs passed to the classifier's ``fit``.
        train_label: Training labels passed to the classifier's ``fit``.
    """
    log.info("Start fitting the model")
    # random_state is fixed at 0 for every run.
    linear_svc = svm.LinearSVC(C=value_c, random_state=0)
    linear_svc.fit(train_vector, train_label)
    # Stored with joblib compression level 3.
    joblib.dump(linear_svc, model_save_path, compress=3)
    log.info("End fitting the modle")