def predict_on_file(self, test_path=None, test_result_path=None,
                    chosen_word_dict=None):
    # test_data: label_name \t title \t text
    fw = open(test_result_path, 'w', encoding='utf-8')
    line_cnt = 0
    with open(test_path, 'r', encoding='utf-8') as fr:
        label_dict = get_label_dict()  # {label_name: label_index}
        for line in fr:
            buf = line[:-1].split('\t')
            if len(buf) != 3:
                continue
            y_true_label_name = buf[0].strip()
            y_true_label_index = str(label_dict[y_true_label_name])
            title = buf[1]
            text = buf[2]
            word_dict = {}  # {word: True}, de-duplicated document words
            for word in (title + ' ' + text).split(' '):
                word = word.strip()
                if word not in chosen_word_dict:  # feature selection
                    continue
                if word in word_dict:
                    continue
                word_dict[word] = True
            probs = {}  # {label_name: log-prob}
            V = len(chosen_word_dict)
            for label_name, label_prob in self.label_prob.items():
                prob = np.log(label_prob)
                N_k = self.label_count[label_name]
                for word in word_dict.keys():
                    if word not in self.label_word_prob[label_name]:
                        # Laplace-smoothed fallback for words unseen
                        # under this label.
                        word_pos_prob = 1. / (N_k + V)
                    else:
                        word_pos_prob = self.label_word_prob[label_name][word]
                    prob += np.log(word_pos_prob)
                probs[label_name] = prob
            # === Sort by prob DESC; take the top-1 label name.
            y_pred_label_name = sorted(probs.items(),
                                       key=lambda x: -x[1])[0][0]
            y_pred_label_index = str(label_dict[y_pred_label_name])
            fw.write(y_true_label_index + '\t' + y_pred_label_index + '\n')
            line_cnt += 1
            if line_cnt % 1000 == 0:
                print(line_cnt)
    print(line_cnt)
    fw.close()
    print("Predict done! %s" % test_result_path)
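
# For reference, a minimal sketch of the scoring rule above on a toy
# two-class model. All names below (label_prob, label_count,
# label_word_prob, chosen_word_dict) are made-up stand-ins for the trained
# model's attributes, not the project's real data.
def _demo_nb_scoring():
    label_prob = {'sports': 0.5, 'tech': 0.5}        # P(label)
    label_count = {'sports': 10, 'tech': 8}          # N_k: words seen per label
    label_word_prob = {'sports': {'ball': 0.3}, 'tech': {'chip': 0.25}}
    chosen_word_dict = {'ball': True, 'chip': True}  # selected features
    V = len(chosen_word_dict)
    doc_words = ['ball', 'chip']                     # de-duplicated doc words
    probs = {}
    for name, p in label_prob.items():
        score = np.log(p)
        for w in doc_words:
            # Laplace-smoothed fallback for words unseen under this label.
            wp = label_word_prob[name].get(w, 1. / (label_count[name] + V))
            score += np.log(wp)
        probs[name] = score
    print(max(probs, key=probs.get))  # top-1 label name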
def dataset_generator(data_path=None, epochs=10, shuffle_buffer_size=1024,
                      batch_size=16, max_seq_len=100, word2id_dict=None):
    # input_data: label \t word_1 word_2 ..., words split by ' '
    # Output: label, [word_index_1, ..., word_index_max_seq_len];
    #         0 for pad, 1 for unk.
    # Shorter sequences are padded, longer ones are truncated.
    # {label_name: label_index}
    label_dict = get_label_dict()

    def generator():
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                buf = line[:-1].split('\t')
                if len(buf) != 2:
                    continue
                label_name = buf[0]
                label = int(label_dict[label_name])
                # Map words to indices (1 for unk), then truncate/pad.
                word_indices = [word2id_dict.get(word, 1)
                                for word in buf[1].split(' ')][:max_seq_len]
                word_indices += [0] * (max_seq_len - len(word_indices))
                inputs = np.array(word_indices, dtype=np.int32)
                yield inputs, [label]

    dataset = tf.data.Dataset.from_generator(
        generator,
        output_shapes=((max_seq_len, ), (1, )),
        output_types=(tf.int32, tf.int32))
    return dataset.repeat(epochs) \
        .shuffle(buffer_size=shuffle_buffer_size) \
        .batch(batch_size=batch_size)
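
# A minimal usage sketch for the generator above; the vocabulary and path
# are hypothetical. Ids 0/1 are reserved for pad/unk, so real words start
# at index 2.
def _demo_index_dataset():
    word2id_dict = {'hello': 2, 'world': 3}  # toy vocabulary
    dataset = dataset_generator(data_path='train.txt',  # hypothetical path
                                epochs=1, batch_size=4,
                                max_seq_len=100, word2id_dict=word2id_dict)
    for inputs, labels in dataset.take(1):
        print(inputs.shape, labels.shape)  # e.g. (4, 100) (4, 1)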
def pretrained_dataset_generator(data_path=None, epochs=10,
                                 shuffle_buffer_size=1024, batch_size=16):
    # input_data: label \t vec_1,vec_2,...,vec_300, split by ','
    # Output: inputs, label; [None, 300], [None, 1]
    # {label_name: label_index}
    label_dict = get_label_dict()

    def generator():
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                buf = line[:-1].split('\t')
                label_name = buf[0]
                label = int(label_dict[label_name])
                inputs = np.array([float(x) for x in buf[1].split(',')],
                                  dtype=np.float32)
                yield inputs, [label]

    dataset = tf.data.Dataset.from_generator(
        generator,
        output_shapes=((300, ), (1, )),
        output_types=(tf.float32, tf.int32))
    return dataset.repeat(epochs) \
        .shuffle(buffer_size=shuffle_buffer_size) \
        .batch(batch_size=batch_size)
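
# A minimal training sketch on top of this generator. The model layout and
# the path are assumptions for illustration; 14 matches the num_classes
# noted elsewhere in this project.
def _demo_train_on_vectors():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=(300, )),
        tf.keras.layers.Dense(14, activation='softmax'),
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    dataset = pretrained_dataset_generator(
        data_path='train_vec.txt',  # hypothetical path
        epochs=10, batch_size=16)
    model.fit(dataset)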
def test_model(model=None, vec_path=None, result_path=None):
    # data: label_name \t vec_1,vec_2,...,vec_300, split by ','
    # result: true_label_index \t pred_label_index
    with open(vec_path, 'r', encoding='utf-8') as fr:
        with open(result_path, 'w', encoding='utf-8') as fw:
            # {label_name: label_index}
            label_dict = get_label_dict()
            line_cnt = 0
            for line in fr:
                buf = line[:-1].split('\t')
                label_name = buf[0]
                true_label = label_dict[label_name]
                # [1, 300]
                inputs = np.array([float(x) for x in buf[1].split(',')],
                                  dtype=np.float32).reshape((-1, 300))
                # [1, num_classes=14]
                softmax = model(inputs)
                pred_label = np.argmax(softmax, axis=1)
                fw.write(str(true_label) + '\t' + str(pred_label[0]) + '\n')
                line_cnt += 1
                if line_cnt % 1000 == 0:
                    print(line_cnt)
            print("Total line %d" % line_cnt)
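
# A minimal calling sketch; the save directory and file paths are
# hypothetical.
def _demo_test_vec_model():
    model = tf.keras.models.load_model('saved_model')  # hypothetical path
    test_model(model=model,
               vec_path='test_vec.txt',   # label \t 300 comma-separated floats
               result_path='result.txt')  # true_label \t pred_label per line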
def compute_tfidf(data_path=None, idf_dict=None, word2id_dict=None,
                  tfidf_path=None):
    # data: label \t title_words \t text_words, words split by ' '
    # idf_dict: {word: idf}
    # word2id_dict: {word: index}
    # tfidf: label_index \s word_index:tfidf \s word_index:tfidf,
    #        word_index sorted ASC.
    fw = open(tfidf_path, 'w', encoding='utf-8')
    label_dict = get_label_dict()
    with open(data_path, 'r', encoding='utf-8') as fr:
        for line in fr:
            buf = line[:-1].split('\t')
            if len(buf) != 3:
                continue
            label_name = buf[0]
            label_index = label_dict[label_name]
            title = buf[1]
            text = buf[2]
            # === Count tf
            tf_dict = {}
            for word in (title + ' ' + text).split(' '):
                if word not in word2id_dict:
                    continue
                if word not in tf_dict:
                    tf_dict[word] = 1
                else:
                    tf_dict[word] += 1
            # === Compute tf-idf
            tfidf_dict = {}
            for word, tf in tf_dict.items():
                if word not in idf_dict:
                    continue
                idf = idf_dict[word]
                tfidf = tf * idf
                word_index = word2id_dict[word]
                tfidf_dict[word_index] = tfidf
            # === Store in libsvm format: keys sorted ASC.
            tfidf_list = []
            for word_index, tfidf in sorted(tfidf_dict.items(),
                                            key=lambda x: x[0]):
                tfidf_list.append(str(word_index) + ':' +
                                  "{:.4f}".format(tfidf))
            if len(tfidf_list) > 0:
                fw.write(str(label_index) + ' ' + ' '.join(tfidf_list) + '\n')
    fw.close()
    print("Write done! %s" % tfidf_path)
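
# A small worked example of one output cell; the counts, idf value and
# indices are illustrative only.
def _demo_tfidf_cell():
    tf_count = 3            # the word occurs 3 times in this doc
    idf = 2.40              # looked up from idf_dict (illustrative value)
    tfidf = tf_count * idf  # 7.2
    # One libsvm-style line: label_index 2, word_index 17.
    print(str(2) + ' ' + '17:' + "{:.4f}".format(tfidf))  # "2 17:7.2000"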
def test_model(model=None, test_path=None, result_path=None,
               word_vec_dict=None, max_seq_len=100):
    # test_data: label \t title_words \t text_words
    # result: true_label_index \t pred_label_index
    with open(test_path, 'r', encoding='utf-8') as fr:
        with open(result_path, 'w', encoding='utf-8') as fw:
            # {label_name: label_index}
            label_dict = get_label_dict()
            line_cnt = 0
            for line in fr:
                buf = line[:-1].split('\t')
                if len(buf) != 3:  # label \t title \t text
                    continue
                label_name = buf[0]
                true_label = label_dict[label_name]
                title = buf[1]
                text = buf[2]
                words = (title + ' ' + text).split(' ')
                inputs = []
                for word in words:
                    if word not in word_vec_dict:
                        continue
                    inputs.append(word_vec_dict[word])
                    if len(inputs) >= max_seq_len:
                        break
                # Zero-pad up to max_seq_len; 300 = embedding_dim.
                for _ in range(len(inputs), max_seq_len):
                    inputs.append([0] * 300)
                # [1, max_seq_len, 300]
                inputs = np.expand_dims(np.array(inputs), axis=0)
                # [1, num_classes=14]
                softmax = model(inputs)
                pred_label = np.argmax(softmax, axis=1)
                fw.write(str(true_label) + '\t' + str(pred_label[0]) + '\n')
                line_cnt += 1
                if line_cnt % 1000 == 0:
                    print(line_cnt)
            print("Total line %d" % line_cnt)
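
# Both test_model variants write one "true_label_index \t pred_label_index"
# line per sample, so accuracy can be scored directly from the result file.
# A minimal sketch; the default path is hypothetical.
def _demo_accuracy(result_path='result.txt'):
    correct, total = 0, 0
    with open(result_path, 'r', encoding='utf-8') as f:
        for line in f:
            buf = line.rstrip('\n').split('\t')
            if len(buf) < 2:
                continue
            total += 1
            if buf[0] == buf[1]:
                correct += 1
    return correct / total if total else 0.0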
def dataset_generator(data_path=None, epochs=10, shuffle_buffer_size=1024,
                      batch_size=16, max_seq_len=100, word_vec_dict=None):
    # input_data: label \t title_words \t text_words, words split by ' '
    # Output: inputs [max_seq_len, 300] (pretrained word vectors,
    #         zero-padded / truncated), label [1]
    # {label_name: label_index}
    label_dict = get_label_dict()

    def generator():
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                buf = line[:-1].split('\t')
                if len(buf) != 3:
                    continue
                label_name = buf[0]
                label = int(label_dict[label_name])
                title = buf[1]
                text = buf[2]
                words = (title + ' ' + text).split(' ')
                inputs = []
                for word in words:
                    if word not in word_vec_dict:
                        continue
                    inputs.append(word_vec_dict[word])
                    if len(inputs) >= max_seq_len:
                        break
                # Zero-pad up to max_seq_len; 300 = embedding_dim.
                for _ in range(len(inputs), max_seq_len):
                    inputs.append([0] * 300)
                inputs = np.array(inputs)
                yield inputs, [label]

    dataset = tf.data.Dataset.from_generator(
        generator,
        output_shapes=((max_seq_len, 300), (1, )),
        output_types=(tf.float32, tf.int32))
    return dataset.repeat(epochs) \
        .shuffle(buffer_size=shuffle_buffer_size) \
        .batch(batch_size=batch_size)
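
# A minimal sketch of a model consuming these [batch, max_seq_len, 300]
# batches. The Conv1D layout and sizes are assumptions, not the project's
# actual architecture; word_vec_dict and the path are toy stand-ins.
def _demo_train_on_sequences():
    word_vec_dict = {w: [0.1] * 300 for w in ['hello', 'world']}  # toy vectors
    model = tf.keras.Sequential([
        tf.keras.layers.Conv1D(128, kernel_size=3, activation='relu',
                               input_shape=(100, 300)),
        tf.keras.layers.GlobalMaxPooling1D(),
        tf.keras.layers.Dense(14, activation='softmax'),  # num_classes=14
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    dataset = dataset_generator(data_path='train.txt',  # hypothetical path
                                max_seq_len=100, word_vec_dict=word_vec_dict)
    model.fit(dataset)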
def tfidf_to_libsvm_format(tfidf_path=None, libsvm_path=None,
                           word2id_dict_path=None):
    # tfidf: label_name \t word:tfidf \s word:tfidf
    # libsvm: label_index \s word_index:tfidf, word_index sorted ASC
    # word2id_dict: {word: index}, starting from 0
    with open(word2id_dict_path, 'rb') as fr:
        word2id_dict = pickle.load(fr)
    print("#word2id_dict=%d" % len(word2id_dict))
    with open(libsvm_path, 'w', encoding='utf-8') as fw:
        with open(tfidf_path, 'r', encoding='utf-8') as fr:
            label_dict = get_label_dict()
            line_cnt = 0
            for line in fr:
                buf = line[:-1].split('\t')
                if len(buf) != 2:
                    continue
                label_name = buf[0]
                label_index = label_dict[label_name]
                libsvm_dict = {}
                for pair in buf[1].split(' '):
                    if len(pair.split(':')) != 2:
                        continue
                    word, tfidf = pair.split(':')
                    if word not in word2id_dict:
                        continue
                    word_index = word2id_dict[word]
                    libsvm_dict[word_index] = tfidf
                # === Sort by word_index ASC
                libsvm_list = []
                for word_index, tfidf in sorted(libsvm_dict.items(),
                                                key=lambda x: x[0]):
                    libsvm_list.append(str(word_index) + ':' + tfidf)
                fw.write(str(label_index) + ' ' + ' '.join(libsvm_list) +
                         '\n')
                line_cnt += 1
                if line_cnt % 1000 == 0:
                    print(line_cnt)
            print("Total line %d" % line_cnt)
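
# One input/output pair to pin down the two formats; the vocabulary and
# indices are illustrative.
def _demo_tfidf_line_to_libsvm():
    word2id_dict = {'ball': 7, 'game': 3}         # toy vocabulary
    pairs = 'ball:0.5120 game:0.3300'.split(' ')  # buf[1] of one tfidf line
    libsvm_dict = {word2id_dict[p.split(':')[0]]: p.split(':')[1]
                   for p in pairs}
    line = '2 ' + ' '.join(str(i) + ':' + v
                           for i, v in sorted(libsvm_dict.items()))
    print(line)  # "2 3:0.3300 7:0.5120" -> label_index 2, word_index ASC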
def word_to_libsvm_format(text_path=None, libsvm_path=None,
                          word2id_dict_path=None, df_info_dict_path=None):
    # text: label_name \t title_words \t text_words; words split by ' '
    # libsvm: label_index \s word_index:tfidf, word_index sorted ASC
    # word2id_dict: {word: index}, starting from 0
    # df_info: {'df_dict': df_dict, 'total_num_doc': total_num_doc}
    # df_dict: {word: doc_freq}
    with open(word2id_dict_path, 'rb') as fr:
        word2id_dict = pickle.load(fr)
    print("#word2id_dict=%d" % len(word2id_dict))
    with open(df_info_dict_path, 'rb') as fr:
        df_info_dict = pickle.load(fr)
    total_num_doc = df_info_dict['total_num_doc']
    df_dict = df_info_dict['df_dict']
    print('total_num_doc %d' % total_num_doc)
    print("#df_dict=%d" % len(df_dict))
    with open(libsvm_path, 'w', encoding='utf-8') as fw:
        with open(text_path, 'r', encoding='utf-8') as fr:
            label_dict = get_label_dict()
            line_cnt = 0
            for line in fr:
                buf = line[:-1].split('\t')
                if len(buf) != 3:
                    continue
                label_name = buf[0]
                label_index = label_dict[label_name]
                title = buf[1]
                text = buf[2]
                # === Count tf in the current doc
                tf_dict = {}  # {word: tf_count}
                for word in (title + ' ' + text).split(' '):
                    if (len(word) == 0 or word not in word2id_dict
                            or word not in df_dict):
                        continue
                    if word not in tf_dict:
                        tf_dict[word] = 1
                    else:
                        tf_dict[word] += 1
                # === Compute tf-idf
                tfidf_dict = {}  # {word_index: tfidf}
                for word, tf in tf_dict.items():
                    word_index = word2id_dict[word]
                    df = int(df_dict[word])
                    # Smoothed idf: log(N / (df + 1))
                    tfidf = tf * np.log(total_num_doc / (df + 1))
                    tfidf_dict[word_index] = tfidf
                # === Sort by word_index ASC
                libsvm_list = []
                for word_index, tfidf in sorted(tfidf_dict.items(),
                                                key=lambda x: x[0]):
                    libsvm_list.append(str(word_index) + ':' +
                                       "{:.4f}".format(tfidf))
                fw.write(str(label_index) + ' ' + ' '.join(libsvm_list) +
                         '\n')
                line_cnt += 1
                if line_cnt % 1000 == 0:
                    print(line_cnt)
            print("Total line %d" % line_cnt)
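
# The libsvm files produced above can be fed to any libsvm-compatible
# trainer. A sketch with scikit-learn, assuming it is installed; the paths
# are hypothetical, and word2id_dict is loaded as in the function above.
def _demo_train_libsvm(word2id_dict):
    from sklearn.datasets import load_svmlight_file
    from sklearn.svm import LinearSVC
    # Fix n_features to the vocabulary size so train/test columns align.
    X_train, y_train = load_svmlight_file('train.libsvm',
                                          n_features=len(word2id_dict))
    X_test, y_test = load_svmlight_file('test.libsvm',
                                        n_features=len(word2id_dict))
    clf = LinearSVC()
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))  # mean accuracy on the test split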