import codecs
import math
import os
import _pickle
from collections import Counter
from os.path import join
from random import shuffle

import numpy as np
from keras.preprocessing import text
from keras.preprocessing.sequence import pad_sequences

# Module-level state assumed by the functions below: TRAIN_DATA and
# SAVED_MODEL are directory paths defined elsewhere in the project, and
# jaccard_index is a project helper. doc_size / word_docs hold corpus
# document statistics (see the usage sketch after calculate_tfidf).
MAX_SEQUENCE_LENGTH = 0  # longest token sequence seen; updated by gen_vectors


def calculate_tfidf(words):
    """Return a TF-IDF weight for each token in the string `words`."""
    tfidf = []
    seq = text.text_to_word_sequence(words)
    counts = Counter(seq)
    total = sum(counts.values())  # equals len(seq)
    for item in seq:
        tf = counts[item] / total
        idf = math.log(doc_size / (word_docs[item] + 1), 10)
        tfidf.append(tf * idf)
    return tfidf
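# Minimal usage sketch for calculate_tfidf, assuming the corpus statistics
# come from a fitted Keras Tokenizer: its document_count attribute gives the
# number of documents and its word_docs dict maps each word to its document
# frequency. The demo corpus and the _tfidf_demo name are hypothetical.
def _tfidf_demo():
    global doc_size, word_docs
    docs = ["deep learning for entity matching",
            "entity matching across bibliographic sources"]
    demo_tok = text.Tokenizer()
    demo_tok.fit_on_texts(docs)
    doc_size = demo_tok.document_count
    word_docs = demo_tok.word_docs
    print(calculate_tfidf("entity matching for bibliographic sources"))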
def gen_vectors():
    # Load the stopword list, one word per line.
    stopwords = []
    with codecs.open(join(TRAIN_DATA, 'stoplist.txt'), 'r', 'utf-8') as f:
        for word in f.readlines():
            stopwords.append(word.rstrip('\n'))

    global MAX_SEQUENCE_LENGTH
    print("Fit tokenizer...")
    with codecs.open(join(TRAIN_DATA, 'train_data.txt'), 'r', 'utf-8') as f:
        # The file holds a Python literal list of (label, mag, aminer)
        # triples; ast.literal_eval would be a safer choice than eval here.
        datalist = eval(f.read())
    shuffle(datalist)
    print("Input data shuffled...")

    labels = []
    data_mag = []
    data_aminer = []
    rate = []
    for label, mag_text, aminer_text in datalist:
        labels.append(label)
        x = text.text_to_word_sequence(mag_text)
        y = text.text_to_word_sequence(aminer_text)
        MAX_SEQUENCE_LENGTH = max(MAX_SEQUENCE_LENGTH, len(x), len(y))
        data_mag.append(x)
        data_aminer.append(y)

    # Load a cached tokenizer if one exists; otherwise fit and save it.
    if os.path.exists(join(SAVED_MODEL, 'tokenizer')):
        with open(join(SAVED_MODEL, 'tokenizer'), 'rb') as f1:
            tokenizer = _pickle.load(f1)
    else:
        tokenizer = text.Tokenizer()
        tokenizer.fit_on_texts(data_mag + data_aminer)
        print("Save tokenizer...")
        with open(join(SAVED_MODEL, 'tokenizer'), 'wb') as f1:
            _pickle.dump(tokenizer, f1)

    mag = []
    aminer = []
    mag_len = []
    aminer_len = []
    reverse = []
    max_len = 0
    for i in range(len(data_mag)):
        # jaccard_index is defined elsewhere in the project; its six outputs
        # are used here as: lengths (x, y), processed sequences (a, b), an
        # overlap rate (c), and a reverse flag (e).
        x, y, a, b, c, e = jaccard_index(data_mag[i], data_aminer[i], stopwords)
        mag.append(a)
        aminer.append(b)
        rate.append([c])
        mag_len.append([x])
        aminer_len.append([y])
        reverse.append([e] * 16)  # replicate the flag into a fixed-width feature
        max_len = max(len(a), len(b), max_len)  # tracked but unused below

    sseq_mag = pad_sequences(tokenizer.texts_to_sequences(data_mag),
                             maxlen=MAX_SEQUENCE_LENGTH)
    sseq_aminer = pad_sequences(tokenizer.texts_to_sequences(data_aminer),
                                maxlen=MAX_SEQUENCE_LENGTH)
    seq_mag = pad_sequences(mag, maxlen=8)
    seq_aminer = pad_sequences(aminer, maxlen=8)

    return (MAX_SEQUENCE_LENGTH, len(tokenizer.word_index), labels, seq_mag,
            seq_aminer, rate, np.array(mag_len), np.array(aminer_len),
            sseq_mag, sseq_aminer, np.array(reverse))
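# Hedged usage sketch: gen_vectors returns eleven values, and the unpacking
# below mirrors its return statement. It assumes stoplist.txt and
# train_data.txt exist under TRAIN_DATA; the function and variable names
# here are illustrative, not part of the original code.
def run_feature_pipeline():
    (max_seq_len, vocab_size, labels, seq_mag, seq_aminer, rate,
     mag_len, aminer_len, sseq_mag, sseq_aminer, reverse) = gen_vectors()
    print("vocab size:", vocab_size)
    print("jaccard-filtered MAG shape:", seq_mag.shape)  # (n_pairs, 8)
    print("full-sequence MAG shape:", sseq_mag.shape)    # (n_pairs, max_seq_len)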
def split_and_tokenize(self):
    # Replace each raw [label, text1, text2] triple with its tokenized form.
    for i, pair in enumerate(self.train_data.copy()):
        seq1 = text.text_to_word_sequence(pair[1])
        seq2 = text.text_to_word_sequence(pair[2])
        self.train_data[i] = [pair[0], seq1, seq2]
    return len(self.tokenizer.word_index)
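# Hedged sketch of a host class for split_and_tokenize: the original class
# is not shown, only that it carries `train_data` triples and a fitted
# `tokenizer`. The PairDataset name and its constructor are assumptions.
class PairDataset:
    def __init__(self, train_data):
        self.train_data = train_data  # list of [label, text1, text2]
        self.tokenizer = text.Tokenizer()
        self.tokenizer.fit_on_texts(
            [t for _, a, b in train_data for t in (a, b)])

    # Bind the module-level function above as a method.
    split_and_tokenize = split_and_tokenize


if __name__ == '__main__':
    ds = PairDataset([[1, "Deep Learning", "deep neural learning"]])
    print("vocab size:", ds.split_and_tokenize())  # -> 3
    print(ds.train_data[0])  # [1, ['deep', 'learning'], ['deep', 'neural', 'learning']]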