Example #1
import math
from collections import Counter

from keras.preprocessing import text

# doc_size (total number of documents) and word_docs (per-word document
# frequency) are assumed to be defined at module level.
def calculate_tfidf(words):
    tfidf = []
    seq = text.text_to_word_sequence(words)
    counts = Counter(seq)
    total = sum(counts.values())  # total number of tokens in this text
    for item in seq:
        # tf = counts[item] / total; idf = log10(doc_size / (df + 1))
        tf_idf = (counts[item] / total) * math.log(doc_size / (word_docs[item] + 1), 10)
        tfidf.append(tf_idf)
    return tfidf
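
The two free names doc_size and word_docs can be taken from a fitted Keras Tokenizer, whose document_count and word_docs attributes hold exactly these statistics. A minimal usage sketch assuming that source; the two-line corpus is made up for illustration:

from keras.preprocessing.text import Tokenizer

corpus = ["deep learning for entity matching",
          "entity matching with word embeddings"]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

doc_size = tokenizer.document_count  # number of documents seen during fitting
word_docs = tokenizer.word_docs      # word -> number of documents containing it

print(calculate_tfidf(corpus[0]))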
        
Example #2
import codecs
import os
import _pickle
from os.path import join
from random import shuffle

import numpy as np
from keras.preprocessing import text
from keras.preprocessing.sequence import pad_sequences

# TRAIN_DATA, SAVED_MODEL, MAX_SEQUENCE_LENGTH and jaccard_index() are
# assumed to be defined elsewhere in the module.
def gen_vectors():
    d = []  # stopword list
    with codecs.open(join(TRAIN_DATA, 'stoplist.txt'), 'r', 'utf-8') as f:
        for word in f:
            d.append(word.strip())
    global MAX_SEQUENCE_LENGTH
    print("Fit tokenizer...")
    # train_data.txt is expected to contain a Python literal list of
    # (label, mag_text, aminer_text) triples.
    with codecs.open(join(TRAIN_DATA, 'train_data.txt'), 'r', 'utf-8') as f:
        datalist = eval(f.read())
    shuffle(datalist)
    print("Input data shuffled...")
    labels = []
    data_mag = []
    data_aminer = []
    rate = []
    for label, mag, aminer in datalist:
        labels.append(label)
        x = text.text_to_word_sequence(mag)
        y = text.text_to_word_sequence(aminer)
        MAX_SEQUENCE_LENGTH = max(MAX_SEQUENCE_LENGTH, max(len(x), len(y)))
        data_mag.append(x)
        data_aminer.append(y)
    if os.path.exists(join(SAVED_MODEL, 'tokenizer')):
        with open(join(SAVED_MODEL, 'tokenizer'), 'rb') as f1:
            tokenizer = _pickle.load(f1)
    else:
        tokenizer = text.Tokenizer()
        tokenizer.fit_on_texts(data_mag + data_aminer)
        print("Save tokenizer...")
        with open(join(SAVED_MODEL, 'tokenizer'), 'wb') as f1:
            _pickle.dump(tokenizer, f1)
    # tokenized_mag = tokenizer.texts_to_sequences(data_mag)
    # tokenized_aminer = tokenizer.texts_to_sequences(data_aminer)
    mag = []
    aminer = []
    mag_len = []
    aminer_len = []
    reverse = []
    max_len = 0
    for i in range(len(data_mag)):
        # jaccard_index() is assumed to return, in order: the two sequence
        # lengths, two index sequences to pad, a similarity rate and a
        # reverse flag.
        x, y, a, b, c, e = jaccard_index(data_mag[i], data_aminer[i], d)
        mag.append(a)
        aminer.append(b)
        rate.append([c])
        mag_len.append([x])
        aminer_len.append([y])
        reverse.append([e] * 16)  # repeat the flag, presumably to match a feature width
        max_len = max(max(len(a), len(b)), max_len)
    # rate = out()
    sseq_mag = pad_sequences(tokenizer.texts_to_sequences(data_mag),
                             maxlen=MAX_SEQUENCE_LENGTH)
    sseq_aminer = pad_sequences(tokenizer.texts_to_sequences(data_aminer),
                                maxlen=MAX_SEQUENCE_LENGTH)
    seq_mag = pad_sequences(mag, maxlen=8)
    seq_aminer = pad_sequences(aminer, maxlen=8)

    return (MAX_SEQUENCE_LENGTH, len(tokenizer.word_index), labels,
            seq_mag, seq_aminer, rate, np.array(mag_len),
            np.array(aminer_len), sseq_mag, sseq_aminer, np.array(reverse))
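
A minimal sketch of how a caller might unpack the eleven return values; the variable names below are illustrative only and follow the order of the return statement:

(max_seq_len, vocab_size, labels, seq_mag, seq_aminer, rate,
 mag_len, aminer_len, sseq_mag, sseq_aminer, reverse) = gen_vectors()

print("max sequence length:", max_seq_len)
print("vocabulary size:", vocab_size)
print("training pairs:", len(labels))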
Example #3
def split_and_tokenize(self):
    # Replace each [label, text1, text2] entry of self.train_data with the
    # label plus the two texts split into word sequences.
    for i, pair in enumerate(self.train_data.copy()):
        seq1 = text.text_to_word_sequence(pair[1])
        seq2 = text.text_to_word_sequence(pair[2])
        self.train_data[i] = [pair[0], seq1, seq2]
    return len(self.tokenizer.word_index)
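
This method assumes an object that already holds train_data and a fitted tokenizer. A minimal, hypothetical class context that wires the method in and runs it; the class name, constructor and sample data are assumptions made for illustration:

from keras.preprocessing import text

class PairDataset:
    def __init__(self, train_data):
        # train_data: list of [label, text1, text2] entries
        self.train_data = train_data
        self.tokenizer = text.Tokenizer()
        self.tokenizer.fit_on_texts([t for _, a, b in train_data for t in (a, b)])

# Attach the method defined above and run it on a tiny sample.
PairDataset.split_and_tokenize = split_and_tokenize
ds = PairDataset([[1, "Deep Learning", "deep learning methods"]])
print(ds.split_and_tokenize(), ds.train_data)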