def get_data_1(train_sents, maxlen):
    word_list = []
    for i in range(len(train_sents)):
        for words in train_sents[i]:
            word_list.append(words)
    sequence = []
    stride = 1
    # applying windowing for sequence generation
    for i in range(0, len(word_list) - maxlen, stride):
        line = word_list[i:i + maxlen]
        sequence.append(line)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sequence)
    seq = tokenizer.texts_to_sequences(sequence)
    vocab_len = len(tokenizer.word_index) + 1
    seq = np.array(seq)
    x_train = seq[:, :-1]
    # shift each window by one position to form next-word targets
    y_train = np.zeros((x_train.shape[0], x_train.shape[1], 1))
    for i in range(x_train.shape[0]):
        for j in range(x_train.shape[1]):
            y_train[i, j, 0] = seq[i, j + 1]
    return x_train, y_train, vocab_len, tokenizer
def read_copus_generator(self, batch_size=64):
    """Return a generator with the specified batch_size."""
    logger.info("Begin reading corpus {0}".format(file_name))
    data = []
    index = 0
    with open(file_name, 'r') as fread:
        while True:
            line = fread.readline()
            if not line:  # readline() returns '' at EOF rather than raising
                logger.info("Read End")
                break
            data.append(line)
            index += 1
            if index % 100000 == 0:
                logger.info("The program has processed {0} lines".format(index))
    tokenizer = Tokenizer(nb_words=30000)
    tokenizer.fit_on_texts(data)
    logger.info("word num: {0}".format(len(tokenizer.word_counts)))
    sorted_word_counts = sorted(tokenizer.word_counts.items(),
                                key=operator.itemgetter(1), reverse=True)
    # save the word counts to the meta file
    with open(file_name.replace("train.", "meta."), "w") as fwrite:
        for word_cnt in sorted_word_counts:
            key = word_cnt[0]
            val = word_cnt[1]
            line = key + ":" + str(val) + "\n"
            fwrite.write(line)
    vectorize_data = tokenizer.texts_to_matrix(data)
    return vectorize_data
def train(dataReader, oneHot, oneHotAveraged, contextHashes):
    n = (Epochs + 1) * SamplesPerEpoch  # TODO + 1 should not be needed
    tokeniser = Tokenizer(nb_words=MaxWords)
    tokeniser.fit_on_texts(row[0] for row in dataReader.trainingData(n))
    # `word_index` maps each word to its unique index
    dictionarySize = len(tokeniser.word_index) + 1
    oneHotDimension = (1 if oneHotAveraged else SequenceLength) * dictionarySize if oneHot else 0
    contextHashesDimension = dictionarySize * 2 if contextHashes else 0
    model = Sequential()
    model.add(Dense(EmbeddingDim, input_dim=(oneHotDimension + contextHashesDimension)))
    model.add(Dense(Labels, activation='softmax'))
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    trainingGenerator = mapGenerator(dataReader.trainingData(n), tokeniser,
                                     dictionarySize, oneHot, oneHotAveraged, contextHashes)
    validationGenerator = mapGenerator(dataReader.validationData(n), tokeniser,
                                       dictionarySize, oneHot, oneHotAveraged, contextHashes)
    model.fit_generator(trainingGenerator,
                        nb_epoch=Epochs,
                        samples_per_epoch=SamplesPerEpoch,
                        validation_data=validationGenerator,
                        nb_val_samples=SamplesPerEpoch)
    model2 = Sequential()
    model2.add(Dense(EmbeddingDim,
                     input_dim=(oneHotDimension + contextHashesDimension),
                     weights=model.layers[0].get_weights()))
    return model, model2, tokeniser, dictionarySize
def get_fitted_tokenizer(df_train, df_test):
    comments_train = df_train[COMMENT_COL].values.tolist()
    comments_test = df_test[COMMENT_COL].values.tolist()
    tokenizer = Tokenizer()
    # tokenizer.num_words = MAX_NUM_WORDS
    tokenizer.fit_on_texts(comments_train + comments_test)
    return tokenizer
def get_train_val_matrix(texts, labels, max_features=10000, max_len=100):
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print(f'Found {len(word_index)} unique tokens')
    data = pad_sequences(sequences, maxlen=max_len)
    labels = np.asarray(labels)
    # shuffle data and labels with the same permutation
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    train_sample_n = 20000
    validation_sample_n = 5000
    x_train = data[:train_sample_n]
    x_val = data[train_sample_n:validation_sample_n + train_sample_n]
    y_train = labels[:train_sample_n]
    y_val = labels[train_sample_n:validation_sample_n + train_sample_n]
    return (x_train, y_train), (x_val, y_val), word_index
def prepare_tokenizer(words):
    '''
    function to generate vocabulary of the given list of words
    implemented by Anindya
    @param words => the list of words to be tokenized
    '''
    # obtain a tokenizer
    t = Tokenizer(filters='')  # don't let keras ignore any words
    t.fit_on_texts(words)
    field_dict = dict()
    rev_field_dict = dict()
    for key, value in t.word_index.items():
        field_dict[value] = key
        rev_field_dict[key] = value
    vocab_size = len(t.word_index) + 1
    '''
    Small modification from Animesh
    # also add the '<unk>' token to the dictionary at 0th position
    '''
    field_dict[0] = '<unk>'
    rev_field_dict['<unk>'] = 0
    # print(vocab_size)
    # integer encode the documents
    encoded_docs = t.texts_to_sequences(words)
    # print("debug: " + str(encoded_docs))
    return np.array(encoded_docs), field_dict, rev_field_dict, vocab_size
def LoadSMILESData(duplicateProb=0, seed=7):
    dataComp = dataset.LoadData('data', 0)
    smiles = list(map(lambda x: x._SMILE, dataComp))
    tokenizer = Tokenizer(num_words=None, char_level=True)
    tokenizer.fit_on_texts(smiles)
    print(smiles[0])
    # count how often each character occurs across all SMILES strings
    dictionary = {}
    i = 0
    k = 0
    for smile in smiles:
        i += 1
        for c in list(smile):
            k += 1
            if c in dictionary:
                dictionary[c] += 1
            else:
                dictionary[c] = 1
    print(len(dictionary))
    # sequence encode
    encoded_docs = tokenizer.texts_to_sequences(smiles)
    # pad sequences
    max_length = max([len(s) for s in smiles])
    vocab = {'C': 1, 'c': 2, '(': 3, ')': 4, 'O': 5, '=': 6, '1': 7, 'N': 8,
             '2': 9, '3': 10, '[': 11, ']': 12, 'F': 13, '4': 14, 'l': 15,
             'n': 16, 'S': 17, '@': 18, 'H': 19, '5': 20, '+': 21, '-': 22,
             'B': 23, 'r': 24, '\\': 25, '#': 26, '6': 27, '.': 28, '/': 29,
             's': 30, 'P': 31, '7': 32, 'i': 33, 'o': 34, '8': 35, 'I': 36,
             'a': 37, '%': 38, '9': 39, '0': 40, 'K': 41, 'e': 42, 'A': 43,
             'g': 44, 'p': 45, 'M': 46, 'T': 47, 'b': 48, 'd': 49, 'V': 50,
             'Z': 51, 'G': 52, 'L': 53}
    Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    # define vocabulary size (largest integer value)
    labels = list(map(lambda x: 1 if x.mutagen == True else 0, dataComp))
    return Xtrain, labels, vocab, max_length
class SequenceTransformer(BaseEstimator, TransformerMixin):
    """Transforms np array of strings into sequences."""

    def __init__(self, analyzer='word', max_features=10000, max_len=100):
        self.max_len = max_len
        self.analyzer = analyzer
        self.max_features = max_features

    def transform(self, X, y=None):
        try:
            getattr(self, "transformer_")
        except AttributeError:
            raise RuntimeError("You must fit transformer before using it!")
        X_seq = self.transformer_.texts_to_sequences(list(X))
        X_seq = sequence.pad_sequences(X_seq, maxlen=self.max_len)
        return X_seq

    def fit(self, X, y=None):
        if self.analyzer == 'char':
            char_level = True
        elif self.analyzer == 'word':
            char_level = False
        else:
            print("invalid analyzer")
            return
        self.transformer_ = Tokenizer(nb_words=self.max_features, lower=True,
                                      char_level=char_level)
        self.transformer_.fit_on_texts(X)
        return self
class Featurizer:
    max_words = None
    tokenizer = None

    def __init__(self, max_words=1000):
        self.max_words = max_words
        self.tokenizer = Tokenizer(num_words=max_words)

    def fit_transform(self, data):
        texts = [l['text'] for l in data]
        self.tokenizer.fit_on_texts(texts)
        # remove words that cross the max_words limit
        self.tokenizer.word_index = {k: v for k, v in self.tokenizer.word_index.items()
                                     if v <= self.max_words}
        return self.transform(data)

    def transform(self, data):
        texts = [l['text'] for l in data]
        return self.tokenizer.texts_to_matrix(texts, mode='binary')

    def transform_inv(self, m):
        index = {v: k for k, v in self.tokenizer.word_index.items()}  # word index by id
        return [[index.get(i) for i in np.nonzero(line)[0] if i in index] for line in m]

    def save(self, filepath):
        with open(filepath + '_word_index.json', 'w') as f:
            f.write(json.dumps(self.tokenizer.word_index))

    @classmethod
    def load(cls, filepath):
        with open(filepath + '_word_index.json', 'r') as f:
            word_index = json.load(f)
        c = cls(max_words=len(word_index))
        c.tokenizer.word_index = word_index
        return c
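# A minimal usage sketch for the Featurizer above (my addition; the records
# and the file path are hypothetical): fit on dicts with a 'text' key, save
# the word index, and reload it later so inference reuses the same vocabulary.
records = [{'text': 'the cat sat'}, {'text': 'the dog barked'}]
feat = Featurizer(max_words=100)
m = feat.fit_transform(records)           # binary document-term matrix
feat.save('/tmp/featurizer')              # writes /tmp/featurizer_word_index.json
feat2 = Featurizer.load('/tmp/featurizer')
m2 = feat2.transform(records)             # featurizes with the saved word index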
def preprocess_embedding():
    corpus_train, target, filenames = get_corpus()
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus_train)
    sequences = tokenizer.texts_to_sequences(corpus_train)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    MAX_SEQUENCE_LENGTH = 50
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
        '/home/flippped/Desktop/xiangmu/baseline/GoogleNews-vectors-negative300.bin.gz',
        binary=True)
    word2vec_model.init_sims(replace=True)
    # create one matrix for document words
    EMBEDDING_DIM = 300
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    print(embedding_matrix.shape)
    for word, i in word_index.items():
        try:
            embedding_vector = word2vec_model[str(word)]
            if embedding_vector is not None:
                # words not found in the embedding index will be all-zeros
                embedding_matrix[i] = embedding_vector
        except KeyError:
            continue
    return data, target, filenames, embedding_matrix, word_index
def get_fitted_tokenizer(df_train, df_test):
    comments_train = df_train[COMMENT_COL].values.tolist()
    comments_test = df_test[COMMENT_COL].values.tolist()
    # keep '!' and '?'; filter the rest of the default punctuation
    tokenizer = Tokenizer(filters='"#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n')
    tokenizer.fit_on_texts(comments_train + comments_test)
    return tokenizer
def test_tokenizer_unicode():
    texts = [u'ali veli kırk dokuz elli',
             u'ali veli kırk dokuz elli veli kırk dokuz']
    tokenizer = Tokenizer(num_words=5)
    tokenizer.fit_on_texts(texts)
    assert len(tokenizer.word_counts) == 5
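# A companion sketch (my addition, assuming Keras 2.x argument names):
# `num_words` never truncates `word_counts` -- fitting counts every word --
# it only caps the indices that texts_to_sequences / texts_to_matrix emit.
# The assert above passes because the corpus has exactly 5 distinct words.
tokenizer = Tokenizer(num_words=3)
tokenizer.fit_on_texts(['a b c d e', 'a b c d'])
assert len(tokenizer.word_counts) == 5           # all words are still counted
seqs = tokenizer.texts_to_sequences(['a b c d e'])
assert all(i < 3 for seq in seqs for i in seq)   # only the top indices survive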
def tokenizeAndGenerateIndex(texts):
    tokenizer = Tokenizer(nb_words=vocab_size)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=maxlen, padding='post')
    return data
def tokenaize(train_path, dev_path):
    with open(train_path) as fd:
        data = fd.read()
    with open(dev_path) as fd:
        data += fd.read()
    tokenizer = Tokenizer(split='\t', oov_token='<UNK>')
    tokenizer.fit_on_texts([data])
    return tokenizer
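# A hedged sketch of what `oov_token` buys (my addition; the exact index the
# OOV token receives varies across Keras versions): words never seen at fit
# time map to the '<UNK>' index instead of being silently dropped.
tok = Tokenizer(oov_token='<UNK>')
tok.fit_on_texts(['hello world'])
unk = tok.word_index['<UNK>']
print(tok.texts_to_sequences(['hello goodbye']))  # 'goodbye' becomes the <UNK> index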
def handle(self, *args, **options):
    ptt = PTT.objects.all()
    ptt_json = PTTSerializer(ptt, many=True).data
    user_comments_times = dict()
    labels_index = 2
    labels = []
    texts = []
    for article in ptt_json:
        pointer = 1 if article['score'] > 0 else 0
        words = jieba.cut(article['contents'])
        for word in words:
            labels.append(pointer)
            texts.append(word)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    data = pad_sequences(sequences, maxlen=self.MAX_SEQUENCE_LENGTH)
    labels = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)
    print('Token word index:', tokenizer.word_index)
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    nb_validation_samples = int(self.VALIDATION_SPLIT * data.shape[0])
    x_train = data[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labels[-nb_validation_samples:]
    print('Training model.')
    # train a 1D convnet with global maxpooling
    sequence_input = Input(shape=(self.MAX_SEQUENCE_LENGTH,), dtype='int32')
    # +1 because Tokenizer indices start at 1
    x = Embedding(output_dim=100, input_dim=len(tokenizer.word_index) + 1,
                  input_length=self.MAX_SEQUENCE_LENGTH)(sequence_input)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(35)(x)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    preds = Dense(labels_index, activation='softmax')(x)
    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop',
                  metrics=['acc'])
    # happy learning!
    model.fit(x_train, y_train, validation_data=(x_val, y_val),
              nb_epoch=2, batch_size=64)
    score = model.evaluate(x_val, y_val, verbose=0)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])
def question_to_input(df_q1, df_q2):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df_q1 + df_q2)
    encoded_1 = tokenizer.texts_to_sequences(df_q1)
    encoded_2 = tokenizer.texts_to_sequences(df_q2)
    question_input_train = sequence.pad_sequences(encoded_1, maxlen=15)
    question_input_test = sequence.pad_sequences(encoded_2, maxlen=15)
    return question_input_train, question_input_test
def tokenize(texts, texts_train, texts_test):
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    sequences_train = tokenizer.texts_to_sequences(texts_train)
    sequences_test = tokenizer.texts_to_sequences(texts_test)
    return word_index, sequences_train, sequences_test
def get_tokenizer(train_comments, nwords):
    print("getting tokenizer..")
    t = Tokenizer(num_words=nwords)
    texts = train_comments
    t.fit_on_texts(texts)
    sequences = t.texts_to_sequences(texts)
    return (t, sequences)
def fit_tokenizer(fname, open_encoding='utf-8'):
    file = open(fname, 'r', encoding=open_encoding)
    text = file.read()
    file.close()
    texts = [text]
    # do not filter out low-frequency words
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    return tokenizer
def keras_classify(df):
    # preprocess: map the words in `text` to integer indices
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing import sequence
    from keras.callbacks import EarlyStopping
    from sklearn.cross_validation import train_test_split

    print("----- Classification by Keras -----")
    max_features = 50000  # keep only the most important words
    # Tokenizer can only handle str, not unicode
    textraw = map(lambda x: x.encode('utf-8'), df.seg_word.values.tolist())
    token = Tokenizer(nb_words=max_features)
    # df.seg_word is already space-separated, so the Tokenizer can treat it
    # like English text and segmentation comes for free
    token.fit_on_texts(textraw)
    # token records each word's index and count; replace the word text in
    # textraw with the word indices, e.g.
    # textraw = ['a b c', 'c d e f'] ==> text_seq = [[1, 2, 3], [3, 4, 5, 6]]
    text_seq = token.texts_to_sequences(textraw)
    nb_classes = len(np.unique(df.label.values))
    print("num of features(vocabulary): ", len(token.word_counts))
    print("num of labels: ", nb_classes)
    max_sent_len = np.max([len(s) for s in text_seq])
    print("max length of document is: ", max_sent_len)
    median_sent_len = np.median([len(s) for s in text_seq])
    print("median length of document is: ", median_sent_len)
    # `.values` on df.label must not be omitted, otherwise
    # np_utils.to_categorical below will fail
    train_X, test_X, train_y, test_y = train_test_split(text_seq, df.label.values,
                                                        train_size=0.7, random_state=1)
    # train_X & test_X rows are still variable-length documents; pad them into
    # an equal-length matrix before training
    seqlen = int(max_sent_len / 2 + median_sent_len / 2)
    X_train = sequence.pad_sequences(train_X, maxlen=seqlen, padding='post', truncating='post')
    X_test = sequence.pad_sequences(test_X, maxlen=seqlen, padding='post', truncating='post')
    # expand y into one-hot form so the network can end with a softmax
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test = np_utils.to_categorical(test_y, nb_classes)

    model = build_cnn_model(max_features, seqlen, nb_classes)
    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
    # train for up to 10 epochs with mini-batches of 32; earlystop decides when to stop
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=10,
              validation_split=0.1, callbacks=[earlystop])
    evaluate(earlystop.model, X_test, Y_test, test_y)

    model = build_lstm_model(max_features, seqlen, nb_classes)
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=1, validation_split=0.1)
    evaluate(model, X_test, Y_test, test_y)

    model = build_mixed_model(max_features, seqlen, nb_classes)
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=1, validation_split=0.1)
    evaluate(model, X_test, Y_test, test_y)

    graph = build_graph_model(max_features, seqlen, nb_classes)
    graph.fit({'input': X_train, 'output': Y_train}, nb_epoch=3,
              batch_size=32, validation_split=0.1)
    predict = graph.predict({'input': X_test}, batch_size=32)
    predict = predict['output']
    classes = predict.argmax(axis=1)
    acc = np_utils.accuracy(classes, test_y)
    print('Test accuracy: ', acc)
def df2seq(df, nb_words):
    textraw = df.EssayText.values.tolist()
    textraw = [line.encode('utf-8') for line in textraw]  # keras needs str
    # keras deals with tokens
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    text_seq = token.texts_to_sequences(textraw)
    return (text_seq, df.Score1.values)
def tokenizeAndGenerateIndex(train, test, maxFeatures, maxLength):
    merged = np.concatenate([train, test])
    tokenizer = Tokenizer(nb_words=maxFeatures)
    tokenizer.fit_on_texts(merged)
    sequences_train = tokenizer.texts_to_sequences(train)
    sequences_test = tokenizer.texts_to_sequences(test)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data_train = pad_sequences(sequences_train, maxlen=maxLength)
    data_test = pad_sequences(sequences_test, maxlen=maxLength)
    return data_train, data_test, word_index
def prepare_tokenizer(words, max_word_length=None):
    '''
    function to generate vocabulary of the given list of words
    implemented by Anindya
    @param words => the list of words to be tokenized
    '''
    # flatten the words list:
    print("flattening the words into a single sequence ... ")
    flat_words = []  # initialize to empty list
    for i in range(len(words)):
        flat_words += words[i]
        if i % 10000 == 0:
            print("joined", i, "examples")
    # obtain a tokenizer
    print("\nmaximum words to work with: ", max_word_length)
    t = Tokenizer(num_words=max_word_length, filters='')  # don't let keras ignore any words
    print("\nKeras's tokenizer kicks off ... ")
    t.fit_on_texts(flat_words)
    field_dict = dict()
    rev_field_dict = dict()
    print("\nbuilding the dict and the rev_dict ... ")
    if max_word_length is not None:
        vals = t.word_index.items()
        vals = sorted(vals, key=lambda x: x[1])
        for key, value in vals[:max_word_length - 1]:
            field_dict[value] = key
            rev_field_dict[key] = value
    else:
        for key, value in t.word_index.items():
            field_dict[value] = key
            rev_field_dict[key] = value
    '''
    Small modification from Animesh
    # also add the '<unk>' token to the dictionary at 0th position
    '''
    field_dict[0] = '<unk>'
    rev_field_dict['<unk>'] = 0
    print("\nencoding the words using the dictionary ... ")
    for i in range(len(words)):
        for j in range(len(words[i])):
            if words[i][j] in rev_field_dict:
                words[i][j] = rev_field_dict[words[i][j]]
            else:
                words[i][j] = rev_field_dict['<unk>']
        if i % 10000 == 0:
            print("encoded", i, "examples")
    vocab_size = len(field_dict)
    return words, field_dict, rev_field_dict, vocab_size
def word_to_index(self, text, tok=None):
    real_text = [' '.join(z) for z in text]
    if tok is None:
        tokenizer = Tokenizer(lower=False, filters=" ")
        tokenizer.fit_on_texts(real_text)
    else:
        tokenizer = tok
    # no loop needed here: pass the list of sentences (str) directly
    sequences = tokenizer.texts_to_sequences(real_text)
    # tokenizer.word_docs.items()
    return sequences, tokenizer
def save_tokenizer(question1, question2):
    questions = question1 + question2
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(questions)
    word_index = tokenizer.word_index
    print("Words in index: %d" % len(word_index))
    # save tokenizer
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return word_index
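# Counterpart sketch for save_tokenizer above (my addition): reload the
# pickled tokenizer so inference uses exactly the same word index; the file
# name matches the one written above.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
sequences = tokenizer.texts_to_sequences(["example question text"])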
def load_mr(nb_words=20000, maxlen=64, embd_type='self'):
    """
    :param embd_type: self vs. w2v
    :return:
    """
    train_size = 0.8
    df = pickled2df('data/mr.p')
    print(df.head())
    train_X, test_X, train_y, test_y = train_test_split(df.text.values.tolist(),
                                                        df.label.values,
                                                        train_size=train_size,
                                                        random_state=1)
    train_X_wds = train_X
    test_X_wds = test_X
    nb_classes = len(np.unique(train_y))
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test = np_utils.to_categorical(test_y, nb_classes)
    # tokenization should be applied on train+test jointly
    n_ta = len(train_X)
    n_ts = len(test_X)
    print('train len vs. test len', n_ta, n_ts)
    textraw = [line.encode('utf-8') for line in train_X + test_X]  # keras needs str
    # keras deals with tokens
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    textseq = token.texts_to_sequences(textraw)
    # stats about the text list
    print('nb_words: ', len(token.word_counts))
    print('mean len: ', np.mean([len(x) for x in textseq]))
    train_X = textseq[0:n_ta]
    test_X = textseq[n_ta:]
    if embd_type == 'self':
        X_train = xcol_nninput_embd(train_X, nb_words, maxlen)
        X_test = xcol_nninput_embd(test_X, nb_words, maxlen)
    elif embd_type == 'w2v':
        w2v = load_w2v('data/Google_w2v.bin')
        print("loaded Google word2vec")
        X_train = sents_3dtensor(train_X_wds, maxlen, w2v)
        X_test = sents_3dtensor(test_X_wds, maxlen, w2v)
    else:
        print('wrong embd_type')
    print('X tensor shape: ', X_train.shape)
    print('Y tensor shape: ', Y_train.shape)
    return (X_train, Y_train, X_test, Y_test, nb_classes)
def load_csvs(traincsv, testcsv, nb_words, maxlen, embd_type):
    train_df = pd.read_csv(traincsv)
    test_df = pd.read_csv(testcsv)
    print(train_df.head())
    train_X = train_df.text.values.tolist()
    test_X = test_df.text.values.tolist()
    # save for w2v embedding
    train_X_wds = train_X
    test_X_wds = test_X
    train_y = train_df.label.values
    test_y = test_df.label.values
    nb_classes = len(np.unique(train_y))
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test = np_utils.to_categorical(test_y, nb_classes)
    # tokenization should be applied on train+test jointly
    n_ta = len(train_X)
    n_ts = len(test_X)
    print('train len vs. test len', n_ta, n_ts)
    textraw = [line.encode('utf-8') for line in train_X + test_X]  # keras needs str
    # keras deals with tokens
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    textseq = token.texts_to_sequences(textraw)
    # stats about the text list
    print('nb_words: ', len(token.word_counts))
    print('mean len: ', np.mean([len(x) for x in textseq]))
    train_X = textseq[0:n_ta]
    test_X = textseq[n_ta:]
    if embd_type == 'self':
        X_train = sequence.pad_sequences(train_X, maxlen=maxlen, padding='post', truncating='post')
        X_test = sequence.pad_sequences(test_X, maxlen=maxlen, padding='post', truncating='post')
    elif embd_type == 'w2v':
        w2v = load_w2v('data/Google_w2v.bin')
        print("loaded Google word2vec")
        X_train = sents_3dtensor(train_X_wds, maxlen, w2v)
        X_test = sents_3dtensor(test_X_wds, maxlen, w2v)
    else:
        print('wrong embd_type')
    print('X tensor shape: ', X_train.shape)
    print('Y tensor shape: ', Y_train.shape)
    return (X_train, Y_train, X_test, Y_test, nb_classes)
def ch_to_index(self, text, tok=None):
    sequences = []
    if tok is None:
        tokenizer = Tokenizer(lower=False, char_level=True)
        all_of_them = [' '.join(z) for z in text]
        tokenizer.fit_on_texts(all_of_them)
    else:
        tokenizer = tok
    for words in text:
        characters = []
        for ch in tokenizer.texts_to_sequences_generator(words):
            characters.append(ch)
        sequences.append(characters)
    return sequences, tokenizer
def load_csvs(traincsv, testcsv, nb_words, maxlen, embd_type, w2v):
    train_df = pd.read_csv(traincsv)
    test_df = pd.read_csv(testcsv)
    print(train_df.head())
    train_X = train_df.text.values.tolist()
    test_X = test_df.text.values.tolist()
    # save for w2v embedding
    train_X_wds = train_X
    test_X_wds = test_X
    train_y = train_df.label.values
    test_y = test_df.label.values
    nb_classes = len(np.unique(train_y))
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test = np_utils.to_categorical(test_y, nb_classes)
    # tokenization should be applied on train+test jointly
    n_ta = len(train_X)
    n_ts = len(test_X)
    print("train len vs. test len", n_ta, n_ts)
    textraw = [line.encode("utf-8") for line in train_X + test_X]  # keras needs str
    # keras deals with tokens
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    textseq = token.texts_to_sequences(textraw)
    # stats about the text list
    print("nb_words: ", len(token.word_counts))
    print("mean len: ", np.mean([len(x) for x in textseq]))
    train_X = textseq[0:n_ta]
    test_X = textseq[n_ta:]
    if embd_type == "self":
        X_train = sequence.pad_sequences(train_X, maxlen=maxlen, padding="post", truncating="post")
        X_test = sequence.pad_sequences(test_X, maxlen=maxlen, padding="post", truncating="post")
    elif embd_type == "w2v":
        X_train = sents_3dtensor(train_X_wds, maxlen, w2v)
        X_test = sents_3dtensor(test_X_wds, maxlen, w2v)
    else:
        print("wrong embd_type")
    print("X tensor shape: ", X_train.shape)
    print("Y tensor shape: ", Y_train.shape)
    return (X_train, Y_train, X_test, Y_test, nb_classes)
def word_freq(lines):
    """
    Return a DataFrame sorted by word frequency in descending order.
    This is the word-frequency counter; it is not actually used --
    the character-frequency function below is used instead.
    """
    # default filter is base_filter(), which is '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    # with that filter, e.g. a-b-c would be treated as the three words
    # a, b, c rather than as one word
    # also note: no upper limit nb_words is set
    token = Tokenizer(filters='')
    # token can only accept str, not unicode
    token.fit_on_texts(map(lambda x: x.encode('utf-8'), lines))
    wc = token.word_counts
    df = pd.DataFrame({'word': map(lambda x: x.decode('utf-8'), wc.keys()),
                       'freq': wc.values()})
    df.sort('freq', ascending=False, inplace=True)
    df['idx'] = np.arange(len(wc))
    return df
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", ngram_range=(1, 1))
vectorizer.fit(datas_word_train)
x_train_tfidf = vectorizer.transform(datas_word_train)
x_dev_tfidf = vectorizer.transform(datas_word_dev)
x_test_tfidf = vectorizer.transform(datas_word_test)
print("x_train_tfidf\t\tshape=(%s, %s)" % (x_train_tfidf.shape[0], x_train_tfidf.shape[1]))
print("x_dev_tfidf\t\tshape=(%s, %s)" % (x_dev_tfidf.shape[0], x_dev_tfidf.shape[1]))
print("x_test_tfidf\t\tshape=(%s, %s)" % (x_test_tfidf.shape[0], x_test_tfidf.shape[1]))
print()

# keras feature extraction
tokenizer = Tokenizer()
tokenizer.fit_on_texts(datas_word_train)

# feature1: count
x_train_count = tokenizer.texts_to_matrix(datas_word_train, mode='count')
x_dev_count = tokenizer.texts_to_matrix(datas_word_dev, mode='count')
x_test_count = tokenizer.texts_to_matrix(datas_word_test, mode='count')
print("x_train_count\t\tshape=(%s, %s)" % (x_train_count.shape[0], x_train_count.shape[1]))
print("x_dev_count\t\tshape=(%s, %s)" % (x_dev_count.shape[0], x_dev_count.shape[1]))
print("x_test_count\t\tshape=(%s, %s)" % (x_test_count.shape[0], x_test_count.shape[1]))
print()

# feature2: binary
x_train_binary = tokenizer.texts_to_matrix(datas_word_train, mode='binary')
x_dev_binary = tokenizer.texts_to_matrix(datas_word_dev, mode='binary')
BATCH_SIZE = 128
NUM_EPOCHS = 20

lines = []
fin = open("../data/alice_in_wonderland.txt", "rb")
for line in fin:
    line = line.strip().decode("ascii", "ignore").encode("utf-8")
    if len(line) == 0:
        continue
    lines.append(line)
fin.close()
sents = nltk.sent_tokenize(" ".join(lines))

tokenizer = Tokenizer(5000)  # use top 5000 words only
tokenizer.fit_on_texts(sents)  # mutates the tokenizer in place (returns None)
vocab_size = len(tokenizer.word_index) + 1

# build (left, center, right) word triples from each sentence
w_lefts, w_centers, w_rights = [], [], []
for sent in sents:
    embedding = one_hot(sent, vocab_size)
    triples = list(nltk.trigrams(embedding))
    w_lefts.extend([x[0] for x in triples])
    w_centers.extend([x[1] for x in triples])
    w_rights.extend([x[2] for x in triples])

ohe = OneHotEncoder(n_values=vocab_size)
Xleft = ohe.fit_transform(np.array(w_lefts).reshape(-1, 1)).todense()
Xright = ohe.fit_transform(np.array(w_rights).reshape(-1, 1)).todense()
X = (Xleft + Xright) / 2.0
Y = ohe.fit_transform(np.array(w_centers).reshape(-1, 1)).todense()
def load_data(debug=False):
    if (os.path.exists(TRAIN_PICKLE) and os.path.exists(TEST_PICKLE)
            and os.path.exists(DEV_PICKLE)):
        with open(TRAIN_PICKLE, 'rb') as fp:
            X_train_1, X_train_2, Y_train = pickle.load(fp)
        with open(TEST_PICKLE, 'rb') as fp:
            X_test_1, X_test_2, Y_test = pickle.load(fp)
        with open(DEV_PICKLE, 'rb') as fp:
            X_dev_1, X_dev_2, Y_dev = pickle.load(fp)
    else:
        x_train_1, x_train_2, y_train = [], [], []
        x_test_1, x_test_2, y_test = [], [], []
        x_dev_1, x_dev_2, y_dev = [], [], []
        with open("snli_1.0_train.jsonl", encoding='utf8') as fp:
            for line in fp:
                try:
                    x_1, x_2, y = _formatting(line)
                    x_train_1.append(x_1)
                    x_train_2.append(x_2)
                    y_train.append(y)
                except KeyError:
                    continue
        with open("snli_1.0_test.jsonl", encoding='utf8') as fp:
            for line in fp:
                try:
                    x_1, x_2, y = _formatting(line)
                    x_test_1.append(x_1)
                    x_test_2.append(x_2)
                    y_test.append(y)
                except KeyError:
                    continue
        with open("snli_1.0_dev.jsonl", encoding='utf8') as fp:
            for line in fp:
                try:
                    x_1, x_2, y = _formatting(line)
                    x_dev_1.append(x_1)
                    x_dev_2.append(x_2)
                    y_dev.append(y)
                except KeyError:
                    continue
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(x_train_1)
        tokenizer.fit_on_texts(x_train_2)
        tokenizer.fit_on_texts(x_test_1)
        tokenizer.fit_on_texts(x_test_2)
        tokenizer.fit_on_texts(x_dev_1)
        tokenizer.fit_on_texts(x_dev_2)
        X_train_1 = tokenizer.texts_to_sequences(x_train_1)
        X_train_2 = tokenizer.texts_to_sequences(x_train_2)
        X_test_1 = tokenizer.texts_to_sequences(x_test_1)
        X_test_2 = tokenizer.texts_to_sequences(x_test_2)
        X_dev_1 = tokenizer.texts_to_sequences(x_dev_1)
        X_dev_2 = tokenizer.texts_to_sequences(x_dev_2)
        MAX_SEQUENCE_LENGTH = max([
            len(seq) for seq in
            X_train_1 + X_train_2 + X_test_1 + X_test_2 + X_dev_1 + X_dev_2
        ])
        # print(X_train_1 + X_train_2 + X_test_1 + X_test_2 + X_dev_1 + X_dev_2)
        MAX_NB_WORDS = len(tokenizer.word_index) + 1
        if debug:
            print("MAX_SEQUENCE_LENGTH: {}".format(MAX_SEQUENCE_LENGTH))
            print("MAX_NB_WORDS: {}".format(MAX_NB_WORDS))
        X_train_1 = pad_sequences(X_train_1, maxlen=MAX_SEQUENCE_LENGTH)
        X_train_2 = pad_sequences(X_train_2, maxlen=MAX_SEQUENCE_LENGTH)
        X_test_1 = pad_sequences(X_test_1, maxlen=MAX_SEQUENCE_LENGTH)
        X_test_2 = pad_sequences(X_test_2, maxlen=MAX_SEQUENCE_LENGTH)
        X_dev_1 = pad_sequences(X_dev_1, maxlen=MAX_SEQUENCE_LENGTH)
        X_dev_2 = pad_sequences(X_dev_2, maxlen=MAX_SEQUENCE_LENGTH)
        Y_train = np_utils.to_categorical(y_train, NB_CLASSES)
        Y_test = np_utils.to_categorical(y_test, NB_CLASSES)
        Y_dev = np_utils.to_categorical(y_dev, NB_CLASSES)
        with open(TRAIN_PICKLE, 'wb') as fp:
            pickle.dump((X_train_1, X_train_2, Y_train), fp)
        with open(TEST_PICKLE, 'wb') as fp:
            pickle.dump((X_test_1, X_test_2, Y_test), fp)
        with open(DEV_PICKLE, 'wb') as fp:
            pickle.dump((X_dev_1, X_dev_2, Y_dev), fp)
        with open(TOKENIZER_PICKLE, 'wb') as fp:
            pickle.dump(tokenizer, fp)
    return (X_train_1, X_train_2, Y_train,
            X_test_1, X_test_2, Y_test,
            X_dev_1, X_dev_2, Y_dev)
text_list = []
count = 1
my_dict = {}
POS_TAG_SIZE = 14
Epoch = [3, 5]
Batch_size = [16, 32]
max_length = 40  # length of longest sentence
seed = 7
Embedding_Dim = 100
NUM_WORDS = 50000

# the file "g" is for reading the labels, whereas "f" has the texts (tweets)
with open("File_name.txt", "r") as f:
    texts = f.readlines()
tokenizer = Tokenizer(NUM_WORDS)
tokenizer.fit_on_texts(texts)  # fit the tokenizer on the texts we will process
sequences = tokenizer.texts_to_sequences(texts)  # here the conversion to tokens happens
word_index = tokenizer.word_index
invert = dict(map(reversed, word_index.items()))

# text data for training with word-embedding features
data = pad_sequences(sequences, maxlen=max_length, padding='post')

with open("POS_Tag.txt", "r") as k, open("POS_Tag.txt", "r") as h:
    for line in k:
        line = line.lower()
        line = line.split()
        text_list.append(line)
        count += 1
    texts_1 = h.readlines()
tokenizer_POS = Tokenizer(POS_TAG_SIZE)
def get_tokenizer(self):
    tokenizer = Tokenizer()
    phoneme_list = get_phoneme_list()
    tokenizer.fit_on_texts(phoneme_list)
    return tokenizer
from keras.preprocessing.text import Tokenizer

text = '나는 맛있는 밥을 먹었다'  # Korean: "I ate delicious rice"

token = Tokenizer()
# splits a single sentence into words and assigns each an index (numericalizes it)
token.fit_on_texts([text])
print(token.word_index)  # {'나는': 1, '맛있는': 2, '밥을': 3, '먹었다': 4}

x = token.texts_to_sequences([text])
print(x)  # [[1, 2, 3, 4]]
# problem: the raw indices make '나는' and '먹었다' look like they carry different weight

from keras.utils import to_categorical

word_size = len(token.word_index) + 1  # add slot [0]
x = to_categorical(x, num_classes=word_size)
print(x)
# [[[0. 1. 0. 0. 0.]    problem: as the vocabulary grows, the number of columns explodes
#   [0. 0. 1. 0. 0.]
#   [0. 0. 0. 1. 0.]
#   [0. 0. 0. 0. 1.]]]
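# The comment above flags the real cost of one-hot: the width grows with the
# vocabulary. A common remedy (my sketch, not part of the original snippet)
# is a trainable Embedding layer that maps each index to a small dense vector.
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding

model = Sequential()
model.add(Embedding(input_dim=word_size, output_dim=8, input_length=4))
model.compile('rmsprop', 'mse')
dense = model.predict(np.array(token.texts_to_sequences([text])))
print(dense.shape)  # (1, 4, 8): four 8-dimensional vectors instead of four one-hot rows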
y_shuffle = y_rt[shuffled_rt]
print('X:', x_shuffle)
print('Y:', y_shuffle)
print(pairwise2.align.globalxx(x_shuffle[0], x_shuffle[1], one_alignment_only=True))
x_train, x_valid, y_train, y_valid = train_test_split(x_shuffle, y_shuffle,
                                                      stratify=y_shuffle,
                                                      test_size=0.2)
print('x shape:', x_train.shape)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(get_vocab('atcgx'))
V = len(tokenizer.word_index) + 1
print('Num Words:', V)
# alignments2vec(x_train, y_train, V, tokenizer)  # uncomment to train word2vec representation
'''
model = Sequential()
'''
'''
model.add(Conv1D(filters=64, kernel_size=word_length, input_shape=(None, word_length)))
model.add(Activation('relu'))
model.add(Conv1D(filters=64, kernel_size=3))
model.add(Activation('relu'))
model.add(MaxPooling1D(3))
model.add(Conv1D(128, 3, activation='relu'))
model.add(Conv1D(128, 3, activation='relu'))
xf = xf.sample(frac=1)
train_data = xf[:900]
test_data = xf[900:]
df = train_data
x = len(df['Department'].unique())

MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 559
EMBEDDING_DIM = 300
EMBEDDING_FILE = "../GoogleNews-vectors-negative300.bin"

## Tokenizing and padding the data
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(df['Description'])
description_sequence = tokenizer.texts_to_sequences(df['Description'])
description_data = pad_sequences(description_sequence, MAX_SEQUENCE_LENGTH)
word_index = tokenizer.word_index

## Encoding the output labels
le = LabelEncoder()
df['target'] = le.fit_transform(df['Department'])
category = to_categorical(df['target'])
data = description_data
VALIDATION_SPLIT = 0.4
indices = np.arange(data.shape[0])  # get sequence of row indices
np.random.shuffle(indices)  # shuffle the row indices
def testEmbeddingLayer20NewsGroup(self):
    """
    Test Keras 'Embedding' layer returned by 'get_embedding_layer' function
    for a smaller version of the 20NewsGroup classification problem.
    """
    MAX_SEQUENCE_LENGTH = 1000

    # Prepare text samples and their labels
    # Processing text dataset
    texts = []  # list of text samples
    texts_w2v = []  # used to train the word embeddings
    labels = []  # list of label ids

    data = fetch_20newsgroups(
        subset='train',
        categories=['alt.atheism', 'comp.graphics', 'sci.space'])
    for index in range(len(data.data)):  # iterate over the documents, not the Bunch fields
        label_id = data.target[index]
        file_data = data.data[index]
        i = file_data.find('\n\n')  # skip header
        if i > 0:
            file_data = file_data[i:]
        try:
            curr_str = str(file_data)
            sentence_list = curr_str.split('\n')
            for sentence in sentence_list:
                sentence = (sentence.strip()).lower()
                texts.append(sentence)
                texts_w2v.append(sentence.split(' '))
                labels.append(label_id)
        except Exception:
            pass

    # Vectorize the text samples into a 2D integer tensor
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    # word_index = tokenizer.word_index
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels = to_categorical(np.asarray(labels))
    x_train = data
    y_train = labels

    # prepare the embedding layer using the wrapper
    keras_w2v = self.model_twenty_ng
    keras_w2v.build_vocab(texts_w2v)
    keras_w2v.train(texts, total_examples=keras_w2v.corpus_count,
                    epochs=keras_w2v.epochs)
    keras_w2v_wv = keras_w2v.wv
    embedding_layer = keras_w2v_wv.get_keras_embedding()

    # create a 1D convnet to solve our classification task
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    x = Conv1D(128, 5, activation='relu')(embedded_sequences)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(35)(x)  # global max pooling
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    preds = Dense(y_train.shape[1], activation='softmax')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])
    fit_ret_val = model.fit(x_train, y_train, epochs=1)

    # verify the type of the object returned after training:
    # the value returned is a `History` instance; its `history` attribute
    # contains all information collected during training.
    self.assertTrue(type(fit_ret_val) == keras.callbacks.History)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()


input_data = pd.read_csv('labeledTrainData.tsv', sep='\t')
for idx in range(input_data.review.shape[0]):
    text = BeautifulSoup(input_data.review[idx], features="html5lib")
    text = clean_str(text.get_text().encode('ascii', 'ignore'))
    texts.append(text)
    sentences = tokenize.sent_tokenize(text)
    reviews.append(sentences)
    labels.append(input_data.sentiment[idx])

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
data = np.zeros((len(texts), max_sentences, maxlen), dtype='int32')
for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j < max_sentences:
            wordTokens = text_to_word_sequence(sent)
            k = 0
            for _, word in enumerate(wordTokens):
                if k < maxlen and tokenizer.word_index[word] < max_words:
                    data[i, j, k] = tokenizer.word_index[word]
                    k = k + 1

word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))
            all_texts[filename] = f.read().replace("\n", "").lower()
    except:
        with codecs.open(path.join(text_path, filename), encoding='latin-1') as f:
            all_texts[filename] = f.read().replace("\n", "").lower()
    return all_texts


all_texts = get_all_texts()
pattern = re.compile('[\W_]+')
all_texts_cleaned = {host: pattern.sub(' ', text) for host, text in all_texts.items()}

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(all_texts_cleaned.values())

try:
    wget.download(
        "http://embeddings.net/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin",
        'frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin')
except:
    pass
embeds = KeyedVectors.load_word2vec_format(
    'frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin', binary=True)

word_index = tokenizer.word_index
embedding_matrix = np.zeros((max_words, d))
for word, i in word_index.items():
    if i >= max_words:
from keras import layers
import matplotlib.pyplot as plt
from keras import optimizers
from keras.preprocessing.text import Tokenizer

amount = 60000
newsData = pd.read_json('../../_data/News_Category_Dataset_v2.json', lines=True)
newsData = newsData.drop(columns=['date', 'link'], axis=1)
newsData = newsData.dropna(how='any')
newsData.category = newsData.category.map(
    lambda x: "WORLDPOST" if x == "THE WORLDPOST" else x)
newsData['text'] = newsData.headline + " " + newsData.short_description

tokenizer = Tokenizer()
tokenizer.fit_on_texts(newsData.text)
newsData['words'] = tokenizer.texts_to_sequences(newsData.text)
# print(newsData.loc[:100, 'words'])


def vectorize_sequences(sequences):
    dimension = 10000
    results = np.zeros((amount, dimension))
    for i in range(amount):
        for k in sequences[i]:
            if k < 10000:
                results[i, k] = 1.
    return results


inputData = vectorize_sequences(newsData.words)
print(inputData[:4, :10])
print(newsData.words[:4])
from keras.callbacks import TensorBoard
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv('Sentiment.csv')
# Keeping only the necessary columns
data = data[['text', 'sentiment']]
data['text'] = data['text'].apply(lambda x: x.lower())
# note: the original pattern was [^a-zA-z0-9\s]; A-z also matches some punctuation
data['text'] = data['text'].apply(lambda x: re.sub('[^a-zA-Z0-9\s]', '', x))
for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ')

max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

embed_dim = 128
lstm_out = 196


def createmodel():
    model = Sequential()
    model.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(3, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
from keras.preprocessing.text import Tokenizer
from sklearn import preprocessing

samples = ['种植 牙 牙周病 治疗 修复 及口 内 治疗',
           '乳腺 肿瘤 及 乳房 整形 领域 的 手术 消化系统 等 常见 肿瘤 的 诊治']
tokenizer = Tokenizer(num_words=20)  # create a tokenizer that keeps only the 20 most common words
tokenizer.fit_on_texts(samples)  # build the word index
sequences = tokenizer.texts_to_sequences(samples)  # turn the strings into lists of integer indices
print(sequences)
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
# texts_to_matrix gives the one-hot binary representation directly; the
# tokenizer also supports vectorization modes other than one-hot
print(one_hot_results.tolist())
word_index = tokenizer.word_index  # the computed word index
print('Found %s unique tokens.' % len(word_index))
            new_prob = new_prob.replace(curr_entities[j], repl_entity)
            if new_prob not in all_problems:
                all_problems.append(new_prob)
                tr_x += [x[i]]
                tr_y += [new_prob]
            if new_prob1 not in all_problems:
                all_problems.append(new_prob1)
                tr_x += [x[i]]
                tr_y += [new_prob]
    return tr_x, tr_y, all_problems


tr_x, tr_y, all_problems = create_data(data)
tokenizer = Tokenizer(nb_words=100, lower=True, split=' ')
tokenizer.fit_on_texts(all_problems)
# print(tokenizer.word_index)  # to see the dictionary
TR_X = tokenizer.texts_to_sequences(tr_x)
TR_X = pad_sequences(TR_X, maxlen=40)
TR_Y = tokenizer.texts_to_sequences(tr_y)
TR_Y = pad_sequences(TR_Y, maxlen=40)
encode_data = tokenizer.texts_to_sequences(data)
encode_data = pad_sequences(encode_data, maxlen=40)
word_index = tokenizer.word_index
embeddings_index = {}
f = open('glove.6B.50d.txt')
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
for file_name in os.listdir(dir_name):
    if file_name[-4:] == '.txt':
        f = open(os.path.join(dir_name, file_name), encoding='utf8')
        texts.append(f.read())
        f.close()
        labels.append(label_type == 'pos')
# endregion

# region tokenizing the text
max_length = 100
training_samples = 2000
validation_samples = 10000
max_words = 10000

tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(texts)
sequences = tok.texts_to_sequences(texts)
word_index = tok.word_index
print('Found %s unique tokens.' % len(word_index))

# zero padding
data = pad_sequences(sequences, maxlen=max_length)
labels = np.asarray(labels)
print('Shape of data tensor : ', data.shape)
print('Shape of label tensor : ', labels.shape)

# shuffle the data/labels
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
df = pd.DataFrame({"Reviews": Train_data_cleaned, "Labels": y_labels})

# 2. Train your network
# Training the word2vec model
word_sentences = [nltk.word_tokenize(sentence) for sentence in df["Reviews"]]
W2v = Word2Vec(word_sentences, size=400, window=10, min_count=10)
embedding_vectors = W2v.wv.vectors

# use Keras preprocessing to turn the reviews into integer sequences
tokens = Tokenizer(num_words=embedding_vectors.shape[0])
tokens.fit_on_texts(df["Reviews"])
pkl.dump(tokens, open("models/tokens.pkl", "wb"))
encoded_docs_train = tokens.texts_to_sequences(df["Reviews"])
max_length = 450
padded_docs = pad_sequences(encoded_docs_train, maxlen=max_length, padding='pre')
y_train = np.array(df["Labels"])

embedding_layer = Embedding(input_dim=embedding_vectors.shape[0],
                            output_dim=embedding_vectors.shape[1],
                            weights=[embedding_vectors],
                            trainable=True,
                            input_length=450)
for sentence in df["comment"]:
    seg_list = jieba.cut(sentence.replace(" ", ""), cut_all=False)
    x_word.write(" ".join(seg_list).encode('utf-8'))
    x_word.write(b'\n')
x_word.close()

x_word = list()
f = open(wordlist_path, "r")
for line in f:
    x_word.append(line[:-1])

MAX_SEQUENCE_LENGTH = 100
tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(x_word)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# prune words that appear fewer than count_thres times
count_thres = 3
low_count_words = [w for w, c in tokenizer.word_counts.items() if c < count_thres]
for w in low_count_words:
    del tokenizer.word_index[w]
    del tokenizer.word_docs[w]
    del tokenizer.word_counts[w]

sequences = tokenizer.texts_to_sequences(x_word)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
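# One caveat with the pruning above (my note, not from the original code):
# deleting entries leaves the surviving word indices non-contiguous, so an
# Embedding layer sized with len(word_index) + 1 would be too small for the
# largest surviving index. A common fix is to reindex the survivors densely
# before calling texts_to_sequences.
tokenizer.word_index = {w: i + 1 for i, w in enumerate(tokenizer.word_index)}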
# built with conv1d
from keras.preprocessing.text import Tokenizer
import numpy as np

docs = ["너무 재밋어요", "참 최고에요", "참 잘 만든 영화에요",
        "추천하고 싶은 영화입니다", "한 번 더 보고 싶네요", "글쎄요",
        "별로에요", "생각보다 지루해요", "연기가 어색해요",
        "재미없어요", "너무 재미없다", "참 재밋네요"]

# positive 1, negative 0
labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1])

# tokenization
token = Tokenizer()
token.fit_on_texts(docs)
print(token.word_index)

x = token.texts_to_sequences(docs)
print('x : ', x)

from keras.preprocessing.sequence import pad_sequences
pad_x = pad_sequences(x, padding='pre')  # e.g. zeros are filled in at the front: 0 0 0 3 7
print('pad_x :', pad_x)  # (12, 5)

pad_x = pad_x.reshape(12, 5, 1)
print('pad_x :', pad_x)

word_size = len(token.word_index) + 1
print("total token size: ", word_size)  # 25, the total number of words
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
batch_size = 64
embedding_dims = 50
epochs = 100

print('Loading data...')
with open("./dialog_seg.pkl", "rb") as f:
    dialog = pickle.load(f)
with open("./label_index_onehot.pkl", "rb") as f:
    label = pickle.load(f)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(dialog)
sequences = tokenizer.texts_to_sequences(dialog)
# print(sequences)
# exit()
x_train, x_test, y_train, y_test = train_test_split(sequences, label,
                                                    test_size=0.3333,
                                                    random_state=42)
# (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(
    np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(
    np.mean(list(map(len, x_test)), dtype=int)))
y = np.array(list(y))

# create an empty list for X and copy all the reviews into it
X = []
reviews = list(data['reviews'])
for review in reviews:
    X.append(review)

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

"""
Text Processing
"""
# tokenize words and convert sentences to integer sequences for the embedding layer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
train_embedded = tokenizer.texts_to_sequences(X_train)
test_embedded = tokenizer.texts_to_sequences(X_test)
# the corpus contains 15163 unique words
vocab_size = len(tokenizer.word_index) + 1

# make all sentences a uniform length: find the longest sentence and use its
# length for both the training and testing datasets
longest_sent = max(X_train, key=lambda sent: len(nltk.word_tokenize(sent)))
len_longest = len(nltk.word_tokenize(longest_sent))

# pad the sentences up to that length
padding_sent = pad_sequences(train_embedded, len_longest, padding="post")
padding_sent_test = pad_sequences(test_embedded, len_longest, padding="post")
raw_text = open(file, 'r').read()
raw_text = [line.strip() for line in raw_text.split('\n')]
raw_text = ' '.join(raw_text)
clean_text = re.sub("[^a-zA-Z]", " ", raw_text)
clean_text = clean_text.lower()
words = clean_text.split()

text_sequences = []
next_word = []
for i in range(0, len(words) - maxlen, step):
    text_sequences.append(' '.join(words[i:i + maxlen]))
    next_word.append(words[i + maxlen])
print('nb sequences:', len(text_sequences))

tokenizer = Tokenizer(lower=True, split=' ')  # original had malower=True, which is not a Tokenizer argument
tokenizer.fit_on_texts(words)
print(tokenizer.word_counts)
print(tokenizer.word_index)
vocab_size = len(tokenizer.word_index) + 1

train_sequences = tokenizer.texts_to_sequences(text_sequences)
X_train = np.array(train_sequences)
target = tokenizer.texts_to_sequences(next_word)
y_train = np_utils.to_categorical(target, vocab_size)


def loadGloveWordEmbeddings(glove_file):
    embedding_vectors = {}
    f = open(glove_file, encoding='utf8')
    for line in f:
        values = line.split()
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.15
batch_size = 100
n_epoch = 1

path = r'C:\Users\zhyzhang\Desktop\News Samples\trainingandtestdata\training.1600000.processed.noemoticon.csv'
df = pd.read_csv(path, index_col=None, header=None, engine='python', encoding=None)
df = df.sample(frac=1).reset_index(drop=True)
news = list(df.iloc[:, 5])

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(news)
sequences = tokenizer.texts_to_sequences(news)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    try:
        embedding_vector = google_model[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        continue
list_classes = ["toxic", "severe_toxic", "obscene", "threat",
                "insult", "identity_hate"]
y = train_df[list_classes].values
list_sentences_test = test_df["comment_text"].fillna("NA").values

comments = []
for text in list_sentences_train:
    comments.append(text_to_wordlist(text))
test_comments = []
for text in list_sentences_test:
    test_comments.append(text_to_wordlist(text))

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(comments + test_comments)
sequences = tokenizer.texts_to_sequences(comments)
test_sequences = tokenizer.texts_to_sequences(test_comments)
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y.shape)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test_data tensor:', test_data.shape)
def process_request(self, entityname: str, model_id: int):
    """
    Predicts intent or outcome for the entity
    :param entityname: Entity name
    :param model_id: Primary key ID of record to be inserted
    :return: Output value based on success or failure
    """
    table = str.maketrans('', '', string.punctuation.replace('&', ''))
    entityname = entityname.translate(table)
    print('Entity being searched : {0}'.format(entityname))
    entityList = []
    entityList.append(entityname)
    entityArr = re.split('\W+', entityname)
    if len(entityArr) == 2:
        entityList.append(f'{entityArr[1]} {entityArr[0]}')
    print(entityList)
    er = EventRegistry(apiKey="f4a005ab-a24f-487e-bff4-f39b1b2ba6c2")
    cq = ComplexArticleQuery(query=CombinedQuery.AND([
        BaseQuery(
            keyword=QueryItems.OR(entityList),
            # sourceLocationUri=er.getLocationUri("United States"),
            lang="eng",
            dateStart=date.today() - timedelta(days=365),
            dateEnd=date.today()),
        BaseQuery(keyword=QueryItems.OR([
            "sanction", "bribery", "laundering", "corruption",
            "blacklist", "crime", "scam", "fraud"]))
        # "drugs","trafficking","gambling","illegal","smuggling","terrorism",
        # "extortion","forgery","tax evasion","SDN","burglary","robbery","murder"
    ]))
    q = QueryArticles.initWithComplexQuery(cq)
    q.setRequestedResult(
        RequestArticlesInfo(page=1,
                            count=self.news_fetch_count,
                            sortBy="date",
                            sortByAsc=False,
                            returnInfo=ReturnInfo()))
    res = er.execQuery(q)
    # sql_db_path = cfg.read_config('sql_db_path')
    # Remove similar redundant news articles
    # article_list = []
    # match_list = []
    # for article1 in res['articles']['results']:
    #     similarity_flag = False
    #     for article2 in res['articles']['results']:
    #         val = SequenceMatcher(a=article1['body'], b=article2['body']).ratio()
    #         match_list.append(val)
    #         if article1 != article2 and val > 0.8:
    #             similarity_flag = True
    #             print(val)
    #     if not similarity_flag:
    #         article_list.append(article1)
    # print(match_list)
    articles = []
    print('Number of articles found: {0}'.format(len(res['articles']['results'])))
    for article in res['articles']['results']:
        content = article["body"]
        # print(content.encode("utf-8"))
        title = article["title"]
        url = article["url"]
        articleDateTime = article["dateTime"].replace('T', ' ', 1).replace('Z', '', 1)
        # Replace the name of the entity with the word ENTITY
        if entityname.lower() in content.lower():
            re_replace = re.compile(re.escape(entityname), re.IGNORECASE)
            content = re_replace.sub('ENTITY', content)
            replaced_entityname = entityname
        if len(entityArr) == 2:
            # also try the reversed word order (fixed: stray quotes in the original f-string)
            entity_temp = f'{entityArr[1]} {entityArr[0]}'
            if entity_temp.lower() in content.lower():
                re_replace = re.compile(re.escape(entity_temp), re.IGNORECASE)
                content = re_replace.sub('ENTITY', content)
                replaced_entityname = entity_temp
        if 'ENTITY' in content:
            lst = [title, content, articleDateTime, url]
            articles.append(lst)
    # obtain at most 10 newest articles or blog posts
    X = []
    print('remaining length : {0}'.format(len(articles)))
    for article in articles:
        content = article[1]
        articleDateTime = article[2]
        url = article[3]
        # print(url)
        # Insert article into database
        training_model = TrainingModel(ArticleText=content,
                                       TrainingDate=articleDateTime,
                                       SearchModel_id=str(model_id),
                                       IsTrained=0,
                                       Url=url)
        training_model.save()
        # tokens = self.unique_list(self.clean_article(content))
        tokens = self.clean_article(content)
        # Slice out at most max_length tokens from the article
        if len(tokens) > self.max_length:
            tokens = tokens[:self.max_length]
        # Grab the sentences within sentence_buffer sentences of any sentence
        # containing the word ENTITY
        sentences = sent_tokenize(" ".join(tokens))
        indices = [idx for idx, sent in enumerate(sentences) if 'ENTITY' in sent]
        # print(indices)
        extended_indices = []
        for i, sentence in enumerate(sentences):
            extended_indices.extend(
                list(set([i for index in indices
                          if abs(index - i) <= self.sentence_buffer])))
        # print(extended_indices)
        # print(len(sentences))
        desired_list = list(itemgetter(*extended_indices)(sentences))
        token_sentence = " ".join(desired_list)
        # print(token_sentence)
        X.append(token_sentence)
    # Before prediction
    K.clear_session()
    if path.exists(f'{self.model_path}trained_model.h5'):
        if len(X) > 0:
            # Load the model
            model = load_model(f'{self.model_path}trained_model.h5')
            # prepare tokenizer
            t = Tokenizer()
            t.fit_on_texts(X)
            # integer encode the documents
            encoded_docs = t.texts_to_sequences(X)
            # print(encoded_docs)
            # pad documents to a max length of words
            # max_length = max([len(word.split()) for word in X])
            padded_docs = pad_sequences(encoded_docs, maxlen=self.max_length, padding='post')
            # print(padded_docs)
            # Predict on the searched articles
            probabilities = model.predict(x=padded_docs, batch_size=5, verbose=2)
            # classes = model.predict_classes(x=padded_docs, batch_size=5, verbose=2)
            classes = binarize(probabilities, 0.6)
            # print(classes)
            print(probabilities)
            prediction = []
            for idx in range(len(classes)):
                prediction.append((int(classes[idx][0]),
                                   round(probabilities[idx][0] * 100, 2)))
            # Sort in descending order of probability
            prediction = sorted(prediction, key=lambda x: x[1], reverse=True)
            print(prediction)
            # After prediction
            K.clear_session()
            # Replace the word ENTITY with the original entity name
            for article in articles:
                re_replace = re.compile(re.escape('ENTITY'), re.IGNORECASE)
                article[1] = re_replace.sub(replaced_entityname, article[1])
            # print(articles)
            return self.OutputParams(True, prediction, articles)
        else:
            return self.OutputParams(False, "", "")
    else:
        raise Exception('Model is not yet ready to predict')
df = pd.read_pickle("combined_data.pkl")
# df = df[:5000]
# df["text"] = df.text.str.join(" ")

# Unpack column by column into a num_reviews-by-num_metacategories matrix again
target_vecs = np.vstack([df["cat_{}".format(i)] for i in range(num_metacategories)]).T

max_words = 2000
max_len = 10
X = df.text
Y = target_vecs
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.40)

tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len)
print(sorted(tok.word_counts.items(), key=lambda x: -x[1])[:max_words])

model = get_model(max_words, max_len)
model.fit(sequences_matrix, Y_train,
          batch_size=512, epochs=15, validation_split=0.2,
          callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)])

test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences, maxlen=max_len)
print("모델이 비속어 처리중...") list_sentences_train = train["comment_text"].fillna( "_na_").values # comment_text만 가져와서 fillna를 통해 nan를 거른다. # Just import comment_text and filter nan through fillna. list_classes = [ "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate" ] # 사용할 컬럼들(to use columns) y = train[list_classes].values # labels of comment_text list_sentences_test = test["comment_text"].fillna( "_na_").values # Do the same things for test_data tokenizer = Tokenizer( num_words=max_features) # max_features 만큼의 단어를 Tokenize하기 위한 틀 생성. # Create a frame to Tokenize words as many as max_features. tokenizer.fit_on_texts(list(list_sentences_train)) # just fit list_tokenized_train = tokenizer.texts_to_sequences( list_sentences_train) # Tokenize(Transform word into number) list_tokenized_test = tokenizer.texts_to_sequences( list_sentences_test) # Tokenize(Transform word into number) X_te = pad_sequences(list_tokenized_test, maxlen=maxlen) # do the same thing from keras.models import load_model model = load_model('toxic_model.h5') # ## Predict result y_test = model.predict([X_te], batch_size=1024, verbose=1) # model에 test data를 넣고 예측
# -*- coding: utf-8 -*-
"""
6.3 - Using Keras for word-level one-hot encoding

@author: migue
"""
from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Creates a tokenizer, configured to only take into account the 1,000 most
# common words
tokenizer = Tokenizer(num_words=1000)
# Builds the word index
tokenizer.fit_on_texts(samples)
# Turns strings into lists of integer indices
sequences = tokenizer.texts_to_sequences(samples)
# You could also directly get the one-hot binary representations.
# Vectorization modes other than one-hot encoding are supported by this
# tokenizer.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
# How you can recover the word index that was computed
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
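# A short follow-on sketch (my addition, not part of the listing): inverting
# the word index turns the integer sequences back into words, which is handy
# for sanity-checking the tokenizer.
index_word = {i: w for w, i in word_index.items()}
decoded = [' '.join(index_word[i] for i in seq) for seq in sequences]
print(decoded)  # e.g. ['the cat sat on the mat', 'the dog ate my homework']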