def LoadSMILESData(duplicateProb=0, seed=7):
    dataComp = dataset.LoadData('data', 0)
    smiles = list(map(lambda x: x._SMILE, dataComp))
    tokenizer = Tokenizer(num_words=None, char_level=True)
    tokenizer.fit_on_texts(smiles)
    print(smiles[0])
    dictionary = {}
    i = 0
    k = 0
    for smile in smiles:
        i += 1
        for c in list(smile):
            k += 1
            if c in dictionary:
                dictionary[c] += 1
            else:
                dictionary[c] = 1
    print(len(dictionary))
    # sequence encode
    encoded_docs = tokenizer.texts_to_sequences(smiles)
    # pad sequences
    max_length = max([len(s) for s in smiles])
    vocab = {'C': 1, 'c': 2, '(': 3, ')': 4, 'O': 5, '=': 6, '1': 7, 'N': 8, '2': 9, '3': 10,
             '[': 11, ']': 12, 'F': 13, '4': 14, 'l': 15, 'n': 16, 'S': 17, '@': 18, 'H': 19,
             '5': 20, '+': 21, '-': 22, 'B': 23, 'r': 24, '\\': 25, '#': 26, '6': 27, '.': 28,
             '/': 29, 's': 30, 'P': 31, '7': 32, 'i': 33, 'o': 34, '8': 35, 'I': 36, 'a': 37,
             '%': 38, '9': 39, '0': 40, 'K': 41, 'e': 42, 'A': 43, 'g': 44, 'p': 45, 'M': 46,
             'T': 47, 'b': 48, 'd': 49, 'V': 50, 'Z': 51, 'G': 52, 'L': 53}
    Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    # define vocabulary size (largest integer value)
    labels = list(map(lambda x: 1 if x.mutagen == True else 0, dataComp))
    return Xtrain, labels, vocab, max_length
def get_data_1(train_sents, maxlen):
    word_list = []
    for i in range(len(train_sents)):
        for words in train_sents[i]:
            word_list.append(words)
    sequence = []
    stride = 1
    # apply windowing for sequence generation
    for i in range(0, len(word_list) - maxlen, stride):
        line = word_list[i:i + maxlen]
        sequence.append(line)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sequence)
    seq = tokenizer.texts_to_sequences(sequence)
    vocab_len = len(tokenizer.word_index.items()) + 1
    seq = np.array(seq)
    x_train = seq[:, :-1]
    y_train = np.zeros((x_train.shape[0], x_train.shape[1], 1))
    for i in range(x_train.shape[0]):
        for j in range(x_train.shape[1]):
            y_train[i, j, 0] = seq[i, j + 1]
    return x_train, y_train, vocab_len, tokenizer
def train(dataReader, oneHot, oneHotAveraged, contextHashes):
    n = (Epochs + 1) * SamplesPerEpoch  # TODO + 1 should not be needed
    tokeniser = Tokenizer(nb_words=MaxWords)
    tokeniser.fit_on_texts((row[0] for row in dataReader.trainingData(n)))
    # `word_index` maps each word to its unique index
    dictionarySize = len(tokeniser.word_index) + 1
    oneHotDimension = (1 if oneHotAveraged else SequenceLength) * dictionarySize if oneHot else 0
    contextHashesDimension = dictionarySize * 2 if contextHashes else 0
    model = Sequential()
    model.add(Dense(EmbeddingDim, input_dim=(oneHotDimension + contextHashesDimension)))
    model.add(Dense(Labels, activation='softmax'))
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    trainingGenerator = mapGenerator(dataReader.trainingData(n), tokeniser, dictionarySize,
                                     oneHot, oneHotAveraged, contextHashes)
    validationGenerator = mapGenerator(dataReader.validationData(n), tokeniser, dictionarySize,
                                       oneHot, oneHotAveraged, contextHashes)
    model.fit_generator(trainingGenerator,
                        nb_epoch=Epochs,
                        samples_per_epoch=SamplesPerEpoch,
                        validation_data=validationGenerator,
                        nb_val_samples=SamplesPerEpoch)
    model2 = Sequential()
    model2.add(Dense(EmbeddingDim, input_dim=(oneHotDimension + contextHashesDimension),
                     weights=model.layers[0].get_weights()))
    return model, model2, tokeniser, dictionarySize
def get_train_val_matrix(texts, labels, max_features=10000, max_len=100):
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(texts)
    sequens = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print(f'Found {len(word_index)} unique tokens')
    data = pad_sequences(sequens, maxlen=max_len)
    labels = np.asarray(labels)
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    train_sample_n = 20000
    validation_sample_n = 5000
    x_train = data[:train_sample_n]
    x_val = data[train_sample_n:validation_sample_n + train_sample_n]
    y_train = labels[:train_sample_n]
    y_val = labels[train_sample_n:validation_sample_n + train_sample_n]
    return (x_train, y_train), (x_val, y_val), word_index
def read_copus_generator(self, file_name, batch_size=64):
    """Read the corpus file and return the vectorized (bag-of-words) data."""
    logger.info("Begin reading corpus {0}".format(file_name))
    data = []
    index = 0
    with open(file_name, 'r') as fread:
        while True:
            line = fread.readline()
            if not line:  # readline() returns '' at EOF instead of raising
                logger.info("Read End")
                break
            data.append(line)
            index += 1
            if index % 100000 == 0:
                logger.info("The program has processed {0} lines".format(index))
    tokenizer = Tokenizer(nb_words=30000)
    tokenizer.fit_on_texts(data)
    logger.info("word num: {0}".format(len(tokenizer.word_counts)))
    sorted_word_counts = sorted(tokenizer.word_counts.items(),
                                key=operator.itemgetter(1), reverse=True)
    # save the word_counts to the meta file
    with open(file_name.replace("train.", "meta."), "w") as fwrite:
        for word_cnt in sorted_word_counts:
            key = word_cnt[0]
            val = word_cnt[1]
            line = key + ":" + str(val) + "\n"
            fwrite.write(line)
    vectorize_data = tokenizer.texts_to_matrix(data)
    return vectorize_data
class SequenceTransformer(BaseEstimator, TransformerMixin):
    """Transforms np array of strings into sequences."""

    def __init__(self, analyzer='word', max_features=10000, max_len=100):
        self.max_len = max_len
        self.analyzer = analyzer
        self.max_features = max_features

    def transform(self, X, y=None):
        try:
            getattr(self, "transformer_")
        except AttributeError:
            raise RuntimeError("You must fit transformer before using it!")
        X_seq = self.transformer_.texts_to_sequences(list(X))
        X_seq = sequence.pad_sequences(X_seq, maxlen=self.max_len)
        return X_seq

    def fit(self, X, y=None):
        if self.analyzer == 'char':
            char_level = True
        elif self.analyzer == 'word':
            char_level = False
        else:
            print("invalid analyzer")
            return
        self.transformer_ = Tokenizer(nb_words=self.max_features, lower=True,
                                      char_level=char_level)
        self.transformer_.fit_on_texts(X)
        return self
def get_fitted_tokenizer(df_train, df_test):
    comments_train = df_train[COMMENT_COL].values.tolist()
    comments_test = df_test[COMMENT_COL].values.tolist()
    tokenizer = Tokenizer()
    # tokenizer.num_words = MAX_NUM_WORDS
    tokenizer.fit_on_texts(comments_train + comments_test)
    return tokenizer
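# Quick, self-contained check of get_fitted_tokenizer above. The tiny DataFrames and
# the COMMENT_COL value are illustrative assumptions, not data from the original project.
import pandas as pd
COMMENT_COL = 'comment_text'  # assumed column name
_df_tr = pd.DataFrame({COMMENT_COL: ['good movie', 'bad movie']})
_df_te = pd.DataFrame({COMMENT_COL: ['great plot']})
_tok = get_fitted_tokenizer(_df_tr, _df_te)
print(_tok.texts_to_sequences(['good plot']))  # integer ids for the two known words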
def prepare_tokenizer(words):
    '''
    function to generate vocabulary of the given list of words
    implemented by Anindya
    @param words => the list of words to be tokenized
    '''
    # obtain a tokenizer
    t = Tokenizer(filters='')  # don't let keras ignore any words
    t.fit_on_texts(words)
    field_dict = dict()
    rev_field_dict = dict()
    for key, value in t.word_index.items():
        field_dict[value] = key
        rev_field_dict[key] = value
    vocab_size = len(t.word_index) + 1
    '''
    Small modification from Animesh
    # also add the '<unk>' token to the dictionary at 0th position
    '''
    field_dict[0] = '<unk>'
    rev_field_dict['<unk>'] = 0
    # print(vocab_size)
    # integer encode the documents
    encoded_docs = t.texts_to_sequences(words)
    # print("debug: " + str(encoded_docs))
    # print(padded_docs)
    return np.array(encoded_docs), field_dict, rev_field_dict, vocab_size
def mlp_model(X_train, y_train, X_test, y_test):
    tokenizer = Tokenizer(nb_words=1000)
    nb_classes = np.max(y_train) + 1
    X_train = tokenizer.sequences_to_matrix(X_train, mode="freq")
    X_test = tokenizer.sequences_to_matrix(X_test, mode="freq")
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)
    print("Building model...")
    model = Sequential()
    model.add(Dense(512, input_shape=(max_len,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  class_mode='categorical')
    history = model.fit(X_train, Y_train, nb_epoch=nb_epoch, batch_size=batch_size,
                        verbose=1, show_accuracy=True, validation_split=0.1)
    model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=1, show_accuracy=True)
    # print('Test score:', score[0])
    # print('Test accuracy:', score[1])
    pred_labels = model.predict_classes(X_test)
    # print(pred_labels)
    # print(y_test)
    accuracy = accuracy_score(y_test, pred_labels)
    precision, recall, f1, supp = precision_recall_fscore_support(y_test, pred_labels,
                                                                  average='weighted')
    print(precision, recall, f1, supp)
    return accuracy, precision, recall, f1
def test_tokenizer_unicode():
    texts = [u'ali veli kırk dokuz elli',
             u'ali veli kırk dokuz elli veli kırk dokuz']
    tokenizer = Tokenizer(num_words=5)
    tokenizer.fit_on_texts(texts)
    assert len(tokenizer.word_counts) == 5
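# Side note (added): num_words does not cap word_counts -- fit_on_texts always counts every
# token; num_words only limits texts_to_sequences / texts_to_matrix output. The assertion
# above holds because this corpus has exactly five distinct tokens. Illustrative check:
_t = Tokenizer(num_words=5)
_t.fit_on_texts([u'a b c d e f g'])
assert len(_t.word_counts) == 7                               # all tokens are counted
assert max(_t.texts_to_sequences([u'a b c d e f g'])[0]) < 5  # ids >= num_words are dropped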
class Featurizer:
    max_words = None
    tokenizer = None

    def __init__(self, max_words=1000):
        self.max_words = max_words
        self.tokenizer = Tokenizer(num_words=max_words)

    def fit_transform(self, data):
        texts = [l['text'] for l in data]
        self.tokenizer.fit_on_texts(texts)
        # remove words that cross the max_words limit
        self.tokenizer.word_index = {k: v for k, v in self.tokenizer.word_index.items()
                                     if v <= self.max_words}
        return self.transform(data)

    def transform(self, data):
        texts = [l['text'] for l in data]
        return self.tokenizer.texts_to_matrix(texts, mode='binary')

    def transform_inv(self, m):
        index = {v: k for k, v in self.tokenizer.word_index.items()}  # word index by id
        return [[index.get(i) for i in np.nonzero(line)[0] if i in index] for line in m]

    def save(self, filepath):
        with open(filepath + '_word_index.json', 'w') as f:
            f.write(json.dumps(self.tokenizer.word_index))

    @classmethod
    def load(cls, filepath):
        with open(filepath + '_word_index.json', 'r') as f:
            word_index = json.load(f)
        c = cls(max_words=len(word_index))
        c.tokenizer.word_index = word_index
        return c
def preprocess_embedding():
    corpus_train, target, filenames = get_corpus()
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus_train)
    sequences = tokenizer.texts_to_sequences(corpus_train)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    MAX_SEQUENCE_LENGTH = 50
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
        '/home/flippped/Desktop/xiangmu/baseline/GoogleNews-vectors-negative300.bin.gz',
        binary=True)
    word2vec_model.init_sims(replace=True)
    # create one matrix for the document words
    EMBEDDING_DIM = 300
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    print(embedding_matrix.shape)
    for word, i in word_index.items():
        try:
            embedding_vector = word2vec_model[str(word)]
            if embedding_vector is not None:
                # words not found in the embedding index will be all-zeros
                embedding_matrix[i] = embedding_vector
        except KeyError:
            continue
    return data, target, filenames, embedding_matrix, word_index
def get_fitted_tokenizer(df_train, df_test):
    comments_train = df_train[COMMENT_COL].values.tolist()
    comments_test = df_test[COMMENT_COL].values.tolist()
    # keep '!' and '?' (they are left out of the filter list)
    tokenizer = Tokenizer(filters='"#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n')
    tokenizer.fit_on_texts(comments_train + comments_test)
    return tokenizer
def preproc_for_sklearn(X, y, nb_features):
    try:
        tokenizer = Tokenizer(num_words=nb_features)
    except TypeError:
        # older Keras versions used `nb_words` instead of `num_words`
        tokenizer = Tokenizer(nb_words=nb_features)
    X = tokenizer.sequences_to_matrix(X, mode='binary')
    return X, y
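# Minimal, self-contained usage sketch for preproc_for_sklearn; the toy integer
# sequences below are illustrative, not from the original code.
_X = [[1, 2, 3], [2, 4]]   # already integer-encoded documents
_y = [0, 1]
_X_bow, _y_out = preproc_for_sklearn(_X, _y, nb_features=5)
print(_X_bow.shape)        # (2, 5) binary bag-of-words matrix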
def tokenaize(train_path, dev_path):
    with open(train_path) as fd:
        data = fd.read()
    with open(dev_path) as fd:
        data += fd.read()
    tokenizer = Tokenizer(split='\t', oov_token='<UNK>')
    tokenizer.fit_on_texts([data])
    return tokenizer
def tokenizeAndGenerateIndex(texts):
    tokenizer = Tokenizer(nb_words=vocab_size)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=maxlen, padding='post')
    return data
def handle(self, *args, **options):
    ptt = PTT.objects.all()
    ptt_json = PTTSerializer(ptt, many=True).data
    user_comments_times = dict()
    labels_index = 2
    labels = []
    texts = []
    for article in ptt_json:
        pointer = 1 if article['score'] > 0 else 0
        words = jieba.cut(article['contents'])
        for word in words:
            labels.append(pointer)
            texts.append(word)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    data = pad_sequences(sequences, maxlen=self.MAX_SEQUENCE_LENGTH)
    labels = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)
    print('Token word index:', tokenizer.word_index)
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    nb_validation_samples = int(self.VALIDATION_SPLIT * data.shape[0])
    x_train = data[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labels[-nb_validation_samples:]
    print('Training model.')
    # train a 1D convnet with global maxpooling
    sequence_input = Input(shape=(self.MAX_SEQUENCE_LENGTH,), dtype='int32')
    x = Embedding(output_dim=100, input_dim=len(tokenizer.word_index),
                  input_length=self.MAX_SEQUENCE_LENGTH)(sequence_input)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(35)(x)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    preds = Dense(labels_index, activation='softmax')(x)
    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])
    # happy learning!
    model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=2, batch_size=64)
    score = model.evaluate(x_val, y_val, verbose=0)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])
def question_to_input(df_q1, df_q2):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df_q1 + df_q2)
    encoded_1 = tokenizer.texts_to_sequences(df_q1)
    encoded_2 = tokenizer.texts_to_sequences(df_q2)
    question_input_train = sequence.pad_sequences(encoded_1, maxlen=15)
    question_input_test = sequence.pad_sequences(encoded_2, maxlen=15)
    return question_input_train, question_input_test
def fit_tokenizer(fname, open_encoding='utf-8'):
    file = open(fname, 'r', encoding=open_encoding)
    text = file.read()
    file.close()
    texts = [text]
    # do not filter out low-frequency words
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    return tokenizer
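# Self-contained usage sketch for fit_tokenizer; the temporary file is purely
# illustrative and not part of the original snippet.
import tempfile
with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False, encoding='utf-8') as _f:
    _f.write('the quick brown fox jumps over the lazy dog')
_tok = fit_tokenizer(_f.name)
print(_tok.texts_to_sequences(['the lazy fox']))  # integer ids for the three words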
def get_tokenizer(train_comments, nwords):
    print("getting tokenizer..")
    t = Tokenizer(num_words=nwords)
    texts = train_comments
    t.fit_on_texts(texts)
    sequences = t.texts_to_sequences(texts)
    return (t, sequences)
def tokenize(texts, texts_train, texts_test):
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    sequences_train = tokenizer.texts_to_sequences(texts_train)
    sequences_test = tokenizer.texts_to_sequences(texts_test)
    return word_index, sequences_train, sequences_test
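# Minimal usage sketch for tokenize() above; MAX_NUM_WORDS is assumed to be a
# module-level constant and the toy texts are illustrative only.
MAX_NUM_WORDS = 20000  # assumed constant
_all_texts = ['good film', 'bad film', 'great plot']
_wi, _seq_tr, _seq_te = tokenize(_all_texts, _all_texts[:2], _all_texts[2:])
print(len(_wi), _seq_tr, _seq_te)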
def keras_classify(df):
    # Preprocessing: convert the words in `text` into integer ids
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing import sequence
    from keras.callbacks import EarlyStopping
    from sklearn.cross_validation import train_test_split

    print "----- Classification by Keras -----"
    max_features = 50000  # keep only the most frequent words
    # Tokenizer can only handle str, not unicode
    textraw = map(lambda x: x.encode('utf-8'), df.seg_word.values.tolist())
    token = Tokenizer(nb_words=max_features)
    # df.seg_word is already space-separated, so the Tokenizer can split it like English text
    token.fit_on_texts(textraw)
    # token records each word's id and count; replace the word text in textraw with word ids,
    # e.g. textraw = ['a b c', 'c d e f'] ==> text_seq = [[1, 2, 3], [3, 4, 5, 6]]
    text_seq = token.texts_to_sequences(textraw)
    nb_classes = len(np.unique(df.label.values))
    print "num of features(vocabulary): ", len(token.word_counts)
    print "num of labels: ", nb_classes
    max_sent_len = np.max([len(s) for s in text_seq])
    print "max length of document is: ", max_sent_len
    median_sent_len = np.median([len(s) for s in text_seq])
    print "median length of document is: ", median_sent_len
    # `.values` on df.label must not be omitted, otherwise np_utils.to_categorical fails later
    train_X, test_X, train_y, test_y = train_test_split(text_seq, df.label.values,
                                                         train_size=0.7, random_state=1)
    # train_X & test_X rows (one document each) still differ in length; pad them into
    # equal-length matrices before training
    seqlen = int(max_sent_len / 2 + median_sent_len / 2)
    X_train = sequence.pad_sequences(train_X, maxlen=seqlen, padding='post', truncating='post')
    X_test = sequence.pad_sequences(test_X, maxlen=seqlen, padding='post', truncating='post')
    # Expand y into one-hot form so that softmax can be used at the end of the network
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test = np_utils.to_categorical(test_y, nb_classes)

    model = build_cnn_model(max_features, seqlen, nb_classes)
    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
    # Train for 10 epochs with mini-batches of 32; earlystop decides when training is done
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=10,
              validation_split=0.1, callbacks=[earlystop])
    evaluate(earlystop.model, X_test, Y_test, test_y)

    model = build_lstm_model(max_features, seqlen, nb_classes)
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=1, validation_split=0.1)
    evaluate(model, X_test, Y_test, test_y)

    model = build_mixed_model(max_features, seqlen, nb_classes)
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=1, validation_split=0.1)
    evaluate(model, X_test, Y_test, test_y)

    graph = build_graph_model(max_features, seqlen, nb_classes)
    graph.fit({'input': X_train, 'output': Y_train}, nb_epoch=3, batch_size=32,
              validation_split=0.1)
    predict = graph.predict({'input': X_test}, batch_size=32)
    predict = predict['output']
    classes = predict.argmax(axis=1)
    acc = np_utils.accuracy(classes, test_y)
    print('Test accuracy: ', acc)
def df2seq(df, nb_words):
    textraw = df.EssayText.values.tolist()
    textraw = [line.encode('utf-8') for line in textraw]  # keras needs str
    # keras deals with tokens
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    text_seq = token.texts_to_sequences(textraw)
    return (text_seq, df.Score1.values)
def tokenizeAndGenerateIndex(train, test, maxFeatures, maxLength):
    merged = np.concatenate([train, test])
    tokenizer = Tokenizer(nb_words=maxFeatures)
    tokenizer.fit_on_texts(merged)
    sequences_train = tokenizer.texts_to_sequences(train)
    sequences_test = tokenizer.texts_to_sequences(test)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data_train = pad_sequences(sequences_train, maxlen=maxLength)
    data_test = pad_sequences(sequences_test, maxlen=maxLength)
    return data_train, data_test, word_index
def word_to_index(self, text, tok=None):
    real_text = [' '.join(z) for z in text]
    if tok is None:
        tokenizer = Tokenizer(lower=False, filters=" ")
        tokenizer.fit_on_texts(real_text)
    else:
        tokenizer = tok
    # no loop is needed here; just pass the list of sentences (str) as input
    sequences = tokenizer.texts_to_sequences(real_text)
    # tokenizer.word_docs.items()
    return sequences, tokenizer
def prepare_tokenizer(words, max_word_length=None):
    '''
    function to generate vocabulary of the given list of words
    implemented by Anindya
    @param words => the list of words to be tokenized
    '''
    # flatten the words list:
    print("flattening the words into a single sequence ... ")
    flat_words = []  # initialize to empty list
    for i in range(len(words)):
        flat_words += words[i]
        if i % 10000 == 0:
            print("joined", i, "examples")
    # obtain a tokenizer
    print("\nmaximum words to work with: ", max_word_length)
    t = Tokenizer(num_words=max_word_length, filters='')  # don't let keras ignore any words
    print("\nKeras's tokenizer kicks off ... ")
    t.fit_on_texts(flat_words)
    field_dict = dict()
    rev_field_dict = dict()
    print("\nbuilding the dict and the rev_dict ... ")
    if max_word_length is not None:
        vals = t.word_index.items()
        vals = sorted(vals, key=lambda x: x[1])
        for key, value in vals[:max_word_length - 1]:
            field_dict[value] = key
            rev_field_dict[key] = value
    else:
        for key, value in t.word_index.items():
            field_dict[value] = key
            rev_field_dict[key] = value
    '''
    Small modification from Animesh
    # also add the '<unk>' token to the dictionary at 0th position
    '''
    field_dict[0] = '<unk>'
    rev_field_dict['<unk>'] = 0
    print("\nencoding the words using the dictionary ... ")
    for i in range(len(words)):
        for j in range(len(words[i])):
            if words[i][j] in rev_field_dict:
                words[i][j] = rev_field_dict[words[i][j]]
            else:
                words[i][j] = rev_field_dict['<unk>']
        if i % 10000 == 0:
            print("encoded", i, "examples")
    vocab_size = len(field_dict)
    return words, field_dict, rev_field_dict, vocab_size
def save_tokenizer(question1, question2):
    questions = question1 + question2
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(questions)
    word_index = tokenizer.word_index
    print("Words in index: %d" % len(word_index))
    # save tokenizer
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return word_index
def load_mr(nb_words=20000, maxlen=64, embd_type='self'):
    """
    :param embd_type: self vs. w2v
    :return:
    """
    train_size = 0.8
    df = pickled2df('data/mr.p')
    print(df.head())
    train_X, test_X, train_y, test_y = train_test_split(df.text.values.tolist(),
                                                        df.label.values,
                                                        train_size=train_size,
                                                        random_state=1)
    train_X_wds = train_X
    test_X_wds = test_X
    nb_classes = len(np.unique(train_y))
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test = np_utils.to_categorical(test_y, nb_classes)
    # tokenizing should be applied on train+test jointly
    n_ta = len(train_X)
    n_ts = len(test_X)
    print('train len vs. test len', n_ta, n_ts)
    textraw = [line.encode('utf-8') for line in train_X + test_X]  # keras needs str
    # keras deals with tokens
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    textseq = token.texts_to_sequences(textraw)
    # stats about the text list
    print('nb_words: ', len(token.word_counts))
    print('mean len: ', np.mean([len(x) for x in textseq]))
    train_X = textseq[0:n_ta]
    test_X = textseq[n_ta:]
    if embd_type == 'self':
        X_train = xcol_nninput_embd(train_X, nb_words, maxlen)
        X_test = xcol_nninput_embd(test_X, nb_words, maxlen)
    elif embd_type == 'w2v':
        w2v = load_w2v('data/Google_w2v.bin')
        print("loaded Google word2vec")
        X_train = sents_3dtensor(train_X_wds, maxlen, w2v)
        X_test = sents_3dtensor(test_X_wds, maxlen, w2v)
    else:
        print('wrong embd_type')
    print('X tensor shape: ', X_train.shape)
    print('Y tensor shape: ', Y_train.shape)
    return (X_train, Y_train, X_test, Y_test, nb_classes)
def NNclassify(X_train, X_test, y_train, y_test, inputtype):
    classtype = "gender"
    max_words = 10000
    batch_size = 32
    nb_epoch = 20
    if inputtype == 'categorical':
        nb_epoch = 10
        classtype = "age"
    print('Loading data...')
    print(len(X_train), 'train instances')
    print(len(X_test), 'test instances')
    nb_classes = np.max(y_train) + 1
    print(nb_classes, 'classes')
    print('Vectorizing sequence data...')
    tokenizer = Tokenizer(nb_words=max_words)
    X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
    X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)
    print('Y_train shape:', Y_train.shape)
    print('Y_test shape:', Y_test.shape)
    print('Building model...')
    model = Sequential()
    model.add(MaxoutDense(100, input_shape=(max_words,)))
    model.add(Dropout(0.7))
    model.add(Dense(nb_classes, init='uniform'))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', class_mode=inputtype)
    history = model.fit(X_train, Y_train, nb_epoch=nb_epoch, batch_size=batch_size,
                        verbose=1, show_accuracy=True, validation_split=0.1)
    score = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=1,
                           show_accuracy=True)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])
    prediction = model.predict(X_test, batch_size=batch_size, verbose=1)
    pred_classes = np.argmax(prediction, axis=1)
    print(Counter(pred_classes))
    results = open('results.txt', 'a')
    results.write("{} \t {} features \t {} epochs \t {} batch size \t {} accuracy \n".format(
        classtype, max_words, nb_epoch, batch_size, score[1]))
    results.close()
    return pred_classes
def load_csvs(traincsv, testcsv, nb_words, maxlen, embd_type):
    train_df = pd.read_csv(traincsv)
    test_df = pd.read_csv(testcsv)
    print(train_df.head())
    train_X = train_df.text.values.tolist()
    test_X = test_df.text.values.tolist()
    # save for w2v embd
    train_X_wds = train_X
    test_X_wds = test_X
    train_y = train_df.label.values
    test_y = test_df.label.values
    nb_classes = len(np.unique(train_y))
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test = np_utils.to_categorical(test_y, nb_classes)
    # tokenizing should be applied on train+test jointly
    n_ta = len(train_X)
    n_ts = len(test_X)
    print('train len vs. test len', n_ta, n_ts)
    textraw = [line.encode('utf-8') for line in train_X + test_X]  # keras needs str
    # keras deals with tokens
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    textseq = token.texts_to_sequences(textraw)
    # stats about the text list
    print('nb_words: ', len(token.word_counts))
    print('mean len: ', np.mean([len(x) for x in textseq]))
    train_X = textseq[0:n_ta]
    test_X = textseq[n_ta:]
    if embd_type == 'self':
        X_train = sequence.pad_sequences(train_X, maxlen=maxlen, padding='post',
                                         truncating='post')
        X_test = sequence.pad_sequences(test_X, maxlen=maxlen, padding='post',
                                        truncating='post')
    elif embd_type == 'w2v':
        w2v = load_w2v('data/Google_w2v.bin')
        print("loaded Google word2vec")
        X_train = sents_3dtensor(train_X_wds, maxlen, w2v)
        X_test = sents_3dtensor(test_X_wds, maxlen, w2v)
    else:
        print('wrong embd_type')
    print('X tensor shape: ', X_train.shape)
    print('Y tensor shape: ', Y_train.shape)
    return (X_train, Y_train, X_test, Y_test, nb_classes)
def evaluate(model):
    print("Processing", QUESTION_PAIRS_FILE)
    question1 = []
    question2 = []
    is_duplicate = []
    with open(QUESTION_PAIRS_FILE, encoding='utf-8') as jsondata:
        file = json.load(jsondata)
        for row in file:
            if row['is_duplicate'] != 0 and row['is_duplicate'] != 1:
                pass
            else:
                question1.append(row['question1'])
                question2.append(row['question2'])
                is_duplicate.append(row['is_duplicate'])
    print('Question pairs: %d' % len(question1))
    questions = question1 + question2
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(questions)
    question1_word_sequences = tokenizer.texts_to_sequences(question1)
    question2_word_sequences = tokenizer.texts_to_sequences(question2)
    q1_data = pad_sequences(question1_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    q2_data = pad_sequences(question2_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    X = np.stack((q1_data, q2_data), axis=1)
    y = np.array(is_duplicate, dtype=int)
    Q1_test = X[:, 0]
    Q2_test = X[:, 1]
    results = model.predict([Q1_test, Q2_test], batch_size=32, verbose=0)
    # loss, accuracy = model.evaluate([Q1_test, Q2_test], y, verbose=0)
    # print('Test loss = {0:.4f}, test accuracy = {1:.4f}'.format(loss, accuracy))
    print("Finishing predict")
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for i in range(len(is_duplicate)):
        if i % 10000 == 0:
            print(i)
        if round(results[i][0]) == 1 and is_duplicate[i] == 1:
            TP += 1
        elif round(results[i][0]) == 0 and is_duplicate[i] == 0:
            TN += 1
        elif round(results[i][0]) == 0 and is_duplicate[i] == 1:
            FN += 1
        elif round(results[i][0]) == 1 and is_duplicate[i] == 0:
            FP += 1
    N = len(is_duplicate)
    accuracy = (TP + TN) / N
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    f1 = (2 * precision * recall) / (precision + recall)
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 score: ", f1)
class textgenrnn: META_TOKEN = '<s>' config = { 'rnn_layers': 2, 'rnn_size': 128, 'rnn_bidirectional': False, 'max_length': 40, 'max_words': 10000, 'dim_embeddings': 100, 'word_level': False, 'single_text': False } default_config = config.copy() def __init__(self, weights_path=None, vocab_path=None, config_path=None, name="textgenrnn"): if weights_path is None: weights_path = resource_filename(__name__, 'textgenrnn_weights.hdf5') if vocab_path is None: vocab_path = resource_filename(__name__, 'textgenrnn_vocab.json') if config_path is not None: with open(config_path, 'r', encoding='utf8', errors='ignore') as json_file: self.config = json.load(json_file) self.config.update({'name': name}) self.default_config.update({'name': name}) with open(vocab_path, 'r', encoding='utf8', errors='ignore') as json_file: self.vocab = json.load(json_file) self.tokenizer = Tokenizer(filters='', lower=False, char_level=True) self.tokenizer.word_index = self.vocab self.num_classes = len(self.vocab) + 1 self.model = textgenrnn_model(self.num_classes, cfg=self.config, weights_path=weights_path) self.indices_char = dict((self.vocab[c], c) for c in self.vocab) def generate(self, n=1, return_as_list=False, prefix=None, temperature=0.5, max_gen_length=300, interactive=False, top_n=3): gen_texts = [] for _ in range(n): gen_text = textgenrnn_generate( self.model, self.vocab, self.indices_char, prefix, temperature, self.config['max_length'], self.META_TOKEN, self.config['word_level'], self.config.get('single_text', False), max_gen_length, interactive, top_n) if not return_as_list: print("{}\n".format(gen_text)) gen_texts.append(gen_text) if return_as_list: return gen_texts def generate_samples(self, n=3, temperatures=[0.2, 0.5, 1.0], **kwargs): for temperature in temperatures: print('#' * 20 + '\nTemperature: {}\n'.format(temperature) + '#' * 20) self.generate(n, temperature=temperature, **kwargs) def train_on_texts(self, texts, context_labels=None, batch_size=128, num_epochs=50, verbose=1, new_model=False, gen_epochs=1, train_size=1.0, max_gen_length=300, validation=True, dropout=0.0, via_new_model=False, save_epochs=0, multi_gpu=False, **kwargs): if new_model and not via_new_model: self.train_new_model(texts, context_labels=context_labels, num_epochs=num_epochs, gen_epochs=gen_epochs, batch_size=batch_size, dropout=dropout, validation=validation, save_epochs=save_epochs, multi_gpu=multi_gpu, **kwargs) return if context_labels: context_labels = LabelBinarizer().fit_transform(context_labels) if 'prop_keep' in kwargs: train_size = prop_keep if self.config['word_level']: texts = [text_to_word_sequence(text, filters='') for text in texts] # calculate all combinations of text indices + token indices indices_list = [ np.meshgrid(np.array(i), np.arange(len(text) + 1)) for i, text in enumerate(texts) ] indices_list = np.block(indices_list) # If a single text, there will be 2 extra indices, so remove them # Also remove first sequences which use padding if self.config['single_text']: indices_list = indices_list[self.config['max_length']:-2, :] indices_mask = np.random.rand(indices_list.shape[0]) < train_size if multi_gpu: num_gpus = len(K.tensorflow_backend._get_available_gpus()) batch_size = batch_size * num_gpus gen_val = None val_steps = None if train_size < 1.0 and validation: indices_list_val = indices_list[~indices_mask, :] gen_val = generate_sequences_from_texts(texts, indices_list_val, self, context_labels, batch_size) val_steps = max( int(np.floor(indices_list_val.shape[0] / batch_size)), 1) indices_list = 
indices_list[indices_mask, :] num_tokens = indices_list.shape[0] assert num_tokens >= batch_size, "Fewer tokens than batch_size." level = 'word' if self.config['word_level'] else 'character' print("Training on {:,} {} sequences.".format(num_tokens, level)) steps_per_epoch = max(int(np.floor(num_tokens / batch_size)), 1) gen = generate_sequences_from_texts(texts, indices_list, self, context_labels, batch_size) base_lr = 4e-3 # scheduler function must be defined inline. def lr_linear_decay(epoch): return (base_lr * (1 - (epoch / num_epochs))) if context_labels is not None: if new_model: weights_path = None else: weights_path = "{}_weights.hdf5".format(self.config['name']) self.save(weights_path) self.model = textgenrnn_model(self.num_classes, dropout=dropout, cfg=self.config, context_size=context_labels.shape[1], weights_path=weights_path) model_t = self.model if multi_gpu: # Do not locate model/merge on CPU since sample sizes are small. parallel_model = multi_gpu_model(self.model, gpus=num_gpus, cpu_merge=False) parallel_model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=4e-3, rho=0.99)) model_t = parallel_model print("Training on {} GPUs.".format(num_gpus)) model_t.fit_generator(gen, steps_per_epoch=steps_per_epoch, epochs=num_epochs, callbacks=[ LearningRateScheduler(lr_linear_decay), generate_after_epoch(self, gen_epochs, max_gen_length), save_model_weights(self, num_epochs, save_epochs) ], verbose=verbose, max_queue_size=10, validation_data=gen_val, validation_steps=val_steps) # Keep the text-only version of the model if using context labels if context_labels is not None: self.model = Model(inputs=self.model.input[0], outputs=self.model.output[1]) def train_new_model(self, texts, context_labels=None, num_epochs=50, gen_epochs=1, batch_size=128, dropout=0.0, validation=True, save_epochs=0, multi_gpu=False, **kwargs): self.config = self.default_config.copy() self.config.update(**kwargs) print("Training new model w/ {}-layer, {}-cell {}LSTMs".format( self.config['rnn_layers'], self.config['rnn_size'], 'Bidirectional ' if self.config['rnn_bidirectional'] else '')) # If training word level, must add spaces around each punctuation. 
# https://stackoverflow.com/a/3645946/9314418 if self.config['word_level']: punct = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\\n\\t\'‘’“”’–—' for i in range(len(texts)): texts[i] = re.sub('([{}])'.format(punct), r' \1 ', texts[i]) texts[i] = re.sub(' {2,}', ' ', texts[i]) # Create text vocabulary for new texts # if word-level, lowercase; if char-level, uppercase self.tokenizer = Tokenizer(filters='', lower=self.config['word_level'], char_level=(not self.config['word_level'])) self.tokenizer.fit_on_texts(texts) # Limit vocab to max_words max_words = self.config['max_words'] self.tokenizer.word_index = { k: v for (k, v) in self.tokenizer.word_index.items() if v <= max_words } if not self.config.get('single_text', False): self.tokenizer.word_index[self.META_TOKEN] = len( self.tokenizer.word_index) + 1 self.vocab = self.tokenizer.word_index self.num_classes = len(self.vocab) + 1 self.indices_char = dict((self.vocab[c], c) for c in self.vocab) # Create a new, blank model w/ given params self.model = textgenrnn_model(self.num_classes, dropout=dropout, cfg=self.config) # Save the files needed to recreate the model with open('{}_vocab.json'.format(self.config['name']), 'w', encoding='utf8') as outfile: json.dump(self.tokenizer.word_index, outfile, ensure_ascii=False) with open('{}_config.json'.format(self.config['name']), 'w', encoding='utf8') as outfile: json.dump(self.config, outfile, ensure_ascii=False) self.train_on_texts(texts, new_model=True, via_new_model=True, context_labels=context_labels, num_epochs=num_epochs, gen_epochs=gen_epochs, batch_size=batch_size, dropout=dropout, validation=validation, save_epochs=save_epochs, multi_gpu=multi_gpu, **kwargs) def save(self, weights_path="textgenrnn_weights_saved.hdf5"): self.model.save_weights(weights_path) def load(self, weights_path): self.model = textgenrnn_model(self.num_classes, cfg=self.config, weights_path=weights_path) def reset(self): self.config = self.default_config.copy() self.__init__(name=self.config['name']) def train_from_file(self, file_path, header=True, delim="\n", new_model=False, context=None, is_csv=False, **kwargs): context_labels = None if context: texts, context_labels = textgenrnn_texts_from_file_context( file_path) else: texts = textgenrnn_texts_from_file(file_path, header, delim, is_csv) print("{:,} texts collected.".format(len(texts))) if new_model: self.train_new_model(texts, context_labels=context_labels, **kwargs) else: self.train_on_texts(texts, context_labels=context_labels, **kwargs) def train_from_largetext_file(self, file_path, new_model=True, **kwargs): with open(file_path, 'r', encoding='utf8', errors='ignore') as f: texts = [f.read()] if new_model: self.train_new_model(texts, single_text=True, **kwargs) else: self.train_on_texts(texts, single_text=True, **kwargs) def generate_to_file(self, destination_path, **kwargs): texts = self.generate(return_as_list=True, **kwargs) with open(destination_path, 'w') as f: for text in texts: f.write("{}\n".format(text)) def encode_text_vectors(self, texts, pca_dims=50, tsne_dims=None, tsne_seed=None, return_pca=False, return_tsne=False): # if a single text, force it into a list: if isinstance(texts, str): texts = [texts] vector_output = Model(inputs=self.model.input, outputs=self.model.get_layer('attention').output) encoded_vectors = [] maxlen = self.config['max_length'] for text in texts: if self.config['word_level']: text = text_to_word_sequence(text, filters='') text_aug = [self.META_TOKEN] + list(text[0:maxlen]) encoded_text = textgenrnn_encode_sequence(text_aug, self.vocab, 
maxlen) encoded_vector = vector_output.predict(encoded_text) encoded_vectors.append(encoded_vector) encoded_vectors = np.squeeze(np.array(encoded_vectors), axis=1) if pca_dims is not None: assert len(texts) > 1, "Must use more than 1 text for PCA" pca = PCA(pca_dims) encoded_vectors = pca.fit_transform(encoded_vectors) if tsne_dims is not None: tsne = TSNE(tsne_dims, random_state=tsne_seed) encoded_vectors = tsne.fit_transform(encoded_vectors) return_objects = encoded_vectors if return_pca or return_tsne: return_objects = [return_objects] if return_pca: return_objects.append(pca) if return_tsne: return_objects.append(tsne) return return_objects def similarity(self, text, texts, use_pca=True): text_encoded = self.encode_text_vectors(text, pca_dims=None) if use_pca: texts_encoded, pca = self.encode_text_vectors(texts, return_pca=True) text_encoded = pca.transform(text_encoded) else: texts_encoded = self.encode_text_vectors(texts, pca_dims=None) cos_similairity = cosine_similarity(text_encoded, texts_encoded)[0] text_sim_pairs = list(zip(texts, cos_similairity)) text_sim_pairs = sorted(text_sim_pairs, key=lambda x: -x[1]) return text_sim_pairs
for s, l in train_data:
    training_sentences.append(str(s.numpy()))
    training_labels.append(l.numpy())

for s, l in test_data:
    test_sentences.append(str(s.numpy()))
    test_labels.append(l.numpy())

training_labels_final = np.array(training_labels)
test_labels_final = np.array(test_labels)

vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(test_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])


def decode_review(text):
def __init__(self, config: Configuration) -> None:
    # Initialize the position-embedding matrix: MAX_SEQUENCE_LENGTH * 20
    self.config = config
    self.word_segmentor = config.word_segmentor
    self.MAX_SEQUENCE_LENGTH = config.MAX_SEQUENCE_LENGTH
    self.EMBEDDING_DIM = config.EMBEDDING_DIM
    self.MAX_NB_WORDS = config.MAX_NB_WORDS
    if not os.path.isfile(config.position_matrix_file_path):
        position_matrix = np.random.randn(config.MAX_SEQUENCE_LENGTH, 20)
        np.save(config.position_matrix_file_path[0:-4], position_matrix)
    self.position_matrix = np.load(config.position_matrix_file_path)
    print("position matrix shape", self.position_matrix.shape)

    # Initialize the tokenizer that converts text into sequences
    default_model = {}
    default_model = tw_w2v.get_word2vec_dic(config.word2vec_file_path)
    self.tokenizer = Tokenizer(num_words=config.MAX_NB_WORDS)
    # words = None
    if isinstance(default_model, dict):
        words = default_model.keys()
    else:
        words = default_model.vocab.keys()
    self.tokenizer.fit_on_texts(words)
    word_index = self.tokenizer.word_index
    self.num_words = min(config.MAX_NB_WORDS, len(word_index) + 1)

    # Initialize the word vectors / embedding matrix, e.g. 50000 * 64
    model_dim = 0
    for key in words:
        model_dim = default_model[key].shape[0]
        break
    if self.EMBEDDING_DIM != model_dim:
        print("WARN! The configured embedding dim differs from the loaded word-vector dim; "
              "using the loaded dimension.", self.EMBEDDING_DIM, model_dim)
        self.EMBEDDING_DIM = model_dim
        config.EMBEDDING_DIM = model_dim
    self.embedding_matrix = np.zeros((self.num_words, config.EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= config.MAX_NB_WORDS:
            continue
        embedding_vector = default_model[word]
        if embedding_vector is not None:
            # If a word from the text data is missing from the word-vector dictionary its row
            # stays zero; otherwise take that word's vector
            try:
                self.embedding_matrix[i] = embedding_vector
            except Exception as e:
                print(e)
        else:
            print("warn!", word, "is not in the word-vector list")
    print("embedding matrix shape", self.embedding_matrix.shape)

    # Initialize the part-of-speech tag list
    self.POS_list = []
    if os.path.exists(config.POS_list_file_path):
        with open(config.POS_list_file_path, encoding="UTF-8") as f:
            for line in f.readlines():
                if len(line.strip()) > 0:
                    self.POS_list.append(line.strip())
    else:
        file_types = []
        file_sentences = []
        with open(config.corpus_file_path, 'r', encoding="UTF-8") as f:
            for line in f.readlines():
                file_types.append(line.split("|")[0].strip())
                file_sentences.append(line.split("|")[1].strip())
        all_pos_set = set(self.POS_list)
        wordPairList_allSen, entityPosition_allSen = self.word_segmentor.segListWithNerTag(
            file_sentences)
        for pairs in wordPairList_allSen:
            for pair in pairs:
                if not all_pos_set.__contains__(pair.flag):
                    self.POS_list.append(pair.flag)
                    all_pos_set.add(pair.flag)
        with open(config.POS_list_file_path, "w", encoding="UTF-8") as f:
            for pos in self.POS_list:
                f.write(pos)
                f.write("\n")
    print("number of POS types", len(self.POS_list))

    # Relation types; RelationWordAdmin holds `relations` and `relation_word_dic`
    self.relationWordAdmin = RelationWordAdmin(config.types_file_path)
    self.types = self.relationWordAdmin.relations
    print("number of relation types", len(self.types))
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from keras.layers.embeddings import Embedding
from keras.layers import Flatten

twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
y = twenty_train.target
sentences = twenty_train.data

max_review_len = max([len(s.split()) for s in sentences])
tokenizer = Tokenizer(num_words=max_review_len)
tokenizer.fit_on_texts(sentences)
sentences = tokenizer.texts_to_matrix(sentences)

le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25,
                                                     random_state=1000)

# Build model
model = Sequential()
model.add(layers.Dense(300, input_dim=max_review_len, activation='relu'))
model.add(layers.Dense(20, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])
history = model.fit(X_train, y_train, epochs=5, verbose=True,
                    validation_data=(X_test, y_test), batch_size=256)
from sklearn.model_selection import GridSearchCV
from keras.preprocessing.text import Tokenizer

vocab_size = 3000
batch_size = 32
epochs = 5

(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=vocab_size,
                                                          test_split=0.2)
print(f"x_train shape: {x_train.shape}\nx_test shape: {x_test.shape}")
classes = np.max(y_train) + 1

print("Vectorizing data . . . ")
tokenizer = Tokenizer(num_words=vocab_size)
x_train = tokenizer.sequences_to_matrix(x_train, mode="binary")
x_test = tokenizer.sequences_to_matrix(x_test, mode="binary")
y_train = keras.utils.to_categorical(y_train, num_classes=classes)
y_test = keras.utils.to_categorical(y_test, num_classes=classes)
print(f"y_train shape: {y_train.shape}\n y_test shape: {y_test.shape}")


def make_model(activator="relu", alpha=0.3, optimizer="sgd", dense_layer_size=32,
               num_layers=3, dropout_rate=0.1, loss="categorical_crossentropy",
model.fit(X_train_nhot, y_train, validation_split=0.1, epochs=15, verbose=1, batch_size=128)
loss, accuracy = model.evaluate(X_test_nhot, y_test)
EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
print("Accuracy: ", accuracy * 100)
feed_forward = model.predict_classes(X_test_nhot, verbose=1)

t = Tokenizer()
t.fit_on_texts(X_train)
vocab_size = len(t.word_index) + 1
encoded_docs = t.texts_to_sequences(X_train)
encoded_doc = t.texts_to_sequences(X_test)

num_classes = len(np.unique(y_train))  # how many labels we have
y_train_one_hot = y_train
y_test_one_hot = y_test
y_dev_one_hot = y_dev

# NEURAL NETWORK WITH DEEP LEARNING
w2i = defaultdict(lambda: len(w2i))
PAD = w2i["<pad>"]  # index 0 is padding
UNK = w2i["<unk>"]  # index 1 is for UNK
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
np.random.seed(1337)  # for reproducibility

print("Reading pkl PORTUGUESE")
start = time.time()
train_data = pd.read_pickle('./data/train_subset_portuguese.pkl')
test_data = pd.read_pickle('./data/test_subset_portuguese.pkl')
stop = time.time()
print(stop - start)

t = Tokenizer()
print("Loading the tokenizer")
with open('./tokenizer/tokenizer_portuguese.pickle', 'rb') as handle:
    t = pickle.load(handle)

print("Summary portuguese Train and test size: ")
print(train_data.shape)
print(test_data.shape)

# Creating train data set
encoded_seqs = t.texts_to_sequences(train_data['text_cleaned'])
encoded_seqs_dummy = t.texts_to_sequences(test_data['text_cleaned'])
s1 = np.max([len(item) for item in encoded_seqs])
s2 = np.max([len(item) for item in encoded_seqs_dummy])
s3 = np.max([s1, s2])
class CharModel(BaseModel):
    def __init__(self, vocab_size=200, max_charlen=250, tokenize_args={},
                 embedding_dim=64, filters=128, kernel_size=7, pooling_size=3,
                 recursive_class=LSTM, recursive_units=128, dense_units=64,
                 dropout=[0.75, 0.50], **kwargs):
        self._max_charlen = max_charlen
        self._vocab_size = vocab_size
        self._char_tokenizer = KerasTokenizer(num_words=vocab_size, char_level=True)
        # Build the graph
        input_char = Input(shape=(max_charlen,), name="Char_Input")
        x = Embedding(vocab_size, embedding_dim)(input_char)
        x = Conv1D(filters=filters, kernel_size=kernel_size, padding='same',
                   activation='relu')(x)
        x = MaxPooling1D(pool_size=pooling_size)(x)
        x = Bidirectional(recursive_class(recursive_units))(x)
        if dropout[0] > 0:
            x = Dropout(dropout[0])(x)
        x = Dense(dense_units, activation='relu')(x)
        if dropout[1] > 0:
            x = Dropout(dropout[1])(x)
        output = Dense(1, activation='sigmoid')(x)
        tok_args = {
            "preserve_case": False,
            "deaccent": True,
            "reduce_len": True,
            "strip_handles": False,
            "stem": True,
            "alpha_only": False
        }
        tok_args.update(tokenize_args)
        super().__init__(inputs=[input_char], outputs=[output],
                         tokenize_args=tok_args, **kwargs)

    def _preprocess_text(self, X):
        tokens = map(self._tokenizer.tokenize, X)
        instances = [" ".join(seq_tokens) for seq_tokens in tokens]
        return instances

    def preprocess_fit(self, X):
        text_train = self._preprocess_text(X)
        self._char_tokenizer.fit_on_texts(text_train)

    def preprocess_transform(self, X):
        X_transf = self._preprocess_text(X)
        X_transf = self._char_tokenizer.texts_to_sequences(X_transf)
        return pad_sequences(X_transf, self._max_charlen)
# The RNN is an expressive model that is known to learn highly complex relationships
# from an arbitrarily long sequence of data. It maintains a vector of activation units
# for each element in the data sequence; this makes RNNs very deep. The depth of RNNs
# leads to two well-known issues, the exploding and the vanishing gradient problems.
#
# There are many ways to implement neural networks in Python. Here, I will be using
# tensorflow/keras.

# In[110]:

# Importing the libraries
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# In[111]:

tk = Tokenizer(lower=True)
tk.fit_on_texts(X)
X_seq = tk.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=100, padding='post')

# In[112]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.25, random_state=1)

# In[113]:
test_texts_1 = []
test_texts_2 = []
test_ids = []


def get_test_text(row):
    global test_texts_1, test_texts_2, test_ids
    test_texts_1.append(row.question1)
    test_texts_2.append(row.question2)
    test_ids.append(row.test_id)


test.apply(get_test_text, axis=1)
print('Found %s texts in test.csv' % len(test_texts_1))

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)
sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)
BATCH_SIZE = 64
NB_EPOCHS = 25

# Read the training and test data
x_train, y_train = read_txt(train_path, mode="train")
x_test = read_txt(test_path, mode="predict")
print(u"length of train data is {0}, length of train label is {1}, length of test data is {2}"
      .format(len(x_train), len(y_train), len(x_test)))

# Build the corpus
corpus = x_train.tolist() + x_test.tolist()
print("length of corpus is {0}".format(len(corpus)))

# In order: build the tokenizer, fit, texts_to_sequences, pad_sequences
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(corpus)
print(u"fit_on_texts finished")
sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)
print("tokenizer finished")

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y_train.shape)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test_data tensor:', test_data.shape)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

# Split the data ids into train and val
    return string.strip().lower()


data = pandas.read_csv('./data.csv')  # comma is the default delimiter
texts = [clean_text(text.encode('ascii', 'ignore')) for text in data.review]
labels = to_categorical(np.asarray(list(data.sentiment)))

embeddings_dict = {}
with open('data_50.txt', 'r', encoding='utf-8') as f:
    for data_word in f:
        word = data_word.split()[0]
        arra = np.asarray(data_word.split()[1:], dtype='float32')
        embeddings_dict[word] = arra

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
data = pad_sequences(sequences, maxlen=1000)

data = data[np.arange(MAX_NUM)]
labels = labels[np.arange(MAX_NUM)]
x_train = data[:24999]
y_train = labels[:24999]
x_test = data[25000:]
y_test = labels[25000:]

embedding_matrix = np.random.random((len(word_index) + 1, 50))
for word, i in word_index.items():
print("Processing", QUESTION_PAIRS_FILE) question1 = [] question2 = [] is_duplicate = [] with open(KERAS_DATASETS_DIR + QUESTION_PAIRS_FILE, encoding='utf-8') as csvfile: reader = csv.DictReader(csvfile, delimiter='\t') for row in reader: question1.append(row['question1']) question2.append(row['question2']) is_duplicate.append(row['is_duplicate']) print('Question pairs: %d' % len(question1)) questions = question1 + question2 tokenizer = Tokenizer(nb_words=MAX_NB_WORDS) tokenizer.fit_on_texts(questions) question1_word_sequences = tokenizer.texts_to_sequences(question1) question2_word_sequences = tokenizer.texts_to_sequences(question2) word_index = tokenizer.word_index print("Words in index: %d" % len(word_index)) if not exists(KERAS_DATASETS_DIR + GLOVE_ZIP_FILE): zipfile = ZipFile(get_file(GLOVE_ZIP_FILE, GLOVE_ZIP_FILE_URL)) zipfile.extract(GLOVE_FILE, path=KERAS_DATASETS_DIR) print("Processing", GLOVE_FILE) embeddings_index = {} with open(KERAS_DATASETS_DIR + GLOVE_FILE, encoding='utf-8') as f:
def tokenize(x):
    x_tkzr = Tokenizer(char_level=False)
    x_tkzr.fit_on_texts(x)
    return x_tkzr.texts_to_sequences(x), x_tkzr
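# Quick sanity check of tokenize() above (the toy sentences are illustrative only):
_seqs, _tk = tokenize(['the cat sat', 'the dog sat'])
print(_seqs)           # e.g. [[1, 3, 2], [1, 4, 2]]
print(_tk.word_index)  # word -> integer id mapping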
for t in os.listdir(path):
    tweet_count += 1
    p2 = os.path.join(path, t)
    f = open(p2, "r")
    texts_.append(f.read())
    labels_.append(x)
    f.close()
print(x, '. Number of tweets in this folder:', tweet_count)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from keras.utils import to_categorical

token = Tokenizer()
token.fit_on_texts(texts_)
texts_ = token.texts_to_sequences(texts_)
texts_ = pad_sequences(texts_)
texts_ = StandardScaler().fit_transform(texts_)
labels_ = preprocessing.LabelEncoder().fit_transform(labels_)
labels_ = to_categorical(labels_)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = \
    train_test_split(texts_, labels_, test_size=0.5)

max_futures = 1500
maxlen = 28
def get_padded_sequences(titles: pd.Series, tokenizer: Tokenizer) -> np.array:
    sequences = tokenizer.texts_to_sequences(titles)
    padded_sequences = pad_sequences(sequences, maxlen=config.MAX_SEQUENCE_LENGTH)
    return padded_sequences
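# Illustrative use of get_padded_sequences; the titles Series and fitted tokenizer below
# are assumptions for the sketch, and config.MAX_SEQUENCE_LENGTH must be provided by the
# surrounding project, so this is left as a commented sketch.
# _titles = pd.Series(['first headline', 'second longer headline text'])
# _tok = Tokenizer()
# _tok.fit_on_texts(_titles)
# padded = get_padded_sequences(_titles, _tok)  # shape: (2, config.MAX_SEQUENCE_LENGTH)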
q1 = train['0'].tolist()
q2 = train['1'].tolist()

test['0'] = test['0'].progress_apply(
    lambda x: re.sub(r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?', "", str(x)))
test['1'] = test['1'].progress_apply(
    lambda x: re.sub(r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?', "", str(x)))
q1_test = test['0'].tolist()
q2_test = test['1'].tolist()

labels = train['is_duplicate'].tolist()
ids = test['test_id'].tolist()

tokenizer = Tokenizer(num_words=u.max_nb_words)
tokenizer.fit_on_texts(q1 + q2 + q1_test + q2_test)
sequences_1 = tokenizer.texts_to_sequences(q1)
sequences_2 = tokenizer.texts_to_sequences(q2)
test_sequences_1 = tokenizer.texts_to_sequences(q1_test)
test_sequences_2 = tokenizer.texts_to_sequences(q2_test)
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

data_1 = pad_sequences(sequences_1, maxlen=u.seq_length)
data_2 = pad_sequences(sequences_2, maxlen=u.seq_length)
labels = np.array(labels)
print("Elapsed time till loading word vectors", time() - start)
def main(): # columns = ['class', 'title', 'u1', 'authors', 'source', 'publisher', 'citations', 'abstract', 'keywords'] data = pd.read_csv(TRAIN_FILE) data.dropna(subset=['class', 'title', 'abstract']) print('Rows of data: {}'.format(len(data))) label_count = data['class'].nunique() print('Unique labels: {}'.format(label_count)) labels = np.array(data['class']) titles = list(data['title']) abstracts = list(data['abstract']) if DO_LOWER_CASE: sentences = [ titles[i].lower() + ' ' + abstracts[i].lower() for i in range(len(titles)) ] else: sentences = [ titles[i] + ' ' + abstracts[i] for i in range(len(titles)) ] if DO_REMOVE_PUNCTUATION: sentences = [ re.sub(r'[~`!@#$%^&*()_\-+={}\[\]|\\:;"\'<>,.?/]+', ' ', sentence) for sentence in sentences ] if DO_REMOVE_STOP_WORDS: sentences = remove_stop_words(sentences) lines = np.array(sentences) # Shuffle the labels and lines. permutation = np.random.permutation(labels.shape[0]) labels = labels[permutation] lines = lines[permutation] # Split the data for training and testing. train_ratio = 0.8 val_ratio = 0.25 # Proportion of the TRAINING data, not of the entire data set. train_end = int(train_ratio * len(lines)) train_labels = labels[:train_end] train_lines = lines[:train_end] test_labels = labels[train_end:] test_lines = lines[train_end:] # Print class spreads in each data set. val_start = int((1 - val_ratio) * train_end) train_spread = get_spread(train_labels[:val_start]) test_spread = get_spread(test_labels) val_spread = get_spread(train_labels[val_start:]) longest_label = max(len(label) for label in train_spread.keys()) for label in train_spread.keys(): print('{}{}: TR:{} VAL:{} TS:{}'.format( label, ''.join(' ' for _ in range(longest_label - len(label))), train_spread[label] / val_start, val_spread[label] / (train_end - val_start), test_spread[label] / len(test_labels))) # Pre-process; tokenize the text and transform it into [padded] sequences for the RNN. # See: https://www.kaggle.com/sbongo/for-beginners-tackling-toxic-using-keras # See: https://www.kaggle.com/jhoward/improved-lstm-baseline-glove-dropout max_features = 8000 # The maximum number of total unique words to use. tokenizer = Tokenizer(num_words=max_features) tokenizer.fit_on_texts(train_lines) # tokenizer.fit_on_texts(test_lines) # Can we do this? train_word_counts = tokenizer.texts_to_matrix(train_lines, mode='count') test_word_counts = tokenizer.texts_to_matrix(test_lines, mode='count') max_word_counts = np.array([ max(train_word_counts[:, col]) for col in range(len(train_word_counts[0])) ]) min_word_counts = np.array([ min(train_word_counts[:, col]) for col in range(len(train_word_counts[0])) ]) x_train = np.nan_to_num((train_word_counts - min_word_counts) / (max_word_counts - min_word_counts)) x_test = np.nan_to_num((test_word_counts - min_word_counts) / (max_word_counts - min_word_counts)) # Transform the labels into a one-hot encoding. encoder = LabelBinarizer() y_train = encoder.fit_transform(train_labels) y_test = encoder.fit_transform(test_labels) # Build the model. model = get_model(max_features, label_count) print(model.summary()) optimizer = 'adam' model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['categorical_accuracy']) history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=val_ratio) # Save the model and the history. 
    save_str = 'dense1-{}{}{}{}{}-{}-{}'.format(
        max_features,
        '-lower' if DO_LOWER_CASE else '',
        '-nopunc' if DO_REMOVE_PUNCTUATION else '',
        '-nostop' if DO_REMOVE_STOP_WORDS else '',
        '-stem' if DO_STEMMING else '',
        optimizer, epochs)
    print('Saving model `{}.h5`...'.format(save_str))
    model.save(os.path.join(MODELS_FOLDER, '{}.h5'.format(save_str)))
    print('Saving training history `history-{}.txt`...'.format(save_str))
    with open(os.path.join(LOGS_FOLDER, 'history-{}.txt'.format(save_str)), 'w') as fd:
        for key in history.history.keys():
            values = history.history.get(key)
            fd.write(key + ' ' + ' '.join(str(value) for value in values) + '\n')

    # Test.
    # predictions = model.predict([test_sequences], batch_size=1024, verbose=1)
    test_loss, test_accuracy = model.evaluate(x_test, y_test, batch_size=1024, verbose=1)
    print('Test loss:', test_loss)
    print('Test accuracy:', test_accuracy)
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
def keras_classify(df):
    # Preprocessing: convert the words in each text into integer ids.
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing import sequence
    from keras.callbacks import EarlyStopping
    from sklearn.cross_validation import train_test_split

    print "----- Classification by Keras -----"
    max_features = 50000  # Keep only the most frequent words.

    # Tokenizer can only handle str, not unicode.
    textraw = map(lambda x: x.encode('utf-8'), df.seg_word.values.tolist())
    token = Tokenizer(nb_words=max_features)
    # df.seg_word is already space-separated, so the Tokenizer can split it
    # the same way it splits English text.
    token.fit_on_texts(textraw)
    # token records each word's index and count; replace the word text in textraw
    # with those indices,
    # e.g. textraw = ['a b c', 'c d e f'] ==> text_seq = [[1, 2, 3], [3, 4, 5, 6]]
    text_seq = token.texts_to_sequences(textraw)

    nb_classes = len(np.unique(df.label.values))
    print "num of features(vocabulary): ", len(token.word_counts)
    print "num of labels: ", nb_classes

    max_sent_len = np.max([len(s) for s in text_seq])
    print "max length of document is: ", max_sent_len
    median_sent_len = np.median([len(s) for s in text_seq])
    print "median length of document is: ", median_sent_len

    # The .values must not be omitted here, otherwise np_utils.to_categorical fails later.
    train_X, test_X, train_y, test_y = train_test_split(text_seq, df.label.values,
                                                         train_size=0.7, random_state=1)

    # train_X & test_X are still ragged (each row is one document); pad them to
    # equal-length matrices before training.
    seqlen = int(max_sent_len / 2 + median_sent_len / 2)
    X_train = sequence.pad_sequences(train_X, maxlen=seqlen, padding='post', truncating='post')
    X_test = sequence.pad_sequences(test_X, maxlen=seqlen, padding='post', truncating='post')

    # Expand y into one-hot vectors so the network can end with a softmax.
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test = np_utils.to_categorical(test_y, nb_classes)

    model = build_cnn_model(max_features, seqlen, nb_classes)
    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
    # Train for 10 epochs with mini-batches of 32; early stopping picks when to stop.
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=10,
              validation_split=0.1, callbacks=[earlystop])
    evaluate(earlystop.model, X_test, Y_test, test_y)

    model = build_lstm_model(max_features, seqlen, nb_classes)
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=1, validation_split=0.1)
    evaluate(model, X_test, Y_test, test_y)

    model = build_mixed_model(max_features, seqlen, nb_classes)
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=1, validation_split=0.1)
    evaluate(model, X_test, Y_test, test_y)

    graph = build_graph_model(max_features, seqlen, nb_classes)
    graph.fit({'input': X_train, 'output': Y_train},
              nb_epoch=3, batch_size=32, validation_split=0.1)
    predict = graph.predict({'input': X_test}, batch_size=32)
    predict = predict['output']
    classes = predict.argmax(axis=1)
    acc = np_utils.accuracy(classes, test_y)
    print('Test accuracy: ', acc)
class CaptionPreprocessor(object):
    """Preprocesses captions before they are fed into the network."""

    EOS_TOKEN = 'zeosz'

    def __init__(self, rare_words_handling=None, words_min_occur=None):
        """
        If an arg is None, it will get its value from config.active_config.
        Args:
            rare_words_handling: {'nothing'|'discard'|'change'}
            words_min_occur: words whose occurrences are less than this are
                considered rare words
        """
        self._tokenizer = Tokenizer()
        self._rare_words_handling = (rare_words_handling or
                                     active_config().rare_words_handling)
        self._words_min_occur = (words_min_occur or
                                 active_config().words_min_occur)
        self._word_of = {}

    @property
    def EOS_TOKEN_LABEL_ENCODED(self):
        return self._tokenizer.word_index[self.EOS_TOKEN]

    @property
    def vocabs(self):
        word_index = self._tokenizer.word_index
        return sorted(word_index, key=word_index.get)  # Sort by word's index

    @property
    def vocab_size(self):
        return len(self._tokenizer.word_index)

    def fit_on_captions(self, captions_txt):
        captions_txt = self._handle_rare_words(captions_txt)
        captions_txt = self._add_eos(captions_txt)
        self._tokenizer.fit_on_texts(captions_txt)
        self._word_of = {i: w for w, i in self._tokenizer.word_index.items()}

    def encode_captions(self, captions_txt):
        captions_txt = self._add_eos(captions_txt)
        return self._tokenizer.texts_to_sequences(captions_txt)

    def decode_captions(self, captions_output, captions_output_expected=None):
        """
        Args
            captions_output: 3-d array returned by a model's prediction; it's
                the same as captions_output returned by preprocess_batch
        """
        captions = captions_output[:, :-1, :]  # Discard the last word (dummy)
        label_encoded = captions.argmax(axis=-1)
        num_batches, num_words = label_encoded.shape

        if captions_output_expected is not None:
            caption_lengths = self._caption_lengths(captions_output_expected)
        else:
            caption_lengths = [num_words] * num_batches

        captions_str = []
        for caption_i in range(num_batches):
            caption_str = []
            for word_i in range(caption_lengths[caption_i]):
                label = label_encoded[caption_i, word_i]
                label += 1  # Real label = label in model + 1
                caption_str.append(self._word_of[label])
            captions_str.append(' '.join(caption_str))
        return captions_str

    # TODO Test method below
    def decode_captions_from_list2d(self, captions_encoded):
        """
        Args
            captions_encoded: 1-based (Tokenizer's), NOT 0-based (model's)
        """
        captions_decoded = []
        for caption_encoded in captions_encoded:
            words_decoded = []
            for word_encoded in caption_encoded:
                # No need to increment word_encoded
                words_decoded.append(self._word_of[word_encoded])
            captions_decoded.append(' '.join(words_decoded))
        return captions_decoded

    def normalize_captions(self, captions_txt):
        captions_txt = self._add_eos(captions_txt)
        word_sequences = map(text_to_word_sequence, captions_txt)
        result = map(' '.join, word_sequences)
        return result

    def preprocess_batch(self, captions_label_encoded):
        captions = keras_seq.pad_sequences(captions_label_encoded, padding='post')
        # Because the number of timesteps/words resulted by the model is
        # maxlen(captions) + 1 (because the first "word" is the image).
        captions_extended1 = keras_seq.pad_sequences(captions,
                                                     maxlen=captions.shape[-1] + 1,
                                                     padding='post')
        captions_one_hot = list(map(self._tokenizer.sequences_to_matrix,
                                    np.expand_dims(captions_extended1, -1)))
        captions_one_hot = np.array(captions_one_hot, dtype='int')

        # Decrease/shift word index by 1.
        # Shifting `captions_one_hot` makes the padding word
        # (index=0, encoded=[1, 0, ...]) encoded all zeros ([0, 0, ...]),
        # so its cross entropy loss will be zero.
        captions_decreased = captions.copy()
        captions_decreased[captions_decreased > 0] -= 1
        captions_one_hot_shifted = captions_one_hot[:, :, 1:]

        captions_input = captions_decreased
        captions_output = captions_one_hot_shifted
        return captions_input, captions_output

    def _handle_rare_words(self, captions):
        if self._rare_words_handling == 'nothing':
            return captions
        elif self._rare_words_handling == 'discard':
            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(captions)
            new_captions = []
            for caption in captions:
                words = text_to_word_sequence(caption)
                new_words = [w for w in words
                             if tokenizer.word_counts.get(w, 0) >= self._words_min_occur]
                new_captions.append(' '.join(new_words))
            return new_captions

        raise NotImplementedError('rare_words_handling={} is not implemented '
                                  'yet!'.format(self._rare_words_handling))

    def _add_eos(self, captions):
        return map(lambda x: x + ' ' + self.EOS_TOKEN, captions)

    def _caption_lengths(self, captions_output):
        one_hot_sum = captions_output.sum(axis=2)
        return (one_hot_sum != 0).sum(axis=1)
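# Illustration (not part of CaptionPreprocessor): a toy sketch of the index-shift
# trick described in preprocess_batch. Dropping column 0 of the one-hot encoding
# turns every padding timestep (index 0) into an all-zero row, so padded positions
# contribute nothing to the cross-entropy loss.
import numpy as np

toy_vocab_size = 4                                       # word indices 1..4, 0 is padding
toy_caption = np.array([2, 3, 1, 0, 0])                  # label-encoded caption, post-padded
toy_one_hot = np.eye(toy_vocab_size + 1)[toy_caption]    # shape (5, 5); column 0 is the padding word
toy_one_hot_shifted = toy_one_hot[:, 1:]                 # padding rows become [0, 0, 0, 0]
print(toy_one_hot_shifted)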
import pickle

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

with open('/content/drive/My Drive/Sarcasm/Final_dataset3.p', 'rb') as f:
    d_final3 = pickle.load(f)
with open('/content/drive/My Drive/Sarcasm/embedding_matrix.p', 'rb') as f:
    embedding_matrix = pickle.load(f)
with open('/content/drive/My Drive/Sarcasm/Audio_features.p', 'rb') as f:
    emb = pickle.load(f)

MAX_NB_WORDS = 40000
MAX_SEQUENCE_LENGTH = 128

text = d_final3['text']
# Out-of-vocabulary words map to a dedicated token ('<OOV>' is an assumed placeholder string).
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)

# Pad every text to MAX_SEQUENCE_LENGTH so the corpus becomes a single int32 matrix.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, dtype='int32')

# Integer-index the sarcasm labels.
labels1 = []
label_index1 = {}
for label in d_final3['Sarcasm']:
    if label not in label_index1:
        label_index1[label] = len(label_index1)
    labels1.append(label)
print(len(labels1))
def train_new_model(self, texts, context_labels=None, num_epochs=50,
                    gen_epochs=1, batch_size=128, dropout=0.0,
                    validation=True, save_epochs=0,
                    multi_gpu=False, **kwargs):
    self.config = self.default_config.copy()
    self.config.update(**kwargs)

    print("Training new model w/ {}-layer, {}-cell {}LSTMs".format(
        self.config['rnn_layers'], self.config['rnn_size'],
        'Bidirectional ' if self.config['rnn_bidirectional'] else ''))

    # If training word level, must add spaces around each punctuation.
    # https://stackoverflow.com/a/3645946/9314418
    if self.config['word_level']:
        punct = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\\n\\t\'‘’“”’–—'
        for i in range(len(texts)):
            texts[i] = re.sub('([{}])'.format(punct), r' \1 ', texts[i])
            texts[i] = re.sub(' {2,}', ' ', texts[i])

    # Create text vocabulary for new texts:
    # if word-level, lowercase; if char-level, uppercase.
    self.tokenizer = Tokenizer(filters='',
                               lower=self.config['word_level'],
                               char_level=(not self.config['word_level']))
    self.tokenizer.fit_on_texts(texts)

    # Limit vocab to max_words
    max_words = self.config['max_words']
    self.tokenizer.word_index = {k: v for (k, v) in
                                 self.tokenizer.word_index.items()
                                 if v <= max_words}

    if not self.config.get('single_text', False):
        self.tokenizer.word_index[self.META_TOKEN] = len(
            self.tokenizer.word_index) + 1
    self.vocab = self.tokenizer.word_index
    self.num_classes = len(self.vocab) + 1
    self.indices_char = dict((self.vocab[c], c) for c in self.vocab)

    # Create a new, blank model w/ given params
    self.model = textgenrnn_model(self.num_classes,
                                  dropout=dropout,
                                  cfg=self.config)

    # Save the files needed to recreate the model
    with open('{}_vocab.json'.format(self.config['name']),
              'w', encoding='utf8') as outfile:
        json.dump(self.tokenizer.word_index, outfile, ensure_ascii=False)

    with open('{}_config.json'.format(self.config['name']),
              'w', encoding='utf8') as outfile:
        json.dump(self.config, outfile, ensure_ascii=False)

    self.train_on_texts(texts, new_model=True,
                        via_new_model=True,
                        context_labels=context_labels,
                        num_epochs=num_epochs,
                        gen_epochs=gen_epochs,
                        batch_size=batch_size,
                        dropout=dropout,
                        validation=validation,
                        save_epochs=save_epochs,
                        multi_gpu=multi_gpu,
                        **kwargs)
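# Illustration (not from textgenrnn): a toy sketch of the word-level punctuation
# handling above. Surrounding each punctuation mark with spaces makes the Tokenizer
# treat it as its own token instead of gluing it to the neighbouring word.
import re

toy_punct = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
toy_text = "Hello, world! (word-level)"
toy_text = re.sub('([{}])'.format(toy_punct), r' \1 ', toy_text)
toy_text = re.sub(' {2,}', ' ', toy_text)
print(toy_text)   # -> 'Hello , world ! ( word - level ) '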
import itertools

import numpy as np
from keras.layers import Embedding, Input
from keras.preprocessing.text import Tokenizer

MAX_SEQUENCE_LENGTH = 2000
EMBEDDING_DIM = 100
embedding_matrix = np.load('embedding_matrix.npy')

# Tokenize -----------------------------
# Build the vocabulary of all 4**6 = 4096 possible 6-mers over {a, c, g, t}.
f = ['a', 'c', 'g', 't']
c = itertools.product(f, f, f, f, f, f)
res = []
for i in c:
    temp = i[0] + i[1] + i[2] + i[3] + i[4] + i[5]
    res.append(temp)
res = np.array(res)

NB_WORDS = 4097
tokenizer = Tokenizer(num_words=NB_WORDS)
tokenizer.fit_on_texts(res)
word_index = tokenizer.word_index
word_index['null'] = 0
# ------------------------------------

embedding_layer = Embedding(len(word_index),
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
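# Illustration (assumed usage, not shown in the original script): a DNA string is
# typically cut into overlapping 6-mers and joined with spaces so the tokenizer
# fitted on all 4**6 = 4096 possible 6-mers above can encode it; names here are toy.
from keras.preprocessing.sequence import pad_sequences

toy_seq = 'acgtacgtacgt'
toy_kmers = ' '.join(toy_seq[i:i + 6] for i in range(len(toy_seq) - 6 + 1))
toy_encoded = tokenizer.texts_to_sequences([toy_kmers])
toy_padded = pad_sequences(toy_encoded, maxlen=MAX_SEQUENCE_LENGTH)
print(toy_padded.shape)   # (1, 2000), matching sequence_input's expected shape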
dataframe.drop('AUTHOR', axis=1, inplace=True)
dataframe.drop('DATE', axis=1, inplace=True)
# print(dataframe.head(5))
# print(numpy.unique(dataframe['CLASS']))

dataset = dataframe.values
X = dataset[:, 0]
Y = dataset[:, 1]

# Summarize number of words
print("Number of words: ")
print(len(numpy.unique(numpy.hstack(X))))

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

validation_size = 0.33
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed)
X_train = sequence.pad_sequences(X_train, maxlen=120)
X_validation = sequence.pad_sequences(X_validation, maxlen=120)
# print(X_train[0])
import pandas as pd
import numpy as np
import nltk, re, time
from nltk.corpus import stopwords
from string import punctuation
from collections import namedtuple
from sklearn.datasets import load_files
from keras.preprocessing.text import Tokenizer

input_file = "./DONE/s.csv"

# Load the csv into a dataframe.
dataset = pd.read_csv(input_file, delimiter="\t")
print(dataset.shape)

# Tokenize the corpus and print the size of the index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dataset)
word_index = tokenizer.word_index
print("Words in index: %d" % len(word_index))
from keras.models import Sequential
from keras import layers
from keras.preprocessing.text import Tokenizer
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Read the file.
df = pd.read_csv('train.tsv', header=None, delimiter='\t', low_memory=False)
# Label the columns.
df.columns = ['PhraseID', 'SentenceID', 'Phrase', 'Sentiment']

sentences = df['Phrase'].values
y = df['Sentiment'].values

tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(sentences)
sentences = tokenizer.texts_to_matrix(sentences)

le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(sentences, y,
                                                    test_size=0.25,
                                                    random_state=1000)

# Number of features
# print(input_dim)
model = Sequential()
model.add(layers.Dense(300, input_dim=2000, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',        # optimizer and metrics assumed; the original snippet breaks off here
              metrics=['accuracy'])
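# Illustration (not part of the original script): why input_dim=2000 matches the
# tokenizer's num_words. texts_to_matrix returns one row per text with num_words
# columns (mode='binary' by default), which is exactly what the first Dense layer expects.
from keras.preprocessing.text import Tokenizer

toy_tok = Tokenizer(num_words=2000)
toy_tok.fit_on_texts(["a gripping phrase", "a dull phrase"])
print(toy_tok.texts_to_matrix(["a gripping phrase"]).shape)   # (1, 2000)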
test = pd.read_csv(test_path)

list_classes = [i for i in range(58)]
y = train['Category'].values

train["title"] = train["title"].fillna("no comment")
test["title"] = test["title"].fillna("no comment")

X_train = train
Y_train = y
del train
del y

raw_text_train = X_train["title"].str.lower()
raw_text_test = test["title"].str.lower()

tk = Tokenizer(num_words=max_features, lower=True)
tk.fit_on_texts(raw_text_train)
X_train["comment_seq"] = tk.texts_to_sequences(raw_text_train)
test["comment_seq"] = tk.texts_to_sequences(raw_text_test)

X_train = pad_sequences(X_train.comment_seq, maxlen=max_len)
test = pad_sequences(test.comment_seq, maxlen=max_len)


def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')


embedding_index = dict(get_coefs(*o.strip().split(" ")) for o in open(embedding_path))

word_index = tk.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    # Assumed continuation -- the original snippet is truncated here: copy the
    # pretrained vector for this word into the embedding matrix if one exists.
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
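# Illustration (not from the original kernel): what get_coefs above parses. Each line
# of a GloVe-style embedding file is "word v1 v2 ... vN"; get_coefs splits it into the
# word and a float32 vector, which embedding_index then maps word -> vector. The line
# below is made up for the example.
toy_line = "apple 0.1 -0.2 0.3"
toy_word, toy_vec = get_coefs(*toy_line.strip().split(" "))
print(toy_word, toy_vec.dtype, toy_vec.shape)   # apple float32 (3,)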
from keras.preprocessing.text import Tokenizer
import pandas as pd
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras import models, layers
import pickle

tokenizer = Tokenizer()
nama_file = "D:\\resa\\D\\KULIAH\\S2\\Semester 1\\python\\mlNN_1\\datasetSMS\\dataset.csv"
df = pd.read_csv(nama_file).values
data = df[:, 0]
label = df[:, 1]
label = to_categorical(label)
print(label)
print(label.shape)

X_train, X_test, y_train, y_test = train_test_split(data, label,
                                                    test_size=0.2,
                                                    random_state=123)

# Fit only on the training data.
tokenizer.fit_on_texts(X_train)

# Convert the training set.
seq_x_train = tokenizer.texts_to_sequences(X_train)
X_enc_train = tokenizer.sequences_to_matrix(seq_x_train, mode="tfidf")

# Convert the test set.
seq_x_test = tokenizer.texts_to_sequences(X_test)
X_enc_test = tokenizer.sequences_to_matrix(seq_x_test, mode="tfidf")
# print(X_enc_train.shape)
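# Illustration (not part of the original script): a toy sketch of sequences_to_matrix
# with mode="tfidf". The matrix has one row per document and one column per word index
# the tokenizer knows, so the train and test matrices above share the same width.
from keras.preprocessing.text import Tokenizer

toy_tok = Tokenizer()
toy_tok.fit_on_texts(["free prize call now", "see you at lunch"])
toy_seqs = toy_tok.texts_to_sequences(["free prize now"])
print(toy_tok.sequences_to_matrix(toy_seqs, mode="tfidf").shape)   # (1, 9): 8 words + index 0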