def tokenize(texts, texts_train, texts_test):
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    sequences_train = tokenizer.texts_to_sequences(texts_train)
    sequences_test = tokenizer.texts_to_sequences(texts_test)
    return word_index, sequences_train, sequences_test
def tokenizeAndGenerateIndex(train, test, maxFeatures, maxLength):
    merged = np.concatenate([train, test])
    tokenizer = Tokenizer(nb_words=maxFeatures)
    tokenizer.fit_on_texts(merged)
    sequences_train = tokenizer.texts_to_sequences(train)
    sequences_test = tokenizer.texts_to_sequences(test)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data_train = pad_sequences(sequences_train, maxlen=maxLength)
    data_test = pad_sequences(sequences_test, maxlen=maxLength)
    return data_train, data_test, word_index
def get_data_1(train_sents, maxlen):
    word_list = []
    for i in range(len(train_sents)):
        for words in train_sents[i]:
            word_list.append(words)

    sequence = []
    stride = 1
    # apply windowing for sequence generation
    for i in range(0, len(word_list) - maxlen, stride):
        line = word_list[i:i + maxlen]
        sequence.append(line)

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sequence)
    seq = tokenizer.texts_to_sequences(sequence)
    vocab_len = len(tokenizer.word_index.items()) + 1

    seq = np.array(seq)
    x_train = seq[:, :-1]
    y_train = np.zeros((x_train.shape[0], x_train.shape[1], 1))
    for i in range(x_train.shape[0]):
        for j in range(x_train.shape[1]):
            y_train[i, j, 0] = seq[i, j + 1]
    return x_train, y_train, vocab_len, tokenizer
def get_train_val_matrix(texts, labels, max_features=10000, max_len=100):
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print(f'Found {len(word_index)} unique tokens')

    data = pad_sequences(sequences, maxlen=max_len)
    labels = np.asarray(labels)

    # shuffle the samples before splitting into train/validation
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]

    train_sample_n = 20000
    validation_sample_n = 5000
    x_train = data[:train_sample_n]
    x_val = data[train_sample_n:validation_sample_n + train_sample_n]
    y_train = labels[:train_sample_n]
    y_val = labels[train_sample_n:validation_sample_n + train_sample_n]
    return (x_train, y_train), (x_val, y_val), word_index
def prepare_tokenizer(words):
    '''
    Function to generate the vocabulary of the given list of words.
    Implemented by Anindya.
    @param words => the list of words to be tokenized
    '''
    # obtain a tokenizer; don't let Keras filter out any characters
    t = Tokenizer(filters='')
    t.fit_on_texts(words)

    field_dict = dict()      # index -> word
    rev_field_dict = dict()  # word -> index
    for key, value in t.word_index.items():
        field_dict[value] = key
        rev_field_dict[key] = value

    vocab_size = len(t.word_index) + 1

    '''
    Small modification from Animesh:
    also add the '<unk>' token to the dictionary at the 0th position
    '''
    field_dict[0] = '<unk>'
    rev_field_dict['<unk>'] = 0
    # print(vocab_size)

    # integer encode the documents
    encoded_docs = t.texts_to_sequences(words)
    # print("debug: " + str(encoded_docs))
    return np.array(encoded_docs), field_dict, rev_field_dict, vocab_size
def LoadSMILESData(duplicateProb=0, seed=7):
    dataComp = dataset.LoadData('data', 0)
    smiles = list(map(lambda x: x._SMILE, dataComp))
    tokenizer = Tokenizer(num_words=None, char_level=True)
    tokenizer.fit_on_texts(smiles)
    print(smiles[0])

    # count character frequencies over the whole corpus
    dictionary = {}
    i = 0
    k = 0
    for smile in smiles:
        i += 1
        for c in list(smile):
            k += 1
            if c in dictionary:
                dictionary[c] += 1
            else:
                dictionary[c] = 1
    print(len(dictionary))

    # sequence encode
    encoded_docs = tokenizer.texts_to_sequences(smiles)
    # pad sequences
    max_length = max([len(s) for s in smiles])
    vocab = {'C': 1, 'c': 2, '(': 3, ')': 4, 'O': 5, '=': 6, '1': 7, 'N': 8, '2': 9, '3': 10,
             '[': 11, ']': 12, 'F': 13, '4': 14, 'l': 15, 'n': 16, 'S': 17, '@': 18, 'H': 19,
             '5': 20, '+': 21, '-': 22, 'B': 23, 'r': 24, '\\': 25, '#': 26, '6': 27, '.': 28,
             '/': 29, 's': 30, 'P': 31, '7': 32, 'i': 33, 'o': 34, '8': 35, 'I': 36, 'a': 37,
             '%': 38, '9': 39, '0': 40, 'K': 41, 'e': 42, 'A': 43, 'g': 44, 'p': 45, 'M': 46,
             'T': 47, 'b': 48, 'd': 49, 'V': 50, 'Z': 51, 'G': 52, 'L': 53}
    Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    # define vocabulary size (largest integer value) and labels (1 = mutagen)
    labels = list(map(lambda x: 1 if x.mutagen == True else 0, dataComp))
    return Xtrain, labels, vocab, max_length
def preprocess_embedding():
    corpus_train, target, filenames = get_corpus()
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus_train)
    sequences = tokenizer.texts_to_sequences(corpus_train)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    MAX_SEQUENCE_LENGTH = 50
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
        '/home/flippped/Desktop/xiangmu/baseline/GoogleNews-vectors-negative300.bin.gz',
        binary=True)
    word2vec_model.init_sims(replace=True)

    # create one embedding matrix for the document words
    EMBEDDING_DIM = 300
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    print(embedding_matrix.shape)
    for word, i in word_index.items():
        try:
            embedding_vector = word2vec_model[str(word)]
            if embedding_vector is not None:
                # words not found in the embedding index stay all-zeros
                embedding_matrix[i] = embedding_vector
        except KeyError:
            continue
    return data, target, filenames, embedding_matrix, word_index
def tokenizeAndGenerateIndex(texts):
    tokenizer = Tokenizer(nb_words=vocab_size)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=maxlen, padding='post')
    return data
def handle(self, *args, **options): ptt = PTT.objects.all() ptt_json = PTTSerializer(ptt, many=True).data user_comments_times = dict() labels_index = 2 labels = [] texts = [] for article in ptt_json: pointer = 1 if article['score'] > 0 else 0 words = jieba.cut(article['contents']) for word in words: labels.append(pointer) texts.append(word) tokenizer = Tokenizer() tokenizer.fit_on_texts(texts) sequences = tokenizer.texts_to_sequences(texts) data = pad_sequences(sequences, maxlen=self.MAX_SEQUENCE_LENGTH) labels = to_categorical(np.asarray(labels)) print('Shape of data tensor:', data.shape) print('Shape of label tensor:', labels.shape) print('Token word index:', tokenizer.word_index) indices = np.arange(data.shape[0]) np.random.shuffle(indices) data = data[indices] labels = labels[indices] nb_validation_samples = int(self.VALIDATION_SPLIT * data.shape[0]) x_train = data[:-nb_validation_samples] y_train = labels[:-nb_validation_samples] x_val = data[-nb_validation_samples:] y_val = labels[-nb_validation_samples:] print('Training model.') # train a 1D convnet with global maxpooling sequence_input = Input(shape=(self.MAX_SEQUENCE_LENGTH,), dtype='int32') x = Embedding(output_dim=100, input_dim=len(tokenizer.word_index), input_length=self.MAX_SEQUENCE_LENGTH)(sequence_input) x = Conv1D(128, 5, activation='relu')(x) x = MaxPooling1D(5)(x) x = Conv1D(128, 5, activation='relu')(x) x = MaxPooling1D(5)(x) x = Conv1D(128, 5, activation='relu')(x) x = MaxPooling1D(35)(x) x = Flatten()(x) x = Dense(128, activation='relu')(x) preds = Dense(labels_index, activation='softmax')(x) model = Model(sequence_input, preds) model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc']) # happy learning! model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=2, batch_size=64) score = model.evaluate(x_val, y_val, verbose=0) print('Test score:', score[0]) print('Test accuracy:', score[1])
def get_tokenizer(train_comments, nwords):
    print("getting tokenizer..")
    t = Tokenizer(num_words=nwords)
    texts = train_comments
    t.fit_on_texts(texts)
    sequences = t.texts_to_sequences(texts)
    return (t, sequences)
def question_to_input(df_q1, df_q2):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df_q1 + df_q2)
    encoded_1 = tokenizer.texts_to_sequences(df_q1)
    encoded_2 = tokenizer.texts_to_sequences(df_q2)
    question_input_train = sequence.pad_sequences(encoded_1, maxlen=15)
    question_input_test = sequence.pad_sequences(encoded_2, maxlen=15)
    return question_input_train, question_input_test
def test_tokenizer_oov_flag():
    """Test of the Out of Vocabulary (OOV) flag in Tokenizer."""
    x_train = ['This text has only known words']
    x_test = ['This text has some unknown words']  # 2 OOVs: some, unknown

    # Default, without OOV flag
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    assert len(x_test_seq[0]) == 4  # discards 2 OOVs

    # With OOV feature
    tokenizer = Tokenizer(oov_token='<unk>')
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    assert len(x_test_seq[0]) == 6  # OOVs marked in place
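# A minimal sketch of the OOV behaviour exercised by the test above. The exact
# integer assigned to the '<unk>' token differs between Keras versions (recent
# versions reserve index 1 for it), so only the sequence length and the count of
# OOV ids are checked here; the toy sentences are illustrative only.
from keras.preprocessing.text import Tokenizer

tok = Tokenizer(oov_token='<unk>')
tok.fit_on_texts(['This text has only known words'])
seq = tok.texts_to_sequences(['This text has some unknown words'])[0]
oov_id = tok.word_index['<unk>']
print(seq)                      # six ids, two of them equal to oov_id
assert len(seq) == 6
assert seq.count(oov_id) == 2   # 'some' and 'unknown' both map to '<unk>'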
def keras_classify(df):
    # Preprocessing: convert the words in `text` into integer ids
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing import sequence
    from keras.callbacks import EarlyStopping
    from sklearn.cross_validation import train_test_split

    print "----- Classification by Keras -----"
    max_features = 50000  # keep only the most frequent words

    # Tokenizer can only handle str, not unicode
    textraw = map(lambda x: x.encode('utf-8'), df.seg_word.values.tolist())

    token = Tokenizer(nb_words=max_features)
    # df.seg_word is already space-separated, so the Tokenizer can split it the
    # same way it splits English text
    token.fit_on_texts(textraw)
    # token records each word's id and count; replace the word text in textraw
    # with the word ids, e.g. textraw = ['a b c', 'c d e f'] ==> text_seq = [[1, 2, 3], [3, 4, 5, 6]]
    text_seq = token.texts_to_sequences(textraw)

    nb_classes = len(np.unique(df.label.values))
    print "num of features(vocabulary): ", len(token.word_counts)
    print "num of labels: ", nb_classes

    max_sent_len = np.max([len(s) for s in text_seq])
    print "max length of document is: ", max_sent_len
    median_sent_len = np.median([len(s) for s in text_seq])
    print "median length of document is: ", median_sent_len

    # the .values on df.label must not be dropped, otherwise np_utils.to_categorical fails later
    train_X, test_X, train_y, test_y = train_test_split(text_seq, df.label.values,
                                                        train_size=0.7, random_state=1)

    # train_X & test_X still have variable-length rows (one document each);
    # pad them into equal-length matrices before training
    seqlen = int(max_sent_len / 2 + median_sent_len / 2)
    X_train = sequence.pad_sequences(train_X, maxlen=seqlen, padding='post', truncating='post')
    X_test = sequence.pad_sequences(test_X, maxlen=seqlen, padding='post', truncating='post')

    # expand y into one-hot format so the network can end with a softmax
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test = np_utils.to_categorical(test_y, nb_classes)

    model = build_cnn_model(max_features, seqlen, nb_classes)
    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
    # train for 10 epochs with mini-batches of 32; earlystop decides when to halt
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=10,
              validation_split=0.1, callbacks=[earlystop])
    evaluate(earlystop.model, X_test, Y_test, test_y)

    model = build_lstm_model(max_features, seqlen, nb_classes)
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=1, validation_split=0.1)
    evaluate(model, X_test, Y_test, test_y)

    model = build_mixed_model(max_features, seqlen, nb_classes)
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=1, validation_split=0.1)
    evaluate(model, X_test, Y_test, test_y)

    graph = build_graph_model(max_features, seqlen, nb_classes)
    graph.fit({'input': X_train, 'output': Y_train}, nb_epoch=3, batch_size=32, validation_split=0.1)
    predict = graph.predict({'input': X_test}, batch_size=32)
    predict = predict['output']
    classes = predict.argmax(axis=1)
    acc = np_utils.accuracy(classes, test_y)
    print('Test accuracy: ', acc)
def df2seq(df, nb_words):
    textraw = df.EssayText.values.tolist()
    textraw = [line.encode('utf-8') for line in textraw]  # keras needs str

    # keras deals with tokens
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    text_seq = token.texts_to_sequences(textraw)

    return (text_seq, df.Score1.values)
def word_to_index(self, text, tok=None):
    real_text = [' '.join(z) for z in text]
    if tok is None:
        tokenizer = Tokenizer(lower=False, filters=" ")
        tokenizer.fit_on_texts(real_text)
    else:
        tokenizer = tok
    # no loop needed here: just pass the list of sentences (str) as input
    sequences = tokenizer.texts_to_sequences(real_text)
    # tokenizer.word_docs.items()
    return sequences, tokenizer
def load_mr(nb_words=20000, maxlen=64, embd_type='self'): """ :param embd_type: self vs. w2v :return: """ train_size = 0.8 df = pickled2df('data/mr.p') print(df.head()) train_X, test_X, train_y, test_y = train_test_split(df.text.values.tolist(), df.label.values, train_size=train_size, random_state=1) train_X_wds = train_X test_X_wds = test_X nb_classes = len(np.unique(train_y)) Y_train = np_utils.to_categorical(train_y, nb_classes) Y_test = np_utils.to_categorical(test_y, nb_classes) # tokenrize should be applied on train+test jointly n_ta = len(train_X) n_ts = len(test_X) print('train len vs. test len', n_ta, n_ts) textraw = [line.encode('utf-8') for line in train_X+test_X] # keras needs str # keras deals with tokens token = Tokenizer(nb_words=nb_words) token.fit_on_texts(textraw) textseq = token.texts_to_sequences(textraw) # stat about textlist print('nb_words: ',len(token.word_counts)) print('mean len: ',np.mean([len(x) for x in textseq])) train_X = textseq[0:n_ta] test_X = textseq[n_ta:] if(embd_type == 'self'): X_train = xcol_nninput_embd(train_X, nb_words, maxlen) X_test = xcol_nninput_embd(test_X, nb_words, maxlen) elif(embd_type == 'w2v'): w2v = load_w2v('data/Google_w2v.bin') print("loaded Google word2vec") X_train = sents_3dtensor(train_X_wds, maxlen, w2v) X_test = sents_3dtensor(test_X_wds, maxlen, w2v) else: print('wrong embd_type') print('X tensor shape: ', X_train.shape) print('Y tensor shape: ', Y_train.shape) return (X_train, Y_train, X_test, Y_test, nb_classes)
def load_csvs(traincsv, testcsv, nb_words, maxlen, embd_type): train_df = pd.read_csv(traincsv) test_df = pd.read_csv(testcsv) print(train_df.head()) train_X = train_df.text.values.tolist() test_X = test_df.text.values.tolist() # save for w2v embd train_X_wds = train_X test_X_wds = test_X train_y = train_df.label.values test_y = test_df.label.values nb_classes = len(np.unique(train_y)) Y_train = np_utils.to_categorical(train_y, nb_classes) Y_test = np_utils.to_categorical(test_y, nb_classes) # tokenrize should be applied on train+test jointly n_ta = len(train_X) n_ts = len(test_X) print('train len vs. test len', n_ta, n_ts) textraw = [line.encode('utf-8') for line in train_X+test_X] # keras needs str # keras deals with tokens token = Tokenizer(nb_words=nb_words) token.fit_on_texts(textraw) textseq = token.texts_to_sequences(textraw) # stat about textlist print('nb_words: ', len(token.word_counts)) print('mean len: ', np.mean([len(x) for x in textseq])) train_X = textseq[0:n_ta] test_X = textseq[n_ta:] if(embd_type == 'self'): X_train = sequence.pad_sequences(train_X, maxlen=maxlen, padding='post', truncating='post') X_test = sequence.pad_sequences(test_X, maxlen=maxlen, padding='post', truncating='post') elif(embd_type == 'w2v'): w2v = load_w2v('data/Google_w2v.bin') print("loaded Google word2vec") X_train = sents_3dtensor(train_X_wds, maxlen, w2v) X_test = sents_3dtensor(test_X_wds, maxlen, w2v) else: print('wrong embd_type') print('X tensor shape: ', X_train.shape) print('Y tensor shape: ', Y_train.shape) return(X_train, Y_train, X_test, Y_test, nb_classes)
def trans_to_indeces(self, train_data, test_data):
    total_data = train_data + test_data
    tokenizer = Tokenizer(char_level=True)
    tokenizer.fit_on_texts(total_data)
    word_index = tokenizer.word_index

    embed_matrix = np.zeros([len(word_index) + 1, self.embed_size])
    if self.embed_path:
        for word, embeds in self._embeddings_generator(self.embed_path):
            if word in word_index:
                embed_matrix[word_index[word]] = embeds

    train_sequences = tokenizer.texts_to_sequences(train_data)
    test_sequences = tokenizer.texts_to_sequences(test_data)
    lengths = [len(sequence) for sequence in train_sequences] + \
              [len(sequence) for sequence in test_sequences]
    max_len = np.max(lengths)
    train_sequences = pad_sequences(train_sequences, maxlen=max_len)
    test_sequences = pad_sequences(test_sequences, maxlen=max_len)
    self.time_step = max_len
    return train_sequences, test_sequences, embed_matrix, word_index
def load_csvs(traincsv, testcsv, nb_words, maxlen, embd_type, w2v): train_df = pd.read_csv(traincsv) test_df = pd.read_csv(testcsv) print(train_df.head()) train_X = train_df.text.values.tolist() test_X = test_df.text.values.tolist() # save for w2v embd train_X_wds = train_X test_X_wds = test_X train_y = train_df.label.values test_y = test_df.label.values nb_classes = len(np.unique(train_y)) Y_train = np_utils.to_categorical(train_y, nb_classes) Y_test = np_utils.to_categorical(test_y, nb_classes) # tokenrize should be applied on train+test jointly n_ta = len(train_X) n_ts = len(test_X) print("train len vs. test len", n_ta, n_ts) textraw = [line.encode("utf-8") for line in train_X + test_X] # keras needs str # keras deals with tokens token = Tokenizer(nb_words=nb_words) token.fit_on_texts(textraw) textseq = token.texts_to_sequences(textraw) # stat about textlist print("nb_words: ", len(token.word_counts)) print("mean len: ", np.mean([len(x) for x in textseq])) train_X = textseq[0:n_ta] test_X = textseq[n_ta:] if embd_type == "self": X_train = sequence.pad_sequences(train_X, maxlen=maxlen, padding="post", truncating="post") X_test = sequence.pad_sequences(test_X, maxlen=maxlen, padding="post", truncating="post") elif embd_type == "w2v": X_train = sents_3dtensor(train_X_wds, maxlen, w2v) X_test = sents_3dtensor(test_X_wds, maxlen, w2v) else: print("wrong embd_type") print("X tensor shape: ", X_train.shape) print("Y tensor shape: ", Y_train.shape) return (X_train, Y_train, X_test, Y_test, nb_classes)
def tokenize(texts, max_nb_words, max_sequence_length):
    '''Converts preprocessed texts into a list with one entry per text; each
    entry is a list whose ith element is the index of the ith word in that
    text, as given by word_index.'''
    tokenizer = Tokenizer(nb_words=max_nb_words)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    print()

    print('Padding sequences')
    # pads the start of each sequence with zeros, up to max_sequence_length
    data = pad_sequences(sequences, maxlen=max_sequence_length)
    print()
    return data, word_index, tokenizer
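# A minimal usage sketch for the tokenize() helper above, on a toy corpus; the
# texts and the two size limits are illustrative only. Note the helper uses the
# Keras 1 argument name nb_words (num_words in Keras 2+).
texts = ['the cat sat on the mat',
         'the dog sat on the log',
         'cats and dogs are friends']
data, word_index, tokenizer = tokenize(texts, max_nb_words=1000, max_sequence_length=10)
print(data.shape)       # (3, 10): one zero-padded row per text
print(len(word_index))  # number of distinct words in the corpus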
def prepare_tokenizer(words):
    t = Tokenizer()
    t.fit_on_texts(words)

    field_dict = dict()      # index -> word
    rev_field_dict = dict()  # word -> index
    for key, value in t.word_index.items():
        field_dict[value] = key
        rev_field_dict[key] = value

    vocab_size = len(t.word_index) + 1
    # print(vocab_size)

    # integer encode the documents
    encoded_docs = t.texts_to_sequences(words)
    # pad documents to a max length of 1 word
    padded_docs = pad_sequences(encoded_docs, maxlen=1, padding='post')
    # print(padded_docs)
    return padded_docs, field_dict, rev_field_dict, vocab_size
def load_labeled_data(datadir, tokenizer=None):
    print('Processing text dataset in {}'.format(datadir))
    texts = []         # list of text samples
    labels_index = {}  # dictionary mapping label name to numeric id
    labels = []        # list of label ids
    for name in sorted(os.listdir(datadir)):
        if name == 'unsup':
            continue
        path = os.path.join(datadir, name)
        if os.path.isdir(path):
            # each label corresponds to a separate directory
            label_id = len(labels_index)
            labels_index[name] = label_id
            for fname in sorted(os.listdir(path)):
                if fname[0].isdigit():
                    fpath = os.path.join(path, fname)
                    if sys.version_info < (3,):
                        f = open(fpath)
                    else:
                        f = open(fpath, encoding='latin-1')
                    texts.append(f.read())
                    f.close()
                    labels.append(label_id)

    print('Found {} texts in {}.'.format(len(texts), datadir))

    # finally, vectorize the text samples into a 2D integer tensor
    if not tokenizer:
        tokenizer = Tokenizer(nb_words=MAX_FEATURES)
        tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels = np.asarray(labels)  # to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    # randomize order
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]

    return data, labels, tokenizer
def build_dataset(data):
    tokenizer = Tokenizer(nb_words=1000)
    all_review_user = ""
    for single_example in data:
        all_review_user += single_example['rev'].encode('utf-8')
    tokenizer.fit_on_texts(all_review_user)

    X = []
    y = []
    for single_example in data:
        rating = int(float(single_example['rat']))
        review_seq = tokenizer.texts_to_sequences(single_example['rev'].encode('utf-8'))
        # print review_seq
        x = list(itertools.chain(*review_seq))
        X.append(x)
        y.append(rating)
        # break

    # X = sequence.pad_sequences(X, maxlen=max_len)
    X = np.asarray(X)
    y = np.asarray(y)
    return X, y
def prodPadData(self, totalTextList, nb_words):
    '''
    Produce the padded word-sequence data.
    The order of the total word sequence must correspond to the embedding matrix
    (totalTextList here must be the same as the one passed to prodPreWordEmbedingMat).
    '''
    MAX_NB_WORDS = int(nb_words / 1000) * 1000
    MAX_SEQUENCE_LENGTH = 20
    print('MAX_NB_WORDS: ' + str(MAX_NB_WORDS) + ' MAX_SEQUENCE_LENGTH: ' + str(MAX_SEQUENCE_LENGTH))

    # vectorize the text samples into a 2D integer tensor
    tokenizer = Tokenizer(nb_words=MAX_NB_WORDS, lower=False)
    # for text in totalTextList:
    #     print(text)
    tokenizer.fit_on_texts(totalTextList)
    totalSequences = tokenizer.texts_to_sequences(totalTextList)
    pad_data = pad_sequences(totalSequences, maxlen=MAX_SEQUENCE_LENGTH)
    return MAX_SEQUENCE_LENGTH, pad_data
def tokenize_and_process(text, vocab_size=10000):
    # Will hold the cleaned text
    text_clean = []

    # List of stop words / unwanted tokens
    stop = stopwords.words('english') + list(string.punctuation)
    for t in text:
        text_clean.append(" ".join([i for i in word_tokenize(t.lower())
                                    if i not in stop and i[0] != "'"]))

    # Instantiate tokenizer
    T = Tokenizer(num_words=vocab_size)
    # Fit the tokenizer on the cleaned text
    T.fit_on_texts(text_clean)

    # Turn our input text into sequences of integer indices
    data = T.texts_to_sequences(text_clean)

    word_to_idx = T.word_index
    idx_to_word = {v: k for k, v in word_to_idx.items()}

    return data, word_to_idx, idx_to_word, T
def prepare_word_tokenizer(texts):
    if not os.path.exists('data/tokenizer.pkl'):  # check if a prepared tokenizer is available
        tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)  # if not, create a new Tokenizer
        tokenizer.fit_on_texts(texts)  # prepare the word index map

        with open('data/tokenizer.pkl', 'wb') as f:
            pickle.dump(tokenizer, f)  # save the prepared tokenizer for fast access next time
        print('Saved tokenizer.pkl')
    else:
        with open('data/tokenizer.pkl', 'rb') as f:  # simply load the prepared tokenizer
            tokenizer = pickle.load(f)
        print('Loaded tokenizer.pkl')

    sequences = tokenizer.texts_to_sequences(texts)  # transform text into lists of integer indices
    word_index = tokenizer.word_index  # obtain the word index map

    print('Average sequence length: {}'.format(np.mean(list(map(len, sequences)), dtype=int)))
    print('Max sequence length: {}'.format(np.max(list(map(len, sequences)))))

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)  # pad to the user-defined max length
    return (data, word_index)
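# A minimal usage sketch for the caching helper above. MAX_NUM_WORDS and
# MAX_SEQUENCE_LENGTH are module-level constants the function assumes, and the
# toy corpus is illustrative only. The first call fits and pickles the tokenizer
# under data/tokenizer.pkl; later calls reuse the pickled state.
import os
import pickle
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

MAX_NUM_WORDS = 20000
MAX_SEQUENCE_LENGTH = 100
os.makedirs('data', exist_ok=True)

corpus = ['the quick brown fox', 'jumps over the lazy dog']
data, word_index = prepare_word_tokenizer(corpus)
print(data.shape)       # (2, MAX_SEQUENCE_LENGTH)
print(len(word_index))  # number of distinct words seen when the tokenizer was fitted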
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text, result = out_word, result + ' ' + out_word
    return result

# source text
f = open('xaf', 'r')
data = f.read()

# integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]

# determine the vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# create word -> word sequences
sequences = list()
for i in range(1, len(encoded)):
    sequence = encoded[i - 1:i + 1]
    sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

# split into X and y elements
sequences = np.array(sequences)
X, y = sequences[:, 0], sequences[:, 1]

# one hot encode outputs
y = to_categorical(y, num_classes=vocab_size)
print("X_train len: {}".format(len(X_train))) print("y_train len: {}".format(len(y_train))) print("X_test len: {}".format(len(X_test))) print("y_test len: {}".format(len(y_test))) print("Split train and test data.") # truncate and pad input sequences max_text_length = 700 #7356 #from calculations t = Tokenizer() t.fit_on_texts(X_train) vocab_size = len(t.word_index) + 1 #vocab_size; size of vocab of training text = 256994 # integer encode the documents encoded_train = t.texts_to_sequences(X_train) encoded_test = t.texts_to_sequences(X_test) # pad documents to a max length X_train = sequence.pad_sequences(encoded_train, maxlen=max_text_length) X_test = sequence.pad_sequences(encoded_test, maxlen=max_text_length) print("Padded data.") #WORD EMBEDDINGS embeddings_index = dict() f = open('word_vectors/glove.6B/glove.6B.300d.txt', encoding='utf-8') #Glove data #f = open('word_vectors/pub.50.vec/pub.50.vec', encoding='latin-1') #pubmed data embedding_vector_length = 300 for line in f: values = line.split() word = values[0]
published_post = use_data['retweet'] == 1
published_post.sum()

# +
maxlen = 50
train = 0.7
validation = 0.1
max_words = 35000

# shuffle the data randomly
use_data_s = use_data.sample(frac=1, random_state=1)

# build the word index
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(use_data_s['tweet2'])
sequences = tokenizer.texts_to_sequences(use_data_s['tweet2'])
word_index = tokenizer.word_index
print("Found {} unique tokens.".format(len(word_index)))

data = pad_sequences(sequences, maxlen=maxlen)

# convert the labels to a binary (one-hot) matrix
categorical_labels = to_categorical(use_data_s['retweet'])
labels = np.asarray(categorical_labels)
print("Shape of data tensor:{}".format(data.shape))
print("Shape of label tensor:{}".format(labels.shape))

indices = [int(len(labels) * n) for n in [train, train + validation]]
x_train, x_validation, x_test = np.split(data, indices)
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=2500, split=' ')
tokenizer.fit_on_texts(x)

# Keras has a built-in API that makes preparing text for computation easier. The
# Tokenizer class has several attributes that can be used for feature preparation.
# The example below shows what the tokenizer actually does.

## CODE
tokenizer = Tokenizer()
texts = [
    "The sun is shining in June!",
    "September is grey.",
    "Life is beautiful in August.",
    "I like it",
    "This and other things?"
]
tokenizer.fit_on_texts(texts)
print(tokenizer.word_index)
tokenizer.texts_to_sequences(["June is beautiful and I like it!"])

## OUTPUT
# {'sun': 3, 'september': 4, 'june': 5, 'other': 6, 'the': 7, 'and': 8, 'like': 9, 'in': 2, 'beautiful': 11, 'grey': 12, 'life': 17, 'it': 16, 'i': 14, 'is': 1, 'august': 15, 'things': 10, 'shining': 13, 'this': 18}
# [[5, 1, 11, 8, 14, 9, 16]]

# The tokenizer assigns an index value to every word in the sentences and can
# then represent a new sentence with those index values. Because the text corpus
# we use contains a large number of distinct words, we set an upper limit and
# keep only the 2500 most frequent words.

from keras.preprocessing.sequence import pad_sequences

X = tokenizer.texts_to_sequences(x)
X = pad_sequences(X)

# We have now converted the text into number sequences as shown above and padded
# them. Sentences can have different lengths, so their sequences differ in
# length too; pad_sequences therefore finds the longest sentence and pads every
# shorter sequence with 0s to match that length.

## Pad Sequences Example
pad_sequences([[1, 2, 3], [3, 4, 5, 6], [7, 8]])
# array([[0, 1, 2, 3],
#        [3, 4, 5, 6],
#        [0, 0, 7, 8]])
def data(): start_train = '2008-08-08' end_train = '2014-12-31' start_val = '2015-01-02' end_val = '2016-07-01' max_sequence_length = 110 vocab_size = 3000 # read csv file DJIA = pd.read_csv("Combined_News_DJIA.csv", usecols=[ 'Date', 'Label', 'Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Top6', 'Top7', 'Top8', 'Top9', 'Top10', 'Top11', 'Top12', 'Top13', 'Top14', 'Top15', 'Top16', 'Top17', 'Top18', 'Top19', 'Top20', 'Top21', 'Top22', 'Top23', 'Top24', 'Top25' ]) # create training and testing dataframe on 80 % and 20 % respectively Training_dataframe = DJIA[(DJIA['Date'] >= start_train) & (DJIA['Date'] <= end_train)] Testing_dataframe = DJIA[(DJIA['Date'] >= start_val) & (DJIA['Date'] <= end_val)] attrib = DJIA.columns.values x_train = Training_dataframe.loc[:, attrib[2:len(attrib)]] y_train = Training_dataframe.iloc[:, 1] x_test = Testing_dataframe.loc[:, attrib[2:len(attrib)]] y_test = Testing_dataframe.iloc[:, 1] y_train = to_categorical(y_train) y_test = to_categorical(y_test) # merge the 25 news together to form a single signal merged_x_train = x_train.apply(lambda x: ''.join(str(x.values)), axis=1) merged_x_test = x_test.apply(lambda x: ''.join(str(x.values)), axis=1) # =============== # pre-process # =============== merged_x_train = merged_x_train.apply(lambda x: pp.process(x)) merged_x_test = merged_x_test.apply(lambda x: pp.process(x)) #merged_x_train = merged_x_train.apply(lambda x: pp.lemmanouns(pp.lemmaverbs(pp.lemmaadjectives(x)))) #merged_x_test = merged_x_test.apply(lambda x: pp.lemmanouns(pp.lemmaverbs(pp.lemmaadjectives(x)))) #merged_x_train = merged_x_train.apply(lambda x: pp.stemmer(x)) #merged_x_test = merged_x_test.apply(lambda x: pp.stemmer(x)) # remove stopwords in the training and testing set train_without_sw = [] test_without_sw = [] train_temporary = list(merged_x_train) test_temporary = list(merged_x_test) s = pp.stop_words for i in train_temporary: f = i.split(' ') for j in f: if j in s: f.remove(j) s1 = "" for k in f: s1 += k + " " train_without_sw.append(s1) merged_x_train = train_without_sw for i in test_temporary: f = i.split(' ') for j in f: if j in s: f.remove(j) s1 = "" for k in f: s1 += k + " " test_without_sw.append(s1) merged_x_test = test_without_sw # tokenize and create sequences tokenizer = Tokenizer(num_words=vocab_size) tokenizer.fit_on_texts(merged_x_train) x_train_sequence = tokenizer.texts_to_sequences(merged_x_train) x_test_sequence = tokenizer.texts_to_sequences(merged_x_test) word_index = tokenizer.word_index input_dim = len(word_index) + 1 print('Found %s unique tokens.' % len(word_index)) x_train_sequence = pad_sequences(x_train_sequence, maxlen=max_sequence_length) x_test_sequence = pad_sequences(x_test_sequence, maxlen=max_sequence_length) print('Shape of training tensor:', x_train_sequence.shape) print(x_train_sequence) print('Shape of testing tensor:', x_test_sequence.shape) print(x_test_sequence) """ Data providing function: This function is separated from create_model() so that hyperopt won't reload data for each evaluation run. """ return x_train_sequence, y_train, x_test_sequence, y_test
class SummaryGeneratorClass: def __init__(self): self.news = [] self.summaries = [] bNews = "./BBC News Summary/News Articles/business/" eNews = "./BBC News Summary/News Articles/entertainment/" pNews = "./BBC News Summary/News Articles/politics/" sNews = "./BBC News Summary/News Articles/sport/" tNews = "./BBC News Summary/News Articles/tech/" self.readNews(bNews) self.readNews(eNews) self.readNews(pNews) self.readNews(sNews) self.readNews(tNews) bSumm = "./BBC News Summary/Summaries/business/" eSumm = "./BBC News Summary/Summaries/entertainment/" pSumm = "./BBC News Summary/Summaries/politics/" sSumms = "./BBC News Summary/Summaries/sport/" tSumm = "./BBC News Summary/Summaries/tech/" self.readSummaries(bSumm) self.readSummaries(eSumm) self.readSummaries(pSumm) self.readSummaries(sSumms) self.readSummaries(tSumm) self.contractionMapping = contractionMapping # for i in range(len(self.news)): # self.news[i] = self.textCleaner(self.news[i]) # for i in range(len(self.summaries)): # self.summaries[i] = '_START_ '+ self.textCleaner(self.summaries[i]) + ' _END_' self.a = [] self.b = [] for i in range(len(self.news)): self.a.append(self.textCleaner(self.news[i])) for i in range(len(self.summaries)): self.b.append('beginmush ' + self.textCleaner(self.summaries[i]) + ' endmush') self.df = pd.DataFrame({'Text': self.a, 'Summary': self.b}) def readNews(self, directory): for filename in os.listdir(directory): with open(directory + filename, errors='replace') as infile: i = 1 s = "" try: for line in infile.readlines(): if i != 0: if (line.isspace() == False): s += str(line) i += 1 # s = re.sub('\n', '', s) s = re.sub('\'', '', s) self.news.append(s) except: print(filename + ' is throwing an error') def readSummaries(self, directory): for filename in os.listdir(directory): with open(directory + filename, errors='replace') as infile: i = 0 s = "" try: for line in infile.readlines(): if (line.isspace() == False): s += str(line) # s = re.sub('\n', '', s) s = re.sub('\'', '', s) self.summaries.append(s) except: print(filename + ' is throwing an error') def textCleaner(self, string): stopWords = set(stopwords.words('english')) string = string.lower() string = ' '.join([ self.contractionMapping[t] if t in self.contractionMapping else t for t in string.split(" ") ]) #remove escape characters string = re.sub("(\\t)", ' ', str(string)) string = re.sub("(\\r)", ' ', str(string)) string = re.sub("(\\n)", ' ', str(string)) #remove 's string = re.sub(r"'s\b", "", str(string)) #remove extra spaces string = ' '.join(string.split()) #remove punctuations string = re.sub("[^a-zA-Z]", " ", str(string)) #remove short words tokens = [w for w in string.split() if not w in stopWords] long_words = [] for i in tokens: if len(i) >= 3: #removing short word long_words.append(i) string = (" ".join(long_words)).strip() return string def textCount(self): tCount = [] summaryCount = [] for string in self.df['Text']: tCount.append(len(string.split())) for sent in self.df['Summary']: summaryCount.append(len(sent.split())) graph = pd.DataFrame() graph['Text'] = tCount graph['Summary'] = summaryCount graph.hist(bins=100) plt.show() self.maxTextLen = 400 self.maxSummaryLen = 200 cnt = 0 for i in self.df['Summary']: if (len(i.split()) <= self.maxSummaryLen): cnt = cnt + 1 print(cnt / len(self.df['Summary'])) cnt = 0 for i in self.df['Text']: print(len(i.split())) print((self.maxTextLen)) if (len(i.split()) <= self.maxTextLen): cnt = cnt + 1 print(cnt / len(self.df['Text'])) def filterDataFrameUsingMaxTextCountAndMaxSummaryCount(self): 
textArray = np.array(self.df['Text']) summaryArray = np.array(self.df['Summary']) shorterTextArray = [] shorterSummaryArray = [] for i in range(len(textArray)): if (len(summaryArray[i].split()) <= self.maxSummaryLen and len(textArray[i].split()) <= self.maxTextLen): shorterTextArray.append(textArray[i]) shorterSummaryArray.append(summaryArray[i]) self.shorterDf = pd.DataFrame({ 'Text': shorterTextArray, 'Summary': shorterSummaryArray }) def splitData(self): self.xTrain, self.xVal, self.yTrain, self.yVal = train_test_split( np.array(self.shorterDf['Text']), np.array(self.shorterDf['Summary']), test_size=0.2, random_state=0, shuffle=True) def tokenizeTrainingData(self): tokenizerX = Tokenizer() tokenizerX.fit_on_texts(list(self.xTrain)) thr = 4 count = 0 totalCount = 0 freq = 0 totalFreq = 0 for key, value in tokenizerX.word_counts.items(): totalCount += 1 totalFreq += 1 if (value < thr): count += 1 freq += value print("% of rare words in vocabulary:", (count / totalCount) * 100) print("Total Coverage of rare words:", (freq / totalFreq) * 100) #Text Tokenizer self.tokenizerX = Tokenizer(num_words=totalCount - count) self.tokenizerX.fit_on_texts(list(self.xTrain)) self.xTrainSeq = self.tokenizerX.texts_to_sequences(self.xTrain) self.xValSeq = self.tokenizerX.texts_to_sequences(self.xVal) self.xTrainSeq = pad_sequences(self.xTrainSeq, maxlen=self.maxTextLen, padding='post') self.xValSeq = pad_sequences(self.xValSeq, maxlen=self.maxTextLen, padding='post') self.xVocabularySize = self.tokenizerX.num_words + 1 #Summary Tokenizer tokenizerY = Tokenizer() tokenizerY.fit_on_texts(list(self.yTrain)) for key, value in tokenizerY.word_counts.items(): totalCount += 1 totalFreq += 1 if (value < thr): count += 1 freq += value print("% of rare words in vocabulary:", (count / totalCount) * 100) print("Total Coverage of rare words:", (freq / totalFreq) * 100) self.tokenizerY = Tokenizer(num_words=totalCount - count) self.tokenizerY.fit_on_texts(list(self.yTrain)) self.yTrainSeq = self.tokenizerY.texts_to_sequences(self.yTrain) self.yValSeq = self.tokenizerY.texts_to_sequences(self.yVal) self.yTrainSeq = pad_sequences(self.yTrainSeq, maxlen=self.maxSummaryLen, padding='post') self.yValSeq = pad_sequences(self.yValSeq, maxlen=self.maxSummaryLen, padding='post') self.yVocabularySize = self.tokenizerY.num_words + 1 def getSummaries(self): return self.summaries def getNews(self): return self.news def getDF(self): return self.df
train_df = pd.read_csv("data/train.csv") print("Train shape : ", train_df.shape) # fill up the missing values train_X = train_df["question_text"].fillna("_na_").values train_X = train_df['question_text'] train_X = train_X.tolist() qid_train = train_df['qid'] qid_train = qid_train.tolist() # Tokenize the sentences tokenizer = Tokenizer(num_words=max_features, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'’“”') tokenizer.fit_on_texts(train_X) train_X = tokenizer.texts_to_sequences(train_X) # Pad the sentences trunc = 'pre' train_X = pad_sequences(train_X, maxlen=maxlen, truncating=trunc) # Get the target values train_y = train_df['target'].values test_X = train_X[1000000:] train_X = train_X[:1000000] test_y = train_y[1000000:] train_y = train_y[:1000000] from scipy.io import loadmat embedding_matrix = loadmat('data/embedding_matrix2.mat')["embedding_matrix"] print(embedding_matrix.shape)
    reader = csv.DictReader(csvfile, delimiter='\t')
    for row in reader:
        question1.append(row['question1'])
        question2.append(row['question2'])
        is_duplicate.append(row['is_duplicate'])

print('Question pairs: %d' % len(question1))

# ## Build tokenized word index

# In[4]:

questions = question1 + question2
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(questions)
question1_word_sequences = tokenizer.texts_to_sequences(question1)
question2_word_sequences = tokenizer.texts_to_sequences(question2)
word_index = tokenizer.word_index

print("Words in index: %d" % len(word_index))

# ## Download and process GloVe embeddings

# In[5]:

if not exists(KERAS_DATASETS_DIR + GLOVE_ZIP_FILE):
    zipfile = ZipFile(get_file(GLOVE_ZIP_FILE, GLOVE_ZIP_FILE_URL))
    zipfile.extract(GLOVE_FILE, path=KERAS_DATASETS_DIR)

print("Processing", GLOVE_FILE)
def trainEmbeddingLayers(imgData):
    # define documents
    docs = imgData.de
    # define class labels
    labels = array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

    # prepare tokenizer
    t = Tokenizer()
    t.fit_on_texts(docs)
    vocab_size = len(t.word_index) + 1

    # integer encode the documents
    encoded_docs = t.texts_to_sequences(docs)
    print(encoded_docs)

    # pad documents to a max length of 4 words
    max_length = 4
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    print(padded_docs)

    # load the whole embedding into memory
    embeddings_index = dict()
    f = open('../glove_data/glove.6B/glove.6B.100d.txt')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Loaded %s word vectors.' % len(embeddings_index))

    # create a weight matrix for words in training docs
    embedding_matrix = zeros((vocab_size, 100))
    for word, i in t.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # define model
    model = Sequential()
    e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=4, trainable=False)
    model.add(e)
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    # compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    # summarize the model
    print(model.summary())
    # fit the model
    model.fit(padded_docs, labels, epochs=50, verbose=0)
    # evaluate the model
    loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
    print('Accuracy: %f' % (accuracy * 100))
def main(): news = pd.read_csv('data/data_seged_monpa.csv') news_tag = news[['text', 'replyType', 'seg_text']] news_tag = news_tag[news_tag['replyType'] != 'NOT_ARTICLE'] types = news_tag.replyType.unique() dic = {} for i, types in enumerate(types): dic[types] = i print(dic) news_tag['type_id'] = news_tag.replyType.apply(lambda x: dic[x]) labels = news_tag.replyType.apply(lambda x: dic[x]) news_tag = find_null(news_tag) X = news_tag.seg_text y = news_tag.type_id print(y.value_counts()) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) print(X_train.shape, 'training data ') print(X_test.shape, 'testing data') X_train = transfer_lsit(X_train) X_test = transfer_lsit(X_test) all_data = pd.concat([X_train, X_test]) # embedding setting EMBEDDING_DIM = 100 NUM_WORDS = 2764036 vocabulary_size = NUM_WORDS embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM)) word_vectors = word2vec.Word2Vec.load("output/word2vec.model") embedding_matrix = to_embedding(EMBEDDING_DIM, NUM_WORDS, vocabulary_size, embedding_matrix, word_vectors, X_train, X_test) del (word_vectors) embedding_layer = Embedding(vocabulary_size, EMBEDDING_DIM, weights=[embedding_matrix], trainable=True) tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'') tokenizer.fit_on_texts(all_data.values) train_text = X_train.values train_index = X_train.index sequences_train = tokenizer.texts_to_sequences(train_text) X_train = pad_sequences(sequences_train, maxlen=600) y_train = to_categorical(np.asarray(labels[train_index])) print('Shape of X train:', X_train.shape) print('Shape of label train:', y_train.shape) test_text = X_test.values test_index = X_test.index sequences_test = tokenizer.texts_to_sequences(test_text) X_test = pad_sequences(sequences_test, maxlen=X_train.shape[1]) y_test = to_categorical(np.asarray(labels[test_index])) sequence_length = X_train.shape[1] filter_sizes = [2, 3, 4] num_filters = 128 drop = 0.2 penalty = 0.0001 inputs = Input(shape=(sequence_length, )) embedding = embedding_layer(inputs) reshape = Reshape((sequence_length, EMBEDDING_DIM, 1))(embedding) conv_0 = Conv2D(num_filters, (filter_sizes[1], EMBEDDING_DIM), activation='softmax', kernel_regularizer=regularizers.l2(penalty))(reshape) conv_1 = Conv2D(num_filters, (filter_sizes[2], EMBEDDING_DIM), activation='relu', kernel_regularizer=regularizers.l2(penalty))(reshape) conv_2 = Conv2D(num_filters, (filter_sizes[2], EMBEDDING_DIM), activation='relu', kernel_regularizer=regularizers.l2(penalty))(reshape) maxpool_0 = MaxPooling2D((sequence_length - filter_sizes[1] + 1, 1), strides=(1, 1))(conv_0) maxpool_1 = MaxPooling2D((sequence_length - filter_sizes[2] + 1, 1), strides=(1, 1))(conv_1) maxpool_2 = MaxPooling2D((sequence_length - filter_sizes[2] + 1, 1), strides=(1, 1))(conv_2) merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2], axis=1) dropout = Dropout(drop)(merged_tensor) flatten = Flatten()(dropout) reshape = Reshape((3 * num_filters, ))(flatten) output = Dense(units=2, activation='softmax', kernel_regularizer=regularizers.l2(penalty))(reshape) # this creates a model that includes model = Model(inputs, output) model.summary() adam = Adam(lr=1e-3) model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['acc']) callbacks = [EarlyStopping(monitor='val_loss')] history = model.fit(X_train, y_train, batch_size=64, epochs=50, verbose=1, validation_split=0.1, callbacks=callbacks) predictions = model.predict(X_test) matrix = confusion_matrix(y_test.argmax(axis=1), 
predictions.argmax(axis=1)) print(matrix) # Plot training & validation accuracy values plt.plot(history.history['acc']) plt.plot(history.history['val_acc']) plt.title('Model accuracy') plt.ylabel('Accuracy') plt.xlabel('Epoch') plt.legend(['Train', 'val'], loc='upper left') plt.savefig("output/acc.png") score, acc = model.evaluate(X_test, y_test) print('Test accuracy:', acc) plot_model(model, to_file='output/model.png', show_shapes=False, show_layer_names=False)
question2 = []
is_duplicate = []
with open(KERAS_DATASETS_DIR + QUESTION_PAIRS_FILE, encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t')
    for row in reader:
        question1.append(row['question1'])
        question2.append(row['question2'])
        is_duplicate.append(row['is_duplicate'])

print('Question pairs: %d' % len(question1))

# Build tokenized word index
questions = question1 + question2
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(questions)
question1_word_sequences = tokenizer.texts_to_sequences(question1)
question2_word_sequences = tokenizer.texts_to_sequences(question2)
word_index = tokenizer.word_index

print("Words in index: %d" % len(word_index))

# Download and process GloVe embeddings
if not exists(KERAS_DATASETS_DIR + GLOVE_ZIP_FILE):
    zipfile = ZipFile(get_file(GLOVE_ZIP_FILE, GLOVE_ZIP_FILE_URL))
    zipfile.extract(GLOVE_FILE, path=KERAS_DATASETS_DIR)

print("Processing", GLOVE_FILE)

embeddings_index = {}
with open(KERAS_DATASETS_DIR + GLOVE_FILE, encoding='utf-8') as f:
    for line in f:
word_counter = collections.Counter([
    word for sentence in tqdm(X_train, total=len(X_train))
    for word in sentence.split()
])

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=None,
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                      lower=True,
                      split=' ',
                      char_level=False)
tokenizer.fit_on_texts(list(X_train))
tokenized_train = tokenizer.texts_to_sequences(X_train)
tokenized_test = tokenizer.texts_to_sequences(X_test)

word_index = tokenizer.word_index
vocab_size = len(word_index)

longest = max(len(seq) for seq in tokenized_train)
average = np.mean([len(seq) for seq in tokenized_train])
stdev = np.std([len(seq) for seq in tokenized_train])
max_len = int(average + stdev * 3)

processed_X_train = pad_sequences(
    tokenized_train, maxlen=max_len, padding='post',
    truncating='post')  # pad all sequences to the same length
processed_X_test = pad_sequences(tokenized_test, maxlen=max_len, padding='post',
print('nb_classes = %s' % nb_classes)

y_train = labeler.transform(raw_train_labels)
Y_train = np_utils.to_categorical(y_train, nb_classes)
y_valid = labeler.transform(raw_valid_labels)
Y_valid = np_utils.to_categorical(y_valid, nb_classes)
y_test = labeler.transform(raw_test_labels)
Y_test = np_utils.to_categorical(y_test, nb_classes)

print('Tokenizing X_train')
tokenizer = Tokenizer()
tokenizer.fit_on_texts(raw_train['NARRATIVE'])
tokenizer.word_index['BLANK_WORD'] = 0

X_train = tokenizer.texts_to_sequences(raw_train['NARRATIVE'])
X_valid = tokenizer.texts_to_sequences(raw_valid['NARRATIVE'])
X_test = tokenizer.texts_to_sequences(raw_test['NARRATIVE'])

X_train = pad_sequences(X_train, maxlen=max_len)
X_valid = pad_sequences(X_valid, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

print('X_train shape:', X_train.shape)
print('X_valid shape:', X_valid.shape)
print('X_test shape:', X_test.shape)

vocab_size = len(tokenizer.word_index)

model = Sequential()
model.add(LSTM(50, dropout_W=0.5, dropout_U=0.5, return_sequences=True,
               input_shape=(max_len, vocab_size)))
# %%
test_x.head()

# %% [markdown]
# ## Data Preprocessing

# %%
max_words = 20000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(list(train_x))

# %%
tokenized_train_x = tokenizer.texts_to_sequences(train_x)
tokenized_valid_x = tokenizer.texts_to_sequences(valid_x)
tokenized_test_x = tokenizer.texts_to_sequences(test_x)

# %%
len(tokenized_train_x), len(tokenized_valid_x), len(tokenized_test_x)

# %%
maxlen = 200
X_train = pad_sequences(tokenized_train_x, maxlen=maxlen)
X_valid = pad_sequences(tokenized_valid_x, maxlen=maxlen)
X_test = pad_sequences(tokenized_test_x, maxlen=maxlen)
    additional_features.append(feature_getter(i))
additional_features = np.asarray(additional_features)

for i in sentences:
    temp1 = np.zeros((1, EMBEDDING_DIM))
    for w in i:
        if (w in glove_emb):
            temp1 += glove_emb[w]
    temp1 /= len(i)
    doctovec.append(temp1.reshape(300, ))
doctovec = np.asarray(doctovec)

tokenizer = Tokenizer()  # num_words=MAX_NB_WORDS would limit the vocabulary size
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)  # returns list of sequences
word_index = tokenizer.word_index  # dictionary mapping words to indices
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print(additional_features.shape, data.shape)
print('Shape of data tensor:', data.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
doctovec = doctovec[indices]
additional_features = additional_features[indices]
train = pd.read_csv(TRAIN_DATA)
test = pd.read_csv(TEST_DATA)
submission = pd.read_csv(SAMPLE_SUB)

# Replace missing values in training and test set
list_train = train["comment_text"].fillna("_na_").values
classes = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]
y = train[classes].values
list_test = test["comment_text"].fillna("_na_").values

# Use Keras preprocessing tools
tok = Tokenizer(num_words=max_features)
tok.fit_on_texts(list(list_train))
tokenized_train = tok.texts_to_sequences(list_train)
tokenized_test = tok.texts_to_sequences(list_test)

# Pad vectors with 0s for sentences shorter than maxlen
X_t = pad_sequences(tokenized_train, maxlen=maxlen)
X_te = pad_sequences(tokenized_test, maxlen=maxlen)


# Read word vectors into a dictionary
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')


embeddings_index = dict(
    get_coefs(*o.strip().split(" ")) for o in open(EMBEDDING))
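# A sketch of the typical next step (not part of the original snippet above):
# turn the embeddings_index dictionary into a weight matrix aligned with the
# tokenizer's word_index. embed_size is assumed here to match the dimensionality
# of the vectors in the EMBEDDING file.
embed_size = 300
word_index = tok.word_index
nb_words = min(max_features, len(word_index)) + 1
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue  # only the max_features most frequent words get vectors
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector  # words without a vector stay all-zeros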
    r_len.append(l)

MAX_REVIEW_LEN = np.max(r_len)
MAX_REVIEW_LEN

max_features = num_unique_word
max_words = MAX_REVIEW_LEN
batch_size = 128
epochs = 10
num_classes = 5

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train_text))

X_train = tokenizer.texts_to_sequences(X_train_text)
X_val = tokenizer.texts_to_sequences(X_val_text)
X_test = tokenizer.texts_to_sequences(test_text)

X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_val = sequence.pad_sequences(X_val, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)
print(X_train.shape, X_val.shape, X_test.shape)

model2 = Sequential()
model2.add(Embedding(max_features, 100, input_length=max_words))
model2.add(Dropout(0.2))
model2.add(Conv1D(64, kernel_size=3, padding='same', activation='relu', strides=1))
            'encoding': 'latin-1'
        }
        with open(fpath, **args) as f:
            t = f.read()
            i = t.find('\n\n')  # skip header
            if 0 < i:
                t = t[i:]
            texts.append(t)
        labels.append(label_id)

print('Found %s texts.' % len(texts))

# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
# wants index 23 but gets 24
# first_text = list(csv.reader(szoveg, skipinitialspace=True))
# print(first_text[0:20])

# this way it returns characters instead of words
# print(list(csv.reader(szoveg, skipinitialspace=True))[0:4])

# VECTORIZING THE TEXT SAMPLES

# Tokenization: words are replaced by indices, and every occurrence of the same
# word gets the same number... the number assigned to each word is not random(?)!
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)  # tokenize, keeping at most MAX_NB_WORDS words
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)  # one text per row: its words follow one another as a list

word_index = tokenizer.word_index  # number of distinct tokens
print('Number of distinct words across all texts: ', len(word_index))
print('tokens: \n')
# print(word_index)

# ----------------------------------
import json  # as requested in comment

# word_index = {'word_index': word_index}
with open('target.txt', 'w') as file:
    txt = ""
    for key in word_index:
class ToxModel(): """Toxicity model.""" def __init__(self, model_name=None, model_dir=DEFAULT_MODEL_DIR, embeddings_path=DEFAULT_EMBEDDINGS_PATH, hparams=None): self.model_dir = model_dir self.embeddings_path = embeddings_path self.model_name = model_name self.model = None self.tokenizer = None self.hparams = DEFAULT_HPARAMS.copy() if hparams: self.update_hparams(hparams) if model_name: self.load_model_from_name(model_name) self.print_hparams() def print_hparams(self): print('Hyperparameters') print('---------------') for k, v in six.iteritems(self.hparams): print('{}: {}'.format(k, v)) print('') def update_hparams(self, new_hparams): self.hparams.update(new_hparams) def get_model_name(self): return self.model_name def save_hparams(self, model_name): self.hparams['model_name'] = model_name with open( os.path.join(self.model_dir, '%s_hparams.json' % self.model_name), 'w') as f: json.dump(self.hparams, f, sort_keys=True) def load_model_from_name(self, model_name): self.model = load_model( os.path.join(self.model_dir, '%s_model.h5' % model_name)) self.tokenizer = six.moves.cPickle.load( open( os.path.join(self.model_dir, '%s_tokenizer.pkl' % model_name), 'rb')) with open( os.path.join(self.model_dir, '%s_hparams.json' % self.model_name), 'r') as f: self.hparams = json.load(f) def fit_and_save_tokenizer(self, texts): """Fits tokenizer on texts and pickles the tokenizer state.""" self.tokenizer = Tokenizer(num_words=self.hparams['max_num_words']) self.tokenizer.fit_on_texts(texts) six.moves.cPickle.dump( self.tokenizer, open( os.path.join(self.model_dir, '%s_tokenizer.pkl' % self.model_name), 'wb')) def prep_text(self, texts): """Turns text into into padded sequences. The tokenizer must be initialized before calling this method. Args: texts: Sequence of text strings. Returns: A tokenized and padded text sequence as a model input. """ text_sequences = self.tokenizer.texts_to_sequences(texts) return pad_sequences( text_sequences, maxlen=self.hparams['max_sequence_length']) def load_embeddings(self): """Loads word embeddings.""" embeddings_index = {} with open(self.embeddings_path) as f: for line in f: values = line.split() word = values[0] coefs = np.asarray(values[1:], dtype='float32') embeddings_index[word] = coefs self.embedding_matrix = np.zeros((len(self.tokenizer.word_index) + 1, self.hparams['embedding_dim'])) num_words_in_embedding = 0 for word, i in self.tokenizer.word_index.items(): embedding_vector = embeddings_index.get(word) if embedding_vector is not None: num_words_in_embedding += 1 # words not found in embedding index will be all-zeros. 
self.embedding_matrix[i] = embedding_vector def train(self, training_data_path, validation_data_path, text_column, label_column, model_name): """Trains the model.""" self.model_name = model_name self.save_hparams(model_name) train_data = pd.read_csv(training_data_path) valid_data = pd.read_csv(validation_data_path) print('Fitting tokenizer...') self.fit_and_save_tokenizer(train_data[text_column]) print('Tokenizer fitted!') print('Preparing data...') train_text, train_labels = (self.prep_text(train_data[text_column]), to_categorical(train_data[label_column])) valid_text, valid_labels = (self.prep_text(valid_data[text_column]), to_categorical(valid_data[label_column])) print('Data prepared!') print('Loading embeddings...') self.load_embeddings() print('Embeddings loaded!') print('Building model graph...') self.build_model() print('Training model...') save_path = os.path.join(self.model_dir, '%s_model.h5' % self.model_name) callbacks = [ ModelCheckpoint( save_path, save_best_only=True, verbose=self.hparams['verbose']) ] if self.hparams['stop_early']: callbacks.append( EarlyStopping( min_delta=self.hparams['es_min_delta'], monitor='val_loss', patience=self.hparams['es_patience'], verbose=self.hparams['verbose'], mode='auto')) self.model.fit( train_text, train_labels, batch_size=self.hparams['batch_size'], epochs=self.hparams['epochs'], validation_data=(valid_text, valid_labels), callbacks=callbacks, verbose=2) print('Model trained!') print('Best model saved to {}'.format(save_path)) print('Loading best model from checkpoint...') self.model = load_model(save_path) print('Model loaded!') def build_model(self): """Builds model graph.""" sequence_input = Input( shape=(self.hparams['max_sequence_length'],), dtype='int32') embedding_layer = Embedding( len(self.tokenizer.word_index) + 1, self.hparams['embedding_dim'], weights=[self.embedding_matrix], input_length=self.hparams['max_sequence_length'], trainable=self.hparams['embedding_trainable']) embedded_sequences = embedding_layer(sequence_input) x = embedded_sequences for filter_size, kernel_size, pool_size in zip( self.hparams['cnn_filter_sizes'], self.hparams['cnn_kernel_sizes'], self.hparams['cnn_pooling_sizes']): x = self.build_conv_layer(x, filter_size, kernel_size, pool_size) x = Flatten()(x) x = Dropout(self.hparams['dropout_rate'])(x) # TODO(nthain): Parametrize the number and size of fully connected layers x = Dense(128, activation='relu')(x) preds = Dense(2, activation='softmax')(x) rmsprop = RMSprop(lr=self.hparams['learning_rate']) self.model = Model(sequence_input, preds) self.model.compile( loss='categorical_crossentropy', optimizer=rmsprop, metrics=['acc']) def build_conv_layer(self, input_tensor, filter_size, kernel_size, pool_size): output = Conv1D( filter_size, kernel_size, activation='relu', padding='same')( input_tensor) if pool_size: output = MaxPooling1D(pool_size, padding='same')(output) else: # TODO(nthain): This seems broken. Fix. output = GlobalMaxPooling1D()(output) return output def predict(self, texts): """Returns model predictions on texts.""" data = self.prep_text(texts) return self.model.predict(data)[:, 1] def score_auc(self, texts, labels): preds = self.predict(texts) return compute_auc(labels, preds) def summary(self): return self.model.summary()
def tokenize(max_features, max_len, on='train',
             train_path='f:/avito/train.csv', test_path=None,
             tokenizer=None, clean_text=False,
             return_tokenizer=False, return_full_train=False):
    """
    Tokenize text.

    Read train and test data, process the description feature, and tokenize it.

    Parameters:
    - max_features: tokenizer parameter;
    - max_len: padding length;
    - on: fit tokenizer on train or train + test (pass on='all' when test_path is given,
      otherwise the test file is never read);
    - train_path: path to train file;
    - test_path: path to test file;
    - tokenizer: pass a tokenizer with custom parameters or use the default one;
    - clean_text: apply text cleaning or not;
    - return_tokenizer: also return the fitted tokenizer;
    - return_full_train: also return train and validation data concatenated.
    """
    # Check that "on" has a correct value.
    assert on in ['train', 'all']

    print('Reading train data.')
    train = pd.read_csv(train_path, index_col=0)
    labels = train['deal_probability'].values
    train = train['description'].astype(str).fillna('')
    text = train

    # Define tokenizer.
    if tokenizer:
        tokenizer = tokenizer
    else:
        tokenizer = Tokenizer(num_words=max_features)

    if on == 'all':
        print('Reading test data.')
        test = pd.read_csv(test_path, index_col=0)
        test = test['description'].astype(str).fillna('')
        text = text.append(test)

    # Clean text.
    if clean_text:
        pass
        # print('Cleaning.')

    print('Fitting.')
    tokenizer.fit_on_texts(text)

    # Split data.
    X_train, X_valid, y_train, y_valid = train_test_split(
        train, labels, test_size=0.1, random_state=23)

    print('Converting to sequences.')
    X_train = tokenizer.texts_to_sequences(X_train)
    X_valid = tokenizer.texts_to_sequences(X_valid)
    if test_path:
        test = tokenizer.texts_to_sequences(test)

    print('Padding.')
    X_train = sequence.pad_sequences(X_train, maxlen=max_len)
    X_valid = sequence.pad_sequences(X_valid, maxlen=max_len)
    if test_path:
        test = sequence.pad_sequences(test, maxlen=max_len)

    data = {}
    data['X_train'] = X_train
    data['X_valid'] = X_valid
    data['y_train'] = y_train
    data['y_valid'] = y_valid
    if test_path:
        data['test'] = test
    if return_tokenizer:
        data['tokenizer'] = tokenizer
    if return_full_train:
        X = np.concatenate([X_train, X_valid])
        y = np.concatenate([y_train, y_valid])
        data['X'] = X
        data['y'] = y
    return data
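# Hedged usage sketch (not from the original source): one way to call the
# tokenize() helper above on the Avito data. The paths mirror the defaults and
# the max_features / max_len values are illustrative assumptions.
avito_data = tokenize(max_features=100000, max_len=100, on='all',
                      train_path='f:/avito/train.csv',
                      test_path='f:/avito/test.csv',
                      return_tokenizer=True)
X_train, y_train = avito_data['X_train'], avito_data['y_train']
X_valid, y_valid = avito_data['X_valid'], avito_data['y_valid']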
# Paths
tweets_path = "D:/tweets.csv"
w_emb_path = 'D:/GoogleNews-vectors-negative300.bin'

# Read data and preprocess
tweet_data = read_data(tweets_path)
tweet_data["Tweet"] = tweet_data["Tweet"].apply(preprocess_tweet)
tweet_data = Filter_tweets(tweet_data, True)

# Transform tweets to lists of integers and add padding
number_of_features = 5000
tokenizer = Tokenizer(num_words=number_of_features, split=' ')
tokenizer.fit_on_texts(tweet_data['filtered_text'].values)
X = tokenizer.texts_to_sequences(tweet_data['filtered_text'].values)
X = pad_sequences(X)
word_index = tokenizer.word_index
embedding_dims = 300

# Load embeddings
model = gensim.models.KeyedVectors.load_word2vec_format(w_emb_path, binary=True)

# Create embedding matrix
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dims))
for word, i in word_index.items():
    if word in model.wv.vocab:
        embedding_vector = model.wv[word]
        # The original excerpt breaks off here; copying the pretrained vector
        # into the row for this word is the evident intent. Out-of-vocabulary
        # words keep an all-zero row.
        embedding_matrix[i] = embedding_vector
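# Sketch of how the embedding_matrix built above is typically plugged into a
# Keras Embedding layer (an assumption about the rest of this script, not code
# from the original source).
from keras.layers import Embedding
embedding_layer = Embedding(len(word_index) + 1,
                            embedding_dims,
                            weights=[embedding_matrix],
                            input_length=X.shape[1],
                            trainable=False)  # keep pretrained vectors frozen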
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv('spam.csv', encoding='latin-1')

# Keeping only the necessary columns
data = data[['v2', 'v1']]

data['v2'] = data['v2'].apply(lambda x: x.lower())
# Note: the character class must be A-Z (the original A-z range also matches
# punctuation between the upper- and lower-case blocks).
data['v2'] = data['v2'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

print(data[data['v1'] == 'ham'].size)
print(data[data['v1'] == 'spam'].size)

max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['v2'].values)
X = tokenizer.texts_to_sequences(data['v2'].values)
print(X)
X = pad_sequences(X)
print(X)

embed_dim = 128
lstm_out = 196

def createmodel():
    model = Sequential()
    model.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
    model.add(SpatialDropout1D(0.4))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(2, activation='softmax'))
    # The excerpt is cut off mid-call; closing the compile with an accuracy
    # metric and returning the model is the evident intent.
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model
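# Hedged continuation sketch: the snippet above stops after defining
# createmodel(). One common way to finish it (an assumption, not the original
# author's code) is to one-hot encode the ham/spam labels and train:
from sklearn.model_selection import train_test_split
Y = pd.get_dummies(data['v1']).values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
model = createmodel()
model.fit(X_train, Y_train, epochs=7, batch_size=32, verbose=2)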
X = []
sentences = list(tweets['text'])
for sen in sentences:
    X.append(preprocess_text(sen))

y = tweets['sentiment']
# Map the sentiment labels: positive (4) -> 1, negative -> 0
y = np.array(list(map(lambda x: 1 if x == 4 else 0, y)))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Preparing the Embedding Layer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Adding 1 because of the reserved 0 index
vocab_size = len(tokenizer.word_index) + 1
maxlen = 69

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

new_model = keras.models.load_model('./lstm모델/DB2048_twitter100D_69_30_lstm_model.h5', compile=False)
new_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
new_model.summary()

loss, acc = new_model.evaluate(X_test, y_test, verbose=1)
    print('embedding_matrix shape', embedding_matrix.shape)
    # print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
    return embedding_matrix


df = pd.read_csv(input_file, encoding="utf-8")
question1 = df['question1'].values
question2 = df['question2'].values
y = df['label'].values

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(list(question1) + list(question2))
list_tokenized_question1 = tokenizer.texts_to_sequences(question1)
list_tokenized_question2 = tokenizer.texts_to_sequences(question2)
X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH)
X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH)

inpath = "test1.txt"
test_data1 = []
test_data2 = []
linenos = []

import jieba
jieba.add_word('花呗')
jieba.add_word('借呗')
jieba.add_word('余额宝')

def seg(text):
    seg_list = jieba.cut(text)
    # The original excerpt is cut off here; joining the segmented words with
    # spaces is the natural completion so the result can be fed to a Tokenizer.
    return ' '.join(seg_list)
dataset_df = read_csv(DATASET_PATH)
dataset_df = dataset_df.dropna()
dataset_df = dataset_df.sample(frac=1)
dataset_df.info()
print("Label Distributions: ", dataset_df['Label'].value_counts())

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(dataset_df['Log'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X = tokenizer.texts_to_sequences(dataset_df['Log'].values)
X = sequence.pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Y = pd.get_dummies(dataset_df['Label']).values
print('Shape of label tensor:', Y.shape)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

epochs = 5
batch_size = 1000

model = Sequential()
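# The original snippet ends right after "model = Sequential()". A plausible
# continuation, assuming an Embedding + LSTM classifier over the padded log
# sequences; EMBEDDING_DIM and the layer sizes are assumed values, not the
# original author's.
from keras.layers import Embedding, LSTM, Dense
EMBEDDING_DIM = 100
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(Y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1)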
print('Vocabulary size:', total_cnt)
print('Number of rare words with frequency <= %s: %s' % (threshold - 1, rare_cnt))
print("Proportion of rare words in the vocabulary:", (rare_cnt / total_cnt) * 100)
print("Proportion of rare-word occurrences in all occurrences:", (rare_freq / total_freq) * 100)


# In[ ]:


print('Max length:', max(len(l) for l in x_train))
print('Average length:', sum(map(len, x_train)) / len(x_train))


# In[ ]:


tokenizer = Tokenizer(vocab_size, oov_token='OOV')
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)


# In[ ]:


max_len = 58


# In[ ]:


x_train = pad_sequences(x_train, maxlen=max_len)
x_test = pad_sequences(x_test, maxlen=max_len)


# In[ ]:


import re
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional
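# For reference: total_cnt, rare_cnt, total_freq and rare_freq are printed at
# the top of this cell but computed earlier in the original notebook, on the
# raw x_train texts before they are converted to sequences. A typical
# derivation from a fitted Tokenizer looks like this (a sketch under that
# assumption, not the author's code):
counting_tokenizer = Tokenizer()
counting_tokenizer.fit_on_texts(x_train)  # x_train as raw texts at this stage
threshold = 3
total_cnt = len(counting_tokenizer.word_index)
rare_cnt, total_freq, rare_freq = 0, 0, 0
for word, freq in counting_tokenizer.word_counts.items():
    total_freq += freq
    if freq < threshold:
        rare_cnt += 1
        rare_freq += freq
vocab_size = total_cnt - rare_cnt + 2  # keep non-rare words plus padding and OOV tokens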
        continue
    input_line = '<sos> ' + line
    target_line = line + ' <eos>'
    input_texts.append(input_line)
    target_texts.append(target_line)

all_lines = input_texts + target_texts

# convert the sentences (strings) into integers
tokenizer = Tokenizer(
    num_words=MAX_VOCAB_SIZE,
    filters=''  # filters='' ensures that special characters like <sos> are not filtered out
)
tokenizer.fit_on_texts(all_lines)
input_sequences = tokenizer.texts_to_sequences(input_texts)
target_sequences = tokenizer.texts_to_sequences(target_texts)

# find max seq length
max_sequence_length_from_data = max(len(s) for s in input_sequences)
print('Max sequence length:', max_sequence_length_from_data)

# get word -> integer mapping (dictionary)
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % len(word2idx))
assert '<sos>' in word2idx
assert '<eos>' in word2idx

# pad sequences so that we get a N x T matrix
max_sequence_length = min(max_sequence_length_from_data, MAX_SEQUENCE_LENGTH)
# The original excerpt is cut off mid-call; padding to max_sequence_length is
# the evident intent.
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length)
def model_word2vec(suffix="", suffix_fre=""):
    rtest = xlrd.open_workbook(filename="切割" + suffix + "test" + suffix_fre + ".xls")
    rtrain = xlrd.open_workbook(filename="切割" + suffix + "train" + suffix_fre + ".xls")
    r_vocall1 = xlrd.open_workbook(filename="pre处理" + suffix + "test" + suffix_fre + ".xls")
    r_vocall2 = xlrd.open_workbook(filename="pre处理" + suffix + "train" + suffix_fre + ".xls")
    sheet_test = rtest.sheet_by_index(0)
    sheet_train = rtrain.sheet_by_index(0)
    sheet1_vocall = r_vocall1.sheet_by_index(0)
    sheet2_vocall = r_vocall2.sheet_by_index(0)

    # Build the merged vocabulary from the test and train sheets.
    invocal1 = sheet1_vocall.col_values(4)
    invocal2 = sheet2_vocall.col_values(4)
    for i in range(0, len(invocal1)):
        if len(invocal1[i]) == 0:
            invocall = invocal1[:i]
            print("1")
            break
    for i in range(0, len(invocal2)):
        if len(invocal2[i]) == 0:
            print("1")
            invocal2 = invocal2[:i]
            break
    for i in invocal2:
        if i not in invocall:
            invocall.append(i)
    print(len(invocall))
    vocall_size = len(invocall)

    # Collect the distinct class labels.
    class_num = sheet2_vocall.col_values(10)
    allclass = []
    for i in range(1, len(class_num)):
        if class_num[i] != "":
            if class_num[i] not in allclass:
                allclass.append(class_num[i])
    print(allclass)

    for all_round in range(0, len(allclass)):
        for round in range(0, 2):
            if round == 1:
                ex_tag = sheet_test.col_values(6)
            xtrain = sheet_train.col_values(2 + round * 3)
            ztrain = sheet_train.col_values(0 + round * 3)
            ytrain = sheet_train.col_values(1 + round * 3)
            xtest = sheet_test.col_values(2 + round * 3)
            ztest = sheet_test.col_values(0 + round * 3)
            ytest = sheet_test.col_values(1 + round * 3)
            for i in range(0, len(xtrain)):
                if len(xtrain[i]) == 0:
                    xtrain = xtrain[:i]
                    ztrain = ztrain[:i]
                    ytrain = ytrain[:i]
                    break
            for i in range(0, len(xtest)):
                if len(xtest[i]) == 0:
                    xtest = xtest[:i]
                    ytest = ytest[:i]
                    ztest = ztest[:i]
                    break
            print(round * 3)
            print(len(xtrain), "xtrain")
            print(len(ztrain), "ztrain")
            print(len(xtest), "xtest")
            print(len(ztest), "ztest")

            if round == 1:
                other = sheet_train.cell(0, 13).value
                other = int(other)
                print(other)
                if other == 1:
                    xtrain = xtrain + sheet_train.col_values(9)
                    ytrain = ytrain + sheet_train.col_values(8)
                    ztrain = ztrain + sheet_train.col_values(7)
                    for i in range(0, len(xtrain)):
                        if len(xtrain[i]) == 0:
                            xtrain = xtrain[:i]
                            ztrain = ztrain[:i]
                            ytrain = ytrain[:i]
                            break

            tokenizer = Tokenizer(num_words=vocall_size)
            tokenizer.fit_on_texts(invocall)
            xtrain = tokenizer.texts_to_sequences(xtrain)
            xtest = tokenizer.texts_to_sequences(xtest)

            maxlen = 0
            for i in xtrain:
                if len(i) > maxlen:
                    maxlen = len(i)
            for i in xtest:
                if len(i) > maxlen:
                    maxlen = len(i)
            print(maxlen, "maxlen")

            # Map class names to integer ids.
            for i in range(0, len(ztest)):
                for n1 in range(0, len(allclass)):
                    if ztest[i] == allclass[n1]:
                        ztest[i] = n1
            for i in range(0, len(ztrain)):
                for n1 in range(0, len(allclass)):
                    if ztrain[i] == allclass[n1]:
                        ztrain[i] = n1

            xtrain = pad_sequences(xtrain, padding='post', maxlen=maxlen)
            xtest = pad_sequences(xtest, padding='post', maxlen=maxlen)
            print(len(ztrain), len(xtrain))
            print(len(ztest), len(xtest))
            for i in range(0, len(ztrain)):
                ztrain[i] = int(ztrain[i])
            for i in range(0, len(ztest)):
                ztest[i] = int(ztest[i])

            # Build the word2vec embedding matrix. Rows must line up with the
            # tokenizer's integer ids (which start at 1), so index by
            # tokenizer.word_index rather than by the position of the word in
            # invocall as the original code did.
            modelw2v = gensim.models.KeyedVectors.load("word2vec_150_lstm.model")
            embedding_matrix = np.zeros(shape=(vocall_size, 150))
            for word, i in tokenizer.word_index.items():
                if i >= vocall_size:
                    continue
                try:
                    embedding_matrix[i, :] = modelw2v[word]
                except KeyError:
                    pass

            embedding_size = 150
            hidden_layer_size = 64
            batch_size = 128
            num_epochs = 3
            model = Sequential()
            model.add(Embedding(vocall_size, embedding_size, weights=[embedding_matrix], input_length=maxlen))
            model.add(SpatialDropout1D(0.2))
            model.add(LSTM(hidden_layer_size, dropout=0.2, recurrent_dropout=0.2))
            model.add(Dense(1))
            model.add(Activation("sigmoid"))
            model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
            model.summary()

            history = model.fit(xtrain, ztrain, epochs=1, batch_size=64)
            loss, accuracy = model.evaluate(xtest, ztest)
            print(loss, accuracy)
            """
            plt.subplot(211)
            plt.title("Accuracy" + suffix)
            plt.plot(history.history['acc'], color="g", label="Train")
            plt.legend(loc="best")
            plt.subplot(212)
            plt.title("Loss")
            plt.plot(history.history['loss'], color="g", label="Train")
            plt.legend(loc="best")
            plt.tight_layout()
            plt.show()
            """

            # Write predictions and metrics to an Excel workbook.
            w = xlwt.Workbook()
            sheet2 = w.add_sheet("准备文件", cell_overwrite_ok=True)
            sheet2.write(0, 8, "predict")
            sheet2.write(0, 9, "ztest")
            sheet2.write(0, 10, "xtest")
            sheet2.write(0, 11, "ex_tag")
            sheet2.write(0, 4, "loss")
            sheet2.write(1, 4, loss)
            sheet2.write(0, 5, "acc")
            sheet2.write(1, 5, accuracy)
            ypred = model.predict_classes(xtest, 1)
            xtest = tokenizer.sequences_to_texts(xtest)
            for index in range(0, len(ypred)):
                sheet2.write(index, 0, int(ypred[index][0]))
                sheet2.write(index, 1, ztest[index])
                sheet2.write(index, 2, xtest[index])
                if round == 1:
                    sheet2.write(index, 3, ex_tag[index])
            if round == 0:
                w.save("result切割" + suffix + allclass[all_round] + suffix_fre + "w2v.xls")
            else:
                w.save("result扩充" + suffix + allclass[all_round] + suffix_fre + "w2v.xls")
    return allclass
def __init__(
        self,
        embed_files,
        seq_length=320,  # 320
        embed_flag='crawl',
        sent_flag=False):
    self.train_file = TRAIN_DATA_FILE
    self.process_files = TRAIN_PROCESS_FILES
    self.test_file = TEST_DATA_FILE
    self.embed_file = embed_files[embed_flag]
    self.seq_length = seq_length

    print(f'read train data: {self.train_file} '
          f'and test data: {self.test_file}')
    self.train_df = pd.read_csv(self.train_file)
    self.test_df = pd.read_csv(self.test_file)
    self.train_df["comment_text"].fillna(NAN_WORD, inplace=True)
    self.test_df["comment_text"].fillna(NAN_WORD, inplace=True)

    sentences_train = self.train_df["comment_text"].values
    sentences_test = self.test_df["comment_text"].values
    self.y_train = self.train_df[CLASSES].values
    print(f'train sentences shape: {sentences_train.shape}')
    print(f'test sentences shape: {sentences_test.shape}')
    print(f'y train shape: {self.y_train.shape}')

    sentences_all = list(sentences_train)
    sentences_procs = []
    sentences_df_procs = []
    for train_pro in self.process_files:
        df = pd.read_csv(train_pro)
        df["comment_text"].fillna(NAN_WORD, inplace=True)
        sent = df["comment_text"].values
        sentences_procs.append(sent)
        sentences_all.extend(list(sent))
        sentences_df_procs.append(df)

    print('Tokenize sentences in train set and test set...')
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentences_all)

    if not sent_flag:
        tokenized_train = tokenizer.texts_to_sequences(sentences_train)
        self.x_train = pad_sequences(tokenized_train, maxlen=seq_length)
        tokenized_test = tokenizer.texts_to_sequences(sentences_test)
        self.x_test = pad_sequences(tokenized_test, maxlen=seq_length)
        self.x_procs = []
        for sent in sentences_procs:
            tokenized_procs = tokenizer.texts_to_sequences(sent)
            tokenized_procs = pad_sequences(tokenized_procs, maxlen=seq_length)
            self.x_procs.append(tokenized_procs)
    else:
        sentences_train = self.train_df["comment_text"].apply(
            lambda x: tokenize.sent_tokenize(x))
        sentences_test = self.test_df["comment_text"].apply(
            lambda x: tokenize.sent_tokenize(x))
        max_sent = 5
        self.x_train = self.sentenize(tokenizer, sentences_train,
                                      max_sent, seq_length)
        self.x_test = self.sentenize(tokenizer, sentences_test,
                                     max_sent, seq_length)
        self.x_procs = []
        for sent_df in sentences_df_procs:
            sentences_df = sent_df["comment_text"].apply(
                lambda x: tokenize.sent_tokenize(x))
            tokenized_procs = self.sentenize(tokenizer, sentences_df,
                                             max_sent, seq_length)
            self.x_procs.append(tokenized_procs)

    words_dict = tokenizer.word_index
    self.max_feature = len(words_dict) + 1

    print(f'Loading {embed_flag} embeddings...')
    # Use == for string comparison; "is" only checks object identity.
    if embed_flag == 'wiki':
        ft_model = load_model(self.embed_file)
        self.embed_dim = ft_model.get_dimension()
        self.embed_matrix = self.get_wiki_embed_matrix(words_dict, ft_model)
    elif embed_flag == 'crawl':
        embed_index = self.load_crawl_embed_index(self.embed_file)
        self.embed_dim = list(embed_index.values())[0].shape[0]  # 300
        self.embed_matrix = self.get_crawl_or_glove_embed_matrix(
            words_dict, embed_index)
    else:
        embed_index = self.load_glove_embed_index(self.embed_file)
        self.embed_dim = list(embed_index.values())[0].shape[0]  # 300
        self.embed_matrix = self.get_crawl_or_glove_embed_matrix(
            words_dict, embed_index)
full_df.subcat_1 = le.transform(full_df.subcat_1)
le.fit(full_df.subcat_2)
full_df.subcat_2 = le.transform(full_df.subcat_2)
del le

print("Transforming text data to sequences...")
raw_text = np.hstack([full_df.item_description.str.lower(),
                      full_df.name.str.lower(),
                      full_df.category_name.str.lower()])

print("   Fitting tokenizer...")
tok_raw = Tokenizer()
tok_raw.fit_on_texts(raw_text)

print("   Transforming text to sequences...")
full_df['seq_item_description'] = tok_raw.texts_to_sequences(full_df.item_description.str.lower())
full_df['seq_name'] = tok_raw.texts_to_sequences(full_df.name.str.lower())
# full_df['seq_category'] = tok_raw.texts_to_sequences(full_df.category_name.str.lower())
del tok_raw

MAX_NAME_SEQ = 10  # 17
MAX_ITEM_DESC_SEQ = 75  # 269
MAX_CATEGORY_SEQ = 8  # 8
MAX_TEXT = np.max([
    np.max(full_df.seq_name.max()),
    np.max(full_df.seq_item_description.max()),
    # np.max(full_df.seq_category.max()),
]) + 100
MAX_CATEGORY = np.max(full_df.category.max()) + 1
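# Sketch (an assumption, not from the original script) of how the sequence
# columns created above are usually turned into fixed-length model inputs
# using the MAX_*_SEQ limits defined just before:
from keras.preprocessing.sequence import pad_sequences
X_name = pad_sequences(full_df.seq_name, maxlen=MAX_NAME_SEQ)
X_item_desc = pad_sequences(full_df.seq_item_description, maxlen=MAX_ITEM_DESC_SEQ)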
df["question_text"] = df["question_text"].progress_apply(preprocess)
n_words = len(vocab) + 1
del vocab

emb_file = "glove.840B.300d/glove.840B.300d.txt"
glove_dic = {}
for line in tqdm_notebook(open(emb_file)):
    temp = line.split(" ")
    glove_dic[temp[0]] = np.asarray(temp[1:], dtype="float32")

train, val = train_test_split(df, test_size=0.1)

tokenizer = Tokenizer(num_words=n_words)
tokenizer.fit_on_texts(list(train.question_text))
q_train = tokenizer.texts_to_sequences(train.question_text)
q_val = tokenizer.texts_to_sequences(val.question_text)

max_len = 65
q_train = pad_sequences(q_train, maxlen=max_len)
q_val = pad_sequences(q_val, maxlen=max_len)
y_train = train.target
y_val = val.target
del train, val

word_index = tokenizer.word_index
emb_size = glove_dic["."].shape[0]
emb_matrix = np.zeros((n_words, emb_size))
for w, index in word_index.items():
    if index >= n_words:
        # The original excerpt breaks off here; skipping out-of-range indices
        # and copying the GloVe vector for known words is the obvious
        # continuation.
        continue
    if w in glove_dic:
        emb_matrix[index] = glove_dic[w]
# Note: this snippet is written in Python 2 style (iteritems, raw_input,
# list-returning zip).
data = normal_data + botnet_data
data = [x[3:] for x in data if len(x) > 3]
print ("normal", len(normal_data))
print ("botnet", len(botnet_data))

# Split sequences with spaces every 5 characters to convert them to words
n = 5
text = []
for x in data:
    text.append(" ".join([x[i:i+n] for i in range(0, len(x), n)]))
assert len(text) == len(data)

tokenizer = Tokenizer(filters=stf_dataset.text_filter(), lower=False)
tokenizer.fit_on_texts(text)
seq = tokenizer.texts_to_sequences(text)
print("text - seq", len(text), len(seq))
print (tokenizer.word_index)
print (len(max(seq, key=len)))
mat = sequence.pad_sequences(seq, maxlen=500)

_, max_word_index = max(tokenizer.word_index.iteritems(), key=lambda x: x[1])
print("max word index", max_word_index)
raw_input("..")

assert len(data) == len(seq)
data = zip(mat, y_data)

# shuffle
seed(1)
shuffle(data)

# split into training and testing
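# Hedged continuation (an assumption, not the original author's code): one
# simple way to carry out the split announced by the comment above, keeping
# the Python 2 style of the snippet where data is a shuffled list of
# (padded_sequence, label) pairs.
split = int(0.8 * len(data))
X_train, y_train = zip(*data[:split])
X_test, y_test = zip(*data[split:])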
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding

df = pd.read_csv('/home/charan/imdb_master.csv', encoding='latin-1')
print(df.head())
sentences = df['review'].values
y = df['label'].values

# tokenizing data
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(sentences)

max_review_len = max([len(s.split()) for s in sentences])
vocab_size = len(tokenizer.word_index) + 1
sentences = tokenizer.texts_to_sequences(sentences)
padded_docs = pad_sequences(sentences, maxlen=max_review_len)

le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(padded_docs, y, test_size=0.25, random_state=1000)
print(len(X_train))

# Number of features
# print(input_dim)
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=max_review_len))
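# The excerpt stops after the Embedding layer. A hedged sketch of one way to
# finish and train this review classifier, assuming the encoded label column
# is binary; the layer sizes and epoch count are assumptions, not the original
# author's values. If the label column has more than two classes, a softmax
# output sized to the number of classes would be needed instead.
from keras.layers import Flatten, Dense
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=3, batch_size=64, validation_data=(X_test, y_test))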