def give_vocabulary(sentences_df):
    '''
    @parameter: the dataframe from the json file with the 5 columns we need
    @returns: the vocabulary in a set.
    '''
    vocabulary = []
    list_of_sentences1 = sentences_df['sentence1'].tolist()
    list_of_sentences2 = sentences_df['sentence2'].tolist()
    list_sentence_words = []
    '''
    # Do the same with keras
    for sentence in list_of_sentences1:
        sentence.lower()
        # tokenize or split by " "
        tokens1 = sentence.split(" ")
        for token1 in tokens1:
            if token1 not in vocabulary:
                vocabulary.append(token1)
    '''
    list_sentence_word_tmp = []
    for s1, s2 in zip(list_of_sentences1, list_of_sentences2):
        sentence_unicode1 = make_unicode(s1)
        sentence_unicode2 = make_unicode(s2)
        # print sentence_no_unicode
        list_sentence_word_tmp += text_to_word_sequence(sentence_unicode1.encode('ascii'),
                                                        filters=base_filter(), lower=True, split=" ")
        list_sentence_word_tmp += text_to_word_sequence(sentence_unicode2.encode('ascii'),
                                                        filters=base_filter(), lower=True, split=" ")
    set_words = set(list_sentence_word_tmp)
    # print word2idx
    print "length of vocabulary: %d" % len(set_words)
    return set_words
def get_docs_and_intervention_summaries(pico_elem_str="CHAR_INTERVENTIONS"):
    pairs = []
    p = PDFBiViewer()
    for study in p:
        cdsr_entry = study.cochrane
        text = study.studypdf['text']
        intervention_text = cdsr_entry["CHARACTERISTICS"][pico_elem_str]
        if intervention_text is not None:
            # pairs.append((nltk.word_tokenize(text),
            #               nltk.word_tokenize(intervention_text)))
            pairs.append((text_to_word_sequence(text),
                          text_to_word_sequence(intervention_text)))
    return pairs
def sentence2sequence(sentence, w2i):
    sequence = []
    words = text_to_word_sequence(sentence)
    for word in words:
        if word in w2i:
            sequence.append(w2i[word])
    return sequence
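# A minimal usage sketch for sentence2sequence above (not part of the original
# snippet). It assumes text_to_word_sequence has been imported from
# keras.preprocessing.text; toy_w2i is a hypothetical word-to-index dictionary.
# Words missing from w2i are silently dropped.
toy_w2i = {'hello': 1, 'world': 2}
print(sentence2sequence('Hello, brave new world!', toy_w2i))  # expected: [1, 2]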
def fit(self, texts):
    sequences = []
    for text in texts:
        sequences.append(text_to_word_sequence(text))
    for seq in sequences:
        for word in seq:
            if word in self.wordCount:
                self.wordCount[word] += 1
            else:
                self.wordCount[word] = 1
    wcounts = list(self.wordCount.items())
    wcounts.sort(key=lambda x: x[1], reverse=True)
    self.encoder['<PAD>'] = 0
    self.encoder['<END>'] = 1
    self.encoder['<UNK>'] = 2
    self.decoder[0] = '<PAD>'
    self.decoder[1] = '<END>'
    self.decoder[2] = '<UNK>'
    self.wordCount.clear()
    for i in range(len(wcounts)):
        pair = wcounts[i]
        self.encoder[pair[0]] = i + 3
        self.decoder[i + 3] = pair[0]
        if i < self.maxWords:
            self.wordCount[pair[0]] = pair[1]
    print('Most Frequent 20 words:')
    for i in range(min(20, len(wcounts))):
        print(wcounts[i])
def convert_text_to_index_array(text):
    words = kpt.text_to_word_sequence(text)
    wordIndices = []
    for word in words:
        if word in dictionary:
            wordIndices.append(dictionary[word])
        else:
            print("'%s' not in training corpus; ignoring." % (word))
    return wordIndices
def get_vectors_from_text(dataset_list, word_to_ind=imdb.get_word_index(),
                          start_char=1, index_from=3, maxlen=400,
                          num_words=5000, oov_char=2, skip_top=0):
    '''
    Gets the list vector mapped according to the word-to-indices dictionary.

    @param dataset_list = list of review texts in unicode format
           word_to_ind = word-to-indices dictionary
    hyperparameters:
           start_char --> sentences start after this char.
           index_from --> indices below this will not be encoded.
           maxlen --> maximum length of the sequence to be considered.
           num_words --> number of words to be considered according to rank.
                         Rank is assigned by frequency of occurrence.
           oov_char --> out-of-vocabulary character.
           skip_top --> number of top-ranked words to be skipped.
    @returns: x_train: final list of vectors (as lists) of the review texts
    '''
    x_train = []
    for review_string in dataset_list:
        review_string_list = text_to_word_sequence(review_string)
        review_string_list = [ele for ele in review_string_list]
        x_predict = []
        for i in range(len(review_string_list)):
            if review_string_list[i] not in word_to_ind:
                continue
            x_predict.append(word_to_ind[review_string_list[i]])
        x_train.append(x_predict)
    # add the start char and also take care of index_from
    if start_char is not None:
        x_train = [[start_char] + [w + index_from for w in x] for x in x_train]
    elif index_from:
        x_train = [[w + index_from for w in x] for x in x_train]
    # truncate each sequence to maxlen
    x_train = [ele[:maxlen] for ele in x_train]
    # if num_words is not given, infer it from the data
    if not num_words:
        num_words = max([max(x) for x in x_train])
    # by convention, use 2 as the OOV word
    # reserve 'index_from' (=3 by default) characters:
    # 0 (padding), 1 (start), 2 (OOV)
    if oov_char is not None:
        x_train = [[w if (skip_top <= w < num_words) else oov_char for w in x]
                   for x in x_train]
    else:
        x_train = [[w for w in x if (skip_top <= w < num_words)]
                   for x in x_train]
    # pad the sequences
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    # return the vector form of the text
    return x_train
def generate_words_set(data_file):
    words = set()
    with open(data_file, 'r') as fr:
        lines = fr.readlines()
        for i in xrange(len(lines)):
            splits = lines[i].split('\t')
            text = splits[0] + ' ' + splits[1]
            for word in text_to_word_sequence(text):
                # when training the model, text-to-index conversion must use
                # the same text_to_word_sequence function
                words.add(word)
    words = sorted(words)
    return words
def transform(self, texts):
    rv = []
    for i in range(len(texts)):
        sequence = text_to_word_sequence(texts[i])
        if len(sequence) == 0:
            rv.append([0])
            continue
        list_of_scalars = []
        for j in range(len(sequence)):
            if sequence[j] not in self.wordCount:
                list_of_scalars.append(self.encoder['<UNK>'])
            else:
                list_of_scalars.append(self.encoder[sequence[j]])
        rv.append(list_of_scalars)
    return rv
def _handle_rare_words(self, captions):
    if self._rare_words_handling == 'nothing':
        return captions
    elif self._rare_words_handling == 'discard':
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(captions)
        new_captions = []
        for caption in captions:
            words = text_to_word_sequence(caption)
            new_words = [w for w in words
                         if tokenizer.word_counts.get(w, 0) >= self._words_min_occur]
            new_captions.append(' '.join(new_words))
        return new_captions

    raise NotImplementedError('rare_words_handling={} is not implemented '
                              'yet!'.format(self._rare_words_handling))
def encode_text_vectors(self, texts, pca_dims=50, tsne_dims=None,
                        tsne_seed=None, return_pca=False, return_tsne=False):
    # if a single text, force it into a list:
    if isinstance(texts, str):
        texts = [texts]

    vector_output = Model(inputs=self.model.input,
                          outputs=self.model.get_layer('attention').output)
    encoded_vectors = []
    maxlen = self.config['max_length']
    for text in texts:
        if self.config['word_level']:
            text = text_to_word_sequence(text, filters='')
        text_aug = [self.META_TOKEN] + list(text[0:maxlen])
        encoded_text = textgenrnn_encode_sequence(text_aug, self.vocab, maxlen)
        encoded_vector = vector_output.predict(encoded_text)
        encoded_vectors.append(encoded_vector)

    encoded_vectors = np.squeeze(np.array(encoded_vectors), axis=1)

    if pca_dims is not None:
        assert len(texts) > 1, "Must use more than 1 text for PCA"
        pca = PCA(pca_dims)
        encoded_vectors = pca.fit_transform(encoded_vectors)

    if tsne_dims is not None:
        tsne = TSNE(tsne_dims, random_state=tsne_seed)
        encoded_vectors = tsne.fit_transform(encoded_vectors)

    return_objects = encoded_vectors
    if return_pca or return_tsne:
        return_objects = [return_objects]
    if return_pca:
        return_objects.append(pca)
    if return_tsne:
        return_objects.append(tsne)

    return return_objects
X_val_text = list(X_val_text)

# ------------ get split points -----------
train_val_split = len(X_train_text)
val_test_split = int(len(X_val_text) / 2)
train_val = X_train_text + X_val_text

# ------ get the word indices and split into train, val, test sets
data = np.zeros((len(train_val), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
for i, sentences in enumerate(train_val):
    for j, sent in enumerate(sentences):
        if j < MAX_SENTS:
            wordTokens = text_to_word_sequence(sent)
            k = 0
            for _, word in enumerate(wordTokens):
                if k < MAX_SENT_LENGTH and tokenizer.word_index[word] < MAX_NB_WORDS:
                    data[i, j, k] = tokenizer.word_index[word]
                    k = k + 1

# -------------- now the data is ready ---------------------------------
X_train = data[:train_val_split]
X_val = data[train_val_split:train_val_split + val_test_split]
X_test = data[train_val_split + val_test_split:]
# sys.exit(0)
# # print (indices)
# except:
#     continue
# train_out[i] = indices
# i = i + 1

import nltk
from keras.preprocessing.text import text_to_word_sequence

raw_output = corpus.findall('.//sentence')
train_out = []
delet = []
print(data.shape)
data = np.array(data)
print(data.shape)
i = 0
for output in raw_output:
    s = text_to_word_sequence(output.find('text').text, lower=True)
    indices = np.zeros(MAX_SEQ_LENGTH)
    aspectTerms = output.find('aspectTerms')
    if (aspectTerms):
        aspectTerm = aspectTerms.findall('aspectTerm')
        k = 0
        if (len(aspectTerm) > 0):
            for aspect_term in aspectTerm:
                try:
                    aspt = text_to_word_sequence(aspect_term.attrib['term'])
                    if (len(aspt) < 2):
                        indices[s.index(aspt[0])] = 1
                    else:
                        k = 1
                        break
#------------------------------------------------------------------------------#
#------------------------------------------------------------------------------#
########## Data Processing starts here #########################################
#------------------------------------------------------------------------------#
load_training_data()
#------------------------------------------------------------------------------#
#------------------------------------------------------------------------------#

text_words = []
for i in range(0, len(data)):
    temp_data = data['text'][i]
    temp_data_word = text_to_word_sequence(
        temp_data,
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        lower=True,
        split=" ")
    text_words.append(temp_data_word)
# print(text_words)

word_counts = collections.Counter(itertools.chain(*text_words))
# print(word_counts)
vocabulary_size = len(word_counts)
print("vocabulary_size = %s" % vocabulary_size)

"""
## for debugging only
temp_list = []
for i in range(0, len(text_words)):
    temp_list.append(len(text_words[i]))
print(heapq.nlargest(10, temp_list))
embedding_model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])
embedding_output = embedding_model.predict(data)
print('Generated word Embeddings..')
print('Shape of Embedding_output', embedding_output.shape)

train_input = np.zeros(shape=(len(data), 69, 306))
le = preprocessing.LabelEncoder()
tags = ["CC", "NN", "JJ", "VB", "RB", "IN"]
le.fit(tags)
i = 0
for sent in textPost:
    s = text_to_word_sequence(sent)
    tags_for_sent = nltk.pos_tag(s)
    sent_len = len(tags_for_sent)
    ohe = [0] * 6
    for j in range(69):
        if j < len(tags_for_sent) and tags_for_sent[j][1][:2] in tags:
            ddLe = le.transform([tags_for_sent[j][1][:2]])
            ohe[ddLe[0]] = 1
        train_input[i][j] = np.concatenate([embedding_output[i][j], ohe])
    i = i + 1

print('Concatenated Word-Embeddings and POS Tag Features...')
print('Training Model...')
model = Sequential()
model.add(Conv1D(100, 5, padding="same", input_shape=(69, 306)))
def textprcoessingforword2vec(input_data):
    sentences = []
    for i in range(len(input_data)):
        sentences.append(text_to_word_sequence(input_data[i], filters='\t\n' + "'", split=" "))
    return sentences
def similarity(self, x, y):
    x_vec = self.doc2vec.infer_vector(text_to_word_sequence(x), steps=self.steps)
    y_vec = self.doc2vec.infer_vector(text_to_word_sequence(y), steps=self.steps)
    return 1 - cosine(x_vec, y_vec)
def clean(text):
    return text_to_word_sequence(
        text,
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        lower=True,
        split=" ")
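# A quick illustrative call to clean() above (not in the original snippet).
# The filter string matches the Keras default, so punctuation is stripped and
# the text is lowercased; apostrophes are kept.
print(clean("Hello, World! It's 2020."))  # expected: ['hello', 'world', "it's", '2020']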
if (record.find('[report_end]') != -1):
    content = record[record.find('\n') + 1:record.find('[report_end]')].strip()
    content = expand_abbr(content)
    content = content.replace('\'s', " 's").replace("'d", " 'd")
    content = content.replace("'s", " 's")
    content = content.replace("can't", "cannot")
    content = content.replace("couldn't", "could not")
    content = content.replace("won't", "will not")
    content = content.replace("wasn't", "was not")
    content = content.replace("hasn't", "has not")
    content = content.replace("don't", "do not")
    content = content.replace("didn't", "did not")
    content = content.replace("doesn't", "does not")
    word_list = text.text_to_word_sequence(content, lower=True, split=" ")
    word_list = clean_wds(word_list)
    # filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    str_to_write = ' '
    str_to_write = str_to_write.join(word_list)
    corpus.append(str_to_write)
    corpus_file.write(str_to_write + '\n')

print(len(corpus))
corpus_file.close()

train_dic = get_dic('data/Obesity_data/train_groundtruth.xml')
test_dic = get_dic('data/Obesity_data/test_groundtruth.xml')

# Read Word Vectors
word_vector_file = 'data/mimic3_pp100.txt'
vocab, embd, word_vector_map = loadWord2Vec(word_vector_file)
embedding_dim = len(embd[0])
if not os.path.isdir(fpath):
    program = []
    with open(fpath) as fr:
        lines = fr.readlines()
        for row in range(len(lines)):
            if "-------" not in lines[row]:
                program.append(lines[row].strip("\n"))
            else:
                labels.append(int(program[-1]))
                program.pop(0)
                program.pop(-1)
                texts.append(" ".join(program))
                program = []

print("Found %s programs." % len(texts))
texts = text_to_word_sequence(" ".join(texts), filters='!"#$%&()*+,./:;<=>?@[]^`|~')
with open("word1.txt", "w") as fw:
    fw.write(" ".join(texts))
texts = list(set(texts))
i = 0
sequences = word2vec.Text8Corpus("./word1.txt")
model = word2vec.Word2Vec(sequences, size=150, min_count=1)
with open("word2vector1.txt", "w") as fw:
    writer = csv.writer(fw, delimiter=" ")
    for word in texts:
        # print(model[word].tolist())
        try:
            print(i)
            i += 1
            writer.writerow([word] + model[word].tolist())
def context_question_text_preprocess(cnt_max_len, qn_max_len):
    """ get corpus """
    # file path names
    file_train_context = '../data/train.context'
    file_train_question = '../data/train.question'
    file_dev_context = '../data/dev.context'
    file_dev_question = '../data/dev.question'
    file_train_span = '../data/train.span'
    file_dev_span = '../data/dev.span'

    # text and index lists
    txt_train_cnt = read_txt_from_file(file_train_context)
    txt_train_qst = read_txt_from_file(file_train_question)
    txt_dev_cnt = read_txt_from_file(file_dev_context)
    txt_dev_qst = read_txt_from_file(file_dev_question)
    idx_train_beg, idx_train_end = read_index_from_file(file_train_span, cnt_max_len)
    idx_dev_beg, idx_dev_end = read_index_from_file(file_dev_span, cnt_max_len)

    cnt_all_txt = txt_train_cnt + txt_dev_cnt
    qst_all_txt = txt_train_qst + txt_dev_qst

    # from keras.preprocessing.text import Tokenizer
    # word-length lists for contexts and questions
    l_cnt = list(map(lambda x: len(T.text_to_word_sequence(x)), cnt_all_txt))
    l_qst = list(map(lambda x: len(T.text_to_word_sequence(x)), qst_all_txt))

    # average length (in words) of contexts and questions
    import functools
    l_all_cnt = functools.reduce(lambda x, y: x + y, l_cnt)
    l_all_qst = functools.reduce(lambda x, y: x + y, l_qst)
    l_average_cnt = l_all_cnt / len(cnt_all_txt)
    l_average_qst = l_all_qst / len(qst_all_txt)

    # tokenization
    t = Tokenizer()  # tokenizer
    txt_list = txt_train_cnt + txt_train_qst + txt_dev_cnt + txt_dev_qst
    t.fit_on_texts(txt_list)
    vocab_size = len(t.word_index) + 1
    enc_txt_train_cnt = t.texts_to_sequences(txt_train_cnt)
    enc_txt_train_qst = t.texts_to_sequences(txt_train_qst)
    enc_txt_dev_cnt = t.texts_to_sequences(txt_dev_cnt)
    enc_txt_dev_qst = t.texts_to_sequences(txt_dev_qst)
    pad_txt_train_cnt = pad_sequences(enc_txt_train_cnt, maxlen=cnt_max_len, padding='post')
    pad_txt_train_qst = pad_sequences(enc_txt_train_qst, maxlen=qn_max_len, padding='post')
    pad_txt_dev_cnt = pad_sequences(enc_txt_dev_cnt, maxlen=cnt_max_len, padding='post')
    pad_txt_dev_qst = pad_sequences(enc_txt_dev_qst, maxlen=qn_max_len, padding='post')

    # load embedding
    embeddings_index = load_emb()

    # create a weight matrix for words in training docs
    embedding_matrix = zeros((vocab_size, 50))
    for word, i in t.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    print("context average number of character is {}".format(l_average_cnt))
    print("context max number of character is {}".format(cnt_max_len))
    print("question average number of character is {}".format(l_average_qst))
    print("question max number of character is {}".format(qn_max_len))
    print("index of answer is index of word, not character")

    return embedding_matrix, vocab_size, pad_txt_train_cnt, pad_txt_train_qst, pad_txt_dev_cnt, pad_txt_dev_qst, \
        idx_train_beg, idx_train_end, idx_dev_beg, idx_dev_end
import keras.preprocessing.text as T

text = '''
This snippet introduces the text and sequence modules of the keras.preprocessing
package provided by Keras.
2. Methods provided by the text module: text_to_word_sequence(text, filters) can ...
'''
print(T.text_to_word_sequence(text=text))
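# A small hedged follow-up (not part of the original snippet) showing the effect
# of the filters and split arguments of text_to_word_sequence.
print(T.text_to_word_sequence('Hello, World! Hello?'))            # expected: ['hello', 'world', 'hello']
print(T.text_to_word_sequence('a-b-c d', filters='', split='-'))  # expected: ['a', 'b', 'c d']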
df = drop_useless_features(df, [
    "helpful", "reviewTime", "reviewerID", "unixReviewTime", "asin",
    "reviewerName", "overall", "summary"
])

# Drop NaN values, as they are mostly rows where the overall column has
# neutral sentiment (3.0). This is not our target anyway.
df = df.dropna()

# Removing punctuation and converting to lowercase
df['reviewText'] = df['reviewText'].apply(
    lambda x: ' '.join(text_to_word_sequence(x)))

# removing numbers from the reviewText column
df['reviewText'] = df['reviewText'].str.replace('\d+', '')

# Plot positive and negative ratings
plot_size = plt.rcParams["figure.figsize"]
plot_size[0] = 8
plot_size[1] = 6
plt.rcParams["figure.figsize"] = plot_size
df.rating.value_counts().plot(kind='pie', autopct='%1.0f%%')

df.reviewText.str.len().max()

# prepare tokenizer
final_stop_words = set(
    [word for word in stopword if word not in not_stopwords])
speller = Speller()
for i in range(len(df['comments'])):
    df['comments'][i] = re.sub("[0-9]+", " ", str(
        df['comments'][i]))  # removing digits, since they're not important
    df['comments'][i] = deEmojify(df['comments'][i])
    df['comments'][i] = strip_punctuation(df['comments'][i])
    df['comments'][i] = ' '.join(
        speller(word) for word in df['comments'][i].split()
        if word not in final_stop_words)  # removing stopwords and spell-correcting

max_sent_len = 100
max_vocab_size = 1500
word_seq = [text_to_word_sequence(comment) for comment in df['comments']]
# print(word_seq)

# vectorizing a text corpus, turning each text into a sequence of integers
# (each integer being the index of a token in a dictionary)
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(
    [' '.join(seq[:max_sent_len]) for seq in word_seq]
)  # updates the internal vocabulary based on a list of texts, truncated to max_sent_len
# print("vocab size: ", len(tokenizer.word_index))  # vocab size: 949

# converting sequences of words to sequences of indices
X = tokenizer.texts_to_sequences(
    [' '.join(seq[:max_sent_len]) for seq in word_seq])
X = pad_sequences(X, maxlen=max_sent_len, padding='post', truncating='post')
# X = np.expand_dims(X, axis=2)  # reshape X to 3 dimensions
# X = np.reshape(X, (X.shape[0], X.shape[1], 1))
def convert_text_to_index_array(text):
    # `text_to_word_sequence` only splits the text into lowercased tokens;
    # it does not pad or truncate, so the resulting index lists vary in
    # length and must be padded separately before batching.
    return [dictionary[word] for word in kpt.text_to_word_sequence(text)]
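# A minimal sketch (not from the original code) illustrating the point above:
# token lists come back with different lengths, so pad_sequences is needed to
# build a rectangular batch. `dictionary` here is a hypothetical word index,
# and kpt is assumed to alias keras.preprocessing.text.
from keras.preprocessing import text as kpt
from keras.preprocessing.sequence import pad_sequences

dictionary = {'nice': 1, 'movie': 2, 'terrible': 3}
seqs = [convert_text_to_index_array(t) for t in ['Nice movie!', 'Terrible.']]
print(seqs)                           # expected: [[1, 2], [3]] -- unequal lengths
print(pad_sequences(seqs, maxlen=4))  # zero-padded 2x4 matrix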
def tokenize(self, text):
    return text_to_word_sequence(text)
def get_nomalized_test_data(train_vocab):
    # get the test data
    data = get_data.Datasets()
    test_data = data.get_test_data()
    test_reviews = []
    test_sentences = []
    test_labels = []

    # ---------- test data preprocessing ----------
    # clean the test dataset
    for test in test_data["review"]:
        cleaned_test = data.clean_text_to_text(test)
        test_reviews.append(cleaned_test)
        sentences = tokenize.sent_tokenize(cleaned_test)
        test_sentences.append(sentences)

    # define the labels
    for id in test_data["id"]:
        # print(id)
        id = id.strip('"')
        # print(id)
        id, score = id.split('_')
        score = int(score)
        if (score < 5):
            test_labels.append(0)
        if (score >= 7):
            test_labels.append(1)

    # create a tokenizer limited to the top 20000 words
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(test_reviews)
    print("train_vocab")
    print(train_vocab)
    print(len(train_vocab))

    # define the test_matrix
    test_matrix = np.zeros((len(test_reviews), SEN_NUM, WORDS_NUM), dtype='int32')  # (250000, 15, 100)
    # print(test_matrix.shape)
    non_exist = 0
    for review_index, review in enumerate(test_sentences):
        for sentence_index, sentence in enumerate(review):
            if (sentence_index < SEN_NUM):
                # print(sentence)
                tokens = text_to_word_sequence(sentence)
                num = 0
                for _, token in enumerate(tokens):
                    # see if the token is in the vocab
                    if (token not in train_vocab.keys()):
                        print(token)
                        non_exist += 1
                        continue
                    if (num < WORDS_NUM and train_vocab[token] < MAX_NB_WORDS):
                        test_matrix[review_index, sentence_index, num] = train_vocab[token]
                        num += 1
    print(non_exist)

    # test_labels -> to_categorical
    predicted_labels = to_categorical(np.asarray(test_labels))
    return test_matrix, predicted_labels
def similarity(self, x, y):
    x = text_to_word_sequence(x)
    y = text_to_word_sequence(y)
    return -self.word2vec.wv.wmdistance(x, y)
def run_model(df, MAX_WORDS, MAX_SENTS, MAX_SENT_LENGTH, VALIDATION_SPLIT, EMBEDDING_DIM):
    x = df['Text']
    y = df['Class']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                        random_state=1000, stratify=y)

    # split data into sentences using the spaCy sentencizer;
    # since we only use spaCy for sentencizing, we only invoke the sentencizer
    nlp = spacy.blank("en")
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    x_train_sentences = pd.Series([doc for doc in nlp.pipe(x_train)]).apply(
        lambda x: [sent for sent in x.sents])

    # oov_token=True reserves a token for unknown words (rather than ignoring the word)
    tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token=True)
    tokenizer.fit_on_texts(x_train.values)

    data = np.zeros((len(x_train_sentences), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
    # i is the sample index, samples is each sample
    for i, samples in enumerate(x_train_sentences):
        # j is the sentence index, sentences is each sentence
        for j, sentences in enumerate(samples):
            if j < MAX_SENTS:
                # wordTokens is the list of tokens
                wordTokens = text_to_word_sequence(str(sentences))
                k = 0
                # word is each individual token
                for _, word in enumerate(wordTokens):
                    if k < MAX_SENT_LENGTH:
                        if word not in tokenizer.word_index:
                            # remove special characters
                            continue
                        if tokenizer.word_index[word] < MAX_WORDS:
                            data[i, j, k] = tokenizer.word_index[word]
                            k += 1

    print('Total %s unique tokens.' % len(tokenizer.word_index))
    labels = pd.get_dummies(y_train.values)
    print('Shape of samples tensor:', data.shape)
    print('Shape of labels tensor:', labels.shape)

    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
    x_train = data[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labels[-nb_validation_samples:]
    print('Number of samples of each class in training and validation set')
    print(y_train.sum(axis=0))
    print(y_val.sum(axis=0))

    embedding_layer = Embedding(len(tokenizer.word_index) + 1, EMBEDDING_DIM,
                                input_length=MAX_SENT_LENGTH, trainable=True)

    word_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
    word_sequences = embedding_layer(word_input)
    word_lstm = Bidirectional(LSTM(100, return_sequences=True))(word_sequences)
    word_dense = TimeDistributed(Dense(200))(word_lstm)
    word_att = AttentionWithContext()(word_dense)
    wordEncoder = Model(word_input, word_att)

    sent_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
    sent_encoder = TimeDistributed(wordEncoder)(sent_input)
    sent_lstm = Bidirectional(LSTM(100, return_sequences=True))(sent_encoder)
    sent_dense = TimeDistributed(Dense(200))(sent_lstm)
    sent_att = AttentionWithContext()(sent_dense)
    predictions = Dense(20, activation='softmax')(sent_att)
    model = Model(sent_input, predictions)

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    print(model.summary())
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    history = model.fit(x_train, y_train, validation_data=(x_val, y_val),
                        epochs=50, batch_size=50, callbacks=[es])

    # vectorize the test set the same way (keep the per-sample list of sentences,
    # otherwise the indexing loop below breaks)
    x_test_sentences = pd.Series([doc for doc in nlp.pipe(x_test)]).apply(
        lambda x: [sent for sent in x.sents])

    test_data = np.zeros((len(x_test_sentences), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
    test_labels = pd.get_dummies(y_test.values)
    print('Shape of test samples tensor:', test_data.shape)
    print('Shape of test labels tensor:', test_labels.shape)
    print('Number of samples of each class in test set: ', test_labels.sum(axis=0))

    for i, samples in enumerate(x_test_sentences):
        # j is the sentence index, sentences is each sentence
        for j, sentences in enumerate(samples):
            if j < MAX_SENTS:
                # wordTokens is the list of tokens
                wordTokens = text_to_word_sequence(str(sentences))
                k = 0
                # word is each individual token
                for _, word in enumerate(wordTokens):
                    if k < MAX_SENT_LENGTH:
                        if word not in tokenizer.word_index:
                            # remove special characters
                            continue
                        if tokenizer.word_index[word] < MAX_WORDS:
                            test_data[i, j, k] = tokenizer.word_index[word]
                            k = k + 1

    x_test = test_data
    y_test_single = y_test  # need this for the classification report / f1
    y_test = test_labels

    loss, accuracy = model.evaluate(x_train, y_train, verbose=2)
    print("Training Accuracy: {:.4f}".format(accuracy))
    loss, accuracy = model.evaluate(x_test, y_test, verbose=2)
    print("Testing Accuracy: {:.4f}".format(accuracy))
    plot_history(history)

    y_pred = np.argmax(model.predict(x_test), axis=1) + 1
    cm = confusion_matrix(y_test_single, y_pred, labels=None, sample_weight=None)
    print("\nClassification report summary:")
    print(classification_report(y_test_single, y_pred, labels=[i + 1 for i in range(20)], digits=3))
    return model, cm
# import data after preprocessing
filename = "C:/Documents/AI/RNN ass/cleanedup-news-file-test.csv"

# read data into raw text
raw_text = open(filename, encoding="utf8").read()

# lowercase
raw_text = raw_text.lower()

# remove punctuation using a translation table
dropPunctuation = str.maketrans("", "", string.punctuation)
raw_text = raw_text.translate(dropPunctuation)

# define start time
start_time1 = datetime.datetime.now()

"""
Pretreat Data Section
"""

# create mapping of unique words to integers
lines = text_to_word_sequence(raw_text)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
encoded = tokenizer.texts_to_sequences(lines)
encoded_sequence = []
for seq in encoded:
    encoded_sequence.append(seq[0])

# create sequences for training
sequences = []
for i in range(0, len(encoded_sequence) - seq_length - 1):
    sequence = encoded_sequence[i:i + seq_length + 1]
    sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

# vocabulary size
vocab_size = len(tokenizer.word_index) + 1
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence
from nltk.corpus import gutenberg
from string import punctuation

bible = gutenberg.sents('bible-kjv.txt')
remove_terms = punctuation + '0123456789'
norm_bible = [[word.lower() for word in sent if word not in remove_terms] for sent in bible]
norm_bible = [' '.join(tok_sent) for tok_sent in norm_bible]
norm_bible = filter(None, normalize_corpus(norm_bible))
norm_bible = [tok_sent for tok_sent in norm_bible if len(tok_sent.split()) > 2]

tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)
word2id = tokenizer.word_index  # build vocabulary of unique words
word2id['PAD'] = 0
id2word = {v: k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in norm_bible]

vocab_size = len(word2id)
embed_size = 100
window_size = 2  # context window size

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])
def create_dataset():
    """ Create the IMDB dataset as numpy arrays. """
    st = time.time()
    print('Constructing dataset...')
    data_train = pd.read_csv('data/labeledTrainData.tsv', sep='\t')
    data_test = pd.read_csv('data/testData.tsv', sep='\t')

    from nltk import tokenize
    get_punkt_if_necessary(tokenize)

    reviews = []
    labels = []
    texts = []

    for idx in range(data_train.review.shape[0]):
        text = BeautifulSoup(data_train.review[idx], features="html.parser")
        text = clean_str(text.get_text().encode('ascii', 'ignore').decode())
        texts.append(text)
        sentences = tokenize.sent_tokenize(text)
        reviews.append(sentences)
        labels.append(data_train.sentiment[idx])

    for idx in range(data_test.review.shape[0]):
        text = BeautifulSoup(data_test.review[idx], features="html.parser")
        text = clean_str(text.get_text().encode('ascii', 'ignore').decode())
        texts.append(text)  # texts is the raw text
        sentences = tokenize.sent_tokenize(text)
        reviews.append(sentences)
        if data_test.id[idx][-1] in "12345":
            labels.append(0)
        else:
            labels.append(1)

    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(texts)

    print('Tokenizing...')
    data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
    for i, sentences in enumerate(reviews):
        for j, sent in enumerate(sentences):
            if j < MAX_SENTS:
                wordTokens = text_to_word_sequence(sent)
                k = 0
                for _, word in enumerate(wordTokens):
                    if k < MAX_SENT_LENGTH and tokenizer.word_index[word] <= MAX_NUM_WORDS:
                        data[i, j, k] = tokenizer.word_index[word]
                        k = k + 1

    word_index = tokenizer.word_index
    print('Total %s unique tokens.' % len(word_index))
    labels = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    x_train = data[:25000]
    y_train = labels[:25000]
    x_val = data[25000:]
    y_val = labels[25000:]
    print('Number of positive and negative reviews in training and validation set')
    print('Creating dataset takes {}s.'.format(time.time() - st))

    print('Storing dataset...')
    np.save('data/x_train.npy', x_train)
    np.save('data/y_train.npy', y_train)
    np.save('data/x_val.npy', x_val)
    np.save('data/y_val.npy', y_val)
    with open('data/word_index.pkl', 'wb') as f:
        pkl.dump(word_index, f)
with open('data.txt', 'r') as file:
    text = file.read()
lines = text.lower().split('\n')

from keras.preprocessing.text import text_to_word_sequence, Tokenizer

words = text_to_word_sequence(text)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(words)
vocabulary_size = len(tokenizer.word_index) + 1
sequences = tokenizer.texts_to_sequences(lines)

subsequences = []
for sequence in sequences:
    for i in range(1, len(sequence)):
        subsequence = sequence[:i + 1]
        subsequences.append(subsequence)

from keras.preprocessing.sequence import pad_sequences

sequence_length = max([len(sequence) for sequence in sequences])
sequences = pad_sequences(subsequences, maxlen=sequence_length, padding='pre')

from keras.utils import to_categorical

x, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=vocabulary_size)

from keras.models import Sequential
def parse(self):
    """ parse json files to generate tables and the vocab dict """
    print('start loading json file...')
    train_ann = json.load(open(self.config.train_annFile, 'r'))
    train_ques = json.load(open(self.config.train_questionFile, 'r'))
    val_ann = json.load(open(self.config.val_annFile, 'r'))
    val_ques = json.load(open(self.config.val_questionFile, 'r'))
    print('load completed!')

    questions_train_ls = []
    questions_val_ls = []
    answers_train_ls = []
    answers_val_ls = []
    self.parse_answer(train_ann, answers_train_ls)
    self.parse_answer(val_ann, answers_val_ls)
    print('completed parsing train data')
    self.parse_question(train_ques, questions_train_ls)
    self.parse_question(val_ques, questions_val_ls)
    print('completed parsing val data')

    # check the data integrity
    assert len(questions_train_ls) == len(answers_train_ls)
    assert len(questions_val_ls) == len(answers_val_ls)

    questions = ' '.join([x[2] for x in questions_train_ls])
    questions = questions + ' ' + ' '.join([x[2] for x in questions_val_ls])
    q_counter = Counter(text_to_word_sequence(questions))
    questions = set(dict(q_counter.most_common(6000)).keys())
    # questions = set()
    an_ls = [x[2] for x in answers_train_ls] + [x[2] for x in answers_val_ls]
    a_counter = Counter(an_ls)
    answers = set(dict(a_counter.most_common(3000)).keys())
    self.build_vocab(questions, answers)
    print('completed building vocab')

    question_id_answer_train_df = pd.DataFrame(
        data=answers_train_ls, columns=['image_id', 'question_id', 'answer'])
    question_id_answer_val_df = pd.DataFrame(
        data=answers_val_ls, columns=['image_id', 'question_id', 'answer'])
    question_id_question_train_df = pd.DataFrame(
        data=questions_train_ls, columns=['image_id', 'question_id', 'question'])
    question_id_question_val_df = pd.DataFrame(
        data=questions_val_ls, columns=['image_id', 'question_id', 'question'])

    question_id_answer_train_df['answer'] = question_id_answer_train_df['answer'].apply(self.encode_answer)
    question_id_answer_val_df['answer'] = question_id_answer_val_df['answer'].apply(self.encode_answer)
    question_id_question_train_df['question'] = question_id_question_train_df['question'].apply(self.encode_question)
    question_id_question_val_df['question'] = question_id_question_val_df['question'].apply(self.encode_question)

    self.train_data = pd.merge(question_id_question_train_df,
                               question_id_answer_train_df,
                               on=['image_id', 'question_id']).drop(['question_id'], axis=1)
    self.val_data = pd.merge(question_id_question_val_df,
                             question_id_answer_val_df,
                             on=['image_id', 'question_id']).drop(['question_id'], axis=1)

    self.train_sample_size = len(question_id_answer_train_df)
    self.val_sample_size = len(question_id_answer_val_df)
    print('train_sample_size:%d\n val_sample_size:%d ' %
          (self.train_sample_size, self.val_sample_size))

    self.data_cleaning('train')
    self.data_cleaning('val')
    print('removed data for which no picture can be found')
#!/usr/bin/python3
# coding: utf-8

# https://github.com/EliasCai/sentiment/blob/master/sentiment_words.py#L78
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import hashing_trick

##################################################################
## 1. text_to_word_sequence, one_hot, hashing_trick
texts = ['some thing to eat', 'some thing to drink']
print(text_to_word_sequence(texts[0]))  # ['some', 'thing', 'to', 'eat']; simple split on whitespace
print(one_hot(texts[0], 10))  # [5, 7, 5, 7]; (10 means the resulting indices stay below 10)
print(one_hot(texts[1], 10))  # [5, 7, 5, 5]; one_hot hashes internally, so for a fixed (text, n) each str always maps to the same index
# This is a wrapper to the `hashing_trick` function using `hash` as the hashing function, unicity of word to index mapping non-guaranteed.

##################################################################
## 2. Tokenizer: indices are assigned by frequency rank
# keras.preprocessing.text.Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n', lower=True, split=" ", char_level=False)
# Tokenizer is a class for vectorizing texts, or turning texts into sequences (lists of word indices into the dictionary, counting from 1).
# num_words: None or int, the maximum number of words to keep. If set to an integer, the tokenizer is restricted to the num_words most common words in the dataset.
# char_level: if True, every character is treated as a token.
texts = ['some thing to eat', 'some thing to drink']
tmp_tokenizer = Tokenizer(num_words=None)  # num_words: None or int, maximum number of words to keep; less frequent words are dropped
tmp_tokenizer.fit_on_texts(texts)
# tmp_tokenizer.fit_on_texts(texts[0]); tmp_tokenizer.fit_on_texts(texts[1])  # don't do this -- a bare string is iterated character by character

# attributes
print(tmp_tokenizer.word_counts)  # OrderedDict([('some', 2), ('thing', 2), ('to', 2), ('eat', 1), ('drink', 1)]); how often each word appeared during fitting
print(tmp_tokenizer.word_docs)  # {'thing': 2, 'eat': 1, 'to': 2, 'some': 2, 'drink': 1}; number of documents/texts each word appeared in
print(tmp_tokenizer.word_index)  # {'some': 1, 'thing': 2, 'to': 3, 'eat': 4, 'drink': 5}; rank / index of each word
print(len(tmp_tokenizer.word_index))  # 5; size of the vocabulary
print(tmp_tokenizer.index_docs)  # {2: 2, 4: 1, 3: 2, 1: 2, 5: 1}; word_docs keyed by word_index
print(tmp_tokenizer.document_count)  # 2; number of documents used for fitting
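## (hedged continuation, not in the original snippet) once fitted, the Tokenizer
## can convert texts into index sequences or a document-term matrix; the values
## in the comments follow from the word_index printed above.
print(tmp_tokenizer.texts_to_sequences(['some thing to eat']))  # expected: [[1, 2, 3, 4]]
print(tmp_tokenizer.texts_to_matrix(['some thing to eat'], mode='binary'))  # one row per text, columns indexed by word_index (column 0 unused)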
file = open("txt_sentoken/pos/" + str(file_name), "r") docs_list.append(file.read()) for file_name in negative_files_names: file = open("txt_sentoken/neg/" + str(file_name), "r") docs_list.append(file.read()) labels_positive = [1] * len(positive_files_names) labels_negative = [0] * len(negative_files_names) labels = labels_positive + labels_negative labels = np.array(labels) docs_tokens = [] for doc in docs_list: docs_tokens.append(text_to_word_sequence(doc)) Article = collections.namedtuple('Article', 'words tags paragraph') tuples_list = [] for i in range(len(docs_tokens)): tuples_list.append( Article(words=docs_tokens[i], tags=[str(i)], paragraph=docs_list[i])) tuples_list = shuffle(tuples_list) # model = Doc2Vec(size=50, # alpha=0.025, # min_alpha=0.00025, # min_count=1, # dm =1)
def test_text_to_word_sequence_unicode_multichar_split():
    text = u'ali!stopveli?stopkırkstopdokuzstopelli'
    assert text_to_word_sequence(text, split='stop') == [u'ali', u'veli', u'kırk', u'dokuz', u'elli']
batchSize = 128        # how many samples to feed the neural network
GRU_UNITS = 256        # number of nodes in the GRU layer
numClasses = 2         # {Positive, Negative}
iterations = 100000    # how many iterations to train
nodes_on_FC = 64       # number of nodes on the FC layer
epsilon = 1e-4         # for batch normalization
val_loop_iter = 50     # record every this many iterations

# Reading csv's
train = pd.read_csv('dataset/train_amazon.csv')
test = pd.read_csv('dataset/test_amazon.csv')

# Removing punctuation and converting to lowercase
train['text'] = train['text'].apply(
    lambda x: ' '.join(text_to_word_sequence(x)))
test['text'] = test['text'].apply(lambda x: ' '.join(text_to_word_sequence(x)))


def remove_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x


# Removing numbers
train['text'] = train['text'].apply(lambda x: remove_numbers(x))
test['text'] = test['text'].apply(lambda x: remove_numbers(x))
def preprocess_text(text):
    word_sequence = text_to_word_sequence(text)
    indices_sequence = [[word_index[word] if word in word_index else 0
                         for word in word_sequence]]
    x = tokenizer.sequences_to_matrix(indices_sequence, mode='binary')
    return x
    w2i, i2w = generate_w2i_i2w_dict(data_file)
    with open(data_file, 'r') as fr:
        for line in fr:
            query1, query2, label = line.split('\t')[:3]
            query = (sentence2sequence(query1, w2i), sentence2sequence(query2, w2i))
            X.append(query)
            y.append(int(label))
    X = np.array(X)
    y = np.array(y)
    return X, y


if __name__ == '__main__':
    # data_file = '../Files/yahoo.data.dat'
    data_file = '../Files/yahooAnswer.txt'
    sentences_file = '../Files/sentences.dat'
    words_file = '../Files/words.dat'
    # output = pd.read_csv('C:\Users\AC\PycharmProjects\NLPSimilarity\Files\yahoo.data.dat', names=['Query1', 'Query2', 'label', 'ID'], sep='\t')
    # print type(output)
    # print output
    # output = output.as_matrix()
    # print type(output)
    # print output
    words = generate_words_set(data_file)
    print 'There are %d words in total' % len(words)  # 61681
    sentences = generate_sentences_set(data_file)
    print 'The longest sentence contains %d words' % max([len(text_to_word_sequence(sentence)) for sentence in sentences])  # 788
    X, y = get_data(data_file)
    print type(X), len(X), X.ndim, X[0], X[0][0], X[0][1]
    # <type 'numpy.ndarray'> 20563 2 [[5859, 5489, 502, 5805, 3429, 9129] [5580, 5923, 10200, 3429, 9132]] [5859, 5489, 502, 5805, 3429, 9129] [5580, 5923, 10200, 3429, 9132]
    print type(y), len(y), y.ndim, y[0]
    # <type 'numpy.ndarray'> 20563 1
def test_text_to_word_sequence_multichar_split():
    text = 'hello!stop?world!'
    assert text_to_word_sequence(text, split='stop') == ['hello', 'world']
total_texts.extend(word_not_in_texts)

fileRaw = open('./data/Trump_texts.txt', 'r')
raw_strings = list()
for line in fileRaw:
    raw_strings.append(text_cleaner(line))

tokenizer = Tokenizer()
tokenizer.fit_on_texts(total_texts)
word2id = tokenizer.word_index
id2word = {v: k for k, v in word2id.items()}

xtr = list()
for q in X_train:
    xtr.append(list(word2id[w] for w in text_to_word_sequence(q)))
xts = list()
for q in X_test:
    xts.append(list(word2id[w] for w in text_to_word_sequence(q)))

predict = list()
# for q in raw_strings:
#     l = list()
#     for w in text_to_word_sequence(q):
#         if w in word2id.keys():
#             l.append(word2id[w])
#         else:
#             l.append(word2id[word_not_in_texts[0]])
#     predict.append(l)
predict = list.copy(xtr)
SEQ_LEN_TR = len(max(train_df['question_text'], key=len).split())
SEQ_LEN_TS = len(max(test_df['question_text'], key=len).split())
SEQ_LEN = max(SEQ_LEN_TR, SEQ_LEN_TS)
print("SEQ_LEN:", SEQ_LEN)
assert SEQ_LEN == 45

##
train_cat_list, train_text_list, train_questions = [], [], []
test_text_list, test_questions = [], []

for i in range(len(train_df)):
    quest = train_df.loc[i, 'question_text']
    train_questions.append(quest)
    train_cat_list.append(train_df.loc[i, 'target'])
    train_text_list.append(text_to_word_sequence(process_text(quest), lower=LOWER_CASE))

for i in range(len(test_df)):
    quest = test_df.loc[i, 'question_text']
    test_questions.append(quest)
    test_text_list.append(text_to_word_sequence(process_text(quest), lower=LOWER_CASE))

assert len(train_cat_list) == len(train_text_list)
assert len(train_cat_list) == len(train_questions)
assert len(test_questions) == len(test_text_list)

print(">> train_size:", len(train_cat_list))
print(">> train sample:", train_cat_list[44], train_text_list[44], train_questions[44])
print(">> test_size:", len(test_questions))
print(">> test sample:", test_text_list[44], test_questions[44])
def train_on_texts(self, texts, context_labels=None,
                   batch_size=128,
                   num_epochs=50,
                   verbose=1,
                   new_model=False,
                   gen_epochs=1,
                   train_size=1.0,
                   max_gen_length=300,
                   validation=True,
                   dropout=0.0,
                   via_new_model=False,
                   **kwargs):

    if new_model and not via_new_model:
        self.train_new_model(texts,
                             context_labels=context_labels,
                             num_epochs=num_epochs,
                             gen_epochs=gen_epochs,
                             batch_size=batch_size,
                             dropout=dropout,
                             validation=validation,
                             **kwargs)
        return

    if context_labels:
        context_labels = LabelBinarizer().fit_transform(context_labels)

    if 'prop_keep' in kwargs:
        train_size = kwargs['prop_keep']

    if self.config['word_level']:
        texts = [text_to_word_sequence(text, filters='') for text in texts]

    # calculate all combinations of text indices + token indices
    indices_list = [np.meshgrid(np.array(i), np.arange(
        len(text) + 1)) for i, text in enumerate(texts)]
    indices_list = np.block(indices_list)

    # If a single text, there will be 2 extra indices, so remove them
    # Also remove first sequences which use padding
    if self.config['single_text']:
        indices_list = indices_list[self.config['max_length']:-2, :]

    indices_mask = np.random.rand(indices_list.shape[0]) < train_size

    gen_val = None
    val_steps = None
    if train_size < 1.0 and validation:
        indices_list_val = indices_list[~indices_mask, :]
        gen_val = generate_sequences_from_texts(
            texts, indices_list_val, self, context_labels, batch_size)
        val_steps = max(
            int(np.floor(indices_list_val.shape[0] / batch_size)), 1)

    indices_list = indices_list[indices_mask, :]

    num_tokens = indices_list.shape[0]
    assert num_tokens >= batch_size, "Fewer tokens than batch_size."

    level = 'word' if self.config['word_level'] else 'character'
    print("Training on {:,} {} sequences.".format(num_tokens, level))

    steps_per_epoch = max(int(np.floor(num_tokens / batch_size)), 1)

    gen = generate_sequences_from_texts(
        texts, indices_list, self, context_labels, batch_size)

    base_lr = 4e-3

    # scheduler function must be defined inline.
    def lr_linear_decay(epoch):
        return (base_lr * (1 - (epoch / num_epochs)))

    if context_labels is not None:
        if new_model:
            weights_path = None
        else:
            weights_path = "{}_weights.hdf5".format(self.config['name'])
            self.save(weights_path)

        self.model = textgenrnn_model(self.num_classes,
                                      dropout=dropout,
                                      cfg=self.config,
                                      context_size=context_labels.shape[1],
                                      weights_path=weights_path)

    self.model.fit_generator(gen, steps_per_epoch=steps_per_epoch,
                             epochs=num_epochs,
                             callbacks=[
                                 LearningRateScheduler(lr_linear_decay),
                                 generate_after_epoch(self, gen_epochs, max_gen_length),
                                 save_model_weights(self.config['name'])],
                             verbose=verbose,
                             max_queue_size=2,
                             validation_data=gen_val,
                             validation_steps=val_steps)

    # Keep the text-only version of the model if using context labels
    if context_labels is not None:
        self.model = Model(inputs=self.model.input[0],
                           outputs=self.model.output[1])
df = pd.read_csv("test_test.csv") df.columns = ['seq_label', 'sequence'] print(df['sequence']) Ori = df['sequence'].tolist() Orilabel = df["seq_label"].tolist() from textwrap import wrap # cut to kmers kmer_size = 1 #cut to kmers df['sequence'] = df.apply(lambda x: wrap(x['sequence'], kmer_size), axis=1) df['sequence'] = [','.join(map(str, l)) for l in df['sequence']] max_length = df.sequence.map(lambda x: len(x)).max() max_length = max_length / kmer_size df['sequence'] = df.apply( lambda x: text_to_word_sequence(x['sequence'], split=','), axis=1) df['sequence'] = df['sequence'].astype(str) vocab_max = 4**kmer_size print(vocab_max) # integer encode the document df['sequence'] = df.apply(lambda x: one_hot(x['sequence'], vocab_max), axis=1) print(df['sequence']) from keras.utils import np_utils from sklearn.preprocessing import LabelEncoder dataset = df.values Y = dataset[:, 0] encoder_label = LabelEncoder() encoder_label.fit(Y) encoded_Y = encoder_label.transform(Y) dummy_y = np_utils.to_categorical(encoded_Y)
def test_text_to_word_sequence():
    text = 'hello! ? world!'
    assert text_to_word_sequence(text) == ['hello', 'world']
def main():
    input_filepath = './instagram.csv'
    output_filepath = './instagram.pickle'

    embeddings_index = {}
    with open('glove.twitter.27B.50d.txt', 'r', encoding='utf-8') as fin:
        for line in fin:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('total {0} word vectors'.format(len(embeddings_index)))

    df = pd.read_csv(input_filepath, encoding="ISO-8859-1", index_col=False)
    # fill nan cells
    df.fillna('', inplace=True)
    # shuffle the dataframe
    df = shuffle(df)
    print('data shape:', df.shape)

    labels = []
    comments_all = []        # all the comments
    social_content_all = []  # number of likes/shares/followed_by/follows for all the posts
    time_sequence_all = []   # time sequences for all the posts

    for _, session in df.iterrows():
        # number of likes/shares/followed_by/follows
        social_content = [int(re.findall(r'\d+', session['likes'])[0]),
                          session['shared media'],
                          session['followed_by'],
                          session['follows']]
        social_content_all.append(social_content)

        label = session['question2']
        if label.startswith('n'):
            label = 0
        else:
            label = 1
        labels.append(label)

        # datetime when the message is posted, %Y-%m-%d %H:%M:%S
        post_time = ' '.join(session['cptn_time'].split()[-2:])
        # handle corrupted time format -- some dates are missing the 2 at the head, length should be 19
        if len(post_time) == 18:
            post_time = '2' + post_time
        last_post_time = datetime.datetime.strptime(post_time, DTFormat)

        comments = []
        time_sequence = [0]  # time when the owner posts the picture
        for comment_idx in range(1, MAX_SENTS + 1):
            comment = session['clmn{0}'.format(comment_idx)]
            if comment.strip() and 'empety' not in comment:
                comment = comment.strip()
                identifier = '(created_at:'
                ts_start_idx = comment.find(identifier)
                if ts_start_idx != -1:
                    # comment timestamp
                    len_comment = len(comment)
                    ts = comment[ts_start_idx + len(identifier): len_comment - 1]
                    ts = datetime.datetime.strptime(ts, DTFormat)
                    time_lag = ts - last_post_time
                    time_sequence.append(time_lag.seconds)
                    last_post_time = ts
                    # comment text
                    comment = comment[: ts_start_idx]
                comment = clean_str(comment)
                comments.append(comment)
        comments_all.append(comments)
        time_sequence_all.append(time_sequence)

    pad_time_sequence_all = np.zeros((len(time_sequence_all), MAX_SENTS))
    for ts_idx, time_sequence in enumerate(time_sequence_all):
        pad_time_sequence_all[ts_idx][0: len(time_sequence)] = time_sequence
    # uniq_time_sequence_size = len(np.unique(pad_time_sequence_all))
    social_content_all = np.array(social_content_all)
    # uniq_social_content_size = len(np.unique(social_content_all))

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    words_all = [' '.join(comments) for comments in comments_all]
    tokenizer.fit_on_texts(words_all)
    word_index = tokenizer.word_index
    text_tensor = np.zeros((len(comments_all), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
    print('total {0:,} unique tokens'.format(len(word_index)))

    for session_idx, comments in enumerate(comments_all):
        for comment_idx, comment in enumerate(comments):
            if comment_idx < MAX_SENTS:
                word_idx = 0
                for _, word in enumerate(text_to_word_sequence(comment)):
                    if word_idx < MAX_SENT_LENGTH and word_index[word] < MAX_NB_WORDS:
                        text_tensor[session_idx, comment_idx, word_idx] = word_index[word]
                        word_idx += 1

    # Hierarchical Attention Network for text and other info
    pad_time_sequence_all = np.delete(pad_time_sequence_all,
                                      range(MAX_SENTS, pad_time_sequence_all.shape[1]), axis=1)
    pad_time_sequence_all = preprocessing.StandardScaler().fit_transform(pad_time_sequence_all)
    social_content_all = preprocessing.StandardScaler().fit_transform(social_content_all)

    print('text_tensor shape:', text_tensor.shape)
    print('pad_time_sequence_all shape:', pad_time_sequence_all.shape)
    print('social_content_all shape:', social_content_all.shape)
    han_data = np.dstack((text_tensor, pad_time_sequence_all))
    print('Hierarchical Attention Network data shape (text + time):', han_data.shape)

    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, idx in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[idx] = embedding_vector

    store_data = {'embedding_matrix': embedding_matrix,
                  'data': han_data,
                  'labels': labels,
                  'postInfo': social_content_all,
                  'timeInfo': pad_time_sequence_all,
                  'word_index': word_index,
                  'df': df}
    pickle.dump(store_data, open(output_filepath, 'wb'))
    print('successfully write to output file {0}'.format(output_filepath))
def test_text_to_word_sequence_unicode():
    text = u'ali! veli? kırk dokuz elli'
    assert text_to_word_sequence(text) == [u'ali', u'veli', u'kırk', u'dokuz', u'elli']
def tokenizer(self, train_all_doc, dev_all_doc, test_all_doc, maxsentence, mylen):
    length_up_num = 0
    max_token = 0
    allfile = [train_all_doc, dev_all_doc, test_all_doc]
    wordvec = []
    all_document = []
    for file in allfile:
        documents = []
        for doc in file:
            temp = []
            word = []
            for sencent in doc:
                sen = ""
                token = text_to_word_sequence(
                    sencent,
                    filters='!"#$%&()*+,-.:;=?@[\]^`{|}/~',
                    lower=True,
                    split=" ")
                max_token = max(max_token, len(token))
                if (len(token) > mylen):
                    length_up_num += 1
                temp.append(token)
                for c in token:
                    sen = sen + c + " "
                sen = sen.strip(" ")
                word.append(sen)
            wordvec.append(word)
            documents.append(temp)
        all_document.append(documents)

    all_raw = []
    for file in all_document:
        for inst in file:
            instance = []
            for sencent in inst:
                for word in sencent:
                    instance.append(word)
            all_raw.append(instance)
    all_dict = []
    all_dict.extend(all_raw)

    print('max token is ', max_token)
    print('up than ', mylen, ' num is', length_up_num)

    tokenizer = Tokenizer(filters='!"#$%&()*+,-.:;=/?@[\]^`{|}~\'>', lower=True, split=" ")
    tokenizer.fit_on_texts(all_dict)
    vocab_size = len(tokenizer.word_index) + 1
    print(tokenizer.word_docs)
    print('vocab', vocab_size)

    pad_sentence = []
    for i in range(mylen):
        pad_sentence.append(0)
    pad_sentence = np.array(pad_sentence)

    train_x = []
    for i in all_document[0]:
        train_index_x = tokenizer.texts_to_sequences(i)
        train_index_x = list(sequence.pad_sequences(train_index_x, mylen, padding='post', truncating='post'))
        for j in range(len(train_index_x), maxsentence):
            train_index_x.append(pad_sentence)
        train_x.append(train_index_x)
    train_x = np.reshape(train_x, (len(train_x), maxsentence, mylen))

    dev_x = []
    for i in all_document[1]:
        train_index_x = tokenizer.texts_to_sequences(i)
        train_index_x = list(sequence.pad_sequences(train_index_x, mylen, padding='post', truncating='post'))
        for j in range(len(train_index_x), maxsentence):
            train_index_x.append(pad_sentence)
        dev_x.append(train_index_x)
    dev_x = np.reshape(dev_x, (len(dev_x), maxsentence, mylen))

    test_x = []
    for i in all_document[2]:
        train_index_x = tokenizer.texts_to_sequences(i)
        train_index_x = list(sequence.pad_sequences(train_index_x, mylen, padding='post', truncating='post'))
        for j in range(len(train_index_x), maxsentence):
            train_index_x.append(pad_sentence)
        test_x.append(train_index_x)
    test_x = np.reshape(test_x, (len(test_x), maxsentence, mylen))

    return train_x, dev_x, test_x, tokenizer, wordvec
    text = clean_str(text.get_text().encode('ascii', 'ignore'))
    texts.append(text)
    sentences = tokenize.sent_tokenize(text)
    reviews.append(sentences)
    labels.append(data_train.sentiment[idx])

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j < MAX_SENTS:
            wordTokens = text_to_word_sequence(sent)
            k = 0
            for _, word in enumerate(wordTokens):
                if k < MAX_SENT_LENGTH and tokenizer.word_index[word] < MAX_NB_WORDS:
                    data[i, j, k] = tokenizer.word_index[word]
                    k = k + 1

word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
def convert_text_to_index_array(text):
    return [dictionary[word] for word in kpt.text_to_word_sequence(text)]