def get_sequence(rangeOfSequence, mode):
    """Build integer-run sequence data for every pair of bounds in 1..rangeOfSequence.

    For each pair (start, end) an ascending run start..end is generated, and
    for each swapped pair a descending run. Every run is emitted twice:

    * ``target_in``  -- ``[0] + run + [-1]`` (teacher-forcing input)
    * ``target_out`` -- ``run + [-1] + [0]`` (expected output)

    The -1 serves as an 'end of sentence' indicator. Both targets are padded
    at the end with ``pad(..., padding='post')``.

    Modes:
        1: categorical input, categorical output
        2: categorical input, continuous output
        3: continuous input, categorical output
        4: continuous input, continuous output

    Any ``mode`` other than 2, 3 or 4 is treated as mode 1.

    Returns:
        A tuple ``(inputs, target_in, target_out)``. Continuous values are
        float arrays; categorical values are the result of
        ``to_categorical(..., num_classes=rangeOfSequence + 2)``.
    """
    upper = rangeOfSequence
    n_classes = upper + 2

    def _run(first, last, step):
        # Inclusive run first..last, each value wrapped in its own list.
        return [[value] for value in range(first, last + step, step)]

    ascending = [[lo, hi] for lo in range(1, upper + 1) for hi in range(lo, upper + 1)]
    descending = [[hi, lo] for lo, hi in ascending]

    # Ascending runs first, then the mirrored descending ones.
    runs = [_run(first, last, 1) for first, last in ascending]
    runs += [_run(first, last, -1) for first, last in descending]

    target_in = pad([[[0]] + run + [[-1]] for run in runs], padding='post')
    target_out = pad([run + [[-1]] + [[0]] for run in runs], padding='post')

    data = np.array(
        [[[first], [last]] for first, last in ascending + descending],
        dtype=float)
    target_out = np.array(target_out, dtype=float)
    target_in = np.array(target_in, dtype=float)

    if mode == 4:
        return data, target_in, target_out

    n_samples, n_steps = target_in.shape[0], target_in.shape[1]
    d = to_categorical([data], num_classes=n_classes).reshape(n_samples, 2, n_classes)
    if mode == 2:
        return d, target_in, target_out

    t_in = to_categorical([target_in], num_classes=n_classes)
    t_in = t_in.reshape(n_samples, n_steps, n_classes)
    t_out = to_categorical([target_out], num_classes=n_classes)
    t_out = t_out.reshape(n_samples, n_steps, n_classes)
    if mode == 3:
        return data, t_in, t_out
    return d, t_in, t_out  # mode 1
def pad_sequences(train_X, train_y):
    """Pad ``train_X`` and ``train_y`` to the longest entry of ``train_X``.

    The target length is the maximum ``len(s)`` over ``train_X`` (or -1 when
    ``train_X`` is empty). Both collections are passed to ``pad`` with the
    positional arguments ``(MAX_LENGTH, 'int32', 'post', 'pre', 0.0)``.
    NOTE(review): presumably these are maxlen, dtype, padding, truncating and
    value of Keras ``pad_sequences`` -- confirm against the import of ``pad``.

    Returns:
        ``(padded_X, padded_y, MAX_LENGTH)``.
    """
    # The trailing -1 reproduces the sentinel used when train_X has no entries.
    MAX_LENGTH = max([len(sentence) for sentence in train_X] + [-1])

    pad_args = (MAX_LENGTH, 'int32', 'post', 'pre', 0.0)
    padded_X = pad(train_X, *pad_args)
    padded_y = pad(train_y, *pad_args)
    return padded_X, padded_y, MAX_LENGTH
def process_text(text, to_pad=False, max_len=None, tok=None, save_name=None, num_word=None):
    """Tokenize tab-separated "NAME<tab>caption" lines and group them by name.

    Args:
        text: a single string with one record per line; each line holds a
            name, a tab character and a caption. Blank lines are skipped.
        to_pad: when true, the caption sequences are post-padded with 0 to
            ``max_len``.
        max_len: target length for padding. When None and ``to_pad`` is
            true, the longest cleaned caption (counted in space-separated
            tokens) is used.
        tok: tokenizer to reuse. Pass None while processing training data;
            a new ``Tokenizer(num_words=num_word)`` is then created and
            fitted on the cleaned captions.
        save_name: when given, ``text_dict`` is pickled to ``save_name`` and
            the tokenizer to ``save_name + 'tokenizer'``.
        num_word: forwarded to ``Tokenizer`` as ``num_words``.

    Returns:
        ``(tok, text_dict)`` where ``text_dict`` maps each name to the list
        of its caption sequences, in input order.

    Raises:
        IndexError: if a non-blank line contains no tab character.
    """
    names = []
    descs = []
    for line in text.split('\n'):
        # A trailing newline used to produce an empty record and crash.
        if not line.strip():
            continue
        fields = line.split('\t')
        names.append(fields[0])
        descs.append(fields[1])

    clean_descs = [desc.split(' ') for desc in clean_text(descs)]

    if tok is None:
        tok = Tokenizer(num_words=num_word)
        tok.fit_on_texts(clean_descs)
    desc_seqs = tok.texts_to_sequences(clean_descs)

    if to_pad:
        if max_len is None:
            max_len = max([len(tokens) for tokens in clean_descs] or [0])
        # Pad in both cases: previously an inferred max_len was computed but
        # never applied because the pad call sat in the ``else`` branch.
        desc_seqs = pad(desc_seqs, maxlen=max_len, padding='post', value=0)

    text_dict = dict()
    for name, seq in zip(names, desc_seqs):
        text_dict.setdefault(name, []).append(seq)

    if save_name is not None:
        with open(save_name, 'wb') as fil:
            pickle.dump(obj=text_dict, file=fil)
        with open(save_name + 'tokenizer', 'wb') as fil:
            pickle.dump(obj=tok, file=fil)
    return tok, text_dict
def __getitem__(self, i):
    """Return ``(target, token ids, attention mask)`` tensors for sample ``i``.

    The text is stripped and encoded with ``self.tokenizer``, padded or
    truncated at the end to ``MAXLEN`` with pad value 1, and paired with a
    mask that is 1 on positions ``1 .. len(ids) - 2`` and 0 elsewhere.
    NOTE(review): ids 0 / 1 / 2 look like RoBERTa-style <s> / <pad> / </s>
    -- confirm against the tokenizer in use.

    Returns:
        ``FloatTensor([target])``, a ``LongTensor`` of ids with shape
        ``(1, MAXLEN)`` and a ``LongTensor`` mask with shape ``(1, MAXLEN)``.
    """
    pg, tg = 'post', 'post'
    target = [self.target[i]]
    question = str(self.text[i])
    quest_ids = self.tokenizer.encode(question.strip())

    if 0 not in quest_ids:
        # Prepend the start id. The previous ``0 + quest_ids`` raised
        # TypeError whenever the encoder returned a plain list.
        quest_ids = [0] + quest_ids
    # Measured after the optional prepend so the mask lines up with the ids.
    attention_mask_idx = len(quest_ids) - 1

    quest_ids = pad([quest_ids], maxlen=MAXLEN, value=1, padding=pg, truncating=tg)

    attention_mask = np.zeros(MAXLEN)
    attention_mask[1:attention_mask_idx] = 1
    attention_mask = attention_mask.reshape((1, -1))

    if 2 not in quest_ids:
        # Both arrays are (1, MAXLEN); index the last column explicitly.
        # ``arr[-1] = x`` would overwrite the entire row.
        quest_ids[0, -1], attention_mask[0, -1] = 2, 0

    return FloatTensor(target), LongTensor(quest_ids), LongTensor(attention_mask)
def predict_sentiment(tweet):
    """Classify ``tweet`` as 'positive', 'neutral' or 'negative'.

    The tweet is stripped, encoded with the module-level ``tokenizer``,
    padded or truncated at the end to ``MAXLEN`` with pad value 1, and fed to
    the module-level ``network`` together with a mask that is 1 on positions
    ``1 .. len(ids) - 2``. The label is the argmax of the network output.
    NOTE(review): ids 0 / 1 / 2 look like RoBERTa-style <s> / <pad> / </s>
    -- confirm against the tokenizer in use.

    Returns:
        One of the strings 'positive', 'neutral', 'negative'.
    """
    pg, tg = 'post', 'post'
    sent = {0: 'positive', 1: 'neutral', 2: 'negative'}
    tweet_ids = tokenizer.encode(tweet.strip())

    if 0 not in tweet_ids:
        # Prepend the start id. The previous ``0 + tweet_ids`` raised
        # TypeError whenever the encoder returned a plain list.
        tweet_ids = [0] + tweet_ids
    # Measured after the optional prepend so the mask lines up with the ids.
    att_mask_idx = len(tweet_ids) - 1

    tweet_ids = pad([tweet_ids], maxlen=MAXLEN, value=1, padding=pg, truncating=tg)

    att_mask = np.zeros(MAXLEN)
    att_mask[1:att_mask_idx] = 1
    att_mask = att_mask.reshape((1, -1))

    if 2 not in tweet_ids:
        # Both arrays are (1, MAXLEN); index the last column explicitly.
        # ``arr[-1] = x`` would overwrite the entire row.
        tweet_ids[0, -1], att_mask[0, -1] = 2, 0

    tweet_ids, att_mask = torch.LongTensor(tweet_ids), torch.LongTensor(att_mask)
    scores = network.forward(tweet_ids.to(device), att_mask.to(device))
    return sent[int(np.argmax(scores.detach().cpu().numpy()))]
def predict_insincerity(question):
    """Classify ``question`` as 'sincere' or 'insincere'.

    The question is stripped, encoded with the module-level ``tokenizer``,
    padded or truncated at the end to ``MAXLEN`` with pad value 1, and fed to
    the module-level ``network`` together with a mask that is 1 on positions
    ``1 .. len(ids) - 2``. The network output is passed through a sigmoid and
    rounded to pick the label.
    NOTE(review): ids 0 / 1 / 2 look like RoBERTa-style <s> / <pad> / </s>
    -- confirm against the tokenizer in use.

    Returns:
        Either 'sincere' or 'insincere'.
    """
    pg, tg = 'post', 'post'
    ins = {0: 'sincere', 1: 'insincere'}
    quest_ids = tokenizer.encode(question.strip())

    if 0 not in quest_ids:
        # Prepend the start id. The previous ``0 + quest_ids`` raised
        # TypeError whenever the encoder returned a plain list.
        quest_ids = [0] + quest_ids
    # Measured after the optional prepend so the mask lines up with the ids.
    attention_mask_idx = len(quest_ids) - 1

    quest_ids = pad([quest_ids], maxlen=MAXLEN, value=1, padding=pg, truncating=tg)

    att_mask = np.zeros(MAXLEN)
    att_mask[1:attention_mask_idx] = 1
    att_mask = att_mask.reshape((1, -1))

    if 2 not in quest_ids:
        # The mask is ``att_mask`` here; the old code assigned to an
        # undefined ``attention_mask``. Both arrays are (1, MAXLEN), so the
        # last column is indexed explicitly instead of the whole row.
        quest_ids[0, -1], att_mask[0, -1] = 2, 0

    quest_ids, att_mask = torch.LongTensor(quest_ids), torch.LongTensor(att_mask)
    output = network.forward(quest_ids.to(device), att_mask.to(device))
    return ins[int(np.round(nn.Sigmoid()(output.detach().cpu()).item()))]
def __getitem__(self, i):
    """Return ``(sentiment, token ids, attention mask)`` tensors for sample ``i``.

    The tweet is stripped and encoded with ``self.tokenizer``, padded or
    truncated at the end to ``MAXLEN`` with pad value 1, and paired with a
    mask that is 1 on positions ``1 .. len(ids) - 2`` and 0 elsewhere. The
    label is looked up in ``self.sentiment_dict`` and one-hot encoded over
    three classes.
    NOTE(review): ids 0 / 1 / 2 look like RoBERTa-style <s> / <pad> / </s>
    -- confirm against the tokenizer in use.

    Returns:
        A ``FloatTensor`` one-hot label, a ``LongTensor`` of ids with shape
        ``(1, MAXLEN)`` and a ``LongTensor`` mask with shape ``(1, MAXLEN)``.
    """
    pg, tg = 'post', 'post'
    tweet = str(self.text[i]).strip()
    tweet_ids = self.tokenizer.encode(tweet)

    if 0 not in tweet_ids:
        # Prepend the start id. The previous ``0 + tweet_ids`` raised
        # TypeError whenever the encoder returned a plain list.
        tweet_ids = [0] + tweet_ids
    # Measured after the optional prepend so the mask lines up with the ids.
    attention_mask_idx = len(tweet_ids) - 1

    tweet_ids = pad([tweet_ids], maxlen=MAXLEN, value=1, padding=pg, truncating=tg)

    attention_mask = np.zeros(MAXLEN)
    attention_mask[1:attention_mask_idx] = 1
    attention_mask = attention_mask.reshape((1, -1))

    if 2 not in tweet_ids:
        # Both arrays are (1, MAXLEN); index the last column explicitly.
        # ``arr[-1] = x`` would overwrite the entire row.
        tweet_ids[0, -1], attention_mask[0, -1] = 2, 0

    sentiment = [self.sentiment_dict[self.sentiment[i]]]
    sentiment = torch.FloatTensor(to_categorical(sentiment, num_classes=3))
    return sentiment, torch.LongTensor(tweet_ids), torch.LongTensor(attention_mask)
def get_supporting_facts_training(X, Xq, word_idx, train_supporting_facts, trained_attention, max_hops=2):
    """Collect training data over up to ``max_hops`` calls to ``supporting_facts_inc``.

    Each hop calls ``supporting_facts_inc`` with the current questions and the
    remaining supporting facts, then appends the returned ``X``, ``Xq``,
    ``Y`` and leftover entries to running totals. The combined questions and
    leftover supporting facts returned by the helper feed the next hop. The
    loop stops early once every entry of ``enough_memories`` is set or the
    helper reports ``found_supporting == 0``.

    Args:
        X: initial stories; only its length and a shallow copy are used here.
        Xq: questions passed to the first hop.
        word_idx: forwarded unchanged to ``supporting_facts_inc``.
        train_supporting_facts: supporting facts passed to the first hop.
        trained_attention: forwarded unchanged to ``supporting_facts_inc``.
        max_hops: maximum number of hops to run.

    Returns:
        ``(X, Xq, Y, supporting_sentences, leftoversX)``, each padded with
        ``pad`` to the length of its longest entry.
    """
    def _pad_to_longest(sequences):
        # An empty collection (e.g. no leftovers were produced) used to make
        # max() raise ValueError; fall back to length 0 instead.
        longest = max(map(len, sequences)) if len(sequences) > 0 else 0
        return pad(sequences, maxlen=longest)

    n_samples = len(X)
    supporting_sentences = [[0] for _ in range(n_samples)]
    enough_memories = [0 for _ in range(n_samples)]
    selected = [[] for _ in range(n_samples)]
    allX = X[:]

    totalX = []
    totalXq = []
    totalY = []
    leftoversX = []

    for i in range(0, max_hops):
        print (bcolors.BOLD + "Entering hop " + str(i) + " ..." + bcolors.ENDC)
        (_, combinedXq, leftoverS, supporting_sentences, X, Xq, Y,
         found_supporting, leftover) = supporting_facts_inc(
            None, Xq, word_idx, train_supporting_facts, supporting_sentences,
            trained_attention, enough_memories, allX, selected)

        totalX.extend(X)
        totalY.extend(Y)
        totalXq.extend(Xq)
        leftoversX.extend(leftover)

        # Inputs for the next hop.
        Xq = combinedXq
        train_supporting_facts = leftoverS

        print (bcolors.BOLD + "Found supporting facts " + str(len(selected)) + " ..." + bcolors.ENDC)
        print(np.sum(enough_memories), "break")
        if (np.sum(enough_memories) == len(enough_memories) or found_supporting == 0):
            print (bcolors.BOLD + "breaking at hop " + str(i) + " ..." )
            break

    X = _pad_to_longest(totalX)
    Xq = _pad_to_longest(totalXq)
    Y = _pad_to_longest(totalY)
    leftoversX = _pad_to_longest(leftoversX)
    supporting_sentences = _pad_to_longest(supporting_sentences)

    print('X.shape = {}'.format(X.shape))
    print('Xq.shape = {}'.format(Xq.shape))
    print('Y.shape = {}'.format(Y.shape))
    print('leftover.shape = {}'.format(leftoversX.shape))
    print('supporting_sentences.shape = {}'.format(supporting_sentences.shape))
    print(bcolors.ENDC)
    return X, Xq, Y, supporting_sentences, leftoversX
# Script section: tokenize the comment text at character level, pad it to a
# fixed length and start building a 1-D convolutional network on top of it.

# Raw comment text of the train / test frames.
list_sent_train = X_train["comment_text"]
list_sent_test = X_test["comment_text"]
sent_text = t_data['comment_text']

# One label column per category, read from t_label.
l_toxic = t_label['toxic']
l_sever_t = t_label['severe_toxic']
l_obscene = t_label['obscene']
l_idh = t_label['identity_hate']
l_in = t_label['insult']
l_th = t_label['threat']

# Tokenizer fitted on the training text only (char_level=True), then applied
# to both splits.
max_features = 20000
tokenizer = Tokenizer(num_words=max_features, char_level=True)
tokenizer.fit_on_texts(list(list_sent_train))
list_token_train = tokenizer.texts_to_sequences(list_sent_train)
# NOTE: list_sent_test is rebound here from raw text to token sequences.
list_sent_test = tokenizer.texts_to_sequences(list_sent_test)

# Bring every sequence to exactly `maxlength` entries.
# NOTE(review): presumably `pad` is Keras pad_sequences with its default
# padding/truncating sides -- confirm against the import.
maxlength = 500
X_t = pad(list_token_train, maxlen=maxlength)
X_te = pad(list_sent_test, maxlen=maxlength)

# Model graph: embedding -> conv -> max-pool -> global max-pool -> dense.
inp = Input(shape=(maxlength, ))
embedding_size = 240
# +1 on the vocabulary size because index 0 is not assigned to any token in
# word_index (assumption based on Keras Tokenizer behaviour -- TODO confirm).
x = Embedding(len(tokenizer.word_index) + 1, embedding_size)(inp)
x = Conv1D(filters=100, kernel_size=4, padding='same', activation='relu')(x)
x = MaxPooling1D(pool_size=4)(x)
# Disabled recurrent layer, kept for reference:
#x = Bidirectional(GRU(60, return_sequences=True,name='lstm_layer',dropout=0.2,recurrent_dropout=0.2))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
def coco_generator(
        mappings,
        captions,
        # used for one-hot encoding the target; should be the vocabulary size
        dict_size,
        max_len,
        image_batch_szie=1,
        path_to_pkl_files=".",
        pkl_file_extension='.pkl',
        epochs=1):
    """Yield (image features, caption prefixes) -> next-token training samples.

    For every image, each caption line of length L contributes L - 1 rows:
    the prefix ``line[:i]`` (post-padded with 0 to ``max_len``) as input and
    ``one_hot(line[i], dict_size)`` as target, for ``i`` in ``1 .. L - 1``.
    The image's feature vector is repeated once per row.

    Args:
        mappings: dict mapping a pickle file name (without extension) to the
            image names whose features that file holds.
        captions: dict mapping an image name to its caption lines (token
            sequences).
        dict_size: forwarded to ``one_hot``; targets are reshaped to a last
            dimension of ``dict_size + 1``.
        max_len: padded length of the caption prefixes.
        image_batch_szie: number of images per yielded batch. With a value
            of 1 or less every image is yielded on its own.
        path_to_pkl_files: directory containing the feature pickles.
        pkl_file_extension: extension appended to each pickle file name.
        epochs: number of passes over ``mappings``.

    Yields:
        Unbatched: ``[[features, caption], target]``.
        Batched: ``([features, caption], target)`` with the rows of
        ``image_batch_szie`` images concatenated; a final partial batch is
        flushed at the end of each epoch.
        Shapes are ``(rows, 1, 4096)``, ``(rows, max_len)`` and
        ``(rows, 1, dict_size + 1)``.
    """
    feature_dim = 4096
    target_dim = dict_size + 1
    batched = image_batch_szie > 1

    def _emit(feature_chunks, caption_chunks, target_chunks):
        # Join the per-image chunks along the row axis into one batch.
        return ([np.concatenate(feature_chunks, axis=0),
                 np.concatenate(caption_chunks, axis=0)],
                np.concatenate(target_chunks, axis=0))

    for _ in range(epochs):
        # Per-image chunks are collected in lists and concatenated once per
        # batch. The old code grew arrays with np.append from a 3-D seed row,
        # which failed for the 2-D caption array and was quadratic.
        batch_features, batch_captions, batch_targets = [], [], []

        for pkl_file, image_subset in mappings.items():
            pkl_path = path_to_pkl_files + '/' + pkl_file + pkl_file_extension
            with open(pkl_path, 'rb') as file:
                # NOTE: pickle.load can execute arbitrary code; only point
                # this generator at feature files you created yourself.
                feature_dict = pickle.load(file)

            for image_name in image_subset:
                image_name = image_name.split('/')[-1]
                lines = captions[image_name]

                prefixes = [line[:i] for line in lines
                            for i in range(1, len(line))]
                caption = np.array(
                    pad(prefixes, maxlen=max_len, padding='post', value=0))

                next_tokens = np.asarray(
                    [token for line in lines for token in line[1:]])
                target = np.array(
                    [np.array(one_hot(token, dict_size))
                     for token in next_tokens])
                target = target.reshape(-1, 1, target_dim)

                features = feature_dict[image_name]
                features = features.repeat(
                    len(caption), axis=0).reshape(-1, 1, feature_dim)

                if not batched:
                    yield [[features, caption], target]
                    continue

                batch_features.append(features)
                batch_captions.append(caption)
                batch_targets.append(target)
                if len(batch_features) == image_batch_szie:
                    yield _emit(batch_features, batch_captions, batch_targets)
                    batch_features, batch_captions, batch_targets = [], [], []

        # Flush a trailing partial batch, but only if it actually has rows.
        if any(len(chunk) for chunk in batch_captions):
            yield _emit(batch_features, batch_captions, batch_targets)