def transform(self, X1, Y=None):
    sents = []
    lengths = []

    # transform X
    for sent in X1:
        word_ids = []
        for w in sent:
            w = self._lower(w)
            w = self._normalize_num(w)
            if w in self.vocab_word:
                word_id = self.vocab_word[w]
            else:
                word_id = self.vocab_word[UNK]
            word_ids.append(word_id)
        lengths.append(len(word_ids))
        sents.append(word_ids)

    # transform Y (labels)
    if Y is not None:
        sent_labels = [[self.vocab_tag[l] for l in labels] for labels in Y]
    else:
        sent_labels = None

    # sequence_length
    sequence_length = np.asarray(lengths)

    # padding
    X_result = pad_sequences(sents, 0, max_length=self.max_length)
    if sent_labels is not None:
        Y_result = pad_sequences(sent_labels, 0, max_length=self.max_length)
        input_mask = np.array((Y_result > 0), dtype=np.float32)
    else:
        Y_result = None
        # without labels, mark non-padding positions from the word ids
        # (assumes padding id 0 and real word ids > 0)
        input_mask = np.array((X_result > 0), dtype=np.float32)
    X_result = [X_result, input_mask, sequence_length]
    return X_result, Y_result
def _pad_feed_dict(self, word_ids, char_ids, cap_ids, start_ids, end_ids,
                   gold_labels, att_ids, dropout, hidden_dropout):
    fd = {}
    word_ids, sent_lengths = pad_sequences(word_ids, 0)
    fd[self.word_ids] = word_ids
    fd[self.sent_lengths] = sent_lengths
    if self.parameters['char_dim']:
        char_ids, word_lengths = pad_sequences(char_ids, 0, 2)
        fd[self.char_ids] = char_ids
        fd[self.word_lengths] = word_lengths
    if self.parameters['cap_dim']:
        cap_ids, _ = pad_sequences(cap_ids, 0)
        fd[self.cap_ids] = cap_ids
    att_ids, markable_length = pad_sequences(att_ids, [0, 0],
                                             max_len=self.parameters['max_len'])
    fd[self.att_ids] = att_ids
    fd[self.markable_lengths] = markable_length
    if self.parameters['len_dim']:
        len_ids = [e[1] - s[1] for s, e in zip(start_ids, end_ids)]
        fd[self.len_ids] = len_ids
    fd[self.start_ids] = start_ids
    fd[self.end_ids] = end_ids
    fd[self.gold_labels] = gold_labels
    fd[self.dropout] = dropout
    fd[self.hidden_dropout] = hidden_dropout
    return fd, sent_lengths
def run_one_epoch(self, ep, train_ds, valid_ds, batch_size):
    losses = []
    i = 0
    acc_score = 0
    for xbatch, ybatch in minibatch(train_ds, batch_size):
        i += 1
        word_seq, sequence_len = pad_sequences(xbatch)
        target_seq, _ = pad_sequences(ybatch)

        # build feed dictionary
        feed = {
            self.word_ids: word_seq,
            self.labels: target_seq,
            self.sequence_lengths: sequence_len,
            self.learning_rate: self.lr,
            self.keep_dropout_rate: self.kdr
        }
        _, train_loss = self.sess.run([self.train_op, self.loss], feed_dict=feed)
        losses += [train_loss]

        if i % 10 == 0:
            print('ep:', ep, 'iter:', i, 'loss:', np.mean(losses))
        if i % 50 == 0:
            acc_score, _ = self.run_validation(valid_ds, batch_size)
            print('accuracy', acc_score)

    # if validation never ran inside the loop, run it once at the end
    if acc_score == 0:
        acc_score, _ = self.run_validation(valid_ds, batch_size)
    metrics = {}
    metrics['acc'] = acc_score
    return metrics
def transform(self, X):
    results1 = []
    results2 = []
    feature_set1 = ["amod", "nsubj", "dep"]
    feature_set2 = ["dobj", "nsubj", "dep"]
    for sent, dependency in X:
        result1 = [0] * len(sent)
        result2 = [0] * len(sent)
        dependents, governor = dependency
        for i in range(1, len(governor)):
            gov_idx, relation = governor[i]
            if relation in feature_set2:
                result2[i - 1] = 1
            if relation in feature_set1:
                result1[gov_idx - 1] = 1
        results1.append(result1)
        results2.append(result2)

    padded_result1, sequence_length = pad_sequences(results1, pad_tok=0)
    padded_result2, _ = pad_sequences(results2, pad_tok=0)
    padded_result1_np = np.array(padded_result1, dtype=np.float32).reshape(
        len(padded_result1), len(padded_result1[0]), 1)
    padded_result2_np = np.array(padded_result2, dtype=np.float32).reshape(
        len(padded_result2), len(padded_result2[0]), 1)
    features = np.concatenate((padded_result1_np, padded_result2_np), axis=-1)
    return features, sequence_length
def pad_features(features, params):
    maxlen = params['max_len']
    new_features = {}
    for fea in features.keys():
        if fea == 'dep_path':
            continue
        if fea == 'has_dep':
            # not helping, not used
            continue
            # (unreachable) multi-hot encoding of dependency indicators,
            # kept from when this feature was still in use
            num_example = len(features[fea])
            seq = np.zeros([num_example, maxlen, params['dep_size']])
            for i in xrange(num_example):
                for j, deps in itertools.izip(xrange(maxlen), features[fea][i]):
                    for dep in deps:
                        seq[i, j, dep] = 1.0
        elif fea == 'dep':
            seq = utils.pad_sequences(features[fea], 5, value=0)
        elif fea == 'pos1' or fea == 'pos2':
            seq = utils.pad_sequences(features[fea], maxlen, value=maxlen * 2 - 1)
        elif fea == 'bag_size' or fea == 'mask' or fea == 'weight':
            seq = np.array(features[fea])  # no change
        else:
            seq = utils.pad_sequences(features[fea], maxlen, value=0)
        new_features[fea] = seq
    return new_features
def _next_batch(self, data):
    """
    :param dataset.Dataset data:
    :return:
    """
    start = 0
    idx = 0
    while start < len(data.words):
        l_batch = data.labels[start:start + self.batch_size]
        labels, _ = pad_sequences(l_batch, pad_tok=0, nlevels=1)

        w_batch = data.words[start:start + self.batch_size]
        c_batch = data.chars[start:start + self.batch_size]
        pos_batch = data.poses[start:start + self.batch_size]
        word_ids, sequence_lengths = pad_sequences(w_batch, pad_tok=0, nlevels=1)
        char_ids, word_lengths = pad_sequences(c_batch, pad_tok=0, nlevels=2)
        pos_ids, _ = pad_sequences(pos_batch, pad_tok=0, nlevels=1)

        start += self.batch_size
        idx += 1

        batch_data = {
            self.sequence_lens: sequence_lengths,
            self.labels: labels,
            self.word_ids: word_ids,
            self.char_ids: char_ids,
            self.word_lengths: word_lengths,
            self.pos_ids: pos_ids,
        }
        yield batch_data
def get_feed_dict(self, words, labels=None, LR=None, dropout=None):
    if self.config.chars:
        # each element of words is a (char_ids, word_ids) pair
        char_ids, word_ids = zip(*words)
        word_ids, sequence_lengths = pad_sequences(word_ids, pad_tok=0)
        char_ids, word_lengths = pad_sequences(
            char_ids, pad_tok=0, nlevels=2)
    else:
        word_ids, sequence_lengths = pad_sequences(words, pad_tok=0)

    feed_dict = {
        self.word_ids: word_ids,
        self.sequence_lengths: sequence_lengths
    }
    if self.config.chars:
        feed_dict[self.char_ids] = char_ids
        feed_dict[self.word_lengths] = word_lengths
    if labels is not None:
        labels, _ = pad_sequences(labels, pad_tok=0)
        feed_dict[self.labels] = labels
    if LR is not None:
        feed_dict[self.LR] = LR
    if dropout is not None:
        feed_dict[self.dropout] = dropout
    return feed_dict, sequence_lengths
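# A minimal sketch (an assumption, not taken from any snippet above) of the kind of
# pad_sequences helper the feed-dict builders above appear to rely on: it pads every
# sequence with pad_tok up to the batch maximum and also returns the original lengths;
# nlevels=2 additionally pads the inner character-id lists. Names here are illustrative.
def _pad_sequences_sketch(sequences, pad_tok, max_length):
    sequence_padded, sequence_length = [], []
    for seq in sequences:
        seq = list(seq)
        seq_ = seq[:max_length] + [pad_tok] * max(max_length - len(seq), 0)
        sequence_padded.append(seq_)
        sequence_length.append(min(len(seq), max_length))
    return sequence_padded, sequence_length


def pad_sequences_sketch(sequences, pad_tok, nlevels=1):
    if nlevels == 1:
        max_length = max(map(len, sequences))
        return _pad_sequences_sketch(sequences, pad_tok, max_length)
    # nlevels == 2: each sequence is a list of per-word character-id lists
    max_length_word = max(max(map(len, seq)) for seq in sequences)
    sequence_padded, sequence_length = [], []
    for seq in sequences:
        sp, sl = _pad_sequences_sketch(seq, pad_tok, max_length_word)
        sequence_padded.append(sp)
        sequence_length.append(sl)
    max_length_sentence = max(map(len, sequences))
    sequence_padded, _ = _pad_sequences_sketch(
        sequence_padded, [pad_tok] * max_length_word, max_length_sentence)
    sequence_length, _ = _pad_sequences_sketch(sequence_length, 0, max_length_sentence)
    return sequence_padded, sequence_length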
def load_te_dataset(filename, token2id, label2id):
    labels = []
    padded_premises = []
    padded_hypotheses = []
    original_premises = []
    original_hypotheses = []

    with open(filename) as in_file:
        reader = csv.reader(in_file, delimiter="\t")
        for row in reader:
            label = row[0].strip()
            premise_tokens = row[1].strip().split()
            hypothesis_tokens = row[2].strip().split()
            premise = row[4].strip()
            hypothesis = row[5].strip()
            labels.append(label2id[label])
            padded_premises.append(
                [token2id.get(token, token2id["#unk#"]) for token in premise_tokens])
            padded_hypotheses.append(
                [token2id.get(token, token2id["#unk#"]) for token in hypothesis_tokens])
            original_premises.append(premise)
            original_hypotheses.append(hypothesis)

    padded_premises = pad_sequences(padded_premises, padding="post",
                                    value=token2id["#pad#"], dtype=np.long)
    padded_hypotheses = pad_sequences(padded_hypotheses, padding="post",
                                      value=token2id["#pad#"], dtype=np.long)
    labels = np.array(labels)
    return labels, padded_premises, padded_hypotheses, original_premises, original_hypotheses
def _get_feed_dict(self, words, labels=None, lr=None, is_train=None):
    '''
    :param words:
    :param labels:
    :param lr:
    :param is_train:
    :return:
    '''
    word_ids, char_ids = zip(*words)
    word_ids, seq_len = pad_sequences(word_ids, max_length=None, pad_tok=0, nlevels=1)
    feed_dict = {self.word_ids: word_ids, self.seq_len: seq_len}
    if self.cfg.use_char_emb:
        char_ids, word_len = pad_sequences(char_ids, max_length=None, pad_tok=0,
                                           max_length_2=None, nlevels=2)
        # the call above returns a 3-D array: [batch_size, max_len_sent, max_len_word]
        feed_dict[self.char_ids] = char_ids
        feed_dict[self.word_len] = word_len
    if labels is not None:
        feed_dict[self.labels] = labels
    if lr is not None:
        feed_dict[self.lr] = lr
    if is_train is not None:
        feed_dict[self.is_train] = is_train
    return feed_dict
def transform(self, X):
    results1 = []
    results2 = []
    fea_keys1 = self.features_dict.keys()
    fea_keys2 = self.features_dict2.keys()
    for words, dependency in X:
        result1 = [0] * len(words)
        result2 = [0] * len(words)
        pre_words = [re.sub(r"\d{1,10}", "0", word) for word in words]
        for key in fea_keys1:
            asp_words = key.split()
            idents_idx = self.identify2(pre_words, asp_words)
            if len(idents_idx) > 0:
                for ident_idx in idents_idx:
                    from_idx, to_idx = ident_idx
                    for j in range(from_idx, to_idx):
                        result1[j] = 1
        for idx, word in enumerate(pre_words):
            if word in fea_keys2:
                result2[idx] = 1
        results1.append(result1)
        results2.append(result2)

    padded_result1, sequence_length = pad_sequences(results1, pad_tok=0)
    padded_result2, _ = pad_sequences(results2, pad_tok=0)
    padded_result1_np = np.array(padded_result1, dtype=np.float32).reshape(
        len(padded_result1), len(padded_result1[0]), 1)
    padded_result2_np = np.array(padded_result2, dtype=np.float32).reshape(
        len(padded_result2), len(padded_result2[0]), 1)
    features = np.concatenate((padded_result1_np, padded_result2_np), axis=-1)
    return features, sequence_length
def predict_embeddings(self, words_to_drop):
    batches_i, sents_i, words, left_contexts, right_contexts = list(
        zip(*words_to_drop))

    vectorized_words = [[self.comick.characters_vocabulary[c] for c in w]
                        for w in words]
    words_lengths = torch.LongTensor([len(w) for w in words])
    padded_words = pad_sequences(vectorized_words, words_lengths)

    vectorized_left_contexts = [l.data for l in left_contexts]
    left_contexts_length = torch.LongTensor([len(c) for c in left_contexts])
    padded_left = pad_sequences(vectorized_left_contexts, left_contexts_length)

    vectorized_right_contexts = [l.data for l in right_contexts]
    right_contexts_length = torch.LongTensor([len(c) for c in right_contexts])
    padded_right = pad_sequences(vectorized_right_contexts, right_contexts_length)

    use_gpu = torch.cuda.is_available()
    if use_gpu:
        padded_left = padded_left.cuda()
        padded_words = padded_words.cuda()
        padded_right = padded_right.cuda()

    embeddings = self.comick(
        (Variable(padded_left), Variable(padded_words), Variable(padded_right)))
    attentions = self.comick.get_attentions()
    for si, i, embedding, attention in zip(batches_i, sents_i, embeddings, attentions):
        yield (si, i, embedding, attention)
def predict_embeddings(self, words_to_drop):
    batches_i, sents_i, words, left_contexts, right_contexts, contexts = list(
        zip(*words_to_drop))

    vectorized_words = self.comick.vectorize_words(words)
    words_lengths = torch.LongTensor([len(w) for w in vectorized_words])
    padded_words = pad_sequences(vectorized_words, words_lengths)

    vectorized_contexts = [l.data.cpu() for l in contexts]
    contexts_length = torch.LongTensor([len(c) for c in contexts])
    padded = pad_sequences(vectorized_contexts, contexts_length)

    # vectorized_left_contexts = [l.data.cpu() for l in left_contexts]
    # left_contexts_length = torch.LongTensor([len(c) for c in left_contexts])
    # padded_left = pad_sequences(vectorized_left_contexts, left_contexts_length)
    # vectorized_right_contexts = [l.data.cpu() for l in right_contexts]
    # right_contexts_length = torch.LongTensor([len(c) for c in right_contexts])
    # padded_right = pad_sequences(vectorized_right_contexts, right_contexts_length)

    use_gpu = torch.cuda.is_available()
    if use_gpu:
        padded = padded.cuda()
        padded_words = padded_words.cuda()
        # padded_right = padded_right.cuda()

    embeddings, attentions = self.comick((padded, padded_words))
    if self.comick.attention:
        for si, i, embedding, attention in zip(batches_i, sents_i, embeddings, attentions):
            yield (si, i, embedding, attention)
    else:
        for si, i, embedding in zip(batches_i, sents_i, embeddings):
            yield (si, i, embedding, [])
def generate_answers(sess, model, dataset, rev_vocab):
    """
    Loop over the dev or test dataset and generate answers.

    Note: the output format must be answers[uuid] = "real answer".
    You must provide a string of words, not a list or start/end indices.

    In the main() function we dump the answers to a JSON file;
    evaluate.py takes that JSON along with the original JSON file
    and outputs F1 and EM scores.

    You must implement this function in order to submit to the Leaderboard.

    :param sess: active TF session
    :param model: a built QASystem model
    :param rev_vocab: a list of vocabulary that maps index to actual words
    :return:
    """
    answers = {}
    zipped = list(zip(*dataset))
    num_batches = (len(zipped) + FLAGS.batch_size - 1) // FLAGS.batch_size
    for i, batch in enumerate(get_minibatches(zipped, FLAGS.batch_size)):
        context_data, question_data, question_uuid_data, context_tokens = zip(*batch)
        p, q = [], []
        p_len, q_len = [], []
        # use a separate index so the outer batch counter is not clobbered
        for j in range(len(context_data)):
            q.append(question_data[j].split())
            q_len.append(min(FLAGS.question_size, len(question_data[j].split())))
            p.append(context_data[j].split())
            p_len.append(min(FLAGS.paragraph_size, len(context_data[j].split())))

        q = pad_sequences(q, maxlen=FLAGS.question_size, value=PAD_ID, padding="post")
        p = pad_sequences(p, maxlen=FLAGS.paragraph_size, value=PAD_ID, padding="post")

        ys, ye = model.predict_batch(sess, p, q, p_len, q_len)
        a_s_pred = np.argmax(ys, axis=1)
        a_e_pred = np.argmax(ye, axis=1)

        for j in range(len(context_data)):
            # predicted a_s and a_e
            s_pred = a_s_pred[j]
            e_pred = a_e_pred[j]
            uuid = question_uuid_data[j]
            pred_raw = ' '.join(context_tokens[j][s_pred:e_pred + 1])
            answers[uuid] = pred_raw

        print("Finished answering batch {} of {}".format(i + 1, num_batches))
    return answers
def load_vte_dataset(nli_dataset_filename, token2id, label2id, keep_neutrals=True):
    labels = []
    padded_premises = []
    padded_hypotheses = []
    image_names = []
    original_premises = []
    original_hypotheses = []

    with open(nli_dataset_filename) as in_file:
        reader = csv.reader(in_file, delimiter="\t")
        next(reader, None)  # skip header
        for row in reader:
            label = row[0].strip()
            if not keep_neutrals and label == 'neutral':
                continue
            premise_tokens = row[1].strip().split()
            hypothesis_tokens = row[2].strip().split()
            image = row[3].strip().split("#")[0]
            premise = row[4].strip()
            hypothesis = row[5].strip()
            labels.append(label2id[label])
            padded_premises.append(
                [token2id.get(token, token2id["#unk#"]) for token in premise_tokens])
            padded_hypotheses.append(
                [token2id.get(token, token2id["#unk#"]) for token in hypothesis_tokens])
            image_names.append(image)
            original_premises.append(premise)
            original_hypotheses.append(hypothesis)

    padded_premises = pad_sequences(padded_premises, padding="post",
                                    value=token2id["#pad#"], dtype=np.long)
    padded_hypotheses = pad_sequences(padded_hypotheses, padding="post",
                                      value=token2id["#pad#"], dtype=np.long)
    labels = np.array(labels)
    return labels, padded_premises, padded_hypotheses, image_names, original_premises, original_hypotheses
def collate_examples(samples):
    words, labels = list(zip(*samples))
    seq_lengths = torch.LongTensor([len(s) for s in words])
    padded_words = pad_sequences(words, seq_lengths)
    padded_labels = pad_sequences(labels, seq_lengths)
    return (padded_words, padded_labels)
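# A minimal sketch (hypothetical, not taken from the snippets above) of the
# pad_sequences(sequences, lengths) helper that the PyTorch collate functions here
# appear to expect: it builds a zero-padded LongTensor of shape (batch, max_len)
# from variable-length id sequences. In practice torch.nn.utils.rnn.pad_sequence
# can serve a similar purpose.
import torch


def pad_sequences_torch(sequences, lengths):
    max_len = int(lengths.max())
    padded = torch.zeros(len(sequences), max_len, dtype=torch.long)
    for i, (seq, length) in enumerate(zip(sequences, lengths)):
        padded[i, :length] = torch.as_tensor(seq, dtype=torch.long)[:length]
    return padded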
def _get_feed_dict(self, words, labels=None, lr=None, is_train=None):
    word_ids, char_ids = zip(*words)
    word_ids, seq_len = pad_sequences(word_ids, max_length=None, pad_tok=0, nlevels=1)
    feed_dict = {self.word_ids: word_ids, self.seq_len: seq_len}
    if self.cfg.use_char_emb:
        char_ids, word_len = pad_sequences(char_ids, max_length=None, pad_tok=0,
                                           max_length_2=None, nlevels=2)
        feed_dict[self.char_ids] = char_ids
        feed_dict[self.word_len] = word_len
    if labels is not None:
        feed_dict[self.labels] = labels
    if lr is not None:
        feed_dict[self.lr] = lr
    if is_train is not None:
        feed_dict[self.is_train] = is_train
    return feed_dict
def init_data_provider(ngrams=False):
    logging.info('Data provider, initializing: ngrams = {}'.format(ngrams))
    logging.info('Data provider, loading file: ' + STATE['source'])
    with open(STATE['source']) as s:
        data = json.load(s)
    data['jokes'] = data['jokes'][:STATE['max_jokes']]
    random.shuffle(data['jokes'])

    logging.info('Data provider, extracting categories...')
    STATE['classes'] = extract_categories(data['jokes'], STATE['stemmer'])

    logging.info('Data provider, tokenizing data...')
    STATE['data'], STATE['tokenizer'] = \
        (get_data_as_ngrams if ngrams else get_data_as_bag_of_words)(
            data, STATE['stemmer'], STATE['classes'])

    X, Y = zip(*STATE['data'])
    STATE['X']['hot_vector'] = np.empty((len(X), STATE['tokenizer'].index))
    for i, e in enumerate(X):
        STATE['X']['hot_vector'][i] = to_hot_vector(e, STATE['tokenizer'].index)
    STATE['X']['sequential'] = utils.pad_sequences(X)
    STATE['Y']['categorical'] = np.array(
        [to_categorical(y, STATE['classes'], STATE['stemmer']) for y in Y])
    STATE['Y']['numerical'] = np.array(
        [to_numerical(y, STATE['classes'], STATE['stemmer']) for y in Y])

    STATE['model_params']['input_length']['hot_vector'] = len(STATE['X']['hot_vector'][0])
    STATE['model_params']['input_length']['sequential'] = len(STATE['X']['sequential'][0])
    STATE['model_params']['output_length']['categorical'] = len(STATE['Y']['categorical'][0])
    STATE['model_params']['output_length']['numerical'] = 1

    logging.info('Data provider, finished loading [' + str(len(data['jokes'])) +
                 ' jokes] from file: ' + STATE['source'])
def collate_fn(batch):
    x, y = zip(*batch)
    x_lengths = torch.LongTensor([len(item) for item in x])
    padded_x = pad_sequences(x, x_lengths)
    return (padded_x, torch.FloatTensor(np.array(y)))
def Tokenize_with_note_id_hour(df, max_length, tokenizer):
    labels = df.Label.values
    note_ids = df.Note_ID.values
    times = pd.to_datetime(df.charttime.values)
    times = times - times.min()
    times = times / pd.Timedelta(days=1)

    if 'TEXT' in df.columns:
        sen = df.TEXT.values
        sen = ["[CLS] " + x + " [SEP]" for x in sen]
        tokenized_texts = [tokenizer.tokenize(x) for x in sen]
        print("First sentence tokenized")
        print(tokenized_texts[0])
        input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    else:
        assert 'Input_ID' in df.columns
        input_ids = df.Input_ID.apply(lambda x: x.split(' '))
        input_ids = input_ids.apply(lambda x: [int(i) for i in x])
        input_ids = input_ids.values

    input_ids = pad_sequences(input_ids, maxlen=max_length, dtype="long",
                              truncating="post", padding="post")
    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)
    return labels, input_ids, attention_masks, note_ids, times
def collate_x(batch):
    x, y = zip(*batch)
    x_lengths = torch.LongTensor([len(item) for item in x])
    padded_x = pad_sequences(x, x_lengths)
    return padded_x, y
def test(model, sess):
    reviews, ent2idx, attr2idx, polarity2idx = load_semeval_reviews(
        constants.test_filename)
    # list of (ids, ent, attr, pol)
    tuples = []
    for review in reviews:
        if len(review.tokens) <= 1:
            # ids = [0] + [word2idx[tok] for tok in review.tokens]
            ids = [word2idx[tok] for tok in review.tokens]
        else:
            ids = [word2idx[tok] for tok in review.tokens]
        tuples_ = [(ids, ent2idx[op.ent], attr2idx[op.attr], polarity2idx[op.polarity])
                   for op in review.opinions]
        tuples.extend(tuples_)

    unzipped = list(zip(*tuples))
    ids = utils.pad_sequences(unzipped[0], maxlen=constants.max_sent_len)
    sent_lens = np.array(
        [min(len(x), constants.max_sent_len) for x in unzipped[0]],
        dtype='int32')
    ents, attrs, pols = (np.array(x, dtype='int32') for x in unzipped[1:])
    acc = model.eval(sess, ids, ents, attrs, sent_lens,
                     utils.to_categorical(pols))
    utils.log('test accuracy: {}'.format(acc), True)
def transform(self, X, one_host=False):
    if one_host is True:
        return self.sents_2_onehost(X)
    else:
        poses = self.sents_2_posid(X)
        poses, length = pad_sequences(poses, pad_tok=0)
        return np.array(poses, dtype=np.int32)
def _next_batch(self, dataset, batch_size):
    """
    :param dataset.Dataset dataset:
    :return:
    """
    start = 0
    while start < len(dataset.words):
        w_batch = dataset.words[start:start + batch_size]
        word_ids, _ = pad_sequences(w_batch, pad_tok=0,
                                    max_sent_length=self.max_length)
        if dataset.labels is not None:
            labels = dataset.labels[start:start + batch_size]
        else:
            labels = None
        start += batch_size
        yield {
            self.word_ids: word_ids,
            self.labels: labels
        } if labels is not None else {
            self.word_ids: word_ids
        }
def load_ic_dataset(ic_dataset_filename, token2id, label2id):
    labels = []
    padded_sentences = []
    images_filenames = []
    original_sentences = []

    with open(ic_dataset_filename) as in_file:
        reader = csv.reader(in_file, delimiter="\t")
        for row in reader:
            # each row is of the form:
            # label \t sentence tokens \t image filename \t source \t original sentence
            label = row[0].strip()
            sentence_tokens = row[1].strip().split()
            image_filename = row[2].strip()
            sentence = row[5].strip()
            labels.append(label2id[label])
            padded_sentences.append(
                [token2id.get(token, token2id["#unk#"]) for token in sentence_tokens])
            images_filenames.append(image_filename)
            original_sentences.append(sentence)

    padded_sentences = pad_sequences(padded_sentences, padding="post",
                                     value=token2id["#pad#"], dtype=np.long)
    labels = np.array(labels)
    return labels, padded_sentences, images_filenames, original_sentences
def load_foil_dataset(filename, token2id, label2id):
    labels = []
    padded_sentences = []
    image_names = []

    with open(filename) as in_file:
        reader = csv.reader(in_file, delimiter="\t")
        for row in reader:
            label = row[0].strip()
            sentence_tokens = row[1].strip().split()
            image = row[2].strip().split("_")[2]
            labels.append(label2id[label])
            padded_sentences.append(
                [token2id.get(token, token2id["#unk#"]) for token in sentence_tokens])
            image_names.append(image)

    padded_sentences = pad_sequences(padded_sentences, padding="post",
                                     value=token2id["#pad#"], dtype=np.long)
    labels = np.array(labels)
    return labels, padded_sentences, image_names
def get_feed(self, seqs, labels=None, lr=None, dropout=None):
    word_ids, seq_len_list = pad_sequences(seqs, pad_mark=0)
    feed_dict = {
        self.word_ids: word_ids,
        self.sequence_lengths: seq_len_list
    }
    if labels is not None:
        labels_, _ = pad_sequences(labels, pad_mark=0)
        feed_dict[self.labels] = labels_
    if lr is not None:
        feed_dict[self.lr_pl] = lr
    if dropout is not None:
        feed_dict[self.dropout_pl] = dropout
    return feed_dict, seq_len_list
def _input_data(wavfiles):
    textfiles = [file.replace('.wav', '.txt') for file in wavfiles]
    audio = []
    audio_len = []
    transcript = []
    transcript_len = []
    for target_filename, audio_filename in zip(textfiles, wavfiles):
        Sxx = preprocess_audio_spectrogram(audio_filename)
        inputs_data = 20 * np.log10(Sxx).T.astype('float32')
        inputs_data = (inputs_data - np.mean(inputs_data, axis=0)) / np.std(
            inputs_data, axis=0)
        audio.append(inputs_data)
        audio_len.append(np.int32(len(inputs_data)))

        # Reading targets:
        # load text transcription and convert to numerical array
        targets = normalize_txt_file(target_filename)
        targets = text_to_char_array(targets)
        transcript.append(targets)
        transcript_len.append(len(targets))

    audio = np.asarray(audio)
    audio_len = np.asarray(audio_len)
    transcript = np.asarray(transcript)
    transcript_len = np.asarray(transcript_len)

    train_inputs, train_seq_len = pad_sequences(audio)
    # Creating sparse representation to feed the placeholder
    train_targets = sparse_tuple_from(transcript)
    return train_inputs, train_targets, train_seq_len
def pad_sequence(self, word_ids, char_ids, labels=None):
    if labels:
        labels, _ = pad_sequences(labels, 0)
        labels = np.asarray(labels)
        labels = dense_to_one_hot(labels, len(self.vocab_tag), nlevels=2)

    word_ids, sequence_lengths = pad_sequences(word_ids, 0)
    word_ids = np.asarray(word_ids)

    if self.char_feature:
        char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, nlevels=2)
        char_ids = np.asarray(char_ids)
        return [word_ids, char_ids], labels
    else:
        return [word_ids], labels
def preprocess_input(self, state):
    new_input = []
    for l in state:
        new_input.append(letter_dict[l])
    state = pad_sequences([new_input], maxlen=self.maxlen)
    if self.is_training:
        self.episode_memory.append((state, self.get_guessed_mat()))
    return state, self.get_guessed_mat()
def collate_examples_multiple_tags(samples):
    examples, labels = list(zip(*samples))

    words = list()
    chars = list()
    bos = list()
    for e in examples:
        words.append(e[0])
        chars.append(e[1])
        bos.append(e[2])

    seq_lengths = torch.LongTensor([len(s) for s in words])
    padded_words = pad_sequences(words, seq_lengths)

    padded_chars = list()
    for char_list in chars:
        chars_seq_lengths = torch.LongTensor([len(s) for s in char_list])
        padded_chars.append(pad_sequences(char_list, chars_seq_lengths))

    padded_bos = list()
    for bos_list in bos:
        bos_seq_lengths = torch.LongTensor([len(s) for s in bos_list])
        padded_bos.append(pad_sequences(bos_list, bos_seq_lengths))

    tags_to_produce = set()
    for example in labels:
        tags_to_produce.update(example.keys())

    labels_splitted = defaultdict(list)
    for tag in tags_to_produce:
        for example in labels:
            if tag in example:
                labels_splitted[tag].append(example[tag])
            else:
                labels_splitted[tag].append([0])

    padded_labels = dict()
    for label, tags in labels_splitted.items():
        padded_labels[label] = pad_sequences(tags, seq_lengths)
        tags_to_produce.add(label)

    return (
        (padded_words, padded_chars, padded_bos, tags_to_produce),
        padded_labels
    )
def select_model():
    """Use validation set to tune"""
    tr_texts, tr_labels = datareader.DirDataReader.read(
        os.path.join(base, cfg.get('data', 'train')), label2int)
    tr_texts, val_texts, tr_labels, val_labels = train_test_split(
        tr_texts, tr_labels, test_size=0.20, random_state=2020)

    tok = tokenizer.Tokenizer(cfg.getint('data', 'vocab_size'))
    tok.fit_on_texts(tr_texts)
    tr_texts = tok.texts_as_sets_to_seqs(tr_texts)
    val_texts = tok.texts_as_sets_to_seqs(val_texts)

    # todo: what's up with max length?
    train_loader = make_data_loader(
        utils.pad_sequences(tr_texts), tr_labels,
        cfg.getint('model', 'batch_size'), 'train')
    val_loader = make_data_loader(
        utils.pad_sequences(val_texts), val_labels,
        cfg.getint('model', 'batch_size'), 'dev')
    print('loaded %d training and %d validation samples' %
          (len(tr_texts), len(val_texts)))

    model = TransformerClassifier()
    label_counts = torch.bincount(torch.tensor(tr_labels))
    weights = len(tr_labels) / (2.0 * label_counts)

    best_roc, optimal_epochs = fit(
        model, train_loader, val_loader, weights,
        cfg.getint('model', 'n_epochs'))
    print('roc auc %.3f after %d epochs' % (best_roc, optimal_epochs))
    return optimal_epochs
def getBatch(self, batchIndex):
    batchX = []
    batchY = []
    startKeyIndex = batchIndex * self.batchSize
    endKeyIndex = (batchIndex + 1) * self.batchSize
    for i in range(startKeyIndex, endKeyIndex, 1):
        j = i % len(self.keyList)
        if j == 0:
            self.completedEpoch += 1
        sentenceX = self.dataFile[self.keyList[j] + "/input"]
        sentenceY = self.dataFile[self.keyList[j] + "/label"]
        batchX.append(sentenceX)
        batchY.append(sentenceY)

    batchX, _ = pad_sequences(batchX, maxlen=self.maxTimeStep)
    batchY, _ = pad_sequences(batchY, maxlen=self.maxTimeStep)
    return (batchX, batchY)
# Initialize the weights and biases
tf.global_variables_initializer().run()

for curr_epoch in range(num_epochs):
    train_cost = train_ler = 0
    start = time.time()

    for batch in range(num_batches_per_epoch):
        # Getting the index
        indexes = [i % num_examples
                   for i in range(batch * batch_size, (batch + 1) * batch_size)]
        batch_train_inputs = train_inputs[indexes]

        # Padding input to max_time_step of this batch
        batch_train_inputs, batch_train_seq_len = pad_sequences(batch_train_inputs)

        # Converting to sparse representation so as to feed the SparseTensor input
        batch_train_targets = sparse_tuple_from(train_targets[indexes])

        feed = {inputs: batch_train_inputs,
                targets: batch_train_targets,
                seq_len: batch_train_seq_len}

        batch_cost, _ = session.run([cost, optimizer], feed)
        train_cost += batch_cost * batch_size
        train_ler += session.run(ler, feed_dict=feed) * batch_size

    # Shuffle the data
    shuffled_indexes = np.random.permutation(num_examples)