import json
import math
import random

import numpy as np

import generate_data as gd
import data_util as du  # assumed module name for the `du` parsing utility


def crazy2_get_feed(path, batch_size, word_to_id, max_premise_length,
                    max_hypothesis_length, num_iter=None, shuffle=False):
    """Yields padded batches from the composite dataset at path + "1256",
    whose examples each carry a list of 12 gold labels."""
    data, _, _ = gd.process_data(1.0)
    premises = []
    premise_lengths = []
    hypotheses = []
    hypothesis_lengths = []
    labels = []
    with open(path + "1256", 'r') as f:
        lines = f.readlines()
    if shuffle:
        random.shuffle(lines)
    for line in lines:
        example = json.loads(line)
        if (" and " in example["sentence1"] or " or " in example["sentence1"]
                or " then " in example["sentence1"]):
            # Compound sentences are re-serialized from their parse so that
            # empty constituents appear as the literal token "emptystring".
            prem_parse = du.parse_sentence(data, example["sentence1"])
            prem = (prem_parse[0].emptystring + " " + prem_parse[1] + " "
                    + prem_parse[2].emptystring)
            hyp_parse = du.parse_sentence(data, example["sentence2"])
            hyp = (hyp_parse[0].emptystring + " " + hyp_parse[1] + " "
                   + hyp_parse[2].emptystring)
            premises.append(sentence_to_id(prem, word_to_id,
                                           max_premise_length))
            premise_lengths.append(len(prem.split()))
            hypotheses.append(sentence_to_id(hyp, word_to_id,
                                             max_hypothesis_length))
            hypothesis_lengths.append(len(hyp.split()))
        else:
            sentence1 = example["sentence1"]
            sentence2 = example["sentence2"]
            premises.append(sentence_to_id(sentence1, word_to_id,
                                           max_premise_length))
            premise_lengths.append(len(sentence1.split()))
            hypotheses.append(sentence_to_id(sentence2, word_to_id,
                                             max_hypothesis_length))
            hypothesis_lengths.append(len(sentence2.split()))
        labels.append(
            [label_to_num(example["gold_label"][i]) for i in range(12)])
        if num_iter is not None and len(labels) > num_iter * batch_size:
            break
    if num_iter is None:
        num_iter = int(math.ceil(len(labels) / batch_size))
    for i in range(num_iter):
        yield (np.array(premises[i * batch_size:(i + 1) * batch_size]),
               np.array(premise_lengths[i * batch_size:(i + 1) * batch_size]),
               np.array(hypotheses[i * batch_size:(i + 1) * batch_size]),
               np.array(hypothesis_lengths[i * batch_size:
                                           (i + 1) * batch_size]),
               np.array(labels[i * batch_size:(i + 1) * batch_size]),
               1256)  # final element tags the source dataset
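# `sentence_to_id` and `label_to_num` are referenced by the feed functions
# but defined elsewhere in the repo. Below is a minimal sketch of the contract
# assumed here; the padding id and the label-string encoding are assumptions,
# not the repo's actual definitions.
def _sentence_to_id_sketch(sentence, word_to_id, max_length):
    # Map tokens to vocabulary ids, truncating and right-padding to
    # max_length (0 assumed to be the padding id).
    ids = [word_to_id[w] for w in sentence.split()[:max_length]]
    return ids + [0] * (max_length - len(ids))


def _label_to_num_sketch(label):
    # Map a gold-label string to an integer class id (label names assumed).
    return {"entails": 0, "contradicts": 1, "permits": 2}[label]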
def get_vocab():
    """Returns the vocabulary: the special logical tokens plus every
    lowercased word that appears in the generated data."""
    data, _, _ = gd.process_data(1.0)
    vocab = [
        "doesnot", "any", "or", "and", "if", "then", "emptystring", "notevery"
    ]
    for k in data:
        for word in data[k]:
            if isinstance(word, list):
                vocab += [w.lower() for w in word]
            else:
                vocab.append(word.lower())
    return vocab
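# A minimal usage sketch tying get_vocab to crazy2_get_feed. The path prefix,
# batch size, and sequence lengths are placeholders; word ids start at 1 so
# that 0 can serve as padding, matching the sketch above.
def _feed_demo():
    word_to_id = {w: i for i, w in enumerate(get_vocab(), start=1)}
    feed = crazy2_get_feed("data/experiment.", 32, word_to_id,
                           max_premise_length=20, max_hypothesis_length=20,
                           shuffle=True)
    for prem, prem_len, hyp, hyp_len, label, dataset_id in feed:
        pass  # each array has leading dimension 32; label has shape (32, 12)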
import generate_data as gd

if __name__ == '__main__':
    train_size = 500000
    val_size = 10000
    test_size = 10000
    # All three splits draw from the same underlying data; only the number
    # of examples differs.
    data, _, _ = gd.process_data(1.0)
    for split, size in [("train", train_size), ("val", val_size),
                        ("test", test_size)]:
        examples = gd.generate_balanced_data(
            "simple_solutions", "boolean_solutions", size, 0, data,
            simple_sampling="level 2", boolean_sampling="level 0")
        gd.save_data(examples, "experiment1_level2." + split)
def crazy_get_feed(path, batch_size, word_to_id, max_premise_length,
                   max_hypothesis_length, num_iter=None, shuffle=False):
    """Yields batches drawn in shuffled round-robin order from five
    sub-datasets, identified by the file suffixes "", "1", "2", "5", "6"."""
    data, _, _ = gd.process_data(1.0)
    premises = [[], [], [], [], []]
    premise_lengths = [[], [], [], [], []]
    hypotheses = [[], [], [], [], []]
    hypothesis_lengths = [[], [], [], [], []]
    labels = [[], [], [], [], []]
    for i, suffix in enumerate(["", "1", "2", "5", "6"]):
        with open(path + suffix, 'r') as f:
            lines = f.readlines()
        if shuffle:
            random.shuffle(lines)
        for line in lines:
            example = json.loads(line)
            if (" and " in example["sentence1"]
                    or " or " in example["sentence1"]
                    or " then " in example["sentence1"]):
                # Compound sentences are re-serialized from their parse so
                # that empty constituents appear as the token "emptystring".
                prem_parse = du.parse_sentence(data, example["sentence1"])
                prem = (prem_parse[0].emptystring + " " + prem_parse[1]
                        + " " + prem_parse[2].emptystring)
                hyp_parse = du.parse_sentence(data, example["sentence2"])
                hyp = (hyp_parse[0].emptystring + " " + hyp_parse[1]
                       + " " + hyp_parse[2].emptystring)
                premises[i].append(
                    sentence_to_id(prem, word_to_id, max_premise_length))
                premise_lengths[i].append(len(prem.split()))
                hypotheses[i].append(
                    sentence_to_id(hyp, word_to_id, max_hypothesis_length))
                hypothesis_lengths[i].append(len(hyp.split()))
            else:
                sentence1 = example["sentence1"]
                sentence2 = example["sentence2"]
                premises[i].append(
                    sentence_to_id(sentence1, word_to_id,
                                   max_premise_length))
                premise_lengths[i].append(len(sentence1.split()))
                hypotheses[i].append(
                    sentence_to_id(sentence2, word_to_id,
                                   max_hypothesis_length))
                hypothesis_lengths[i].append(len(sentence2.split()))
            labels[i].append(label_to_num(example["gold_label"]))
            if (num_iter is not None
                    and len(labels[i]) > num_iter * batch_size):
                break
    if num_iter is None:
        num_iter = int(math.ceil(len(labels[0]) / batch_size))
    # Interleave the five sub-datasets: one batch index per dataset per
    # iteration, shuffled so batches arrive in random order.
    batches = [(i, j) for i in range(num_iter) for j in range(5)]
    random.shuffle(batches)
    # Premise length associated with each sub-dataset; the unsuffixed file
    # holds the length-9 premises.
    lengths = {0: 9, 1: 1, 2: 2, 3: 5, 4: 6}
    for i, j in batches:
        yield (np.array(premises[j][i * batch_size:(i + 1) * batch_size]),
               np.array(premise_lengths[j][i * batch_size:
                                           (i + 1) * batch_size]),
               np.array(hypotheses[j][i * batch_size:(i + 1) * batch_size]),
               np.array(hypothesis_lengths[j][i * batch_size:
                                              (i + 1) * batch_size]),
               np.array(labels[j][i * batch_size:(i + 1) * batch_size]),
               lengths[j])
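# Continuing the sketch above: crazy_get_feed interleaves its five
# sub-datasets, so with a fixed num_iter each premise-length id (9, 1, 2, 5,
# 6) appears in exactly num_iter batches, in shuffled order. The path prefix
# and sequence lengths are placeholders.
from collections import Counter


def _interleaving_demo(path="data/experiment.", batch_size=32, num_iter=4):
    word_to_id = {w: i for i, w in enumerate(get_vocab(), start=1)}
    feed = crazy_get_feed(path, batch_size, word_to_id, 20, 20,
                          num_iter=num_iter, shuffle=True)
    # expected result: Counter({9: 4, 1: 4, 2: 4, 5: 4, 6: 4})
    return Counter(batch[-1] for batch in feed)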
import generate_data as gd

if __name__ == '__main__':
    train_size = 500000
    val_size = 10000
    test_size = 10000
    # Unlike experiment 1, the splits here are generated from separate
    # portions of the underlying data (process_data(0.6)).
    train_data, val_data, test_data = gd.process_data(0.6)
    for split, size, split_data in [("train", train_size, train_data),
                                    ("val", val_size, val_data),
                                    ("test", test_size, test_data)]:
        examples = gd.generate_balanced_data(
            "simple_solutions", "boolean_solutions", size, 0, split_data,
            simple_sampling="level 0", boolean_sampling="level 0")
        gd.save_data(examples, "experiment3_level0." + split)
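# The feed functions above read these files as JSON lines with "sentence1",
# "sentence2", and "gold_label" fields. A quick sketch for sanity-checking a
# generated split; the balance expectation follows from the name
# generate_balanced_data and is an assumption about its behavior.
import json
from collections import Counter

with open("experiment3_level0.val", 'r') as f:
    label_counts = Counter(json.loads(line)["gold_label"] for line in f)
print(label_counts)  # roughly equal counts per label if balancing worked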