import json
import math
import random

import numpy as np

import data_util as du
import generate_data as gd


def crazy2_get_feed(path, batch_size, word_to_id, max_premise_length,
                    max_hypothesis_length, num_iter=None, shuffle=False):
    data, _, _ = gd.process_data(1.0)
    premises = []
    premise_lengths = []
    hypotheses = []
    hypothesis_lengths = []
    labels = []
    with open(path + "1256", 'r') as f:
        lines = f.readlines()
        if shuffle:
            random.shuffle(lines)
        for line in lines:
            example = json.loads(line)
            if (" and " in example["sentence1"] or " or " in example["sentence1"]
                    or " then " in example["sentence1"]):
                # Boolean example: rebuild each side as "clause conjunction
                # clause", padding unused slots with "emptystring" tokens.
                parsed_premise = du.parse_sentence(data, example["sentence1"])
                parsed_hypothesis = du.parse_sentence(data, example["sentence2"])
                prem = (parsed_premise[0].emptystring + " " + parsed_premise[1] +
                        " " + parsed_premise[2].emptystring)
                hyp = (parsed_hypothesis[0].emptystring + " " + parsed_hypothesis[1] +
                       " " + parsed_hypothesis[2].emptystring)
            else:
                # Simple example: use the raw sentences.
                prem = example["sentence1"]
                hyp = example["sentence2"]
            premises.append(sentence_to_id(prem, word_to_id, max_premise_length))
            premise_lengths.append(len(prem.split()))
            hypotheses.append(sentence_to_id(hyp, word_to_id, max_hypothesis_length))
            hypothesis_lengths.append(len(hyp.split()))
            # Twelve aligned sub-labels per example.
            labels.append(
                [label_to_num(example["gold_label"][i]) for i in range(12)])
            if num_iter is not None and len(labels) > num_iter * batch_size:
                break
    if num_iter is None:
        num_iter = int(math.ceil(len(labels) / batch_size))
    for i in range(num_iter):
        yield (np.array(premises[i * batch_size:(i + 1) * batch_size]),
               np.array(premise_lengths[i * batch_size:(i + 1) * batch_size]),
               np.array(hypotheses[i * batch_size:(i + 1) * batch_size]),
               np.array(hypothesis_lengths[i * batch_size:(i + 1) * batch_size]),
               np.array(labels[i * batch_size:(i + 1) * batch_size]),
               1256)
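# Usage sketch (hypothetical vocabulary and padding lengths, not values
# taken from this repo's configuration):
#
# vocab = {"emptystring": 0, "every": 1, "wizard": 2}  # word -> id map
# for prem, prem_len, hyp, hyp_len, y, code in crazy2_get_feed(
#         "Data/test", 32, vocab, 12, 12, shuffle=True):
#     # each array has leading dimension <= 32; code is always 1256 here
#     assert y.shape[1] == 12  # twelve sub-labels per example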
import natural_logic_model as nlm
import data_util
import generate_data as gd

data, _, _ = gd.process_data(1.0)
print(
    "Input a premise sentence and hypothesis sentence of the form:\n"
    " Determiner (Adjective) Noun (does not) Verb Determiner Adjective Noun\n"
    " Make sure you conjugate to the present tense and use vocabulary from"
    " the files in the Data folder\n"
    " You can also combine two simple sentences of that form with:"
    " or, and, if...then")
while True:
    premise = data_util.parse_sentence(data, input("Enter a premise sentence:\n"))
    while premise is None:
        premise = data_util.parse_sentence(
            data,
            input("There was some issue with the entered premise\n"
                  "Enter a premise sentence:\n"))
    hypothesis = data_util.parse_sentence(data,
                                          input("Enter a hypothesis sentence:\n"))
    while hypothesis is None:
        hypothesis = data_util.parse_sentence(
            data,
            input("There was some issue with the entered hypothesis\n"
                  "Enter a hypothesis sentence:\n"))
    if len(premise) == 1:
        label = nlm.get_label(
            nlm.compute_simple_relation(premise[0], hypothesis[0]))
    else:
        label = nlm.get_label(
            nlm.compute_boolean_relation(premise[0], premise[1], premise[2],
                                         hypothesis[0], hypothesis[1],
                                         hypothesis[2]))
    print(label)
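# Illustrative session (sentences must come from the Data-folder
# vocabulary; the final line is whatever relation label nlm assigns):
#
#   Enter a premise sentence:
#   every wizard eats some flute
#   Enter a hypothesis sentence:
#   some wizard eats some flute
#   <relation label, e.g. "entails" or "independence">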
example2["gold_label"] = "equivalence" elif example["sentence2"].split( )[i] == "emptystring" and example["sentence2"].split()[ i + 1] == example["sentence1"].split()[i + 1]: example2["gold_label"] = "entails" elif example["sentence1"].split( )[i] == "emptystring" and example["sentence2"].split()[ i + 1] == example["sentence1"].split()[i + 1]: example2["gold_label"] = "reverse entails" else: example2["gold_label"] = "independence" label.append(example2["gold_label"]) example5 = dict() example5["sentence1"] = adjoin(example["sentence1"].split()[-5:]) example5["sentence2"] = adjoin(example["sentence2"].split()[-5:]) premise = du.parse_sentence(data, example["sentence1"])[0] hypothesis = du.parse_sentence(data, example["sentence2"])[0] verb_relation = nlm.standard_lexical_merge(premise.verb, hypothesis.verb) adverb_relation = nlm.standard_lexical_merge( premise.adverb, hypothesis.adverb) object_negation_signature = nlm.negation_merge( premise.object_negation, hypothesis.object_negation) object_determiner_signature = nlm.determiner_merge( premise.natlog_object_determiner, hypothesis.natlog_object_determiner) object_noun_relation = nlm.standard_lexical_merge( premise.object_noun, hypothesis.object_noun) object_adjective_relation = nlm.standard_lexical_merge( premise.object_adjective, hypothesis.object_adjective) VP_relation = nlm.standard_phrase(adverb_relation, verb_relation)
with open("simple_solutions", "r") as f: simple_solutions = json.loads(f.read()) for encoding in simple_solutions: encoding = json.loads(encoding) premise, hypothesis = gd.encoding_to_example(data, encoding) if gd.example_to_encoding(premise, hypothesis) != encoding: print("We have a problem with the simple encoding") nlm_label = nlm.get_label( nlm.compute_simple_relation(premise, hypothesis)) if simple_solutions[json.dumps(encoding)] != nlm_label: print("We have a problem with the simple file") print("simple file is good") with open("boolean_solutions", "r") as f: boolean_solutions = json.loads(f.read()) simple1 = [ (data_util.parse_sentence(data, "some wizard eats some flute")[0], data_util.parse_sentence(data, "some wizard eats some flute")[0]) ] simple1.append( (data_util.parse_sentence(data, "every wizard eats every flute")[0], data_util.parse_sentence(data, "some wizard eats some flute")[0])) simple1.append( (data_util.parse_sentence(data, "some wizard eats some flute")[0], data_util.parse_sentence(data, "every wizard eats every flute")[0])) simple1.append( (data_util.parse_sentence(data, "no wizard eats some flute")[0], data_util.parse_sentence(data, "some wizard eats every flute")[0])) simple1.append(
def crazy_get_feed(path, batch_size, word_to_id, max_premise_length,
                   max_hypothesis_length, num_iter=None, shuffle=False):
    data, _, _ = gd.process_data(1.0)
    # One bucket per source-file suffix, so batches stay type-homogeneous.
    premises = [[], [], [], [], []]
    premise_lengths = [[], [], [], [], []]
    hypotheses = [[], [], [], [], []]
    hypothesis_lengths = [[], [], [], [], []]
    labels = [[], [], [], [], []]
    for i, suffix in enumerate(["", "1", "2", "5", "6"]):
        with open(path + suffix, 'r') as f:
            lines = f.readlines()
            if shuffle:
                random.shuffle(lines)
            for line in lines:
                example = json.loads(line)
                if (" and " in example["sentence1"]
                        or " or " in example["sentence1"]
                        or " then " in example["sentence1"]):
                    # Boolean example: rebuild each side as "clause
                    # conjunction clause" with "emptystring" padding.
                    parsed_premise = du.parse_sentence(data, example["sentence1"])
                    parsed_hypothesis = du.parse_sentence(data, example["sentence2"])
                    prem = (parsed_premise[0].emptystring + " " + parsed_premise[1] +
                            " " + parsed_premise[2].emptystring)
                    hyp = (parsed_hypothesis[0].emptystring + " " + parsed_hypothesis[1] +
                           " " + parsed_hypothesis[2].emptystring)
                else:
                    prem = example["sentence1"]
                    hyp = example["sentence2"]
                # Append into bucket i rather than the outer lists, so the
                # five buckets stay aligned with their label lists.
                premises[i].append(sentence_to_id(prem, word_to_id, max_premise_length))
                premise_lengths[i].append(len(prem.split()))
                hypotheses[i].append(sentence_to_id(hyp, word_to_id, max_hypothesis_length))
                hypothesis_lengths[i].append(len(hyp.split()))
                labels[i].append(label_to_num(example["gold_label"]))
                if num_iter is not None and len(labels[i]) > num_iter * batch_size:
                    break
    if num_iter is None:
        num_iter = int(math.ceil(len(labels[0]) / batch_size))
    # Interleave batches from all five buckets in random order.
    batches = []
    for i in range(num_iter):
        for j in range(5):
            batches.append((i, j))
    # Tag yielded with each batch: 9 for the base file, else the suffix.
    lengths = {0: 9, 1: 1, 2: 2, 3: 5, 4: 6}
    random.shuffle(batches)
    for i, j in batches:
        yield (np.array(premises[j][i * batch_size:(i + 1) * batch_size]),
               np.array(premise_lengths[j][i * batch_size:(i + 1) * batch_size]),
               np.array(hypotheses[j][i * batch_size:(i + 1) * batch_size]),
               np.array(hypothesis_lengths[j][i * batch_size:(i + 1) * batch_size]),
               np.array(labels[j][i * batch_size:(i + 1) * batch_size]),
               lengths[j])
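# Usage sketch (hypothetical path and vocabulary): batches from the five
# files arrive interleaved in random order, and the final element of each
# tuple (9, 1, 2, 5, or 6) identifies the file the batch came from.
#
# for prem, prem_len, hyp, hyp_len, y, code in crazy_get_feed(
#         "Data/train", 32, vocab, 12, 12, shuffle=True):
#     process(code, prem, hyp)  # process() is a placeholder consumer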
    premise, hypothesis = gd.encoding_to_example(data, encoding)
    if gd.example_to_encoding(premise, hypothesis) != encoding:
        print("We have a problem with the simple encoding")
    nlm_label = nlm.get_label(
        nlm.compute_simple_relation(premise, hypothesis))
    if convert(simple_solutions[json.dumps(encoding)]) != nlm_label:
        print("We have a problem with the simple file")
print("simple file is good")

examples = gd.generate_balanced_data("simple_solutions", "boolean_solutions",
                                     100, 0, data,
                                     simple_sampling="level 2",
                                     boolean_sampling="level 1")
gd.save_data(examples, "test")
examples = []
with open("test", "r") as f:
    lines = f.readlines()
    for line in lines:
        examples.append(json.loads(line))
for example in examples:
    premise = data_util.parse_sentence(data, example["sentence1"])
    hypothesis = data_util.parse_sentence(data, example["sentence2"])
    if len(premise) == 1:
        fol_label = fol.get_label(premise[0], hypothesis[0])
        nlm_label = nlm.get_label(
            nlm.compute_simple_relation(premise[0], hypothesis[0]))
        if example["gold_label"] != fol_label or fol_label != nlm_label:
            print(example["gold_label"], fol_label, nlm_label)
            print("We have a problem with simple generation")
    else:
        premise1 = premise[0]
        premise_conjunction = premise[1]
        premise2 = premise[2]
        hypothesis1 = hypothesis[0]
        hypothesis_conjunction = hypothesis[1]
        hypothesis2 = hypothesis[2]
        nlm_label = nlm.get_label(
            nlm.compute_boolean_relation(premise1, premise_conjunction,
                                         premise2, hypothesis1,
                                         hypothesis_conjunction, hypothesis2))
        # Mirror the simple-case check above (the excerpt is truncated here).
        if example["gold_label"] != nlm_label:
            print(example["gold_label"], nlm_label)
            print("We have a problem with boolean generation")