def generate_paradigm(self, number_to_generate=1000, rel_output_path=None, absolute_path=None):
    """
    Contains the main loop for generating a full dataset for a given paradigm.

    Also contains exception handling: some exceptions are tolerated because
    sometimes no matching arguments can be found, but if more than 20% of
    cases raise, generation aborts since this is probably an issue in the
    code, and it could cause an infinite loop otherwise.

    :param number_to_generate: number of minimal pairs/sets to generate
    :param rel_output_path: relative path of output file
    :param absolute_path: absolute path of output file
    :return: None
    """
    # Resolve the output file: relative paths are anchored one directory
    # above this file; otherwise use the absolute path verbatim.
    if rel_output_path is not None:
        project_root = "/".join(
            os.path.join(os.path.dirname(
                os.path.abspath(__file__))).split("/")[:-1])
        output = open(os.path.join(project_root, rel_output_path), "w")
    elif absolute_path is not None:
        output = open(absolute_path, "w")
    else:
        raise Exception("You need to give an output path")
    past_sentences = set()  # tracking strings of pairs already emitted
    pairID = 0
    error_counter = 0
    constant_data = self.make_metadata_dict()
    print("Generating data for " + constant_data["UID"])
    self.make_logger(constant_data)
    try:
        output_writer = jsonlines.Writer(output, flush=True)
        while len(past_sentences) < number_to_generate:
            try:
                new_data, track_sentence = self.sample()
                # Only write pairs whose tracking sentence is unseen.
                if track_sentence not in past_sentences:
                    past_sentences.add(track_sentence)
                    for field in self.data_fields:
                        if field in new_data:
                            new_data[field] = string_beautify(new_data[field])
                    new_data.update(constant_data)
                    new_data["pairID"] = str(pairID)
                    pairID += 1
                    if pairID % 100 == 0:
                        print("%d sentences generated" % pairID)
                    output_writer.write(new_data)
            except Exception as e:
                # Tolerated: sometimes no matching arguments can be found.
                self.log_exception(e)
                print(self.get_stack_trace(e))
                error_counter += 1
                # Abort once errors exceed 20% of the target count — the
                # original `pass`-ed here, leaving the loop able to spin
                # forever despite the docstring's promise to terminate.
                if error_counter > number_to_generate // 5:
                    raise Exception("Over 20% of samples result in errors. You should fix this.")
    finally:
        # The original leaked the file handle; always close it.
        output.close()
def generate_paradigm(self, number_to_generate=10, rel_output_path=None, absolute_path=None):
    """
    Main loop for generating a full dataset for a given paradigm.

    Sampling errors are tolerated (sometimes no matching arguments can be
    found), but generation aborts once errors reach ~10% of the target
    count, since that suggests a bug and could otherwise loop forever.

    :param number_to_generate: number of paradigms to generate
    :param rel_output_path: output file path relative to the project root
    :param absolute_path: absolute output file path
    :return: None
    """
    # Resolve the output file: relative paths are anchored one directory
    # above this file; otherwise use the absolute path verbatim.
    if rel_output_path is not None:
        project_root = "/".join(
            os.path.join(os.path.dirname(
                os.path.abspath(__file__))).split("/")[:-1])
        output = open(os.path.join(project_root, rel_output_path), "w")
    elif absolute_path is not None:
        output = open(absolute_path, "w")
    else:
        raise Exception("You need to give an output path")
    past_sentences = set()  # tracking strings of paradigms already emitted
    sentenceID = 0
    paradigmID = 0
    error_counter = 0
    constant_data = self.make_metadata_dict()
    self.make_logger(constant_data)
    try:
        output_writer = jsonlines.Writer(output, flush=True)
        while len(past_sentences) < number_to_generate:
            try:
                new_data, track_sentence = self.sample()
                # Only write paradigms whose tracking sentence is unseen.
                if track_sentence not in past_sentences:
                    past_sentences.add(track_sentence)
                    for line in new_data:
                        line["sentence"] = string_beautify(line["sentence"])
                        line.update(constant_data)
                        line["sentenceID"] = "%s_%s_%s_%s" % (
                            sentenceID, line["condition"],
                            str(line["linguistic_feature_label"]),
                            str(line["surface_feature_label"]))
                        line["paradigmID"] = paradigmID
                        sentenceID += 1
                        output_writer.write(line)
                    paradigmID += 1
            except Exception as e:
                # Tolerated: sometimes no matching arguments can be found.
                self.log_exception(e)
                print(self.get_stack_trace(e))
                error_counter += 1
                if error_counter >= number_to_generate // 10:
                    # Fixed message: the original "\%" printed a literal
                    # backslash (and is a deprecated escape sequence).
                    raise Exception(
                        "Over 10% of samples result in errors. You should fix this."
                    )
    finally:
        # The original leaked the file handle; always close it.
        output.close()
def generate_paradigm(self, number_to_generate=1000, rel_output_path=None, absolute_path=None):
    """
    Main generation loop for one paradigm: samples minimal pairs until
    `number_to_generate` unique tracking sentences have been produced and
    streams each record to a jsonlines file.

    :param number_to_generate: number of minimal pairs to generate
    :param rel_output_path: output path relative to the project root
    :param absolute_path: absolute output path
    :return: None
    """
    # Resolve the output file: relative paths are anchored one directory
    # above this file; otherwise use the absolute path verbatim.
    if rel_output_path is not None:
        project_root = "/".join(
            os.path.join(os.path.dirname(
                os.path.abspath(__file__))).split("/")[:-1])
        output = open(os.path.join(project_root, rel_output_path), "w")
    elif absolute_path is not None:
        output = open(absolute_path, "w")
    else:
        raise Exception("You need to give an output path")
    past_sentences = set()  # tracking strings of pairs already emitted
    # NOTE(review): generated_data is never appended to anywhere below,
    # so the final write_all call emits nothing — the real output happens
    # incrementally through output_writer.
    generated_data = []
    pairID = 0
    error_counter = 0
    constant_data = self.make_metadata_dict()
    print("Generating data for " + constant_data["UID"])
    self.make_logger(constant_data)
    output_writer = jsonlines.Writer(output, flush=True)
    while len(past_sentences) < number_to_generate:
        try:
            new_data, track_sentence = self.sample()
            # Only write pairs whose tracking sentence is unseen.
            if track_sentence not in past_sentences:
                past_sentences.add(track_sentence)
                for field in self.data_fields:
                    if field in new_data:
                        new_data[field] = string_beautify(new_data[field])
                new_data.update(constant_data)
                new_data["pairID"] = str(pairID)
                pairID += 1
                if pairID % 100 == 0:
                    print("%d sentences generated" % pairID)
                output_writer.write(new_data)
        except Exception as e:
            # Tolerated: sometimes no matching arguments can be found.
            self.log_exception(e)
            print(self.get_stack_trace(e))
            error_counter += 1
            # NOTE(review): this guard is a no-op — the raise below was
            # disabled, so a persistently failing sampler loops forever.
            # Confirm whether that was intentional.
            if error_counter > number_to_generate // 5:
                pass
                # raise Exception("Over 20\% of samples result in errors. You should fix this.")
    # NOTE(review): writes an always-empty list (see above); the output
    # file is also never closed here.
    jsonlines.Writer(output).write_all(generated_data)
def generate_paradigm(self, number_to_generate=12, rel_output_path=None, absolute_path=None):
    """
    Main generation loop for one paradigm: samples paradigms until
    `number_to_generate` unique ones are collected, then writes all
    sentence records to a jsonlines file in a single batch.

    Sampling errors are tolerated (sometimes no matching arguments can be
    found), but generation aborts once errors exceed ~10% of the target
    count to avoid looping forever on a buggy sampler.

    :param number_to_generate: number of paradigms to generate
    :param rel_output_path: output path relative to the project root
    :param absolute_path: absolute output path
    :return: None
    """
    # Resolve the output file: relative paths are anchored one directory
    # above this file; otherwise use the absolute path verbatim.
    if rel_output_path is not None:
        project_root = "/".join(
            os.path.join(os.path.dirname(
                os.path.abspath(__file__))).split("/")[:-1])
        output = open(os.path.join(project_root, rel_output_path), "w")
    elif absolute_path is not None:
        output = open(absolute_path, "w")
    else:
        raise Exception("You need to give an output path")
    # A set gives O(1) duplicate checks; the original used a list, making
    # the loop accidentally quadratic in number_to_generate.
    past_sentences = set()
    generated_data = []
    error_counter = 0
    constant_data = self.make_metadata_dict()
    try:
        while len(past_sentences) < number_to_generate:
            try:
                new_data, track_sentence = self.sample()
                # Only keep paradigms whose tracking sentence is unseen.
                # (A leftover debug print of track_sentence was removed.)
                if track_sentence not in past_sentences:
                    past_sentences.add(track_sentence)
                    for C in new_data:
                        for field in self.data_fields:
                            if field in C:
                                C[field] = string_beautify(C[field])
                        C.update(constant_data)
                        generated_data.append(C)
            except Exception as e:
                # Tolerated: sometimes no matching arguments can be found.
                self.log_exception(e)
                print(self.get_stack_trace(e))
                error_counter += 1
                # Re-enabled the error-rate guard (it was commented out),
                # so a persistently failing sampler cannot loop forever.
                if error_counter > number_to_generate // 10:
                    raise Exception("Over 10% of samples result in errors. You should fix this.")
        jsonlines.Writer(output).write_all(generated_data)
    finally:
        # The original leaked the file handle; always close it.
        output.close()
def generate_paradigm(self, number_to_generate=10, rel_output_path=None, absolute_path=None):
    """
    Main generation loop for one paradigm, with per-field duplicate
    tracking: a sampled paradigm is rejected if ANY of its tracked
    sentences was already seen in the corresponding field's history.

    Sampling errors are tolerated (sometimes no matching arguments can be
    found), but generation aborts once errors reach ~10% of the target
    count, since that suggests a bug and could otherwise loop forever.

    :param number_to_generate: number of paradigms to generate
    :param rel_output_path: output path relative to the project root
    :param absolute_path: absolute output path
    :return: None
    """
    # Resolve the output file: relative paths are anchored one directory
    # above this file; otherwise use the absolute path verbatim.
    if rel_output_path is not None:
        project_root = "/".join(
            os.path.join(os.path.dirname(
                os.path.abspath(__file__))).split("/")[:-1])
        output = open(os.path.join(project_root, rel_output_path), "w")
    elif absolute_path is not None:
        output = open(absolute_path, "w")
    else:
        raise Exception("You need to give an output path")
    # One history set per data field.
    past_sentences = [set() for _ in range(len(self.data_fields))]
    sentenceID = 0
    paradigmID = 0
    error_counter = 0
    constant_data = self.make_metadata_dict()
    self.make_logger(constant_data)
    try:
        output_writer = jsonlines.Writer(output, flush=True)
        while len(past_sentences[0]) < number_to_generate:
            try:
                new_data, track_sentence = self.sample()
                overlap = False
                for i in range(len(track_sentence)):
                    if track_sentence[i] in past_sentences[i]:
                        overlap = True
                        break
                if not overlap:
                    for i in range(len(track_sentence)):
                        past_sentences[i].add(track_sentence[i])
                    for line in new_data:
                        line["sentence"] = string_beautify(line["sentence"])
                        line.update(constant_data)
                        line["sentenceID"] = sentenceID
                        line["paradigmID"] = paradigmID
                        sentenceID += 1
                        output_writer.write(line)
                    paradigmID += 1
            except Exception as e:
                # Tolerated: sometimes no matching arguments can be found.
                self.log_exception(e)
                print(self.get_stack_trace(e))
                error_counter += 1
                if error_counter >= number_to_generate // 10:
                    # Fixed message: the original "\%" printed a literal
                    # backslash (and is a deprecated escape sequence).
                    raise Exception("Over 10% of samples result in errors. You should fix this.")
    finally:
        # The original leaked the file handle; always close it.
        output.close()
V1 = conjugate(V1, DP1) V2 = conjugate(V2, DP1) Rel = choice(get_matched_by(DP1, "arg_1", get_all("category_2", "rel"))) sentence_1 = "%s %s %s %s %s %s." % (DP1[0], Rel[0], V1[0], Refl1[0], V2[0], DP2[0]) sentence_2 = "%s %s %s %s %s %s." % (DP1[0], Rel[0], V1[0], Refl2[0], V2[0], DP2[0]) sentence_3 = "%s %s %s %s %s %s." % (DP1[0], Rel[0], V1[0], DP2[0], V2[0], Refl1[0]) sentence_4 = "%s %s %s %s %s %s." % (DP1[0], Rel[0], V1[0], DP2[0], V2[0], Refl2[0]) sentence_1 = string_beautify(sentence_1) sentence_2 = string_beautify(sentence_2) sentence_3 = string_beautify(sentence_3) sentence_4 = string_beautify(sentence_4) if sentence_1 not in sentences: if writer == test_output: writer.write("%s\t%d\t\t%s\n" % ( "exp=reflexive-matrix_reflexive=0-matrix_antecedent=1-refl1=%s-refl2=%s-precede=1" % (Refl1[0], Refl2[0]), 1, sentence_1)) writer.write("%s\t%d\t\t%s\n" % ( "exp=reflexive-matrix_reflexive=0-matrix_antecedent=1-refl1=%s-refl2=%s-precede=0" % (Refl1[0], Refl2[0]), 0, sentence_2)) writer.write("%s\t%d\t\t%s\n" % ( "exp=reflexive-matrix_reflexive=1-matrix_antecedent=1-refl1=%s-refl2=%s-precede=1" % (Refl1[0], Refl2[0]), 1, sentence_3))
def generate_paradigm(self, number_to_generate=10, rel_output_path=None, absolute_path=None):
    """
    Main generation loop for one paradigm, producing train/test splits
    (plus per-split "control" files unless this IS a control paradigm),
    each written as jsonlines into an output directory.

    :param number_to_generate: number of paradigms to generate PER SPLIT
    :param rel_output_path: output directory relative to the project root
    :param absolute_path: absolute output directory
    :return: None
    """
    # Resolve the output directory: relative paths are anchored one
    # directory above this file; otherwise use the absolute path verbatim.
    if rel_output_path is not None:
        project_root = "/".join(
            os.path.join(os.path.dirname(
                os.path.abspath(__file__))).split("/")[:-1])
        output_dir = os.path.join(project_root, rel_output_path)
    elif absolute_path is not None:
        output_dir = absolute_path
    else:
        raise Exception("You need to give an output path")
    # Create the directory if needed; reuse it if it already exists.
    try:
        os.mkdir(output_dir)
    except FileExistsError:
        pass
    # One duplicate-tracking set per data field; histories are shared
    # across both splits so test and train never overlap.
    past_sentences = [set() for i in range(len(self.data_fields))]
    sentenceID = 0
    paradigmID = 0
    error_counter = 0
    constant_data = self.make_metadata_dict()
    self.make_logger(constant_data)
    for file in ["test.jsonl", "train.jsonl"]:
        # NOTE(review): these file handles are never closed.
        output_file = open(os.path.join(output_dir, file), "w")
        output_writer = jsonlines.Writer(output_file, flush=True)
        if not self.control_paradigm:
            # Separate file for control items whose two labels disagree.
            output_control_file = open(
                os.path.join(output_dir, "control_" + file), "w")
            output_control_writer = jsonlines.Writer(output_control_file,
                                                     flush=True)
        split_counter = 0
        while split_counter < number_to_generate:
            try:
                new_data, track_sentence = self.sample()
                # Reject the paradigm if any tracked sentence was already
                # seen in the matching field's history.
                overlap = False
                for i in range(len(track_sentence)):
                    if track_sentence[i] in past_sentences[i]:
                        overlap = True
                        break
                if not overlap:
                    for i in range(len(track_sentence)):
                        past_sentences[i].add(track_sentence[i])
                    # Annotate every sentence record with metadata and IDs.
                    for line in new_data:
                        line["sentence"] = string_beautify(
                            line["sentence"])
                        line.update(constant_data)
                        line["sentenceID"] = sentenceID
                        line["paradigmID"] = paradigmID
                        line["split"] = file.split(".")[0]
                        sentenceID += 1
                    if not self.control_paradigm:
                        # Control items with mismatched labels go to the
                        # control file only.
                        for line in list(
                                filter(
                                    lambda x: x["condition"] == "control"
                                    and x["linguistic_feature_label"]
                                    != x["surface_feature_label"],
                                    new_data)):
                            output_control_writer.write(line)
                    if file == "test.jsonl":
                        # Test split: everything EXCEPT the mismatched
                        # control items routed above.
                        for line in list(
                                filter(
                                    lambda x: not (x["condition"] == "control" and x[
                                        "linguistic_feature_label"] != x[
                                            "surface_feature_label"]),
                                    new_data)):
                            output_writer.write(line)
                    else:
                        # Train split: only "training"-condition items.
                        for line in list(
                                filter(
                                    lambda x: x["condition"] == "training",
                                    new_data)):
                            output_writer.write(line)
                    split_counter += 1
                    paradigmID += 1
            except Exception as e:
                self.log_exception(e)
                print(self.get_stack_trace(e))
                # Expected generation failures don't count toward the
                # error budget; anything else does.
                if not isinstance(e, LengthHelperError) and not isinstance(
                        e, MatchNotFoundError):
                    error_counter += 1
                    if error_counter >= number_to_generate // 10:
                        # NOTE(review): "\%" renders a literal backslash in
                        # the message and is a deprecated escape sequence.
                        raise Exception(
                            "Over 10\% of samples result in errors. You should fix this."
                        )
sentence_4 = "%s %s who %s %s %s %s %s %s." % ( D1_up[0], N1[0], V1[0], D2[0], N2[0], V2[0], any_decoy_N3[0], N3[0]) # build sentences with DE quantifier sentence_5 = "%s %s who %s any %s %s %s %s." % (D1_down[0], N1[0], V1[0], N2[0], V2[0], D3[0], N3[0]) sentence_6 = "%s %s who %s %s %s %s %s %s." % ( D1_down[0], N1[0], V1[0], any_decoy_N2[0], N2[0], V2[0], D3[0], N3[0]) sentence_7 = "%s %s who %s %s %s %s any %s." % (D1_down[0], N1[0], V1[0], D2[0], N2[0], V2[0], N3[0]) sentence_8 = "%s %s who %s %s %s %s %s %s." % ( D1_down[0], N1[0], V1[0], D2[0], N2[0], V2[0], any_decoy_N3[0], N3[0]) # remove doubled up spaces (this is because the bare plural doesn't have a determiner, # but the code outputs a determiner with an empty string. might want to change this) sentence_1 = string_beautify(sentence_1) sentence_2 = string_beautify(sentence_2) sentence_3 = string_beautify(sentence_3) sentence_4 = string_beautify(sentence_4) sentence_5 = string_beautify(sentence_5) sentence_6 = string_beautify(sentence_6) sentence_7 = string_beautify(sentence_7) sentence_8 = string_beautify(sentence_8) # write sentences to output if sentence_1 not in sentences: # sentences 1-4 have quantifiers with UE restrictor output.write("%s\t%d\t\t%s\n" % ( "experiment=NPI-env=quantifier-npi=any-crucial_item=%s-licensor=0-scope=1-npi_present=1" % D1_up[0], 0, sentence_1)) output.write("%s\t%d\t\t%s\n" % (
V1 = conjugate(V1, N1) N2 = N_to_DP_mutate(choice(get_matches_of(V1, "arg_2", all_nouns))) Rel = choice(get_matched_by(N1, "arg_1", get_all("category_2", "rel"))) subject_agree_auxiliaries = get_matched_by(N1, "arg_1", all_auxiliaries) for Aux in subject_agree_auxiliaries: acceptability = 1 if is_match_disj(V2, Aux["arg_2"]) else 0 sentence_1 = "%s %s %s %s %s %s %s?" % (Aux[0], N1[0], Rel[0], V1[0], N2[0], V2[0], N3[0]) sentence_2 = "%s %s %s %s %s %s %s?" % (Aux[0], N1[0], V2[0], N3[0], Rel[0], V1[0], N2[0]) sentence_1 = string_beautify(sentence_1) sentence_2 = string_beautify(sentence_2) writer = np.random.choice([train_output, dev_output, test_output], 1, p=[0.5, 0.25, 0.25])[0] # out_of_domain_writer = np.random.choice([dev_output, test_output], 1)[0] \ # if in_domain_writer == train_output \ # else in_domain_writer # paradigm_in_domain = 1 if in_domain_writer == train_output else 0 if sentence_1 not in sentences: writer.write("%s\t%d\t\t%s\n" % ("exp=polar-src=1-highest=1-last=1-aux=%s" % Aux[0], acceptability, sentence_1)) writer.write("%s\t%d\t\t%s\n" % ("exp=polar-src=1-highest=1-last=1-aux=%s" % Aux[0], acceptability, sentence_2)) # writer.write("%s\t%d\t\t%s\n" % ("exp=polar-src=0-highest=1-last=0-aux=%s-aux2=%s-paradigm_in_domain=%d" % (Aux[0], paradigm_in_domain), 1, sentence_2))