Пример #1
0
 def generate_paradigm(self,
                       number_to_generate=1000,
                       rel_output_path=None,
                       absolute_path=None):
     """
     Contains the main loop for generating a full dataset for a given paradigm.

     Samples are drawn with ``self.sample()`` until ``number_to_generate``
     distinct tracking sentences have been seen. Each new sample has its
     ``self.data_fields`` entries passed through ``string_beautify``, is merged
     with the paradigm metadata, gets a sequential ``pairID`` (as a string) and
     is written immediately as one JSON line.

     Exceptions raised while sampling or writing are logged, printed and
     tolerated, because sometimes no matching arguments can be found.
     NOTE: there is no abort threshold, so if sampling fails (or repeats
     itself) persistently this loop does not terminate.

     :param number_to_generate: number of minimal pairs/sets to generate
     :param rel_output_path: output file path relative to the project root
         (the parent of the directory containing this file)
     :param absolute_path: absolute path of output file, used only when
         rel_output_path is None
     :return: None
     :raises Exception: if neither output path is given
     """
     if rel_output_path is not None:
         # dirname twice = parent of this file's directory, without relying
         # on "/" being the path separator.
         project_root = os.path.dirname(
             os.path.dirname(os.path.abspath(__file__)))
         output_path = os.path.join(project_root, rel_output_path)
     elif absolute_path is not None:
         output_path = absolute_path
     else:
         raise Exception("You need to give an output path")
     past_sentences = set()
     pairID = 0
     # The context manager closes the output file even if generation aborts.
     with open(output_path, "w") as output:
         constant_data = self.make_metadata_dict()
         print("Generating data for " + constant_data["UID"])
         self.make_logger(constant_data)
         # flush=True pushes every record to disk as soon as it is written.
         output_writer = jsonlines.Writer(output, flush=True)
         while len(past_sentences) < number_to_generate:
             try:
                 new_data, track_sentence = self.sample()
                 if track_sentence in past_sentences:
                     # Duplicate sample: draw again.
                     continue
                 past_sentences.add(track_sentence)
                 for field in self.data_fields:
                     if field in new_data:
                         new_data[field] = string_beautify(new_data[field])
                 # Merge the metadata once, whether or not any field matched.
                 new_data.update(constant_data)
                 new_data["pairID"] = str(pairID)
                 pairID += 1
                 if pairID % 100 == 0:
                     print("%d sentences generated" % pairID)
                 output_writer.write(new_data)
             except Exception as e:
                 self.log_exception(e)
                 print(self.get_stack_trace(e))
Пример #2
0
 def generate_paradigm(self,
                       number_to_generate=10,
                       rel_output_path=None,
                       absolute_path=None):
     """
     Generate a dataset for this paradigm and write it as JSON lines.

     Paradigms are drawn with ``self.sample()`` until ``number_to_generate``
     distinct tracking sentences have been seen. Every line of a new paradigm
     has its "sentence" passed through ``string_beautify``, is merged with the
     paradigm metadata, and is tagged with a "sentenceID" of the form
     ``<counter>_<condition>_<linguistic_feature_label>_<surface_feature_label>``
     and the "paradigmID" of the paradigm it belongs to, then written
     immediately.

     :param number_to_generate: number of distinct paradigms to generate
     :param rel_output_path: output file path relative to the project root
         (the parent of the directory containing this file)
     :param absolute_path: absolute path of output file, used only when
         rel_output_path is None
     :return: None
     :raises Exception: if neither output path is given, or once the number
         of failed samples reaches ``number_to_generate // 10``
     """
     if rel_output_path is not None:
         # dirname twice = parent of this file's directory, without relying
         # on "/" being the path separator.
         project_root = os.path.dirname(
             os.path.dirname(os.path.abspath(__file__)))
         output_path = os.path.join(project_root, rel_output_path)
     elif absolute_path is not None:
         output_path = absolute_path
     else:
         raise Exception("You need to give an output path")
     past_sentences = set()
     sentenceID = 0
     paradigmID = 0
     error_counter = 0
     error_limit = number_to_generate // 10
     # The context manager closes the output file even when the error limit
     # aborts generation.
     with open(output_path, "w") as output:
         constant_data = self.make_metadata_dict()
         self.make_logger(constant_data)
         # flush=True pushes every record to disk as soon as it is written.
         output_writer = jsonlines.Writer(output, flush=True)
         while len(past_sentences) < number_to_generate:
             try:
                 new_data, track_sentence = self.sample()
                 if track_sentence in past_sentences:
                     # Duplicate sample: draw again.
                     continue
                 past_sentences.add(track_sentence)
                 for line in new_data:
                     line["sentence"] = string_beautify(line["sentence"])
                     line.update(constant_data)
                     line["sentenceID"] = "%s_%s_%s_%s" % (
                         sentenceID, line["condition"],
                         str(line["linguistic_feature_label"]),
                         str(line["surface_feature_label"]))
                     line["paradigmID"] = paradigmID
                     sentenceID += 1
                     output_writer.write(line)
                 paradigmID += 1
             except Exception as e:
                 self.log_exception(e)
                 print(self.get_stack_trace(e))
                 error_counter += 1
                 if error_counter >= error_limit:
                     raise Exception(
                         "Over 10% of samples result in errors. You should fix this."
                     )
Пример #3
0
 def generate_paradigm(self,
                       number_to_generate=1000,
                       rel_output_path=None,
                       absolute_path=None):
     """
     Generate a full dataset for this paradigm and write it as JSON lines.

     Samples are drawn with ``self.sample()`` until ``number_to_generate``
     distinct tracking sentences have been seen. Each new sample has its
     ``self.data_fields`` entries passed through ``string_beautify``, is merged
     with the paradigm metadata, gets a sequential ``pairID`` (as a string) and
     is written immediately as one JSON line.

     Exceptions raised while sampling or writing are logged, printed and
     tolerated. NOTE: there is no abort threshold, so if sampling fails (or
     repeats itself) persistently this loop does not terminate.

     :param number_to_generate: number of minimal pairs/sets to generate
     :param rel_output_path: output file path relative to the project root
         (the parent of the directory containing this file)
     :param absolute_path: absolute path of output file, used only when
         rel_output_path is None
     :return: None
     :raises Exception: if neither output path is given
     """
     if rel_output_path is not None:
         # dirname twice = parent of this file's directory, without relying
         # on "/" being the path separator.
         project_root = os.path.dirname(
             os.path.dirname(os.path.abspath(__file__)))
         output_path = os.path.join(project_root, rel_output_path)
     elif absolute_path is not None:
         output_path = absolute_path
     else:
         raise Exception("You need to give an output path")
     past_sentences = set()
     pairID = 0
     # The context manager closes the output file even if generation aborts.
     with open(output_path, "w") as output:
         constant_data = self.make_metadata_dict()
         print("Generating data for " + constant_data["UID"])
         self.make_logger(constant_data)
         # flush=True pushes every record to disk as soon as it is written.
         output_writer = jsonlines.Writer(output, flush=True)
         while len(past_sentences) < number_to_generate:
             try:
                 new_data, track_sentence = self.sample()
                 if track_sentence in past_sentences:
                     # Duplicate sample: draw again.
                     continue
                 past_sentences.add(track_sentence)
                 for field in self.data_fields:
                     if field in new_data:
                         new_data[field] = string_beautify(new_data[field])
                 # Merge the metadata once, whether or not any field matched.
                 new_data.update(constant_data)
                 new_data["pairID"] = str(pairID)
                 pairID += 1
                 if pairID % 100 == 0:
                     print("%d sentences generated" % pairID)
                 output_writer.write(new_data)
             except Exception as e:
                 self.log_exception(e)
                 print(self.get_stack_trace(e))
Пример #4
0
    def generate_paradigm(self,
                          number_to_generate=12,
                          rel_output_path=None,
                          absolute_path=None):
        """
        Generate a dataset for this paradigm and write it as JSON lines.

        Paradigms are drawn with ``self.sample()`` until ``number_to_generate``
        distinct tracking sentences have been collected. Every item of a new
        paradigm has its ``self.data_fields`` entries passed through
        ``string_beautify`` and is merged with the paradigm metadata. All items
        are buffered and written in one go after generation finishes.

        Exceptions raised while sampling are logged, printed and tolerated.
        NOTE: there is no abort threshold, so if sampling fails (or repeats
        itself) persistently this loop does not terminate.

        :param number_to_generate: number of distinct paradigms to generate
        :param rel_output_path: output file path relative to the project root
            (the parent of the directory containing this file)
        :param absolute_path: absolute path of output file, used only when
            rel_output_path is None
        :return: None
        :raises Exception: if neither output path is given
        """
        if rel_output_path is not None:
            # dirname twice = parent of this file's directory, without relying
            # on "/" being the path separator.
            project_root = os.path.dirname(
                os.path.dirname(os.path.abspath(__file__)))
            output_path = os.path.join(project_root, rel_output_path)
        elif absolute_path is not None:
            output_path = absolute_path
        else:
            raise Exception("You need to give an output path")
        # Opened up front so a bad path fails before any sampling work; the
        # context manager guarantees the buffered records are flushed/closed.
        with open(output_path, "w") as output:
            # A list (not a set) so track_sentence is not required to be
            # hashable.
            past_sentences = []
            generated_data = []
            constant_data = self.make_metadata_dict()
            while len(past_sentences) < number_to_generate:
                try:
                    new_data, track_sentence = self.sample()
                    print(track_sentence)
                    if track_sentence in past_sentences:
                        # Duplicate sample: draw again.
                        continue
                    past_sentences.append(track_sentence)
                    for C in new_data:
                        for field in self.data_fields:
                            if field in C:
                                C[field] = string_beautify(C[field])
                        # Merge the metadata once per item, whether or not
                        # any field matched.
                        C.update(constant_data)
                        generated_data.append(C)
                except Exception as e:
                    self.log_exception(e)
                    print(self.get_stack_trace(e))
            jsonlines.Writer(output).write_all(generated_data)
 def generate_paradigm(self, number_to_generate=10, rel_output_path=None, absolute_path=None):
     """
     Generate a dataset for this paradigm and write it as JSON lines.

     ``self.sample()`` returns the paradigm's lines plus a sequence of
     tracking sentences. One set of already-used sentences is kept per entry
     of ``self.data_fields``; a sample is rejected if any tracking sentence
     was already used in the same position. Generation stops once the first
     set holds ``number_to_generate`` sentences.

     Every line of an accepted paradigm has its "sentence" passed through
     ``string_beautify``, is merged with the paradigm metadata, tagged with
     running "sentenceID" and "paradigmID" integers, and written immediately.

     :param number_to_generate: number of paradigms to generate
     :param rel_output_path: output file path relative to the project root
         (the parent of the directory containing this file)
     :param absolute_path: absolute path of output file, used only when
         rel_output_path is None
     :return: None
     :raises Exception: if neither output path is given, or once the number
         of failed samples reaches ``number_to_generate // 10``
     """
     if rel_output_path is not None:
         # dirname twice = parent of this file's directory, without relying
         # on "/" being the path separator.
         project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
         output_path = os.path.join(project_root, rel_output_path)
     elif absolute_path is not None:
         output_path = absolute_path
     else:
         raise Exception("You need to give an output path")
     past_sentences = [set() for _ in range(len(self.data_fields))]
     sentenceID = 0
     paradigmID = 0
     error_counter = 0
     error_limit = number_to_generate // 10
     # The context manager closes the output file even when the error limit
     # aborts generation.
     with open(output_path, "w") as output:
         constant_data = self.make_metadata_dict()
         self.make_logger(constant_data)
         # flush=True pushes every record to disk as soon as it is written.
         output_writer = jsonlines.Writer(output, flush=True)
         while len(past_sentences[0]) < number_to_generate:
             try:
                 new_data, track_sentence = self.sample()
                 # Indexing (rather than zip) keeps the IndexError if a sample
                 # returns more tracking sentences than there are sets.
                 overlap = any(sentence in past_sentences[i]
                               for i, sentence in enumerate(track_sentence))
                 if overlap:
                     continue
                 for i, sentence in enumerate(track_sentence):
                     past_sentences[i].add(sentence)
                 for line in new_data:
                     line["sentence"] = string_beautify(line["sentence"])
                     line.update(constant_data)
                     line["sentenceID"] = sentenceID
                     line["paradigmID"] = paradigmID
                     sentenceID += 1
                     output_writer.write(line)
                 paradigmID += 1
             except Exception as e:
                 self.log_exception(e)
                 print(self.get_stack_trace(e))
                 error_counter += 1
                 if error_counter >= error_limit:
                     raise Exception("Over 10% of samples result in errors. You should fix this.")
Пример #6
0
        V1 = conjugate(V1, DP1)
        V2 = conjugate(V2, DP1)

        Rel = choice(get_matched_by(DP1, "arg_1", get_all("category_2",
                                                          "rel")))

        sentence_1 = "%s %s %s %s %s %s." % (DP1[0], Rel[0], V1[0], Refl1[0],
                                             V2[0], DP2[0])
        sentence_2 = "%s %s %s %s %s %s." % (DP1[0], Rel[0], V1[0], Refl2[0],
                                             V2[0], DP2[0])
        sentence_3 = "%s %s %s %s %s %s." % (DP1[0], Rel[0], V1[0], DP2[0],
                                             V2[0], Refl1[0])
        sentence_4 = "%s %s %s %s %s %s." % (DP1[0], Rel[0], V1[0], DP2[0],
                                             V2[0], Refl2[0])

        sentence_1 = string_beautify(sentence_1)
        sentence_2 = string_beautify(sentence_2)
        sentence_3 = string_beautify(sentence_3)
        sentence_4 = string_beautify(sentence_4)

        if sentence_1 not in sentences:
            if writer == test_output:
                writer.write("%s\t%d\t\t%s\n" % (
                    "exp=reflexive-matrix_reflexive=0-matrix_antecedent=1-refl1=%s-refl2=%s-precede=1"
                    % (Refl1[0], Refl2[0]), 1, sentence_1))
                writer.write("%s\t%d\t\t%s\n" % (
                    "exp=reflexive-matrix_reflexive=0-matrix_antecedent=1-refl1=%s-refl2=%s-precede=0"
                    % (Refl1[0], Refl2[0]), 0, sentence_2))
                writer.write("%s\t%d\t\t%s\n" % (
                    "exp=reflexive-matrix_reflexive=1-matrix_antecedent=1-refl1=%s-refl2=%s-precede=1"
                    % (Refl1[0], Refl2[0]), 1, sentence_3))
Пример #7
0
 def generate_paradigm(self,
                       number_to_generate=10,
                       rel_output_path=None,
                       absolute_path=None):
     """
     Generate test and train splits for this paradigm as JSON-lines files.

     The output directory is created if needed. For each of "test.jsonl" and
     "train.jsonl", ``number_to_generate`` paradigms are sampled with
     ``self.sample()``. A sample is rejected if any of its tracking sentences
     was already used in the same position; the used-sentence sets are shared
     by both splits. Every line of an accepted paradigm has its "sentence"
     passed through ``string_beautify``, is merged with the paradigm metadata
     and tagged with "sentenceID", "paradigmID" and "split".

     Routing of lines, where a "mismatched control" is a line whose condition
     is "control" and whose linguistic and surface feature labels differ:

     * ``control_<split>.jsonl`` (only when ``self.control_paradigm`` is
       falsy) receives the mismatched controls;
     * ``test.jsonl`` receives every line that is not a mismatched control;
     * ``train.jsonl`` receives the lines whose condition is "training".

     ``LengthHelperError`` and ``MatchNotFoundError`` are logged but do not
     count towards the error limit.

     :param number_to_generate: number of paradigms to generate per split
     :param rel_output_path: output directory relative to the project root
         (the parent of the directory containing this file)
     :param absolute_path: absolute path of the output directory, used only
         when rel_output_path is None
     :return: None
     :raises Exception: if neither output path is given, or once the number
         of counted errors reaches ``number_to_generate // 10``
     """
     if rel_output_path is not None:
         # dirname twice = parent of this file's directory, without relying
         # on "/" being the path separator.
         project_root = os.path.dirname(
             os.path.dirname(os.path.abspath(__file__)))
         output_dir = os.path.join(project_root, rel_output_path)
     elif absolute_path is not None:
         output_dir = absolute_path
     else:
         raise Exception("You need to give an output path")
     # Also creates missing parent directories; an existing directory is fine.
     os.makedirs(output_dir, exist_ok=True)

     def is_mismatched_control(line):
         return (line["condition"] == "control"
                 and line["linguistic_feature_label"] !=
                 line["surface_feature_label"])

     past_sentences = [set() for _ in range(len(self.data_fields))]
     sentenceID = 0
     paradigmID = 0
     error_counter = 0
     error_limit = number_to_generate // 10
     constant_data = self.make_metadata_dict()
     self.make_logger(constant_data)
     for file_name in ("test.jsonl", "train.jsonl"):
         split_name = file_name.split(".")[0]
         output_file = open(os.path.join(output_dir, file_name), "w")
         output_control_file = None
         try:
             output_writer = jsonlines.Writer(output_file, flush=True)
             output_control_writer = None
             if not self.control_paradigm:
                 output_control_file = open(
                     os.path.join(output_dir, "control_" + file_name), "w")
                 output_control_writer = jsonlines.Writer(
                     output_control_file, flush=True)
             split_counter = 0
             while split_counter < number_to_generate:
                 try:
                     new_data, track_sentence = self.sample()
                     # Indexing (rather than zip) keeps the IndexError if a
                     # sample returns more tracking sentences than sets.
                     overlap = any(
                         sentence in past_sentences[i]
                         for i, sentence in enumerate(track_sentence))
                     if overlap:
                         continue
                     for i, sentence in enumerate(track_sentence):
                         past_sentences[i].add(sentence)
                     for line in new_data:
                         line["sentence"] = string_beautify(
                             line["sentence"])
                         line.update(constant_data)
                         line["sentenceID"] = sentenceID
                         line["paradigmID"] = paradigmID
                         line["split"] = split_name
                         sentenceID += 1
                     # Each selection is materialised before writing so a
                     # malformed line fails before any partial output.
                     if output_control_writer is not None:
                         control_lines = [
                             line for line in new_data
                             if is_mismatched_control(line)
                         ]
                         for line in control_lines:
                             output_control_writer.write(line)
                     if file_name == "test.jsonl":
                         split_lines = [
                             line for line in new_data
                             if not is_mismatched_control(line)
                         ]
                     else:
                         split_lines = [
                             line for line in new_data
                             if line["condition"] == "training"
                         ]
                     for line in split_lines:
                         output_writer.write(line)
                     split_counter += 1
                     paradigmID += 1
                 except Exception as e:
                     self.log_exception(e)
                     print(self.get_stack_trace(e))
                     if not isinstance(
                             e, (LengthHelperError, MatchNotFoundError)):
                         error_counter += 1
                         # Checked only for counted errors, so a tolerated
                         # exception can never trip the limit (it used to
                         # when the limit was 0).
                         if error_counter >= error_limit:
                             raise Exception(
                                 "Over 10% of samples result in errors. You should fix this."
                             )
         finally:
             # Release both handles for this split, even on abort.
             if output_control_file is not None:
                 output_control_file.close()
             output_file.close()
Пример #8
0
    sentence_4 = "%s %s who %s %s %s %s %s %s." % (
        D1_up[0], N1[0], V1[0], D2[0], N2[0], V2[0], any_decoy_N3[0], N3[0])

    # build sentences with DE quantifier
    sentence_5 = "%s %s who %s any %s %s %s %s." % (D1_down[0], N1[0], V1[0],
                                                    N2[0], V2[0], D3[0], N3[0])
    sentence_6 = "%s %s who %s %s %s %s %s %s." % (
        D1_down[0], N1[0], V1[0], any_decoy_N2[0], N2[0], V2[0], D3[0], N3[0])
    sentence_7 = "%s %s who %s %s %s %s any %s." % (D1_down[0], N1[0], V1[0],
                                                    D2[0], N2[0], V2[0], N3[0])
    sentence_8 = "%s %s who %s %s %s %s %s %s." % (
        D1_down[0], N1[0], V1[0], D2[0], N2[0], V2[0], any_decoy_N3[0], N3[0])

    # remove doubled up spaces (this is because the bare plural doesn't have a determiner,
    # but the code outputs a determiner with an empty string. might want to change this)
    sentence_1 = string_beautify(sentence_1)
    sentence_2 = string_beautify(sentence_2)
    sentence_3 = string_beautify(sentence_3)
    sentence_4 = string_beautify(sentence_4)
    sentence_5 = string_beautify(sentence_5)
    sentence_6 = string_beautify(sentence_6)
    sentence_7 = string_beautify(sentence_7)
    sentence_8 = string_beautify(sentence_8)

    # write sentences to output
    if sentence_1 not in sentences:
        # sentences 1-4 have quantifiers with UE restrictor
        output.write("%s\t%d\t\t%s\n" % (
            "experiment=NPI-env=quantifier-npi=any-crucial_item=%s-licensor=0-scope=1-npi_present=1"
            % D1_up[0], 0, sentence_1))
        output.write("%s\t%d\t\t%s\n" % (
Пример #9
0
    V1 = conjugate(V1, N1)

    N2 = N_to_DP_mutate(choice(get_matches_of(V1, "arg_2", all_nouns)))

    Rel = choice(get_matched_by(N1, "arg_1", get_all("category_2", "rel")))

    subject_agree_auxiliaries = get_matched_by(N1, "arg_1", all_auxiliaries)

    for Aux in subject_agree_auxiliaries:
        acceptability = 1 if is_match_disj(V2, Aux["arg_2"]) else 0


        sentence_1 = "%s %s %s %s %s %s %s?" % (Aux[0], N1[0], Rel[0], V1[0], N2[0], V2[0], N3[0])
        sentence_2 = "%s %s %s %s %s %s %s?" % (Aux[0], N1[0], V2[0], N3[0], Rel[0], V1[0], N2[0])

        sentence_1 = string_beautify(sentence_1)
        sentence_2 = string_beautify(sentence_2)

        writer = np.random.choice([train_output, dev_output, test_output], 1, p=[0.5, 0.25, 0.25])[0]
        # out_of_domain_writer = np.random.choice([dev_output, test_output], 1)[0] \
        #     if in_domain_writer == train_output \
        #     else in_domain_writer
        # paradigm_in_domain = 1 if in_domain_writer == train_output else 0


        if sentence_1 not in sentences:
            writer.write("%s\t%d\t\t%s\n" % ("exp=polar-src=1-highest=1-last=1-aux=%s" % Aux[0], acceptability, sentence_1))
            writer.write("%s\t%d\t\t%s\n" % ("exp=polar-src=1-highest=1-last=1-aux=%s" % Aux[0], acceptability, sentence_2))

            # writer.write("%s\t%d\t\t%s\n" % ("exp=polar-src=0-highest=1-last=0-aux=%s-aux2=%s-paradigm_in_domain=%d" % (Aux[0], paradigm_in_domain), 1, sentence_2))