Пример #1
0
def main(args):
    """Annotate passages from each input spec and write them to the spec's output directory.

    Annotation source precedence: a CoNLL-U file, then a UDPipe model, then a
    StanfordNLP pipeline; otherwise annotate_all's own pipeline does the work.
    """
    # Default to storing annotation in the "extra" dict when neither output
    # layout was requested explicitly.
    if not args.as_array and not args.as_extra:
        args.as_extra = True
    for spec in read_specs(args, converters=FROM_FORMAT_NO_PLACEHOLDERS):
        kwargs = dict(as_array=args.as_array,
                      as_extra=args.as_extra,
                      verbose=args.verbose,
                      lang=spec.lang)
        passages = spec.passages
        if spec.conllu:
            passages = copy_annotation(passages,
                                       spec.conllu,
                                       by_id=args.by_id,
                                       **kwargs)
        elif spec.udpipe:
            passages = annotate_udpipe(passages, spec.udpipe, **kwargs)
        elif spec.stanfordnlp:
            passages = annotate_stanfordnlp(passages, spec.stanfordnlp,
                                            **kwargs)
        # Show a progress bar only when not verbose, so per-passage output is
        # not interleaved with it.  replace= is True when annotation came from
        # a CoNLL-U file or from no external tool at all.
        for passage in annotate_all(passages if args.verbose else tqdm(
                passages, unit=" passages", desc="Annotating " + spec.out_dir),
                                    replace=spec.conllu
                                    or not (spec.udpipe or spec.stanfordnlp),
                                    **kwargs):
            # NOTE(review): AMR passages stored as arrays appear to need
            # placeholders re-introduced before writing -- confirm with the
            # semstr AMR converter.
            if passage.extra.get("format") == "amr" and args.as_array:
                from semstr.conversion.amr import AmrConverter
                AmrConverter.introduce_placeholders(passage)
            write_passage(passage,
                          outdir=spec.out_dir,
                          verbose=args.verbose,
                          binary=args.binary)
Пример #2
0
 def finish(self, status, display=True, write=False, accuracies=None):
     """Finalize parsing of the current item: flush the classifiers and
     optionally create, write, verify and evaluate the output passage.

     :param status: status string included in the displayed summary
     :param display: whether to print a one-line summary
     :param write: whether to write the output passage to disk
     :param accuracies: optional dict to record this passage's action accuracy, keyed by passage ID
     :return: tuple of (output passage,), plus a Scores object if self.evaluation is set
     """
     self.model.classifier.finished_item(self.training)
     for model in self.models[1:]:
         model.classifier.finished_item(renew=False)  # So that dynet.renew_cg happens only once
     # Materialize the output passage only when parsing (not training), or
     # when verification was explicitly requested.
     if not self.training or self.config.args.verify:
         self.out = self.state.create_passage(verify=self.config.args.verify, format=self.out_format)
     if write:
         for out_format in self.config.args.formats or [self.out_format]:
             # Normalization only applies to native UCCA output.
             if self.config.args.normalize and out_format == "ucca":
                 normalize(self.out)
             ioutil.write_passage(self.out, output_format=out_format, binary=out_format == "pickle",
                                  outdir=self.config.args.outdir, prefix=self.config.args.prefix,
                                  converter=get_output_converter(out_format), verbose=self.config.args.verbose,
                                  append=self.config.args.join, basename=self.config.args.join)
     if self.oracle and self.config.args.verify:
         self.verify(self.out, self.passage)
     ret = (self.out,)
     if self.evaluation:
         ret += (self.evaluate(self.evaluation),)
         status = "%-14s %s F1=%.3f" % (status, self.eval_type, self.f1)
     if display:
         self.config.print("%s%.3fs %s" % (self.accuracy_str, self.duration, status), level=1)
     if accuracies is not None:
         # Guard against division by zero when no actions were taken.
         accuracies[self.passage.ID] = self.correct_action_count / self.action_count if self.action_count else 0
     return ret
Пример #3
0
 def finish(self, status, display=True, write=False, accuracies=None):
     """Wrap up the current parse: close out the classifiers, then create,
     write, verify and evaluate the output passage as configured.

     :param status: status string shown in the summary line
     :param display: print a one-line summary when True
     :param write: write the output passage to disk when True
     :param accuracies: optional dict receiving passage ID -> action accuracy
     :return: (output passage,) tuple, extended with evaluation scores if evaluating
     """
     self.model.classifier.finished_item(self.training)
     for model in self.models[1:]:
         model.classifier.finished_item(renew=False)  # So that dynet.renew_cg happens only once
     # Only build the passage when parsing, or when verification is on.
     if not self.training or self.config.args.verify:
         self.out = self.state.create_passage(verify=self.config.args.verify, format=self.out_format)
     if write:
         for out_format in self.config.args.formats or [self.out_format]:
             # Normalization is meaningful only for UCCA output.
             if self.config.args.normalize and out_format == "ucca":
                 normalize(self.out)
             ioutil.write_passage(self.out, output_format=out_format, binary=out_format == "pickle",
                                  outdir=self.config.args.outdir, prefix=self.config.args.prefix,
                                  converter=get_output_converter(out_format), verbose=self.config.args.verbose,
                                  append=self.config.args.join, basename=self.config.args.join)
     if self.oracle and self.config.args.verify:
         self.verify(self.out, self.passage)
     ret = (self.out,)
     if self.evaluation:
         ret += (self.evaluate(self.evaluation),)
         status = "%-14s %s F1=%.3f" % (status, self.eval_type, self.f1)
     if display:
         self.config.print("%s%.3fs %s" % (self.accuracy_str, self.duration, status), level=1)
     if accuracies is not None:
         # Avoid ZeroDivisionError when no actions were recorded.
         accuracies[self.passage.ID] = self.correct_action_count / self.action_count if self.action_count else 0
     return ret
Пример #4
0
 def download_task(self, task_id, normalize=False, write=True, validate=None, binary=None, log=None, out_dir=None,
                   prefix=None, by_external_id=False, verbose=False, write_valid_only=False, **kwargs):
     """Download one annotation task, convert it to a passage, and optionally
     normalize, validate, log and write it.

     :param task_id: ID of the task to download
     :param normalize: run normalization on the converted passage
     :param write: write the passage to disk (skipped on the first validation
         error when write_valid_only is set)
     :param validate: optional file handle; validation errors are printed to it
     :param log: optional file handle; a tab-separated metadata line is printed to it
     :param by_external_id: passed through to from_json
     :param write_valid_only: return early (without writing) as soon as a
         validation error is found
     :return: tuple of (passage, task_id, user_id)
     :raises ValueError: if JSON conversion or normalization fails
     """
     del kwargs  # ignore unrecognized keyword arguments
     task = self.get_user_task(task_id)
     user_id = task["user"]["id"]
     try:
         passage = from_json(task, by_external_id=by_external_id)
     except ValueError as e:
         # Include the raw task JSON to make the failure diagnosable.
         raise ValueError("Failed reading json for task %s:\n%s" % (task_id, json.dumps(task))) from e
     if normalize:
         try:
             normalization.normalize(passage)
         except AssertionError as e:
             raise ValueError("Failed normalizing task %s:\n%s" % (task_id, json.dumps(task))) from e
     if log:
         print(passage.ID, task_id, user_id, task["user_comment"], task["created_at"], task["updated_at"],
               file=log, sep="\t", flush=True)
     ret = passage, task_id, user_id
     if validate or write_valid_only:
         for error in validation.validate(passage, linkage=False):
             if validate:
                 print(passage.ID, task_id, user_id, error, file=validate, sep="\t", flush=True)
             if write_valid_only:
                 # First error found: bail out before writing.
                 return ret
     if write:
         write_passage(passage, binary=binary, outdir=out_dir, prefix=prefix, verbose=verbose)
     return ret
Пример #5
0
def main(args):
    """Download passages listed in an ID file and write them as passage files.

    Each line of args.filename holds a passage ID and a lookup ID, separated
    by whitespace.
    """
    os.makedirs(args.outdir, exist_ok=True)
    with open(args.filename, encoding="utf-8") as f:
        t = list(map(str.split, f))
        # Progress bar only when not verbose, to keep output readable.
        if not args.verbose:
            t = tqdm(t, desc="Downloading", unit=" passages")
        for passage_id, id_field in t:
            if not args.verbose:
                t.set_postfix({
                    "passage_id": passage_id,
                    args.method: id_field
                })
            if args.verbose:
                with external_write_mode():
                    print("Getting passage " + passage_id + " with " +
                          args.method + "=" + id_field,
                          end="\t")
            # The lookup field may hold several comma-separated IDs.
            xml_root = get_by_method(id_field=id_field.split(","),
                                     passage_id=passage_id,
                                     **vars(args))
            if xml_root is None:
                continue
            if args.write_site:
                # NOTE(review): the raw site XML is written to the current
                # working directory, not args.outdir -- confirm intentional.
                site_filename = passage_id + "_site_download.xml"
                with open(site_filename, "w", encoding="utf-8") as fsite:
                    print(tostring(xml_root).decode(), file=fsite)
                if args.verbose:
                    with external_write_mode():
                        print("Wrote '%s'" % site_filename)
            if args.write:
                write_passage(convert.from_site(xml_root),
                              outdir=args.outdir,
                              verbose=args.verbose)
Пример #6
0
def main(filename, input_filenames, outdir):
    """Rename passages according to a mapping file and write them to outdir.

    Each line of the mapping file holds "<new_id> <old_id>".
    """
    os.makedirs(outdir, exist_ok=True)
    with open(filename, encoding="utf-8") as mapping_file:
        old_to_new_id = {}
        for line in mapping_file:
            new_id, old_id = line.strip().split()
            old_to_new_id[old_id] = new_id
    for passage in get_passages_with_progress_bar(input_filenames, desc="Renaming"):
        # Passage IDs have no public setter; assign the private field directly.
        passage._ID = old_to_new_id[passage.ID]
        write_passage(passage, outdir=outdir, verbose=False)
Пример #7
0
def main(args):
    """Annotate every passage, verify it was annotated, and write it out."""
    passages = get_passages_with_progress_bar(args.filenames, desc="Annotating")
    for passage in annotate_all(passages, replace=True,
                                as_array=args.as_array, verbose=args.verbose):
        assert is_annotated(passage, args.as_array), \
            "Passage %s is not annotated" % passage.ID
        write_passage(passage, outdir=args.out_dir, verbose=args.verbose)
Пример #8
0
def main(filename, input_filenames, outdir):
    """Rename passages per a "<new_id> <old_id>" mapping file, writing results to outdir."""
    os.makedirs(outdir, exist_ok=True)
    with open(filename, encoding="utf-8") as f:
        id_pairs = (line.strip().split() for line in f)
        old_to_new_id = dict((old, new) for new, old in id_pairs)
    renamed = get_passages_with_progress_bar(input_filenames, desc="Renaming")
    for passage in renamed:
        # No public setter for the ID, so write the private attribute.
        passage._ID = old_to_new_id[passage.ID]
        write_passage(passage, outdir=outdir, verbose=False)
def main(args):
    """Convert passages, appending each applied rule to a CSV report file."""
    os.makedirs(args.outdir, exist_ok=True)
    with open(args.outfile, "w", encoding="utf-8", newline="") as report_file:
        report = csv.writer(report_file)
        report.writerow(("rule", "passage", "terminal", "before", "after"))
        for passage in get_passages_with_progress_bar(args.passages, desc="Converting"):
            # Prefer the passage's own language attribute over the CLI default.
            lang = passage.attrib.get("lang", args.lang)
            convert_passage(passage, lang=lang, report_writer=report)
            write_passage(passage, outdir=args.outdir, prefix=args.prefix, verbose=args.verbose)
            report_file.flush()  # keep the report current even on a later crash
    print("Wrote '%s'" % args.outfile)
Пример #10
0
def main(args):
    """Convert passages, logging each applied edge rule to a CSV report file."""
    os.makedirs(args.outdir, exist_ok=True)
    with open(args.outfile, "w", encoding="utf-8", newline="") as report_file:
        report = csv.writer(report_file)
        report.writerow(("rule", "passage", "edge", "before", "after"))
        for passage in get_passages_with_progress_bar(args.passages, desc="Converting"):
            convert_passage(passage, report_writer=report)
            write_passage(passage, outdir=args.outdir, prefix=args.prefix, verbose=args.verbose)
            report_file.flush()  # flush per passage so the report survives a crash
    print("Wrote '%s'" % args.outfile)
Пример #11
0
def main(args):
    """Fix passage tokenization against a word list, logging changes to a CSV file."""
    os.makedirs(args.outdir, exist_ok=True)
    words_set = read_dict(args.words_set)
    with open(args.logfile, "w", newline="", encoding="utf-8") as log_file:
        log_writer = csv.writer(log_file)
        for passage in get_passages_with_progress_bar(args.filenames, "Fixing tokenization"):
            fixed = fix_tokenization(passage, words_set, lang=args.lang, cw=log_writer)
            if fixed is None:
                continue  # nothing changed: skip normalization and writing
            log_file.flush()
            normalize(fixed)
            write_passage(fixed, outdir=args.outdir, binary=args.binary,
                          prefix=args.prefix, verbose=args.verbose)
Пример #12
0
def main(args):
    """Stamp every passage of each spec with the spec's language and write it out."""
    for passages, out_dir, lang in read_specs(args):
        progress = tqdm(passages,
                        unit=" passages",
                        desc="Setting language in " + out_dir,
                        postfix={"lang": lang})
        for passage in progress:
            passage.attrib["lang"] = lang
            write_passage(passage, outdir=out_dir, verbose=False, binary=args.binary)
Пример #13
0
def main(args):
    """Annotate passages from each spec, preferring UDPipe, then a CoNLL-U file."""
    for spec in read_specs(args, converters=FROM_FORMAT):
        if spec.udpipe:
            spec.passages = annotate_udpipe(spec.passages, spec.udpipe, as_array=args.as_array, verbose=args.verbose)
        elif spec.conllu:
            spec.passages = copy_annotation(spec.passages, spec.conllu, as_array=args.as_array, verbose=args.verbose)
        # Progress bar only when not verbose; replace=not spec.udpipe keeps
        # freshly-applied UDPipe annotation instead of overwriting it.
        for passage in annotate_all(spec.passages if args.verbose else
                                    tqdm(spec.passages, unit=" passages", desc="Annotating " + spec.out_dir),
                                    as_array=args.as_array, replace=not spec.udpipe, lang=spec.lang,
                                    verbose=args.verbose):
            write_passage(passage, outdir=spec.out_dir, verbose=args.verbose, binary=args.binary)
Пример #14
0
def main(args):
    """Annotate passages from each spec, replacing any existing annotation."""
    for passages, out_dir, lang in read_specs(args):
        # Progress bar only when not verbose, to keep output readable.
        if args.verbose:
            source = passages
        else:
            source = tqdm(passages, unit=" passages",
                          desc="Annotating " + out_dir)
        for passage in annotate_all(source, as_array=args.as_array,
                                    replace=True, lang=lang,
                                    verbose=args.verbose):
            write_passage(passage, outdir=out_dir,
                          verbose=args.verbose, binary=args.binary)
Пример #15
0
def main(args):
    """Normalize passages and write them out."""
    if args.outdir:
        os.makedirs(args.outdir, exist_ok=True)
    passages = get_passages_with_progress_bar(args.filenames,
                                              desc="Normalizing",
                                              converters={})
    for passage in passages:
        normalize(passage, extra=args.extra)
        write_passage(passage, outdir=args.outdir, prefix=args.prefix,
                      binary=args.binary, verbose=False)
Пример #16
0
def train_test(train_passages,
               dev_passages,
               test_passages,
               args,
               model_suffix=""):
    """
    Train and test parser on given passage
    :param train_passages: passage to train on
    :param dev_passages: passages to evaluate on every iteration
    :param test_passages: passages to test on after training
    :param args: extra argument
    :param model_suffix: string to append to model filename before file extension
    :return: generator of Scores objects: dev scores for each training iteration (if given dev), and finally test scores
    """
    # Split off the extension so the suffix can be inserted before it.
    model_base, model_ext = os.path.splitext(
        args.model or "%s_%s" % (args.format or "ucca", args.classifier))
    p = Parser(model_file=model_base + model_suffix + model_ext,
               model_type=args.classifier,
               beam=args.beam)
    print("%s %s" % (os.path.basename(__file__), Config()))
    # Forward per-iteration dev scores, dropping None placeholders.
    yield from filter(
        None,
        p.train(train_passages, dev=dev_passages, iterations=args.iterations))
    if test_passages:
        if args.train or args.folds:
            print("Evaluating on test passages")
        passage_scores = []
        # Evaluate when requested explicitly, or whenever we actually trained.
        evaluate = args.evaluate or train_passages
        for result in p.parse(test_passages, evaluate=evaluate):
            # With evaluation on, parse yields (passage, score) pairs;
            # otherwise it yields bare passages.
            if evaluate:
                guessed_passage, score = result
                passage_scores.append(score)
            else:
                guessed_passage = result
                print()
            if guessed_passage is not None and args.write:
                ioutil.write_passage(guessed_passage,
                                     output_format=args.output_format,
                                     binary=args.output_format == "pickle",
                                     outdir=args.outdir,
                                     prefix=args.prefix,
                                     converter=TO_FORMAT.get(
                                         args.output_format,
                                         Config().output_converter or to_text))
        if passage_scores:
            scores = Scores(passage_scores)
            # Print aggregates unless verbose single-passage output already
            # covered it.
            if args.verbose <= 1 or len(passage_scores) > 1:
                print("\nAverage labeled F1 score on test: %.3f" %
                      scores.average_f1())
                print("Aggregated scores:")
                scores.print()
            print_scores(scores, args.testscores)
            yield scores
Пример #17
0
def main(args):
    """Annotate and convert passages, writing a CSV report of applied rules."""
    textutil.BATCH_SIZE = 1  # annotate one passage at a time
    os.makedirs(args.outdir, exist_ok=True)
    with open(args.outfile, "w", encoding="utf-8", newline="") as report_file:
        report = csv.writer(report_file)
        report.writerow(("rule", "passage", "terminal", "pos", "before", "after"))
        passages = get_passages_with_progress_bar(args.passages, desc="Converting")
        for passage in annotate_all(passages, verbose=args.verbose):
            convert_passage(passage, report_writer=report)
            write_passage(passage, outdir=args.outdir, prefix=args.prefix, verbose=args.verbose)
            report_file.flush()  # keep report on disk current per passage
    print("Wrote '%s'" % args.outfile)
Пример #18
0
def main(args):
    """Convert pickled site-format files to passages, reporting failures at the end."""
    os.makedirs(args.out_dir, exist_ok=True)
    exceptions = []
    for pattern in args.filenames:
        # When the glob matches nothing, fall back to the literal pattern.
        filenames = sorted(glob(pattern)) or [pattern]
        for filename in filenames:
            print("Reading '%s'..." % filename)
            try:
                passage = pickle_site2passage(filename)
                write_passage(passage, outdir=args.out_dir, binary=args.binary,
                              basename=os.path.basename(filename))
            except ValueError as e:
                exceptions.append((filename, e))  # collect and keep going
    if exceptions:
        for filename, e in exceptions:
            print("'%s': %s" % (filename, e))
Пример #19
0
def main(args):
    """Create one passage per input line, with a terminal for each token."""
    lines = tqdm(gen_lines(args.filenames), unit=" lines", desc="Creating passages")
    for i, line in enumerate(lines, start=1):
        passage = core.Passage(args.format % i)
        terminals = layer0.Layer0(passage)
        layer1.Layer1(passage)  # foundational layer must exist, even if empty
        for token in line.split():
            terminals.add_terminal(text=token,
                                   punct=PUNCTUATION.issuperset(token))
        write_passage(passage, outdir=args.out_dir, binary=args.binary,
                      verbose=False)
Пример #20
0
def train_test(train_passages,
               dev_passages,
               test_passages,
               args,
               model_suffix=""):
    """
    Train and test parser on given passage
    :param train_passages: passage to train on
    :param dev_passages: passages to evaluate on every iteration
    :param test_passages: passages to test on after training
    :param args: extra argument
    :param model_suffix: string to append to model filename before file extension
    :return: pair of (test scores, list of dev scores per iteration) where each one is a Scores object
    """
    test_scores = None
    # Insert the suffix before the model file extension.
    model_base, model_ext = os.path.splitext(args.model
                                             or "ucca_" + args.classifier)
    p = Parser(model_file=model_base + model_suffix + model_ext,
               model_type=args.classifier,
               beam=args.beam)
    p.train(train_passages, dev=dev_passages, iterations=args.iterations)
    if test_passages:
        if args.train or args.folds:
            print("Evaluating on test passages")
        passage_scores = []
        # Evaluate if explicitly requested, or whenever training data exists.
        evaluate = args.evaluate or train_passages
        for result in p.parse(test_passages, evaluate=evaluate):
            # parse yields (passage, score) pairs when evaluating, else
            # bare passages.
            if evaluate:
                guessed_passage, score = result
                passage_scores.append(score)
            else:
                guessed_passage = result
                print()
            if guessed_passage is not None and not args.no_write:
                ioutil.write_passage(guessed_passage,
                                     output_format=args.format,
                                     binary=args.binary,
                                     outdir=args.outdir,
                                     prefix=args.prefix)
        # Aggregate only when non-verbose, or when there is more than one
        # score to aggregate.
        if passage_scores and (not args.verbose or len(passage_scores) > 1):
            test_scores = evaluation.Scores.aggregate(passage_scores)
            print("\nAverage labeled F1 score on test: %.3f" %
                  test_scores.average_f1())
            print("Aggregated scores:")
            test_scores.print()
            if Config().args.testscores:
                # Append a CSV line of the aggregated score fields.
                with open(Config().args.testscores, "a") as f:
                    print(",".join(test_scores.fields()), file=f)
    return test_scores, p.dev_scores
Пример #21
0
 def download_task(self, task_id, write=True, out_format=None, binary=None,
                   out_dir=None, prefix=None, **kwargs):
     """Fetch one annotation task and convert it to a passage.

     :param task_id: ID of the task to download
     :param write: whether to write the converted passage to disk
     :return: the converted passage
     """
     del kwargs  # ignore unrecognized keyword arguments
     task = self.get_user_task(task_id)
     passage = from_json(task, all_categories=self.layer["categories"])
     if write:
         converter = TO_FORMAT.get(out_format)
         write_passage(passage, out_format, binary, out_dir, prefix, converter)
     return passage
Пример #22
0
def main(args):
    """Fix tokenization of passages against a word list, logging every change to a CSV file."""
    os.makedirs(args.outdir, exist_ok=True)
    words_set = read_dict(args.words_set)
    with open(args.logfile, "w", newline="", encoding="utf-8") as log_file:
        log_writer = csv.writer(log_file)
        passages = get_passages_with_progress_bar(args.filenames,
                                                  "Fixing tokenization")
        for passage in passages:
            fixed = fix_tokenization(passage, words_set, lang=args.lang,
                                     cw=log_writer)
            if fixed is not None:
                log_file.flush()  # persist the log entries for this passage
                normalize(fixed)
                write_passage(fixed, outdir=args.outdir, binary=args.binary,
                              prefix=args.prefix, verbose=args.verbose)
Пример #23
0
def main(args):
    """Convert pickled site-format files to passages and write them out.

    Conversion failures are collected and reported together at the end
    instead of aborting on the first bad file.
    """
    os.makedirs(args.out_dir, exist_ok=True)
    exceptions = []
    for pattern in args.filenames:
        # sorted() makes the processing order deterministic -- bare glob()
        # order is filesystem-dependent -- and matches the sibling
        # implementation of this script elsewhere in the codebase.
        # Fall back to the literal pattern when it matches no files.
        for filename in sorted(glob(pattern)) or [pattern]:
            print("Reading '%s'..." % filename)
            try:
                passage = pickle_site2passage(filename)
                write_passage(passage,
                              outdir=args.out_dir,
                              binary=args.binary,
                              basename=os.path.basename(filename))
            except ValueError as e:
                exceptions.append((filename, e))
    if exceptions:
        for filename, e in exceptions:
            print("'%s': %s" % (filename, e))
Пример #24
0
 def download_task(self, task_id, write=True, out_format=None, binary=None, out_dir=None, prefix=None, **kwargs):
     """Fetch one annotation task, convert it to a passage, and optionally write it to disk."""
     del kwargs  # unused extra keyword arguments
     json_task = self.get_user_task(task_id)
     passage = from_json(json_task, all_categories=self.layer["categories"])
     if write:
         write_passage(passage, out_format, binary, out_dir, prefix, TO_FORMAT.get(out_format))
     return passage
Пример #25
0
def main(args):
    """Convert site-format XML files, or DB-stored passages, and write them out."""
    os.makedirs(args.out_dir, exist_ok=True)
    # Source selection: when filenames are given, expand each glob pattern
    # (sorted; falling back to the literal pattern on no match) and convert
    # each file; otherwise fetch each passage ID from the SQLite DB for the
    # given user.
    for filename, passage in ((filename, site2passage(filename)) for pattern in args.filenames
                              for filename in sorted(glob(pattern)) or [pattern]) if args.filenames \
            else ((pid, db2passage(sqlite3.connect(args.db).cursor(), pid, args.user)) for pid in args.pids):
        write_passage(passage, outdir=args.out_dir, binary=args.binary)
Пример #26
0
def main(args):
    """Annotate all passages, asserting each one ends up annotated, then write them."""
    annotated = annotate_all(
        get_passages_with_progress_bar(args.filenames, desc="Annotating"),
        replace=True, as_array=args.as_array, verbose=args.verbose)
    for passage in annotated:
        assert is_annotated(passage, args.as_array), "Passage %s is not annotated" % passage.ID
        write_passage(passage, outdir=args.out_dir, verbose=args.verbose)
Пример #27
0
def get_validation_accuracy(val_text_tensor, model, a_model, label_model, s_model, rm_model, rm_lstm_model,
                            val_text, val_passages,
                            val_pos, val_pos_tensor, labels, label2index, val_ent, val_ent_tensor,
                            val_case_tensor, unroll, eval_type="unlabeled",
                            testing=False, testing_phase=False):
    """Run the models over the validation set and compute F1 scores.

    :param testing_phase: if True, only write predicted passages to
        "pred_test/" and return dummy scores (100, 100, 100, 100)
    :param eval_type: evaluation type passed to get_score
    :return: tuple of (labeled F1, unlabeled F1, labeled remote F1,
        unlabeled remote F1)
    """
    # Each total is a (matched, guessed, reference)-style count triple
    # accumulated over all sentences.
    # NOTE(review): exact triple semantics depend on get_score -- confirm.
    total_labeled = (0, 0, 0)
    total_unlabeled = (0, 0, 0)
    total_labeled_remote = (0, 0, 0)
    total_unlabeled_remote = (0, 0, 0)

    # NOTE(review): initialized to 10, so the "< 10" write-out branch below
    # is dead code; initializing to 0 would write out the first 10 predicted
    # passages -- confirm whether this was deliberately disabled.
    top_10_to_writeout = 10

    # Debugging switches to evaluate only a slice of the sentences.
    debugging_remote = 0
    debugging_remote_min = 0
    debugging_remote_max = -1
    if debugging_remote_max > -1 or debugging_remote_min > 0:
        print("WARNING: Only test on part of sents")

    for sent_tensor, ori_sent, tgt_passage, pos, pos_tensor, ent, ent_tensor, case_tensor in \
            zip(val_text_tensor, val_text, val_passages, val_pos, val_pos_tensor, val_ent, val_ent_tensor,
                val_case_tensor):
        # if len(ori_sent) > 70:
        #     print("sent %s is too long with %d words" % (tgt_passage.ID, len(ori_sent)))
        # try:

        # print(tgt_passage.ID)
        # print(tgt_passage)

        debugging_remote += 1
        if debugging_remote < debugging_remote_min:
            continue
        if debugging_remote == debugging_remote_max:
            break
        # print(tgt_passage.ID)
        # Inference only: no gradients needed.
        with torch.no_grad():
            # try:
            pred_passage = evaluate_with_label(sent_tensor, model, a_model, label_model, s_model, rm_model,
                                               rm_lstm_model, ori_sent,
                                               tgt_passage, pos, pos_tensor, labels, label2index, ent,
                                               ent_tensor, case_tensor, unroll)
            # except Exception as e:
            #     print(e)
            #     print(tgt_passage.ID)
            #     assert False

        if testing_phase:
            # Test phase: just dump predictions, no scoring.
            ioutil.write_passage(pred_passage, outdir="pred_test/")
        else:
            labeled, unlabeled, labeled_remote, unlabeled_remote = get_score(pred_passage,
                                                                             tgt_passage, testing, eval_type)

            # Element-wise accumulation of the count triples.
            total_labeled = tuple(map(operator.add, total_labeled, labeled))
            total_unlabeled = tuple(map(operator.add, total_unlabeled, unlabeled))
            total_labeled_remote = tuple(map(operator.add, total_labeled_remote, labeled_remote))
            total_unlabeled_remote = tuple(map(operator.add, total_unlabeled_remote, unlabeled_remote))

            if top_10_to_writeout < 10:
                ioutil.write_passage(pred_passage)
                top_10_to_writeout += 1

        # except Exception as e:
        #     print("Error: %s in passage: %s" % (e, tgt_passage.ID))

    if testing_phase:
        return 100, 100, 100, 100

    labeled_f1 = calculate_f1(total_labeled[0], total_labeled[1], total_labeled[2])
    unlabeled_f1 = calculate_f1(total_unlabeled[0], total_unlabeled[1], total_unlabeled[2])
    labeled_f1_remote = calculate_f1(total_labeled_remote[0], total_labeled_remote[1], total_labeled_remote[2])
    unlabeled_f1_remote = calculate_f1(total_unlabeled_remote[0], total_unlabeled_remote[1],
                                       total_unlabeled_remote[2])

    return labeled_f1, unlabeled_f1, labeled_f1_remote, unlabeled_f1_remote
Пример #28
0
def main(args):
    """Stamp each passage with its spec's language and write it out."""
    for spec in read_specs(args, converters=FROM_FORMAT):
        progress = tqdm(spec.passages, unit=" passages",
                        desc="Setting language in " + spec.out_dir,
                        postfix={"lang": spec.lang})
        for passage in progress:
            passage.attrib["lang"] = spec.lang
            write_passage(passage, outdir=spec.out_dir, verbose=False,
                          binary=args.binary)