Example #1
 def upload_passage(self, external_id, tokens):
     assert external_id, "Missing external ID for passage %s" % tokens
     assert tokens, "Empty passage %s" % external_id
     passage_out = self.create_passage(text=" ".join(tokens),
                                       external_id=external_id,
                                       type="PUBLIC",
                                       source=self.source)
     task_in = dict(type="TOKENIZATION",
                    status="SUBMITTED",
                    project=self.project,
                    user=self.user,
                    passage=passage_out,
                    manager_comment="External ID: " + external_id,
                    user_comment="",
                    parent=None,
                    is_demo=False,
                    is_active=True)
     tok_task_out = self.create_tokenization_task(**task_in)
     tok_user_task_in = dict(tok_task_out)
     passage = list(from_text(tokens, tokenized=True))[0]
     tok_user_task_in.update(
         to_json(passage, return_dict=True, tok_task=True))
     self.submit_tokenization_task(**tok_user_task_in)
     task_in = dict(type="ANNOTATION",
                    status="NOT_STARTED",
                    project=self.project,
                    user=self.annotation_user,
                    passage=tok_task_out["passage"],
                    manager_comment="External ID: " + external_id,
                    user_comment="",
                    parent=tok_task_out,
                    is_demo=False,
                    is_active=True)
     self.create_annotation_task(**task_in)
     print("Uploaded passage " + external_id + " successfully")
Example #2
File: USim_corpus.py Project: iYUYUE/USim
def _ucca_parse_text(text, output_dir, filename, clean, normalize_sentence,
                     model_path):
    # text = [normalize_sentence(x) for x in text]
    # text = from_text(text, split=True, one_per_line=True)
    # text = list(text)
    # Bypass the UCCA tokenizer: pre-split on spaces and pass tokenized=True
    text = [
        next(
            from_text(normalize_sentence(val).split(' '),
                      passage_id=idx,
                      tokenized=True)) for idx, val in enumerate(text)
    ]
    # print(text)
    parser = get_parser(model_path)
    out_location = os.path.dirname(parse_location(output_dir, filename, 0))
    if not os.path.isdir(out_location):
        os.makedirs(out_location)
    for i, (passage, *_) in enumerate(parser.parse(text)):
        passage2file(passage, parse_location(output_dir, filename, i))
    # create an empty file announcing that parsing finished successfully
    parsed_file = os.path.join(out_location, PARSED_FILE)
    with open(parsed_file, "w") as _:
        pass
    if clean:
        # remove the intermediate .txt files from the output directory
        for item in os.listdir(output_dir):
            if item.endswith(".txt"):
                os.remove(os.path.join(output_dir, item))
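The commented-out lines above show the original path through UCCA's own tokenizer; the active code bypasses it by pre-splitting on spaces and passing tokenized=True. A minimal sketch of that pattern in isolation, assuming only that the ucca package is installed (the input sentence is illustrative):

    from ucca import layer0
    from ucca.convert import from_text

    tokens = "This text is already tokenized .".split(" ")  # illustrative input
    # tokenized=True makes from_text trust the given tokens as-is;
    # from_text returns a generator, so take its single passage with next().
    passage = next(from_text(tokens, passage_id="1", tokenized=True))
    print([terminal.text for terminal in passage.layer(layer0.LAYER_ID).all])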
Example #3
    def parse_sentence(self, sentence):

        reg_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w', encoding='UTF-8')
        parsed_passage = None

        try:
            TupaParser.__passage_counter += 1
            passage_id = TupaParser.__passage_counter

            # from_text will convert the sentence into a ucca structure.
            # annotate_all will annotate the structure with information from the Spacy parse.
            # annotate_all returns a generator - one that will yield only one object - hence
            # we call next
            unparsed_passage = next(
                annotate_all(from_text(sentence, passage_id,
                                       one_per_line=True)))

            # The parse method of tupa's Parser class expects a list of unparsed passages. We also need to set
            # the 'evaluate' argument to True, otherwise we get incorrect results (Ofir Arviv advised as such).
            # The parse method also returns a generator, hence the need to call next.
            # The actual object returned is a tuple of the parsed-passage and an internal score object. We're
            # not interested in the score though, so we just extract the parsed-passage
            parsed_passage_and_score = next(
                self.__parser.parse([unparsed_passage], evaluate=True))
            internal_parsed_passage = parsed_passage_and_score[0]
            parsed_passage = TupaParser.__get_ucca_parsed_passage_from_passage(
                internal_parsed_passage)

        except Exception:
            # Swallow parse errors; parsed_passage stays None.
            pass
        finally:
            sys.stdout = reg_stdout

        return parsed_passage
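Saving and restoring sys.stdout by hand, as above, is easy to get wrong; the standard library's contextlib.redirect_stdout achieves the same silencing with automatic cleanup. A minimal, library-agnostic sketch (quiet_call is an illustrative helper, not part of tupa):

    import contextlib
    import os

    def quiet_call(func, *args, **kwargs):
        # Send anything func prints to the null device; stdout is restored
        # when the with-block exits, even if func raises.
        with open(os.devnull, "w", encoding="utf-8") as devnull:
            with contextlib.redirect_stdout(devnull):
                return func(*args, **kwargs)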
Example #4
File: uccamt.py Project: gyn0806/SWSS
def ucca_parse_sentences(sentences,
                         model_path='models/ucca-bilstm',
                         model=None,
                         lang='en',
                         to_save=True):
    get_parsed_sent()
    sentences = [
        normalize_sentence(sentence, lang=lang) for sentence in sentences
    ]
    to_parse = []
    # check the preprocessed pickle file to see if any update is needed
    for i in range(len(sentences)):
        if sentences[i] in PARSED_SENT:
            sentences[i] = PARSED_SENT[sentences[i]]
        elif len(sentences[i].strip()) == 0:
            sentences[i] = NoSentence()
        else:
            to_parse.append((i, sentences[i]))
    if len(to_parse) > 0:
        print("Parsing", len(to_parse), "sentences.",
              len(sentences) - len(to_parse), "sentences already parsed.")
        if model is None:
            parser = get_parser(model_path)
        else:
            parser = model
        ids, text = zip(*to_parse)
        text = list(from_text(text, split=True, one_per_line=True, lang=lang))
        for i, (passage, *_) in enumerate(parser.parse(text)):
            PARSED_SENT[sentences[ids[i]]] = passage
            sentences[ids[i]] = passage
        if to_save:
            save_parsed_sent()
    return sentences
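get_parsed_sent and save_parsed_sent are project helpers not shown in this snippet; judging by their use, they load and persist the PARSED_SENT cache between runs. A plausible sketch under that assumption (the pickle path is hypothetical):

    import os
    import pickle

    PARSED_SENT = {}
    _CACHE_PATH = "parsed_sent.pkl"  # hypothetical cache location

    def get_parsed_sent():
        # Load the sentence -> passage cache left by a previous run, if any.
        global PARSED_SENT
        if os.path.exists(_CACHE_PATH):
            with open(_CACHE_PATH, "rb") as f:
                PARSED_SENT = pickle.load(f)

    def save_parsed_sent():
        # Persist the cache so already-parsed sentences are skipped next time.
        with open(_CACHE_PATH, "wb") as f:
            pickle.dump(PARSED_SENT, f)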
Example #5
def parse():
    text = request.values["input"]
    print("Parsing text: '%s'" % text)
    in_passage = next(from_text(text))
    out_passage = next(get_parser().parse(in_passage))[0]
    root = to_standard(out_passage)
    xml = tostring(root).decode()
    return Response(indent_xml(xml), headers={"Content-Type": "application/xml"})
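The route decorator is not included in the snippet, so the endpoint's path is unknown; assuming it is mounted at /parse on a local Flask server, a client call might look like this (URL and input are hypothetical):

    import requests

    resp = requests.post("http://localhost:5000/parse", data={"input": "Hello world"})
    print(resp.headers["Content-Type"])  # application/xml
    print(resp.text[:200])               # start of the indented UCCA XML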
Example #6
File: amr.py Project: zoharai/semstr
 def _init_passages(self, amrs):
     for lines, amr_id, tokens in amrs:
         assert tokens is not None, "Cannot convert AMR without input tokens: %s" % lines
         amr = parse(" ".join(lines), tokens=tokens)
         amr_id = amr_id or self.passage_id
         passage = next(convert.from_text(tokens, amr_id, tokenized=True))
         passage.extra["format"] = "amr"
         yield passage, amr, amr_id
Example #7
 def test_from_text(self):
     sample = ['Hello . again', 'nice', ' ?! end', '']
     passage = convert.from_text(sample)
     terms = passage.layer(layer0.LAYER_ID).all
     pos = 0
     for i, par in enumerate(sample):
         for text in par.split():
             self.assertTrue(terms[pos].text == text and
                             terms[pos].paragraph == i + 1)
             pos += 1
Example #8
File: test_ucca.py Project: macleginn/ucca
 def test_from_text(self):
     sample = ["Hello . again", "nice", " ? ! end", ""]
     passage = next(convert.from_text(sample))
     terms = passage.layer(layer0.LAYER_ID).all
     pos = 0
     for i, par in enumerate(sample):
         for text in par.split():
             self.assertEqual(terms[pos].text, text)
             self.assertEqual(terms[pos].paragraph, i + 1)
             pos += 1
Example #9
 def test_from_text(self):
     sample = ['Hello . again', 'nice', ' ? ! end', '']
     passage = next(convert.from_text(sample))
     terms = passage.layer(layer0.LAYER_ID).all
     pos = 0
     for i, par in enumerate(sample):
         for text in par.split():
             self.assertTrue(terms[pos].text == text
                             and terms[pos].paragraph == i + 1)
             pos += 1
Example #10
def test_from_text():
    sample = ["Hello . again", "nice", " ? ! end", ""]
    passage = next(convert.from_text(sample))
    terms = passage.layer(layer0.LAYER_ID).all
    pos = 0
    for i, par in enumerate(sample):
        for text in par.split():
            assert terms[pos].text == text
            assert terms[pos].paragraph == i + 1
            pos += 1
Example #11
 def _init_passages(self, graphs, **kwargs):
     for graph in graphs:
          if not graph.id:
              graph.id = self.passage_id
         passage = next(
             convert.from_text(graph.tokens, graph.id, tokenized=True))
         graph.format = kwargs.get("format") or graph.format
         if graph.format is None or graph.format == self.format:
             passage.extra["format"] = self.format
         yield passage, graph
Example #12
    def upload_streussel_passage_file(self, filenames, log=None, **kwargs):
        del kwargs
        log_h = open(log, "w", encoding="utf-8") if log else None
        with open(filenames) as f_all:
            for filename in f_all:
                passage_text = ""
                external_id = "None given"
                filename = filename.strip()
                with open(filename, encoding="utf-8") as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            continue
                        elif line.startswith("#"):
                            fields = line.split()
                            if len(fields) != 4 or fields[1] != "sent_id":
                                print("FORMAT ERROR in " + filename,
                                      file=sys.stderr)
                            else:
                                external_id = fields[3].split("-")[1]
                        else:
                            passage_text = passage_text + " " + line
                passage_out = self.create_passage(text=passage_text.strip(),
                                                  external_id=external_id,
                                                  type="PUBLIC",
                                                  source=self.source)
                task_in = dict(type="TOKENIZATION",
                               status="SUBMITTED",
                               project=self.project,
                               user=self.user,
                               passage=passage_out,
                               manager_comment="External ID: " + external_id,
                               user_comment="",
                               parent=None,
                               is_demo=False,
                               is_active=True)
                tok_task_out = self.create_task(**task_in)
                tok_user_task_in = dict(tok_task_out)

                passage = list(from_text(passage_text.split(),
                                         tokenized=True))[0]
                tok_user_task_in.update(
                    to_json(passage, return_dict=True, tok_task=True))

                self.submit_task(**tok_user_task_in)
                print("Uploaded passage " + filename + " successfully.",
                      file=sys.stderr)
                if log:
                    print(filename.split(".")[-2],
                          passage_out["id"],
                          tok_task_out["id"],
                          file=log_h,
                          sep="\t")
        if log:
            log_h.close()
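For reference, the header parsing above expects comment lines with exactly four whitespace-separated fields, e.g. (illustrative):

    # sent_id = reviews-001

which splits into ["#", "sent_id", "=", "reviews-001"]; the external ID ("001") is the part of the last field after the hyphen.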
Example #13
 def test_from_text_long(self):
     sample = """
         After graduation, John moved to New York City.
         
         He liked it there. He played tennis.
         And basketball.
         
         And he lived happily ever after.
         """
     passages = list(convert.from_text(sample))
     self.assertEqual(len(passages), 3, list(map(convert.to_text, passages)))
Example #14
def test_from_text_long():
    sample = """
        After graduation, John moved to New York City.

        He liked it there. He played tennis.
        And basketball.

        And he lived happily ever after.
        """
    passages = list(convert.from_text(sample))
    assert len(passages) == 3, list(map(convert.to_text, passages))
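Both variants of this test rely on from_text treating each blank line as a passage boundary (and each non-blank line as a paragraph), hence the three passages above. A minimal sketch of that behavior, assuming ucca is installed:

    from ucca import convert

    passages = list(convert.from_text("First passage .\n\nSecond passage ."))
    assert len(passages) == 2  # the blank line starts a new passage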
Example #15
def test_parser(config, model_type, formats, default_setting, text=True):
    filename = "test_files/models/%s_%s%s" % ("_".join(formats), model_type,
                                              default_setting.suffix())
    remove_existing(filename)
    config.update(default_setting.dict())
    scores = []
    params = []
    passages = list(map(load_passage, passage_files(*formats)))
    evaluate = ("amr" not in formats)
    for mode in "train", "load":
        print("-- %sing %s" % (mode, model_type))
        config.update(dict(classifier=model_type, copy_shared=None))
        p = Parser(model_files=filename, config=config)
        p.save_init = True
        list(
            p.train(passages if mode == "train" else None,
                    dev=passages,
                    test=True,
                    iterations=2))
        assert p.model.is_finalized, "Model should be finalized after %sing" % mode
        assert not getattr(p.model.feature_extractor, "node_dropout",
                           0), p.model.feature_extractor.node_dropout
        all_params = p.model.all_params()
        params.append(all_params)
        param1, param2 = [
            d.get("W") for d in (all_params, p.model.feature_extractor.params)
        ]
        if param1 is not None and param2 and param2.init is not None and not config.args.update_word_vectors:
            assert_allclose(param1,
                            weight_decay(p.model) * param2.init,
                            rtol=1e-6)
        text_results = results = list(p.parse(passages, evaluate=evaluate))
        if text:
            print("Converting to text and parsing...")
            text_results = list(
                p.parse([
                    p3 for p1 in passages
                    for p2 in convert.to_text(p1, sentences=False)
                    for p3 in convert.from_text(
                        p2, p1.ID, extra_format=p1.extra.get("format"))
                ]))
            assert len(results) == len(text_results)
        if evaluate:
            scores.append(Scores(tuple(zip(*results))[1]).average_f1())
            if text:
                for t, (r, s) in zip(text_results, results):
                    print("  %s F1=%.3f" % (r.ID, s.average_f1()))
        assert not list(p.parse(()))  # parsing nothing returns nothing
        print()
    assert_all_params_equal(*params)
    if evaluate:
        print("-- average f1: %.3f, %.3f\n" % tuple(scores))
        assert scores[0] == pytest.approx(scores[1], 0.1)
Example #16
 def _build_passage(self):
     assert self.tokens is not None, "Cannot convert AMR without input tokens"
     # amr = penman.decode(re.sub("~e\.[\d,]+", "", " ".join(self.lines)))
     amr = parse(" ".join(self.lines), tokens=self.tokens)
     passage = next(convert.from_text(self.tokens, self.amr_id or self.passage_id, tokenized=True))
     passage.extra["format"] = "amr"
     self.lines = []
     self.amr_id = self.tokens = None
     textutil.annotate(passage)
     l0 = passage.layer(layer0.LAYER_ID)
     l1 = passage.layer(layer1.LAYER_ID)
     self._build_layer1(amr, l1)
     self._build_layer0(self.align_nodes(amr), l1, l0)
     self._update_implicit(l1)
     self._update_labels(l1)
     # return (passage, penman.encode(amr), self.amr_id) if self.return_amr else passage
     return (passage, amr(alignments=False), self.amr_id) if self.return_amr else passage
Example #17
 def upload_passage(self, external_id, tokens):
     assert external_id, "Missing external ID for passage %s" % tokens
     assert tokens, "Empty passage %s" % external_id
     passage_out = self.create_passage(text=" ".join(tokens), external_id=external_id, type="PUBLIC",
                                       source=self.source)
     task_in = dict(type="TOKENIZATION", status="SUBMITTED", project=self.project, user=self.user,
                    passage=passage_out, manager_comment="External ID: "+external_id,
                    user_comment="", parent=None, is_demo=False, is_active=True)
     tok_task_out = self.create_task(**task_in)
     tok_user_task_in = dict(tok_task_out)
     passage = list(from_text(tokens, tokenized=True))[0]
     tok_user_task_in.update(to_json(passage, return_dict=True, tok_task=True))
     self.submit_task(**tok_user_task_in)
     task_in = dict(type="ANNOTATION", status="NOT_STARTED", project=self.project, user=self.annotation_user,
                    passage=tok_task_out["passage"], manager_comment="External ID: "+external_id,
                    user_comment=external_id, parent=tok_task_out, is_demo=False, is_active=True)
     self.create_task(**task_in)
     print("Uploaded passage "+external_id+" successfully")
Example #18
 def tokenize_and_upload(self, filename, log=None, lang=None, **kwargs):
     del kwargs
     log_h = open(log, "w", encoding="utf-8") if log else None
     prefix = os.path.splitext(os.path.basename(filename))[0].replace(
         " ", "_")
     with open(filename, encoding="utf-8") as f:
         for passage, text in from_text(f,
                                        passage_id=prefix,
                                        lang=lang,
                                        return_text=True):
             passage_out = self.create_passage(text=text,
                                               type="PUBLIC",
                                               source=self.source)
             task_in = dict(type="TOKENIZATION",
                            status="SUBMITTED",
                            project=self.project,
                            user=self.user,
                            passage=passage_out,
                            manager_comment=passage.ID,
                            user_comment="",
                            parent=None,
                            is_demo=False,
                            is_active=True)
             tok_task_out = self.create_task(**task_in)
             tok_user_task_in = dict(tok_task_out)
             tok_user_task_in.update(
                 to_json(passage, return_dict=True, tok_task=True))
             self.submit_task(**tok_user_task_in)
             task_in.update(parent=tok_task_out, type="ANNOTATION")
             ann_user_task_out = self.create_task(**task_in)
             print("Uploaded passage " + filename + " successfully.",
                   file=sys.stderr)
             if log:
                 print(passage.ID,
                       passage_out["id"],
                       tok_task_out["id"],
                       ann_user_task_out["id"],
                       file=log_h,
                       sep="\t",
                       flush=True)
     if log:
         log_h.close()
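Note the return_text=True flag above: it makes from_text yield (passage, raw text) pairs instead of bare passages, letting the loop upload the raw text and build the tokenization task in a single pass. A minimal sketch, assuming ucca is installed (the input line is illustrative):

    from ucca.convert import from_text

    for passage, text in from_text(["Hello world ."], passage_id="doc",
                                   return_text=True):
        print(passage.ID, repr(text))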
Example #19
def _ucca_parse_text(text, output_dir, filename, clean, normalize_sentence):
    text = [normalize_sentence(x) for x in text]
    # print("parsing", text)
    text = from_text(text, split=True, one_per_line=True)
    text = list(text)
    # print("output_dir", output_dir)
    # print(filename, "filename")
    # print("parsed to", parse_location(
    # output_dir, filename, 0))
    # raise
    parser = get_parser()
    for i, passage in enumerate(parser.parse(text)):
        passage2file(passage, parse_location(output_dir, filename, i))
    # create a file announcing that parsing finished successfully
    parsed_file = os.path.join(
        os.path.dirname(parse_location(output_dir, filename, 0)), PARSED_FILE)
    with open(parsed_file, "w") as _:
        pass
    if clean:
        # remove the intermediate .txt files from the output directory
        for item in os.listdir(output_dir):
            if item.endswith(".txt"):
                os.remove(os.path.join(output_dir, item))