Example #1
    def _test_biluov_task():
        import es_core_news_md
        from scripts.utils import Sentence

        def forward(tokensxsentence, entitiesxsentence):
            labelsxsentence, _ = to_biluov(tokensxsentence, entitiesxsentence)
            return [
                from_biluov(biluov, sentence, spans=True)
                for biluov, sentence in zip(labelsxsentence, tokensxsentence)
            ]

        training = Collection().load(Path("data/training/scenario.txt"))
        nlp = es_core_news_md.load()

        def per_label(label):
            tokensxsentence = [nlp(s.text) for s in training.sentences]
            entitiesxsentence = [[
                k.spans for k in s.keyphrases if k.label == label
            ] for s in training.sentences]
            decoded = forward(tokensxsentence, entitiesxsentence)
            return decoded

        collection = Collection([Sentence(s.text) for s in training.sentences])
        for label in ENTITIES:
            decoded = per_label(label)
            for entities, sentence in zip(decoded, collection.sentences):
                for spans in entities:
                    keyphrase = Keyphrase(sentence, label, -1, spans)
                    sentence.keyphrases.append(keyphrase)

        collection.fix_ids()
        output = Path(
            "data/submissions/forward-biluov/train/run1/scenario2-taskA/")
        output.mkdir(parents=True, exist_ok=True)
        collection.dump(output / "scenario.txt", skip_empty_sentences=False)
Example #2
def main(gold_input, submit_input, skip_A, skip_B, verbose, skip_C=True):
    gold = Collection()
    gold.load(gold_input)

    submit = Collection()
    submit.load(submit_input)

    data = OrderedDict()

    dataA = subtaskA(gold, submit, verbose)
    data.update(dataA)
    if not skip_A:
        report(dataA, verbose)

    if not skip_B:
        dataB = subtaskB(gold, submit, dataA, verbose)
        data.update(dataB)
        report(dataB, verbose)

    if not skip_C:
        dataC = subtaskC(gold, submit, data, verbose)
        data.update(dataC)
        report(dataC, verbose)

    print("-" * 20)

    metrics = compute_metrics(data, skip_A, skip_B, skip_C)

    for key, value in metrics.items():
        print("{0}: {1:0.4}".format(key, value))

    return data
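
Note that subtaskA runs even when skip_A is set, because subtaskB consumes its result; the flag only suppresses the subtask A report. A minimal invocation sketch (hypothetical paths, assuming the module-level imports used above):

    from pathlib import Path

    # Score only subtask B; subtask A matches are still computed internally.
    data = main(
        gold_input=Path("data/development/main/scenario.txt"),
        submit_input=Path("data/submissions/baseline/dev/run1/scenario.txt"),
        skip_A=True,
        skip_B=False,
        verbose=False,
    )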
Example #3
    def _load_collection(self, scenario):
        gold = self.gold.format(scenario)
        gold = Path(gold)

        loader = Collection().load_dir if gold.is_dir() else Collection().load

        return loader(
            gold,
            legacy=False,
            keyphrases=scenario.endswith("-taskB"),
            relations=False,
            attributes=False,
        )
Example #4
def get_clean_collection(anns_path: Path, select: str) -> Collection:
    collection = Collection()

    for file in sorted((anns_path / select).iterdir()):
        if file.suffix == ".txt":
            collection.load(file, attributes=False)

    for s in collection.sentences:
        overlaps = s.overlapping_keyphrases()

        if overlaps:
            print("Found overlapping:", overlaps)
            s.merge_overlapping_keyphrases()
            overlaps = s.overlapping_keyphrases()

        dups = s.dup_relations()

        if dups:
            print(
                "Found duplicated relations %r in sentence '%s'"
                % ([v[0] for v in dups.values()], s.text)
            )
            s.remove_dup_relations()
            dups = s.dup_relations()

        assert not overlaps
        assert not dups

    return collection
Example #5
def load_corpus(anns_path: Path, clean=True) -> Collection:
    collection = Collection()

    for file in sorted(anns_path.iterdir()):
        if file.name.endswith(".txt"):
            collection.load(file)

    if clean:
        for s in collection.sentences:
            overlaps = s.overlapping_keyphrases()

            if overlaps:
                print("Found overlapping:", overlaps)
                s.merge_overlapping_keyphrases()
                overlaps = s.overlapping_keyphrases()

            dups = s.dup_relations()

            if dups:
                print("Found duplicated relations %r in sentence '%s'" %
                      ([v[0] for v in dups.values()], s.text))
                s.remove_dup_relations()
                dups = s.dup_relations()

            assert not overlaps
            assert not dups

    return collection
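
Examples #4 and #5 share this per-sentence cleanup loop verbatim. A possible refactor into one helper (hypothetical name, using only the Sentence API shown above):

    def clean_sentence(s):
        # Merge overlapping keyphrases and drop duplicated relations in place.
        if s.overlapping_keyphrases():
            s.merge_overlapping_keyphrases()
        if s.dup_relations():
            s.remove_dup_relations()
        assert not s.overlapping_keyphrases()
        assert not s.dup_relations()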
Example #6
def count_labels_based_on(path: Path, reference: Path):
    collection = load_corpus(path)
    reference = CollectionV1Handler.load_dir(Collection(), reference)

    sentences = []
    for ref_sent in reference.sentences:
        for sent in collection.sentences:
            if sent.text == ref_sent.text:
                sentences.append(sent)
                break

    print(len(collection))
    print(len(reference))
    print(len(sentences))

    return count_labels_on(Collection(sentences))
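
The nested matching loop above is quadratic in the number of sentences. When both corpora are large, indexing one side by text keeps it linear; a sketch that gives the same result as long as sentence texts are unique:

    by_text = {sent.text: sent for sent in collection.sentences}
    sentences = [by_text[ref.text]
                 for ref in reference.sentences if ref.text in by_text]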
Example #7
def main():
    baseline = Baseline()
    baseline.train(Path("data/training/"))

    collection = CollectionV1Handler.load_dir(
        Collection(), Path("data/testing/scenario2-taskA/"))
    output = baseline.run(collection, taskA=True, taskB=False)
    CollectionV1Handler.dump(output, Path("pepe/input_scenario2.txt"), False)
Example #8
def load_and_dump_from_corpus(path2sentences, path2corpus, path2output):
    sentences = path2sentences.read_text().splitlines()
    print(len(sentences))
    collection = Collection().load_dir(path2corpus, legacy=True, attributes=False)
    print(len(collection))
    collection = filter(collection, sentences)
    print(len(collection))
    collection.dump(path2output)
Example #9
    def test(self, finput: Path, skip_A=False, skip_B=False):

        collection = Collection()
        collection.load(finput)

        self.predict_entities(collection)

        return collection
Example #10
def load_training_data(corpus) -> Collection:
    packs = Path("/data") / corpus / "packs/submitted/"

    collection = Collection()

    for filename in packs.glob("*.txt"):
        collection.load(filename)

    return collection
Example #11
def task_annotate_relations(corpus: str, pack: str):
    model = get_model(corpus)
    text_path = Path("/data") / corpus / "packs" / "open" / pack / "pack.txt"

    collection = Collection().load(text_path)
    collection = model.predict_relations(collection)
    collection.dump(text_path, skip_empty_sentences=False)

    return {"reload": True}
Example #12
    def get_train_valid_set(self,
                            finput_train: Path,
                            finput_valid: Path = None):
        # TRAIN SET
        finput_train = Path(finput_train)
        collection_train = (Collection().load_dir(finput_train)
                            if finput_train.is_dir() else
                            Collection().load(finput_train))
        # VALIDATION SET
        if finput_valid:
            finput_valid = Path(finput_valid)
            collection_valid = (Collection().load_dir(finput_valid)
                                if finput_valid.is_dir() else
                                Collection().load(finput_valid))
        else:
            collection_valid = None

        return collection_train, collection_valid
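
The directory-vs-file dispatch above also appears in Examples #3 and #15; it could be factored into one helper (hypothetical name, assuming Path and Collection are imported as in the examples, and that load/load_dir return the collection):

    def load_any(path):
        path = Path(path)
        return (Collection().load_dir(path)
                if path.is_dir() else Collection().load(path))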
Example #13
def filter(collection: Collection, sentences):
    # return Collection([s for s in collection.sentences if s.text in sentences])

    def find(text):
        for s in collection.sentences:
            if s.text == text:
                return s
        raise Exception("Not found! " + text)

    return Collection([find(text) for text in sentences])
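
A usage sketch with made-up sentences: filter returns a collection ordered like sentences (not like the source collection) and raises on any missing text:

    corpus = Collection([Sentence("Primera oración."), Sentence("Segunda oración.")])
    subset = filter(corpus, ["Segunda oración.", "Primera oración."])
    assert [s.text for s in subset.sentences] == ["Segunda oración.", "Primera oración."]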
Example #14
    def _load_collection(self, scenario):
        gold = self.gold.format(scenario)

        return Collection().load(
            Path(gold),
            legacy=False,
            keyphrases=scenario.endswith("-taskB"),
            relations=False,
            attributes=False,
        )
Example #15
    def train(self, finput: Path):
        collection = (Collection().load_dir(finput)
                      if finput.is_dir() else Collection().load(finput))

        # Chained assignment: store the (keyphrases, relations) tuple as the
        # model while keeping local aliases that the loops below populate.
        self.model = keyphrases, relations = {}, {}

        for sentence in collection.sentences:
            for keyphrase in sentence.keyphrases:
                text = keyphrase.text.lower()
                keyphrases[text] = keyphrase.label

        for sentence in collection.sentences:
            for relation in sentence.relations:
                origin = relation.from_phrase
                origin_text = origin.text.lower()
                destination = relation.to_phrase
                destination_text = destination.text.lower()

                relations[origin_text, origin.label, destination_text,
                          destination.label] = relation.label
Example #16
def task_clear_all(corpus: str, pack: str):
    path = Path("/data") / corpus / "packs" / "open" / pack / "pack.txt"

    collection = Collection()
    collection.load(path)

    for sentence in collection.sentences:
        sentence.relations = []

    collection.dump(path)

    return {"reload": True}
Example #17
    def test(self, finput: Path, skip_A, skip_B):
        collection = Collection()

        if skip_A:
            collection.load_keyphrases(finput)
        else:
            collection.load_input(finput)
            self.predict_entities(collection)

        if not skip_B:
            for sentence in collection.sentences:
                self.predict_relations(sentence)
                sentence.remove_dup_relations()

        return collection
Example #18
def main(anns_path: Path, training_path, develop_path, test_path, public):
    random.seed(42)  # default seed, but each generator should use its own

    # dump training and development collections ----------------------------------
    train_develop_sentences = get_training_and_development(anns_path)

    #### training
    training = Collection(train_develop_sentences[:800])
    training.dump(training_path / "scenario.txt")

    #### development/main
    develop = Collection(train_develop_sentences[800:])
    develop.dump(develop_path / "main" / "scenario.txt")

    # dump test collection (per scenario) ----------------------------------------
    test_sentences = get_test(anns_path)
    extra_sentences_main = get_extra(anns_path, "main",
                                     train_develop_sentences, test_sentences)
    extra_sentences_transfer = get_extra(anns_path, "transfer",
                                         train_develop_sentences,
                                         test_sentences)

    #### test/scenario3
    scn3 = Collection(test_sentences[200:])
    clean(scn3, public, remove_keyphrases=False)
    scn3.dump(test_path / "scenario3-taskB" / "scenario.txt", False)

    #### test/scenario2
    scn2 = Collection(test_sentences[100:200])
    clean(scn2, public)
    scn2.dump(test_path / "scenario2-taskA" / "scenario.txt", False)

    #### test/scenario1
    scn1 = Collection(
        shuffle(extra_sentences_main[:4900], test_sentences[:100]))
    clean(scn1, public)
    scn1.dump(test_path / "scenario1-main" / "scenario.txt", False)

    # dump transfer learning collections ----------------------------------------
    transfer_sentences = get_transfer(anns_path)

    #### development/transfer
    develop_transfer = Collection(transfer_sentences[:100])
    develop_transfer.dump(develop_path / "transfer" / "scenario.txt")

    #### test/scenario4
    scn4 = Collection(
        shuffle(extra_sentences_transfer[:1400], transfer_sentences[100:]))
    clean(scn4, public)
    scn4.dump(test_path / "scenario4-transfer" / "scenario.txt", False)
Example #19
    def predict_entities(self, sentences):
        if isinstance(sentences[0], Sentence):
            sentences = [s.text for s in sentences]

        result = []
        nlp = spacy_model("es")

        for i, sentence in enumerate(sentences):
            if self.callback:
                self.callback(msg="Processing sentence",
                              current=i,
                              total=len(sentences))

            doc, xs = self.feature_sentence(sentence)
            sentence = self.predict_single(doc, xs)
            result.append(sentence)

        return Collection(sentences=result)
Example #20
def evaluate_scenario(submit_path: Path, gold: Collection, scenario: int):
    submit_file = submit_path / "scenario.txt"
    if not submit_file.exists():
        warnings.warn("Input file not found in '%s'" % submit_path)
        return {}

    submit = Collection().load(submit_file)
    resultA = subtaskA(gold, submit)
    resultB = subtaskB(gold, submit, resultA)

    results = {}
    for k, v in list(resultA.items()) + list(resultB.items()):
        results[k] = len(v)

    metrics = compute_metrics(dict(resultA, **resultB),
                              skipA=scenario == 3,
                              skipB=scenario == 2)
    results.update(metrics)

    return results
Example #21
    def train(self, finput):

        collection = Collection()
        collection.load(finput)

        self.keyphrases.clear()
        for sentence in collection.sentences:
            for keyphrase in sentence.keyphrases:
                text = keyphrase.text.lower()
                self.keyphrases[text] = keyphrase.label

        self.relations.clear()
        for sentence in collection.sentences:
            for relation in sentence.relations:
                origin = relation.from_phrase
                origin_text = origin.text.lower()
                destination = relation.to_phrase
                destination_text = destination.text.lower()

                self.relations[origin_text, origin.label, destination_text,
                               destination.label] = relation.label
Example #22
def main(anns_path: Path, training_path, develop_path, test_path):
    random.seed(42)  # default seed, but each generator should use its own

    # dump training and development collections ----------------------------------
    train_develop_sentences = get_training_and_development(anns_path)

    #### training
    training = Collection(train_develop_sentences[:800])
    training.dump(training_path / "scenario.txt")

    #### development/main
    develop = Collection(train_develop_sentences[800:])
    develop.dump(develop_path / "main" / "scenario.txt")

    # dump test collection (per scenario) ----------------------------------------
    test_sentences = get_test(anns_path)
    extra_sentences = get_extra(anns_path, train_develop_sentences, test_sentences)

    #### test/scenario3
    scn3 = Collection(test_sentences[200:])
    scn3.dump(test_path / "scenario3-taskB" / "scenario.txt")

    #### test/scenario2
    scn2 = Collection(test_sentences[100:200])
    scn2.dump(test_path / "scenario2-taskA" / "scenario.txt")

    #### test/scenario1
    scn1 = Collection(
        extra_sentences[:4567] + test_sentences[:100] + extra_sentences[4567:]
    )
    scn1.dump(test_path / "scenario1-main" / "scenario.txt", False)

    # dump transfer learning collections ----------------------------------------
    transfer_sentences = get_transfer(anns_path)

    #### development/transfer
    develop_transfer = Collection(transfer_sentences[:100])
    develop_transfer.dump(develop_path / "transfer" / "scenario.txt")

    #### test/scenario4
    scn4 = Collection(transfer_sentences[100:])
    scn4.dump(test_path / "scenario4-transfer" / "scenario.txt")
Example #23
    def train(self, finput: Path):

        collection = Collection()
        collection = CollectionV1Handler.load(collection,
                                              finput / "input_training.txt")
        # collection = CollectionV2Handler.load(collection, finput / 'ensemble.txt')

        # Chained assignment: store the (keyphrases, relations) tuple as the
        # model while keeping local aliases that the loops below populate.
        self.model = keyphrases, relations = {}, {}

        for sentence in collection.sentences:
            for keyphrase in sentence.keyphrases:
                text = keyphrase.text.lower()
                keyphrases[text] = keyphrase.label

        for sentence in collection.sentences:
            for relation in sentence.relations:
                origin = relation.from_phrase
                origin_text = origin.text.lower()
                destination = relation.to_phrase
                destination_text = destination.text.lower()

                relations[origin_text, origin.label, destination_text,
                          destination.label] = relation.label
Example #24
    def train(self, finput):

        collection = Collection()
        collection.load(finput)

        """
        full_text = ""
        for sentence in collection.sentences:
            full_text += sentence.text
            full_text += "\n"

        doc = es_pipeline.nlp(full_text)
        """

        x_train, y_train = self.preprocess(collection, True)
        # print('Counter train: %s' % Counter(y_train))

        fit_result = self.clf.fit(x_train, y_train)
        print("Success at training!\n")

        self.label_set = list(set(y_train))

        return fit_result
Example #25
def count_labels(path: Path, handler=None):
    corpus = handler.load_dir(Collection(),
                              path) if handler else load_corpus(path)
    return count_labels_on(corpus)
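
Hypothetical calls, reusing paths from the other examples: the default branch goes through load_corpus, while a handler loads the legacy format.

    count_labels(Path("data/training"))
    count_labels(Path("data/testing/scenario2-taskA"), handler=CollectionV1Handler)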
Example #26
def main(
    mode="test",
    best=False,
    single=False,
    csv=False,
    pretty=False,
    final=False,
    plain=False,
    compact=False,
    gold="data",
    submit="data/submissions",
    ignore=False,
):
    users = collections.defaultdict(list)

    if csv and not best:
        raise ValueError("Error: --csv implies --best")

    if final and (not csv or not best):
        raise ValueError("Error: --final implies --csv and --best")

    if mode == "test":
        test_gold = Path(gold)
        gold_scenarios = [
            Collection().load(test_gold /
                              "testing/scenario1-main/scenario.txt"),
            Collection().load(test_gold /
                              "testing/scenario2-taskA/scenario.txt"),
            Collection().load(test_gold /
                              "testing/scenario3-taskB/scenario.txt"),
            Collection().load(test_gold /
                              "testing/scenario4-transfer/scenario.txt"),
        ]
    elif mode == "dev":
        dev_gold = Path(gold)
        gold_scenarios = [
            Collection().load(dev_gold / "development/main/scenario.txt"),
            Collection().load(dev_gold / "development/main/scenario.txt"),
            Collection().load(dev_gold / "development/main/scenario.txt"),
            Collection().load(dev_gold / "development/transfer/scenario.txt"),
        ]
    elif mode == "train":
        dev_gold = Path(gold)
        gold_scenarios = [
            Collection().load(dev_gold / "training/scenario.txt"),
            Collection().load(dev_gold / "training/scenario.txt"),
            Collection().load(dev_gold / "training/scenario.txt"),
        ]
    else:
        raise ValueError("Unexpected mode: {0}".format(mode))

    submits = Path(submit)
    if single:
        submits = submits / single
        runs = submits / mode
        if not runs.exists():
            msg = "Directory {0} not found. Check --mode and --single options.".format(
                runs)
            raise ValueError(msg)
        ensure_number_of_runs(runs)
        for subfolder in runs.iterdir():
            users[submits.name].append(evaluate_one(subfolder,
                                                    *gold_scenarios))
    else:
        for userfolder in submits.iterdir():
            if not userfolder.is_dir():
                continue
            runs = userfolder / mode
            if not runs.exists():
                msg = "Directory {0} not found. Did you mean to use --single? Check --mode option.".format(
                    runs)
                if ignore:
                    warnings.warn(msg)
                    continue
                else:
                    raise ValueError(msg)
            ensure_number_of_runs(runs)
            for subfolder in runs.iterdir():
                users[userfolder.name].append(
                    evaluate_one(subfolder, *gold_scenarios))

    results = dict(users)

    if best:
        results = filter_best(results)

    if csv:
        import pandas as pd

        items = []

        for user, data in results.items():
            userdata = dict(name=user)

            for k, metrics in data.items():
                userdata.update(
                    {"%s-%s" % (k, m): v
                     for m, v in metrics.items()})

            items.append(userdata)

        df = pd.DataFrame(items)
        df = df.set_index("name").sort_index().transpose()

        if final:
            df1 = df.transpose()[[
                "scenario1-f1", "scenario1-precision", "scenario1-recall"
            ]]
            df1 = df1.sort_values("scenario1-f1", ascending=False)

            df2 = df.transpose()[[
                "scenario2-f1", "scenario2-precision", "scenario2-recall"
            ]]
            df2 = df2.sort_values("scenario2-f1", ascending=False)

            df3 = df.transpose()[[
                "scenario3-f1", "scenario3-precision", "scenario3-recall"
            ]]
            df3 = df3.sort_values("scenario3-f1", ascending=False)

            df4 = df.transpose()[[
                "scenario4-f1", "scenario4-precision", "scenario4-recall"
            ]]
            df4 = df4.sort_values("scenario4-f1", ascending=False)

            if pretty:
                print(df1.round(3).to_markdown() + "\n")
                print(df2.round(3).to_markdown() + "\n")
                print(df3.round(3).to_markdown() + "\n")
                print(df4.round(3).to_markdown() + "\n")

            else:
                print(df1.to_csv())
                print(df2.to_csv())
                print(df3.to_csv())
                print(df4.to_csv())
        else:
            print(df.to_csv())

    elif plain:
        for user, info in results.items():
            print(50 * "=")
            print(" {0} ".format(user).center(50, ":").upper())
            print(50 * "=")
            for run in info:
                print("[ {0} ]".format(run["submit"]).center(50, "-"))
                for scenario, data in run.items():
                    if scenario == "submit":
                        continue
                    print("> {0} ".format(scenario))
                    for metric, value in data.items():
                        if metric == "submit":
                            continue
                        metric = "{0}".format(metric).ljust(15)
                        if isinstance(value, float):
                            print("     {0} ~ {1:0.4}".format(metric, value))
                        else:
                            print("     {0} = {1}".format(metric, value))

    elif compact:
        if not single:
            raise ValueError("--compact requires --single")
        if not best:
            raise ValueError("--compact requires --best")

        results = results[single]

        for scn, metrics in results.items():
            for m in ["f1", "precision", "recall"]:
                print(f"{scn}-{m}: {metrics[m]:0.5}")
    else:
        print(json.dumps(results, sort_keys=True,
                         indent=2 if pretty else None))
Example #27
def main(
    mode="test",
    best=False,
    single=False,
    csv=False,
    pretty=False,
    final=False,
    plain=False,
):
    users = collections.defaultdict(list)

    if csv and not best:
        raise ValueError("Error: --csv implies --best")

    if final and (not csv or not best):
        raise ValueError("Error: --final implies --csv and --best")

    if mode == "test":
        scn1_gold = Collection().load(
            Path("data/testing/scenario1-main/scenario.txt"))
        scn2_gold = Collection().load(
            Path("data/testing/scenario2-taskA/scenario.txt"))
        scn3_gold = Collection().load(
            Path("data/testing/scenario3-taskB/scenario.txt"))
        scn4_gold = Collection().load(
            Path("data/testing/scenario4-transfer/scenario.txt"))
    elif mode == "dev":
        scn1_gold = Collection().load(
            Path("data/development/main/scenario.txt"))
        scn2_gold = Collection().load(
            Path("data/development/main/scenario.txt"))
        scn3_gold = Collection().load(
            Path("data/development/main/scenario.txt"))
        scn4_gold = Collection().load(
            Path("data/development/transfer/scenario.txt"))
    else:
        raise ValueError("Unexpected mode: {0}".format(mode))

    submits = Path("data/submissions/")
    if single:
        submits = submits / single
        runs = submits / mode
        if not runs.exists():
            msg = "Directory {0} not found. Check --mode and --single options.".format(
                runs)
            raise ValueError(msg)
        ensure_number_of_runs(runs)
        for subfolder in runs.iterdir():
            users[submits.name].append(
                evaluate_one(
                    subfolder,
                    scn1_gold,
                    scn2_gold,
                    scn3_gold,
                    scn4_gold,
                ))
    else:
        for userfolder in submits.iterdir():
            if not userfolder.is_dir():
                continue
            runs = userfolder / mode
            if not runs.exists():
                msg = "Directory {0} not found. Did you mean to use --single? Check --mode option.".format(
                    runs)
                raise ValueError(msg)
            ensure_number_of_runs(runs)
            for subfolder in runs.iterdir():
                users[userfolder.name].append(
                    evaluate_one(
                        subfolder,
                        scn1_gold,
                        scn2_gold,
                        scn3_gold,
                        scn4_gold,
                    ))

    results = dict(users)

    if best:
        results = filter_best(results)

    if csv:
        import pandas as pd

        items = []

        for user, data in results.items():
            userdata = dict(name=user)

            for k, metrics in data.items():
                userdata.update(
                    {"%s-%s" % (k, m): v
                     for m, v in metrics.items()})

            items.append(userdata)

        df = pd.DataFrame(items)
        df = df.set_index("name").sort_index().transpose()

        if final:
            df1 = df.transpose()[[
                "scenario1-f1", "scenario1-precision", "scenario1-recall"
            ]]
            df1 = df1.sort_values("scenario1-f1", ascending=False).to_csv()

            df2 = df.transpose()[[
                "scenario2-f1", "scenario2-precision", "scenario2-recall"
            ]]
            df2 = df2.sort_values("scenario2-f1", ascending=False).to_csv()

            df3 = df.transpose()[[
                "scenario3-f1", "scenario3-precision", "scenario3-recall"
            ]]
            df3 = df3.sort_values("scenario3-f1", ascending=False).to_csv()

            df4 = df.transpose()[[
                "scenario4-f1", "scenario4-precision", "scenario4-recall"
            ]]
            df4 = df4.sort_values("scenario4-f1", ascending=False).to_csv()

            print(df1)
            print(df2)
            print(df3)
            print(df4)

        elif pretty:
            print(df.to_html())
        else:
            print(df.to_csv())

    elif plain:
        for user, info in results.items():
            print(50 * "=")
            print(" {0} ".format(user).center(50, ":").upper())
            print(50 * "=")
            for run in info:
                print("[ {0} ]".format(run["submit"]).center(50, "-"))
                for scenario, data in run.items():
                    if scenario == "submit":
                        continue
                    print("> {0} ".format(scenario))
                    for metric, value in data.items():
                        if metric == "submit":
                            continue
                        metric = "{0}".format(metric).ljust(15)
                        if isinstance(value, float):
                            print("     {0} ~ {1:0.4}".format(metric, value))
                        else:
                            print("     {0} = {1}".format(metric, value))
    else:
        print(json.dumps(results, sort_keys=True,
                         indent=2 if pretty else None))
Example #28
    def _training_task(
        n_epochs,
        *,
        bert_mode,
        cnet_mode,
        ignore_path,
        inclusion=1.1,
        task=None,
        jointly=True,
        early_stopping=None,
        use_crf=True,
        weight=True,
        only_bert=False,
        reduce=False,
        split_relations="both",
        straight_forward_encoding=False,
        dropout=False,
        stacked_layers=1,
    ):
        if split_relations not in ("both", "pair", "seq"):
            raise ValueError(f"Unexpected split_relations: {split_relations!r}")

        training = Collection().load(Path("data/training/scenario.txt"))
        validation = Collection().load(
            Path("data/development/main/scenario.txt"))

        early_stopping = early_stopping or dict(wait=5, delta=0.0)

        # "both" splits the relations: taxonomic ones are trained as pairs and
        # context ones as sequences; "pair"/"seq" train all relations one way.
        train_pairs = (TAXONOMIC_RELS if split_relations == "both" else
                       RELATIONS if split_relations == "pair" else None)
        train_seqs = (CONTEXT_RELS if split_relations == "both" else
                      RELATIONS if split_relations == "seq" else None)

        algorithm = eHealth20Model(
            bert_mode=bert_mode,
            only_bert=only_bert,
            cnet_mode=cnet_mode,
            ignore_path=ignore_path,
        )
        if task is None:
            algorithm.train(
                training,
                validation,
                jointly=jointly,
                inclusion=inclusion,
                n_epochs=n_epochs,
                save_to=name_to_path,
                early_stopping=early_stopping,
                use_crf=use_crf,
                weight=weight,
                train_pairs=train_pairs,
                train_seqs=train_seqs,
                straight_forward_encoding=straight_forward_encoding,
                reduce=reduce,
                dropout=dropout,
                stacked_layers=stacked_layers,
            )
        elif task == "A":
            algorithm.train_taskA(
                training,
                validation,
                jointly=jointly,
                n_epochs=n_epochs,
                save_to=name_to_path,
                early_stopping=early_stopping,
                use_crf=use_crf,
                weight=weight,
                dropout=dropout,
                stacked_layers=stacked_layers,
            )
        elif task == "B":
            # load A
            if jointly:
                taskA_models = {}
                for label in ENTITIES:
                    checkpoint = torch.load(f"trained/taskA-{label}.pt")
                    _ensure_bert(bert_mode, checkpoint)
                    model = checkpoint["model"]
                    taskA_models[label] = model
                    model.eval()
                algorithm.taskA_models = taskA_models

            algorithm.train_taskB(
                training,
                validation,
                jointly=jointly,
                inclusion=inclusion,
                n_epochs=n_epochs,
                save_to=name_to_path,
                early_stopping=early_stopping,
                weight=weight,
                use_crf=use_crf,
                train_pairs=train_pairs,
                train_seqs=train_seqs,
                straight_forward_encoding=straight_forward_encoding,
                reduce=reduce,
                dropout=dropout,
            )
Example #29
from pathlib import Path
from scripts.utils import Collection, CollectionV1Handler, CollectionV2Handler

talp = CollectionV1Handler.load(
    Collection(),
    Path("data/training/talp-576640/scenario1-main/input_scenario1.txt"))
print(f"Talp: {len(talp)}")

ensemble = CollectionV2Handler.load(Collection(),
                                    Path("data/training/ensemble.txt"))
print(f"Ensemble: {len(ensemble)}")

sentences = {s.text for s in ensemble.sentences}

selection = Collection([s for s in talp.sentences if s.text in sentences])
print(f"Selection: {len(selection)}")

output = Path("data/training/talp.txt")
output.parent.mkdir(exist_ok=True)

CollectionV2Handler.dump(selection, output, skip_empty_sentences=False)
Example #30
from pathlib import Path

import streamlit as st
from streamlit.ScriptRunner import StopException

from autobrat.classifier import Model
from scripts.score import compute_metrics, subtaskA, subtaskB
from scripts.utils import (
    ENTITIES,
    RELATIONS,
    Collection,
    CollectionV1Handler,
    CollectionV2Handler,
    Keyphrase,
    Relation,
    Sentence,
)

c = Collection()

if st.sidebar.checkbox("Original Data", value=False):
    c = CollectionV1Handler.load(c, Path("data/training/input_training.txt"))

if st.sidebar.checkbox("Ensemble Data", value=False):
    old_size = len(c)
    c = CollectionV2Handler.load(c, Path("data/training/ensemble.txt"))
    ensemble_size = len(c) - old_size
    top_agreement = st.sidebar.number_input("Number of sentences (Ensemble)",
                                            0, ensemble_size, ensemble_size)
    c.sentences = c.sentences[:old_size + top_agreement]

if st.sidebar.checkbox("Talp Data", value=False):
    old_size = len(c)
    c = CollectionV2Handler.load(c, Path("data/training/talp.txt"))