Example #1
    def predict_entities(self, collection):
        # Match every known keyphrase as a whole word in each sentence
        # of the collection and register the hits as Keyphrase objects.
        next_id = 0
        for instance_keyphrase, label in self.keyphrases.items():
            for sentence in collection.sentences:
                text = sentence.text.lower()
                # re.escape guards against regex metacharacters in the keyphrase
                pattern = r'\b' + re.escape(instance_keyphrase) + r'\b'
                for match in re.finditer(pattern, text):
                    keyphrase = Keyphrase(sentence, label, next_id, [match.span()])
                    keyphrase.split()
                    next_id += 1

                    sentence.keyphrases.append(keyphrase)
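
Examples #1 and #6 below share this dictionary-matching step: every known keyphrase is searched as a whole word in the lowercased sentence text. A minimal self-contained sketch of just that regex step, on made-up data:

import re

text = "el asma es una enfermedad; el asma no se cura"  # made-up sentence
keyphrase = "asma"
# \b anchors the match at word boundaries; re.escape guards metacharacters
pattern = r"\b" + re.escape(keyphrase) + r"\b"
for match in re.finditer(pattern, text):
    print(match.span())  # (3, 7), then (30, 34)

Each match.span() becomes the single span of a new Keyphrase, exactly as in the inner loop above.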
Example #2
    def _test_biluov_task():
        import es_core_news_md
        from scripts.utils import Sentence

        # Round-trip check: encode the gold entities to BILUOV tags,
        # then decode the tags back to entity spans.
        def forward(tokensxsentence, entitiesxsentence):
            labelsxsentence, _ = to_biluov(tokensxsentence, entitiesxsentence)
            return [
                from_biluov(biluov, sentence, spans=True)
                for biluov, sentence in zip(labelsxsentence, tokensxsentence)
            ]

        training = Collection().load(Path("data/training/scenario.txt"))
        nlp = es_core_news_md.load()

        def per_label(label):
            tokensxsentence = [nlp(s.text) for s in training.sentences]
            entitiesxsentence = [[
                k.spans for k in s.keyphrases if k.label == label
            ] for s in training.sentences]
            decoded = forward(tokensxsentence, entitiesxsentence)
            return decoded

        # rebuild a fresh collection holding only the round-tripped entities
        collection = Collection([Sentence(s.text) for s in training.sentences])
        for label in ENTITIES:
            decoded = per_label(label)
            for entities, sentence in zip(decoded, collection.sentences):
                for spans in entities:
                    keyphrase = Keyphrase(sentence, label, -1, spans)
                    sentence.keyphrases.append(keyphrase)

        collection.fix_ids()
        output = Path(
            "data/submissions/forward-biluov/train/run1/scenario2-taskA/")
        output.mkdir(parents=True, exist_ok=True)
        collection.dump(output / "scenario.txt", skip_empty_sentences=False)
Example #3
def decode_bilou(sentence: Sentence, tags, tokens, spans) -> List[Keyphrase]:
    """tags: B-Concept, B-Action, ..."""

    next_id = 0  # unique id
    # drop padding/special tokens, which carry the empty span (0, 0)
    tokens = [{
        "token": i,
        "span": j,
        "label": k
    } for i, j, k in zip(tokens, spans, tags) if j != (0, 0)]

    entity_spans = []
    entity_label = None
    prev_state = None
    prev_label = 'O'

    # merge WordPiece sub-tokens (the '##'-prefixed pieces) back into words
    words = []
    for token in tokens:
        if token['token'].startswith('##'):
            # e.g. 'as' + '##ma' is merged back into 'asma'
            word = words.pop()
            s0 = word['span'][0]
            s1 = token['span'][1]
            words.append({
                "token": word['token'] + token['token'][2:],
                "span": (s0, s1),
                "label": word['label']
            })
        else:
            words.append(token)

    list_of_keyphrases = []
    for w in words:
        # Close the pending entity and reset when:
        # the new tag is B, U or O,
        # or its label differs from the previous one,
        # or it is I or L but the previous tag is not B or I.
        bool_1 = (w['label'][:1] in ['B', 'U', 'O'])
        bool_2 = w['label'][2:] != prev_label
        bool_3 = (w['label'][:1] in ['I', 'L']) and (prev_state
                                                     not in ['B', 'I'])
        if bool_1 or bool_2 or bool_3:
            if entity_spans:
                keyphrase = Keyphrase(sentence=sentence,
                                      label=entity_label,
                                      id=next_id,
                                      spans=entity_spans)
                list_of_keyphrases.append(keyphrase)
                next_id += 1
                entity_spans = []

        if w['label'] == 'O':
            entity_label = 'O'
        else:
            entity_spans.append(w['span'])
            entity_label = w['label'][2:]

        prev_state = w['label'][:1]
        prev_label = entity_label

    # flush the entity still pending when the sentence ends on L or U
    if entity_spans:
        list_of_keyphrases.append(
            Keyphrase(sentence=sentence,
                      label=entity_label,
                      id=next_id,
                      spans=entity_spans))
    return list_of_keyphrases
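
The three booleans above implement the BILOU cut rule: a pending entity is closed whenever a B, U or O tag arrives, the entity label changes, or an I/L tag is not preceded by B or I. A toy trace of those conditions on made-up tags:

tags = ["B-Concept", "L-Concept", "O", "U-Action", "I-Concept"]  # made up
prev_state, prev_label = None, "O"
for tag in tags:
    state, label = tag[:1], tag[2:]
    cut = (state in ("B", "U", "O")
           or label != prev_label
           or (state in ("I", "L") and prev_state not in ("B", "I")))
    print(f"{tag:9} -> close pending entity: {cut}")
    prev_state = state
    prev_label = "O" if state == "O" else label

The final I-Concept triggers a cut because its label differs from the preceding U-Action, so the ill-formed tail starts a fresh entity instead of extending a closed one.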
Example #4
def make_sentence(doc, bilouv, labels) -> Sentence:
    sentence = Sentence(doc.text)

    logger.debug(f"[make_sentence]: doc.text={doc.text}")
    logger.debug(f"[make_sentence]: bilouv={bilouv}")

    # the distinct entity labels present (strip the 'B-', 'I-', ... prefixes)
    labels = set(l[2:] for l in labels if l != 'O')

    for label in labels:
        specific_bilouv = []

        for tag in bilouv:
            # project this label's tags to their bare B/I/L/O/U/V letter;
            # every other label's tags collapse to 'O'
            if tag.endswith(label):
                specific_bilouv.append(tag[0])
            else:
                specific_bilouv.append('O')

        logger.debug(
            f"[make_sentence]: label={label} specific_bilouv={specific_bilouv}"
        )

        spans = from_biluov(specific_bilouv, doc, spans=True)
        sentence.keyphrases.extend(
            Keyphrase(sentence, label, i, sp) for i, sp in enumerate(spans))

    return sentence
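
The per-label projection is the key step: fixing one label at a time reduces the multi-label tag sequence to a plain single-label one that from_biluov can decode. A one-liner sketch on made-up tags:

bilouv = ["B-Concept", "L-Concept", "U-Action", "O"]  # made up
label = "Concept"
specific = [t[0] if t.endswith(label) else "O" for t in bilouv]
print(specific)  # ['B', 'L', 'O', 'O']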
Example #5
    def load_keyphrases(cls, collection: Collection, finput: Path):
        cls.load_input(collection, finput)

        input_a_file = finput.parent / ("output_a_" + finput.name.split("_")[1])

        # cumulative sentence lengths (+1 per separator character) give the
        # document-level offset at which each sentence ends
        sentences_length = [len(s.text) for s in collection.sentences]
        for i in range(1, len(sentences_length)):
            sentences_length[i] += sentences_length[i - 1] + 1

        sentence_by_id = {}

        for line in input_a_file.read_text(encoding="utf8").splitlines():
            lid, spans, label, _ = line.strip().split("\t")
            lid = int(lid)

            spans = [s.split() for s in spans.split(";")]
            spans = [(int(start), int(end)) for start, end in spans]

            # find the sentence where this annotation is
            i = bisect.bisect(sentences_length, spans[0][0])
            # correct the annotation spans
            if i > 0:
                spans = [
                    (
                        start - sentences_length[i - 1] - 1,
                        end - sentences_length[i - 1] - 1,
                    )
                    for start, end in spans
                ]
                spans.sort(key=lambda t: t[0])
            # store the annotation in the corresponding sentence
            the_sentence = collection.sentences[i]
            keyphrase = Keyphrase(the_sentence, label, lid, spans)
            the_sentence.keyphrases.append(keyphrase)

            if len(keyphrase.spans) == 1:
                keyphrase.split()

            sentence_by_id[lid] = the_sentence

        return sentence_by_id
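
The offset arithmetic is easiest to see on a small case: cumulative sentence lengths (plus one character per separator) mark where each sentence ends in the whole document, bisect finds the sentence containing a document-level offset, and subtracting the previous cumulative length localizes the span. A self-contained sketch with made-up sentences:

import bisect

sentences = ["El asma es cronica.", "No se cura."]  # made-up document
lengths = [len(s) for s in sentences]
for i in range(1, len(lengths)):
    lengths[i] += lengths[i - 1] + 1  # +1 for the separator character

start = 23  # document-level offset of "se" in the second sentence
i = bisect.bisect(lengths, start)  # -> 1
local = start - lengths[i - 1] - 1 if i > 0 else start
print(sentences[i][local:local + 2])  # se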
Example #6
    def run(self, collection, taskA, taskB):
        gold_keyphrases, gold_relations = self.model

        if taskA:
            next_id = 0
            for gold_keyphrase, label in gold_keyphrases.items():
                for sentence in collection.sentences:
                    text = sentence.text.lower()
                    pattern = r"\b" + gold_keyphrase + r"\b"
                    for match in re.finditer(pattern, text):
                        keyphrase = Keyphrase(sentence, label, next_id,
                                              [match.span()])
                        keyphrase.split()
                        next_id += 1

                        sentence.keyphrases.append(keyphrase)

        if taskB:
            # try every ordered pair of keyphrases in the sentence and keep
            # the pairs whose surface forms and labels match a gold relation
            for sentence in collection.sentences:
                for origin in sentence.keyphrases:
                    origin_text = origin.text.lower()
                    for destination in sentence.keyphrases:
                        destination_text = destination.text.lower()
                        try:
                            label = gold_relations[origin_text, origin.label,
                                                   destination_text,
                                                   destination.label]
                        except KeyError:
                            continue
                        relation = Relation(sentence, origin.id,
                                            destination.id, label)
                        sentence.relations.append(relation)

                sentence.remove_dup_relations()

        return collection
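
For taskB the "model" is just a dictionary keyed by the two surface forms and the two entity labels, and the try/except treats a missing key as "no relation". A toy lookup with made-up entries:

gold_relations = {("asma", "Concept", "cronica", "Concept"): "is-a"}  # made up

key = ("asma", "Concept", "cronica", "Concept")
print(gold_relations.get(key, "<no relation>"))  # is-a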
Example #7
    def run_taskA_for_label(self, collection: Collection, entity_label: str,
                            *args, **kwargs):
        model = self.taskA_models[entity_label]
        print(f"Building dataset for {entity_label} ...")
        dataset = BILUOVSentencesDS([s.text for s in collection.sentences],
                                    language=self.nlp)
        print(f"Done!")

        with torch.no_grad():
            for sid, (*s_features, _) in tqdm(
                    enumerate(dataset.shallow_dataloader()),
                    total=len(dataset),
                    desc=entity_label,
            ):
                tokensxsentence = dataset.tokensxsentence[sid]
                output = model(s_features)
                output = model.decode(output)
                # map predicted label indices to BILUOV tags, then decode
                # the tags into entity spans over the sentence tokens
                labels = [dataset.labels[x] for x in output]
                decoded = from_biluov(labels, tokensxsentence, spans=True)

                sentence = collection.sentences[sid]
                for spans in decoded:
                    # id=-1 is a placeholder (reassigned later, cf. fix_ids)
                    keyphrase = Keyphrase(sentence, entity_label, -1, spans)
                    sentence.keyphrases.append(keyphrase)
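
from_biluov itself does not appear in these examples. A simplified stand-in, assuming plain BILOU semantics (the project-specific V tag is ignored here), shows the span grouping that Examples #2, #4 and #7 rely on:

def biluo_to_spans(labels, spans):
    """Group token spans into one span-list per decoded entity."""
    entities, current = [], []
    for tag, span in zip(labels, spans):
        if tag == "U":                    # single-token entity
            entities.append([span])
        elif tag == "B":                  # open a new entity
            current = [span]
        elif tag in ("I", "L") and current:
            current.append(span)
            if tag == "L":                # L closes the entity
                entities.append(current)
                current = []
        else:                             # O or ill-formed tag: drop partials
            current = []
    return entities

print(biluo_to_spans(["B", "L", "O", "U"],
                     [(0, 2), (3, 7), (8, 10), (11, 15)]))
# [[(0, 2), (3, 7)], [(11, 15)]]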