Example No. 1
def convert_file(
        input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
        output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])

    header = True
    with open(input_path, "r") as in_f:
        for line in tqdm(in_f):
            if header:  # skip the CSV header row
                header = False
                continue
            sentence, tokens = pd.read_csv(StringIO(line),
                                           header=None,
                                           usecols=[0, 1]).values[0]
            # the token column holds a Python list literal; ast.literal_eval
            # parses it without the risks of eval()
            tokens = ast.literal_eval(tokens)
            eg = line_to_dict(sentence, tokens)

            if eg["answer"] != "accept":
                continue
            tokens = [token["text"] for token in eg["tokens"]]
            words, spaces = get_words_and_spaces(tokens, eg["text"])
            doc = Doc(nlp.vocab, words=words, spaces=spaces)
            doc.ents = [
                doc.char_span(s["start"], s["end"], label=s["label"])
                for s in eg.get("spans", [])
            ]
            doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path}")
Example No. 2
def convert(lang: str, input_path: Path, training_path: Path,
            validation_path: Path):
    nlp = spacy.blank(lang)
    db_train = DocBin()
    db_test = DocBin()

    df = pd.read_csv(input_path)
    df_si = df[df.is_humor > 0]
    train_si = df_si.sample(frac=0.8, random_state=31416)
    test_si = df_si.drop(train_si.index)

    df_no = df[df.is_humor == 0]
    train_no = df_no.sample(frac=0.8, random_state=31416)
    test_no = df_no.drop(train_no.index)

    db_train = genera(nlp, train_si.text, {
        'humor': 1.0,
        'no_humor': 0.0
    }, db_train)
    db_train = genera(nlp, train_no.text, {
        'humor': 0.0,
        'no_humor': 1.0
    }, db_train)
    db_train.to_disk(training_path)

    db_test = genera(nlp, test_si.text, {
        'humor': 1.0,
        'no_humor': 0.0
    }, db_test)
    db_test = genera(nlp, test_no.text, {
        'humor': 0.0,
        'no_humor': 1.0
    }, db_test)
    db_test.to_disk(validation_path)
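
The genera() helper used above is not shown in the snippet. A minimal sketch of what it presumably does, assuming it tokenizes each text with the blank pipeline, attaches the given category scores, and appends the resulting docs to the DocBin it receives:

def genera(nlp, texts, cats, db):
    # Hypothetical reconstruction of the helper: one Doc per text,
    # the same cats dict copied onto each, everything added to db.
    for doc in nlp.pipe(texts):
        doc.cats = dict(cats)
        db.add(doc)
    return db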
Example No. 3
def convert(json_path, output):
    db = DocBin()
    for line in srsly.read_jsonl(json_path):
        doc = nlp.make_doc(line["text"])
        doc.cats = line["cats"]
        db.add(doc)
    db.to_disk(output)
Example No. 4
def convert(lang: str, input_path: Path, output_path: Path):
    nlp = spacy.blank(lang)
    db = DocBin()
    for line in srsly.read_jsonl(input_path):
        doc = nlp.make_doc(line["text"])
        doc.cats = line["cats"]
        db.add(doc)
    db.to_disk(output_path)
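
Once a converter like this has written the .spacy file, the documents can be loaded back with the same DocBin API for a quick sanity check (a small sketch; the file name and language are assumptions):

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")  # only the vocab is needed to deserialize
db = DocBin().from_disk("train.spacy")  # assumed output of the converter above
for doc in db.get_docs(nlp.vocab):
    print(doc.text[:60], doc.cats)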
Example No. 5
def convert(output_path):
    global nlp
    db = DocBin()
    for line in srsly.read_jsonl("db.json"):
        doc = nlp.make_doc(line["text"])
        doc.cats = line["cats"]
        db.add(doc)
    db.to_disk(output_path)
Example No. 6
    def generate_corpus(nlp):
        directory_path = path.join('data')
        
        corpus_path = Path(path.join(directory_path, file_name) + ".spacy")
        raw_path = Path(path.join(directory_path, file_name) + ".jsonl")

        if exists(corpus_path):
            return Corpus(corpus_path)(nlp)

        vulnerabilities = []
        with open(raw_path) as file:
            for line in file.readlines():
                vulnerability = loads(line)

                vulnerabilities.append({'description': vulnerability['data'], 'entities': vulnerability.get('label', [])})
                

        corpus = DocBin(attrs=["TAG", "ENT_IOB", "ENT_TYPE", "POS"])

        for vulnerability in vulnerabilities:
            document = nlp.make_doc(vulnerability['description'].lower())

            # Note: doccano annotates labels at the character level, but
            # nlp.make_doc produces tokens, so the character offsets are
            # converted to BILUO tags and then to token-aligned spans.
            tags = offsets_to_biluo_tags(document, vulnerability['entities'])
            entities = biluo_tags_to_spans(document, tags)
            document.set_ents(entities)

            # Debug output: print every recognized entity.
            for ent in document.ents:
                print(ent.label_)
                print(ent.text)
                print('\n')

            print('\nOK\n')
            corpus.add(document)
            
        print(len(corpus))
        print(list(corpus.get_docs(nlp.vocab)))
        corpus.to_disk(corpus_path)
    
        if exists(corpus_path):
            return Corpus(corpus_path)(nlp)
Example No. 7
def make_docbin(user_key, language=None):
    docbin = DocBin(store_user_data=True)
    serial = make_serial(user_key, language='??')
    file_key = file_key_from_principal_key(principal_key=user_key,
                                           serial=serial,
                                           language='ll',
                                           principal_type='u')
    path = path_from_file_key(file_key)
    docbin.to_disk(path)
    return file_key, docbin
Example No. 8
def convert(lang: str, input_path: Path, output_path: Path):
    nlp = spacy.blank(lang)
    in_db = DocBin().from_disk(input_path)
    out_db = DocBin()
    logging.info(f"Read {len(in_db)} documents from {input_path}.")
    for doc in in_db.get_docs(nlp.vocab):
        new_doc = nlp.make_doc(doc.text)
        new_doc.user_data = doc.user_data
        new_doc.ents = doc.ents
        out_db.add(new_doc)
    out_db.to_disk(output_path)
Example No. 9
def main(
        input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
        output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin()
    data_tuples = ((eg["text"], eg) for eg in srsly.read_jsonl(input_path))
    for doc, eg in nlp.pipe(data_tuples, as_tuples=True):
        # doc.cats = {category: 0 for category in CATEGORIES}
        doc.cats[eg["label"]] = 1
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")
Example No. 10
def prepare_data(
    params: Params,
    verbose: bool = True,
) -> Dict[str, Doc]:
    """
    Return a single spaCy doc for each age.

    Warning: if the corpus binary is not on disk already, it will be saved to disk.
    This means the corpus should never be modified - otherwise the saved binary would also contain the unexpected modifications.
    """

    # try loading transcripts from disk
    fn = params.corpus_name + '.spacy'
    bin_path = configs.Dirs.corpora / fn
    if bin_path.exists():
        doc_bin = DocBin().from_disk(bin_path)
        docs = list(doc_bin.get_docs(nlp.vocab))
    # load raw transcripts + process them
    else:
        print(
            f'WARNING: Did not find binary file associated with {params.corpus_name}. Preprocessing corpus...'
        )
        transcripts = load_transcripts(params)
        docs: List[Doc] = [doc for doc in nlp.pipe(transcripts)]
        # WARNING: only save to disk if we know that corpus has not been modified
        doc_bin = DocBin(docs=docs)
        doc_bin.to_disk(bin_path)

    # group docs by age
    ages = load_ages(params)
    if len(ages) != len(docs):
        raise RuntimeError(f'Num docs={len(docs)} does not match num ages={len(ages)}')
    age2docs = {}
    for age in SortedSet(ages):
        if age == EXCLUDED_AGE:
            continue
        docs_at_age = [docs[n] for n, ai in enumerate(ages) if ai == age]
        age2docs[age] = docs_at_age
        if verbose:
            print(
                f'Processed {len(age2docs[age]):>6} transcripts for age={age}')

    # combine all documents at same age
    age2doc = {}
    for age, docs in age2docs.items():

        doc_combined = Doc.from_docs(docs)
        age2doc[age] = doc_combined
        print(f'Num tokens at age={age} is {len(doc_combined):,}')

    return age2doc
Example No. 11
def write_spacy_docs(
    data: Doc | Iterable[Doc],
    filepath: types.PathLike,
    *,
    make_dirs: bool = False,
    format: str = "binary",
    attrs: Optional[Iterable[str]] = None,
    store_user_data: bool = False,
) -> None:
    """
    Write one or more ``Doc`` s to disk at ``filepath`` in binary or pickle format.

    Args:
        data: A single ``Doc`` or a sequence of ``Doc`` s to write to disk.
        filepath: Path to file on disk to which data will be written.
        make_dirs: If True, automatically create (sub)directories
            if not already present in order to write ``filepath``.
        format ({"pickle", "binary"}): Format of the data written to disk.
            If "binary", uses :class:`spacy.tokens.DocBin` to serialie data;
            if "pickle", uses python's stdlib ``pickle``.

            .. warning:: When writing docs in pickle format, all the docs in ``data``
               must be saved as a list, which means they're all loaded into memory.
               Mind your RAM usage, especially when writing many docs!

        attrs: List of attributes to serialize if ``format`` is "binary". If None,
            spaCy's default values are used; see here: https://spacy.io/api/docbin#init
        store_user_data: If True, write :attr:`Doc.user_data` and the values of custom
            extension attributes to disk; otherwise, don't.

    Raises:
        ValueError: if format is not "binary" or "pickle"
    """
    if isinstance(data, Doc):
        data = [data]
    if format == "binary":
        kwargs = {"docs": data, "store_user_data": store_user_data}
        if attrs is not None:
            kwargs["attrs"] = list(attrs)
        docbin = DocBin(**kwargs)
        docbin.to_disk(filepath)
    elif format == "pickle":
        if store_user_data is False:
            data = _clear_docs_user_data(data)
        with io_utils.open_sesame(filepath, mode="wb",
                                  make_dirs=make_dirs) as f:
            pickle.dump(list(data), f, protocol=-1)
    else:
        raise ValueError(
            errors.value_invalid_msg("format", format, {"binary", "pickle"}))
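
A short usage sketch for the function above, writing the same docs once in binary and once in pickle format (file names are assumptions; the helper modules it references are expected to be available as in its original package):

import spacy

nlp = spacy.blank("en")
docs = [nlp("First document."), nlp("Second document.")]

# Binary: serialized through spacy.tokens.DocBin.
write_spacy_docs(docs, "docs.spacy", format="binary")

# Pickle: the docs are materialized as a list in memory before dumping.
write_spacy_docs(docs, "docs.pkl", format="pickle", make_dirs=True)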
Example No. 12
 def convert(input_path, output_path, lang='en'):
     nlp = spacy.blank(lang)
     db = DocBin()
     for text, annot in srsly.read_json(input_path):
         doc = nlp.make_doc(text)
         ents = []
         for start, end, label in annot["entities"]:
             span = doc.char_span(start, end, label=label)
             if span is None:
                 print("Skipping entity")
             else:
                 ents.append(span)
         doc.ents = ents
         db.add(doc)
     db.to_disk(output_path)
Example No. 13
def convert(lang: str, input_path: Path, output_path: Path):
    nlp = spacy.blank(lang)
    db = DocBin()
    for text, annot in srsly.read_json(input_path):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label)
            if span is None:
                msg = f"Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' does not align with token boundaries:\n\n{repr(text)}\n"
                warnings.warn(msg)
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    db.to_disk(output_path)
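
Instead of skipping misaligned character spans, char_span can also be asked to snap to token boundaries via alignment_mode (used in several of the later examples). A small illustration of the three modes on made-up offsets:

import spacy

nlp = spacy.blank("en")
doc = nlp.make_doc("Add the chopped onions.")
# Characters 8-12 cover only "chop", i.e. part of the token "chopped".
print(doc.char_span(8, 12, label="ACTION"))                            # None: default "strict" alignment
print(doc.char_span(8, 12, label="ACTION", alignment_mode="expand"))   # "chopped": grown to token boundaries
print(doc.char_span(8, 12, label="ACTION", alignment_mode="contract")) # None: no token lies fully inside the range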
Example No. 14
def removefrom_docbin(file_key, obj_type, obj_id):
    file_key, docbin = get_docbin(file_key=file_key)
    language = language_from_file_key(file_key)
    model = settings.LANGUAGE_MODELS[language]
    index = i = 0
    docs = []
    for doc in list(docbin.get_docs(model.vocab)):
        if doc._.obj_type == obj_type and doc._.obj_id == obj_id:
            index = i
        else:
            docs.append(doc)
        i += 1
    delete_docbin(file_key)
    docbin = DocBin(docs=docs, store_user_data=True)
    path = path_from_file_key(file_key)
    docbin.to_disk(path)
    return index
Example No. 15
def main(
        input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
        output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    for eg in srsly.read_json(input_path):
        print(eg)

        doc = nlp(eg[0])
        doc.ents = [
            doc.char_span(s[0], s[1], label=s[2])
            for s in eg[1].get("entities", [])
        ]
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")
Example No. 16
    def to_spacy(self, df, file_path=None):
        """
        Function to convert the dataframe returned by the annotator into a spaCy DocBin.

        Parameters
        ----------
        df (pandas DataFrame): Dataframe returned by the annotator (see Annotate()).
        file_path (str): Filepath (including filename) to save the .spacy file to.
        
        Returns
        -------
        The spaCy DocBin, in case the user wants to combine it with additional training data.
        """

        if (not isinstance(df, pd.DataFrame)):
            raise TypeError("Pass the pandas dataframe returned by annotate()")

        if file_path and (not isinstance(file_path, str)):
            raise TypeError("The file_path must be a string or None")

        if file_path is None:
            file_path = os.path.join(os.getcwd(), 'annotations.spacy')

        db = DocBin()
        training_data = [ant for ant in df['annotations'].tolist() if ant]
        for text, annotations in training_data:

            ents = []
            doc = self.nlp(text)
            for start, end, label in annotations['entities']:

                span = doc.char_span(start, end, label=label)
                ents.append(span)

            # Drop overlapping spans. Note: when spans overlap, the (first) longest span is preferred over shorter spans.
            # See: https://spacy.io/api/top-level#util.filter_spans
            # TODO: alert users that some spans have been dropped.
            doc.ents = filter_spans(ents)

            db.add(doc)

        db.to_disk(file_path)
        print(f"Spacy file saved to: {file_path}")

        return db
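
The docstring notes that the returned DocBin can be combined with additional training data; DocBin.merge does exactly that. A small standalone sketch (texts and file name are made up):

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
db_first = DocBin(docs=[nlp.make_doc("First annotation batch.")])
db_second = DocBin(docs=[nlp.make_doc("Second annotation batch.")])
db_first.merge(db_second)      # extend db_first with db_second's docs
db_first.to_disk("combined_annotations.spacy")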
Example No. 17
def descrip_to_spacy(data: pd.DataFrame, output_path: str) -> None:
    "Takes in dataframe with description and label to save DocBin to disk"
    tuples = data.apply(lambda row:
                        (strip_html_tags(row["description"]), row["fraud"]),
                        axis=1).to_list()
    nlp = spacy.blank("en")
    db = DocBin()

    for doc, label in nlp.pipe(tuples, as_tuples=True):
        if label:
            doc.cats["FRAUD"] = 1
            doc.cats["NOTFRAUD"] = 0
        else:
            doc.cats["FRAUD"] = 0
            doc.cats["NOTFRAUD"] = 1

        db.add(doc)

    db.to_disk(output_path)
Example No. 18
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-i',
        '--input',
        required=True,
        help=
        'The path to the input dataset to convert to SpaCy\'s binary format.')
    parser.add_argument(
        '-o',
        '--output',
        required=True,
        help=
        'The path where the converted dataset will be written in SpaCy\'s binary format.')
    parser.add_argument(
        '-c',
        '--categories',
        required=True,
        help='The path to the .json file which contains the categories.')
    args = parser.parse_args()

    # Read the .json file which contains the list of categories
    categories_dict = _read_categories(path_to_file=args.categories)

    # Define an empty SpaCy pipeline for English language
    nlp = spacy.blank('en')

    # Read and parse the sentences with their labels
    records = read_tsv_file(args.input)

    # Convert the (sentence, label) pairs to SpaCy Doc object
    docs = [
        convert_record(nlp, record_dict, categories_dict)
        for record_dict in records
    ]

    # Create the SpaCy's data structure that contains the SpaCy's Doc(s)
    doc_bin = DocBin(docs=docs)

    # Save it as .spacy file format
    doc_bin.to_disk(args.output)
    print('INFO: saved {} in the .spacy binary format [{} documents].'.format(
        args.input.split('/')[1].split('.')[0], len(docs)))
Example No. 19
def convert_file(
        input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
        output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    for eg in tqdm(srsly.read_jsonl(input_path)):
        if eg["answer"] != "accept":
            continue
        tokens = [token["text"] for token in eg["tokens"]]
        words, spaces = get_words_and_spaces(tokens, eg["text"])
        doc = Doc(nlp.vocab, words=words, spaces=spaces)
        doc.ents = [
            doc.char_span(s["start"], s["end"], label=s["label"])
            for s in eg.get("spans", [])
        ]
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")
Example No. 20
def main(
        input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
        output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    for idx, eg in enumerate(srsly.read_jsonl(input_path)):
        if idx % 10000 == 0:
            print(f"converted {idx} sentences")
        doc = nlp(eg["text"])
        spans_from_json = eg.get("spans", [])
        spans_objects = [
            doc.char_span(s["start"], s["end"], label=s["label"])
            for s in spans_from_json
        ]
        spans_objects = filter_spans(spans_objects)
        doc.ents = spans_objects
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")
Example No. 21
def main(input_path: Path = typer.Argument(..., exists=True, dir_okay=False)):
    print("Read params.yaml...")
    with open("params.yaml", "r") as fd:
        params = yaml.safe_load(fd)
    dev_size = params["train"]["corpora"]["dev_size"]
    shuffle_seed = params["train"]["corpora"]["shuffle_seed"]
    print(f"...read dev_size={dev_size}, shuffle_seed={shuffle_seed}")

    print("Read annotations...")
    corpus = list(srsly.read_jsonl(input_path))
    print(f"...read {len(corpus)} texts")

    print("Convert into documents...")
    docs = []
    nlp = spacy.blank("en")
    for eg in corpus:
        if eg["answer"] != "accept":
            continue
        tokens = [token["text"] for token in eg["tokens"]]
        words, spaces = get_words_and_spaces(tokens, eg["text"])
        doc = Doc(nlp.vocab, words=words, spaces=spaces)
        doc.ents = [
            doc.char_span(s["start"], s["end"], label=s["label"])
            for s in eg.get("spans", [])
        ]
        docs.append(doc)
    print(f"...converted {len(docs)} documents")

    print("Split into train and dev...")
    train, dev = train_test_split(docs,
                                  test_size=dev_size,
                                  random_state=shuffle_seed,
                                  shuffle=True)
    print(f"...split into {len(train)} train and {len(dev)} dev documents")

    print("Write serialized documents...")
    for split, data in [("train", train), ("dev", dev)]:
        output_path = input_path.with_suffix(f".{split}.spacy")
        doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"], docs=data)
        doc_bin.to_disk(output_path)
        print(f"...wrote {output_path}")
Example No. 22
def generateSpacyFiles(training_data):
    filenames = ['./train.spacy', './evaluation.spacy']
    for i in range(len(filenames)):
        nlp = spacy.blank('de')  # load a new spacy model
        db = DocBin()  # create a DocBin object
        for text, annot in tqdm(training_data[i]):  # data in previous format
            doc = nlp.make_doc(text)  # create doc object from text
            ents = []
            for start, end, label in annot[
                    "entities"]:  # add character indexes
                span = doc.char_span(start,
                                     end,
                                     label=label,
                                     alignment_mode="contract")
                if span is None:
                    print("Skipping entity")
                else:
                    ents.append(span)
            doc.ents = ents  # label the text with the ents
            db.add(doc)
        db.to_disk(filenames[i])  # save the docbin object
Example No. 23
def df_to_spacy(df, outfile, model='en_core_web_md'):
    """ Convert a dataframe into a .spacy training file """
    nlp = spacy.load(model)
    # nlp = spacy.blank("en")
    db = DocBin()  # create a DocBin object
    for index, row in df.iterrows():
        doc = nlp.make_doc(row['data'])  # create doc object from text
        ents = []
        for start, end, label in row['label']:  # add character indexes
            span = doc.char_span(start,
                                 end,
                                 label=label,
                                 alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        doc.ents = ents  # label the text with the ents
        db.add(doc)
    db.to_disk(outfile)  # save the docbin object
    print(f'Successfully wrote \'{outfile}\' to disk')
Example No. 24
 def build_training_file(self):
     # TODO: divide into training/dev
     nlp = spacy.blank('en')
     doc_bin = DocBin()
     annotated_text = self._db.get_training_corpus()
     for text, entities in annotated_text:
         doc = nlp.make_doc(text)
         ents = list()
         for start, stop, label in entities:
             span = doc.char_span(start,
                                  stop,
                                  label=label,
                                  alignment_mode="contract")
             if span is None:
                 log.debug(f'{label} entity from {start} to {stop} was not ' + \
                            'valid and was discarded.')
             else:
                 ents.append(span)
         doc.ents = ents
         doc_bin.add(doc)
     # TODO: remove underscore when dev data is available
     data_uri = f'{self._OUT_DIR}/_train.spacy'
     log.info(f'Now saving training file to disk at {data_uri}')
     doc_bin.to_disk(data_uri)
Example No. 25
def main(json_loc: Path,
         train_file: Path,
         dev_file: Path,
         test_file: Path,
         test_split=0.189,
         train_split=0.709):
    """Creating the corpus from the Prodigy annotations."""
    Doc.set_extension("rel", default={})
    vocab = Vocab()

    docs = {"train": [], "dev": [], "test": []}
    ids = {"train": set(), "dev": set(), "test": set()}
    count_all = {"train": 0, "dev": 0, "test": 0}
    count_pos = {"train": 0, "dev": 0, "test": 0}

    long_rel_count = 0  #how many relations span more tokens than allowed
    error_count_rel = 0  #how often the label is something other than ARG0, ARG1, ARG

    with json_loc.open("r", encoding="utf8") as jsonfile:
        length_training_data = len([
            True for line in jsonfile if json.loads(line)["answer"] == "accept"
        ])
        msg.info(f"Number of accepted recipes: {length_training_data}")

    with json_loc.open("r", encoding="utf8") as jsonfile:
        for line in jsonfile:
            example = json.loads(line)  #one recipe
            span_starts = set()

            if example["answer"] == "accept":
                neg = 0
                pos = 0
                try:
                    # Parse the tokens -> example["tokens"] = list of dicts
                    words = [t["text"] for t in example["tokens"]
                             ]  #list containing all words
                    spaces = [
                        t["ws"] for t in example["tokens"]
                    ]  #list indicating whether whitespace follows each word (ws = True/False)
                    doc = Doc(vocab, words=words, spaces=spaces)

                    # Parse the entities
                    spans = example[
                        "spans"]  #list of dicts containing entities
                    entities = []
                    span_end_to_start = {}
                    ents_dict = {}
                    for span in spans:  #every detected span
                        entity = doc.char_span(
                            span["start"], span["end"], label=span["label"]
                        )  #"start" = wievielter character ist start character des spans im doc
                        span_end_to_start[span["token_end"]] = span[
                            "token_start"]  #end_token of span as key for start_token (start token = wievielter token in doc)
                        entities.append(entity)  #appended to list
                        span_starts.add(span["token_start"])  #added to set
                        ents_dict[span["token_start"]] = (span["label"],
                                                          span["token_start"])
                    doc.ents = entities  #entity list assigned as doc entities

                    # Parse the relations
                    rels = {}

                    # create token combinations
                    for x1 in span_starts:

                        #VERBS_TO_OTHER 1a
                        if VERBS_TO_OTHER == True:
                            if ents_dict[x1][0] == "V":  #filter entity type
                                for x2 in span_starts:
                                    if ents_dict[x2][0] in [
                                            "Z", "TOOL", "ATTR", "TEMP",
                                            "DAUER", "ZEITP", "PRÄP"
                                    ]:  #filter entity type

                                        #DIFF_FRONT_BACK 1a
                                        if DIFF_FRONT_BACK == True:

                                            if ((x1 - x2) >= 0 and
                                                (x1 - x2) <= BACK) or (
                                                    (x1 - x2) < 0 and
                                                    (x1 - x2) >= FRONT * -1):
                                                rels[(x1, x2)] = {}

                                            else:
                                                pass
                                        #DIFF_FRONT_BACK 1b
                                        else:
                                            if abs(
                                                    ents_dict[x1][1] -
                                                    ents_dict[x2][1]
                                            ) <= TOKEN_LENGTH:  #filter token distance (match with config?)
                                                rels[(x1, x2)] = {
                                                }  #every possible span combination becomes key for individual dict (1,1), (1,2) ...
                        #VERBS_TO_OTHER 1b
                        else:
                            for x2 in span_starts:
                                #DIFF_FRONT_BACK 2a
                                if DIFF_FRONT_BACK == True:

                                    if ((x1 - x2) >= 0 and
                                        (x1 - x2) <= BACK) or (
                                            (x1 - x2) < 0 and
                                            (x1 - x2) >= FRONT * -1):
                                        rels[(x1, x2)] = {}

                                    else:
                                        pass
                                #DIFF_FRONT_BACK 2b
                                else:
                                    if abs(
                                            ents_dict[x1][1] - ents_dict[x2][1]
                                    ) <= TOKEN_LENGTH:  #filter token distance (match with config?)
                                        rels[(x1, x2)] = {
                                        }  #every possible span combination becomes key for individual dict (1,1), (1,2) ...

                    relations = example[
                        "relations"]  #relations is list of dict
                    for relation in relations:
                        # the 'head' and 'child' annotations refer to the end token in the span
                        # but we want the first token
                        start = span_end_to_start[relation[
                            "head"]]  #index of the start token of the head span
                        end = span_end_to_start[relation[
                            "child"]]  #index of the start token of the child span
                        label = relation["label"]

                        #DETAILED_ARGS 1a
                        if DETAILED_ARGS == True:
                            if label == "ARG0":
                                if ents_dict[end][0] not in ["Z", "TOOL"]:
                                    label = MAP_LABELS_ARG[ents_dict[end][0]]
                                else:
                                    label = MAP_LABELS_ARG0[ents_dict[end][
                                        0]]  #assign new label based on span type
                            elif label == "ARG1":
                                if ents_dict[end][0] not in ["Z", "TOOL"]:
                                    label = MAP_LABELS_ARG[ents_dict[end][0]]
                                else:
                                    label = MAP_LABELS_ARG1[ents_dict[end][0]]
                            elif label == "ARG":
                                if ents_dict[end][0] in ["Z", "TOOL"]:
                                    if ents_dict[end][0] == "Z":
                                        label = "Arg0Z"
                                    elif ents_dict[end][0] == "TOOL":
                                        label = "Arg1Tool"
                                else:
                                    label = MAP_LABELS_ARG[ents_dict[end][0]]
                            else:
                                error_count_rel += 1

                        #DETAILED_ARGS 1b
                        else:
                            label = MAP_LABELS_STANDARD[
                                label]  #MAP_LABELS = dict containing label as key

                        # Positive relations are being added
                        try:
                            if label not in rels[(
                                    start, end
                            )]:  #check if label already exists for token combination
                                rels[(
                                    start, end
                                )][label] = 1.0  #initialize label as new key with value 1.0
                                pos += 1  #positive case
                        except KeyError:
                            # relation exists in the annotation but isn't a valid token
                            # combination (too long / not starting from a verb)
                            long_rel_count += 1

                    # The annotation is complete, so fill in zero's where the data is missing
                    for x1 in span_starts:

                        #VERBS_TO_OTHER 2a
                        if VERBS_TO_OTHER == True:
                            if ents_dict[x1][0] == "V":  #filter entity type
                                for x2 in span_starts:
                                    if ents_dict[x2][0] in [
                                            "Z", "TOOL", "ATTR", "TEMP",
                                            "DAUER", "ZEITP", "PRÄP"
                                    ]:  #filter entity type

                                        #DIFF_FRONT_BACK 2a
                                        if DIFF_FRONT_BACK == True:
                                            if ((x1 - x2) >= 0 and
                                                (x1 - x2) <= BACK) or (
                                                    (x1 - x2) < 0 and
                                                    (x1 - x2) >= FRONT * -1):
                                                #DETAILED_ARGS 2a
                                                if DETAILED_ARGS == True:
                                                    merged_labels = list(
                                                        MAP_LABELS_ARG0.values(
                                                        )) + list(
                                                            MAP_LABELS_ARG1.
                                                            values()) + list(
                                                                MAP_LABELS_ARG.
                                                                values())
                                                    for label in merged_labels:
                                                        if label not in rels[(
                                                                x1, x2
                                                        )]:  #if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(
                                                                x1, x2
                                                            )][label] = 0.0
                                            #DETAILED_ARGS 2b
                                                else:
                                                    for label in MAP_LABELS_STANDARD.values(
                                                    ):  #for every label
                                                        if label not in rels[(
                                                                x1, x2
                                                        )]:  #if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(
                                                                x1, x2
                                                            )][label] = 0.0

                                        #DIFF_FRONT_BACK 2b
                                        else:
                                            if abs(
                                                    ents_dict[x1][1] -
                                                    ents_dict[x2][1]
                                            ) <= TOKEN_LENGTH:  #filter token distance (match with config?)
                                                #DETAILED_ARGS 3a
                                                if DETAILED_ARGS == True:
                                                    merged_labels = list(
                                                        MAP_LABELS_ARG0.values(
                                                        )) + list(
                                                            MAP_LABELS_ARG1.
                                                            values()) + list(
                                                                MAP_LABELS_ARG.
                                                                values())
                                                    for label in merged_labels:
                                                        if label not in rels[(
                                                                x1, x2
                                                        )]:  #if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(
                                                                x1, x2
                                                            )][label] = 0.0
                                                #DETAILED_ARGS 3b
                                                else:
                                                    for label in MAP_LABELS_STANDARD.values(
                                                    ):  #for every label
                                                        if label not in rels[(
                                                                x1, x2
                                                        )]:  #if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(
                                                                x1, x2
                                                            )][label] = 0.0  #span combination with label as key gets 0 as value
                        #VERBS_TO_OTHER 2b
                        else:
                            for x2 in span_starts:
                                #DIFF_FRONT_BACK 3a
                                if DIFF_FRONT_BACK == True:
                                    if ((x1 - x2) >= 0 and
                                        (x1 - x2) <= BACK) or (
                                            (x1 - x2) < 0 and
                                            (x1 - x2) >= FRONT * -1):
                                        #DETAILED_ARGS 4a
                                        if DETAILED_ARGS == True:
                                            merged_labels = list(
                                                MAP_LABELS_ARG0.values()
                                            ) + list(MAP_LABELS_ARG1.values(
                                            )) + list(MAP_LABELS_ARG.values())
                                            for label in merged_labels:
                                                if label not in rels[(
                                                        x1, x2
                                                )]:  #if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0
                                    #DETAILED_ARGS 4b
                                        else:
                                            for label in MAP_LABELS_STANDARD.values(
                                            ):  #for every label
                                                if label not in rels[(
                                                        x1, x2
                                                )]:  #if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0

                                #DIFF_FRONT_BACK 3b
                                else:
                                    if abs(
                                            ents_dict[x1][1] - ents_dict[x2][1]
                                    ) <= TOKEN_LENGTH:  #filter token distance (match with config?)
                                        #DETAILED_ARGS 5a
                                        if DETAILED_ARGS == True:
                                            merged_labels = list(
                                                MAP_LABELS_ARG0.values()
                                            ) + list(MAP_LABELS_ARG1.values(
                                            )) + list(MAP_LABELS_ARG.values())
                                            for label in merged_labels:
                                                if label not in rels[(
                                                        x1, x2
                                                )]:  #if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0
                                        #DETAILED_ARGS 5b
                                        else:
                                            for label in MAP_LABELS_STANDARD.values(
                                            ):  #for every label
                                                if label not in rels[(
                                                        x1, x2
                                                )]:  #if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0

                    #print(rels)
                    doc._.rel = rels  # rels = {(1,1): {Arg0 : 1, Arg1 : 0, Arg : 0}, (1,2): {Arg0 : 0, ...}}

                    # only keeping documents with at least 1 positive case (if doc isn't annotated relations = empty list)
                    if pos > 0:

                        recipe_id = example["_input_hash"]

                        if len(docs["train"]) < round(
                                train_split * length_training_data):
                            ids["train"].add(recipe_id)
                            docs["train"].append(doc)
                            count_pos["train"] += pos
                            count_all["train"] += pos + neg
                        elif len(docs["test"]) < round(
                                test_split * length_training_data):
                            ids["test"].add(recipe_id)
                            docs["test"].append(doc)
                            count_pos["test"] += pos
                            count_all["test"] += pos + neg
                        else:
                            ids["dev"].add(recipe_id)
                            docs["dev"].append(doc)
                            count_pos["dev"] += pos
                            count_all["dev"] += pos + neg

                except KeyError as e:
                    msg.fail(
                        f"Skipping doc because of key error: {e} in {example['_input_hash']}"
                    )

    msg.info(
        f"{long_rel_count} relations have been cut because tokens are too far apart."
    )

    docbin = DocBin(docs=docs["train"], store_user_data=True)
    docbin.to_disk(train_file)
    msg.info(
        f"{len(docs['train'])} training recipes from {len(ids['train'])} unique recipes, "
        f"{count_pos['train']}/{count_all['train']} pos instances.")

    docbin = DocBin(docs=docs["dev"], store_user_data=True)
    docbin.to_disk(dev_file)
    msg.info(
        f"{len(docs['dev'])} dev recipes from {len(ids['dev'])} unique recipes, "
        f"{count_pos['dev']}/{count_all['dev']} pos instances.")

    docbin = DocBin(docs=docs["test"], store_user_data=True)
    docbin.to_disk(test_file)
    msg.info(
        f"{len(docs['test'])} test recipes from {len(ids['test'])} unique recipes, "
        f"{count_pos['test']}/{count_all['test']} pos instances.")
Example No. 26
    # nlp.pipe(texts) is way faster than running nlp(text) for each text.
    # as_tuples=True lets us pass (text, context) tuples: the first element is
    # treated as the text, the second is returned unchanged alongside the doc.

    #for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total = len(data)):
    for doc, label in nlp.pipe(data, as_tuples=True):
        # we need to set the (text)cat(egory) for each document
        doc.cats["positive"] = label

        # put them into a nice list
        docs.append(doc)

    return docs


# we are so far only interested in the first 5000 reviews
# this will keep the training time short.
# In practice take as much data as you can get.
# you can always reduce it to make the script even faster.
num_texts = 5000

# first we need to transform all the training data
train_docs = make_docs(train_data[:num_texts])
# then we save it in a binary file to disc
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("./data/train.spacy")

# repeat for validation data
valid_docs = make_docs(valid_data[:num_texts])
doc_bin = DocBin(docs=valid_docs)
doc_bin.to_disk("./data/valid.spacy")
Example No. 27
            'FALAR SOBRE SEMEAR': 0.0,
            'FALAR SOBRE ADA': 0.0,
            'MÚSICA': 0.0,
            'SOLETRAR': 0.0,
            'DANÇAR': 1.0
        })

    doc = nlp.make_doc(texto)
    doc.cats = dic.copy()
    #db.add(doc)

    arq.write("{\"text\":\"" + texto + "\",\"cats\":")
    arq.write(json.dumps(dic, ensure_ascii=False))
    arq.write("}\n")

    #print(doc.cats)

    #baseDeDadosFinal.append([texto, dic.copy()])

arq.close()

convert("db.json", "db.spacy")
db.to_disk("db")

#After running PLN.py, run the command lines below
#python -m spacy train config.conf --output training/ --paths.train db.spacy --paths.dev db.spacy --nlp.lang "pt" --gpu-id -1
#python -m spacy debug data config.conf --paths.train db --paths.dev db --nlp.lang "pt"
#python -m spacy init fill-config config.conf

#Reference site:
#https://towardsdatascience.com/sarcasm-text-classification-using-spacy-in-python-7cd39074f32e
Example No. 28
#create training set
nlp = spacy.blank("en") # load a new spacy model
db = DocBin() # create a DocBin object
for text, annot in tqdm(trainSet): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    for start, end, label in annot["entities"]: # add character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents # label the text with the ents
    db.add(doc)
db.to_disk("./train.spacy") # save the docbin object

#create validation set
nlp = spacy.blank("en") # load a new spacy model
db = DocBin() # create a DocBin object
for text, annot in tqdm(valSet): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    for start, end, label in annot["entities"]: # add character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents # label the text with the ents
    db.add(doc)
Example No. 29
train = pd.read_csv("./data/sentiment/norec_sentence/train.txt",
                    delimiter="\t",
                    header=None)  #type: ignore
dev = pd.read_csv("./data/sentiment/norec_sentence/dev.txt",
                  delimiter="\t",
                  header=None)  #type: ignore
test = pd.read_csv("./data/sentiment/norec_sentence/test.txt",
                   delimiter="\t",
                   header=None)  #type: ignore

for sid, (label, sent) in train.iterrows():
    doc = nlp(sent)
    doc.user_data["gold"] = label
    train_doc_bin.add(doc)
train_doc_bin.to_disk("./data/sentiment/norec_sentence/train.docbin")

for sid, (label, sent) in dev.iterrows():
    doc = nlp(sent)
    doc.user_data["gold"] = label
    dev_doc_bin.add(doc)
dev_doc_bin.to_disk("./data/sentiment/norec_sentence/dev.docbin")

for sid, (label, sent) in test.iterrows():
    doc = nlp(sent)
    doc.user_data["gold"] = label
    test_doc_bin.add(doc)
test_doc_bin.to_disk("./data/sentiment/norec_sentence/test.docbin")

##################################################################
# Weak supervision
Example No. 30
import json
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span, DocBin

with open("exercises/en/iphone.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# Add the patterns to the matcher
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]
matcher.add("GADGET", [pattern1, pattern2])
docs = []
for doc in nlp.pipe(TEXTS):
    matches = matcher(doc)
    spans = [
        Span(doc, start, end, label=match_id)
        for match_id, start, end in matches
    ]
    doc.ents = spans
    docs.append(doc)

doc_bin = DocBin(docs=docs)
doc_bin.to_disk("./train.spacy")