def __init__(self, nlp):
     self.nlp = nlp
     # self._endpoint = os.getenv("AZ_TA_FOR_HEALTH_ENDPOINT")
     stanza.download("en", package="mimic", processors={"ner": "i2b2"})
     self._analyzer = stanza.Pipeline(
         "en", package="mimic", processors={"ner": "i2b2"}
     )
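For reference, a minimal sketch of what the clinical pipeline built above produces; the standalone variable names and the sample sentence are illustrative, not part of the original class:

import stanza

# Assumes the English "mimic" package and i2b2 NER model were downloaded as above.
analyzer = stanza.Pipeline("en", package="mimic", processors={"ner": "i2b2"})
doc = analyzer("The patient was started on 40mg of lisinopril for hypertension.")
for ent in doc.entities:
    # the i2b2 models tag clinical concepts such as PROBLEM, TEST, and TREATMENT
    print(ent.text, ent.type)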
Example #2
 def __init__(self, lang='en'):
     import stanza
     try:
         self.pipeline = stanza.Pipeline(lang=lang, processors='tokenize', verbose=False, tokenize_no_ssplit=True)
     except Exception:
         stanza.download(lang=lang, resources_url='stanford')
         self.pipeline = stanza.Pipeline(lang=lang, processors='tokenize', verbose=False, tokenize_no_ssplit=True)
Example #3
 def _download_model(self) -> None:
     """Interface with the `stanza` model downloader."""
     if not self.interactive:
         if not self.silent:
             print(
                 f"CLTK message: Going to download required Stanza models to ``{self.model_path}`` ..."
             )  # pragma: no cover
         stanza.download(lang=self.stanza_code, package=self.treebank)
     else:
         print(  # pragma: no cover
             "CLTK message: This part of the CLTK depends upon the Stanza NLP library."
         )  # pragma: no cover
         dl_is_allowed = query_yes_no(
             f"CLTK message: Allow download of Stanza models to ``{self.model_path}``?"
         )  # type: bool
         if dl_is_allowed:
             stanza.download(lang=self.stanza_code, package=self.treebank)
         else:
             raise CLTKException(
                 f"Download of necessary Stanza model declined for '{self.language}'. Unable to continue with Stanza's processing."
             )
     # if file model still not available after attempted DL, then raise error
     if not file_exists(self.model_path):
         raise FileNotFoundError(
             "Missing required models for ``stanza`` at ``{0}``.".format(
                 self.model_path
             )
         )
Example #4
def test_spacy_stanza_german():
    lang = "de"
    stanza.download(lang)
    snlp = stanza.Pipeline(lang=lang)
    nlp = StanzaLanguage(snlp)
    with pytest.warns(UserWarning):
        doc = nlp("Auf dem Friedhof an der Straße Am Rosengarten")
Example #5
def create_model(vectors_loc=None,
                 lang=None,
                 stz=True,
                 vectors_name='fasttext',
                 max_items=-1):
    if lang is None or (lang == 'sv' and not stz):
        nlp = Swedish()
    elif not stz:
        nlp = spacy.blank(lang)
    elif stz:
        stanza.download(lang)
        snlp = stanza.Pipeline(lang=lang)
        nlp = StanzaLanguage(snlp)

    with open(vectors_loc, 'rb') as file_:
        logger.info("Reading file '{}'".format(vectors_loc))
        header = file_.readline()
        nr_row, nr_dim = header.split()  # the first line is the number of tokens and dimensions
        counter = 0
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            if counter % 100 == 0:
                logger.info(counter)
            if counter == max_items:
                break
            counter = counter + 1
            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            vector = np.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
        nlp.vocab.vectors.name = vectors_name  # give vectors a name
    return nlp
Example #6
def get_all_stanza_models() -> None:
    """Download language models, from the ``stanza`` project,
    that are supported by the CLTK or in scope. More here:
    `<https://stanfordnlp.github.io/stanza/models.html>_.

    TODO: Use CLTK stanza wrapper class to dlk files
    """
    all_ud_models_for_cltk = dict(
        cop=["scriptorium"],
        cu=["proiel"],  # OCS
        fro=["srcmf"],  # Old French
        grc=["perseus", "proiel"],
        got=["proiel"],
        la=["ittb", "proiel", "perseus"],
        lzh=["kyoto"],
    )  # type: Dict[str, List[str]]
    stanford_dir = os.path.expanduser("~/stanza_resources/")  # type: str
    for lang_name, model_sources in all_ud_models_for_cltk.items():
        for model_source in model_sources:
            if lang_name == "cop":
                # Coptic errors out, for some reason, if we pass the package name ``scriptorium``
                stanza.download(lang=lang_name, dir=stanford_dir)
            else:
                stanza.download(lang=lang_name,
                                dir=stanford_dir,
                                package=model_source)
Example #7
    def nlp(self, model):
        if model == "default":
            # A default Stanza NLP pipeline
            stanza.download(lang="en", processors="tokenize")
            if self.tokenize_by in ["sentence", "sentence_by_document"]:
                BASIC_STANZA_PIPELINE = stanza.Pipeline(processors="tokenize")
            else:
                BASIC_STANZA_PIPELINE = stanza.Pipeline(
                    processors="tokenize", tokenize_no_ssplit=True)
            self._nlp = BASIC_STANZA_PIPELINE
        else:
            if self.tokenize_by in ["sentence", "sentence_by_document"]:
                if model.config["tokenize_no_ssplit"]:
                    model.processors["tokenize"].config["no_ssplit"] = False
                    model.config["tokenize_no_ssplit"] = False
                    warn(
                        "NLP does not have a sentencizer pipe; one has been added to tokenize by sentence."
                    )
            else:
                if not model.config["tokenize_no_ssplit"]:
                    model.processors["tokenize"].config["no_ssplit"] = True
                    model.config["tokenize_no_ssplit"] = True
                    warn(
                        "NLP contains a sentencizer pipe which has been removed to tokenize by document."
                    )

            self._nlp = model
Example #8
def main(args):
    """Visualization of contexts, questions, and colored answer spans."""

    # Load dataset, and optionally shuffle.
    dataset = QADataset(args, args.path)
    samples = dataset.samples
    if args.shuffle:
        random.shuffle(samples)

    vis_samples = samples[:args.samples]

    print()
    print('-' * RULE_LENGTH)
    print()

    # Visualize samples.
    for (qid, context, question, answer_start, answer_end) in vis_samples[:10]:
        cxt = _build_string(context)
        print(cxt)
        stanza.download('en')
        en_nlp = stanza.Pipeline('en')
        en_doc = en_nlp(cxt)

        for i, sent in enumerate(en_doc.sentences):
            print(f"[Sentence {i+1}")
            for word in sent.words:
                print("{:12s}\t{:12s}\t{:6s}\t{:d}\t{:12s}".format(
                    word.text, word.lemma, word.pos, word.head, word.deprel))
                print("")

        print("Mention text\tType\tStart-End")
        for ent in en_doc.ents:
            print("{}\t{}\t{}-{}".format(ent.text, ent.type, ent.start_char,
                                         ent.end_char))
Example #9
def init_stanza_model(lang: str = "en"):
    try:
        snlp = stanza.Pipeline(lang)
    except ResourcesFileNotFoundError:
        stanza.download(lang)
        snlp = stanza.Pipeline(lang)
    return snlp
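A minimal usage sketch for the helper above; the sample sentence is illustrative only:

# Build (and, if needed, download) an English pipeline, then run it on a string.
# The returned object follows the standard stanza Document API.
snlp = init_stanza_model("en")
doc = snlp("Barack Obama was born in Hawaii.")
for sentence in doc.sentences:
    print([word.text for word in sentence.words])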
Example #10
 def __init__(self, dataset_name="squad", mode="train", data_limit=-1):
     super(NQGDataset, self).__init__()
     stanza.download('en')
     self.nlp = stanza.Pipeline('en', processors='tokenize,pos,ner')
     self.datatype = QAExample
     if dataset_name == "squad":
         self.datatype = SquadExample
         if mode == "train":
             datapath = SQUAD_TRAIN
         elif mode == "dev":
             datapath = SQUAD_DEV
         else:
             raise ValueError()
         self.ds = read_squad_dataset(datapath, limit=data_limit)
     elif dataset_name == "medquad":
         if mode == "train":
             datapath = MEDQUAD_TRAIN
         elif mode == "dev":
             datapath = MEDQUAD_DEV
         else:
             raise ValueError()
         self.ds = read_qa_dataset(datapath, limit=data_limit)
     elif dataset_name == "medqa_handmade":
         if mode == "test":
             datapath = MEDQA_HANDMADE_FILEPATH
         else:
             raise ValueError()
         self.ds = read_qa_dataset(datapath, limit=data_limit)
     else:
         raise NotImplementedError()
Example #11
 def __init__(self, lang='en'):
     self.lang = lang
     if not exists_file(home_dir() + '/stanza_resources/' + lang):
         stanza.download(lang)
     self.nlp = stanza.Pipeline(lang=lang, logging_level='WARN')
     ensure_path("out/")
     ensure_path("pics/")
Example #12
def stanza_extract_entities(datafile, labels=LABELS):
    """Extract named entities from an EMu xml file using Stanza,
    returning a dictionary"""
    import stanza
    try:
        nlp = stanza.Pipeline('en')
    except ValueError:
        print('English model has not been downloaded. Downloading now.')
        stanza.download('en')
        nlp = stanza.Pipeline('en')
    futures = []
    if datafile.endswith('.xml'):
        records = xml_iterator(datafile)
    else:
        records = csv_iterator(datafile)
    with ProcessPoolExecutor() as ex:
        for ident, lines in records:
            for line in lines:
                proc_line = nlp(line)
                futures.append(
                    ex.submit(stanza_return_ents,
                              ident,
                              proc_line,
                              labels=labels))
    ent_data = reconcile_entities(futures)
    return (ent_data)
Example #13
def download():
    import os
    from pathlib import Path as path
    HOME_DIR = str(path.home())

    # check nltk_data availability, download if not available
    import nltk
    nltk_rsc = os.path.join(HOME_DIR, 'nltk_data')
    for required in [os.path.join('corpora', 'stopwords.zip'), os.path.join('taggers', 'averaged_perceptron_tagger.zip')]:
        if not os.path.exists(os.path.join(nltk_rsc, required)):
            print('downloading nltk: ', required[:-4])
            nltk.download(os.path.basename(required)[:-4], quiet=True)

    # check stanza_data availability, download if not available
    import stanza
    stanza_rsc = os.path.join(HOME_DIR, 'stanza_resources/en/ner')
    for required in ['anatem.pt', 'bionlp13cg.pt', 'i2b2.pt', 'jnlpba.pt']:
        if not os.path.exists(os.path.join(stanza_rsc, required)):
            print('downloading stanza: ', required[:-3])
            stanza.download('en', package='craft', processors={'ner': required[:-3]}, verbose=False)

    # check benepar_data availability, download if not available
    import benepar
    if not os.path.exists(os.path.join(nltk_rsc, 'models', 'benepar_en3')):
        print('downloading benepar: benepar_en3')
        benepar.download('benepar_en3')
Example #14
def parser(sent):
    try:
        result = []
        str2 = "".join(sent)
        global str1
        global dependencyInstalled
        # Download the English model the first time this runs
        if not dependencyInstalled:
            stanza.download('en')
            dependencyInstalled = True
        nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,depparse')
        count = 0
        counter = 0
        doc = nlp(str2)
        for i, sentence in enumerate(doc.sentences):
            list1 = []
            list2 = []
            list3 = []
            list4 = []
            for word in sentence.words:
                if word.deprel == "nsubj":
                    count += 1
                    if count <= 1:
                        list1.append([word.deprel, word.text, word.id, sentence.words[word.head - 1].text, word.head, word.xpos])
                        list4 = sentenceBuilder(list1[count - 1], list2, list3)
                if word.deprel != "nsubj" and word.deprel != "punct" and word.deprel != "mark":
                    list1.append([word.deprel, word.text, word.id, sentence.words[word.head - 1].text, word.head, word.xpos])
                    list4 = sentenceBuilder(list1[counter - 1], list2, list3)
            list4 = [ele for ele in list4 if ele!=[]]
            for x in list4:
                 str1 = ' '.join(x)
                 result.append(str1)
        return result
    except Exception:
        print("Nothing passed into parameters")
Example #15
File: stanza_cli.py  Project: de9uch1/dnlp
def main(args):
    print(args, file=sys.stderr, flush=True)

    stanza.download(args.lang, dir=args.model_dir)
    kwargs = get_stanza_kwargs(args)
    pipeline = stanza.Pipeline(**kwargs)

    def run_stanza(lines):
        if args.depparse:
            doc = pipeline([l.split() for l in lines])
            for sent in doc.sentences:
                print(' '.join(str(word.head) for word in sent.words))
        elif args.tokenize:
            doc = pipeline(lines)
            for sent in doc.sentences:
                print(' '.join(str(word.text) for word in sent.words))

    with fileinput.input(files=[args.input]) as f:
        print('| ', end='', file=sys.stderr, flush=True)
        batch = []
        for i, line in enumerate(f, start=1):
            batch.append(line.strip())
            if i % args.batch_size == 0:
                run_stanza(batch)
                print('{}...'.format(i), end='', file=sys.stderr, flush=True)
                batch = []
        if len(batch) > 0:
            print(i, file=sys.stderr)
            run_stanza(batch)

    print('| processed sentences: {}'.format(i), file=sys.stderr)
Example #16
    def _download_model(self) -> None:
        """Interface with the `stanza` model downloader.

        # >>> stanza_wrapper = StanzaWrapper(language='grc', stanza_debug_level="INFO")
        # >>> stanza_wrapper._download_model()
        # True
        """
        # TODO: Add prompt whether to allow stanza to download files
        # prompt the user to download the stanza models
        print("")  # pragma: no cover
        print("")  # pragma: no cover
        print("Α" * 80)  # pragma: no cover
        print("")  # pragma: no cover
        print(  # pragma: no cover
            "CLTK message: The part of the CLTK that you are using depends upon the Stanza NLP library (`stanza`). What follows are several question prompts coming from it. (More at: <https://github.com/stanfordnlp/stanza>.) Answer with defaults."
        )  # pragma: no cover
        print("")  # pragma: no cover
        print("Ω" * 80)  # pragma: no cover
        print("")  # pragma: no cover
        print("")  # pragma: no cover
        stanza.download(lang=self.language, package=self.treebank)
        # if file model still not available after attempted DL, then raise error
        if not file_exists(self.model_path):
            raise FileNotFoundError(
                "Missing required models for ``stanza`` at ``{0}``.".format(
                    self.model_path))
Example #17
 def tokenize(self, lang = None, text = None, case_id = None, article_id = None, use_gpu=True):
     ## turn lang into isocode alpha 2
     lang = pycountry.languages.lookup(lang).alpha_2
     ## check if the language model exists, if not download it
     if not "/home/jmr/stanza_resources/" + lang in self.stanza_models:
         stanza.download(lang)
     ## start the stanza pipeline for tokenizing with sentence segmentation as well as pos-tagging
     nlp = stanza.Pipeline(lang=lang, processors='tokenize,mwt,pos', tokenize_no_ssplit=False)
     ## tokenize
     #  mini-batch at the paragraph level by splitting the text on "\n\n", as per stanza usage;
     #  this gave significant speed improvements after some experimentation
     mini_batch = text.replace("\n", "\n\n").strip()
     doc = nlp(mini_batch)
     # for each sentence, get all tokens as well as their pos-tag and other morphological info
     dict_list = []
     for i, sentence in enumerate(doc.sentences):
         for j, current_word in enumerate(sentence.words):
             cur_dict = current_word.to_dict()
             cur_dict['token_id'] = "_".join([case_id, str(i + 1), str(j + 1)])
             dict_list.append(cur_dict)
     ## turn dict to df
     df_raw = pd.DataFrame(dict_list).drop(columns = "id").add_prefix("token_")
     df_raw["case_id"] = case_id
     if isinstance(article_id, str):
         df_raw['article_id'] = article_id
     return df_raw
Example #18
def apply_stanza_processors(*corpus_names,
                            path='/content/gdrive/My Drive/NMT/corpuses/iwslt16_en_de/',
                            src_lang="de", trg_lang="en", _start=1, num=None,
                            doc_size=10000, tok_bsz=64, mwt_bsz=200, pos_bsz=10000):
    stanza.download(lang=src_lang, processors='tokenize,mwt,pos')
    stanza.download(lang=trg_lang, processors='tokenize,pos')

    src_processor = stanza.Pipeline(lang=src_lang, processors='tokenize,mwt,pos', tokenize_no_ssplit=True, tokenize_batch_size=tok_bsz, mwt_batch_size=mwt_bsz, pos_batch_size=pos_bsz)
    trg_processor = stanza.Pipeline(lang=trg_lang, processors='tokenize,pos', tokenize_no_ssplit=True, tokenize_batch_size=tok_bsz, pos_batch_size=pos_bsz)

    corpuses = read_corpuses(*corpus_names, path=path, _start=_start, num=num)

    # record how many pieces each corpus's stanza output will be split across
    # (so downstream processing knows how many pickle files to merge together)
    num_corpus_pieces = {corpus_name:ceil(len(corpuses[corpus_name]) / doc_size) for corpus_name in corpuses}
    print(f"pieces: {num_corpus_pieces}\n")
    stanza_path = path + 'stanza_outputs/'
    dump(num_corpus_pieces, open(f"{stanza_path}num_corpus_pieces.pkl", 'wb'))

    for corpus_name in corpuses:
        if is_src_corpus(corpus_name):
            apply_stanza_processor(corpus_name, corpuses[corpus_name], src_processor, path=stanza_path, doc_size=doc_size)
        else:
            apply_stanza_processor(corpus_name, corpuses[corpus_name], trg_processor, path=stanza_path, doc_size=doc_size)
    print("done.")
Example #19
def stanza_pipeline(lenguaje,
                    procesadores='tokenize, pos, lemma',
                    modelo_lemas='',
                    modelo_ner='',
                    modelo_pos=''):
    """
    Carga y retorna un pipeline, o flujo de trabajo, de Stanza del y lenguaje y con los procesos \
        especificados por el usuario. Los procesos que el usuario puede elegir añadir al pipeline incluyen \
        tokenización, *Part of Speech* (POS), lematización y *Named Entity Recognition* (NER), entre otros. \
        Para mayor información sobre estos modelos y los pipelines se puede consultar la página web \
        de Stanza (https://stanfordnlp.github.io/stanza/pipeline.html#processors).

    :param lenguaje: (str). Lenguaje para el que se desean cargar los modelos de Stanza. Stanza tiene modelos \
        disponibles para varios lenguajes, dependiendo de la función a realizar. Para mayor información, visitar \
        https://stanfordnlp.github.io/stanza/available_models.html
    :param procesadores: (str). Valor por defecto: 'tokenize, pos, lemma'. Lista de procesadores, también \
        entendidos como procesos o tareas que se desean aplicar a un texto de entrada, que se desean incluir \
        en el pipeline. Se ingresa un string en el que los diferentes procesadores van separados por comas.
    :param modelo_lemas: (str). Valor por defecto: ''. Unicación de un archivo que contenga el modelo o procesador \
        que el usuario desea utilizar para aplicar lematización a los textos. Si este parámetro se deja vacío, se \
        utilizará el procesador disponible de la librería Stanza para el lenguaje especificado.  
    :param modelo_ner: (str). Valor por defecto: ''. Unicación de un archivo que contenga el modelo o procesador \
        que el usuario desea utilizar para aplicar *Named Entity Recognition* a los textos. Si este parámetro se deja \
        vacío, se utilizará el procesador disponible de la librería Stanza para el lenguaje especificado.
    :param modelo_pos: (str). Valor por defecto: ''. Unicación de un archivo que contenga el modelo o procesador \
        que el usuario desea utilizar para aplicar *Part of Speech* a los textos. Si este parámetro se deja vacío, se \
        utilizará el procesador disponible de la librería Stanza para el lenguaje especificado.                          
    :return: (stanza Pipeline). Pipeline de Stanza, del lenguaje especificado, con los procesadores determinados por \
        el usuario. Si los modelos requeridos no están disponibles en el computador del usuario, la función descargará \
        los modelos correspondientes, lo cual puede tardar algunos minutos dependiendo del tamaño de los modelos y la \
        velocidad de conexión a internet del usuario. 
    """
    # Basic pipeline configuration
    config = {
        'processors': procesadores,
        'lang': lenguaje,
    }
    # If any custom model is provided, add it to the config dictionary
    if modelo_pos != '':
        config['pos_model_path'] = modelo_pos
    if modelo_lemas != '':
        config['lemma_model_path'] = modelo_lemas
    if modelo_ner != '':
        config['ner_model_path'] = modelo_ner
    # Try to create the pipeline. If the model has not been downloaded yet,
    # download it first
    try:
        nlp_pipe = stanza.Pipeline(
            **config,
            verbose=0,
        )
    except BaseException:
        print(
            '[INFO] Downloading model. This process may take several minutes.\n'
        )
        stanza.download(lenguaje)
        nlp_pipe = stanza.Pipeline(**config, verbose=0)
    # Return the pipeline
    return nlp_pipe
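A short usage sketch for stanza_pipeline; the Spanish sample sentence is illustrative only:

# Build a Spanish pipeline with the default processors ('tokenize, pos, lemma');
# the model is downloaded automatically if it is not already installed.
nlp = stanza_pipeline('es')
doc = nlp('Los gatos duermen sobre el tejado.')
for sentence in doc.sentences:
    for word in sentence.words:
        print(word.text, word.upos, word.lemma)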
Example #20
    def set_language(self, lang=None):
        self.lang = lang
        if lang is None: return

        if not exists_file(home_dir() + '/stanza_resources/' + lang):
            stanza.download(lang)

        self.nlp = NLP(lang=lang, logging_level='ERROR')
Example #21
def _download_ru_stanfordnlp_model():
    import stanza

    if not pathlib.Path.home().joinpath(
            'stanza_resources/resources.json').exists():
        stanza.download('ru')
    else:
        logging.warning('ru language model already exists. Skipping download.')
Example #22
 def __init__(self, lang="en"):
     self.lang = lang
     try:
         nlp = stanza.Pipeline(lang=self.lang)
     except Exception:
         stanza.download(lang)
         nlp = stanza.Pipeline(lang=self.lang)
     self.nlp = nlp
Example #23
 def __init__(self, context, wordService):
     self.__wordService = wordService
     self.__context = context
     stanza.download('en')
     stanza.download('pl')
     self.stanza_en = stanza.Pipeline(lang='en',
                                      processors='tokenize,mwt,pos,lemma')
     self.stanza_pl = stanza.Pipeline(lang='pl',
                                      processors='tokenize,mwt,pos,lemma')
Example #24
def test_spacy_stanza_german():
    lang = "de"
    stanza.download(lang)
    nlp = spacy_stanza.load_pipeline(lang)
    assert nlp.Defaults == GermanDefaults

    # warning for misaligned ents due to multi-word token expansion
    with pytest.warns(UserWarning):
        doc = nlp("Auf dem Friedhof an der Straße Am Rosengarten")
Example #25
def test_spacy_stanza_tokenizer_options():
    # whitespace tokens from spacy tokenizer are handled correctly
    lang = "en"
    stanza.download(lang)
    nlp = spacy_stanza.load_pipeline(lang, processors={"tokenize": "spacy"})

    doc = nlp(" Barack  Obama  was  born\n\nin Hawaii.")
    assert [t.text for t in doc] == [
        " ",
        "Barack",
        " ",
        "Obama",
        " ",
        "was",
        " ",
        "born",
        "\n\n",
        "in",
        "Hawaii",
        ".",
    ]

    # pretokenized text is handled correctly
    nlp = spacy_stanza.load_pipeline(lang, tokenize_pretokenized=True)
    doc = nlp(
        "Barack Obama was born in Hawaii.\nBarack Obama was born in Hawaii.")
    assert [t.text for t in doc] == [
        "Barack",
        "Obama",
        "was",
        "born",
        "in",
        "Hawaii.",
        "Barack",
        "Obama",
        "was",
        "born",
        "in",
        "Hawaii.",
    ]
    doc = nlp(
        " Barack  Obama  was  born\n\n in Hawaii.\nBarack Obama was born in Hawaii."
    )
    assert [t.text for t in doc] == [
        "Barack",
        "Obama",
        "was",
        "born",
        "in",
        "Hawaii.",
        "Barack",
        "Obama",
        "was",
        "born",
        "in",
        "Hawaii.",
    ]
Example #26
def main(args=None):
    torch.multiprocessing.set_start_method('fork')
    torch.set_num_threads(1)
    torch.set_num_interop_threads(1)
    stanza.download('en')
    run_processor(StanzaSelectiveParser(),
                  mp=True,
                  mp_context=torch.multiprocessing,
                  args=args)
Example #27
 def __init__(self):
     print('Init NLPUtils')
     nltk.download('punkt')
     nltk.download('stopwords')
     nltk.download('wordnet')
     stanza.download('es')
     self.nlp = stanza.Pipeline('es')
     self.pattern = re.compile(r'(\#\w+)')
     self.tweet_tokenizer = TweetTokenizer()
Example #28
 def update_config_value(self, name, old_value, new_value):
     if old_value == 'install_resource':
         if new_value == 'y':
             stanza.download('fr')
         else:
             raise Exception(
                 'you can not run this package without the resources')
     else:
         self.config['package'] = new_value
Example #29
    def __init__(self):
        super(StanzaAnalyzer, self).__init__()

        stanza.download("pl")
        self._nlp_pipeline = stanza.Pipeline(
            "pl", processors="tokenize,pos,lemma", verbose=True,
            use_gpu=True)  # initialize neural pipeline

        self._conv_stanza_pos = lambda x: [w.pos for w in x.words]
Example #30
def parse_data(data_file=config.DEV_MA_FILE,
               target=config.PDEV_MA_FILE,
               function_test=False,
               force_exe=False):
    """
    input (data = str, embedding = str, target file = str)
    effect preprocess and save data to target
    ouput preprocessed data
    
    parsed data is in jsonl (each line is a json)
    {
        config.idf : id(in string)
        config.hf : Stanza Doc,
        config.pf : Stanza Doc,
        config.lf : int 
    }
    """
    # alias
    p = config.pf
    h = config.hf
    l = config.lf

    # stanza init
    stanza.download('en')
    nlp = stanza.Pipeline(lang='en',
                          processors='tokenize,mwt,pos,lemma,depparse')

    # data_file_loading
    with open(data_file) as fo:
        raw_lines = fo.readlines()
        json_data = [json.loads(line) for line in raw_lines]

    if function_test:
        json_data = json_data[:10]

    if os.path.isfile(str(target)) and not force_exe:
        print("file " + str(target) + " already exist")
        print(
            "if u still want to procceed, add force_exe=True in function arg")
        print("exiting")
        return None
    else:
        print("creating file " + str(target) + " to save result")
        print("executing")

    # dependency parsing and jsonl saving
    with jsonl.open(target, mode='w') as writer:
        parsed_data = []
        for data in tqdm(json_data):
            # only add examples that have gold labels
            if (data[l] not in config.label_to_id.keys()):
                continue
            pdata = process_one_example(data, nlp)
            parsed_data.append(pdata)
            writer.write(pdata)

    return parsed_data