Example #1
def train_from_file(
    input_path,
    corpus_type='bccwj_ud',
    model_path=None,
    clear_model=False,
    keep_gold_tokens=False,
    evaluate_all_combinations=False,
    mini_batch_size=128,
    max_epochs=32,
    online_sgd_max_epochs=0,
    give_up_iter=3,
    evaluation_corpus_path=None,
    output_base_path=None,
    require_gpu=False,
):
    if require_gpu:
        spacy.require_gpu()
        print("GPU enabled", file=sys.stderr)
    if corpus_type == 'bccwj_ud':
        corpus = convert_files(input_path)
        if evaluation_corpus_path:
            evaluation_gold = convert_files(evaluation_corpus_path)
        else:
            evaluation_gold = corpus[0:100]
    else:
        corpus = None
        evaluation_gold = None
    return train(corpus, model_path, clear_model, keep_gold_tokens,
                 evaluate_all_combinations, mini_batch_size, max_epochs,
                 online_sgd_max_epochs, give_up_iter, evaluation_gold,
                 output_base_path)
Example #2
    def set_nlp(self):
        if self.nlp:
            return

        if self.require_gpu:
            spacy.require_gpu()

        if self.output_format in ["2", "mecab"]:
            nlp = JapaneseDefaults.create_tokenizer(
                config={
                    "split_mode": self.split_mode
                }).tokenizer
        else:
            # Work-around for pickle error. Need to share model data.
            if self.model_path:
                nlp = spacy.load(self.model_path)
            else:
                nlp = spacy.load("ja_ginza")

            if self.disable_sentencizer:

                def disable_sentencizer(doc):
                    for t in doc:
                        t.is_sent_start = False
                    return doc

                nlp.add_pipe(disable_sentencizer, before="parser")

            if self.split_mode:
                set_split_mode(nlp, self.split_mode)

        self.nlp = nlp
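
Example #2 adds a plain Python function (disable_sentencizer) to the pipeline, which is the spaCy v2 way of registering a component. In spaCy v3 custom components must be registered by name before nlp.add_pipe can use them; below is a minimal sketch of the v3 equivalent, assuming spaCy >= 3.0 and the ja_ginza package used above:

import spacy
from spacy.language import Language


@Language.component("disable_sentencizer")
def disable_sentencizer(doc):
    # Clear all sentence starts so a later component (e.g. the parser)
    # decides the sentence boundaries instead.
    for t in doc:
        t.is_sent_start = False
    return doc


nlp = spacy.load("ja_ginza")
nlp.add_pipe("disable_sentencizer", before="parser")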
Example #3
def main():
    spacy.require_gpu()

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--article-path",
        help="path to articles to constrain",
    )
    parser.add_argument(
        "--abstract-path",
        help="abstracts to constrain",
    )
    parser.add_argument(
        "--output-path",
        help="path to save pos sents",
    )
    parser.add_argument("--k", type=int, default=16)
    parser.add_argument("--batch-size", type=int, default=1000)

    args = parser.parse_args()

    selector = SentenceSelector(args.k)

    i = 0
    with open(args.article_path) as article_file, open(
            args.abstract_path) as abstract_file, open(args.output_path,
                                                       'w+') as output_file:
        start = time.time()
        articles = []
        abstracts = []
        for x, y in zip(article_file, abstract_file):

            x = x.strip()
            y = y.strip()
            articles.append(x)
            abstracts.append(y)

            if i % args.batch_size == 0:
                articles = list(selector.nlp.pipe(articles))
                abstracts = list(selector.nlp.pipe(abstracts))
                for j in range(len(articles)):
                    labels = selector.confine_docs(articles[j], abstracts[j])
                    for label in labels:
                        print(' '.join(label), file=output_file)

                articles = []
                abstracts = []
                end = time.time()
                print('elapsed since start', end - start)
                print("processed {} lines".format(i), file=sys.stderr)

            i += 1
        if len(articles) > 0:
            print('leftovers')
            articles = list(selector.nlp.pipe(articles))
            abstracts = list(selector.nlp.pipe(abstracts))
            for j in range(len(articles)):
                labels = selector.confine_docs(articles[j], abstracts[j])
                for label in labels:
                    print(' '.join(label), file=output_file)
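
Note on Example #3: the manual counter-based batching (flushing whenever i % args.batch_size == 0, which also fires on the very first line) can be expressed with spacy.util.minibatch instead. A minimal sketch under the same one-document-per-line assumption; the process_pairs helper name is illustrative:

from spacy.util import minibatch


def process_pairs(nlp, article_lines, abstract_lines, batch_size=1000):
    # Yield (article_doc, abstract_doc) pairs, processed batch_size lines at a time.
    pairs = [(a.strip(), b.strip()) for a, b in zip(article_lines, abstract_lines)]
    for batch in minibatch(pairs, size=batch_size):
        articles, abstracts = zip(*batch)
        yield from zip(nlp.pipe(articles), nlp.pipe(abstracts))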
Example #4
def train_parser_from_file(
        input_json_path,
        model_path=None,
        clear_model=False,
        excluding_labels='',
        mini_batch_size=128,
        max_epochs=32,
        online_sgd_max_epochs=0,
        give_up_iter=3,
        evaluation_corpus_path=None,
        output_base_path=None,
        require_gpu=False,
):
    if require_gpu:
        spacy.require_gpu()
        print("GPU enabled", file=sys.stderr)
    corpus = convert_files(input_json_path)
    if evaluation_corpus_path:
        evaluation_gold = convert_files(evaluation_corpus_path)
    else:
        evaluation_gold = corpus[0:1000]
    train(
        corpus,
        model_path,
        clear_model,
        excluding_labels,
        mini_batch_size,
        max_epochs,
        online_sgd_max_epochs,
        give_up_iter,
        evaluation_gold,
        output_base_path
    )
Example #5
def setup_spacy(use_gpu=False):
    if use_gpu:
        spacy.require_gpu()
        spacy.util.use_gpu(0)
    # disable everything we don't have at runtime either
    nlp = spacy.load('de', disable=['parser', 'ner'])
    infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes + [
        '—',  # numeric dash: (?<=[0-9])—(?=[0-9])
        '/'
    ])  # maybe more restrictive?
    suffix_re = spacy.util.compile_suffix_regex(
        nlp.Defaults.suffixes + ('/', ))  # maybe more restrictive?
    # '〟' as historic quotation mark (left and right)
    # '〃' as historic quotation mark (at the start of the line!)
    # '‟' as historic quotation mark (at the start of the line!)
    # '›' and '‹' as historic quotation marks (maybe goes away with NFC?)
    # '⟨' and '⟩' parentheses (maybe goes away with NFC?)
    # '⁽' and '⁾' parentheses (maybe goes away with NFC?)
    # '〈' and '〉' brackets (maybe goes away with NFC?)
    # '‹' and '›' as historic quotation mark
    # '’' as historic apostrophe
    # '—' as dash, even when written like a prefix
    # \u+feff (byte order mark) as prefix

    nlp.tokenizer = spacy.tokenizer.Tokenizer(
        nlp.vocab,
        token_match=nlp.tokenizer.token_match,
        prefix_search=nlp.tokenizer.prefix_search,
        suffix_search=nlp.tokenizer.suffix_search,
        infix_finditer=infix_re.finditer)
    return nlp
Example #6
 def __init__(self, data, args):
     self.data = data
     self.job_level = load_job_dict(args)
     self.edu_level = load_edu_dict(args)
     # load nlp pipeline model
     spacy.require_gpu()
     self.nlp_pipe = spacy.load(args.nlp_model)
     self.args = args
Example #7
    def __init__(self, iso, use_gpu=False):
        import spacy
        if use_gpu:
            # TODO: Support different GPU ids??
            spacy.require_gpu()

        self.nlp = spacy.load(DSpacy[iso])
        EngineInstance.__init__(self, iso, use_gpu)
Example #8
def test_require_gpu():
    try:
        import cupy  # noqa: F401

        require_gpu()
        assert isinstance(get_current_ops(), CupyOps)
    except ImportError:
        with pytest.raises(ValueError):
            require_gpu()
Example #9
def load_pipeline(use_gpu: bool, with_ner: bool, model_name: str = "hu_core_news_lg"):
    if use_gpu:
        spacy.require_gpu()

    nlp = spacy.load(model_name)

    if not with_ner:
        nlp.remove_pipe("ner")
    return nlp
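
In Example #9 above, the ner component is loaded and then removed with remove_pipe. When the component is simply not wanted, spaCy v3 can skip loading it altogether via the exclude argument of spacy.load; a minimal sketch, assuming spaCy >= 3.0 and the same hu_core_news_lg package:

import spacy


def load_pipeline(use_gpu: bool, with_ner: bool, model_name: str = "hu_core_news_lg"):
    if use_gpu:
        spacy.require_gpu()
    # exclude=["ner"] never loads the NER weights, instead of loading and then removing them.
    return spacy.load(model_name, exclude=[] if with_ner else ["ner"])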
Example #10
 def __init__(self, pipeline_path="fi_geoparser", use_gpu=True, 
              output_df=True):
     if use_gpu:
         spacy.require_gpu()
     else:
         spacy.require_cpu()
     
     self.output_df = output_df
     
     self.ner_pipeline = spacy.load(pipeline_path)
Example #11
def test_require_cpu():
    require_cpu()
    assert isinstance(get_current_ops(), NumpyOps)
    try:
        import cupy  # noqa: F401

        require_gpu()
        assert isinstance(get_current_ops(), CupyOps)
    except ImportError:
        pass
    require_cpu()
    assert isinstance(get_current_ops(), NumpyOps)
Example #12
def ner_parser(df, col_string, batch_size=256):
    spacy.require_gpu()
    nlp = spacy.load("en_core_web_sm")
    docs = nlp.pipe(df[col_string], disable=["tagger", "parser"], batch_size=batch_size)
    out = []
    for doc in docs:
        l = [ent.text for ent in doc.ents if ent.label_ == "ORG"]
        val = ", "
        l = val.join(l)
        out.append(l)
    df["company_name_list"] = out
    return df
Example #13
def evaluate_from_file(
    input_json_path,
    model_path=None,
    require_gpu=False,
    print_stats=True,
    nlp=None,
):
    corpus = convert_files(input_json_path)
    if require_gpu:
        spacy.require_gpu()
        print("GPU enabled", file=sys.stderr)
    return evaluate(corpus, model_path, print_stats, nlp)
Example #14
def spacy_sentences(path):
    import spacy
    # spacy.prefer_gpu()
    spacy.require_gpu()
    nlp = spacy.load('en_core_web_sm')
    pin = path / 'intermediate_files/wikipedia.txt'
    pout = path / 'final_text_file_single/wikipedia.segmented.txt'
    with open(pin) as f:
        with open(pout, 'w') as out:
            for ln in f:
                if ln != '\n':
                    for s in nlp(ln).sents:
                        out.write(s.text + '\n')
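
Example #14 (like Example #29 further down) loads the full en_core_web_sm pipeline only to split text into sentences. When parser-based boundaries are not required, a blank pipeline with the rule-based sentencizer does the same job much faster; a minimal sketch, assuming spaCy >= 3.0 (split_sentences is an illustrative helper, not part of the example above):

import spacy

# Rule-based sentence splitting only; no tagger, parser or NER is loaded.
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")


def split_sentences(lines):
    for doc in nlp.pipe(line for line in lines if line != '\n'):
        for sent in doc.sents:
            yield sent.text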
Example #15
    def __init__(
        self,
        model="en_ner_eco_md",
        with_abbrev=False,
        with_linking=None,
        with_sentence=False,
        threshold=0.7,
        prefer_gpu=False,
        verbose=False,
        logger=None,
    ):
        self.logger = logger if logger else logging.getLogger(__name__)
        warnings.simplefilter("ignore")

        self.verbose = verbose
        self.extractor = TextExtractor(logger=self.logger)
        if prefer_gpu:
            use_cuda = torch.cuda.is_available()
            self.logger.info(
                "GPU is available" if use_cuda else "GPU not found")
            if use_cuda:
                spacy.require_gpu()
                self.logger.info("TaxoNERD will use GPU")
        self.logger.info("Load model {}".format(model))
        self.nlp = spacy.load(model)
        self.logger.info("Loaded model {}-{}".format(self.nlp.meta["name"],
                                                     self.nlp.meta["version"]))

        self.with_sentence = with_sentence
        if self.with_sentence:
            if self.verbose:
                logger.info(f"Add pySBDSentencizer to pipeline")
            Span.set_extension("sent_id", default=None)
            self.nlp.add_pipe("pysbd_sentencizer", before="ner")

        self.with_abbrev = with_abbrev
        if self.with_abbrev:
            if self.verbose:
                logger.info(f"Add TaxonomicAbbreviationDetector to pipeline")
            self.nlp.add_pipe("taxonomic_abbreviation_detector")

        self.with_linking = with_linking != None
        if self.with_linking:
            kb_name = with_linking if with_linking != "" else "gbif_backbone"
            if self.verbose:
                logger.info(f"Add EntityLinker {kb_name} to pipeline")
            self.create_linker(kb_name, threshold)
Example #16
def evaluate_from_file(
    path,
    corpus_type='bccwj_ud',
    model_path=None,
    parse_result_path=None,
    keep_gold_tokens=False,
    evaluate_all_combinations=False,
    require_gpu=False,
    print_file=sys.stdout,
    nlp=None,
):
    if require_gpu:
        spacy.require_gpu()
        print("GPU enabled", file=sys.stderr)
    parse_results = None
    if corpus_type == 'bccwj_ud':
        gold = convert_files(path)
        if parse_result_path:
            parse_results = convert_files(parse_result_path)
    else:
        gold = None
    if not nlp:
        nlp = spacy.load(model_path)
        nlp.tokenizer.use_sentence_separator = False

    if 'JapaneseCorrector' not in nlp.pipe_names:
        corrector = nlp.create_pipe('JapaneseCorrector')
        nlp.add_pipe(corrector, last=True)

    rewritten = [g.clone() for g in gold]
    if not keep_gold_tokens:
        print('Rewriting gold corpus with tokenizer', file=sys.stderr)
        disabled = nlp.disable_pipes(*nlp.pipe_names)
        rewrite_by_tokenizer(rewritten, nlp, sys.stderr)
        disabled.restore()
        print(file=sys.stderr, flush=True)

    return evaluate(gold, rewritten, model_path, parse_results,
                    keep_gold_tokens, evaluate_all_combinations, print_file,
                    nlp)
Example #17
def main():
    logger.info("Start coreference parsing")
    parser = ArgumentParser()
    parser.add_argument('--htmls_fname', type=str, required=True)
    parser.add_argument('--objects_fname', type=str, required=True)
    parser.add_argument('--htmls_coref_cache', type=str, required=True)
    parser.add_argument('--work_dir',
                        type=str,
                        required=False,
                        default=os.getcwd())
    args = parser.parse_args()
    work_dir = args.work_dir
    set_up_root_logger('COREF', os.path.join(work_dir, 'logs'))

    html_fname: str = args.htmls_fname
    objects_path = Path(args.objects_fname)
    htmls_coref_cache_fname: str = args.htmls_coref_cache

    with open(html_fname, "rb") as f_html:
        htmls_lookup = pickle.load(f_html)

    htmls_lookup_coref = load_cache(htmls_coref_cache_fname)

    names = get_all_objects(objects_path, work_dir)
    logger.info(f'Number of objects: {len(names)}')

    spacy.require_gpu()
    nlp = spacy.load('en_core_web_sm')
    neuralcoref.add_to_pipe(nlp)

    find_corefs(htmls_coref_cache_fname, htmls_lookup, htmls_lookup_coref,
                names, nlp)

    with open(htmls_coref_cache_fname, 'wb') as f:
        pickle.dump(htmls_lookup_coref, f, pickle.HIGHEST_PROTOCOL)

    logger.info('Finished')
Example #18
def main(args):
    if args.cuda:
        spacy.require_gpu()
    # Load an spacy model (supported models are "es" and "en")
    print("Loading spacy...")
    nlp = spacy.load("en_core_web_lg")
    print("Done")
    nlp.tokenizer = lambda text: whitespace_tokenizer(text, nlp.vocab)
    nlp.add_pipe(WordnetAnnotator(nlp.lang), after="tagger")
    nlp.add_pipe(BeneparComponent("benepar_en2"))

    with open(args.data) as f:
        lines = [line.strip() for line in list(f)]

    all_texts = []
    all_feats = []
    docs = nlp.pipe(lines, batch_size=args.batch_size)
    for doc in tqdm(docs, desc="Extracting feats", total=len(lines)):
        doc_feats = []
        doc_texts = []
        for token in doc:
            t_feats = extract_feats(token)
            doc_feats.append(t_feats)
            doc_texts.append(token.text)
        all_feats.append(doc_feats)
        all_texts.append(doc_texts)

    with open(args.data.replace(".tok", ".feats"), "w") as f:
        f.write("|".join((";".join(fn[:2]) for fn in FEATS)))
        f.write("\n")
        for text, doc_feats in zip(all_texts, all_feats):
            t_feats_joined = ["|".join(tf) for tf in doc_feats]
            line_feats = " ".join(
                ["|".join((t, f)) for t, f in zip(text, t_feats_joined)])
            f.write(line_feats)
            f.write("\n")
Example #19
def run_on_input(nlp,
                 str_list,
                 output_folder,
                 filename='single_run',
                 ref_list=None):
    from research.prodigy.functions import make_evaluation_html
    spacy.require_gpu()
    ent_data = []
    for i, doc in tqdm(enumerate(nlp.pipe(str_list, batch_size=1000)),
                       total=len(str_list)):
        ent_data += [{
            "text": doc.text,
            "tp": [[ent.start_char, ent.end_char, ent.label_] for ent in doc.ents],
            "fp": [],
            "fn": [],
            "ref": "" if ref_list is None else ref_list[i],
            "_id": "",
        }]
    srsly.write_jsonl(f'{output_folder}/{filename}.jsonl', ent_data)
    make_evaluation_html(ent_data, output_folder, filename + '.html')
Example #20
def main(nH=6,
         dropout=0.1,
         nS=6,
         nB=64,
         nE=20,
         use_gpu=-1,
         lim=1000000,
         nM=300,
         mL=100,
         save=False,
         nTGT=5000,
         save_name="model.pkl"):
    if use_gpu != -1:
        spacy.require_gpu()
        device = 'cuda'
    else:
        device = 'cpu'
    ''' Read dataset '''
    nlp = spacy.load('en_core_web_sm')
    print('English model loaded')
    for control_token in ("<eos>", "<bos>", "<pad>", "<cls>", "<mask>"):
        nlp.tokenizer.add_special_case(control_token, [{ORTH: control_token}])

    train, dev, test = get_iwslt()
    print('Dataset loaded')

    train, _ = zip(*train)
    dev, _ = zip(*dev)
    test, _ = zip(*test)

    train = train[:lim]
    dev = dev[:lim]
    test = test[:lim]
    ''' Tokenize '''
    train = spacy_tokenize(nlp.tokenizer, train, mL=mL)
    dev = spacy_tokenize(nlp.tokenizer, dev, mL=mL)
    test = spacy_tokenize(nlp.tokenizer, test, mL=mL)
    print('Tokenization finished')
    ''' Set rank based on all the docs '''
    all_docs = train + dev + test
    set_rank(nlp.vocab, all_docs, nTGT=nTGT)

    train = set_numeric_ids(nlp.vocab, train)
    dev = set_numeric_ids(nlp.vocab, dev)
    test = set_numeric_ids(nlp.vocab, test)
    print('Numeric ids set')

    word2indx, indx2word = get_dicts(nlp.vocab)
    print('Vocab dictionaries grabbed')

    with Model.define_operators({">>": chain}):
        embed_cols = [ORTH, SHAPE, PREFIX, SUFFIX]
        extractor = FeatureExtracter(attrs=embed_cols)
        position_encode = PositionEncode(mL, nM)
        model = (FeatureExtracter(attrs=embed_cols) >> with_flatten(
            FancyEmbed(nM, nTGT, cols=embed_cols)) >> Residual(position_encode)
                 >> create_model_input() >> Encoder(
                     nM=nM, nS=nS, nH=nH, device=device) >> with_reshape(
                         Softmax(nO=nTGT, nI=nM)))
        ''' Progress tracking '''
        losses = [0.]
        train_accuracies = [0.]
        train_totals = [0.]
        dev_accuracies = [0.]
        dev_loss = [0.]

        def track_progress():
            correct = 0.
            total = 0.
            ''' Get dev stats '''
            for X0 in minibatch(dev, size=nB):
                X1, loss_mask = random_mask(X0, nlp, indx2word, nlp.vocab, mL)
                Xh = model(X1)
                L, C, t = get_loss(Xh, X0, X1, loss_mask)
                correct += C
                total += t
                dev_loss[-1] += (L**2).sum()
            dev_accuracies[-1] = correct / total
            print(len(losses), losses[-1],
                  train_accuracies[-1] / train_totals[-1], dev_loss[-1],
                  dev_accuracies[-1])
            dev_loss.append(0.)
            losses.append(0.)
            train_accuracies.append(0.)
            dev_accuracies.append(0.)
            train_totals.append(0.)
            if save:
                model.to_disk('.models/' + save_name)

        ''' Model training '''
        with model.begin_training(batch_size=nB,
                                  nb_epoch=nE) as (trainer, optimizer):
            trainer.dropout = dropout
            trainer.dropout_decay = 1e-4
            optimizer.alpha = 0.001
            optimizer.L2 = 1e-6
            optimizer.max_grad_norm = 1.0
            trainer.each_epoch.append(track_progress)
            optimizer.alpha = 0.001
            optimizer.L2 = 1e-6
            optimizer.max_grad_norm = 1.0
            for X0, _ in trainer.iterate(train, train):
                X1, loss_mask = random_mask(X0, nlp, indx2word, nlp.vocab, mL)
                Xh, backprop = model.begin_update(X1, drop=dropout)
                dXh, C, total = get_loss(Xh, X0, X1, loss_mask)
                backprop(dXh, sgd=optimizer)
                losses[-1] += (dXh**2).sum()
                train_accuracies[-1] += C
                train_totals[-1] += total
Example #21
def init():
    global nlp
    # note: don't initialize CUDA until we fork child procs
    spacy.require_gpu()
    nlp = spacy.load('en', disable=['tagger', 'ner', 'textcat'])
Example #22
def test_require_gpu():
    with pytest.raises(ValueError):
        require_gpu()
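
As the tests in Examples #8, #22, #28 and #31 expect, require_gpu() raises a ValueError when no CuPy/GPU setup is available. For scripts that should still run on CPU-only machines, spacy.prefer_gpu() is the non-raising variant: it returns True if the GPU was activated and False otherwise. A minimal sketch:

import sys

import spacy

if spacy.prefer_gpu():
    print("GPU enabled", file=sys.stderr)
else:
    print("GPU not available, falling back to CPU", file=sys.stderr)

nlp = spacy.load("en_core_web_sm")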
Example #23
def run(
    model_path=None,
    mode=SUDACHI_DEFAULT_MODE,
    use_sentence_separator=False,
    disable_pipes='',
    recreate_corrector=False,
    output_path=None,
    output_format='0',
    require_gpu=False,
    *files,
):
    if require_gpu:
        spacy.require_gpu()
        print("GPU enabled", file=sys.stderr)
    if model_path:
        nlp = spacy.load(model_path)
    else:
        nlp = spacy.load('ja_ginza')
    if disable_pipes:
        print("disabling pipes: {}".format(disable_pipes), file=sys.stderr)
        nlp.disable_pipes(disable_pipes)
        print("using : {}".format(nlp.pipe_names), file=sys.stderr)
    if recreate_corrector:
        if 'JapaneseCorrector' in nlp.pipe_names:
            nlp.remove_pipe('JapaneseCorrector')
        corrector = JapaneseCorrector(nlp)
        nlp.add_pipe(corrector, last=True)

    if mode == 'A':
        nlp.tokenizer.mode = OriginalTokenizer.SplitMode.A
    elif mode == 'B':
        nlp.tokenizer.mode = OriginalTokenizer.SplitMode.B
    elif mode == 'C':
        nlp.tokenizer.mode = OriginalTokenizer.SplitMode.C
    else:
        raise Exception('mode should be A, B or C')
    print("mode is {}".format(mode), file=sys.stderr)
    if not use_sentence_separator:
        print("disabling sentence separator", file=sys.stderr)
        nlp.tokenizer.use_sentence_separator = False

    if output_path:
        output = open(str(output_path), 'w')
    else:
        output = sys.stdout

    try:
        if files:
            for path in files:
                with open(path, 'r') as f:
                    lines = f.readlines()
                for line in lines:
                    print_result(line, nlp, True, output_format, output)
        else:
            while True:
                line = input()
                print_result(line, nlp, True, output_format, output)
    except EOFError:
        pass
    except KeyboardInterrupt:
        pass
    finally:
        output.close()
Example #24
def main(model=None,
         output_dir=None,
         n_iter=20,
         n_texts=2000,
         init_tok2vec=None,
         data_loader=None):
    if data_loader is None:
        raise ValueError("Data Loader is required")

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
    # enable working with GPU
    spacy.require_gpu()
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe("textcat",
                                  config={
                                      "exclusive_classes": True,
                                      "architecture": "simple_cnn",
                                  })
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add label to text classifier
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")
    textcat.add_label("NEUTRAL")

    # load the IMDB dataset
    print("Loading data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = data_loader()
    train_texts = train_texts[:n_texts] if n_texts is not None else train_texts
    train_cats = train_cats[:n_texts] if n_texts is not None else train_cats
    print("Using {} examples ({} training, {} evaluation)".format(
        n_texts, len(train_texts), len(dev_texts)))
    print("text {}".format(dev_texts[0]))
    print("Cat {}".format(dev_cats[0]))
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=0.2,
                           losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print("{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".
                  format(  # print a simple table
                      losses["textcat"],
                      scores["textcat_p"],
                      scores["textcat_r"],
                      scores["textcat_f"],
                  ))

    # test the trained model
    test_text = "This movie sucked"
    test_text2 = "In the summer time we can have some good time. In the summer time we can do SKR"
    test_text3 = "I know that girl. She is very superficial. She is all about looks and money. She wants to do SKR"
    test_text4 = "A brand new gucci pouch. If you don't invest then you're losing out. All of that assets."
    test_text5 = "There were good moments in my high school: Koforidua Secondary Technical School."
    test_text6 = "Ex President Obama has swag."
    test_text7 = "Robert Freeman is a pathological liar."
    test_text8 = "Dear all, find attached the document"
    test_text9 = "What happens when technology meets science?"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
        doc2 = nlp2(test_text3)
        print(test_text3, doc2.cats)
        doc2 = nlp2(test_text4)
        print(test_text4, doc2.cats)
        doc2 = nlp2(test_text5)
        print(test_text5, doc2.cats)
        doc2 = nlp2(test_text6)
        print(test_text6, doc2.cats)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
        doc2 = nlp2(test_text7)
        print(test_text7, doc2.cats)
        doc2 = nlp2(test_text8)
        print(test_text8, doc2.cats)
        doc2 = nlp2(test_text9)
        print(test_text9, doc2.cats)

    do_report(dev_texts, dev_cats, output_dir)
Example #25
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import spacy
spacy.require_gpu()
from pathlib import Path
import random
from spacy.util import minibatch, compounding
#import xx_ent_wiki_sm
from spacy.lang.pt import Portuguese
from ast import literal_eval
import datetime
import time 

output_dir = "./sky_ner"

modelDir = Path(output_dir)

nlp = spacy.blank('pt')    
ts = time.time()
st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
print (st)
if modelDir.exists() is True:

    # training data
    TRAIN_DATA = open('dataset_new.txt', 'r').read()
    print('Dados carregados')
    try:
        TRAIN_DATA = literal_eval(TRAIN_DATA)
        print('literal eval aplicado')
    except:
        pass  # assumption: the scraped snippet is truncated here; the original presumably handled the literal_eval failure
Example #26
def main(nH=6, dropout=0.1, nS=6, nB=15, nE=20, use_gpu=-1, lim=2000):
    if use_gpu != -1:
        # TODO: Make specific to different devices, e.g. 1 vs 0
        spacy.require_gpu()
    train, dev, test = get_iwslt()
    train_X, train_Y = zip(*train)
    dev_X, dev_Y = zip(*dev)
    test_X, test_Y = zip(*test)
    ''' Read dataset '''
    nlp_en = spacy.load('en_core_web_sm')
    nlp_de = spacy.load('de_core_news_sm')
    print('Models loaded')
    for control_token in ("<eos>", "<bos>", "<pad>"):
        nlp_en.tokenizer.add_special_case(control_token, [{
            ORTH: control_token
        }])
        nlp_de.tokenizer.add_special_case(control_token, [{
            ORTH: control_token
        }])
    train_X, train_Y = spacy_tokenize(nlp_en.tokenizer, nlp_de.tokenizer,
                                      train_X[-lim:], train_Y[-lim:],
                                      MAX_LENGTH)
    dev_X, dev_Y = spacy_tokenize(nlp_en.tokenizer, nlp_de.tokenizer,
                                  dev_X[-lim:], dev_Y[-lim:], MAX_LENGTH)
    test_X, test_Y = spacy_tokenize(nlp_en.tokenizer, nlp_de.tokenizer,
                                    test_X[-lim:], test_Y[-lim:], MAX_LENGTH)
    train_X = set_numeric_ids(nlp_en.vocab, train_X, vocab_size=VOCAB_SIZE)
    train_Y = set_numeric_ids(nlp_de.vocab, train_Y, vocab_size=VOCAB_SIZE)
    nTGT = VOCAB_SIZE

    with Model.define_operators({">>": chain}):
        embed_cols = [ORTH, SHAPE, PREFIX, SUFFIX]
        extractor = FeatureExtracter(attrs=embed_cols)
        position_encode = PositionEncode(MAX_LENGTH, MODEL_SIZE)
        model = (apply_layers(extractor, extractor) >> apply_layers(
            with_flatten(FancyEmbed(MODEL_SIZE, 5000, cols=embed_cols)),
            with_flatten(FancyEmbed(MODEL_SIZE, 5000, cols=embed_cols)),
        ) >> apply_layers(Residual(position_encode), Residual(position_encode))
                 >> create_batch() >> EncoderDecoder(nS=nS, nH=nH, nTGT=nTGT))

    losses = [0.]
    train_accuracies = [0.]
    train_totals = [0.]
    dev_accuracies = [0.]
    dev_loss = [0.]

    def track_progress():
        correct = 0.
        total = 0.
        for batch in minibatch(zip(dev_X, dev_Y), size=1024):
            X, Y = zip(*batch)
            Yh, Y_mask = model((X, Y))
            L, C = get_loss(model.ops, Yh, Y, Y_mask)
            correct += C
            dev_loss[-1] += (L**2).sum()
            total += len(Y)
        dev_accuracies[-1] = correct / total
        n_train = train_totals[-1]
        print(len(losses), losses[-1], train_accuracies[-1] / n_train,
              dev_loss[-1], dev_accuracies[-1])
        dev_loss.append(0.)
        losses.append(0.)
        train_accuracies.append(0.)
        dev_accuracies.append(0.)
        train_totals.append(0.)

    with model.begin_training(batch_size=nB,
                              nb_epoch=nE) as (trainer, optimizer):
        trainer.dropout = dropout
        trainer.dropout_decay = 1e-4
        trainer.each_epoch.append(track_progress)
        optimizer.alpha = 0.001
        optimizer.L2 = 1e-6
        optimizer.max_grad_norm = 1.0
        for X, Y in trainer.iterate(train_X, train_Y):
            (Yh, X_mask), backprop = model.begin_update((X, Y), drop=dropout)
            dYh, C = get_loss(model.ops, Yh, Y, X_mask)
            backprop(dYh, sgd=optimizer)
            losses[-1] += (dYh**2).sum()
            train_accuracies[-1] += C
            train_totals[-1] += sum(len(y) for y in Y)
Example #27
def train_word2vec_from_file(
        corpus_type='sudachi_b',
        base_model_path=None,
        lang_name='ja',
        model_name='bccwj_ud',
        model_version='1.0.0',
        dimension=100,
        vocab_size=100000,
        min_count=5,
        window=7,
        negative=5,
        n_workers=8,
        epochs=2,
        output_dir=Path('.'),
        require_gpu=False,
        input_path=None,
):
    if require_gpu:
        spacy.require_gpu()
        print("GPU enabled", file=sys.stderr)
    if corpus_type == 'sudachi_a':
        corpus_reader = read_sudachi_a
    elif corpus_type == 'sudachi_b':
        corpus_reader = read_sudachi_b
    elif corpus_type == 'sudachi_c':
        corpus_reader = read_sudachi_c
    elif corpus_type == 'bccwj_ud':
        corpus_reader = read_bccwj_ud
    else:
        raise Exception('%s not supported' % corpus_type)

    if base_model_path:
        print('load base model: {}'.format(base_model_path), file=sys.stderr)
        model = Word2Vec.load(str(model_file_path(base_model_path, 'w2v')))
        print('w2v loaded', file=sys.stderr)
        with open(str(model_file_path(base_model_path, 'pickle')), 'rb') as f:
            total_sents, word_store, word_counter = pickle.load(f)
        print('pickle loaded', file=sys.stderr)
    else:
        model = Word2Vec(
            size=dimension,
            window=window,
            min_count=min_count,
            workers=n_workers,
            sample=1e-5,
            negative=negative
        )
        total_sents = 0
        word_store = {}
        word_counter = []
        print('initialized', file=sys.stderr)

    total_sents, words = train_word2vec(
        model, total_sents, word_store, word_counter, corpus_reader, vocab_size, min_count, epochs, input_path
    )

    new_model_path = output_dir

    nlp = get_lang_class(lang_name)
    nlp.meta['name'] = model_name
    nlp.meta['version'] = model_version
    vocab = nlp.vocab
    for word in words:
        vocab.set_vector(word, model.wv[word])

    corrector = nlp.create_pipe('JapaneseCorrector')
    nlp.add_pipe(corrector, last=True)
    nlp.to_disk(new_model_path)
    print('saved: ', new_model_path, file=sys.stderr)

    model.save(str(model_file_path(new_model_path, 'w2v')))
    print('w2v saved', file=sys.stderr)

    with open(str(model_file_path(new_model_path, 'pickle')), 'wb') as f:
        pickle.dump((total_sents, word_store, word_counter), f)
    print('pickle saved', file=sys.stderr)
Example #28
def test_require_gpu():
    with pytest.raises(ValueError):
        require_gpu()
Example #29
# NVIDIA

import os
import spacy

#spacy.prefer_gpu()
spacy.require_gpu()

input_file = os.environ['WORKING_DIR'] + '/intermediate_files/wikipedia.txt'
output_file = os.environ['WORKING_DIR'] + '/final_test_file_single/wikipedia.segmented.txt'

nlp = spacy.load('en_core_web_sm')

doc_seperator = "\n"

file_mem = []

print("Reading file into memory.")
with open(input_file) as ifile:
  for line in ifile:
    if line != "\n":
      file_mem.append(line)

print("File read.")
print("Starting nlp.pipe")
docs = nlp.pipe(file_mem, batch_size=1000)

print("Starting to write output")
with open(output_file, "w") as ofile:
  for item in docs:
    for sent in item.sents:
      ofile.write(sent.text + doc_seperator)  # assumed completion; the scraped snippet cuts off here (cf. Example #14)
Example #30
def main(
    corpus_type=None,
    model_path=None,
    mode=SUDACHI_DEFAULT_MODE,
    use_sentence_separator=False,
    disable_pipes='',
    recreate_corrector=False,
    output_path=None,
    require_gpu=False,
    *lines,
):
    if require_gpu:
        spacy.require_gpu()
        print("GPU enabled", file=sys.stderr)
    nlp = load_model(model_path)
    if disable_pipes:
        print("disabling pipes: {}".format(disable_pipes), file=sys.stderr)
        nlp.disable_pipes(disable_pipes)
        print("using : {}".format(nlp.pipe_names), file=sys.stderr)
    else:
        # recreate the corrector so that local changes to it are reflected
        if recreate_corrector and 'JapaneseCorrector' in nlp.pipe_names:
            nlp.remove_pipe('JapaneseCorrector')
            corrector = JapaneseCorrector(nlp)
            nlp.add_pipe(corrector, last=True)

    if mode == 'A':
        nlp.tokenizer.mode = OriginalTokenizer.SplitMode.A
    elif mode == 'B':
        nlp.tokenizer.mode = OriginalTokenizer.SplitMode.B
    elif mode == 'C':
        nlp.tokenizer.mode = OriginalTokenizer.SplitMode.C
    else:
        raise Exception('mode should be A, B or C')
    print("mode is {}".format(mode), file=sys.stderr)
    if not use_sentence_separator:
        print("disabling sentence separator", file=sys.stderr)
        nlp.tokenizer.use_sentence_separator = False

    if output_path:
        output = open(str(output_path), 'w')
    else:
        output = sys.stdout

    line = '<init>'
    try:
        if corpus_type:
            if corpus_type == 'bccwj_ud':
                for line in convert_files(lines):
                    print_result(line, nlp, True, output)
            else:
                for path in lines:
                    with open(path, 'r') as f:
                        lines = f.readlines()
                    for line in lines:
                        print_result(line, nlp, True, output)
        elif len(lines) > 0:
            for line in lines:
                print_result(line, nlp, True, output)
        else:
            while True:
                line = input()
                print_result(line, nlp, True, output)
    except EOFError:
        pass
    except KeyboardInterrupt:
        pass
    except Exception as e:
        print(e, file=sys.stderr)
        print('exception raised while analyzing the line:',
              line,
              file=sys.stderr)
    finally:
        output.close()
Example #31
def test_require_gpu():
    try:
        import cupy  # noqa: F401
    except ImportError:
        with pytest.raises(ValueError):
            require_gpu()
Example #32
def main(
):  #execute all functions within main to protect against multiprocessing infinite feedback loop

    if cpu_count() >= 8:  #to avoid overtaxing Brad, save some cores
        cpu = 8
    else:
        cpu = cpu_count()

    with open(
            '../input/generated_meta_strings.pkl', "rb"
    ) as pkl:  # dictionary with authors as keys and their strings as values
        auth_strings = pickle.load(pkl)

    with open(
            '../input/alter_lists.pkl', "rb"
    ) as pkl:  # dataframe with author column, alters column, and alters_2 column
        alter_lists = pickle.load(pkl)

    with open(
            '../input/author_metadata.pkl', "rb"
    ) as pkl:  # dictionary with author metadata (ie. community membership)
        author_metadata = pickle.load(pkl)

    #create dataframe from dict (todo: just output a dataframe from community_strings instead)
    author_metadata = pd.DataFrame.from_dict(
        author_metadata,
        orient="index").reset_index().rename(columns={"index": "author"})

    auth_alt_dict = dict(zip(alter_lists.author,
                             alter_lists.alter))  # dict of {auth:alter list}
    auth_alt_dict_2 = dict(
        zip(alter_lists.author,
            alter_lists.alter_2))  # dict of {auth: alter_2 list}
    auth_list = list(auth_strings.keys())  # list of author names

    auth_index = dict()  # pretty sure this isn't needed anymore

    for i, item in enumerate(auth_list):  # see above
        auth_index[item] = [i]

    abs_list = []  # list of author strings to process

    # NOTE: this is only safe because the auth_strings dict hasn't been modified. Should be modified for posterity
    for author in auth_strings:
        abs_list.append(auth_strings[author]["meta_string"])

    del auth_strings

    bigram_text = bigram_process(
        abs_list)  # find and concatenate bigrams in the author string list

    # load spacy model, disable unnecessary parser and named entity recog for performance
    spacy.require_gpu()  #comment out to not use GPU
    nlp = spacy.load('en', disable=['parser', 'ner'])

    #nlp.max_length = 10000000   # comment out if strings are very large and causing memory issues

    # send bigrammed text and spacy function + its required variables to multiprocess function for execution
    #processed_list = mp(bigram_text, spacy_process, cpu, nlp) #comment out to use GPU instead of multiprocess function
    processed_list = spacy_process_gpu(bigram_text,
                                       nlp)  #comment out to not use GPU
    print('spacy_process_complete')
    vectorizer = TfidfVectorizer(max_df=0.5,
                                 min_df=3,
                                 stop_words='english',
                                 norm='l2')
    matrix = vectorizer.fit_transform(
        processed_list)  # Tfidf vectors for each author string
    auth_vectors = dict(zip(auth_list,
                            matrix))  # create a dict of {author: tfidf vector}

    #create a dataframe by sending list of authors and the dissim function + its required variables to multiprocess function
    sim_df = pd.DataFrame.from_dict(
        mp(auth_list, dissim, cpu, auth_alt_dict, auth_alt_dict_2,
           auth_vectors))

    # populate all 3 df author average columns by sending avg_alter_dissim to the mp3 function, which returns 3 lists of results
    sim_df['alter_dissim_avg'], sim_df['ring_dissim_avg'], sim_df['bridge_dissim_avg'] =\
        pd.Series(mp3(auth_list, rba_dissim, cpu, auth_alt_dict, auth_vectors)).array

    sim_df.to_csv('../output/sim_scores.csv', index=False)