def train_from_file(
        input_path,
        corpus_type='bccwj_ud',
        model_path=None,
        clear_model=False,
        keep_gold_tokens=False,
        evaluate_all_combinations=False,
        mini_batch_size=128,
        max_epochs=32,
        online_sgd_max_epochs=0,
        give_up_iter=3,
        evaluation_corpus_path=None,
        output_base_path=None,
        require_gpu=False,
):
    if require_gpu:
        spacy.require_gpu()
        print("GPU enabled", file=sys.stderr)

    if corpus_type == 'bccwj_ud':
        corpus = convert_files(input_path)
        if evaluation_corpus_path:
            evaluation_gold = convert_files(evaluation_corpus_path)
        else:
            evaluation_gold = corpus[0:100]
    else:
        corpus = None
        evaluation_gold = None

    return train(corpus, model_path, clear_model, keep_gold_tokens,
                 evaluate_all_combinations, mini_batch_size, max_epochs,
                 online_sgd_max_epochs, give_up_iter, evaluation_gold,
                 output_base_path)
def set_nlp(self):
    if self.nlp:
        return

    if self.require_gpu:
        spacy.require_gpu()

    if self.output_format in ["2", "mecab"]:
        nlp = JapaneseDefaults.create_tokenizer(config={
            "split_mode": self.split_mode
        }).tokenizer
    else:
        # Work-around for pickle error. Need to share model data.
        if self.model_path:
            nlp = spacy.load(self.model_path)
        else:
            nlp = spacy.load("ja_ginza")

    if self.disable_sentencizer:
        def disable_sentencizer(doc):
            for t in doc:
                t.is_sent_start = False
            return doc

        nlp.add_pipe(disable_sentencizer, before="parser")

    if self.split_mode:
        set_split_mode(nlp, self.split_mode)

    self.nlp = nlp
def main():
    spacy.require_gpu()
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--article-path",
        help="path to articles to constrain",
    )
    parser.add_argument(
        "--abstract-path",
        help="abstracts to constrain",
    )
    parser.add_argument(
        "--output-path",
        help="path to save pos sents",
    )
    parser.add_argument("--k", type=int, default=16)
    parser.add_argument("--batch-size", type=int, default=1000)
    args = parser.parse_args()

    selector = SentenceSelector(args.k)
    i = 0
    with open(args.article_path) as article_file, open(
            args.abstract_path) as abstract_file, open(args.output_path,
                                                       'w+') as output_file:
        start = time.time()
        articles = []
        abstracts = []
        for x, y in zip(article_file, abstract_file):
            x = x.strip()
            y = y.strip()
            articles.append(x)
            abstracts.append(y)
            if i % args.batch_size == 0:
                articles = list(selector.nlp.pipe(articles))
                abstracts = list(selector.nlp.pipe(abstracts))
                for j in range(len(articles)):
                    labels = selector.confine_docs(articles[j], abstracts[j])
                    for label in labels:
                        print(' '.join(label), file=output_file)
                articles = []
                abstracts = []
                end = time.time()
                print('elapsed since start', end - start)
                print("processed {} lines".format(i), file=sys.stderr)
            i += 1

        if len(articles) > 0:
            print('leftovers')
            articles = list(selector.nlp.pipe(articles))
            abstracts = list(selector.nlp.pipe(abstracts))
            for j in range(len(articles)):
                labels = selector.confine_docs(articles[j], abstracts[j])
                for label in labels:
                    print(' '.join(label), file=output_file)
def train_parser_from_file(
        input_json_path,
        model_path=None,
        clear_model=False,
        excluding_labels='',
        mini_batch_size=128,
        max_epochs=32,
        online_sgd_max_epochs=0,
        give_up_iter=3,
        evaluation_corpus_path=None,
        output_base_path=None,
        require_gpu=False,
):
    if require_gpu:
        spacy.require_gpu()
        print("GPU enabled", file=sys.stderr)

    corpus = convert_files(input_json_path)
    if evaluation_corpus_path:
        evaluation_gold = convert_files(evaluation_corpus_path)
    else:
        evaluation_gold = corpus[0:1000]

    train(
        corpus,
        model_path,
        clear_model,
        excluding_labels,
        mini_batch_size,
        max_epochs,
        online_sgd_max_epochs,
        give_up_iter,
        evaluation_gold,
        output_base_path
    )
def setup_spacy(use_gpu=False):
    if use_gpu:
        spacy.require_gpu()
        spacy.util.use_gpu(0)
    # disable everything we don't have at runtime either
    nlp = spacy.load('de', disable=['parser', 'ner'])
    infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes + [
        '—',  # numeric dash: (?<=[0-9])—(?=[0-9])
        '/'
    ])  # maybe more restrictive?
    suffix_re = spacy.util.compile_suffix_regex(
        nlp.Defaults.suffixes + ('/', ))  # maybe more restrictive?
    # '〟' as historic quotation mark (left and right)
    # '〃' as historic quotation mark (at the start of the line!)
    # '‟' as historic quotation mark (at the start of the line!)
    # '›' and '‹' as historic quotation marks (maybe goes away with NFC?)
    # '⟨' and '⟩' parentheses (maybe goes away with NFC?)
    # '⁽' and '⁾' parentheses (maybe goes away with NFC?)
    # '〈' and '〉' brackets (maybe goes away with NFC?)
    # '‹' and '›' as historic quotation mark
    # '’' as historic apostrophe
    # '—' as dash, even when written like a prefix
    # \u+feff (byte order mark) as prefix
    nlp.tokenizer = spacy.tokenizer.Tokenizer(
        nlp.vocab,
        token_match=nlp.tokenizer.token_match,
        prefix_search=nlp.tokenizer.prefix_search,
        suffix_search=nlp.tokenizer.suffix_search,
        infix_finditer=infix_re.finditer)
    return nlp
def __init__(self, data, args):
    self.data = data
    self.job_level = load_job_dict(args)
    self.edu_level = load_edu_dict(args)
    # load nlp pipeline model
    spacy.require_gpu()
    self.nlp_pipe = spacy.load(args.nlp_model)
    self.args = args
def __init__(self, iso, use_gpu=False):
    import spacy
    if use_gpu:
        # TODO: Support different GPU ids??
        spacy.require_gpu()
    self.nlp = spacy.load(DSpacy[iso])
    EngineInstance.__init__(self, iso, use_gpu)
def test_require_gpu():
    try:
        import cupy  # noqa: F401

        require_gpu()
        assert isinstance(get_current_ops(), CupyOps)
    except ImportError:
        with pytest.raises(ValueError):
            require_gpu()
def load_pipeline(use_gpu: bool, with_ner: bool, model_name: str = "hu_core_news_lg"):
    if use_gpu:
        spacy.require_gpu()
    nlp = spacy.load(model_name)
    if not with_ner:
        nlp.remove_pipe("ner")
    return nlp
def __init__(self, pipeline_path="fi_geoparser", use_gpu=True, output_df=True):
    if use_gpu:
        spacy.require_gpu()
    else:
        spacy.require_cpu()

    self.output_df = output_df
    self.ner_pipeline = spacy.load(pipeline_path)
def test_require_cpu():
    require_cpu()
    assert isinstance(get_current_ops(), NumpyOps)
    try:
        import cupy  # noqa: F401

        require_gpu()
        assert isinstance(get_current_ops(), CupyOps)
    except ImportError:
        pass
    require_cpu()
    assert isinstance(get_current_ops(), NumpyOps)
def ner_parser(df, col_string, batch_size=256):
    spacy.require_gpu()
    nlp = spacy.load("en_core_web_sm")
    docs = nlp.pipe(df[col_string], disable=["tagger", "parser"],
                    batch_size=batch_size)
    out = []
    for doc in docs:
        orgs = [ent.text for ent in doc.ents if ent.label_ == "ORG"]
        out.append(", ".join(orgs))
    df["company_name_list"] = out
    return df
def evaluate_from_file(
        input_json_path,
        model_path=None,
        require_gpu=False,
        print_stats=True,
        nlp=None,
):
    corpus = convert_files(input_json_path)
    if require_gpu:
        spacy.require_gpu()
        print("GPU enabled", file=sys.stderr)

    return evaluate(corpus, model_path, print_stats, nlp)
def spacy_sentences(path):
    import spacy
    # spacy.prefer_gpu()
    spacy.require_gpu()
    nlp = spacy.load('en_core_web_sm')
    pin = path / 'intermediate_files/wikipedia.txt'
    pout = path / 'final_text_file_single/wikipedia.segmented.txt'
    with open(pin) as f:
        with open(pout, 'w') as out:
            for ln in f:
                if ln != '\n':
                    for s in nlp(ln).sents:
                        out.write(s.text + '\n')
def __init__(
    self,
    model="en_ner_eco_md",
    with_abbrev=False,
    with_linking=None,
    with_sentence=False,
    threshold=0.7,
    prefer_gpu=False,
    verbose=False,
    logger=None,
):
    self.logger = logger if logger else logging.getLogger(__name__)
    warnings.simplefilter("ignore")

    self.verbose = verbose
    self.extractor = TextExtractor(logger=self.logger)

    if prefer_gpu:
        use_cuda = torch.cuda.is_available()
        self.logger.info("GPU is available" if use_cuda else "GPU not found")
        if use_cuda:
            spacy.require_gpu()
            self.logger.info("TaxoNERD will use GPU")

    self.logger.info("Load model {}".format(model))
    self.nlp = spacy.load(model)
    self.logger.info("Loaded model {}-{}".format(self.nlp.meta["name"],
                                                 self.nlp.meta["version"]))

    self.with_sentence = with_sentence
    if self.with_sentence:
        if self.verbose:
            logger.info(f"Add pySBDSentencizer to pipeline")
        Span.set_extension("sent_id", default=None)
        self.nlp.add_pipe("pysbd_sentencizer", before="ner")

    self.with_abbrev = with_abbrev
    if self.with_abbrev:
        if self.verbose:
            logger.info(f"Add TaxonomicAbbreviationDetector to pipeline")
        self.nlp.add_pipe("taxonomic_abbreviation_detector")

    self.with_linking = with_linking is not None
    if self.with_linking:
        kb_name = with_linking if with_linking != "" else "gbif_backbone"
        if self.verbose:
            logger.info(f"Add EntityLinker {kb_name} to pipeline")
        self.create_linker(kb_name, threshold)
def evaluate_from_file(
        path,
        corpus_type='bccwj_ud',
        model_path=None,
        parse_result_path=None,
        keep_gold_tokens=False,
        evaluate_all_combinations=False,
        require_gpu=False,
        print_file=sys.stdout,
        nlp=None,
):
    if require_gpu:
        spacy.require_gpu()
        print("GPU enabled", file=sys.stderr)

    parse_results = None
    if corpus_type == 'bccwj_ud':
        gold = convert_files(path)
        if parse_result_path:
            parse_results = convert_files(parse_result_path)
    else:
        gold = None

    if not nlp:
        nlp = spacy.load(model_path)
        nlp.tokenizer.use_sentence_separator = False
        if 'JapaneseCorrector' not in nlp.pipe_names:
            corrector = nlp.create_pipe('JapaneseCorrector')
            nlp.add_pipe(corrector, last=True)

    rewritten = [g.clone() for g in gold]
    if not keep_gold_tokens:
        print('Rewriting gold corpus with tokenizer', file=sys.stderr)
        disabled = nlp.disable_pipes(*nlp.pipe_names)
        rewrite_by_tokenizer(rewritten, nlp, sys.stderr)
        disabled.restore()
        print(file=sys.stderr, flush=True)

    return evaluate(gold, rewritten, model_path, parse_results,
                    keep_gold_tokens, evaluate_all_combinations, print_file,
                    nlp)
def main():
    logger.info("Start coreference parsing")

    parser = ArgumentParser()
    parser.add_argument('--htmls_fname', type=str, required=True)
    parser.add_argument('--objects_fname', type=str, required=True)
    parser.add_argument('--htmls_coref_cache', type=str, required=True)
    parser.add_argument('--work_dir', type=str, required=False, default=os.getcwd())
    args = parser.parse_args()

    work_dir = args.work_dir
    set_up_root_logger('COREF', os.path.join(work_dir, 'logs'))

    html_fname: str = args.htmls_fname
    objects_path = Path(args.objects_fname)
    htmls_coref_cache_fname: str = args.htmls_coref_cache

    with open(html_fname, "rb") as f_html:
        htmls_lookup = pickle.load(f_html)

    htmls_lookup_coref = load_cache(htmls_coref_cache_fname)

    names = get_all_objects(objects_path, work_dir)
    logger.info(f'Number of objects: {len(names)}')

    spacy.require_gpu()
    nlp = spacy.load('en_core_web_sm')
    neuralcoref.add_to_pipe(nlp)

    find_corefs(htmls_coref_cache_fname, htmls_lookup, htmls_lookup_coref,
                names, nlp)

    with open(htmls_coref_cache_fname, 'wb') as f:
        pickle.dump(htmls_lookup_coref, f, pickle.HIGHEST_PROTOCOL)

    logger.info('Finished')
def main(args):
    if args.cuda:
        spacy.require_gpu()

    # Load a spaCy model (supported models are "es" and "en")
    print("Loading spacy...")
    nlp = spacy.load("en_core_web_lg")
    print("Done")
    nlp.tokenizer = lambda text: whitespace_tokenizer(text, nlp.vocab)
    nlp.add_pipe(WordnetAnnotator(nlp.lang), after="tagger")
    nlp.add_pipe(BeneparComponent("benepar_en2"))

    with open(args.data) as f:
        lines = [line.strip() for line in list(f)]

    all_texts = []
    all_feats = []
    docs = nlp.pipe(lines, batch_size=args.batch_size)
    for doc in tqdm(docs, desc="Extracting feats", total=len(lines)):
        doc_feats = []
        doc_texts = []
        for token in doc:
            t_feats = extract_feats(token)
            doc_feats.append(t_feats)
            doc_texts.append(token.text)
        all_feats.append(doc_feats)
        all_texts.append(doc_texts)

    with open(args.data.replace(".tok", ".feats"), "w") as f:
        f.write("|".join((";".join(fn[:2]) for fn in FEATS)))
        f.write("\n")
        for text, doc_feats in zip(all_texts, all_feats):
            t_feats_joined = ["|".join(tf) for tf in doc_feats]
            line_feats = " ".join(
                ["|".join((t, f)) for t, f in zip(text, t_feats_joined)])
            f.write(line_feats)
            f.write("\n")
def run_on_input(nlp, str_list, output_folder, filename='single_run', ref_list=None):
    from research.prodigy.functions import make_evaluation_html

    spacy.require_gpu()
    ent_data = []
    for i, doc in tqdm(enumerate(nlp.pipe(str_list, batch_size=1000)),
                       total=len(str_list)):
        ent_data += [{
            "text": doc.text,
            "tp": [[ent.start_char, ent.end_char, ent.label_] for ent in doc.ents],
            "fp": [],
            "fn": [],
            "ref": "" if ref_list is None else ref_list[i],
            "_id": "",
        }]
    srsly.write_jsonl(f'{output_folder}/{filename}.jsonl', ent_data)
    make_evaluation_html(ent_data, output_folder, filename + '.html')
def main(nH=6, dropout=0.1, nS=6, nB=64, nE=20, use_gpu=-1, lim=1000000,
         nM=300, mL=100, save=False, nTGT=5000, save_name="model.pkl"):
    if use_gpu != -1:
        spacy.require_gpu()
        device = 'cuda'
    else:
        device = 'cpu'

    ''' Read dataset '''
    nlp = spacy.load('en_core_web_sm')
    print('English model loaded')
    for control_token in ("<eos>", "<bos>", "<pad>", "<cls>", "<mask>"):
        nlp.tokenizer.add_special_case(control_token, [{ORTH: control_token}])
    train, dev, test = get_iwslt()
    print('Dataset loaded')

    train, _ = zip(*train)
    dev, _ = zip(*dev)
    test, _ = zip(*test)

    train = train[:lim]
    dev = dev[:lim]
    test = test[:lim]

    ''' Tokenize '''
    train = spacy_tokenize(nlp.tokenizer, train, mL=mL)
    dev = spacy_tokenize(nlp.tokenizer, dev, mL=mL)
    test = spacy_tokenize(nlp.tokenizer, test, mL=mL)
    print('Tokenization finished')

    ''' Set rank based on all the docs '''
    all_docs = train + dev + test
    set_rank(nlp.vocab, all_docs, nTGT=nTGT)

    train = set_numeric_ids(nlp.vocab, train)
    dev = set_numeric_ids(nlp.vocab, dev)
    test = set_numeric_ids(nlp.vocab, test)
    print('Numeric ids set')

    word2indx, indx2word = get_dicts(nlp.vocab)
    print('Vocab dictionaries grabbed')

    with Model.define_operators({">>": chain}):
        embed_cols = [ORTH, SHAPE, PREFIX, SUFFIX]
        extractor = FeatureExtracter(attrs=embed_cols)
        position_encode = PositionEncode(mL, nM)
        model = (
            FeatureExtracter(attrs=embed_cols)
            >> with_flatten(FancyEmbed(nM, nTGT, cols=embed_cols))
            >> Residual(position_encode)
            >> create_model_input()
            >> Encoder(nM=nM, nS=nS, nH=nH, device=device)
            >> with_reshape(Softmax(nO=nTGT, nI=nM))
        )

    ''' Progress tracking '''
    losses = [0.]
    train_accuracies = [0.]
    train_totals = [0.]
    dev_accuracies = [0.]
    dev_loss = [0.]

    def track_progress():
        correct = 0.
        total = 0.
        ''' Get dev stats '''
        for X0 in minibatch(dev, size=nB):
            X1, loss_mask = random_mask(X0, nlp, indx2word, nlp.vocab, mL)
            Xh = model(X1)
            L, C, t = get_loss(Xh, X0, X1, loss_mask)
            correct += C
            total += t
            dev_loss[-1] += (L**2).sum()
        dev_accuracies[-1] = correct / total
        print(len(losses), losses[-1], train_accuracies[-1] / train_totals[-1],
              dev_loss[-1], dev_accuracies[-1])
        dev_loss.append(0.)
        losses.append(0.)
        train_accuracies.append(0.)
        dev_accuracies.append(0.)
        train_totals.append(0.)
        if save:
            model.to_disk('.models/' + save_name)

    ''' Model training '''
    with model.begin_training(batch_size=nB, nb_epoch=nE) as (trainer, optimizer):
        trainer.dropout = dropout
        trainer.dropout_decay = 1e-4
        trainer.each_epoch.append(track_progress)
        optimizer.alpha = 0.001
        optimizer.L2 = 1e-6
        optimizer.max_grad_norm = 1.0
        for X0, _ in trainer.iterate(train, train):
            X1, loss_mask = random_mask(X0, nlp, indx2word, nlp.vocab, mL)
            Xh, backprop = model.begin_update(X1, drop=dropout)
            dXh, C, total = get_loss(Xh, X0, X1, loss_mask)
            backprop(dXh, sgd=optimizer)
            losses[-1] += (dXh**2).sum()
            train_accuracies[-1] += C
            train_totals[-1] += total
def init():
    global nlp
    # note: don't initialize CUDA until we fork child procs
    spacy.require_gpu()
    nlp = spacy.load('en', disable=['tagger', 'ner', 'textcat'])
def test_require_gpu():
    with pytest.raises(ValueError):
        require_gpu()
def run(
        model_path=None,
        mode=SUDACHI_DEFAULT_MODE,
        use_sentence_separator=False,
        disable_pipes='',
        recreate_corrector=False,
        output_path=None,
        output_format='0',
        require_gpu=False,
        *files,
):
    if require_gpu:
        spacy.require_gpu()
        print("GPU enabled", file=sys.stderr)

    if model_path:
        nlp = spacy.load(model_path)
    else:
        nlp = spacy.load('ja_ginza')

    if disable_pipes:
        print("disabling pipes: {}".format(disable_pipes), file=sys.stderr)
        nlp.disable_pipes(disable_pipes)
        print("using : {}".format(nlp.pipe_names), file=sys.stderr)

    if recreate_corrector:
        if 'JapaneseCorrector' in nlp.pipe_names:
            nlp.remove_pipe('JapaneseCorrector')
        corrector = JapaneseCorrector(nlp)
        nlp.add_pipe(corrector, last=True)

    if mode == 'A':
        nlp.tokenizer.mode = OriginalTokenizer.SplitMode.A
    elif mode == 'B':
        nlp.tokenizer.mode = OriginalTokenizer.SplitMode.B
    elif mode == 'C':
        nlp.tokenizer.mode = OriginalTokenizer.SplitMode.C
    else:
        raise Exception('mode should be A, B or C')
    print("mode is {}".format(mode), file=sys.stderr)

    if not use_sentence_separator:
        print("disabling sentence separator", file=sys.stderr)
        nlp.tokenizer.use_sentence_separator = False

    if output_path:
        output = open(str(output_path), 'w')
    else:
        output = sys.stdout

    try:
        if files:
            for path in files:
                with open(path, 'r') as f:
                    lines = f.readlines()
                    for line in lines:
                        print_result(line, nlp, True, output_format, output)
        else:
            while True:
                line = input()
                print_result(line, nlp, True, output_format, output)
    except EOFError:
        pass
    except KeyboardInterrupt:
        pass
    finally:
        output.close()
def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None,
         data_loader=None):
    if data_loader is None:
        raise ValueError("Data Loader is required")

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()

    # enable working with GPU
    spacy.require_gpu()

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe("textcat", config={
            "exclusive_classes": True,
            "architecture": "simple_cnn",
        })
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add labels to the text classifier
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")
    textcat.add_label("NEUTRAL")

    # load the dataset
    print("Loading data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = data_loader()
    train_texts = train_texts[:n_texts] if n_texts is not None else train_texts
    train_cats = train_cats[:n_texts] if n_texts is not None else train_cats
    print("Using {} examples ({} training, {} evaluation)".format(
        n_texts, len(train_texts), len(dev_texts)))
    print("text {}".format(dev_texts[0]))
    print("Cat {}".format(dev_cats[0]))
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                           losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
                print("{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                ))

    # test the trained model
    test_text = "This movie sucked"
    test_text2 = "In the summer time we can have some good time. In the summer time we can do SKR"
    test_text3 = "I know that girl. She is very superficial. She is all about looks and money. She wants to do SKR"
    test_text4 = "A brand new gucci pouch. If you don't invest then you're losing out. All of that assets."
    test_text5 = "There were good moments in my high school: Koforidua Secondary Technical School."
    test_text6 = "Ex President Obama has swag."
    test_text7 = "Robert Freeman is a pathological liar."
    test_text8 = "Dear all, find attached the document"
    test_text9 = "What happens when technology meets science?"

    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
        doc2 = nlp2(test_text3)
        print(test_text3, doc2.cats)
        doc2 = nlp2(test_text4)
        print(test_text4, doc2.cats)
        doc2 = nlp2(test_text5)
        print(test_text5, doc2.cats)
        doc2 = nlp2(test_text6)
        print(test_text6, doc2.cats)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
        doc2 = nlp2(test_text7)
        print(test_text7, doc2.cats)
        doc2 = nlp2(test_text8)
        print(test_text8, doc2.cats)
        doc2 = nlp2(test_text9)
        print(test_text9, doc2.cats)

    do_report(dev_texts, dev_cats, output_dir)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import spacy
spacy.require_gpu()
from pathlib import Path
import random
from spacy.util import minibatch, compounding
#import xx_ent_wiki_sm
from spacy.lang.pt import Portuguese
from ast import literal_eval
import datetime
import time

output_dir = "./sky_ner"
modelDir = Path(output_dir)
nlp = spacy.blank('pt')

ts = time.time()
st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
print(st)

if modelDir.exists() is True:
    # training data
    TRAIN_DATA = open('dataset_new.txt', 'r').read()
    print('Dados carregados')
    try:
        TRAIN_DATA = literal_eval(TRAIN_DATA)
        print('literal eval aplicado')
    except:
def main(nH=6, dropout=0.1, nS=6, nB=15, nE=20, use_gpu=-1, lim=2000):
    if use_gpu != -1:
        # TODO: Make specific to different devices, e.g. 1 vs 0
        spacy.require_gpu()

    train, dev, test = get_iwslt()
    train_X, train_Y = zip(*train)
    dev_X, dev_Y = zip(*dev)
    test_X, test_Y = zip(*test)

    ''' Read dataset '''
    nlp_en = spacy.load('en_core_web_sm')
    nlp_de = spacy.load('de_core_news_sm')
    print('Models loaded')
    for control_token in ("<eos>", "<bos>", "<pad>"):
        nlp_en.tokenizer.add_special_case(control_token, [{ORTH: control_token}])
        nlp_de.tokenizer.add_special_case(control_token, [{ORTH: control_token}])

    train_X, train_Y = spacy_tokenize(nlp_en.tokenizer, nlp_de.tokenizer,
                                      train_X[-lim:], train_Y[-lim:],
                                      MAX_LENGTH)
    dev_X, dev_Y = spacy_tokenize(nlp_en.tokenizer, nlp_de.tokenizer,
                                  dev_X[-lim:], dev_Y[-lim:], MAX_LENGTH)
    test_X, test_Y = spacy_tokenize(nlp_en.tokenizer, nlp_de.tokenizer,
                                    test_X[-lim:], test_Y[-lim:], MAX_LENGTH)

    train_X = set_numeric_ids(nlp_en.vocab, train_X, vocab_size=VOCAB_SIZE)
    train_Y = set_numeric_ids(nlp_de.vocab, train_Y, vocab_size=VOCAB_SIZE)
    nTGT = VOCAB_SIZE

    with Model.define_operators({">>": chain}):
        embed_cols = [ORTH, SHAPE, PREFIX, SUFFIX]
        extractor = FeatureExtracter(attrs=embed_cols)
        position_encode = PositionEncode(MAX_LENGTH, MODEL_SIZE)
        model = (
            apply_layers(extractor, extractor)
            >> apply_layers(
                with_flatten(FancyEmbed(MODEL_SIZE, 5000, cols=embed_cols)),
                with_flatten(FancyEmbed(MODEL_SIZE, 5000, cols=embed_cols)),
            )
            >> apply_layers(Residual(position_encode), Residual(position_encode))
            >> create_batch()
            >> EncoderDecoder(nS=nS, nH=nH, nTGT=nTGT)
        )

    losses = [0.]
    train_accuracies = [0.]
    train_totals = [0.]
    dev_accuracies = [0.]
    dev_loss = [0.]

    def track_progress():
        correct = 0.
        total = 0.
        for batch in minibatch(zip(dev_X, dev_Y), size=1024):
            X, Y = zip(*batch)
            Yh, Y_mask = model((X, Y))
            L, C = get_loss(model.ops, Yh, Y, Y_mask)
            correct += C
            dev_loss[-1] += (L**2).sum()
            total += len(Y)
        dev_accuracies[-1] = correct / total
        n_train = train_totals[-1]
        print(len(losses), losses[-1], train_accuracies[-1] / n_train,
              dev_loss[-1], dev_accuracies[-1])
        dev_loss.append(0.)
        losses.append(0.)
        train_accuracies.append(0.)
        dev_accuracies.append(0.)
        train_totals.append(0.)

    with model.begin_training(batch_size=nB, nb_epoch=nE) as (trainer, optimizer):
        trainer.dropout = dropout
        trainer.dropout_decay = 1e-4
        trainer.each_epoch.append(track_progress)
        optimizer.alpha = 0.001
        optimizer.L2 = 1e-6
        optimizer.max_grad_norm = 1.0
        for X, Y in trainer.iterate(train_X, train_Y):
            (Yh, X_mask), backprop = model.begin_update((X, Y), drop=dropout)
            dYh, C = get_loss(model.ops, Yh, Y, X_mask)
            backprop(dYh, sgd=optimizer)
            losses[-1] += (dYh**2).sum()
            train_accuracies[-1] += C
            train_totals[-1] += sum(len(y) for y in Y)
def train_word2vec_from_file(
        corpus_type='sudachi_b',
        base_model_path=None,
        lang_name='ja',
        model_name='bccwj_ud',
        model_version='1.0.0',
        dimension=100,
        vocab_size=100000,
        min_count=5,
        window=7,
        negative=5,
        n_workers=8,
        epochs=2,
        output_dir=Path('.'),
        require_gpu=False,
        input_path=None,
):
    if require_gpu:
        spacy.require_gpu()
        print("GPU enabled", file=sys.stderr)

    if corpus_type == 'sudachi_a':
        corpus_reader = read_sudachi_a
    elif corpus_type == 'sudachi_b':
        corpus_reader = read_sudachi_b
    elif corpus_type == 'sudachi_c':
        corpus_reader = read_sudachi_c
    elif corpus_type == 'bccwj_ud':
        corpus_reader = read_bccwj_ud
    else:
        raise Exception('%s not supported' % corpus_type)

    if base_model_path:
        print('load base model: {}'.format(base_model_path), file=sys.stderr)
        model = Word2Vec.load(str(model_file_path(base_model_path, 'w2v')))
        print('w2v loaded', file=sys.stderr)
        with open(str(model_file_path(base_model_path, 'pickle')), 'rb') as f:
            total_sents, word_store, word_counter = pickle.load(f)
        print('pickle loaded', file=sys.stderr)
    else:
        model = Word2Vec(
            size=dimension,
            window=window,
            min_count=min_count,
            workers=n_workers,
            sample=1e-5,
            negative=negative
        )
        total_sents = 0
        word_store = {}
        word_counter = []
        print('initialized', file=sys.stderr)

    total_sents, words = train_word2vec(
        model,
        total_sents,
        word_store,
        word_counter,
        corpus_reader,
        vocab_size,
        min_count,
        epochs,
        input_path
    )

    new_model_path = output_dir

    nlp = get_lang_class(lang_name)
    nlp.meta['name'] = model_name
    nlp.meta['version'] = model_version
    vocab = nlp.vocab
    for word in words:
        vocab.set_vector(word, model.wv[word])
    corrector = nlp.create_pipe('JapaneseCorrector')
    nlp.add_pipe(corrector, last=True)
    nlp.to_disk(new_model_path)
    print('saved: ', new_model_path, file=sys.stderr)

    model.save(str(model_file_path(new_model_path, 'w2v')))
    print('w2v saved', file=sys.stderr)
    with open(str(model_file_path(new_model_path, 'pickle')), 'wb') as f:
        pickle.dump((total_sents, word_store, word_counter), f)
    print('pickle saved', file=sys.stderr)
# NVIDIA
import os
import spacy
#spacy.prefer_gpu()
spacy.require_gpu()

input_file = os.environ['WORKING_DIR'] + '/intermediate_files/wikipedia.txt'
output_file = os.environ['WORKING_DIR'] + '/final_test_file_single/wikipedia.segmented.txt'

nlp = spacy.load('en_core_web_sm')

doc_seperator = "\n"

file_mem = []
print("Reading file into memory.")
with open(input_file) as ifile:
    for line in ifile:
        if line != "\n":
            file_mem.append(line)
print("File read.")

print("Starting nlp.pipe")
docs = nlp.pipe(file_mem, batch_size=1000)

print("Starting to write output")
with open(output_file, "w") as ofile:
    for item in docs:
        for sent in item.sents:
def main(
        corpus_type=None,
        model_path=None,
        mode=SUDACHI_DEFAULT_MODE,
        use_sentence_separator=False,
        disable_pipes='',
        recreate_corrector=False,
        output_path=None,
        require_gpu=False,
        *lines,
):
    if require_gpu:
        spacy.require_gpu()
        print("GPU enabled", file=sys.stderr)

    nlp = load_model(model_path)

    if disable_pipes:
        print("disabling pipes: {}".format(disable_pipes), file=sys.stderr)
        nlp.disable_pipes(disable_pipes)
        print("using : {}".format(nlp.pipe_names), file=sys.stderr)
    else:
        # recreate the corrector to ensure local changes to it are reflected
        if recreate_corrector and 'JapaneseCorrector' in nlp.pipe_names:
            nlp.remove_pipe('JapaneseCorrector')
            corrector = JapaneseCorrector(nlp)
            nlp.add_pipe(corrector, last=True)

    if mode == 'A':
        nlp.tokenizer.mode = OriginalTokenizer.SplitMode.A
    elif mode == 'B':
        nlp.tokenizer.mode = OriginalTokenizer.SplitMode.B
    elif mode == 'C':
        nlp.tokenizer.mode = OriginalTokenizer.SplitMode.C
    else:
        raise Exception('mode should be A, B or C')
    print("mode is {}".format(mode), file=sys.stderr)

    if not use_sentence_separator:
        print("disabling sentence separator", file=sys.stderr)
        nlp.tokenizer.use_sentence_separator = False

    if output_path:
        output = open(str(output_path), 'w')
    else:
        output = sys.stdout

    line = '<init>'
    try:
        if corpus_type:
            if corpus_type == 'bccwj_ud':
                for line in convert_files(lines):
                    print_result(line, nlp, True, output)
            else:
                for path in lines:
                    with open(path, 'r') as f:
                        lines = f.readlines()
                        for line in lines:
                            print_result(line, nlp, True, output)
        elif len(lines) > 0:
            for line in lines:
                print_result(line, nlp, True, output)
        else:
            while True:
                line = input()
                print_result(line, nlp, True, output)
    except EOFError:
        pass
    except KeyboardInterrupt:
        pass
    except Exception as e:
        print(e, file=sys.stderr)
        print('exception raised while analyzing the line:', line, file=sys.stderr)
    finally:
        output.close()
def test_require_gpu():
    try:
        import cupy  # noqa: F401
    except ImportError:
        with pytest.raises(ValueError):
            require_gpu()
def main():  # execute all functions within main to protect against multiprocessing infinite feedback loop

    if cpu_count() >= 8:  # to avoid overtaxing Brad, save some cores
        cpu = 8
    else:
        cpu = cpu_count()

    with open('../input/generated_meta_strings.pkl', "rb") as pkl:
        # dictionary with authors as keys and their strings as values
        auth_strings = pickle.load(pkl)

    with open('../input/alter_lists.pkl', "rb") as pkl:
        # dataframe with author column, alters column, and alters_2 column
        alter_lists = pickle.load(pkl)

    with open('../input/author_metadata.pkl', "rb") as pkl:
        # dictionary with author metadata (i.e. community membership)
        author_metadata = pickle.load(pkl)

    # create dataframe from dict (todo: just output a dataframe from community_strings instead)
    author_metadata = pd.DataFrame.from_dict(
        author_metadata,
        orient="index").reset_index().rename(columns={"index": "author"})

    auth_alt_dict = dict(zip(alter_lists.author, alter_lists.alter))  # dict of {auth: alter list}
    auth_alt_dict_2 = dict(zip(alter_lists.author, alter_lists.alter_2))  # dict of {auth: alter_2 list}
    auth_list = list(auth_strings.keys())  # list of author names

    auth_index = dict()  # pretty sure this isn't needed anymore
    for i, item in enumerate(auth_list):  # see above
        auth_index[item] = [i]

    abs_list = []  # list of author strings to process
    # NOTE: this is only safe because the auth_strings dict hasn't been modified. Should be modified for posterity
    for author in auth_strings:
        abs_list.append(auth_strings[author]["meta_string"])

    del auth_strings

    bigram_text = bigram_process(abs_list)  # find and concatenate bigrams in the author string list

    # load spacy model, disable unnecessary parser and named entity recog for performance
    spacy.require_gpu()  # comment out to not use GPU
    nlp = spacy.load('en', disable=['parser', 'ner'])
    #nlp.max_length = 10000000  # comment out if strings are very large and causing memory issues

    # send bigrammed text and spacy function + its required variables to multiprocess function for execution
    #processed_list = mp(bigram_text, spacy_process, cpu, nlp)  # comment out to use GPU instead of multiprocess function
    processed_list = spacy_process_gpu(bigram_text, nlp)  # comment out to not use GPU
    print('spacy_process_complete')

    vectorizer = TfidfVectorizer(max_df=0.5, min_df=3, stop_words='english', norm='l2')
    matrix = vectorizer.fit_transform(processed_list)  # Tfidf vectors for each author string
    auth_vectors = dict(zip(auth_list, matrix))  # create a dict of {author: tfidf vector}

    # create a dataframe by sending list of authors and the dissim function + its required variables to multiprocess function
    sim_df = pd.DataFrame.from_dict(
        mp(auth_list, dissim, cpu, auth_alt_dict, auth_alt_dict_2, auth_vectors))

    # populate all 3 df author average columns by sending avg_alter_dissim to the mp3 function, which returns 3 lists of results
    sim_df['alter_dissim_avg'], sim_df['ring_dissim_avg'], sim_df['bridge_dissim_avg'] = \
        pd.Series(mp3(auth_list, rba_dissim, cpu, auth_alt_dict, auth_vectors)).array

    sim_df.to_csv('../output/sim_scores.csv', index=False)