def __init__(
    self,
    file_path: str = None,
):
    if file_path is None:
        raise ValueError(
            "Do not use the default arguments to KnowledgeBase. "
            "Instead, use a subclass (e.g. UmlsKnowledgeBase) or pass a path to a kb."
        )

    if file_path.endswith("jsonl"):
        # One json object per line; a generator avoids loading everything at once.
        raw = (json.loads(line) for line in open(cached_path(file_path)))
    else:
        raw = json.load(open(cached_path(file_path)))

    alias_to_cuis: Dict[str, Set[str]] = defaultdict(set)
    self.cui_to_entity: Dict[str, Entity] = {}

    for concept in raw:
        # Index every alias (including the canonical name) back to its CUI.
        unique_aliases = set(concept["aliases"])
        unique_aliases.add(concept["canonical_name"])
        for alias in unique_aliases:
            alias_to_cuis[alias].add(concept["concept_id"])
        self.cui_to_entity[concept["concept_id"]] = Entity(**concept)

    # Freeze the defaultdict into a plain dict so missing aliases raise KeyError
    # instead of silently creating empty sets.
    self.alias_to_cuis: Dict[str, Set[str]] = {**alias_to_cuis}
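# Illustrative usage (a sketch; the jsonl path and alias are hypothetical, and
# Entity is assumed to expose the concept fields as attributes):
kb = KnowledgeBase(file_path="/path/to/concepts.jsonl")
for cui in kb.alias_to_cuis.get("aspirin", set()):
    print(cui, kb.cui_to_entity[cui].canonical_name)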
def load_approximate_nearest_neighbours_index(
        tfidf_vectors_path: str = DEFAULT_PATHS["tfidf_umls_vectors"],
        ann_index_path: str = DEFAULT_PATHS["ann_index"],
        ef_search: int = 200) -> FloatIndex:
    """
    Load an approximate nearest neighbours index from disk.

    Parameters
    ----------
    tfidf_vectors_path : str, required.
        The path to the tfidf vectors of the items in the index.
    ann_index_path : str, required.
        The path to the ann index.
    ef_search: int, optional (default = 200)
        Controls speed performance at query time. Max value is 2000,
        but reducing to around ~100 will increase query speed by an
        order of magnitude for a small performance hit.
    """
    uml_concept_alias_tfidfs = scipy.sparse.load_npz(
        cached_path(tfidf_vectors_path)).astype(numpy.float32)
    ann_index = nmslib.init(method='hnsw', space='cosinesimil_sparse',
                            data_type=nmslib.DataType.SPARSE_VECTOR)
    ann_index.addDataPointBatch(uml_concept_alias_tfidfs)
    ann_index.loadIndex(cached_path(ann_index_path))
    query_time_params = {'efSearch': ef_search}
    ann_index.setQueryTimeParams(query_time_params)

    return ann_index
def load_approximate_nearest_neighbours_index(
    linker_paths: LinkerPaths,
    ef_search: int = 200,
) -> FloatIndex:
    """
    Load an approximate nearest neighbours index from disk.

    Parameters
    ----------
    linker_paths: LinkerPaths, required.
        Contains the paths to the data required for the entity linker.
    ef_search: int, optional (default = 200)
        Controls speed performance at query time. Max value is 2000,
        but reducing to around ~100 will increase query speed by an
        order of magnitude for a small performance hit.
    """
    concept_alias_tfidfs = scipy.sparse.load_npz(
        cached_path(linker_paths.tfidf_vectors)
    ).astype(numpy.float32)
    ann_index = nmslib.init(
        method="hnsw",
        space="cosinesimil_sparse",
        data_type=nmslib.DataType.SPARSE_VECTOR,
    )
    ann_index.addDataPointBatch(concept_alias_tfidfs)
    ann_index.loadIndex(cached_path(linker_paths.ann_index))
    query_time_params = {"efSearch": ef_search}
    ann_index.setQueryTimeParams(query_time_params)

    return ann_index
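# Usage sketch: load the UMLS index via the bundled UmlsLinkerPaths and query
# it with a tf-idf vector for a mention string. `vectorizer` is assumed to be
# the TfidfVectorizer the index was built with (see CandidateGenerator below).
ann_index = load_approximate_nearest_neighbours_index(
    linker_paths=UmlsLinkerPaths, ef_search=100
)
query = vectorizer.transform(["myocardial infarction"]).astype(numpy.float32)
neighbour_ids, distances = ann_index.knnQueryBatch(query, k=10)[0]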
def init_model(lang, output_dir, freqs_loc=None, vectors_loc=None,
               no_expand_vectors=False, meta_overrides=None,
               prune_vectors=-1, min_word_frequency=50):
    """
    Create a new model from raw data, like word frequencies, Brown clusters
    and word vectors.
    """
    output_dir = ensure_path(output_dir)
    if vectors_loc is not None:
        vectors_loc = cached_path(vectors_loc)
        vectors_loc = ensure_path(vectors_loc)
    if freqs_loc is not None:
        freqs_loc = cached_path(freqs_loc)
        freqs_loc = ensure_path(freqs_loc)

    if freqs_loc is not None and not freqs_loc.exists():
        msg.fail("Can't find word frequencies file", freqs_loc, exits=1)
    probs, oov_prob = (
        read_freqs(freqs_loc, min_freq=min_word_frequency)
        if freqs_loc is not None
        else ({}, -20)
    )
    vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
    nlp = create_model(lang, probs, oov_prob, vectors_data, vector_keys,
                       not no_expand_vectors, prune_vectors)

    # Insert our custom tokenizer into the base model.
    # nlp.tokenizer = combined_rule_tokenizer(nlp)
    nlp.tokenizer = Tokenizer(nlp.vocab)

    if meta_overrides is not None:
        metadata = json.load(open(meta_overrides))
        nlp.meta.update(metadata)
        nlp.meta["version"] = VERSION

    if not output_dir.exists():
        os.makedirs(output_dir, exist_ok=True)
    nlp.to_disk(output_dir)
    return nlp
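# Illustrative call (all paths hypothetical): build a blank English model with
# word frequencies, pretrained vectors and custom metadata, written to disk.
nlp = init_model(
    "en",
    "output/base_model",
    freqs_loc="data/word_freqs.txt",
    vectors_loc="data/pubmed_vectors.txt.gz",
    meta_overrides="data/meta_overrides.json",
)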
def construct_umls_tree_from_tsv(filepath: str) -> UmlsSemanticTypeTree:
    """
    Reads in a tsv file which is formatted as a depth first traversal of
    a hierarchy tree, where nodes are of the format:

        Name TAB UMLS Semantic Type TAB Tree Depth

        Event                           T051    1
        Activity                        T052    2
        Behavior                        T053    3
        Social Behavior                 T054    4
        Individual Behavior             T055    4
        Daily or Recreational Activity  T056    3
    """
    node_stack: Deque[SemanticTypeNode] = deque()
    for line in open(cached_path(filepath), "r"):
        name, type_id, level = line.split("\t")
        name = name.strip()
        int_level = int(level.strip())
        node = SemanticTypeNode(type_id, name, [], int_level)
        node_stack.append(node)

    def attach_children(node: SemanticTypeNode, stack: Deque[SemanticTypeNode]):
        # Consume nodes from the front of the stack while they are deeper
        # than `node`; they are (transitively) its children.
        while stack and stack[0].level > node.level:
            popped = stack.popleft()
            attach_children(popped, stack)
            node.children.append(popped)

    first = node_stack.popleft()
    attach_children(first, node_stack)

    return UmlsSemanticTypeTree(first)
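# Sketch: build the tree from the bundled tsv and print it. The SemanticTypeNode
# fields (type_id, name, children, level) come from the parser above; the `root`
# attribute on UmlsSemanticTypeTree is assumed here for illustration — substitute
# whatever accessor your version exposes.
def print_type_tree(node: SemanticTypeNode, indent: int = 0) -> None:
    print(" " * indent + f"{node.name} ({node.type_id})")
    for child in node.children:
        print_type_tree(child, indent + 2)

tree = construct_umls_tree_from_tsv("data/umls_semantic_type_tree.tsv")
print_type_tree(tree.root)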
def __init__(self,
             ann_index: FloatIndex = None,
             tfidf_vectorizer: TfidfVectorizer = None,
             ann_concept_aliases_list: List[str] = None,
             umls: UmlsKnowledgeBase = None,
             verbose: bool = False,
             ef_search: int = 200) -> None:
    self.ann_index = ann_index or load_approximate_nearest_neighbours_index(
        ef_search=ef_search
    )
    self.vectorizer = tfidf_vectorizer or joblib.load(
        cached_path(DEFAULT_PATHS["tfidf_vectorizer"])
    )
    self.ann_concept_aliases_list = ann_concept_aliases_list or json.load(
        open(cached_path(DEFAULT_PATHS["concept_aliases_list"]))
    )
    self.umls = umls or UmlsKnowledgeBase()
    self.verbose = verbose
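# Illustrative usage: with no arguments, every component is loaded from
# DEFAULT_PATHS. The generator is used elsewhere as a callable over a batch of
# mention strings (shown under that assumption; verify the __call__ signature
# in your version):
generator = CandidateGenerator(ef_search=100)
candidates = generator(["diabetes mellitus"], 10)  # top-10 candidates per mention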
def init_umls_nlp_linker():
    base_dir = ''
    tfidf_path = base_dir + 'tfidf_vectors_sparse.npz'
    ann_path = base_dir + 'nmslib_index.bin'
    ann_index = load_approximate_nearest_neighbours_index(
        tfidf_vectors_path=tfidf_path, ann_index_path=ann_path)
    vec = joblib.load(cached_path(base_dir + 'tfidf_vectorizer.joblib'))
    ann_concept = json.load(
        open(cached_path(base_dir + 'concept_aliases.json')))
    umls_knowledge_base = UmlsKnowledgeBase(
        file_path=base_dir + 'umls_2017_aa_cat0129.json',
        types_file_path=base_dir + 'umls_semantic_type_tree.tsv')
    cg = CandidateGenerator(ann_index=ann_index,
                            tfidf_vectorizer=vec,
                            ann_concept_aliases_list=ann_concept,
                            umls=umls_knowledge_base)
    linker = UmlsEntityLinker(candidate_generator=cg,
                              max_entities_per_mention=1)
    # `nlp` is assumed to be a module-level spacy pipeline.
    nlp.add_pipe(linker)
    return linker
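# Downstream sketch: once the linker is in the pipeline, each entity carries
# candidate (CUI, score) pairs in the `_.umls_ents` extension (the attribute
# name used by scispacy's UmlsEntityLinker; verify in your version):
linker = init_umls_nlp_linker()
doc = nlp("The patient has a history of myocardial infarction.")
for ent in doc.ents:
    print(ent.text, ent._.umls_ents[:1])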
def read_full_med_mentions(directory_path: str,
                           label_mapping: Dict[str, str] = None,
                           span_only: bool = False):
    def _cleanup_dir(dir_path: str):
        if os.path.exists(dir_path):
            shutil.rmtree(dir_path)

    resolved_directory_path = cached_path(directory_path)
    if "tar.gz" in directory_path:
        # Extract dataset to temp dir
        tempdir = tempfile.mkdtemp()
        print(f"extracting dataset directory {resolved_directory_path} to temp dir {tempdir}")
        with tarfile.open(resolved_directory_path, 'r:gz') as archive:
            archive.extractall(tempdir)
        # Postpone cleanup until exit in case the unarchived
        # contents are needed outside this function.
        atexit.register(_cleanup_dir, tempdir)

        resolved_directory_path = tempdir

    expected_names = ["corpus_pubtator.txt",
                      "corpus_pubtator_pmids_all.txt",
                      "corpus_pubtator_pmids_dev.txt",
                      "corpus_pubtator_pmids_test.txt",
                      "corpus_pubtator_pmids_trng.txt"]

    corpus = os.path.join(resolved_directory_path, expected_names[0])
    examples = med_mentions_example_iterator(corpus)

    train_ids = {x.strip() for x in open(os.path.join(resolved_directory_path, expected_names[4]))}
    dev_ids = {x.strip() for x in open(os.path.join(resolved_directory_path, expected_names[2]))}
    test_ids = {x.strip() for x in open(os.path.join(resolved_directory_path, expected_names[3]))}

    train_examples = []
    dev_examples = []
    test_examples = []

    for example in examples:
        if example.pubmed_id in train_ids:
            train_examples.append(example)
        elif example.pubmed_id in dev_ids:
            dev_examples.append(example)
        elif example.pubmed_id in test_ids:
            test_examples.append(example)

    return train_examples, dev_examples, test_examples
def __init__(
    self,
    ann_index: FloatIndex = None,
    tfidf_vectorizer: TfidfVectorizer = None,
    ann_concept_aliases_list: List[str] = None,
    kb: KnowledgeBase = None,
    verbose: bool = False,
    ef_search: int = 200,
    name: str = None,
) -> None:
    if name is not None and any(
        [ann_index, tfidf_vectorizer, ann_concept_aliases_list, kb]
    ):
        raise ValueError(
            "You cannot pass both a name argument and other constructor arguments."
        )

    # Set the name to the default, after we have checked
    # the compatibility with the args above.
    if name is None:
        name = "umls"

    linker_paths = DEFAULT_PATHS.get(name, UmlsLinkerPaths)

    self.ann_index = ann_index or load_approximate_nearest_neighbours_index(
        linker_paths=linker_paths, ef_search=ef_search
    )
    self.vectorizer = tfidf_vectorizer or joblib.load(
        cached_path(linker_paths.tfidf_vectorizer)
    )
    self.ann_concept_aliases_list = ann_concept_aliases_list or json.load(
        open(cached_path(linker_paths.concept_aliases_list))
    )
    self.kb = kb or DEFAULT_KNOWLEDGE_BASES[name]()
    self.verbose = verbose

    # TODO(Mark): Remove in scispacy v1.0.
    self.umls = self.kb
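# Sketch of the two construction paths: pass a registered name (everything is
# resolved via DEFAULT_PATHS / DEFAULT_KNOWLEDGE_BASES), or pass prebuilt
# components — mixing the two raises the ValueError above. The kb path below
# is hypothetical.
umls_generator = CandidateGenerator(name="umls")
custom_generator = CandidateGenerator(kb=KnowledgeBase("/path/to/kb.jsonl"))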
def __init__(self,
             file_path: str = DEFAULT_UMLS_PATH,
             types_file_path: str = DEFAULT_UMLS_TYPES_PATH):
    raw = json.load(open(cached_path(file_path)))

    alias_to_cuis: Dict[str, Set[str]] = defaultdict(set)
    self.cui_to_entity: Dict[str, UmlsEntity] = {}

    for concept in raw:
        unique_aliases = set(concept["aliases"])
        unique_aliases.add(concept["canonical_name"])
        for alias in unique_aliases:
            alias_to_cuis[alias].add(concept["concept_id"])
        self.cui_to_entity[concept["concept_id"]] = UmlsEntity(**concept)

    self.alias_to_cuis: Dict[str, Set[str]] = {**alias_to_cuis}
    self.semantic_type_tree: UmlsSemanticTypeTree = construct_umls_tree_from_tsv(
        types_file_path)
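# Illustrative lookup (the CUI is hypothetical — substitute one present in
# your UMLS release; UmlsEntity is assumed to expose the concept fields as
# attributes):
umls = UmlsKnowledgeBase()
entity = umls.cui_to_entity["C0020538"]
print(entity.canonical_name)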
def read_ner_from_tsv(filename: str) -> List[SpacyNerExample]:
    """
    Reads BIO formatted NER data from a TSV file, such as the
    NER data found here:
    https://github.com/cambridgeltl/MTL-Bioinformatics-2016

    Data is expected to be 2 tab separated tokens per line, with
    sentences denoted by empty lines. Sentences read by this
    function will be already tokenized, but returned as a string,
    as this is the format required by spaCy. Consider using the
    WhitespaceTokenizer (scispacy/util.py) to split this data
    with a spaCy model.

    Parameters
    ----------
    filename : str
        The path to the tsv data.

    Returns
    -------
    spacy_format_data : List[SpacyNerExample]
        The BIO tagged NER examples.
    """
    spacy_format_data = []
    examples: List[Tuple[str, str]] = []
    for line in open(cached_path(filename)):
        line = line.strip()
        if line.startswith("-DOCSTART-"):
            continue
        # We have reached the end of a sentence.
        if not line:
            if not examples:
                continue
            spacy_format_data.append(_handle_sentence(examples))
            examples = []
        else:
            word, entity = line.split("\t")
            examples.append((word, entity))
    if examples:
        spacy_format_data.append(_handle_sentence(examples))

    return spacy_format_data
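# Sketch: given a BIO tsv such as
#
#   Regulation    O
#   of            O
#   p53           B-GENE
#
# (token and tag tab separated, blank line between sentences), load it and
# inspect the first example. SpacyNerExample is whatever _handle_sentence
# produces — treated here as the usual (text, annotations) pair:
examples = read_ner_from_tsv("train.tsv")
text, annotations = examples[0]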
def main(input_path: str, output_path: str, min_word_frequency: int):
    if input_path is not None:
        input_path = cached_path(input_path)
        input_path = ensure_path(input_path)
    probs, oov_prob = (
        read_freqs(input_path, min_freq=min_word_frequency)
        if input_path is not None
        else ({}, -20)
    )

    with open(output_path, "w") as _jsonl_file:
        # NOTE: the header writes a fixed oov_prob; the oov_prob computed
        # above is not used here.
        _jsonl_file.write(
            json.dumps({"lang": "en", "settings": {"oov_prob": -20.502029418945312}})
        )
        _jsonl_file.write("\n")

        for word, prob in probs.items():
            _jsonl_file.write(json.dumps({"orth": word, "prob": prob}))
            _jsonl_file.write("\n")
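# The resulting jsonl has one settings header followed by one entry per word,
# e.g. (probabilities illustrative):
# {"lang": "en", "settings": {"oov_prob": -20.502029418945312}}
# {"orth": "the", "prob": -3.5}
# {"orth": "protein", "prob": -8.2}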
def train_parser_and_tagger(train_json_path: str,
                            dev_json_path: str,
                            test_json_path: str,
                            model_output_dir: str,
                            model_path: str = None,
                            ontonotes_path: str = None,
                            ontonotes_train_percent: float = 0.0):
    """Function to train the spacy parser and tagger from a blank model,
    with the default, en_core_web_sm vocab.
    Training setup is mostly copied from the spacy cli train command.

    @param train_json_path: path to the conll formatted training data
    @param dev_json_path: path to the conll formatted dev data
    @param test_json_path: path to the conll formatted test data
    @param model_output_dir: path to the output directory for the trained models
    @param model_path: path to the model to load
    @param ontonotes_path: path to the directory containing ontonotes in spacy format (optional)
    @param ontonotes_train_percent: percentage of the ontonotes training data to use (optional)
    """
    msg = Printer()

    train_json_path = cached_path(train_json_path)
    dev_json_path = cached_path(dev_json_path)
    test_json_path = cached_path(test_json_path)

    if model_path is not None:
        nlp = spacy.load(model_path)
    else:
        lang_class = util.get_lang_class('en')
        nlp = lang_class()

    if 'tagger' not in nlp.pipe_names:
        tagger = nlp.create_pipe('tagger')
        nlp.add_pipe(tagger, first=True)
    else:
        tagger = nlp.get_pipe('tagger')

    if 'parser' not in nlp.pipe_names:
        parser = nlp.create_pipe('parser')
        nlp.add_pipe(parser)
    else:
        parser = nlp.get_pipe('parser')

    train_corpus = GoldCorpus(train_json_path, dev_json_path)
    test_corpus = GoldCorpus(train_json_path, test_json_path)

    if ontonotes_path:
        onto_train_path = os.path.join(ontonotes_path, "train")
        onto_dev_path = os.path.join(ontonotes_path, "dev")
        onto_test_path = os.path.join(ontonotes_path, "test")
        onto_train_corpus = GoldCorpus(onto_train_path, onto_dev_path)
        onto_test_corpus = GoldCorpus(onto_train_path, onto_test_path)

    dropout_rates = util.decaying(0.2, 0.2, 0.0)
    batch_sizes = util.compounding(1., 16., 1.001)

    if model_path is not None:
        meta = nlp.meta
    else:
        meta = {}
        meta["lang"] = "en"
        meta["pipeline"] = ["tagger", "parser"]
        meta["name"] = "scispacy_core_web_sm"
        meta["license"] = "CC BY-SA 3.0"
        meta["author"] = "Allen Institute for Artificial Intelligence"
        meta["url"] = "allenai.org"
        meta["sources"] = ["OntoNotes 5", "Common Crawl", "GENIA 1.0"]
        meta["version"] = "1.0.0"
        meta["spacy_version"] = ">=2.2.1"
        meta["parent_package"] = "spacy"
        meta["email"] = "*****@*****.**"

    n_train_words = train_corpus.count_train()

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in ['tagger', 'parser']]
    with nlp.disable_pipes(*other_pipes):
        if ontonotes_path:
            optimizer = nlp.begin_training(lambda: itertools.chain(
                train_corpus.train_tuples, onto_train_corpus.train_tuples))
        else:
            optimizer = nlp.begin_training(lambda: train_corpus.train_tuples)

    nlp._optimizer = None

    train_docs = train_corpus.train_docs(nlp)
    train_docs = list(train_docs)

    train_mixture = train_docs
    if ontonotes_path:
        onto_train_docs = onto_train_corpus.train_docs(nlp)
        onto_train_docs = list(onto_train_docs)
        num_onto_docs = int(float(ontonotes_train_percent) * len(onto_train_docs))
        randomly_sampled_onto = random.sample(onto_train_docs, num_onto_docs)
        train_mixture += randomly_sampled_onto

    row_head, output_stats = _configure_training_output(nlp.pipe_names, -1, False)
    row_widths = [len(w) for w in row_head]
    row_settings = {"widths": row_widths,
                    "aligns": tuple(["r" for i in row_head]),
                    "spacing": 2}

    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)

    best_epoch = 0
    best_epoch_uas = 0.0
    for i in range(20):
        random.shuffle(train_mixture)
        with nlp.disable_pipes(*other_pipes):
            with tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                # NOTE: minibatches are drawn from train_docs, even though
                # train_mixture (which may include ontonotes) is shuffled above.
                minibatches = list(util.minibatch(train_docs, size=batch_sizes))
                for batch in minibatches:
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer,
                               drop=next(dropout_rates), losses=losses)
                    pbar.update(sum(len(doc) for doc in docs))

        # save intermediate model and output results on the dev set
        with nlp.use_params(optimizer.averages):
            epoch_model_path = os.path.join(model_output_dir, "model" + str(i))
            os.makedirs(epoch_model_path, exist_ok=True)
            nlp.to_disk(epoch_model_path)
            with open(os.path.join(model_output_dir, "model" + str(i), "meta.json"), "w") as meta_fp:
                meta_fp.write(json.dumps(meta))

            nlp_loaded = util.load_model_from_path(epoch_model_path)
            dev_docs = train_corpus.dev_docs(nlp_loaded)
            dev_docs = list(dev_docs)
            nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
            start_time = timer()
            scorer = nlp_loaded.evaluate(dev_docs)
            end_time = timer()
            gpu_wps = None
            cpu_wps = nwords / (end_time - start_time)

            if ontonotes_path:
                onto_dev_docs = list(onto_train_corpus.dev_docs(nlp_loaded))
                onto_scorer = nlp_loaded.evaluate(onto_dev_docs)

        if scorer.scores["uas"] > best_epoch_uas:
            best_epoch_uas = scorer.scores["uas"]
            best_epoch = i

        progress = _get_progress(
            i, losses, scorer.scores, output_stats, cpu_wps=cpu_wps, gpu_wps=gpu_wps
        )
        msg.row(progress, **row_settings)

        if ontonotes_path:
            progress = _get_progress(
                i, losses, onto_scorer.scores, output_stats, cpu_wps=cpu_wps, gpu_wps=gpu_wps
            )
            msg.row(progress, **row_settings)

    # save final model and output results on the test set
    final_model_path = os.path.join(model_output_dir, "best")
    if os.path.exists(final_model_path):
        shutil.rmtree(final_model_path)
    shutil.copytree(os.path.join(model_output_dir, "model" + str(best_epoch)),
                    final_model_path)

    nlp_loaded = util.load_model_from_path(final_model_path)
    start_time = timer()
    test_docs = test_corpus.dev_docs(nlp_loaded)
    test_docs = list(test_docs)
    nwords = sum(len(doc_gold[0]) for doc_gold in test_docs)
    scorer = nlp_loaded.evaluate(test_docs)
    end_time = timer()
    gpu_wps = None
    cpu_wps = nwords / (end_time - start_time)
    meta["speed"] = {"gpu": None, "nwords": nwords, "cpu": cpu_wps}

    print("Retrained genia evaluation")
    print("Test results:")
    print("UAS:", scorer.uas)
    print("LAS:", scorer.las)
    print("Tag %:", scorer.tags_acc)
    print("Token acc:", scorer.token_acc)

    with open(os.path.join(model_output_dir, "genia_test.json"), "w+") as metric_file:
        json.dump(scorer.scores, metric_file)
    with open(os.path.join(model_output_dir, "best", "meta.json"), "w") as meta_fp:
        meta_fp.write(json.dumps(meta))

    if ontonotes_path:
        onto_test_docs = list(onto_test_corpus.dev_docs(nlp_loaded))
        print("Retrained ontonotes evaluation")
        scorer_onto_retrained = nlp_loaded.evaluate(onto_test_docs)
        print("Test results:")
        print("UAS:", scorer_onto_retrained.uas)
        print("LAS:", scorer_onto_retrained.las)
        print("Tag %:", scorer_onto_retrained.tags_acc)
        print("Token acc:", scorer_onto_retrained.token_acc)

        with open(os.path.join(model_output_dir, "ontonotes_test.json"), "w+") as metric_file:
            json.dump(scorer_onto_retrained.scores, metric_file)
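# Example invocation (paths hypothetical):
train_parser_and_tagger(
    train_json_path="data/genia_train.json",
    dev_json_path="data/genia_dev.json",
    test_json_path="data/genia_test.json",
    model_output_dir="models/parser_tagger",
)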
def read_full_med_mentions(
    directory_path: str,
    label_mapping: Dict[str, str] = None,
    span_only: bool = False,
    spacy_format: bool = True,
):
    def _cleanup_dir(dir_path: str):
        if os.path.exists(dir_path):
            shutil.rmtree(dir_path)

    resolved_directory_path = cached_path(directory_path)
    if "tar.gz" in directory_path:
        # Extract dataset to temp dir
        tempdir = tempfile.mkdtemp()
        print(
            f"extracting dataset directory {resolved_directory_path} to temp dir {tempdir}"
        )
        with tarfile.open(resolved_directory_path, "r:gz") as archive:
            archive.extractall(tempdir)
        # Postpone cleanup until exit in case the unarchived
        # contents are needed outside this function.
        atexit.register(_cleanup_dir, tempdir)

        resolved_directory_path = tempdir

    expected_names = [
        "corpus_pubtator.txt",
        "corpus_pubtator_pmids_all.txt",
        "corpus_pubtator_pmids_dev.txt",
        "corpus_pubtator_pmids_test.txt",
        "corpus_pubtator_pmids_trng.txt",
    ]

    corpus = os.path.join(resolved_directory_path, expected_names[0])
    examples = med_mentions_example_iterator(corpus)

    train_ids = {
        x.strip()
        for x in open(os.path.join(resolved_directory_path, expected_names[4]))
    }
    dev_ids = {
        x.strip()
        for x in open(os.path.join(resolved_directory_path, expected_names[2]))
    }
    test_ids = {
        x.strip()
        for x in open(os.path.join(resolved_directory_path, expected_names[3]))
    }

    train_examples = []
    dev_examples = []
    test_examples = []

    def label_function(label):
        if span_only:
            return "ENTITY"
        if label_mapping is None:
            return label
        else:
            return label_mapping[label]

    for example in examples:
        spacy_format_entities = [
            (x.start, x.end, label_function(x.mention_type))
            for x in example.entities
        ]
        spacy_format_entities = remove_overlapping_entities(
            sorted(spacy_format_entities, key=lambda x: x[0])
        )
        spacy_example = (example.text, {"entities": spacy_format_entities})
        if example.pubmed_id in train_ids:
            train_examples.append(spacy_example if spacy_format else example)
        elif example.pubmed_id in dev_ids:
            dev_examples.append(spacy_example if spacy_format else example)
        elif example.pubmed_id in test_ids:
            test_examples.append(spacy_example if spacy_format else example)

    return train_examples, dev_examples, test_examples
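# Sketch: load MedMentions from its tar.gz release and collapse every label to
# a single ENTITY class for span-only training (path hypothetical). With
# spacy_format=True each example is a (text, {"entities": [...]}) pair:
train, dev, test = read_full_med_mentions("data/med_mentions.tar.gz", span_only=True)
text, annotations = train[0]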