def load_final_pipe():
    import string
    import en_core_sci_lg
    import en_core_web_md
    from collections import Counter
    from tqdm import tqdm

    en_core_sci_lg.load()
    en_core_web_md.load()
    return load_object(FINAL_PIPE_FILE_PATH)
def __init__(self, model_size='en_core_web_md'):
    try:
        self.nlp = spacy.load(model_size)
    except OSError:  # model package not installed/linked under that name
        self.nlp = en_core_web_md.load()
    finally:
        self.nlp.tokenizer = self.custom_tokenizer(self.nlp)
def entity_analyzer(my_text):
    nlp = en_core_web_md.load()
    docx = nlp(my_text)
    tokens = [token.text for token in docx]
    entities = [(entity.text, entity.label_) for entity in docx.ents]
    allData = ['"Token":{},\n"Entities":{}'.format(tokens, entities)]
    return allData
def text_analyzer(my_text):
    nlp = en_core_web_md.load()
    docx = nlp(my_text)
    allData = ['"Token":{},\n"Lemma":{}'.format(token.text, token.lemma_) for token in docx]
    return allData
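# Usage sketch for the two analyzers above (illustrative only; assumes
# en_core_web_md is installed and imported at module level):
sample = "Apple is looking at buying a U.K. startup for $1 billion."
print(entity_analyzer(sample))  # token list plus (text, label) entity pairs
print(text_analyzer(sample))    # one "Token/Lemma" string per token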
def do_eval(test_data_path, shuffle=False):
    if FLAGS.load_model is None:
        raise ValueError("You need to specify the model location by --load_model=[location]")

    # Load Testing Data
    question_1, question_2, labels = get_input_from_csv(test_data_path)
    if shuffle:
        question_1, question_2, labels = shuffle_data(question_1, question_2, labels)

    # Load Pre-trained Model
    if FLAGS.best_glove:
        import en_core_web_md
        nlp = en_core_web_md.load()  # load best-matching version for Glove
    else:
        nlp = spacy.load('en')
    embedding_matrix = load_glove_embeddings(nlp.vocab, n_unknown=FLAGS.num_unknown)  # shape=(1071074, 300)

    tf.logging.info('Build model ...')
    esim = ESIM(embedding_matrix, FLAGS.max_length, FLAGS.num_hidden, FLAGS.num_classes,
                FLAGS.keep_prob, FLAGS.learning_rate)
    if FLAGS.load_model:
        model = esim.build_model(FLAGS.load_model)
    else:
        raise ValueError("You need to specify the model location by --load_model=[location]")

    # Convert the "raw data" to word-ids format && convert "labels" to one-hot vectors
    q1_test, q2_test = convert_questions_to_word_ids(question_1, question_2, nlp,
                                                     max_length=FLAGS.max_length,
                                                     tree_truncate=FLAGS.tree_truncate)
    labels = to_categorical(np.asarray(labels, dtype='int32'))

    scores = model.evaluate([q1_test, q2_test], labels, batch_size=FLAGS.batch_size, verbose=1)
    print("=================== RESULTS =====================")
    print("[*] LOSS OF TEST DATA: %.4f" % scores[0])
    print("[*] ACCURACY OF TEST DATA: %.4f" % scores[1])
def test_predict(self):
    nlp = en_core_web_md.load()
    classifier = TextClassifier(nlp.vocab, n_classes=2)
    text = "Sample text to test the classifier"
    output = classifier.predict(text)
    self.assertTrue(isinstance(output.data, torch.Tensor))
def test_get_ids(self):
    nlp = en_core_web_md.load()
    classifier = TextClassifier(nlp.vocab, n_classes=2)
    text = "Sample text to test the classifier"
    ids = classifier.get_ids(text)
    self.assertTrue(isinstance(ids, list))
def get_spacy_model():
    global _spacy_cache
    if _spacy_cache is None:
        print("Initializing spaCy model...")
        _spacy_cache = en_core_web_md.load()
    return _spacy_cache
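# Usage sketch for get_spacy_model (assumes the module-level `_spacy_cache = None`
# and the `import en_core_web_md` the function relies on). The first call loads
# the model; later calls reuse the cached instance.
nlp_first = get_spacy_model()
nlp_second = get_spacy_model()
assert nlp_first is nlp_second  # the model is loaded only once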
def __init__(self, dictionary_loader: DictionaryLoader, language="en",
             filters: List[AnnotationFilter] = None):
    """
    Parameters
    ----------
    dictionary_loader: DictionaryLoader
        The dictionary loader that will provide the dictionary contents
    language: str
        The language of the text that will be processed (affects the choice of
        tokenizer and stemmer).
    filters: List[AnnotationFilter]
        A list of filters to apply post-recognition
    """
    super().__init__(dictionary_loader, language=language, filters=filters)
    self.punctuation_remove = regex.compile(r'[\p{C}\p{M}\p{P}\p{S}\p{Z}]+', regex.UNICODE)
    self.label_concept_index = {}
    self.label_token_counts = {}
    self.label_lengths = {}
    self.trie = Trie()
    if language == 'en':
        import en_core_web_md
        self.spacy = en_core_web_md.load()
    elif language == 'fr':
        import fr_core_news_md
        self.spacy = fr_core_news_md.load()
    else:
        raise ValueError(f"Unsupported language: {language}")
def train(train_data, val_data, batch_size, n_epochs, save_dir=None):
    # Stage 1: Read training data (csv) && preprocess it
    tf.logging.info('Loading training and validation data ...')
    train_question_1, train_question_2, train_labels = get_input_from_csv(train_data)
    # val_question_1, val_question_2, val_labels = get_input_from_csv(val_data)

    # Stage 2: Load pre-trained embedding matrix (using GloVe here)
    tf.logging.info('Loading pre-trained embedding matrix ...')
    if FLAGS.best_glove:
        import en_core_web_md
        nlp = en_core_web_md.load()  # load best-matching version for GloVe
    else:
        nlp = spacy.load('en')
    embedding_matrix = load_glove_embeddings(nlp.vocab, n_unknown=FLAGS.num_unknown)  # shape=(1071074, 300)

    # Stage 3: Build model
    tf.logging.info('Build model ...')
    esim = ESIM(embedding_matrix, FLAGS.max_length, FLAGS.num_hidden, FLAGS.num_classes,
                FLAGS.keep_prob, FLAGS.learning_rate)
    if FLAGS.load_model:
        model = esim.build_model(FLAGS.load_model)
    else:
        model = esim.build_model()

    # Stage 4: Convert the "raw data" to word-ids format && convert "labels" to one-hot vectors
    tf.logging.info('Converting questions into ids ...')
    q1_train, q2_train = convert_questions_to_word_ids(train_question_1, train_question_2, nlp,
                                                       max_length=FLAGS.max_length,
                                                       tree_truncate=FLAGS.tree_truncate)
    train_labels = to_categorical(np.asarray(train_labels, dtype='int32'))
    # q1_val, q2_val = convert_questions_to_word_ids(val_question_1, val_question_2, nlp, max_length=FLAGS.max_length, tree_truncate=FLAGS.tree_truncate)
    # val_labels = to_categorical(np.asarray(val_labels, dtype='int32'))

    # Stage 5: Training
    tf.logging.info('Start training ...')
    callbacks = []
    save_dir = save_dir if save_dir is not None else 'checkpoints'
    filepath = os.path.join(save_dir, "weights-{epoch:02d}-{val_acc:.2f}.hdf5")
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    callbacks.append(checkpoint)

    if FLAGS.tensorboard:
        graph_dir = os.path.join('.', 'GRAPHs')
        if not os.path.exists(graph_dir):
            os.makedirs(graph_dir)
        tb = TensorBoard(log_dir=graph_dir, histogram_freq=0, write_graph=True, write_images=True)
        callbacks.append(tb)

    model.fit(
        x=[q1_train, q2_train],
        y=train_labels,
        batch_size=batch_size,
        epochs=n_epochs,
        # validation_data=([q1_val, q2_val], val_labels),
        validation_split=0.2,
        callbacks=callbacks,
        shuffle=True,
        verbose=FLAGS.verbose
    )
def __init__(self):
    ifninstall(self.__class__.md_pkg)
    print('initiating NLP language pipeline...', end='', flush=True)
    import en_core_web_md
    self._nlp = en_core_web_md.load()
    print('done')
    self._corpus = None
def _get_spacy_wmd_model():
    global _spacy_wmd_cache
    if _spacy_wmd_cache is None:
        _spacy_wmd_cache = en_core_web_md.load()
        _spacy_wmd_cache.add_pipe(
            wmd.WMD.SpacySimilarityHook(_spacy_wmd_cache), last=True)
    return _spacy_wmd_cache
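# Usage sketch for _get_spacy_wmd_model (spaCy 2.x / wmd-relax style API, as the
# add_pipe call above implies; the SpacySimilarityHook overrides Doc.similarity):
nlp = _get_spacy_wmd_model()
doc_a = nlp("The weather is nice today.")
doc_b = nlp("It is sunny outside.")
print(doc_a.similarity(doc_b))  # Word Mover's Distance-based similarity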
def __init__(self):
    # init model; fall back to the small model if the medium one is unavailable
    try:
        self.model = en_core_web_md.load()
        print("Set SpaCy model to medium model")
    except Exception:
        self.model = en_core_web_sm.load()
        print("Set SpaCy model to small model")
def __init__(self, lang):
    if lang == LANG.EN:
        self.nlp = en_core_web_md.load()
    else:
        self.nlp = de_core_news_sm.load()
    self.stanford_ner = StanfordNERTagger(model, '../models/stanford-ner.jar', encoding='utf-8')
def embbeding(self):
    pretrained_emb = []
    spacy_tool = en_core_web_md.load()
    pretrained_emb.append(spacy_tool('PAD').vector)
    pretrained_emb.append(spacy_tool('UNK').vector)
    pretrained_emb.append(spacy_tool('CLS').vector)
    for i in range(len(self.dictionary.idx2word)):
        word = self.dictionary.idx2word[i]
        pretrained_emb.append(spacy_tool(word).vector)
    return np.array(pretrained_emb)
def create_corpus(text_list, target_path):
    nlp = en_core_web_md.load(disable=['tagger', 'ner', 'textcat'])
    with open(target_path, 'w') as f:
        # Split sentences for each document
        for text in tqdm(text_list):
            text_lines = [str(sent) for i, sent in enumerate(nlp(str(text)).sents)]
            f.write('\n'.join(text_lines))
            f.write("\n \n")
def en_lang(cls):
    me_list = ['i', 'my']
    embeddings_model = FlairEmbeddingModels().en_lang()
    nlp = en_core_web_md.load()
    relationship_list = [
        'father', 'mother', 'sister', 'brother', 'son', 'daughter', 'husband', 'wife',
        'grandson', 'granddaughter', 'grandmother', 'grandfather', 'uncle', 'aunt', 'friend'
    ]
    return cls(me_list, embeddings_model, nlp, relationship_list)
def __init__(self, n_classes, arch="cnn", vocab=None, hparams=None):
    """
    Constructor for TextClassifier.

    Args:
        n_classes (int): Number of output classes
        arch (string/pl.LightningModule): The underlying text classifier model architecture
        vocab (spacy.vocab.Vocab): spaCy vocabulary
        hparams (dict): Dictionary of hyperparameters for the underlying model

    Returns:
        None
    """
    # avoid mutating a shared default dict across instances
    hparams = hparams if hparams is not None else {}

    if isinstance(vocab, Vocab):
        self._vocab = vocab
    else:
        self._vocab = en_core_web_md.load().vocab
    self._vocab.set_vector("@pad@", vector=np.zeros(self.vocab.vectors.shape[1]))
    self._vocab.set_vector("@oov@", vector=np.zeros(self.vocab.vectors.shape[1]))
    oov_orth = self._vocab["@oov@"].orth
    self.oov_id = self._vocab.vectors.key2row[oov_orth]
    self.tokenizer = Tokenizer(self.vocab)

    input_dim, embedding_dim = self.vocab.vectors.shape
    output_dim = n_classes
    pad_idx = self.vocab.vectors.key2row[self.vocab["@pad@"].orth]
    hparams["pad_idx"] = pad_idx
    hparams["input_dim"] = input_dim
    hparams["embedding_dim"] = embedding_dim

    if isinstance(arch, BaseModel):
        self._model = arch
    elif isinstance(arch, str):
        if arch == "cnn":
            self._model = CNN2D(output_dim, hparams)
        elif arch == "bilstm":
            self._model = BiLSTM(output_dim, hparams)
        else:
            print("No such architecture exists")
    else:
        print("arch should be a string or a BaseModel instance")
def spacy_keyList(q):
    nlp = en_core_web_md.load()
    doc = nlp(q)
    # for w in doc:
    #     print(f'{w.text:15s}[{w.tag_:5s}|{w.lemma_:20s}|{w.pos_:6s}]')
    # for ent in doc.ents:
    #     print(f'{ent.text:15s}[{ent.label_}]')
    pattern1 = [{'ENT_TYPE': 'ORG', "OP": "+"}]
    pattern2 = [{'ENT_TYPE': 'PERSON', "OP": "+"}]
    pattern3 = [{'ENT_TYPE': 'FAC', "OP": "+"}]
    pattern4 = [{'ENT_TYPE': 'LOC', "OP": "+"}]
    pattern5 = [{'ENT_TYPE': 'GPE', "OP": "+"}]
    pattern6 = [{'ENT_TYPE': 'CARDINAL', "OP": "*"},
                {'ENT_TYPE': 'LOC', "OP": "+"},
                {"IS_PUNCT": True},
                {'ENT_TYPE': 'GPE', "OP": "+"},
                {"IS_PUNCT": True},
                {'ENT_TYPE': 'GPE', "OP": "+"}]
    pattern7 = [{'ENT_TYPE': 'FAC', "OP": "+"},
                {"IS_LOWER": True},
                {'ENT_TYPE': 'CARDINAL', "OP": "?"},
                {'ENT_TYPE': 'GPE', "OP": "+"}]
    matcher = Matcher(nlp.vocab)
    matcher.add("name1", None, pattern1, pattern2, pattern3, pattern4, pattern5, pattern6, pattern7)
    matches = matcher(doc)
    keyList = []
    for m_id, s, e in matches:
        keyword = doc[s:e].text
        if len(keyword) > 1 and len(doc[s:e].text.split()) > 1:
            keyList.append(keyword)
    return keyList
def __init__(self, title, path, verbose=False, testing=0):
    self.verbose = verbose
    self.testing = testing
    self.home = path
    self.title = title
    self.date = self.get_date()
    self.today = date.today()
    self.pdf_path = os.path.join(self.home, self.title)
    self.img_path = os.path.join(self.pdf_path + 'convertedimages')
    self.words = []
    self.num_pages = self.get_pdf_images()
    self.text = self.get_text()
    self.nlp = en_core_web_md.load()
    self.doc = self.nlp(self.text)
def get_entities(self, document: str, language: str = 'english'):
    '''
    Takes a document and returns a list of extracted entities
    '''
    if language == 'spanish':
        try:
            import es_core_news_md
            logger.info("Success importing es_core_news_md")
        except ImportError:
            logger.error("Error importing es_core_news_md")
            sys.exit(-1)
    else:
        try:
            import en_core_web_md
            logger.info("Success importing en_core_web_md")
        except ImportError:
            logger.error("Error importing en_core_web_md")
            sys.exit(-1)

    if isinstance(document, List):
        document = " ".join(document)

    # if the language given is not the name of a spaCy parser, try to convert it to one
    parser_name = language if language in LANG_TO_PARSER.values() else LANG_TO_PARSER.get(language.lower())
    if not parser_name:
        raise Exception('language not supported')

    # if the requested parser is not already in memory, try to load it from spaCy
    if parser_name not in PARSERS:
        try:
            logger.info("Trying to load parser.")
            if language == 'spanish':
                PARSERS[parser_name] = es_core_news_md.load()
            else:
                PARSERS[parser_name] = en_core_web_md.load()
            # PARSERS[parser_name] = spacy.load(parser_name)
        except Exception:
            logger.exception("Error loading parser")
            sys.exit(-1)
    else:
        logger.info("Found parser %s in memory" % parser_name)

    def get_ents(doc):
        '''
        prep, parse, then extract entities from doc text
        '''
        doc = prep_text(doc)  # preprocess string
        doc = PARSERS[parser_name](doc)  # parse prepped doc
        ents = set(ent.text for ent in doc.ents if not self.filter_entity(ent))  # extract entities
        return list(ents)

    return get_ents(document)
def __init__(self, name="", motorcycles=None, closest_track=None, **kwargs):
    print("Loading in NLP models... (this could take a minute or two)")
    self.nlp = en_core_web_md.load()
    self.sbert_model = SentenceTransformer('stsb-roberta-base')
    # For NER (at least for detecting a person's name) the small model seems to
    # perform a lot better; the medium model wouldn't detect my name
    self.ner = en_core_web_sm.load()
    # user data
    self.name = name
    self.motorcycles = motorcycles if motorcycles is not None else []
    self.closest_track = closest_track
    self.conn = sqlite3.connect("knowledge_base.db").cursor()
    self.motorcycle_finder = MotorcycleFinder(ner=self.ner)
    self.new_user = False if name else True  # will eventually be something like True if name else False
def tokenize_string_details(s):
    global nlp
    if nlp is None:
        # Lazily load the model here: loading it elsewhere caused errors about
        # en_models.vector not being loadable. This way the model is loaded at
        # most once per executor. Not ideal, but it will do for this use case.
        nlp = en_core_web_md.load()
    doc = nlp(s)
    tokens = []
    for token in doc:
        if token.ent_type_ in ['GPE', 'LOC', 'DATE', 'TIME']:
            tokens.append(f'-{token.ent_type_}-')
        elif token.like_url:
            tokens.append('-URL-')
        else:
            tokens.append(token.lower_)
    return tokens
def main():
    logging.getLogger().setLevel('INFO')
    nlp = en_core_web_md.load()
    split = 'val'
    sent = sentences(json.load(open(coco_path + '/dataset.json')), split=split)
    writer = csv.writer(open(data_path + '/depparse_coco_val.csv', "w"))
    writer.writerow(["sentid", "position", "word", "postag", "postag_c", "dep", "head"])
    for sent_i in sent:
        # logging.info("Parsing: {} {}".format(sent_i['sentid'], ' '.join(sent_i['tokens'])))
        logging.info("Parsing: {} {}".format(sent_i['sentid'], sent_i['raw']))
        postag, postag_c, label, head = parse(nlp, sent_i['tokens'])
        for i in range(len(sent_i['tokens'])):
            writer.writerow([
                sent_i['sentid'], i, sent_i['tokens'][i], postag[i], postag_c[i], label[i], head[i]
            ])
def get_spacy_tokenizer(default_lingo, supported_languages, bigmodel_required):
    '''returns the spaCy nlp function corresponding to the language of a document'''
    if default_lingo in supported_languages:
        if not bigmodel_required:
            if default_lingo == "German":
                import de_core_news_sm
                nlp = de_core_news_sm.load()
            elif default_lingo == "English":
                import en_core_web_sm
                nlp = en_core_web_sm.load()
            elif default_lingo == "Spanish":
                import es_core_news_sm
                nlp = es_core_news_sm.load()
            elif default_lingo == "French":
                import fr_core_news_sm
                nlp = fr_core_news_sm.load()
            elif default_lingo == "Portuguese":
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                import it_core_news_sm
                nlp = it_core_news_sm.load()
        else:
            if default_lingo == "German":
                import de_core_news_md
                nlp = de_core_news_md.load()
            elif default_lingo == "English":
                import en_core_web_md
                nlp = en_core_web_md.load()
            elif default_lingo == "Spanish":
                import es_core_news_md
                nlp = es_core_news_md.load()
            elif default_lingo == "French":
                import fr_core_news_md
                nlp = fr_core_news_md.load()
            elif default_lingo == "Portuguese":
                # there is no pt_md model
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                # there is no it_md model
                import it_core_news_sm
                nlp = it_core_news_sm.load()
    else:
        print("NOT A SUPPORTED LANGUAGE!")
        return None
    return nlp
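# Usage sketch for get_spacy_tokenizer. The supported-language list below is an
# illustrative assumption; the real caller may pass a different list.
SUPPORTED = ["German", "English", "Spanish", "French", "Portuguese", "Italian"]
nlp = get_spacy_tokenizer("English", SUPPORTED, bigmodel_required=True)
doc = nlp("Berlin is the capital of Germany.")
print([(token.text, token.pos_) for token in doc])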
def en_lang(cls):
    nlp = en_core_web_md.load()
    embeddings_model = FlairEmbeddingModels().en_lang()
    # PP: e.g. 'I have a son', 'I have a smaller brother', 'I have a 9 year old son'
    # NP: e.g. 'My (little) sister'
    grammar = r"""
        PP: {<PRON><VERB><NUM>?<DET>?<ADJ>?<NOUN>}
        NP: {<ADJ><ADJ>?<NOUN>}
        REL: {<PP>|<NP>}"""
    relationship_list = [
        'father', 'mother', 'sister', 'brother', 'son', 'daughter', 'husband', 'wife',
        'grandson', 'granddaughter', 'grandmother', 'grandfather', 'uncle', 'aunt', 'friend'
    ]
    me_list = ['i', 'my', 'me']
    return cls(nlp, grammar, relationship_list, me_list, embeddings_model)
def process_page(row):
    nlp = en_core_web_md.load()
    warc_record = WarcRecord(row)
    warc_payload = warc_record.payload
    if warc_record.broken or not warc_payload or warc_payload is None or warc_record.id is None:
        return (yield "")

    ents = []
    soup = None
    try:
        soup = BeautifulSoup(warc_payload, "html.parser")
    except Exception:
        raise Exception("SOUP FAILED: %s %s\n" % (warc_payload, warc_record.id))

    for script in soup(['style', 'script', 'head', 'title', 'meta', '[document]',
                        'code', 'blockquote', 'cite']):
        script.extract()
    text = " ".join(re.findall(r'\w+', soup.get_text()))
    article = nlp(text)

    raw_words = []
    for ent in article.ents:
        if ent.label_ not in ["CARDINAL", "DATE", "QUANTITY", "TIME", "ORDINAL",
                              "MONEY", "PERCENT"] and len(ent.text.split(" ")) < 4:
            raw_words.append(ent.text)
            ents.append(ent)

    processed_results = {}
    output = []
    for ent in ents:
        stripped_ent = ent.text.strip()
        if stripped_ent == '':
            continue
        es_results = es.search(stripped_ent)
        sims = []
        abstracts = []
        processed_results = {}
        for es_result in es_results:
            if es_result.label in processed_results:
                continue
            else:
                processed_results[es_result.label] = True
                abstract = sparql.search(es_result.id, es_result.label, es_result.score)
                if abstract:
                    abstracts.append([es_result.id, abstract])
        if len(abstracts) > 0:
            output.append(stringify_reply(warc_record.id, stripped_ent, get_sim(text, abstracts)))
    yield output
def load_model(model_str):
    """
    Loads a model given an input string (could work differently)
    """
    if model_str == "en_core_web_md":
        import en_core_web_md
        model_func = en_core_web_md.load()
    if model_str == "en_resume_model":
        import en_resume_model
        model_func = en_resume_model.load()
    if model_str == "seed_data":
        import spacy
        from spacy.matcher import PhraseMatcher
        from spacy.tokens import Span

        class EntityMatcher(object):
            """
            Matcher with multi-case
            """

            def __init__(self, nlp, terms, label):
                patterns = [nlp(text.lower()) for text in terms]
                patterns += [nlp(text.upper()) for text in terms]
                patterns += [nlp(text.title()) for text in terms]
                self.matcher = PhraseMatcher(nlp.vocab)
                self.matcher.add(label, None, *patterns)
                self.name = '{}_matcher'.format(label)

            def __call__(self, doc):
                matches = self.matcher(doc)
                for match_id, start, end in matches:
                    span = Span(doc, start, end, label=match_id)
                    doc.ents = list(doc.ents) + [span]
                return doc

        nlp = spacy.load('en_core_web_md', disable=['ner'])
        with open("../data/seeds.json", 'r') as f:
            seed_data_json = json.loads(f.read())
        for key in seed_data_json.keys():
            terms = seed_data_json[key]
            # terms = spacy_similarity(terms, nlp, num_similar=2)
            entity_matcher = EntityMatcher(nlp, terms, key)
            nlp.add_pipe(entity_matcher)
        model_func = nlp
    return model_func
def eventLister(text, subject):
    # Load the medium English NLP model
    nlp = en_core_web_md.load()

    # Parse the document with spaCy
    doc = nlp(text)

    # Extract semi-structured statements
    statements = textacy.extract.semistructured_statements(doc, subject)

    # Return the results
    result = "Here are the things I know about " + subject + ":"
    for statement in statements:
        subject, verb, fact = statement
        result += f"\n - {fact}"
    return result
def get_people_orgs_from_text(text, model="large"):
    """
    Loads and runs a spaCy NLP object on the text provided.

    Returns two lists:
        (1) tuples of people (person, number of appearances)
        (2) tuples of organizations (organization, number of appearances)
    """
    # Step 1: Check model input and load nlp object
    if model.lower() == "large":
        nlp = en_core_web_lg.load()
    elif model.lower() == "medium":
        nlp = en_core_web_md.load()
    elif model.lower() == "small":
        nlp = en_core_web_sm.load()
    else:
        print("Invalid NLP model.")
        return [], []

    # Step 2: Process the text using spaCy and get lists of people and
    # organization entities
    doc = nlp(text)
    orgs = [x.text for x in doc.ents if x.label_ == 'ORG']
    people = [x.text for x in doc.ents if x.label_ == 'PERSON']
    # pprint([(x.text, x.label_) for x in doc.ents])

    # Step 3: Process and filter the list of organizations
    orgdict = dict()
    for item in orgs:
        if item in orgdict.keys():
            orgdict[item] += 1
        else:
            orgdict[item] = 1
    orgs = list(orgdict.items())
    orgs.sort(key=lambda x: x[1], reverse=True)
    orgs = filter_people_orgs(orgs)

    # Step 4: Process and filter the list of people
    peopledict = dict()
    for item in people:
        if item in peopledict.keys():
            peopledict[item] += 1
        else:
            peopledict[item] = 1
    people = list(peopledict.items())
    people.sort(key=lambda x: x[1], reverse=True)
    people = filter_people_orgs(people)

    return people, orgs
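# Usage sketch for get_people_orgs_from_text (assumes filter_people_orgs and the
# en_core_web_* packages used above are importable; the counts shown are only
# illustrative of the (name, count) tuple format):
sample = "Tim Cook said Apple will open a new campus, and Apple confirmed it."
people, orgs = get_people_orgs_from_text(sample, model="medium")
print(people)  # e.g. [('Tim Cook', 1)]
print(orgs)    # e.g. [('Apple', 2)]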