def test_issue10643(en_vocab):
    """Check that overlapping PhraseMatcher terms can be removed one by one."""
    sentence = "Only save out the binary data for the individual components ."
    doc = Doc(en_vocab, words=sentence.split())

    matcher = PhraseMatcher(en_vocab)
    # "binary" (key "0") is a prefix of "binary data" (key "1").
    matcher.add("0", [Doc(en_vocab, words=["binary"])])
    matcher.add("1", [Doc(en_vocab, words=["binary", "data"])])

    short_hit = (en_vocab.strings["0"], 4, 5)
    long_hit = (en_vocab.strings["1"], 4, 6)
    assert matcher(doc) == [short_hit, long_hit]

    # Dropping the shorter term must leave the longer one intact.
    matcher.remove("0")
    assert len(matcher) == 1
    assert matcher(doc) == [long_hit]

    # Dropping the last term leaves an empty matcher with no matches.
    matcher.remove("1")
    assert len(matcher) == 0
    assert not matcher(doc)
def test_phrase_matcher_remove_overlapping_patterns(en_vocab):
    """Removing a key whose patterns are prefixes of each other must not fail."""
    tokens = ["this", "is", "a", "word"]
    # One pattern per prefix length: "this", "this is", "this is a", ...
    prefix_patterns = [
        Doc(en_vocab, words=tokens[:size]) for size in range(1, len(tokens) + 1)
    ]
    matcher = PhraseMatcher(en_vocab)
    matcher.add("THIS", prefix_patterns)
    matcher.remove("THIS")
def dfprep(json_in, save_df, inputfile):
    """Build a BIO-tagged keyphrase frame from a JSON article dump and pickle it.

    Articles are read from ``json_in`` (index-oriented JSON). Rows with a null
    abstract or title, or without any keyword, are dropped. For every
    remaining article the frame holds:

    * ``SRC``       -- title + ' ' + abstract
    * ``TRG``       -- one space-separated B/I/O tag per spaCy token of SRC
    * ``keywords``  -- the article's keyword list
    * ``Extracted`` -- number of distinct (lower-cased) keyword phrases found
    * ``abskey``    -- keywords that were not found in SRC

    Args:
        json_in: Source handed to ``pandas.read_json``.
        save_df: Target handed to ``DataFrame.to_pickle``.
        inputfile: When equal to 1, ``json_in`` and ``save_df`` are replaced by
            the values of the dict literal stored in ``input.txt``.
    """
    if inputfile == 1:
        with open("input.txt", "r") as f:
            para = ast.literal_eval(f.read())
        json_in = para['json_in']
        save_df = para['save_df']

    with mlflow.start_run():
        # NOTE(review): downloads the model on every run through a shell string.
        print(subprocess.getoutput("python -m spacy download en_core_web_sm"))

        artpd = pd.read_json(json_in, orient='index', convert_dates=False,
                             convert_axes=False)
        artpda = artpd[artpd.abstract.notnull()].copy()
        # Build the mask from the frame being filtered; the original used the
        # unfiltered frame's mask, which pandas has to reindex (with a warning).
        artpda = artpda[artpda.title.notnull()]
        artpdak = artpda[artpda.keywords.str.len() > 0].copy()

        dataf = pd.DataFrame(
            index=artpdak.index,
            columns=['SRC', 'TRG', 'keywords', 'Extracted', 'abskey'])
        dataf.loc[:, 'SRC'] = artpdak.title + ' ' + artpdak.abstract
        dataf.loc[:, 'keywords'] = artpdak.keywords

        svoc = spacy.load("en_core_web_sm")
        matcher = PhraseMatcher(svoc.vocab, attr="LOWER")
        for pmid in dataf.index:
            t0 = dataf.loc[pmid]
            patterns = [svoc.make_doc(str(name)) for name in t0.keywords]
            # The matcher is reused across articles: register this article's
            # keywords here and remove them again at the end of the iteration.
            matcher.add("Names", None, *patterns)
            doc = svoc(t0.SRC)

            tags = ['O'] * len(doc)
            matched = set()  # distinct lower-cased phrases found in SRC
            for _, start, end in matcher(doc):
                tags[start] = 'B'
                tags[start + 1:end] = ['I'] * (end - start - 1)
                matched.add(str(doc[start:end]).lower())

            # str() mirrors how the patterns were built, so non-string
            # keywords are handled the same way in both places.
            abskw = [x for x in t0.keywords if str(x).lower() not in matched]

            dataf.loc[pmid, 'TRG'] = ' '.join(tags)
            dataf.loc[pmid, 'Extracted'] = len(matched)
            dataf.loc[pmid, 'abskey'] = abskw
            matcher.remove("Names")

        dataf.to_pickle(save_df)
def test_phrase_matcher_overlapping_with_remove(en_vocab):
    """Two keys sharing one pattern must be removable independently."""
    matcher = PhraseMatcher(en_vocab)
    # Both keys are registered with the very same single-token pattern.
    for key in ("TEST", "TEST2"):
        matcher.add(key, [Doc(en_vocab, words=["like"])])
    doc = Doc(en_vocab, words="I like Google Now best".split())

    assert "TEST" in matcher
    assert len(matcher) == 2
    assert len(matcher(doc)) == 2

    # Removing the first key keeps the second key's entry alive.
    matcher.remove("TEST")
    assert "TEST" not in matcher
    assert len(matcher) == 1
    assert len(matcher(doc)) == 1
    surviving_key = matcher(doc)[0][0]
    assert surviving_key == en_vocab.strings["TEST2"]

    # Removing the second key empties the matcher.
    matcher.remove("TEST2")
    assert "TEST2" not in matcher
    assert len(matcher) == 0
    assert len(matcher(doc)) == 0
def test_phrase_matcher_remove(en_vocab):
    """Exercise PhraseMatcher.remove for known keys and for an unknown key."""
    matcher = PhraseMatcher(en_vocab)
    matcher.add("TEST1", [Doc(en_vocab, words=["like"])])
    matcher.add("TEST2", [Doc(en_vocab, words=["best"])])
    doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])

    def check_state(registered, n_matches):
        # Membership of all three keys, then the number of matches on `doc`.
        for key in ("TEST1", "TEST2", "TEST3"):
            assert (key in matcher) == (key in registered)
        assert len(matcher(doc)) == n_matches

    check_state({"TEST1", "TEST2"}, 2)

    matcher.remove("TEST1")
    check_state({"TEST2"}, 1)

    matcher.remove("TEST2")
    check_state(set(), 0)

    # Removing a key that was never added raises and changes nothing.
    with pytest.raises(KeyError):
        matcher.remove("TEST3")
    check_state(set(), 0)
def get_context(doc, target_list, output_type='window', start_NE_num=0, end_NE_num=None, window_size=10):
    """Interactively print concordances of each term in ``target_list``.

    The sorted terms are listed first; after the user presses enter, every
    match of each selected term in ``doc`` is printed, waiting for enter after
    each one.

    Args:
        doc: spaCy ``Doc`` to search.
        target_list: Iterable of term strings. It is not modified; a sorted
            copy is used.
        output_type: ``'window'`` prints ``window_size`` tokens on either side
            of the match, ``'sent'`` prints the sentence containing the match.
            Any other value prints nothing for the match.
        start_NE_num: Index (into the sorted terms) of the first term to show.
        end_NE_num: Index one past the last term to show; ``None`` means all.
        window_size: Number of context tokens per side in ``'window'`` mode.
    """
    matcher = PhraseMatcher(nlp.vocab)
    # Sort a copy so the caller's list is left untouched.
    terms = sorted(target_list)
    print("Here are each of the relevant named entities:")
    for term in terms:
        print('\t* '+term)
    print("\n<Press enter to show concordances, one at a time.\n")
    input()
    if end_NE_num is None:
        end_NE_num = len(terms)
    for term in terms[start_NE_num:end_NE_num]:
        print("*"+term.upper()+"*")
        matcher.add("TermList", None, nlp.make_doc(term))
        for i, (_, start, end) in enumerate(matcher(doc)):
            if output_type == 'window':
                # Clamp at 0: a negative slice start would be counted from the
                # end of the doc and give an empty or unrelated window for
                # matches within the first `window_size` tokens.
                span = doc[max(start - window_size, 0):end + window_size]
                print(i, span.text+'\n')
            elif output_type == 'sent':
                span = doc[start:end]
                print(i, span.sent.text+'\n')
            input()
        # Clear this term so the next one is matched on its own.
        matcher.remove("TermList")
def mainpipe(inputfile, search_term, max_records, json_out, embvec,
             embvecache, val_ratio, rnnsize, batchsize, lr, weight_decay,
             n_epochs, model_save, es):
    """Fetch PubMed articles, train a keyphrase tagger and augment keywords.

    Steps, all inside one mlflow run: query PubMed, tag title+abstract with
    B/I/O labels from the author keywords, train ``RNNCRFTagger`` on articles
    with at least three extracted keyphrases, log the validation extraction
    rate, predict keyphrases for articles without keywords, write the combined
    frame to JSON and optionally bulk-index it into Elasticsearch.

    Args:
        inputfile: When equal to 1, most parameters are replaced by the values
            of the dict literal stored in ``input.txt``.
        search_term: PubMed query string.
        max_records: Maximum number of PubMed results.
        json_out: NOTE(review): never used; the JSON is written to a
            hard-coded path below -- confirm whether this was intended.
        embvec: 1 to load GloVe 840B/300d vectors; any other value trains the
            embeddings from scratch.
        embvecache: Cache directory handed to ``torchtext.vocab.GloVe``.
        val_ratio: Fraction of the training articles held out for validation.
        rnnsize: Passed to ``RNNCRFTagger``.
        batchsize: Passed to ``train``.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.
        n_epochs: Passed to ``train`` and logged to mlflow.
        model_save: Path handed to ``mlflow.pytorch.save_model``.
        es: When equal to 1, the output is indexed into a local Elasticsearch.
    """
    if inputfile == 1:
        with open("input.txt", "r") as f:
            para = ast.literal_eval(f.read())
        search_term = para['search_term']
        max_records = para['max_records']
        embvec = para['embvec']
        embvecache = para['embvecache']
        val_ratio = para['val_ratio']
        rnnsize = para['rnnsize']
        batchsize = para['batchsize']
        lr = para['lr']
        weight_decay = para['weight_decay']
        n_epochs = para['n_epochs']
        model_save = para['model_save']

    # Always bound, so the vocabulary step below cannot hit an unbound name
    # when pre-trained vectors were not requested.
    use_pretrained = False
    if embvec == 1:
        embvec = torchtext.vocab.GloVe(name='840B', dim=300, cache=embvecache)
        use_pretrained = True

    with mlflow.start_run():
        pubmed = PubMed(tool="AlphabetH", email="*****@*****.**")
        results = pubmed.query(search_term, max_results=max_records)

        pp = defaultdict(lambda: defaultdict(dict))
        for art in results:
            pmed = art.pubmed_id
            # Each field is copied best-effort: a missing attribute or a None
            # value in a concatenation simply leaves that field as it is.
            for field in ('title', 'abstract'):
                try:
                    pp[pmed][field] = getattr(art, field)
                except (AttributeError, TypeError):
                    pass
            # Structured-abstract sections are appended to the abstract text.
            for section in ('conclusions', 'methods', 'results'):
                try:
                    pp[pmed]['abstract'] = pp[pmed]['abstract'] + getattr(art, section)
                except (AttributeError, TypeError):
                    pass
            for field in ('keywords', 'authors', 'journal'):
                try:
                    pp[pmed][field] = getattr(art, field)
                except (AttributeError, TypeError):
                    pass
            try:
                pp[pmed]['pubdate'] = str(art.publication_date.year)
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['conclusions'] = art.conclusions
            except (AttributeError, TypeError):
                pass

        # NOTE(review): downloads the model on every run through a shell string.
        print(subprocess.getoutput("python -m spacy download en_core_web_sm"))

        artpd = pd.DataFrame.from_dict(pp, orient='index')
        artpda = artpd[artpd.abstract.notnull()].copy()
        # Mask taken from the frame being filtered (the original used the
        # unfiltered frame's mask, which pandas must reindex with a warning).
        artpda = artpda[artpda.title.notnull()]
        artpdak = artpda[artpda.keywords.str.len() > 0].copy()

        dataf = pd.DataFrame(
            index=artpdak.index,
            columns=['SRC', 'TRG', 'keywords', 'Extracted', 'abskey'])
        dataf.loc[:, 'SRC'] = artpdak.title + ' ' + artpdak.abstract
        dataf.loc[:, 'keywords'] = artpdak.keywords

        svoc = spacy.load("en_core_web_sm")
        matcher = PhraseMatcher(svoc.vocab, attr="LOWER")
        for pmid in dataf.index:
            t0 = dataf.loc[pmid]
            patterns = [svoc.make_doc(str(name)) for name in t0.keywords]
            # The matcher is shared: add this article's keywords, remove below.
            matcher.add("Names", None, *patterns)
            doc = svoc(t0.SRC)

            tags = ['O'] * len(doc)
            matched = set()  # distinct lower-cased phrases found in SRC
            for _, start, end in matcher(doc):
                tags[start] = 'B'
                tags[start + 1:end] = ['I'] * (end - start - 1)
                matched.add(str(doc[start:end]).lower())
            abskw = [x for x in t0.keywords if str(x).lower() not in matched]

            dataf.loc[pmid, 'TRG'] = ' '.join(tags)
            dataf.loc[pmid, 'Extracted'] = len(matched)
            dataf.loc[pmid, 'abskey'] = abskw
            matcher.remove("Names")

        # Only articles with at least three extracted keyphrases are trained on.
        datatrain = dataf[dataf['Extracted'] >= 3].copy()

        # separate train and validate
        dtrain = datatrain.loc[:, ['SRC', 'TRG']]
        dtraink = datatrain.loc[:, ['SRC', 'TRG', 'keywords']]
        seed = 250
        idx = np.arange(datatrain.shape[0])
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.shuffle(idx)
        val_size = int(len(idx) * val_ratio)
        df_train = dtrain.iloc[idx[val_size:], :]
        df_val = dtrain.iloc[idx[:val_size], :]
        df_val_k = dtraink.iloc[idx[:val_size], :]

        # Load original dataset
        datai = artpda.copy()
        datai = datai[datai.abstract.notnull()]
        datai = datai[datai.title.notnull()]
        datai = datai.replace('\n', ' ', regex=True)
        datai = datai.replace('\t', ' ', regex=True)
        # Own copy, so adding the SRC column does not write to a slice of datai.
        dataiu = datai.loc[datai.keywords.str.len() == 0].copy()
        dataik = datai.loc[datai.keywords.str.len() > 0]
        dataiu['SRC'] = dataiu.title + ' ' + dataiu.abstract

        def tokenizertrg(x):
            return x.split()

        def tokenizersrc(text):  # create a tokenizer function
            return [tok.text for tok in svoc.tokenizer(text)]

        def safe_value(field_val):
            return field_val if not pd.isna(field_val) else "Other"

        def safe_year(field_val):
            return field_val if not pd.isna(field_val) else 1900

        TEXT = torchtext.data.Field(init_token='<bos>', eos_token='<eos>',
                                    sequential=True, lower=False)
        LABEL = torchtext.data.Field(init_token='<bos>', eos_token='<eos>',
                                     sequential=True, unk_token=None)
        fields = [('text', TEXT), ('label', LABEL)]
        # Fall back to the CPU when no CUDA device is available.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        train_examples = read_data(df_train, fields, tokenizersrc, tokenizertrg)
        valid_examples = read_data(df_val, fields, tokenizersrc, tokenizertrg)

        # Load the pre-trained embeddings that come with the torchtext library.
        if use_pretrained:
            print('We are using pre-trained word embeddings.')
            TEXT.build_vocab(train_examples, vectors=embvec)
        else:
            print('We are training word embeddings from scratch.')
            TEXT.build_vocab(train_examples, max_size=5000)
        LABEL.build_vocab(train_examples)

        # Create one of the models defined above.
        model0 = RNNCRFTagger(TEXT, LABEL, rnnsize, emb_dim=300,
                              update_pretrained=False)
        model0.to(device)
        optimizer = torch.optim.Adam(model0.parameters(), lr=lr,
                                     weight_decay=weight_decay)
        train(train_examples, valid_examples, embvec, TEXT, LABEL, device,
              model0, batchsize, optimizer, n_epochs)

        out2 = evaltest2(df_val, df_val_k, model0, tokenizersrc, fields, device)
        ttp3 = kphperct(df_val_k, out2, svoc)
        mlflow.log_param("epochs", n_epochs)
        mlflow.pytorch.save_model(model0, model_save)
        mlflow.log_metric("extraction_rate", ttp3.mean())

        # NOTE(review): evaltest2 receives six arguments above and five here;
        # confirm against its definition.
        augout = evaltest2(dataiu, model0, tokenizersrc, fields, device)
        klist = kphext2(dataiu.SRC, augout, svoc)
        # Extend each article's keyword list in place. The column is addressed
        # by name instead of position 2, which was only the keywords column
        # when the first article happened to provide title and abstract.
        for i, kws in enumerate(dataiu['keywords']):
            kws.extend(set(klist[i]))

        output = pd.concat([dataik, dataiu], join="inner")
        # NOTE(review): hard-coded target; `json_out` is not used.
        output.to_json('/home/pding/OneDrive/kph/MSaug.json', orient='index')

        if es == 1:
            output['journal'] = output['journal'].apply(safe_value)
            output['conclusions'] = output['conclusions'].apply(safe_value)
            output['pubdate'] = output['pubdate'].apply(safe_year)
            output['PMID'] = output.index
            test_server = [{'host': '127.0.0.1', 'port': 9200}]
            # Separate name, so the `es` flag parameter is not overwritten.
            es_client = Elasticsearch(test_server, http_compress=True)
            use_these_keys = [
                'PMID', 'title', 'abstract', 'keywords', 'authors', 'pubdate'
            ]

            def filterKeys(document):
                return {key: document[key] for key in use_these_keys}

            def doc_generator(df):
                for _, document in df.iterrows():
                    yield {
                        "_index": 'ms',
                        "_source": filterKeys(document),
                    }

            helpers.bulk(es_client, doc_generator(output))

        print(ttp3.mean())
def main():
    """Run a cell-by-cell (``#%%``) walkthrough of spaCy and NLTK features.

    The sections cover, in order: pipeline inspection, POS/dependency
    printing, named entities (listing, adding, counting, visualising with
    displacy), noun chunks, dependency visualisation, NLTK stemming,
    lemmatisation, stop words, rule-based and phrase matching, sentence
    segmentation (default and custom) and POS/TAG/DEP frequency counts.

    Reads ``reaganomics.txt`` and ``owlcreek.txt`` from ``config.DATA_DIR``.

    NOTE(review): ``matcher.add(key, None, *patterns)``,
    ``nlp.add_pipe(<function>, before=...)`` and ``SentenceSegmenter`` look
    like the spaCy 2.x API -- confirm the spaCy version this targets.
    NOTE(review): several bare expressions below (``len(...)``, ``type(...)``,
    ``.is_stop``) discard their value when run as a script; they presumably
    were notebook cell outputs.
    """
    import nltk
    from nltk.stem.porter import PorterStemmer
    from nltk.stem.snowball import SnowballStemmer
    import spacy
    from spacy import displacy
    from spacy.matcher import Matcher
    from spacy.matcher import PhraseMatcher
    from spacy.tokens import Span
    from spacy.pipeline import SentenceSegmenter
    import config

    nlp = spacy.load('en_core_web_sm')

    #%%
    print(nlp.pipeline)
    print(nlp.pipe_names)

    #%%
    print("Data string examples \n")
    mystring = '"As of last quarter autonomous cars have shifted insurance liability toward manufacturers. ' \
               'There\'s a car factory in LA! About 5km away. ' \
               'Here is the Apple snail-mail: [email protected] or visit http://www.oursite.com."'
    mystring2 = 'I am a runner running in a race because I love to run since I ran today.'
    words = [
        'run', 'ran', 'runner', 'runs', 'fairly', 'fairness', 'generous',
        'generously', 'generate', 'generation'
    ]

    #%%
    print(
        "Print each word in the string with it's corresponding POS, dependency:"
    )
    doc1 = nlp(mystring)
    print("The vocab size for our small lang. lib. is: ", len(doc1.vocab))
    for token in doc1:
        print(token.text, token.pos, token.pos_, token.dep_)

    #%%
    print("Print the named entities:")
    for token in doc1.ents:
        print(
            f"{token.text} {10*'.'}\t {token.label_} {5*'.'}\t {spacy.explain(token.label_)}\n"
        )

    #%%
    print("A function to display basic entity info.")

    def show_ents(doc):
        """Print text, label and label explanation of every entity in `doc`."""
        if doc.ents:
            for ent in doc.ents:
                print(ent.text + ' - ' + ent.label_ + ' - ' +
                      str(spacy.explain(ent.label_)))
        else:
            print('\n No named entities found. \n')

    doc4 = nlp(
        u'May I go to Washington, DC next May to see the Washington Monument and buy Tesla stocks? The flight ticket is only 500 dollars.'
    )
    doc5 = nlp(u'Hi, Hope you are well.')
    show_ents(doc4)
    show_ents(doc5)

    #%%
    print("Adding a single term as an NER")
    from spacy.tokens import Span
    doc = nlp(u'Tesla to build a U.K. factory for $6 million')
    # Get the hash value of the ORG entity label
    ORG = doc.vocab.strings[u'ORG']
    print(ORG)
    # Create a Span for the new entity
    # doc: Name of document object
    # 0: start position of the span,
    # 1: stop position of the span (exclusive: not including 1)
    # Label: ORG is the label assigned to the entity
    new_ent = Span(doc, 0, 1, label=ORG)
    # Add the entity to the existing Doc object
    doc.ents = list(doc.ents) + [new_ent]
    show_ents(doc)

    #%%
    print("Adding multiple phrases as NERs")
    doc = nlp(u'Our company plans to introduce a new vacuum cleaner. '
              u'If successful, the vacuum cleaner will be our first product.')
    show_ents(doc)
    # Import PhraseMatcher and create a matcher object:
    from spacy.matcher import PhraseMatcher
    matcher = PhraseMatcher(nlp.vocab)
    # Create the desired phrase patterns:
    phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
    phrase_patterns = [nlp(text) for text in phrase_list]
    # Apply the patterns to our matcher object:
    matcher.add('newproduct', None, *phrase_patterns)
    # Apply the matcher to our Doc object:
    found_matches = matcher(doc)
    # See what matches occur:
    print(found_matches)
    # Here we create Spans from each match, and create named entities from them:
    from spacy.tokens import Span
    PROD = doc.vocab.strings[u'PRODUCT']
    new_ents = [
        Span(doc, match[1], match[2], label=PROD) for match in found_matches
    ]
    doc.ents = list(doc.ents) + new_ents
    show_ents(doc)

    #%%
    print("Counting Named Entities occurrences")
    doc = nlp(
        u'Originally priced at $29.50, the sweater was marked down to five dollars.'
    )
    show_ents(doc)
    len([ent for ent in doc.ents if ent.label_ == 'MONEY'])
    # For more on **Named Entity Recognition** visit https://spacy.io/usage/linguistic-features#101

    #%%
    print("Visualizing NER")
    doc = nlp(
        u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
        u'By contrast, Sony sold only 7 thousand Walkman music players.')
    displacy.render(doc, style='ent', jupyter=True)
    # NOTE(review): displacy.serve presumably blocks until the server is
    # stopped, so the following code would only run afterwards -- confirm.
    displacy.serve(doc1, style='ent')
    print('Viewing Sentences Line by Line')
    for sent in doc.sents:
        displacy.render(nlp(sent.text), style='ent', jupyter=True)
    print("Viewing Specific Entities, and customizing the visualization")
    options = {'ents': ['ORG', 'PRODUCT']}
    colors = {
        'ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)',
        'PRODUCT': 'radial-gradient(yellow, green)'
    }
    # Replaces the options dict assigned two statements above.
    options = {'ents': ['ORG', 'PRODUCT'], 'colors': colors}
    print('display entities on jupiter notebook')
    displacy.render(doc, style='ent', jupyter=True, options=options)
    print('Display entities on browser: http://127.0.0.1:5000 ')
    displacy.serve(doc1, style='ent', options=options)
    # For more on applying CSS background colors and gradients, visit https://www.w3schools.com/css/css3_gradients.asp
    # https://spacy.io/usage/visualizers

    #%%
    print("Visualize entity recognizer with Spacy (line by line)")
    doc1 = nlp(mystring)
    spans = list(doc1.sents)
    # print('display entities on jupiter notebook')
    # displacy.render(spans,style='ent',jupyter=True,options={'distance':80})
    print('Display entities on browser: http://127.0.0.1:5000 ')
    displacy.serve(spans, style='ent', options=options)

    #%%
    print("Visualize entity recognizer with Spacy (whole paragraph)")
    # print('display entities on jupiter notebook')
    # displacy.render(doc1,style='ent',jupyter=True,options={'distance':80})
    print('Display entities on browser: http://127.0.0.1:5000 ')
    displacy.serve(doc1, style='ent', options=options)

    #%%
    print("List name Chunks:")
    for token in doc1.noun_chunks:
        print(token)
    # For more on **noun_chunks** visit https://spacy.io/usage/linguistic-features#noun-chunks

    #%%
    print("Dependency visualization with Spacy ")
    # style 'dep': shows pos tags and syntactic dependencies
    # NOTE(review): 'compact' is given as the string 'True', not a bool.
    options = {
        'distance': 80,
        'compact': 'True',
        'color': 'yellow',
        'bg': '#09a3d5',
        'font': 'Times'
    }
    print('display dependencies on jupiter notebook')
    displacy.render(doc1, style='dep', jupyter=True, options=options)
    print('Display dependencies on browser: http://127.0.0.1:5000 ')
    displacy.serve(doc1, style='dep', options=options)

    #%%
    print(
        "Spacy doesn't include a Stemmer. Instead it relies on lemmatization entirely. \n"
        "We use NLTK Porter and Snowball Stemmers here.")
    p_stemmer = PorterStemmer()
    for word in words:
        print(f"{word}, {10*'.'}, {p_stemmer.stem(word)}")
    s_stemmer = SnowballStemmer(language='english')
    for word in words:
        print(f"{word}, {10*'.'}, {s_stemmer.stem(word)}")

    #%%
    print("Perform Lemmatization with Spacy")
    text = nlp(mystring2)

    def show_lemmas(text):
        """Print text, POS, lemma hash, lemma, tag and tag explanation per token."""
        for token in text:
            print(
                f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{12}} {token.lemma_:{6}} {token.tag_:{6}} {spacy.explain(token.tag_)}'
            )

    show_lemmas(text)

    #%%
    print("Remove/Add stopwords with Spacy")
    print(nlp.Defaults.stop_words)  # Print the List of Spacy stopwords
    len(nlp.Defaults.stop_words)  # Number of default stopwords in Spacy
    nlp.vocab[
        'is'].is_stop  # Tells if the vocab is among Spacy stopwords or not
    nlp.vocab['mystery'].is_stop
    nlp.Defaults.stop_words.add(
        'btw')  # Adding to the Spacy's list of stopwords
    nlp.vocab['btw'].is_stop = True  # set it to True
    nlp.Defaults.stop_words.remove(
        'six')  # Removing from the Spacy's list of stopwords
    nlp.vocab['six'].is_stop = False

    #%%
    print(
        "RuleBased Vocabulary Matching.\n More powerful version of the regular expressions"
    )
    # looking for 3 different forms of the same pattern here
    matcher = Matcher(nlp.vocab)
    # a single token whose lowercase text reads 'solarpower'
    pattern1 = [{'LOWER': 'solarpower'}]
    # two adjacent tokens that read 'solar' and 'power' in that order
    pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]
    # three adjacent tokens, with a middle token that can be any punctuation
    pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]
    # Option (OP) key '*' : allows pattern 0 or more times
    pattern4 = [{
        'LOWER': 'solar'
    }, {
        'IS_PUNCT': True,
        'OP': '*'
    }, {
        'LOWER': 'power'
    }]
    # add patterns to matcher labeled 'SolarPowerMatcherName'
    matcher.add('SolarPowerMatcherName', None, pattern1, pattern2, pattern3,
                pattern4)
    doc = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity as solar--power shows more strength'
              )
    found_matches = matcher(doc)
    print(
        found_matches)  # gives you tuples with match_id, start, and end index
    for match_id, start, end in found_matches:  # grabs raw matched-vocab with match_id, start, and end index
        string_id = nlp.vocab.strings[match_id]  # get string representation
        span = doc[start:end]  # get the matched span
        print(match_id, string_id, start, end, span.text)
    # remove the patterns identified under 'SolarPowerMatcherName' label to avoid duplicates in next search
    matcher.remove('SolarPowerMatcherName')

    #%%
    print(
        "RuleBased Phrase Matching.\n More powerful version of the regular expressions"
    )
    matcher = PhraseMatcher(nlp.vocab)
    # if your file gave you utf8 file error run this on terminal:
    # iconv -f iso-8859-1 -t utf-8 original_file > new_file
    doc2_path = config.DATA_DIR + 'reaganomics.txt'
    # NOTE(review): opened without an explicit encoding, so the platform
    # default is used.
    with open(doc2_path) as f:
        doc2 = nlp(f.read())
    # First, create a list of match phrases:
    phrase_list = [
        'voodoo economics', 'supply-side economics', 'trickle-down economics',
        'free-market economics'
    ]
    # Next, convert each phrase to a Doc object:
    phrase_patterns = [nlp(text) for text in phrase_list]
    # Pass each Doc object into matcher (note the use of the asterisk!):
    matcher.add('VoodooEconomics', None, *phrase_patterns)
    # Build a list of matches:
    found_matches = matcher(doc2)
    for match_id, start, end in found_matches:  # grabs raw matched-vocab with match_id, start, and end index
        string_id = nlp.vocab.strings[match_id]  # get string representation
        span = doc2[start:end]  # get the matched span
        print(match_id, string_id, start, end, span.text)

    #%%
    print("going through doc sentences")
    with open(config.DATA_DIR + 'owlcreek.txt') as f:
        doc = nlp(f.read())
    sents = [sent for sent in doc.sents]
    len(sents)

    #%%
    print("sentense segmentation")
    '''
    It is important to note that `doc.sents` is a *generator*. That is, a Doc is not segmented until `doc.sents` is called.
    This means that, where you could print the second Doc token with `print(doc[1])`, you can't call the'
    "second Doc sentence" with `print(doc.sents[1])`
    However, you *can* build a sentence collection by running `doc.sents` and saving the result to a list
    '''
    doc = nlp(
        u'This is the first sentence. This is another sentence. This is the last sentence.'
    )
    for sent in doc.sents:
        print(sent)
    print(doc[1])
    type(list(doc.sents)[0])  # it is a span type not string
    # print(doc.sents[1]) gives you error, you should use the following instead
    print(list(doc.sents)[0])
    doc_sents = [sent for sent in doc.sents]  # Now you can access individual sentences
    print(doc_sents[1])
    # At first glance it looks like each `sent` contains text from the original Doc object. In fact they're just Spans
    # with start and end token pointers.
    type(doc_sents[1])
    print(doc_sents[1].start, doc_sents[1].end)

    #%%
    print("Spacy's built-in `sentencizer` for sentense segmentation")
    """
    spaCy's built-in `sentencizer` relies on the dependency parse and end-of-sentence punctuation to determine
    segmentation rules. We can add rules of our own, but they have to be added *before* the creation of the Doc object,
    as that is where the parsing of segment start tokens happens
    """
    # Parsing the segmentation start tokens happens during the nlp pipeline
    doc2 = nlp(u'This is a sentence; This is a sentence. This is a sentence.')
    for token in doc2:
        print(token.is_sent_start, ' ' + token.text)
    for sent in doc2.sents:
        print(sent)

    #%%
    print("ADD A NEW SEGMENTATION RULE TO THE PIPELINE-part2")

    def set_custom_boundaries(doc):
        """Mark the token following each ';' as a sentence start; return `doc`."""
        # The last token is skipped because it has no successor to mark.
        for token in doc[:-1]:
            if token.text == ';':
                doc[token.i + 1].is_sent_start = True
        return doc

    nlp.add_pipe(set_custom_boundaries, before='parser')
    print(
        nlp.pipe_names)  # ['tagger', 'set_custom_boundaries', 'parser', 'ner']
    # Re-run the Doc object creation:
    doc4 = nlp(
        u'"Management is doing things right; leadership is doing the right things." -Peter Drucker'
    )
    for sent in doc4.sents:  # separates sentences on semicolon
        print(sent)
    # And yet the new rule doesn't apply to the older Doc object:
    for sent in doc2.sents:
        print(sent)

    #%%
    print("ADD CHANGE SEGMENTATION RULES TO THE PIPELINE-part2")
    """
    Why not simply set the `.is_sent_start` value to True on existing tokens?
    In some cases we want to *replace* spaCy's default sentencizer with our own set of rules.
    In this section we'll see how the default sentencizer breaks on periods.
    We'll then replace this behavior with a sentencizer that breaks on linebreaks.
    """
    nlp = spacy.load('en_core_web_sm')  # reset to the original
    mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."
    # SPACY DEFAULT BEHAVIOR:
    doc = nlp(mystring)
    for sent in doc.sents:
        print([token.text for token in sent])

    def split_on_newlines(doc):
        """Yield spans of `doc`, starting a new span after newline tokens."""
        #split on newlines instead of `.`
        start = 0
        seen_newline = False
        for word in doc:
            if seen_newline:
                yield doc[start: word.i]  #word.i --> current word index position
                start = word.i
                seen_newline = False
            elif word.text.startswith('\n'):  # handles multiple occurrences
                seen_newline = True
        yield doc[start:]  # handles the last group of tokens

    sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
    nlp.add_pipe(sbd)
    doc = nlp(mystring)
    for sent in doc.sents:
        print([token.text for token in sent])

    #%%
    print("Perform POS with Spacy")
    text = nlp(u"I read books on NLP.")
    text2 = nlp(u"I read a book on NLP.")
    word = text[1]
    print(f'{word} : {type(word)}')
    print(f'{word.text} : {type(word.text)}')

    def show_pos(text):
        """Print text, coarse POS, fine-grained tag and tag explanation per token."""
        for token in text:
            print(
                f'{token.text:{12}} {token.pos_:{6}} {token.tag_:{6}} {spacy.explain(token.tag_)}'
            )

    # the pos shows 'read' is past/present tense
    print('\n read (present tense)\n')
    show_pos(text)
    print(f'\n read (past tense)\n')
    show_pos(text2)

    #%%
    print("Count different coarse-grained POS codes\n")
    doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")
    POS_counts = doc.count_by(spacy.attrs.POS)
    print('POS_counts:', POS_counts)
    # NOTE(review): 83 is a hard-coded attribute ID; which POS it denotes is
    # not visible here.
    print('Associated `item` for the POS `key #`: ', doc.vocab[83].text)
    print(
        'Creat frequency list of POS tags since `POS_counts` returns a dictionary with `POS_counts.items()\n'
    )
    for k, v in sorted(POS_counts.items()):
        print(f'{k}. {doc.vocab[k].text:{5}}: {v}')

    #%%
    print("Count different coarse-grained Tag codes\n")
    TAG_counts = doc.count_by(spacy.attrs.TAG)
    for k, v in sorted(TAG_counts.items()):
        print(f'{k}. {doc.vocab[k].text:{4}}: {v}')

    #%%
    print('Count the different dependencies (DEP) codes\n')
    DEP_counts = doc.count_by(spacy.attrs.DEP)
    for k, v in sorted(DEP_counts.items()):
        print(f'{k}. {doc.vocab[k].text:{4}}: {v}')