Example #1
def test_issue3531():
    """Test that displaCy renderer doesn't require "settings" key."""
    example_dep = {
        "words": [
            {"text": "But", "tag": "CCONJ"},
            {"text": "Google", "tag": "PROPN"},
            {"text": "is", "tag": "VERB"},
            {"text": "starting", "tag": "VERB"},
            {"text": "from", "tag": "ADP"},
            {"text": "behind.", "tag": "ADV"},
        ],
        "arcs": [
            {"start": 0, "end": 3, "label": "cc", "dir": "left"},
            {"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
            {"start": 2, "end": 3, "label": "aux", "dir": "left"},
            {"start": 3, "end": 4, "label": "prep", "dir": "right"},
            {"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
        ],
    }
    example_ent = {
        "text": "But Google is starting from behind.",
        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
    }
    dep_html = displacy.render(example_dep, style="dep", manual=True)
    assert dep_html
    ent_html = displacy.render(example_ent, style="ent", manual=True)
    assert ent_html
Example #2
def test_issue3288(en_vocab):
    """Test that retokenization works correctly via displaCy when punctuation
    is merged onto the preceeding token and tensor is resized."""
    words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"]
    heads = [1, 0, -1, 1, 0, 1, -2, -3]
    deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"]
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    doc.tensor = numpy.zeros((len(words), 96), dtype="float32")
    displacy.render(doc)
Example #3
def test_displacy_rtl():
    # Source: http://www.sobhe.ir/hazm/ – is this correct?
    words = ["ما", "بسیار", "کتاب", "می\u200cخوانیم"]
    # These are (likely) wrong, but it's just for testing
    pos = ["PRO", "ADV", "N_PL", "V_SUB"]  # needs to match lang.fa.tag_map
    deps = ["foo", "bar", "foo", "baz"]
    heads = [1, 0, 1, -2]
    nlp = Persian()
    doc = get_doc(nlp.vocab, words=words, pos=pos, tags=pos, heads=heads, deps=deps)
    doc.ents = [Span(doc, 1, 3, label="TEST")]
    html = displacy.render(doc, page=True, style="dep")
    assert "direction: rtl" in html
    assert 'direction="rtl"' in html
    assert 'lang="{}"'.format(nlp.lang) in html
    html = displacy.render(doc, page=True, style="ent")
    assert "direction: rtl" in html
    assert 'lang="{}"'.format(nlp.lang) in html
Example #4
def test_issue2361(de_tokenizer):
    chars = ("<", ">", "&", """)
    doc = de_tokenizer('< > & " ')
    doc.is_parsed = True
    doc.is_tagged = True
    html = render(doc)
    for char in chars:
        assert char in html
Example #5
def test_displacy_render_wrapper(en_vocab):
    """Test that displaCy accepts custom rendering wrapper."""

    def wrapper(html):
        return "TEST" + html + "TEST"

    displacy.set_render_wrapper(wrapper)
    doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
    html = displacy.render(doc, style="ent")
    assert html.startswith("TEST<div")
    assert html.endswith("/div>TEST")
    # Restore
    displacy.set_render_wrapper(lambda html: html)
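A small, hedged illustration (not part of the test above) of how such a render wrapper might be used outside of tests, e.g. to wrap every rendered visualization in a custom container; the class name is illustrative:

# Illustrative: wrap all displaCy output in a custom <figure> container.
def wrap_in_figure(html):
    return '<figure class="displacy-output">' + html + '</figure>'

displacy.set_render_wrapper(wrap_in_figure)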
Example #6
def to_html(doc, output='/tmp', style='dep'):
    """Doc method extension for saving the current state as a displaCy
    visualization.
    """
    # generate filename from first six non-punct tokens
    file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
    html = displacy.render(doc, style=style, page=True)  # render markup
    if output is not None:
        output_path = Path(output)
        if not output_path.exists():
            output_path.mkdir()
        output_file = Path(output) / file_name
        output_file.open('w', encoding='utf-8').write(html)  # save to file
        print('Saved HTML to {}'.format(output_file))
    else:
        print(html)
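A minimal sketch (assuming spaCy v2+'s extension API; the registration call is not part of the snippet above) of how to_html could be attached to Doc objects so it can be called as doc._.to_html():

# Minimal sketch: register to_html as a Doc extension method.
from spacy.tokens import Doc

Doc.set_extension("to_html", method=to_html)
# Usage (assumes `nlp` is a loaded pipeline):
# nlp("This is a sentence about displaCy.")._.to_html(output="/tmp", style="dep")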
Example #7
def file_handler(image_id, part_table, part_slot, encoded_filepath, view_type):
    """Display page for a file system element.
    If the element is a directory then the page displays the directory listing
    as read from the disk image.
    If a file is selected the files contents as a binary payload is sent in
    the Response.
    """
    file_path = urllib.unquote(encoded_filepath)
    partition = _found_or_404(Partition.by_image_table_and_slot(image_id, part_table, part_slot))
    fs_ele = _found_or_404(FileSysEle.from_partition(partition, file_path))
    # Check if we have a directory
    if fs_ele.is_directory:
        # Render the dir listing template
        return _render_directory(partition, file_path)

    # It's a file; we'll need a temp file to analyse or serve
    temp_file = FileSysEle.create_temp_copy(partition, fs_ele)
    # Get the byte stream object and index it.
    byte_sequence, full_text =\
            ImageIndexer.get_path_details(temp_file, os.path.abspath(fs_ele.path))

    # Build the NLP object from extracted full_text, generate entity markup
    full_text_nlp_obj = nlp(unicode(full_text, 'utf-8'))
    full_text_entity_html = displacy.render(full_text_nlp_obj, style='ent', page=False)

    # Check whether this path has been indexed and the results are in the DB
    file_element = FileElement.by_partition_and_path(partition, file_path)
    if file_element is None:
        # If not then add the path and p
        file_element = FileElement(file_path, partition, byte_sequence)

    # Is this a blob request
    if request_wants_binary():
        return send_file(temp_file, mimetype=byte_sequence.mime_type,
                         as_attachment=True, attachment_filename=fs_ele.name)

    # Return correct view depending on URL parameter
    if view_type == 'text-view': 
        return render_template('text_analysis.html', image=partition.image, partition=partition,
                               file_path=file_path, fs_ele=fs_ele, file_element=file_element,
                               full_text=full_text)
    else:
        return render_template('entity_analysis.html', image=partition.image, partition=partition,
                               file_path=file_path, fs_ele=fs_ele, file_element=file_element,
                               full_text=full_text_entity_html)
Example #8
def test_displacy_raises_for_wrong_type(en_vocab):
    with pytest.raises(ValueError):
        displacy.render("hello world")
Example #9
        d = dict(start=m.start(), end=m.end(), label="")
        ents.append(d)

#sort the entities by start offset, as the manual "ent" renderer expects
sort_ents = sorted(ents, key=lambda x: x["start"])

st.header('Output')

result_view = st.radio("Choose visualization type",
                       ('Highlighting', 'Word cloud', 'Table'),
                       index=0)
if result_view == 'Highlighting':
    #use spacy to highlight the keywords
    ex = [{"text": text, "ents": sort_ents, "title": None}]

    html = displacy.render(ex, style="ent", manual=True)
    html = html.replace("\n", " ")
    st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
elif result_view == "Table":
    #tabular data (columns: keywords, score)
    df = pd.DataFrame(keywords, columns=("keywords", "score"))
    st.table(df)

else:
    #create and generate a word cloud image
    wordcloud = WordCloud(width=1000,
                          height=600,
                          max_font_size=80,
                          min_font_size=10,
                          prefer_horizontal=1,
                          max_words=numOfKeywords,
Example #10
def test_displacy_raises_for_wrong_type(en_vocab):
    with pytest.raises(ValueError):
        displacy.render("hello world")
Example #11
import spacy
from spacy import displacy


# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_sm")


# Process whole documents
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc = nlp(text)

displacy.render(doc, style='dep')
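Note that outside a Jupyter notebook, displacy.render returns the markup as a string rather than displaying it. A minimal, hedged sketch of saving or serving the result (not part of the original script; the filename is illustrative):

# Outside Jupyter, render() returns markup; write it to a file or use displacy.serve().
svg = displacy.render(doc, style="dep", jupyter=False)
with open("dep_parse.svg", "w", encoding="utf-8") as f:
    f.write(svg)
# Alternatively, displacy.serve(doc, style="dep") starts a local web server.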
Example #12
def main():
	'''Creates a main title and subheader on your page -
	these are static across all pages'''
	st.title("Tweet Classifier")
	st.subheader("Climate change tweet classification")

	# Creating sidebar with selection box -
	# you can create multiple pages this way
	options = ["Prediction", "Natural Language Processing Tool", "Exploratory Data Analysis"]
	selection = st.sidebar.selectbox("Choose Option", options)

	##### Building out the Prediction page ####
	if selection == "Prediction":
		st.markdown("# Machine Learning Model Predictions")
		st.markdown('Sentiment analysis is the classification of text in emotional categories such as positive, neutral, negative and news. The following machine learning models were built and trained to predict the emotional drive of tweets related to climate change. Please enter your text below and select a machine learning model to predict the sentiment of your text.')
		raw_text = st.text_area("Enter Text","Type Here")		
		

		# Model Prediction

		#Select model
		all_ml_modles= ["Linear SVC","Naive Bayes", "Logistic Regression"]
		model_choice = st.selectbox("Select base ML model",all_ml_modles)
		
		st.markdown("#### Select 'Classify' to view the result of the model prediction")
		st.markdown("")
		prediction_labels = {'anti climate change':-1,'news':2,'pro climate change':1,'neutral':0}
		if st.button("Classify"):
			#st.text("Original Text:\n{}".format(raw_text))
			vect_text = tweet_cv.transform([raw_text]).toarray()

			if model_choice == 'Linear SVC':
				predictor = joblib.load(open(os.path.join("resources/models/linsvcmodel.pkl"),"rb"))
				prediction = predictor.predict(vect_text)
				# st.write(prediction)
			elif model_choice == 'Naive Bayes':
				predictor = joblib.load(open(os.path.join("resources/models/naivebayesmodel.pkl"),"rb"))
				prediction = predictor.predict(vect_text)
				# st.write(prediction)
			elif model_choice == 'Logistic Regression':
				predictor = joblib.load(open(os.path.join("resources/models/logisticregression.pkl"),"rb"))
				prediction = predictor.predict(vect_text)
				# st.write(prediction)

			final_result = get_keys(prediction,prediction_labels)
			st.success("Tweet categorized as : {} using the {} model".format(final_result, model_choice))

	##### Building out the NLP page ####
	if selection == "Natural Language Processing Tool":
		st.markdown('# Natural Language Processing Tool')
		st.markdown('Natural language processing, commonly known as NLP, is a field of artificial intelligence about the interaction between computers and humans using natural language. The objective of NLP is for the computer to read, understand and derive meaning from human languages.')
		st.markdown('The following text processing tools can be viewed on your input text below:\n'
					'- **Tokenization** - Listing each word and punctuation \n'
					'- **Lemmatization** - Returns single base form of a word \n'
					'- **Named-entity recognition (NER)** - Locate and classify entities in categories such as person names and organisations\n'
					'- **Parts of Speech tags (POS)** - The identification of words as nouns, verbs, adjectives, etc.')

		nlp_text = st.text_area("Enter your text to see how text is processed using the Spacy library.","Type Here")
		nlp_task = ["Tokenization","Lemmatization","NER","POS Tags"]
		task_choice = st.selectbox("Choose NLP Task",nlp_task)
		
		docx = nlp(nlp_text)
		lemma = [word.lemma_ for word in docx]
		token = [word.text for word in docx]
		tag = [word.tag_ for word in docx]
		depend = [word.dep_ for word in docx]
		pos = [token.pos_ for token in docx ]
		
		if st.button("Analyze"):

			if task_choice == 'Tokenization':
				token_df =pd.DataFrame(token, columns = ['Tokens'])
				st.dataframe(token_df)
			elif task_choice == 'Lemmatization':
				lemma_df = pd.DataFrame(zip(token, lemma), columns=['Tokens', 'Lemma'])
				st.dataframe(lemma_df)
			elif task_choice == 'NER':
				html = displacy.render(docx,style="ent")
				html = html.replace("\n\n","\n")
				st.write(HTML_WRAPPER.format(html),unsafe_allow_html=True)
			elif task_choice == 'POS Tags':
				pos_df=pd.DataFrame(zip(token, tag, depend), columns=['Tokens', 'Tag', 'Dependency'])
				st.dataframe(pos_df)
		st.markdown('---')
		#NLP table	
		st.markdown('## View table of NLP results')
		st.markdown("Select 'View Table' to view a table of the tokens, lemma and POS tags of your text.")
		if st.button("View Table"):
			docx = nlp(nlp_text)
			table_df = pd.DataFrame(zip(token,lemma,pos),columns=['Tokens','Lemma','POS'])
			st.dataframe(table_df)
		
		st.markdown('---')
		#Word cloud
		st.markdown('## Generate text Word Cloud')
		st.markdown("Select 'Generate Word Cloud' to view a word cloud of the most common words in your text")
		if st.button("Generate Word Cloud"):
			wordcloud =  WordCloud().generate(nlp_text)
			plt.imshow(wordcloud)
			plt.axis("off")
			st.pyplot()

	##### Building out the EDA page #####
	
	if selection == "Exploratory Data Analysis":
		# You can read a markdown file from supporting resources folder
		st.markdown("# Exploratory Data Analysis")
		st.markdown('This page discusses the Exploratory Data Analysis done on the Twitter data received to analyse and to build predictive machine learning models. Here you will find some of the insights from exploring the data as well as visualisations to describe some of our findings.')		

		#Sentiment Description
		st.markdown("## Sentiment Description")
		st.markdown("The table displays the description of each sentiment category.")
		# Image
		st.image(Image.open(os.path.join("resources/imgs/sentiment_description.png")))
		
		# Show dataset
		st.markdown("## Raw Twitter data and labels")
		st.markdown("Select the checkbox to view the original data")
		if st.checkbox('Show raw dataset'): # data is hidden if box is unchecked
			st.dataframe(raw_df) # will write the df to the page
		
		# Dimensions
		st.markdown("## Dataframe Dimensions")
		st.markdown("Select the buttons below to view the number of rows and columns for the raw dataset")
		data_dim = st.radio('Select dimension',('All','Rows','Columns'))
		if data_dim == 'All':
			st.text("Showing Shape of Entire Dataframe")
			st.info(raw_df.shape)
		if data_dim == 'Rows':
			st.text("Showing Length of Rows")
			st.info(raw_df.shape[0])
		if data_dim == 'Columns':
			st.text("Showing Length of Columns")
			st.info(raw_df.shape[1])

		# Count of labels
		st.markdown("## Sentiment labels")
		st.markdown("Below is a table displaying the count of each sentiment in the dataset. Majority of the tweets are positive(1) towards climate change. The least amount of tweets are negative(-1). This means that we have an unbalanced dataset that might have an effect on our prediction models. Select 'Show Bar Graph' to view this information visually.")
		bar_info = pd.DataFrame(raw_df['sentiment'].value_counts(sort=False))
		bar_info.reset_index(level=0, inplace=True)
		bar_info.columns = ['Sentiment','Count']
		bar_info['Percentage'] = [(i/len(raw_df['sentiment'])*100) for i in bar_info['Count']]
		st.dataframe(bar_info[['Sentiment','Count']])

		# Bar Graph
		if st.button("Show Bar Graph"):
			sns.set(font_scale=.6)
			sns.set_style('white')
			plot = sns.catplot(x="sentiment", kind="count", edgecolor=".6",palette="pastel",data=df_with_metadata,label='small')
			plot.fig.set_figheight(2.5)
			plt.xlabel("Sentiment")
			plt.ylabel("Count")
			plt.title("Sentiment counts")
			st.pyplot(bbox_inches="tight")


		#Clean dataset
		st.markdown("# Processed dataset")

		# Clean tweets
		st.markdown("Select the checkbox to view the processed data with additional information extracted from the text.")
		if st.checkbox('Show processed dataset'): # data is hidden if box is uncheckedz
			st.dataframe(df_with_metadata)	

		# Retweets
		st.markdown("## Retweets")
		st.markdown("The first thing we look at is the retweets. We find that just over 60% of the tweets are retweets. There is a possibility that some of these retweets are duplicates. We also look at the top 5 most retweeted tweets and how many times they were retweeted.")

		valuecounts = df_with_metadata['retweet'].value_counts()
		st.write('No: ', round(valuecounts[1]/len(df_with_metadata['retweet'])*100,2),'%')
		st.write('Yes: ', round(valuecounts[0]/len(df_with_metadata['retweet'])*100,2),'%')
		#Bar graph of number of rewteets
		sns.set(font_scale=.6)
		sns.set_style('white')
		plot = sns.catplot(x="retweet", kind="count", edgecolor=".6",palette="pastel",data=df_with_metadata);
		plt.xlabel("Retweet")
		plt.ylabel("Count")
		plt.title("Retweet count")
		plot.fig.set_figheight(2.5)
		st.pyplot(bbox_inches="tight")	
		
		#View the top retweeted tweets
		tdf = pd.DataFrame(df_with_metadata['message'].astype(str).value_counts())
		st.dataframe(tdf[:6])

		# Word Cloud - Static wordcloud
		st.markdown('## Hashtags and Mentions')
		st.markdown('We can tell a lot from the sentiment of tweets by looking at the hashtags or mentions that are used. Select an option from the dropdown menu to view a Word Cloud of the most common mentions and hashtags. You can also view the top mentions and hashtags per category.')
		wc_options = ["Top Hashtags", "Top Mentions", "Top Hashtags by Sentiment","Top Mentions by Sentiment"]
		wc_selection = st.selectbox("Select Word Cloud OPtion", wc_options)
		
		if wc_selection=="Top Hashtags":
			newsimg = Image.open('resources/imgs/TopHashWC.png')
			st.image(newsimg)
		elif wc_selection=="Top Mentions":
			newsimg = Image.open('resources/imgs/TopMentionWC.png')
			st.image(newsimg)
		elif wc_selection=="Top Hashtags by Sentiment":
			newsimg = Image.open('resources/imgs/HashtagCatWC.png')
			st.image(newsimg,  width=700)
		elif wc_selection=="Top Mentions by Sentiment":
			newsimg = Image.open('resources/imgs/MentionsCatWC.png')
			st.image(newsimg, width=700)
		
		st.markdown('---')
		st.markdown('Select a checkbox below to view a table of the top hashtags or mentions for each category and how often they appear:')
		if st.checkbox('View top hashtags table'):
			st.dataframe(top_hashtags_df)
		if st.checkbox('View top mentions table'):
			st.dataframe(top_mentions_df)
		st.markdown('---')

		st.markdown('After looking at the top mentions and hashtags from the wordcloud above and doing some research, we can make a couple of assumptions: \n\n'
					'- This data seems to be taken from Americans around the time of the 2016 US presidential elections.\n\n'
					'- **@realDonaldTrump** is the top mentioned account. \n\n'
					'- **#Climatechange**, **#climate**, and **#Trump#** are the three most used hashtags')


		# Most Common Words
		st.markdown("## Most Common Words")
		st.markdown('If we look at the most common words used, we see the following:\n\n'
		"- For all the words: **climate**, **change**, **rt**, **global**,and **warming** all are at the top of the word counts. These are top   occurrences throughout all categories.\n\n"
		"- For negative words: **science**, **cause**, **real**, and **scam** stand out as top words that are distinct to the negative category.\n\n"
		"- For news words: **fight**, **epa**, **pruit**, **scientist**, and **new** stand out as top words that are distinct to the news category.")
		st.dataframe(top_words_df)		

		# Conclusion
		#st.markdown("## Conclusion")

		# Most Common Words
		st.markdown("## Created by:")
		st.markdown('\n'
		"- Karin Louw\n"
		"- Jonathan Dankers\n"
		"- Luvuyo Nkosana\n"
		"- Wright Nyoka\n"
		"- Kwande Skaap\n"
		"- Tsholofelo Mautjana")
            elif opt == 2:
                got: str = input(
                    'Input # of desired results, a semicolon, and search text: '
                )
                gotargs: List[str] = got.split(';')
                n: int = int(gotargs[0])
                recommendation_list = recommend_courses_using_search_text(
                    gotargs[1], n)
                pprint(recommendation_list)

            elif opt == 3:
                got: str = input('Input course to generate an SVG tree for: ')
                gotargs: List[str] = got.split(' ')
                course: cd.Course = cd.Course(gotargs[0], gotargs[1])
                with Path(course.program + course.designation + '.svg').open(
                        'w+', encoding='utf-8') as svg:
                    svg.write(
                        displacy.render(course_nlp_descs[course],
                                        style='dep',
                                        options={
                                            'compact': True,
                                            'bg': 'white',
                                            'color': 'black',
                                            'font': 'DejaVu Sans Mono'
                                        }))

        except Exception as e:
            print('Failed:', e)

# 20;CS 2201;EECE 2116;SC 3260;EES 4760;CS 3251;CS 2231;CS 4260;CS 3281;CS 3270;BUS 2100;BUS 2400
Example #14
 def consultaVirutoso(self, texto):
     # tokenize the text with spaCy
     text = self.nlp(texto)
     tokenized_sentences = [sentence.text for sentence in text.sents]
     # apply entity styling to the analyzed text
     spacyText = displacy.render(text, style="ent")
     # declare empty lists
     datos = []
     datostype = []
     entidades = []
     for sentence in tokenized_sentences:
         for entity in self.nlp(sentence).ents:
             entidades.append(entity.text)
             palabra = self.limpiarDatos(entity)
             consulta = """
             SELECT ?s ?p ?o
                 WHERE 
                     { 
                         ?s ?p ?o .FILTER (regex(str(?s), "%s") || regex(str(?o), "%s")) .
                     }
                     """ % (palabra, palabra)
             self.sbcEndpoint.setQuery(consulta)
             # return the query results in JSON format
             self.sbcEndpoint.setReturnFormat(JSON)
             results = self.sbcEndpoint.query().convert()
             for result in results["results"]["bindings"]:
                 lista = []
                 listaS = result["s"]["value"]
                 listaP = result["p"]["value"]
                 listaO = result["o"]["value"]
                 lista.append(listaS)
                 lista.append(listaP)
                 lista.append(listaO)
                 datos.append(lista)
     for sentence in tokenized_sentences:
         for entity in self.nlp(sentence).ents:
             entidades.append(entity.text)
             palabra = self.limpiarDatos(entity)
             consultatype = """
             PREFIX caseav: <http://localhost:8080/Data/page/>
             SELECT ?o
                 WHERE 
                     { 
                        {?s  caseav:hasNombrePersona ?o .FILTER (regex(str(?o), "%s")) .}
                        UNION
                        {?s caseav:hashasApellidoPersona ?o .FILTER (regex(str(?o), "%s")) .}
                        UNION
                        {?s  caseav:hasCodigo ?o .FILTER (regex(Str(?o), "%s")) .} 
                        UNION
                        {?s  caseav:hasNombreCompletoPersona ?o .FILTER (regex(Str(?o), "%s")) .} 
                     }
                     """ % (palabra, palabra, palabra)
             self.sbcEndpoint.setQuery(consultatype)
             # return the query results in JSON format
             self.sbcEndpoint.setReturnFormat(JSON)
             results = self.sbcEndpoint.query().convert()
             for result in results["results"]["bindings"]:
                 listae = []
                 #listaSe = result["s"]["value"]
                 #listaPe = result["p"]["value"]
                 listaOe = result["o"]["value"]
                 #listae.append(listaSe)
                 #listae.append(listaPe)
                 listae.append(listaOe)
                 datostype.append(listae)
     return datos, entidades, spacyText, datostype
Example #15
async def nlp_display(request):
    txt = request.rel_url.query['txt']
    style = request.rel_url.query['style']
    doc = nlp(txt)
    svg = displacy.render(doc, style=style)
    return web.Response(body=svg)
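A minimal sketch (the route path and port are assumptions, not part of the snippet) of wiring this handler into an aiohttp application:

# Minimal sketch: register the handler on an aiohttp app (assumes `nlp` is loaded above).
from aiohttp import web

app = web.Application()
app.add_routes([web.get("/displacy", nlp_display)])
# Serves e.g. GET /displacy?txt=Hello%20world&style=dep
# web.run_app(app, port=8080)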
Example #16
print('\n----')

for ent in doc8.ents:
    print(ent.text + ' - ' + ent.label_ + ' - ' +
          str(spacy.explain(ent.label_)))

doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

for chunk in doc9.noun_chunks:
    print(chunk.text)

# DISPLAYING
from spacy import displacy

doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')
displacy.render(doc, style='dep', options={'distance': 110})
displacy.render(doc, style='ent', options={'distance': 110})

################################################################# LEMMATIZATION

doc1 = nlp(
    u"I am a runner running in a race because I love to run since I ran today")

for token in doc1:
    print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)


def show_lemmas(text):
    for token in text:
        print(
            f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')
Example #17
import spacy
from spacy import displacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'This is a sentence.')

doc = nlp(u'Rats are various medium-sized, long-tailed rodents.')
displacy.render(doc, style='dep', jupyter=True)
Example #18
  print(token.text,end="|")

for i in tok.ents:
  print(i.text+' '+i.label_+' '+str(spacy.explain(i.label_)))
#entity

#lets see chunks
doc="Honda Plan to start a new plant at Khegegaon worth $78.45 billion"
text=model(doc)
for i in text.noun_chunks:
  print(i.text)

from spacy import displacy
text="Apple to built a Mobile Manufacture company in Hongkong worth $76.8 million"
tok=model(text)
displacy.render(tok,style='dep',jupyter=True,options={'distance':110})
#parts of speech and relation of each word to the others

from spacy import displacy
text="Apple to built a Mobile Manufacture company in Hongkong worth $76.8 million"
tok=model(text)
displacy.render(tok,style='ent',jupyter=True)

# lemmatization
# Lemmatization, on the other hand, takes into consideration the morphological
# analysis of the words. To do so, it is necessary to have detailed dictionaries
# which the algorithm can look through to link the form back to its lemma.
# Again, you can see how it works with the same example words
# (mapping from text-word to lemma).
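Since the snippet ends before showing the lookup itself, here is a minimal sketch of the text-word to lemma mapping described above, reusing the `model` pipeline from earlier in this example (the sentence is illustrative):

# Minimal sketch: map each word form to its lemma (assumes `model` is the loaded pipeline).
for token in model("I am running in a race because I love to run"):
    print(f"{token.text:<10} -> {token.lemma_}")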
Example #19
def main():

    st.title("Your one stop NLP App")

    expander1 = st.beta_expander("Tokenization & Named Entity Recognition")
    with expander1:
        message = st.text_area("Your Text Below")
        col1, col2 = st.beta_columns(2)
        with col1:
            col1.header("Tokenize Your Text")
            if st.button("Show Entities"):
                nlp_result = text_analyzer(message)
        with col2:
            col2.header("NER")
            if st.button("Analyze"):
                docx = analyze_text(message)
                html = displacy.render(docx, style="ent")
                html = html.replace("\n\n", "\n")
                st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)

    # Sentiment Analysis
    expander2 = st.beta_expander("View Sentiment")
    with expander2:
        st.subheader(
            "Polarity:  It determines if the text expresses the positive, negative or neutral"
        )
        st.markdown(''' #### value lies in the range of [-1,1]
         1  : positive statement
         0  : neutral statement
        -1  : negative statement.''')
        st.subheader(
            "Subjectivity: It determines if the text is subjective or objective"
        )
        st.markdown(''' #### value lies in the range of [0,1]
          0  : Objective (Fact)
          1  : Subjective (Has emotions)''')

        message2 = st.text_area("Enter Your Text Below")
        if st.button("Show Sentiment metrices"):
            blob = TextBlob(message2)
            result_sentiment = blob.sentiment
            st.success(result_sentiment)

    # Summarization
    expander3 = st.beta_expander("Summarize your Text")
    with expander3:
        message3 = st.text_area("Add your text below")
        summary_options = st.selectbox("Choose Summarizer", ['Sumy', 'Gensim'])
        if st.button("Summarize"):
            if summary_options == 'Sumy':
                st.text("Using Sumy Summarizer ..")
                summary_result = sumy_summarizer(message3)
            elif summary_options == 'Gensim':
                st.text("Using Gensim Summarizer ..")
                summary_result = summarize(message3)
            else:
                st.warning("Using Default Summarizer")
                st.text("Using Gensim Summarizer ..")
                summary_result = summarize(message3)
            st.success(summary_result)

    # Dummy Data Generator
    expander4 = st.beta_expander("Generate Dummy Data")
    with expander4:
        column1, column2, column3 = st.beta_columns(3)
        with column1:
            number_to_gen = st.number_input("Number", 5, 5000)
        with column2:
            localized_providers = [
                "ar_AA", "ar_EG", "ar_JO", "ar_PS", "ar_SA", "bg_BG", "bs_BA",
                "cs_CZ", "de", "de_AT", "de_CH", "de_DE", "dk_DK", "el_CY",
                "el_GR", "en", "en_AU", "en_CA", "en_GB", "en_IE", "en_IN",
                "en_NZ", "en_PH", "en_TH", "en_US", "es", "es_CA", "es_ES",
                "es_MX", "et_EE", "fa_IR", "fi_FI", "fil_PH", "fr_CA", "fr_CH",
                "fr_FR", "fr_QC", "he_IL", "hi_IN", "hr_HR", "hu_HU", "hy_AM",
                "id_ID", "it_CH", "it_IT", "ja_JP", "ka_GE", "ko_KR", "la",
                "lb_LU", "lt_LT", "lv_LV", "mt_MT", "ne_NP", "nl_BE", "nl_NL",
                "no_NO", "or_IN", "pl_PL", "pt_BR", "pt_PT", "ro_RO", "ru_RU",
                "sk_SK", "sl_SI", "sv_SE", "ta_IN", "th", "th_TH", "tl_PH",
                "tr_TR", "tw_GH", "uk_UA", "zh_CN", "zh_TW"
            ]
            locale = st.multiselect("Select Locale",
                                    localized_providers,
                                    default="en_IN")
        with column3:
            profile_options_list = [
                'username', 'name', 'sex', 'address', 'mail', 'birthdate',
                'job', 'company', 'ssn', 'residence', 'current_location',
                'blood_group', 'website'
            ]
            profile_fields = st.multiselect("Fields",
                                            profile_options_list,
                                            default=['username', 'mail'])

        custom_fake = Faker(locale)
        data = [
            custom_fake.profile(fields=profile_fields)
            for i in range(number_to_gen)
        ]
        df = pd.DataFrame(data)

        st.dataframe(df)

        if st.button("Download"):
            make_downloadable_df_format(df)
Example #20
    #ps = PorterStemmer()
    #review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    aspect_term.append(review)

dataset['loc_start'] = dataset[' term_location'].replace("--[0-9]+",
                                                         "",
                                                         regex=True)
dataset['end'] = dataset[' term_location'].replace("[0-9]+--", "", regex=True)
loc_start = []
loc_start = dataset['loc_start'].tolist()
loc_end = []
loc_end = dataset['end'].tolist()

from spacy import displacy
displacy.render(dep_par[4], style='dep')


def check_similarity(text, aspect):
    for word in aspect.split():
        #print word,text
        if text == word:
            #print("True")
            return True
    return False


from spacy.symbols import nsubj, VERB, ADJ, amod, NOUN, acomp, dep, advmod, ccomp, pobj, prep, dobj, ADV, neg, attr


def get_adj(i, start_loc, end_loc):
Example #21
def _render_parses(i, to_render):
    to_render[0].user_data["title"] = "Batch %d" % i
    with Path("/tmp/parses.html").open("w") as file_:
        html = displacy.render(to_render[:5], style="dep", page=True)
        file_.write(html)
Example #22
def render_pos_html(list_of_docs):
    return displacy.render(map(lambda x: get_spacy_doc(x), list_of_docs),
                           style='dep',
                           page=True)
Example #23
        # #display text
        # text_placeholder.write(text, unsafe_allow_html=True)

        #render waveform graph
        data = stream.read(CHUNK)  #gives it to you in bytes
        npdata = np.frombuffer(data, np.int16)

        line.set_ydata(npdata)
        fig.canvas.draw()
        fig.canvas.flush_events()

        waveform_placeholder.write(fig)

    doc = nlp(text)
    #render HTML for NER tags
    html = displacy.render(doc, style="ent", options={"ents": ner_selection})
    ner_visual_placeholder.write(get_html_textbox(html),
                                 unsafe_allow_html=True)

    #Get sentiment
    if model in ['en_core_web_sm']:
        _, sentiment, sentiment_colour = get_polarity(doc)
        sentiment_placeholder.write(get_html_sentiment(sentiment,
                                                       sentiment_colour),
                                    unsafe_allow_html=True)
        _, subjectivity, subjectivity_colour = get_subjectivity(doc)
        subjectivity_placeholder.write(get_html_subjectivity(
            subjectivity, subjectivity_colour),
                                       unsafe_allow_html=True)
    else:
        polarity_placeholder.write(
Example #24
def cy(x):
    return displacy.render(x,
                           options=dict(compact=True,
                                        collapse_phrases=True,
                                        word_spacing=15,
                                        distance=100))
Example #25
                colors[format_label(label, "reject")] = COLOR_REJECT
            ner_example_i = st.selectbox(
                f"Merged examples ({len(merged_examples)})",
                range(len(merged_examples)),
                format_func=lambda i: merged_examples[int(i)]["text"][:400],
            )
            ner_example = merged_examples[int(ner_example_i)]
            doc = nlp.make_doc(ner_example["text"])
            ents = []
            for span in ner_example.get("spans", []):
                label = format_label(span["label"], span["answer"])
                ents.append(
                    doc.char_span(span["start"], span["end"], label=label))
            doc.ents = filter_spans(ents)
            html = displacy.render(doc,
                                   style="ent",
                                   options={"colors": colors})
            html = html.replace(
                "\n", " ")  # Newlines seem to mess with the rendering
            st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
            show_ner_example_json = st.checkbox("Show JSON example")
            if show_ner_example_json:
                st.json(ner_example)

            st.subheader("Train a model (experimental)")
            no_missing = st.checkbox(
                "Data is gold-standard and contains no missing values", False)
            start_blank = st.checkbox("Start with blank NER model", True)
            if st.button("🚀 Start training"):
                if start_blank:
                    ner = nlp.create_pipe("ner")
Example #26
import spacy
from spacy import displacy
import sys

import os

file = open(sys.argv[1], 'r')
text = file.read()
file.close()

prdnlp = spacy.load("neural network annotator")  #
doc = prdnlp(text)

html = displacy.render(doc, style="ent")
Html_file = open("render.html", "w")
Html_file.write(html)
Html_file.close()
Example #27
def main():
    '''
    NERS Demo w/ Sample Data
    '''
    # CONFIG  -------------------------------------------------- \\
    # ------------------------------------------------------------ \\

    # brnd, mpn, spplr
    model = 'post'   # pre -> use non-trained model / post -> use trained model

    mpn = 'on'  # on/off
    brnd = 'off'  # on/off
    cmmdty = 'off'  # on/off

    ruler = 'on'
    cleaner = 'on'
    number_tagger = 'off'

    # rem if stemmer is turned on after model does P2 training, then
    # you will need to use POS tag to detect nouns in products
    # then create new generator patterns for all.json
    # then run entity ruler again
    stemmer = 'off'

    # declare outputs
    brnd_pandas_file = r'C:\Users\stacy\Desktop\NERS Demo\out_pandas_brnd.xlsx'  # output
    mpn_pandas_file = r'C:\Users\stacy\Desktop\NERS Demo\out_pandas_mpn.xlsx'  # output
    cmmdty_pandas_file = r'C:\Users\stacy\Desktop\NERS Demo\out_pandas_cmmdty.xlsx'  # output

    # declare inputs
    mpn_file = r'C:\Users\stacy\Desktop\NERS Demo\ners_patterns_mpn.jsonl'  # input
    brnd_file = r'C:\Users\stacy\Desktop\NERS Demo\ners_patterns_brnd.jsonl'  # input
    cmmdty_file = r'C:\Users\stacy\Desktop\NERS Demo\ners_patterns_cmmdty.jsonl'  # input

    #   mpn   brnd    cmmdty      cases
    #   0       0       0           C1
    #   1       0       0           C2
    #   0       1       0           C3
    #   0       0       1           C4
    #   1       1       0           C5
    #   0       1       1           C6
    #   1       0       1           C7
    #   1       1       1           C8

    if mpn == 'off' and brnd == 'off' and cmmdty == 'off':              # C1
        patterns_file = mpn_file
    elif mpn == 'on' and brnd == 'off' and cmmdty == 'off':             # C2
        patterns_file = mpn_file
    elif mpn == 'off' and brnd == 'on' and cmmdty == 'off':             # C3
        patterns_file = brnd_file
    elif mpn == 'off' and brnd == 'off' and cmmdty == 'on':             # C4
        patterns_file = cmmdty_file
    elif mpn == 'on' and brnd == 'on' and cmmdty == 'off':              # C5
        patterns_file = combine_pattern_files(mpn_file, brnd_file)
    elif mpn == 'off' and brnd == 'on' and cmmdty == 'on':              # C6
        patterns_file = combine_pattern_files(brnd_file, cmmdty_file)
    elif mpn == 'on' and brnd == 'off' and cmmdty == 'on':              # C7
        patterns_file = combine_pattern_files(mpn_file, cmmdty_file)
    elif mpn == 'on' and brnd == 'on' and cmmdty == 'on':               # C8
        patterns_file = combine_pattern_files(mpn_file, brnd_file, cmmdty_file)

    tender_file = r'C:\Users\stacy\Desktop\NERS Demo\in_tender.csv'
    #tender_file = r'C:\Users\stacy\Desktop\NERS Demo\descriptions_nonstock.csv'
    write_type = 'w'

    # ------------------------------------------------------------ //
    # ---------------------------------------------------------- //

    # load model
    if model == 'pre':
        # load a language and invoke the entity ruler
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner']) #('en_core_web_sm', disable=['parser'])
    elif model == 'post':
        nlp = spacy.load('demo_model')

    nlp.add_pipe(sentence_segmenter, after='tagger')

    # add pipes
    if ruler == 'on':
        # rem if model is post then the entity ruler is already in the model
        if model == 'pre':
            # load patterns from external file only if model is not already trained
            nu_ruler = EntityRuler(nlp).from_disk(patterns_file)
            # putting the ruler before ner will override ner decisions in favor of ruler patterns
            nlp.add_pipe(nu_ruler)#, before='ner')
        # remember to swap precedence between ruler and ner after model training
        if model == 'post':
            # load patterns from external file only if model is not already trained
            if "entity_ruler" not in nlp.pipe_names:
                nu_ruler = EntityRuler(nlp).from_disk(patterns_file)
                # putting the ner before the ruler will favor ner decisions over ruler patterns
                nlp.add_pipe(nu_ruler)#, before='ner')

    # show pipeline components:
    print(nlp.pipe_names)

    # import test tender and clean it up
    tender = import_csv(tender_file)  # import
    if cleaner == 'on':
        tender = py_string_cleaner.clean_doc(tender)  #  clean

    doc = nlp(tender)

    # CONSOLE OUTPUT  ---------------------------------------------------------
    #   mpn   brnd    cmmdty      cases
    #   0       0       0           C1
    #   1       0       0           C2
    #   0       1       0           C3
    #   0       0       1           C4
    #   1       1       0           C5
    #   0       1       1           C6
    #   1       0       1           C7
    #   1       1       1           C8
    labels = []
    alt_labels = []
    if mpn == 'on' and brnd == 'off' and cmmdty == 'off':
        print('\n')
        labels = ['MPN']  # , 'PRODUCT', 'MPN', 'SKU']
        alt_labels = ['Mpn']  # , 'Product', 'MfrPartNo', 'SkuID']
        total_found = []
        total_unique_found = []
        for label in labels:
            print('Results for {} --------------'.format(label))
            tot_num = 0
            unique_num = 0
            unique = []
            for ent in doc.ents:
                if ent.label_ == label:
                    if ent.text not in unique:
                        unique.append(ent.text)
                        unique_num += 1
                    print([ent.text, ent.label_], end='')
                    tot_num += 1
            print('\nFound {} total, {} unique.\n'.format(tot_num, unique_num))
            total_found.append(tot_num)
            total_unique_found.append(unique_num)

    if mpn == 'off' and brnd == 'on':
        print('\n')
        labels = ['BRND']  # , 'PRODUCT', 'MPN', 'SKU']
        alt_labels = ['Brnd']  # , 'Product', 'MfrPartNo', 'SkuID']
        total_found = []
        total_unique_found = []
        for label in labels:
            print('Results for {} --------------'.format(label))
            tot_num = 0
            unique_num = 0
            unique = []
            for ent in doc.ents:
                if ent.label_ == label:
                    if ent.text not in unique:
                        unique.append(ent.text)
                        unique_num += 1
                    print([ent.text, ent.label_], end='')
                    tot_num += 1
            print('\nFound {} total, {} unique.\n'.format(tot_num, unique_num))
            total_found.append(tot_num)
            total_unique_found.append(unique_num)

    if mpn == 'on' and brnd == 'on':
        print('\n')
        labels = ['BRND', 'MPN']  # , 'PRODUCT', 'MPN', 'SKU']
        alt_labels = ['Brnd', 'Mpn']  # , 'Product', 'MfrPartNo', 'SkuID']
        total_found = []
        total_unique_found = []
        for label in labels:
            print('Results for {} --------------'.format(label))
            tot_num = 0
            unique_num = 0
            unique = []
            for ent in doc.ents:
                if ent.label_ == label:
                    if ent.text not in unique:
                        unique.append(ent.text)
                        unique_num += 1
                    print([ent.text, ent.label_], end='')
                    tot_num += 1
            print('\nFound {} total, {} unique.\n'.format(tot_num, unique_num))
            total_found.append(tot_num)
            total_unique_found.append(unique_num)

    # pandas output for mpns  ------------------------------------------------
    # This technique allows you to isolate entities on
    # a sentence-by-sentence basis, which will allow
    # for matching entities on a record-by-record basis
    if mpn == 'on':
        w_MpnCodes = []
        w_MpnCode_Alts = []
        unique = []
        mpn = ''
        alts = ''
        #ent_exists = False
        j = 0
        for sent in doc.sents:
            i = 0
            for ent in sent.ents:
                # ignore header record
                if j > 0:
                    if ent.label_ == 'MPN':
                        if i == 0:
                            # if it's the first label in the record, save it in mpns
                            mpn = ent.text
                            unique.append(ent.text)
                            i += 1
                        else:
                            # if it's not the first label in the sentence, put it in mpn alts
                            # (if it is already in alts, don't put it in)
                            if ent.text not in unique:
                                unique.append(ent.text)
                                if alts == '':
                                    alts = ent.text
                                else:
                                    alts = alts + ', ' + ent.text
                        #print(ent.label_, ': ', ent.text)

            # store ent results for each record, ignoring the headers
            if j > 0:
                w_MpnCodes.append(mpn.upper())
                w_MpnCode_Alts.append(alts.upper())

                # test ---------------
                print('str ', j, 'w_MpnCodes: ', w_MpnCodes)
                print('str ', j, 'w_MpnCode_Alts: ', w_MpnCode_Alts)
                # test ---------------

            # reset vars for next record
            unique.clear()
            mpn = ''
            alts = ''
            j += 1

        df = pd.DataFrame({ 'w_MpnCodes':w_MpnCodes,
                            'w_MpnCode_Alts':w_MpnCode_Alts})

        writer = pd.ExcelWriter(mpn_pandas_file)
        df.to_excel(writer,'NERS_MPNs', index=False)
        writer.save()

    # pandas output for brnds  ------------------------------------------------
    # This technique allows you to isolate entities on
    # a sentence-by-sentence basis, which will allow
    # for matching entities on a record-by-record basis
    if brnd == 'on':
        w_Brnds = []
        w_Brnd_Alts = []
        unique = []
        brnd_val = ''
        alts = ''
        #ent_exists = False
        j = 0
        for sent in doc.sents:
            i = 0
            for ent in sent.ents:
                # ignore header record
                if j > 0:
                    if ent.label_ == 'BRND':
                        if i == 0:
                            # if it's the first label in the record, save it in brnd
                            brnd_val = ent.text
                            unique.append(ent.text)
                            i += 1
                        else:
                            # if it's not the first label in the sentence, put it in brnd alts
                            # (if it is already in alts, don't put it in)
                            if ent.text not in unique:
                                unique.append(ent.text)
                                if alts == '':
                                    alts = ent.text
                                else:
                                    alts = alts + ', ' + ent.text
                        #print(ent.label_, ': ', ent.text)

            # store ent results for each record, ignoring the headers
            if j > 0:
                w_Brnds.append(brnd_val.upper())
                w_Brnd_Alts.append(alts.upper())

                # test ---------------
                print('str ', j, 'w_Brnds: ', w_Brnds)
                print('str ', j, 'w_Brnd_Alts: ', w_Brnd_Alts)
                # test ---------------

            # reset vars for next record
            unique.clear()
            brnd_val = ''
            alts = ''
            j += 1

        df2 = pd.DataFrame({ 'w_Brnds':w_Brnds,
                            'w_Brnd_Alts':w_Brnd_Alts})

        writer2 = pd.ExcelWriter(brnd_pandas_file)
        df2.to_excel(writer2,'NERS_Brnds', index=False)
        writer2.save()

    # pandas output for cmmdty  ------------------------------------------------
    # This technique allows you to isolate entities on
    # a sentence-by-sentence basis, which will allow
    # for matching entities on a record-by-record basis
    if cmmdty == 'on':
        w_Cmmdtys = []
        w_Cmmdty_Alts = []
        unique = []
        cmmdty_val = ''
        alts = ''
        #ent_exists = False
        j = 0
        for sent in doc.sents:
            i = 0
            for ent in sent.ents:
                # ignore header record
                if j > 0:
                    if ent.label_ == 'CMMDTY':
                        if i == 0:
                            # if it's the first label in the record, save it in cmmdty
                            cmmdty_val = ent.text
                            unique.append(ent.text)
                            i += 1
                        else:
                            # if it's not the first label in the sentence, put it in cmmdty alts
                            # (if it is already in alts, don't put it in)
                            if ent.text not in unique:
                                unique.append(ent.text)
                                if alts == '':
                                    alts = ent.text
                                else:
                                    alts = alts + ', ' + ent.text
                        #print(ent.label_, ': ', ent.text)

            # store ent results for each record, ignoring the headers
            if j > 0:
                w_Cmmdtys.append(cmmdty_val.upper())
                w_Cmmdty_Alts.append(alts.upper())

                # test ---------------
                print('str ', j, 'w_Cmmdty: ', w_Cmmdtys)
                print('str ', j, 'w_Cmmdty_Alts: ', w_Cmmdty_Alts)
                # test ---------------

            # reset vars for next record
            unique.clear()
            cmmdty_val = ''
            alts = ''
            j += 1

        df3 = pd.DataFrame({ 'w_Cmmdtys':w_Cmmdtys,
                            'w_Cmmdty_Alts':w_Cmmdty_Alts})

        writer3 = pd.ExcelWriter(cmmdty_pandas_file)
        df3.to_excel(writer3,'NERS_Cmmdtys', index=False)
        writer3.save()


    # save the model  --------------------------------------------------------
    # save model with entity pattern updates made by the entity ruler
    if ruler == "on":
        output_dir = Path('demo_model')
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

    # TEST  -----------------------------
    mpns = []

    # DISPLACY VISUALIZER -----------------------------------------------------
    # get results for html doc
    results = ''
    i = 0
    for item in alt_labels:
        results = results + '{}: {} tot  {} unq\n'.format(item, total_found[i], total_unique_found[i])
        i += 1
    # store nlp object as string in html var
    spacer = '---------------------------------------------------------\n'
    header = 'Named Entities Found in Target File:\n'
    doc = nlp(header + spacer + results + spacer + tender)
    doc.user_data["title"] = "Named Entity Resolution System (NERS)"
    colors = {
        "MPN": "#C3FFA1",
        "BRND": "#FFDDA1",
        "CMMDTY": "#F3DDA1"
    }
    options = {"ents": ["MPN", "BRND", "CMMDTY"], "colors": colors}
    # displacy.serve(doc, style="ent", options=options)
    html = displacy.render(doc, style="ent", page=True, options=options)  # use the entity visualizer
    # write the html string to the xampp folder and launch in browser through localhost port
    with open('C:/Users/stacy/My Localhost/index.html', 'w') as data:
        data.write(html)

    print('\n' + results)

    # end program
    print('Done.')
Example #28
if "parser" in nlp.pipe_names:
    st.header("Dependency Parse & Part-of-speech tags")
    st.sidebar.header("Dependency Parse")
    split_sents = st.sidebar.checkbox("Split sentences", value=True)
    collapse_punct = st.sidebar.checkbox("Collapse punctuation", value=True)
    collapse_phrases = st.sidebar.checkbox("Collapse phrases")
    compact = st.sidebar.checkbox("Compact mode")
    options = {
        "collapse_punct": collapse_punct,
        "collapse_phrases": collapse_phrases,
        "compact": compact,
    }
    docs = [span.as_doc() for span in doc.sents] if split_sents else [doc]
    for sent in docs:
        html = displacy.render(sent, options=options)
        # Double newlines seem to mess with the rendering
        html = html.replace("\n\n", "\n")
        if split_sents and len(docs) > 1:
            st.markdown(f"> {sent.text}")
        st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)

if "ner" in nlp.pipe_names:
    st.header("Named Entities")
    st.sidebar.header("Named Entities")
    default_labels = ["PERSON", "ORG", "GPE", "LOC"]
    labels = st.sidebar.multiselect("Entity labels",
                                    nlp.get_pipe("ner").labels, default_labels)
    html = displacy.render(doc, style="ent", options={"ents": labels})
    # Newlines seem to mess with the rendering
    html = html.replace("\n", " ")
Example #29
def tree_vis(fmted_dep_tree):
    displacy.render(fmted_dep_tree,
                    style="dep",
                    manual=True,
                    page=False,
                    minify=True)
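For reference, a minimal illustration of the manual "dep" input structure that tree_vis expects (the same words/arcs format shown in Example #1; the values here are illustrative):

# Illustrative input for tree_vis (manual displaCy "dep" format).
fmted_dep_tree = {
    "words": [{"text": "This", "tag": "DET"}, {"text": "works", "tag": "VERB"}],
    "arcs": [{"start": 0, "end": 1, "label": "nsubj", "dir": "left"}],
}
tree_vis(fmted_dep_tree)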
Example #30
# target_labels = ['Finding', 'Disease or Syndrome',
#                  'Sign or Symptom', 'Pathologic Function', 'Neoplastic Process', 'Other']

# st.write(doc.ents)

# for ent in doc.ents:
#     st.write(ent.text, ' - ', ent.label_)

html = displacy.render(doc,
                       style="ent",
                       options={
                           "ents": [
                               'FINDING', 'DISEASE OR SYNDROME',
                               'SIGN OR SYMPTOM', 'PATHOLOGIC FUNCTION',
                               'NEOPLASTIC PROCESS', 'OTHER'
                           ],
                           "colors": {
                               'FINDING': '#D0ECE7',
                               'DISEASE OR SYNDROME': '#D6EAF8',
                               'SIGN OR SYMPTOM': '#E8DAEF',
                               'PATHOLOGIC FUNCTION': '#FADBD8',
                               'NEOPLASTIC PROCESS': '#DAF7A6'
                           }
                       })
style = "<style>mark.entity { display: inline-block }</style>"
st.write(f"{style}{get_html(html)}", unsafe_allow_html=True)

data = [[
    str(getattr(ent, attr))
    for attr in ["text", "label_", "start", "end", "start_char", "end_char"]
] for ent in doc.ents
        # if ent.label_ in target_labels
Example #31
def render_output(
    output: Optional[Union[Dict[str, str], QuestionAnsweringOutput]] = None,
    answer: Optional[str] = None,
    context: Optional[str] = None,
    question: Optional[str] = None,
    label: str = 'ANSWER',
    title: str = 'Question',
    grad_deg: str = '90deg',
    grad_pair: List[str] = ['#aa9cfc', '#fc9ce7'],
    span: Optional[Tuple[int, int]] = None,
    style: str = "ent",
    manual: bool = True,
    jupyter: bool = True,
    page: bool = False,
    minify: bool = True,
    return_html: bool = False,
    manual_data: Optional[Dict[str, Any]] = None,
    options: Optional[Dict[str, Any]] = None,
):
    """DisplaCy Visualizer for QA-Model Outputs.

    :param output: An output from the question-answering model. The output
        can be a dictionary with any or all keys: `question, answer, context`.
        Or a `QuestionAnsweringOutput` type object - If answer param is None,
        then the first `top-scored` answer will be chosen automatically.
    :param answer: (optional) A string sequence to represent as the answer.
    :param context: (optional) A list of string sequences or a single string
        to represent as the context (if `List[str]` - sequences will be joined).
    :param span: Span for highlighting the answer within the context. If
        None, its detected automatically.
    :param options: Visualizer options; visit the link for official DOCS:
        `https://spacy.io/api/top-level#displacy_options`
    :param manual_data: Defaults to ENT, keys; `'text', 'ents', 'titles'`
        DOCS: `https://spacy.io/usage/visualizers#manual-usage`
    """
    if output is not None:
        if isinstance(output, dict):
            if 'question' in output:
                question = output['question']
            if 'answer' in output:
                answer = output['answer']
            if 'context' in output:
                context = output['context']

        elif all(hasattr(output, attr) for attr in ('q', 'c', 'sids')):
            question, context = output.q, output.c
            # select the first top answer, if none provided.
            if answer is None:
                answer = output[0].answer

    if context is not None:
        if isinstance(context, list):
            # Validate the element type before joining; after the join,
            # context[0] would only be the first character of the string.
            e = f'Found item in List[{type(context[0])}], but expected List[str]'
            assert isinstance(context[0], str), e
            context = ' '.join(context)

    start, end = span if span is not None else (0, 0)
    if span is None:
        # Escape the answer so regex metacharacters in it are matched literally.
        match = re.search(re.escape(answer), context)
        if match is not None:
            start, end = match.span()

    docs = dict() if manual_data is None else manual_data
    if manual_data is None:
        if style == "ent":
            docs["ents"] = [dict(start=start, end=end, label=label)]
            if len(context.strip()) > 1:
                docs['text'] = context
            if question is not None:
                docs['title'] = f"\n{title}: {question}\n"

    if options is None:
        if style == "dep":
            options = dict(compact=True, bg="#ed7118", color="#000000")
        else:
            options = dict(ents=None, colors=None)
            gradient = ", ".join([grad_deg] + grad_pair)
            colors = f"linear-gradient({gradient})"
            options.update({'ents': [label], 'colors': {label: colors}})

    if return_html:
        return displacy.render([docs], style=style, jupyter=False,
                               options=options, manual=manual)

    displacy.render([docs], style=style, page=page, minify=minify,
                    jupyter=jupyter, options=options, manual=manual)
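
# A short usage sketch for render_output, assuming a plain dictionary output
# (the QuestionAnsweringOutput branch works the same way; the strings below
# are illustrative only):
qa_output = {
    "question": "Where is Google starting from?",
    "answer": "behind",
    "context": "But Google is starting from behind.",
}
html = render_output(output=qa_output, return_html=True)  # returns the rendered HTML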
Пример #32
0
def main():
    #st.sidebar.title("About")
    if st.sidebar.button("About this app"):
        st.sidebar.info(
            "This is an auto summarizer app for text articles, extracting the most important sentences by using NLP algorithms. It helps us to save time in our busy schedules who prefer to read the summary of those articles before we decide to jump in for reading entire article."
        )

    #st.write('<style>body { margin: 0; font-family: font-family: Tangerine;font-size:48px, Helvetica, sans-serif;font-size: 30px;text-align: center;} .header{padding: 10px 16px; background: #eaf4ff; color: #111; position:fixed;top:0;text-align: center;} .sticky { position: center; top: 0; width: 100%;} </style><div class="header" id="myHeader">'+str('RESUNER')+'</div>', unsafe_allow_html=True)
    st.write(
        '<style>body { margin: 0; font-family: Tangerine, Helvetica, sans-serif; font-size: 30px; text-align: justify;} .header{padding: 10px 16px; background: #eaf4ff; color: #111; position: fixed; top: 0; text-align: center;} .sticky { position: fixed; top: 0; width: 100%;} </style><div class="header" id="myHeader">'
        + 'Summary Generator and Entity Recognizer' + '</div>',
        unsafe_allow_html=True)
    #st.title("Summary Generator and Entity checker")
    activities = [
        "Summarize", "Summarize for URL", "NER Checker", "NER for URL"
    ]
    choice = st.radio("Select Activity", activities)
    if choice == 'Summarize':
        st.info(
            "Please paste your text into the left side box & click the 'Summarize!' to view the summary"
        )
        st.sidebar.subheader("Summarization")
        raw_text = st.sidebar.text_area("Enter Text Here")
        #summary_choice = st.selectbox("Summary Choice",["Gensim","Sumy Lex Rank"])
        if st.sidebar.button("Summarize!"):
            summary_result = sumy_summarizer(raw_text)
            estimatedTime_org = readingTime(raw_text)
            #text_length = st.slider("Length to Preview",50,100)
            st.info(
                "Original Reading time - {} mins".format(estimatedTime_org))

            st.write(summary_result)
            estimatedTime_res = readingTime(summary_result)
            st.info("Summary Reading time - {} mins".format(estimatedTime_res))

            engine = pyttsx3.init(driverName='sapi5')
            #infile = "tanjil.txt"
            # f = open(infile, 'r')
            #theText = f.read()
            #f.close()

            #Saving part starts from here
            tts = gTTS(text=summary_result, lang='en')
            #saved_file=talkToMe(summary_result , lgg ='en')
            tts.save("saved_file.mp3")
            audio_file = open('saved_file.mp3', 'rb')
            audio_bytes = audio_file.read()
            st.audio(audio_bytes, format='audio/mp3', start_time=0)
        st.sidebar.subheader("Visualizations")
        visualize = ["Select", "WordCloud", "Bigrams", "Trigrams"]
        choice2 = st.sidebar.selectbox("Visualize", visualize)
        #if choice2 == "Only Summary":

        if choice2 == "WordCloud":
            c_text = raw_text
            #plt.figure(figsize=[70,50])
            maskArray = np.array(Image.open("comment.png"))
            wordcloud = WordCloud(max_font_size=200,
                                  max_words=3000,
                                  margin=10,
                                  background_color='white',
                                  mask=maskArray,
                                  contour_width=3,
                                  contour_color='black',
                                  scale=3,
                                  relative_scaling=0.5,
                                  width=1900,
                                  height=1900,
                                  random_state=1).generate(c_text)
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            st.pyplot()

        if choice2 == "Bigrams":
            c_text = ngrams((raw_text), 2)
            for i in range(0, len(c_text)):
                c_text[i] = " ".join(c_text[i])
            Bigram_Freq = nltk.FreqDist(c_text)
            maskArray = np.array(Image.open("comment.png"))

            #bigram_wordcloud = WordCloud(random_state = 21).generate_from_frequencies(Bigram_Freq)
            #plt.figure(figsize = (50,25))
            bigram_wordcloud = WordCloud(
                max_font_size=150,
                max_words=2000,
                margin=10,
                background_color='white',
                mask=maskArray,
                contour_width=3,
                contour_color='black',
                scale=3,
                relative_scaling=0.5,
                width=900,
                height=900,
                random_state=1).generate_from_frequencies(Bigram_Freq)
            #plt.figure(figsize = (50,25))
            plt.imshow(bigram_wordcloud, interpolation='bilinear')
            plt.axis("off")
            #            maskArray = np.array(Image.open("C:/Users/NAKKANA1/OneDrive - Novartis Pharma AG/Desktop/aws_study/streamlit/wordcloudsummy/cloud2.png"))
            #wordCloud = WordCloud(max_words=WC_max_words, height=WC_height, width=WC_width,stopwords=stopwords_wc, background_color='white', mask = maskArray).generate_from_frequencies(dict(words_freq))
            #            wordCloud = WordCloud(max_font_size=150,max_words=2000, margin=10, background_color='white', mask = maskArray,
            #                                  scale=3, relative_scaling = 0.5, width=900, height=900,random_state=1).generate_from_frequencies(c_text)
            #            plt.title('Most frequently occurring bigrams connected by same colour and font size')
            #            plt.imshow(wordCloud, interpolation='bilinear')
            #            plt.axis("off")
            #return st.pyplot()
            st.pyplot()

        if choice2 == "Trigrams":
            c_text = ngrams((raw_text), 3)
            for i in range(0, len(c_text)):
                c_text[i] = " ".join(c_text[i])
            trigram_Freq = nltk.FreqDist(c_text)
            maskArray = np.array(Image.open("comment.png"))
            #bigram_wordcloud = WordCloud(random_state = 21).generate_from_frequencies(Bigram_Freq)
            #plt.figure(figsize = (50,25))
            trigram_wordcloud = WordCloud(
                max_font_size=150,
                max_words=200,
                margin=10,
                background_color='white',
                mask=maskArray,
                contour_width=3,
                contour_color='black',
                scale=3,
                relative_scaling=0.5,
                width=900,
                height=900,
                random_state=1).generate_from_frequencies(trigram_Freq)
            #plt.figure(figsize = (50,25))
            plt.imshow(trigram_wordcloud, interpolation='bilinear')
            plt.axis("off")
            st.pyplot()

    #st.write('<style>body { margin: 0; font-family: Arial, Helvetica, sans-serif;} .header{padding: 10px 16px; background: #7f78d2; color: #f1f1f1; position:fixed;top:0;} .sticky { position: fixed; top: 0; width: 100%;} </style><div class="header" id="myHeader">'+str('Summarator')+'</div>', unsafe_allow_html=True)

    if choice == 'NER Checker':
        st.info(
            "About NER Checker: Named-entity recognition (NER) automatically identifies names of people, places, products & organizations. The entities displayed here is PERSON, NORP (nationalities, religious and political groups), FAC (buildings, airports etc.), ORG (organizations), GPE (countries, cities etc.), LOC (mountain ranges, water bodies etc.), PRODUCT (products), EVENT (event names), WORK_OF_ART (books, song titles), LAW (legal document titles), LANGUAGE (named languages), DATE, TIME, PERCENT, MONEY, QUANTITY, ORDINAL and CARDINAL"
        )

        st.sidebar.subheader("Entity Recognition")
        raw_text = st.sidebar.text_area("Enter Text Here", "Type Here")
        if st.sidebar.button("Analyze!"):
            # NLP
            docx = analyze_text(raw_text)
            html = displacy.render(docx, style='ent')
            html = html.replace("\n\n", "\n")
            #st.write(html,unsafe_allow_html=True)
            st.markdown(html, unsafe_allow_html=True)

    if choice == 'NER for URL':
        st.info(
            "About NER Checker: Named-entity recognition (NER) automatically identifies names of people, places, products & organizations. The entities displayed here is PERSON, NORP (nationalities, religious and political groups), FAC (buildings, airports etc.), ORG (organizations), GPE (countries, cities etc.), LOC (mountain ranges, water bodies etc.), PRODUCT (products), EVENT (event names), WORK_OF_ART (books, song titles), LAW (legal document titles), LANGUAGE (named languages), DATE, TIME, PERCENT, MONEY, QUANTITY, ORDINAL and CARDINAL"
        )

        st.sidebar.subheader("Analyze text from URL")
        raw_url = st.sidebar.text_input("Enter URL Here", "Type here")
        #text_preview_length = st.slider("Length to Preview",50,100)
        if st.sidebar.button("Analyze"):
            if raw_url != "Type here":
                result = get_text(raw_url)
                #       len_of_full_text = len(result)
                #       len_of_short_text = round(len(result)/text_preview_length)
                #       st.success("Length of Full Text::{}".format(len_of_full_text))
                #       st.success("Length of Short Text::{}".format(len_of_short_text))
                #       st.info(result[:len_of_short_text])
                #summarized_docx = sumy_summarizer(result)
                docx = analyze_text(result)
                html = displacy.render(docx, style="ent")
                html = html.replace("\n\n", "\n")
                #st.write(HTML_WRAPPER1.format(html),unsafe_allow_html=True)
                st.markdown(html, unsafe_allow_html=True)

    if choice == 'Summarize for URL':
        st.info(
            "Please paste your url into the left side box & click the 'Summarize!' to view the summary"
        )

        st.sidebar.subheader("Summary from URL")
        raw_url = st.sidebar.text_input("Enter URL", "Type here")
        #text_length = st.sidebar.slider("Length to Preview",50,100)
        #        text_length = st.slider("Length to Preview",50,100)
        if st.sidebar.button("Summarize!"):
            if raw_url != "Type here":
                result = get_text(raw_url)
                estimatedTime_org_url = readingTime(result)
                #text_length = st.slider("Length to Preview",50,100)
                #st.info("Original Reading time - {} mins".format(estimatedTime_org_url))

                #len_of_full_text = len(result)
                #len_of_short_text = round(len(result)/text_length)
                #st.info("Length::Full Text::{}".format(len_of_full_text))
                #st.info("Length::Short Text::{}".format(len_of_short_text))
                #st.write(result[:len_of_short_text])
                summary_result_url = sumy_summarizer(result)
                st.write(summary_result_url)
                estimatedTime_res_url = readingTime(summary_result_url)
                st.info("Summary Reading time - {} mins".format(
                    estimatedTime_res_url))
                engine = pyttsx3.init(driverName='sapi5')
                #infile = "tanjil.txt"
                # f = open(infile, 'r')
                #theText = f.read()
                #f.close()

                #Saving part starts from here
                tts = gTTS(text=summary_result_url, lang='en')
                #saved_file2=talkToMe(summary_result_url , lgg ='en')
                tts.save("saved_file3.mp3")
                audio_file2 = open('saved_file3.mp3', 'rb')
                audio_bytes2 = audio_file2.read()
                st.audio(audio_bytes2, format='audio/mp3', start_time=0)
        st.sidebar.subheader("Visualizations")
        visualize = ["Select", "WordCloud", "Bigrams", "Trigrams"]
        choice2 = st.sidebar.selectbox("Visualize", visualize)
        #if choice2 == "Only Summary":

        if choice2 == "WordCloud":
            if raw_url != "Type here":
                result = get_text(raw_url)
                c_text = result
                #plt.figure(figsize=[70,50])
                maskArray = np.array(Image.open("comment.png"))
                wordcloud = WordCloud(max_font_size=200,
                                      max_words=3000,
                                      margin=10,
                                      background_color='white',
                                      mask=maskArray,
                                      contour_width=3,
                                      contour_color='black',
                                      scale=3,
                                      relative_scaling=0.5,
                                      width=1900,
                                      height=1900,
                                      random_state=1).generate(c_text)
                plt.imshow(wordcloud, interpolation='bilinear')
                plt.axis("off")
                st.pyplot()

        if choice2 == "Bigrams":
            if raw_url != "Type here":
                result = get_text(raw_url)
                c_text = ngrams((result), 2)
                for i in range(0, len(c_text)):
                    c_text[i] = " ".join(c_text[i])
                Bigram_Freq_u = nltk.FreqDist(c_text)
                maskArray = np.array(Image.open("comment.png"))

                #bigram_wordcloud = WordCloud(random_state = 21).generate_from_frequencies(Bigram_Freq)
                #plt.figure(figsize = (50,25))
                bigram_wordcloud_u = WordCloud(
                    max_font_size=150,
                    max_words=2000,
                    margin=10,
                    background_color='white',
                    mask=maskArray,
                    contour_width=3,
                    contour_color='steelblue',
                    scale=3,
                    relative_scaling=0.5,
                    width=900,
                    height=900,
                    random_state=1).generate_from_frequencies(Bigram_Freq_u)
                #plt.figure(figsize = (50,25))
                plt.imshow(bigram_wordcloud_u, interpolation='bilinear')
                plt.axis("off")
                st.pyplot()

        if choice2 == "Trigrams":
            if raw_url != "Type here":
                result = get_text(raw_url)
                c_text = ngrams((result), 3)
                for i in range(0, len(c_text)):
                    c_text[i] = " ".join(c_text[i])
                trigram_Freq_u = nltk.FreqDist(c_text)
                maskArray = np.array(Image.open("comment.png"))

                #bigram_wordcloud = WordCloud(random_state = 21).generate_from_frequencies(Bigram_Freq)
                #plt.figure(figsize = (50,25))
                trigram_wordcloud_u = WordCloud(
                    max_font_size=150,
                    max_words=200,
                    margin=10,
                    background_color='white',
                    mask=maskArray,
                    contour_width=3,
                    contour_color='black',
                    scale=3,
                    relative_scaling=0.5,
                    width=900,
                    height=900,
                    random_state=1).generate_from_frequencies(trigram_Freq_u)
                #plt.figure(figsize = (50,25))
                plt.imshow(trigram_wordcloud_u, interpolation='bilinear')
                plt.axis("off")
                st.pyplot()

    st.sidebar.title("")
    st.sidebar.info("Connect: [email protected]")
Пример #33
0
def main():
    nlp = spacy.load('en_core_web_sm')

    # sentence functions
    print('sentence example: ---------------------------')
    sent = nlp(sentences[0])
    print(sent.text)

    for token in sent:
        print(token.text, token.pos_, token.dep_)

    print('\n')

    # string example functions
    print('string example: ---------------------------')
    sampleString = u"I can't imagine spending $3000 for a single bedroom apartment in N.Y.C."
    sample_doc = nlp(sampleString)  # avoid shadowing the built-in str
    print(sample_doc.text)

    for token in sample_doc:
        print(token.text, token.pos_, token.dep_)

    print('\n')

    # product file functions
    print('products example 1: ---------------------------')
    infile = open('products_DescriptionOnly_short.csv', 'rt')
    print(infile.read(), '\n')

    # reset cursor
    infile.seek(0)

    # start for
    for line in infile:
        nextLine = line.rstrip()
        # nextStr = nextLine
        nlpStr = nlp(nextLine)

        for token in nlpStr:
            print(token.text, token.pos_, token.dep_)

        print('\n')
    # end for

    # close input data file
    infile.close()

    # product file functions (2)
    print('products example 2: ---------------------------')
    # print all data
    infile = open('products_DescriptionOnly.csv', 'rt')
    fData = infile.read()

    # the doc object is processed as it is passed
    # to the language object
    nlpData = nlp(fData)
    print(nlpData)

    # print tokens
    print('\ntokens:')
    for tok in nlpData[:6]:
        print('{} -> {} -> {}'.format(tok.text, tok.pos_, tok.ent_type_))

    # print entities
    print('\nentities:')
    for ent in nlpData.ents:
        print('{} --> {}'.format(ent.text, ent.label_))

    # print persons
    # rem: NLTK comes with pre-trained models for splitting text
    #      to sentences and sentences to words
    print('\n')
    orgNum = 0
    carNum = 0
    perNum = 0
    print('ORGs:')
    for ent in nlpData.ents:
        if ent.label == spacy.symbols.ORG:
            orgNum += 1
            print(ent.text)
        if ent.label == spacy.symbols.CARDINAL:
            carNum += 1
        if ent.label == spacy.symbols.PERSON:
            perNum += 1
        # end if
    print('\n')
    print('# of ORG: ', orgNum)
    print('# of CARDINAL: ', carNum)
    print('# of PERSON: ', perNum)
    infile.close()

    # examine additional spacy functions
    print('\nexplore additional spacy functions: ---------------')
    for token in nlpData[:6]:
        print('token.text: ', token.text)  # the original string
        print('token.ent_type_: ', token.ent_type_)  # named entity type
        print('token.ent_iob_: ', token.ent_iob_)  # IOB code of the entity tag
        print('token.pos_: ', token.pos_)  # the coarse-grained part of speech
        print('token.tag_: ', token.tag_)  # the fine-grained part-of-speech tag
        print('token.dep_: ', token.dep_)  # dependency relation
        print('token.head.text: ', token.head.text)  # navigate up the tree
        print('token.lefts: ', list(token.lefts))  # the token's left children
        print('token.rights: ', list(token.rights))  # the token's right children
        print('\n-----------------')

    # apply more spacy features to a string
    nuDoc = nlp('This is an SKF product called Ball Bearing for $45 USD')
    for token in nuDoc:
        print('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}'.format(
            token.text,  # original string
            token.idx,  # index
            token.lemma_,  # base form of the word
            token.is_punct,  # bool: is it punctuation
            token.is_space,  # bool: is it a space
            token.shape_,  # visual signature ie: Xxxxx
            token.pos_,  # part of speech
            token.tag_  # fine-grained part-of-speech tag
        ))
    # end for

    # test displaCy
    # viewable in jupyter notebook
    print(
        '\ndisplaCy snippet for jupyter notebook ---------------------------')
    doc = nlp(
        'I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ'
    )
    displacy.render(doc, style='ent', jupyter=True)

    # test the chunker
    print('\ntest the chunker 1 -----------')
    doc = nlp(
        "Wall Street Journal just published an interesting piece on crypto currencies"
    )
    for chunk in doc.noun_chunks:
        print(chunk.text, chunk.label_, chunk.root.text)

    # test the chunker
    print('\ntest the chunker 2 -----------')
    doc = nlp(
        'Bore Diameter 40mm inner ring width 23 mm spherical roller bearing')
    for chunk in doc.noun_chunks:
        print(chunk.text, chunk.label_, chunk.root.text)

    # test span object
    print('\ntest span object -----------')
    span = doc[2:6]  # 40mm inner ring
    print(span.text)

    # test lexical attributes
    print('\ntest lexical attributes ---------------')
    doc = nlp("It costs $5.")
    print('Text:    ', 'It costs $5')
    print('Index:   ', [token.i for token in doc])
    print('Text:    ', [token.text for token in doc])
    print('is_alpha:', [token.is_alpha for token in doc])
    print('is_punct:', [token.is_punct for token in doc])
    print('like_num:', [token.like_num for token in doc])

    # test the dependency parser
    print('\ntest the dependency parser -----------')
    doc = nlp(
        'Wall Street Journal just published an interesting piece on crypto currencies'
    )
    for token in doc:
        print("{0}/{1} <--{2}-- {3}/{4}".format(token.text, token.tag_,
                                                token.dep_, token.head.text,
                                                token.head.tag_))

    # end program
    print('\nDone.')
Пример #34
0
 def entity(self, filename):
     nlp = spacy.load("en_core_web_sm")
     doc = nlp(self.text)
     html_entity = displacy.render(doc, style="ent")
     output_path = Path(filename)
     output_path.open("w", encoding="utf-8").write(html_entity)
Пример #35
0
def test_displacy_spans(en_vocab):
    """Test that displaCy can render Spans."""
    doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
    html = displacy.render(doc[1:4], style="ent")
    assert html.startswith("<div")
Пример #36
0
def visualizeParsedText(sent):
    doc = nlp(sent)
    displacy.render(doc, style='dep', jupyter=True)
Пример #37
0
def visualize_ent(doc, context=True, sections=True, jupyter=True, colors=None):
    """Create a NER-style visualization
    for targets and modifiers in Doc.
    doc (Doc): A spacy doc
    context (bool): Whether to display the modifiers generated by medSpaCy's cycontext.
        If the doc has not been processed by context, this will be automatically
        changed to False. Default True.
    sections (bool): Whether to display the section titles generated by medSpaCy's
        sectionizer (still in development). If the doc has not been processed by
        the sectionizer, this will be automatically changed to False. This may also
        have some overlap with cycontext, in which case duplicate spans will be
        displayed. Default True.
    jupyter (bool): If True, will render directly in a Jupyter notebook. If
        False, will return the HTML. Default True.
    colors (dict or None): An optional dictionary which maps labels of targets and modifiers
        to color strings to be rendered. If None, will create a generator which
        cycles through the default matplotlib colors for ent and modifier labels
        and uses a light gray for section headers. Default None.
    """
    # Make sure that doc has the custom medSpaCy attributes registered
    if not hasattr(doc._, "context_graph"):
        context = False
    if not hasattr(doc._, "sections"):
        sections = False

    ents_data = []

    for target in doc.ents:
        ent_data = {
            "start": target.start_char,
            "end": target.end_char,
            "label": target.label_.upper(),
        }
        ents_data.append((ent_data, "ent"))

    if context:
        visualized_modifiers = set()
        for target in doc.ents:
            for modifier in target._.modifiers:
                if modifier in visualized_modifiers:
                    continue
                ent_data = {
                    "start": modifier.span.start_char,
                    "end": modifier.span.end_char,
                    "label": modifier.category,
                }
                ents_data.append((ent_data, "modifier"))
                visualized_modifiers.add(modifier)
    if sections:
        for (title, header, _) in doc._.sections:
            if title is None:
                continue
            ent_data = {
                "start": header.start_char,
                "end": header.end_char,
                "label": f"<< {title.upper()} >>",
            }
            ents_data.append((ent_data, "section"))
    if len(ents_data) == 0:  # No data to display
        viz_data = [{"text": doc.text, "ents": []}]
        options = dict()
    else:
        ents_data = sorted(ents_data, key=lambda x: x[0]["start"])

        # If colors aren't defined, generate color mappings for each entity and modifier label
        # And set all section titles to a light gray
        if colors is None:
            labels = set()
            section_titles = set()
            for (ent_data, ent_type) in ents_data:
                if ent_type in ("ent", "modifier"):
                    labels.add(ent_data["label"])
                elif ent_type == "section":
                    section_titles.add(ent_data["label"])
            colors = _create_color_mapping(labels)
            for title in section_titles:
                colors[title] = "#dee0e3"
        ents_display_data, _ = zip(*ents_data)
        viz_data = [{"text": doc.text, "ents": ents_display_data,}]

        options = {
            "colors": colors,
        }
    return displacy.render(
        viz_data, style="ent", manual=True, options=options, jupyter=jupyter
    )
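
# A brief usage sketch for visualize_ent, assuming a medSpaCy pipeline with the
# context and sectionizer components loaded (model and text are illustrative):
import medspacy

nlp = medspacy.load()
doc = nlp("Past Medical History: diabetes. No evidence of pneumonia.")
html = visualize_ent(doc, jupyter=False)  # returns HTML instead of rendering inline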
Пример #38
0
show_ents(doc)

for i in range(len(sentences)):
    doc = nlp(sentences[i])
    for token in doc:
        print(token.text, token.lemma_, token.pos_, token.is_stop)

for i in range(len(sentences)):
    doc = nlp(sentences[i])
    for ent in doc.ents:
        print(ent.text, ent.label_)
html = []

doc = nlp(sentences[5])
html = displacy.render(doc, style="ent", jupyter=False)  #NER

html  #View in HTML Viewer

# Write a function to display basic entity info:

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag


def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    return sent
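
# The comment above announces an entity-info helper that never appears in the
# snippet. A plausible NLTK-based sketch, reusing preprocess() (the function
# name and output format are assumptions):
from nltk import ne_chunk


def show_basic_entity_info(sent):
    # Tokenize, POS-tag, then chunk named entities with NLTK's pre-trained chunker.
    tree = ne_chunk(pos_tag(preprocess(sent)))
    for subtree in tree.subtrees():
        if subtree.label() != 'S':
            entity = " ".join(token for token, tag in subtree.leaves())
            print(entity, '-', subtree.label())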

Пример #39
0
def display_dep(doc):
    displacy.render(doc, style="dep")
Пример #40
0
    def train_ner(self,
                  train_data,
                  model=None,
                  new_model_name="german_modified",
                  output_dir=None,
                  n_iter=30,
                  labels=None,
                  test_model=False):
        """Set up the pipeline and entity recognizer, and train the new entity."""
        # training data format:
        # TRAIN_DATA = [
        #     (
        #         "Horses are too tall and they pretend to care about your feelings",
        #         {"entities": [(0, 6, LABEL)]},
        #     ),
        #     ("Do they bite?", {"entities": []}),
        # ]
        TRAIN_DATA = train_data

        random.seed(0)
        # Add entity recognizer to model if it's not in the pipeline
        # nlp.create_pipe works for built-ins that are registered with spaCy
        if "ner" not in nlp.pipe_names:
            ner = nlp.create_pipe("ner")
            nlp.add_pipe(ner)
        # otherwise, get it, so we can add labels to it
        else:
            ner = nlp.get_pipe("ner")

        # add new entity labels to the entity recognizer
        for label in labels:
            ner.add_label(label)
        optimizer = nlp.resume_training()
        move_names = list(ner.move_names)
        # get names of other pipes to disable them during training
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
        train_losses = []
        with nlp.disable_pipes(*other_pipes):  # only train NER
            sizes = compounding(1.0, 4.0, 1.001)
            # batch up the examples using spaCy's minibatch
            for itn in range(n_iter):
                random.shuffle(TRAIN_DATA)
                batches = minibatch(TRAIN_DATA, size=sizes)
                losses = {}
                for batch in batches:
                    texts, annotations = zip(*batch)
                    nlp.update(texts,
                               annotations,
                               sgd=optimizer,
                               drop=0.35,
                               losses=losses)
                # print("Losses", losses)
                train_losses.append(losses)

        # test the trained model
        test_text = "Do you like horses?"
        doc = nlp(test_text)
        print("Entities in '%s'" % test_text)
        displacy.render(doc, style='ent', jupyter=True)

        # save model to output directory
        if output_dir is not None:
            output_dir = Path(output_dir)
            if not output_dir.exists():
                output_dir.mkdir()
            nlp.meta["name"] = new_model_name  # rename model
            nlp.to_disk(output_dir)
            print("Saved model to: ", output_dir)

            if test_model:
                # test the saved model
                print("Loading from", output_dir)
                nlp2 = spacy.load(output_dir)
                # Check the classes have loaded back consistently
                assert nlp2.get_pipe("ner").move_names == move_names
                doc2 = nlp2(test_text)
                for ent in doc2.ents:
                    print(ent.label_, ent.text)

        return train_losses
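
# A usage sketch following the training-data format described in the comment at
# the top of train_ner (the host class name, label and output directory are
# assumptions; they do not appear in the original snippet):
LABEL = "ANIMAL"
TRAIN_DATA = [
    (
        "Horses are too tall and they pretend to care about your feelings",
        {"entities": [(0, 6, LABEL)]},
    ),
    ("Do they bite?", {"entities": []}),
]

trainer = NerTrainer()  # hypothetical class that defines train_ner()
losses = trainer.train_ner(TRAIN_DATA,
                           output_dir="./ner_model",
                           n_iter=30,
                           labels=[LABEL])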