def test_singlegrank_n_keyterms(spacy_doc):
    """Requesting the top 5 singlerank terms yields exactly 5 results."""
    expected = [
        'new york times jerusalem bureau',
        'new york times',
        'friedman',
        'foreign reporting',
        'international reporting',
    ]
    ranked = keyterms.singlerank(spacy_doc, n_keyterms=5)
    observed = [term for term, _score in ranked]
    assert len(observed) == len(expected)
def keywords():
    """Endpoint: extract blended keywords from a posted JSON document.

    Expects a JSON body with ``title`` and ``content`` keys.  Runs three
    textacy keyterm algorithms (sgrank, textrank, singlerank), weights and
    sums their scores, and returns the ranked terms as one delimited
    string: terms joined by ``,,`` with a ``||`` marker inserted before
    the bottom half of the ranking.

    Returns:
        str: the delimited keyword string described above.
    """
    arg = request.get_json()
    # BUG FIX: `unicode(...)` is a Python-2-only builtin and raises
    # NameError on Python 3; a plain str literal is correct there.
    doc = textacy.Doc(
        arg['content'],
        metadata={'title': arg['title']},
        lang='en_core_web_sm')

    sgrank_keywords = dict(keyterms.sgrank(doc))
    singlerank_keywords = dict(keyterms.singlerank(doc))
    textrank_keywords = dict(keyterms.textrank(doc))

    # Weight each algorithm's scores before blending; sgrank dominates.
    # (Updating values in place is safe: no keys are added or removed.)
    sgrank_keywords.update(
        (term, score * 0.9) for term, score in sgrank_keywords.items())
    textrank_keywords.update(
        (term, score * 0.05) for term, score in textrank_keywords.items())
    singlerank_keywords.update(
        (term, score * 0.05) for term, score in singlerank_keywords.items())

    # Counter addition sums the weighted scores of terms found by more
    # than one algorithm (non-positive totals are dropped by Counter).
    keywords = dict(
        Counter(sgrank_keywords)
        + Counter(textrank_keywords)
        + Counter(singlerank_keywords))
    sorted_keywords = sorted(
        keywords.items(), key=operator.itemgetter(1), reverse=True)

    # Build the output with join instead of repeated += (quadratic).
    parts = []
    half = int(len(sorted_keywords) / 2)
    last = len(sorted_keywords) - 1
    for i, (term, _score) in enumerate(sorted_keywords):
        if i == half:
            parts.append("||")
        if i == last or i == half - 1:
            parts.append(term)
        else:
            parts.append(term + ",,")
    return "".join(parts)
def test_singlegrank(spacy_doc):
    """Default singlerank output has the expected number of terms."""
    expected = [
        'new york times jerusalem bureau',
        'new york times',
        'friedman',
        'foreign reporting',
        'international reporting',
        'pulitzer prize',
        'book award',
        'press international',
        'president george',
        'beirut',
    ]
    observed = []
    for term, _score in keyterms.singlerank(spacy_doc):
        observed.append(term)
    assert len(observed) == len(expected)
def key_terms(self, algorithm='sgrank', n=10):
    """
    Extract key terms from a document using `algorithm`.

    Args:
        algorithm (str {'sgrank', 'textrank', 'singlerank'}, optional):
            name of algorithm to use for key term extraction
        n (int or float, optional): if int, number of top-ranked terms
            to return as keyterms; if float, must be in the open interval
            (0.0, 1.0), representing the fraction of top-ranked terms to
            return as keyterms

    Returns:
        list[(str, float)]: sorted list of top `n` key terms and their
            corresponding scores

    Raises:
        ValueError: if ``algorithm`` not in
            {'sgrank', 'textrank', 'singlerank'}

    .. seealso:: :func:`keyterms.sgrank() <textacy.keyterms.sgrank>`
    .. seealso:: :func:`keyterms.textrank() <textacy.keyterms.textrank>`
    .. seealso:: :func:`keyterms.singlerank() <textacy.keyterms.singlerank>`
    """
    # Guard-clause dispatch: each recognized algorithm returns directly.
    if algorithm == 'textrank':
        return keyterms.textrank(self.spacy_doc, n_keyterms=n)
    if algorithm == 'singlerank':
        return keyterms.singlerank(self.spacy_doc, n_keyterms=n)
    if algorithm == 'sgrank':
        return keyterms.sgrank(
            self.spacy_doc, window_width=1500, n_keyterms=n)
    raise ValueError('algorithm {} not a valid option'.format(algorithm))
def key_terms(self, algorithm='sgrank', n=10):
    """
    Extract key terms from a document using `algorithm`.

    Args:
        algorithm (str {'sgrank', 'textrank', 'singlerank'}, optional):
            name of algorithm to use for key term extraction
        n (int or float, optional): if int, number of top-ranked terms
            to return as keyterms; if float, must be in the open interval
            (0.0, 1.0), representing the fraction of top-ranked terms to
            return as keyterms

    Returns:
        list[(str, float)]: sorted list of top `n` key terms and their
            corresponding scores

    Raises:
        ValueError: if ``algorithm`` not in
            {'sgrank', 'textrank', 'singlerank'}

    .. seealso:: :func:`keyterms.sgrank() <textacy.keyterms.sgrank>`
    .. seealso:: :func:`keyterms.textrank() <textacy.keyterms.textrank>`
    .. seealso:: :func:`keyterms.singlerank() <textacy.keyterms.singlerank>`
    """
    # Dispatch table: each entry is a thunk so only the chosen
    # algorithm actually runs.
    extractors = {
        'sgrank': lambda: keyterms.sgrank(
            self.spacy_doc, window_width=1500, n_keyterms=n),
        'textrank': lambda: keyterms.textrank(self.spacy_doc, n_keyterms=n),
        'singlerank': lambda: keyterms.singlerank(
            self.spacy_doc, n_keyterms=n),
    }
    if algorithm not in extractors:
        raise ValueError(
            'algorithm {} not a valid option'.format(algorithm))
    return extractors[algorithm]()
def test_singlegrank_norm_normalized_str(spacy_doc):
    """singlerank with a callable normalizer returns the expected count."""
    expected = [
        'New York Times Jerusalem',
        'New York Times',
        'Friedman',
        'Pulitzer Prize',
        'foreign reporting',
    ]
    ranked = keyterms.singlerank(
        spacy_doc, normalize=spacy_utils.normalized_str, n_keyterms=5)
    observed = [term for term, _score in ranked]
    assert len(expected) == len(observed)
def test_singlegrank_norm_none(self):
    """singlerank with normalization disabled returns the expected count."""
    expected = [
        'New York Times Jerusalem',
        'New York Times',
        'Friedman',
        'Pulitzer Prize',
        'foreign reporting',
    ]
    ranked = keyterms.singlerank(
        self.spacy_doc, normalize=None, n_keyterms=5)
    observed = [term for term, _score in ranked]
    self.assertEqual(len(expected), len(observed))
def test_singlegrank_n_keyterms(spacy_doc):
    """n_keyterms=5 should produce five ranked terms."""
    expected = [
        "new york times jerusalem bureau",
        "new york times",
        "friedman",
        "foreign reporting",
        "international reporting",
    ]
    observed = []
    for term, _score in keyterms.singlerank(spacy_doc, n_keyterms=5):
        observed.append(term)
    assert len(observed) == len(expected)
def _apply_keyterm_ranking(self, doc, params=None): if self.method == 'sgrank': keywords = textacy.keyterms.sgrank(doc, **params) \ if params else tck.sgrank(doc) elif self.method == 'textrank': keywords = textacy.keyterms.textrank(doc, **params) \ if params else tck.textrank(doc) elif self.method == 'singlerank': keywords = textacy.keyterms.singlerank(doc, **params) \ if params else tck.singlerank(doc) return keywords
def test_singlegrank_norm_none(spacy_doc):
    """With normalization turned off, five keyterms still come back."""
    expected = [
        "New York Times Jerusalem",
        "New York Times",
        "Friedman",
        "Pulitzer Prize",
        "foreign reporting",
    ]
    ranked = keyterms.singlerank(spacy_doc, normalize=None, n_keyterms=5)
    observed = [term for term, _score in ranked]
    assert len(expected) == len(observed)
def test_singlegrank_norm_lower(self):
    """Lower-casing normalization: right count, every term lowercase."""
    expected = [
        'new york times jerusalem bureau',
        'new york times',
        'friedman',
        'foreign reporting',
        'international reporting',
    ]
    ranked = keyterms.singlerank(
        self.spacy_doc, normalize='lower', n_keyterms=5)
    observed = [term for term, _score in ranked]
    self.assertEqual(len(expected), len(observed))
    # term-by-term comparison is skipped: result order is nondeterministic
    for term in observed:
        self.assertEqual(term, term.lower())
def test_singlegrank_norm_lower(spacy_doc):
    """Lower-casing normalization: right count, every term lowercase."""
    expected = [
        'new york times jerusalem bureau',
        'new york times',
        'friedman',
        'foreign reporting',
        'international reporting',
    ]
    ranked = keyterms.singlerank(spacy_doc, normalize='lower', n_keyterms=5)
    observed = [term for term, _score in ranked]
    assert len(expected) == len(observed)
    # term-by-term comparison is skipped: result order is nondeterministic
    for term in observed:
        assert term == term.lower()
def test_norm_normalized_str(self, spacy_doc):
    """Normalizing with get_normalized_text yields the expected count."""
    expected = [
        "New York Times Jerusalem",
        "New York Times",
        "Friedman",
        "Pulitzer Prize",
        "foreign reporting",
    ]
    ranked = keyterms.singlerank(
        spacy_doc,
        normalize=spacy_utils.get_normalized_text,
        n_keyterms=5,
    )
    observed = [term for term, _score in ranked]
    assert len(expected) == len(observed)
def test_base(self, spacy_doc):
    """Default singlerank output has the expected number of terms."""
    expected = [
        "new york times jerusalem bureau",
        "new york times",
        "friedman",
        "foreign reporting",
        "international reporting",
        "pulitzer prize",
        "book award",
        "press international",
        "president george",
        "beirut",
    ]
    observed = []
    for term, _score in keyterms.singlerank(spacy_doc):
        observed.append(term)
    assert len(observed) == len(expected)
def test_norm_lower(self, spacy_doc):
    """Lower-casing normalization: right count, every term lowercase."""
    expected = [
        "new york times jerusalem bureau",
        "new york times",
        "friedman",
        "foreign reporting",
        "international reporting",
    ]
    ranked = keyterms.singlerank(spacy_doc, normalize="lower", n_keyterms=5)
    observed = [term for term, _score in ranked]
    assert len(expected) == len(observed)
    # term-by-term comparison is skipped: result order is nondeterministic
    for term in observed:
        assert term == term.lower()
def get_job_posting_keyterms(self, text=None, n_keyterms=None):
    """
    Extract key terms/key phrases from the given text, with context

    Args:
        text (str, optional): text to analyze; defaults to ``self.text``
        n_keyterms (int, optional): number of phrases to extract;
            defaults to ``self.n_keyterms``

    Returns:
        (list) each element is a dictionary, with keys:
            token_phrase: the candidate phrase
            token_span: the start/end span of the candidate phrase
            context_span: the start/end span of the candidate phrase + context
    """
    if not n_keyterms:
        n_keyterms = self.n_keyterms
    if not text:
        text = self.text
    processed_text = preprocess_text(text, **self.options)
    doc = self.nlp(processed_text)
    keyphrases = singlerank(doc, n_keyterms=n_keyterms, normalize=None)
    results = []
    text_len = len(processed_text)
    for keyphrase, _confidence in keyphrases:
        # BUG FIX: the phrase is literal text, not a regex -- escape it so
        # metacharacters (e.g. "c++", "(senior)") don't break finditer.
        for match in re.finditer(re.escape(keyphrase), processed_text):
            context_start = max(match.start() - self.context_chars, 0)
            # BUG FIX: clamp to len(text), not len(text) - 1; span ends
            # are exclusive, so the old clamp dropped the final character.
            context_end = min(match.end() + self.context_chars, text_len)
            results.append(
                dict(token_phrase=keyphrase,
                     token_span=match.span(),
                     context_span=(context_start, context_end)))
    return results
res = textacy.extract.named_entities(doc) for r in res: print(r) print("---------------") print("sgrank:") res = keyterms.sgrank(doc, n_keyterms=50) for r in res: print(r) print("---------------") print("singlerank:") res = keyterms.singlerank(doc, n_keyterms=50) for r in res: print(r) print("---------------") print("textrank:") res = keyterms.textrank(doc, n_keyterms=50) for r in res: print(r) print("---------------") print("key_terms_from_semantic_network:") res = keyterms.key_terms_from_semantic_network(doc, n_keyterms=50) for r in res: