Пример #1
0
def test_singlegrank_n_keyterms(spacy_doc):
    """Limiting singlerank to 5 keyterms yields exactly 5 results."""
    expected = [
        'new york times jerusalem bureau',
        'new york times',
        'friedman',
        'foreign reporting',
        'international reporting',
    ]
    ranked = keyterms.singlerank(spacy_doc, n_keyterms=5)
    observed = [pair[0] for pair in ranked]
    assert len(observed) == len(expected)
Пример #2
0
def keywords():
    """Flask endpoint: extract keywords from posted JSON {'title', 'content'}.

    Runs sgrank, textrank, and singlerank over the document, re-weights
    their scores (0.9 / 0.05 / 0.05), sums the scores per term, and returns
    the terms ranked by combined score as a ",,"-delimited string with "||"
    marking the halfway point of the list.

    Returns:
        str: ranked keywords, e.g. "a,,b||c,,d" (no trailing separator on
            the element just before the midpoint or on the last element).
    """
    arg = request.get_json()
    # NOTE(review): `unicode` is Python-2-only; under Python 3 this raises
    # NameError. Preserved as-is -- confirm the target runtime.
    doc = textacy.Doc(arg['content'],
                      metadata={'title': arg['title']},
                      lang=unicode('en_core_web_sm'))
    sgrank_keywords = dict(keyterms.sgrank(doc))
    singlerank_keywords = dict(keyterms.singlerank(doc))
    textrank_keywords = dict(keyterms.textrank(doc))
    # Re-weight each algorithm's scores before combining them.
    sgrank_keywords.update((x, y * 0.9) for x, y in sgrank_keywords.items())
    textrank_keywords.update(
        (x, y * 0.05) for x, y in textrank_keywords.items())
    singlerank_keywords.update(
        (x, y * 0.05) for x, y in singlerank_keywords.items())
    # Counter addition sums the scores of terms found by multiple algorithms.
    # (Dropped the dead `res =` alias present in the original.)
    keywords = dict(
        Counter(sgrank_keywords) + Counter(textrank_keywords) +
        Counter(singlerank_keywords))
    sorted_keywords = sorted(keywords.items(),
                             key=operator.itemgetter(1),
                             reverse=True)

    # Build the output with a list + join instead of repeated string
    # concatenation (which is quadratic in the number of keywords).
    parts = []
    midpoint = int(len(sorted_keywords) / 2)
    for i, (term, _score) in enumerate(sorted_keywords):
        if i == midpoint:
            parts.append("||")
        if i == len(sorted_keywords) - 1 or i == midpoint - 1:
            parts.append(term)
        else:
            parts.append(term + ",,")

    return "".join(parts)
Пример #3
0
def test_singlegrank(spacy_doc):
    """Default singlerank call returns the expected number of keyterms."""
    terms_expected = [
        'new york times jerusalem bureau',
        'new york times',
        'friedman',
        'foreign reporting',
        'international reporting',
        'pulitzer prize',
        'book award',
        'press international',
        'president george',
        'beirut',
    ]
    terms_observed = [pair[0] for pair in keyterms.singlerank(spacy_doc)]
    assert len(terms_expected) == len(terms_observed)
Пример #4
0
    def key_terms(self, algorithm='sgrank', n=10):
        """
        Extract key terms from a document using `algorithm`.

        Args:
            algorithm (str {'sgrank', 'textrank', 'singlerank'}, optional): name
                of algorithm to use for key term extraction
            n (int or float, optional): if int, number of top-ranked terms to return
                as keyterms; if float, must be in the open interval (0.0, 1.0),
                representing the fraction of top-ranked terms to return as keyterms

        Returns:
            list[(str, float)]: sorted list of top `n` key terms and their
                corresponding scores

        Raises:
            ValueError: if ``algorithm`` not in {'sgrank', 'textrank', 'singlerank'}

        .. seealso:: :func:`keyterms.sgrank() <textacy.keyterms.sgrank>`
        .. seealso:: :func:`keyterms.textrank() <textacy.keyterms.textrank>`
        .. seealso:: :func:`keyterms.singlerank() <textacy.keyterms.singlerank>`
        """
        # Dispatch table instead of an if/elif chain; each entry is a
        # zero-argument callable so nothing runs until the name is validated.
        extractors = {
            'sgrank': lambda: keyterms.sgrank(
                self.spacy_doc, window_width=1500, n_keyterms=n),
            'textrank': lambda: keyterms.textrank(self.spacy_doc, n_keyterms=n),
            'singlerank': lambda: keyterms.singlerank(self.spacy_doc, n_keyterms=n),
        }
        if algorithm not in extractors:
            raise ValueError('algorithm {} not a valid option'.format(algorithm))
        return extractors[algorithm]()
Пример #5
0
    def key_terms(self, algorithm='sgrank', n=10):
        """
        Extract key terms from a document using `algorithm`.

        Args:
            algorithm (str {'sgrank', 'textrank', 'singlerank'}, optional): name
                of algorithm to use for key term extraction
            n (int or float, optional): if int, number of top-ranked terms to return
                as keyterms; if float, must be in the open interval (0.0, 1.0),
                representing the fraction of top-ranked terms to return as keyterms

        Returns:
            list[(str, float)]: sorted list of top `n` key terms and their
                corresponding scores

        Raises:
            ValueError: if ``algorithm`` not in {'sgrank', 'textrank', 'singlerank'}

        .. seealso:: :func:`keyterms.sgrank() <textacy.keyterms.sgrank>`
        .. seealso:: :func:`keyterms.textrank() <textacy.keyterms.textrank>`
        .. seealso:: :func:`keyterms.singlerank() <textacy.keyterms.singlerank>`
        """
        # Validate first (guard clause), then branch on the algorithm name.
        if algorithm not in ('sgrank', 'textrank', 'singlerank'):
            raise ValueError(
                'algorithm {} not a valid option'.format(algorithm))
        if algorithm == 'sgrank':
            # sgrank is the only variant taking an extra window_width arg.
            return keyterms.sgrank(self.spacy_doc,
                                   window_width=1500,
                                   n_keyterms=n)
        ranker = (keyterms.textrank if algorithm == 'textrank'
                  else keyterms.singlerank)
        return ranker(self.spacy_doc, n_keyterms=n)
Пример #6
0
def test_singlegrank_norm_normalized_str(spacy_doc):
    """singlerank with a custom normalize callable still yields 5 terms."""
    expected_terms = [
        'New York Times Jerusalem',
        'New York Times',
        'Friedman',
        'Pulitzer Prize',
        'foreign reporting',
    ]
    ranked = keyterms.singlerank(
        spacy_doc, normalize=spacy_utils.normalized_str, n_keyterms=5)
    observed_terms = [pair[0] for pair in ranked]
    assert len(expected_terms) == len(observed_terms)
Пример #7
0
 def test_singlegrank_norm_none(self):
     """singlerank with normalize=None returns the requested 5 terms."""
     expected = [
         'New York Times Jerusalem',
         'New York Times',
         'Friedman',
         'Pulitzer Prize',
         'foreign reporting',
     ]
     ranked = keyterms.singlerank(self.spacy_doc, normalize=None, n_keyterms=5)
     observed = [term for term, _score in ranked]
     self.assertEqual(len(expected), len(observed))
Пример #8
0
def test_singlegrank_n_keyterms(spacy_doc):
    """n_keyterms=5 caps the singlerank output at five terms."""
    expected = [
        "new york times jerusalem bureau", "new york times", "friedman",
        "foreign reporting", "international reporting"]
    ranked = keyterms.singlerank(spacy_doc, n_keyterms=5)
    observed = [pair[0] for pair in ranked]
    assert len(expected) == len(observed)
Пример #9
0
 def _apply_keyterm_ranking(self, doc, params=None):
     """Apply the keyterm-ranking algorithm named by ``self.method`` to ``doc``.

     Args:
         doc: a spaCy/textacy document accepted by the ``textacy.keyterms``
             ranking functions.
         params (dict, optional): keyword arguments forwarded to the ranking
             function; when falsy, the function is called with its defaults.

     Returns:
         ranked (term, score) pairs as produced by the chosen algorithm.

     Raises:
         ValueError: if ``self.method`` is not one of
             {'sgrank', 'textrank', 'singlerank'}.
     """
     # NOTE(review): `tck` is presumably an alias of `textacy.keyterms`;
     # both spellings are preserved from the original -- confirm.
     if self.method == 'sgrank':
         keywords = textacy.keyterms.sgrank(doc, **params) \
             if params else tck.sgrank(doc)
     elif self.method == 'textrank':
         keywords = textacy.keyterms.textrank(doc, **params) \
             if params else tck.textrank(doc)
     elif self.method == 'singlerank':
         keywords = textacy.keyterms.singlerank(doc, **params) \
             if params else tck.singlerank(doc)
     else:
         # Previously an unknown method fell through and `return keywords`
         # raised UnboundLocalError; fail with a meaningful error instead.
         raise ValueError(
             'method {!r} is not a valid option'.format(self.method))
     return keywords
Пример #10
0
def test_singlegrank_norm_none(spacy_doc):
    """With normalize=None, singlerank returns the requested 5 terms."""
    expected = [
        "New York Times Jerusalem", "New York Times", "Friedman",
        "Pulitzer Prize", "foreign reporting"]
    ranked = keyterms.singlerank(spacy_doc, normalize=None, n_keyterms=5)
    observed = [pair[0] for pair in ranked]
    assert len(expected) == len(observed)
Пример #11
0
 def test_singlegrank_norm_lower(self):
     """normalize='lower' lowercases every returned term."""
     expected = [
         'new york times jerusalem bureau',
         'new york times',
         'friedman',
         'foreign reporting',
         'international reporting',
     ]
     ranked = keyterms.singlerank(
         self.spacy_doc, normalize='lower', n_keyterms=5)
     observed = [term for term, _score in ranked]
     self.assertEqual(len(expected), len(observed))
     # Exact term order is nondeterministic, so only the count and the
     # casing of the results are asserted.
     for term in observed:
         self.assertEqual(term, term.lower())
Пример #12
0
def test_singlegrank_norm_lower(spacy_doc):
    """normalize='lower' lowercases every returned term."""
    expected = [
        'new york times jerusalem bureau',
        'new york times',
        'friedman',
        'foreign reporting',
        'international reporting',
    ]
    observed = [
        pair[0] for pair in
        keyterms.singlerank(spacy_doc, normalize='lower', n_keyterms=5)]
    assert len(expected) == len(observed)
    # Exact term order is nondeterministic, so only the count and the
    # casing of the results are asserted.
    assert all(term == term.lower() for term in observed)
Пример #13
0
 def test_norm_normalized_str(self, spacy_doc):
     """Custom normalize callable yields the requested 5 keyterms."""
     expected = [
         "New York Times Jerusalem", "New York Times", "Friedman",
         "Pulitzer Prize", "foreign reporting"]
     ranked = keyterms.singlerank(
         spacy_doc,
         normalize=spacy_utils.get_normalized_text,
         n_keyterms=5)
     observed = [term for term, _score in ranked]
     assert len(expected) == len(observed)
Пример #14
0
 def test_base(self, spacy_doc):
     """Default singlerank call yields the expected number of terms."""
     expected = [
         "new york times jerusalem bureau", "new york times", "friedman",
         "foreign reporting", "international reporting", "pulitzer prize",
         "book award", "press international", "president george", "beirut"]
     observed = [pair[0] for pair in keyterms.singlerank(spacy_doc)]
     assert len(expected) == len(observed)
Пример #15
0
 def test_norm_lower(self, spacy_doc):
     """normalize='lower' lowercases every returned term."""
     expected = [
         "new york times jerusalem bureau", "new york times", "friedman",
         "foreign reporting", "international reporting"]
     ranked = keyterms.singlerank(spacy_doc, normalize="lower", n_keyterms=5)
     observed = [term for term, _score in ranked]
     assert len(expected) == len(observed)
     # Exact term order is nondeterministic, so only the count and the
     # casing of the results are asserted.
     assert all(term == term.lower() for term in observed)
Пример #16
0
    def get_job_posting_keyterms(self, text=None, n_keyterms=None):
        """
        Extract key terms/key phrases from the given text, with context

        Args:
            text (str, optional): text to analyze; defaults to ``self.text``
            n_keyterms (optional): number of keyterms to extract; defaults
                to ``self.n_keyterms``

        Returns: (list) each element is a dictionary, with keys:
            token_phrase: the candidate phrase
            token_span: the start/end span of the candidate phrase
            context_span: the start/end span of the candidate phrase + context
        """
        if not n_keyterms:
            n_keyterms = self.n_keyterms

        if not text:
            text = self.text

        processed_text = preprocess_text(text, **self.options)

        doc = self.nlp(processed_text)
        keyphrases = singlerank(doc, n_keyterms=n_keyterms, normalize=None)
        results = []
        for keyphrase, confidence in keyphrases:
            # Escape the phrase: it is literal text, not a regex pattern.
            # Unescaped phrases containing metacharacters (e.g. '(', '+',
            # 'C++') previously raised re.error or matched wrong spans.
            for match in re.finditer(re.escape(keyphrase), processed_text):
                # Clamp the context window to the bounds of the text.
                context_start = max(match.start() - self.context_chars, 0)
                # NOTE(review): clamping to len-1 drops the final character
                # if the span is later used for slicing -- preserved from
                # the original; confirm intended semantics.
                context_end = min(match.end() + self.context_chars,
                                  len(processed_text) - 1)

                results.append(
                    dict(token_phrase=keyphrase,
                         token_span=match.span(),
                         context_span=(context_start, context_end)))

        return results
Пример #17
0
# Demo script: print named entities, then the top-50 keyterms from each of
# the four textacy ranking algorithms, one (term, score) pair per line.
res = textacy.extract.named_entities(doc)
for r in res:
    print(r)

print("---------------")
print("sgrank:")

res = keyterms.sgrank(doc, n_keyterms=50)
for r in res:
    print(r)

print("---------------")
print("singlerank:")

res = keyterms.singlerank(doc, n_keyterms=50)
for r in res:
    print(r)

print("---------------")
print("textrank:")

res = keyterms.textrank(doc, n_keyterms=50)
for r in res:
    print(r)

print("---------------")
print("key_terms_from_semantic_network:")

res = keyterms.key_terms_from_semantic_network(doc, n_keyterms=50)
for r in res: