Example #1
def test_textrank_norm_normalized_str(spacy_doc):
    expected = ['Friedman', 'Beirut', 'New', 'Award', 'foreign']
    observed = [
        term for term, _ in keyterms.textrank(
            spacy_doc, normalize=spacy_utils.get_normalized_text, n_keyterms=5)
    ]
    assert len(expected) == len(observed)
Example #2
 def test_norm_none(self, spacy_doc):
     expected = ["Friedman", "Beirut", "New", "Arab", "Award"]
     observed = [
         term for term, _ in keyterms.textrank(
             spacy_doc, normalize=None, n_keyterms=5)
     ]
     assert len(expected) == len(observed)
Example #3
    def key_terms(self, algorithm='sgrank', n=10):
        """
        Extract key terms from a document using `algorithm`.

        Args:
            algorithm (str {'sgrank', 'textrank', 'singlerank'}, optional): name
                of algorithm to use for key term extraction
            n (int or float, optional): if int, number of top-ranked terms to return
                as keyterms; if float, must be in the open interval (0.0, 1.0),
                representing the fraction of top-ranked terms to return as keyterms

        Returns:
            list[(str, float)]: sorted list of top `n` key terms and their
                corresponding scores

        Raises:
            ValueError: if ``algorithm`` not in {'sgrank', 'textrank', 'singlerank'}

        .. seealso:: :func:`keyterms.sgrank() <textacy.keyterms.sgrank>`
        .. seealso:: :func:`keyterms.textrank() <textacy.keyterms.textrank>`
        .. seealso:: :func:`keyterms.singlerank() <textacy.keyterms.singlerank>`
        """
        if algorithm == 'sgrank':
            return keyterms.sgrank(self.spacy_doc,
                                   window_width=1500,
                                   n_keyterms=n)
        elif algorithm == 'textrank':
            return keyterms.textrank(self.spacy_doc, n_keyterms=n)
        elif algorithm == 'singlerank':
            return keyterms.singlerank(self.spacy_doc, n_keyterms=n)
        else:
            raise ValueError(
                'algorithm {} not a valid option'.format(algorithm))
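A minimal usage sketch for the method documented above (illustrative only; `doc` stands for an instance of the wrapper class that defines key_terms(), and the printing is not from the original project):

# Hypothetical call; key_terms() returns a sorted list of (term, score) tuples.
top_terms = doc.key_terms(algorithm='textrank', n=5)
for term, score in top_terms:
    print('{:<25} {:.4f}'.format(term, score))
# Any name outside {'sgrank', 'textrank', 'singlerank'} raises ValueError.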
Example #4
File: texts.py Project: kevntao/textacy
    def key_terms(self, algorithm='sgrank', n=10):
        """
        Extract key terms from a document using `algorithm`.

        Args:
            algorithm (str {'sgrank', 'textrank', 'singlerank'}, optional): name
                of algorithm to use for key term extraction
            n (int or float, optional): if int, number of top-ranked terms to return
                as keyterms; if float, must be in the open interval (0.0, 1.0),
                representing the fraction of top-ranked terms to return as keyterms

        Returns:
            list[(str, float)]: sorted list of top `n` key terms and their
                corresponding scores

        Raises:
            ValueError: if ``algorithm`` not in {'sgrank', 'textrank', 'singlerank'}

        .. seealso:: :func:`keyterms.sgrank() <textacy.keyterms.sgrank>`
        .. seealso:: :func:`keyterms.textrank() <textacy.keyterms.textrank>`
        .. seealso:: :func:`keyterms.singlerank() <textacy.keyterms.singlerank>`
        """
        if algorithm == 'sgrank':
            return keyterms.sgrank(self.spacy_doc, window_width=1500, n_keyterms=n)
        elif algorithm == 'textrank':
            return keyterms.textrank(self.spacy_doc, n_keyterms=n)
        elif algorithm == 'singlerank':
            return keyterms.singlerank(self.spacy_doc, n_keyterms=n)
        else:
            raise ValueError('algorithm {} not a valid option'.format(algorithm))
Example #5
def keywords():
    #print request.get_json()
    arg = request.get_json()
    doc = textacy.Doc(arg['content'],
                      metadata={'title': arg['title']},
                      lang=unicode('en_core_web_sm'))
    sgrank_keywords = dict(keyterms.sgrank(doc))
    singlerank_keywords = dict(keyterms.singlerank(doc))
    textrank_keywords = dict(keyterms.textrank(doc))
    sgrank_keywords.update((x, y * 0.9) for x, y in sgrank_keywords.items())
    textrank_keywords.update(
        (x, y * 0.05) for x, y in textrank_keywords.items())
    singlerank_keywords.update(
        (x, y * 0.05) for x, y in singlerank_keywords.items())
    keywords = res = dict(
        Counter(sgrank_keywords) + Counter(textrank_keywords) +
        Counter(singlerank_keywords))
    sorted_keywords = sorted(keywords.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    keyword_string = ""

    for i, key in enumerate(sorted_keywords):
        if (i == int(len(sorted_keywords) / 2)):
            keyword_string = keyword_string + "||"
        if (i == len(sorted_keywords) - 1
                or i == int(len(sorted_keywords) / 2) - 1):
            keyword_string = keyword_string + key[0]
        else:
            keyword_string = keyword_string + key[0] + ",,"

    return keyword_string
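The merging step above relies on standard collections.Counter arithmetic: adding two Counters sums the values of keys they share. A small self-contained illustration with made-up scores (already scaled as in the snippet, sgrank by 0.9 and textrank by 0.05):

from collections import Counter

sgrank_scores = {'foreign affairs': 0.90, 'beirut': 0.45}
textrank_scores = {'beirut': 0.003, 'award': 0.002}

# Shared keys are summed; the rest carry over unchanged.
combined = dict(Counter(sgrank_scores) + Counter(textrank_scores))
# {'foreign affairs': 0.9, 'beirut': 0.453, 'award': 0.002}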
Example #6
def preprocess_job_market(in_path, out_path):
    import os
    import json
    import codecs
    job_texts = []
    for filename in glob.glob(in_path + '*.json'):
        try:
            with codecs.open(filename, encoding='utf-8') as job_file:
                content = json.load(job_file)
                job_texts.append(content.get('description', u''))
        except:
            print("===Exception reading file " + filename)
            continue
    spacy_lang = en_core_web_sm.load()
    corpus = textacy.corpus.Corpus(spacy_lang)
    corpus_text = '\n'.join(text for text in job_texts)
    corpus.add_text(corpus_text)

    res_file = out_path + 'job_market.csv'
    if not os.path.isfile(res_file):
        termList1 = term_list(
            keyterms.textrank(corpus[0], normalize=u'lower', n_keyterms=30))
        termList2 = term_list(
            keyterms.sgrank(corpus[0],
                            ngrams=(1, 2),
                            normalize=u'lower',
                            window_width=100,
                            n_keyterms=70,
                            idf=None))
        termSet1 = set(termList1)
        termSet2 = set(termList2)
        diffSet = termSet1 - termSet2
        termList = termList2 + list(diffSet)
        save_terms_text(res_file, termList)
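The term_list() and save_terms_text() helpers are defined elsewhere in that project. A plausible minimal version of term_list() (an assumption, not the project's actual code) just keeps the term strings from the (term, score) pairs returned by keyterms.textrank() and keyterms.sgrank():

def term_list(scored_terms):
    # keep only the terms, dropping the scores
    return [term for term, _ in scored_terms]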
Example #7
def preprocess_kags(in_path, out_path):
    import os
    spacy_lang = en_core_web_sm.load()
    for kag_path in glob.glob(KAG_BASE_PATH + '/*'):
        _, kag_name = os.path.split(kag_path)
        corpus = textacy.corpus.Corpus(spacy_lang)
        texts = []
        for comp_path in glob.glob(kag_path + '/*'):
            for filename in glob.glob(comp_path + '/*.txt'):
                texts.append(open(filename, 'r').read().decode('utf-8'))
        corpus_text = '\n'.join(text for text in texts)
        corpus.add_text(corpus_text)
        #        _ , comp_file = os.path.split(comp_path)
        #        sindex = len(kag_name) + 1
        #        eindex = sindex + comp_file[sindex:].index('_')
        res_file = '{}.csv'.format(get_kag(kag_name))
        termList1 = term_list(
            keyterms.textrank(corpus[0], normalize=u'lower', n_keyterms=30))
        doc_idf = corpus.word_doc_freqs(lemmatize=None,
                                        weighting='idf',
                                        lowercase=True,
                                        as_strings=True)
        termList2 = term_list(
            keyterms.sgrank(corpus[0],
                            ngrams=(1, 2, 3),
                            normalize=u'lower',
                            window_width=500,
                            n_keyterms=70,
                            idf=doc_idf))
        termSet1 = set(termList1)
        termSet2 = set(termList2)
        diffSet = termSet1 - termSet2
        termList = termList2 + list(diffSet)
        #        save_terms_csv(out_path + res_file, termList)
        save_terms_text(out_path + res_file, termList)
Example #8
def test_textrank(spacy_doc):
    expected = [
        'friedman', 'beirut', 'reporting', 'arab', 'new', 'award', 'foreign',
        'year', 'times', 'jerusalem'
    ]
    observed = [term for term, _ in keyterms.textrank(spacy_doc)]
    assert len(expected) == len(observed)
Example #9
def test_textrank_norm_none(spacy_doc):
    expected = ['Friedman', 'Beirut', 'New', 'Arab', 'Award']
    observed = [
        term for term, _ in keyterms.textrank(
            spacy_doc, normalize=None, n_keyterms=5)
    ]
    assert len(expected) == len(observed)
Example #10
 def test_norm_normalized_str(self, spacy_doc):
     expected = ["Friedman", "Beirut", "New", "Award", "foreign"]
     observed = [
         term for term, _ in keyterms.textrank(
             spacy_doc,
             normalize=spacy_utils.get_normalized_text,
             n_keyterms=5)
     ]
     assert len(expected) == len(observed)
Example #11
 def test_textrank_norm_normalized_str(self):
     expected = ['Friedman', 'Beirut', 'New', 'Award', 'foreign']
     observed = [
         term for term, _ in keyterms.textrank(
             self.spacy_doc,
             normalize=spacy_utils.normalized_str,
             n_keyterms=5)
     ]
     self.assertEqual(len(expected), len(observed))
Example #12
 def _apply_keyterm_ranking(self, doc, params=None):
     if self.method == 'sgrank':
         keywords = textacy.keyterms.sgrank(doc, **params) \
             if params else tck.sgrank(doc)
     elif self.method == 'textrank':
         keywords = textacy.keyterms.textrank(doc, **params) \
             if params else tck.textrank(doc)
     elif self.method == 'singlerank':
         keywords = textacy.keyterms.singlerank(doc, **params) \
             if params else tck.singlerank(doc)
     return keywords
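Note that if self.method is not one of the three supported names, keywords above is never assigned and the final return raises NameError. A dict-based dispatch avoids that; the sketch below is illustrative (it assumes the same textacy.keyterms module used throughout these examples), not the original project's code:

from textacy import keyterms as tck

_RANKERS = {
    'sgrank': tck.sgrank,
    'textrank': tck.textrank,
    'singlerank': tck.singlerank,
}

def apply_keyterm_ranking(doc, method, params=None):
    try:
        ranker = _RANKERS[method]
    except KeyError:
        raise ValueError('unknown keyterm method: {!r}'.format(method))
    # pass extra parameters through only when they were supplied
    return ranker(doc, **params) if params else ranker(doc)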
Example #13
 def test_textrank_norm_lower(self):
     expected = ['friedman', 'beirut', 'reporting', 'arab', 'new']
     observed = [
         term for term, _
         in keyterms.textrank(self.spacy_doc, normalize='lower', n_keyterms=5)]
     self.assertEqual(len(expected), len(observed))
     # can't do this owing to randomness of results
     # for e, o in zip(expected, observed):
     #     self.assertEqual(e, o)
     for term in observed:
         self.assertEqual(term, term.lower())
Example #14
def test_textrank_norm_lower(spacy_doc):
    expected = ['friedman', 'beirut', 'reporting', 'arab', 'new']
    observed = [
        term for term, _
        in keyterms.textrank(spacy_doc, normalize='lower', n_keyterms=5)]
    assert len(expected) == len(observed)
    # can't do this owing to randomness of results
    # for e, o in zip(expected, observed):
    #     assert e == o
    for term in observed:
        assert term == term.lower()
Example #15
 def test_norm_lower(self, spacy_doc):
     expected = ["friedman", "beirut", "reporting", "arab", "new"]
     observed = [
         term for term, _ in keyterms.textrank(
             spacy_doc, normalize="lower", n_keyterms=5)
     ]
     assert len(expected) == len(observed)
     # can't do this owing to randomness of results
     # for e, o in zip(expected, observed):
     #     assert e == o
     for term in observed:
         assert term == term.lower()
Example #16
 def test_base(self, spacy_doc):
     expected = [
         "friedman",
         "beirut",
         "reporting",
         "arab",
         "new",
         "award",
         "foreign",
         "year",
         "times",
         "jerusalem",
     ]
     observed = [term for term, _ in keyterms.textrank(spacy_doc)]
     assert len(expected) == len(observed)
Example #17
def test_extract_functionality(doc):
    bigrams = list(
        extract.ngrams(doc,
                       2,
                       filter_stops=True,
                       filter_punct=True,
                       filter_nums=False))[:10]
    for bigram in bigrams:
        assert isinstance(bigram, SpacySpan)
        assert len(bigram) == 2

    trigrams = list(
        extract.ngrams(doc,
                       3,
                       filter_stops=True,
                       filter_punct=True,
                       min_freq=2))[:10]
    for trigram in trigrams:
        assert isinstance(trigram, SpacySpan)
        assert len(trigram) == 3

    nes = list(
        extract.named_entities(doc,
                               drop_determiners=False,
                               exclude_types='numeric'))[:10]
    for ne in nes:
        assert isinstance(ne, SpacySpan)
        assert ne.label_
        assert ne.label_ != 'QUANTITY'

    pos_regex_matches = list(
        extract.pos_regex_matches(
            doc, constants.POS_REGEX_PATTERNS['en']['NP']))[:10]
    for match in pos_regex_matches:
        assert isinstance(match, SpacySpan)

    stmts = list(extract.semistructured_statements(doc, 'I', cue='be'))[:10]
    for stmt in stmts:
        assert isinstance(stmt, list)
        assert isinstance(stmt[0], compat.unicode_)
        assert len(stmt) == 3

    kts = keyterms.textrank(doc, n_keyterms=10)
    for keyterm in kts:
        assert isinstance(keyterm, tuple)
        assert isinstance(keyterm[0], compat.unicode_)
        assert isinstance(keyterm[1], float)
        assert keyterm[1] > 0.0
Example #18
def preprocess_category(in_path, out_path, category_name):
    import os
    spacy_lang = en_core_web_sm.load()
    print('===GIVEN CATEGORY: ' + category_name)
    for cat_path in glob.glob(in_path + '*'):
        _, cat_name = os.path.split(cat_path)
        print('===CATEGORY: ' + cat_name)
        if category_name == cat_name:
            print('###Fine, found category directory ...')
            #        for comp_path in glob.glob(kag_path + '/*'):
            corpus = textacy.corpus.Corpus(spacy_lang)
            texts = []
            for filename in glob.glob(cat_path + '/*.txt'):
                texts.append(open(filename, 'r').read().decode('utf-8'))
            corpus_text = '\n'.join(text for text in texts)
            corpus.add_text(corpus_text)
            #            _ , comp_file = os.path.split(comp_path)
            #            sindex = len(kag_name) + 1
            #            eindex = sindex + comp_file[sindex:].index('_')
            #            res_file = '{}.csv'.format(comp_file[sindex:eindex])
            res_file = '{}.csv'.format(category_name.lower())

            termList1 = term_list(
                keyterms.textrank(corpus[0], normalize=u'lower',
                                  n_keyterms=30))
            doc_idf = corpus.word_doc_freqs(lemmatize=None,
                                            weighting='idf',
                                            lowercase=True,
                                            as_strings=True)
            termList2 = term_list(
                keyterms.sgrank(corpus[0],
                                ngrams=(1, 2, 3),
                                normalize=u'lower',
                                window_width=500,
                                n_keyterms=70,
                                idf=doc_idf))
            termSet1 = set(termList1)
            termSet2 = set(termList2)
            diffSet = termSet1 - termSet2
            termList = termList2 + list(diffSet)
            save_terms_text(out_path + res_file, termList)
            break
Example #19
def preprocess_jobs_or_cvs_combined(in_path, out_path):
    import os
    for filename in glob.glob(in_path + '*.json'):
        _, cv_file = os.path.split(filename)
        res_file = out_path + '{0}.csv'.format(cv_file[0:cv_file.index('.')])
        if not os.path.isfile(res_file):
            corpus = read_cv2(filename)
            termList1 = term_list(
                keyterms.textrank(corpus[0], normalize=u'lower',
                                  n_keyterms=30))
            termList2 = term_list(
                keyterms.sgrank(corpus[0],
                                ngrams=(1, 2),
                                normalize=u'lower',
                                window_width=100,
                                n_keyterms=70,
                                idf=None))
            termSet1 = set(termList1)
            termSet2 = set(termList2)
            diffSet = termSet1 - termSet2
            termList = termList2 + list(diffSet)
            #            save_terms_csv(res_file, termList)
            save_terms_text(res_file, termList)
Example #20
def test_textrank_n_keyterms(spacy_doc):
    expected = ['friedman', 'beirut', 'reporting', 'arab', 'new']
    observed = [term for term, _ in keyterms.textrank(spacy_doc, n_keyterms=5)]
    assert len(expected) == len(observed)
Example #21
    inspec_files = os.listdir(inspec_folder)
    result = []
    for myfile in inspec_files:
        if myfile.endswith("abstr"):  #This is the actual content.
            content = open(os.path.join(inspec_folder, myfile),
                           encoding="utf-8",
                           errors="ignore").read().strip()
            keywords_file = myfile.replace(".abstr", ".uncontr")
            keywords = [
                process(item)
                for item in open(os.path.join(inspec_folder, keywords_file),
                                 encoding="utf-8",
                                 errors="ignore").read().strip().split(";")
            ]
            result.append((content, keywords))
    return result


inspec = get_inspec()
print(len(inspec))
spacymodel = spacy.load('en_core_web_sm', disable=('parser', 'ner'))
fw = open("tempKPEout-allinspec-textrank.txt", "w")
for content, keywords in inspec:
    fw.write(content + "\n")
    fw.write(str(keywords))
    fw.write("\n")
    spacified = spacymodel(content)
    fw.write(str([term[0] for term in textrank(spacified)]))
    fw.write("\n\n")
fw.close()
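The script above starts inside a get_inspec() helper whose def line is cut off, and it relies on imports and a process() normalisation helper that are not shown. A hedged sketch of the missing pieces (the process() body is hypothetical):

import os
import spacy
from textacy.keyterms import textrank

def process(keyword):
    # hypothetical normalisation; the original helper may do more
    return keyword.strip().lower()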
Example #22
 def test_textrank_n_keyterms(self, spacy_doc):
     expected = ["friedman", "beirut", "reporting", "arab", "new"]
     observed = [
         term for term, _ in keyterms.textrank(spacy_doc, n_keyterms=5)
     ]
     assert len(expected) == len(observed)
Example #23
 def test_textrank_n_keyterms(self):
     expected = ['friedman', 'beirut', 'reporting', 'arab', 'new']
     observed = [
         term for term, _ in keyterms.textrank(self.spacy_doc, n_keyterms=5)
     ]
     self.assertEqual(len(expected), len(observed))
Example #24
for r in res:
    print(r)

print("---------------")
print("sgrank:")

res = keyterms.sgrank(doc, n_keyterms=50)
for r in res:
    print(r)

print("---------------")
print("singlerank:")

res = keyterms.singlerank(doc, n_keyterms=50)
for r in res:
    print(r)

print("---------------")
print("textrank:")

res = keyterms.textrank(doc, n_keyterms=50)
for r in res:
    print(r)

print("---------------")
print("key_terms_from_semantic_network:")

res = keyterms.key_terms_from_semantic_network(doc, n_keyterms=50)
for r in res:
    print(r)
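The comparison script above starts mid-file: doc and the first res are created earlier and are not shown. A minimal setup sketch, assuming the same textacy Doc/keyterms API used in the other examples (file name and text are placeholders, not from the original script):

import textacy
from textacy import keyterms

text = open('some_article.txt', encoding='utf-8').read()
doc = textacy.Doc(text, lang='en_core_web_sm')

# placeholder for the first `res` printed above; the original call is not shown
res = keyterms.textrank(doc, n_keyterms=50)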