Example #1
def test_default(spacy_doc):
    result = ke.textrank(spacy_doc)
    assert isinstance(result, list) and len(result) > 0
    assert all(isinstance(ts, tuple) and len(ts) == 2 for ts in result)
    assert all(
        isinstance(ts[0], compat.unicode_) and isinstance(ts[1], float)
        for ts in result
    )
Example #2
def test_position_bias(spacy_doc):
    result1 = ke.textrank(spacy_doc, position_bias=False)
    result2 = ke.textrank(spacy_doc, position_bias=True)
    assert len(result1) > 0 and len(result2) > 0
    assert result1 != result2
Example #3
def test_edge_weighting(spacy_doc):
    result1 = ke.textrank(spacy_doc, edge_weighting="binary")
    result2 = ke.textrank(spacy_doc, edge_weighting="count")
    assert len(result1) > 0 and len(result2) > 0
    assert result1 != result2
Example #4
def test_window_size(spacy_doc):
    result1 = ke.textrank(spacy_doc, window_size=2)
    result2 = ke.textrank(spacy_doc, window_size=4)
    assert len(result1) > 0 and len(result2) > 0
    assert result1 != result2
Example #5
def test_topn_float(spacy_doc):
    result = ke.textrank(spacy_doc, topn=0.2)
    assert len(result) > 0
    with pytest.raises(ValueError):
        _ = ke.textrank(spacy_doc, topn=2.0)
Example #6
def test_n_topn(spacy_doc):
    for n in (5, 25):
        result = ke.textrank(spacy_doc, topn=n)
        assert 0 < len(result) <= n
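
Examples 1 through 6 come from textacy's test suite and rely on a spacy_doc pytest fixture (plus imports such as pytest, textacy.compat and textacy.ke) that are not shown here. A minimal sketch of a fixture that would make them runnable, assuming en_core_web_sm is installed; the real fixture presumably uses a longer sample document:

import pytest
import textacy

@pytest.fixture(scope="module")
def spacy_doc():
    # keyterm extraction needs a reasonably long text to produce stable results
    text = (
        "Automatic keyterm extraction pulls the most important words and phrases "
        "out of a document. Graph-based algorithms such as TextRank build a word "
        "co-occurrence network and rank candidate terms by their centrality in "
        "that network."
    )
    return textacy.make_spacy_doc(text, lang="en_core_web_sm")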
Example #7
import textacy
from textacy import ke

# open data from .txt file
with open('news_article.txt', 'r') as file:
    data = file.read().replace('\n', '')
article = data.replace(u'\xa0', u' ')

# create doc object
doc = textacy.make_spacy_doc(article, lang='en_core_web_sm')

# KEYTERM EXTRACTION
# Each algorithm returns a list of tuples, containing the keyterm and a score
textrank = ke.textrank(doc, normalize="lemma")
yake = ke.yake(doc, normalize="lemma")
scake = ke.scake(doc, normalize="lemma")
sgrank = ke.sgrank(doc, normalize="lemma")

# separate terms and relevance scores (decompose_keyterms is a helper defined
# outside this excerpt; see the sketch after this example)
terms_textrank, scores_textrank = decompose_keyterms(textrank)
terms_yake, scores_yake = decompose_keyterms(yake)
terms_scake, scores_scake = decompose_keyterms(scake)
terms_sgrank, scores_sgrank = decompose_keyterms(sgrank)

# save results to a dataframe (keyterm_dataframe is also sketched below)
df = keyterm_dataframe(scake, 'scake')
print(df)
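
The decompose_keyterms and keyterm_dataframe helpers used above are defined earlier in the original script and are not part of textacy. A minimal sketch of what they might look like, assuming they simply split the (term, score) tuples apart and collect them into a pandas DataFrame:

import pandas as pd

def decompose_keyterms(keyterms):
    # split a list of (term, score) tuples into parallel lists of terms and scores
    terms = [term for term, _ in keyterms]
    scores = [score for _, score in keyterms]
    return terms, scores

def keyterm_dataframe(keyterms, algorithm_name):
    # one row per keyterm, recording which algorithm produced the score
    terms, scores = decompose_keyterms(keyterms)
    df = pd.DataFrame({'term': terms, 'score': scores})
    df['algorithm'] = algorithm_name
    return df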
Example #8
def main():
    # We pass these dynamic arguments in for parallel jobs
    parser = argparse.ArgumentParser(
        description='Build a phrase dictionary using spaCy and textacy')
    parser.add_argument('--source-tmx',
                        type=str,
                        default='',
                        help='The input tmx file to process')
    parser.add_argument('--dictionary-path',
                        type=str,
                        default='',
                        help='The input/output path for the phrase dictionary')
    parser.add_argument('--target-language',
                        type=str,
                        default='',
                        help='es or fr')
    parser.add_argument(
        '--category-id',
        type=str,
        default='',
        help='The model against which to build the phrase dictionary')
    parser.add_argument(
        '--nlp-id',
        type=str,
        default='',
        help='The source language spacy model e.g. en_core_web_md')
    parser.add_argument(
        '--nlp-target',
        type=str,
        default='',
        help='The target language spacy model e.g. fr_core_news_md')
    parser.add_argument('--batch-start',
                        type=int,
                        default=0,
                        metavar='N',
                        help='start at this number + batch-size')
    parser.add_argument('--batch-end',
                        type=int,
                        default=100,
                        metavar='N',
                        help='end at this number')

    args = parser.parse_args()
    set_log_level(Config.DEBUG)

    nlp_model_id = load_spacy_model(args.nlp_id)
    nlp_model_target = load_spacy_model(args.nlp_target)
    logging.debug(f"Loaded models {args.nlp_id} {args.nlp_target}")

    phrases = {}

    tmx_file = load_tmx_file(args.source_tmx)
    logging.debug(f"Loaded {args.source_tmx}")

    phrase_file_name = os.path.join(
        args.dictionary_path, args.target_language + '_phrase_dictionary.txt')

    if os.path.isfile(phrase_file_name):
        phrase_file = load_phrase_dictionary(phrase_file_name, 'a')
        logging.debug(f"Found existing phrase dictionary {phrase_file_name}")
        # read the existing entries so we don't translate the same phrase twice
        with open(phrase_file_name) as existing:
            phrase_list = [line.rstrip('\n') for line in existing]
        for line in phrase_list:
            lst_line = line.split(',')
            if len(lst_line[0]) > 0:
                phrases[lst_line[0]] = lst_line[1]
    else:
        phrase_file = load_phrase_dictionary(phrase_file_name, 'a')

    for i, unit in enumerate(tmx_file.getunits()):

        if i < args.batch_start:
            continue

        if i > args.batch_end:
            break

        logging.info(
            f"Processing record {i} of {len(tmx_file.units)} (Batch start {args.batch_start} Batch end "
            f"{args.batch_end})")

        nlp_id = nlp_model_id(unit.getid())
        nlp_target = nlp_model_target(unit.gettarget())
        res_id = ke.textrank(nlp_id,
                             normalize='lemma',
                             include_pos=('NOUN', 'PROPN', 'ADJ', 'VERB'),
                             window_size=5,
                             edge_weighting='binary',
                             position_bias=False,
                             topn=5)
        res_target = ke.textrank(nlp_target,
                                 normalize='lemma',
                                 include_pos=('NOUN', 'PROPN', 'ADJ', 'VERB'),
                                 window_size=5,
                                 edge_weighting='binary',
                                 position_bias=False,
                                 topn=5)

        for r_id in res_id:
            # We don't want single words, we want phrases
            if len(r_id[0].split()) > 1 and len(res_target) > 0:
                if r_id[0] not in phrases:
                    translation_results = call_translation(
                        [{
                            'Text': r_id[0]
                        }], args.target_language, args.category_id,
                        Config.SUBSCRIPTION_KEY, Config.REGION)
                    if len(translation_results[0]['translations'][0]) > 0:
                        for r_tar in res_target:
                            if len(r_tar[0].split()) > 1:
                                bleu_score = 0
                                translated_text = translation_results[0][
                                    'translations'][0]['text'].lower().strip()
                                # Let's only take exact matches
                                if r_tar[0].lower().strip() == translated_text:
                                    bleu_score = 1  # We use absolute matches but keep this here for BLEU if needed
                                    print(f"Found {r_id[0]} : {r_tar[0].strip()}")
                                    phrases[r_id[0]] = r_tar[0].strip()
                                    # As the phrase dictionary is case sensitive, let's include a few variations
                                    phrase_file.write('\n' + r_id[0] + ', ' +
                                                      r_tar[0].strip())
                                # TODO add BLEU evaluation if needed
    phrase_file.close()
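
Example 8 depends on several project-specific pieces that are not shown: standard imports (argparse, os, logging), from textacy import ke, and the helpers set_log_level, load_spacy_model, load_tmx_file, load_phrase_dictionary and call_translation, plus a Config object holding the log level and the translation-service credentials. A rough sketch of three of those helpers, assuming load_spacy_model wraps spacy.load, load_tmx_file parses the TMX file with translate-toolkit, and load_phrase_dictionary simply opens the dictionary file in the requested mode:

import spacy
from translate.storage.tmx import tmxfile

def load_spacy_model(model_name):
    # assumed to be a thin wrapper around spacy.load,
    # e.g. for en_core_web_md or fr_core_news_md
    return spacy.load(model_name)

def load_tmx_file(path):
    # assumed to parse the translation-memory file with translate-toolkit;
    # the getunits() / gettarget() calls above are that library's API
    with open(path, 'rb') as fin:
        return tmxfile(fin)

def load_phrase_dictionary(path, mode):
    # assumed to open (creating if necessary) the phrase dictionary file,
    # e.g. mode 'a' for appending new entries
    return open(path, mode, encoding='utf-8')

The --batch-start and --batch-end arguments are what make the parallel jobs mentioned at the top of main() possible: each copy of the script is pointed at a different slice of the TMX units.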