Example #1
import numpy as np
import pandas as pd
from flask import current_app
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import NaturalLanguageUnderstandingV1 as NaLaUn
from ibm_watson.natural_language_understanding_v1 import (
    ConceptsOptions, EntitiesOptions, Features, KeywordsOptions)

# norm_dot, scale, and get_corpus_archetypes are project-local helpers,
# assumed to be importable from elsewhere in the application.


def analyze_text(corpus_id, text, type, n_archs):
    # Ask Watson NLU to extract concepts, entities, and keywords.
    features = Features(
        concepts=ConceptsOptions(),
        entities=EntitiesOptions(),
        keywords=KeywordsOptions(),
    )
    # Build the Watson NLU client from the Flask app's configuration.
    authenticator = IAMAuthenticator(
        current_app.config['NATURAL_LANGUAGE_UNDERSTANDING_IAM_APIKEY']
    )
    service = NaLaUn(
        version=current_app.config['NATURAL_LANGUAGE_UNDERSTANDING_VERSION'],
        authenticator=authenticator,
    )
    service.set_service_url(
        current_app.config['NATURAL_LANGUAGE_UNDERSTANDING_URL']
    )
    response = service.analyze(text=text, features=features)

    # Convert each section of the response into its own DataFrame.
    results = {}
    for typ in ('entities', 'concepts', 'keywords'):
        results[typ] = pd.DataFrame(response.result[typ])

    # Turn the concept relevance scores into a normalized test vector.
    test_vec = (
        results['concepts'].set_index('text')[['relevance']].apply(norm_dot)
    )
    archetypes = get_corpus_archetypes(corpus_id, type=type, n_archs=n_archs)

    # Select the subset of corpus features that cover the test vector.
    in_common = list(set(test_vec.index).intersection(archetypes.fn.columns))

    # Project the document onto each archetype and express the match as an
    # integer percentage.
    similarities = (
        (archetypes.fn[in_common] @ test_vec.loc[in_common]) * 100
    ).applymap(int)
    similarities.columns = ['similarity %']

    # Reindex the test vector onto the full corpus feature space; features
    # absent from the document get a small negative placeholder.
    test_vec_expanded = pd.DataFrame(
        test_vec,
        index=archetypes.f.columns,
    ).apply(scale).fillna(-0.1)

    compare = archetypes.f.T.apply(scale)
    compare['DOC'] = test_vec_expanded.apply(scale)

    # For each archetype, keep its significant features (weight > 0.1) and
    # pair them with the document's values for side-by-side comparison.
    archetype_maps = []
    for ix in archetypes.f.index:
        cmp = compare.sort_values(by=ix, ascending=True)[[ix, 'DOC']]
        cmp = cmp[cmp[ix] > 0.1]
        archetype_maps.append(cmp.applymap(np.sqrt))

    return similarities, archetype_maps
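
A minimal usage sketch (not from the original source): analyze_text relies on
current_app, so it has to run inside a Flask application context. The corpus
id, text, and keyword argument values below are illustrative.

with app.app_context():
    # Hypothetical values; any existing corpus id and document text work.
    similarities, archetype_maps = analyze_text(
        corpus_id=1,
        text='Machine learning systems infer patterns from data...',
        type='concepts',
        n_archs=6,
    )
    print(similarities)        # one 'similarity %' value per archetype
    for m in archetype_maps:
        print(m.head())        # strongest shared features per archetype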
Example #2
import os
import pickle
import time

from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import NaturalLanguageUnderstandingV1 as NaLaUn
from ibm_watson.natural_language_understanding_v1 import (
    ConceptsOptions, EntitiesOptions, Features, KeywordsOptions)

# db, Corpus, and CorpusResult are the application's SQLAlchemy handle and
# models, assumed to be importable from the project package.


def analyze_corpus(app, name, directory):
    # Ask Watson NLU to extract concepts, entities, and keywords.
    features = Features(
        concepts=ConceptsOptions(),
        entities=EntitiesOptions(),
        keywords=KeywordsOptions(),
    )
    with app.app_context():
        # Build the Watson NLU client from the app's configuration.
        authenticator = IAMAuthenticator(
            app.config['NATURAL_LANGUAGE_UNDERSTANDING_IAM_APIKEY'])
        service = NaLaUn(
            version=app.config['NATURAL_LANGUAGE_UNDERSTANDING_VERSION'],
            authenticator=authenticator)
        service.set_service_url(
            app.config['NATURAL_LANGUAGE_UNDERSTANDING_URL'])

        filenames = os.listdir(directory)

        # Commit the corpus record up front so its id is available for the
        # per-file results (commit already flushes, so no separate flush is
        # needed).
        new_corpus = Corpus(name=name, status='processing')
        db.session.add(new_corpus)
        db.session.commit()
        print('Analyzing corpus in thread. Corpus ID: ' + str(new_corpus.id))
        count = 0
        for file in filenames:
            path = os.path.join(directory, file)
            if not os.path.isfile(path) or not file.endswith('.txt'):
                continue
            # Read the file once, outside the retry loop; calling f.read()
            # again on a retry would return an empty string.
            with open(path) as f:
                text = f.read()
            for attempt in range(3):
                try:
                    results = service.analyze(text=text, features=features)
                    # Pickle the full Watson response for later retrieval.
                    pickled_results = pickle.dumps(results)
                    new_results = CorpusResult(
                        corpus_id=new_corpus.id,
                        name=file.replace('.txt', ''),
                        data=pickled_results,
                    )
                    db.session.add(new_results)
                    db.session.commit()
                    count += 1
                    print('Processed file #{}: {}'.format(count, file))
                except Exception as e:
                    print(e)
                    db.session.rollback()  # reset the session after a failure
                    time.sleep(0.5)
                    print('Retrying...')
                else:
                    break
            else:
                # The retry loop exhausted all attempts without a break.
                print('Failed to analyze a file ({}) after '
                      'multiple attempts.'.format(file))

        # All files processed; mark the corpus as ready for use.
        new_corpus.status = 'ready'
        db.session.commit()
        print('Finished analyzing corpus.')
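
A minimal launch sketch (not from the original source): the "Analyzing corpus
in thread" message suggests the function is meant to run off the main thread,
with the real app object passed in so the worker can push its own application
context. The corpus name and directory below are illustrative.

import threading

thread = threading.Thread(
    target=analyze_corpus,
    args=(app, 'news-articles', '/path/to/texts'),
    daemon=True,
)
thread.start()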