Example No. 1
from services.cassandra_ import CassandraDatabase
# InferRpcClient / USERpcClient are the project's RPC clients for the
# encoder services; their import path depends on the project layout.


def encode():
    # Source repositories holding the raw paper titles and summaries.
    title_repo = CassandraDatabase(project_name='papers',
                                   repo_name='title',
                                   id_sql_type='BIGINT',
                                   content_sql_type="TEXT")
    summary_repo = CassandraDatabase(project_name='papers',
                                     repo_name='summary',
                                     id_sql_type='BIGINT',
                                     content_sql_type="TEXT")

    # Target repositories for the encoded vectors, stored as TEXT.
    encoded_title_repo = CassandraDatabase(project_name='papers',
                                           repo_name='title_reviews',
                                           id_sql_type='BIGINT',
                                           content_sql_type="TEXT")
    encoded_summary_repo = CassandraDatabase(project_name='papers',
                                             repo_name='summary_reviews',
                                             id_sql_type='BIGINT',
                                             content_sql_type="TEXT")
    # encoder = USERpcClient()
    encoder = InferRpcClient()
    # path = r'C:\zorba\storage\vectorizer.joblib'
    # vectorizer = load(path)
    i = 0
    for id, row in title_repo.list():
        print(i)  # progress counter
        i += 1
        # Encode the title and its matching summary; the RPC response
        # carries the vector under the 'encoded' key.
        title_vec = str(encoder.encode(row)['encoded'])
        summary_vec = str(encoder.encode(summary_repo.read(id)[0])['encoded'])
        # title_vec = str(vectorizer.transform([row]).toarray()[0].tolist())
        # summary_vec = str(vectorizer.transform([summary_repo.read(id)[0]]).toarray()[0].tolist())
        encoded_title_repo.write(id, title_vec)
        encoded_summary_repo.write(id, summary_vec)
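
The loop above only assumes the encoder returns a dict with the vector under the 'encoded' key. A minimal stand-in for dry runs without the RPC service, assuming that response shape (the stub class and its hashing trick are hypothetical):

# Hypothetical stand-in for InferRpcClient; only the {'encoded': [...]}
# response shape is taken from the code above.
class FakeInferRpcClient:
    def encode(self, text, dim=8):
        vec = [0.0] * dim
        for token in text.lower().split():
            vec[hash(token) % dim] += 1.0  # toy hashing-trick features
        return {'encoded': vec}

encoder = FakeInferRpcClient()
print(encoder.encode('Attention Is All You Need'))  # {'encoded': [...]}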
Example No. 2
from services.cassandra_ import CassandraDatabase
import ast
import nltk

# meta_repo and summary_repo are configured as in the other examples.
meta_repo = CassandraDatabase(project_name='papers', repo_name='meta', id_sql_type='BIGINT', content_sql_type="TEXT")
summary_repo = CassandraDatabase(project_name='papers', repo_name='summary', id_sql_type='BIGINT', content_sql_type="TEXT")


def split_into_sents():
    count = 0
    sents_repo = CassandraDatabase(project_name='papers', repo_name='sentences', id_sql_type='BIGINT', content_sql_type="TEXT")
    for id, text in summary_repo.list():
        # Metadata rows are Python-literal strings; patch 'nan' before parsing.
        row = meta_repo.read(id)[0]
        meta = ast.literal_eval(row.replace('nan', "''"))
        meta['children'] = []
        # Store each sentence under a fresh global id and record the ids
        # on the parent's metadata.
        sent_text = nltk.sent_tokenize(text)
        for sentence in sent_text:
            sents_repo.write(count, sentence)
            meta['children'].append(count)
            count += 1
        meta_repo.write(id, str(meta))
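
nltk.sent_tokenize depends on NLTK's punkt models, which have to be downloaded once; a quick standalone check of the split the loop relies on:

import nltk

nltk.download('punkt', quiet=True)  # one-time download of the tokenizer models
print(nltk.sent_tokenize('We propose a model. It beats the baseline.'))
# -> ['We propose a model.', 'It beats the baseline.']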
Example No. 3
from services.cassandra_ import CassandraDatabase
from joblib import load


def encode_sents():
    sents_repo = CassandraDatabase(project_name='papers',
                                   repo_name='sentences',
                                   id_sql_type='BIGINT',
                                   content_sql_type="TEXT")
    encoded_sents_repo = CassandraDatabase(project_name='papers',
                                           repo_name='sents_count_vec',
                                           id_sql_type='BIGINT',
                                           content_sql_type="TEXT")
    # encoder = InferRpcClient()
    # encoder = USERpcClient()
    path = r'C:\zorba\storage\vectorizer.joblib'  # fitted vectorizer saved with joblib
    vectorizer = load(path)
    i = 0
    for id, row in sents_repo.list():
        print(i)  # progress counter
        i += 1
        # sent_vec = str(encoder.encode(row)['encoded'])
        # Vectorize the sentence and store the dense vector as TEXT.
        sent_vec = str(vectorizer.transform([row]).toarray()[0].tolist())
        encoded_sents_repo.write(id, sent_vec)
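
The target repo name sents_count_vec suggests the saved vectorizer is a fitted scikit-learn CountVectorizer; a sketch of how such a joblib file could be produced under that assumption (the corpus here is a placeholder):

# Hypothetical fitting step for the vectorizer loaded above; CountVectorizer
# is an assumption based on the 'sents_count_vec' repo name.
from joblib import dump
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['first training sentence', 'second training sentence']  # placeholder
vectorizer = CountVectorizer()
vectorizer.fit(corpus)
dump(vectorizer, r'C:\zorba\storage\vectorizer.joblib')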
Example No. 4
from services.cassandra_ import CassandraDatabase
import ast
import random

meta_repo = CassandraDatabase(project_name='papers',
                              repo_name='meta',
                              id_sql_type='BIGINT',
                              content_sql_type="TEXT")
sent_sum_map_repo = CassandraDatabase(project_name='papers',
                                      repo_name='sent_sum_map',
                                      id_sql_type='BIGINT',
                                      content_sql_type="TEXT")
encoded_sents_repo = CassandraDatabase(project_name='papers',
                                       repo_name='sents_count_vec',
                                       id_sql_type='BIGINT',
                                       content_sql_type="TEXT")

for id, row in meta_repo.list():
    meta = ast.literal_eval(row.replace('nan', "''"))
    ids = meta['children']
    # Debug spot-check of a single paper's sentence vectors:
    # print(id, ids, random.choice(ids))
    # print(encoded_sents_repo.read(random.choice(ids))[0])
    # break
    # Map every sentence id back to the id of its parent summary.
    for child_id in ids:
        sent_sum_map_repo.write(child_id, str(id))
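
With the map filled in, a sentence hit can be resolved back to its paper; a minimal lookup using the same read API as above (the sentence id is hypothetical):

sent_id = 0  # hypothetical sentence id
summary_id = int(sent_sum_map_repo.read(sent_id)[0])
print(meta_repo.read(summary_id)[0])  # the parent paper's metadata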
Example No. 5
from services.cassandra_ import CassandraDatabase
import ast
import spacy

nlp = spacy.load('en_core_web_sm')
meta_repo = CassandraDatabase(project_name='papers',
                              repo_name='meta',
                              id_sql_type='BIGINT',
                              content_sql_type="TEXT")
summary_repo = CassandraDatabase(project_name='papers',
                                 repo_name='summary',
                                 id_sql_type='BIGINT',
                                 content_sql_type="TEXT")

# KeywordExtractorRpcClient is the project's RPC client for the keyword
# extraction service; its import path depends on the project layout.
extractor = KeywordExtractorRpcClient()
for id, row in meta_repo.list():
    meta = ast.literal_eval(row.replace('nan', "''"))
    keywords = meta['keywords']
    text = summary_repo.read(id)[0]
    # A single-entry keyword list is treated as a placeholder and replaced;
    # anything else is extended with keywords extracted from the summary.
    if len(keywords) != 1:
        keywords += list(extractor.extract(text)['keywords'])
    else:
        keywords = list(extractor.extract(text)['keywords'])

    # Deduplicate the keywords and store their spaCy lemmas alongside them.
    keywords_lemmas = [token.lemma_ for token in nlp(' '.join(keywords))]
    meta['keywords_lemmas'] = list(set(keywords_lemmas))
    meta['keywords'] = list(set(keywords))
    meta_repo.write(id, str(meta))
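
For reference, the lemmatization behavior the final step relies on, checked standalone with the same en_core_web_sm model (the sample phrase is arbitrary). Note that joining the keywords with spaces lemmatizes them as one text, so multi-word keywords are split into individual lemmas:

import spacy

nlp = spacy.load('en_core_web_sm')  # requires: python -m spacy download en_core_web_sm
print([token.lemma_ for token in nlp('neural networks embeddings')])
# e.g. ['neural', 'network', 'embedding']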