from services.cassandra_ import CassandraDatabase


def encode():
    title_repo = CassandraDatabase(project_name='papers', repo_name='title',
                                   id_sql_type='BIGINT', content_sql_type='TEXT')
    summary_repo = CassandraDatabase(project_name='papers', repo_name='summary',
                                     id_sql_type='BIGINT', content_sql_type='TEXT')
    encoded_title_repo = CassandraDatabase(project_name='papers', repo_name='title_reviews',
                                           id_sql_type='BIGINT', content_sql_type='TEXT')
    encoded_summary_repo = CassandraDatabase(project_name='papers', repo_name='summary_reviews',
                                             id_sql_type='BIGINT', content_sql_type='TEXT')

    # Both encoders are RPC client wrappers defined elsewhere in the project.
    # encoder = USERpcClient()
    encoder = InferRpcClient()
    # Count-vectorizer alternative (note the raw string for the Windows path):
    # path = r'C:\zorba\storage\vectorizer.joblib'
    # vectorizer = load(path)

    i = 0
    for id, row in title_repo.list():
        print(i)
        i += 1
        # Encode the title and its matching summary; vectors are stored as
        # stringified Python lists under the same paper id.
        title_vec = str(encoder.encode(row)['encoded'])
        summary_vec = str(encoder.encode(summary_repo.read(id)[0])['encoded'])
        # title_vec = str(vectorizer.transform([row]).toarray()[0].tolist())
        # summary_vec = str(vectorizer.transform([summary_repo.read(id)[0]]).toarray()[0].tolist())
        encoded_title_repo.write(id, title_vec)
        encoded_summary_repo.write(id, summary_vec)
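# encode() writes each vector as str(list), so downstream readers have to parse
# the text back into numbers. A minimal sketch of that round trip, assuming the
# repo layout above; the two paper ids and the cosine() helper are hypothetical.
import ast

import numpy as np

from services.cassandra_ import CassandraDatabase


def cosine(u, v):
    # Plain cosine similarity between two dense vectors.
    u, v = np.asarray(u, dtype=float), np.asarray(v, dtype=float)
    return float(u @ v / (np.linalg.norm(u) * np.linalg.norm(v)))


encoded_title_repo = CassandraDatabase(project_name='papers', repo_name='title_reviews',
                                       id_sql_type='BIGINT', content_sql_type='TEXT')
# Vectors were written with str(list), so ast.literal_eval restores them.
vec_a = ast.literal_eval(encoded_title_repo.read(1)[0])
vec_b = ast.literal_eval(encoded_title_repo.read(2)[0])
print(cosine(vec_a, vec_b))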
import ast
import json


def title_sents():
    repo = CassandraDatabase(project_name='papers', repo_name='title',
                             id_sql_type='BIGINT', content_sql_type='TEXT')
    sent_sum_map_repo = CassandraDatabase(project_name='papers', repo_name='sent_sum_map',
                                          id_sql_type='BIGINT', content_sql_type='TEXT')

    loc = 0
    top3 = 0
    top10 = 0
    k = 0
    for id, row in repo.list():
        k += 1
        print(k)
        # `q` is the sentence-similarity query client defined elsewhere; with
        # count equal to the corpus size (203) the true paper is always found.
        result = ast.literal_eval(q.query(json.dumps({"text": row, "count": 203})))
        sims = result['result']
        inter = result['keywords']

        # Walk the ranked sentences, map each back to its paper id, and record
        # the (1-based) rank at which the query's own paper first appears.
        papers_ids = []
        for sent_id in list(sims.keys()):
            paper_id = int(sent_sum_map_repo.read(sent_id)[0])
            if paper_id not in papers_ids:
                papers_ids.append(paper_id)
            if paper_id == id:
                index = len(papers_ids)
                break
        # papers_ids = np.array(papers_ids)
        # index = np.where(np.array(papers_ids) == id)[0][0]
        if index < 3:
            top3 += 1
        if index < 10:
            top10 += 1
        loc += index
    print('{:.2f} top3 {} top 10 {}'.format(loc / repo.count(), top3, top10))


'''total count 203. USE: [11.40, 3: 11, 10: 143], upvotes: [17.77, 3: 107, 10: 141],'''
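# For reference, the three numbers printed above are the mean rank
# (loc / count) and the top-3 / top-10 hit counts. A toy illustration with
# made-up ranks, mirroring the comparisons in the loop:
ranks = [1, 5, 13, 2, 58]            # rank of the true paper per query
mean_rank = sum(ranks) / len(ranks)
top3 = sum(1 for r in ranks if r < 3)
top10 = sum(1 for r in ranks if r < 10)
print(mean_rank, top3, top10)        # 15.8 2 3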
from services.cassandra_ import CassandraDatabase
import ast
import random

meta_repo = CassandraDatabase(project_name='papers', repo_name='meta',
                              id_sql_type='BIGINT', content_sql_type='TEXT')
sent_sum_map_repo = CassandraDatabase(project_name='papers', repo_name='sent_sum_map',
                                      id_sql_type='BIGINT', content_sql_type='TEXT')
encoded_sents_repo = CassandraDatabase(project_name='papers', repo_name='sents_count_vec',
                                       id_sql_type='BIGINT', content_sql_type='TEXT')

for id, row in meta_repo.list():
    # Paper metadata is stored as a stringified dict; 'nan' placeholders are
    # blanked so ast.literal_eval can parse it.
    meta = ast.literal_eval(row.replace('nan', '\'\''))
    ids = meta['children']
    # Debug spot-check of one child sentence vector:
    # print(id, ids, random.choice(ids))
    # print(encoded_sents_repo.read(random.choice(ids))[0])
    # break
    # Map every child sentence id back to its parent paper id.
    for child_id in ids:
        sent_sum_map_repo.write(child_id, str(id))
from services.cassandra_ import CassandraDatabase
import ast
import spacy

nlp = spacy.load('en_core_web_sm')
meta_repo = CassandraDatabase(project_name='papers', repo_name='meta',
                              id_sql_type='BIGINT', content_sql_type='TEXT')
summary_repo = CassandraDatabase(project_name='papers', repo_name='summary',
                                 id_sql_type='BIGINT', content_sql_type='TEXT')
# KeywordExtractorRpcClient is the keyword-extraction RPC wrapper defined
# elsewhere in the project.
extractor = KeywordExtractorRpcClient()

for id, row in meta_repo.list():
    meta = ast.literal_eval(row.replace('nan', '\'\''))
    keywords = meta['keywords']
    text = summary_repo.read(id)[0]
    # Merge extracted keywords with the stored ones; a single-element keyword
    # list is treated as a placeholder and replaced outright.
    if len(keywords) != 1:
        keywords += list(extractor.extract(text)['keywords'])
    else:
        keywords = list(extractor.extract(text)['keywords'])
    # Deduplicate and store both the surface forms and their lemmas.
    keywords_lemmas = [token.lemma_ for token in nlp(' '.join(keywords))]
    meta['keywords_lemmas'] = list(set(keywords_lemmas))
    meta['keywords'] = list(set(keywords))
    meta_repo.write(id, str(meta))
from joblib import dump, load
from services.cassandra_ import CassandraDatabase
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer
import time

title_repo = CassandraDatabase(project_name='papers', repo_name='title',
                               id_sql_type='BIGINT', content_sql_type='TEXT')
summary_repo = CassandraDatabase(project_name='papers', repo_name='summary',
                                 id_sql_type='BIGINT', content_sql_type='TEXT')

stop_words = stop_words.ENGLISH_STOP_WORDS

# Build the fitting corpus from every summary plus its matching title.
corpus = []
for id, row in summary_repo.list():
    corpus.append(row)
    corpus.append(title_repo.read(id)[0])

path = r'C:\zorba\storage\vectorizer.joblib'
vectorizer = CountVectorizer(stop_words=stop_words)
t1 = time.time()
vectorizer.fit(corpus)
print(time.time() - t1)
print(len(vectorizer.get_feature_names()))
dump(vectorizer, path)
# vectorizer = load(path)
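# Once dumped, the vectorizer can be reloaded and applied to new text, which
# is what the commented-out branch in encode() does. A minimal sketch assuming
# the same path; the sample query string is made up.
from joblib import load

path = r'C:\zorba\storage\vectorizer.joblib'
vectorizer = load(path)
# transform() returns a sparse matrix; the scripts above store the dense row
# as a stringified Python list.
vec = vectorizer.transform(['neural machine translation']).toarray()[0].tolist()
print(len(vec), sum(vec))  # vocabulary size, total token counts hit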