import json
import time


def process_records(records):
    """Add NLP processing results to each record."""
    start = time.time()
    # print(f'Executing process_records(records), Record number = {len(records)}')
    for i, rec in enumerate(records):
        text_values = [
            rec['title'],
            rec['link_title'],
            rec['announce'],
            rec['uannounce'],
            rec['full-text'],
        ]
        # Leading '. ' works around a glitch in the NLP library
        combined_text = '. ' + '. '.join(v for v in text_values if v is not None)
        o = text_processor.process_text(combined_text, clear=True)
        entities_text = ', '.join(r['name'] for r in o['entities_list'])
        rec['lemmatized_text'] = o['lemmatized_text']
        rec['entities_text'] = entities_text
        rec['entities_grouped'] = json.dumps(o['entities_grouped'], ensure_ascii=False)
        rec['process_status'] = o['process_status']
        if i % 10 == 0:
            print(f'{i:>15}')
    duration = time.time() - start
    return records, None, duration
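# Hypothetical usage sketch for process_records(): the record values below are
# made up; only the key names match what the function expects, and
# text_processor is assumed to be initialized elsewhere in the module.
sample_records = [{
    'title': 'Example title',
    'link_title': 'Example link title',
    'announce': 'Short announce text',
    'uannounce': None,  # missing values are skipped when building combined_text
    'full-text': 'Full article body goes here.',
}]

processed, _, took = process_records(sample_records)
print(f'Processed {len(processed)} records in {took:.2f}s')
print(processed[0]['entities_text'])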
import logging

from flask import request, make_response


def search():
    text = get_text()
    field = request.args.get('field', 'lemmatized_text')
    skip = request.args.get('skip', 0)
    limit = request.args.get('limit', 20)
    timeout = request.args.get('timeout', '5s')
    lemmatize = request.args.get('lemmatize', True)
    from_date = request.args.get('from_date', '2000-01-01')
    to_date = request.args.get('to_date', '2030-01-01')
    index = request.args.get('index', 'articles')
    if lemmatize != "false":
        o = text_processor.process_text(text, clear=True)
        text = o.get('lemmatized_text', '')
    else:
        logging.warning('NOT lemmatized !!!!')
    search_result = elastic.search(text, skip=skip, limit=limit, field=field,
                                   timeout=timeout, from_date=from_date,
                                   to_date=to_date, index=index)
    return make_response(search_result)
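# Hypothetical sketch of the elastic.search wrapper used above; the real
# wrapper lives elsewhere in the project. It assumes the official
# elasticsearch-py client and a document date field named 'date'
# (both assumptions, not confirmed by the source).
from elasticsearch import Elasticsearch

es = Elasticsearch()


def search_sketch(text, skip=0, limit=20, field='lemmatized_text',
                  timeout='5s', from_date='2000-01-01',
                  to_date='2030-01-01', index='articles'):
    body = {
        'from': int(skip),
        'size': int(limit),
        'timeout': timeout,
        'query': {
            'bool': {
                'must': [{'match': {field: text}}],
                'filter': [{'range': {'date': {'gte': from_date, 'lte': to_date}}}],
            }
        },
    }
    return es.search(index=index, body=body)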
import operator

# Load the three splits from the database
c = conn.cursor()
c.execute("SELECT * FROM training")
tr_d = c.fetchall()
c.execute("SELECT * FROM test")
te_d = c.fetchall()
c.execute("SELECT * FROM validation")
va_d = c.fetchall()

# Column 0 holds the text, column 2 the sentiment label
tr_x = [d[0] for d in tr_d]
tr_y_ = [sent2onehot(d[2]) for d in tr_d]
te_x = [d[0] for d in te_d]
te_y_ = [sent2onehot(d[2]) for d in te_d]
va_x = [d[0] for d in va_d]
va_y_ = [sent2onehot(d[2]) for d in va_d]

ptr_x = [process_text(x, stop_words) for x in tr_x]
pte_x = [process_text(x, stop_words) for x in te_x]
pva_x = [process_text(x, stop_words) for x in va_x]

# Build the vocabulary from the training split, keeping the most frequent tokens
vocab_size = 1000
vocab = {}
for text in ptr_x:
    tokens = text.split(' ')
    for t in tokens:
        vocab[t] = vocab.get(t, 0) + 1
sorted_vocab = sorted(vocab.items(), key=operator.itemgetter(1), reverse=True)[:vocab_size]
features = [v[0] for v in sorted_vocab]

tr_dtmat = word2vec(ptr_x, features)
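# Hypothetical sketches of the helpers referenced above; the real sent2onehot()
# and word2vec() are defined elsewhere, and the label set
# (negative/neutral/positive) is an assumption.
def sent2onehot_sketch(sentiment, labels=('negative', 'neutral', 'positive')):
    """Map a sentiment label to a one-hot vector."""
    vec = [0] * len(labels)
    vec[labels.index(sentiment)] = 1
    return vec


def word2vec_sketch(texts, features):
    """Build a document-term count matrix over the selected feature words."""
    index = {w: j for j, w in enumerate(features)}
    mat = [[0] * len(features) for _ in texts]
    for i, text in enumerate(texts):
        for tok in text.split(' '):
            j = index.get(tok)
            if j is not None:
                mat[i][j] += 1
    return mat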
import os
import pickle
import sqlite3

import numpy as np
import praw
from nltk.corpus import stopwords


def data_from_db(self):
    """
    Get the subreddit corpus from the database reddit.db.

    :return:
        text_matrix: matrix of text in subreddits; rows are subreddits.
        sub_list: list of subreddits included in the matrix.
        sub_to_index: dictionary for converting from subreddit name to index in the matrix.
    """
    sub_list = []
    text_matrix = []
    unstemmed_text_matrix = []  # used for the word cloud later
    connecting_to_db = True
    sql_command = ("SELECT subreddit, GROUP_CONCAT(body, ' ') as all_comments "
                   "FROM comments GROUP BY subreddit")
    while connecting_to_db:
        try:
            print("Connecting to DB.\n")
            pwd = os.getcwd()
            db_conn = sqlite3.connect(pwd + '/../db/reddit.db')
            c = db_conn.cursor()
            results = c.execute(sql_command)
        except sqlite3.OperationalError:
            print("Table does not exist yet. Creating from CSV.\n")
            create_db(db_conn)
            continue
        print("Done.")
        break

    english_stop_words = stopwords.words('english')
    r = praw.Reddit(user_agent='daniel_scraper')
    for i, row in enumerate(list(results)):
        print("Loading subreddit {}: {}....".format(i, row[0]), end="")
        '''
        try:
            if r.get_subreddit(row[0]).subscribers < 50000:
                print("Done")
                continue
        except:
            print("Something went wrong. Continuing.")
            continue
        '''
        sub_list.append(row[0].lower())
        text_matrix.append(process_text(row[1], punctuation, english_stop_words))
        unstemmed_text_matrix.append(process_text(row[1], punctuation, english_stop_words, stem=False))
        print("Done")

    sub_to_index = {sub_name: index for index, sub_name in enumerate(sub_list)}
    print("Done.\n")

    text_matrix = np.array(text_matrix)
    unstemmed_text_matrix = np.array(unstemmed_text_matrix)
    np.save('unstemmed_text_matrix.npy', unstemmed_text_matrix)
    np.save('text_matrix.npy', text_matrix)
    pickle.dump(sub_list, open("sub_list.p", "wb"))
    pickle.dump(sub_to_index, open("sub_to_index.p", "wb"))
    return text_matrix, sub_list, sub_to_index
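# Hypothetical sketch of the process_text() helper called in data_from_db();
# the real implementation lives elsewhere. It is assumed to strip punctuation,
# drop stop words and optionally stem with NLTK's PorterStemmer.
from nltk.stem import PorterStemmer


def process_text_sketch(text, punctuation, stop_words, stem=True):
    stemmer = PorterStemmer()
    cleaned = ''.join(ch for ch in text.lower() if ch not in punctuation)
    tokens = [t for t in cleaned.split() if t not in stop_words]
    if stem:
        tokens = [stemmer.stem(t) for t in tokens]
    return ' '.join(tokens)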
def clear_lemmas_entities():
    text = get_text()
    o = text_processor.process_text(text, clear=True)
    return make_response(o)
def lemmas_entities():
    text = get_text()
    o = text_processor.process_text(text)
    return make_response(o)
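# Hypothetical sketch of the get_text() helper shared by the endpoints above;
# the real helper is defined elsewhere. It is assumed to take the text either
# from the raw request body or from a 'text' query parameter.
from flask import request


def get_text_sketch():
    if request.data:
        return request.data.decode('utf-8')
    return request.args.get('text', '')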