from collections import Counter

import nltk
from nltk import ngrams
from corus import load_lenta


# text_prepare is assumed to be this project's own text-cleaning helper.
def get_grams_from_text(path='lenta-ru-news.csv.gz', n=[1, 2], amount_of_sentense=1000,
                        verbose=True, show_how_much=1000, **kwargs):
    records = load_lenta(path)
    grams, count = {}, 1
    # Join each n-gram's tokens into a string and flatten the per-sentence lists.
    flatten = lambda l: [' '.join(item) for sublist in l for item in sublist]
    try:
        while count != amount_of_sentense:
            item = next(records).text
            if verbose and count % show_how_much == 0:
                print(f'Sentence {count}')
            for i in n:
                if i not in grams:
                    grams[i] = Counter()
                ngram = [list(ngrams(text_prepare(sentense, **kwargs).lower().split(), n=i))
                         for sentense in nltk.sent_tokenize(item)]
                grams[i] += Counter(flatten(ngram))
            count += 1
    except StopIteration:
        pass
    finally:
        del records
    return grams
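# Usage sketch (not part of the original source). Assumes the NLTK 'punkt'
# sentence-tokenizer data is already downloaded and that text_prepare accepts
# a raw string.
def _demo_get_grams():
    grams = get_grams_from_text(n=[1, 2], amount_of_sentense=500, verbose=False)
    print(grams[2].most_common(10))   # ten most frequent bigrams in the sample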
def _searchLenta(self):
    path = 'lenta-ru-news.csv.gz'
    self._records = load_lenta(path)
    self._chooseMethod = 3
    # Lenta.ru topic labels to search within (Business, State economy, Cinema,
    # People, Music, Science, Incidents, Investigation and court, Ukraine, Football).
    self._lentaListOfThemes = {
        "Бизнес", "Госэкономика", "Кино", "Люди", "Музыка",
        "Наука", "Происшествия", "Следствие и суд", "Украина", "Футбол",
    }
def get_records():
    records = load_lenta('data/lenta-ru-news.csv.gz')
    # texts = [t['text'] for t in test_set]
    test_set = []
    for i, r in enumerate(records):
        test_set.append(r.title + '\n' + r.text)
        if i >= 200:
            break
    return test_set
def test_on_k_random_records(K):
    records = load_lenta(lenta_path)
    # N is the module-level total number of records available for sampling.
    records_num = [i for i in range(N)]
    chosen_records_num = random.choices(records_num, k=K)
    my_records = []
    for i in chosen_records_num:
        my_records.append(get_k_record(records, i))
    print('This is ' + tp.BOLD + tp.RED + f'{chosen_records_num}' + tp.END + ' records\n')
    for i in range(K):
        print(tp.BOLD + tp.RED + f'{chosen_records_num[i]}' + tp.END + '\t')
        markup = ner(my_records[i].text)
        show_markup(markup.text, markup.spans)
        print('\n--------------------------\n\n')
def get_text(path='../../../data/lenta-ru-news.csv.gz', amount_of_sentense=1000,
             verbose=True, show_how_much=1000, **kwargs):
    records = load_lenta(path)
    a = []
    count = 1
    try:
        while count != amount_of_sentense:
            item = next(records).text
            if verbose and count % show_how_much == 0:
                print(f'Sentence {count}')
            a.append(text_prepare(item))
            count += 1
    except StopIteration:
        pass
    finally:
        del records
    return a
def __init__(self, tokenizer, eval=False):
    self.tokenizer = tokenizer
    self.tokenizer_params = {
        "max_length": 128,
        "truncation": True,
        "add_special_tokens": True,
    }
    fact_ru = [item.text for item in cor.load_factru(config.fact_ru)]
    # TEST: make sure the tokenizer handles a sample document with these parameters.
    self.tokenizer.encode_plus(fact_ru[0], **self.tokenizer_params)
    lenta_ru = [item.text for item in cor.load_lenta(config.lenta_ru)]
    # Wikipedia texts come from a pre-extracted plain-text dump rather than cor.load_wiki.
    # wiki_ru = [item.text for item in cor.load_wiki(config.wiki_ru)]
    with open("data/ru/wiki.txt", "r") as f:
        wiki_ru = [l.strip() for l in f if len(l) > 35]
    self.data = fact_ru + wiki_ru + lenta_ru
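# Sketch of the dataset accessors that would accompany the __init__ above; the
# original class body is not shown, so these method bodies are assumptions.
def __len__(self):
    return len(self.data)

def __getitem__(self, idx):
    # Tokenize one raw text with the stored parameters (HuggingFace encode_plus).
    return self.tokenizer.encode_plus(self.data[idx], **self.tokenizer_params)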
def test_on_random_record():
    records = load_lenta(lenta_path)
    record, k = get_random_record(records)
    markup = ner(record.text)
    print('This is ' + tp.BOLD + tp.RED + f'{k}' + tp.END + ' record\n')
    show_markup(markup.text, markup.spans)
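# get_random_record is not defined in these snippets; one possible implementation
# (an assumption, not the original): reservoir-sample a single record from the
# corus generator and return it together with its index.
import random

def get_random_record(records):
    chosen, chosen_idx = None, -1
    for i, record in enumerate(records):
        # Replace the current choice with probability 1 / (i + 1), which leaves
        # every record equally likely to be the one returned.
        if random.randrange(i + 1) == 0:
            chosen, chosen_idx = record, i
    return chosen, chosen_idx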
def main(args):
    records = load_lenta('data/lenta-ru-news.csv.gz')
    for i, r in enumerate(records):
        print(json.dumps({'text': r.title + '\n' + r.text}, ensure_ascii=False))
        if i >= args.doc_count:
            break
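# Hypothetical command-line entry point for main() above; the original argument
# parsing is not shown, so the --doc-count flag is an assumption.
import argparse
import json

from corus import load_lenta

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Dump Lenta.ru articles as JSON lines.')
    parser.add_argument('--doc-count', type=int, default=100)
    main(parser.parse_args())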
def get_all_pos(self):
    if self.session:
        store = ""
        key = 'id_' + str(self.user_id)
        # Find the session store belonging to the current user so that
        # progress can be reported while the corpora are processed.
        for session in Session.objects.filter(expire_date__gt=timezone.now()):
            store = SessionStore(session_key=session.session_key)
            if int(store.get('_auth_user_id')) == self.user_id:
                if key not in store:
                    store[key] = {
                        'processed': -1,
                        'total': 0,
                        'corpus': self.train_sets[0],
                    }
    unique_bigrams = dict()
    unique_trigrams = dict()
    posTagger = PoSTagger(self.lang)
    tok = Tokenizer(lang=self.lang)
    for train_file in self.train_sets:
        if self.session:
            # Another run is already processing this user's corpora.
            if store[key]['processed'] != -1:
                break
            store[key]['processed'] = 0
            store.save()
        # TODO:
        #   for tt in self.tag_types: if tt not in DB, save the tag; then clear self.tag_types
        #   for bigramm in unique_bigrams: save it to the DB or update its frequency; then clear it
        path = str(self.BASE_DIR) + '\\grammar\\data\\' + train_file
        if self.lang == 'en':
            sents = [' '.join(sent).replace(':', '').replace('``', '')
                     .replace("''", '').replace('`', "'").split()
                     for sent in brown.sents()]
            if self.session:
                store[key]['total'] = len(sents)
            for i in range(len(sents)):
                # Report progress every SHOW_AFTER sentences.
                if self.session and (i + 1) % SHOW_AFTER == 0 and i != 0:
                    store[key]['processed'] = int(store[key]['processed']) + SHOW_AFTER
                    store.save()
                    print("STORE ", i, ": ", store[key])
                cur_sent = ' '.join(sents[i])
                self.tag_types, temp = process_sent(cur_sent, self.tag_types,
                                                    posTagger, make_all_lower=True)
                unique_bigrams = tok.generate_different_ngrams([temp], 2, unique_bigrams)
                # unique_trigrams = tok.generate_different_ngrams([temp], 3, unique_trigrams)
        if self.lang == 'ru':
            records = load_lenta(path)
            if self.session:
                store[key]['total'] = RU_RECORDS
            rec_count = 0
            print(RU_RECORDS)
            for record in records:
                # Report progress every SHOW_AFTER records.
                if self.session and (rec_count + 1) % SHOW_AFTER == 0 and rec_count != 0:
                    store[key]['processed'] = int(store[key]['processed']) + SHOW_AFTER
                    store.save()
                    print("STORE ", rec_count, ": ", store[key])
                if rec_count == RU_RECORDS:
                    break
                rec_count += 1
                tokenized_sent = tok.tokenize_sent(record.text)
                for sent in tokenized_sent:
                    self.tag_types, temp = process_sent(sent, self.tag_types,
                                                        posTagger, make_all_lower=True)
                    unique_bigrams = tok.generate_different_ngrams([temp], 2, unique_bigrams)
                    # unique_trigrams = tok.generate_different_ngrams([temp], 3, unique_trigrams)
    return unique_bigrams, unique_trigrams