Example #1
import nltk
from collections import Counter
from corus import load_lenta
from nltk.util import ngrams


def get_grams_from_text(path='lenta-ru-news.csv.gz',
                        n=(1, 2),
                        amount_of_sentense=1000,
                        verbose=True,
                        show_how_much=1000, **kwargs):
    # Count n-grams over the first `amount_of_sentense` Lenta.ru articles.
    # `text_prepare` is a user-defined cleaning helper that receives **kwargs.
    records = load_lenta(path)
    grams, count = {}, 1
    flatten = lambda l: [' '.join(item) for sublist in l for item in sublist]
    try:
        while count != amount_of_sentense:
            item = next(records).text
            if verbose and count % show_how_much == 0:
                print(f'Sentence {count}')
            for i in n:
                if i not in grams:
                    grams[i] = Counter()
                ngram = [list(ngrams(text_prepare(sentense, **kwargs).lower().split(), n=i))
                         for sentense in nltk.sent_tokenize(item)]
                grams[i] += Counter(flatten(ngram))
            count += 1
    except StopIteration:
        pass
    finally:
        del records
    return grams
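A minimal driver for the function above (a sketch, not part of the original: it assumes the corus package is installed, the lenta-ru-news.csv.gz dump sits in the working directory, NLTK's punkt models are available, and it substitutes a trivial text_prepare for the project's own cleaning helper):

import re

import nltk

nltk.download('punkt')  # one-time download of the sentence tokenizer model

def text_prepare(text):
    # Trivial stand-in for the project's own cleaner: strip punctuation.
    return re.sub(r'[^\w\s]', ' ', text)

grams = get_grams_from_text(amount_of_sentense=100, verbose=False)
print(grams[2].most_common(5))  # five most frequent bigrams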
Example #2
    def _searchLenta(self):
        # Load the Lenta.ru dump and whitelist the rubrics of interest.
        path = 'lenta-ru-news.csv.gz'
        self._records = load_lenta(path)
        self._chooseMethod = 3
        self._lentaListOfThemes = {
            "Бизнес", "Госэкономика", "Кино", "Люди", "Музыка", "Наука",
            "Происшествия", "Следствие и суд", "Украина", "Футбол"
        }
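A hedged sketch of how such a rubric whitelist is typically applied to the record stream (it assumes corus's LentaRecord exposes a topic attribute holding the rubric name):

from itertools import islice

from corus import load_lenta

themes = {"Наука", "Футбол"}  # subset of the whitelist above
records = load_lenta('lenta-ru-news.csv.gz')
wanted = (r for r in records if r.topic in themes)
for record in islice(wanted, 3):
    print(record.topic, '|', record.title)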
Example #3
from corus import load_lenta


def get_records():
    # Collect the first 201 records (indices 0..200) as "title\ntext" strings.
    records = load_lenta('data/lenta-ru-news.csv.gz')
    test_set = []
    for i, r in enumerate(records):
        test_set.append(r.title + '\n' + r.text)
        if i >= 200:
            break
    return test_set
Example #4
import random


def test_on_k_random_records(K):
    # `lenta_path`, `N`, `tp`, `ner`, `get_k_record` and `show_markup`
    # are module-level names defined elsewhere in this project.
    records = load_lenta(lenta_path)
    records_num = list(range(N))
    # random.choices samples WITH replacement, so indices may repeat.
    chosen_records_num = random.choices(records_num, k=K)
    my_records = []
    for i in chosen_records_num:
        my_records.append(get_k_record(records, i))

    print('This is ' + tp.BOLD + tp.RED + f'{chosen_records_num}' + tp.END +
          ' records\n')

    for i in range(K):
        print(tp.BOLD + tp.RED + f'{chosen_records_num[i]}' + tp.END + '\t')
        markup = ner(my_records[i].text)
        show_markup(markup.text, markup.spans)
        print('\n--------------------------\n\n')
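Note that random.choices samples with replacement, so the same record can be drawn twice. If K distinct records are wanted, random.sample is the drop-in alternative (same module-level N assumed):

chosen_records_num = random.sample(range(N), k=K)  # K distinct indices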
Example #5
from corus import load_lenta


def get_text(path='../../../data/lenta-ru-news.csv.gz',
             amount_of_sentense=1000,
             verbose=True,
             show_how_much=1000,
             **kwargs):
    # Collect cleaned article texts; `text_prepare` is a user-defined
    # preprocessing helper that receives **kwargs.
    records = load_lenta(path)
    a = []
    count = 1
    try:
        while count != amount_of_sentense:
            item = next(records).text
            if verbose and count % show_how_much == 0:
                print(f'Sentence {count}')
            a.append(text_prepare(item, **kwargs))
            count += 1
    except StopIteration:
        pass
    finally:
        del records
    return a
Example #6
    def __init__(self, tokenizer, eval=False):
        self.tokenizer = tokenizer
        self.tokenizer_params = {
            "max_length": 128,
            "truncation": True,
            "add_special_tokens": True,
        }
        fact_ru = [item.text for item in cor.load_factru(config.fact_ru)]

        # Smoke-test the tokenizer parameters on one document.
        self.tokenizer.encode_plus(fact_ru[0], **self.tokenizer_params)

        lenta_ru = [item.text for item in cor.load_lenta(config.lenta_ru)]

        # Wikipedia texts come from a pre-extracted plain-text dump;
        # cor.load_wiki(config.wiki_ru) could be used instead.
        with open("data/ru/wiki.txt", "r") as f:
            wiki_ru = [l.strip() for l in f if len(l) > 35]

        self.data = fact_ru + wiki_ru + lenta_ru
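A sketch of how this constructor might be driven; the class name Corpus and the Hugging Face model are assumptions, not part of the original:

# Hypothetical wiring: assumes this __init__ belongs to a class named
# `Corpus` and that config.fact_ru / config.lenta_ru point at local dumps.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
corpus = Corpus(tokenizer)
batch = tokenizer(corpus.data[:8], padding=True, **corpus.tokenizer_params)
print(len(batch["input_ids"]), len(batch["input_ids"][0]))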
Example #7
def test_on_random_record():
    # `lenta_path`, `tp`, `ner`, `get_random_record` and `show_markup`
    # are module-level names defined elsewhere in this project.
    records = load_lenta(lenta_path)
    record, k = get_random_record(records)
    markup = ner(record.text)
    print('This is ' + tp.BOLD + tp.RED + f'{k}' + tp.END + ' record\n')
    show_markup(markup.text, markup.spans)
Example #8
import json

from corus import load_lenta


def main(args):
    # Stream records and emit one JSON document per line (JSON Lines).
    records = load_lenta('data/lenta-ru-news.csv.gz')
    for i, r in enumerate(records):
        print(json.dumps({'text': r.title + '\n' + r.text}, ensure_ascii=False))
        # Note: emits doc_count + 1 documents, since the check runs after printing.
        if i >= args.doc_count:
            break
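A plausible CLI entry point for this script; only the doc_count attribute is implied by the code above, the rest is an assumption:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--doc_count', type=int, default=100,
                        help='number of documents to print as JSON lines')
    main(parser.parse_args())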
Example #9
    def get_all_pos(self):
        # Collect POS tag types and unique bigrams from the training corpora,
        # reporting progress through the Django session store when available.
        if self.session:
            store = None
            key = 'id_' + str(self.user_id)
            for session in Session.objects.filter(
                    expire_date__gt=timezone.now()):
                store = SessionStore(session_key=session.session_key)
                if int(store.get('_auth_user_id')) == self.user_id:
                    if key not in store:
                        store[key] = {
                            'processed': -1,
                            'total': 0,
                            'corpus': self.train_sets[0]
                        }

        unique_bigrams = {}
        unique_trigrams = {}
        posTagger = PoSTagger(self.lang)
        tok = Tokenizer(lang=self.lang)

        for train_file in self.train_sets:
            if self.session:
                if store[key]['processed'] != -1:
                    # Progress tracking shows a run is already underway; stop.
                    break

                store[key]['processed'] = 0
                store.save()
                # TODO (persisting results to the DB, pseudocode):
                #   for tt in self.tag_types:
                #       if tt not in DB: save tag in DB
                #   clear self.tag_types
                #
                #   for bigram in unique_bigrams:
                #       if bigram not in DB: save bigram in DB
                #       else: update bigram frequency in DB
                #   clear unique_bigrams
            # `path` is only consumed by the Russian branch below;
            # os.path.join keeps it platform-independent (needs `import os`).
            path = os.path.join(str(self.BASE_DIR), 'grammar', 'data', train_file)
            if self.lang == 'en':
                sents = [
                    ' '.join(sent).replace(':', '').replace('``', '').replace(
                        "''", '').replace('`', "'").split()
                    for sent in brown.sents()
                ]
                if self.session:
                    store[key]['total'] = len(sents)
                for i in range(len(sents)):
                    if self.session:
                        if (i + 1) % SHOW_AFTER == 0:
                            if i != 0:
                                store[key]['processed'] = int(
                                    store[key]['processed']) + SHOW_AFTER
                                store.save()
                                print("STORE ", i, ": ", store[key])
                    cur_sent = ' '.join(sents[i])
                    self.tag_types, temp = process_sent(cur_sent,
                                                        self.tag_types,
                                                        posTagger,
                                                        make_all_lower=True)
                    unique_bigrams = tok.generate_different_ngrams(
                        [temp], 2, unique_bigrams)
                    # Trigram collection is available but currently disabled:
                    # unique_trigrams = tok.generate_different_ngrams([temp], 3, unique_trigrams)

            if self.lang == 'ru':
                records = load_lenta(path)
                if self.session:
                    store[key]['total'] = RU_RECORDS
                rec_count = 0
                for record in records:
                    if self.session:
                        if (rec_count + 1) % SHOW_AFTER == 0:
                            if rec_count != 0:
                                store[key]['processed'] = int(
                                    store[key]['processed']) + SHOW_AFTER
                                store.save()
                                print("STORE ", rec_count, ": ", store[key])
                    if rec_count == RU_RECORDS:
                        break

                    rec_count += 1
                    tokenized_sent = tok.tokenize_sent(record.text)
                    for sent in tokenized_sent:
                        self.tag_types, temp = process_sent(
                            sent,
                            self.tag_types,
                            posTagger,
                            make_all_lower=True)
                        unique_bigrams = tok.generate_different_ngrams(
                            [temp], 2, unique_bigrams)
                        # Trigram collection is available but currently disabled:
                        # unique_trigrams = tok.generate_different_ngrams([temp], 3, unique_trigrams)
        return unique_bigrams, unique_trigrams
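For context, the bigram bookkeeping that generate_different_ngrams accumulates can be reproduced with a self-contained Counter (an independent illustration, not that project's API):

from collections import Counter

def count_bigrams(tokenized_sents):
    # Count adjacent token pairs across a list of tokenized sentences.
    counts = Counter()
    for tokens in tokenized_sents:
        counts.update(zip(tokens, tokens[1:]))
    return counts

sents = [['мама', 'мыла', 'раму'], ['мама', 'мыла', 'пол']]
print(count_bigrams(sents).most_common(1))  # [(('мама', 'мыла'), 2)]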