def _data_init(self, words, **kwargs):
    """Build the forward and reverse vocabulary tries from raw token counts.

    Counts character-level unigram and n-gram frequencies over ``words``
    via ``Phrases.learn_vocab``, keeps the most frequent entries of each,
    and inserts them into ``self.vocab`` (as-is) and ``self.vocab_reverse``
    (string-reversed, presumably for suffix lookup — TODO confirm).

    Parameters
    ----------
    words : iterable of str
        Token sequences passed straight to ``Phrases.learn_vocab``.
    **kwargs :
        ``unigram_limit`` (int or None) — cap on how many top unigrams
        AND top n-grams are kept; defaults to keeping all unigrams.

    Side effects: mutates ``self.vocab`` and ``self.vocab_reverse``,
    which are assumed to be trie objects initialized elsewhere — TODO confirm.
    """
    # learn_vocab returns (pruned_words, vocab_counts, total_words);
    # only the counts are needed here. Keys come back as bytes.
    _, vocab_counts, _ = Phrases.learn_vocab(words, 2000, delimiter=b'')
    vocab_counts = {k.decode('utf-8'): v for k, v in vocab_counts.items()}

    # Split counts by key length: single characters vs. longer n-grams.
    unigram_scores = {k: v for k, v in vocab_counts.items() if len(k) == 1}
    ngram_scores = {k: v for k, v in vocab_counts.items() if len(k) > 1}

    # Rank each group by frequency, most common first.
    unigrams = sorted(unigram_scores, key=unigram_scores.get, reverse=True)
    ngrams = sorted(ngram_scores, key=ngram_scores.get, reverse=True)

    # An explicit unigram_limit (even 0) is honored; None falls back to
    # keeping every unigram. Note the same cap is applied to the n-grams.
    unigram_limit = kwargs.get('unigram_limit')
    if unigram_limit is None:
        unigram_limit = len(unigrams)

    # TODO: determine best parameter for controlling bigram vocabulary
    # TODO: allow for n-grams?
    # TODO: determine bigram number some other way?
    all_phrases = unigrams[:unigram_limit] + ngrams[:unigram_limit]

    # Seed both tries with the empty string (original marked this
    # "optional?" — keep it until confirmed safe to drop).
    self.vocab.add('')
    self.vocab_reverse.add('')
    for phrase in all_phrases:
        self.vocab.add(phrase)
        self.vocab_reverse.add(phrase[::-1])
backend="multiprocessing", prefer="processes") do = delayed(partial(tokenize_sentence_corpus, corpus_out_path)) tasks = (do(i, batch) for i, batch in enumerate(partitions)) executor(tasks) # process_texts(documents_path, year='2020', court='01', corpus_out_path=unigram_sentences_path, batch_size=8, n_jobs=2, # debug=True) stop_words = get_custom_stop_words() pruned_words, counters, total_words = Phrases.learn_vocab( sentences=LineSentence(unigram_sentences_path), max_vocab_size=800000000, common_terms=stop_words, progress_per=100) counters = sorted(counters.items(), key=lambda key_value: key_value[1], reverse=True) count = 0 for key, value in counters: count += 1 print(any2unicode(key), value) print(count) bigram_model = Phrases(LineSentence(unigram_sentences_path), max_vocab_size=800000000,
'tst - recurso de revista rr 1473005620085030137 (tst).', 'data de publicação: 14/08/2015', 'ementa: i - agravo de instrumento em recurso de revista da reclamada.', 'justiça gratuita.', '"demonstrada divergência jurisprudencial específica, impõe-se o provimento do agravo de instrumento para determinar o processamento do recurso de revista da reclamada."', 'agravo de instrumento provido.', 'ii - recurso de revista da reclamada 1 - sindicato.', 'substituição processual.' ] # tokenized_sentences = [[word for word in sentence.split()] for sentence in sentences] tokenized_sentences = [get_relevant_tokens(sentence) for sentence in sentences] pruned_words, counters, total_words = Phrases.learn_vocab( sentences=tokenized_sentences, max_vocab_size=800000000, common_terms=stop_words, progress_per=1) counters = sorted(counters.items(), key=lambda key_value: key_value[1], reverse=True) count = 0 for key, value in counters: count += 1 print(any2unicode(key), value) print(count) bigram_model = Phrases(tokenized_sentences, max_vocab_size=800000000,