예제 #1
0
def main() -> None:
    """Application entry point: mine bigram collocations from a POS-tagged corpus.

    Loads the CoNLL-format corpus under ``corpus/pos_tagging``, builds
    contingency tables for all word bigrams, then re-reports after filtering
    out punctuation, stop words and low-frequency pairs.
    """
    corpus_root = Path('corpus')
    # Configure result logging.  `_logger` is only read here, never rebound,
    # so no `global` declaration is needed.
    setup_logger(_logger, corpus_root / 'collocations.log')

    # Load the Russian stop-word list; a set gives O(1) membership tests
    # inside the word filter below.
    nltk.download('stopwords', '.env/share/nltk_data')
    stop_words = set(stopwords.words('russian'))

    # Import the corpus: one *.tags file per document, tab-separated columns
    # with the token in column 0 and the POS tag in column 4.
    tags_root = corpus_root / 'pos_tagging'
    reader = ConllCorpusReader(
        str(tags_root), [f.name for f in tags_root.glob('*.tags')],
        columntypes=['words', 'ignore', 'ignore', 'ignore', 'pos'],
        separator='\t')
    # Hoist fileids() instead of calling it three times.
    doc_ids = reader.fileids()
    _logger.info('Документов: %d', len(doc_ids))
    _logger.info('Токенов в первом документе (%s): %d',
                 doc_ids[0], len(reader.words(doc_ids[0])))

    _logger.info('Загружаем предложения')
    sentences = reader.sents()

    # Build contingency tables for every word pair in the corpus
    # (lower-casing tokens so counts are case-insensitive).
    _logger.info('Считаем таблицу сопряжённости по всем словам')
    bigram_finder = BigramCollocationFinder.from_documents(
        [w.lower() for w in sent] for sent in tqdm(sentences))
    _logger.info('Всего биграм: %d', bigram_finder.N)

    print_samples(bigram_finder)

    # Now filter: drop rare bigrams, punctuation/short tokens and stop words.
    _logger.info(
        'Отфильтруем пунктуацию, стоп-слова и установим предел по частоте')
    bigram_finder.apply_freq_filter(5)
    bigram_finder.apply_word_filter(lambda w: len(w) < 3 or w in stop_words)
    _logger.info('Всего биграм: %d', bigram_finder.N)
    print_samples(bigram_finder)
예제 #2
0
 def sents(self, fileids=None, categories=None):
     """Return the sentences of the selected files.

     Resolves *fileids*/*categories* through the category mapping and
     delegates to the plain ``ConllCorpusReader`` implementation.
     """
     resolved = self._resolve(fileids, categories)
     return ConllCorpusReader.sents(self, resolved)
	def sents(self, fileids=None, categories=None):
		"""Return sentences for the files selected by *fileids*/*categories*."""
		# Map categories to concrete file ids, then reuse the base reader.
		selected = self._resolve(fileids, categories)
		return ConllCorpusReader.sents(self, selected)
예제 #4
0
                (prob, state) = max((V[t-1][y0] + trans_p.logprob((y0, y)), y0) for y0 in states)
            V[t][y] = prob
            newpath[y] = path[state] + [y]
        # Don't need to remember the old paths
        path = newpath

    # Return the most likely sequence over the given time frame
    n = len(obs) - 1
    (prob, state) = max((V[n][y], y) for y in states)
    return path[state]

Vit1 = []
Vit2 = []
Vit3 = []
Vit4 = []
# Tag every sentence with each of the four smoothing variants of the HMM,
# pairing each token with its predicted tag.
for sent in conllreader.sents():
    unsmoothed = viterbi(sent, states, Train.A0j, Train.Aij, Train.Biw)
    laplace = viterbi(sent, states, Train.A0jLap, Train.AijLap, Train.BiwLap)
    good_turing = viterbi(sent, states, Train.A0jGT, Train.AijGT, Train.BiwGT)
    mle = viterbi(sent, states, Train.A0jMLE, Train.AijMLE, Train.BiwMLE)
    Vit1.append(zip(sent, unsmoothed))
    Vit2.append(zip(sent, laplace))
    Vit3.append(zip(sent, good_turing))
    Vit4.append(zip(sent, mle))


# function for writing tagged corpora to files in CoNLL format
def write_conll(filename, tagged_corpus):
    """Write *tagged_corpus* to *filename* in CoNLL format.

    Each sentence becomes a block of ``word<TAB>tag`` lines, and each block
    is terminated by a blank line.

    Args:
        filename: Path of the output file (overwritten if it exists).
        tagged_corpus: Iterable of sentences; each sentence is an iterable
            of ``(word, tag)`` pairs.
    """
    # Explicit encoding: the corpus is Cyrillic and the platform default
    # encoding (e.g. cp1252 on Windows) would corrupt or reject it.
    with open(filename, 'w', encoding='utf-8') as out_file:
        for tagged_sent in tagged_corpus:
            tagged_words = ('\t'.join(w_t) for w_t in tagged_sent)
            out_file.write('\n'.join(tagged_words) + '\n\n')

# Dump the unsmoothed and Laplace-smoothed taggings to disk.
for out_name, tagging in (('unsmoothed.tt', Vit1), ('laplace.tt', Vit2)):
    write_conll(out_name, tagging)
예제 #5
0
            V[t][y] = prob
            newpath[y] = path[state] + [y]
        # Don't need to remember the old paths
        path = newpath

    # Return the most likely sequence over the given time frame
    n = len(obs) - 1
    (prob, state) = max((V[n][y], y) for y in states)
    return path[state]


Vit1 = []
Vit2 = []
Vit3 = []
Vit4 = []
# Run the Viterbi tagger once per smoothing scheme for every sentence and
# keep (token, tag) pairs for each scheme in its own accumulator.
for sent in conllreader.sents():
    tags_raw = viterbi(sent, states, Train.A0j, Train.Aij, Train.Biw)
    tags_lap = viterbi(sent, states, Train.A0jLap, Train.AijLap, Train.BiwLap)
    tags_gt = viterbi(sent, states, Train.A0jGT, Train.AijGT, Train.BiwGT)
    tags_mle = viterbi(sent, states, Train.A0jMLE, Train.AijMLE, Train.BiwMLE)
    Vit1.append(zip(sent, tags_raw))
    Vit2.append(zip(sent, tags_lap))
    Vit3.append(zip(sent, tags_gt))
    Vit4.append(zip(sent, tags_mle))


# function for writing tagged corpora to files in CoNLL format
def write_conll(filename, tagged_corpus):