Python CorpusSearcher示例

编程语言: Python

命名空间/包名称: preparation.corpus_searcher

类/类型: CorpusSearcher

hotexamples.com的示例: 2

Python CorpusSearcher - 共找到 2 个示例。这些是从开源项目中提取的、评分最高的 preparation.corpus_searcher.CorpusSearcher 真实 Python 用法示例。您可以为示例评分，以帮助我们提高示例质量。

常用方法

显示/隐藏

CorpusSearcher(2)

add_phrase(2)

find_similar(1)

get_random(1)

示例#1

显示文件

        weights1 = [self.weights[i] for i in pos_sample_indexes]

        # оставшиеся после усечения негативные примеры
        str_pairs0 = [self.str_pairs[i] for i in neg_sample_indexes]
        relevancy0 = [self.relevancy[i] for i in neg_sample_indexes]
        weights0 = [self.weights[i] for i in neg_sample_indexes]

        self.str_pairs = list(itertools.chain(str_pairs1, str_pairs0))
        self.relevancy = list(itertools.chain(relevancy1, relevancy0))
        self.weights = list(itertools.chain(weights1, weights0))


if __name__ == '__main__':
    # Script entry point (excerpt): create a loaded tokenizer and two empty
    # CorpusSearcher stores, then fill the question store from a text file.
    tokenizer = Tokenizer()
    tokenizer.load()
    random_questions = CorpusSearcher()
    random_facts = CorpusSearcher()

    # Read the list of random questions from a file prepared in advance
    # (see the C# code at https://github.com/Koziev/chatbot/tree/master/CSharpCode/ExtractFactsFromParsing
    # and its output at https://github.com/Koziev/NLP_Datasets/blob/master/Samples/questions4.txt)
    print('Loading random questions and facts...')
    with codecs.open(questions_path, 'r', 'utf-8') as rdr:
        for line in rdr:
            # Keep only short lines. The length is measured on the raw line,
            # before strip(), so any trailing line ending counts toward it.
            if len(line) < 40:
                question = line.strip()
                # Lowercase, tokenize, re-join the tokens with single spaces,
                # then pass the result through ru_sanitize.
                question = ru_sanitize(u' '.join(
                    tokenizer.tokenize(question.lower())))
                random_questions.add_phrase(normalize_qline(question))

    # Read the list of random facts so that negative patterns can be generated later

示例#2

显示文件

文件： generate_nonrelevant_premises.py 项目： DnAp/chatbot

    return False


# Load the premise/question relevancy table: tab-separated, UTF-8.
# quoting=3 is csv.QUOTE_NONE, i.e. quote characters are read as plain text.
df = pd.read_csv(os.path.join(data_folder, 'premise_question_relevancy.csv'),
                 encoding='utf-8',
                 delimiter='\t',
                 quoting=3)
# Set of 'premise|question' keys, one per row of the table.
added_pq = set(
    (premise + '|' + question)
    for premise, question in zip(df['premise'].values, df['question'].values))

segmenter = Segmenter()
tokenizer = Tokenizer()
tokenizer.load()

random_facts = CorpusSearcher()

# Read the list of random facts so that negative patterns can be generated later
corpus_path = os.path.expanduser('~/Corpus/Raw/ru/text_blocks.txt')
# n is initialised here; its use is not visible in this excerpt.
n = 0
print(u'Loading samples from {}'.format(corpus_path))
with codecs.open(corpus_path, 'r', 'utf-8') as rdr:
    for line in rdr:
        line = line.strip()
        # Split each corpus line into phrases with the segmenter.
        phrases = segmenter.split(line)
        for phrase in phrases:
            # Only phrases ending with a period are processed further.
            # NOTE(review): phrase[-1] raises IndexError on an empty string;
            # confirm segmenter.split() never yields one.
            if phrase[-1] == '.':
                # Trim whitespace and collapse double hyphens to a single one.
                phrase = phrase.strip().replace('--', '-')
                # A phrase with exactly one double quote has it removed.
                if phrase.count('"') == 1:
                    phrase = phrase.replace('"', '')