def test_file_as_list(tmp_path):
    words_test = tmp_path / 'test.csv'
    words_test.write_text(CONTENT)
    words = file_as_list(words_test, local=False)
    assert words == [
        'category1;answer1',
        'category2;answer2',
        'category3;answer3'
    ]

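# The test above assumes a CONTENT fixture and a file_as_list helper that
# are defined elsewhere in the project. A minimal sketch of both, purely
# illustrative: 'local' is accepted but ignored here, whereas the real
# helper presumably uses it to resolve packaged resource paths.
CONTENT = 'category1;answer1\ncategory2;answer2\ncategory3;answer3\n'

def file_as_list(path, local=True):
    # Return the non-empty lines of a text file, stripped of line endings
    # (the callers below rstrip defensively anyway).
    with open(path, encoding='utf-8') as f:
        return [line.rstrip('\r\n') for line in f if line.strip()]
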
def training(processors, wiki_extracts, questions, name):
    start = watch.time()
    log_info(f'collecting wiki_extracts from folder {wiki_extracts}')
    concepts = get_concepts_from_folder(wiki_extracts)
    concepts_sentences = get_sentences_from_concepts(concepts)
    questions = (line.rstrip('\r\n') for line in file_as_list(questions, local=False))
    # Apply every processor lazily to both the questions and the sentences.
    for processor in processors:
        questions = processor(questions)
        concepts_sentences = processor(concepts_sentences)
    concepts_sentences = list(concepts_sentences)
    questions = list(questions)
    log_info(f'found {len(concepts_sentences)} sentences')
    log_info(f'collected {len(questions)} questions')
    sentences = get_words_from_sentences(concepts_sentences)
    questions = get_words_from_sentences(questions)
    log_info('creating language model')
    model, vectors = create_w2v_model(sentences, questions)
    save_model(model, f'w2v_{name}100_model.w2v')
    save_pre_computed_vectors(vectors, f'w2v_{name}100_vectors.pickle')
    log_info(f'training completed in {watch.time() - start}s\n')

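# create_w2v_model and the save helpers above are project functions not
# shown in this section. A minimal sketch of what create_w2v_model might
# do with gensim -- the 100-dimensional size is inferred from the '100'
# in the output filenames, and averaging word vectors per question is an
# illustrative choice, not necessarily the project's actual method:
from gensim.models import Word2Vec
import numpy as np

def create_w2v_model(sentences, questions):
    # Train word2vec on the tokenised concept sentences.
    model = Word2Vec(sentences=sentences, vector_size=100,
                     window=5, min_count=1, workers=4)
    # One pre-computed vector per question: the mean of its word vectors,
    # with a zero vector as fallback when every token is out-of-vocabulary.
    vectors = [
        np.mean([model.wv[w] for w in q if w in model.wv] or [np.zeros(100)],
                axis=0)
        for q in questions
    ]
    return model, vectors
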
def remove_stopwords_stream(doc, stopwords='res/custom_ch_stopwords.txt'):
    '''
    Lazily remove the stopwords defined in a file from a stream of lines
    (typically stdin).
    '''
    stopwords = file_as_list(stopwords)
    doc = (x for x in doc if x != '\n')
    return map(lambda x: remove(x, stopwords), doc)

def remove_stopwords(doc, stopwords='res/custom_ch_stopwords.txt'):
    '''
    Remove the stopwords defined in a file from a list of lines.
    '''
    stopwords = file_as_list(stopwords)
    doc = list(filter(lambda x: x != '', doc))
    return list(map(lambda x: remove(x, stopwords), doc))

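# Both variants above delegate to a remove helper that is not part of
# this section. A plausible minimal sketch, assuming sentences are plain
# whitespace-separated strings and matching is case-insensitive (both
# assumptions):
def remove(sentence, stopwords):
    stopword_set = {w.lower() for w in stopwords}
    return ' '.join(w for w in sentence.split()
                    if w.lower() not in stopword_set)
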
def category_to_answer(mapping, file):
    # Print every category from the file that has no entry in the
    # category-to-answer mapping (the mapping keys are lowercased first).
    mapping = file_as_dict(mapping, sep=';', local=False)
    mapping = {k.lower(): v for k, v in mapping.items()}
    cat = file_as_list(file, local=False)
    for c in cat:
        if c not in mapping:
            print(c)

def remove_stopwords_json_stream(json_docs, stopwords='res/custom_ch_stopwords.txt', text_key='text'):
    '''
    Remove stopwords from a stream of json documents (typically stdin)
    and write each processed document to stdout.
    '''
    stopwords = file_as_list(stopwords)
    stdout = click.get_text_stream('stdout', 'utf-8')
    for doc in json_docs:
        doc[text_key] = remove(doc[text_key], stopwords)
        stdout.write(dump_json(doc) + '\n')

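# Hedged sketch of how the stream variant might be wired up: parse one
# json document per stdin line and let the function above write the
# processed documents. json.loads and sys.stdin stand in for whatever
# the project actually uses:
import json
import sys

json_docs = (json.loads(line) for line in sys.stdin if line.strip())
remove_stopwords_json_stream(json_docs)
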
def remove_stopwords_json(json_doc, text_key, stopwords='res/custom_ch_stopwords.txt'):
    '''
    Remove stopwords from a single text property of the json document.

    :param json_doc: The json document
    :param text_key: The key of the text property
    :param stopwords: The filepath to a stop word list
    :return: The json document without stop words
    '''
    stopwords = file_as_list(stopwords)
    json_doc[text_key] = remove(json_doc[text_key], stopwords)
    return json_doc

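# Hypothetical usage of the single-document variant; the text is made up
# and the exact output depends on the stopword list ('ch' presumably
# stands for Swiss German):
doc = {'id': 42, 'text': 'wie spät ist es denn jetzt'}
doc = remove_stopwords_json(doc, 'text')
# doc['text'] now holds the question with the custom stopwords removed.
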
def process_pipeline(processors, wiki_extracts, questions, name):
    start = watch.time()
    concepts = get_concepts_from_folder(wiki_extracts)
    questions = (line.rstrip('\r\n') for line in file_as_list(questions, local=False))
    for processor in processors:
        questions = processor(questions)
        concepts = processor(concepts)
    concepts = list(concepts)
    questions = list(questions)
    log_info('creating language model')
    # Build three tf-idf models with increasing pruning; the variable
    # names follow the 100/75/50 suffixes of the saved artifacts.
    model_100, vectorizer_100 = create_language_model(concepts, questions, 0)
    model_75, vectorizer_75 = create_language_model(concepts, questions, 0.25)
    model_50, vectorizer_50 = create_language_model(concepts, questions, 0.5)
    save(model_100, vectorizer_100, f'tfidf_{name}100_model', f'tfidf_{name}100_vectors')
    save(model_75, vectorizer_75, f'tfidf_{name}75_model', f'tfidf_{name}75_vectors')
    save(model_50, vectorizer_50, f'tfidf_{name}50_model', f'tfidf_{name}50_vectors')
    log_info(f'training completed in {watch.time() - start}s\n')

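# create_language_model is another project helper that is not shown. One
# way it could look with scikit-learn, assuming (loudly) that the third
# argument is a min_df-style document-frequency cutoff used to prune the
# vocabulary -- the real meaning of 0/0.25/0.5 is not recoverable from
# this section alone:
from sklearn.feature_extraction.text import TfidfVectorizer

def create_language_model(concepts, questions, min_df):
    vectorizer = TfidfVectorizer(min_df=min_df or 1)
    # Fit on concepts and questions together so the question vectors
    # share the concept vocabulary.
    model = vectorizer.fit_transform(concepts + questions)
    return model, vectorizer
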
def load_tags_answers(dataset):
    tags = file_as_list(f'/mnt/data/raw/{dataset}_tags.txt', local=False)
    answers = file_as_list(f'/mnt/data/raw/{dataset}_answers.txt', local=False)
    questions = file_as_list(f'/mnt/data/raw/{dataset}_questions.txt', local=False)
    return tags, answers, questions

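# Hypothetical usage; 'ionesoft' is a guess based on the function names
# below, and the assumption that the three files are parallel (one tag,
# answer and question per line) is not confirmed by this section:
tags, answers, questions = load_tags_answers('ionesoft')
assert len(tags) == len(answers) == len(questions)
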
def ionesoft_stopwords(question):
    stopwords = file_as_list('res/custom_ch_stopwords.txt')
    question = ionesoft_normalize(question)
    return remove(question, stopwords)

def stackexchange_stopwords(question):
    stopwords = file_as_list('res/custom_en_stopwords.txt')
    question = stackexchange_normalize(question)
    return remove(question, stopwords)