def test_dump_documents(capsys):
    df = read_excel(example_excel_file)
    docs1 = data_frame_to_document_list(df)
    try:
        path = create_temporary_file(content=None, text=False)
        with pytest.raises(Exception):
            pickle_manager.dump_documents(docs1, path)
    finally:
        remove_and_check(path)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        metadata = pickle_manager.get_docs_metadata(filename)
        docs2 = list(pickle_manager.get_documents(filename))
        assert len(metadata) == 1
        assert metadata['total'] == len(docs1)
        for doc1, doc2 in zip_longest(docs1, docs2):
            assert repr(doc1) == repr(doc2)
    finally:
        remove_and_check(filename)
    captured = capsys.readouterr()
    assert captured.out == ''
    assert captured.err[captured.err.rfind('\r') + 1:].startswith('Storing documents: 100%|')
    assert captured.err.endswith('doc/s]\n') or captured.err.endswith('s/doc]\n')
def test_check_data(capsys):
    df = read_excel(example_excel_file)
    docs = data_frame_to_document_list(df)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs, filename)
        pickle_manager.check_data(filename)
        captured = capsys.readouterr()
        assert captured.out == ''
        assert captured.err[captured.err.rfind('\r') + 1:].startswith('Checking data: 100%|')
        assert captured.err.endswith('doc/s]\n') or captured.err.endswith('s/doc]\n')
        count = 10
        metadata1 = {'total': count}
        pda1 = pickle_manager.PickleDumpAppend(metadata1, filename)
        for not_Document in range(count):
            pda1.dump_append(not_Document)
        pda1.close()
        with pytest.raises(AssertionError):
            pickle_manager.check_data(filename)
        metadata2 = {'total': -1}
        pickle_manager.PickleDumpAppend(metadata2, filename).close()
        with pytest.raises(AssertionError):
            pickle_manager.check_data(filename)
    finally:
        remove_and_check(filename)
def test_prepare(capsys):
    text_field = 'text field'
    class_field = 'class field'
    quantity = 2
    fields = {text_field: 'Teste value.', class_field: 'c1'}
    analyzed_sentences = {
        text_field: [[
            {'form': 'Teste', 'lemma': 'teste', 'upostag': None},
            {'form': 'value', 'lemma': 'value', 'upostag': None},
            {'form': '.', 'lemma': '.', 'upostag': 'PUNCT'},
        ]] * quantity
    }
    docs1 = [
        Document(index=0, fields=fields, analyzed_sentences=analyzed_sentences),
        Document(index=1, fields=fields, analyzed_sentences=None),
    ]
    synonyms_files = [None, 'contopt_0.1_r2_c0.0.txt']
    expected_corpus_str = [
        [' '.join(['teste value'] * quantity), ''],
        [' '.join(['prova value'] * quantity), ''],
    ]
    expected_classifications = [[fields[class_field]] * quantity] * len(synonyms_files)
    expected_idxs_to_remove = [[1]] * len(synonyms_files)
    expected_corpus = [
        [['teste', 'value'] * quantity, []],
        [['prova', 'value'] * quantity, []],
    ]
    try:
        filename = generate_available_filename()
        pickle_manager.dump_documents(docs1, filename)
        for i, synonyms_file in enumerate(synonyms_files):
            ft = FeatureExtractor(synonyms_file=synonyms_file)
            for training_mode in [True, False]:
                corpus_str1, classifications1, idxs_to_remove1, corpus1 = ft.prepare(
                    text_field, class_field, None, docs1, training_mode)
                corpus_str2, classifications2, idxs_to_remove2, corpus2 = ft.prepare(
                    text_field, class_field, filename, None, training_mode)
                assert (corpus_str1, classifications1, idxs_to_remove1, corpus1) == (
                    corpus_str2, classifications2, idxs_to_remove2, corpus2)
                assert corpus_str1 == expected_corpus_str[i]
                assert classifications1 == expected_classifications[i]
                assert idxs_to_remove1 == expected_idxs_to_remove[i]
                assert corpus1 == expected_corpus[i]
                captured = capsys.readouterr()
                assert captured.out == ''
                assert captured.err[captured.err.rfind('\r') + 1:].startswith(
                    'Preparing to create classification: 100%|')
                assert captured.err.endswith('doc/s]\n') or captured.err.endswith('s/doc]\n')
            if synonyms_file is not None:
                remove_and_check(synonyms_file)
    finally:
        remove_and_check(filename)
def test_get_docs_metadata():
    df = read_excel(example_excel_file)
    docs = data_frame_to_document_list(df)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs, filename)
        metadata = pickle_manager.get_docs_metadata(filename)
    finally:
        remove_and_check(filename)
    assert type(metadata) is dict
    assert len(metadata) == 1
    assert metadata['total'] == len(docs)
def main(parameters):
    execution_info = pd.DataFrame()
    execution_info['Start date'] = [functions.get_local_time_str()]
    logger.debug("Starting execution.")
    if basename(parameters.excel_file) == '20newsgroups':
        parameters = load_20newsgroups(parameters)
    if parameters.preprocess_data:
        if not isfile(parameters.excel_file) and not isfile(parameters.preprocessed_data_file):
            logger.error("Please provide a valid Excel file or a valid preprocessed data file.")
            quit()
        if not isfile(parameters.preprocessed_data_file) and isfile(parameters.excel_file):
            logger.info("Loading Excel file.")
            data_frame = pd.read_excel(parameters.excel_file)
            data_frame = data_frame.fillna("NaN")
            logger.info("Creating documents.")
            docs = functions.data_frame_to_document_list(data_frame)
            logger.info("Storing generated documents.")
            pickle_manager.dump_documents(docs, parameters.preprocessed_data_file)
        logger.info("Preprocessing documents.")
        preprocessor = Preprocessor(mosestokenizer_language_code=parameters.mosestokenizer_language_code,
                                    store_data=True,
                                    spell_checker_lang=parameters.spell_checker_lang,
                                    n_jobs=parameters.number_of_jobs)
        preprocessor.preprocess(text_field=parameters.excel_column_with_text_data,
                                preprocessed_data_file=parameters.preprocessed_data_file)
        logger.info("Checking generated data.")
        pickle_manager.check_data(parameters.preprocessed_data_file)
    else:
        if not isfile(parameters.preprocessed_data_file):
            logger.error("The indicated preprocessed data file does not exist.")
            quit()
    logger.info("Extracting features and splitting dataset into training and test subsets.")
    feature_extractor = FeatureExtractor(nltk_stop_words_package=parameters.nltk_stop_words_package,
                                         vectorizer_name=parameters.vectorizer,
                                         training_mode=True,
                                         feature_reduction=parameters.feature_reduction,
                                         document_adjustment_code=parameters.document_adjustment_code,
                                         remove_adjectives=parameters.remove_adjectives,
                                         synonyms_file=parameters.synonyms_file,
                                         n_jobs=parameters.number_of_jobs)
    corpus, classifications, idxs_to_remove, _docs_lemmas = feature_extractor.prepare(
        text_field=parameters.excel_column_with_text_data,
        class_field=parameters.excel_column_with_classification_data,
        preprocessed_data_file=parameters.preprocessed_data_file)
    if parameters.final_training:
        X_train, y_train = feature_extractor.generate_X_y(corpus, classifications, training_mode=True)
    else:
        corpus_train, corpus_test, classifications_train, classifications_test = train_test_split(
            corpus, classifications, parameters.test_subset_size, parameters.preprocessed_data_file,
            parameters.force_subsets_regeneration, idxs_to_remove)
        X_train, y_train = feature_extractor.generate_X_y(corpus_train, classifications_train, training_mode=True)
        X_test, y_test = feature_extractor.generate_X_y(corpus_test, classifications_test, training_mode=False)
    X_train, y_train = resample(parameters.resampling, X_train, y_train)
    logger.info("Running classifiers.")
    p = classifiers.Pipeline(parameters.classifiers)
    logger.info("Accuracies:")
    if parameters.final_training:
        p.start(X_train, y_train,
                n_jobs=parameters.number_of_jobs,
                set_n_accepted_probs=parameters.set_num_accepted_probs,
                class_weight=parameters.class_weights,
                generate_roc_plots=parameters.generate_roc_plots)
    else:
        predictions_dict = p.start(X_train, y_train, X_test, y_test,
                                   parameters.number_of_jobs,
                                   parameters.set_num_accepted_probs,
                                   parameters.class_weights,
                                   parameters.generate_roc_plots)
        dump_json(predictions_dict, 'predictions.json')
    execution_info['End date'] = [functions.get_local_time_str()]
    logger.debug("Execution completed.")
    if not parameters.final_training:
        functions.generate_report(execution_info, parameters.__dict__, predictions_dict)
def test_get_documents(capsys):
    df = read_excel(example_excel_file)
    docs1 = data_frame_to_document_list(df)
    filename = generate_available_filename()
    try:
        dump_documents(docs1, filename)
        for d1, d2 in [(None, '100%|'), ('Loading documents', 'Loading documents: 100%|')]:
            docs2 = list(get_documents(filename, description=d1))
            for doc1, doc2 in zip_longest(docs1, docs2):
                assert repr(doc1) == repr(doc2)
            captured = capsys.readouterr()
            assert captured.out == ''
            assert captured.err[captured.err.rfind('\r') + 1:].startswith(d2)
            assert captured.err.endswith('doc/s]\n') or captured.err.endswith('s/doc]\n')
    finally:
        remove_and_check(filename)
def test_set_docs_metadata(capsys):
    df = read_excel(example_excel_file)
    docs1 = data_frame_to_document_list(df)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        metadata1 = pickle_manager.get_docs_metadata(filename)
        metadata2 = metadata1.copy()
        metadata2['new_field'] = 'test_field_value'
        assert metadata1 != metadata2
        pickle_manager.set_docs_metadata(metadata2, filename)
        assert pickle_manager.get_docs_metadata(filename) == metadata2
        docs2 = list(pickle_manager.get_documents(filename))
        for doc1, doc2 in zip_longest(docs1, docs2):
            assert repr(doc1) == repr(doc2)
        captured = capsys.readouterr()
        assert captured.out == ''
        assert captured.err[captured.err.rfind('\r') + 1:] == 'Storing subsets: 0MB [00:00, ?MB/s]\n'
    finally:
        remove_and_check(filename)
def test_get_documents():
    df = read_excel(example_excel_file)
    docs1 = data_frame_to_document_list(df)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        docs2 = list(pickle_manager.get_documents(filename))
        for doc1, doc2 in zip_longest(docs1, docs2):
            assert repr(doc1) == repr(doc2)
        try:
            f = open(filename, 'ab')
            dump(obj=0, file=f, protocol=pickle_manager._pickle_protocol)
            f.close()
            docs2 = list(pickle_manager.get_documents(filename))
            for doc1, doc2 in zip_longest(docs1, docs2):
                assert repr(doc1) == repr(doc2)
            pytest.fail()
        except Exception as e:
            assert len(e.args) == 1
            assert e.args[0] == "The file '%s' has more documents than indicated in the metadata." % filename
    finally:
        remove_and_check(filename)
def test_train_test_split():
    text_field = 'Example column'
    df = read_excel(example_excel_file)
    docs = data_frame_to_document_list(df)
    preprocessor = Preprocessor()
    preprocessor.preprocess(text_field, None, docs)
    ft = FeatureExtractor()
    corpus, classifications, _, _ = ft.prepare(text_field=text_field,
                                               class_field='Classification column',
                                               preprocessed_data_file=None,
                                               docs=docs,
                                               training_mode=False)
    test_size = 0.3
    preprocessed_data_file = generate_available_filename()
    force = False
    idxs_to_remove = [5]
    try:
        pickle_manager.dump_documents(docs, preprocessed_data_file)
        assert pickle_manager.get_docs_metadata(preprocessed_data_file) == {'total': 10}
        desired = {
            'total': 10,
            'test_size': test_size,
            'training_set_indexes': np.array([6, 1, 0, 2, 8, 3]),
            'test_set_indexes': np.array([7, 9, 4]),
        }
        for my_force in [False, True]:
            train_test_split.train_test_split(corpus, classifications, test_size,
                                              preprocessed_data_file, my_force, idxs_to_remove)
            np.testing.assert_equal(pickle_manager.get_docs_metadata(preprocessed_data_file), desired)
        for key in ['test_size', 'training_set_indexes', 'test_set_indexes']:
            m = desired.copy()
            m[key] = None
            pickle_manager.set_docs_metadata(m, preprocessed_data_file)
            train_test_split.train_test_split(corpus, classifications, test_size,
                                              preprocessed_data_file, force, idxs_to_remove)
            np.testing.assert_equal(pickle_manager.get_docs_metadata(preprocessed_data_file), desired)
        for key, value in [('test_size', 0.2),
                           ('training_set_indexes', np.array([1, 0, 2, 8, 3]))]:
            m = desired.copy()
            m[key] = value
            pickle_manager.set_docs_metadata(m, preprocessed_data_file)
            train_test_split.train_test_split(corpus, classifications, test_size,
                                              preprocessed_data_file, force, idxs_to_remove)
            np.testing.assert_equal(pickle_manager.get_docs_metadata(preprocessed_data_file), m)
    finally:
        remove_and_check(preprocessed_data_file)
def test_preprocess(capsys):
    text_field = 'Test field'
    index = -1
    fields = {text_field: 'Teste\r\nvalue with\ra\nfew tikens. ' * 2}
    analyzed_sentences1 = {
        text_field: [[
            {'form': 'Teste', 'lemma': 'teste', 'upostag': None},
            {'form': 'value', 'lemma': 'value', 'upostag': None},
            {'form': 'with', 'lemma': 'with', 'upostag': None},
            {'form': 'a', 'lemma': 'a', 'upostag': None},
            {'form': 'few', 'lemma': 'few', 'upostag': None},
            {'form': 'tikens', 'lemma': 'tikens', 'upostag': None},
            {'form': '.', 'lemma': '.', 'upostag': 'PUNCT'},
        ]] * 2
    }
    analyzed_sentences2 = {
        text_field: [[
            {'form': 'Test', 'lemma': 'test', 'upostag': None},
            {'form': 'value', 'lemma': 'value', 'upostag': None},
            {'form': 'with', 'lemma': 'with', 'upostag': None},
            {'form': 'a', 'lemma': 'a', 'upostag': None},
            {'form': 'few', 'lemma': 'few', 'upostag': None},
            {'form': 'tokens', 'lemma': 'token', 'upostag': None},
            {'form': '.', 'lemma': '.', 'upostag': 'PUNCT'},
        ]] * 2
    }
    for spell_checker_lang, analyzed_sentences in [(None, analyzed_sentences1),
                                                   ('en_US', analyzed_sentences2)]:
        doc = Document(index=index, fields=fields, analyzed_sentences=dict())
        p = Preprocessor(spell_checker_lang=spell_checker_lang)
        assert p.stop is False
        p.preprocess(text_field=text_field, preprocessed_data_file=None, docs=[doc] * 2)
        assert p.stop is False
        assert doc.index == index
        assert doc.fields == fields
        assert doc.analyzed_sentences == analyzed_sentences
        captured = capsys.readouterr()
        assert captured.out == ''
        assert captured.err[captured.err.rfind('\r') + 1:].startswith('Preprocessing: 100%|')
        assert captured.err.endswith('doc/s]\n') or captured.err.endswith('s/doc]\n')
        p.stop = True
        with pytest.raises(SystemExit):
            p.preprocess(text_field=text_field, preprocessed_data_file=None, docs=[doc] * 2)
        del p
        if spell_checker_lang is not None:
            rmtree('./hunspell')
    docs = [Document(index=index, fields=fields, analyzed_sentences=dict()) for index in range(2)]
    preprocessed_data_file = utils.generate_available_filename()
    try:
        pickle_manager.dump_documents(docs, preprocessed_data_file)
        pickle_manager.check_data(preprocessed_data_file)
        p = Preprocessor(store_data=True)
        assert all(doc.analyzed_sentences == dict()
                   for doc in pickle_manager.get_documents(preprocessed_data_file))
        p.preprocess(text_field, preprocessed_data_file, None)
        assert all(doc.analyzed_sentences == analyzed_sentences1
                   for doc in pickle_manager.get_documents(preprocessed_data_file))
        pickle_manager.check_data(preprocessed_data_file)
    finally:
        utils.remove_and_check(preprocessed_data_file)