def test7(self):
    """End-to-end check for ``example7.html``.

    Extracts the word list from the HTML example, feeds it to
    ``ngrams.generate_ngrams`` with a size argument of 5, and asserts the
    resulting dict equals the JSON fixture stored under
    ``Examples/expected_e2e_test_output``.
    """
    example_name = 'example7'
    print(" Testing {0}.html".format(example_name))

    # Locate the input document and its stored expectation.
    html_path = os.path.join(
        'Examples', 'html_files', '{0}.html'.format(example_name))
    fixture_path = os.path.join(
        'Examples', 'expected_e2e_test_output', '{0}.json'.format(example_name))

    # The expectation is loaded first, then the pipeline is run.
    expected_ngrams = openJsonFile(fixture_path)
    extractor = TextExtractor(openHtmlFileAsString(html_path))
    actual_ngrams = ngrams.generate_ngrams(extractor.getListOfWords(), 5)

    self.assertDictEqual(actual_ngrams, expected_ngrams)
def test4(self):
    """Text-extraction check for ``example4.html``.

    Builds a ``TextExtractor`` from the HTML example and asserts that its
    ``str()`` form equals the text fixture stored under
    ``Examples/expected_html_output``.
    """
    example_name = 'example4'
    print(" Testing {0}.html".format(example_name))

    # Locate the input document and its stored expectation.
    html_path = os.path.join(
        'Examples', 'html_files', '{0}.html'.format(example_name))
    fixture_path = os.path.join(
        'Examples', 'expected_html_output', '{0}.txt'.format(example_name))

    # Extraction happens before the fixture is read.
    extractor = TextExtractor(openHtmlFileAsString(html_path))
    expected_text = openTxtFileAsString(fixture_path)

    self.assertEqual(str(extractor), expected_text)
def generate_expected_output():
    """Regenerate the expected-output fixtures under the ``Examples`` directory.

    Must be run from the directory that contains ``Examples``; otherwise an
    error message is printed and the process exits with status 1.

    Three sets of fixtures are written (missing output directories are
    created first):

    * ``expected_html_output/<name>.txt`` -- ``str(TextExtractor(contents))``
      for each file in ``html_files``.
    * ``expected_ngram_output/<name>.json`` -- ``ngrams.generate_ngrams`` over
      the whitespace-split contents of each file in ``txt_files``.
    * ``expected_e2e_test_output/<name>.json`` -- ``ngrams.generate_ngrams``
      over ``TextExtractor(contents).getListOfWords()`` for each file in
      ``html_files``.

    ``<name>`` is the input file name with only its final extension removed.
    Entries in the input directories that are not regular files are skipped.
    """
    if not os.path.exists('Examples'):
        print("ERROR: please invoke this script one level below the 'Examples' directory")
        sys.exit(1)

    for output_dir in ('expected_html_output',
                       'expected_ngram_output',
                       'expected_e2e_test_output'):
        output_path = os.path.join('Examples', output_dir)
        if not os.path.exists(output_path):
            os.mkdir(output_path)

    # We test 5 grams, because that's what's in the system spec.
    ngram_size = 5

    # TEXT EXTRACTION TESTS: html_files/<name>.html -> expected_html_output/<name>.txt
    for filename, contents in _iter_example_inputs('html_files'):
        te = TextExtractor(contents)
        outfile_name = _expected_output_path('expected_html_output', filename, '.txt')
        with open(outfile_name, 'w') as outfile:
            outfile.write(str(te))
        print("Wrote {0}".format(outfile_name))

    # NGRAM TESTS: txt_files/<name>.txt -> expected_ngram_output/<name>.json
    for filename, contents in _iter_example_inputs('txt_files'):
        data = ngrams.generate_ngrams(contents.split(), ngram_size)
        _write_json_fixture(
            _expected_output_path('expected_ngram_output', filename, '.json'), data)

    # E2E TESTS: html_files/<name>.html -> expected_e2e_test_output/<name>.json
    for filename, contents in _iter_example_inputs('html_files'):
        te = TextExtractor(contents)
        data = ngrams.generate_ngrams(te.getListOfWords(), ngram_size)
        _write_json_fixture(
            _expected_output_path('expected_e2e_test_output', filename, '.json'), data)


def _iter_example_inputs(input_dir):
    """Yield ``(filename, contents)`` for each regular file in ``Examples/<input_dir>``."""
    dir_path = os.path.join('Examples', input_dir)
    # os.listdir order is arbitrary; sorting keeps the "Wrote ..." log stable.
    for filename in sorted(os.listdir(dir_path)):
        file_path = os.path.join(dir_path, filename)
        # A sub-directory cannot be read as an input document; skip it rather
        # than letting open() raise and abort the whole regeneration.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r') as infile:
            contents = infile.read()
        # Yield after the file is closed so no handle is held while the
        # caller processes the contents.
        yield filename, contents


def _expected_output_path(output_dir, input_filename, extension):
    """Return ``Examples/<output_dir>/<stem><extension>`` for an input file name."""
    # splitext removes only the last extension, so "a.b.html" becomes "a.b.txt"
    # instead of "a.txt" (which would collide with every other "a.*" input).
    stem = os.path.splitext(input_filename)[0]
    return os.path.join('Examples', output_dir, stem + extension)


def _write_json_fixture(outfile_name, data):
    """Dump ``data`` as JSON to ``outfile_name`` and report the written path."""
    with open(outfile_name, 'w') as outfile:
        json.dump(data, outfile)
    print("Wrote {0}".format(outfile_name))