def test7(self):
    # End-to-end check for example7: the 5-grams generated from the words
    # extracted out of the HTML input must equal the stored JSON fixture.
    case_name = 'example7'
    print(" Testing {0}.html".format(case_name))

    html_name = '{0}.html'.format(case_name)
    json_name = '{0}.json'.format(case_name)
    source_path = os.path.join('Examples', 'html_files', html_name)
    fixture_path = os.path.join('Examples', 'expected_e2e_test_output', json_name)

    # The fixture is loaded before the HTML, same as the original ordering.
    wanted = openJsonFile(fixture_path)

    extractor = TextExtractor(openHtmlFileAsString(source_path))
    words = extractor.getListOfWords()
    actual = ngrams.generate_ngrams(words, 5)

    self.assertDictEqual(actual, wanted)
  def test4(self):
    # Text-extraction check for example4: the string form of the extractor
    # built from the HTML input must equal the stored .txt fixture.
    case_name = 'example4'
    print(" Testing {0}.html".format(case_name))

    html_name = '{0}.html'.format(case_name)
    txt_name = '{0}.txt'.format(case_name)
    source_path = os.path.join('Examples', 'html_files', html_name)
    fixture_path = os.path.join('Examples', 'expected_html_output', txt_name)

    # The HTML is read and parsed first, then the fixture is loaded.
    html_text = openHtmlFileAsString(source_path)
    extractor = TextExtractor(html_text)
    wanted_text = openTxtFileAsString(fixture_path)

    actual_text = str(extractor)
    self.assertEqual(actual_text, wanted_text)
def generate_expected_output():
  """Regenerate the expected-output fixture files under ``Examples``.

  The current working directory must contain ``Examples``; otherwise an
  error message is printed and the process exits with status 1.

  Each output directory is created first if it does not exist yet, then
  three sets of fixtures are written:

  * ``expected_html_output/<name>.txt`` -- ``str(TextExtractor(contents))``
    for every file in ``html_files``.
  * ``expected_ngram_output/<name>.json`` -- the 5-gram data generated from
    the whitespace-split contents of every file in ``txt_files``.
  * ``expected_e2e_test_output/<name>.json`` -- the 5-gram data generated
    from ``getListOfWords()`` of the extractor for every file in
    ``html_files``.

  ``<name>`` is the input file name with only its final extension removed,
  so names that contain extra dots keep them and cannot collide.
  """
  if not os.path.exists('Examples'):
    print("ERROR: please invoke this script one level below the 'Examples' directory")
    sys.exit(1)

  html_dir = os.path.join('Examples', 'html_files')
  txt_dir = os.path.join('Examples', 'txt_files')
  html_out_dir = os.path.join('Examples', 'expected_html_output')
  ngram_out_dir = os.path.join('Examples', 'expected_ngram_output')
  e2e_out_dir = os.path.join('Examples', 'expected_e2e_test_output')

  for output_dir in (html_out_dir, ngram_out_dir, e2e_out_dir):
    if not os.path.exists(output_dir):
      os.mkdir(output_dir)

  # We generate 5-grams, because that's what's in the system spec.
  ngram_size = 5

  # TEXT EXTRACTION TESTS
  # Every html file becomes a .txt file holding the extractor's string form.
  for filename in os.listdir(html_dir):
    te = TextExtractor(_read_file(os.path.join(html_dir, filename)))
    outfile_name = _fixture_path(html_out_dir, filename, '.txt')
    with open(outfile_name, 'w') as outfile:
      outfile.write(str(te))
    print("Wrote {0}".format(outfile_name))

  # NGRAM TESTS
  # Every text file is split on whitespace and becomes a .json file of ngrams.
  for filename in os.listdir(txt_dir):
    words = _read_file(os.path.join(txt_dir, filename)).split()
    data = ngrams.generate_ngrams(words, ngram_size)
    outfile_name = _fixture_path(ngram_out_dir, filename, '.json')
    _write_json(outfile_name, data)
    print("Wrote {0}".format(outfile_name))

  # E2E TESTS
  # Every html file is extracted, ngrammed, and becomes a .json file in the
  # expected_e2e_test_output directory.
  for filename in os.listdir(html_dir):
    te = TextExtractor(_read_file(os.path.join(html_dir, filename)))
    data = ngrams.generate_ngrams(te.getListOfWords(), ngram_size)
    outfile_name = _fixture_path(e2e_out_dir, filename, '.json')
    _write_json(outfile_name, data)
    print("Wrote {0}".format(outfile_name))


def _read_file(path):
  # Return the full contents of the text file at path.
  with open(path, 'r') as infile:
    return infile.read()


def _write_json(path, data):
  # Serialize data as JSON into the file at path, replacing any old contents.
  with open(path, 'w') as outfile:
    json.dump(data, outfile)


def _fixture_path(output_dir, input_name, extension):
  # Build the fixture path for input_name inside output_dir. Only the final
  # extension is swapped (splitext), so "a.b.html" maps to "a.b.txt" rather
  # than being truncated to "a.txt" at the first dot.
  stem = os.path.splitext(input_name)[0]
  return os.path.join(output_dir, stem + extension)