import json

import ngrams
import text_extract


def Text_Transformation_controller(crawling_data, max_n_gram_size):
    # Run the text extractor over the crawled page content.
    te = text_extract.TextExtractor(crawling_data['content'])
    # Create the output dictionary.
    output = dict()
    # Get metadata from the html.
    output['metadata'] = te.extractMetadata()
    # Get the url and timestamp from the crawling API payload.
    output['metadata']['url'] = crawling_data['url']
    output['metadata']['timestamp'] = crawling_data['metadata']['timestamp']
    list_of_words = te.getListOfWords()
    n_grams = ngrams.generate_ngrams(list_of_words, max_n_gram_size)
    titles = ngrams.generate_ngrams(te.getTitleListOfWords(), max_n_gram_size)
    headers = ngrams.generate_ngrams(te.getHeaderListOfWords(), max_n_gram_size)
    output['ngrams'] = {
        'all': n_grams,
        'headers': headers,  # we need to parse these
        'title': titles  # we need to parse these
    }
    output['text'] = list_of_words
    # Print the results for inspection.
    print(json.dumps(output, indent=4))
    return output
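# A minimal, hypothetical invocation sketch. The exact shape of the crawling
# API payload is an assumption here; only the keys the controller above
# actually reads are filled in.
sample_crawling_data = {
    'url': 'http://example.com/page',
    'content': '<html><head><title>Example</title></head>'
               '<body><h1>Heading</h1><p>some body text</p></body></html>',
    'metadata': {'timestamp': '2019-01-01T00:00:00Z'},
}
transformed = Text_Transformation_controller(sample_crawling_data, max_n_gram_size=5)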
def test6(self):
    test_file = 'example6'
    print(" Testing {0}.txt".format(test_file))
    txt_file = os.path.join('Examples', 'txt_files', '{0}.txt'.format(test_file))
    expected_output_file = os.path.join('Examples', 'expected_ngram_output', '{0}.json'.format(test_file))
    data = ngrams.generate_ngrams(openTxtFileAsList(txt_file), 5)
    expected_output = openJsonFile(expected_output_file)
    self.assertDictEqual(data, expected_output)
def test7(self):
    test_file = 'example7'
    print(" Testing {0}.html".format(test_file))
    html_file = os.path.join('Examples', 'html_files', '{0}.html'.format(test_file))
    expected_output_file = os.path.join('Examples', 'expected_e2e_test_output', '{0}.json'.format(test_file))
    expected_output = openJsonFile(expected_output_file)
    te = TextExtractor(openHtmlFileAsString(html_file))
    data = ngrams.generate_ngrams(te.getListOfWords(), 5)
    self.assertDictEqual(data, expected_output)
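# The tests above rely on file-loading helpers that are not shown in this
# excerpt. A minimal sketch of plausible implementations, assuming
# whitespace-tokenized text fixtures, JSON fixtures, and raw HTML fixtures;
# the names match the calls above, but the bodies are assumptions.
import json

def openTxtFileAsList(path):
    # Read a text fixture and split it into a list of words.
    with open(path, 'r') as f:
        return f.read().split()

def openJsonFile(path):
    # Load a JSON fixture (e.g. an expected ngram dictionary).
    with open(path, 'r') as f:
        return json.load(f)

def openHtmlFileAsString(path):
    # Read an HTML fixture as a single string.
    with open(path, 'r') as f:
        return f.read()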
def generate_ngrams_threaded(n, c):
    # Wrapper around generate_ngrams that logs when generation of each
    # ngram length starts and finishes.
    print("starting ngrams of length", n)
    ngrams = generate_ngrams(n, c)
    print("generated ngrams of length", n)
    return ngrams
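# A minimal sketch of how generate_ngrams_threaded might be fanned out across
# a thread pool. ThreadPoolExecutor and the 1..max_n range are assumptions
# for illustration, not part of the snippet above.
from concurrent.futures import ThreadPoolExecutor

def generate_ngrams_up_to(max_n, corpus):
    # Generate ngrams of every length from 1 to max_n concurrently and
    # collect the results in order of length.
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(generate_ngrams_threaded, n, corpus)
                   for n in range(1, max_n + 1)]
        return [future.result() for future in futures]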
def generate_expected_output():
    if not os.path.exists('Examples'):
        print("ERROR: please invoke this script from the directory containing 'Examples'")
        sys.exit(1)
    if not os.path.exists(os.path.join('Examples', 'expected_html_output')):
        os.mkdir(os.path.join('Examples', 'expected_html_output'))
    if not os.path.exists(os.path.join('Examples', 'expected_ngram_output')):
        os.mkdir(os.path.join('Examples', 'expected_ngram_output'))
    if not os.path.exists(os.path.join('Examples', 'expected_e2e_test_output')):
        os.mkdir(os.path.join('Examples', 'expected_e2e_test_output'))

    # TEXT EXTRACTION TESTS
    # For every html input file.
    for filename in os.listdir(os.path.join('Examples', 'html_files')):
        file_path = os.path.join('Examples', 'html_files', filename)
        # Grab the contents.
        with open(file_path, 'r') as infile:
            input_string = infile.read()
        # Extract them.
        te = TextExtractor(input_string)
        # The output file name is the input file name, but .txt instead of .html.
        outfile_name = "{0}.txt".format(filename.split('.')[0])
        # The output file goes in the expected_html_output directory.
        outfile_name = os.path.join('Examples', 'expected_html_output', outfile_name)
        # Dump the results into the output file.
        with open(outfile_name, 'w') as outfile:
            outfile.write(str(te))
        print("Wrote {0}".format(outfile_name))

    # NGRAM TESTS
    # For every text input file.
    for filename in os.listdir(os.path.join('Examples', 'txt_files')):
        file_path = os.path.join('Examples', 'txt_files', filename)
        # Grab its contents.
        with open(file_path, 'r') as infile:
            input_string = infile.read().split()
        # Feed them into the ngram generator. We test 5-grams, because that's what's in the system spec.
        data = ngrams.generate_ngrams(input_string, 5)
        # The output file name is the input file name with a .json extension.
        outfile_name = "{0}.json".format(filename.split('.')[0])
        # The output file goes in the expected_ngram_output directory.
        outfile_name = os.path.join('Examples', 'expected_ngram_output', outfile_name)
        # Dump a json to the output file.
        with open(outfile_name, 'w') as outfile:
            json.dump(data, outfile)
        print("Wrote {0}".format(outfile_name))

    # E2E TESTS
    # For every html input file.
    for filename in os.listdir(os.path.join('Examples', 'html_files')):
        file_path = os.path.join('Examples', 'html_files', filename)
        # Grab the contents.
        with open(file_path, 'r') as infile:
            input_string = infile.read()
        # Extract them.
        te = TextExtractor(input_string)
        # Ngram them.
        data = ngrams.generate_ngrams(te.getListOfWords(), 5)
        # The output file name is the input file name, but .json instead of .html.
        outfile_name = "{0}.json".format(filename.split('.')[0])
        # The output file goes in the expected_e2e_test_output directory.
        outfile_name = os.path.join('Examples', 'expected_e2e_test_output', outfile_name)
        # Dump the results into the output file.
        with open(outfile_name, 'w') as outfile:
            json.dump(data, outfile)
        print("Wrote {0}".format(outfile_name))
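# The generator script above assumes these imports and an entry point; a
# minimal sketch, where the module paths are assumptions based on the calls
# made in generate_expected_output and are not shown in this excerpt.
import json
import os
import sys

import ngrams
from text_extract import TextExtractor

if __name__ == '__main__':
    generate_expected_output()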
from ngrams import generate_ngrams, cdf, pick_ngram, filter_dict_keys

### Complex text generation (predict the next n grams using the last m grams)
# Basically, ngrams help you pick the next n words with the highest probability given the previous n words.
# We could have the previous n be an action list and the next n be one of our patterns.
# Could have everything be patterns, or everything be action lists.
# What should n be? It doesn't seem like we have enough data for it to be that large.
# Should I get rid of all of the view switches?

past_n_target = 6  # how far to try and look back
past_n_min = 2  # min amount to look back. if a matching ngram of this length is not found, the program will exit
forward_n = 1  # how many new grams to add each iteration
min_ngrams_needed = 2  # how many ngrams need to be found

all_ngrams = generate_ngrams(past_n_target + forward_n, corpus)

generated = ['the']
for i in range(0, 20):
    filtered_ngrams = {}
    temp_past_n = min(past_n_target, len(generated))
    while not filtered_ngrams:
        filtered_ngrams = filter_dict_keys(all_ngrams,
                                           generated[-temp_past_n:],
                                           starting_index=past_n_target - temp_past_n)
        print(generated[-temp_past_n:], filtered_ngrams, len(filtered_ngrams))
        temp_past_n -= 1
        if (temp_past_n < past_n_min) or (len(filtered_ngrams) < min_ngrams_needed):