import json

import ngrams
import text_extract


def Text_Transformation_controller(crawling_data, max_n_gram_size):

    te = text_extract.TextExtractor(crawling_data['content'])  # run the text extractor

    output = dict()  # create the output dictionary
    output['metadata'] = te.extractMetadata()  # get metadata from the HTML
    output['metadata']['url'] = crawling_data['url']  # from the crawling API
    output['metadata']['timestamp'] = crawling_data['metadata']['timestamp']  # from the crawling API

    list_of_words = te.getListOfWords()
    n_grams = ngrams.generate_ngrams(list_of_words, max_n_gram_size)
    titles = ngrams.generate_ngrams(te.getTitleListOfWords(), max_n_gram_size)
    headers = ngrams.generate_ngrams(te.getHeaderListOfWords(), max_n_gram_size)

    output['ngrams'] = {
        'all': n_grams,
        'headers': headers,  # we need to parse these
        'title': titles  # we need to parse these
    }
    output['text'] = list_of_words

    # print the results for inspection
    print(json.dumps(output, indent=4))

    return output
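# A minimal usage sketch. The shape of crawling_data below is an assumption
# inferred from how the controller reads it; the real crawling API payload
# may carry more fields:

sample = {
    'url': 'http://example.com/page',
    'content': '<html><head><title>Example</title></head>'
               '<body><h1>Hello</h1><p>hello world</p></body></html>',
    'metadata': {'timestamp': '2019-01-01T00:00:00Z'},
}
result = Text_Transformation_controller(sample, max_n_gram_size=5)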
  def test6(self):
    test_file = 'example6'
    print(" Testing {0}.txt".format(test_file))

    txt_file = os.path.join('Examples', 'txt_files', '{0}.txt'.format(test_file))
    expected_output_file = os.path.join('Examples', 'expected_ngram_output', '{0}.json'.format(test_file))

    data = ngrams.generate_ngrams(openTxtFileAsList(txt_file), 5)
    expected_output = openJsonFile(expected_output_file)

    self.assertDictEqual(data, expected_output)

  def test7(self):
    test_file = 'example7'
    print(" Testing {0}.html".format(test_file))

    html_file = os.path.join('Examples', 'html_files', '{0}.html'.format(test_file))
    expected_output_file = os.path.join('Examples', 'expected_e2e_test_output', '{0}.json'.format(test_file))
    expected_output = openJsonFile(expected_output_file)

    te = TextExtractor(openHtmlFileAsString(html_file))
    data = ngrams.generate_ngrams(te.getListOfWords(), 5)

    self.assertDictEqual(data, expected_output)
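# The file helpers these tests call are not shown in this excerpt. A minimal
# sketch of what they presumably look like, judging by how they are used
# (bodies are assumptions, not the project's actual code):

import json
import os


def openTxtFileAsList(path):
    # assumed: whitespace-split token list, the input generate_ngrams expects
    with open(path, 'r') as f:
        return f.read().split()


def openJsonFile(path):
    # assumed: parsed JSON dict, matching the assertDictEqual comparisons
    with open(path, 'r') as f:
        return json.load(f)


def openHtmlFileAsString(path):
    # assumed: raw HTML string, matching the TextExtractor constructor
    with open(path, 'r') as f:
        return f.read()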
Example #5
def generate_ngrams_threaded(n, c):
    # thin wrapper around generate_ngrams(n, c) -- here n is the gram length
    # and c the corpus -- with progress logging, suitable as a pool task
    print("starting ngrams of length", n)
    ngrams = generate_ngrams(n, c)
    print("generated ngrams of length", n)
    return ngrams
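# A possible driver for the wrapper above, fanning the gram lengths out over
# a thread pool. This is a sketch: generate_all_ngrams and its range of
# lengths are assumptions, not code from the project.

from concurrent.futures import ThreadPoolExecutor


def generate_all_ngrams(corpus, max_n):
    with ThreadPoolExecutor() as pool:
        futures = {n: pool.submit(generate_ngrams_threaded, n, corpus)
                   for n in range(1, max_n + 1)}
        return {n: f.result() for n, f in futures.items()}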
def generate_expected_output():

  if not os.path.exists('Examples'):
    print("ERROR: please invoke this script from the directory containing 'Examples'")
    sys.exit(1)

  if not os.path.exists(os.path.join('Examples', 'expected_html_output')):
    os.mkdir(os.path.join('Examples', 'expected_html_output'))

  if not os.path.exists(os.path.join('Examples', 'expected_ngram_output')):
    os.mkdir(os.path.join('Examples', 'expected_ngram_output'))

  if not os.path.exists(os.path.join('Examples', 'expected_e2e_test_output')):
    os.mkdir(os.path.join('Examples', 'expected_e2e_test_output'))

  #TEXT EXTRACTION TESTS
  #for all of the html files
  for filename in os.listdir(os.path.join('Examples', 'html_files')):
    file_path = os.path.join('Examples', 'html_files', filename)
    #grab the contents
    with open(file_path, 'r') as infile:
      input_string = infile.read()
    #extract them
    te = TextExtractor(input_string)
    
    #the output file name will be the input file name, but .txt instead of .html.
    outfile_name = "{0}.txt".format(filename.split('.')[0])
    #the output file goes to the expected_html output directory.
    outfile_name = os.path.join('Examples','expected_html_output', outfile_name)
    #dump the results into the output file.
    with open(outfile_name, 'w') as outfile:
      outfile.write(str(te))
      print("Wrote {0}".format(outfile_name))

  #NGRAM TESTS
  #for every text input file.
  for filename in os.listdir(os.path.join('Examples', 'txt_files')):
    file_path = os.path.join('Examples', 'txt_files', filename)
    #grab its contents.
    with open(file_path, 'r') as infile:
      input_string = infile.read().split()
    #Feed them into the ngram generator. We test 5 grams, because that's what's in the system spec.
    data = ngrams.generate_ngrams(input_string, 5)
    #output file name is the input file name .json
    outfile_name = "{0}.json".format(filename.split('.')[0])
    #the output file goes in the expected_ngram_output directory
    outfile_name = os.path.join('Examples','expected_ngram_output', outfile_name)
    #dump a json to the output file.
    with open(outfile_name, 'w') as outfile:
      json.dump(data, outfile)
      print("Wrote {0}".format(outfile_name))

  #E2E TESTS
  #for every html file
  for filename in os.listdir(os.path.join('Examples', 'html_files')):
    file_path = os.path.join('Examples', 'html_files', filename)
    #grab the contents
    with open(file_path, 'r') as infile:
      input_string = infile.read()
    #extract them
    te = TextExtractor(input_string)
    #ngram them
    data = ngrams.generate_ngrams(te.getListOfWords(), 5)
    #the output file name will be the input file name, but .json instead of .html.
    outfile_name = "{0}.json".format(filename.split('.')[0])
    #the output file goes to the expected_e2e_test_output directory.
    outfile_name = os.path.join('Examples', 'expected_e2e_test_output', outfile_name)
    #dump the results into the output file.
    with open(outfile_name, 'w') as outfile:
      json.dump(data, outfile)
      print("Wrote {0}".format(outfile_name))
Example #7
from ngrams import generate_ngrams, cdf, pick_ngram, filter_dict_keys

### Complex text generation (predict the next n grams using the last m grams)
# Basically, ngrams let you pick the next n words with the highest probability given the previous n words.
# We could have the previous n be an action list and the next n be one of our patterns.
# We could also have everything be patterns, or everything be action lists.
# What should n be? It doesn't seem like we have enough data for it to be very large.
# Should I get rid of all of the view switches?

past_n_target = 6  # how far to try and look back
past_n_min = 2  # min amount to look back. if a matching ngram of this length is not found, the program will exit
forward_n = 1  # how many new grams to add each iteration
min_ngrams_needed = 2  # how many ngrams need to be found

all_ngrams = generate_ngrams(past_n_target + forward_n, corpus)

generated = ['the']

for i in range(0, 20):
    filtered_ngrams = {}
    temp_past_n = min(past_n_target, len(generated))
    while not filtered_ngrams:
        filtered_ngrams = filter_dict_keys(
            all_ngrams,
            generated[-temp_past_n:],
            starting_index=past_n_target - temp_past_n)
        print(generated[-temp_past_n:], filtered_ngrams, len(filtered_ngrams))

        temp_past_n -= 1
        if (temp_past_n < past_n_min) or (len(filtered_ngrams) < min_ngrams_needed):