Exemplo n.º 1
0
def test_main():
    """Tokenize 'test.test' and print features for every numeric token.

    Side effects: reads 'test.test', writes the token list to
    'words.test' (pipe + newline separated), and prints the feature
    dict of each purely-numeric token (candidate reference numeral).
    """
    # Raw text of the sample document under test.
    raw_text = file_reader.get_string_from_txt('test.test')

    words = nltk.word_tokenize(raw_text)

    # Dump one token per line for manual inspection.
    file_writer.print_string_to_txt("|\n".join(words), 'words.test')

    # Tokens consisting only of digits are candidate reference numerals.
    # (re.M kept from the original; it has no effect on \d+.)
    regex_all_num = re.compile(r'\d+', re.M)

    ref_numerals = []

    for i, w in enumerate(words):
        # Use the compiled pattern's own fullmatch instead of
        # re.fullmatch(pattern, w) — same behavior, clearer idiom.
        if regex_all_num.fullmatch(w):
            ref_numerals.append((w, i))
            print(ref_numeral_features(w))
Exemplo n.º 2
0
def create_labeled_set():
    """Build a labeled training set from 'training/train_1.txt'.

    Reads the training file, treats each line as one sentence,
    tokenizes every line with NLTK, and wraps the tokenized sentences
    in a ClassifiedSentenceData instance.

    Returns:
        ClassifiedSentenceData built from the tokenized sentences.
    """
    raw_text = file_reader.get_string_from_txt('training/train_1.txt')

    # One sentence per line; comprehension replaces the original
    # append loop — identical result.
    sents = [nltk.word_tokenize(line)
             for line in raw_text.strip().split("\n")]

    return ClassifiedSentenceData(sents)
Exemplo n.º 3
0
def get_test_string():
    """Return the raw text of the sample office action used for testing."""
    # Other candidate files (swap in as needed):
    #   '2018-09-21 15694060 nonfinal rejection.txt'
    #   '2018-09-18 15599191 nonfinal rejection.txt'
    test_file = '2018-09-26 15332415 final rejection.txt'
    return file_reader.get_string_from_txt(test_file)
Exemplo n.º 4
0

def get_trained_classifier():
    """Train and return a classifier on the shuffled feature sets."""
    featuresets = get_featuresets()
    # Shuffle in place so training order is randomized each run.
    np.random.shuffle(featuresets)
    return pk_nltk.train_classifier(featuresets)


classifier = get_trained_classifier()

#read app in .txt to string
raw_text = file_reader.get_string_from_txt('1225.txt')

#tokenize raw text
words = nltk.word_tokenize(raw_text)

test_set = set()

new_dataset = []

for w in words:
    #print(w)
    #print(classifier.classify(ref_numeral_features(w)))

    new_dataset.append((w, classifier.classify(ref_numeral_features(w))))
    if (classifier.classify(ref_numeral_features(w))):
        #print(w)
Exemplo n.º 5
0
def main():

    images = None
    output_files = True
    generate_pickle = True
    """ doesn't work

    #root = tk.Tk()
    global root
    root.title("Convert drawings?")
    frame = tk.Frame(root)
    frame.pack()
    
    button = tk.Button(frame, 
                       text="YES", 
                       command=convert_drawings_to_images(output_files, generate_pickle))
    button.pack(side=tk.LEFT)
    
    slogan = tk.Button(frame,
                       text="NO",
                       command=quit)
    slogan.pack(side=tk.LEFT)
    
    
    root.mainloop()    

    print("root destroyed")
    """
    #sys.exit()

    images = convert_drawings_to_images(output_files, generate_pickle)

    sys.exit()

    read_pickle = True
    #ocr_images(images, read_pickle)

    #print(pytesseract.image_to_string("image_ref_numeral_only.jpg"))
    """
    ocr_dict = pytess.convert_images_to_string(images)
    
    filehandler = open('ocr_dict.p','wb')
    pickle.dump(ocr_dict, filehandler)
    
    print(ocr_dict)
    """

    #output_path = fr.convert_pdf_to_txt(filepath)

    #print(output_path)

    print("open application txt file")
    #get application txt file
    filepath = fr.get_filepath()

    #get raw text of application
    raw_text = fr.get_string_from_txt(filepath)

    print("open drawings csv")
    filepath = fr.get_filepath()

    csv_dict, ref_numerals_dict = get_drawings_data(filepath)

    print(ref_numerals_dict)
    #sys.exit()

    analyze_drawings_against_application(csv_dict, raw_text)

    find_ref_numerals_in_application(ref_numerals_dict, raw_text)
    """
Exemplo n.º 6
0
        print(range_hyphen_split)
        start_para = range_hyphen_split[0].strip()
        end_para = range_hyphen_split[1].strip()
        for j in range(int(start_para), int(end_para)+1):
            if j not in input_paragraphs:
                input_paragraphs.append(j)

# Extract the requested paragraphs from a reference document.
print(input_paragraphs)

#test
#input_paragraphs = [103,104,105,106,107,108,109,29]
input_path = "input/test_reference.txt"

# Process the paragraph numbers in ascending order.
input_paragraphs.sort()

reference_string = file_reader.get_string_from_txt(input_path)

#print(reference_string)
reference_split = reference_string.split("\n")
#print(reference_split)

output_string = []

# For each paragraph number, find "[<number>] <text>" in the reference
# and collect the matched line.
for para in input_paragraphs:
    print(para)
    # NOTE(review): this pattern requires one or more digits *before* the
    # paragraph number (e.g. para=3 matches "[103]" but never "[3]" or
    # "[003]"). If leading zeros were intended, r'\[0*' is the usual
    # form — confirm against the reference file's numbering.
    regex = r'\[\d+' + str(para) + r'\].+'
    match_obj = re.search(regex, reference_string)
    if (match_obj):
        print(match_obj[0])
        output_string.append(match_obj[0])
        
Exemplo n.º 7
0
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 26 11:13:36 2018

@author: alanyliu
"""

import file_reader
import file_writer
import re

# Normalize a raw office-action text file so each sentence sits on its
# own line, then write the result out as training data.
input_path = "training/2018-09-21 15694060 nonfinal rejection.txt"
text = file_reader.get_string_from_txt(input_path)

print(text)

# Replacement templates are raw strings: in a plain string '\g<1>' is an
# invalid escape sequence ('\g' raises SyntaxWarning on modern Python
# and is slated to become an error). Behavior is unchanged — re.sub
# expands \g<1> and \n in the template either way.
# Break after "]." (end of a bracketed-citation sentence).
text = re.sub(r'(]\.)', r'\g<1>\n', text)
# Collapse whitespace-only lines, then runs of blank lines.
text = re.sub(r'\n\s+\n', '\n', text)
text = re.sub(r'\n{2,}', '\n', text)
#print(re.findall(r'\w\n\w',text))
#print(re.findall(r'(\w)(\n)(\w)',text))
# Re-join lines that were wrapped mid-sentence (word/comma + newline + word).
text = re.sub(r'(\w|,)(\n)(\w)', r'\g<1> \g<3>', text)
# Put every sentence-ending period on its own line.
text = re.sub(r'(\.)\s', r'\g<1>\n', text)

file_writer.print_string_to_txt(text, "training/train.txt")

#punct-features ideas
#next word is punctuation
#last letter of previous word is capitalized
#probably doesn't end with acronym
#false positives, false negatives