# Example #1
# 0
def parse_txt_files():
    """
    Add line breaks to every ``.txt`` note and save the results.

    This is a hack that calls the parser's ``add_line_break`` helper directly
    on plain-text files instead of running the full XML parse.

    Each ``.txt`` file in ``test/data/note_texts`` is read in full, passed
    through ``XMLStreamParser.add_line_break``, and the returned string is
    written to a file of the same name in ``test/data/note_texts_parsed``.
    The output directory (including missing parents) is created if needed.
    Directory entries that do not end in ``.txt`` are skipped without being
    opened.

    All paths are relative to the current working directory.
    """
    # Only the model path is supplied here; the first and third constructor
    # arguments are left empty because just add_line_break is used.
    parser = XMLStreamParser("", "crf_files/final_model", "")
    note_dir = "test/data/note_texts"
    parsed_note_dir = "test/data/note_texts_parsed"
    if not os.path.exists(parsed_note_dir):
        # makedirs (not mkdir) so a missing parent folder is not an error.
        os.makedirs(parsed_note_dir)
    for file_name in os.listdir(note_dir):
        # Filter before opening: sub-directories and unrelated files in the
        # notes folder must never be opened (open() on a directory raises).
        if not file_name.endswith(".txt"):
            continue
        with open(os.path.join(note_dir, file_name)) as source:
            text = source.read()
        # Transform first so a parser failure does not leave behind an
        # empty, truncated output file.
        broken_text = parser.add_line_break(text)
        with open(os.path.join(parsed_note_dir, file_name), "w") as target:
            target.write(broken_text)
"""
This is the integration test of all important methods in this package

Note: Please run this script from the project root folder.
"""

from lib.data_preprocessor import prepare_crf_data
from lib.model_generator import generate_model
from lib.xml_stream_parser import XMLStreamParser
from lib.word_templaterizer import TemplateGenerator
from lib.cross_validation import *

if __name__ == "__main__":
    # Integration test for parsing.
    # You need:
    #   1. A template file
    #   2. Training data folder path
    #   3. Testing data folder path
    print("Integration test for parsing")

    # Step 1: preprocess the original note texts into a CRF feature file.
    prepare_crf_data(
        "test/data/note_texts/",
        "crf_files/note_train_features"
    )

    # Step 2: build the target model from the template and training features.
    # NOTE(review): step 1 writes "crf_files/note_train_features" while this
    # step reads "crf_files/train_features" -- confirm this is intentional.
    generate_model(
        "crf_files/final_template",
        "crf_files/train_features",
        "crf_files/final_model"
    )

    # Step 3: run the parser over the sample notes and write the parsed XML.
    parser = XMLStreamParser(
        "test/data/fake_notes.xml",
        "crf_files/final_model",
        "NOTE_TEXT"
    )
    parser.parse_and_write_to("test/data/fake_notes_parsed.xml")
def parse(input_dir, output_dir, file):
    """
    Parse one XML file and write the result under a ``_parsed`` name.

    The input path is ``input_dir + file`` and the output path is
    ``output_dir + <file without trailing ".xml"> + "_parsed.xml"``. Both
    directory arguments are concatenated as-is, so they must already end
    with a path separator.

    :param input_dir: prefix of the input path (e.g. ``"data/in/"``)
    :param output_dir: prefix of the output path (e.g. ``"data/out/"``)
    :param file: name of the XML file inside ``input_dir``
    """
    parser = XMLStreamParser(input_dir + file, "crf_files/final_model", "NOTE_TEXT")
    # Strip only a literal trailing ".xml". The previous regex ".xml" treated
    # the dot as a wildcard and removed every match anywhere in the name
    # (e.g. "axml_notes.xml" became "_notes").
    suffix = ".xml"
    base_name = file[:-len(suffix)] if file.endswith(suffix) else file
    parser.parse_and_write_to(output_dir + base_name + "_parsed.xml")
# Example #4
# 0
def parse_big_xml(model="crf_files/final_model"):
    """
    Parse the sample notes XML with the given model and save the output.

    Reads ``test/data/fake_notes.xml`` and writes
    ``test/data/fake_notes_parsed.xml``; both paths are relative to the
    current working directory.

    :param model: path of the model file handed to ``XMLStreamParser``
    """
    source_xml = "test/data/fake_notes.xml"
    target_xml = "test/data/fake_notes_parsed.xml"
    XMLStreamParser(source_xml, model, "NOTE_TEXT").parse_and_write_to(target_xml)