예제 #1
0
def tokenize_and_output(csv_filename, tokenizers_by_key_of_description, output_filename, key_of_code,
                        keys_of_descriptions, vocab, delimiter, code_prefix,
                        use_descriptions=True, stop_words=None):
    """Tokenize every record of a CSV file and write one token line per record.

    Each output line consists of the tokens produced by
    ``tokenize_code(record[key_of_code], code_prefix)`` followed by the
    (optionally stop-word filtered) description tokens, joined by single
    spaces. The output file is created or truncated.

    Args:
        csv_filename: Path handed to ``CSVReader`` together with ``delimiter``.
        tokenizers_by_key_of_description: Mapping from a description key to an
            object exposing ``tokenize(text)``; one entry is looked up for
            every key in ``keys_of_descriptions``.
        output_filename: Path of the text file to write.
        key_of_code: Record key whose value is passed to ``tokenize_code``.
        keys_of_descriptions: Record keys whose values are tokenized, in order.
        vocab: Object with an ``update(iterable)`` method; it receives the
            tokens of every written line (e.g. a set or Counter -- confirm
            against callers).
        delimiter: Delimiter forwarded to ``CSVReader``.
        code_prefix: Prefix forwarded to ``tokenize_code``.
        use_descriptions: When false, description fields are skipped and only
            the code tokens are written.
        stop_words: Optional container of lower-case words; description tokens
            whose lower-cased form is contained in it are dropped. ``None``
            disables filtering.

    Raises:
        KeyError: If a record lacks a requested key or no tokenizer is
            registered for a description key (assuming dict-like records and
            mapping -- confirm against ``CSVReader``).
    """
    reader = CSVReader(csv_filename, delimiter)
    dataset = reader.read_from_file()

    # Mode 'w' already creates/truncates the file, so no prior removal of an
    # existing output file is needed.
    with open(output_filename, 'w') as out_file:
        for record in dataset:
            tokenized_record = []
            if use_descriptions:
                for key_of_description in keys_of_descriptions:
                    tokenizer = tokenizers_by_key_of_description[key_of_description]
                    tokenized_record.extend(tokenizer.tokenize(record[key_of_description]))
            if stop_words is not None:
                tokenized_record = [w for w in tokenized_record if w.lower() not in stop_words]

            # The code tokens are prepended after filtering, so the stop-word
            # list can never remove them.
            tokenized_record = tokenize_code(record[key_of_code], code_prefix) + tokenized_record
            output_line = " ".join(tokenized_record)
            vocab.update(tokenized_record)
            print(output_line, file=out_file)
예제 #2
0
     exit(-1)
 
 # Positional command-line arguments. How `phrase` is used lies beyond this
 # excerpt.
 token_file = sys.argv[1]
 vector_file = sys.argv[2]
 code_type = sys.argv[3]
 catalog = sys.argv[4]
 phrase = sys.argv[5]
 
 # NOTE(review): time.clock() was deprecated in Python 3.3 and removed in
 # Python 3.8; time.perf_counter() is the documented replacement. Any later
 # clock read that is compared against `start` must be switched together
 # with this line.
 start = time.clock()
 
 # Only the three listed code systems are accepted; anything else ends the
 # script with exit status -2.
 if code_type not in ['ICD', 'CHOP', 'DRG']:
     print("Code type has to be one of ICD|DRG|CHOP")
     exit(-2)
     
 print("Reading catalog")
 reader = CSVReader(catalog, ',')
 # Maps "<code_type>_<code without dots, upper-cased>" to the record's
 # 'text_de' field (presumably the German description -- confirm).
 descriptions_de = {}
 dataset = reader.read_from_file()
 for record in dataset:
     descriptions_de[code_type + '_' + record['code'].replace('.', '').upper()] = record['text_de']
     
 print("Reading vectors and tokens..")
 
 # read_code_vectors returns a mapping with the entries 'vectors' and
 # 'tokens'; judging by the variable names both are keyed by code -- confirm
 # against the helper's definition.
 vector_by_token = read_vectors(vector_file)
 res = read_code_vectors(vector_by_token, token_file)
 vectors_by_codes = res['vectors']
 tokens_by_codes = res['tokens']
 
 code_vocab = []
 for code in vectors_by_codes.keys():
     if(code.startswith(code_type)):
예제 #3
0
        exit(-1)

    # Positional command-line arguments. How `phrase` is used lies beyond
    # this excerpt.
    token_file = sys.argv[1]
    vector_file = sys.argv[2]
    code_type = sys.argv[3]
    catalog = sys.argv[4]
    phrase = sys.argv[5]

    # NOTE(review): time.clock() was deprecated in Python 3.3 and removed in
    # Python 3.8; time.perf_counter() is the documented replacement. Any
    # later clock read that is compared against `start` must be switched
    # together with this line.
    start = time.clock()

    # Only the three listed code systems are accepted; anything else ends
    # the script with exit status -2.
    if code_type not in ['ICD', 'CHOP', 'DRG']:
        print("Code type has to be one of ICD|DRG|CHOP")
        exit(-2)

    print("Reading catalog")
    reader = CSVReader(catalog, ',')
    # Maps "<code_type>_<code without dots, upper-cased>" to the record's
    # 'text_de' field (presumably the German description -- confirm).
    descriptions_de = {}
    dataset = reader.read_from_file()
    for record in dataset:
        descriptions_de[code_type + '_' + record['code'].replace(
            '.', '').upper()] = record['text_de']

    print("Reading vectors and tokens..")

    # read_code_vectors returns a mapping with the entries 'vectors' and
    # 'tokens'; judging by the variable names both are keyed by code --
    # confirm against the helper's definition.
    vector_by_token = read_vectors(vector_file)
    res = read_code_vectors(vector_by_token, token_file)
    vectors_by_codes = res['vectors']
    tokens_by_codes = res['tokens']

    code_vocab = []
    for code in vectors_by_codes.keys():