def tokenize_and_output(csv_filename, tokenizers_by_key_of_description, output_filename, key_of_code, keys_of_descriptions, vocab, delimiter, code_prefix, use_descriptions=True, stop_words=None):
    """Tokenize each record of a CSV file and write one space-joined token line per record.

    For every record, the code column (``key_of_code``) is tokenized via
    ``tokenize_code`` (prefixed with ``code_prefix``) and, optionally, each
    description column in ``keys_of_descriptions`` is tokenized with its
    dedicated tokenizer from ``tokenizers_by_key_of_description``. The code
    tokens are written first, followed by the description tokens.

    Args:
        csv_filename: Path of the input CSV file.
        tokenizers_by_key_of_description: Mapping column name -> tokenizer
            object exposing ``tokenize(text) -> list[str]``.
        output_filename: Path of the output text file (overwritten).
        key_of_code: Column name holding the code to tokenize.
        keys_of_descriptions: Iterable of description column names.
        vocab: A set-like/Counter-like object; updated in place with every
            emitted token (side effect).
        delimiter: CSV field delimiter passed to ``CSVReader``.
        code_prefix: Prefix passed to ``tokenize_code``.
        use_descriptions: When False, only code tokens are emitted.
        stop_words: Optional collection of lowercase words to drop from the
            description tokens (case-insensitive match).
    """
    reader = CSVReader(csv_filename, delimiter)
    dataset = reader.read_from_file()
    # NOTE: a dead try/except around a commented-out os.remove() was dropped;
    # opening with 'w+' already truncates any existing file.
    with open(output_filename, 'w+') as out_file:
        for record in dataset:
            tokenized_record = []
            if use_descriptions:
                for key_of_description in keys_of_descriptions:
                    tokenizer = tokenizers_by_key_of_description[key_of_description]
                    tokenized_record.extend(tokenizer.tokenize(record[key_of_description]))
                if stop_words is not None:
                    # Filter stop words case-insensitively from description tokens.
                    tokenized_record = [w for w in tokenized_record if w.lower() not in stop_words]
            # Code tokens always come first on the output line.
            tokenized_record = tokenize_code(record[key_of_code], code_prefix) + tokenized_record
            output_line = " ".join(tokenized_record)
            vocab.update(tokenized_record)
            print(output_line, file=out_file)
# --- CLI setup: read arguments, load the catalog, and load vectors/tokens. ---
token_file = sys.argv[1]
vector_file = sys.argv[2]
code_type = sys.argv[3]
catalog = sys.argv[4]
phrase = sys.argv[5]

# time.clock() was removed in Python 3.8; perf_counter() is its documented
# replacement for elapsed-time measurement.
start = time.perf_counter()

if code_type not in ['ICD', 'CHOP', 'DRG']:
    print("Code type has to be one of ICD|DRG|CHOP")
    sys.exit(-2)

print("Reading catalog")
reader = CSVReader(catalog, ',')
descriptions_de = {}
dataset = reader.read_from_file()
for record in dataset:
    # Catalog key format: '<TYPE>_<CODE>' with dots stripped and uppercased,
    # mapped to the German description text.
    descriptions_de[code_type + '_' + record['code'].replace('.', '').upper()] = record['text_de']

print("Reading vectors and tokens..")
vector_by_token = read_vectors(vector_file)
res = read_code_vectors(vector_by_token, token_file)
vectors_by_codes = res['vectors']
tokens_by_codes = res['tokens']

# Restrict the working vocabulary to codes of the requested type.
code_vocab = [code for code in vectors_by_codes if code.startswith(code_type)]
# --- CLI setup (duplicate variant): arguments, catalog, vectors/tokens. ---
token_file = sys.argv[1]
vector_file = sys.argv[2]
code_type = sys.argv[3]
catalog = sys.argv[4]
phrase = sys.argv[5]

# Replaces time.clock(), which was removed in Python 3.8.
start = time.perf_counter()

if code_type not in ['ICD', 'CHOP', 'DRG']:
    print("Code type has to be one of ICD|DRG|CHOP")
    sys.exit(-2)

print("Reading catalog")
reader = CSVReader(catalog, ',')
dataset = reader.read_from_file()
# Map '<TYPE>_<CODE>' (dots stripped, uppercased) -> German description text.
descriptions_de = {}
for record in dataset:
    key = code_type + '_' + record['code'].replace('.', '').upper()
    descriptions_de[key] = record['text_de']

print("Reading vectors and tokens..")
vector_by_token = read_vectors(vector_file)
res = read_code_vectors(vector_by_token, token_file)
vectors_by_codes = res['vectors']
tokens_by_codes = res['tokens']

# Keep only the codes belonging to the requested code type.
code_vocab = [code for code in vectors_by_codes if code.startswith(code_type)]