Exemplo n.º 1
0
def get_clean_data(rsa_file_path, rsa_format, ano_file_path, ano_format, meta_data, only_first_month = False):
    cmd_codes = meta_data['cmd']
    stay_type_codes = meta_data['stay_type']
    stay_complexity_codes = meta_data['stay_complexity']
    ano_data = list()
    exit_month_data = list()
    chunk = 1000
    sex_data_first_col = 0
    age_in_year_data_first_col = sex_data_first_col + 2
    age_in_day_data_first_col = age_in_year_data_first_col + formats.age_in_year_cols_count
    stay_length_data_first_col = age_in_day_data_first_col + formats.age_in_day_cols_count
    cmd_codes_first_col = stay_length_data_first_col + formats.stay_length_cols
    stay_type_codes_first_col = cmd_codes_first_col + len(cmd_codes)
    stay_complexity_codes_first_col = stay_type_codes_first_col + len(stay_type_codes)
    cols_count = stay_complexity_codes_first_col + len(stay_complexity_codes)
    np_data = np.zeros((chunk, cols_count), dtype=np.int)
    rsa_data = sparse.csr_matrix((0, cols_count))
    index = 0
    global_index = 0
    lines_count = 0
    with open(rsa_file_path) as rsa_file:
        with open(ano_file_path) as ano_file:
            while True:
                if index == chunk:
                    rsa_data = vstack([rsa_data, sparse.csr_matrix(np_data)])
                    np_data.fill(0)
                    index = 0
                rsa_line = rsa_file.readline()
                ano_line = ano_file.readline()
                if ano_tools.is_ano_ok(ano_line, ano_format) and rsa_tools.is_rsa_ok(rsa_line, rsa_format):
                    rsa = rsa_tools.get_rsa(rsa_line, rsa_format)
                    exit_month = rsa['exit_month']
                    if only_first_month and exit_month != 1:
                        continue
                    exit_month_data.append(exit_month)
                    ano = ano_tools.get_ano(ano_line, ano_format, global_index)
                    ano_data.append(ano)
                    np_data[index, sex_data_first_col + rsa['sex']] = 1
                    np_data[index, age_in_year_data_first_col + rsa['age_in_year_cat']] = 1
                    np_data[index, age_in_day_data_first_col + rsa['age_in_day_cat']] = 1
                    np_data[index, stay_length_data_first_col + rsa['stay_length_cat']] = 1
                    if rsa['cmd'] != '':
                        np_data[index, cmd_codes_first_col + cmd_codes.index(rsa['cmd'])] = 1
                    if rsa['stay_type'] != '':
                        np_data[index, stay_type_codes_first_col + stay_type_codes.index(rsa['stay_type'])] = 1
                    if rsa['stay_complexity'] != '':
                        np_data[index, stay_complexity_codes_first_col + stay_complexity_codes.index(rsa['stay_complexity'])] = 1
                    index += 1
                    global_index += 1
                if lines_count % 10000 == 0:
                    print '\rPorcessed %s \t added %s' % (lines_count, global_index),
                lines_count += 1
                if not rsa_line and not ano_line:
                    break

            if index % chunk != 0:
                rsa_data = vstack([rsa_data, sparse.csr_matrix(np_data[0:index, :])])
    return {'anos': ano_data,
     'rsas': rsa_data,
     'exit_month_data': exit_month_data}
Exemplo n.º 2
0
def get_clean_anos_data(rsa_file_path, rsa_format, ano_file_path, ano_format):
    ano_data = list()
    i = 0
    with open(rsa_file_path) as rsa_file:
        with open(ano_file_path) as ano_file:
            while True:
                rsa_line = rsa_file.readline()
                ano_line = ano_file.readline()
                if ano_tools.is_ano_ok(ano_line, ano_format) and rsa_tools.is_rsa_ok(rsa_line, rsa_format):
                    ano = ano_tools.get_ano(ano_line, ano_format)
                    ano_data.append(ano)
                if i % 10000 == 0:
                    print '\rPorcessed ', i,
                i += 1
                if not rsa_line and not ano_line:
                    break

    return ano_data