from functools import partial

from pyspark import SparkContext

import utils

# eff_care_headers, readmission_headers, proc_headers and transform_fns
# are assumed to be defined at module scope (see the sketch after
# transform_row below).

def main():
    sc = SparkContext(appName="Transforming Eff Care")

    # Read the two raw CSV extracts.
    src_effcare = sc.textFile(utils.data_home + "/effective_care.csv")
    src_readmission = sc.textFile(utils.data_home + "/readmissions.csv")

    # Precompute each source's header mapping and bind it with partial so
    # transform_row can be used directly inside map().
    transform1 = partial(transform_row,
                         orig_headers=eff_care_headers,
                         new_headers=utils.map_headers(proc_headers, eff_care_headers))
    transform2 = partial(transform_row,
                         orig_headers=readmission_headers,
                         new_headers=utils.map_headers(proc_headers, readmission_headers))

    # Split each line into fields, project it onto the target schema, and
    # serialize it back into a delimited string.
    transformed_eff = src_effcare.map(utils.to_row_sep) \
                                 .map(transform1) \
                                 .map(utils.to_row_string)
    transformed_readmission = src_readmission.map(utils.to_row_sep) \
                                             .map(transform2) \
                                             .map(utils.to_row_string)

    # "+" on RDDs is a union: write both datasets into one output directory.
    (transformed_eff + transformed_readmission) \
        .saveAsTextFile(utils.data_home + "/procedures_data")
def transform_row(line_parts, orig_headers, new_headers):
    """Project one source row onto the target procedures schema.

    new_headers maps each target column name to its index in the source
    row, or -1 when the source file lacks that column; main() precomputes
    it with utils.map_headers and binds it (along with orig_headers) via
    functools.partial. transform_fns is a module-level list with one
    cleanup function per target column, and new_headers is expected to
    preserve that same column order so the zip below lines up.
    """
    new_parts = []
    for (key, index), transform in zip(new_headers.items(), transform_fns):
        if index > -1:
            # The source carries this column: extract and clean the value.
            value = transform(line_parts[index])
        else:
            # The source lacks this column: emit the null placeholder.
            value = utils.NULL_FMT
        new_parts.append(value)
    return new_parts
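The script leans on a handful of utils helpers and a data_home setting whose definitions don't appear above. The sketch below is one plausible shape for them, not the project's actual module: the field separator, the NULL_FMT placeholder value, and the DATA_HOME environment variable are all assumptions; only the names come from the code above.

# utils.py -- a minimal sketch of the helpers assumed by the script above.
import os
from collections import OrderedDict

data_home = os.environ.get("DATA_HOME", "/tmp/data")  # assumed location
NULL_FMT = "NULL"                                     # assumed placeholder

def to_row_sep(line, sep=","):
    # Split one text line into fields (assumes no quoted separators).
    return line.split(sep)

def to_row_string(parts, sep=","):
    # Join transformed fields back into one output line.
    return sep.join(parts)

def map_headers(new_headers, orig_headers):
    # For each target column, record its index in the source header list,
    # or -1 when the source file doesn't carry that column. An OrderedDict
    # keeps the target-column order stable so transform_row's zip with
    # transform_fns lines up.
    return OrderedDict(
        (h, orig_headers.index(h) if h in orig_headers else -1)
        for h in new_headers
    )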