def run(args): logging.info("Starting") Utilities.ensure_requisite_folders(args.output) logging.info("Read covariate") covariate = pq.read_table(args.covariate).to_pandas() logging.info("Read data") data = pq.read_table(args.data).to_pandas() logging.info("Processing") covariate_names = covariate.columns.values[1:] results = {"individual": data.individual.values} variables = [x for x in data.columns.values[1:]] for i, column in enumerate(variables): logging.log(9, "%i/%i:%s", i, len(variables), column) d = data[["individual", column]].rename(columns={ column: "y" }).merge(covariate, on="individual", how="inner").drop("individual", axis=1) y, X = dmatrices("y ~ {}".format(" + ".join(covariate_names)), data=d, return_type="dataframe") model = sm.OLS(y, X) result = model.fit() results[column] = result.resid results = pandas.DataFrame(results)[["individual"] + variables] Parquet.save_variable(args.output, results) logging.info("Finished")
def run(args):
    start = timer()
    Utilities.ensure_requisite_folders(args.parquet_output)

    logging.info("Loading variable")
    variables = ModelTraining.load_variable_file(args.variable_file)

    logging.info("Saving")
    Parquet.save_variable(args.parquet_output, variables)

    end = timer()
    logging.info("Finished in %s", str(end - start))
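
# A hypothetical command-line wrapper for the conversion run() above; the flag names
# mirror the attributes that run() reads (variable_file, parquet_output), but the
# project's actual entry point and argument parsing may differ.
def _example_cli():
    import argparse
    parser = argparse.ArgumentParser(description="Convert a variable file to parquet")
    parser.add_argument("--variable_file", help="Path to the input variable file")
    parser.add_argument("--parquet_output", help="Path of the parquet file to write")
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    run(args)
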
def process_phenotype(path, name, output_prefix):
    pheno = ModelTraining.load_variable_file(path)
    pheno_path = output_prefix + ".expression." + name + ".parquet"
    Parquet.save_variable(pheno_path, pheno)
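
# A hypothetical batch driver showing how process_phenotype() might be called for
# several phenotype files; the file names and output prefix are illustrative only.
def _example_batch():
    phenotypes = {
        "height": "input/height.txt",
        "bmi": "input/bmi.txt",
    }
    for name, path in phenotypes.items():
        # Each call writes <output_prefix>.expression.<name>.parquet
        process_phenotype(path, name, "results/study")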