def run_stage1(params_dict):
    """Stage 1: convert an imzML/imzBin dataset and export its sparse matrix.

    Converts the raw input files into an MSIDataset, persists it, reloads the
    persisted copy, and writes the nonzero matrix entries as text lines.

    Expected keys in ``params_dict``:
        imzxmlpath   -- path to the imzML (XML) input file
        imzbinpath   -- path to the imz binary input file
        logsdir      -- directory for the Spark event log
        outpathrdd   -- output path for the saved MSIDataset RDD
        outputmatrix -- output path for the nonzero-entries text file
    """
    imzXMLPath = params_dict.get('imzxmlpath')
    imzBinPath = params_dict.get('imzbinpath')
    logs_dir = params_dict.get('logsdir')
    output_rdd = params_dict.get('outpathrdd')
    output_matrix = params_dict.get('outputmatrix')
    conf = (SparkConf()
            .set('spark.eventLog.enabled', 'true')
            .set('spark.eventLog.dir', logs_dir)
            .set('spark.driver.maxResultSize', '2g'))
    sc = SparkContext(appName='Loader', conf=conf)
    dataset = converter(sc, imzXMLPath, imzBinPath, output_rdd)
    # (Removed dead local: `dataset.spectra` was bound to a name that was
    # immediately overwritten below without ever being used.)
    dataset.save(output_rdd)
    # Reload from disk so the matrix is built from the persisted copy.
    data = MSIDataset.load(sc, output_rdd)
    mat = MSIMatrix(data)
    non_zer = mat.nonzeros
    # One output line per nonzero entry, fields joined with ' ,'.
    # NOTE(review): the ' ,' separator (space before comma) is preserved
    # byte-for-byte from the original — downstream parsers may rely on it.
    rdd = non_zer.map(lambda x: ' ,'.join(str(ele) for ele in x))
    rdd.saveAsTextFile(output_matrix)
def run_stage4(params_dict):
    """Stage 4: join leverage scores with id mappings and write sorted output.

    Pairs each leverage score with its index, unions that with the
    (new_id, old_id, 'zip') mapping records, regroups by id via ``replace``,
    then writes either spatial (x, y, id, score) rows or m/z
    (t, mz_index, mz_value, id, score) rows, sorted by score descending.

    Expected keys in ``params_dict``:
        leveragescores -- text file of one float score per line
        on_rows        -- truthy: spatial output; falsy: column/m/z output
        sc             -- live SparkContext
        mappingsfile   -- CSV text file of integer id pairs per line
        raw_rdd        -- path of the saved MSIDataset (for shape / mz_range)
        spatialvals    -- output path used when on_rows is truthy
        columnmzvals   -- output path used when on_rows is falsy
    """
    column_leverage_score = params_dict.get('leveragescores')
    on_rows = params_dict.get('on_rows')
    sc = params_dict.get('sc')
    mappings_file = params_dict.get('mappingsfile')
    raw_rdd = params_dict.get('raw_rdd')
    output = (params_dict.get('spatialvals') if on_rows
              else params_dict.get('columnmzvals'))
    column_leverage_scores = sc.textFile(column_leverage_score).map(
        lambda x: float(x))
    # (index, score) pairs keyed by position in the scores file.
    zipped = column_leverage_scores.zipWithIndex().map(lambda x: (x[1], x[0]))
    # BUG FIX: in Python 3, map() returns a lazy iterator that is not
    # subscriptable, so the original `x[1]` on the map object raised
    # TypeError. Materialize each parsed row to a list before indexing.
    mappings = sc.textFile(mappings_file).map(
        lambda x: list(map(int, x.split(',')))).map(
        lambda x: (x[1], x[0], 'zip'))
    unioned = zipped.union(mappings)
    rows_grouped = unioned.map(lambda x: (x[0], x)).groupByKey().map(
        lambda x: (x[0], list(x[1])))
    new_ids = rows_grouped.map(lambda x: replace(x[1]))
    data = MSIDataset.load(sc, raw_rdd)
    mz_range = data.mz_range
    xlen, ylen, tlen, mzlen = data.shape
    if on_rows:
        save_rdd = new_ids.map(
            lambda x: (get_x(x[0], xlen), get_y(x[0], xlen), x[0], x[1]))
        sorted_rdd = save_rdd.sortBy(lambda x: x[3], ascending=False)
    else:
        def f(xs):
            # Build the m/z axis once per partition, not once per element.
            mz_axis = get_mz_axis(mz_range)
            for x in xs:
                yield (get_t(x[0], tlen), get_mz(x[0], tlen),
                       transform_tomz(x[0], mz_axis, tlen), x[0], x[1])
        get_t_mz = new_ids.mapPartitions(f)
        sorted_rdd = get_t_mz.sortBy(lambda x: x[4], ascending=False)
    formatted_vals = sorted_rdd.map(lambda x: ', '.join(str(i) for i in x))
    formatted_vals.saveAsTextFile(output)
def run_stage1(params_dict):
    """Stage 1: convert an imzML/imzBin dataset and export its sparse matrix.

    Converts the raw input files into an MSIDataset, persists it, reloads the
    persisted copy, and writes the nonzero matrix entries as text lines.

    Expected keys in ``params_dict``:
        imzxmlpath   -- path to the imzML (XML) input file
        imzbinpath   -- path to the imz binary input file
        logsdir      -- directory for the Spark event log
        outpathrdd   -- output path for the saved MSIDataset RDD
        outputmatrix -- output path for the nonzero-entries text file
    """
    imzXMLPath = params_dict.get('imzxmlpath')
    imzBinPath = params_dict.get('imzbinpath')
    logs_dir = params_dict.get('logsdir')
    output_rdd = params_dict.get('outpathrdd')
    output_matrix = params_dict.get('outputmatrix')
    conf = (SparkConf()
            .set('spark.eventLog.enabled', 'true')
            .set('spark.eventLog.dir', logs_dir)
            .set('spark.driver.maxResultSize', '2g'))
    sc = SparkContext(appName='Loader', conf=conf)
    dataset = converter(sc, imzXMLPath, imzBinPath, output_rdd)
    # (Removed dead local: `dataset.spectra` was bound to a name that was
    # immediately overwritten below without ever being used.)
    dataset.save(output_rdd)
    # Reload from disk so the matrix is built from the persisted copy.
    data = MSIDataset.load(sc, output_rdd)
    mat = MSIMatrix(data)
    non_zer = mat.nonzeros
    # One output line per nonzero entry, fields joined with ' ,'.
    # NOTE(review): the ' ,' separator (space before comma) is preserved
    # byte-for-byte from the original — downstream parsers may rely on it.
    rdd = non_zer.map(lambda x: ' ,'.join(str(ele) for ele in x))
    rdd.saveAsTextFile(output_matrix)
def run_stage4(params_dict):
    """Stage 4: join leverage scores with id mappings and write sorted output.

    Pairs each leverage score with its index, unions that with the
    (new_id, old_id, 'zip') mapping records, regroups by id via ``replace``,
    then writes either spatial (x, y, id, score) rows or m/z
    (t, mz_index, mz_value, id, score) rows, sorted by score descending.

    Expected keys in ``params_dict``:
        leveragescores -- text file of one float score per line
        on_rows        -- truthy: spatial output; falsy: column/m/z output
        sc             -- live SparkContext
        mappingsfile   -- CSV text file of integer id pairs per line
        raw_rdd        -- path of the saved MSIDataset (for shape / mz_range)
        spatialvals    -- output path used when on_rows is truthy
        columnmzvals   -- output path used when on_rows is falsy
    """
    column_leverage_score = params_dict.get('leveragescores')
    on_rows = params_dict.get('on_rows')
    sc = params_dict.get('sc')
    mappings_file = params_dict.get('mappingsfile')
    raw_rdd = params_dict.get('raw_rdd')
    output = (params_dict.get('spatialvals') if on_rows
              else params_dict.get('columnmzvals'))
    column_leverage_scores = sc.textFile(column_leverage_score).map(
        lambda x: float(x))
    # (index, score) pairs keyed by position in the scores file.
    zipped = column_leverage_scores.zipWithIndex().map(lambda x: (x[1], x[0]))
    # BUG FIX: in Python 3, map() returns a lazy iterator that is not
    # subscriptable, so the original `x[1]` on the map object raised
    # TypeError. Materialize each parsed row to a list before indexing.
    mappings = sc.textFile(mappings_file).map(
        lambda x: list(map(int, x.split(',')))).map(
        lambda x: (x[1], x[0], 'zip'))
    unioned = zipped.union(mappings)
    rows_grouped = unioned.map(lambda x: (x[0], x)).groupByKey().map(
        lambda x: (x[0], list(x[1])))
    new_ids = rows_grouped.map(lambda x: replace(x[1]))
    data = MSIDataset.load(sc, raw_rdd)
    mz_range = data.mz_range
    xlen, ylen, tlen, mzlen = data.shape
    if on_rows:
        save_rdd = new_ids.map(
            lambda x: (get_x(x[0], xlen), get_y(x[0], xlen), x[0], x[1]))
        sorted_rdd = save_rdd.sortBy(lambda x: x[3], ascending=False)
    else:
        def f(xs):
            # Build the m/z axis once per partition, not once per element.
            mz_axis = get_mz_axis(mz_range)
            for x in xs:
                yield (get_t(x[0], tlen), get_mz(x[0], tlen),
                       transform_tomz(x[0], mz_axis, tlen), x[0], x[1])
        get_t_mz = new_ids.mapPartitions(f)
        sorted_rdd = get_t_mz.sortBy(lambda x: x[4], ascending=False)
    formatted_vals = sorted_rdd.map(lambda x: ', '.join(str(i) for i in x))
    formatted_vals.saveAsTextFile(output)