示例#1
0
文件: stage1.py 项目: chocjy/sc-2015
def run_stage1(params_dict):
    """Run ``converter``, persist the dataset, and dump matrix non-zeros as text.

    Keys read from ``params_dict`` (all via ``.get``, so a missing key
    yields ``None``):
        imzxmlpath, imzbinpath: forwarded to ``converter``.
        logsdir: value used for ``spark.eventLog.dir``.
        outpathrdd: forwarded to ``converter``, then used as the target of
            ``dataset.save`` and the source of ``MSIDataset.load``.
        outputmatrix: target of the final ``saveAsTextFile``.

    A new ``SparkContext`` named ``'Loader'`` is created here; this function
    neither stops nor returns it.
    """
    xml_path = params_dict.get('imzxmlpath')
    bin_path = params_dict.get('imzbinpath')
    event_log_dir = params_dict.get('logsdir')
    rdd_path = params_dict.get('outpathrdd')
    matrix_path = params_dict.get('outputmatrix')

    # Apply the settings one at a time; rebinding to the return value keeps
    # this equivalent to the usual chained ``.set(...).set(...)`` form.
    conf = SparkConf()
    settings = (
        ('spark.eventLog.enabled', 'true'),
        ('spark.eventLog.dir', event_log_dir),
        ('spark.driver.maxResultSize', '2g'),
    )
    for key, value in settings:
        conf = conf.set(key, value)

    sc = SparkContext(appName='Loader', conf=conf)
    dataset = converter(sc, xml_path, bin_path, rdd_path)

    # NOTE(review): this value is never used -- ``rdd`` is rebound below.
    # The attribute read is retained in case ``spectra`` is not a plain field.
    rdd = dataset.spectra
    dataset.save(rdd_path)

    # Reload what was just saved and take the matrix's non-zero records.
    reloaded = MSIDataset.load(sc, rdd_path)
    nonzero_entries = MSIMatrix(reloaded).nonzeros

    # One text line per record; the separator is space-then-comma (' ,').
    rdd = nonzero_entries.map(lambda entry: ' ,'.join(map(str, entry)))
    rdd.saveAsTextFile(matrix_path)
示例#2
0
def run_stage4(params_dict):
    """Combine leverage scores with an id mapping, sort, and save as text.

    Keys read from ``params_dict``:
        leveragescores: path given to ``sc.textFile``; every line is parsed
            as a single float.
        mappingsfile: path given to ``sc.textFile``; every line is a
            comma-separated list of integers, of which the first two are used.
        on_rows: truthy selects the row branch and the ``spatialvals``
            output path; falsy selects the ``columnmzvals`` branch and path.
        sc: context object used for ``textFile`` and ``MSIDataset.load``
            (presumably a SparkContext -- confirm with the caller).
        raw_rdd: location passed to ``MSIDataset.load``.

    Each output line is the fields of one result tuple joined with ``', '``;
    lines are sorted by the tuple's last field in descending order.
    """
    #logs_dir = params_dict.get('logsdir')
    column_leverage_score = params_dict.get('leveragescores')
    on_rows = params_dict.get('on_rows')
    sc = params_dict.get('sc')
    mappings = params_dict.get('mappingsfile')

    raw_rdd = params_dict.get('raw_rdd')
    output = params_dict.get('spatialvals') if on_rows else params_dict.get(
        'columnmzvals')

    # (line index, score) pairs, keyed by the position of the score in the file.
    column_leverage_scores = sc.textFile(column_leverage_score).map(
        lambda x: float(str(x)))
    zipped = column_leverage_scores.zipWithIndex().map(lambda x: (x[1], x[0]))

    # Build a real list here: on Python 3 ``map`` returns an iterator, which
    # the x[1] / x[0] indexing in the next step cannot subscript.
    # The 'zip' tag marks mapping records so they can be told apart from the
    # (index, score) pairs after the union -- presumably used by ``replace``;
    # confirm against its definition.
    mappings = sc.textFile(mappings).map(
        lambda x: [int(v) for v in x.split(',')]).map(
        lambda x: (x[1], x[0], 'zip'))
    unioned = zipped.union(mappings)

    # Collect every record sharing the same first element into one list.
    rows_grouped = unioned.map(lambda x: (x[0], x)).groupByKey().map(
        lambda x: (x[0], list(x[1])))
    new_ids = rows_grouped.map(lambda x: replace(x[1]))

    data = MSIDataset.load(sc, raw_rdd)
    mz_range = data.mz_range

    # Only xlen and tlen are used below; the 4-way unpack is kept so a
    # differently shaped ``data.shape`` still fails here.
    xlen, _ylen, tlen, _mzlen = data.shape
    if on_rows:
        save_rdd = new_ids.map(
            lambda x: (get_x(x[0], xlen), get_y(x[0], xlen), x[0], x[1]))
        sorted_rdd = save_rdd.sortBy(lambda x: x[3], ascending=False)
    else:

        def f(xs):
            # Build the m/z axis once per partition rather than once per record.
            mz_axis = get_mz_axis(mz_range)
            for x in xs:
                yield (get_t(x[0], tlen), get_mz(x[0], tlen),
                       transform_tomz(x[0], mz_axis, tlen), x[0], x[1])

        get_t_mz = new_ids.mapPartitions(f)
        sorted_rdd = get_t_mz.sortBy(lambda x: x[4], ascending=False)
    formatted_vals = sorted_rdd.map(lambda x: ', '.join(str(i) for i in x))
    formatted_vals.saveAsTextFile(output)
示例#3
0
def run_stage1(params_dict):
    """Load data through ``converter``, save it, and export matrix non-zeros.

    ``params_dict`` supplies (via ``.get``; absent keys become ``None``):
        imzxmlpath / imzbinpath -- inputs handed to ``converter``.
        logsdir -- directory set as ``spark.eventLog.dir``.
        outpathrdd -- handed to ``converter``, written by ``dataset.save``
            and read back by ``MSIDataset.load``.
        outputmatrix -- destination for ``saveAsTextFile``.

    Creates a ``SparkContext`` with app name ``'Loader'`` and leaves it
    running when the function returns.
    """
    get = params_dict.get
    imz_xml = get('imzxmlpath')
    imz_bin = get('imzbinpath')
    log_dir = get('logsdir')
    dataset_path = get('outpathrdd')
    text_path = get('outputmatrix')

    conf = (
        SparkConf()
        .set('spark.eventLog.enabled', 'true')
        .set('spark.eventLog.dir', log_dir)
        .set('spark.driver.maxResultSize', '2g')
    )
    sc = SparkContext(appName='Loader', conf=conf)

    dataset = converter(sc, imz_xml, imz_bin, dataset_path)
    # NOTE(review): the value read here is overwritten before use; the read
    # itself is preserved from the original.
    rdd = dataset.spectra
    dataset.save(dataset_path)

    # Round-trip through storage before building the matrix.
    stored = MSIDataset.load(sc, dataset_path)
    matrix = MSIMatrix(stored)

    def as_text(record):
        # Fields are separated by ' ,' (space, then comma).
        return ' ,'.join([str(ele) for ele in record])

    rdd = matrix.nonzeros.map(as_text)
    rdd.saveAsTextFile(text_path)
示例#4
0
文件: stage4.py 项目: chocjy/sc-2015
def run_stage4(params_dict):
    """Join leverage scores with an id mapping, sort descending, save as text.

    Keys read from ``params_dict``:
        leveragescores: path read with ``sc.textFile``; one float per line.
        mappingsfile: path read with ``sc.textFile``; comma-separated integers
            per line, of which the first two are used.
        on_rows: truthy -> row branch, output goes to ``spatialvals``;
            falsy -> other branch, output goes to ``columnmzvals``.
        sc: context object providing ``textFile`` (presumably a SparkContext
            -- confirm with the caller); also passed to ``MSIDataset.load``.
        raw_rdd: location passed to ``MSIDataset.load``.

    The saved lines are the result tuples' fields joined with ``', '``,
    ordered by the last field from largest to smallest.
    """
    #logs_dir = params_dict.get('logsdir')
    column_leverage_score = params_dict.get('leveragescores')
    on_rows = params_dict.get('on_rows')
    sc = params_dict.get('sc')
    mappings = params_dict.get('mappingsfile')

    raw_rdd = params_dict.get('raw_rdd')
    output = params_dict.get('spatialvals') if on_rows else params_dict.get('columnmzvals')

    # Pair each score with its line index, index first.
    column_leverage_scores = sc.textFile(column_leverage_score).map(lambda x: float(str(x)))
    zipped = column_leverage_scores.zipWithIndex().map(lambda x: (x[1], x[0]))

    def parse_mapping(line):
        # A list is required: Python 3's ``map`` yields an iterator that
        # cannot be indexed, so ``map(int, ...)[1]`` raises TypeError there.
        fields = [int(v) for v in line.split(',')]
        # Second field becomes the key; 'zip' tags the record as a mapping
        # entry (presumably consumed by ``replace`` -- confirm).
        return (fields[1], fields[0], 'zip')

    mappings = sc.textFile(mappings).map(parse_mapping)
    unioned = zipped.union(mappings)

    # Gather all records that share a first element into one list per key.
    rows_grouped = unioned.map(lambda x: (x[0], x)).groupByKey().map(lambda x: (x[0], list(x[1])))
    new_ids = rows_grouped.map(lambda x: replace(x[1]))

    data = MSIDataset.load(sc, raw_rdd)
    mz_range = data.mz_range

    # ylen and mzlen are not used; unpacking all four still checks the shape's arity.
    xlen, _ylen, tlen, _mzlen = data.shape
    if on_rows:
        save_rdd = new_ids.map(lambda x: (get_x(x[0], xlen), get_y(x[0], xlen), x[0], x[1]))
        sorted_rdd = save_rdd.sortBy(lambda x: x[3], ascending=False)
    else:
        def f(xs):
            # The axis is computed once per partition, not once per record.
            mz_axis = get_mz_axis(mz_range)
            for x in xs:
                yield (get_t(x[0], tlen), get_mz(x[0], tlen), transform_tomz(x[0], mz_axis, tlen), x[0], x[1])
        get_t_mz = new_ids.mapPartitions(f)
        sorted_rdd = get_t_mz.sortBy(lambda x: x[4], ascending=False)
    formatted_vals = sorted_rdd.map(lambda x: ', '.join(str(i) for i in x))
    formatted_vals.saveAsTextFile(output)