Пример #1
0
def calculate_pred_score(svm_file, org, example_type="pos", signal="tss", data_path="SRA-rnaseq"):
    """
    calculate svm prediction score around the true signal site

    The job arguments are prepared by data_process_depot, every job is run
    through predict_site_region via pg.pg_map, the per-job results are merged
    by reduce_pred_score and the merged scores are written with
    compressed_pickle.save.

    svm_file, org, data_path: forwarded unchanged to data_process_depot
    example_type: forwarded to data_process_depot and used in the output file name
    signal: forwarded to data_process_depot and used in the output file name

    Nothing is returned; the output file name is only printed.
    """

    local = False ## switch between local and compute cluster
    ## cluster compute options
    cluster_resource = {'pvmem':'8gb', 'pmem':'8gb', 'mem':'8gb', 'vmem':'8gb','ppn':'1', 'nodes':'1', 'walltime':'24:00:00'}
    #cluster_resource = {'mem':'6000', 'nodes':'1', 'walltime':'08:00'}

    num_seq_ex = 10 ## number of sequences are in single job
    ## FIXME: a center offset of 500 (nearby regions) was declared here but
    ## never handed to the workers; the unused local has been dropped.
    args_req_list = data_process_depot(svm_file, org, example_type, signal, data_path, num_seq_ex)

    ## job dispatching
    intm_ret = pg.pg_map(predict_site_region, args_req_list, param=cluster_resource, local=local, maxNumThreads=1, mem="8gb")
    print("Done with calculating the score for center region of example sequences")

    pred_out_val = reduce_pred_score(intm_ret)
    print("Done with collecting scores from different workers")

    ## save the scores; uuid1 keeps the file names of repeated runs apart
    fname = "%s_%s_ex_pred_score_%s" % (signal, example_type, uuid.uuid1())
    compressed_pickle.save(fname, pred_out_val)

    print("saving the scores in file %s" % fname)
Пример #2
0
def setup_splits(signal, method_name, method, param, num_folds, test_size,
                 random_state):
    """
    splitting the example data into train/test/validation group and
    evaluating every parameter grid point on every fold

    signal: handed to data_loader.load_all and forwarded to every job
    method_name: must be 'union' or 'individual'
    method: forwarded untouched to compute_core inside each job argument
    param: parameter space, expanded with ParameterGrid
    num_folds: passed to the splitter as n_iter
    test_size: the splitter receives test_size * 2
    random_state: passed to the splitter as random_state

    returns the (perf_dev, perf_test) pair produced by reduce_result
    raises ValueError when method_name is not supported
    """

    # Fail fast: previously an unsupported method_name ran the whole setup
    # and then crashed on an unbound `intermediate_ret`.
    supported_methods = ('union', 'individual')
    if method_name not in supported_methods:
        raise ValueError("unsupported method_name %r, expected one of %s" %
                         (method_name, ", ".join(supported_methods)))

    data = data_loader.load_all(signal)
    sizes = dict((org, len(data[org]["labels"])) for org in data.keys())

    # set up splitting strategy
    # NOTE(review): test_size is doubled here, presumably because the
    # three-way splitter divides it between dev and test - confirm.
    kf = MultitaskShuffleSplitThreeWay(sizes,
                                       n_iter=num_folds,
                                       indices=True,
                                       test_size=test_size * 2,
                                       random_state=random_state)

    param_grid = list(ParameterGrid(param))
    argument_list = []

    # one job per (fold, grid point) combination
    for fold_idx, (train_idx, dev_idx, test_idx) in enumerate(kf):
        for grid_idx, grid_point in enumerate(param_grid):
            arg = [
                signal, method, fold_idx, train_idx, dev_idx, test_idx,
                grid_idx, grid_point
            ]
            argument_list.append(arg)

    local = False

    # cluster compute options (kept separate from the `param` argument)
    cluster_resource = {
        'vmem': '4gb',
        'pvmem': '4gb',
        'pmem': '4gb',
        'mem': '4gb',
        'ppn': '1',
        'nodes': '1',
        'walltime': '2:00:00'
    }
    intermediate_ret = pg.pg_map(compute_core,
                                 argument_list,
                                 param=cluster_resource,
                                 local=local,
                                 maxNumThreads=1,
                                 mem="4gb")

    print("DONE with computation")

    # every job returns an iterable of results; flatten before reducing
    flat_intermediate = list(chain.from_iterable(intermediate_ret))
    perf_dev, perf_test = reduce_result(flat_intermediate)

    print("DONE reducing")

    return perf_dev, perf_test
Пример #3
0
def calculate_pred_score(svm_file,
                         org,
                         example_type="pos",
                         signal="tss",
                         data_path="SRA-rnaseq"):
    """
    calculate svm prediction score around the true signal site

    Steps: data_process_depot builds the list of job arguments, pg.pg_map
    runs predict_site_region on each of them, reduce_pred_score merges the
    worker results and compressed_pickle.save stores the merged scores.

    svm_file, org, data_path: forwarded unchanged to data_process_depot
    example_type, signal: forwarded to data_process_depot and also used to
        build the output file name

    The function returns nothing; the output file name is printed.
    """

    local = False  ## switch between local and compute cluster
    ## cluster compute options
    cluster_resource = {
        'pvmem': '8gb',
        'pmem': '8gb',
        'mem': '8gb',
        'vmem': '8gb',
        'ppn': '1',
        'nodes': '1',
        'walltime': '24:00:00'
    }
    #cluster_resource = {'mem':'6000', 'nodes':'1', 'walltime':'08:00'}

    num_seq_ex = 10  ## number of sequences are in single job
    ## FIXME: the center offset of 500 (nearby regions) that used to be
    ## declared here was never used by this function, so it was removed.
    args_req_list = data_process_depot(svm_file, org, example_type, signal,
                                       data_path, num_seq_ex)

    ## job dispatching
    intm_ret = pg.pg_map(predict_site_region,
                         args_req_list,
                         param=cluster_resource,
                         local=local,
                         maxNumThreads=1,
                         mem="8gb")
    print(
        "Done with calculating the score for center region of example sequences"
    )

    pred_out_val = reduce_pred_score(intm_ret)
    print("Done with collecting scores from different workers")

    ## save the scores; the uuid suffix keeps repeated runs from colliding
    fname = "%s_%s_ex_pred_score_%s" % (signal, example_type, uuid.uuid1())
    compressed_pickle.save(fname, pred_out_val)

    print("saving the scores in file %s" % fname)
Пример #4
0
def shift_signal_position(svm_file, org, example_type="pos", signal="tss", data_path="SRA-rnaseq"):
    """
    manually look at the position around the original position

    The job arguments come from data_process_depot (two sequences per job),
    each job is run through recenter_examples via pg.pg_map, the results are
    merged by reduce_modified_seq and handed to write_fasta_rec.
    Nothing is returned.
    """

    ## cluster compute options
    resources = {
        'pvmem': '4gb',
        'pmem': '4gb',
        'mem': '4gb',
        'vmem': '4gb',
        'ppn': '1',
        'nodes': '1',
        'walltime': '24:00:00',
    }

    seqs_per_job = 2 ## number of sequences are in a single job
    job_args = data_process_depot(svm_file, org, example_type, signal, data_path, seqs_per_job)

    ## job dispatching; local=False sends the jobs to the compute cluster
    worker_results = pg.pg_map(
        recenter_examples,
        job_args,
        param=resources,
        local=False,
        maxNumThreads=1,
        mem="4gb",
    )
    print("Done with trimming example sequences")

    recentered = reduce_modified_seq(worker_results)
    print("Done with collecting the trimmed examples")

    write_fasta_rec(recentered, signal, example_type)
    print("Done with writing examples in fasta format")
Пример #5
0
def shift_signal_position(svm_file,
                          org,
                          example_type="pos",
                          signal="tss",
                          data_path="SRA-rnaseq"):
    """
    manually look at the position around the original position

    Pipeline: data_process_depot prepares the job arguments, pg.pg_map runs
    recenter_examples on them, reduce_modified_seq gathers the outcome and
    write_fasta_rec writes it out. The function has no return value.
    """

    run_locally = False  ## switch between local and compute cluster
    mem_limit = "4gb"  ## one limit shared by all memory related options

    ## cluster compute options
    cluster_opts = {
        'pvmem': mem_limit,
        'pmem': mem_limit,
        'mem': mem_limit,
        'vmem': mem_limit,
        'ppn': '1',
        'nodes': '1',
        'walltime': '24:00:00'
    }

    ## two sequences are packed into a single job
    job_arguments = data_process_depot(svm_file, org, example_type, signal,
                                       data_path, 2)

    ## job dispatching
    job_output = pg.pg_map(recenter_examples,
                           job_arguments,
                           param=cluster_opts,
                           local=run_locally,
                           maxNumThreads=1,
                           mem=mem_limit)
    print("Done with trimming example sequences")

    trimmed_examples = reduce_modified_seq(job_output)
    print("Done with collecting the trimmed examples")

    write_fasta_rec(trimmed_examples, signal, example_type)
    print("Done with writing examples in fasta format")
Пример #6
0
def setup_splits(signal, method_name, method, param, num_folds, test_size, random_state):
    """
    splitting the example data into train/test/validation group and
    running compute_core for each fold / parameter grid point pair

    signal: handed to data_loader.load_all and forwarded to every job
    method_name: only 'union' and 'individual' are supported
    method: forwarded untouched to compute_core inside each job argument
    param: parameter space, expanded with ParameterGrid
    num_folds: passed to the splitter as n_iter
    test_size: the splitter receives test_size*2
    random_state: passed to the splitter as random_state

    returns the (perf_dev, perf_test) pair produced by reduce_result
    raises ValueError when method_name is not supported
    """

    # Reject unknown methods before any work is done; the old code ran the
    # complete setup and then failed on the unbound name `intermediate_ret`.
    if method_name not in ('union', 'individual'):
        raise ValueError("unsupported method_name %r, expected 'union' or 'individual'" % (method_name,))

    data = data_loader.load_all(signal)
    sizes = dict((org, len(data[org]["labels"])) for org in data.keys())

    # set up splitting strategy
    # NOTE(review): test_size is doubled, presumably to be shared between
    # the dev and the test part of the three-way split - confirm.
    kf = MultitaskShuffleSplitThreeWay(sizes, n_iter=num_folds, indices=True, test_size=test_size*2, random_state=random_state)

    param_grid = list(ParameterGrid(param))
    argument_list = []

    # one job per (fold, grid point) combination
    for fold_idx, (train_idx, dev_idx, test_idx) in enumerate(kf):
        for grid_idx, grid_point in enumerate(param_grid):
            arg = [signal, method, fold_idx, train_idx, dev_idx, test_idx, grid_idx, grid_point]
            argument_list.append(arg)

    local = False

    # cluster compute options (no longer overwriting the `param` argument)
    cluster_resource = {'vmem':'4gb', 'pvmem':'4gb', 'pmem':'4gb', 'mem':'4gb', 'ppn':'1', 'nodes':'1', 'walltime':'2:00:00'}
    intermediate_ret = pg.pg_map(compute_core, argument_list, param=cluster_resource, local=local, maxNumThreads=1, mem="4gb")

    print("DONE with computation")

    # every job returns an iterable of results; flatten before reducing
    flat_intermediate = list(chain.from_iterable(intermediate_ret))
    perf_dev, perf_test = reduce_result(flat_intermediate)

    print("DONE reducing")

    return perf_dev, perf_test
Пример #7
0
def manual_pos_shift(svm_file, org, signal="tss", data_path="SRA-rnaseq"):
    """
    manually look at the position around the original position

    Loads the examples through load_examples_from_fasta, unpickles the
    bz2-compressed model stored in svm_file, packs every example whose label
    differs from -1 into jobs of up to 10 sequences and dispatches the jobs
    with pg.pg_map. With the hard-coded task_type of 0 the jobs run
    predict_around_region and the reduced scores are saved with
    compressed_pickle.save; with task_type 1 they run predict_and_recenter
    and the result is written by write_fasta_rec.

    svm_file: path of the bz2-compressed pickled model; the model must
        provide param["center_pos"] and param["center_offset"]
    org, data_path: forwarded to load_examples_from_fasta
    signal: forwarded to load_examples_from_fasta and used in output names

    Nothing is returned.
    """

    ## loading data
    data = load_examples_from_fasta(signal, org, data_path)
    assert len(data["examples"]) == len(data["labels"])

    ## unpack the model
    import bz2
    try:
        import cPickle as pickle  ## Python 2
    except ImportError:
        import pickle  ## Python 3 has no cPickle module

    ## NOTE(security): unpickling can execute arbitrary code, only load
    ## model files from a trusted source.
    fh = bz2.BZ2File(svm_file, "rb")
    try:
        model = pickle.load(fh)
    finally:
        ## close the handle even when unpickling fails
        fh.close()

    ## getting the model information
    center_pos = model.param["center_pos"]
    center_offset = model.param["center_offset"]

    print ("model - center pos: %i, center reg: %i" % (center_pos, center_offset))

    start_scan = center_pos - center_offset
    stop_scan = center_pos + center_offset

    batch_size = 10  ## packing 10 seq to one job
    data_set = []
    argument_list = []

    label_type = -1  ## label_type will be +1/-1

    ## get the individual examples to recenter the signal position manually
    for single_example, label_info in zip(data["examples"], data["labels"]):
        if label_info == label_type:
            continue  ## examples carrying label_type are not processed

        data_set.append([single_example])

        if len(data_set) == batch_size:
            argument_list.append([start_scan, stop_scan, model, data_set])
            data_set = []

    ## dispatch the trailing, partially filled batch as well; it used to be
    ## dropped whenever the example count was not a multiple of batch_size
    if data_set:
        argument_list.append([start_scan, stop_scan, model, data_set])

    local = False
    cluster_resource = {
        "pvmem": "4gb",
        "pmem": "4gb",
        "mem": "4gb",
        "vmem": "4gb",
        "ppn": "1",
        "nodes": "1",
        "walltime": "4:00:00",
    }
    task_type = 0  # 1 recenter seq, 0 predict score

    if task_type:
        intm_ret = pg.pg_map(
            predict_and_recenter, argument_list, param=cluster_resource, local=local, maxNumThreads=2, mem="4gb"
        )
        print("Done with computation")

        fixed_example_seq = reduce_modified_seq(intm_ret)
        print("Done reducing the results")

        write_fasta_rec(fixed_example_seq, signal)

    else:
        intm_ret = pg.pg_map(
            predict_around_region, argument_list, param=cluster_resource, local=local, maxNumThreads=2, mem="4gb"
        )
        print("Done with computation")

        pred_out_val = reduce_pred_score(intm_ret)
        print("Done reducing the results")

        ## save the scores; the uuid suffix keeps repeated runs apart
        fname = "%s_pred_score_%s" % (signal, uuid.uuid1())
        compressed_pickle.save(fname, pred_out_val)

        print ("saving the score in file %s" % fname)