Example #1
def download_gtf(yaml_config):
    """
    download gtf/gff file from remote data publishing services
    """
    operation_seleted = "a"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = [] 
    for org_name, det in orgdb.items():
        ## arguments to pygrid 
        arg = [[det['release_nb'], det['long_name'], det['genome_dir']]]

        if det['release_db'] == 'ensembl_metazoa_genome':
            job = pg.cBioJob(call_metazoa_gtf, arg) 
        elif det['release_db'] == 'phytozome_genome':
            job = pg.cBioJob(call_phytozome_gtf, arg) 
        elif det['release_db'] == 'ensembl_genome':
            job = pg.cBioJob(call_ensembl_gtf, arg) 
        else:
            print "error: download gtf plugin for %s not available, module works with ensembl_genome, ensembl_metazoa_genome and phytozome_genome servers." % det['release_db']
            sys.exit(0)

        job.mem="2gb"
        job.vmem="2gb"
        job.pmem="2gb"
        job.pvmem="2gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "2:00:00"
        
        Jobs.append(job)
    print 
    print "sending gtf download job to worker"
    print 
    processedJobs = pg.process_jobs(Jobs)
Example #2
def alignment_filter(yaml_config):
    """
    run multimapper resolution program 
    """
    operation_seleted = "m"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():

        num_cpus = 5
        ## arguments to pygrid 
        arg = [[det['short_name'], det['read_map_dir'], num_cpus]]

        job = pg.cBioJob(call_alignment_filter, arg) 

        ## native specifications 
        job.pmem="90gb"
        job.pvmem="90gb"
        job.mem="90gb"
        job.vmem="90gb"
        job.nodes = 1
        job.ppn = num_cpus
        job.walltime = "48:00:00"

        Jobs.append(job)
    print 
    print "sending multi map resolution jobs to worker"
    print 
    processedJobs = pg.process_jobs(Jobs)
Example #3
def download_sra_data(yaml_config):
    """
    download sra file for the working organism   
    """
    operation_seleted = "1"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        arg = [[det['sra_run_id'], det['fastq_path']]]

        job = pg.cBioJob(call_download_sra_file, arg)

        job.mem = "2gb"
        job.vmem = "2gb"
        job.pmem = "2gb"
        job.pvmem = "2gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "1:00:00"

        Jobs.append(job)
    print
    print "sending download SRA file jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)
Example #4
def align_rnaseq_reads(yaml_config):
    """
    wrapper for aligning rnaseq reads using STAR
    """
    operation_seleted = "3"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid 
        lib_type = 'PE'
        lib_type = 'SE' if len(det['fastq'])==1 else lib_type

        ## library insert size 
        lib_insert_size = 100000
        num_cpu = 3

        arg = [[det, lib_type, lib_insert_size, num_cpu]]

        job = pg.cBioJob(call_align_reads, arg) 
    
        job.mem="90gb"
        job.vmem="90gb"
        job.pmem="30gb"
        job.pvmem="30gb"
        job.nodes = 1
        job.ppn = num_cpu
        job.walltime = "48:00:00"
        
        Jobs.append(job)
    print 
    print "sending read alignment with STAR jobs to worker"
    print 
    processedJobs = pg.process_jobs(Jobs, local=False)
Example #5
def align_rnaseq_reads(yaml_config):
    """
    wrapper for aligning rnaseq reads using STAR
    """
    operation_seleted = "3"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        lib_type = 'PE'
        lib_type = 'SE' if len(det['fastq']) == 1 else lib_type

        ## library insert size
        lib_insert_size = 100000
        num_cpu = 3

        arg = [[det, lib_type, lib_insert_size, num_cpu]]

        job = pg.cBioJob(call_align_reads, arg)

        job.mem = "90gb"
        job.vmem = "90gb"
        job.pmem = "30gb"
        job.pvmem = "30gb"
        job.nodes = 1
        job.ppn = num_cpu
        job.walltime = "48:00:00"

        Jobs.append(job)
    print
    print "sending read alignment with STAR jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs, local=False)
Example #6
def create_genome_index(yaml_config):
    """
    wrapper for calling genome index function 
    """
    operation_seleted = "2"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        num_cpus = 4
        arg = [[
            det['fasta'], det['genome_index_dir'], det['gtf'], num_cpus,
            det['read_length'] - 1
        ]]

        job = pg.cBioJob(call_genome_index, arg)

        job.mem = "46gb"
        job.vmem = "46gb"
        job.pmem = "46gb"
        job.pvmem = "46gb"
        job.nodes = 1
        job.ppn = num_cpus
        job.walltime = "24:00:00"

        Jobs.append(job)
    print
    print "sending star genome index jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)
Example #7
def transcript_prediction_stringtie(yaml_config):
    """
    transcript prediction using StringTie
    """

    operation_seleted = "5"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid

        arg = [[
            det["read_map_dir"], det["short_name"], det["read_assembly_dir"]
        ]]

        job = pg.cBioJob(call_transcript_prediction_stringtie, arg)

        cpus = 1
        ## native specifications
        job.mem = "12gb"
        job.vmem = "12gb"
        job.pmem = "12gb"
        job.pvmem = "12gb"
        job.nodes = 1
        job.ppn = cpus
        job.walltime = "24:00:00"

        Jobs.append(job)
    print("\nsending transcript assembly stringtie jobs to worker\n")

    local_compute = False  ## switching between local multithreading and cluster computing

    processedJobs = pg.process_jobs(Jobs, local=local_compute)
Example #8
def transcript_prediction_cuff(yaml_config):
    """
    transcript prediction using cufflinks
    """
    operation_seleted = "c"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid 
        arg = [[det, 4]]

        job = pg.cBioJob(call_transcript_prediction_cuff, arg) 

        ## native specifications 
        job.mem="96gb"
        job.vmem="96gb"
        job.pmem="24gb"
        job.pvmem="24gb"
        job.nodes = 1
        job.ppn = 4
        job.walltime = "32:00:00"

        Jobs.append(job)
    print 
    print "sending transcript assembly cufflinks jobs to worker"
    print 
    processedJobs = pg.process_jobs(Jobs)
Example #9
def transcript_prediction_cuff(yaml_config):
    """
    transcript prediction using cufflinks
    """
    operation_seleted = "c"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        arg = [[det, 4]]

        job = pg.cBioJob(call_transcript_prediction_cuff, arg)

        ## native specifications
        job.mem = "96gb"
        job.vmem = "96gb"
        job.pmem = "24gb"
        job.pvmem = "24gb"
        job.nodes = 1
        job.ppn = 4
        job.walltime = "32:00:00"

        Jobs.append(job)
    print
    print "sending transcript assembly cufflinks jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)
Example #10
def transcript_prediction_trsk(yaml_config):
    """
    transcript prediction using TranscriptSkimmer
    """
    operation_seleted = "4"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        arg = [det]

        job = pg.cBioJob(call_transcript_prediction_trsk, arg)

        ## native specifications
        job.mem = "32gb"
        job.vmem = "32gb"
        job.pmem = "32gb"
        job.pvmem = "32gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "9:00:00"

        Jobs.append(job)
    print
    print "sending transcript assembly trsk jobs to worker"
    print

    local = True  ## cluster compute switch
    processedJobs = pg.process_jobs(Jobs, local=local)
Example #11
def create_genome_index(yaml_config):
    """
    wrapper for calling genome index function 
    """
    operation_seleted = "2"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid 
        num_cpus = 4 
        arg = [[det['fasta'], det['genome_index_dir'], det['gtf'], num_cpus, det['read_length']-1]]

        job = pg.cBioJob(call_genome_index, arg) 
    
        job.mem="46gb"
        job.vmem="46gb"
        job.pmem="46gb"
        job.pvmem="46gb"
        job.nodes = 1
        job.ppn = num_cpus
        job.walltime = "24:00:00"
        
        Jobs.append(job)
    print 
    print "sending star genome index jobs to worker"
    print 
    processedJobs = pg.process_jobs(Jobs)
Example #12
def download_sra_data(yaml_config):
    """
    download sra file for the working organism   
    """
    operation_seleted = "1"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = [] 
    for org_name, det in orgdb.items():
        ## arguments to pygrid 
        arg = [[det['sra_run_id'], det['fastq_path']]]

        job = pg.cBioJob(call_download_sra_file, arg) 
    
        job.mem="2gb"
        job.vmem="2gb"
        job.pmem="2gb"
        job.pvmem="2gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "1:00:00"
        
        Jobs.append(job)
    print 
    print "sending download SRA file jobs to worker"
    print 
    processedJobs = pg.process_jobs(Jobs)
Example #13
def make_jobs():
    """
    creates a list of cBioJob objects,
    which carry all information needed
    for a function to be executed on SGE:
    - function object
    - arguments
    - settings
    """

    # set up list of arguments
    inputvec = [[3], [5], [10], [20]]

    # create empty job vector
    jobs=[]

    # create job objects
    for arg in inputvec:

        job = cBioJob(compute_factorial, arg) 
        job.mem="1gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "00:10:00"
        
        jobs.append(job)

    return jobs
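The docstring above spells out what a cBioJob carries, but the compute function itself and the dispatch call are not part of the snippet. Below is a minimal companion sketch, assuming compute_factorial is an ordinary importable function; the dispatch lines are shown as comments because the pygrid import alias is not given in this example, and they simply follow the pg.process_jobs pattern used throughout the other examples.

import math

def compute_factorial(n):
    # toy compute function wrapped by each cBioJob; it runs on the worker node
    return math.factorial(n)

# dispatch would follow the pattern used in the other examples:
#   jobs = make_jobs()
#   processedJobs = pg.process_jobs(jobs, local=True)  # local=True switches to local multithreading instead of cluster submission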
Example #14
def transcript_prediction_stringtie(yaml_config):
    """
    transcript prediction using StringTie
    """

    operation_seleted = "5"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid 

        arg = [[det["read_map_dir"], det["short_name"], det["read_assembly_dir"]]]

        job = pg.cBioJob(call_transcript_prediction_stringtie, arg) 
        
        cpus = 1 
        ## native specifications 
        job.mem="12gb"
        job.vmem="12gb"
        job.pmem="12gb"
        job.pvmem="12gb"
        job.nodes = 1
        job.ppn = cpus
        job.walltime = "24:00:00"

        Jobs.append(job)
    print("\nsending transcript assembly stringtie jobs to worker\n")

    local_compute = False ## switching between local multithreading and cluster computing
    
    processedJobs = pg.process_jobs(Jobs, local=local_compute)
Example #15
def alignment_filter(yaml_config):
    """
    run multimapper resolution program 
    """
    operation_seleted = "m"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():

        num_cpus = 5
        ## arguments to pygrid
        arg = [[det['short_name'], det['read_map_dir'], num_cpus]]

        job = pg.cBioJob(call_alignment_filter, arg)

        ## native specifications
        job.pmem = "90gb"
        job.pvmem = "90gb"
        job.mem = "90gb"
        job.vmem = "90gb"
        job.nodes = 1
        job.ppn = num_cpus
        job.walltime = "48:00:00"

        Jobs.append(job)
    print
    print "sending multi map resolution jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)
Example #16
def transcript_prediction_trsk(yaml_config):
    """
    transcript prediction using TranscriptSkimmer
    """
    operation_seleted = "4"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid 
        arg = [det]

        job = pg.cBioJob(call_transcript_prediction_trsk, arg) 

        ## native specifications 
        job.mem="32gb"
        job.vmem="32gb"
        job.pmem="32gb"
        job.pvmem="32gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "9:00:00"

        Jobs.append(job)
    print 
    print "sending transcript assembly trsk jobs to worker"
    print 

    local = True  ## cluster compute switch 
    processedJobs = pg.process_jobs(Jobs, local=local)
Example #17
def make_jobs():
    """
    creates a list of cBioJob objects,
    which carry all information needed
    for a function to be executed on SGE:
    - function object
    - arguments
    - settings
    """

    # set up list of arguments
    inputvec = [[3], [5], [10], [20]]

    # create empty job vector
    jobs = []

    # create job objects
    for arg in inputvec:

        job = cBioJob(compute_factorial, arg)
        job.mem = "1gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "00:10:00"

        jobs.append(job)

    return jobs
Example #18
def download_gtf(yaml_config):
    """
    download gtf/gff file from remote data publishing services
    """
    operation_seleted = "a"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        arg = [[det['release_nb'], det['long_name'], det['genome_dir']]]

        if det['release_db'] == 'ensembl_metazoa_genome':
            job = pg.cBioJob(call_metazoa_gtf, arg)
        elif det['release_db'] == 'phytozome_genome':
            job = pg.cBioJob(call_phytozome_gtf, arg)
        elif det['release_db'] == 'ensembl_genome':
            job = pg.cBioJob(call_ensembl_gtf, arg)
        elif det['release_db'] == 'ensembl_fungi_genome':
            job = pg.cBioJob(call_fungi_gtf, arg)
        elif det['release_db'] == 'ensembl_protists_genome':
            job = pg.cBioJob(call_protists_gtf, arg)
        else:
            exit(
                "error: download gtf plugin for %s not available, module works with ensembl_genome, ensembl_metazoa_genome and phytozome_genome servers."
                % det['release_db'])

        job.mem = "2gb"
        job.vmem = "2gb"
        job.pmem = "2gb"
        job.pvmem = "2gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "2:00:00"

        Jobs.append(job)
    print
    print "sending gtf download job to worker"
    print
    local_compute = True
    processedJobs = pg.process_jobs(Jobs, local=local_compute)
Example #19
def download_fasta(yaml_config):
    """
    download fasta file from remote data publishing services
    """
    operation_seleted = "g"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        arg = [[det["release_nb"], det["long_name"], det["genome_dir"]]]

        if det["release_db"] == "ensembl_metazoa_genome":
            job = pg.cBioJob(call_metazoa_fasta, arg)
        elif det["release_db"] == "phytozome_genome":
            job = pg.cBioJob(call_phytozome_fasta, arg)
        elif det["release_db"] == "ensembl_genome":
            job = pg.cBioJob(call_ensembl_fasta, arg)
        elif det["release_db"] == "ensembl_fungi_genome":
            job = pg.cBioJob(call_fungi_fasta, arg)
        else:
            exit(
                "error: download fasta plugin for %s not available, module works with ensembl_genome, ensembl_metazoa_genome and phytozome_genome servers."
                % det["release_db"]
            )

        job.mem = "2gb"
        job.vmem = "2gb"
        job.pmem = "2gb"
        job.pvmem = "2gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "2:00:00"

        Jobs.append(job)
    print
    print "sending fasta download job to worker"
    print
    local_compute = True
    processedJobs = pg.process_jobs(Jobs, local=local_compute)
Example #20
def filter_genes(yaml_config, data_method):
    """
    filter out invalid gene models from the provided genome annotation
    """
    operation_seleted = "f"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        if data_method == "cufflinks":
            gff_file = "%s/transcripts.gtf" % det["read_assembly_dir"]  ## cufflinks run output file
            outFile = "%s/%s_cufflinks_genes.gff" % (
                det["read_assembly_dir"],
                org_name,
            )  ## example: A_thaliana_cufflinks_genes.gff
        elif data_method == "trsk":
            gff_file = "%s/tmp_trsk_genes.gff" % det["read_assembly_dir"]  ## trsk run output file
            outFile = "%s/%s_trsk_genes.gff" % (
                det["read_assembly_dir"],
                org_name,
            )  ## example: A_thaliana_trsk_genes.gff
        else:
            gff_file = det["gtf"]  ## public database genome annotation file
            outFile = "%s/%s_%s.gff" % (
                det["read_assembly_dir"],
                org_name,
                det["genome_release_db"],
            )  ## example: A_thaliana_arabidopsis-tair10.gff

        ## arguments to pygrid
        arg = [[gff_file, det["fasta"], outFile]]
        job = pg.cBioJob(call_filter_genes, arg)

        ## native specifications
        job.mem = "6gb"
        job.vmem = "6gb"
        job.pmem = "6gb"
        job.pvmem = "6gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "2:00:00"

        Jobs.append(job)
    print
    print "sending filter gene models jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)
Example #21
def filter_genes(yaml_config, data_method):
    """
    filter out invalid gene models from the provided genome annotation
    """
    operation_seleted = "f"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        if data_method == "cufflinks":
            gff_file = "%s/transcripts.gtf" % det[
                'read_assembly_dir']  ## cufflinks run output file
            outFile = "%s/%s_cufflinks_genes.gff" % (
                det['read_assembly_dir'], org_name
            )  ## example: A_thaliana_cufflinks_genes.gff
        elif data_method == "trsk":
            gff_file = "%s/tmp_trsk_genes.gff" % det[
                'read_assembly_dir']  ## trsk run output file
            outFile = "%s/%s_trsk_genes.gff" % (
                det['read_assembly_dir'], org_name
            )  ## example: A_thaliana_trsk_genes.gff
        else:
            gff_file = det['gtf']  ## public database genome annotation file
            outFile = "%s/%s_%s.gff" % (
                det['read_assembly_dir'], org_name, det['genome_release_db']
            )  ## example: A_thaliana_arabidopsis-tair10.gff

        ## arguments to pygrid
        arg = [[gff_file, det['fasta'], outFile]]
        job = pg.cBioJob(call_filter_genes, arg)

        ## native specifications
        job.mem = "6gb"
        job.vmem = "6gb"
        job.pmem = "6gb"
        job.pvmem = "6gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "2:00:00"

        Jobs.append(job)
    print
    print "sending filter gene models jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)
Example #22
def main(yaml_config):
    """
    """

    config_map = yaml.safe_load(open(yaml_config, "rU"))
    exp_path_pfx = config_map['experiment_data_path']['dir']

    org_db = defaultdict()
    for ent in config_map['experiment']:
        species_name = ent['organism_name']

        genus, species = species_name.strip().split("_")
        short_name = "%s_%s" % (genus[0].upper(), species)

        org_db[short_name] = dict(short_name=short_name)
        org_db[short_name]['work_dir'] = "%s/%s/set_union_refix" % (
            exp_path_pfx, short_name)
        org_db[short_name]['data_dir'] = "%s/%s/set_2" % (exp_path_pfx,
                                                          short_name)

    ## prepare jobs
    Jobs = []
    for org_name, det in org_db.items():

        arg = [[org_name, det['work_dir'], det['data_dir']]]

        job = pg.cBioJob(distribute_model_train, arg)
        job.mem = "4gb"
        job.vmem = "4gb"
        job.pmem = "4gb"
        job.pvmem = "4gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "2:00:00"

        Jobs.append(job)

    compute_local = True
    print "sending jobs to worker"
    processedJobs = pg.process_jobs(Jobs, local=compute_local)
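main() reads only two top-level keys from the YAML file: experiment_data_path.dir and the experiment list, where each entry carries an organism_name in genus_species form. A minimal config sketch that satisfies those lookups is shown below; the directory path and organism names are placeholders, not values from the source.

import yaml

# hypothetical config sketch -- only the keys read by main() above are real,
# the directory path and organism names are placeholders
example_config = """
experiment_data_path:
    dir: /data/experiments
experiment:
    - organism_name: arabidopsis_thaliana
    - organism_name: oryza_sativa
"""

config_map = yaml.safe_load(example_config)
print(config_map['experiment_data_path']['dir'])      # /data/experiments
print(config_map['experiment'][0]['organism_name'])   # arabidopsis_thaliana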
Example #23
def main(yaml_config):
    """
    """

    config_map = yaml.safe_load(open(yaml_config, "rU"))
    exp_path_pfx = config_map["experiment_data_path"]["dir"]

    org_db = defaultdict()
    for ent in config_map["experiment"]:
        species_name = ent["organism_name"]

        genus, species = species_name.strip().split("_")
        short_name = "%s_%s" % (genus[0].upper(), species)

        org_db[short_name] = dict(short_name=short_name)
        org_db[short_name]["work_dir"] = "%s/%s/set_union_refix" % (exp_path_pfx, short_name)
        org_db[short_name]["data_dir"] = "%s/%s/set_2" % (exp_path_pfx, short_name)

    ## prepare jobs
    Jobs = []
    for org_name, det in org_db.items():

        arg = [[org_name, det["work_dir"], det["data_dir"]]]

        job = pg.cBioJob(distribute_model_train, arg)
        job.mem = "4gb"
        job.vmem = "4gb"
        job.pmem = "4gb"
        job.pvmem = "4gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "2:00:00"

        Jobs.append(job)

    compute_local = True
    print "sending jobs to worker"
    processedJobs = pg.process_jobs(Jobs, local=compute_local)
Example #24
def decompose_sra_file(yaml_config):
    """
    decompress the .sra file from ncbi sra
    """
    operation_seleted = "d"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = [] 
    for org_name, det in orgdb.items():
        sra_file = "%s/%s.sra"  % (det['fastq_path'], det['sra_run_id'])

        if not os.path.isfile(sra_file):## check the file present or not  
            print "error: missing sequencing read file %s" % sra_file
            sys.exit(0)
        
        ## TODO these values could be taken from the yaml config options
        library_type = "pe"
        compress_format = "gzip"

        ## arguments to pygrid 
        arg = [[sra_file, det['fastq_path']]]

        job = pg.cBioJob(call_decompose_sra_file, arg) 
    
        job.mem="6gb"
        job.vmem="6gb"
        job.pmem="6gb"
        job.pvmem="6gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "24:00:00"
        
        Jobs.append(job)
    print 
    print "sending decompress SRA file jobs to worker"
    print 
    processedJobs = pg.process_jobs(Jobs)
Example #25
def decompose_sra_file(yaml_config):
    """
    decompress the .sra file from ncbi sra
    """
    operation_seleted = "d"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        sra_file = "%s/%s.sra" % (det['fastq_path'], det['sra_run_id'])

        if not os.path.isfile(sra_file):  ## check the file present or not
            print "error: missing sequencing read file %s" % sra_file
            sys.exit(0)

        ## TODO these values could be taken from the yaml config options
        library_type = "pe"
        compress_format = "gzip"

        ## arguments to pygrid
        arg = [[sra_file, det['fastq_path']]]

        job = pg.cBioJob(call_decompose_sra_file, arg)

        job.mem = "6gb"
        job.vmem = "6gb"
        job.pmem = "6gb"
        job.pvmem = "6gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "24:00:00"

        Jobs.append(job)
    print
    print "sending decompress SRA file jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)
Example #26
def fetch_db_signals(yaml_config, data_method):
    """
    get the genomic signal labels based on the annotation from an external database
    """
    operation_seleted = "6"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        if data_method == "trsk":
            gff_file = "%s/%s_trsk_genes.gff" % (det['read_assembly_dir'],
                                                 org_name)
            out_dir = "%s/trsk_4K_labels" % det[
                'labels_dir']  ## new label sequence dir
        elif data_method == "cufflinks":
            gff_file = "%s/%s_cufflinks_genes.gff" % (det['read_assembly_dir'],
                                                      org_name)
            out_dir = "%s/cuff_4K_labels" % det['labels_dir']
        elif data_method == "onlinedb":
            gff_file = "%s/%s_%s.gff" % (det['read_assembly_dir'], org_name,
                                         det['genome_release_db'])  ## db_anno
            out_dir = "%s/jmlr_1K_sm_labels" % det['labels_dir']

        if not os.path.isfile(gff_file):  ## check the file present or not
            exit("error: genome annotation file missing %s" % gff_file)

        if not os.path.exists(out_dir):  ## create the new label sequence dir
            os.makedirs(out_dir)

        for the_file in os.listdir(out_dir):  ## cleaning the existing one
            file_path = os.path.join(out_dir, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception, e:
                print e

        #import subprocess
        ## get the label count for each organism, essentially the max number of genes available
        #cmd = "grep -P \"\tgene\t\" %s | wc -l" % gff_file
        #proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        #count, err = proc.communicate()
        #count = int(count.strip())

        ## depends on the genomic signal type
        count = 5000
        signal_type = "tss"
        poslabels_cnt = 1000
        neglabels_cnt = 3000
        flank_nts = 1200

        ## arguments to pygrid
        arg = [[
            det['fasta'], gff_file, signal_type, count, poslabels_cnt,
            neglabels_cnt, flank_nts, out_dir
        ]]
        job = pg.cBioJob(call_fetch_db_signals, arg)

        ## native specifications
        job.mem = "5gb"
        job.vmem = "5gb"
        job.pmem = "5gb"
        job.pvmem = "5gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "1:00:00"

        Jobs.append(job)
Example #27
def fetch_db_signals(yaml_config, data_method):
    """
    get the genomic signal labels based on the annotation from an external database
    """
    operation_seleted = "6"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        if data_method == "trsk":
            gff_file = "%s/%s_trsk_genes.gff" % (det['read_assembly_dir'], org_name)
            out_dir = "%s/trsk_4K_labels" % det['labels_dir']## new label sequence dir 
        elif data_method == "cufflinks":
            gff_file = "%s/%s_cufflinks_genes.gff" % (det['read_assembly_dir'], org_name)
            out_dir = "%s/cuff_4K_labels" % det['labels_dir']
        elif data_method == "onlinedb":
            gff_file = "%s/%s_%s.gff" % (det['read_assembly_dir'], org_name, det['genome_release_db']) ## db_anno 
            out_dir = "%s/jmlr_1K_sm_labels" % det['labels_dir']
        
        if not os.path.isfile(gff_file):## check the file present or not  
            exit("error: genome annotation file missing %s" % gff_file)
       
        if not os.path.exists(out_dir): ## create the new label sequence dir 
            os.makedirs(out_dir)

        for the_file in os.listdir(out_dir): ## cleaning the existing one 
            file_path = os.path.join(out_dir, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception, e:
                print e 
    
        #import subprocess 
        ## get the label count for each organism, essentially the max number of genes available
        #cmd = "grep -P \"\tgene\t\" %s | wc -l" % gff_file
        #proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        #count, err = proc.communicate() 
        #count = int(count.strip())
        
        ## depends on the genomic signal type 
        count = 5000
        signal_type = "tss"
        poslabels_cnt = 1000
        neglabels_cnt = 3000
        flank_nts = 1200 

        ## arguments to pygrid 
        arg = [[det['fasta'], gff_file, signal_type, count, poslabels_cnt, neglabels_cnt, flank_nts, out_dir]]
        job = pg.cBioJob(call_fetch_db_signals, arg) 

        ## native specifications 
        job.mem="5gb"
        job.vmem="5gb"
        job.pmem="5gb"
        job.pvmem="5gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "1:00:00"

        Jobs.append(job)