def alignment_filter(yaml_config):
    """
    run multimapper resolution program
    """
    operation_seleted = "m"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        num_cpus = 5
        ## arguments to pygrid
        arg = [[det['short_name'], det['read_map_dir'], num_cpus]]

        job = pg.cBioJob(call_alignment_filter, arg)

        ## native specifications
        job.pmem = "90gb"
        job.pvmem = "90gb"
        job.mem = "90gb"
        job.vmem = "90gb"
        job.nodes = 1
        job.ppn = num_cpus
        job.walltime = "48:00:00"

        Jobs.append(job)

    print
    print "sending multi map resolution jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)
def download_sra_data(yaml_config):
    """
    download sra file for the working organism
    """
    operation_seleted = "1"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        arg = [[det['sra_run_id'], det['fastq_path']]]

        job = pg.cBioJob(call_download_sra_file, arg)

        job.mem = "2gb"
        job.vmem = "2gb"
        job.pmem = "2gb"
        job.pvmem = "2gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "1:00:00"

        Jobs.append(job)

    print
    print "sending download SRA file jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)
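## `call_download_sra_file` is defined elsewhere in the pipeline; a minimal
## sketch of the worker, assuming the NCBI SRA Toolkit `prefetch` utility is
## on the PATH (the command is illustrative, not the project's actual
## download logic):
import subprocess

def call_download_sra_file(args_list):
    sra_run_id, fastq_path = args_list
    ## -O sets the download directory for the fetched .sra archive
    subprocess.check_call(["prefetch", sra_run_id, "-O", fastq_path])
    return "done"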
def align_rnaseq_reads(yaml_config):
    """
    wrapper for aligning rnaseq reads using STAR
    """
    operation_seleted = "3"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        lib_type = 'PE'
        lib_type = 'SE' if len(det['fastq']) == 1 else lib_type

        ## library insert size
        lib_insert_size = 100000

        num_cpu = 3
        arg = [[det, lib_type, lib_insert_size, num_cpu]]

        job = pg.cBioJob(call_align_reads, arg)

        job.mem = "90gb"
        job.vmem = "90gb"
        job.pmem = "30gb"
        job.pvmem = "30gb"
        job.nodes = 1
        job.ppn = num_cpu
        job.walltime = "48:00:00"

        Jobs.append(job)

    print
    print "sending read alignment with STAR jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs, local=False)
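## `call_align_reads` is defined elsewhere; a minimal sketch of the worker,
## assuming the STAR binary is on the PATH and gzip-compressed fastq input.
## The det keys come from the surrounding code; the STAR flags are
## illustrative, and lib_type/lib_insert_size are ignored in this sketch:
import subprocess

def call_align_reads(args_list):
    det, lib_type, lib_insert_size, num_cpu = args_list
    cmd = ["STAR",
           "--runThreadN", str(num_cpu),
           "--genomeDir", det['genome_index_dir'],
           "--readFilesIn"] + det['fastq'] + [    ## one file for SE, two for PE
           "--readFilesCommand", "zcat",          ## decompress .fastq.gz on the fly
           "--outSAMtype", "BAM", "SortedByCoordinate",
           "--outFileNamePrefix", "%s/" % det['read_map_dir']]
    subprocess.check_call(cmd)
    return "done"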
def create_genome_index(yaml_config):
    """
    wrapper for calling genome index function
    """
    operation_seleted = "2"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        num_cpus = 4
        arg = [[det['fasta'], det['genome_index_dir'], det['gtf'],
                num_cpus, det['read_length'] - 1]]

        job = pg.cBioJob(call_genome_index, arg)

        job.mem = "46gb"
        job.vmem = "46gb"
        job.pmem = "46gb"
        job.pvmem = "46gb"
        job.nodes = 1
        job.ppn = num_cpus
        job.walltime = "24:00:00"

        Jobs.append(job)

    print
    print "sending star genome index jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)
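## `call_genome_index` is defined elsewhere; note the fifth argument,
## read_length - 1, matches STAR's recommended --sjdbOverhang value. A minimal
## sketch of the worker, assuming the STAR binary is on the PATH
## (illustrative only):
import subprocess

def call_genome_index(args_list):
    fasta, genome_index_dir, gtf, num_cpus, sjdb_overhang = args_list
    cmd = ["STAR",
           "--runMode", "genomeGenerate",         ## index-building mode
           "--runThreadN", str(num_cpus),
           "--genomeDir", genome_index_dir,
           "--genomeFastaFiles", fasta,
           "--sjdbGTFfile", gtf,
           "--sjdbOverhang", str(sjdb_overhang)]  ## read_length - 1, per the STAR manual
    subprocess.check_call(cmd)
    return "done"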
def transcript_prediction_stringtie(yaml_config):
    """
    transcript prediction using StringTie
    """
    operation_seleted = "5"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        arg = [[det["read_map_dir"], det["short_name"], det["read_assembly_dir"]]]

        job = pg.cBioJob(call_transcript_prediction_stringtie, arg)

        cpus = 1
        ## native specifications
        job.mem = "12gb"
        job.vmem = "12gb"
        job.pmem = "12gb"
        job.pvmem = "12gb"
        job.nodes = 1
        job.ppn = cpus
        job.walltime = "24:00:00"

        Jobs.append(job)

    print("\nsending transcript assembly stringtie jobs to worker\n")
    local_compute = False  ## switching between local multithreading and cluster computing
    processedJobs = pg.process_jobs(Jobs, local=local_compute)
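## `call_transcript_prediction_stringtie` is defined elsewhere; a minimal
## sketch, assuming the stringtie binary is on the PATH and that STAR wrote
## its default sorted BAM name into read_map_dir (both are assumptions):
import subprocess

def call_transcript_prediction_stringtie(args_list):
    read_map_dir, short_name, read_assembly_dir = args_list
    bam_file = "%s/Aligned.sortedByCoord.out.bam" % read_map_dir  ## assumed STAR output name
    out_gtf = "%s/%s_stringtie_genes.gtf" % (read_assembly_dir, short_name)
    ## -o names the assembled gtf, -l prefixes the assembled transcript ids
    subprocess.check_call(["stringtie", bam_file, "-o", out_gtf, "-l", short_name])
    return "done"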
def transcript_prediction_cuff(yaml_config):
    """
    transcript prediction using cufflinks
    """
    operation_seleted = "c"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        arg = [[det, 4]]

        job = pg.cBioJob(call_transcript_prediction_cuff, arg)

        ## native specifications
        job.mem = "96gb"
        job.vmem = "96gb"
        job.pmem = "24gb"
        job.pvmem = "24gb"
        job.nodes = 1
        job.ppn = 4
        job.walltime = "32:00:00"

        Jobs.append(job)

    print
    print "sending transcript assembly cufflinks jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)
def transcript_prediction_trsk(yaml_config):
    """
    transcript prediction using TranscriptSkimmer
    """
    operation_seleted = "4"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        arg = [det]

        job = pg.cBioJob(call_transcript_prediction_trsk, arg)

        ## native specifications
        job.mem = "32gb"
        job.vmem = "32gb"
        job.pmem = "32gb"
        job.pvmem = "32gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "9:00:00"

        Jobs.append(job)

    print
    print "sending transcript assembly trsk jobs to worker"
    print
    local = True  ## True runs the jobs locally instead of on the cluster
    processedJobs = pg.process_jobs(Jobs, local=local)
def make_jobs():
    """
    creates a list of cBioJob objects, which carry all information needed
    for a function to be executed on SGE:
    - function object
    - arguments
    - settings
    """
    # set up list of arguments
    inputvec = [[3], [5], [10], [20]]

    # create empty job vector
    jobs = []

    # create job objects
    for arg in inputvec:
        job = cBioJob(compute_factorial, arg)
        job.mem = "1gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "00:10:00"
        jobs.append(job)

    return jobs
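## `make_jobs` references a `compute_factorial` worker whose body is not shown
## in this module; a minimal sketch of the worker plus a driver, assuming
## `process_jobs` is importable from pygrid alongside `cBioJob` and that a
## finished job exposes the worker's return value as `job.ret`:
import math

def compute_factorial(n):
    ## hypothetical worker body; any picklable function works as a job payload
    return math.factorial(n)

if __name__ == "__main__":
    processed = process_jobs(make_jobs())  ## blocks until all SGE jobs finish
    for job in processed:
        print job.ret  ## assumed attribute carrying the worker's result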
def download_gtf(yaml_config):
    """
    download gtf/gff file from remote data publishing services
    """
    operation_seleted = "a"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        arg = [[det['release_nb'], det['long_name'], det['genome_dir']]]

        if det['release_db'] == 'ensembl_metazoa_genome':
            job = pg.cBioJob(call_metazoa_gtf, arg)
        elif det['release_db'] == 'phytozome_genome':
            job = pg.cBioJob(call_phytozome_gtf, arg)
        elif det['release_db'] == 'ensembl_genome':
            job = pg.cBioJob(call_ensembl_gtf, arg)
        elif det['release_db'] == 'ensembl_fungi_genome':
            job = pg.cBioJob(call_fungi_gtf, arg)
        elif det['release_db'] == 'ensembl_protists_genome':
            job = pg.cBioJob(call_protists_gtf, arg)
        else:
            exit("error: download gtf plugin for %s not available, module works with "
                 "ensembl_genome, ensembl_metazoa_genome, ensembl_fungi_genome, "
                 "ensembl_protists_genome and phytozome_genome servers." % det['release_db'])

        job.mem = "2gb"
        job.vmem = "2gb"
        job.pmem = "2gb"
        job.pvmem = "2gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "2:00:00"

        Jobs.append(job)

    print
    print "sending gtf download job to worker"
    print
    local_compute = True
    processedJobs = pg.process_jobs(Jobs, local=local_compute)
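## None of the `call_*_gtf` workers are defined in this module; a minimal
## sketch of `call_ensembl_gtf`, assuming the standard Ensembl FTP layout and
## a `wget` binary on the PATH (the URL pattern and file glob are assumptions,
## not the project's actual download logic):
import subprocess

def call_ensembl_gtf(args_list):
    release_nb, long_name, genome_dir = args_list
    ## Ensembl publishes per-release annotation under
    ## ftp://ftp.ensembl.org/pub/release-<NB>/gtf/<species_name>/
    url = "ftp://ftp.ensembl.org/pub/release-%s/gtf/%s/*.gtf.gz" % (release_nb, long_name.lower())
    subprocess.check_call(["wget", "-q", "-P", genome_dir, url])  ## wget expands ftp globs
    return "done"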
def download_fasta(yaml_config):
    """
    download fasta file from remote data publishing services
    """
    operation_seleted = "g"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        arg = [[det["release_nb"], det["long_name"], det["genome_dir"]]]

        if det["release_db"] == "ensembl_metazoa_genome":
            job = pg.cBioJob(call_metazoa_fasta, arg)
        elif det["release_db"] == "phytozome_genome":
            job = pg.cBioJob(call_phytozome_fasta, arg)
        elif det["release_db"] == "ensembl_genome":
            job = pg.cBioJob(call_ensembl_fasta, arg)
        elif det["release_db"] == "ensembl_fungi_genome":
            job = pg.cBioJob(call_fungi_fasta, arg)
        else:
            exit("error: download fasta plugin for %s not available, module works with "
                 "ensembl_genome, ensembl_metazoa_genome, ensembl_fungi_genome and "
                 "phytozome_genome servers." % det["release_db"])

        job.mem = "2gb"
        job.vmem = "2gb"
        job.pmem = "2gb"
        job.pvmem = "2gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "2:00:00"

        Jobs.append(job)

    print
    print "sending fasta download job to worker"
    print
    local_compute = True
    processedJobs = pg.process_jobs(Jobs, local=local_compute)
def filter_genes(yaml_config, data_method):
    """
    filter out invalid gene models from the provided genome annotation
    """
    operation_seleted = "f"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        if data_method == "cufflinks":
            ## cufflinks run output file
            gff_file = "%s/transcripts.gtf" % det["read_assembly_dir"]
            ## example: A_thaliana_cufflinks_genes.gff
            outFile = "%s/%s_cufflinks_genes.gff" % (det["read_assembly_dir"], org_name)
        elif data_method == "trsk":
            ## trsk run output file
            gff_file = "%s/tmp_trsk_genes.gff" % det["read_assembly_dir"]
            ## example: A_thaliana_trsk_genes.gff
            outFile = "%s/%s_trsk_genes.gff" % (det["read_assembly_dir"], org_name)
        else:
            ## public database genome annotation file
            gff_file = det["gtf"]
            ## example: A_thaliana_arabidopsis-tair10.gff
            outFile = "%s/%s_%s.gff" % (det["read_assembly_dir"], org_name, det["genome_release_db"])

        ## arguments to pygrid
        arg = [[gff_file, det["fasta"], outFile]]

        job = pg.cBioJob(call_filter_genes, arg)

        ## native specifications
        job.mem = "6gb"
        job.vmem = "6gb"
        job.pmem = "6gb"
        job.pvmem = "6gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "2:00:00"

        Jobs.append(job)

    print
    print "sending filter gene models jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)
def main(yaml_config):
    """
    distribute model training jobs for each organism in the yaml config
    """
    config_map = yaml.safe_load(open(yaml_config, "rU"))
    exp_path_pfx = config_map['experiment_data_path']['dir']

    org_db = defaultdict()
    for ent in config_map['experiment']:
        species_name = ent['organism_name']
        genus, species = species_name.strip().split("_")
        short_name = "%s_%s" % (genus[0].upper(), species)

        org_db[short_name] = dict(short_name=short_name)
        org_db[short_name]['work_dir'] = "%s/%s/set_union_refix" % (exp_path_pfx, short_name)
        org_db[short_name]['data_dir'] = "%s/%s/set_2" % (exp_path_pfx, short_name)

    ## prepare jobs
    Jobs = []
    for org_name, det in org_db.items():
        arg = [[org_name, det['work_dir'], det['data_dir']]]

        job = pg.cBioJob(distribute_model_train, arg)

        job.mem = "4gb"
        job.vmem = "4gb"
        job.pmem = "4gb"
        job.pvmem = "4gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "2:00:00"

        Jobs.append(job)

    compute_local = True
    print "sending jobs to worker"
    processedJobs = pg.process_jobs(Jobs, local=compute_local)
def decompose_sra_file(yaml_config):
    """
    decompress the .sra file from ncbi sra
    """
    operation_seleted = "d"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        sra_file = "%s/%s.sra" % (det['fastq_path'], det['sra_run_id'])
        if not os.path.isfile(sra_file):  ## check the file present or not
            print "error: missing sequencing read file %s" % sra_file
            sys.exit(0)

        ## TODO: consider exposing these as options in the yaml file
        library_type = "pe"
        compress_format = "gzip"

        ## arguments to pygrid
        arg = [[sra_file, det['fastq_path']]]

        job = pg.cBioJob(call_decompose_sra_file, arg)

        job.mem = "6gb"
        job.vmem = "6gb"
        job.pmem = "6gb"
        job.pvmem = "6gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "24:00:00"

        Jobs.append(job)

    print
    print "sending decompress SRA file jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)
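## `call_decompose_sra_file` is defined elsewhere; a minimal sketch assuming
## the SRA Toolkit's fastq-dump is on the PATH, matching the paired-end gzip
## defaults hinted at by library_type and compress_format above:
import subprocess

def call_decompose_sra_file(args_list):
    sra_file, fastq_path = args_list
    ## --split-files writes _1/_2 mate files for paired-end runs,
    ## --gzip compresses the fastq output
    subprocess.check_call(["fastq-dump", "--split-files", "--gzip",
                           "--outdir", fastq_path, sra_file])
    return "done"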
def fetch_db_signals(yaml_config, data_method):
    """
    get the genomic signal labels based on the annotation from external database
    """
    operation_seleted = "6"
    orgdb = expdb.experiment_db(yaml_config, operation_seleted)

    Jobs = []
    for org_name, det in orgdb.items():
        if data_method == "trsk":
            gff_file = "%s/%s_trsk_genes.gff" % (det['read_assembly_dir'], org_name)
            out_dir = "%s/trsk_4K_labels" % det['labels_dir']  ## new label sequence dir
        elif data_method == "cufflinks":
            gff_file = "%s/%s_cufflinks_genes.gff" % (det['read_assembly_dir'], org_name)
            out_dir = "%s/cuff_4K_labels" % det['labels_dir']
        elif data_method == "onlinedb":
            gff_file = "%s/%s_%s.gff" % (det['read_assembly_dir'], org_name, det['genome_release_db'])  ## db_anno
            out_dir = "%s/jmlr_1K_sm_labels" % det['labels_dir']

        if not os.path.isfile(gff_file):  ## check the file present or not
            exit("error: genome annotation file missing %s" % gff_file)

        if not os.path.exists(out_dir):  ## create the new label sequence dir
            os.makedirs(out_dir)

        for the_file in os.listdir(out_dir):  ## cleaning the existing one
            file_path = os.path.join(out_dir, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception, e:
                print e

        #import subprocess
        ## get the label count for each organism, essentially the max number of genes available
        #cmd = "grep -P \"\tgene\t\" %s | wc -l" % gff_file
        #proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        #count, err = proc.communicate()
        #count = int(count.strip())

        ## depends on the genomic signal type
        count = 5000
        signal_type = "tss"
        poslabels_cnt = 1000
        neglabels_cnt = 3000
        flank_nts = 1200

        ## arguments to pygrid
        arg = [[det['fasta'], gff_file, signal_type, count,
                poslabels_cnt, neglabels_cnt, flank_nts, out_dir]]

        job = pg.cBioJob(call_fetch_db_signals, arg)

        ## native specifications
        job.mem = "5gb"
        job.vmem = "5gb"
        job.pmem = "5gb"
        job.pvmem = "5gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "1:00:00"

        Jobs.append(job)

    print
    print "sending label extraction jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)