def permute_taxon_blast(self, hits_num):
    print('permutation of viral blast:{}\t{}'.format(self.par['type'], hits_num))
    counts_df = pd.DataFrame()
    outfile = '{}{}.txt'.format(myIO.dir_os(self.par['dir_out']).create_dir(), hits_num)
    if os.path.isfile(outfile):
        print('Read file: ', outfile)
        counts_df = pd.read_csv(outfile, header=0, index_col=0, sep="\t", low_memory=False)
    else:
        #1: permuted peptides
        pep_names = list(self.par['binary_aln_df'].index)
        pep_df = myList.basic(pep_names).permute_list(self.par['permutation_times'], hits_num)
        #2: permutation based on the non-overlapped hits num
        for col, perm_pep in pep_df.items():
            #.loc replaces the deprecated .ix label indexer
            perm_zb = self.par['binary_aln_df'].loc[perm_pep]
            p_collapse_zb, p_sim_tag = myDataframe.basic(perm_zb).unispecie(self.par['sim_threshold'])
            counts_df[col] = p_collapse_zb.apply(sum, axis=0) + p_sim_tag
            #print(list(perm_tmp[col]))
        #export
        counts_df.to_csv(outfile, sep='\t', header=True, index_label=self.par['type'])
    #combine permuted counts
    #print(counts_df.shape)
    perm_mean = counts_df.apply(lambda x: np.mean(np.floor(x)), axis=1).round()
    #print(perm_mean)
    return perm_mean
def seek_fq(self, dir_raw_data):
    print('Retrieve all *.fastq files under', dir_raw_data)
    raw_files = []
    #get all files
    all_files = myIO.dir_os(dir_raw_data).recrusive_files()
    #find files ending with .fastq or .fq
    for af in all_files:
        m = re.search(r'fastq$|fq$', af)
        if m:
            #print('raw data:', af)
            raw_files.append(af)
    return raw_files
def phipseq_alignment(self, sample_name):
    print('\n######Analysis of {} will be triggered!#####'.format(sample_name))
    #initiate sample par
    sample_var = dict(self.par)
    sample_var['start_time'] = time.time()
    #sample name
    sample_var['sample_name'] = sample_name
    #sample directory
    sample_dir = self.par['sample_dirs'][sample_name]
    sample_var['sample_dir'] = myIO.dir_os(sample_dir).create_dir()
    print('\tSample directory: ', sample_var['sample_dir'])
    #raw data
    sample_var['sample_raw_files'] = ','.join(sample_var['sample_to_raw'][sample_name])
    print('\tRaw files: ', sample_var['sample_raw_files'])
    #export
    sample_var['file_head'] = sample_var['sample_dir'] + sample_name
    #default sam file
    sample_var['sample_sam_file'] = sample_var['file_head'] + '.sam'
    #file of read counts
    sample_var['sample_RC_file'] = sample_var['file_head'] + '_RC.txt'
    sample_var['sample_pro_sumRC_file'] = sample_var['file_head'] + '_pro_sumRC.txt'
    sample_var['sample_pro_maxRC_file'] = sample_var['file_head'] + '_pro_maxRC.txt'
    #file for saturation analysis
    sample_var['sample_saturation_file'] = sample_var['file_head'] + '_saturation.txt'
    #sample log
    sample_var['sample_log'] = sample_var['file_head'] + '.log'
    #sequence alignment
    if sample_var['phip_alignment'] == 'yes':
        print("\n###sequence alignment", sample_var['tool_aligner'])
        #output is a sam file
        if sample_var['tool_aligner'] == 'bowtie1':
            myAlign.alignment(sample_var).bowtie1_alignment()
    #count reads
    if sample_var['phip_counting'] == 'yes':
        #RC matrix by peptides
        myAlign.alignment(sample_var).count_reads()
        #RC matrix by proteins
        if 'file_annotation' in self.par.keys():
            self.combine_peptides(sample_var)
    #update sample log
    sample_times = mySystem.system().get_time(sample_var['start_time'])
    sample_times['sample_name'] = sample_name
    myIO.file_os(sample_var['sample_log'], '=').line_replace(sample_times)
def match_fasta(self):
    files = myIO.dir_os(self.par['dir_out']).incrusive_files()
    #select a fasta file
    fa_files = list(filter(lambda x: x.endswith(('.fa', '.fasta')), files))
    self.par['match_fa'] = mySystem.system().select_key(fa_files)
    #select a gtf or gff file
    gtf_files = list(filter(lambda x: x.endswith(('.gtf', '.gff3')), files))
    self.par['match_gtf'] = mySystem.system().select_key(gtf_files)
    #match the fasta file against the annotation file
    if self.par['web_site'] == 'ENSEML':
        myGenome.genome(self.par['match_fa']).match_ensembl_fa(self.par['match_gtf'])
    elif self.par['web_site'] == 'NCBI':
        myGenome.genome(self.par['match_fa']).match_ncbi_fa(self.par['match_gtf'])
def init_dir_file(self):
    self.par['dir_home'] = myIO.dir_os(self.par['dir_home']).create_dir()
    print('home directory of phip tool:', self.par['dir_home'])
    #dir_home = /home/yuan/phip/
    #alignment related
    self.par['dir_aligner'] = self.par['dir_home'] + 'bowtie1/'
    self.par['aligner_options'] = '{}bowtie {}'.format(self.par['dir_aligner'], self.par['aligner_options'])
    self.par['genome_index'] = self.par['dir_aligner'] + self.par['genome_index_name']
    self.par['dir_ref_seq'] = self.par['dir_home'] + 'ref_seq/'
    self.par['file_ref_fa'] = '{}{}.fa'.format(self.par['dir_ref_seq'], self.par['genome_index_name'])
    if 'file_annotation' in self.par.keys():
        self.par['file_annotation'] = self.par['dir_ref_seq'] + self.par['file_annotation']
    #judge whether the reference library is human or virus
    if 'VirScan' in self.par['genome_index_name']:
        self.par['lib'] = 'virus'
        self.par['file_NC'] = self.par['dir_ref_seq'] + 'virus_BeadsOnly.txt'
    elif 'human' in self.par['genome_index_name']:
        self.par['lib'] = 'human'
        self.par['file_NC'] = self.par['dir_ref_seq'] + 'human_BeadsOnly.txt'
    elif 'PublicEpitope' in self.par['genome_index_name']:
        self.par['lib'] = 'PE'
    elif 'LISH' in self.par['genome_index_name']:
        self.par['lib'] = 'LISH'
    #dir of raw data
    if 'dir_raw_data' not in self.par.keys():
        self.par['dir_raw_data'] = myIO.dir_os(self.par['dir_home'] + 'raw_data').create_dir()
    #results related
    if 'dir_result' not in self.par.keys():
        self.par['dir_result'] = myIO.dir_os(self.par['dir_home'] + 'result').create_dir()
        #print('Result directory', self.par['dir_result'])
    if 'dir_result_array' not in self.par.keys():
        self.par['dir_result_array'] = self.par['dir_result']
    #dir of statistics
    self.par['dir_stat'] = myIO.dir_os(self.par['dir_result'] + 'statistics').create_dir()
    self.par['dir_QC'] = myIO.dir_os(self.par['dir_stat'] + 'QC').create_dir()
    self.par['dir_enrichment'] = myIO.dir_os(self.par['dir_stat'] + 'enrichment').create_dir()
    #sample info
    self.par['file_sample_info'] = self.par['dir_result'] + 'sample_info.csv'
    self.par['dir_log'] = self.par['dir_result'] + 'sample_log/'
    self.par['file_log'] = self.par['dir_result'] + 'output.log'
    self.par['file_total_log'] = self.par['dir_result'] + 'Total.log'
    self.par['file_stat'] = self.par['dir_QC'] + 'statistics.csv'
    self.par['file_ref_txt'] = self.par['dir_result'] + 'references.txt'
    self.par['file_pro_pep'] = self.par['dir_result'] + 'protein_peptides.txt'
    #raw data related
    #print(self.par['dir_raw_data'])
    #self.par['RC_levels'] = ['lowRC'] #lowRC, midRC, highRC
    self.par['phip_levels'] = ['pep', 'promax', 'prosum']
    files_dict = {}
    for pl in self.par['phip_levels']:
        file_head = '{}{}_'.format(self.par['dir_stat'], pl)
        #raw reads
        files_dict[pl + '_RC'] = file_head + 'RC.txt'
        #normalized by total raw counts
        files_dict[pl + '_scalingRC'] = file_head + 'scalingRC.txt'
        files_dict[pl + '_scalingRC_prosum'] = file_head + 'scalingRC_prosum.txt'
        files_dict[pl + '_scalingRC_promax'] = file_head + 'scalingRC_promax.txt'
        #scalingRC against regressed median of phip sample and regressed sd of negative controls
        files_dict[pl + '_NCPHIPzscores'] = file_head + 'NCPHIPzscores.txt'
        files_dict[pl + '_NCPHIPzscores_prosum'] = file_head + 'NCPHIPzscores_prosum.txt'
        files_dict[pl + '_NCPHIPzscores_promax'] = file_head + 'NCPHIPzscores_promax.txt'
    self.par['files_dict'] = files_dict
    #default parameters
    self.par['specieZ_threshold'] = int(self.par['specieZ_threshold']) if 'specieZ_threshold' in self.par.keys() else 10
    self.par['align_score'] = float(self.par['align_score']) if 'align_score' in self.par.keys() else 80
    #p value cutoff for binomial testing
    self.par['p_threshold'] = float(self.par['p_threshold']) if 'p_threshold' in self.par.keys() else .001
    #x value is the observed-successes cutoff for the binomial test
    self.par['x_threshold'] = float(self.par['x_threshold']) if 'x_threshold' in self.par.keys() else 1
    self.par['sim_threshold'] = float(self.par['sim_threshold']) if 'sim_threshold' in self.par.keys() else 0.8
    self.par['zscore_threshold'] = int(self.par['zscore_threshold']) if 'zscore_threshold' in self.par.keys() else 10
    self.par['permutation_times'] = int(self.par['permutation_times']) if 'permutation_times' in self.par.keys() else 100
    self.par['threads_num'] = int(self.par['threads_num'])
    self.par['scaling_factor'] = int(self.par['scaling_factor']) if 'scaling_factor' in self.par.keys() else 1e6
    #print(self.par)
    myDict.basic(self.par).print_dict()
    #
    return self.par
def __init__(self, out_dir):
    self.out_dir = myIO.dir_os(out_dir).create_dir()
    self.url = 'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/'
def __init__(self, specie, out_dir):
    self.specie = specie
    self.out_dir = myIO.dir_os(out_dir + specie).create_dir()
    #initiate url list
    self.url_list()
def decompose_fq2(self, par):
    print('The split FASTQ files are stored in {}'.format(par['dir_raw_data']))
    #output directory
    out_dir = myIO.dir_os(par['dir_raw_data']).create_dir()
    #sequencing direction: R1 or R2
    direction = self.R1R2()
    #read the barcode-to-sample relationship from sample_file
    barcode_sample = myIO.file_os(par['barcode_file'], '\t').to_dict()
    #barcode_sample = {mySequence.sequence(k).revcom_DNA(): v for k, v in barcode_sample.items()}
    barcode_sample['unassigned'] = 'unassigned'
    #print(barcode_sample)
    #open file handles based on barcode_sample
    file_handle = {}
    barcode_file = {}
    known_dict = {}
    un_dict = {}
    for barcode, sample_name in barcode_sample.items():
        fq_file = '{}{}_{}.fq'.format(out_dir, sample_name, direction)
        file_handle[barcode] = open(fq_file, 'wt')
        barcode_file[barcode] = fq_file
        known_dict[barcode] = {'sample_name': sample_name, 'read_counts': 0}
    ###
    stdout_format = '|{:^15}|{:^15}|{:^15}|{:^15}|'
    dash_line = stdout_format.format('-' * 15, '-' * 15, '-' * 15, '-' * 15)
    print(dash_line)
    print(stdout_format.format('Raw reads', 'Assigned reads', 'Percentage', 'Trim reads'))
    print(stdout_format.format('millions', 'millions', '%', 'nt->nt'))
    print(dash_line)
    n = 0  #total number of reads
    m = 0  #total number of assigned reads
    #file handles
    #with open(self.biofile, 'rt') as F1, open(index_file, 'rt') as F2:
    F1 = self.readonly_handle(self.biofile)     #fastq_file
    F2 = self.readonly_handle(par['I1_file'])   #I1_file
    F3 = self.readonly_handle(par['I2_file'])   #I2_file
    with F1, F2, F3:
        #read 4 lines at a time per file
        for L1, La, Le, L2, Lb, Lf, L3, Lc, Lg, L4, Ld, Lh in itertools.zip_longest(*[F1, F2, F3] * 4):
            barcode = Lb.rstrip() + Lf.rstrip()
            rlen = len(L2) - 1
            tag = False
            #assign the record based on its barcode
            if barcode in file_handle and rlen >= par['seq_min']:
                L_name = re.sub(r'\/', '#' + barcode + '/', L1)
                #print(L_name, La)
                #trim reads from the 5'-end or 3'-end
                L2 = L2.rstrip()
                L4 = L4.rstrip()
                L2 = L2[par['seq_start']:par['seq_end']] + "\n"
                L4 = L4[par['seq_start']:par['seq_end']] + "\n"
                #write to the per-sample file handle
                file_handle[barcode].writelines([L_name, L2, L3, L4])
                #counting
                known_dict[barcode]['read_counts'] += 1
                m += 1
                tag = True
            else:
                #write to the unassigned file handle
                file_handle['unassigned'].writelines([L1, L2, L3, L4])
                un_dict[barcode] = un_dict[barcode] + 1 if barcode in un_dict else 1
                known_dict['unassigned']['read_counts'] += 1
            n += 1
            #periodic progress output
            if n >= 1e5 and n % 5e5 == 0:  #million
                perc = round(m * 100 / n, 2)
                flen = len(L2) - 1
                read_info = "{}-->{}".format(rlen, flen) if tag is True else "{}-->X".format(rlen)
                print(stdout_format.format(n / 1e6, m / 1e6, perc, read_info))
            #if n == 3e6: break
        else:
            print(dash_line)
            print(stdout_format.format(n / 1e6, m / 1e6, round(m * 100 / n, 2), '---'))
            print(dash_line)
    #calculate percentage
    for bc in known_dict.keys():
        RC = float(known_dict[bc]['read_counts'])
        known_dict[bc]['percentage_%'] = round(RC * 100 / n, 2)
    for b, F in file_handle.items():
        #close file handle
        F.close()
        #delete empty file
        if os.stat(barcode_file[b]).st_size == 0:
            os.remove(barcode_file[b])
    #export statistics
    myDict.basic(known_dict).dict2_to_file(out_dir + 'known.log', '\t')
    myDict.basic(un_dict).dict_to_file(out_dir + 'unknown.log', '\t')
def demultiplex_fq(self, par):
    #output directory
    out_dir = myIO.dir_os(par['dir_raw_data']).create_dir()
    #sequencing direction: R1 or R2
    direction = self.R1R2()
    #read the barcode-to-sample relationship from sample_file
    barcode_sample = myIO.file_os(par['barcode_file'], '\t').to_dict()
    #barcode_sample = {mySequence.sequence(k).revcom_DNA(): v for k, v in barcode_sample.items()}
    barcode_sample['unassigned'] = 'unassigned'
    #print(barcode_sample)
    #open file handles based on barcode_sample
    file_handle = {}
    barcode_file = {}
    known_dict = {}
    un_dict = {}
    for barcode, sample_name in barcode_sample.items():
        fq_file = '{}{}_{}.fq'.format(out_dir, sample_name, direction)
        file_handle[barcode] = open(fq_file, 'wt')
        barcode_file[barcode] = fq_file
        known_dict[barcode] = {'sample_name': sample_name, 'read_counts': 0}
    ###
    #file handles
    #with open(self.biofile, 'rt') as F1, open(index_file, 'rt') as F2:
    F1 = self.readonly_handle(self.biofile)
    F2 = self.readonly_handle(par['index_file'])
    n = 0  #total number of reads
    m = 0  #total number of assigned reads
    stdout_format = '|{:^15}|{:^15}|{:^15}|{:^15}|'
    dash_line = stdout_format.format('-' * 15, '-' * 15, '-' * 15, '-' * 15)
    print(dash_line)
    print(stdout_format.format('Raw reads', 'Assigned reads', 'Percentage', 'Read Length'))
    print(stdout_format.format('millions', 'millions', '%', 'nt'))
    print(dash_line)
    with F1, F2:
        #read 4 lines at a time per file
        for L1, La, L2, Lb, L3, Lc, L4, Ld in itertools.zip_longest(*[F1, F2] * 4):
            barcode = Lb.rstrip()
            #assign the record based on its barcode
            if barcode in file_handle and len(L2) >= par['seq_min']:
                L_name = re.sub(r'\/', '#' + barcode + '/', L1)
                #print(L_name, La)
                #trim reads from the 5'-end
                if par['seq_start'] > 0:
                    L2 = L2[par['seq_start']:]
                    L4 = L4[par['seq_start']:]
                #trim longer reads from the 3'-end
                if par['seq_end'] != 0:
                    L2 = L2.rstrip()
                    L4 = L4.rstrip()
                    L2 = L2[:par['seq_end']] + "\n"
                    L4 = L4[:par['seq_end']] + "\n"
                #write to the per-sample file handle
                file_handle[barcode].writelines([L_name, L2, L3, L4])
                #counting
                known_dict[barcode]['read_counts'] += 1
                m += 1
            else:
                #write to the unassigned file handle
                file_handle['unassigned'].writelines([L1, L2, L3, L4])
                un_dict[barcode] = un_dict[barcode] + 1 if barcode in un_dict else 1
                known_dict['unassigned']['read_counts'] += 1
            n += 1
            #periodic progress output
            if m >= 1e6 and m % 1e6 == 0:  #million
                print(stdout_format.format(n / 1e6, m / 1e6, round(m * 100 / n, 2), len(L2) - 1))
            #if n == 3e6: break
        else:
            print(dash_line)
            print(stdout_format.format(n / 1e6, m / 1e6, m * 100 / n, '---'))
            print(dash_line)
    #calculate percentage
    for bc in known_dict.keys():
        RC = float(known_dict[bc]['read_counts'])
        known_dict[bc]['percentage_%'] = round(RC * 100 / n, 2)
    for b, F in file_handle.items():
        #close file handle
        F.close()
        #delete empty file
        if os.stat(barcode_file[b]).st_size == 0:
            os.remove(barcode_file[b])
    #export statistics
    myDict.basic(known_dict).dict2_to_file(out_dir + 'known.log', '\t')
    myDict.basic(un_dict).dict_to_file(out_dir + 'unknown.log', '\t')
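#A plausible layout for the barcode file read above -- an assumption inferred from
#myIO.file_os(par['barcode_file'], '\t').to_dict(), which keys the mapping by barcode,
#not a documented format: one tab-separated "barcode<TAB>sample_name" pair per line, e.g.
#
#   ACGTACGT    sample_01
#   TTGCAGGA    sample_02
#
#Reads whose index sequence matches a listed barcode are written to
#<sample_name>_<R1|R2>.fq; all other reads go to the 'unassigned' file.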
#pass arguments
start, end = sys.argv[1].split('-')
par = {
    'specie_permutation': 'yes',
    'organism_permutation': 'yes',
    'threads_num': 24,
    'start': int(start),
    'end': int(end) + 1,
    'align_score': 80,
    'sim_threshold': 0.8,
    'dir_bin': dir_bin + '/',
    'dir_home': dir_home + '/',
    'permutation_times': 100
}
par['dir_permutation'] = myIO.dir_os(par['dir_home'] + 'permutation/').create_dir()

print('###permutation procedure\n\n')
pool = mpd.Pool(processes=par['threads_num'])

#permutation of organism alignment
if par['organism_permutation'] == 'yes':
    #read aln file
    file_aln = par['dir_home'] + 'ref_seq/organism_blast.txt'
    par['binary_aln_df'] = myDataframe.basic().aln_df(file_aln, par['align_score'])
    par['type'] = myIO.file_os(file_aln).name_prefix()
    par['dir_out'] = myIO.dir_os(par['dir_home'] + 'permutation/' + par['type']).create_dir()
    #
    for hits_num in range(par['start'], par['end']):
def par_command(argv):
    phip_libs = ['human', 'virus', 'PE', 'allergome', 'LISH']
    #initiate parameters
    par = {'fq_file': 'NA', 'barcode_file': 'NA', 'index_file': 'NA', 'I1_file': 'NA', 'I2_file': 'NA',
           'dir_raw_data': 'NA', 'dir_raw': 'NA', 'dir_in': 'NA', 'out': 'NA',
           'dir_result': 'NA', 'multiplexing_mode': 0, 'ref_libs': phip_libs[:2],
           'seq_start': 0, 'seq_end': 0, 'seq_min': 10, 'seq_max': 0}
    usage_out = 'Usage:\n' + argv[0] + ' [options] -o <raw data directory> ' + \
        '-f <fastq file> -i <index file> -b <barcode file>\n'
    try:
        opts, args = getopt.getopt(argv[1:], "hf:i:b:o:t:l:x:y:m:n:z:c:", ["help",
            "fastq_file", "index_file", "barcode_file", "dir_raw_data", "trim_len",
            'fixed_end5', 'dir_in', 'out', 'I1_file', 'I2_file', 'dir_raw', 'ref_library'])
    except getopt.GetoptError:
        print(usage_out)
        sys.exit(2)
    #get parameters
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(usage_out)
            #common usage: python Process_FASTQ.py -f * -i * -b * -o * -y *
            print("-h --help\tUsage information of this script.")
            print("-t --trim_len\tTrim sequences from the 5'-end or 3'-end of reads (Optional).")
            print("-f --fastq_file\tFastq file generated by a sequencing analyzer.")
            print("-i --index_file\tIndex file matched with the fastq file.")
            print("-b --barcode_file\tBarcode file matched with the index file.")
            print("-o --raw_data\tDirectory storing demultiplexed *.fastq files.")
            print("-y --out\tDirectory storing sample_info.csv and variables.txt.")
            print("-c --ref_library\tReference libraries can be one of {}, default is {}.".format(phip_libs, phip_libs[:2]))
            sys.exit(2)
        elif opt in ("-f", "--fastq_file"):
            par['fq_file'] = os.path.abspath(arg)
            par['multiplexing_mode'] += 1
        elif opt in ("-i", "--index_file"):
            par['index_file'] = os.path.abspath(arg)
            par['multiplexing_mode'] += 1
        elif opt in ("-b", "--barcode_file"):
            par['barcode_file'] = os.path.abspath(arg)
            par['multiplexing_mode'] += 1
        elif opt in ("-o", "--raw_data"):
            par['dir_raw_data'] = myIO.dir_os(os.path.abspath(arg)).create_dir()
        elif opt in ("-z", "--all_raw_data"):  #only for one or more sets of fastq splits
            par['dir_raw'] = myIO.dir_os(os.path.abspath(arg)).create_dir()
        elif opt in ('-x', "--dir_in"):
            par['dir_in'] = os.path.abspath(arg)
            par['fq_files'] = myParallel.samples({}).seek_fq(par['dir_in'])
        elif opt in ('-y', "--out"):
            par['out'] = arg
        elif opt in ("-l", "--fixed_len"):
            len_min, len_max = arg.split(':')
            par['seq_min'] = abs(int(len_min))
            par['seq_max'] = abs(int(len_max))
        elif opt in ("-t", "--trim_len"):
            trim_end5, trim_end3 = arg.split(':')
            par['seq_start'] = abs(int(trim_end5))
            par['seq_end'] = -abs(int(trim_end3))
        elif opt in ("-m", "--I1_file"):
            par['I1_file'] = os.path.abspath(arg)
        elif opt in ("-n", "--I2_file"):
            par['I2_file'] = os.path.abspath(arg)
        elif opt in ("-c", "--ref_library"):
            libs = arg.split(',')
            par['ref_libs'] = [x for x in libs if x in phip_libs]
    #
    if par['seq_max'] > 0:
        par['seq_end'] = par['seq_max']
    #
    myDict.basic(par).print_dict()
    return par
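#Example invocation -- a sketch with hypothetical file paths, using only the options
#defined above: one fastq plus an index and a barcode file, trimming 2 nt from the
#5'-end and 25 nt from the 3'-end (-t takes trim_end5:trim_end3), with the default
#human+virus reference libraries:
#   python Process_FASTQ.py -f run_R1.fastq -i run_I1.fastq -b barcodes.txt \
#       -o raw_data -y result -t 2:25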
if 'fq_files' in par.keys():
    for fq in par['fq_files']:
        myGenome.genome(fq).trim_fq(par['dir_raw_data'], par['seq_start'], par['seq_end'])

#generate sample_info file under the result dir
if par['out'] != 'NA':
    #current dir
    par['dir_bin'] = os.path.dirname(os.path.realpath(__file__)) + '/'
    par['dir_home'] = os.path.abspath(os.path.join(par['dir_bin'], os.pardir)) + '/'
    print('Home directory of phip pipeline: ', par['dir_home'])
    #libraries: default is human and virus
    for lib in par['ref_libs']:
        par['dir_result'] = myIO.dir_os(os.path.abspath(par['out'] + '_' + lib)).create_dir()
        if os.path.isdir(par['dir_result']):
            #1: sample_info.csv
            par['file_sample_info'] = par['dir_result'] + 'sample_info.csv'
            print('The sample information file: ', par['file_sample_info'])
            #read sample_info.csv
            myParallel.samples(par).export_sample_info()
            #2: copy template variables.txt into the lib folder
            template_file = '{}variables_{}.txt'.format(par['dir_bin'], lib)
            var_file = '{}variables.txt'.format(par['dir_result'])
            print('Save {} and then update it.'.format(var_file))
            shutil.copy(template_file, var_file)
            #update parameters of variables.txt
            refresh = {
                'dir_home': par['dir_home'],
def par_command(argv):
    phip_libs = ['human', 'virus', 'allergome', 'provirome', 'toxome',
                 'mouse', 'PE', 'zika', 'arbo', 'LISH']
    #initiate parameters
    na_str = 'fq_file,barcode_file,index_file,I1_file,I2_file,dir_raw_data,dir_in,out,dir_result'
    par = dict([(key, 'NA') for key in na_str.split(',')])
    par.update({'ref_libs': phip_libs[:2], 'seq_start': 0, 'seq_end': None, 'seq_min': 10})
    usage_out = 'Usage:\n' + argv[0] + ' [options] -o <raw data directory> ' + \
        '-f <fastq file> -i <index file> -b <barcode file>\n'
    try:
        opts, args = getopt.getopt(argv[1:], "hf:i:b:o:t:r:l:x:y:m:n:c:", ["help",
            "fastq_file", "index_file", "barcode_file", "dir_raw_data", "trim_5end", 'len_trim',
            'fixed_end5', 'dir_in', 'out', 'I1_file', 'I2_file', 'ref_library'])
    except getopt.GetoptError:
        print(usage_out)
        sys.exit(2)
    #get parameters
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(usage_out)
            #common usage: python Process_FASTQ.py -f * -i * -b * -o * -y *
            print("-h --help\tUsage information of this script.")
            print("-t --trim_len\tTrim sequences from the 5'-end or 3'-end of reads (Optional).")
            print("-f --fastq_file\tFastq file generated by a sequencing analyzer.")
            print("-i --index_file\tIndex file matched with the fastq file.")
            print("-b --barcode_file\tBarcode file matched with the index file.")
            print("-o --raw_data\tDirectory storing demultiplexed *.fastq files.")
            print("-y --out\tDirectory storing sample_info.csv and variables.txt.")
            print("-c --ref_library\tReference libraries can be any of {}, default is {}.".format(phip_libs, phip_libs[:2]))
            sys.exit(2)
        elif opt in ("-f", "--fastq_file"):
            par['fq_file'] = os.path.abspath(arg)
        elif opt in ("-i", "--index_file"):
            par['index_file'] = os.path.abspath(arg)
        elif opt in ("-b", "--barcode_file"):
            par['barcode_file'] = os.path.abspath(arg)
        elif opt in ("-o", "--raw_data"):
            par['dir_raw_data'] = myIO.dir_os(os.path.abspath(arg)).create_dir()
        elif opt in ('-x', "--dir_in"):
            par['dir_in'] = os.path.abspath(arg)
            par['fq_files'] = myParallel.samples({}).seek_fq(par['dir_in'])
        elif opt in ('-y', "--out"):
            par['out'] = arg
        elif opt in ("-l", "--min_len"):  #discard shorter reads due to poor sequencing
            par['seq_min'] = abs(int(arg))
        elif opt in ("-t", "--trim_5end"):  #trim_end5: number of nt trimmed from the 5'-end
            par['seq_start'] = abs(int(arg))
        elif opt in ("-r", "--fixed_len"):  #len_trim: read length kept after trimming the 5'-end and 3'-end
            par['seq_len'] = abs(int(arg))
            par['seq_end'] = par['seq_start'] + par['seq_len']
        elif opt in ("-m", "--I1_file"):
            par['I1_file'] = os.path.abspath(arg)
        elif opt in ("-n", "--I2_file"):
            par['I2_file'] = os.path.abspath(arg)
        elif opt in ("-c", "--ref_library"):
            libs = arg.split(',')
            par['ref_libs'] = [x for x in libs if x in phip_libs]
    #
    myDict.basic(par).print_dict()
    return par
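#Example invocation for this version of the parser -- a sketch with hypothetical
#paths; here -t takes a single 5'-trim length and -r the fixed read length kept
#after trimming, per the handlers above:
#   python Process_FASTQ.py -f run_R1.fastq -i run_I1.fastq -b barcodes.txt \
#       -o raw_data -y result -t 2 -r 50 -c human,virus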
#download the idmapping file
local_file = myDownload.uniprot(par['dir_out']).download_idmapping()

#################################################################################
if __name__ == "__main__":
    #initiate dictionary saving parameters
    par = {'in_out': 'Continue'}
    annot = download_annot(par)
    ########################################
    #
    #2: download dir
    par['dir_home'] = myIO.dir_os('/home/yuan/data_preparation/').stdin_dir(
        'Enter the directory path storing downloaded files')
    print(par['dir_home'])
    while par['in_out'] == 'Continue':
        #2: select ftp or web site
        web_sites = ['NCBI', 'ENSEML', 'UniProt']
        par['web_site'] = mySystem.system().select_key(web_sites, 'Select public database')
        par['dir_out'] = par['dir_home'] + par['web_site'] + '/'
        #1: select file types
        if par['web_site'] in ['NCBI', 'ENSEML']:
            operations = ['Genome annotation', 'match fasta and gtf']
            par['operations'] = mySystem.system().select_key(operations, 'What is your operation')
        elif par['web_site'] == 'UniProt':
            par['operations'] = 'UniProt idmapping'