def init_analysis(self):
    """Populate ``self.par`` with everything the analysis needs up front.

    Steps, in order:

    1. When ``self.par`` has a 'file_annotation' entry, load the annotation
       table ('annot_df') and the protein-peptide associations
       ('dict_pro_pep'). When that entry contains the text 'VirScan', also
       store the dependent peptides ('dependent_pep').
    2. Check for, or build, the bowtie index.
    3. Replace ``self.par`` with the result of ``export_sample_info()`` and,
       when group1 has an 'NC' entry, split the samples into negative
       controls ('NC_samples') and the rest ('phip_samples').
    4. Read the reference FASTA named by 'file_ref_fa' into 'ref_dict'.

    Returns:
        None. All results are stored in ``self.par``.
    """
    #1: read annotation file
    if 'file_annotation' in self.par:
        self.par['annot_df'] = myDataframe.basic().annot_df(
            self.par['file_annotation'])
        #genome annotation: associations of protein-peptides
        self.par['dict_pro_pep'] = myCommon.basic(self.par).protein_peptides()
        #virus only
        # NOTE(review): presumably 'file_annotation' is a path string, so this
        # is a substring test on the file name -- confirm.
        if 'VirScan' in self.par['file_annotation']:
            #extract aa stretch
            #get dependent peptides: two peptides that share at least 7-aa.
            self.par['dependent_pep'] = myCommon.basic(
                self.par).taxon_dependent_peptides()

    #2: check bowtie or build bowtie index
    myAlign.alignment(self.par).build_bowtie_index()

    #3: sample info
    # export_sample_info() returns the parameter container, which replaces
    # self.par from here on.
    self.par = myParallel.samples(self.par).export_sample_info()
    #samples of negative controls
    group1 = self.par['group1']
    if 'NC' in group1:
        nc_samples = group1['NC'].split(',')
        phip_samples = list(set(self.par['sample_names']) - set(nc_samples))
        self.par['NC_samples'] = nc_samples
        self.par['phip_samples'] = phip_samples
        print('\nNumber of negative Controls (Beads only): ', len(nc_samples))
        # Report the non-control samples; the previous code printed the size
        # of 'sample_names', which still includes the negative controls.
        print('Number of PhIP samples: ', len(phip_samples))
    #myDict.basic(self.par['sample_dirs']).print_dict()

    #read reference sequence file (*.fa)
    # read_fa() returns a pair; only the first item is kept.
    ref_dict, _ = myGenome.genome(self.par['file_ref_fa']).read_fa()
    self.par['ref_dict'] = ref_dict
#current dir par['dir_bin'] = os.path.dirname(os.path.realpath(__file__)) + '/' par['dir_home'] = os.path.abspath( os.path.join(par['dir_bin'], os.pardir)) + '/' print('Home directory of phip pipsline: ', par['dir_home']) #libraries. default is human and virus for lib in par['ref_libs']: par['dir_result'] = myIO.dir_os( os.path.abspath(par['out'] + '_' + lib)).create_dir() if os.path.isdir(par['dir_result']): #1: sample_info.csv par['file_sample_info'] = par['dir_result'] + 'sample_info.csv' print('The sample information file: ', par['file_sample_info']) #read sample_info.csv myParallel.samples(par).export_sample_info() #2: copy template variables.txt into lib folder template_file = '{}variables_{}.txt'.format( par['dir_bin'], lib) var_file = '{}variables.txt'.format(par['dir_result']) print('Save {} and then update it.'.format(var_file)) shutil.copy(template_file, var_file) #update parameters of variables.txt refresh = { 'dir_home': par['dir_home'], 'dir_result': par['dir_result'] } refresh['dir_raw_data'] = par['dir_raw_data'] if par[ 'dir_raw'] == 'NA' else par['dir_raw'] myIO.file_os(var_file, '=').line_replace(refresh) else:
def par_command(argv):
    """Parse the command line into the parameter dictionary.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name (used in
            the usage text) and ``argv[1:]`` are the options to parse.

    Returns:
        dict with these keys:
          - 'fq_file', 'barcode_file', 'index_file', 'I1_file', 'I2_file':
            absolute paths, or 'NA' when the option was not given.
          - 'dir_raw_data', 'dir_raw': values returned by
            ``myIO.dir_os(...).create_dir()``, or 'NA'.
          - 'dir_in', 'out', 'dir_result': 'NA' unless set; 'fq_files' is
            added only when -x is given.
          - 'multiplexing_mode': how many of -f/-i/-b were supplied.
          - 'ref_libs': requested libraries filtered to the known ones
            (default: the first two known libraries).
          - 'seq_start', 'seq_end', 'seq_min', 'seq_max': integers from
            -t ``<5end>:<3end>`` and -l ``<min>:<max>``; a positive
            'seq_max' overrides 'seq_end'.

    Raises:
        SystemExit: status 2 on an unknown option or after printing help.
        ValueError: when -l/-t is not of the form ``<int>:<int>``.
    """
    phip_libs = ['human', 'virus', 'PE', 'allergome', 'LISH']
    #initiate parameters
    par = {
        'fq_file': 'NA', 'barcode_file': 'NA', 'index_file': 'NA',
        'I1_file': 'NA', 'I2_file': 'NA',
        'dir_raw_data': 'NA', 'dir_raw': 'NA', 'dir_in': 'NA', 'out': 'NA',
        'dir_result': 'NA', 'multiplexing_mode': 0,
        'ref_libs': phip_libs[:2],
        'seq_start': 0, 'seq_end': 0, 'seq_min': 10, 'seq_max': 0,
    }
    usage_out = 'Usage:\n' + argv[0] + ' [options] -o <raw data directory> ' + \
        '-f <fastq file> -i <index file> -b <barcode file>\n'

    # Long options that take a value must end with '=' for getopt; without it
    # '--fastq_file reads.fq' parses with an empty argument. Both the names
    # shown in the help text and the previously declared names are accepted.
    long_opts = [
        'help',
        'fastq_file=', 'index_file=', 'barcode_file=',
        'raw_data=', 'dir_raw_data=',
        'all_raw_data=', 'dir_raw=',
        'dir_in=', 'out=',
        'fixed_len=', 'trim_len=',
        'I1_file=', 'I2_file=', 'ref_library=',
        # Declared before but never handled; kept (still without a value) so
        # existing command lines that pass it continue to parse.
        'fixed_end5',
    ]
    try:
        opts, _ = getopt.getopt(argv[1:], "hf:i:b:o:t:l:x:y:m:n:z:c:",
                                long_opts)
    except getopt.GetoptError:
        print(usage_out)
        sys.exit(2)

    #get parameters
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(usage_out)
            #common usage
            # python Process_FASTQ.py -f * -i * -b * -o * -y *"
            print("-h --help\tUsage information of this script.")
            print(
                "-t --trim_len\tTrim sequences from the 5'-end or 3'-end of reads (Optional)"
            )
            print(
                "-f --fastq_file\tFastq file determined by a sequencing analyzer."
            )
            print("-i --index_file\tIndex file matched with the fastq file.")
            print(
                "-b --barcode_file\tBarcode file matched with the index file.")
            print(
                "-o --raw_data\tDirectory storing demulitplexed *fastq files.")
            print(
                "-y --out\tDirectory storing sample_info.csv and variables.txt."
            )
            print(
                "-c --ref_library\tReference libraries can be one of {}, default is {}."
                .format(phip_libs, phip_libs[:2]))
            sys.exit(2)
        elif opt in ("-f", "--fastq_file"):
            par['fq_file'] = os.path.abspath(arg)
            par['multiplexing_mode'] += 1
        elif opt in ("-i", "--index_file"):
            par['index_file'] = os.path.abspath(arg)
            par['multiplexing_mode'] += 1
        elif opt in ("-b", "--barcode_file"):
            par['barcode_file'] = os.path.abspath(arg)
            par['multiplexing_mode'] += 1
        elif opt in ("-o", "--raw_data", "--dir_raw_data"):
            par['dir_raw_data'] = myIO.dir_os(
                os.path.abspath(arg)).create_dir()
        elif opt in ("-z", "--all_raw_data", "--dir_raw"):
            # only for one more sets of fastq splits
            par['dir_raw'] = myIO.dir_os(os.path.abspath(arg)).create_dir()
        elif opt in ('-x', "--dir_in"):
            par['dir_in'] = os.path.abspath(arg)
            par['fq_files'] = myParallel.samples({}).seek_fq(par['dir_in'])
        elif opt in ('-y', "--out"):
            par['out'] = arg
        elif opt in ("-l", "--fixed_len"):
            len_min, len_max = arg.split(':')
            par['seq_min'] = abs(int(len_min))
            par['seq_max'] = abs(int(len_max))
        elif opt in ("-t", "--trim_len"):
            trim_end5, trim_end3 = arg.split(':')
            par['seq_start'] = abs(int(trim_end5))
            # stored as a non-positive value, whatever sign was typed
            par['seq_end'] = -abs(int(trim_end3))
        elif opt in ("-m", "--I1_file"):
            par['I1_file'] = os.path.abspath(arg)
        elif opt in ("-n", "--I2_file"):
            par['I2_file'] = os.path.abspath(arg)
        elif opt in ("-c", "--ref_library"):
            # names outside phip_libs are dropped; input order is preserved
            par['ref_libs'] = [x for x in arg.split(',') if x in phip_libs]

    # a positive maximum length takes precedence over the 3'-end trim
    if par['seq_max'] > 0:
        par['seq_end'] = par['seq_max']
    # myDict.basic(par).print_dict()
    return par
def par_command(argv):
    """Parse the command line into the parameter dictionary.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name (used in
            the usage text) and ``argv[1:]`` are the options to parse.

    Returns:
        dict with these keys:
          - 'fq_file', 'barcode_file', 'index_file', 'I1_file', 'I2_file':
            absolute paths, or 'NA' when the option was not given.
          - 'dir_raw_data': value returned by
            ``myIO.dir_os(...).create_dir()``, or 'NA'.
          - 'dir_in', 'out', 'dir_result': 'NA' unless set; 'fq_files' is
            added only when -x is given.
          - 'ref_libs': requested libraries filtered to the known ones
            (default: the first two known libraries).
          - 'seq_min': minimum read length from -l (default 10).
          - 'seq_start': number of nt trimmed from the 5'-end, from -t.
          - 'seq_len': present only when -r is given.
          - 'seq_end': ``seq_start + seq_len`` when -r is given, else None.

    Raises:
        SystemExit: status 2 on an unknown option or after printing help.
        ValueError: when -l/-t/-r is not an integer.
    """
    phip_libs = ['human', 'virus', 'allergome', 'provirome', 'toxome',
                 'mouse', 'PE', 'zika', 'arbo', 'LISH']
    #initiate parameter
    na_str = 'fq_file,barcode_file,index_file,I1_file,I2_file,dir_raw_data,dir_in,out,dir_result'
    par = dict.fromkeys(na_str.split(','), 'NA')
    par.update({'ref_libs': phip_libs[:2], 'seq_start': 0, 'seq_end': None,
                'seq_min': 10})
    usage_out = 'Usage:\n' + argv[0] + ' [options] -o <raw data directory> ' + \
        '-f <fastq file> -i <index file> -b <barcode file>\n'

    # Long options that take a value must end with '=' for getopt; without it
    # '--fastq_file reads.fq' parses with an empty argument. Both the names
    # the handlers test for and the previously declared names are accepted.
    long_opts = [
        'help',
        'fastq_file=', 'index_file=', 'barcode_file=',
        'raw_data=', 'dir_raw_data=',
        'dir_in=', 'out=',
        'min_len=', 'trim_5end=',
        'fixed_len=', 'len_trim=',
        'I1_file=', 'I2_file=', 'ref_library=',
        # Declared before but never handled; kept (still without a value) so
        # existing command lines that pass it continue to parse.
        'fixed_end5',
    ]
    try:
        opts, _ = getopt.getopt(argv[1:], "hf:i:b:o:t:r:l:x:y:m:n:c:",
                                long_opts)
    except getopt.GetoptError:
        print(usage_out)
        sys.exit(2)

    #get parameters
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(usage_out)
            #common usage
            # python Process_FASTQ.py -f * -i * -b * -o * -y *"
            print("-h --help\tUsage information of this script.")
            print("-t --trim_len\tTrim sequences from the 5'-end or 3'-end of reads (Optional)")
            print("-f --fastq_file\tFastq file determined by a sequencing analyzer.")
            print("-i --index_file\tIndex file matched with the fastq file.")
            print("-b --barcode_file\tBarcode file matched with the index file.")
            print("-o --raw_data\tDirectory storing demulitplexed *fastq files.")
            print("-y --out\tDirectory storing sample_info.csv and variables.txt.")
            print("-c --ref_library\tReference libraries can be any of {}, default is {}.".format(phip_libs, phip_libs[:2]))
            sys.exit(2)
        elif opt in ("-f", "--fastq_file"):
            par['fq_file'] = os.path.abspath(arg)
        elif opt in ("-i", "--index_file"):
            par['index_file'] = os.path.abspath(arg)
        elif opt in ("-b", "--barcode_file"):
            par['barcode_file'] = os.path.abspath(arg)
        elif opt in ("-o", "--raw_data", "--dir_raw_data"):
            par['dir_raw_data'] = myIO.dir_os(os.path.abspath(arg)).create_dir()
        elif opt in ('-x', "--dir_in"):
            par['dir_in'] = os.path.abspath(arg)
            par['fq_files'] = myParallel.samples({}).seek_fq(par['dir_in'])
        elif opt in ('-y', "--out"):
            par['out'] = arg
        elif opt in ("-l", "--min_len"):
            # discard shorter reads due to poor sequencing
            par['seq_min'] = abs(int(arg))
        elif opt in ("-t", "--trim_5end"):
            #trim_end5: length of nt from the 5-end
            par['seq_start'] = abs(int(arg))
        elif opt in ("-r", "--fixed_len", "--len_trim"):
            #len_trim: length of nt after trimming 5-end and 3-end
            par['seq_len'] = abs(int(arg))
        elif opt in ("-m", "--I1_file"):
            par['I1_file'] = os.path.abspath(arg)
        elif opt in ("-n", "--I2_file"):
            par['I2_file'] = os.path.abspath(arg)
        elif opt in ("-c", "--ref_library"):
            # names outside phip_libs are dropped; input order is preserved
            par['ref_libs'] = [x for x in arg.split(',') if x in phip_libs]

    # Derive the end position only after every option has been read, so that
    # '-r 50 -t 3' and '-t 3 -r 50' give the same window.
    if 'seq_len' in par:
        par['seq_end'] = par['seq_start'] + par['seq_len']
    # myDict.basic(par).print_dict()
    return par