def run_gglab_pipeline(input_files, species, loci, group_name=''):
    """Annotate raw FASTQ files with igfft and return the annotated paths.

    Parameters
    ----------
    input_files : list of str
        Paths to raw FASTQ (optionally .gz) files.
    species, loci
        Forwarded to igfft.igfft_multiprocess.
    group_name : str
        Unused in this variant; kept for interface parity with the other
        pipeline variants in this file.

    Returns
    -------
    str
        Comma-joined list of annotated file paths.
    """
    print('Processing raw fastq files')
    # BUG FIX: list was created as `processed_files` but appended to as
    # `annotated_files`, raising NameError on the first iteration.
    annotated_files = []
    for f in input_files:  # enumerate index was unused; iterate directly
        if f.endswith('.gz'):
            print('Unzipping: ', f)
            f = useful.gunzip_python(f)
        # NOTE(review): isotyping_barcodes / remove_insertions /
        # number_threads are assumed to be module-level settings — confirm.
        annotated_f = igfft.igfft_multiprocess(
            f, file_type='FASTQ', species=species, locus=loci,
            parsing_settings={'isotype': isotyping_barcodes,
                              'remove_insertions': remove_insertions},
            num_processes=number_threads, delete_alignment_file=True)
        annotated_files.append(annotated_f[0])
    output_file_list = ','.join(annotated_files)
    print(output_file_list)  # FIX: was a Python 2 print statement
    return output_file_list
def run_gglab_pipeline(input_files, species, loci, group_name=''):
    """Single-end pipeline: trim, quality-filter, annotate with igfft, pair.

    Parameters
    ----------
    input_files : list of str
        Raw FASTQ (optionally gzipped) file paths.
    species, loci
        Forwarded to igfft.igfft_multiprocess.
    group_name : str
        Prefix for the paired output files.

    NOTE(review): window_trim, quality_cutoff_trim, min_read_len_post_trim,
    phred_encode, quality_cutoff, percent_bases, number_threads,
    isotyping_barcodes, remove_insertions, cluster_setting and
    annotation_cluster_cutoff are assumed to be module-level settings.
    """
    print('Processing raw fastq files')
    processed_files = []
    for f in input_files:  # enumerate index was unused; iterate directly
        folder_path = os.path.dirname(f)
        if f.endswith('.gz'):
            print('Unzipping: ', f)
            f = useful.gunzip_python(f)
        # Trim low-quality ends with trimmomatic (sliding-window filter)
        trimming_parameters = {
            'SLIDINGWINDOW': str(window_trim) + ':' + str(quality_cutoff_trim),
            'MINLEN': min_read_len_post_trim
        }
        trimmedf = processing.run_trimmomatic(f, folder_path, 'SE',
                                              phred_encode,
                                              trimming_parameters)[0]
        # Quality-filter the trimmed reads, then drop the intermediate file
        filtered_trimmed_file = fastx.Run_Quality_Filter(
            trimmedf, output_dir=folder_path, quality=quality_cutoff,
            percent=percent_bases)
        os.remove(trimmedf)
        processed_files.append(filtered_trimmed_file)
    print('Annotating processed fastq files')
    annotated_files = []
    for f in processed_files:
        annotated_f = igfft.igfft_multiprocess(
            f, file_type='FASTQ', species=species, locus=loci,
            parsing_settings={'isotype': isotyping_barcodes,
                              'remove_insertions': remove_insertions},
            num_processes=number_threads, delete_alignment_file=True)
        annotated_files.append(annotated_f[0])
    print('Pairing sequences')
    output_dir = os.path.dirname(annotated_files[0])
    pairing.RunPairing(annotated_files, annotated_file_formats='TAB',
                       analysis_method='GEORGIOU_INHOUSE',
                       output_folder_path=output_dir,
                       prefix_output_files=group_name,
                       cluster_cutoff=cluster_setting,
                       annotation_cluster_setting=annotation_cluster_cutoff)
    print('Pipeline complete')
def Run_Quality_Filter(files, output_dir, quality, percent, encoding='-Q33'):
    """Run fastx's fastq_quality_filter over one or more FASTQ files.

    Gzipped inputs are unzipped first; all inputs are concatenated (cat)
    and piped into the filter binary.

    Parameters
    ----------
    files : str or list of str
        FASTQ (optionally .gz) paths.
    output_dir : str
        Directory for the filtered output file.
    quality, percent : int
        -q / -p arguments of fastq_quality_filter.
    encoding : str, default '-Q33'
        Quality-encoding flag passed through to the filter.

    Returns
    -------
    str : path of the filtered FASTQ file.
    """
    if not isinstance(files, list):  # FIX: idiomatic type check
        files = [files]
    for i, each_file in enumerate(files):
        if each_file.endswith('.gz'):
            print("Unzipping file: {0}...".format(each_file))  # FIX: Py2 print statement
            files[i] = useful.gunzip_python(each_file)
            print("Unzipping complete")
    file_list = 'cat ' + ' '.join('"' + f + '"' for f in files) + ' | '
    outfile = os.path.join(
        output_dir,
        os.path.basename(files[0]).replace('.fastq', '')
    ) + '.filtered.{0}.fastq'.format('q' + str(quality) + 'p' + str(percent))
    print("Running filtering...")
    # NOTE(review): shell=True with interpolated paths is injection-prone;
    # kept because the command is a cat | filter shell pipeline.
    subprocess.check_output('{3} {5} -v {4} -o "{0}" -q {1} -p {2}'.format(
        outfile, str(quality), str(percent), file_list, encoding,
        fastq_quality_filter_location), shell=True)
    print("filtering complete")
    return outfile
def run_gglab_pipeline(input_files, species, loci, group_name=''):
    """Single-end GG-lab pipeline: trimmomatic trim, fastx quality filter,
    igfft annotation, then VH/VL pairing.

    Parameters
    ----------
    input_files : list of str
        Raw FASTQ (optionally gzipped) file paths.
    species, loci
        Forwarded to igfft.igfft_multiprocess.
    group_name : str
        Prefix for the paired output files.

    NOTE(review): the trimming/filtering thresholds and thread counts are
    read from module-level settings — confirm they are defined at import.
    """
    print('Processing raw fastq files')
    processed_files = []
    for f in input_files:  # FIX: dropped unused enumerate index
        folder_path = os.path.dirname(f)
        if f.endswith('.gz'):
            print('Unzipping: ', f)
            f = useful.gunzip_python(f)
        # Sliding-window quality trim; reads shorter than MINLEN are dropped
        trimming_parameters = {
            'SLIDINGWINDOW': str(window_trim) + ':' + str(quality_cutoff_trim),
            'MINLEN': min_read_len_post_trim
        }
        trimmedf = processing.run_trimmomatic(f, folder_path, 'SE',
                                              phred_encode,
                                              trimming_parameters)[0]
        filtered_trimmed_file = fastx.Run_Quality_Filter(
            trimmedf, output_dir=folder_path, quality=quality_cutoff,
            percent=percent_bases)
        os.remove(trimmedf)  # discard the intermediate trimmed file
        processed_files.append(filtered_trimmed_file)
    print('Annotating processed fastq files')
    annotated_files = []
    for f in processed_files:
        annotated_f = igfft.igfft_multiprocess(
            f, file_type='FASTQ', species=species, locus=loci,
            parsing_settings={'isotype': isotyping_barcodes,
                              'remove_insertions': remove_insertions},
            num_processes=number_threads, delete_alignment_file=True)
        annotated_files.append(annotated_f[0])
    print('Pairing sequences')
    output_dir = os.path.dirname(annotated_files[0])
    pairing.RunPairing(annotated_files, annotated_file_formats='TAB',
                       analysis_method='GEORGIOU_INHOUSE',
                       output_folder_path=output_dir,
                       prefix_output_files=group_name,
                       cluster_cutoff=cluster_setting,
                       annotation_cluster_setting=annotation_cluster_cutoff)
    print('Pipeline complete')
def run_gglab_pipeline(input_files, species, loci, group_name=''):
    """Paired-end pipeline: optional trimming, PEAR R1/R2 stitching,
    quality filtering, then MIXCR annotation.

    Parameters
    ----------
    input_files : list of [R1, R2] path pairs
        Gzipped members are unzipped in place.
    species, loci, group_name
        Accepted for interface parity; this MIXCR variant passes its own
        loci/species values to RunMixcr.
    """
    print('Processing raw fastq files')
    processed_files = []
    for pair_of_files in input_files:
        folder_path = os.path.dirname(pair_of_files[0])
        for i, f in enumerate(pair_of_files):
            if f.endswith('.gz'):
                print('Unzipping: ', f)
                pair_of_files[i] = useful.gunzip_python(f)
        # BUG FIX: this previously rebound the *parameter* `input_files`
        # while the outer loop was still iterating over it; use a local.
        if trim_seqs:
            print('Trimming low quality bases')
            trimming_parameters = {
                'SLIDINGWINDOW': str(window_trim) + ':' + str(quality_cutoff_trim),
                'MINLEN': min_read_len_post_trim
            }
            read_files = processing.run_trimmomatic(pair_of_files,
                                                    folder_path, 'PE',
                                                    phred_encode,
                                                    trimming_parameters)
        else:
            read_files = pair_of_files
        # Stitch R1-R2 reads with PEAR
        pairing_parameters = {
            'v': min_overlap_length,
            'm': max_assembly_length,
            'n': min_assembly_length,
            'u': max_fraction_uncalled,
        }
        print('Stitching R1-R2 reads')
        pear_results = processing.run_pear(read_files[0], read_files[1],
                                           working_directory=folder_path,
                                           parameters=pairing_parameters,
                                           num_threads=number_threads,
                                           memory=pear_memory)[0]
        filtered_file = fastx.Run_Quality_Filter(pear_results,
                                                 output_dir=folder_path,
                                                 quality=quality_cutoff,
                                                 percent=percent_bases)
        os.remove(pear_results)  # discard the unfiltered assembled file
        processed_files.append(filtered_file)
    print('Annotating processed fastq files')
    annotated_files = []
    for f in processed_files:
        output_file = useful.removeFileExtension(f) + '.mixcr.alignment'
        output_file_annotation = useful.removeFileExtension(f) + '.mixcr.annotation'
        print('Running MIXCR')
        [annotated_f, command_val] = mixcr.RunMixcr(
            f, output_file, filetype='FASTQ', loci=[], species='',
            exportPrettyAlignment=False, num_threads=number_threads)
        print('Parsing MIXCR')
        # annotated_file should equal output_file_annotation
        annotated_file = mixcr.parseMIXCR(f, output_file, 'FASTQ',
                                          output_file_annotation,
                                          command_val=command_val)
        annotated_files.append(annotated_file[0])
    print('Pipeline complete')
def run_gglab_pipeline(input_files, species, loci, group_name=''):
    """Annotate raw FASTQ files with igfft; return the comma-joined paths.

    group_name is unused in this variant (interface parity with the other
    pipeline variants in this file).
    """
    print('Processing raw fastq files')
    annotated_files = []
    for f in input_files:  # FIX: dropped unused enumerate index and folder_path
        if f.endswith('.gz'):
            print('Unzipping: ', f)
            f = useful.gunzip_python(f)
        annotated_f = igfft.igfft_multiprocess(
            f, file_type='FASTQ', species=species, locus=loci,
            parsing_settings={'isotype': isotyping_barcodes,
                              'remove_insertions': remove_insertions},
            num_processes=number_threads, delete_alignment_file=True)
        annotated_files.append(annotated_f[0])
    output_file_list = ','.join(annotated_files)
    print(output_file_list)  # FIX: was a Python 2 print statement
    return output_file_list
def run_gglab_pipeline(input_files, species, loci, group_name=''):
    """Paired-end pipeline: optional trimming, PEAR stitching, quality
    filtering, then igfft annotation.

    Parameters
    ----------
    input_files : list of [R1, R2] path pairs
        Gzipped members are unzipped in place.
    species, loci
        Forwarded to igfft.igfft_multiprocess.
    group_name : str
        Unused in this variant (interface parity).
    """
    print('Processing raw fastq files')
    processed_files = []
    for pair_of_files in input_files:
        folder_path = os.path.dirname(pair_of_files[0])
        for i, f in enumerate(pair_of_files):
            if f.endswith('.gz'):
                print('Unzipping: ', f)
                pair_of_files[i] = useful.gunzip_python(f)
        # BUG FIX: previously rebound the parameter `input_files` while the
        # outer loop was still iterating over it; use a dedicated local.
        if trim_seqs:
            print('Trimming low quality bases')
            trimming_parameters = {
                'SLIDINGWINDOW': str(window_trim) + ':' + str(quality_cutoff_trim),
                'MINLEN': min_read_len_post_trim
            }
            read_files = processing.run_trimmomatic(pair_of_files,
                                                    folder_path, 'PE',
                                                    phred_encode,
                                                    trimming_parameters)
        else:
            read_files = pair_of_files
        # Stitch R1-R2 reads with PEAR
        pairing_parameters = {
            'v': min_overlap_length,
            'm': max_assembly_length,
            'n': min_assembly_length,
            'u': max_fraction_uncalled,
        }
        print('Stitching R1-R2 reads')
        pear_results = processing.run_pear(read_files[0], read_files[1],
                                           working_directory=folder_path,
                                           parameters=pairing_parameters,
                                           num_threads=number_threads,
                                           memory=pear_memory)[0]
        filtered_file = fastx.Run_Quality_Filter(pear_results,
                                                 output_dir=folder_path,
                                                 quality=quality_cutoff,
                                                 percent=percent_bases)
        os.remove(pear_results)
        processed_files.append(filtered_file)
    print('Annotating processed fastq files')
    annotated_files = []
    for f in processed_files:
        # NOTE(review): unlike the SE variants this call omits
        # file_type='FASTQ' — confirm igfft's default handles FASTQ input.
        annotated_f = igfft.igfft_multiprocess(
            f, species=species, locus=loci,
            parsing_settings={'isotype': isotyping_barcodes,
                              'remove_insertions': remove_insertions},
            num_processes=number_threads, delete_alignment_file=True)
        annotated_files.append(annotated_f[0])
    print('Pipeline complete')
def run_gglab_pipeline(input_files, species, loci, group_name=""):
    """Annotate raw FASTQ files with igfft; return the comma-joined paths."""
    print("Processing raw fastq files")
    # BUG FIX: list was created as `processed_files` but appended to as
    # `annotated_files`, raising NameError on the first iteration.
    annotated_files = []
    for f in input_files:  # FIX: dropped unused enumerate index and folder_path
        if f.endswith(".gz"):
            print("Unzipping: ", f)
            f = useful.gunzip_python(f)
        annotated_f = igfft.igfft_multiprocess(
            f,
            file_type="FASTQ",
            species=species,
            locus=loci,
            parsing_settings={"isotype": isotyping_barcodes,
                              "remove_insertions": remove_insertions},
            num_processes=number_threads,
            delete_alignment_file=True,
        )
        annotated_files.append(annotated_f[0])
    output_file_list = ",".join(annotated_files)
    print(output_file_list)  # FIX: was a Python 2 print statement
    return output_file_list
def Run_Quality_Filter(files, output_dir, quality, percent, encoding='-Q33'):
    """Quality-filter FASTQ reads with fastx's fastq_quality_filter.

    Gzipped inputs are unzipped first; inputs are concatenated via `cat`
    and piped into the filter binary. Returns the filtered file path.
    """
    if not isinstance(files, list):  # FIX: idiomatic type check
        files = [files]
    for i, each_file in enumerate(files):
        if each_file.endswith('.gz'):
            print("Unzipping file: {0}...".format(each_file))  # FIX: Py2 print
            files[i] = useful.gunzip_python(each_file)
            print("Unzipping complete")
    file_list = 'cat ' + ' '.join('"' + f + '"' for f in files) + ' | '
    outfile = os.path.join(
        output_dir,
        os.path.basename(files[0]).replace('.fastq', '')
    ) + '.filtered.{0}.fastq'.format('q' + str(quality) + 'p' + str(percent))
    print("Running filtering...")
    # NOTE(review): shell=True with interpolated paths is injection-prone;
    # kept because the command is a cat | filter shell pipeline.
    subprocess.check_output('{3} {5} -v {4} -o "{0}" -q {1} -p {2}'.format(
        outfile, str(quality), str(percent), file_list, encoding,
        fastq_quality_filter_location), shell=True)
    print("filtering complete")
    return outfile
def run_gglab_pipeline(input_files, species, loci, group_name=''):
    """Single-end pipeline: trimmomatic trim, fastx quality filter,
    MIXCR annotation, then VH/VL pairing.

    Parameters
    ----------
    input_files : list of str
        Raw FASTQ (optionally gzipped) file paths.
    group_name : str
        Prefix for the paired output files.
    """
    print('Processing raw fastq files')
    processed_files = []
    for f in input_files:  # FIX: dropped unused enumerate index
        folder_path = os.path.dirname(f)
        if f.endswith('.gz'):
            print('Unzipping: ', f)
            f = useful.gunzip_python(f)
        trimming_parameters = {
            'SLIDINGWINDOW': str(window_trim) + ':' + str(quality_cutoff_trim),
            'MINLEN': min_read_len_post_trim
        }
        trimmedf = processing.run_trimmomatic(f, folder_path, 'SE',
                                              phred_encode,
                                              trimming_parameters)[0]
        filtered_trimmed_file = fastx.Run_Quality_Filter(
            trimmedf, output_dir=folder_path, quality=quality_cutoff,
            percent=percent_bases)
        os.remove(trimmedf)
        processed_files.append(filtered_trimmed_file)
    print('Annotating processed fastq files')
    annotated_files = []
    for f in processed_files:
        output_file = useful.removeFileExtension(f) + '.mixcr.alignment'
        output_file_annotation = useful.removeFileExtension(f) + '.mixcr.annotation'
        print('Running MIXCR')
        [annotated_f, command_val] = mixcr.RunMixcr(
            f, output_file, filetype='FASTQ', loci=[], species='',
            exportPrettyAlignment=False, num_threads=number_threads)
        print('Parsing MIXCR')
        annotated_file = mixcr.parseMIXCR(f, output_file, 'FASTQ',
                                          output_file_annotation,
                                          command_val=command_val)
        # BUG FIX: sibling MIXCR variants append annotated_file[0]; appending
        # the whole return value would break os.path.dirname(annotated_files[0])
        # below. TODO confirm parseMIXCR's return shape.
        annotated_files.append(annotated_file[0])
    print('Pairing sequences')
    output_dir = os.path.dirname(annotated_files[0])
    pairing.RunPairing(annotated_files, annotated_file_formats='TAB',
                       analysis_method='MIXCR', output_folder_path=output_dir,
                       prefix_output_files=group_name,
                       cluster_cutoff=cluster_setting,
                       annotation_cluster_setting=annotation_cluster_cutoff)
    print('Pipeline complete')
def run_gglab_pipeline(input_files, species, loci, group_name=''):
    """Paired-end pipeline: optional trimming, PEAR stitching, quality
    filtering, then igfft annotation.

    input_files : list of [R1, R2] path pairs; gzipped members are
    unzipped in place. group_name is unused in this variant.
    """
    print('Processing raw fastq files')
    processed_files = []
    for pair_of_files in input_files:
        folder_path = os.path.dirname(pair_of_files[0])
        for i, f in enumerate(pair_of_files):
            if f.endswith('.gz'):
                print('Unzipping: ', f)
                pair_of_files[i] = useful.gunzip_python(f)
        # BUG FIX: previously rebound the parameter `input_files` while the
        # outer loop was still iterating over it; use a dedicated local.
        if trim_seqs:
            print('Trimming low quality bases')
            trimming_parameters = {
                'SLIDINGWINDOW': str(window_trim) + ':' + str(quality_cutoff_trim),
                'MINLEN': min_read_len_post_trim
            }
            read_files = processing.run_trimmomatic(pair_of_files,
                                                    folder_path, 'PE',
                                                    phred_encode,
                                                    trimming_parameters)
        else:
            read_files = pair_of_files
        # Stitch R1-R2 reads with PEAR
        pairing_parameters = {
            'v': min_overlap_length,
            'm': max_assembly_length,
            'n': min_assembly_length,
            'u': max_fraction_uncalled,
        }
        print('Stitching R1-R2 reads')
        pear_results = processing.run_pear(read_files[0], read_files[1],
                                           working_directory=folder_path,
                                           parameters=pairing_parameters,
                                           num_threads=number_threads,
                                           memory=pear_memory)[0]
        filtered_file = fastx.Run_Quality_Filter(pear_results,
                                                 output_dir=folder_path,
                                                 quality=quality_cutoff,
                                                 percent=percent_bases)
        os.remove(pear_results)
        processed_files.append(filtered_file)
    print('Annotating processed fastq files')
    annotated_files = []
    for f in processed_files:
        # NOTE(review): this call omits file_type='FASTQ' unlike the SE
        # variants — confirm igfft's default is correct here.
        annotated_f = igfft.igfft_multiprocess(
            f, species=species, locus=loci,
            parsing_settings={'isotype': isotyping_barcodes,
                              'remove_insertions': remove_insertions},
            num_processes=number_threads, delete_alignment_file=True)
        annotated_files.append(annotated_f[0])
    print('Pipeline complete')
def Run_FASTX_Barcode_Splitter(files, output_dir, settings=None,
                               search_reverse_complement=True):
    """Demultiplex FASTQ files by barcode using fastx_barcode_splitter.pl.

    When search_reverse_complement is True, the unmatched reads are re-split
    with reverse-complemented barcodes (opposite read end) and the two
    result sets are merged.

    Parameters
    ----------
    files : str or list of str
        Input FASTQ paths (gzipped files are unzipped first).
    output_dir : str
        Directory for the demultiplexed output files.
    settings : dict, default {'orientation': 'bol'}
        Splitter options; 'orientation' is required; 'prefix', 'suffix' and
        'bcfile' are consumed specially, the rest are forwarded as flags.
    search_reverse_complement : bool
        Also search for reverse-complemented barcodes in unmatched reads.

    Returns
    -------
    dict with 'barcodes' (read counts per output file), 'total' and
    'unmatched' read counts.
    """
    # BUG FIX: mutable default argument replaced with a None sentinel
    if settings is None:
        settings = {'orientation': 'bol'}
    parameters = copy.deepcopy(settings)
    if 'orientation' not in parameters:
        raise Exception('"Orientation" is required in the parameters field')
    if not isinstance(files, list):
        files = [files]
    for i, each_file in enumerate(files):
        if each_file.endswith('.gz'):
            print("Unzipping file: {0}...".format(each_file))  # FIX: Py2 print
            files[i] = useful.gunzip_python(each_file)
            files[i] = each_file[:-3]  # path without the .gz extension
            print("Unzipping complete")
    suffix = parameters.pop('suffix') if 'suffix' in parameters else ''
    barcode_splitter_command = 'cat ' + ' '.join(files) + ' | '
    if output_dir[-1] == '/':
        output_dir = output_dir[:-1]
    if 'prefix' in parameters and parameters['prefix'] != '':
        prefix = output_dir + '/' + parameters['prefix']
    else:
        prefix = output_dir + '/'
    parameters.pop('prefix', None)
    additional_folders = os.path.dirname(prefix)
    if not os.path.isdir(additional_folders):
        os.mkdir(additional_folders)
    orientation = parameters.pop('orientation', None)
    barcode_splitter_command += 'fastx_barcode_splitter.pl '
    for p in parameters:
        barcode_splitter_command += '--{0} {1} '.format(p, parameters[p])
    barcode_splitter_command += '--prefix ' + prefix + ' --suffix ' + suffix + ' --' + orientation
    output = useful.get_stdout(barcode_splitter_command).rstrip(' \n').split('\n')
    if output[0].lower().startswith('error'):
        raise Exception("Error found in barcode split program: " + output[0])
    # Splitter report layout: header line, one line per barcode, then an
    # unmatched line and a total line.
    result = {'barcodes': defaultdict(int)}
    for line in output[1:-2]:
        line = line.split('\t')
        result['barcodes'][line[2]] = int(line[1])
    result['total'] = int(output[-1].split('\t')[1])
    result['unmatched'] = int(output[-2].split('\t')[1])
    if search_reverse_complement:
        # Re-split the unmatched reads with reverse-complemented barcodes
        initial_file = []
        new_file = []
        map_barcode_to_file = {}
        with open(parameters['bcfile']) as bc_handle:  # FIX: don't shadow builtin `file`
            lines = bc_handle.readlines()
        new_bcfile = open(settings['bcfile'] + 'rc', 'w')
        for l in lines:
            c = l.split('\t')
            initial_file.append(c[0].strip())
            new_file.append(c[0].strip() + 'rev')
            map_barcode_to_file[c[0].strip() + 'rev'] = prefix + c[0].strip() + suffix
            new_bcfile.write(c[0].strip() + 'rev' + '\t' + Reverse_Complement(c[1].strip()) + '\n')
        new_bcfile.close()
        shutil.copyfile(prefix + 'unmatched' + suffix,
                        prefix + 'unmatched' + suffix + '.temp')
        files = [prefix + 'unmatched' + suffix + '.temp']
        parameters['bcfile'] += 'rc'
        # RC barcodes sit at the opposite end of the read
        if orientation == 'eol':
            orientation = 'bol'
        elif orientation == 'bol':
            orientation = 'eol'
        barcode_splitter_command = 'cat "' + ' '.join(files) + '" | '
        barcode_splitter_command += barcode_split_perl_script
        for p in parameters:
            barcode_splitter_command += '--{0} {1} '.format(p, parameters[p])
        barcode_splitter_command += '--prefix ' + prefix + ' --suffix ' + suffix + ' --' + orientation
        output = useful.get_stdout(barcode_splitter_command).rstrip(' \n').split('\n')
        for line in output[1:-2]:  # FIX: dropped unused enumerate index
            line = line.split('\t')
            result['barcodes'][map_barcode_to_file[line[0].strip()]] += int(line[1])
        result['unmatched'] = int(output[-2].split('\t')[1])
        # Concatenate each 'rev' split back onto its forward counterpart
        cleanup_command = ''
        for i, each_bc_file in enumerate(initial_file):
            cleanup_command += "mv '{0}{1}{3}' '{0}{1}{3}.temp';cat '{0}{1}{3}.temp' '{0}{2}{3}' > '{0}{1}{3}'; rm '{0}{1}{3}.temp';rm '{0}{2}{3}';".format(prefix, each_bc_file, new_file[i], suffix)
        cleanup_command += "rm '{0}{1}'; ".format(prefix, 'unmatched' + suffix + '.temp')
        # BUG FIX: was subprocess.cal(...) — AttributeError at runtime
        subprocess.call(cleanup_command, shell=True)
    return result
def run_trimmomatic(files, output_directory=None, method='SE', phred=None,
                    optional_parameters=None):
    '''
    Wrapper for running the trimmomatic program from python.

    Trimmomatic removes low quality bases from the ends of NGS reads using
    an average quality score in a sliding window.

    Parameters
    ----------
    files : string or list of strings
        One or two fastq/fastq.gz paths representing the R1/R2 reads.
    output_directory : string, default None
        Output directory; defaults to the parent directory of the first file.
    method : 'SE' or 'PE', default 'SE'
        Treat inputs as single-end or paired-end.
    phred : integer, default None
        Quality encoding; None lets trimmomatic guess.
    optional_parameters : dict, default None
        Extra trimmomatic parameters, e.g. {'SLIDINGWINDOW': '4:20'}.
        http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/TrimmomaticManual_V0.32.pdf

    Returns
    -------
    list of str : paths of the '.trimmed.fastq' output files.
    '''
    # BUG FIX: mutable default argument replaced with a None sentinel
    if optional_parameters is None:
        optional_parameters = {}
    method = method.upper()
    if method not in ['SE', 'PE']:
        raise Exception('Incorrect value provided for parameter "method". Provided value: ' + method)
    if not isinstance(files, list):
        files = [files]
    if len(files) > 2:
        # FIX: corrected "except" -> "accept" and missing space in message
        raise Exception(str(len(files)) + ' total files have been passed to function. We only accept 1 or 2 filepaths representing the R1/R2 reads')
    for i, f in enumerate(files):
        if f.endswith('.gz'):
            print('Unzipping: ', f)
            files[i] = useful.gunzip_python(f)
    output_directory = useful.get_parent_dir(files[0]) if not output_directory else os.path.abspath(output_directory)
    return_file_names = []
    command_loops = []
    if method == 'SE':
        # One trimmomatic invocation per file
        for f in files:
            out = f[:-6] if f.endswith('.fastq') else f
            return_file_names.append(out + '.trimmed.fastq')
            command_loops.append([['"' + f + '"'],
                                  ['"' + out + '.trimmed.fastq"']])
    else:
        # Trim the R1/R2 pair in a single invocation
        input_file_names = []
        output_file_names = []
        for f in files:
            input_file_names.append('"' + f + '"')
            out = f[:-6] if f.endswith('.fastq') else f
            output_file_names.extend(['"' + out + '.trimmed.fastq"',
                                      '"' + out + '.trimmed.unpaired.fastq"'])
            return_file_names.append(out + '.trimmed.fastq')
        command_loops.append([input_file_names, output_file_names])
    phred_var = '-phred' + str(phred) if phred else ''
    for inputs, outputs in command_loops:
        trim_command = 'java -jar {5} {0} {4} -threads 2 {1} {2} {3}'.format(
            method, ' '.join(inputs), ' '.join(outputs),
            # FIX: iteritems() is Python 2 only; items() works everywhere
            ' '.join(key + ':' + str(value)
                     for key, value in optional_parameters.items()),
            phred_var, trimmomatic_location)
        if subprocess.call(trim_command, shell=True) > 0:
            raise Exception('Trimmomatic failed')
    return return_file_names
def run_gglab_pipeline(input_files, species, loci, group_name=""):
    """Paired-end pipeline: optional trimming, PEAR R1/R2 stitching,
    quality filtering, then MIXCR annotation.

    input_files : list of [R1, R2] path pairs; gzipped members are
    unzipped in place.
    """
    print("Processing raw fastq files")
    processed_files = []
    for pair_of_files in input_files:
        folder_path = os.path.dirname(pair_of_files[0])
        for i, f in enumerate(pair_of_files):
            if f.endswith(".gz"):
                print("Unzipping: ", f)
                pair_of_files[i] = useful.gunzip_python(f)
        # BUG FIX: previously rebound the parameter `input_files` while the
        # outer loop was still iterating over it; use a dedicated local.
        if trim_seqs:
            print("Trimming low quality bases")
            trimming_parameters = {
                "SLIDINGWINDOW": str(window_trim) + ":" + str(quality_cutoff_trim),
                "MINLEN": min_read_len_post_trim,
            }
            read_files = processing.run_trimmomatic(
                pair_of_files, folder_path, "PE", phred_encode, trimming_parameters
            )
        else:
            read_files = pair_of_files
        # Stitch R1-R2 reads with PEAR
        pairing_parameters = {
            "v": min_overlap_length,
            "m": max_assembly_length,
            "n": min_assembly_length,
            "u": max_fraction_uncalled,
        }
        print("Stitching R1-R2 reads")
        pear_results = processing.run_pear(
            read_files[0],
            read_files[1],
            working_directory=folder_path,
            parameters=pairing_parameters,
            num_threads=number_threads,
            memory=pear_memory,
        )[0]
        filtered_file = fastx.Run_Quality_Filter(
            pear_results, output_dir=folder_path, quality=quality_cutoff,
            percent=percent_bases
        )
        os.remove(pear_results)
        processed_files.append(filtered_file)
    print("Annotating processed fastq files")
    annotated_files = []
    for f in processed_files:
        output_file = useful.removeFileExtension(f) + ".mixcr.alignment"
        output_file_annotation = useful.removeFileExtension(f) + ".mixcr.annotation"
        print("Running MIXCR")
        [annotated_f, command_val] = mixcr.RunMixcr(
            f,
            output_file,
            filetype="FASTQ",
            loci=[],
            species="",
            exportPrettyAlignment=False,
            num_threads=number_threads,
        )
        print("Parsing MIXCR")
        # annotated_file should equal output_file_annotation
        annotated_file = mixcr.parseMIXCR(
            f, output_file, "FASTQ", output_file_annotation, command_val=command_val
        )
        annotated_files.append(annotated_file[0])
    print("Pipeline complete")
def run_flash(r1file, r2file, working_directory, outfile='', parameters=None, suffix=''):
    """Merge overlapping R1/R2 reads with FLASH.

    Inputs are moved into working_directory if elsewhere; the merged
    (extendedFrags) output is renamed to the derived outfile name.

    Returns
    -------
    tuple : (merged file path, merged read count, R1 read count,
             percent of reads merged).
    """
    # BUG FIX: mutable default dict replaced with None, and caller-supplied
    # dicts are copied — 'o'/'d' keys are written below and previously
    # leaked back into the caller's dict.
    parameters = {} if parameters is None else dict(parameters)
    r1_path = useful.get_parent_dir(r1file)
    r2_path = useful.get_parent_dir(r2file)
    if not parameters:
        # FIX: Py2 print statements converted to print()
        print("PARAMETERS NOT PASSED INTO FLASH PROGRAM. USING DEFAULT IGSEQ PARAMETERS: R = 300, F = 400")
        parameters = {'r': 300, 'f': 400}
    if r1file.endswith('.gz'):
        print("Unzipping R1 File..")
        r1file = useful.gunzip_python(r1file)
    if r2file.endswith('.gz'):
        print("Unzipping R2 File..")
        r2file = useful.gunzip_python(r2file)
    working_directory = os.path.abspath(working_directory)
    # Move inputs into the working directory when they live elsewhere
    if r1_path != working_directory:
        os.rename(r1file, os.path.join(working_directory, os.path.basename(r1file)))
    if r2_path != working_directory:
        os.rename(r2file, os.path.join(working_directory, os.path.basename(r2file)))
    if outfile == '':
        # Derive the output name from R1's name, dropping the _R1/_R2 tag
        outfile = os.path.basename(r1file).split('.')
        for p, subs in enumerate(outfile):
            if '_R1' in subs:
                outfile[p] = subs[:subs.index("_R1")]
                break
            elif '_R2' in subs:
                outfile[p] = subs[:subs.index("_R2")]
                break
        outfile = '.'.join(outfile)
    else:
        outfile = os.path.basename(outfile)
    outfile = outfile.replace('.fastq', '').replace('.fasta', '')
    outfile += '.flashed' + suffix
    if os.path.isfile(os.path.join(working_directory, outfile)):
        print('WARNING: FILE {0} ALREADY PRESENT IN FOLDER. FILE WILL BE OVERWRITTEN'.format(working_directory + '/' + outfile))
    r1file = os.path.join(working_directory, os.path.basename(r1file))
    r2file = os.path.join(working_directory, os.path.basename(r2file))
    flash_command = "{2} {0} {1}".format(r1file, r2file, flash_location)
    parameters['o'] = outfile
    parameters['d'] = working_directory
    for p, val in parameters.items():  # FIX: iteritems() is Python 2 only
        flash_command += ' -{0} {1}'.format(p, str(val))
    flash_command += ' -q'  # quiet mode
    if subprocess.call(flash_command, shell=True) > 0:
        raise Exception('Flash failed')
    os.rename(os.path.join(working_directory, outfile + '.extendedFrags.fastq'),
              os.path.join(working_directory, outfile))
    try:
        read_count_r1_file = useful.file_line_count(r1file)
    except Exception as e:
        read_count_r1_file = 1
        print("Could not get number of lines in read file: " + str(e))
    try:
        read_count_flashed_file = useful.file_line_count(
            os.path.join(working_directory, outfile))
    except Exception as e:
        read_count_flashed_file = 1
        print("Could not get number of lines in outfile read file: " + str(e))
    # 4 fastq lines per read; // keeps the counts integral under Python 3
    return (os.path.join(working_directory, outfile),
            read_count_flashed_file // 4,
            read_count_r1_file // 4,
            100.0 * (read_count_flashed_file / float(read_count_r1_file)))
def run_flash(r1file, r2file, working_directory, outfile='', parameters=None, suffix=''):
    """Merge overlapping R1/R2 reads with FLASH; see the sibling variant.

    Returns (merged file path, merged read count, R1 read count,
    percent of reads merged).
    """
    # BUG FIX: mutable default dict replaced with None, and caller-supplied
    # dicts are copied — 'o'/'d' keys are written below and previously
    # leaked back into the caller's dict.
    parameters = {} if parameters is None else dict(parameters)
    r1_path = useful.get_parent_dir(r1file)
    r2_path = useful.get_parent_dir(r2file)
    if not parameters:
        # FIX: Py2 print statements converted to print()
        print("PARAMETERS NOT PASSED INTO FLASH PROGRAM. USING DEFAULT IGSEQ PARAMETERS: R = 300, F = 400")
        parameters = {'r': 300, 'f': 400}
    if r1file.endswith('.gz'):
        print("Unzipping R1 File..")
        r1file = useful.gunzip_python(r1file)
    if r2file.endswith('.gz'):
        print("Unzipping R2 File..")
        r2file = useful.gunzip_python(r2file)
    working_directory = os.path.abspath(working_directory)
    if r1_path != working_directory:
        os.rename(r1file, os.path.join(working_directory, os.path.basename(r1file)))
    if r2_path != working_directory:
        os.rename(r2file, os.path.join(working_directory, os.path.basename(r2file)))
    if outfile == '':
        # Derive the output name from R1's name, dropping the _R1/_R2 tag
        outfile = os.path.basename(r1file).split('.')
        for p, subs in enumerate(outfile):
            if '_R1' in subs:
                outfile[p] = subs[:subs.index("_R1")]
                break
            elif '_R2' in subs:
                outfile[p] = subs[:subs.index("_R2")]
                break
        outfile = '.'.join(outfile)
    else:
        outfile = os.path.basename(outfile)
    outfile = outfile.replace('.fastq', '').replace('.fasta', '')
    outfile += '.flashed' + suffix
    if os.path.isfile(os.path.join(working_directory, outfile)):
        print('WARNING: FILE {0} ALREADY PRESENT IN FOLDER. FILE WILL BE OVERWRITTEN'.format(working_directory + '/' + outfile))
    r1file = os.path.join(working_directory, os.path.basename(r1file))
    r2file = os.path.join(working_directory, os.path.basename(r2file))
    flash_command = "{2} {0} {1}".format(r1file, r2file, flash_location)
    parameters['o'] = outfile
    parameters['d'] = working_directory
    for p, val in parameters.items():  # FIX: iteritems() is Python 2 only
        flash_command += ' -{0} {1}'.format(p, str(val))
    flash_command += ' -q'  # quiet mode
    if subprocess.call(flash_command, shell=True) > 0:
        raise Exception('Flash failed')
    os.rename(os.path.join(working_directory, outfile + '.extendedFrags.fastq'),
              os.path.join(working_directory, outfile))
    try:
        read_count_r1_file = useful.file_line_count(r1file)
    except Exception as e:
        read_count_r1_file = 1
        print("Could not get number of lines in read file: " + str(e))
    try:
        read_count_flashed_file = useful.file_line_count(
            os.path.join(working_directory, outfile))
    except Exception as e:
        read_count_flashed_file = 1
        print("Could not get number of lines in outfile read file: " + str(e))
    # 4 fastq lines per read; // keeps the counts integral under Python 3
    return (os.path.join(working_directory, outfile),
            read_count_flashed_file // 4,
            read_count_r1_file // 4,
            100.0 * (read_count_flashed_file / float(read_count_r1_file)))
def run_pear(r1file, r2file, working_directory, outfile='', parameters=None,
             suffix='', num_threads=1, memory='1G'):
    """Assemble overlapping R1/R2 reads with PEAR.

    Parameters
    ----------
    r1file, r2file : str
        R1/R2 fastq paths (gzipped files are unzipped; inputs are moved
        into working_directory if elsewhere).
    outfile : str
        Output basename; derived from R1's name if empty.
    parameters : dict, default None
        Extra PEAR flags (single-letter keys).
    suffix : str
        Accepted for interface parity with run_flash but unused here —
        TODO confirm whether it should be appended to outfile.
    num_threads, memory : PEAR -j / -y options.

    Returns
    -------
    tuple : (assembled file path, assembled read count, R1 read count,
             percent of reads assembled).
    """
    # BUG FIX: mutable default dict replaced with None and copied — the
    # 'o'/'y'/'j' keys written below previously mutated the shared default
    # dict and any caller-supplied dict.
    parameters = {} if parameters is None else dict(parameters)
    r1_path = useful.get_parent_dir(r1file)
    r2_path = useful.get_parent_dir(r2file)
    if r1file.endswith('.gz'):
        print("Unzipping R1 File..")
        r1file = useful.gunzip_python(r1file)
    if r2file.endswith('.gz'):
        print("Unzipping R2 File..")
        r2file = useful.gunzip_python(r2file)
    working_directory = os.path.abspath(working_directory)
    # Move inputs into the working directory when they live elsewhere
    if r1_path != working_directory:
        os.rename(r1file, os.path.join(working_directory, os.path.basename(r1file)))
    if r2_path != working_directory:
        os.rename(r2file, os.path.join(working_directory, os.path.basename(r2file)))
    if outfile == '':
        # Derive the output name from R1's name, dropping the _R1/_R2 tag
        outfile = os.path.basename(r1file).split('.')
        for p, subs in enumerate(outfile):
            if '_R1' in subs:
                outfile[p] = subs[:subs.index("_R1")]
                break
            elif '_R2' in subs:
                outfile[p] = subs[:subs.index("_R2")]
                break
        outfile = '.'.join(outfile)
    else:
        outfile = os.path.basename(outfile)
    outfile = outfile.replace('.fastq', '').replace('.fasta', '')
    outfile = os.path.join(working_directory, outfile)
    # outfile is already absolute; the original re-joined it redundantly
    if os.path.isfile(outfile):
        print('WARNING: FILE {0} ALREADY PRESENT IN FOLDER. FILE WILL BE OVERWRITTEN'.format(working_directory + '/' + outfile))
    r1file = os.path.join(working_directory, os.path.basename(r1file))
    r2file = os.path.join(working_directory, os.path.basename(r2file))
    pear_command = "{2} -f {0} -r {1}".format(r1file, r2file, pear_location)
    parameters['o'] = outfile
    parameters['y'] = memory
    parameters['j'] = num_threads
    for p, val in parameters.items():  # FIX: iteritems() is Python 2 only
        pear_command += ' -{0} {1}'.format(p, str(val))
    if subprocess.call(pear_command, shell=True) > 0:
        raise Exception('Error in pear program')
    try:
        read_count_r1_file = useful.file_line_count(r1file)
    except Exception as e:
        read_count_r1_file = 1
        print("Could not get number of lines in read file: " + str(e))
    try:
        read_count_flashed_file = useful.file_line_count(outfile + '.assembled.fastq')
    except Exception as e:
        read_count_flashed_file = 1
        print("Could not get number of lines in outfile read file: " + str(e))
    # 4 fastq lines per read; // keeps the counts integral under Python 3
    return (outfile + '.assembled.fastq',
            read_count_flashed_file // 4,
            read_count_r1_file // 4,
            100.0 * (read_count_flashed_file / float(read_count_r1_file)))
def run_trimmomatic(files, output_directory=None, method='SE', phred=None,
                    optional_parameters=None):
    '''
    Wrapper for running the trimmomatic program from python.

    Trimmomatic removes low quality bases from the ends of NGS reads using
    an average quality score in a sliding window.

    Parameters
    ----------
    files : string or list of strings
        One or two fastq/fastq.gz paths representing the R1/R2 reads.
    output_directory : string, default None
        Output directory; defaults to the parent directory of the first file.
    method : 'SE' or 'PE', default 'SE'
        Treat inputs as single-end or paired-end.
    phred : integer, default None
        Quality encoding; None lets trimmomatic guess.
    optional_parameters : dict, default None
        Extra trimmomatic parameters, e.g. {'SLIDINGWINDOW': '4:20'}.
        http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/TrimmomaticManual_V0.32.pdf

    Returns
    -------
    list of str : paths of the '.trimmed.fastq' output files.
    '''
    # BUG FIX: mutable default argument replaced with a None sentinel
    if optional_parameters is None:
        optional_parameters = {}
    method = method.upper()
    if method not in ['SE', 'PE']:
        raise Exception('Incorrect value provided for parameter "method". Provided value: ' + method)
    if not isinstance(files, list):
        files = [files]
    if len(files) > 2:
        # FIX: corrected "except" -> "accept" and missing space in message
        raise Exception(str(len(files)) + ' total files have been passed to function. We only accept 1 or 2 filepaths representing the R1/R2 reads')
    for i, f in enumerate(files):
        if f.endswith('.gz'):
            print('Unzipping: ', f)
            files[i] = useful.gunzip_python(f)
    output_directory = useful.get_parent_dir(files[0]) if not output_directory else os.path.abspath(output_directory)
    return_file_names = []
    command_loops = []
    if method == 'SE':
        # One trimmomatic invocation per file
        for f in files:
            out = f[:-6] if f.endswith('.fastq') else f
            return_file_names.append(out + '.trimmed.fastq')
            command_loops.append([['"' + f + '"'],
                                  ['"' + out + '.trimmed.fastq"']])
    else:
        # Trim the R1/R2 pair in a single invocation
        input_file_names = []
        output_file_names = []
        for f in files:
            input_file_names.append('"' + f + '"')
            out = f[:-6] if f.endswith('.fastq') else f
            output_file_names.extend(['"' + out + '.trimmed.fastq"',
                                      '"' + out + '.trimmed.unpaired.fastq"'])
            return_file_names.append(out + '.trimmed.fastq')
        command_loops.append([input_file_names, output_file_names])
    phred_var = '-phred' + str(phred) if phred else ''
    for inputs, outputs in command_loops:
        trim_command = 'java -jar {5} {0} {4} -threads 2 {1} {2} {3}'.format(
            method, ' '.join(inputs), ' '.join(outputs),
            # FIX: iteritems() is Python 2 only; items() works everywhere
            ' '.join(key + ':' + str(value)
                     for key, value in optional_parameters.items()),
            phred_var, trimmomatic_location)
        if subprocess.call(trim_command, shell=True) > 0:
            raise Exception('Trimmomatic failed')
    return return_file_names
def run_pear(r1file, r2file, working_directory, outfile='', parameters=None, suffix='', num_threads=1, memory='1G'):
    '''
    Wrapper function for running the PEAR paired-end read merger.

    Unzips gzipped inputs, moves both read files into `working_directory`,
    derives an output prefix (stripping the `_R1`/`_R2` tag when `outfile`
    is not supplied), runs pear, and reports assembly statistics.

    Parameters
    ----------
    r1file, r2file : string
        Paths to the R1/R2 fastq (optionally .gz) files.
    working_directory : string
        Directory where the inputs are moved and outputs are written.
    outfile : string, default ''
        Output prefix; derived from the R1 filename when empty.
    parameters : dict, default None
        Additional single-letter pear options; 'o', 'y' and 'j' are set
        internally from `outfile`, `memory` and `num_threads`.
    suffix : string, default ''
        Currently unused; kept for interface compatibility.
    num_threads : int, default 1
        Passed to pear's -j option.
    memory : string, default '1G'
        Passed to pear's -y option.

    Returns
    -------
    tuple
        (assembled_file_path, assembled_read_count, input_read_count,
         percent_assembled).

    Raises
    ------
    Exception
        If the pear process exits with a non-zero status.
    '''
    # Copy so we never mutate the caller's dict (the original mutated a
    # shared mutable default, leaking 'o'/'y'/'j' between calls).
    parameters = dict(parameters) if parameters else {}
    r1_path = useful.get_parent_dir(r1file)
    r2_path = useful.get_parent_dir(r2file)
    if r1file.endswith('.gz'):
        print("Unzipping R1 File..")
        r1file = useful.gunzip_python(r1file)
    if r2file.endswith('.gz'):
        print("Unzipping R2 File..")
        r2file = useful.gunzip_python(r2file)
    working_directory = os.path.abspath(working_directory)
    if r1_path != working_directory:
        os.rename(r1file, os.path.join(working_directory, os.path.basename(r1file)))
    if r2_path != working_directory:
        os.rename(r2file, os.path.join(working_directory, os.path.basename(r2file)))
    if outfile == '':
        # Derive the prefix from the R1 name, truncating at the _R1/_R2 tag.
        outfile = os.path.basename(r1file).split('.')
        for p, subs in enumerate(outfile):
            if '_R1' in subs:
                r_pos = subs.index("_R1")
                outfile[p] = subs[:r_pos]
                break
            elif '_R2' in subs:
                r_pos = subs.index("_R2")
                outfile[p] = subs[:r_pos]
                break
        outfile = '.'.join(outfile)
    else:
        outfile = os.path.basename(outfile)
        outfile = outfile.replace('.fastq', '').replace('.fasta', '')
    outfile = os.path.join(working_directory, outfile)
    # `outfile` is already an absolute path here; the original re-joined it
    # with working_directory and printed a doubled path in the warning.
    if os.path.isfile(outfile):
        print('WARNING: FILE {0} ALREADY PRESENT IN FOLDER. FILE WILL BE OVERWRITTEN'.format(outfile))
    r1file = os.path.join(working_directory, os.path.basename(r1file))
    r2file = os.path.join(working_directory, os.path.basename(r2file))
    pear_command = "{2} -f {0} -r {1}".format(r1file, r2file, pear_location)
    parameters['o'] = outfile
    parameters['y'] = memory
    parameters['j'] = num_threads
    for p, val in parameters.items():
        pear_command += ' -{0} {1}'.format(p, str(val))
    worked = subprocess.call(pear_command, shell=True)
    if worked > 0:
        raise Exception('Error in pear program')
    try:
        read_count_r1_file = useful.file_line_count(r1file)
    except Exception as e:
        read_count_r1_file = 1
        print("Could not get number of lines in read file: " + str(e))
    try:
        read_count_flashed_file = useful.file_line_count(outfile + '.assembled.fastq')
    except Exception as e:
        read_count_flashed_file = 1
        print("Could not get number of lines in outfile read file: " + str(e))
    # A fastq record is 4 lines; floor division makes the Python 2 integer
    # semantics explicit (and identical under Python 3).
    resulting_counts = (outfile + '.assembled.fastq',
                        read_count_flashed_file // 4,
                        read_count_r1_file // 4,
                        float(100) * (read_count_flashed_file / float(read_count_r1_file)))
    return resulting_counts
def Run_FASTX_Barcode_Splitter(files, output_dir, settings=None, search_reverse_complement=True):
    '''
    Wrapper for the fastx_barcode_splitter.pl demultiplexing script.

    Splits reads by barcode into per-barcode files under `output_dir`.
    When `search_reverse_complement` is True, the unmatched reads are run
    through the splitter a second time with reverse-complemented barcodes
    (and flipped bol/eol orientation), and the hits are merged back into
    the per-barcode files.

    Parameters
    ----------
    files : string or list of strings
        Input fastq file(s); gzipped files are unzipped first.
    output_dir : string
        Directory for the split output files.
    settings : dict, default None -> {'orientation': 'bol'}
        Must contain 'orientation' ('bol' or 'eol'); supports 'bcfile',
        'prefix', 'suffix' and any other fastx_barcode_splitter options.
    search_reverse_complement : bool, default True
        Re-split the unmatched reads with reverse-complemented barcodes.

    Returns
    -------
    dict
        {'barcodes': defaultdict(int) of per-file read counts,
         'total': int, 'unmatched': int}

    Raises
    ------
    Exception
        If 'orientation' is missing or the splitter reports an error.
    '''
    # Default built per call to avoid the shared-mutable-default pitfall.
    if settings is None:
        settings = {'orientation': 'bol'}
    parameters = copy.deepcopy(settings)
    if 'orientation' not in parameters:
        raise Exception('"Orientation" is required in the parameters field')
    if not type(files) is list:
        files = [files]
    for i, each_file in enumerate(files):
        if each_file.endswith('.gz'):
            print("Unzipping file: {0}...".format(each_file))
            # BUG FIX: keep the unzipped path returned by gunzip_python;
            # the original immediately clobbered it with each_file[:-3].
            files[i] = useful.gunzip_python(each_file)
            print("Unzipping complete")
    suffix = parameters.pop('suffix') if 'suffix' in parameters else ''
    barcode_splitter_command = 'cat ' + ' '.join(files) + ' | '
    if output_dir[-1] == '/':
        output_dir = output_dir[:-1]
    if 'prefix' in parameters and parameters['prefix'] != '':
        prefix = output_dir + '/' + parameters['prefix']
    else:
        prefix = output_dir + '/'
    parameters.pop('prefix', None)
    additional_folders = os.path.dirname(prefix)
    if not os.path.isdir(additional_folders):
        os.mkdir(additional_folders)
    orientation = parameters.pop('orientation', None)
    barcode_splitter_command += 'fastx_barcode_splitter.pl '
    for p in parameters:
        barcode_splitter_command += '--{0} {1} '.format(p, parameters[p])
    barcode_splitter_command += '--prefix ' + prefix + ' --suffix ' + suffix + ' --' + orientation
    output = useful.get_stdout(barcode_splitter_command).rstrip(' \n').split('\n')
    if output[0].lower().startswith('error'):
        raise Exception("Error found in barcode split program: " + output[0])
    result = {'barcodes': defaultdict(int)}
    # Splitter output: header line, one row per barcode, then the
    # 'unmatched' and 'total' summary rows.
    for line in output[1:-2]:
        line = line.split('\t')
        result['barcodes'][line[2]] = int(line[1])
    result['total'] = int(output[-1].split('\t')[1])
    result['unmatched'] = int(output[-2].split('\t')[1])
    if search_reverse_complement:
        initial_file = []
        new_file = []
        map_barcode_to_file = {}
        with open(parameters['bcfile']) as file:
            lines = file.readlines()
        # Write a companion barcode file with reverse-complemented sequences.
        new_bcfile = open(settings['bcfile'] + 'rc', 'w')
        for l in lines:
            c = l.split('\t')
            initial_file.append(c[0].strip())
            new_file.append(c[0].strip() + 'rev')
            map_barcode_to_file[c[0].strip() + 'rev'] = prefix + c[0].strip() + suffix
            new_bcfile.write(c[0].strip() + 'rev' + '\t' + Reverse_Complement(c[1].strip()) + '\n')
        new_bcfile.close()
        shutil.copyfile(prefix + 'unmatched' + suffix, prefix + 'unmatched' + suffix + '.temp')
        files = [prefix + 'unmatched' + suffix + '.temp']
        parameters['bcfile'] += 'rc'
        # Reverse-complemented barcodes sit at the opposite end of the read.
        if orientation == 'eol':
            orientation = 'bol'
        elif orientation == 'bol':
            orientation = 'eol'
        barcode_splitter_command = 'cat "' + ' '.join(files) + '" | '
        # BUG FIX: a space must separate the splitter script path from its
        # first option, otherwise the shell sees 'script--bcfile ...'.
        barcode_splitter_command += barcode_split_perl_script + ' '
        for p in parameters:
            barcode_splitter_command += '--{0} {1} '.format(p, parameters[p])
        barcode_splitter_command += '--prefix ' + prefix + ' --suffix ' + suffix + ' --' + orientation
        output = useful.get_stdout(barcode_splitter_command).rstrip(' \n').split('\n')
        for i, line in enumerate(output[1:-2]):
            line = line.split('\t')
            result['barcodes'][map_barcode_to_file[line[0].strip()]] += int(line[1])
        result['unmatched'] = int(output[-2].split('\t')[1])
        # Append each 'rev' split file onto its forward counterpart and
        # remove the temporaries.
        cleanup_command = ''
        for i, each_bc_file in enumerate(initial_file):
            cleanup_command += "mv '{0}{1}{3}' '{0}{1}{3}.temp';cat '{0}{1}{3}.temp' '{0}{2}{3}' > '{0}{1}{3}'; rm '{0}{1}{3}.temp';rm '{0}{2}{3}';".format(prefix, each_bc_file, new_file[i], suffix)
        cleanup_command += "rm '{0}{1}'; ".format(prefix, 'unmatched' + suffix + '.temp')
        # BUG FIX: the original called the nonexistent subprocess.cal,
        # which raised AttributeError and skipped cleanup entirely.
        subprocess.call(cleanup_command, shell=True)
    return result
def run_gglab_pipeline(input_files, species, loci, group_name=''):
    '''
    Run the MIXCR variant of the gg-lab pipeline.

    For each raw fastq file: unzip if gzipped, quality-trim with
    trimmomatic, quality-filter with fastx, annotate with MIXCR, parse the
    MIXCR alignments, and finally pair the annotated sequences.

    Parameters
    ----------
    input_files : list of strings
        Raw fastq (optionally .gz) file paths.
    species : string
        Species forwarded to the MIXCR aligner.
    loci : list
        Loci forwarded to the MIXCR aligner.
    group_name : string, default ''
        Prefix for the paired output files.

    Notes
    -----
    Relies on module-level settings defined elsewhere in this file:
    window_trim, quality_cutoff_trim, min_read_len_post_trim,
    phred_encode, quality_cutoff, percent_bases, number_threads,
    cluster_setting, annotation_cluster_cutoff.
    '''
    # Unzip files
    print('Processing raw fastq files')
    processed_files = []
    for f in input_files:
        folder_path = os.path.dirname(f)
        if f.endswith('.gz'):
            print('Unzipping: ', f)
            f = useful.gunzip_python(f)
        # Run trimmomatic
        trimming_parameters = {
            'SLIDINGWINDOW': str(window_trim) + ':' + str(quality_cutoff_trim),
            'MINLEN': min_read_len_post_trim
        }
        method = 'SE'
        trimmedf = processing.run_trimmomatic(f, folder_path, method, phred_encode, trimming_parameters)[0]
        # Run quality filtering; the intermediate trimmed file is removed.
        filtered_trimmed_file = fastx.Run_Quality_Filter(trimmedf, output_dir=folder_path, quality=quality_cutoff, percent=percent_bases)
        os.remove(trimmedf)
        processed_files.append(filtered_trimmed_file)
    print('Annotating processed fastq files')
    annotated_files = []
    for f in processed_files:
        output_file = useful.removeFileExtension(f) + '.mixcr.alignment'
        output_file_annotation = useful.removeFileExtension(f) + '.mixcr.annotation'
        # Run MIXCR file
        print('Running MIXCR')
        # BUG FIX: forward the caller's species/loci; the original passed
        # hard-coded loci=[] and species='', silently ignoring both
        # arguments (the igfft pipeline variants do forward them).
        [annotated_f, command_val] = mixcr.RunMixcr(f, output_file, filetype='FASTQ', loci=loci, species=species, exportPrettyAlignment=False, num_threads=number_threads)
        # Parse MIXCR file
        print('Parsing MIXCR')
        annotated_file = mixcr.parseMIXCR(f, output_file, 'FASTQ', output_file_annotation, command_val=command_val)  # annotated_file should equal output_file_annotation
        annotated_files.append(annotated_file)
    print('Pairing sequences')
    output_dir = os.path.dirname(annotated_files[0])
    pairing.RunPairing(annotated_files, annotated_file_formats='TAB', analysis_method='MIXCR', output_folder_path=output_dir, prefix_output_files=group_name, cluster_cutoff=cluster_setting, annotation_cluster_setting=annotation_cluster_cutoff)
    print('Pipeline complete')