def trimmomatic(jar_path_trimmomatic, sampleName, trimmomatic_folder, threads, adaptersFasta, script_path,
                doNotSearchAdapters, fastq_files, maxReadsLength, doNotTrimCrops, crop, headCrop, leading, trailing,
                slidingWindow, minLength, nts2clip_based_ntsContent, jarMaxMemory, fastq_encoding):
    """
    Build and run the Trimmomatic paired-end (PE) command

    Parameters
    ----------
    jar_path_trimmomatic : str
        Path to the Trimmomatic jar file
    sampleName : str
        Not used by this function (kept for interface compatibility)
    trimmomatic_folder : str
        Directory that receives the "P" and "U" output files of each input
    threads : int
        Value given to -threads
    adaptersFasta : str or None
        Adapters fasta used for ILLUMINACLIP. When None, the adapter files found under
        <script directory>/src/Trimmomatic-0.36/adapters are concatenated and used
    script_path : str
        Path of the running script, used to locate the bundled adapter files
    doNotSearchAdapters : bool
        True to skip the ILLUMINACLIP step
    fastq_files : list
        Input fastq files
    maxReadsLength : int or None
        Maximum reads length, required to convert "nucleotides to clip at the end" into a CROP length
    doNotTrimCrops : bool
        True to skip both CROP and HEADCROP
    crop : sequence or None
        crop[0] is the number of nucleotides to clip at the end of the reads
    headCrop : sequence or None
        headCrop[0] is the number of nucleotides to clip at the beginning of the reads
    leading, trailing, minLength
        Values for LEADING, TRAILING and MINLEN
    slidingWindow : str
        Value for SLIDINGWINDOW
    nts2clip_based_ntsContent : sequence or None
        Fallback clipping values: [0] for the beginning and [1] for the end of the reads
    jarMaxMemory : number or str
        'off' to not set the Java heap, otherwise a number multiplied by 1024 to build -Xmx<N>M
    fastq_encoding : int or None
        33 or 64 to select -phred33 / -phred64. When None, Phred+33 is tried first and Phred+64 if that run fails

    Returns
    -------
    run_successfully : bool
        Success flag of the last Trimmomatic run, as reported by utils.runCommandPopenCommunicate
    """
    trimmomatic_out_files = []
    for fastq in fastq_files:
        # Remove two extensions (e.g. ".fastq.gz") from the input file name
        prefix = os.path.splitext(os.path.splitext(os.path.basename(fastq))[0])[0]
        for suffix in ('P.fastq.gz', 'U.fastq.gz'):
            trimmomatic_out_files.append(os.path.join(trimmomatic_folder, str(prefix + suffix)))

    # Run Trimmomatic
    # The '' entries are placeholders filled below: [1] Java heap, [7] Phred encoding, [10] CROP, [11] HEADCROP,
    # [12] ILLUMINACLIP
    command = ['java', '', '-jar', jar_path_trimmomatic, 'PE', '-threads', str(threads), '', ' '.join(fastq_files),
               ' '.join(trimmomatic_out_files), '', '', '', str('SLIDINGWINDOW:' + slidingWindow),
               str('LEADING:' + str(leading)), str('TRAILING:' + str(trailing)), str('MINLEN:' + str(minLength)),
               'TOPHRED33']

    if str(jarMaxMemory) != 'off':
        command[1] = '-Xmx' + str(int(round(jarMaxMemory * 1024, 0))) + 'M'

    if not doNotTrimCrops:
        if maxReadsLength is not None:
            if crop is not None:
                crop = maxReadsLength - crop[0]
                command[10] = str('CROP:' + str(crop))
            elif nts2clip_based_ntsContent is not None:
                crop = nts2clip_based_ntsContent[1]
                print(str(crop) + ' nucleotides will be clipped at the end of reads')
                crop = maxReadsLength - crop
                command[10] = str('CROP:' + str(crop))
        else:
            print('Because FastQC did not run successfully, --trimCrop option will not be considered')

        if headCrop is not None:
            command[11] = str('HEADCROP:' + str(headCrop[0]))
        elif nts2clip_based_ntsContent is not None:
            headCrop = nts2clip_based_ntsContent[0]
            print(str(headCrop) + ' nucleotides will be clipped at the beginning of reads')
            command[11] = str('HEADCROP:' + str(headCrop))

    if not doNotSearchAdapters:
        if adaptersFasta is not None:
            print('Removing adapters contamination using ' + adaptersFasta)
        else:
            trimmomatic_adapters_folder = os.path.join(os.path.dirname(script_path), 'src', 'Trimmomatic-0.36',
                                                       'adapters')
            adapters_files = [os.path.join(trimmomatic_adapters_folder, 'Nextera_XT_INNUca.fasta'),
                              os.path.join(trimmomatic_adapters_folder, 'NexteraPE-PE.fa'),
                              os.path.join(trimmomatic_adapters_folder, 'TruSeq2-PE.fa'),
                              os.path.join(trimmomatic_adapters_folder, 'TruSeq3-PE-2.fa')]
            print('Removing adapters contamination using ' + str(adapters_files))
            adaptersFasta = concatenateFastaFiles(adapters_files, trimmomatic_folder,
                                                  'concatenated_adaptersFile.fasta')
        command[12] = 'ILLUMINACLIP:' + adaptersFasta + ':3:30:10:6:true'

    if fastq_encoding is not None:
        if fastq_encoding == 33:
            command[7] = '-phred33'
        elif fastq_encoding == 64:
            command[7] = '-phred64'
        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
    else:
        # NOTE(review): this message is printed before any run was attempted (the encoding is simply unknown).
        # The text is kept unchanged in case something parses the log
        print('Trimmomatic fail! Trying run with Phred+33 enconding defined...')
        command[7] = '-phred33'
        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
        if not run_successfully:
            print('Trimmomatic fail again! Trying run with Phred+64 enconding defined...')
            command[7] = '-phred64'
            run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)

    return run_successfully
def indexSequenceBowtie2(referenceFile, threads):
    """
    Make sure a Bowtie2 index exists for a reference file

    The index is considered to be present when "<referenceFile>.1.bt2" exists. Otherwise bowtie2-build is run with
    the reference path also used as index prefix.

    Parameters
    ----------
    referenceFile : str
        Path to the reference fasta file
    threads : int
        Value given to --threads

    Returns
    -------
    bool
        True when the index was already there, otherwise the success flag of the bowtie2-build run
    """
    first_index_file = str(referenceFile + '.1.bt2')
    if not os.path.isfile(first_index_file):
        build_command = ['bowtie2-build', '--threads', str(threads), referenceFile, referenceFile]
        index_built, _, _ = utils.runCommandPopenCommunicate(build_command, False, None, True)
        return index_built
    return True
def compress_decompress(compressed_file, decompressed_file, compressed_True):
    """
    Compress (gzip) or decompress a fastq file and pickle the outcome

    Parameters
    ----------
    compressed_file : str
        Path to the compressed file (input when decompressing, output when compressing)
    decompressed_file : str
        Path to the decompressed file (output when decompressing, input when compressing)
    compressed_True : bool
        True to compress decompressed_file into compressed_file; False to decompress compressed_file

    Notes
    -----
    When decompressing a file for which utils.compressionType returns None, the file is checked as it is and
    used in place of decompressed_file. Nothing is returned: [success flag, resulting file, sequence length] is
    stored with utils.saveVariableToPickle next to the decompressed file.
    """
    success = False
    bad_fastq = False
    sequence_length = None

    # The compression type only has to be detected when decompressing
    detected_type = None
    if not compressed_True:
        detected_type = utils.compressionType(compressed_file)

    if detected_type is not None or compressed_True:
        if compressed_True:
            program, mode, source, target = 'gzip', '', decompressed_file, compressed_file
        else:
            program, mode, source, target = detected_type[0], '--decompress', compressed_file, decompressed_file
        # Output redirection ('>') is why the command is run with the shell flag set
        shell_command = [program, mode, '--stdout', '--keep', source, '>', target]
        success, _, _ = utils.runCommandPopenCommunicate(shell_command, True, None, True)
        if success and not compressed_True:
            bad_fastq, sequence_length = check_uncompression_fastq(decompressed_file)
    else:
        # Decompression requested for a file that is not compressed: check it directly
        success = True
        bad_fastq, sequence_length = check_uncompression_fastq(compressed_file)
        decompressed_file = compressed_file

    if bad_fastq:
        success = False

    resulting_file = compressed_file if compressed_True else decompressed_file
    pickle_dir = os.path.dirname(decompressed_file)
    pickle_name = os.path.splitext(os.path.basename(decompressed_file))[0]
    utils.saveVariableToPickle([success, resulting_file, sequence_length], pickle_dir, pickle_name)
def pilon(jar_path_pilon, assembly, bam_file, outdir, jarMaxMemory):
    """
    Run Pilon on an assembly using the reads alignment

    Parameters
    ----------
    jar_path_pilon : str
        Path to the Pilon jar file
    assembly : str
        Path to the assembly fasta file (--genome)
    bam_file : str
        Path to the alignment file (--frags)
    outdir : str
        Pilon output directory
    jarMaxMemory : number or str
        'off' to not set the Java heap, otherwise a number multiplied by 1024 to build -Xmx<N>M

    Returns
    -------
    run_successfully : bool
        Success flag reported by utils.runCommandPopenCommunicate
    assembly_polished : str or None
        "<assembly without extension>.polished.fasta", or None when the run failed
    """
    polished_fasta = os.path.splitext(assembly)[0] + '.polished.fasta'
    output_prefix = os.path.basename(os.path.splitext(polished_fasta)[0])

    # Index 1 is the optional Java heap setting
    pilon_command = ['java', '', '-jar', jar_path_pilon, '--genome', assembly, '--frags', bam_file, '--outdir',
                     outdir, '--output', output_prefix, '--changes', '--vcf']
    if str(jarMaxMemory) != 'off':
        pilon_command[1] = '-Xmx' + str(int(round(jarMaxMemory * 1024, 0))) + 'M'

    polished_ok, _, _ = utils.runCommandPopenCommunicate(pilon_command, False, None, True)
    if polished_ok:
        return polished_ok, polished_fasta
    return polished_ok, None
def sortAlignment(alignment_file, output_file, sortByName_True, threads):
    """
    Sort an alignment file with samtools sort

    Parameters
    ----------
    alignment_file : str
        Alignment file to sort
    output_file : str
        Sorted output file; its lower-cased extension is given to -O as output format
    sortByName_True : bool
        True to add -n (sort by read name)
    threads : int
        Value given to -@

    Returns
    -------
    run_successfully : bool
        Success flag reported by utils.runCommandPopenCommunicate
    output_file : str or None
        The output path, or None when sorting failed
    """
    output_format = os.path.splitext(output_file)[1][1:].lower()
    name_sort_flag = '-n' if sortByName_True else ''
    sort_command = ['samtools', 'sort', '-o', output_file, '-O', output_format, name_sort_flag, '-@', str(threads),
                    alignment_file]
    sorted_ok, _, _ = utils.runCommandPopenCommunicate(sort_command, False, None, True)
    return sorted_ok, (output_file if sorted_ok else None)
def runMlst(contigs, scheme, outdir, species_genus, mlst_scheme_genus):
    """
    Run mlst on the contigs and compare the scheme found with the expected one

    Parameters
    ----------
    contigs : str
        Path to the contigs fasta file
    scheme : str
        Expected MLST scheme ('unknown' when the species has none)
    outdir : str
        Directory where mlst_report.txt and mlst_novel_alleles.fasta are written
    species_genus : str
        Genus of the species under analysis
    mlst_scheme_genus : str or None
        Genus level MLST scheme, if any

    Returns
    -------
    run_successfully : bool
        Success flag of the mlst run
    pass_qc : bool
        True when the scheme found is accepted
    failing : dict
        failing['sample'] is False or the reason for failing
    warnings : dict
        warnings['sample'] holds a warning message when there is one
    """
    pass_qc = False
    failing = {'sample': False}
    warnings = {}

    novel_alleles = os.path.join(outdir, 'mlst_novel_alleles.fasta')
    command = ['mlst', '--novel', novel_alleles, contigs]
    run_successfully, stdout, _ = utils.runCommandPopenCommunicate(command, False, None, True)

    if run_successfully:
        # First output line, tab separated: file, scheme, ST, alleles...
        fields = stdout.splitlines()[0].split('\t')
        scheme_mlst = fields[1].split('_')[0]
        st = fields[2]
        profile = fields[3:]

        # Novel alleles can only be cleaned when mlst actually wrote the file
        if st == '-' and os.path.isfile(novel_alleles):
            clean_novel_alleles(novel_alleles=novel_alleles, scheme_mlst=scheme_mlst, profile=profile)
        elif os.path.isfile(novel_alleles):
            os.remove(novel_alleles)

        report = 'MLST found ST ' + str(st) + ' from scheme ' + scheme_mlst
        print(report)
        with open(os.path.join(outdir, 'mlst_report.txt'), 'wt') as writer:
            writer.write('#scheme' + '\n' + scheme_mlst + '\n' + '#ST' + '\n' + st + '\n')
            writer.write('#profile' + '\n' + ' '.join(profile) + '\n')
            writer.flush()

        if scheme_mlst.split('_', 1)[0] == scheme.split('_', 1)[0]:
            pass_qc = True
        elif scheme == 'unknown' and scheme_mlst != '-':
            pass_qc = True
            warnings['sample'] = 'Found {scheme_mlst} scheme for a species with unknown' \
                                 ' scheme'.format(scheme_mlst=scheme_mlst)
        elif scheme == 'unknown' and scheme_mlst == '-':
            pass_qc = True
        elif species_genus == 'yersinia' and mlst_scheme_genus == 'yersinia':
            pass_qc = True
            warnings['sample'] = 'Found a Yersinia scheme ({scheme_mlst}), but it is different from what it was' \
                                 ' expected ({scheme})'.format(scheme_mlst=scheme_mlst, scheme=scheme)
        elif mlst_scheme_genus is not None and scheme_mlst == scheme == mlst_scheme_genus:
            pass_qc = True
        else:
            failing['sample'] = 'MLST scheme found ({scheme_mlst}) and provided ({scheme}) are not the' \
                                ' same'.format(scheme_mlst=scheme_mlst, scheme=scheme)
            print(failing['sample'])
    else:
        failing['sample'] = 'Did not run'

    if len(warnings) > 0:
        print(warnings['sample'])

    return run_successfully, pass_qc, failing, warnings
def get_bam_subset(alignment_file, sequences_2_keep, threads):
    """
    Extract the mapped reads of the given sequences into a new BAM file

    Parameters
    ----------
    alignment_file : str
        Alignment file to subset
    sequences_2_keep : list
        Names of the sequences (samtools view regions) to keep
    threads : int
        Value given to -@

    Returns
    -------
    run_successfully : bool
        Success flag reported by utils.runCommandPopenCommunicate
    bam_subset : str or None
        "<alignment without extension>.subset.bam", or None when samtools failed
    """
    subset_path = os.path.splitext(alignment_file)[0] + '.subset.bam'
    regions = ' '.join(sequences_2_keep)
    # -F 4 leaves unmapped reads out
    view_command = ['samtools', 'view', '-buh', '-F', '4', '-o', subset_path, '-@', str(threads), alignment_file,
                    regions]
    subset_ok, _, _ = utils.runCommandPopenCommunicate(view_command, False, None, False)
    if subset_ok:
        return subset_ok, subset_path
    return subset_ok, None
def getScheme(species):
    """
    Find the MLST scheme that the installed mlst associates with a species

    Parameters
    ----------
    species : str
        Species name, with words separated by single spaces (e.g. "Genus species")

    Returns
    -------
    scheme : str
        Scheme returned by parse_species_scheme_map
    genus : str
        First word of the lower-cased species name
    genus_mlst_scheme
        Genus level scheme returned by parse_species_scheme_map
    """
    command = ['which', 'mlst']
    # NOTE(review): the success flag is not checked; an empty output raises IndexError below
    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False)
    mlst_folder = os.path.abspath(os.path.realpath(stdout.splitlines()[0]))

    mlst_db_path, species_scheme_map_new = get_species_scheme_map_version(mlst_folder)
    scheme, genus_mlst_scheme = parse_species_scheme_map(species.lower().split(' '), mlst_db_path,
                                                         species_scheme_map_new)

    # print() with a single argument gives the same output on Python 2 and 3
    print('\n' + 'MLST scheme found for {species}: {scheme}'.format(species=species, scheme=scheme))

    return scheme, species.lower().split(' ')[0], genus_mlst_scheme
def gzip_files(file_2_compress, pickle_prefix, outdir):
    """
    Compress a file with gzip --best, removing the original on success

    Parameters
    ----------
    file_2_compress : str
        File to compress. A trailing ".temp" extension is left out of the output name
    pickle_prefix : str
        Prefix of the pickle that stores the success flag
    outdir : str
        Directory where the pickle is saved

    Notes
    -----
    Nothing is returned: the success flag is stored through utils.saveVariableToPickle under
    "<pickle_prefix>.<basename of file_2_compress>".
    """
    if file_2_compress.endswith('.temp'):
        output_prefix = os.path.splitext(file_2_compress)[0]
    else:
        output_prefix = file_2_compress

    # Output redirection ('>') is why the command is run with the shell flag set
    gzip_command = ['gzip', '--stdout', '--best', file_2_compress, '>', str(output_prefix + '.gz')]
    compressed_ok, _, _ = utils.runCommandPopenCommunicate(gzip_command, True, None, True)
    if compressed_ok:
        os.remove(file_2_compress)

    pickle_name = str(pickle_prefix + '.' + os.path.basename(file_2_compress))
    utils.saveVariableToPickle(compressed_ok, outdir, pickle_name)
def index_fasta_samtools(fasta, region_None, region_outfile_none, print_comand_True):
    """
    Run samtools faidx to index a fasta file or to extract a region from it

    Parameters
    ----------
    fasta : str
        Path to the fasta file
    region_None : str or None
        Region to extract; None to only index the file
    region_outfile_none : str or None
        File that receives the command output through shell redirection; None to not redirect
    print_comand_True : bool
        Forwarded to utils.runCommandPopenCommunicate as its last argument

    Returns
    -------
    run_successfully : bool
        Success flag reported by utils.runCommandPopenCommunicate
    stdout : str
        Standard output of the command
    """
    region = region_None if region_None is not None else ''
    if region_outfile_none is not None:
        # Redirecting the output requires running through the shell
        redirection = ['>', region_outfile_none]
        use_shell = True
    else:
        redirection = ['', '']
        use_shell = False

    faidx_command = ['samtools', 'faidx', fasta, region] + redirection
    faidx_ok, faidx_stdout, _ = utils.runCommandPopenCommunicate(faidx_command, use_shell, None, print_comand_True)
    return faidx_ok, faidx_stdout
def get_bam_subset(alignment_file, sequences_2_keep, threads):
    """
    Write the mapped reads of the requested sequences to a subset BAM file

    Parameters
    ----------
    alignment_file : str
        Alignment file to subset
    sequences_2_keep : list
        Names of the sequences (samtools view regions) to keep
    threads : int
        Value given to -@

    Returns
    -------
    run_successfully : bool
        Success flag reported by utils.runCommandPopenCommunicate
    bam_subset : str or None
        "<alignment without extension>.subset.bam", or None when samtools failed
    """
    alignment_prefix = os.path.splitext(alignment_file)[0]
    subset_bam = alignment_prefix + '.subset.bam'

    # -F 4 leaves unmapped reads out
    samtools_command = ['samtools', 'view', '-buh', '-F', '4', '-o', subset_bam, '-@', str(threads)]
    samtools_command.append(alignment_file)
    samtools_command.append(' '.join(sequences_2_keep))

    view_ok, _, _ = utils.runCommandPopenCommunicate(samtools_command, False, None, False)
    return view_ok, (subset_bam if view_ok else None)
def runMlst(contigs, scheme, outdir, species_genus, mlst_scheme_genus):
    """
    Run mlst on the contigs and compare the scheme found with the expected one

    Parameters
    ----------
    contigs : str
        Path to the contigs fasta file
    scheme : str
        Expected MLST scheme ('unknown' when the species has none)
    outdir : str
        Directory where mlst_report.txt is written
    species_genus : str
        Genus of the species under analysis
    mlst_scheme_genus : str or None
        Genus level MLST scheme, if any

    Returns
    -------
    run_successfully : bool
        Success flag of the mlst run
    pass_qc : bool
        True when the scheme found is accepted, and also when mlst did not run
    failing : dict
        failing['sample'] is False or the reason for failing
    warnings : dict
        warnings['sample'] holds a warning message when there is one
    """
    pass_qc = False
    failing = {'sample': False}
    warnings = {}

    command = ['mlst', contigs]
    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)

    if run_successfully:
        # First output line, tab separated: file, scheme, ST, alleles...
        fields = stdout.splitlines()[0].split('\t')
        scheme_mlst = fields[1].split('_')[0]
        st = fields[2]
        profile = fields[3:]

        report = 'MLST found ST ' + str(st) + ' from scheme ' + scheme_mlst
        # print() with a single argument gives the same output on Python 2 and 3
        print(report)
        with open(os.path.join(outdir, 'mlst_report.txt'), 'wt') as writer:
            writer.write('#scheme' + '\n' + scheme_mlst + '\n' + '#ST' + '\n' + st + '\n')
            writer.write('#profile' + '\n' + ' '.join(profile) + '\n')
            writer.flush()

        if scheme_mlst.split('_', 1)[0] == scheme.split('_', 1)[0]:
            pass_qc = True
        elif scheme == 'unknown' and scheme_mlst != '-':
            pass_qc = True
            warnings['sample'] = 'Found {scheme_mlst} scheme for a species with unknown scheme'.format(
                scheme_mlst=scheme_mlst)
        elif scheme == 'unknown' and scheme_mlst == '-':
            pass_qc = True
        elif species_genus == 'yersinia' and mlst_scheme_genus == 'yersinia':
            pass_qc = True
            warnings['sample'] = 'Found a Yersinia scheme ({scheme_mlst}), but it is different from what it was expected({scheme})'.format(
                scheme_mlst=scheme_mlst, scheme=scheme)
        else:
            failing['sample'] = 'MLST scheme found (' + scheme_mlst + ') and provided (' + scheme + ') are not the same'
            print(failing['sample'])
    else:
        # A mlst run failure is only reported as a warning and does not fail the QC
        warnings['sample'] = 'Did not run;'
        pass_qc = True

    if len(warnings) > 0:
        print(warnings['sample'])

    return run_successfully, pass_qc, failing, warnings
def download_with_aspera(aspera_file_path, aspera_key, outdir, pickle_prefix, sra, ena_id):
    """
    Download a file with Aspera (ascp) and pickle the success flag

    Parameters
    ----------
    aspera_file_path : str
        Remote "<host>:<path>" of the file, used when not downloading from SRA (prefixed with "era-fasp@")
    aspera_key : str
        Path to the Aspera private key (-i)
    outdir : str
        Download directory; the pickle is saved there too
    pickle_prefix : str
        Prefix of the pickle name
    sra : bool
        True to download "<ena_id>.sra" from NCBI SRA instead of aspera_file_path
    ena_id : str
        Run accession, used to build the SRA path and the pickle name when sra is True

    Notes
    -----
    Nothing is returned: the success flag is stored through utils.saveVariableToPickle.
    """
    # Index 4 (port option) and index 7 (remote source) are filled below
    command = ['ascp', '-QT', '-l', '300m', '', '-i', aspera_key, '', outdir]
    if not sra:
        command[4] = '-P33001'
        command[7] = str('era-fasp@' + aspera_file_path)
        pickle_name = pickle_prefix + '.' + aspera_file_path.rsplit('/', 1)[1]
    else:
        # NCBI anonymous Aspera account. The previous value was an e-mail obfuscation placeholder containing a
        # space, which is not a valid "user@host:" remote specification
        command[7] = 'anonftp@ftp.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/{a}/{b}/{c}/{c}.sra'.format(
            a=ena_id[:3], b=ena_id[:6], c=ena_id)
        pickle_name = pickle_prefix + '.' + ena_id

    # The third argument (3600) is presumably a timeout in seconds
    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, 3600, True)

    utils.saveVariableToPickle(run_successfully, outdir, pickle_name)
def countSequencedBases(fastq_file, outdir):
    """
    Count the number of sequenced bases of a compressed fastq file

    The sequence lines are selected with a chain of grep commands and counted twice with wc: once in characters
    and once in lines. The number of lines is subtracted from the number of characters to discard the line breaks.

    Parameters
    ----------
    fastq_file : str
        Path to the compressed fastq file
    outdir : str
        Directory where the pickle is saved

    Notes
    -----
    Nothing is returned: [success flag, number of bases] is stored through utils.saveVariableToPickle under
    "estimate_coverage.<fastq basename>". When the compression type is not recognised, [False, None] is stored.
    """
    counted_ok = False
    bases = None

    # Determine compression type
    compression = utils.compressionType(fastq_file)
    if compression is not None:
        # Everything but the wc option, which is appended for each count
        pipeline = [compression[1], '--keep', '--stdout', fastq_file, '|', 'grep', '--after-context=1', '"@"', '|',
                    'grep', '--invert-match', '"^--$"', '|', 'grep', '--invert-match', '"@"', '|', 'wc']

        # Number of characters
        counted_ok, chars_out, _ = utils.runCommandPopenCommunicate(pipeline + ['--chars'], True, None, False)
        if counted_ok:
            bases = int(chars_out.splitlines()[0])

            # Number of lines
            counted_ok, lines_out, _ = utils.runCommandPopenCommunicate(pipeline + ['--lines'], True, None, False)
            if counted_ok:
                bases = bases - int(lines_out.splitlines()[0])

    pickle_name = str('estimate_coverage.' + os.path.basename(fastq_file))
    utils.saveVariableToPickle([counted_ok, bases], outdir, pickle_name)
def sortAlignment(alignment_file, output_file, sortByName_True, threads):
    """
    Sort an alignment file using samtools sort

    Parameters
    ----------
    alignment_file : str
        Alignment file to sort
    output_file : str
        Sorted output file; its lower-cased extension is given to -O as output format
    sortByName_True : bool
        True to add -n (sort by read name)
    threads : int
        Value given to -@

    Returns
    -------
    run_successfully : bool
        Success flag reported by utils.runCommandPopenCommunicate
    output_file : str or None
        The output path, or None when sorting failed
    """
    extension = os.path.splitext(output_file)[1]
    samtools_sort = ['samtools', 'sort', '-o', output_file, '-O', extension[1:].lower()]
    # The name-sorting option slot stays empty when sorting by coordinate
    samtools_sort.append('-n' if sortByName_True else '')
    samtools_sort.extend(['-@', str(threads), alignment_file])

    sort_ok, _, _ = utils.runCommandPopenCommunicate(samtools_sort, False, None, True)
    if sort_ok:
        return sort_ok, output_file
    return sort_ok, None
def download_with_wget(ftp_file_path, outdir, pickle_prefix, sra, ena_id):
    """
    Download a file with wget (single try) and pickle the success flag

    Parameters
    ----------
    ftp_file_path : str
        URL of the file, used when not downloading from SRA
    outdir : str
        Download directory; the pickle is saved there too
    pickle_prefix : str
        Prefix of the pickle name
    sra : bool
        True to download "<ena_id>.sra" from the NCBI SRA ftp instead of ftp_file_path
    ena_id : str
        Run accession, used to build the SRA URL, the output name and the pickle name when sra is True

    Notes
    -----
    Nothing is returned: the success flag is stored through utils.saveVariableToPickle.
    """
    if sra:
        source_url = 'ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/{a}/{b}/{c}/{c}.sra'.format(
            a=ena_id[:3], b=ena_id[:6], c=ena_id)
        destination = os.path.join(outdir, ena_id + '.sra')
        pickle_name = pickle_prefix + '.' + ena_id
    else:
        source_url = ftp_file_path
        remote_name = ftp_file_path.rsplit('/', 1)[1]
        destination = os.path.join(outdir, remote_name)
        pickle_name = pickle_prefix + '.' + remote_name

    wget_command = ['wget', '--tries=1', source_url, '-O', destination]
    # The third argument (3600) is presumably a timeout in seconds
    downloaded_ok, _, _ = utils.runCommandPopenCommunicate(wget_command, False, 3600, True)
    utils.saveVariableToPickle(downloaded_ok, outdir, pickle_name)
def pilon(jar_path_pilon, assembly, bam_file, outdir, jarMaxMemory):
    """
    Polish an assembly with Pilon

    Parameters
    ----------
    jar_path_pilon : str
        Path to the Pilon jar file
    assembly : str
        Path to the assembly fasta file (--genome)
    bam_file : str
        Path to the alignment file (--frags)
    outdir : str
        Pilon output directory
    jarMaxMemory : number or str
        'off' to not set the Java heap, otherwise a number multiplied by 1024 to build -Xmx<N>M

    Returns
    -------
    run_successfully : bool
        Success flag reported by utils.runCommandPopenCommunicate
    assembly_polished : str or None
        "<assembly without extension>.polished.fasta", or None when the run failed
    """
    expected_output = os.path.splitext(assembly)[0] + '.polished.fasta'

    java_call = ['java', '', '-jar', jar_path_pilon]
    pilon_options = ['--genome', assembly, '--frags', bam_file, '--outdir', outdir, '--output',
                     os.path.basename(os.path.splitext(expected_output)[0]), '--changes', '--vcf']
    if str(jarMaxMemory) != 'off':
        # Optional Java heap setting, in megabytes
        java_call[1] = '-Xmx' + str(int(round(jarMaxMemory * 1024, 0))) + 'M'

    pilon_ok, _, _ = utils.runCommandPopenCommunicate(java_call + pilon_options, False, None, True)
    return pilon_ok, (expected_output if pilon_ok else None)
def sra_2_fastq(download_dir, ena_id):
    """
    Convert a downloaded .sra file into fastq files and rename the reads headers

    Parameters
    ----------
    download_dir : str
        Directory that contains "<ena_id>.sra"; the fastq files are written there too
    ena_id : str
        Run accession

    Returns
    -------
    run_successfully : bool
        False when fastq-dump failed, when a renaming worker raised, or when rename_header_sra returned a false
        value for any fastq file
    """
    # os.path.join gives the same path as plain concatenation when download_dir ends with a separator, and a
    # valid one when it does not
    sra_file = os.path.join(download_dir, ena_id + '.sra')
    command = ['fastq-dump', '-I', '-O', download_dir, '--split-files', sra_file]
    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, 3600, True)

    if run_successfully:
        files = [os.path.join(download_dir, f) for f in os.listdir(download_dir)
                 if not f.startswith('.') and os.path.isfile(os.path.join(download_dir, f)) and f.endswith('.fastq')]

        pool = multiprocessing.Pool(processes=2)
        try:
            results = []
            p = pool.map_async(rename_header_sra, files, callback=results.extend)
            p.wait()
            # The callback is skipped when a worker raises, which would leave results empty and all() True
            run_successfully = p.successful() and all(results)
        finally:
            # Release the worker processes
            pool.close()
            pool.join()

    return run_successfully
def create_vcf(bam_file, sequence_to_analyse, outdir, counter, reference_file):
    """
    Create an uncompressed VCF for one sequence with samtools mpileup

    Parameters
    ----------
    bam_file : str
        Alignment file
    sequence_to_analyse : str
        Sequence given to --region
    outdir : str
        Directory where the VCF file is written
    counter : int
        Number used in the VCF file name ("samtools_mpileup.sequence_<counter>.vcf")
    reference_file : str
        Reference fasta file given to --fasta-ref

    Returns
    -------
    run_successfully : bool
        Success flag reported by utils.runCommandPopenCommunicate
    gene_vcf : str or None
        Path to the VCF file, or None when samtools failed
    """
    vcf_name = 'samtools_mpileup.sequence_' + str(counter) + '.vcf'
    vcf_path = os.path.join(outdir, vcf_name)

    # No base or mapping quality filtering is applied at this stage
    filtering = ['--count-orphans', '--no-BAQ', '--min-BQ', '0', '--min-MQ', '0']
    inputs = ['--fasta-ref', reference_file, '--region', sequence_to_analyse]
    outputs = ['--output', vcf_path, '--VCF', '--uncompressed', '--output-tags', 'INFO/AD,AD,DP']
    mpileup_command = ['samtools', 'mpileup'] + filtering + inputs + outputs + [bam_file]

    vcf_ok, _, _ = utils.runCommandPopenCommunicate(mpileup_command, False, None, False)
    if vcf_ok:
        return vcf_ok, vcf_path
    return vcf_ok, None
def getScheme(species):
    """
    Get the MLST scheme that the installed mlst associates with a species

    Parameters
    ----------
    species : str
        Species name, with words separated by single spaces (e.g. "Genus species")

    Returns
    -------
    scheme : str
        Scheme returned by parse_species_scheme_map
    genus : str
        First word of the lower-cased species name
    genus_mlst_scheme
        Genus level scheme returned by parse_species_scheme_map
    """
    # The success flag of "which" is not checked
    _, which_stdout, _ = utils.runCommandPopenCommunicate(['which', 'mlst'], False, None, False)
    first_line = which_stdout.splitlines()[0]
    mlst_location = os.path.abspath(os.path.realpath(first_line))

    mlst_db_path, species_scheme_map_new = get_species_scheme_map_version(mlst_location)

    lowercase_species = species.lower()
    scheme, genus_mlst_scheme = parse_species_scheme_map(lowercase_species.split(' '), mlst_db_path,
                                                         species_scheme_map_new)

    message = 'MLST scheme found for {species}: {scheme}'.format(species=species, scheme=scheme)
    print('\n' + message)

    genus = lowercase_species.split(' ')[0]
    return scheme, genus, genus_mlst_scheme
def mapping_bowtie2(fastq_files, reference_file, outdir, keep_bam=False, threads=1):
    """
    Align reads to a reference fasta file with Bowtie2 (--very-sensitive-local)

    Parameters
    ----------
    fastq_files : list
        One fastq file (single-end, -U) or two fastq files (paired-end, -1/-2)
    reference_file : str
        Path to the reference fasta file, also used as Bowtie2 index prefix
    outdir : str
        Directory where alignment.sam is written
    keep_bam : bool, default False
        When False, --no-unal is added so that unaligned reads are left out of the output
    threads : int, default 1
        Number of threads to be used

    Returns
    -------
    run_successfully : bool
        False when either the indexing or the mapping failed
    sam_file : str or None
        Path to alignment.sam, or None when something failed
    """
    sam_path = os.path.join(outdir, str('alignment.sam'))

    # Index reference file
    index_ok = indexSequenceBowtie2(reference_file, threads)
    if not index_ok:
        return index_ok, None

    if len(fastq_files) == 1:
        reads_option = '-U ' + fastq_files[0]
    else:
        reads_option = '-1 ' + fastq_files[0] + ' -2 ' + fastq_files[1]
    unaligned_option = '' if keep_bam else '--no-unal'

    bowtie2_command = ['bowtie2', '-q', '--very-sensitive-local', '--threads', str(threads), '-x', reference_file,
                       reads_option, unaligned_option, '-S', sam_path]
    mapped_ok, _, _ = utils.runCommandPopenCommunicate(bowtie2_command, False, None, True)
    return mapped_ok, (sam_path if mapped_ok else None)
def mapping_bowtie2(fastq_files, reference_file, outdir, keep_bam=False, threads=1):
    """
    Align reads to a reference fasta file with Bowtie2 (--very-sensitive-local)

    Paired reads are expected in forward/reverse orientation (--fr) with fragment lengths between 0 (-I) and
    2000 (-X).

    Parameters
    ----------
    fastq_files : list
        One fastq file (single-end, -U) or two fastq files (paired-end, -1/-2)
    reference_file : str
        Path to the reference fasta file, also used as Bowtie2 index prefix
    outdir : str
        Directory where alignment.sam is written
    keep_bam : bool, default False
        When False, --no-unal is added so that unaligned reads are left out of the output
    threads : int, default 1
        Number of threads to be used

    Returns
    -------
    run_successfully : bool
        False when either the indexing or the mapping failed
    sam_file : str or None
        Path to alignment.sam, or None when something failed
    """
    sam_path = os.path.join(outdir, str('alignment.sam'))

    # Index reference file
    index_ok = indexSequenceBowtie2(reference_file, threads)
    if not index_ok:
        return index_ok, None

    if len(fastq_files) == 1:
        reads_option = '-U ' + fastq_files[0]
    else:
        reads_option = '-1 ' + fastq_files[0] + ' -2 ' + fastq_files[1]
    unaligned_option = '' if keep_bam else '--no-unal'

    bowtie2_command = ['bowtie2', '-q', '--very-sensitive-local', '--threads', str(threads), '-x', reference_file,
                       reads_option, unaligned_option, '--fr', '-I', '0', '-X', '2000', '-S', sam_path]
    mapped_ok, _, _ = utils.runCommandPopenCommunicate(bowtie2_command, False, None, True)
    return mapped_ok, (sam_path if mapped_ok else None)
def controlForZeroReads(fastq_files):
    """
    Check that the first fastq file holds at least one complete read

    The file is decompressed to stdout and its first four lines are requested.

    Parameters
    ----------
    fastq_files : list
        Fastq files; only the first one is inspected

    Returns
    -------
    not_empty_fastq : bool
        True when exactly four lines were obtained. False when fewer lines exist, when the command failed or
        when utils.compressionType returned None
    """
    first_fastq = fastq_files[0]
    compression = utils.compressionType(first_fastq)
    if compression is None:
        return False

    head_command = [compression[1], '--stdout', '--keep', first_fastq, '|', 'head', '-n', '4']
    read_ok, head_output, _ = utils.runCommandPopenCommunicate(head_command, True, None, False)
    if not read_ok:
        return False
    return len(head_output.splitlines()) == 4
def spades(spades_folder, threads, fastq_files, notUseCareful, maxMemory, minCoverageAssembly, kmers, assembled_se_reads):
    """
    Run SPAdes (assembler only) on paired-end reads

    Parameters
    ----------
    spades_folder : str
        SPAdes output directory (-o)
    threads : int
        Value given to --threads
    fastq_files : list
        Paired-end fastq files: [0] is given to -1 and [1] to -2
    notUseCareful : bool
        True to run without --careful
    maxMemory
        Value given to --memory
    minCoverageAssembly
        Value given to --cov-cutoff
    kmers : list
        K-mer sizes given to -k as a comma separated list; an empty list leaves -k out
    assembled_se_reads : str or None
        Single-end reads file given to -s; None leaves -s out

    Returns
    -------
    run_successfully : bool
        Success flag reported by utils.runCommandPopenCommunicate
    contigs : str
        "<spades_folder>/contigs.fasta" (returned even when the run failed)
    """
    contigs_path = os.path.join(spades_folder, 'contigs.fasta')

    careful_option = '' if notUseCareful else '--careful'
    kmers_option = str('-k ' + ','.join(map(str, kmers))) if len(kmers) > 0 else ''
    single_end_option = str('-s ' + assembled_se_reads) if assembled_se_reads is not None else ''

    spades_command = ['spades.py', careful_option, '--only-assembler', '--threads', str(threads), '--memory',
                      str(maxMemory), '--cov-cutoff', str(minCoverageAssembly), kmers_option, '-1', fastq_files[0],
                      '-2', fastq_files[1], single_end_option, '-o', spades_folder]
    assembled_ok, _, _ = utils.runCommandPopenCommunicate(spades_command, False, None, True)
    return assembled_ok, contigs_path
def getting_mapping_statistics(alignment_file):
    """
    Parse the output of samtools flagstat into a dictionary

    Each line is expected as "<qc passed> + <qc failed> <description>". The key is built from the description:
    anything from the first "(" on is removed, then the last space separated word is removed, and the remaining
    spaces are replaced by "_" (e.g. "in total (QC-passed ...)" gives "in_total").

    Parameters
    ----------
    alignment_file : str
        Alignment file

    Returns
    -------
    run_successfully : bool
        Success flag reported by utils.runCommandPopenCommunicate
    dict_mapping_statistics : dict
        {field: {'qc_passed': int, 'qc_failed': int}}; empty when samtools failed
    """
    command = ['samtools', 'flagstat', alignment_file]
    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, True)

    dict_mapping_statistics = {}
    if run_successfully:
        for line in stdout.splitlines():
            # Skip empty lines before indexing: ''.splitlines() is an empty list
            pieces = line.splitlines()
            if not pieces or len(pieces[0]) == 0:
                continue
            line = pieces[0].split(' ', 3)
            # NOTE(review): for descriptions without "(" this also drops the last word (e.g. "mapped" in
            # "with itself and mate mapped"). Kept as is, since the resulting keys may be used elsewhere
            field = line[3].split('(', 1)[0].rsplit(' ', 1)[0].replace(' ', '_')
            dict_mapping_statistics[field] = {'qc_passed': int(line[0]), 'qc_failed': int(line[2])}

    return run_successfully, dict_mapping_statistics
def mappingBowtie2(fastq_files, referenceFile, threads, outdir):
    """
    Align reads to a reference fasta file with Bowtie2, leaving unaligned reads out (--no-unal)

    Parameters
    ----------
    fastq_files : list
        One fastq file (single-end, -U) or two fastq files (paired-end, -1/-2)
    referenceFile : str
        Path to the reference fasta file, also used as Bowtie2 index prefix
    threads : int
        Number of threads to be used
    outdir : str
        Directory where alignment.sam is written

    Returns
    -------
    run_successfully : bool
        False when either the indexing or the mapping failed
    sam_file : str or None
        Path to alignment.sam, or None when something failed
    """
    sam_file = os.path.join(outdir, str('alignment.sam'))

    # Index reference file
    run_successfully = indexSequenceBowtie2(referenceFile, threads)

    if run_successfully:
        command = ['bowtie2', '-q', '--very-sensitive-local', '--threads', str(threads), '-x', referenceFile, '',
                   '--no-unal', '-S', sam_file]

        # The reads go into the empty slot at index 7. Writing them to index 8 replaced '--no-unal', which was
        # therefore never passed to bowtie2
        if len(fastq_files) == 1:
            command[7] = '-U ' + fastq_files[0]
        else:
            command[7] = '-1 ' + fastq_files[0] + ' -2 ' + fastq_files[1]

        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)

    if not run_successfully:
        sam_file = None

    return run_successfully, sam_file
def runMlst(contigs, scheme, outdir, species_genus, mlst_scheme_genus):
    """
    Run mlst on the contigs and compare the scheme found with the expected one

    Parameters
    ----------
    contigs : str
        Path to the contigs fasta file
    scheme : str
        Expected MLST scheme ('unknown' when the species has none)
    outdir : str
        Directory where mlst_report.txt is written
    species_genus : str
        Genus of the species under analysis
    mlst_scheme_genus : str or None
        Genus level MLST scheme, if any

    Returns
    -------
    run_successfully : bool
        Success flag of the mlst run
    pass_qc : bool
        True when the scheme found is accepted, and also when mlst did not run
    failing : dict
        failing['sample'] is False or the reason for failing
    warnings : dict
        warnings['sample'] holds a warning message when there is one
    """
    pass_qc = False
    failing = {'sample': False}
    warnings = {}

    command = ['mlst', contigs]
    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)

    if run_successfully:
        # First output line, tab separated: file, scheme, ST, alleles...
        fields = stdout.splitlines()[0].split('\t')
        scheme_mlst = fields[1].split('_')[0]
        st = fields[2]
        profile = fields[3:]

        report = 'MLST found ST ' + str(st) + ' from scheme ' + scheme_mlst
        # print() with a single argument gives the same output on Python 2 and 3
        print(report)
        with open(os.path.join(outdir, 'mlst_report.txt'), 'wt') as writer:
            writer.write('#scheme' + '\n' + scheme_mlst + '\n' + '#ST' + '\n' + st + '\n')
            writer.write('#profile' + '\n' + ' '.join(profile) + '\n')
            writer.flush()

        if scheme_mlst.split('_', 1)[0] == scheme.split('_', 1)[0]:
            pass_qc = True
        elif scheme == 'unknown' and scheme_mlst != '-':
            pass_qc = True
            warnings['sample'] = 'Found {scheme_mlst} scheme for a species with unknown scheme'.format(
                scheme_mlst=scheme_mlst)
        elif scheme == 'unknown' and scheme_mlst == '-':
            pass_qc = True
        elif species_genus == 'yersinia' and mlst_scheme_genus == 'yersinia':
            pass_qc = True
            warnings['sample'] = 'Found a Yersinia scheme ({scheme_mlst}), but it is different from what it was expected ({scheme})'.format(
                scheme_mlst=scheme_mlst, scheme=scheme)
        else:
            failing['sample'] = 'MLST scheme found (' + scheme_mlst + ') and provided (' + scheme + ') are not the same'
            print(failing['sample'])
    else:
        # A mlst run failure is only reported as a warning and does not fail the QC
        warnings['sample'] = 'Did not run;'
        pass_qc = True

    if len(warnings) > 0:
        print(warnings['sample'])

    return run_successfully, pass_qc, failing, warnings
def fastQC(fastqc_folder, threads, adaptersFasta, fastq_files):
    """
    Run FastQC on the fastq files

    Parameters
    ----------
    fastqc_folder : str
        FastQC output directory; a temporary directory is created inside it and removed after the run
    threads : int
        Value given to --threads
    adaptersFasta : str or None
        Adapters fasta file. When given, it is converted with adapters2fastQC and passed to --adapters
    fastq_files : list
        Fastq files to analyse

    Returns
    -------
    run_successfully : bool
        Success flag reported by utils.runCommandPopenCommunicate
    """
    # Create temporary FastQC folder
    temporary_dir = os.path.join(fastqc_folder, 'temp.fastqc_temporary_dir', '')
    os.mkdir(temporary_dir)

    # Run FastQC
    # Index 9 is the optional --adapters slot
    command = ['fastqc', '-o', fastqc_folder, '--extract', '--nogroup', '--format', 'fastq', '--threads',
               str(threads), '', '--dir', temporary_dir]
    command = command + fastq_files

    adaptersTEMP = None
    if adaptersFasta is not None:
        adaptersTEMP = adapters2fastQC(fastqc_folder, adaptersFasta)
        # print() with a single argument gives the same output on Python 2 and 3
        print('Scanning for adapters contamination using ' + adaptersFasta)
        command[9] = '--adapters ' + adaptersTEMP

    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)

    # Remove temporary files
    os.rmdir(temporary_dir)
    if adaptersFasta is not None:
        os.remove(adaptersTEMP)

    return run_successfully
def compute_consensus_sequence(reference_file, sequence_to_analyse, compressed_vcf_file, outdir, sufix):
    """
    Compute the consensus sequence of one reference sequence with bcftools consensus

    The sequence is first extracted from the reference into "<outdir>/<sequence_to_analyse>.fasta" and the
    variants of the VCF file are then applied to it.

    Parameters
    ----------
    reference_file : str
        Reference fasta file
    sequence_to_analyse : str
        Name of the sequence to extract
    compressed_vcf_file : str
        VCF file given to bcftools consensus
    outdir : str
        Directory where the extracted fasta file is written
    sufix : str
        Not used by this function

    Returns
    -------
    run_successfully : bool
        False when either the extraction or bcftools failed
    sequence_dict : dict or None
        Result of parse_fasta_inMemory on the bcftools output, or None when something failed
    """
    extracted_fasta = os.path.join(outdir, str(sequence_to_analyse + '.fasta'))

    extracted_ok, _ = index_fasta_samtools(reference_file, sequence_to_analyse, extracted_fasta, False)
    if not extracted_ok:
        return extracted_ok, None

    consensus_command = ['bcftools', 'consensus', '-f', extracted_fasta, compressed_vcf_file]
    consensus_ok, consensus_fasta, _ = utils.runCommandPopenCommunicate(consensus_command, False, None, False)
    if not consensus_ok:
        return consensus_ok, None

    return consensus_ok, parse_fasta_inMemory(consensus_fasta)
def fastQintegrity(fastq, outdir):
    """
    Check that a compressed fastq file can be decompressed and guess its encoding and reads lengths

    The file is decompressed into a temporary file in outdir, which is removed at the end.

    Parameters
    ----------
    fastq : str
        Path to the compressed fastq file
    outdir : str
        Directory for the temporary file and for the pickle

    Notes
    -----
    Nothing is returned: [success flag, encoding, minimum reads length, maximum reads length] is stored through
    utils.saveVariableToPickle under the fastq basename. The last three values are None when the file could not
    be decompressed.
    """
    decompressed_ok = False
    encoding = None
    min_reads_length = None
    max_reads_length = None

    fastq_name = os.path.basename(fastq)
    temporary_fastq = os.path.join(outdir, os.path.splitext(fastq_name)[0])

    compression = utils.compressionType(fastq)
    if compression is not None:
        # Output redirection ('>') is why the command is run with the shell flag set
        decompress_command = [compression[1], '--stdout', '--keep', fastq, '>', temporary_fastq]
        decompressed_ok, _, _ = utils.runCommandPopenCommunicate(decompress_command, True, None, False)

        if decompressed_ok:
            encoding, min_reads_length, max_reads_length = run_guess_encoding_single_thread(temporary_fastq, None,
                                                                                            outdir)

    if os.path.isfile(temporary_fastq):
        os.remove(temporary_fastq)

    utils.saveVariableToPickle([decompressed_ok, encoding, min_reads_length, max_reads_length], outdir, fastq_name)
def controlForZeroReads(fastq_files):
    """
    Tell whether the first fastq file contains at least one complete read

    The file is decompressed to stdout and its first four lines are requested.

    Parameters
    ----------
    fastq_files : list
        Fastq files; only the first one is inspected

    Returns
    -------
    not_empty_fastq : bool
        True when exactly four lines were obtained. False when fewer lines exist, when the command failed or
        when utils.compressionType returned None
    """
    has_reads = False

    fastq_to_check = fastq_files[0]
    compression = utils.compressionType(fastq_to_check)
    if compression is not None:
        decompress = [compression[1], '--stdout', '--keep', fastq_to_check]
        first_lines = ['|', 'head', '-n', '4']
        command_ok, output, _ = utils.runCommandPopenCommunicate(decompress + first_lines, True, None, False)
        if command_ok and len(output.splitlines()) == 4:
            has_reads = True

    return has_reads
def getting_mapping_statistics(alignment_file):
    """
    Parse the output of samtools flagstat into a dictionary

    Each line is expected as "<qc passed> + <qc failed> <description>". The key is built from the description:
    anything from the first "(" on is removed, then the last space separated word is removed, and the remaining
    spaces are replaced by "_" (e.g. "in total (QC-passed ...)" gives "in_total").

    Parameters
    ----------
    alignment_file : str
        Alignment file

    Returns
    -------
    run_successfully : bool
        Success flag reported by utils.runCommandPopenCommunicate
    dict_mapping_statistics : dict
        {field: {'qc_passed': int, 'qc_failed': int}}; empty when samtools failed
    """
    command = ['samtools', 'flagstat', alignment_file]
    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, True)

    dict_mapping_statistics = {}
    if run_successfully:
        for line in stdout.splitlines():
            # Skip empty lines before indexing: ''.splitlines() is an empty list
            pieces = line.splitlines()
            if not pieces or len(pieces[0]) == 0:
                continue
            line = pieces[0].split(' ', 3)
            # NOTE(review): for descriptions without "(" this also drops the last word (e.g. "mapped" in
            # "with itself and mate mapped"). Kept as is, since the resulting keys may be used elsewhere
            field = line[3].split('(', 1)[0].rsplit(' ', 1)[0].replace(' ', '_')
            dict_mapping_statistics[field] = {
                'qc_passed': int(line[0]),
                'qc_failed': int(line[2])
            }

    return run_successfully, dict_mapping_statistics
def mappingBowtie2(fastq_files, referenceFile, threads, outdir):
    """
    Align reads to a reference fasta file with Bowtie2 (--very-sensitive-local)

    Parameters
    ----------
    fastq_files : list
        One fastq file (single-end, -U) or two fastq files (paired-end, -1/-2)
    referenceFile : str
        Path to the reference fasta file, also used as Bowtie2 index prefix
    threads : int
        Number of threads to be used
    outdir : str
        Directory where alignment.sam is written

    Returns
    -------
    run_successfully : bool
        False when either the indexing or the mapping failed
    sam_file : str or None
        Path to alignment.sam, or None when something failed
    """
    sam_file = os.path.join(outdir, str('alignment.sam'))

    # Index reference file
    run_successfully = indexSequenceBowtie2(referenceFile, threads)

    if run_successfully:
        command = [
            'bowtie2', '-q', '--very-sensitive-local', '--threads',
            str(threads), '-x', referenceFile, '', '-S', sam_file
        ]

        # The reads go into the empty slot at index 7 (index 8 holds '-S', which must be kept). For single-end
        # data the file name is the first list element: concatenating the list itself raised TypeError
        if len(fastq_files) == 1:
            command[7] = '-U ' + fastq_files[0]
        else:
            command[7] = '-1 ' + fastq_files[0] + ' -2 ' + fastq_files[1]

        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)

    if not run_successfully:
        sam_file = None

    return run_successfully, sam_file
def controlForZeroReads(fastq_files):
    """
    Check that the first fastq file holds at least one complete read

    The file is decompressed to stdout with gunzip or bunzip2 and its first four lines are requested.

    Parameters
    ----------
    fastq_files : list
        Fastq files; only the first one is inspected

    Returns
    -------
    not_empty_fastq : bool
        True when exactly four lines were obtained. False when fewer lines exist, when the command failed or
        when utils.compressionType returned neither 'gz' nor 'bz2'
    """
    first_fastq = fastq_files[0]

    file_type = utils.compressionType(first_fastq)
    if file_type == 'gz':
        decompressor = 'gunzip'
    elif file_type == 'bz2':
        decompressor = 'bunzip2'
    else:
        # Unsupported compression: nothing is run
        return False

    head_command = [decompressor, '--stdout', '--keep', first_fastq, '|', 'head', '-n', '4']
    read_ok, head_output, _ = utils.runCommandPopenCommunicate(head_command, True, None, False)
    if not read_ok:
        return False
    return len(head_output.splitlines()) == 4
def spades(spades_folder, threads, fastq_files, notUseCareful, maxMemory, minCoverageAssembly, kmers):
    """
    Run SPAdes (assembler only) on paired-end reads

    Parameters
    ----------
    spades_folder : str
        SPAdes output directory (-o)
    threads : int
        Value given to --threads
    fastq_files : list
        Paired-end fastq files: [0] is given to -1 and [1] to -2
    notUseCareful : bool
        True to run without --careful
    maxMemory
        Value given to --memory
    minCoverageAssembly
        Value given to --cov-cutoff
    kmers : list
        K-mer sizes given to -k as a comma separated list; an empty list leaves -k out

    Returns
    -------
    run_successfully : bool
        Success flag reported by utils.runCommandPopenCommunicate
    contigs : str
        "<spades_folder>/contigs.fasta" (returned even when the run failed)
    """
    contigs_path = os.path.join(spades_folder, 'contigs.fasta')

    careful_option = '' if notUseCareful else '--careful'
    kmers_option = str('-k ' + ','.join(map(str, kmers))) if len(kmers) > 0 else ''

    spades_command = ['spades.py', careful_option, '--only-assembler', '--threads', str(threads), '--memory',
                      str(maxMemory), '--cov-cutoff', str(minCoverageAssembly), kmers_option, '-1', fastq_files[0],
                      '-2', fastq_files[1], '-o', spades_folder]
    assembled_ok, _, _ = utils.runCommandPopenCommunicate(spades_command, False, None, True)
    return assembled_ok, contigs_path
def fastQintegrity(fastq, outdir):
    """
    Check that a compressed fastq file can be decompressed

    The file is decompressed with gunzip or bunzip2 into a temporary file in outdir, which is removed at the end.

    Parameters
    ----------
    fastq : str
        Path to the compressed fastq file
    outdir : str
        Directory for the temporary file and for the pickle

    Notes
    -----
    Nothing is returned: the success flag is stored through utils.saveVariableToPickle under the fastq
    basename. It is False when utils.compressionType returned neither 'gz' nor 'bz2'.
    """
    integrity_ok = False

    fastq_name = os.path.basename(fastq)
    temporary_fastq = os.path.join(outdir, os.path.splitext(fastq_name)[0])

    file_type = utils.compressionType(fastq)
    decompressor = ''
    if file_type == 'gz':
        decompressor = 'gunzip'
    elif file_type == 'bz2':
        decompressor = 'bunzip2'

    if decompressor != '':
        # Output redirection ('>') is why the command is run with the shell flag set
        decompress_command = [decompressor, '--stdout', '--keep', fastq, '>', temporary_fastq]
        integrity_ok, _, _ = utils.runCommandPopenCommunicate(decompress_command, True, None, False)

    if os.path.isfile(temporary_fastq):
        os.remove(temporary_fastq)

    utils.saveVariableToPickle(integrity_ok, outdir, fastq_name)
def alignmentToFastq(alignment_file, outdir, threads, pair_end_type):
    """
    Convert an alignment file into fastq file(s) with samtools fastq

    The alignment is first sorted by read name into a temporary BAM file, which is removed at the end.

    Parameters
    ----------
    alignment_file : str
        Alignment file to convert; the fastq files are named after it
    outdir : str
        Not used by this function
    threads : int
        Number of threads used for sorting
    pair_end_type : str
        'paired' or 'single' (case insensitive)

    Returns
    -------
    run_successfully : bool
        False when either the sorting or the conversion failed
    outfiles : list or None
        ["<prefix>_1.fq", "<prefix>_2.fq"] for paired data, ["<prefix>.fq"] for single data; None when something
        failed or when pair_end_type is neither of the two
    """
    fastq_basename = os.path.splitext(alignment_file)[0]

    outfiles = None

    # Kept apart from the value returned by sortAlignment (None on failure), so that the clean-up below always
    # receives a path
    temporary_bam = fastq_basename + '.temp.bam'

    # sort cram
    run_successfully, bamFile = sortAlignment(alignment_file, temporary_bam, True, threads)

    if run_successfully:
        # Normalise once, so that the command and the reported files always agree
        layout = pair_end_type.lower()
        if layout == 'paired':
            expected_files = [str(fastq_basename + '_1.fq'), str(fastq_basename + '_2.fq')]
            reads_option = '-1 ' + expected_files[0] + ' -2 ' + expected_files[1]
        elif layout == 'single':
            expected_files = [str(fastq_basename + '.fq')]
            reads_option = '-0 ' + expected_files[0]
        else:
            expected_files = None
            reads_option = ''

        command = ['samtools', 'fastq', reads_option, bamFile]
        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)

        if run_successfully:
            outfiles = expected_files

    if os.path.isfile(temporary_bam):
        os.remove(temporary_bam)

    return run_successfully, outfiles
def indexAlignment(alignment_file):
    """
    Index an alignment file with samtools index

    Parameters
    ----------
    alignment_file : str
        Alignment file to index

    Returns
    -------
    run_successfully : bool
        Success flag reported by utils.runCommandPopenCommunicate
    """
    outcome = utils.runCommandPopenCommunicate(['samtools', 'index', alignment_file], False, None, True)
    # The command output is not needed, only the success flag
    indexed_ok, _, _ = outcome
    return indexed_ok
def getBlastPath():
    """
    Print the location of the blastn executable that will be used

    The output of "which blastn" is printed as it is; nothing is returned and the success flag is not checked.
    """
    # print() with a single argument gives the same output on Python 2 and 3
    print('\n' + 'The following blastn will be used')
    command = ['which', 'blastn']
    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
    print(stdout)
def compute_genome_coverage_data(alignment_file, sequence_to_analyse, outdir, counter):
    """
    Write the depth of coverage of every position of one sequence with samtools depth

    Parameters
    ----------
    alignment_file : str
        Alignment file
    sequence_to_analyse : str
        Sequence given to -r
    outdir : str
        Directory where the output file is written
    counter : int
        Number used in the output file name ("samtools_depth.sequence_<counter>.tab")

    Returns
    -------
    run_successfully : bool
        Success flag reported by utils.runCommandPopenCommunicate
    genome_coverage_data_file : str
        Path to the output file (returned even when the run failed)
    """
    depth_name = 'samtools_depth.sequence_' + str(counter) + '.tab'
    depth_file = os.path.join(outdir, depth_name)

    # Output redirection ('>') is why the command is run with the shell flag set
    depth_command = ['samtools', 'depth', '-a', '-r', sequence_to_analyse, alignment_file]
    depth_command += ['>', depth_file]

    depth_ok, _, _ = utils.runCommandPopenCommunicate(depth_command, True, None, False)
    return depth_ok, depth_file
def curl_installed():
    """
    Check whether curl can be found with "which curl"

    Returns
    -------
    run_successfully : bool
        Success flag reported by utils.runCommandPopenCommunicate for the "which curl" command
    """
    outcome = utils.runCommandPopenCommunicate(['which', 'curl'], False, None, False)
    # Only the success flag matters; the printed path is discarded
    found, _, _ = outcome
    return found
def runMlst(contigs, scheme, outdir, species_genus, mlst_scheme_genus):
    """Run ``mlst`` on ``contigs`` and compare the scheme found with ``scheme``.

    Only the first line of the ``mlst`` output is used: column 2 gives the
    scheme (text before the first underscore), column 3 the ST, and the
    remaining columns the allele profile. A short report is written to
    ``<outdir>/mlst_report.txt``. The novel-alleles FASTA is passed to
    ``clean_novel_alleles`` when the ST is '-', otherwise it is deleted.

    Returns ``(run_successfully, pass_qc, failing, warnings)``.
    ``failing['sample']`` is ``False`` or a message; ``warnings`` gets a
    'sample' entry only when there is something to report.
    """
    pass_qc = False
    failing = {'sample': False}
    warnings = {}

    novel_alleles = os.path.join(outdir, 'mlst_novel_alleles.fasta')
    run_successfully, stdout, _ = utils.runCommandPopenCommunicate(
        ['mlst', '--novel', novel_alleles, contigs], False, None, True)

    if not run_successfully:
        failing['sample'] = 'Did not run'
    else:
        fields = stdout.splitlines()[0].split('\t')
        scheme_mlst = fields[1].split('_')[0]
        st = fields[2]
        profile = fields[3:]

        # Keep the novel alleles only when no ST could be assigned.
        if os.path.isfile(novel_alleles):
            if st == '-':
                clean_novel_alleles(novel_alleles=novel_alleles, scheme_mlst=scheme_mlst, profile=profile)
            else:
                os.remove(novel_alleles)

        print('MLST found ST ' + str(st) + ' from scheme ' + scheme_mlst)

        report_path = os.path.join(outdir, 'mlst_report.txt')
        with open(report_path, 'wt') as writer:
            writer.write('#scheme\n' + scheme_mlst + '\n' + '#ST\n' + st + '\n')
            writer.write('#profile\n' + ' '.join(profile) + '\n')
            writer.flush()

        # scheme_mlst already holds only the text before the first underscore.
        if scheme_mlst == scheme.split('_', 1)[0]:
            pass_qc = True
        elif scheme == 'unknown':
            pass_qc = True
            if scheme_mlst != '-':
                warnings['sample'] = 'Found {scheme_mlst} scheme for a species with unknown scheme'.format(
                    scheme_mlst=scheme_mlst)
        elif scheme_mlst == '-':
            pass_qc = True
            warnings['sample'] = 'Could not find a scheme for a species with known scheme ({})'.format(scheme)
        elif species_genus == 'yersinia' and mlst_scheme_genus == 'yersinia':
            pass_qc = True
            warnings['sample'] = ('Found a Yersinia scheme ({scheme_mlst}), but it is different from what it was'
                                  ' expected ({scheme})').format(scheme_mlst=scheme_mlst, scheme=scheme)
        elif mlst_scheme_genus is not None and scheme_mlst == scheme == mlst_scheme_genus:
            pass_qc = True
        else:
            failing['sample'] = ('MLST scheme found ({scheme_mlst}) and provided ({scheme}) are not the'
                                 ' same').format(scheme_mlst=scheme_mlst, scheme=scheme)
            print(failing['sample'])

    if warnings:
        print(warnings['sample'])

    return run_successfully, pass_qc, failing, warnings
def downloadWithFtp(ftp_file_path, outdir, pickle_prefix):
    """Fetch ``ftp_file_path`` into ``outdir`` with ``wget`` and pickle the outcome.

    The local file keeps the name that follows the last '/' of the remote
    path. The success flag is passed to ``utils.saveVariableToPickle`` under
    ``<pickle_prefix>.<file name>``.
    """
    remote_name = ftp_file_path.rsplit('/', 1)[1]
    destination = os.path.join(outdir, remote_name)

    wget_command = ['wget', ftp_file_path, '-O', destination]
    # 3600 is presumably a timeout in seconds -- confirm against utils.
    downloaded, _, _ = utils.runCommandPopenCommunicate(wget_command, False, 3600, True)

    pickle_name = str(pickle_prefix + '.' + remote_name)
    utils.saveVariableToPickle(downloaded, outdir, pickle_name)
def downloadWithAspera(aspera_file_path, asperaKey, outdir, pickle_prefix):
    """Fetch ``aspera_file_path`` into ``outdir`` with ``ascp`` and pickle the outcome.

    The transfer runs as ``ascp -QT -l 300m -i <asperaKey>`` from
    ``era-fasp@<aspera_file_path>``. The success flag is passed to
    ``utils.saveVariableToPickle`` under ``<pickle_prefix>.<file name>``,
    where the file name is the text after the last '/' of the remote path.
    """
    source = str('era-fasp@' + aspera_file_path)
    ascp_command = ['ascp', '-QT', '-l', '300m', '-i', asperaKey, source, outdir]
    # 3600 is presumably a timeout in seconds -- confirm against utils.
    downloaded, _, _ = utils.runCommandPopenCommunicate(ascp_command, False, 3600, True)

    remote_name = aspera_file_path.rsplit('/', 1)[1]
    pickle_name = str(pickle_prefix + '.' + remote_name)
    utils.saveVariableToPickle(downloaded, outdir, pickle_name)
def getBlastPath():
    """Announce which ``blastn`` will be used by printing the output of ``which blastn``."""
    print('\nThe following blastn will be used')
    _, blastn_location, _ = utils.runCommandPopenCommunicate(['which', 'blastn'], False, None, True)
    print(blastn_location)
def run_pear(decompressed_reads_list, sample_name, threads, outdir, fastq_encoding, trimmomatic_run_successfully,
             minimum_overlap_reads):
    """Merge overlapping read pairs with PEAR and QC the fraction that merged.

    Parameters
    ----------
    decompressed_reads_list
        Forward (index 0) and reverse (index 1) FASTQ files.
    sample_name : str
        Prefix for the PEAR outputs and for ``<sample_name>.pear_out.txt``.
    threads
        Used for PEAR's ``--threads``, for ``--memory`` (``<threads>G``) and
        as the size of the compression pool.
    outdir : str
        Directory that receives every output.
    fastq_encoding
        ``None`` or an indexable whose element 1 is used as the Phred base.
        Ignored when ``trimmomatic_run_successfully`` is true (33 is used).
    trimmomatic_run_successfully : bool
        Whether the reads come from a successful Trimmomatic run.
    minimum_overlap_reads
        Value for PEAR's ``--min-assembly-length``.

    Returns
    -------
    tuple
        ``(run_successfully, pass_qc, failing, assembled_se_reads,
        unassembled_pe_reads, assembled_reads, unassembled_reads,
        discarded_reads)``. ``pass_qc`` is true when fewer than 75% of the
        assembled plus unassembled reads were assembled. ``failing`` stays
        empty when PEAR itself did not run successfully.
    """
    pass_qc = False
    failing = {}

    command = ['pear',
               '--forward-fastq', decompressed_reads_list[0],
               '--reverse-fastq', decompressed_reads_list[1],
               '--output', os.path.join(outdir, sample_name),
               '--p-value', str(1.0),
               '--min-assembly-length', str(minimum_overlap_reads),
               '--phred-base', '',
               '--cap', str(0),
               '--threads', str(threads),
               '--memory', str(str(threads) + 'G'),
               '--keep-original']

    # command[12] is the value that follows '--phred-base'.
    if trimmomatic_run_successfully:
        command[12] = '33'
        # NOTE(review): this call passes True as second argument while the
        # other PEAR calls pass False -- confirm this is intentional.
        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, True)
    elif fastq_encoding is not None:
        command[12] = str(fastq_encoding[1])
        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
    else:
        # Encoding unknown: try Phred+33 first and fall back to Phred+64.
        print('Pear fail! Trying run with Phred+33 enconding defined...')
        command[12] = '33'
        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
        if not run_successfully:
            print('Pear fail again! Trying run with Phred+64 enconding defined...')
            command[12] = '64'
            run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)

    with open(os.path.join(outdir, str(sample_name + '.pear_out.txt')), 'wt') as writer:
        writer.writelines(stdout)

    unassembled_pe_reads, assembled_se_reads = None, None
    assembled_reads, unassembled_reads, discarded_reads = None, None, None

    if run_successfully:
        assembled_reads, unassembled_reads, discarded_reads = parse_pearOutput_getAssembled(stdout)

        unassembled_pe_reads_uncompressed, assembled_se_reads_uncompressed = get_pear_output(outdir, sample_name)

        if assembled_reads == 0:
            assembled_se_reads = None
        else:
            compress_decompress(str(assembled_se_reads_uncompressed + '.gz'), assembled_se_reads_uncompressed, True)
            run_successfully, assembled_se_reads = get_compressed_decompressed_reads(outdir)
            assembled_se_reads = assembled_se_reads[0] if run_successfully else assembled_se_reads

        if unassembled_reads == 0:
            unassembled_pe_reads = None
        elif run_successfully:
            pool = multiprocessing.Pool(processes=threads)
            for fastq in unassembled_pe_reads_uncompressed:
                pool.apply_async(compress_decompress, args=(str(fastq + '.gz'), fastq, True,))
            pool.close()
            pool.join()

            run_successfully, unassembled_pe_reads = get_compressed_decompressed_reads(outdir)

        # The uncompressed PEAR outputs are no longer needed.
        os.remove(assembled_se_reads_uncompressed)
        for fastq in unassembled_pe_reads_uncompressed:
            os.remove(fastq)

        # NOTE(review): raises ZeroDivisionError when both counts are 0.
        if float(assembled_reads) / float(assembled_reads + unassembled_reads) < 0.75:
            pass_qc = True
            failing['sample'] = False
        else:
            failing['sample'] = 'Number of overlapping reads is >= 75% of total reads'
            print(failing)

    return (run_successfully, pass_qc, failing, assembled_se_reads, unassembled_pe_reads, assembled_reads,
            unassembled_reads, discarded_reads)
def indexAlignment(alignment_file):
    """Index ``alignment_file`` with ``samtools index`` and return the success flag."""
    outcome = utils.runCommandPopenCommunicate(['samtools', 'index', alignment_file], False, None, True)
    indexed, _stdout, _stderr = outcome
    return indexed
def trimmomatic(jar_path_trimmomatic, sampleName, trimmomatic_folder, threads, adaptersFasta, script_path,
                doNotSearchAdapters, fastq_files, maxReadsLength, doNotTrimCrops, crop, headCrop, leading, trailing,
                slidingWindow, minLength, nts2clip_based_ntsContent, jarMaxMemory, fastq_encoding):
    """Build and run the Trimmomatic paired-end command for ``fastq_files``.

    Parameters
    ----------
    jar_path_trimmomatic : str
        Path of the Trimmomatic jar.
    sampleName
        Not used by this function; kept so existing callers keep working.
    trimmomatic_folder : str
        Directory for the ``*P.fastq.gz`` / ``*U.fastq.gz`` outputs and for
        the concatenated adapters file.
    threads
        Value for ``-threads``.
    adaptersFasta : str or None
        Adapters FASTA. When ``None``, the adapter files under
        ``<script dir>/src/Trimmomatic-0.36/adapters`` are concatenated.
    script_path : str
        Path whose directory holds the ``src`` folder.
    doNotSearchAdapters : bool
        Skip the ILLUMINACLIP step when true.
    fastq_files : list
        Input FASTQ files.
    maxReadsLength : int or None
        Length from which the CROP value is derived. ``None`` disables CROP.
    doNotTrimCrops : bool
        Skip both CROP and HEADCROP when true.
    crop, headCrop
        ``None`` or indexables whose first element is the number of
        nucleotides to clip at the end / beginning of the reads.
    leading, trailing, slidingWindow, minLength
        Values for the LEADING, TRAILING, SLIDINGWINDOW and MINLEN steps.
    nts2clip_based_ntsContent
        ``None`` or an indexable giving the nucleotides to clip at the
        beginning (index 0) and end (index 1) when ``headCrop`` / ``crop``
        are not given.
    jarMaxMemory
        'off', or a number that is multiplied by 1024 and passed to Java as
        ``-Xmx<value>M``.
    fastq_encoding : int or None
        33 or 64 to set ``-phred33`` / ``-phred64`` for the first attempt.

    Returns
    -------
    bool
        Whether one of the Trimmomatic attempts succeeded.
    """
    trimmomatic_out_files = []
    for fastq in fastq_files:
        # Strip two extensions (e.g. '.fastq.gz') from the file name.
        stem = os.path.splitext(os.path.splitext(os.path.basename(fastq))[0])[0]
        trimmomatic_out_files.append(os.path.join(trimmomatic_folder, str(stem + 'P.fastq.gz')))
        trimmomatic_out_files.append(os.path.join(trimmomatic_folder, str(stem + 'U.fastq.gz')))

    # Run Trimmomatic
    # Empty slots are filled in below: [1] -Xmx, [7] -phredNN, [10] CROP,
    # [11] HEADCROP, [12] ILLUMINACLIP.
    command = ['java', '', '-jar', jar_path_trimmomatic, 'PE', '-threads', str(threads), '',
               ' '.join(fastq_files), ' '.join(trimmomatic_out_files), '', '', '',
               str('SLIDINGWINDOW:' + slidingWindow), str('LEADING:' + str(leading)),
               str('TRAILING:' + str(trailing)), str('MINLEN:' + str(minLength)), 'TOPHRED33']

    if str(jarMaxMemory) != 'off':
        command[1] = '-Xmx' + str(int(round(jarMaxMemory * 1024, 0))) + 'M'

    if not doNotTrimCrops:
        if maxReadsLength is not None:
            # CROP takes the read length to keep, not the number of
            # nucleotides to remove.
            if crop is not None:
                crop = maxReadsLength - crop[0]
                command[10] = str('CROP:' + str(crop))
            elif nts2clip_based_ntsContent is not None:
                crop = nts2clip_based_ntsContent[1]
                print(str(crop) + ' nucleotides will be clipped at the end of reads')
                crop = maxReadsLength - crop
                command[10] = str('CROP:' + str(crop))
        else:
            print('Because FastQC did not run successfully, --trimCrop option will not be considered')

        if headCrop is not None:
            command[11] = str('HEADCROP:' + str(headCrop[0]))
        elif nts2clip_based_ntsContent is not None:
            headCrop = nts2clip_based_ntsContent[0]
            print(str(headCrop) + ' nucleotides will be clipped at the beginning of reads')
            command[11] = str('HEADCROP:' + str(headCrop))

    if not doNotSearchAdapters:
        if adaptersFasta is not None:
            print('Removing adapters contamination using ' + adaptersFasta)
        else:
            trimmomatic_adapters_folder = os.path.join(os.path.dirname(script_path), 'src', 'Trimmomatic-0.36',
                                                       'adapters')
            adapters_files = [
                os.path.join(trimmomatic_adapters_folder, 'Nextera_XT_INNUca.fasta'),
                # os.path.join(trimmomatic_adapters_folder, 'NexteraPE-PE.fa'),
                os.path.join(trimmomatic_adapters_folder, 'TruSeq2-PE.fa'),
                os.path.join(trimmomatic_adapters_folder, 'TruSeq3-PE-2.fa')
            ]
            print('Removing adapters contamination using ' + str(adapters_files))
            adaptersFasta = concatenateFastaFiles(adapters_files, trimmomatic_folder,
                                                  'concatenated_adaptersFile.fasta')
        command[12] = 'ILLUMINACLIP:' + adaptersFasta + ':3:30:10:6:true'

    run_successfully = False

    # First attempt with the encoding supplied by the caller, if any.
    if fastq_encoding is not None:
        if fastq_encoding == 33:
            command[7] = '-phred33'
        elif fastq_encoding == 64:
            command[7] = '-phred64'
        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)

    # No encoding supplied, or that attempt failed: force Phred+33 and
    # then Phred+64.
    if not run_successfully:
        print('Trying to run Trimmomatic with Phred+33 enconding defined...')
        command[7] = '-phred33'
        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
        if not run_successfully:
            print('Trimmomatic fail again! Trying to run with Phred+64 enconding defined...')
            command[7] = '-phred64'
            run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)

    return run_successfully