Example No. 1
def call_external(cmd, logger, raise_exception=False):
    '''
    Calls the specified external command, waits for the process to finish and
    returns the outcome. If 'raise_exception' is True and the process exits
    with a non-zero code, a PheExternalError is raised that wraps a
    CalledProcessError containing the return code, the command and the
    combined output (stdout + stderr) of the external process. The call can be
    wrapped inside the try_and_except function to handle the raised exception
    (use with 'raise_exception'=True); otherwise the exception can be handled
    manually.

    @param cmd: Command to be run, with all appropriate arguments. The
        validity of the command and its arguments is not checked.
    @type cmd: list.
    @param logger: Logger to be used for logging the output from the process.
    @type logger: logger.
    @param raise_exception: Specifies whether an exception should be raised
        when returncode is not 0 (default False).
    @type raise_exception: bool.

    @return: Dict with the keys 'proc_returncode', 'proc_stdout' and
        'proc_stderr' from the external process.

    @raise PheExternalError: If returncode is not 0 and 'raise_exception' is
        set to True.
    '''
    import subprocess
    from copy import deepcopy

    import log_writer
    import phe_exceptions

    # For now (v 2.7) we can't use subprocess.call or subprocess.check_call
    #    because they do not capture the PIPE output the way we need here.
    #    Use Popen and wait() instead.
    process = subprocess.Popen(cmd,
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()

    # Log the outputs of the external program.
    # FIXME: 'log_error_to' will always go to 'error' if exit code > 0 (see log_process)
    process_out, process_err = log_writer.log_process(logger,
                                                      process,
                                                      log_error_to="info")

    # Use stdout as the output; if stderr is not empty, append it.
    # This may pollute the error log with stdout from the process.
    out = deepcopy(process_out)
    if process_err:
        out = out + "********ERROR********\n" + process_err

    if raise_exception and process.returncode != 0:
        raise phe_exceptions.PheExternalError(
            "External script has returned non-zero exit code.",
            subprocess.CalledProcessError(process.returncode, cmd, out))
    else:
        retval = {
            'proc_returncode': process.returncode,
            'proc_stdout': process_out,
            'proc_stderr': process_err
        }
        return retval
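# A minimal usage sketch for call_external. The setup_logger call below is an
# assumption about this codebase's log_writer API and may differ; the command
# and log paths are illustrative.
logger = log_writer.setup_logger("example.stdout.log", "example.stderr.log")  # hypothetical helper
result = call_external(["samtools", "--version"], logger, raise_exception=False)
print result['proc_returncode']  # 0 when samtools ran successfully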
def pileupReads(tmp_dir, sorted_bam_file, refFn, samtools, logger):
	
	"""
	Function
	Generate pileup file by using SAMtools mpileup command.
	NB: use -B -A -f option to optimises coverage and --A
	flag count anomalous read 
	 
	The option for method:
	tmp_dir[str]: the path to where  pileup file will be created
	sorted_bam_file[str]:  the path to the BAM file location
	refFn[str]:  the path to the reference file location
	samtools[str]: the path to SAMtools command
	logger[str]: the path to where the stderr and stdout logged
	"""
	
	#1. Index bam file
	log_writer.info_header(logger, "index bam file")
	process = subprocess.Popen([samtools, 'index',
								sorted_bam_file],
								stderr=subprocess.PIPE, stdout=subprocess.PIPE)
	process.wait()
	log_writer.log_process(logger, process, log_error_to = "info")
	
	#2. Generate pileup file
	pileFn =  os.path.join(tmp_dir, 'all.pileup')
	pileupFile = open(pileFn, 'w')
	log_writer.info_header(logger, "Generate pileup file")
	process = subprocess.Popen([samtools, 'mpileup', '-B', '-A', '-f',
								refFn, sorted_bam_file],
								stderr=subprocess.PIPE, stdout=subprocess.PIPE)
	for l in process.stdout:
		pileupFile.write(l)
	process.wait()
	log_writer.log_process(logger, process, log_error_to = "info")
	pileupFile.close()
	
	
	
def pileupReads(tmp_dir, sorted_bam_file, refFn, samtools, logger):
    """
	Function
	Generate pileup file by using SAMtools mpileup command.
	NB: use -B -A -f option to optimises coverage and --A
	flag count anomalous read 
	 
	The option for method:
	tmp_dir[str]: the path to where  pileup file will be created
	sorted_bam_file[str]:  the path to the BAM file location
	refFn[str]:  the path to the reference file location
	samtools[str]: the path to SAMtools command
	logger[str]: the path to where the stderr and stdout logged
	"""

    #1. Index bam file
    log_writer.info_header(logger, "index bam file")
    process = subprocess.Popen([samtools, 'index', sorted_bam_file],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    #2. Generate pileup file
    pileFn = os.path.join(tmp_dir, 'all.pileup')
    pileupFile = open(pileFn, 'wb')
    log_writer.info_header(logger, "Generate pileup file")
    process = subprocess.Popen(
        [samtools, 'mpileup', '-B', '-A', '-f', refFn, sorted_bam_file],
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE)
    for l in process.stdout:
        pileupFile.write(l)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    pileupFile.close()
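# Hedged usage sketch for pileupReads (illustrative paths; reuses the logger
# from the call_external example above):
pileupReads("/tmp/work", "/tmp/work/sample-sorted.bam",
            "/tmp/work/reference.fasta", "/usr/bin/samtools", logger)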
Example No. 4
def call_external(cmd, logger, raise_exception=False):
    '''
    Calls the specified external command, waits for the process to finish and
    returns the outcome. If 'raise_exception' is True and the process exits
    with a non-zero code, a PheExternalError is raised that wraps a
    CalledProcessError containing the return code, the command and the
    combined output (stdout + stderr) of the external process. The call can be
    wrapped inside the try_and_except function to handle the raised exception
    (use with 'raise_exception'=True); otherwise the exception can be handled
    manually.

    @param cmd: Command to be run, with all appropriate arguments. The
        validity of the command and its arguments is not checked.
    @type cmd: list.
    @param logger: Logger to be used for logging the output from the process.
    @type logger: logger.
    @param raise_exception: Specifies whether an exception should be raised
        when returncode is not 0 (default False).
    @type raise_exception: bool.

    @return: Dict with the keys 'proc_returncode', 'proc_stdout' and
        'proc_stderr' from the external process.

    @raise PheExternalError: If returncode is not 0 and 'raise_exception' is
        set to True.
    '''
    import subprocess
    from copy import deepcopy

    import log_writer
    import phe_exceptions
    
    # For now (v 2.7) we can't use subprocess.call or subprocess.check_call
    #    because they do not capture the PIPE output the way we need here.
    #    Use Popen and wait() instead.
    process = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE) 
    process.wait()
    
    # Log the outputs of the external program.
    # FIXME: 'log_error_to' will always go to 'error' if exit code > 0 (see log_process)
    process_out, process_err = log_writer.log_process(logger, process, log_error_to="info")
    
    # Use stdout as the output; if stderr is not empty, append it.
    # This may pollute the error log with stdout from the process.
    out = deepcopy(process_out)
    if process_err:
        out = out + "********ERROR********\n" + process_err
    
    if raise_exception and process.returncode != 0:
        raise phe_exceptions.PheExternalError("External script has returned non-zero exit code.", 
                                        subprocess.CalledProcessError(process.returncode, cmd, out))
    else:
        retval = {
            'proc_returncode': process.returncode,
            'proc_stdout': process_out,
            'proc_stderr': process_err
        }
        return retval
Example No. 5
def mapping(input_directory, fastqs, reference_fasta_file_path, output_dir,
            bowtie, samtools, id, logger):
    """

  This function runs bowtie for mapping of the fastq files with the reference_fasta_fileerence file provided.

  :param fastqs: directory that contains two fastq files.  It will be in the following format: id.workflow.version.suffix, e.g. 1.strep_pneumo.1_1.1.trimmed.fastq
  :type fastqs: directory
  :param reference_fasta_file: reference_fasta_file, this is defined in the find_serotype function.
  :type reference_fasta_file: file
  :param bowtie: path to bowtie
  :type bowtie: path
  :param samtools: path to samtools
  :type samtools: path

  :returns: sorted and indexed bam file.
  :rtype: file

  """
    try:
        os.makedirs(output_dir + "/tmp")
    except OSError:
        if os.path.isdir(output_dir + "/tmp"):
            # We are nearly safe
            pass
        else:
            # There was an error on creation, so make sure we know about it
            raise

    null = open(os.devnull, 'w')

    bam_sorted = os.path.join(output_dir, id + '-sorted')
    bam_out = os.path.join(output_dir, id + '-sorted.bam')
    sam_parsed = os.path.join(output_dir + "/tmp",
                              id + '.tmp')  # temporary sam output
    sam = os.path.join(output_dir + "/tmp", id + '.sam')
    bam = os.path.join(output_dir + "/tmp", id + '.bam')

    # copy the reference fasta file to the tmp directory and index
    reference_fasta_file = output_dir + "/tmp/reference.fasta"
    shutil.copyfile(reference_fasta_file_path, reference_fasta_file)
    print "running bowtie index"
    bowtie_index = bowtie + "-build"
    log_writer.info_header(logger, "Creating reference_fasta_fileerence index")
    process = subprocess.Popen(
        [bowtie_index, reference_fasta_file, reference_fasta_file],
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE
    )  # generate index of the reference fasta for mapping
    process.wait()
    log_writer.log_process(logger, process)

    # # run bowtie
    cmd = [bowtie]
    cmd += [
        '--fr', '--minins', '300', '--maxins', '1100', '-x',
        reference_fasta_file, '-1', fastqs[0], '-2', fastqs[1], '-S', sam,
        '-k', '99999', '-D', '20', '-R', '3', '-N', '0', '-L', '20', '-i',
        'S,1,0.50'
    ]  # write to tmp
    log_writer.info_header(logger, "Running bowtie to generate sam file")
    #  print "running bowtie"
    process = subprocess.Popen(cmd,
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # run remove_secondary_mapping_bit to subtract 256 from any SAM FLAG value of 256 or above (clearing the secondary-alignment bit).  The sam_parsed file is the output that is used to convert to bam.

    try_and_except(input_directory + "/logs/strep_pneumo_serotyping.stderr",
                   remove_secondary_mapping_bit, sam, sam_parsed)

    log_writer.info_header(logger, "Convert sam to bam")
    process = subprocess.Popen(
        [samtools, 'view', '-bhS', '-o', bam, sam_parsed],
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    # sort bam
    log_writer.info_header(logger, "Sort the bam file")
    process = subprocess.Popen([samtools, 'sort', bam, bam_sorted],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    # index bam
    log_writer.info_header(logger, "Index the BAM file")
    process = subprocess.Popen([samtools, 'index', bam_sorted + ".bam"],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    return bam_out
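# Hedged usage sketch for mapping (illustrative paths; assumes bowtie2 and
# samtools are installed at these locations):
bam = mapping("/data/run1",
              ["/data/run1/1.strep_pneumo.1_1.1.trimmed.fastq",
               "/data/run1/1.strep_pneumo.1_2.1.trimmed.fastq"],
              "/data/refs/serotype.fasta", "/data/run1/out",
              "/usr/bin/bowtie2", "/usr/bin/samtools", "1", logger)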
def create_bam_file(tmp_dir, fastq_files, refFn, expand, bowtie, samtools, ids, logger):
	
	"""
	Function
	(1) Map each read set to each of the possible locus variants  by calling Bowtie2
	(with  very sensitive options) and create tmp file
	(2) Convert the tmp to sam file by unset the secondary alignment bit score
	(3) Convert the sam to BAM file
	(4) sort BAM file
	
	The option for method:
	tmp_dir[str]: the path to where the tmp, SAM, BAM and sorted BAM files will be created
	fastq_files[list]: the path to the fastq file location
	refFn[str]:  the path to the  reference file location
	expand[str] : True
	bowtie[str]: the path to  Bowtie2 command 
	samtools[str]: the path to SAMtools command
	ids[str]: unique identifier number
	logger[str]: the path to where the stderr and stdout logged
	
	Return
	sorted_bam_file[str]: sorted BAM file
	"""
	
	tmp = os.path.join(tmp_dir, ids +'.tmp') # temporary sam output
	sam = os.path.join(tmp_dir, ids +  '.sam')
	
	if expand:
		#1. Creating tmp file
		log_writer.info_header(logger, "Creating tmp file")
		# -k = report up to 99999 good alignments per read.
		#--very-sensitive option = -D 20 -R 3 -N 0 -L 20 -i S,1,0.50 
		process = subprocess.Popen([bowtie, '--fr',
									'--no-unal',
									'--minins', '300', '--maxins', '1100',
									'-x', refFn,
									'-1', fastq_files[0], '-2', fastq_files[1],
									'-S', tmp,
									'-k', '99999', '-D', '20', '-R', '3', '-N', '0', '-L', '20', '-i', 'S,1,0.50'],
									stderr=subprocess.PIPE, stdout=subprocess.PIPE)
		process.wait()
		log_writer.log_process(logger, process, log_error_to = "info")
		
		#2. remove_secondary_mapping_bit takes file names, so no handles are needed
		log_writer.info_header(logger, "remove_secondary_mapping_bit")
		remove_secondary_mapping_bit(tmp, sam)
	
	else:
		log_writer.info_header(logger, "Creating sam file")
		process= subprocess.Popen([bowtie,  '--fr',
								   '--no-unal',
								   '--minins', '300', '--maxins', '1100',
								   '-x', refFn,
								   '-1', fastq_files[0], '-2', fastq_files[1],
								   '-S', sam,
								   '-k', '99999', '-D', '20', '-R', '3', '-N', '0', '-L', '20', '-i', 'S,1,0.50'],
			stderr=subprocess.PIPE, stdout=subprocess.PIPE)
		process.wait()
		log_writer.log_process(logger, process, log_error_to = "info")
	
	#3.Converting sam to bam file
	bam = os.path.join(tmp_dir, ids +  '.unsortedbam')
	log_writer.info_header(logger, "Converting sam to bam")
	process = subprocess.Popen([samtools, 'view',
								'-bhS',
								'-o', bam, sam], stderr=subprocess.PIPE, stdout=subprocess.PIPE)
	process.wait()
	log_writer.log_process(logger, process, log_error_to = "info")
	
	#4.Sort bam file
	out0 = os.path.join(tmp_dir,ids +  '-all')
	log_writer.info_header(logger, "Sorting bam")
	sorted_bam_file = os.path.join(tmp_dir, ids + '-all.bam')
	process = subprocess.Popen([samtools, 'sort',
								bam, out0], stderr=subprocess.PIPE, stdout=subprocess.PIPE) 
	process.wait()
	log_writer.log_process(logger, process, log_error_to = "info")
	
	return sorted_bam_file
def create_bam_file(tmp_dir, fastq_files, refFn, expand, bowtie, samtools, ids,
                    logger):
    """
	Function
	(1) Map each read set to each of the possible locus variants  by calling Bowtie2
	(with  very sensitive options) and create tmp file
	(2) Convert the tmp to sam file by unset the secondary alignment bit score
	(3) Convert the sam to BAM file
	(4) sort BAM file
	
	The option for method:
	tmp_dir[str]: the path to where the tmp, SAM, BAM and sorted BAM files will be created
	fastq_files[list]: the path to the fastq file location
	refFn[str]:  the path to the  reference file location
	expand[str] : True
	bowtie[str]: the path to  Bowtie2 command 
	samtools[str]: the path to SAMtools command
	ids[str]: unique identifier number
	logger[str]: the path to where the stderr and stdout logged
	
	Return
	sorted_bam_file[str]: sorted BAM file
	"""

    tmp = os.path.join(tmp_dir, ids + '.tmp')  # temporary sam output
    sam = os.path.join(tmp_dir, ids + '.sam')

    if expand:
        #1. Creating tmp file
        log_writer.info_header(logger, "Creating tmp file")
        # -k = report up to 99999 good alignments per read.
        #--very-sensitive option = -D 20 -R 3 -N 0 -L 20 -i S,1,0.50
        process = subprocess.Popen([
            bowtie, '--fr', '--no-unal', '--minins', '300', '--maxins', '1100',
            '-x', refFn, '-1', fastq_files[0], '-2', fastq_files[1], '-S', tmp,
            '-k', '99999', '-D', '20', '-R', '3', '-N', '0', '-L', '20', '-i',
            'S,1,0.50'
        ],
                                   stderr=subprocess.PIPE,
                                   stdout=subprocess.PIPE)
        process.wait()
        log_writer.log_process(logger, process, log_error_to="info")

        #2. remove_secondary_mapping_bit takes file names, so no handles are needed
        log_writer.info_header(logger, "remove_secondary_mapping_bit")
        remove_secondary_mapping_bit(tmp, sam)

    else:
        log_writer.info_header(logger, "Creating sam file")
        process = subprocess.Popen([
            bowtie, '--fr', '--no-unal', '--minins', '300', '--maxins', '1100',
            '-x', refFn, '-1', fastq_files[0], '-2', fastq_files[1], '-S', sam,
            '-k', '99999', '-D', '20', '-R', '3', '-N', '0', '-L', '20', '-i',
            'S,1,0.50'
        ],
                                   stderr=subprocess.PIPE,
                                   stdout=subprocess.PIPE)
        process.wait()
        log_writer.log_process(logger, process, log_error_to="info")

    #3.Converting sam to bam file
    bam = os.path.join(tmp_dir, ids + '.unsortedbam')
    log_writer.info_header(logger, "Converting sam to bam")
    process = subprocess.Popen([samtools, 'view', '-bhS', '-o', bam, sam],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    #4.Sort bam file
    out0 = os.path.join(tmp_dir, ids + '-all')
    log_writer.info_header(logger, "Sorting bam")
    sorted_bam_file = os.path.join(tmp_dir, ids + '-all.bam')
    process = subprocess.Popen([samtools, 'sort', bam, out0],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    return sorted_bam_file
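# Hedged usage sketch for create_bam_file (illustrative paths; expand=True
# clears the secondary-alignment bit as described in the docstring):
sorted_bam = create_bam_file("/tmp/work",
                             ["/data/sample1_1.fastq", "/data/sample1_2.fastq"],
                             "/tmp/work/reference.fa", True,
                             "/usr/bin/bowtie2", "/usr/bin/samtools",
                             "sample1", logger)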
def flanking_regions(profile_file_directory, output_directory, logger):
    """
	Function
	(1) Extract flanking regions of 100bp upstream and downstream of each MLST locus
	by blast against a reference genome.BLAST uses the first locus sequence as a query.
	(2) Creates summary.txt file (a tab-delimited text file display the path to the
	loci and flanking sequences) 
	
	The option of the method
	profile_file_directory[str]: The path to the reference.seq, profile.txt and
	the Locus variant sequences (*.fas) files location
	output_directory[str]: The path to where the summary.txt file will be created
	logger[str]: The path to where the stderr and stdout logged
	"""

    reference_fasta_file = profile_file_directory + "/reference.seq"
    refseq_record = SeqIO.read(reference_fasta_file, "fasta", generic_dna)
    locus_files = glob.glob(profile_file_directory + "/*.fas")
    locus_files = sorted(locus_files)

    summary_file_handle = open(output_directory + "/summary.txt", "w")
    for seq in locus_files:
        (seqDir, seqFileName) = os.path.split(seq)
        (seqBaseName, ext) = os.path.splitext(seqFileName)
        bait = seqBaseName + "_bait.fasta"
        log_writer.info_header(logger, "create bait file")
        process = subprocess.Popen([
            'seqret', seq, '-firstonly', '-auto', '-out',
            output_directory + '/' + bait
        ],
                                   stderr=subprocess.PIPE,
                                   stdout=subprocess.PIPE)
        process.wait()
        log_writer.log_process(logger, process, log_error_to="info")
        cline = NcbiblastnCommandline(query=output_directory + '/' + bait,
                                      db=profile_file_directory + "/reference",
                                      evalue=0.001,
                                      out=output_directory +
                                      "/my_blast_tmp.xml",
                                      outfmt=5)
        stdout_log_output, stderr_log_output = cline()
        result_handle = open(output_directory + "/my_blast_tmp.xml")
        blast_record = NCBIXML.read(result_handle)
        query_length = blast_record.query_letters
        for alignment in blast_record.alignments:
            hsp = alignment.hsps[0]
            if hsp.align_length / float(query_length) > 0.5:
                if hsp.sbjct_start > hsp.sbjct_end:
                    subject_start = hsp.sbjct_start + (hsp.query_start - 1)
                else:
                    subject_start = hsp.sbjct_start - (hsp.query_start - 1)
                if hsp.sbjct_start > hsp.sbjct_end:
                    subject_end = hsp.sbjct_end - (query_length -
                                                   hsp.query_end)
                else:
                    subject_end = hsp.sbjct_end + (query_length -
                                                   hsp.query_end)
                revcomp = 1
                if hsp.sbjct_start > hsp.sbjct_end:
                    revcomp = -1
                left_coords = [
                    min(subject_start, subject_end) - 100,
                    min(subject_start, subject_end) - 1
                ]
                right_coords = [
                    max(subject_start, subject_end) + 1,
                    max(subject_start, subject_end) + 100
                ]
                left_cmd = [
                    "seqret ", reference_fasta_file, " -sbegin ",
                    str(left_coords[0]), " -send ",
                    str(left_coords[1]), " -osformat fasta -auto -out " +
                    output_directory + "/tmp_left_flank.fasta"
                ]
                os.system(''.join(left_cmd))

                right_cmd = [
                    "seqret ", reference_fasta_file, " -sbegin ",
                    str(right_coords[0]), " -send ",
                    str(right_coords[1]), " -osformat fasta -auto -out " +
                    output_directory + "/tmp_right_flank.fasta"
                ]
                os.system(''.join(right_cmd))

                left_record = SeqIO.read(
                    output_directory + "/tmp_left_flank.fasta", "fasta")

                if revcomp < 0:
                    left_record.id = "down"
                    left_record.seq = left_record.seq.reverse_complement()
                else:
                    left_record.id = "up"
                right_record = SeqIO.read(
                    output_directory + "/tmp_right_flank.fasta", "fasta")
                if revcomp < 0:
                    right_record.id = "up"
                    right_record.seq = right_record.seq.reverse_complement()
                else:
                    right_record.id = "down"
                right_record.description = ""
                left_record.description = ""
                out_handle = open(
                    output_directory + "/" + seqBaseName + "_flanks.fasta",
                    "w")
                out_handle.write(right_record.format("fasta"))
                out_handle.write(left_record.format("fasta"))
                out_handle.close()
                summary_file_handle.write('\t'.join([
                    seqBaseName, seq, output_directory + "/" + seqBaseName +
                    "_flanks.fasta"
                ]) + "\n")
    summary_file_handle.close()
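# The summary.txt written above is tab-delimited, one locus per line:
# "<locus>\t<variants fasta>\t<flanks fasta>". An illustrative line (paths are
# hypothetical):
# aroE	/profiles/aroE.fas	/output/aroE_flanks.fasta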
def concatenate_flanking_regions(specFn, tmp_dir, bowtie, logger):
    """
	Function
	(1) Concatenate flanking regions to correspondent locus variants sequence in fasta format.
	Newly concatenated sequence are then indexed by Bowtie2
	(2) Then extract and store as pickled object:
		a. locus- variant names (loci.pkl)
		b. start and end position of locus variant sequences (without the flanking sequences)(ranges.pkl)
		c. Locus variants sequence (refSeqs.pkl)
	
	The option of the method
	specFn[str]: A tab-delimited text file display the path to the seven flanking  and loci sequences(summary.txt)
	tmp_dir[str] The path to where refSeqs.pkl, ranges.pkl and loci.pkl will be created
	bowtie[str]: The command used to index the reference sequence
	logger[str]: The path to where the stderr and stdout logged
	
	Return
	loci[list]: loci name
	"""

    (specDir, summaryFileName) = os.path.split(specFn)
    spc = []

    for l in open(specFn):
        spc.append(l.split())
    refFn = os.path.join(tmp_dir, "reference.fa")
    rf = open(refFn, "w")
    ranges = {}
    loci = []
    refSeqs = {}
    for (loc, variantsFn, flanksFn) in spc:
        loci.append(loc)
        fs = {}
        f = open(os.path.join(specDir, flanksFn))
        for r in Bio.SeqIO.parse(f, "fasta"):
            fs[r.id] = r.seq
        f = open(os.path.join(specDir, variantsFn))
        for r in Bio.SeqIO.parse(f, "fasta"):
            s = Bio.Seq.MutableSeq('', Bio.Alphabet.generic_dna)
            s += fs['up']
            s += r.seq
            s += fs['down']
            Bio.SeqIO.write([Bio.SeqRecord.SeqRecord(s, id=r.id)], rf, "fasta")
            ranges[r.id] = (len(fs['up']), len(fs['up']) + len(r.seq))
            refSeqs[r.id] = s
    rf.close()
    rangesFn = os.path.join(
        tmp_dir, "ranges.pkl"
    )  #start and end position of locus variant sequences (without the flanking sequences)
    f = open(rangesFn, 'w')
    pickle.dump(ranges, f)
    f.close()
    lociFn = os.path.join(tmp_dir, "loci.pkl")
    f = open(lociFn, 'w')
    pickle.dump(loci, f)
    f.close()
    refSeqsFn = os.path.join(tmp_dir, "refSeqs.pkl")  #Locus variants sequence
    f = open(refSeqsFn, 'w')
    pickle.dump(refSeqs, f)
    f.close()
    bowtie2_index = bowtie + "-build"
    log_writer.info_header(logger, "bowtie_indexed")
    process = subprocess.Popen(
        [bowtie2_index, refFn, refFn],
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE
    )  # generate index of reference fasta for mapping
    process.wait()

    log_writer.log_process(logger, process, log_error_to="info")
    os.system("rm -f summary.txt")
def mapping(input_directory, fastqs, reference_fasta_file_path, output_dir, bowtie, samtools, id, logger, threads):

  """

  This function runs bowtie for mapping of the fastq files with the reference_fasta_fileerence file provided.

  :param fastqs: directory that contains two fastq files.  It will be in the following format: id.workflow.version.suffix, e.g. 1.strep_pneumo.1_1.1.trimmed.fastq
  :type fastqs: directory
  :param reference_fasta_file: reference_fasta_file, this is defined in the find_serotype function.
  :type reference_fasta_file: file
  :param bowtie: path to bowtie
  :type bowtie: path
  :param samtools: path to samtools
  :type samtools: path

  :returns: sorted and indexed bam file.
  :rtype: file

  """
  try:
    os.makedirs(output_dir + "/tmp")
  except OSError:
    if os.path.isdir(output_dir + "/tmp"):
      # We are nearly safe
      pass
    else:
      # There was an error on creation, so make sure we know about it
      raise

  null = open(os.devnull, 'w')

  bam_sorted = os.path.join(output_dir, id + '-sorted')
  bam_out = os.path.join(output_dir, id + '-sorted.bam')
  sam_parsed = os.path.join(output_dir + "/tmp", id + '.tmp') # temporary sam output
  sam = os.path.join(output_dir + "/tmp", id + '.sam')
  bam = os.path.join(output_dir + "/tmp", id + '.bam')
  
 
  # copy the reference fasta file to the tmp directory and index
  reference_fasta_file = output_dir + "/tmp/reference.fasta"
  shutil.copyfile(reference_fasta_file_path, reference_fasta_file)
  print "running bowtie index"
  bowtie_index=  bowtie + "-build"
  log_writer.info_header(logger, "Creating reference_fasta_fileerence index")
  process = subprocess.Popen([bowtie_index, reference_fasta_file, reference_fasta_file], stderr=subprocess.PIPE, stdout=subprocess.PIPE) # generate index of reference_fasta_fileerence fasta for mapping
  process.wait()
  log_writer.log_process(logger, process)
  
  # # run bowtie
  cmd = [bowtie]
  cmd += [ '--fr', '--minins', '300', '--maxins', '1100', '-x', reference_fasta_file, '-1', fastqs[0], '-2', fastqs[1],'-S', sam, '-k', '99999', '-D', '20', '-R', '3', '-N', '0', '-L', '20', '-i', 'S,1,0.50', '-p', str(threads)] # write to tmp 
  log_writer.info_header(logger, "Running bowtie to generate sam file")
#  print "running bowtie"
  process = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
  process.wait()
  log_writer.log_process(logger, process, log_error_to = "info")

  # run remove_secondary_mapping_bit to subtract 256 from any SAM FLAG value of 256 or above (clearing the secondary-alignment bit).  The sam_parsed file is the output that is used to convert to bam.

  try_and_except(input_directory + "/logs/strep_pneumo_serotyping.stderr", remove_secondary_mapping_bit, sam, sam_parsed)

  log_writer.info_header(logger, "Convert sam to bam")
  process = subprocess.Popen([samtools, 'view', '-bhS', '-o', bam, '-@', str(threads), sam_parsed], stderr=subprocess.PIPE, stdout=subprocess.PIPE)
  process.wait()
  log_writer.log_process(logger, process, log_error_to = "info")
  # sort bam
  log_writer.info_header(logger, "Sort the bam file")
  process = subprocess.Popen([samtools, 'sort', '-o', bam_sorted + ".bam", '-@', str(threads), bam], stderr=subprocess.PIPE, stdout=subprocess.PIPE)
  process.wait()
  log_writer.log_process(logger, process, log_error_to = "info")
  # index bam
  log_writer.info_header(logger, "Index the BAM file")
  process = subprocess.Popen([samtools, 'index', bam_sorted + ".bam"], stderr=subprocess.PIPE, stdout=subprocess.PIPE)
  process.wait()
  log_writer.log_process(logger, process, log_error_to = "info")
  return bam_out 
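# Hedged usage sketch for the threaded variant (illustrative values; the final
# argument is passed through to bowtie -p and samtools -@):
bam = mapping("/data/run1",
              ["/data/run1/1.strep_pneumo.1_1.1.trimmed.fastq",
               "/data/run1/1.strep_pneumo.1_2.1.trimmed.fastq"],
              "/data/refs/serotype.fasta", "/data/run1/out",
              "/usr/bin/bowtie2", "/usr/bin/samtools", "1", logger, 4)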
Example No. 11
def getNovelAllele(variant, locus, fastq_files,  bowtie, samtools, ids, tmp_dir, logger):
	
	"""
	Function 
	Generate SAM, BAM and pileup file for novel allele
	
	The option for method:
	variant[str]: locus variant number
	locus[str]: locus name
	fastq_files[str]: the path to the fastq file 
	bowtie[str]: the path to  Bowtie2 command 
	samtools[str]: the path to SAMtools command
	ids[str]: sample unique identifier number
	tmp_dir[str]: the path to where  SAM, BAM, Pileup will be created
	"""
	
	refSeqs = pickle.load(open(os.path.join(tmp_dir, "refSeqs.pkl")))
	allele_name = locus + "-" + variant 
	typeFn = os.path.join(tmp_dir, allele_name + ".fa")
	typeFile = open(typeFn, "w")
	s = refSeqs[allele_name]
	Bio.SeqIO.write([Bio.SeqRecord.SeqRecord(s, id=allele_name)], typeFile, "fasta")
	typeFile.close()
	
	
	#index reference sequence
	bowtie2_index = bowtie + "-build"
	log_writer.info_header(logger, "index reference sequence")
	process = subprocess.Popen([bowtie2_index,
								typeFn,
								typeFn],
							stderr=subprocess.PIPE, stdout=subprocess.PIPE)## index ref
	process.wait()	
	log_writer.log_process(logger, process, log_error_to = "info")
	
	# create sam and bam files
	log_writer.info_header(logger, "Creating sam and bam files")
	bam = create_bam_file(tmp_dir,
						  fastq_files,
						  typeFn,
						  False,
						  bowtie,
						  samtools,
						  ids,
						  logger)
	
	#name bam file
	bamFn= os.path.join(tmp_dir, ids + "." + allele_name + ".bam")
	process = subprocess.Popen(['mv',bam,bamFn],stderr=subprocess.PIPE, stdout=subprocess.PIPE) 
	process.wait()
	log_writer.log_process(logger, process, log_error_to = "info")

	#index bam file
	process = subprocess.Popen([samtools, 'index',
								bamFn],
								stderr=subprocess.PIPE, stdout=subprocess.PIPE)
	process.wait()
	log_writer.log_process(logger, process, log_error_to = "info")
	log_writer.info_header(logger, "index bam file")
	
	#generate pileup file
	piFn= os.path.join(tmp_dir,  ids + "." + allele_name + '.pileup')
	f = open(piFn, 'w')
	log_writer.info_header(logger, "generate pileup file")
	process = subprocess.Popen([samtools, 'mpileup',
								'-B', '-A', '-cf', typeFn, bamFn],
								stderr=subprocess.PIPE, stdout=subprocess.PIPE)# generate pileup
	for l in process.stdout:
		f.write(l)
	f.close()
	process.wait()	
def getNovelAllele(variant, locus, fastq_files,  bowtie, samtools, ids, tmp_dir, logger):
	
	"""
	Function 
	Generate SAM, BAM and pileup file for novel allele
	
	The option for method:
	variant[str]: locus variant number
	locus[str]: locus name
	fastq_files[str]: the path to the fastq file 
	bowtie[str]: the path to  Bowtie2 command 
	samtools[str]: the path to SAMtools command
	ids[str]: sample unique identifier number
	tmp_dir[str]: the path to where  SAM, BAM, Pileup will be created
	"""
	
	refSeqs = pickle.load(open(os.path.join(tmp_dir, "refSeqs.pkl"),'rb'))
	allele_name = locus + "-" + variant 
	typeFn = os.path.join(tmp_dir, allele_name + ".fa")
	typeFile = open(typeFn, "w")
	s = refSeqs[allele_name]
	Bio.SeqIO.write([Bio.SeqRecord.SeqRecord(s, id=allele_name)], typeFile, "fasta")
	typeFile.close()
	
	
	#index reference sequence
	bowtie2_index = bowtie + "-build"
	log_writer.info_header(logger, "index reference sequence")
	process = subprocess.Popen([bowtie2_index,
								typeFn,
								typeFn],
							stderr=subprocess.PIPE, stdout=subprocess.PIPE)## index ref
	process.wait()	
	log_writer.log_process(logger, process, log_error_to = "info")
	
	# create sam and bam files
	log_writer.info_header(logger, "Creating sam and bam files")
	bam = create_bam_file(tmp_dir,
						  fastq_files,
						  typeFn,
						  False,
						  bowtie,
						  samtools,
						  ids,
						  logger)
	
	#name bam file
	bamFn= os.path.join(tmp_dir, ids + "." + allele_name + ".bam")
	process = subprocess.Popen(['mv',bam,bamFn],stderr=subprocess.PIPE, stdout=subprocess.PIPE) 
	process.wait()
	log_writer.log_process(logger, process, log_error_to = "info")

	#index bam file
	process = subprocess.Popen([samtools, 'index',
								bamFn],
								stderr=subprocess.PIPE, stdout=subprocess.PIPE)
	process.wait()
	log_writer.log_process(logger, process, log_error_to = "info")
	log_writer.info_header(logger, "index bam file")
	
	#generate pileup file
	piFn= os.path.join(tmp_dir,  ids + "." + allele_name + '.pileup')
	f = open(piFn, 'wb')
	log_writer.info_header(logger, "generate pileup file")
	process = subprocess.Popen([samtools, 'mpileup',
								'-B', '-A', '-cf', typeFn, bamFn],
								stderr=subprocess.PIPE, stdout=subprocess.PIPE)# generate pileup
	for l in process.stdout:
		f.write(l)
	f.close()
	process.wait()
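# Hedged usage sketch for getNovelAllele (illustrative values; assumes
# refSeqs.pkl was already written to tmp_dir by concatenate_flanking_regions):
getNovelAllele("12", "aroE", ["/data/sample1_1.fastq", "/data/sample1_2.fastq"],
               "/usr/bin/bowtie2", "/usr/bin/samtools",
               "sample1", "/tmp/work", logger)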
def flanking_regions(profile_file_directory, output_directory, logger):
	
	"""
	Function
	(1) Extract flanking regions of 100bp upstream and downstream of each MLST locus
	by blast against a reference genome.BLAST uses the first locus sequence as a query.
	(2) Creates summary.txt file (a tab-delimited text file display the path to the
	loci and flanking sequences) 
	
	The option of the method
	profile_file_directory[str]: The path to the reference.seq, profile.txt and
	the Locus variant sequences (*.fas) files location
	output_directory[str]: The path to where the summary.txt file will be created
	logger[str]: The path to where the stderr and stdout logged
	"""
	
	reference_fasta_file = profile_file_directory + "/reference.seq"	
	refseq_record = SeqIO.read(reference_fasta_file, "fasta", generic_dna)
	locus_files = glob.glob(profile_file_directory + "/*.fas")
	locus_files = sorted(locus_files)
	
	summary_file_handle = open(output_directory + "/summary.txt", "w")
	for seq in locus_files:
		(seqDir,seqFileName) = os.path.split(seq)	
		(seqBaseName,ext) = os.path.splitext(seqFileName)
		bait = seqBaseName + "_bait.fasta"
		log_writer.info_header(logger, "create bait file")
		process = subprocess.Popen(['seqret',seq,'-firstonly','-auto','-out',output_directory+ '/' + bait], stderr=subprocess.PIPE, stdout=subprocess.PIPE)
		process.wait()
		log_writer.log_process(logger, process, log_error_to = "info")
		cline = NcbiblastnCommandline(query=output_directory+ '/' + bait, db=profile_file_directory + "/reference",evalue=0.001, out=output_directory + "/my_blast_tmp.xml", outfmt=5)		
		stdout_log_output, stderr_log_output = cline()
		result_handle = open(output_directory + "/my_blast_tmp.xml")
		blast_record = NCBIXML.read(result_handle)
		query_length = blast_record.query_letters
		for alignment in blast_record.alignments:
			hsp = alignment.hsps[0] 
			if hsp.align_length/float(query_length) > 0.5:
				if hsp.sbjct_start > hsp.sbjct_end:
					subject_start = hsp.sbjct_start + (hsp.query_start - 1)
				else:
					subject_start = hsp.sbjct_start - (hsp.query_start - 1)
				if hsp.sbjct_start > hsp.sbjct_end:
					subject_end = hsp.sbjct_end - (query_length - hsp.query_end)
				else:
					subject_end = hsp.sbjct_end + (query_length - hsp.query_end)
				revcomp = 1 
				if hsp.sbjct_start > hsp.sbjct_end:
					revcomp = -1
				left_coords = [min(subject_start,subject_end)-100,min(subject_start,subject_end)-1]
				right_coords = [max(subject_start,subject_end)+1,max(subject_start,subject_end)+100]
				left_cmd = ["seqret ",reference_fasta_file," -sbegin ",str(left_coords[0])," -send ",str(left_coords[1])," -osformat fasta -auto -out " + output_directory + "/tmp_left_flank.fasta"]
				os.system(''.join(left_cmd))
				
				right_cmd = ["seqret ",reference_fasta_file," -sbegin ",str(right_coords[0])," -send ",str(right_coords[1])," -osformat fasta -auto -out " + output_directory + "/tmp_right_flank.fasta"]
				os.system(''.join(right_cmd)) 
				
				left_record = SeqIO.read(output_directory + "/tmp_left_flank.fasta", "fasta")
				
				if revcomp < 0:
					left_record.id = "down"
					left_record.seq = left_record.seq.reverse_complement()
				else:
					left_record.id = "up"
				right_record = SeqIO.read(output_directory + "/tmp_right_flank.fasta", "fasta")
				if revcomp < 0:
					right_record.id = "up"
					right_record.seq = right_record.seq.reverse_complement()
				else:
					right_record.id = "down"
				right_record.description = ""
				left_record.description = ""
				out_handle = open(output_directory + "/" + seqBaseName + "_flanks.fasta", "w")
				out_handle.write(right_record.format("fasta"))
				out_handle.write(left_record.format("fasta"))
				out_handle.close()
				summary_file_handle.write('\t'.join([seqBaseName,seq,output_directory + "/" + seqBaseName + "_flanks.fasta"]) + "\n")
	summary_file_handle.close()
def concatenate_flanking_regions(specFn, tmp_dir, bowtie, logger):
	
	"""
	Function
	(1) Concatenate flanking regions to correspondent locus variants sequence in fasta format.
	Newly concatenated sequence are then indexed by Bowtie2
	(2) Then extract and store as pickled object:
		a. locus- variant names (loci.pkl)
		b. start and end position of locus variant sequences (without the flanking sequences)(ranges.pkl)
		c. Locus variants sequence (refSeqs.pkl)
	
	The option of the method
	specFn[str]: A tab-delimited text file display the path to the seven flanking  and loci sequences(summary.txt)
	tmp_dir[str] The path to where refSeqs.pkl, ranges.pkl and loci.pkl will be created
	bowtie[str]: The command used to index the reference sequence
	logger[str]: The path to where the stderr and stdout logged
	
	Return
	loci[list]: loci name
	"""
	
	(specDir,summaryFileName) = os.path.split(specFn)
	spc = []
	
	for l in open(specFn):
		spc.append(l.split())
	refFn = os.path.join(tmp_dir, "reference.fa")
	rf = open(refFn, "w") 
	ranges = {}
	loci = [] 
	refSeqs = {} 
	for (loc, variantsFn, flanksFn) in spc:
		loci.append(loc)
		fs = {} 
		f = open(os.path.join(specDir, flanksFn))
		for r in Bio.SeqIO.parse(f, "fasta"):
			fs[r.id] = r.seq
		f = open(os.path.join(specDir, variantsFn))
		for r in Bio.SeqIO.parse(f, "fasta"):
			s = Bio.Seq.MutableSeq('', Bio.Alphabet.generic_dna) 
			s += fs['up']
			s += r.seq
			s += fs['down']
			Bio.SeqIO.write([Bio.SeqRecord.SeqRecord(s, id=r.id)], rf, "fasta") 
			ranges[r.id] = (len(fs['up']), len(fs['up']) + len(r.seq))
			refSeqs[r.id] = s 
	rf.close()
	rangesFn = os.path.join(tmp_dir, "ranges.pkl") #start and end position of locus variant sequences (without the flanking sequences)
	f = open(rangesFn, 'w')
	pickle.dump(ranges, f)
	f.close()
	lociFn = os.path.join(tmp_dir, "loci.pkl")
	f = open(lociFn, 'w')
	pickle.dump(loci, f)
	f.close()
	refSeqsFn = os.path.join(tmp_dir, "refSeqs.pkl") #Locus variants sequence
	f = open(refSeqsFn, 'w')
	pickle.dump(refSeqs, f)
	f.close()
	bowtie2_index = bowtie + "-build"
	log_writer.info_header(logger, "bowtie_indexed")
	process = subprocess.Popen([bowtie2_index, refFn, refFn], stderr=subprocess.PIPE, stdout=subprocess.PIPE) # generate index of reference fasta for mapping
	process.wait()
	
	log_writer.log_process(logger, process, log_error_to = "info")
	os.system("rm -f summary.txt")