Пример #1
0
def create_index(reference_fasta_file, bowtie_path, samtools_path, tmpdir,
                 logger):
    """Copy the reference into ``tmpdir`` and build its bowtie2 and fai indexes.

    Args
        reference_fasta_file, string : path to the reference FASTA file to copy
        bowtie_path, string : path to the bowtie2 executable; '-build' is
            appended to it to obtain the index builder
        samtools_path, string : path to the samtools executable
        tmpdir, string : directory that receives the copy and the indexes
        logger : logger handle passed to the log_writer helpers

    Return
        string : path of the copied reference ('reference.fasta' inside tmpdir)

    Each index is only built when its marker file ('.1.bt2' / '.fai') does not
    exist yet next to the copied reference.
    """
    # os.path.join keeps the copy inside tmpdir whether or not the caller
    # supplied a trailing path separator (plain concatenation did not).
    fasta_file = os.path.join(tmpdir, "reference.fasta")
    shutil.copyfile(reference_fasta_file, fasta_file)
    bt2_index = fasta_file + '.1.bt2'
    fai_index = fasta_file + '.fai'

    log_writer.info_header(
        logger, 'Building bowtie2 index for {}...'.format(fasta_file))
    if os.path.exists(bt2_index):
        log_writer.write_log(
            logger,
            'Bowtie2 index for {} is already built...'.format(fasta_file),
            'info')
    else:
        bowtie2_index = bowtie_path + '-build'
        # The FASTA path doubles as the index base name.
        subprocess.call([bowtie2_index, '-f', fasta_file, fasta_file])

    log_writer.info_header(
        logger, 'Building samtools index for {}...'.format(fasta_file))
    if os.path.exists(fai_index):
        log_writer.write_log(
            logger,
            'Samtools index for {} is already built...'.format(fasta_file),
            'info')
    else:
        subprocess.call([samtools_path, 'faidx', fasta_file])

    return fasta_file
Пример #2
0
def clean_up(tmp, logger):
    """Log the removal and delete the temporary directory tree ``tmp``.

    Any OSError raised by shutil.rmtree propagates to the caller unchanged.
    """
    header = 'Removing temporary files:' + tmp
    log_writer.info_header(logger, header)
    shutil.rmtree(tmp)
Пример #3
0
def get_from_config( CONFIG_FILE, get_items=None ):
    '''
    Return the calling function's section of the YAML config file.

    Args
        CONFIG_FILE, string : name of the environment variable that holds the path to the config file
        get_items, list : optional, keys to pick from the calling function's section

    Return
        data-structure : by default, the whole section stored under the calling function's name
        items, list : if get_items is given, the values of those keys, in the same order

    Raises
        KeyError : the environment variable is not set, or the caller / a requested key is missing from the config
        IOError : the environment variable points to a file that does not exist

    The config file must represent a dictionary whose primary key is the name of the function using the value
    <k> = function_using_FOO
        <k> = FOO
            <v> = value_of_foo
    'function_using_FOO' is retrieved from the call stack and used to select the section to return.

    N.B. the calling code for a single item needs a comma to unpack the returned list
    return_item, = get_from_config( config_file, get_items=['return_item'] )

    MGGoulden 20130906
    amended 20130919
    '''

    import errno
    import os
    import yaml
    import logging
    import inspect
    # from common_modules
    import log_writer # setup_logger, write_log, error_header, info_header, log_process

    # get a pointer to the logger
    logger = logging.getLogger( 'stdout_stderr_logger' )
    log_txt = 'IN get_from_config( CONFIG_FILE = '+CONFIG_FILE+', get_items = ' +str(get_items)+ ')'
    log_writer.info_header( logger, log_txt )

    # sanity check - stop here with the explanatory message instead of failing
    # a few lines further down with a bare KeyError / IOError
    if CONFIG_FILE not in os.environ:
        log_txt = 'The environment variable ('+CONFIG_FILE+') is not available; module load may be required; quitting ... '
        log_writer.error_header( logger, log_txt )
        raise KeyError( log_txt )

    read_me = os.environ[ CONFIG_FILE ]
    if not os.path.exists( read_me ):
        log_txt = 'The environment variable ('+CONFIG_FILE+') points to a file ('+read_me+') which does not exist; module amendment required; quitting ... '
        log_writer.error_header( logger, log_txt )
        raise IOError( errno.ENOENT, log_txt, read_me )

    # who calls? - identify the calling func from stack; when the call was
    # routed through try_and_except, the real caller is one frame further up
    caller = inspect.stack()[1][3]
    if caller == 'try_and_except':
        caller = inspect.stack()[2][3]

    with open( read_me, 'r' ) as f:
        # safe_load builds plain Python data only; yaml.load without an
        # explicit Loader can construct arbitrary objects and is rejected by
        # recent PyYAML releases.
        CONFIG = yaml.safe_load( f )

    if not get_items:
        return CONFIG[ caller ]
    return [ CONFIG[ caller ][x] for x in get_items ]
Пример #4
0
def pileup(sorted_bamfile, reference, samtools, outdir, logger):
    """Run ``samtools mpileup`` on a sorted BAM file and store the result.

    The output is written to ``<outdir>/<sample>.mpileup`` where ``<sample>``
    is the BAM file name up to its first dot.

    Args
        sorted_bamfile, string : path to the sorted BAM file
        reference, string : path to the indexed reference sequence file
        samtools, string : path to the samtools executable
        outdir, string : directory receiving the mpileup file
        logger : logger handle passed to the log_writer helpers
    """
    log_writer.info_header(logger, 'Create mpileup file')
    sample = os.path.basename(sorted_bamfile).split('.')[0]
    filename = os.path.join(outdir, sample + '.mpileup')
    # -A: count anomalous read pairs, -B: disable BAQ computation,
    # -f FILE: indexed reference sequence file
    command = [samtools, 'mpileup', '-B', '-A', '-f', reference,
               sorted_bamfile]
    # Binary mode because the child writes raw bytes; the context manager
    # closes the file even when samtools cannot be started.
    with open(filename, 'wb') as pileup_file:
        process = subprocess.Popen(command, stdout=pileup_file,
                                   stderr=subprocess.PIPE)
        # communicate() drains stderr (an unread pipe can stall the child)
        # and waits for samtools to finish.
        process.communicate()
Пример #5
0
def modify_bowtie_sam(samfile, logger):
    """Clear the secondary-alignment bit in every record of a Bowtie SAM file.

    Secondary alignments are kept for the downstream pileup by unsetting
    flag bit 0x100 (256) on each alignment line. Header lines (starting with
    '@') are copied unchanged.

    Args
        samfile, string : path to the SAM file produced by Bowtie
        logger : logger handle passed to the log_writer helpers

    Return
        string : path of the modified SAM file (``samfile + '.mod'``)
    """
    secondary_bit = 0x100
    modified = samfile + '.mod'
    with open(samfile) as sam, open(modified, 'w') as sam_mod:

        log_writer.info_header(logger, 'Modifying SAM formatted output from Bowtie to maintain secondary alignments for downstream pileup...')
        for line in sam:
            if line.startswith('@'):
                sam_mod.write(line)
                continue
            fields = line.split('\t')
            # Mask the bit instead of subtracting 256: subtraction skipped a
            # flag of exactly 256 and corrupted flags above 256 that do not
            # carry the secondary bit (e.g. 512, 1024, 2048).
            fields[1] = str(int(fields[1]) & ~secondary_bit)
            sam_mod.write('\t'.join(fields))
    return modified
Пример #6
0
def mapping(fastq_file, reference, bowtie_path, samtools_path, outdir, logger):
    """Map a read pair against the reference and return the sorted BAM path.

    The sample name is the fastq file name up to its first dot. The second
    mate is located by replacing '1.fastq' with '2.fastq' in ``fastq_file``.
    Intermediate files go to ``<outdir>/<sample>_tmp/``; the sorted BAM is
    written to ``<outdir>/<sample>.sorted.bam``. When that BAM already exists
    the alignment steps are skipped.

    Return
        tuple : (path to the sorted BAM file, path to the indexed reference copy)
    """
    sample = os.path.basename(fastq_file).split('.')[0]

    # Working directory for this sample; it also holds the reference copy
    # and its indexes.
    tmp = outdir + "/{0}_tmp/".format(sample)
    if not os.path.exists(tmp):
        os.mkdir(tmp)
    reference_fasta_file = create_index(reference, bowtie_path, samtools_path,
                                        tmp, logger)

    sorted_bam_prefix = outdir + '/' + sample + '.sorted'
    sorted_bam = sorted_bam_prefix + '.bam'
    if os.path.isfile(sorted_bam):
        # Nothing to do: a previous run already produced the sorted BAM.
        return sorted_bam, reference_fasta_file

    mate1 = fastq_file
    mate2 = fastq_file.replace('1.fastq', '2.fastq')
    samfile = tmp + sample + '.sam'
    bamfile = tmp + sample + '.bam'

    # Paired-end alignment, written to the working directory.
    log_writer.info_header(logger, "Running bowtie to generate sam file")
    bowtie_cmd = [
        bowtie_path, '--fr', '--minins', '300', '--maxins', '1100', '-x',
        reference_fasta_file, '-1', mate1, '-2', mate2, '-S', samfile,
        '-k', '99999', '-D', '20', '-R', '3', '-N', '0', '-L', '20', '-i',
        'S,1,0.50'
    ]
    subprocess.call(bowtie_cmd)

    # Adjust the flags so reads may map to more than one location.
    sam_mod = modify_bowtie_sam(samfile, logger)

    log_writer.info_header(logger,
                           "Running samtools to convert sam to bam file")
    subprocess.call([samtools_path, 'view', '-bS', '-o', bamfile, sam_mod])

    log_writer.info_header(logger, "Sort the bam file")
    subprocess.call([samtools_path, 'sort', bamfile, sorted_bam_prefix])

    log_writer.info_header(logger, "Index the BAM file")
    subprocess.call([samtools_path, 'index', sorted_bam])

    # The flagstat output is not used; check_output still raises
    # CalledProcessError when samtools exits with a non-zero status.
    subprocess.check_output([samtools_path, "flagstat", sorted_bam])

    # The working directory is left in place (clean_up is not called here).
    return sorted_bam, reference_fasta_file
def pileupReads(tmp_dir, sorted_bam_file, refFn, samtools, logger):
    """
    Function
    Index the sorted BAM file, then generate a pileup file with the SAMtools
    mpileup command.
    NB: mpileup is run with -B -A -f; the -A flag counts anomalous read pairs.

    The option for method:
    tmp_dir[str]: the directory in which 'all.pileup' will be created
    sorted_bam_file[str]: the path to the BAM file location
    refFn[str]: the path to the reference file location
    samtools[str]: the path to SAMtools command
    logger: logger handle passed to the log_writer helpers
    """

    # 1. Index bam file
    log_writer.info_header(logger, "index bam file")
    process = subprocess.Popen([samtools, 'index', sorted_bam_file],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # 2. Generate pileup file
    pileFn = os.path.join(tmp_dir, 'all.pileup')
    # Binary mode: the lines read from the pipe are bytes. The context
    # manager closes the file even if samtools fails to start.
    with open(pileFn, 'wb') as pileupFile:
        log_writer.info_header(logger, "Generate pileup file")
        process = subprocess.Popen(
            [samtools, 'mpileup', '-B', '-A', '-f', refFn, sorted_bam_file],
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE)
        for line in process.stdout:
            pileupFile.write(line)
        process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
	
	
	
def pileupReads(tmp_dir, sorted_bam_file, refFn, samtools, logger):
    """
    Function
    Index the sorted BAM file, then generate a pileup file with the SAMtools
    mpileup command.
    NB: mpileup is run with -B -A -f; the -A flag counts anomalous read pairs.

    The option for method:
    tmp_dir[str]: the directory in which 'all.pileup' will be created
    sorted_bam_file[str]: the path to the BAM file location
    refFn[str]: the path to the reference file location
    samtools[str]: the path to SAMtools command
    logger: logger handle passed to the log_writer helpers
    """

    #1. Index bam file
    log_writer.info_header(logger, "index bam file")
    index_cmd = [samtools, 'index', sorted_bam_file]
    process = subprocess.Popen(index_cmd,
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    #2. Generate pileup file
    pileFn = os.path.join(tmp_dir, 'all.pileup')
    mpileup_cmd = [samtools, 'mpileup', '-B', '-A', '-f', refFn,
                   sorted_bam_file]
    # The context manager guarantees the pileup file is closed, including
    # when starting samtools or copying its output raises.
    with open(pileFn, 'wb') as pileupFile:
        log_writer.info_header(logger, "Generate pileup file")
        process = subprocess.Popen(mpileup_cmd,
                                   stderr=subprocess.PIPE,
                                   stdout=subprocess.PIPE)
        for line in process.stdout:
            pileupFile.write(line)
        process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
def flanking_regions(profile_file_directory, output_directory, logger):
    """
    Function
    (1) Extract the 100 bp flanking regions upstream and downstream of each
    MLST locus by BLAST against a reference genome. BLAST uses the first
    sequence of each locus file as the query.
    (2) Create summary.txt, a tab-delimited text file with one line per locus:
    locus name, path to the locus file, path to the flanking sequences file.

    The option of the method
    profile_file_directory[str]: The path to the reference.seq, profile.txt and
    the locus variant sequences (*.fas) files location
    output_directory[str]: The path to where the summary.txt file will be created
    logger: logger handle passed to the log_writer helpers
    """

    reference_fasta_file = profile_file_directory + "/reference.seq"
    # The record itself is not used; the read is kept so that an unreadable
    # reference still fails here, before any external tool is run.
    SeqIO.read(reference_fasta_file, "fasta", generic_dna)
    locus_files = sorted(glob.glob(profile_file_directory + "/*.fas"))

    blast_xml = output_directory + "/my_blast_tmp.xml"
    left_flank_file = output_directory + "/tmp_left_flank.fasta"
    right_flank_file = output_directory + "/tmp_right_flank.fasta"

    with open(output_directory + "/summary.txt", "w") as summary_file_handle:
        for seq in locus_files:
            seqBaseName = os.path.splitext(os.path.basename(seq))[0]
            bait = output_directory + '/' + seqBaseName + "_bait.fasta"

            # First sequence of the locus file becomes the BLAST query.
            log_writer.info_header(logger, "create bait file")
            process = subprocess.Popen(
                ['seqret', seq, '-firstonly', '-auto', '-out', bait],
                stderr=subprocess.PIPE,
                stdout=subprocess.PIPE)
            process.wait()
            log_writer.log_process(logger, process, log_error_to="info")

            cline = NcbiblastnCommandline(
                query=bait,
                db=profile_file_directory + "/reference",
                evalue=0.001,
                out=blast_xml,
                outfmt=5)
            cline()
            with open(blast_xml) as result_handle:
                blast_record = NCBIXML.read(result_handle)
            query_length = blast_record.query_letters

            for alignment in blast_record.alignments:
                hsp = alignment.hsps[0]
                # Only hits covering more than half of the query are used.
                if hsp.align_length / float(query_length) <= 0.5:
                    continue

                # Extend the hit to the full query length on the subject;
                # the direction depends on the strand of the hit.
                reverse_strand = hsp.sbjct_start > hsp.sbjct_end
                unaligned_head = hsp.query_start - 1
                unaligned_tail = query_length - hsp.query_end
                if reverse_strand:
                    subject_start = hsp.sbjct_start + unaligned_head
                    subject_end = hsp.sbjct_end - unaligned_tail
                else:
                    subject_start = hsp.sbjct_start - unaligned_head
                    subject_end = hsp.sbjct_end + unaligned_tail
                low = min(subject_start, subject_end)
                high = max(subject_start, subject_end)

                # Argument lists instead of a shell string, so paths with
                # spaces or shell metacharacters are passed through intact.
                flank_jobs = (
                    (low - 100, low - 1, left_flank_file),
                    (high + 1, high + 100, right_flank_file),
                )
                for begin, end, flank_file in flank_jobs:
                    subprocess.call([
                        'seqret', reference_fasta_file, '-sbegin',
                        str(begin), '-send',
                        str(end), '-osformat', 'fasta', '-auto', '-out',
                        flank_file
                    ])

                left_record = SeqIO.read(left_flank_file, "fasta")
                right_record = SeqIO.read(right_flank_file, "fasta")
                if reverse_strand:
                    # On the reverse strand the flanks swap roles and are
                    # reverse-complemented.
                    left_record.id = "down"
                    left_record.seq = left_record.seq.reverse_complement()
                    right_record.id = "up"
                    right_record.seq = right_record.seq.reverse_complement()
                else:
                    left_record.id = "up"
                    right_record.id = "down"
                right_record.description = ""
                left_record.description = ""

                flanks_file = (output_directory + "/" + seqBaseName +
                               "_flanks.fasta")
                with open(flanks_file, "w") as out_handle:
                    out_handle.write(right_record.format("fasta"))
                    out_handle.write(left_record.format("fasta"))
                summary_file_handle.write(
                    '\t'.join([seqBaseName, seq, flanks_file]) + "\n")
def concatenate_flanking_regions(specFn, tmp_dir, bowtie, logger):
    """
    Function
    (1) Concatenate the flanking regions to each corresponding locus variant
    sequence and write the result to reference.fa (fasta format). The newly
    concatenated sequences are then indexed by Bowtie2.
    (2) Then extract and store as pickled objects:
        a. locus names (loci.pkl)
        b. start and end position of each locus variant sequence within the
           concatenated sequence, i.e. without the flanks (ranges.pkl)
        c. the concatenated sequences, keyed by variant id (refSeqs.pkl)

    The option of the method
    specFn[str]: A tab-delimited text file listing, per locus, the locus name
    and the paths to the variants file and the flanks file (summary.txt)
    tmp_dir[str]: The path to where reference.fa, refSeqs.pkl, ranges.pkl and
    loci.pkl will be created
    bowtie[str]: The Bowtie2 command; '-build' is appended to index the reference
    logger: logger handle passed to the log_writer helpers

    Return
    loci[list]: loci names
    """

    specDir = os.path.split(specFn)[0]
    with open(specFn) as spec_handle:
        spc = [l.split() for l in spec_handle]

    refFn = os.path.join(tmp_dir, "reference.fa")
    ranges = {}
    loci = []
    refSeqs = {}
    with open(refFn, "w") as rf:
        for (loc, variantsFn, flanksFn) in spc:
            loci.append(loc)
            fs = {}
            with open(os.path.join(specDir, flanksFn)) as f:
                for r in Bio.SeqIO.parse(f, "fasta"):
                    fs[r.id] = r.seq
            with open(os.path.join(specDir, variantsFn)) as f:
                for r in Bio.SeqIO.parse(f, "fasta"):
                    s = Bio.Seq.MutableSeq('', Bio.Alphabet.generic_dna)
                    s += fs['up']
                    s += r.seq
                    s += fs['down']
                    Bio.SeqIO.write([Bio.SeqRecord.SeqRecord(s, id=r.id)],
                                    rf, "fasta")
                    ranges[r.id] = (len(fs['up']),
                                    len(fs['up']) + len(r.seq))
                    refSeqs[r.id] = s

    # Pickles are written in binary mode, matching readers that open them
    # with 'rb'.
    pickles = (
        ("ranges.pkl", ranges),    # positions of the variant without flanks
        ("loci.pkl", loci),
        ("refSeqs.pkl", refSeqs),  # concatenated sequences
    )
    for pkl_name, payload in pickles:
        with open(os.path.join(tmp_dir, pkl_name), 'wb') as f:
            pickle.dump(payload, f)

    bowtie2_index = bowtie + "-build"
    log_writer.info_header(logger, "bowtie_indexed")
    # generate index of reference fasta for mapping
    process = subprocess.Popen([bowtie2_index, refFn, refFn],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()

    log_writer.log_process(logger, process, log_error_to="info")
    # NOTE(review): this removes 'summary.txt' from the current working
    # directory, not specFn - confirm which file is meant.
    os.system("rm -f summary.txt")
    return loci
def flanking_regions(profile_file_directory, output_directory, logger):
    """
    Function
    (1) Extract the 100 bp flanking regions upstream and downstream of each
    MLST locus by BLAST against a reference genome. BLAST uses the first
    sequence of each locus file as the query.
    (2) Create summary.txt, a tab-delimited text file with one line per locus:
    locus name, path to the locus file, path to the flanking sequences file.

    The option of the method
    profile_file_directory[str]: The path to the reference.seq, profile.txt and
    the locus variant sequences (*.fas) files location
    output_directory[str]: The path to where the summary.txt file will be created
    logger: logger handle passed to the log_writer helpers
    """

    reference_fasta_file = profile_file_directory + "/reference.seq"
    # The parsed record is not used afterwards; the read is kept so that an
    # unreadable reference fails here rather than inside an external tool.
    SeqIO.read(reference_fasta_file, "fasta", generic_dna)
    locus_files = sorted(glob.glob(profile_file_directory + "/*.fas"))

    blast_xml = output_directory + "/my_blast_tmp.xml"
    flank_paths = {
        "left": output_directory + "/tmp_left_flank.fasta",
        "right": output_directory + "/tmp_right_flank.fasta",
    }

    with open(output_directory + "/summary.txt", "w") as summary_file_handle:
        for seq in locus_files:
            (seqDir, seqFileName) = os.path.split(seq)
            (seqBaseName, ext) = os.path.splitext(seqFileName)
            bait = output_directory + '/' + seqBaseName + "_bait.fasta"

            # The first sequence of the locus file is used as BLAST query.
            log_writer.info_header(logger, "create bait file")
            process = subprocess.Popen(
                ['seqret', seq, '-firstonly', '-auto', '-out', bait],
                stderr=subprocess.PIPE, stdout=subprocess.PIPE)
            process.wait()
            log_writer.log_process(logger, process, log_error_to="info")

            cline = NcbiblastnCommandline(
                query=bait, db=profile_file_directory + "/reference",
                evalue=0.001, out=blast_xml, outfmt=5)
            cline()
            # Close the XML handle as soon as the record has been parsed.
            with open(blast_xml) as result_handle:
                blast_record = NCBIXML.read(result_handle)
            query_length = blast_record.query_letters

            for alignment in blast_record.alignments:
                hsp = alignment.hsps[0]
                # Skip hits that cover half of the query or less.
                if hsp.align_length / float(query_length) <= 0.5:
                    continue

                # Project the unaligned query ends onto the subject; the
                # sign of the correction depends on the hit's strand.
                revcomp = -1 if hsp.sbjct_start > hsp.sbjct_end else 1
                subject_start = hsp.sbjct_start - revcomp * (hsp.query_start - 1)
                subject_end = hsp.sbjct_end + revcomp * (query_length - hsp.query_end)

                lower = min(subject_start, subject_end)
                upper = max(subject_start, subject_end)
                coords = {
                    "left": (lower - 100, lower - 1),
                    "right": (upper + 1, upper + 100),
                }
                for side in ("left", "right"):
                    # Argument list rather than os.system on a joined string:
                    # no shell is involved, so paths containing spaces or
                    # metacharacters are handled correctly.
                    subprocess.call([
                        'seqret', reference_fasta_file,
                        '-sbegin', str(coords[side][0]),
                        '-send', str(coords[side][1]),
                        '-osformat', 'fasta', '-auto',
                        '-out', flank_paths[side]])

                left_record = SeqIO.read(flank_paths["left"], "fasta")
                right_record = SeqIO.read(flank_paths["right"], "fasta")
                if revcomp < 0:
                    # Reverse-strand hit: the flanks swap roles and are
                    # reverse-complemented.
                    left_record.id = "down"
                    left_record.seq = left_record.seq.reverse_complement()
                    right_record.id = "up"
                    right_record.seq = right_record.seq.reverse_complement()
                else:
                    left_record.id = "up"
                    right_record.id = "down"
                right_record.description = ""
                left_record.description = ""

                flanks_file = output_directory + "/" + seqBaseName + "_flanks.fasta"
                with open(flanks_file, "w") as out_handle:
                    out_handle.write(right_record.format("fasta"))
                    out_handle.write(left_record.format("fasta"))
                summary_file_handle.write(
                    '\t'.join([seqBaseName, seq, flanks_file]) + "\n")
Пример #12
0
def mapping(input_directory, fastqs, reference_fasta_file_path, output_dir,
            bowtie, samtools, id, logger):
    """

  This function runs bowtie to map the fastq files against the reference file provided.

  :param input_directory: directory whose logs/strep_pneumo_serotyping.stderr file is passed to try_and_except
  :type input_directory: directory
  :param fastqs: the two fastq files (read 1 at index 0, read 2 at index 1).  Names are in the following format: id.workflow.version.suffix, e.g. 1.strep_pneumo.1_1.1.trimmed.fastq
  :type fastqs: list
  :param reference_fasta_file_path: reference fasta file; it is copied to <output_dir>/tmp/reference.fasta and indexed there.
  :type reference_fasta_file_path: file
  :param output_dir: directory receiving the sorted bam file; intermediate files go to <output_dir>/tmp
  :type output_dir: directory
  :param bowtie: path to bowtie
  :type bowtie: path
  :param samtools: path to samtools
  :type samtools: path
  :param id: sample identifier used as the prefix of the output file names
  :type id: string
  :param logger: logger handle passed to the log_writer helpers

  :returns: path to the sorted and indexed bam file (<output_dir>/<id>-sorted.bam).
  :rtype: file

  """
    tmp_dir = output_dir + "/tmp"
    try:
        os.makedirs(tmp_dir)
    except OSError:
        # An already existing directory is fine; anything else is a real
        # creation error and must surface.
        if not os.path.isdir(tmp_dir):
            raise

    bam_sorted = os.path.join(output_dir, id + '-sorted')
    bam_out = os.path.join(output_dir, id + '-sorted.bam')
    sam_parsed = os.path.join(tmp_dir, id + '.tmp')  # temporary sam output
    sam = os.path.join(tmp_dir, id + '.sam')
    bam = os.path.join(tmp_dir, id + '.bam')

    # copy the reference fasta file to the tmp directory and index
    reference_fasta_file = output_dir + "/tmp/reference.fasta"
    shutil.copyfile(reference_fasta_file_path, reference_fasta_file)
    # Function-call form prints the same text on Python 2 and is valid
    # syntax on Python 3.
    print("running bowtie index")
    bowtie_index = bowtie + "-build"
    log_writer.info_header(logger, "Creating reference_fasta_fileerence index")
    # generate index of the reference fasta for mapping
    # NOTE(review): wait() with both streams piped can stall if the child
    # fills a pipe before exiting; left as is because log_process receives
    # the process object afterwards - confirm how it reads the pipes.
    process = subprocess.Popen(
        [bowtie_index, reference_fasta_file, reference_fasta_file],
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process)

    # run bowtie, writing the sam file to tmp
    cmd = [
        bowtie, '--fr', '--minins', '300', '--maxins', '1100', '-x',
        reference_fasta_file, '-1', fastqs[0], '-2', fastqs[1], '-S', sam,
        '-k', '99999', '-D', '20', '-R', '3', '-N', '0', '-L', '20', '-i',
        'S,1,0.50'
    ]
    log_writer.info_header(logger, "Running bowtie to generate sam file")
    process = subprocess.Popen(cmd,
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # run remove_secondary_mapping_bit on the sam file; its output
    # (sam_parsed) is the file that is converted to bam.
    try_and_except(input_directory + "/logs/strep_pneumo_serotyping.stderr",
                   remove_secondary_mapping_bit, sam, sam_parsed)

    log_writer.info_header(logger, "Convert sam to bam")
    process = subprocess.Popen(
        [samtools, 'view', '-bhS', '-o', bam, sam_parsed],
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # sort bam
    log_writer.info_header(logger, "Sort the bam file")
    process = subprocess.Popen([samtools, 'sort', bam, bam_sorted],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # index bam
    log_writer.info_header(logger, "Index the BAM file")
    process = subprocess.Popen([samtools, 'index', bam_sorted + ".bam"],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    return bam_out
def create_bam_file(tmp_dir, fastq_files, refFn, expand, bowtie, samtools, ids, logger):
    """
    Function
    (1) Map the read set against the reference by calling Bowtie2 (with the
    very sensitive options). With expand, the alignment is first written to
    a tmp file.
    (2) With expand, convert the tmp file to the sam file by calling
    remove_secondary_mapping_bit
    (3) Convert the sam to BAM file
    (4) Sort the BAM file

    The option for method:
    tmp_dir[str]: the path to where the tmp, SAM, BAM and sorted BAM files will be created
    fastq_files[list]: the paths to the two fastq files (read 1, read 2)
    refFn[str]: the path to the reference file location
    expand[bool]: when true, run remove_secondary_mapping_bit on the Bowtie2 output
    bowtie[str]: the path to Bowtie2 command
    samtools[str]: the path to SAMtools command
    ids[str]: unique identifier used as prefix of the file names
    logger: logger handle passed to the log_writer helpers

    Return
    sorted_bam_file[str]: path to the sorted BAM file (<tmp_dir>/<ids>-all.bam)
    """

    tmp = os.path.join(tmp_dir, ids + '.tmp')  # temporary sam output
    sam = os.path.join(tmp_dir, ids + '.sam')

    # 1. Align. With expand, Bowtie2 writes to the tmp file, which is
    # post-processed into the sam file below; otherwise straight to sam.
    if expand:
        log_writer.info_header(logger, "Creating tmp file")
        bowtie_out = tmp
    else:
        log_writer.info_header(logger, "Creating sam file")
        bowtie_out = sam
    # -k = report up to 99999 good alignments per read.
    # --very-sensitive option = -D 20 -R 3 -N 0 -L 20 -i S,1,0.50
    process = subprocess.Popen([bowtie, '--fr',
                                '--no-unal',
                                '--minins', '300', '--maxins', '1100',
                                '-x', refFn,
                                '-1', fastq_files[0], '-2', fastq_files[1],
                                '-S', bowtie_out,
                                '-k', '99999', '-D', '20', '-R', '3', '-N', '0', '-L', '20', '-i', 'S,1,0.50'],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    if expand:
        # 2. remove_secondary_mapping_bit receives the two paths; no file
        # handles need to be opened here.
        log_writer.info_header(logger, "remove_secondary_mapping_bit")
        remove_secondary_mapping_bit(tmp, sam)

    # 3. Converting sam to bam file
    bam = os.path.join(tmp_dir, ids + '.unsortedbam')
    log_writer.info_header(logger, "Converting sam to bam")
    process = subprocess.Popen([samtools, 'view',
                                '-bhS',
                                '-o', bam, sam],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # 4. Sort bam file; samtools receives the output prefix out0 and the
    # returned path is that prefix plus '.bam'.
    out0 = os.path.join(tmp_dir, ids + '-all')
    log_writer.info_header(logger, "Sorting bam")
    sorted_bam_file = os.path.join(tmp_dir, ids + '-all.bam')
    process = subprocess.Popen([samtools, 'sort', bam, out0],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    return sorted_bam_file
def create_bam_file(tmp_dir, fastq_files, refFn, expand, bowtie, samtools, ids,
                    logger):
    """
    Function
    (1) Map the read set against the reference by calling Bowtie2 (with the
    very sensitive options). With expand, the alignment is first written to
    a tmp file.
    (2) With expand, convert the tmp file to the sam file by calling
    remove_secondary_mapping_bit
    (3) Convert the sam to BAM file
    (4) Sort the BAM file

    The option for method:
    tmp_dir[str]: the path to where the tmp, SAM, BAM and sorted BAM files will be created
    fastq_files[list]: the paths to the two fastq files (read 1, read 2)
    refFn[str]: the path to the reference file location
    expand[bool]: when true, run remove_secondary_mapping_bit on the Bowtie2 output
    bowtie[str]: the path to Bowtie2 command
    samtools[str]: the path to SAMtools command
    ids[str]: unique identifier used as prefix of the file names
    logger: logger handle passed to the log_writer helpers

    Return
    sorted_bam_file[str]: path to the sorted BAM file (<tmp_dir>/<ids>-all.bam)
    """

    tmp = os.path.join(tmp_dir, ids + '.tmp')  # temporary sam output
    sam = os.path.join(tmp_dir, ids + '.sam')

    def run_bowtie(alignment_out):
        # -k = report up to 99999 good alignments per read.
        # --very-sensitive option = -D 20 -R 3 -N 0 -L 20 -i S,1,0.50
        process = subprocess.Popen([
            bowtie, '--fr', '--no-unal', '--minins', '300', '--maxins', '1100',
            '-x', refFn, '-1', fastq_files[0], '-2', fastq_files[1], '-S',
            alignment_out, '-k', '99999', '-D', '20', '-R', '3', '-N', '0',
            '-L', '20', '-i', 'S,1,0.50'
        ],
                                   stderr=subprocess.PIPE,
                                   stdout=subprocess.PIPE)
        process.wait()
        log_writer.log_process(logger, process, log_error_to="info")

    if expand:
        #1. Creating tmp file
        log_writer.info_header(logger, "Creating tmp file")
        run_bowtie(tmp)

        #2.remove_secondary_mapping_bit
        # The helper is given the two paths, so the file handles that used
        # to be opened around this call were never used and are gone.
        log_writer.info_header(logger, "remove_secondary_mapping_bit")
        remove_secondary_mapping_bit(tmp, sam)
    else:
        log_writer.info_header(logger, "Creating sam file")
        run_bowtie(sam)

    #3.Converting sam to bam file
    bam = os.path.join(tmp_dir, ids + '.unsortedbam')
    log_writer.info_header(logger, "Converting sam to bam")
    process = subprocess.Popen([samtools, 'view', '-bhS', '-o', bam, sam],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    #4.Sort bam file
    # samtools receives the output prefix out0; the returned path is that
    # prefix plus '.bam'.
    out0 = os.path.join(tmp_dir, ids + '-all')
    log_writer.info_header(logger, "Sorting bam")
    sorted_bam_file = os.path.join(tmp_dir, ids + '-all.bam')
    process = subprocess.Popen([samtools, 'sort', bam, out0],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    return sorted_bam_file
def getNovelAllele(variant, locus, fastq_files, bowtie, samtools, ids, tmp_dir, logger):
    """
    Generate the reference FASTA, BAM and pileup files for a novel allele.

    Steps performed, all inside tmp_dir:
      1. Load refSeqs.pkl and write the sequence stored under
         "<locus>-<variant>" to "<locus>-<variant>.fa".
      2. Index that FASTA with "<bowtie>-build".
      3. Call create_bam_file() and rename the BAM it returns to
         "<ids>.<locus>-<variant>.bam".
      4. Index the renamed BAM with "samtools index".
      5. Write the output of "samtools mpileup" to
         "<ids>.<locus>-<variant>.pileup".

    The option for method:
    variant[str]: locus variant number
    locus[str]: locus name
    fastq_files: fastq input, forwarded unchanged to create_bam_file()
    bowtie[str]: the path to Bowtie2 command ("-build" is appended for indexing)
    samtools[str]: the path to SAMtools command
    ids[str]: sample unique identifier number
    tmp_dir[str]: the path to where SAM, BAM, Pileup will be created; must
        already contain refSeqs.pkl
    logger: logger object handed to the log_writer helpers

    Return
    None; the results are the files written to tmp_dir.

    Raises
    KeyError: if "<locus>-<variant>" is not a key of the pickled refSeqs.
    """
    allele_name = locus + "-" + variant

    # NOTE: pickle must only be used on trusted files; refSeqs.pkl is expected
    # to be the file written by concatenate_flanking_regions().
    with open(os.path.join(tmp_dir, "refSeqs.pkl"), 'rb') as pkl:
        refSeqs = pickle.load(pkl)

    # Look the sequence up before creating the FASTA so that an unknown
    # allele does not leave an empty .fa file behind.
    s = refSeqs[allele_name]
    typeFn = os.path.join(tmp_dir, allele_name + ".fa")
    with open(typeFn, "w") as typeFile:
        Bio.SeqIO.write([Bio.SeqRecord.SeqRecord(s, id=allele_name)], typeFile, "fasta")

    # index reference sample (the FASTA path doubles as the index prefix)
    bowtie2_index = bowtie + "-build"
    log_writer.info_header(logger, "index refrence sample")
    process = subprocess.Popen([bowtie2_index, typeFn, typeFn],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    # NOTE(review): wait() on a child with PIPE'd output can block if the child
    # fills the pipe; kept because log_process presumably reads the pipes
    # afterwards -- confirm against log_writer.
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # create sam and bam files; the fourth positional argument (False) is
    # passed through as in the original call -- see create_bam_file for its meaning
    log_writer.info_header(logger, "Creating sam and bam files")
    bam = create_bam_file(tmp_dir,
                          fastq_files,
                          typeFn,
                          False,
                          bowtie,
                          samtools,
                          ids,
                          logger)

    # name bam file
    bamFn = os.path.join(tmp_dir, ids + "." + allele_name + ".bam")
    process = subprocess.Popen(['mv', bam, bamFn],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # index bam file (header is logged before the step, like every other step)
    log_writer.info_header(logger, "index bam file")
    process = subprocess.Popen([samtools, 'index', bamFn],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # generate pileup file
    piFn = os.path.join(tmp_dir, ids + "." + allele_name + '.pileup')
    log_writer.info_header(logger, "generate pileup file")
    with open(piFn, 'wb') as f:
        # stdout goes straight to the pileup file; communicate() drains stderr
        # so a chatty samtools cannot stall on a full, unread pipe.
        process = subprocess.Popen([samtools, 'mpileup',
                                    '-B', '-A', '-cf', typeFn, bamFn],
                                   stderr=subprocess.PIPE, stdout=f)
        process.communicate()
def concatenate_flanking_regions(specFn, tmp_dir, bowtie, logger):
    """
    Function
    (1) Concatenate flanking regions to the corresponding locus variant
        sequences and write them, in fasta format, to tmp_dir/reference.fa.
        The new reference is then indexed with "<bowtie>-build".
    (2) Store as pickled objects in tmp_dir:
        a. locus names, in spec-file order (loci.pkl)
        b. start and end position of each locus variant sequence inside the
           concatenated sequence, i.e. without the flanks (ranges.pkl)
        c. the concatenated sequences keyed by variant record id (refSeqs.pkl)
    (3) Best-effort removal of a "summary.txt" in the current working directory.

    The option of the method
    specFn[str]: path to a whitespace-delimited text file with three columns
        per line: locus name, variants fasta file, flanks fasta file. The two
        file names are resolved relative to the directory of specFn. Blank
        lines are ignored.
    tmp_dir[str]: the path to where reference.fa, refSeqs.pkl, ranges.pkl and
        loci.pkl will be created
    bowtie[str]: the path to the Bowtie2 command ("-build" is appended)
    logger: logger object handed to the log_writer helpers

    Each flanks fasta must contain records with the ids "up" and "down".

    Return
    None. The locus list is persisted in loci.pkl rather than returned.

    Raises
    ValueError: if a non-blank spec line does not have exactly three columns.
    KeyError: if a flanks file lacks an "up" or "down" record.
    """
    specDir = os.path.dirname(specFn)

    # Parse the spec file; skip blank lines so they do not break the
    # three-column unpacking below.
    spc = []
    with open(specFn) as spec_file:
        for line in spec_file:
            fields = line.split()
            if fields:
                spc.append(fields)

    refFn = os.path.join(tmp_dir, "reference.fa")
    ranges = {}
    loci = []
    refSeqs = {}
    with open(refFn, "w") as rf:
        for (loc, variantsFn, flanksFn) in spc:
            loci.append(loc)

            # flanking sequences of this locus, keyed by record id
            fs = {}
            with open(os.path.join(specDir, flanksFn)) as flanks_file:
                for r in Bio.SeqIO.parse(flanks_file, "fasta"):
                    fs[r.id] = r.seq

            with open(os.path.join(specDir, variantsFn)) as variants_file:
                for r in Bio.SeqIO.parse(variants_file, "fasta"):
                    # NOTE(review): Bio.Alphabet is absent from recent
                    # Biopython releases -- confirm the pinned version.
                    s = Bio.Seq.MutableSeq('', Bio.Alphabet.generic_dna)
                    s += fs['up']
                    s += r.seq
                    s += fs['down']
                    Bio.SeqIO.write([Bio.SeqRecord.SeqRecord(s, id=r.id)], rf, "fasta")
                    # half-open [start, end) of the variant inside s
                    ranges[r.id] = (len(fs['up']), len(fs['up']) + len(r.seq))
                    refSeqs[r.id] = s

    # Pickle streams are bytes: binary mode is required on Python 3 and is
    # also what the 'rb' readers of these files expect.
    for pkl_name, payload in (("ranges.pkl", ranges),
                              ("loci.pkl", loci),
                              ("refSeqs.pkl", refSeqs)):
        with open(os.path.join(tmp_dir, pkl_name), 'wb') as f:
            pickle.dump(payload, f)

    # generate index of reference fasta for mapping
    bowtie2_index = bowtie + "-build"
    log_writer.info_header(logger, "bowtie_indexed")
    process = subprocess.Popen(
        [bowtie2_index, refFn, refFn],
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE
    )
    process.wait()

    log_writer.log_process(logger, process, log_error_to="info")

    # Same best-effort semantics as `rm -f summary.txt`, without a shell:
    # a missing or unremovable file is deliberately ignored.
    try:
        os.remove("summary.txt")
    except OSError:
        pass
def mapping(input_directory, fastqs, reference_fasta_file_path, output_dir, bowtie, samtools, id, logger, threads):
    """
    Map paired fastq files against a reference with bowtie and produce a
    sorted, indexed BAM file.

    Intermediate files (reference copy and its index, sam, parsed sam,
    unsorted bam) are written to ``<output_dir>/tmp``, which is created if it
    does not exist yet.

    :param input_directory: directory whose ``logs/strep_pneumo_serotyping.stderr``
        path is handed to ``try_and_except``
    :type input_directory: path
    :param fastqs: the two paired fastq files; ``fastqs[0]`` is passed to
        bowtie as ``-1`` and ``fastqs[1]`` as ``-2``
    :type fastqs: sequence of two paths
    :param reference_fasta_file_path: reference fasta; it is copied to
        ``<output_dir>/tmp/reference.fasta`` and indexed there
    :type reference_fasta_file_path: file
    :param output_dir: directory that receives ``<id>-sorted.bam`` and its index
    :type output_dir: path
    :param bowtie: path to bowtie (``-build`` is appended for indexing)
    :type bowtie: path
    :param samtools: path to samtools
    :type samtools: path
    :param id: sample identifier used as the prefix of every output file name
    :type id: str
    :param logger: logger object handed to the ``log_writer`` helpers
    :param threads: thread count passed to bowtie (``-p``) and samtools (``-@``)

    :returns: path of the sorted and indexed bam file, ``<output_dir>/<id>-sorted.bam``
    :rtype: str

    :raises OSError: if ``<output_dir>/tmp`` cannot be created
    """
    tmp_dir = output_dir + "/tmp"
    try:
        os.makedirs(tmp_dir)
    except OSError:
        # An already existing directory is fine; anything else is a real error.
        if not os.path.isdir(tmp_dir):
            raise

    bam_out = os.path.join(output_dir, id + '-sorted.bam')
    sam_parsed = os.path.join(tmp_dir, id + '.tmp')  # temporary sam output
    sam = os.path.join(tmp_dir, id + '.sam')
    bam = os.path.join(tmp_dir, id + '.bam')

    # copy the reference fasta file to the tmp directory and index it there
    reference_fasta_file = tmp_dir + "/reference.fasta"
    shutil.copyfile(reference_fasta_file_path, reference_fasta_file)
    # Parenthesised single-string form prints identically on Python 2 and 3.
    print("running bowtie index")
    bowtie_index = bowtie + "-build"
    # Log text kept verbatim.
    log_writer.info_header(logger, "Creating reference_fasta_fileerence index")
    # NOTE(review): wait() on a child with PIPE'd output can block if the child
    # fills the pipe; kept because log_process presumably reads the pipes
    # afterwards -- confirm against log_writer.
    process = subprocess.Popen([bowtie_index, reference_fasta_file, reference_fasta_file],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process)

    # run bowtie, writing the sam file to the tmp directory
    cmd = [
        bowtie,
        '--fr', '--minins', '300', '--maxins', '1100',
        '-x', reference_fasta_file,
        '-1', fastqs[0], '-2', fastqs[1],
        '-S', sam,
        '-k', '99999', '-D', '20', '-R', '3', '-N', '0', '-L', '20',
        '-i', 'S,1,0.50',
        '-p', str(threads),
    ]
    log_writer.info_header(logger, "Running bowtie to generate sam file")
    process = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # remove_secondary_mapping_bit turns sam into sam_parsed, which is the file
    # converted to bam below (original note: it deducts 256 from any flag
    # above 256).
    try_and_except(input_directory + "/logs/strep_pneumo_serotyping.stderr",
                   remove_secondary_mapping_bit, sam, sam_parsed)

    log_writer.info_header(logger, "Convert sam to bam")
    process = subprocess.Popen([samtools, 'view', '-bhS', '-o', bam, '-@', str(threads), sam_parsed],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # sort bam
    log_writer.info_header(logger, "Sort the bam file")
    process = subprocess.Popen([samtools, 'sort', '-o', bam_out, '-@', str(threads), bam],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # index bam
    log_writer.info_header(logger, "Index the BAM file")
    process = subprocess.Popen([samtools, 'index', bam_out],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    return bam_out
def getNovelAllele(variant, locus, fastq_files, bowtie, samtools, ids, tmp_dir, logger):
    """
    Generate the reference FASTA, BAM and pileup files for a novel allele.

    Steps performed, all inside tmp_dir:
      1. Load refSeqs.pkl and write the sequence stored under
         "<locus>-<variant>" to "<locus>-<variant>.fa".
      2. Index that FASTA with "<bowtie>-build".
      3. Call create_bam_file() and rename the BAM it returns to
         "<ids>.<locus>-<variant>.bam".
      4. Index the renamed BAM with "samtools index".
      5. Write the output of "samtools mpileup" to
         "<ids>.<locus>-<variant>.pileup".

    The pickle is read and the pileup is written in binary mode, so the
    function behaves the same on Python 2 and Python 3.

    The option for method:
    variant[str]: locus variant number
    locus[str]: locus name
    fastq_files: fastq input, forwarded unchanged to create_bam_file()
    bowtie[str]: the path to Bowtie2 command ("-build" is appended for indexing)
    samtools[str]: the path to SAMtools command
    ids[str]: sample unique identifier number
    tmp_dir[str]: the path to where SAM, BAM, Pileup will be created; must
        already contain refSeqs.pkl
    logger: logger object handed to the log_writer helpers

    Return
    None; the results are the files written to tmp_dir.

    Raises
    KeyError: if "<locus>-<variant>" is not a key of the pickled refSeqs.
    """
    allele_name = locus + "-" + variant

    # Pickle streams are bytes, hence 'rb'.
    # NOTE: pickle must only be used on trusted files; refSeqs.pkl is expected
    # to be the file written by concatenate_flanking_regions().
    with open(os.path.join(tmp_dir, "refSeqs.pkl"), 'rb') as pkl:
        refSeqs = pickle.load(pkl)

    # Look the sequence up before creating the FASTA so that an unknown
    # allele does not leave an empty .fa file behind.
    s = refSeqs[allele_name]
    typeFn = os.path.join(tmp_dir, allele_name + ".fa")
    with open(typeFn, "w") as typeFile:
        Bio.SeqIO.write([Bio.SeqRecord.SeqRecord(s, id=allele_name)], typeFile, "fasta")

    # index reference sample (the FASTA path doubles as the index prefix)
    bowtie2_index = bowtie + "-build"
    log_writer.info_header(logger, "index refrence sample")
    process = subprocess.Popen([bowtie2_index, typeFn, typeFn],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    # NOTE(review): wait() on a child with PIPE'd output can block if the child
    # fills the pipe; kept because log_process presumably reads the pipes
    # afterwards -- confirm against log_writer.
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # create sam and bam files; the fourth positional argument (False) is
    # passed through as in the original call -- see create_bam_file for its meaning
    log_writer.info_header(logger, "Creating sam and bam files")
    bam = create_bam_file(tmp_dir,
                          fastq_files,
                          typeFn,
                          False,
                          bowtie,
                          samtools,
                          ids,
                          logger)

    # name bam file
    bamFn = os.path.join(tmp_dir, ids + "." + allele_name + ".bam")
    process = subprocess.Popen(['mv', bam, bamFn],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # index bam file (header is logged before the step, like every other step)
    log_writer.info_header(logger, "index bam file")
    process = subprocess.Popen([samtools, 'index', bamFn],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # generate pileup file
    piFn = os.path.join(tmp_dir, ids + "." + allele_name + '.pileup')
    log_writer.info_header(logger, "generate pileup file")
    with open(piFn, 'wb') as f:
        # stdout goes straight to the pileup file; communicate() drains stderr
        # so a chatty samtools cannot stall on a full, unread pipe.
        process = subprocess.Popen([samtools, 'mpileup',
                                    '-B', '-A', '-cf', typeFn, bamFn],
                                   stderr=subprocess.PIPE, stdout=f)
        process.communicate()