def filterAlignmentByReferenceIDs(cls, inputFname, outputFname=None,
                                  refNameSet = set(['Contig0', 'Contig1', 'Contig2']),
                                  readGroup="", platform='LS454'):
    """
    Filter the alignments of one BAM file by reference name into a single
    output BAM file.

    2011-7-8

    The header of inputFname is deep-copied, optionally extended with a
    read-group entry, and its "SQ" list is reduced to the entries whose
    'SN' value is in refNameSet. The per-read work is delegated to
    cls.FilterAlignmentByReferenceIDs, driven by
    samfile.traverseBamByRead() (both are defined outside this function).

    Arguments:
        inputFname: path of the input BAM file, opened via BamFile(..., 'rb').
        outputFname: path of the output BAM file, handed to pysam.Samfile.
        refNameSet: reference names ('SN' of the header "SQ" entries) to keep.
        readGroup: if non-empty, a read group with this ID and SM is appended
            to the output header and the value is passed on to the processor.
        platform: used as both 'PL' and 'LB' of the appended read group.

    Returns None. The output file is closed even if the traversal fails.
    """
    import sys
    import copy
    import pysam

    samfile = BamFile(inputFname, 'rb')
    # work on a copy so the header held by the input file is left untouched
    header = copy.deepcopy(samfile.header)

    if readGroup:
        # create the read-group list if the header has none
        if "RG" not in header:
            header['RG'] = []
        # NOTE(review): appended even if an entry with the same ID is already
        # present in header['RG'] -- confirm that duplicates cannot occur.
        header['RG'].append({'ID':readGroup, 'PL':platform, 'LB':platform, 'SM':readGroup})

    # remove SQ entries that are not in refNameSet, from the header
    header["SQ"] = [SQ_entry for SQ_entry in header["SQ"] if SQ_entry['SN'] in refNameSet]

    bamOutputF = pysam.Samfile(outputFname, 'wb', header=header)    # template=samfile)
    try:
        sys.stderr.write("Retain reads from %s only from these references %s ...\n"%(inputFname, refNameSet))
        processor = cls.FilterAlignmentByReferenceIDs(refNameSet=refNameSet, readGroup=readGroup,
                                                      bamOutputF=bamOutputF)
        samfile.traverseBamByRead(processor=processor)
    finally:
        # always release the output handle, also when the traversal raises
        bamOutputF.close()
def filterAlignmentByReferenceIDs(cls, inputFname, outputDir=None,
                                  refNameSet = set(['Contig0', 'Contig1', 'Contig2']),
                                  readGroup="", platform='LS454'):
    """
    Filter the alignments of one BAM file by reference name into one output
    BAM file per reference.

    2011-7-8

    For every name in refNameSet an output file
    "<outputDir>/<input base name without extension>_<refName>.bam" is
    opened with a deep copy of the input header (optionally extended with a
    read-group entry). The per-read work is delegated to
    cls.FilterAlignmentByReferenceIDs, driven by
    samfile.traverseBamByRead() (both are defined outside this function).

    Arguments:
        inputFname: path of the input BAM file, opened via BamFile(..., 'rb').
        outputDir: directory in which the per-reference BAM files are created.
        refNameSet: reference names; one output file is opened for each.
        readGroup: if non-empty, a read group with this ID and SM is appended
            to the output header and the value is passed on to the processor.
        platform: used as both 'PL' and 'LB' of the appended read group.

    Returns None. Every output file that was opened is closed, even if
    opening a later file or the traversal fails.
    """
    import os
    import sys
    import copy
    import pysam

    samfile = BamFile(inputFname, 'rb')
    # work on a copy so the header held by the input file is left untouched
    header = copy.deepcopy(samfile.header)

    if readGroup:
        # create the read-group list if the header has none
        if "RG" not in header:
            header['RG'] = []
        # NOTE(review): appended even if an entry with the same ID is already
        # present in header['RG'] -- confirm that duplicates cannot occur.
        header['RG'].append({'ID':readGroup, 'PL':platform, 'LB':platform, 'SM':readGroup})

    # the prefix depends only on the input file, so compute it once
    inputFileBaseNamePrefix = os.path.splitext(os.path.basename(inputFname))[0]

    refName2bamOutputF = {}
    try:
        for refName in refNameSet:
            outputFname = os.path.join(outputDir, '%s_%s.bam'%(inputFileBaseNamePrefix, refName))
            refName2bamOutputF[refName] = pysam.Samfile(outputFname, 'wb', header=header)    # template=samfile)

        sys.stderr.write("Retain reads from %s only from these references %s ...\n"%(inputFname, refNameSet))
        processor = cls.FilterAlignmentByReferenceIDs(refName2bamOutputF=refName2bamOutputF,
                                                      refNameSet=refNameSet, readGroup=readGroup)
        samfile.traverseBamByRead(processor=processor)
    finally:
        # close whatever was opened so far; values() works on Python 2 and 3
        for bamOutputF in refName2bamOutputF.values():
            bamOutputF.close()