def exportToPlot(args):
    """
    Export the mapped data for use in interactive plotting tools.

    The mapped data is converted to a pandas dataframe that carries
    restriction site information and is written to a compressed npz archive
    at args.plotfile. A second archive, args.plotfile + '_extra', stores the
    two lookups that link restriction sites and read ids for interaction
    down the line.
    """
    config = mc.loadIni(args.inifile)

    print('Loading restrsites, this takes a while...')
    # .item() unwraps the single object stored under 'restrsites'.
    siteLookup = np.load(args.restfile)['restrsites'].item()
    print('Finished loading, moving on')

    regionLookup, readLookup, frame = mc.exportToPlot(
        config, siteLookup, args.bamfile)

    # Values, column labels and index labels are stored as separate entries
    # so the dataframe can be rebuilt from the archive later on.
    np.savez_compressed(
        args.plotfile,
        pdframe=frame,
        pdcolumns=frame.columns,
        pdindex=frame.index)
    np.savez_compressed(
        args.plotfile + '_extra',
        byregion=regionLookup,
        byread=readLookup)
def markDuplicates(args):
    """
    Flag the reads that are most likely PCR duplicates.

    Identification is based on reads having overlap with each other that is
    not in the viewport. The stored pandas dataframe is loaded, a boolean
    'Duplicate' column is added to the end of it and the result is written
    to a compressed npz archive at args.outfile.
    """
    config = mc.loadIni(args.inifile)

    extraArchive = np.load(args.extra)
    # The per-read lookup is expected under 'byread'; fall back to 'byreads'
    # when that key is missing from the archive.
    try:
        readLookup = extraArchive['byread'].item()
    except KeyError:
        readLookup = extraArchive['byreads'].item()
    regionLookup = extraArchive['byregion'].item()

    frameArchive = np.load(args.pdframe)
    frame = pd.DataFrame(
        frameArchive['pdframe'],
        columns=frameArchive['pdcolumns'],
        index=frameArchive['pdindex'])

    duplicateIds = mc.findDuplicates(config, readLookup, regionLookup)

    # np.where returns a plain array, so the flags are assigned row by row
    # instead of being re-aligned on the index labels.
    inDuplicateSet = pd.Series(frame.index).isin(duplicateIds)
    frame['Duplicate'] = np.where(inDuplicateSet, True, False)

    np.savez_compressed(
        args.outfile,
        pdframe=frame,
        pdcolumns=frame.columns,
        pdindex=frame.index)
def cleaveReads(args):
    """
    Cleave the reads by primer sequences.

    The primer sequences come from the 'prm_seq' entry of the ini file. Cut
    information is obtained from mc.combinePrimers using args.bamfile and the
    primer lengths, and is then applied by mc.applyCuts to args.fastqfile,
    writing to args.outfile.
    """
    config = mc.loadIni(args.inifile)
    lengthPerPrimer = [len(seq) for seq in config['prm_seq']]

    # The primer list handed to applyCuts starts with an empty string,
    # followed by the configured primer sequences.
    primerList = [''] + list(config['prm_seq'])

    cuts = mc.combinePrimers(args.bamfile, lengthPerPrimer)
    mc.applyCuts(args.fastqfile, args.outfile, cuts, primerList)
def findRefRestSites(args):
    """
    Locate the restriction sites on the reference genome.

    Reads the fasta file at args.fastafile, using the restriction sequences
    ('re_seq') from the ini file and the line length given by args.linelen.
    The result, a dictionary holding one list per chromosome, is saved under
    the key 'restrsites' in a compressed npz archive at args.restfile.
    """
    config = mc.loadIni(args.inifile)
    siteLookup = mc.findReferenceRestSites(
        args.fastafile,
        config['re_seq'],
        lineLen=args.linelen)
    np.savez_compressed(args.restfile, restrsites=siteLookup)
def cleaveReads(args):
    """
    Cut the reads at the primer sequences. Requires BowTie2 information.

    Primer sequences are read from the 'prm_seq' entry of the ini file; the
    cuts are derived from args.bamfile and applied to args.fastqfile, with
    the result written to args.outfile.
    """
    config = mc.loadIni(args.inifile)
    primerSizes = [len(primer) for primer in config['prm_seq']]

    # An empty string is placed in front of the configured primers before
    # the list is handed to applyCuts.
    allPrimers = [''] + list(config['prm_seq'])

    cutSites = mc.combinePrimers(args.bamfile, primerSizes)
    mc.applyCuts(args.fastqfile, args.outfile, cutSites, allPrimers)
def exportToPlot(args):
    """
    Export the mapped data as npz archives for plotting.

    Loads the restriction site lookup from args.restfile and passes it,
    together with the ini settings and args.bamfile, to mc.exportToPlot.
    The returned dataframe is saved to args.plotfile (values, columns and
    index as separate entries); the returned by-region and by-read lookups
    are saved to args.plotfile + '_extra'.
    """
    config = mc.loadIni(args.inifile)

    print('Loading restrsites, this takes a while...')
    # .item() unwraps the single object stored under 'restrsites'.
    restSites = np.load(args.restfile)['restrsites'].item()
    print('Finished loading, moving on')

    perRegion, perRead, plotFrame = mc.exportToPlot(
        config, restSites, args.bamfile)

    np.savez_compressed(
        args.plotfile,
        pdframe=plotFrame,
        pdcolumns=plotFrame.columns,
        pdindex=plotFrame.index)
    np.savez_compressed(
        args.plotfile + '_extra',
        byregion=perRegion,
        byread=perRead)
def markDuplicates(args):
    """
    Add a boolean 'Duplicate' column to a stored dataframe.

    The dataframe is rebuilt from the npz archive at args.pdframe and the
    by-read / by-region lookups are taken from the archive at args.extra.
    Rows whose index label is part of the collection returned by
    mc.findDuplicates are flagged True. The extended dataframe is written to
    a compressed npz archive at args.outfile.
    """
    config = mc.loadIni(args.inifile)

    lookups = np.load(args.extra)
    # Try the 'byread' key first and fall back to 'byreads' if it is absent.
    try:
        perRead = lookups['byread'].item()
    except KeyError:
        perRead = lookups['byreads'].item()
    perRegion = lookups['byregion'].item()

    stored = np.load(args.pdframe)
    plotFrame = pd.DataFrame(
        stored['pdframe'],
        columns=stored['pdcolumns'],
        index=stored['pdindex'])

    duplicates = mc.findDuplicates(config, perRead, perRegion)

    # The mask is converted to a plain array by np.where, which makes the
    # column assignment positional rather than label-aligned.
    labelMask = pd.Series(plotFrame.index).isin(duplicates)
    plotFrame['Duplicate'] = np.where(labelMask, True, False)

    np.savez_compressed(
        args.outfile,
        pdframe=plotFrame,
        pdcolumns=plotFrame.columns,
        pdindex=plotFrame.index)
def getRefResPositions(args):
    """
    Extract a subset of restriction site positions from the reference genome.

    The subset is the inclusive index range that mc.mapToRefSite returns for
    the first viewport chromosome ('vp_chr') and the first viewport start and
    end ('vp_start', 'vp_end') from the ini file. The selected entries are
    stored as a dataframe with 'start' and 'stop' columns, indexed by their
    position in the chromosome's site list, in a compressed npz archive at
    args.outfile.
    """
    config = mc.loadIni(args.inifile)

    # Echo the viewport settings that were read from the ini file.
    print('%s %s' % ([config['vp_chr']],
                     [config['vp_start'], config['vp_end']]))

    print('Loading restrsites, this takes a while...')
    siteLookup = np.load(args.restfile)['restrsites'].item()
    print('Finished loading, moving on')

    chromSites = siteLookup[config['vp_chr'][0]]
    viewport = [config['vp_start'][0], config['vp_end'][0]]
    bounds = mc.mapToRefSite(chromSites, viewport)

    # Both bounds are included in the selection.
    siteIndices = range(bounds[0], bounds[1] + 1)
    selectedSites = [chromSites[i] for i in siteIndices]

    frame = pd.DataFrame(
        selectedSites,
        index=siteIndices,
        columns=['start', 'stop'])
    np.savez_compressed(
        args.outfile,
        pdframe=frame,
        pdcolumns=frame.columns,
        pdindex=frame.index)
def makePrimerFasta(args):
    """
    Write the primer sequences to a fasta file.

    The sequences are obtained from the ini settings via mc.getPrimerSeqs
    and written to args.outfile by mc.writePrimerFasta.
    """
    config = mc.loadIni(args.inifile)
    sequences = mc.getPrimerSeqs(config)
    mc.writePrimerFasta(sequences, args.outfile)
def splitReads(args):
    """
    Split the reads by restriction site information.

    The restriction sequences ('re_seq') from the ini file are handed to
    mc.findRestrictionSeqs together with the input args.fastqfile and the
    output args.outfile.
    """
    config = mc.loadIni(args.inifile)
    restrictionSeqs = config['re_seq']
    # TODO: Substitute reference genome with reads (?)
    mc.findRestrictionSeqs(
        args.fastqfile,
        args.outfile,
        restrictionSeqs)
def makePrimerFasta(args):
    """
    Turn the primer sequences from the ini file into a fasta file.

    mc.getPrimerSeqs extracts the sequences from the loaded settings and
    mc.writePrimerFasta writes them to args.outfile.
    """
    config = mc.loadIni(args.inifile)
    primers = mc.getPrimerSeqs(config)
    mc.writePrimerFasta(primers, args.outfile)
def findRefRestSites(args):
    """
    Determine the restriction sites on the reference genome and save them.

    The fasta file at args.fastafile is passed to mc.findReferenceRestSites
    with the restriction sequences ('re_seq') from the ini file. The result
    is stored under the key 'restrsites' in a compressed npz archive at
    args.restfile.
    """
    config = mc.loadIni(args.inifile)
    # NOTE(review): another definition of this function in the file also
    # forwards lineLen=args.linelen; this one uses the helper's default.
    # Confirm which of the two is intended.
    siteLookup = mc.findReferenceRestSites(args.fastafile, config['re_seq'])
    np.savez_compressed(args.restfile, restrsites=siteLookup)
def splitReads(args):
    """
    Split the reads using the restriction sequences from the ini file.

    Calls mc.findRestrictionSeqs with the input args.fastqfile, the output
    args.outfile and the 're_seq' setting.
    """
    config = mc.loadIni(args.inifile)
    cutSeqs = config['re_seq']
    # TODO: Substitute reference genome with reads (?)
    mc.findRestrictionSeqs(
        args.fastqfile,
        args.outfile,
        cutSeqs)