예제 #1
0
파일: mc4c.py 프로젝트: jing-wan/pymc4c
def exportToPlot(args):
    """Export the mapped data to npz files for interactive plotting tools.

    Originally written to easily import the data into interactive plotting
    tools. ``mc.exportToPlot`` converts the mapped data to a pandas dataframe
    and adds restriction site information. Two files are written: one with
    the dataframe, and an ``_extra`` file linking restriction sites and read
    ids for interaction down the line.

    Attributes read from ``args``:
        inifile:  settings file passed to ``mc.loadIni``.
        restfile: npz archive holding a ``restrsites`` entry.
        bamfile:  passed through to ``mc.exportToPlot``.
        plotfile: output name; ``plotfile`` and ``plotfile + '_extra'`` are
                  written with ``np.savez_compressed``.
    """
    settings = mc.loadIni(args.inifile)

    # Single-argument print(...) behaves the same under Python 2 and 3.
    print('Loading restrsites, this takes a while...')
    # ``.item()`` unwraps a Python object stored in an object array; such
    # arrays are pickled on disk and NumPy >= 1.16.3 refuses to load them
    # unless allow_pickle=True is given.
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with np.load(args.restfile, allow_pickle=True) as restFile:
        restrefs = restFile['restrsites'].item()
    print('Finished loading, moving on')

    byRegion, byRead, pdFrame = mc.exportToPlot(settings, restrefs,
                                                args.bamfile)

    # The dataframe is stored as three arrays (values, columns, index) so it
    # can be rebuilt later with pd.DataFrame(values, columns=..., index=...).
    np.savez_compressed(args.plotfile,
                        pdframe=pdFrame,
                        pdcolumns=pdFrame.columns,
                        pdindex=pdFrame.index)

    np.savez_compressed(args.plotfile + '_extra',
                        byregion=byRegion,
                        byread=byRead)
예제 #2
0
파일: mc4c.py 프로젝트: jing-wan/pymc4c
def markDuplicates(args):
    """Flag reads that are most likely PCR duplicates.

    Identification (done by ``mc.findDuplicates``) is based on reads having
    overlap with each other that is not in the viewport. The stored pandas
    dataframe is rebuilt, a boolean ``Duplicate`` column is added to the end
    of it, and the result is written back out as an npz.

    Attributes read from ``args``:
        inifile: settings file passed to ``mc.loadIni``.
        extra:   npz archive with ``byread`` (or ``byreads``) and
                 ``byregion`` entries.
        pdframe: npz archive with ``pdframe``, ``pdcolumns`` and ``pdindex``.
        outfile: output name for ``np.savez_compressed``.
    """
    settings = mc.loadIni(args.inifile)

    # The archives hold pickled Python objects (``.item()`` unwraps them);
    # NumPy >= 1.16.3 needs allow_pickle=True to load object arrays.
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with np.load(args.extra, allow_pickle=True) as exFile:
        try:
            byRead = exFile['byread'].item()
        except KeyError:
            # Fall back to the alternative key spelling.
            byRead = exFile['byreads'].item()
        byRegion = exFile['byregion'].item()

    # NpzFile members are read lazily, so build the frame before closing.
    with np.load(args.pdframe, allow_pickle=True) as pdFile:
        pdFrame = pd.DataFrame(pdFile['pdframe'],
                               columns=pdFile['pdcolumns'],
                               index=pdFile['pdindex'])

    dupSet = mc.findDuplicates(settings, byRead, byRegion)

    # Membership is tested on the index labels; ``.values`` yields a plain
    # boolean array so the assignment is positional rather than aligned on
    # the temporary Series' 0..n-1 index.
    pdFrame['Duplicate'] = pd.Series(pdFrame.index).isin(dupSet).values

    np.savez_compressed(args.outfile,
                        pdframe=pdFrame,
                        pdcolumns=pdFrame.columns,
                        pdindex=pdFrame.index)
예제 #3
0
def cleaveReads(args):
    """Apply primer-derived cuts to the reads in ``args.fastqfile``.

    Loads the settings from ``args.inifile``, hands the length of every
    ``prm_seq`` entry together with ``args.bamfile`` to
    ``mc.combinePrimers``, and passes the resulting cuts to ``mc.applyCuts``,
    which receives ``args.fastqfile`` and ``args.outfile``.
    """
    config = mc.loadIni(args.inifile)
    lengths = [len(seq) for seq in config['prm_seq']]

    # The primer list starts with an empty string ahead of the real primers.
    # NOTE(review): presumably a placeholder so primer numbering starts at 1;
    # confirm against mc.applyCuts.
    primerList = ['']
    primerList.extend(config['prm_seq'])

    cuts = mc.combinePrimers(args.bamfile, lengths)
    mc.applyCuts(args.fastqfile, args.outfile, cuts, primerList)
예제 #4
0
파일: mc4c.py 프로젝트: jing-wan/pymc4c
def findRefRestSites(args):
    """Determine the location of restriction sites on the reference genome.

    Takes a fasta file (``args.fastafile``) and the ``re_seq`` setting from
    ``args.inifile``; per the original documentation the results are a list
    per chromosome in a dictionary. The result of
    ``mc.findReferenceRestSites`` is saved under the key ``restrsites`` in
    the compressed npz ``args.restfile``.
    """
    reSeqs = mc.loadIni(args.inifile)['re_seq']
    siteDict = mc.findReferenceRestSites(
        args.fastafile, reSeqs, lineLen=args.linelen)
    np.savez_compressed(args.restfile, restrsites=siteDict)
예제 #5
0
파일: mc4c.py 프로젝트: jing-wan/pymc4c
def cleaveReads(args):
    """Cleave the reads by primer sequences. Requires BowTie2 information.

    Reads ``args.inifile``, ``args.bamfile`` and ``args.fastqfile``; the
    output location ``args.outfile`` is handed to ``mc.applyCuts``.
    """
    primerSeqs = mc.loadIni(args.inifile)['prm_seq']
    seqLengths = [len(seq) for seq in primerSeqs]

    # An empty string is placed in front of the configured primers.
    # NOTE(review): presumably a placeholder for "no primer"; confirm
    # against mc.applyCuts.
    allPrimers = ['']
    allPrimers.extend(primerSeqs)

    cutSites = mc.combinePrimers(args.bamfile, seqLengths)
    mc.applyCuts(args.fastqfile, args.outfile, cutSites, allPrimers)
예제 #6
0
def exportToPlot(args):
    """Export the mapped data to npz files for later plotting.

    Loads the settings and the stored ``restrsites`` object, lets
    ``mc.exportToPlot`` produce a by-region mapping, a by-read mapping and a
    pandas dataframe, and writes them to two compressed npz archives.

    Attributes read from ``args``:
        inifile:  settings file passed to ``mc.loadIni``.
        restfile: npz archive holding a ``restrsites`` entry.
        bamfile:  passed through to ``mc.exportToPlot``.
        plotfile: output name; ``plotfile`` receives the dataframe and
                  ``plotfile + '_extra'`` the two mappings.
    """
    settings = mc.loadIni(args.inifile)

    # Single-argument print(...) behaves the same under Python 2 and 3.
    print('Loading restrsites, this takes a while...')
    # ``.item()`` unwraps a Python object stored in an object array; such
    # arrays are pickled on disk and NumPy >= 1.16.3 refuses to load them
    # unless allow_pickle=True is given.
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with np.load(args.restfile, allow_pickle=True) as restFile:
        restrefs = restFile['restrsites'].item()
    print('Finished loading, moving on')

    byRegion, byRead, pdFrame = mc.exportToPlot(
        settings, restrefs, args.bamfile)

    # Values, columns and index are stored separately so the frame can be
    # rebuilt with pd.DataFrame(values, columns=..., index=...).
    np.savez_compressed(args.plotfile,
                        pdframe=pdFrame,
                        pdcolumns=pdFrame.columns,
                        pdindex=pdFrame.index)

    np.savez_compressed(args.plotfile + '_extra',
                        byregion=byRegion,
                        byread=byRead)
예제 #7
0
def markDuplicates(args):
    """Add a boolean ``Duplicate`` column to a stored dataframe.

    The by-read and by-region mappings from ``args.extra`` are handed to
    ``mc.findDuplicates``; every row whose index label is in the returned
    collection is marked ``True``. The extended frame is written to
    ``args.outfile`` as a compressed npz.

    Attributes read from ``args``:
        inifile: settings file passed to ``mc.loadIni``.
        extra:   npz archive with ``byread`` (or ``byreads``) and
                 ``byregion`` entries.
        pdframe: npz archive with ``pdframe``, ``pdcolumns`` and ``pdindex``.
        outfile: output name for ``np.savez_compressed``.
    """
    settings = mc.loadIni(args.inifile)

    # The archives hold pickled Python objects (``.item()`` unwraps them);
    # NumPy >= 1.16.3 needs allow_pickle=True to load object arrays.
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with np.load(args.extra, allow_pickle=True) as exFile:
        try:
            byRead = exFile['byread'].item()
        except KeyError:
            # Fall back to the alternative key spelling.
            byRead = exFile['byreads'].item()
        byRegion = exFile['byregion'].item()

    # NpzFile members are read lazily, so build the frame before closing.
    with np.load(args.pdframe, allow_pickle=True) as pdFile:
        pdFrame = pd.DataFrame(pdFile['pdframe'],
                               columns=pdFile['pdcolumns'],
                               index=pdFile['pdindex'])

    dupSet = mc.findDuplicates(settings, byRead, byRegion)

    # ``.values`` yields a plain boolean array, so the column is assigned
    # positionally instead of being aligned on the temporary Series' index.
    pdFrame['Duplicate'] = pd.Series(pdFrame.index).isin(dupSet).values

    np.savez_compressed(args.outfile,
                        pdframe=pdFrame,
                        pdcolumns=pdFrame.columns,
                        pdindex=pdFrame.index)
예제 #8
0
파일: mc4c.py 프로젝트: jing-wan/pymc4c
def getRefResPositions(args):
    """Extract a subset of restriction site positions from the reference genome.

    The first ``vp_chr`` / ``vp_start`` / ``vp_end`` settings are mapped by
    ``mc.mapToRefSite`` to a pair of indices into that chromosome's
    restriction site list. The entries from the first to the last index
    (inclusive) are stored as a dataframe with columns ``start`` and
    ``stop``, indexed by their position in the site list, and written to
    ``args.outfile`` as a compressed npz.

    Attributes read from ``args``:
        inifile:  settings file passed to ``mc.loadIni``.
        restfile: npz archive holding a ``restrsites`` entry.
        outfile:  output name for ``np.savez_compressed``.
    """
    settings = mc.loadIni(args.inifile)

    # Formatted explicitly so the output matches the Python 2 statement
    # ``print [chr], [start, end]`` and also runs under Python 3.
    print('%s %s' % ([settings['vp_chr']],
                     [settings['vp_start'], settings['vp_end']]))

    print('Loading restrsites, this takes a while...')
    # ``.item()`` unwraps a Python object stored in an object array; such
    # arrays are pickled on disk and NumPy >= 1.16.3 refuses to load them
    # unless allow_pickle=True is given.
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with np.load(args.restfile, allow_pickle=True) as restFile:
        restrefs = restFile['restrsites'].item()
    print('Finished loading, moving on')

    chromSites = restrefs[settings['vp_chr'][0]]
    result = mc.mapToRefSite(chromSites,
                             [settings['vp_start'][0], settings['vp_end'][0]])

    # Inclusive index range, built once and reused for rows and row labels.
    siteIds = list(range(result[0], result[1] + 1))
    refPosList = [chromSites[i] for i in siteIds]

    pdFrame = pd.DataFrame(refPosList,
                           index=siteIds,
                           columns=['start', 'stop'])

    np.savez_compressed(args.outfile,
                        pdframe=pdFrame,
                        pdcolumns=pdFrame.columns,
                        pdindex=pdFrame.index)
예제 #9
0
파일: mc4c.py 프로젝트: jing-wan/pymc4c
def makePrimerFasta(args):
    """Turn primer sequences into a fasta file.

    The sequences come from ``mc.getPrimerSeqs`` applied to the settings in
    ``args.inifile``; ``mc.writePrimerFasta`` receives them together with
    ``args.outfile``.
    """
    seqs = mc.getPrimerSeqs(mc.loadIni(args.inifile))
    mc.writePrimerFasta(seqs, args.outfile)
예제 #10
0
파일: mc4c.py 프로젝트: jing-wan/pymc4c
def splitReads(args):
    """Split the reads by restriction site information based on the reference genome.

    Passes ``args.fastqfile``, ``args.outfile`` and the ``re_seq`` setting
    from ``args.inifile`` to ``mc.findRestrictionSeqs``.
    """
    # TODO: Substitute reference genome with reads (?)
    reSeqs = mc.loadIni(args.inifile)['re_seq']
    mc.findRestrictionSeqs(args.fastqfile, args.outfile, reSeqs)
예제 #11
0
def makePrimerFasta(args):
    """Write the primer sequences derived from the settings to ``args.outfile``.

    Settings are loaded from ``args.inifile``, converted with
    ``mc.getPrimerSeqs`` and handed to ``mc.writePrimerFasta``.
    """
    config = mc.loadIni(args.inifile)
    mc.writePrimerFasta(mc.getPrimerSeqs(config), args.outfile)
예제 #12
0
def findRefRestSites(args):
    """Save the result of ``mc.findReferenceRestSites`` as a compressed npz.

    The ``re_seq`` setting from ``args.inifile`` and ``args.fastafile`` are
    passed to ``mc.findReferenceRestSites``; whatever it returns is stored
    under the key ``restrsites`` in ``args.restfile``.
    """
    reSeqs = mc.loadIni(args.inifile)['re_seq']
    siteDict = mc.findReferenceRestSites(args.fastafile, reSeqs)
    np.savez_compressed(args.restfile, restrsites=siteDict)
예제 #13
0
def splitReads(args):
    """Run ``mc.findRestrictionSeqs`` on the reads in ``args.fastqfile``.

    The ``re_seq`` setting from ``args.inifile`` is passed along with
    ``args.fastqfile`` and ``args.outfile``.
    """
    # TODO: Substitute reference genome with reads (?)
    config = mc.loadIni(args.inifile)
    mc.findRestrictionSeqs(args.fastqfile, args.outfile, config['re_seq'])