def main(inputfile, outputfile, gfffile, gffmin, gffmax, takeStop, upstream, downstream, verbose):
    """Keep the PAR-CLIP sites that lie around annotation boundaries and save them.

    Sites are read from ``inputfile``; annotations are read from ``gfffile``
    and filtered with ``anno.filterSize(gffmin, gffmax)``. Every site for
    which ``anno.isAround(...)[1]`` is truthy is copied into a new container
    that is finally written to ``outputfile``.

    Args:
        inputfile: path of the PAR-CLIP site table to read.
        outputfile: path the selected sites are saved to.
        gfffile: path of the GFF annotation file.
        gffmin: lower bound handed to ``anno.filterSize``.
        gffmax: upper bound handed to ``anno.filterSize``.
        takeStop: if truthy, ``isAround`` is called with ``takeStart=False``
            (presumably anchoring on annotation stops instead of starts --
            TODO confirm against ``gff.GFF.isAround``).
        upstream: forwarded unchanged to ``isAround``.
        downstream: forwarded unchanged to ``isAround``.
        verbose: if truthy, print a progress indicator.
    """
    takeStart = not takeStop

    sites = ParclipSiteContainer()
    sites.loadFromFile(inputfile)

    anno = gff.GFF(gfffile)
    anno.filterSize(gffmin, gffmax)
    anno.getChromosomePositions()
    if anno.size() < 10:
        print('Warning: Low number of annotation enries! ' + str(anno.size()))

    fsites = ParclipSiteContainer()
    n_sites = sites.size()
    percent_old = 0
    for i in range(n_sites):
        if anno.isAround(sites.chrs[i], sites.pos[i], sites.strand[i],
                         takeStart, upstream, downstream)[1]:
            fsites.addSite(sites.chrs[i], sites.pos[i], sites.m[i], sites.r[i],
                           sites.result[i], sites.strand[i], sites.occ[i])

        # Only refresh the progress display when the integer percentage moves.
        percent_new = round(i / n_sites * 100)
        if percent_new > percent_old:
            if verbose:
                # The loop runs over the sites, so the total must be the
                # number of sites (it used to be the number of annotations).
                functions.showProgress(i, n_sites, 'selecting sites')
            percent_old = percent_new

    fsites.save2File(outputfile)
def main(parclipA, parclipB, outfile, width, verbose):
    """Write a 10x10 matrix of Jaccard indices between bins of two PAR-CLIP files.

    Both files are split into ten bins with ``getEntries(container, lo, hi)``
    using the decile borders 0, 0.1, ..., 1.0. For every pair of bins the
    number of B-sites for which ``aq.exactSearch(..., width=width)[1]`` is
    truthy is taken as the intersection, and
    ``intersection / (|A| + |B| - intersection)`` is written to ``outfile``
    rounded to four digits. Each row (one bin of A) is a line of
    tab-terminated values.

    Args:
        parclipA: path of the first PAR-CLIP site table (matrix rows).
        parclipB: path of the second PAR-CLIP site table (matrix columns).
        outfile: path of the text file the matrix is written to.
        width: forwarded to ``exactSearch`` as ``width``.
        verbose: if truthy, print a progress indicator.
    """
    quantiles = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    n_bins = len(quantiles) - 1
    total = n_bins * n_bins
    total_count = 0
    if verbose:
        functions.showProgress(total_count, total, 'Calculating Jaccard-Index')

    # The context manager closes the file even if a lookup below raises.
    with open(outfile, 'w') as fc:
        for q1 in range(n_bins):
            # NOTE(review): the containers are reloaded for every bin, as in
            # the original code; getEntries may modify its argument -- confirm
            # before hoisting the loads out of the loops.
            a = ParclipSiteContainer()
            a.loadFromFile(parclipA)
            aq = getEntries(a, quantiles[q1], quantiles[q1 + 1])
            for q2 in range(n_bins):
                b = ParclipSiteContainer()
                b.loadFromFile(parclipB)
                bq = getEntries(b, quantiles[q2], quantiles[q2 + 1])

                intersect = 0
                for j in range(bq.size()):
                    if aq.exactSearch(bq.chrs[j], bq.pos[j], bq.strand[j], width=width)[1]:
                        intersect += 1

                union = aq.size() + bq.size() - intersect
                # Two empty bins have an empty union; report 0.0 instead of
                # aborting with ZeroDivisionError.
                jaccard = intersect / union if union else 0.0
                fc.write(str(round(jaccard, 4)) + '\t')

                total_count += 1
                if verbose:
                    functions.showProgress(total_count, total, 'Calculating Jaccard-Index')
            fc.write('\n')
    print('')
def main(inputfile, outputfile):
    """Replace the ``occ`` value of every site by ``m / r`` and save the table.

    Args:
        inputfile: path of the PAR-CLIP site table to read. If it is not an
            existing file, a message is printed and the process exits with
            status -1.
        outputfile: path the updated table is saved to.
    """
    # Guard clause: bail out before touching any container.
    if not os.path.isfile(inputfile):
        print('Inputfile: '+inputfile+' does not exist')
        sys.exit(-1)

    container = ParclipSiteContainer()
    container.loadFromFile(inputfile)

    n_sites = container.size()
    for idx in range(n_sites):
        container.occ[idx] = container.m[idx] / container.r[idx]

    container.save2File(outputfile)
def main(parclipA, parclipB, start, stop, width, anno=None, annowidth=100, logRatio=False, verbose=False):
    """Score how strongly the sites of B co-occur with a slice of the sites of A.

    A is loaded and sorted with ``sort(key='occ')``. Starting at index
    ``start``, up to ``stop - start`` sites are selected; if ``anno`` is
    given, only sites for which
    ``anno.isInside(chr, pos, strand, annowidth, annowidth)[1]`` is truthy
    count towards that number. For every selected site the values returned by
    ``dataB.getValues(chr, pos, strand, True, width, width)`` are looked up
    and their maximum is accumulated.

    Args:
        parclipA: path of the PAR-CLIP table the sites are selected from.
        parclipB: path of the PAR-CLIP table that is queried around them.
        start: first index (after sorting) to consider; must be >= 0.
        stop: end index; must satisfy ``start <= stop < number of A sites``.
        width: forwarded twice to ``getValues`` (presumably the window on
            either side of the site -- TODO confirm).
        anno: optional annotation object providing ``isInside``.
        annowidth: forwarded twice to ``anno.isInside``.
        logRatio: if truthy, return the log2 of the score divided by
            ``functions.getQuantile(dataB.occ, 0.5)``.
        verbose: if truthy, print progress indicators.

    Returns:
        ``(1 + sum of maxima) / (1 + number of sites with values)``, or its
        log2 ratio as described for ``logRatio``. The sum and the count both
        start at 1, so the division is always defined.

    Exits the process via ``sys.exit()`` after printing a message when the
    ``start``/``stop`` range is invalid.
    """
    tmpA = ParclipSiteContainer()
    dataB = ParclipSiteContainer()
    tmpA.loadFromFile(parclipA)
    tmpA.sort(key='occ')
    dataB.loadFromFile(parclipB)

    n_sites = tmpA.size()
    if start < 0 or stop < start or stop >= n_sites:
        print('Bullshit start and stop indices. Come on! Concentrate!')
        sys.exit()

    # Same container type as the inputs, so the selection exposes the same
    # chrs/pos/strand/size interface that is used below.
    dataA = ParclipSiteContainer()
    total = stop - start
    count = 0
    i = start
    # Scan up to and including the last site; the previous bound of
    # ``size - 1`` silently skipped it when an annotation filter was active.
    while count < total and i < n_sites:
        if verbose:
            functions.showProgress(count, total - 1, 'Selecting PAR-CLIP sites')
        if anno is None or anno.isInside(tmpA.chrs[i], tmpA.pos[i], tmpA.strand[i],
                                         annowidth, annowidth)[1]:
            dataA.addSite(tmpA.chrs[i], tmpA.pos[i], tmpA.m[i], tmpA.r[i],
                          tmpA.result[i], tmpA.strand[i], tmpA.occ[i])
            count += 1
        i += 1

    # Both accumulators start at 1 (kept from the original implementation).
    coloc = 1
    count_coloc = 1
    if verbose:
        print('\n')
    for i in range(dataA.size()):
        values = dataB.getValues(dataA.chrs[i], dataA.pos[i], dataA.strand[i],
                                 True, width, width)
        if values is not None:
            count_coloc += 1
            coloc += max(values)
        if verbose:
            functions.showProgress(i, (dataA.size() - 1), 'Collecting colocolization data')
    coloc = coloc / count_coloc
    if verbose:
        print('')

    if logRatio:
        return math.log(coloc / functions.getQuantile(dataB.occ, 0.5), 2)
    return coloc
def main(parclipfile, outputfile, gfffile, downstream, upstream, gene, sense, minSize, maxSize, verbose, vstring=''):
    """Write one tab-separated line of PAR-CLIP values per annotation.

    For every annotation kept by ``filterSize(minSize, maxSize)`` two
    ``getValues`` lookups are made: one with the window ``(upstream, gene)``
    and one with the window ``(gene, downstream)``. On the '+' strand the
    first is anchored at the annotation start and the second at its stop; on
    any other strand the anchors are swapped. A line is written only when
    both lookups return something other than ``None``.

    Args:
        parclipfile: path of the PAR-CLIP site table.
        outputfile: path of the text file to write.
        gfffile: path of the GFF annotation file.
        downstream: second window size of the second lookup.
        upstream: first window size of the first lookup.
        gene: the window size shared by both lookups.
        sense: forwarded unchanged to ``getValues``.
        minSize: lower bound handed to ``filterSize``.
        maxSize: upper bound handed to ``filterSize``.
        verbose: if truthy, show progress and print a final newline.
        vstring: label passed to the progress display.
    """
    annotation = gff.GFF(gfffile)
    annotation.filterSize(minSize, maxSize)
    sites = ParclipSiteContainer()
    sites.loadFromFile(parclipfile)

    with open(outputfile, 'w') as out_handle:
        n_annotations = annotation.size()
        for idx in range(n_annotations):
            if verbose:
                functions.showProgress(idx, (n_annotations - 1), vstring)

            chrom = annotation.chr[idx]
            strand = annotation.strand[idx]
            # Choose the anchor positions first so the lookups are written once.
            if strand == '+':
                first_anchor = annotation.start[idx]
                second_anchor = annotation.stop[idx]
            else:
                first_anchor = annotation.stop[idx]
                second_anchor = annotation.start[idx]

            first_part = sites.getValues(chrom, first_anchor, strand, sense, upstream, gene)
            second_part = sites.getValues(chrom, second_anchor, strand, sense, gene, downstream)
            if first_part is None or second_part is None:
                continue
            print(*chain(first_part, second_part), sep='\t', file=out_handle)

    if verbose:
        print()
from mockinbird.utils import ParclipSiteContainer


if __name__ == '__main__':
    # Command-line entry point: load PAR-CLIP sites, optionally drop those
    # located in a GFF annotation, sort them by the chosen key and write the
    # sequences around them to a fasta file.
    parser = argparse.ArgumentParser(
        description='Takes PAR-CLIP sites and a genome and saves genomic sequences as fasta file around PAR-CLIP sites according to the given parameters.',
        epilog="contact: [email protected]")
    parser.add_argument('sites', help='PAR-CLIP file *.table')
    parser.add_argument('genome', help='path to genome')
    parser.add_argument('fafile', help='output filename')
    # argparse ignores ``default`` on a positional argument unless it is made
    # optional with nargs='?'; without it the defaults advertised in the help
    # texts below could never take effect. Invocations that pass every
    # argument are parsed exactly as before.
    parser.add_argument('filterGFF', nargs='?', default='',
                        help="set path to GFF if sites should be removed that overlap with the GFF [default = '']")
    parser.add_argument('start', nargs='?', type=int, default=0,
                        help='start index of PAR-CLIP sites [default=0]')
    parser.add_argument('stop', nargs='?', type=int, default=1500,
                        help='stop index of PAR-CLIP sites [default=1500]')
    parser.add_argument('width', nargs='?', type=int, default=15,
                        help='number of nt +/- the crosslink site [default=15]')
    parser.add_argument('additionalFilterWidth', nargs='?', type=int, default=20,
                        help='number of nt that are added to the start/stop indices of the GFF annotations')
    parser.add_argument('key', nargs='?', default='occ',
                        help='set key that is used for PAR-CLIP site ordering [default = \'occ\'], options: [\'occ\', \'m\', \'r\', \'mr\', \'pvalue\']')
    parser.add_argument('-v', '--verbose', dest='verbose', action="store_true", default=False,
                        help='verbose output')
    args = parser.parse_args()

    yeast = genome.Genome(args.genome, False)

    sites = ParclipSiteContainer()
    sites.loadFromFile(args.sites)
    if args.verbose:
        print('#sites               : '+str(sites.size()))

    # An empty filterGFF means "do not filter".
    if args.filterGFF != '':
        anno = gff.GFF(args.filterGFF)
        sites = sites.removeSitesLocatedInGFF(anno, args.additionalFilterWidth)
        print('#sites after removal: '+str(sites.size()))

    sites.sort(args.key)
    sites.save2Fasta(yeast, args.fafile, args.start, args.stop, args.width)