def filterGTF(gtf, filterstring, tempout):

    if "!=" in filterstring:
        column, value = filterstring.split("!=")
        value = value.split("+")
        filtertype = "notin"

    elif "=" in filterstring:
        column, value = filterstring.split("=")
        value = value.split("+")
        filtertype = "in"

    elif "-in_file-" in filterstring:
        column, value = filterstring.split("-in_file-")
        value = [line.strip() for line in iotools.open_file(value)]
        filtertype = "in_file"

    elif "-notin_file-" in filterstring:
        column, value = filterstring.split("-notin_file-")
        value = [line.strip() for line in iotools.open_file(value)]
        filtertype = "notin_file"

    elif "-morethan-" in filterstring:
        column, value = filterstring.split("-morethan-")
        value = float(value)
        filtertype = "morethan"

    elif "-lessthan-" in filterstring:
        column, value = filterstring.split("-lessthan-")
        value = float(value)
        filtertype = "lessthan"

    gfile = iotools.open_file(gtf)
    G = GTF.iterator(gfile)

    out = iotools.open_file(tempout, "w")
    for item in G:
        D = item.asDict()
        D['contig'] = item.contig
        D['source'] = item.source
        D['feature'] = item.feature
        D['start'] = item.start
        D['end'] = item.end
        D['strand'] = item.strand
        D['frame'] = item.frame

        if filtertype == "in" or filtertype == 'in_file':
            if D[column] in value:
                out.write("%s\n" % str(item))
        elif filtertype == "notin" or filtertype == 'notin_file':
            if D[column] not in value:
                out.write("%s\n" % str(item))
        elif filtertype == "morethan":
            if float(D[column]) > value:
                out.write("%s\n" % str(item))
        elif filtertype == "lessthan":
            if float(D[column]) < value:
                out.write("%s\n" % str(item))

    out.close()
    gfile.close()
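# A minimal usage sketch (not from the original source), showing the
# filter-string forms that filterGTF accepts, as inferred from the branches
# above. Filenames and attribute names are hypothetical.
def _example_filterGTF_usage():
    # keep records whose gene_biotype is any of several "+"-separated values
    filterGTF("input.gtf.gz", "gene_biotype=protein_coding+lincRNA", "kept.gtf")
    # discard records on a given contig
    filterGTF("input.gtf.gz", "contig!=chrM", "kept.gtf")
    # keep records whose gene_id appears in a file (one identifier per line)
    filterGTF("input.gtf.gz", "gene_id-in_file-keep_ids.txt", "kept.gtf")
    # numeric comparison on an attribute
    filterGTF("input.gtf.gz", "exon_number-lessthan-3", "kept.gtf")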
def CleanVariantTables(genes, variants, cols, outfile):

    variants = pd.read_csv(variants, sep="\t")
    variants = variants.drop(0)

    vp1 = copy.copy(
        variants[['CHROM', 'POS', 'QUAL', 'ID', 'REF1', 'ALT', 'GT']])
    alleles = vp1['REF1'].str.cat(vp1['ALT'].str.strip(),
                                  sep=",").str.split(",")
    # "." denotes a missing genotype index - treat it as the reference allele
    vp1['GT'] = vp1['GT'].str.replace(".", "0", regex=False)
    inds1 = vp1['GT'].str.get(0).astype(int).values
    inds2 = vp1['GT'].str.get(-1).astype(int).values

    x = 0
    a1s = []
    a2s = []
    gts = []
    homhet = []
    for allele in alleles:
        i1 = int(inds1[x])
        i2 = int(inds2[x])
        a1 = allele[i1]
        a2 = allele[i2]
        a1s.append(a1)
        a2s.append(a2)
        if a1 == a2:
            homhet.append("HOM")
        else:
            homhet.append("HET")
        gts.append("%s%s" % (a1, a2))
        x += 1

    vp1['HOMHET'] = homhet
    vp1['Allele1'] = a1s
    vp1['Allele2'] = a2s
    vp1['Genotype'] = gts
    vp1 = vp1.drop(['REF1', 'ALT', 'GT'], axis=1)
    vp1[cols] = copy.copy(variants[cols])

    Ls = []
    for gene in [line.strip()
                 for line in iotools.open_file(genes[0]).readlines()]:
        cp = []
        with iotools.open_file(genes[1]) as infile:
            for line in infile:
                r = re.search(gene, line)
                if r:
                    line = line.strip().split("\t")
                    chrom = line[0]
                    pos = line[1]
                    cp.append("%s_%s" % (chrom, pos))
        cp = set(cp)
        for c in cp:
            Ls.append((gene, c.split("_")))

    df = pd.DataFrame(Ls)
    df['CHROM'] = df[1].str.get(0)
    df['POS'] = df[1].str.get(1)
    df = df.drop(1, axis=1)
    df.columns = ['gene', 'CHROM', 'POS']

    variants = vp1.merge(df, 'left')
    variants.to_csv(outfile, sep="\t")
def chunk_iterator_lines(infile, args, prefix, use_header=False):
    """split by lines."""

    chunk_size = args[0]
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = iotools.open_file(filename, "w")
    header = None

    for line in infile:
        if line[0] == "#":
            continue

        if not header and n == 0 and use_header:
            header = line
            outfile.write(header)
            continue

        n += 1

        if n % chunk_size == 0:
            outfile.close()
            yield filename
            filename = "%s/%010i.in" % (prefix, n)
            outfile = iotools.open_file(filename, "w")
            if header:
                outfile.write(header)

        outfile.write(line)

    outfile.close()
    yield filename
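# A minimal usage sketch (not from the original source): split a file into
# chunks of 1000 lines, repeating the header line in every chunk. Filenames
# are hypothetical; the chunk directory is assumed to exist already, and
# iotools refers to the same module used above.
def _example_chunk_iterator_lines_usage():
    with iotools.open_file("input.tsv") as infile:
        for chunk in chunk_iterator_lines(infile, args=[1000],
                                          prefix="chunks.dir",
                                          use_header=True):
            E.info("wrote chunk %s" % chunk)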
def chunk_iterator_regex_split(infile, args, prefix, use_header=False):
    """split where regular expression is true.
    """

    rex = args[0]
    chunk_size = args[2]
    max_lines = args[3]
    nlines = 0
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = iotools.open_file(filename, "w")

    for line in infile:
        if line[0] == "#":
            continue

        if rex.search(line[:-1]):
            if n > 0 and (n % chunk_size == 0 or
                          (max_lines and nlines > max_lines)):
                outfile.close()
                yield filename
                filename = "%s/%010i.in" % (prefix, n)
                outfile = iotools.open_file(filename, "w")
                nlines = 0

            n += 1

        outfile.write(line)
        nlines += 1

    outfile.close()
    yield filename
def main(argv):

    def _add_input(parser):
        parser.add_option("--data-dir", default=".")
        parser.add_option("--force", default=False, action="store_true")
        parser.add_option("--min-depth", default=0, type="int")
        parser.add_option("--follow-links", default=False, action="store_true")
        parser.add_option("--limit-metrics", default=0, type="int")
        parser.add_option("--output-filename-metrics")
        parser.add_option("--input-filename-metrics")

    P.initialize(argv, callback=_add_input)
    options = E.get_args()

    if options.config_file:
        PARAMS = P.get_parameters(options.config_file)
    else:
        sys.exit(P.main(options))

    if os.path.exists("results.commit"):
        if not options.force:
            raise ValueError(
                "a results.commit file already exists. Please remove "
                "before uploading.")

    data_dir = os.path.abspath(options.data_dir)

    if options.input_filename_metrics:
        with IOTools.open_file(options.input_filename_metrics) as inf:
            infiles = [x.strip() for x in inf.readlines() if x.strip()]
        if options.limit_metrics:
            infiles = infiles[:options.limit_metrics]
    else:
        E.info(f"collecting files to upload starting in {data_dir}")
        infiles = []
        for root, dirs, files in os.walk(data_dir,
                                         followlinks=options.follow_links):
            E.debug(f"working on {root}: dirs={len(dirs)}, files={len(files)}")
            # ignore first level (tools) (needs better check)
            depth = root[len(data_dir):].count(os.sep)
            if "benchmark.info" in files:
                if depth <= options.min_depth:
                    E.info(f"skipping - depth not high enough: {depth}")
                else:
                    infiles.append(os.path.join(root, "benchmark.info"))

            if options.limit_metrics and len(infiles) > options.limit_metrics:
                E.info(f"stopping collection as {len(infiles)} reached")
                break

    E.info("found {} potential benchmark.info files to upload".format(
        len(infiles)))

    if options.output_filename_metrics:
        with IOTools.open_file(options.output_filename_metrics, "w") as outf:
            outf.write("\n".join(infiles) + "\n")

    # find all files of interest
    oldwd = os.getcwd()
    os.chdir(data_dir)

    upload_result(infiles, "results.commit", PARAMS)

    os.chdir(oldwd)

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-o", "--output-section", dest="output", type=str,
                        choices=("full", "name"),
                        help="output either ``full`` overlapping entries, "
                        "or only the ``name``s.")

    parser.set_defaults(
        output="full",
    )

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    if len(unknown) != 2:
        raise ValueError("two arguments required")

    if unknown[0] == "-":
        infile1 = args.stdin
    else:
        infile1 = iotools.open_file(unknown[0], "r")

    infile2 = iotools.open_file(unknown[1], "r")

    idx = Bed.readAndIndex(infile2, with_values=True)

    output = args.output
    outfile = args.stdout

    if output == "name":
        outfile.write("name1\tname2\n")
        outf = lambda x: x.fields[0]
    else:
        outf = str

    for bed in Bed.iterator(infile1):
        try:
            overlaps = idx[bed.contig].find(bed.start, bed.end)
        except (KeyError, IndexError):
            # ignore missing contig and zero length intervals
            continue

        for o in overlaps:
            outfile.write("\t".join((outf(bed), outf(o[2]))) + "\n")

    E.stop()
def FilterFreqCols(infile, thresh, fcols):
    '''
    Returns a dictionary of the allele frequencies parsed from each of the
    columns specified in fcols (comma-separated column names), plus a set
    of line indices for lines where both called alleles have a frequency
    of at least thresh in at least one of these columns.
    No information ("." or "NA") - assigned allele frequency of -1.
    '''
    fcols = fcols.split(",")
    # read the column headings from the variant table
    cols = iotools.open_file(infile).readline().strip().split("\t")

    # store allele frequency columns
    AFdict = dict()
    # store indices of lines passing the frequency threshold in each column
    nD = dict()

    for col in fcols:
        ind = cols.index(col)
        GT_i = cols.index('GT')
        n = 0
        nlist = set()
        AFS = []
        with iotools.open_file(infile) as input:
            for line in input:
                if n > 1:
                    line = line.strip().split("\t")
                    GT = line[GT_i].replace(".", "0").split("/")
                    af = line[ind].split(",")
                    AF = []
                    # where the allele frequency is not numeric
                    # ("." or "NA") use -1 to indicate no data
                    for a in af:
                        try:
                            AF.append(float(a))
                        except ValueError:
                            AF.append(float(-1))
                    AF2 = [l if l > 0 else 0 for l in AF]
                    AF = np.array(AF)
                    AF = np.insert(AF, 0, 1 - sum(AF2))
                    GT[0] = int(GT[0])
                    GT[1] = int(GT[1])
                    # If the variant is not in the database the column shows
                    # "." but the site may still have been called as multi
                    # allelic - use -1 for all frequencies in this case
                    if max(GT[0], GT[1]) > (len(AF) - 1):
                        AF = [float(-1)] * (max(GT[0], GT[1]) + 1)
                    AF1 = AF[GT[0]]
                    AF2 = AF[GT[1]]
                    if AF1 >= thresh and AF2 >= thresh:
                        nlist.add(n)
                        AFS.append((AF1, AF2))
                    else:
                        AFS.append(('NA', 'NA'))
                n += 1
        AFdict[col] = AFS
        nD[col] = nlist

    ns = set.union(*list(nD.values()))
    return AFdict, ns
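# A minimal usage sketch (not from the original source). The filename and the
# ANNOVAR-style frequency column names are hypothetical; the input is assumed
# to be a tab-separated variant table containing a GT column.
def _example_FilterFreqCols_usage():
    # per-column allele frequencies plus the indices of lines where both
    # called alleles reach 1% frequency in at least one of the columns
    AFdict, common_lines = FilterFreqCols("annotated_variants.tsv",
                                          thresh=0.01,
                                          fcols="1000g2015aug_all,ExAC_ALL")
    return AFdict, common_lines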
def extend_bed(infile, outfile):
    inf = iotools.open_file(infile)
    outf = iotools.open_file(outfile, "w")
    replace(inf, outf)
    outf.close()
def pileup_to_quasar(infile, outfile):

    import collections
    prev_line = None
    line_buffer = list()
    outf = iotools.open_file(outfile, "w")
    fates = collections.Counter()

    for line in iotools.open_file(infile):
        fields = line.strip().split("\t")

        if not fields[3].upper() == fields[8].upper():
            fates["error in bases"] += 1
            continue

        if not (int(PARAMS["min_depth"]) <= int(fields[4]) <=
                int(PARAMS["max_depth"])):
            fates["bad read coverage"] += 1
            continue

        filt_alleles = re.sub(r'[a-zA-z\.\, ]\$', '', fields[5])
        filt_alleles = re.sub(r'\^..', '', filt_alleles)
        if len(filt_alleles) == 0:
            fates["read ends only"] += 1
            continue

        alleles = re.sub(r'[\.\, ]', fields[3], fields[5])
        alleles = alleles.upper()
        ref = fields[3].upper()
        alt = fields[9].upper()
        ref_count = alleles.count(ref)
        alt_count = alleles.count(alt)

        outline = "\t".join([
            fields[0], fields[1], fields[2], ref, alt, fields[7], fields[10],
            str(ref_count), str(alt_count),
            str(int(fields[4]) - ref_count - alt_count)])

        if (fields[0], fields[1]) == prev_line:
            line_buffer.append(outline)
        else:
            if len(line_buffer) == 1:
                outf.write(line_buffer[0] + "\n")
                fates["output"] += 1
            else:
                fates["duplicate lines"] += 1
            line_buffer = [outline]
            prev_line = (fields[0], fields[1])

    outf.close()

    log = iotools.open_file(outfile + ".log", "w")
    for key in fates.keys():
        log.write("\t".join((key, str(fates[key]))) + "\n")
    log.close()
def run(self, infile, outfile, params):

    if "reference_fasta" in params._fields:
        reference_fasta = "REFERENCE_SEQUENCE={}".format(
            params.reference_fasta)
    else:
        reference_fasta = ""

    # command can fail when no output is produced, but still produce output
    # 12G is required for java overhead
    retval = P.run("java -Xmx8000m -jar {params.path} "
                   "CollectMultipleMetrics "
                   "{reference_fasta} "
                   "INPUT={infile} "
                   "TMP_DIR=%(tmpdir)s "
                   "{params.options} "
                   "OUTPUT={outfile} "
                   ">& {outfile} ".format(**locals()),
                   job_memory="12G",
                   ignore_errors=True)

    def get_section(section, data):
        pattern = "## {}".format(section)
        keep = False
        result = []
        for line in data:
            if line.startswith("##"):
                if line.startswith(pattern):
                    keep = True
                else:
                    keep = False
            if keep:
                result.append(line)
        return result

    for tablename in self.tablenames:
        filename = re.sub("histogram", "metrics", tablename)
        raw = filename[len("picard_"):]
        src = outfile + "." + raw
        dest = outfile + "." + tablename + ".tsv"

        if not os.path.exists(src):
            E.warn("no file {}, ignored".format(src))
            continue

        with IOTools.open_file(src) as inf:
            data = inf.readlines()

        if tablename.endswith("metrics"):
            data = get_section("METRICS", data)
        elif tablename.endswith("histogram"):
            data = get_section("HISTOGRAM", data)

        with IOTools.open_file(dest, "w") as outf:
            outf.write("".join(data))

    return retval
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-a", "--first-fastq-file", dest="fastq1", type=str,
        help="supply read1 fastq file")
    parser.add_argument(
        "-b", "--second-fastq-file", dest="fastq2", type=str,
        help="supply read2 fastq file")

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    if unknown and len(unknown) == 2:
        args.fastq1, args.fastq2 = unknown

    fastq1 = iotools.open_file(args.fastq1)
    fastq2 = iotools.open_file(args.fastq2)

    E.info("iterating over fastq files")
    f1_count = 0
    for f1, f2 in zip_longest(Fastq.iterate(fastq1),
                              Fastq.iterate(fastq2)):
        if not (f1 and f2) or (not f2 and f1):
            try:
                raise PairedReadError(
                    "unpaired reads detected. Are files sorted? are "
                    "files of equal length?")
            except PairedReadError as e:
                raise PairedReadError(e).with_traceback(sys.exc_info()[2])
        else:
            assert f1.identifier.endswith("/1") and \
                f2.identifier.endswith("/2"), \
                "Reads in file 1 must end with /1 and reads in file 2 with /2"
            args.stdout.write(
                ">%s\n%s\n>%s\n%s\n" %
                (f1.identifier, f1.seq, f2.identifier, f2.seq))
            f1_count += 1

    E.info("output: %i pairs" % f1_count)

    # write footer and output benchmark information.
    E.stop()
def test_touch_file_updates_existing_file(self):
    with iotools.open_file(self.filename, "w") as outf:
        outf.write("some data\n")
    created = os.stat(self.filename).st_mtime
    time.sleep(1)
    iotools.touch_file(self.filename)
    modified = os.stat(self.filename).st_mtime
    self.assertGreater(modified, created)
    with iotools.open_file(self.filename) as inf:
        data = inf.read()
    self.assertEqual(data, "some data\n")
def filterDamage(infile, damagestr, outfiles):
    '''
    Filter out variants which have not been assessed as damaging by any of
    the specified tools.  Tools and thresholds can be specified in the
    pipeline.yml.

    Does not account for multiple alt alleles - if any ALT allele has been
    assessed as damaging with any tool the variant is kept, regardless of
    whether this is the allele called in the sample.
    '''
    damaging = damagestr.split(",")
    cols = iotools.open_file(infile).readline().strip().split("\t")
    D = dict()

    # parses the "damage string" from the pipeline.yml
    # this should be formatted as COLUMN|result1-result2-...,COLUMN|result1...
    # where variants with any of these results in this column will
    # be retained
    for d in damaging:
        d = d.split("|")
        col = d[0]
        res = d[1].split("-")
        i = cols.index(col)
        D[col] = ((res, i))

    x = 0
    out = iotools.open_file(outfiles[0], "w")
    out2 = iotools.open_file(outfiles[1], "w")
    with iotools.open_file(infile) as input:
        for line in input:
            if x > 1:
                # grep for specific strings within this column of this
                # line of the input file
                line = line.strip().split("\t")
                isdamaging = 0
                for key in D:
                    res, i = D[key]
                    current = line[i]
                    for r in res:
                        if re.search(r, current):
                            isdamaging = 1
                if isdamaging == 1:
                    out.write("%s\n" % "\t".join(line))
                else:
                    out2.write("%s\n" % "\t".join(line))
            else:
                out.write(line)
            x += 1

    out.close()
    out2.close()
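# A minimal usage sketch (not from the original source), illustrating the
# damage-string format described in the comments above. The column names
# ("SIFT_pred", "Polyphen2_HDIV_pred") and filenames are hypothetical.
def _example_filterDamage_usage():
    # keep variants called "D" by SIFT, or "D" or "P" by PolyPhen-2
    damagestr = "SIFT_pred|D,Polyphen2_HDIV_pred|D-P"
    # damaging variants go to the first output file, the rest to the second
    filterDamage("annotated_variants.tsv", damagestr,
                 ["variants_damaging.tsv", "variants_filtered_out.tsv"])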
def runRegexMotifSearch(infiles, outfile):
    '''run a regular expression search on sequences.
    compute counts.
    '''

    motif = "[AG]G[GT]T[CG]A"
    reverse_motif = "T[GC]A[CA]C[TC]"

    controlfile, dbfile = infiles
    if not os.path.exists(controlfile):
        raise ValueError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    motifs = []
    for x in range(0, 15):
        motifs.append(
            ("DR%i" % x, re.compile(motif + "." * x + motif, re.IGNORECASE)))
    for x in range(0, 15):
        motifs.append(
            ("ER%i" % x,
             re.compile(motif + "." * x + reverse_motif, re.IGNORECASE)))

    db_positions = Motifs.countMotifs(iotools.open_file(dbfile, "r"), motifs)
    control_positions = Motifs.countMotifs(
        iotools.open_file(controlfile, "r"), motifs)

    db_counts, control_counts = Motifs.getCounts(
        db_positions), Motifs.getCounts(control_positions)
    db_seqcounts, control_seqcounts = Motifs.getOccurances(
        db_positions), Motifs.getOccurances(control_positions)

    ndb, ncontrol = len(db_positions), len(control_positions)

    outf = iotools.open_file(outfile, "w")
    outf.write(
        "motif\tmotifs_db\tmotifs_control\tseq_db\tseq_db_percent\t"
        "seq_control\tseq_control_percent\tfold\n")

    for motif, pattern in motifs:
        try:
            fold = float(db_seqcounts[motif]) * \
                ncontrol / (ndb * control_seqcounts[motif])
        except ZeroDivisionError:
            fold = 0

        outf.write(
            "%s\t%i\t%i\t%i\t%s\t%i\t%s\t%5.2f\n" %
            (motif,
             db_counts[motif], control_counts[motif],
             db_seqcounts[motif],
             iotools.pretty_percent(db_seqcounts[motif], ndb),
             control_seqcounts[motif],
             iotools.pretty_percent(control_seqcounts[motif], ncontrol),
             fold))

    outf.close()
def loadManualAnnotations(infile, outfile):

    tmp = P.get_temp_filename(".")

    annotation = P.snip(infile, "_annotations.tsv")

    with iotools.open_file(tmp, "w") as outf:
        outf.write("%s\tgene_id\n" % annotation)
        with iotools.open_file(infile, "r") as inf:
            for line in inf:
                outf.write("%s\t%s" % (annotation, line))

    P.load(tmp, outfile, options="--add-index=gene_id")
    os.unlink(tmp)
def GenotypeSNPs(infile, snplist, outfile):
    '''
    Fetches the genotype from the variant tables for all samples for SNPs
    in the hapmap sample from makeRandomSNPSet.

    Complex sites are ignored (as simple SNPs are sufficient for these
    calculations).  These are:
        Sites which failed QC (column 3 in the variant table is not PASS)
        Sites with more than 2 alleles defined (column 6 in the variant
        table contains more than one alternative allele)
        SNPs with more than one ID
        Indels
    '''
    out = iotools.open_file(outfile, "w")
    with iotools.open_file(infile) as inf:
        for line in inf:
            line = line.strip().split()
            # if the variant passed QC
            if line[4] == "PASS":
                genotype = line[7]
                # if the genotype looks normal e.g. 1/1
                if len(genotype) == 3:
                    # get the actual genotype (rather than the index)
                    if genotype[0] != ".":
                        ind1 = int(genotype[0])
                    else:
                        ind1 = 0
                    if genotype[2] != ".":
                        ind2 = int(genotype[2])
                    else:
                        ind2 = 0

                    A1 = line[5]
                    A2 = line[6].split(",")
                    AS = [A1] + A2

                    if len(AS) <= 2:
                        GT = "%s%s" % (AS[ind1], AS[ind2])
                        refGT = "%s%s" % (A1, A1)

                        if len(GT) == 2:
                            if line[3][0:2] == "rs" and len(
                                    line[3].split(";")) == 1:
                                snpid = line[3]
                                chrom = line[0]
                                pos = line[1]
                                if snpid in snplist:
                                    out.write(
                                        "%s\t%s\t%s\t%s\t%s\n" %
                                        (snpid, chrom, pos, GT, refGT))
    out.close()
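# A minimal usage sketch (not from the original source). Filenames are
# hypothetical; snplist is expected to be a collection of rs identifiers,
# e.g. the SNP set produced by makeRandomSNPSet.
def _example_GenotypeSNPs_usage():
    snplist = set(line.strip()
                  for line in iotools.open_file("hapmap_snps.txt"))
    GenotypeSNPs("sample_variant_table.tsv", snplist, "sample_genotypes.tsv")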
def generate_bedfile(infile, outfile):
    '''Convert to a bed file with intervals extended 50 bp either side of
    the summit.'''

    infile = iotools.open_file(infile)
    outfile = iotools.open_file(outfile, "w")

    for line in infile:
        chrom, start, end, peak, value = line.strip().split("\t")
        start = int(start) - 50
        end = int(end) + 50
        outfile.write("%s\t%s\t%s\t%s\t%s\n" %
                      (chrom, start, end, peak, value))

    outfile.close()
def clean(files, logfile):
    '''clean up files given by glob expressions.

    Files are cleaned up by zapping, i.e. the files are set to size 0.
    Links to files are replaced with place-holders.

    Information about the original file is written to `logfile`.

    Arguments
    ---------
    files : list
        List of glob expressions of files to clean up.
    logfile : string
        Filename of logfile.
    '''
    fields = ('st_atime', 'st_blksize', 'st_blocks',
              'st_ctime', 'st_dev', 'st_gid', 'st_ino',
              'st_mode', 'st_mtime', 'st_nlink',
              'st_rdev', 'st_size', 'st_uid')

    dry_run = get_params().get("dryrun", False)

    if not dry_run:
        if not os.path.exists(logfile):
            outfile = iotools.open_file(logfile, "w")
            outfile.write("filename\tzapped\tlinkdest\t%s\n" %
                          "\t".join(fields))
        else:
            outfile = iotools.open_file(logfile, "a")

    c = E.Counter()
    for fn in files:
        c.files += 1
        if not dry_run:
            stat, linkdest = iotools.zap_file(fn)
            if stat is not None:
                c.zapped += 1
                if linkdest is not None:
                    c.links += 1
                outfile.write("%s\t%s\t%s\t%s\n" % (
                    fn,
                    time.asctime(time.localtime(time.time())),
                    linkdest,
                    "\t".join([str(getattr(stat, x)) for x in fields])))

    get_logger().info("zapped: %s" % (c))

    if not dry_run:
        outfile.close()

    return c
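# A minimal usage sketch (not from the original source): zap the BAM files of
# a finished pipeline step and record their original metadata in a log file.
# The glob pattern and log filename are hypothetical, and behaviour depends on
# the pipeline "dryrun" parameter read via get_params().
def _example_clean_usage():
    import glob
    counts = clean(glob.glob("mapping.dir/*.bam"), "zap.log")
    get_logger().info("cleaned: %s" % counts)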
def process_remote(infile):

    repository, acc = iotools.open_file(infile).readlines()[0].strip().split()

    if repository == "ENCODE":
        location, filetype = get_encode_file(acc)
    elif repository == "URL":
        location = acc
        if acc.endswith("gz"):
            filetype = ".".join(acc.split(".")[-2:])
        else:
            filetype = acc.split(".")[-1]
    else:
        raise ValueError("repository %s not yet supported" % repository)

    tmpfile = P.get_temp_filename(shared=False, suffix="." + filetype)

    preamble = "wget %(location)s -O %(tmpfile)s --quiet &&"
    postamble = "&& rm %(tmpfile)s"

    if filetype == "bam":
        preamble += "samtools index %(tmpfile)s && "
        postamble += " && rm %(tmpfile)s.bai "
    elif filetype == "bed.gz":
        tmp2 = P.get_temp_filename(shared=False)
        preamble += '''   zcat %(tmpfile)s
                        | sort -k1,1 -k2,2n
                        | bgzip > %(tmp2)s &&
                        mv %(tmp2)s %(tmpfile)s &&
                        tabix -p bed %(tmpfile)s && '''
        postamble += "&& rm %(tmpfile)s.tbi"

    return preamble % locals(), postamble % locals(), tmpfile, filetype
def parseMutectCallStats(infile, outfile):
    '''take the call stats outfile from mutect and summarise the reasons
    for variant rejection'''

    single_dict = collections.defaultdict(int)
    combinations_dict = collections.defaultdict(int)

    with iotools.open_file(infile, "r") as infile:
        lines = infile.readlines()
        for i, line in enumerate(lines):
            if i < 2:
                continue
            values = line.strip().split("\t")
            judgement, justification = (values[-1], values[-2])
            if judgement == "REJECT":
                reasons = justification.split(",")
                if len(reasons) == 1:
                    single_dict[reasons[0]] += 1
                else:
                    for reason in reasons:
                        combinations_dict[reason] += 1

    df = pd.DataFrame([single_dict, combinations_dict])
    df = df.transpose()
    df.columns = ["single", "combination"]
    df = df.sort_values("single", ascending=False)
    df.index.name = "justification"
    df.to_csv(outfile, header=True, index=True, sep="\t")
def buildMisprimingLib(infiles, outfile):
    '''
    build fasta file of sequences to check for mispriming
    '''
    fasta, identifiers = infiles
    inf = IOTools.open_file(fasta)

    E.info("reading ids for sequences to keep")
    ids = readIdentifiers(identifiers)

    outf = IOTools.open_file(outfile, "w")

    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.open_file(fasta)):
        if f.title not in ids:
            outf.write(">%s\n%s\n" % (f.title, f.sequence))

    outf.close()
def buildOptimalPrimerSet(infiles, outfile):
    '''
    build a set of optimal primer pairs across sequences
    '''
    outf = IOTools.open_file(outfile, "w")
    outf.write("name\tforward_seq\tforward_gc (%)\tforward_tm\t"
               "forward_length (bp)\treverse_seq\treverse_gc (%)\t"
               "reverse_tm\treverse_length (bp)\tfragment_length (bp)\n")

    for infile in infiles:
        primerset = PrimerSet()
        name = primerset.readName(infile)
        size = primerset.readSize(infile)
        forward = primerset.readForward(infile)
        E.info(forward)
        reverse = primerset.readReverse(infile)
        primerset = primerset.parse(attributes=[name, size] +
                                    list(forward) + list(reverse))
        outf.write("\t".join([primerset.name,
                              primerset.forwardseq,
                              primerset.forwardgc,
                              primerset.forwardtm,
                              primerset.forwardlength,
                              primerset.reverseseq,
                              primerset.reversegc,
                              primerset.reversetm,
                              primerset.reverselength,
                              primerset.size]) + "\n")

    outf.close()
def getCpGIslandsFromUCSC(dbhandle, outfile):
    '''get CpG islands from UCSC database and save as a :term:`bed`
    formatted file.

    The name column in the bed file will be set to the UCSC name.

    Arguments
    ---------
    dbhandle : object
        Database handle to UCSC mysql database
    outfile : string
        Filename of output file in :term:`bed` format.
    '''
    table = "cpgIslandExt"
    sql = """SELECT chrom, chromStart, chromEnd, name
    FROM %(table)s ORDER by chrom, chromStart"""
    sql = sql % locals()

    E.debug("executing sql statement: %s" % sql)
    try:
        cc = dbhandle.execute(sql)
        outfile = iotools.open_file(outfile, "w")
        for data in cc.fetchall():
            outfile.write("\t".join(map(str, data)) + "\n")
        outfile.close()
    except Exception:
        E.warn("Failed to connect to table %s. %s is empty" %
               (table, outfile))
        iotools.touch_file(outfile)
def run(self, infile, outfile, params):

    if params.reference_fasta_map is None:
        raise ValueError("bam2reference requires a reference sequence map")

    reference_fasta_map = build_reference_fasta_map(
        params.reference_fasta_map)

    fasta = resolve_argument(
        list(reference_fasta_map.values()), ",").split(",")
    retval, diff = get_reference_for_bam(infile, fasta)

    if retval is None:
        if diff is None:
            retval = "corrupted"
        else:
            retval = "unknown"
            E.debug("differences: {}".format(str(diff)))
        path = ""
    else:
        map_path2name = dict([(x[1], x[0])
                              for x in list(reference_fasta_map.items())])
        path = map_path2name.get(retval, os.path.basename(retval))

    with IOTools.open_file(outfile, "w") as outf:
        outf.write("filename\treference\tpath\n")
        outf.write("\t".join((infile, retval, path)) + "\n")

    return None
def createOpen(self, mode="w", header=None):
    """open a new output file, checking first whether its directory exists.
    """

    self.nchunk += 1
    filename = self.output_filename_pattern % self.nchunk

    if self.dry_run:
        E.info("opening file %s" % filename)
        return iotools.open_file("/dev/null", mode)

    if mode in ("w", "a"):
        dirname = os.path.dirname(filename)
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)

    if os.path.exists(filename):
        existed = True
    else:
        existed = False

    f = iotools.open_file(filename, mode)

    if header and not existed:
        f.write(header + "\n")

    return f
def split_gtf_by_category(infiles, outfiles, catname):

    catfile, gtffile = infiles
    categories = pd.read_csv(catfile, index_col=0, squeeze=True, sep="\t")

    # create output filepool
    outpool = iotools.FilePool("{}_%s.gtf.gz".format(catname), force=True)

    gtffile = iotools.open_file(gtffile)
    for gtfline in gtf.iterator(gtffile):

        try:
            transcript_id = gtfline.transcript_id
        except AttributeError:
            transcript_id = None

        try:
            gene_id = gtfline.gene_id
        except AttributeError:
            gene_id = None

        if transcript_id in categories.index:
            outpool.write(categories[transcript_id], str(gtfline) + "\n")
        elif gene_id in categories.index:
            outpool.write(categories[gene_id], str(gtfline) + "\n")

    outpool.close()
def runGLAM2SCAN(infiles, outfile):
    '''run glam2scan on all intervals and motifs.
    '''
    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles
    controlfile = dbfile[:-len(".fasta")] + ".controlfasta"
    if not os.path.exists(controlfile):
        raise ValueError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    if os.path.exists(outfile):
        os.remove(outfile)

    for motiffile in motiffiles:
        of = iotools.open_file(outfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s ::\n" % motif)
        of.close()

        statement = '''
        cat %(dbfile)s %(controlfile)s
        | %(execglam2scan)s -2 -n %(glam2scan_results)i n %(motiffile)s -
        >> %(outfile)s
        '''
        P.run(statement)
def read_table(filename, options):
    '''read table and filter as an iterator.
    '''
    if os.path.exists(filename):
        lines = iotools.open_file(filename, "r")
    else:
        lines = (x for x in [])

    # extract table by regular expression
    enumerated_lines = enumerate(lines)
    if options.regex_start:
        rx = re.compile(options.regex_start)
        for n, line in enumerated_lines:
            if rx.search(line):
                E.info("reading table from line %i" % n)
                if not line.startswith("#") and line.strip():
                    yield line
                break
        else:
            E.info("start regex not found - no table")

    if options.regex_end:
        rx = re.compile(options.regex_end)

    for n, line in enumerated_lines:
        if options.regex_end and rx.search(line):
            break
        if not line.startswith("#") and line.strip():
            yield line
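# A minimal usage sketch (not from the original source): extract a table that
# starts at a "## METRICS" marker and ends at the first blank line. The
# filename and patterns are hypothetical; read_table only inspects the
# regex_start and regex_end attributes of the options object.
def _example_read_table_usage():
    import argparse
    opts = argparse.Namespace(regex_start=r"^## METRICS", regex_end=r"^$")
    for line in read_table("metrics_output.txt", opts):
        print(line, end="")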
def run(self, infiles, outfile, params):

    def _link(infile, outfile):
        if os.path.exists(os.path.abspath(outfile)):
            return

        dirname = os.path.dirname(outfile)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        os.symlink(infile, os.path.abspath(outfile))

    rx = re.compile(params.regex)

    outfiles = []
    for infile in infiles:
        outpath = os.path.join(
            os.path.dirname(outfile),
            rx.search(infile).expand(params.pattern_out))

        for suffix in self.suffixes:
            for fn in glob.glob(infile + suffix):
                _link(fn, outpath + suffix)

        _link(os.path.abspath(infile), outpath)
        outfiles.append(outpath)

    with IOTools.open_file(outfile, "w") as outf:
        outf.write("\n".join(outfiles) + "\n")
def getGeneTable(reffile):

    E.info("Loading reference")
    table = defaultdict(dict)
    for ens_gene in GTF.gene_iterator(GTF.iterator(
            IOTools.open_file(reffile))):
        geneid = ens_gene[0][0].gene_id
        table[geneid]["models"] = dict()
        table[geneid]["start_codons"] = defaultdict(list)

        for transcript in ens_gene:

            transcript_id = transcript[0].transcript_id
            table[geneid]["models"][transcript_id] = transcript

            CDS = GTF.asRanges(transcript, "start_codon")
            if len(CDS) == 0:
                continue

            if transcript[0].strand == "-":
                start_codon = max(e[1] for e in CDS)
            else:
                start_codon = min(e[0] for e in CDS)

            table[geneid]["start_codons"][start_codon].append(transcript_id)

    E.info("Reference Loaded")
    return table