Example #1
def annotate(infile, outfile, geneset):
    '''
    annotate NOGs into functional categories
    '''
    annotation = {}
    E.info("loading geneset")
    anno = IOTools.openFile(geneset)
    for line in anno.readlines():
        data = line[:-1].split("\t")
        nog, funccat = data[1], data[3]
        annotation[nog] = funccat
    E.info("finished loading gene set")

    E.info("annotating infile")
    inf = IOTools.openFile(infile)
    header = inf.readline()
    outf = IOTools.openFile(outfile, "w")
    outf.write(header[:-1] + "\ttaxa\n")
    for line in inf.readlines():
        data = line[:-1].split("\t")
        nog = data[0]
        try:
            pathway = annotation[nog]
        except KeyError:
            pathway = "Function unknown"
        outf.write(line[:-1] + "\t" + pathway + "\n")
    outf.close()
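Every snippet on this page goes through IOTools.openFile, which in the CGAT code base opens a file for reading or writing and handles gzip transparently when the name ends in ".gz" (the docstring of openOutputFile further down says the same). A minimal usage sketch for the annotate function above; the file names are hypothetical and follow the layout the function expects (NOG id in column 1 of the counts table, NOG id in column 2 and functional category in column 4 of the gene set):

# A minimal sketch, assuming the CGAT modules are installed and the
# hypothetical input files exist; ".gz" names are read and written compressed.
annotate(infile="nog_counts.tsv.gz",             # NOG id in column 1
         outfile="nog_counts.annotated.tsv.gz",  # original rows plus a category column
         geneset="eggnog_members.tsv.gz")        # NOG-to-category mapping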
Example #2
def filterGTF(gtf, filterstring, tempout):

    if "!=" in filterstring:
        column, value = filterstring.split("!=")
        value = value.split("+")
        filtertype = "notin"

    elif "=" in filterstring:
        column, value = filterstring.split("=")
        value = value.split("+")
        filtertype = "in"

    elif "-in_file-" in filterstring:
        column, value = filterstring.split("-in_file-")
        value = [line.strip() for line in IOTools.openFile(value)]
        filtertype = "in_file"

    elif "-notin_file-" in filterstring:
        column, value = filterstring.split("-notin_file-")
        value = [line.strip() for line in IOTools.openFile(value)]
        filtertype = "notin_file"

    elif "-morethan-" in filterstring:
        column, value = filterstring.split("-morethan-")
        value = float(value)
        filtertype = "morethan"

    elif "-lessthan-" in filterstring:
        column, value = filterstring.split("-lessthan-")
        value = float(value)
        filtertype = "lessthan"

    gfile = IOTools.openFile(gtf)
    G = GTF.iterator(gfile)

    out = IOTools.openFile(tempout, "w")
    for item in G:
        D = item.asDict()
        D['contig'] = item.contig
        D['source'] = item.source
        D['feature'] = item.feature
        D['start'] = item.start
        D['end'] = item.end
        D['strand'] = item.strand
        D['frame'] = item.frame

        if filtertype == "in" or filtertype == 'in_file':
            if D[column] in value:
                out.write("%s\n" % str(item))
        elif filtertype == "notin" or filtertype == 'notin_file':
            if D[column] not in value:
                out.write("%s\n" % str(item))
        elif filtertype == "morethan":
            if float(D[column]) > value:
                out.write("%s\n" % str(item))
        elif filtertype == "lessthan":
            if float(D[column]) < value:
                out.write("%s\n" % str(item))
    out.close()
    gfile.close()
Example #3
def chunk_iterator_lines(infile, args, prefix, use_header=False):
    """split by lines."""

    chunk_size = args[0]
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = IOTools.openFile(filename, "w")
    header = None

    for line in infile:
        if line[0] == "#":
            continue

        if not header and n == 0 and use_header:
            header = line
            outfile.write(header)
            continue

        n += 1

        if n % chunk_size == 0:
            outfile.close()
            yield filename
            filename = "%s/%010i.in" % (prefix, n)
            outfile = IOTools.openFile(filename, "w")
            if header:
                outfile.write(header)

        outfile.write(line)
    outfile.close()
    yield filename
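A short usage sketch for the splitter above; the directory, input file and chunk size are invented for illustration, and the CGAT.IOTools import path assumes the older CGAT layout. args is a one-element list holding the number of lines per chunk:

import os
import CGAT.IOTools as IOTools   # import path assumed from the older CGAT layout

os.makedirs("chunks", exist_ok=True)              # hypothetical output directory
with IOTools.openFile("input.tsv.gz") as infile:  # hypothetical input file
    for chunk in chunk_iterator_lines(infile, args=[100000],
                                      prefix="chunks", use_header=True):
        print("wrote", chunk)   # e.g. chunks/0000000000.in, chunks/0000100000.in, ...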
Example #4
def chunk_iterator_regex_split(infile, args, prefix, use_header=False):
    """split where regular expression is true.
    """

    rex = args[0]
    chunk_size = args[2]
    max_lines = args[3]

    nlines = 0
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = IOTools.openFile(filename, "w")

    for line in infile:

        if line[0] == "#":
            continue
        if rex.search(line[:-1]):
            if n > 0 and (n % chunk_size == 0 or
                          (max_lines and nlines > max_lines)):
                outfile.close()
                yield filename
                filename = "%s/%010i.in" % (prefix, n)
                outfile = IOTools.openFile(filename, "w")
                nlines = 0

            n += 1

        outfile.write(line)
        nlines += 1

    outfile.close()
    yield filename
Example #5
    def __call__(self, track, slice=None):

        if slice == "transcript":
            lengths_transcripts = []
            for transcript in GTF.transcript_iterator(
                    GTF.iterator(IOTools.openFile(self.getFilename(track)))):
                length = sum([gtf.end - gtf.start for gtf in transcript])
                lengths_transcripts.append(length)
            counts, lower, dx, _ = scipy.stats.cumfreq(
                lengths_transcripts, numbins=40, defaultreallimits=(0, 20000))
            x = np.arange(counts.size) * dx + lower
            return odict((("length", x), ("cumulative frequency",
                                          counts / len(lengths_transcripts))))

        elif slice == "gene":
            lengths_genes = []
            for gene in GTF.flat_gene_iterator(
                    GTF.iterator(IOTools.openFile(self.getFilename(track)))):
                length = sum([gtf.end - gtf.start for gtf in gene])
                lengths_genes.append(length)
            counts, lower, dx, _ = scipy.stats.cumfreq(
                lengths_genes, numbins=40, defaultreallimits=(0, 20000))
            x = np.arange(counts.size) * dx + lower
            return odict((("length", x), ("cumulative frequency",
                                          counts / len(lengths_genes))))
Example #6
def convertGo2Goslim(options):
    """read gene list with GO assignments and convert to GO slim
    categories."""

    E.info("reading GO assignments from stdin")
    gene2gos, go2infos = ReadGene2GOFromFile(options.stdin)
    input_genes, input_goids = countGOs(gene2gos)

    #############################################################
    # read GO ontology from file
    assert options.filename_ontology, "please supply a GO ontology"
    E.info("reading ontology from %s" % (options.filename_ontology))

    infile = IOTools.openFile(options.filename_ontology)
    ontology = readOntology(infile)
    infile.close()

    go2infos = collections.defaultdict(dict)
    # substitute go2infos
    for go in list(ontology.values()):
        go2infos[go.mNameSpace][go.mId] = GOInfo(go.mId,
                                                 go_type=go.mNameSpace,
                                                 description=go.mName)

    E.info("reading GO assignments from %s" % options.filename_slims)
    go_slims = GetGOSlims(IOTools.openFile(options.filename_slims, "r"))

    if options.loglevel >= 1:
        v = set()
        for x in list(go_slims.values()):
            for xx in x:
                v.add(xx)
        E.info("read go slims from %s: go=%i, slim=%i" %
               (options.filename_slims, len(go_slims), len(v)))

    output_goids, output_genes = set(), set()
    noutput = 0
    options.stdout.write("\t".join(("go_type", "gene_id", "go_id",
                                    "description", "evidence")) + "\n")

    for category, gene2go in sorted(gene2gos.items()):
        gene2go = MapGO2Slims(gene2go, go_slims, ontology)
        for gene_id, values in sorted(gene2go.items()):
            output_genes.add(gene_id)
            for go in sorted(values, key=lambda x: x.mGOId):
                output_goids.add(go.mGOId)
                options.stdout.write("%s\t%s\t%s\t%s\t%s\n" % (
                    go.mGOType,
                    gene_id,
                    go.mGOId,
                    go.mDescription,
                    "NA",
                ))
                noutput += 1

    E.info(("ninput_genes=%i, ninput_goids=%i, noutput_gene=%i, "
            "noutput_goids=%i, noutput=%i") %
           (len(input_genes), len(input_goids), len(output_genes),
            len(output_goids), noutput))
Example #7
File: CPC.py    Project: kevinrue/cgat-flow
    def __call__(self, track, slice=None):

        c_transcript = []
        c_gene = []
        for transcript in GTF.transcript_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            c_transcript.append(len(transcript))
        for gene in GTF.flat_gene_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            c_gene.append(len(gene))

        return odict((("transcript", np.mean(c_transcript)), ("gene", np.mean(c_gene))))
Example #8
def makeCpgIslandsBed(outfile):
    infile = PARAMS["methylation_summary_cpgislands"]
    out = IOTools.openFile(outfile, "w")
    with IOTools.openFile(infile, "r") as f:
        for line in f.readlines():
            # this assumes location of req. values
            contig, start, end = line.split()[1:4]
            if not contig == "chrom":
                out.write("%s\t%s\t%s\n" % (contig, start, end))
    out.close()
Example #9
def make1basedCpgIslands(infile, outfile):

    # outfile, loadfile = outfiles

    out = IOTools.openFile(outfile, "w")
    out.write("%s\t%s\t%s\n" % ("contig", "position", "cpgi"))

    with IOTools.openFile(infile, "r") as f:
        lines = f.readlines()
        for line in lines:
            contig, start, stop = line.split()
            for position in [x for x in range(int(start), int(stop) + 2)]:
                out.write("%s\t%s\t%s\n" % (contig, position, "CpGIsland"))
    out.close()
Example #10
def clean(files, logfile):
    '''clean up files given by glob expressions.

    Files are cleaned up by zapping, i.e. the files are set to size
    0. Links to files are replaced with place-holders.

    Information about the original file is written to `logfile`.

    Arguments
    ---------
    files : list
        List of glob expressions of files to clean up.
    logfile : string
        Filename of logfile.

    '''
    fields = ('st_atime', 'st_blksize', 'st_blocks', 'st_ctime', 'st_dev',
              'st_gid', 'st_ino', 'st_mode', 'st_mtime', 'st_nlink', 'st_rdev',
              'st_size', 'st_uid')

    dry_run = PARAMS.get("dryrun", False)

    if not dry_run:
        if not os.path.exists(logfile):
            outfile = IOTools.openFile(logfile, "w")
            outfile.write("filename\tzapped\tlinkdest\t%s\n" %
                          "\t".join(fields))
        else:
            outfile = IOTools.openFile(logfile, "a")

    c = E.Counter()
    for fn in files:
        c.files += 1
        if not dry_run:
            stat, linkdest = IOTools.zapFile(fn)
            if stat is not None:
                c.zapped += 1
                if linkdest is not None:
                    c.links += 1
                outfile.write(
                    "%s\t%s\t%s\t%s\n" %
                    (fn, time.asctime(time.localtime(time.time())), linkdest,
                     "\t".join([str(getattr(stat, x)) for x in fields])))

    E.info("zapped: %s" % (c))
    if not dry_run:
        outfile.close()

    return c
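For orientation, "zapping" in the docstring above means truncating a file to zero bytes while keeping its name, and replacing links with empty placeholders; the file's original stat record is what ends up in the logfile. A hand-rolled sketch of that idea, not the actual IOTools.zapFile implementation:

import os

def zap_file_sketch(path):
    # Rough illustration only; the pipeline relies on IOTools.zapFile instead.
    original_stat = os.stat(path)
    link_destination = os.readlink(path) if os.path.islink(path) else None
    if link_destination is not None:
        os.unlink(path)              # drop the link ...
        open(path, "w").close()      # ... and leave an empty placeholder file
    else:
        with open(path, "r+") as handle:
            handle.truncate(0)       # keep the file, discard its contents
    return original_stat, link_destination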
Example #11
def readDefinitions(filename):
    '''read definitions from a :term:`yaml` file.'''
    with IOTools.openFile(filename) as f:
        config = yaml.load(f)
        if config is None:
            raise IOError("could not read data from '%s'" % filename)
    return config
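A hedged example of the input readDefinitions accepts: any YAML document that parses to a non-empty value works, while an empty file triggers the IOError above. The file name and keys are invented:

# hypothetical definitions.yml:
#
#   genome: hg38
#   tools:
#     bwa:
#       threads: 8
#
config = readDefinitions("definitions.yml")
print(config["genome"])          # -> hg38
print(config["tools"]["bwa"])    # -> {'threads': 8}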
Example #12
def openOutputFile(section, mode="w"):
    """open file for writing substituting section in the
    output_pattern (if defined).

    If the filename ends with ".gz", the output is opened
    as a gzip'ed file.

    Arguments
    ---------
    section : string
       section will replace any %s in the pattern for output files.

    mode : char
       file opening mode

    Returns
    -------
    File
        an opened file
    """

    fn = getOutputFile(section)
    try:
        if fn == "-":
            return global_options.stdout
        else:
            if not global_options.output_force and os.path.exists(fn):
                raise OSError(
                    ("file %s already exists, use --force-output to "
                     "overwrite existing files.") % fn)
            return IOTools.openFile(fn, mode)
    except AttributeError:
        return global_options.stdout
Example #13
File: CPC.py    Project: kevinrue/cgat-flow
    def __call__(self, track, slice=None):

        classes = ["antisense", "antisense_upstream", "antisense_downstream", "sense_upstream",
                   "sense_downstream", "intergenic", "sense_intronic", "antisense_intronic"]

        coding_set = {}
        for gtf in GTF.iterator(IOTools.openFile("gtfs/lncrna_filtered.class.gtf.gz")):
            coding_set[gtf.transcript_id] = gtf.source

        result = {"noncoding": {}, "coding": collections.defaultdict(int)}
        total_nc = float(self.getValue(
            "SELECT COUNT(*) FROM %(track)s_cpc_result WHERE C_NC = 'noncoding'"))
        for c in classes:
            result["noncoding"][c] = (float(self.getValue("""SELECT COUNT(*) FROM lncrna_final_class as a, %s_cpc_result as b WHERE a.class = '%s' 
                                                              AND b.C_NC = 'noncoding' 
                                                              AND a.transcript_id = b.transcript_id""" % (track, c))) / total_nc) * 100

        total_c = len(list(coding_set.keys()))
        for c in classes:
            ids = self.getValues(
                "SELECT transcript_id FROM %(track)s_cpc_result WHERE C_NC = 'coding'")
            for i in ids:
                if i in list(coding_set.keys()):
                    if coding_set[i] == c:
                        result["coding"][c] += 1

        for x, y in result["coding"].items():
            result["coding"][x] = (float(y) / total_c) * 100

        return result
Example #14
def buildTrueTaxonomicRelativeAbundances(infiles, outfile):
    '''
    get species level relative abundances for the simulated
    data. This involves creating maps between different identifiers
    from the NCBI taxonomy. This is so that the results are comparable
    to species level analysis from metaphlan
    '''
    levels = ["species", "genus", "family", "order", "class", "phylum"]
    taxa = open(infiles[1])
    header = taxa.readline()
    gi2taxa = collections.defaultdict(list)
    for line in taxa.readlines():
        data = line[:-1].split("\t")
        gi, strain, species, genus, family, order, _class, phylum = data[
            0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]
        gi2taxa[gi] = (species, genus, family, order, _class, phylum)

    outf = open(outfile, "w")
    outf.write("level\ttaxa\trelab\n")
    for i in range(len(levels)):
        total = 0
        result = collections.defaultdict(int)
        for fastq in Fastq.iterate(IOTools.openFile(infiles[0])):
            total += 1
            gi = fastq.identifier.split("|")[1]
            result[gi2taxa[gi][i]] += 1
        for taxa, value in result.items():
            outf.write("%s\t%s\t%s\n" %
                       (levels[i], taxa, float(value) / total))
    outf.close()
Example #15
    def __call__(self, filenames, outfile, options):

        for fi, fn in filenames:
            E.debug("# merging %s" % fn)
            infile = IOTools.openFile(fn, "r")

            if options.output_header:
                self.parseHeader(infile, outfile, options)

            for l in infile:
                nfields = l.count("\t")

                if l[0] == "#":
                    options.stdlog.write(l)
                elif self.nfields is not None and nfields != self.nfields:
                    # validate number of fields in row, raise warning
                    # for those not matching and skip.
                    E.warn(
                        "# line %s has unexpected number of fields: %i != %i" %
                        (l[:-1], nfields, self.nfields))
                else:
                    if self.mFieldIndex is not None:
                        data = l[:-1].split("\t")
                        try:
                            data[self.mFieldIndex] = self.mMapper(
                                fi, data[self.mFieldIndex])
                        except IndexError:
                            raise IndexError("can not find field %i in %s" %
                                             (self.mFieldIndex, l))
                        l = "\t".join(data) + "\n"

                    outfile.write(l)
            infile.close()
Example #16
def getMappedReads(infile):
    '''return number of reads mapped. '''
    for lines in IOTools.openFile(infile, "r"):
        data = lines[:-1].split("\t")
        if data[1].startswith("without duplicates"):
            return int(data[0])
    return
Example #17
def buildSimpleNormalizedBAM(infiles, outfile, nreads):
    '''normalize a bam file to given number of counts
       by random sampling
    '''
    infile, countfile = infiles

    pysam_in = pysam.Samfile(infile, "rb")

    fh = IOTools.openFile(countfile, "r")
    readcount = int(fh.read())
    fh.close()

    threshold = float(nreads) / float(readcount)

    pysam_out = pysam.Samfile(outfile, "wb", template=pysam_in)

    # iterate over mapped reads thinning by the threshold
    ninput, noutput = 0, 0
    for read in pysam_in.fetch():
        ninput += 1
        if random.random() <= threshold:
            pysam_out.write(read)
            noutput += 1

    pysam_in.close()
    pysam_out.close()
    pysam.index(outfile)

    E.info("buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" %
           (ninput, noutput, 100.0 * noutput / ninput, nreads))
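The thinning above keeps each read independently with probability nreads / readcount, so the output contains approximately, not exactly, nreads reads. A tiny self-contained sketch of the same idea with made-up numbers:

import random

readcount = 1000000          # total mapped reads, as read from the count file
nreads = 250000              # requested library size
threshold = float(nreads) / float(readcount)

kept = sum(1 for _ in range(readcount) if random.random() <= threshold)
# `kept` is close to nreads but varies between runs: each read is an
# independent Bernoulli trial with success probability `threshold`.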
Example #18
def exportPeaksAsBed(infile, outfile):
    '''export peaks as bed files.'''

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    if infile.endswith("_macs.load"):
        track = infile[:-len("_macs.load")]
    else:
        track = infile[:-len("_intervals.load")]

    if track.startswith("control"):
        return

    peakwidth = PARAMS["peakwidth"]

    cc = dbhandle.cursor()
    statement = '''SELECT contig, peakcenter - %(peakwidth)i, peakcenter + %(peakwidth)i,
                          interval_id, peakval FROM %(track)s_intervals ORDER by contig, start''' % locals()
    cc.execute(statement)

    outs = IOTools.openFile(outfile, "w")

    for result in cc:
        contig, start, end, interval_id, peakval = result
        # peakval is truncated at a 1000 as this is the maximum permitted
        # score in a bed file.
        peakval = int(min(peakval, 1000))
        outs.write("%s\t%i\t%i\t%s\t%i\n" %
                   (contig, start, end, str(interval_id), peakval))

    cc.close()
    outs.close()
Example #19
def renameTranscriptsInPreviousSets(infile, outfile):
    '''
    transcripts need to be renamed because they may use the same
    cufflinks identifiers as we use in the analysis - don't do if they
    have an ensembl id - sort by transcript
    '''
    inf = IOTools.openFile(infile)
    for gtf in GTF.iterator(inf):
        if gtf.gene_id.find("ENSG") != -1:
            statement = '''zcat %(infile)s | grep -v "#"
                        | cgat gtf2gtf
                        --method=sort --sort-order=gene
                        --log=%(outfile)s.log
                        | gzip > %(outfile)s'''
        else:
            gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
            transcript_pattern = gene_pattern.replace("GEN", "TRAN")
            statement = '''
            zcat %(infile)s | cgat gtf2gtf
            --method=renumber-genes
            --pattern-identifier=%(gene_pattern)s%%i
            | cgat gtf2gtf
            --method=renumber-transcripts
            --pattern-identifier=%(transcript_pattern)s%%i
            | cgat gtf2gtf
            --method=sort --sort-order=gene
            --log=%(outfile)s.log
            | gzip > %(outfile)s'''

    P.run()
Example #20
def buildGeneOntology(infile, outfile):
    '''create an output file akin to GO ontology files to be
    used with GO.py
    '''

    table = P.toTable(infile)
    columns = ("cpg", "tata")
    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile(outfile, "w")
    outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    i = 1
    for c in columns:
        cc.execute("SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s" %
                   locals())
        outf.write("".join([
            "promotor\t%s\tGO:%07i\twith_%s\tNA\n" % (x[0], i, c) for x in cc
        ]))
        i += 1
        cc.execute("SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s = 0" %
                   locals())
        outf.write("".join([
            "promotor\t%s\tGO:%07i\twithout_%s\tNA\n" % (x[0], i, c)
            for x in cc
        ]))
        i += 1

    outf.close()
Example #21
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    dir2files = {}
    for root, directory, files in os.walk("."):
        dir2files[root] = files

    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H:%M:%S')
    filename = "CWD_%s" % st
    E.info("outputting directory state to %s" % filename)
    with IOTools.openFile(filename, "w") as outf:
        outf.write("##contents of cwd on %s\n\n" % st)
        for directory, files in dir2files.items():
            for file in files:
                path = os.path.join(directory, file)
                outf.write(path + "\n")

    # write footer and output benchmark information.
    E.Stop()
Example #22
def ReadGeneLists(filename_genes, gene_pattern=None):
    """read gene lists from filename in matrix.

    returns a tuple (list of all genes, dictionary of gene lists) 
    """

    if filename_genes == "-":
        infile = sys.stdin
    else:
        infile = IOTools.openFile(filename_genes, "r")

    headers, table = CSV.readTable(infile.readlines(), as_rows=False)

    if filename_genes != "-":
        infile.close()

    all_genes = table[0]

    # if there is only a single column, add a dummy column
    if len(table) == 1:
        table.append([1] * len(table[0]))
        headers.append("foreground")

    E.info("read %i genes from %s" % (len(all_genes), filename_genes))

    if gene_pattern:
        rx = re.compile(gene_pattern)
        all_genes = [rx.search(x).groups()[0] for x in all_genes]

    gene_lists = collections.OrderedDict()
    for header, col in zip(headers[1:], table[1:]):
        s = list(set([x for x, y in zip(all_genes, col) if y != "0"]))
        gene_lists[header] = set(s)

    return all_genes, gene_lists
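A sketch of the matrix format ReadGeneLists parses: gene identifiers in the first column, one 0/1 membership column per gene list, read from a file or from stdin when the filename is "-". The file name and identifiers below are invented:

# hypothetical gene_lists.tsv:
#
#   gene_id     upregulated   downregulated
#   ENSG0001    1             0
#   ENSG0002    0             1
#   ENSG0003    1             1
#
all_genes, gene_lists = ReadGeneLists("gene_lists.tsv")
# all_genes holds the first column (all gene ids); gene_lists maps each
# remaining header to the set of gene ids whose entry is not "0".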
Example #23
def filterByCoverage(infiles, outfile):

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database_name"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile),
                                                ".coverage.load"):
            statement = """SELECT contig_id ave FROM
                           (SELECT contig_id, AVG(coverage) as ave FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename,
                                                PARAMS["coverage_filter"])
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    print(contigs)
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
Example #24
    def __call__(self, filenames, outfile, options):
        for fi, fn in filenames:
            infile = IOTools.openFile(fn, "r")
            outfile.write(
                "######### logging output for %s ###################\n" % fi)
            for l in infile:
                outfile.write(l)
            infile.close()
Example #25
    def __call__(self, track, slice=None):

        if slice == "transcript":
            lengths_transcripts = []
            for transcript in GTF.transcript_iterator(
                    GTF.iterator(IOTools.openFile(self.getFilename(track)))):
                length = sum([gtf.end - gtf.start for gtf in transcript])
                lengths_transcripts.append(length)
            return np.mean(lengths_transcripts)

        elif slice == "gene":
            lengths_genes = []
            for gene in GTF.flat_gene_iterator(
                    GTF.iterator(IOTools.openFile(self.getFilename(track)))):
                length = sum([gtf.end - gtf.start for gtf in gene])
                lengths_genes.append(length)
            return np.mean(lengths_genes)
Example #26
def summarizePeaksForPooledPseudoreplicates(infiles, outfile):
    outf = IOTools.openFile(outfile, "w")
    outf.write("Sample_id\t"
               "Experiment\t"
               "Tissue\t"
               "Condition\t"
               "Pseudoreplicate\t"
               "n_peaks\n")
    IDR.countPeaks(infiles, outf)
Example #27
    def __call__(self, track, slice=None):

        transcript_counts = collections.defaultdict(set)
        counts = []
        for gtf in GTF.iterator(IOTools.openFile(self.getFilename(track))):
            transcript_counts[gtf.gene_id].add(gtf.transcript_id)
        for gene, transcripts in transcript_counts.items():
            counts.append(len(transcripts))
        return counts
Example #28
def writeContigSizes(genome, outfile):
    '''write contig sizes to outfile for UCSC tools.
    '''

    outf = IOTools.openFile(outfile, "w")
    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], genome))
    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        outf.write("%s\t%i\n" % (contig, size))
    outf.close()
Example #29
def chunk_iterator_regex_group(infile, args, prefix, use_header=False):
    """group by regular expression is true.

    Entries need to be consecutive.
    """

    rex = args[0]
    column = args[1]
    chunk_size = args[2]
    last = None
    header = None
    n = chunk_size
    outfile = None
    filename = None

    for line in infile:

        if line[0] == "#":
            continue

        if not header and use_header:
            header = line
            continue

        try:
            this = rex.search(line[:-1]).groups()[0]
        except IndexError:
            if outfile:
                outfile.write(line)
            continue
        except AttributeError:
            if outfile:
                outfile.write(line)
            continue

        if last != this and n >= chunk_size:
            if last:
                outfile.close()
                yield filename

            last = this

            filename = "%s/%s.in" % (prefix, this)
            outfile = IOTools.openFile(filename, "w")
            if header:
                outfile.write(header)
            n = 0

        outfile.write(line)
        n += 1

    if outfile:
        outfile.close()
        yield filename
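A hedged sketch of driving the grouped splitter above; the regular expression, chunk size and file names are illustrative (args[1], the column index, is accepted but never used by the function). With a chunk size of 1, a new output file is started whenever the captured key changes:

import os
import re
import CGAT.IOTools as IOTools   # import path assumed from the older CGAT layout

os.makedirs("by_contig", exist_ok=True)          # hypothetical output directory
args = [re.compile(r"^(\S+)\t"),                 # group key: first tab-separated field
        0,                                       # column index (unused above)
        1]                                       # rotate as soon as the key changes
with IOTools.openFile("sorted_by_contig.tsv.gz") as infile:  # hypothetical, pre-sorted by key
    for chunk in chunk_iterator_regex_group(infile, args, prefix="by_contig"):
        print("wrote", chunk)                    # e.g. by_contig/chr1.in, by_contig/chr2.in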
Example #30
def makeExpressionSummaryPlots(counts_inf, design_inf, logfile):
    ''' use the plotting methods for Counts object to make summary plots'''

    with IOTools.openFile(logfile, "w") as log:

        plot_prefix = P.snip(logfile, ".log")

        # need to manually read in data as index column is not the first column
        counts = Counts.Counts(pd.read_table(counts_inf, sep="\t"))
        counts.table.set_index(["transcript_id"], inplace=True)

        design = Expression.ExperimentalDesign(design_inf)

        # make certain counts table only include samples in design
        counts.restrict(design)

        cor_outfile = plot_prefix + "_pairwise_correlations.png"
        pca_var_outfile = plot_prefix + "_pca_variance.png"
        pca1_outfile = plot_prefix + "_pc1_pc2.png"
        pca2_outfile = plot_prefix + "_pc3_pc4.png"
        heatmap_outfile = plot_prefix + "_heatmap.png"

        counts_log10 = counts.log(base=10, pseudocount=0.1, inplace=False)

        counts_highExp = counts_log10.clone()
        counts_highExp.table['order'] = counts_highExp.table.apply(np.mean,
                                                                   axis=1)
        counts_highExp.table.sort(["order"], ascending=0, inplace=True)
        counts_highExp.table = counts_highExp.table.iloc[0:500, :]
        counts_highExp.table.drop("order", axis=1, inplace=True)

        log.write("plot correlations: %s\n" % cor_outfile)
        counts_log10.plotPairwiseCorrelations(cor_outfile, subset=1000)

        log.write("plot pc3,pc4: %s\n" % pca1_outfile)
        counts_log10.plotPCA(design,
                             pca_var_outfile,
                             pca1_outfile,
                             x_axis="PC1",
                             y_axis="PC2",
                             colour="group",
                             shape="group")

        log.write("plot pc3,pc4: %s\n" % pca2_outfile)
        counts_log10.plotPCA(design,
                             pca_var_outfile,
                             pca2_outfile,
                             x_axis="PC3",
                             y_axis="PC4",
                             colour="group",
                             shape="group")

        log.write("plot heatmap: %s\n" % heatmap_outfile)
        counts_highExp.heatmap(heatmap_outfile)