Example #1
def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    Bioprospector is run on only the top 10% of peaks.
    '''

    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.get_temp_filename(".")
    track = outfile[:-len(".bioprospector")]
    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=True,
        masker="dust",
        proportion=P.get_params()["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        IOTools.touch_file(outfile)
    else:
        statement = '''
        BioProspector -i %(tmpfasta)s %(bioprospector_options)s -o %(outfile)s > %(outfile)s.log
        '''
        P.run(statement)

    os.unlink(tmpfasta)
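The pattern above (write sequences to a temporary FASTA, skip the tool but touch an empty placeholder output when there is nothing to run on, and always clean up the temporary file) can be sketched with the standard library alone. This is a minimal, hypothetical sketch rather than the pipeline's actual helpers:

import os
import tempfile
from pathlib import Path

def write_temp_fasta_and_run(sequences, outfile):
    """Write (name, sequence) pairs to a temporary FASTA; if there are none,
    just touch an empty placeholder output so downstream tasks still run."""
    fd, tmpfasta = tempfile.mkstemp(suffix=".fa", dir=".")
    try:
        with os.fdopen(fd, "w") as outf:
            for name, seq in sequences:
                outf.write(">%s\n%s\n" % (name, seq))
        if not sequences:
            Path(outfile).touch()
        else:
            # a real task would shell out to BioProspector here
            Path(outfile).write_text("searched %i sequences\n" % len(sequences))
    finally:
        os.unlink(tmpfasta)

write_temp_fasta_and_run([], "track1.bioprospector")  # creates an empty file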
Example #2
    def _output(section, subsection, valuef, dtype):

        # fold change matrix
        matrix, row_headers = buildMatrix(results, valuef=valuef, dtype=dtype)

        outfile = getFileName(options,
                              go=test_ontology,
                              section=section,
                              set='%s_all' % subsection)

        IOTools.writeMatrix(outfile,
                            matrix,
                            row_headers,
                            col_headers,
                            row_header="category")

        outfile = getFileName(options,
                              go=test_ontology,
                              section=section,
                              set='%s_alldesc' % subsection)

        IOTools.writeMatrix(
            outfile,
            matrix,
            ["%s:%s" % (x, go2info[x].mDescription) for x in row_headers],
            col_headers,
            row_header="category")
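IOTools.writeMatrix comes from cgat-core; as a rough illustration of the call shape used twice above (a matrix plus row labels, column labels and a name for the row-label column), here is a hedged sketch using the csv module. The real function's output layout may differ, and the file name and values are made up:

import csv

def write_matrix(outfile, matrix, row_headers, col_headers, row_header="category"):
    # one labelled row per entry in row_headers, tab-separated
    with open(outfile, "w", newline="") as handle:
        writer = csv.writer(handle, delimiter="\t")
        writer.writerow([row_header] + list(col_headers))
        for label, row in zip(row_headers, matrix):
            writer.writerow([label] + list(row))

write_matrix("go_fold_change.tsv",
             [[1.5, 0.2], [0.8, 2.1]],
             ["GO:0000001", "GO:0000002"],
             ["setA", "setB"])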
Example #3
def buildPicardInsertSizeStats(infile, outfile, genome_file):
    '''run Picard:CollectInsertSizeMetrics

    Collect insert size statistics.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''

    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    statement = '''picard %(picard_opts)s CollectInsertSizeMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''

    P.run(statement)
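The statement is a template: P.run interpolates the %(name)s placeholders from the caller's variables and the pipeline parameters before submitting the command. A minimal sketch of that substitution with plain string formatting, using made-up file names (P.run itself also handles cluster submission and logging):

infile = "sample1.bam"
outfile = "sample1.insert_size_metrics"
genome_file = "hg38.fa"
picard_opts = "-Xmx8g"

statement = '''picard %(picard_opts)s CollectInsertSizeMetrics
INPUT=%(infile)s
REFERENCE_SEQUENCE=%(genome_file)s
ASSUME_SORTED=true
OUTPUT=%(outfile)s
VALIDATION_STRINGENCY=SILENT
>& %(outfile)s'''

# P.run would normally do this interpolation before submitting the job
print(statement % locals())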
Example #4
def buildPicardRnaSeqMetrics(infiles, strand, outfile):
    '''run picard:CollectRnaSeqMetrics

    Collect RNA-seq metrics.

    Arguments
    ---------
    infiles : tuple
        Input filename in :term:`BAM` format and genome file in
        refflat format
        (http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat).
    strand : string
        Strand specificity passed to Picard's STRAND option.
    outfile : string
        Output filename with picard output.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3
    infile, genome = infiles

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    statement = '''picard %(picard_opts)s CollectRnaSeqMetrics
    REF_FLAT=%(genome)s
    INPUT=%(infile)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    STRAND=%(strand)s
    VALIDATION_STRINGENCY=SILENT
    '''
    P.run(statement)
Example #5
def chunk_iterator_lines(infile, args, prefix, use_header=False):
    """split by lines."""

    chunk_size = args[0]
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = IOTools.open_file(filename, "w")
    header = None

    for line in infile:
        if line[0] == "#":
            continue

        if not header and n == 0 and use_header:
            header = line
            outfile.write(header)
            continue

        n += 1

        if n % chunk_size == 0:
            outfile.close()
            yield filename
            filename = "%s/%010i.in" % (prefix, n)
            outfile = IOTools.open_file(filename, "w")
            if header:
                outfile.write(header)

        outfile.write(line)
    outfile.close()
    yield filename
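A self-contained sketch of the same chunking idea using plain open() instead of IOTools.open_file; it writes at most chunk_size non-comment lines per file and yields each finished filename (header handling omitted):

import os

def chunk_lines(lines, chunk_size, prefix):
    """Yield filenames, each containing at most chunk_size non-comment lines."""
    os.makedirs(prefix, exist_ok=True)
    n, outfile, filename = 0, None, None
    for line in lines:
        if line.startswith("#"):
            continue
        if outfile is None:
            filename = os.path.join(prefix, "%010i.in" % n)
            outfile = open(filename, "w")
        outfile.write(line)
        n += 1
        if n % chunk_size == 0:
            outfile.close()
            yield filename
            outfile = None
    if outfile is not None:
        outfile.close()
        yield filename

# hypothetical usage
for chunk in chunk_lines(["a\n", "b\n", "#skip\n", "c\n"], 2, "chunks.dir"):
    print(chunk)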
Example #6
def buildPicardCoverageStats(infile, outfile, baits, regions):
    '''run picard:CollectHSMetrics

    Generate coverage statistics for regions of interest from a bed
    file using Picard.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    baits : :term:`bed` formatted file of bait regions
    regions : :term:`bed` formatted file of target regions

    '''

    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    statement = '''picard %(picard_opts)s CollectHsMetrics
    BAIT_INTERVALS=%(baits)s
    TARGET_INTERVALS=%(regions)s
    INPUT=%(infile)s
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=LENIENT''' % locals()
    P.run(statement)
Example #7
def chunk_iterator_regex_split(infile, args, prefix, use_header=False):
    """split where regular expression is true.
    """

    rex = args[0]
    chunk_size = args[2]
    max_lines = args[3]

    nlines = 0
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = IOTools.open_file(filename, "w")

    for line in infile:

        if line[0] == "#":
            continue
        if rex.search(line[:-1]):
            if n > 0 and (n % chunk_size == 0 or
                          (max_lines and nlines > max_lines)):
                outfile.close()
                yield filename
                filename = "%s/%010i.in" % (prefix, n)
                outfile = IOTools.open_file(filename, "w")
                nlines = 0

            n += 1

        outfile.write(line)
        nlines += 1

    outfile.close()
    yield filename
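A hypothetical invocation of the splitter above, assuming it and cgat-core's IOTools are importable: args is read as (compiled regex, unused, chunk_size, max_lines), and the input file name and limits here are invented:

import os
import re

os.makedirs("chunks.dir", exist_ok=True)
# start a new chunk at every FASTA header, after at most 500 records
# or 100000 lines per chunk
args = (re.compile(r"^>"), None, 500, 100000)

with open("sequences.fa") as infile:
    for filename in chunk_iterator_regex_split(infile, args, prefix="chunks.dir"):
        print("wrote", filename)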
Example #8
def annotate(infile, outfile, geneset):
    '''
    annotate NOGs into functional categories
    '''
    annotation = {}
    E.info("loading geneset")
    anno = IOTools.openFile(geneset)
    for line in anno.readlines():
        data = line[:-1].split("\t")
        nog, funccat = data[1], data[3]
        annotation[nog] = funccat
    E.info("finished loading gene set")

    E.info("annotating infile")
    inf = IOTools.openFile(infile)
    header = inf.readline()
    outf = IOTools.openFile(outfile, "w")
    outf.write(header[:-1] + "\ttaxa\n")
    for line in inf.readlines():
        data = line[:-1].split("\t")
        nog = data[0]
        try:
            pathway = annotation[nog]
        except KeyError:
            pathway = "Function unknown"
        outf.write(line[:-1] + "\t" + pathway + "\n")
    outf.close()
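The function above is a straightforward mapping-file join: build a dict from one tab-separated file and use it to append a column to another. A standalone, hedged sketch with the standard library only; file names and column positions are assumptions:

def build_annotation_map(geneset_path, key_col=1, value_col=3):
    """Map NOG identifiers to functional categories from a tab-separated geneset."""
    mapping = {}
    with open(geneset_path) as handle:
        for line in handle:
            fields = line.rstrip("\n").split("\t")
            mapping[fields[key_col]] = fields[value_col]
    return mapping

# hypothetical usage: append a category column keyed on the first field
annotation = build_annotation_map("eggnog_members.tsv")
with open("nog_counts.tsv") as inf, open("nog_counts_annotated.tsv", "w") as outf:
    outf.write(inf.readline().rstrip("\n") + "\tfunction\n")
    for line in inf:
        nog = line.split("\t", 1)[0]
        outf.write(line.rstrip("\n") + "\t" +
                   annotation.get(nog, "Function unknown") + "\n")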
Example #9
def CleanVariantTables(genes, variants, cols, outfile):
    variants = pd.read_csv(variants, sep="\t")
    variants = variants.drop(0)

    vp1 = copy.copy(
        variants[['CHROM', 'POS', 'QUAL', 'ID', 'REF1', 'ALT', 'GT']])
    alleles = vp1['REF1'].str.cat(vp1['ALT'].str.strip(),
                                  sep=",").str.split(",")

    # "." marks a missing call; treat it as reference allele index 0
    vp1['GT'] = vp1['GT'].str.replace(".", "0", regex=False)
    inds1 = vp1['GT'].str.get(0).astype(int).values
    inds2 = vp1['GT'].str.get(-1).astype(int).values
    x = 0
    a1s = []
    a2s = []
    gts = []
    homhet = []
    for allele in alleles:
        i1 = int(inds1[x])
        i2 = int(inds2[x])
        a1 = allele[i1]
        a2 = allele[i2]
        a1s.append(a1)
        a2s.append(a2)
        if a1 == a2:
            homhet.append("HOM")
        else:
            homhet.append("HET")
        gts.append("%s%s" % (a1, a2))
        x += 1
    vp1['HOMHET'] = homhet
    vp1['Allele1'] = a1s
    vp1['Allele2'] = a2s
    vp1['Genotype'] = gts
    vp1 = vp1.drop(['REF1', 'ALT', 'GT'], axis=1)
    vp1[cols] = copy.copy(variants[cols])

    Ls = []
    for gene in [
            line.strip() for line in IOTools.open_file(genes[0]).readlines()
    ]:
        cp = []
        with IOTools.open_file(genes[1]) as infile:
            for line in infile:
                r = re.search(gene, line)
                if r:
                    line = line.strip().split("\t")
                    chrom = line[0]
                    pos = line[1]
                    cp.append("%s_%s" % (chrom, pos))
        cp = set(cp)
        for c in cp:
            Ls.append((gene, c.split("_")))
    df = pd.DataFrame(Ls)
    df['CHROM'] = df[1].str.get(0)
    df['POS'] = df[1].str.get(1)
    df = df.drop(1, axis=1)
    df.columns = ['gene', 'CHROM', 'POS']
    variants = vp1.merge(df, 'left')
    variants.to_csv(outfile, sep="\t")
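The core of the genotype decoding above is indexing each site's allele list (REF plus the comma-separated ALTs) with the two digits of the GT field. A toy pandas example of just that step, with invented values:

import pandas as pd

df = pd.DataFrame({
    "REF1": ["A", "G"],
    "ALT": ["T", "C,T"],
    "GT": ["0/1", "2/2"],
})

alleles = df["REF1"].str.cat(df["ALT"], sep=",").str.split(",")
i1 = df["GT"].str.get(0).astype(int)
i2 = df["GT"].str.get(-1).astype(int)

df["Allele1"] = [a[i] for a, i in zip(alleles, i1)]
df["Allele2"] = [a[i] for a, i in zip(alleles, i2)]
df["Genotype"] = df["Allele1"] + df["Allele2"]
df["HOMHET"] = (df["Allele1"] == df["Allele2"]).map({True: "HOM", False: "HET"})
print(df[["Genotype", "HOMHET"]])   # row 0: AT/HET, row 1: TT/HOM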
Example #10
def loadGO(infile, outfile, tablename):
    """import GO results into individual tables.

    This method concatenates all the results from
    a GO analysis and uploads into a single table.

    """

    indir = infile + ".dir"

    if not os.path.exists(indir):
        IOTools.touch_file(outfile)
        return

    load_statement = P.build_load_statement(tablename=tablename,
                                            options="--allow-empty-file "
                                            "--add-index=category "
                                            "--add-index=goid ")

    statement = '''
    python %(toolsdir)s/cat_tables.py %(indir)s/*.overall
    | %(load_statement)s
    > %(outfile)s
    '''
    P.run(statement)
Example #11
def convertGo2Goslim(options):
    """read gene list with GO assignments and convert to GO slim
    categories."""

    E.info("reading GO assignments from stdin")
    gene2gos, go2infos = ReadGene2GOFromFile(options.stdin)
    input_genes, input_goids = countGOs(gene2gos)

    #############################################################
    # read GO ontology from file
    assert options.filename_ontology, "please supply a GO ontology"
    E.info("reading ontology from %s" % (options.filename_ontology))

    infile = IOTools.openFile(options.filename_ontology)
    ontology = readOntology(infile)
    infile.close()

    go2infos = collections.defaultdict(dict)
    # substitute go2infos
    for go in list(ontology.values()):
        go2infos[go.mNameSpace][go.mId] = GOInfo(go.mId,
                                                 go_type=go.mNameSpace,
                                                 description=go.mName)

    E.info("reading GO assignments from %s" % options.filename_slims)
    go_slims = GetGOSlims(IOTools.openFile(options.filename_slims, "r"))

    if options.loglevel >= 1:
        v = set()
        for x in list(go_slims.values()):
            for xx in x:
                v.add(xx)
        E.info("read go slims from %s: go=%i, slim=%i" %
               (options.filename_slims, len(go_slims), len(v)))

    output_goids, output_genes = set(), set()
    noutput = 0
    options.stdout.write("\t".join(("go_type", "gene_id", "go_id",
                                    "description", "evidence")) + "\n")

    for category, gene2go in sorted(gene2gos.items()):
        gene2go = MapGO2Slims(gene2go, go_slims, ontology)
        for gene_id, values in sorted(gene2go.items()):
            output_genes.add(gene_id)
            for go in sorted(values, key=lambda x: x.mGOId):
                output_goids.add(go.mGOId)
                options.stdout.write("%s\t%s\t%s\t%s\t%s\n" % (
                    go.mGOType,
                    gene_id,
                    go.mGOId,
                    go.mDescription,
                    "NA",
                ))
                noutput += 1

    E.info(("ninput_genes=%i, ninput_goids=%i, noutput_gene=%i, "
            "noutput_goids=%i, noutput=%i") %
           (len(input_genes), len(input_goids), len(output_genes),
            len(output_goids), noutput))
Example #12
def FilterFreqCols(infile, thresh, fcols):
    '''
    Parses the allele frequency columns listed in fcols and returns a tuple:
    a dictionary mapping each column to the per-line allele frequencies of
    the called genotype, and the set of line indices for which both called
    alleles have a frequency of at least thresh in at least one of the
    columns.
    Where there is no information ("." or "NA"), an allele frequency of -1
    is assigned.
    '''
    fcols = fcols.split(",")
    # read the column headings from the variant table
    cols = IOTools.open_file(infile).readline().strip().split("\t")
    # store allele frequency columns
    AFdict = dict()
    # store low frequency indices
    nD = dict()
    for col in fcols:
        ind = cols.index(col)
        GT_i = cols.index('GT')
        n = 0
        nlist = set()
        AFS = []
        with IOTools.open_file(infile) as input:
            for line in input:
                if n > 1:
                    line = line.strip().split("\t")
                    GT = line[GT_i].replace(".", "0").split("/")
                    af = line[ind].split(",")
                    AF = []
                    # where the allele frequency is not numeric
                    # "." or "NA" use -1 to indicate no data
                    for a in af:
                        try:
                            AF.append(float(a))
                        except ValueError:
                            AF.append(float(-1))
                    AF2 = [l if l > 0 else 0 for l in AF]
                    AF = np.array(AF)
                    AF = np.insert(AF, 0, 1 - sum(AF2))
                    GT[0] = int(GT[0])
                    GT[1] = int(GT[1])
                    # If the variant is not in database the column shows "."
                    # but the site
                    # may still have been called as multi allelic
                    # - use -1 for all frequencies
                    # in this case
                    if max(GT[0], GT[1]) > (len(AF) - 1):
                        AF = [float(-1)] * (max(GT[0], GT[1]) + 1)
                    AF1 = AF[GT[0]]
                    AF2 = AF[GT[1]]
                    if AF1 >= thresh and AF2 >= thresh:
                        nlist.add(n)
                    AFS.append((AF1, AF2))
                else:
                    AFS.append(('NA', 'NA'))
                n += 1
        AFdict[col] = AFS
        nD[col] = nlist

    ns = set.union(*list(nD.values()))
    return AFdict, ns
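The frequency columns only store ALT allele frequencies, so the code prepends the implied reference frequency before indexing with the genotype. A small worked example of that step with numpy, using made-up numbers:

import numpy as np

# The table stores the frequencies of the ALT alleles only, e.g. "0.03,0.10".
# The reference allele frequency is whatever remains, so it is prepended at
# index 0; genotype indices (0 = REF, 1 = first ALT, ...) then address the
# vector directly.
alt_freqs = np.array([0.03, 0.10])
freqs = np.insert(alt_freqs, 0, 1 - alt_freqs.sum())   # [0.87, 0.03, 0.10]

GT = (0, 2)                     # a REF / second-ALT call
AF1, AF2 = freqs[GT[0]], freqs[GT[1]]
print(AF1, AF2)                 # 0.87 0.1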
Example #13
def config_to_dictionary(config):
    """convert the contents of a :py:class:`ConfigParser.ConfigParser`
    object to a dictionary

    This method works by iterating over all configuration values in a
    :py:class:`ConfigParser.ConfigParser` object and inserting values
    into a dictionary. Section names are prefixed using an underscore.
    Thus::

        [sample]
        name=12

    is entered as ``sample_name=12`` into the dictionary. The sections
    ``general`` and ``DEFAULT`` are treated specially in that both
    the prefixed and the unprefixed values are inserted: ::

       [general]
       genome=hg19

    will be added as ``general_genome=hg19`` and ``genome=hg19``.

    Numbers will be automatically recognized as such and converted into
    integers or floats.

    Returns
    -------
    config : dict
        A dictionary of configuration values

    """
    p = defaultdict(lambda: defaultdict(TriggeredDefaultFactory()))
    for section in config.sections():
        for key, value in config.items(section):
            try:
                v = IOTools.str2val(value)
            except TypeError:
                E.error("error converting key %s, value %s" % (key, value))
                E.error("Possible multiple concurrent attempts to "
                        "read configuration")
                raise

            p["%s_%s" % (section, key)] = v

            # IMS: new hierarchical format
            try:
                p[section][key] = v
            except TypeError:
                # fails with things like genome_dir=abc
                # if [genome] does not exist.
                continue

            if section in ("general", "DEFAULT"):
                p["%s" % (key)] = v

    for key, value in config.defaults().items():
        p["%s" % (key)] = IOTools.str2val(value)

    return p
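Leaving aside the hierarchical TriggeredDefaultFactory machinery and the IOTools.str2val type conversion, the flattening behaviour described in the docstring can be sketched with configparser alone; this is an illustrative approximation, not the cgat-core implementation:

import configparser

def flatten_config(config):
    """Flatten a ConfigParser into {section_key: value}, also inserting
    unprefixed keys for the general/DEFAULT sections."""
    result = {}
    for section in config.sections():
        for key, value in config.items(section):
            result["%s_%s" % (section, key)] = value
            if section in ("general", "DEFAULT"):
                result[key] = value
    for key, value in config.defaults().items():
        result[key] = value
    return result

cfg = configparser.ConfigParser()
cfg.read_string("[general]\ngenome=hg19\n[sample]\nname=12\n")
print(flatten_config(cfg))
# {'general_genome': 'hg19', 'genome': 'hg19', 'sample_name': '12'}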
Example #14
def runGOFromDatabase(outfile,
                      outdir,
                      statement_fg,
                      statement_bg,
                      go_file,
                      ontology_file=None,
                      samples=1000):
    """check for GO enrichment.

    Gene lists are extracted from a database.
    This method is a wrapper for `runGO.py`.

    Arguments
    ---------
    outfile : string
        Output filename
    outdir : string
        Output directory for auxiliary files
    statement_fg : string
        SQL statement to select genes of foreground set.
    statement_bg : string
        SQL statement to select genes in background set.
    go_file : string
        Filename with Gene-to-GO assignments
    ontology_file : string
        Filename with ontology information.
    samples : int
        Number of samples for empirical FDR. If not given, use
        BH FDR.
    """

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    cc = dbhandle.cursor()
    fg = set([x[0] for x in cc.execute(statement_fg).fetchall()])
    bg = set([x[0] for x in cc.execute(statement_bg).fetchall()])

    if len(fg) == 0:
        IOTools.touch_file(outfile)
        return

    fg_file = os.path.join(outdir, "foreground")
    bg_file = os.path.join(outdir, "background")
    outf = open(fg_file, "w")
    outf.write("\n".join(map(str, fg)) + "\n")
    outf.close()
    outf = open(bg_file, "w")
    outf.write("\n".join(map(str, bg)) + "\n")
    outf.close()

    runGOFromFiles(outfile,
                   outdir,
                   fg_file,
                   bg_file,
                   go_file,
                   ontology_file=ontology_file,
                   samples=samples)
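The database step above is plain sqlite3: run the two SELECT statements, collect the first column of each result into a set, and write one identifier per line. A hedged sketch with hypothetical statements and database/file names:

import sqlite3

# hypothetical statements and database name
statement_fg = "SELECT gene_id FROM deseq_results WHERE padj < 0.05"
statement_bg = "SELECT gene_id FROM deseq_results"

dbhandle = sqlite3.connect("csvdb")
cc = dbhandle.cursor()
fg = set(x[0] for x in cc.execute(statement_fg).fetchall())
bg = set(x[0] for x in cc.execute(statement_bg).fetchall())

# one identifier per line, matching the foreground/background files above
with open("foreground", "w") as outf:
    outf.write("\n".join(map(str, fg)) + "\n")
with open("background", "w") as outf:
    outf.write("\n".join(map(str, bg)) + "\n")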
Example #15
def realignMatchedSample(infile, outfile):
    ''' repeat realignments with merged bam of control and tumor
        this should help avoid problems with sample-specific realignments'''

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])

    PipelineExome.GATKIndelRealign(infile, outfile, genome)

    IOTools.zap_file(infile)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-a",
                      "--first-fastq-file",
                      dest="fastq1",
                      type="string",
                      help="supply read1 fastq file")
    parser.add_option("-b",
                      "--second-fastq-file",
                      dest="fastq2",
                      type="string",
                      help="supply read2 fastq file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if args and len(args) == 2:
        options.fastq1, options.fastq2 = args

    fastq1 = IOTools.open_file(options.fastq1)
    fastq2 = IOTools.open_file(options.fastq2)

    E.info("iterating over fastq files")
    f1_count = 0
    for f1, f2 in zip_longest(Fastq.iterate(fastq1), Fastq.iterate(fastq2)):
        if f1 is None or f2 is None:
            raise PairedReadError(
                "unpaired reads detected. Are the files sorted and of "
                "equal length?")
        else:
            assert f1.identifier.endswith("/1") and \
                f2.identifier.endswith("/2"), \
                "Reads in file 1 must end with /1 and reads in file 2 with /2"
            options.stdout.write(
                ">%s\n%s\n>%s\n%s\n" %
                (f1.identifier, f1.seq, f2.identifier, f2.seq))
            f1_count += 1

    E.info("output: %i pairs" % f1_count)

    # write footer and output benchmark information.
    E.stop()
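The pairing logic hinges on itertools.zip_longest: once one file is exhausted it yields None for that side, which is how unequal read counts are detected. A self-contained toy version of that check (plain strings stand in for Fastq records):

from itertools import zip_longest

def paired(iter1, iter2):
    """Yield read pairs, raising if one input runs out before the other."""
    for r1, r2 in zip_longest(iter1, iter2):
        if r1 is None or r2 is None:
            raise ValueError("unpaired reads detected - are the files "
                             "sorted and of equal length?")
        yield r1, r2

# toy usage: plain strings stand in for Fastq.iterate() records
reads1 = ["read1/1", "read2/1"]
reads2 = ["read1/2", "read2/2", "read3/2"]
try:
    for r1, r2 in paired(reads1, reads2):
        print(r1, r2)
except ValueError as exc:
    print("error:", exc)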
Example #17
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: bed2graph.py 2861 2010-02-23 17:36:32Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-o", "--output-section", dest="output", type="choice",
                      choices=("full", "name"),
                      help="output either ``full`` overlapping entries, only the ``name``s. [default=%default].")

    parser.set_defaults(
        output="full",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("two arguments required")

    if args[0] == "-":
        infile1 = options.stdin
    else:
        infile1 = IOTools.open_file(args[0], "r")

    infile2 = IOTools.open_file(args[1], "r")

    idx = Bed.readAndIndex(infile2, with_values=True)

    output = options.output
    outfile = options.stdout

    if output == "name":
        outfile.write("name1\tname2\n")
        outf = lambda x: x.fields[0]
    else:
        outf = str

    for bed in Bed.iterator(infile1):
        try:
            overlaps = idx[bed.contig].find(bed.start, bed.end)
        except (KeyError, IndexError):
            # ignore missing contig and zero length intervals
            continue

        for o in overlaps:
            outfile.write("\t".join((outf(bed), outf(o[2]))) + "\n")

    E.stop()
Example #18
def makeCpgIslandsBed(outfile):
    infile = PARAMS["methylation_summary_cpgislands"]
    out = IOTools.openFile(outfile, "w")
    with IOTools.openFile(infile, "r") as f:
        for line in f.readlines():
            # this assumes location of req. values
            contig, start, end = line.split()[1:4]
            if not contig == "chrom":
                out.write("%s\t%s\t%s\n" % (contig, start, end))
    out.close()
Example #19
File: CPC.py Project: kevinrue/cgat-flow
    def __call__(self, track, slice=None):

        c_transcript = []
        c_gene = []
        for transcript in GTF.transcript_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            c_transcript.append(len(transcript))
        for gene in GTF.flat_gene_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            c_gene.append(len(gene))

        return odict((("transcript", np.mean(c_transcript)), ("gene", np.mean(c_gene))))
    def test_touch_file_updates_existing_file(self):
        with IOTools.open_file(self.filename, "w") as outf:
            outf.write("some data\n")
        created = os.stat(self.filename).st_mtime
        time.sleep(1)
        IOTools.touch_file(self.filename)
        modified = os.stat(self.filename).st_mtime
        self.assertGreater(modified, created)
        with IOTools.open_file(self.filename) as inf:
            data = inf.read()
        self.assertEqual(data, "some data\n")
    def test_touch_file_creates_empty_file(self):
        self.assertFalse(os.path.exists(self.filename))
        IOTools.touch_file(self.filename)
        self.assertTrue(os.path.exists(self.filename))
        if self.filename.endswith(".gz"):
            self.assertFalse(IOTools.is_empty(self.filename))
        else:
            self.assertTrue(IOTools.is_empty(self.filename))

        with IOTools.open_file(self.filename) as inf:
            data = inf.read()
        self.assertEqual(len(data), 0)
Example #22
def mergeSampleBams(infile, outfile):
    '''merge control and tumor bams'''
    # Note: need to change readgroup headers for merge and subsequent
    # splitting of bam files
    to_cluster = USECLUSTER
    job_memory = PARAMS["gatk_memory"]

    tmpdir_gatk = P.get_temp_dir(shared=True)

    outfile_tumor = outfile.replace(PARAMS["sample_control"],
                                    PARAMS["sample_tumour"])
    infile_tumor = infile.replace(PARAMS["sample_control"],
                                  PARAMS["sample_tumour"])

    infile_base = os.path.basename(infile)
    infile_tumor_base = infile_base.replace(PARAMS["sample_control"],
                                            PARAMS["sample_tumour"])

    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace(PARAMS["sample_control"],
                                PARAMS["sample_tumour"])

    library = PARAMS["readgroup_library"]
    platform = PARAMS["readgroup_platform"]
    platform_unit = PARAMS["readgroup_platform_unit"]

    control_id = "Control.bam"
    tumor_id = control_id.replace(PARAMS["sample_control"],
                                  PARAMS["sample_tumour"])

    statement = '''picard AddOrReplaceReadGroups
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(infile_base)s
                    RGLB=%(library)s RGPL=%(platform)s
                    RGPU=%(platform_unit)s RGSM=%(track)s
                    ID=%(track)s
                    VALIDATION_STRINGENCY=SILENT ;'''
    statement += '''picard AddOrReplaceReadGroups
                    INPUT=%(infile_tumor)s
                    OUTPUT=%(tmpdir_gatk)s/%(infile_tumor_base)s
                    RGLB=%(library)s RGPL=%(platform)s
                    RGPU=%(platform_unit)s RGSM=%(track_tumor)s
                    ID=%(track_tumor)s
                    VALIDATION_STRINGENCY=SILENT ;'''
    statement += '''samtools merge -rf
                    %(outfile)s
                    %(tmpdir_gatk)s/%(infile_base)s
                    %(tmpdir_gatk)s/%(infile_tumor_base)s;'''
    statement += "samtools index %(outfile)s; "
    statement += "rm -rf %(tmpdir_gatk)s ;"
    P.run(statement)
    IOTools.zap_file(infile)
    IOTools.zap_file(infile_tumor)
Example #23
def summarizeFastqScreen(infiles, outfiles):
    all_files = []
    for infile in infiles:
        all_files.extend(glob.glob(IOTools.snip(infile, "screen") + "*_screen.txt"))
    if len(all_files) == 0:
        E.warn("no fastqcscreen results to concatenate")
        for x in outfiles:
            IOTools.touch_file(x)
        return
    df_summary, df_details = PipelineReadqc.read_fastq_screen(
        all_files)
    df_summary.to_csv(outfiles[0], sep="\t", index=True)
    df_details.to_csv(outfiles[1], sep="\t", index=True)
Example #24
def filterDamage(infile, damagestr, outfiles):
    '''
    Filter variants which have not been assessed as damaging by any
    of the specified tools.
    Tools and thresholds can be specified in the pipeline.yml.

    Does not account for multiple alt alleles - if any ALT allele has
    been assessed as damaging with any tool the variant is kept,
    regardless of if this is the allele called in the sample.

    '''
    damaging = damagestr.split(",")
    cols = IOTools.open_file(infile).readline().strip().split("\t")

    D = dict()
    # parses the "damage string" from the pipeline.yml
    # this should be formatted as COLUMN|result1-result2-...,COLUMN|result1...
    # where variants with any of these results in this column will
    # be retained
    for d in damaging:
        d = d.split("|")
        col = d[0]
        res = d[1].split("-")
        i = cols.index(col)
        D[col] = ((res, i))

    x = 0
    out = IOTools.open_file(outfiles[0], "w")
    out2 = IOTools.open_file(outfiles[1], "w")
    with IOTools.open_file(infile) as input:
        for line in input:
            if x > 1:
                # grep for specific strings within this column of this
                # line of the input file
                line = line.strip().split("\t")
                isdamaging = 0
                for key in D:
                    res, i = D[key]
                    current = line[i]
                    for r in res:
                        if re.search(r, current):
                            isdamaging = 1
                if isdamaging == 1:
                    out.write("%s\n" % "\t".join(line))
                else:
                    out2.write("%s\n" % "\t".join(line))
            else:
                out.write(line)
            x += 1
    out.close()
    out2.close()
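As the comments note, the damage string from pipeline.yml is a comma-separated list of COLUMN|result1-result2-... items. A small, hypothetical example of how it is parsed against the table header (the column names and values are invented):

# hypothetical damage string and table header
damagestr = "SIFT_pred|D,Polyphen2_HDIV_pred|D-P"
cols = ["CHROM", "POS", "SIFT_pred", "Polyphen2_HDIV_pred"]

D = {}
for d in damagestr.split(","):
    col, res = d.split("|")
    D[col] = (res.split("-"), cols.index(col))

print(D)
# {'SIFT_pred': (['D'], 2), 'Polyphen2_HDIV_pred': (['D', 'P'], 3)}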
Example #25
def loadManualAnnotations(infile, outfile):

    tmp = P.get_temp_filename(".")

    annotation = P.snip(infile, "_annotations.tsv")

    with IOTools.open_file(tmp, "w") as outf:
        outf.write("%s\tgene_id\n" % annotation)
        with IOTools.open_file(infile, "r") as inf:
            for line in inf:
                outf.write("%s\t%s" % (annotation, line))

    P.load(tmp, outfile, options="--add-index=gene_id")
    os.unlink(tmp)
Example #26
def make1basedCpgIslands(infile, outfile):

    # outfile, loadfile = outfiles

    out = IOTools.openFile(outfile, "w")
    out.write("%s\t%s\t%s\n" % ("contig", "position", "cpgi"))

    with IOTools.openFile(infile, "r") as f:
        lines = f.readlines()
        for line in lines:
            contig, start, stop = line.split()
            for position in range(int(start), int(stop) + 2):
                out.write("%s\t%s\t%s\n" % (contig, position, "CpGIsland"))
    out.close()
Example #27
def runRegexMotifSearch(infiles, outfile):
    '''run a regular expression search on sequences.
    compute counts.
    '''

    motif = "[AG]G[GT]T[CG]A"
    reverse_motif = "T[GC]A[CA]C[TC]"

    controlfile, dbfile = infiles
    if not os.path.exists(controlfile):
        raise ValueError("control file %s for %s does not exist" %
                         (controlfile, dbfile))

    motifs = []
    for x in range(0, 15):
        motifs.append(
            ("DR%i" % x, re.compile(motif + "." * x + motif, re.IGNORECASE)))
    for x in range(0, 15):
        motifs.append(("ER%i" % x,
                       re.compile(motif + "." * x + reverse_motif,
                                  re.IGNORECASE)))

    db_positions = Motifs.countMotifs(IOTools.open_file(dbfile, "r"), motifs)
    control_positions = Motifs.countMotifs(IOTools.open_file(controlfile, "r"),
                                           motifs)

    db_counts, control_counts = Motifs.getCounts(
        db_positions), Motifs.getCounts(control_positions)
    db_seqcounts, control_seqcounts = Motifs.getOccurances(
        db_positions), Motifs.getOccurances(control_positions)

    ndb, ncontrol = len(db_positions), len(control_positions)
    outf = IOTools.open_file(outfile, "w")
    outf.write(
        "motif\tmotifs_db\tmotifs_control\tseq_db\tseq_db_percent\tseq_control\tseq_control_percent\tfold\n"
    )
    for motif, pattern in motifs:
        try:
            fold = float(db_seqcounts[motif]) * \
                ncontrol / (ndb * control_seqcounts[motif])
        except ZeroDivisionError:
            fold = 0

        outf.write(
            "%s\t%i\t%i\t%i\t%s\t%i\t%s\t%5.2f\n" %
            (motif, db_counts[motif], control_counts[motif],
             db_seqcounts[motif],
             IOTools.pretty_percent(db_seqcounts[motif],
                                    ndb), control_seqcounts[motif],
             IOTools.pretty_percent(control_seqcounts[motif], ncontrol), fold))
Example #28
def GenotypeSNPs(infile, snplist, outfile):
    '''
    Fetches the genotype from the variant tables for all samples
    for SNPs in the hapmap sample from makeRandomSNPSet.

    Complex sites are ignored (as simple SNPs are sufficient for these
    calculations).
    These are:
        Sites which failed QC (column 3 in the variant table is not PASS)
        Sites with more than 2 alleles defined (column 6 in the variant table
        contains more than one alternative allele)
        SNPs with more than one ID
        Indels
    '''
    out = IOTools.open_file(outfile, "w")
    with IOTools.open_file(infile) as inf:
        for line in inf:
            line = line.strip().split()
            # if the variant passed QC
            if line[4] == "PASS":
                genotype = line[7]
                # if the genotype looks normal e.g. 1/1
                if len(genotype) == 3:
                    # get the actual genotype (rather than the index)
                    if genotype[0] != ".":
                        ind1 = int(genotype[0])
                    else:
                        ind1 = 0
                    if genotype[2] != ".":
                        ind2 = int(genotype[2])
                    else:
                        ind2 = 0
                    A1 = line[5]
                    A2 = line[6].split(",")
                    AS = [A1] + A2

                    if len(AS) <= 2:
                        GT = "%s%s" % (AS[ind1], AS[ind2])
                        refGT = "%s%s" % (A1, A1)
                        if len(GT) == 2:
                            if line[3][0:2] == "rs" and len(
                                    line[3].split(";")) == 1:
                                snpid = line[3]
                                chrom = line[0]
                                pos = line[1]
                                if snpid in snplist:
                                    out.write("%s\t%s\t%s\t%s\t%s\n" %
                                              (snpid, chrom, pos, GT, refGT))
    out.close()
Example #29
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-n",
                      "--dry-run",
                      dest="dry_run",
                      action="store_true",
                      help="dry run, do not delete any files [%default]")

    parser.set_defaults(dry_run=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    filenames = args

    c = E.Counter()
    for filename in filenames:
        c.checked += 1
        if os.path.exists(filename + ".log"):
            if IOTools.isComplete(filename + ".log"):
                c.complete += 1
                continue

        if IOTools.isComplete(filename):
            c.complete += 1
            continue

        c.incomplete += 1
        E.info('deleting %s' % filename)
        if options.dry_run:
            continue
        os.unlink(filename)
        c.deleted += 1

    E.info(c)

    # write footer and output benchmark information.
    E.Stop()
Example #30
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio,
    MEME is not run on all intervals but only the
    top 10% of intervals (peakval) are used.
    Also, only the segment of 200 bp around the peak
    is used and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))

    tmpdir = P.get_temp_dir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.as_list(P.get_params()['motifs_masker']),
        halfwidth=int(P.get_params()["meme_halfwidth"]),
        maxsize=int(P.get_params()["meme_max_size"]),
        proportion=P.get_params()["meme_proportion"],
        min_sequences=P.get_params()["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        IOTools.touch_file(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(meme_max_size)s %(meme_options)s > %(outfile)s.log
        '''
        P.run(statement)

        collectMEMEResults(tmpdir, target_path, outfile)