# Example #1
# 0
def importRepeatsFromUCSC(infile, outfile, ucsc_database, repeattypes, genome):
    '''Import repeats from the ``rmsk`` tables of a UCSC database.

    The repeats are stored as a gzip-compressed :term:`gff` formatted file.

    Arguments
    ---------
    infile
        Not used by this function.
    outfile : string
        Destination of the sorted, sanitized and gzip-compressed output.
        The log of the sanitizing step is written to ``<outfile>.log``.
    ucsc_database : string
        Name of the UCSC database to connect to.
    repeattypes : string
        Comma-separated list of ``repClass`` values to select.
    genome : string
        Passed as ``--genome-file`` to ``cgat gff2gff``.

    Raises
    ------
    ValueError
        If the database has no table ending in ``rmsk`` or if none of
        the tables contains a repeat of the requested classes.
    '''

    # Turn "a,b,c" into "a','b','c" so that it can be dropped between
    # the outer quotes of the SQL ``IN ('...')`` clause below.
    # NOTE(review): the values are interpolated into the SQL text
    # verbatim - they are assumed to come from trusted configuration.
    repclasses = "','".join(repeattypes.split(","))

    # Repeats are either stored in a single ``rmsk`` table (hg19) or in
    # individual ``rmsk`` tables (mm9) like chr1_rmsk, chr2_rmsk, ....
    # In order to do a single statement, the ucsc mysql database is
    # queried for tables that end in rmsk.
    dbhandle = PipelineUCSC.connectToUCSC(
        host=PARAMS["ucsc_host"],
        user=PARAMS["ucsc_user"],
        database=ucsc_database)

    cc = dbhandle.execute("SHOW TABLES LIKE '%%rmsk'")
    tables = [x[0] for x in cc.fetchall()]
    if not tables:
        raise ValueError("could not find any `rmsk` tables")

    tmpfile = P.getTempFile(shared=True)
    # ``tmpfilename`` must stay a local of this function: it is
    # referenced by name in the command line statement below.
    tmpfilename = tmpfile.name

    try:
        total_repeats = 0
        for table in tables:
            E.info("%s: loading repeats from %s" % (ucsc_database, table))
            # Select the nine gff columns. The start is shifted by one
            # (presumably converting 0-based UCSC starts into 1-based
            # gff coordinates - confirm against the rmsk schema).
            cc = dbhandle.execute(
                """SELECT genoName, 'repeat', 'exon', genoStart+1, genoEnd, '.',
                strand, '.',
                CONCAT('class \\"', repClass, '\\"; family \\"', repFamily, '\\";')
                FROM %(table)s
                WHERE repClass in ('%(repclasses)s') """ % {
                    "table": table,
                    "repclasses": repclasses})
            n = 0
            for data in cc.fetchall():
                n += 1
                tmpfile.write("\t".join(map(str, data)) + "\n")
            E.info("%s: %s=%i repeats downloaded" % (ucsc_database, table, n))
            total_repeats += n

        # Flush everything to disk before the external tools read it.
        tmpfile.close()

        if total_repeats == 0:
            raise ValueError(
                "did not find any repeats for %s" % ucsc_database)

        # P.run() is called without arguments; ``statement`` and the
        # names it interpolates are kept as locals of this function.
        statement = '''cat %(tmpfilename)s
    | %(pipeline_scriptsdir)s/gff_sort pos
    | cgat gff2gff
    --method=sanitize
    --sanitize-method=genome
    --skip-missing
    --genome-file=%(genome)s
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''
        P.run()
    finally:
        # Always release the temporary file, also when the download,
        # the sanity check or the external command failed. Closing an
        # already closed file is a no-op.
        tmpfile.close()
        os.unlink(tmpfilename)