Exemplo n.º 1
0
    def test_tabs2genetrack(self):
        "Testing bed2genetrack transformation"
        from genetrack.scripts import tabs2genetrack

        inpfile = conf.testdata('short-data.bed', verify=True)
        outfile = conf.tempdata('short-data.genetrack')
        tabs2genetrack.transform(inpfile, outfile, format='BED')
Exemplo n.º 2
0
def transform(inpname, outname, format, shift=0, index=False, options=None):
    """
    Transforms reads stored in bedfile to a genetrack input file.
    Requires at least 6 bed columns to access the strand.
    """

    # detect file formats
    if format == "BED":
        CHROM, START, END, STRAND = 0, 1, 2, 5
    elif format == "GFF":
        CHROM, START, END, STRAND = 0, 3, 4, 6
    else:
        raise Exception('Invalid file format' % format)

    # two sanity checks, one day someone will thank me
    if format == 'BED' and inpname.endswith('gff'):
        raise Exception('BED format on a gff file?')
    if format == 'GFF' and inpname.endswith('bed'):
        raise Exception('GFF format on a bed file?')

    # find the basename of the outputname
    basename = os.path.basename(outname)
   
    # two files store intermediate results
    flat = conf.tempdata( '%s.flat' % basename )
    sorted  = conf.tempdata( '%s.sorted' % basename )

    # check for track information on first line, 
    # much faster this way than conditional checking on each line
    fp = file(inpname, 'rU')
    first = fp.readline()
    fp.close()

    # create the reader
    reader = csv.reader(file(inpname, 'rU'), delimiter='\t')

    # skip if trackline exists
    if first.startswith == 'track':
        reader.next()

    # unwind the comments
    list(takewhile(lambda x: x[0].startswith('#'), reader))

    # copious timing info for those who enjoy these
    timer, full = util.Timer(), util.Timer()

    logger.debug("parsing '%s'" % inpname)
    logger.debug("output to '%s'" % outname)

    # create the unsorted output file and apply corrections
    logger.debug("unsorted flat file '%s'" % flat)

    fp = file(flat, 'wt')
    for linec, row in enumerate(reader):
        try:
            chrom, start, end, strand = row[CHROM], row[START], row[END], row[STRAND]
        except Exception, exc:
            first = row[0][0]
            # may be hitting the end of the file with other comments
            if  first == '>':
                break # hit the sequence content of the gff file
            elif first == '#':
                continue # hit upon some comments
            else:
                logger.error(row)
                raise Exception(exc) 

        if strand == '+':
            # on forward strand, 5' is at start
            idx = int(start) + shift
            fwd, rev, val = 1, 0, 1
        elif strand == '-':
            # on reverse strand, 5' is at end
            idx = int(end) - shift
            fwd, rev, val = 0, 1, 1
        else:
            # no strand specified, generate interval centers
            idx = (int(start)+int(end))/2
            fwd, rev, val = 0, 0, 1

        # it is essential be able to sort the index as a string! 
        fp.write('%s\t%012d\t%s\t%s\t%s\n' % (chrom, idx, fwd, rev, val))
Exemplo n.º 3
0
def bedfile(inpfile):
    basename = os.path.basename(inpfile)
    outfile = conf.tempdata('%s.genetrack' % basename)
    bed2genetrack.transform(inpfile, outfile)
Exemplo n.º 4
0
def bedfile(inpfile):
    basename = os.path.basename(inpfile)
    outfile = conf.tempdata('%s.genetrack' % basename)
    bed2genetrack.transform(inpfile, outfile)