def annotate_gtf_parallel(input_gtf_file, output_gtf_file, gtf_sample_attr,
                          num_processors, tmp_dir):
    """Annotate a GTF file using a pool of worker processes.

    input_gtf_file: path to the GTF file to annotate
    output_gtf_file: path where the merged/sorted annotated GTF is written
    gtf_sample_attr: attribute name passed through to each worker
    num_processors: number of worker processes to spawn
    tmp_dir: directory for intermediate per-worker GTF files
    """
    # bounded queue applies back-pressure so the parser cannot race
    # arbitrarily far ahead of the workers
    input_queue = JoinableQueue(maxsize=num_processors * 3)
    # start worker processes, each writing to its own temp GTF file
    procs = []
    worker_gtf_files = []
    for i in range(num_processors):
        worker_gtf_file = os.path.join(tmp_dir,
                                       "annotate_worker%03d.gtf" % (i))
        worker_gtf_files.append(worker_gtf_file)
        args = (input_queue, worker_gtf_file, gtf_sample_attr)
        p = Process(target=annotate_gtf_worker, args=args)
        p.daemon = True
        p.start()
        procs.append(p)
    # feed loci to the workers; 'with' guarantees the input file is
    # closed (the original leaked the handle)
    with open(input_gtf_file) as fileh:
        for lines in parse_loci(fileh):
            input_queue.put(lines)
    # send one empty-list sentinel per worker to signal shutdown
    for p in procs:
        input_queue.put([])
    # wait for workers to finish all queued tasks, then close queue
    input_queue.join()
    input_queue.close()
    # join worker processes
    for p in procs:
        p.join()
    # merge/sort worker gtf files into the final output
    logging.debug("Merging %d worker GTF file(s)" % (num_processors))
    merge_sort_gtf_files(worker_gtf_files, output_gtf_file, tmp_dir=tmp_dir)
    # remove worker gtf files
    for filename in worker_gtf_files:
        if os.path.exists(filename):
            os.remove(filename)
# --- Exemplo n.º 2 (vote count: 0) — scrape-site separator, commented out so the file parses ---
def annotate_gtf(gtf_file, bed_dbs):
    """Annotate GTF features with overlaps from reference BED databases.

    gtf_file: path to the GTF file to annotate
    bed_dbs: iterable of (name, filename) pairs; each BED file is loaded
        into interval trees keyed by sequence id

    Annotated features are printed to stdout, one GTF line per feature.
    """
    # read reference databases once, up front
    bed_trees = []
    for name, filename in bed_dbs:
        logging.debug("Loading BED db '%s' file '%s'" % (name, filename))
        trees = build_interval_tree_from_bed(filename)
        bed_trees.append((name, trees))
    # parse gtf file and annotate; 'with' closes the file handle
    # (the original leaked it)
    logging.debug("Annotating GTF")
    with open(gtf_file) as fileh:
        for lines in parse_loci(fileh):
            features = []
            transcripts = []
            # transcript_id -> db name -> set of matching BED values
            transcript_matches = collections.defaultdict(
                lambda: collections.defaultdict(set))
            for line in lines:
                f = GTFFeature.from_string(line)
                features.append(f)
                t_id = f.attrs['transcript_id']
                if f.feature_type == 'transcript':
                    transcripts.append(f)
                elif f.feature_type == 'exon':
                    for dbname, dbtrees in bed_trees:
                        # intersect this exon with db features; keep
                        # only same-strand hits
                        hits = dbtrees[f.seqid].find(f.start, f.end)
                        matches = set(hit.value for hit in hits
                                      if hit.strand == f.strand)
                        f.attrs[dbname] = ','.join(sorted(matches))
                        # accumulate transcript-level matches
                        transcript_matches[t_id][dbname].update(matches)
            # set transcript annotations from accumulated exon matches
            for f in transcripts:
                t_id = f.attrs['transcript_id']
                for dbname, dbtrees in bed_trees:
                    matches = transcript_matches[t_id][dbname]
                    f.attrs[dbname] = ','.join(sorted(matches))
            # write features (parenthesized form is valid in py2 and py3)
            for f in features:
                print(str(f))
    logging.debug("Done")
# --- Exemplo n.º 3 (vote count: 0) — scrape-site separator, commented out so the file parses ---
def annotate_gtf_parallel(input_gtf_file,
                          output_gtf_file,
                          gtf_sample_attr,
                          num_processors,
                          tmp_dir):
    """Annotate a GTF file in parallel (duplicate of the earlier definition).

    input_gtf_file: path to the GTF file to annotate
    output_gtf_file: path where the merged/sorted annotated GTF is written
    gtf_sample_attr: attribute name passed through to each worker
    num_processors: number of worker processes to spawn
    tmp_dir: directory for intermediate per-worker GTF files
    """
    # bounded queue keeps the producer from outrunning the workers
    input_queue = JoinableQueue(maxsize=num_processors * 3)
    # start worker processes
    procs = []
    worker_gtf_files = []
    for i in range(num_processors):
        worker_gtf_file = os.path.join(tmp_dir, "annotate_worker%03d.gtf" % (i))
        worker_gtf_files.append(worker_gtf_file)
        args = (input_queue, worker_gtf_file, gtf_sample_attr)
        p = Process(target=annotate_gtf_worker, args=args)
        p.daemon = True
        p.start()
        procs.append(p)
    # feed loci to the workers; close the input file when done
    # (the original leaked the handle)
    with open(input_gtf_file) as fileh:
        for lines in parse_loci(fileh):
            input_queue.put(lines)
    # one empty-list sentinel per worker signals shutdown
    for p in procs:
        input_queue.put([])
    # wait for all queued tasks to be processed, then close queue
    input_queue.join()
    input_queue.close()
    # join worker processes
    for p in procs:
        p.join()
    # merge/sort worker gtf files
    logging.debug("Merging %d worker GTF file(s)" % (num_processors))
    merge_sort_gtf_files(worker_gtf_files, output_gtf_file, tmp_dir=tmp_dir)
    # remove worker gtf files
    for filename in worker_gtf_files:
        if os.path.exists(filename):
            os.remove(filename)
def run_parallel(config):
    """
    runs assembly in parallel and merges output from child processes

    config: RunConfig object
    returns 0 on success
    """
    # create temp directory for per-worker output
    tmp_dir = os.path.join(config.output_dir, "tmp")
    if not os.path.exists(tmp_dir):
        logging.debug("Creating tmp directory '%s'" % (tmp_dir))
        os.makedirs(tmp_dir)
    # bounded queue applies back-pressure to the producer
    input_queue = JoinableQueue(maxsize=config.num_processors*3)
    # shared memory values: process-safe counters for output ids
    locus_id_value_obj = LockValue(1)
    gene_id_value_obj = LockValue(1)
    tss_id_value_obj = LockValue(1)
    t_id_value_obj = LockValue(1)
    # start worker processes
    procs = []
    worker_prefixes = []
    for i in range(config.num_processors):
        worker_prefix = os.path.join(tmp_dir, "worker%03d" % (i))
        worker_prefixes.append(worker_prefix)
        args = (input_queue, 
                locus_id_value_obj,
                gene_id_value_obj,
                tss_id_value_obj,
                t_id_value_obj,
                worker_prefix,
                config)
        p = Process(target=assembly_worker, args=args)
        p.daemon = True
        p.start()
        procs.append(p)
    # parse gtf file; 'with' closes the handle (original leaked it)
    with open(config.gtf_input_file) as gtf_fileh:
        for lines in parse_loci(gtf_fileh):
            input_queue.put(lines)
    # one empty-list sentinel per worker signals shutdown
    for p in procs:
        input_queue.put([])
    # wait for queue to drain, then close
    input_queue.join()
    input_queue.close()
    # join worker processes
    for p in procs:
        p.join()
    # merge gtf files
    if config.create_gtf:
        logging.info("Merging %d worker GTF files" % 
                     (config.num_processors))
        worker_gtf_files = [prefix + ".gtf" for prefix in worker_prefixes]
        output_gtf_file = os.path.join(config.output_dir, "assembly.gtf")
        merge_sort_gtf_files(worker_gtf_files, output_gtf_file, 
                             tmp_dir=tmp_dir)
        # remove worker gtf files
        for filename in worker_gtf_files:
            if os.path.exists(filename):
                os.remove(filename)
    # merge bed files
    if config.create_bed:
        logging.info("Merging %d worker BED files" % 
                     (config.num_processors))
        worker_bed_files = [p + ".bed" for p in worker_prefixes]
        output_bed_file = os.path.join(config.output_dir, "assembly.bed")
        merge_sort_files(worker_bed_files, output_bed_file, 
                         sort_func=sort_bed, 
                         tmp_dir=tmp_dir)
        # write bed file track description line
        track_name = os.path.basename(config.output_dir)
        track_line = ' '.join(['track name="%s"' % (track_name),
                               'description="%s"' % (track_name),
                               'visibility=pack',
                               'useScore=1'])
        track_file = os.path.join(config.output_dir, 
                                  "assembly.bed.ucsc_track")
        # write() + 'with' replaces py2-only 'print >>fileh': same
        # output, deterministic close, version-portable
        with open(track_file, "w") as fileh:
            fileh.write(track_line + "\n")
    # merge bedgraph files (one per strand)
    if config.create_bedgraph:
        logging.info("Merging %d worker bedGraph files" % 
                     (config.num_processors))
        for strand in range(0, 3):
            strand_name = STRAND_NAMES[strand]
            bgfiles = ['%s_%s.bedgraph' % (p, strand_name)
                       for p in worker_prefixes]
            output_file = os.path.join(config.output_dir, 
                                       "assembly_%s.bedgraph" % strand_name)
            merge_sort_files(bgfiles, output_file, 
                             sort_func=sort_bed, 
                             tmp_dir=tmp_dir)
            track_name = '%s_%s' % (os.path.basename(config.output_dir), 
                                    strand_name)
            track_line = ' '.join(['track type=bedGraph',
                                   'name="%s"' % (track_name),
                                   'description="%s"' % (track_name),
                                   'visibility=full',
                                   'color=%s' % (STRAND_COLORS[strand]),
                                   'autoScale=on',
                                   'alwaysZero=on',
                                   'maxHeightPixels=64:64:11'])
            track_file = os.path.join(config.output_dir, 
                                      "assembly_%s.bedgraph.ucsc_track" % strand_name)
            with open(track_file, "w") as fileh:
                fileh.write(track_line + "\n")
    # cleanup temp directory
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    logging.info("Done")
    return 0
def run_parallel(config):
    """
    runs assembly in parallel and merges output from child processes

    config: RunConfig object
    returns 0 on success
    """
    # create temp directory for per-worker output
    tmp_dir = os.path.join(config.output_dir, "tmp")
    if not os.path.exists(tmp_dir):
        logging.debug("Creating tmp directory '%s'" % (tmp_dir))
        os.makedirs(tmp_dir)
    # bounded queue applies back-pressure to the producer
    input_queue = JoinableQueue(maxsize=config.num_processors * 3)
    # shared memory values: process-safe counters for output ids
    locus_id_value_obj = LockValue(1)
    gene_id_value_obj = LockValue(1)
    tss_id_value_obj = LockValue(1)
    t_id_value_obj = LockValue(1)
    # start worker processes
    procs = []
    worker_prefixes = []
    for i in range(config.num_processors):
        worker_prefix = os.path.join(tmp_dir, "worker%03d" % (i))
        worker_prefixes.append(worker_prefix)
        args = (
            input_queue,
            locus_id_value_obj,
            gene_id_value_obj,
            tss_id_value_obj,
            t_id_value_obj,
            worker_prefix,
            config,
        )
        p = Process(target=assembly_worker, args=args)
        p.daemon = True
        p.start()
        procs.append(p)
    # parse gtf file; 'with' closes the handle (original leaked it)
    with open(config.gtf_input_file) as gtf_fileh:
        for lines in parse_loci(gtf_fileh):
            input_queue.put(lines)
    # one empty-list sentinel per worker signals shutdown
    for p in procs:
        input_queue.put([])
    # wait for queue to drain, then close
    input_queue.join()
    input_queue.close()
    # join worker processes
    for p in procs:
        p.join()
    # merge gtf files
    if config.create_gtf:
        logging.info("Merging %d worker GTF files" % (config.num_processors))
        worker_gtf_files = [prefix + ".gtf" for prefix in worker_prefixes]
        output_gtf_file = os.path.join(config.output_dir, "assembly.gtf")
        merge_sort_gtf_files(worker_gtf_files, output_gtf_file, tmp_dir=tmp_dir)
        # remove worker gtf files
        for filename in worker_gtf_files:
            if os.path.exists(filename):
                os.remove(filename)
    # merge bed files
    if config.create_bed:
        logging.info("Merging %d worker BED files" % (config.num_processors))
        worker_bed_files = [p + ".bed" for p in worker_prefixes]
        output_bed_file = os.path.join(config.output_dir, "assembly.bed")
        merge_sort_files(worker_bed_files, output_bed_file, sort_func=sort_bed, tmp_dir=tmp_dir)
        # write bed file track description line
        track_name = os.path.basename(config.output_dir)
        track_line = " ".join(
            ['track name="%s"' % (track_name), 'description="%s"' % (track_name), "visibility=pack", "useScore=1"]
        )
        track_file = os.path.join(config.output_dir, "assembly.bed.ucsc_track")
        # write() + 'with' replaces py2-only 'print >>fileh': same
        # output, deterministic close, version-portable
        with open(track_file, "w") as fileh:
            fileh.write(track_line + "\n")
    # merge bedgraph files (one per strand)
    if config.create_bedgraph:
        logging.info("Merging %d worker bedGraph files" % (config.num_processors))
        for strand in range(0, 3):
            strand_name = STRAND_NAMES[strand]
            bgfiles = ["%s_%s.bedgraph" % (p, strand_name) for p in worker_prefixes]
            output_file = os.path.join(config.output_dir, "assembly_%s.bedgraph" % strand_name)
            merge_sort_files(bgfiles, output_file, sort_func=sort_bed, tmp_dir=tmp_dir)
            track_name = "%s_%s" % (os.path.basename(config.output_dir), strand_name)
            track_line = " ".join(
                [
                    "track type=bedGraph",
                    'name="%s"' % (track_name),
                    'description="%s"' % (track_name),
                    "visibility=full",
                    "color=%s" % (STRAND_COLORS[strand]),
                    "autoScale=on",
                    "alwaysZero=on",
                    "maxHeightPixels=64:64:11",
                ]
            )
            track_file = os.path.join(config.output_dir, "assembly_%s.bedgraph.ucsc_track" % strand_name)
            with open(track_file, "w") as fileh:
                fileh.write(track_line + "\n")
    # cleanup temp directory
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    logging.info("Done")
    return 0