Example #1
def _get_vector(self, bp):
    # bp: open file handle on a BED file
    bed = read_bed(bp)
    divide_bed(bed, self.bin_size)  # bin the intervals in place
    merge_bed(bed)                  # merge the binned intervals in place
    try:
        vector = bed2vector(bed, self.chromosome_len)
    except Exception:
        print(bp)  # report which input failed before re-raising
        raise
    return vector
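
A note on the pattern here: divide_bed and merge_bed mutate the bed object in place (their return values are ignored), while bed2vector returns a new per-base vector. A minimal self-contained sketch of that projection step, assuming intervals are 0-based, half-open (start, end) pairs (the library's actual bed representation may differ):

import numpy as np

def bed2vector_sketch(intervals, chromosome_len):
    # intervals: iterable of 0-based, half-open (start, end) pairs
    # (an assumed representation, not the library's actual bed object)
    vector = np.zeros(chromosome_len, dtype="int8")
    for start, end in intervals:
        vector[start:end] = 1  # mark every covered base
    return vector

# bed2vector_sketch([(0, 5), (3, 8)], 10) -> array([1,1,1,1,1,1,1,1,0,0])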
Example #2
def init_dhs_extraction(self):
    # Build a binary DHS vector, then collect each sample's signal
    # at the covered positions into a sparse matrix.
    with open(self.dhs_path) as dp:
        dhs = read_bed(dp)
        divide_bed(dhs, self.bin_size)
        merge_bed(dhs)
        self.dhs_vector = bed2vector(dhs, self.chromosome_len)
        coverage = len(nonzero(self.dhs_vector)[0])  # number of covered positions
        print(coverage, "coverage")

    # one row per covered position, one column per input bed file
    ret = empty((coverage, len(self.beds_path)), dtype="int8")

    for current_col, bed_path in enumerate(self.beds_path):
        with open(bed_path) as bp:
            ret[:, current_col] = self._bed2extracted_vector(bp)
    self.sparse = csr_matrix(ret, dtype="int16")
    print("init DHS extraction finished")
Example #3
def aggregate_parallel(samples, args, results):
    '''
    Process and aggregate GTF input files

    samples: list of Sample objects
    args: command-line arguments from the argparse module that configure
          the assembly process
    results: Results object containing input and output filenames
    '''
    logging.info('Aggregating in parallel using %d processes' %
                 (args.num_processes))

    if args.filter_splice_juncs and args.ref_genome_fasta_file:
        # test opening FastaFile
        logging.info('Indexing reference genome fasta file (if necessary)')
        fasta_fh = FastaFile(args.ref_genome_fasta_file)
        fasta_fh.close()

    # create queue
    input_queue = JoinableQueue(maxsize=args.num_processes * 2)
    # start worker processes
    procs = []
    worker_results = []
    for i in range(args.num_processes):
        worker_id = 'aggregate_worker%03d' % i
        worker_dir = os.path.join(results.tmp_dir, worker_id)
        if not os.path.exists(worker_dir):
            os.makedirs(worker_dir)
        worker_results.append(Results(worker_dir))
        p = Process(target=aggregate_worker,
                    args=(input_queue, args, worker_dir))
        p.start()
        procs.append(p)

    # reference gtf
    if args.ref_gtf_file is not None:
        logging.debug('Reference: %s' % args.ref_gtf_file)
        input_queue.put(Sample(args.ref_gtf_file, Sample.REF_ID))
    # parse samples
    for sample in samples:
        input_queue.put(sample)
    for p in procs:
        input_queue.put(None)
    # close input queue
    input_queue.join()
    input_queue.close()
    # join worker processes
    for p in procs:
        p.join()

    # merge output files
    logging.info('Merging aggregated files')
    logging.debug('\tmerging bed files')
    retcode = merge_bed(
        input_files=[r.transfrags_bed_file for r in worker_results],
        output_file=results.transfrags_bed_file,
        num_processes=args.num_processes,
        tmp_dir=results.tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux merge')

    logging.debug('\tmerging filtered bed files')
    retcode = merge_bed(
        input_files=[r.transfrags_filtered_bed_file for r in worker_results],
        output_file=results.transfrags_filtered_bed_file,
        num_processes=args.num_processes,
        tmp_dir=results.tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux merge')

    logging.debug('\tmerging sample stats')

    def sort_key_field0(line):
        # sort merged stats lines by the first tab-separated field (sample_id)
        fields = line.split('\t', 1)
        return fields[0]

    stats_header = ['sample_id', 'num_transfrags', 'filtered_length',
                    'filtered_expr', 'filtered_splice\n']
    stats_header = '\t'.join(stats_header)
    merge_files(input_files=[r.sample_stats_file for r in worker_results],
                output_file=results.sample_stats_file,
                key=sort_key_field0,
                header=stats_header)
    # cleanup worker data
    logging.info('Removing temporary files')

    def shutil_error_callback(func, path, excinfo):
        logging.error('Error removing tmp files path=%s message=%s' %
                      (path, excinfo))

    for r in worker_results:
        shutil.rmtree(r.output_dir, onerror=shutil_error_callback)
    logging.info('Aggregate done')
    return 0
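
The function uses the standard multiprocessing sentinel pattern: exactly one None is enqueued per worker so every consumer receives a shutdown signal, input_queue.join() blocks until each item has been marked with task_done(), and only then are the worker processes joined. A stripped-down, runnable sketch of that pattern (the worker body is a placeholder, not TACO's aggregate_worker):

from multiprocessing import JoinableQueue, Process


def worker(queue):
    while True:
        item = queue.get()
        try:
            if item is None:           # sentinel: stop consuming
                break
            print('processing', item)  # placeholder for real work
        finally:
            queue.task_done()          # lets queue.join() return


if __name__ == '__main__':
    queue = JoinableQueue(maxsize=4)
    procs = [Process(target=worker, args=(queue,)) for _ in range(2)]
    for p in procs:
        p.start()
    for item in ['a.gtf', 'b.gtf', 'c.gtf']:
        queue.put(item)
    for _ in procs:
        queue.put(None)                # one sentinel per worker
    queue.join()                       # block until all items handled
    queue.close()
    for p in procs:
        p.join()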
Example #4
def testMerge2(self):
    # dividing then merging should reproduce the pre-computed fixture
    divide_bed(self.bed_obj, 100)
    merge_bed(self.bed_obj)
    with open("./utest/test1_divided_merged.bed") as dmd:
        self.bed_dmer = read_bed(dmd)
    self.assertEqual(self.bed_obj, self.bed_dmer)
Example #5
def testMerge(self):
    # merging alone should match the merged fixture file
    merge_bed(self.bed_obj)
    with open("./utest/test1_merged.bed") as md:
        self.bed_mer = read_bed(md)
    self.assertEqual(self.bed_obj, self.bed_mer)
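
Both tests follow the same fixture pattern: mutate self.bed_obj in place, load the expected result from a pre-computed .bed file, and compare with assertEqual (which presumes the bed class implements __eq__). Conceptually, merging coalesces overlapping or adjacent intervals; a minimal sketch under the assumption that intervals are sorted, 0-based, half-open (start, end) pairs:

def merge_intervals(intervals):
    # intervals: sorted 0-based, half-open (start, end) pairs
    # (an assumed representation, not the library's actual bed object)
    merged = []
    for start, end in intervals:
        if merged and start <= merged[-1][1]:  # overlaps or abuts previous
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

# merge_intervals([(0, 5), (3, 8), (10, 12)]) -> [(0, 8), (10, 12)]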