def run_signalp(fasta_files, output_file, tmp_dir):
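    """Run SignalP on each FASTA file in a separate worker process.

    Worker output lines are merged into a single file in tmp_dir, sorted by
    the first whitespace-delimited field (the sequence/ORF ID), and written
    to output_file. Worker failures are logged but not propagated; the
    function always returns 0.
    """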
    def _consumer(worker_index, worker_fasta_file, output_queue):
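        """Run SignalP on one FASTA file, stream result lines to output_queue,
        then put a None sentinel to tell the main process this worker is done."""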
        # process with signalp
        logging.debug('[Worker %d] Running SignalP' % (worker_index))
        args = ['signalp', '-f', 'short', '-t', 'euk', worker_fasta_file]
        tmp_output_file = os.path.join(tmp_dir, 'worker%d.signalp.txt' % (worker_index))
        with open(tmp_output_file, 'w') as outfileh:
            retcode = subprocess.call(args, stdout=outfileh)
            if retcode != 0:
                logging.error('[Worker %d] Error running SignalP' % (worker_index))
        # put results into output queue
        with open(tmp_output_file) as f:
            for line in f:
                if not line:
                    continue
                line = line.strip()
                if not line:
                    continue
                if line.startswith('#'):
                    continue
                output_queue.put(line)
        output_queue.put(None)
        logging.debug('[Worker %d] Finished' % (worker_index))
    # create multiprocessing queue for passing data
    output_queue = Queue(maxsize=len(fasta_files) * 4)
    # start consumer processes
    procs = []
    for i in xrange(len(fasta_files)):
        p = Process(target=_consumer, args=(i, fasta_files[i], output_queue))
        p.start()
        procs.append(p)
    # get results from consumers
    num_alive = len(procs)
    tmp_output_file = os.path.join(tmp_dir, 'signalp.merge.txt')
    with open(tmp_output_file, 'w') as f:
        while num_alive > 0:
            result = output_queue.get()
            if result is None:
                num_alive -= 1
                logging.debug("Main process detected worker finished, %d still alive" % (num_alive))
            else:
                print >> f, result
    logging.debug("Joining all processes")
    # wait for consumers to finish
    for p in procs:
        p.join()
    # sort
    logging.debug('Sorting signal peptide results')

    def sort_signalp(line):
        return line.split(None, 1)[0]

    sort_tmp_dir = os.path.join(tmp_dir, 'sort_tmp')
    os.makedirs(sort_tmp_dir)
    batch_sort(input=tmp_output_file,
               output=output_file,
               key=sort_signalp,
               buffer_size=SORT_BUFFER_SIZE,
               tempdirs=[sort_tmp_dir])
    # cleanup temporary files
    shutil.rmtree(sort_tmp_dir)
    return 0
def sort_classification_results(input_file, output_file, tmp_dir):
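    """Sort tab-delimited classification results by chromosome and start
    position; a 'chrom' header line, if present, sorts to the top."""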
    # sort classification results
    def sort_by_chrom_start(line):
        fields = line.strip().split('\t', 2)
        if fields[0] == "chrom":
            return chr(0), 0
        return fields[0], int(fields[1])

    batch_sort(input=input_file,
               output=output_file,
               key=sort_by_chrom_start,
               buffer_size=(1 << 21),
               tempdirs=[tmp_dir])
def orf_analysis(gtf_file, genome_fasta_file, pfam_dir, output_dir,
                 min_orf_length, first_orf_only, num_processes):
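    """Find ORFs in the transcripts of gtf_file and annotate them.

    Writes per-transcript ORF tables/BED files and unique-ORF tables/BED
    files to output_dir, splits the unique ORF protein sequences across
    num_processes FASTA files, runs SignalP and pfam_scan.pl on them,
    merges those results with the ORF table, and sorts the merged table
    by transcript ID. Returns 0 on success, 1 if the SignalP step reports
    failure.
    """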
    #
    # extract transcript DNA sequences, translate to protein, and
    # search for ORFs
    #
    logging.debug('Finding ORFs in transcript sequences')
    # output files
    tmp_dir = os.path.join(output_dir, 'tmp')
    if not os.path.exists(tmp_dir):
        logging.info("Creating tmp directory '%s'" % (tmp_dir))
        os.makedirs(tmp_dir)
    orf_bed_file = os.path.join(output_dir, 'transcript_orfs.bed')
    unique_orf_file = os.path.join(output_dir, 'unique_orfs.txt')
    unique_orf_bed_file = os.path.join(output_dir, 'unique_orfs.bed')
    orf_file = os.path.join(tmp_dir, 'transcript_orfs.no_ids.txt')
    sorted_orf_file = os.path.join(tmp_dir,
                                   'transcript_orfs.no_ids.sortbyorf.txt')
    sorted_orf_id_file = os.path.join(tmp_dir, 'transcript_orfs.sortbyorf.txt')
    signalp_file = os.path.join(output_dir, 'signalp.txt')
    pfam_file = os.path.join(output_dir, 'pfam.txt')
    merged_orf_id_file = os.path.join(output_dir,
                                      'transcript_orfs.sortbyorf.merged.txt')
    sorted_merged_orf_id_file = os.path.join(
        output_dir, 'transcript_orfs.sortbytranscript.merged.txt')
    # open output files
    orf_fileh = open(orf_file, 'w')
    orf_bed_fileh = open(orf_bed_file, 'w')
    # open genome fasta file
    ref_fa = pysam.Fastafile(genome_fasta_file)
    num_finished = 1
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t in locus_transcripts:
            if first_orf_only:
                # get first ORF
                orf = get_first_transcript_orf(t, ref_fa)
                if len(orf.seq) >= min_orf_length:
                    print >> orf_fileh, '\t'.join(orf.to_table())
                    print >> orf_bed_fileh, '\t'.join(orf.to_bed())
            else:
                # get all ORFs
                for orf in get_all_transcript_orfs(t, ref_fa, min_orf_length):
                    print >> orf_fileh, '\t'.join(orf.to_table())
                    print >> orf_bed_fileh, '\t'.join(orf.to_bed())
            if (num_finished % 10000) == 0:
                logging.debug('Processed %d transcripts' % (num_finished))
            num_finished += 1
    # cleanup
    orf_fileh.close()
    orf_bed_fileh.close()
    #
    # sort ORF table by ORF amino acid sequence to group identical ORFs
    # together
    #
    logging.debug('Sorting ORFs by amino acid sequence')

    def sort_by_seq(line):
        '''comparison function for batch_sort'''
        fields = line.strip().split('\t')
        return fields[ORFInfo.SEQ_COL_NUM]

    sort_tmp_dir = os.path.join(tmp_dir, 'sort_tmp')
    os.makedirs(sort_tmp_dir)
    batch_sort(input=orf_file,
               output=sorted_orf_file,
               key=sort_by_seq,
               buffer_size=SORT_BUFFER_SIZE,
               tempdirs=[sort_tmp_dir])
    shutil.rmtree(sort_tmp_dir)
    #
    # assign each ORF a unique id and write to FASTA file
    #
    logging.debug('Determining unique ORFs')
    orf_fasta_prefix = os.path.join(tmp_dir, 'orf')
    orf_fasta_files = []
    orf_fasta_sizes = []
    for i in xrange(num_processes):
        orf_fasta_files.append(open('%s%d.fasta' % (orf_fasta_prefix, i), 'w'))
        orf_fasta_sizes.append(0)
    orf_file_index = 0
    outfileh = open(sorted_orf_id_file, 'w')
    unique_orf_fileh = open(unique_orf_file, 'w')
    print >> unique_orf_fileh, '\t'.join([
        'orf_id', 'orf_length', 'total_occurrences',
        'unique_genomic_occurrences'
    ])
    unique_orf_bed_fileh = open(unique_orf_bed_file, 'w')
    with open(sorted_orf_file) as infileh:
        for orfs in group_unique_orfs(infileh):
            # write to master transcript/ORF table
            for orf in orfs:
                print >> outfileh, '\t'.join(orf.to_table())
            # write ORF to fasta file
            lines = to_fasta(orfs[0].orf_id, orfs[0].seq.strip('*'))
            print >> orf_fasta_files[orf_file_index], lines
            orf_fasta_sizes[orf_file_index] += 1
            # advance to next fasta file
            orf_file_index = (orf_file_index + 1) % (num_processes)
            # group by genomic position and write ORFs to BED file
            unique_genome_orfs = {}
            for orf in orfs:
                k = (orf.chrom, orf.strand, tuple(orf.exons))
                if k in unique_genome_orfs:
                    continue
                unique_genome_orfs[k] = orf
            for orf in unique_genome_orfs.itervalues():
                print >> unique_orf_bed_fileh, '\t'.join(orf.to_bed(
                    orf.orf_id))
            # write unique ORF to tab-delimited text file
            fields = [
                orfs[0].orf_id,
                len(orfs[0].seq),
                len(orfs),
                len(unique_genome_orfs)
            ]
            print >> unique_orf_fileh, '\t'.join(map(str, fields))
    # cleanup
    unique_orf_bed_fileh.close()
    outfileh.close()
    # get fasta files with lines written
    orf_fasta_file_names = []
    for i in xrange(len(orf_fasta_files)):
        orf_fasta_files[i].close()
        if orf_fasta_sizes[i] > 0:
            orf_fasta_file_names.append(orf_fasta_files[i].name)
    #
    # search FASTA file against signalp
    #
    logging.debug('Searching for signal peptides')
    retcode = run_signalp(orf_fasta_file_names, signalp_file, tmp_dir)
    if retcode != 0:
        logging.error('Error searching for signal peptides')
        return 1
    #
    # search FASTA file against Pfam
    #
    logging.debug('Scanning for Pfam domains')
    retcode = run_pfam(orf_fasta_file_names, pfam_dir, pfam_file, tmp_dir)
    if retcode != 0:
        logging.error('Error running pfam_scan.pl')
    #
    # merge results from Pfam and signalp
    #
    logging.debug('Merging SignalP and Pfam results')
    merge_results(sorted_orf_id_file, signalp_file, pfam_file,
                  merged_orf_id_file)
    #
    # sort by transcript id
    #
    logging.debug('Sorting ORFs by transcript ID')

    def sort_by_transcript_id(line):
        return line.split('\t', 1)[0]

    sort_tmp_dir = os.path.join(tmp_dir, 'sort_tmp')
    os.makedirs(sort_tmp_dir)
    batch_sort(input=merged_orf_id_file,
               output=sorted_merged_orf_id_file,
               key=sort_by_transcript_id,
               buffer_size=SORT_BUFFER_SIZE,
               tempdirs=[sort_tmp_dir])
    shutil.rmtree(sort_tmp_dir)
    # cleanup
    ref_fa.close()
    return 0
def run_pfam(fasta_files, pfam_dir, output_file, tmp_dir):
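    """Run pfam_scan.pl on each FASTA file in a separate worker process.

    Worker output lines are merged into a single file in tmp_dir, sorted by
    the first whitespace-delimited field, and written to output_file. Worker
    failures are logged but not propagated; the function always returns 0.
    """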
    def _consumer(worker_index, worker_fasta_file, output_queue):
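        """Run pfam_scan.pl on one FASTA file, stream result lines to
        output_queue, then put a None sentinel to mark this worker done."""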
        # process with pfam
        logging.debug('[Worker %d] Running pfam_scan.pl' % (worker_index))
        tmp_output_file = os.path.join(tmp_dir,
                                       'worker%d.pfam.txt' % (worker_index))
        args = [
            'pfam_scan.pl', '-cpu', '1', '-pfamB', '-fasta', worker_fasta_file,
            '-dir', pfam_dir, '-outfile', tmp_output_file
        ]
        retcode = subprocess.call(args)
        if retcode != 0:
            logging.error('[Worker %d] Error running pfam_scan.pl' %
                          (worker_index))
        # put results into output queue
        with open(tmp_output_file) as f:
            for line in f:
                if not line:
                    continue
                line = line.strip()
                if not line:
                    continue
                if line.startswith('#'):
                    continue
                output_queue.put(line)
        output_queue.put(None)
        logging.debug('[Worker %d] Finished' % (worker_index))

    # create multiprocessing queues for passing data
    output_queue = Queue(maxsize=len(fasta_files) * 4)
    # start consumer processes
    procs = []
    for i in xrange(len(fasta_files)):
        p = Process(target=_consumer, args=(i, fasta_files[i], output_queue))
        p.start()
        procs.append(p)
    # get results from consumers
    num_alive = len(procs)
    tmp_output_file = os.path.join(tmp_dir, 'pfam.merge.txt')
    with open(tmp_output_file, 'w') as f:
        while num_alive > 0:
            result = output_queue.get()
            if result is None:
                num_alive -= 1
                logging.debug(
                    "Main process detected worker finished, %d still alive" %
                    (num_alive))
            else:
                print >> f, result
    logging.debug("Joining all processes")
    # wait for consumers to finish
    for p in procs:
        p.join()
    # sort
    logging.debug('Sorting pfam results')

    def sort_pfam(line):
        return line.split(None, 1)[0]

    sort_tmp_dir = os.path.join(tmp_dir, 'sort_tmp')
    os.makedirs(sort_tmp_dir)
    batch_sort(input=tmp_output_file,
               output=output_file,
               key=sort_pfam,
               buffer_size=SORT_BUFFER_SIZE,
               tempdirs=[sort_tmp_dir])
    # cleanup temporary files
    shutil.rmtree(sort_tmp_dir)
    return 0