def create_chimerascan_index(output_dir, 
                             genome_fasta_file, 
                             gene_feature_file,
                             bowtie_build_bin):
    # create output dir if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        logging.info("Created index directory: %s" % (output_dir))
    # copy reference fasta file to output dir and index it
    index_fasta_file = os.path.join(output_dir, ALIGN_INDEX + ".fa")
    msg = "Adding reference genome to index"
    if (up_to_date(index_fasta_file, genome_fasta_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        shutil.copyfile(genome_fasta_file, index_fasta_file)
        # index the genome fasta file
        logging.info("Indexing FASTA file")
        fh = pysam.Fastafile(index_fasta_file)
        fh.close()
    # add gene sequences to index
    dst_gene_feature_file = os.path.join(output_dir, GENE_FEATURE_FILE)
    msg = "Building transcriptome sequences and gene features"
    if (up_to_date(index_fasta_file, gene_feature_file) and
        up_to_date(dst_gene_feature_file, gene_feature_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        # write sequences from gene feature file
        logging.info("Adding transcript sequences and gene features to index")
        fasta_fh = open(index_fasta_file, "a")
        gene_fh = open(dst_gene_feature_file, "w")
        for g, fa_record in genepred_to_fasta(gene_feature_file, index_fasta_file):
            print >>gene_fh, str(g)
            print >>fasta_fh, fa_record
        gene_fh.close()
        fasta_fh.close()
        # remove old fasta index
        if os.path.exists(index_fasta_file + ".fai"):
            os.remove(index_fasta_file + ".fai")
        # index the combined fasta file
        logging.info("Reindexing the FASTA file")
        fh = pysam.Fastafile(index_fasta_file)
        fh.close()
    # build bowtie index on the reference sequence file
    bowtie_index_file = os.path.join(output_dir, BOWTIE_INDEX_FILE)
    msg = "Building bowtie index"
    if up_to_date(bowtie_index_file, index_fasta_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bowtie_index_name = os.path.join(output_dir, ALIGN_INDEX)
        args = [bowtie_build_bin, index_fasta_file, bowtie_index_name]
        if subprocess.call(args) != os.EX_OK:
            logging.error("bowtie-build failed to create alignment index")
            if os.path.exists(bowtie_index_file):
                os.remove(bowtie_index_file)
            return JOB_ERROR
    logging.info("Chimerascan index created successfully")
    return JOB_SUCCESS
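
# All of these examples gate each pipeline step on an up_to_date() helper
# imported from chimerascan's support libraries. A minimal sketch of the
# make-style freshness check it performs (hypothetical reimplementation;
# argument names assumed):

import os

def up_to_date(outfile, infile):
    # an output is stale if it is missing or empty...
    if not os.path.exists(outfile) or os.path.getsize(outfile) == 0:
        return False
    # ...or older than the input it was derived from
    return os.path.getmtime(outfile) >= os.path.getmtime(infile)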
# Example 2
def run_chimerascan(runconfig):
    """
    main function for running the chimerascan pipeline
    """
    # print a welcome message
    title_string = "Running chimerascan version %s" % (__version__)
    logging.info(title_string)
    logging.info("-" * len(title_string))
    # validate run configuration
    config_passed = runconfig.check_config()
    if not config_passed:
        logging.error("Invalid run configuration, aborting.")
        return config.JOB_ERROR
    # create output dir if it does not exist
    if not os.path.exists(runconfig.output_dir):
        os.makedirs(runconfig.output_dir)
        logging.info("Created output directory: %s" % (runconfig.output_dir))
    # create log dir if it does not exist
    log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
        logging.debug("Created directory for log files: %s" % (log_dir))
    # create tmp dir if it does not exist
    tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
        logging.debug("Created directory for tmp files: %s" % (tmp_dir))
    # write the run config to a file
    xmlstring = runconfig.to_xml()
    runconfig_xml_file = os.path.join(runconfig.output_dir,
                                      config.RUNCONFIG_XML_FILE)
    logging.info("Writing run configuration to XML file: %s" %
                 (runconfig_xml_file))
    fh = open(runconfig_xml_file, "w")
    print >> fh, xmlstring
    fh.close()
    # mask biotypes and references
    mask_biotypes = set()
    if runconfig.mask_biotypes_file:
        logging.info("Reading biotypes mask file")
        mask_biotypes.update(
            [line.strip() for line in open(runconfig.mask_biotypes_file)])
        logging.info("\tread biotypes: %s" % (','.join(sorted(mask_biotypes))))
    mask_rnames = set()
    if runconfig.mask_rnames_file:
        logging.info("Reading references mask file")
        mask_rnames.update(
            [line.strip() for line in open(runconfig.mask_rnames_file)])
        logging.info("\tread references: %s" % (','.join(sorted(mask_rnames))))
    # read transcripts
    logging.info("Reading transcript features")
    transcript_file = os.path.join(runconfig.index_dir,
                                   config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    logging.info("\tread %d transcripts" % (len(transcripts)))
    # setup alignment indexes
    genome_index = os.path.join(runconfig.index_dir, config.GENOME_INDEX)
    transcriptome_index = os.path.join(runconfig.index_dir,
                                       config.TRANSCRIPTOME_INDEX)
    max_transcriptome_hits_file = os.path.join(runconfig.index_dir,
                                               config.MAX_MULTIMAPPING_FILE)
    max_transcriptome_hits = int(
        open(max_transcriptome_hits_file).next().strip())
    # detect read length
    original_read_length = detect_read_length(runconfig.fastq_files[0])
    # minimum fragment length cannot be smaller than the trimmed read length
    trimmed_read_length = (original_read_length - runconfig.trim5 -
                           runconfig.trim3)
    min_fragment_length = max(runconfig.min_fragment_length,
                              trimmed_read_length)
    #
    # Process and inspect the FASTQ files, performing several alterations
    # to the reads:
    #
    # 1) rename them from long string to numbers to save space throughout
    #    the pipeline. also store mapping from read numbers to full names
    #    in a separate file
    # 2) ensure the "/1" and "/2" suffixes exist to denote paired reads
    # 3) convert quality scores to sanger format
    #
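    # e.g. a read named "HWI-ST1001:137:C1A8GACXX:2:1101:1152:2097" becomes
    # simply "0" (with "/1"/"/2" mate suffixes), and the mapping back to the
    # original name is kept in read_name_file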
    converted_fastq_files = [
        os.path.join(tmp_dir, fq) for fq in config.CONVERTED_FASTQ_FILES
    ]
    read_name_file = os.path.join(tmp_dir, config.READ_NAME_TXT_FILE)
    msg = "Processing FASTQ files"
    skip = all(
        up_to_date(cfq, fq)
        for cfq, fq in zip(converted_fastq_files, runconfig.fastq_files))
    skip = skip and up_to_date(read_name_file, runconfig.fastq_files[0])
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        converted_fastq_prefix = \
            os.path.join(tmp_dir, config.CONVERTED_FASTQ_PREFIX)
        try:
            retcode = process_input_reads(runconfig.fastq_files,
                                          converted_fastq_prefix,
                                          quals=runconfig.quals,
                                          trim5=runconfig.trim5,
                                          trim3=runconfig.trim3)
            if retcode != config.JOB_SUCCESS:
                logging.error("%s step failed" % (msg))
                return config.JOB_ERROR
        except Exception as e:
            logging.error("Cleaning up after error: %s" % (str(e)))
            for fq in converted_fastq_files:
                if os.path.isfile(fq):
                    os.remove(fq)
            return config.JOB_ERROR
    #
    # Transcriptome alignment step
    #
    # Align to transcriptome in paired-end mode, trying to resolve as many
    # reads as possible.
    #
    transcriptome_bam_file = os.path.join(tmp_dir,
                                          config.TRANSCRIPTOME_BAM_FILE)
    transcriptome_unaligned_path = os.path.join(
        tmp_dir, config.TRANSCRIPTOME_UNALIGNED_PATH)
    transcriptome_unaligned_fastq_files = tuple(
        os.path.join(tmp_dir, fq)
        for fq in config.TRANSCRIPTOME_UNALIGNED_FASTQ_FILES)
    msg = "Aligning paired-end reads to transcriptome"
    if (all(
            up_to_date(transcriptome_bam_file, fq)
            for fq in converted_fastq_files) and all(
                up_to_date(a, b)
                for a, b in zip(transcriptome_unaligned_fastq_files,
                                converted_fastq_files))):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        log_file = os.path.join(log_dir, config.TRANSCRIPTOME_LOG_FILE)
        retcode = bowtie2_align_transcriptome_pe(
            transcriptome_index=transcriptome_index,
            genome_index=genome_index,
            transcript_file=transcript_file,
            fastq_files=converted_fastq_files,
            unaligned_path=transcriptome_unaligned_path,
            bam_file=transcriptome_bam_file,
            log_file=log_file,
            library_type=runconfig.library_type,
            min_fragment_length=min_fragment_length,
            max_fragment_length=runconfig.max_fragment_length,
            max_transcriptome_hits=max_transcriptome_hits,
            num_processors=runconfig.num_processors)
        # cleanup if job failed
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(transcriptome_bam_file):
                os.remove(transcriptome_bam_file)
            for f in transcriptome_unaligned_fastq_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Sort transcriptome reads by position
    #
    msg = "Sorting transcriptome reads"
    sorted_transcriptome_bam_file = os.path.join(
        runconfig.output_dir, config.SORTED_TRANSCRIPTOME_BAM_FILE)
    if (up_to_date(sorted_transcriptome_bam_file, transcriptome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        sorted_aligned_bam_prefix = os.path.splitext(
            sorted_transcriptome_bam_file)[0]
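        # pysam.sort here follows the legacy samtools sort signature:
        # "-m" caps sort memory (~1GB) and the output path is a prefix,
        # with ".bam" appended automatically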
        pysam.sort("-m", str(int(1e9)), transcriptome_bam_file,
                   sorted_aligned_bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing BAM file"
    sorted_transcriptome_bam_index_file = sorted_transcriptome_bam_file + ".bai"
    if (up_to_date(sorted_transcriptome_bam_index_file,
                   sorted_transcriptome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_transcriptome_bam_file)
    #
    # Get insert size distribution
    #
    isize_dist_file = os.path.join(runconfig.output_dir,
                                   config.ISIZE_DIST_FILE)
    msg = "Profiling insert size distribution"
    if up_to_date(isize_dist_file, transcriptome_bam_file):
        logging.info("[SKIPPED] %s" % msg)
        isize_dist = InsertSizeDistribution.from_file(
            open(isize_dist_file, "r"))
    else:
        logging.info(msg)
        bamfh = pysam.Samfile(sorted_transcriptome_bam_file, "rb")
        isize_dist = InsertSizeDistribution.from_genome_bam(
            bamfh,
            transcripts,
            min_isize=min_fragment_length,
            max_isize=runconfig.max_fragment_length,
            max_samples=config.ISIZE_MAX_SAMPLES)
        bamfh.close()
        # if not enough samples, use a normal distribution instead
        # of the empirical distribution
        if isize_dist.n < config.ISIZE_MIN_SAMPLES:
            logging.warning("Not enough fragments to sample insert size "
                            "distribution empirically.  Using mean=%d "
                            "stdev=%f instead" %
                            (runconfig.isize_mean, runconfig.isize_stdev))
            isize_dist = InsertSizeDistribution.from_random(
                runconfig.isize_mean,
                runconfig.isize_stdev,
                min_isize=runconfig.min_fragment_length,
                max_isize=runconfig.max_fragment_length,
                samples=config.ISIZE_MAX_SAMPLES)
        isize_dist.to_file(open(isize_dist_file, "w"))
    #
    # Determine ideal segment length automatically
    #
    # log insert size statistics
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" %
                 (isize_dist.n, isize_dist.mean(), isize_dist.std(),
                  isize_dist.isize_at_percentile(50.0), isize_dist.mode()))
    # choose a segment length to optimize mapping
    optimal_isize = isize_dist.isize_at_percentile(
        DEFAULT_FRAG_SIZE_SENSITIVITY)
    logging.info("Determining soft-clipped segment length")
    logging.debug("\tInsert size at %f percent of distribution is %d" %
                  (DEFAULT_FRAG_SIZE_SENSITIVITY, optimal_isize))
    optimal_segment_length = int(round(optimal_isize / 3.0))
    logging.debug("\tOptimal segment length is %d/3.0 = %d" %
                  (optimal_isize, optimal_segment_length))
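    # e.g. an insert size of 300 at the chosen percentile yields
    # int(round(300 / 3.0)) = 100bp segments for the realignment step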
    segment_length = min(optimal_segment_length, trimmed_read_length)
    segment_length = max(config.MIN_SEGMENT_LENGTH, segment_length)
    logging.debug(
        "\tAfter adjusting for min %d and read length %d, final segment length is %d"
        % (config.MIN_SEGMENT_LENGTH, trimmed_read_length, segment_length))
    if runconfig.segment_length is not None:
        logging.debug(
            "\tOverriding auto segment length and using segment length of %d" %
            (runconfig.segment_length))
        segment_length = runconfig.segment_length
    #
    # Genome alignment step
    #
    # Align any unaligned transcriptome reads to genome in paired-end mode.
    # Resolve as many reads as possible.
    #
    genome_bam_file = os.path.join(tmp_dir, config.GENOME_BAM_FILE)
    genome_unaligned_path = os.path.join(tmp_dir, config.GENOME_UNALIGNED_PATH)
    genome_unaligned_fastq_files = tuple(
        os.path.join(tmp_dir, fq)
        for fq in config.GENOME_UNALIGNED_FASTQ_FILES)
    msg = "Realigning unaligned paired-end reads to genome"
    if (all(up_to_date(genome_bam_file, fq) for fq in converted_fastq_files)
            and all(
                up_to_date(a, b) for a, b in zip(genome_unaligned_fastq_files,
                                                 converted_fastq_files))):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        log_file = os.path.join(log_dir, config.GENOME_LOG_FILE)
        retcode = bowtie2_align_pe(
            index=genome_index,
            fastq_files=transcriptome_unaligned_fastq_files,
            unaligned_path=genome_unaligned_path,
            bam_file=genome_bam_file,
            log_file=log_file,
            library_type=runconfig.library_type,
            min_fragment_length=min_fragment_length,
            max_fragment_length=runconfig.max_fragment_length,
            max_hits=max_transcriptome_hits,
            num_processors=runconfig.num_processors)
        # cleanup if job failed
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(genome_bam_file):
                os.remove(genome_bam_file)
            for f in genome_unaligned_fastq_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Realignment step
    #
    # trim and realign all the initially unaligned reads in order to
    # increase sensitivity to detect reads spanning fusion junctions
    #
    realigned_bam_file = os.path.join(tmp_dir, config.REALIGNED_BAM_FILE)
    realigned_log_file = os.path.join(log_dir, config.REALIGNED_LOG_FILE)
    msg = "Trimming and realigning initially unmapped reads"
    if (all(
            up_to_date(realigned_bam_file, fq)
            for fq in genome_unaligned_fastq_files)
            and up_to_date(realigned_bam_file, isize_dist_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = bowtie2_align_pe_sr(index=transcriptome_index,
                                      transcript_file=transcript_file,
                                      fastq_files=genome_unaligned_fastq_files,
                                      bam_file=realigned_bam_file,
                                      log_file=realigned_log_file,
                                      tmp_dir=tmp_dir,
                                      segment_length=segment_length,
                                      max_hits=max_transcriptome_hits,
                                      num_processors=runconfig.num_processors)
        if retcode != config.JOB_SUCCESS:
            if os.path.exists(realigned_bam_file):
                os.remove(realigned_bam_file)
            return config.JOB_ERROR
    #
    # Find discordant reads
    #
    # iterate through realigned reads and divide them into groups of
    # concordant, discordant within a gene (isoforms), discordant
    # between different genes, and discordant in the genome
    #
    paired_bam_file = os.path.join(tmp_dir, config.PAIRED_BAM_FILE)
    discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE)
    unpaired_bam_file = os.path.join(tmp_dir, config.UNPAIRED_BAM_FILE)
    unmapped_bam_file = os.path.join(tmp_dir, config.UNMAPPED_BAM_FILE)
    multimap_bam_file = os.path.join(tmp_dir, config.MULTIMAP_BAM_FILE)
    unresolved_bam_file = os.path.join(tmp_dir, config.UNRESOLVED_BAM_FILE)
    output_files = (paired_bam_file, discordant_bam_file, unpaired_bam_file,
                    unmapped_bam_file, multimap_bam_file, unresolved_bam_file)
    msg = "Classifying concordant and discordant read pairs"
    if (all(up_to_date(f, realigned_bam_file) for f in output_files)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = find_discordant_fragments(
            transcripts=transcripts,
            input_bam_file=realigned_bam_file,
            paired_bam_file=paired_bam_file,
            discordant_bam_file=discordant_bam_file,
            unpaired_bam_file=unpaired_bam_file,
            unmapped_bam_file=unmapped_bam_file,
            multimap_bam_file=multimap_bam_file,
            unresolved_bam_file=unresolved_bam_file,
            max_isize=runconfig.max_fragment_length,
            max_multihits=runconfig.max_multihits,
            library_type=runconfig.library_type)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Convert discordant transcriptome reads to genome coordinates
    #
    discordant_genome_bam_file = os.path.join(
        tmp_dir, config.DISCORDANT_GENOME_BAM_FILE)
    msg = "Converting discordant transcriptome hits to genomic coordinates"
    if (up_to_date(discordant_genome_bam_file, discordant_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        discordant_genome_sam_file = os.path.join(
            tmp_dir, config.DISCORDANT_GENOME_SAM_FILE)
        retcode = transcriptome_to_genome(
            genome_index,
            transcripts,
            input_file=discordant_bam_file,
            output_file=discordant_genome_sam_file,
            library_type=runconfig.library_type,
            input_sam=False,
            output_sam=True)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(discordant_genome_sam_file):
                os.remove(discordant_genome_sam_file)
            return config.JOB_ERROR
        retcode = sam_to_bam(discordant_genome_sam_file,
                             discordant_genome_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(discordant_genome_bam_file):
                os.remove(discordant_genome_bam_file)
            return config.JOB_ERROR
        if os.path.exists(discordant_genome_sam_file):
            os.remove(discordant_genome_sam_file)
    #
    # Sort discordant reads by position
    #
    msg = "Sorting discordant BAM file"
    sorted_discordant_genome_bam_file = os.path.join(
        tmp_dir, config.SORTED_DISCORDANT_GENOME_BAM_FILE)
    if (up_to_date(sorted_discordant_genome_bam_file,
                   discordant_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_discordant_genome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), discordant_genome_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing discordant BAM file"
    sorted_discordant_bam_index_file = sorted_discordant_genome_bam_file + ".bai"
    if (up_to_date(sorted_discordant_bam_index_file,
                   sorted_discordant_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_discordant_genome_bam_file)
    #
    # Convert unpaired transcriptome reads to genome coordinates
    #
    unpaired_genome_bam_file = os.path.join(tmp_dir,
                                            config.UNPAIRED_GENOME_BAM_FILE)
    msg = "Converting unpaired transcriptome hits to genomic coordinates"
    if (up_to_date(unpaired_genome_bam_file, unpaired_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        unpaired_genome_sam_file = os.path.join(
            tmp_dir, config.UNPAIRED_GENOME_SAM_FILE)
        retcode = transcriptome_to_genome(genome_index,
                                          transcripts,
                                          input_file=unpaired_bam_file,
                                          output_file=unpaired_genome_sam_file,
                                          library_type=runconfig.library_type,
                                          input_sam=False,
                                          output_sam=True)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unpaired_genome_sam_file):
                os.remove(unpaired_genome_sam_file)
            return config.JOB_ERROR
        retcode = sam_to_bam(unpaired_genome_sam_file,
                             unpaired_genome_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unpaired_genome_bam_file):
                os.remove(unpaired_genome_bam_file)
            return config.JOB_ERROR
        if os.path.exists(unpaired_genome_sam_file):
            os.remove(unpaired_genome_sam_file)
    #
    # Sort unpaired reads by position
    #
    msg = "Sorting unpaired BAM file"
    sorted_unpaired_genome_bam_file = os.path.join(
        tmp_dir, config.SORTED_UNPAIRED_GENOME_BAM_FILE)
    if (up_to_date(sorted_unpaired_genome_bam_file, unpaired_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_unpaired_genome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), unpaired_genome_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing unpaired BAM file"
    sorted_unpaired_bam_index_file = sorted_unpaired_genome_bam_file + ".bai"
    if (up_to_date(sorted_unpaired_bam_index_file,
                   sorted_unpaired_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_unpaired_genome_bam_file)
    #
    # Cluster discordant reads into chimera candidates
    #
    cluster_file = os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_FILE)
    cluster_shelve_file = \
        os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_SHELVE_FILE)
    sorted_discordant_genome_cluster_bam_file = \
        os.path.join(runconfig.output_dir,
                     config.SORTED_DISCORDANT_GENOME_CLUSTER_BAM_FILE)
    input_files = (sorted_discordant_genome_bam_file,
                   sorted_unpaired_genome_bam_file)
    output_files = (cluster_file, cluster_shelve_file,
                    sorted_discordant_genome_cluster_bam_file)
    msg = "Clustering discordant reads"
    skip = True
    for input_file in input_files:
        for output_file in output_files:
            skip = skip and up_to_date(output_file, input_file)
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = cluster_discordant_reads(
            discordant_bam_file=sorted_discordant_genome_bam_file,
            unpaired_bam_file=sorted_unpaired_genome_bam_file,
            concordant_bam_file=sorted_transcriptome_bam_file,
            output_bam_file=sorted_discordant_genome_cluster_bam_file,
            cluster_file=cluster_file,
            cluster_shelve_file=cluster_shelve_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Pair discordant clusters
    #
    cluster_pair_file = \
        os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_PAIR_FILE)
    msg = "Pairing discordant clusters"
    output_files = (cluster_pair_file, )
    if up_to_date(cluster_pair_file,
                  sorted_discordant_genome_cluster_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = pair_discordant_clusters(
            discordant_bam_file=sorted_discordant_genome_cluster_bam_file,
            cluster_pair_file=cluster_pair_file,
            tmp_dir=tmp_dir)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Perform realignment across putative fusion breakpoints
    #
    breakpoint_bam_file = os.path.join(tmp_dir, config.BREAKPOINT_BAM_FILE)
    msg = "Realigning to find breakpoint-spanning reads"
    input_files = (sorted_discordant_genome_bam_file,
                   sorted_unpaired_genome_bam_file, cluster_shelve_file,
                   cluster_pair_file)
    output_files = (breakpoint_bam_file, )
    skip = True
    for inp in input_files:
        for outp in output_files:
            if not up_to_date(outp, inp):
                skip = False
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = realign_across_breakpoints(
            index_dir=runconfig.index_dir,
            discordant_bam_file=sorted_discordant_genome_bam_file,
            unpaired_bam_file=sorted_unpaired_genome_bam_file,
            cluster_shelve_file=cluster_shelve_file,
            cluster_pair_file=cluster_pair_file,
            breakpoint_bam_file=breakpoint_bam_file,
            log_dir=log_dir,
            tmp_dir=tmp_dir,
            num_processors=runconfig.num_processors,
            local_anchor_length=runconfig.local_anchor_length,
            local_multihits=runconfig.local_multihits)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Nominate breakpoint spanning reads (split reads)
    #
    spanning_sam_file = os.path.join(tmp_dir, config.SPANNING_SAM_FILE)
    spanning_bam_file = os.path.join(tmp_dir, config.SPANNING_BAM_FILE)
    spanning_cluster_pair_file = os.path.join(
        tmp_dir, config.SPANNING_CLUSTER_PAIR_FILE)
    msg = "Processing breakpoint-spanning alignments"
    input_files = (breakpoint_bam_file, cluster_shelve_file, cluster_pair_file)
    output_files = (spanning_bam_file, spanning_cluster_pair_file)
    skip = True
    for inp in input_files:
        for outp in output_files:
            if not up_to_date(outp, inp):
                skip = False
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = process_spanning_alignments(
            cluster_shelve_file=cluster_shelve_file,
            cluster_pair_file=cluster_pair_file,
            bam_file=breakpoint_bam_file,
            output_sam_file=spanning_sam_file,
            output_cluster_pair_file=spanning_cluster_pair_file,
            local_anchor_length=runconfig.local_anchor_length)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
        retcode = sam_to_bam(spanning_sam_file, spanning_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(spanning_bam_file):
                os.remove(spanning_bam_file)
            return config.JOB_ERROR
        if os.path.exists(spanning_sam_file):
            os.remove(spanning_sam_file)
    #
    # Sort unpaired reads by position
    #
    msg = "Sorting spanning BAM file"
    sorted_spanning_bam_file = os.path.join(runconfig.output_dir,
                                            config.SORTED_SPANNING_BAM_FILE)
    if (up_to_date(sorted_spanning_bam_file, spanning_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_spanning_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), spanning_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing spanning BAM file"
    sorted_spanning_bam_index_file = sorted_spanning_bam_file + ".bai"
    if (up_to_date(sorted_spanning_bam_index_file, sorted_spanning_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_spanning_bam_file)
    #
    # Write chimera file
    #
    unfiltered_chimera_bedpe_file = os.path.join(
        runconfig.output_dir, config.UNFILTERED_CHIMERA_BEDPE_FILE)
    msg = "Writing unfiltered chimeras to file %s" % (
        unfiltered_chimera_bedpe_file)
    if (up_to_date(unfiltered_chimera_bedpe_file, spanning_cluster_pair_file)
            and up_to_date(unfiltered_chimera_bedpe_file,
                           cluster_shelve_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = write_output(transcripts,
                               cluster_shelve_file=cluster_shelve_file,
                               cluster_pair_file=spanning_cluster_pair_file,
                               read_name_file=read_name_file,
                               output_file=unfiltered_chimera_bedpe_file,
                               annotation_source="ensembl")
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unfiltered_chimera_bedpe_file):
                os.remove(unfiltered_chimera_bedpe_file)
            return config.JOB_ERROR
    #
    # Filter chimeras
    #
    chimera_bedpe_file = os.path.join(runconfig.output_dir,
                                      config.CHIMERA_BEDPE_FILE)
    msg = "Filtering chimeras"
    if (up_to_date(chimera_bedpe_file, unfiltered_chimera_bedpe_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = filter_chimeras(
            input_file=unfiltered_chimera_bedpe_file,
            output_file=chimera_bedpe_file,
            filter_num_frags=runconfig.filter_num_frags,
            filter_allele_fraction=runconfig.filter_allele_fraction,
            mask_biotypes=mask_biotypes,
            mask_rnames=mask_rnames)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(chimera_bedpe_file):
                os.remove(chimera_bedpe_file)
            return config.JOB_ERROR
    #
    # Cleanup
    #
    if not runconfig.keep_tmp:
        logging.info("Cleaning up temporary files")
        shutil.rmtree(tmp_dir)
    #
    # Done
    #
    logging.info("Finished run.")
    return config.JOB_SUCCESS
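
# The sam_to_bam() calls above are assumed to wrap a simple SAM-to-BAM
# conversion using pysam's legacy Samfile API. A minimal sketch
# (hypothetical; the real helper may shell out to samtools view instead):

import pysam

def sam_to_bam(sam_file, bam_file):
    # re-encode every alignment record from SAM to BAM, reusing the header
    infh = pysam.Samfile(sam_file, "r")
    outfh = pysam.Samfile(bam_file, "wb", template=infh)
    for r in infh:
        outfh.write(r)
    outfh.close()
    infh.close()
    return 0  # config.JOB_SUCCESS in the pipeline's retcode convention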
# Example 3
def create_chimerascan_index(output_dir, genome_fasta_file,
                             transcript_feature_file):
    # create output dir if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        logging.info("Created index directory: %s" % (output_dir))
    # copy reference fasta file to output dir and index it
    dst_genome_fasta_file = os.path.join(output_dir, config.GENOME_FASTA_FILE)
    msg = "Adding reference genome"
    if (up_to_date(dst_genome_fasta_file, genome_fasta_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        shutil.copyfile(genome_fasta_file, dst_genome_fasta_file)
        # index the genome fasta file
        logging.info("Indexing FASTA file")
        fh = pysam.Fastafile(dst_genome_fasta_file)
        fh.close()
    # add gene sequences to index
    dst_transcript_feature_file = os.path.join(output_dir,
                                               config.TRANSCRIPT_FEATURE_FILE)
    transcript_fasta_file = os.path.join(output_dir,
                                         config.TRANSCRIPTOME_FASTA_FILE)
    multimapping_file = os.path.join(output_dir, config.MAX_MULTIMAPPING_FILE)
    msg = "Building transcriptome sequences and gene features"
    if (up_to_date(dst_transcript_feature_file, transcript_feature_file)
            and up_to_date(transcript_fasta_file, dst_transcript_feature_file)
            and up_to_date(multimapping_file, transcript_feature_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        # write sequences from gene feature file
        logging.info("Adding transcript sequences")
        fasta_fh = open(transcript_fasta_file, "w")
        tx_fh = open(dst_transcript_feature_file, "w")
        chrom_transcript_dict = collections.defaultdict(list)
        for t, fa_record in transcript_features_to_fasta(
                transcript_feature_file, dst_genome_fasta_file):
            print >> tx_fh, str(t)
            print >> fasta_fh, fa_record
            chrom_transcript_dict[t.chrom].append(t)
        tx_fh.close()
        fasta_fh.close()
        # find maximum transcript overlap as this informs alignment
        # parameters controlling multi-mapping read handling
        max_overlap = 0
        for chrom, transcripts in chrom_transcript_dict.iteritems():
            overlap = find_maximum_feature_overlap(transcripts)
            max_overlap = max(max_overlap, overlap)
        logging.info("Maximum transcript overlap is %d" % (max_overlap))
        fh = open(multimapping_file, "w")
        print >> fh, max_overlap
        fh.close()
        # index the transcript fasta file
        logging.info("Indexing the Transcriptome FASTA file")
        fh = pysam.Fastafile(transcript_fasta_file)
        fh.close()
    #
    # Build Transcriptome alignment index
    #
    skip = True
    # materialize the paths as a list so the cleanup loop below can reuse it
    index_files = [os.path.join(output_dir, f)
                   for f in config.TRANSCRIPTOME_BOWTIE2_FILES]
    for f in index_files:
        skip = skip and up_to_date(f, transcript_fasta_file)
    msg = "Building transcriptome index"
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bowtie_index_name = os.path.join(output_dir,
                                         config.TRANSCRIPTOME_INDEX)
        args = [
            config.BOWTIE2_BUILD_BIN, transcript_fasta_file, bowtie_index_name
        ]
        if subprocess.call(args) != os.EX_OK:
            logging.error("Failed to create alignment index")
            for f in index_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Build Genome alignment index
    #
    skip = True
    # materialize as a list so the cleanup loop below can iterate it again
    index_files = [os.path.join(output_dir, f)
                   for f in config.GENOME_BOWTIE2_FILES]
    for f in index_files:
        skip = skip and up_to_date(f, dst_genome_fasta_file)
    msg = "Building genome index"
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bowtie_index_name = os.path.join(output_dir, config.GENOME_INDEX)
        args = [
            config.BOWTIE2_BUILD_BIN, dst_genome_fasta_file, bowtie_index_name
        ]
        if subprocess.call(args) != os.EX_OK:
            logging.error("Failed to create alignment index")
            for f in index_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    logging.info("Chimerascan index created successfully")
    return config.JOB_SUCCESS
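
# find_maximum_feature_overlap() above is assumed to report the deepest
# stack of overlapping transcript intervals on a single chromosome, which
# bounds how many places one read can legitimately multi-map. A sketch
# using an event sweep (hypothetical; tx_start/tx_end attribute names
# assumed):

def find_maximum_feature_overlap(features):
    events = []
    for f in features:
        events.append((f.tx_start, 1))   # interval opens
        events.append((f.tx_end, -1))    # interval closes
    events.sort()
    depth = 0
    max_depth = 0
    for pos, delta in events:
        depth += delta
        max_depth = max(max_depth, depth)
    return max_depth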
# Example 4
def create_chimerascan_index(output_dir, 
                             genome_fasta_file, 
                             gene_feature_file,
                             bowtie_build_bin):
#                             min_fragment_size,
#                             max_fragment_size):
    # create output dir if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        logging.info("Created index directory: %s" % (output_dir))
    # copy reference fasta file to output dir
    index_fasta_file = os.path.join(output_dir, ALIGN_INDEX + ".fa")
    if (up_to_date(index_fasta_file, genome_fasta_file) and
        up_to_date(index_fasta_file, gene_feature_file)):
        logging.info("[SKIPPED] Adding reference genome to index")
    else:
        logging.info("Adding reference genome to index")
        shutil.copyfile(genome_fasta_file, index_fasta_file)
        # index the genome fasta file
        logging.info("Indexing FASTA file")
        fh = pysam.Fastafile(index_fasta_file)
        fh.close()
        # append sequences from gene feature file
        logging.info("Adding transcript sequences to index...")
        fh = open(index_fasta_file, "a")
        for fa_record in bed12_to_fasta(gene_feature_file, 
                                        index_fasta_file):
            print >>fh, fa_record
        fh.close()
        # remove old fasta index
        os.remove(index_fasta_file + ".fai")
        # re-index the combined fasta file
        logging.info("Re-indexing FASTA file...")
        fh = pysam.Fastafile(index_fasta_file)
        fh.close()
    # build bowtie index on the reference sequence file
    bowtie_index_file = os.path.join(output_dir, BOWTIE_INDEX_FILE)
    msg = "Building bowtie index"
    if up_to_date(bowtie_index_file, index_fasta_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bowtie_index_name = os.path.join(output_dir, ALIGN_INDEX)
        args = [bowtie_build_bin, index_fasta_file, bowtie_index_name]
        if subprocess.call(args) != os.EX_OK:
            logging.error("bowtie-build failed to create alignment index")
            if os.path.exists(bowtie_index_file):
                os.remove(bowtie_index_file)
            return JOB_ERROR
    # copy gene bed file to index directory
    dst_gene_feature_file = os.path.join(output_dir, GENE_FEATURE_FILE)
    if up_to_date(dst_gene_feature_file, gene_feature_file):
        logging.info("[SKIPPED] Adding transcript features to index...")
    else:
        logging.info("Adding transcript features to index...")
        shutil.copyfile(gene_feature_file, dst_gene_feature_file)
    # create tophat junctions file from gene features
#    juncs_file = os.path.join(output_dir, TOPHAT_JUNCS_FILE)
#    if up_to_date(juncs_file, dst_gene_feature_file):
#        logging.info("[SKIPPED] Creating splice junction file...")
#    else:
#        logging.info("Creating splice junction file...")
#        fh = open(juncs_file, "w")
#        for junc_line in create_tophat_juncs_file(output_dir, gene_feature_file):
#            print >>fh, junc_line
#        fh.close()
    # build special index used to discover the fragment size
#    frag_size_index_file = os.path.join(output_dir, FRAG_SIZE_INDEX_FILE)
#    if up_to_date(frag_size_index_file, index_fasta_file):
#        logging.info("[SKIPPED] Building fragment size distribution index")
#    else:
#        logging.info("Building fragment size distribution index")
#        retcode = create_fragment_size_index(output_dir, gene_feature_file, 
#                                             genome_fasta_file, 
#                                             bowtie_build_bin, 
#                                             max_fragment_size)
#        if retcode != os.EX_OK:
#            logging.error("bowtie-build failed to create fragment size "
#                          "distribution index")
#            if os.path.exists(frag_size_index_file):
#                os.remove(frag_size_index_file)
#            return JOB_ERROR 
    logging.info("chimerascan index created successfully")
    return JOB_SUCCESS
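
# detect_read_length() used by run_chimerascan() is assumed to peek at the
# first record of a FASTQ file. A minimal sketch (hypothetical;
# uncompressed input assumed):

def detect_read_length(fastq_file):
    fh = open(fastq_file)
    fh.readline()                # "@" header line
    seq = fh.readline().strip()  # sequence line
    fh.close()
    return len(seq)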
# Example 5
def run_chimerascan(runconfig):
    """
    main function for running the chimerascan pipeline
    """
    # print a welcome message
    title_string = "Running chimerascan version %s" % (__version__)
    logging.info(title_string)
    logging.info("-" * len(title_string))
    # validate run configuration
    config_passed = runconfig.check_config()
    if not config_passed:
        logging.error("Invalid run configuration, aborting.")
        return config.JOB_ERROR
    # create output dir if it does not exist
    if not os.path.exists(runconfig.output_dir):
        os.makedirs(runconfig.output_dir)
        logging.info("Created output directory: %s" % (runconfig.output_dir))
    # create log dir if it does not exist
    log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
        logging.debug("Created directory for log files: %s" % (log_dir))        
    # create tmp dir if it does not exist
    tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
        logging.debug("Created directory for tmp files: %s" % (tmp_dir))
    # write the run config to a file
    xmlstring = runconfig.to_xml()
    runconfig_xml_file = os.path.join(runconfig.output_dir, config.RUNCONFIG_XML_FILE)
    logging.info("Writing run configuration to XML file: %s" % (runconfig_xml_file))
    fh = open(runconfig_xml_file, "w")
    print >>fh, xmlstring
    fh.close()
    # mask biotypes and references
    mask_biotypes = set()
    if runconfig.mask_biotypes_file:
        logging.info("Reading biotypes mask file")
        mask_biotypes.update([line.strip() for line in open(runconfig.mask_biotypes_file)])
        logging.info("\tread biotypes: %s" % (','.join(sorted(mask_biotypes))))
    mask_rnames = set()
    if runconfig.mask_rnames_file:
        logging.info("Reading references mask file")
        mask_rnames.update([line.strip() for line in open(runconfig.mask_rnames_file)])
        logging.info("\tread references: %s" % (','.join(sorted(mask_rnames))))
    # read transcripts
    logging.info("Reading transcript features")
    transcript_file = os.path.join(runconfig.index_dir, config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    logging.info("\tread %d transcripts" % (len(transcripts)))
    # setup alignment indexes
    genome_index = os.path.join(runconfig.index_dir, config.GENOME_INDEX)
    transcriptome_index = os.path.join(runconfig.index_dir, config.TRANSCRIPTOME_INDEX)
    max_transcriptome_hits_file = os.path.join(runconfig.index_dir, 
                                               config.MAX_MULTIMAPPING_FILE)
    max_transcriptome_hits = int(open(max_transcriptome_hits_file).next().strip())
    # detect read length
    original_read_length = detect_read_length(runconfig.fastq_files[0])
    # minimum fragment length cannot be smaller than the trimmed read length
    trimmed_read_length = (original_read_length - runconfig.trim5 - runconfig.trim3)
    min_fragment_length = max(runconfig.min_fragment_length, trimmed_read_length)
    # 
    # Process and inspect the FASTQ files, performing several alterations 
    # to the reads:
    #
    # 1) rename them from long string to numbers to save space throughout
    #    the pipeline. also store mapping from read numbers to full names 
    #    in a separate file
    # 2) ensure the "/1" and "/2" suffixes exist to denote paired reads
    # 3) convert quality scores to sanger format
    # 
    converted_fastq_files = [os.path.join(tmp_dir, fq) 
                             for fq in config.CONVERTED_FASTQ_FILES]
    read_name_file = os.path.join(tmp_dir, config.READ_NAME_TXT_FILE)
    msg = "Processing FASTQ files"
    skip = all(up_to_date(cfq, fq) for cfq,fq in 
               zip(converted_fastq_files, runconfig.fastq_files))
    skip = skip and up_to_date(read_name_file, runconfig.fastq_files[0])
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        converted_fastq_prefix = \
            os.path.join(tmp_dir, config.CONVERTED_FASTQ_PREFIX)
        try:
            retcode = process_input_reads(runconfig.fastq_files, 
                                          converted_fastq_prefix,
                                          quals=runconfig.quals,
                                          trim5=runconfig.trim5,
                                          trim3=runconfig.trim3)
            if retcode != config.JOB_SUCCESS:
                logging.error("%s step failed" % (msg))
                return config.JOB_ERROR
        except Exception as e:
            logging.error("Cleaning up after error: %s" % (str(e)))
            for fq in converted_fastq_files:
                if os.path.isfile(fq):
                    os.remove(fq)
            return config.JOB_ERROR
    #
    # Transcriptome alignment step
    #
    # Align to transcriptome in paired-end mode, trying to resolve as many 
    # reads as possible.
    #
    transcriptome_bam_file = os.path.join(tmp_dir, config.TRANSCRIPTOME_BAM_FILE)
    transcriptome_unaligned_path = os.path.join(tmp_dir, config.TRANSCRIPTOME_UNALIGNED_PATH)
    transcriptome_unaligned_fastq_files = tuple(os.path.join(tmp_dir, fq) for fq in config.TRANSCRIPTOME_UNALIGNED_FASTQ_FILES)
    msg = "Aligning paired-end reads to transcriptome"
    if (all(up_to_date(transcriptome_bam_file, fq) for fq in converted_fastq_files) and 
        all(up_to_date(a,b) for a,b in zip(transcriptome_unaligned_fastq_files, converted_fastq_files))):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        log_file = os.path.join(log_dir, config.TRANSCRIPTOME_LOG_FILE)
        retcode = bowtie2_align_transcriptome_pe(transcriptome_index=transcriptome_index,
                                                 genome_index=genome_index,
                                                 transcript_file=transcript_file,     
                                                 fastq_files=converted_fastq_files,
                                                 unaligned_path=transcriptome_unaligned_path,
                                                 bam_file=transcriptome_bam_file,
                                                 log_file=log_file,
                                                 library_type=runconfig.library_type,
                                                 min_fragment_length=min_fragment_length,
                                                 max_fragment_length=runconfig.max_fragment_length,
                                                 max_transcriptome_hits=max_transcriptome_hits,
                                                 num_processors=runconfig.num_processors)
        # cleanup if job failed
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(transcriptome_bam_file):
                os.remove(transcriptome_bam_file)
            for f in transcriptome_unaligned_fastq_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Sort transcriptome reads by position
    #
    msg = "Sorting transcriptome reads"
    sorted_transcriptome_bam_file = os.path.join(runconfig.output_dir, 
                                                 config.SORTED_TRANSCRIPTOME_BAM_FILE)
    if (up_to_date(sorted_transcriptome_bam_file, transcriptome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        sorted_aligned_bam_prefix = os.path.splitext(sorted_transcriptome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), transcriptome_bam_file, sorted_aligned_bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing BAM file"
    sorted_transcriptome_bam_index_file = sorted_transcriptome_bam_file + ".bai"
    if (up_to_date(sorted_transcriptome_bam_index_file, sorted_transcriptome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_transcriptome_bam_file)
    #
    # Get insert size distribution
    #
    isize_dist_file = os.path.join(runconfig.output_dir, 
                                   config.ISIZE_DIST_FILE)
    msg = "Profiling insert size distribution"
    if up_to_date(isize_dist_file, transcriptome_bam_file):
        logging.info("[SKIPPED] %s" % msg)
        isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file, "r"))
    else:
        logging.info(msg)
        bamfh = pysam.Samfile(sorted_transcriptome_bam_file, "rb")
        isize_dist = InsertSizeDistribution.from_genome_bam(bamfh, transcripts, 
                                                            min_isize=min_fragment_length, 
                                                            max_isize=runconfig.max_fragment_length, 
                                                            max_samples=config.ISIZE_MAX_SAMPLES)
        bamfh.close()
        # if not enough samples, use a normal distribution instead
        # of the empirical distribution
        if isize_dist.n < config.ISIZE_MIN_SAMPLES:
            logging.warning("Not enough fragments to sample insert size "
                            "distribution empirically.  Using mean=%d "
                            "stdev=%f instead" % 
                            (runconfig.isize_mean, 
                             runconfig.isize_stdev))
            isize_dist = InsertSizeDistribution.from_random(runconfig.isize_mean, 
                                                            runconfig.isize_stdev, 
                                                            min_isize=runconfig.min_fragment_length,
                                                            max_isize=runconfig.max_fragment_length,
                                                            samples=config.ISIZE_MAX_SAMPLES)
        isize_dist.to_file(open(isize_dist_file, "w"))
    #
    # Determine ideal segment length automatically
    #
    # log insert size statistics
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" % 
                 (isize_dist.n, isize_dist.mean(), isize_dist.std(), 
                  isize_dist.isize_at_percentile(50.0), isize_dist.mode()))    
    # choose a segment length to optimize mapping
    optimal_isize = isize_dist.isize_at_percentile(DEFAULT_FRAG_SIZE_SENSITIVITY)
    logging.info("Determining soft-clipped segment length")
    logging.debug("\tInsert size at %f percent of distribution is %d" % 
                 (DEFAULT_FRAG_SIZE_SENSITIVITY, optimal_isize))
    optimal_segment_length = int(round(optimal_isize / 3.0))
    logging.debug("\tOptimal segment length is %d/3.0 = %d" % (optimal_isize, optimal_segment_length))
    segment_length = min(optimal_segment_length, trimmed_read_length)
    segment_length = max(config.MIN_SEGMENT_LENGTH, segment_length)
    logging.debug("\tAfter adjusting for min %d and read length %d, final segment length is %d" % 
                 (config.MIN_SEGMENT_LENGTH, trimmed_read_length, segment_length))
    if runconfig.segment_length is not None:
        logging.debug("\tOverriding auto segment length and using segment length of %d" % (runconfig.segment_length))
        segment_length = runconfig.segment_length
    #
    # Genome alignment step
    #
    # Align any unaligned transcriptome reads to genome in paired-end mode.
    # Resolve as many reads as possible.
    #
    genome_bam_file = os.path.join(tmp_dir, config.GENOME_BAM_FILE)
    genome_unaligned_path = os.path.join(tmp_dir, config.GENOME_UNALIGNED_PATH)
    genome_unaligned_fastq_files = tuple(os.path.join(tmp_dir, fq) for fq in config.GENOME_UNALIGNED_FASTQ_FILES)
    msg = "Realigning unaligned paired-end reads to genome"
    if (all(up_to_date(genome_bam_file, fq) for fq in converted_fastq_files) and 
        all(up_to_date(a,b) for a,b in zip(genome_unaligned_fastq_files, converted_fastq_files))):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        log_file = os.path.join(log_dir, config.GENOME_LOG_FILE)
        retcode = bowtie2_align_pe(index=genome_index,
                                   fastq_files=transcriptome_unaligned_fastq_files,
                                   unaligned_path=genome_unaligned_path,
                                   bam_file=genome_bam_file,
                                   log_file=log_file,
                                   library_type=runconfig.library_type,
                                   min_fragment_length=min_fragment_length,
                                   max_fragment_length=runconfig.max_fragment_length,
                                   max_hits=max_transcriptome_hits,
                                   num_processors=runconfig.num_processors)
        # cleanup if job failed
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(genome_bam_file):
                os.remove(genome_bam_file)
            for f in genome_unaligned_fastq_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Realignment step
    #
    # trim and realign all the initially unaligned reads in order to
    # increase sensitivity to detect reads spanning fusion junctions
    #
    realigned_bam_file = os.path.join(tmp_dir, config.REALIGNED_BAM_FILE)
    realigned_log_file = os.path.join(log_dir, config.REALIGNED_LOG_FILE)
    msg = "Trimming and realigning initially unmapped reads"
    if (all(up_to_date(realigned_bam_file, fq) for fq in genome_unaligned_fastq_files) and
        up_to_date(realigned_bam_file, isize_dist_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = bowtie2_align_pe_sr(index=transcriptome_index,
                                      transcript_file=transcript_file,
                                      fastq_files=genome_unaligned_fastq_files,
                                      bam_file=realigned_bam_file,
                                      log_file=realigned_log_file,
                                      tmp_dir=tmp_dir,
                                      segment_length=segment_length,
                                      max_hits=max_transcriptome_hits,
                                      num_processors=runconfig.num_processors)
        if retcode != config.JOB_SUCCESS:
            if os.path.exists(realigned_bam_file):
                os.remove(realigned_bam_file)
            return config.JOB_ERROR
    #
    # Find discordant reads
    #
    # iterate through realigned reads and divide them into groups of
    # concordant, discordant within a gene (isoforms), discordant
    # between different genes, and discordant in the genome
    #
    paired_bam_file = os.path.join(tmp_dir, config.PAIRED_BAM_FILE)
    discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE)
    unpaired_bam_file = os.path.join(tmp_dir, config.UNPAIRED_BAM_FILE)
    unmapped_bam_file = os.path.join(tmp_dir, config.UNMAPPED_BAM_FILE)
    multimap_bam_file = os.path.join(tmp_dir, config.MULTIMAP_BAM_FILE)
    unresolved_bam_file = os.path.join(tmp_dir, config.UNRESOLVED_BAM_FILE)
    output_files = (paired_bam_file, discordant_bam_file, unpaired_bam_file,
                    unmapped_bam_file, multimap_bam_file, unresolved_bam_file)
    msg = "Classifying concordant and discordant read pairs"
    if (all(up_to_date(f, realigned_bam_file) for f in output_files)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = find_discordant_fragments(transcripts=transcripts,
                                            input_bam_file=realigned_bam_file,
                                            paired_bam_file=paired_bam_file,
                                            discordant_bam_file=discordant_bam_file,
                                            unpaired_bam_file=unpaired_bam_file,
                                            unmapped_bam_file=unmapped_bam_file,
                                            multimap_bam_file=multimap_bam_file,
                                            unresolved_bam_file=unresolved_bam_file,
                                            max_isize=runconfig.max_fragment_length,
                                            max_multihits=runconfig.max_multihits,
                                            library_type=runconfig.library_type)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Convert discordant transcriptome reads to genome coordinates
    #
    discordant_genome_bam_file = os.path.join(tmp_dir, config.DISCORDANT_GENOME_BAM_FILE)
    msg = "Converting discordant transcriptome hits to genomic coordinates"
    if (up_to_date(discordant_genome_bam_file, discordant_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)        
        discordant_genome_sam_file = os.path.join(tmp_dir, config.DISCORDANT_GENOME_SAM_FILE)
        retcode = transcriptome_to_genome(genome_index, transcripts, 
                                          input_file=discordant_bam_file, 
                                          output_file=discordant_genome_sam_file,
                                          library_type=runconfig.library_type,
                                          input_sam=False,
                                          output_sam=True)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(discordant_genome_sam_file):
                os.remove(discordant_genome_sam_file)
            return config.JOB_ERROR
        retcode = sam_to_bam(discordant_genome_sam_file, discordant_genome_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(discordant_genome_bam_file):
                os.remove(discordant_genome_bam_file)
            return config.JOB_ERROR
        if os.path.exists(discordant_genome_sam_file):
            os.remove(discordant_genome_sam_file)
    #
    # Sort discordant reads by position
    #
    msg = "Sorting discordant BAM file"
    sorted_discordant_genome_bam_file = os.path.join(tmp_dir, config.SORTED_DISCORDANT_GENOME_BAM_FILE)
    if (up_to_date(sorted_discordant_genome_bam_file, discordant_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
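        # legacy samtools-style sort call: "-m" caps the sort memory (bytes)
        # and the destination is given as a prefix to which ".bam" is
        # appended, hence splitting off the extension first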
        bam_prefix = os.path.splitext(sorted_discordant_genome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), discordant_genome_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing discordant BAM file"
    sorted_discordant_bam_index_file = sorted_discordant_genome_bam_file + ".bai"
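    # pysam.index wraps "samtools index" and writes <bam>.bai next to the
    # input, so the skip check can compare the .bai timestamp to the BAM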
    if (up_to_date(sorted_discordant_bam_index_file, sorted_discordant_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_discordant_genome_bam_file)
    #
    # Convert unpaired transcriptome reads to genome coordinates
    #
    unpaired_genome_bam_file = os.path.join(tmp_dir, config.UNPAIRED_GENOME_BAM_FILE)
    msg = "Converting unpaired transcriptome hits to genomic coordinates"
    if (up_to_date(unpaired_genome_bam_file, unpaired_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)        
        unpaired_genome_sam_file = os.path.join(tmp_dir, config.UNPAIRED_GENOME_SAM_FILE)
        retcode = transcriptome_to_genome(genome_index, transcripts, 
                                          input_file=unpaired_bam_file, 
                                          output_file=unpaired_genome_sam_file,
                                          library_type=runconfig.library_type,
                                          input_sam=False,
                                          output_sam=True)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unpaired_genome_sam_file):
                os.remove(unpaired_genome_sam_file)
            return config.JOB_ERROR
        retcode = sam_to_bam(unpaired_genome_sam_file, unpaired_genome_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unpaired_genome_bam_file):
                os.remove(unpaired_genome_bam_file)
            return config.JOB_ERROR
        if os.path.exists(unpaired_genome_sam_file):
            os.remove(unpaired_genome_sam_file)        
    #
    # Sort unpaired reads by position
    #
    msg = "Sorting unpaired BAM file"
    sorted_unpaired_genome_bam_file = os.path.join(tmp_dir, config.SORTED_UNPAIRED_GENOME_BAM_FILE)
    if (up_to_date(sorted_unpaired_genome_bam_file, unpaired_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_unpaired_genome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), unpaired_genome_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing unpaired BAM file"
    sorted_unpaired_bam_index_file = sorted_unpaired_genome_bam_file + ".bai"
    if (up_to_date(sorted_unpaired_bam_index_file, sorted_unpaired_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_unpaired_genome_bam_file)
    #
    # Cluster discordant reads into chimera candidates
    #
    cluster_file = os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_FILE)
    cluster_shelve_file = \
        os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_SHELVE_FILE)
    sorted_discordant_genome_cluster_bam_file = \
        os.path.join(runconfig.output_dir, 
                     config.SORTED_DISCORDANT_GENOME_CLUSTER_BAM_FILE)
    input_files = (sorted_discordant_genome_bam_file, 
                   sorted_unpaired_genome_bam_file)
    output_files = (cluster_file, cluster_shelve_file,                      
                    sorted_discordant_genome_cluster_bam_file)
    msg = "Clustering discordant reads"
    skip = True
    for input_file in input_files:
        for output_file in output_files:
            skip = skip and up_to_date(output_file, input_file)
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = cluster_discordant_reads(discordant_bam_file=sorted_discordant_genome_bam_file, 
                                           unpaired_bam_file=sorted_unpaired_genome_bam_file, 
                                           concordant_bam_file=sorted_transcriptome_bam_file, 
                                           output_bam_file=sorted_discordant_genome_cluster_bam_file, 
                                           cluster_file=cluster_file,
                                           cluster_shelve_file=cluster_shelve_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Pair discordant clusters
    #
    cluster_pair_file = \
        os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_PAIR_FILE)
    msg = "Pairing discordant clusters"
    output_files = (cluster_pair_file,)
    if up_to_date(cluster_pair_file, sorted_discordant_genome_cluster_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = pair_discordant_clusters(discordant_bam_file=sorted_discordant_genome_cluster_bam_file, 
                                           cluster_pair_file=cluster_pair_file, 
                                           tmp_dir=tmp_dir)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Perform realignment across putative fusion breakpoints
    #
    breakpoint_bam_file = os.path.join(tmp_dir, config.BREAKPOINT_BAM_FILE)
    msg = "Realigning to find breakpoint-spanning reads"
    input_files = (sorted_discordant_genome_bam_file, 
                   sorted_unpaired_genome_bam_file, 
                   cluster_shelve_file, 
                   cluster_pair_file)
    output_files = (breakpoint_bam_file,)
    skip = True
    for inp in input_files:
        for outp in output_files:
            if not up_to_date(outp, inp):
                skip = False
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = realign_across_breakpoints(index_dir=runconfig.index_dir,
                                             discordant_bam_file=sorted_discordant_genome_bam_file,
                                             unpaired_bam_file=sorted_unpaired_genome_bam_file,
                                             cluster_shelve_file=cluster_shelve_file,
                                             cluster_pair_file=cluster_pair_file,
                                             breakpoint_bam_file=breakpoint_bam_file,
                                             log_dir=log_dir,
                                             tmp_dir=tmp_dir,
                                             num_processors=runconfig.num_processors,
                                             local_anchor_length=runconfig.local_anchor_length,
                                             local_multihits=runconfig.local_multihits)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Nominate breakpoint spanning reads (split reads)
    #
    spanning_sam_file = os.path.join(tmp_dir, config.SPANNING_SAM_FILE)
    spanning_bam_file = os.path.join(tmp_dir, config.SPANNING_BAM_FILE)
    spanning_cluster_pair_file = os.path.join(tmp_dir, config.SPANNING_CLUSTER_PAIR_FILE)
    msg = "Processing breakpoint-spanning alignments"
    input_files = (breakpoint_bam_file,
                   cluster_shelve_file, 
                   cluster_pair_file)
    output_files = (spanning_bam_file,
                    spanning_cluster_pair_file)
    skip = True
    for inp in input_files:
        for outp in output_files:
            if not up_to_date(outp, inp):
                skip = False
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = process_spanning_alignments(cluster_shelve_file=cluster_shelve_file,
                                              cluster_pair_file=cluster_pair_file,
                                              bam_file=breakpoint_bam_file,                                              
                                              output_sam_file=spanning_sam_file,
                                              output_cluster_pair_file=spanning_cluster_pair_file,
                                              local_anchor_length=runconfig.local_anchor_length)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
        retcode = sam_to_bam(spanning_sam_file, spanning_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(spanning_bam_file):
                os.remove(spanning_bam_file)
            return config.JOB_ERROR
        if os.path.exists(spanning_sam_file):
            os.remove(spanning_sam_file)
    #
    # Sort spanning reads by position
    #
    msg = "Sorting spanning BAM file"
    sorted_spanning_bam_file = os.path.join(runconfig.output_dir, config.SORTED_SPANNING_BAM_FILE)
    if (up_to_date(sorted_spanning_bam_file, spanning_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_spanning_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), spanning_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing spanning BAM file"
    sorted_spanning_bam_index_file = sorted_spanning_bam_file + ".bai"
    if (up_to_date(sorted_spanning_bam_index_file, sorted_spanning_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_spanning_bam_file)
    #
    # Write chimera file
    # 
    unfiltered_chimera_bedpe_file = os.path.join(runconfig.output_dir, 
                                                 config.UNFILTERED_CHIMERA_BEDPE_FILE)
    msg = "Writing unfiltered chimeras to file %s" % (unfiltered_chimera_bedpe_file)
    if (up_to_date(unfiltered_chimera_bedpe_file, spanning_cluster_pair_file) and
        up_to_date(unfiltered_chimera_bedpe_file, cluster_shelve_file)):                
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = write_output(transcripts, 
                               cluster_shelve_file=cluster_shelve_file, 
                               cluster_pair_file=spanning_cluster_pair_file, 
                               read_name_file=read_name_file, 
                               output_file=unfiltered_chimera_bedpe_file, 
                               annotation_source="ensembl")
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unfiltered_chimera_bedpe_file):
                os.remove(unfiltered_chimera_bedpe_file)
            return config.JOB_ERROR
    #
    # Filter chimeras
    #
    chimera_bedpe_file = os.path.join(runconfig.output_dir, config.CHIMERA_BEDPE_FILE)
    msg = "Filtering chimeras"
    if (up_to_date(chimera_bedpe_file, unfiltered_chimera_bedpe_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:        
        logging.info(msg)
        retcode = filter_chimeras(input_file=unfiltered_chimera_bedpe_file, 
                                  output_file=chimera_bedpe_file,
                                  filter_num_frags=runconfig.filter_num_frags,
                                  filter_allele_fraction=runconfig.filter_allele_fraction,
                                  mask_biotypes=mask_biotypes,
                                  mask_rnames=mask_rnames)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(chimera_bedpe_file):
                os.remove(chimera_bedpe_file)
            return config.JOB_ERROR
    #
    # Cleanup
    # 
    if not runconfig.keep_tmp:
        logging.info("Cleaning up temporary files")
        shutil.rmtree(tmp_dir)
    #
    # Done
    # 
    logging.info("Finished run.")
    return config.JOB_SUCCESS
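
# Note: the sam_to_bam() helper called in run_chimerascan() above is not shown
# in these examples. A minimal sketch using the classic pysam API (an
# assumption; the real helper may instead shell out to samtools view):
import pysam

def sam_to_bam(sam_file, bam_file):
    # copy SAM records into a BAM file that reuses the input header
    infh = pysam.Samfile(sam_file, "r")
    outfh = pysam.Samfile(bam_file, "wb", template=infh)
    for read in infh:
        outfh.write(read)
    outfh.close()
    infh.close()
    return config.JOB_SUCCESS  # config is assumed imported, as elsewhere
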
def create_chimerascan_index(output_dir, genome_fasta_file, gene_feature_file,
                             bowtie_build_bin):
    #                             min_fragment_size,
    #                             max_fragment_size):
    # create output dir if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        logging.info("Created index directory: %s" % (output_dir))
    # copy reference fasta file to output dir
    index_fasta_file = os.path.join(output_dir, ALIGN_INDEX + ".fa")
    if (up_to_date(index_fasta_file, genome_fasta_file)
            and up_to_date(index_fasta_file, gene_feature_file)):
        logging.info("[SKIPPED] Adding reference genome to index")
    else:
        logging.info("Adding reference genome to index")
        shutil.copyfile(genome_fasta_file, index_fasta_file)
        # index the genome fasta file
        logging.info("Indexing FASTA file")
        fh = pysam.Fastafile(index_fasta_file)
        fh.close()
        # append sequences from gene feature file
        logging.info("Adding transcript sequences to index...")
        fh = open(index_fasta_file, "a")
        for fa_record in bed12_to_fasta(gene_feature_file, index_fasta_file):
            print >> fh, fa_record
        fh.close()
        # remove the stale fasta index, if present, before re-indexing
        if os.path.exists(index_fasta_file + ".fai"):
            os.remove(index_fasta_file + ".fai")
        # re-index the combined fasta file
        logging.info("Re-indexing FASTA file...")
        fh = pysam.Fastafile(index_fasta_file)
        fh.close()
    # build bowtie index on the reference sequence file
    bowtie_index_file = os.path.join(output_dir, BOWTIE_INDEX_FILE)
    msg = "Building bowtie index"
    if up_to_date(bowtie_index_file, index_fasta_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bowtie_index_name = os.path.join(output_dir, ALIGN_INDEX)
        args = [bowtie_build_bin, index_fasta_file, bowtie_index_name]
        if subprocess.call(args) != os.EX_OK:
            logging.error("bowtie-build failed to create alignment index")
            if os.path.exists(bowtie_index_file):
                os.remove(bowtie_index_file)
            return JOB_ERROR
    # copy gene bed file to index directory
    dst_gene_feature_file = os.path.join(output_dir, GENE_FEATURE_FILE)
    if up_to_date(dst_gene_feature_file, gene_feature_file):
        logging.info("[SKIPPED] Adding transcript features to index...")
    else:
        logging.info("Adding transcript features to index...")
        shutil.copyfile(gene_feature_file, dst_gene_feature_file)
    # create tophat junctions file from gene features


#    juncs_file = os.path.join(output_dir, TOPHAT_JUNCS_FILE)
#    if up_to_date(juncs_file, dst_gene_feature_file):
#        logging.info("[SKIPPED] Creating splice junction file...")
#    else:
#        logging.info("Creating splice junction file...")
#        fh = open(juncs_file, "w")
#        for junc_line in create_tophat_juncs_file(output_dir, gene_feature_file):
#            print >>fh, junc_line
#        fh.close()
# build special index used to discover the fragment size
#    frag_size_index_file = os.path.join(output_dir, FRAG_SIZE_INDEX_FILE)
#    if up_to_date(frag_size_index_file, index_fasta_file):
#        logging.info("[SKIPPED] Building fragment size distribution index")
#    else:
#        logging.info("Building fragment size distribution index")
#        retcode = create_fragment_size_index(output_dir, gene_feature_file,
#                                             genome_fasta_file,
#                                             bowtie_build_bin,
#                                             max_fragment_size)
#        if retcode != os.EX_OK:
#            logging.error("bowtie-build failed to create fragment size "
#                          "distribution index")
#            if os.path.exists(frag_size_index_file):
#                os.remove(frag_size_index_file)
#            return JOB_ERROR
    logging.info("chimerascan index created successfully")
    return JOB_SUCCESS
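
# Note: the up_to_date() helper that gates every [SKIPPED] branch in these
# examples is not shown. A minimal sketch, assuming make-style semantics
# (the output must exist, be non-empty, and be no older than its input):
import os

def up_to_date(output_file, input_file):
    # a missing or empty output always needs rebuilding
    if (not os.path.exists(output_file)) or (os.path.getsize(output_file) == 0):
        return False
    # otherwise the output must be at least as new as the input
    return os.path.getmtime(output_file) >= os.path.getmtime(input_file)
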
Example #7
def create_chimerascan_index(output_dir, 
                             genome_fasta_file, 
                             transcript_feature_file):
    # create output dir if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        logging.info("Created index directory: %s" % (output_dir))
    # copy reference fasta file to output dir and index it
    dst_genome_fasta_file = os.path.join(output_dir, 
                                         config.GENOME_FASTA_FILE)
    msg = "Adding reference genome"
    if (up_to_date(dst_genome_fasta_file, genome_fasta_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        shutil.copyfile(genome_fasta_file, dst_genome_fasta_file)
        # index the genome fasta file
        logging.info("Indexing FASTA file")
        fh = pysam.Fastafile(dst_genome_fasta_file)
        fh.close()
    # add gene sequences to index
    dst_transcript_feature_file = os.path.join(output_dir, config.TRANSCRIPT_FEATURE_FILE)
    transcript_fasta_file = os.path.join(output_dir, config.TRANSCRIPTOME_FASTA_FILE)
    multimapping_file = os.path.join(output_dir, config.MAX_MULTIMAPPING_FILE)
    msg = "Building transcriptome sequences and gene features"
    if (up_to_date(dst_transcript_feature_file, transcript_feature_file) and
        up_to_date(transcript_fasta_file, dst_transcript_feature_file) and
        up_to_date(multimapping_file, transcript_feature_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        # write sequences from gene feature file
        logging.info("Adding transcript sequences")
        fasta_fh = open(transcript_fasta_file, "w")
        tx_fh = open(dst_transcript_feature_file, "w")
        chrom_transcript_dict = collections.defaultdict(list)
        for t, fa_record in transcript_features_to_fasta(transcript_feature_file, 
                                                         dst_genome_fasta_file):
            print >>tx_fh, str(t)
            print >>fasta_fh, fa_record
            chrom_transcript_dict[t.chrom].append(t)
        tx_fh.close()
        fasta_fh.close()
        # find maximum transcript overlap as this informs alignment 
        # parameters controlling multi-mapping read handling
        max_overlap = 0
        for chrom, transcripts in chrom_transcript_dict.iteritems():            
            overlap = find_maximum_feature_overlap(transcripts)
            max_overlap = max(max_overlap, overlap)
        logging.info("Maximum transcript overlap is %d" % (max_overlap))
        fh = open(multimapping_file, "w")
        print >>fh, max_overlap
        fh.close()
        # index the transcript fasta file
        logging.info("Indexing the Transcriptome FASTA file")
        fh = pysam.Fastafile(transcript_fasta_file)
        fh.close()
    #
    # Build Transcriptome alignment index
    #
    skip = True
    # use a list, not a generator, so the files can be re-iterated in the
    # failure branch below
    index_files = [os.path.join(output_dir, f) for f in config.TRANSCRIPTOME_BOWTIE2_FILES]
    for f in index_files:
        skip = skip and up_to_date(f, transcript_fasta_file) 
    msg = "Building transcriptome index"
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bowtie_index_name = os.path.join(output_dir, config.TRANSCRIPTOME_INDEX)
        args = [config.BOWTIE2_BUILD_BIN, transcript_fasta_file, bowtie_index_name]
        if subprocess.call(args) != os.EX_OK:
            logging.error("Failed to create alignment index")
            for f in index_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Build Genome alignment index
    #
    skip = True
    index_files = [os.path.join(output_dir, f) for f in config.GENOME_BOWTIE2_FILES]
    for f in index_files:
        skip = skip and up_to_date(f, dst_genome_fasta_file) 
    msg = "Building genome index"
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bowtie_index_name = os.path.join(output_dir, config.GENOME_INDEX)
        args = [config.BOWTIE2_BUILD_BIN, dst_genome_fasta_file, bowtie_index_name]
        if subprocess.call(args) != os.EX_OK:
            logging.error("Failed to create alignment index")
            for f in index_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    logging.info("Chimerascan index created successfully")
    return config.JOB_SUCCESS
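
# Note: find_maximum_feature_overlap() is not shown either. The sketch below
# assumes it returns the depth of the deepest pile-up of transcript intervals
# on one chromosome (a plain start/end sweep), matching the multi-mapping
# comment above; the t.start / t.end attributes are assumed by analogy with
# the t.chrom attribute used in the example.
def find_maximum_feature_overlap(transcripts):
    # emit +1 at each transcript start and -1 at each end
    events = []
    for t in transcripts:
        events.append((t.start, 1))
        events.append((t.end, -1))
    # sort ends before starts at equal coordinates so abutting
    # (non-overlapping) transcripts do not inflate the depth
    events.sort(key=lambda e: (e[0], e[1]))
    depth = 0
    max_depth = 0
    for pos, delta in events:
        depth += delta
        max_depth = max(max_depth, depth)
    return max_depth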