def bed12_to_fasta(gene_feature_file, reference_seq_file):
    """
    generate one FASTA record per transcript in 'gene_feature_file'.

    for each transcript the exon sequences are fetched from
    'reference_seq_file', concatenated in the order given by 'g.exons',
    and reverse complemented when the strand is '-'.  transcripts that
    have no exons, or that have an exon which cannot be fetched from
    the reference, are skipped with a warning.

    yields strings (without a trailing newline) consisting of a header
    line ">NAME range=CHROM:START-END gene=GENE strand=STRAND" followed
    by the sequence as returned by split_seq(), where NAME is
    GENE_REF_PREFIX + tx_name and START-END spans all exons of the
    transcript.
    """
    ref_fa = pysam.Fastafile(reference_seq_file)
    gene_fh = open(gene_feature_file)
    # try/finally so that both handles are released even when the
    # consumer abandons the generator early or an error is raised
    try:
        for g in GeneFeature.parse(gene_fh):
            exons = list(g.exons)
            if not exons:
                logging.warning("gene %s has no exons, skipping", g.tx_name)
                continue
            exon_seqs = []
            missing_exon = False
            for start, end in exons:
                seq = ref_fa.fetch(g.chrom, start, end)
                if not seq:
                    logging.warning(
                        "gene %s exon %s:%d-%d not found in reference",
                        g.tx_name, g.chrom, start, end)
                    missing_exon = True
                    break
                exon_seqs.append(seq)
            if missing_exon:
                continue
            # make fasta record
            seq = ''.join(exon_seqs)
            if g.strand == '-':
                seq = DNA_reverse_complement(seq)
            # break seq onto multiple lines
            seqlines = split_seq(seq, BASES_PER_LINE)
            # report the span covered by all exons rather than the
            # coordinates left over from the last exon loop iteration
            range_start = min(exon[0] for exon in exons)
            range_end = max(exon[1] for exon in exons)
            yield (">%s range=%s:%d-%d gene=%s strand=%s\n%s" %
                   (GENE_REF_PREFIX + g.tx_name, g.chrom, range_start,
                    range_end, g.gene_name, g.strand, seqlines))
    finally:
        gene_fh.close()
        ref_fa.close()
def create_chimerascan_index(output_dir, genome_fasta_file, gene_feature_file,
                             bowtie_build_bin):
    """
    build a chimerascan alignment index inside 'output_dir'.

    the steps are:
    1. create 'output_dir' if it does not exist
    2. write a combined FASTA file (ALIGN_INDEX + ".fa") holding a copy
       of 'genome_fasta_file' followed by the transcript records
       produced by bed12_to_fasta()
    3. copy 'gene_feature_file' into the index directory
    4. open the combined FASTA file with pysam so that it gets indexed
    5. run 'bowtie_build_bin' on the combined FASTA file

    returns JOB_SUCCESS, or JOB_ERROR when bowtie-build exits with a
    status other than os.EX_OK.
    """
    # create output dir if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        logging.info("Created index directory: %s", output_dir)
    # create FASTA index file; the 'with' blocks guarantee the handles
    # are closed even if copying or sequence extraction fails
    index_fasta_file = os.path.join(output_dir, ALIGN_INDEX + ".fa")
    with open(index_fasta_file, "w") as index_fh:
        # copy reference fasta file to output dir
        logging.info("Adding reference genome to index...")
        with open(genome_fasta_file) as genome_fh:
            shutil.copyfileobj(genome_fh, index_fh)
        # extract sequences from gene feature file
        logging.info("Adding gene models to index...")
        for fa_record in bed12_to_fasta(gene_feature_file, genome_fasta_file):
            index_fh.write(fa_record + "\n")
    # copy gene bed file to index directory
    shutil.copyfile(gene_feature_file,
                    os.path.join(output_dir, GENE_FEATURE_FILE))
    # index the combined fasta file (opening it with pysam is done
    # only for that effect, so the handle is closed right away)
    logging.info("Indexing FASTA file...")
    pysam.Fastafile(index_fasta_file).close()
    # build bowtie index on the combined fasta file
    logging.info("Building bowtie index...")
    bowtie_index_name = os.path.join(output_dir, ALIGN_INDEX)
    args = [bowtie_build_bin, index_fasta_file, bowtie_index_name]
    if subprocess.call(args) != os.EX_OK:
        logging.error("bowtie-build failed to create alignment index")
        return JOB_ERROR
    logging.info("chimerascan index created successfully")
    return JOB_SUCCESS
Exemplo n.º 3
0
def bedpe_to_junction_fasta(bed_file, reference_seq_file, read_length,
                            fasta_output_fh, junc_output_fh,
                            num_mismatches=2):
    """
    extract the junction sequence of every fusion candidate in a BEDPE
    file and write the unique junction sequences as FASTA.

    bed_file: tab-delimited file whose first six columns are
    (ref5p, start5p, end5p, ref3p, start3p, end3p); blank lines are
    ignored
    reference_seq_file: FASTA file in which the references are stored
    under config.GENE_REF_PREFIX + ref name
    read_length: at most (read_length - 1) bases are taken from each
    side of the junction
    fasta_output_fh: open file that receives one ">JUNCnnnnnnn" record
    per unique junction sequence
    junc_output_fh: open file that receives one tab-delimited line per
    candidate: junction name, length of the 5' part of the junction
    sequence, 5' homology length, 3' homology length, followed by the
    original BEDPE fields
    num_mismatches: passed through to find_homology()
    """
    gene_fasta_prefix = config.GENE_REF_PREFIX
    min_junc_length = (read_length * 2) - 2
    # junction sequence -> list of candidates sharing that sequence
    juncs = collections.defaultdict(list)
    ref_fa = pysam.Fastafile(reference_seq_file)
    try:
        with open(bed_file) as bed_fh:
            for line in bed_fh:
                record = line.strip()
                if not record:
                    # tolerate blank lines instead of failing on fields[1]
                    continue
                fields = record.split('\t')
                ref5p, start5p, end5p = fields[0], int(fields[1]), int(fields[2])
                ref3p, start3p, end3p = fields[3], int(fields[4]), int(fields[5])
                # join end of 5' ref with beginning of 3' ref
                junc_start5p = max(start5p, end5p - read_length + 1)
                junc_end3p = min(end3p, start3p + read_length - 1)
                # fetch sequence
                seq5p = ref_fa.fetch(gene_fasta_prefix + ref5p, junc_start5p, end5p)
                seq3p = ref_fa.fetch(gene_fasta_prefix + ref3p, start3p, junc_end3p)
                seq = seq5p + seq3p
                if len(seq) < min_junc_length:
                    logging.warning("Could not extract sequence of length >%d from BEDPE, only retrieved sequence of (%d,%d) for gene %s",
                                    min_junc_length, len(seq5p), len(seq3p), record)
                # fetch continuation sequence of non-fusion gene
                homolog_end5p = end5p + read_length - 1
                homolog_start3p = max(0, start3p - read_length + 1)
                homolog5p = ref_fa.fetch(gene_fasta_prefix + ref3p, homolog_start3p, start3p)
                homolog3p = ref_fa.fetch(gene_fasta_prefix + ref5p, end5p, homolog_end5p)
                # find homology between 5' gene and 3' gene
                homology_length_5p = find_homology(seq5p, homolog5p, num_mismatches)
                homology_length_3p = find_homology(seq3p, homolog3p, num_mismatches)
                # add sequence to dictionary and group fusion candidates
                # together if they have the same junction sequence
                juncs[seq].append((len(seq5p), homology_length_5p,
                                   homology_length_3p, fields))
    finally:
        ref_fa.close()
    # now extract the unique junction sequences
    # and write them to a fasta file
    for junc_index, (junc_seq, junc_info_list) in enumerate(juncs.items(), 1):
        junc_name = "JUNC%07d" % (junc_index)
        # write to fasta file
        fasta_output_fh.write(">%s\n%s\n" % (junc_name, junc_seq))
        # create entries in junc map file
        for (left_seq_length, homology_length_5p,
             homology_length_3p, bedpe_fields) in junc_info_list:
            out_fields = [junc_name, left_seq_length,
                          homology_length_5p, homology_length_3p]
            out_fields.extend(bedpe_fields)
            junc_output_fh.write('\t'.join(map(str, out_fields)) + "\n")
def create_fragment_size_index(output_dir, gene_feature_file,
                               reference_seq_file, bowtie_build_bin,
                               max_fragment_size):
    """
    make an alignment index containing sequences that can be used to
    assess the fragment size distribution.  these sequences must be
    at least 'max_fragment_size' long in order to be viable for use
    in characterizing the fragment size distribution.

    the selected exon sequences are written to "frag_size_seq.fa" in
    'output_dir' ('-' strand exons are reverse complemented) and
    'bowtie_build_bin' is run on that file.

    returns the exit status of the bowtie-build process.
    """
    # parse genes file
    with open(gene_feature_file) as gene_fh:
        genes = list(GeneFeature.parse(gene_fh))
    # find all exons that are larger than the maximum estimated fragment size
    exons = set(coord for coord in find_unambiguous_exon_intervals(genes)
                if (coord[2] - coord[1]) >= max_fragment_size)
    logging.info("Found %d exons larger than %d",
                 len(exons), max_fragment_size)
    # extract the nucleotide sequence of the exons
    logging.info("Extracting sequences to use for estimating the fragment "
                 " size distribution")
    frag_size_fa_file = os.path.join(output_dir, "frag_size_seq.fa")
    ref_fa = pysam.Fastafile(reference_seq_file)
    try:
        with open(frag_size_fa_file, 'w') as fh:
            for chrom, start, end, strand in exons:
                seq = ref_fa.fetch(chrom, start, end)
                if not seq:
                    logging.warning("exon %s:%d-%d not found in reference",
                                    chrom, start, end)
                    continue
                # make fasta record
                if strand == '-':
                    seq = DNA_reverse_complement(seq)
                # the record must be written for exons on either strand,
                # so this code sits outside the strand check above
                # break seq onto multiple lines
                seqlines = split_seq(seq, BASES_PER_LINE)
                record = (">%s:%d-%d strand=%s\n%s" %
                          (chrom, start, end, strand, seqlines))
                fh.write(record + "\n")
    finally:
        ref_fa.close()
    # build bowtie alignment index from the fragment size exons
    logging.info("Building bowtie index")
    frag_size_index = os.path.join(output_dir, FRAG_SIZE_INDEX)
    args = [bowtie_build_bin, frag_size_fa_file, frag_size_index]
    return subprocess.call(args)
def discordant_reads_to_breakpoints(index_dir, isize_dist_file, input_bam_file,
                                    output_file, trim_bp, max_read_length,
                                    homology_mismatches):
    """
    for every discordant read pair in 'input_bam_file' choose the best
    breakpoint(s) between the two transcripts and write one
    tab-delimited line per (read pair, breakpoint) to 'output_file'.

    index_dir: index directory holding the gene feature file
    (config.GENE_FEATURE_FILE) and the reference FASTA file
    (config.ALIGN_INDEX + ".fa")

    isize_dist_file: file from which the insert size distribution is
    loaded

    homology_mismatches: number of mismatches to tolerate while computing
    homology between chimeric breakpoint sequence and "wildtype" sequence

    trim_bp: when selecting the best matching exon for each read, we
    account for spurious overlap into adjacent exons by trimming the
    read by 'trim_bp'

    max_read_length: passed through to extract_breakpoint_sequence()

    returns config.JOB_SUCCESS.
    """
    # read insert size distribution
    with open(isize_dist_file) as isize_fh:
        isize_dist = InsertSizeDistribution.from_file(isize_fh)
    # open BAM alignment file; the try/finally blocks below make sure
    # every handle is released even when processing fails part way
    bamfh = pysam.Samfile(input_bam_file, "rb")
    try:
        # build a lookup table to get genomic intervals from transcripts
        logging.debug("Reading gene information")
        gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
        tid_tx_map = build_tid_tx_map(bamfh,
                                      gene_file,
                                      rname_prefix=config.GENE_REF_PREFIX)
        # open the reference sequence fasta file
        ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
        ref_fa = pysam.Fastafile(ref_fasta_file)
        try:
            with open(output_file, "w") as outfh:
                # iterate through read pairs
                logging.debug("Parsing discordant reads")
                for r5p, r3p in parse_gene_discordant_reads(bamfh):
                    # store pertinent read information in lightweight
                    # structure called DiscordantRead object. this departs
                    # from SAM format into a custom read format
                    dr5p = DiscordantRead.from_read(r5p)
                    dr3p = DiscordantRead.from_read(r3p)
                    # get gene information
                    tx5p = tid_tx_map[r5p.rname]
                    tx3p = tid_tx_map[r3p.rname]
                    # given the insert size find the highest probability
                    # exon junction breakpoint between the two transcripts
                    isize_prob, breakpoints = \
                        choose_best_breakpoints(r5p, r3p, tx5p, tx3p,
                                                trim_bp, isize_dist)
                    # the read columns are identical for every breakpoint
                    # of this pair, so format them once
                    read_fields = ['|'.join(map(str, dr5p.to_list())),
                                   '|'.join(map(str, dr3p.to_list()))]
                    # extract the sequence of the breakpoint along with the
                    # number of homologous bases at the breakpoint between
                    # chimera and wildtype genes
                    for breakpoint in breakpoints:
                        exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = breakpoint
                        breakpoint_seq_5p, breakpoint_seq_3p, homology_left, homology_right = \
                            extract_breakpoint_sequence(config.GENE_REF_PREFIX + tx5p.tx_name, tx_end_5p,
                                                        config.GENE_REF_PREFIX + tx3p.tx_name, tx_start_3p,
                                                        ref_fa, max_read_length,
                                                        homology_mismatches)
                        # write breakpoint information for each read to a file
                        fields = [
                            tx5p.tx_name,
                            0,
                            tx_end_5p,
                            tx3p.tx_name,
                            tx_start_3p,
                            tx3p.tx_end,
                            # NOTE(review): the "name" column holds the
                            # reference id of the 5' read (rname), not the
                            # read name -- confirm this is intended
                            r5p.rname,  # name
                            isize_prob,  # score
                            tx5p.strand,
                            tx3p.strand,  # strand 1, strand 2
                            # user defined fields
                            exon_num_5p,
                            exon_num_3p,
                            breakpoint_seq_5p,
                            breakpoint_seq_3p,
                            homology_left,
                            homology_right
                        ]
                        fields.extend(read_fields)
                        outfh.write('\t'.join(map(str, fields)) + "\n")
        finally:
            ref_fa.close()
    finally:
        bamfh.close()
    return config.JOB_SUCCESS
def create_chimerascan_index(output_dir, genome_fasta_file, gene_feature_file,
                             bowtie_build_bin):
    """
    build, or refresh, the chimerascan index inside 'output_dir'.

    each step is skipped when up_to_date() reports that its output
    does not need rebuilding with respect to its inputs:
    1. combined FASTA file (ALIGN_INDEX + ".fa"): a copy of
       'genome_fasta_file' with the transcript records produced by
       bed12_to_fasta() appended, indexed with pysam
    2. bowtie index built by 'bowtie_build_bin' from the combined FASTA
    3. copy of 'gene_feature_file' (GENE_FEATURE_FILE)

    returns JOB_SUCCESS, or JOB_ERROR when bowtie-build exits with a
    status other than os.EX_OK (a leftover BOWTIE_INDEX_FILE is removed
    in that case).

    NOTE: creation of a tophat splice junction file and of a fragment
    size distribution index (which needed min/max fragment size
    arguments) used to be part of this function and is disabled.
    """
    # create output dir if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        logging.info("Created index directory: %s", output_dir)
    # copy reference fasta file to output dir
    index_fasta_file = os.path.join(output_dir, ALIGN_INDEX + ".fa")
    if (up_to_date(index_fasta_file, genome_fasta_file)
            and up_to_date(index_fasta_file, gene_feature_file)):
        logging.info("[SKIPPED] Adding reference genome to index")
    else:
        logging.info("Adding reference genome to index")
        shutil.copyfile(genome_fasta_file, index_fasta_file)
        # index the genome fasta file so that the transcript sequences
        # can be fetched from the copy
        logging.info("Indexing FASTA file")
        pysam.Fastafile(index_fasta_file).close()
        # append sequences from gene feature file; 'with' closes the
        # handle even if sequence extraction fails
        logging.info("Adding transcript sequences to index...")
        with open(index_fasta_file, "a") as fasta_fh:
            for fa_record in bed12_to_fasta(gene_feature_file,
                                            index_fasta_file):
                fasta_fh.write(fa_record + "\n")
        # remove old fasta index, it does not cover the appended records
        os.remove(index_fasta_file + ".fai")
        # re-index the combined fasta file
        logging.info("Re-indexing FASTA file...")
        pysam.Fastafile(index_fasta_file).close()
    # build bowtie index on the reference sequence file
    bowtie_index_file = os.path.join(output_dir, BOWTIE_INDEX_FILE)
    msg = "Building bowtie index"
    if up_to_date(bowtie_index_file, index_fasta_file):
        logging.info("[SKIPPED] %s", msg)
    else:
        logging.info(msg)
        bowtie_index_name = os.path.join(output_dir, ALIGN_INDEX)
        args = [bowtie_build_bin, index_fasta_file, bowtie_index_name]
        if subprocess.call(args) != os.EX_OK:
            logging.error("bowtie-build failed to create alignment index")
            # do not leave a partial index behind that a later run
            # could mistake for an up-to-date one
            if os.path.exists(bowtie_index_file):
                os.remove(bowtie_index_file)
            return JOB_ERROR
    # copy gene bed file to index directory
    dst_gene_feature_file = os.path.join(output_dir, GENE_FEATURE_FILE)
    if up_to_date(dst_gene_feature_file, gene_feature_file):
        logging.info("[SKIPPED] Adding transcript features to index...")
    else:
        logging.info("Adding transcript features to index...")
        shutil.copyfile(gene_feature_file, dst_gene_feature_file)
    logging.info("chimerascan index created successfully")
    return JOB_SUCCESS
Exemplo n.º 7
0
def determine_chimera_breakpoints(index_dir, read_length, 
                                  input_chimera_file, output_chimera_file, 
                                  breakpoint_map_file, breakpoint_fasta_file,
                                  homology_mismatches=DEFAULT_HOMOLOGY_MISMATCHES):
    """
    determine the breakpoint sequence of every chimera in
    'input_chimera_file' and group chimeras sharing the same sequence.

    index_dir: index directory holding the reference FASTA file
    (config.ALIGN_INDEX + ".fa")
    read_length: (read_length - 1) bases are used on each side of the
    breakpoint; shorter sequences are padded with "N"
    output_chimera_file: receives the chimeras updated with breakpoint
    name and 5'/3' homology lengths, one tab-delimited line each
    breakpoint_map_file: receives one tab-delimited line per unique
    breakpoint
    breakpoint_fasta_file: receives one FASTA record per unique
    breakpoint sequence
    homology_mismatches: passed through to calc_homology()
    """
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
    ref_fa = pysam.Fastafile(ref_fasta_file)
    # nested 'with' blocks (instead of one multi-item 'with') keep this
    # usable on older interpreters and close every handle on error
    try:
        with open(output_chimera_file, "w") as chimerafh:
            with open(breakpoint_map_file, "w") as breakpointfh:
                with open(breakpoint_fasta_file, "w") as fasta_output_fh:
                    with open(input_chimera_file) as chimera_infh:
                        _write_chimera_breakpoints(
                            ref_fa, Chimera.parse(chimera_infh),
                            chimerafh, breakpointfh, fasta_output_fh,
                            read_length, homology_mismatches)
    finally:
        ref_fa.close()


def _write_chimera_breakpoints(ref_fa, chimeras, chimerafh, breakpointfh,
                               fasta_output_fh, read_length,
                               homology_mismatches):
    """
    compute breakpoints for 'chimeras' using the open reference 'ref_fa'
    and write the results to the three open output handles.
    """
    flank_length = read_length - 1
    # breakpoint sequence -> Breakpoint shared by all chimeras with
    # that sequence
    breakpoints = {}
    for c in chimeras:
        # retrieve transcript coordinates of 5' and 3' partners
        ref5p = config.GENE_REF_PREFIX + c.partner5p.tx_name
        ref3p = config.GENE_REF_PREFIX + c.partner3p.tx_name
        start5p, end5p = c.partner5p.start, c.partner5p.end
        start3p, end3p = c.partner3p.start, c.partner3p.end
        # get intervals for breakpoint sequence
        breakpoint_start5p = max(start5p, end5p - read_length + 1)
        breakpoint_end3p = min(end3p, start3p + read_length - 1)
        # fetch sequence
        seq5p = ref_fa.fetch(ref5p, breakpoint_start5p, end5p)
        seq3p = ref_fa.fetch(ref3p, start3p, breakpoint_end3p)
        if len(seq5p) < flank_length:
            logging.warning("Could not extract sequence of length >%d from "
                            "5' partner of chimera %s, only retrieved "
                            "sequence of %d",
                            flank_length, c.name, len(seq5p))
            # pad sequence on the left so the breakpoint position is kept
            seq5p = ("N" * (flank_length - len(seq5p))) + seq5p
        if len(seq3p) < flank_length:
            logging.warning("Could not extract sequence of length >%d from "
                            "3' partner of chimera %s, only retrieved "
                            "sequence of %d",
                            flank_length, c.name, len(seq3p))
            # pad sequence on the right so the breakpoint position is kept
            seq3p = seq3p + ("N" * (flank_length - len(seq3p)))
        # fetch continuation sequence of non-fusion gene
        homolog_end5p = end5p + read_length - 1
        homolog_start3p = max(0, start3p - read_length + 1)
        homolog5p = ref_fa.fetch(ref3p, homolog_start3p, start3p)
        homolog3p = ref_fa.fetch(ref5p, end5p, homolog_end5p)
        # find homology between 5' gene and 3' gene; the 5' side is
        # reversed so the comparison starts at the breakpoint
        homology_length_5p = calc_homology(seq5p[::-1], homolog5p[::-1],
                                           homology_mismatches)
        homology_length_3p = calc_homology(seq3p, homolog3p,
                                           homology_mismatches)
        # look up the Breakpoint for this sequence, creating it when the
        # sequence has not been seen before
        seq = seq5p + seq3p
        b = breakpoints.get(seq)
        if b is None:
            b = Breakpoint()
            # number breakpoints in order of first appearance
            b.name = "B%07d" % (len(breakpoints))
            b.seq5p = seq5p
            b.seq3p = seq3p
            breakpoints[seq] = b
        # group fusion candidates together if they have the same
        # junction sequence
        b.chimera_names.append(c.name)
        # update Chimera object with breakpoint information
        c.breakpoint_name = b.name
        c.breakpoint_homology_5p = homology_length_5p
        c.breakpoint_homology_3p = homology_length_3p
        # write Chimera
        chimerafh.write('\t'.join(map(str, c.to_list())) + "\n")
    # now extract the unique junction sequences
    # and write them to a fasta file
    for seq, b in breakpoints.items():
        # write to fasta file
        fasta_output_fh.write(">%s\n%s\n" % (b.name, seq))
        # write to breakpoint map file
        breakpointfh.write('\t'.join(map(str, b.to_list())) + "\n")