def bed12_to_fasta(gene_feature_file, reference_seq_file):
    ref_fa = pysam.Fastafile(reference_seq_file)
    for g in GeneFeature.parse(open(gene_feature_file)):
        exon_seqs = []
        error_occurred = False
        for start, end in g.exons:
            seq = ref_fa.fetch(g.chrom, start, end)
            if not seq:
                logging.warning("gene %s exon %s:%d-%d not found in reference" %
                                (g.tx_name, g.chrom, start, end))
                error_occurred = True
                break
            exon_seqs.append(seq)
        if error_occurred:
            continue
        # make fasta record
        seq = ''.join(exon_seqs)
        if g.strand == '-':
            seq = DNA_reverse_complement(seq)
        # break seq onto multiple lines
        seqlines = split_seq(seq, BASES_PER_LINE)
        yield (">%s range=%s:%d-%d gene=%s strand=%s\n%s" %
               (GENE_REF_PREFIX + g.tx_name, g.chrom, start, end,
                g.gene_name, g.strand, seqlines))
    ref_fa.close()
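# split_seq is used above to wrap each transcript sequence at BASES_PER_LINE
# characters per FASTA line, but its definition is not shown in this excerpt.
# A minimal sketch of what such a helper might look like (name and behavior
# inferred from its use here, not taken from the package source):
def split_seq(seq, chars_per_line):
    # join fixed-width slices of the sequence with newlines
    lines = []
    for x in xrange(0, len(seq), chars_per_line):
        lines.append(seq[x:x + chars_per_line])
    return '\n'.join(lines)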
def create_chimerascan_index(output_dir, genome_fasta_file,
                             gene_feature_file, bowtie_build_bin):
    # create output dir if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        logging.info("Created index directory: %s" % (output_dir))
    # create FASTA index file
    index_fasta_file = os.path.join(output_dir, ALIGN_INDEX + ".fa")
    fh = open(index_fasta_file, "w")
    # copy reference fasta file to output dir
    logging.info("Adding reference genome to index...")
    shutil.copyfileobj(open(genome_fasta_file), fh)
    # extract sequences from gene feature file
    logging.info("Adding gene models to index...")
    for fa_record in bed12_to_fasta(gene_feature_file, genome_fasta_file):
        print >> fh, fa_record
    fh.close()
    # copy gene bed file to index directory
    shutil.copyfile(gene_feature_file,
                    os.path.join(output_dir, GENE_FEATURE_FILE))
    # index the combined fasta file
    logging.info("Indexing FASTA file...")
    fh = pysam.Fastafile(index_fasta_file)
    fh.close()
    # build bowtie index on the combined fasta file
    logging.info("Building bowtie index...")
    bowtie_index_name = os.path.join(output_dir, ALIGN_INDEX)
    args = [bowtie_build_bin, index_fasta_file, bowtie_index_name]
    if subprocess.call(args) != os.EX_OK:
        logging.error("bowtie-build failed to create alignment index")
        return JOB_ERROR
    logging.info("chimerascan index created successfully")
    return JOB_SUCCESS
def bedpe_to_junction_fasta(bed_file, reference_seq_file, read_length,
                            fasta_output_fh, junc_output_fh,
                            num_mismatches=2):
    gene_fasta_prefix = config.GENE_REF_PREFIX
    ref_fa = pysam.Fastafile(reference_seq_file)
    juncs = collections.defaultdict(lambda: [])
    for line in open(bed_file):
        #print line
        fields = line.strip().split('\t')
        ref5p, start5p, end5p = fields[0], int(fields[1]), int(fields[2])
        ref3p, start3p, end3p = fields[3], int(fields[4]), int(fields[5])
        # join end of 5' ref with beginning of 3' ref
        junc_start5p = max(start5p, end5p - read_length + 1)
        junc_end3p = min(end3p, start3p + read_length - 1)
        # fetch sequence
        seq5p = ref_fa.fetch(gene_fasta_prefix + ref5p, junc_start5p, end5p)
        seq3p = ref_fa.fetch(gene_fasta_prefix + ref3p, start3p, junc_end3p)
        seq = seq5p + seq3p
        if len(seq) < (read_length*2) - 2:
            logging.warning("Could not extract sequence of length >%d from "
                            "BEDPE, only retrieved sequence of (%d,%d) for "
                            "gene %s" % ((read_length*2)-2, len(seq5p),
                                         len(seq3p), line.strip()))
        # fetch continuation sequence of non-fusion gene
        homolog_end5p = end5p + read_length - 1
        homolog_start3p = max(0, start3p - read_length + 1)
        homolog5p = ref_fa.fetch(gene_fasta_prefix + ref3p, homolog_start3p, start3p)
        homolog3p = ref_fa.fetch(gene_fasta_prefix + ref5p, end5p, homolog_end5p)
        # find homology between 5' gene and 3' gene
        homology_length_5p = find_homology(seq5p, homolog5p, num_mismatches)
        homology_length_3p = find_homology(seq3p, homolog3p, num_mismatches)
        # add sequence to dictionary and group fusion candidates together
        # if they have the same junction sequence
        juncs[seq].append((len(seq5p), homology_length_5p,
                           homology_length_3p, fields))
    # now extract the unique junction sequences
    # and write them to a fasta file
    junc_index = 1
    for junc_seq, junc_info_list in juncs.iteritems():
        junc_name = "JUNC%07d" % (junc_index)
        # write to fasta file
        print >>fasta_output_fh, ">%s\n%s" % (junc_name, junc_seq)
        # create entries in junc map file
        for junc_info in junc_info_list:
            left_seq_length, homology_length_5p, homology_length_3p, bedpe_fields = junc_info
            fields = [junc_name, left_seq_length,
                      homology_length_5p, homology_length_3p]
            fields.extend(bedpe_fields)
            print >>junc_output_fh, '\t'.join(map(str, fields))
        junc_index += 1
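# find_homology (calc_homology in the later function below) measures how far
# the breakpoint sequence keeps matching the "wildtype" continuation sequence
# while tolerating a limited number of mismatches.  Its definition is not
# shown in this excerpt; a plausible sketch, assuming it walks the two
# sequences in parallel and returns the length of the agreeing prefix:
def find_homology(seq, homolog_seq, num_mismatches):
    smallest_len = min(len(seq), len(homolog_seq))
    mismatches = 0
    i = 0
    while i < smallest_len:
        if seq[i] != homolog_seq[i]:
            mismatches += 1
            if mismatches > num_mismatches:
                break
        i += 1
    # number of bases compared before exceeding the mismatch budget
    return i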
def create_fragment_size_index(output_dir, gene_feature_file,
                               reference_seq_file, bowtie_build_bin,
                               max_fragment_size):
    """
    make an alignment index containing sequences that can be used to
    assess the fragment size distribution.  these sequences must be
    larger than 'max_fragment_size' in order to be viable for use in
    characterizing the fragment size distribution.
    """
    # parse genes file
    genes = [g for g in GeneFeature.parse(open(gene_feature_file))]
    # find all exons that are larger than the maximum estimated fragment size
    exons = set([coord for coord in find_unambiguous_exon_intervals(genes)
                 if (coord[2] - coord[1]) >= max_fragment_size])
    logging.info("Found %d exons larger than %d" % (len(exons), max_fragment_size))
    # extract the nucleotide sequence of the exons
    logging.info("Extracting sequences to use for estimating the fragment "
                 "size distribution")
    ref_fa = pysam.Fastafile(reference_seq_file)
    frag_size_fa_file = os.path.join(output_dir, "frag_size_seq.fa")
    fh = open(frag_size_fa_file, 'w')
    for chrom, start, end, strand in exons:
        seq = ref_fa.fetch(chrom, start, end)
        if not seq:
            logging.warning("exon %s:%d-%d not found in reference" %
                            (chrom, start, end))
            continue
        # make fasta record
        if strand == '-':
            seq = DNA_reverse_complement(seq)
        # break seq onto multiple lines
        seqlines = split_seq(seq, BASES_PER_LINE)
        record = (">%s:%d-%d strand=%s\n%s" %
                  (chrom, start, end, strand, seqlines))
        print >> fh, record
    fh.close()
    ref_fa.close()
    # build bowtie alignment index from the fragment size exons
    logging.info("Building bowtie index")
    frag_size_index = os.path.join(output_dir, FRAG_SIZE_INDEX)
    args = [bowtie_build_bin, frag_size_fa_file, frag_size_index]
    return subprocess.call(args)
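# find_unambiguous_exon_intervals is not shown in this excerpt.  One plausible
# reading, assuming "unambiguous" means the exon does not overlap an exon of a
# different gene (so reads aligning inside it can only come from one locus),
# is sketched below; the real implementation may differ:
def find_unambiguous_exon_intervals(genes):
    # group every exon interval by chromosome
    exons_by_chrom = collections.defaultdict(list)
    for g in genes:
        for start, end in g.exons:
            exons_by_chrom[g.chrom].append((start, end, g.strand, g.gene_name))
    for chrom, exons in exons_by_chrom.iteritems():
        for start, end, strand, gene_name in exons:
            # an exon is "ambiguous" if any exon of a different gene overlaps it
            overlaps_other_gene = any((ostart < end and start < oend and
                                       ogene != gene_name)
                                      for ostart, oend, ostrand, ogene in exons)
            if not overlaps_other_gene:
                yield (chrom, start, end, strand)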
def discordant_reads_to_breakpoints(index_dir, isize_dist_file,
                                    input_bam_file, output_file, trim_bp,
                                    max_read_length, homology_mismatches):
    """
    homology_mismatches: number of mismatches to tolerate while computing
    homology between chimeric breakpoint sequence and "wildtype" sequence

    trim_bp: when selecting the best matching exon for each read, we
    account for spurious overlap into adjacent exons by trimming the
    read by 'trim_bp'
    """
    # read insert size distribution
    isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file))
    # open BAM alignment file
    bamfh = pysam.Samfile(input_bam_file, "rb")
    # build a lookup table to get genomic intervals from transcripts
    logging.debug("Reading gene information")
    gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
    tid_tx_map = build_tid_tx_map(bamfh, gene_file,
                                  rname_prefix=config.GENE_REF_PREFIX)
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
    ref_fa = pysam.Fastafile(ref_fasta_file)
    # iterate through read pairs
    outfh = open(output_file, "w")
    logging.debug("Parsing discordant reads")
    for r5p, r3p in parse_gene_discordant_reads(bamfh):
        # store pertinent read information in lightweight structure called
        # DiscordantRead object.  this departs from SAM format into a
        # custom read format
        dr5p = DiscordantRead.from_read(r5p)
        dr3p = DiscordantRead.from_read(r3p)
        # get gene information
        tx5p = tid_tx_map[r5p.rname]
        tx3p = tid_tx_map[r3p.rname]
        # given the insert size find the highest probability
        # exon junction breakpoint between the two transcripts
        isize_prob, breakpoints = \
            choose_best_breakpoints(r5p, r3p, tx5p, tx3p,
                                    trim_bp, isize_dist)
        # extract the sequence of the breakpoint along with the
        # number of homologous bases at the breakpoint between
        # chimera and wildtype genes
        for breakpoint in breakpoints:
            exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = breakpoint
            breakpoint_seq_5p, breakpoint_seq_3p, homology_left, homology_right = \
                extract_breakpoint_sequence(config.GENE_REF_PREFIX + tx5p.tx_name,
                                            tx_end_5p,
                                            config.GENE_REF_PREFIX + tx3p.tx_name,
                                            tx_start_3p, ref_fa,
                                            max_read_length,
                                            homology_mismatches)
            # write breakpoint information for each read to a file
            fields = [tx5p.tx_name, 0, tx_end_5p,
                      tx3p.tx_name, tx_start_3p, tx3p.tx_end,
                      r5p.rname,  # name
                      isize_prob,  # score
                      tx5p.strand, tx3p.strand,  # strand 1, strand 2
                      # user defined fields
                      exon_num_5p, exon_num_3p,
                      breakpoint_seq_5p, breakpoint_seq_3p,
                      homology_left, homology_right]
            fields.append('|'.join(map(str, dr5p.to_list())))
            fields.append('|'.join(map(str, dr3p.to_list())))
            print >> outfh, '\t'.join(map(str, fields))
    # cleanup
    ref_fa.close()
    outfh.close()
    bamfh.close()
    return config.JOB_SUCCESS
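# build_tid_tx_map translates a BAM reference id (tid) into the transcript
# feature it represents, and is defined elsewhere in the package.  A minimal
# sketch of the idea, assuming the BAM reference names are the transcript
# names with rname_prefix prepended (as the fetch calls above suggest):
def build_tid_tx_map(bamfh, gene_file, rname_prefix=''):
    # map transcript name -> GeneFeature object
    tx_name_map = dict((g.tx_name, g)
                       for g in GeneFeature.parse(open(gene_file)))
    tid_tx_map = {}
    # tid is the index of the reference name in the BAM header
    for tid, rname in enumerate(bamfh.references):
        if not rname.startswith(rname_prefix):
            continue
        tx_name = rname[len(rname_prefix):]
        if tx_name in tx_name_map:
            tid_tx_map[tid] = tx_name_map[tx_name]
    return tid_tx_map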
def create_chimerascan_index(output_dir, genome_fasta_file,
                             gene_feature_file, bowtie_build_bin):
#                            min_fragment_size,
#                            max_fragment_size):
    # create output dir if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        logging.info("Created index directory: %s" % (output_dir))
    # copy reference fasta file to output dir
    index_fasta_file = os.path.join(output_dir, ALIGN_INDEX + ".fa")
    if (up_to_date(index_fasta_file, genome_fasta_file) and
        up_to_date(index_fasta_file, gene_feature_file)):
        logging.info("[SKIPPED] Adding reference genome to index")
    else:
        logging.info("Adding reference genome to index")
        shutil.copyfile(genome_fasta_file, index_fasta_file)
        # index the genome fasta file
        logging.info("Indexing FASTA file")
        fh = pysam.Fastafile(index_fasta_file)
        fh.close()
        # append sequences from gene feature file
        logging.info("Adding transcript sequences to index...")
        fh = open(index_fasta_file, "a")
        for fa_record in bed12_to_fasta(gene_feature_file, index_fasta_file):
            print >> fh, fa_record
        fh.close()
        # remove old fasta index
        os.remove(index_fasta_file + ".fai")
        # re-index the combined fasta file
        logging.info("Re-indexing FASTA file...")
        fh = pysam.Fastafile(index_fasta_file)
        fh.close()
    # build bowtie index on the reference sequence file
    bowtie_index_file = os.path.join(output_dir, BOWTIE_INDEX_FILE)
    msg = "Building bowtie index"
    if up_to_date(bowtie_index_file, index_fasta_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bowtie_index_name = os.path.join(output_dir, ALIGN_INDEX)
        args = [bowtie_build_bin, index_fasta_file, bowtie_index_name]
        if subprocess.call(args) != os.EX_OK:
            logging.error("bowtie-build failed to create alignment index")
            if os.path.exists(bowtie_index_file):
                os.remove(bowtie_index_file)
            return JOB_ERROR
    # copy gene bed file to index directory
    dst_gene_feature_file = os.path.join(output_dir, GENE_FEATURE_FILE)
    if up_to_date(dst_gene_feature_file, gene_feature_file):
        logging.info("[SKIPPED] Adding transcript features to index...")
    else:
        logging.info("Adding transcript features to index...")
        shutil.copyfile(gene_feature_file, dst_gene_feature_file)
    # create tophat junctions file from gene features
#    juncs_file = os.path.join(output_dir, TOPHAT_JUNCS_FILE)
#    if up_to_date(juncs_file, dst_gene_feature_file):
#        logging.info("[SKIPPED] Creating splice junction file...")
#    else:
#        logging.info("Creating splice junction file...")
#        fh = open(juncs_file, "w")
#        for junc_line in create_tophat_juncs_file(output_dir, gene_feature_file):
#            print >>fh, junc_line
#        fh.close()
    # build special index used to discover the fragment size
#    frag_size_index_file = os.path.join(output_dir, FRAG_SIZE_INDEX_FILE)
#    if up_to_date(frag_size_index_file, index_fasta_file):
#        logging.info("[SKIPPED] Building fragment size distribution index")
#    else:
#        logging.info("Building fragment size distribution index")
#        retcode = create_fragment_size_index(output_dir, gene_feature_file,
#                                             genome_fasta_file,
#                                             bowtie_build_bin,
#                                             max_fragment_size)
#        if retcode != os.EX_OK:
#            logging.error("bowtie-build failed to create fragment size "
#                          "distribution index")
#            if os.path.exists(frag_size_index_file):
#                os.remove(frag_size_index_file)
#            return JOB_ERROR
    logging.info("chimerascan index created successfully")
    return JOB_SUCCESS
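# up_to_date gates each index-building step above so that re-running the
# command skips any output that is already newer than its inputs.  The real
# helper is defined elsewhere in the package; a minimal sketch, assuming it
# only compares file modification times:
def up_to_date(outfile, infile):
    # the output must exist before it can be considered current
    if not os.path.exists(outfile):
        return False
    if not os.path.exists(infile):
        return False
    return os.path.getmtime(outfile) >= os.path.getmtime(infile)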
def determine_chimera_breakpoints(index_dir, read_length,
                                  input_chimera_file, output_chimera_file,
                                  breakpoint_map_file, breakpoint_fasta_file,
                                  homology_mismatches=DEFAULT_HOMOLOGY_MISMATCHES):
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
    ref_fa = pysam.Fastafile(ref_fasta_file)
    # output files
    chimerafh = open(output_chimera_file, "w")
    breakpointfh = open(breakpoint_map_file, "w")
    fasta_output_fh = open(breakpoint_fasta_file, "w")
    breakpoints = collections.defaultdict(lambda: [])
    breaknum = 0
    for c in Chimera.parse(open(input_chimera_file)):
        # retrieve transcript coordinates of 5' and 3' partners
        ref5p = config.GENE_REF_PREFIX + c.partner5p.tx_name
        ref3p = config.GENE_REF_PREFIX + c.partner3p.tx_name
        start5p, end5p = c.partner5p.start, c.partner5p.end
        start3p, end3p = c.partner3p.start, c.partner3p.end
        # get intervals for breakpoint sequence
        breakpoint_start5p = max(start5p, end5p - read_length + 1)
        breakpoint_end3p = min(end3p, start3p + read_length - 1)
        # fetch sequence
        seq5p = ref_fa.fetch(ref5p, breakpoint_start5p, end5p)
        seq3p = ref_fa.fetch(ref3p, start3p, breakpoint_end3p)
        if len(seq5p) < read_length - 1:
            logging.warning("Could not extract sequence of length >%d from "
                            "5' partner of chimera %s, only retrieved "
                            "sequence of %d" % (read_length-1, c.name, len(seq5p)))
            # pad sequence
            padding = (read_length - 1) - len(seq5p)
            seq5p = ("N" * padding) + seq5p
        if len(seq3p) < read_length - 1:
            logging.warning("Could not extract sequence of length >%d from "
                            "3' partner of chimera %s, only retrieved "
                            "sequence of %d" % (read_length-1, c.name, len(seq3p)))
            # pad sequence
            padding = (read_length - 1) - len(seq3p)
            seq3p = seq3p + ("N" * padding)
        # fetch continuation sequence of non-fusion gene
        homolog_end5p = end5p + read_length - 1
        homolog_start3p = max(0, start3p - read_length + 1)
        homolog5p = ref_fa.fetch(ref3p, homolog_start3p, start3p)
        homolog3p = ref_fa.fetch(ref5p, end5p, homolog_end5p)
        # find homology between 5' gene and 3' gene
        homology_length_5p = calc_homology(seq5p[::-1], homolog5p[::-1],
                                           homology_mismatches)
        homology_length_3p = calc_homology(seq3p, homolog3p,
                                           homology_mismatches)
        # create a Breakpoint and add to dictionary
        seq = seq5p + seq3p
        if seq in breakpoints:
            b = breakpoints[seq]
        else:
            b = Breakpoint()
            b.name = "B%07d" % (breaknum)
            breaknum += 1
            b.seq5p = seq5p
            b.seq3p = seq3p
            breakpoints[seq] = b
        # add sequence to dictionary and group fusion candidates together
        # if they have the same location and junction sequence
        b.chimera_names.append(c.name)
        # update Chimera object with breakpoint information
        c.breakpoint_name = b.name
        c.breakpoint_homology_5p = homology_length_5p
        c.breakpoint_homology_3p = homology_length_3p
        # write Chimera
        fields = c.to_list()
        print >>chimerafh, '\t'.join(map(str, fields))
    # now extract the unique junction sequences
    # and write them to a fasta file
    for seq, b in breakpoints.iteritems():
        # write to fasta file
        print >>fasta_output_fh, ">%s\n%s" % (b.name, seq)
        # write to breakpoint map file
        fields = b.to_list()
        print >>breakpointfh, '\t'.join(map(str, fields))
    # close files
    fasta_output_fh.close()
    breakpointfh.close()
    chimerafh.close()
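# The Breakpoint objects populated above only need a handful of attributes in
# this function: a name, the 5' and 3' breakpoint sequences, and the list of
# chimera names sharing that sequence.  A minimal container matching that
# usage is sketched below; the field order of to_list() is an assumption, not
# taken from the real class definition:
class Breakpoint(object):
    def __init__(self):
        self.name = None
        self.seq5p = ''
        self.seq3p = ''
        self.chimera_names = []

    def to_list(self):
        # serialize as name, 5' sequence, 3' sequence, then the chimera names
        return [self.name, self.seq5p, self.seq3p,
                ','.join(self.chimera_names)]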