示例#1
0
def buildCovModel(readF, dfilter=(80, 80, 180), mapq=1):
    """
    Build genome coverage profiles for MNase-seq data based on HTSeq.

    Parameters
    ---
    readF: str, path to a gzip-compressed, tab-separated .bedpe.gz file
    dfilter: sequence of three ints. PETs spanning <= dfilter[0] bp are
             counted as particles; PETs spanning > dfilter[1] and
             <= dfilter[2] bp are counted as canonical nucleosomes.
    mapq: int, MAPQ cutoff; PETs whose 8th column is below it are skipped.

    Returns
    ---
    Canonical nucleosome PETs number, int
    Particle PETs number, int
    Canonical nucleosome PETs coverage, HTSeq.GenomicArray
    Particle PETs coverage, HTSeq.GenomicArray
    """
    print("building models for %s" % readF)
    modelCn = HTSeq.GenomicArray("auto", stranded=False)
    modelSp = HTSeq.GenomicArray("auto", stranded=False)
    cn, sp = 0, 0
    # (chrom, start, end) of every PET already counted, to drop duplicates.
    reds = set()
    with gzip.open(readF, 'rt') as fin:
        for i, line in enumerate(fin):
            if i % 10000 == 0:
                report = "%s lines genome signal read." % i
                cFlush(report)
            line = line.split("\n")[0].split("\t")
            # The MAPQ is read from index 7, so 8 columns are required.
            if len(line) < 8:
                continue
            # Both ends must map to the same chromosome.
            if line[0] != line[3]:
                continue
            if int(line[7]) < mapq:
                continue
            s = min(int(line[1]), int(line[4]))
            e = max(int(line[2]), int(line[5]))
            d = e - s
            r = (line[0], s, e)
            if r in reds:
                continue
            reds.add(r)
            iv = HTSeq.GenomicInterval(line[0], s, e)
            if d <= dfilter[0]:  # sub-nucleosomal particle
                sp += 1
                modelSp[iv] += 1
            if dfilter[1] < d <= dfilter[2]:  # canonical nucleosome
                cn += 1
                modelCn[iv] += 1
    return cn, sp, modelCn, modelSp
def makeIslandFilteredGraphFile(chroms, chrom_lengths, window_size, bamfile,
                                islandbedfile, outfile):
    """Write windowed, island-restricted paired-end coverage to ``outfile``.

    Read pairs from ``bamfile`` are kept when both mates are aligned with
    NH == 1, lie on the same chromosome (one listed in ``chroms``) and on
    different strands. Each matched ("M") CIGAR segment of a kept pair adds
    ``1.0 / read_length`` per base to a coverage array, where ``read_length``
    comes from ``get_read_length``. Only coverage inside the intervals of
    ``islandbedfile`` is retained.

    For every full window of ``window_size`` bases on every chromosome the
    retained coverage is summed, truncated to an int and, when non-zero,
    written as ``chrom<TAB>start<TAB>end<TAB>count``. A trailing partial
    window is not reported.

    ``chrom_lengths`` maps chromosome name to its length.
    """
    ga = HTSeq.GenomicArray(chroms, stranded=False, typecode='d')

    bam_reader = HTSeq.BAM_Reader(bamfile)
    for alt_first, alt_second in HTSeq.pair_SAM_alignments(bam_reader):
        if alt_first is None or alt_second is None:
            continue
        if not (alt_first.aligned and alt_first.optional_field("NH") == 1
                and alt_second.aligned
                and alt_second.optional_field("NH") == 1):
            continue
        if (alt_first.iv.chrom != alt_second.iv.chrom
                or alt_first.iv.strand == alt_second.iv.strand
                or alt_first.iv.chrom not in chroms):
            continue

        alt_first_iv_seq = [
            co.ref_iv for co in alt_first.cigar
            if co.type == "M" and co.size > 0
        ]
        # The second mate's segments are strand-flipped before merging.
        alt_second_iv_seq = [
            reverse_strand(co.ref_iv) for co in alt_second.cigar
            if co.type == "M" and co.size > 0
        ]
        alt_iv_seq = combine_pair_iv_seq(alt_first_iv_seq, alt_second_iv_seq)

        read_length = get_read_length(alt_iv_seq)
        for alt_iv in alt_iv_seq:
            ga[alt_iv] += 1.0 / read_length

    # Copy only the coverage that falls inside the island intervals.
    ga_island = HTSeq.GenomicArray(chroms, stranded=False, typecode='d')
    for alt in HTSeq.BED_Reader(islandbedfile):
        for iv, value in ga[alt.iv].steps():
            ga_island[iv] += value

    with open(outfile, 'w') as f:
        for chrom in chroms:
            # Floor division keeps the window count an int on Python 3.
            num_windows = chrom_lengths[chrom] // window_size
            for i in range(num_windows):
                window_start = i * window_size
                window_end = window_start + window_size
                window_iv = HTSeq.GenomicInterval(chrom, window_start,
                                                  window_end)
                count_in_window = 0
                for iv, value in ga_island[window_iv].steps():
                    count_in_window += value * iv.length
                count_in_window = int(count_in_window)
                if count_in_window != 0:
                    f.write('\t'.join([
                        chrom,
                        str(window_start),
                        str(window_end),
                        str(count_in_window)
                    ]) + '\n')
    def run(self):
        """Compute the alpha cutoffs for every entry of ``self.io_tuples``.

        Loads the genome index, the whitelist BED files and the rG4 motif
        BED files once, then for each (forward tsv, reverse tsv, cutoff
        config path, cutoff dump path) tuple calls ``unknown_fdr_cut`` and
        writes the first and second alpha cutoffs, one per line, to the
        config path. When ``self.debug_dump`` is set the dump lines are
        written to the dump path. Existing config files are skipped unless
        ``self.overwrite`` is set.
        """
        logging.info('[{0}] Target cutoff at FDR={1}'.format(
            time.ctime(), self.target_fdr))
        genome = apriori_rts_analysis.FaidxGenome(self.genome_fasta)
        whitelist = HTSeq.GenomicArray(chroms='auto',
                                       stranded=True,
                                       storage='step',
                                       typecode='b')
        for whitelist_bed in self.whitelist_bed_list:
            apriori_rts_analysis.load_whitelist(whitelist, whitelist_bed)

        # The middle element (motif array) is not used by this method.
        motif_info, _motif_array, threeprime_array = (
            apriori_rts_analysis.load_rg4_motif(self.motif_bed_list,
                                                _all_rts_array=None))

        class_names = [
            'canonical/G3L1-7', 'longloop', 'bulges', 'two-quartet'
        ]
        for fwd_tsv, rev_tsv, config_path, dump_path in self.io_tuples:
            if (not self.overwrite and os.path.exists(config_path)
                    and os.path.isfile(config_path)):
                continue
            u_treatment, treatment, u_decoy, decoy = single_sample_load_fsrtsv(
                fwd_tsv, rev_tsv, whitelist, genome)
            rts_marks = HTSeq.GenomicArray(chroms='auto',
                                           stranded=True,
                                           typecode='b')
            # Flag every interval of the first loaded store on both strands.
            for strand in ('+', '-'):
                for record in u_treatment[strand]:
                    rts_marks[record[0]] = True
            cutoff, cutoff_dump = unknown_fdr_cut(
                treatment, motif_info, threeprime_array,
                frozenset(class_names), genome, self.target_fdr)
            assert cutoff
            with open(config_path, 'w') as out:
                out.write(str(cutoff.first_alpha_cutoff) + '\n')
                out.write(str(cutoff.second_alpha_cutoff) + '\n')

            if self.debug_dump:
                with open(dump_path, 'w') as out:
                    for entry in cutoff_dump:
                        out.write(str(entry) + '\n')

            # Drop the per-sample objects before the next iteration.
            del u_treatment, treatment, u_decoy, decoy
            del rts_marks
            del cutoff, cutoff_dump
示例#4
0
def get_window_counts_and_normalize(window_dict, tags_dict, genome_data, scaling_factor, total_reads, window_size):
    """Count reads per window and build a normalized score track.

    For each chromosome in ``genome_data`` and each window start listed in
    ``window_dict[chrom]``, the read count is obtained from
    ``get_read_count_in_window`` and scaled as
    ``count * scaling_factor / total_reads``.

    Returns a tuple of:
      * dict mapping chromosome -> list of ``[window_start, read_count, 0]``
      * unstranded HTSeq.GenomicArray (typecode 'd') holding the normalized
        score of every window (used to generate a bedgraph file)
    """
    normalized_window_array = HTSeq.GenomicArray(genome_data, stranded=False, typecode='d')

    # Every chromosome gets an entry, even one without any window.
    window_counts_dict = {}
    for chrom in genome_data:
        window_counts_dict[chrom] = []

    for chrom in genome_data:
        per_chrom = window_counts_dict[chrom]
        for start in window_dict[chrom]:
            n_reads = get_read_count_in_window(chrom, start, window_size, tags_dict)
            per_chrom.append([start, n_reads, 0])

            score = float(n_reads) * float(scaling_factor) / float(total_reads)
            span = HTSeq.GenomicInterval(chrom, start, start + window_size)
            normalized_window_array[span] = score

    return window_counts_dict, normalized_window_array
示例#5
0
 def find_clusters(self):
     """Locate covered runs of ``self.exon_coverage`` and count the tall ones.

     Sets ``self.clusters_as_indices_in_exon_coverage_array`` to the groups
     returned by ``self.consecutive`` for the non-zero positions, marks
     every grouped position in ``self.clusters_as_ga`` and stores in
     ``self.clusters`` the coverage slices whose maximum exceeds 1.

     Returns the number of stored clusters. Prints "...Failure" when the
     coverage is non-empty but no cluster qualifies.
     """
     # TO DO (kept from the original): some groups come back empty and the
     # cause is unknown; empty groups are skipped below.
     self.clusters = []
     self.clusters_as_ga = HTSeq.GenomicArray('auto', stranded=True)
     clusters = self.consecutive(np.nonzero(self.exon_coverage)[0])
     self.clusters_as_indices_in_exon_coverage_array = clusters
     if (len(self.exon_coverage) == 0) or (len(clusters) == 0):
         return 0
     # A plain list is used on purpose: the groups may differ in length and
     # np.array() rejects such ragged input on recent NumPy versions.
     coverage_runs = [self.exon_coverage[i] for i in clusters]
     for _index_array in clusters:
         for pos in _index_array:
             self.clusters_as_ga[self.to_iv(pos)] += 1
     self.clusters = [
         run for run in coverage_runs if len(run) > 0 and np.max(run) > 1
     ]
     n_clusters = len(self.clusters)
     if n_clusters == 0:
         print("...Failure")
     return n_clusters
示例#6
0
def createMTrack(dirName):
    '''Merge all mapped tracks in a directory into one wig file per strand.

    Every file found by ``cg.recurseDir(dirName, end='.out')`` is read with
    HTSeq.BowtieReader and its aligned reads are added to one stranded
    coverage array over ``cg.humanChromosomes``. The plus and minus strands
    are written to ``<dirName>/Merge.hg19.1.wig`` and
    ``<dirName>/Merge.hg19.-1.wig``, then passed to ``updateWigLength`` and
    ``cgSort.wigSort``.
    '''
    fileList = cg.recurseDir(dirName, end='.out')

    chroms = cg.humanChromosomes
    # Single-argument print() calls behave the same on Python 2 and 3.
    print('Making Bed File vectors')
    cvg = HTSeq.GenomicArray(chroms, stranded=True, typecode='i')
    for fName in fileList:
        print(fName)
        alignment_file = HTSeq.BowtieReader(fName)
        for alngt in alignment_file:
            if alngt.aligned:
                try:
                    cvg.add_value(1, alngt.iv)  # iv is the genomic interval
                except KeyError:
                    # Best-effort by design: reads that raise KeyError
                    # (presumably chromosomes outside `chroms`) are skipped.
                    pass

    bedNamePos = dirName + '/Merge.' + 'hg19' + '.1.wig'
    bedNameNeg = dirName + '/Merge.' + 'hg19' + '.-1.wig'

    print('Writing Bed File')
    cvg.write_bedgraph_file(bedNamePos, "+")
    cvg.write_bedgraph_file(bedNameNeg, "-")

    # Now extend it
    updateWigLength(bedNamePos, 'hg19')
    updateWigLength(bedNameNeg, 'hg19')

    # Now sort it.
    cgSort.wigSort(bedNamePos)
    cgSort.wigSort(bedNameNeg)
示例#7
0
def call_peaks_from_bam(clip_bam_filename, config):
    """Call peaks on a CLIP BAM file, per chromosome and strand.

    ``config`` must provide 'gtf_filename_noheader', 'rna_seq_filename' and
    'neg_ip_filename'. Coverage is accumulated from the aligned reads of
    ``clip_bam_filename``; chromosomes are taken from column '0' of the
    dataframe returned by ``create_gtf_with_names_file``.

    Returns a dict ``{chrom: {strand: peaks}}`` holding, for each strand,
    the merged peak objects after the signal and statistics helpers ran on
    them.
    """
    gtf_noheader_filename = config['gtf_filename_noheader']
    bamfiles = {'clip': clip_bam_filename,
                'rna_seq': config['rna_seq_filename'],
                'neg_ip': config['neg_ip_filename']}
    clip_bamfile = HTSeq.BAM_Reader(clip_bam_filename)
    coverage = HTSeq.GenomicArray("auto", stranded=True, typecode='i')
    # Single-argument print() calls behave the same on Python 2 and 3.
    print("Reading alignments from bamfile...")
    for aln in clip_bamfile:  # Very slow.
        if aln.aligned:
            coverage[aln.iv] += 1
    print("Creating gtf file and dataframe...")
    gtf_df = create_gtf_with_names_file(gtf_noheader_filename)
    print("Calling peaks...")
    peaks_by_chrm = {}
    peak_objs_by_chrm = {}
    for chrm in dict(gtf_df['0'].value_counts()):
        peaks_by_chrm[chrm] = {}
        peak_objs_by_chrm[chrm] = {}
        for strand in ['+', '-']:
            peaks_by_chrm[chrm][strand] = find_peaks(
                coverage, chrm=chrm, strand=strand)
            peak_objs_by_chrm[chrm][strand] = find_borders(
                peaks_by_chrm[chrm][strand], coverage, chrm, strand)
            peak_objs_by_chrm[chrm][strand] = merge_overlapping_on_chrm_and_strand(
                peak_objs_by_chrm[chrm][strand], coverage)
            # The helpers below are called only for their side effects.
            assign_to_gene(peak_objs_by_chrm, chrm, strand, gtf_df)
            add_local_signal(peak_objs_by_chrm[chrm][strand], bamfiles)
            add_gene_signal(peak_objs_by_chrm[chrm][strand], gtf_df, bamfiles)
            do_statistics(peak_objs_by_chrm[chrm][strand], bamfiles)
            any_have_na(peak_objs_by_chrm[chrm][strand])
    return peak_objs_by_chrm
示例#8
0
def createTrack(fName, organism):
    '''Create sorted per-strand wig files from one Bowtie alignment file.

    ``organism`` selects the chromosome list and assembly name:
    'human' (hg19), 'mouse' (mm9) or 'zebrafish' (danRer6).

    The plus and minus strand coverage is written to ``<fName>.1.wig`` and
    ``<fName>.-1.wig``; both files are then passed to ``updateWigLength``
    and ``cgSort.wigSort``.

    Raises ValueError for any other ``organism``.
    '''
    if organism == 'human':
        chroms = cg.humanChromosomes
        assembly = 'hg19'
    elif organism == 'mouse':
        chroms = cg.mouseChromosomes
        assembly = 'mm9'
    elif organism == 'zebrafish':
        chroms = cg.zebrafishChromosomes
        assembly = 'danRer6'
    else:
        # Without this branch `chroms` would be unbound further down.
        raise ValueError('unsupported organism: %r' % (organism,))

    alignment_file = HTSeq.BowtieReader(fName)
    cvg = HTSeq.GenomicArray(chroms, stranded=True, typecode='i')
    for alngt in alignment_file:
        if alngt.aligned:
            cvg.add_value(1, alngt.iv)  # iv is the genomic interval

    bedNamePos = fName + '.1.' + 'wig'
    bedNameNeg = fName + '.-1.' + 'wig'

    cvg.write_bedgraph_file(bedNamePos, "+")
    cvg.write_bedgraph_file(bedNameNeg, "-")

    # Now extend it.
    updateWigLength(bedNamePos, assembly)
    updateWigLength(bedNameNeg, assembly)

    # Now sort it.
    cgSort.wigSort(bedNamePos)
    cgSort.wigSort(bedNameNeg)
示例#9
0
def collapse_unannotated_exons(stranded, unannot_exon_dict):
    """Collapse overlapping unannotated exon intervals per gene.

    Similar to what dexseq_prepare_annotation.py does: for every gene, all
    exon intervals are painted onto a GenomicArray and each resulting step
    with value 1 is kept, so overlapping exons merge into one interval.

    stranded: passed through to HTSeq.GenomicArray.
    unannot_exon_dict: dict mapping gene name -> iterable of genomic
        intervals.

    Returns a defaultdict(set) mapping gene name -> set of collapsed
    genomic intervals. Genes without any exon get no entry.
    """
    collapsed_unannot_dict = defaultdict(set)

    # .items() works on both Python 2 and Python 3 dictionaries.
    for gene_name, exon_list in unannot_exon_dict.items():
        ga = HTSeq.GenomicArray('auto', stranded=stranded)
        for exon_iv in exon_list:
            ga[exon_iv] = 1  # real exons are set to 1, in-between stays 0

        for exon_iv, num in ga.steps():
            if num == 1:
                collapsed_unannot_dict[gene_name].add(exon_iv)

    return collapsed_unannot_dict
示例#10
0
    def write_motif_track(cls, genomic_fasta, motif, motif2=None):
        """Write per-strand bedgraph tracks of motif locations.

        For every (chrom, seq) pair of ``genomic_fasta`` the locations of
        ``motif`` are added on the '+' strand and those of ``cls.rc(motif)``
        on the '-' strand via ``cls.add_motif_locations``. When ``motif2``
        is given it is added the same way with ``value=2``.

        The tracks are written to ``beds/motif_bedgraphs/<motif>_+.wig`` and
        ``beds/motif_bedgraphs/<motif>_-.wig``.
        """
        track = HTSeq.GenomicArray('auto', stranded=True)
        # '+' uses the motifs as given, '-' passes them through cls.rc.
        strand_plan = (('+', lambda m: m), ('-', cls.rc))

        for chrom, seq in genomic_fasta.items():
            print(chrom, )

            for strand, orient in strand_plan:
                cls.add_motif_locations(chrom, seq, strand, orient(motif),
                                        track)
                if motif2 is None:
                    continue
                cls.add_motif_locations(chrom,
                                        seq,
                                        strand,
                                        orient(motif2),
                                        track,
                                        value=2)

        cls._mk('beds/')
        cls._mk('beds/motif_bedgraphs/')

        plus_path = 'beds/motif_bedgraphs/{}_+.wig'.format(motif)
        minus_path = 'beds/motif_bedgraphs/{}_-.wig'.format(motif)
        track.write_bedgraph_file(plus_path, '+')
        track.write_bedgraph_file(minus_path, '-')
示例#11
0
def bg2GModel(bg):
    """
    Load a BedGraph file (gzip-compressed or plain) into a HTSeq.GenomicArray.

    Each line must hold at least four tab-separated columns: chromosome,
    start, end and value. Shorter lines are skipped.

    Returns the unstranded HTSeq.GenomicArray holding the values.
    """
    opener = gzip.open if bg.endswith(".gz") else open
    # Formatted into one string so print() behaves the same on Python 2/3.
    print("%s Start building model for %s" % (datetime.now(), bg))
    model = HTSeq.GenomicArray("auto", stranded=False)
    # Text mode so that gzip input yields str, not bytes, on Python 3.
    with opener(bg, "rt") as f:
        for i, line in enumerate(f):
            if i % 10000 == 0:
                report = "%s lines genome signal read." % i
                commandFlush(report)
            line = line.split("\n")[0].split("\t")
            # The value is read from index 3, so four columns are required.
            if len(line) < 4:
                continue
            chrom = line[0]
            s = int(line[1])
            e = int(line[2])
            iv = HTSeq.GenomicInterval(chrom, s, e)
            model[iv] = float(line[3])
    print("")
    print("%s Model built for %s" % (datetime.now(), bg))
    return model
def get_bedgraph(do_combine_bedgraphs=False,
                 bedgraphs_folder='data/wigs/',
                 lib=None):
    """Load the plus/minus strand bedgraph files into one stranded array.

    When ``lib`` is given, the folder and both file paths are taken from
    its 'coverage_wigs', 'bedgraph_exp_plus' and 'bedgraph_exp_minus'
    entries. Otherwise the files ``both_fbfs_plus.bed`` and
    ``both_fbfs_minus.bed`` inside ``bedgraphs_folder`` are used.

    When ``do_combine_bedgraphs`` is true, ``combine_bedgraphs`` is run on
    the folder first.

    The first line of each file is skipped as a header. Returns a stranded
    HTSeq.GenomicArray.
    """
    if lib is not None:
        bedgraphs_folder = lib['coverage_wigs']
    # Normalize before building paths so a folder given without a trailing
    # slash still yields "<folder>/both_fbfs_plus.bed".
    bedgraphs_folder = bedgraphs_folder.rstrip('/') + '/'
    if lib is not None:
        bedgraph_exp_plus = lib['bedgraph_exp_plus']
        bedgraph_exp_minus = lib['bedgraph_exp_minus']
    else:
        bedgraph_exp_plus = bedgraphs_folder + 'both_fbfs_plus.bed'
        bedgraph_exp_minus = bedgraphs_folder + 'both_fbfs_minus.bed'
    if do_combine_bedgraphs:
        combine_bedgraphs(bedgraphs_folder=bedgraphs_folder)
    ga = HTSeq.GenomicArray(chroms='auto', stranded=True)
    for path, strand in ((bedgraph_exp_plus, '+'), (bedgraph_exp_minus, '-')):
        with open(path, 'r') as f:
            next(f, None)  # skip the header line; tolerate an empty file
            for line in f:
                s = line.rstrip('\n').split('\t')
                ga[HTSeq.GenomicInterval(s[0], int(s[1]), int(s[2]),
                                         strand)] = float(s[3])
    return ga
示例#13
0
def ave_ga(file_list):
    """Return the array from ``build_ga(file_list)`` divided by the file count.

    Every step of the built array is copied into a new stranded
    HTSeq.GenomicArray with its score divided by ``len(file_list)``.
    """
    combined = build_ga(file_list)
    n_files = len(file_list)
    averaged = HTSeq.GenomicArray('auto', stranded=True)
    for interval, total in combined.steps():
        averaged[interval] = total / n_files
    return averaged
示例#14
0
    def HTSeq(self, bamlist):
        """Export normalized per-base coverage of a fixed window to a TSV.

        For every BAM file in ``bamlist`` each aligned read is shrunk to
        length 1 and counted in a shared stranded coverage array. After each
        file the coverage over the window is divided by
        ``p.H3K27ac_bam[sample]``, where ``sample`` is the part of the file
        name before the first underscore. The per-sample columns are written
        to a hard-coded output path.
        """
        # Alternative window (Axin2):
        #   HTSeq.GenomicInterval("chr11", 108914532, 108954079, "+")
        # Elf3
        window = HTSeq.GenomicInterval("chr1", 135253574, 135258472, "-")
        # NOTE(review): this array is shared by all BAM files, so each
        # sample's column also contains the reads of the files before it.
        # Confirm this is intended.
        coverage = HTSeq.GenomicArray("auto", stranded=True, typecode="i")
        columns = []
        samplelist = []
        for bam_path in bamlist:
            sample = os.path.basename(bam_path).split("_")[0]
            samplelist.append(sample)
            for almnt in HTSeq.BAM_Reader(bam_path):
                if not almnt.aligned:
                    continue
                almnt.iv.length = 1
                coverage[almnt.iv] += 1

            window_signal = np.fromiter(coverage[window], dtype=float)
            columns.append(window_signal / p.H3K27ac_bam[sample])
        df = pd.DataFrame(np.array(columns).T)
        df.columns = samplelist
        ordered = [
            "ctrl-H3K27ac", "2weeks-H3K27ac", "4weeks-H3K27ac",
            "7weeks-H3K27ac", "10weeks-H3K27ac"
        ]
        data = df[ordered]
        data.to_csv(
            "/data3/zhaochen/project/colon_cancer/colon_chip/peakUCSCplot/H3K27ac_Elf3.txt",
            sep="\t",
            index=False)
def main(cl=None):
    '''
    Compute genome-wide coverage of a SAM file and write it as a wiggle file.

    A CommandLine object is built (with ['-r'] when ``cl`` is not None),
    the file named by its "sam_file" argument is read, and the coverage of
    all aligned reads is written to "<output_prefix>.wig".

    Implements the Usage exception handler that can be raised from anywhere
    in the process: a Usage error is reported through
    ``cl.do_usage_and_die``.
    '''
    if cl is None:
        cl = CommandLine()
    else:
        cl = CommandLine(['-r'])

    try:
        print(cl.args)  # print the parsed argument string
        alignment_file = HTSeq.SAM_Reader(cl.args["sam_file"])

        # Get coverage for the whole genome
        cvg = HTSeq.GenomicArray("auto", stranded=False, typecode='i')
        for alngt in alignment_file:
            if alngt.aligned:
                cvg[alngt.iv] += 1

        # Write a "Wiggle" file for genome browser viewing
        cvg.write_bedgraph_file(cl.args["output_prefix"] + ".wig")

        # Per-gene/transcript coverage is not implemented yet.

    except Usage as err:  # "as" form is valid on Python 2.6+ and Python 3
        cl.do_usage_and_die(err.msg)
示例#16
0
 def add_raw_reads_to_utr(self, ga, chr_len):
     """Fill ``self.utr_arr`` with per-position scores of the region
     between ``cds_right`` and ``txpt_right``.

     ga: genomic array the scores are read from.
     chr_len: mapping of chromosome name -> length, used to mirror the
         coordinates when ``self.strand`` is '-'.

     When the region is shorter than 2 positions, ``self.utr_arr`` becomes
     ``[0]`` (or stays empty when ``txpt_right`` is left of ``cds_right``)
     and nothing is read from ``ga``. Otherwise ``self.utr_ga`` receives a
     copy of the steps and ``self.utr_arr`` one score per position.
     """
     self.utr_arr = []
     utr_span = self.txpt_right - self.cds_right
     if utr_span < 2:
         # A negative span leaves the list empty, as before.
         if utr_span >= 0:
             self.utr_arr = [0]
         return
     if self.strand == '-':
         # Mirror the coordinates on the chromosome for the minus strand.
         txpt_right = chr_len[self.chrom] - self.txpt_left
         cds_right = chr_len[self.chrom] - self.cds_left
     else:
         txpt_right = self.txpt_right
         cds_right = self.cds_right
     iv = HTSeq.GenomicInterval(self.chrom, cds_right, txpt_right,
                                self.strand)
     self.utr_ga = HTSeq.GenomicArray(chroms='auto', stranded=True)
     # The mirrored region can be shorter than the unmirrored one.
     if txpt_right - cds_right < 2:
         self.utr_arr = [0]
         return
     for _iv, score in ga[iv].steps():
         self.utr_ga[_iv] = score
         # One entry per position covered by this step.
         self.utr_arr.extend([score] * (_iv.end - _iv.start))
示例#17
0
def load_bedgraph(filename_list, ga, use_key=False):
    """Load a plus/minus bedgraph file pair into ``ga[key]``.

    ``filename_list[0]`` is read as the '+' strand and ``filename_list[1]``
    as the '-' strand; the first line of each file is skipped. The key is
    ``use_key`` when it is truthy, otherwise ``filename_list[0]``.

    When the key equals 'combined_fbf2.txt' every value is multiplied by a
    fixed ratio computed from hard-coded counts.
    """
    exp = use_key if use_key else filename_list[0]
    ga[exp] = HTSeq.GenomicArray(chroms='auto', stranded=True)
    ratio_fbf1_to_2 = float(9792191+3166675+10408265)/float(7680463+884888+5584323)
    rescale = (exp == 'combined_fbf2.txt')
    for position, strand in enumerate(('+', '-')):
        with open(filename_list[position], 'r') as handle:
            next(handle)  # header line
            for row in handle:
                cols = row.rstrip('\n').split('\t')
                if rescale:
                    value = ratio_fbf1_to_2 * float(cols[3])
                else:
                    value = float(cols[3])
                interval = HTSeq.GenomicInterval(cols[0], int(cols[1]),
                                                 int(cols[2]), strand)
                ga[exp][interval] = value
示例#18
0
    def create_tasks(self, ploidy: int, snps):
        """Create tasks.

        Every SNP is stored at its interval in a genomic array. For each
        segment the SNPs found on the exons of its isoforms are collected:
        all phased SNP ids of a segment go into one task, while each
        unphased SNP id gets a task of its own. Every SNP seen on any exon
        is finally passed once to ``self.store_snps``.
        """
        vc_sites = HTSeq.GenomicArray('auto', stranded=False, typecode='O')
        for snp in snps:
            vc_sites[snp.iv] = snp

        self._mc.handle_progress('Creating tasks...')
        seen_ids = set()
        chosen_snps = []
        task_no = -1
        for segment_id in self.get_all_segment_ids():
            segment = self.get_segment(segment_id)
            phased_ids, unphased_ids = [], []
            for isoform in segment.isoforms:
                for exon in isoform.exons:
                    for _, snp in vc_sites[exon].steps():
                        if snp is None:
                            continue
                        if snp.id not in seen_ids:
                            seen_ids.add(snp.id)
                            chosen_snps.append(snp)
                        # Lists keep the first-seen order of the ids.
                        target = phased_ids if snp.phased else unphased_ids
                        if snp.id not in target:
                            target.append(snp.id)
            if len(phased_ids) > 0:
                task_no = self._create_task(task_no, segment.id, True, ploidy,
                                            phased_ids)
            for snp_id in unphased_ids:
                task_no = self._create_task(task_no, segment.id, False,
                                            ploidy, [snp_id])

        self.store_snps(chosen_snps)
示例#19
0
def _extract_long_continuous_regions(gff_path: str, min_region_len: int,
                                     out_path: str, mc: MessageCenter):
    """Extract long continuous regions with length of at least min_region_len.

    Counts, per genomic position, the 'exon' features of the GFF file and
    writes every step with a non-zero count and a length of at least
    ``min_region_len`` to ``out_path`` as tab-separated
    chrom, start, end, length.

    Raises PEUtilPathError when ``gff_path`` does not exist.
    """
    for label, value in (('gff_path', gff_path),
                         ('min_region_len', min_region_len),
                         ('out_path', out_path)):
        mc.log_debug('{}: {}'.format(label, value))

    mc.handle_progress('Calculating long continuous regions...')

    region = HTSeq.GenomicArray('auto', stranded=False, typecode='i')

    if not os.path.exists(gff_path):
        raise PEUtilPathError(gff_path, 'File not exists.')

    for line_no, feature in enumerate(HTSeq.GFF_Reader(gff_path)):
        # Report progress every 100000 features, but not at the first one.
        if line_no != 0 and line_no % 100000 == 0:
            mc.handle_progress(
                '{} lines read from GFF file...'.format(line_no))
        if feature.type == 'exon':
            region[feature.iv] += 1

    with open(out_path, 'w') as out:
        for iv, depth in region.steps():
            if depth == 0:
                continue
            region_len = iv.end - iv.start
            if region_len < min_region_len:
                continue
            out.write('{0}\t{1}\t{2}\t{3}\n'.format(iv.chrom, iv.start,
                                                    iv.end, region_len))
示例#20
0
def offset_read_alignment_positions(bam=None, offsets=None):
    """ Adjust the reported position of reads based on the offsets.

    Calculates the offset position of the read based on the read length, if the offset
    is not defined then set the offset position to the midpoint of the read. Based on
    these offsets, the reported reference position of the read is adjusted to the
    requisite A- or P-site position of the ribosome.

    bam: iterable of alignments, or None.
    offsets: container mapping read length -> offset.

    Returns a stranded genomic array counting one hit per read at its offset
    position, or None when ``bam`` is None, when a ValueError occurs while
    processing the reads, or when no read was counted.
    """
    if bam is None:
        return None
    try:
        coverage = hts.GenomicArray(chroms='auto',
                                    stranded=True,
                                    typecode='i',
                                    storage='step')
        for alignment in bam:
            read_len = len(alignment.read.seq)
            offset = offsets[read_len] if read_len in offsets else read_len // 2
            ref_positions = convert_cigar_to_reference_coordinates(
                alignment.cigar)
            # Count from the read start on '+', from the read end otherwise.
            if alignment.iv.strand == '+':
                offset_pos = ref_positions[offset - 1]
            else:
                offset_pos = ref_positions[-offset]
            # Same module alias as the array above, so one import suffices.
            coverage[hts.GenomicPosition(alignment.iv.chrom, offset_pos,
                                         alignment.iv.strand)] += 1
    except ValueError:
        return None
    if len(coverage.chrom_vectors) == 0:
        return None
    return coverage
示例#21
0
def load_bedgraph(fname):
    """Load the '<stem>_+.wig' / '<stem>_-.wig' file pair into one array.

    ``<stem>`` is the part of ``fname`` before the first '.wig'. Returns a
    stranded HTSeq.GenomicArray filled by
    ``add_strand_to_ga_from_bedgraph_file`` for each strand.
    """
    stem = fname.partition('.wig')[0]
    ga = HTSeq.GenomicArray(chroms='auto', stranded=True)
    for suffix, strand in (('_+.wig', '+'), ('_-.wig', '-')):
        add_strand_to_ga_from_bedgraph_file(stem + suffix, ga, strand)
    return ga
def get_total_tag_counts(chroms, bamfile):
    """Count usable read pairs of a BAM file and build their coverage.

    A pair is kept when both mates are aligned with NH == 1, lie on the
    same chromosome (one listed in ``chroms``) and on different strands.
    Each matched ("M") CIGAR segment of a kept pair adds
    ``1.0 / read_length`` per base to the coverage array, where
    ``read_length`` comes from ``get_read_length``.

    Returns ``(tag_count, ga)``: the number of kept pairs and the
    unstranded HTSeq.GenomicArray (typecode 'd').
    """
    ga = HTSeq.GenomicArray(chroms, stranded=False, typecode='d')
    tag_count = 0
    bam_reader = HTSeq.BAM_Reader(bamfile)
    for alt_first, alt_second in HTSeq.pair_SAM_alignments(bam_reader):
        if alt_first is None or alt_second is None:
            continue
        if not (alt_first.aligned and alt_first.optional_field("NH") == 1
                and alt_second.aligned
                and alt_second.optional_field("NH") == 1):
            continue
        if (alt_first.iv.chrom != alt_second.iv.chrom
                or alt_first.iv.strand == alt_second.iv.strand
                or alt_first.iv.chrom not in chroms):
            continue

        tag_count += 1
        alt_first_iv_seq = [
            co.ref_iv for co in alt_first.cigar
            if co.type == "M" and co.size > 0
        ]
        # The second mate's segments are strand-flipped before merging.
        alt_second_iv_seq = [
            reverse_strand(co.ref_iv) for co in alt_second.cigar
            if co.type == "M" and co.size > 0
        ]
        alt_iv_seq = combine_pair_iv_seq(alt_first_iv_seq, alt_second_iv_seq)

        read_length = get_read_length(alt_iv_seq)
        for alt_iv in alt_iv_seq:
            ga[alt_iv] += 1.0 / read_length
    return tag_count, ga
def combine_bedgraphs(bedgraphs_folder='data/wigs_five_prime/'):
    """Sum the fbf1 and fbf2 bedgraph pairs and write the combined tracks.

    Loads ``fbf1_reads_{plus,minus}.bed`` and ``fbf2_reads_{plus,minus}.bed``
    from ``bedgraphs_folder`` under the keys 'combined_fbf1.txt' and
    'combined_fbf2.txt', adds both into ``ga['combined']`` and writes that
    array to ``both_fbfs_plus.bed`` and ``both_fbfs_minus.bed`` in the same
    folder.

    Returns the dict of all three genomic arrays.
    """
    ga = {}
    bedgraphs_folder = bedgraphs_folder.rstrip('/') + '/'
    sources = [
        (bedgraphs_folder + 'fbf1_reads_plus.bed',
         bedgraphs_folder + 'fbf1_reads_minus.bed', 'combined_fbf1.txt'),
        (bedgraphs_folder + 'fbf2_reads_plus.bed',
         bedgraphs_folder + 'fbf2_reads_minus.bed', 'combined_fbf2.txt'),
    ]
    for source in sources:
        scatterplot_correlation_by_wig.load_bedgraph(source,
                                                     ga,
                                                     use_key=source[2])

    combined = HTSeq.GenomicArray(chroms='auto', stranded=True)
    ga['combined'] = combined
    for key in ('combined_fbf1.txt', 'combined_fbf2.txt'):
        for iv, score in ga[key].steps():
            combined[iv] += score

    combined.write_bedgraph_file(bedgraphs_folder + 'both_fbfs_plus.bed', '+')
    combined.write_bedgraph_file(bedgraphs_folder + 'both_fbfs_minus.bed',
                                 '-')
    return ga
示例#24
0
def empty_array_from_file(bam_file, stranded=True, typecode="i"):
    """Return an empty GenomicArray with the chromosomes of a BAM header.

    Every 'SQ' entry of the header of ``bam_file`` is registered on the
    array with its name ('SN') and length ('LN').
    """
    cov_array = HTSeq.GenomicArray("auto",
                                   stranded=stranded,
                                   typecode=typecode)
    header = HTSeq.BAM_Reader(bam_file).get_header_dict()
    for sequence in header['SQ']:
        name, length = sequence['SN'], sequence['LN']
        cov_array.add_chrom(name, length)
    return cov_array
示例#25
0
def run(input_bed, output_bedgraph_unnorm, output_bedgraph_norm):
    """Convert every .bed file of a folder into per-strand bedgraph files.

    Each ``<input_bed>/*.bed`` file is loaded with ``add_to_ga`` into one of
    three pooled arrays chosen by its base name: experiment (name contains
    'fog', 'exp' or 'fbf'), control (name contains 'control' or 'n2') or
    other. The array returned by ``add_to_ga`` is written to
    ``<output_bedgraph_unnorm>/<name>_+.wig`` and ``..._-.wig``.

    The pooled experiment and control arrays are written as
    ``all_exp_{+,-}.wig`` and ``all_control_{+,-}.wig``; finally
    ``normalize_bedgraph.normalize_wig`` is called on the three folders.

    Both output folders are created when missing.
    """
    for folder in (output_bedgraph_unnorm, output_bedgraph_norm):
        if not os.path.exists(folder):
            # os.makedirs instead of a shell "mkdir": no shell quoting
            # problems, and a failure raises instead of passing silently.
            os.makedirs(folder)
    ga_all_exp = HTSeq.GenomicArray('auto', stranded=True)
    ga_all_control = HTSeq.GenomicArray('auto', stranded=True)
    ga_other = HTSeq.GenomicArray('auto', stranded=True)
    # One alternation each matches the same names as the separate patterns.
    exp_pattern = re.compile('.*(fog|exp|fbf).*')
    control_pattern = re.compile('.*(control|n2).*')
    for infile in glob.glob(input_bed + '/*.bed'):
        base = os.path.basename(infile)
        if exp_pattern.match(base) is not None:
            print(infile)
            ga = add_to_ga(infile, ga_all_exp)
        elif control_pattern.match(base) is not None:
            ga = add_to_ga(infile, ga_all_control)
        else:
            ga = add_to_ga(infile, ga_other)
        outname = "{d}/{b}".format(d=output_bedgraph_unnorm,
                                   b=base.partition('.bed')[0])
        print("Creating a bedgraph {c} from {a}...".format(c=outname,
                                                           a=infile))
        ga.write_bedgraph_file(outname + '_+.wig', strand='+')
        ga.write_bedgraph_file(outname + '_-.wig', strand='-')
    ga_all_exp.write_bedgraph_file(output_bedgraph_unnorm + '/all_exp_+.wig',
                                   strand='+')
    ga_all_exp.write_bedgraph_file(output_bedgraph_unnorm + '/all_exp_-.wig',
                                   strand='-')
    ga_all_control.write_bedgraph_file(output_bedgraph_unnorm +
                                       '/all_control_+.wig',
                                       strand='+')
    ga_all_control.write_bedgraph_file(output_bedgraph_unnorm +
                                       '/all_control_-.wig',
                                       strand='-')
    normalize_bedgraph.normalize_wig(input_bed, output_bedgraph_unnorm,
                                     output_bedgraph_norm)
示例#26
0
def bed2model(bg, mapq=1, noRedu=True, ext=150):
    """
    Convert a BED format file into HTSeq.GenomicArray to get the genomic coverage.

    Parameters
    ----
    bg: str, .bed or .bed.gz file
    mapq: int, MAPQ cutoff; records whose score (5th column) is lower are
        skipped.
    noRedu: bool, if True only the first record seen for each
        (chrom, start, end) is counted, and it is resized to `ext` from its
        strand-aware start; if False every record passing the filters is
        counted with its original coordinates.
    ext: int, length each counted read is resized to when noRedu is True.

    Returns
    ----
    number of reads added to the model, int
    genomic coverage, HTSeq.GenomicArray
    """
    logger.info("Start building model for %s, with MAPQ cutoff >=%s" %
                (bg, mapq))
    model = HTSeq.GenomicArray("auto", stranded=False)
    rs = set()
    t = 0
    # The strand column (6th) is only consulted when reads are resized.
    min_cols = 6 if noRedu else 5
    # Text mode so that gzip yields str lines, like the plain-file branch.
    opener = gzip.open if bg.endswith(".gz") else open
    with opener(bg, "rt") as f:
        for i, line in enumerate(f):
            if i % 10000 == 0:
                report = "%s lines genome signal read." % i
                cFlush(report)
            line = line.rstrip("\n").split("\t")
            # Skip short records instead of failing on a missing column.
            if len(line) < min_cols:
                continue
            try:
                chrom = line[0]
                s = int(line[1])
                e = int(line[2])
            except ValueError:
                # Non-numeric coordinates (e.g. a header line): not a record.
                continue
            if int(line[4]) < mapq:
                continue
            t += 1
            if noRedu:
                # Redundancy is judged on the original coordinates.
                r = (chrom, s, e)
                if r in rs:
                    continue
                rs.add(r)
                if line[5] == "+":
                    e = s + ext
                else:
                    s = max(0, e - ext)
            model[HTSeq.GenomicInterval(chrom, s, e)] += 1
    # Report the path and the number of reads that passed the filters.
    print("%s:totalReads:%s;nonRedudant:%s" % (bg, t, len(rs)))
    logger.info("%s:totalReads:%s;nonRedudant:%s" % (bg, t, len(rs)))
    # rs stays empty when noRedu is False, so report t in that case.
    if noRedu:
        return len(rs), model
    return t, model
示例#27
0
def Get_label_information(label, annot, bam_reader):
    """
    Collect feature sets, coverage and per-feature read counts for one label.

    Parameters
    ----
    label: key into `annot`
    annot: mapping from label to a sequence of 9-field records
        (feature, rank, chrom, start, end, strand, length, exon_rank_left,
        exon_rank_right)
    bam_reader: reader object providing `fetch()` and `fetch(region=...)`

    Returns
    ----
    HTSeq.GenomicArrayOfSets holding the (feature, rank) pairs of the label
    HTSeq.GenomicArray with the read coverage
    dict mapping (feature, rank) to its read count
    """
    warnings.simplefilter("ignore")
    records = annot[label]
    gas = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    ga = HTSeq.GenomicArray("auto", stranded=False, typecode="i")
    gene_count = {}
    for (feature, rank, chrom, start, end, strand, _length, _exon_rank_left,
         _exon_rank_right) in records:
        key = (feature, rank)
        gas[HTSeq.GenomicInterval(chrom, start, end, strand)] += key
        gene_count[key] = 0

    # Fetch window: the span of all records, padded by 500 on each side.
    boundary_left = min([rec[3] for rec in records])
    boundary_right = max([rec[4] for rec in records])
    region_fetch = (records[0][2] + ":" + str(int(boundary_left) - 500) +
                    "-" + str(int(boundary_right) + 500))
    read_seq = bam_reader.fetch(region=region_fetch)

    # The first alignment of an unrestricted fetch decides the layout.
    pe_mode = next(iter(bam_reader.fetch())).paired_end
    if pe_mode:
        read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)

    for a in read_seq:
        if pe_mode:
            first, second = a[0], a[1]
            if ((first and first.aQual < minaqual)
                    or (second and second.aQual < minaqual)):
                continue
            # NH > 1: more than one reported alignment for this read.
            if ((first and first.optional_field('NH') > 1)
                    or (second and second.optional_field('NH') > 1)):
                continue
            if first is not None and first.aligned:
                iv_seq = (cigop.ref_iv for cigop in first.cigar
                          if cigop.type in cigar_char and cigop.size > 0)
            else:
                iv_seq = tuple()
            if second is not None and second.aligned:
                # Intervals of the second mate are passed through
                # invert_strand before being chained to the first mate's.
                mate_ivs = (invert_strand(cigop.ref_iv)
                            for cigop in second.cigar
                            if cigop.type in cigar_char and cigop.size > 0)
                iv_seq = itertools.chain(iv_seq, mate_ivs)
        else:
            if not a.aligned:
                continue
            if a.optional_field('NH') > 1:
                continue
            iv_seq = (cigop.ref_iv for cigop in a.cigar
                      if cigop.type == "M" and cigop.size > 0)

        feature_aligned = set()
        for iv in iv_seq:
            for _step_iv, step_features in gas[iv].steps():
                feature_aligned |= step_features
                # NOTE(review): the whole interval is incremented once per
                # step, so an interval spanning several steps is counted
                # several times - confirm this is the intended coverage.
                ga[iv] += 1  # for calculating coverage
        if not feature_aligned:
            continue

        # If the read touches any intron, only the introns are credited;
        # otherwise every overlapped feature is credited.
        introns = [item for item in feature_aligned if item[0] == 'intron']
        credited = introns if introns else feature_aligned
        for hit in credited:
            gene_count[hit] += 1
    return gas, ga, gene_count
示例#28
0
    def zero(self):
        """Reset the stored bedgraphs and raw signals, when present.

        Every key of ``self.bedgraphs`` is rebound to a fresh stranded
        ``HTSeq.GenomicArray('auto')`` and every key of
        ``self.raw_signal_by_type`` to a new empty list. An attribute the
        instance does not have is left alone.
        """
        if hasattr(self, 'bedgraphs'):
            graphs = self.bedgraphs
            for track in graphs:
                graphs[track] = HTSeq.GenomicArray('auto', stranded=True)

        if hasattr(self, 'raw_signal_by_type'):
            signals = self.raw_signal_by_type
            for kind in signals:
                signals[kind] = []
示例#29
0
def read_coverage(bam_filename):
    """
    Build a stranded read-coverage array from a BAM file.

    Parameters
    ----
    bam_filename: str, path handed to HTSeq.BAM_Reader

    Returns
    ----
    HTSeq.GenomicArray (stranded, integer typecode) in which the interval
    of every aligned read has been incremented by one.
    """
    _bamfile = HTSeq.BAM_Reader(bam_filename)
    coverage = HTSeq.GenomicArray("auto", stranded=True, typecode='i')
    # Function-call form: valid on Python 3 and, with one argument, prints
    # the same text on Python 2.
    print("Reading alignments from bamfile...")
    for aln in _bamfile:  # Very slow.
        if aln.aligned:
            coverage[aln.iv] += 1
    return coverage
示例#30
0
def bedpe2model(bg, mapq=10, noRedu=True):
    """
    Convert a BEDPE format file into HTSeq.GenomicArray to get the genomic coverage.

    Parameters
    ----
    bg: str, .bedpe or .bedpe.gz file
    mapq: int, MAPQ cutoff; PETs whose mapq is lower are skipped.
    noRedu: bool, if True only the first PET seen for each
        (chromA, mid, mid + 1) key is counted; if False every PET passing
        the filters is counted.

    Returns
    ----
    number of PETs added to the model, int
    genomic coverage of both PET ends, HTSeq.GenomicArray
    """
    logger.info("Start building model for %s, with MAPQ cutoff >=%s" %
                (bg, mapq))
    model = HTSeq.GenomicArray("auto", stranded=False)
    rs = set()
    t = 0
    # Text mode so that gzip yields str lines, like the plain-file branch.
    opener = gzip.open if bg.endswith(".gz") else open
    with opener(bg, "rt") as fh:
        for i, line in enumerate(fh):
            if i % 10000 == 0:
                report = "%s lines genome signal read." % i
                cFlush(report)
            line = line.rstrip("\n").split("\t")
            try:
                pet = PET(line)
            except Exception:
                # Skip the record: falling through would reuse the PET of
                # the previous line (or hit an unbound name on the first).
                logger.error("%s from %s is not a BEDPE record" % (line, bg))
                continue
            # Keep only cis PETs whose chromA name contains no "_".
            if not pet.cis or "_" in pet.chromA:
                continue
            if pet.mapq < mapq:
                continue
            t += 1
            if noRedu:
                r = (pet.chromA, pet.mid, pet.mid + 1)
                if r in rs:
                    continue
                rs.add(r)
            # Both ends of the PET contribute to the coverage.
            iva = HTSeq.GenomicInterval(pet.chromA, pet.startA, pet.endA)
            ivb = HTSeq.GenomicInterval(pet.chromB, pet.startB, pet.endB)
            model[iva] += 1
            model[ivb] += 1
    logger.info("%s:totalReads:%s;nonRedudant:%s" % (bg, t, len(rs)))
    if noRedu:
        return len(rs), model
    return t, model