Exemplo n.º 1
0
def HTseq_count(bam_file, gtf_file, out_dir, identifier, parallel = True ):
    """Count uniquely aligned read pairs per annotated feature and print them.

    Every feature of the GTF file is registered (under the value of its
    ``identifier`` attribute) in a stranded ``HTSeq.GenomicArrayOfSets``.
    Each read pair that has exactly one alignment is then assigned to the set
    of feature ids overlapped by either mate and tallied as:

    * the feature id itself when exactly one id is overlapped,
    * ``"_no_feature"`` when none is overlapped,
    * ``"_ambiguous"`` when more than one is overlapped,
    * ``"_unmapped"`` when at least one mate is missing or not aligned.

    Args:
        bam_file: alignment file handed to ``HTSeq.SAM_Reader``.
            NOTE(review): the parameter is called ``bam_file`` but a SAM
            reader is used -- confirm the expected input format.
        gtf_file: path of the annotation file handed to ``HTSeq.GFF_Reader``.
        out_dir: unused by this function; kept for interface compatibility.
        identifier: key looked up in each feature's ``attr`` mapping.
        parallel: unused by this function; kept for interface compatibility.

    Returns:
        None. One ``<id> <count>`` line per counter key is printed to stdout.
    """
    gtf_reader = HTSeq.GFF_Reader(gtf_file)
    features = HTSeq.GenomicArrayOfSets("auto", stranded=True)

    print("extracting features from gtf file")
    for feature in gtf_reader:
        # All feature types are registered (no filtering on feature.type).
        features[feature.iv] += feature.attr[identifier]

    counts = collections.Counter()
    almnt_file = HTSeq.SAM_Reader(bam_file)
    for bundle in HTSeq.pair_SAM_alignments(almnt_file, bundle=True):
        if len(bundle) != 1:
            continue  # Skip multiple alignments
        first_almnt, second_almnt = bundle[0]  # extract pair

        # A pair is only usable when both mates are present and aligned.
        # The previous test, ``not a.aligned and b.aligned``, only caught the
        # "first unaligned, second aligned" case because ``not`` binds tighter
        # than ``and``.
        if (first_almnt is None or second_almnt is None
                or not (first_almnt.aligned and second_almnt.aligned)):
            counts["_unmapped"] += 1
            continue

        # Union of the feature ids overlapped by either mate.
        gene_ids = set()
        for almnt in (first_almnt, second_almnt):
            for _iv, val in features[almnt.iv].steps():
                gene_ids |= val

        if len(gene_ids) == 1:
            counts[next(iter(gene_ids))] += 1
        elif not gene_ids:
            counts["_no_feature"] += 1
        else:
            counts["_ambiguous"] += 1

    for gene_id in counts:
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print("%s %s" % (gene_id, counts[gene_id]))
def count_reads_paired(read_seq, counter, order, stranded,
                       quiet, minaqual, write_to_samout):
    """Pair up SAM alignments and pass every usable pair to ``counter``.

    Args:
        read_seq: iterable of alignments, given to
            ``HTSeq.pair_SAM_alignments`` (``order == "name"``) or
            ``HTSeq.pair_SAM_alignments_with_buffer`` (``order == "pos"``).
        counter: object whose ``notaligned`` / ``nonunique`` / ``lowqual``
            attributes are incremented for skipped pairs and whose
            ``count(iv_seq, pair)`` method receives every remaining pair.
        order: ``"name"`` or ``"pos"``.
        stranded: when equal to ``"reverse"`` the strand of the first mate's
            intervals is inverted; otherwise the second mate's intervals are
            inverted.
        quiet: suppress progress messages on stderr when true.
        minaqual: pairs with a mate whose ``aQual`` is below this value are
            skipped.
        write_to_samout: callable ``(pair, assignment)`` invoked for every
            skipped pair.

    Raises:
        ValueError: if ``order`` is neither ``"name"`` nor ``"pos"``.
    """
    if order == "name":
        read_seq = HTSeq.pair_SAM_alignments(read_seq)
    elif order == "pos":
        read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
    else:
        # Call syntax is valid on Python 2 and 3 (``raise E, msg`` is not).
        raise ValueError("Illegal order specified.")

    i = 0
    for r in read_seq:
        if i > 0 and i % 100000 == 0 and not quiet:
            # This function only ever handles pairs, so the wording is fixed
            # instead of depending on a ``pe_mode`` name that is not defined
            # in this scope.
            sys.stderr.write("%d SAM alignment record pairs processed.\n" % i)

        i += 1
        if r[0] is not None and r[0].aligned:
            if stranded != "reverse":
                iv_seq = (co.ref_iv for co in r[0].cigar
                          if co.type == "M" and co.size > 0)
            else:
                iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                          if co.type == "M" and co.size > 0)
        else:
            iv_seq = tuple()
        if r[1] is not None and r[1].aligned:
            # The second mate is handled with the opposite strand convention
            # to the first mate.
            if stranded != "reverse":
                iv_seq = itertools.chain(
                    iv_seq,
                    (invert_strand(co.ref_iv) for co in r[1].cigar
                     if co.type == "M" and co.size > 0))
            else:
                iv_seq = itertools.chain(
                    iv_seq,
                    (co.ref_iv for co in r[1].cigar
                     if co.type == "M" and co.size > 0))
        else:
            if (r[0] is None) or not (r[0].aligned):
                write_to_samout(r, "__not_aligned")
                counter.notaligned += 1
                continue
        try:
            if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                    (r[1] is not None and r[1].optional_field("NH") > 1):
                counter.nonunique += 1
                write_to_samout(r, "__alignment_not_unique")
                continue
        except KeyError:
            # No NH tag available: the pair is treated as uniquely aligned.
            pass
        if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
            # The tally lives on the counter like the other skip categories;
            # the old bare ``lowqual += 1`` raised UnboundLocalError. getattr
            # keeps this working even if the counter has no ``lowqual`` yet.
            counter.lowqual = getattr(counter, "lowqual", 0) + 1
            write_to_samout(r, "__too_low_aQual")
            continue

        counter.count(iv_seq, r)

    if not quiet:
        sys.stderr.write("%d SAM alignment pairs processed.\n" % i)
Exemplo n.º 3
0
def count_reads_paired(read_seq, counter, order, quiet, minaqual):
    """Feed every usable alignment pair to ``counter`` in both orientations.

    ``read_seq`` is paired with ``HTSeq.pair_SAM_alignments`` when ``order``
    is ``"name"`` and with ``HTSeq.pair_SAM_alignments_with_buffer`` when it
    is ``"pos"``; any other value raises ``ValueError``.

    For each pair the matched ("M", non-empty) CIGAR intervals are collected
    twice: a forward sequence (first mate as-is, second mate strand-inverted)
    and a reverse sequence (first mate strand-inverted, second mate as-is).
    Pairs are reported through ``counter.not_aligned``,
    ``counter.non_unique`` or ``counter.too_low_quality`` when they are
    skipped, otherwise through ``counter.forward_count`` and
    ``counter.reverse_count``.  Progress is written to stderr unless
    ``quiet`` is true.
    """
    if order == "name":
        pair_stream = HTSeq.pair_SAM_alignments(read_seq)
    elif order == "pos":
        pair_stream = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
    else:
        raise ValueError("Illegal order specified.")

    n_pairs = 0
    for pair in pair_stream:
        if n_pairs > 0 and n_pairs % 100000 == 0 and not quiet:
            sys.stderr.write(
                "%d SAM alignment record pairs processed.\n" % (n_pairs))
        n_pairs += 1

        first, second = pair[0], pair[1]
        first_usable = first is not None and first.aligned
        second_usable = second is not None and second.aligned

        # Nothing to count when neither mate is aligned.
        if not first_usable and not second_usable:
            counter.not_aligned(pair)
            continue

        if first_usable:
            sense = (op.ref_iv for op in first.cigar
                     if op.type == "M" and op.size > 0)
            antisense = (invert_strand(op.ref_iv) for op in first.cigar
                         if op.type == "M" and op.size > 0)
        else:
            sense = tuple()
            antisense = tuple()

        if second_usable:
            # The second mate contributes with the opposite orientation.
            sense = itertools.chain(
                sense,
                (invert_strand(op.ref_iv) for op in second.cigar
                 if op.type == "M" and op.size > 0))
            antisense = itertools.chain(
                antisense,
                (op.ref_iv for op in second.cigar
                 if op.type == "M" and op.size > 0))

        try:
            multi_hit = any(
                mate is not None and mate.optional_field("NH") > 1
                for mate in (first, second))
            if multi_hit:
                counter.non_unique(pair)
                continue
        except KeyError:
            # No NH tag: fall through and treat the pair as unique.
            pass

        if any(mate and mate.aQual < minaqual for mate in (first, second)):
            counter.too_low_quality(pair)
            continue

        counter.forward_count(sense, pair)
        counter.reverse_count(antisense, pair)

    if not quiet:
        sys.stderr.write("%d SAM alignment pairs processed.\n" % (n_pairs))
def ungapped_pe_counter(sam_reader, feature_array):
    """Count read-pair bundles per feature, one count per bundle.

    Alignments from ``sam_reader`` are grouped with
    ``hts.pair_SAM_alignments(..., bundle=True)`` so that all alignments of a
    multiply-mapped pair arrive together.  Each bundle is classified by
    ``assess_bundle(bundle, feature_array)``, whose result is used as a
    mapping keyed by status / feature name.

    Counting rule: a bundle is evidence for a feature only when every pair
    in it maps to that single feature.  If the pairs of a bundle disagree
    (different genes, no gene, several genes) the bundle is counted as
    ``"_ambiguous"``.  If all pairs share one status such as
    ``"_no_feature"``, ``"_unmapped"`` or ``"_ambiguous"``, the bundle adds
    one to that status.

    Args:
        sam_reader: alignment source passed to ``hts.pair_SAM_alignments``.
        feature_array: feature lookup structure forwarded to
            ``assess_bundle``.

    Returns:
        collections.Counter mapping feature name / status to bundle count.
    """
    counts = collections.Counter()
    # bundle=True puts all multiply-mapped pairs together.
    pair_iterator = hts.pair_SAM_alignments(sam_reader, bundle=True)

    t0 = datetime.datetime.now()
    for ic, bundle in enumerate(pair_iterator):

        # Report progress (to prove that it is still alive).
        if ic % 1000000 == 0:
            t1 = datetime.datetime.now()
            # Single-argument print(...) is identical on Python 2 and 3.
            print("\r%d read bundles counted in %s\r" % (ic, t1 - t0))
            sys.stdout.flush()

        # The original author observed that the first bundle is always an
        # empty list; there is nothing to assess in that case.
        if not bundle:
            continue

        bcounts = assess_bundle(bundle, feature_array)

        if len(bcounts) > 1:
            # Pairs of this bundle were assigned to more than one key.
            counts["_ambiguous"] += 1
        elif len(bcounts) == 0:
            # assess_bundle returned nothing: report it and move on.
            print("#" * 40)
            print("Error! bundle was not assigned any status")
            print("Contents of bundle:")
            print(bundle)
        else:
            # next(iter(...)) instead of .keys()[0], which only works on
            # Python 2 where keys() returns a list.
            counts[next(iter(bcounts))] += 1

    return counts
Exemplo n.º 5
0
def bam_parser_2(bam_file, min_len, max_clip, min_id, mode):
    """Parse alignments into a DataFrame with one row per retained alignment.

    Every alignment is handed to ``parser_aln_list`` together with a running
    query number and its position in the pair; results equal to ``None`` are
    dropped.

    Args:
        bam_file: alignment source.  In ``'paired'`` mode it is passed to
            ``HTSeq.pair_SAM_alignments``; in ``'single'`` mode it is iterated
            directly.
        min_len: forwarded to ``parser_aln_list`` as ``min_len``.
        max_clip: forwarded to ``parser_aln_list`` as ``max_clip``.
        min_id: forwarded to ``parser_aln_list`` as ``min_id``.
        mode: ``'paired'`` or ``'single'``.  Any other value reads nothing
            and yields an empty frame.

    Returns:
        pandas.DataFrame with columns
        ``ALN, QUERY, REF, SEQ, LEN, ID, SCORE, CLIP_PCT``.
    """
    df_columns = ['ALN', 'QUERY', 'REF', 'SEQ', 'LEN', 'ID', 'SCORE', 'CLIP_PCT']
    rows = []
    query_counter = 0

    if mode == 'paired':
        for query_1, query_2 in HTSeq.pair_SAM_alignments(bam_file):
            query_counter += 1
            parsed_pair = (
                parser_aln_list(query_1, aln_number=query_counter, pair_pos=1,
                                min_len=min_len, max_clip=max_clip, min_id=min_id),
                parser_aln_list(query_2, aln_number=query_counter, pair_pos=2,
                                min_len=min_len, max_clip=max_clip, min_id=min_id),
            )
            # Identity test: ``== None`` / ``None in [...]`` would invoke the
            # row object's own __eq__, which is fragile for array-like rows.
            rows.extend(aln for aln in parsed_pair if aln is not None)

    elif mode == 'single':
        for query in bam_file:
            query_counter += 1
            parsed = parser_aln_list(query, aln_number=query_counter, pair_pos=1,
                                     min_len=min_len, max_clip=max_clip, min_id=min_id)
            if parsed is not None:
                rows.append(parsed)

    return pd.DataFrame(rows, columns=df_columns)
Exemplo n.º 6
0
def count_reads_in_features( sam_filename, gff_filename, stranded, overlap_mode, feature_type, id_attribute, quiet, minaqual, samout, \
							 filename_read_names_gene_names,filename_read_names_gene_names_amb_unique):
   """
   Count reads in features (i.e. genes), keeping separate tallies for unique,
   multi-mapped and ambiguous-unique reads.

   Input:
      + sam_filename: input alignment containing all the ambiguously mapped
        reads; "-" reads from stdin.
      + gff_filename: GTF containing all genes for a given species.
      + stranded: specify whether data are stranded - see -s option.
        "no" builds an unstranded feature array; "reverse" inverts the strand
        of the read intervals.
      + overlap_mode: mode to handle reads overlapping more than one feature -
        see -m option: choices = ( "union", "intersection-strict",
        "intersection-nonempty" ). Any other value ends in sys.exit().
      + feature_type: see -t option; only GFF records of this type are used.
      + id_attribute: see -i option; attribute used as the feature id.
      + quiet: see -q option; suppresses progress messages and HTSeq warnings.
      + minaqual: see -a option; reads with aQual below it are skipped.
      + samout: SAM output file storing disambiguated reads (see -o option);
        "" disables the output.
      + filename_read_names_gene_names: output file containing the mappings
        readName to geneNames for multimapped reads.
      + filename_read_names_gene_names_amb_unique: output file containing the
        mappings readName to geneNames for ambiguously mapped reads.

   Output:
      + Writes the readName to geneNames outputs.
      + Writes the SAM output file for disambiguated uniquely mapped reads.
      + Prints to stdout one line per gene with its read counts per read
        type (non-ambiguous unique, multimapped, ambiguous unique), followed
        by the summary counters (no_feature, ambiguous, too_low_aQual,
        not_aligned, alignment_not_unique, nonunique_nonamb_to_be_rescued).
        This output is redirected and stored to an output file in the main
        peakRescue pipeline, and used in a later stage of that pipeline to
        rescue the reads present in the readName to geneNames mappings.

   Calls sys.exit() when a feature lacks the id attribute, or lacks strand
   information while running in stranded mode.

   NOTE(review): `debug`, `tag_report_instances_same_multiread_on_same_gene`,
   `UnknownChrom`, `invert_strand` and the counting helpers used below are
   not defined in this function; presumably they live at module level -
   confirm.
   """
   # Output filehandles for readName to geneNames mappings
   # NOTE(review): both handles are closed only at the end of the main read
   # loop's try block; an exception before that point leaves them open.
   fh_read_names_gene_names = open(filename_read_names_gene_names, 'w')
   fh_read_names_gene_names_amb_unique = open(filename_read_names_gene_names_amb_unique, 'w')
   
   # Closure: reads `samoutfile` and `pe_mode` from the enclosing function,
   # both of which are assigned further down, before the first call.
   def write_to_samout( r, assignment ):
      if samoutfile is None:
         return
      if not pe_mode:
         r = (r,)
      for read in r:
         if read is not None:
            samoutfile.write( read.original_sam_line.rstrip() + 
               "\tXF:Z:" + assignment + "\n" )
   if quiet:
      warnings.filterwarnings( action="ignore", module="HTSeq" ) 
      
   if samout != "":
      samoutfile = open( samout, "w" )
   else:
      samoutfile = None
      
   features = HTSeq.GenomicArrayOfSets( "auto", stranded != "no" )     
   ## Hash table to store unique reads per exon (if modified GTF)
   counts = {}
   ## Hash table to store original non unique reads per gene
   dict_nonunique = {}
   ## Hash table to store all unique reads as per original GTF
   dict_gene_unique_counts = {}
   ## Hash table to store ambiguous read count for unique reads...
   dict_gene_unique_counts_ambiguous = {}
   ## Hash table to store all non-unique reads including shared reads 
   ## (either split reads or read pair matching on two distinct exons, same gene)
   dict_gene_nonunique_counts = {}
   ## Hash to store the non-unique read-names as key and genes names as values (fragments)
   dict_read_name_genes_names = {}
   ## Hash to store the non-unique read-names as key and genes names as values (fragments) including instances of a given multimapped read on same gene
   dict_read_name_genes_names_final = {} 
   dict_read_name_genes_names_ambiguous = {}
   ## @todo: tag_gff - parameter to be removed - only deal with gene level information 
   ## tag_gff: type to specify whether it contains gene or exons information 
   tag_gff = "gene_gff" 
   # Try to open samfile and fail early in case it is not there
   if sam_filename != "-":
      open( sam_filename ).close() 
      
   gff = HTSeq.GFF_Reader( gff_filename )   
   exons = HTSeq.GenomicArrayOfSets( "auto", stranded=False )
   
   # First pass: load the annotation and zero-initialise every per-gene table.
   i = 0
   try:
      for f in gff:
         if f.type == feature_type:
	    exons[ f.iv ] += f # added to get exon interval data
            try:
               feature_id = f.attr[ id_attribute ]
            except KeyError:
               sys.exit( "Feature %s does not contain a '%s' attribute" % 
                  ( f.name, id_attribute ) )
            if stranded != "no" and f.iv.strand == ".":
               sys.exit( "Feature %s at %s does not have strand information but you are "
                  "running htseq-count in stranded mode. Use '--stranded=no'." % 
                  ( f.name, f.iv ) )
            features[ f.iv ] += feature_id
            counts[ f.attr[ id_attribute ] ] = 0
	    # -- Initialisation 
	    feature_name = f.attr[ id_attribute ]
	    # -- Added tag_gff for GFF type
	    if tag_gff == "gene_gff":
		# Original GTF (genes) 
		dict_nonunique = initialise_counts_per_feature(dict_nonunique, feature_name)
		dict_gene_unique_counts = initialise_counts_per_feature(dict_gene_unique_counts, feature_name)
		dict_gene_nonunique_counts = initialise_counts_per_feature(dict_gene_nonunique_counts, feature_name)
		dict_gene_unique_counts_ambiguous = initialise_counts_per_feature(dict_gene_unique_counts_ambiguous, feature_name)
         i += 1
         if i % 100000 == 0 and not quiet:
            sys.stderr.write( "%d GFF lines processed.\n" % i )
   except:
      sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() )
      raise
      
   if not quiet:
      sys.stderr.write( "%d GFF lines processed.\n" % i )
      
   if len( counts ) == 0 and not quiet:
      sys.stderr.write( "Warning: No features of type '%s' found.\n" % feature_type )
   
   # Peek at the first alignment to decide between single-end and paired-end.
   # For stdin the consumed record is chained back in front of the stream.
   try:
      if sam_filename != "-":
         read_seq = HTSeq.SAM_Reader( sam_filename )
         first_read = iter(read_seq).next()
      else:
         read_seq = iter( HTSeq.SAM_Reader( sys.stdin ) )
         first_read = read_seq.next()
         read_seq = itertools.chain( [ first_read ], read_seq )
      pe_mode = first_read.paired_end
      #pe_mode = 1 ## Added by us
   except:
      sys.stderr.write( "Error occured when reading first line of sam file.\n" )
      raise

   ###################################################################################################   
   # Second pass: walk over every read / read pair and classify it.
   try:
      if pe_mode:
         read_seq_pe_file = read_seq
         read_seq = HTSeq.pair_SAM_alignments( read_seq )
      empty = 0
      ambiguous = 0
      ambiguous_tag=0
      notaligned = 0
      lowqual = 0
      nonunique = 0
      nonunique_nonamb_to_be_rescued = 0
      temp_read_name="NA"
      previous_read_name="NA"
      temp_interval_r0="NA"
      temp_interval_r1="NA"
      counter_fragment = 0	
      flag_result = 0
      i = 0   
      pe_mode_for_SE = 0
      ## -- Added pe_mode on for SE files so that multireads reads will be accounted for
      # NOTE(review): pe_mode is forced to 1 here for single-end input, so the
      # `if not pe_mode:` single-end branch inside the loop below is never
      # taken; single-end reads are wrapped as (r, None) instead.
      if not pe_mode: # real SE
      	pe_mode_for_SE = 1 #
      	read_seq_pe_file = read_seq
      	pe_mode=1
      ## -- End
      index_fragment = 0
      for r in read_seq:
         prev_index_fragment = index_fragment
	 tag_nonunique_NH = 0
	 tag_overlapping_genes = 0
	 flag_aln_not_unique = 0 #
	 flag_ambiguous = 0 #
	 #-- LOOP OVER ALL READS IN INPUT BAM FILE
	 if pe_mode_for_SE:
	 	r = (r, None)
      	 counter_fragment += 1	
         i += 1
         if not pe_mode:
	    # -- SINGLE_END mode
            if not r.aligned:
               notaligned += 1
               #write_to_samout( r, "not_aligned" )
               continue
            try:
               if r.optional_field( "NH" ) > 1:
		  # --- Rescue multimappers in single-end mode
                  #write_to_samout( r, "alignment_not_unique" )
                  #nonunique += 1
                  continue
            except KeyError:
               pass
            if r.aQual < minaqual:
               lowqual += 1
               #write_to_samout( r, "too_low_aQual" )
               continue
            if stranded != "reverse":
               iv_seq = ( co.ref_iv for co in r.cigar if co.type == "M" )
            else:
               iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar if co.type == "M" )            
         else:
	    # -- PAIRED-END
            # Collect the matched ("M") CIGAR intervals of both mates; the
            # second mate uses the opposite strand convention to the first.
            if r[0] is not None and r[0].aligned:
               if stranded != "reverse":
                  iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" )
               else:
                  iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" )
            else:
               iv_seq = tuple()
            if r[1] is not None and r[1].aligned:            
               if stranded != "reverse":
                  iv_seq = itertools.chain( iv_seq, 
                     ( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" ) )
               else:
                  iv_seq = itertools.chain( iv_seq, 
                     ( co.ref_iv for co in r[1].cigar if co.type == "M" ) )
            else:
               if ( r[0] is None ) or not ( r[0].aligned ):
                  #write_to_samout( r, "not_aligned" )
                  notaligned += 1
                  continue         
            # Multi-mapped fragments (NH > 1) are handled entirely inside this
            # try block and end with `continue`; a missing NH tag raises
            # KeyError and the fragment falls through as uniquely mapped.
            try:
               if (( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \
                     ( r[1] is not None and r[1].optional_field( "NH" ) > 1 )):
	       	  tag_nonunique_NH = 1
               	  if ( r[0] is not None and r[1] is None ):
			result, fs_genes, fs_exons,dict_read_name_genes_names,ambiguous_tag = is_read_in_gene_interval(r[0], features,dict_read_name_genes_names,ambiguous_tag, exons)
			if result:
				flag_result = 1
				(dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[0], \
												temp_interval_r0, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
			else:
				if len(fs_genes) != 0:
					(dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[0], \
													temp_interval_r0, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
               	  if ( r[0] is None and r[1] is not None ):
			result, fs_genes, fs_exons,dict_read_name_genes_names,ambiguous_tag = is_read_in_gene_interval(r[1], features,dict_read_name_genes_names,ambiguous_tag,exons)
			if result:
				flag_result = 1
				(dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[1], \
												temp_interval_r1, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
			else:
				if len(fs_genes) != 0:
					(dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[1], \
													temp_interval_r1, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
               	  if ( r[0] is not None and r[1] is not None ):
			result1, fs_genes1, fs_exons1,dict_read_name_genes_names,ambiguous_tag = is_read_in_gene_interval(r[0], features,dict_read_name_genes_names,ambiguous_tag,exons)
			result2, fs_genes2, fs_exons2,dict_read_name_genes_names,ambiguous_tag = is_read_in_gene_interval(r[1], features,dict_read_name_genes_names,ambiguous_tag,exons)

			# Prefer the genes shared by both mates; fall back to the
			# union when the mates share none.
		        if len(fs_genes1.intersection(fs_genes2)) > 0:
				fs_genes = fs_genes1.intersection(fs_genes2)
		        elif len(fs_genes1.intersection(fs_genes2))==0:
				fs_genes = fs_genes1.union(fs_genes2)

			if result1 and not result2:
				flag_result = 1
				(dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[0], \
												temp_interval_r0, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
			elif result2 and not result1:
				flag_result = 1
				(dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[1], \
												temp_interval_r1, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
			else:
				if len(fs_genes1) != 0 or len(fs_genes2) != 0:
					flag_result = 1
					# Only count when this alignment differs from the
					# previously seen one (interval or read name).
					# NOTE(review): tag_report_instances_same_multiread_on_same_gene
					# is not defined in this function - presumably a module global.
					if ( ( ((temp_interval_r0 != str(r[0].iv)) or (temp_interval_r1 != str(r[1].iv))) or (temp_read_name != r[0].read.name) ) ):
						(dict_nonunique)= add_non_unique_counts_per_feature(fs_genes, dict_nonunique)
						dict_read_name_genes_names = _populate_read_name_gene_name(dict_read_name_genes_names, fs_genes, r[0].read.name, tag_report_instances_same_multiread_on_same_gene)
						flag_aln_not_unique = 1
                  #write_to_samout( r, "alignment_not_unique" )
	          nonunique += 1

		  if flag_result:
			
			  if r[0] is not None and r[1] is None:		
				non_uniq_read_name = r[0].read.name
			  elif r[0] is None and r[1] is not None:		
				non_uniq_read_name = r[1].read.name
			  elif r[0] is not None and r[1] is not None:		
				non_uniq_read_name= r[0].read.name
			  # NOTE(review): non_uniq_read_name2 is assigned but not read
			  # anywhere in this function.
			  non_uniq_read_name2 = dict_read_name_genes_names.keys()[0]
			  if flag_aln_not_unique:
				nonunique_nonamb_to_be_rescued += 1
	          	  # -- Re-initialise hash
			  # previous_read_name: read which falls into at least one gene interval
			  # tmp_read_name: the previous read in the bam file
			  # BAM is sorted by read name hence each multimapper will be arranged one after another 
			  if previous_read_name == "NA":
				previous_read_name = non_uniq_read_name

		  	  if non_uniq_read_name != previous_read_name:
				if previous_read_name in dict_read_name_genes_names.keys():
					fs_genes_names = dict_read_name_genes_names[previous_read_name]
					fh_read_names_gene_names.write("%s\t%s\n" % (previous_read_name, "\t".join(list(fs_genes_names)) ))
				previous_read_name = non_uniq_read_name
				tmp_dict = {}
				if non_uniq_read_name in dict_read_name_genes_names.keys():
					#print "non_uniq_read_name IN dict_read_name_genes_names.keys()"
					tmp_dict[non_uniq_read_name] = dict_read_name_genes_names[non_uniq_read_name]
				dict_read_name_genes_names.clear() # only one read stored
				dict_read_name_genes_names = tmp_dict	

		  flag_result = 0
		  flag_aln_not_unique = 0 #
		  (temp_read_name, temp_interval_r0, temp_interval_r1) = initalize_read_name_and_interval(r[0], r[1]) 
		  continue
            # except KeyError:
            except KeyError:
               pass
            if ( r[0] and r[0].aQual < minaqual ) or ( r[1] and r[1].aQual < minaqual ):
               lowqual += 1
               #write_to_samout( r, "too_low_aQual" )
               continue         
          
         # Uniquely mapped fragment: resolve the set of overlapped features
         # according to overlap_mode.
         try:
	    # --
            if overlap_mode == "union":
               fs = set()
               for iv in iv_seq: # interval from bam file for each fragment
                  if iv.chrom not in features.chrom_vectors:
                     raise UnknownChrom
                  for iv2, fs2 in features[ iv ].steps():
		     	#if debug:
				#print "****Unique_feature %s and feature_interval %s" %(fs2,iv2)	
		        fs = fs.union( fs2 )
			
            elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
               fs = None
               for iv in iv_seq:
                  if iv.chrom not in features.chrom_vectors:
                     raise UnknownChrom
                  for iv2, fs2 in features[ iv ].steps():
                     if len(fs2) > 0 or overlap_mode == "intersection-strict":
                        if fs is None:
                           fs = fs2.copy()
                        else:
                           fs = fs.intersection( fs2 )
            else:
               sys.exit( "Illegal overlap mode." )

	    fs_genes = fs
            if fs_genes is None or len( fs_genes ) == 0:
               #write_to_samout( r, "no_feature" )
               empty += 1
		# ambiguous read count and/or one of the read pair mapping on different gene (potential gene fusion events)...
		# elif len( fs ) > 1:
            elif len( fs_genes ) > 1:
	       ###############################################################
	       ## AMBIGUOUS UNIQUE
	       ###############################################################
	       is_disambiguated = 0
	       if not tag_nonunique_NH:
                  if ( r[0] is not None and r[1] is None ):
			       result, fs_genes, fs_exons,dict_read_name_genes_names_ambiguous, ambiguous_tag = is_read_in_gene_interval(r[0], features, dict_read_name_genes_names_ambiguous, ambiguous_tag,exons)
			       if result:
		       			(dict_gene_unique_counts) = add_unique_counts_per_feature(dict_gene_unique_counts, fs_genes)
	       				is_disambiguated = 1
			       if ambiguous_tag:
			       		(dict_gene_unique_counts_ambiguous) = add_unique_counts_per_feature_ambiguous(fs_genes, dict_gene_unique_counts_ambiguous)
					flag_ambiguous = 1
					# write in the file ambiguous read name gene name data...
					fh_read_names_gene_names_amb_unique.write("%s\t%s\n" % (r[0].read.name, "\t".join(list(fs_genes)) ))
                  if ( r[0] is None and r[1] is not None ):
			       result, fs_genes, fs_exons, dict_read_name_genes_names_ambiguous, ambiguous_tag = is_read_in_gene_interval(r[1], features, dict_read_name_genes_names_ambiguous, ambiguous_tag,exons)
			       if result:
		       			(dict_gene_unique_counts) = add_unique_counts_per_feature(dict_gene_unique_counts, fs_genes)
	       				is_disambiguated = 1
			       if ambiguous_tag:
			       		(dict_gene_unique_counts_ambiguous) = add_unique_counts_per_feature_ambiguous(fs_genes, dict_gene_unique_counts_ambiguous)
					flag_ambiguous = 1
					fh_read_names_gene_names_amb_unique.write("%s\t%s\n" % (r[1].read.name, "\t".join(list(fs_genes)) ))
                  if ( r[0] is not None and r[1] is not None ):
			       result1, fs_genes1, fs_exons, dict_read_name_genes_names_ambiguous, ambiguous_tag1 = is_read_in_gene_interval(r[0], features, dict_read_name_genes_names_ambiguous, ambiguous_tag,exons)
			       result2, fs_genes2, fs_exons, dict_read_name_genes_names_ambiguous, ambiguous_tag2 = is_read_in_gene_interval(r[1], features, dict_read_name_genes_names_ambiguous, ambiguous_tag,exons)
			       # NOTE(review): `debug` is not defined in this function -
			       # presumably a module global.
			       if debug:
			       		print "IN UNIQUE DISAMBIGUATION -->r[0].read.name=%s\t%s\t%s\t%s\t%s\n" % (r[0].read.name,result1, result2, fs_genes1, fs_genes2)
			       if len(fs_genes1.intersection(fs_genes2))==1:
					fs_genes = fs_genes1.intersection(fs_genes2)
					(dict_gene_unique_counts) = add_unique_counts_per_feature(dict_gene_unique_counts, fs_genes)
	       				is_disambiguated = 1
			       elif len(fs_genes1.intersection(fs_genes2)) > 1:
					fs_genes = fs_genes1.intersection(fs_genes2)
					(dict_gene_unique_counts_ambiguous) = add_unique_counts_per_feature_ambiguous(fs_genes, dict_gene_unique_counts_ambiguous)
					flag_ambiguous = 1
					fh_read_names_gene_names_amb_unique.write("%s\t%s\n" % (r[0].read.name, "\t".join(list(fs_genes)) ))
			       elif len(fs_genes1.intersection(fs_genes2))==0:
					fs_genes = fs_genes1.union(fs_genes2)
					if (fs_genes1 == set([]) or fs_genes2 == set([])) and len(fs_genes) == 1: 					
						## Disambiguate the uniquely mapped to the single gene it maps on
						(dict_gene_unique_counts) = add_unique_counts_per_feature(dict_gene_unique_counts, fs_genes)
	       					is_disambiguated = 1
					elif (fs_genes1 != set([]) or fs_genes2 != set([])):
						## Add fragment to the RN-GN for ambiguous uniquely mapped based on 
						## union of both fs_genes (fs_genes1 & fs_genes2) > 1
						(dict_gene_unique_counts_ambiguous) = add_unique_counts_per_feature_ambiguous(fs_genes, dict_gene_unique_counts_ambiguous)
						flag_ambiguous = 1
						fh_read_names_gene_names_amb_unique.write("%s\t%s\n" % (r[0].read.name, "\t".join(list(fs_genes)) ))

	       if flag_ambiguous:
			ambiguous += 1
			#write_to_samout( r, "ambiguous[" + '+'.join( fs ) + "]" )
               if is_disambiguated:
			write_to_samout( r, list(fs_genes)[0] )
            else:
	       # Exactly one overlapped feature: plain unique assignment.
	       if debug:
		       #print "DEBUG::CR:: len(fs) <-> 1:: fs = %s" %fs
			pass
               write_to_samout( r, list(fs)[0] )

               # NOTE(review): rr2 is assigned but not read anywhere in this
               # function.
               rr2 = r[0] if r[0] is not None else r[1]

	       if not tag_nonunique_NH:
			(dict_gene_unique_counts) = add_unique_counts_per_feature(dict_gene_unique_counts, fs_genes)
			
         except UnknownChrom:
            if not pe_mode:
               rr = r 
            else: 
               rr = r[0] if r[0] is not None else r[1]
            if not quiet:
               sys.stderr.write( ( "Warning: Skipping read '%s', because chromosome " +
                  "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) % 
                  ( rr.read.name, iv.chrom ) )

         if i % 100000 == 0 and not quiet:
            sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )

	 flag_ambiguous = 0 ## re-initialise....
	 index_fragment += 1
      #########################
      # This is to store the last read/fragment since it will not pass in previous condition:
      # => if non_uniq_read_name != previous_read_name:
      # -- At same level as the for loop (outside of the for loop) - column: 7
      #fh_read_names_gene_names.close()
      if dict_read_name_genes_names.keys() != []:
	#print "dict_read_name_genes_names passing"
	non_uniq_read_name = dict_read_name_genes_names.keys()[0]
	fs_genes_names = dict_read_name_genes_names[non_uniq_read_name]
	fh_read_names_gene_names.write("%s\t%s\n" % (non_uniq_read_name, "\t".join(list(fs_genes_names)) ))
      # -- 
      fh_read_names_gene_names.close() 
      fh_read_names_gene_names_amb_unique.close()
   ###################################################################################################   
   #except UnboundLocalError:
   # Only AttributeError is annotated with the input line number before being
   # re-raised; every other exception propagates unchanged.
   except AttributeError:
   #except:
      if not pe_mode:
         sys.stderr.write( "Error occured in %s.\n" % read_seq.get_line_number_string() )
      else:
         sys.stderr.write( "Error occured in %s.\n" % read_seq_pe_file.get_line_number_string() )
      raise

   if not quiet:
      sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )
         
   if samoutfile is not None:
      samoutfile.close()

   # Report: one line per gene in sorted order, then the summary counters.
   if tag_gff == "gene_gff":
	   tuples_genenames_exontag = [(fn, fn) for fn in dict_gene_unique_counts.keys()]
   tuples_genenames_exontag.sort()

   previous_gene_name = "NA"

   for gene_name, fn in tuples_genenames_exontag:
	gene_name = gene_name.strip()
	fn = fn.strip()
	
	# NOTE(review): the names come from dict_gene_unique_counts itself, so
	# the else branch below is reached only if strip() changed the name, in
	# which case its dict_gene_unique_counts[gene_name] lookup would raise
	# KeyError. Also note the first branch formats the ambiguous count with
	# %s while the second uses %i.
   	if tag_gff == "gene_gff": #
		if gene_name in dict_gene_unique_counts.keys():
			print "%s\t%i\t%i\t%s" % ( fn, dict_gene_unique_counts[gene_name], dict_nonunique[gene_name],dict_gene_unique_counts_ambiguous[gene_name] )
		else:
			# -- No non-unique reads for that gene_name
			print "%s\t%i\t%i\t%i" % ( fn, dict_gene_unique_counts[gene_name], 0,dict_gene_unique_counts_ambiguous[gene_name] )
		
	# -- Re-initialise gene name
	previous_gene_name = gene_name
			
   print "no_feature\t%d" % empty
   print "ambiguous\t%d" % ambiguous
   print "too_low_aQual\t%d" % lowqual
   print "not_aligned\t%d" % notaligned
   print "alignment_not_unique\t%d" % nonunique
   print "nonunique_nonamb_to_be_rescued:\t%d"  % nonunique_nonamb_to_be_rescued
Exemplo n.º 7
0
def tabulate_start_positions(BamFileName, cells, name, targetsite,
                             mapq_threshold, gap_threshold, start_threshold,
                             outfile_base):
    """Tabulate start positions of convergent read pairs from a BAM file.

    The alignments in ``BamFileName`` are read as bundles of mate pairs.  For
    each bundle, a mate contributes a start position only when exactly one of
    its alignments is usable.  When both mates have a usable start on the same
    reference chromosome, on opposite strands and at most ``gap_threshold``
    bases apart, both starts are added to the genomic arrays and one
    tab-separated row is written to ``<outfile_base>_coordinates.txt``.

    An alignment is usable when it is aligned, its strand-aware first CIGAR
    operation is a match ('M'), its ``aQual`` is at least ``mapq_threshold``
    and SAM flag bit 1024 (duplicate, per the SAM specification) is unset.

    Args:
        BamFileName: Path of the BAM file to read.
        cells: Value written verbatim to the 'Cells' output column.
        name: Value written verbatim to the '#Name' output column.
        targetsite: Value written to the 'Targetsite_Sequence' column.
        mapq_threshold: Minimum ``aQual`` an alignment must have.
        gap_threshold: Maximum distance allowed between the two mate starts.
        start_threshold: Accepted for interface compatibility; not used here.
        outfile_base: Prefix of the coordinates file that is written.

    Returns:
        Tuple ``(ga, ga_windows, ga_stranded, ga_coverage, read_count)``.
        ``ga`` and ``ga_stranded`` hold per-position start counts,
        ``ga_windows`` flags positions with at least one start,
        ``ga_coverage`` is returned empty, and ``read_count`` is the number
        of bundles processed.
    """

    def starts_on_match(read):
        """Return True when the read's strand-aware first CIGAR op is 'M'."""
        strand = read.iv.strand
        if strand == '+':
            return read.cigar[0].type == 'M'
        if strand == '-':
            return read.cigar[-1].type == 'M'
        return False

    def unique_start(reads):
        """Return (chrom, start, strand) of the single usable alignment.

        Three ``None`` values are returned when the mate has no usable
        alignment, more than one, or the remaining one fails the MAPQ or
        duplicate filter.  Each mate is judged on its own flag and quality.
        """
        candidates = [
            read for read in reads
            if read is not None and read.aligned and starts_on_match(read)
        ]
        if len(candidates) != 1:
            return None, None, None
        read = candidates[0]
        if read.aQual < mapq_threshold or read.flag & 1024:
            return None, None, None
        return read.iv.chrom, read.iv.start_d, read.iv.strand

    output_filename = '{0}_coordinates.txt'.format(outfile_base)

    sorted_bam_file = HTSeq.BAM_Reader(BamFileName)
    filename_base = os.path.basename(BamFileName)
    ga = HTSeq.GenomicArray("auto", stranded=False)
    ga_windows = HTSeq.GenomicArray("auto", stranded=False)
    ga_stranded = HTSeq.GenomicArray("auto", stranded=True)
    ga_coverage = HTSeq.GenomicArray("auto", stranded=False)
    read_count = 0

    # Only pairs on these reference chromosomes are tabulated.
    ref_chr = {
        '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',
        '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y'
    }
    opposite_strands = (('+', '-'), ('-', '+'))

    with open(output_filename, 'w') as o:
        header = [
            '#Name', 'Targetsite_Sequence', 'Cells', 'BAM', 'Read1_chr',
            'Read1_start_position', 'Read1_strand', 'Read2_chr',
            'Read2_start_position', 'Read2_strand'
        ]
        print(*header, sep='\t', file=o)
        for bundle in HTSeq.pair_SAM_alignments(sorted_bam_file, bundle=True):
            # A bundle is a list of (first mate, second mate) alignment
            # pairs; a single-alignment bundle is just the one-element case.
            first_read_chr, first_read_position, first_read_strand = \
                unique_start([pair[0] for pair in bundle])
            second_read_chr, second_read_position, second_read_strand = \
                unique_start([pair[1] for pair in bundle])

            # The chromosome test short-circuits when either mate was
            # rejected (None is never in ref_chr), so positions and strands
            # are only compared once both mates passed the filters.
            if first_read_chr == second_read_chr and first_read_chr in ref_chr \
                    and (first_read_strand, second_read_strand) in opposite_strands \
                    and abs(first_read_position - second_read_position) <= gap_threshold:

                for chrom, position, strand in (
                        (first_read_chr, first_read_position, first_read_strand),
                        (second_read_chr, second_read_position, second_read_strand)):
                    start = HTSeq.GenomicPosition(chrom, position, strand)
                    ga[start] += 1
                    ga_windows[start] = 1
                    ga_stranded[start] += 1

                # Output read positions for plotting.
                print(name,
                      targetsite,
                      cells,
                      filename_base,
                      first_read_chr,
                      first_read_position,
                      first_read_strand,
                      second_read_chr,
                      second_read_position,
                      second_read_strand,
                      sep='\t',
                      file=o)

            read_count += 1
            if not read_count % 100000:
                # Progress indicator, in millions of bundles processed.
                print(read_count / float(1000000), end=" ", file=sys.stderr)

    return ga, ga_windows, ga_stranded, ga_coverage, read_count
Exemplo n.º 8
0
def count_reads_in_features(sam_filename, gff_filename, samtype, order, overlap_mode,
    feature_type, id_attribute, quiet, minaqual, mapping_file, scale_method):
    """Count alignments per GFF feature and print feature abundances.

    Features of type ``feature_type`` are loaded from the GFF file into an
    unstranded genomic array.  Each alignment (or alignment pair) is then
    assigned to at most one feature according to ``overlap_mode``.  When a
    mapping file is given, feature counts are aggregated into the categories
    it lists.  Abundances are printed to stdout as ``name<TAB>value``; the
    special tallies are written to stderr.

    Args:
        sam_filename: Path of the alignment file, or "-" to read stdin.
        gff_filename: Path of the GFF file.
        samtype: "sam" or "bam"; selects the HTSeq reader.
        order: "name" or "position"; only consulted for paired-end input.
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        feature_type: GFF feature type to count.
        id_attribute: GFF attribute used as feature ID.  Features lacking
            the attribute are skipped.
        quiet: Suppress progress messages when true.
        minaqual: Alignments with ``aQual`` below this are skipped.
        mapping_file: Optional tab-separated file with four columns
            (feature, category, length, organism).  The category column may
            hold several comma-separated categories, or be empty to keep the
            feature ID.
        scale_method: 'none' keeps raw counts; any other value scales each
            count with ``scale_abundance(count, length)``.

    Raises:
        ValueError: Unknown ``samtype`` or ``order``, or a mapping file row
            that does not have exactly four columns.
    """

    features = HTSeq.GenomicArrayOfSets("auto", False)
    counts = {}

    # Try to open samfile to fail early in case it is not there
    if sam_filename != "-":
        open(sam_filename).close()

    # Try to open mapping file to fail early in case it is not there
    if mapping_file:
        open(mapping_file).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    continue
                features[f.iv] += feature_id
                counts[feature_id] = 0
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("{!s} GFF lines processed.\n".format(i))
    except:
        # Deliberately broad: report where parsing stopped, then re-raise.
        sys.stderr.write("Error occured when processing GFF file ({}):\n"
            .format(gff.get_line_number_string()))
        raise

    if not quiet:
        sys.stderr.write("{!s} GFF lines processed.\n".format(i))

    num_features = len(counts)
    if num_features == 0:
        sys.stderr.write("Warning: No features of type '{}' found.\n"
            .format(feature_type))

    if samtype == "sam":
        align_reader = HTSeq.SAM_Reader
    elif samtype == "bam":
        align_reader = HTSeq.BAM_Reader
    else:
        raise ValueError("Unknown input format {} specified.".format(samtype))

    try:
        if sam_filename != "-":
            read_seq_file = align_reader(sam_filename)
            read_seq = read_seq_file
            # Peek at the first record; the main loop re-iterates the reader.
            first_read = next(iter(read_seq))
        else:
            read_seq_file = align_reader(sys.stdin)
            read_seq_iter = iter(read_seq_file)
            first_read = next(read_seq_iter)
            # stdin cannot be re-read, so put the peeked record back in front.
            read_seq = itertools.chain([first_read], read_seq_iter)
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occured when reading SAM/BAM file.\n" )
        raise

    try:
        if pe_mode:
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            elif order == "position":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
            else:
                raise ValueError("Illegal order specified.")
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            if i > 0 and i % 100000 == 0 and not quiet:
                sys.stderr.write("{!s} SAM alignment record{} processed.\n"
                    .format(i, "s" if not pe_mode else " pairs"))

            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        continue
                except KeyError:
                    # No NH tag: treat the alignment as unique.
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    continue
                iv_seq = (invert_strand(co.ref_iv) for co in r.cigar if co.type == "M" and co.size > 0)
            else:
                if r[0] is not None and r[0].aligned:
                    iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar if co.type == "M" and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    iv_seq = itertools.chain( iv_seq,
                        (co.ref_iv for co in r[1].cigar if co.type == "M" and co.size > 0))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        notaligned += 1
                        continue
                try:
                    if (r[0] is not None and r[0].optional_field("NH") > 1 ) or \
                            (r[1] is not None and r[1].optional_field("NH") > 1):
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                    lowqual += 1
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                         if iv.chrom not in features.chrom_vectors:
                             raise UnknownChrom
                         for iv2, fs2 in features[iv].steps():
                             fs = fs.union(fs2)
                elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[ iv ].steps():
                            if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                if fs is None or len(fs) == 0:
                    empty += 1
                elif len(fs) > 1:
                    ambiguous += 1
                else:
                    counts[list(fs)[0]] += 1
            except UnknownChrom:
                # Chromosome absent from the GFF: nothing can overlap.
                empty += 1

    except:
        sys.stderr.write("Error occured when processing SAM input ({}):\n"
            .format(read_seq_file.get_line_number_string()))
        raise

    if not quiet:
        sys.stderr.write("{!s} SAM {} processed.\n"
            .format(i, "alignments " if not pe_mode else "alignment pairs"))

    # map to higher order features if applicable
    if mapping_file:
        abundances = {}
        # Features that have a row in the mapping file.  Tracked separately
        # because abundances is keyed by category, not by feature ID.
        mapped_features = set()
        with open(mapping_file) as mapping_h:
            for row in csv.reader(mapping_h, delimiter='\t'):
                try:
                    feature, feature_category, feature_length, organism = row
                except ValueError:
                    sys.stderr.write("Can't determine the format of '{}'".format(mapping_file))
                    raise
                if feature not in counts:
                    continue
                mapped_features.add(feature)
                if not feature_category:
                    feature_category = feature
                abund = counts[feature] if scale_method == 'none' else scale_abundance(counts[feature], int(feature_length))
                if ',' in feature_category:
                    cats = feature_category.split(',')
                    for category in cats:
                        abundances[category] = abundances.get(category, 0) + abund
                else:
                    abundances[feature_category] = abundances.get(feature_category, 0) + abund

        if num_features > 0 and len(abundances) == 0:
            sys.stderr.write("Warning: No higher order features found. Please "
                "make sure the mapping file is formatted correctly.\n")

        # Only features without a mapping row are unmapped.  Testing against
        # the category keys would count every feature whose category name
        # differs from its ID a second time.
        for feature in counts:
            if feature not in mapped_features:
                abundances['UNMAPPED'] = abundances.get('UNMAPPED', 0) + counts[feature]

    else:
        abundances = dict(counts)

    # "UNMAPPED" can be interpreted as a single unknown gene of length 1
    # kilobase recruiting all reads that failed to map to known sequences
    abundances['UNMAPPED'] = (abundances.get('UNMAPPED', 0) + empty + ambiguous + lowqual + notaligned + nonunique)

    for fn in sorted(abundances.keys()):
        print("{}\t{!s}".format(fn, abundances[fn]))
    sys.stderr.write("__no_feature\t{!s}\n".format(empty))
    sys.stderr.write("__ambiguous\t{!s}\n".format(ambiguous))
    sys.stderr.write("__too_low_aQual\t{!s}\n".format(lowqual))
    sys.stderr.write("__not_aligned\t{!s}\n".format(notaligned))
    sys.stderr.write("__alignment_not_unique\t{!s}\n".format(nonunique))
Exemplo n.º 9
0
def count_reads_in_features( sam_filename, gff_filename, stranded,
      overlap_mode, feature_type, id_attribute, quiet, minaqual, samout, allow_ambiguous, allow_nonunique ):
   """Report, for each assigned read, its offset within the matched feature.

   Features of type ``feature_type`` are loaded from the GFF file and grouped
   by ``id_attribute``.  Each group is ordered by start coordinate,
   descending when its first member lies on the '-' strand.  For every read
   (or read pair) that passes the filters, one line is printed per
   overlapping feature and matched interval::

      feature_id <TAB> offset_start <TAB> offset_end <TAB> read_name

   The offset is accumulated over the feature's ordered members up to the
   one that contains the interval; both ends are 0-based and inclusive.
   The not-aligned, low-quality, non-unique, no-feature and ambiguous
   tallies are maintained but not printed by this function.

   Args:
      sam_filename: Path of the SAM file, or "-" to read stdin.
      gff_filename: Path of the GFF file.
      stranded: "no", "reverse", or any other value for forward-stranded.
      overlap_mode: "union", "intersection-strict" or
         "intersection-nonempty".
      feature_type: GFF feature type to load.
      id_attribute: GFF attribute used as feature ID (required on every
         feature of the selected type).
      quiet: Suppress progress messages and HTSeq warnings when true.
      minaqual: Reads with ``aQual`` below this are skipped.
      samout: Path of an annotated SAM output file, or "" for none.
      allow_ambiguous: "no" skips reads overlapping more than one feature;
         any other value reports them for every overlapping feature.
      allow_nonunique: "no" skips reads whose NH tag is greater than 1.
   """

   def write_to_samout( r, assignment ):
      """Append the read(s) to the SAM output, tagged with the assignment."""
      if samoutfile is None:
         return
      if not pe_mode:
         r = (r,)
      for read in r:
         if read is not None:
            samoutfile.write( read.original_sam_line.rstrip() +
               "\tXF:Z:" + assignment + "\n" )

   def matched_intervals( read, flip ):
      """Yield reference intervals of the read's 'M' CIGAR operations."""
      return ( invert_strand( co.ref_iv ) if flip else co.ref_iv
         for co in read.cigar if co.type == "M" )

   if quiet:
      warnings.filterwarnings( action="ignore", module="HTSeq" )

   if samout != "":
      samoutfile = open( samout, "w" )
   else:
      samoutfile = None

   features = HTSeq.GenomicArrayOfSets( "auto", stranded != "no" )
   features_dict = defaultdict(list)
   counts = {}

   # Try to open samfile to fail early in case it is not there
   if sam_filename != "-":
      open( sam_filename ).close()

   gff = HTSeq.GFF_Reader( gff_filename )
   i = 0
   try:
      for f in gff:
         if f.type == feature_type:
            try:
               feature_id = f.attr[ id_attribute ]
            except KeyError:
               sys.exit( "Feature %s does not contain a '%s' attribute" %
                  ( f.name, id_attribute ) )
            if stranded != "no" and f.iv.strand == ".":
               sys.exit( "Feature %s at %s does not have strand information but you are "
                  "running htseq-count in stranded mode. Use '--stranded=no'." %
                  ( f.name, f.iv ) )
            features[ f.iv ] += feature_id
            counts[ feature_id ] = 0
            features_dict[ feature_id ].append(f)
         i += 1
         if i % 100000 == 0 and not quiet:
            sys.stderr.write( "%d GFF lines processed.\n" % i )
   except:
      # Deliberately broad: report where parsing stopped, then re-raise.
      sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() )
      raise

   if not quiet:
      sys.stderr.write( "%d GFF lines processed.\n" % i )

   sys.stderr.write( "Sorting exons from GFF file.\n" )
   for key in features_dict:
      # Order the members so offsets accumulate along the feature's strand.
      members = features_dict[ key ]
      members.sort( key=lambda feat: feat.iv.start,
         reverse=( members[0].iv.strand == "-" ) )

   if len( counts ) == 0 and not quiet:
      sys.stderr.write( "Warning: No features of type '%s' found.\n" % feature_type )

   try:
      # The reader object is kept so that the error handler below can ask it
      # for the current line number even when read_seq is a wrapping iterator.
      if sam_filename != "-":
         sam_reader = HTSeq.SAM_Reader( sam_filename )
         read_seq = sam_reader
         first_read = next( iter( sam_reader ) )
      else:
         sam_reader = HTSeq.SAM_Reader( sys.stdin )
         stdin_records = iter( sam_reader )
         first_read = next( stdin_records )
         # stdin cannot be re-read, so put the peeked record back in front.
         read_seq = itertools.chain( [ first_read ], stdin_records )
      pe_mode = first_read.paired_end
   except:
      sys.stderr.write( "Error occured when reading first line of sam file.\n" )
      raise

   try:
      if pe_mode:
         read_seq = HTSeq.pair_SAM_alignments( read_seq )
      empty = 0
      ambiguous = 0
      notaligned = 0
      lowqual = 0
      nonunique = 0
      i = 0
      reverse = ( stranded == "reverse" )
      for r in read_seq:
         i += 1
         if not pe_mode:
            if not r.aligned:
               notaligned += 1
               write_to_samout( r, "not_aligned" )
               continue
            try:
               if ((allow_nonunique == "no") and (r.optional_field( "NH" ) > 1)):
                  write_to_samout( r, "alignment_not_unique" )
                  nonunique += 1
                  continue
            except KeyError:
               # No NH tag: treat the alignment as unique.
               pass
            if r.aQual < minaqual:
               lowqual += 1
               write_to_samout( r, "too_low_aQual" )
               continue
            iv_seq = matched_intervals( r, reverse )
         else:
            if r[0] is not None and r[0].aligned:
               iv_seq = matched_intervals( r[0], reverse )
            else:
               iv_seq = tuple()
            if r[1] is not None and r[1].aligned:
               # The second mate is strand-flipped relative to the first.
               iv_seq = itertools.chain( iv_seq,
                  matched_intervals( r[1], not reverse ) )
            else:
               if ( r[0] is None ) or not ( r[0].aligned ):
                  write_to_samout( r, "not_aligned" )
                  notaligned += 1
                  continue
            try:
               if (allow_nonunique == "no") and (( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \
                     ( r[1] is not None and r[1].optional_field( "NH" ) > 1 )):
                  nonunique += 1
                  write_to_samout( r, "alignment_not_unique" )
                  continue
            except KeyError:
               pass
            if ( r[0] and r[0].aQual < minaqual ) or ( r[1] and r[1].aQual < minaqual ):
               lowqual += 1
               write_to_samout( r, "too_low_aQual" )
               continue

         try:
            # Materialised because the intervals are walked twice below.
            iv_seq = list(iv_seq)
            if overlap_mode == "union":
               fs = set()
               for iv in iv_seq:
                  if iv.chrom not in features.chrom_vectors:
                     raise UnknownChrom
                  for iv2, fs2 in features[ iv ].steps():
                     fs = fs.union( fs2 )
            elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
               fs = None
               for iv in iv_seq:
                  if iv.chrom not in features.chrom_vectors:
                     raise UnknownChrom
                  for iv2, fs2 in features[ iv ].steps():
                     if len(fs2) > 0 or overlap_mode == "intersection-strict":
                        if fs is None:
                           fs = fs2.copy()
                        else:
                           fs = fs.intersection( fs2 )
            else:
               sys.exit( "Illegal overlap mode." )
            if fs is None or len( fs ) == 0:
               write_to_samout( r, "no_feature" )
               empty += 1
            elif ((len( fs ) > 1) and (allow_ambiguous == "no")):
               write_to_samout( r, "ambiguous[" + '+'.join( fs ) + "]" )
               ambiguous += 1
            else:
               if pe_mode:
                  rname = r[0].read.name if r[0] is not None else r[1].read.name
               else:
                  rname = r.read.name
               for iv in iv_seq:
                  for iv2, fs2 in features[ iv ].steps():
                     for fsi in fs2:
                        # Sum the lengths of the members that precede the one
                        # containing iv2, then add the distance into it.
                        offset = 0
                        for exon in features_dict[ fsi ]:
                           if ((iv2.start >= exon.iv.start) and (iv2.end <= exon.iv.end)):
                              if (exon.iv.strand == "+"):
                                 offset += (iv2.start - exon.iv.start)
                              else:
                                 offset += (exon.iv.end - iv2.end)
                              break
                           offset += (exon.iv.end - exon.iv.start)
                        # output is 0-based, inclusive on both ends
                        print( "%s\t%d\t%d\t%s" % (fsi, offset, offset + (iv2.end-iv2.start-1), rname) )
         except UnknownChrom:
            # Chromosome absent from the GFF: nothing can overlap.
            empty += 1

         if i % 100000 == 0 and not quiet:
            sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )

   except:
      sys.stderr.write( "Error occured in %s.\n" % sam_reader.get_line_number_string() )
      if samoutfile is not None:
         samoutfile.close()
      raise

   if not quiet:
      sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )

   if samoutfile is not None:
      samoutfile.close()
Exemplo n.º 10
0
def count_reads_in_features( sam_filename, gff_filename, stranded,
      overlap_mode, feature_type, id_attribute, quiet, minaqual, samout ):
   """Count reads per feature and print counts with per-feature RPKM.

   Features are loaded through ``parse_gff``, which returns the initial
   counts and a per-feature value passed on to ``get_rpkm``.  Each read (or
   read pair) that passes the filters is assigned to at most one feature
   according to ``overlap_mode``.  For every feature a line
   ``id<TAB>count<TAB>length<TAB>rpkm`` is printed, followed by the special
   tallies.

   Args:
      sam_filename: Path of the SAM file, or "-" to read stdin.
      gff_filename: Path of the GFF file.
      stranded: "no", "reverse", or any other value for forward-stranded.
      overlap_mode: "union", "intersection-strict" or
         "intersection-nonempty".
      feature_type: GFF feature type to count.
      id_attribute: GFF attribute used as feature ID.
      quiet: Suppress progress messages and HTSeq warnings when true.
      minaqual: Reads with ``aQual`` below this are skipped.
      samout: Path of an annotated SAM output file, or "" for none.
   """

   def write_to_samout( r, assignment ):
      """Append the read(s) to the SAM output, tagged with the assignment."""
      if samoutfile is None:
         return
      if not pe_mode:
         r = (r,)
      for read in r:
         if read is not None:
            samoutfile.write( read.original_sam_line.rstrip() +
               "\tXF:Z:" + assignment + "\n" )

   def matched_intervals( read, flip ):
      """Yield reference intervals of the read's non-empty 'M' CIGAR ops."""
      return ( invert_strand( co.ref_iv ) if flip else co.ref_iv
         for co in read.cigar if co.type == "M" and co.size > 0 )

   if quiet:
      warnings.filterwarnings( action="ignore", module="HTSeq" )

   if samout != "":
      samoutfile = open( samout, "w" )
   else:
      samoutfile = None

   features = HTSeq.GenomicArrayOfSets( "auto", stranded != "no" )
   counts = {}

   # Try to open samfile to fail early in case it is not there
   if sam_filename != "-":
      open( sam_filename ).close()

   counts, colgenes = parse_gff(gff_filename,features,feature_type,id_attribute,stranded,quiet,counts)

   if len( counts ) == 0 and not quiet:
      sys.stderr.write( "Warning: No features of type '%s' found.\n" % feature_type )

   try:
      # The reader object is kept so that the error handler below can ask it
      # for the current line number even when read_seq is a wrapping iterator.
      if sam_filename != "-":
         sam_reader = HTSeq.SAM_Reader( sam_filename )
         read_seq = sam_reader
         first_read = next( iter( sam_reader ) )
      else:
         sam_reader = HTSeq.SAM_Reader( sys.stdin )
         stdin_records = iter( sam_reader )
         first_read = next( stdin_records )
         # stdin cannot be re-read, so put the peeked record back in front.
         read_seq = itertools.chain( [ first_read ], stdin_records )
      pe_mode = first_read.paired_end
   except:
      sys.stderr.write( "Error occured when reading first line of sam file.\n" )
      raise

   try:
      if pe_mode:
         read_seq = HTSeq.pair_SAM_alignments( read_seq )
      empty = 0
      ambiguous = 0
      notaligned = 0
      lowqual = 0
      nonunique = 0
      i = 0
      reverse = ( stranded == "reverse" )
      for r in read_seq:
         i += 1
         if not pe_mode:
            if not r.aligned:
               notaligned += 1
               write_to_samout( r, "not_aligned" )
               continue
            try:
               if r.optional_field( "NH" ) > 1:
                  write_to_samout( r, "alignment_not_unique" )
                  nonunique += 1
                  continue
            except KeyError:
               # No NH tag: treat the alignment as unique.
               pass
            if r.aQual < minaqual:
               lowqual += 1
               write_to_samout( r, "too_low_aQual" )
               continue
            iv_seq = matched_intervals( r, reverse )
         else:
            if r[0] is not None and r[0].aligned:
               iv_seq = matched_intervals( r[0], reverse )
            else:
               iv_seq = tuple()
            if r[1] is not None and r[1].aligned:
               # The second mate is strand-flipped relative to the first.
               iv_seq = itertools.chain( iv_seq,
                  matched_intervals( r[1], not reverse ) )
            else:
               if ( r[0] is None ) or not ( r[0].aligned ):
                  write_to_samout( r, "not_aligned" )
                  notaligned += 1
                  continue
            try:
               if ( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \
                     ( r[1] is not None and r[1].optional_field( "NH" ) > 1 ):
                  nonunique += 1
                  write_to_samout( r, "alignment_not_unique" )
                  continue
            except KeyError:
               pass
            if ( r[0] and r[0].aQual < minaqual ) or ( r[1] and r[1].aQual < minaqual ):
               lowqual += 1
               write_to_samout( r, "too_low_aQual" )
               continue

         try:
            if overlap_mode == "union":
               fs = set()
               for iv in iv_seq:
                  if iv.chrom not in features.chrom_vectors:
                     raise UnknownChrom
                  for iv2, fs2 in features[ iv ].steps():
                     # everything found within the genomic interval iv
                     fs = fs.union( fs2 )
            elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
               fs = None
               for iv in iv_seq:
                  if iv.chrom not in features.chrom_vectors:
                     raise UnknownChrom
                  for iv2, fs2 in features[ iv ].steps():
                     if len(fs2) > 0 or overlap_mode == "intersection-strict":
                        if fs is None:
                           fs = fs2.copy()
                        else:
                           fs = fs.intersection( fs2 )
            else:
               sys.exit( "Illegal overlap mode." )
            if fs is None or len( fs ) == 0:
               write_to_samout( r, "no_feature" )
               empty += 1
            elif len( fs ) > 1:
               write_to_samout( r, "ambiguous[" + '+'.join( fs ) + "]" )
               ambiguous += 1
            else:
               assigned = list(fs)[0]
               write_to_samout( r, assigned )
               counts[ assigned ] += 1

         except UnknownChrom:
            # Chromosome absent from the GFF: nothing can overlap.
            empty += 1

         if i % 100000 == 0 and not quiet:
            sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )

   except:
      sys.stderr.write( "Error occured in %s.\n" % sam_reader.get_line_number_string() )
      if samoutfile is not None:
         samoutfile.close()
      raise

   if not quiet:
      sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )

   if samoutfile is not None:
      samoutfile.close()

   # The total does not change while printing, so compute it once.
   total_assigned = sum( counts.values() )
   for fn in sorted( counts ):
      # NOTE(review): this "records total" line is repeated before every
      # feature and looks like leftover debug output; kept so the output
      # stays unchanged. Confirm whether downstream parsing relies on it.
      print( "%s %s" % ( i, total_assigned ) )
      rpkm, feature_len = get_rpkm(counts[fn],i,colgenes[fn])
      print( "%s\t%d\t%d\t%d" % ( fn, counts[fn], feature_len,rpkm) )
   print( "no_feature\t%d" % empty )
   print( "ambiguous\t%d" % ambiguous )
   print( "too_low_aQual\t%d" % lowqual )
   print( "not_aligned\t%d" % notaligned )
   print( "alignment_not_unique\t%d" % nonunique )
Exemplo n.º 11
0
      set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] )
      if len( set_of_gene_names ) == 0:
         counts[ '_empty' ] += 1
      elif len( set_of_gene_names ) > 1:
         counts[ '_ambiguous' ] +=1
      else:
         for f in rs:
            counts[ f.name ] += 1
      num_reads += 1
      if num_reads % 100000 == 0:
         sys.stderr.write( "%d reads processed.\n" % num_reads )

else: # paired-end
   alignments = dict()
   if order == "name":
      for af, ar in HTSeq.pair_SAM_alignments( reader( sam_file ) ):
         if af == None or ar == None:
            continue
         if not ar.aligned:
            continue
         if not af.aligned:
            continue
         elif ar.optional_field("NH") > max_NH or af.optional_field("NH") > max_NH:
            continue
         elif af.iv.chrom != ar.iv.chrom:
            counts['_ambiguous_readpair_position'] += 1
            continue
         else:
            rs = map_read_pair( af, ar )
            counts = update_count_vector( counts, rs )
            num_reads += 1
Exemplo n.º 12
0
def count_reads(features, counts, pe_mode, read_seq, order, stranded,
                overlap_mode, quiet, minaqual, write_to_samout):
    """Assign each alignment (or alignment pair) to a feature and print totals.

    Every record of ``read_seq`` is filtered (unaligned, ``NH`` > 1, alignment
    quality below ``minaqual``) and the surviving ones are looked up in
    ``features``.  A record hitting exactly one feature increments
    ``counts`` for it.  The table ``feature<TAB>count`` plus five ``__*``
    summary rows is printed to stdout; progress goes to stderr.

    Args:
        features: Store exposing ``chrom_vectors`` and returning, for an
            interval key, an object whose ``steps()`` yields
            ``(interval, set_of_feature_ids)`` pairs.
        counts: Mapping feature id -> count, incremented in place.
        pe_mode: True when records are to be paired before counting.
        read_seq: Iterable of alignment records.
        order: ``"name"`` or ``"pos"``; picks the pairing routine
            (paired-end only).
        stranded: ``"reverse"`` inverts the strand of the first mate (or the
            single read); any other value inverts the second mate instead.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        quiet: Suppress the stderr progress messages when true.
        minaqual: Minimum ``aQual`` a record needs to be counted.
        write_to_samout: Callable ``(record, assignment)`` invoked once per
            record with the assignment string.

    Raises:
        ValueError: ``order`` is neither ``"name"`` nor ``"pos"`` in
            paired-end mode.
        SystemExit: ``overlap_mode`` is not one of the three known values
            (raised when the first record reaches the overlap step).
    """
    def matched_blocks(aln, flip):
        # Reference intervals of the non-empty "M" CIGAR operations of
        # ``aln``; with ``flip`` each one goes through invert_strand().
        ops = (op for op in aln.cigar if op.type == "M" and op.size > 0)
        if flip:
            return (invert_strand(op.ref_iv) for op in ops)
        return (op.ref_iv for op in ops)

    if pe_mode:
        if order not in ("name", "pos"):
            raise ValueError("Illegal order specified.")
        pair_up = (HTSeq.pair_SAM_alignments if order == "name"
                   else HTSeq.pair_SAM_alignments_with_buffer)
        read_seq = pair_up(read_seq)

    flip_first = stranded == "reverse"
    n_no_feature = n_ambiguous = n_unaligned = n_low_qual = n_multi = 0
    processed = 0

    for r in read_seq:
        if not quiet and processed > 0 and processed % 100000 == 0:
            unit = " pairs" if pe_mode else "s"
            sys.stderr.write("%d SAM alignment record%s processed.\n" %
                             (processed, unit))
        processed += 1

        if pe_mode:
            mate1, mate2 = r[0], r[1]
            has_first = mate1 is not None and mate1.aligned
            has_second = mate2 is not None and mate2.aligned
            if not (has_first or has_second):
                write_to_samout(r, "__not_aligned")
                n_unaligned += 1
                continue

            iv_seq = matched_blocks(mate1, flip_first) if has_first else tuple()
            if has_second:
                # The second mate always gets the opposite treatment.
                iv_seq = itertools.chain(
                    iv_seq, matched_blocks(mate2, not flip_first))

            try:
                if any(m is not None and m.optional_field("NH") > 1
                       for m in (mate1, mate2)):
                    n_multi += 1
                    write_to_samout(r, "__alignment_not_unique")
                    continue
            except KeyError:
                # No NH tag: the record is treated as uniquely aligned.
                pass

            if any(m and m.aQual < minaqual for m in (mate1, mate2)):
                n_low_qual += 1
                write_to_samout(r, "__too_low_aQual")
                continue
        else:
            if not r.aligned:
                n_unaligned += 1
                write_to_samout(r, "__not_aligned")
                continue
            try:
                if r.optional_field("NH") > 1:
                    n_multi += 1
                    write_to_samout(r, "__alignment_not_unique")
                    continue
            except KeyError:
                pass
            if r.aQual < minaqual:
                n_low_qual += 1
                write_to_samout(r, "__too_low_aQual")
                continue
            iv_seq = matched_blocks(r, flip_first)

        try:
            if overlap_mode == "union":
                hits = set()
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom
                    for _, step_set in features[iv].steps():
                        hits.update(step_set)
            elif overlap_mode in ("intersection-strict",
                                  "intersection-nonempty"):
                strict = overlap_mode == "intersection-strict"
                hits = None
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom
                    for _, step_set in features[iv].steps():
                        # Empty steps only constrain the result in strict mode.
                        if not (strict or len(step_set) > 0):
                            continue
                        if hits is None:
                            hits = step_set.copy()
                        else:
                            hits = hits.intersection(step_set)
            else:
                sys.exit("Illegal overlap mode.")

            if not hits:
                write_to_samout(r, "__no_feature")
                n_no_feature += 1
            elif len(hits) > 1:
                write_to_samout(r, "__ambiguous[" + '+'.join(hits) + "]")
                n_ambiguous += 1
            else:
                feature_id = next(iter(hits))
                write_to_samout(r, feature_id)
                counts[feature_id] += 1
        except UnknownChrom:
            write_to_samout(r, "__no_feature")
            n_no_feature += 1

    if not quiet:
        what = "alignment pairs" if pe_mode else "alignments "
        sys.stderr.write("%d SAM %s processed.\n" % (processed, what))

    for name in sorted(counts):
        print("%s\t%d" % (name, counts[name]))
    print("__no_feature\t%d" % n_no_feature)
    print("__ambiguous\t%d" % n_ambiguous)
    print("__too_low_aQual\t%d" % n_low_qual)
    print("__not_aligned\t%d" % n_unaligned)
    print("__alignment_not_unique\t%d" % n_multi)
Exemplo n.º 13
0
def count_reads(sam_filename, features, counts, samtype, order, stranded,
                overlap_mode, quiet, minaqual, samout):
    """Count the alignments overlapping each feature and print the table.

    Records are read from ``sam_filename`` (standard input when it is
    ``"-"``).  Whether the input is paired-end is taken from the
    ``paired_end`` attribute of the first record.  Progress and error
    messages go to stderr; the ``feature<TAB>count`` table followed by the
    five ``__*`` summary rows goes to stdout.

    Args:
        sam_filename: Path of the alignment file, or ``"-"`` for stdin.
        features: Store exposing ``chrom_vectors`` and returning, for an
            interval key, an object whose ``steps()`` yields
            ``(interval, set_of_feature_ids)`` pairs (presumably an
            ``HTSeq.GenomicArrayOfSets`` -- confirm with the caller).
        counts: Mapping feature id -> count, incremented in place.
        samtype: ``"sam"`` or ``"bam"``.
        order: ``"name"`` or ``"pos"``; picks the pairing routine and is
            only consulted for paired-end input.
        stranded: ``"reverse"`` inverts the strand of the first mate (or the
            single read); any other value inverts the second mate instead.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        quiet: Suppress the stderr progress messages when true.
        minaqual: Minimum ``aQual`` a record needs to be counted.
        samout: Path of an annotated SAM output file; ``""`` disables it.

    Raises:
        ValueError: Unknown ``samtype`` (checked before ``samout`` is
            opened) or, for paired-end input, unknown ``order``.
        SystemExit: Unknown ``overlap_mode``; raised when the first record
            reaches the overlap step.
    """
    def write_to_samout(r, assignment):
        # Echo the original SAM line(s) of ``r`` with an XF:Z tag appended.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() + "\tXF:Z:" +
                                 assignment + "\n")

    def matched_intervals(read, invert):
        # Reference intervals of the non-empty "M" CIGAR operations.
        ops = (co for co in read.cigar if co.type == "M" and co.size > 0)
        if invert:
            return (invert_strand(co.ref_iv) for co in ops)
        return (co.ref_iv for co in ops)

    def read_intervals(r, invert_first):
        # Intervals of a single read, or of both aligned mates of a pair;
        # the second mate always gets the opposite strand treatment.
        if not pe_mode:
            return matched_intervals(r, invert_first)
        parts = []
        if r[0] is not None and r[0].aligned:
            parts.append(matched_intervals(r[0], invert_first))
        if r[1] is not None and r[1].aligned:
            parts.append(matched_intervals(r[1], not invert_first))
        return itertools.chain(*parts)

    def overlapping_features(iv_seq):
        # Feature-id set for the intervals under ``overlap_mode``; may be
        # None in the intersection modes.  Raises UnknownChrom for an
        # interval on a chromosome that ``features`` does not know.
        if overlap_mode == "union":
            fs = set()
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for _, fs2 in features[iv].steps():
                    fs = fs.union(fs2)
            return fs
        if overlap_mode in ("intersection-strict", "intersection-nonempty"):
            fs = None
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for _, fs2 in features[iv].steps():
                    # Empty steps only constrain the result in strict mode.
                    if len(fs2) > 0 or overlap_mode == "intersection-strict":
                        if fs is None:
                            fs = fs2.copy()
                        else:
                            fs = fs.intersection(fs2)
            return fs
        sys.exit("Illegal overlap mode.")

    # Validate the format before opening the output so a bad argument does
    # not leave an open (and truncated) samout file behind.
    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    samoutfile = open(samout, "w") if samout != "" else None
    try:
        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                # Peek only: iterating the reader again restarts the file.
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be re-read, so push the peeked record back.
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except Exception:
            sys.stderr.write(
                "Error occured when reading beginning of SAM/BAM file.\n")
            raise

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
                else:
                    raise ValueError("Illegal order specified.")
            invert_first = stranded == "reverse"
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write(
                        "%d SAM alignment record%s processed.\n" %
                        (i, "s" if not pe_mode else " pairs"))
                i += 1

                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        # No NH tag: treat the read as uniquely aligned.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue
                else:
                    first_ok = r[0] is not None and r[0].aligned
                    second_ok = r[1] is not None and r[1].aligned
                    if not (first_ok or second_ok):
                        write_to_samout(r, "__not_aligned")
                        notaligned += 1
                        continue
                    try:
                        if ((r[0] is not None
                             and r[0].optional_field("NH") > 1)
                                or (r[1] is not None
                                    and r[1].optional_field("NH") > 1)):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if ((r[0] and r[0].aQual < minaqual)
                            or (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue

                try:
                    fs = overlapping_features(read_intervals(r, invert_first))
                except UnknownChrom:
                    write_to_samout(r, "__no_feature")
                    empty += 1
                    continue

                if fs is None or len(fs) == 0:
                    write_to_samout(r, "__no_feature")
                    empty += 1
                elif len(fs) > 1:
                    write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
                    ambiguous += 1
                else:
                    feature_id = next(iter(fs))
                    write_to_samout(r, feature_id)
                    counts[feature_id] += 1
        except Exception:
            # Exception (not a bare except) so that sys.exit() and Ctrl-C
            # are not reported as SAM input errors.
            sys.stderr.write(
                "Error occured when processing SAM input (%s):\n" %
                read_seq_file.get_line_number_string())
            raise
    finally:
        # Close the annotated output even when reading or counting failed.
        if samoutfile is not None:
            samoutfile.close()

    if not quiet:
        sys.stderr.write(
            "%d SAM %s processed.\n" %
            (i, "alignments " if not pe_mode else "alignment pairs"))

    for fn in sorted(counts.keys()):
        print("%s\t%d" % (fn, counts[fn]))
    print("__no_feature\t%d" % empty)
    print("__ambiguous\t%d" % ambiguous)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
Exemplo n.º 14
0
def count_reads(sam_filename, features, counts, samtype, order, forward,
                reverse, overlap_mode, quiet, minaqual, samout, directory):
    """Count alignments per feature for the forward and/or reverse strand.

    Records are read from ``sam_filename`` (standard input when it is
    ``"-"``); paired-end handling is decided from the ``paired_end``
    attribute of the first record.  For every enabled strand a private copy
    of ``counts`` is filled and written to a file obtained from
    ``brenninc_utils.create_new_file`` (suffix ``_forward_count`` /
    ``_reverse_count``); a short summary is printed to stdout.  ``counts``
    itself is not modified.

    Args:
        sam_filename: Path of the alignment file, or ``"-"`` for stdin.
        features: Store exposing ``chrom_vectors`` and returning, for an
            interval key, an object whose ``steps()`` yields
            ``(interval, set_of_feature_ids)`` pairs (presumably an
            ``HTSeq.GenomicArrayOfSets`` -- confirm with the caller).
        counts: Mapping feature id -> count used as the starting table for
            each strand (shallow-copied).
        samtype: ``"sam"``, ``"bam"``, or None to call ``detect_sam_type``.
        order: ``"name"`` or ``"pos"``; picks the pairing routine
            (paired-end only).
        forward: Count with the first mate / single read as aligned and the
            second mate strand-inverted.
        reverse: Count with the first mate / single read strand-inverted
            and the second mate as aligned.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        quiet: Suppress the stderr progress messages when true.
        minaqual: Minimum ``aQual`` a record needs to be counted.
        samout: Path of an annotated SAM output file; ``""`` disables it.
        directory: Passed to ``create_new_file`` as ``outputdir``.

    Raises:
        ValueError: Unknown ``samtype`` (checked before ``samout`` is
            opened) or, for paired-end input, unknown ``order``.
        SystemExit: Unknown ``overlap_mode``; raised when the first record
            reaches the overlap step of an enabled strand.
    """
    def write_to_samout(r, assignment):
        # Echo the original SAM line(s) of ``r`` with an XF:Z tag appended.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() + "\tXF:Z:" +
                                 assignment + "\n")

    def matched_intervals(read, invert):
        # Reference intervals of the non-empty "M" CIGAR operations.
        ops = (co for co in read.cigar if co.type == "M" and co.size > 0)
        if invert:
            return (invert_strand(co.ref_iv) for co in ops)
        return (co.ref_iv for co in ops)

    def read_intervals(r, invert_first):
        # Intervals of a single read, or of both aligned mates of a pair;
        # the second mate always gets the opposite strand treatment.
        if not pe_mode:
            return matched_intervals(r, invert_first)
        parts = []
        if r[0] is not None and r[0].aligned:
            parts.append(matched_intervals(r[0], invert_first))
        if r[1] is not None and r[1].aligned:
            parts.append(matched_intervals(r[1], not invert_first))
        return itertools.chain(*parts)

    def overlapping_features(iv_seq):
        # Feature-id set for the intervals under ``overlap_mode``; may be
        # None in the intersection modes.  Raises UnknownChrom for an
        # interval on a chromosome that ``features`` does not know.
        if overlap_mode == "union":
            fs = set()
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for _, fs2 in features[iv].steps():
                    fs = fs.union(fs2)
            return fs
        if overlap_mode in ("intersection-strict", "intersection-nonempty"):
            fs = None
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for _, fs2 in features[iv].steps():
                    # Empty steps only constrain the result in strict mode.
                    if len(fs2) > 0 or overlap_mode == "intersection-strict":
                        if fs is None:
                            fs = fs2.copy()
                        else:
                            fs = fs.intersection(fs2)
            return fs
        sys.exit("Illegal overlap mode.")

    if samtype is None:
        samtype = detect_sam_type(sam_filename)

    # Validate the format before opening the output so a bad argument does
    # not leave an open (and truncated) samout file behind.
    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    samoutfile = open(samout, "w") if samout != "" else None
    try:
        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                # Peek only: iterating the reader again restarts the file.
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be re-read, so push the peeked record back.
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except Exception:
            sys.stderr.write("Error occured when reading beginning "
                             "of SAM/BAM file.\n")
            raise

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
                else:
                    raise ValueError("Illegal order specified.")

            # One tally per enabled strand, forward first.  Keeping them in
            # one structure means both strands run through the same code
            # and a strand that is switched off is never touched.
            strands = []
            if forward:
                strands.append({"label": "forward", "invert_first": False,
                                "empty": 0, "ambiguous": 0,
                                "counts": copy.copy(counts)})
            if reverse:
                strands.append({"label": "reverse", "invert_first": True,
                                "empty": 0, "ambiguous": 0,
                                "counts": copy.copy(counts)})
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write(
                        "%d SAM alignment record%s processed.\n" %
                        (i, "s" if not pe_mode else " pairs"))
                i += 1

                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        # No NH tag: treat the read as uniquely aligned.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue
                else:
                    first_ok = r[0] is not None and r[0].aligned
                    second_ok = r[1] is not None and r[1].aligned
                    if not (first_ok or second_ok):
                        write_to_samout(r, "__not_aligned")
                        notaligned += 1
                        continue
                    try:
                        if ((r[0] is not None
                             and r[0].optional_field("NH") > 1)
                                or (r[1] is not None
                                    and r[1].optional_field("NH") > 1)):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if ((r[0] and r[0].aQual < minaqual)
                            or (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue

                # Resolve every enabled strand before recording anything, so
                # an unknown chromosome yields a single __no_feature line.
                try:
                    found = [
                        overlapping_features(
                            read_intervals(r, strand["invert_first"]))
                        for strand in strands
                    ]
                except UnknownChrom:
                    write_to_samout(r, "__no_feature")
                    for strand in strands:
                        strand["empty"] += 1
                    continue

                # NOTE(review): with both strands enabled the record is
                # written to samout once per strand, as in the original.
                for strand, fs in zip(strands, found):
                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "__no_feature")
                        strand["empty"] += 1
                    elif len(fs) > 1:
                        write_to_samout(
                            r, "__ambiguous[" + '+'.join(fs) + "]")
                        strand["ambiguous"] += 1
                    else:
                        feature_id = next(iter(fs))
                        write_to_samout(r, feature_id)
                        strand["counts"][feature_id] += 1
        except Exception:
            # Exception (not a bare except) so that sys.exit() and Ctrl-C
            # are not reported as SAM input errors.
            sys.stderr.write(
                "Error occured when processing SAM input (%s):\n" %
                read_seq_file.get_line_number_string())
            raise
    finally:
        # Close the annotated output even when reading or counting failed.
        if samoutfile is not None:
            samoutfile.close()

    if not quiet:
        sys.stderr.write(
            "%d SAM %s processed.\n" %
            (i, "alignments " if not pe_mode else "alignment pairs"))

    for strand in strands:
        label = strand["label"]
        title = label.capitalize()
        strand_counts = strand["counts"]
        output = brenninc_utils.create_new_file(sam_filename,
                                                "_%s_count" % label,
                                                outputdir=directory,
                                                extension="txt",
                                                gzipped=False)
        print("%s written to %s" % (title, output))
        # NOTE(review): this counts every key of the table, including
        # features whose count is zero -- confirm that is what "features
        # with alignment" is meant to report.
        used_features_count = 0
        used_features_sum = 0
        with open(output, "w") as output_file:
            for fn in sorted(strand_counts.keys()):
                output_file.write("%s\t%d\n" % (fn, strand_counts[fn]))
                used_features_count += 1
                used_features_sum += strand_counts[fn]
            output_file.write("__no_feature\t%d\n" % strand["empty"])
            output_file.write("__ambiguous\t%d\n" % strand["ambiguous"])
            output_file.write("__too_low_aQual\t%d\n" % lowqual)
            output_file.write("__not_aligned\t%d\n" % notaligned)
            output_file.write("__alignment_not_unique\t%d\n" % nonunique)
        print("%s features with alignment\t%d" % (title, used_features_count))
        print("%s alignments asigned to feature\t%d" %
              (title, used_features_sum))
        print("__%s_no_feature\t%d" % (label, strand["empty"]))
        print("__%s_ambiguous\t%d" % (label, strand["ambiguous"]))
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
Exemplo n.º 15
0
def count_reads(features, counts, pe_mode, read_seq, order, stranded,
                overlap_mode, quiet, minaqual, write_to_samout):
    """Tally alignments per feature and print the resulting count table.

    Records from ``read_seq`` that are unaligned, carry an ``NH`` value
    above 1, or have ``aQual`` below ``minaqual`` are only tallied under the
    matching ``__*`` category.  The remaining ones are looked up in
    ``features``; a record that resolves to exactly one feature id
    increments ``counts`` for that id.  Output is ``feature<TAB>count`` for
    every key of ``counts`` followed by the five category rows on stdout,
    with progress reports on stderr.

    Args:
        features: Store exposing ``chrom_vectors`` and returning, for an
            interval key, an object whose ``steps()`` yields
            ``(interval, set_of_feature_ids)`` pairs.
        counts: Mapping feature id -> count, incremented in place.
        pe_mode: True when records must be paired before counting.
        read_seq: Iterable of alignment records.
        order: ``"name"`` or ``"pos"``; picks the pairing routine
            (paired-end only).
        stranded: ``"reverse"`` inverts the strand of the first mate (or the
            single read); any other value inverts the second mate instead.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        quiet: Suppress the stderr progress messages when true.
        minaqual: Minimum ``aQual`` a record needs to be counted.
        write_to_samout: Callable ``(record, assignment)`` invoked once per
            record with the assignment string.

    Raises:
        ValueError: ``order`` is neither ``"name"`` nor ``"pos"`` in
            paired-end mode.
        SystemExit: ``overlap_mode`` is not one of the three known values
            (raised when the first record reaches the overlap step).
    """
    categories = ("__no_feature", "__ambiguous", "__too_low_aQual",
                  "__not_aligned", "__alignment_not_unique")
    tally = dict.fromkeys(categories, 0)

    def keep_strand(op):
        return op.ref_iv

    def swap_strand(op):
        return invert_strand(op.ref_iv)

    # Decide once how each mate's intervals are oriented.
    if stranded != "reverse":
        view_first, view_second = keep_strand, swap_strand
    else:
        view_first, view_second = swap_strand, keep_strand

    def blocks(aln, view):
        # Intervals of the non-empty "M" CIGAR operations, seen via ``view``.
        return (view(op) for op in aln.cigar
                if op.type == "M" and op.size > 0)

    if pe_mode:
        if order == "pos":
            read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
        elif order == "name":
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        else:
            raise ValueError("Illegal order specified.")

    seen = 0
    for r in read_seq:
        if seen > 0 and not quiet and seen % 100000 == 0:
            suffix = " pairs" if pe_mode else "s"
            sys.stderr.write(
                "%d SAM alignment record%s processed.\n" % (seen, suffix))
        seen += 1

        if not pe_mode:
            if not r.aligned:
                tally["__not_aligned"] += 1
                write_to_samout(r, "__not_aligned")
                continue
            try:
                if r.optional_field("NH") > 1:
                    tally["__alignment_not_unique"] += 1
                    write_to_samout(r, "__alignment_not_unique")
                    continue
            except KeyError:
                # No NH tag: the read is treated as uniquely aligned.
                pass
            if r.aQual < minaqual:
                tally["__too_low_aQual"] += 1
                write_to_samout(r, "__too_low_aQual")
                continue
            iv_seq = blocks(r, view_first)
        else:
            first, second = r[0], r[1]
            if first is not None and first.aligned:
                iv_seq = blocks(first, view_first)
            else:
                iv_seq = tuple()
            if second is not None and second.aligned:
                iv_seq = itertools.chain(iv_seq, blocks(second, view_second))
            elif first is None or not first.aligned:
                # Neither mate is aligned.
                write_to_samout(r, "__not_aligned")
                tally["__not_aligned"] += 1
                continue
            try:
                first_multi = (first is not None
                               and first.optional_field("NH") > 1)
                if first_multi or (second is not None
                                   and second.optional_field("NH") > 1):
                    tally["__alignment_not_unique"] += 1
                    write_to_samout(r, "__alignment_not_unique")
                    continue
            except KeyError:
                pass
            first_low = first and first.aQual < minaqual
            if first_low or (second and second.aQual < minaqual):
                tally["__too_low_aQual"] += 1
                write_to_samout(r, "__too_low_aQual")
                continue

        try:
            if overlap_mode == "union":
                fs = set()
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom
                    fs.update(*(step[1] for step in features[iv].steps()))
            elif (overlap_mode == "intersection-nonempty"
                  or overlap_mode == "intersection-strict"):
                fs = None
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom
                    for step in features[iv].steps():
                        candidates = step[1]
                        # Empty steps only constrain the result in strict mode.
                        if (overlap_mode != "intersection-strict"
                                and len(candidates) == 0):
                            continue
                        if fs is None:
                            fs = candidates.copy()
                        else:
                            fs = fs.intersection(candidates)
            else:
                sys.exit("Illegal overlap mode.")

            n_hits = 0 if fs is None else len(fs)
            if n_hits == 0:
                write_to_samout(r, "__no_feature")
                tally["__no_feature"] += 1
            elif n_hits > 1:
                write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
                tally["__ambiguous"] += 1
            else:
                only = list(fs)[0]
                write_to_samout(r, only)
                counts[only] += 1
        except UnknownChrom:
            write_to_samout(r, "__no_feature")
            tally["__no_feature"] += 1

    if not quiet:
        noun = "alignments " if not pe_mode else "alignment pairs"
        sys.stderr.write("%d SAM %s processed.\n" % (seen, noun))

    for fn in sorted(counts.keys()):
        print("%s\t%d" % (fn, counts[fn]))
    for tag in categories:
        print("%s\t%d" % (tag, tally[tag]))
Exemplo n.º 16
0
import HTSeq
import collections

gtf_file = HTSeq.GFF_Reader("p_stutzeri_28a24_and_pMPPla107.gtf")
cds = HTSeq.GenomicArrayOfSets("auto", stranded=True)

for feature in gtf_file:
    if feature.type == "CDS":
        cds[feature.iv] += feature.attr["gene_id"]

almnt_file = HTSeq.BAM_Reader('pair.386_48hr_a_AATGTTGC_starAligned.out.bam')
counts = collections.Counter()

for bundle in HTSeq.pair_SAM_alignments(almnt_file, bundle =True):
    if len(bundle) != 1:
        continue # Skip multiple alignments
    first_almnt, second_almnt = bundle[0] #extract pair
    if not first_almnt.aligned and second_almnt.aligned:
        count["_unmapped"] += 1
        continue
    gene_ids = set()
    for iv, val in features[left_almnt.iv].steps():
        gene_ids |= val
    for iv, val in features[right_almnt.iv].steps():
        gene_ids |= val
    if len(gene_ids) == 1:
        gene_id = list(gene_ids)[0]
        counts[gene_id] += 1
    elif len(gene_ids) == 0:
        counts["_no_feature"] += 1
    else:
Exemplo n.º 17
0
def count_reads(sam_filename, features, counts, samtype, order, forward,
                reverse, overlap_mode, quiet, minaqual, samout, directory):
    """Assign SAM/BAM alignments to features and write per-strand counts.

    Each read (or read pair, when the first record is flagged paired-end)
    is dropped if it is not aligned, has an ``NH`` field above 1, or has an
    ``aQual`` below ``minaqual``.  For the remaining reads the reference
    intervals of the non-empty 'M' cigar operations are looked up in
    ``features`` and the read is tallied as a unique feature hit,
    ``__no_feature`` or ``__ambiguous`` -- independently for the forward
    and the reverse strand interpretation, whichever are enabled.

    One count file per enabled strand is created through
    ``brenninc_utils.create_new_file`` and a short summary is printed.

    Args:
        sam_filename: Path of the alignment file, or "-" to read stdin.
        features: Interval-indexed feature sets; must provide
            ``chrom_vectors`` and ``features[iv].steps()`` (presumably an
            ``HTSeq.GenomicArrayOfSets`` -- confirm against the caller).
        counts: Mapping of feature id to count; copied with ``copy.copy``
            once per enabled strand.  Presumably pre-populated with every
            feature id -- verify against the caller.
        samtype: "sam", "bam", or None to ask ``detect_sam_type``.
        order: "name" or "pos"; only consulted for paired-end input.
        forward: If true, count using the read strand as reported (the
            second mate of a pair is strand-inverted).
        reverse: If true, count using the inverted read strand (the second
            mate of a pair is used as reported).
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        quiet: If true, suppress progress messages on stderr.
        minaqual: Reads with ``aQual`` below this value are skipped.
        samout: Path of an annotated SAM output file; "" disables it.
        directory: Passed as ``outputdir`` to
            ``brenninc_utils.create_new_file``.

    Raises:
        ValueError: If ``samtype`` or ``order`` is not recognised.
        SystemExit: If ``overlap_mode`` is not recognised.
    """

    def write_to_samout(r, assignment):
        """Write the read(s) in ``r`` to the SAM output with an XF:Z tag."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def matched_intervals(almnt, invert):
        """Return a lazy sequence of the 'M' reference intervals of almnt."""
        ops = (co for co in almnt.cigar if co.type == "M" and co.size > 0)
        if invert:
            return (invert_strand(co.ref_iv) for co in ops)
        return (co.ref_iv for co in ops)

    def overlapping_features(iv_seq):
        """Combine the feature sets under ``iv_seq`` per ``overlap_mode``.

        Returns a set for "union"; for the intersection modes a set, or
        None when no step contributed.  Raises UnknownChrom for an interval
        on a chromosome that ``features`` does not know.
        """
        if overlap_mode == "union":
            fs = set()
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for iv2, fs2 in features[iv].steps():
                    fs = fs.union(fs2)
            return fs
        # Intersection modes: "strict" also intersects with empty steps,
        # "nonempty" ignores them.
        strict = overlap_mode == "intersection-strict"
        fs = None
        for iv in iv_seq:
            if iv.chrom not in features.chrom_vectors:
                raise UnknownChrom
            for iv2, fs2 in features[iv].steps():
                if len(fs2) > 0 or strict:
                    if fs is None:
                        fs = fs2.copy()
                    else:
                        fs = fs.intersection(fs2)
        return fs

    def write_strand_report(strand, strand_counts, empty, ambiguous):
        """Write the count file for one strand and print its summary."""
        output = brenninc_utils.create_new_file(sam_filename,
                                                "_%s_count" % strand,
                                                outputdir=directory,
                                                extension="txt",
                                                gzipped=False)
        title = strand.capitalize()
        # NOTE(review): every key of strand_counts is counted here,
        # including features whose count is zero -- confirm that is the
        # intended meaning of "features with alignment".
        used_features_count = 0
        used_features_sum = 0
        # Single-argument parenthesised print behaves the same as the
        # Python 2 print statement.
        print("%s written to %s" % (title, output))
        with open(output, "w") as output_file:
            for fn in sorted(strand_counts.keys()):
                output_file.write("%s\t%d\n" % (fn, strand_counts[fn]))
                used_features_count += 1
                used_features_sum += strand_counts[fn]
            output_file.write("__no_feature\t%d\n" % empty)
            output_file.write("__ambiguous\t%d\n" % ambiguous)
            output_file.write("__too_low_aQual\t%d\n" % lowqual)
            output_file.write("__not_aligned\t%d\n" % notaligned)
            output_file.write("__alignment_not_unique\t%d\n" % nonunique)
        print("%s features with alignment\t%d" % (title, used_features_count))
        print("%s alignments asigned to feature\t%d"
              % (title, used_features_sum))
        print("__%s_no_feature\t%d" % (strand, empty))
        print("__%s_ambiguous\t%d" % (strand, ambiguous))

    overlap_modes = ("union", "intersection-strict", "intersection-nonempty")

    if samout != "":
        samoutfile = open(samout, "w")
    else:
        samoutfile = None

    # try/finally so the SAM output file is closed even when reading or
    # processing the input fails.
    try:
        if samtype is None:
            samtype = detect_sam_type(sam_filename)

        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError("Unknown input format %s specified." % samtype)

        # Peek at the first record to find out whether the data is
        # paired-end.  stdin cannot be re-read, so the peeked record is
        # chained back in front of the remaining ones.
        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write("Error occured when reading beginning "
                             "of SAM/BAM file.\n")
            raise

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                        read_seq)
                else:
                    raise ValueError("Illegal order specified.")
            empty_forward = 0
            ambiguous_forward = 0
            empty_reverse = 0
            ambiguous_reverse = 0
            if forward:
                counts_forward = copy.copy(counts)
            if reverse:
                counts_reverse = copy.copy(counts)
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write(
                        "%d SAM alignment record%s processed.\n" %
                        (i, "s" if not pe_mode else " pairs"))

                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        # No NH field: treat the alignment as unique.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue
                    if forward:
                        iv_seq_for = matched_intervals(r, False)
                    if reverse:
                        iv_seq_rev = matched_intervals(r, True)
                else:
                    if r[0] is not None and r[0].aligned:
                        if forward:
                            iv_seq_for = matched_intervals(r[0], False)
                        if reverse:
                            iv_seq_rev = matched_intervals(r[0], True)
                    else:
                        iv_seq_rev = tuple()
                        iv_seq_for = tuple()
                    if r[1] is not None and r[1].aligned:
                        # The second mate is used with the opposite strand
                        # handling to the first mate.
                        if forward:
                            iv_seq_for = itertools.chain(
                                iv_seq_for, matched_intervals(r[1], True))
                        if reverse:
                            iv_seq_rev = itertools.chain(
                                iv_seq_rev, matched_intervals(r[1], False))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "__not_aligned")
                            notaligned += 1
                            continue
                    try:
                        if ((r[0] is not None and
                             r[0].optional_field("NH") > 1) or
                                (r[1] is not None and
                                 r[1].optional_field("NH") > 1)):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        # No NH field: treat the pair as unique.
                        pass
                    if ((r[0] and r[0].aQual < minaqual) or
                            (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue

                try:
                    if overlap_mode not in overlap_modes:
                        sys.exit("Illegal overlap mode.")
                    # Resolve both strands before tallying either, so an
                    # unknown chromosome is recorded exactly once below.
                    if forward:
                        fs_for = overlapping_features(iv_seq_for)
                    if reverse:
                        fs_rev = overlapping_features(iv_seq_rev)
                    if forward:
                        if fs_for is None or len(fs_for) == 0:
                            write_to_samout(r, "__no_feature")
                            empty_forward += 1
                        elif len(fs_for) > 1:
                            write_to_samout(r, "__ambiguous[" +
                                            '+'.join(fs_for) + "]")
                            ambiguous_forward += 1
                        else:
                            write_to_samout(r, list(fs_for)[0])
                            counts_forward[list(fs_for)[0]] += 1
                    if reverse:
                        if fs_rev is None or len(fs_rev) == 0:
                            write_to_samout(r, "__no_feature")
                            empty_reverse += 1
                        elif len(fs_rev) > 1:
                            write_to_samout(r, "__ambiguous[" +
                                            '+'.join(fs_rev) + "]")
                            ambiguous_reverse += 1
                        else:
                            write_to_samout(r, list(fs_rev)[0])
                            counts_reverse[list(fs_rev)[0]] += 1
                except UnknownChrom:
                    write_to_samout(r, "__no_feature")
                    if forward:
                        empty_forward += 1
                    if reverse:
                        empty_reverse += 1

        except:
            sys.stderr.write(
                "Error occured when processing SAM input (%s):\n" %
                read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d SAM %s processed.\n" %
                             (i, "alignments "
                              if not pe_mode else "alignment pairs"))
    finally:
        if samoutfile is not None:
            samoutfile.close()

    if forward:
        write_strand_report("forward", counts_forward,
                            empty_forward, ambiguous_forward)
    if reverse:
        write_strand_report("reverse", counts_reverse,
                            empty_reverse, ambiguous_reverse)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
def main():
    exe_parser = argparse.ArgumentParser()
    exe_parser.add_argument('infile', type=str, help='<input file> [(full path), -b/-s required]')
    exe_parser.add_argument("-u", "--not_aligned",
                            help="output reads that were not aligned, including those that were aligned multiple times(flat file).",
                            type=str)
    exe_parser.add_argument("-s", "--samout", help="output not aligned reads to [file path].", type=str)
    exe_parser.add_argument("-b", "--ambiguous_out", help="output a fasta file of ambiguous hits [file path].",
                            type=str)
    exe_parser.add_argument("-v", "--verbose", help="verbose. (default = TRUE).", action="store_true")
    exe_parser.add_argument("gff", help="<gff file> [(full path)]", type=str)
    exe_parser.add_argument("-f", "--fasta", help="output fasta file of hits (full path).", type=str)
    exe_parser.add_argument("-m", "--min_read_length", help="minimal read length to consider. (default = 60b).",
                            type=int)
    exe_parser.add_argument("-i", "--min_id", help="minimal percent id of hit to consider. (default = 80).", type=int)
    exe_parser.add_argument("-z", "--min_score", help="minimal aligner score to consider. (default = 0).", type=int)
    exe_parser.add_argument("-c", "--max_clip",
                            help="proportion of bases clipped from read for alignment. (default = 0.3).", type=float)
    exe_parser.add_argument("--stranded", help="whether the data is stranded (y, n, reverse). (default = n).", type=str,
                            choices=["y", "n", "reverse"], default="n")
    exe_parser.add_argument("--idattr", help="GFF attribute to be used as feature ID. (default = GeneID).", type=str)
    exe_parser.add_argument("--type", help="feature type (3rd column in GFF file) to be used. (default = CDS).",
                            type=str)
    exe_parser.add_argument("-a", "--minaqual", help="min. alignment quality (default = 0).", type=str)
    exe_parser.add_argument("-p", "--paired_end_mode",
                            help="input is paired end sorted by name (n) or position (p) . (default = p).", type=str,
                            choices=["p", "n"], default="p")
    exe_parser.add_argument("-o", "--out", help="name of counts output file.", type=str)
    args = exe_parser.parse_args()

    if args.paired_end_mode == 'p':
        paired_end = True
        pe_order = 'p'
    elif args.paired_end_mode == 'n':
        paired_end = True
        pe_order = 'n'

    if args.infile:
        try:
            if args.infile == '-':  # get sam on a stream
                seqfile = HTSeq.SAM_Reader(sys.stdin)
                if args.paired_end_mode:
                    # read_seq_iter = iter(seqfile)
                    # first_read = read_seq_iter.next()
                    # read_seq = itertools.chain([first_read], read_seq_iter)
                    # reader = HTSeq.pair_SAM_alignments(read_seq)
                    if pe_order == 'p':
                        reader = HTSeq.pair_SAM_alignments_with_buffer(seqfile)
                    elif pe_order == 'n':
                        reader = HTSeq.pair_SAM_alignments(seqfile)  # (read_seq)
                else:
                    reader = seqfile
            elif args.infile != '-':
                seqfile = HTSeq.SAM_Reader(args.infile)
                if args.paired_end_mode:
                    read_seq_iter = iter(seqfile)
                    first_read = read_seq_iter.next()
                    read_seq = itertools.chain([first_read], read_seq_iter)
                    reader = HTSeq.pair_SAM_alignments(read_seq)
                    if pe_order == 'p':
                        reader = HTSeq.pair_SAM_alignments_with_buffer(reader)
                    elif pe_order == 'n':
                        reader = HTSeq.pair_SAM_alignments(reader)
                else:
                    reader = seqfile
                    # fread_seq_iter = iter(reader)
                    # first_read = iter(read_seq).next()
            elif args.infile == '':
                print "no input file type given. exiting..."
                sys.exit(1)
        except:
            print "failed processing SAM/BAM file"
            raise
    elif not args.infile:
        print "no input file given. exiting..."
        sys.exit(1)

    if args.gff:
        gff_file = args.gff
    else:
        print "no gff file given. exiting..."
        sys.exit(1)

    if args.verbose:
        verbose = True
    else:
        verbose = False

    if args.min_read_length:
        min_read_len = args.min_read_length
    else:
        min_read_len = 60  # default read length

    if args.max_clip:
        max_clip_ = float(args.max_clip)
    else:
        max_clip_ = float(0.3)  # default read length

    if args.min_id:
        min_id = float(args.min_id)
    else:
        min_id = float(80)

    if args.min_score:
        min_score = int(args.min_score)
    else:
        min_score = 0

    if args.stranded == 'n':
        stranded = 'no'
    elif args.stranded == 'y':
        stranded = 'yes'
    elif args.stranded == 'reverse':
        stranded = 'reverse'

    if args.minaqual:
        minaqual = args.minaqual
    else:
        minaqual = 0

    if args.idattr:
        id_attribute = args.idattr
    else:
        id_attribute = "GeneID"
    if args.type:
        feature_type = args.type
    else:
        feature_type = 'CDS'

    # ###
    # parse GFF file
    features, counts = gff_reader(gff_file, feature_type, id_attribute, verbose, stranded)
    # ###
    if args.samout:
        samoutfile = open(args.samout, "w")
    else:
        samoutfile = None
    if args.ambiguous_out:
        ambiguousfile = open(args.ambiguous_out, "w")
    else:
        ambiguousfile = None
    if args.fasta:
        fastafile = open(args.fasta, "w")
    else:
        fastafile = None
    if args.not_aligned:
        not_aligned_file = open(args.not_aligned, "w")
    else:
        not_aligned_file = None
    if args.out:
        outfile = open(args.out, "w")
    else:
        outfile = None

        # if outfile and samoutfile and  ambiguousfile and fastafile and not_aligned_file == None:
        # print "None of the possible output file options specified. exiting..."
        # sys.exit(1)
    # #######
    # decalre counter variables
    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0
    # #######

    read_counter = 0
    for alignment in reader:  # for alignment entry (line in fact) in sam file
        # iv_seq
        # print alignment
        if not paired_end:
            if read_counter % 1000000 == 0 and verbose:
                if verbose:
                    print read_counter, 'non paired-end alignments processed'
            read_name = alignment.read.name
            # read = alignment.read  # READ. Note that def invert_strand( iv ):
            read_seq = alignment.read.seq
            read_length = len(alignment.read.seq)
            if not alignment.aligned:  # check if read is aligned to ref sequence
                if alignment is not None:
                    notaligned += 1
                    if args.samout:
                        write_to_samout(samoutfile, paired_end, alignment, "not_aligned")
                    if args.not_aligned:
                        not_aligned_file.write(read_name + '\t' + 'not_aligned' + '\n')
                        # continue
            elif alignment.aligned:

                opt_fields = alignment.optional_fields
                # flag = alignment.flag
                cigar_string = parse_cigar(alignment.original_sam_line.split('\t')[
                    5])  # just the cigar string without the fancy HTseq additions
                cigar_soft_clipped, cigar_m, cigar_insertions, cigar_deletions, cigar_insertions = parse_cigar_alignment(cigar_string)  # get alignment data from cigar string
                score, md_matches, md_deletions, md_mismatches = parse_opt_fields(
                    opt_fields)  # get alignment data from md string
                percent_id = 100.0 * (
                    float(md_matches) / (float(read_length - cigar_soft_clipped + cigar_insertions + cigar_deletions)))
                if alignment[0] is not None:  # check if read is aligned to ref sequence
                    if alignment.optional_field("NH") > 1:  # check if read is mapped more than once
                        # By default these reads are discarded. CHANGE?
                        if args.samout:
                            write_to_samout(samoutfile, paired_end, alignment, "alignment_not_unique")
                        nonunique += 1
                        if args.not_aligned:
                            not_aligned_file.write(read_name + '\t' + 'alignment_not_unique' + '\n')
                            # continue
                    if alignment.aQual < minaqual:  # check quality. default is 0
                        lowqual += 1
                        if args.samout:
                            write_to_samout(samoutfile, paired_end, alignment, "too_low_aQual")
                        if args.not_aligned:
                            not_aligned_file.write(read_name + '\t' + 'too_low_aQual' + '\n')
                            # continue
                    clipped = (float(cigar_soft_clipped) / float(read_length))
                    if read_length >= min_read_len:
                        if (float(cigar_soft_clipped) / float(read_length)) <= max_clip_:
                            if score >= args.min_score:
                                if percent_id >= float(min_id):
                                    if stranded == "reverse":
                                        iv_seq = (
                                            (invert_strand(cigar_operation.ref_iv) for cigar_operation in
                                             alignment[1].cigar
                                             if cigar_operation.type == "M" and cigar_operation.size > 0))
                                    else:
                                        iv_seq = (cigar_operation.ref_iv for cigar_operation in alignment.cigar if
                                                  cigar_operation.type == "M" and cigar_operation.size > 0)
                                    iv_seq_good = True
                                    # collects hits to chromosomes/features.
                                    """
                                    cigarOperation in HTSeq:
                                    HTSeq.parse_cigar( "20M6I10M", 1000, "chr2", "+" ) #ref_iv == genomicInterval object
                                    of htSeq
                                    [< CigarOperation: 20 base(s) matched on ref iv chr2:[1000,1020)/+,query iv[0,20)>,
                                    < CigarOperation: 6 base(s) inserted on ref iv chr2:[1020,1020)/+,query iv[20,26)>,]
                                    """
                                    # if args.fasta:
                                    # fastafile.write('>' + read_name + '\n' + read_seq + '\n')

                                else:
                                    iv_seq_good = False
                                    if args.samout:
                                        write_to_samout(samoutfile, paired_end, alignment,
                                                        "percent_id_too_low=" + str(percent_id))
                                    if args.not_aligned:
                                        not_aligned_file.write(
                                            read_name + '\t' + 'percent_id_too_low=' + str(percent_id) + '\n')
                            else:
                                iv_seq_good = False
                                if args.samout:
                                    write_to_samout(samoutfile, paired_end, alignment,
                                                    'alignment_score_too_low=' + str(score))
                                if args.not_aligned:
                                    not_aligned_file.write(
                                        read_name + '\t' + 'alignment_score_too_low=' + str(score) + '\n')
                        else:
                            iv_seq_good = False
                            if args.samout:
                                write_to_samout(samoutfile, paired_end, alignment,
                                                'too_many_bases_clipped_from_read=' + str(cigar_soft_clipped))
                            if args.not_aligned:
                                not_aligned_file.write(read_name + '\t' + 'too_many_bases_clipped_from_read=' + str(
                                    cigar_soft_clipped) + '\n')
        elif paired_end:
            # print "read counter=", read_counter
            if read_counter % 100000 == 0 and verbose:
                if verbose:
                    print read_counter, 'alignment pairs processed'
            if (alignment[0] is None) or not alignment[0].aligned:
                notaligned += 1
                try:
                    read_1_name = alignment[0].read.name
                except:
                    read_1_name = 'None'
                if args.samout:
                    write_to_samout(samoutfile, paired_end, alignment, "not_aligned")
                if args.not_aligned:
                    not_aligned_file.write(read_1_name + '\t' + 'not_aligned' + '\n')
            elif (alignment[1] is None) or not alignment[1].aligned:
                notaligned += 1
                try:
                    read_2_name = alignment[1].read.name
                except:
                    read_2_name = 'None'
                if args.samout:
                    write_to_samout(samoutfile, paired_end, alignment, "not_aligned")
                if args.not_aligned:
                    not_aligned_file.write(read_2_name + '\t' + 'not_aligned' + '\n')
            else:
                # else:
                read_1_name = alignment[0].read.name
                # read_1 = alignment[0].read  #READ.
                read_1_length = len(alignment[0].read.seq)
                read_1_seq = alignment[0].read.seq
                read_2_name = alignment[1].read.name
                # read_2 = alignment[1].read  #READ.
                # read_2_length = len(alignment[1].read.seq)
                read_2_seq = alignment[1].read.seq
                iv_seq = tuple()
                if (alignment[0] is not None) and alignment[0].aligned:  # check if read is aligned to ref sequence
                    opt_1_fields = alignment[0].optional_fields
                    # flag_1 = alignment[0].flag
                    cigar_1_string = parse_cigar(alignment[0].original_sam_line.split('\t')[
                        5])  # just the cigar string without the fancy HTseq additions
                    cigar_1_soft_clipped, cigar_1_m, cigar_1_insertions, cigar_1_deletions, cigar_1_insertions = parse_cigar_alignment(
                        cigar_1_string)
                    score_1, md_1_matches, md_1_deletions, md_1_mismatches = parse_opt_fields(
                        opt_1_fields)  # get alignment data from md string
                    percent_1_id = (100.0 * ((float(md_1_matches) / (
                        float(read_1_length - cigar_1_soft_clipped + cigar_1_insertions + cigar_1_deletions)))))
                    clipped_1 = (float(cigar_1_soft_clipped) / float(read_1_length))
                    if int(read_1_length) >= int(min_read_len):
                        if (float(cigar_1_soft_clipped) / float(read_1_length)) <= float(max_clip_):

                            # if int(score_1) >= int(args.min_score):
                            if int(score_1) >= int(min_score):
                                # if float(percent_1_id) >= float(args.min_id):
                                if float(percent_1_id) >= float(min_id):
                                    if stranded == "reverse":
                                        iv_seq = itertools.chain(iv_seq, (invert_strand(cigar_operation.ref_iv) for
                                                                          cigar_operation in alignment[0].cigar if
                                                                          cigar_operation.type == "M" and cigar_operation.size > 0))
                                    else:
                                        iv_seq = itertools.chain(iv_seq, (cigar_operation.ref_iv for cigar_operation in
                                                                          alignment[0].cigar if
                                                                          cigar_operation.type == "M" and cigar_operation.size > 0))
                                    # if args.fasta:
                                    # fastafile.write('>' + read_1_name + '\n' + read_1_seq + '\n')
                                    iv_seq_good_1 = True

                                else:
                                    iv_seq_good_1 = False
                                    if args.samout:
                                        write_to_samout(samoutfile, paired_end, alignment,
                                                        "percent_id_too_low=" + str(percent_1_id))
                                    if args.not_aligned:
                                        not_aligned_file.write(
                                            read_1_name + '\t' + 'percent_id_too_low=' + str(percent_1_id) + '\n')
                            else:
                                iv_seq_good_1 = False
                                if args.samout:
                                    write_to_samout(samoutfile, paired_end, alignment,
                                                    'alignment_score_too_low=' + str(score_1))
                                if args.not_aligned:
                                    not_aligned_file.write(
                                        read_1_name + '\t' + 'alignment_score_too_low=' + str(score_1) + '\n')
                        else:
                            iv_seq_good = False
                            if args.samout:
                                write_to_samout(samoutfile, paired_end, alignment,
                                                'too_many_bases_clipped_from_read=' + str(cigar_1_soft_clipped))
                            if args.not_aligned:
                                not_aligned_file.write(read_1_name + '\t' + 'too_many_bases_clipped_from_read=' + str(
                                    cigar_1_soft_clipped) + '\n')
                # else:
                # iv_seq = tuple()

                if (alignment[1] is not None) and alignment[1].aligned:  # check if read is aligned to ref sequence
                    opt_2_fields = alignment[1].optional_fields
                    # flag_2 = alignment[1].flag  # ',  #'bit_length', 'conjugate', 'denominator', 'imag', 'numerator', 'real']
                    cigar_2_string = parse_cigar(alignment[1].original_sam_line.split('\t')[
                        5])  # just the cigar string without the fancy HTseq additions
                    cigar_2_soft_clipped, cigar_2_m, cigar_2_insertions, cigar_2_deletions, cigar_2_insertions = parse_cigar_alignment(
                        cigar_2_string)
                    score_2, md_2_matches, md_2_deletions, md_2_mismatches = parse_opt_fields(
                        opt_2_fields)  # get alignment data from md string
                    read_2_name = alignment[1].read.name
                    read_2_length = len(alignment[1].read.seq)
                    # read_2 = alignment[1].read  # READ.
                    read_2_seq = alignment[1].read.seq
                    percent_2_id = (100.0 * (float(md_2_matches) / (
                        float(read_2_length - cigar_2_soft_clipped + cigar_2_insertions + cigar_2_deletions))))
                    clipped_2 = (float(cigar_2_soft_clipped) / float(read_2_length))
                    if int(read_2_length) >= int(min_read_len):
                        if (float(cigar_2_soft_clipped) / float(read_2_length)) <= float(max_clip_):
                            if int(score_2) >= int(min_score):
                                if float(percent_2_id) >= float(min_id):
                                    if stranded == "reverse":
                                        iv_seq = itertools.chain(iv_seq, (invert_strand(cigar_operation.ref_iv) for
                                                                          cigar_operation in alignment[1].cigar if
                                                                          cigar_operation.type == "M" and cigar_operation.size > 0))
                                    else:
                                        iv_seq = itertools.chain(iv_seq, (cigar_operation.ref_iv for cigar_operation in
                                                                          alignment[1].cigar if
                                                                          cigar_operation.type == "M" and cigar_operation.size > 0))
                                        iv_seq_good_2 = True
                                    try:
                                        if (alignment[0].optional_field("NH") > 1) or (alignment[1].optional_field(
                                                "NH") > 1):
                                            # or (alignment[1].optional_field("NH") > 1): #check if read is mapped more
                                            # than once
                                            # By default these reads are discarded. CHANGE?
                                            iv_seq_good_1 = False
                                            iv_seq_good_2 = False
                                            if args.samout:
                                                write_to_samout(samoutfile, paired_end, alignment,
                                                                "alignment_not_unique")
                                                nonunique += 1
                                            if args.not_aligned:
                                                not_aligned_file.write(read_1_name + '\t' + 'not_aligned' + '\n')
                                                not_aligned_file.write(read_2_name + '\t' + 'not_aligned' + '\n')
                                                continue
                                    except KeyError:
                                        pass
                                    if (alignment[0] and alignment[0].aQual < minaqual) or (alignment[1] and alignment[1].aQual < minaqual):
                                        # check quality. default is 0
                                        iv_seq_good_2 = False
                                        lowqual += 1
                                        if args.samout:
                                            write_to_samout(samoutfile, paired_end, alignment, "too_low_aQual")
                                        if args.not_aligned:
                                            not_aligned_file.write(read_1_name + '\t' + 'not_aligned' + '\n')
                                            not_aligned_file.write(read_2_name + '\t' + 'not_aligned' + '\n')
                                        continue
                                else:
                                    iv_seq_good_2 = False
                                    if args.samout:
                                        write_to_samout(samoutfile, paired_end, alignment,
                                                        "percent_id_too_low=" + str(percent_2_id))
                                    if args.not_aligned:
                                        not_aligned_file.write(
                                            read_2_name + '\t' + 'percent_id_too_low=' + str(percent_2_id) + '\n')
                            else:
                                iv_seq_good_2 = False
                                if args.samout:
                                    write_to_samout(samoutfile, paired_end, alignment,
                                                    'alignment_score_too_low=' + str(score_2))
                                if args.not_aligned:
                                    not_aligned_file.write(
                                        read_2_name + '\t' + 'alignment_score_too_low=' + str(score_2) + '\n')
                        else:
                            iv_seq_good_2 = False
                            if args.samout:
                                write_to_samout(samoutfile, paired_end, alignment,
                                                'too_many_bases_clipped_from_read=' + str(cigar_2_soft_clipped))
                            if args.not_aligned:
                                not_aligned_file.write(read_2_name + '\t' + 'too_many_bases_clipped_from_read=' + str(
                                    cigar_2_soft_clipped) + '\n')
        read_counter += 1

        """
        overlap_mode == "union"
        will count a hit even if read is mapped across an intron or there is an insertion.
        """
        try:
            feature_set = set()
            for iv in iv_seq:
                # print iv
                if iv.chrom not in features.chrom_vectors:  # check if alignment feaure name in features from GFF file
                    # The name of a sequence (i.e., chromosome, contig, or the like).
                    # check the gff features dictionary
                    raise UnknownChrom
                for iv2, fs2 in features[iv].steps():  # fs == feature steps.
                    """
                    from HTseq manual:
                    GenomicArray objects use by default so-called StepVectors that store the data internally in steps of
                    constant value
                    """
                    feature_set = feature_set.union(fs2)
                    # print feature_set
            if feature_set is None or len(feature_set) == 0:
                if args.samout:
                    write_to_samout(samoutfile, paired_end, alignment, "no_feature")
                if args.not_aligned:
                    not_aligned_file.write('None' + '\t' + 'no_feature' + '\n')
                empty += 1
            elif len(feature_set) > 1:
                if args.samout:
                    write_to_samout(samoutfile, paired_end, alignment, "ambiguous[" + '+'.join(feature_set) + "]")
                if ambiguousfile:
                    if paired_end:
                        if iv_seq_good_1:
                            ambiguousfile.write('>' + read_1_name + '_' + "ambiguous[" + '+'.join(
                                feature_set) + "]" + '_clipped_' + str(clipped_1) + '_score_' + str(score_2) + '_percent_id_' + str(percent_1_id) + '\n' + read_1_seq + '\n')
                        if iv_seq_good_2:
                            ambiguousfile.write('>' + read_2_name + '_' + "ambiguous[" + '+'.join(
                                feature_set) + "]" + '_clipped_' + str(clipped_2) + '_score_' + str(score_2) + '_percent_id_' + str(percent_2_id) + '\n' + read_2_seq + '\n')
                    else:
                        if iv_seq_good:
                            ambiguousfile.write('>' + alignment.read.name + '_' + "ambiguous[" + '+'.join(
                                feature_set) + "]" + '_clipped_' + str(clipped) + '_score_' + str(score) + '_percent_id_' + str(percent_id) + '\n' + read_seq + '\n')

                """
                #if args.not_aligned:
                #    if paired_end:
                #    not_aligned_file.write(alignment[0].read.name + '\t' + 'ambiguous['+'+'.join(feature_set)+']' + '\n')
                #        not_aligned_file.write(alignment[1].read.name + '\t' + 'ambiguous['+'+'.join(feature_set)+']' + '\n')
                #    else:
                #    not_aligned_file.write(alignment.read.name + '\t' + 'ambiguous['+'+'.join(feature_set)+']' + '\n')
                """
                ambiguous += 1
            elif len(feature_set) == 1:
                if args.samout:
                    write_to_samout(samoutfile, paired_end, alignment, list(feature_set)[0])
                if args.fasta:
                    if paired_end:
                        if iv_seq_good_1:
                            fastafile.write('>' + read_1_name + '_' + ''.join(list(feature_set)[0]) + '_clipped_' + str(
                                clipped_1) + '_score_' + str(score_1) + '_percent_id_' + str(percent_1_id) + '\n' + read_1_seq + '\n')
                        if iv_seq_good_2:
                            fastafile.write('>' + read_2_name + '_' + ''.join(list(feature_set)[0]) + '_clipped_' + str(
                                clipped_2) + '_score_' + str(score_2) + '_percent_id_' + str(percent_2_id) + '\n' + read_2_seq + '\n')
                    else:
                        if iv_seq_good:
                            fastafile.write('>' + read_name + '_' + ''.join(list(feature_set)[0]) + '_clipped_' + str(
                                clipped) + '_score_' + str(score) + '_percent_id_' + str(percent_id) + '\n' + read_seq + '\n')

                counts[list(feature_set)[0]] += 1
        except:
            if args.samout:
                write_to_samout(samoutfile, paired_end, alignment, "__no_feature")
            empty += 1

            # if not paired_end:
            # al = alignment
            # else:
            # al = alignment[0] if alignment[0] is not None else alignment[1]

            # if args.not_aligned:
            # not_aligned_file.write(al.read.name + '\t' + 'feature_not_in_gff_file' + '\n')
            # if not verbose:
            #    print (("Warning: Skipping read '%s', because chromosome " +
            #    "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) %
            #     (al.read.name, iv.chrom) )
    print 'total', read_counter, 'alignments processed'
    if samoutfile is not None:
        samoutfile.close()
    if fastafile is not None:
        fastafile.close
    if not_aligned_file is not None:
        not_aligned_file.close()

    if outfile is not None:
        for feature in sorted(counts.keys()):
            outfile.write("%s\t%d\n" % (feature, counts[feature]))
        outfile.write("no_feature\t%d\n" % empty)
        outfile.write("ambiguous\t%d\n" % ambiguous)
        outfile.write("too_low_aQual\t%d\n" % lowqual)
        outfile.write("not_aligned\t%d\n" % notaligned)
        outfile.write("alignment_not_unique\t%d\n" % nonunique)
    if outfile is not None:
        outfile.close()
Exemplo n.º 19
0
 def reader(x):
     """Return HTSeq's paired-alignment iterator over the BAM file ``x``.

     The BAM file is opened with ``HTSeq.BAM_Reader`` and handed to
     ``HTSeq.pair_SAM_alignments`` with ``bundle=False`` (the bundled
     variant, ``bundle=True``, is deliberately not used here).
     """
     bam_records = HTSeq.BAM_Reader(x)
     paired = HTSeq.pair_SAM_alignments(bam_records, bundle=False)
     return paired
Exemplo n.º 20
0
def count_reads_in_features(sam_filename, gff_filename, stranded,
                            overlap_mode, feature_type, id_attribute,
                            quiet, minaqual, samout):
    """Count the reads of a SAM file per GFF feature and print the table.

    One ``<feature id>\\t<count>`` line per feature (sorted by id) is printed
    to standard output, followed by the ``no_feature``, ``ambiguous``,
    ``too_low_aQual``, ``not_aligned`` and ``alignment_not_unique`` tallies.
    Progress and warnings go to standard error unless ``quiet`` is set.

    Parameters:
        sam_filename: path of the SAM file, or "-" to read from stdin.
        gff_filename: path handed to ``HTSeq.GFF_Reader``.
        stranded: "no" builds an unstranded feature array; "reverse" inverts
            the strand of the first mate / single read instead of the second
            mate; any other value is treated as plain stranded counting.
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty"; anything else exits the program when
            the first read reaches feature assignment.
        feature_type: only GFF lines whose type equals this value are used.
        id_attribute: name of the GFF attribute used as the feature id.
        quiet: suppress progress messages and HTSeq warnings.
        minaqual: reads whose ``aQual`` is below this value are tallied as
            ``too_low_aQual`` and skipped.
        samout: path of a file receiving every SAM line with an added
            ``XF:Z:<assignment>`` tag, or "" to disable that output.

    Raises:
        SystemExit: on a feature lacking ``id_attribute``, a strandless
            feature in stranded mode, or an illegal ``overlap_mode``.
        Any exception raised while reading the inputs is re-raised after a
        short location message has been written to standard error.
    """

    def write_to_samout(r, assignment):
        # Tag the original SAM line of the read (both mates in paired-end
        # mode) with the assignment; does nothing when no samout file is open.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    if quiet:
        warnings.filterwarnings(action="ignore", module="HTSeq")

    if samout != "":
        samoutfile = open(samout, "w")
    else:
        samoutfile = None

    # Everything below runs under try/finally so that the samout file is
    # closed on every exit path (exceptions and sys.exit included).
    try:
        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        counts = {}

        # Try to open samfile to fail early in case it is not there
        if sam_filename != "-":
            open(sam_filename).close()

        gff = HTSeq.GFF_Reader(gff_filename)
        i = 0
        try:
            for f in gff:
                if f.type == feature_type:
                    try:
                        feature_id = f.attr[id_attribute]
                    except KeyError:
                        sys.exit("Feature %s does not contain a '%s' attribute" %
                                 (f.name, id_attribute))
                    if stranded != "no" and f.iv.strand == ".":
                        sys.exit("Feature %s at %s does not have strand information but you are "
                                 "running htseq-count in stranded mode. Use '--stranded=no'." %
                                 (f.name, f.iv))
                    features[f.iv] += feature_id
                    counts[feature_id] = 0
                i += 1
                if i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d GFF lines processed.\n" % i)
        except:
            # Deliberately broad (also sees SystemExit): report where in the
            # GFF file we were, then let the original exception propagate.
            sys.stderr.write("Error occured in %s.\n" % gff.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d GFF lines processed.\n" % i)

        if len(counts) == 0 and not quiet:
            sys.stderr.write("Warning: No features of type '%s' found.\n" % feature_type)

        try:
            # Keep a handle on the reader itself: it is the only object that
            # can report the current line number in the error handler below.
            if sam_filename != "-":
                sam_reader = HTSeq.SAM_Reader(sam_filename)
                read_seq = sam_reader
                first_read = next(iter(read_seq))
            else:
                sam_reader = HTSeq.SAM_Reader(sys.stdin)
                read_seq = iter(sam_reader)
                first_read = next(read_seq)
                # stdin cannot be rewound, so push the peeked read back
                read_seq = itertools.chain([first_read], read_seq)
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write("Error occured when reading first line of sam file.\n")
            raise

        try:
            if pe_mode:
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            write_to_samout(r, "alignment_not_unique")
                            nonunique += 1
                            continue
                    except KeyError:
                        # no NH tag: uniqueness cannot be checked, keep the read
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r.cigar if co.type == "M")
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r.cigar if co.type == "M")
                else:
                    if r[0] is not None and r[0].aligned:
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r[0].cigar if co.type == "M")
                        else:
                            iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar if co.type == "M")
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        # the second mate gets the opposite strand treatment
                        # of the first mate
                        if stranded != "reverse":
                            iv_seq = itertools.chain(
                                iv_seq,
                                (invert_strand(co.ref_iv) for co in r[1].cigar if co.type == "M"))
                        else:
                            iv_seq = itertools.chain(
                                iv_seq,
                                (co.ref_iv for co in r[1].cigar if co.type == "M"))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "not_aligned")
                            notaligned += 1
                            continue
                    try:
                        if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                                (r[1] is not None and r[1].optional_field("NH") > 1):
                            nonunique += 1
                            write_to_samout(r, "alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs.update(fs2)
                    elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        # fs is our own copy, safe to shrink in place
                                        fs.intersection_update(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")
                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "no_feature")
                        empty += 1
                    elif len(fs) > 1:
                        write_to_samout(r, "ambiguous[" + '+'.join(fs) + "]")
                        ambiguous += 1
                    else:
                        write_to_samout(r, list(fs)[0])
                        counts[list(fs)[0]] += 1
                except UnknownChrom:
                    # NOTE(review): such reads are added to no tally and are not
                    # written to samout -- confirm that this is intended.
                    if not pe_mode:
                        rr = r
                    else:
                        rr = r[0] if r[0] is not None else r[1]
                    if not quiet:
                        sys.stderr.write(("Warning: Skipping read '%s', because chromosome " +
                                          "'%s', to which it has been aligned, did not appear in the GFF file.\n") %
                                         (rr.read.name, iv.chrom))

                if i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d sam %s processed.\n" % (i, "lines " if not pe_mode else "line pairs"))

        except:
            # Always ask the SAM_Reader: the iterator wrappers built around it
            # (itertools.chain for stdin, the pairing generator) have no
            # get_line_number_string() and would mask the real error.
            sys.stderr.write("Error occured in %s.\n" % sam_reader.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d sam %s processed.\n" % (i, "lines " if not pe_mode else "line pairs"))

    finally:
        if samoutfile is not None:
            samoutfile.close()

    # Single-argument print(...) emits the same bytes under Python 2 and 3.
    for fn in sorted(counts.keys()):
        print("%s\t%d" % (fn, counts[fn]))
    print("no_feature\t%d" % empty)
    print("ambiguous\t%d" % ambiguous)
    print("too_low_aQual\t%d" % lowqual)
    print("not_aligned\t%d" % notaligned)
    print("alignment_not_unique\t%d" % nonunique)
Exemplo n.º 21
0
    def count_reads_in_features(sam_filenames, colnames, gff_filename, opts):
        """Count reads per GFF feature for several SAM/BAM files in one pass.

        Hacked version of htseq-count's count.py: the feature array is built
        once from the GFF file and every input file contributes one column
        to the per-feature count lists.

        Parameters:
            sam_filenames: sequence of SAM/BAM paths, one per sample.
            colnames: sequence of column names, parallel to ``sam_filenames``.
            gff_filename: path handed to ``HTSeq.GFF_Reader``.
            opts: options object; the attributes read here are ``quiet``,
                ``stranded``, ``mapqMin``, ``feature_type``, ``id_attribute``,
                ``filter_extras`` and ``mode``.

        ``sam_exts`` and ``sam_bais`` are free variables resolved outside
        this function; they are indexed in parallel with ``sam_filenames``.

        Returns:
            The tuple ``(counts, empty, ambiguous, lowqual, notaligned,
            nonunique, filtered, nreads)`` where ``counts`` maps each feature
            id to a list holding one count per input file; the other values
            are totals accumulated over all input files.

        Raises:
            SystemExit: on a feature without a usable id attribute, a
                strandless feature in stranded mode, or an illegal
                ``opts.mode``.
            Any exception raised while reading the inputs is re-raised after
            a location message has been written to standard error.
        """
        if opts.quiet:
            warnings.filterwarnings(action="ignore", module="HTSeq")
        features = HTSeq.GenomicArrayOfSets("auto", opts.stranded != "no")
        mapqMin = int(opts.mapqMin)
        counts = {}
        nreads = 0
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        filtered = 0  # new filter_extras - need a better way to do this - independent filter tool?
        gff = HTSeq.GFF_Reader(gff_filename)
        # 1-based running line count; stays 0 (instead of being undefined)
        # when the GFF file holds no lines at all.
        gff_lines = 0
        try:
            for gff_lines, f in enumerate(gff, 1):
                if f.type != opts.feature_type:
                    continue
                try:
                    feature_id = f.attr[opts.id_attribute]
                except KeyError:
                    # fall back to gene_id before giving up on the feature
                    try:
                        feature_id = f.attr['gene_id']
                    except KeyError:
                        sys.exit(
                            "Feature at row %d %s does not contain a '%s' attribute OR a gene_id attribute - faulty GFF?"
                            % (gff_lines, f.name, opts.id_attribute))
                if opts.stranded != "no" and f.iv.strand == ".":
                    sys.exit(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'."
                        % (f.name, f.iv))
                features[f.iv] += feature_id
                # one slot per input file; the file index bumps it later
                counts[feature_id] = [0 for x in colnames]
        except:
            sys.stderr.write("Error occured in %s.\n" %
                             gff.get_line_number_string())
            raise

        if not opts.quiet:
            sys.stdout.write("%d GFF lines processed.\n" % gff_lines)

        if len(counts) == 0 and not opts.quiet:
            sys.stdout.write("Warning: No features of type '%s' found.\n" %
                             opts.feature_type)
        for sami, sam_filename in enumerate(sam_filenames):
            colname = colnames[sami]
            isbam = sam_exts[sami] == 'bam'
            hasbai = sam_bais[sami] > ''
            if hasbai:
                # Hard-link BAM and index side by side under matching names.
                # NOTE(review): these links are never removed here -- confirm
                # that the caller cleans up the working directory.
                tempname = os.path.splitext(os.path.basename(sam_filename))[0]
                tempbam = '%s_TEMP.bam' % tempname
                tempbai = '%s_TEMP.bai' % tempname
                os.link(sam_filename, tempbam)
                os.link(sam_bais[sami], tempbai)
            try:
                if isbam:
                    reader = HTSeq.BAM_Reader(tempbam if hasbai else sam_filename)
                else:
                    reader = HTSeq.SAM_Reader(sam_filename)
                first_read = next(iter(reader))
                pe_mode = first_read.paired_end
            except:
                # trailing blank line kept from the original print statement
                sys.stderr.write(
                    "Error occured when reading first line of %s file %s colname=%s \n\n"
                    % ("bam" if isbam else "sam", sam_filename, colname))
                raise

            records = 0  # 1-based number of reads / read pairs seen in this file
            try:
                if pe_mode:
                    read_seq = HTSeq.pair_SAM_alignments(reader)
                else:
                    read_seq = reader
                for records, r in enumerate(read_seq, 1):
                    nreads += 1
                    if not pe_mode:
                        if not r.aligned:
                            notaligned += 1
                            continue
                        # Drop the read as soon as one of the extra tags is
                        # present with a true value. Each tag is probed on its
                        # own so that a missing tag neither hides the remaining
                        # tags nor skips the NH check below.
                        drop = False
                        for extra in opts.filter_extras:
                            try:
                                if r.optional_field(extra):
                                    drop = True
                                    break
                            except KeyError:
                                pass
                        if drop:
                            filtered += 1
                            continue
                        try:
                            if r.optional_field("NH") > 1:
                                nonunique += 1
                                continue
                        except KeyError:
                            # no NH tag: uniqueness cannot be checked
                            pass
                        if r.aQual < mapqMin:
                            lowqual += 1
                            continue
                        if opts.stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r.cigar
                                      if co.type == "M" and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv)
                                      for co in r.cigar
                                      if co.type == "M" and co.size > 0)
                    else:
                        if r[0] is not None and r[0].aligned:
                            if opts.stranded != "reverse":
                                iv_seq = (co.ref_iv for co in r[0].cigar
                                          if co.type == "M" and co.size > 0)
                            else:
                                iv_seq = (invert_strand(co.ref_iv)
                                          for co in r[0].cigar
                                          if co.type == "M" and co.size > 0)
                        else:
                            iv_seq = tuple()
                        if r[1] is not None and r[1].aligned:
                            # the second mate gets the opposite strand
                            # treatment of the first mate
                            if opts.stranded != "reverse":
                                iv_seq = itertools.chain(
                                    iv_seq,
                                    (invert_strand(co.ref_iv)
                                     for co in r[1].cigar
                                     if co.type == "M" and co.size > 0))
                            else:
                                iv_seq = itertools.chain(
                                    iv_seq,
                                    (co.ref_iv for co in r[1].cigar
                                     if co.type == "M" and co.size > 0))
                        else:
                            if (r[0] is None) or not (r[0].aligned):
                                notaligned += 1
                                continue
                        try:
                            if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                                    (r[1] is not None and r[1].optional_field("NH") > 1):
                                nonunique += 1
                                continue
                        except KeyError:
                            pass
                        if (r[0] and r[0].aQual < mapqMin) or (
                                r[1] and r[1].aQual < mapqMin):
                            lowqual += 1
                            continue

                    try:
                        if opts.mode == "union":
                            fs = set()
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    fs.update(fs2)
                        elif opts.mode == "intersection-strict" or opts.mode == "intersection-nonempty":
                            fs = None
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    if len(fs2) > 0 or opts.mode == "intersection-strict":
                                        if fs is None:
                                            fs = fs2.copy()
                                        else:
                                            # fs is our own copy, shrink in place
                                            fs.intersection_update(fs2)
                        else:
                            sys.exit("Illegal overlap mode %s" % opts.mode)
                        if fs is None or len(fs) == 0:
                            empty += 1
                        elif len(fs) > 1:
                            ambiguous += 1
                        else:
                            ck = list(fs)[0]
                            # end up with counts for each sample as a list
                            counts[ck][sami] += 1
                    except UnknownChrom:
                        if not pe_mode:
                            rr = r
                        else:
                            rr = r[0] if r[0] is not None else r[1]
                        empty += 1
                        if not opts.quiet:
                            sys.stdout.write((
                                "Warning: Skipping read '%s', because chromosome "
                                +
                                "'%s', to which it has been aligned, did not appear in the GFF file.\n"
                            ) % (rr.read.name, iv.chrom))
            except:
                # the reader (not the pairing generator) knows the position
                sys.stderr.write("Error occured in %s.\n" %
                                 reader.get_line_number_string())
                raise

            if not opts.quiet:
                sys.stdout.write(
                    "%d sam %s processed for %s.\n" %
                    (records, "lines " if not pe_mode else "line pairs", colname))
        return counts, empty, ambiguous, lowqual, notaligned, nonunique, filtered, nreads
Exemplo n.º 22
0
def mapping_reads2shared_exons_introns(refGene_txt, bam_filename, minaqual,
                                       stranded, order, max_buffer_size):
    """Count reads on each annotated feature, gene by gene, and print a table.

    For every gene in the annotation, the reads overlapping the gene's span
    (padded by 500 bp on both sides) are fetched from the BAM file and
    assigned to the features they overlap.  A read that touches at least one
    ``'intron'`` feature is credited to those introns only; otherwise it is
    credited to every feature it overlaps.  One tab-separated row per feature
    (read count, count per 1000 bases of feature length, percentage of
    positions covered by at least one read) is written to stdout, and the
    global skip counters are written to stderr at the end.

    Args:
        refGene_txt: Path of a tab-separated annotation file with the columns
            gene label, feature, rank, position (``chrom:start-end:strand``)
            and length.
        bam_filename: Path handed to ``HTSeq.BAM_Reader``.  Region fetches
            are used, so the file presumably has to be indexed -- TODO
            confirm.
        minaqual: Alignments whose ``aQual`` is below this value are skipped
            and counted as ``_lowaqual``.
        stranded: ``'yes'`` or ``'reverse'`` make the feature and coverage
            arrays strand-aware; ``'reverse'`` additionally flips the strand
            of the read intervals.  Any other value means unstranded.
        order: ``'name'`` or ``'pos'``; selects the mate-pairing routine used
            for paired-end data.
        max_buffer_size: Passed to ``HTSeq.pair_SAM_alignments_with_buffer``
            when ``order`` is ``'pos'``.

    Raises:
        ValueError: If ``order`` is neither ``'name'`` nor ``'pos'`` for
            paired-end input, or if an interval that must be inverted has a
            strand other than ``+``/``-``.
    """
    # initialise counters (all of them are reported on stderr at the end)
    counts = {}
    counts['_empty'] = 0
    counts['_ambiguous'] = 0
    counts['_lowaqual'] = 0
    counts['_notaligned'] = 0
    counts['_ambiguous_readpair_position'] = 0

    bam_reader = HTSeq.BAM_Reader(bam_filename)
    # CIGAR operations treated as a match: alignment match, sequence match
    # and sequence mismatch.
    cigar_char = ('M', '=', 'X')
    # Strand handling follows htseq-count.
    stranded_boolean = stranded == 'yes' or stranded == 'reverse'
    reverse_boolean = stranded == 'reverse'

    def invert_strand(iv):
        """Return a copy of *iv* lying on the opposite strand."""
        iv2 = iv.copy()
        if iv2.strand == "+":
            iv2.strand = "-"
        elif iv2.strand == "-":
            iv2.strand = "+"
        else:
            raise ValueError("Illegal strand")
        return iv2

    def matched_intervals(almnt, invert):
        """Yield the reference interval of every non-empty match operation."""
        for cigop in almnt.cigar:
            if cigop.type in cigar_char and cigop.size > 0:
                yield invert_strand(cigop.ref_iv) if invert else cigop.ref_iv

    def is_multimapped(almnt):
        """Tell whether the NH tag reports more than one hit."""
        try:
            return almnt.optional_field('NH') > 1
        except KeyError:
            # No NH tag: nothing says the read is multi-mapped, so keep it.
            return False

    sys.stdout.write(
        "Gene\tfeature\trank\tposition\tlength\tread_counts\tread_counts_norm\tcoverage(%)\n"
    )

    # gene label -> list of (feature, rank, chrom, start, end, strand, length)
    annot = collections.OrderedDict()
    with open(refGene_txt) as annot_file:
        for line in annot_file:
            gene_label, feature, rank, position, length = line.strip().split(
                '\t')
            chrom, iv_str, strand = position.strip().split(':')
            start, end = map(int, iv_str.strip().split('-'))
            annot.setdefault(gene_label, []).append(
                (feature, int(rank), chrom, start, end, strand, int(length)))

    # SE/PE mode is a property of the BAM file, so it is looked up only once
    # (when the first gene is processed) instead of once per gene.
    pe_mode = None

    for gene_name in annot:
        gene_features = annot[gene_name]
        gene_count = {}
        gas = HTSeq.GenomicArrayOfSets("auto", stranded=stranded_boolean)
        ga = HTSeq.GenomicArray("auto",
                                stranded=stranded_boolean,
                                typecode="i")

        # Annotation
        for feature, rank, chrom, start, end, strand, length in gene_features:
            iv = HTSeq.GenomicInterval(chrom, start, end, strand)
            gas[iv] += (feature, rank)
            gene_count[(feature, rank)] = 0

        # Iterating directly over bam_reader is problematic (the author
        # attributes this to a pysam bug); the workaround is to use fetch().
        boundary_left = min(item[3] for item in gene_features)
        boundary_right = max(item[4] for item in gene_features)
        # Region strings are 1-based, so the padded start is clamped to 1 for
        # genes lying within 500 bp of the start of their contig.
        region_fetch = "%s:%d-%d" % (gene_features[0][2],
                                     max(1, boundary_left - 500),
                                     boundary_right + 500)
        read_seq = bam_reader.fetch(region=region_fetch)

        # distinguish SE and PE mode:
        if pe_mode is None:
            one_read = next(iter(bam_reader.fetch()), None)
            # A BAM file without any read is handled as single-end; every
            # region fetch will simply yield nothing.
            pe_mode = one_read is not None and one_read.paired_end

        if pe_mode:
            if order == 'name':
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            elif order == 'pos':
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                    read_seq, max_buffer_size=max_buffer_size)
            else:
                raise ValueError("Illegal order name.")

        # Mapping
        for a in read_seq:
            if not pe_mode:
                if not a.aligned:
                    counts['_notaligned'] += 1
                    continue
                if is_multimapped(a):
                    continue
                if a.aQual < minaqual:
                    counts['_lowaqual'] += 1
                    continue
                iv_seq = matched_intervals(a, reverse_boolean)
            # pe mode
            else:
                mates = [mate for mate in (a[0], a[1]) if mate is not None]
                if any(mate.aQual < minaqual for mate in mates):
                    counts['_lowaqual'] += 1
                    continue
                if any(is_multimapped(mate) for mate in mates):
                    continue
                iv_seq = tuple()
                if a[0] is not None and a[0].aligned:
                    iv_seq = matched_intervals(a[0], reverse_boolean)
                if a[1] is not None and a[1].aligned:
                    # The second mate is oriented opposite to the first one.
                    iv_seq = itertools.chain(
                        iv_seq, matched_intervals(a[1], not reverse_boolean))

            feature_aligned = set()
            for iv in iv_seq:
                # Coverage grows by one per aligned interval, independently
                # of how many annotation steps the interval spans.
                ga[iv] += 1
                for _, step_features in gas[iv].steps():
                    feature_aligned |= step_features
            if len(feature_aligned) == 0:
                counts['_empty'] += 1
                continue
            # when mapping to an intron, discard the other features;
            # when no intron is hit, count all overlapped features
            introns = [item for item in feature_aligned
                       if item[0] == 'intron']
            for f in (introns if introns else feature_aligned):
                gene_count[f] += 1

        # Output
        for feature, rank, chrom, start, end, strand, length in gene_features:
            feature_count = gene_count[(feature, rank)]
            # float() keeps the per-kb value fractional on Python 2 as well.
            if length:
                feature_count_norm = feature_count / float(length) * 1000
            else:
                feature_count_norm = 0.0
            # Coverage calculation: share of positions with depth > 0
            iv = HTSeq.GenomicInterval(chrom, start, end, strand)
            cvg_region = list(ga[iv])
            covered = sum(1 for depth in cvg_region if depth > 0)
            if cvg_region:
                cvg = covered / float(len(cvg_region)) * 100
            else:
                cvg = 0.0
            pos = "%s:%d-%d:%s" % (chrom, start, end, strand)
            sys.stdout.write('\t'.join(
                map(str, [
                    gene_name, feature, rank, pos, length, feature_count,
                    feature_count_norm, cvg
                ])) + '\n')

    for fn in counts:
        sys.stderr.write('%s\t%d\n' % (fn, counts[fn]))
Exemplo n.º 23
0
# Deal with any GFF file reading errors
except ValueError as e:
    e.args += ( gff.get_line_number_string(), )
    raise

try:
    # Get the first read to see if we're dealing with paired-end data
    read_seq = HTSeq.SAM_Reader(options.sam)
    first_read = iter(read_seq).next()
    pe_mode = first_read.paired_end
    
    # Re-initialize read_seq depending on if it's paired-end data or not
    read_seq = HTSeq.SAM_Reader(options.sam)
    if pe_mode:
        read_seq = HTSeq.pair_SAM_alignments(read_seq)

    # Read counter, for feedback to user
    i = 0 
    total = 0
    # Here we go, through each read...
    for r in read_seq:
        spliced = False
        if not pe_mode:
            if not r.aligned:
                continue
            total += 1
            iv_seq = []

            # Check to see if it's spliced
            for co in r.cigar:
Exemplo n.º 24
0
def count_reads_onto_prebuilt_features(
    sam_filename, features, feature_ids, stranded, overlap_mode, quiet, minaqual, samout, umis=False
):
    """Assign SAM reads (or read pairs) to prebuilt features and tally them.

    Args:
        sam_filename: Path of the SAM file, or ``"-"`` to read from stdin.
        features: Feature lookup indexed by genomic interval.  It has to
            provide ``chrom_vectors`` and ``features[iv].steps()`` --
            presumably an ``HTSeq.GenomicArrayOfSets``; verify against the
            caller.
        feature_ids: Feature identifiers; each one gets a counter starting
            at zero.
        stranded: With ``"reverse"`` the strand of a single read / first mate
            is inverted; with any other value the second mate is inverted
            instead.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``; anything else exits the program.
        quiet: Suppresses HTSeq warnings and the progress messages.
        minaqual: Reads whose ``aQual`` is lower are counted as
            ``too_low_aQual``.
        samout: Path of an annotated SAM output file (every read gets an
            ``XF:Z:`` tag), or ``""`` to write none.
        umis: When true, the value reported per feature is the number of
            distinct UMI sequences (taken from ``:UMI:SEQ:`` in the read
            name) instead of the number of reads.

    Returns:
        A tuple ``(feats, counts)`` of two parallel lists: the sorted feature
        ids followed by the five summary labels, and the matching numbers.

    Raises:
        EmptySamError: If the input contains no alignment record.
    """

    def write_to_samout(r, assignment):
        """Append the read (or both mates) tagged with *assignment*."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() + "\tXF:Z:" + assignment + "\n")

    def match_intervals(almnt, invert):
        """Return a lazy sequence of the non-empty "M" intervals of *almnt*."""
        if invert:
            return (invert_strand(co.ref_iv) for co in almnt.cigar if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in almnt.cigar if co.type == "M" and co.size > 0)

    def name_of(r):
        """Return the read name; a pair is named after its first present mate."""
        if pe_mode:
            r = r[0] if r[0] is not None else r[1]
        return r.read.name

    if quiet:
        warnings.filterwarnings(action="ignore", module="HTSeq")

    samoutfile = open(samout, "w") if samout != "" else None

    # The finally clause closes the annotated output on every exit path.
    try:
        if umis:
            umi_re = re.compile(r":UMI:(\w+):")
            umi_counts = {}

            def count_umis(fs, read_name):
                umi_seq = umi_re.search(read_name).group(1)
                umi_counts[fs][umi_seq] += 1

            for feature_id in feature_ids:
                umi_counts[feature_id] = Counter()
        else:

            def count_umis(x, y):
                return None

        # Try to open samfile to fail early in case it is not there
        if sam_filename != "-":
            open(sam_filename).close()

        counts = {}
        for feature_id in feature_ids:
            counts[feature_id] = 0

        # Peek at the first record to learn whether the data is paired-end.
        try:
            if sam_filename != "-":
                read_seq_file = HTSeq.SAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be re-read, so the peeked record is chained
                # back in front of the remaining ones.
                read_seq_file = HTSeq.SAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except StopIteration:
            raise EmptySamError(sam_filename)

        reverse = stranded == "reverse"

        try:
            if pe_mode:
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            write_to_samout(r, "alignment_not_unique")
                            nonunique += 1
                            continue
                    except KeyError:
                        # No NH tag: the read is not known to be multi-mapped.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue
                    iv_seq = match_intervals(r, reverse)
                else:
                    if r[0] is not None and r[0].aligned:
                        iv_seq = match_intervals(r[0], reverse)
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        # The second mate is oriented opposite to the first.
                        iv_seq = itertools.chain(iv_seq, match_intervals(r[1], not reverse))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "not_aligned")
                            notaligned += 1
                            continue
                    try:
                        if (r[0] is not None and r[0].optional_field("NH") > 1) or (
                            r[1] is not None and r[1].optional_field("NH") > 1
                        ):
                            nonunique += 1
                            write_to_samout(r, "alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")
                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "no_feature")
                        empty += 1
                    elif len(fs) > 1:
                        write_to_samout(r, "ambiguous[" + "+".join(fs) + "]")
                        ambiguous += 1
                    else:
                        feature_id = list(fs)[0]
                        write_to_samout(r, feature_id)
                        counts[feature_id] += 1
                        # In paired-end mode r is a pair, so the name has to
                        # be taken from one of its mates.
                        count_umis(feature_id, name_of(r))
                except UnknownChrom:
                    # The chromosome is absent from the features: counted as
                    # no_feature (and tagged so in samout); no warning is
                    # printed.
                    write_to_samout(r, "no_feature")
                    empty += 1

                if i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d sam %s processed.\n" % (i, "lines " if not pe_mode else "line pairs"))

        except:
            # Only adds the input position to the report; always re-raised.
            sys.stderr.write("Error occured when processing SAM input (%s):\n" % read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d sam %s processed.\n" % (i, "lines " if not pe_mode else "line pairs"))

        # sorted feature list. features+counts
        feats = sorted(counts)
        if umis:
            counts = [len(umi_counts[fn]) for fn in feats]
        else:
            counts = [counts[fn] for fn in feats]
        # cat statistics summary to feature+count list
        feats = feats + ["no_feature", "ambiguous", "too_low_aQual", "not_aligned", "alignment_not_unique"]
        counts = counts + [empty, ambiguous, lowqual, notaligned, nonunique]
        return (feats, counts)
    finally:
        if samoutfile is not None:
            samoutfile.close()
Exemplo n.º 25
0
def count_reads_in_features(sam_filename, gff_filename, samtype, order, stranded,
                            overlap_mode, feature_type, id_attribute, quiet,
                            minaqual, samout):
    """Count the reads (or read pairs) falling on each GFF feature and print them.

    The GFF file is loaded into a feature array, every alignment of the
    SAM/BAM input is assigned to a feature according to ``overlap_mode``, and
    a ``id<TAB>count`` line per feature (sorted by id) followed by five
    ``__``-prefixed summary lines is printed to stdout.

    Args:
        sam_filename: Path of the alignment file, or ``"-"`` for stdin.
        gff_filename: Path of the GFF file with the features.
        samtype: ``"sam"`` or ``"bam"``; selects the HTSeq reader class.
        order: ``"name"`` or ``"pos"``; selects the mate-pairing routine for
            paired-end data.
        stranded: ``"no"`` builds an unstranded feature array.  With
            ``"reverse"`` the strand of a single read / first mate is
            inverted; with any other value the second mate is inverted.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``; anything else exits the program.
        feature_type: Only GFF records of this type are used.
        id_attribute: GFF attribute holding the feature id.
        quiet: Suppresses the progress messages on stderr.
        minaqual: Reads whose ``aQual`` is lower are counted as
            ``__too_low_aQual``.
        samout: Path of an annotated SAM output file (every read gets an
            ``XF:Z:`` tag), or ``""`` to write none.

    Raises:
        ValueError: If a feature lacks ``id_attribute``, a feature has no
            strand in stranded mode, ``samtype`` is unknown, or ``order`` is
            illegal for paired-end input.
    """

    def write_to_samout(r, assignment):
        """Append the read (or both mates) tagged with *assignment*."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def match_intervals(almnt, invert):
        """Return a lazy sequence of the non-empty "M" intervals of *almnt*."""
        if invert:
            return (invert_strand(co.ref_iv) for co in almnt.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in almnt.cigar
                if co.type == "M" and co.size > 0)

    samoutfile = open(samout, "w") if samout != "" else None

    # The finally clause closes the annotated output on every exit path.
    try:
        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        counts = {}

        # Try to open samfile to fail early in case it is not there
        if sam_filename != "-":
            open(sam_filename).close()

        gff = HTSeq.GFF_Reader(gff_filename)
        i = 0
        try:
            for f in gff:
                if f.type == feature_type:
                    try:
                        feature_id = f.attr[id_attribute]
                    except KeyError:
                        raise ValueError("Feature %s does not contain a '%s' attribute" %
                                         (f.name, id_attribute))
                    if stranded != "no" and f.iv.strand == ".":
                        raise ValueError("Feature %s at %s does not have strand information but you are "
                                         "running htseq-count in stranded mode. Use '--stranded=no'." %
                                         (f.name, f.iv))
                    features[f.iv] += feature_id
                    counts[feature_id] = 0
                i += 1
                if i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d GFF lines processed.\n" % i)
        except:
            # Only adds the GFF position to the report; always re-raised.
            sys.stderr.write("Error occured when processing GFF file (%s):\n" % gff.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d GFF lines processed.\n" % i)

        if len(counts) == 0:
            sys.stderr.write("Warning: No features of type '%s' found.\n" % feature_type)

        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError("Unknown input format %s specified." % samtype)

        # Peek at the first record to learn whether the data is paired-end.
        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be re-read, so the peeked record is chained
                # back in front of the remaining ones.
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write("Error occured when reading beginning of SAM/BAM file.\n")
            raise

        reverse = stranded == "reverse"

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
                else:
                    raise ValueError("Illegal order specified.")
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d SAM alignment record%s processed.\n" % (i, "s" if not pe_mode else " pairs"))

                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        # No NH tag: the read is not known to be multi-mapped.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue
                    iv_seq = match_intervals(r, reverse)
                else:
                    if r[0] is not None and r[0].aligned:
                        iv_seq = match_intervals(r[0], reverse)
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        # The second mate is oriented opposite to the first.
                        iv_seq = itertools.chain(iv_seq, match_intervals(r[1], not reverse))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "__not_aligned")
                            notaligned += 1
                            continue
                    try:
                        if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                                (r[1] is not None and r[1].optional_field("NH") > 1):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")
                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "__no_feature")
                        empty += 1
                    elif len(fs) > 1:
                        write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
                        ambiguous += 1
                    else:
                        feature_id = list(fs)[0]
                        write_to_samout(r, feature_id)
                        counts[feature_id] += 1
                except UnknownChrom:
                    # Chromosome absent from the GFF: counted as no feature.
                    write_to_samout(r, "__no_feature")
                    empty += 1

        except:
            # Only adds the input position to the report; always re-raised.
            sys.stderr.write("Error occured when processing SAM input (%s):\n" % read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d SAM %s processed.\n" % (i, "alignments " if not pe_mode else "alignment pairs"))
    finally:
        if samoutfile is not None:
            samoutfile.close()

    for fn in sorted(counts):
        print("%s\t%d" % (fn, counts[fn]))
    print("__no_feature\t%d" % empty)
    print("__ambiguous\t%d" % ambiguous)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
Exemplo n.º 26
0
def count_reads_in_features(sam_filename, gff_filename, samtype, order, stranded,
                            overlap_mode, feature_type, id_attribute, quiet,
                            minaqual, samout):
    """Tally alignments per GFF feature and print the resulting count table.

    Features of type ``feature_type`` are read from the GFF file, each
    alignment (or alignment pair) of the SAM/BAM input is assigned according
    to ``overlap_mode``, and the result is printed to stdout: one
    ``id<TAB>count`` line per feature in sorted order, then five summary
    lines whose labels start with ``__``.

    The body only uses syntax accepted by both Python 2 and Python 3
    (call-style ``raise``, ``next()``, single-argument ``print(...)``).

    Args:
        sam_filename: Path of the alignment file, or ``"-"`` for stdin.
        gff_filename: Path of the GFF file with the features.
        samtype: ``"sam"`` or ``"bam"``; selects the HTSeq reader class.
        order: ``"name"`` or ``"pos"``; selects the mate-pairing routine for
            paired-end data.
        stranded: ``"no"`` builds an unstranded feature array.  With
            ``"reverse"`` the strand of a single read / first mate is
            inverted; with any other value the second mate is inverted.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``; anything else exits the program.
        feature_type: Only GFF records of this type are used.
        id_attribute: GFF attribute holding the feature id.
        quiet: Suppresses the progress messages on stderr.
        minaqual: Reads whose ``aQual`` is lower are counted as
            ``__too_low_aQual``.
        samout: Path of an annotated SAM output file (every read gets an
            ``XF:Z:`` tag), or ``""`` to write none.

    Raises:
        ValueError: If a feature lacks ``id_attribute``, a feature has no
            strand in stranded mode, ``samtype`` is unknown, or ``order`` is
            illegal for paired-end input.
    """

    def write_to_samout(r, assignment):
        """Append the read (or both mates) tagged with *assignment*."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def match_intervals(almnt, invert):
        """Return a lazy sequence of the non-empty "M" intervals of *almnt*."""
        if invert:
            return (invert_strand(co.ref_iv) for co in almnt.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in almnt.cigar
                if co.type == "M" and co.size > 0)

    samoutfile = open(samout, "w") if samout != "" else None

    # The finally clause closes the annotated output on every exit path.
    try:
        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        counts = {}

        # Try to open samfile to fail early in case it is not there
        if sam_filename != "-":
            open(sam_filename).close()

        gff = HTSeq.GFF_Reader(gff_filename)
        i = 0
        try:
            for f in gff:
                if f.type == feature_type:
                    try:
                        feature_id = f.attr[id_attribute]
                    except KeyError:
                        raise ValueError("Feature %s does not contain a '%s' attribute" %
                                         (f.name, id_attribute))
                    if stranded != "no" and f.iv.strand == ".":
                        raise ValueError("Feature %s at %s does not have strand information but you are "
                                         "running htseq-count in stranded mode. Use '--stranded=no'." %
                                         (f.name, f.iv))
                    features[f.iv] += feature_id
                    counts[feature_id] = 0
                i += 1
                if i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d GFF lines processed.\n" % i)
        except:
            # Only adds the GFF position to the report; always re-raised.
            sys.stderr.write("Error occured when processing GFF file (%s):\n" % gff.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d GFF lines processed.\n" % i)

        if len(counts) == 0:
            sys.stderr.write("Warning: No features of type '%s' found.\n" % feature_type)

        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError("Unknown input format %s specified." % samtype)

        # Peek at the first record to learn whether the data is paired-end.
        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be re-read, so the peeked record is chained
                # back in front of the remaining ones.
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write("Error occured when reading beginning of SAM/BAM file.\n")
            raise

        reverse = stranded == "reverse"

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
                else:
                    raise ValueError("Illegal order specified.")
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d SAM alignment record%s processed.\n" % (i, "s" if not pe_mode else " pairs"))

                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        # No NH tag: the read is not known to be multi-mapped.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue
                    iv_seq = match_intervals(r, reverse)
                else:
                    if r[0] is not None and r[0].aligned:
                        iv_seq = match_intervals(r[0], reverse)
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        # The second mate is oriented opposite to the first.
                        iv_seq = itertools.chain(iv_seq, match_intervals(r[1], not reverse))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "__not_aligned")
                            notaligned += 1
                            continue
                    try:
                        if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                                (r[1] is not None and r[1].optional_field("NH") > 1):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")
                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "__no_feature")
                        empty += 1
                    elif len(fs) > 1:
                        write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
                        ambiguous += 1
                    else:
                        feature_id = list(fs)[0]
                        write_to_samout(r, feature_id)
                        counts[feature_id] += 1
                except UnknownChrom:
                    # Chromosome absent from the GFF: counted as no feature.
                    write_to_samout(r, "__no_feature")
                    empty += 1

        except:
            # Only adds the input position to the report; always re-raised.
            sys.stderr.write("Error occured when processing SAM input (%s):\n" % read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d SAM %s processed.\n" % (i, "alignments " if not pe_mode else "alignment pairs"))
    finally:
        if samoutfile is not None:
            samoutfile.close()

    # Each print() takes one pre-formatted string, so the output is the same
    # under the Python 2 print statement and the Python 3 print function.
    for fn in sorted(counts):
        print("%s\t%d" % (fn, counts[fn]))
    print("__no_feature\t%d" % empty)
    print("__ambiguous\t%d" % ambiguous)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
Exemplo n.º 27
0
def count_reads_with_barcodes(
    sam_filename,
    features,
    feature_attr,
    order,
    max_buffer_size,
    stranded,
    overlap_mode,
    multimapped_mode,
    secondary_alignment_mode,
    supplementary_alignment_mode,
    feature_type,
    id_attribute,
    additional_attributes,
    quiet,
    minaqual,
    samout_format,
    samout_filename,
    cb_tag,
    ub_tag,
):
    """Count alignments per feature, per cell barcode, collapsing UMIs.

    Every alignment (or alignment pair) is assigned to a feature or to one
    of the special categories (``__not_aligned``, ``__alignment_not_unique``,
    ``__too_low_aQual``, ``__no_feature``, ``__ambiguous``).  Assignments are
    accumulated per (cell barcode, UMI); afterwards each UMI contributes a
    single count to the assignment it was seen with most often.  UMIs whose
    two most frequent assignments are tied contribute nothing.

    Parameters:
        sam_filename: path handed to ``HTSeq.BAM_Reader``; ``"-"`` reads
            from ``sys.stdin``.
        features: feature array queried via ``features[iv].steps()`` and
            ``features.chrom_vectors``.
        order: ``"name"`` or ``"pos"``; selects the mate-pairing routine for
            paired-end input.
        max_buffer_size: forwarded to
            ``HTSeq.pair_SAM_alignments_with_buffer`` when ``order == "pos"``.
        stranded: ``"reverse"`` inverts the strand of read 1 (and keeps read
            2 as is); any other value does the opposite.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        multimapped_mode: ``"none"`` or ``"all"``; any other value exits.
        secondary_alignment_mode / supplementary_alignment_mode: the value
            ``"ignore"`` skips such alignments.
        quiet: suppress progress messages on stderr.
        minaqual: alignments with ``aQual`` below this are not counted.
        samout_format: ``"bam"``/``"BAM"`` opens a pysam BAM writer; any
            other value opens a plain text file.
        samout_filename: output path for annotated reads, or ``None``.
        cb_tag, ub_tag: optional-field tags holding the cell barcode and
            the UMI respectively.
        feature_attr, feature_type, id_attribute, additional_attributes:
            accepted for interface parity; not read by this function.

    Returns:
        dict with ``'cell_barcodes'`` (sorted list of barcodes seen; reads
        without a barcode tag are grouped under ``None``, listed first) and
        ``'counts'`` (barcode -> ``Counter`` of assignment -> UMI count).

    Raises:
        ValueError: paired-end input with an unknown ``order``.
        SystemExit: unknown ``overlap_mode`` or ``multimapped_mode``.
    """
    def write_to_samout(r, assignment, samoutfile, template=None):
        """Append an XF tag with the assignment and write the read(s) out."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                if samout_format in ('SAM', 'sam'):
                    samoutfile.write(read.get_sam_line() + "\n")
                else:
                    samoutfile.write(read.to_pysam_AlignedSegment(template))

    def identify_barcodes(r):
        '''Identify barcode from the read or pair (both must have the same)'''
        if not pe_mode:
            r = (r, )
        # cell, UMI
        barcodes = [None, None]
        for read in r:
            if read is None:
                continue
            for tag, val in read.optional_fields:
                if tag == cb_tag:
                    barcodes[0] = val
                elif tag == ub_tag:
                    barcodes[1] = val
                else:
                    continue
                # Stop only once BOTH barcodes are known.  Counting tag hits
                # instead would stop early when both mates carry the cell
                # barcode but only the second one carries the UMI.
                if barcodes[0] is not None and barcodes[1] is not None:
                    return barcodes
        return barcodes

    try:
        if sam_filename == "-":
            read_seq_file = HTSeq.BAM_Reader(sys.stdin)
        else:
            read_seq_file = HTSeq.BAM_Reader(sam_filename)

        # Get template for output BAM
        if samout_filename is None:
            template = None
            samoutfile = None
        elif samout_format in ('bam', 'BAM'):
            template = read_seq_file.get_template()
            samoutfile = pysam.AlignmentFile(
                samout_filename,
                'wb',
                template=template,
            )
        else:
            template = None
            samoutfile = open(samout_filename, 'w')

        read_seq_iter = iter(read_seq_file)
        # Catch empty BAM files: peek at the first record to learn whether
        # the input is paired-end, then chain it back in front of the rest.
        try:
            first_read = next(read_seq_iter)
            pe_mode = first_read.paired_end
        # FIXME: catchall can hide subtle bugs.  Restricted to Exception so
        # that Ctrl-C / sys.exit are not mistaken for an empty file.
        except Exception:
            first_read = None
            pe_mode = False
        if first_read is not None:
            read_seq = itertools.chain([first_read], read_seq_iter)
        else:
            read_seq = []
    except:
        sys.stderr.write(
            "Error occured when reading beginning of SAM/BAM file.\n")
        raise

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch)
    com = ('M', '=', 'X')

    try:
        if pe_mode:
            if ((supplementary_alignment_mode == 'ignore')
                    and (secondary_alignment_mode == 'ignore')):
                primary_only = True
            else:
                primary_only = False
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(read_seq,
                                                     primary_only=primary_only)
            elif order == "pos":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                    read_seq,
                    max_buffer_size=max_buffer_size,
                    primary_only=primary_only)
            else:
                raise ValueError("Illegal order specified.")

        # The nesting is cell barcode, UMI, feature
        counts = defaultdict(lambda: defaultdict(Counter))
        i = 0
        for r in read_seq:
            if i > 0 and i % 100000 == 0 and not quiet:
                sys.stderr.write("%d alignment record%s processed.\n" %
                                 (i, "s" if not pe_mode else " pairs"))
                sys.stderr.flush()

            i += 1

            cb, ub = identify_barcodes(r)

            if not pe_mode:
                if not r.aligned:
                    counts[cb][ub]['__not_aligned'] += 1
                    write_to_samout(r, "__not_aligned", samoutfile, template)
                    continue
                if ((secondary_alignment_mode == 'ignore')
                        and r.not_primary_alignment):
                    continue
                if ((supplementary_alignment_mode == 'ignore')
                        and r.supplementary):
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        counts[cb][ub]['__alignment_not_unique'] += 1
                        write_to_samout(r, "__alignment_not_unique",
                                        samoutfile, template)
                        if multimapped_mode == 'none':
                            continue
                except KeyError:
                    # No NH tag: nothing says the alignment is non-unique.
                    pass
                if r.aQual < minaqual:
                    counts[cb][ub]['__too_low_aQual'] += 1
                    write_to_samout(r, "__too_low_aQual", samoutfile, template)
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar
                              if co.type in com and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                              if (co.type in com and co.size > 0))
            else:
                # Mate 1 contributes its intervals as-is (inverted when
                # stranded == "reverse"); mate 2 gets the opposite treatment.
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type in com and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                  if co.type in com and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                            iv_seq, (invert_strand(co.ref_iv)
                                     for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                    else:
                        iv_seq = itertools.chain(
                            iv_seq, (co.ref_iv for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                else:
                    # Mate 2 unusable; if mate 1 is unusable too the pair is
                    # unaligned.
                    if (r[0] is None) or not (r[0].aligned):
                        write_to_samout(r, "__not_aligned", samoutfile,
                                        template)
                        counts[cb][ub]['__not_aligned'] += 1
                        continue
                if secondary_alignment_mode == 'ignore':
                    if (r[0] is not None) and r[0].not_primary_alignment:
                        continue
                    elif (r[1] is not None) and r[1].not_primary_alignment:
                        continue
                if supplementary_alignment_mode == 'ignore':
                    if (r[0] is not None) and r[0].supplementary:
                        continue
                    elif (r[1] is not None) and r[1].supplementary:
                        continue
                try:
                    if ((r[0] is not None and r[0].optional_field("NH") > 1) or
                        (r[1] is not None and r[1].optional_field("NH") > 1)):
                        write_to_samout(r, "__alignment_not_unique",
                                        samoutfile, template)
                        counts[cb][ub]['__alignment_not_unique'] += 1
                        if multimapped_mode == 'none':
                            continue
                except KeyError:
                    pass
                if ((r[0] and r[0].aQual < minaqual)
                        or (r[1] and r[1].aQual < minaqual)):
                    write_to_samout(r, "__too_low_aQual", samoutfile, template)
                    counts[cb][ub]['__too_low_aQual'] += 1
                    continue

            try:
                if overlap_mode == "union":
                    # Every feature touched by any aligned segment.
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode in ("intersection-strict",
                                      "intersection-nonempty"):
                    # Features shared by all steps; "nonempty" ignores steps
                    # that carry no feature at all.
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if ((len(fs2) > 0) or
                                (overlap_mode == "intersection-strict")):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")

                if fs is None or len(fs) == 0:
                    write_to_samout(r, "__no_feature", samoutfile, template)
                    counts[cb][ub]['__no_feature'] += 1
                elif len(fs) > 1:
                    write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]",
                                    samoutfile, template)
                    counts[cb][ub]['__ambiguous'] += 1
                else:
                    write_to_samout(r, list(fs)[0], samoutfile, template)

                if fs is not None and len(fs) > 0:
                    if multimapped_mode == 'none':
                        if len(fs) == 1:
                            counts[cb][ub][list(fs)[0]] += 1
                    elif multimapped_mode == 'all':
                        for fsi in list(fs):
                            counts[cb][ub][fsi] += 1
                    else:
                        sys.exit("Illegal multimap mode.")

            except UnknownChrom:
                write_to_samout(r, "__no_feature", samoutfile, template)
                counts[cb][ub]['__no_feature'] += 1

    except:
        sys.stderr.write("Error occured when processing input (%s):\n" %
                         (read_seq_file.get_line_number_string()))
        raise

    if not quiet:
        sys.stderr.write(
            "%d %s processed.\n" %
            (i, "alignments " if not pe_mode else "alignment pairs"))
        sys.stderr.flush()

    if samoutfile is not None:
        samoutfile.close()

    # Get rid of UMI by majority rule.
    # Reads without a barcode tag are keyed by None; sort those first via an
    # explicit key, because None and str cannot be compared on Python 3.
    cbs = sorted(counts.keys(),
                 key=lambda cb: (cb is not None, cb if cb is not None else ''))
    counts_noumi = {}
    for cb in cbs:
        counts_cell = Counter()
        # pop() releases the per-UMI tables as soon as they are collapsed.
        for ub, udic in counts.pop(cb).items():
            # In case of a tie, do not increment either feature
            top = udic.most_common(2)
            if (len(top) == 2) and (top[0][1] == top[1][1]):
                continue
            counts_cell[top[0][0]] += 1
        counts_noumi[cb] = counts_cell

    return {
        'cell_barcodes': cbs,
        'counts': counts_noumi,
    }
Exemplo n.º 28
0
    def count_reads_in_features( sam_filenames, colnames, gff_filename, opts ):
        """ Hacked version of htseq count.py

        Builds the feature array from the GFF file, then counts the reads of
        every SAM/BAM input into its own column of a shared count table.

        Parameters:
            sam_filenames: alignment files to count, one column each.
            colnames: column label per input; also fixes the row width.
            gff_filename: annotation file read with HTSeq.GFF_Reader.
            opts: options object; the attributes read here are quiet,
                stranded, mapqMin, feature_type, id_attribute, filter_extras
                and mode.

        Also reads sam_exts and sam_bais (file type and index path per
        input) from the enclosing scope.

        Returns:
            ( counts, empty, ambiguous, lowqual, notaligned, nonunique,
              filtered, nreads ) where counts maps feature id to a list of
            per-input counts and the remaining values are totals over all
            inputs.
        """
        if opts.quiet:
            warnings.filterwarnings( action="ignore", module="HTSeq" )
        features = HTSeq.GenomicArrayOfSets( "auto", opts.stranded != "no" )
        mapqMin = int(opts.mapqMin)
        counts = {}
        nreads = 0
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        filtered = 0  # new filter_extras - need a better way to do this - independent filter tool?
        gff = HTSeq.GFF_Reader( gff_filename )
        i = -1  # stays -1 for an empty GFF so the summary below reports 0
        try:
            for i, f in enumerate(gff):
                if f.type == opts.feature_type:
                    try:
                        feature_id = f.attr[ opts.id_attribute ]
                    except KeyError:
                        # Fall back to gene_id before giving up.
                        try:
                            feature_id = f.attr[ 'gene_id' ]
                        except KeyError:
                            sys.exit( "Feature at row %d %s does not contain a '%s' attribute OR a gene_id attribute - faulty GFF?" %
                                      ( (i + 1), f.name, opts.id_attribute ) )
                    if opts.stranded != "no" and f.iv.strand == ".":
                        sys.exit( "Feature %s at %s does not have strand information but you are "
                                  "running htseq-count in stranded mode. Use '--stranded=no'." %
                                  ( f.name, f.iv ) )
                    features[ f.iv ] += feature_id
                    counts[ feature_id ] = [0 for x in colnames]  # we use sami as an index here to bump counts later
        except:
            sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() )
            raise

        if not opts.quiet:
            # i is the last zero-based index, so the line count is i + 1.
            sys.stdout.write( "%d GFF lines processed.\n" % ( i + 1 ) )

        if len( counts ) == 0 and not opts.quiet:
            sys.stdout.write( "Warning: No features of type '%s' found.\n" % opts.feature_type )
        for sami, sam_filename in enumerate(sam_filenames):
            colname = colnames[sami]
            isbam = sam_exts[sami] == 'bam'
            hasbai = sam_bais[sami] > ''
            if hasbai:
                # Hard-link BAM and index side by side under matching names.
                # NOTE(review): these links are never removed here - confirm
                # that the caller cleans up the working directory.
                tempname = os.path.splitext(os.path.basename(sam_filename))[0]
                tempbam = '%s_TEMP.bam' % tempname
                tempbai = '%s_TEMP.bai' % tempname
                os.link(sam_filename, tempbam)
                os.link(sam_bais[sami], tempbai)
            try:
                if isbam:
                    if hasbai:
                        read_seq = HTSeq.BAM_Reader( tempbam )
                    else:
                        read_seq = HTSeq.BAM_Reader( sam_filename )
                else:
                    read_seq = HTSeq.SAM_Reader( sam_filename )
                # Peek at the first record to learn whether input is paired-end.
                first_read = next( iter(read_seq) )
                pe_mode = first_read.paired_end
            except:
                # sys.stderr.write instead of "print >>" so the handler itself
                # cannot fail on Python 3; output bytes are unchanged.
                sys.stderr.write( "Error occured when reading first line of %s file %s colname=%s \n\n" %
                                  ( "bam" if isbam else "sam", sam_filename, colname ) )
                raise

            seqi = -1
            try:
                if pe_mode:
                    read_seq_pe_file = read_seq
                    read_seq = HTSeq.pair_SAM_alignments( read_seq )
                for seqi, r in enumerate(read_seq):
                    nreads += 1
                    if not pe_mode:
                        if not r.aligned:
                            notaligned += 1
                            continue
                        # Drop the read if any requested extra tag is set.  A
                        # read lacking a tag simply does not match that filter.
                        is_filtered = False
                        for extra in ( opts.filter_extras or () ):
                            try:
                                if r.optional_field(extra):
                                    is_filtered = True
                                    break
                            except KeyError:
                                continue
                        if is_filtered:
                            filtered += 1
                            continue
                        try:
                            if r.optional_field( "NH" ) > 1:
                                nonunique += 1
                                continue
                        except KeyError:
                            pass
                        if r.aQual < mapqMin:
                            lowqual += 1
                            continue
                        if opts.stranded != "reverse":
                            iv_seq = ( co.ref_iv for co in r.cigar if co.type == "M" and co.size > 0 )
                        else:
                            iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar if co.type == "M" and co.size > 0 )
                    else:
                        if r[0] is not None and r[0].aligned:
                            if opts.stranded != "reverse":
                                iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" and co.size > 0 )
                            else:
                                iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" and co.size > 0 )
                        else:
                            iv_seq = tuple()
                        if r[1] is not None and r[1].aligned:
                            # The second mate gets the opposite strand handling.
                            if opts.stranded != "reverse":
                                iv_seq = itertools.chain( iv_seq,
                                                          ( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" and co.size > 0 ) )
                            else:
                                iv_seq = itertools.chain( iv_seq,
                                                          ( co.ref_iv for co in r[1].cigar if co.type == "M" and co.size > 0 ) )
                        else:
                            if r[0] is None or not r[0].aligned:
                                notaligned += 1
                                continue
                        try:
                            if ( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \
                                    ( r[1] is not None and r[1].optional_field( "NH" ) > 1 ):
                                nonunique += 1
                                continue
                        except KeyError:
                            pass
                        if ( r[0] and r[0].aQual < mapqMin ) or ( r[1] and r[1].aQual < mapqMin ):
                            lowqual += 1
                            continue

                    try:
                        if opts.mode == "union":
                            fs = set()
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[ iv ].steps():
                                    fs = fs.union( fs2 )
                        elif opts.mode == "intersection-strict" or opts.mode == "intersection-nonempty":
                            fs = None
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[ iv ].steps():
                                    if len(fs2) > 0 or opts.mode == "intersection-strict":
                                        if fs is None:
                                            fs = fs2.copy()
                                        else:
                                            fs = fs.intersection( fs2 )
                        else:
                            sys.exit( "Illegal overlap mode %s" % opts.mode )
                        if fs is None or len( fs ) == 0:
                            empty += 1
                        elif len( fs ) > 1:
                            ambiguous += 1
                        else:
                            ck = list(fs)[0]
                            counts[ck][sami] += 1  # end up with counts for each sample as a list
                    except UnknownChrom:
                        if not pe_mode:
                            rr = r
                        else:
                            rr = r[0] if r[0] is not None else r[1]
                        empty += 1
                        if not opts.quiet:
                            sys.stdout.write( ( "Warning: Skipping read '%s', because chromosome " +
                                                "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) %
                                              ( rr.read.name, iv.chrom ) )
            except:
                if not pe_mode:
                    sys.stderr.write( "Error occured in %s.\n" % read_seq.get_line_number_string() )
                else:
                    sys.stderr.write( "Error occured in %s.\n" % read_seq_pe_file.get_line_number_string() )
                raise

            if not opts.quiet:
                # seqi is the last zero-based index, so the total is seqi + 1.
                sys.stdout.write( "%d sam %s processed for %s.\n" % ( seqi + 1, "lines " if not pe_mode else "line pairs", colname ) )
        return counts, empty, ambiguous, lowqual, notaligned, nonunique, filtered, nreads
Exemplo n.º 29
0
# Deal with any GFF file reading errors
except ValueError as e:
    e.args += (gff.get_line_number_string(), )
    raise

try:
    # Get the first read to see if we're dealing with paired-end data
    read_seq = HTSeq.SAM_Reader(options.sam)
    first_read = iter(read_seq).next()
    pe_mode = first_read.paired_end

    # Re-initialize read_seq depending on if it's paired-end data or not
    read_seq = HTSeq.SAM_Reader(options.sam)
    if pe_mode:
        read_seq = HTSeq.pair_SAM_alignments(read_seq)

    # Read counter, for feedback to user
    i = 0
    total = 0
    # Here we go, through each read...
    for r in read_seq:
        spliced = False
        if not pe_mode:
            if not r.aligned:
                continue
            total += 1
            iv_seq = []

            # Check to see if it's spliced
            for co in r.cigar:
Exemplo n.º 30
0
def count_reads_single_file(
    isam,
    sam_filename,
    features,
    feature_attr,
    order,
    max_buffer_size,
    stranded,
    overlap_mode,
    multimapped_mode,
    secondary_alignment_mode,
    supplementary_alignment_mode,
    feature_type,
    id_attribute,
    additional_attributes,
    quiet,
    minaqual,
    samout_format,
    samout_filename,
):
    """Count the alignments of one SAM/BAM input per feature.

    Every alignment (or alignment pair) is either assigned to features or
    tallied in one of the special categories (not aligned, not unique, low
    quality, no feature, ambiguous).

    Parameters:
        isam: opaque identifier of this input; returned unchanged.
        sam_filename: path handed to ``HTSeq.BAM_Reader``; ``"-"`` reads
            from ``sys.stdin``.
        features: feature array queried via ``features[iv].steps()`` and
            ``features.chrom_vectors``.
        feature_attr: iterable of feature ids; every id starts at count 0.
        order: ``"name"`` or ``"pos"``; selects the mate-pairing routine for
            paired-end input.
        max_buffer_size: forwarded to
            ``HTSeq.pair_SAM_alignments_with_buffer`` when ``order == "pos"``.
        stranded: ``"reverse"`` inverts the strand of read 1 (and keeps read
            2 as is); any other value does the opposite.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        multimapped_mode: ``"none"``, ``"all"``, ``"fraction"`` or
            ``"random"``; controls how a read hitting several features is
            credited.
        secondary_alignment_mode / supplementary_alignment_mode: the value
            ``"ignore"`` skips such alignments.
        quiet: suppress progress messages on stderr.
        minaqual: alignments with ``aQual`` below this are not counted.
        samout_format: ``"bam"``/``"BAM"`` opens a pysam BAM writer; any
            other value opens a plain text file.
        samout_filename: output path for annotated reads, or ``None``.
        feature_type, id_attribute, additional_attributes: accepted for
            interface parity; not read by this function.

    Returns:
        dict with keys ``'isam'``, ``'counts'`` (feature id -> count),
        ``'empty'``, ``'ambiguous'``, ``'lowqual'``, ``'notaligned'`` and
        ``'nonunique'``.

    Raises:
        ValueError: paired-end input with an unknown ``order``.
        SystemExit: unknown ``overlap_mode`` or ``multimapped_mode``.
    """
    def write_to_samout(r, assignment, samoutfile, template=None):
        """Append an XF tag with the assignment and write the read(s) out."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                if samout_format in ('SAM', 'sam'):
                    samoutfile.write(read.get_sam_line() + "\n")
                else:
                    samoutfile.write(read.to_pysam_AlignedSegment(template))

    try:
        if sam_filename == "-":
            read_seq_file = HTSeq.BAM_Reader(sys.stdin)
        else:
            read_seq_file = HTSeq.BAM_Reader(sam_filename)

        # Get template for output BAM
        if samout_filename is None:
            template = None
            samoutfile = None
        elif samout_format in ('bam', 'BAM'):
            template = read_seq_file.get_template()
            samoutfile = pysam.AlignmentFile(
                samout_filename,
                'wb',
                template=template,
            )
        else:
            template = None
            samoutfile = open(samout_filename, 'w')

        read_seq_iter = iter(read_seq_file)
        # Catch empty BAM files: peek at the first record to learn whether
        # the input is paired-end, then chain it back in front of the rest.
        try:
            first_read = next(read_seq_iter)
            pe_mode = first_read.paired_end
        # FIXME: catchall can hide subtle bugs.  Restricted to Exception so
        # that Ctrl-C / sys.exit are not mistaken for an empty file.
        except Exception:
            first_read = None
            pe_mode = False
        if first_read is not None:
            read_seq = itertools.chain([first_read], read_seq_iter)
        else:
            read_seq = []
    except:
        sys.stderr.write(
            "Error occured when reading beginning of SAM/BAM file.\n")
        raise

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch)
    com = ('M', '=', 'X')
    counts = {key: 0 for key in feature_attr}

    try:
        if pe_mode:
            if ((supplementary_alignment_mode == 'ignore')
                    and (secondary_alignment_mode == 'ignore')):
                primary_only = True
            else:
                primary_only = False
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(read_seq,
                                                     primary_only=primary_only)
            elif order == "pos":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                    read_seq,
                    max_buffer_size=max_buffer_size,
                    primary_only=primary_only)
            else:
                raise ValueError("Illegal order specified.")
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            if i > 0 and i % 100000 == 0 and not quiet:
                sys.stderr.write("%d alignment record%s processed.\n" %
                                 (i, "s" if not pe_mode else " pairs"))
                sys.stderr.flush()

            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    write_to_samout(r, "__not_aligned", samoutfile, template)
                    continue
                if ((secondary_alignment_mode == 'ignore')
                        and r.not_primary_alignment):
                    continue
                if ((supplementary_alignment_mode == 'ignore')
                        and r.supplementary):
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        write_to_samout(r, "__alignment_not_unique",
                                        samoutfile, template)
                        if multimapped_mode == 'none':
                            continue
                except KeyError:
                    # No NH tag: nothing says the alignment is non-unique.
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    write_to_samout(r, "__too_low_aQual", samoutfile, template)
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar
                              if co.type in com and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                              if (co.type in com and co.size > 0))
            else:
                # Mate 1 contributes its intervals as-is (inverted when
                # stranded == "reverse"); mate 2 gets the opposite treatment.
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type in com and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                  if co.type in com and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                            iv_seq, (invert_strand(co.ref_iv)
                                     for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                    else:
                        iv_seq = itertools.chain(
                            iv_seq, (co.ref_iv for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                else:
                    # Mate 2 unusable; if mate 1 is unusable too the pair is
                    # unaligned.
                    if (r[0] is None) or not (r[0].aligned):
                        write_to_samout(r, "__not_aligned", samoutfile,
                                        template)
                        notaligned += 1
                        continue
                if secondary_alignment_mode == 'ignore':
                    if (r[0] is not None) and r[0].not_primary_alignment:
                        continue
                    elif (r[1] is not None) and r[1].not_primary_alignment:
                        continue
                if supplementary_alignment_mode == 'ignore':
                    if (r[0] is not None) and r[0].supplementary:
                        continue
                    elif (r[1] is not None) and r[1].supplementary:
                        continue
                try:
                    if ((r[0] is not None and r[0].optional_field("NH") > 1) or
                        (r[1] is not None and r[1].optional_field("NH") > 1)):
                        nonunique += 1
                        write_to_samout(r, "__alignment_not_unique",
                                        samoutfile, template)
                        if multimapped_mode == 'none':
                            continue
                except KeyError:
                    pass
                if ((r[0] and r[0].aQual < minaqual)
                        or (r[1] and r[1].aQual < minaqual)):
                    lowqual += 1
                    write_to_samout(r, "__too_low_aQual", samoutfile, template)
                    continue

            try:
                if overlap_mode == "union":
                    # Every feature touched by any aligned segment.
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode in ("intersection-strict",
                                      "intersection-nonempty"):
                    # Features shared by all steps; "nonempty" ignores steps
                    # that carry no feature at all.
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if ((len(fs2) > 0) or
                                (overlap_mode == "intersection-strict")):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")

                if fs is None or len(fs) == 0:
                    write_to_samout(r, "__no_feature", samoutfile, template)
                    empty += 1
                elif len(fs) > 1:
                    write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]",
                                    samoutfile, template)
                    ambiguous += 1
                else:
                    write_to_samout(r, list(fs)[0], samoutfile, template)

                if fs is not None and len(fs) > 0:
                    if multimapped_mode == 'none':
                        if len(fs) == 1:
                            counts[list(fs)[0]] += 1
                    elif multimapped_mode == 'all':
                        for fsi in list(fs):
                            counts[fsi] += 1
                    elif multimapped_mode == 'fraction':
                        for fsi in list(fs):
                            counts[fsi] += 1.0 / len(fs)
                    elif multimapped_mode == 'random':
                        # fs is a set and random.choice needs a sequence.
                        # Sorting first also makes the pick reproducible for
                        # a fixed random seed.
                        fsi = random.choice(sorted(fs))
                        counts[fsi] += 1
                    else:
                        sys.exit("Illegal multimap mode.")

            except UnknownChrom:
                write_to_samout(r, "__no_feature", samoutfile, template)
                empty += 1

    except:
        sys.stderr.write("Error occured when processing input (%s):\n" %
                         (read_seq_file.get_line_number_string()))
        raise

    if not quiet:
        sys.stderr.write(
            "%d %s processed.\n" %
            (i, "alignments " if not pe_mode else "alignment pairs"))
        sys.stderr.flush()

    if samoutfile is not None:
        samoutfile.close()

    return {
        'isam': isam,
        'counts': counts,
        'empty': empty,
        'ambiguous': ambiguous,
        'lowqual': lowqual,
        'notaligned': notaligned,
        'nonunique': nonunique,
    }
Exemplo n.º 31
0
def count_reads_in_features(sam_filenames, gff_filename, samtype, order,
                            max_buffer_size, stranded, overlap_mode,
                            multimapped_mode, secondary_alignment_mode,
                            supplementary_alignment_mode, feature_type,
                            id_attribute, additional_attributes, quiet,
                            minaqual, samouts):
    """Sample read coverage at ten relative positions along every gene.

    The GFF file is read once: every feature of ``feature_type`` is stored
    in a ``HTSeq.GenomicArrayOfSets`` and its interval is collected under
    the value of its ``id_attribute``. Per gene the intervals are sorted and
    merged, and ten sample points (0%, 10%, ... 90% of the merged exon
    length) are placed on them. Each SAM/BAM file is then scanned; reads
    that resolve to exactly one feature id are passed to the nested
    ``check_and_count_points_coverage`` helper, which increments the
    coverage counter of a sample point the read spans. After every file a
    text table is written, ``CalcCoverage.do_coverage`` is called and the
    counters are reset; at the end a matplotlib figure is saved and shown.

    Args:
        sam_filenames: List of SAM/BAM paths; the single entry ``'-'``
            reads from standard input.
        gff_filename: Path of the GFF/GTF annotation.
        samtype: ``"sam"`` or ``"bam"``; selects the HTSeq reader class.
        order: ``"name"`` or ``"pos"``; how paired-end input is sorted.
        max_buffer_size: Forwarded to
            ``HTSeq.pair_SAM_alignments_with_buffer`` when ``order`` is
            ``"pos"``.
        stranded: ``"no"`` builds an unstranded feature array;
            ``"reverse"`` inverts the strand of the read intervals.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        multimapped_mode: Only ``'none'`` is handled; with any other value
            the function calls ``sys.exit`` on the first read that overlaps
            a feature.
        secondary_alignment_mode: ``'ignore'`` skips non-primary
            alignments.
        supplementary_alignment_mode: ``'ignore'`` skips supplementary
            alignments.
        feature_type: GFF feature type to load (third GFF column).
        id_attribute: GFF attribute used as the gene/feature id.
        additional_attributes: Accepted but not referenced in this
            function.
        quiet: Suppresses progress messages on stderr.
        minaqual: Reads with ``aQual`` below this value are skipped.
        samouts: ``""`` or a list of output paths, one per input file. The
            files are created/truncated, but every call that would write
            to them is commented out.

    Returns:
        None. Results are written to files under a hard-coded directory.

    Raises:
        ValueError: On mismatched ``samouts`` length, unknown ``samtype``,
            illegal ``order``, a feature lacking ``id_attribute``, or a
            strandless feature while ``stranded`` is not ``"no"``.
    """
    def exists(obj, chain):
        """Fetch a nested value by following ``chain`` of keys through ``obj``.

        Every key is tested with ``in`` before it is dereferenced, so walking
        a ``defaultdict`` never creates entries as a side effect.

        Args:
            obj: Outermost container (anything supporting ``in`` and item
                access).
            chain: Sequence of keys, outermost first. It is only read, never
                modified, so callers may reuse it. An empty chain yields
                ``obj`` itself.

        Returns:
            The value found at the end of the chain, or ``None`` as soon as
            a key is absent. A stored ``None`` is therefore indistinguishable
            from a missing key.
        """
        node = obj
        for key in chain:
            if key not in node:
                return None
            node = node[key]
        return node

    def check_overlapped_exons_and_calc_sum(gene):
        """Merge overlapping exon intervals of ``gene`` and total their length.

        ``gene["exons"]`` must be a non-empty list of ``[start, end]`` pairs
        already ordered by start. Intervals that overlap or touch are fused
        into one run; an interval lying completely inside the current run is
        dropped. ``gene`` is modified in place: ``"exons"`` is replaced by
        the merged list and ``"total_sum_of_exons"`` receives the summed
        length of the merged runs. Nothing is returned.
        """
        exons = gene["exons"]
        run_start, run_end = exons[0]
        covered = run_end - run_start
        merged = []

        for exon_start, exon_end in exons:
            if exon_start > run_end:
                # Gap before this exon: close the current run, open a new one.
                merged.append([run_start, run_end])
                covered += exon_end - exon_start
                run_start, run_end = exon_start, exon_end
            elif exon_end >= run_end:
                # Overlaps (or touches) the run and reaches further right.
                covered += exon_end - run_end
                run_end = exon_end
            # Otherwise the exon is swallowed by the current run.

        merged.append([run_start, run_end])

        gene["exons"] = merged
        gene["total_sum_of_exons"] = covered

    def check_and_count_points_coverage(gene_id, first_read, second_read):
        """Register which sampled point(s) of ``gene_id`` a read pair spans.

        Both mate intervals are shifted so that the gene's first exon start
        is coordinate zero, then handed to ``check``:

        * mates that do not overlap are checked one after the other;
        * mates that overlap or touch are checked once, as the single span
          from the smaller start to the larger end.

        Nothing is counted when a mate is missing, when a mate is present
        but unaligned, when either mate is not flagged as a proper pair, or
        when either mate's interval is empty.

        Args:
            gene_id: Key into ``genes_exons`` / ``genes_coverage_in_points``.
            first_read: First mate (an alignment object) or ``None``.
            second_read: Second mate (an alignment object) or ``None``.
        """
        if first_read is None or second_read is None:
            return

        # A mate that is present but unaligned has no interval; reading
        # ``iv.start`` on it used to raise AttributeError and abort the run.
        if first_read.iv is None or second_read.iv is None:
            return

        if not (first_read.proper_pair and second_read.proper_pair):
            return

        # Subtract the gene start from every coordinate so that they are
        # comparable with the stored sample points.
        gene_begin = genes_exons[gene_id]["gene_begin"]
        fstart = first_read.iv.start - gene_begin
        fend = first_read.iv.end - gene_begin
        sstart = second_read.iv.start - gene_begin
        send = second_read.iv.end - gene_begin

        if not (fstart < fend and sstart < send):
            return

        if fend < sstart or send < fstart:
            # Disjoint mates: each one may hit its own point.
            check(gene_id, fstart, fend)
            check(gene_id, sstart, send)
        else:
            # Overlapping, touching or nested mates form one continuous span.
            check(gene_id, min(fstart, sstart), max(fend, send))

    def check(gene_id, start, end):
        """Increment the coverage of one sampled point lying inside a span.

        Performs a bisection-like probe over the points stored in
        ``genes_coverage_in_points[gene_id]`` (keys 0, 10, ... 90), starting
        at key 50. A probe position with no stored point is snapped down to
        the nearest multiple of ten. The first point found strictly between
        ``start`` and ``end`` gets its ``"coverage"`` incremented and the
        search stops.

        Failures are best-effort: a missing gene/point (KeyError) or more
        than eleven probes (ValueError, which also happens when a point
        coincides exactly with ``start`` or ``end``) only writes
        ``"Out of boundaries"`` to stderr.

        NOTE(review): tracing the arithmetic, the probe sequence does not
        appear to visit every key (key 0, for example, seems unreachable);
        ``check2`` is an exhaustive linear scan -- confirm which of the two
        is intended.

        Args:
            gene_id: Key into ``genes_coverage_in_points``.
            start: Span start, relative to the gene start.
            end: Span end, relative to the gene start.
        """
        half = 50
        left_interval = right_interval = half

        try:
            i = 0
            while left_interval >= 10:
                if i > 10:
                    raise ValueError('Out of boundaries\n')

                if exists(genes_coverage_in_points, [gene_id, half]) is None:
                    # No point at this key: look for the nearest one to the
                    # left and skew the search window accordingly.
                    half = half // 10 * 10
                    right_interval += 5
                    left_interval -= 5

                entry = genes_coverage_in_points[gene_id][half]
                point = entry["point"]

                if point < start:
                    # Point lies left of the span: continue to the right.
                    # Floor division keeps ``half`` an int dictionary key.
                    half = half + right_interval // 2
                    left_interval = right_interval = right_interval // 2

                elif point > end:
                    # Point lies right of the span: continue to the left.
                    half = half - left_interval // 2
                    left_interval = right_interval = left_interval // 2

                elif start < point < end:
                    # The span crosses this point.
                    entry["coverage"] += 1
                    return
                i += 1

        except (KeyError, ValueError):
            # Only the two expected failure modes are reported; interrupts
            # and exit requests are no longer swallowed.
            sys.stderr.write("Out of boundaries\n")

    def check2(gene_id, start, end):
        """Linear counterpart of ``check``.

        Walks the sampled points of ``gene_id`` in key order 0, 10, ... 90
        and increments the ``"coverage"`` of the first one lying strictly
        between ``start`` and ``end``; does nothing if no point qualifies.
        """
        gene_points = genes_coverage_in_points[gene_id]
        for percent in range(0, 100, 10):
            entry = gene_points[percent]
            if start < entry["point"] < end:
                entry["coverage"] += 1
                return

    def clear_all_cov_points():
        """Reset the ``"coverage"`` counter of every sampled point to zero.

        Iterates with ``values()`` because the keys are not needed and, unlike
        ``iteritems()``, it exists on both Python 2 and Python 3 dictionaries.
        """
        for gene_points in genes_coverage_in_points.values():
            for entry in gene_points.values():
                entry["coverage"] = 0

    def plot_gene_coverage():
        """Debug helper: plot per-base values of ``cvg`` over a chrX region.

        Writes a marker line containing ``test_n[0]`` to stderr, then plots
        the values of ``cvg`` across the interval
        ``chrX:test_first_exon_start-test_last_exon_end`` against their
        0-based offset and shows the figure.

        NOTE(review): ``cvg``, ``test_first_exon_start`` and
        ``test_last_exon_end`` are not assigned anywhere in the enclosing
        function as far as this file shows (the only ``cvg`` assignment is
        commented out), so calling this presumably raises NameError. Its
        single call site is commented out as well.
        """
        sys.stderr.write("ENSG00000000003.10 genes on: " + str(test_n[0]) +
                         "\n")

        region = HTSeq.GenomicInterval("chrX", test_first_exon_start,
                                       test_last_exon_end)
        y = list(cvg[region])
        x = list(range(len(y)))

        plt.plot(x, y)
        plt.show()

    # ------------------------------------------------------------------
    # Main body: validate the file arguments, load the annotation, place
    # ten sample points on every gene, then scan each SAM/BAM file and
    # accumulate per-point coverage.
    # ------------------------------------------------------------------
    if samouts != "":
        if len(samouts) != len(sam_filenames):
            raise ValueError(
                'Select the same number of SAM input and output files')
        # Try to open samout files early in case any of them has issues
        # (mode 'w' also truncates any existing file).
        for samout in samouts:
            with open(samout, 'w'):
                pass

    # Try to open samfiles to fail early in case any of them is not there
    if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
        for sam_filename in sam_filenames:
            with open(sam_filename):
                pass

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch
    com = ('M', '=', 'X')

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    gff = HTSeq.GFF_Reader(gff_filename)

    # gene id -> {0, 10, ... 90} -> {"point": offset from gene start,
    #                                "coverage": counter}
    #genes_coverage_in_points = {}
    genes_coverage_in_points = defaultdict(dict)
    #genes_exons = {}

    # gene id -> {"exons", "gene_begin", "total_sum_of_exons",
    #             "total_aligned_reads"}
    genes_exons = defaultdict(dict)
    #cvg = HTSeq.GenomicArray("auto", stranded != "no")

    # Only read by the plot_gene_coverage debug helper.
    test_n = [0]
    i = 0

    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                        "Feature %s does not contain a '%s' attribute" %
                        (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'."
                        % (f.name, f.iv))

                features[f.iv] += feature_id

                #counts[f.attr[id_attribute]] = 0

                # exons do not arrive sorted by coordinate!
                # gene -> exon boundaries
                # all intervals and the sum of all intervals are kept here
                gene_id = feature_id  #f.attr[id_attribute]

                if (exists(genes_exons, [gene_id]) == None):
                    # coordinate of the first exon

                    genes_exons[gene_id] = {
                        "total_sum_of_exons": 0,
                        "total_aligned_reads": 0,
                        "gene_begin": 0,
                        "exons": list([[f.iv.start, f.iv.end]])
                    }

                else:

                    genes_exons[gene_id]["exons"].append(
                        [f.iv.start, f.iv.end])

                # 10 points per gene for which coverage will be counted
                # (introns are subtracted)

            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)

    except:
        # Report where the GFF reader stopped, then propagate the error.
        sys.stderr.write("Error occured when processing GFF file (%s):\n" %
                         gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(genes_exons) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n" %
                         feature_type)

    # Walk over all genes and sort each gene's exons by start coordinate.
    # After sorting, the gene's begin coordinate is set (first exon).
    # Overlapping exons must be merged and their bounds extended.
    # After merging we obtain the exon sum total_sum_of_exons, i.e. we get
    # the regions not covered on either strand.
    # NOTE(review): iteritems()/xrange below exist only on Python 2.

    for gene_id, gene in genes_exons.iteritems():

        gene["exons"].sort()  #by first member
        gene["gene_begin"] = gene["exons"][0][0]

        # merge all overlapping exons and at the same time compute the sum of
        # the lengths without the resulting gaps
        check_overlapped_exons_and_calc_sum(gene)

        total = gene["total_sum_of_exons"]  # length of all exons

        for ten_interval in xrange(0, 100, 10):
            point = (total * ten_interval
                     ) / 100  # point in absolute terms: % of the exon length
            prev_exon_end = 0

            # Convert the offset within the concatenated exons into a genomic
            # coordinate by adding back the gaps that precede each exon.
            for exon_key, exon in enumerate(gene["exons"]):

                #prev_exon_length + exon.start +
                point += (exon[0] - prev_exon_end)  # intron length

                if (point < exon[1]):  # exon end point
                    # store the point in the final array
                    genes_coverage_in_points[gene_id][ten_interval] = {
                        "point": point - gene["gene_begin"],
                        "coverage": 0
                    }

                    break  # move on to the next 10% point
                else:
                    # record the end of the exon the point did not fit into
                    #prev_exon_length += exon.end - exon.start
                    prev_exon_end = exon[1]

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    sample = 0

    # colors and handlers are only passed through to CalcCoverage.do_coverage;
    # handlers is later used as the legend handle list.
    colors = ["red", "blue", "green", "yellow"]
    handlers = []
    sys.stderr.write(strftime("%Y-%m-%d %H:%M:%S", gmtime()) + "\n")
    for isam, (sam_filename) in enumerate(sam_filenames):

        # Counts every record/pair yielded by the reader, before filtering.
        total_of_reads_in_sample = 0

        # NOTE(review): the file is opened but nothing in this function
        # writes to it; it is not closed if an exception propagates.
        if samouts != '':
            samoutfile = open(samouts[isam], 'w')
        else:
            samoutfile = None

        try:
            # Peek at the first record to learn whether the input is
            # paired-end. For stdin the consumed record is chained back in
            # front of the remaining iterator.
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write(
                "Error occured when reading beginning of SAM/BAM file.\n")
            raise

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                        read_seq, max_buffer_size=max_buffer_size)
                else:
                    raise ValueError("Illegal order specified.")

            # notaligned is initialised but never incremented (the increments
            # are commented out below).
            notaligned = 0
            lowqual = 0

            i = 0
            for r in read_seq:
                #TODO 'NoneType' object has no attribute 'iv' raised in plot_coverage.py:169]
                total_of_reads_in_sample += 1
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d SAM alignment record%s processed.\n" %
                                     (i, "s" if not pe_mode else " pairs"))
                    sys.stderr.write(
                        strftime("%Y-%m-%d %H:%M:%S", gmtime()) + "\n")
                i += 1
                if not pe_mode:
                    # Single-end: r is one alignment record.
                    if not r.aligned:
                        #notaligned += 1
                        #write_to_samout(r, "__not_aligned", samoutfile)
                        continue
                    if ((secondary_alignment_mode == 'ignore')
                            and r.not_primary_alignment):
                        continue
                    if ((supplementary_alignment_mode == 'ignore')
                            and r.supplementary):
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            #nonunique += 1
                            #write_to_samout(r, "__alignment_not_unique", samoutfile)
                            if multimapped_mode == 'none':
                                continue
                    except KeyError:
                        # No NH tag: treat the read as uniquely mapped.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        #write_to_samout(r, "__too_low_aQual", samoutfile)
                        continue
                    # NOTE(review): this branch matches only "M" operations
                    # while every other branch uses the `com` tuple
                    # ('M', '=', 'X') -- presumably an oversight; confirm.
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r.cigar
                                  if co.type == "M" and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                                  if (co.type in com and co.size > 0))
                else:
                    # Paired-end: r is a (first mate, second mate) pair whose
                    # members may be None. The second mate's strand is
                    # inverted relative to the first.
                    if r[0] is not None and r[0].aligned:
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r[0].cigar
                                      if co.type in com and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv)
                                      for co in r[0].cigar
                                      if co.type in com and co.size > 0)
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        if stranded != "reverse":
                            iv_seq = itertools.chain(
                                iv_seq, (invert_strand(co.ref_iv)
                                         for co in r[1].cigar
                                         if co.type in com and co.size > 0))
                        else:
                            iv_seq = itertools.chain(
                                iv_seq, (co.ref_iv for co in r[1].cigar
                                         if co.type in com and co.size > 0))
                    else:
                        # Neither mate aligned: nothing to assign.
                        if (r[0] is None) or not (r[0].aligned):
                            #write_to_samout(r, "__not_aligned", samoutfile)
                            #notaligned += 1
                            continue
                    if secondary_alignment_mode == 'ignore':
                        if (r[0] is not None) and r[0].not_primary_alignment:
                            continue
                        elif (r[1] is not None) and r[1].not_primary_alignment:
                            continue
                    if supplementary_alignment_mode == 'ignore':
                        if (r[0] is not None) and r[0].supplementary:
                            continue
                        elif (r[1] is not None) and r[1].supplementary:
                            continue
                    try:
                        if ((r[0] is not None
                             and r[0].optional_field("NH") > 1)
                                or (r[1] is not None
                                    and r[1].optional_field("NH") > 1)):
                            #nonunique += 1
                            #write_to_samout(r, "__alignment_not_unique", samoutfile)
                            if multimapped_mode == 'none':
                                continue
                    except KeyError:
                        pass
                    if ((r[0] and r[0].aQual < minaqual)
                            or (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        #write_to_samout(r, "__too_low_aQual", samoutfile)
                        continue

                try:
                    # Resolve the set of feature ids (fs) the read overlaps.
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode in ("intersection-strict",
                                          "intersection-nonempty"):
                        fs = None
                        for iv in iv_seq:
                            # Unlike "union", intervals on unknown
                            # chromosomes are skipped here.
                            if iv.chrom not in features.chrom_vectors:
                                continue
                                #raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if ((len(fs2) > 0) or
                                    (overlap_mode == "intersection-strict")):
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")

                    if fs is not None and len(fs) > 0:
                        if multimapped_mode == 'none':
                            if len(fs) == 1:
                                #counts[list(fs)[0]] += 1
                                #read mapped only for one exon, (all cigar parts of both reads in pair mapped on one gene, but may be for several exons)
                                #we can take this read into account of analysis
                                #they must come in sorted order by coordinate!
                                #this is one unit of analysis. save it in memory and go throught it

                                gene_name = list(fs)[0]  # gene name

                                genes_exons[gene_name][
                                    "total_aligned_reads"] += 1

                                #if (total_of_reads_in_sample==100000):
                                #   break

                                # NOTE(review): r is indexed here regardless
                                # of pe_mode; for single-end input r is one
                                # alignment record, so this presumably fails
                                # -- TODO confirm.
                                check_and_count_points_coverage(
                                    gene_name, r[0], r[1])
                            """
                            elif multimapped_mode == 'all':
                                for fsi in list(fs):
                                    #counts[fsi] += 1 
                            """
                        else:
                            # Any mode other than 'none' terminates the
                            # program at the first read that hits a feature.
                            sys.exit("Illegal multimap mode.")

                except UnknownChrom:
                    # Re-raised as is, so a read on a chromosome absent from
                    # the annotation aborts the run in "union" mode.
                    #write_to_samout(r, "__no_feature", samoutfile)
                    #empty += 1
                    raise

        except:
            sys.stderr.write(
                "Error occured when processing SAM input (%s):\n" %
                read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write(
                "%d SAM %s processed.\n" %
                (i, "alignments " if not pe_mode else "alignment pairs"))

        if samoutfile is not None:
            samoutfile.close()

        # save the data to tables so it can be processed however needed later!
        # NOTE(review): the output directory is hard-coded.

        outfile = open(
            '/home/kirill/bi/transcript/' + str(sample) + '_dict.txt', 'w')
        outfile.write("total_of_reads_in_sample" + '\t' +
                      str(total_of_reads_in_sample) + '\n')
        for gene_id, gene in genes_coverage_in_points.iteritems():

            # Line 1: gene id, reads assigned to the gene, merged exon length.
            outfile.write(
                str(gene_id) + '\t' +
                str(genes_exons[gene_id]["total_aligned_reads"]) + '\t' +
                str(genes_exons[gene_id]["total_sum_of_exons"]) + '\n')

            # Line 2: gene id followed by the coverage of each point, in the
            # dictionary's iteration order.
            outfile.write(str(gene_id) + '\t')
            [
                outfile.write(str(val["coverage"]) + '\t')
                for k, val in gene.iteritems()
            ]
            outfile.write('\n')

        outfile.close()

        #############test################

        #plot_gene_coverage()

        ################################

        #1. get the % of the reads mapped to the gene at a specific point (the sum of all % over the 10 points = 100) - the number of reads mapped to the gene is stored in an array (the former count array)
        #2. for each point divide the obtained percentage by the length of the gene (total_sum of exons)
        #3. for each point divide the value by the total number of reads in the sample
        #4. deviance - min - max of all values? the point on the plot is the mean between them

        CalcCoverage.do_coverage(genes_coverage_in_points, genes_exons,
                                 total_of_reads_in_sample, colors, sample,
                                 handlers)

        sample += 1

        # reset the coverage points before the next sample
        # NOTE(review): "total_aligned_reads" in genes_exons is not reset, so
        # it keeps accumulating across samples -- confirm this is intended.
        clear_all_cov_points()

    plt.legend(handlers, ['Sample ' + str(v) for v in range(0, sample, 1)])
    plt.title('Positions relative coverege')
    plt.xlabel('5` -> 3` positions, %')
    plt.ylabel('relative coverage')
    plt.grid(True)

    plt.savefig('/home/kirill/bi/transcript/covarage.png')
    plt.show()
    plt.close()
Exemplo n.º 32
0
def count_reads_in_features(sam_filename, gff_filename, stranded,
                            overlap_mode, feature_type, id_attribute, quiet,
                            minaqual, samout, custom_stat):
    """Count aligned reads per annotated feature and print a count table.

    Features of ``feature_type`` are loaded from the GFF file and keyed by
    ``id_attribute``. Every read (or read pair) of the SAM input is then
    assigned to the set of features its "M" CIGAR intervals overlap, using
    ``overlap_mode``. Reads that resolve to exactly one feature are counted.
    The table ``<feature>\\t<count>`` plus the special rows ``no_feature``,
    ``ambiguous``, ``too_low_aQual``, ``not_aligned`` and
    ``alignment_not_unique`` is written to standard output.

    Args:
        sam_filename: Path of the SAM file, or ``"-"`` for standard input.
        gff_filename: Path of the GFF/GTF annotation.
        stranded: ``"no"`` ignores strand; ``"reverse"`` inverts the strand
            of the read intervals; any other value counts strand-specific.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        feature_type: GFF feature type to load.
        id_attribute: GFF attribute used as feature id.
        quiet: Suppresses progress messages and HTSeq warnings.
        minaqual: Reads with ``aQual`` below this value are skipped.
        samout: Path of an annotated SAM output file (each line gets an
            ``XF:Z:`` tag), or ``""`` to disable.
        custom_stat: Path of a statistics file opened in append mode, or
            ``""`` to disable.

    Returns:
        None.

    Raises:
        SystemExit: For a feature lacking ``id_attribute``, a strandless
            feature in stranded mode, or an unknown ``overlap_mode``.
    """

    def write_to_samout(r, assignment):
        """Append ``r`` (a read or a read pair) to the SAM output, tagged."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def is_complete_pair(r):
        """Return True only for a paired-end record with both mates present.

        Single-end records are never indexed, which used to raise when the
        custom statistics were enabled on single-end input.
        """
        return bool(pe_mode and r[0] is not None and r[1] is not None)

    def sam_column(read, index):
        """Return one tab-separated column of the read's original SAM line."""
        return read.original_sam_line.split('\t')[index]

    def match_intervals(read, flip):
        """Yield the reference intervals of the read's "M" CIGAR operations."""
        if flip:
            return (invert_strand(co.ref_iv)
                    for co in read.cigar if co.type == "M")
        return (co.ref_iv for co in read.cigar if co.type == "M")

    def note_non_unique(r):
        """Track, per read id, whether each mate was seen at one position."""
        first, second = r[0], r[1]
        if first is not None and second is not None:  # both mates present
            read_id = sam_column(first, 0)
            entry = anu_dict.get(read_id)
            if entry is None:  # the read is not indexed yet
                anu_dict[read_id] = {
                    'chr1': sam_column(first, 2),
                    'chr2': sam_column(second, 2),
                    'start1': sam_column(first, 3),
                    'start2': sam_column(second, 3),
                    'al_unique1': True,
                    'al_unique2': True,
                }
                return
            # Read already indexed: a different position for a mate means
            # that mate has at least two alignments. ``get`` is used because
            # an entry created for a lone mate holds a single key only.
            if entry.get('al_unique1'):
                if (entry['chr1'] != sam_column(first, 2)
                        or entry['start1'] != sam_column(first, 3)):
                    entry['al_unique1'] = False
            if entry.get('al_unique2'):
                if (entry['chr2'] != sam_column(second, 2)
                        or entry['start2'] != sam_column(second, 3)):
                    entry['al_unique2'] = False
        elif first is not None:  # only the first mate is present
            anu_dict[sam_column(first, 0)] = {'al_unique1': False}
        else:  # only the second mate is present
            anu_dict[sam_column(second, 0)] = {'al_unique2': False}

    if quiet:
        warnings.filterwarnings(action="ignore", module="HTSeq")

    samoutfile = None
    custom_stat_file = None
    try:
        if samout != "":
            samoutfile = open(samout, "w")
        if custom_stat != "":
            custom_stat_file = open(custom_stat, "a")

        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        counts = {}

        # Try to open samfile to fail early in case it is not there
        if sam_filename != "-":
            open(sam_filename).close()

        gff = HTSeq.GFF_Reader(gff_filename)
        i = 0
        try:
            for f in gff:
                if f.type == feature_type:
                    try:
                        feature_id = f.attr[id_attribute]
                    except KeyError:
                        sys.exit("Feature %s does not contain a '%s' attribute" %
                                 (f.name, id_attribute))
                    if stranded != "no" and f.iv.strand == ".":
                        sys.exit("Feature %s at %s does not have strand information but you are "
                                 "running htseq-count in stranded mode. Use '--stranded=no'." %
                                 (f.name, f.iv))
                    features[f.iv] += feature_id
                    counts[feature_id] = 0
                i += 1
                if i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d GFF lines processed.\n" % i)
        except:
            # Deliberately catches everything (including the SystemExit
            # raised above) so the offending GFF line is always reported;
            # the exception is re-raised unchanged.
            sys.stderr.write("Error occured in %s.\n" %
                             gff.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d GFF lines processed.\n" % i)

        if len(counts) == 0 and not quiet:
            sys.stderr.write("Warning: No features of type '%s' found.\n" %
                             feature_type)

        try:
            # Peek at the first record to find out whether the data are
            # paired-end. ``sam_reader`` is kept so that error messages can
            # always ask the reader (not a wrapping iterator) for the line.
            if sam_filename != "-":
                sam_reader = HTSeq.SAM_Reader(sam_filename)
                read_seq = sam_reader
                first_read = next(iter(read_seq))
            else:
                sam_reader = HTSeq.SAM_Reader(sys.stdin)
                read_seq = iter(sam_reader)
                first_read = next(read_seq)
                read_seq = itertools.chain([first_read], read_seq)
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write(
                "Error occured when reading first line of sam file.\n")
            raise

        reverse = (stranded == "reverse")

        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        # Detailed statistics (only reported when custom_stat is set)
        skipped = 0
        assigned_reads = 0
        assigned_reads_s = 0
        assigned_reads_p = 0
        assigned_genes = 0
        empty_s = 0
        empty_p = 0
        ambiguous_s = 0
        ambiguous_p = 0
        anu_dict = {}

        i = 0
        try:
            if pe_mode:
                read_seq = HTSeq.pair_SAM_alignments(read_seq)

            for r in read_seq:
                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "not_aligned")
                        continue
                    try:
                        multi = r.optional_field("NH") > 1
                    except KeyError:
                        # No NH tag: treat the read as uniquely aligned.
                        multi = False
                    if multi:
                        write_to_samout(r, "alignment_not_unique")
                        nonunique += 1
                        continue
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue
                    iv_seq = match_intervals(r, reverse)
                else:
                    first, second = r[0], r[1]
                    if first is not None and first.aligned:
                        iv_seq = match_intervals(first, reverse)
                    else:
                        iv_seq = tuple()
                    if second is not None and second.aligned:
                        # The second mate is on the opposite strand.
                        iv_seq = itertools.chain(
                            iv_seq, match_intervals(second, not reverse))
                    elif first is None or not first.aligned:
                        write_to_samout(r, "not_aligned")
                        notaligned += 1
                        continue
                    # Only the NH lookup is guarded: a KeyError raised by the
                    # statistics bookkeeping used to be swallowed here too,
                    # letting a non-unique pair fall through to counting.
                    try:
                        multi = ((first is not None
                                  and first.optional_field("NH") > 1)
                                 or (second is not None
                                     and second.optional_field("NH") > 1))
                    except KeyError:
                        multi = False
                    if multi:
                        nonunique += 1
                        write_to_samout(r, "alignment_not_unique")
                        if custom_stat_file is not None:
                            note_non_unique(r)
                        continue
                    if ((first and first.aQual < minaqual)
                            or (second and second.aQual < minaqual)):
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode in ("intersection-strict",
                                          "intersection-nonempty"):
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if (len(fs2) > 0
                                        or overlap_mode == "intersection-strict"):
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")

                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "no_feature")
                        empty += 1
                        if is_complete_pair(r):
                            empty_p += 1
                        else:
                            empty_s += 1
                    elif len(fs) > 1:
                        write_to_samout(r, "ambiguous[" + '+'.join(fs) + "]")
                        ambiguous += 1
                        if is_complete_pair(r):
                            ambiguous_p += 1
                        else:
                            ambiguous_s += 1
                    else:
                        assigned = list(fs)[0]
                        write_to_samout(r, assigned)
                        counts[assigned] += 1
                        if counts[assigned] == 1:
                            # First read for this feature.
                            assigned_genes += 1
                        assigned_reads += 1
                        if is_complete_pair(r):
                            assigned_reads_p += 1
                        else:
                            assigned_reads_s += 1
                except UnknownChrom:
                    if not pe_mode:
                        rr = r
                    else:
                        rr = r[0] if r[0] is not None else r[1]
                    skipped += 1
                    if not quiet:
                        sys.stderr.write(("Warning: Skipping read '%s', because chromosome " +
                                          "'%s', to which it has been aligned, did not appear in the GFF file.\n") %
                                         (rr.read.name, iv.chrom))

                if i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d sam %s processed.\n" %
                                     (i, "lines " if not pe_mode else "line pairs"))

        except:
            # Report the input position, then propagate unchanged.
            sys.stderr.write("Error occured in %s.\n" %
                             sam_reader.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d sam %s processed.\n" %
                             (i, "lines " if not pe_mode else "line pairs"))

        out = sys.stdout.write
        for fn in sorted(counts.keys()):
            out("%s\t%d\n" % (fn, counts[fn]))
        out("no_feature\t%d\n" % empty)
        out("ambiguous\t%d\n" % ambiguous)
        out("too_low_aQual\t%d\n" % lowqual)
        out("not_aligned\t%d\n" % notaligned)
        out("alignment_not_unique\t%d\n" % nonunique)

        if custom_stat_file is not None:
            fmt = "{:,}".format
            stat = custom_stat_file.write

            if sam_filename != "-":
                with open(sam_filename) as sam_handle:
                    input_lines = fmt(sum(1 for line in sam_handle
                                          if not line.startswith('@')))
            else:
                # Standard input cannot be read a second time.
                input_lines = "n/a"

            stat("Input SAM file line count\t" + input_lines + "\n\n")
            stat("SAM lines (pairs or singles) processed\t" + fmt(i) + "\n\n")
            stat("Skipped pairs (chr.not found)\t" + fmt(skipped) + "\n\n")
            stat("Assigned_genes\t" + fmt(assigned_genes) + "\n\n")
            stat("Assigned_reads\t" + fmt(assigned_reads) + "\n")
            stat("\tSingle reads\t" + fmt(assigned_reads_s) + "\n")
            stat("\tPaired reads\t" + fmt(assigned_reads_p) + "\n\n")
            stat("No_features\t" + fmt(empty) + "\n")
            stat("\tSingle reads\t" + fmt(empty_s) + "\n")
            stat("\tPaired reads\t" + fmt(empty_p) + "\n\n")
            stat("Ambiguous\t" + fmt(ambiguous) + "\n")
            stat("\tSingle reads\t" + fmt(ambiguous_s) + "\n")
            stat("\tPaired reads\t" + fmt(ambiguous_p) + "\n\n")
            stat("Alignment_not_unique\t" + fmt(nonunique) + "\n")
            stat("\tSAM lines (pairs or singles)\t" + fmt(len(anu_dict)) + "\n")

            # Split the non-unique records into those where at least one
            # mate kept a single position and those where neither did
            # (records seen with a lone mate count as the latter).
            simpl = 0
            multipl = 0
            for entry in anu_dict.values():
                if 'al_unique1' in entry and 'al_unique2' in entry:
                    if entry['al_unique1'] or entry['al_unique2']:
                        simpl += 1
                    else:
                        multipl += 1
                else:
                    multipl += 1
            stat("\tOne_mate_uniquely_mapped\t" + fmt(simpl) + "\n")
            stat("\tTwo_mates_multiply_mapped\t" + fmt(multipl) + "\n")
    finally:
        # Both output files are released on success and on every error path.
        if samoutfile is not None:
            samoutfile.close()
        if custom_stat_file is not None:
            custom_stat_file.close()
def count_PE_reads(sam_files,
                   labels,
                   regions,
                   file_type="sam",
                   use_chrom_name=False,
                   order="name"):
    """Count fragments (paired-end read pairs) per region for each SAM/BAM file.

    Args:
        sam_files: Sequence of alignment file paths to process.
        labels: Sequence with one entry per element of ``sam_files``. Only its
            length is checked here; the values themselves are not used.
        regions: Object indexable by an alignment interval (``almnt.iv``) whose
            result offers ``steps()`` yielding ``(interval, set_of_names)``
            pairs -- presumably an ``HTSeq.GenomicArrayOfSets``; confirm with
            the caller. Ignored when ``use_chrom_name`` is true.
        file_type: ``"sam"`` reads with ``HTSeq.SAM_Reader``; any other value
            reads with ``HTSeq.BAM_Reader``.
        use_chrom_name: When true, a pair is counted under the reference
            (chromosome) name of its mates instead of being looked up in
            ``regions``.
        order: ``"name"`` pairs mates with ``HTSeq.pair_SAM_alignments``; any
            other value is treated as position-sorted input and uses
            ``HTSeq.pair_SAM_alignments_with_buffer``.

    Returns:
        A list of ``collections.Counter`` objects, one per input file and in
        the same order as ``sam_files``. Keys are region names (or chromosome
        names in ``use_chrom_name`` mode) plus the special keys ``"_unmapped"``
        and ``"_no_feature"``.

    Raises:
        AssertionError: If ``sam_files`` and ``labels`` differ in length.
    """
    assert len(sam_files) == len(labels)
    if use_chrom_name:
        print("INFO: Running in mode for counting per chromosome name.")

    # One independent counter per input file; missing keys read as zero.
    all_counts = [collections.Counter() for _ in sam_files]

    for sam_file, counts in zip(sam_files, all_counts):

        # A single pre-formatted argument prints the same text whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print("INFO: Start to count reads in %s ..." % (sam_file,))

        if file_type == "sam":
            almnt_file = HTSeq.SAM_Reader(sam_file)
        else:
            almnt_file = HTSeq.BAM_Reader(sam_file)

        # Group alignment records into (first mate, second mate) pairs.
        if order == "name":
            print("INFO: Assuming SAM/BAM file ordered by read name.")
            alignment_iterator = HTSeq.pair_SAM_alignments(almnt_file)
        else:
            print("INFO: Assuming SAM/BAM file ordered by position")
            alignment_iterator = HTSeq.pair_SAM_alignments_with_buffer(
                almnt_file, max_buffer_size=100 * 3000000)

        for first_almnt, second_almnt in alignment_iterator:

            # A pair only counts when both mates are present and aligned.
            # Identity checks avoid going through the alignment objects'
            # own equality operator.
            if (first_almnt is None or second_almnt is None
                    or not (first_almnt.aligned and second_almnt.aligned)):
                counts["_unmapped"] += 1
                continue

            if use_chrom_name:
                # Shortcut for references that are themselves the regions of
                # interest (e.g. transcript fragments): the chromosome name
                # is the key, provided both mates agree on it.
                chrom = first_almnt.iv.chrom
                if chrom == second_almnt.iv.chrom:
                    counts[chrom] += 1
                else:
                    counts["_no_feature"] += 1
                continue

            # Collect the names of all regions overlapped by each mate.
            # NOTE(review): indexing ``regions`` with a chromosome it does not
            # know may raise -- confirm against HTSeq and decide whether such
            # pairs should be counted as "_no_feature" instead.
            gene_ids_first = set()
            for _, val in regions[first_almnt.iv].steps():
                gene_ids_first |= val
            gene_ids_second = set()
            for _, val in regions[second_almnt.iv].steps():
                gene_ids_second |= val

            # Keep only the regions hit by both mates.
            gene_ids = gene_ids_first & gene_ids_second

            if not gene_ids:
                counts["_no_feature"] += 1
            else:
                # Every shared region is incremented, so a pair overlapping
                # several regions contributes to each of them.
                for gene_id in gene_ids:
                    counts[gene_id] += 1

    return all_counts
Exemplo n.º 34
0
def count_reads_paired(read_seq, forward_counter, reverse_counter, order,
                       quiet, minaqual, write_to_samout):
    """Pair up alignment records and feed each pair to the strand counters.

    Args:
        read_seq: Iterable of alignment records, handed to HTSeq for pairing.
        forward_counter: Counter fed with mate 1 intervals as-is and mate 2
            intervals passed through ``invert_strand``; ``None`` disables it.
        reverse_counter: Counter fed with the opposite orientation (mate 1
            inverted, mate 2 as-is); ``None`` disables it.
        order: ``"name"`` or ``"pos"``, selecting the HTSeq pairing routine.
        quiet: When true, no progress messages are written to stderr.
        minaqual: Pairs in which a mate has ``aQual`` below this value are
            tagged ``"__too_low_aQual"`` and not counted.
        write_to_samout: Callable ``(pair, tag)`` invoked for every pair that
            is rejected here.

    Raises:
        ValueError: If ``order`` is neither ``"name"`` nor ``"pos"``.

    Both counters are expected to expose ``notaligned``, ``nonunique`` and
    ``lowqual`` attributes plus a ``count(iv_seq, pair)`` method.
    """
    if order not in ("name", "pos"):
        raise ValueError("Illegal order specified.")
    if order == "name":
        read_seq = HTSeq.pair_SAM_alignments(read_seq)
    else:
        read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)

    want_forward = forward_counter is not None
    want_reverse = reverse_counter is not None
    # Enabled counters, forward first, for the shared bookkeeping updates.
    counters = [c for c in (forward_counter, reverse_counter)
                if c is not None]

    def _matched(almnt, flip):
        # Lazily yield the reference intervals of non-empty "M" CIGAR
        # operations, optionally strand-inverted.
        ops = almnt.cigar
        if flip:
            return (invert_strand(op.ref_iv) for op in ops
                    if op.type == "M" and op.size > 0)
        return (op.ref_iv for op in ops
                if op.type == "M" and op.size > 0)

    n_pairs = 0
    for pair in read_seq:
        if n_pairs > 0 and n_pairs % 100000 == 0 and not quiet:
            sys.stderr.write(
                "%d SAM alignment record pairs processed.\n" % (n_pairs))
        n_pairs += 1

        mate1 = pair[0]
        mate2 = pair[1]
        mate1_ok = mate1 is not None and mate1.aligned
        mate2_ok = mate2 is not None and mate2.aligned

        # Neither mate aligned: nothing to count for this pair.
        if not mate1_ok and not mate2_ok:
            write_to_samout(pair, "__not_aligned")
            for counter in counters:
                counter.notaligned += 1
            continue

        if mate1_ok:
            if want_forward:
                forward_iv_seq = _matched(mate1, False)
            if want_reverse:
                reverse_iv_seq = _matched(mate1, True)
        else:
            forward_iv_seq = ()
            reverse_iv_seq = ()

        if mate2_ok:
            # The second mate is seen in the opposite orientation to the
            # first one, hence the swapped inversion.
            if want_forward:
                forward_iv_seq = itertools.chain(forward_iv_seq,
                                                 _matched(mate2, True))
            if want_reverse:
                reverse_iv_seq = itertools.chain(reverse_iv_seq,
                                                 _matched(mate2, False))

        # A KeyError anywhere in this block (presumably a missing "NH"
        # field) drops through to the quality check below; when it comes
        # from the first mate, the second mate is not examined.
        try:
            first_multi = (mate1 is not None
                           and mate1.optional_field("NH") > 1)
            if first_multi or (mate2 is not None
                               and mate2.optional_field("NH") > 1):
                for counter in counters:
                    counter.nonunique += 1
                write_to_samout(pair, "__alignment_not_unique")
                continue
        except KeyError:
            pass

        if any(mate and mate.aQual < minaqual for mate in (mate1, mate2)):
            for counter in counters:
                counter.lowqual += 1
            write_to_samout(pair, "__too_low_aQual")
            continue

        if want_forward:
            forward_counter.count(forward_iv_seq, pair)
        if want_reverse:
            reverse_counter.count(reverse_iv_seq, pair)

    if not quiet:
        sys.stderr.write("%d SAM alignment pairs processed.\n" % (n_pairs))
Exemplo n.º 35
0
def count_reads_in_features( sam_filename, gff_filename, stranded, overlap_mode, feature_type, id_attribute, quiet, minaqual, samout, \
        filename_read_names_gene_names,filename_read_names_gene_names_amb_unique):
    """
	Main function to count reads in features i.e. genes. 
	
	Input:
		+ sam_filename: Input alignment with all the ambiguously mapped reads
		+ gff_filename: GTF containing all genes for a given species
		+ stranded: specify whether data are stranded - see -s option
		+ overlap_mode: mode to handle reads overlapping more than one feature (e.g. union) - 
		  See -m option: choices = ( "union", "intersection-strict", "intersection-nonempty")
		+ feature_type: see -t option
		+ id_attribute: see -i option
		+ quiet: see -q option
		+ minaqual: see -a option 
		+ samout: SAM output file storing disambiguated reads (see -o option).
		+ filename_read_names_gene_names: filename for the output file containing the mappings readName to geneNames for multimapped reads
		+ filename_read_names_gene_names_amb_unique: filename for the output file containing the mappings readName to geneNames for ambiguously mapped reads
      
	Output:
		+ Writes readName to geneName outputs.
		+ Writes SAM output file for ddisambiguated uniquely mapped reads.
		+ Writes to stdout the genes and their read counts with read count for distinct read type: non-ambiguous unique, multimapped and ambiguous unique. 
		  This output redirected and stored to an output file in main peakRescue pipeline. 
		  This output is used in the later stage of the peakRescue pipeline to rescue the reads present in the readName to genNames mappings.
	
   """
    # Output filhandles for readName to geneNames mappings
    fh_read_names_gene_names = open(filename_read_names_gene_names, 'w')
    fh_read_names_gene_names_amb_unique = open(
        filename_read_names_gene_names_amb_unique, 'w')

    def write_to_samout(r, assignment):
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() + "\tXF:Z:" +
                                 assignment + "\n")

    if quiet:
        warnings.filterwarnings(action="ignore", module="HTSeq")

    if samout != "":
        samoutfile = open(samout, "w")
    else:
        samoutfile = None

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    ## Hash table to store unique reads per exon (if modified GTF)
    counts = {}
    ## Hash table to store original non unique reads per gene (without
    dict_nonunique = {}
    ## Hash table to store all unique reads as per original GTF
    dict_gene_unique_counts = {}
    ## hast table to store ambigouous read count for unique reads...
    dict_gene_unique_counts_ambiguous = {}
    ## Hash table to store all non-unique reads including shared reads
    ## (either split reads or read pair matching on two distinct exons, same gene)
    dict_gene_nonunique_counts = {}
    ## Hash to store the non-unique read-names as key and genes names as values (fragments)
    dict_read_name_genes_names = {}
    ## Hash to store the non-unique read-names as key and genes names as values (fragments) including instances of a given multimapped read on same gene
    dict_read_name_genes_names_final = {}
    dict_read_name_genes_names_ambiguous = {}
    ## @todo: tag_gff - parameter to be removed - only deal with gene level information
    ## tag_gff: type to specify whether it contains gene or exons information
    tag_gff = "gene_gff"
    # Try to open samfile and fail early in case it is not there
    if sam_filename != "-":
        open(sam_filename).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    exons = HTSeq.GenomicArrayOfSets("auto", stranded=False)

    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                exons[f.iv] += f  # added to get exon interval data
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    sys.exit("Feature %s does not contain a '%s' attribute" %
                             (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    sys.exit(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'."
                        % (f.name, f.iv))
                features[f.iv] += feature_id
                counts[f.attr[id_attribute]] = 0
                # -- Initialisation
                feature_name = f.attr[id_attribute]
                # -- Added tag_gff for GFF type
                if tag_gff == "gene_gff":
                    # Original GTF (genes)
                    dict_nonunique = initialise_counts_per_feature(
                        dict_nonunique, feature_name)
                    dict_gene_unique_counts = initialise_counts_per_feature(
                        dict_gene_unique_counts, feature_name)
                    dict_gene_nonunique_counts = initialise_counts_per_feature(
                        dict_gene_nonunique_counts, feature_name)
                    dict_gene_unique_counts_ambiguous = initialise_counts_per_feature(
                        dict_gene_unique_counts_ambiguous, feature_name)
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        sys.stderr.write("Error occured in %s.\n" %
                         gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(counts) == 0 and not quiet:
        sys.stderr.write("Warning: No features of type '%s' found.\n" %
                         feature_type)

    try:
        if sam_filename != "-":
            read_seq = HTSeq.SAM_Reader(sam_filename)
            first_read = iter(read_seq).next()
        else:
            read_seq = iter(HTSeq.SAM_Reader(sys.stdin))
            first_read = read_seq.next()
            read_seq = itertools.chain([first_read], read_seq)
        pe_mode = first_read.paired_end
        #pe_mode = 1 ## Added by us
    except:
        sys.stderr.write(
            "Error occured when reading first line of sam file.\n")
        raise

    ###################################################################################################
    try:
        if pe_mode:
            read_seq_pe_file = read_seq
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        empty = 0
        ambiguous = 0
        ambiguous_tag = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        nonunique_nonamb_to_be_rescued = 0
        temp_read_name = "NA"
        previous_read_name = "NA"
        temp_interval_r0 = "NA"
        temp_interval_r1 = "NA"
        counter_fragment = 0
        flag_result = 0
        i = 0
        pe_mode_for_SE = 0
        ## -- Added pe_mode on for SE files so that multireads reads will be accounted for
        if not pe_mode:  # real SE
            pe_mode_for_SE = 1  #
            read_seq_pe_file = read_seq
            pe_mode = 1
        ## -- End
        index_fragment = 0
        for r in read_seq:
            prev_index_fragment = index_fragment
            tag_nonunique_NH = 0
            tag_overlapping_genes = 0
            flag_aln_not_unique = 0  #
            flag_ambiguous = 0  #
            #-- LOOP OVER ALL READS IN INPUT BAM FILE
            if pe_mode_for_SE:
                r = (r, None)
            counter_fragment += 1
            i += 1
            if not pe_mode:
                # -- SINGLE_END mode
                if not r.aligned:
                    notaligned += 1
                    #write_to_samout( r, "not_aligned" )
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        # --- Rescue multimappers in singel-end mode
                        #write_to_samout( r, "alignment_not_unique" )
                        #nonunique += 1
                        continue
                except KeyError:
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    #write_to_samout( r, "too_low_aQual" )
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar if co.type == "M")
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                              if co.type == "M")
            else:
                # -- PAIRED-END
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type == "M")
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                  if co.type == "M")
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                            iv_seq, (invert_strand(co.ref_iv)
                                     for co in r[1].cigar if co.type == "M"))
                    else:
                        iv_seq = itertools.chain(
                            iv_seq,
                            (co.ref_iv for co in r[1].cigar if co.type == "M"))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        #write_to_samout( r, "not_aligned" )
                        notaligned += 1
                        continue
                try:
                    if (( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \
                          ( r[1] is not None and r[1].optional_field( "NH" ) > 1 )):
                        tag_nonunique_NH = 1
                        if (r[0] is not None and r[1] is None):
                            result, fs_genes, fs_exons, dict_read_name_genes_names, ambiguous_tag = is_read_in_gene_interval(
                                r[0], features, dict_read_name_genes_names,
                                ambiguous_tag, exons)
                            if result:
                                flag_result = 1
                                (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[0], \
                                        temp_interval_r0, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
                            else:
                                if len(fs_genes) != 0:
                                    (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[0], \
                                            temp_interval_r0, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
                        if (r[0] is None and r[1] is not None):
                            result, fs_genes, fs_exons, dict_read_name_genes_names, ambiguous_tag = is_read_in_gene_interval(
                                r[1], features, dict_read_name_genes_names,
                                ambiguous_tag, exons)
                            if result:
                                flag_result = 1
                                (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[1], \
                                        temp_interval_r1, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
                            else:
                                if len(fs_genes) != 0:
                                    (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[1], \
                                            temp_interval_r1, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
                        if (r[0] is not None and r[1] is not None):
                            result1, fs_genes1, fs_exons1, dict_read_name_genes_names, ambiguous_tag = is_read_in_gene_interval(
                                r[0], features, dict_read_name_genes_names,
                                ambiguous_tag, exons)
                            result2, fs_genes2, fs_exons2, dict_read_name_genes_names, ambiguous_tag = is_read_in_gene_interval(
                                r[1], features, dict_read_name_genes_names,
                                ambiguous_tag, exons)

                            if len(fs_genes1.intersection(fs_genes2)) > 0:
                                fs_genes = fs_genes1.intersection(fs_genes2)
                            elif len(fs_genes1.intersection(fs_genes2)) == 0:
                                fs_genes = fs_genes1.union(fs_genes2)

                            if result1 and not result2:
                                flag_result = 1
                                (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[0], \
                                        temp_interval_r0, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
                            elif result2 and not result1:
                                flag_result = 1
                                (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[1], \
                                        temp_interval_r1, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
                            else:
                                if len(fs_genes1) != 0 or len(fs_genes2) != 0:
                                    flag_result = 1
                                    if ((((temp_interval_r0 != str(r[0].iv)) or
                                          (temp_interval_r1 != str(r[1].iv)))
                                         or
                                         (temp_read_name != r[0].read.name))):
                                        (dict_nonunique
                                         ) = add_non_unique_counts_per_feature(
                                             fs_genes, dict_nonunique)
                                        dict_read_name_genes_names = _populate_read_name_gene_name(
                                            dict_read_name_genes_names,
                                            fs_genes, r[0].read.name,
                                            tag_report_instances_same_multiread_on_same_gene
                                        )
                                        flag_aln_not_unique = 1
                    #write_to_samout( r, "alignment_not_unique" )
                        nonunique += 1

                        if flag_result:

                            if r[0] is not None and r[1] is None:
                                non_uniq_read_name = r[0].read.name
                            elif r[0] is None and r[1] is not None:
                                non_uniq_read_name = r[1].read.name
                            elif r[0] is not None and r[1] is not None:
                                non_uniq_read_name = r[0].read.name
                            non_uniq_read_name2 = dict_read_name_genes_names.keys(
                            )[0]
                            if flag_aln_not_unique:
                                nonunique_nonamb_to_be_rescued += 1
# -- Re-initialise hash
                            # previous_read_name: read which falls into at least one gene interval
                            # tmp_read_name: the previous read in the bam file
                            # BAM is sorted by read name hence each multimapper will be arranged one after another
                            if previous_read_name == "NA":
                                previous_read_name = non_uniq_read_name

                            if non_uniq_read_name != previous_read_name:
                                if previous_read_name in dict_read_name_genes_names.keys(
                                ):
                                    fs_genes_names = dict_read_name_genes_names[
                                        previous_read_name]
                                    fh_read_names_gene_names.write(
                                        "%s\t%s\n" %
                                        (previous_read_name, "\t".join(
                                            list(fs_genes_names))))
                                previous_read_name = non_uniq_read_name
                                tmp_dict = {}
                                if non_uniq_read_name in dict_read_name_genes_names.keys(
                                ):
                                    #print "non_uniq_read_name IN dict_read_name_genes_names.keys()"
                                    tmp_dict[
                                        non_uniq_read_name] = dict_read_name_genes_names[
                                            non_uniq_read_name]
                                dict_read_name_genes_names.clear(
                                )  # only one read stored
                                dict_read_name_genes_names = tmp_dict

                        flag_result = 0
                        flag_aln_not_unique = 0  #
                        (temp_read_name, temp_interval_r0,
                         temp_interval_r1) = initalize_read_name_and_interval(
                             r[0], r[1])
                        continue
                # except KeyError:
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or (r[1] and
                                                        r[1].aQual < minaqual):
                    lowqual += 1
                    #write_to_samout( r, "too_low_aQual" )
                    continue

            try:
                # --
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:  # interval from bam file for each fragment
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            #if debug:
                            #print "****Unique_feature %s and feature_interval %s" %(fs2,iv2)
                            fs = fs.union(fs2)

                elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if len(
                                    fs2
                            ) > 0 or overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")

                fs_genes = fs
                if fs_genes is None or len(fs_genes) == 0:
                    #write_to_samout( r, "no_feature" )
                    empty += 1
# ambiguous read count and/or one of the read pair mapping on different gene (potential gene fusion events)...
# elif len( fs ) > 1:
                elif len(fs_genes) > 1:
                    ###############################################################
                    ## AMBIGUOUS UNIQUE
                    ###############################################################
                    is_disambiguated = 0
                    if not tag_nonunique_NH:
                        if (r[0] is not None and r[1] is None):
                            result, fs_genes, fs_exons, dict_read_name_genes_names_ambiguous, ambiguous_tag = is_read_in_gene_interval(
                                r[0], features,
                                dict_read_name_genes_names_ambiguous,
                                ambiguous_tag, exons)
                            if result:
                                (dict_gene_unique_counts
                                 ) = add_unique_counts_per_feature(
                                     dict_gene_unique_counts, fs_genes)
                                is_disambiguated = 1
                            if ambiguous_tag:
                                (dict_gene_unique_counts_ambiguous
                                 ) = add_unique_counts_per_feature_ambiguous(
                                     fs_genes,
                                     dict_gene_unique_counts_ambiguous)
                                flag_ambiguous = 1
                                # write in the file ambiguous read name gene name data...
                                fh_read_names_gene_names_amb_unique.write(
                                    "%s\t%s\n" % (r[0].read.name, "\t".join(
                                        list(fs_genes))))
                        if (r[0] is None and r[1] is not None):
                            result, fs_genes, fs_exons, dict_read_name_genes_names_ambiguous, ambiguous_tag = is_read_in_gene_interval(
                                r[1], features,
                                dict_read_name_genes_names_ambiguous,
                                ambiguous_tag, exons)
                            if result:
                                (dict_gene_unique_counts
                                 ) = add_unique_counts_per_feature(
                                     dict_gene_unique_counts, fs_genes)
                                is_disambiguated = 1
                            if ambiguous_tag:
                                (dict_gene_unique_counts_ambiguous
                                 ) = add_unique_counts_per_feature_ambiguous(
                                     fs_genes,
                                     dict_gene_unique_counts_ambiguous)
                                flag_ambiguous = 1
                                fh_read_names_gene_names_amb_unique.write(
                                    "%s\t%s\n" % (r[1].read.name, "\t".join(
                                        list(fs_genes))))
                        if (r[0] is not None and r[1] is not None):
                            result1, fs_genes1, fs_exons, dict_read_name_genes_names_ambiguous, ambiguous_tag1 = is_read_in_gene_interval(
                                r[0], features,
                                dict_read_name_genes_names_ambiguous,
                                ambiguous_tag, exons)
                            result2, fs_genes2, fs_exons, dict_read_name_genes_names_ambiguous, ambiguous_tag2 = is_read_in_gene_interval(
                                r[1], features,
                                dict_read_name_genes_names_ambiguous,
                                ambiguous_tag, exons)
                            if debug:
                                print "IN UNIQUE DISAMBIGUATION -->r[0].read.name=%s\t%s\t%s\t%s\t%s\n" % (
                                    r[0].read.name, result1, result2,
                                    fs_genes1, fs_genes2)
                            if len(fs_genes1.intersection(fs_genes2)) == 1:
                                fs_genes = fs_genes1.intersection(fs_genes2)
                                (dict_gene_unique_counts
                                 ) = add_unique_counts_per_feature(
                                     dict_gene_unique_counts, fs_genes)
                                is_disambiguated = 1
                            elif len(fs_genes1.intersection(fs_genes2)) > 1:
                                fs_genes = fs_genes1.intersection(fs_genes2)
                                (dict_gene_unique_counts_ambiguous
                                 ) = add_unique_counts_per_feature_ambiguous(
                                     fs_genes,
                                     dict_gene_unique_counts_ambiguous)
                                flag_ambiguous = 1
                                fh_read_names_gene_names_amb_unique.write(
                                    "%s\t%s\n" % (r[0].read.name, "\t".join(
                                        list(fs_genes))))
                            elif len(fs_genes1.intersection(fs_genes2)) == 0:
                                fs_genes = fs_genes1.union(fs_genes2)
                                if (fs_genes1 == set([]) or fs_genes2 == set(
                                    [])) and len(fs_genes) == 1:
                                    ## Disambiguate the uniquely mapped to the single gene it maps on
                                    (dict_gene_unique_counts
                                     ) = add_unique_counts_per_feature(
                                         dict_gene_unique_counts, fs_genes)
                                    is_disambiguated = 1
                                elif (fs_genes1 != set([])
                                      or fs_genes2 != set([])):
                                    ## Add fragment to the RN-GN for ambiguous uniquely mapped based on
                                    ## union of both fs_genes (fs_genes1 & fs_genes2) > 1
                                    (
                                        dict_gene_unique_counts_ambiguous
                                    ) = add_unique_counts_per_feature_ambiguous(
                                        fs_genes,
                                        dict_gene_unique_counts_ambiguous)
                                    flag_ambiguous = 1
                                    fh_read_names_gene_names_amb_unique.write(
                                        "%s\t%s\n" %
                                        (r[0].read.name, "\t".join(
                                            list(fs_genes))))

                    if flag_ambiguous:
                        ambiguous += 1
                        #write_to_samout( r, "ambiguous[" + '+'.join( fs ) + "]" )
                    if is_disambiguated:
                        write_to_samout(r, list(fs_genes)[0])
                else:
                    if debug:
                        #print "DEBUG::CR:: len(fs) <-> 1:: fs = %s" %fs
                        pass
                    write_to_samout(r, list(fs)[0])

                    rr2 = r[0] if r[0] is not None else r[1]

                    if not tag_nonunique_NH:
                        (dict_gene_unique_counts
                         ) = add_unique_counts_per_feature(
                             dict_gene_unique_counts, fs_genes)

            except UnknownChrom:
                if not pe_mode:
                    rr = r
                else:
                    rr = r[0] if r[0] is not None else r[1]
                if not quiet:
                    sys.stderr.write((
                        "Warning: Skipping read '%s', because chromosome " +
                        "'%s', to which it has been aligned, did not appear in the GFF file.\n"
                    ) % (rr.read.name, iv.chrom))

            if i % 100000 == 0 and not quiet:
                sys.stderr.write(
                    "%d sam %s processed.\n" %
                    (i, "lines " if not pe_mode else "line pairs"))

            flag_ambiguous = 0  ## re-initialise....
            index_fragment += 1
        #########################
        # This is to store the last read/fragment since it will no pass in previous condition:
        # => if non_uniq_read_name != previous_read_name:
        # -- At same level as the for loop (outside of the for loop) - column: 7
        #fh_read_names_gene_names.close()
        if dict_read_name_genes_names.keys() != []:
            #print "dict_read_name_genes_names passing"
            non_uniq_read_name = dict_read_name_genes_names.keys()[0]
            fs_genes_names = dict_read_name_genes_names[non_uniq_read_name]
            fh_read_names_gene_names.write(
                "%s\t%s\n" %
                (non_uniq_read_name, "\t".join(list(fs_genes_names))))
        # --
        fh_read_names_gene_names.close()
        fh_read_names_gene_names_amb_unique.close()
    ###################################################################################################
    #except UnboundLocalError:
    except AttributeError:
        #except:
        if not pe_mode:
            sys.stderr.write("Error occured in %s.\n" %
                             read_seq.get_line_number_string())
        else:
            sys.stderr.write("Error occured in %s.\n" %
                             read_seq_pe_file.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d sam %s processed.\n" %
                         (i, "lines " if not pe_mode else "line pairs"))

    if samoutfile is not None:
        samoutfile.close()

    if tag_gff == "gene_gff":
        tuples_genenames_exontag = [(fn, fn)
                                    for fn in dict_gene_unique_counts.keys()]
    tuples_genenames_exontag.sort()

    previous_gene_name = "NA"

    for gene_name, fn in tuples_genenames_exontag:
        gene_name = gene_name.strip()
        fn = fn.strip()

        if tag_gff == "gene_gff":  #
            if gene_name in dict_gene_unique_counts.keys():
                print "%s\t%i\t%i\t%s" % (
                    fn, dict_gene_unique_counts[gene_name],
                    dict_nonunique[gene_name],
                    dict_gene_unique_counts_ambiguous[gene_name])
            else:
                # -- No non-unique reads for that gene_name
                print "%s\t%i\t%i\t%i" % (
                    fn, dict_gene_unique_counts[gene_name], 0,
                    dict_gene_unique_counts_ambiguous[gene_name])

        # -- Re-initialise gene name
        previous_gene_name = gene_name

    print "no_feature\t%d" % empty
    print "ambiguous\t%d" % ambiguous
    print "too_low_aQual\t%d" % lowqual
    print "not_aligned\t%d" % notaligned
    print "alignment_not_unique\t%d" % nonunique
    print "nonunique_nonamb_to_be_rescued:\t%d" % nonunique_nonamb_to_be_rescued
Exemplo n.º 36
0
def count_reads_in_features(sam_filenames, gff_filename, samtype, order,
                            max_buffer_size, stranded, overlap_mode,
                            multimapped_mode, secondary_alignment_mode,
                            supplementary_alignment_mode, feature_type,
                            id_attribute, additional_attributes, quiet,
                            minaqual, samouts):
    """Count alignments per GFF feature for one or more SAM/BAM inputs.

    The result is printed to stdout as a tab-separated table: one row per
    feature ID (sorted), followed by the summary rows ``__no_feature``,
    ``__ambiguous``, ``__too_low_aQual``, ``__not_aligned`` and
    ``__alignment_not_unique``.  Each row holds the feature ID, the values
    of ``additional_attributes`` and one count column per input file.
    Progress and error context are written to stderr.

    Args:
        sam_filenames: Input paths; ``"-"`` reads from ``sys.stdin``.
        gff_filename: Annotation file handed to ``HTSeq.GFF_Reader``.
        samtype: ``"sam"`` or ``"bam"``; selects the HTSeq reader class.
        order: ``"name"`` or ``"pos"``; only consulted for paired-end input
            to choose the pairing function.
        max_buffer_size: Forwarded to
            ``HTSeq.pair_SAM_alignments_with_buffer`` when ``order`` is
            ``"pos"``.
        stranded: ``"no"`` builds an unstranded feature array;
            ``"reverse"`` inverts read strands; any other value keeps the
            strand of the first mate (and inverts the second mate).
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        multimapped_mode: ``'none'`` skips reads with ``NH > 1`` and counts
            only unambiguous assignments; ``'all'`` counts every
            overlapping feature.
        secondary_alignment_mode: ``'ignore'`` skips alignments flagged as
            not primary.
        supplementary_alignment_mode: ``'ignore'`` skips alignments flagged
            as supplementary.
        feature_type: Only GFF records whose ``type`` equals this value are
            used.
        id_attribute: GFF attribute whose value is used as the feature ID.
        additional_attributes: GFF attribute names printed as extra columns
            (empty string when a feature lacks the attribute).
        quiet: When true, suppresses progress messages on stderr.
        minaqual: Alignments with ``aQual`` below this value are tallied as
            ``__too_low_aQual`` and skipped.
        samouts: Paths of annotated SAM outputs, one per input file, or an
            empty list to disable.  Each written record carries an ``XF``
            optional field with its assignment.

    Raises:
        ValueError: Unknown ``samtype``, mismatching number of ``samouts``,
            a feature lacking ``id_attribute``, a strandless feature in
            stranded mode, or an illegal ``order`` for paired-end input.
        SystemExit: Illegal ``overlap_mode`` or ``multimapped_mode``
            encountered while processing a read.
    """

    def write_to_samout(r, assignment, samoutfile):
        # ``pe_mode`` is looked up in the enclosing scope at call time; it is
        # set per input file before any read is processed.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                samoutfile.write(read.get_sam_line() + "\n")

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
        samname = 'SAM'
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
        samname = 'BAM'
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    if samouts != []:
        if len(samouts) != len(sam_filenames):
            raise ValueError(
                'Select the same number of {:} input and output files'.format(
                    samname))
        # Try to open samout files early in case any of them has issues
        for samout in samouts:
            with open(samout, 'w'):
                pass

    # Try to open samfiles to fail early in case any of them is not there
    if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
        for sam_filename in sam_filenames:
            with open(sam_filename):
                pass

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch
    com = ('M', '=', 'X')

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    gff = HTSeq.GFF_Reader(gff_filename)
    counts = {}
    attributes = {}
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                        "Feature %s does not contain a '%s' attribute" %
                        (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'."
                        % (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
                attributes[feature_id] = [
                    f.attr[attr] if attr in f.attr else ''
                    for attr in additional_attributes
                ]
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
                sys.stderr.flush()
    except:
        # Add the GFF position to the error output, then propagate.
        sys.stderr.write("Error occured when processing GFF file (%s):\n" %
                         gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)
        sys.stderr.flush()

    if len(counts) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n" %
                         feature_type)

    counts_all = []
    empty_all = []
    ambiguous_all = []
    notaligned_all = []
    lowqual_all = []
    nonunique_all = []
    for isam, sam_filename in enumerate(sam_filenames):
        if samouts != []:
            samoutfile = open(samouts[isam], 'w')
        else:
            samoutfile = None

        # try/finally: the annotated output must be closed even when reading
        # or processing this input raises.
        try:
            try:
                if sam_filename == "-":
                    read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                else:
                    read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq_iter = iter(read_seq_file)
                # Catch empty BAM files.  Deliberately broad (the reader may
                # fail in several ways on empty input), but no longer a bare
                # ``except`` so KeyboardInterrupt/SystemExit are not
                # mistaken for an empty file.
                try:
                    first_read = next(read_seq_iter)
                    pe_mode = first_read.paired_end
                except Exception:
                    first_read = None
                    pe_mode = False
                if first_read is not None:
                    # Put the probed record back in front of the stream.
                    read_seq = itertools.chain([first_read], read_seq_iter)
                else:
                    read_seq = []
            except:
                sys.stderr.write(
                    "Error occured when reading beginning of {:} file.\n".
                    format(samname))
                raise

            try:
                if pe_mode:
                    if ((supplementary_alignment_mode == 'ignore')
                            and (secondary_alignment_mode == 'ignore')):
                        primary_only = True
                    else:
                        primary_only = False
                    if order == "name":
                        read_seq = HTSeq.pair_SAM_alignments(
                            read_seq, primary_only=primary_only)
                    elif order == "pos":
                        read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                            read_seq,
                            max_buffer_size=max_buffer_size,
                            primary_only=primary_only)
                    else:
                        raise ValueError("Illegal order specified.")
                empty = 0
                ambiguous = 0
                notaligned = 0
                lowqual = 0
                nonunique = 0
                i = 0
                for r in read_seq:
                    if i > 0 and i % 100000 == 0 and not quiet:
                        sys.stderr.write(
                            "%d %s alignment record%s processed.\n" %
                            (i, samname, "s" if not pe_mode else " pairs"))
                        sys.stderr.flush()

                    i += 1
                    if not pe_mode:
                        if not r.aligned:
                            notaligned += 1
                            write_to_samout(r, "__not_aligned", samoutfile)
                            continue
                        if ((secondary_alignment_mode == 'ignore')
                                and r.not_primary_alignment):
                            continue
                        if ((supplementary_alignment_mode == 'ignore')
                                and r.supplementary):
                            continue
                        try:
                            if r.optional_field("NH") > 1:
                                nonunique += 1
                                if multimapped_mode == 'none':
                                    # Only emit the record here when it is
                                    # dropped; otherwise it is written once
                                    # below with its final assignment
                                    # (avoids a duplicate SAM line carrying
                                    # two XF tags).
                                    write_to_samout(
                                        r, "__alignment_not_unique",
                                        samoutfile)
                                    continue
                        except KeyError:
                            # No NH tag: treat as uniquely aligned.
                            pass
                        if r.aQual < minaqual:
                            lowqual += 1
                            write_to_samout(r, "__too_low_aQual", samoutfile)
                            continue
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r.cigar
                                      if co.type in com and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv)
                                      for co in r.cigar
                                      if (co.type in com and co.size > 0))
                    else:
                        if r[0] is not None and r[0].aligned:
                            if stranded != "reverse":
                                iv_seq = (co.ref_iv for co in r[0].cigar
                                          if co.type in com and co.size > 0)
                            else:
                                iv_seq = (invert_strand(co.ref_iv)
                                          for co in r[0].cigar
                                          if co.type in com and co.size > 0)
                        else:
                            iv_seq = tuple()
                        if r[1] is not None and r[1].aligned:
                            # The second mate gets the opposite strand
                            # treatment of the first mate.
                            if stranded != "reverse":
                                iv_seq = itertools.chain(
                                    iv_seq,
                                    (invert_strand(co.ref_iv)
                                     for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                            else:
                                iv_seq = itertools.chain(
                                    iv_seq,
                                    (co.ref_iv for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                        else:
                            if (r[0] is None) or not (r[0].aligned):
                                write_to_samout(r, "__not_aligned",
                                                samoutfile)
                                notaligned += 1
                                continue
                        if secondary_alignment_mode == 'ignore':
                            if ((r[0] is not None)
                                    and r[0].not_primary_alignment):
                                continue
                            elif ((r[1] is not None)
                                  and r[1].not_primary_alignment):
                                continue
                        if supplementary_alignment_mode == 'ignore':
                            if (r[0] is not None) and r[0].supplementary:
                                continue
                            elif (r[1] is not None) and r[1].supplementary:
                                continue
                        try:
                            if ((r[0] is not None
                                 and r[0].optional_field("NH") > 1)
                                    or (r[1] is not None
                                        and r[1].optional_field("NH") > 1)):
                                nonunique += 1
                                if multimapped_mode == 'none':
                                    # See single-end branch: write only when
                                    # the pair is dropped here.
                                    write_to_samout(
                                        r, "__alignment_not_unique",
                                        samoutfile)
                                    continue
                        except KeyError:
                            # No NH tag: treat as uniquely aligned.
                            pass
                        if ((r[0] and r[0].aQual < minaqual)
                                or (r[1] and r[1].aQual < minaqual)):
                            lowqual += 1
                            write_to_samout(r, "__too_low_aQual", samoutfile)
                            continue

                    try:
                        if overlap_mode == "union":
                            fs = set()
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    fs = fs.union(fs2)
                        elif overlap_mode in ("intersection-strict",
                                              "intersection-nonempty"):
                            fs = None
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    # "nonempty" ignores steps without any
                                    # feature; "strict" intersects them too.
                                    if ((len(fs2) > 0) or
                                        (overlap_mode == "intersection-strict"
                                         )):
                                        if fs is None:
                                            fs = fs2.copy()
                                        else:
                                            fs = fs.intersection(fs2)
                        else:
                            sys.exit("Illegal overlap mode.")

                        if fs is None or len(fs) == 0:
                            write_to_samout(r, "__no_feature", samoutfile)
                            empty += 1
                        elif len(fs) > 1:
                            write_to_samout(
                                r, "__ambiguous[" + '+'.join(fs) + "]",
                                samoutfile)
                            ambiguous += 1
                        else:
                            write_to_samout(r, list(fs)[0], samoutfile)

                        if fs is not None and len(fs) > 0:
                            if multimapped_mode == 'none':
                                if len(fs) == 1:
                                    counts[list(fs)[0]] += 1
                            elif multimapped_mode == 'all':
                                for fsi in list(fs):
                                    counts[fsi] += 1
                            else:
                                sys.exit("Illegal multimap mode.")

                    except UnknownChrom:
                        # Chromosome absent from the annotation.
                        write_to_samout(r, "__no_feature", samoutfile)
                        empty += 1

            except:
                # Add the input position to the error output, then propagate.
                sys.stderr.write(
                    "Error occured when processing %s input (%s):\n" %
                    (samname, read_seq_file.get_line_number_string()))
                raise

            if not quiet:
                sys.stderr.write(
                    "%d %s %s processed.\n" %
                    (i, samname,
                     "alignments " if not pe_mode else "alignment pairs"))
                sys.stderr.flush()
        finally:
            if samoutfile is not None:
                samoutfile.close()

        # Snapshot this file's counts, then reset the shared dict so the
        # next input starts from zero with the same set of feature IDs.
        counts_all.append(counts.copy())
        for fn in counts:
            counts[fn] = 0
        empty_all.append(empty)
        ambiguous_all.append(ambiguous)
        lowqual_all.append(lowqual)
        notaligned_all.append(notaligned)
        nonunique_all.append(nonunique)

    # Empty cells so summary rows line up with the attribute columns.
    pad = ['' for _ in additional_attributes]
    for fn in sorted(counts.keys()):
        print('\t'.join([fn] + attributes[fn] +
                        [str(c[fn]) for c in counts_all]))
    print('\t'.join(["__no_feature"] + pad + [str(c) for c in empty_all]))
    print('\t'.join(["__ambiguous"] + pad + [str(c) for c in ambiguous_all]))
    print('\t'.join(["__too_low_aQual"] + pad + [str(c) for c in lowqual_all]))
    print('\t'.join(["__not_aligned"] + pad + [str(c)
                                               for c in notaligned_all]))
    print('\t'.join(["__alignment_not_unique"] + pad +
                    [str(c) for c in nonunique_all]))
Exemplo n.º 37
0
def count_reads_in_features(sam_filenames, gff_filename,
                            samtype,
                            order, max_buffer_size,
                            stranded, overlap_mode,
                            multimapped_mode,
                            secondary_alignment_mode,
                            supplementary_alignment_mode,
                            feature_type, id_attribute,
                            additional_attributes,
                            quiet, minaqual, samouts):
    """Count alignments per GFF feature for one or more SAM/BAM inputs.

    Prints a tab-separated table to stdout: one row per feature ID
    (sorted), then the summary rows ``__no_feature``, ``__ambiguous``,
    ``__too_low_aQual``, ``__not_aligned`` and ``__alignment_not_unique``.
    Every row contains the ID, the ``additional_attributes`` values and one
    count column per input file.  Progress and error context go to stderr.

    Args:
        sam_filenames: Input paths; ``"-"`` reads from ``sys.stdin``.
        gff_filename: Annotation file handed to ``HTSeq.GFF_Reader``.
        samtype: ``"sam"`` or ``"bam"``; selects the HTSeq reader class.
        order: ``"name"`` or ``"pos"``; only consulted for paired-end input
            to choose the pairing function.
        max_buffer_size: Forwarded to
            ``HTSeq.pair_SAM_alignments_with_buffer`` when ``order`` is
            ``"pos"``.
        stranded: ``"no"`` builds an unstranded feature array;
            ``"reverse"`` inverts read strands; any other value keeps the
            strand of the first mate (and inverts the second mate).
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        multimapped_mode: ``'none'`` skips reads with ``NH > 1`` and counts
            only unambiguous assignments; ``'all'`` counts every
            overlapping feature.
        secondary_alignment_mode: ``'ignore'`` skips alignments flagged as
            not primary.
        supplementary_alignment_mode: ``'ignore'`` skips alignments flagged
            as supplementary.
        feature_type: Only GFF records whose ``type`` equals this value are
            used.
        id_attribute: GFF attribute whose value is used as the feature ID.
        additional_attributes: GFF attribute names printed as extra columns
            (empty string when a feature lacks the attribute).
        quiet: When true, suppresses progress messages on stderr.
        minaqual: Alignments with ``aQual`` below this value are tallied as
            ``__too_low_aQual`` and skipped.
        samouts: Paths of annotated SAM outputs, one per input file, or an
            empty list to disable.  Each written record carries an ``XF``
            optional field with its assignment.

    Raises:
        ValueError: Unknown ``samtype``, mismatching number of ``samouts``,
            a feature lacking ``id_attribute``, a strandless feature in
            stranded mode, or an illegal ``order`` for paired-end input.
        SystemExit: Illegal ``overlap_mode`` or ``multimapped_mode``
            encountered while processing a read.
    """

    def write_to_samout(r, assignment, samoutfile):
        # ``pe_mode`` comes from the enclosing scope and is resolved when
        # the helper is called, i.e. after it has been set for the file.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                samoutfile.write(read.get_sam_line() + "\n")

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
        samname = 'SAM'
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
        samname = 'BAM'
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    if samouts != []:
        if len(samouts) != len(sam_filenames):
            raise ValueError(
                    'Select the same number of {:} input and output files'.format(samname))
        # Try to open samout files early in case any of them has issues
        for samout in samouts:
            with open(samout, 'w'):
                pass

    # Try to open samfiles to fail early in case any of them is not there
    if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
        for sam_filename in sam_filenames:
            with open(sam_filename):
                pass

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch
    com = ('M', '=', 'X')

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    gff = HTSeq.GFF_Reader(gff_filename)
    counts = {}
    attributes = {}
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                            "Feature %s does not contain a '%s' attribute" %
                            (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                            "Feature %s at %s does not have strand information but you are "
                            "running htseq-count in stranded mode. Use '--stranded=no'." %
                            (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
                attributes[feature_id] = [
                        f.attr[attr] if attr in f.attr else ''
                        for attr in additional_attributes]
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
                sys.stderr.flush()
    except:
        # Report where in the GFF the failure happened, then propagate.
        sys.stderr.write(
            "Error occured when processing GFF file (%s):\n" %
            gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)
        sys.stderr.flush()

    if len(counts) == 0:
        sys.stderr.write(
            "Warning: No features of type '%s' found.\n" % feature_type)

    counts_all = []
    empty_all = []
    ambiguous_all = []
    notaligned_all = []
    lowqual_all = []
    nonunique_all = []
    for isam, sam_filename in enumerate(sam_filenames):
        if samouts != []:
            samoutfile = open(samouts[isam], 'w')
        else:
            samoutfile = None

        # Guarantee the annotated output is closed even if reading or
        # processing this input fails.
        try:
            try:
                if sam_filename == "-":
                    read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                else:
                    read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq_iter = iter(read_seq_file)
                # Catch empty BAM files.  Kept broad on purpose, but not a
                # bare ``except`` so that KeyboardInterrupt/SystemExit are
                # not silently interpreted as "empty input".
                try:
                    first_read = next(read_seq_iter)
                    pe_mode = first_read.paired_end
                except Exception:
                    first_read = None
                    pe_mode = False
                if first_read is not None:
                    # Re-attach the probed record to the front of the stream.
                    read_seq = itertools.chain([first_read], read_seq_iter)
                else:
                    read_seq = []
            except:
                sys.stderr.write(
                    "Error occured when reading beginning of {:} file.\n".format(
                        samname))
                raise

            try:
                if pe_mode:
                    if ((supplementary_alignment_mode == 'ignore') and
                       (secondary_alignment_mode == 'ignore')):
                        primary_only = True
                    else:
                        primary_only = False
                    if order == "name":
                        read_seq = HTSeq.pair_SAM_alignments(
                                read_seq,
                                primary_only=primary_only)
                    elif order == "pos":
                        read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                                read_seq,
                                max_buffer_size=max_buffer_size,
                                primary_only=primary_only)
                    else:
                        raise ValueError("Illegal order specified.")
                empty = 0
                ambiguous = 0
                notaligned = 0
                lowqual = 0
                nonunique = 0
                i = 0
                for r in read_seq:
                    if i > 0 and i % 100000 == 0 and not quiet:
                        sys.stderr.write(
                            "%d %s alignment record%s processed.\n" %
                            (i, samname, "s" if not pe_mode else " pairs"))
                        sys.stderr.flush()

                    i += 1
                    if not pe_mode:
                        if not r.aligned:
                            notaligned += 1
                            write_to_samout(r, "__not_aligned", samoutfile)
                            continue
                        if ((secondary_alignment_mode == 'ignore') and
                           r.not_primary_alignment):
                            continue
                        if ((supplementary_alignment_mode == 'ignore') and
                           r.supplementary):
                            continue
                        try:
                            if r.optional_field("NH") > 1:
                                nonunique += 1
                                if multimapped_mode == 'none':
                                    # Emit the record here only when it is
                                    # discarded; a kept read is written once
                                    # further down with its final
                                    # assignment, instead of twice with two
                                    # XF tags.
                                    write_to_samout(
                                            r,
                                            "__alignment_not_unique",
                                            samoutfile)
                                    continue
                        except KeyError:
                            # No NH tag: treat as uniquely aligned.
                            pass
                        if r.aQual < minaqual:
                            lowqual += 1
                            write_to_samout(r, "__too_low_aQual", samoutfile)
                            continue
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r.cigar
                                      if co.type in com and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv)
                                      for co in r.cigar
                                      if (co.type in com and co.size > 0))
                    else:
                        if r[0] is not None and r[0].aligned:
                            if stranded != "reverse":
                                iv_seq = (co.ref_iv for co in r[0].cigar
                                          if co.type in com and co.size > 0)
                            else:
                                iv_seq = (invert_strand(co.ref_iv)
                                          for co in r[0].cigar
                                          if co.type in com and co.size > 0)
                        else:
                            iv_seq = tuple()
                        if r[1] is not None and r[1].aligned:
                            # Second mate: strand handling is the mirror
                            # image of the first mate's.
                            if stranded != "reverse":
                                iv_seq = itertools.chain(
                                        iv_seq,
                                        (invert_strand(co.ref_iv)
                                         for co in r[1].cigar
                                         if co.type in com and co.size > 0))
                            else:
                                iv_seq = itertools.chain(
                                        iv_seq,
                                        (co.ref_iv for co in r[1].cigar
                                         if co.type in com and co.size > 0))
                        else:
                            if (r[0] is None) or not (r[0].aligned):
                                write_to_samout(r, "__not_aligned", samoutfile)
                                notaligned += 1
                                continue
                        if secondary_alignment_mode == 'ignore':
                            if (r[0] is not None) and r[0].not_primary_alignment:
                                continue
                            elif (r[1] is not None) and r[1].not_primary_alignment:
                                continue
                        if supplementary_alignment_mode == 'ignore':
                            if (r[0] is not None) and r[0].supplementary:
                                continue
                            elif (r[1] is not None) and r[1].supplementary:
                                continue
                        try:
                            if ((r[0] is not None and r[0].optional_field("NH") > 1) or
                               (r[1] is not None and r[1].optional_field("NH") > 1)):
                                nonunique += 1
                                if multimapped_mode == 'none':
                                    # As in the single-end branch: write
                                    # only when the pair is dropped here.
                                    write_to_samout(r, "__alignment_not_unique", samoutfile)
                                    continue
                        except KeyError:
                            # No NH tag: treat as uniquely aligned.
                            pass
                        if ((r[0] and r[0].aQual < minaqual) or
                           (r[1] and r[1].aQual < minaqual)):
                            lowqual += 1
                            write_to_samout(r, "__too_low_aQual", samoutfile)
                            continue

                    try:
                        if overlap_mode == "union":
                            fs = set()
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    fs = fs.union(fs2)
                        elif overlap_mode in ("intersection-strict",
                                              "intersection-nonempty"):
                            fs = None
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    # "nonempty" skips feature-less steps;
                                    # "strict" intersects with them as well.
                                    if ((len(fs2) > 0) or
                                       (overlap_mode == "intersection-strict")):
                                        if fs is None:
                                            fs = fs2.copy()
                                        else:
                                            fs = fs.intersection(fs2)
                        else:
                            sys.exit("Illegal overlap mode.")

                        if fs is None or len(fs) == 0:
                            write_to_samout(r, "__no_feature", samoutfile)
                            empty += 1
                        elif len(fs) > 1:
                            write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]",
                                            samoutfile)
                            ambiguous += 1
                        else:
                            write_to_samout(r, list(fs)[0], samoutfile)

                        if fs is not None and len(fs) > 0:
                            if multimapped_mode == 'none':
                                if len(fs) == 1:
                                    counts[list(fs)[0]] += 1
                            elif multimapped_mode == 'all':
                                for fsi in list(fs):
                                    counts[fsi] += 1
                            else:
                                sys.exit("Illegal multimap mode.")

                    except UnknownChrom:
                        # Chromosome absent from the annotation.
                        write_to_samout(r, "__no_feature", samoutfile)
                        empty += 1

            except:
                # Report where in the input the failure happened, then
                # propagate.
                sys.stderr.write(
                    "Error occured when processing %s input (%s):\n" %
                    (samname, read_seq_file.get_line_number_string()))
                raise

            if not quiet:
                sys.stderr.write(
                    "%d %s %s processed.\n" %
                    (i, samname, "alignments " if not pe_mode else "alignment pairs"))
                sys.stderr.flush()
        finally:
            if samoutfile is not None:
                samoutfile.close()

        # Keep a copy of this file's counts and zero the shared dict so the
        # next input starts fresh with the same feature IDs.
        counts_all.append(counts.copy())
        for fn in counts:
            counts[fn] = 0
        empty_all.append(empty)
        ambiguous_all.append(ambiguous)
        lowqual_all.append(lowqual)
        notaligned_all.append(notaligned)
        nonunique_all.append(nonunique)

    # Empty cells so summary rows line up with the attribute columns.
    pad = ['' for _ in additional_attributes]
    for fn in sorted(counts.keys()):
        print('\t'.join([fn] + attributes[fn] + [str(c[fn]) for c in counts_all]))
    print('\t'.join(["__no_feature"] + pad + [str(c) for c in empty_all]))
    print('\t'.join(["__ambiguous"] + pad + [str(c) for c in ambiguous_all]))
    print('\t'.join(["__too_low_aQual"] + pad + [str(c) for c in lowqual_all]))
    print('\t'.join(["__not_aligned"] + pad + [str(c) for c in notaligned_all]))
    print('\t'.join(["__alignment_not_unique"] + pad + [str(c) for c in nonunique_all]))
Exemplo n.º 38
0
def count_circrna(args):
    """Count reads (or read pairs) that span circRNA junction positions.

    Every reference sequence listed in the SAM/BAM header is treated as one
    junction-centred sequence: the junction position is taken to be the
    midpoint (``LN // 2``) of that sequence.  A single-end read, or a
    paired-end fragment, is counted for a reference when it covers that
    midpoint.

    Attributes read from ``args``:
        input_file: path ending in ``.sam`` or ``.bam``.
        output_file: destination handed to ``open_file_or_stdout``.
        min_mapping_quality: alignments with ``aQual`` below this are skipped.
        strandness: ``'forward'`` / ``'reverse'`` enable strand filtering;
            any other value disables it.
        paired_end: truthy to count fragments instead of single reads.

    Raises:
        ValueError: if ``input_file`` has neither a .sam nor a .bam extension.
    """
    import HTSeq
    import numpy as np
    import pandas as pd
    from collections import OrderedDict, defaultdict
    from ioutils import open_file_or_stdout

    logger.info('read input BAM/SAM file: ' + args.input_file)
    if args.input_file.endswith('.sam'):
        sam = HTSeq.SAM_Reader(args.input_file)
    elif args.input_file.endswith('.bam'):
        sam = HTSeq.BAM_Reader(args.input_file)
    else:
        raise ValueError('unsupported file extension')

    # extract junction positions from SAM header
    logger.info('extract junction positions')
    junction_positions = OrderedDict()
    for sq in sam.get_header_dict()['SQ']:
        junction_positions[sq['SN']] = sq['LN'] // 2
    # initialize counts (one zero per reference, in header order)
    gene_ids = list(junction_positions.keys())
    counts = pd.Series(np.zeros(len(gene_ids), dtype='int'), index=gene_ids)
    # count reads
    min_mapping_quality = args.min_mapping_quality
    strandness = args.strandness
    if args.paired_end:
        logger.info('count paired-end fragments')
        stats = defaultdict(int)
        for bundle in HTSeq.pair_SAM_alignments(sam, bundle=True):
            stats['total_pairs'] += 1
            # ignore multi-mapped pairs
            if len(bundle) != 1:
                stats['multi_mapping'] += 1
                continue
            read1, read2 = bundle[0]
            # ignore singletons
            if (read1 is None) or (read2 is None):
                stats['singleton'] += 1
                continue
            # ignore unmapped reads
            if not (read1.aligned and read2.aligned):
                stats['unmapped'] += 1
                continue
            # ignore pairs with mapping quality below threshold
            if (read1.aQual < min_mapping_quality) or (read2.aQual <
                                                       min_mapping_quality):
                stats['low_mapping_quality'] += 1
                continue
            if (strandness == 'forward') and (not ((read1.iv.strand == '+') and
                                                   (read2.iv.strand == '-'))):
                stats['improper_strand'] += 1
                continue
            if (strandness == 'reverse') and (not ((read1.iv.strand == '-') and
                                                   (read2.iv.strand == '+'))):
                stats['improper_strand'] += 1
                continue
            # ignore pairs on different chromosomes
            if read1.iv.chrom != read2.iv.chrom:
                stats['diff_chrom'] += 1
                continue
            pos = junction_positions[read1.iv.chrom]
            # Use the span of the whole fragment: which mate is leftmost
            # depends on the library orientation (read1 is the rightmost
            # mate for reverse-stranded pairs), so comparing read1.start with
            # read2.end would miss most junction-spanning fragments there.
            fragment_start = min(read1.iv.start, read2.iv.start)
            fragment_end = max(read1.iv.end, read2.iv.end)
            if fragment_start < pos <= fragment_end:
                counts[read1.iv.chrom] += 1
        for key, val in stats.items():
            logger.info('{}: {}'.format(key, val))
    else:
        logger.info('count single-end reads')
        for read in sam:
            # ignore unmapped read
            if not read.aligned:
                continue
            # ignore reads with mapping quality below threshold
            if read.aQual < min_mapping_quality:
                continue
            # forward libraries keep '+' reads, reverse libraries keep '-'
            # reads (mirrors the read1 orientation required in paired mode)
            if (strandness == 'forward') and (read.iv.strand == '-'):
                continue
            if (strandness == 'reverse') and (read.iv.strand == '+'):
                continue
            pos = junction_positions[read.iv.chrom]
            if read.iv.start < pos <= read.iv.end:
                counts[read.iv.chrom] += 1
    # output counts
    logger.info('count fragments: {}'.format(counts.sum()))
    logger.info('write counts to file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        counts.to_csv(fout, sep='\t', header=None, index=True, na_rep='NA')
Exemplo n.º 39
0
def count_reads_in_features( sam_filename, gff_filename, stranded,
      overlap_mode,  quiet, minaqual, samout ):
   """Count aligned reads (or read pairs) per feature and print a table.

   The features and the initial count table come from ``read_gff_file``;
   ``read_pe_mode`` decides whether the input is handled as single reads or
   as mate pairs.  Each read/pair ends up in exactly one of: a feature
   count, ``no_feature``, ``ambiguous``, ``too_low_aQual``, ``not_aligned``
   or ``alignment_not_unique`` -- except reads on a chromosome unknown to
   the feature array, which are only warned about (see below).

   Parameters:
      sam_filename: alignment file, passed to ``check_sam_file`` and
         ``read_pe_mode``.
      gff_filename: annotation file, passed to ``read_gff_file``.
      stranded: forwarded to ``read_gff_file``; the value "reverse" makes
         the read strand be inverted before the feature lookup.
      overlap_mode: "union", "intersection-strict" or
         "intersection-nonempty"; anything else terminates via ``sys.exit``.
      quiet: when true, suppresses progress and warning output on stderr.
      minaqual: reads with ``aQual`` below this value are counted as
         ``too_low_aQual``.
      samout: passed to ``open_sam_output_file``; the resulting handle (may
         be None) receives the per-read assignment through
         ``write_to_samout``.

   Output: one "<feature>\\t<count>" line on stdout for every feature with a
   non-zero count, followed by the five summary counters.
   """

   warnings.filterwarnings( action="ignore", module="HTSeq" )

   samoutfile = open_sam_output_file(samout)

   check_sam_file(sam_filename)

   counts, features = read_gff_file(gff_filename, quiet, stranded)

   pe_mode, read_seq = read_pe_mode(sam_filename)

   try:
      if pe_mode:
         # Keep a handle on the underlying reader: the pairing generator has
         # no get_line_number_string() for the error message further down.
         read_seq_pe_file = read_seq
         read_seq = HTSeq.pair_SAM_alignments( read_seq )
      empty = 0
      ambiguous = 0
      notaligned = 0
      lowqual = 0
      nonunique = 0
      i = 0   
      for r in read_seq:
         i += 1
         if not pe_mode:
            # --- single-end: r is one alignment record ---
            if not r.aligned:
               notaligned += 1
               write_to_samout( samoutfile ,r, "not_aligned",pe_mode )
               continue
            try:
               # NH > 1 marks a multiply aligned read; a missing NH tag
               # (KeyError) is treated as "unique".
               if r.optional_field( "NH" ) > 1:
                  write_to_samout( samoutfile ,r, "alignment_not_unique" ,pe_mode)
                  nonunique += 1
                  continue
            except KeyError:
               pass
            if r.aQual < minaqual:
               lowqual += 1
               write_to_samout(samoutfile , r, "too_low_aQual",pe_mode )
               continue
            # Only CIGAR "M" operations contribute intervals to the lookup.
            if stranded != "reverse":
               iv_seq = ( co.ref_iv for co in r.cigar if co.type == "M" )
            else:
               iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar if co.type == "M" )            
         else:
            # --- paired-end: r is a (first, second) tuple; either mate may
            # be None ---
            if r[0] is not None and r[0].aligned:
               if stranded != "reverse":
                  iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" )
               else:
                  iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" )
            else:
               iv_seq = tuple()
            if r[1] is not None and r[1].aligned:            
               # The second mate gets the opposite strand treatment of the
               # first mate, so both mates point at the same strand.
               if stranded != "reverse":
                  iv_seq = itertools.chain( iv_seq, 
                     ( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" ) )
               else:
                  iv_seq = itertools.chain( iv_seq, 
                     ( co.ref_iv for co in r[1].cigar if co.type == "M" ) )
            else:
               # Second mate unusable: the pair is "not aligned" only if the
               # first mate is unusable as well.
               if ( r[0] is None ) or not ( r[0].aligned ):
                  write_to_samout( samoutfile, r, "not_aligned",pe_mode )
                  notaligned += 1
                  continue         
            try:
               # A KeyError from the first mate's missing NH tag also skips
               # the check of the second mate.
               if ( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \
                     ( r[1] is not None and r[1].optional_field( "NH" ) > 1 ):
                  nonunique += 1
                  write_to_samout(samoutfile, r, "alignment_not_unique",pe_mode )
                  continue
            except KeyError:
               pass
            if ( r[0] and r[0].aQual < minaqual ) or ( r[1] and r[1].aQual < minaqual ):
               lowqual += 1
               write_to_samout(samoutfile, r, "too_low_aQual",pe_mode )
               continue        
         
         try:
            if overlap_mode == "union":
               # union: every feature touched by any aligned base
               fs = set()
               for iv in iv_seq:
                  if iv.chrom not in features.chrom_vectors:
                     raise UnknownChrom
                  for iv2, fs2 in features[ iv ].steps():
                     fs = fs.union( fs2 )
            elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
               # intersection: features common to all steps; the "nonempty"
               # variant ignores steps that carry no feature at all
               fs = None
               for iv in iv_seq:
                  if iv.chrom not in features.chrom_vectors:
                     raise UnknownChrom
                  for iv2, fs2 in features[ iv ].steps():
                     if len(fs2) > 0 or overlap_mode == "intersection-strict":
                        if fs is None:
                           fs = fs2.copy()
                        else:
                           fs = fs.intersection( fs2 )
            else:
               sys.exit( "Illegal overlap mode." )
            if fs is None or len( fs ) == 0:
               write_to_samout( samoutfile,r, "no_feature" ,pe_mode)
               empty += 1
            elif len( fs ) > 1:
               write_to_samout(samoutfile, r, "ambiguous[" + '+'.join( fs ) + "]",pe_mode )
               # NOTE(review): this goes to stdout, i.e. it is interleaved
               # with the count table printed at the end -- looks like a
               # leftover debug statement; confirm before relying on stdout.
               print  "ambiguous[" + '+'.join( fs ) + "]"
               ambiguous += 1
            else:
               write_to_samout( samoutfile, r, list(fs)[0] ,pe_mode )
               counts[ list(fs)[0] ] += 1
         except UnknownChrom:
            # NOTE(review): such reads are neither added to any counter nor
            # written to the SAM output; only the warning below is emitted.
            if not pe_mode:
               rr = r 
            else: 
               rr = r[0] if r[0] is not None else r[1]
            if not quiet:
               sys.stderr.write( ( "Warning: Skipping read '%s', because chromosome " +
                  "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) % 
                  ( rr.read.name, iv.chrom ) )

         if i % 100000 == 0 and not quiet:
            sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )

   except:
      # Report where in the input the failure happened, then re-raise.
      if not pe_mode:
         sys.stderr.write( "Error occured in %s.\n" % read_seq.get_line_number_string() )
      else:
         sys.stderr.write( "Error occured in %s.\n" % read_seq_pe_file.get_line_number_string() )
      raise

   if not quiet:
      sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )
         
   if samoutfile is not None:
      samoutfile.close()

   # Features with a zero count are omitted from the table.
   for fn in sorted( counts.keys() ):
      if counts[fn]:
            print "%s\t%d" % ( fn, counts[fn] )
   print "no_feature\t%d" % empty
   print "ambiguous\t%d" % ambiguous
   print "too_low_aQual\t%d" % lowqual
   print "not_aligned\t%d" % notaligned
   print "alignment_not_unique\t%d" % nonunique
Exemplo n.º 40
0
def output_single_deletion_reads(input_sam_file, library_max_size):
    """Write a table of read pairs whose clipping pattern suggests a deletion.

    Mate pairs are read from ``input_sam_file``; only pairs with both mates
    aligned are analysed (via ``cigar_analyse``).  A pair is reported when
    its outer span minus both insert sizes is below ``library_max_size``, at
    least one mate is clipped, and either both flanks around the clipped
    region are >= 30 long or (for multiply clipped mates) the remaining
    mapped length is >= 30.  The last column tells which rule fired
    ("paired", "paired,multi", "paired_clip", "paired_clip,multi").

    Returns the path of the tab-separated output file, which is the part of
    ``input_sam_file`` before the first ":" followed by "single_pos.txt".
    """
    output_file = input_sam_file.split(":")[0] + "single_pos.txt"
    mate_pairs = HTSeq.pair_SAM_alignments(HTSeq.SAM_Reader(input_sam_file))
    row_template = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n"

    with open(output_file, "w") as output_list:
        output_list.write(
            "read_ID\tread_start\tgap_start\tgap_end\tread_end\tdel_size\tother_info\n"
        )
        for sam_line in mate_pairs:
            mate_a, mate_b = sam_line[0], sam_line[1]
            # both mates have to be present and aligned
            if mate_a is None or not mate_a.aligned:
                continue
            if mate_b is None or not mate_b.aligned:
                continue

            (clips_a, start_a, gap_start_a, gap_end_a, end_a, ins_a,
             mapped_a) = cigar_analyse(mate_a)
            (clips_b, start_b, gap_start_b, gap_end_b, end_b, ins_b,
             mapped_b) = cigar_analyse(mate_b)
            read_start = min(start_a, start_b)
            read_end = max(end_a, end_b)

            # fragment (without the inserts) must fit the library size
            if not (read_end - read_start - ins_a - ins_b < library_max_size):
                continue
            # at least one clipped mate is required
            if not (clips_a + clips_b > 0):
                continue

            if clips_a * clips_b == 0:
                # exactly one mate is clipped: take its values as they are
                if clips_a > 0:
                    insert_size = ins_a
                    read_start_clip = gap_start_a
                    read_end_clip = gap_end_a
                    mapped_size = mapped_a
                else:
                    insert_size = ins_b
                    read_start_clip = gap_start_b
                    read_end_clip = gap_end_b
                    mapped_size = mapped_b
                plain_tag, multi_tag = "paired", "paired,multi"
                several_clips = clips_a + clips_b > 1
            else:
                # both mates are clipped: merge their values
                insert_size = ins_a + ins_b
                read_start_clip = min(gap_start_a, gap_start_b)
                read_end_clip = max(gap_end_a, gap_end_b)
                mapped_size = mapped_a + mapped_b
                plain_tag, multi_tag = "paired_clip", "paired_clip,multi"
                several_clips = clips_a > 1 or clips_b > 1

            left_flank = read_start_clip - read_start
            right_flank = read_end - read_end_clip
            if left_flank >= 30 and right_flank >= 30:
                tag = plain_tag
            elif several_clips and (mapped_size + read_start + read_end_clip -
                                    read_end - read_start_clip) >= 30:
                tag = multi_tag
            else:
                continue

            read_id = mate_a.get_sam_line().split("\t")[0]
            output_list.write(row_template % (
                read_id, str(read_start), str(read_start_clip),
                str(read_end_clip), str(read_end), str(insert_size), tag))

    return output_file
Exemplo n.º 41
0
def generateProfiles(sam_fn, fasta_fn='NC_003210.1.fa'):
    ''' Creates coverage and mismatch profiles for DMS-MaPseq sample.

    Requires the .sam file of Bowtie2 alignment of DMS-MaPseq reads to the
    bacterial genome.  Also requires the .fasta file of the genome, to which
    the reads were aligned.  Creates the profiles for coverage and number of
    mismatches for each nucleotide in the genome.  The profiles are saved as
    .pickle files in the form of GenomicArray objects (See HTSeq library
    docs).  Additionally the profiles can be visualized with Artemis and IGV
    genome browsers.

    Parameters:
        sam_fn: path of the paired-end SAM file; every output file name is
            built by replacing '.sam' in this path with a profile suffix.
        fasta_fn: FASTA file holding the single genome record.

    Only pairs in which both mates are present and flagged as a proper pair
    contribute to the profiles.
    '''
    # read the genome sequence from fasta file
    genome = SeqIO.read(fasta_fn, "fasta")
    genome_length = len(genome.seq)
    # create genomic arrays to store coverage and mismatch data
    cvg = HTSeq.GenomicArray({genome.id: genome_length}, stranded=True,
                             typecode="i")
    mis = HTSeq.GenomicArray({genome.id: genome_length}, stranded=True,
                             storage='ndarray', typecode="i")
    # read the paired-end sam file
    sam_reader = HTSeq.SAM_Reader(sam_fn)
    i = 0
    for first, second in HTSeq.pair_SAM_alignments(sam_reader):
        i += 1
        if not i % 100000:
            print('%s -> %s' % (sam_fn, i))
        # A mate missing from the file is reported as None by the pairing
        # generator; such pairs cannot be proper pairs.
        if first is None or second is None:
            continue
        # Both mates have to be flagged as properly paired.
        if not (first.proper_pair and second.proper_pair):
            continue

        # Remember the alignment starts now: extend_to_include() below
        # modifies first.iv in place, and the MD offsets are relative to
        # the start of each read's own alignment.
        first_start = first.iv.start
        second_start = second.iv.start

        # Add fragment coverage to the coverage profile
        second.iv.strand = first.iv.strand # The first read determines the fragment strand
        # If first and second reads overlap, the coverage is calculated for the whole fragment
        if first.iv.overlaps(second.iv):
            first.iv.extend_to_include(second.iv)
            cvg[first.iv] += 1
        else: # Alternatively, the coverage is calculated for each read separately
            cvg[first.iv] += 1
            cvg[second.iv] += 1

        # Add unique mismatches from every pair of reads to the mismatch profile
        mism_1 = parse_md(first.optional_field('MD'))
        mism_2 = parse_md(second.optional_field('MD'))
        # A set, so a position mismatched in both mates is counted once.
        coord_mism = set()
        for mism in mism_1:
            coord_mism.add(first_start + mism[0])
        for mism in mism_2:
            coord_mism.add(second_start + mism[0])

        for coord in coord_mism:
            pos = HTSeq.GenomicPosition(genome.id,
                                        coord, strand = first.iv.strand)
            mis[pos] += 1

    # Write coverage and mismatch profiles to file using pickle
    cvg_pickle_fn = sam_fn.replace('.sam', '_cvg.pickle')
    write_pickle(cvg, cvg_pickle_fn)

    mis_pickle_fn = sam_fn.replace('.sam', '_mis.pickle')
    write_pickle(mis, mis_pickle_fn)

    # Create Artemis profile of coverage
    cvg_artemis_fn = sam_fn.replace('.sam', '_cvg.artemis')
    write_artemis(cvg, cvg_artemis_fn)

    mis_artemis_fn = sam_fn.replace('.sam', '_mis.artemis')
    write_artemis(mis, mis_artemis_fn)

    # Create .bedgraph profiles of coverage at plus and minus strands
    cvg_bed_plus_fn = sam_fn.replace('.sam', '_cvg_plus.bedgraph')
    cvg_bed_minus_fn = sam_fn.replace('.sam', '_cvg_minus.bedgraph')
    write_bed(cvg, cvg_bed_plus_fn, cvg_bed_minus_fn)

    mis_bed_plus_fn = sam_fn.replace('.sam', '_mis_plus.bedgraph')
    mis_bed_minus_fn = sam_fn.replace('.sam', '_mis_minus.bedgraph')
    write_bed(mis, mis_bed_plus_fn, mis_bed_minus_fn)
Exemplo n.º 42
0
def htseq_count(data):
    """ adapted from Simon Anders htseq-count.py script
    http://www-huber.embl.de/users/anders/HTSeq/doc/count.html

    Counts reads (single-end) or read pairs (paired-end) per ``gene_id``
    over the ``exon`` features of the annotation, using the "union" overlap
    mode and no minimum alignment quality.

    The input/output paths come from ``_get_files(data)`` and the
    strandedness from ``data["config"]``.  Per-gene counts are written to
    the output file and the summary counters (on_feature, no_feature,
    ambiguous, too_low_aQual, not_aligned, alignment_not_unique) to the
    stats file.  If the output file already exists nothing is recomputed.

    Returns the path of the counts file.
    """

    sam_filename, gff_filename, out_file, stats_file = _get_files(data)
    stranded = _get_stranded_flag(data["config"])
    overlap_mode = "union"
    feature_type = "exon"
    id_attribute = "gene_id"
    minaqual = 0

    if file_exists(out_file):
        return out_file

    logger.info(
        "Counting reads mapping to exons in %s using %s as the "
        "annotation and strandedness as %s." %
        (os.path.basename(sam_filename), os.path.basename(gff_filename),
         _get_strandedness(data["config"])))

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    counts = {}

    # Try to open samfile to fail early in case it is not there
    open(sam_filename).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    sys.exit("Feature %s does not contain a '%s' attribute" %
                             (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    sys.exit("Feature %s at %s does not have strand "
                             "information but you are running htseq-count "
                             "in stranded mode. Use '--stranded=no'." %
                             (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
            i += 1
            if i % 100000 == 0:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        # Bare except on purpose: the location is also reported when one of
        # the sys.exit() calls above fires; the exception is re-raised.
        sys.stderr.write("Error occured in %s.\n" %
                         gff.get_line_number_string())
        raise

    sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(counts) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n" %
                         feature_type)

    try:
        align_reader = htseq_reader(sam_filename)
        # Peek at the first record only to learn whether the data is paired.
        first_read = next(iter(align_reader))
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occured when reading first line of sam "
                         "file.\n")
        raise

    def _matched_intervals(almnt, invert):
        # Reference intervals of the non-empty CIGAR "M" operations of one
        # alignment, optionally with the strand flipped.
        ivs = (co.ref_iv for co in almnt.cigar
               if co.type == "M" and co.size > 0)
        if invert:
            return (invert_strand(iv) for iv in ivs)
        return ivs

    reverse = stranded == "reverse"

    try:
        # Single-end data is iterated straight from the reader; paired-end
        # data goes through the mate-pairing generator.  (The single-end
        # source used to be left unassigned, which raised a NameError.)
        if pe_mode:
            read_seq = HTSeq.pair_SAM_alignments(align_reader)
        else:
            read_seq = align_reader
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    continue
                try:
                    # a missing NH tag (KeyError) counts as "unique"
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    continue
                iv_seq = _matched_intervals(r, reverse)
            else:
                if r[0] is not None and r[0].aligned:
                    iv_seq = _matched_intervals(r[0], reverse)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    # the second mate gets the opposite strand treatment
                    iv_seq = itertools.chain(
                        iv_seq, _matched_intervals(r[1], not reverse))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        notaligned += 1
                        continue
                try:
                    if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                       (r[1] is not None and r[1].optional_field("NH") > 1):
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or (r[1] and
                                                        r[1].aQual < minaqual):
                    lowqual += 1
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif (overlap_mode == "intersection-strict"
                      or overlap_mode == "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if (len(fs2) > 0
                                    or overlap_mode == "intersection-strict"):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                if fs is None or len(fs) == 0:
                    empty += 1
                elif len(fs) > 1:
                    ambiguous += 1
                else:
                    counts[list(fs)[0]] += 1
            except UnknownChrom:
                # reads on a chromosome absent from the annotation count as
                # "no feature"
                empty += 1

            if i % 100000 == 0:
                sys.stderr.write(
                    "%d sam %s processed.\n" %
                    (i, "lines " if not pe_mode else "line pairs"))

    except:
        # In both modes the line number is tracked by the underlying reader.
        sys.stderr.write("Error occured in %s.\n" %
                         align_reader.get_line_number_string())
        raise

    sys.stderr.write("%d sam %s processed.\n" %
                     (i, "lines " if not pe_mode else "line pairs"))

    with file_transaction(out_file) as tmp_out_file:
        with open(tmp_out_file, "w") as out_handle:
            on_feature = 0
            for fn in sorted(counts.keys()):
                on_feature += counts[fn]
                out_handle.write("%s\t%d\n" % (fn, counts[fn]))

    with file_transaction(stats_file) as tmp_stats_file:
        with open(tmp_stats_file, "w") as out_handle:
            out_handle.write("on_feature\t%d\n" % on_feature)
            out_handle.write("no_feature\t%d\n" % empty)
            out_handle.write("ambiguous\t%d\n" % ambiguous)
            out_handle.write("too_low_aQual\t%d\n" % lowqual)
            out_handle.write("not_aligned\t%d\n" % notaligned)
            out_handle.write("alignment_not_unique\t%d\n" % nonunique)

    return out_file
Exemplo n.º 43
0
def library_generation(exp, Info):
    """Build, fill and pickle the ``Library`` object of experiment ``exp``.

    Steps:
      1. Count, per genomic position and strand, the alignments whose
         quality reaches ``Info.IIDefinition.fidelity_limit`` (for paired
         libraries only uniquely bundled pairs with both mates aligned are
         used, and the first mate defines the position).
      2. Collapse positions into "independent insertions": positions are
         visited from the highest count downwards and each one absorbs the
         counts found in the window [start - 2, start + 2) on its strand.
         Windows whose total is below ``Info.IIDefinition.ins_iv`` are
         dropped.
      3. Store the summary numbers and the collapsed counts on the library,
         pickle it under ``Info.General.storing_loc`` and return it.

    Parameters:
        exp: experiment name; must be listed in ``Info.IIDefinition.lib_names``.
        Info: run configuration/logging object (``General``, ``IIDefinition``
            and ``print_save`` are used).

    Returns:
        The populated ``Library`` instance.
    """
    #Generation of the Class Library specific for "exp"
    library = Library(
        exp,
        Info.IIDefinition.input_files[Info.IIDefinition.lib_names.index(exp)])
    print(library.name)
    string = '\n\n***\tInedependent Insertions (I.I.) definition\t***\n\n- Input file: %s\n- Pair ends: %s\n- Alignment cutoff: %s\n- Remove duplicates: %s\n- Insertion cutoff: %i' % (
        library.input, Info.General.pair_ends,
        Info.IIDefinition.fidelity_limit, Info.IIDefinition.reads_duplicate,
        Info.IIDefinition.ins_iv)
    Info.print_save(exp, string)

    startTime = getCurrTime()
    string = '\tSelection of Insertions (I.): %s' % startTime
    Info.print_save(exp, string)
    aligned_file = HTSeq.SAM_Reader(library.input)

    insertions_counts = Counter()

    count_aligned = 0
    count_GoodQualityAlignment = 0
    count_total = 0

    # Decide whether chromosome names need a 'chr' prefix.
    # NOTE(review): the scan only stops at the first aligned read whose
    # chromosome does NOT start with 'chr'; for an all-'chr' file the whole
    # input is read once here.  Possibly the break was meant to follow the
    # first aligned read -- behaviour kept as is, confirm with the author.
    chromosome_style = ''
    for algnt in aligned_file:
        if algnt.aligned:
            if algnt.iv.chrom.startswith('chr'):
                chromosome_style = ''
            else:
                chromosome_style = 'chr'
                break

    fidelity_limit = Info.IIDefinition.fidelity_limit

    if Info.General.pair_ends:  #Pair ends library
        for bundle in HTSeq.pair_SAM_alignments(aligned_file, bundle=True):
            if len(bundle) != 1:
                continue  # Skip multiple alignments
            first_almnt, second_almnt = bundle[0]  # extract pair
            # A mate missing from the file is reported as None; such a pair
            # is counted in the total but cannot be "aligned".
            if (first_almnt is not None and second_almnt is not None
                    and first_almnt.aligned and second_almnt.aligned):
                if first_almnt.aQual >= fidelity_limit:
                    ins = HTSeq.GenomicPosition(
                        '%s%s' % (chromosome_style, str(first_almnt.iv.chrom)),
                        first_almnt.iv.start_d, first_almnt.iv.strand)
                    insertions_counts[ins] += 1
                    count_GoodQualityAlignment += 1
                count_aligned += 1
            count_total += 1

    else:  #Single ends library
        for algnt in aligned_file:
            if algnt.aligned:
                if algnt.aQual >= fidelity_limit:
                    ins = HTSeq.GenomicPosition(
                        '%s%s' % (chromosome_style, str(algnt.iv.chrom)),
                        algnt.iv.start_d, algnt.iv.strand)
                    insertions_counts[ins] += 1
                    count_GoodQualityAlignment += 1
                count_aligned += 1
            count_total += 1

    del aligned_file

    string = '\t-Total reads: %i\n\t-Aligned reads: %i\n\t-Aligned Reads trusted: %i\n\t-Insertions identified: %i' % (
        count_total, count_aligned, count_GoodQualityAlignment,
        len(insertions_counts))
    Info.print_save(exp, string)

    string = '\tRunTime: %s' % computeRunTime(startTime, getCurrTime())
    Info.print_save(exp, string)

    ### To collapse insertions in insertion array that are in the same interval (4bps)

    startTime = getCurrTime()
    string = 'Define Independent Insertions\n\tStarted: %s' % startTime
    Info.print_save(exp, string)
    insertions_series = pd.Series(insertions_counts,
                                  index=insertions_counts.keys())
    del insertions_counts
    # sort_values() returns a NEW series; the result has to be kept,
    # otherwise the positions are not visited from the highest count down.
    insertions_order = insertions_series.sort_values(ascending=False)
    insertions_genomicarray = HTSeq.GenomicArray("auto", stranded=True)

    count_indipendent_insertions = 0
    count_indipendent_insertions_aborted = 0

    # Materialised as a list because it is iterated twice below.
    insertions_tuple = list(zip(insertions_order.index,
                                insertions_order.values))
    del insertions_order
    del insertions_series

    for ins in insertions_tuple:
        insertions_genomicarray[ins[0]] = ins[1]

    insertions_collapsed = {}

    for n in insertions_tuple:
        i = n[0]
        # A position already absorbed by a stronger neighbour reads 0 here.
        if insertions_genomicarray[i] > 0:
            counted = 0
            iv_i = HTSeq.GenomicInterval(i.chrom, i.start - 2, i.start + 2,
                                         i.strand)
            for i_2 in iv_i.xrange(step=1):
                try:
                    counted += insertions_genomicarray[i_2]
                    insertions_genomicarray[i_2] = 0
                except IndexError:
                    string = "\t!!!Skipped from analysis: %s" % i_2
                    Info.print_save(exp, string)
                    continue

            if counted >= Info.IIDefinition.ins_iv:
                if i in insertions_collapsed:
                    insertions_collapsed[i] += counted
                else:
                    insertions_collapsed[i] = counted
                count_indipendent_insertions += 1
            else:
                count_indipendent_insertions_aborted += 1

    string = '\t-Total insertions: %i\n\t-Independent Insertions (I.I.): %i' % (
        (count_indipendent_insertions + count_indipendent_insertions_aborted),
        count_indipendent_insertions)
    Info.print_save(exp, string)

    string = '\tRunTime: %s' % computeRunTime(startTime, getCurrTime())
    Info.print_save(exp, string)

    ###Storing data in library class that will be returned modifed as result of the function

    library.informations['Total'] = count_total
    library.informations['Aligned'] = count_aligned
    library.informations['Insertions'] = count_indipendent_insertions
    library.informations['II'] = count_indipendent_insertions
    if Info.IIDefinition.reads_duplicate:
        # NOTE(review): count_reads is not assigned anywhere in this
        # function; unless it is a module-level name this raises NameError
        # whenever reads_duplicate is set -- confirm the intended value.
        library.informations['Unique_reads'] = count_reads

    library.raw = pd.Series(insertions_collapsed,
                            index=insertions_collapsed.keys())

    #####Store the class!!!!!#####
    location = os.path.join(Info.General.storing_loc,
                            exp + '_' + Info.General.date, 'raw',
                            exp + '_IIRawdata.pkl')
    with open(location, 'wb') as saving:
        pickle.dump(library, saving)
    #####END the program#####
    string = 'Informations stored in %s\n***\tEND of Inedependent Insertions (I.I.) definition\t***' % location
    Info.print_save(exp, string)

    return library
Exemplo n.º 44
0
 def next_pair(self):
     """Generator over the (first, second) mate tuples of ``self.read_iter``."""
     mate_pairs = ht.pair_SAM_alignments(self.read_iter)
     for mate_a, mate_b in mate_pairs:
         yield (mate_a, mate_b)
Exemplo n.º 45
0
#!/usr/bin/python

import HTSeq as h
from collections import defaultdict

#reader_masked = h.SAM_Reader("/Users/dashazhernakova/Documents/UMCG/data/geuvadis/mappedData/ERR188022/aligned_masked.sam")
#reader = h.SAM_Reader("/Users/dashazhernakova/Documents/UMCG/data/geuvadis/mappedData/ERR188022/aligned.sam")

# Open the two SAM files to compare and group the read pairs of `reader`
# by read name.
# NOTE(review): `reader` points at the ERR188022_masked directory while
# `reader_masked` points at the plain ERR188022 directory -- the names look
# swapped relative to the paths; confirm before relying on them.
reader = h.SAM_Reader("/Users/dashazhernakova/Documents/UMCG/data/geuvadis/mappedData/ERR188022_masked/Aligned.out.filtered.new.1017680.sam")
reader_masked = h.SAM_Reader("/Users/dashazhernakova/Documents/UMCG/data/geuvadis/mappedData/ERR188022/Aligned.out.filtered.new1mb.sam")
# Pair iterators over both files; nothing is read until they are advanced.
it_p = iter(h.pair_SAM_alignments(reader))
it_p_m = iter(h.pair_SAM_alignments(reader_masked))

# Comparison counters; they are only initialised here.
same_aligned = 0
one_same_pos = 0
both_same_pos = 0
masked_more_pos = 0
simple_more_pos = 0
#cur_read = {}
#cur_m_read = {}
not_in_simple = 0
not_in_masked = 0

# Read name -> list of (r1, r2) alignment pairs carrying that name.
n_m = defaultdict(list)
i = 0
for r1, r2 in h.pair_SAM_alignments(reader):
	# pair_SAM_alignments yields None for a missing mate, so take the read
	# name from whichever mate is present instead of assuming r1 exists.
	named = r1 if r1 is not None else r2
	n_m[named.read.name].append((r1, r2))
	i += 1
	if i % 10000 == 0:
		# Single-argument print: same output on Python 2 and Python 3.
		print("%d  lines" % i)
#for k,v in n_m.items():
Exemplo n.º 46
0
def htseq_count(data):
    """Count reads per feature with HTSeq and write count and stats files.

    Adapted from Simon Anders' htseq-count.py script
    http://www-huber.embl.de/users/anders/HTSeq/doc/count.html

    The overlap mode ("union"), feature type ("exon"), id attribute
    ("gene_id") and minimum alignment quality (0) are fixed inside the
    function; the strandedness is derived from ``data["config"]``.

    Args:
        data: sample description.  It is handed to ``_get_files`` to obtain
            the SAM, GFF, counts-output and stats-output file names, and
            ``data["config"]`` is handed to the strandedness helpers.

    Returns:
        The path of the counts file.  When that file already exists the
        function returns it immediately without counting again.

    Raises:
        SystemExit: when a feature of the selected type lacks the id
            attribute, or lacks strand information in stranded mode.
        Exception: anything raised while reading the GFF or SAM input is
            re-raised after a message naming the current line is written to
            stderr.
    """

    sam_filename, gff_filename, out_file, stats_file = _get_files(data)
    stranded = _get_stranded_flag(data["config"])
    overlap_mode = "union"
    feature_type = "exon"
    id_attribute = "gene_id"
    minaqual = 0

    if file_exists(out_file):
        return out_file

    logger.info("Counting reads mapping to exons in %s using %s as the "
                    "annotation and strandedness as %s." % (os.path.basename(sam_filename),
                    os.path.basename(gff_filename), _get_strandedness(data["config"])))

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    counts = {}

    # Try to open samfile to fail early in case it is not there
    open(sam_filename).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    sys.exit("Feature %s does not contain a '%s' attribute" %
                             (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    sys.exit("Feature %s at %s does not have strand "
                             "information but you are running htseq-count "
                             "in stranded mode. Use '--stranded=no'." %
                             (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
            i += 1
            if i % 100000 == 0:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        # Deliberately bare: the sys.exit() calls above must also report the
        # offending GFF line before the exception propagates.
        sys.stderr.write("Error occured in %s.\n"
                         % gff.get_line_number_string())
        raise

    sys.stderr.write("%d GFF lines processed.\n" % i)

    if not counts:
        sys.stderr.write("Warning: No features of type '%s' found.\n"
                         % feature_type)

    try:
        align_reader = htseq_reader(sam_filename)
        # Peek at the first record only to learn whether the data is paired;
        # the reader is iterated again from its start in the loop below.
        first_read = next(iter(align_reader))
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occured when reading first line of sam "
                         "file.\n")
        raise

    def matched_intervals(almnt, flip_strand):
        # Reference intervals of the non-empty match ("M") CIGAR operations
        # of one alignment, optionally moved to the opposite strand.
        if flip_strand:
            return (invert_strand(co.ref_iv) for co in almnt.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in almnt.cigar
                if co.type == "M" and co.size > 0)

    reverse = stranded == "reverse"
    # Single-end input is iterated directly and paired-end input is wrapped
    # below.  Binding read_seq here fixes single-end runs, which used to fail
    # with a NameError because the name was only set in paired-end mode.
    read_seq = align_reader
    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0
    i = 0
    try:
        if pe_mode:
            read_seq = HTSeq.pair_SAM_alignments(align_reader)
        for r in read_seq:
            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        continue
                except KeyError:
                    # No NH tag: the alignment is not treated as non-unique.
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    continue
                iv_seq = matched_intervals(r, reverse)
            else:
                # The first mate keeps its strand unless counting in
                # "reverse" mode; the second mate is handled the other way
                # round.
                if r[0] is not None and r[0].aligned:
                    iv_seq = matched_intervals(r[0], reverse)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    iv_seq = itertools.chain(
                        iv_seq, matched_intervals(r[1], not reverse))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        notaligned += 1
                        continue
                try:
                    if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                       (r[1] is not None and r[1].optional_field("NH") > 1):
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or (r[1] and
                                                        r[1].aQual < minaqual):
                    lowqual += 1
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif (overlap_mode == "intersection-strict" or
                      overlap_mode == "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if (len(fs2) > 0 or overlap_mode == "intersection-strict"):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                if fs is None or len(fs) == 0:
                    empty += 1
                elif len(fs) > 1:
                    ambiguous += 1
                else:
                    counts[list(fs)[0]] += 1
            except UnknownChrom:
                # The read lies on a chromosome without any feature: it is
                # counted as "no_feature".
                empty += 1

            if i % 100000 == 0:
                sys.stderr.write("%d sam %s processed.\n" %
                                 ( i, "lines " if not pe_mode else "line pairs"))

    except:
        # In both modes the line number comes from the underlying reader.
        sys.stderr.write("Error occured in %s.\n"
                         % align_reader.get_line_number_string())
        raise

    sys.stderr.write("%d sam %s processed.\n" %
                     (i, "lines " if not pe_mode else "line pairs"))

    with file_transaction(out_file) as tmp_out_file:
        with open(tmp_out_file, "w") as out_handle:
            on_feature = 0
            for fn in sorted(counts.keys()):
                on_feature += counts[fn]
                out_handle.write("%s\t%d\n" % (fn, counts[fn]))

    with file_transaction(stats_file) as tmp_stats_file:
        with open(tmp_stats_file, "w") as out_handle:
            out_handle.write("on_feature\t%d\n" % on_feature)
            out_handle.write("no_feature\t%d\n" % empty)
            out_handle.write("ambiguous\t%d\n" % ambiguous)
            out_handle.write("too_low_aQual\t%d\n" % lowqual)
            out_handle.write("not_aligned\t%d\n" % notaligned)
            out_handle.write("alignment_not_unique\t%d\n" % nonunique)

    return out_file
def count_reads_onto_prebuilt_features(sam_filename,
                                       features,
                                       feature_ids,
                                       stranded,
                                       overlap_mode,
                                       quiet,
                                       minaqual,
                                       samout,
                                       umis=False):
    """Count SAM alignments onto an already built feature array.

    Args:
        sam_filename: path of the SAM file, or "-" to read from stdin.
        features: feature array queried as ``features[iv].steps()`` and
            ``features.chrom_vectors``.
        feature_ids: iterable of feature ids; each one gets a count entry.
        stranded: the value "reverse" flips the strand handling; any other
            value uses the read strand as is for the first mate.
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        quiet: if true, HTSeq warnings and progress messages are suppressed.
        minaqual: alignments with a lower ``aQual`` are counted as
            "too_low_aQual".
        samout: path of an annotated SAM output file, or "" for none.
        umis: if true, per-feature counts are the number of distinct UMI
            sequences taken from ``:UMI:<seq>:`` in the read names.

    Returns:
        A tuple ``(feats, counts)`` of two parallel lists: the sorted feature
        ids followed by the summary labels 'no_feature', 'ambiguous',
        'too_low_aQual', 'not_aligned' and 'alignment_not_unique', and the
        matching counts.

    Raises:
        EmptySamError: if no alignment can be read from the input.
        SystemExit: if ``overlap_mode`` is not one of the supported modes.
    """
    if quiet:
        warnings.filterwarnings(action="ignore", module="HTSeq")

    samoutfile = open(samout, "w") if samout != "" else None
    try:
        return _count_sam_onto_features(sam_filename, features, feature_ids,
                                        stranded, overlap_mode, quiet,
                                        minaqual, samoutfile, umis)
    finally:
        # Close the annotated SAM output on every exit path, errors included.
        if samoutfile is not None:
            samoutfile.close()


def _count_sam_onto_features(sam_filename, features, feature_ids, stranded,
                             overlap_mode, quiet, minaqual, samoutfile, umis):
    """Do the counting for count_reads_onto_prebuilt_features.

    ``samoutfile`` is an open file object or None; it is not closed here.
    """

    def write_to_samout(r, assignment):
        # Append the assignment as an XF tag to every present mate's line.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() + "\tXF:Z:" +
                                 assignment + "\n")

    def matched_intervals(almnt, flip_strand):
        # Reference intervals of the non-empty match ("M") CIGAR operations
        # of one alignment, optionally moved to the opposite strand.
        if flip_strand:
            return (invert_strand(co.ref_iv) for co in almnt.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in almnt.cigar
                if co.type == "M" and co.size > 0)

    umi_counts = {}
    if umis:
        umi_re = re.compile(r":UMI:(\w+):")
        for feature_id in feature_ids:
            umi_counts[feature_id] = Counter()

    # Try to open samfile to fail early in case it is not there
    if sam_filename != "-":
        open(sam_filename).close()

    counts = {}
    for feature_id in feature_ids:
        counts[feature_id] = 0

    try:
        if sam_filename != "-":
            read_seq_file = HTSeq.SAM_Reader(sam_filename)
            read_seq = read_seq_file
            first_read = next(iter(read_seq))
        else:
            # stdin cannot be re-read, so the peeked record is put back in
            # front of the remaining ones.
            read_seq_file = HTSeq.SAM_Reader(sys.stdin)
            read_seq_iter = iter(read_seq_file)
            first_read = next(read_seq_iter)
            read_seq = itertools.chain([first_read], read_seq_iter)
        pe_mode = first_read.paired_end
    except StopIteration:
        raise EmptySamError(sam_filename)

    reverse = stranded == "reverse"
    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0
    i = 0
    try:
        if pe_mode:
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        for r in read_seq:
            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    write_to_samout(r, "not_aligned")
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        write_to_samout(r, "alignment_not_unique")
                        nonunique += 1
                        continue
                except KeyError:
                    # No NH tag: the alignment is not treated as non-unique.
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    write_to_samout(r, "too_low_aQual")
                    continue
                iv_seq = matched_intervals(r, reverse)
            else:
                # The first mate keeps its strand unless counting in
                # "reverse" mode; the second mate is handled the other way
                # round.
                if r[0] is not None and r[0].aligned:
                    iv_seq = matched_intervals(r[0], reverse)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    iv_seq = itertools.chain(
                        iv_seq, matched_intervals(r[1], not reverse))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        write_to_samout(r, "not_aligned")
                        notaligned += 1
                        continue
                try:
                    if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                       (r[1] is not None and r[1].optional_field("NH") > 1):
                        nonunique += 1
                        write_to_samout(r, "alignment_not_unique")
                        continue
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or (r[1] and
                                                        r[1].aQual < minaqual):
                    lowqual += 1
                    write_to_samout(r, "too_low_aQual")
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                if fs is None or len(fs) == 0:
                    write_to_samout(r, "no_feature")
                    empty += 1
                elif len(fs) > 1:
                    write_to_samout(r, "ambiguous[" + '+'.join(fs) + "]")
                    ambiguous += 1
                else:
                    feature_id = list(fs)[0]
                    write_to_samout(r, feature_id)
                    counts[feature_id] += 1
                    if umis:
                        # In paired-end mode r is a tuple of mates, so the
                        # read name is taken from whichever mate is present
                        # (r.read on the tuple used to raise AttributeError).
                        if pe_mode:
                            named = r[0] if r[0] is not None else r[1]
                        else:
                            named = r
                        umi_seq = umi_re.search(named.read.name).group(1)
                        umi_counts[feature_id][umi_seq] += 1
            except UnknownChrom:
                # The read lies on a chromosome that is absent from the
                # features: it is silently counted as "no_feature".
                empty += 1

            if i % 100000 == 0 and not quiet:
                sys.stderr.write(
                    "%d sam %s processed.\n" %
                    (i, "lines " if not pe_mode else "line pairs"))

    except:
        # Deliberately bare so that every failure, sys.exit() included,
        # reports the input position before it propagates.
        sys.stderr.write("Error occured when processing SAM input (%s):\n" %
                         read_seq_file.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d sam %s processed.\n" %
                         (i, "lines " if not pe_mode else "line pairs"))

    # Sorted feature list and the parallel list of counts.
    feats = sorted(counts.keys())
    if umis:
        counts = [len(umi_counts[fn]) for fn in feats]
    else:
        counts = [counts[fn] for fn in feats]
    # Append the summary statistics to both lists.
    feats = feats + [
        'no_feature', 'ambiguous', 'too_low_aQual', 'not_aligned',
        'alignment_not_unique'
    ]
    counts = counts + [empty, ambiguous, lowqual, notaligned, nonunique]
    return (feats, counts)
Exemplo n.º 48
0
      set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] )
      if len( set_of_gene_names ) == 0:
         counts[ '_empty' ] += 1
      elif len( set_of_gene_names ) > 1:
         counts[ '_ambiguous' ] +=1
      else:
         for f in rs:
            counts[ f.name ] += 1
      num_reads += 1
      if num_reads % 100000 == 0:
         sys.stderr.write( "%d reads processed.\n" % num_reads )

else: # paired-end

   num_reads = 0
   for af, ar in HTSeq.pair_SAM_alignments( HTSeq.SAM_Reader( sam_file ) ):
      rs = set()
      if af and ar and not af.aligned and not ar.aligned:
         counts[ '_notaligned' ] += 1
         continue
      if af and ar and not af.aQual < minaqual and ar.aQual < minaqual:
         counts[ '_lowaqual' ] += 1
         continue
      if af and af.aligned and af.aQual >= minaqual and af.iv.chrom in features.chrom_vectors.keys():
         for cigop in af.cigar:
            if cigop.type != "M":
               continue
            if reverse:
               cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand )
            for iv, s in features[cigop.ref_iv].steps():
               rs = rs.union( s )
Exemplo n.º 49
0
 def next_pair(self):
     """Generate (first, second) mate tuples read from ``self.read_iter``.

     Every item produced by ``ht.pair_SAM_alignments`` is unpacked into its
     two elements and yielded again as a two-element tuple.
     """
     paired_reads = ht.pair_SAM_alignments(self.read_iter)
     for pair in paired_reads:
         mate_a, mate_b = pair
         yield mate_a, mate_b
Exemplo n.º 50
0
def count_reads_in_features(sam_filename, gff_filename, stranded, overlap_mode,
                            feature_type, id_attribute, quiet, minaqual,
                            samout):
    def write_to_samout(r, assignment):
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() + "\tXF:Z:" +
                                 assignment + "\n")

    if quiet:
        warnings.filterwarnings(action="ignore", module="HTSeq")

    if samout != "":
        samoutfile = open(samout, "w")
    else:
        samoutfile = None

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    counts = {}
    ## added by CR
    dict_nonunique = {}

    # Try to open samfile to fail early in case it is not there
    if sam_filename != "-":
        open(sam_filename).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    sys.exit("Feature %s does not contain a '%s' attribute" %
                             (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    sys.exit(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'."
                        % (f.name, f.iv))
                features[f.iv] += feature_id
                counts[f.attr[id_attribute]] = 0
                ##added by CR
                dict_nonunique[f.attr[id_attribute]] = 0
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        sys.stderr.write("Error occured in %s.\n" %
                         gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(counts) == 0 and not quiet:
        sys.stderr.write("Warning: No features of type '%s' found.\n" %
                         feature_type)

    try:
        if sam_filename != "-":
            read_seq = HTSeq.SAM_Reader(sam_filename)
            first_read = iter(read_seq).next()
        else:
            read_seq = iter(HTSeq.SAM_Reader(sys.stdin))
            first_read = read_seq.next()
            read_seq = itertools.chain([first_read], read_seq)
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write(
            "Error occured when reading first line of sam file.\n")
        raise

    try:
        if pe_mode:
            read_seq_pe_file = read_seq
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        #added by SB
        temp_read_name = "NA"
        temp_interval_r0 = "NA"
        temp_interval_r1 = "NA"
        ## added by CR
        nonunique2 = 0
        #added by SB
        i = 0
        for r in read_seq:
            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    write_to_samout(r, "not_aligned")
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        write_to_samout(r, "alignment_not_unique")
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    write_to_samout(r, "too_low_aQual")
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar if co.type == "M")
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                              if co.type == "M")
            else:
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type == "M")
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                  if co.type == "M")
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                            iv_seq, (invert_strand(co.ref_iv)
                                     for co in r[1].cigar if co.type == "M"))
                    else:
                        iv_seq = itertools.chain(
                            iv_seq,
                            (co.ref_iv for co in r[1].cigar if co.type == "M"))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        write_to_samout(r, "not_aligned")
                        notaligned += 1
                        continue
                try:
                    if (( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \
                          ( r[1] is not None and r[1].optional_field( "NH" ) > 1 )):
                        #print "Reference i= ", i
                        nonunique += 1
                        #print "%s--%s" % ( r[0].cigar,  r[1].cigar)
                        if (r[0] is not None and r[1] is None):
                            result, fs_new = is_read_in_gene_interval(
                                r[0], features)
                            if result:
                                if ((temp_read_name != r[0].read.name)
                                        and (temp_interval_r0 is not r[0].iv)):
                                    temp_read_name = r[0].read.name
                                    temp_interval_r0 = r[0].iv
                                ## -- ro: dir(ro) = ['__class__', '__delattr__', '__doc__', '__format__', '__getattribute__', '__hash__', '__init__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '_read', '_read_as_sequenced', 'aQual', 'aligned', 'cigar', 'failed_platform_qc', 'from_SAM_line', 'from_pysam_AlignedRead', 'get_sam_line', 'inferred_insert_size', 'iv', 'mate_aligned', 'mate_start', 'not_primary_alignment', 'optional_field', 'optional_fields', 'original_sam_line', 'paired_end', 'pcr_or_optical_duplicate', 'pe_which', 'proper_pair', 'read', 'read_as_aligned']

                                #print "## -- ro:  = %s---" % (r[0].original_sam_line)
                                dict_nonunique[list(fs_new)[0]] += 1
                                #print "R1 %s--> %s " % (fs_new1 ,r[0].iv)
                        if (r[0] is None and r[1] is not None):
                            result, fs_new = is_read_in_gene_interval(
                                r[1], features)
                            if result:
                                if ((temp_read_name != r[1].read.name)
                                        and (temp_interval_r1 is not r[1].iv)):
                                    temp_read_name = r[1].read.name
                                    temp_interval_r1 = r[1].iv

                            #print "## -- r1:  = %s---" % (r[1].original_sam_line)
                                dict_nonunique[list(fs_new)[0]] += 1
                            #print "R2 %s--> %s" % (fs_new ,r[1].iv )
                        if (r[0] is not None and r[1] is not None):
                            #print "## -- ro & r1 :: %s-%s" % (r[0].original_sam_line, r[1].original_sam_line)
                            #print "%s--%s" % ( r[0].cigar,  r[1].cigar)
                            result1, fs_new1 = is_read_in_gene_interval(
                                r[0], features)
                            result2, fs_new2 = is_read_in_gene_interval(
                                r[1], features)

                            if result1 and not result2:
                                if ((temp_read_name != r[0].read.name)
                                        and (temp_interval_r0 is not r[0].iv)):
                                    temp_interval_r0 = r[0].iv
                                    #print "before ---%s -" % ( temp_read_name )
                                    temp_read_name = r[0].read.name
                                    temp_interval_r0 = r[0].iv
                                    #print "after %s" % ( temp_read_name )
                                    dict_nonunique[list(fs_new1)[0]] += 1
                                #print "R1 %s--> %s" % (fs_new1 ,r[0].iv)
                            elif result2 and not result1:
                                if ((temp_read_name != r[1].read.name)
                                        and (temp_interval_r1 is not r[1].iv)):
                                    temp_read_name = r[1].read.name
                                    temp_interval_r1 = r[1].iv
                                    #print "## -- ro & r1: r1"
                                    #print "%s" % (r[1].read.name )
                                    dict_nonunique[list(fs_new2)[0]] += 1
                                #print "R2 %s--> %s" % (fs_new2 ,r[1].iv)
                            elif result1 and result2:
                                if ((temp_read_name != r[0].read.name) and (temp_interval_r0 is not r[0].iv ) and \
                                 ( temp_interval_r1 is not r[1].iv) ):
                                    temp_read_name = r[0].read.name
                                    temp_interval_r0 = r[0].iv
                                    temp_interval_r1 = r[1].iv
                                    #print "## -- ro & r1: ro&r1"
                                    #print "%s" % (r[0].original_sam_line)
                                    #print "---%s:%s -- %s --%s" % (r.count, r.index, r[1].read, r[0].read )
                                    #print "%i---%i---%s---%s " % (result1, result2, fs_new1, fs_new2 )

                                    if list(fs_new1)[0] != list(fs_new2)[0]:
                                        dict_nonunique[list(fs_new1)[0]] += 1
                                        dict_nonunique[list(fs_new2)[0]] += 1
                                    else:
                                        dict_nonunique[list(fs_new1)[0]] += 1
                                #dict_nonunique[ list(fs_new1)[0]] += 1
                                #print "R1_R2, %s--> %s ---%s " % (fs_new1 ,r[0].iv, r[1].iv)
                                #dict_nonunique[ list(fs_new2)[0]] += 1

#-------------------------Modified by SB------------------------------------------------------
#fs_new= set()
#print "%s**%s**%s" % (type(r[0]), type(r[0].iv), type(features))
#zz=0
#for iv3, fs_new2 in features[ r[0].iv ].steps():
#	print "%i--%s--%s" % (zz, iv3, fs_new2)
#	zz+=1
#	fs_new = fs_new.union( fs_new2 )
#CAS:[1465,1544)/+ *** CAS:[1465,1499)/. -- set([]) --set(['gal5p'])
#CAS:[1465,1544)/+ *** CAS:[1499,1544)/. -- set(['GALopt']) --set(['gal5p', 'GALopt'])
#if not ( (fs_new is None or len( fs_new ) == 0 ) or (len( fs_new ) > 1 ) ) :
#added by CR
#dict_nonunique[ list(fs_new)[0]] += 1
#---------------------------EOF SB_changes-----------------------------------------------------

                        write_to_samout(r, "alignment_not_unique")
                        continue
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or (r[1] and
                                                        r[1].aQual < minaqual):
                    lowqual += 1
                    write_to_samout(r, "too_low_aQual")
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)

                            # added to test SB
#CAS:[1465,1544)/+ *** CAS:[1465,1499)/. -- set([]) --set(['gal5p'])
#CAS:[1465,1544)/+ *** CAS:[1499,1544)/. -- set(['GALopt']) --set(['gal5p', 'GALopt'])
#print "%s *** %s -- %s --%s" % (iv, iv2, fs2, fs)
                elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if len(
                                    fs2
                            ) > 0 or overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                if fs is None or len(fs) == 0:
                    write_to_samout(r, "no_feature")
                    empty += 1
                elif len(fs) > 1:
                    write_to_samout(r, "ambiguous[" + '+'.join(fs) + "]")
                    ambiguous += 1
                else:
                    write_to_samout(r, list(fs)[0])

                    counts[list(fs)[0]] += 1

##aded by CR 2 lines
#dict_nonunique[ list(fs)[0]] += nonunique2
#nonunique2 = 0

            except UnknownChrom:
                if not pe_mode:
                    rr = r
                else:
                    rr = r[0] if r[0] is not None else r[1]
                if not quiet:
                    sys.stderr.write((
                        "Warning: Skipping read '%s', because chromosome " +
                        "'%s', to which it has been aligned, did not appear in the GFF file.\n"
                    ) % (rr.read.name, iv.chrom))

            if i % 100000 == 0 and not quiet:
                sys.stderr.write(
                    "%d sam %s processed.\n" %
                    (i, "lines " if not pe_mode else "line pairs"))

    except:
        if not pe_mode:
            sys.stderr.write("Error occured in %s.\n" %
                             read_seq.get_line_number_string())
        else:
            sys.stderr.write("Error occured in %s.\n" %
                             read_seq_pe_file.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d sam %s processed.\n" %
                         (i, "lines " if not pe_mode else "line pairs"))

    if samoutfile is not None:
        samoutfile.close()
    print "Gene\tUnique_reads"
    for fn in sorted(counts.keys()):

        print "%s\t%d" % (fn, counts[fn])

        ##print "*%s\t%d" % (fn, dict_nonunique[fn])
    print "no_feature\t%d" % empty
    print "ambiguous\t%d" % ambiguous
    print "too_low_aQual\t%d" % lowqual
    print "not_aligned\t%d" % notaligned
    print "alignment_not_unique\t%d" % nonunique
Exemplo n.º 51
0
def run_through_sam( sam_filename ): 
    """Classify the read pairs of a SAM file by multiplicity and feature overlap.

    Alignments are read with ``HTSeq.SAM_Reader`` and grouped into bundles by
    ``HTSeq.pair_SAM_alignments( ..., bundle=True )``.  Every non-empty bundle
    increments the total and is tallied by the number of alignment pairs it
    holds:

    * more than two pairs: ``__Ambigios_read``;
    * exactly one pair: one of the ``__Single_hit:*`` categories;
    * exactly two pairs: one of the ``__Double_hit:*`` categories and, when
      one or two features are hit, a key made of the feature names joined
      with ``_``.

    Overlaps are looked up in ``features``; the matched intervals of the
    first mate are strand-inverted with ``invert_strand`` before the lookup,
    those of the second mate are used as they are.  ``features``, ``quiet``
    and ``invert_strand`` are not defined here and are taken from the
    enclosing scope.

    Returns the tallies plus ``__Total_reads`` as a list of
    ``(name, count)`` tuples sorted by name.

    Raises ValueError when a mate exposes no ``cigar`` attribute.
    """

    def _overlapping_features( r1, r2, label ):
        # Union of all feature sets overlapped by the "M" CIGAR operations of
        # both mates.  ``r.cigar`` is evaluated when the generator expression
        # is created, so a mate without that attribute is reported right here.
        try:
            iv_seq1 = ( co.ref_iv for co in r1.cigar if co.type == "M" and co.size > 0 )
            iv_seq2 = ( co.ref_iv for co in r2.cigar if co.type == "M" and co.size > 0 )
        except AttributeError:
            raise ValueError( "%s:Someting wrong with read %s" % ( label, r1 ) )
        hits = set()
        for iv in iv_seq1:
            for iv2, fs2 in features[ invert_strand(iv) ].steps():
                hits = hits.union( fs2 )
        for iv in iv_seq2:
            for iv2, fs2 in features[ iv ].steps():
                hits = hits.union( fs2 )
        return hits

    try:
        almnt_file = HTSeq.SAM_Reader( sam_filename )
    except KeyError:
        raise ValueError( "Can't find file %s" % (sam_filename) )
    count_reads = collections.Counter()
    i = 0
    for bundle in HTSeq.pair_SAM_alignments( almnt_file, bundle=True ):
        if len(bundle) == 0:
            continue
        i += 1
        if i % 200000 == 0 and not quiet:
            sys.stderr.write( "%d SAM alignment records processed.\n" % ( i ) )

        # Multimapping reads: more than two reported alignment pairs.
        if len(bundle) > 2:
            count_reads[ '__Ambigios_read' ] += 1
            continue

        # Singles: reads that map to one genomic position.
        if len(bundle) == 1:
            r1, r2 = bundle[0]
            if r1 is None or r2 is None:
                # Count the pair once as not aligned; it used to fall through
                # and be counted a second time as '__Single_hit:No_feature'.
                count_reads[ '__Single_hit:Not_aligned' ] += 1
                continue
            rs = _overlapping_features( r1, r2, "Single" )
            if len(rs) == 0:
                count_reads[ '__Single_hit:No_feature' ] += 1
            elif len(rs) == 1:
                count_reads[ '__Single_hit:Feature_found' ] += 1
            else:
                count_reads[ '__Single_hit:Ambigous_features' ] += 1
            continue

        # Doubles: reads that map to two genomic positions.
        rs = set()
        found = []
        for r1, r2 in bundle:
            if r1 is None or r2 is None:
                found.append(False)
                continue
            found.append(True)
            rs = rs.union( _overlapping_features( r1, r2, "Double" ) )
        # NOTE(review): the double hit is reported as not aligned as soon as
        # either of its two pairs lacks a mate -- confirm this is intended.
        if not all(found):
            count_reads[ '__Double_hit:Not_aligned' ] += 1
            continue
        if len(rs) == 0:
            count_reads[ '__Double_hit:No_feature' ] += 1
        elif len(rs) == 1:
            count_reads[ '_'.join(rs) ] += 1
            count_reads[ '__Double_hit:Single_feature_found' ] += 1
        elif len(rs) == 2:
            # Sort the names so the same feature pair always yields the same
            # key; set iteration order is not guaranteed.
            count_reads[ '_'.join(sorted(rs)) ] += 1
            count_reads[ '__Double_hit:Double_feature_found' ] += 1
        else:
            count_reads[ '__Double_hit:Ambigous_features' ] += 1

    # Report the tallies sorted by category / feature name.
    count_reads['__Total_reads' ] = i
    com_coll = sorted(count_reads.items(), key=lambda pair: pair[0], reverse=False)
    return( com_coll )
Exemplo n.º 52
0
def count_reads_in_features( sam_filename, gff_filename, stranded, 
      overlap_mode, feature_type, id_attribute, quiet, minaqual, samout ):
   """Count reads per feature and print the tallies to standard output.

   Features whose type equals ``feature_type`` are loaded from the GFF file
   ``gff_filename`` and keyed by their ``id_attribute``.  Alignments are read
   from ``sam_filename`` (``"-"`` selects standard input); the ``paired_end``
   flag of the first record decides whether the input is handled as single
   reads or as read pairs.  Each read / pair ends up in exactly one tally:
   not aligned, alignment not unique (``NH`` > 1), below ``minaqual``,
   no feature, ambiguous, or one specific feature.

   sam_filename  -- SAM file path, or "-" for standard input.
   gff_filename  -- GFF file the features are read from.
   stranded      -- "no" builds an unstranded feature array; "reverse"
                    swaps which mate has its strand inverted before lookup.
   overlap_mode  -- "union", "intersection-strict" or
                    "intersection-nonempty"; anything else exits.
   feature_type  -- GFF feature type to load.
   id_attribute  -- GFF attribute used as feature identifier.
   quiet         -- suppress progress messages and HTSeq warnings.
   minaqual      -- reads with aQual below this value are tallied as
                    too_low_aQual.
   samout        -- path of an annotated SAM output file; "" disables it.

   Returns nothing; the table is printed with Python 2 print statements.
   Calls sys.exit() for a feature lacking ``id_attribute``, a feature
   without strand in stranded mode, or an illegal overlap mode.

   Relies on ``is_read_in_gene_interval``, ``invert_strand`` and
   ``UnknownChrom`` being defined elsewhere in the module.
   """
      
   def write_to_samout( r, assignment ):
      # Append the assignment as an XF:Z: tag to the original SAM line of the
      # read (or of each present mate of a pair); no-op without a samout file.
      if samoutfile is None:
         return
      if not pe_mode:
         r = (r,)
      for read in r:
         if read is not None:
            samoutfile.write( read.original_sam_line.rstrip() + 
               "\tXF:Z:" + assignment + "\n" )
      
   if quiet:
      warnings.filterwarnings( action="ignore", module="HTSeq" ) 
      
   if samout != "":
      samoutfile = open( samout, "w" )
   else:
      samoutfile = None
      
   # The feature array is stranded unless stranded == "no".
   features = HTSeq.GenomicArrayOfSets( "auto", stranded != "no" )     
   counts = {}
   ## added by CR
   # Per-feature tally of reads with NH > 1.  It is updated below, but the
   # print statement that would report it is commented out at the end.
   dict_nonunique = {}

   # Try to open samfile to fail early in case it is not there
   if sam_filename != "-":
      open( sam_filename ).close()
      
   # Load the features of the requested type and zero their counters.
   gff = HTSeq.GFF_Reader( gff_filename )   
   i = 0
   try:
      for f in gff:
         if f.type == feature_type:
            try:
               feature_id = f.attr[ id_attribute ]
            except KeyError:
               sys.exit( "Feature %s does not contain a '%s' attribute" % 
                  ( f.name, id_attribute ) )
            if stranded != "no" and f.iv.strand == ".":
               sys.exit( "Feature %s at %s does not have strand information but you are "
                  "running htseq-count in stranded mode. Use '--stranded=no'." % 
                  ( f.name, f.iv ) )
            features[ f.iv ] += feature_id
            counts[ f.attr[ id_attribute ] ] = 0
	    ##added by CR
	    dict_nonunique[ f.attr[ id_attribute ] ] = 0
         i += 1
         if i % 100000 == 0 and not quiet:
            sys.stderr.write( "%d GFF lines processed.\n" % i )
   except:
      # Report where the GFF reader was, then re-raise unchanged.
      sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() )
      raise
      
   if not quiet:
      sys.stderr.write( "%d GFF lines processed.\n" % i )
      
   if len( counts ) == 0 and not quiet:
      sys.stderr.write( "Warning: No features of type '%s' found.\n" % feature_type )
   
   # Peek at the first record to learn whether the data is paired-end.  For
   # standard input the consumed record is chained back in front of the rest.
   try:
      if sam_filename != "-":
         read_seq = HTSeq.SAM_Reader( sam_filename )
         first_read = iter(read_seq).next()
      else:
         read_seq = iter( HTSeq.SAM_Reader( sys.stdin ) )
         first_read = read_seq.next()
         read_seq = itertools.chain( [ first_read ], read_seq )
      pe_mode = first_read.paired_end
   except:
      sys.stderr.write( "Error occured when reading first line of sam file.\n" )
      raise

   try:
      if pe_mode:
         # Keep the underlying reader for line-number reporting on errors.
         read_seq_pe_file = read_seq
         read_seq = HTSeq.pair_SAM_alignments( read_seq )
      empty = 0
      ambiguous = 0
      notaligned = 0
      lowqual = 0
      nonunique = 0
      #added by SB
      # Name / intervals remembered from an earlier multi-mapped read.
      temp_read_name="NA"
      temp_interval_r0="NA"
      temp_interval_r1="NA"
      ## added by CR
      # NOTE(review): nonunique2 is assigned here and never read afterwards.
      nonunique2 = 0
      #added by SB
      i = 0   
      for r in read_seq:
         i += 1
         if not pe_mode:
            # Single-end: r is one alignment.
            if not r.aligned:
               notaligned += 1
               write_to_samout( r, "not_aligned" )
               continue
            try:
               if r.optional_field( "NH" ) > 1:
                  write_to_samout( r, "alignment_not_unique" )
                  nonunique += 1
                  continue
            except KeyError:
               # Presumably raised when the record carries no NH field; the
               # read is then treated as unique -- TODO confirm against HTSeq.
               pass
            if r.aQual < minaqual:
               lowqual += 1
               write_to_samout( r, "too_low_aQual" )
               continue
            if stranded != "reverse":
               iv_seq = ( co.ref_iv for co in r.cigar if co.type == "M" )
            else:
               iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar if co.type == "M" )            
         else:
            # Paired-end: r is a (first mate, second mate) pair; either mate
            # may be None.  The two mates get opposite strand handling.
            if r[0] is not None and r[0].aligned:
               if stranded != "reverse":
                  iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" )
               else:
                  iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" )
            else:
               iv_seq = tuple()
            if r[1] is not None and r[1].aligned:            
               if stranded != "reverse":
                  iv_seq = itertools.chain( iv_seq, 
                     ( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" ) )
               else:
                  iv_seq = itertools.chain( iv_seq, 
                     ( co.ref_iv for co in r[1].cigar if co.type == "M" ) )
            else:
               # Neither mate is aligned.
               if ( r[0] is None ) or not ( r[0].aligned ):
                  write_to_samout( r, "not_aligned" )
                  notaligned += 1
                  continue         
            # Pairs with NH > 1 are tallied as non-unique and skipped.
            # NOTE(review): the except KeyError that closes this try also
            # swallows a KeyError raised by the dict_nonunique updates inside
            # it; such a pair then continues to the quality check and the
            # normal counting path below.
            # NOTE(review): the "is not" tests below compare object identity,
            # not interval equality -- confirm that this is intended.
            try:
               if (( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \
                     ( r[1] is not None and r[1].optional_field( "NH" ) > 1 )):
                  nonunique += 1
                  # Only the first mate is present.
               	  if ( r[0] is not None and r[1] is None ):
			result, fs_new = is_read_in_gene_interval(r[0], features)
			if result:
				if ((temp_read_name != r[0].read.name) and ( temp_interval_r0 is not r[0].iv) ):
					temp_read_name=r[0].read.name
					temp_interval_r0=r[0].iv
				# Runs for every hit, not only when the name changed.
		  		dict_nonunique[ list(fs_new)[0]] += 1
                  # Only the second mate is present.
               	  if ( r[0] is None and r[1] is not None ):
			result, fs_new = is_read_in_gene_interval(r[1], features)
			if result:
 				if ((temp_read_name != r[1].read.name) and ( temp_interval_r1 is not r[1].iv) ):
					temp_read_name=r[1].read.name
					temp_interval_r1=r[1].iv

				# Runs for every hit, not only when the name changed.
		  		dict_nonunique[ list(fs_new)[0]] += 1
                  # Both mates are present: test each one separately.
               	  if ( r[0] is not None and r[1] is not None ):
			result1, fs_new1 = is_read_in_gene_interval(r[0], features)
			result2, fs_new2 = is_read_in_gene_interval(r[1], features)

			if result1 and not result2:
				if ((temp_read_name != r[0].read.name) and ( temp_interval_r0 is not r[0].iv) ):					
					temp_interval_r0=r[0].iv
					temp_read_name=r[0].read.name
					temp_interval_r0=r[0].iv
					# In this branch the tally is only updated when the read name changed.
		  			dict_nonunique[ list(fs_new1)[0]] += 1
			elif result2 and not result1:
				if ((temp_read_name != r[1].read.name)and ( temp_interval_r1 is not r[1].iv)):
					temp_read_name=r[1].read.name
					temp_interval_r1=r[1].iv
			  		dict_nonunique[ list(fs_new2)[0]] += 1
			elif result1 and result2:
				if ((temp_read_name != r[0].read.name) and (temp_interval_r0 is not r[0].iv ) and \
					( temp_interval_r1 is not r[1].iv) ):
					temp_read_name=r[0].read.name
					temp_interval_r0=r[0].iv
					temp_interval_r1=r[1].iv

					# Count each distinct feature once for the pair.
					if list(fs_new1)[0] !=  list(fs_new2)[0]:
						dict_nonunique[ list(fs_new1)[0]] += 1
						dict_nonunique[ list(fs_new2)[0]] += 1
					else:
						dict_nonunique[ list(fs_new1)[0]] += 1


                  # (An earlier experiment by SB that walked features[ r[0].iv ].steps()
                  # directly used to be kept here as commented-out code.)

                  write_to_samout( r, "alignment_not_unique" )
                  continue
            except KeyError:
               pass
            if ( r[0] and r[0].aQual < minaqual ) or ( r[1] and r[1].aQual < minaqual ):
               lowqual += 1
               write_to_samout( r, "too_low_aQual" )
               continue         
         
         # Resolve the set of features overlapped by the matched intervals.
         try:
            if overlap_mode == "union":
               # Every feature touched by any step of any interval.
               fs = set()
               for iv in iv_seq:
                  if iv.chrom not in features.chrom_vectors:
                     raise UnknownChrom
                  for iv2, fs2 in features[ iv ].steps():
		     fs = fs.union( fs2 )

            elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
               # Features common to the considered steps; in "nonempty" mode
               # steps without any feature are left out of the intersection.
               fs = None
               for iv in iv_seq:
                  if iv.chrom not in features.chrom_vectors:
                     raise UnknownChrom
                  for iv2, fs2 in features[ iv ].steps():
                     if len(fs2) > 0 or overlap_mode == "intersection-strict":
                        if fs is None:
                           fs = fs2.copy()
                        else:
                           fs = fs.intersection( fs2 )
            else:
               sys.exit( "Illegal overlap mode." )
            if fs is None or len( fs ) == 0:
               write_to_samout( r, "no_feature" )
               empty += 1
            elif len( fs ) > 1:
               write_to_samout( r, "ambiguous[" + '+'.join( fs ) + "]" )
               ambiguous += 1
            else:
               # Exactly one feature: this is the only case that is counted.
               write_to_samout( r, list(fs)[0] )

               counts[ list(fs)[0] ] += 1

         except UnknownChrom:
            # The read is skipped without touching any tally.
            if not pe_mode:
               rr = r 
            else: 
               rr = r[0] if r[0] is not None else r[1]
            if not quiet:
               sys.stderr.write( ( "Warning: Skipping read '%s', because chromosome " +
                  "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) % 
                  ( rr.read.name, iv.chrom ) )

         if i % 100000 == 0 and not quiet:
            sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )

   except:
      # Report the position in the SAM input, then re-raise unchanged.
      if not pe_mode:
         sys.stderr.write( "Error occured in %s.\n" % read_seq.get_line_number_string() )
      else:
         sys.stderr.write( "Error occured in %s.\n" % read_seq_pe_file.get_line_number_string() )
      raise

   if not quiet:
      sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )
         
   if samoutfile is not None:
      samoutfile.close()
   # Emit the per-feature table followed by the special counters.
   print "Gene\tUnique_reads"
   for fn in sorted( counts.keys() ):

	print "%s\t%d" % ( fn, counts[fn])

	# The dict_nonunique tally is not reported:
	##print "*%s\t%d" % (fn, dict_nonunique[fn])
   print "no_feature\t%d" % empty
   print "ambiguous\t%d" % ambiguous
   print "too_low_aQual\t%d" % lowqual
   print "not_aligned\t%d" % notaligned
   print "alignment_not_unique\t%d" % nonunique
Exemplo n.º 53
0
def count_reads_in_features(sam_filename,
                            gff_filename,
                            samtype,
                            order,
                            stranded,
                            overlap_mode,
                            feature_type,
                            id_attribute,
                            quiet,
                            minaqual,
                            samout,
                            include_non_annotated=False,
                            htseq_no_ambiguous=True):
    """
    Annotate the alignments of a SAM/BAM file with the feature they overlap
    and write the annotated records, tagged with XF, to ``samout``.

    This is taken from the function count_reads_in_features() from the 
    script htseq-count in the HTSeq package version 0.61.p2 
    The reason to do so is to fix two really small bugs related to the SAM output.
    The code of the function is small and simple so for now we
    will use the patched function here. A patch request has been sent
    to the HTSeq team.
    The description of the parameters are the same as htseq-count.
    Two parameters were added to filter out what to write in the sam output

    :param sam_filename: path of the input SAM/BAM file
    :param gff_filename: path of the GFF file holding the features
    :param samtype: "sam" or "bam"; any other value raises ValueError
    :param order: "name" or "pos", selects how paired-end mates are paired
    :param stranded: "no" builds an unstranded feature array; "reverse"
                     swaps which mate has its strand inverted before lookup
    :param overlap_mode: "union", "intersection-strict" or
                         "intersection-nonempty"
    :param feature_type: GFF feature type to load
    :param id_attribute: GFF attribute used as feature identifier
    :param quiet: accepted for signature compatibility; not used here
    :param minaqual: alignments with aQual below this value are assigned
                     "__too_low_aQual"
    :param samout: path of the annotated output file
    :param include_non_annotated: when True, "__no_feature" records are
                                  written to the output as well
    :param htseq_no_ambiguous: when True, "__ambiguous[...]" records are not
                               written to the output
    :return: the number of records written to ``samout``
    :raises ValueError: a feature lacks ``id_attribute`` or strand
                        information, unknown ``samtype``, illegal ``order``
    :raises RuntimeError: no feature of ``feature_type`` found, the first
                          record cannot be read, illegal ``overlap_mode``

    The filter settings, the output file and the record counter are kept as
    attributes of this function, as before, so the function is not
    re-entrant.  The output file is now closed on every exit path, including
    failures while the annotation is being loaded.

    The HTSEQ License
    HTSeq is free software: you can redistribute it and/or modify it under the terms of 
    the GNU General Public License as published by the Free Software Foundation, 
    either version 3 of the License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful, 
    but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 
    or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

    The full text of the GNU General Public License, version 3, 
    can be found here: http://www.gnu.org/licenses/gpl-3.0-standalone.html
    """
    # Set up the filters
    count_reads_in_features.filter_htseq = [
        "__too_low_aQual", "__not_aligned", "__alignment_not_unique"]
    if not include_non_annotated:
        count_reads_in_features.filter_htseq.append("__no_feature")
    count_reads_in_features.filter_htseq_no_ambiguous = htseq_no_ambiguous

    # Open SAM output file, using the input file as header template
    flag_write = "wb" if samtype == "bam" else "wh"
    flag_read = "rb" if samtype == "bam" else "r"
    saminfile = pysam.AlignmentFile(sam_filename, flag_read)
    try:
        count_reads_in_features.samoutfile = pysam.AlignmentFile(
            samout, flag_write, template=saminfile)
    finally:
        # The template is only needed while the output file is created.
        saminfile.close()
    # Counter of annotated records
    count_reads_in_features.annotated = 0

    # Function to write to SAM output
    def write_to_samout(r, assignment):
        # Drop the assignments that were filtered out by the caller.
        if assignment in count_reads_in_features.filter_htseq:
            return
        if count_reads_in_features.filter_htseq_no_ambiguous \
                and "__ambiguous" in assignment:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                sam_record = read.to_pysam_AlignedRead(
                    count_reads_in_features.samoutfile)
                sam_record.set_tag("XF", assignment, "Z")
                count_reads_in_features.samoutfile.write(sam_record)
                count_reads_in_features.annotated += 1

    # Reference intervals of the non-empty "M" CIGAR operations of an
    # alignment, strand-inverted when ``flip`` is true.
    def matched_intervals(alignment, flip):
        if flip:
            return (invert_strand(co.ref_iv) for co in alignment.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in alignment.cigar
                if co.type == "M" and co.size > 0)

    try:
        # Annotation objects
        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        feature_ids = set()
        for f in HTSeq.GFF_Reader(gff_filename):
            if f.type != feature_type:
                continue
            try:
                feature_id = f.attr[id_attribute]
            except KeyError:
                raise ValueError("Feature %s does not contain a '%s' attribute"
                                 % (f.name, id_attribute))
            if stranded != "no" and f.iv.strand == ".":
                raise ValueError("Feature %s at %s does not have strand information but you are "
                                 "running htseq-count in stranded mode. Use '--stranded=no'." %
                                 (f.name, f.iv))
            features[f.iv] += feature_id
            feature_ids.add(feature_id)

        if len(feature_ids) == 0:
            raise RuntimeError("No features of type '%s' found.\n" % feature_type)

        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError("Unknown input format %s specified." % samtype)

        # The first record tells whether the data is paired-end.
        try:
            read_seq = SAM_or_BAM_Reader(sam_filename)
            first_read = next(iter(read_seq))
            pe_mode = first_read.paired_end
        except Exception:
            raise RuntimeError("Error occurred when reading beginning of SAM/BAM file.")

        if pe_mode:
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            elif order == "pos":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
            else:
                raise ValueError("Illegal order specified.")

        reverse = stranded == "reverse"

        for r in read_seq:

            if not pe_mode:
                if not r.aligned:
                    write_to_samout(r, "__not_aligned")
                    continue

                try:
                    if r.optional_field("NH") > 1:
                        write_to_samout(r, "__alignment_not_unique")
                        continue
                except KeyError:
                    pass

                if r.aQual < minaqual:
                    write_to_samout(r, "__too_low_aQual")
                    continue

                iv_seq = matched_intervals(r, reverse)
            else:
                # The two mates get opposite strand handling.
                if r[0] is not None and r[0].aligned:
                    iv_seq = matched_intervals(r[0], reverse)
                else:
                    iv_seq = tuple()

                if r[1] is not None and r[1].aligned:
                    iv_seq = itertools.chain(
                        iv_seq, matched_intervals(r[1], not reverse))
                elif (r[0] is None) or not (r[0].aligned):
                    # Neither mate is aligned.
                    write_to_samout(r, "__not_aligned")
                    continue

                try:
                    if (r[0] is not None and r[0].optional_field("NH") > 1) \
                    or (r[1] is not None and r[1].optional_field("NH") > 1):
                        write_to_samout(r, "__alignment_not_unique")
                        continue
                except KeyError:
                    pass

                if (r[0] and r[0].aQual < minaqual) or (r[1] and
                                                        r[1].aQual < minaqual):
                    write_to_samout(r, "__too_low_aQual")
                    continue

            try:
                if overlap_mode == "union":
                    # Every feature touched by any step of any interval.
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                    # Features common to the considered steps; in "nonempty"
                    # mode steps without any feature are left out.
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    raise RuntimeError("Illegal overlap mode.")

                if fs is None or len(fs) == 0:
                    write_to_samout(r, "__no_feature")
                elif len(fs) > 1:
                    write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
                else:
                    write_to_samout(r, list(fs)[0])

            except UnknownChrom:
                # Chromosome absent from the annotation: no feature can match.
                write_to_samout(r, "__no_feature")

    finally:
        count_reads_in_features.samoutfile.close()

    return count_reads_in_features.annotated