def metagene_bin():
    """Main program for metagene_bin.py.

    For every input file named in the parsed arguments, the per-position
    counts of each gene line are summed over sliding windows and one CSV row
    per window is appended to the output file selected by the line's
    ``orientation:gap`` key (second column of the counts line).

    Raises:
        ValueError: if the window size or the step size is smaller than 1.
            A non-positive step never advances the window, which would
            otherwise loop forever while appending rows.
    """
    arguments = get_arguments()
    window_size = arguments.window_size
    step_size = arguments.step_size

    # Validate before any output file is truncated.
    if window_size < 1 or step_size < 1:
        raise ValueError(
            "Window size and step size must be at least 1 "
            "(got window_size={}, step_size={})".format(window_size, step_size))

    for infile in arguments.input:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Processing file:\t{}".format(infile))

        # returns a dict of file names with keys of orientation:gap_counting
        output_files = build_output_filenames(infile,
                                              arguments.output_prefix,
                                              window_size,
                                              step_size,
                                              arguments.separate_groups)

        with open(infile, 'r') as inf:
            # First line of the input is copied verbatim to every output file.
            metagene = inf.readline()
            for output_name in output_files.values():
                with open(output_name, 'w') as outf:
                    outf.write(metagene)  # needed for plotting
                    outf.write(
                        "Gene,Orientation,Gapped,Window,Inclusive_Start,Inclusive_End,Abundance\n")

            header = inf.readline().strip().split(",")
            positions = header[2:]  # positions relative to gene start

            for counts_line in read_chunk(inf, 1024):
                counts_parts = counts_line.strip().split(",")
                counts = counts_parts[2:]
                length = len(counts)
                (orientation, gap) = counts_parts[1].split(":")
                row_prefix = "{},{},{}".format(counts_parts[0], orientation, gap)

                # Collect all rows for this gene line so the output file is
                # opened once per line instead of once per window.
                rows = []
                window = 0
                exclusive_end = window_size
                while exclusive_end <= length:
                    inclusive_start = exclusive_end - window_size
                    coverage = 0.0
                    for value in counts[inclusive_start:exclusive_end]:
                        coverage += float(value)
                    rows.append("{},{},{},{},{}\n".format(
                        row_prefix,
                        window,
                        positions[inclusive_start],
                        positions[exclusive_end - 1],
                        coverage))
                    window += 1
                    exclusive_end += step_size

                # Lines too short for a single window touch no file at all.
                if rows:
                    with open(output_files[counts_parts[1]], 'a') as outf:
                        outf.write("".join(rows))
def metagene_bin():
    """Main program for metagene_bin.py.

    For every input file named in the parsed arguments, the per-position
    counts of each gene line are summed over sliding windows and one CSV row
    per window is appended to the output file selected by the line's
    ``orientation:gap`` key (second column of the counts line).

    Raises:
        ValueError: if the window size or the step size is smaller than 1.
            A non-positive step never advances the window, which would
            otherwise loop forever while appending rows.
    """
    arguments = get_arguments()
    window_size = arguments.window_size
    step_size = arguments.step_size

    # Validate before any output file is truncated.
    if window_size < 1 or step_size < 1:
        raise ValueError(
            "Window size and step size must be at least 1 "
            "(got window_size={}, step_size={})".format(window_size, step_size))

    for infile in arguments.input:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Processing file:\t{}".format(infile))

        # returns a dict of file names with keys of orientation:gap_counting
        output_files = build_output_filenames(infile,
                                              arguments.output_prefix,
                                              window_size,
                                              step_size,
                                              arguments.separate_groups)

        with open(infile, 'r') as inf:
            # First line of the input is copied verbatim to every output file.
            metagene = inf.readline()
            for output_name in output_files.values():
                with open(output_name, 'w') as outf:
                    outf.write(metagene)  # needed for plotting
                    outf.write(
                        "Gene,Orientation,Gapped,Window,Inclusive_Start,Inclusive_End,Abundance\n")

            header = inf.readline().strip().split(",")
            positions = header[2:]  # positions relative to gene start

            for counts_line in read_chunk(inf, 1024):
                counts_parts = counts_line.strip().split(",")
                counts = counts_parts[2:]
                length = len(counts)
                (orientation, gap) = counts_parts[1].split(":")
                row_prefix = "{},{},{}".format(counts_parts[0], orientation, gap)

                # Collect all rows for this gene line so the output file is
                # opened once per line instead of once per window.
                rows = []
                window = 0
                exclusive_end = window_size
                while exclusive_end <= length:
                    inclusive_start = exclusive_end - window_size
                    coverage = 0.0
                    for value in counts[inclusive_start:exclusive_end]:
                        coverage += float(value)
                    rows.append("{},{},{},{},{}\n".format(
                        row_prefix,
                        window,
                        positions[inclusive_start],
                        positions[exclusive_end - 1],
                        coverage))
                    window += 1
                    exclusive_end += step_size

                # Lines too short for a single window touch no file at all.
                if rows:
                    with open(output_files[counts_parts[1]], 'a') as outf:
                        outf.write("".join(rows))
def metagene_count():
    """Chain of command for metagene_count analysis.

    Configures the Read and Feature classes from the parsed arguments, builds
    the metagene template, then, for every non-comment record of the feature
    file, pulls the overlapping alignments with ``samtools view``, counts
    them against the feature and appends the feature's metagene row to
    ``<output_prefix>.metagene_counts.csv``.

    Raises:
        MetageneError: if the metagene template or the feature format cannot
            be set up, or if the region of a feature cannot be pulled from
            the alignment file.
    """
    arguments = get_arguments()
    counts_path = "{}.metagene_counts.csv".format(arguments.output_prefix)

    # confirm BAM file and extract chromosome sizes
    Read.set_chromosome_sizes(arguments.alignment)

    ##TODO: create a list of chromosomes to analyze and/or exclude
    # create chromosome conversion dictionary for feature (GFF/BED) to alignment (BAM)
    Feature.set_chromosome_conversion(arguments.chromosome_names,
                                      Read.chromosome_sizes.keys())

    # define has_abundance and has_mappings tags for Read class
    # (raw strings: "\d" is not a valid escape in an ordinary string literal)
    Read.set_sam_tag(arguments.extract_abundance, arguments.alignment, r"NA:i:(\d+)")
    Read.set_sam_tag(arguments.extract_mappings, arguments.alignment, r"NH:i:(\d+)")

    # define the metagene array shape (left padding, start, internal, end, right padding)
    # metagene = padding ---- internal region ---- padding
    try:
        metagene = Metagene(arguments.interval_size, arguments.padding, arguments.padding)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Metagene definition:\t{}".format(metagene))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the metagene template")

    try:
        Feature.set_format(arguments.feature)  # assign file format for the feature file
        print("Reading feature file as {} format".format(Feature.format))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the feature object")

    # print out the header line...
    # NOTE(review): when interval_variable is set the counts file is never
    # truncated here, so rows are appended to any existing file of the same
    # name -- confirm this is intended.
    if not arguments.interval_variable:
        with open(counts_path, 'w') as output_file:
            output_file.write("# Metagene:\t{}\n".format(metagene))  # define for plotting later
            output_file.write(metagene.print_full())

    # for each feature
    with open(arguments.feature, 'r') as feature_file:
        for feature_line in read_chunk(feature_file, 1024):
            # skip comment lines; an empty record is skipped as well because
            # indexing its first character would raise IndexError
            if not feature_line or feature_line[0] == "#":
                continue

            # change creation with feature_method
            feature = Feature.create(arguments.feature_count,
                                     metagene,
                                     feature_line,
                                     arguments.count_splicing,
                                     arguments.ignore_strand)

            # pull out sam file lines; it is important to use Feature.get_samtools_region(chromosome_lengths) rather
            # than Feature.get_chromosome_region() because only the first ensures that the interval does not
            # extend beyond the length of the chromosome which makes samtools view return no reads
            (run_pipe_worked, sam_sample) = run_pipe(
                ['samtools view {} {}'.format(arguments.alignment,
                                              feature.get_samtools_region())])

            if not run_pipe_worked:
                raise MetageneError(
                    "Could not pull chromosomal region {} for feature {} from BAM file {}.".format(
                        feature.get_chromosome_region(),
                        feature.name,
                        arguments.alignment))

            for samline in sam_sample:
                if len(samline) == 0:
                    continue

                # create Read feature
                (created_read, read) = Read.create_from_sam(
                    samline,
                    Feature.chromosome_conversion.values(),
                    arguments.count_method,
                    arguments.uniquely_mapping,
                    arguments.ignore_strand,
                    arguments.count_secondary_alignments,
                    arguments.count_failed_quality_control,
                    arguments.count_PCR_optical_duplicate,
                    arguments.count_supplementary_alignment)

                # count read (if it exists)
                if created_read:
                    feature.count_read(read,
                                       arguments.count_method,
                                       arguments.count_splicing,
                                       arguments.count_partial_reads,
                                       arguments.ignore_strand)

            # output the resulting metagene
            with open(counts_path, 'a') as output_file:
                output_file.write("{}\n".format(
                    feature.print_metagene(interval_override=arguments.interval_variable)))
def metagene_count():
    """Chain of command for metagene_count analysis.

    Configures the Read and Feature classes from the parsed arguments, builds
    the metagene template, then, for every non-comment record of the feature
    file, pulls the overlapping alignments with ``samtools view``, counts
    them against the feature and appends the feature's metagene row to
    ``<output_prefix>.metagene_counts.csv``.

    Raises:
        MetageneError: if the metagene template or the feature format cannot
            be set up, or if the region of a feature cannot be pulled from
            the alignment file.
    """
    arguments = get_arguments()
    counts_path = "{}.metagene_counts.csv".format(arguments.output_prefix)

    # confirm BAM file and extract chromosome sizes
    Read.set_chromosome_sizes(arguments.alignment)

    ##TODO: create a list of chromosomes to analyze and/or exclude
    # create chromosome conversion dictionary for feature (GFF/BED) to alignment (BAM)
    Feature.set_chromosome_conversion(arguments.chromosome_names,
                                      Read.chromosome_sizes.keys())

    # define has_abundance and has_mappings tags for Read class
    # (raw strings: "\d" is not a valid escape in an ordinary string literal)
    Read.set_sam_tag(arguments.extract_abundance, arguments.alignment, r"NA:i:(\d+)")
    Read.set_sam_tag(arguments.extract_mappings, arguments.alignment, r"NH:i:(\d+)")

    # define the metagene array shape (left padding, start, internal, end, right padding)
    # metagene = padding ---- internal region ---- padding
    try:
        metagene = Metagene(arguments.interval_size, arguments.padding, arguments.padding)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Metagene definition:\t{}".format(metagene))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the metagene template")

    try:
        Feature.set_format(arguments.feature)  # assign file format for the feature file
        print("Reading feature file as {} format".format(Feature.format))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the feature object")

    # print out the header line...
    # NOTE(review): when interval_variable is set the counts file is never
    # truncated here, so rows are appended to any existing file of the same
    # name -- confirm this is intended.
    if not arguments.interval_variable:
        with open(counts_path, 'w') as output_file:
            output_file.write("# Metagene:\t{}\n".format(metagene))  # define for plotting later
            output_file.write(metagene.print_full())

    # for each feature
    with open(arguments.feature, 'r') as feature_file:
        for feature_line in read_chunk(feature_file, 1024):
            # skip comment lines; an empty record is skipped as well because
            # indexing its first character would raise IndexError
            if not feature_line or feature_line[0] == "#":
                continue

            # change creation with feature_method
            feature = Feature.create(arguments.feature_count,
                                     metagene,
                                     feature_line,
                                     arguments.count_splicing,
                                     arguments.ignore_strand)

            # pull out sam file lines; it is important to use Feature.get_samtools_region(chromosome_lengths) rather
            # than Feature.get_chromosome_region() because only the first ensures that the interval does not
            # extend beyond the length of the chromosome which makes samtools view return no reads
            (run_pipe_worked, sam_sample) = run_pipe(
                ['samtools view {} {}'.format(arguments.alignment,
                                              feature.get_samtools_region())])

            if not run_pipe_worked:
                raise MetageneError(
                    "Could not pull chromosomal region {} for feature {} from BAM file {}.".format(
                        feature.get_chromosome_region(),
                        feature.name,
                        arguments.alignment))

            for samline in sam_sample:
                if len(samline) == 0:
                    continue

                # create Read feature
                (created_read, read) = Read.create_from_sam(
                    samline,
                    Feature.chromosome_conversion.values(),
                    arguments.count_method,
                    arguments.uniquely_mapping,
                    arguments.ignore_strand,
                    arguments.count_secondary_alignments,
                    arguments.count_failed_quality_control,
                    arguments.count_PCR_optical_duplicate,
                    arguments.count_supplementary_alignment)

                # count read (if it exists)
                if created_read:
                    feature.count_read(read,
                                       arguments.count_method,
                                       arguments.count_splicing,
                                       arguments.count_partial_reads,
                                       arguments.ignore_strand)

            # output the resulting metagene
            with open(counts_path, 'a') as output_file:
                output_file.write("{}\n".format(
                    feature.print_metagene(interval_override=arguments.interval_variable)))