def __main__():
    """Filter a GFF file, keeping only features whose count of sub-features
    named `feature_name` satisfies `condition` (e.g. ">2").

    Command-line args: input_name output_name feature_name condition.
    Writes kept features to output_name; prints a summary to stdout and
    errors to stderr.
    """
    # Get args.
    input_name = sys.argv[1]
    output_name = sys.argv[2]
    feature_name = sys.argv[3]
    condition = sys.argv[4]

    # Unescape operations in condition str.
    for key, value in mapped_ops.items():
        condition = condition.replace(key, value)

    # Error checking: condition should be of the form <operator><number>
    for op in ops:
        if op in condition:
            empty, number_str = condition.split(op)
            try:
                number = float(number_str)
            except ValueError:
                number = None
            # BUG FIX: the original tested `not number`, which rejected the
            # legitimate operand 0 (e.g. condition ">0") since 0.0 is falsy.
            if empty != "" or number is None:
                print("Invalid condition: %s, cannot filter." % condition, file=sys.stderr)
                return
            break

    # Do filtering.
    kept_features = 0
    skipped_lines = 0
    first_skipped_line = 0
    i = 0  # Defined up front so an empty input cannot raise NameError below.
    with open(output_name, 'w') as out:
        for i, feature in enumerate(GFFReaderWrapper(open(input_name))):
            if not isinstance(feature, GenomicInterval):
                continue
            count = 0
            for interval in feature.intervals:
                if interval.feature == feature_name:
                    count += 1
            eval_text = '%s %s' % (count, condition)
            # check_expression must vet the text before eval() runs it;
            # eval on unvetted input would execute arbitrary code.
            if not check_expression(eval_text):
                print("Invalid condition: %s, cannot filter." % condition, file=sys.stderr)
                sys.exit(1)
            if eval(eval_text):
                # Keep feature.
                for interval in feature.intervals:
                    out.write("\t".join(interval.fields) + '\n')
                kept_features += 1

    # Needed because i is 0-based but want to display stats using 1-based.
    # Also guarantees the divisor below is never zero for an empty input.
    i += 1

    # Summary message.
    info_msg = "%i of %i features kept (%.2f%%) using condition %s. " % \
        (kept_features, i, float(kept_features) / i * 100.0, feature_name + condition)
    if skipped_lines > 0:
        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." % (
            skipped_lines, first_skipped_line)
    print(info_msg)
def main():
    """Build an interval index for a GFF file.

    Command-line args: input_fname out_fname. Each GenomicInterval feature is
    added to the index keyed by its (chrom, start, end) in BED coordinates,
    mapped to the feature's byte offset in the input file.
    """
    # Arguments
    input_fname, out_fname = sys.argv[1:]

    # Do conversion.
    index = Indexes()
    offset = 0
    reader_wrapper = GFFReaderWrapper(fileinput.FileInput(input_fname), fix_strand=True)
    # Iterate lazily; the original wrapped the reader in list(), materializing
    # the whole file in memory for no benefit.
    for feature in reader_wrapper:
        # Add feature; index expects BED coordinates.
        if isinstance(feature, GenomicInterval):
            convert_gff_coords_to_bed(feature)
            index.add(feature.chrom, feature.start, feature.end, offset)
        # Always increment offset, even if feature is not an interval and hence
        # not included in the index.
        offset += feature.raw_size
    # Close the output handle deterministically (the original leaked it).
    with open(out_fname, "wb") as out:
        index.write(out)
def main():
    """Create a padded, sorted name-to-location index from a GFF/GTF/BED file.

    Command-line args: in_fname out_fname, plus -F/--format (gff, gtf, or bed).
    Each output line is "<lowercased name>\t<name>\t<contig:start-end>", padded
    to a fixed width; the first line records that width + 1.
    """
    # Process arguments.
    parser = optparse.OptionParser()
    parser.add_option('-F', '--format', dest="input_format")
    (options, args) = parser.parse_args()
    in_fname, out_fname = args
    input_format = options.input_format.lower()

    # Create dict of name-location pairings.
    name_loc_dict = {}
    if input_format in ['gff', 'gtf']:
        # GTF/GFF format
        # Create reader.
        if input_format == 'gff':
            in_reader = GFFReaderWrapper(open(in_fname, 'r'))
        else:  # input_format == 'gtf'
            in_reader = read_unordered_gtf(open(in_fname, 'r'))

        for feature in in_reader:
            if isinstance(feature, Comment):
                continue
            # BUG FIX: the original called convert_gff_coords_to_bed once per
            # non-numeric attribute value, shifting the feature's coordinates
            # repeatedly. Convert at most once per feature.
            converted = False
            for name in feature.attributes:
                val = feature.attributes[name]
                try:
                    # Purely numeric values are not useful as names; skip them.
                    float(val)
                    continue
                except ValueError:
                    pass
                if not converted:
                    convert_gff_coords_to_bed(feature)
                    converted = True
                # Value is not a number, so it can be indexed.
                if val not in name_loc_dict:
                    # Value is not in dictionary.
                    name_loc_dict[val] = {
                        'contig': feature.chrom,
                        'start': feature.start,
                        'end': feature.end
                    }
                else:
                    # Value already in dictionary, so widen the stored span.
                    loc = name_loc_dict[val]
                    if feature.start < loc['start']:
                        loc['start'] = feature.start
                    if feature.end > loc['end']:
                        loc['end'] = feature.end
    elif input_format == 'bed':
        # BED format.
        for line in open(in_fname, 'r'):
            # Ignore track lines.
            if line.startswith("track"):
                continue
            fields = line.split()
            # Ignore lines with no feature name.
            if len(fields) < 4:
                continue
            # Process line
            name_loc_dict[fields[3]] = {
                'contig': fields[0],
                'start': int(fields[1]),
                'end': int(fields[2])
            }

    # Create sorted list of entries. sorted() over the dict is equivalent to
    # the original's Python-2-only dict.iterkeys() and also works on Python 3.
    max_len = 0
    entries = []
    for name in sorted(name_loc_dict):
        loc = name_loc_dict[name]
        entry = '%s\t%s\t%s' % (name.lower(), name,
                                '%s:%i-%i' % (loc['contig'], loc['start'], loc['end']))
        if len(entry) > max_len:
            max_len = len(entry)
        entries.append(entry)

    # Write padded entries; close the handle deterministically.
    with open(out_fname, 'w') as out:
        out.write(str(max_len + 1).ljust(max_len) + '\n')
        for entry in entries:
            out.write(entry.ljust(max_len) + '\n')
#!/usr/bin/env python
# This tool takes a gff file as input and counts the number of features in it.

import sys, fileinput
from galaxy import eggs
from galaxy.datatypes.util.gff_util import GFFReaderWrapper
from bx.intervals.io import GenomicInterval

# Get args. fileinput.FileInput accepts the list of filenames directly.
input_file = sys.argv[1:]

# Count features: only GenomicInterval objects count; comments and other
# reader output are ignored.
count = sum(1 for feature in GFFReaderWrapper(fileinput.FileInput(input_file), fix_strand=True)
            if isinstance(feature, GenomicInterval))
# print(count) with a single argument behaves identically on Python 2 and 3;
# the original `print count` statement is a SyntaxError under Python 3.
print(count)