def GFFGetDuplicateSGDs(gff_data):
    """Return GFF data with duplicate SGD names

    Returns an OrderedDictionary where the keys are the duplicate
    SGD names, e.g.

    dups = GFFGetDuplicateSGDs(gff)
    for sgd in dups.keys():
       ... loops over SGD names ...
       for data in dups[sgd]:
          ... loops over duplicates ...

    Note that duplicates are determined purely by SGD name; no
    account is taken of chromosome or strand.

    Arguments:
      gff_data: a GFFFile object containing the GFF file data

    Returns:
      OrderedDictionary with SGDs as keys for lists of the duplicate
      TabDataLines corresponding to the SGD
    """
    # Use ordered dictionary to sort info on duplicates
    duplicates = OrderedDictionary()
    # Process data line-by-line, collecting all lines for each SGD name
    for data in gff_data:
        attributes = data['attributes']
        if 'SGD' in attributes:
            # Store data
            sgd = attributes['SGD']
            if sgd != '':
                # Check for duplicates
                if sgd in duplicates:
                    duplicates[sgd].append(data)
                else:
                    duplicates[sgd] = [data]
    # Filter out true duplicates i.e. SGDs with at least two
    # GFF data lines
    # NB iterate over a snapshot of the keys: deleting entries while
    # iterating the live key sequence can skip items (and is an error
    # for dict-like objects under Python 3 semantics)
    for sgd in list(duplicates.keys()):
        if len(duplicates[sgd]) < 2:
            del(duplicates[sgd])
    # Finished
    return duplicates
def main(): """Main program """ # Set up logging format logging.basicConfig(format='%(levelname)s: %(message)s') p = optparse.OptionParser(usage="%prog [options] <file>.gff", version="%prog "+__version__, description= "Utility to perform various 'cleaning' operations on a GFF file.") p.add_option('-o',action='store',dest='output_gff',default=None, help="Name of output GFF file (default is '<file>_clean.gff')") p.add_option('--prepend',action='store',dest='prepend_str',default=None, help="String to prepend to seqname in first column") p.add_option('--clean',action='store_true',dest='do_clean', help="Perform all the 'cleaning' manipulations on the input data (equivalent " "to specifying all of --clean-score, --clean-replace-attributes, " "--clean-exclude-attributes and --clean-group-sgds)") p.add_option('--clean-score',action='store_true',dest='do_clean_score', help="Replace 'Anc_*' and blanks in 'score' field with zeroes") p.add_option('--clean-replace-attributes',action='store_true', dest='do_clean_replace_attributes', help="Replace 'ID', 'Gene', 'Parent' and 'Name' attributes with the value " "of the SGD attribute, if present") p.add_option('--clean-exclude-attributes',action='store_true', dest='do_clean_exclude_attributes', help="Remove the 'kaks', 'kaks2' and 'ncbi' attributes (to remove " "arbitrary attributes, see the --remove-attribute=... 
option)") p.add_option('--clean-group-sgds',action='store_true',dest='do_clean_group_sgds', help="Group features with the same SGD by adding unique numbers to the 'ID' " "attributes; IDs will have the form 'CDS:<SGD>:<n>' (where n is a unique " "number for a given SGD)") p.add_option('--report-duplicates',action='store_true',dest='report_duplicates', help="Report duplicate SGD names and write list to <file>_duplicates.gff " "with line numbers, chromosome, start coordinate and strand.") p.add_option('--resolve-duplicates',action='store',dest='mapping_file',default=None, help="Resolve duplicate SGDs by matching against 'best' genes in the supplied " "mapping file; other non-matching genes are discarded and written to " "<file>_discarded.gff.") p.add_option('--discard-unresolved',action='store_true',dest='discard_unresolved', help="Discard any unresolved duplicates, which are written to " "<file>_unresolved.gff.") p.add_option('--insert-missing',action='store',dest='gene_file',default=None, help="Insert genes from gene file with SGD names that don't appear in the " "input GFF. If GENE_FILE is blank ('='s must still be present) then the mapping " "file supplied with the --resolve-duplicates option will be used instead.") p.add_option('--add-exon-ids',action='store_true',dest='add_exon_ids',default=False, help="For exon features without an ID attribute, construct and insert an " "ID of the form 'exon:<Parent>:<n>' (where n is a unique number).") p.add_option('--add-missing-ids',action='store_true',dest='add_missing_ids',default=False, help="For features without an ID attribute, construct and insert a " "generated ID of the form '<feature>:<Parent>:<n>' (where n is a unique " "number).") p.add_option('--no-percent-encoding',action='store_true',dest='no_encoding',default=False, help="Convert encoded attributes to the correct characters in " "the output GFF. 
WARNING this may result in a non-cannonical GFF that can't " "be read correctly by this or other programs.") p.add_option('--remove-attribute',action='append',dest='rm_attr', help="Remove attribute RM_ATTR from the list of attributes for all records " "in the GFF file (can be specified multiple times)") p.add_option('--strict-attributes',action='store_true',dest='strict_attributes', help="Remove attributes that don't conform to the KEY=VALUE format") p.add_option('--debug',action='store_true',dest='debug', help="Print debugging information") # Process the command line options,arguments = p.parse_args() # Check for debugging if options.debug: # Turn on debugging output logging.getLogger().setLevel(logging.DEBUG) # Input files if len(arguments) != 1: p.error("input GFF file required") else: infile = arguments[0] if not os.path.exists(infile): p.error("Input file '%s' not found" % infile) # Report version p.print_version() # Set flags based on command line # String to prepend to first column prepend_str = options.prepend_str # Cleaning options if options.do_clean: # Update values in the "score" column clean_score = True # Clean up the "attributes" column clean_replace_attributes = True clean_exclude_attributes = True # Set ID field in "attributes" to group lines with matching SGDs group_SGDs = True else: # Set options based on user input clean_score = options.do_clean_score clean_replace_attributes = options.do_clean_replace_attributes clean_exclude_attributes = options.do_clean_exclude_attributes group_SGDs = options.do_clean_group_sgds # Report duplicate names report_duplicates = options.report_duplicates # Resolve duplicated genes using CDS file if options.mapping_file is not None: resolve_duplicates = True cdsfile = options.mapping_file else: resolve_duplicates = False cdsfile = None # Discard unresolved duplicates discard_unresolved = options.discard_unresolved # Insert missing genes if options.gene_file is not None: insert_missing = True if options.gene_file: 
genefile = options.gene_file else: genefile = cdsfile else: insert_missing = False genefile = None # Add an artificial exon ID attribute add_exon_ids = options.add_exon_ids # Add generated ID attributes add_missing_ids = options.add_missing_ids # Suppress encoding of attributes on output no_attribute_encoding = options.no_encoding # Remove attributes that don't conform to KEY=VALUE format strict_attributes = options.strict_attributes # Name for output files ##outext = os.path.splitext(os.path.basename(infile))[1] if not options.output_gff: outbase = os.path.splitext(os.path.basename(infile))[0] outfile = outbase+'_clean.gff' else: outbase = os.path.splitext(os.path.basename(options.output_gff))[0] outfile = options.output_gff print "Input : %s" % infile print "Output: %s" % outfile dupfile = outbase+'_duplicates.txt' delfile = outbase+'_discarded.gff' unresfile = outbase+'_unresolved.gff' # Read in data from file gff_data = GFFFile(infile) # Prepend string to seqname column if prepend_str is not None: print "Prepending '%s' to values in 'seqname' column" % prepend_str for data in gff_data: data['seqname'] = prepend_str+str(data['seqname']) # Check/clean score column values if clean_score: print "Replacing 'Anc_*' and blanks with '0's in 'score' column" score_unexpected_values = [] for data in gff_data: try: # Numerical value score = float(data['score']) if score != 0: score_unexpected_values.append(data['score']) except ValueError: # String value if data['score'].strip() != '' and not data['score'].startswith('Anc_'): score_unexpected_values.append(data['score']) # Replace "Anc_*" or blank values in "score" column with zero if data['score'].startswith('Anc_') or data['score'].strip() == '': data['score'] = '0' # Report unexpected values n = len(score_unexpected_values) if n > 0: logging.warning("%d 'score' values that are not '', 0 or 'Anc_*'" % n) logging.warning("Other values: %s" % score_unexpected_values) # Clean up the data in "attributes" column: replace keys 
if clean_replace_attributes: # Initialise mapping of keys from input to output in "attributes" column # where new values are required etc attributes_key_map = OrderedDictionary() attributes_key_map['ID'] = 'SGD' attributes_key_map['Gene'] = 'SGD' attributes_key_map['Parent'] = 'SGD' attributes_key_map['Name'] = 'SGD' attributes_dont_replace_with_empty_data = True print "Cleaning up attributes: replacing keys:" for key in attributes_key_map.keys(): print "\t%s -> %s" % (key,attributes_key_map[key]) if attributes_dont_replace_with_empty_data: print "(Replacement will be skipped if new data is missing/blank)" GFFUpdateAttributes(gff_data,attributes_key_map,[], attributes_dont_replace_with_empty_data) # Clean up the data in "attributes" column: exclude keys if clean_exclude_attributes: # List of keys to exclude attributes_exclude_keys = ['kaks','kaks2','ncbi'] print "Excluding keys:" for key in attributes_exclude_keys: print "\t%s" % key GFFUpdateAttributes(gff_data,{},attributes_exclude_keys,True) # Set the IDs for consecutive lines with matching SGD names, to indicate that # they're in the same gene if group_SGDs: print "Grouping SGDs by setting ID's for consecutive lines with the same SGD values" GFFGroupSGDs(gff_data) # Find duplicates in input file if report_duplicates or resolve_duplicates: duplicate_sgds = GFFGetDuplicateSGDs(gff_data) if report_duplicates: # Write to duplicates file print "Writing duplicate SGD names to %s" % dupfile fd = open(dupfile,'w') ndup = 0 ngroups = 0 for sgd in duplicate_sgds.keys(): assert(len(duplicate_sgds[sgd]) > 1) ndup += 1 fd.write("%s\t" % sgd) for data in duplicate_sgds[sgd]: # Write the line number, chromosome, start and strand data line = ';'.join(('L'+str(data.lineno()), str(data['seqname']),str(data['start']),str(data['end']))) fd.write("\t%s" % line) fd.write("\n") logging.debug("%s\t%s" % (sgd,duplicate_sgds[sgd])) for group in GroupGeneSubsets(duplicate_sgds[sgd]): if len(group) > 1: ngroups += 1 if ndup == 0: 
fd.write("No duplicate SGDs\n") fd.close() print "%d duplicates found (of which %d are trivial)" % (ndup,ngroups) if resolve_duplicates: print "Resolving duplicate SGDs using data from %s" % cdsfile print "Discarded genes will be written to %s" % delfile # Get data on best gene mappings from CDS file # Format is tab-delimited, each line has: # orf chr start end strand mapping = TabFile(cdsfile,column_names=('name','chr','start','end','strand')) # Overlap margin overlap_margin = 1000 # Perform resolution result = GFFResolveDuplicateSGDs(gff_data,mapping,duplicate_sgds,overlap_margin) # # Report the results # # Convenience variables for lists of unresolved, discarded etc duplicates resolved_sgds = result['resolved_sgds'] unresolved_sgds_no_mapping_genes = result['unresolved_sgds_no_mapping_genes'] unresolved_sgds_no_mapping_genes_after_filter = \ result['unresolved_sgds_no_mapping_genes_after_filter'] unresolved_sgds_no_overlaps = result['unresolved_sgds_no_overlaps'] unresolved_sgds_multiple_matches = result['unresolved_sgds_multiple_matches'] discard = result['discard'] # Remaining unresolved cases if len(unresolved_sgds_no_mapping_genes) > 0: print "No mapping genes with same SGDs found in %s:" % cdsfile for sgd in unresolved_sgds_no_mapping_genes: print "\t%s" % sgd print if len(unresolved_sgds_no_mapping_genes_after_filter) > 0: print "No mapping genes with same chromosome and/or strand:" for sgd in unresolved_sgds_no_mapping_genes_after_filter: print "\t%s" % sgd print if len(unresolved_sgds_no_overlaps) > 0: print "No mapping genes with overlaps:" for sgd in unresolved_sgds_no_overlaps: print "\t%s" % sgd print if len(unresolved_sgds_multiple_matches) > 0: print "Multiple matching mapping genes:" for sgd in unresolved_sgds_multiple_matches: print "\t%s" % sgd print # Summary counts for each case print "Total number of duplicated indexes : %d" % len(duplicate_sgds.keys()) print "Number of resolved duplicate SGDs : %d" % len(resolved_sgds) print "Unresolved 
duplicates:" print "* No mapping genes with same SGD : %d" % len(unresolved_sgds_no_mapping_genes) print "* No mapping genes with same chr/str : %d" % len(unresolved_sgds_no_mapping_genes_after_filter) print "* No mapping genes with overlap : %d" % len(unresolved_sgds_no_overlaps) print "* Multiple mapping genes match : %d" % len(unresolved_sgds_multiple_matches) # Remove discarded duplicates from the data print "Removing discarded duplicates and writing to %s" % delfile fd = open(delfile,'w') for discard_data in discard: try: ip = gff_data.indexByLineNumber(discard_data.lineno()) del(gff_data[ip]) fd.write("%s\n" % discard_data) except IndexError: logging.warning("Failed to delete line %d: not found" % discard_data.lineno()) fd.close() # Remove unresolved duplicates if requested if discard_unresolved: print "Removing unresolved duplicates and writing to %s" % unresfile # Get list of unresolved SGDs all_unresolved = result['unresolved_sgds'] # Get list of unresolved duplicates unresolved = [] for data in gff_data: attributes = data['attributes'] if 'SGD' in attributes: if attributes['SGD'] in all_unresolved: unresolved.append(data) # Discard them fu = open(unresfile,'w') for discard in unresolved: try: ip = gff_data.indexByLineNumber(discard.lineno()) del(gff_data[ip]) fu.write("%s\n" % discard) except IndexError: logging.warning("Failed to delete line %d: not found" % discard.lineno()) fu.close() # Look for "missing" genes in mapping file if insert_missing: # Get name for file with gene list if genefile is None: genefile = cdsfile print "Inserting unmatched genes from %s" % genefile # Get gene data from CDS file # Format is tab-delimited, each line has: # orf chr start end strand mapping = TabFile(genefile,column_names=('name','chr','start','end','strand')) n_genes_before_insert = len(gff_data) gff_data = GFFInsertMissingGenes(gff_data,mapping) print "Inserted %d missing genes" % (len(gff_data) - n_genes_before_insert) # Construct and insert ID for exons if 
add_exon_ids: print "Inserting artificial IDs for exon records" gff_data = GFFAddExonIDs(gff_data) # Construct and insert missing ID attributes if add_missing_ids: print "Inserting generated IDs for records where IDs are missing" gff_data = GFFAddIDAttributes(gff_data) # Strip attributes requested for removal if options.rm_attr: print "Removing the following attributes from all records:" for attr in options.rm_attr: print "\t* %s" % attr GFFUpdateAttributes(gff_data,exclude_keys=options.rm_attr) # Remove attributes that don't conform to KEY=VALUE format if strict_attributes: print "Removing attributes that don't conform to KEY=VALUE format" GFFUpdateAttributes(gff_data,exclude_nokeys=True) # Suppress percent encoding of attributes if no_attribute_encoding: print "Converting encoded special characters in attribute data to non-encoded form" logging.warning("!!! Special characters will not be correctly encoded in the output !!!") logging.warning("!!! The resulting GFF may not be readable by this or other programs !!!") gff_data = GFFDecodeAttributes(gff_data) # Write to output file print "Writing output file %s" % outfile gff_data.write(outfile)