示例#1
0
def GFFGetDuplicateSGDs(gff_data):
    """Return GFF data with duplicate SGD names

    Returns an OrderedDictionary where the keys are the duplicate
    SGD names, e.g.

    dups = GFFGetDuplicateSGDs(gff)
    for sgd in dups.keys():
      ... loops over SGD names ...
      for data in dups[sgd]:
        ... loops over duplicates ...

    Lines without an 'SGD' attribute, or with a blank one, are
    ignored.

    Note that duplicates are determined purely by SGD name; no
    account is taken of chromosome or strand.

    Arguments:
      gff_data: a GFFFile object containing the GFF file data

    Returns:
      OrderedDictionary with SGDs as keys for lists of the
      duplicate TabDataLines corresponding to the SGD. Only SGDs
      which occur on two or more lines are included.
    """
    # Use ordered dictionary to store the lines found for each SGD
    duplicates = OrderedDictionary()
    # Process data line-by-line
    for data in gff_data:
        attributes = data['attributes']
        if 'SGD' not in attributes:
            continue
        sgd = attributes['SGD']
        if sgd != '':
            # Group the line with any others that share its SGD
            if sgd in duplicates:
                duplicates[sgd].append(data)
            else:
                duplicates[sgd] = [data]
    # Keep only the true duplicates i.e. SGDs with at least two
    # GFF data lines
    # Iterate over a snapshot of the keys: deleting entries while
    # looping over the container's own key sequence can skip entries
    # or raise an error, depending on how keys() is implemented
    for sgd in list(duplicates.keys()):
        if len(duplicates[sgd]) < 2:
            del duplicates[sgd]
    # Finished
    return duplicates
示例#2
0
def _build_option_parser():
    """Create the command line parser for the program

    Returns:
      optparse.OptionParser populated with all the options that
      'main' understands.
    """
    p = optparse.OptionParser(usage="%prog [options] <file>.gff",
                              version="%prog "+__version__,
                              description=
                              "Utility to perform various 'cleaning' operations on a GFF file.")
    p.add_option('-o',action='store',dest='output_gff',default=None,
                 help="Name of output GFF file (default is '<file>_clean.gff')")
    p.add_option('--prepend',action='store',dest='prepend_str',default=None,
                 help="String to prepend to seqname in first column")
    p.add_option('--clean',action='store_true',dest='do_clean',
                 help="Perform all the 'cleaning' manipulations on the input data (equivalent "
                 "to specifying all of --clean-score, --clean-replace-attributes, "
                 "--clean-exclude-attributes and --clean-group-sgds)")
    p.add_option('--clean-score',action='store_true',dest='do_clean_score',
                 help="Replace 'Anc_*' and blanks in 'score' field with zeroes")
    p.add_option('--clean-replace-attributes',action='store_true',
                 dest='do_clean_replace_attributes',
                 help="Replace 'ID', 'Gene', 'Parent' and 'Name' attributes with the value "
                 "of the SGD attribute, if present")
    p.add_option('--clean-exclude-attributes',action='store_true',
                 dest='do_clean_exclude_attributes',
                 help="Remove the 'kaks', 'kaks2' and 'ncbi' attributes (to remove "
                 "arbitrary attributes, see the --remove-attribute=... option)")
    p.add_option('--clean-group-sgds',action='store_true',dest='do_clean_group_sgds',
                 help="Group features with the same SGD by adding unique numbers to the 'ID' "
                 "attributes; IDs will have the form 'CDS:<SGD>:<n>' (where n is a unique "
                 "number for a given SGD)")
    # Help text describes what is actually written: a '.txt' file holding
    # line number, seqname, start and end for each duplicate
    p.add_option('--report-duplicates',action='store_true',dest='report_duplicates',
                 help="Report duplicate SGD names and write list to <file>_duplicates.txt "
                 "with line numbers, chromosome, start and end coordinates.")
    p.add_option('--resolve-duplicates',action='store',dest='mapping_file',default=None,
                 help="Resolve duplicate SGDs by matching against 'best' genes in the supplied "
                 "mapping file; other non-matching genes are discarded and written to "
                 "<file>_discarded.gff.")
    p.add_option('--discard-unresolved',action='store_true',dest='discard_unresolved',
                 help="Discard any unresolved duplicates, which are written to "
                 "<file>_unresolved.gff.")
    p.add_option('--insert-missing',action='store',dest='gene_file',default=None,
                 help="Insert genes from gene file with SGD names that don't appear in the "
                 "input GFF. If GENE_FILE is blank ('='s must still be present) then the mapping "
                 "file supplied with the --resolve-duplicates option will be used instead.")
    p.add_option('--add-exon-ids',action='store_true',dest='add_exon_ids',default=False,
                 help="For exon features without an ID attribute, construct and insert an "
                 "ID of the form 'exon:<Parent>:<n>' (where n is a unique number).")
    p.add_option('--add-missing-ids',action='store_true',dest='add_missing_ids',default=False,
                 help="For features without an ID attribute, construct and insert a "
                 "generated ID of the form '<feature>:<Parent>:<n>' (where n is a unique "
                 "number).")
    p.add_option('--no-percent-encoding',action='store_true',dest='no_encoding',default=False,
                 help="Convert encoded attributes to the correct characters in "
                 "the output GFF. WARNING this may result in a non-cannonical GFF that can't "
                 "be read correctly by this or other programs.")
    p.add_option('--remove-attribute',action='append',dest='rm_attr',
                 help="Remove attribute RM_ATTR from the list of attributes for all records "
                 "in the GFF file (can be specified multiple times)")
    p.add_option('--strict-attributes',action='store_true',dest='strict_attributes',
                 help="Remove attributes that don't conform to the KEY=VALUE format")
    p.add_option('--debug',action='store_true',dest='debug',
                 help="Print debugging information")
    return p


def _discard_lines(gff_data,lines,filename):
    """Delete lines from the GFF data and record them in a file

    Each line is looked up in 'gff_data' by its line number and
    deleted; successfully deleted lines are written to 'filename'.
    A line which raises IndexError on lookup/deletion is reported
    with a warning and is not written to the file.

    Arguments:
      gff_data: GFFFile object to remove the lines from
      lines: list of data lines to be removed
      filename: name of the file to write the removed lines to
        (always created, even if no lines are removed)
    """
    # Context manager guarantees the file is closed even on error
    with open(filename,'w') as fp:
        for line in lines:
            try:
                ip = gff_data.indexByLineNumber(line.lineno())
                del gff_data[ip]
                fp.write("%s\n" % line)
            except IndexError:
                logging.warning("Failed to delete line %d: not found" % line.lineno())


def main():
    """Main program

    Parses the command line, reads the input GFF file and applies
    the requested operations in a fixed order: prepend to seqname,
    clean scores, replace/exclude attributes, group SGDs, report
    and/or resolve duplicate SGDs, insert missing genes, add exon
    IDs, add missing IDs, remove named attributes, remove
    non KEY=VALUE attributes and decode percent-encoding. The
    result is then written to the output GFF file.

    Exits via the option parser's 'error' method if exactly one
    input file isn't supplied, or if the input file doesn't exist.
    """
    # Set up logging format
    logging.basicConfig(format='%(levelname)s: %(message)s')

    # Process the command line
    p = _build_option_parser()
    options,arguments = p.parse_args()

    # Check for debugging
    if options.debug:
        # Turn on debugging output
        logging.getLogger().setLevel(logging.DEBUG)

    # Input files ('p.error' terminates the program)
    if len(arguments) != 1:
        p.error("input GFF file required")
    infile = arguments[0]
    if not os.path.exists(infile):
        p.error("Input file '%s' not found" % infile)

    # Report version
    p.print_version()

    # Set flags based on command line

    # String to prepend to first column
    prepend_str = options.prepend_str
    # Cleaning options: --clean switches on all the individual
    # cleaning operations
    clean_score = options.do_clean or options.do_clean_score
    clean_replace_attributes = options.do_clean or options.do_clean_replace_attributes
    clean_exclude_attributes = options.do_clean or options.do_clean_exclude_attributes
    # Set ID field in "attributes" to group lines with matching SGDs
    group_SGDs = options.do_clean or options.do_clean_group_sgds
    # Report duplicate names
    report_duplicates = options.report_duplicates
    # Resolve duplicated genes using CDS file
    cdsfile = options.mapping_file
    resolve_duplicates = cdsfile is not None
    # Discard unresolved duplicates
    discard_unresolved = options.discard_unresolved
    # Insert missing genes: a blank gene file name means "use the
    # mapping file from --resolve-duplicates"
    # NOTE(review): if that option wasn't given either then genefile
    # ends up as None and is passed to TabFile as-is - confirm this
    # is acceptable
    insert_missing = options.gene_file is not None
    if insert_missing:
        genefile = options.gene_file or cdsfile
    else:
        genefile = None
    # Add an artificial exon ID attribute
    add_exon_ids = options.add_exon_ids
    # Add generated ID attributes
    add_missing_ids = options.add_missing_ids
    # Suppress encoding of attributes on output
    no_attribute_encoding = options.no_encoding
    # Remove attributes that don't conform to KEY=VALUE format
    strict_attributes = options.strict_attributes

    # Name for output files: auxiliary files share the base name of
    # the output GFF (or of the input, if no output name was given)
    if not options.output_gff:
        outbase = os.path.splitext(os.path.basename(infile))[0]
        outfile = outbase+'_clean.gff'
    else:
        outbase = os.path.splitext(os.path.basename(options.output_gff))[0]
        outfile = options.output_gff
    print("Input : %s" % infile)
    print("Output: %s" % outfile)
    dupfile = outbase+'_duplicates.txt'
    delfile = outbase+'_discarded.gff'
    unresfile = outbase+'_unresolved.gff'

    # Read in data from file
    gff_data = GFFFile(infile)

    # Prepend string to seqname column
    if prepend_str is not None:
        print("Prepending '%s' to values in 'seqname' column" % prepend_str)
        for data in gff_data:
            data['seqname'] = prepend_str+str(data['seqname'])

    # Check/clean score column values
    if clean_score:
        print("Replacing 'Anc_*' and blanks with '0's in 'score' column")
        score_unexpected_values = []
        for data in gff_data:
            try:
                # Numerical value
                score = float(data['score'])
                if score != 0:
                    score_unexpected_values.append(data['score'])
            except ValueError:
                # String value
                if data['score'].strip() != '' and not data['score'].startswith('Anc_'):
                    score_unexpected_values.append(data['score'])
            # Replace "Anc_*" or blank values in "score" column with zero
            if data['score'].startswith('Anc_') or data['score'].strip() == '':
                data['score'] = '0'
        # Report unexpected values
        n = len(score_unexpected_values)
        if n > 0:
            logging.warning("%d 'score' values that are not '', 0 or 'Anc_*'" % n)
            logging.warning("Other values: %s" % score_unexpected_values)

    # Clean up the data in "attributes" column: replace keys
    if clean_replace_attributes:
        # Initialise mapping of keys from input to output in "attributes" column
        # where new values are required etc
        attributes_key_map = OrderedDictionary()
        attributes_key_map['ID'] = 'SGD'
        attributes_key_map['Gene'] = 'SGD'
        attributes_key_map['Parent'] = 'SGD'
        attributes_key_map['Name'] = 'SGD'
        attributes_dont_replace_with_empty_data = True
        print("Cleaning up attributes: replacing keys:")
        for key in attributes_key_map.keys():
            print("\t%s -> %s" % (key,attributes_key_map[key]))
        if attributes_dont_replace_with_empty_data:
            print("(Replacement will be skipped if new data is missing/blank)")
        GFFUpdateAttributes(gff_data,attributes_key_map,[],
                            attributes_dont_replace_with_empty_data)

    # Clean up the data in "attributes" column: exclude keys
    if clean_exclude_attributes:
        # List of keys to exclude
        attributes_exclude_keys = ['kaks','kaks2','ncbi']
        print("Excluding keys:")
        for key in attributes_exclude_keys:
            print("\t%s" % key)
        GFFUpdateAttributes(gff_data,{},attributes_exclude_keys,True)

    # Set the IDs for consecutive lines with matching SGD names, to indicate that
    # they're in the same gene
    if group_SGDs:
        print("Grouping SGDs by setting ID's for consecutive lines with the same SGD values")
        GFFGroupSGDs(gff_data)

    # Find duplicates in input file
    if report_duplicates or resolve_duplicates:
        duplicate_sgds = GFFGetDuplicateSGDs(gff_data)

    if report_duplicates:
        # Write to duplicates file
        print("Writing duplicate SGD names to %s" % dupfile)
        ndup = 0
        ngroups = 0
        # Context manager guarantees the file is closed even on error
        with open(dupfile,'w') as fd:
            for sgd in duplicate_sgds.keys():
                assert(len(duplicate_sgds[sgd]) > 1)
                ndup += 1
                fd.write("%s\t" % sgd)
                for data in duplicate_sgds[sgd]:
                    # Write the line number, chromosome, start and end data
                    line = ';'.join(('L'+str(data.lineno()),
                                     str(data['seqname']),
                                     str(data['start']),
                                     str(data['end'])))
                    fd.write("\t%s" % line)
                fd.write("\n")
                logging.debug("%s\t%s" % (sgd,duplicate_sgds[sgd]))
                # Count the groups holding more than one line
                for group in GroupGeneSubsets(duplicate_sgds[sgd]):
                    if len(group) > 1:
                        ngroups += 1
            if ndup == 0:
                fd.write("No duplicate SGDs\n")
        print("%d duplicates found (of which %d are trivial)" % (ndup,ngroups))

    if resolve_duplicates:
        print("Resolving duplicate SGDs using data from %s" % cdsfile)
        print("Discarded genes will be written to %s" % delfile)
        # Get data on best gene mappings from CDS file
        # Format is tab-delimited, each line has:
        # orf      chr      start     end      strand
        mapping = TabFile(cdsfile,column_names=('name','chr','start','end','strand'))
        # Overlap margin
        overlap_margin = 1000
        # Perform resolution
        result = GFFResolveDuplicateSGDs(gff_data,mapping,duplicate_sgds,overlap_margin)
        #
        # Report the results
        #
        # Convenience variables for lists of unresolved, discarded etc duplicates
        resolved_sgds = result['resolved_sgds']
        unresolved_sgds_no_mapping_genes = result['unresolved_sgds_no_mapping_genes']
        unresolved_sgds_no_mapping_genes_after_filter = \
            result['unresolved_sgds_no_mapping_genes_after_filter']
        unresolved_sgds_no_overlaps = result['unresolved_sgds_no_overlaps']
        unresolved_sgds_multiple_matches = result['unresolved_sgds_multiple_matches']
        discard = result['discard']
        # Remaining unresolved cases
        if len(unresolved_sgds_no_mapping_genes) > 0:
            print("No mapping genes with same SGDs found in %s:" % cdsfile)
            for sgd in unresolved_sgds_no_mapping_genes:
                print("\t%s" % sgd)
            print("")
        if len(unresolved_sgds_no_mapping_genes_after_filter) > 0:
            print("No mapping genes with same chromosome and/or strand:")
            for sgd in unresolved_sgds_no_mapping_genes_after_filter:
                print("\t%s" % sgd)
            print("")
        if len(unresolved_sgds_no_overlaps) > 0:
            print("No mapping genes with overlaps:")
            for sgd in unresolved_sgds_no_overlaps:
                print("\t%s" % sgd)
            print("")
        if len(unresolved_sgds_multiple_matches) > 0:
            print("Multiple matching mapping genes:")
            for sgd in unresolved_sgds_multiple_matches:
                print("\t%s" % sgd)
            print("")
        # Summary counts for each case
        print("Total number of duplicated indexes   : %d" % len(duplicate_sgds.keys()))
        print("Number of resolved duplicate SGDs    : %d" % len(resolved_sgds))
        print("Unresolved duplicates:")
        print("* No mapping genes with same SGD     : %d" % len(unresolved_sgds_no_mapping_genes))
        print("* No mapping genes with same chr/str : %d" % len(unresolved_sgds_no_mapping_genes_after_filter))
        print("* No mapping genes with overlap      : %d" % len(unresolved_sgds_no_overlaps))
        print("* Multiple mapping genes match       : %d" % len(unresolved_sgds_multiple_matches))

        # Remove discarded duplicates from the data
        print("Removing discarded duplicates and writing to %s" % delfile)
        _discard_lines(gff_data,discard,delfile)

        # Remove unresolved duplicates if requested
        if discard_unresolved:
            print("Removing unresolved duplicates and writing to %s" % unresfile)
            # Get list of unresolved SGDs
            all_unresolved = result['unresolved_sgds']
            # Get list of unresolved duplicates; collected up front so
            # that gff_data isn't modified while looping over it
            unresolved = []
            for data in gff_data:
                attributes = data['attributes']
                if 'SGD' in attributes:
                    if attributes['SGD'] in all_unresolved:
                        unresolved.append(data)
            # Discard them
            _discard_lines(gff_data,unresolved,unresfile)

    # Look for "missing" genes in mapping file
    if insert_missing:
        print("Inserting unmatched genes from %s" % genefile)
        # Get gene data from CDS file
        # Format is tab-delimited, each line has:
        # orf      chr      start     end      strand
        mapping = TabFile(genefile,column_names=('name','chr','start','end','strand'))
        n_genes_before_insert = len(gff_data)
        gff_data = GFFInsertMissingGenes(gff_data,mapping)
        print("Inserted %d missing genes" % (len(gff_data) - n_genes_before_insert))

    # Construct and insert ID for exons
    if add_exon_ids:
        print("Inserting artificial IDs for exon records")
        gff_data = GFFAddExonIDs(gff_data)

    # Construct and insert missing ID attributes
    if add_missing_ids:
        print("Inserting generated IDs for records where IDs are missing")
        gff_data = GFFAddIDAttributes(gff_data)

    # Strip attributes requested for removal
    if options.rm_attr:
        print("Removing the following attributes from all records:")
        for attr in options.rm_attr:
            print("\t* %s" % attr)
        GFFUpdateAttributes(gff_data,exclude_keys=options.rm_attr)

    # Remove attributes that don't conform to KEY=VALUE format
    if strict_attributes:
        print("Removing attributes that don't conform to KEY=VALUE format")
        GFFUpdateAttributes(gff_data,exclude_nokeys=True)

    # Suppress percent encoding of attributes
    if no_attribute_encoding:
        print("Converting encoded special characters in attribute data to non-encoded form")
        logging.warning("!!! Special characters will not be correctly encoded in the output  !!!")
        logging.warning("!!! The resulting GFF may not be readable by this or other programs !!!")
        gff_data = GFFDecodeAttributes(gff_data)

    # Write to output file
    print("Writing output file %s" % outfile)
    gff_data.write(outfile)