Example #1
0
def GFFGetDuplicateSGDs(gff_data):
    """Return GFF data with duplicate SGD names

    Returns an OrderedDictionary where the keys are the duplicate
    SGD names, e.g.

    dups = GFFGetDuplicateSGDs(gff)
    for sgd in dups.keys():
      ... loops over SGD names ...
      for data in dups[sgd]:
        ... loops over duplicates ...

    Note that duplicates are determined purely by SGD name; no
    account is taken of chromosome or strand.

    Arguments:
      gff_data: a GFFFile object containing the GFF file data

    Returns:
      OrderedDictionary with SGDs as keys for lists of the
      duplicate TabDataLines corresponding to the SGD
    """
    # Use ordered dictionary so duplicates are reported in input order
    duplicates = OrderedDictionary()
    # Collect all lines with a non-blank SGD attribute, keyed by SGD name
    for data in gff_data:
        attributes = data['attributes']
        if 'SGD' in attributes:
            sgd = attributes['SGD']
            if sgd != '':
                # Accumulate lines sharing the same SGD name
                if sgd in duplicates:
                    duplicates[sgd].append(data)
                else:
                    duplicates[sgd] = [data]
    # Filter out non-duplicates i.e. SGDs with fewer than two GFF
    # data lines. Iterate over a static copy of the keys because we
    # delete entries from the dictionary as we go (deleting while
    # iterating the live key sequence is unsafe)
    for sgd in list(duplicates.keys()):
        if len(duplicates[sgd]) < 2:
            del duplicates[sgd]
    # Finished
    return duplicates
Example #2
0
def main():
    """Main program

    Parses the command line, reads the input GFF file, applies the
    requested cleaning/manipulation operations in sequence (prepend
    string, clean scores, replace/exclude attributes, group SGDs,
    report/resolve duplicates, insert missing genes, add IDs, strip
    attributes) and writes the result to the output GFF file.
    """
    # Set up logging format
    logging.basicConfig(format='%(levelname)s: %(message)s')

    p = optparse.OptionParser(usage="%prog [options] <file>.gff",
                              version="%prog "+__version__,
                              description=
                              "Utility to perform various 'cleaning' operations on a GFF file.")
    p.add_option('-o',action='store',dest='output_gff',default=None,
                 help="Name of output GFF file (default is '<file>_clean.gff')")
    p.add_option('--prepend',action='store',dest='prepend_str',default=None,
                 help="String to prepend to seqname in first column")
    p.add_option('--clean',action='store_true',dest='do_clean',
                 help="Perform all the 'cleaning' manipulations on the input data (equivalent "
                 "to specifying all of --clean-score, --clean-replace-attributes, "
                 "--clean-exclude-attributes and --clean-group-sgds)")
    p.add_option('--clean-score',action='store_true',dest='do_clean_score',
                 help="Replace 'Anc_*' and blanks in 'score' field with zeroes")
    p.add_option('--clean-replace-attributes',action='store_true',
                 dest='do_clean_replace_attributes',
                 help="Replace 'ID', 'Gene', 'Parent' and 'Name' attributes with the value "
                 "of the SGD attribute, if present")
    p.add_option('--clean-exclude-attributes',action='store_true',
                 dest='do_clean_exclude_attributes',
                 help="Remove the 'kaks', 'kaks2' and 'ncbi' attributes (to remove "
                 "arbitrary attributes, see the --remove-attribute=... option)")
    p.add_option('--clean-group-sgds',action='store_true',dest='do_clean_group_sgds',
                 help="Group features with the same SGD by adding unique numbers to the 'ID' "
                 "attributes; IDs will have the form 'CDS:<SGD>:<n>' (where n is a unique "
                 "number for a given SGD)")
    # NOTE(review): help text says duplicates go to <file>_duplicates.gff with
    # strand data, but the code below writes <file>_duplicates.txt and records
    # the 'end' coordinate rather than strand -- confirm which is intended
    p.add_option('--report-duplicates',action='store_true',dest='report_duplicates',
                 help="Report duplicate SGD names and write list to <file>_duplicates.gff "
                 "with line numbers, chromosome, start coordinate and strand.")
    p.add_option('--resolve-duplicates',action='store',dest='mapping_file',default=None,
                 help="Resolve duplicate SGDs by matching against 'best' genes in the supplied "
                 "mapping file; other non-matching genes are discarded and written to "
                 "<file>_discarded.gff.")
    p.add_option('--discard-unresolved',action='store_true',dest='discard_unresolved',
                 help="Discard any unresolved duplicates, which are written to "
                 "<file>_unresolved.gff.")
    p.add_option('--insert-missing',action='store',dest='gene_file',default=None,
                 help="Insert genes from gene file with SGD names that don't appear in the "
                 "input GFF. If GENE_FILE is blank ('='s must still be present) then the mapping "
                 "file supplied with the --resolve-duplicates option will be used instead.")
    p.add_option('--add-exon-ids',action='store_true',dest='add_exon_ids',default=False,
                 help="For exon features without an ID attribute, construct and insert an "
                 "ID of the form 'exon:<Parent>:<n>' (where n is a unique number).")
    p.add_option('--add-missing-ids',action='store_true',dest='add_missing_ids',default=False,
                 help="For features without an ID attribute, construct and insert a "
                 "generated ID of the form '<feature>:<Parent>:<n>' (where n is a unique "
                 "number).")
    p.add_option('--no-percent-encoding',action='store_true',dest='no_encoding',default=False,
                 help="Convert encoded attributes to the correct characters in "
                 "the output GFF. WARNING this may result in a non-cannonical GFF that can't "
                 "be read correctly by this or other programs.")
    p.add_option('--remove-attribute',action='append',dest='rm_attr',
                 help="Remove attribute RM_ATTR from the list of attributes for all records "
                 "in the GFF file (can be specified multiple times)")
    p.add_option('--strict-attributes',action='store_true',dest='strict_attributes',
                 help="Remove attributes that don't conform to the KEY=VALUE format")
    p.add_option('--debug',action='store_true',dest='debug',
                 help="Print debugging information")

    # Process the command line
    options,arguments = p.parse_args()

    # Check for debugging
    if options.debug:
        # Turn on debugging output
        logging.getLogger().setLevel(logging.DEBUG)

    # Input files: exactly one positional argument (the input GFF) is required
    if len(arguments) != 1:
        p.error("input GFF file required")
    else:
        infile = arguments[0]
        if not os.path.exists(infile):
            p.error("Input file '%s' not found" % infile)

    # Report version
    p.print_version()

    # Set flags based on command line

    # String to prepend to first column
    prepend_str = options.prepend_str
    # Cleaning options: --clean switches on all four sub-operations at once
    if options.do_clean:
        # Update values in the "score" column
        clean_score = True
        # Clean up the "attributes" column
        clean_replace_attributes = True
        clean_exclude_attributes = True
        # Set ID field in "attributes" to group lines with matching SGDs
        group_SGDs = True
    else:
        # Set options based on user input
        clean_score = options.do_clean_score
        clean_replace_attributes = options.do_clean_replace_attributes
        clean_exclude_attributes = options.do_clean_exclude_attributes
        group_SGDs = options.do_clean_group_sgds
    # Report duplicate names
    report_duplicates = options.report_duplicates
    # Resolve duplicated genes using CDS file
    if options.mapping_file is not None:
        resolve_duplicates = True
        cdsfile = options.mapping_file
    else:
        resolve_duplicates = False
        cdsfile = None
    # Discard unresolved duplicates
    discard_unresolved = options.discard_unresolved
    # Insert missing genes: an empty-string GENE_FILE falls back to the
    # --resolve-duplicates mapping file (which may itself be None here;
    # handled again further down before use)
    if options.gene_file is not None:
        insert_missing = True
        if options.gene_file:
            genefile = options.gene_file
        else:
            genefile = cdsfile
    else:
        insert_missing = False
        genefile = None
    # Add an artificial exon ID attribute
    add_exon_ids = options.add_exon_ids
    # Add generated ID attributes
    add_missing_ids = options.add_missing_ids
    # Suppress encoding of attributes on output
    no_attribute_encoding = options.no_encoding
    # Remove attributes that don't conform to KEY=VALUE format
    strict_attributes = options.strict_attributes

    # Name for output files: derived from input basename unless -o given
    ##outext = os.path.splitext(os.path.basename(infile))[1]
    if not options.output_gff:
        outbase = os.path.splitext(os.path.basename(infile))[0]
        outfile = outbase+'_clean.gff'
    else:
        outbase = os.path.splitext(os.path.basename(options.output_gff))[0]
        outfile = options.output_gff
    print "Input : %s" % infile
    print "Output: %s" % outfile
    # NOTE(review): '.txt' extension here, but --report-duplicates help
    # advertises '<file>_duplicates.gff' -- confirm which is intended
    dupfile = outbase+'_duplicates.txt'
    delfile = outbase+'_discarded.gff'
    unresfile = outbase+'_unresolved.gff'

    # Read in data from file
    gff_data = GFFFile(infile)

    # Prepend string to seqname column
    if prepend_str is not None:
        print "Prepending '%s' to values in 'seqname' column" % prepend_str
        for data in gff_data:
            data['seqname'] = prepend_str+str(data['seqname'])

    # Check/clean score column values
    if clean_score:
        print "Replacing 'Anc_*' and blanks with '0's in 'score' column"
        score_unexpected_values = []
        for data in gff_data:
            try:
                # Numerical value
                score = float(data['score'])
                if score != 0:
                    score_unexpected_values.append(data['score'])
            except ValueError:
                # String value
                if data['score'].strip() != '' and not data['score'].startswith('Anc_'):
                    score_unexpected_values.append(data['score'])
            # Replace "Anc_*" or blank values in "score" column with zero
            if data['score'].startswith('Anc_') or data['score'].strip() == '':
                data['score'] = '0'
        # Report unexpected values (anything other than '', 0 or 'Anc_*')
        n = len(score_unexpected_values)
        if n > 0:
            logging.warning("%d 'score' values that are not '', 0 or 'Anc_*'" % n)
            logging.warning("Other values: %s" % score_unexpected_values)

    # Clean up the data in "attributes" column: replace keys
    if clean_replace_attributes:
        # Initialise mapping of keys from input to output in "attributes" column
        # where new values are required etc
        attributes_key_map = OrderedDictionary()
        attributes_key_map['ID'] = 'SGD'
        attributes_key_map['Gene'] = 'SGD'
        attributes_key_map['Parent'] = 'SGD'
        attributes_key_map['Name'] = 'SGD'
        attributes_dont_replace_with_empty_data = True
        print "Cleaning up attributes: replacing keys:"
        for key in attributes_key_map.keys():
            print "\t%s -> %s" % (key,attributes_key_map[key])
        if attributes_dont_replace_with_empty_data:
            print "(Replacement will be skipped if new data is missing/blank)"
        GFFUpdateAttributes(gff_data,attributes_key_map,[],
                            attributes_dont_replace_with_empty_data)

    # Clean up the data in "attributes" column: exclude keys
    if clean_exclude_attributes:
        # List of keys to exclude
        attributes_exclude_keys = ['kaks','kaks2','ncbi']
        print "Excluding keys:"
        for key in attributes_exclude_keys:
            print "\t%s" % key
        GFFUpdateAttributes(gff_data,{},attributes_exclude_keys,True)

    # Set the IDs for consecutive lines with matching SGD names, to indicate that
    # they're in the same gene
    if group_SGDs:
        print "Grouping SGDs by setting ID's for consecutive lines with the same SGD values"
        GFFGroupSGDs(gff_data)

    # Find duplicates in input file (needed by both reporting and resolution)
    if report_duplicates or resolve_duplicates:
        duplicate_sgds = GFFGetDuplicateSGDs(gff_data)

    if report_duplicates:
        # Write to duplicates file
        print "Writing duplicate SGD names to %s" % dupfile
        fd = open(dupfile,'w')
        ndup = 0
        ngroups = 0
        for sgd in duplicate_sgds.keys():
            assert(len(duplicate_sgds[sgd]) > 1)
            ndup += 1
            fd.write("%s\t" % sgd)
            for data in duplicate_sgds[sgd]:
                # Write the line number, chromosome, start and strand data
                # NOTE(review): despite the comment above and the option help,
                # the 'end' value is written here, not the strand -- confirm
                line = ';'.join(('L'+str(data.lineno()),
                                 str(data['seqname']),str(data['start']),str(data['end'])))
                fd.write("\t%s" % line)
            fd.write("\n")
            logging.debug("%s\t%s" % (sgd,duplicate_sgds[sgd]))
            # Count duplicates that form multi-line groups (via GroupGeneSubsets)
            for group in GroupGeneSubsets(duplicate_sgds[sgd]):
                if len(group) > 1: ngroups += 1
        if ndup == 0:
            fd.write("No duplicate SGDs\n")
        fd.close()
        print "%d duplicates found (of which %d are trivial)" % (ndup,ngroups)

    if resolve_duplicates:
        print "Resolving duplicate SGDs using data from %s" % cdsfile
        print "Discarded genes will be written to %s" % delfile
        # Get data on best gene mappings from CDS file
        # Format is tab-delimited, each line has:
        # orf      chr      start     end      strand
        mapping = TabFile(cdsfile,column_names=('name','chr','start','end','strand'))
        # Overlap margin
        overlap_margin = 1000
        # Perform resolution
        result = GFFResolveDuplicateSGDs(gff_data,mapping,duplicate_sgds,overlap_margin)
        #
        # Report the results
        #
        # Convenience variables for lists of unresolved, discarded etc duplicates
        resolved_sgds = result['resolved_sgds']
        unresolved_sgds_no_mapping_genes = result['unresolved_sgds_no_mapping_genes']
        unresolved_sgds_no_mapping_genes_after_filter = \
            result['unresolved_sgds_no_mapping_genes_after_filter']
        unresolved_sgds_no_overlaps = result['unresolved_sgds_no_overlaps']
        unresolved_sgds_multiple_matches = result['unresolved_sgds_multiple_matches']
        discard = result['discard']
        # Remaining unresolved cases
        if len(unresolved_sgds_no_mapping_genes) > 0:
            print "No mapping genes with same SGDs found in %s:" % cdsfile
            for sgd in unresolved_sgds_no_mapping_genes:
                print "\t%s" % sgd
            print
        if len(unresolved_sgds_no_mapping_genes_after_filter) > 0:
            print "No mapping genes with same chromosome and/or strand:"
            for sgd in unresolved_sgds_no_mapping_genes_after_filter:
                print "\t%s" % sgd
            print
        if len(unresolved_sgds_no_overlaps) > 0:
            print "No mapping genes with overlaps:"
            for sgd in unresolved_sgds_no_overlaps:
                print "\t%s" % sgd
            print
        if len(unresolved_sgds_multiple_matches) > 0:
            print "Multiple matching mapping genes:"
            for sgd in unresolved_sgds_multiple_matches:
                print "\t%s" % sgd
            print
        # Summary counts for each case
        print "Total number of duplicated indexes   : %d" % len(duplicate_sgds.keys())
        print "Number of resolved duplicate SGDs    : %d" % len(resolved_sgds)
        print "Unresolved duplicates:"
        print "* No mapping genes with same SGD     : %d" % len(unresolved_sgds_no_mapping_genes)
        print "* No mapping genes with same chr/str : %d" % len(unresolved_sgds_no_mapping_genes_after_filter)
        print "* No mapping genes with overlap      : %d" % len(unresolved_sgds_no_overlaps)
        print "* Multiple mapping genes match       : %d" % len(unresolved_sgds_multiple_matches)

        # Remove discarded duplicates from the data
        print "Removing discarded duplicates and writing to %s" % delfile
        fd = open(delfile,'w')
        for discard_data in discard:
            try:
                # Locate the line by its original line number before deleting
                ip = gff_data.indexByLineNumber(discard_data.lineno())
                del(gff_data[ip])
                fd.write("%s\n" % discard_data)
            except IndexError:
                logging.warning("Failed to delete line %d: not found" % discard_data.lineno())
        fd.close()

        # Remove unresolved duplicates if requested
        if discard_unresolved:
            print "Removing unresolved duplicates and writing to %s" % unresfile
            # Get list of unresolved SGDs
            all_unresolved = result['unresolved_sgds']
            # Get list of unresolved duplicates
            unresolved = []
            for data in gff_data:
                attributes = data['attributes']
                if 'SGD' in attributes:
                    if attributes['SGD'] in all_unresolved:
                        unresolved.append(data)
            # Discard them
            fu = open(unresfile,'w')
            # NOTE(review): loop variable rebinds 'discard' (previously the
            # list from result['discard']); harmless here since the list is
            # no longer used, but a different name would be clearer
            for discard in unresolved:
                try:
                    ip = gff_data.indexByLineNumber(discard.lineno())
                    del(gff_data[ip])
                    fu.write("%s\n" % discard)
                except IndexError:
                    logging.warning("Failed to delete line %d: not found" % discard.lineno())
            fu.close()

    # Look for "missing" genes in mapping file
    if insert_missing:
        # Get name for file with gene list
        if genefile is None:
            genefile = cdsfile
        print "Inserting unmatched genes from %s" % genefile
        # Get gene data from CDS file
        # Format is tab-delimited, each line has:
        # orf      chr      start     end      strand
        mapping = TabFile(genefile,column_names=('name','chr','start','end','strand'))
        n_genes_before_insert = len(gff_data)
        gff_data = GFFInsertMissingGenes(gff_data,mapping)
        print "Inserted %d missing genes" % (len(gff_data) - n_genes_before_insert)

    # Construct and insert ID for exons
    if add_exon_ids:
        print "Inserting artificial IDs for exon records"
        gff_data = GFFAddExonIDs(gff_data)

    # Construct and insert missing ID attributes
    if add_missing_ids:
        print "Inserting generated IDs for records where IDs are missing"
        gff_data = GFFAddIDAttributes(gff_data)

    # Strip attributes requested for removal
    if options.rm_attr:
        print "Removing the following attributes from all records:"
        for attr in options.rm_attr:
            print "\t* %s" % attr
        GFFUpdateAttributes(gff_data,exclude_keys=options.rm_attr)

    # Remove attributes that don't conform to KEY=VALUE format
    if strict_attributes:
        print "Removing attributes that don't conform to KEY=VALUE format"
        GFFUpdateAttributes(gff_data,exclude_nokeys=True)

    # Suppress percent encoding of attributes
    if no_attribute_encoding:
        print "Converting encoded special characters in attribute data to non-encoded form"
        logging.warning("!!! Special characters will not be correctly encoded in the output  !!!")
        logging.warning("!!! The resulting GFF may not be readable by this or other programs !!!")
        gff_data = GFFDecodeAttributes(gff_data)

    # Write to output file
    print "Writing output file %s" % outfile
    gff_data.write(outfile)