Пример #1
0
def main():
    """
    Print the indices of the sequences that fall in one partition of a
    sequence data file.

    Expects two positional command-line arguments: the input data file and
    a partition specifier of the form C{<part>/<parts>}, where C{part} is
    the partition number (counted from 0) and C{parts} is the total number
    of partitions. The indices of the requested partition are written to
    stdout, separated by spaces.

    If an argument is missing, or the partition specifier is malformed or
    out of range, a message is written to stderr and the process exits
    with status 1.

    """
    usage = "%prog <in-file> <part>/<parts>"
    description = ("Takes a sequence data file, partitions it into the "
        "number of partitions given and prints out the indices of the "
        "sequences the appear in the requested partition. Specify the "
        "partition number (from 0) and total number of partitions in the "
        "form <partition-num>/<total-parts>.")
    parser = OptionParser(usage=usage, description=description)
    _, arguments = parser.parse_args()

    if len(arguments) == 0:
        sys.stderr.write("You must specify an input data file\n")
        sys.exit(1)
    elif len(arguments) == 1:
        sys.stderr.write(
            "You must give a partition specifier: <part>/<parts>\n")
        # Without this exit the code below would fail on arguments[1]
        sys.exit(1)
    filename = os.path.abspath(arguments[0])

    # Parse the specifier: a wrong number of fields or a non-integer field
    #  both surface as a ValueError here
    try:
        part, parts = [int(field) for field in arguments[1].split("/")]
    except ValueError:
        sys.stderr.write("Invalid partition specifier '%s': expected "
            "<part>/<parts>\n" % arguments[1])
        sys.exit(1)
    # A negative part would silently select from the end of the list and a
    #  too-large one would raise an IndexError, so reject both up front
    if parts < 1 or not 0 <= part < parts:
        sys.stderr.write("Invalid partition specifier '%s': the partition "
            "number must be between 0 and the number of partitions minus "
            "one\n" % arguments[1])
        sys.exit(1)

    # Read in the data file
    seqs = SequenceIndex.from_file(filename)

    # Partition the sequence indices
    indices = list(range(len(seqs)))
    # Use the partition function to ensure this partitioning is consistent
    #  with all other places the sequences get partitioned
    all_parts = partition(indices, parts)
    print(" ".join(["%d" % i for i in all_parts[part]]))
Пример #2
0
def main():
    """
    Split a sequence data file into partitions and their complements.

    Takes the input file as the only positional command-line argument.
    For each partition number C{i}, the partition is saved to
    C{<in-file>.parti} and its held-out complement to
    C{<in-file>.heldout_parti}. With C{--ids}, no files are written:
    the ids of the sequences in each partition are printed, one
    partition per line.

    """
    parser = OptionParser(usage="%prog [options] <in-file>")
    parser.add_option(
        "-p", "--partitions",
        dest="partitions", action="store", type="int",
        default=DEFAULT_PARTITIONS,
        help="the number of partitions to use (default: %d)"
            % DEFAULT_PARTITIONS)
    parser.add_option(
        "--ids",
        dest="ids", action="store_true",
        help="don't output any files - just print out a list of the ids "
            "of the sequences in each partition")
    options, arguments = parser.parse_args()

    if not arguments:
        sys.stderr.write("You must specify an input data file\n")
        sys.exit(1)
    filename = os.path.abspath(arguments[0])

    # Load the sequences from the data file
    seqs = SequenceIndex.from_file(filename)

    # Output filename templates: the partition number is filled in later
    part_pattern = filename + ".part%d"
    heldout_pattern = filename + ".heldout_part%d"

    # Compute every partition along with the data left out of it
    included = partition(seqs.sequences, options.partitions)
    excluded = holdout_partition(seqs.sequences, options.partitions)

    for number, (members, complement) in enumerate(zip(included, excluded)):
        if options.ids:
            # Only list the ids belonging to this partition
            print(" ".join(["%d" % s.id for s in members]))
            continue
        # Write out the partition and then its complement
        part_file = part_pattern % number
        heldout_file = heldout_pattern % number
        save_sequences(part_file, members)
        save_sequences(heldout_file, complement)
        sys.stderr.write("Wrote partition %d to %s and %s\n" %
            (number, part_file, heldout_file))
Пример #3
0
def main():
    """
    Command-line entry point: divide the sequences of an input file into
    partitions.

    The single positional argument is the input data file. Unless
    C{--ids} is given, each partition is saved next to the input file
    with the suffix C{.part<i>} and the remaining (held-out) sequences
    with the suffix C{.heldout_part<i>}. With C{--ids}, only the
    sequence ids of each partition are printed to stdout.

    """
    ids_help = ("don't output any files - just print out a list of the "
                "ids of the sequences in each partition")
    partitions_help = ("the number of partitions to use (default: %d)" %
                       DEFAULT_PARTITIONS)

    parser = OptionParser(usage="%prog [options] <in-file>")
    parser.add_option("-p", "--partitions", dest="partitions",
                      action="store", type="int",
                      default=DEFAULT_PARTITIONS, help=partitions_help)
    parser.add_option("--ids", dest="ids", action="store_true",
                      help=ids_help)
    options, arguments = parser.parse_args()

    if not arguments:
        sys.stderr.write("You must specify an input data file\n")
        sys.exit(1)
    filename = os.path.abspath(arguments[0])

    # Read in the data file
    seqs = SequenceIndex.from_file(filename)

    # Pair each partition with its complement
    pairs = zip(partition(seqs.sequences, options.partitions),
                holdout_partition(seqs.sequences, options.partitions))

    for index, pair in enumerate(pairs):
        chosen, held_back = pair
        if not options.ids:
            # Save the partition and its complement side by side
            chosen_name = "%s.part%d" % (filename, index)
            held_back_name = "%s.heldout_part%d" % (filename, index)
            save_sequences(chosen_name, chosen)
            save_sequences(held_back_name, held_back)
            sys.stderr.write("Wrote partition %d to %s and %s\n" %
                             (index, chosen_name, held_back_name))
        else:
            # Just print out a list of the ids in the partition
            print(" ".join(["%d" % s.id for s in chosen]))
# Script: split the fully annotated chord sequences into NUM_PARTITIONS
# partitions and write each partition and its held-out complement to file.
# save_pickled_data is called below, so it must be imported here.
from apps.sequences.datautils import save_pickled_data
from apps.sequences.models import ChordSequence
from django.db.models import Q
from jazzparser.utils.data import holdout_partition, partition
import os.path
import sys

# Number of partitions to divide the sequences into
NUM_PARTITIONS = 10
# Base name of the output files: the partition number gets appended
FILENAME = "partition"

# Build a list of the sequences to put in each partition
# Only include fully annotated sequences
sys.stderr.write("Building list of fully annotated sequences\n")
seqs = [
    seq.id for seq in ChordSequence.objects.filter(analysis_omitted=False)
    if seq.fully_annotated
]
# Pair up each partition with the ids that are left out of it
partitions = zip(partition(seqs, NUM_PARTITIONS),
                 holdout_partition(seqs, NUM_PARTITIONS))

for i, parts in enumerate(partitions):
    part, rest = parts
    # Output two files for each partition
    part_file = "%s-%d" % (FILENAME, i)
    held_file = "%s-%d-heldout" % (FILENAME, i)
    sys.stderr.write("Outputing partition %d to %s and %s\n" %
                     (i, part_file, held_file))
    # Output the partition's file
    query = Q(id__in=part)
    save_pickled_data(part_file, query)
    # Output the rest of the data
    query = Q(id__in=rest)
    save_pickled_data(held_file, query)
from apps.sequences.datautils import save_pickled_data
from apps.sequences.models import ChordSequence
from django.db.models import Q
from jazzparser.utils.data import holdout_partition, partition
import os.path, sys

# How many partitions the sequences are divided into
NUM_PARTITIONS = 10
# Prefix shared by all of the output files
FILENAME = "partition"

# Collect the ids of the sequences to distribute over the partitions,
# skipping any sequence that is not fully annotated
sys.stderr.write("Building list of fully annotated sequences\n")
seqs = []
for seq in ChordSequence.objects.filter(analysis_omitted=False):
    if seq.fully_annotated:
        seqs.append(seq.id)

# Each entry pairs a partition with the ids held out of that partition
partitions = zip(
    partition(seqs, NUM_PARTITIONS),
    holdout_partition(seqs, NUM_PARTITIONS),
)

for i, (part, rest) in enumerate(partitions):
    # Every partition produces two files: one for the partition itself
    # and one for everything else
    part_file = "%s-%d" % (FILENAME, i)
    held_file = "%s-%d-heldout" % (FILENAME, i)
    sys.stderr.write(
        "Outputing partition %d to %s and %s\n" % (i, part_file, held_file))
    # The partition's own sequences
    save_pickled_data(part_file, Q(id__in=part))
    # The remaining sequences
    save_pickled_data(held_file, Q(id__in=rest))
Пример #6
0
def prepare_evaluation_options(usage=None, description=None, 
        optparse_options=None, check_args=None, optparse_groups=None):
    """
    Various tasks common to the initial part of the evaluation routine
    scripts (C{models/eval.py}): builds the option parser, parses the 
    command line, initializes logging, loads the input sequences and 
    selects the partition(s) of the data to evaluate on.
    
    The data selection options are checked in this order: C{--partitions}, 
    C{--partition}, C{--sequence}. If none is given, all the sequences 
    are returned as a single partition.
    
    @todo: This is not used any more. Remove it, after checking it's definitely 
        not used.
    
    @param usage: the optparse usage string
    @param description: the optparse description string
    @type optparse_options: list of tuples
    @param optparse_options: (args,kwargs) pairs to add additional 
        options to the optparse parser. Defaults to no extra options.
    @type check_args: function
    @param check_args: function to take the command-line arguments and 
        check them. This will be called early in the script. Must 
        return a tuple of (1) the model name (or model basename) that 
        will be used in the partition model names and (2) the input 
        filename to get sequences from.
    @type optparse_groups: list of pairs
    @param optparse_groups: specifications for option groups to add to the 
        optparse option parser. The first of each pair is a tuple of 
        args to C{OptionGroup}'s init (excluding the first). 
        The second is a list of options 
        each formatted as C{optparse_options}. If a group with the same 
        title already exists, the options are added to that group. 
        Defaults to no extra groups.
        
    @rtype: tuple
    @return: (1) list of (sequences,model_name,partition_index) tuples
        for each partition; (2) the sequence ids for the selected data 
        (see the review note in the partitioning code: the exact shape 
        depends on which option was used); (3) optparse options; 
        (4) optparse arguments.
    @raise ValueError: if C{check_args} is C{None}.
    
    """
    # logging is needed for the --debug log level below
    import logging
    import sys
    from optparse import OptionParser, OptionGroup
    from jazzparser.utils.config import parse_args_with_config
    from jazzparser.utils.loggers import init_logging
    from jazzparser.data.db_mirrors import SequenceIndex
    from jazzparser.utils.data import partition
    
    # Avoid mutable default arguments: substitute empty lists here
    if optparse_options is None:
        optparse_options = []
    if optparse_groups is None:
        optparse_groups = []
    
    parser = OptionParser(usage=usage, description=description)
    group = OptionGroup(parser, "Input", "Input data and partitioning for evaluation")
    group.add_option("-s", "--sequence", dest="sequence", action="store", help="limit the evaluation to just one sequence, with the given index in the input file")
    group.add_option("--partition", dest="partition", action="store", help="restrict to only one partition of the data. Specify as i/n, where i is the partition number and n the total number of partitions.")
    group.add_option("-p", "--partitions", dest="partitions", type="int", action="store", help="test on all n partitions of the data, using a different model for each. Will look for a model <NAME>i, where <NAME> is the given model name and i the partition number.")
    parser.add_option_group(group)
    
    parser.add_option("--debug", dest="debug", action="store_true", help="show debugging output")
    
    # Add the options according to their specs
    for args,kwargs in optparse_options:
        parser.add_option(*args, **kwargs)
        
    # Add groups and their options
    for group_args,group_options in optparse_groups:
        # Check whether the group already exists
        same_titles = [g for g in parser.option_groups if g.title == group_args[0]]
        if same_titles:
            group = same_titles[0]
        else:
            group = OptionGroup(parser, *group_args)
            parser.add_option_group(group)
        # Add options to this group
        for args,kwargs in group_options:
            group.add_option(*args, **kwargs)
    options, arguments = parse_args_with_config(parser)
    
    if check_args is None:
        raise ValueError("could not check arguments and get model "
            "name. check_args must not be None")
    model_name,input_filename = check_args(arguments)
        
    if options.debug:
        # Set the log level to debug and do the standard logging init
        init_logging(logging.DEBUG)
    else:
        init_logging()
        
    # Load up sequences
    seqs = SequenceIndex.from_file(input_filename)
        
    def _get_seq_by_index(index):
        # Fetch one sequence, exiting with an error if there's no such index
        seq = seqs.sequence_by_index(index)
        if seq is None:
            sys.stderr.write("There are only %d sequences\n" % len(seqs))
            sys.exit(1)
        return seq
    
    ################ Data partitioning ####################
    # NOTE(review): part_ids does not have the same shape in every branch 
    #  below (a list of id lists, a single id list, a list with one id, or 
    #  [None]). This is left as it was: confirm what callers expect before 
    #  changing it.
    if options.partitions is not None:
        # Divide the data up into n partitions and use a different model name for each
        total_parts = options.partitions
        sys.stderr.write("Cross validation: dividing test data into %d partitions\n" % total_parts)
        partitions = [(part,"%s%d" % (model_name,i), i) for i,part in enumerate(partition(seqs.sequences, total_parts))]
        part_ids = partition(seqs.ids, total_parts)
    elif options.partition is not None:
        # Just select one partition
        # Split up the argument to get two integers: a wrong number of 
        #  fields or a non-integer field both raise a ValueError
        try:
            parti,total_parts = [int(val) for val in options.partition.split("/")]
        except ValueError:
            sys.stderr.write("Invalid partition specifier '%s': it must "
                "be of the form i/n\n" % options.partition)
            sys.exit(1)
        if total_parts < 1 or not 0 <= parti < total_parts:
            sys.stderr.write("Invalid partition specifier '%s': the "
                "partition number must be between 0 and n-1\n" % options.partition)
            sys.exit(1)
        sys.stderr.write("Restricting sequences to %d-way partition %d\n" % (total_parts,parti))
        # Get a list of sequence indices to restrict our set to
        part_ids = partition(seqs.ids, total_parts)[parti]
        # The model name gets the partition number appended, as in the 
        #  cross-validation case
        partitions = [(partition(seqs.sequences, total_parts)[parti], "%s%d" % (model_name,parti), parti)]
    elif options.sequence is not None:
        # Just select one sequence
        seq = _get_seq_by_index(int(options.sequence))
        partitions = [( [seq], model_name, 0 )]
        part_ids = [seq.id]
    else:
        # Don't partition the sequences
        partitions = [(seqs.sequences, model_name,0)]
        part_ids = [None]
    
    return partitions,part_ids,options,arguments
Пример #7
0
     num_inputs = len(input_data)
     # Fill the progress record with names and mark as incomplete
     completed_parses = dict([(name,False) \
                             for name in input_data.get_identifiers()])
     if partitions > 1:
         if options.sequence_partitions is not None:
             # Split the inputs up into partitions on the basis of 
             #  an even partitioning of chord sequences
             # This can only be done with 
             if not isinstance(input_data, SegmentedMidiBulkInput):
                 logger.error("option --sequence-partitions is only "\
                     "valid with bulk midi input data")
                 return 1
             chord_seqs = DbBulkInput.from_file(options.sequence_partitions)
             # Partition the chord sequences: we only need indices
             seq_indices = enumerate(partition(
                         [i for i in range(len(chord_seqs))], partitions))
             seq_partitions = dict(
                 sum([[(index,part_num) for index in part] for 
                         (part_num,part) in seq_indices], []) )
             # Associate a partition num with each midi input
             partition_numbers = [
                 seq_partitions[midi.sequence_index] for midi in input_data]
         else:
             # Prepare a list of partition numbers to append to model names
             partition_numbers = sum([
                 [partnum for i in part] for (partnum,part) in \
                  enumerate(partition(range(num_inputs), partitions))], [])
 else:
     # Otherwise, there's just one input
     input_list = [input_data]
     num_inputs = 1