def __init__(self, grammar, input, options={}, *args, **kwargs):
    super(Baseline3Tagger, self).__init__(grammar, input, options, *args, **kwargs)
    process_chord_input(self)
    
    #### Tag the input sequence ####
    self._tagged_data = []
    self._batch_ranges = []
    # Group the input into pairs
    inpairs = group_pairs(self.input, none_final=True)
    # Get all the possible signs from the grammar
    for index, pair in enumerate(inpairs):
        features = {
            'duration' : self.durations[index],
            'time' : self.times[index],
        }
        word_signs = []
        # Now assign a probability to each tag, given the observation
        for tag in self.model.category_count.keys():
            sign = self.grammar.get_sign_for_word_by_tag(
                            self.input[index], tag, extra_features=features)
            if sign is not None:
                probability = self.model.get_prob_cat_given_chord_pair(tag, *pair)
                word_signs.append((sign, tag, probability))
        # Sort the signs by decreasing probability
        word_signs = list(reversed(sorted(word_signs, key=lambda x: x[2])))
        self._tagged_data.append(word_signs)
        
        # Work out the sizes of the batches to return these in
        batches = batch_sizes([p for __,__,p in word_signs], self.batch_ratio)
        # Transform these into a form that's easier to use for getting the signs
        so_far = 0
        batch_ranges = []
        for batch in batches:
            batch_ranges.append((so_far, so_far+batch))
            so_far += batch
        self._batch_ranges.append(batch_ranges)
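# The `group_pairs` helper used throughout these functions is defined in
# jazzparser.utils.base. A minimal sketch of its assumed behaviour, for
# reference only (the real implementation may differ): it yields consecutive
# pairs from a sequence, optionally appending a final (last, None) pair.
def _group_pairs_sketch(seq, none_final=False):
    seq = list(seq)
    if none_final:
        seq = seq + [None]
    # [a, b, c] -> [(a, b), (b, c)]
    return zip(seq[:-1], seq[1:])

# E.g. _group_pairs_sketch([1, 2, 3], none_final=True)
#   -> [(1, 2), (2, 3), (3, None)]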
def _sequence_to_candc_format(formatter, sequence):
    """
    Produces a string representation of observations to be used as training 
    data for a C&C model from a chord sequence internal model. This is an 
    inner function for the various different formats of C&C data we use.
    
    """
    from jazzparser.utils.base import group_pairs
    # Produce observations from chord pairs
    pairs_list = group_pairs(list(sequence.iterator()) + [None])
    observation_list = [formatter(*chords) for chords in pairs_list]
    return "%s\n" % " ".join(observation_list)
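# A usage sketch for `_sequence_to_candc_format` with a hypothetical
# formatter. The formatter receives each chord and its successor (None for
# the last chord); the observation format and the chord attributes (.root,
# .type) used here are illustrative, not the real C&C training format.
def _example_formatter(chord, next_chord):
    if next_chord is None:
        # No following chord: assumed end-of-sequence observation
        return "%s-end" % chord.type
    return "%d-%s" % ((next_chord.root - chord.root) % 12, chord.type)

# training_line = _sequence_to_candc_format(_example_formatter, sequence)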
def __init__(self, inputs, durations=None, times=None, id=None,
        chords=None, sequence=None, *args, **kwargs):
    super(DbInput, self).__init__(*args, **kwargs)
    self.inputs = inputs
    self.durations = durations
    self.times = times
    self.id = id
    self.chords = chords
    self.sequence = sequence
    
    if durations is None and times is None:
        raise ValueError, "cannot create a DbInput with neither "\
            "times nor durations given"
    elif times is None:
        # Derive start times as cumulative sums of the durations
        self.times = [sum(durations[:i]) for i in range(len(durations))]
    elif durations is None:
        from jazzparser.utils.base import group_pairs
        # Durations are the gaps between consecutive start times; the last 
        # chord gets a default duration of 1
        self.durations = [time1 - time0 for (time0, time1) in
                            group_pairs(times)] + [Fraction(1)]
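# A minimal usage sketch of the constructor above (the input values are
# illustrative; real inputs come from the chord corpus): give durations only
# and let the start times be derived as cumulative sums.
from fractions import Fraction   # assumption: stdlib Fraction is compatible
db_input = DbInput(["I", "IV", "V"],
                   durations=[Fraction(2), Fraction(1), Fraction(1)])
print db_input.times    # [0, Fraction(2, 1), Fraction(3, 1)]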
def train(self, sequences, grammar=None, logger=None):
    seqs = 0
    chords = 0
    # Each sequence in the given corpus
    for seq in sequences:
        seqs += 1
        # Each chord in the sequence
        for c1,c2 in group_pairs(seq.iterator(), none_final=True):
            chords += 1
            self._add_category_chord_count(c1.category,
                                observation_from_chord_pair(c1, c2))
    # Add a bit of training info to the descriptive text
    self.model_description = """\
Unigram probability model of combined observations of interval and chord type

Training sequences: %(seqs)d
Training samples: %(samples)d""" % \
        {
            'seqs' : seqs,
            'samples' : chords,
        }
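# `observation_from_chord_pair` is assumed to build the "interval-chordtype"
# strings that make up the emission domain in the ngram trainer further down
# ("%d-%s" % (interval, chord)). A rough sketch of that assumed behaviour;
# the real helper lives elsewhere in the codebase and the None handling and
# chord mapping shown here are guesses:
def _observation_from_chord_pair_sketch(c1, c2, chordmap=None):
    if c2 is None:
        interval = 0    # assumption: no following chord
    else:
        interval = (c2.root - c1.root) % 12
    ctype = chordmap[c1.type] if chordmap is not None else c1.type
    return "%d-%s" % (interval, ctype)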
def _steps_list(seq):
    """
    Given a list of (coordinate,function) pairs, produces a similar list 
    that represents the steps between each point in the path and its 
    previous point, maintaining the original functions.
    
    The first point yields the step from the origin, ignoring its 
    enharmonic block (in other words, the step from (0,0) within its 
    enharmonic block). This means that effectively we don't care what 
    enharmonic block the path lies in, only the relative points along 
    the path.
    
    """
    def _minus(c0, c1):
        return (c0[0]-c1[0], c0[1]-c1[1])
    # Get the functions out for later
    coords, funs = zip(*seq)
    steps = [coords[0]] + [_minus(c1, c0) for c0,c1 in group_pairs(coords)]
    # Put the functions back in for the result
    return zip(steps, funs)
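# A small worked example of `_steps_list` (the coordinates and function
# labels are made up): each point is replaced by its offset from the
# previous point, and the first point is kept as the step from (0,0).
path = [((0, 0), 'T'), ((1, 0), 'D'), ((1, 1), 'T')]
print _steps_list(path)
# -> [((0, 0), 'T'), ((1, 0), 'D'), ((0, 1), 'T')]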
def __init__(self, inputs, durations=None, times=None, roman=False,
        *args, **kwargs):
    super(ChordInput, self).__init__(*args, **kwargs)
    self.inputs = inputs
    self.durations = durations
    self.times = times
    self.roman = roman
    
    # Compute the durations from times or vice versa
    if durations is None and times is None:
        raise ValueError, "cannot create a ChordInput with neither "\
            "times nor durations given"
    elif times is None:
        self.times = [sum(durations[:i], Fraction(0))
                        for i in range(len(durations))]
    elif durations is None:
        from jazzparser.utils.base import group_pairs
        # Durations are the gaps between consecutive start times; the last 
        # chord gets a default duration of 1
        self.durations = [time1 - time0 for (time0, time1) in
                            group_pairs(times)] + [Fraction(1)]
    
    # Convert all strings to internal chord representation
    # Done now so we check the chords can all be understood before doing 
    # anything else
    self.chords = [Chord.from_name(name, roman=roman).to_db_mirror()
                        for name in inputs]
    for chord, dur in zip(self.chords, self.durations):
        chord.duration = dur
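# Usage sketch for ChordInput (the chord spellings are assumed to be ones
# Chord.from_name accepts): build the input from chord symbols and start
# times, letting the durations be derived.
chord_input = ChordInput(["Dm7", "G7", "C"],
                         times=[Fraction(0), Fraction(2), Fraction(3)])
# Derived durations: [2, 1, 1], the final chord defaulting to 1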
def __init__(self, grammar, input, options={}, *args, **kwargs):
    """
    Tags using an ngram model backed by NLTK.
    
    """
    super(NgramTagger, self).__init__(grammar, input, options, *args, **kwargs)
    process_chord_input(self)
    
    #### Tag the input sequence ####
    self._tagged_data = []
    self._batch_ranges = []
    # Group the input into pairs to get observations
    inpairs = group_pairs(self.input, none_final=True)
    # Convert the pairs into observations
    observations = [observation_from_chord_pair(pair[0], pair[1],
                        self.model.chordmap) for pair in inpairs]
    
    # Use the ngram model to get tag probabilities for each input by 
    # computing the forward probability matrix
    if self.options['decode'] == "viterbi":
        probabilities = self.model.viterbi_probabilities(observations)
    elif self.options['decode'] == "forward":
        probabilities = self.model.forward_probabilities(observations)
    else:
        probabilities = self.model.forward_backward_probabilities(observations)
    
    word_tag_probs = []
    for index, probs in enumerate(probabilities):
        features = {
            'duration' : self.durations[index],
            'time' : self.times[index],
        }
        word_signs = []
        # Now assign a probability to each tag, given the observation
        for tag in self.model.tags:
            # Read a full sign out of the grammar
            sign = self.grammar.get_sign_for_word_by_tag(
                        self.input[index], tag, extra_features=features)
            if sign is not None:
                # Read off the probability from the matrix
                probability = probs[tag]
                word_signs.append((sign, tag, probability))
        
        # Shuffle first so that equal probabilities end up randomly ordered
        random.shuffle(word_signs)
        # Now sort by probability
        word_signs = list(reversed(sorted(word_signs, key=lambda x: x[2])))
        self._tagged_data.append(word_signs)
        
        # Store the list of probabilities for tags, which we'll use 
        # after we've tagged every word to work out the sizes 
        # of the tag batches
        word_tag_probs.append([p for __,__,p in word_signs])
    
    if self.options['best']:
        # Only return one for each word
        self._batch_ranges = [[(0,1)] for i in range(len(self.input))]
    else:
        # Work out the number of tags to return in each batch
        batch_sizes = beamed_batch_sizes(word_tag_probs, self.batch_ratio)
        # So far, this has assigned a probability to every possible tag. 
        # We don't want the tagger ever to return the least probable 
        # batch of tags, unless it's the only one.
        #batch_sizes = [batches[:-1] if len(batches) > 1 else batches 
        #                   for batches in batch_sizes]
        # Transform these into a form that's easier to use for getting 
        # the signs
        self._batch_ranges = [
            [(sum(batches[:i]), sum(batches[:i+1]))
                    for i in range(len(batches))]
                for batches in batch_sizes]
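# The sizes-to-ranges transformation at the end of the constructor is just
# a running sum over batch sizes. A tiny worked example:
batches = [2, 3, 1]
ranges = [(sum(batches[:i]), sum(batches[:i+1])) for i in range(len(batches))]
print ranges    # [(0, 2), (2, 5), (5, 6)]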
def train(self, sequences, grammar=None, logger=None):
    from jazzparser.utils.nltk.ngram import PrecomputedNgramModel
    if grammar is None:
        from jazzparser.grammar import get_grammar
        # Load the default grammar
        grammar = get_grammar()
    
    N = self.options['n']
    backoff = self.options['backoff']
    chordmap = self.options['chord_mapping']
    self.chordmap = chordmap
    self.chordmap_name = chordmap.name
    
    # Get data in the form of lists of (observation,tag) pairs
    training_data = [[(observation_from_chord_pair(c1, c2, chordmap), c1cat)
                        for ((c1,c2),c1cat) in zip(
                                group_pairs(seq, none_final=True),
                                seq.categories)]
                            for seq in sequences]
    # Get all the possible pos tags from the grammar
    label_dom = grammar.pos_tags
    # Build the emission domain to include all the observations that 
    # theoretically could occur, not just those that are seen - 
    # we might not see all interval/chord type pairs in the data
    chord_types = chordmap.values()
    emission_dom = sum([["%d-%s" % (interval, chord)
                            for chord in chord_types]
                        for interval in range(12)], [])
    
    # Ignore unlabelled data
    ignores = ['']
    
    if self.options['backoff_cutoff'] is None:
        backoff_kwargs = {}
    else:
        backoff_kwargs = {'cutoff' : self.options['backoff_cutoff']}
    
    # Precompute the transition matrix and store it along with the model
    self.model = PrecomputedNgramModel.train(
                        self.options['n'],
                        training_data,
                        label_dom,
                        emission_dom=emission_dom,
                        cutoff=self.options['cutoff'],
                        backoff_order=self.options['backoff'],
                        estimator=self.options['estimator'],
                        ignore_list=ignores,
                        backoff_kwargs=backoff_kwargs)
    
    # Add some model-specific info into the descriptive text 
    # so we know how it was trained
    est_name = get_estimator_name(self.options['estimator'])
    self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Chord mapping: %(chordmap)s
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % \
        {
            'est' : est_name,
            'seqs' : len(training_data),
            'samples' : len(sum(training_data, [])),
            'order' : self.options['n'],
            'backoff' : self.options['backoff'],
            'cutoff' : self.options['cutoff'],
            'chordmap' : self.chordmap_name,
        }
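# The emission domain enumerated above is the cross product of the 12
# pitch-class intervals with the mapped chord types. A worked miniature
# (the chord type names are illustrative):
chord_types = ["M7", "m7", "7"]
emission_dom = sum([["%d-%s" % (interval, chord) for chord in chord_types]
                        for interval in range(12)], [])
print len(emission_dom)    # 36: "0-M7", "0-m7", ..., "11-7"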
def get_vanilla_book():
    """
    Downloads the whole of the Vanilla Book:
    L{http://www.ralphpatt.com/Song.html}.
    
    """
    from BeautifulSoup import BeautifulSoup
    from urllib2 import urlopen
    from urllib import quote
    from urlparse import urljoin
    import re
    from jazzparser.utils.base import group_pairs
    
    #~ raise NotImplementedError, "not finished writing this"
    
    INDEX_PAGE = "http://www.ralphpatt.com/Song.html"
    SONG_BASE = "http://www.ralphpatt.com/"
    # The overbar alternative ending marker
    alt_end_re = re.compile(r"(\d+).(_+)")
    
    # Fetch the referring page and parse it
    soup = BeautifulSoup(urlopen(INDEX_PAGE).read())
    # Pull out all the links
    links = soup.findAll("a")
    # Get just the links to songs: all in VB/
    song_links = [l['href'] for l in links if l.has_key("href") and 
                        l['href'].startswith("VB/")]
    
    for song_link in song_links:
        url = "%s%s" % (SONG_BASE, song_link)
        song_soup = BeautifulSoup(urlopen(url).read())
        # The song's name is in the title tag
        song_name = song_soup.title.string.strip()
        print song_name
        # The chords are in a pre tag
        chord_text = "".join(song_soup.body.pre.findAll(text=True))
        
        # Remove the key line
        lines = chord_text.split("\n")
        start_line = 0
        for i, line in enumerate(lines):
            if line.lower().startswith("key"):
                # Found the key line: ignore everything up to here
                start_line = i + 1
                break
        else:
            # No key line found!
            print "No key line for %s" % song_name
            continue
        lines = lines[start_line:]
        
        # Find the chord lines: they start with | or [
        song_lines = []
        for i, line in enumerate(lines):
            if line.startswith("[") or line.startswith("|"):
                song_lines.append((lines[i-1], lines[i]))
        
        try:
            bars = []
            bar_ranges = []
            open_repeats = []
            for overline, line in song_lines:
                barlines = list(re.finditer(r"(\|\|)|(\|)|(\[:)|(:\])|(\[)", line))
                barline_ptns = []
                for i, (start_match, end_match) in enumerate(group_pairs(barlines)):
                    # If the bar has zero length, it's just two barlines 
                    # next to each other: ignore
                    if start_match.end() == end_match.start():
                        continue
                    barline_ptns.append(start_match.start())
                    # Get the upper and lower parts of this bar
                    if i == len(barlines) - 2:
                        # If this is the last bar on the line, go to the end
                        overbar = overline[start_match.start()-2:]
                    else:
                        overbar = overline[start_match.start()-2:end_match.start()]
                    overbar_cnt = overbar.strip()
                    if len(overbar_cnt) < 2:
                        overbar_cnt = ""
                    
                    bar = line[start_match.end():end_match.start()]
                    # We might lose some timing information at this point, 
                    # but it's not really worth trying to get
                    chords = [str(c) for c in bar.split() if c != "/"]
                    bars.append(chords)
                    
                    # Check the starting barline for a repeat
                    barline = line[start_match.start():start_match.end()]
                    end_barline = line[end_match.start():end_match.end()]
                    # If we're starting a repeat, note that it starts here
                    if barline == "[:":
                        open_repeats.append(len(bars)-1)
                    # If we're ending a repeat, copy in the repeated bars
                    if end_barline == ":]":
                        if len(open_repeats) == 0:
                            print "Unmatched open repeat in %s" % song_name
                            raise ChordSequenceParseError
                        repeat_start = open_repeats.pop()
                        bars.extend(bars[repeat_start:])
                    
                    if overbar_cnt.startswith("__"):
                        overbar_cnt = overbar_cnt[2:].lstrip()
                    elif overbar_cnt.startswith("_"):
                        overbar_cnt = overbar_cnt[1:].lstrip()
                    if len(overbar_cnt):
                        alt_end = alt_end_re.match(overbar_cnt)
                        if alt_end:
                            print "alt end", alt_end.groups()[0]
                        else:
                            print overbar_cnt
                    ## TODO: deal with alternative endings (in the overbar)
        except ChordSequenceParseError:
            continue
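# What `alt_end_re` captures from the overbar text, in miniature: the
# ending number and the run of underscores marking its extent.
import re
alt_end_re = re.compile(r"(\d+).(_+)")
m = alt_end_re.match("1.____")
print m.groups()    # ('1', '____')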
def train_transition_distribution(self, inputs, grammar, contprob=0.3):
    """
    Train the transition distribution parameters in a supervised manner, 
    using chord corpus input.
    
    This is used as an initialization step to set transition parameters 
    before running EM on unannotated data.
    
    @type inputs: L{jazzparser.data.input.AnnotatedDbBulkInput}
    @param inputs: annotated chord training data
    @type contprob: float or string
    @param contprob: probability mass to reserve for staying on the same 
        state (self transitions). Use special value 'learn' to learn the 
        probabilities from the durations
    
    """
    self.add_history(
            "Training transition probabilities using %d annotated chord "
            "sequences" % len(inputs))
    learn_cont = contprob == "learn"
    
    # Prepare the label sequences that we'll train on
    if learn_cont:
        # Repeat values with a duration > 1
        sequences = []
        for seq in inputs:
            sequence = []
            last_cat = None
            for chord, cat in zip(seq, seq.categories):
                # Put it in once for each duration
                for i in range(chord.duration):
                    sequence.append((chord, cat))
            sequences.append(sequence)
    else:
        sequences = [list(zip(sequence, sequence.categories))
                        for sequence in inputs]
    
    # Prepare a list of transformations to apply to the categories
    label_transform = {}
    # First include all the categories we want to keep as they were
    for schema in self.schemata:
        label_transform[schema] = (schema, 0)
    # Then include any transformations the grammar defines
    for pos, mapping in grammar.equiv_map.items():
        label_transform[pos] = (mapping.target.pos, mapping.root)
    
    # Apply the transformation to all the training data
    training_samples = []
    for chord_cats in sequences:
        seq_samples = []
        for chord, cat in chord_cats:
            # Transform the label if it has a transformation
            if cat in label_transform:
                use_cat, alter_root = label_transform[cat]
            else:
                use_cat, alter_root = cat, 0
            root = (chord.root + alter_root) % 12
            seq_samples.append((str(use_cat), root))
        training_samples.append(seq_samples)
    
    training_data = sum([
        [(cat0, cat1, (root1 - root0) % 12)
            for ((cat0,root0),(cat1,root1)) in group_pairs(seq_samples)]
        for seq_samples in training_samples], [])
    
    # Count up the observations
    schema_transition_counts = ConditionalFreqDist()
    root_transition_counts = ConditionalFreqDist()
    for (label0, label1, root_change) in training_data:
        # Only use counts for categories the model's looking for
        if label0 in self.schemata and label1 in self.schemata:
            schema_transition_counts[label0].inc(label1)
            root_transition_counts[(label0,label1)].inc(root_change)
    
    # Transition probability to final state (end of sequence)
    for sequence in training_samples:
        # Inc the count of going from the label the sequence ends on to 
        # the final state
        schema_transition_counts[sequence[-1][0]].inc(None)
    
    # Use Laplace (plus one) smoothing
    # We don't use the laplace_estimator because we want the conversion 
    # to a dict prob dist to get all the labels, not just to discount 
    # the ones it's seen
    for label0 in self.schemata:
        for label1 in self.schemata:
            for root_change in range(12):
                # Exclude self-transition for now, unless we're learning it
                if learn_cont or not (label0 == label1 and root_change == 0):
                    schema_transition_counts[label0].inc(label1)
                    root_transition_counts[(label0,label1)].inc(root_change)
        # We don't add a count for going to the final state: we don't 
        # want to initialize it with too much weight
    
    # Estimate distribution from this frequency distribution
    schema_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(
            ConditionalProbDist(schema_transition_counts, mle_estimator, None),
            mutable=True, samples=self.schemata+[None])
    root_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(
            ConditionalProbDist(root_transition_counts, mle_estimator, None),
            mutable=True, samples=range(12))
    
    if not learn_cont:
        # Discount all probabilities to allow for self-transition probs
        discount = logprob(1.0 - contprob)
        self_prob = logprob(contprob)
        for label0 in self.schemata:
            # Give the saved prob mass to the schema self-transition
            schema_trans_dist[label0].update(label0, self_prob)
            # Discount all other schema transitions to allow for this
            for label1 in self.schemata:
                if label1 != label0:
                    schema_trans_dist[label0].update(label1,
                        schema_trans_dist[label0].logprob(label1) + discount)
            # Within the self-transition, reserve the mass for staying 
            # on the same root
            root_trans_dist[(label0,label0)].update(0, self_prob)
            for root_change in range(1, 12):
                root_trans_dist[(label0,label0)].update(root_change,
                    root_trans_dist[(label0,label0)].logprob(root_change)
                        + discount)
    
    # Recreate the dict prob dist so it's not mutable any more
    schema_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(schema_trans_dist)
    root_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(root_trans_dist)
    
    ## Now for the initial distribution
    # Count up the observations
    initial_counts = FreqDist()
    for sequence in training_samples:
        initial_counts.inc(sequence[0][0])
    # Use Laplace (plus one) smoothing
    #for label in self.schemata:
    #    initial_counts.inc(label)
    
    # Estimate distribution from this frequency distribution
    initial_dist = prob_dist_to_dictionary_prob_dist(
            mle_estimator(initial_counts, None), samples=self.schemata)
    
    # Replace the model's transition distributions
    self.schema_transition_dist = schema_trans_dist
    self.root_transition_dist = root_trans_dist
    self.initial_state_dist = initial_dist
    # Invalidate the cache
    self.clear_cache()
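# The discounting above works in log space: adding log(1 - contprob) to a
# log probability multiplies the underlying probability by (1 - contprob).
# A quick numeric check with contprob = 0.3, using the stdlib directly:
import math
contprob = 0.3
p_other = 0.5    # some existing transition probability
discounted = math.exp(math.log(p_other) + math.log(1.0 - contprob))
print discounted    # 0.35 = 0.5 * 0.7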
def __init__(self, grammar, input, options={}, dict_cutoff=5, *args, **kwargs):
    super(CandcTagger, self).__init__(grammar, input, options, *args, **kwargs)
    process_chord_input(self)
    
    if type(self) == CandcTagger:
        raise NotImplementedError, "Tried to instantiate CandcTagger "\
            "directly. You should use one of its subclasses."
    self.tag_batch_ratio = self.options['batch']
    model = self.options['model'].split('.')
    
    # Check that candc is available for supertagging
    if not os.path.exists(settings.CANDC.BASE_PATH):
        raise CandcConfigurationError, "The C&C parser base "\
            "directory %s does not exist" % settings.CANDC.BASE_PATH
    if not os.path.exists(settings.CANDC.MODELS_PATH):
        raise CandcConfigurationError, "The C&C parser models "\
            "directory %s does not exist" % settings.CANDC.MODELS_PATH
    candc_cmd = os.path.join(settings.CANDC.BASE_PATH, "bin", self.command)
    if not os.path.exists(candc_cmd):
        raise CandcConfigurationError, "The C&C supertagger command "\
            "%s does not exist. Have you built it?" % candc_cmd
    # Check the model exists
    candc_model = os.path.join(settings.CANDC.MODELS_PATH, *(model))
    if not os.path.exists(candc_model):
        raise CandcConfigurationError, "The C&C model given (%s) "\
            "doesn't exist." % candc_model
    
    # Create a logger to dump the output to
    logfile = os.path.join(settings.CANDC.LOG_DIRECTORY, "-".join(model))
    candc_logger = create_logger(filename=logfile)
    self.logger.info("Logging C&C output to %s" % logfile)
    # Note in the log what we're trying to tag
    candc_logger.info("Tagging: %s" % " ".join(
                            [str(crd) for crd in self.input]))
    
    # Read in the list of tags to smooth over
    self.tag_list = read_tag_list(os.path.join(candc_model, "tags"))
    
    # Read in extra options
    opts_filename = os.path.join(candc_model, "jpopts")
    if not os.path.exists(opts_filename):
        self.extra_opts = {}
    else:
        with open(opts_filename, 'r') as opts_file:
            self.extra_opts = dict(
                [line.strip("\n").split(":", 1)
                    for line in opts_file.readlines()])
    # Pull the chord mapping out of the options
    self.chordmap = get_chord_mapping(self.extra_opts.get('chordmap', None))
    
    # Spawn a process to do the tagging
    candc_command = [candc_cmd, "--model", candc_model,
                        "--dict_cutoff", "%d" % dict_cutoff] + self.extra_args
    self.tagger = Popen(candc_command,
                        stdin=PIPE, stdout=PIPE, stderr=PIPE)
    candc_logger.info("C&C command: %s" % " ".join(candc_command))
    
    self.tokens = self.input
    # Build some observations from the tokens
    observations = [
        interval_observation_from_chord_string_pair(ch1, ch2,
                                        type_mapping=self.chordmap)
            for ch1,ch2 in group_pairs(self.tokens+[None])]
    # Add a dummy POS tag to each input item
    self.observations = ["%s|C" % t for t in observations]
    candc_logger.info("Input: %s" % " ".join(self.observations))
    
    # Run the tagger on this input
    try:
        tagger_out, tagger_err = self.tagger.communicate(
                                        " ".join(self.observations))
    except OSError, err:
        self.logger.error("Could not run the C&C supertagger (%s)" % err)
        candc_logger.error("Error: %s" % err)
        # Output the actual error that the command returned
        error = self.tagger.stderr.read()
        self.logger.error("C&C returned the error: %s" % error)
        candc_logger.error("C&C error: %s" % error)
        raise CandcTaggingError, "error running the C&C supertagger: %s" % error
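# The "jpopts" file read above is parsed as plain "key:value" lines, with
# split(":", 1) keeping any further colons in the value. A sketch of the
# parsing (the file contents are illustrative):
lines = ["chordmap:small\n", "somekey:some:value\n"]
extra_opts = dict([line.strip("\n").split(":", 1) for line in lines])
print extra_opts    # {'chordmap': 'small', 'somekey': 'some:value'}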
                'messages' : messages,
                'time' : timer.get_time(),
            })
        return response
    else:
        # Parsed successfully
        # Do some postprocessing and return to the main function
        
        # Output audio files from the harmonical
        if (options.harmonical is not None or
                options.enharmonical is not None) and len(results) > 0:
            path = grammar.formalism.sign_to_coordinates(results[0])
            # Assuming we used a temporal formalism, the times should be 
            # available as a list from the semantics
            times = results[0].semantics.get_path_times()
            point_durations = [t1 - t0 for t0,t1 in group_pairs(times)] + [0]
            # Get 3d coordinates as well
            path3d = zip(add_z_coordinates(path, pitch_range=2),
                            point_durations)
            path2d = zip(path, point_durations)
            # Get chord types out of the input
            chords = tagger.get_string_input()
            chord_durs = [tagger.get_word_duration(i)
                            for i in range(tagger.input_length)]
            chord_types = [(Chord.from_name(c).type, dur)
                            for c,dur in zip(chords, chord_durs)]
            
            if options.midi:
                # Maybe set this as a CL option or a setting
                # 73 - flute
                # 0 - piano
                # 4 - e-piano
                instrument = 73
                # TODO: make these filenames different for multiple inputs
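# The duration-from-times computation above in miniature: each path point
# lasts until the next one starts, and the final point gets duration 0.
times = [0, 2, 3, 7]
point_durations = [t1 - t0 for t0,t1 in zip(times[:-1], times[1:])] + [0]
print point_durations    # [2, 1, 4, 0]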