def conductGeneration(generation, corpus, previous_output):
    '''
    Prepares the network, data set and corpus wrapper for one generation
    and collects each lemma's input phonology for that generation

        Inputs
            generation (int) --- the number of the generation; used to
                                 select the entry of each form's input_phon
            corpus (array) --- the lemmas and their info from reading the
                               corpus file; each lemma exposes a cases
                               mapping of case -> form
            previous_output (dict) --- the output phonology of the previous
                                       generation (not read by this body)

        Returns
            Nothing.
            NOTE(review): the body stops after building the per-lemma
            phonology mapping -- no training set is filled, no model is
            trained and nothing is returned. Confirm whether the rest of
            the function is missing.
    '''
    # Network and (still empty) data set are sized from the same node counts
    n_inputs = constants.input_nodes
    network = buildNetwork(n_inputs, constants.hidden_nodes, output_nodes)
    blank_dataset = SupervisedDataSet(n_inputs, output_nodes)

    # Corpus wrapper that owns the data set
    training_corpus = objects.Corpus(blank_dataset)

    for lemma in corpus:
        # case -> input phonology of that case form for this generation
        # (the original comment says this is meant to feed a realign step)
        phon_by_case = {}
        for case_name, case_form in lemma.cases.iteritems():
            phon_by_case[case_name] = case_form.input_phon[generation]
def conductGeneration(generation, corpus, previous_output):
    '''
    Sets up a network and a training corpus for one generation and
    registers every case form of every lemma with that corpus

        inputs
            generation (int) --- the number of the generation (not read by
                                 this body)
            corpus (array) --- the lemmas and their info from reading the
                               corpus file
            previous_output (dict) --- the output of the previous
                                       generation, looked up by each form's
                                       lemmacase

        outputs
            None. The expected outputs are stored in expected_outputs and
            printed once all forms have been added.
    '''
    print("Trial %s" % str(constants.trial))

    # Network and empty data set share the configured layer sizes
    n_in = constants.input_nodes
    n_out = constants.output_nodes
    network = buildNetwork(n_in, constants.hidden_nodes, n_out)
    blank_dataset = SupervisedDataSet(n_in, n_out)
    training_corpus = objects.Corpus(blank_dataset)

    # NOTE(review): expected_outputs is not created inside this function,
    # so it has to exist at module level -- confirm.
    for lemma in corpus:
        for _case, form in lemma.cases.iteritems():
            # Build the form's input tuple from its own syllables
            form.createInputTuple(form.syllables)

            # Carry the previous generation's output over as the target
            key = form.lemmacase
            expected_outputs[key] = previous_output[key]

            # Add words according to their frequencies
            training_corpus.addByFreq(form, previous_output)

    print(expected_outputs)
def conductGeneration(generation, corpus, previous_output):
    '''
    Conducts a generation of learning and testing on the input data

        Inputs
            generation (int) --- the number of the generation
            corpus (array) --- the lemmas and their info from reading the
                               corpus file
            previous_output (dict) --- the output phonology of the previous
                                       generation: one feature tuple per
                                       form, keyed by the form's lemmacase

        Returns
            results (dict) --- lemmacase -> smoothed output tuple of this
                               generation, i.e. the expected outputs for
                               the following generation

        Side effects
            Stores the decoded suffix in form.output_change[generation] for
            every tested form and prints a progress report, one line per
            tested form, and the share of matching phonemes.
    '''
    # Build the right size network and an empty training set to match
    net = buildNetwork(input_nodes, constants.hidden_nodes,
                       constants.output_nodes)
    training_corpus = objects.Corpus(
        SupervisedDataSet(input_nodes, constants.output_nodes))

    # Add every case form of every lemma according to its frequency, with
    # the previous generation's output as the target
    for lemma in corpus:
        for _case, form in lemma.cases.iteritems():
            training_corpus.addByFreq(constants.token_freq, form,
                                      previous_output[form.lemmacase])

    print('''--------Generation %s--------''' % generation)
    # NOTE(review): this banner is emitted as a single line; confirm
    # whether line breaks were intended between the fields.
    print("-----------Trial %s---------- Training on %d Epochs "
          "Vectors: %s Number of Input Nodes: %d "
          "Number of Hidden Nodes: %d Number of Output Nodes: %d "
          "Token Frequency taken into account: %s\n" % (
              constants.trial, constants.epochs, constants.vectors,
              input_nodes, constants.hidden_nodes, constants.output_nodes,
              constants.token_freq))

    # Construct the training set and the trainer
    print("Constructing the training set")
    training_set = training_corpus.constructTrainingSet()
    trainer = BackpropTrainer(net, training_set)

    # Train
    print("Training the model")
    trainer.trainEpochs(constants.epochs)
    print("Number of Tokens in Training Set: %s" % len(training_set))

    # For each word in the test set, calculate the output tuple
    print("Running the test set")
    results = {}

    # Previous phonemes equal to this all-0.5 chunk are left out of the
    # accuracy total (presumably they are padding -- confirm)
    placeholder = [0.5] * constants.n_feat
    ncorrect = 0
    tot_phon = 0

    for (form, input_tuple, expected_output) in training_corpus.test:
        # Activate the net, and smooth the output
        result = smooth(tuple(net.activate(input_tuple)))
        results[form.lemmacase] = result

        # One chunk of n_feat units per phoneme, for the new and the
        # previous output alike
        chunked_list = list(chunks(list(result), constants.n_feat))
        chunked_prev = list(
            chunks(list(previous_output[form.lemmacase]), constants.n_feat))

        phonemes = []
        for phon_index, phoneme in enumerate(chunked_list):
            # Indexing (rather than zip) keeps a length mismatch loud
            prev_phoneme = chunked_prev[phon_index]
            phonemes.append(constants.feat_to_phon[tuple(phoneme)])
            # If phoneme matches, add to number correct
            if prev_phoneme != placeholder:
                tot_phon += 1
                if phoneme == prev_phoneme:
                    ncorrect += 1

        # Output for this generation is the new suffix: undo the sequence
        # substitutions listed in functions.to_revert, then drop the '-'s
        new_suf = ''.join(phonemes)
        for seq, replacement in functions.to_revert.items():
            new_suf = new_suf.replace(seq, replacement)
        new_suf = new_suf.replace('-', '')
        form.output_change[generation] = new_suf

        print("%s %s %s %s %s %s %s" % (
            form.lemmacase, form.root + form.suffix,
            form.parent.declension, form.parent.gender,
            form.parent.totfreq, form.root + new_suf, new_suf))

    print("Results have been determined")
    # Two decimals in both cases; with nothing to score the rate is 0.00
    accuracy = float(ncorrect) / float(tot_phon) * 100 if tot_phon else 0.0
    print("Percentage correct in test run: %.2f" % accuracy)
    return results
def conductGeneration(generation, corpus, previous_output):
    '''
    Runs one generation: rebuilds each form's input from its output in the
    previous generation, trains a fresh network and decodes the test output

        inputs
            generation (int) --- the number of the generation
            corpus (array) --- the lemmas and their info from reading the
                               corpus file
            previous_output (dict) --- the output (gender, declension, case,
                                       number) of the previous generation
                                       (not read by this body)

        Returns
            results (dict) --- lemmacase -> smoothed output tuple of the
                               current generation--the expected outputs for
                               the following generation

        Side effects
            Writes form.input_change[generation] for every trained form and
            form.output_change[generation] for every tested form, and
            prints a progress report.
    '''
    # Network and empty training set, both sized from the configuration
    n_in = constants.input_nodes
    n_out = constants.output_nodes
    network = buildNetwork(n_in, constants.hidden_nodes, n_out)
    training_corpus = objects.Corpus(SupervisedDataSet(n_in, n_out))

    for lemma in corpus:
        # Lemmas with fewer than six case forms are treated as defective
        # and skipped
        if len(lemma.cases.keys()) < 6:
            continue

        for _case, form in lemma.cases.iteritems():
            # Of last generation's output only case and number are used
            (_prev_gender, _prev_dec, prev_case, prev_num,
             _prev_phon) = form.output_change[generation - 1]
            casenum_key = prev_case + prev_num

            # The form that was output last time supplies the new syllables
            syllables = lemma.cases[casenum_key].syllables
            form.input_change[generation] = (
                casenum_key, ''.join(syllables).replace('-', ''))
            form.createInputTuple(syllables)

            # Add words according to their frequencies
            # NOTE(review): expected_outputs is not defined in this function
            # and previous_output is never used -- confirm which one is the
            # intended source of targets.
            training_corpus.addByFreq(constants.token_freq, form,
                                      expected_outputs[form.lemmacase])

    print("--------Generation %s--------" % generation)
    if generation >= constants.gnvdrop_generation:
        print("Genitive Case Dropped")

    print("Constructing the training set")
    training_set = training_corpus.constructTrainingSet()
    trainer = BackpropTrainer(network, training_set)

    # A single epoch goes through train(), anything else through
    # trainEpochs()
    print("Training the model")
    error = (trainer.train() if constants.epochs == 1
             else trainer.trainEpochs(constants.epochs))
    print("Number of Tokens in Training Set: %s" % len(training_set))
    print("Training Error: %s" % error)

    results = {}
    print("Running the test set")
    for (form, input_tuple, expected_output) in training_corpus.test:
        # The genitive is dropped from the given generation onwards
        drop_genitive = generation >= constants.gnvdrop_generation

        # Activate the net, and smooth the output
        activation = tuple(network.activate(input_tuple))
        result = smooth(activation, gendrop=drop_genitive,
                        hierarchy=constants.hierarchy)
        results[form.lemmacase] = result

        # Decode the three segments of the output tuple
        gender = constants.tup_to_gen[
            tuple(result[constants.gen_b:constants.dec_b])]
        dec = constants.tup_to_dec[
            tuple(result[constants.dec_b:constants.casenum_b])]
        casenum = constants.tup_to_case[tuple(result[constants.casenum_b:])]

        # Phonology of the sibling form that carries the predicted case/number
        phonology = form.parent_lemma.cases[casenum].phonology

        # First three characters of casenum are the case, the rest the number
        form.output_change[generation] = (
            gender, dec, casenum[:3], casenum[3:], phonology)

    print("Results have been determined")
    return results
def conductGeneration(generation, corpus, previous_output):
    '''
    Conducts a generation of learning and testing on the input data

        Inputs
            generation (int) --- the number of the generation
            corpus (array) --- the lemmas and their info from reading the
                               corpus file
            previous_output (dict) --- the output of the previous
                                       generation, keyed by each form's
                                       lemmacase

        Returns
            results (dict) --- lemmacase -> smoothed output tuple of the
                               current generation--the expected outputs for
                               the following generation

        Side effects
            Stores the suffix looked up for each result in
            form.output_change[generation] and prints a progress report,
            one line per tested form, and the share of unchanged outputs.
    '''
    # Build the right size network and an empty training set to match
    net = buildNetwork(input_nodes, constants.hidden_nodes,
                       constants.output_nodes)
    training_corpus = objects.Corpus(
        SupervisedDataSet(input_nodes, constants.output_nodes))

    # Add every case form of every lemma according to its frequency, with
    # the previous generation's output as the target
    for lemma in corpus:
        for _case, form in lemma.cases.iteritems():
            training_corpus.addByFreq(constants.token_freq, form,
                                      previous_output[form.lemmacase])

    # Construct the training set and the trainer
    print("--------Generation %s--------" % generation)
    print("Constructing the training set")
    training_set = training_corpus.constructTrainingSet()
    trainer = BackpropTrainer(net, training_set)

    # Train
    print("Training the model")
    trainer.trainEpochs(constants.epochs)
    print("Number of Tokens in Training Set: %s" % len(training_set))

    # For each word in the test set, calculate the output tuple
    print("Running the test set")
    results = {}
    ncorrect = 0
    for (form, input_tuple, expected_output) in training_corpus.test:
        # Activate the net, and smooth the output
        result = smooth(tuple(net.activate(input_tuple)), suf_dict)
        results[form.lemmacase] = result

        # Count outputs that are unchanged from the previous generation
        if result == previous_output[form.lemmacase]:
            ncorrect += 1

        # Hash the output tuple to get the suffix result
        new_suf = inv_suf[result]
        form.output_change[generation] = new_suf

        print("%s %s %s %s %s %s %s" % (
            form.lemmacase, form.root + form.suffix,
            form.parent.declension, form.parent.gender,
            form.parent.totfreq, form.root + new_suf, new_suf))

    print("Results have been determined")
    # The rate is relative to the size of previous_output. Guard the empty
    # case instead of dividing by zero, and print two decimals as the
    # rounding to 2 places intended.
    total = len(previous_output)
    accuracy = float(ncorrect) / float(total) * 100 if total else 0.0
    print("Percentage correct in test run: %.2f" % accuracy)
    return results
def conductGeneration(generation, corpus, previousOutput):
    '''
    Conducts a generation of learning and testing on the input data

        inputs
            generation (int) --- the number of the generation
            corpus (array) --- the output of reading the corpus file; each
                               token's cases yields (case, word) pairs
            previousOutput (list) --- (word description, output tuple) pairs
                                      from the previous generation, i.e. the
                                      same shape this function returns

        outputs
            results (list) --- one (word description, smoothed output tuple)
                               pair per item of the test set

        raises
            ValueError --- if a word of the corpus has no entry in
                           previousOutput

        side effects
            Increments counterBag.totalCounter once per test item, appends
            to genchange (defined outside this function) and to each word's
            own genchange, and prints a progress report.
    '''
    print("Trial %s" % str(constants.trial))

    # If we're using slavic data, the input vector has a different size
    slavic_active = (constants.includeSlavic and
                     generation >= constants.generationToIntroduceSlavic)
    input_size = (constants.inputNodesSlav if slavic_active
                  else constants.inputNodes)

    # Build the right size network, training set and corpus object
    net = buildNetwork(input_size, constants.hiddenNodes,
                       constants.outputNodes)
    trainingCorpus = objects.Corpus(
        SupervisedDataSet(input_size, constants.outputNodes))

    # Index the previous results once instead of rescanning the whole list
    # for every word. setdefault keeps the first entry per description,
    # which is the entry a list.index() search would have found.
    previous_by_description = {}
    for (description, previous_result) in previousOutput:
        previous_by_description.setdefault(description, previous_result)

    # Iterate through tokens passed to the function, and through their cases
    for token in corpus:
        for (_case, word) in token.cases:
            # Set its syllables, based on the generation (i.e. account for
            # sound changes)
            word.setSyllables(generation, word.syllables)

            # NOTE(review): this dumps the full previous output once per
            # word; it looks like leftover debugging output -- confirm
            # before removing.
            print(previousOutput)

            # Extract this word's result from the previous generation
            try:
                previousResult = previous_by_description[word.description]
            except KeyError:
                raise ValueError(
                    "%r is not in previousOutput" % (word.description,))

            # Adds words according to their frequencies
            trainingCorpus.configure(word, previousResult, generation)

    # Construct the training set and the trainer
    trainingSet = trainingCorpus.constructTrainingSet()
    trainer = BackpropTrainer(net, trainingSet)

    # A single epoch goes through train(), anything else through
    # trainEpochs()
    if constants.epochs == 1:
        error = trainer.train()
    else:
        error = trainer.trainEpochs(constants.epochs)

    should_drop_gen = generation >= constants.generationToDropGen

    print("--------Generation: %s--------" % generation)
    if should_drop_gen:
        print("Genitive Case Dropped")
    if slavic_active:
        print("Slavic Information Introduced")
    print("Number of Training Epochs: %s" % constants.epochs)
    print("Number of Training Tokens: %s" % len(trainingSet))
    print("Training Error: %s" % error)

    # Fixed positions of the four fields inside the output tuple
    # NOTE(review): the original comment says these should depend on
    # whether the genitive has been dropped, but they never change.
    gender_bits = slice(0, 3)
    declension_bits = slice(3, 8)
    case_bits = slice(8, 11)
    number_bits = slice(11, 13)

    results = []
    for (word, inputTuple, expectedOutput, trueLatinGender,
         trueRomanianGender) in trainingCorpus.test:
        # Count how many tokens are in the test set
        counterBag.totalCounter.increment()

        # Activate the net, and smooth the output
        result = smooth(tuple(net.activate(inputTuple)),
                        gendrop=should_drop_gen, equalcase=True)
        results.append((word.description, result))

        generation_number = counterBag.generationCounter.value
        latin_gender = word.parentToken.latinGender[0]

        # In the first generation, seed the history with a generation-0
        # entry holding the Latin gender
        if generation_number == 1:
            genchange[word.description].append((0, latin_gender))

        # Hash the output tuple to get the result
        gender = constants.tup_to_gen[tuple(result[gender_bits])]
        declension = constants.tup_to_dec[tuple(result[declension_bits])]
        case = constants.tup_to_case[tuple(result[case_bits])]
        num = constants.tup_to_num[tuple(result[number_bits])]

        genchange[word.description].append(
            (generation_number, gender + declension + case + num,
             latin_gender, word.parentToken.declension, word.case,
             word.num))
        word.genchange[generation_number] = (gender, declension, case, num)

    return results
def conductGeneration(generation, corpus, previous_output):
    '''
    Conducts a generation of learning and testing on the input data

        Inputs
            generation (int) --- the number of the generation
            corpus (array) --- the lemmas and their info from reading the
                               corpus file
            previous_output (dict) --- the output phonology of the previous
                                       generation: one feature tuple per
                                       form, keyed by the form's lemmacase

        Returns
            results (dict) --- lemmacase -> smoothed output tuple of the
                               current generation--the expected outputs for
                               the following generation

        Side effects
            Builds each form's input tuple, stores the decoded phonology
            (without '-') in form.output_change[generation] and prints a
            progress report, one line per tested form, and the share of
            matching phonemes.
    '''
    # Number of output units that encode one phoneme
    # NOTE(review): hard-coded here; confirm whether this should come from
    # the shared configuration instead.
    units_per_phoneme = 12

    # Build the right size network and an empty training set to match
    net = buildNetwork(input_nodes, constants.hidden_nodes,
                       constants.output_nodes)
    training_corpus = objects.Corpus(
        SupervisedDataSet(input_nodes, constants.output_nodes))

    # Iterate through every case form of every lemma
    for lemma in corpus:
        for _case, form in lemma.cases.iteritems():
            # Create the input tuple
            form.createInputTuple(input_nodes, root_size)
            # Add words according to their frequencies, with the previous
            # generation's output as the target
            training_corpus.addByFreq(constants.token_freq, form,
                                      previous_output[form.lemmacase])

    print("--------Generation %s--------" % generation)

    # Construct the training set and the trainer
    print("Constructing the training set")
    training_set = training_corpus.constructTrainingSet()
    trainer = BackpropTrainer(net, training_set)

    # Train
    print("Training the model")
    error = trainer.trainEpochs(constants.epochs)
    print("Number of Tokens in Training Set: %s" % len(training_set))
    print("Training Error: %s" % error)

    # For each word in the test set, calculate the output tuple
    print("Running the test set")
    results = {}

    # The genitive is dropped from the given generation onwards
    drop_gen = generation >= constants.gnvdrop_generation

    # Counters for correct phonemes. Previous phonemes equal to the
    # all-0.5 chunk (the -'s) are excluded from the total.
    placeholder = [0.5] * units_per_phoneme
    ncorrect = 0
    tot_phon = 0

    for (form, input_tuple, expected_output) in training_corpus.test:
        # Activate the net, and smooth the output
        result = smooth(tuple(net.activate(input_tuple)), gendrop=drop_gen,
                        hierarchy=constants.hierarchy)
        results[form.lemmacase] = result

        # One chunk per phoneme, for the new and the previous output alike
        chunked_list = list(chunks(list(result), units_per_phoneme))
        chunked_prev = list(chunks(list(previous_output[form.lemmacase]),
                                   units_per_phoneme))

        phonemes = []
        for phon_index, phoneme in enumerate(chunked_list):
            # Indexing (rather than zip) keeps a length mismatch loud
            prev_phoneme = chunked_prev[phon_index]
            phonemes.append(constants.feat_to_phon[tuple(phoneme)])
            # If phoneme matches, add to number correct
            if prev_phoneme != placeholder:
                tot_phon += 1
                if phoneme == prev_phoneme:
                    ncorrect += 1
        new_phonology = ''.join(phonemes)

        print("%s %s %s %s" % (form.lemmacase, form.parent.declension,
                               form.parent.gender, new_phonology))

        # The stored output drops the '-' placeholders
        form.output_change[generation] = new_phonology.replace('-', '')

    print("Results have been determined")
    # With nothing to score, report 0.00 instead of dividing by zero
    accuracy = float(ncorrect) / float(tot_phon) * 100 if tot_phon else 0.0
    print("Percentage correct in test run: {:.2f}".format(accuracy))
    return results