Exemplos de Corpus em Python, exemplos de objects.Corpus em Python

Exemplo n.º 1

0

Exibir arquivo

def conductGeneration(generation, corpus, previous_output):
    '''
    Set up the network and training corpus for one generation of learning.

    Inputs
        generation (int) --- the number of the generation; used here to index
            each form's input_phon
        corpus (array) --- the lemmas and their info from reading the corpus file
        previous_output (dict) --- the output phonology of the previous
            generation (not referenced in the visible part of the body)

    NOTE(review): the original docstring states that the output of the current
    generation is returned, but this excerpt ends before any training, testing
    or return statement -- confirm against the full source.
    '''

    # Build the right size network
    # NOTE(review): output_nodes is a bare name here (not constants.output_nodes);
    # presumably a module-level variable -- verify it is defined.
    net = buildNetwork(constants.input_nodes, constants.hidden_nodes,
                       output_nodes)

    # Build an empty data set whose dimensions match the network
    emptytraining_set = SupervisedDataSet(constants.input_nodes, output_nodes)

    # Wrap the empty data set in the project's Corpus object
    training_corpus = objects.Corpus(emptytraining_set)

    # Iterate through the lemmas of the corpus
    for lemma in corpus:

        # Map every case of the lemma to its input phonology for this generation.
        # iteritems() implies lemma.cases is a Python 2 dict (or dict-like).
        # NOTE(review): case_dict is not used in the visible excerpt; the
        # original comment says it is fed to a "realign function" further down.
        case_dict = {
            case: form.input_phon[generation]
            for case, form in lemma.cases.iteritems()
        }

Exemplo n.º 2

0

Exibir arquivo

def conductGeneration(generation, corpus, previous_output):
    '''
    Set up the network and fill the training corpus for one generation.

    Inputs
        generation (int) --- the number of the generation (not referenced in
            the visible part of the body)
        corpus (array) --- the lemmas and their info from reading the corpus file
        previous_output (dict) --- the output (gender, declension, case, number)
            of the previous generation, keyed by each form's lemmacase
    Outputs
        Nothing is returned by the visible code; it prints the trial number and
        the expected_outputs mapping.

    NOTE(review): this excerpt appears to stop before training and testing --
    confirm against the full source.
    '''

    # Python 2 print statement: report which trial is running
    print "Trial %s" % str(constants.trial)

    # Build the right size network
    net = buildNetwork(constants.input_nodes, constants.hidden_nodes,
                       constants.output_nodes)

    # Build an empty data set whose dimensions match the network
    emptytraining_set = SupervisedDataSet(constants.input_nodes,
                                          constants.output_nodes)

    # Wrap the empty data set in the project's Corpus object
    # NOTE(review): expected_outputs (written below) is never created in this
    # block; presumably a module-level dict -- verify, otherwise the first
    # assignment raises NameError.
    training_corpus = objects.Corpus(emptytraining_set)

    # Iterate through the lemmas of the corpus
    for lemma in corpus:

        # Iterate through cases (iteritems() implies a Python 2 dict)
        for case, form in lemma.cases.iteritems():

            # Create the input tuple from the form's own syllables
            form.createInputTuple(form.syllables)

            # Carry the previous generation's output over as the expected output
            expected_outputs[form.lemmacase] = previous_output[form.lemmacase]

            # Add words according to their frequencies
            # (the whole previous_output dict is passed, not a single entry)
            training_corpus.addByFreq(form, previous_output)

    # Dump the expected outputs collected above
    print expected_outputs

Exemplo n.º 3

0

Exibir arquivo

def conductGeneration(generation, corpus, previous_output):
    '''
    Conducts a generation of learning and testing on the input data

    Inputs
        generation (int) --- the number of the generation
        corpus (array) --- the lemmas and their info from reading the corpus file
        previous_output (dict) --- the output phonology of the previous
            generation, keyed by each form's lemmacase; used both as training
            target and as reference for the accuracy count
    Returns
        dict mapping each tested form's lemmacase to its smoothed output tuple
        (the expected outputs for the following generation)
    Side effects
        Stores the new suffix in form.output_change[generation] for every
        tested form and prints progress information.
    '''

    # Build the right size network
    # NOTE(review): input_nodes is a bare name; presumably a module-level
    # variable -- verify it is defined.
    net = buildNetwork(input_nodes, constants.hidden_nodes,
                       constants.output_nodes)

    # Build the right size (empty) training set and wrap it in a corpus object
    training_corpus = objects.Corpus(
        SupervisedDataSet(input_nodes, constants.output_nodes))

    # Add every case form of every lemma according to its frequency.
    # .values() works on Python 2 and 3 alike (the case key was never used).
    for lemma in corpus:
        for form in lemma.cases.values():
            training_corpus.addByFreq(constants.token_freq, form,
                                      previous_output[form.lemmacase])

    # Report the run parameters (single-argument print() is valid on 2 and 3)
    print('''--------Generation %s--------''' % generation)
    print('''-----------Trial %s----------
        Training on %d Epochs
        Vectors: %s
        Number of Input Nodes: %d
        Number of Hidden Nodes: %d
        Number of Output Nodes: %d
        Token Frequency taken into account: %s\n''' % (
        constants.trial, constants.epochs, constants.vectors, input_nodes,
        constants.hidden_nodes, constants.output_nodes, constants.token_freq))

    # Construct the training set
    print("Constructing the training set")
    training_set = training_corpus.constructTrainingSet()

    # Construct the trainer
    trainer = BackpropTrainer(net, training_set)

    # Train; the return value was never used here, so it is not bound
    print("Training the model")
    trainer.trainEpochs(constants.epochs)

    print("Number of Tokens in Training Set: %s" % len(training_set))

    results = {}

    # For each word in the test set, calculate output tuple
    print("Running the test set")

    # Counters for phoneme-level accuracy against the previous generation
    ncorrect = 0
    tot_phon = 0

    # One phoneme is encoded by constants.n_feat consecutive units; a chunk of
    # all 0.5 in the previous output is excluded from the accuracy count
    n_feat = constants.n_feat
    blank_phoneme = [0.5] * n_feat

    for (form, input_tuple, _expected_output) in training_corpus.test:

        # Activate the net, and smooth the output
        result = smooth(tuple(net.activate(input_tuple)))

        # Record the output tuple for this form
        results[form.lemmacase] = result

        # Divide the new and the previous output tuple into phoneme chunks
        chunked_list = list(chunks(list(result), n_feat))
        chunked_prev = list(
            chunks(list(previous_output[form.lemmacase]), n_feat))

        # Decode each chunk to a phoneme and count matches with the previous
        # generation (indexing chunked_prev keeps the IndexError on a length
        # mismatch instead of silently truncating)
        phonemes = []
        for phon_index, phoneme in enumerate(chunked_list):
            prev_phoneme = chunked_prev[phon_index]

            phonemes.append(constants.feat_to_phon[tuple(phoneme)])
            if prev_phoneme != blank_phoneme:
                tot_phon += 1
                if phoneme == prev_phoneme:
                    ncorrect += 1

        # Output for this generation is the new suffix: apply the to_revert
        # substitutions, then strip the '-' padding
        new_suf = ''.join(phonemes)
        for seq, replacement in functions.to_revert.items():
            new_suf = new_suf.replace(seq, replacement)
        new_suf = new_suf.replace('-', '')

        form.output_change[generation] = new_suf

        print("%s %s %s %s %s %s %s" % (
            form.lemmacase, form.root + form.suffix, form.parent.declension,
            form.parent.gender, form.parent.totfreq, form.root + new_suf,
            new_suf))

    print("Results have been determined")

    # Report accuracy with two decimals on both paths; 0.00 when nothing
    # was counted (avoids dividing by zero)
    if tot_phon:
        percent_correct = float(ncorrect) / float(tot_phon) * 100
    else:
        percent_correct = 0.0
    print("Percentage correct in test run: %.2f" % percent_correct)

    return results

Exemplo n.º 4

0

Exibir arquivo

Arquivo: main.py Projeto: tylerlau07/romance_nominal_change

def conductGeneration(generation, corpus, previous_output):
    '''
    Conducts a generation of learning and testing on the input data

    Inputs
        generation (int) --- the number of the generation
        corpus (array) --- the lemmas and their info from reading the corpus file
        previous_output (dict) --- the output (gender, declension, case, number)
            of the previous generation (not referenced in this body; the
            previous result is read from form.output_change[generation - 1])
    Returns
        dict mapping each tested form's lemmacase to its smoothed output tuple
        (the expected outputs for the following generation)
    Side effects
        Updates form.input_change[generation] and form.output_change[generation]
        and prints progress information.
    '''

    # Build the right size network
    net = buildNetwork(constants.input_nodes, constants.hidden_nodes,
                       constants.output_nodes)

    # Build the right size (empty) training set and wrap it in a corpus object
    training_corpus = objects.Corpus(
        SupervisedDataSet(constants.input_nodes, constants.output_nodes))

    for lemma in corpus:
        # JUST SKIP THOSE THAT ARE DEFECTIVE (fewer than six case forms)
        if len(lemma.cases) < 6:
            continue

        # .values() works on Python 2 and 3 alike (the case key was never used)
        for form in lemma.cases.values():
            # The previous output is a 5-tuple; only case and number are needed
            _, _, new_case, new_num, _ = form.output_change[generation - 1]

            # Use the syllables of the form the previous generation predicted
            new_syllables = lemma.cases[new_case + new_num].syllables

            # Record which form is used as input this generation
            form.input_change[generation] = (
                new_case + new_num,
                ''.join(new_syllables).replace('-', ''))

            # Create the input tuple
            form.createInputTuple(new_syllables)

            # Add words according to their frequencies
            # NOTE(review): expected_outputs is not defined in this block;
            # presumably a module-level dict -- verify.
            training_corpus.addByFreq(constants.token_freq, form,
                                      expected_outputs[form.lemmacase])

    # The genitive is dropped from a fixed generation onwards; this does not
    # change inside the function, so compute it once
    drop_gen = generation >= constants.gnvdrop_generation

    # Print information
    print("--------Generation %s--------" % generation)
    if drop_gen:
        print("Genitive Case Dropped")

    # Construct the training set
    print("Constructing the training set")
    training_set = training_corpus.constructTrainingSet()

    # Construct the trainer
    trainer = BackpropTrainer(net, training_set)

    # Train one epoch at a time and keep the error of the last epoch:
    # trainer.trainEpochs() returns nothing, so binding its result made the
    # "Training Error" line print None whenever epochs != 1
    print("Training the model")
    error = None
    for _ in range(constants.epochs):
        error = trainer.train()
    print("Number of Tokens in Training Set: %s" % len(training_set))
    print("Training Error: %s" % error)

    results = {}

    # For each word in the test set
    print("Running the test set")
    for (form, input_tuple, _expected_output) in training_corpus.test:

        # Activate the net, and smooth the output
        result = smooth(tuple(net.activate(input_tuple)),
                        gendrop=drop_gen,
                        hierarchy=constants.hierarchy)

        # Record the output tuple for this form
        results[form.lemmacase] = result

        # Decode the gender, declension and case+number slices of the output
        gender = constants.tup_to_gen[tuple(
            result[constants.gen_b:constants.dec_b])]
        dec = constants.tup_to_dec[tuple(
            result[constants.dec_b:constants.casenum_b])]
        casenum = constants.tup_to_case[tuple(result[constants.casenum_b:])]
        output = form.parent_lemma.cases[casenum].phonology

        # Store the prediction: the first three characters of casenum are the
        # case, the remainder is the number
        form.output_change[generation] = (gender, dec, casenum[0:3],
                                          casenum[3:], output)

    print("Results have been determined")

    return results

Exemplo n.º 5

0

Exibir arquivo

Arquivo: main.py Projeto: tylerlau07/romance_nominal_change

def conductGeneration(generation, corpus, previous_output):
    '''
    Conducts a generation of learning and testing on the input data

    Inputs
        generation (int) --- the number of the generation
        corpus (array) --- the lemmas and their info from reading the corpus file
        previous_output (dict) --- the output of the previous generation, keyed
            by each form's lemmacase; used both as training target and as
            reference for the accuracy count
    Returns
        dict mapping each tested form's lemmacase to its smoothed output tuple
        (the expected outputs for the following generation)
    Side effects
        Stores the new suffix in form.output_change[generation] for every
        tested form and prints progress information.
    '''

    # Build the right size network
    # NOTE(review): input_nodes, suf_dict and inv_suf are bare names here;
    # presumably module-level variables -- verify they are defined.
    net = buildNetwork(input_nodes, constants.hidden_nodes,
                       constants.output_nodes)

    # Build the right size (empty) training set and wrap it in a corpus object
    training_corpus = objects.Corpus(
        SupervisedDataSet(input_nodes, constants.output_nodes))

    # Add every case form of every lemma according to its frequency.
    # .values() works on Python 2 and 3 alike (the case key was never used).
    for lemma in corpus:
        for form in lemma.cases.values():
            training_corpus.addByFreq(constants.token_freq, form,
                                      previous_output[form.lemmacase])

    # Construct the training set
    print("--------Generation %s--------" % generation)
    print("Constructing the training set")
    training_set = training_corpus.constructTrainingSet()

    # Construct the trainer
    trainer = BackpropTrainer(net, training_set)

    # Train; the return value was never used here, so it is not bound
    print("Training the model")
    trainer.trainEpochs(constants.epochs)

    print("Number of Tokens in Training Set: %s" % len(training_set))

    results = {}

    # For each word in the test set, calculate output tuple
    print("Running the test set")

    # Number of forms whose output equals the previous generation's output
    ncorrect = 0

    for (form, input_tuple, _expected_output) in training_corpus.test:

        # Activate the net, and smooth the output
        result = smooth(tuple(net.activate(input_tuple)), suf_dict)

        # Record the output tuple for this form
        results[form.lemmacase] = result

        # Add to ncorrect if matches previous
        if result == previous_output[form.lemmacase]:
            ncorrect += 1

        # Look the output tuple up to get the suffix result
        new_suf = inv_suf[result]

        form.output_change[generation] = new_suf

        print("%s %s %s %s %s %s %s" % (
            form.lemmacase, form.root + form.suffix, form.parent.declension,
            form.parent.gender, form.parent.totfreq, form.root + new_suf,
            new_suf))

    print("Results have been determined")

    # The percentage is taken over all entries of previous_output; report
    # 0.00 instead of dividing by zero when there are none
    n_previous = len(previous_output)
    if n_previous:
        percent_correct = float(ncorrect) / float(n_previous) * 100
    else:
        percent_correct = 0.0
    print("Percentage correct in test run: %.2f" % percent_correct)

    return results

Exemplo n.º 6

0

Exibir arquivo

Arquivo: main.py Projeto: tylerlau07/romance_nominal_change

def conductGeneration(generation, corpus, previousOutput):
    '''
    Conducts a generation of learning and testing on the input data

    Inputs
        generation (int) --- the number of the generation
        corpus (array) --- the output of reading the corpus file
        previousOutput (list) --- the output of the previous generation, as
            (word description, result) pairs
    Returns
        list of (word description, smoothed output tuple) pairs, one per entry
        of the test set
    Raises
        ValueError --- if a word's description does not occur in previousOutput
    Side effects
        Increments counterBag.totalCounter, appends to the genchange mapping,
        updates word.genchange and prints progress information.
        NOTE(review): counterBag and genchange are not defined in this block;
        presumably module-level objects -- verify.
    '''

    print("Trial %s" % str(constants.trial))

    # If we're using slavic data, modify the expected size of the input vector.
    useSlavic = (constants.includeSlavic
                 and generation >= constants.generationToIntroduceSlavic)
    input_size = constants.inputNodesSlav if useSlavic else constants.inputNodes

    # The genitive is dropped from a fixed generation onwards
    shouldDropGen = generation >= constants.generationToDropGen

    # Build the right size network
    net = buildNetwork(input_size, constants.hiddenNodes,
                       constants.outputNodes)

    # Build the right size (empty) training set and wrap it in a corpus object
    trainingCorpus = objects.Corpus(
        SupervisedDataSet(input_size, constants.outputNodes))

    # Index the previous results by word description once, instead of
    # rebuilding a list and scanning it for every word. setdefault keeps the
    # first entry for a description, matching what list.index() returned.
    previousByDescription = {}
    for (wordinfo, previousResult) in previousOutput:
        previousByDescription.setdefault(wordinfo, previousResult)

    # Iterate through tokens passed to the function
    for token in corpus:

        # Iterate through cases
        # NOTE(review): token.cases is iterated directly, so it is assumed to
        # yield (case, word) pairs -- confirm.
        for (case, word) in token.cases:
            # Set its syllables, based on the generation (i.e. account for sound changes)
            word.setSyllables(generation, word.syllables)

            # NOTE(review): this dumps the whole previous output once per word;
            # it looks like leftover debugging and is kept only so the program
            # output does not change.
            print(previousOutput)

            # Extract the result from the previous generation; a missing
            # description still surfaces as ValueError, as list.index() did
            try:
                previousResult = previousByDescription[word.description]
            except KeyError:
                raise ValueError(
                    "%r is not in previousOutput" % (word.description,))

            # Adds words according to their frequencies
            trainingCorpus.configure(word, previousResult, generation)

    # Construct the training set
    trainingSet = trainingCorpus.constructTrainingSet()

    # Construct the trainer
    trainer = BackpropTrainer(net, trainingSet)

    # Train one epoch at a time and keep the error of the last epoch:
    # trainer.trainEpochs() returns nothing, so binding its result made the
    # "Training Error" line print None whenever epochs != 1
    error = None
    for _ in range(constants.epochs):
        error = trainer.train()

    print("--------Generation: %s--------" % generation)
    if shouldDropGen:
        print("Genitive Case Dropped")

    if useSlavic:
        print("Slavic Information Introduced")

    print("Number of Training Epochs: %s" % constants.epochs)
    print("Number of Training Tokens: %s" % len(trainingSet))
    print("Training Error: %s" % error)

    results = []

    # Slice bounds of the gender, declension, case and number fields in the
    # output tuple; they are fixed, so they are set once outside the loop
    (gen_b, gen_e, dec_b, dec_e, case_b, case_e, num_b,
     num_e) = (0, 3, 3, 8, 8, 11, 11, 13)

    # For each word in the test set
    for (word, inputTuple, expectedOutput, trueLatinGender,
         trueRomanianGender) in trainingCorpus.test:

        # Count how many tokens are in the test set
        counterBag.totalCounter.increment()

        # Activate the net, and smooth the output
        result = smooth(tuple(net.activate(inputTuple)),
                        gendrop=shouldDropGen,
                        equalcase=True)

        # Append output tuple to result
        results.append((word.description, result))

        generationCount = counterBag.generationCounter.value

        # If this is the first generation, seed the history with the
        # first character of the word's Latin gender
        if generationCount == 1:
            genchange[word.description].append(
                (0, word.parentToken.latinGender[0]))

        # Decode the output tuple field by field
        gender = constants.tup_to_gen[tuple(result[gen_b:gen_e])]
        declension = constants.tup_to_dec[tuple(result[dec_b:dec_e])]
        case = constants.tup_to_case[tuple(result[case_b:case_e])]
        num = constants.tup_to_num[tuple(result[num_b:num_e])]

        # Record the prediction next to the word's original properties
        to_add = (generationCount,
                  gender + declension + case + num,
                  word.parentToken.latinGender[0], word.parentToken.declension,
                  word.case, word.num)

        genchange[word.description].append(to_add)
        word.genchange[generationCount] = (gender, declension, case, num)

    return results

Exemplo n.º 7

0

Exibir arquivo

Arquivo: main.py Projeto: tylerlau07/romance_nominal_change

def conductGeneration(generation, corpus, previous_output):
    '''
    Conducts a generation of learning and testing on the input data

    Inputs
        generation (int) --- the number of the generation
        corpus (array) --- the lemmas and their info from reading the corpus file
        previous_output (dict) --- the output phonology of the previous
            generation, keyed by each form's lemmacase; used both as training
            target and as reference for the accuracy count
    Returns
        dict mapping each tested form's lemmacase to its smoothed output tuple
        (the expected outputs for the following generation)
    Side effects
        Creates each form's input tuple, stores the new phonology in
        form.output_change[generation] and prints progress information.
    '''
    # Number of units that encode one phoneme in the output tuple
    phoneme_width = 12

    # Build the right size network
    # NOTE(review): input_nodes and root_size are bare names here; presumably
    # module-level variables -- verify they are defined.
    net = buildNetwork(input_nodes, constants.hidden_nodes,
                       constants.output_nodes)

    # Build the right size (empty) training set and wrap it in a corpus object
    training_corpus = objects.Corpus(
        SupervisedDataSet(input_nodes, constants.output_nodes))

    # .values() works on Python 2 and 3 alike (the case key was never used)
    for lemma in corpus:
        for form in lemma.cases.values():

            # Create the input tuple
            form.createInputTuple(input_nodes, root_size)

            # Add words according to their frequencies
            training_corpus.addByFreq(constants.token_freq, form,
                                      previous_output[form.lemmacase])

    # Print information
    print("--------Generation %s--------" % generation)

    # Construct the training set
    print("Constructing the training set")
    training_set = training_corpus.constructTrainingSet()

    # Construct the trainer
    trainer = BackpropTrainer(net, training_set)

    # Train one epoch at a time and keep the error of the last epoch:
    # trainer.trainEpochs() returns nothing, so binding its result made the
    # "Training Error" line always print None
    print("Training the model")
    error = None
    for _ in range(constants.epochs):
        error = trainer.train()

    print("Number of Tokens in Training Set: %s" % len(training_set))
    print("Training Error: %s" % error)

    results = {}

    # For each word in the test set, calculate output tuple
    print("Running the test set")

    # Counter to count correct. Exclude -'s from total phonemes
    ncorrect = 0
    tot_phon = 0

    # Both values are the same for every test item, so compute them once
    drop_gen = generation >= constants.gnvdrop_generation
    blank_phoneme = [0.5] * phoneme_width

    for (form, input_tuple, _expected_output) in training_corpus.test:

        # Activate the net, and smooth the output
        result = smooth(tuple(net.activate(input_tuple)),
                        gendrop=drop_gen,
                        hierarchy=constants.hierarchy)

        # Record the output tuple for this form
        results[form.lemmacase] = result

        # Divide the new and the previous output tuple into phoneme chunks
        chunked_list = list(chunks(list(result), phoneme_width))
        chunked_prev = list(
            chunks(list(previous_output[form.lemmacase]), phoneme_width))

        # Decode each chunk to a phoneme and count matches with the previous
        # generation (indexing chunked_prev keeps the IndexError on a length
        # mismatch instead of silently truncating)
        phonemes = []
        for phon_index, phoneme in enumerate(chunked_list):
            prev_phoneme = chunked_prev[phon_index]

            phonemes.append(constants.feat_to_phon[tuple(phoneme)])
            # Chunks that were all 0.5 in the previous output are not counted
            if prev_phoneme != blank_phoneme:
                tot_phon += 1
                if phoneme == prev_phoneme:
                    ncorrect += 1

        new_phonology = ''.join(phonemes)

        print("%s %s %s %s" % (form.lemmacase, form.parent.declension,
                               form.parent.gender, new_phonology))

        # Store the new phonology without the '-' padding
        form.output_change[generation] = new_phonology.replace('-', '')

    print("Results have been determined")

    # Report 0.00 instead of dividing by zero when no phoneme was counted
    if tot_phon:
        percent_correct = float(ncorrect) / float(tot_phon) * 100
    else:
        percent_correct = 0.0
    print("Percentage correct in test run: {:.2f}".format(percent_correct))

    return results