Example #1
def evaluateJamspell(modelFile, testText, alphabetFile, maxWords=50000):
    utils.loadAlphabet(alphabetFile)
    corrector = JamspellCorrector(modelFile)
    random.seed(42)
    originalText = loadText(testText)
    erroredText = generateTypos(originalText)
    assert len(originalText) == len(erroredText)
    originalSentences = generateSentences(originalText)
    erroredSentences = generateSentences(erroredText)
    errorsRate, fixRate, broken, topNerr, topNfix, execTime = \
        evaluateCorrector('jamspell', corrector, originalSentences, erroredSentences, maxWords)
    return errorsRate, fixRate, broken, topNerr, topNfix
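For orientation, here is a minimal sketch of how this helper might be invoked from a driver script; the file paths, the __main__ guard, and the summary print are illustrative assumptions, not part of the original example.

# Hypothetical driver; the paths below are placeholders.
if __name__ == '__main__':
    errorsRate, fixRate, broken, topNerr, topNfix = evaluateJamspell(
        modelFile='jamspell_model.bin',   # assumed JamSpell language model file
        testText='test_corpus.txt',       # assumed plain-text evaluation corpus
        alphabetFile='alphabet.txt',      # assumed alphabet definition file
        maxWords=50000)
    print('error rate: %.2f%%, fix rate: %.2f%%' %
          (100.0 * errorsRate, 100.0 * fixRate))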
Example #2
    def train(self, trainFile):
        print '[info] loading text'
        text = loadText(trainFile)
        sentences = generateSentences(text)
        sentences = self.convertToIDs(sentences)

        print '[info] generating N-grams', len(sentences)
        total = len(sentences)
        lastTime = time.time()
        for i in xrange(0, total):
            sentence = sentences[i]
            for w in sentence:
                self.gram1[w] += 1      # unigram counts
                self.totalWords += 1
            for j in xrange(len(sentence) - 1):
                self.gram2[(sentence[j], sentence[j+1])] += 1    # bigram counts
            for j in xrange(len(sentence) - 2):
                self.gram3[(sentence[j], sentence[j+1], sentence[j+2])] += 1    # trigram counts
            if time.time() - lastTime >= 4.0:    # progress report roughly every 4 seconds
                lastTime = time.time()
                print '[info] processed %.2f%%' % (100.0 * i / total)

        print '[info] finished training'
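The unigram, bigram, and trigram counts accumulated above are the raw material of an n-gram language model. Below is a minimal sketch of how such counts could be turned into an add-k smoothed trigram log-probability for a sentence of word IDs; this scoring helper is an illustrative assumption and is not taken from the example's class.

import math

def scoreSentence(sentence, gram1, gram2, gram3, k=1.0):
    # Illustrative only: add-k smoothed trigram log-probability over word IDs.
    vocabSize = max(len(gram1), 1)
    logProb = 0.0
    for i in range(2, len(sentence)):
        w1, w2, w3 = sentence[i - 2], sentence[i - 1], sentence[i]
        numerator = gram3.get((w1, w2, w3), 0) + k
        denominator = gram2.get((w1, w2), 0) + k * vocabSize
        logProb += math.log(numerator / denominator)
    return logProb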
Example #3
def main():
    parser = argparse.ArgumentParser(
        description='spelling correctors evaluation')
    parser.add_argument('file',
                        type=str,
                        help='text file to use for evaluation')
    parser.add_argument('-hs',
                        '--hunspell',
                        type=str,
                        help='path to hunspell model')
    parser.add_argument('-ns',
                        '--norvig',
                        type=str,
                        help='path to train file for Norvig spell corrector')
    parser.add_argument('-cs',
                        '--context',
                        type=str,
                        help='path to context spell model')
    parser.add_argument('-csp',
                        '--context_prototype',
                        type=str,
                        help='path to context spell prototype model')
    parser.add_argument('-jsp',
                        '--jamspell',
                        type=str,
                        help='path to jamspell model file')
    parser.add_argument('-t', '--test', action="store_true")
    parser.add_argument('-mx',
                        '--max_words',
                        type=int,
                        help='max words to evaluate')
    parser.add_argument('-a', '--alphabet', type=str, help='alphabet file')
    args = parser.parse_args()

    if args.alphabet:
        utils.loadAlphabet(args.alphabet)

    correctors = {
        'dummy': DummyCorrector(),
    }
    # corrector = correctors['dummy']

    maxWords = args.max_words

    print('[info] loading models')

    if args.hunspell:
        corrector = correctors['hunspell'] = HunspellCorrector(args.hunspell)

    if args.norvig:
        corrector = correctors['norvig'] = NorvigCorrector(args.norvig)

    if args.context:
        corrector = correctors['context'] = ContextCorrector(args.context)

    if args.context_prototype:
        corrector = correctors['prototype'] = ContextPrototypeCorrector(
            args.context_prototype)

    if args.jamspell:
        corrector = correctors['jamspell'] = JamspellCorrector(args.jamspell)

    if args.test:
        return testMode(corrector)

    random.seed(42)
    print('[info] loading text')
    originalText = loadText(args.file)
    originalTextLen = len(list(originalText))

    print('[info] generating typos')
    # randomly corrupt words in the original text; returns the text as a list of individual words
    erroredText = generateTypos(originalText)
    erroredTextLen = len(list(erroredText))

    assert originalTextLen == erroredTextLen
    # split the text into sentences, stripping invalid symbols (periods are not included)
    originalSentences = generateSentences(originalText)
    erroredSentences = generateSentences(erroredText)

    assert len(originalSentences) == len(erroredSentences)

    # for s in originalSentences[:50]:
    #    print ' '.join(s) + '.'

    print('[info] total words: %d' % len(originalText))
    print('[info] evaluating')

    results = {}

    for correctorName, corrector in correctors.items():
        errorsRate, fixRate, broken, topNerr, topNfix, execTime = \
            evaluateCorrector(correctorName, corrector, originalSentences, erroredSentences, maxWords)
        results[
            correctorName] = errorsRate, fixRate, broken, topNerr, topNfix, execTime

    print('')

    print('[info] %12s %8s  %8s  %8s  %8s  %8s  %8s' %
          ('', 'errRate', 'fixRate', 'broken', 'topNerr', 'topNfix', 'time'))
    # print each corrector's results, sorted by its result tuple
    # (the lambda keys the sort on each corrector's (errRate, fixRate, ...) tuple)
    for k, _ in sorted(results.items(), key=lambda x: x[1]):
        print('[info] %10s  %8.2f%% %8.2f%% %8.2f%% %8.2f%% %8.2f%% %8.2fs' % \
              (k,
               100.0 * results[k][0],
               100.0 * results[k][1],
               100.0 * results[k][2],
               100.0 * results[k][3],
               100.0 * results[k][4],
               results[k][5]))
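The script always registers a 'dummy' baseline corrector. A minimal sketch of what such a no-op baseline could look like is shown below; the method name and signature are assumptions about the interface evaluateCorrector expects, not code from the original repository.

class DummyCorrector(object):
    # Illustrative baseline: proposes each word unchanged, so its fixRate
    # should be ~0 and it should not break correct words. The method name
    # getCandidates is an assumed interface, not the repository's actual API.
    def getCandidates(self, sentence, position):
        return [sentence[position]]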
Example #4
def main(argv):
    # default values
    #path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
    path = ''
    outputfile = ''
    startfrom = 1
    numsyms = 100
    generateonly = False

    try:
        opts, args = getopt.getopt(argv, "hi:o:s:g:x", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(
            'program.py -i <inputfile> -o <outputfile> -g <numchars> -s <startfrom>'
        )
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('-' * 50)
            print('\n')
            print(
                'test.py -i <inputfile> -o <outputfile> -g <numchars> -s <startfrom>\n'
            )
            print('-i <inputfile>\t text file to train the network')
            print(
                '-g <numchars>\t number of characters to generate in generate-only mode'
            )
            print(
                '-s <startfrom>\t training epoch to start from (file must exist)'
            )
            print('\n\n')
            print('-' * 50)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            path = arg
        elif opt in ("-g", "--generate"):
            generateonly = True
            numsyms = int(arg)
        #elif opt in ("-x", "--generateonly"):
        #    generateonly = True
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        elif opt in ("-s", "--startepo"):
            startfrom = int(arg)

    print('Input file is %s' % path)
    print('Output file is %s' % outputfile)
    print('Starting from epoch %s' % startfrom)
    print('Generative model only %s ' % generateonly)
    print('Symbols to generate %d ' % numsyms)

    maxlen = 5

    # load data and build vocab
    voc = ut.loadText(path, bychar=False, lower=True)
    syms = set(voc[0])  # voc = (text, sym_indices, indices_sym)

    X, y = ut.buildSkipgram(voc, maxlen=20, step=5)
    #X,y = ut.buildTrainingSequences(voc, maxlen=maxlen, step=1)
    print('input shape (%s, %s)' % (X.shape, y.shape))

    vocsize = len(syms)
    emb_dims = 128

    # build the model: 2 stacked LSTM
    print('Build model...')
    model = Sequential()
    layers = [maxlen, 100, 512, 1]  # (X,y) from buildSkipgram
    #layers = [maxlen, 100, 512, vocsize]   # (X,y) from buildSequences

    model.add(Embedding(vocsize, emb_dims))

    #model.add(LSTM(input_dim=layers[0], output_dim=layers[1], return_sequences=True))  # with next lstm layer
    model.add(
        LSTM(input_dim=layers[0], output_dim=layers[1],
             return_sequences=False))  # with no more layers
    model.add(Dropout(0.5))

    #model.add(LSTM(layers[2], return_sequences=False))
    #model.add(Dropout(0.2))

    model.add(Dense(output_dim=layers[3]))  # for skipgram
    model.add(Activation('sigmoid'))  # for skipgram
    #model.add(Activation('softmax'))         # for sequences

    #model.compile(loss='categorical_crossentropy', optimizer='rmsprop')  # buildSequences
    model.compile(loss='mse', optimizer='rmsprop')  # buildSkipgram

    if generateonly:
        print('Loading model from epoch %d' % (startfrom))
        model.load_weights('results/lstm_word_based_epo_%d' % (startfrom))

        yn = 'y'  # default answer

        while (yn == 'y'):
            #ut.generate(model, voc, numchars=numsyms)
            ut.generateByWord(model, voc, numwords=numsyms)
            yn = ''
            while (yn not in ['y', 'n']):
                print("Generate more? [Y/n]: ")
                yn = str(raw_input()).lower()
    else:  # not generate-only mode

        # train the model, output generated text after each iteration
        for iteration in range(startfrom, 400, 10):
            print()
            print('-' * 50)
            print('Starting from epoch %d' % iteration)

            if iteration >= 2:
                print('-' * 50)
                print('Loading model from epoch %d' % (iteration - 1))
                model.load_weights('results/lstm_emb_word_based_epo_%d' %
                                   (iteration - 1))

            model.fit(X, y, batch_size=128, nb_epoch=10)
            model.save_weights('results/lstm_emb_word_based_epo_%d' %
                               (10 + iteration - 1),
                               overwrite=True)
            print('Extracting embeddings')
            emb = model.layers[0]
            embeddings = emb.W.get_value()
            print('embeddings shape', embeddings.shape)
            print('Saving embeddings and vocabulary for t-SNE')
            np.save('results/lstm_embeddings', embeddings)
            #np.save('results/vocab_embeddings', voc[1])
            ut.saveStuff(voc[1], 'results/vocab_embeddings')
from keras.datasets.data_utils import get_file

import numpy as np
import random
import sys
import getopt
import os.path

import utils as ut

path = 'data/alice.txt'

maxlen = 20

# load data and build vocab
voc = ut.loadText(path, bychar=True)
syms = set(voc[0])
X, y = ut.buildTrainingSet(voc, maxlen=maxlen, step=3)

# build the model: 2 stacked LSTM
print('Build model...')
model = Sequential()
#model.add(Embedding(len(syms), 128))
model.add(LSTM(512, return_sequences=True, input_shape=(maxlen, len(syms))))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(len(syms)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
def main(argv):
    # default values
    #path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
    path = ''
    outputfile = ''  
    startfrom = 1
    numsyms=100
    generateonly = False

    try:
        opts, args = getopt.getopt(argv,"hi:o:s:g:x",["ifile=","ofile="])
    except getopt.GetoptError:
        print('program.py -i <inputfile> -o <outputfile> -g <numchars> -s <startfrom>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('-'*50)
            print('\n')
            print('test.py -i <inputfile> -o <outputfile> -g <numchars> -s <startfrom>\n')
            print('-i <inputfile>\t text file to train the network')
            print('-g <numchars>\t number of characters to generate in generate-only mode')
            print('-s <startfrom>\t training epoch to start from (file must exist)')
            print('\n\n')
            print('-'*50)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            path = arg
        elif opt in ("-g", "--generate"):
            generateonly = True
            numsyms = int(arg)
        #elif opt in ("-x", "--generateonly"):
        #    generateonly = True
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        elif opt in ("-s", "--startepo"):
            startfrom = int(arg)
        
    print('Input file is %s'          %path)
    print('Output file is %s'         %outputfile)
    print('Starting from epoch %s'    %startfrom)
    print('Generative model only %s ' %generateonly)
    print('Symbols to generate %d '%numsyms)

    maxlen = 20  # alice 
    
    # load data and build vocab
    voc = ut.loadText(path, bychar=True, lower=True)
    syms = set(voc[0])      # voc = (text, sym_indices, indices_sym) 
    X,y = ut.buildTrainingSet(voc, maxlen=maxlen, step=3) 
            
    
    
    # build the model: 2 stacked LSTM
    print('Build model...')
    model = Sequential()
    #model.add(Embedding(len(syms), 128))
    model.add(LSTM(512, return_sequences=True, input_shape=(maxlen, len(syms))))
    model.add(Dropout(0.2))
    model.add(LSTM(512, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(len(syms)))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

    if generateonly:
        print('Loading model from epoch %d' %(startfrom))
        #model.load_weights('results/lstm_word_based_epo_%d'%(startfrom))
        model.load_weights('results/lstm_char_based/tweet_lstm_char_based_epo_%d'%(startfrom))
        
        yn = 'y'   # default answer

        while(yn == 'y'):
            ut.generate(model, voc, numchars=numsyms)    
            #ut.generateByWord(model, voc, numwords=numsyms)    
            yn = ''
            while(yn not in ['y', 'n']):
                print("Generate more? [Y/n]: ")
                yn = str(raw_input()).lower()
    else:  # not generate-only mode 
    
        # train the model, output generated text after each iteration
        for iteration in range(startfrom, 400):
            print()
            print('-' * 50)
            print('Starting from epoch %d' %iteration)
            
            if iteration >= 2:
                print('-' * 50)
                print('Loading model from epoch %d' %(iteration-1))
                model.load_weights('results/lstm_char_based/tweet_lstm_char_based_epo_%d'%(iteration-1))
                
            model.fit(X, y, batch_size=128, nb_epoch=1)
            model.save_weights('results/lstm_char_based/tweet_lstm_char_based_epo_%d'%iteration, overwrite=True)
            # are we learning well? let's print some
            ut.generate(model, voc, numchars=42)
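Both character-level snippets depend on ut.buildTrainingSet to turn the corpus into fixed-length one-hot windows matching the LSTM's input_shape=(maxlen, len(syms)). The sketch below shows one common way to build such windows for next-character prediction; it is an assumption made for illustration, not the repository's utils implementation.

import numpy as np

def buildCharWindows(text, sym_indices, maxlen=20, step=3):
    # Illustrative only: slide a maxlen-character window over the text,
    # one-hot encode each window as X and the following character as y.
    windows, nextChars = [], []
    for i in range(0, len(text) - maxlen, step):
        windows.append(text[i:i + maxlen])
        nextChars.append(text[i + maxlen])
    X = np.zeros((len(windows), maxlen, len(sym_indices)), dtype=bool)
    y = np.zeros((len(windows), len(sym_indices)), dtype=bool)
    for i, window in enumerate(windows):
        for t, ch in enumerate(window):
            X[i, t, sym_indices[ch]] = 1
        y[i, sym_indices[nextChars[i]]] = 1
    return X, y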