Пример #1
0
def train(X, Y, valX=None, valY=None, testX=None, testY=None):
    '''
    train()

    Train a CRF with pycrfsuite and score it on the data that was supplied.

    @param X.     Training features; passed to formatFeatures() and predict().
    @param Y.     Training labels aligned with X.
    @param valX.  Optional held-out dev features; scored only when truthy.
    @param valY.  Labels for valX (only read when valX is truthy).
    @param testX. Optional test features; scored only when truthy.
    @param testY. Labels for testX (only read when testX is truthy).

    @return A (model, scores) tuple. model is the raw bytes of the model file
            written by the trainer. scores is a dict with a 'train' entry,
            optional 'dev' and 'test' entries, and a 'hyperparams' dict that
            maps each name from enabledModules() to a bool.
    '''

    # Format the features and feed every labeled sequence to the trainer.
    feats = formatFeatures(X, Y)
    trainer = pycrfsuite.Trainer(verbose=False)
    for xseq, yseq in pycrfInstances(feats, labeled=True):
        trainer.append(xseq, yseq)

    # The trainer writes to a path, not to a descriptor, so the descriptor
    # returned by mkstemp is released right away and only the path is kept.
    os_handle, tmp_file = tempfile.mkstemp(dir=tmp_dir, suffix="crf_temp")
    os.close(os_handle)
    try:
        trainer.train(tmp_file)

        # Read the trained model back as bytes (so it can be pickled).
        with open(tmp_file, 'rb') as f:
            model = f.read()
    finally:
        # Always remove the temporary file, even when training or reading fails.
        os.remove(tmp_file)

    # Information about how well the model fits each data split.
    scores = {}
    train_pred = predict(model, X)
    scores['train'] = compute_performance_stats('train', train_pred, Y)

    if valX:
        val_pred = predict(model, valX)
        scores['dev'] = compute_performance_stats('dev', val_pred, valY)

    if testX:
        test_pred = predict(model, testX)
        scores['test'] = compute_performance_stats('test', test_pred, testY)

    # Record which external modules were enabled when this model was built.
    scores['hyperparams'] = {
        module: bool(enabled)
        for module, enabled in enabledModules().items()
    }

    return model, scores
Пример #2
0
def train(X, Y, val_X=None, val_Y=None, test_X=None, test_Y=None):
    '''
    train()
    Train a Conditional Random Field for sequence tagging.

    @param X.      List of sparse-matrix sequences. Each sequence is one sentence.
    @param Y.      List of sequence tags. Each sequence is the sentence's per-token tags.
    @param val_X.  More X data, but a heldout dev set. Scored only when truthy.
    @param val_Y.  More Y data, but a heldout dev set.
    @param test_X. More X data, but a heldout test set. Scored only when truthy.
    @param test_Y. More Y data, but a heldout test set.
    @return A (model, scores) tuple. model is the raw bytes of the trained
            model file. scores is a dict with a 'train' entry, optional 'dev'
            and 'test' entries, and a 'hyperparams' dict that maps each name
            from enabled_modules() to a bool.
    '''

    # Format features for crfsuite
    feats = format_features(X, Y)

    # Create a Trainer object and give it every labeled sequence.
    trainer = pycrfsuite.Trainer(verbose=False)
    for xseq, yseq in pycrf_instances(feats, labeled=True):
        trainer.append(xseq, yseq)

    # Train the model into a temporary file. The platform's default temporary
    # directory is used instead of a hard-coded '/tmp' so this also works where
    # '/tmp' does not exist. The trainer writes to the path, so the descriptor
    # from mkstemp is released immediately.
    os_handle, tmp_file = tempfile.mkstemp(suffix="crf_temp")
    os.close(os_handle)
    try:
        trainer.train(tmp_file)

        # Read the trained model into a byte string (so it can be pickled)
        with open(tmp_file, 'rb') as f:
            model = f.read()
    finally:
        # Remove the temporary file even if training or reading failed
        os.remove(tmp_file)

    ######################################################################

    # information about fitting the model
    scores = {}

    # how well does the model fit the training data?
    train_pred = predict(model, X)
    scores['train'] = compute_performance_stats('train', train_pred, Y)

    if val_X:
        val_pred = predict(model, val_X)
        scores['dev'] = compute_performance_stats('dev', val_pred, val_Y)

    if test_X:
        test_pred = predict(model, test_X)
        scores['test'] = compute_performance_stats('test', test_pred, test_Y)

    # keep track of which external modules were used for building this model!
    scores['hyperparams'] = {
        module: bool(enabled)
        for module, enabled in enabled_modules().items()
    }

    return model, scores
Пример #3
0
def train(X, Y, val_X=None, val_Y=None, test_X=None, test_Y=None):
    '''
    train()
    Train a Conditional Random Field for sequence tagging.

    @param X.      List of sparse-matrix sequences. Each sequence is one sentence.
    @param Y.      List of sequence tags. Each sequence is the sentence's per-token tags.
    @param val_X.  More X data, but a heldout dev set. Scored only when truthy.
    @param val_Y.  More Y data, but a heldout dev set.
    @param test_X. More X data, but a heldout test set. Scored only when truthy.
    @param test_Y. More Y data, but a heldout test set.
    @return A (model, scores) tuple. model is the raw bytes of the trained
            model file. scores is a dict with a 'train' entry, optional 'dev'
            and 'test' entries, and a 'hyperparams' dict that maps each name
            from enabled_modules() to a bool.
    '''

    # Format features for crfsuite
    feats = format_features(X, Y)

    # Create a Trainer object.
    trainer = pycrfsuite.Trainer(verbose=False)

    # Pick the training algorithm.
    # Available: 'lbfgs', 'l2sgd', 'ap', 'pa', 'arow'.
    # With 'pa', trainer.get_params() was observed to report: feature.minfreq,
    # feature.possible_states, feature.possible_transitions, type, c,
    # error_sensitive, averaging, max_iterations, epsilon.
    trainer.select(algorithm='pa')

    for xseq, yseq in pycrf_instances(feats, labeled=True):
        trainer.append(xseq, yseq)

    # Train the model into a temporary file created under tmp_dir. The trainer
    # writes to the path, so the descriptor from mkstemp is released immediately.
    os_handle, tmp_file = tempfile.mkstemp(dir=tmp_dir, suffix="crf_temp")
    os.close(os_handle)
    try:
        trainer.train(tmp_file)

        # Read the trained model into a byte string (so it can be pickled)
        with open(tmp_file, 'rb') as f:
            model = f.read()
    finally:
        # Remove the temporary file even if training or reading failed
        os.remove(tmp_file)

    ######################################################################
    # information about fitting the model
    scores = {}

    # how well does the model fit the training data?
    train_pred = predict(model, X)
    scores['train'] = compute_performance_stats('train', train_pred, Y)

    if val_X:
        val_pred = predict(model, val_X)
        scores['dev'] = compute_performance_stats('dev', val_pred, val_Y)

    if test_X:
        test_pred = predict(model, test_X)
        scores['test'] = compute_performance_stats('test', test_pred, test_Y)

    # keep track of which external modules were used for building this model!
    scores['hyperparams'] = {
        module: bool(enabled)
        for module, enabled in enabled_modules().items()
    }

    return model, scores
Пример #4
0
def train(X, Y, val_X=None, val_Y=None):
    '''
    train()

    Train a Conditional Random Field for sequence tagging.

    @param X.     List of sparse-matrix sequences. Each sequence is one sentence.
    @param Y.     List of sequence tags. Each sequence is the sentence's per-token tags.
    @param val_X. More X data, but a heldout dev set. Scored only when truthy.
    @param val_Y. More Y data, but a heldout dev set.

    @return A (model, scores) tuple. model is the raw bytes of the trained
            model file. scores is a dict with a 'train' entry, an optional
            'dev' entry, and a 'hyperparams' dict that maps each name from
            enabled_modules() to a bool.
    '''

    # Format features for crfsuite
    feats = format_features(X, Y)

    # Create a Trainer object and give it every labeled sequence.
    trainer = pycrfsuite.Trainer(verbose=False)
    for xseq, yseq in pycrf_instances(feats, labeled=True):
        trainer.append(xseq, yseq)

    # Train the model into a temporary file. The platform's default temporary
    # directory is used instead of a hard-coded '/tmp' so this also works where
    # '/tmp' does not exist. The trainer writes to the path, so the descriptor
    # from mkstemp is released immediately.
    os_handle, tmp_file = tempfile.mkstemp(suffix="crf_temp")
    os.close(os_handle)
    try:
        trainer.train(tmp_file)

        # Read the trained model into a byte string (so it can be pickled).
        # The model file is binary, so it must be opened in 'rb' mode: text
        # mode would attempt to decode it and fail or corrupt the data.
        with open(tmp_file, 'rb') as f:
            model = f.read()
    finally:
        # Remove the temporary file even if training or reading failed
        os.remove(tmp_file)

    ######################################################################

    # information about fitting the model
    scores = {}

    # how well does the model fit the training data?
    train_pred = predict(model, X)
    scores['train'] = compute_performance_stats('train', train_pred, Y)

    if val_X:
        val_pred = predict(model, val_X)
        scores['dev'] = compute_performance_stats('dev', val_pred, val_Y)

    # keep track of which external modules were used for building this model!
    scores['hyperparams'] = {
        module: bool(enabled)
        for module, enabled in enabled_modules().items()
    }

    return model, scores