Example #1
import spacy
from spacy import displacy

nlp = spacy.load('en_core_web_sm')  # the bare 'en' shortcut is deprecated in recent spaCy versions

# Open the file and join its lines into a single string
with open('../questions.txt') as fp:
    document_string = ' '.join(fp)

# Now pass the string through nlp() to create a Doc object for further processing
skip_and_print('Working with string: "%s"' % document_string)

doc = nlp(document_string)

# Find noun chunks
# ~~~~~~~~~~~~~~~~

skip_and_print('All the found noun chunks & some properties:')

rows = [['Chunk', '.root', 'root.dep_', '.root.head']]
for chunk in doc.noun_chunks:
    rows.append([
        chunk,            # A Span object with the full phrase.
        chunk.root,       # The key Token within this phrase.
        chunk.root.dep_,  # The grammatical role of this phrase.
        chunk.root.head   # The grammatical parent Token.
    ])
print_table(rows, padding=4)
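
These examples call small helpers, skip_and_print() and print_table(), that are not defined here (Example #2 imports print_table from print_utils). A minimal sketch of what they might look like, matching the calls used in this example; these are assumptions, not the original helpers:

# Hypothetical stand-ins for the helpers used above (assumed, not the
# original print_utils implementations).
def skip_and_print(msg):
    # Print a blank separator line, then the message.
    print('\n' + msg)

def print_table(rows, padding=2):
    # Left-align every column to the width of its longest cell.
    cells = [[str(cell) for cell in row] for row in rows]
    widths = [max(len(row[i]) for row in cells) for i in range(len(cells[0]))]
    for row in cells:
        print(''.join(cell.ljust(width + padding)
                      for cell, width in zip(row, widths)))

Note that Example #2 below calls a two-argument variant, print_table(columns, rows); the sketch above only matches the single-rows calls used here and in Examples #3 and #4.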

Example #2
def main():
    import time
    import numpy as np
    from load_mnist import load_mnist, filter_dataset
    from image_utils import plot_flat_colorimage, plot_top_influence_colorimage
    from print_utils import print_table
    # grad_logloss_theta_lr and inverse_hvp_lr_newtonCG are assumed to be
    # defined elsewhere in this module (see the sketch after this example).
    # load the raw MNIST dataset
    x_train,y_train,x_test,y_test = load_mnist()
    # keep only the digits 1 and 7 (binary classification)
    pos_class = 1
    neg_class = 7
    num_class = 2
    test_indices = 20   # index of the single test point whose prediction we explain
    x_train,y_train = filter_dataset(x_train,y_train,pos_class,neg_class)
    x_test,y_test = filter_dataset(x_test,y_test,pos_class,neg_class)

    # train logistic regression with L-BFGS
    from sklearn import linear_model
    max_iter = 1000
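    # sklearn's C is the inverse regularization strength: choosing
    # C = 1 / (n_samples * lambda) makes the objective equivalent to the mean
    # log loss plus (lambda / 2) * ||w||^2, here with lambda = 0.01.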
    C = 1.0 / (x_train.shape[0] * 0.01)
    sklearn_model = linear_model.LogisticRegression(
        C=C,
        tol=1e-8,
        fit_intercept=False,
        solver="lbfgs",
        multi_class="auto",
        warm_start=True,
        max_iter=max_iter,
    )

    sklearn_model.fit(x_train,y_train)

    # gradient of the chosen test point's loss w.r.t. the model parameters
    test_pred = sklearn_model.predict_proba(x_test[test_indices].reshape(1,-1))[:,1]
    test_grad_loss_val = grad_logloss_theta_lr(y_test[test_indices],test_pred,x_test[test_indices].reshape(1,-1))
    print("test grad loss norm:",np.linalg.norm(test_grad_loss_val))

    y_pred = sklearn_model.predict_proba(x_train)[:,1]

    "Get inverse hvp"
    # inverse_hvp = inverse_hvp_lissa(x_train,y_train,y_pred,test_grad_loss_val,100,5,200)
    inverse_hvp = inverse_hvp_lr_newtonCG(x_train,y_train,y_pred,test_grad_loss_val,0.01,False)

    start_time = time.time()
    num_tr_sample = x_train.shape[0]
    train_idx = np.arange(num_tr_sample)

    predicted_loss_diff = []
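    # For each training point z_i, approximate how the test loss would change
    # if z_i were up-weighted: dot the precomputed inverse-HVP (s_test) with
    # the gradient of z_i's loss, scaled by 1/n.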
    for idx in range(num_tr_sample):
        train_grad_loss_val = grad_logloss_theta_lr(y_train[idx],y_pred[idx],x_train[idx].reshape(1,-1))
        predicted_loss_diff.append(
            np.dot(inverse_hvp, train_grad_loss_val) / num_tr_sample
            )
    predicted_loss_diffs = np.asarray(predicted_loss_diff)
    duration = time.time() - start_time
    print("Multiplying by {} train examples took {:.1f} sec".format(num_tr_sample, duration))
    print("Attribute predicted_loss_diffs, mean {}, max {}, min {}".format(
        predicted_loss_diffs.mean(), predicted_loss_diffs.max(), predicted_loss_diffs.min())
    )
    print("Test image:")
    print(y_test[test_indices])
    plot_flat_colorimage(x_test[test_indices],y_test[test_indices],28)

    print("Top from predicted influence:")
    plot_top_influence_colorimage(x_train,y_train,predicted_loss_diffs,top_n=5,ascending=True)
    print("Top harmful from predicted influence:")
    plot_top_influence_colorimage(x_train,y_train,predicted_loss_diffs,top_n=5,ascending=False)
    columns = ["idx","label","influence"]
    rows = []
    for counter,train_idx in enumerate(np.argsort(predicted_loss_diffs)[-5:]):
        rows.append([train_idx,y_train[train_idx],predicted_loss_diffs[train_idx]])

    print_table(columns,rows)
    return
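
The code above relies on grad_logloss_theta_lr and inverse_hvp_lr_newtonCG, which are not shown. A minimal sketch of what the gradient helper plausibly computes, assuming labels in {0, 1}, predictions as probabilities of the positive class, and a logistic regression without intercept; this is an assumption, not the original implementation:

import numpy as np

def grad_logloss_theta_lr(label, pred, x):
    # Gradient of the binary log loss w.r.t. theta, where pred = sigmoid(x @ theta):
    #   d/dtheta [-y*log(p) - (1 - y)*log(1 - p)] = x^T (p - y)
    label = np.atleast_1d(label).astype(float)
    pred = np.atleast_1d(pred).astype(float)
    x = np.asarray(x).reshape(len(pred), -1)
    return x.T.dot(pred - label)

inverse_hvp_lr_newtonCG is assumed to solve H s = grad_test for the regularized Hessian H of the training loss (e.g. with a Newton-CG solver); in the call above, 0.01 presumably plays the role of the L2 regularization strength and False a verbosity flag.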
Example #3
# (reuses the nlp model and document_string set up in Example #1)
skip_and_print('Working with string: "%s"' % document_string)
doc = nlp(document_string)

# Finding named entities.
# ~~~~~~~~~~~~~~~~~~~~~~~

rows = [['Name', 'Start', 'End', 'Label']]

# Each `ent` object is an instance of the `Span` class.
for ent in doc.ents:
    rows.append([
        ent.text,  # The str of the named entity phrase.
        ent.start_char,  # Source str index of the first char.
        ent.end_char,  # Source str index of the last+1 char.
        ent.label_  # A str label for the entity type.
    ])

skip_and_print('Named entities found:')
print_table(rows)

# Named entities found:
#
# Name           Start End Label
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

for ent in doc.ents:
    skip_and_print('Recovering "%s":' % ent)
    print(document_string)
    print(' ' * ent.start_char + '^' * len(ent.text))
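
Example #1 imports displacy but never uses it; the entities found above can also be visualized with spaCy's built-in displaCy renderer. A minimal usage sketch (the output file name is only illustrative):

from spacy import displacy

# Render the entity spans as a standalone HTML page; style='ent' highlights named entities.
html = displacy.render(doc, style='ent', page=True)
with open('entities.html', 'w', encoding='utf-8') as out:  # hypothetical output path
    out.write(html)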
Example #4
# Let's print a table of token texts, POS tags, dependency labels, and
# each token's syntactic head with its POS tag:

skip_and_print('Printing all POS')
rows = [['Text', 'POS', 'Dep', 'Head', 'Head POS']]
for token in doc:
    rows.append([
        token.text,       # Token str without surrounding whitespace.
        token.pos_,       # Coarse-grained part-of-speech tag.
        token.dep_,       # Dependency label linking the token to its head.
        token.head.text,  # Text of the token's syntactic head.
        token.head.pos_   # POS tag of the head.
    ])
print_table(rows)

skip_and_print('Printing all proper nouns')
nodeRows = [['Text']]
seen = set()
for token in doc:
    # Collect each unique proper noun (PROPN) as a table row.
    if token.pos_ == 'PROPN' and token.text not in seen:
        seen.add(token.text)
        nodeRows.append([token.text])

print_table(nodeRows)
Not found relevant for CSV parsing.