예제 #1
0
def main(args):
    """Run a trained model over labelled test data and dump the results.

    Args:
        args: Positional command-line arguments.
            args[0]: path to the UTF-8 test file; each line holds
                comma-separated labels, one TAB, then the text.
            args[1]: model directory containing the ``.npz`` parameters,
                ``dict.pkl`` and ``label_dict.pkl``.
            args[2]: existing directory that receives the output files.
            args[3]: optional integer N; loads ``model_N.npz`` instead of
                ``best_model.npz``.

    Writes ``data.pkl``, ``predictions.npy``, ``embeddings.npy`` and
    ``targets.pkl`` into ``args[2]``.

    Raises:
        IndexError: if fewer than three arguments are given.
        ValueError: if ``args[3]`` is not an integer, or a data line does
            not split into exactly two TAB-separated fields.
    """
    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    # None selects best_model.npz; an integer selects model_<N>.npz.
    m_num = int(args[3]) if len(args) > 3 else None

    print("Preparing Data...")
    # Test data: texts in Xt, their label lists in yt (same order).
    Xt = []
    yt = []
    with io.open(data_path, "r", encoding="utf-8") as f:
        for line in f:
            (yc, Xc) = line.rstrip("\n").split("\t")
            Xt.append(Xc)
            yt.append(yc.split(","))

    # Model
    print("Loading model params...")
    if m_num is not None:
        param_file = "%s/model_%d.npz" % (model_path, m_num)
    else:
        param_file = "%s/best_model.npz" % model_path
    # Single-argument print() behaves identically on Python 2 and 3.
    print("Loading %s" % param_file)
    params = load_params(param_file)

    print("Loading dictionaries...")
    # NOTE: unpickling can execute arbitrary code; only use trusted
    # model directories.
    with open("%s/dict.pkl" % model_path, "rb") as f:
        chardict = pkl.load(f)
    with open("%s/label_dict.pkl" % model_path, "rb") as f:
        labeldict = pkl.load(f)
    # Dictionary size plus one, capped by the module-level limits
    # (the extra slot is presumably a reserved index -- confirm).
    n_char = min(len(chardict) + 1, N_WORD)
    n_classes = min(len(labeldict) + 1, MAX_CLASSES)

    # Batch iterator; each step yields (raw texts, targets).
    test_iter = batch.BatchTweets(
        Xt, yt, labeldict,
        batch_size=N_BATCH, max_classes=MAX_CLASSES, test=True)

    print("Building network...")
    # Symbolic inputs: the encoded text tensor and its mask.
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # Network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    # Theano functions
    print("Compiling theano functions...")
    predict = theano.function([tweet, t_mask], predictions)
    encode = theano.function([tweet, t_mask], embeddings)

    # Test
    print("Testing...")
    out_data = []
    out_pred = []
    out_emb = []
    out_target = []
    for xr, y in test_iter:
        x, x_m = batch.prepare_data(xr, chardict, n_tokens=n_char)
        p = predict(x, x_m)
        e = encode(x, x_m)
        # argsort is ascending; reversing the columns puts the
        # highest-scoring class index first in every row.
        ranks = np.argsort(p)[:, ::-1]

        for idx, item in enumerate(xr):
            out_data.append(item)
            out_pred.append(ranks[idx, :])
            out_emb.append(e[idx, :])
            out_target.append(y[idx])

    # Save
    print("Saving...")
    # pickle and np.save emit bytes, so every handle is opened in binary
    # mode; text mode fails on Python 3 and can corrupt data on Windows.
    with open("%s/data.pkl" % save_path, "wb") as f:
        pkl.dump(out_data, f)
    with open("%s/predictions.npy" % save_path, "wb") as f:
        np.save(f, np.asarray(out_pred))
    with open("%s/embeddings.npy" % save_path, "wb") as f:
        np.save(f, np.asarray(out_emb))
    with open("%s/targets.pkl" % save_path, "wb") as f:
        pkl.dump(out_target, f)
예제 #2
0
def main(args):
    """Predict tags and compute embeddings for unlabelled text lines.

    Args:
        args: Positional command-line arguments.
            args[0]: path to a UTF-8 text file with one input per line.
            args[1]: model directory containing the ``.npz`` parameters,
                ``dict.pkl`` and ``label_dict.pkl``.
            args[2]: existing directory that receives the output files.
            args[3]: optional integer N; loads ``model_N.npz`` instead of
                ``best_model.npz``.

    Writes ``predicted_tags.txt`` (the five top-ranked labels per input,
    space separated, one input per line) and ``embeddings.npy`` into
    ``args[2]``.

    Raises:
        IndexError: if fewer than three arguments are given.
        ValueError: if ``args[3]`` is not an integer.
    """
    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    # None selects best_model.npz; an integer selects model_<N>.npz.
    m_num = int(args[3]) if len(args) > 3 else None

    print("Preparing Data...")
    # Test data
    with io.open(data_path, 'r', encoding='utf-8') as f:
        Xt = [line.rstrip('\n') for line in f]

    # Model
    print("Loading model params...")
    if m_num is not None:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    # NOTE: unpickling can execute arbitrary code; only use trusted
    # model directories.
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    # Dictionary size plus one, capped by the module-level limits
    # (the extra slot is presumably a reserved index -- confirm).
    n_char = min(len(chardict) + 1, N_WORD)
    n_classes = min(len(labeldict) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Symbolic inputs: the encoded text tensor and its mask.
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # Network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes,
                                       n_char)

    # Theano functions
    print("Compiling theano functions...")
    predict = theano.function([tweet, t_mask], predictions)
    encode = theano.function([tweet, t_mask], embeddings)

    # Test
    print("Encoding...")
    out_pred = []
    out_emb = []
    # Step through the inputs in N_BATCH-sized slices. Stepping the range
    # keeps the arithmetic integral on Python 3 and never schedules an
    # empty trailing batch when the input size is a multiple of N_BATCH.
    for start in range(0, len(Xt), N_BATCH):
        xr = Xt[start:start + N_BATCH]
        x, x_m = batch.prepare_data(xr, chardict, n_tokens=n_char)
        p = predict(x, x_m)
        e = encode(x, x_m)
        # argsort is ascending; reversing the columns puts the
        # highest-scoring class index first in every row.
        ranks = np.argsort(p)[:, ::-1]

        for idx, _ in enumerate(xr):
            out_pred.append(' '.join(
                [inverse_labeldict[r] for r in ranks[idx, :5]]))
            out_emb.append(e[idx, :])

    # Save
    print("Saving...")
    # Explicit UTF-8 mirrors how the input was read and avoids
    # locale-dependent encoding errors on non-ASCII labels.
    with io.open('%s/predicted_tags.txt' % save_path, 'w',
                 encoding='utf-8') as f:
        for item in out_pred:
            f.write(item + '\n')
    # np.save emits bytes, so the handle must be binary.
    with open('%s/embeddings.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_emb))
예제 #3
0
def main(args):
    """Encode raw text lines, writing top-5 predicted tags and embeddings.

    Args:
        args: Positional command-line arguments.
            args[0]: path to a UTF-8 text file with one input per line.
            args[1]: model directory containing the ``.npz`` parameters,
                ``dict.pkl`` and ``label_dict.pkl``.
            args[2]: existing directory that receives the output files.
            args[3]: optional integer N; loads ``model_N.npz`` instead of
                ``best_model.npz``.

    Output files created in ``args[2]``:
        predicted_tags.txt: one line per input holding its five
            top-ranked labels separated by spaces.
        embeddings.npy: the stacked embedding rows, in input order.

    Raises:
        IndexError: if fewer than three arguments are given.
        ValueError: if ``args[3]`` is not an integer.
    """
    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    # None selects best_model.npz; an integer selects model_<N>.npz.
    m_num = int(args[3]) if len(args) > 3 else None

    print("Preparing Data...")
    # Test data
    with io.open(data_path, 'r', encoding='utf-8') as f:
        Xt = [line.rstrip('\n') for line in f]

    # Model
    print("Loading model params...")
    if m_num is not None:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    # NOTE: unpickling can execute arbitrary code; only use trusted
    # model directories.
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    # Dictionary size plus one, capped by the module-level limits
    # (the extra slot is presumably a reserved index -- confirm).
    n_char = min(len(chardict) + 1, N_WORD)
    n_classes = min(len(labeldict) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Symbolic inputs: the encoded text tensor and its mask.
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # Network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    # Theano functions
    print("Compiling theano functions...")
    predict = theano.function([tweet, t_mask], predictions)
    encode = theano.function([tweet, t_mask], embeddings)

    # Test
    print("Encoding...")
    out_pred = []
    out_emb = []
    # A stepped range yields integer batch offsets on both Python 2 and 3
    # (true division would hand range() a float on Python 3) and skips the
    # empty final batch the old "+ 1" count produced for exact multiples.
    for start in range(0, len(Xt), N_BATCH):
        xr = Xt[start:start + N_BATCH]
        x, x_m = batch.prepare_data(xr, chardict, n_tokens=n_char)
        p = predict(x, x_m)
        e = encode(x, x_m)
        # Ascending argsort reversed column-wise: best class index first.
        ranks = np.argsort(p)[:, ::-1]

        for idx, _ in enumerate(xr):
            top_labels = [inverse_labeldict[r] for r in ranks[idx, :5]]
            out_pred.append(' '.join(top_labels))
            out_emb.append(e[idx, :])

    # Save
    print("Saving...")
    # Write tags as UTF-8 explicitly, matching the input encoding, rather
    # than relying on the platform's default text encoding.
    with io.open('%s/predicted_tags.txt' % save_path, 'w',
                 encoding='utf-8') as f:
        for item in out_pred:
            f.write(item + '\n')
    # np.save emits bytes, so the handle must be binary.
    with open('%s/embeddings.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_emb))
예제 #4
0
def main(args):
    """Score labelled test data with a trained model and save the outputs.

    Args:
        args: Positional command-line arguments.
            args[0]: path to the UTF-8 test file; each line holds
                comma-separated labels, one TAB, then the text.
            args[1]: model directory containing the ``.npz`` parameters,
                ``dict.pkl`` and ``label_dict.pkl``.
            args[2]: existing directory that receives the output files.
            args[3]: optional integer N; loads ``model_N.npz`` instead of
                ``best_model.npz``.

    Output files created in ``args[2]``:
        data.pkl: the raw input texts, in the order they were batched.
        predictions.npy: per-input class indices ranked best first.
        embeddings.npy: per-input embedding rows.
        targets.pkl: the per-input targets yielded by the batch iterator.

    Raises:
        IndexError: if fewer than three arguments are given.
        ValueError: if ``args[3]`` is not an integer, or a data line does
            not split into exactly two TAB-separated fields.
    """
    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    # None selects best_model.npz; an integer selects model_<N>.npz.
    m_num = int(args[3]) if len(args) > 3 else None

    print("Preparing Data...")
    # Test data: texts in Xt, their label lists in yt (same order).
    Xt = []
    yt = []
    with io.open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            (yc, Xc) = line.rstrip('\n').split('\t')
            Xt.append(Xc)
            yt.append(yc.split(','))

    # Model
    print("Loading model params...")
    if m_num is not None:
        param_file = '%s/model_%d.npz' % (model_path, m_num)
    else:
        param_file = '%s/best_model.npz' % model_path
    # print() with one argument is valid on Python 2 and 3 alike, unlike
    # the bare print statement.
    print('Loading %s' % param_file)
    params = load_params(param_file)

    print("Loading dictionaries...")
    # NOTE: unpickling can execute arbitrary code; only use trusted
    # model directories.
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    # Dictionary size plus one, capped by the module-level limits
    # (the extra slot is presumably a reserved index -- confirm).
    n_char = min(len(chardict) + 1, N_WORD)
    n_classes = min(len(labeldict) + 1, MAX_CLASSES)

    # Batch iterator; each step yields (raw texts, targets).
    test_iter = batch.BatchTweets(Xt,
                                  yt,
                                  labeldict,
                                  batch_size=N_BATCH,
                                  max_classes=MAX_CLASSES,
                                  test=True)

    print("Building network...")
    # Symbolic inputs: the encoded text tensor and its mask.
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # Network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes,
                                       n_char)

    # Theano functions
    print("Compiling theano functions...")
    predict = theano.function([tweet, t_mask], predictions)
    encode = theano.function([tweet, t_mask], embeddings)

    # Test
    print("Testing...")
    out_data = []
    out_pred = []
    out_emb = []
    out_target = []
    for xr, y in test_iter:
        x, x_m = batch.prepare_data(xr, chardict, n_tokens=n_char)
        p = predict(x, x_m)
        e = encode(x, x_m)
        # Ascending argsort reversed column-wise: best class index first.
        ranks = np.argsort(p)[:, ::-1]

        for idx, item in enumerate(xr):
            out_data.append(item)
            out_pred.append(ranks[idx, :])
            out_emb.append(e[idx, :])
            out_target.append(y[idx])

    # Save
    print("Saving...")
    # Binary mode is required: pickle and np.save write bytes, which a
    # text-mode handle rejects on Python 3 and may mangle on Windows.
    with open('%s/data.pkl' % save_path, 'wb') as f:
        pkl.dump(out_data, f)
    with open('%s/predictions.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_pred))
    with open('%s/embeddings.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_emb))
    with open('%s/targets.pkl' % save_path, 'wb') as f:
        pkl.dump(out_target, f)