Example #1
def decode():
    with tf.Session() as sess:

        # Load vocabularies.
        vocab_file = FLAGS.data_dir + "/vocab.pkl"
        word2id = pkl.load(open(vocab_file, "rb"))
        id2word = {v: k for (k, v) in word2id.items()}

        embeddings = embedding.Embedding(None, word2id, id2word,
                                         word2id["UNK"], word2id["PAD"],
                                         word2id["</s>"], word2id["<s>"])

        # Create model and load parameters.
        FLAGS.batch_size = 1  # We decode one sentence at a time.
        model = create_model(sess, True, len(word2id))

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            encoder_inputs, decoder_inputs, target_weights, bucket_id = utils.prepare_input_sent(
                sentence, embeddings, _buckets)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(
                sess,
                np.array([encoder_inputs]).transpose(),
                np.array([decoder_inputs]).transpose(),
                np.array([target_weights]).transpose(), bucket_id, True)
            print(utils.process_output(output_logits, embeddings))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
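Example #1 turns the returned output_logits into text with utils.process_output, which is not shown here. A minimal greedy-readout sketch of what such a helper might do, assuming output_logits is a list of per-step arrays of shape (1, vocab_size) and that decoding stops at the </s> id (both are assumptions, not confirmed by the snippet):

import numpy as np

def greedy_readout(output_logits, id2word, end_id):
    """Hypothetical helper: take the argmax token at each decoder step, stop at </s>."""
    tokens = []
    for step_logits in output_logits:  # one (1, vocab_size) array per decoder step
        token_id = int(np.argmax(step_logits, axis=1)[0])
        if token_id == end_id:
            break
        tokens.append(id2word.get(token_id, "UNK"))
    return " ".join(tokens)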
Example #2
def propagate(model=None,
              positive_seed=None,
              negative_seed=None,
              name="Unknown"):
    print("[INFO] Model name:", name)
    return (pi.random_walk(
        embedding.Embedding(model.wv.vectors, list(model.wv.vocab.keys())),
        positive_seed, negative_seed))
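propagate above wraps a seed-based random walk over word vectors. A hypothetical usage sketch, assuming a gensim Word2Vec model (gensim < 4.0, where model.wv.vocab still exists) and that pi.random_walk returns per-word scores; the model path and seed words are made up for illustration:

from gensim.models import Word2Vec

w2v = Word2Vec.load("model.bin")  # hypothetical model path
scores = propagate(model=w2v,
                   positive_seed=["good", "great"],   # example seed words
                   negative_seed=["bad", "awful"],
                   name="sentiment-demo")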
def load_data(corpus_file, word2id, max_sent=0):
    """
    Given a dataset file and word2id embeddings, read them into lists.
    :param corpus_file: path to the corpus, one tokenized sentence per line
    :param word2id: mapping from tokens to vocabulary ids
    :param max_sent: maximum number of sentences to load (0 = no limit)
    :return: feature_vectors, sentences, labels
    """
    end_id = word2id["</s>"]
    PAD_id = word2id["PAD"]
    UNK_id = word2id["UNK"]
    start_id = word2id["<s>"]
    id2word = {v: k for (k, v) in word2id.items()}
    word_embedding = embedding.Embedding(None, word2id, id2word, UNK_id,
                                         PAD_id, end_id, start_id)

    # load features and labels
    feature_vectors = []
    sentences = []
    labels = []
    with codecs.open(corpus_file, "r", "utf8", "replace") as data:
        i = 0
        for line in data:
            if i >= max_sent and max_sent > 0:
                break
            stripped = line.strip()
            tokens = stripped.split()
            vector = word_embedding.encode(tokens)
            vector.append(end_id)  # add </s> to sentence
            sentences.append(tokens)
            if i == 0:
                # no previous dialogue available
                feature_vectors.append(vector)
            else:
                # input is previous sentence
                feature_vectors.append(labels[i - 1])
            vector = [start_id] + vector  # prepend start_id for decoder inputs
            labels.append(vector)
            i += 1

    logging.info("Loaded %d sentences" % len(feature_vectors))

    return feature_vectors, sentences, labels
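load_data pairs each sentence with the previous turn's decoder sequence, so the model learns to map one utterance to the next. A self-contained sketch of just that pairing logic with toy ids (no embedding.Embedding involved):

# toy ids: <s>=0, </s>=1, PAD=2, UNK=3, real words start at 4
corpus = [[4, 5], [6], [7, 8, 9]]  # three already-encoded sentences
start_id, end_id = 0, 1

features, labels = [], []
for i, ids in enumerate(corpus):
    target = ids + [end_id]  # sentence followed by </s>
    # encoder input: previous decoder sequence, or the sentence itself for the first turn
    features.append(target if i == 0 else labels[i - 1])
    labels.append([start_id] + target)  # decoder sequence starts with <s>

print(features)  # [[4, 5, 1], [0, 4, 5, 1], [0, 6, 1]]
print(labels)    # [[0, 4, 5, 1], [0, 6, 1], [0, 7, 8, 9, 1]]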
Example #4
    def __init__(self, data, dim, tau, grid_params):
        self.pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())

        _embedding = embedding.Embedding(data)
        self.embedded = _embedding.embedding(tau=tau, m=dim)

        # transform to features and labels
        self.features = []
        self.labels = []
        for i, vector in enumerate(self.embedded):
            if (i + 1) >= len(self.embedded):
                break
            self.features.append(vector)
            self.labels.append(self.embedded[i + 1])

        self.features = np.array(self.features)
        self.labels = np.array(self.labels)

        assert isinstance(grid_params, dict), 'grid_params must be dict'
        self.grid_params = grid_params
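The constructor above turns a delay-embedded series into one-step-ahead (features, labels) pairs. A numpy-only sketch of the same idea, with the delay embedding written out explicitly since the embedding.Embedding.embedding call itself is not shown (the series and the tau, m values are arbitrary toys):

import numpy as np

def delay_embed(series, m, tau):
    """Row t of the result is [x[t], x[t + tau], ..., x[t + (m - 1) * tau]]."""
    n = len(series) - (m - 1) * tau
    return np.column_stack([series[i * tau:i * tau + n] for i in range(m)])

x = np.sin(np.linspace(0, 20, 200))  # toy series
embedded = delay_embed(x, m=4, tau=3)

# one-step-ahead pairs, mirroring the loop in __init__
features = embedded[:-1]
labels = embedded[1:]
print(features.shape, labels.shape)  # (190, 4) (190, 4)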
def load_embedding(pkl_file):
    word2id = {}
    id2word = {}
    with codecs.open(pkl_file, "rb", "utf8", "replace") as opened:
        words, vectors = pkl.load(opened)
        assert len(words) == len(vectors)
        UNK_id = words.index("<UNK>")
        PAD_id = words.index("<PAD>")
        start_id = words.index("<S>")
        end_id = words.index("</S>")
        word2id["<s>"] = start_id
        word2id["</s>"] = end_id
        for i, w in enumerate(words):
            word2id[w] = i
            id2word[i] = w
    logging.info("Loaded embeddings for %d words with dimensionality %d" %
                 (len(words), len(vectors[0])))
    #print "Special tokens:", UNK_id, PAD_id, start_id, end_id
    emb = embedding.Embedding(vectors, word2id, id2word, UNK_id, PAD_id,
                              end_id, start_id)
    return emb
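load_embedding expects a pickle holding a (words, vectors) pair whose word list contains the special tokens <UNK>, <PAD>, <S> and </S>. A minimal sketch of writing a compatible toy file; note that the loader above reads the pickle through a codecs text wrapper, which is a Python 2 idiom, so under Python 3 both sides would use a plain binary open as below:

import pickle as pkl
import numpy as np

words = ["<UNK>", "<PAD>", "<S>", "</S>", "hello", "world"]
vectors = np.random.rand(len(words), 3).tolist()  # toy 3-dimensional embeddings

with open("toy_embeddings.pkl", "wb") as f:  # plain binary file
    pkl.dump((words, vectors), f)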
Example #6
import sys

print(sys.argv[0])
sys.path.append('../..')

import torch
import torch.backends.cudnn as cudnn
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    torch.cuda.empty_cache()
    cudnn.benchmark = True

import segsemdata
import embedding
import numpy as np

print("load model")
net = embedding.Embedding(pretrained="/data/vgg16-00b39a1b.pth")
net = net.to(device)

print("load data")
datatrain = segsemdata.makeDFC2015(datasetpath="/data/DFC2015",
                                   lod0=False,
                                   dataflag="train")
datatrain = datatrain.copyTOcache(outputresolution=50)
net.adddataset(datatrain.metadata())
net = net.to(device)
nbclasses = len(datatrain.setofcolors)
earlystopping = datatrain.getrandomtiles(1000, 128, 16)

print("train setting")
import torch.nn as nn
#!/usr/bin/env python

import torch
import numpy as np
import pandas as pd
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt

import seaborn as sns

import embedding

sns.set(style="whitegrid", color_codes=True)

ref = embedding.Embedding(gpu=False)
ref.load_vectors("output/pi.1000.txt")
ref.embedding /= ref.embedding.norm(2, 0).expand_as(ref.embedding)
dim = ref.embedding.shape[1]

method = {
    "Power Iteration": "pi",
    # "Power Iteration with Momentum": "pim"
}

l1 = {}  # First component loss
l2 = {}  # Second component loss
lw = {}  # Worst component loss

for m in method:
    it = [i + 1 for i in range(1000)]
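The second script in Example #6 (cut off above) normalizes each embedding column to unit L2 norm via norm(2, 0).expand_as(...), an older PyTorch idiom. A small sketch of the same operation with broadcasting, on a toy matrix:

import torch

emb = torch.randn(1000, 50)                      # toy embedding matrix
emb = emb / emb.norm(p=2, dim=0, keepdim=True)   # unit-norm columns via broadcasting
print(emb.norm(p=2, dim=0)[:5])                  # ~1.0 for each column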
Example #8
                    # print(x)
                # print()
                if epoch % count_loss == 0:
                    q, a, q_len, a_len = self.get_validation(input, output)
                    accuracy, result, targets = sess.run(
                        [self.accuracy, self.pred_labels_sliced,
                         self.decoder_train_targets],
                        feed_dict=self._dict(q, a, q_len, a_len))
                    print("loss		:\t", loss)
                    print("accuracy :\t", accuracy)
                    print(result)
                    print(a)
                    print(targets)
                    print()


if __name__ == "__main__":
    print(random.randint(0, 9))
    embedding = emb.Embedding(trainable=True)
    embedding.load("./tmp_vectors.txt")
    embedding.init()
    # make a seq2seq model, with embeddings loaded from tmp_vector_file
    model = Seq2Seq(embedding, 100)
    # 3 questions and answers: 1- Q: salam khoobi  => A: mersi khoobam, 2- Q: che khabar => A: salamati ....
    model.train(
        [["salam", "khoobi"], ["che", "khabar"], ["aya", "hava", "sarde"]],
        [["mersi", "khoobam"], ["salamati"], ["are", "fekr", "konam"]])
Example #9
if len(sys.argv) > 1:
    env = sys.argv[1]
else:
    env = "local"

# print(labels)
print("Total labels: ", len(config.labels))
print(config.vocabulary_size)

path = ""
if env == "local":
    path = "data/reuters/"
elif env == "server":
    path = "data/reuters/"

cnn = cn.Embedding()
# Construct model
pred = cnn.network(cnn.x, cnn.weights, cnn.biases, cnn.dropout)

# Define loss and optimizer
#cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=pred, labels=cnn.y))
#cost = tf.reduce_mean(bpmll_out_module.bp_mll(pred, cnn.y))
cost = -tf.reduce_sum(
    ((cnn.y * tf.log(pred + 1e-9)) + ((1 - cnn.y) * tf.log(1 - pred + 1e-9))),
    name='xentropy') + 0.01 * (tf.nn.l2_loss(cnn.weights['wd1']) +
                               tf.nn.l2_loss(cnn.weights['out']))
optimizer = tf.train.AdamOptimizer(
    learning_rate=cnn.learning_rate).minimize(cost)

# Evaluate model
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(cnn.y, 1))
    '/Users/pengxiang/corpora/spaces/enwiki-20160901/dim300vecs.bin.gz', True)

levy_deps = embedding.Embedding('levy_deps', 300, syntax_label=False)
levy_deps.load_model(
    '/Users/pengxiang/corpora/spaces/levy_deps', False)

pair_triple = embedding.Embedding(
    'event_based_pair_triple', 300, syntax_label=True)
pair_triple.load_model(
    '/Users/pengxiang/corpora/spaces/enwiki-20160901/event_based/'
    'dim300vecs_w_surface_pair_c_lemma_triple', False)
'''

event_model = embedding.Embedding('event_script',
                                  300,
                                  syntax_label=True,
                                  use_ner=True,
                                  use_lemma=True,
                                  include_compounds=True)
event_model.load_model(
    '/Users/pengxiang/corpora/spaces/03141230_dim300vecs.bin', True)

most_sim_event_eval = MostSimEventEvaluator()
most_sim_event_eval.set_use_max_score(True)

most_sim_event_eval.set_rep_only(True)
most_sim_event_eval.set_head_only(True)
'''
most_sim_event_eval.set_model(word2vec)
most_sim_event_eval.evaluate(all_scripts)

most_sim_event_eval.set_model(levy_deps)
Example #11
import copy
from pathlib import Path, PurePath

import numpy as np
import pandas as pd

import embedding
import classifiers

pp = PurePath(Path.cwd()).parts
pdir = PurePath(*pp)

bid, ask = pd.read_csv(str(pdir) + '/data/eurusd-bid-1h.csv'), pd.read_csv(
    str(pdir) + '/data/eurusd-ask-1h.csv')
mids = ((bid.iloc[:, 1] + ask.iloc[:, 1]) / 2).dropna()

_embedding = embedding.Embedding(mids)
time_delayed_mi = _embedding.time_delayed_mutual_information()
# _embedding.plot_mutual_information(time_delayed_mi)

# First minima of time-delayed mutual information
time_delay = _embedding.locmin(time_delayed_mi)[0]
# _embedding.plot_delayed_series(tau=time_delay)

# Calculate FNN in the range of 10 dimensions. Takes some time to calculate!
# dim = np.arange(1, 10 + 1)
# f1, f2, f3 = _embedding.fnn(mids.values, dim=dim, tau=time_delay, window=10, metric='cityblock')
# _embedding.plot_fnn(dim, f1, f2, f3)

# judging from the plot above FNN goes beyond 10% in dim=4

m = 4
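Example #11 picks the embedding delay as the first local minimum of the time-delayed mutual information. The locmin helper is not shown; a minimal numpy stand-in for finding the first local minimum of a 1-D curve (the project's own implementation may differ):

import numpy as np

def first_local_minimum(curve):
    """Index of the first point lower than both neighbours, or None."""
    curve = np.asarray(curve)
    for i in range(1, len(curve) - 1):
        if curve[i] < curve[i - 1] and curve[i] < curve[i + 1]:
            return i
    return None

mi = np.array([2.0, 1.4, 1.1, 1.3, 1.2, 0.9])  # toy mutual-information curve
print(first_local_minimum(mi))                 # 2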
Example #12
import sys

print(sys.argv)
sys.path.append('../..')

import torch
import torch.backends.cudnn as cudnn
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    torch.cuda.empty_cache()
    cudnn.benchmark = True

import segsemdata
import embedding
import numpy as np

print("load model")
net = embedding.Embedding(pretrained="/home/achanhon/vgg16-00b39a1b.pth")
net = net.to(device)

print("load data")
datatrain = segsemdata.makeTinyMiniFrancePerTown(
    datasetpath="/data01/PUBLIC_DATASETS/MiniFrance/tmFrance/",
    town="all",
    dataflag="train")

# optional preprocessing flag passed on the command line
if len(sys.argv) > 1 and sys.argv[1] == "grey":
    datatrain = datatrain.copyTOcache(color=False)
if len(sys.argv) > 1 and sys.argv[1] == "normalize":
    datatrain = datatrain.copyTOcache(color=False, normalize=True)

net.adddataset(datatrain.metadata())
net = net.to(device)
Example #13
def train():
    with tf.Session() as sess:
        # Read data into buckets and compute their sizes.
        print("Reading training data (limit: %d)." % FLAGS.max_train_data_size)
        # load data and embeddings
        train_file = FLAGS.data_dir + "/corpus.txt"
        vocab_file = FLAGS.data_dir + "/vocab.pkl"
        word2id = pkl.load(open(vocab_file, "rb"))
        id2word = {v: k for (k, v) in word2id.items()}

        embeddings = embedding.Embedding(None, word2id, id2word,
                                         word2id["UNK"], word2id["PAD"],
                                         word2id["</s>"], word2id["<s>"])
        vocab_size = len(word2id)

        train_feature_vectors, train_sentences, train_labels = \
            utils.load_data(train_file, word2id, max_sent=FLAGS.max_train_data_size)

        print("vocab size: %d" % vocab_size)
        print("Training on %d instances" % len(train_labels))
        print("Maximum sentence length (train): %d" %
              max([len(y) for y in train_labels]))
        print("Average sentence length (train): %d" %
              np.mean([len(y) for y in train_labels]))

        # bucketing training data

        # equal bucket sizes
        #buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]  #pre-define buckets
        data_buckets, reordering_indexes = utils.put_in_double_buckets(
            np.asarray(train_feature_vectors), np.asarray(train_labels),
            _buckets, embeddings.PAD_id)
        bucket_sizes = [0] * len(_buckets)
        for i, indx in reordering_indexes.items():
            bucket_sizes[i] = len(indx)
        print("Bucket sizes: %s" % str(bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
        # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
        # the size of the i-th training bucket, as used later.
        buckets_scale = [
            sum(bucket_sizes[:i + 1]) / len(train_labels)
            for i in range(len(bucket_sizes))
        ]

        print("Bucket scale: %s" % str(buckets_scale))

        # Create model.
        print("Creating %d layers of %d units." %
              (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, False, vocab_size)

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        while True:
            # Choose a bucket according to the number of samples within
            probs = np.array(buckets_scale) / sum(buckets_scale)
            bucket_id = np.random.choice(range(len(buckets_scale)), p=probs)
            #print("Bucket %d" % bucket_id)

            # Get a batch and make a step.
            start_time = time.time()
            bucket_xs, bucket_ys, input_lens, output_lens, bucket_masks = data_buckets[
                bucket_id]
            # random order of samples in batch
            order = np.random.permutation(len(bucket_xs))
            batch_samples = order[:FLAGS.batch_size]
            #print("Batch samples: %s" % str(batch_samples))
            # get a batch from this bucket
            encoder_inputs = bucket_xs[batch_samples]  # TODO reverse inputs?
            decoder_inputs = bucket_ys[batch_samples]
            target_weights = bucket_masks[batch_samples]
            #print(encoder_inputs.shape, decoder_inputs.shape, target_weights.shape)  # batch x seq_len  -> transpose as input

            _, step_loss, _ = model.step(sess, encoder_inputs.transpose(),
                                         decoder_inputs.transpose(),
                                         target_weights.transpose(), bucket_id,
                                         False)
            step_time += (time.time() -
                          start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch.
                perplexity = math.exp(
                    float(loss)) if loss < 300 else float("inf")
                print(
                    "global step %d learning rate %.4f step-time %.2f perplexity "
                    "%.2f" %
                    (model.global_step.eval(), model.learning_rate.eval(),
                     step_time, perplexity))
                previous_losses.append(loss)
                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.model_dir,
                                               "translate.ckpt")
                model.saver.save(sess,
                                 checkpoint_path,
                                 global_step=model.global_step)
                step_time, loss = 0.0, 0.0
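The training loop above derives bucket-sampling probabilities by renormalizing the cumulative buckets_scale list (which favors later buckets relative to their raw counts); the original TensorFlow seq2seq tutorial instead draws a uniform number and takes the first bucket whose cumulative scale exceeds it. A self-contained sketch of that cumulative-scale lookup with toy sizes:

import numpy as np

bucket_sizes = [120, 300, 450, 130]  # toy counts per bucket
total = sum(bucket_sizes)
buckets_scale = [sum(bucket_sizes[:i + 1]) / total  # cumulative shares in (0, 1]
                 for i in range(len(bucket_sizes))]

r = np.random.random_sample()
bucket_id = min(i for i in range(len(buckets_scale)) if buckets_scale[i] > r)
print(buckets_scale, bucket_id)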
Example #14
        EMOJIS = api_call.get('emoji').keys()
        print(EMOJIS)
    # load the tf model
    with tf.Session() as sess:
        # Load slack metadata
        metadata = None
        with open("metadata.json", "r") as m:
            metadata = json.load(m)

        # Load vocabularies.
        vocab_file = FLAGS.data_dir + "/vocab.pkl"
        word2id = pkl.load(open(vocab_file, "rb"))
        id2word = {v: k for (k, v) in word2id.items()}

        embeddings = embedding.Embedding(None, word2id, id2word,
                                         word2id["UNK"], word2id["PAD"],
                                         word2id["</s>"], word2id["<s>"])

        # Create model and load parameters.
        model = create_model(sess, True, len(word2id))

        if slack_client.rtm_connect():
            print "%s running: id %s, token %s" % (BOT_NAME, BOT_ID, TOKEN)
            while True:
                command, channel = parse_slack_output(slack_client.rtm_read())
                if command and channel:
                    handle_command(command, channel, model, embeddings,
                                   metadata)
                time.sleep(READ_WEBSOCKET_DELAY)
        else:
            print "%s failed" % BOT_NAME