Example #1
  def build_graph(self):
    # Get the training data.
    (words, counts, words_per_epoch, current_epoch, total_words_processed,
     examples, labels) = word2vec.skipgram(filename=word_config.train_data_path,
                                           batch_size=word_config.batch_size,
                                           window_size=word_config.window_size,
                                           min_count=word_config.min_count,
                                           subsample=word_config.subsample)
    # vocab_words, vocab_counts, words_per_epoch = self._sess.run([words, counts, words_per_epoch])
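    # Reuse a previously saved vocabulary if one exists; otherwise run the graph once to materialize it.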
    if tf.gfile.Exists(os.path.join(word_config.output_dir, 'vocab.txt')):
      vocab_words, vocab_counts = self.load_vocab()
    else:
      vocab_words, vocab_counts = self._sess.run([words, counts])

    vocab_size = len(vocab_words)
    print("Data file: ", word_config.train_data_path)
    print("Vocab size: ", vocab_size - 1, " + UNK")
    # print("Words per epoch: ", words_per_epoch)

    self._id2word = vocab_words
    for id, word in enumerate(self._id2word):
      self._word2id[word] = id

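    # Input embeddings: uniform init in [-0.5/embed_size, 0.5/embed_size]; output (context) embeddings start at zero.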
    w_embed_in = tf.Variable(tf.random_uniform([vocab_size, word_config.embed_size],
                                                -0.5 / word_config.embed_size, 0.5 / word_config.embed_size),
                             name="w_embed_in")
    w_embed_out = tf.Variable(tf.zeros([vocab_size, word_config.embed_size]), name="w_embed_out")

    self.param_summary(w_embed_in)
    self.param_summary(w_embed_out)

    # learning_rate = tf.Variable(word_config.learning_rate, trainable=False, name="learning_rate")

    global_step = tf.Variable(0, trainable=False, name="global_step")

    total_words = tf.cast(words_per_epoch * word_config.max_steps, tf.float32)

    # Linear learning rate decay, floored at 1e-4 of the base rate.
    learning_rate = word_config.learning_rate * tf.maximum(
        0.0001, 1.0 - tf.cast(total_words_processed, tf.float32) / total_words)

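    # One neg_train step runs a round of negative-sampling SGD; the control dependency below also increments global_step.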
    inc = global_step.assign_add(1)
    with tf.control_dependencies([inc]):
      train = word2vec.neg_train(w_embed_in, w_embed_out, examples, labels, learning_rate, vocab_counts.tolist(),
                                 word_config.nr_neg_samples)

    self._vocab_words = vocab_words
    self._vocab_counts = vocab_counts
    self._vocab_size = vocab_size
    self._w_embed_in = w_embed_in
    self._w_embed_out = w_embed_out
    self._train = train
    self._examples = examples
    self._labels = labels
    self._global_step = global_step
    self._current_epoch = current_epoch
    self._total_words_processed = total_words_processed
    self._learning_rate = learning_rate
    print("end of build graph")
Example #2
  def build_graph(self):
    """Build the model graph."""
    opts = self._options

    # The training data. A text file.
    (words, counts, words_per_epoch, current_epoch, total_words_processed,
     examples, labels) = word2vec.skipgram(filename=opts.train_data,
                                           batch_size=opts.batch_size,
                                           window_size=opts.window_size,
                                           min_count=opts.min_count,
                                           subsample=opts.subsample)

    (opts.vocab_words, opts.vocab_counts,
     opts.words_per_epoch) = self._session.run([words, counts, words_per_epoch])
    opts.vocab_size = len(opts.vocab_words)
    print("Data file: ", opts.train_data)
    print("Vocab size: ", opts.vocab_size - 1, " + UNK")
    print("Words per epoch: ", opts.words_per_epoch)

    self._id2word = opts.vocab_words
    for i, w in enumerate(self._id2word):
      self._word2id[w] = i

    # Declare all variables we need.
    # Input words embedding: [vocab_size, emb_dim]
    w_in = tf.Variable(
        tf.random_uniform([opts.vocab_size, opts.emb_dim],
                          -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),
        name="w_in")

    # Output words embedding: [vocab_size, emb_dim]
    w_out = tf.Variable(tf.zeros([opts.vocab_size, opts.emb_dim]), name="w_out")

    # Global step: scalar, i.e., shape []
    global_step = tf.Variable(0, name="global_step")

    # Linear learning rate decay.
    words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
    lr = opts.learning_rate * tf.maximum(
        0.0001, 1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train)

    # Training nodes.
    inc = global_step.assign_add(1)
    with tf.control_dependencies([inc]):
      train = word2vec.neg_train(w_in,
                                 w_out,
                                 examples,
                                 labels,
                                 lr,
                                 vocab_count=opts.vocab_counts.tolist(),
                                 num_negative_samples=opts.num_samples)

    self._w_in = w_in
    self._examples = examples
    self._labels = labels
    self._lr = lr
    self._train = train
    self.step = global_step
    self._epoch = current_epoch
    self._words = total_words_processed
Example #3
  def build_graph(self):
    """Build the model graph."""
    opts = self._options

    # The training data. A text file.
    (words, counts, words_per_epoch, current_epoch, total_words_processed,
     examples, labels) = word2vec.skipgram(filename=opts.train_data,
                                           batch_size=opts.batch_size,
                                           window_size=opts.window_size,
                                           min_count=opts.min_count,
                                           subsample=opts.subsample)
    (opts.vocab_words, opts.vocab_counts,
     opts.words_per_epoch) = self._session.run([words, counts, words_per_epoch])
    opts.vocab_size = len(opts.vocab_words)
    print("Data file: ", opts.train_data)
    print("Vocab size: ", opts.vocab_size - 1, " + UNK")
    print("Words per epoch: ", opts.words_per_epoch)

    self._id2word = opts.vocab_words
    for i, w in enumerate(self._id2word):
      self._word2id[w] = i

    # Declare all variables we need.
    # Input words embedding: [vocab_size, emb_dim]
    w_in = tf.Variable(
        tf.random_uniform([opts.vocab_size, opts.emb_dim],
                          -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),
        name="w_in")

    # Output words embedding: [vocab_size, emb_dim]
    w_out = tf.Variable(tf.zeros([opts.vocab_size, opts.emb_dim]), name="w_out")

    # Global step: []
    global_step = tf.Variable(0, name="global_step")

    # Linear learning rate decay.
    words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
    lr = opts.learning_rate * tf.maximum(
        0.0001,
        1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train)

    # Training nodes.
    inc = global_step.assign_add(1)
    with tf.control_dependencies([inc]):
      train = word2vec.neg_train(w_in,
                                 w_out,
                                 examples,
                                 labels,
                                 lr,
                                 vocab_count=opts.vocab_counts.tolist(),
                                 num_negative_samples=opts.num_samples)

    self._w_in = w_in
    self._examples = examples
    self._labels = labels
    self._lr = lr
    self._train = train
    self.step = global_step
    self._epoch = current_epoch
    self._words = total_words_processed
Example #4
  def build_graph(self):
    """Build the model graph."""
    opts = self._options

    # The training data for text skipgram. A text file.
    (words, w_counts, words_per_epoch, w_current_epoch, total_words_processed,
     w_examples, w_labels) = word2vec.skipgram(filename=opts.text_data,
                                               batch_size=opts.batch_size,
                                               window_size=opts.window_size,
                                               min_count=opts.min_count,
                                               subsample=opts.subsample)
    # The training data for the entity skipgram.
    (entities, e_counts, entities_per_epoch, e_current_epoch, total_entities_processed,
     e_examples, e_labels) = kg_skipgram(filename=opts.kg_data,
                                         batch_size=opts.batch_size,
                                         min_count=opts.min_count)

    (opts.vocab_words, vocab_word_counts, opts.words_per_epoch,
     opts.vocab_entities, vocab_entity_counts, opts.entities_per_epoch) = self._session.run(
         [words, w_counts, words_per_epoch, entities, e_counts, entities_per_epoch])

    # The training data for the anchor alignment skipgram.
    (anchors_per_epoch, a_current_epoch, total_anchors_processed,
     a_examples, a_labels) = align_model(filename=opts.anchor_data,
                                         batch_size=opts.batch_size,
                                         window_size=opts.window_size,
                                         subsample=opts.subsample,
                                         vocab_word=opts.vocab_words.tolist(),
                                         vocab_word_freq=vocab_word_counts.tolist(),
                                         vocab_entity=opts.vocab_entities.tolist())


    opts.vocab_word_size = len(opts.vocab_words)
    opts.vocab_entity_size = len(opts.vocab_entities)
    opts.vocab_size = opts.vocab_word_size + opts.vocab_entity_size
    opts.anchors_per_epoch = self._session.run(anchors_per_epoch)

    # for neg sample, [vocab_word_counts, 0...0] and [0...0, vocab_entity_counts]
    opts.vocab_word_counts = tf.concat(0, [vocab_word_counts, tf.zeros([opts.vocab_entity_size])]).eval()
    opts.vocab_entity_counts = tf.concat(0, [tf.zeros([opts.vocab_word_size]), vocab_entity_counts]).eval()

    print("Text data file: ", opts.text_data)
    print("Word vocab size: ", opts.vocab_word_size - 1, " + UNK")
    print("Words per epoch: ", opts.words_per_epoch)

    print("Entity data file: ", opts.kg_data)
    print("Entity vocab size: ", opts.vocab_entity_size)
    print("Entities per epoch: ", opts.entities_per_epoch)
    print("Anchor data file: ", opts.anchor_data)
    print("Anchors per epoch: ", opts.anchors_per_epoch)

    # Ids below opts.vocab_word_size are words; the remaining ids are entities.
    self._id2item = np.concatenate((opts.vocab_words, opts.vocab_entities), 0)
    for i, w in enumerate(self._id2item):
      self._item2id[w] = i

    # Declare all variables we need.
    # Input embedding including both words and entities: [vocab_size, emb_dim]
    # Shard the variable if it would exceed ~2 GB, because of the saver's per-tensor size limit.

    self._is_shard = False
    vocab_size_limit = 511000000 / opts.emb_dim
    # 511e6 floats * 4 bytes/float ~= 2044 MB, just under the 2 GB limit.
    if opts.vocab_size > vocab_size_limit:
      self._is_shard = True
      remain_size = opts.vocab_size
      for i in xrange(opts.vocab_size / vocab_size_limit):
        self._v_in.append(tf.Variable(
            tf.random_uniform([vocab_size_limit, opts.emb_dim],
                              -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),
            name="v_in" + str(i)))
        self._v_out.append(tf.Variable(tf.zeros([vocab_size_limit, opts.emb_dim]),
                                       name="v_out" + str(i)))
        remain_size -= vocab_size_limit
      if remain_size != 0:
        tmp_cap = len(self._v_in)
        self._v_in.append(tf.Variable(
            tf.random_uniform([remain_size, opts.emb_dim],
                              -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),
            name="v_in" + str(tmp_cap)))
        self._v_out.append(tf.Variable(tf.zeros([remain_size, opts.emb_dim]),
                                       name="v_out" + str(tmp_cap)))
    else:
      self._v_in.append(tf.Variable(
          tf.random_uniform([opts.vocab_size, opts.emb_dim],
                            -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),
          name="v_in"))
      self._v_out.append(tf.Variable(tf.zeros([opts.vocab_size, opts.emb_dim]),
                                     name="v_out"))

    # Global step: []
    global_step = tf.Variable(0, name="global_step")

    # Linear learning rate decay.
    words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
    lr = opts.learning_rate * tf.maximum(
        0.0001,
        1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train)
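    # Note that the decay tracks only total_words_processed; entity and anchor progress does not affect lr.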

    # Training nodes.
    inc = global_step.assign_add(1)
    with tf.control_dependencies([inc]):
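      # Words, entities and anchors share a single embedding table; entity ids
      # are offset by vocab_word_size so that they index past the word rows.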
      w_train = word2vec.neg_train(tf.concat(0, self._v_in) if self._is_shard else self._v_in[0],
                                   tf.concat(0, self._v_out) if self._is_shard else self._v_out[0],
                                   w_examples,
                                   w_labels,
                                   lr,
                                   vocab_count=opts.vocab_word_counts.tolist(),
                                   num_negative_samples=opts.num_samples)
      e_train = neg_train(tf.concat(0, self._v_in) if self._is_shard else self._v_in[0],
                          tf.concat(0, self._v_out) if self._is_shard else self._v_out[0],
                          e_examples + opts.vocab_word_size,
                          e_labels + opts.vocab_word_size,
                          lr,
                          vocab_count=opts.vocab_entity_counts.tolist(),
                          num_negative_samples=opts.num_samples)
      a_train = word2vec.neg_train(tf.concat(0, self._v_in) if self._is_shard else self._v_in[0],
                                   tf.concat(0, self._v_out) if self._is_shard else self._v_out[0],
                                   a_examples + opts.vocab_word_size,
                                   a_labels,
                                   lr,
                                   vocab_count=opts.vocab_word_counts.tolist(),
                                   num_negative_samples=opts.num_samples)


    self._lr = lr
    self._train_text = w_train
    self._train_kg = e_train
    self._train_align = a_train
    self.step = global_step
    self._epoch_text = w_current_epoch
    self._epoch_kg = e_current_epoch
    self._epoch_align = a_current_epoch
    self._words = total_words_processed
    self._entities = total_entities_processed
    self._anchors = total_anchors_processed
Example #5
    def build_graph(self):
        """Build the model graph."""
        opts = self._options

        # The training data. A text file.
        (words, counts, words_per_epoch, current_epoch, total_words_processed,
         examples, labels) = word2vec.skipgram(filename=opts.train_data,
                                               batch_size=opts.batch_size,
                                               window_size=opts.window_size,
                                               min_count=opts.min_count,
                                               subsample=opts.subsample)
        (opts.vocab_words, opts.vocab_counts,
         opts.words_per_epoch) = self._session.run(
             [words, counts, words_per_epoch])
        opts.vocab_size = len(opts.vocab_words)
        print("Data file: ", opts.train_data)
        print("Vocab size: ", opts.vocab_size - 1, " + UNK")
        print("Words per epoch: ", opts.words_per_epoch)

        self._id2word = opts.vocab_words
        for i, w in enumerate(self._id2word):
            self._word2id[w] = i

        # Before the standard graph construction, load positive/negative seed terms from a lexicon and map them to vocabulary ids.

        SOCIAL = False
        SOL = True
        num_words = 100
        LOVEHATE = False
        FIN = False
        BULLBEAR = False
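        # Exactly one of the flags above should be True; it selects which seed lexicon is used below.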

        if (SOCIAL):  #use one of the social polar lexicons
            fileNames = [
                "./data/train_lexicons/10_social_employment_opportunities.txt",
                "./data/train_lexicons/10_social_freedom_from_discrimination.txt",
                "./data/train_lexicons/10_social_good_education.txt",
                "./data/train_lexicons/10_social_honest_and_responsive_government.txt",
                "./data/train_lexicons/10_social_political_freedom.txt"
            ]
            social_issue_to_use = 0
            polarity_dict = pd.read_csv(fileNames[social_issue_to_use],
                                        header=None,
                                        names=["pos", "neg"])
            neg_terms = polarity_dict["neg"]
            pos_terms = polarity_dict["pos"]

        elif (SOL):  #use the stock opinion lexicon
            neg_terms = pd.read_csv("./data/train_lexicons/sol_train_neg.csv",
                                    header=0)
            pos_terms = pd.read_csv("./data/train_lexicons/sol_train_pos.csv",
                                    header=0)
            ordered = True
            if (ordered):
                neg_terms = neg_terms.sort_values(
                    ["v1"], axis=0, ascending=True)["w1"].iloc[:num_words]
                pos_terms = pos_terms.sort_values(
                    ["v1"], axis=0, ascending=False)["w1"].iloc[:num_words]
            else:
                neg_terms = neg_terms.sample(n=num_words)["w1"]
                pos_terms = pos_terms.sample(n=num_words)["w1"]

        elif (LOVEHATE):  #use the love-hate lexicon
            justlovehate = True
            neg_terms = pd.read_csv("./data/train_lexicons/hate.txt",
                                    names=["neg"])
            pos_terms = pd.read_csv("./data/train_lexicons/love.txt",
                                    names=["pos"])
            if (justlovehate):
                neg_terms = neg_terms["neg"].iloc[-1]
                pos_terms = pos_terms["pos"].iloc[-1]
            else:
                neg_terms = neg_terms["neg"].iloc[15:]
                pos_terms = pos_terms["pos"].iloc[15:]

        elif (FIN):  #use FIN lexicon
            neg_terms = pd.read_csv("./data/train_lexicons/fin_negatives.csv",
                                    header=0,
                                    index_col=0)["negs"]
            pos_terms = pd.read_csv("./data/train_lexicons/fin_positives.csv",
                                    header=0,
                                    index_col=0)["poss"]

        elif (BULLBEAR):
            neg_terms = ["bearish"]
            pos_terms = ["bullish"]

        self.neg_terms_in_vocab = []
        self.neg_ids = []
        self.pos_terms_in_vocab = []
        self.pos_ids = []
        opts = self._options
        for neg_term in neg_terms:
            neg_term = neg_term.encode()
            if neg_term in opts.vocab_words:
                self.neg_terms_in_vocab.append(neg_term)
                self.neg_ids.append(self._word2id.get(neg_term, 0))
        self.neg_ids = tf.constant(self.neg_ids)
        for pos_term in pos_terms:
            pos_term = pos_term.encode()
            if pos_term in opts.vocab_words:
                self.pos_terms_in_vocab.append(pos_term)
                self.pos_ids.append(self._word2id.get(pos_term, 0))
        self.pos_ids = tf.constant(self.pos_ids)

        if (LOVEHATE):
            #evaluation only works for the love-hate lexicon ...
            self.eval_neg_id = [self._word2id.get("hate", 0)]
            self.eval_neg_id = tf.constant(self.eval_neg_id)
            self.eval_pos_id = [self._word2id.get("love", 0)]
            self.eval_pos_id = tf.constant(self.eval_pos_id)

            #eval neg_ids and pos_ids (all train words)
            neg_terms = pd.read_csv("./data/train_lexicons/hate.txt",
                                    names=["neg"])
            pos_terms = pd.read_csv("./data/train_lexicons/love.txt",
                                    names=["pos"])
            neg_terms = neg_terms["neg"].iloc[15:]
            pos_terms = pos_terms["pos"].iloc[15:]
            self.eval_neg_ids = []
            self.eval_pos_ids = []
            opts = self._options
            for neg_term in neg_terms:
                neg_term = neg_term.encode()
                if neg_term in opts.vocab_words:
                    self.eval_neg_ids.append(self._word2id.get(neg_term, 0))
            self.eval_neg_ids = tf.constant(self.eval_neg_ids)
            for pos_term in pos_terms:
                pos_term = pos_term.encode()
                if pos_term in opts.vocab_words:
                    self.eval_pos_ids.append(self._word2id.get(pos_term, 0))
            self.eval_pos_ids = tf.constant(self.eval_pos_ids)
        else:
            neg_terms = pd.read_csv("./data/train_lexicons/sol_train_neg.csv",
                                    names=["neg"])
            pos_terms = pd.read_csv("./data/train_lexicons/sol_train_pos.csv",
                                    names=["pos"])
            neg_terms = neg_terms["neg"].iloc[num_words:]
            pos_terms = pos_terms["pos"].iloc[num_words:]
            self.eval_neg_ids = []
            self.eval_pos_ids = []
            opts = self._options
            for neg_term in neg_terms:
                neg_term = neg_term.encode()
                if neg_term in opts.vocab_words:
                    self.eval_neg_ids.append(self._word2id.get(neg_term, 0))
            self.eval_neg_ids = tf.constant(self.eval_neg_ids)
            for pos_term in pos_terms:
                pos_term = pos_term.encode()
                if pos_term in opts.vocab_words:
                    self.eval_pos_ids.append(self._word2id.get(pos_term, 0))
            self.eval_pos_ids = tf.constant(self.eval_pos_ids)

        # Continue with the standard word2vec graph construction.
        # Declare all variables we need.
        # Input words embedding: [vocab_size, emb_dim]
        w_in = tf.Variable(tf.random_uniform([opts.vocab_size, opts.emb_dim],
                                             -0.5 / opts.emb_dim,
                                             0.5 / opts.emb_dim),
                           name="w_in")

        # Output words embedding: [vocab_size, emb_dim]
        w_out = tf.Variable(tf.zeros([opts.vocab_size, opts.emb_dim]),
                            name="w_out")

        # Global step: []
        global_step = tf.Variable(0, name="global_step")

        # Linear learning rate decay.
        words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
        lr = opts.learning_rate * tf.maximum(
            0.0001,
            1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train)

        # Training nodes.
        inc = global_step.assign_add(1)
        with tf.control_dependencies([inc]):
            train = word2vec.neg_train(w_in,
                                       w_out,
                                       examples,
                                       labels,
                                       lr,
                                       vocab_count=opts.vocab_counts.tolist(),
                                       num_negative_samples=opts.num_samples)

        self._w_in = w_in
        self._examples = examples
        self._labels = labels
        self._lr = lr
        self._train = train
        self.step = global_step
        self._epoch = current_epoch
        self._words = total_words_processed

        #Train nodes with antonyms
        loss2 = self.antonym_loss_and_optimize()
        self._loss2 = loss2
Example #6
  # global_step = tf.Variable(0, trainable=False)
  learning_rate_word2vec = tf.Variable(float(lr_rate), trainable=False)

  selected_sense_output_indices = tf.placeholder(tf.int32, None, name='selected_sense_output_indices')
  selected_sense_input_indices = tf.placeholder(tf.int32, None, name='selected_sense_input_indices')

  # [batch_size, sense_dim, sense_embedding_dim]
  embedded_sense_input = tf.nn.embedding_lookup(s_in, selected_sense_input_indices)
  embedded_sense_output = tf.nn.embedding_lookup(s_out, selected_sense_output_indices)

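  # Score for each (input sense, output sense) pair: sigmoid of their dot product.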
  reward_sense_prob = tf.sigmoid(tf.reduce_sum(tf.mul(embedded_sense_input, embedded_sense_output), 1))
  print 'embedded_sense_input:shape=', embedded_sense_input

  inc = total_words_processed.assign_add(batch_size)
  with tf.control_dependencies([inc]):
    train = word2vec.neg_train(s_in, s_out,
                               selected_sense_input_indices,
                               selected_sense_output_indices,
                               learning_rate_word2vec,
                               vocab_count=sense_counts,
                               num_negative_samples=samp_size)
  
  init_word2vec = tf.initialize_all_variables()
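  # tf.initialize_all_variables is the pre-1.0 name for tf.global_variables_initializer.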

with tf.variable_scope("RLWE"):
  w_out = tf.Variable(tf.zeros([sense_size, embedding_dim]),\
                trainable=True, name="word_outputs")
  
  w_in = tf.Variable(tf.random_uniform([vocab_size, embedding_dim],-(3./embedding_dim)**0.5,(3./embedding_dim)**0.5),\
                trainable=True, name="word_embeddings")

  global_step = tf.Variable(0, trainable=False)
  learning_rate = tf.Variable(float(lr_rate), trainable=False)
  
  context_indices = tf.placeholder(tf.int32, [context_window*2+batch_size, max_context_length])
  sense_indices = tf.placeholder(tf.int32, [(context_window*2+batch_size) * sense_dim])