def build_graph(self, phrase_max_size=1, composition_function='RNN', dim=100, batch_size=1, neg=1, learning_rate=0.2, id_word=[],
                 freq_table=[], init_word_data=[], init_context_data=[], epoch_num=1, embedding_train=False):
     if init_word_data:
         init_word_matrix = utils.read_embedding(init_word_data, id_word)
     else:
         init_word_matrix = []
     if init_context_data:
         init_context_matrix = utils.read_embedding(init_context_data, id_word)
     else:
         init_context_matrix = []
     holder, composed, context_id, true_logit, negative_logit = self.forward(phrase_max_size, composition_function, dim, batch_size, neg, 
                                                                             freq_table, init_word_matrix, init_context_matrix, embedding_train)
     self.holder = holder
     self.composed = composed
     self.context_id = context_id
     self.true_logit = true_logit
     self.negative_logit = negative_logit
     loss = self.nce_loss(true_logit, negative_logit, batch_size, embedding_train)
     self.loss = loss
     for length in loss:
         tf.scalar_summary('%s_SGNS_loss'%length, loss[length])
     processed_num, lr, optimize_op = self.optimizer(loss, sum(freq_table), epoch_num, initial_learning_rate=learning_rate)
     self.processed_num = processed_num
     self.optimize_op = optimize_op
     self.lr = lr
     #initialize all values
     tf.initialize_all_variables().run()
    def train(self, sess, saver):
        self.logger.info("training model")
        best_accuracy = 0
        patient_passes = 0
        self.logger.info("init summary")
        self.add_summary(sess.graph)
        self.logger.info("init session")
        self.attach_session(sess)
        embedding, voc_size = read_embedding(self.flags.embedding_data)
        _ = sess.run(self.embedding_init,
                     {self.embedding_placeholder: embedding})

        for epoch in range(self.flags.epoch):
            self.logger.info("Running epoch {} of {}".format(
                epoch + 1, self.flags.epoch))
            accuracy, loss = self.train_epoch(epoch, embedding, saver)
            self.logger.info(
                'accuracy on dev {} loss on dev {}'.format(
                    accuracy, loss))
            if accuracy <= best_accuracy:
                patient_passes += 1
                if patient_passes == self.flags.patient_passes:
                    self.logger.info(
                        ' - {} epochs without improvement, training stopped'.
                        format(patient_passes))
                    break
            else:
                self.logger.info('- New best accuracy {}'.format(accuracy))
                best_accuracy = accuracy
                patient_passes = 0
                saver.save(sess,
                           os.path.join(self.model_dir, "model"),
                           global_step=self.step)
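
The train method above assumes the graph already defines an embedding_placeholder and a matching embedding_init assign op. A minimal sketch of how such a pair is commonly wired up in TF1-style code; the variable names and sizes here are assumptions, not taken from the original project:

import tensorflow as tf

voc_size, embed_dim = 10000, 100  # assumed sizes, for illustration only
embedding_var = tf.get_variable("embedding", shape=[voc_size, embed_dim], trainable=False)
embedding_placeholder = tf.placeholder(tf.float32, shape=[voc_size, embed_dim])
embedding_init = embedding_var.assign(embedding_placeholder)
# later, inside train(): sess.run(embedding_init, {embedding_placeholder: embedding})
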
Example #3
 def preprocess(self,train,test,mode):
     '''
     Preprocess data and do feature extraction
     Args:
         train (dataframe): train data in dataframe format
         test (dataframe): validation data in dataframe format
         mode (str): mode whether it is "training", "eval", "prediction"
     Returns:
         X: train data with shape=[num_examples,max_len,embedding_dim]
         Y: train label with shape=[num_examples,]
         test_X: validation data with shape=[num_examples,max_len,embedding_dim]
         test_Y: validation label with shape=[num_examples,]
     '''
     embeddings_dict,embedding_dim = utils.read_embedding(self.embedding_model)
     num_words = len(embeddings_dict)
     X,Y,test_X,test_Y= utils.create_embedding_features(
                                                         mode,
                                                         train,
                                                         test,
                                                         self.input_col,
                                                         self.target_col,
                                                         embeddings_dict,
                                                         embedding_dim,
                                                         stopwords_file=self.stopwords_file,
                                                         max_len=self.max_len,
                                                         limit=self.limit
                                                         )
     return X,Y,test_X,test_Y,num_words,embedding_dim
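
The docstring above promises train features of shape [num_examples, max_len, embedding_dim]. A minimal, self-contained sketch of the padding-and-lookup step that a helper such as utils.create_embedding_features presumably performs; the whitespace tokenizer and zero-padding of unknown words are assumptions, not the project's actual code:

import numpy as np

def texts_to_embedding_features(texts, embeddings_dict, embedding_dim, max_len):
    # one [max_len, embedding_dim] matrix per example, zero-padded on the right
    X = np.zeros((len(texts), max_len, embedding_dim), dtype=np.float32)
    for i, text in enumerate(texts):
        tokens = text.lower().split()[:max_len]  # naive whitespace tokenizer
        for j, tok in enumerate(tokens):
            vec = embeddings_dict.get(tok)
            if vec is not None:
                X[i, j, :] = vec  # words missing from the embedding stay zero
    return X

# e.g. X = texts_to_embedding_features(train[self.input_col], embeddings_dict, embedding_dim, self.max_len)
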
Example #4
def income_different_size_embedding_scenario():
    # names = np.array(
    #     [['ridge without emd', 'RF without emd'],
    #      ['ridge with emd', 'RF with emd'],
    #      ['ridge just emd', 'RF just emd']])
    # names = [['ridge', 'RF']]
    names = [['ridge']]
    y_path = '../local_resources/Socio_economic_classification_data/income_dataset/y_thresh10.p'

    target = utils.read_target(y_path)
    y = np.array(target['mean_income'])
    n_folds = 10
    sizes = [16, 32, 64, 128]
    for size in sizes:
        print 'running embeddings of size ', size
        emd_path = '../local_resources/Socio_economic_classification_data/income_dataset/thresh10_{0}.emd'.format(
            size)
        x = utils.read_embedding(emd_path, target)
        results = run_all_datasets([x], y, names, regressors, n_folds)
        # all_results = utils.merge_results(results)
        all_results = pd.concat([x for x in results])
        all_results.rename(columns={n_folds: 'train'}, inplace=True)
        results, tests = t_tests(all_results)
        print results
        path = '../results/income/thresh10_' + str(
            size) + '_' + utils.get_timestamp() + '.csv'
        results.to_csv(path, index=True)
def karate_scenario():
    deepwalk_path = 'local_resources/zachary_karate/size8_walks1_len10.emd'

    y_path = 'local_resources/zachary_karate/y.p'
    x_path = 'local_resources/zachary_karate/X.p'

    target = utils.read_target(y_path)

    x, y = utils.read_data(x_path, y_path, threshold=0)

    names = [['logistic'], ['deepwalk']]

    x_deepwalk = utils.read_embedding(deepwalk_path, target)
    # all_features = np.concatenate((x.toarray(), x_deepwalk), axis=1)
    X = [x_deepwalk, normalize(x, axis=0)]
    n_folds = 2
    results = run_all_datasets(X, y, names, classifiers, n_folds)
    all_results = utils.merge_results(results)
    results, tests = utils.stats_test(all_results)
    tests[0].to_csv('results/karate/deepwalk_macro_pvalues' +
                    utils.get_timestamp() + '.csv')
    tests[1].to_csv('results/karate/deepwalk_micro_pvalues' +
                    utils.get_timestamp() + '.csv')
    print 'macro', results[0]
    print 'micro', results[1]
    macro_path = 'results/karate/deepwalk_macro' + utils.get_timestamp(
    ) + '.csv'
    micro_path = 'results/karate/deepwalk_micro' + utils.get_timestamp(
    ) + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)
Example #6
    def _build_vocab(self, counter, embedding_config):
        """
        :param counter: counter of words in dataset
        :param embedding_config: word_embedding config: (root, word_type, dim)
        :return: itos, stoi, vectors
        """

        wv_dict, wv_vectors, wv_size = read_embedding(embedding_config)

        # embedding size = glove vector size
        embed_size = wv_vectors.size(1)
        print("word embedding size: %d" % embed_size)

        # build itos and stoi
        # words_in_dataset = sorted(counter.keys(), key=lambda x: counter[x], reverse=True)
        words_in_dataset = counter.keys()

        itos = self.specials[:]

        stoi = defaultdict(self.get_unk)

        itos.extend(words_in_dataset)
        for idx, word in enumerate(itos):
            stoi[word] = idx

        # build vectors
        vectors = torch.zeros([len(itos), embed_size])
        for word, idx in stoi.items():
            idx_in_pretrained_array = wv_dict.get(word, None)
            if idx_in_pretrained_array is not None:
                vectors[idx, :wv_size].copy_(wv_vectors[idx_in_pretrained_array])
        return itos, stoi, vectors
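
A hedged follow-up showing one way the returned vectors tensor can seed a PyTorch embedding layer; the trainable flag and the lookup example are illustrative assumptions, not part of the original class:

import torch
import torch.nn as nn

def make_embedding_layer(vectors, trainable=True):
    # vectors has shape [len(itos), embed_size]; from_pretrained copies it into the layer weights
    return nn.Embedding.from_pretrained(vectors, freeze=not trainable)

# given itos, stoi, vectors = self._build_vocab(counter, embedding_config):
# layer = make_embedding_layer(vectors)
# ids = torch.tensor([stoi[w] for w in ["the", "answer"]])
# word_vecs = layer(ids)  # shape [2, embed_size]
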
def change_index(emd_path, target):
    """
    change the embeding index to be Twitter IDs
    :param emd_path: the path to the embedding file
    :param target: a pandas DataFrame containing the labels indexed by Twitter IDs
    :return: None
    """
    x = utils.read_embedding(emd_path, target)
    df = pd.DataFrame(data=x, index=target.index)
    try:
        del df.index.name
    except AttributeError:
        pass
    df.to_csv(emd_path)
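
A hypothetical invocation of change_index, borrowing the income-dataset paths that reindex_embeddings below uses, purely for illustration:

target = utils.read_target('../../local_resources/income_dataset/y_thresh10.p')
change_index('../../local_results/income_dataset/thresh10_64.emd', target)
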
def reindex_embeddings():
    """
    changes the first column of embeddings from an index to a Twitter ID
    :return:
    """
    y_path = '../../local_resources/income_dataset/y_thresh10.p'
    target = utils.read_target(y_path)
    sizes = [16, 32, 64, 128]

    for size in sizes:
        print 'running embeddings of size ', size
        emd_path = '../../local_results/income_dataset/thresh10_{0}.emd'.format(
            size)
        x = utils.read_embedding(emd_path, target)
        df = pd.DataFrame(data=x, index=target.index)
        try:
            del df.index.name
        except AttributeError:
            pass
        df.to_csv(emd_path)
Example #9
def income_scenario():
    # names = np.array(
    #     [['ridge without emd', 'RF without emd'],
    #      ['ridge with emd', 'RF with emd'],
    #      ['ridge just emd', 'RF just emd']])
    # names = [['ridge', 'RF']]
    names = [['ridge']]
    y_path = '../local_resources/Socio_economic_classification_data/income_dataset/y_thresh10.p'
    emd_path = '../local_resources/Socio_economic_classification_data/income_dataset/thresh10_64.emd'

    target = utils.read_target(y_path)
    x = utils.read_embedding(emd_path, target)
    y = np.array(target['mean_income'])
    n_folds = 10
    # x, y = utils.read_data(x_path, y_path, threshold=1)
    results = run_all_datasets([x], y, names, regressors, n_folds)
    # all_results = utils.merge_results(results)
    all_results = pd.concat([x for x in results])
    all_results.rename(columns={n_folds: 'train'}, inplace=True)
    results, tests = t_tests(all_results)
    print results
    path = '../results/income/thresh10_' + utils.get_timestamp() + '.csv'
    results.to_csv(path, index=True)
from tqdm import tqdm

if __name__ == '__main__':
    config_file = sys.argv[1]
    configure = json.load(open(config_file))
    config = configure["main_configuration"]
    config_data = config["data_sets"]
    config_model = config["model"]
    config_model_param = config_model["parameters"]
    config_model_train = config_model["train"]
    config_model_test = config_model["test"]
    print("Data extraction\nConfiguration: ")
    print(json.dumps(config, indent=2), end='\n')

    print("Read embeddings ...")
    embed_tensor = convert_embed_2_numpy(read_embedding(config_data["embed"]),
                                         config_data["vocab_size"])

    print("Create a model...")

    query = Input(name="in_query",
                  shape=(config_data['query_maxlen'], ),
                  dtype='int32')  # ex: query vector of 10 words
    doc = Input(name="in_doc",
                shape=(config_data['doc_maxlen'], ),
                dtype='int32')

    embedding = Embedding(config_data['vocab_size'],
                          config_data['embed_size'],
                          weights=[embed_tensor],
                          trainable=config_model_train['train_embed'],
Example #11
def main():

    config = """
    {
        "query_maxlen": 10,
        "doc_maxlen": 1000,
        "hist_size": 200,
        "embed_path": "../../data/support/embed_query_100w_50.txt",
        "input": "../../runtime_data/fulltext/real_train_HL_solrex",
        "output": "../../runtime_data/fulltext/real_train_HL_solrex_drmm"
    }
    """

    config = json.loads(config)

    embed_dict, vocab_size, embed_size, word_dict, idf_dict = read_embedding(
        config['embed_path'])
    embed = np.float32(np.random.uniform(-9, 9, [vocab_size, embed_size]))
    embed_dict = convert_embed_2_numpy('embed',
                                       embed_dict=embed_dict,
                                       embed=embed,
                                       normalize=True)

    if not os.path.exists(config['output']):
        os.mkdir(config['output'])

    for dirpath, dirnames, filenames in os.walk(config['input']):
        for fn in filenames:
            if not fn.endswith('.txt'):
                continue
            print os.path.join(dirpath, fn)
            with open(os.path.join(dirpath, fn)) as file:
                qid, query, uid, doc, label = file.readline().split('\t')
                output = open(os.path.join(config['output'], qid + '.txt'),
                              'w')
                query = query.strip().split(
                )[:min(len(query), config['query_maxlen'])]
                query = convert_term2id(query, word_dict)
                query_embed = np.zeros([config['query_maxlen'], embed_size],
                                       dtype=np.float32)
                for i, wid in enumerate(query):
                    query_embed[i] = embed_dict[wid]
                file.seek(0)
                for line in file:
                    qid, query, uid, doc, label = line.split('\t')
                    uid = uid.strip()
                    qid = qid.strip()
                    query = query.strip()
                    label = label.strip()
                    doc = doc.strip().split(
                    )[:min(len(doc), config['doc_maxlen'])]
                    doc = convert_term2id(doc, word_dict)
                    doc_embed = np.zeros([config['doc_maxlen'], embed_size],
                                         dtype=np.float32)
                    for i, wid in enumerate(doc):
                        doc_embed[i] = embed_dict[wid]
                    hist = cal_hist(query_embed, doc_embed, config)
                    hist = ' '.join(map(str, np.reshape(hist, [-1])))
                    output.write(qid + '\t' + query + '\t' + uid + '\t' +
                                 hist + '\t' + label + '\n')

                output.close()

    # def restore(self):
    #     self.logger.info("loading model")
    #     self.attach_session(sess)
    #     saver.restore(sess, self.flags.model_output)


if __name__ == "__main__":
    eb, i = read_embedding('./data/embedding.pkl')
    print(eb[4])
Example #13
config = """
{
    "query_maxlen": 10,
    "doc_maxlen": 1000,
    "hist_size": 200,
    "embed_path": "/home/zhenfan/lab/K-NRM/data/embed_query_50w_50.txt",
    "input": "/home/zhenfan/lab/K-NRM/data/real_train_HL_solrex",
    "output": "/home/zhenfan/lab/K-NRM/data/real_train_HL_solrex_drmm"
}
"""
config = json.loads(config)
embed_dict, vocab_size, embed_size, word_dict, idf_dict = read_embedding(
    config['embed_path'])
embed = np.float32(np.random.uniform(-9, 9, [vocab_size, embed_size]))
embed_dict = convert_embed_2_numpy('embed',
                                   embed_dict=embed_dict,
                                   embed=embed,
                                   normalize=True)


def cal_hist(query_embed, doc_embed, config):
    hist = np.zeros([config['query_maxlen'], config['hist_size']],
                    dtype=np.int32)
    hist[:] = 1
    mm = np.zeros([config['query_maxlen'], config['doc_maxlen']],
                  dtype=np.float32)
    for i in range(config['query_maxlen']):
        for j in range(config['doc_maxlen']):
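
The body of cal_hist is cut off at this point in the example. As a self-contained sketch of what a DRMM-style matching histogram of this shape typically computes (an assumption about the intent, not the original implementation), cosine similarities between every query/document term pair can be bucketed into hist_size bins spanning [-1, 1]:

import numpy as np

def matching_histogram(query_embed, doc_embed, hist_size):
    # cosine similarity between every query term and every document term
    q_norm = query_embed / (np.linalg.norm(query_embed, axis=1, keepdims=True) + 1e-8)
    d_norm = doc_embed / (np.linalg.norm(doc_embed, axis=1, keepdims=True) + 1e-8)
    sims = np.dot(q_norm, d_norm.T)  # [query_maxlen, doc_maxlen]
    # start counts at 1, mirroring hist[:] = 1 above, so a later log transform stays finite
    hist = np.ones([query_embed.shape[0], hist_size], dtype=np.float32)
    for i, row in enumerate(sims):
        bins = np.minimum(((row + 1.0) / 2.0 * (hist_size - 1)).astype(int), hist_size - 1)
        for b in bins:
            hist[i, b] += 1
    return hist  # DRMM's log-count variant would apply np.log10 to these counts
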
Example #14
 def __init__(self, flags):
     """Init class."""
     self.flags = flags
     self.embedding, self.embedding_size = read_embedding(
         self.flags.model_dir + self.flags.embedding_path)