Python load_dataset 예제들, data_preprocessing.load_dataset Python 예제들

예제 #1

0

파일 보기

 def __init__(self):
     """Load embeddings and dataset index maps, then build both models.

     Also derives the reverse lookups ``index2lemma`` and ``index2synset``
     from their forward maps and stores the result of
     ``dp.load_sentibabelnet()`` as ``synset2score``.
     """
     embeddings, word2index = dp.load_embeddings()
     self.__embeddings = embeddings
     self.word2index = word2index

     dataset_maps = dp.load_dataset(self.word2index)
     (self.lemma2index,
      self.pos2index,
      self.synset2index,
      self.ambiguous_words) = dataset_maps

     # The model loaders run only after the embeddings and index maps are set.
     self.multitask_model = self.__load_multitask_model()
     self.disambiguation_model = self.__load_disambiguation_model()

     # Reverse lookups: index -> lemma and index -> synset.
     self.index2lemma = dict(
         (index, lemma) for lemma, index in self.lemma2index.items())
     self.index2synset = dict(
         (index, synset) for synset, index in self.synset2index.items())

     self.synset2score = dp.load_sentibabelnet()

예제 #2

0

파일 보기

파일: clustering.py 프로젝트: gtyopal/bug_clustering

def train_cluster_model():
    """Run the clustering training steps in order, printing progress markers.

    The visible steps are: load the dataset with ``dp.load_dataset``, combine
    the two returned objects with ``train.merge_all_features``, call
    ``model_clustering.clustering`` on the merged data, and finally fetch the
    result of ``model_clustering.get_nlp_features``.
    """
    print("Preprocessing data...")
    print("start load_dataset")
    # load_dataset() returns two objects, unpacked here.
    nlp_data_clean, df_bug_nonnlp = dp.load_dataset()
    print("end load_dataset")

    print("start make traing_all_features")
    # Only the first of the two values returned by merge_all_features is kept.
    train_data_all, _ = train.merge_all_features(nlp_data_clean, df_bug_nonnlp)
    print("end make traing_all_features")

    # train_data_all.set_index(nlp_data_clean['bf_bugid'], inplace=True)
    print("start make cluster model")
    model_clustering.clustering(train_data_all, nlp_data_clean)
    print("end make cluster model")

    print("start make df_nlp_features")
    # NOTE(review): df_nlp_feature is not used within the visible part of this
    # function - confirm whether the excerpt is truncated.
    df_nlp_feature = model_clustering.get_nlp_features()
    print("end make df_nlp_features")

예제 #3

0

파일 보기

LEARNING_RATE = 0.1  # learning rate (its consumer is outside this excerpt)
"""
Directories
"""
TRAIN_DIR = "dataset/DATA/TRAIN"
VALID_DIR = "dataset/DATA/DEV"
TMP_DIR = "/tmp/"
ANALOGIES_FILE = "dataset/eval/questions-words.txt"
"""
Read data and write them to disk, only run for a first time. Comment the line below if you want to load them instead.
"""
# NOTE: as written this call runs on every execution of the script.
# DOMAIN_LIMIT and WINDOW_SIZE are defined outside this excerpt.
dp.write_dataset(domain_limit=DOMAIN_LIMIT, skip_window=WINDOW_SIZE)
"""
Load data from disk.
"""
# load_dataset() returns five objects, unpacked here.
data, counter, W2I, I2W, unigram_table = dp.load_dataset()
VOCABULARY_SIZE = len(W2I)  # number of entries in the loaded W2I mapping
valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
# Draw valid_size distinct integers from range(valid_window).
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
questions = dp.read_analogies(ANALOGIES_FILE, W2I)
print("Load data done")
print("Data size: %d samples with %d unique words" % (data.shape[0], len(W2I)))
"""
Skip gram model
Below is a long sequence of graph nodes for the skip gram model. The flow is basically from the (inputs, labels)
to the positive entropy loss (posEnt) and negative entropy loss (negEnt). To calculate posEnt, one just needs to
calculate posLog = inputs*pos_outputs and then forward it to sigmoid cross entropy with positive labels. 
Similarly, to negEnt, one first calculates negLog = inputs*neg_outputs ==> sigmoid cross entropy with 
negative labels. While pos_outputs are lookups of vector 'labels' on embedding tables, neg_outputs are lookups of
vector 'sampled_indices' which have to be determined by a candidate sampling function.

예제 #4

0

파일 보기

# Dataset files are located under DATASET_DIR and named after DATASET_ID
# (both defined outside this excerpt).
DICTIONARY_PATH = os.path.join(DATASET_DIR, DATASET_ID + '_words.ls')
DATASET_PATH = os.path.join(DATASET_DIR, DATASET_ID + '_dataset_1.ls')
# NOTE(review): machine-specific absolute Windows path - confirm before
# running elsewhere.
TMP_DIR = r'C:\Users\Sergio\Documents\WIR\tmp'

# When resuming training, WEIGHTS_BASE_PATH is the common prefix of the saved
# weight files; set it to None to start without saved weights.
WEIGHTS_BASE_PATH = r'C:\Users\Sergio\Documents\WIR\tmp\tweets_all_978d3bfaa50a4e028d4c740ba39578df\978d3bfaa50a4e028d4c740ba39578df'
WEIGHTS_PRESENT = WEIGHTS_BASE_PATH is not None
# Derive the per-file paths only when a base path exists: concatenating a
# suffix onto None would raise TypeError as soon as the module is loaded.
WORD_EMBEDDINGS_PATH = (
    WEIGHTS_BASE_PATH + '_word-embeddings.ls' if WEIGHTS_PRESENT else None)
CONTEXT_EMBEDDINGS_PATH = (
    WEIGHTS_BASE_PATH + '_context-embeddings.ls' if WEIGHTS_PRESENT else None)
BIASES_PATH = WEIGHTS_BASE_PATH + '_biases.ls' if WEIGHTS_PRESENT else None

# dataset loading
# load_dictionary() returns three objects; only `dictionary` is used in the
# visible lines. load_dictionary, load_dataset and SAMPLE_SIZE are defined
# outside this excerpt.
dictionary, direct_map, inverse_map = load_dictionary(DICTIONARY_PATH)
DICTIONARY_SIZE = len(dictionary)   # size of the loaded dictionary
print('>> {} loaded words'.format(DICTIONARY_SIZE))
dataset = load_dataset(DATASET_PATH, SAMPLE_SIZE)
print('>> {} loaded sentences'.format(len(dataset)))

# model definition
graph = tf.Graph()
with graph.as_default():

    # input tensors
    with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=[BATCH_SIZE])
        train_labels = tf.placeholder(tf.int32, shape=[BATCH_SIZE, NUM_TRUE]) # outputs for every input word

    # variable tensors
    with tf.name_scope('variables'):
        if not WEIGHTS_PRESENT:
            word_embeddings = tf.Variable(tf.random_uniform([DICTIONARY_SIZE, EMBEDDING_SIZE], -1.0, 1.0), name='word_embeddings')

예제 #5

0

파일 보기

파일: role_classifier.py 프로젝트: leleea7/NLP

### HYPERPARAMETERS ###

BATCH_SIZE = 10  # Number of sentences per batch
HIDDEN_SIZE = 128
LEARNING_RATE = 0.001
EPOCHS = 1
PRINT_RESULTS = True
GENERATE_ANSWER = False
# The file name is appended to TMP_DIR with no separator, so TMP_DIR (defined
# outside this excerpt) has to end with a path separator.
GLOBAL_EPOCH = dp.global_epoch(TMP_DIR + 'epoch.txt')

### LOAD GLOVE EMBEDDINGS AND TRAIN SET ###

embeddings, word2index = dp.load_embeddings('WSD/Data/glove.6B.100d.txt')
sentences = dp.get_sentences(DATA_DIR + 'CoNLL2009-ST-English-train.txt')
training_data, role2index, pos2index = dp.load_dataset(sentences, word2index)
# The raw sentences are not referenced again in the visible lines; the name is
# dropped here, presumably to release memory.
del sentences
#print('ROLES DICTIONARY SIZE:', len(role2index))
#print('POS DICTIONARY SIZE', len(pos2index))

### MODEL ###

graph = tf.Graph()

with graph.as_default():

    with tf.name_scope('input'):
        # shape = (batch_size, max length of sentence in batch)
        word_ids = tf.placeholder(tf.int32, shape=[None, None])

        # shape = (batch_size, max length of sentence in batch)

예제 #6

0

파일 보기

    def train(self, epochs=1):
        """Train the network, evaluate periodically and checkpoint the session.

        Runs ``self.__ITERATIONS * epochs`` training steps on batches drawn
        with ``dp.get_batch``. Every 500 steps (and at step
        ``self.__ITERATIONS - 1``) the model is scored on one-shot tests drawn
        from the test set; the mean loss so far and the accuracy are printed
        and appended to ``log.txt`` in the temp directory. After the loop the
        session is saved to ``model.ckpt``, the global iteration counter is
        updated and the accuracy/loss plots are regenerated. The summary
        writer is closed even if training raises.

        Args:
            epochs: Multiplier applied to ``self.__ITERATIONS`` to obtain the
                total number of training steps.
        """
        # Load the data lazily: only when either split is still missing.
        if not (self.__train_set and self.__test_set):
            self.__train_set, self.__test_set = dp.load_dataset(
                self.__TMP_DIR, self.__DATA_DIR)

        # Open a writer to write summaries.
        self.__writer = tf.summary.FileWriter(self.__TMP_DIR,
                                              self.__session.graph)

        try:
            # Running sum of the per-step losses; divided by the number of
            # steps taken whenever a mean is reported.
            total_loss = 0

            # Stays at -1 when the loop body never runs (epochs == 0), so the
            # bookkeeping after the loop does not hit an unbound name.
            step = -1

            for step in tqdm.tqdm(range(self.__ITERATIONS * epochs),
                                  desc='Training Siamese Network'):
                batch, label = dp.get_batch(self.__train_set,
                                            self.__BATCH_SIZE)

                pair_1 = np.array([b[0] for b in batch])
                pair_2 = np.array([b[1] for b in batch])

                # Define metadata variable.
                run_metadata = tf.RunMetadata()

                _, loss_value = self.__session.run(
                    [self.__train_op, self.__loss],
                    feed_dict={
                        self.__img_1: pair_1,
                        self.__img_2: pair_2,
                        self.__flags: label
                    },
                    run_metadata=run_metadata)

                total_loss += loss_value

                # Print loss and accuracy on the test set every 500 steps.
                # NOTE(review): the second condition compares against
                # __ITERATIONS - 1, not the last step of the whole run, so
                # with epochs > 1 it fires at the end of the first pass -
                # confirm this is intended.
                if (step % 500 == 0 and step > 0) or (
                        step == (self.__ITERATIONS - 1)):
                    correct = 0
                    k = len(self.__test_set)
                    for _ in range(k):
                        test, _ = dp.get_one_shot_test(self.__test_set)
                        pair_1 = np.array([b[0] for b in test])
                        pair_2 = np.array([b[1] for b in test])

                        run_metadata = tf.RunMetadata()

                        pred = self.__session.run(
                            self.__prediction,
                            feed_dict={
                                self.__img_1: pair_1,
                                self.__img_2: pair_2
                            },
                            run_metadata=run_metadata)
                        # A test counts as correct when the first prediction
                        # is 0.
                        if pred[0] == 0:
                            correct += 1

                    # step is zero-based, so step + 1 losses have been summed.
                    # Dividing by step alone skewed the mean and raised
                    # ZeroDivisionError when this branch ran at step 0.
                    mean_loss = total_loss / (step + 1)
                    accuracy = correct / k

                    print('Loss:', str(mean_loss), '\tAccuracy:', accuracy)

                    with open(self.__TMP_DIR + '/log.txt', 'a',
                              encoding='utf8') as f:
                        f.write(str(accuracy) + ' ' + str(mean_loss) + '\n')

                if step == (self.__ITERATIONS - 1):
                    # run_metadata is whichever object was created last: the
                    # final evaluation run when the test set is non-empty,
                    # otherwise this step's training run.
                    self.__writer.add_run_metadata(
                        run_metadata,
                        'step%d' % step,
                        global_step=self.__GLOBAL_ITER + step + 1)

            self.__saver.save(self.__session,
                              os.path.join(self.__TMP_DIR, 'model.ckpt'))
            dp.global_iteration(self.__TMP_DIR + '/iteration.txt',
                                update=self.__GLOBAL_ITER + step + 1)

            pg.generate_accuracy_plot(self.__TMP_DIR + '/')
            pg.generate_loss_plot(self.__TMP_DIR + '/')
        finally:
            # Release the event file handle even when training fails.
            self.__writer.close()