def loss(self, model, paths, total_paths, alpha=1.0):
    """
    Compute the O2 loss over a stream of random-walk paths.
    :param model: model containing the shared data
    :param paths: generator of the paths
    :param total_paths: total number of paths, used for progress reporting
    :param alpha: trade-off parameter
    :return: loss value
    """
    start, next_report, loss = time.time(), 5.0, 0.0
    num_nodes = 0

    for job_no, job in enumerate(
            chunkize_serial(prepare_sentences(model, paths), 250)):
        batch_loss = np.zeros(1, dtype=np.float32)
        batch_work = np.zeros(model.layer1_size, dtype=np.float32)

        batch_node = sum([
            loss_o2(model.node_embedding, model.context_embedding, path,
                    self.negative, self.window_size, model.table, alpha,
                    model.layer1_size, batch_work, py_loss=batch_loss)
            for path in job if path is not None
        ])
        num_nodes += batch_node
        loss += batch_loss[0]

        elapsed = time.time() - start
        if elapsed >= next_report:
            log.debug("PROGRESS: at %.2f%% paths, %.0f paths/s" %
                      (100.0 * num_nodes / total_paths,
                       num_nodes / elapsed if elapsed else 0.0))
            # don't flood the log: wait at least a second between progress reports
            next_report = elapsed + 1.0

    log.info("nodes computed: %d" % num_nodes)
    log.info("O2 loss: %f" % loss)
    return loss
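# loss_o2 above accumulates its result into the one-element float32 array
# passed as py_loss rather than returning it -- an out-parameter convention,
# presumably because it is a compiled (Cython-style) kernel. A minimal
# pure-Python sketch of that pattern; accumulate_loss is a hypothetical
# stand-in for the compiled routine:
import numpy as np

def accumulate_loss(py_loss, value):
    py_loss[0] += value  # mutate the buffer in place, visible to the caller

_batch_loss = np.zeros(1, dtype=np.float32)
accumulate_loss(_batch_loss, 0.5)
print(_batch_loss[0])  # 0.5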
def loss(self, model, edges):
    """
    Compute the O1 loss over a set of edges: -log sigmoid(u_j . u_i)
    summed over every edge (i, j).
    """
    ret_loss = 0
    for edge in prepare_sentences(model, edges):
        assert len(edge) == 2, "each edge must consist of exactly 2 nodes: {}".format(edge)
        edge_loss = np.log(
            sigmoid(np.dot(model.node_embedding[edge[1].index],
                           model.node_embedding[edge[0].index].T)))
        assert edge_loss <= 0, "malformed loss"
        ret_loss -= edge_loss
    return ret_loss
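# The per-edge quantity above is -log sigmoid(u_j . u_i). A self-contained
# numeric check with toy 4-dimensional embeddings (values are illustrative
# only, and _sigmoid is a local helper, not the module's import):
import numpy as np

def _sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

_node_embedding = np.array([[0.1, -0.2, 0.3, 0.05],
                            [0.2, 0.1, -0.1, 0.4]], dtype=np.float32)

# contribution of the edge (0, 1); always >= 0 since sigmoid(.) <= 1
_edge_loss = -np.log(_sigmoid(np.dot(_node_embedding[1], _node_embedding[0])))
print(_edge_loss)  # ~0.70 for these toy values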
def learn_second(network, lr, model, examples_files, total_example,
                 alpha=1.0, batch_size=20):
    """
    Helper function used to optimize O2
    :param network: network model to optimize
    :param lr: learning rate
    :param model: model containing the shared data
    :param examples_files: list of files containing the examples
    :param total_example: total number of examples used for training
    :param alpha: trade-off parameter
    :param batch_size: size of the batch
    :return: loss value
    """
    num_batch = 0
    log.info("compute o2")
    optimizer = SGD(network.parameters(), lr)
    log.debug("read example file: {}".format("\t".join(examples_files)))
    loss_val = 0
    if alpha <= 0:
        return loss_val

    for batch in emb_utils.batch_generator(
            emb_utils.prepare_sentences(
                model,
                graph_utils.combine_example_files_iter(examples_files),
                network.transfer_fn(model.vocab)),
            batch_size,
            long_tensor=LongTensor):
        input, output = batch
        loss = alpha * network.forward(
            input, output, negative_sampling_fn=model.negative_sample)
        loss_val += loss.data[0]
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        num_batch += 1
        if num_batch % 10000 == 0:
            # fraction of the expected number of batches processed so far
            log.info("O2 batches completed: {}".format(
                num_batch / (total_example / batch_size)))

    log.debug("O2 loss: {}".format(loss_val))
    return loss_val
def learn_community(network, lr, model, nodes, beta=1.0, batch_size=20):
    """
    Helper function used to optimize O3
    :param network: model to optimize
    :param lr: learning rate
    :param model: model containing the shared data
    :param nodes: nodes on which to execute the learning
    :param beta: trade-off parameter
    :param batch_size: size of the batch
    :return: loss value
    """
    num_batch = 0
    log.info("compute o3")
    optimizer = SGD(network.parameters(), lr)
    loss_val = 0
    if beta <= 0.:
        return loss_val

    for batch in emb_utils.batch_generator(
            emb_utils.prepare_sentences(model, nodes, network.transfer_fn()),
            batch_size,
            long_tensor=LongTensor):
        input, output = batch
        loss = network.forward(input, model)
        loss.data *= (beta / model.k)  # scale by the trade-off over the number of communities
        loss_val += loss.data[0]
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        num_batch += 1
        if num_batch % 10000 == 0:
            # no total is available here, so report the raw batch count
            log.info("O3 batches completed: {}".format(num_batch))

    log.debug("O3 loss: {}".format(loss_val))
    return loss_val
def learn_first(network, lr, model, edges, num_iter=1, batch_size=20):
    """
    Helper function used to optimize O1
    :param network: neural network to train
    :param lr: learning rate
    :param model: model containing the shared data
    :param edges: numpy array of edges used for training
    :param num_iter: number of iterations over the edges
    :param batch_size: size of the batch
    :return: loss value
    """
    log.info("computing o1")
    optimizer = SGD(network.parameters(), lr)
    num_batch = 0
    total_batch = (edges.shape[0] * num_iter) / batch_size
    loss_val = 0

    for batch in emb_utils.batch_generator(
            emb_utils.prepare_sentences(model, edges,
                                        network.transfer_fn(model.vocab)),
            batch_size,
            long_tensor=LongTensor):
        input, output = batch
        loss = network.forward(input, output,
                               negative_sampling_fn=model.negative_sample)
        loss_val += loss.data[0]
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        num_batch += 1
        if num_batch % 10000 == 0:
            log.info("O1 batches completed: {}".format(num_batch / total_batch))

    log.debug("O1 loss: {}".format(loss_val))
    return loss_val
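# A sketch of how the three helpers above might be wired into one round of
# joint optimization. The network objects, argument values, and the
# joint_round name itself are placeholders for illustration, not the
# repository's actual driver code:
def joint_round(o1_network, o2_network, o3_network, model, edges,
                examples_files, total_examples, nodes):
    o1 = learn_first(o1_network, lr=0.025, model=model,
                     edges=edges, num_iter=1, batch_size=20)
    o2 = learn_second(o2_network, lr=0.025, model=model,
                      examples_files=examples_files,
                      total_example=total_examples, alpha=1.0, batch_size=20)
    o3 = learn_community(o3_network, lr=0.025, model=model,
                         nodes=nodes, beta=0.1, batch_size=20)
    log.info("round loss: O1={} O2={} O3={}".format(o1, o2, o3))
    return o1, o2, o3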
def learn_second(network, lr, model, examples_files, alpha=1.0):
    """
    Helper function used to optimize O2
    :param network: network model to optimize
    :param lr: learning rate
    :param model: deprecated_model used to compute the batches and the negative sampling
    :param examples_files: list of files containing the examples
    :param alpha: trade-off parameter
    :return:
    """
    log.info("compute o2")
    optimizer = SGD(network.parameters(), lr)
    log.debug("read example file: {}".format("\t".join(examples_files)))

    for batch in emb_utils.batch_generator(
            emb_utils.prepare_sentences(
                model,
                graph_utils.combine_example_files_iter(examples_files),
                network.transfer_fn(model.vocab)),
            20):
        input, output = batch
        loss = alpha * network.forward(
            input, output, negative_sampling_fn=model.negative_sample)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
def learn_first(network, lr, model, edges, num_iter=1):
    """
    Helper function used to optimize O1
    :param network: neural network to train
    :param lr: learning rate
    :param model: deprecated_model used to compute the batches and the negative sampling
    :param edges: numpy array of edges used for training
    :param num_iter: number of iterations over the edges
    :return:
    """
    log.info("computing o1")
    optimizer = SGD(network.parameters(), lr)

    for batch in emb_utils.batch_generator(
            emb_utils.prepare_sentences(
                model,
                emb_utils.RepeatCorpusNTimes(edges, n=num_iter),
                network.transfer_fn(model.vocab)),
            20):
        input, output = batch
        loss = network.forward(input, output,
                               negative_sampling_fn=model.negative_sample)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
def train(self, model, edges, chunksize=150, iter=1):
    """
    Update the model's neural weights from a sequence of edges (can be a once-only generator stream).
    """
    assert model.node_embedding.dtype == np.float32

    log.info("O1 training model with %i workers on %i vocabulary and %i features and 'negative sampling'=%s" %
             (self.workers, len(model.vocab), model.layer1_size, self.negative))

    if not model.vocab:
        raise RuntimeError("you must first build vocabulary before training the model")

    edges = RepeatCorpusNTimes(edges, iter)
    total_node = edges.corpus.shape[0] * edges.corpus.shape[1] * edges.n
    log.debug('total edges: %d' % total_node)
    start, next_report, word_count = time.time(), [5.0], [0]

    # buffer ahead only a limited number of jobs; this is the reason we can't simply use ThreadPool :(
    jobs = Queue(maxsize=2 * self.workers)
    lock = threading.Lock()

    def worker_train():
        """Train the model, lifting lists of edges from the jobs queue."""
        while True:
            job = jobs.get(block=True)
            if job is None:  # data finished, exit
                jobs.task_done()
                break

            py_work = np.zeros(model.layer1_size, dtype=np.float32)
            # execute the sgd
            job_words = sum(train_o1(model.node_embedding, edge, self.lr, self.negative, model.table,
                                     py_size=model.layer1_size, py_work=py_work)
                            for edge in job if edge is not None)
            jobs.task_done()
            with lock:
                word_count[0] += job_words
                elapsed = time.time() - start
                if elapsed >= next_report[0]:
                    log.info("PROGRESS: at %.2f%% words\tword_computed %d\tlr %.05f\t %.0f words/s" %
                             (100.0 * word_count[0] / total_node, word_count[0], self.lr,
                              word_count[0] / elapsed if elapsed else 0.0))
                    # don't flood the log: wait at least 5 seconds between progress reports
                    next_report[0] = elapsed + 5.0

    workers = [threading.Thread(target=worker_train, name='thread_' + str(i)) for i in range(self.workers)]
    for thread in workers:
        thread.daemon = True  # make interrupting the process with ctrl+c easier
        thread.start()

    # convert input strings to Vocab objects (eliding OOV/downsampled words), and start filling the jobs queue
    for job_no, job in enumerate(chunkize_serial(prepare_sentences(model, edges), chunksize)):
        jobs.put(job)

    for _ in range(self.workers):
        jobs.put(None)  # give the workers a heads up that they can finish -- no more work!

    for thread in workers:
        thread.join()

    elapsed = time.time() - start
    log.info("training on %i words took %.1fs, %.0f words/s" %
             (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0))
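# Minimal standalone sketch of the producer/consumer pattern used by train()
# above: a bounded Queue feeds daemon worker threads, and one None sentinel
# per worker signals shutdown. run_jobs and process_job are hypothetical
# names, not part of this repository:
def run_jobs(jobs_iter, num_workers, process_job):
    job_queue = Queue(maxsize=2 * num_workers)  # bound the buffer so the producer can't race ahead

    def worker():
        while True:
            job = job_queue.get()
            if job is None:  # sentinel: no more work
                break
            process_job(job)

    threads = [threading.Thread(target=worker, daemon=True) for _ in range(num_workers)]
    for t in threads:
        t.start()
    for job in jobs_iter:
        job_queue.put(job)  # blocks while the queue is full
    for _ in range(num_workers):
        job_queue.put(None)  # one sentinel per worker
    for t in threads:
        t.join()

# usage: run_jobs(range(100), 4, print)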
def train(self, model, paths, total_nodes, alpha=1.0, node_count=0, chunksize=150):
    """
    Update the model's neural weights from a sequence of paths (can be a once-only generator stream).
    :param model: model containing the shared data
    :param paths: generator of the paths
    :param total_nodes: total number of nodes in the paths
    :param alpha: trade-off parameter
    :param node_count: initial count of nodes already trained
    :param chunksize: size of the batch
    :return:
    """
    assert model.node_embedding.dtype == np.float32
    assert model.context_embedding.dtype == np.float32
    log.info("O3 CONTEXT training model with %i workers on %i vocabulary and %i features, "
             "using\t'negative sampling'=%s\t'windows'=%s" %
             (self.workers, len(model.vocab), model.layer1_size, self.negative, self.window_size))

    if alpha <= 0.:
        return

    if not model.vocab:
        raise RuntimeError("you must first build vocabulary before training the model")

    start, next_report = time.time(), [1.0]
    if total_nodes is None:
        raise AttributeError('need the total number of nodes')

    node_count = [0]

    # buffer ahead only a limited number of jobs; this is the reason we can't simply use ThreadPool :(
    jobs = Queue(maxsize=2 * self.workers)
    # protects the shared state (number of nodes trained so far, log reports...)
    lock = threading.Lock()

    def worker_train():
        """Train the model, lifting lists of paths from the jobs queue."""
        py_work = np.zeros(model.layer1_size, dtype=np.float32)

        while True:
            job = jobs.get()
            if job is None:  # data finished, exit
                break

            # anneal the learning rate linearly with progress, floored at min_lr
            lr = max(self.min_lr, self.lr * (1 - 1.0 * node_count[0] / total_nodes))
            # execute the sgd
            job_nodes = sum(
                train_o2(model.node_embedding, model.context_embedding, path, lr,
                         self.negative, self.window_size, model.table,
                         py_alpha=alpha, py_size=model.layer1_size, py_work=py_work)
                for path in job)
            with lock:
                node_count[0] += job_nodes
                elapsed = time.time() - start
                if elapsed >= next_report[0]:
                    log.info("PROGRESS: at %.2f%% nodes, lr %.05f, %.0f nodes/s" %
                             (100.0 * node_count[0] / total_nodes, lr,
                              node_count[0] / elapsed if elapsed else 0.0))
                    # don't flood the log: wait at least a second between progress reports
                    next_report[0] = elapsed + 1.0

    workers = [threading.Thread(target=worker_train) for _ in range(self.workers)]
    for thread in workers:
        thread.daemon = True  # make interrupting the process with ctrl+c easier
        thread.start()

    # convert input strings to Vocab objects (eliding OOV/downsampled nodes), and start filling the jobs queue
    for job_no, job in enumerate(chunkize_serial(prepare_sentences(model, paths), chunksize)):
        jobs.put(job)

    log.debug("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
    for _ in range(self.workers):
        jobs.put(None)  # give the workers a heads up that they can finish -- no more work!

    for thread in workers:
        thread.join()

    elapsed = time.time() - start
    log.info("training on %i nodes took %.1fs, %.0f nodes/s" %
             (node_count[0], elapsed, node_count[0] / elapsed if elapsed else 0.0))
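# The worker above anneals the learning rate linearly with training progress
# and floors it at min_lr. A tiny sketch of that schedule; linear_lr is a
# hypothetical helper and the numbers are illustrative:
def linear_lr(lr0, min_lr, nodes_done, total_nodes):
    return max(min_lr, lr0 * (1 - 1.0 * nodes_done / total_nodes))

# e.g. linear_lr(0.025, 0.0001, 0, 10000) == 0.025 at the start, and the
# rate decays toward the 0.0001 floor as nodes_done approaches total_nodes.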