Example #1
    def train_single_sent_id(self,
                             sentences,
                             iteration,
                             work=None,
                             neu1=None,
                             sent_vec=None,
                             cat_vec=None):
        if work is None:
            work = matutils.zeros_aligned(self.layer1_size + 8, dtype=REAL)
        if neu1 is None:
            neu1 = matutils.zeros_aligned(self.layer1_size + 8, dtype=REAL)
        sent_grad = self.init_grad_weight(1)
        cat_grad = self.init_grad_weight(1)

        if sent_vec is None:
            sent_vec = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
            if self.init_adjust:
                denom = sqrt(self.layer1_size)
            else:
                denom = self.layer1_size
            sent_vec[:] = (random.rand(self.layer1_size).astype(REAL) -
                           0.5) / denom
        if cat_vec is None:
            cat_vec = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
            self.cat_learn = 0

        for i in range(iteration):
            alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * i / iteration)
                        ) if self.update_mode == 0 else self.alpha
            for sentence in sentences:
                sampled = [self.vocab.get(word, None) for word in sentence]
                train_cat_vec(self, sent_vec, cat_vec, sampled, alpha, work,
                              neu1, sent_grad, cat_grad)
        return sent_vec, cat_vec
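
A quick usage sketch, not taken from the original source: it assumes `model` is an already-trained instance of the class above, and infers a joint sentence/category vector for an unseen tokenized sentence.

    # hypothetical: `model` is a trained instance exposing train_single_sent_id
    sentences = [["an", "unseen", "tokenized", "sentence"]]
    sent_vec, cat_vec = model.train_single_sent_id(sentences, iteration=20)
    print(sent_vec.shape, cat_vec.shape)  # both (model.layer1_size,)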
Example #2
 def reset_weights(self):
     """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary."""
     random.seed(self.seed)
     self.syn0 = matutils.zeros_aligned((len(self.vocab), self.layer1_size), dtype=REAL)
     self.syn1 = matutils.zeros_aligned((len(self.vocab), self.layer1_size), dtype=REAL)
     self.syn0 += (random.rand(len(self.vocab), self.layer1_size) - 0.5) / self.layer1_size
     self.syn0norm = None
Example #3
        def worker_train():
            """Train the model, lifting lists of sentences from the jobs queue."""
            work = matutils.zeros_aligned(
                self.layer1_size + 8,
                dtype=REAL)  # each thread must have its own work memory
            neu1 = matutils.zeros_aligned(self.layer1_size + 8, dtype=REAL)

            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break
                # update the learning rate before every job
                if self.update_mode == 0:
                    alpha = max(
                        self.min_alpha,
                        self.alpha * (1 - 1.0 * word_count[0] / total_words))
                else:
                    alpha = self.alpha
                job_words = sum(
                    train_sent_vec(self, self.sents[sent_no], sentence, alpha,
                                   work, neu1, self.sents_grad[sent_no])
                    for sent_no, sentence in job)
                with lock:
                    word_count[0] += job_words
                    sent_count[0] += chunksize
                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        logger.info(
                            "PROGRESS: at %.2f%% sents, alpha %.05f, %.0f words/s"
                            % (100.0 * sent_count[0] / total_sents, alpha,
                               word_count[0] / elapsed if elapsed else 0.0))
                        next_report[0] = elapsed + 1.0  # don't flood the log; wait at least a second between progress reports
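
The worker above consumes `(sent_no, sentence)` pairs from a shared `jobs` queue; the producer side is outside this snippet. A minimal sketch of that plumbing, with every name (`workers`, `chunksize`, the `grouper` chunking helper) assumed rather than taken from the source:

    import threading
    from queue import Queue  # Queue.Queue on Python 2

    jobs = Queue(maxsize=2 * workers)  # bounded, so the producer cannot race far ahead
    threads = [threading.Thread(target=worker_train) for _ in range(workers)]
    for t in threads:
        t.daemon = True
        t.start()
    for job in grouper(enumerate(sentences), chunksize):  # assumed helper yielding lists of (sent_no, sentence) pairs
        jobs.put(job)
    for _ in range(workers):
        jobs.put(None)  # one sentinel per worker so every loop exits
    for t in threads:
        t.join()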
Example #4
    def build_vec(self, sentences, has_vocab=False):
        """Collect the vocabulary (unless one was already built) and reset the sentence/category vectors to an initial (untrained) state."""
        if not has_vocab:
            logger.info("building vocabulary and resetting vectors")
        else:
            logger.info("resetting vectors")
        random.seed(self.seed)
        sentence_no, vocab = -1, {}
        total_words = 0
        self.sents_len = 0 #the num of sentence ids
        self.total_sents = 0 #the num of sentences
        self.cat_len = 0 #the num of category ids
        sent_cat_hash = {} #hash table for sent_no and cat_no
        for sentence_no, sent_tuple in enumerate(sentences):
            if sentence_no % 10000 == 0:
                logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                            (sentence_no, total_words, len(vocab)))
            sentence = sent_tuple[0]
            for word in sentence:
                total_words += 1
                if word in vocab:
                    vocab[word].count += 1
                else:
                    vocab[word] = Vocab(count=1)
            sent_id = sent_tuple[1]
            cat_id = sent_tuple[2]
            self.total_sents += 1
            if cat_id not in self.cat_no_hash:
                self.cat_no_hash[cat_id] = self.cat_len
                self.cat_id_list.append(cat_id)
                self.cat_len += 1
            if sent_id not in self.sent_no_hash:
                self.sent_no_hash[sent_id] = self.sents_len
                self.sent_id_list.append(sent_id)
                self.sents_len += 1
            sent_cat = str(self.sent_no_hash[sent_id])+" "+str(self.cat_no_hash[cat_id])
            sent_cat_hash.setdefault(sent_cat,0)
            sent_cat_hash[sent_cat] += 1

        logger.info("collected %i word types from a corpus of %i words and %i sentences(ident:%i)  with %i categories" %
                    (len(vocab), total_words, self.total_sents, self.sents_len, self.cat_len))

        self.build_vocab(vocab)
        self.sents = matutils.zeros_aligned((self.sents_len, self.layer1_size), dtype=REAL)
        self.cats = matutils.zeros_aligned((self.cat_len, self.layer1_size), dtype=REAL)
        # randomize weights vector by vector, rather than materializing a huge random matrix in RAM at once
        self.reset_weights()

        # make sent_cat_pair
        self.sent_cat_pair = empty((len(sent_cat_hash),2), dtype=uint32)
        self.pair_len = len(sent_cat_hash)
        for idx, sent_cat in enumerate(sent_cat_hash):
            sent_no, cat_no = sent_cat.split(" ")
            self.sent_cat_pair[idx][0] = uint32(sent_no)
            self.sent_cat_pair[idx][1] = uint32(cat_no)
        #sort by cat_no, sent_no in place
        self.sent_cat_pair.view('u4,u4').sort(order=['f1','f0'], axis=0)
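
For reference, `sentences` here is expected to yield 3-tuples of (token list, sentence id, category id); below is a sketch of a conforming corpus with purely illustrative values (the simpler build_vec in Example #15 consumes 2-tuples without the category id):

    corpus = [
        (["the", "cat", "sat"], "sent_001", "animals"),
        (["stocks", "fell", "today"], "sent_002", "finance"),
    ]
    model.build_vec(corpus)  # `model` assumed constructed elsewhere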
Example #5
 def reset_weights(self):
     """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary."""
     logger.info("resetting layer weights")
     random.seed(self.seed)
     self.syn0 = matutils.zeros_aligned((len(self.vocab), self.layer1_size), dtype=REAL)
     # randomize weights vector by vector, rather than materializing a huge random matrix in RAM at once
     for i in range(len(self.vocab)):
         self.syn0[i] = (random.rand(self.layer1_size) - 0.5) / self.layer1_size
     if self.hs:
         self.syn1 = matutils.zeros_aligned((len(self.vocab), self.layer1_size), dtype=REAL)
     if self.negative:
         self.syn1neg = matutils.zeros_aligned((len(self.vocab), self.layer1_size), dtype=REAL)
     self.syn0norm = None
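
Because the RNG is reseeded with the model's fixed seed before the row-by-row initialization, two consecutive resets produce bit-identical weights; a small check of that property, assuming a built `model`:

    import numpy as np

    model.reset_weights()
    first = model.syn0.copy()
    model.reset_weights()
    assert np.array_equal(first, model.syn0)  # same seed, same initial weights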
Example #6
 def init_pairnorm(self):
     # avoid initializing from multiple threads. NOTE: a lock constructed inside
     # the call is private to that single call and guards nothing; to actually
     # serialize initialization, the lock must be shared (created once at module
     # or instance scope)
     lock = threading.Lock()
     with lock:
         if getattr(self, 'pairnorm', None) is not None:
             return
         self.pairnorm = matutils.zeros_aligned((self.pair_len, self.layer1_size), dtype=REAL)
         init_pairtable(self)
Example #7
 def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5):
     """
     Infer a vector for a given document after bulk training.
 
     Document should be a list of (word) tokens.
     """
     doctag_vectors = empty((1, self.vector_size), dtype=REAL)
     doctag_vectors[0] = self.seeded_vector(' '.join(doc_words))
     doctag_locks = ones(1, dtype=REAL)
     doctag_indexes = [0]
 
     work = zeros(self.layer1_size, dtype=REAL)
     if not self.sg:
         neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
 
     for i in range(steps):
         if self.sg:
             train_document_dbow(self, doc_words, doctag_indexes, alpha, work,
                                 learn_words=False, learn_hidden=False,
                                 doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
         elif self.dm_concat:
             train_document_dm_concat(self, doc_words, doctag_indexes, alpha, work, neu1,
                                      learn_words=False, learn_hidden=False,
                                      doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
         else:
             train_document_dm(self, doc_words, doctag_indexes, alpha, work, neu1,
                               learn_words=False, learn_hidden=False,
                               doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
         alpha = ((alpha - min_alpha) / (steps - i)) + min_alpha
 
     return doctag_vectors[0]
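
A usage sketch with a hypothetical trained `model`; the caller tokenizes, since the method takes a list of word tokens:

    tokens = "machine learning is fun".split()
    vec = model.infer_vector(tokens, alpha=0.1, min_alpha=0.0001, steps=5)
    print(vec.shape)  # (model.vector_size,)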
Example #8
    def train_single_sent_id(self, sentences, iteration, work=None, neu1=None):
        if work is None: work = zeros(self.layer1_size, dtype=REAL)
        if neu1 is None:
            neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
        num_of_grad = 0
        if self.update_mode == 1:
            num_of_grad = self.layer1_size
        elif self.update_mode == 2:
            num_of_grad = 2 * self.layer1_size
        elif self.update_mode == 3:
            num_of_grad = 2 * self.layer1_size + 3
        sent_grad = zeros(num_of_grad, dtype=REAL)

        if self.init_adjust:
            denom = sqrt(self.layer1_size)
        else:
            denom = self.layer1_size

        new_sent = (random.rand(self.layer1_size).astype(REAL) - 0.5) / denom
        for i in range(iteration):
            alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * i / iteration)
                        ) if self.update_mode == 0 else self.alpha
            for sentence in sentences:
                sampled = [self.vocab.get(word, None) for word in sentence]
                train_sent_vec(self, new_sent, sampled, alpha, work, neu1,
                               sent_grad)

        return new_sent
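
Unlike the category-aware variant in Example #1, this overload returns only the sentence vector; a usage sketch under the same assumption of a trained `model`:

    new_sent = model.train_single_sent_id([["another", "unseen", "sentence"]], iteration=20)
    print(new_sent.shape)  # (model.layer1_size,)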
Example #9
 def worker_infer():
     while True:
         job = jobs.get()
         if job is None:
             break
         diff = 0.0
         work = np.zeros(model1.layer1_size, dtype=REAL)
         neu1 = matutils.zeros_aligned(model1.layer1_size, dtype=REAL)
         for sent_tuple in job:
             cat_id_gold = sent_tuple[2]
             sent_vec1 = model1.train_single_sent_id([sent_tuple[0]], 20, work, neu1)
             sims1 = np.empty(model1.sents_len, dtype=REAL)
             nearest_sent_fast(model1, sent_vec1, 0, sims1)
             sent_vec2 = model2.train_single_sent_id([sent_tuple[0]], 20, work, neu1)
             sims2 = np.empty(model2.sents_len, dtype=REAL)
             nearest_sent_fast(model2, sent_vec2, 0, sims2)
             sims1 += sims2
             neighbors = np.argsort(sims1)[::-1]
             cat_ids = {}
             nearest = []
             ident_cat = True
             for top_cand in neighbors:
                 sent_id = model1.sent_id_list[top_cand]
                 cat_id = sent_cat[sent_id]
                 if not ident_cat or cat_id not in cat_ids:
                     cat_ids[cat_id] = 1
                     nearest.append(cat_id)
                     if len(nearest) == topK:
                         break
             diff += 1.0 if cat_id_gold in nearest else 0.0
             print(nearest, cat_id_gold)
             confusion_mtx.setdefault(cat_id_gold, {})
             confusion_mtx[cat_id_gold].setdefault(nearest[0], 0)
             confusion_mtx[cat_id_gold][nearest[0]] += 1
         qout.put(diff)
Example #10
        def worker_train():
            """Train the model, lifting lists of sentences from the jobs queue."""
            work = matutils.zeros_aligned(
                self.layer1_size,
                dtype=REAL)  # each thread must have its own work memory

            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break
                # update the learning rate before every job
                alpha = max(
                    self.min_alpha,
                    self.alpha * (1 - 1.0 * word_count[0] / total_words))
                # how many words did we train on? out-of-vocabulary (unknown) words do not count
                job_words = sum(
                    train_sentence(self, sentence, alpha, work)
                    for sentence in job)
                with lock:
                    word_count[0] += job_words
                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        logger.info(
                            "PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s"
                            % (100.0 * word_count[0] / total_words, alpha,
                               word_count[0] / elapsed if elapsed else 0.0))
                        next_report[0] = elapsed + 1.0  # don't flood the log; wait at least a second between progress reports
Example #11
 def worker_infer():
     while True:
         job = jobs.get()
         if job is None:
             break
         diff = 0.
         work = matutils.zeros_aligned(model.layer1_size + 8, dtype=REAL)
         neu1 = matutils.zeros_aligned(model.layer1_size + 8, dtype=REAL)
         for sent_tuple in job:
             cat_id = sent_tuple[2]
             ret = model.infer([sent_tuple[0]], iteration=20, k=topK, work=work, neu1=neu1)
             diff += 1. if cat_id in ret[2] else 0.
             print(ret[2], cat_id)
             confusion_mtx.setdefault(cat_id, {})
             confusion_mtx[cat_id].setdefault(ret[2][0], 0)
             confusion_mtx[cat_id][ret[2][0]] += 1
         qout.put(diff)
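
A sketch of the harness around this worker, with every name (`workers`, `test_data`, `topK`, the `chunks` batching helper) assumed: feed batches of test tuples into `jobs`, collect per-batch hit counts from `qout`, and report top-K accuracy.

    import threading
    from queue import Queue

    jobs, qout = Queue(), Queue()
    threads = [threading.Thread(target=worker_infer) for _ in range(workers)]
    for t in threads:
        t.start()
    njobs = 0
    for batch in chunks(test_data, 64):  # assumed helper splitting a list into batches
        jobs.put(batch)
        njobs += 1
    for _ in range(workers):
        jobs.put(None)  # sentinel per worker
    hits = sum(qout.get() for _ in range(njobs))
    print("top-%d accuracy: %.4f" % (topK, hits / len(test_data)))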
Example #12
 def init_grad_weight(self, length):
     grad_size = 0
     if self.update_mode == 1:
         grad_size = self.layer1_size
     elif self.update_mode == 2:
         grad_size = 2 * self.layer1_size
     elif self.update_mode == 3:
         grad_size = 2 * self.layer1_size + 3
     grad = matutils.zeros_aligned((length, grad_size), dtype=REAL)
     if self.update_mode == 3:
         # the trailing three columns hold per-row decay state for the Adam-style
         # update, initialized to the beta coefficients (the first two both start
         # at ADAM_BETA1 in the original source)
         grad[:, grad_size - 3] = ADAM_BETA1
         grad[:, grad_size - 2] = ADAM_BETA1
         grad[:, grad_size - 1] = ADAM_BETA2
     return grad
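
The sizes suggest one accumulator vector per row for update_mode 1 (AdaGrad-style), two for mode 2, and two plus three scalar decay slots for mode 3 (Adam-style); that reading is inferred from the shapes, not documented in the source. A shape check, assuming a built `model`:

    model.update_mode = 3
    grad = model.init_grad_weight(5)
    print(grad.shape)    # (5, 2 * model.layer1_size + 3)
    print(grad[0, -3:])  # [ADAM_BETA1, ADAM_BETA1, ADAM_BETA2]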
Example #13
        def worker_train():
            """Train the model, lifting lists of sentences from the jobs queue."""
            work = matutils.zeros_aligned(self.layer1_size + 8, dtype=REAL)  # each thread must have its own work memory
            neu1 = matutils.zeros_aligned(self.layer1_size + 8, dtype=REAL)

            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break
                # update the learning rate before every job
                if self.update_mode == 0:
                    alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words))
                else:
                    alpha = self.alpha
                job_words = train_from_job(self, job, alpha, work, neu1)
                with lock:
                    word_count[0] += job_words
                    sent_count[0] += chunksize
                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        logger.info("PROGRESS: at %.2f%% sents, alpha %.05f, %.0f words/s" %
                                    (100.0 * sent_count[0] / total_sents, alpha, word_count[0] / elapsed if elapsed else 0.0))
                        next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports
Example #14
 def worker_infer():
     while True:
         job = jobs.get()
         if job is None:
             break
         diff = 0.
         work = matutils.zeros_aligned(model1.layer1_size + 8, dtype=REAL)
         neu1 = matutils.zeros_aligned(model1.layer1_size + 8, dtype=REAL)
         for sent_tuple in job:
             cat_id_gold = sent_tuple[2]
             sent_vec1, cat_vec1 = model1.train_single_sent_id(
                 [sent_tuple[0]], 20, work, neu1)
             sims1 = np.empty(model1.pair_len, dtype=REAL)
             catsentvec_sim_sum(model1, sent_vec1, cat_vec1, sims1)
             sent_vec2, cat_vec2 = model2.train_single_sent_id(
                 [sent_tuple[0]], 20, work, neu1)
             sims2 = np.empty(model2.pair_len, dtype=REAL)
             catsentvec_sim_sum(model2, sent_vec2, cat_vec2, sims2)
             sims1 += sims2
             #joint_catsentvec_sim_sum(pairtable, sent_vec1, cat_vec1, sent_vec2, cat_vec2, sims1)
             neighbors = np.argsort(sims1)[::-1]
             cat_ids = {}
             nearest = []
             ident_cat = True
             for top_cand in neighbors:
                 (sent_no, cat_no) = model1.sent_cat_pair[top_cand]
                 cat_id = model1.cat_id_list[cat_no]
                 if not ident_cat or cat_id not in cat_ids:
                     cat_ids[cat_id] = 1
                     nearest.append(cat_id)
                     if len(nearest) == topK: break
             diff += 1. if cat_id_gold in nearest else 0.
             print(nearest, cat_id_gold)
             confusion_mtx.setdefault(cat_id_gold, {})
             confusion_mtx[cat_id_gold].setdefault(nearest[0], 0)
             confusion_mtx[cat_id_gold][nearest[0]] += 1
         qout.put(diff)
Example #15
    def build_vec(self, sentences, has_vocab=False):
        """Collect the vocabulary (unless one was already built) and reset the sentence vectors to an initial (untrained) state."""
        if not has_vocab:
            logger.info("building vocabulary and resetting sentence vectors")
        else:
            logger.info("resetting sentence vectors")
        random.seed(self.seed)
        sentence_no, vocab = -1, {}
        total_words = 0
        self.sents_len = 0  #the num of sentence ids
        self.total_sents = 0  #the num of sentences
        for sentence_no, sent_tuple in enumerate(sentences):
            if sentence_no % 10000 == 0:
                logger.info(
                    "PROGRESS: at sentence #%i, processed %i words and %i word types"
                    % (sentence_no, total_words, len(vocab)))
            sentence = sent_tuple[0]
            for word in sentence:
                total_words += 1
                if word in vocab:
                    vocab[word].count += 1
                else:
                    vocab[word] = Vocab(count=1)
            sent_id = sent_tuple[1]
            self.total_sents += 1
            if sent_id not in self.sent_no_hash:
                self.sent_no_hash[sent_id] = self.sents_len
                self.sent_id_list.append(sent_id)
                self.sents_len += 1

        logger.info(
            "collected %i word types from a corpus of %i words and %i sentences (ident: %i)"
            % (len(vocab), total_words, self.total_sents, self.sents_len))

        self.build_vocab(vocab)
        self.sents = matutils.zeros_aligned((self.sents_len, self.layer1_size),
                                            dtype=REAL)
        # randomize weights vector by vector, rather than materializing a huge random matrix in RAM at once
        self.reset_weights()