Exemplo n.º 1
0
 def dLdv(self, v):
     """Return the gradient of the objective ``L`` at weight vector ``v``.

     The result is ``-(sum_of_features - expected_counts - LAMBDA * v)``,
     where ``expected_counts`` sums, over every matrix in
     ``self.data_alt_features``, the rows of that matrix weighted by the
     softmax of ``mat . v``.

     Args:
         v: weight vector; wrapped with ``sparse_mat`` before use
             (assumed to be a 1-D array that ``sparse_mat`` turns into a
             single-row sparse matrix -- confirm against its definition).

     Returns:
         The gradient, with the same shape the original expression
         produced (``sum_of_features.toarray()`` broadcast against flat
         vectors).

     Side effects:
         Logs and increments ``self.iteration_number``.
     """
     logger.info("run dldv")
     v_s = sparse_mat(v)
     v_col = v_s.transpose()
     expected_terms = []
     for mat in self.data_alt_features:
         scores = mat.dot(v_col).toarray()
         # Subtracting the maximum leaves the softmax unchanged but keeps
         # exp() from overflowing to inf (which would yield NaN weights).
         shift = scores.max() if scores.size else 0.0
         weights = np.exp(scores - shift)
         probs = sparse_mat(weights / np.sum(weights))
         expected_terms.append(probs.transpose().dot(mat))
     exp_count = np.sum(expected_terms).transpose()
     logger.critical("iteration number: {}".format(self.iteration_number))
     self.iteration_number += 1
     empirical = self.sum_of_features.toarray()
     penalty = (LAMBDA * v_s).transpose().toarray()[:, 0]
     return -np.subtract(
         np.subtract(empirical, exp_count.toarray()[:, 0]), penalty)
Exemplo n.º 2
0
 def feature_extractor_all_tags2(self, words, is_cap, is_num, last_t,
                                 last2_t, idx):
     """Build one feature row per entry of ``self.int_to_tag``.

     Each row is ``self.feature_extractor_aux(words, is_cap, is_num,
     last_t, last2_t, tag, idx)`` -- note the enumerated tag is passed in
     the sixth position -- and is stored at the tag's integer id.

     Returns:
         ``sparse_mat`` applied to the list of rows; the list has
         ``len(self.set_of_tags)`` slots.
     """
     rows = [None] * len(self.set_of_tags)
     for slot, candidate in self.int_to_tag.items():
         row = self.feature_extractor_aux(
             words, is_cap, is_num, last_t, last2_t, candidate, idx)
         rows[slot] = row
     return sparse_mat(rows)
Exemplo n.º 3
0
 def feature_extractor_for_tags(self, words, is_cap, is_num, tags, last_t,
                                last2_t, idx):
     """Build a feature matrix with one column per element of ``tags``.

     For every ``t`` in ``tags`` (in iteration order) a row is obtained
     from ``self.feature_extractor_aux(words, is_cap, is_num, last_t,
     last2_t, t, idx)``; the rows are wrapped with ``sparse_mat`` and the
     result is transposed, so column ``j`` belongs to the ``j``-th tag.
     """
     rows = [
         self.feature_extractor_aux(words, is_cap, is_num, last_t,
                                    last2_t, candidate, idx)
         for candidate in tags
     ]
     stacked = sparse_mat(rows)
     return stacked.transpose()
Exemplo n.º 4
0
    def infer(self, all_words):
        """Tag one sentence with a trigram Viterbi search.

        Tokens whose word (``token[0]``) is a key of ``known_tags`` receive
        the tag stored there and are excluded from decoding. The remaining
        tokens are decoded and their tags are written, in order, into the
        slots that are still empty.

        Args:
            all_words: sequence of tokens. Index 0 is read as the word;
                indices 2 and 3 are forwarded as the ``is_cap`` / ``is_num``
                values (presumably capitalisation and numeric flags --
                confirm against the data loader).

        Returns:
            A list with one tag per token of ``all_words``.

        Side effects:
            Overwrites ``self.pi`` (scores) and ``self.bp`` (back-pointers).
        """
        filter_words = [x for x in all_words if x[0] not in known_tags]
        ret_tags = [None] * len(all_words)
        for i, token in enumerate(all_words):
            if token[0] in known_tags:
                ret_tags[i] = known_tags[token[0]]
        num_tags = len(self.tag_to_int)
        self.pi = np.zeros((len(filter_words) + 1, num_tags, num_tags))
        self.bp = np.zeros((len(filter_words) + 1, num_tags, num_tags))
        self.pi[0, self.tag_to_int['*'], self.tag_to_int['*']] = 1
        sentence = [token[0] for token in filter_words]
        is_cap = [token[2] for token in filter_words]
        is_num = [token[3] for token in filter_words]
        if not sentence:
            # Nothing left to decode (empty input or only known-tag tokens);
            # the back-trace below would index an empty list.
            return ret_tags
        v_s = sparse_mat(self.v)
        for k in range(1, len(sentence) + 1):
            # Candidate tags for positions k, k-1 and k-2, each mapped to
            # its index within the candidate list.
            V = {x: i for i, x in enumerate(self.tags_for_word(sentence, k))}
            U = {x: i for i, x in enumerate(self.tags_for_word(sentence, k - 1))}
            T = {x: i for i, x in enumerate(self.tags_for_word(sentence, k - 2))}
            for u in U.keys():
                # One matrix per candidate v; its columns follow T's order.
                feature_tag_mat_per_v = [
                    self.feature_extractor_for_tags(sentence, is_cap, is_num,
                                                    T.keys(), v, u, k - 1)
                    for v in V.keys()]
                # Normaliser for each t: sum over all v of exp(score).
                denominators = []
                for t in T.keys():
                    rows = [feature_tag_mat_per_v[v_idx][:, T[t]]
                            .transpose().toarray()[0, :]
                            for v_idx in V.values()]
                    scores = v_s.dot(sparse_mat(rows).transpose()).toarray()[0, :]
                    denominators.append(np.sum(np.exp(scores)))
                for v in V.keys():
                    numerators = np.exp(
                        v_s.dot((feature_tag_mat_per_v[V[v]])).toarray())[0, :]
                    prob = numerators / denominators
                    calc = [self.pi[k - 1, self.tag_to_int[t], self.tag_to_int[u]] * prob[idx]
                            for t, idx in T.items()]
                    self.pi[k, self.tag_to_int[u], self.tag_to_int[v]] = max(calc)
                    tmp = list(T.keys())[list(T.values()).index(np.argmax(calc))]
                    self.bp[k, self.tag_to_int[u], self.tag_to_int[v]] = self.tag_to_int[tmp]

        n = len(sentence)
        tags = [None] * n
        u, v = np.unravel_index(np.argmax(self.pi[n]), self.pi[n].shape)
        tags[n - 1] = self.int_to_tag[v]
        if n > 1:
            # With a single token, index n-2 is -1 and would overwrite the
            # only real tag with the state of the previous position.
            tags[n - 2] = self.int_to_tag[u]
        for k in range(n - 3, -1, -1):
            tags[k] = self.int_to_tag[self.bp[k + 3, self.tag_to_int[tags[k + 1]], self.tag_to_int[tags[k + 2]]]]
        # Fill the slots not already taken by known-tag tokens, in order.
        i = 0
        for tag in tags:
            while ret_tags[i] is not None:
                i += 1
            ret_tags[i] = tag
        return ret_tags
Exemplo n.º 5
0
    def train(self, train_data):
        """Collect features from ``train_data`` and fit the weights ``self.v``.

        Does nothing (apart from logging) when ``self.is_test`` is truthy.
        Otherwise it runs these steps in order:

        1. For every sentence, drop tokens whose word is in ``known_tags``
           and record words, tags and the two per-token values at indices 2
           and 3; register unseen tags and the tags seen for each word.
        2. Call ``self.feature_collector`` for every token.
        3. Keep the features of ``self.set_of_features`` whose value is at
           least 5 and number them in ``self.key_to_int``; ``self.int``
           ends up holding their count.
        4. Extract the feature vector of every token and the all-tags
           feature matrix for its context.
        5. Run ``minimize`` on ``self.L`` with gradient ``self.dLdv`` and
           call ``self.finish_train()``.

        Args:
            train_data: iterable of sentences; each token is indexed as
                ``(word, tag, <index 2>, <index 3>)``.
        """
        if self.is_test:
            logger.info("Model already trained.")
            return
        logger.info("Collecting features and tags")
        tag_enum = 0
        for sentence in train_data:
            kept = [token for token in sentence if token[0] not in known_tags]
            words = [token[0] for token in kept]
            tags = [token[1] for token in kept]
            is_cap = [token[2] for token in kept]
            is_num = [token[3] for token in kept]
            self.train_sentences.append(words)
            self.train_tags.append(tags)
            self.train_is_cap.append(is_cap)
            self.train_is_num.append(is_num)
            for tag in tags:
                if tag in self.set_of_tags:
                    continue
                self.set_of_tags.add(tag)
                self.tag_to_int[tag] = tag_enum
                self.int_to_tag[tag_enum] = tag
                tag_enum += 1
            for word, tag in zip(words, tags):
                if word in self.word_tag_dict:
                    self.word_tag_dict[word].add(tag)
                else:
                    self.word_tag_dict[word] = {tag}

        total = len(self.train_sentences)
        for i, sent_words in enumerate(self.train_sentences):
            for idx in range(len(sent_words)):
                self.feature_collector(sent_words, self.train_tags[i],
                                       self.train_tags[i][idx], idx)
            progress_bar(i / total,
                         "completed {} of {} sentences".format(i, total))
        progress_bar(1, "")
        print()

        useful_keys = [name for name, count in self.set_of_features.items()
                       if float(count) >= 5]
        for position, key in enumerate(useful_keys):
            self.key_to_int[key] = position
        self.int = len(useful_keys)

        logger.info("Collected {} features and {} tags".format(self.int, tag_enum))
        self.data_features = sparse_mat((0, self.int))
        self.data_alt_features = []
        self.sum_of_features = sparse_mat((1, self.int))
        logger.info("Extracting features")
        for i, sent_words in enumerate(self.train_sentences):
            sent_tags = self.train_tags[i]
            sent_cap = self.train_is_cap[i]
            sent_num = self.train_is_num[i]
            for idx in range(len(sent_words)):
                # '*' stands in for the tags before the start of a sentence.
                prev_tag = sent_tags[idx - 1] if idx > 0 else '*'
                prev2_tag = sent_tags[idx - 2] if idx > 1 else '*'
                fv = self.feature_extractor(sent_words, sent_cap, sent_num,
                                            sent_tags[idx], prev_tag,
                                            prev2_tag, idx)
                # Append fv as a new row of data_features.
                stacked = hstack([self.data_features.transpose(), fv.transpose()],
                                 format='csr')
                self.data_features = stacked.transpose()
                self.sum_of_features += fv
                alt = self.feature_extractor_all_tags(sent_words, sent_cap,
                                                      sent_num, prev_tag,
                                                      prev2_tag, idx)
                self.data_alt_features.append(alt)
            progress_bar(i / total,
                         "completed {} of {} sentences".format(i, total))
        progress_bar(1, "")
        print()
        logger.info("Extracted features for all words")
        logger.debug('Start Now!!')
        # minimize is presumably scipy.optimize.fmin_l_bfgs_b (factr/pgtol
        # arguments, 'warnflag'/'funcalls'/'nit' keys) -- confirm at import.
        start_point = np.zeros(self.int)
        self.v, _objective, info = minimize(self.L, start_point, factr=1e12,
                                            pgtol=1e-3, fprime=self.dLdv)
        logger.debug('End Now!!')
        logger.debug("v is: {}".format(self.v))
        outcome = "success" if info['warnflag'] == 0 else "failure"
        logger.debug("Result of minimize is {}".format(outcome))
        logger.debug("Function called {} times".format(info['funcalls']))
        logger.debug("Number of iterations {}".format(info['nit']))
        self.finish_train()
Exemplo n.º 6
0
 def L(self, v):
     """Return the regularised objective (to be minimised) at ``v``.

     The value is ``-(tmp1 - tmp2 - (LAMBDA / 2) * v.v)`` where ``tmp1`` is
     the sum of ``self.data_features . v`` and ``tmp2`` sums, over every
     matrix in ``self.data_alt_features``, ``log(sum(exp(mat . v)))``.

     Args:
         v: weight vector; wrapped with ``sparse_mat`` before use
             (assumed to become a single-row sparse matrix -- confirm
             against the definition of ``sparse_mat``).

     Returns:
         The scalar objective value.
     """
     logger.info('run L(v)')
     v_s = sparse_mat(v)
     v_col = v_s.transpose()
     tmp1 = np.sum(self.data_features.dot(v_col).toarray())
     log_norms = []
     for mat in self.data_alt_features:
         scores = mat.dot(v_col).toarray()
         # log-sum-exp with the maximum factored out: same value, but
         # exp() can no longer overflow to inf for large scores.
         shift = scores.max() if scores.size else 0.0
         log_norms.append(shift + np.log(np.sum(np.exp(scores - shift))))
     tmp2 = np.sum(log_norms)
     return -(tmp1 - tmp2 - ((LAMBDA / 2) * v_s.dot(v_col)[0, 0]))
Exemplo n.º 7
0
 def feature_extractor(self, words, is_cap, is_num, tag, last_t, last2_t,
                       idx):
     """Return the feature vector for one (token, tag, history) combination.

     Forwards all arguments, in the same order, to
     ``self.feature_extractor_aux`` and wraps the result with
     ``sparse_mat``.
     """
     raw_features = self.feature_extractor_aux(
         words, is_cap, is_num, tag, last_t, last2_t, idx)
     return sparse_mat(raw_features)