Example #1
def read_vocab(vocab_file, vocab_limit):
    # Load a vocabulary either from JSON or from a plain-text file
    # with one word per line (line number = word index).
    if vocab_file.endswith(".json"):
        vocab = load_json(vocab_file)
    else:
        vocab = {l.strip(): c for c, l in enumerate(line_reader(vocab_file))}
    assert vocab["<s>"] == 0  # the sentence-start symbol must sit at index 0
    return {w: i for w, i in vocab.items() if i < vocab_limit}
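A minimal usage sketch (file name and contents hypothetical): with a plain-text vocabulary listing one word per line and "<s>" on the first line, the limit simply truncates by index.

# vocab.txt (hypothetical):
#   <s>
#   the
#   cat
#   sat
vocab = read_vocab("vocab.txt", vocab_limit=3)
# -> {"<s>": 0, "the": 1, "cat": 2}; "sat" (index 3) falls outside the limit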
Example #2
def __init__(self, vocab_file, tag_vocab_file, vocab_limit):
    self.vocab = read_vocab(vocab_file, vocab_limit)  # word -> index, capped at vocab_limit
    self.tag_vocab = load_json(tag_vocab_file)  # tag -> index
Example #3
    return e / len(frequent)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-input_dir", help="Directory containing model and vocabulary files.", required=True)
    parser.add_argument("-f", default="W_w", help="Model file (optional), meant for models per epoch.")
    parser.add_argument("-data_path", help="Filepath containing the SCWS dataset.", default="data/SCWS/ratings.txt")
    parser.add_argument("-win_size", default=3, type=int,
                        help="Context window size (n words to the left and n to the right).")
    parser.add_argument("-n_most_freq", type=int, help="Only consider n most freq. words from vocabulary.")
    args = parser.parse_args()

    w_index_path = "{}/w_index.json".format(args.input_dir)
    # model_path = "{}/sg.pickle".format(args.input_dir)
    log.info("Loading model.")
    w_index = load_json(w_index_path)
    if args.n_most_freq:
        # +1 since index 0 is reserved for the sentence-start symbol "<s>"
        w_index = {w: i for w, i in w_index.items() if i < args.n_most_freq + 1}
        log.info("Vocabulary truncated to {} words.".format(len(w_index)))

    embs = load_npy("{}/{}.npy".format(args.input_dir, args.f))
    c_embs = load_npy("{}/W_c.npy".format(args.input_dir))
    try:
        if args.f == "W_w":
            n = ""
        else:
            n = int(args.f[-1])  # epoch suffix, e.g. "W_w3" -> 3
            assert 0 <= n < 9
        bias = load_npy("{}/Wb{}.npy".format(args.input_dir, n))
    except FileNotFoundError:
        bias = None
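The paths built above imply the following layout of input_dir (a sketch; the bias files are optional):

input_dir/
    w_index.json   # word -> index mapping
    W_w.npy        # target embeddings (or per-epoch files such as W_w0.npy ... W_w8.npy)
    W_c.npy        # context embeddings
    Wb.npy         # optional biases (Wb0.npy ... Wb8.npy for per-epoch models)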
Example #4
import argparse
from bimu.utils.conll_utils import Conll07Reader
from bimu.utils.generic_utils import save_json, load_json


class TagVocab(dict):
    # Maps each tag to a unique integer index, in order of first occurrence.
    def update_tags(self, input_file):
        reader = Conll07Reader(input_file)
        for sent in reader:
            for tag in sent.cpos:  # coarse-grained POS tags
                if tag not in self:
                    self[tag] = len(self)

    def write(self, output_file):
        save_json(self, output_file)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-input_file", required=True)
    parser.add_argument("-append", action="store_true")
    parser.add_argument("-output_file", required=True)
    args = parser.parse_args()

    tag_vocab = TagVocab()
    if args.append:
        # Extend an existing tag vocabulary instead of starting from scratch.
        tag_vocab.update(load_json(args.output_file))
    tag_vocab.update_tags(args.input_file)
    tag_vocab.write(args.output_file)
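A short usage sketch of TagVocab itself, independent of the command line (file names hypothetical):

tv = TagVocab()
tv.update_tags("train.conll")  # hypothetical CoNLL file; tags are indexed in order of first occurrence
tv.write("tag_index.json")     # e.g. {"NN": 0, "VB": 1, ...}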
Example #5
    def Sequence_Level(self, train_file, test_file, num_label, epochs, tag_vocab_file):
        log.debug("Declaring theano vars.")
        random.seed(5)  # numpy.random, for reproducible initialisation
        # Initialise all weights uniformly in [-0.1, 0.1).
        W1 = theano.shared(0.2 * random.random([self.win * self.dimension, self.hidden1]) - 0.1)
        W2 = theano.shared(0.2 * random.random([self.hidden1, self.hidden2]) - 0.1)
        W3 = theano.shared(0.2 * random.random([self.hidden2, self.hidden3]) - 0.1)
        U = theano.shared(0.2 * random.random([self.hidden3, num_label]) - 0.1)

        x = T.dmatrix("x")  # len(l) by win*dimension
        y = T.lvector("y")
        learn_rate = T.scalar("learn_rate")

        # Three-layer ReLU MLP followed by a softmax output layer.
        A1 = T.dot(x, W1)
        B1 = A1 * (A1 > 0)  # ReLU
        A2 = T.dot(B1, W2)
        B2 = A2 * (A2 > 0)  # ReLU
        A3 = T.dot(B2, W3)
        B3 = A3 * (A3 > 0)  # ReLU
        G = T.dot(B3, U)
        L1 = T.nnet.softmax(G)  # len(l) by num_label

        #L1=T.nnet.softmax(T.dot(T.tanh(T.dot(T.tanh(T.dot(T.tanh(T.dot(x,W1)),W2)),W3)),U))

        cost = T.nnet.categorical_crossentropy(L1, y).mean()
        gw1, gw2, gw3, gu = T.grad(cost, [W1, W2, W3, U])
        #gw_x = T.grad(cost, [x])

        log.info("Compiling theano model.")
        # Plain SGD: each call performs one gradient step on all four weight matrices.
        f1 = theano.function(inputs=[x, y, learn_rate], outputs=[cost], updates=(
            (W1, W1 - learn_rate * gw1), (W2, W2 - learn_rate * gw2), (W3, W3 - learn_rate * gw3),
            (U, U - learn_rate * gu)))

        #f2 = theano.function(inputs=[x, y], outputs=cost)
        prediction = T.argmax(L1, axis=1)
        discrepancy = prediction - y  # zero wherever the prediction matches the gold label
        f3 = theano.function(inputs=[x, y], outputs=[discrepancy, prediction])
        #f4 = theano.function(inputs=[x, y], outputs=gw_x)

        alpha = self.alpha
        log.info("Read-in the training and test data.")
        open_train = open(train_file, "r")
        train_lines = open_train.readlines()
        open_test = open(test_file, "r")
        test_lines = open_test.readlines()

        log.info("Start training.")
        counter = 0
        start = time.time()
        iter_ = epochs
        for j in range(0, iter_):
            log.info("Epoch: {}...".format(j+1))
            for i in range(len(train_lines)):
                if i % 1000 == 0:
                    log.debug(i)
                counter = counter + 1
                # Linearly decay the learning rate over all updates, floored at 0.01.
                current_alpha = alpha * (iter_ * len(train_lines) - counter) / (iter_ * len(train_lines))
                current_alpha = max(current_alpha, 0.01)
                # Each line holds space-separated token indices, "|", label indices.
                line_ = train_lines[i]
                G = line_.split("|")
                token_line = G[0]
                label_line = G[1]
                token_list = list(fromstring(token_line, dtype=int, sep=' '))
                x_ = self.contextwin(token_list)  # len(l) by win*dimension
                y_ = fromstring(label_line, dtype=int, sep=' ')
                f1(x_, y_, current_alpha)

            # Evaluate on the test set after every epoch.
            total_num = 0    # total number of test tokens
            total_value = 0  # number of correctly labelled tokens
            goldlabels = []
            predictions = []
            goldlabels2 = []
            predictions2 = []

            for i in range(len(test_lines)):
                line_ = test_lines[i]
                G = line_.split("|")
                token_line = G[0].strip()
                label_line = G[1].strip()

                y = fromstring(label_line, dtype=int, sep=' ')
                x = self.contextwin(list(fromstring(token_line, dtype=int, sep=' ')))
                total_num = total_num + x.shape[0]
                discrep, preds = f3(x, y)
                goldlabels.extend(list(y))
                goldlabels2.append(list(y))
                predictions.extend(list(preds))
                predictions2.append(list(preds))
                total_value = total_value + x.shape[0] - count_nonzero(discrep)  # correct = total - errors

            assert len(goldlabels) == len(predictions)
            #  write out for evaluation with conlleval
            t_idx = load_json(tag_vocab_file)
            inv_t_idx = {i: t for t, i in t_idx.items()}
            with open("out", "w") as out:
                for gs, ps in zip(goldlabels2, predictions2):
                    for g, p in zip(gs, ps):
                        out.write("_ _ {} {}\n".format(inv_t_idx[g], inv_t_idx[p]))
                    out.write("\n")
            log.info("f1 {}".format(f1_score(goldlabels, predictions, average="weighted")))
            acc = 1.0 * total_value / total_num  # token-level accuracy
            log.info("acc " + str(acc))
        log.info("Training completed: {}s/epoch".format((time.time()-start)/iter_))
Example #6
def load(self, load_dir):
    self.w_index = load_json("{}/w_index.json".format(load_dir))  # word -> index
    self.inv_w_index = {i: w for w, i in self.w_index.items()}  # index -> word
    if os.path.isfile("{}/w_cn.json".format(load_dir)):
        self.w_cn = load_json("{}/w_cn.json".format(load_dir))  # word counts, if present
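A sketch of the files load expects in load_dir, following the paths in the code (contents illustrative):

load_dir/
    w_index.json   # {"<s>": 0, "the": 1, ...}
    w_cn.json      # optional word counts, e.g. {"the": 51234, ...}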