Exemplo n.º 1
0
    def __init__(self, model_path, word_dim=None, caps_dim=None, suffix_dim=None):
        """Set up the embedding tables and the linear output layer.

        Two modes, selected by ``word_dim``:
          * ``word_dim is None`` -- supertagger (inference): dimensions
            are restored from ``<model_path>/tagger_defs.txt``.
          * otherwise -- training: the supplied dimensions are used.
        """
        self.model_path = model_path
        if word_dim is not None:
            # training: dimensions come straight from the caller
            self.word_dim = word_dim
            self.caps_dim = caps_dim
            self.suffix_dim = suffix_dim
        else:
            # supertagger: restore the dimensions recorded at training time
            with open(os.path.join(model_path, "tagger_defs.txt")) as defs_file:
                saved = json.load(defs_file)
            self.word_dim = saved["word_dim"]
            self.caps_dim = saved["caps_dim"]
            self.suffix_dim = saved["suffix_dim"]

        self.words = read_model_defs(os.path.join(model_path, "words.txt"))
        self.suffixes = read_model_defs(os.path.join(model_path, "suffixes.txt"))
        self.caps = read_model_defs(os.path.join(model_path, "caps.txt"))
        self.targets = read_model_defs(os.path.join(model_path, "target.txt"))

        # id of the unknown-suffix marker in the suffix vocabulary
        self.unk_suffix = self.suffixes["UNK"]

        # 7-token context window; each position contributes all three embeddings
        in_dim = 7 * (self.word_dim + self.caps_dim + self.suffix_dim)
        super(EmbeddingTagger, self).__init__(
                emb_word=L.EmbedID(len(self.words), self.word_dim),
                emb_caps=L.EmbedID(len(self.caps), self.caps_dim),
                emb_suffix=L.EmbedID(len(self.suffixes), self.suffix_dim),
                linear=L.Linear(in_dim, len(self.targets)),
                )
Exemplo n.º 2
0
 def __init__(self, model_path, samples_path):
     """Load the model vocabularies and the raw training samples.

     Args:
         model_path: directory containing words/suffixes/caps/target defs.
         samples_path: text file with one training sample per line.
     """
     self.model_path = model_path
     self.words = read_model_defs(os.path.join(model_path, "words.txt"))
     self.suffixes = read_model_defs(os.path.join(model_path, "suffixes.txt"))
     self.caps = read_model_defs(os.path.join(model_path, "caps.txt"))
     self.targets = read_model_defs(os.path.join(model_path, "target.txt"))
     # use a context manager so the sample file is closed promptly
     # (the original `open(...).readlines()` leaked the handle)
     with open(samples_path) as sample_file:
         self.samples = sample_file.readlines()
     self.unk_word = self.words["*UNKNOWN*"]
     self.unk_suffix = self.suffixes["UNK"]
Exemplo n.º 3
0
    def __init__(self,
                 model_path,
                 word_dim=None,
                 afix_dim=None,
                 nlayers=2,
                 hidden_dim=128,
                 dep_dim=100,
                 dropout_ratio=0.5):
        """Construct the biaffine LSTM parser and its Chainer links.

        Two modes, selected by ``word_dim``:
          * ``word_dim is None`` -- inference: hyperparameters are
            restored from ``<model_path>/tagger_defs.txt``.
          * otherwise -- training: hyperparameters come from the
            arguments and are dumped to that same file.

        Args:
            model_path: directory with vocabularies and tagger_defs.txt.
            word_dim: word embedding size, or None for inference mode.
            afix_dim: prefix/suffix embedding size (training mode).
            nlayers: LSTM layers per direction.
            hidden_dim: per-direction LSTM hidden size.
            dep_dim: dimensionality of the arc/relation projections.
            dropout_ratio: stored on self for use elsewhere in the model.
        """
        self.model_path = model_path
        defs_file = model_path + "/tagger_defs.txt"
        if word_dim is None:
            # inference: restore the hyperparameters saved at training
            # time onto self
            self.train = False
            Param.load(self, defs_file)
            self.extractor = FeatureExtractor(model_path)
        else:
            # training: assignments on `p` evidently propagate to self
            # (Param wraps self -- self.word_dim etc. are read after
            # this branch although only p.word_dim was assigned here)
            self.train = True
            p = Param(self)
            p.dep_dim = dep_dim
            p.word_dim = word_dim
            p.afix_dim = afix_dim
            p.hidden_dim = hidden_dim
            p.nlayers = nlayers
            p.n_words = len(read_model_defs(model_path + "/words.txt"))
            p.n_suffixes = len(read_model_defs(model_path + "/suffixes.txt"))
            p.n_prefixes = len(read_model_defs(model_path + "/prefixes.txt"))
            p.targets = read_model_defs(model_path + "/target.txt")
            p.dump(defs_file)

        # LSTM input: one word embedding plus 8 affix embeddings
        # (presumably 4 suffix + 4 prefix slots -- confirm against the
        # feature extractor)
        self.in_dim = self.word_dim + 8 * self.afix_dim
        self.dropout_ratio = dropout_ratio
        super(FastBiaffineLSTMParser,
              self).__init__(emb_word=L.EmbedID(self.n_words,
                                                self.word_dim,
                                                ignore_label=IGNORE),
                             emb_suf=L.EmbedID(self.n_suffixes,
                                               self.afix_dim,
                                               ignore_label=IGNORE),
                             emb_prf=L.EmbedID(self.n_prefixes,
                                               self.afix_dim,
                                               ignore_label=IGNORE),
                             # forward/backward LSTMs; inter-layer
                             # dropout hard-coded at 0.32
                             lstm_f=L.NStepLSTM(self.nlayers, self.in_dim,
                                                self.hidden_dim, 0.32),
                             lstm_b=L.NStepLSTM(self.nlayers, self.in_dim,
                                                self.hidden_dim, 0.32),
                             arc_dep=L.Linear(2 * self.hidden_dim,
                                              self.dep_dim),
                             arc_head=L.Linear(2 * self.hidden_dim,
                                               self.dep_dim),
                             rel_dep=L.Linear(2 * self.hidden_dim,
                                              self.dep_dim),
                             rel_head=L.Linear(2 * self.hidden_dim,
                                               self.dep_dim),
                             biaffine_arc=Biaffine(self.dep_dim),
                             biaffine_tag=Bilinear(self.dep_dim, self.dep_dim,
                                                   len(self.targets)))
Exemplo n.º 4
0
 def __init__(self, model_path):
     """Load word/character vocabularies and cache their special-token ids."""
     self.model_path = model_path
     self.words = read_model_defs(model_path + "/words.txt")
     self.chars = read_model_defs(model_path + "/chars.txt")
     # ids of the special tokens in each vocabulary
     self.unk_word, self.start_word, self.end_word = (
         self.words[UNK], self.words[START], self.words[END])
     self.unk_char, self.start_char, self.end_char = (
         self.chars[UNK], self.chars[START], self.chars[END])
Exemplo n.º 5
0
    def __init__(self,
                 model_path,
                 word_dim=None,
                 char_dim=None,
                 nlayers=2,
                 hidden_dim=128,
                 dep_dim=100,
                 dropout_ratio=0.5):
        """Build the Japanese biaffine LSTM dependency parser.

        ``word_dim is None`` selects inference mode: hyperparameters are
        restored via ``Param.load``. Otherwise (training) they are taken
        from the arguments and dumped to ``tagger_defs.txt``.

        Args:
            model_path: directory with vocabularies and tagger_defs.txt.
            word_dim: word embedding size, or None for inference mode.
            char_dim: char-CNN output size (training mode).
            nlayers: LSTM layers per direction.
            hidden_dim: per-direction LSTM hidden size.
            dep_dim: dimensionality of the arc/relation projections.
            dropout_ratio: stored on self for use elsewhere in the model.
        """
        self.model_path = model_path
        defs_file = model_path + "/tagger_defs.txt"
        if word_dim is None:
            # inference: restore saved hyperparameters onto self
            self.train = False
            Param.load(self, defs_file)
            self.extractor = FeatureExtractor(model_path)
        else:
            # training: assignments on `p` evidently propagate to self
            # (self.word_dim etc. are read after this branch)
            self.train = True
            p = Param(self)
            p.dep_dim = dep_dim
            p.word_dim = word_dim
            p.char_dim = char_dim
            p.hidden_dim = hidden_dim
            p.nlayers = nlayers
            p.n_words = len(read_model_defs(model_path + "/words.txt"))
            p.n_chars = len(read_model_defs(model_path + "/chars.txt"))
            p.targets = read_model_defs(model_path + "/target.txt")
            p.dump(defs_file)

        # LSTM input: word embedding size plus char-CNN output size
        self.in_dim = self.word_dim + self.char_dim
        self.dropout_ratio = dropout_ratio
        super(BiaffineJaLSTMParser,
              self).__init__(emb_word=L.EmbedID(self.n_words, self.word_dim),
                             # char embeddings fixed at width 50; conv_char
                             # maps them down to char_dim channels
                             emb_char=L.EmbedID(self.n_chars,
                                                50,
                                                ignore_label=IGNORE),
                             conv_char=L.Convolution2D(1,
                                                       self.char_dim, (3, 50),
                                                       stride=1,
                                                       pad=(1, 0)),
                             # inter-layer dropout hard-coded at 0.32
                             lstm_f=L.NStepLSTM(self.nlayers, self.in_dim,
                                                self.hidden_dim, 0.32),
                             lstm_b=L.NStepLSTM(self.nlayers, self.in_dim,
                                                self.hidden_dim, 0.32),
                             arc_dep=L.Linear(2 * self.hidden_dim,
                                              self.dep_dim),
                             arc_head=L.Linear(2 * self.hidden_dim,
                                               self.dep_dim),
                             rel_dep=L.Linear(2 * self.hidden_dim,
                                              self.dep_dim),
                             rel_head=L.Linear(2 * self.hidden_dim,
                                               self.dep_dim),
                             biaffine_arc=Biaffine(self.dep_dim),
                             biaffine_tag=L.Bilinear(self.dep_dim,
                                                     self.dep_dim,
                                                     len(self.targets)))
Exemplo n.º 6
0
 def __init__(self, model_path):
     """Load word/affix vocabularies and precompute special-token features."""
     self.words = read_model_defs(model_path + "/words.txt")
     self.suffixes = read_model_defs(model_path + "/suffixes.txt")
     self.prefixes = read_model_defs(model_path + "/prefixes.txt")
     self.unk_word = self.words[UNK]
     self.start_word = self.words[START]
     self.end_word = self.words[END]
     self.unk_suf = self.suffixes[UNK]
     self.unk_prf = self.prefixes[UNK]
     # each affix feature is one row of 4 ids, padded out with IGNORE
     pad = [IGNORE] * 3
     self.start_pre = [[self.prefixes[START]] + pad]
     self.start_suf = [[self.suffixes[START]] + pad]
     self.end_pre = [[self.prefixes[END]] + pad]
     self.end_suf = [[self.suffixes[END]] + pad]
Exemplo n.º 7
0
    def __init__(self, model_path, word_dim=None, char_dim=None):
        """Build the Japanese CCG embedding tagger.

        ``word_dim is None`` selects supertagger (inference) mode, where
        the embedding sizes are read back from ``tagger_defs.txt``;
        otherwise the given sizes are used and written to that file.
        """
        self.model_path = model_path
        defs_file = model_path + "/tagger_defs.txt"
        if word_dim is not None:
            # training: record the chosen dimensions for later inference
            self.word_dim = word_dim
            self.char_dim = char_dim
            with open(defs_file, "w") as out:
                json.dump(
                    {
                        "model": self.__class__.__name__,
                        "word_dim": self.word_dim,
                        "char_dim": self.char_dim
                    }, out)
        else:
            # supertagger: restore dimensions saved at training time
            with open(defs_file) as inp:
                saved = json.load(inp)
            self.word_dim = saved["word_dim"]
            self.char_dim = saved["char_dim"]

        self.extractor = FeatureExtractor(model_path)
        self.targets = read_model_defs(model_path + "/target.txt")
        # NOTE(review): train stays True even in supertagger mode;
        # sibling taggers set it False when word_dim is None -- confirm
        # this is intended.
        self.train = True

        hidden_dim = 1000
        # one window of word+char embeddings per prediction
        in_dim = WINDOW_SIZE * (self.word_dim + self.char_dim)
        super(JaCCGEmbeddingTagger, self).__init__(
            emb_word=L.EmbedID(len(self.extractor.words), self.word_dim),
            emb_char=L.EmbedID(len(self.extractor.chars),
                               self.char_dim,
                               ignore_label=IGNORE),
            linear1=L.Linear(in_dim, hidden_dim),
            linear2=L.Linear(hidden_dim, len(self.targets)),
        )
Exemplo n.º 8
0
 def __init__(self, model_path, samples_path):
     """Load training samples as (sentence, annotation) pairs from JSON.

     Args:
         model_path: directory containing target.txt and feature defs.
         samples_path: JSON file mapping sentences to annotations.
     """
     self.model_path = model_path
     self.targets = read_model_defs(model_path + "/target.txt")
     self.extractor = FeatureExtractor(model_path)
     with open(samples_path) as f:
         # materialize as a list so `self.samples[0]` below is valid on
         # Python 3 too (dict.items() is a non-indexable view there);
         # on Python 2 items() already returned a list, so behavior is
         # unchanged
         self.samples = list(json.load(f).items())
     # sanity check: JSON object keys decode to text (Python 2 `unicode`)
     assert isinstance(self.samples[0][0], unicode)
Exemplo n.º 9
0
 def __init__(self, model_path, samples_path):
     """Load training samples from JSON, sorted by sentence length."""
     self.model_path = model_path
     self.extractor = FeatureExtractor(model_path)
     self.targets = read_model_defs(model_path + "/target.txt")
     with open(samples_path) as f:
         pairs = json.load(f).items()
     # shortest sentences first: key on the token count of each sample
     self.samples = sorted(pairs, key=lambda pair: len(pair[1][0]))
Exemplo n.º 10
0
    def __init__(self,
                 model_path,
                 word_dim=None,
                 afix_dim=None,
                 nlayers=2,
                 hidden_dim=128,
                 relu_dim=64,
                 dropout_ratio=0.5):
        """Build the peep-hole LSTM supertagger.

        ``word_dim is None`` selects inference mode: hyperparameters are
        restored via ``Param.load``. Otherwise (training) they are taken
        from the arguments and recorded to ``tagger_defs.txt``.

        Args:
            model_path: directory with vocabularies and tagger_defs.txt.
            word_dim: word embedding size, or None for inference mode.
            afix_dim: prefix/suffix embedding size (training mode).
            nlayers: stored hyperparameter (the network below stacks a
                fixed two LSTM layers per direction).
            hidden_dim: LSTM hidden size.
            relu_dim: size of the penultimate linear layer.
            dropout_ratio: stored hyperparameter.
        """
        self.model_path = model_path
        defs_file = model_path + "/tagger_defs.txt"
        if word_dim is None:
            # inference: restore saved hyperparameters onto self
            self.train = False
            Param.load(self, defs_file)
            self.extractor = FeatureExtractor(model_path)
        else:
            # training: assignments on `p` evidently propagate to self
            # (p.in_dim reads self.word_dim right after p.word_dim was
            # set); p.dump persists them for later inference
            self.train = True
            p = Param(self)
            p.word_dim = word_dim
            p.afix_dim = afix_dim
            p.hidden_dim = hidden_dim
            p.relu_dim = relu_dim
            p.nlayers = nlayers
            p.dropout_ratio = dropout_ratio
            # word embedding plus 8 affix embeddings per token
            p.in_dim = self.word_dim + 8 * self.afix_dim
            p.n_words = len(read_model_defs(model_path + "/words.txt"))
            p.n_suffixes = len(read_model_defs(model_path + "/suffixes.txt"))
            p.n_prefixes = len(read_model_defs(model_path + "/prefixes.txt"))
            p.targets = read_model_defs(model_path + "/target.txt")
            p.dump(defs_file)

        super(PeepHoleLSTMTagger, self).__init__(
            emb_word=L.EmbedID(self.n_words,
                               self.word_dim,
                               ignore_label=IGNORE),
            emb_suf=L.EmbedID(self.n_suffixes,
                              self.afix_dim,
                              ignore_label=IGNORE),
            emb_prf=L.EmbedID(self.n_prefixes,
                              self.afix_dim,
                              ignore_label=IGNORE),
            # two stacked LSTMs per direction
            lstm_f1=DyerLSTM(self.in_dim, self.hidden_dim),
            lstm_f2=DyerLSTM(self.hidden_dim, self.hidden_dim),
            lstm_b1=DyerLSTM(self.in_dim, self.hidden_dim),
            lstm_b2=DyerLSTM(self.hidden_dim, self.hidden_dim),
            linear1=L.Linear(2 * self.hidden_dim, self.relu_dim),
            linear2=L.Linear(self.relu_dim, len(self.targets)),
        )
Exemplo n.º 11
0
    def __init__(self,
                 model_path,
                 ccgbank_path,
                 tritrain_path,
                 weight,
                 length=False):
        """Load CCGBank and tri-training samples and report their sizes.

        Args:
            model_path: directory containing target.txt and feature defs.
            ccgbank_path: JSON file of gold CCGBank samples.
            tritrain_path: JSON file of tri-training samples.
            weight: stored weight for the tri-training data.
            length: forwarded to FeatureExtractor.
        """
        self.model_path = model_path
        self.targets = read_model_defs(model_path + "/target.txt")
        self.extractor = FeatureExtractor(model_path, length)
        self.weight = weight
        # each CCGBank sentence is replicated this many times
        self.ncopies = 15
        with open(ccgbank_path) as ccg_file:
            self.ccgbank_samples = json.load(ccg_file)
        self.ccgbank_size = len(self.ccgbank_samples)
        with open(tritrain_path) as tri_file:
            self.tritrain_samples = json.load(tri_file)
        self.tritrain_size = len(self.tritrain_samples)

        # log the dataset sizes to stderr for the training run
        for label, value in [
                ("len(ccgbank):", self.ccgbank_size),
                ("len(ccgbank) * # copies:",
                 self.ccgbank_size * self.ncopies),
                ("len(tritrain):", self.tritrain_size)]:
            print(label, value, file=sys.stderr)
Exemplo n.º 12
0
    def __init__(self,
                 model_path,
                 word_dim=None,
                 char_dim=None,
                 nlayers=2,
                 hidden_dim=128,
                 relu_dim=64,
                 dropout_ratio=0.5):
        """Build the Japanese LSTM supertagger network.

        ``word_dim is None`` selects supertagger (inference) mode, where
        the hyperparameters are read back from ``tagger_defs.txt``;
        otherwise (training) the given hyperparameters are used and
        written to that file.

        Args:
            model_path: directory with vocabularies and tagger_defs.txt.
            word_dim: word embedding size, or None for inference mode.
            char_dim: char-CNN output size (training mode).
            nlayers: number of LSTM layers per direction.
            hidden_dim: per-direction LSTM hidden size.
            relu_dim: size of the penultimate linear layer.
            dropout_ratio: stored on self for use elsewhere in the model.
        """
        self.model_path = model_path
        defs_file = model_path + "/tagger_defs.txt"
        if word_dim is None:
            # use as supertagger: restore hyperparameters saved at
            # training time
            with open(defs_file) as f:
                defs = json.load(f)
            self.word_dim = defs["word_dim"]
            self.char_dim = defs["char_dim"]
            self.hidden_dim = defs["hidden_dim"]
            self.relu_dim = defs["relu_dim"]
            self.nlayers = defs["nlayers"]
            self.train = False
            self.extractor = FeatureExtractor(model_path)
        else:
            # training: persist the hyperparameters for later inference
            self.word_dim = word_dim
            self.char_dim = char_dim
            self.hidden_dim = hidden_dim
            self.relu_dim = relu_dim
            self.nlayers = nlayers
            self.train = True
            with open(defs_file, "w") as f:
                json.dump(
                    {
                        "model": self.__class__.__name__,
                        "word_dim": self.word_dim,
                        "char_dim": self.char_dim,
                        "hidden_dim": hidden_dim,
                        "relu_dim": relu_dim,
                        "nlayers": nlayers
                    }, f)

        self.targets = read_model_defs(model_path + "/target.txt")
        self.words = read_model_defs(model_path + "/words.txt")
        self.chars = read_model_defs(model_path + "/chars.txt")
        # LSTM input: word embedding size plus char-CNN output size
        self.in_dim = self.word_dim + self.char_dim
        self.dropout_ratio = dropout_ratio
        super(JaLSTMTagger, self).__init__(
            emb_word=L.EmbedID(len(self.words), self.word_dim),
            # char embeddings fixed at width 50; conv_char maps them
            # down to char_dim channels
            emb_char=L.EmbedID(len(self.chars), 50, ignore_label=IGNORE),
            conv_char=L.Convolution2D(1,
                                      self.char_dim, (3, 50),
                                      stride=1,
                                      pad=(1, 0)),
            # forward/backward LSTMs with no inter-layer dropout
            lstm_f=L.NStepLSTM(nlayers, self.in_dim, self.hidden_dim, 0.),
            lstm_b=L.NStepLSTM(nlayers, self.in_dim, self.hidden_dim, 0.),
            conv1=L.Convolution2D(1,
                                  2 * self.hidden_dim,
                                  (7, 2 * self.hidden_dim),
                                  stride=1,
                                  pad=(3, 0)),
            linear1=L.Linear(2 * self.hidden_dim, self.relu_dim),
            linear2=L.Linear(self.relu_dim, len(self.targets)),
        )
Exemplo n.º 13
0
 def __init__(self, model_path):
     """Load word/char vocabularies and derived lookup constants."""
     self.words = read_model_defs(model_path + "/words.txt")
     self.chars = read_model_defs(model_path + "/chars.txt")
     self.unk_word = self.words[UNK]
     self.unk_char = self.chars[UNK]
     # length of the longest real vocabulary word (the UNK marker excluded)
     lengths = (len(word) for word in self.words if word != UNK)
     self.max_char_len = max(lengths)
Exemplo n.º 14
0
 def __init__(self, model_path, samples_path, length=False):
     """Load the target set, feature extractor, and JSON training samples."""
     self.model_path = model_path
     self.targets = read_model_defs(model_path + "/target.txt")
     self.extractor = FeatureExtractor(model_path, length)
     with open(samples_path) as sample_file:
         self.samples = json.load(sample_file)