Example No. 1
    def setup_classifier(self):
        dic = self.dic
        hparams = self.hparams

        n_vocab = len(dic.tables['unigram'])
        unigram_embed_size = hparams['unigram_embed_size']

        if 'pretrained_unigram_embed_size' in hparams and hparams[
                'pretrained_unigram_embed_size'] > 0:
            pretrained_unigram_embed_size = hparams[
                'pretrained_unigram_embed_size']
        else:
            pretrained_unigram_embed_size = 0

        if 'pretrained_embed_usage' in hparams:
            pretrained_embed_usage = models.util.ModelUsage.get_instance(
                hparams['pretrained_embed_usage'])
        else:
            pretrained_embed_usage = models.util.ModelUsage.NONE

        if common.is_segmentation_task(self.task):
            n_label = len(dic.tables[constants.SEG_LABEL])
            n_labels = [n_label]
        else:
            self.log('Error: invalid task {}'.format(self.task))
            sys.exit()

        if (pretrained_embed_usage == models.util.ModelUsage.ADD
                or pretrained_embed_usage == models.util.ModelUsage.INIT):
            if pretrained_unigram_embed_size > 0 and pretrained_unigram_embed_size != unigram_embed_size:
                print(
                    'Error: pre-trained and randomly initialized unigram embedding vectors must be the same size (dimension) for {} operation: d1={}, d2={}'
                    .format(hparams['pretrained_embed_usage'],
                            pretrained_unigram_embed_size, unigram_embed_size),
                    file=sys.stderr)
                sys.exit()

        predictor = models.tagger.construct_RNNTagger(
            n_vocab=n_vocab,
            unigram_embed_size=unigram_embed_size,
            rnn_unit_type=hparams['rnn_unit_type'],
            rnn_bidirection=hparams['rnn_bidirection'],
            rnn_batch_first=hparams['rnn_batch_first'],
            rnn_n_layers=hparams['rnn_n_layers'],
            rnn_hidden_size=hparams['rnn_hidden_size'],
            mlp_n_layers=hparams['mlp_n_layers'],
            mlp_hidden_size=hparams['mlp_hidden_size'],
            n_labels=n_labels[0],
            use_crf=hparams['inference_layer'] == 'crf',
            # crf_top_k=hparams['crf_top_k'],
            rnn_dropout=hparams['rnn_dropout'],
            embed_dropout=hparams['embed_dropout']
            if 'embed_dropout' in hparams else 0.0,
            mlp_dropout=hparams['mlp_dropout'],
            pretrained_unigram_embed_size=pretrained_unigram_embed_size,
            pretrained_embed_usage=pretrained_embed_usage,
        )

        self.classifier = classifiers.sequence_tagger.SequenceTagger(
            predictor, task=self.task)
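
A minimal sketch of the hyperparameter dictionary this method reads. The keys follow the code above; the values shown are illustrative assumptions, not defaults from the project.

# Illustrative hparams for setup_classifier in Example No. 1; values are assumed.
hparams = {
    'unigram_embed_size': 128,
    'pretrained_unigram_embed_size': 0,   # optional; > 0 only when pre-trained vectors are used
    'pretrained_embed_usage': 'none',     # optional; resolved via models.util.ModelUsage.get_instance
    'rnn_unit_type': 'lstm',
    'rnn_bidirection': True,
    'rnn_batch_first': True,
    'rnn_n_layers': 2,
    'rnn_hidden_size': 256,
    'mlp_n_layers': 1,
    'mlp_hidden_size': 300,
    'inference_layer': 'crf',             # 'crf' enables the CRF output layer
    'rnn_dropout': 0.4,
    'embed_dropout': 0.4,                 # optional; treated as 0.0 when absent
    'mlp_dropout': 0.4,
}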
Example No. 2
    def grow_inference_layers(self, dic_grown):
        # Resize the output layers to cover labels added to the grown dictionary.
        n_labels_org = self.predictor.mlp.layers[-1].weight.shape[0]
        if common.is_segmentation_task(self.task):
            # n_labels_grown is only assigned for segmentation tasks here
            n_labels_grown = len(dic_grown.tables[constants.SEG_LABEL].id2str)
        models.util.grow_MLP(n_labels_org, n_labels_grown,
                             self.predictor.mlp.layers[-1])
        if self.predictor.use_crf:
            models.util.grow_crf_layer(n_labels_org, n_labels_grown,
                                       self.predictor.crf)
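
The project's models.util.grow_MLP is not shown here. As a rough, self-contained sketch of the idea in plain PyTorch (assumed behavior, not the project's implementation), growing an output layer while keeping the already-trained rows might look like this:

import torch
import torch.nn as nn

def grow_linear(layer: nn.Linear, n_new: int) -> nn.Linear:
    # Build a larger output layer and copy the trained rows into it.
    n_old, in_features = layer.weight.shape
    grown = nn.Linear(in_features, n_new)
    with torch.no_grad():
        grown.weight[:n_old] = layer.weight
        grown.bias[:n_old] = layer.bias
    return grown

old = nn.Linear(16, 5)      # 5 labels before new entries were added
new = grow_linear(old, 8)   # 8 labels afterwards
print(new.weight.shape)     # torch.Size([8, 16])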
Example No. 3
    def decode_batch(self, *inputs, org_tokens=None, org_attrs=None, file=sys.stdout):
        ys = self.classifier.decode(*inputs)
        id2label = (self.dic.tables[constants.SEG_LABEL if common.is_segmentation_task(self.task)
                                    else constants.ATTR_LABEL(0)].id2str)

        # for i in range(len(inputs[0])):
        #     print(len(inputs[0][i]), inputs[0][i])
        #     print(len(org_tokens[i]), org_tokens[i])
        #     print(len(ys[i]), ys[i])
        #     print()

        if not org_attrs:
            org_attrs = [None] * len(org_tokens)


        for x_str, a_str, y in zip(org_tokens, org_attrs, ys):
            y_str = [id2label[int(yi)] for yi in y]
            y_str = self.convert_to_valid_BIES_seq(y_str)

            if self.task == constants.TASK_TAG:
                if a_str:
                    res = ['{}{}{}{}{}'.format(xi_str, self.args.output_attr_delim,
                                                 ai_str, self.args.output_attr_delim,
                                                 yi_str)
                           for xi_str, ai_str, yi_str in zip(x_str, a_str, y_str)]
                else:
                    res = ['{}{}{}'.format(xi_str, self.args.output_attr_delim, yi_str)
                           for xi_str, yi_str in zip(x_str, y_str)]
                
                if self.args.output_data_format == 'wl':
                    res.append('')
                res = self.args.output_token_delim.join(res)


            elif self.task == constants.TASK_SEG:
                res = ['{}{}'.format(xi_str, self.args.output_token_delim 
                                     if (yi_str.startswith('E') or yi_str.startswith('S')) 
                                     else '') for xi_str, yi_str in zip(x_str, y_str)]
                res = ''.join(res).rstrip(' ')

            elif self.task == constants.TASK_SEGTAG:
                res = ['{}{}'.format(
                    xi_str, 
                    (self.args.output_attr_delim+yi_str[2:]+self.args.output_token_delim) 
                    if (yi_str.startswith('E-') or yi_str.startswith('S-')) else ''
                ) for xi_str, yi_str in zip(x_str, y_str)]
                res = ''.join(res).rstrip(' ')

            else:
                print('Error: Invalid decode type', file=self.logger)
                sys.exit()

            print(res, file=file)
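
A self-contained illustration of the TASK_SEG branch above, with hypothetical characters and tags and a space assumed as the token delimiter: the BIES labels decide where the delimiter is inserted.

chars  = list('ABCDE')
labels = ['B', 'E', 'S', 'B', 'E']   # hypothetical tag sequence
delim  = ' '
# Append the delimiter after every E (end of token) or S (single-character token) tag.
out = ''.join(c + (delim if y.startswith(('E', 'S')) else '')
              for c, y in zip(chars, labels))
print(out.rstrip(delim))   # -> 'AB C DE'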
Example No. 4
    def setup_classifier(self):
        dic = self.dic
        hparams = self.hparams

        n_vocab = len(dic.tables['unigram'])
        unigram_embed_dim = hparams['unigram_embed_dim']
        
        if 'bigram_embed_dim' in hparams and hparams['bigram_embed_dim'] > 0:
            bigram_embed_dim = hparams['bigram_embed_dim']
            n_bigrams = len(dic.tables[constants.BIGRAM])
        else:
            bigram_embed_dim = n_bigrams = 0

        if 'pretrained_unigram_embed_dim' in hparams and hparams['pretrained_unigram_embed_dim'] > 0:
            pretrained_unigram_embed_dim = hparams['pretrained_unigram_embed_dim']
        else:
            pretrained_unigram_embed_dim = 0

        if 'pretrained_bigram_embed_dim' in hparams and hparams['pretrained_bigram_embed_dim'] > 0:
            pretrained_bigram_embed_dim = hparams['pretrained_bigram_embed_dim']
        else:
            pretrained_bigram_embed_dim = 0

        if 'pretrained_embed_usage' in hparams:
            pretrained_embed_usage = models.util.ModelUsage.get_instance(hparams['pretrained_embed_usage'])
        else:
            pretrained_embed_usage = models.util.ModelUsage.NONE

        if common.is_segmentation_task(self.task):
            n_label = len(dic.tables[constants.SEG_LABEL])
            n_labels = [n_label]
            attr1_embed_dim = n_attr1 = 0

        else:
            n_labels = []
            for i in range(3): # tmp
                if constants.ATTR_LABEL(i) in dic.tables:
                    n_label = len(dic.tables[constants.ATTR_LABEL(i)])
                    n_labels.append(n_label)
                
            if 'attr1_embed_dim' in hparams and hparams['attr1_embed_dim'] > 0:
                attr1_embed_dim = hparams['attr1_embed_dim']
                n_attr1 = n_labels[1] if len(n_labels) > 1 else 0
            else:
                attr1_embed_dim = n_attr1 = 0

        if (pretrained_embed_usage == models.util.ModelUsage.ADD or
            pretrained_embed_usage == models.util.ModelUsage.INIT):
            if pretrained_unigram_embed_dim > 0 and pretrained_unigram_embed_dim != unigram_embed_dim:
                print('Error: pre-trained and randomly initialized unigram embedding vectors '
                      + 'must be the same dimension for {} operation'.format(hparams['pretrained_embed_usage'])
                      + ': d1={}, d2={}'.format(pretrained_unigram_embed_dim, unigram_embed_dim),
                      file=sys.stderr)
                sys.exit()

            if pretrained_bigram_embed_dim > 0 and pretrained_bigram_embed_dim != bigram_embed_dim:
                print('Error: pre-trained and randomly initialized bigram embedding vectors '
                      + 'must be the same dimension for {} operation'.format(hparams['pretrained_embed_usage'])
                      + ': d1={}, d2={}'.format(pretrained_bigram_embed_dim, bigram_embed_dim),
                      file=sys.stderr)
                sys.exit()

        predictor = models.tagger.construct_RNNTagger(
            n_vocab, unigram_embed_dim, n_bigrams, bigram_embed_dim,
            n_attr1, attr1_embed_dim, 0, 0,
            hparams['rnn_unit_type'], hparams['rnn_bidirection'], 
            hparams['rnn_n_layers'], hparams['rnn_n_units'], 
            hparams['rnn_n_layers2'] if 'rnn_n_layers2' in hparams else 0,
            hparams['rnn_n_units2'] if 'rnn_n_units2' in hparams else 0,
            hparams['mlp_n_layers'], hparams['mlp_n_units'], n_labels[0], 
            use_crf=hparams['inference_layer'] == 'crf',
            feat_dim=hparams['additional_feat_dim'], mlp_n_additional_units=0,
            rnn_dropout=hparams['rnn_dropout'],
            embed_dropout=hparams['embed_dropout'] if 'embed_dropout' in hparams else 0.0,
            mlp_dropout=hparams['mlp_dropout'],
            pretrained_unigram_embed_dim=pretrained_unigram_embed_dim,
            pretrained_bigram_embed_dim=pretrained_bigram_embed_dim,
            pretrained_embed_usage=pretrained_embed_usage)

        self.classifier = classifiers.sequence_tagger.SequenceTagger(predictor, task=self.task)
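
This variant reads a differently named set of hyperparameters (…_dim and rnn_n_units rather than …_size and rnn_hidden_size) and adds bigram and attribute keys. A minimal sketch with assumed values; only the key names come from the code above.

# Illustrative hparams for the setup_classifier variant in Example No. 4; values are assumed.
hparams = {
    'unigram_embed_dim': 128,
    'bigram_embed_dim': 0,              # optional; > 0 enables bigram embeddings
    'pretrained_unigram_embed_dim': 0,  # optional
    'pretrained_bigram_embed_dim': 0,   # optional
    'pretrained_embed_usage': 'none',   # optional
    'attr1_embed_dim': 0,               # optional; used only for non-segmentation tasks
    'rnn_unit_type': 'lstm',
    'rnn_bidirection': True,
    'rnn_n_layers': 2,
    'rnn_n_units': 256,
    'rnn_n_layers2': 0,                 # optional second RNN stack
    'rnn_n_units2': 0,                  # optional
    'mlp_n_layers': 1,
    'mlp_n_units': 300,
    'inference_layer': 'crf',
    'additional_feat_dim': 0,
    'rnn_dropout': 0.4,
    'embed_dropout': 0.4,               # optional; treated as 0.0 when absent
    'mlp_dropout': 0.4,
}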