Example no. 1
    def __init__(self, config):
        super(XfmrDecoder, self).__init__()

        self.vocab = Vocab.load(config["vocab_file"])
        with open(config["typelib_file"]) as type_f:
            self.typelib = TypeLibCodec.decode(type_f.read())

        retype_vocab_size = len(self.vocab.types)
        rename_vocab_size = len(self.vocab.names)
        self.target_embedding = nn.Embedding(
            retype_vocab_size + rename_vocab_size, config["target_embedding_size"]
        )
        self.target_transform = nn.Linear(
            config["target_embedding_size"] + config["hidden_size"],
            config["hidden_size"],
        )

        # concat variable encoding and previous target token embedding as input
        decoder_layer = TransformerDecoderLayer(
            config["hidden_size"],
            1,
            config["hidden_size"],
            config["dropout"],
            activation="gelu",
        )
        decoder_norm = LayerNorm(config["hidden_size"])
        self.decoder = TransformerDecoder(
            decoder_layer, config["num_layers"], decoder_norm
        )
        self.output = nn.Linear(
            config["hidden_size"], retype_vocab_size + rename_vocab_size
        )
        self.mem_mask = config["mem_mask"]
        self.config: Dict = config
        self.retype_vocab_size = retype_vocab_size
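
For reference, a minimal sketch of a config dict covering the keys this constructor reads; the paths and sizes are placeholder values, not taken from the original project:

# Hypothetical configuration; only the key names are grounded in the constructor above.
config = {
    "vocab_file": "data/vocab.bpe10000/vocab",  # placeholder path
    "typelib_file": "data/typelib.json",        # placeholder path
    "target_embedding_size": 128,
    "hidden_size": 256,
    "dropout": 0.1,
    "num_layers": 2,
    "mem_mask": "none",
}
decoder = XfmrDecoder(config)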
Example no. 2
    def __init__(self,
                 url: str,
                 config: Optional[Dict] = None,
                 percent: float = 1.0):
        # support wildcards
        urls = sorted(glob.glob(url))
        urls = urls[:int(percent * len(urls))]
        super().__init__(urls)
        if config:
            # annotate example for training
            from utils.vocab import Vocab

            self.vocab = Vocab.load(config["vocab_file"])
            with open(config["typelib_file"]) as type_f:
                self.typelib = TypeLibCodec.decode(type_f.read())
            self.max_src_tokens_len = config["max_src_tokens_len"]
            self.max_num_var = config["max_num_var"]
            annotate = self._annotate
            self.rename = config.get("rename", False)
            # sort = Dataset._sort
            sort = identity
        else:
            # for creating the vocab
            annotate = identity
            sort = identity
        self = (self.pipe(Dataset._file_iter_to_line_iter)
                .map(Example.from_json)
                .map(annotate)
                .shuffle(Dataset.SHUFFLE_BUFFER)
                .pipe(sort))
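
A usage sketch, assuming the class above is the Dataset referenced via Dataset.SHUFFLE_BUFFER; the shard pattern mirrors the paths used in later examples on this page, and config is a dict like the ones passed to the model constructors:

# Wildcards are expanded by glob; percent keeps only the leading fraction of the sorted shards.
train_set = Dataset("data/preprocessed_data/train-shard-*.tar", config=config)
small_set = Dataset("data/preprocessed_data/train-shard-*.tar", config=config, percent=0.1)
vocab_set = Dataset("data/preprocessed_data/train-shard-*.tar")  # no config: identity pipeline, used when building the vocab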
Example no. 3
    def __init__(self, config):
        super(XfmrDecoder, self).__init__()

        self.vocab = Vocab.load(config["vocab_file"])
        with open(config["typelib_file"]) as type_f:
            self.typelib = TypeLibCodec.decode(type_f.read())
            self.typelib = self.typelib.fix()
        self.target_embedding = nn.Embedding(len(self.vocab.subtypes),
                                             config["target_embedding_size"])
        self.target_transform = nn.Linear(
            config["target_embedding_size"] + config["hidden_size"],
            config["hidden_size"],
        )
        # self.cached_decode_mask: Dict[int, torch.Tensor] = {}
        # self.size = torch.zeros(len(self.vocab.types), dtype=torch.long)

        # concat variable encoding and previous target token embedding as input
        decoder_layer = TransformerDecoderLayer(
            config["hidden_size"],
            1,
            config["hidden_size"],
            config["dropout"],
            activation="gelu",
        )
        decoder_norm = LayerNorm(config["hidden_size"])
        self.decoder = TransformerDecoder(decoder_layer, config["num_layers"],
                                          decoder_norm)
        self.output = nn.Linear(config["hidden_size"],
                                len(self.vocab.subtypes))

        self.config: Dict = config
Example no. 4
def covar_analysis(args):
    model = GaussianBilinearModel.load_model(args.model)
    rel_vocab = Vocab.load(args.relation)
    rel_mats = model.relation_mats
    scores = [abs(np.linalg.det(mat)) for mat in rel_mats]

    sort_idxs = np.argsort(scores)[::-1]
    for idx in sort_idxs:
        print('{} : {}'.format(rel_vocab.get_word(idx), scores[idx]))
Example no. 5
    def build(cls, config):
        params = util.update(cls.default_params(), config)

        vocab = Vocab.load(params['vocab_file'])
        model = cls(params['ast_node_encoding_size'], params['hidden_size'],
                    params['dropout'], vocab)
        model.config = params

        return model
Example no. 6
    def __init__(self, config):
        super().__init__()

        self.vocab = vocab = Vocab.load(config['vocab_file'])
        self.src_word_embed = nn.Embedding(len(vocab.source_tokens), config['source_embedding_size'])
        self.config = config

        self.decoder_cell_init = nn.Linear(config['source_encoding_size'], config['decoder_hidden_size'])

        if self.config['transformer'] == 'none':
            dropout = config['dropout']
            self.lstm_encoder = nn.LSTM(input_size=self.src_word_embed.embedding_dim,
                                        hidden_size=config['source_encoding_size'] // 2, num_layers=config['num_layers'],
                                        batch_first=True, bidirectional=True, dropout=dropout)

            self.dropout = nn.Dropout(dropout)

        elif self.config['transformer'] == 'bert':
            self.vocab_size = len(self.vocab.source_tokens) + 1

            state_dict = torch.load('saved_checkpoints/bert_2604/bert_pretrained_epoch_23_batch_140000.pth')

            keys_to_delete = ["cls.predictions.bias", "cls.predictions.transform.dense.weight", "cls.predictions.transform.dense.bias", "cls.predictions.transform.LayerNorm.weight",
                            "cls.predictions.transform.LayerNorm.bias", "cls.predictions.decoder.weight", "cls.predictions.decoder.bias",
                            "cls.seq_relationship.weight", "cls.seq_relationship.bias"]

            from collections import OrderedDict
            new_state_dict = OrderedDict()
            for k, v in state_dict['model'].items():
                if k in keys_to_delete: continue
                name = k[5:] # remove `bert.`
                new_state_dict[name] = v

            bert_config = BertConfig(vocab_size=self.vocab_size, max_position_embeddings=512, num_hidden_layers=6, hidden_size=256, num_attention_heads=4)
            self.bert_model = BertModel(bert_config)
            self.bert_model.load_state_dict(new_state_dict)

        elif self.config['transformer'] == 'xlnet':
            self.vocab_size = len(self.vocab.source_tokens) + 1

            state_dict = torch.load('saved_checkpoints/xlnet_2704/xlnet1_pretrained_epoch_13_iter_500000.pth')

            keys_to_delete = ["lm_loss.weight", "lm_loss.bias"]

            from collections import OrderedDict
            new_state_dict = OrderedDict()
            for k, v in state_dict['model'].items():
                if k in keys_to_delete: continue
                if k[:12] == 'transformer.': name = k[12:]
                else:                       name = k
                new_state_dict[name] = v

            xlnet_config = XLNetConfig(vocab_size=self.vocab_size, d_model=256, n_layer=12)
            self.xlnet_model = XLNetModel(xlnet_config)
            self.xlnet_model.load_state_dict(new_state_dict)
        else:
            print("Error! Unknown transformer type '{}'".format(self.config['transformer']))
Example no. 7
    def build(cls, config):
        params = util.update(cls.default_params(), config)

        vocab = Vocab.load(params['vocab_file'])
        model = cls(params['variable_encoding_size'], params['hidden_size'],
                    params['dropout'], params['tie_embedding'],
                    params['input_feed'], vocab)
        model.config = params

        return model
Example no. 8
def train_model(db_file, entity_db_file, vocab_file, word2vec, **kwargs):
    db = AbstractDB(db_file, 'r')
    entity_db = EntityDB.load(entity_db_file)
    vocab = Vocab.load(vocab_file)

    if word2vec:
        w2vec = ModelReader(word2vec)
    else:
        w2vec = None

    train.train(db, entity_db, vocab, w2vec, **kwargs)
Example no. 10
    def __init__(self, config):
        super().__init__()

        self.vocab = vocab = Vocab.load(config['vocab_file'])

        self.src_word_embed = nn.Embedding(len(vocab.source_tokens), config['source_embedding_size'])

        dropout = config['dropout']
        self.encoder = TransformerModel(self.src_word_embed.embedding_dim, 1, config['source_encoding_size'], config['num_layers'], dropout=dropout)

        self.decoder_cell_init = nn.Linear(config['source_encoding_size'], config['decoder_hidden_size'])

        self.dropout = nn.Dropout(dropout)
        self.config = config
Example no. 11
    def __init__(self, config, train=True):
        self.config = config
        self.train = train

        # model specific config
        self.is_ensemble = config['encoder']['type'] == 'EnsembleModel'
        if not self.is_ensemble:
            self.vocab = Vocab.load(config['data']['vocab_file'])
            self.grammar = self.vocab.grammar

        self.use_seq_encoder = config['encoder']['type'] == 'SequentialEncoder'
        self.use_hybrid_encoder = config['encoder']['type'] == 'HybridEncoder'
        self.init_gnn_with_seq_encoding = \
            config['encoder']['type'] == 'GraphASTEncoder' \
            and config['encoder']['init_with_seq_encoding']
Example no. 12
    def __init__(self, config):
        super().__init__()

        self.vocab = vocab = Vocab.load(config["vocab_file"])

        self.src_word_embed = nn.Embedding(len(vocab.source_tokens),
                                           config["source_embedding_size"])

        dropout = config["dropout"]
        self.encoder = TransformerModel(
            self.src_word_embed.embedding_dim,
            config["num_heads"],
            config["hidden_size"],
            config["num_layers"],
            dropout=dropout,
        )

        self.dropout = nn.Dropout(dropout)
        self.config = config
Example no. 13
    def build(cls, config):
        params = util.update(GraphASTEncoder.default_params(), config)

        print(params)

        connections = params['connections']
        connection2edge_type = {
            'top_down': 1,
            'bottom_up': 1,
            'variable_master_nodes': 2,
            'terminals': 2,
            'master_node': 2,
            'var_usage': 2,
            'func_root_to_arg': 1
        }
        num_edge_types = sum(connection2edge_type[key] for key in connections)
        gnn = GatedGraphNeuralNetwork(
            hidden_size=params['gnn']['hidden_size'],
            layer_timesteps=params['gnn']['layer_timesteps'],
            residual_connections=params['gnn']['residual_connections'],
            num_edge_types=num_edge_types
        )

        vocab = Vocab.load(params['vocab_file'])
        node_type_embedder = NodeTypeEmbedder(
            len(vocab.grammar.variable_types),
            params['node_type_embedding_size']
        )
        node_content_embedder = SubTokenEmbedder(
            vocab.obj_name.subtoken_model_path,
            params['node_content_embedding_size']
        )

        model = cls(gnn,
                    params['connections'],
                    params['node_syntax_type_embedding_size'],
                    params['decoder_hidden_size'],
                    node_type_embedder,
                    node_content_embedder,
                    vocab,
                    config=params)

        return model
Example no. 14
    def __init__(self, config, config_load=None):
        super().__init__()
        if config_load is not None:
            config = config_load
        self.encoder = Encoder.build(config["encoder"])
        self.retype = config["data"].get("retype", False)
        self.rename = config["data"].get("rename", False)
        self.interleave = config["data"].get("interleave", False)
        if self.interleave:
            self.interleave_module = InterleaveDecodeModule(config)
        else:
            if self.retype:
                self.retyping_module = RetypingDecodeModule(config)
            if self.rename:
                self.renaming_module = RenamingDecodeModule(config)
        self.config = config
        self.vocab = Vocab.load(config["data"]["vocab_file"])
        self._preprocess()
        self.soft_mem_mask = config["decoder"]["mem_mask"] == "soft"
Example no. 15
    def __init__(self, config):
        super(XfmrDecoder, self).__init__()

        self.vocab = Vocab.load(config["vocab_file"])
        with open(config["typelib_file"]) as type_f:
            self.typelib = TypeLibCodec.decode(type_f.read())
        vocab_size = (
            len(self.vocab.names)
            if config.get("rename", False)
            else len(self.vocab.types)
        )
        self.target_id_key = (
            "target_name_id" if config.get("rename", False) else "target_type_id"
        )
        self.target_embedding = nn.Embedding(
            vocab_size, config["target_embedding_size"]
        )
        self.target_transform = nn.Linear(
            config["target_embedding_size"] + config["hidden_size"],
            config["hidden_size"],
        )
        self.cached_decode_mask: Dict[int, torch.Tensor] = {}
        self.size = torch.zeros(vocab_size, dtype=torch.long)

        # concat variable encoding and previous target token embedding as input
        decoder_layer = TransformerDecoderLayer(
            config["hidden_size"],
            config["num_heads"],
            4 * config["hidden_size"],
            config["dropout"],
            activation="gelu",
        )
        decoder_norm = LayerNorm(config["hidden_size"])
        self.decoder = TransformerDecoder(
            decoder_layer, config["num_layers"], decoder_norm
        )
        self.output = nn.Linear(config["hidden_size"], vocab_size)
        self.mem_mask = config["mem_mask"]
        if config.get("rename", False):
            self.mem_mask = "none"

        self.config: Dict = config
Example no. 16
    def __init__(self, args):
        self.p = args

        if not os.path.isdir(self.p.log_dir): os.mkdir(self.p.log_dir)
        if not os.path.isdir(self.p.save_dir): os.mkdir(self.p.save_dir)

        pprint(vars(self.p))
        self.logger = get_logger(self.p.name, self.p.log_dir)
        self.logger.info(vars(self.p))

        self.save_path = os.path.join(self.p.save_dir, self.p.name) + '.pth'

        if self.p.gpu != '-1':
            self.device = torch.device('cuda')
            torch.cuda.set_rng_state(torch.cuda.get_rng_state())
            torch.backends.cudnn.deterministic = True
        else:
            self.device = torch.device('cpu')

        def lr_func(epoch):
            if epoch < 10:
                return 1.0
            elif 10 <= epoch and epoch < 25:
                return 0.3
            else:
                return 0.1

        self.data = self.load_data()
        self.vocab = Vocab.load('data/vocab.bpe10000/vocab')
        self.model = self.add_model()
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=-100, reduction='none')
        self.optim = torch.optim.Adam(self.model.parameters(), lr=self.p.lr)
        self.scheduler = torch.optim.lr_scheduler.LambdaLR(self.optim,
                                                           lr_lambda=lr_func,
                                                           last_epoch=-1)

        self.curr_epoch = 0

        if self.p.restore: self.load_model(self.save_path)
Example no. 17
    def __init__(self, config):
        super().__init__()

        self.vocab = vocab = Vocab.load(config['vocab_file'])

        self.src_word_embed = nn.Embedding(len(vocab.source_tokens),
                                           config['source_embedding_size'])

        dropout = config['dropout']
        self.lstm_encoder = nn.LSTM(
            input_size=self.src_word_embed.embedding_dim,
            hidden_size=config['source_encoding_size'] // 2,
            num_layers=config['num_layers'],
            batch_first=True,
            bidirectional=True,
            dropout=dropout)

        self.decoder_cell_init = nn.Linear(config['source_encoding_size'],
                                           config['decoder_hidden_size'])

        self.dropout = nn.Dropout(dropout)
        self.config = config
Example no. 18
def path_analysis(args):
    ent_vocab = Vocab.load(args.entity)
    rel_vocab = RelationVocab.load(args.relation, inv_flg=True)
    triple_dat = TripletDataset.load(args.triple, ent_vocab, rel_vocab)
    pq_dat = PathQueryDataset.load(args.query, ent_vocab, rel_vocab)
    g = LabeledDiGraph(triple_dat, inv_flg=True)

    # traversal path querys
    n_rel = []
    n_tail = []
    for (sub, rels, _) in pq_dat.samples:
        cur_ents = set([sub])
        for r in rels:
            next_ents = set()
            for e in cur_ents:
                new_ents = g.walk(e, r)
                next_ents.update(new_ents)
            cur_ents = next_ents
        n_rel.append(len(rels))
        n_tail.append(len(cur_ents))
    print(n_rel)
    print(n_tail)
    print('Correlation Coefficient: {}'.format(
        np.corrcoef(n_rel, n_tail)[0, 1]))
Example no. 19
def train(args):

    if not os.path.exists(args.save_dir): os.mkdir(args.save_dir)

    if args.gpu != '-1' and torch.cuda.is_available():
        device = torch.device('cuda')
        torch.cuda.set_rng_state(torch.cuda.get_rng_state())
        torch.backends.cudnn.deterministic = True
    else:
        device = torch.device('cpu')

    config = {
        'train': {
            'unchanged_variable_weight': 0.1,
            'buffer_size': 5000
        },
        'encoder': {
            'type': 'SequentialEncoder'
        },
        'data': {
            'vocab_file': 'data/vocab.bpe10000/vocab'
        }
    }

    train_set = Dataset('data/preprocessed_data/train-shard-*.tar')
    dev_set = Dataset('data/preprocessed_data/dev.tar')

    vocab = Vocab.load('data/vocab.bpe10000/vocab')

    if args.decoder:
        vocab_size = len(vocab.all_subtokens) + 1
    else:
        vocab_size = len(vocab.source_tokens) + 1

    max_iters = args.max_iters
    lr = args.lr
    warm_up = args.warm_up

    batch_size = 4096
    effective_batch_size = args.batch_size

    max_embeds = 1000 if args.decoder else 512

    bert_config = BertConfig(vocab_size=vocab_size,
                             max_position_embeddings=max_embeds,
                             num_hidden_layers=6,
                             hidden_size=256,
                             num_attention_heads=4)
    model = BertForPreTraining(bert_config)

    if args.restore:
        state_dict = torch.load(os.path.join(args.save_dir, args.res_name))
        model.load_state_dict(state_dict['model'])
        batch_count = state_dict['step']
        epoch = state_dict['epoch']

    model.train()
    model.to(device)

    if len(args.gpu) > 1 and device == torch.device('cuda'):
        model = nn.DataParallel(model)

    def lr_func(step):
        if step > warm_up:
            return (max_iters - step) / (max_iters - warm_up)
        else:
            return (step / warm_up)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 eps=1e-6,
                                 weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                  lr_lambda=lr_func,
                                                  last_epoch=-1)
    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='none')

    if args.restore:
        optimizer.load_state_dict(state_dict['optim'])
        scheduler.load_state_dict(state_dict['scheduler'])

    if not args.restore:
        # when restoring, keep the step/epoch loaded from the checkpoint above
        batch_count = 0
        epoch = 0
    cum_loss = 0.0

    while True:
        # load training dataset, which is a collection of ASTs and maps of gold-standard renamings
        train_set_iter = train_set.batch_iterator(
            batch_size=batch_size,
            return_examples=False,
            config=config,
            progress=True,
            train=True,
            max_seq_len=512,
            num_readers=args.num_readers,
            num_batchers=args.num_batchers)
        epoch += 1
        print("Epoch {}".format(epoch))

        loss = 0
        num_seq = 0

        optimizer.zero_grad()

        for batch in train_set_iter:
            if args.decoder:
                input_ids = batch.tensor_dict['prediction_target'][
                    'src_with_true_var_names']
            else:
                input_ids = batch.tensor_dict['src_code_tokens']

            attention_mask = torch.ones_like(input_ids)
            attention_mask[input_ids == 0] = 0.0

            assert torch.max(input_ids) < vocab_size
            assert torch.min(input_ids) >= 0

            if input_ids.shape[0] > max_embeds:
                print(
                    "Warning - length {} is greater than max length {}. Skipping."
                    .format(input_ids.shape[0], max_embeds))
                continue

            input_ids, labels = mask_tokens(inputs=input_ids,
                                            mask_token_id=vocab_size - 1,
                                            vocab_size=vocab_size,
                                            mlm_probability=0.15)

            input_ids[attention_mask == 0] = 0
            labels[attention_mask == 0] = -100

            if torch.cuda.is_available():
                input_ids = input_ids.cuda()
                labels = labels.cuda()
                attention_mask = attention_mask.cuda()

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            masked_lm_labels=labels)

            unreduced_loss = loss_fn(
                outputs[0].view(-1, bert_config.vocab_size),
                labels.view(-1)).reshape(labels.shape) / (
                    torch.sum(labels != -100, axis=1).unsqueeze(1) + 1e-7)
            loss += unreduced_loss.sum()
            num_seq += input_ids.shape[0]

            if num_seq > effective_batch_size:
                batch_count += 1
                loss /= num_seq
                cum_loss += loss.item()

                if batch_count % 20 == 0:
                    print("{} batches, Loss : {:.4}, LR : {:.6}".format(
                        batch_count, cum_loss / 20,
                        scheduler.get_lr()[0]))
                    cum_loss = 0.0

                if batch_count % 10000 == 0:
                    fname1 = os.path.join(
                        args.save_dir, 'bert_{}_step_{}.pth'.format(
                            ('decoder' if args.decoder else 'encoder'),
                            batch_count))
                    fname2 = os.path.join(
                        args.save_dir, 'bert_{}.pth'.format(
                            ('decoder' if args.decoder else 'encoder'),
                            batch_count))

                    state = {
                        'epoch': epoch,
                        'step': batch_count,
                        'model': model.module.state_dict(),
                        'optim': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict()
                    }

                    torch.save(state, fname1)
                    torch.save(state, fname2)

                    print("Saved file to path {}".format(fname1))
                    print("Saved file to path {}".format(fname2))

                loss.backward()
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

                loss = 0
                num_seq = 0

            if batch_count == max_iters:
                print(f'[Learner] Reached max iters', file=sys.stderr)
                exit()

        print("Max_len = {}".format(max_len))
        break
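
The mask_tokens helper is not shown in this example; a minimal sketch of the standard BERT-style masking it presumably implements (15% of positions selected; of those, 80% replaced by the mask id, 10% by a random token, 10% left unchanged), matching the call signature above:

import torch

def mask_tokens(inputs, mask_token_id, vocab_size, mlm_probability=0.15):
    # Sketch only: clone the inputs, choose positions to mask, and build MLM labels.
    labels = inputs.clone()
    masked = torch.bernoulli(torch.full(labels.shape, mlm_probability)).bool()
    labels[~masked] = -100  # loss is computed only on masked positions

    inputs = inputs.clone()
    # 80% of the masked positions are replaced with the mask token id
    replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked
    inputs[replaced] = mask_token_id
    # half of the remainder (10% overall) become a random token; the rest stay unchanged
    randomized = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked & ~replaced
    random_ids = torch.randint(vocab_size, labels.shape, dtype=torch.long)
    inputs[randomized] = random_ids[randomized]
    return inputs, labels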
Example no. 20
    p.add_argument('--entity')
    p.add_argument('--relation')

    args = p.parse_args()

    assert args.task in ['kbc', 'pq'], 'Invalid task: {}'.format(args.task)
    assert args.metric in ['mrr', 'hits'], 'Invalid metric: {}'.format(args.metric)
    if args.metric == 'hits':
        assert args.nbest, 'Please indicate n-best when using hits'

    model = GaussianBilinearModel.load_model(args.model)

    print('Preparing dataset...')
    if args.task == 'kbc':
        ent_vocab = Vocab.load(args.entity)
        rel_vocab = Vocab.load(args.relation)
        dataset = TripletDataset.load(args.data, ent_vocab, rel_vocab)
    elif args.task == 'pq':
        ent_vocab = Vocab.load(args.entity)
        rel_vocab = RelationVocab.load(args.relation, inv_flg=True)
        dataset = PathQueryDataset.load(args.data, ent_vocab, rel_vocab)
        if not hasattr(model, 'inv_flg') or not model.inv_flg:
            print('initializing inverse relation representations...')
            model.init_inverse()

    print('Start evaluation...')
    if args.metric == 'mrr':
        from evaluation import mrr
        # res = mrr.cal_mrr(model, dataset)
        res = mrr.multi_cal_mrr(model, dataset)
Example no. 21
def main(args):

    # print argument values
    print("Info: arguments\n\t" +
          "\n\t".join(["{}: {}".format(a, v) for a, v in vars(args).items()]),
          file=sys.stderr)

    # set seed
    if not args.seed:
        seed = random.randint(1, MAX_SEED)
        args.seed = seed
        print("using seed: ", args.seed)

    init_dynet(args)

    assert os.path.exists(args.data)
    if args.task == SENTIMENT:
        assert args.trg_domain in SENTIMENT_DOMAINS, f'Error: {args.trg_domain} is not a sentiment domain.'
        assert args.src_domain is not None, 'Error: A source domain must be specified.'
    elif args.task == POS:
        assert args.trg_domain in POS_DOMAINS, f'Error: {args.trg_domain} is not a POS domain.'

    if args.task == SENTIMENT:
        assert args.max_vocab_size == 5000, f'Error: Max vocab size is not 5000.'

    # create the model and log directories if they do not exist
    for dir_path in [args.model_dir, os.path.dirname(args.log_file)]:
        print("Check if directory exists:", dir_path)
        if not os.path.exists(dir_path):
            print('Creating %s...' % dir_path)
            os.makedirs(dir_path)
    # create predictions folder if it does not exist
    if args.output_predictions:
        if not os.path.exists(args.output_predictions):
            print('Creating output predictions folder: {}'.format(
                args.output_predictions))
            os.makedirs(args.output_predictions)

    if args.strategy not in [BASE, MTTRI_BASE]:
        # check that pre-trained models exist
        assert args.start_model_dir is not None,\
            'Error: start_model_dir needs to be provided.'
        for suffix in ['.model', '.params.pickle']:
            if args.strategy != TRI_TRAINING:  # tri-training w/ disagreement is enabled with --disagreement
                model_file = os.path.join(
                    args.start_model_dir,
                    args.start + "_run" + str(args.start_run) + suffix)
                assert os.path.exists(model_file),\
                    'Error: %s does not exist.' % model_file
            else:
                # check if 3 exists for tri_training
                model_name = args.start + "_bootstrap3_run" + str(
                    args.start_run) + suffix
                model_file = os.path.join(args.start_model_dir, model_name)
                assert os.path.exists(model_file), \
                    'Error: %s does not exist.' % model_file

    if args.task == POS:
        pos_path = os.path.join(args.data, 'gweb_sancl', 'pos_fine')
        assert os.path.exists(pos_path)
        train_path = os.path.join(pos_path, 'wsj', 'gweb-wsj-train.conll')
        dev_path = os.path.join(pos_path, 'wsj', 'gweb-wsj-dev.conll')
        unlabeled_path = os.path.join(
            args.data, 'gweb_sancl', 'unlabeled',
            'gweb-%s.unlabeled.txt' % args.trg_domain)
        dev_test_path = os.path.join(pos_path, args.trg_domain,
                                     'gweb-%s-dev.conll' % args.trg_domain)
        test_path = os.path.join(pos_path, args.trg_domain,
                                 'gweb-%s-test.conll' % args.trg_domain)
    elif args.task == SENTIMENT:
        sentiment_path = os.path.join(args.data, 'processed_acl')
        train_path = dev_path = os.path.join(sentiment_path, args.src_domain)
        # since there is no target domain test set, we just tune hyperparams
        # on book->dvd
        unlabeled_path = dev_test_path = test_path = os.path.join(
            sentiment_path, args.trg_domain)
    else:
        raise ValueError()

    # load the data and save it to a pickle file
    split2data = {}
    read_data = data_readers.task2read_data_func(args.task)
    for split, path_ in zip(
        ['train', 'dev', 'dev_test', 'test', 'unlabeled'],
        [train_path, dev_path, dev_test_path, test_path, unlabeled_path]):
        if split == 'unlabeled':
            data = read_data(
                path_, unlabeled=True,
                max_unlabeled=args.max_unlabeled)  # [[instances],[]]
        else:
            data = read_data(
                path_, unlabeled=False,
                max_train=args.max_train)  # keeps [[instances],[labels]]

        # the DANN paper uses somewhat different splits than the standard, so
        # we create the splits here
        if args.task == SENTIMENT:
            if split == 'train':
                # in the DANN paper, they use all 2000 training examples
                pass
            elif split == 'dev':
                # the DANN paper uses 200 target samples for testing,
                # which are read from the unlabeled file
                continue
            elif split == 'unlabeled':
                # in the DANN paper, we use the content of the unlabeled file
                # for testing
                split = 'test'
                data, data_dev = ((data[0][:-200], data[1][:-200]),
                                  (data[0][-200:], data[1][-200:]))
                # we use 200 labeled samples for validation
                split2data['dev'] = list(data_dev)
            elif split == 'test':
                # in the DANN set-up, we use this data as unlabeled data
                split = 'unlabeled'
                data = data[0], []
            elif split == 'unlabeled':
                data = data[0], []
        elif args.max_unlabeled and split == 'unlabeled':
            print('Restricting # of unlabeled examples to',
                  args.max_unlabeled,
                  file=sys.stderr)
            new_data = data[0][:args.max_unlabeled], data[1][:args.max_unlabeled]
            if len(new_data[0]) < args.max_unlabeled:
                # set if |unlabeled| < --max-unlabeled
                args.max_unlabeled = len(new_data[0])
            data = new_data
        elif args.max_train and split == 'train':
            print('Restricting # of labeled training examples to',
                  args.max_train,
                  file=sys.stderr)
            data = data[0][:args.max_train], data[1][:args.max_train]

        split2data[split] = list(data)
        print('# of %s examples: %d.' % (split, len(data[0])))

    vocab_dir = (args.model_dir if args.strategy in [BASE, MTTRI_BASE]
                 else args.start_model_dir)
    vocab_path = os.path.join(vocab_dir, 'vocab.txt')
    vocab = Vocab(vocab_path, max_vocab_size=args.max_vocab_size)
    if not os.path.exists(vocab_path):
        # build the vocabulary
        assert args.strategy in [BASE, MTTRI_BASE],\
            'Error: Vocabulary should only be created with the base model.'
        vocab.create(split2data['train'][0] + split2data['unlabeled'][0],
                     lowercase=args.lowercase)
    else:
        vocab.load()

    if args.task == SENTIMENT:
        print('Creating binary training data...')
        split2data = data_utils.get_tfidf_data(split2data, vocab, tfidf=True)
    elif args.task.startswith('pos'):
        print('Using words as training data for POS tagging...')
    elif args.task == 'parsing':
        print(
            'Using CoNLL entries as training data for parsing. Using word forms to extract feature representations...'
        )
        for split, data in split2data.items():
            split2data[split][0] = [[
                conll_entry.form for conll_entry in conll_entries
            ] for conll_entries in data[0]]
    else:
        raise ValueError(
            'Training data retrieval for task %s is not implemented.' %
            args.task)

    run_scores = []
    train_func = task_utils.task2train_func(args.task, args.strategy)
    for i in range(args.num_runs):
        run_num = i + 1
        print('\nRun %d/%d.' % (run_num, args.num_runs))

        val_score, test_score = train_func(
            vocab, args,
            *itertools.chain.from_iterable([
                split2data['train'], split2data['dev'], split2data['dev_test'],
                split2data['test'], split2data['unlabeled']
            ]), run_num)
        print('Validation score: %.3f. Test score: %.3f' %
              (val_score, test_score))
        run_scores.append((val_score, test_score))

    if args.num_runs > 1:
        # log the results of multiple runs to a file
        data_utils.log_to_file(args, run_scores)
Example no. 22
def train(args):

    if args.log:
        log_dir = args.log
    else:
        log_dir = os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            '{}'.format(datetime.now().strftime('%Y%m%d_%H:%M')))

    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    # setting for logging
    logger = logging.getLogger()
    logging.basicConfig(level=logging.INFO)
    log_path = os.path.join(log_dir, 'log')
    file_handler = logging.FileHandler(log_path)
    fmt = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    file_handler.setFormatter(fmt)
    logger.addHandler(file_handler)

    logger.info('Arguments...')
    for arg, val in vars(args).items():
        logger.info('{} : {}'.format(arg, val))

    logger.info('Preparing dataset...')
    if not args.entity or not args.relation:
        # make vocab from train set
        logger.info('Making entity/relation vocab from train data...')
        raise NotImplementedError()
    else:
        ent_vocab = Vocab.load(args.entity)
        rel_vocab = Vocab.load(args.relation)

    n_entity, n_relation = len(ent_vocab), len(rel_vocab)
    train_dat = TripletDataset.load(args.train, ent_vocab, rel_vocab)
    logger.info('')
    if args.valid:
        assert args.metric in ['mrr', 'hits'], \
            'Invalid evaluation metric: {}'.format(args.metric)
        assert args.metric, 'Please indicate an evaluation metric for validation'
        if args.metric == 'hits':
            assert args.nbest, 'Please indicate nbest for hits'
        valid_dat = TripletDataset.load(args.valid, ent_vocab, rel_vocab)

    if args.restart:
        logger.info('Restarting training: {}'.format(args.restart))
        model = GaussianBilinearModel.load_model(args.restart)
    else:
        logger.info('Building new model')
        opt = SGD(args.lr, args.gradclip)
        model = GaussianBilinearModel(n_entity, n_relation, args.dim,
                                      args.cmin, args.cmax, opt, args.tri,
                                      args.init_sigma)

    best_model = None
    best_val = -1
    for epoch in range(args.epoch):
        logger.info('start {} epoch'.format(epoch + 1))
        sum_loss = 0
        start = time.time()
        for i, pos_sample in enumerate(data_iter(train_dat)):
            neg_samples = [(pos_sample[0], pos_sample[1],
                            np.random.randint(n_entity))
                           for _ in range(args.num_negative)]
            for neg_sample in neg_samples:
                loss = model.update(pos_sample, neg_sample)
                sum_loss += loss
                # logger.info('loss: {}'.format(loss))
            # logger.info('processing {} samples in this epoch'.format(i+1))
            print('processing {} samples in this epoch'.format(i + 1))
        logger.info('sum loss: {}'.format(sum_loss))
        logger.info('{} sec/epoch for training'.format(time.time() - start))
        model_path = os.path.join(log_dir, 'model{}'.format(epoch + 1))
        model.save_model(model_path)
        if args.valid and (epoch + 1) % args.evalstep == 0:
            val = evaluation(model, valid_dat, args.metric, args.nbest)
            logger.info('{} in validation: {}'.format(args.metric, val))
            if val > best_val:
                best_model = copy.deepcopy(model)
                best_val = val
                best_epoch = epoch + 1

    if args.valid:
        logger.info('best model is {} epoch'.format(best_epoch))
        model_path = os.path.join(log_dir, 'bestmodel')
        best_model.save_model(model_path)

    logger.info('done all')
Example no. 23
from helper import *

from utils.vocab import PAD_ID, Vocab

vocab = Vocab.load('data/vocab.bpe10000/vocab')

def tokens_to_word(inp_seq):
    output = ''
    for t in inp_seq:
        c = vocab.all_subtokens.id2word[t]
        if c == '<s>': c = ''
        if c == '</s>': c = ''
        if c == '<pad>': c = ''
        output += c
    return output

class HistogramBins(object):

    def __init__(self, thresholds, key_func):
        self.thresholds = np.array(thresholds)
        self.thresholds = np.concatenate((self.thresholds, [math.inf]))
        self.key_func   = key_func

        self.bins           = {thresh : [] for thresh in self.thresholds}

    def process(self, data):
        for x in data:
            key = self.key_func(x)

            bin_id = np.argmax((self.thresholds - key) > 0)
            self.bins[self.thresholds[bin_id]].append(x)
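
A small usage sketch for HistogramBins with illustrative thresholds and data: an item lands in the first bin whose threshold strictly exceeds key_func(item), and the appended math.inf threshold catches everything else.

# Bucket token sequences by length.
sequences = [[3, 7, 9], list(range(30)), list(range(80)), list(range(200))]
hist = HistogramBins(thresholds=[10, 50, 100], key_func=len)
hist.process(sequences)
for thresh in hist.thresholds:
    print(thresh, len(hist.bins[thresh]))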