Example #1
def create_vocab(datasets, embed_file=None, bert_vocab_path=None, min_count=2):
    wd_vocab = Vocab(min_count, bos=None, eos=None)
    char_vocab = Vocab(bos=None, eos=None)
    tag_vocab = Vocab(bos=None, eos=None)
    ner_vocab = Vocab(bos=None, eos=None)
    for insts in datasets:
        for inst in insts:
            wd_vocab.add(inst.word)
            char_vocab.add(list(inst.word))
            tag_vocab.add(inst.pos_tag)

            if inst.ner_tag != 'O':
                # including PER ORG LOC MISC and UNK
                ner_tag = inst.ner_tag.split('-')[1]
                ner_vocab.add(ner_tag)

    embed_count = wd_vocab.load_embeddings(embed_file)
    print("%d word pre-trained embeddings loaded..." % embed_count)

    bert_vocab = BERTVocab(
        bert_vocab_path) if bert_vocab_path is not None else None

    return MultiVocab(
        dict(word=wd_vocab,
             char=char_vocab,
             tag=tag_vocab,
             ner=ner_vocab,
             bert=bert_vocab))
Example #2
File: run.py, Project: yizhongw/TagNN-PDTB
def pdtb_prepare(args):
    print('Loading dataset...')
    train_sections = [os.path.join(PathConfig.json_data_dir, '{:02}'.format(section_num)) for section_num in
                      PathConfig.train_sections]
    dev_sections = [os.path.join(PathConfig.json_data_dir, '{:02}'.format(section_num)) for section_num in
                    PathConfig.dev_sections]
    test_sections = [os.path.join(PathConfig.json_data_dir, '{:02}'.format(section_num)) for section_num in
                     PathConfig.test_sections]
    dataset = PDTBDataSet(train_sections, dev_sections, test_sections, level=2 if args.task.startswith('fine') else 1)
    print('Size of train: {}, dev: {}, test: {}'.format(len(dataset.train_set), len(dataset.dev_set),
                                                        len(dataset.test_set)))
    print('Creating word vocab...')
    if not os.path.exists(PathConfig.experiment_data_dir):
        os.mkdir(PathConfig.experiment_data_dir)
    word_vocab = Vocab(mannual_add=[PAD_WORD, UNK_WORD, BOS_WORD, EOS_WORD, NUM_WORD])
    for word in dataset.get_all_words():
        word_vocab.add(word)
    word_vocab.load_pretrained_emb(PathConfig.embedding_path)
    print('Size of word vocab: {}'.format(word_vocab.size()))
    torch.save(word_vocab, os.path.join(PathConfig.experiment_data_dir, 'word_vocab.obj'))
    tag_vocab = Vocab()
    for tag in dataset.get_all_tags():
        tag_vocab.add(tag)
    print('Size of tag vocab: {}'.format(tag_vocab.size()))
    tag_vocab.init_embed(ModelConfig.tag_embed_dim)
    torch.save(tag_vocab, os.path.join(PathConfig.experiment_data_dir, 'tag_vocab.obj'))
    print('Formatting the dataset to torch variables...')
    dataset.format_instances_to_torch_var(word_vocab, tag_vocab)
    torch.save(dataset, os.path.join(PathConfig.experiment_data_dir, 'dataset.obj'))
Example #3
def build_vocab(files, vocabulary=None, mtl=False, name="src", save_dir="/"):
    vocabs = []

    if vocabulary is not None:
        for v in vocabulary:
            print(f'Loading from {v}')
            vocab = Vocab()
            vocab.load_from_file(v)
            vocabs.append(vocab)
    else:
        if mtl is True:
            for index, f in enumerate(files):
                vocab = Vocab()
                vocab.build_vocab([f])
                vocab.save(save_dir + name + ".vocab." + str(index) + ".json")
                vocabs.append(vocab)
        else:
            vocab = Vocab()
            vocab.build_vocab(files)
            vocab.save(save_dir + name + ".vocab.json")
            vocabs.append(vocab)

    for index, vocab in enumerate(vocabs):
        print(f'vocabulary size {index+1:d}: {vocab.len():d}')

    return vocabs
Example #4
 def configuration(cls,
                   plm=None,
                   method='lgesql',
                   table_path='data/tables.json',
                   tables='data/tables.bin',
                   db_dir='data/database'):
     cls.plm, cls.method = plm, method
     cls.grammar = ASDLGrammar.from_filepath(GRAMMAR_FILEPATH)
     cls.trans = TransitionSystem.get_class_by_lang('sql')(cls.grammar)
     cls.tables = pickle.load(open(tables,
                                   'rb')) if type(tables) == str else tables
     cls.evaluator = Evaluator(cls.trans, table_path, db_dir)
     if plm is None:
         cls.word2vec = Word2vecUtils()
         cls.tokenizer = lambda x: x
         cls.word_vocab = Vocab(
             padding=True,
             unk=True,
             boundary=True,
             default=UNK,
             filepath='./pretrained_models/glove.42b.300d/vocab.txt',
             specials=SCHEMA_TYPES)  # word vocab for glove.42B.300d
     else:
         cls.tokenizer = AutoTokenizer.from_pretrained(
             os.path.join('./pretrained_models', plm))
         cls.word_vocab = cls.tokenizer.get_vocab()
     cls.relation_vocab = Vocab(padding=False,
                                unk=False,
                                boundary=False,
                                iterable=RELATIONS,
                                default=None)
     cls.graph_factory = GraphFactory(cls.method, cls.relation_vocab)
Example #5
def create_vocab(data_path):
    wd_vocab = Vocab(min_count=3, bos=None, eos=None)
    lbl_vocab = Vocab(pad=None, unk=None, bos=None, eos=None)
    assert os.path.exists(data_path)
    with open(data_path, 'r', encoding='utf-8') as fin:
        loader = map(lambda x: x.strip().split('|||'), fin)
        for lbl, data_item in loader:
            wds = data_item.strip().split(' ')
            wd_vocab.add(wds)
            lbl_vocab.add(lbl.strip())
    return MultiVocab({'word': wd_vocab, 'label': lbl_vocab})
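Note: the Vocab and MultiVocab classes used above are project-specific helpers and are not shown here. As a rough, made-up illustration of the '|||'-separated input format that create_vocab expects, the line below parses the same way the loader does (the sample text is invented, not project data):

# toy line in the format "<label> ||| <tokens>"
line = "positive ||| this movie was great"
lbl, data_item = line.strip().split('|||')
wds = data_item.strip().split(' ')
print(lbl.strip(), wds)   # -> positive ['this', 'movie', 'was', 'great']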
Example #6
def get_embed_vocab(embed_file):
    assert os.path.exists(embed_file)
    embed_vocab = Vocab(bos=None, eos=None)
    vec_dim = 0
    with open(embed_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            tokens = line.strip().split(' ')
            if len(tokens) < 10:
                continue
            embed_vocab.add(tokens[0])
            if vec_dim == 0:
                vec_dim = len(tokens[1:])

    embed_weights = np.random.uniform(-0.5 / vec_dim, 0.5 / vec_dim,
                                      (len(embed_vocab), vec_dim))
    with open(embed_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            tokens = line.strip().split(' ')
            if len(tokens) < 10:
                continue
            idx = embed_vocab.inst2idx(tokens[0])
            embed_weights[idx] = np.asarray(tokens[1:], dtype=np.float32)
    embed_weights[embed_vocab.pad_idx] = 0.
    embed_weights /= np.std(embed_weights)
    embed_vocab.embeddings = embed_weights
    return embed_vocab
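As a side note, the tail of get_embed_vocab initializes embedding rows with a small uniform distribution, zeroes the padding row, and rescales the whole matrix to unit standard deviation. A minimal standalone sketch of that scheme with plain NumPy, using toy sizes and a toy pad index that are not part of the original code:

import numpy as np

vocab_size, vec_dim, pad_idx = 5, 4, 0                       # toy values
weights = np.random.uniform(-0.5 / vec_dim, 0.5 / vec_dim,   # small uniform init
                            (vocab_size, vec_dim))
weights[pad_idx] = 0.                                        # padding row stays zero
weights /= np.std(weights)                                   # rescale to unit std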
Example #7
    def create_vocab(self):

        if self.is_training:
            if not os.path.exists(self.vocab_file_path):
                print("Creating vocab")
                self.vocab = Vocab(add_bos=False,
                                   add_eos=False,
                                   add_padding=False,
                                   min_count=self.min_count)

                for example in self.dataset:
                    self.vocab.add_tokenized_sentence(
                        example['tokens'][:self.train_max_length])

                self.vocab.finish()

                with open(self.vocab_file_path, 'wb') as f:
                    pickle.dump(self.vocab, f)
            else:
                with open(self.vocab_file_path, 'rb') as f:
                    self.vocab = pickle.load(f)

        else:
            print("Cargando vocab")
            with open(self.vocab_file_path, 'rb') as f:
                self.vocab = pickle.load(f)
Example #8
    def make_vocab_label(self,
                         sents,
                         vocab_label_init=None):
        if len(sents) == 0:
            return None

        if vocab_label_init:
            vocab_label = deepcopy(vocab_label_init)
        else:
            vocab_label = Vocab()
            if self.argv.data_type == 'conll05':
                core_labels = ["A0", "A1", "A2", "A3", "A4", "A5"]
            else:
                core_labels = ["ARG0", "ARG1", "ARG2", "ARG3", "ARG4", "ARG5"]
            for label in core_labels:
                vocab_label.add_word(label)

        bio_labels = []
        for sent in sents:
            for props in sent.prd_bio_labels:
                bio_labels += props
        cnt = Counter(bio_labels)
        bio_labels = [(w, c) for w, c in cnt.most_common()]

        for label, count in bio_labels:
            if not label.endswith('-V') and len(label) > 1:
                vocab_label.add_word(label[2:])

        return vocab_label
Example #9
def train(params):
    assert params["mode"].lower() == "train", "change training mode to 'train'"

    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    params['vocab_size'] = vocab.count
    # params["trained_epoch"] = get_train_msg()
    params["learning_rate"] *= np.power(0.9, params["trained_epoch"])

    # Build the model
    print("Building the model ...")
    model = Seq2Seq(params)
    # Get the checkpoint manager
    checkpoint = tf.train.Checkpoint(Seq2Seq=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint,
                                                    SEQ2SEQ_CKPT,
                                                    max_to_keep=5)

    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print("Restored from {}".format(checkpoint_manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")

    # Train the model
    print("Start training the model..")
    print("trained_epoch:", params["trained_epoch"])
    print("mode:", params["mode"])
    print("epochs:", params["epochs"])
    print("batch_size:", params["batch_size"])
    print("max_enc_len:", params["max_enc_len"])
    print("max_dec_len:", params["max_dec_len"])
    print("learning_rate:", params["learning_rate"])

    train_model(model, vocab, params, checkpoint_manager)
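The learning-rate line near the top of train() resumes an exponential decay schedule: each epoch that was already trained shrinks the base rate by a factor of 0.9. A quick toy computation (the numbers are illustrative, not from the project config):

base_lr, trained_epoch = 1e-3, 3
resumed_lr = base_lr * 0.9 ** trained_epoch   # 1e-3 * 0.729 = 0.000729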
Example #10
    def make_vocab_label(self,
                         sents,
                         vocab_label_init=None):
        if len(sents) == 0:
            return None

        if vocab_label_init:
            vocab_label = deepcopy(vocab_label_init)
        else:
            vocab_label = Vocab()
            none_label = 'O'
            vocab_label.add_word(none_label)

        labels = []
        for sent in sents:
            if sent.has_prds:
                for prop in sent.prd_bio_labels:
                    labels += prop
        cnt = Counter(labels)
        labels = [(w, c) for w, c in cnt.most_common()]

        for label, count in labels:
            vocab_label.add_word(label)

        return vocab_label
Example #11
def build_vocab(df, vocab_path):
    print(f"building vocab ...")

    vocab_dict = {"<unk>": 1, "<eos>": 2, "<pad>": 3}
    vocab_set = []

    for row in tqdm(df.itertuples()):
        text = row.text.replace(" ", "")  # remove spaces

        phones = pyopenjtalk.g2p(text, join=False)
        # remove pause
        phones = [phone for phone in phones if phone != "pau"]

        for phone in phones:
            if phone not in vocab_set:
                vocab_set.append(phone)

    # alphabetical order
    vocab_set.sort()

    wlines = []
    for v in vocab_set:
        index = len(vocab_dict) + 1
        vocab_dict[v] = index

    for v, index in vocab_dict.items():
        wlines.append(f"{v} {index:d}\n")

    with open(vocab_path, "w", encoding="utf-8") as f:
        f.writelines(wlines)

    print(f"vocabulary saved to {vocab_path}")

    return Vocab(vocab_path)
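build_vocab above writes one "<symbol> <index>" pair per line and leaves parsing to the Vocab constructor, which is not shown. A rough plain-Python reader for that file format might look like the following sketch (the function name is made up and this is not the project's actual Vocab class):

def read_vocab_file(vocab_path):
    # parse lines of the form "<symbol> <index>" back into a dict
    vocab = {}
    with open(vocab_path, "r", encoding="utf-8") as f:
        for line in f:
            symbol, index = line.rstrip("\n").split(" ")
            vocab[symbol] = int(index)
    return vocab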
Example #12
def prepare(args):
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)
    logger.info('Preparing the directories...')
    for dir_path in [
            args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir
    ]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.gpus, args.batch_size, args.train_files,
                          args.dev_files, args.test_files)
    vocab = Vocab(init_random=False, trainable_oov_cnt_threshold=2)
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))
    logger.info('Assigning embeddings...')
    #     vocab.build_embedding_matrix(args.pretrained_word_path)
    vocab.randomly_init_embeddings(args.embed_size)
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
    logger.info('Done with preparing!')
Example #13
def vocabs_init(train_data: List[str]) -> Vocab:
    print("Constructing vocabularies...", flush=True)

    vocab = Vocab(train_data)

    print('len(vocab): %d' % len(vocab))

    return vocab
Example #14
    def __init__(self, args):
        super().__init__()
        self.args = args
        self.K = args.K
        self.rnn_hidden = args.rnn_hidden
        self.max_sent_len = args.max_sent_len
        print("loading pretrained emb......")
        self.emb_matrix = np.load(args.dset_dir + '/' + args.dataset +
                                  '/embedding.npy')
        print("loading dataset vocab......")
        self.vocab = Vocab(args.dset_dir + '/' + args.dataset + '/vocab.pkl')

        # create embedding layers
        self.emb = nn.Embedding(self.vocab.size,
                                args.emb_dim,
                                padding_idx=constant.PAD_ID)
        self.pos_emb = nn.Embedding(len(constant.POS_TO_ID),
                                    args.pos_dim) if args.pos_dim > 0 else None

        # initialize embedding with pretrained word embeddings
        self.init_embeddings()

        # dropout
        self.input_dropout = nn.Dropout(args.input_dropout)

        # GRU for P(Trc|S,Y')
        self.GRU_mean_rc = torch.nn.GRUCell(
            len(constant.BIO_TO_ID) + len(constant.LABEL_TO_ID),
            self.rnn_hidden * 2)
        self.GRU_std_rc = torch.nn.GRUCell(
            len(constant.BIO_TO_ID) + len(constant.LABEL_TO_ID),
            self.rnn_hidden * 2)

        # GRU for P(Tner|S,Y')
        self.GRU_mean_ner = torch.nn.GRUCell(
            len(constant.BIO_TO_ID) + len(constant.LABEL_TO_ID),
            self.rnn_hidden * 2)
        self.GRU_std_ner = torch.nn.GRUCell(
            len(constant.BIO_TO_ID) + len(constant.LABEL_TO_ID),
            self.rnn_hidden * 2)

        # define r
        self.r_mean_rc = nn.Parameter(torch.randn(self.max_sent_len, self.K))
        self.r_std_rc = nn.Parameter(torch.randn(self.max_sent_len, self.K))
        self.r_mean_ner = nn.Parameter(torch.randn(self.max_sent_len, self.K))
        self.r_std_ner = nn.Parameter(torch.randn(self.max_sent_len, self.K))

        # define encoder for the sharing representations S
        self.BiLSTM = LSTMRelationModel(args)

        # classifier
        self.Lr = nn.Linear(4 * self.rnn_hidden, 2 * self.rnn_hidden)
        self.Cr = nn.Linear(2 * self.rnn_hidden, len(constant.LABEL_TO_ID))
        self.Cg = nn.Linear(2 * self.rnn_hidden, len(constant.BIO_TO_ID))

        # Fn
        self.logsoft_fn1 = nn.LogSoftmax(dim=2)
        self.logsoft_fn2 = nn.LogSoftmax(dim=3)
Example #15
    def __init__(self, args):
        self.args = args

        self.epoch = args.epoch
        self.batch_size = args.batch_size
        self.lr = args.lr
        self.K = args.K
        self.num_avg = args.num_avg
        self.global_iter = 0
        self.global_epoch = 0
        self.log_file = args.log_file

        # Network & Optimizer
        self.toynet = ToyNet(args).cuda()
        self.optim = optim.Adam(self.toynet.parameters(), lr=self.lr)

        self.ckpt_dir = Path(args.ckpt_dir)
        if not self.ckpt_dir.exists():
            self.ckpt_dir.mkdir(parents=True, exist_ok=True)
        self.load_ckpt = args.load_ckpt
        if self.load_ckpt != '': self.load_checkpoint(self.load_ckpt)

        # loss function
        self.ner_lossfn = nn.NLLLoss(reduction='sum')
        self.rc_lossfn = nn.BCELoss(reduction='sum')

        # History
        self.history = dict()
        # class loss
        self.history['ner_train_loss1'] = []
        self.history['rc_train_loss1'] = []
        self.history['ner_test_loss1'] = []
        self.history['rc_test_loss1'] = []
        self.history['ner_train_loss2'] = []
        self.history['rc_train_loss2'] = []
        self.history['ner_test_loss2'] = []
        self.history['rc_test_loss2'] = []
        self.history['precision_test'] = []
        self.history['recall_test'] = []
        self.history['F1_test'] = []
        # info loss
        self.history['info_train_loss'] = []
        self.history['info_test_loss'] = []

        # Dataset
        vocab = Vocab(args.dset_dir + '/' + args.dataset + '/vocab.pkl')
        self.data_loader = dict()
        self.data_loader['train'] = Dataloader(
            args.dset_dir + '/' + args.dataset + '/train.json',
            args.batch_size, vars(args), vocab)
        self.data_loader['test'] = Dataloader(args.dset_dir + '/' +
                                              args.dataset + '/test.json',
                                              args.batch_size,
                                              vars(args),
                                              vocab,
                                              evaluation=True)
Example #16
File: eval.py, Project: pnarsina/w266_final
def evaluate_model(evalparams):

    torch.manual_seed(evalparams.seed)
    random.seed(1234)
    if evalparams.cpu:
        evalparams.cuda = False
    elif evalparams.cuda:
        torch.cuda.manual_seed(evalparams.seed)

    # load opt
    print(evalparams.model_dir, evalparams.model)
    #     model_file = evalparams.model_dir + "/" + evalparams.model
    model_file = 'best_model.pt'
    print("Loading model from {}".format(model_file))
    opt = torch_utils.load_config(model_file)
    model = RelationModel(opt)
    model.load(model_file)

    # load vocab
    vocab_file = evalparams.model_dir + '/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)
    assert opt[
        'vocab_size'] == vocab.size, "Vocab size must match that in the saved model."

    # load data
    data_file = opt['data_dir'] + '/{}.json'.format(evalparams.dataset)
    print("Loading data from {} with batch size {}...".format(
        data_file, opt['batch_size']))
    batch = DataLoader(data_file,
                       opt['batch_size'],
                       opt,
                       vocab,
                       evaluation=True)

    helper.print_config(opt)
    id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])

    predictions = []
    all_probs = []
    for i, b in enumerate(batch):
        preds, probs, _ = model.predict(b)
        predictions += preds
        all_probs += probs
    predictions = [id2label[p] for p in predictions]
    p, r, f1 = scorer.score(batch.gold(), predictions, verbose=True)

    # save probability scores
    if len(evalparams.out) > 0:
        helper.ensure_dir(os.path.dirname(evalparams.out))
        with open(evalparams.out, 'wb') as outfile:
            pickle.dump(all_probs, outfile)
        print("Prediction scores saved to {}.".format(evalparams.out))

    print("Evaluation ended.")

    return (batch.gold(), predictions, model)
Example #17
def load_word_vector(path):
    """
    loading word vector(this project employs GLOVE word vector), save GLOVE word, vector as file
    respectively
    :param path: GLOVE word vector path
    :return: glove vocab,: vocab object, vector(numpy array, of shape(words_num, word_dim))
    """
    base = os.path.splitext(os.path.basename(path))[0]
    glove_vocab_path = os.path.join('../data/glove/', base + '.vocab')
    glove_vector_path = os.path.join('../data/glove/', base + '.path')
    # the word vectors have already been cached
    if os.path.isfile(glove_vocab_path) and os.path.isfile(glove_vector_path):
        print('======> File found, loading memory <=====!')
        vocab = Vocab(glove_vocab_path)
        vector = torch.load(glove_vector_path)  # the matrix is cached with torch.save below
        return vocab, vector

    print('=====>Loading glove word vector<=====')
    with open(path, 'r', encoding='utf8', errors='ignore') as f:
        contents = f.readline().rstrip('\n').split(' ')
        word_dim = len(contents[1:])
        count = 1
        for line in f:
            count += 1

    vocab = [None] * count
    vector = np.zeros((count, word_dim))
    with open(path, 'r', encoding='utf8', errors='ignore') as f:
        idx = 0
        for line in f:
            contents = line.rstrip('\n').split(' ')
            vocab[idx] = contents[0]
            vector[idx] = np.array(list(map(float, contents[1:])), dtype=float)
            idx += 1
    assert count == idx
    with open(glove_vocab_path, 'w', encoding='utf8', errors='ignore') as f:
        for token in vocab:
            f.write(token + '\n')

    vocab = Vocab(glove_vocab_path)
    torch.save(vector, glove_vector_path)
    return vocab, vector
Example #18
def prepare_data():
    # load the dataset
    train_sections = [
        os.path.join(paths.json_data_dir, '{:02}'.format(section_num))
        for section_num in paths.train_sections
    ]
    dev_sections = [
        os.path.join(paths.json_data_dir, '{:02}'.format(section_num))
        for section_num in paths.dev_sections
    ]
    test_sections = [
        os.path.join(paths.json_data_dir, '{:02}'.format(section_num))
        for section_num in paths.test_sections
    ]
    train_dataset = PDTBDataSet(train_sections,
                                tree_type=args.tree_type,
                                level=args.level,
                                multiple_labels=False)
    dev_dataset = PDTBDataSet(dev_sections,
                              tree_type=args.tree_type,
                              level=args.level,
                              multiple_labels=True)
    test_dataset = PDTBDataSet(test_sections,
                               tree_type=args.tree_type,
                               level=args.level,
                               multiple_labels=True)
    if not (train_dataset.consistent_with(dev_dataset)
            and dev_dataset.consistent_with(test_dataset)):
        print('Dataset labels are not consistent.')
        print('Train: {}'.format(sorted(train_dataset.label_map.keys())))
        print('Dev: {}'.format(sorted(dev_dataset.label_map.keys())))
        print('Test: {}'.format(sorted(test_dataset.label_map.keys())))
    print('Size of train set: {}, dev set: {}, test set: {}'.format(
        len(train_dataset), len(dev_dataset), len(test_dataset)))
    # save the dataset
    torch.save(train_dataset,
               os.path.join(paths.experiment_data_dir, 'train.data'))
    torch.save(dev_dataset, os.path.join(paths.experiment_data_dir,
                                         'dev.data'))
    torch.save(test_dataset,
               os.path.join(paths.experiment_data_dir, 'test.data'))
    # build the vocab
    vocab = Vocab(
        mannual_add=[PAD_WORD, UNK_WORD, BOS_WORD, EOS_WORD, NUM_WORD])
    all_words = train_dataset.get_all_words() + dev_dataset.get_all_words(
    ) + test_dataset.get_all_words()
    # all_words = train_dataset.get_all_words()
    for word in all_words:
        vocab.add(word)
    # load and initialize the embeddings
    vocab.load_pretrained_emb(paths.embedding_path)
    print('Size of PDTB vocabulary: {}'.format(vocab.size()))
    # save the vocab
    torch.save(vocab, paths.vocab_path)
Example #19
    def __init__(self, args):
        super().__init__()
        self.args = args
        self.K = args.K
        self.rnn_hidden = args.rnn_hidden
        self.max_sent_len = args.max_sent_len
        print("loading pretrained emb......")
        self.emb_matrix = np.load(args.dset_dir+'/'+args.dataset+'/embedding.npy')
        print("loading dataset vocab......")
        self.vocab = Vocab(args.dset_dir+'/'+args.dataset+'/vocab.pkl')

        # create embedding layers
        self.emb = nn.Embedding(self.vocab.size, args.emb_dim, padding_idx=constant.PAD_ID)
        self.pos_emb = nn.Embedding(len(constant.POS_TO_ID), args.pos_dim) if args.pos_dim > 0 else None

        # initialize embedding with pretrained word embeddings
        self.init_embeddings()

        # dropout
        self.input_dropout = nn.Dropout(args.input_dropout)

        # define r rc distribution
        self.r_mean_rc = nn.Parameter(torch.randn(self.max_sent_len, self.K))
        self.r_std_rc = nn.Parameter(torch.randn(self.max_sent_len, self.K, self.K))
        self.r_diag_rc = nn.Parameter(torch.randn(self.max_sent_len, self.K))
        # orthogonal initialization r_std_rc
        for i in range(self.max_sent_len):
            nn.init.orthogonal_(self.r_std_rc[i], gain=1)

        # define r ner distribution
        self.r_mean_ner = nn.Parameter(torch.randn(self.max_sent_len, self.K))
        self.r_std_ner = nn.Parameter(torch.randn(self.max_sent_len, self.K, self.K))
        self.r_diag_ner = nn.Parameter(torch.randn(self.max_sent_len, self.K))
        # orthogonal initialization r_std_ner
        for i in range(self.max_sent_len):
            nn.init.orthogonal_(self.r_std_ner[i], gain=1)

        # define encoder
        self.BiLSTM = LSTMRelationModel(args)
        self.hidden2mean_rc = nn.Linear(self.rnn_hidden*2, self.K)
        self.hidden2std_rc = nn.Linear(self.rnn_hidden*2, self.K)
        # ner encoder
        self.hidden2mean_ner = nn.Linear(self.rnn_hidden*2, self.K)
        self.hidden2std_ner = nn.Linear(self.rnn_hidden*2, self.K)

        # decoder
        self.rc_lr = nn.Linear(args.K*2, args.K)
        self.rc_cla = nn.Linear(args.K, len(constant.LABEL_TO_ID))
        self.ner_cla = nn.Linear(args.K, len(constant.BIO_TO_ID))
        self.logsoft_fn = nn.LogSoftmax(dim=3)

        # mse loss 
        self.loss_fn = torch.nn.MSELoss(reduction='sum')
Example #20
    def __init__(self,
                 source_name,
                 target_name,
                 max_length=300,
                 source_vocab=None,
                 target_vocab=None):

        self.data_source = self.read_file(source_name)
        self.data_target = self.read_file(target_name)

        self.max_length = max_length

        self.source_vocab = source_vocab
        if source_vocab is None:
            self.source_vocab = Vocab()
            self.source_vocab.build_vocab([source_name])

        self.target_vocab = target_vocab
        if target_vocab is None:
            self.target_vocab = Vocab()
            self.target_vocab.build_vocab([target_name])
Example #21
    def __init__(self,
                 logger,
                 config,
                 data_name,
                 data_path,
                 embed_path=None,
                 user_dict=None,
                 vocab_path=None,
                 stop_word=None,
                 max_len=50,
                 query_max_len=20,
                 target_max_len=20,
                 test_split=0.0,
                 training=True):
        self.logger = logger
        self.reset = config.reset
        self._data_dir = Path('data') / data_name

        self.query_max_len = query_max_len
        self.target_max_len = target_max_len
        self.max_len = max_len

        if training:
            embedding_path = self._data_dir / embed_path
            print(embedding_path.absolute())
            self._embedding = Embedding(str(embedding_path), logger=logger)

        print(
            f"Begin to build segment and ..... feature engineer .... ngram ....."
        )
        self._segment = Segment_jieba(user_dict=str(self._data_dir /
                                                    user_dict))

        if training:
            print(f"Begin to build vocab")
            self._vocab = Vocab(str(self._data_dir / 'RAW' / vocab_path),
                                self._segment, self._embedding)
            self.word2idx, self.idx2word = self._vocab.word2idx, self._vocab.idx2word
            dump_to_pickle(str(self._data_dir / 'vocab.pkl'),
                           (self.word2idx, self.idx2word), self.reset)
        else:
            print(f"load the vocab")
            (self.word2idx, self.idx2word) = load_from_pickle(
                str(self._data_dir / 'vocab.pkl'))

        self.vocab_size = len(self.word2idx)
        if training:
            filename = str(self._data_dir / 'RAW' / data_path)
            # train_test_split and exist
            self._get_train_and_test(filename, test_split)
Example #22
def main(args):
    print("Load Tokenizer and Define Variables.")
    ## by arguments
    if args.lang == 'ko':
        tokenizer = ko.Tokenizer()
    else:
        raise ValueError(
            "Wrong arguments for --lang. Please pass 'ko' for --lang arguments."
        )
    processed_path = args.path

    ## etc
    emo = emoji.get_emoji_regexp()
    now = datetime.now()

    ## Load data for synthesio
    cols = ['Mention Title', 'Mention Content']
    df = pd.read_parquet('data/Korean.parquet', columns=cols)
    df = df.fillna('')
    docs = [doc for doc in df['Mention Title'] + ' ' + df['Mention Content']]

    print("Tokenize the documents and build the vocab.")
    with Pool(processes=os.cpu_count()) as pool:
        tokenized_docs = pool.map(tokenizer.tokenize, docs)

    token_counts = Counter(list(zip(*chain(*tokenized_docs)))[0]).most_common()
    vocab = Vocab(list_of_tokens=[
        token for token, count in token_counts if count >= int(args.min_count)
    ],
                  token_to_idx={
                      '[PAD]': 0,
                      '[UNK]': 1
                  })
    vocab.lexeme['is_Emoji'] = [
        True if emo.fullmatch(term) != None else False
        for term in vocab.idx_to_token
    ]
    vocab.lexeme['is_Digit'] = [
        True if re.fullmatch(r'[\d\,\.]+', term) != None else False
        for term in vocab.idx_to_token
    ]
    vocab.lexeme['is_Punct'] = [
        True
        if re.fullmatch(rf'[{string.punctuation}]+', term) != None else False
        for term in vocab.idx_to_token
    ]

    print(f"Build the new vocab vocab-size : {len(vocab)}")
    with open(f"{processed_path}/vocab-{now:%Y%m%d}.pkl", 'wb') as f:
        pickle.dump(vocab, f)
Example #23
def create_vocab(data_path, min_count=3):
    root_rel = None
    wd_vocab = Vocab(min_count, eos=None)
    char_vocab = Vocab(min_count, eos=None)
    tag_vocab = Vocab(eos=None)
    rel_vocab = Vocab(bos=None, eos=None)
    with open(data_path, 'r', encoding='utf-8') as fr:
        for deps in read_deps(fr):
            for dep in deps:
                wd_vocab.add(dep.form)
                char_vocab.add(list(dep.form))
                tag_vocab.add(dep.pos_tag)

                if dep.head != 0:
                    rel_vocab.add(dep.dep_rel)
                elif root_rel is None:
                    root_rel = dep.dep_rel
                    rel_vocab.add(dep.dep_rel)
                elif root_rel != dep.dep_rel:
                    print('root = ' + root_rel + ', rel for root = ' +
                          dep.dep_rel)

    return MultiVocab(
        dict(word=wd_vocab, char=char_vocab, tag=tag_vocab, rel=rel_vocab))
Example #24
    def __init__(self, d_model, attn_dropout=0.1, temper_value=0.5):
        super(ScaledDotProductAttention, self).__init__()

        # add temper as hyperparameter
        self.temper = np.power(d_model, temper_value)  # 0.5 originally
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=2)

        # this is only used in attention investigation
        # TODO: set it as a flag in runner.py
        vocab_file = 'dataset/vocab/vocab.pkl'
        self.vocab = Vocab(vocab_file, load=True)

        self.tanh = nn.Tanh()
        self.conv = nn.Conv2d(240, kernel_size=1, out_channels=1)
Example #25
    def __init__(self, test=False, data_dir="data", vocab_path='data/vocab'):
        super(Articles, self).__init__()
        '''Initialization'''
        self.vocab = Vocab(vocab_path, voc_size)
        self.tokenizer = data.get_tokenizer('basic_english')
        self.max_len_story = MAX_LEN_STORY
        self.max_len_highlight = MAX_LEN_HIGHLIGHT

        is_test = {
            False: os.path.join(data_dir, "train.pkl"),
            True: os.path.join(data_dir, "test.pkl")
        }
        self.data_path = is_test.get(test, "Wrong set name.")

        with open(self.data_path, 'rb') as f:
            self.data = load(f)
Example #26
def main(args):
    with open(args.pkl_path, "rb") as f:
        labels = pickle.load(f)
    print("pickle loaded")

    tsv_path = get_eval_path(args.ref)
    dfref = pd.read_table(tsv_path)

    if args.vocab is not None:
        vocab = Vocab(args.vocab)
    else:
        vocab = None

    acc1, acck, cnt = accuracy(labels, dfref, vocab=vocab)

    print(f"{cnt:d} tokens")
    print(f"Accuracy top1: {acc1:.3f} topk: {acck:.3f}")
Example #27
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', required=True)
    parser.add_argument('--vocab', required=True)
    parser.add_argument('--vocab-size', required=True, type=int)
    parser.add_argument('--max-length', required=True, type=int)
    parser.add_argument('--out', required=True)
    args = parser.parse_args()

    word_vocab = Vocab.from_file(path=args.vocab, add_pad=True, add_unk=True,
                                 max_size=args.vocab_size)
    label_dict = {'neutral': 0, 'entailment': 1, 'contradiction': 2}
    label_vocab = Vocab(vocab_dict=label_dict, add_pad=False, add_unk=False)
    data_reader = SNLIDataset(
        data_path=args.data, word_vocab=word_vocab, label_vocab=label_vocab,
        max_length=args.max_length)
    with open(args.out, 'wb') as f:
        pickle.dump(data_reader, f)
Example #28
    def build(self, corpus, min_freq=1, embed=None):
        sequences = getattr(corpus, self.name)
        counter = Counter(char for sequence in sequences for token in sequence
                          for char in self.transform(token))
        self.vocab = Vocab(counter, min_freq, self.specials)

        if not embed:
            self.embed = None
        else:
            tokens = self.transform(embed.tokens)
            # if the `unk` token already exists in the pretrained vocab,
            # replace it with our self-defined one
            if embed.unk:
                tokens[embed.unk_index] = self.unk

            self.vocab.extend(tokens)
            self.embed = torch.zeros(len(self.vocab), embed.dim)
            self.embed[self.vocab.token2id(tokens)] = embed.vectors
Example #29
def run(do_train, do_eval, do_predict, ckpt, get_rouge, max_epochs=100):
    train_set = Articles(test=False)
    test_set = Articles(test=True)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=False, num_workers=1)
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=1)

    encoder = Encoder()
    attention_decoder = AttnDecoder()
    model = PointerGenerator(encoder, attention_decoder)
    model.to(device)
    optimizer = torch.optim.Adagrad(model.parameters(), lr=lr)
    loss_function = torch.nn.NLLLoss()

    if ckpt:
        model, optimizer, epoch = load_ckp(checkpoint_path=ckpt, model=model, optimizer=optimizer)
        if do_eval:
            eval(test_loader, model, loss_function)
        elif do_predict:
            vocab = Vocab('data/vocab', voc_size)
            batch = next(iter(train_loader))
            story, highlight = batch
            batcher = Batcher(story, highlight, vocab)
            stories, highlights, extra_zeros, story_extended, highlight_extended, vocab_extended = batcher.get_batch(
                get_vocab_extended=True)

            stories = stories.to(device)
            highlights = highlights.to(device)
            story_extended = story_extended.to(device)
            extra_zeros = extra_zeros.to(device)

            # stories, highlights = get_random_sentences(test_set, batch_size)
            with torch.no_grad():
                output = model(stories, highlights, story_extended, extra_zeros)

            get_batch_prediction(stories, output, highlights)
    else:
        # without a checkpoint there is no stored epoch counter, so start from scratch
        epoch = 0

    if get_rouge:
        get_rouge_files(model, test_loader)
        get_rouge_score()

    if do_train:
        train(train_loader, test_loader, loss_function, model, optimizer, epoch, num_epochs=max_epochs - epoch)
Example #30
    def __init__(self, args):
        super().__init__()
        self.args = args
        self.K = args.K
        self.L = args.L
        self.rnn_hidden = args.rnn_hidden
        self.max_sent_len = args.max_sent_len
        print("loading pretrained emb......")
        self.emb_matrix = np.load(args.dset_dir + '/' + args.dataset +
                                  '/embedding.npy')
        print("loading dataset vocab......")
        self.vocab = Vocab(args.dset_dir + '/' + args.dataset + '/vocab.pkl')

        # create embedding layers
        self.emb = nn.Embedding(self.vocab.size,
                                args.emb_dim,
                                padding_idx=constant.PAD_ID)
        self.pos_emb = nn.Embedding(len(constant.POS_TO_ID),
                                    args.pos_dim) if args.pos_dim > 0 else None

        # initialize embedding with pretrained word embeddings
        self.init_embeddings()

        # dropout
        self.input_dropout = nn.Dropout(args.input_dropout)

        # define r distribution
        self.r_var = self.K * self.max_sent_len
        self.r_mean = nn.Parameter(torch.randn(self.max_sent_len, self.K))
        self.r_std = nn.Parameter(
            torch.randn(self.max_sent_len * self.K, self.L))

        # define encoder
        self.BiLSTM = LSTMRelationModel(args)
        self.hidden2mean = nn.Linear(self.rnn_hidden * 2, self.K)
        self.hidden2std = nn.Linear(self.rnn_hidden * 2, self.K)

        # decoder
        self.layer_rc1 = nn.Linear(args.K * 2, args.K)
        self.rc_cla = nn.Linear(args.K, len(constant.LABEL_TO_ID))
        self.layer_ner1 = nn.Linear(args.K, args.K // 2)
        self.ner_cla = nn.Linear(args.K // 2, len(constant.BIO_TO_ID))
        self.logsoft_fn = nn.LogSoftmax(dim=3)