Example #1
    def __init__(self, path=None):
        if path is None:
            path = 'data/HowNet.txt'
        self.word2idx = {}
        self.idx2word = []
        self.idx2freq = []
        self.idx2senses = []
        self.threshold = -1
        self.sememe_dict = Dictionary()
        self.threshold = 0
        file = open(path)
        phase = 0
        re_chn = re.compile(u'[^\u4e00-\u9fa5]')
        cur_word = ''

        # add sememe for special tokens
        self.add_word('<unk>', ['<unk>'])
        self.add_word('<eos>', ['<eos>'])
        self.add_word('<N>', ['基数'])
        self.add_word('<year>', ['时间', '年', '特定'])
        self.add_word('<date>', ['时间', '月', '特定'])
        self.add_word('<hour>', ['时间', '时', '特定'])
        self.add_word('(', ['标点'])
        self.add_word('『', ['标点'])
        self.add_word('……', ['标点'])
        self.add_word('●', ['标点'])
        self.add_word('《', ['标点'])
        self.add_word('—', ['标点'])
        self.add_word('———', ['标点'])
        self.add_word('』', ['标点'])
        self.add_word('》', ['标点'])
        self.add_word('△', ['标点'])
        self.add_word('、', ['标点'])
        self.add_word(')', ['标点'])
        self.add_word('℃', ['标点'])
        self.add_word('▲', ['标点'])

        for line in file.readlines():
            if line[0:3] == 'NO.':
                phase = 1
                continue      # new word
            if phase == 1 and line[0:3] == 'W_C':
                phase = 2
                word = line[4:-1]
                if word == '':
                    phase = 0
                else:
                    cur_word = word
                continue
            if phase == 2 and line[0:3] == 'DEF':
                phase = 3
                content = line[4:-1]
                sememes = re_chn.split(content)
                sememe_bag = []
                for sememe in sememes:
                    if sememe != '':
                        sememe_bag += [sememe]
                if cur_word != '':
                    self.add_word(cur_word, sememe_bag)
        self.sememe_dict.idx2freq = [0] * len(self.sememe_dict)
Example #2
def build_word_dict(args, examples):
    """Return a word dictionary from question and document words in
    provided examples.
    """
    word_dict = Dictionary()
    for w in load_words(args, examples):
        word_dict.add(w)
    return word_dict
Example #3
def build_char_dict(args, examples):
    """Return a char dictionary from question and document words in
    provided examples.
    """
    char_dict = Dictionary()
    for c in load_chars(args, examples):
        char_dict.add(c)
    return char_dict
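
A small convenience wrapper tying the two builders together; a sketch only, assuming `args` and `examples` are the same objects passed to the functions above and that `Dictionary` supports `len()` (as other examples on this page do):

def build_dicts(args, examples):
    # Convenience wrapper over the two builders above.
    word_dict = build_word_dict(args, examples)
    char_dict = build_char_dict(args, examples)
    print('words: %d, chars: %d' % (len(word_dict), len(char_dict)))
    return word_dict, char_dict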
Example #4
 def _insertfull(iterable):
     for w in iterable:
         w = Dictionary.normalize(w)
         for c in w:
             c = Dictionary.normalize(c)
             if valid_chars and c not in valid_chars:
                 continue
             chars.add(c)
Example #5
 def _insert(iterable):
     for cs in iterable:
         for c in cs:
             c = Dictionary.normalize(c)
             if valid_chars and c not in valid_chars:
                 continue
             chars.add(c)
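
`_insertfull` and `_insert` read `chars` and `valid_chars` from an enclosing scope; a self-contained sketch of that enclosing pattern (the function name `build_char_set` is illustrative, `Dictionary.normalize` as used above):

def build_char_set(iterables, valid_chars=None):
    """Collect normalized characters from several iterables of tokens."""
    chars = set()

    def _insert(iterable):
        for cs in iterable:
            for c in cs:
                c = Dictionary.normalize(c)
                if valid_chars and c not in valid_chars:
                    continue
                chars.add(c)

    for it in iterables:
        _insert(it)
    return chars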
Example #6
def index_embedding_words(embedding_file):
    """Put all the words in embedding_file into a set."""
    words = set()
    with open(embedding_file) as f:
        for line in f:
            w = Dictionary.normalize(line.rstrip().split(' ')[0])
            words.add(w)
    return words
Example #7
def index_embedding_chars(char_embedding_file):
    """Put all the chars in char_embedding_file into a set."""
    chars = set()
    with open(char_embedding_file) as f:
        for line in f:
            c = Dictionary.normalize(line.rstrip().split(' ')[0])
            chars.add(c)
    return chars
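
Both indexers assume a GloVe/word2vec-style text file with one entry per line: a token followed by space-separated vector values. A usage sketch (the file path and the `tokens()` accessor are assumptions):

emb_words = index_embedding_words('embeddings/vectors.txt')  # illustrative path
covered = [w for w in word_dict.tokens() if w in emb_words]
print('embedding coverage: %d / %d' % (len(covered), len(word_dict)))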
Example #8
def top_question_words(args, examples, word_dict):
    """Count and return the most common question words in provided examples."""
    word_count = Counter()
    for ex in examples:
        for w in ex['question']:
            w = Dictionary.normalize(w)
            if w in word_dict:
                word_count.update([w])
    return word_count.most_common(args.tune_partial)
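
The (word, count) pairs returned here are typically the question words whose embeddings get fine-tuned during training; a minimal sketch, assuming `args.tune_partial` is an integer as above:

for w, c in top_question_words(args, examples, word_dict):
    print('%s\t%d' % (w, c))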
Example #9
def opts2params(opts, dictionary: data.Dictionary):
    """Convert command line options to a dictionary to construct a model"""
    params = {
        "rnn_type": opts.rnn_type,
        "direction": opts.direction,
        "tok_len": dictionary.tok_len(),
        "tok_emb": opts.tok_emb,
        "tok_hid": opts.tok_hid,
        "char_len": dictionary.char_len(),
        "char_emb": opts.char_emb,
        "char_hid": opts.char_hid,
        "char_kmin": opts.char_kmin,
        "char_kmax": opts.char_kmax,
        "wo_char": opts.wo_char,
        "wo_tok": opts.wo_tok,
        "nlayers": opts.nlayers,
        "dropout": opts.dropout,
        "init_range": opts.init_range,
        "tied": opts.tied
    }
    return params
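
Since every value in the params dict is a plain int, float, bool, or string, it can be dumped next to the checkpoint and used to rebuild the model later; a sketch (the file name is illustrative):

import json

params = opts2params(opts, dictionary)  # `opts` and `dictionary` as above
with open('model_params.json', 'w') as f:
    json.dump(params, f, indent=2)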
Example #10
    def __set_corpus(self):
        pre_dict = Dictionary()
        for lines in self.text_list:
            for line in lines:
                if len(line) > 0:
                    words = line.split()
                    #tokens += len(words)
                    for word in words:
                        pre_dict.add_word(word)

        pro_dict = Dictionary()
        for key in pre_dict.count:
            if (pre_dict.count[key] > 10):
                pro_dict.add_word(key)
        self.corpus = pro_dict
Example #11
def index_embedding_words(embedding_file):
    """Put all the words in embedding_file into a set."""
    words = set()
    counter = 0
    try:
        with open(embedding_file, encoding="utf-8") as f:
            for line in f:
                counter += 1
                w = Dictionary.normalize(line.rstrip().split(' ')[0])
                words.add(w)
    except Exception:
        print("An exception occurred on word {}".format(counter))

    return words
Example #12
    def load(filename, new_args=None, normalize=True):
        logger.info('Loading model %s' % filename)
        saved_params = torch.load(filename,
                                  map_location=lambda storage, loc: storage)
        word_dict = saved_params['word_dict']
        try:
            char_dict = saved_params['char_dict']
        except KeyError as e:
            char_dict = Dictionary()

        feature_dict = saved_params['feature_dict']
        state_dict = saved_params['state_dict']
        args = saved_params['args']
        if new_args:
            args = override_model_args(args, new_args)
        return DocReader(args, word_dict, char_dict, feature_dict, state_dict,
                         normalize)
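
Assuming `load` is exposed as a static method of `DocReader` (as its return value suggests), restoring a saved reader looks roughly like this (the path is illustrative):

reader = DocReader.load('models/reader.mdl')
# Checkpoints saved without a char dictionary fall back to an empty
# Dictionary via the KeyError branch above.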
Example #13
 def init_vocab(self, path, language):
     """ Initialize an instance of Dictionary, which create
     (or retrieve) the dictionary associated with the data used.
     """
     self.vocab = Dictionary(path, language)
Example #14
    def forward(self, data):
        embed = list(map(self.embed, data))  # materialize so it can be reused below
        if self.encoder == 'BOTH':
            c_post, c_cmnt, c_neg = map(self.cnn, embed)
            r_post, r_cmnt, r_neg = map(self.rnn, embed)
            post_enc = torch.cat((c_post, r_post), 1)
            cmnt_enc = torch.cat((c_cmnt, r_cmnt), 1)
            neg_enc = torch.cat((c_neg, r_neg), 1)
        else:
            post_enc, cmnt_enc, neg_enc = map(self.encoder, embed)

        return map(normalize, (post_enc, cmnt_enc, neg_enc))


if __name__ == '__main__':
    dic = Dictionary('./full_dataset/train.vocab')
    batch_size = 10
    seq_len = 30
    cuda = False
    data_iter = DataIter(corpus_path='./full_dataset/tmp.txt',
                         batch_size=batch_size,
                         seq_len=seq_len,
                         dictionary=dic,
                         cuda=cuda)
    ntokens = len(dic)
    enc = DSSM(ntokens=ntokens,
               nemb=300,
               sent_len=seq_len,
               dropout=0.5,
               pre_embed=None,
               encoder='CNN',
Example #15
def main():
    parser = argparse.ArgumentParser(
        description='Train a neural machine translation model')

    # Training corpus
    corpora_group = parser.add_argument_group(
        'training corpora',
        'Corpora related arguments; specify either monolingual or parallel training corpora (or both)'
    )
    corpora_group.add_argument('--src_path',
                               help='the source language monolingual corpus')
    corpora_group.add_argument('--trg_path',
                               help='the target language monolingual corpus')
    corpora_group.add_argument(
        '--max_sentence_length',
        type=int,
        default=90,
        help='the maximum sentence length for training (defaults to 90)')

    # Embeddings/vocabulary
    embedding_group = parser.add_argument_group(
        'embeddings',
        'Embedding related arguments; either give pre-trained cross-lingual embeddings, or a vocabulary and embedding dimensionality to randomly initialize them'
    )
    embedding_group.add_argument('--src_vocabulary',
                                 help='the source language vocabulary')
    embedding_group.add_argument('--trg_vocabulary',
                                 help='the target language vocabulary')
    embedding_group.add_argument('--embedding_size',
                                 type=int,
                                 default=0,
                                 help='the word embedding size')

    # Architecture
    architecture_group = parser.add_argument_group(
        'architecture', 'Architecture related arguments')
    architecture_group.add_argument(
        '--layers',
        type=int,
        default=2,
        help='the number of encoder/decoder layers (defaults to 2)')
    architecture_group.add_argument(
        '--enc_hid_dim',
        type=int,
        default=512,
        help='the number of dimensions for the encoder hidden layer (defaults to 512)')
    architecture_group.add_argument(
        '--dec_hid_dim',
        type=int,
        default=512,
        help='the number of dimensions for the decoder hidden layer (defaults to 512)')

    # Optimization
    optimization_group = parser.add_argument_group(
        'optimization', 'Optimization related arguments')
    optimization_group.add_argument('--batch_size',
                                    type=int,
                                    default=128,
                                    help='the batch size (defaults to 128)')
    optimization_group.add_argument(
        '--learning_rate',
        type=float,
        default=0.0002,
        help='the global learning rate (defaults to 0.0002)')
    optimization_group.add_argument(
        '--dropout',
        metavar='PROB',
        type=float,
        default=0.4,
        help='dropout probability for the encoder/decoder (defaults to 0.4)')
    optimization_group.add_argument(
        '--param_init',
        metavar='RANGE',
        type=float,
        default=0.1,
        help=
        'uniform initialization in the specified range (defaults to 0.1,  0 for module specific default initialization)'
    )
    optimization_group.add_argument(
        '--iterations',
        type=int,
        default=50,
        help='the number of training iterations (defaults to 50)')
    # Model saving
    saving_group = parser.add_argument_group(
        'model saving', 'Arguments for saving the trained model')
    saving_group.add_argument('--save_path',
                              metavar='PREFIX',
                              help='save models with the given prefix')
    saving_group.add_argument('--save_interval',
                              type=int,
                              default=0,
                              help='save intermediate models at this interval')
    saving_group.add_argument('--model_init_path', help='model init path')

    # Logging/validation
    logging_group = parser.add_argument_group(
        'logging', 'Logging and validation arguments')
    logging_group.add_argument('--log_interval',
                               type=int,
                               default=1000,
                               help='log at this interval (defaults to 1000)')
    logging_group.add_argument('--validate_batch_size',
                               type=int,
                               default=1,
                               help='the validation batch size (defaults to 1)')
    corpora_group.add_argument('--inference_output',
                               help='the file to write inference results to')
    corpora_group.add_argument('--validation_src_path',
                               help='the validation set source corpus')
    corpora_group.add_argument('--validation_trg_path',
                               help='the validation set target corpus')

    # Other
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--cuda',
                        default=False,
                        action='store_true',
                        help='use cuda')
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument("--type",
                        type=str,
                        default='train',
                        help="type: train/inference/debug")

    args = parser.parse_args()
    print(args)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    src_dictionary = Dictionary(
        [word.strip() for word in open(args.src_vocabulary).readlines()])
    trg_dictionary = Dictionary(
        [word.strip() for word in open(args.trg_vocabulary).readlines()])

    def init_weights(m):
        for name, param in m.named_parameters():
            if 'weight' in name:
                nn.init.normal_(param.data, mean=0, std=0.01)
            else:
                nn.init.constant_(param.data, 0)

    if not args.model_init_path:
        attn = Attention(args.enc_hid_dim, args.dec_hid_dim)
        enc = Encoder(src_dictionary.size(), args.embedding_size,
                      args.enc_hid_dim, args.dec_hid_dim, args.dropout,
                      src_dictionary.PAD)
        dec = Decoder(trg_dictionary.size(), args.embedding_size,
                      args.enc_hid_dim, args.dec_hid_dim, args.dropout, attn)
        s2s = Seq2Seq(enc, dec, src_dictionary.PAD, device)
        parallel_model = Parser(src_dictionary, trg_dictionary, s2s, device)
        parallel_model.apply(init_weights)

    else:
        print(f"load init model from {args.model_init_path}")
        parallel_model = torch.load(args.model_init_path)

    parallel_model = parallel_model.to(device)

    if args.type == TEST:
        test_dataset = treeDataset(args.validation_src_path,
                                   args.validation_trg_path)
        test_dataloader = DataLoader(test_dataset,
                                     shuffle=False,
                                     batch_size=args.validate_batch_size,
                                     collate_fn=collate_fn)
        hit, total, acc = evaluate_iter_loss2(parallel_model, test_dataloader,
                                              src_dictionary, trg_dictionary,
                                              device)
        print(f'hit: {hit: d} |  total: {total: d} | acc: {acc: f}',
              flush=True)

    elif args.type == INFERENCE:
        test_dataset = customDataset(args.validation_src_path,
                                     args.validation_trg_path)
        test_dataloader = DataLoader(test_dataset,
                                     shuffle=False,
                                     batch_size=args.validate_batch_size)
        hit, total, acc = evaluate_iter_acc(parallel_model, test_dataloader,
                                            src_dictionary, trg_dictionary,
                                            device, args.inference_output)
        print(f'hit: {hit: d} |  total: {total: d} | acc: {acc: f}',
              flush=True)
    elif args.type == DEBUG:
        test_dataset = treeDataset(args.validation_src_path,
                                   args.validation_trg_path)
        test_dataloader = DataLoader(test_dataset,
                                     shuffle=False,
                                     batch_size=args.validate_batch_size,
                                     collate_fn=collate_fn)
        hit, total, acc = debug_iter(parallel_model, test_dataloader,
                                     src_dictionary, trg_dictionary, device)
        print(f'hit: {hit: d} |  total: {total: d} | acc: {acc: f}',
              flush=True)

    else:
        train_dataset = treeDataset(args.src_path, args.trg_path)
        train_dataloader = DataLoader(train_dataset,
                                      shuffle=True,
                                      batch_size=args.batch_size,
                                      collate_fn=collate_fn)
        test_dataset = treeDataset(args.validation_src_path,
                                   args.validation_trg_path)
        test_dataloader = DataLoader(test_dataset,
                                     shuffle=False,
                                     batch_size=args.validate_batch_size,
                                     collate_fn=collate_fn)

        train(src_dictionary, trg_dictionary, train_dataloader,
              test_dataloader, parallel_model, device, args)
Example #16
 def test_load(self):
     Dictionary.load(dict_file)
Example #17
torch.manual_seed(args.seed)
random.seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

# Config to run
config = Config()
if os.path.isfile(args.save):
    checkpoint = torch.load(args.save)
    if 'config' in checkpoint:
        print("Loading saved config")
        config = checkpoint['config']
print(config)

# Dictionary and corpus
dictionary = Dictionary()
training_corpus = Corpus(args.data + "/train.txt",
                         dictionary,
                         create_dict=True,
                         use_cuda=args.cuda,
                         n_gram=config.n_gram,
                         context_mode=config.context_mode)
validation_corpus = Corpus(args.data + "/valid.txt",
                           dictionary,
                           create_dict=True,
                           use_cuda=args.cuda,
                           n_gram=config.n_gram,
                           context_mode=config.context_mode)

# TensorboardX object
writer = SummaryWriter("saved_runs/" + args.save)
Example #18
def main(args: argparse.Namespace):
    # Load input data
    with open(args.train_metadata, 'r') as f:
        train_posts = json.load(f)

    with open(args.val_metadata, 'r') as f:
        val_posts = json.load(f)

    # Load labels
    labels = {}
    with open(args.label_intent, 'r') as f:
        intent_labels = json.load(f)
        labels['intent'] = {}
        for label in intent_labels:
            labels['intent'][label] = len(labels['intent'])

    with open(args.label_semiotic, 'r') as f:
        semiotic_labels = json.load(f)
        labels['semiotic'] = {}
        for label in semiotic_labels:
            labels['semiotic'][label] = len(labels['semiotic'])

    with open(args.label_contextual, 'r') as f:
        contextual_labels = json.load(f)
        labels['contextual'] = {}
        for label in contextual_labels:
            labels['contextual'][label] = len(labels['contextual'])

    # Build dictionary from training set
    train_captions = []
    for post in train_posts:
        train_captions.append(post['orig_caption'])
    dictionary = Dictionary(tokenizer_method="TreebankWordTokenizer")
    dictionary.build_dictionary_from_captions(train_captions)

    # Set up torch device
    if 'cuda' in args.device and torch.cuda.is_available():
        device = torch.device(args.device)
        kwargs = {'pin_memory': True}
    else:
        device = torch.device('cpu')
        kwargs = {}

    # Set up number of workers
    num_workers = min(multiprocessing.cpu_count(), args.num_workers)

    # Set up data loaders differently based on the task
    # TODO: Extend to ELMo + word2vec etc.
    if args.type == 'image_only':
        train_dataset = ImageOnlyDataset(train_posts, labels)
        val_dataset = ImageOnlyDataset(val_posts, labels)
        train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                        batch_size=args.batch_size,
                                                        shuffle=args.shuffle,
                                                        num_workers=num_workers,
                                                        collate_fn=collate_fn_pad_image_only,
                                                        **kwargs)
        val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                    batch_size=1,
                                                    num_workers=num_workers,
                                                    collate_fn=collate_fn_pad_image_only,
                                                    **kwargs)
    elif args.type == 'image_text':
        train_dataset = ImageTextDataset(train_posts, labels, dictionary)
        val_dataset = ImageTextDataset(val_posts, labels, dictionary)
        train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                        batch_size=args.batch_size,
                                                        shuffle=args.shuffle,
                                                        num_workers=num_workers,
                                                        collate_fn=collate_fn_pad_image_text,
                                                        **kwargs)
        val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                    batch_size=1,
                                                    num_workers=num_workers,
                                                    collate_fn=collate_fn_pad_image_text,
                                                    **kwargs)
    elif args.type == 'text_only':
        train_dataset = TextOnlyDataset(train_posts, labels, dictionary)
        val_dataset = TextOnlyDataset(val_posts, labels, dictionary)
        train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                        batch_size=args.batch_size,
                                                        shuffle=args.shuffle,
                                                        num_workers=num_workers,
                                                        collate_fn=collate_fn_pad_text_only,
                                                        **kwargs)
        val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                    batch_size=1,
                                                    num_workers=num_workers,
                                                    collate_fn=collate_fn_pad_text_only,
                                                    **kwargs)

    # Set up the model
    model = Model(vocab_size=dictionary.size()).to(device)

    # Set up an optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_scheduler_step_size, gamma=args.lr_scheduler_gamma) # decay by 0.1 every 15 epochs

    # Set up loss function
    loss_fn = torch.nn.CrossEntropyLoss()

    # Setup tensorboard
    if args.tensorboard:
        writer = tensorboard.SummaryWriter(log_dir=args.log_dir + "/" + args.name, flush_secs=1)
    else:
        writer = None

    # Training loop
    if args.classification == 'intent':
        keys = ['intent']
    elif args.classification == 'semiotic':
        keys = ['semiotic']
    elif args.classification == 'contextual':
        keys = ['contextual']
    elif args.classification == 'all':
        keys = ['intent', 'semiotic', 'contextual']
    else:
        raise ValueError("args.classification doesn't exist.")
    best_auc_ovr = 0.0
    best_auc_ovo = 0.0
    best_acc = 0.0
    best_model = None
    best_optimizer = None
    best_scheduler = None
    for epoch in range(args.epochs):
        for mode in ["train", "eval"]:
            # Set up a progress bar
            if mode == "train":
                pbar = tqdm.tqdm(enumerate(train_data_loader), total=len(train_data_loader))
                model.train()
            else:
                pbar = tqdm.tqdm(enumerate(val_data_loader), total=len(val_data_loader))
                model.eval()

            total_loss = 0
            label = dict.fromkeys(keys, np.array([], dtype=int))
            pred = dict.fromkeys(keys, None)
            for _, batch in pbar:
                if 'caption' not in batch:
                    caption_data = None
                else:
                    caption_data = batch['caption'].to(device)
                if 'image' not in batch:
                    image_data = None
                else:
                    image_data = batch['image'].to(device)
                label_batch = {}
                for key in keys:
                    label_batch[key] = batch['label'][key].to(device)
                    
                if mode == "train":
                    model.zero_grad()

                pred_batch = model(image_data, caption_data)
                
                for key in keys:
                    label[key] = np.concatenate((label[key], batch['label'][key].cpu().numpy()))
                    x = pred_batch[key].detach().cpu().numpy()
                    x_max = np.max(x, axis=1).reshape(-1, 1)
                    z = np.exp(x - x_max)
                    prediction_scores = z / np.sum(z, axis=1).reshape(-1, 1)
                    if pred[key] is not None:
                        pred[key] = np.vstack((pred[key], prediction_scores))
                    else:
                        pred[key] = prediction_scores
                       
                loss_batch = {}
                loss = None
                for key in keys:
                    loss_batch[key] = loss_fn(pred_batch[key], label_batch[key])
                    if loss is None:
                        loss = loss_batch[key]
                    else:
                        loss += loss_batch[key]

                total_loss += loss.item()

                if mode == "train":
                    loss.backward()
                    optimizer.step()

            # Terminate the progress bar
            pbar.close()
            
            # Update lr scheduler
            if mode == "train":
                scheduler.step()

            for key in keys:
                auc_score_ovr = roc_auc_score(label[key], pred[key], multi_class='ovr') # pylint: disable-all
                auc_score_ovo = roc_auc_score(label[key], pred[key], multi_class='ovo') # pylint: disable-all
                accuracy = accuracy_score(label[key], np.argmax(pred[key], axis=1))
                print("[{} - {}] [AUC-OVR={:.3f}, AUC-OVO={:.3f}, ACC={:.3f}]".format(mode, key, auc_score_ovr, auc_score_ovo, accuracy))
                
                if mode == "eval":
                    best_auc_ovr = max(best_auc_ovr, auc_score_ovr)
                    best_auc_ovo = max(best_auc_ovo, auc_score_ovo)
                    best_acc = max(best_acc, accuracy)
                    best_model = model
                    best_optimizer = optimizer
                    best_scheduler = scheduler
                
                if writer:
                    writer.add_scalar('AUC-OVR/{}-{}'.format(mode, key), auc_score_ovr, epoch)
                    writer.add_scalar('AUC-OVO/{}-{}'.format(mode, key), auc_score_ovo, epoch)
                    writer.add_scalar('ACC/{}-{}'.format(mode, key), accuracy, epoch)
                    writer.flush()

            if writer:
                writer.add_scalar('Loss/{}'.format(mode), total_loss, epoch)
                writer.flush()

            print("[{}] Epoch {}: Loss = {}".format(mode, epoch, total_loss))

    hparam_dict = {
        'train_split': args.train_metadata,
        'val_split': args.val_metadata,
        'lr': args.lr,
        'epochs': args.epochs,
        'batch_size': args.batch_size,
        'num_workers': args.num_workers,
        'shuffle': args.shuffle,
        'lr_scheduler_gamma': args.lr_scheduler_gamma,
        'lr_scheduler_step_size': args.lr_scheduler_step_size,
    }
    metric_dict = {
        'AUC-OVR': best_auc_ovr,
        'AUC-OVO': best_auc_ovo,
        'ACC': best_acc
    }

    if writer:
        writer.add_hparams(hparam_dict=hparam_dict, metric_dict=metric_dict)
        writer.flush()
    
    Path(args.output_dir).mkdir(exist_ok=True)
    torch.save({
        'hparam_dict': hparam_dict,
        'metric_dict': metric_dict,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
    }, Path(args.output_dir) / '{}.pt'.format(args.name))
Example #19
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset, DataLoader
from data import Dictionary
from custom_embedder_recurrent import CustomEmbedder
from optimizer import RAdam
import tqdm
import transformers

tokenizer = transformers.GPT2Tokenizer.from_pretrained("gpt2")
gpt2 = transformers.GPT2Model.from_pretrained("gpt2")

embedding = gpt2.wte
vocab = tokenizer.decoder

dictionary = Dictionary()
dictionary.word2idx = {v: int(k) for k, v in vocab.items()}
dictionary.idx2word = {int(k): v for k, v in vocab.items()}

model = CustomEmbedder(dictionary, 768)
embedding.weight.requires_grad = False
model = model.cuda()
optimizer = RAdam(model.parameters(), lr=0.001)
writer = SummaryWriter()


class EDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
Example #20
}

ROOT = 'C:\\Users\\lenovo\\.qqbot-tmp\\plugins\\'

import configparser
config = configparser.ConfigParser()
config.read(ROOT + 'app.conf', encoding='utf8')
dict_path = config.get('DICTIONARY', 'dict_path')
model_save_path = config.get('MODEL', 'save_path')
embedding_dim = int(config.get('MODEL', 'embedding_dim'))
hidden_dim = int(config.get('MODEL', 'hidden_dim'))
num_layers = int(config.get('MODEL', 'num_layers'))

jieba.load_userdict(dict_path)

_dict = Dictionary([])
model_dict_path = os.path.join(model_save_path, 'freq1.dict')
_dict.load(model_dict_path)

rep = Replier(model_save_path, _dict, num_layers, embedding_dim, hidden_dim)
rep.load('freq1_(3.562772035598755)')

REPLY_TIME = {}
REPLY_TIME['ME'] = 0


def get_errmsg(key):
    limit = len(MSG[key]) - 1
    idx = random.randint(0, limit)
    return MSG[key][idx]
Example #21
from data import Dictionary, Word

dict = Dictionary()
words = [
    Word('a', []),
    Word('b', [{
        'tag': 'Noun',
        'defs': [{
            'def': 'abcdf',
            'examples': ['1', '2']
        }]
    }])
]
for word in words:
    dict.add(word)
dict.save('test_dict.yaml')
Example #22
class SememeDictionary(object):
    def __init__(self, path=None):
        if path is None:
            path = 'data/HowNet.txt'
        self.word2idx = {}
        self.idx2word = []
        self.idx2freq = []
        self.idx2senses = []
        self.threshold = -1
        self.sememe_dict = Dictionary()
        self.threshold = 0
        file = open(path)
        phase = 0
        re_chn = re.compile(u'[^\u4e00-\u9fa5]')
        cur_word = ''

        # add sememe for special tokens
        self.add_word('<unk>', ['<unk>'])
        self.add_word('<eos>', ['<eos>'])
        self.add_word('<N>', ['基数'])
        self.add_word('<year>', ['时间', '年', '特定'])
        self.add_word('<date>', ['时间', '月', '特定'])
        self.add_word('<hour>', ['时间', '时', '特定'])
        self.add_word('(', ['标点'])
        self.add_word('『', ['标点'])
        self.add_word('……', ['标点'])
        self.add_word('●', ['标点'])
        self.add_word('《', ['标点'])
        self.add_word('—', ['标点'])
        self.add_word('———', ['标点'])
        self.add_word('』', ['标点'])
        self.add_word('》', ['标点'])
        self.add_word('△', ['标点'])
        self.add_word('、', ['标点'])
        self.add_word(')', ['标点'])
        self.add_word('℃', ['标点'])
        self.add_word('▲', ['标点'])

        for line in file.readlines():
            if line[0:3] == 'NO.':
                phase = 1
                continue      # new word
            if phase == 1 and line[0:3] == 'W_C':
                phase = 2
                word = line[4:-1]
                if word == '':
                    phase = 0
                else:
                    cur_word = word
                continue
            if phase == 2 and line[0:3] == 'DEF':
                phase = 3
                content = line[4:-1]
                sememes = re_chn.split(content)
                sememe_bag = []
                for sememe in sememes:
                    if sememe != '':
                        sememe_bag += [sememe]
                if cur_word != '':
                    self.add_word(cur_word, sememe_bag)
        self.sememe_dict.idx2freq = [0] * len(self.sememe_dict)

    def senses_belong(self, sememes_bag, senses_bag):
        for i in range(len(senses_bag)):
            if len(set(sememes_bag + senses_bag[i])) == len(sememes_bag)\
                    and len(sememes_bag) == len(senses_bag[i]):
                return True
        return False

    def add_word(self, word, sememes_bag):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.idx2senses.append([])
            self.idx2freq.append(0)
            self.word2idx[word] = len(self.idx2word) - 1

        idx = self.word2idx[word]
        sememe_bag_idx = []
        for sememe in sememes_bag:
            sememe_bag_idx.append(self.sememe_dict.add_word(sememe))
        sememe_bag_idx = list(set(sememe_bag_idx))
        if not self.senses_belong(sememe_bag_idx, self.idx2senses[idx]):
            self.idx2senses[idx].append(sememe_bag_idx)

        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)

    def summary(self, print_sememes=False):
        print('=' * 69)
        print('-' * 31 + 'SUMMARY' + '-' * 31)
        print('Number of Sememes: {}'.format(len(self.sememe_dict)))
        print('Number of Words: {}'.format(len(self.idx2word)))
        tot_senses = 0
        tot_sememes = 0
        for i in range(len(self.idx2word)):
            tot_senses += len(self.idx2senses[i])
            for j in range(len(self.idx2senses[i])):
                tot_sememes += len(self.idx2senses[i][j])
        ws_ratio = (tot_senses + 0.0) / len(self.idx2word)
        ss_ratio = (tot_sememes + 0.0) / tot_senses
        print('Mean Senses per Word: {}'.format(ws_ratio))
        print('Mean Sememes per Sense: {}'.format(ss_ratio))
        print('=' * 69)
        if print_sememes:
            print(','.join(self.sememe_dict.idx2word))

    def exist(self, word):
        return word in self.word2idx

    def add_word_f(self, word):
        if word not in self.word2idx:
            raise ValueError("Word don't exist")
        idx = self.word2idx[word]
        for sense in self.idx2senses[idx]:
            for sememe in sense:
                self.sememe_dict.idx2freq[sememe] += 1
        self.idx2freq[self.word2idx[word]] += 1

    def query_count(self, word):
        if word not in self.word2idx:
            raise ValueError("Word don't exist")
        return self.idx2freq[self.word2idx[word]]

    def freq_le(self, k):
        tot = 0
        for idx in range(len(self.idx2word)):
            if self.idx2freq[idx] < k:
                tot += 1
        return tot

    def freq_ge(self, k):
        tot = 0
        for idx in range(len(self.idx2word)):
            if self.idx2freq[idx] >= k:
                tot += 1
        return tot

    def set_threshold(self, threshold):
        self.threshold = threshold

    def sememe_word_visit(self, word_dict):
        sememe_word = []
        sememe_sense = []
        for i in range(len(self.sememe_dict)):
            sememe_word.append([])
            sememe_sense.append([])
        maximum_senses = 0
        tot_senses = 0
        for word_id in range(len(self.word2idx)):
            if self.idx2freq[word_id] >= self.threshold:
                maximum_senses = max(maximum_senses, len(self.idx2senses[word_id]))
                for sense in self.idx2senses[word_id]:
                    for sememe in sense:
                        sememe_word[sememe].append(word_id)
                        sememe_sense[sememe].append(tot_senses)
                    tot_senses += 1
        tot = 0
        tot_sememes = 0
        max_words = 0
        a = []
        sememe_word_pair = [[], []]
        sememe_sense_pair = [[], []]
        sememe_idx = []
        word_sense = []
        for i in range(len(word_dict)):
            word_sense.append([])
        for i in range(len(self.sememe_dict)):
            cur_str = self.sememe_dict.idx2word[i]
            cur_str += ': '
            words = []
            for j in range(len(sememe_word[i])):
                word_id = sememe_word[i][j]
                sense_id = sememe_sense[i][j]
                words.append(self.idx2word[word_id])
                sememe_word_pair[0].append(tot_sememes)
                sememe_word_pair[1].append(word_dict[self.idx2word[word_id]])
                sememe_sense_pair[0].append(tot_sememes)
                sememe_sense_pair[1].append(sense_id)
                word_sense[word_dict[self.idx2word[word_id]]].append(sense_id)
            tot += len(sememe_word[i])
            max_words = max(max_words, len(sememe_word[i]))
            a += sememe_word[i]
            cur_str += ','.join(words)
            if len(set(sememe_word[i])) > 0:
                sememe_idx.append(tot_sememes)
            else:
                sememe_idx.append(-1)
            tot_sememes += len(sememe_word[i]) > 0
        for i in range(len(word_dict)):
            word_sense[i] = list(set(word_sense[i]))
        print('Total words: {}'.format(len(set(a))))
        print('Maximum words per sememe: {}'.format(max_words))
        print('Maximum senses per word: {}'.format(maximum_senses))
        print('Total respective sememes: {}'.format(tot_sememes))
        print('Total sememe-word pairs: {}'.format(tot))
        return sememe_word_pair, sememe_idx, sememe_sense_pair, word_sense

    def visit(self, word, mode='full'):
        if word not in self.word2idx:
            raise ValueError('No word!')
        idx = self.word2idx[word]
        if mode == 'sbag':
            sememes = []
            for sense in self.idx2senses[idx]:
                for sememe in sense:
                    sememes.append(sememe)
            sememes = set(sememes)
            sememes_str = []
            for sememe in sememes:
                sememes_str.append(self.sememe_dict.idx2word[sememe])
            print(word + ':' + ','.join(sememes_str))
        if mode == 'full':
            print('Word: ' + word + ', total {} senses'.
                format(len(self.idx2senses[idx])))
            for i in range(len(self.idx2senses[idx])):
                sememes_list = []
                for j in range(len(self.idx2senses[idx][i])):
                    sememes_list.append(
                        self.sememe_dict.idx2word[self.idx2senses[idx][i][j]])
                sememes = ','.join(sememes_list)
                print('Sense #{}: '.format(i + 1) + sememes)
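
A short usage sketch for the class above, assuming the default data/HowNet.txt dump exists in the format the constructor parses:

sd = SememeDictionary()              # reads data/HowNet.txt by default
sd.summary()                         # sememe / word / sense statistics
if sd.exist('<unk>'):
    sd.visit('<unk>', mode='sbag')   # print the sememe bag for a token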
Example #23
 def _insert(iterable):
     for w in iterable:
         w = Dictionary.normalize(w)
         if valid_words and w not in valid_words:
             continue
         words.add(w)
Example #24
train_labelized = labelize_reviews(train_sentences, 'TRAIN')
test_labelized = labelize_reviews(test, 'TEST')

size = 400
epoch_num = 30

make_pred = False

if False:
    model_dm, model_dbow = doc2vec_train(train_labelized, test_labelized, size, epoch_num)
else:
    model_dm, model_dbow = doc2vec_load(size)
    train_doc2vecs, test_doc2vecs = get_vectors(model_dm, model_dbow, train_labelized, test_labelized, size)

    dictionary = Dictionary()
    for sentence in train_sentences + test:
        for word in sentence:
            dictionary.add_word(word)
    dictionary.refactor(1)
    print('vocab size = %d' % len(dictionary))
    voc_len = len(dictionary)
    voc_len += 1
    train_vecs = np.zeros((len(train_sentences), voc_len))
    test_vecs = np.zeros((len(test), voc_len))
    for i in range(len(train_sentences)):
        sentence = train_sentences[i]
        for word in sentence:
            train_vecs[i, dictionary.word2idx[word]] += 1
            train_vecs[i, voc_len - 1] += 1
    for i in range(len(test)):
Example #25
if __name__ == '__main__':
    print('{:=^30}'.format('all args'))
    for arg in vars(args):
        print(' '.join(map(str, (arg, getattr(args, arg)))))
    # Set the random seed manually for reproducibility.
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print(
                "WARNING: You have a CUDA device, so you should probably run with --cuda"
            )
        else:
            torch.cuda.manual_seed(args.seed)

    corpus_path = args.data + '/'
    dictionary = Dictionary(corpus_path + 'train.vocab')

    train_iter = DataIter(
        corpus_path + 'train.txt',
        args.batch_size,
        args.seq_len,
        dictionary=dictionary,
        cuda=args.cuda,
    )
    valid_iter = DataIter(
        corpus_path + 'valid.txt',
        args.batch_size,
        args.seq_len,
        dictionary=dictionary,
        cuda=args.cuda,
    )
Example #26
    print("Arguments: \n ", args)
    print("Device:", device)

    query_files = [
        os.path.join(args.data, "train.query.txt"),
        os.path.join(args.data, "valid.query.txt"),
        os.path.join(args.data, "test.query.txt")
    ]

    if os.path.exists("./saved/dictionary.pkl"):
        print("Loading previously saved dictionary...")
        with open("./saved/dictionary.pkl", "rb") as f:
            dictionary = pickle.load(f)
    else:
        print("Creating dictionary...")
        dictionary = Dictionary(query_files)
        with open("./saved/dictionary.pkl", "wb") as f:
            pickle.dump(dictionary, f)
    
    nchar = len(dictionary)
    max_seq_len = dictionary.max_seq_len

    lr = args.lr
    clip = args.clip
    batch_size = args.batch_size
    eval_batch_size = 10
    best_val_loss = None

    if args.model == 'LSTM':
        model = LSTMModel(nchar, args.nhid, args.nlayers, max_seq_len, args.dropout)
        if args.load_latest:
Example #27
from os.path import dirname, join
from data import Dictionary

CORPUS_FOLDER = dirname(dirname(__file__))
HND_FOLDER = join(CORPUS_FOLDER, "data", "dictionaries", "hongocduc")
with open(join(HND_FOLDER, "words.txt")) as f:
    lines = f.read().splitlines()
for line in lines:
    open(line)
Dictionary.hi()
Example #28

if __name__ == '__main__':
    # Set the random seed manually for reproducibility.
    args = arg_parse()
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print(
                "WARNING: You have a CUDA device, so you should probably run with --cuda"
            )
        else:
            torch.cuda.manual_seed(args.seed)

    corpus_path = args.data + '/'
    dictionary = Dictionary(corpus_path + 'vocab.c.txt')

    eval_batch_size = 10

    train_iter = DataIter(
        corpus_path + 'train.txt',
        args.batch_size,
        dictionary=dictionary,
        cuda=args.cuda,
        training=True,
    )
    valid_iter = DataIter(
        corpus_path + 'valid.txt',
        eval_batch_size,
        dictionary=dictionary,
        cuda=args.cuda,
Example #29
    heads = mst(S)

    # Predict labels
    select = torch.LongTensor(heads).unsqueeze(0).expand(S_lab.size(0), -1)
    select = Variable(select)
    selected = torch.gather(S_lab, 1, select.unsqueeze(1)).squeeze(1)
    _, labels = selected.max(dim=0)
    labels = labels.data.numpy()
    return heads, labels


if __name__ == '__main__':

    data_path = '../../stanford-ptb'
    vocab_path = 'vocab/train'
    model_path = 'models/model.pt'

    dictionary = Dictionary(vocab_path)
    corpus = Corpus(data_path=data_path, vocab_path=vocab_path)
    model = torch.load(model_path)
    batches = corpus.train.batches(1)

    words, tags, heads, labels = next(batches)
    S_arc, S_lab = model(words, tags)

    plot(S_arc, heads)
    words = tags = [1, 2, 3, 4]
    heads_pred, labels_pred = predict(model, words, tags)
    print(heads_pred, '\n', heads[0].data.numpy())
    print(labels_pred, '\n', labels[0].data.numpy())
Example #30
    'C': 'CCONJ',
    'I': 'INTJ',
    'E': 'ADP',
    'M': 'NOUN',  # numeral ("số từ")
    'n': 'NOUN',
    'S': 'NOUN'  # block noun ("khối")
}
TEMP_IGNORE_POS = set([
    'R',  # Vietnamese adverbs/particles
    'X',  # unclassified
    'Z',  # word-formation elements
    'D',  # no definition (e.g. "chút ít")
    'O',  # interjections (e.g. "úi chà")
])
logger.info("Start loading")
dict = Dictionary()
pos_count = {}
data = joblib.load(UTS_DICT_DATA)
count = 0
logger.info("End loading")

for key in data:
    # count += 1
    # if count > 30:
    #     break
    defs = []
    pos_tags = {}
    text = key
    for definition in data[key]:
        pos_tag = definition['pos']
        if pos_tag not in pos_tags: