Example #1
    def __init__(self, opt):
        self.opt = opt
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
        if opt.model_name != 'base_model':
            boc = build_boc(' ', dat_fname='bag_of_concepts.dat')
            affective_matrix = build_embedding_matrix(
                word2idx=boc.word2idx,
                embed_dim=100,
                dat_fname='100_concept_embeddings.dat')
            self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
        else:
            boc = None
            self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
        
        trainset = ABSADataset(opt.dataset_file['train'], tokenizer, boc)
        testset = ABSADataset(opt.dataset_file['test'], tokenizer, boc)
        
        self.train_data_loader = DataLoader(dataset=trainset, batch_size=opt.batch_size, shuffle=True)
        self.test_data_loader = DataLoader(dataset=testset, batch_size=opt.batch_size, shuffle=False)

        if opt.device.type == 'cuda':
            print("cuda memory allocated:", torch.cuda.memory_allocated(device=opt.device.index))
        self._print_args()
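
Most of these snippets rely on two helpers whose definitions are not shown, build_tokenizer and build_embedding_matrix. Below is a minimal sketch of the pickle-caching pattern build_embedding_matrix appears to follow; the GloVe file name, the two extra zero rows, and the exact keyword names are assumptions (the projects vary between word2idx/vocab and dat_fname/data_file).

import os
import pickle
import numpy as np

def build_embedding_matrix(word2idx, embed_dim, dat_fname,
                           glove_path='glove.42B.300d.txt'):
    # Reuse a previously built matrix for this dataset if one was cached.
    if os.path.exists(dat_fname):
        with open(dat_fname, 'rb') as f:
            return pickle.load(f)
    # Row i holds the pretrained vector of the word with index i;
    # index 0 and out-of-vocabulary words keep zero vectors.
    embedding_matrix = np.zeros((len(word2idx) + 2, embed_dim))
    with open(glove_path, encoding='utf-8', errors='ignore') as f:
        for line in f:
            tokens = line.rstrip().split()
            word = ' '.join(tokens[:-embed_dim])
            if word in word2idx:
                embedding_matrix[word2idx[word]] = np.asarray(
                    tokens[-embed_dim:], dtype='float32')
    with open(dat_fname, 'wb') as f:
        pickle.dump(embedding_matrix, f)
    return embedding_matrix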
Example #2
    def __init__(self, opt):
        self.opt = opt
        max_seq_len = None
        if opt.use_bert:
            model_path = os.path.join(os.path.dirname(__file__), './bert_pretrain_model/uncased_L-12_H-768_A-12/')
            opt.bse = bert_sent_encoding(model_path=model_path,
                                         seq_length=opt.max_seq_len, batch_size=opt.batch_size, word_vector=True, layer=-1)

            max_seq_len = opt.max_seq_len
            self.model = opt.model_class(opt).to(opt.device)
            trainset = ABSADataset(opt.dataset_file['train'], opt.bse.tokenizer, max_seq_len=max_seq_len, use_bert=opt.use_bert)
            testset = ABSADataset(opt.dataset_file['test'], opt.bse.tokenizer, max_seq_len=max_seq_len, use_bert=opt.use_bert)
            self.train_data_loader = DataLoader(dataset=trainset, batch_size=opt.batch_size, shuffle=True)
            self.test_data_loader = DataLoader(dataset=testset, batch_size=opt.batch_size, shuffle=False)
        else:
            tokenizer = build_tokenizer(
                fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
                max_seq_len=opt.max_seq_len,
                dat_fname='{0}_new_tokenizer.dat'.format(opt.dataset), opt=self.opt)
            embedding_matrix = build_embedding_matrix(
                word2idx=tokenizer.word2idx,
                embed_dim=opt.embed_dim,
                dat_fname='{0}_embedding_matrix.dat'.format(opt.dataset), opt=self.opt)
            self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
            trainset = ABSADataset(opt.dataset_file['train'], tokenizer, max_seq_len=max_seq_len, use_bert=opt.use_bert, use_chinese=opt.chinese)
            testset = ABSADataset(opt.dataset_file['test'], tokenizer, max_seq_len=max_seq_len, use_bert=opt.use_bert, use_chinese=opt.chinese)
            self.train_data_loader = DataLoader(dataset=trainset, batch_size=opt.batch_size, shuffle=True)
            self.test_data_loader = DataLoader(dataset=testset, batch_size=opt.batch_size, shuffle=False)

        if opt.device.type == 'cuda':
            print("cuda memory allocated:", torch.cuda.memory_allocated(device=opt.device.index))
        self._print_args()
Example #3
    def __init__(self, opt):
        self.opt = opt

        if opt.v2:
            absa_data_reader = ABSADataReaderV2(data_dir=opt.data_dir)
        else:
            absa_data_reader = ABSADataReader(data_dir=opt.data_dir)
        tokenizer = build_tokenizer(data_dir=opt.data_dir)
        embedding_matrix = build_embedding_matrix(opt.data_dir,
                                                  tokenizer.word2idx,
                                                  opt.embed_dim, opt.dataset)
        self.idx2tag, self.idx2polarity = absa_data_reader.reverse_tag_map, absa_data_reader.reverse_polarity_map
        self.train_data_loader = BucketIterator(
            data=absa_data_reader.get_train(tokenizer),
            batch_size=opt.batch_size,
            shuffle=True)
        self.dev_data_loader = BucketIterator(
            data=absa_data_reader.get_dev(tokenizer),
            batch_size=opt.batch_size,
            shuffle=False)
        self.test_data_loader = BucketIterator(
            data=absa_data_reader.get_test(tokenizer),
            batch_size=opt.batch_size,
            shuffle=False)
        self.model = opt.model_class(embedding_matrix, opt, self.idx2tag,
                                     self.idx2polarity).to(opt.device)
        self._print_args()

        if torch.cuda.is_available():
            print('>>> cuda memory allocated:',
                  torch.cuda.memory_allocated(device=opt.device.index))
Example #4
    def __init__(self, opt):
        self.opt = opt

        if 'bert' in opt.model_name:
            tokenizer = Tokenizer4Bert(opt.max_seq_len,
                                       opt.pretrained_bert_name)
            # bert = BertModel.from_pretrained(opt.pretrained_bert_name)
            config = BertConfig.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=True)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name,
                                             config=config)
            self.pretrained_bert_state_dict = bert.state_dict()
            self.model = opt.model_class(bert, opt).to(opt.device)
        else:
            tokenizer = build_tokenizer(
                fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
                max_seq_len=opt.max_seq_len,
                dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
            embedding_matrix = build_embedding_matrix(
                word2idx=tokenizer.word2idx,
                embed_dim=opt.embed_dim,
                dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                    str(opt.embed_dim), opt.dataset))
            self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

        self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
        self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(device=opt.device.index)))
        self._print_args()
Example #5
    def __init__(self, opt):
        self.opt = opt

        if 'bert' in opt.model_name:
            tokenizer = Tokenizer4Bert(opt.max_seq_len,
                                       opt.pretrained_bert_name)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name)
            self.model = opt.model_class(bert, opt).to(opt.device)
            self.model.load_state_dict(torch.load(opt.state_dict_path))
            logger.info(
                f"Loaded model {opt.model_name} from {opt.state_dict_path}")
        else:
            tokenizer = build_tokenizer(
                fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
                max_seq_len=opt.max_seq_len,
                dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
            embedding_matrix = build_embedding_matrix(
                word2idx=tokenizer.word2idx,
                embed_dim=opt.embed_dim,
                dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                    str(opt.embed_dim), opt.dataset))
            self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
            self.model.load_state_dict(torch.load(opt.state_dict_path))

        self.valset = ABSADataset(opt.dataset_file['val'], tokenizer)
        self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(device=opt.device.index)))
Example #6
    def __init__(self, opt):
        self.opt = opt

        if 'bert' in opt.model_name:
            tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name)
            # freeze pretrained bert params
            # for param in bert.parameters():
            #     param.requires_grad = False
            self.model = opt.model_class(bert, opt).to(opt.device)
        else:
            tokenizer = build_tokenizer(
                fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
                max_seq_len=opt.max_seq_len,
                dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
            embedding_matrix = build_embedding_matrix(
                word2idx=tokenizer.word2idx,
                embed_dim=opt.embed_dim,
                dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
            self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

        trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
        testset = ABSADataset(opt.dataset_file['test'], tokenizer)
        self.train_data_loader = DataLoader(dataset=trainset, batch_size=opt.batch_size, shuffle=True)
        self.test_data_loader = DataLoader(dataset=testset, batch_size=opt.batch_size, shuffle=False)

        if opt.device.type == 'cuda':
            print("cuda memory allocated:", torch.cuda.memory_allocated(device=opt.device.index))
        self._print_args()
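
The BERT branch above also assumes a Tokenizer4Bert wrapper that ships with each project rather than with any library. A hedged sketch of what such a wrapper typically looks like, assuming the BertTokenizer API from the transformers package and simple right-padding:

import numpy as np
from transformers import BertTokenizer

class Tokenizer4Bert:
    def __init__(self, max_seq_len, pretrained_bert_name):
        self.max_seq_len = max_seq_len
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_bert_name)

    def text_to_sequence(self, text):
        # Map text to WordPiece ids, then truncate and right-pad to max_seq_len.
        ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text))
        ids = ids[:self.max_seq_len]
        padded = np.zeros(self.max_seq_len, dtype='int64')
        padded[:len(ids)] = ids
        return padded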
Example #7
    def __init__(self, opt):
        self.opt = opt

        if 'bert' in opt.model_name:
            tokenizer = Tokenizer4Bert(opt.max_seq_len,
                                       opt.pretrained_bert_name)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name)
            self.model = opt.model_class(bert, opt).to(opt.device)
        else:
            tokenizer = build_tokenizer(
                fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
                max_seq_len=opt.max_seq_len,
                dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
            embedding_matrix = build_embedding_matrix(
                word2idx=tokenizer.word2idx,
                embed_dim=opt.embed_dim,
                dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                    str(opt.embed_dim), opt.dataset))
            self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

        self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
        self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
        assert 0 <= opt.valset_ratio < 1
        if opt.valset_ratio > 0:
            valset_len = int(len(self.trainset) * opt.valset_ratio)
            self.trainset, self.valset = random_split(
                self.trainset, (len(self.trainset) - valset_len, valset_len))
        else:
            self.valset = self.testset

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(device=opt.device.index)))
        self._print_args()
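
Example #7 stops after carving out the validation split; building the loaders presumably happens later in the training routine. A short sketch of that step as a hypothetical method on the same class, mirroring the batch-size and shuffle settings the other examples use:

from torch.utils.data import DataLoader

def _build_data_loaders(self):
    # hypothetical helper; uses the datasets prepared in __init__ above
    opt = self.opt
    self.train_data_loader = DataLoader(dataset=self.trainset, batch_size=opt.batch_size, shuffle=True)
    self.val_data_loader = DataLoader(dataset=self.valset, batch_size=opt.batch_size, shuffle=False)
    self.test_data_loader = DataLoader(dataset=self.testset, batch_size=opt.batch_size, shuffle=False)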
Example #8
 def __init__(self, opt):
     self.opt = opt
     tokenizer = build_tokenizer(
         fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
         max_length=opt.max_length,
         data_file='{0}_tokenizer.dat'.format(opt.dataset))
     embedding_matrix = build_embedding_matrix(
         vocab=tokenizer.vocab,
         embed_dim=opt.embed_dim,
         data_file='{0}d_{1}_embedding_matrix.dat'.format(
             str(opt.embed_dim), opt.dataset))
     trainset = SentenceDataset(opt.dataset_file['train'],
                                tokenizer,
                                target_dim=self.opt.polarities_dim)
     testset = SentenceDataset(opt.dataset_file['test'],
                               tokenizer,
                               target_dim=self.opt.polarities_dim)
     self.train_dataloader = DataLoader(dataset=trainset,
                                        batch_size=opt.batch_size,
                                        shuffle=True)
     self.test_dataloader = DataLoader(dataset=testset,
                                       batch_size=opt.batch_size,
                                       shuffle=False)
     self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
     if opt.device.type == 'cuda':
         print('cuda memory allocated:',
               torch.cuda.memory_allocated(self.opt.device.index))
     self._print_args()
Example #9
 def __init__(self, opt):
     super(Inferrer, self).__init__()
     self.opt = opt
     self.tokenizer = build_tokenizer(
         fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
         max_seq_len=opt.max_seq_len,
         ngram=opt.ngram,
         min_count=opt.min_count,
         dat_fname='./bin/{0}_tokenizer.dat'.format(opt.dataset))
     opt.vocab_size = len(self.tokenizer.token2idx) + 2
     opt.ngram_vocab_sizes = [
         len(self.tokenizer.ngram2idx[n]) + 2
         for n in range(2, opt.ngram + 1)
     ]
     embedding_matrix = build_embedding_matrix(
         token2idx=self.tokenizer.token2idx,
         embed_dim=opt.embed_dim,
         embed_file=opt.embed_file,
         dat_fname='./bin/{0}_{1}_embedding_matrix.dat'.format(
             str(opt.embed_dim), opt.dataset))
     self.model = opt.model_class(opt, embedding_matrix)
     print('loading model {0} ...'.format(opt.model_name))
     self.model.load_state_dict(torch.load(opt.state_dict_path))
     self.model = self.model.to(opt.device)
     # Switch model to evaluation mode
     self.model.eval()
     torch.autograd.set_grad_enabled(False)
Example #10
    def __init__(self, opt):
        self.opt = opt

        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer_train.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
        testset = ABSADataset(opt.dataset_file['test'], tokenizer)
        self.train_data_loader = DataLoader(dataset=trainset,
                                            batch_size=opt.batch_size,
                                            shuffle=True)
        self.test_data_loader = DataLoader(dataset=testset,
                                           batch_size=opt.batch_size,
                                           shuffle=False)

        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
        if opt.device.type == 'cuda':
            print("cuda memory allocated:",
                  torch.cuda.memory_allocated(device=opt.device.index))
        self._print_args()
Example #11
File: train.py Project: Ghands/HAOFL
    def __init__(self, opt, dtl_param):
        """
        A wrapper for running HAOFL based models.
        :param opt: An object stores all hyper-parameters
        :param dtl_param: A string indicates that parameter of used data transformation method used in DTL layer.
        """
        self.opt = opt

        if 'bert' in self.opt.model_name:
            tokenizer = Tokenizer4Bert(opt.max_seq_len, 'bert-base-uncased')
            self.model = opt.model_class(opt, tokenizer).to(opt.device)
        else:
            tokenizer = build_tokenizer(
                fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
                max_seq_len=opt.max_seq_len,
                dat_fname='normal_tokenizer.dat')
            embedding_matrix = build_embedding_matrix(
                word2idx=tokenizer.word2idx,
                embed_dim=opt.embed_dim,
                dat_fname='{0}_embedding_matrix.dat'.format(str(
                    opt.embed_dim)))
            self.model = opt.model_class(opt, tokenizer,
                                         embedding_matrix).to(opt.device)

        self.train_set = TrainDataset(opt.dataset_file['train'], tokenizer,
                                      opt, opt.dtl_method, dtl_param,
                                      opt.name_tail)
        self.val_set = TrainDataset(opt.dataset_file['test'], tokenizer, opt,
                                    opt.dtl_method, dtl_param, opt.name_tail)

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(device=opt.device.index)))
        self._print_args()
Example #12
    def __init__(self, config):
        self.config = config

        aste_data_reader = ASTEDataReader(data_dir=config.data_dir)
        tokenizer = build_tokenizer(data_dir=config.data_dir)
        target_vocab = Tokenizer()
        embedding_matrix = build_embedding_matrix(
            config.data_dir, tokenizer.word2idx, config.emb_dim, config.dataset)

        embedding_matrix = torch.tensor(
            embedding_matrix, dtype=torch.float).to(config.device)

        self.train_data_loader = BucketIterator(
            data=aste_data_reader.get_train(tokenizer),
            batch_size=config.batch_size, shuffle=True, tokenizer=tokenizer,
            max_decode=config.tgt_max_train, device=config.device)

        self.dev_data_loader = BucketIterator(
            data=aste_data_reader.get_dev(tokenizer),
            batch_size=config.batch_size, shuffle=False, tokenizer=tokenizer,
            max_decode=config.tgt_max_test, device=config.device)

        self.test_data_loader = BucketIterator(
            data=aste_data_reader.get_test(tokenizer),
            batch_size=config.batch_size, shuffle=False, tokenizer=tokenizer,
            max_decode=config.tgt_max_test, device=config.device)

        self.model = config.model_class(
            embedding_matrix=embedding_matrix, config=config, vocab=tokenizer)
        self.model = self.model.to(config.device)

        if torch.cuda.is_available():
            print('>>> cuda memory allocated:',
                  torch.cuda.memory_allocated(device=config.device.index))
Example #13
 def __init__(self, opt):
     self.opt = opt
     self._set_seed(opt.seed)
     opt.zeta = opt.zeta if 'M' in opt.method else 0.0
     opt.tokenizer = build_tokenizer(domains=opt.domains,
                                     fnames=opt.dataset_file.values())
     embedding_matrix = build_embedding_matrix(
         domains=opt.domains, vocab=opt.tokenizer.vocab['word'])
     self.trainset = MyDataset(side='main',
                               tasks=opt.tasks,
                               domains=opt.domains,
                               fname=opt.dataset_file['train'],
                               tokenizer=opt.tokenizer)
     self.testset = MyDataset(side='main',
                              tasks=opt.tasks,
                              domains=opt.domains,
                              fname=opt.dataset_file['test'],
                              tokenizer=opt.tokenizer)
     self.auxset = MyDataset(side='aux',
                             tasks=opt.tasks,
                             domains=opt.domains,
                             fname=opt.dataset_file['aux'],
                             tokenizer=opt.tokenizer)
     self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
     self._print_args()
Example #14
	def __init__(self, opt):
		self.opt = opt

		tokenizer = build_tokenizer(
			fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
			max_seq_len=opt.max_seq_len,
			dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
		embedding_matrix = build_embedding_matrix(
			word2idx=tokenizer.word2idx,
			embed_dim=opt.embed_dim,
			dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
		self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

		self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
		self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)

		if(opt.polarities_dim==2):
			self.trainset = [data_point for data_point in self.trainset if data_point['polarity']!=1]
			for data in self.trainset:
				data['polarity'] = int(data['polarity']/2)
			self.testset = [data_point for data_point in self.testset if data_point['polarity']!=1]
			for data in self.testset:
				data['polarity'] = int(data['polarity']/2)
			

		assert 0 <= opt.valset_ratio < 1
		if opt.valset_ratio > 0:
			valset_len = int(len(self.trainset) * opt.valset_ratio)
			self.trainset, self.valset = random_split(self.trainset, (len(self.trainset)-valset_len, valset_len))
		else:
			self.valset = self.testset

		if opt.device.type == 'cuda':
			logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index)))
		self._print_args()
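
The filtering and division above assume the three-way label encoding {0: negative, 1: neutral, 2: positive}; dropping label 1 and halving the remaining values yields a binary {0: negative, 1: positive} scheme. A small worked illustration:

polarities = [0, 1, 2, 2, 0]
binary = [int(p / 2) for p in polarities if p != 1]  # -> [0, 1, 1, 0]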
Example #15
    def __init__(self, opt):
        self.opt = opt

        if 'bert' in opt.model_name:
            tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
            bert = BertModel.from_pretrained('/home/yinrongdi/bert/bert-base-uncased.tar.gz')
            self.model = opt.model_class(bert, opt).to(opt.device)
        else:
            tokenizer = build_tokenizer(
                fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
                max_seq_len=opt.max_seq_len,
                dat_fname='temp_data/'+'{0}_tokenizer.dat'.format(opt.dataset),
                step = 4 if opt.tabsa else 3)
            embedding_matrix = build_embedding_matrix(
                word2idx=tokenizer.word2idx,
                embed_dim=opt.embed_dim,
                dat_fname='temp_data/'+'{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
            self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
        
        if opt.classifier:
            if opt.classifier_with_absa_target:
                self.classifierset = TABSADataset(opt.dataset_file['classifier_absa_target'], tokenizer, False)
            else:
                if opt.classifier_with_absa:
                    self.classifierset = TABSADataset(opt.dataset_file['classifier'], tokenizer, True)
                else:
                    self.classifierset = TABSADataset(opt.dataset_file['classifier'], tokenizer, False)

        if opt.tabsa:
            if opt.tabsa_with_absa:
                if opt.gating:
                    self.trainset = TABSADataset(opt.dataset_file['train'], tokenizer, True, True)
                    self.testset = TABSADataset(opt.dataset_file['test'], tokenizer, True, True)
                else:
                    self.trainset = TABSADataset(opt.dataset_file['train'], tokenizer, True)
                    self.testset = TABSADataset(opt.dataset_file['test'], tokenizer, True)
            else:
                if opt.gating:
                    self.trainset = TABSADataset(opt.dataset_file['train'], tokenizer, False, True)
                    self.testset = TABSADataset(opt.dataset_file['test'], tokenizer, True)
                else:
                    self.trainset = TABSADataset(opt.dataset_file['train'], tokenizer, False)
                    self.testset = TABSADataset(opt.dataset_file['test'], tokenizer, False)
        else:
            self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
            self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)

        assert 0 <= opt.valset_ratio < 1
        if opt.valset_ratio > 0:
            valset_len = int(len(self.trainset) * opt.valset_ratio)
            self.trainset, self.valset = random_split(self.trainset, (len(self.trainset)-valset_len, valset_len))
        else:
            self.valset = self.testset

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index)))
        self._print_args()
Example #16
 def __init__(self, opt): # prepare for training the model
     self.opt = opt # hyperparameters and options
     opt.tokenizer = build_tokenizer(fnames=opt.dataset_file.values(), dataset=opt.dataset) # transform tokens to indices
     embedding_matrix = build_embedding_matrix(vocab=opt.tokenizer.vocab['word'], dataset=opt.dataset) # pre-trained glove embeddings
     self.trainset = MyDataset(fname=opt.dataset_file['train'], tokenizer=opt.tokenizer) # training set
     self.testset = MyDataset(fname=opt.dataset_file['test'], tokenizer=opt.tokenizer) # testing set
     self.model = RepWalk(embedding_matrix, opt).to(opt.device) # neural network model
     self._print_args() # print arguments
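
All of these constructors receive an opt object built elsewhere, usually from argparse. A hypothetical, heavily abridged sketch of such an object for the snippet above; only the attributes visible here are shown, and the dataset paths are borrowed from Example #23, so a real run needs the project's full hyper-parameter set:

from argparse import Namespace
import torch

opt = Namespace(
    dataset='rest14',
    dataset_file={'train': './datasets/semeval14/restaurant_train.raw',
                  'test': './datasets/semeval14/restaurant_test.raw'},
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))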
Example #17
    def __init__(self, opt, model_classes):
        self.opt = opt
        if 'bert' in opt.model_name:
            self.tokenizer = Tokenizer4Bert(opt.max_seq_len,
                                            opt.pretrained_bert_name)
        else:
            self.tokenizer = build_tokenizer(
                fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
                max_seq_len=opt.max_seq_len,
                dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
            embedding_matrix = build_embedding_matrix(
                word2idx=self.tokenizer.word2idx,
                embed_dim=opt.embed_dim,
                dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                    str(opt.embed_dim), opt.dataset))

        self.trainset = ABSADataset(opt.dataset_file['train'], self.tokenizer)
        self.testset = ABSADataset(opt.dataset_file['test'], self.tokenizer)
        assert 0 <= opt.valset_ratio < 1
        if opt.valset_ratio > 0:
            valset_len = int(len(self.trainset) * opt.valset_ratio)
            self.trainset, self.valset = random_split(
                self.trainset, (len(self.trainset) - valset_len, valset_len))
        else:
            self.valset = self.testset

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(device=opt.device.index)))

        if 'bert' in opt.model_name:
            # requires: from pytorch_transformers import BertModel
            print("-------- loading pretrained BERT --------")
            self.bert = BertModel.from_pretrained(opt.pretrained_bert_name,
                                                  output_attentions=True,
                                                  cache_dir="pretrained/bert/")
            # older versions loaded BERT without attention outputs:
            # bert = BertModel.from_pretrained(opt.pretrained_bert_name, cache_dir="pretrained/bert/")
            print("-------- pretrained BERT loaded --------")
            self.model = opt.model_class(self.bert, opt).to(opt.device)
            # alternative: self.model = AEN_BERT(self.bert, opt).to(opt.device)
        else:
            self.model = model_classes[opt.model_name](embedding_matrix,
                                                       opt).to(opt.device)

        self._print_args()
Example #18
 def __init__(self, opt):
     """
     初始化模型和数据预处理,并token化
     :param opt: argparse的参数
     """
     self.opt = opt
     #是否是bert类模型,使用bert类模型初始化, 非BERT类使用GloVe
     if 'bert' in opt.model_name:
         #初始化tokenizer
         tokenizer = Tokenizer4Bert(opt.max_seq_len,
                                    opt.pretrained_bert_name,
                                    cache_dir=opt.pretrained_bert_cache_dir)
         # 加载BERT模型
         bert = BertModel.from_pretrained(
             opt.pretrained_bert_name,
             cache_dir=opt.pretrained_bert_cache_dir)
         # 然后把BERT模型和opt参数传入自定义模型,进行进一步处理
         self.model = opt.model_class(bert, opt).to(opt.device)
     else:
         # 自定义tokenizer,生成id2word,word2idx
         tokenizer = build_tokenizer(
             fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
             max_seq_len=opt.max_seq_len,
             dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
         #返回所有单词的词嵌入 [word_nums, embedding_dimesion]
         embedding_matrix = build_embedding_matrix(
             word2idx=tokenizer.word2idx,
             embed_dim=opt.embed_dim,
             dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                 str(opt.embed_dim), opt.dataset))
         # 加载模型
         self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
     # 加载训练集
     self.trainset = ABSADataset(opt.dataset_file['train'],
                                 tokenizer,
                                 recreate_caches=opt.recreate_caches)
     self.testset = ABSADataset(opt.dataset_file['test'],
                                tokenizer,
                                recreate_caches=opt.recreate_caches)
     #如果valset_ratio为0,测试集代替验证集
     assert 0 <= opt.valset_ratio < 1
     if opt.valset_ratio > 0:
         valset_len = int(len(self.trainset) * opt.valset_ratio)
         self.trainset, self.valset = random_split(
             self.trainset, (len(self.trainset) - valset_len, valset_len))
     else:
         self.valset = self.testset
     # 检查cuda的内存
     if opt.device.type == 'cuda':
         logger.info('cuda 可用内存: {}'.format(
             torch.cuda.memory_allocated(device=opt.device.index)))
     self._print_args()
Example #19
 def __init__(self, opt):
     self.opt = opt
    
     absa_data_reader = ABSADataReader(data_dir=opt.data_dir)
     self.tokenizer = build_tokenizer(data_dir=opt.data_dir)
     embedding_matrix = build_embedding_matrix(opt.data_dir, self.tokenizer.word2idx, opt.embed_dim, opt.dataset)
     self.idx2tag, self.idx2polarity = absa_data_reader.reverse_tag_map, absa_data_reader.reverse_polarity_map
     self.model = opt.model_class(embedding_matrix, opt, self.idx2tag, self.idx2polarity).to(opt.device)
     print('loading model {0} ...'.format(opt.model_name))
     self.model.load_state_dict(torch.load(opt.state_dict_path, map_location=lambda storage, loc: storage))
     # switch model to evaluation mode
     self.model.eval()
     torch.autograd.set_grad_enabled(False)
Example #20
File: infer.py Project: wutaiqiang/cgcn
    def __init__(self, opt):
        self.opt = opt
        fname = {
            'cr': {
                'train': './datasets/cr/train.csv',
                'test': './datasets/cr/dev.csv'
            },
            'mr': {
                'train': './datasets/mr/train.csv',
                'test': './datasets/mr/dev.csv'
            },
            'mpqa': {
                'train': './datasets/mpqa/train.csv',
                'test': './datasets/mpqa/dev.csv'
            },
            'subj': {
                'train': './datasets/subj/train.csv',
                'test': './datasets/subj/dev.csv'
            },
            'sst2': {
                'train': './datasets/sst2/train.csv',
                'test': './datasets/sst2/test.csv'
            },
            'TREC': {
                'train': './datasets/TREC/train.csv',
                'test': './datasets/TREC/test.csv'
            },
        }
        if os.path.exists(opt.dataset + '_word2idx.pkl'):
            print("loading {0} tokenizer...".format(opt.dataset))
            with open(opt.dataset + '_word2idx.pkl', 'rb') as f:
                word2idx = pickle.load(f)
                self.tokenizer = Tokenizer(word2idx=word2idx)
        else:
            print("reading {0} dataset...".format(opt.dataset))

            text = ABSADatesetReader.__read_text__(
                [fname[opt.dataset]['train'], fname[opt.dataset]['test']])
            self.tokenizer = Tokenizer()
            self.tokenizer.fit_on_text(text)
            with open(opt.dataset + '_word2idx.pkl', 'wb') as f:
                pickle.dump(self.tokenizer.word2idx, f)
        embedding_matrix = build_embedding_matrix(self.tokenizer.word2idx,
                                                  opt.embed_dim, opt.dataset)
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
        print('loading model {0} ...'.format(opt.model_name))
        self.model.load_state_dict(torch.load(opt.state_dict_path))
        self.model = self.model
        # switch model to evaluation mode
        self.model.eval()
        torch.autograd.set_grad_enabled(False)
Example #21
def get_embedding_matrix(in_vocabulary, in_config):
    embeddings_file = in_config['embedding_matrix']
    if not path.exists(embeddings_file):
        logger.info('Creating embeddings matrix')
        w2v = Word2Vec.load_word2vec_format(
            '../word2vec_google_news/GoogleNews-vectors-negative300.bin',
            binary=True
        )
        embedding_matrix = build_embedding_matrix(w2v, in_vocabulary)
        with open(embeddings_file, 'wb') as embeddings_out:
            np.save(embeddings_out, embedding_matrix)
    else:
        logger.info('Skipping embeddings creating step')
        embedding_matrix = np.load(embeddings_file)
    return embedding_matrix
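
A hypothetical call to the function above; the config key mirrors the one the function reads, while the path and the vocabulary are made up for illustration:

in_config = {'embedding_matrix': 'embeddings/google_news_300d.npy'}
in_vocabulary = {'the': 0, 'food': 1, 'was': 2, 'great': 3}  # assumed word -> index format
matrix = get_embedding_matrix(in_vocabulary, in_config)
print(matrix.shape)  # e.g. (len(in_vocabulary), 300) under these assumptions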
Example #22
 def __init__(self, opt):
     self.opt = opt
     print("loading {0} tokenizer...".format(opt.dataset))
     with open(opt.dataset + '_word2idx.pkl', 'rb') as f:
         word2idx = pickle.load(f)
         self.tokenizer = Tokenizer(word2idx=word2idx)
     embedding_matrix = build_embedding_matrix(self.tokenizer.word2idx,
                                               opt.embed_dim, opt.dataset)
     self.model = opt.model_class(embedding_matrix, opt)
     print('loading model {0} ...'.format(opt.model_name))
     self.model.load_state_dict(
         torch.load(opt.state_dict_path,
                    map_location=lambda storage, loc: storage))
     # switch model to evaluation mode
     self.model.eval()
     torch.autograd.set_grad_enabled(False)
Example #23
    def __init__(self, opt):
        self.opt = opt
        fname = {
            'twitter': {
                'train': './datasets/acl-14-short-data/train.raw',
                'test': './datasets/acl-14-short-data/test.raw'
            },
            'rest14': {
                'train': './datasets/semeval14/restaurant_train.raw',
                'test': './datasets/semeval14/restaurant_test.raw'
            },
            'lap14': {
                'train': './datasets/semeval14/laptop_train.raw',
                'test': './datasets/semeval14/laptop_test.raw'
            },
            'rest15': {
                'train': './datasets/semeval15/restaurant_train.raw',
                'test': './datasets/semeval15/restaurant_test.raw'
            },
            'rest16': {
                'train': './datasets/semeval16/restaurant_train.raw',
                'test': './datasets/semeval16/restaurant_test.raw'
            },
        }
        if os.path.exists(opt.dataset + '_word2idx.pkl'):
            print("loading {0} tokenizer...".format(opt.dataset))
            with open(opt.dataset + '_word2idx.pkl', 'rb') as f:
                word2idx = pickle.load(f)
                self.tokenizer = Tokenizer(word2idx=word2idx)
        else:
            print("reading {0} dataset...".format(opt.dataset))

            text = ABSADatesetReader.__read_text__(
                [fname[opt.dataset]['train'], fname[opt.dataset]['test']])
            self.tokenizer = Tokenizer()
            self.tokenizer.fit_on_text(text)
            with open(opt.dataset + '_word2idx.pkl', 'wb') as f:
                pickle.dump(self.tokenizer.word2idx, f)
        embedding_matrix = build_embedding_matrix(self.tokenizer.word2idx,
                                                  opt.embed_dim, opt.dataset)
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
        print('loading model {0} ...'.format(opt.model_name))
        self.model.load_state_dict(torch.load(opt.state_dict_path))
        self.model = self.model
        # switch model to evaluation mode
        self.model.eval()
        torch.autograd.set_grad_enabled(False)
Example #24
 def __init__(self, opt):
     self.opt = opt
     self.tokenizer = build_tokenizer(
         fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
         max_seq_len=opt.max_seq_len,
         dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
     embedding_matrix = build_embedding_matrix(
         word2idx=self.tokenizer.word2idx,
         embed_dim=opt.embed_dim,
         dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
     self.model = opt.model_class(embedding_matrix, opt)
     print('loading model {0} ...'.format(opt.model_name))
     self.model.load_state_dict(torch.load(opt.state_dict_path))
     self.model = self.model.to(opt.device)
     # switch model to evaluation mode
     self.model.eval()
     torch.autograd.set_grad_enabled(False)
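
Once the model above is in evaluation mode, inference would presumably tokenize a sentence/aspect pair and run a forward pass. A sketch of such a call as a hypothetical method on the class above, assuming the tokenizer exposes text_to_sequence and the model takes [text_indices, aspect_indices] as input (the actual input columns depend on opt.model_name):

def evaluate(self, text, aspect):
    # hypothetical inference helper on the class above
    text_indices = torch.tensor([self.tokenizer.text_to_sequence(text)],
                                dtype=torch.int64).to(self.opt.device)
    aspect_indices = torch.tensor([self.tokenizer.text_to_sequence(aspect)],
                                  dtype=torch.int64).to(self.opt.device)
    logits = self.model([text_indices, aspect_indices])
    return torch.softmax(logits, dim=-1).cpu().numpy()  # polarity probabilities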
Example #25
    def __init__(self, opt):
        self.opt = opt

        if 'bert' in opt.model_name:
            # set bert_based_vocab
            tokenizer = Tokenizer4Bert(
                opt.max_seq_len,
                '/data/kkzhang/aaa/command/bert-base-uncased-vocab.txt')
            #tokenizer = Tokenizer4Bert(opt.max_seq_len, '/home/kkzhang/bert-large-uncased/bert-large-uncased-vocab.txt')
            # set bert pre_train model
            bert = BertModel.from_pretrained(
                '/data/kkzhang/WordeEmbedding/bert_base/')

            # multi-GPU support
            if torch.cuda.device_count() > 1:
                logging.info('The machine has {} GPUs; wrapping BERT in nn.DataParallel'.format(
                    torch.cuda.device_count()))
                bert = nn.DataParallel(bert)

            self.model = opt.model_class(bert, opt).to(opt.device)
        else:
            tokenizer = build_tokenizer(
                fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
                max_seq_len=opt.max_seq_len,
                dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
            embedding_matrix = build_embedding_matrix(
                word2idx=tokenizer.word2idx,
                embed_dim=opt.embed_dim,
                dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                    str(opt.embed_dim), opt.dataset))
            self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

        self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
        self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
        assert 0 <= opt.valset_ratio < 1
        if opt.valset_ratio > 0:
            valset_len = int(len(self.trainset) * opt.valset_ratio)
            self.trainset, self.valset = random_split(
                self.trainset, (len(self.trainset) - valset_len, valset_len))
        else:
            self.valset = self.testset

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(device=opt.device.index)))
        self._print_args()
Example #26
    def __init__(self, opt):
        super(Instructor, self).__init__()
        self.opt = opt

        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            ngram=opt.ngram,
            min_count=opt.min_count,
            dat_fname='./bin/{0}_tokenizer.dat'.format(opt.dataset))
        opt.vocab_size = len(tokenizer.token2idx) + 2
        opt.ngram_vocab_sizes = [
            len(tokenizer.ngram2idx[n]) + 2 for n in range(2, opt.ngram + 1)
        ]
        if opt.embed_file is not None and os.path.exists(opt.embed_file):
            embedding_matrix = build_embedding_matrix(
                token2idx=tokenizer.token2idx,
                embed_dim=opt.embed_dim,
                embed_file=opt.embed_file,
                dat_fname='./bin/{0}_{1}_embedding_matrix.dat'.format(
                    str(opt.embed_dim), opt.dataset))
        else:
            embedding_matrix = None
        self.model = opt.model_class(opt, embedding_matrix).to(opt.device)

        self.train_set = TCDataset(opt.dataset_file['train'],
                                   opt.label_mapping, opt.ngram, tokenizer)
        self.test_set = TCDataset(opt.dataset_file['test'], opt.label_mapping,
                                  opt.ngram, tokenizer)
        assert 0 <= opt.val_set_ratio < 1
        if opt.val_set_ratio > 0:
            val_set_len = int(len(self.train_set) * opt.val_set_ratio)
            self.train_set, self.val_set = random_split(
                self.train_set,
                (len(self.train_set) - val_set_len, val_set_len))
        else:
            self.val_set = self.test_set

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(device=opt.device.index)))
        self._print_args()
Example #27
File: train.py Project: JZCS2018/SMAT
    def __init__(self, opt):
        self.opt = opt
        if opt.dataset_file['val'] is None:
            fnames = [opt.dataset_file['train'], opt.dataset_file['test']]
        else:
            fnames = [
                opt.dataset_file['train'], opt.dataset_file['val'],
                opt.dataset_file['test']
            ]
        tokenizer = build_tokenizer(fnames,
                                    max_seq_len=opt.max_seq_len,
                                    dat_fname='{0}_tokenizer.dat'.format(
                                        opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

        self.trainset = Dataset(opt.dataset_file['train'],
                                tokenizer,
                                dat_fname='{0}_train.dat'.format(opt.dataset))
        # self.weight_classes = torch.tensor(compute_class_weight('balanced', np.unique([i['polarity'] for i in self.trainset.data]), self.trainset[4]), dtype=torch.float).to(self.opt.device)
        # self.valset = ABSADataset(opt.dataset_file['val'], tokenizer)
        self.testset = Dataset(opt.dataset_file['test'],
                               tokenizer,
                               dat_fname='{0}_test.dat'.format(opt.dataset))
        assert 0 <= opt.valset_ratio < 1
        if opt.valset_ratio > 0:
            valset_len = int(len(self.trainset) * opt.valset_ratio)
            self.trainset, self.valset = random_split(
                self.trainset, (len(self.trainset) - valset_len, valset_len))
        else:
            self.valset = Dataset(opt.dataset_file['val'],
                                  tokenizer,
                                  dat_fname='{0}_val.dat'.format(opt.dataset))

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(device=opt.device.index)))
        self._print_args()
Example #28
    def __init__(self, opt):
        self.opt = opt
        self.tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_length=opt.max_length,
            data_file='./embedding/{0}_{1}_tokenizer.dat'.format(
                opt.model_name, opt.dataset),
        )
        embedding_matrix = build_embedding_matrix(
            vocab=self.tokenizer.vocab,
            embed_dim=opt.embed_dim,
            data_file='./embedding/{0}_{1}d_{2}_embedding_matrix.dat'.format(
                opt.model_name, str(opt.embed_dim), opt.dataset))

        self.model = opt.model_class(embedding_matrix, opt)
        print('loading model {0} ...'.format(opt.model_name))
        self.model.load_state_dict(torch.load(opt.state_dict_path))
        self.model = self.model.to(opt.device)

        torch.autograd.set_grad_enabled(False)
Example #29
    def __init__(self, opt):
        self.opt = opt

        if 'bert' in opt.model_name:
            tokenizer = Tokenizer4Bert(opt.max_seq_len,
                                       opt.pretrained_bert_name)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name,
                                             output_hidden_states=True)
            # tokenizer = Tokenizer4Bert(opt.max_seq_len, '/content/drive/My Drive/FYP/pretrained_BERT_further_trained_with_criminal_corpus/vocab.txt')
            # bert = BertModel.from_pretrained('/content/drive/My Drive/FYP/pretrained_BERT_further_trained_with_criminal_corpus')
            self.model = opt.model_class(bert, opt).to(opt.device)
        else:
            tokenizer = build_tokenizer(
                fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
                max_seq_len=opt.max_seq_len,
                dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
            embedding_matrix = build_embedding_matrix(
                word2idx=tokenizer.word2idx,
                embed_dim=opt.embed_dim,
                dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                    str(opt.embed_dim), opt.dataset))
            self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

        self.trainset = ABSADataset(
            opt.dataset_file['train'],
            './datasets/semeval14/law_train.raw.graph', tokenizer)
        self.testset = ABSADataset(opt.dataset_file['test'],
                                   './datasets/semeval14/law_train.raw.graph',
                                   tokenizer)
        assert 0 <= opt.valset_ratio < 1
        if opt.valset_ratio > 0:
            valset_len = int(len(self.trainset) * opt.valset_ratio)
            self.trainset, self.valset = random_split(
                self.trainset, (len(self.trainset) - valset_len, valset_len))
        else:
            self.valset = self.testset

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(device=opt.device.index)))
        self._print_args()
Example #30
    def __init__(self, opt):
        self.opt = opt
       
        absa_data_reader = ABSADataReader(data_dir=opt.data_dir)
        self.tokenizer = build_tokenizer(data_dir=opt.data_dir)
        embedding_matrix = build_embedding_matrix(opt.data_dir, self.tokenizer.word2idx, opt.embed_dim, opt.dataset)
        self.idx2tag, self.idx2polarity = absa_data_reader.reverse_tag_map, absa_data_reader.reverse_polarity_map
        self.model = opt.model_class(embedding_matrix, opt, self.idx2tag, self.idx2polarity).to(opt.device)
        print('loading model {0} ...'.format(opt.model_name))
        # self.model.load_state_dict(torch.load(opt.state_dict_path, map_location=lambda storage, loc: storage))
        # switch model to evaluation mode
        self.model.eval()

        # get a handle on s3
        session = boto3.Session(
            aws_access_key_id='XXXXXXXXXXXX',
            aws_secret_access_key='XXXXXXXX',
            region_name='XXXXXXXX')

        self.s3 = session.resource('s3')
        self.bucket = self.s3.Bucket('surveybuddy-responses')  # example: energy_market_processing

        torch.autograd.set_grad_enabled(False)
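
As an aside, rather than placing credentials in source (masked above as placeholders), boto3 can also resolve them from its standard credential chain (environment variables, ~/.aws/credentials, or an instance role). A minimal alternative sketch; the region is an assumption:

import boto3

session = boto3.Session(region_name='us-east-1')  # credentials resolved from the environment/credential chain
s3 = session.resource('s3')
bucket = s3.Bucket('surveybuddy-responses')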