def __init__(self, opt):
    """Build tokenizer, embeddings, model and train/test loaders from *opt*."""
    self.opt = opt
    tokenizer = build_tokenizer(
        fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
        max_seq_len=opt.max_seq_len,
        dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
    embedding_matrix = build_embedding_matrix(
        word2idx=tokenizer.word2idx,
        embed_dim=opt.embed_dim,
        dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
    if opt.model_name != 'base_model':
        # Non-base models additionally get a bag-of-concepts vocabulary.
        boc = build_boc(' ', dat_fname='bag_of_concepts.dat')
        # NOTE(review): affective_matrix is computed (and cached to disk as a
        # side effect) but never handed to the model -- confirm intent.
        affective_matrix = build_embedding_matrix(
            word2idx=boc.word2idx,
            embed_dim=100,
            dat_fname='100_concept_embeddings.dat')
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    else:
        boc = None
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    trainset = ABSADataset(opt.dataset_file['train'], tokenizer, boc)
    testset = ABSADataset(opt.dataset_file['test'], tokenizer, boc)
    self.train_data_loader = DataLoader(dataset=trainset, batch_size=opt.batch_size, shuffle=True)
    self.test_data_loader = DataLoader(dataset=testset, batch_size=opt.batch_size, shuffle=False)
    if opt.device.type == 'cuda':
        print("cuda memory allocated:", torch.cuda.memory_allocated(device=opt.device.index))
    self._print_args()
def __init__(self, opt):
    """Set up model and data loaders, using BERT sentence encoding when enabled."""
    self.opt = opt
    max_seq_len = None
    if opt.use_bert:
        model_path = os.path.join(os.path.dirname(__file__), './bert_pretrain_model/uncased_L-12_H-768_A-12/')
        opt.bse = bert_sent_encoding(model_path=model_path,
                                     seq_length=opt.max_seq_len,
                                     batch_size=opt.batch_size,
                                     word_vector=True,
                                     layer=-1)
        max_seq_len = opt.max_seq_len
        self.model = opt.model_class(opt).to(opt.device)
        trainset = ABSADataset(opt.dataset_file['train'], opt.bse.tokenizer,
                               max_seq_len=max_seq_len, use_bert=opt.use_bert)
        testset = ABSADataset(opt.dataset_file['test'], opt.bse.tokenizer,
                              max_seq_len=max_seq_len, use_bert=opt.use_bert)
        self.train_data_loader = DataLoader(dataset=trainset, batch_size=opt.batch_size, shuffle=True)
        self.test_data_loader = DataLoader(dataset=testset, batch_size=opt.batch_size, shuffle=False)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_new_tokenizer.dat'.format(opt.dataset),
            opt=self.opt)
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_embedding_matrix.dat'.format(opt.dataset),
            opt=self.opt)
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
        # NOTE(review): max_seq_len is still None here -- presumably ABSADataset
        # falls back to its own default; confirm.
        trainset = ABSADataset(opt.dataset_file['train'], tokenizer, max_seq_len=max_seq_len,
                               use_bert=opt.use_bert, use_chinese=opt.chinese)
        testset = ABSADataset(opt.dataset_file['test'], tokenizer, max_seq_len=max_seq_len,
                              use_bert=opt.use_bert, use_chinese=opt.chinese)
        self.train_data_loader = DataLoader(dataset=trainset, batch_size=opt.batch_size, shuffle=True)
        self.test_data_loader = DataLoader(dataset=testset, batch_size=opt.batch_size, shuffle=False)
    if opt.device.type == 'cuda':
        print("cuda memory allocated:", torch.cuda.memory_allocated(device=opt.device.index))
    self._print_args()
def __init__(self, opt):
    """Prepare data reader, embeddings, bucket iterators and the tagging model."""
    self.opt = opt
    # V2 flag selects the alternative data reader implementation.
    absa_data_reader = ABSADataReaderV2(data_dir=opt.data_dir) if opt.v2 \
        else ABSADataReader(data_dir=opt.data_dir)
    tokenizer = build_tokenizer(data_dir=opt.data_dir)
    embedding_matrix = build_embedding_matrix(opt.data_dir, tokenizer.word2idx,
                                              opt.embed_dim, opt.dataset)
    self.idx2tag = absa_data_reader.reverse_tag_map
    self.idx2polarity = absa_data_reader.reverse_polarity_map
    self.train_data_loader = BucketIterator(data=absa_data_reader.get_train(tokenizer),
                                            batch_size=opt.batch_size, shuffle=True)
    self.dev_data_loader = BucketIterator(data=absa_data_reader.get_dev(tokenizer),
                                          batch_size=opt.batch_size, shuffle=False)
    self.test_data_loader = BucketIterator(data=absa_data_reader.get_test(tokenizer),
                                           batch_size=opt.batch_size, shuffle=False)
    self.model = opt.model_class(embedding_matrix, opt,
                                 self.idx2tag, self.idx2polarity).to(opt.device)
    self._print_args()
    if torch.cuda.is_available():
        print('>>> cuda memory allocated:', torch.cuda.memory_allocated(device=opt.device.index))
def __init__(self, opt):
    """Instantiate tokenizer + model (BERT with attentions, or GloVe) and datasets."""
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        # output_attentions=True so downstream code can inspect attention maps.
        config = BertConfig.from_pretrained(opt.pretrained_bert_name, output_attentions=True)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name, config=config)
        # Snapshot of the untouched pretrained weights, kept for later reset/reuse.
        self.pretrained_bert_state_dict = bert.state_dict()
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt):
    """Restore a trained model from opt.state_dict_path and load val/test sets."""
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
        self.model.load_state_dict(torch.load(opt.state_dict_path))
        logger.info(f"Loaded model {opt.model_name} from {opt.state_dict_path}")
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
        self.model.load_state_dict(torch.load(opt.state_dict_path))
    self.valset = ABSADataset(opt.dataset_file['val'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
def __init__(self, opt):
    """Build the model (fine-tunable BERT or GloVe) and train/test data loaders."""
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        # BERT parameters are left trainable (no freezing).
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    self.train_data_loader = DataLoader(dataset=trainset, batch_size=opt.batch_size, shuffle=True)
    self.test_data_loader = DataLoader(dataset=testset, batch_size=opt.batch_size, shuffle=False)
    if opt.device.type == 'cuda':
        print("cuda memory allocated:", torch.cuda.memory_allocated(device=opt.device.index))
    self._print_args()
def __init__(self, opt):
    """Build model and datasets; carve a validation split out of the train set."""
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        # No dedicated validation split: validate on the test set.
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt):
    """Prepare tokenizer, GloVe embedding matrix, sentence datasets and model."""
    self.opt = opt
    tokenizer = build_tokenizer(
        fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
        max_length=opt.max_length,
        data_file='{0}_tokenizer.dat'.format(opt.dataset))
    embedding_matrix = build_embedding_matrix(
        vocab=tokenizer.vocab,
        embed_dim=opt.embed_dim,
        data_file='{0}d_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
    trainset = SentenceDataset(opt.dataset_file['train'], tokenizer,
                               target_dim=self.opt.polarities_dim)
    testset = SentenceDataset(opt.dataset_file['test'], tokenizer,
                              target_dim=self.opt.polarities_dim)
    self.train_dataloader = DataLoader(dataset=trainset, batch_size=opt.batch_size, shuffle=True)
    self.test_dataloader = DataLoader(dataset=testset, batch_size=opt.batch_size, shuffle=False)
    self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    if opt.device.type == 'cuda':
        print('cuda memory allocated:', torch.cuda.memory_allocated(self.opt.device.index))
    self._print_args()
def __init__(self, opt):
    """Rebuild tokenizer/embeddings, load trained weights, and switch to eval mode."""
    super(Inferrer, self).__init__()
    self.opt = opt
    self.tokenizer = build_tokenizer(
        fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
        max_seq_len=opt.max_seq_len,
        ngram=opt.ngram,
        min_count=opt.min_count,
        dat_fname='./bin/{0}_tokenizer.dat'.format(opt.dataset))
    # +2 leaves room for the padding and OOV indices.
    opt.vocab_size = len(self.tokenizer.token2idx) + 2
    opt.ngram_vocab_sizes = [len(self.tokenizer.ngram2idx[n]) + 2
                             for n in range(2, opt.ngram + 1)]
    embedding_matrix = build_embedding_matrix(
        token2idx=self.tokenizer.token2idx,
        embed_dim=opt.embed_dim,
        embed_file=opt.embed_file,
        dat_fname='./bin/{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
    self.model = opt.model_class(opt, embedding_matrix)
    print('loading model {0} ...'.format(opt.model_name))
    self.model.load_state_dict(torch.load(opt.state_dict_path))
    self.model = self.model.to(opt.device)
    # Inference only: evaluation mode, gradients globally disabled.
    self.model.eval()
    torch.autograd.set_grad_enabled(False)
def __init__(self, opt):
    """Build tokenizer, embeddings, data loaders and the model."""
    self.opt = opt
    tokenizer = build_tokenizer(
        fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
        max_seq_len=opt.max_seq_len,
        dat_fname='{0}_tokenizer_train.dat'.format(opt.dataset))
    embedding_matrix = build_embedding_matrix(
        word2idx=tokenizer.word2idx,
        embed_dim=opt.embed_dim,
        dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
    trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    self.train_data_loader = DataLoader(dataset=trainset, batch_size=opt.batch_size, shuffle=True)
    self.test_data_loader = DataLoader(dataset=testset, batch_size=opt.batch_size, shuffle=False)
    self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    if opt.device.type == 'cuda':
        print("cuda memory allocated:", torch.cuda.memory_allocated(device=opt.device.index))
    self._print_args()
def __init__(self, opt, dtl_param):
    """Wrapper for running HAOFL-based models.

    :param opt: object holding all hyper-parameters
    :param dtl_param: parameter string for the data-transformation method
        used in the DTL layer
    """
    self.opt = opt
    if 'bert' in self.opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, 'bert-base-uncased')
        self.model = opt.model_class(opt, tokenizer).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='normal_tokenizer.dat')
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_embedding_matrix.dat'.format(str(opt.embed_dim)))
        self.model = opt.model_class(opt, tokenizer, embedding_matrix).to(opt.device)
    self.train_set = TrainDataset(opt.dataset_file['train'], tokenizer, opt,
                                  opt.dtl_method, dtl_param, opt.name_tail)
    # NOTE(review): validation set is built from the test file.
    self.val_set = TrainDataset(opt.dataset_file['test'], tokenizer, opt,
                                opt.dtl_method, dtl_param, opt.name_tail)
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, config):
    """Set up ASTE data reader, embeddings, bucket iterators and the model."""
    self.config = config
    aste_data_reader = ASTEDataReader(data_dir=config.data_dir)
    tokenizer = build_tokenizer(data_dir=config.data_dir)
    target_vocab = Tokenizer()
    embedding_matrix = build_embedding_matrix(
        config.data_dir, tokenizer.word2idx, config.emb_dim, config.dataset)
    embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float).to(config.device)
    self.train_data_loader = BucketIterator(
        data=aste_data_reader.get_train(tokenizer), batch_size=config.batch_size,
        shuffle=True, tokenizer=tokenizer, max_decode=config.tgt_max_train,
        device=config.device)
    self.dev_data_loader = BucketIterator(
        data=aste_data_reader.get_dev(tokenizer), batch_size=config.batch_size,
        shuffle=False, tokenizer=tokenizer, max_decode=config.tgt_max_test,
        device=config.device)
    self.test_data_loader = BucketIterator(
        data=aste_data_reader.get_test(tokenizer), batch_size=config.batch_size,
        shuffle=False, tokenizer=tokenizer, max_decode=config.tgt_max_test,
        device=config.device)
    self.model = config.model_class(embedding_matrix=embedding_matrix,
                                    config=config, vocab=tokenizer)
    self.model = self.model.to(config.device)
    if torch.cuda.is_available():
        print('>>> cuda memory allocated:', torch.cuda.memory_allocated(device=config.device.index))
def __init__(self, opt):
    """Seed RNGs, build vocab/embeddings, and load main/aux datasets."""
    self.opt = opt
    self._set_seed(opt.seed)
    # zeta only applies to methods whose name contains 'M'; otherwise disabled.
    opt.zeta = opt.zeta if 'M' in opt.method else 0.0
    opt.tokenizer = build_tokenizer(domains=opt.domains, fnames=opt.dataset_file.values())
    embedding_matrix = build_embedding_matrix(domains=opt.domains,
                                              vocab=opt.tokenizer.vocab['word'])
    self.trainset = MyDataset(side='main', tasks=opt.tasks, domains=opt.domains,
                              fname=opt.dataset_file['train'], tokenizer=opt.tokenizer)
    self.testset = MyDataset(side='main', tasks=opt.tasks, domains=opt.domains,
                             fname=opt.dataset_file['test'], tokenizer=opt.tokenizer)
    self.auxset = MyDataset(side='aux', tasks=opt.tasks, domains=opt.domains,
                            fname=opt.dataset_file['aux'], tokenizer=opt.tokenizer)
    self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self._print_args()
def __init__(self, opt):
    """Build model and datasets; optionally binarize polarity labels."""
    self.opt = opt
    tokenizer = build_tokenizer(
        fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
        max_seq_len=opt.max_seq_len,
        dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
    embedding_matrix = build_embedding_matrix(
        word2idx=tokenizer.word2idx,
        embed_dim=opt.embed_dim,
        dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
    self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    if opt.polarities_dim == 2:
        # Binary setting: drop neutral samples (label 1), then map 0/2 -> 0/1.
        self.trainset = [s for s in self.trainset if s['polarity'] != 1]
        for sample in self.trainset:
            sample['polarity'] = int(sample['polarity'] / 2)
        self.testset = [s for s in self.testset if s['polarity'] != 1]
        for sample in self.testset:
            sample['polarity'] = int(sample['polarity'] / 2)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt):
    """Build model and (T)ABSA datasets according to the task flags in *opt*.

    Flags consulted: ``classifier``/``classifier_with_absa(_target)`` select the
    classifier dataset; ``tabsa``/``tabsa_with_absa``/``gating`` select how the
    train/test TABSA datasets are constructed; ``valset_ratio`` controls the
    validation split.
    """
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        # NOTE(review): hardcoded user-specific path -- consider making this an option.
        bert = BertModel.from_pretrained('/home/yinrongdi/bert/bert-base-uncased.tar.gz')
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='temp_data/' + '{0}_tokenizer.dat'.format(opt.dataset),
            step=4 if opt.tabsa else 3)
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='temp_data/' + '{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    if opt.classifier:
        if opt.classifier_with_absa_target:
            self.classifierset = TABSADataset(
                opt.dataset_file['classifier_absa_target'], tokenizer, False)
        elif opt.classifier_with_absa:
            self.classifierset = TABSADataset(opt.dataset_file['classifier'], tokenizer, True)
        else:
            self.classifierset = TABSADataset(opt.dataset_file['classifier'], tokenizer, False)
    if opt.tabsa:
        # Derive the dataset flags once so train and test are always consistent.
        # FIX: previously the (not tabsa_with_absa, gating) branch built the test
        # set with absa=True and no gating flag, disagreeing with its train set.
        with_absa = opt.tabsa_with_absa
        if opt.gating:
            self.trainset = TABSADataset(opt.dataset_file['train'], tokenizer, with_absa, True)
            self.testset = TABSADataset(opt.dataset_file['test'], tokenizer, with_absa, True)
        else:
            self.trainset = TABSADataset(opt.dataset_file['train'], tokenizer, with_absa)
            self.testset = TABSADataset(opt.dataset_file['test'], tokenizer, with_absa)
    else:
        self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
        self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt):
    """Prepare everything needed to train RepWalk.

    Builds the token-to-index tokenizer, the pretrained GloVe embedding
    matrix, the train/test datasets and the model, then prints the arguments.
    """
    self.opt = opt
    opt.tokenizer = build_tokenizer(fnames=opt.dataset_file.values(), dataset=opt.dataset)
    embedding_matrix = build_embedding_matrix(vocab=opt.tokenizer.vocab['word'],
                                              dataset=opt.dataset)
    self.trainset = MyDataset(fname=opt.dataset_file['train'], tokenizer=opt.tokenizer)
    self.testset = MyDataset(fname=opt.dataset_file['test'], tokenizer=opt.tokenizer)
    self.model = RepWalk(embedding_matrix, opt).to(opt.device)
    self._print_args()
def __init__(self, opt, model_classes):
    """Build tokenizer, datasets, validation split and the model.

    :param opt: hyper-parameter/option namespace
    :param model_classes: mapping model_name -> model class (non-BERT path)
    """
    self.opt = opt
    if 'bert' in opt.model_name:
        self.tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
    else:
        self.tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=self.tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
    self.trainset = ABSADataset(opt.dataset_file['train'], self.tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], self.tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    if 'bert' in opt.model_name:
        # Local import kept: pytorch_transformers is only needed on this path.
        from pytorch_transformers import BertModel
        logger.info('loading pretrained BERT model (with attentions) ...')
        self.bert = BertModel.from_pretrained(opt.pretrained_bert_name,
                                              output_attentions=True,
                                              cache_dir="pretrained/bert/")
        self.model = opt.model_class(self.bert, opt).to(opt.device)
    else:
        self.model = model_classes[opt.model_name](embedding_matrix, opt).to(opt.device)
    self._print_args()
def __init__(self, opt):
    """Initialize model and preprocessing (tokenization).

    :param opt: argparse namespace of options
    """
    self.opt = opt
    # BERT-family models get a BERT tokenizer/backbone; others use GloVe.
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name,
                                   cache_dir=opt.pretrained_bert_cache_dir)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name,
                                         cache_dir=opt.pretrained_bert_cache_dir)
        # Hand the BERT backbone plus options to the task-specific model.
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        # Custom tokenizer producing word2idx / idx2word maps.
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        # Embedding matrix of shape [word_nums, embedding_dimension].
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer,
                                recreate_caches=opt.recreate_caches)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer,
                               recreate_caches=opt.recreate_caches)
    # When valset_ratio is 0, the test set doubles as the validation set.
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda 可用内存: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt):
    """Load a trained tagging model (CPU-mapped weights) and enter eval mode."""
    self.opt = opt
    absa_data_reader = ABSADataReader(data_dir=opt.data_dir)
    self.tokenizer = build_tokenizer(data_dir=opt.data_dir)
    embedding_matrix = build_embedding_matrix(opt.data_dir, self.tokenizer.word2idx,
                                              opt.embed_dim, opt.dataset)
    self.idx2tag = absa_data_reader.reverse_tag_map
    self.idx2polarity = absa_data_reader.reverse_polarity_map
    self.model = opt.model_class(embedding_matrix, opt,
                                 self.idx2tag, self.idx2polarity).to(opt.device)
    print('loading model {0} ...'.format(opt.model_name))
    # map_location keeps loading working even when the checkpoint was saved on GPU.
    self.model.load_state_dict(
        torch.load(opt.state_dict_path, map_location=lambda storage, loc: storage))
    # Inference only: evaluation mode, gradients globally disabled.
    self.model.eval()
    torch.autograd.set_grad_enabled(False)
def __init__(self, opt):
    """Restore a trained sentence-classification model for inference.

    Loads (or builds and caches) the dataset tokenizer, rebuilds the GloVe
    embedding matrix, restores weights from ``opt.state_dict_path`` and puts
    the model into evaluation mode with gradients disabled.
    """
    self.opt = opt
    # Per-dataset train/dev(or test) file locations.
    fname = {
        'cr': {'train': './datasets/cr/train.csv', 'test': './datasets/cr/dev.csv'},
        'mr': {'train': './datasets/mr/train.csv', 'test': './datasets/mr/dev.csv'},
        'mpqa': {'train': './datasets/mpqa/train.csv', 'test': './datasets/mpqa/dev.csv'},
        'subj': {'train': './datasets/subj/train.csv', 'test': './datasets/subj/dev.csv'},
        'sst2': {'train': './datasets/sst2/train.csv', 'test': './datasets/sst2/test.csv'},
        'TREC': {'train': './datasets/TREC/train.csv', 'test': './datasets/TREC/test.csv'},
    }
    if os.path.exists(opt.dataset + '_word2idx.pkl'):
        # Reuse the cached vocabulary built during training.
        print("loading {0} tokenizer...".format(opt.dataset))
        with open(opt.dataset + '_word2idx.pkl', 'rb') as f:
            word2idx = pickle.load(f)
        self.tokenizer = Tokenizer(word2idx=word2idx)
    else:
        print("reading {0} dataset...".format(opt.dataset))
        text = ABSADatesetReader.__read_text__(
            [fname[opt.dataset]['train'], fname[opt.dataset]['test']])
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_text(text)
        with open(opt.dataset + '_word2idx.pkl', 'wb') as f:
            pickle.dump(self.tokenizer.word2idx, f)
    embedding_matrix = build_embedding_matrix(self.tokenizer.word2idx,
                                              opt.embed_dim, opt.dataset)
    self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    print('loading model {0} ...'.format(opt.model_name))
    self.model.load_state_dict(torch.load(opt.state_dict_path))
    # (removed dead no-op `self.model = self.model`)
    # Inference only: evaluation mode, gradients globally disabled.
    self.model.eval()
    torch.autograd.set_grad_enabled(False)
def get_embedding_matrix(in_vocabulary, in_config):
    """Return the embedding matrix for *in_vocabulary*, caching it on disk.

    On first call, builds the matrix from the Google News word2vec vectors and
    saves it to ``in_config['embedding_matrix']``; later calls load the cache.
    """
    embeddings_file = in_config['embedding_matrix']
    if path.exists(embeddings_file):
        logger.info('Skipping embeddings creating step')
        return np.load(embeddings_file)
    logger.info('Creating embeddings matrix')
    w2v = Word2Vec.load_word2vec_format(
        '../word2vec_google_news/GoogleNews-vectors-negative300.bin',
        binary=True
    )
    embedding_matrix = build_embedding_matrix(w2v, in_vocabulary)
    with open(embeddings_file, 'wb') as embeddings_out:
        np.save(embeddings_out, embedding_matrix)
    return embedding_matrix
def __init__(self, opt):
    """Restore tokenizer + model from cached files for CPU-side inference."""
    self.opt = opt
    print("loading {0} tokenizer...".format(opt.dataset))
    with open(opt.dataset + '_word2idx.pkl', 'rb') as f:
        word2idx = pickle.load(f)
    self.tokenizer = Tokenizer(word2idx=word2idx)
    embedding_matrix = build_embedding_matrix(self.tokenizer.word2idx,
                                              opt.embed_dim, opt.dataset)
    # NOTE(review): model is intentionally not moved to opt.device here; weights
    # are mapped to CPU storage below.
    self.model = opt.model_class(embedding_matrix, opt)
    print('loading model {0} ...'.format(opt.model_name))
    self.model.load_state_dict(
        torch.load(opt.state_dict_path, map_location=lambda storage, loc: storage))
    # Inference only: evaluation mode, gradients globally disabled.
    self.model.eval()
    torch.autograd.set_grad_enabled(False)
def __init__(self, opt):
    """Restore a trained ABSA model for inference.

    Loads (or builds and caches) the dataset vocabulary, rebuilds the GloVe
    embedding matrix, restores weights from ``opt.state_dict_path`` and puts
    the model into evaluation mode with gradients disabled.
    """
    self.opt = opt
    # Per-dataset raw train/test file locations.
    fname = {
        'twitter': {'train': './datasets/acl-14-short-data/train.raw',
                    'test': './datasets/acl-14-short-data/test.raw'},
        'rest14': {'train': './datasets/semeval14/restaurant_train.raw',
                   'test': './datasets/semeval14/restaurant_test.raw'},
        'lap14': {'train': './datasets/semeval14/laptop_train.raw',
                  'test': './datasets/semeval14/laptop_test.raw'},
        'rest15': {'train': './datasets/semeval15/restaurant_train.raw',
                   'test': './datasets/semeval15/restaurant_test.raw'},
        'rest16': {'train': './datasets/semeval16/restaurant_train.raw',
                   'test': './datasets/semeval16/restaurant_test.raw'},
    }
    if os.path.exists(opt.dataset + '_word2idx.pkl'):
        # Reuse the cached vocabulary built during training.
        print("loading {0} tokenizer...".format(opt.dataset))
        with open(opt.dataset + '_word2idx.pkl', 'rb') as f:
            word2idx = pickle.load(f)
        self.tokenizer = Tokenizer(word2idx=word2idx)
    else:
        print("reading {0} dataset...".format(opt.dataset))
        text = ABSADatesetReader.__read_text__(
            [fname[opt.dataset]['train'], fname[opt.dataset]['test']])
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_text(text)
        with open(opt.dataset + '_word2idx.pkl', 'wb') as f:
            pickle.dump(self.tokenizer.word2idx, f)
    embedding_matrix = build_embedding_matrix(self.tokenizer.word2idx,
                                              opt.embed_dim, opt.dataset)
    self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    print('loading model {0} ...'.format(opt.model_name))
    self.model.load_state_dict(torch.load(opt.state_dict_path))
    # (removed dead no-op `self.model = self.model`)
    # Inference only: evaluation mode, gradients globally disabled.
    self.model.eval()
    torch.autograd.set_grad_enabled(False)
def __init__(self, opt):
    """Rebuild tokenizer/embeddings, restore weights, and enter eval mode."""
    self.opt = opt
    self.tokenizer = build_tokenizer(
        fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
        max_seq_len=opt.max_seq_len,
        dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
    embedding_matrix = build_embedding_matrix(
        word2idx=self.tokenizer.word2idx,
        embed_dim=opt.embed_dim,
        dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
    self.model = opt.model_class(embedding_matrix, opt)
    print('loading model {0} ...'.format(opt.model_name))
    self.model.load_state_dict(torch.load(opt.state_dict_path))
    self.model = self.model.to(opt.device)
    # Inference only: evaluation mode, gradients globally disabled.
    self.model.eval()
    torch.autograd.set_grad_enabled(False)
def __init__(self, opt):
    """Build model (multi-GPU-capable BERT or GloVe), datasets and val split."""
    self.opt = opt
    if 'bert' in opt.model_name:
        # NOTE(review): hardcoded machine-specific vocab/model paths below --
        # consider promoting them to options.
        tokenizer = Tokenizer4Bert(
            opt.max_seq_len, '/data/kkzhang/aaa/command/bert-base-uncased-vocab.txt')
        bert = BertModel.from_pretrained('/data/kkzhang/WordeEmbedding/bert_base/')
        # Wrap BERT in DataParallel when several GPUs are visible.
        if torch.cuda.device_count() > 1:
            logging.info('The device has {} gpus!!!!!!!!!!!!!'.format(
                torch.cuda.device_count()))
            bert = nn.DataParallel(bert)
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt):
    """Build n-gram tokenizer, optional pretrained embeddings, and TC datasets."""
    super(Instructor, self).__init__()
    self.opt = opt
    tokenizer = build_tokenizer(
        fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
        max_seq_len=opt.max_seq_len,
        ngram=opt.ngram,
        min_count=opt.min_count,
        dat_fname='./bin/{0}_tokenizer.dat'.format(opt.dataset))
    # +2 leaves room for the padding and OOV indices.
    opt.vocab_size = len(tokenizer.token2idx) + 2
    opt.ngram_vocab_sizes = [len(tokenizer.ngram2idx[n]) + 2
                             for n in range(2, opt.ngram + 1)]
    # Pretrained embeddings are optional: only used when the file exists.
    if opt.embed_file is not None and os.path.exists(opt.embed_file):
        embedding_matrix = build_embedding_matrix(
            token2idx=tokenizer.token2idx,
            embed_dim=opt.embed_dim,
            embed_file=opt.embed_file,
            dat_fname='./bin/{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
    else:
        embedding_matrix = None
    self.model = opt.model_class(opt, embedding_matrix).to(opt.device)
    self.train_set = TCDataset(opt.dataset_file['train'], opt.label_mapping,
                               opt.ngram, tokenizer)
    self.test_set = TCDataset(opt.dataset_file['test'], opt.label_mapping,
                              opt.ngram, tokenizer)
    assert 0 <= opt.val_set_ratio < 1
    if opt.val_set_ratio > 0:
        val_set_len = int(len(self.train_set) * opt.val_set_ratio)
        self.train_set, self.val_set = random_split(
            self.train_set, (len(self.train_set) - val_set_len, val_set_len))
    else:
        self.val_set = self.test_set
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt):
    """Build cached datasets (train/val/test), embeddings and the model.

    When no validation file is provided, a validation split can instead be
    carved from the training set via ``opt.valset_ratio``.
    """
    self.opt = opt
    # FIX: identity comparison for the None sentinel (was `== None`).
    if opt.dataset_file['val'] is None:
        fnames = [opt.dataset_file['train'], opt.dataset_file['test']]
    else:
        fnames = [opt.dataset_file['train'], opt.dataset_file['val'],
                  opt.dataset_file['test']]
    tokenizer = build_tokenizer(
        fnames,
        max_seq_len=opt.max_seq_len,
        dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
    embedding_matrix = build_embedding_matrix(
        word2idx=tokenizer.word2idx,
        embed_dim=opt.embed_dim,
        dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
    self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self.trainset = Dataset(opt.dataset_file['train'], tokenizer,
                            dat_fname='{0}_train.dat'.format(opt.dataset))
    self.testset = Dataset(opt.dataset_file['test'], tokenizer,
                           dat_fname='{0}_test.dat'.format(opt.dataset))
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        # NOTE(review): if valset_ratio is 0 AND dataset_file['val'] is None,
        # this loads Dataset(None, ...) -- confirm callers never hit that combo.
        self.valset = Dataset(opt.dataset_file['val'], tokenizer,
                              dat_fname='{0}_val.dat'.format(opt.dataset))
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt):
    """Prepare an inference-only model: build tokenizer and embeddings,
    restore the checkpoint from opt.state_dict_path, move the model to the
    target device, and globally disable autograd.

    Args:
        opt: parsed options namespace; must provide dataset_file
            ('train'/'test' paths), max_length, model_name, dataset,
            embed_dim, model_class, state_dict_path and device.
    """
    self.opt = opt
    corpus_files = [opt.dataset_file['train'], opt.dataset_file['test']]
    self.tokenizer = build_tokenizer(
        fnames=corpus_files,
        max_length=opt.max_length,
        data_file='./embedding/{0}_{1}_tokenizer.dat'.format(
            opt.model_name, opt.dataset),
    )
    embedding_matrix = build_embedding_matrix(
        vocab=self.tokenizer.vocab,
        embed_dim=opt.embed_dim,
        data_file='./embedding/{0}_{1}d_{2}_embedding_matrix.dat'.format(
            opt.model_name, str(opt.embed_dim), opt.dataset))
    model = opt.model_class(embedding_matrix, opt)
    print('loading model {0} ...'.format(opt.model_name))
    # Restore trained weights before moving to the target device.
    model.load_state_dict(torch.load(opt.state_dict_path))
    self.model = model.to(opt.device)
    # Inference only — no gradients needed anywhere after this point.
    torch.autograd.set_grad_enabled(False)
def __init__(self, opt):
    """Construct either a BERT-based or an embedding-based model, load the
    ABSA train/test datasets (with dependency graphs), and split off a
    validation set when requested.

    Args:
        opt: parsed options namespace; must provide model_name, max_seq_len,
            pretrained_bert_name, dataset_file ('train'/'test' paths),
            dataset, embed_dim, model_class, valset_ratio and device.
    """
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name,
                                         output_hidden_states=True)
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    # NOTE(review): both splits load the *train* graph file; verify whether
    # the test set should use a law_test graph instead.
    graph_path = './datasets/semeval14/law_train.raw.graph'
    self.trainset = ABSADataset(opt.dataset_file['train'], graph_path,
                                tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], graph_path,
                               tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        # Carve the validation split out of the training data.
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        # No held-out split requested: validate on the test set.
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt):
    """Prepare the tagging model for inference and open an S3 bucket handle.

    Args:
        opt: parsed options namespace; must provide data_dir, embed_dim,
            dataset, model_class, model_name and device.
    """
    self.opt = opt
    reader = ABSADataReader(data_dir=opt.data_dir)
    self.tokenizer = build_tokenizer(data_dir=opt.data_dir)
    embedding_matrix = build_embedding_matrix(opt.data_dir,
                                              self.tokenizer.word2idx,
                                              opt.embed_dim, opt.dataset)
    self.idx2tag = reader.reverse_tag_map
    self.idx2polarity = reader.reverse_polarity_map
    self.model = opt.model_class(embedding_matrix, opt, self.idx2tag,
                                 self.idx2polarity).to(opt.device)
    print('loading model {0} ...'.format(opt.model_name))
    # NOTE(review): the checkpoint restore (load_state_dict) is currently
    # disabled, so the model runs with its initial weights — confirm intent.
    # Inference only: switch to evaluation mode.
    self.model.eval()
    # NOTE(review): hard-coded placeholder AWS credentials — prefer the
    # default boto3 credential chain / environment variables.
    session = boto3.Session(
        aws_access_key_id='XXXXXXXXXXXX',
        aws_secret_access_key='XXXXXXXX',
        region_name='XXXXXXXX')
    self.s3 = session.resource('s3')
    self.bucket = self.s3.Bucket('surveybuddy-responses')
    # No gradients needed anywhere after this point.
    torch.autograd.set_grad_enabled(False)