Example #1
def load_model():
    model_dir = '../../model/model/'
    config = BertConfig.from_pretrained('../../model/bert-cased/',
                                        num_labels=3,
                                        output_attentions=True)
    model = BertAttn(config,
                     option='feed',
                     dropout=0.1,
                     gpu=False,
                     seed=0,
                     do_lower_case=False)
    class_weights = [0.6058, 0.1161, 0.2781]
    model.set_focal_loss(alpha=class_weights, gamma=-1)
    model.load_model(True, model_dir)
    return model
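BertConfig.from_pretrained is a classmethod that returns a new config object, so overrides belong in the same call rather than being applied to an existing instance. A minimal sketch of the pattern (the model name is a placeholder):

from transformers import BertConfig

# from_pretrained returns a fresh BertConfig; keyword arguments override the
# values loaded from the checkpoint directory or model name.
config = BertConfig.from_pretrained('bert-base-cased',
                                    num_labels=3,
                                    output_attentions=True)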
Example #2
def createCsvData():
    config = BertConfig.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # Load pretrained weights; BertModel(config) alone would leave them
    # randomly initialized, making the extracted vectors meaningless.
    model = BertModel.from_pretrained('bert-base-uncased', config=config)
    with Cd("lemmadata"):
        with open("id_to_sent.json") as sent_id_dict_file:
            sent_id_dict = json.load(sent_id_dict_file)
        for dir_item in os.listdir():
            if os.path.isfile(dir_item):
                if dir_item.endswith(".json") and dir_item != "id_to_sent.json":
                    print(dir_item)
                    with open(dir_item, "r") as f:
                        lemma_data = json.load(f)
                    with Cd("vectors"):
                        with open(dir_item[:-5]+".csv", "w") as vector_file:
                            writer = csv.writer(vector_file, delimiter=",")
                            for instance in lemma_data:
                                inst_sent_id = instance["sent_id"]
                                inst_sense = instance["sense"]
                                inst_sent = sent_id_dict[str(inst_sent_id)]
                                if len(inst_sent) > 511:
                                    continue
                                vector = vectorizeWordInContext(inst_sent, instance["pos"], tokenizer, model)
                                vec_list = vector.detach().tolist()
                                row_data = [inst_sent_id, instance["pos"], inst_sense] + vec_list
                                writer.writerow(row_data)
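The helper vectorizeWordInContext is not shown in this excerpt; a minimal sketch of what such a function might do, assuming `sent` is a list of tokens and `pos` indexes the target word (the body below is an assumption, only the name and signature come from the call above):

import torch

def vectorizeWordInContext(sent, pos, tokenizer, model):
    # Hypothetical sketch: encode the sentence, run BERT, and return the
    # hidden state of the token at the target position.
    input_ids = torch.tensor([tokenizer.encode(" ".join(sent))])
    with torch.no_grad():
        last_hidden = model(input_ids)[0]  # (1, seq_len, hidden_size)
    # Simplification: ignores subword splitting and the leading [CLS] offset.
    return last_hidden[0, pos]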
Example #3
def main():

    bert_base_config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2)
    bert_base_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=bert_base_config)
    count = 0
    for name, param in bert_base_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in bert_base_uncased: ', count)

    roberta_config = RobertaConfig.from_pretrained('roberta-base', num_labels=2)
    roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base',config=roberta_config)
    count = 0
    for name, param in roberta_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in roberta: ', count)

    albert_config = AlbertConfig.from_pretrained('albert-base-v2', num_labels=2)
    albert_model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', config=albert_config)
    count = 0
    for name, param in albert_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in albert: ', count)
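Each inner loop multiplies out the dimensions of one tensor, which torch exposes directly as Tensor.numel(); an equivalent one-liner for any of the three counts above:

count = sum(p.numel() for p in bert_base_model.parameters() if p.requires_grad)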
Example #4
 def __init__(self, name='bert-base-uncased', dropout=0.1, num_class=2):
     super(BertC, self).__init__()
     config = BertConfig.from_pretrained(name)
     self.bert = BertModel_attack(config)
     self.proj = nn.Linear(config.hidden_size, num_class)
     self.loss_f = nn.CrossEntropyLoss()
     self.drop = nn.Dropout(p=dropout)
Example #5
    def __init__(self, opt):
        self.opt = opt

        if 'bert' in opt.model_name:
            tokenizer = Tokenizer4Bert(opt.max_seq_len,
                                       opt.pretrained_bert_name)
            # bert = BertModel.from_pretrained(opt.pretrained_bert_name)
            config = BertConfig.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=True)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name,
                                             config=config)
            self.pretrained_bert_state_dict = bert.state_dict()
            self.model = opt.model_class(bert, opt).to(opt.device)
        else:
            tokenizer = build_tokenizer(
                fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
                max_seq_len=opt.max_seq_len,
                dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
            embedding_matrix = build_embedding_matrix(
                word2idx=tokenizer.word2idx,
                embed_dim=opt.embed_dim,
                dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                    str(opt.embed_dim), opt.dataset))
            self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

        self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
        self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(device=opt.device.index)))
        self._print_args()
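The snapshot of bert.state_dict() is presumably kept so the encoder can later be restored to its pretrained weights, for example between repeated training runs. A plausible sketch of such a reset, where the model's .bert attribute and the method name are assumptions:

    def _reset_params(self):
        # Hypothetical sketch: restore the pretrained BERT weights saved in
        # __init__ before re-initializing the task-specific layers.
        self.model.bert.load_state_dict(self.pretrained_bert_state_dict)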
Example #6
    def __init__(self, args, dictionary, left_pad=False):
        super().__init__(dictionary)
        self.dropout = args.dropout

        from pytorch_transformers import RobertaModel, BertModel
        from pytorch_transformers.file_utils import PYTORCH_TRANSFORMERS_CACHE
        from pytorch_transformers import RobertaConfig, RobertaTokenizer, BertConfig, BertTokenizer

        if args.pretrained_bert_model.startswith('roberta'):
            self.embed = RobertaModel.from_pretrained(
                args.pretrained_bert_model,
                cache_dir=PYTORCH_TRANSFORMERS_CACHE /
                'distributed_{}'.format(args.distributed_rank))
            # self.context = RobertaModel.from_pretrained(args.pretrained_bert_model,
            #         cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
            self.config = RobertaConfig.from_pretrained(
                args.pretrained_bert_model)
            self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

        else:
            self.embed = BertModel.from_pretrained(
                args.pretrained_bert_model,
                cache_dir=PYTORCH_TRANSFORMERS_CACHE /
                'distributed_{}'.format(args.distributed_rank))
            # self.context = BertModel.from_pretrained(args.pretrained_bert_model,
            #         cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
            self.config = BertConfig.from_pretrained(
                args.pretrained_bert_model)

            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.padding_idx = self.tokenizer.convert_tokens_to_ids(
            self.tokenizer.pad_token)
Example #7
    def __init__(self): 
        super(Bert, self).__init__()

        self.tokenizer = BertTokenizer.from_pretrained(os.path.join(config.get('model_config')['language_model_path'], 'bert-base-uncased-vocab.txt'))
        modelConfig = BertConfig.from_pretrained(os.path.join(config.get('model_config')['language_model_path'], 'bert_config.json'))
        self.textExtractor = BertModel.from_pretrained(
            os.path.join(config.get('model_config')['language_model_path'], 'pytorch_model.bin'), config=modelConfig)
Example #8
    def __init__(self, vocab_size, tag_to_ix, hidden_dim, n_layers):
        super(BERT_BiLSTM_CRF, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        config = BertConfig.from_pretrained('bert-base-multilingual-cased')
        # Note: BertModel(config) builds the architecture with randomly
        # initialized weights; use BertModel.from_pretrained if pretrained
        # contextual embeddings are expected here.
        self.model = BertModel(config)

        self.lstm = nn.LSTM(768,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim * 2, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size, device=device))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()
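init_hidden is called but not defined in this excerpt; a common implementation for a bidirectional nn.LSTM, assuming batch size 1 (shapes follow the (num_layers * num_directions, batch, hidden_size) convention):

    def init_hidden(self):
        # Hypothetical sketch: zero-initialized (h_0, c_0) for the BiLSTM above.
        return (torch.zeros(2 * self.n_layers, 1, self.hidden_dim, device=device),
                torch.zeros(2 * self.n_layers, 1, self.hidden_dim, device=device))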
Example #9
 def load_model(model_name: str, do_lower_case=False):
     config = BertConfig.from_pretrained(model_name)
     tokenizer = BertTokenizer.from_pretrained(model_name,
                                               do_lower_case=do_lower_case)
     model = BertForQuestionAnswering.from_pretrained(model_name,
                                                      from_tf=False,
                                                      config=config)
     return model, tokenizer
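A minimal usage sketch for the returned pair, assuming a recent transformers release and a SQuAD-finetuned checkpoint (the model name and the question/context strings are placeholders):

import torch

model, tokenizer = load_model(
    'bert-large-uncased-whole-word-masking-finetuned-squad')
input_ids = tokenizer.encode('Who wrote Hamlet?',
                             'Hamlet is a tragedy by William Shakespeare.')
with torch.no_grad():
    outputs = model(torch.tensor([input_ids]))
start_logits, end_logits = outputs[0], outputs[1]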
Example #10
    def __init__(self, hidden_dim, n_layers, tagset_size):
        super(BertLSTM, self).__init__()
        config = BertConfig.from_pretrained('bert-base-multilingual-cased')
        self.model = BertModel(config)

        self.decoder = nn.LSTM(768, hidden_dim, n_layers)

        self.hiddentotag = nn.Linear(hidden_dim, tagset_size)
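The matching forward pass is not shown; a minimal sketch, assuming token ids shaped (batch, seq_len) and the default (seq, batch, features) layout of nn.LSTM:

    def forward(self, input_ids):
        # Hypothetical sketch: contextual embeddings from BERT, then LSTM decoding.
        bert_out = self.model(input_ids)[0]   # (batch, seq_len, 768)
        lstm_in = bert_out.permute(1, 0, 2)   # (seq_len, batch, 768)
        lstm_out, _ = self.decoder(lstm_in)
        return self.hiddentotag(lstm_out)     # (seq_len, batch, tagset_size)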
Example #11
 def load_model(self, model_path: str, do_lower_case=False):
     config = BertConfig.from_pretrained(model_path + "/config.json")
     tokenizer = BertTokenizer.from_pretrained(model_path,
                                               do_lower_case=do_lower_case)
     model = BertForQuestionAnswering.from_pretrained(model_path,
                                                      from_tf=False,
                                                      config=config)
     return model, tokenizer
Example #12
def start_inference(data, dialogue_type, dest, batchsize, bert_model, cuda):

    assert torch.cuda.is_available(
    ) == True, 'PyTorch not running on GPU! #sadpanda'

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(100)

    dialogue_type_dict = {'DB': 'db_response_new', 'normal': 'response'}

    config = BertConfig.from_pretrained(bert_model)
    tokenizer = BertTokenizer.from_pretrained(bert_model)
    # Load pretrained weights; constructing from the config alone would run
    # inference with a randomly initialized model.
    model = BertForNextSentencePrediction.from_pretrained(bert_model,
                                                          config=config)
    model.cuda()
    model.eval()

    df = pd.read_csv(data, usecols=['id'])
    df.dropna(inplace=True)
    row_count = df.shape[0]
    del df

    chunk_count = math.ceil(row_count / batchsize)

    with open(dest, 'w+'):
        pass

    cols = ['context', dialogue_type_dict[dialogue_type]]
    for i, chunk in enumerate(
            tqdm(pd.read_csv(open(data, 'r'),
                             usecols=cols,
                             chunksize=batchsize),
                 desc='Batches',
                 total=chunk_count)):
        samples = get_batch(chunk, dialogue_type_dict[dialogue_type])

        assert len(samples) == chunk.shape[0], 'Some samples went missing!'

        if batchsize == 1:
            results = convert_single_example_to_features(samples, tokenizer)
        else:
            results = convert_examples_to_features(samples, tokenizer)

        with torch.no_grad():
            input_ids = torch.tensor([x.input_ids for x in results]).cuda()
            token_type_ids = torch.tensor([x.input_type_ids
                                           for x in results]).cuda()
            attention_mask = torch.tensor([x.input_mask
                                           for x in results]).cuda()

            outputs = model(input_ids,
                            token_type_ids=token_type_ids,
                            attention_mask=attention_mask)[0]
            outputs = torch.softmax(outputs, dim=1)
        db_probs = outputs[:, 1]

        with open(dest, 'a') as f:
            f.write('\n'.join([str(x) for x in db_probs.tolist()]) + '\n')
Example #13
def main():
    torch.cuda.empty_cache()
    parser = setup_parser()
    args = parser.parse_args()
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory already exists and is not empty.")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    args.n_gpu = torch.cuda.device_count()
    args.device = device
    set_seed(args)
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: {}".format(args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()

    ## Load models
    config = BertConfig.from_pretrained(args.config_name)
    tokenizer = BertTokenizer.from_pretrained(args.text_encoder_checkpoint,
                                              do_lower_case=args.do_lower_case)
    text_encoder = BertModel.from_pretrained(args.text_encoder_checkpoint,
                                             config=config)
    graph_encoder = GraphEncoder(args.n_hidden, args.min_score)

    medsts_classifier = PairClassifier(config.hidden_size + args.n_hidden, 1)
    medsts_c_classifier = PairClassifier(config.hidden_size + args.n_hidden, 5)
    medsts_c2_classifier = PairClassifier(config.hidden_size + args.n_hidden,
                                          2)
    medsts_type_classifier = PairClassifier(config.hidden_size + args.n_hidden,
                                            4)
    model = MedstsNet(text_encoder, graph_encoder, medsts_classifier,
                      medsts_c_classifier, medsts_c2_classifier,
                      medsts_type_classifier)
    if args.text_only:
        medsts_classifier = PairClassifier(config.hidden_size, 1)
        medsts_c_classifier = PairClassifier(config.hidden_size, 5)
        medsts_c2_classifier = PairClassifier(config.hidden_size, 2)
        medsts_type_classifier = PairClassifier(config.hidden_size, 4)
        model = MedstsNet_Textonly(text_encoder, medsts_classifier,
                                   medsts_c_classifier, medsts_c2_classifier,
                                   medsts_type_classifier)

    model.to(args.device)

    args.n_gpu = 1

    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False,
                                                reverse=True)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info('global step = {}, average loss = {}'.format(
            global_step, tr_loss))
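PairClassifier is project-specific code; a plausible minimal version consistent with how it is constructed above (an input width and a number of outputs), purely as an assumption:

import torch.nn as nn

class PairClassifier(nn.Module):
    # Hypothetical sketch: a linear head over a fused pair representation.
    def __init__(self, in_dim, num_out):
        super().__init__()
        self.fc = nn.Linear(in_dim, num_out)

    def forward(self, fused):
        # `fused` concatenates text and graph encodings, hence the
        # hidden_size + n_hidden input width used above.
        return self.fc(fused)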
Example #14
def main():
    torch.cuda.empty_cache()
    parser = setup_parser()
    args = parser.parse_args()
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory already exists and is not empty.")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    args.n_gpu = torch.cuda.device_count()
    args.device = device
    set_seed(args)
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: {}".format(args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    ## Load models
    config = BertConfig.from_pretrained(args.config_name)
    tokenizer = BertTokenizer.from_pretrained(args.text_encoder_checkpoint,
                                              do_lower_case=args.do_lower_case)
    text_encoder = BertModel.from_pretrained(args.text_encoder_checkpoint,
                                             config=config)
    graph_encoder = GraphEncoder(args.n_hidden, args.min_score)
    if args.graph_encoder_checkpoint:
        graph_encoder.gcnnet.load_state_dict(
            torch.load(args.graph_encoder_checkpoint))

    medsts_classifier = PairClassifier(config.hidden_size + args.n_hidden, 1)
    medsts_c_classifier = PairClassifier(config.hidden_size + args.n_hidden, 5)
    medsts_type_classifier = PairClassifier(config.hidden_size + args.n_hidden,
                                            4)
    model = MedstsNet(text_encoder, graph_encoder, medsts_classifier,
                      medsts_c_classifier, medsts_type_classifier)
    model.to(args.device)

    args.n_gpu = 1

    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info('global step = {}, average loss = {}'.format(
            global_step, tr_loss))
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        logger.info("saving model checkpoint to {}".format(args.output_dir))
        model_to_save = model.module if hasattr(model, 'module') else model
        # model_to_save.save_pretrained(args.output_dir)
        torch.save(model_to_save.state_dict(),
                   os.path.join(args.output_dir, 'saved_model.pth'))
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
Example #15
    def __init__(self, code_length):  # code_length is the dimension the fc layer maps to
        super(TextNet, self).__init__()

        modelConfig = BertConfig.from_pretrained(
            './data/bert-base-uncased-config.json')
        self.textExtractor = BertModel.from_pretrained(
            './data/bert-base-uncased-pytorch_model.bin', config=modelConfig)
        # self.textExtractor.eval()
        embedding_dim = self.textExtractor.config.hidden_size
Example #16
def load_artifacts(model_path):
    """ Loads pretrained model , tokenizer , config."""
    model_class = BertForQuestionAnswering
    model = model_class.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    config = BertConfig.from_pretrained(model_path)
    model.to("cpu")
    model.eval()
    return model, tokenizer, config
Example #17
def load_artifacts(model_path):
    """ Loads pretrained model , tokenizer , config."""
    model_class = BertForSequenceClassification
    model = model_class.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    config = BertConfig.from_pretrained(model_path)
    model.to("cpu")
    model.eval()
    return model, tokenizer, config
Example #18
 def __init__(self, code_length=1024):
     super(TextNet, self).__init__()
     modelConfig = BertConfig.from_pretrained(
         '/home/hengyuli/cross-modal/model/bert_config.json')
     self.textExtractor = BertModel.from_pretrained(
         '/home/hengyuli/cross-modal/model/pytorch_model.bin',
         config=modelConfig)
     embedding_dim = self.textExtractor.config.hidden_size
     self.fc = nn.Linear(embedding_dim, code_length)
     self.tanh = torch.nn.Tanh()
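The forward pass of this TextNet is not included; a common pattern for the architecture above, as an illustrative sketch (the [CLS] pooling choice is an assumption):

 def forward(self, tokens, segments, input_masks):
     # Hypothetical sketch: encode with BERT, pool the [CLS] hidden state,
     # then project into the code space through fc + tanh.
     output = self.textExtractor(tokens,
                                 token_type_ids=segments,
                                 attention_mask=input_masks)
     text_embeddings = output[0][:, 0, :]  # [CLS] vector
     return self.tanh(self.fc(text_embeddings))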
Example #19
    def __init__(self):
        super(Bert, self).__init__()

        self.tokenizer = BertTokenizer.from_pretrained(
            '../pretrained/bert-base-uncased/bert-base-uncased-vocab.txt')
        modelConfig = BertConfig.from_pretrained(
            '../pretrained/bert-base-uncased/bert_config.json')
        self.textExtractor = BertModel.from_pretrained(
            '../pretrained/bert-base-uncased/pytorch_model.bin',
            config=modelConfig)
Example #20
    def __init__(self, code_length):  # code_length is the dimension the fc layer maps to
        super(TextNet, self).__init__()

        modelConfig = BertConfig.from_pretrained('bert-base-chinese')
        self.textExtractor = BertModel.from_pretrained('bert-base-chinese',
                                                       config=modelConfig)
        embedding_dim = self.textExtractor.config.hidden_size  # embedding_dim should be the output dimension at the point where the model is truncated

        self.fc = nn.Linear(embedding_dim, code_length)
        self.tanh = torch.nn.Tanh()
Example #21
    def __init__(self, opt):
        self.opt = opt

        if opt.model_name.lower() in ['vh_bert', 'bert_att', 'my_lcf']:
            tokenizer = BertTokenizer.from_pretrained(opt.pretrained_bert_name)
            config = BertConfig.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=True)
            self.model = opt.model_class(config).to(opt.device)
        elif 'bert' in opt.model_name.lower():
            tokenizer = Tokenizer4Bert(opt.max_seq_len,
                                       opt.pretrained_bert_name)
            config = BertConfig.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=True)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name,
                                             config=config)
            self.model = opt.model_class(bert, opt).to(opt.device)
        else:
            tokenizer = build_tokenizer(
                fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
                max_seq_len=opt.max_seq_len,
                dat_fname='./cache/{0}_tokenizer.dat'.format(opt.dataset))
            embedding_matrix = build_embedding_matrix(
                word2idx=tokenizer.word2idx,
                embed_dim=opt.embed_dim,
                dat_fname='./cache/{0}_{1}_embedding_matrix.dat'.format(
                    str(opt.embed_dim), opt.dataset))
            self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

        self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
        self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
        assert 0 <= opt.valset_ratio < 1
        if opt.valset_ratio > 0:
            valset_len = int(len(self.trainset) * opt.valset_ratio)
            self.trainset, self.valset = random_split(
                self.trainset, (len(self.trainset) - valset_len, valset_len))
        else:
            self.valset = self.testset

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(device=opt.device.index)))
        self._print_args()
Example #22
def build_model(do_lower_case, num_labels):
    config = BertConfig.from_pretrained(MODEL_NAME,
                                        num_labels=num_labels,
                                        output_hidden_states=True)
    tokenizer = TOKENIZER_CLASS.from_pretrained(MODEL_NAME,
                                                do_lower_case=do_lower_case)
    model = MODEL_CLASS.from_pretrained(MODEL_NAME, config=config)

    model.to(get_device())

    return model, tokenizer
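MODEL_NAME, TOKENIZER_CLASS, and MODEL_CLASS are module-level constants defined elsewhere in the project; plausible definitions for a BERT sequence classifier, purely as an assumption:

from transformers import BertForSequenceClassification, BertTokenizer

MODEL_NAME = 'bert-base-uncased'  # hypothetical checkpoint
TOKENIZER_CLASS = BertTokenizer
MODEL_CLASS = BertForSequenceClassification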
Example #23
    def __init__(self, code_length):  # code_length is the dimension the fc layer maps to
        super(TextNet, self).__init__()

        model_name = 'bert-base-multilingual-cased'
        modelConfig = BertConfig.from_pretrained(model_name)
        self.textExtractor = BertModel.from_pretrained(
            model_name, config=modelConfig)
        embedding_dim = self.textExtractor.config.hidden_size  # embedding_dim is the output dimension at the model's truncation point
        self.fc = nn.Linear(embedding_dim, code_length)  # code_length is the feature dimension

        self.tanh = torch.nn.Tanh()
Example #24
 def __init__(self, model_name: str) -> None:
     super().__init__()
     config = BertConfig.from_pretrained(model_name)
     self.input_dim = config.hidden_size
     self.output_dim = config.vocab_size
     # TODO(mattg): It's possible that we could use some kind of cache like we have in
     # allennlp.modules.token_embedders.bert_token_embedder.PretrainedBertModel.  That way, we
     # would only load the BERT weights once.  Though, it's not clear how to do that here, as we
     # need to load `BertForMaskedLM`, not just `BertModel`...
     bert_model = BertForMaskedLM.from_pretrained(model_name)
     self.bert_lm_head = bert_model.cls  # pylint: disable=no-member
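The saved cls module is BERT's masked-LM head; a sketch of how such a wrapper is typically applied (the forward signature below is an assumption):

 def forward(self, hidden_states):
     # Hypothetical sketch: project encoder states (batch, seq, hidden_size)
     # to vocabulary logits (batch, seq, vocab_size) with the saved LM head.
     return self.bert_lm_head(hidden_states)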
Example #25
 def __init__(self, model_state_dict) -> None:
     no_cuda = True
     self.device = torch.device(
         "cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
     self.tokenizer = BertTokenizer.from_pretrained('bert-base-chinese',
                                                    do_lower_case=False)
     config = BertConfig.from_pretrained('bert-base-chinese')
     self.model = BertForQuestionAnswering(config)
     self.model.load_state_dict(
         torch.load(model_state_dict, map_location='cpu'))
     self.model.to(self.device)
     self.model.eval()  # TODO
Example #26
    def __init__(self, code_length):
        super(TextNet, self).__init__()

        modelConfig = BertConfig.from_pretrained(
            '/home/disk1/zhaoyuying/models/modeling_bert/bert-base-uncased-config.json'
        )
        self.textExtractor = BertModel.from_pretrained(
            '/home/disk1/zhaoyuying/models/modeling_bert/bert-base-uncased-pytorch_model.bin',
            config=modelConfig)
        embedding_dim = self.textExtractor.config.hidden_size

        self.fc = nn.Linear(embedding_dim, code_length)
        self.tanh = torch.nn.Tanh()
Example #27
    def load_pretrained_model(model_path: str, lower_case=True):
        """
        Imports pretrained BERT model from the official format as seen on:
        https://github.com/google-research/bert

        :param model_path: Path to the model checkpoint file
        :param lower_case: select False if loading cased model
        :return: pretrained model and its tokenizer
        """
        config = BertConfig.from_pretrained(model_path + "/bert_config.json")
        tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=lower_case)
        model = BertForQuestionAnswering.from_pretrained(model_path, from_tf=False, config=config)
        return model, tokenizer
Example #28
File: bertnlp.py Project: kohilin/ealm
def init(maxlen=512):
    global config, tokenizer, model, sim_model, MAX_LENGTH
    MAX_LENGTH = maxlen

    bert_model_name = 'bert-base-uncased'
    config = BertConfig.from_pretrained(bert_model_name)
    config.output_hidden_states = True
    tokenizer = BertTokenizer.from_pretrained(bert_model_name)
    model = BertForMaskedLM.from_pretrained(bert_model_name, config=config)
    model.to(DEVICE)
    model.eval()

    sim_model = smodel.WebBertSimilarity(device=DEVICE)
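With config.output_hidden_states = True, the masked-LM forward also returns the per-layer hidden states alongside the prediction scores. A small sketch of unpacking them after init() has run (the input string is a placeholder):

import torch

ids = torch.tensor([tokenizer.encode('the cat sat on the [MASK]')])
with torch.no_grad():
    outputs = model(ids)
prediction_scores = outputs[0]  # (1, seq_len, vocab_size)
hidden_states = outputs[-1]     # tuple: embedding layer + one tensor per block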
Example #29
def train(
    root=True,
    binary=False,
    bert="bert-large-uncased",
    epochs=30,
    batch_size=8,
    save=False,
):
    trainset = SSTDataset("train", root=root, binary=binary)
    devset = SSTDataset("dev", root=root, binary=binary)
    testset = SSTDataset("test", root=root, binary=binary)

    config = BertConfig.from_pretrained(bert)
    if not binary:
        config.num_labels = 5
    model = BertForSequenceClassification.from_pretrained(bert, config=config)

    model = model.to(device)
    lossfn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    for epoch in range(1, epochs + 1):
        train_loss, train_acc = train_one_epoch(model,
                                                lossfn,
                                                optimizer,
                                                trainset,
                                                batch_size=batch_size)
        val_loss, val_acc = evaluate_one_epoch(model,
                                               lossfn,
                                               optimizer,
                                               devset,
                                               batch_size=batch_size)
        test_loss, test_acc = evaluate_one_epoch(model,
                                                 lossfn,
                                                 optimizer,
                                                 testset,
                                                 batch_size=batch_size)
        logger.info(f"epoch={epoch}")
        logger.info(
            f"train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, test_loss={test_loss:.4f}"
        )
        logger.info(
            f"train_acc={train_acc:.3f}, val_acc={val_acc:.3f}, test_acc={test_acc:.3f}"
        )
        if save:
            label = "binary" if binary else "fine"
            nodes = "root" if root else "all"
            torch.save(model, f"{bert}__{nodes}__{label}__e{epoch}.pickle")

    logger.success("Done!")
Example #30
def load_artifacts(model_path, is_quantized=False):
    """ Loads pretrained model , tokenizer , config."""
    model_class = BertForSequenceClassification
    print("quantized_ouput/" if is_quantized else model_path)
    if not is_quantized:
        model = model_class.from_pretrained(model_path)
    else:
        model = torch.load("4bit_quantized_model.bin")
    tokenizer = BertTokenizer.from_pretrained(model_path)
    config = BertConfig.from_pretrained(
        "quantized_ouput/" if is_quantized else model_path)
    model.to("cpu")
    model.eval()
    return model, tokenizer, config