def post(self):
    """Return the corpus passage closest to the submitted sentence.

    Parses the request with the module-level ``parser``, embeds the cleaned
    input sentence with the selected SentenceTransformer model and compares
    it (cosine distance) against the pre-computed corpus embeddings stored in
    the pickle file configured for that model.

    Returns:
        dict: ``input_sentence`` is the cleaned input sentence;
        ``closest_passage`` is the best-matching corpus sentence together
        with up to two neighbouring sentences on each side, or an empty list
        when no match can be determined (empty corpus or only NaN distances).
    """
    args = parser.parse_args()
    print(args)

    use_base = args['model_choice'] == 'base'

    # NOTE(review): the model is re-created on every call; consider caching
    # it at module level if this handler is on a hot path.
    if use_base:
        model = SentenceTransformer('bert-base-nli-mean-tokens')
    else:
        model = SentenceTransformer(os.path.join(ROOT, config['model_directory']))
    model.eval()

    # Clean the user input the same way the corpus was cleaned.
    input_sentence = [args.input_sentence]
    input_sentence = data_clean.clean_text(text=input_sentence,
                                           starting_line=0,
                                           ending_line=len(input_sentence) + 1)
    input_embedding = model.encode(input_sentence)

    output_dict = {'input_sentence': input_sentence[0]}
    input_embedding = [round(embedding_num, 6)
                       for embedding_num in list(input_embedding[0])]

    analects_key = 'base' if use_base else 'religio'
    analects_path = os.path.join(ROOT, config['analects_file'][analects_key])
    # NOTE(review): pickle executes arbitrary code on load; this file must
    # only ever come from a trusted location.
    with open(analects_path, 'rb') as pkl_file:
        analects_loaded = pickle.load(pkl_file)

    analects_sentences = list(analects_loaded.keys())
    analects_embeddings = list(analects_loaded.values())

    if not analects_embeddings:
        # Nothing to compare against; previously this raised NameError.
        output_dict['closest_passage'] = []
        return output_dict

    # One cdist call for the whole corpus instead of one call per entry.
    distances = scipy.spatial.distance.cdist([input_embedding],
                                             analects_embeddings,
                                             "cosine")[0]

    # Strict '<' keeps the first of several equally close passages and
    # skips NaN distances, exactly like the original per-entry loop.
    min_index = None
    min_distance = float('inf')
    for index, distance in enumerate(distances):
        if distance < min_distance:
            min_distance = distance
            min_index = index

    if min_index is None:
        output_dict['closest_passage'] = []
        return output_dict

    # Slicing clamps the upper bound, so only the lower bound needs a guard.
    output_dict['closest_passage'] = analects_sentences[max(0, min_index - 2):min_index + 3]
    return output_dict
def similarity(par1, par2):
    """Return the cosine similarity between two paragraphs.

    Each paragraph is split into sentences, every sentence is encoded with
    the ``roberta-base-nli-stsb-mean-tokens`` SentenceTransformer model, and
    the sentence vectors are averaged into one paragraph vector. The result
    is the cosine similarity of the two paragraph vectors as a Python float.
    """
    encoder = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')
    encoder.eval()

    def paragraph_vector(paragraph):
        # Mean of the sentence embeddings along the sentence axis.
        sentences = tokenize.sent_tokenize(paragraph)
        sentence_vectors = torch.Tensor(encoder.encode(sentences))
        return sentence_vectors.mean(0)

    first = paragraph_vector(par1)
    second = paragraph_vector(par2)
    return F.cosine_similarity(first, second, dim=0).item()
def sentence_transformers(
    path_to_senteval: str,
    pretrained_model_name_or_path: str,
    output_filepath: str = None,
    cuda_device: int = -1,
    prototyping_config: bool = False,
    verbose: bool = False,
) -> None:
    """Evaluates a pre-trained model from the Sentence Transformers library against the SentEval
    benchmark.

    The model named by `pretrained_model_name_or_path` is loaded onto the device selected by
    `cuda_device`, attached to the SentEval parameters as `params["model"]`, and handed to
    `_run_senteval` together with the `prepare`/`batcher` callbacks defined below.
    """
    from sentence_transformers import SentenceTransformer

    # SentEval prepare and batcher
    def prepare(params, samples):
        # No per-task preparation is needed.
        return

    @torch.no_grad()
    def batcher(params, batch):
        batch = _cleanup_batch(batch)
        # Sentence Transformers API expects un-tokenized sentences.
        batch = [" ".join(tokens) for tokens in batch]
        embeddings = params.model.encode(batch, batch_size=len(batch), show_progress_bar=False)
        embeddings = np.vstack(embeddings)
        return embeddings

    # Determine the torch device
    device = _get_device(cuda_device)

    # Load the Sentence Transformers model
    model = SentenceTransformer(pretrained_model_name_or_path, device=device)
    model.eval()
    # The two literals used to concatenate to "...loaded. successfully."
    typer.secho(
        (f"{SUCCESS} Model '{pretrained_model_name_or_path}' from Sentence Transformers loaded"
         " successfully."),
        fg=typer.colors.GREEN,
        bold=True,
    )

    # Performs a few setup steps and returns the SentEval params
    params_senteval = _setup_senteval(path_to_senteval, prototyping_config, verbose)
    params_senteval["model"] = model
    _run_senteval(params_senteval, path_to_senteval, batcher, prepare, output_filepath)

    return
class SentenceBertEmbeddings:
    """Mean-pooled sentence encoder built on top of a transformer checkpoint."""

    def __init__(self, bert_path):
        """Build the encoder from the checkpoint at ``bert_path``.

        The transformer output is mean-pooled over tokens (CLS and max
        pooling are disabled); the resulting model is moved to
        ``DEFAULT_DEVICE`` and switched to evaluation mode.
        """
        transformer = sent_models.Transformer(bert_path)
        mean_pooling = sent_models.Pooling(
            transformer.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)

        encoder = SentenceTransformer(modules=[transformer, mean_pooling])
        encoder.to(DEFAULT_DEVICE)
        encoder.eval()
        self.model = encoder

    def text_vector(self, sentences):
        """Encode ``sentences`` as a single-item batch and return the stacked result."""
        # The argument is wrapped in a list, i.e. it is encoded as ONE input.
        batch = [sentences]
        encoded = self.model.encode(batch, show_progress_bar=True)
        return np.stack(encoded)
def Obtaining_Embeddings():
    """Encode the positive and negative training texts and save them as ``.npy`` files.

    Reads ``./dataset/trainpos.csv`` and ``./dataset/trainneg.csv``, encodes
    the first 200000 entries of each file's ``text`` column with the
    ``bert-base-nli-mean-tokens`` SentenceTransformer model, and writes the
    results to ``./embeddings/embpos200k.npy`` and
    ``./embeddings/embneg200k.npy``. The output directory is created when it
    does not exist yet.
    """
    SentenceBertEnc = SentenceTransformer('bert-base-nli-mean-tokens')
    SentenceBertEnc.eval()

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('./embeddings', exist_ok=True)

    Positive = pd.read_csv('./dataset/trainpos.csv')
    Negative = pd.read_csv('./dataset/trainneg.csv')

    # Each array is saved and released before the next one is computed to
    # keep peak memory down.
    TextPos = Positive.text[:200000]
    Embeddings_Pos = np.array(SentenceBertEnc.encode(TextPos))
    np.save('./embeddings/embpos200k.npy', Embeddings_Pos)
    del Embeddings_Pos

    # Bug fix: this used to slice ``Positive.text`` again, so the "negative"
    # file silently contained a second copy of the positive embeddings.
    TextNeg = Negative.text[:200000]
    Embeddings_Neg = np.array(SentenceBertEnc.encode(TextNeg))
    np.save('./embeddings/embneg200k.npy', Embeddings_Neg)
    del Embeddings_Neg

    del SentenceBertEnc
def extract_sbert(self, input_json: str, output: str):
    """Encode every caption in ``input_json`` and store the vectors in an HDF5 file.

    The JSON file is loaded into a DataFrame; for each row the ``caption``
    field is encoded with the ``bert-base-nli-mean-tokens`` model and the
    resulting vector is written to ``output`` under the row's
    ``caption_key``.
    """
    from sentence_transformers import SentenceTransformer
    import torch
    from h5py import File

    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = SentenceTransformer("bert-base-nli-mean-tokens")
    encoder = encoder.to(target)
    encoder.eval()

    frame = pd.read_json(input_json)
    num_rows = frame.shape[0]

    with torch.no_grad():
        with tqdm(total=num_rows, ascii=True) as pbar:
            with File(output, "w") as store:
                for _, record in frame.iterrows():
                    # encode() gets a one-item batch; drop the batch axis.
                    vector = encoder.encode([record["caption"]]).squeeze(0)
                    store[record["caption_key"]] = vector
                    pbar.update()
class SBERTembedder(Embedder):
    """Embedder backed by the ``paraphrase-distilroberta-base-v1`` SentenceTransformer."""

    def __init__(self):
        self.sbert = SentenceTransformer('paraphrase-distilroberta-base-v1')

    def embedding(self, texts: str):
        """Preprocess every element of ``texts`` and encode them as one batch.

        NOTE(review): the parameter is annotated ``str`` but is iterated
        element by element; confirm whether callers pass a single string or
        a collection of strings.
        """
        return self.sbert.encode([preproc(text) for text in texts])

    def transform(self, texts: list):
        """Return ``self.embedding(x)`` for every item ``x`` of ``texts``."""
        return [self.embedding(x) for x in texts]

    def fit(self, texts):
        """No-op: the pre-trained encoder is used as is."""
        pass

    def save(self, output_path: str):
        """Write the encoder's ``state_dict`` to ``output_path``."""
        torch.save(self.sbert.state_dict(), output_path)

    def load(self, input_path: str):
        """Load a ``state_dict`` saved by :meth:`save` into the encoder.

        ``load_state_dict`` updates the module in place and returns a report
        of missing/unexpected keys, not the module. The previous code
        assigned that report to ``self.sbert``, which discarded the encoder
        and made the following ``eval()`` call fail.
        """
        self.sbert.load_state_dict(torch.load(input_path))
        self.sbert.eval()
class SentenceEmbedder(nn.Module):
    """``nn.Module`` wrapper around a SentenceTransformer model."""

    def __init__(self, version='bert-large-nli-stsb-mean-tokens', device=None):
        """Load the sentence model.

        Args:
            version: name or path of the SentenceTransformer model to load.
            device: device to run on. Defaults to ``"cuda"`` when CUDA is
                available (the previous hard-coded behaviour) and to
                ``"cpu"`` otherwise, so the class no longer crashes on
                CPU-only hosts.
        """
        super().__init__()
        import torch  # local import: only ``nn`` is known to be in scope here

        np.set_printoptions(threshold=100)
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        # Load Sentence model (based on BERT) from URL
        self.model = SentenceTransformer(version, device=device)
        self.model.eval()

    def forward(self, sentences):
        """Encode ``sentences`` and return the embeddings on ``self.device``.

        sentences are expected to be a list of strings, e.g.
            sentences = ['This framework generates embeddings for each input sentence',
                         'Sentences are passed as a list of string.',
                         'The quick brown fox jumps over the lazy dog.'
                        ]
        The whole list is encoded as a single batch.
        """
        sentence_embeddings = self.model.encode(sentences,
                                                batch_size=len(sentences),
                                                show_progress_bar=False,
                                                convert_to_tensor=True)
        return sentence_embeddings.to(self.device)

    def encode(self, sentences):
        """Return the embeddings with two trailing singleton axes appended."""
        embeddings = self(sentences)
        return embeddings[:, :, None, None]
def main():
    """Train and/or evaluate a feed-forward classifier on top of a frozen encoder.

    Command-line arguments select the task, the encoder (a SentenceTransformer
    when ``--model_type sbert``, otherwise the class registered in
    ``MODEL_CLASSES``) and the training hyper-parameters. Depending on
    ``--do_train`` / ``--do_eval`` / ``--do_test`` the classifier is trained,
    saved to ``--output_dir``, and evaluated on the dev and/or test set.

    Returns:
        dict: evaluation metrics, each key suffixed with ``_<global_step>``
        (the suffix is empty when a single checkpoint is evaluated).

    Raises:
        ValueError: if the output directory is non-empty and would be
            overwritten without ``--overwrite_output_dir``, or if the task
            name is unknown.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " +
        ", ".join(processors.keys()))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test",
                        action='store_true',
                        help='Whether to run test on the test set')
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--custom-features-json',
                        type=str,
                        default='',
                        help="JSON with precomputed features.")
    parser.add_argument("--bottleneck_size",
                        type=int,
                        default=0,
                        help="Size of the bottleneck layer in the classifier")
    parser.add_argument("--input_dropout",
                        type=float,
                        default=0.0,
                        help="Dropout on the the classifier input")
    parser.add_argument(
        "--do_norm",
        action='store_true',
        help=
        "Set this flag for mean/variance normalization before the classifier.")
    parser.add_argument(
        "--do_softmax",
        action='store_true',
        help="Set this flag for softmax before the classifier.")
    parser.add_argument(
        "--do_noise",
        action='store_true',
        help="Set this flag for noise addition before the classifier.")
    parser.add_argument("--do_round",
                        default=0,
                        type=int,
                        help="Apply rounding before the classifier.")
    args = parser.parse_args()

    # Refuse to clobber an existing, non-empty output directory by accident.
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name](args.data_dir)
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    if args.model_type == 'sbert':
        config, tokenizer = None, None
        transformer = SentenceTransformer(args.model_name_or_path)
        # NOTE(review): 768 is hard-coded as the encoder output size; confirm
        # it matches the chosen sentence model.
        model = FFClassifier(768, num_labels, args.bottleneck_size,
                             args.input_dropout)
    else:
        config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=args.task_name)
        tokenizer = tokenizer_class.from_pretrained(
            args.tokenizer_name
            if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case)
        transformer = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path),
            config=config)
        model = FFClassifier(config.hidden_size, num_labels)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    # The encoder is only used in evaluation mode; the classifier is trained.
    transformer.eval()
    model.to(args.device)
    transformer.to(args.device)

    logger.info("Training/evaluation parameters %s", args)
    best_steps = 0

    # Training
    if args.do_train:
        train_dataset, _ = load_and_cache_examples(args,
                                                   args.task_name,
                                                   tokenizer,
                                                   processor,
                                                   transformer,
                                                   evaluate=False)
        global_step, tr_loss, best_steps = train(args, train_dataset, model,
                                                 tokenizer, processor,
                                                 transformer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Save the trained classifier together with the training arguments.
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        save_path = os.path.join(args.output_dir, WEIGHTS_NAME)
        # Bug fix: save the unwrapped module. ``model_to_save`` used to be
        # computed and then ignored in favour of ``model``.
        # NOTE(review): the whole module object is pickled, so reloading
        # relies on torch.load unpickling full objects; confirm this against
        # the installed PyTorch version's ``weights_only`` default.
        torch.save(model_to_save, save_path)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = torch.load(save_path)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
            # Order by the numeric step suffix; paths without one sort first.
            checkpoints = sorted(checkpoints,
                                 key=lambda s: int(s.split('-')[-1])
                                 if s.split('-')[-1].isdigit() else 0)
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            model = torch.load(os.path.join(checkpoint, WEIGHTS_NAME))
            model.to(args.device)
            result = evaluate(args,
                              model,
                              tokenizer,
                              processor,
                              transformer,
                              prefix=global_step)
            result = dict(
                (k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

    # Test-set evaluation of the checkpoint in the output directory.
    if args.do_test and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            model = torch.load(os.path.join(checkpoint, WEIGHTS_NAME))
            model.to(args.device)
            result = evaluate(args,
                              model,
                              tokenizer,
                              processor,
                              transformer,
                              prefix=global_step,
                              test=True)
            result = dict(
                (k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

    if best_steps:
        logger.info(
            "best steps of eval acc is the following checkpoints: %s",
            best_steps)
    return results
"Options: 'base' or 'religio'. Chooses which model to use for scoring and finding the relevant passage with.", type=str, choices=['base', 'religio']) parser.add_argument( "--input_sentence", help= "Sentence you wish to be matched with a relevant Analects passage.", type=str) args = parser.parse_args() # Load religioBERT or other model from Disk model = SentenceTransformer( 'bert-base-nli-mean-tokens' ) if args.model_choice == 'base' else SentenceTransformer( os.path.join(ROOT, config['model_directory'])) model.eval() input_sentence = [args.input_sentence] input_sentence = data_clean.clean_text(text=input_sentence, starting_line=0,\ ending_line=len(input_sentence)+1) input_embedding = model.encode(input_sentence) output_dict = {} output_dict['input_sentence'] = input_sentence[0] input_embedding = [ round(embedding_num, 6) for embedding_num in list(input_embedding[0]) ] with open( os.path.join( ROOT, config['analects_file']['base'] if args.model_choice == 'base' else config['analects_file']['religio']), 'rb') as pkl_file:
config = BertConfig.from_pretrained(model_version, output_hidden_states=False) model = BertForMaskedLM.from_pretrained(model_version, config=config) model.train() cuda = torch.cuda.is_available() if cuda: model = model.cuda() tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=model_version.endswith("uncased")) CLS = '[CLS]' SEP = '[SEP]' MASK = '[MASK]' mask_id = tokenizer.convert_tokens_to_ids([MASK])[0] sep_id = tokenizer.convert_tokens_to_ids([SEP])[0] cls_id = tokenizer.convert_tokens_to_ids([CLS])[0] model2 = SentenceTransformer('bert-base-nli-mean-tokens') model2.eval() def DecandEval(PathtoData, PathtoRef, mode): df = pd.read_csv(PathtoData) TextData = df.text device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") humanref = list(open(PathtoRef, "r")) EncoderNet_Neg = EncoderNet() if path.exists('./models/negencoder'): checkpoint = torch.load('./models/negencoder') EncoderNet_Neg.load_state_dict(checkpoint) EncoderNet_Neg.to(device) EncoderNet_Neg.eval() DecoderNet_Neg = DecoderNet() if path.exists('./models/negdecoder'):
class EmbKnn:
    """Nearest-neighbour scorer based on SentenceTransformer embeddings."""

    def __init__(self, path: str, args):
        """Load the sentence model and move it to the selected device.

        A model saved at ``path`` is preferred when that path exists;
        otherwise a pre-trained model is chosen depending on whether
        ``args.bert_model`` contains ``'roberta'``.
        """
        self.args = args
        use_gpu = torch.cuda.is_available() and not self.args.no_cuda
        self.device = torch.device("cuda:0" if use_gpu else "cpu")

        with DisableLogger():
            if path is not None and os.path.exists(path):
                model_source = path
            elif 'roberta' in self.args.bert_model:
                model_source = 'roberta-base-nli-stsb-mean-tokens'
            else:
                model_source = 'bert-base-nli-mean-tokens'
            self.model = SentenceTransformer(model_source)

        self.model.to(self.device)
        self.cached_embeddings = None

    def save(self, dir_path):
        """Save the sentence model to ``dir_path``."""
        self.model.save(dir_path)

    def cache(self, example_sentences):
        """Encode ``example_sentences`` and keep the embeddings for ``predict``."""
        self.model.eval()
        embeddings = self.model.encode(example_sentences,
                                       show_progress_bar=False)
        self.cached_embeddings = embeddings

    def encode(self, text):
        """Return the embeddings of ``text`` as a ``torch.FloatTensor``."""
        self.model.eval()
        embeddings = self.model.encode(text, show_progress_bar=False)
        return torch.FloatTensor(embeddings)

    def predict(self, text):
        """Return cosine similarities between ``text`` and the cached examples.

        Rows correspond to the queries, columns to the cached examples.
        """
        assert self.cached_embeddings is not None
        self.model.eval()
        queries = self.model.encode(text, show_progress_bar=False)
        cosine_distances = scipy.spatial.distance.cdist(
            queries, self.cached_embeddings, "cosine")
        # cosine similarity = 1 - cosine distance
        similarities = 1.0 - cosine_distances
        return torch.FloatTensor(similarities)

    def train(self, train_examples, dev_examples, dir_path=None):
        """Fine-tune the sentence model with a cosine-similarity loss.

        ``dir_path`` is accepted but not used: ``fit`` is called with
        ``output_path=None``.
        """
        train_set = SentencesDataset(train_examples, self.model)
        dev_set = SentencesDataset(dev_examples, self.model)

        train_dataloader = DataLoader(train_set,
                                      shuffle=True,
                                      batch_size=self.args.train_batch_size)
        dev_dataloader = DataLoader(dev_set,
                                    shuffle=False,
                                    batch_size=self.args.eval_batch_size)

        train_loss = losses.CosineSimilarityLoss(model=self.model)
        evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

        # Warm up over a fixed proportion of the total number of steps.
        warmup_steps = math.ceil(
            len(train_set) * self.args.num_train_epochs /
            self.args.train_batch_size * self.args.warmup_proportion)

        self.model.zero_grad()
        self.model.train()
        optimizer_params = {
            'lr': self.args.learning_rate,
            'eps': 1e-6,
            'correct_bias': False
        }
        self.model.fit(train_objectives=[(train_dataloader, train_loss)],
                       evaluator=evaluator,
                       epochs=self.args.num_train_epochs,
                       evaluation_steps=10000,
                       warmup_steps=warmup_steps,
                       output_path=None,
                       optimizer_params=optimizer_params)
# Turn the accumulated per-sample sums into dataset-wide averages.
means = np.asarray(means) / len(dataset)
stdevs = np.asarray(stdevs) / len(dataset)

bn = 128  # batch size; also used to compute each batch's row offsets below
train_loader = torch.utils.data.DataLoader(dataset, batch_size=bn, shuffle=False)

# Select GPU 3 as the current CUDA device before any module is moved to it.
device = torch.device("cuda")
torch.cuda.set_device(3)

# ---- Image embeddings -------------------------------------------------
model_img = model.VGGNet().to(device)
model_img.eval()
embedding_img = np.zeros((len(dataset), 2048))
label_txt = np.zeros(len(dataset))
# Inference only: no_grad avoids building an autograd graph for every batch.
with torch.no_grad():
    for batch_idx, (i, j, k) in enumerate(train_loader):
        i = i.to(device)
        output = model_img(i)
        start = batch_idx * bn
        stop = start + output.shape[0]  # the last batch may be smaller than bn
        embedding_img[start:stop] = output.cpu().detach().numpy()
        label_txt[start:stop] = j
        print(start, stop)
np.save("pascal_embedding_img.npy", embedding_img)
np.save("pascal_label_txt.npy", label_txt)

# ---- Text embeddings --------------------------------------------------
# The device selected above is reused (it used to be set up a second time).
model_txt = SentenceTransformer('bert-large-nli-stsb-mean-tokens').to(device)
model_txt.eval()
embedding_txt = np.zeros((len(dataset), 1024))
for batch_idx, (i, j, k) in enumerate(train_loader):
    output = model_txt.encode(k)
    print(k)
    start = batch_idx * bn
    stop = start + output.shape[0]
    embedding_txt[start:stop] = output
    print(start, stop)
np.save("pascal_embedding_txt.npy", embedding_txt)
def _str2bool(value):
    """Parse a command-line boolean: false-like words give False, anything else True."""
    if isinstance(value, bool):
        return value
    # ``type=bool`` treats every non-empty string (including "False") as
    # True. Only the listed false-like spellings change meaning here; every
    # other value keeps its previous result.
    return value.strip().lower() not in ('', '0', 'false', 'f', 'no', 'n')


def main():
    """Export sentence/token features for ``input_file`` to ``wspecifier``.

    With ``--model_type sbert`` the examples are encoded by a
    SentenceTransformer and written one entry per example id. For any other
    model type the classes registered in ``MODEL_CLASSES`` are loaded and
    the conversion is delegated to ``load_and_convert_examples``.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument('--filetype',
                        type=str,
                        default='mat',
                        choices=['mat', 'hdf5'],
                        help='Specify the file format for output. '
                        '"mat" is the matrix format in kaldi')
    # Bug fix: was ``type=bool``, which parsed "--compress False" as True.
    parser.add_argument('--compress',
                        type=_str2bool,
                        default=False,
                        help='Save in compressed format')
    parser.add_argument(
        '--compression-method',
        type=int,
        default=2,
        help='Specify the method(if mat) or gzip-level(if hdf5)')

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('input_file', type=str, help='Input file')
    parser.add_argument('wspecifier', type=str, help='Write specifier')
    args = parser.parse_args()

    # Setup CUDA, GPU & distributed training
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)

    # Set seed
    set_seed(args)

    args.model_type = args.model_type.lower()
    if args.model_type == 'sbert':
        # NOTE(review): this branch neither honours --no_cuda nor moves the
        # model to ``args.device`` explicitly; confirm that is intended.
        transformer = SentenceTransformer(args.model_name_or_path)
        examples = read_examples(args.input_file)
        embeddings = transformer.encode([e.text for e in examples])
        with file_writer_helper(
                args.wspecifier,
                filetype=args.filetype,
                compress=args.compress,
                compression_method=args.compression_method) as writer:
            # Embeddings come back in the same order as the examples.
            for example, embedding in zip(examples, embeddings):
                writer[example.unique_id] = embedding
    else:
        config_class, model_class, tokenizer_class = MODEL_CLASSES[
            args.model_type]
        config = config_class.from_pretrained(args.model_name_or_path)
        tokenizer = tokenizer_class.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        transformer = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path),
            config=config)
        transformer.eval()
        transformer.to(args.device)
        with torch.no_grad():
            load_and_convert_examples(args, tokenizer, transformer)

    logger.info('Done converting {} to {}'.format(args.input_file,
                                                  args.wspecifier))
class ClassifierModel(nn.Module):
    """
    A class that performs all neural network functionalities

    ...

    Attributes
    ----------
    image_feature_extractor : nn.Module object
        ResNet-18 without its final layer; takes images as input and its
        flattened output is used as the image feature vector
    image_feature_size : int
        the size of image feature vector (512)
    text_feature_extractor : nn.Module object
        model that takes sentence as input and outputs 768 dimension feature vector
    text_feature_size : int
        the size of text feature vector (768)
    seperator_size : int
        size of an optional separator block between the two feature vectors
        (0, i.e. unused)
    hidden_size : int
        number of neurons in the hidden layer(s)
    out_size : int
        number of output units (1)

    Methods
    -------
    forward(images, text)
        Performs forwards pass through the neural network
    """

    def __init__(self):
        """
        Performs initializations of all needed variables

        Parameters
        ----------
        None

        Returns
        ----------
        None
        """
        super(ClassifierModel, self).__init__()

        # Incorporating pre-trained models
        image_feature_extractor = models.resnet18(pretrained=True)
        # Drop the last child (the classification layer) to expose features.
        self.image_feature_extractor = torch.nn.Sequential(
            *list(image_feature_extractor.children())[:-1])
        # NOTE(review): a later ``model.train()`` on this module switches
        # both extractors back to training mode; confirm callers re-apply
        # ``eval()`` if the extractors are meant to stay frozen.
        self.image_feature_extractor.eval()
        self.text_feature_extractor = SentenceTransformer('stsb-roberta-base')
        self.text_feature_extractor.eval()

        self.image_feature_size = 512
        self.text_feature_size = 768
        self.seperator_size = 0
        self.out_size = 1
        self.hidden_size = 1024

        # Fully connected layers for final prediction
        self.fc = nn.Sequential(
            nn.Linear(in_features=self.image_feature_size +
                      self.text_feature_size + self.seperator_size,
                      out_features=self.hidden_size,
                      bias=False), nn.ReLU(),
            nn.Linear(in_features=self.hidden_size,
                      out_features=self.hidden_size,
                      bias=False), nn.ReLU(),
            nn.Linear(in_features=self.hidden_size,
                      out_features=self.hidden_size,
                      bias=False), nn.ReLU(),
            nn.Linear(in_features=self.hidden_size,
                      out_features=self.hidden_size,
                      bias=False), nn.ReLU(),
            nn.Linear(in_features=self.hidden_size,
                      out_features=self.out_size,
                      bias=False), nn.Sigmoid())

    def forward(self, images, text):
        """Performs forwards pass through the neural network

        Both images and text is of a particular `batch size` during training,
        and of size 1 during predictions

        Parameters
        ----------
        images : tensor
            Represents a collection of images that must be fed to our model.
            NOTE(review): this was documented as
            `(batch_size,image_width,image_height,no_of_channels)`;
            torchvision ResNets normally take channels-first input - confirm
            against the data loader.
        text : list
            Represents a list of sentences(representing image captions) that
            must be fed to our model. It must be of size `(batch_size,)`

        Returns
        ------
        output : tensor
            Returns the output of the model after `images` and `text` are
            used as input
        """
        # Extracting features for image and text
        with torch.no_grad():
            image_feature_vecs = self.image_feature_extractor.forward(
                images).flatten(start_dim=1)
            # Put the text features on the same device as the image features
            # (previously forced to CUDA, which broke CPU inference).
            text_feature_vecs = from_numpy(
                self.text_feature_extractor.encode(text)).to(
                    image_feature_vecs.device)

        # Concating features and obtaining final outputs
        fc_input = torch.cat((image_feature_vecs, text_feature_vecs), dim=-1)
        output = self.fc(fc_input)

        return output
class TransformersClassifierHandler(BaseHandler, ABC):
    """
    Sentence-transformer handler class. This handler takes a text (string)
    as input, encodes it with the SentenceTransformer model found in the
    model directory and returns the result, optionally translated through
    ``index_to_name.json``.
    """

    def __init__(self):
        super(TransformersClassifierHandler, self).__init__()
        self.initialized = False
        # Always defined so ``inference`` can test it even when no mapping
        # file is shipped with the model.
        self.mapping = None

    def initialize(self, ctx):
        """Lay out the model files, load the model and the optional mapping file."""
        self.manifest = ctx.manifest

        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        gpu_id = properties.get("gpu_id")
        # Fall back to CPU when no GPU id was supplied instead of building
        # the invalid device string "cuda:None".
        use_gpu = torch.cuda.is_available() and gpu_id is not None
        self.device = torch.device("cuda:" + str(gpu_id) if use_gpu else "cpu")

        # Snapshot the flat file list before the sub-directories are created.
        files = os.listdir(model_dir)
        os.makedirs(os.path.join(model_dir, '0_BERT'), exist_ok=True)
        os.makedirs(os.path.join(model_dir, '1_Pooling'), exist_ok=True)

        # Files arrive flat with a "bert_" / "pooling_" prefix; move each
        # into its sub-directory with the prefix stripped.
        for file in files:
            if file.startswith("bert_"):
                shutil.move(os.path.join(model_dir, file),
                            os.path.join(model_dir, '0_BERT', file[5:]))
            elif file.startswith("pooling_"):
                shutil.move(os.path.join(model_dir, file),
                            os.path.join(model_dir, '1_Pooling', file[8:]))

        self.model = SentenceTransformer(model_dir)
        self.model.to(self.device)
        self.model.eval()

        logger.debug('Transformer model from path {0} loaded successfully'.format(model_dir))

        # Read the mapping file, index to object name
        mapping_file_path = os.path.join(model_dir, "index_to_name.json")

        if os.path.isfile(mapping_file_path):
            with open(mapping_file_path) as f:
                self.mapping = json.load(f)
        else:
            self.mapping = None
            logger.warning('Missing the index_to_name.json file. Inference output will not include class name.')

        self.initialized = True

    def preprocess(self, data):
        """Extract the request text from the first item of ``data``.

        The text is read from the ``data`` field, falling back to ``body``;
        byte payloads are decoded as UTF-8, strings are passed through.
        """
        text = data[0].get("data")
        if text is None:
            text = data[0].get("body")
        if isinstance(text, (bytes, bytearray)):
            sentences = text.decode('utf-8')
        else:
            sentences = text
        logger.info("Received text: '%s'", sentences)

        inputs = sentences
        return inputs

    def inference(self, inputs):
        """
        Encode the input text with the sentence-transformer model and return
        the first element of the result in a one-item list.
        """
        # NOTE(review): ``inputs`` is the raw string from ``preprocess``;
        # what ``encode(...)[0]`` yields for a bare string depends on the
        # installed sentence-transformers version - confirm.
        prediction = self.model.encode(inputs)[0].tolist()
        logger.info("Model predicted: '%s'", prediction)

        if self.mapping:
            prediction = self.mapping[str(prediction)]

        return [prediction]

    def postprocess(self, inference_output):
        # TODO: Add any needed post-processing of the model predictions here
        return inference_output