Example #1
    def post(self):
        
        args = parser.parse_args()
        # Create instance of model to be used for predictions
        model = (SentenceTransformer('bert-base-nli-mean-tokens')
                 if args['model_choice'] == 'base'
                 else SentenceTransformer(os.path.join(ROOT, config['model_directory'])))
        model.eval()

        # handle user input
        input_sentence = [args.input_sentence]
        input_sentence = data_clean.clean_text(text=input_sentence, starting_line=0,
                                               ending_line=len(input_sentence) + 1)
        input_embedding = model.encode(input_sentence)
        output_dict = {}
        output_dict['input_sentence'] = input_sentence[0]
        input_embedding = [round(embedding_num, 6) for embedding_num in list(input_embedding[0])]
        
        analects_key = 'base' if args['model_choice'] == 'base' else 'religio'
        with open(os.path.join(ROOT, config['analects_file'][analects_key]), 'rb') as pkl_file:
            analects_loaded = pickle.load(pkl_file)

        analects_sentences = list(analects_loaded.keys())
        analects_embeddings = list(analects_loaded.values())
        min_distance = float('inf')
        min_index = 0
        for index, corpus_embedding in enumerate(analects_embeddings):
            distance = scipy.spatial.distance.cdist([input_embedding], [corpus_embedding], "cosine")[0][0]
            if distance < min_distance:
                min_distance = distance
                min_index = index
        output_dict['closest_passage'] = analects_sentences[max(0, min_index-2):min(len(analects_sentences), min_index+3)]
        
        return output_dict
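The linear scan above works, but the same nearest-passage lookup can be done with one vectorized cdist call; a minimal sketch, assuming the input_embedding and analects_embeddings variables from the snippet:

import numpy as np
from scipy.spatial.distance import cdist

# Sketch only: compare the query against every corpus embedding in one call.
distances = cdist([input_embedding], analects_embeddings, "cosine")[0]
min_index = int(np.argmin(distances))
min_distance = float(distances[min_index])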
Example #2
def similarity(par1, par2):
    # Embed each paragraph sentence-by-sentence, then mean-pool into one vector each.
    transformer = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')
    transformer.eval()
    par1 = tokenize.sent_tokenize(par1)
    vec1 = torch.Tensor(transformer.encode(par1)).mean(0)
    par2 = tokenize.sent_tokenize(par2)
    vec2 = torch.Tensor(transformer.encode(par2)).mean(0)
    cos_sim = F.cosine_similarity(vec1, vec2, dim=0)
    return cos_sim.item()
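A minimal call of similarity, assuming the imports the snippet relies on (nltk's 'punkt' tokenizer data must be downloaded):

import torch
import torch.nn.functional as F
from nltk import tokenize  # requires: nltk.download('punkt')
from sentence_transformers import SentenceTransformer

score = similarity("The cat sat on the mat. It purred softly.",
                   "A cat rested on the rug and purred.")
print(round(score, 4))  # cosine similarity in [-1, 1]; closer to 1 means more similar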
Example #3
def sentence_transformers(
    path_to_senteval: str,
    pretrained_model_name_or_path: str,
    output_filepath: str = None,
    cuda_device: int = -1,
    prototyping_config: bool = False,
    verbose: bool = False,
) -> None:
    """Evaluates a pre-trained model from the Sentence Transformers library against the SentEval
    benchmark.
    """

    from sentence_transformers import SentenceTransformer

    # SentEval prepare and batcher
    def prepare(params, samples):
        return

    @torch.no_grad()
    def batcher(params, batch):
        batch = _cleanup_batch(batch)
        # Sentence Transformers API expects un-tokenized sentences.
        batch = [" ".join(tokens) for tokens in batch]
        embeddings = params.model.encode(batch,
                                         batch_size=len(batch),
                                         show_progress_bar=False)
        embeddings = np.vstack(embeddings)
        return embeddings

    # Determine the torch device
    device = _get_device(cuda_device)

    # Load the Sentence Transformers tokenizer
    model = SentenceTransformer(pretrained_model_name_or_path, device=device)
    model.eval()
    typer.secho(
        (f"{SUCCESS} Model '{pretrained_model_name_or_path}' from Sentence Transformers loaded."
         " successfully."),
        fg=typer.colors.GREEN,
        bold=True,
    )

    # Performs a few setup steps and returns the SentEval params
    params_senteval = _setup_senteval(path_to_senteval, prototyping_config,
                                      verbose)
    params_senteval["model"] = model
    _run_senteval(params_senteval, path_to_senteval, batcher, prepare,
                  output_filepath)

    return
Example #4
class SentenceBertEmbeddings:
    def __init__(self, bert_path):
        word_embedding_model = sent_models.Transformer(bert_path)
        pooling_model = sent_models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        self.model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        self.model.to(DEFAULT_DEVICE)
        self.model.eval()

    def text_vector(self, text):
        # Encode one text; returns an array of shape (1, embedding_dim).
        return np.stack(self.model.encode([text], show_progress_bar=True))
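A usage sketch; 'bert-base-uncased' below is a placeholder for whatever checkpoint bert_path should point to:

embedder = SentenceBertEmbeddings('bert-base-uncased')  # hypothetical checkpoint name
vec = embedder.text_vector('A single sentence to embed.')
print(vec.shape)  # (1, embedding_dim)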
Example #5
def Obtaining_Embeddings():
    SentenceBertEnc = SentenceTransformer('bert-base-nli-mean-tokens')
    SentenceBertEnc.eval()
    if not path.exists('./embeddings'):
        os.mkdir('./embeddings')
    Positive = pd.read_csv('./dataset/trainpos.csv')
    Negative = pd.read_csv('./dataset/trainneg.csv')
    TextPos = Positive.text[:200000].tolist()
    Embeddings_Pos = np.array(SentenceBertEnc.encode(TextPos))
    TextNeg = Negative.text[:200000].tolist()
    Embeddings_Neg = np.array(SentenceBertEnc.encode(TextNeg))
    np.save('./embeddings/embpos200k.npy', Embeddings_Pos)
    np.save('./embeddings/embneg200k.npy', Embeddings_Neg)
    del SentenceBertEnc
    del Embeddings_Pos
    del Embeddings_Neg
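The saved arrays can then be reloaded with numpy:

import numpy as np

Embeddings_Pos = np.load('./embeddings/embpos200k.npy')
Embeddings_Neg = np.load('./embeddings/embneg200k.npy')
print(Embeddings_Pos.shape, Embeddings_Neg.shape)  # (num_sentences, embedding_dim)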
Example #6
    def extract_sbert(self, input_json: str, output: str):
        from sentence_transformers import SentenceTransformer
        import torch
        from h5py import File
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = SentenceTransformer("bert-base-nli-mean-tokens")
        model = model.to(device)
        model.eval()

        df = pd.read_json(input_json)

        with torch.no_grad(), File(output, "w") as store, \
                tqdm(total=df.shape[0], ascii=True) as pbar:
            for idx, row in df.iterrows():
                caption = row["caption"]
                store[row["caption_key"]] = model.encode([caption]).squeeze(0)
                pbar.update()
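The resulting HDF5 store maps each caption_key to its embedding and can be read back like this:

import numpy as np
from h5py import File

# "embeddings.h5" stands in for whatever path was passed as `output` above,
# and "some_caption_key" is a placeholder for a key present in the input JSON.
with File("embeddings.h5", "r") as store:
    embedding = np.asarray(store["some_caption_key"])
    print(embedding.shape)  # (embedding_dim,)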
Example #7
class SBERTembedder(Embedder):
    def __init__(self):
        self.sbert = SentenceTransformer('paraphrase-distilroberta-base-v1')

    def embedding(self, texts: list):
        return self.sbert.encode([preproc(text) for text in texts])

    def transform(self, texts: list):
        return [self.embedding(x) for x in texts]

    def fit(self, texts):
        pass

    def save(self, output_path: str):
        torch.save(self.sbert.state_dict(), output_path)

    def load(self, input_path: str):
        # load_state_dict returns a NamedTuple of missing/unexpected keys,
        # not the model, so do not assign its result back to self.sbert.
        self.sbert.load_state_dict(torch.load(input_path))
        self.sbert.eval()
Example #8
class SentenceEmbedder(nn.Module):
    def __init__(self, version='bert-large-nli-stsb-mean-tokens'):
        super().__init__()
        np.set_printoptions(threshold=100)
        # Load Sentence model (based on BERT) from URL
        self.model = SentenceTransformer(version, device="cuda")
        self.model.eval()

    def forward(self, sentences):
        """sentences are expect to be a list of strings, e.g.
            sentences = ['This framework generates embeddings for each input sentence',
                         'Sentences are passed as a list of string.',
                         'The quick brown fox jumps over the lazy dog.'
                         ]
        """
        sentence_embeddings = self.model.encode(sentences,
                                                batch_size=len(sentences),
                                                show_progress_bar=False,
                                                convert_to_tensor=True)
        return sentence_embeddings.cuda()

    def encode(self, sentences):
        embeddings = self(sentences)
        return embeddings[:, :, None, None]
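A shape-level usage sketch (a CUDA device is required, since the model is created on 'cuda'):

embedder = SentenceEmbedder()
embeddings = embedder.encode(['This framework generates embeddings for each input sentence',
                              'The quick brown fox jumps over the lazy dog.'])
print(embeddings.shape)  # torch.Size([2, 1024, 1, 1]) for the default large model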
Example #9
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " +
        ", ".join(processors.keys()))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test",
                        action='store_true',
                        help='Whether to run test on the test set')
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--custom-features-json',
                        type=str,
                        default='',
                        help="JSON with precomputed features.")
    parser.add_argument("--bottleneck_size",
                        type=int,
                        default=0,
                        help="Size of the bottleneck layer in the classifier")
    parser.add_argument("--input_dropout",
                        type=float,
                        default=0.0,
                        help="Dropout on the the classifier input")
    parser.add_argument(
        "--do_norm",
        action='store_true',
        help=
        "Set this flag for mean/variance normalization before the classifier.")
    parser.add_argument(
        "--do_softmax",
        action='store_true',
        help="Set this flag for softmax before the classifier.")
    parser.add_argument(
        "--do_noise",
        action='store_true',
        help="Set this flag for noise addition before the classifier.")
    parser.add_argument("--do_round",
                        default=0,
                        type=int,
                        help="Apply rounding before the classifier.")

    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name](args.data_dir)
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    if args.model_type == 'sbert':
        config, tokenizer = None, None
        transformer = SentenceTransformer(args.model_name_or_path)
        model = FFClassifier(768, num_labels, args.bottleneck_size,
                             args.input_dropout)
    else:
        config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=args.task_name)
        tokenizer = tokenizer_class.from_pretrained(
            args.tokenizer_name
            if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case)
        transformer = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path),
            config=config)
        model = FFClassifier(config.hidden_size, num_labels)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    transformer.eval()
    transformer.to(args.device)
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)
    best_steps = 0

    # Training
    if args.do_train:
        train_dataset, _ = load_and_cache_examples(args,
                                                   args.task_name,
                                                   tokenizer,
                                                   processor,
                                                   transformer,
                                                   evaluate=False)
        global_step, tr_loss, best_steps = train(args, train_dataset, model,
                                                 tokenizer, processor,
                                                 transformer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        save_path = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model, save_path)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = torch.load(save_path)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        checkpoints = sorted(checkpoints,
                             key=lambda s: int(s.split('-')[-1])
                             if s.split('-')[-1].isdigit() else 0)
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            model = torch.load(os.path.join(checkpoint, WEIGHTS_NAME))
            model.to(args.device)
            result = evaluate(args,
                              model,
                              tokenizer,
                              processor,
                              transformer,
                              prefix=global_step)
            result = dict(
                (k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

    if args.do_test and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            model = torch.load(os.path.join(checkpoint, WEIGHTS_NAME))
            model.to(args.device)
            result = evaluate(args,
                              model,
                              tokenizer,
                              processor,
                              transformer,
                              prefix=global_step,
                              test=True)
            result = dict(
                (k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)
    if best_steps:
        logger.info("best steps of eval acc is the following checkpoints: %s",
                    best_steps)
    return results
        "Options: 'base' or 'religio'. Chooses which model to use for scoring and finding the relevant passage with.",
        type=str,
        choices=['base', 'religio'])
    parser.add_argument(
        "--input_sentence",
        help=
        "Sentence you wish to be matched with a relevant Analects passage.",
        type=str)
    args = parser.parse_args()

    # Load religioBERT or other model from Disk
    model = SentenceTransformer(
        'bert-base-nli-mean-tokens'
    ) if args.model_choice == 'base' else SentenceTransformer(
        os.path.join(ROOT, config['model_directory']))
    model.eval()
    input_sentence = [args.input_sentence]
    input_sentence = data_clean.clean_text(text=input_sentence, starting_line=0,
                                           ending_line=len(input_sentence) + 1)
    input_embedding = model.encode(input_sentence)
    output_dict = {}
    output_dict['input_sentence'] = input_sentence[0]
    input_embedding = [
        round(embedding_num, 6) for embedding_num in list(input_embedding[0])
    ]

    analects_key = 'base' if args.model_choice == 'base' else 'religio'
    with open(os.path.join(ROOT, config['analects_file'][analects_key]),
              'rb') as pkl_file:
Example #11
config = BertConfig.from_pretrained(model_version, output_hidden_states=False)
model = BertForMaskedLM.from_pretrained(model_version, config=config)
model.train()
cuda = torch.cuda.is_available()
if cuda:
    model = model.cuda()

tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=model_version.endswith("uncased"))
CLS = '[CLS]'
SEP = '[SEP]'
MASK = '[MASK]'
mask_id = tokenizer.convert_tokens_to_ids([MASK])[0]
sep_id = tokenizer.convert_tokens_to_ids([SEP])[0]
cls_id = tokenizer.convert_tokens_to_ids([CLS])[0]
model2 = SentenceTransformer('bert-base-nli-mean-tokens')
model2.eval()

def DecandEval(PathtoData, PathtoRef, mode):
    df = pd.read_csv(PathtoData)
    TextData = df.text
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    humanref = list(open(PathtoRef, "r"))
    EncoderNet_Neg = EncoderNet()
    if path.exists('./models/negencoder'):
        checkpoint = torch.load('./models/negencoder')
        EncoderNet_Neg.load_state_dict(checkpoint)
    EncoderNet_Neg.to(device)
    EncoderNet_Neg.eval()
    DecoderNet_Neg = DecoderNet()
    if path.exists('./models/negdecoder'):
Example #12
class EmbKnn:
    def __init__(self, path: str, args):

        self.args = args
        self.device = torch.device("cuda:0" if torch.cuda.is_available()
                                   and not self.args.no_cuda else "cpu")

        with DisableLogger():
            if path is not None and os.path.exists(path):
                self.model = SentenceTransformer(path)
            elif 'roberta' in self.args.bert_model:
                self.model = SentenceTransformer(
                    'roberta-base-nli-stsb-mean-tokens')
            else:
                self.model = SentenceTransformer('bert-base-nli-mean-tokens')

        self.model.to(self.device)
        self.cached_embeddings = None

    def save(self, dir_path):
        self.model.save(dir_path)

    def cache(self, example_sentences):
        self.model.eval()
        self.cached_embeddings = self.model.encode(example_sentences,
                                                   show_progress_bar=False)

    def encode(self, text):

        self.model.eval()
        query_embeddings = self.model.encode(text, show_progress_bar=False)
        return torch.FloatTensor(query_embeddings)

    def predict(self, text):

        assert self.cached_embeddings is not None

        self.model.eval()

        query_embeddings = self.model.encode(text, show_progress_bar=False)
        distances = scipy.spatial.distance.cdist(query_embeddings,
                                                 self.cached_embeddings,
                                                 "cosine")
        distances = 1.0 - distances

        return torch.FloatTensor(distances)

    def train(self, train_examples, dev_examples, dir_path=None):

        train_examples = SentencesDataset(train_examples, self.model)
        dev_examples = SentencesDataset(dev_examples, self.model)

        train_dataloader = DataLoader(train_examples,
                                      shuffle=True,
                                      batch_size=self.args.train_batch_size)
        dev_dataloader = DataLoader(dev_examples,
                                    shuffle=False,
                                    batch_size=self.args.eval_batch_size)

        train_loss = losses.CosineSimilarityLoss(model=self.model)
        evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

        warmup_steps = math.ceil(
            len(train_examples) * self.args.num_train_epochs /
            self.args.train_batch_size * self.args.warmup_proportion)

        self.model.zero_grad()
        self.model.train()
        self.model.fit(train_objectives=[(train_dataloader, train_loss)],
                       evaluator=evaluator,
                       epochs=self.args.num_train_epochs,
                       evaluation_steps=10000,
                       warmup_steps=warmup_steps,
                       output_path=None,
                       optimizer_params={
                           'lr': self.args.learning_rate,
                           'eps': 1e-6,
                           'correct_bias': False
                       })
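A usage sketch; the Namespace below only stubs the constructor fields (no_cuda, bert_model) that are read before any training:

from argparse import Namespace

# Hypothetical minimal args; real code would pass the full parsed argparse namespace.
knn = EmbKnn(path=None, args=Namespace(no_cuda=True, bert_model='bert-base-uncased'))
knn.cache(['How do I reset my password?', 'Where is my order?'])
scores = knn.predict(['I forgot my password'])
print(scores.shape)  # torch.Size([1, 2]); entries are 1 - cosine distance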
Example #13
means = np.asarray(means) / len(dataset)
stdevs = np.asarray(stdevs) / len(dataset)
bn = 128
train_loader = torch.utils.data.DataLoader(dataset, batch_size=bn, shuffle=False)
device = torch.device("cuda")
torch.cuda.set_device(3)
model_img = model.VGGNet().to(device)
model_img.eval()
embedding_img = np.zeros((len(dataset), 2048))
label_txt = np.zeros(len(dataset))
for batch_idx, (i,j,k) in enumerate(train_loader):
    i = i.to(device)
    output = model_img(i)
    embedding_img[batch_idx*bn : batch_idx*bn + output.shape[0]] = output.cpu().detach().numpy()
    label_txt[batch_idx*bn : batch_idx*bn + output.shape[0]] = j
    print(batch_idx*bn, batch_idx*bn + output.shape[0])
np.save("pascal_embedding_img.npy", embedding_img)
np.save( "pascal_label_txt.npy", label_txt)
device = torch.device("cuda")
torch.cuda.set_device(3)
model_txt = SentenceTransformer('bert-large-nli-stsb-mean-tokens').to(device)
model_txt.eval()
embedding_txt = np.zeros((len(dataset), 1024))
for batch_idx, (i, j, k) in enumerate(train_loader):
    output = model_txt.encode(k)
    embedding_txt[batch_idx*bn : batch_idx*bn + output.shape[0]] = output
    print(batch_idx*bn, batch_idx*bn + output.shape[0])
np.save("pascal_embedding_txt.npy", embedding_txt)
Example #14
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument('--filetype',
                        type=str,
                        default='mat',
                        choices=['mat', 'hdf5'],
                        help='Specify the file format for output. '
                        '"mat" is the matrix format in Kaldi')
    parser.add_argument('--compress',
                        type=bool,
                        default=False,
                        help='Save in compressed format')
    parser.add_argument(
        '--compression-method',
        type=int,
        default=2,
        help='Specify the method (if mat) or gzip level (if hdf5)')

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument('input_file', type=str, help='Input file')
    parser.add_argument('wspecifier', type=str, help='Write specifier')
    args = parser.parse_args()

    # Setup CUDA, GPU & distributed training
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)

    # Set seed
    set_seed(args)

    args.model_type = args.model_type.lower()

    if args.model_type == 'sbert':
        transformer = SentenceTransformer(args.model_name_or_path)
        examples = read_examples(args.input_file)
        embeddings = transformer.encode([e.text for e in examples])

        with file_writer_helper(
                args.wspecifier,
                filetype=args.filetype,
                compress=args.compress,
                compression_method=args.compression_method) as writer:
            for i in range(len(examples)):
                writer[examples[i].unique_id] = embeddings[i]
    else:
        config_class, model_class, tokenizer_class = MODEL_CLASSES[
            args.model_type]
        config = config_class.from_pretrained(args.model_name_or_path)
        tokenizer = tokenizer_class.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        transformer = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path),
            config=config)

        transformer.eval()
        transformer.to(args.device)

        with torch.no_grad():
            load_and_convert_examples(args, tokenizer, transformer)

    logger.info('Done converting {} to {}'.format(args.input_file,
                                                  args.wspecifier))
Example #15
class ClassifierModel(nn.Module):
    """
    A class that performs all neural network functionalities

    ...

    Attributes
    ----------
    image_feature_extractor : nn.Module object
        model that takes image as input and outputs 1000 dimension feature vector
    image_feature_size : int
        the size of image feature vector
    text_feature_extractor : nn.Module object
        model that takes sentence as input and outputs 768 dimension feature vector
    text_feature_size : int
        the size of text feature vector
    hidden_size : int
        number of neurons in the hidden layer(s)
    

    Methods
    -------
    forward(images, text)
        Performs a forward pass through the neural network
    
    """
    def __init__(self):
        """
        Performs initializations of all needed variables
        
        Parameters
        ----------
        None

        Returns
        ----------
        None

        """

        super(ClassifierModel, self).__init__()

        # Incorporating pre-trained models
        image_feature_extractor = models.resnet18(pretrained=True)
        self.image_feature_extractor = torch.nn.Sequential(
            *list(image_feature_extractor.children())[:-1])
        self.image_feature_extractor.eval()
        self.text_feature_extractor = SentenceTransformer('stsb-roberta-base')
        self.text_feature_extractor.eval()
        self.image_feature_size = 512
        self.text_feature_size = 768
        self.seperator_size = 0
        self.out_size = 1
        self.hidden_size = 1024

        # Fully connected layers for final prediction
        self.fc = nn.Sequential(
            nn.Linear(in_features=self.image_feature_size +
                      self.text_feature_size + self.seperator_size,
                      out_features=self.hidden_size,
                      bias=False), nn.ReLU(),
            nn.Linear(in_features=self.hidden_size,
                      out_features=self.hidden_size,
                      bias=False), nn.ReLU(),
            nn.Linear(in_features=self.hidden_size,
                      out_features=self.hidden_size,
                      bias=False), nn.ReLU(),
            nn.Linear(in_features=self.hidden_size,
                      out_features=self.hidden_size,
                      bias=False), nn.ReLU(),
            nn.Linear(in_features=self.hidden_size,
                      out_features=self.out_size,
                      bias=False), nn.Sigmoid())

    def forward(self, images, text):
        """Performs forwards pass through the neural network

        Both images and text are of a particular `batch size` during training, and of
        size 1 during predictions

        Parameters
        ----------
        images : tensor
            Represents a collection of images that must be fed to our model. It must be of 
            size `(batch_size,image_width,image_height,no_of_channels)`
        
        text : list
            Represents a list of sentences (representing image captions) that must be fed to our model. It must be of
            size `(batch_size,)`

        Returns
        ------
        output : tensor
            Returns the output of the model after `images` and `text` are used as input
        
        """

        # Extracting features for image and text
        with torch.no_grad():
            image_feature_vecs = self.image_feature_extractor.forward(
                images).flatten(start_dim=1)
            text_feature_vecs = from_numpy(
                self.text_feature_extractor.encode(text)).cuda()
        #seperator_vecs = torch.zeros(images.shape[0],self.seperator_size).cuda()

        # Concating features and obtaining final outputs
        fc_input = torch.cat((image_feature_vecs, text_feature_vecs), dim=-1)
        # fc_input = torch.cat((image_feature_vecs,seperator_vecs,text_feature_vecs),dim=-1)
        output = self.fc(fc_input)

        return output
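A shape-level sanity check (CUDA is required because the text features are moved with .cuda()):

import torch

model = ClassifierModel().cuda()
images = torch.randn(4, 3, 224, 224).cuda()  # dummy batch of 4 RGB images
captions = ['a dog on grass', 'a red car', 'two people walking', 'a bowl of fruit']
scores = model(images, captions)
print(scores.shape)  # torch.Size([4, 1]); sigmoid outputs in (0, 1)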
Example #16
class TransformersClassifierHandler(BaseHandler, ABC):
    """
    Transformers text classifier handler class. This handler takes a text (string)
    as input and returns the classification text based on the serialized transformers checkpoint.
    """
    def __init__(self):
        super(TransformersClassifierHandler, self).__init__()
        self.initialized = False

    def initialize(self, ctx):
        self.manifest = ctx.manifest

        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        self.device = torch.device("cuda:" + str(properties.get("gpu_id")) if torch.cuda.is_available() else "cpu")

        files = os.listdir(model_dir)
        if not os.path.exists(os.path.join(model_dir, '0_BERT')):
            os.mkdir(os.path.join(model_dir, '0_BERT'))
        if not os.path.exists(os.path.join(model_dir, '1_Pooling')):
            os.mkdir(os.path.join(model_dir, '1_Pooling'))
        for file in files:
            if file.startswith("bert_"):
                shutil.move(os.path.join(model_dir, file), os.path.join(model_dir, '0_BERT', file[5:]))
            elif file.startswith("pooling_"):
                shutil.move(os.path.join(model_dir, file), os.path.join(model_dir, '1_Pooling', file[8:]))

        self.model = SentenceTransformer(model_dir)

        self.model.to(self.device)
        self.model.eval()

        logger.debug('Transformer model from path {0} loaded successfully'.format(model_dir))

        # Read the mapping file, index to object name
        mapping_file_path = os.path.join(model_dir, "index_to_name.json")

        if os.path.isfile(mapping_file_path):
            with open(mapping_file_path) as f:
                self.mapping = json.load(f)
        else:
            self.mapping = None
            logger.warning('Missing the index_to_name.json file. Inference output will not include class name.')

        self.initialized = True

    def preprocess(self, data):
        """ Very basic preprocessing code - only tokenizes. 
            Extend with your own preprocessing steps as needed.
        """
        text = data[0].get("data")
        if text is None:
            text = data[0].get("body")
        sentences = text.decode('utf-8')
        logger.info("Received text: '%s'", sentences)

        inputs = sentences
        return inputs

    def inference(self, inputs):
        """
        Predict the class of a text using a trained transformer model.
        """
        # NOTE: SentenceTransformer.encode accepts raw (untokenized) text and
        # handles tokenization internally, so no manual tokenization is needed here.
        prediction = self.model.encode(inputs)[0].tolist()
        logger.info("Model predicted: '%s'", prediction)

        if self.mapping:
            prediction = self.mapping[str(prediction)]

        return [prediction]

    def postprocess(self, inference_output):
        # TODO: Add any needed post-processing of the model predictions here
        return inference_output
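Once the model archive is served with TorchServe, the handler can be exercised through the standard inference API; the model name 'sbert' below is a placeholder for whatever name the .mar archive was registered under:

import requests

# Hypothetical endpoint; TorchServe serves registered models at /predictions/<name>.
resp = requests.post('http://127.0.0.1:8080/predictions/sbert',
                     data='a sentence to encode'.encode('utf-8'))
print(resp.json())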