import logging
from pathlib import Path

from farm.data_handler.processor import InferenceProcessor
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.tokenization import Tokenizer
from farm.utils import set_all_seeds, initialize_device_settings


def embedding_extraction():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    # Load from a local path:
    # lang_model = Path("../saved_models/glove-german-uncased")
    # or via S3:
    lang_model = "glove-german-uncased"  # only GloVe, word2vec or converted fastText (fixed vocab) embeddings are supported
    do_lower_case = True
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # Create an InferenceProcessor
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

    # Create an AdaptiveModel
    language_model = LanguageModel.load(lang_model)
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # Create an Inferencer for embedding extraction
    inferencer = Inferencer(model=model, processor=processor, task_type="embeddings")

    # Extract vectors
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]

    result = inferencer.extract_vectors(dicts=basic_texts, extraction_strategy="cls_token", extraction_layer=-1)
    print(result)
    inferencer.close_multiprocessing_pool()
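
# A minimal sketch of how two extracted sentence vectors could be compared once
# extract_vectors() has run. It assumes each returned entry exposes its embedding
# under a "vec" key; that key name is an assumption about the result dicts, not
# something shown in the example above.
def cosine_similarity_of_results(result):
    import numpy as np

    # Pull the two sentence embeddings out of the returned dicts (assumed "vec" key).
    vec_a = np.asarray(result[0]["vec"], dtype=float)
    vec_b = np.asarray(result[1]["vec"], dtype=float)
    # Cosine similarity: dot product normalised by the vector lengths.
    return float(np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)))
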
def embeddings_extraction():
    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    batch_size = 32
    use_gpu = True
    device, n_gpu = initialize_device_settings(use_cuda=use_gpu)
    lang_model = "bert-base-german-cased"

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    # 2. Create a lightweight Processor only for inference (no labels, minimal preprocessing)
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

    # 3. Create an AdaptiveModel with a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    adaptive_model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[],
        embeds_dropout_prob=0,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 4. Extract embeddings with the model in inference mode
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot ist"},
        {"text": "Martin Müller spielt Fussball"},
    ]

    model = Inferencer(adaptive_model, processor, task_type="embeddings", gpu=use_gpu, batch_size=batch_size)
    result = model.extract_vectors(dicts=basic_texts, extraction_strategy="reduce_mean", extraction_layer=-1)
    print(result)
    model.close_multiprocessing_pool()
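
# Entry point sketch: the snippets above do not show how the two example
# functions are invoked, so the guard below is an assumption about how this
# script would be run from the command line.
if __name__ == "__main__":
    embedding_extraction()    # GloVe-based sentence embedding extraction
    embeddings_extraction()   # BERT-based embedding extraction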