def test_elmo_as_array_produces_token_sequence(self):  # pylint: disable=invalid-name
    indexer = ELMoTokenCharactersIndexer()
    indices = [indexer.token_to_indices(Token(token), Vocabulary())
               for token in ['Second', '.']]
    padded_tokens = indexer.pad_token_sequence(indices,
                                               desired_num_tokens=3,
                                               padding_lengths={})
    expected_padded_tokens = [
            [259, 84, 102, 100, 112, 111, 101, 260] + [261] * 42,
            [259, 47, 260] + [261] * 47,
            [0] * 50
    ]
    assert padded_tokens == expected_padded_tokens
def test_elmo_as_array_produces_token_sequence(self):  # pylint: disable=invalid-name
    indexer = ELMoTokenCharactersIndexer()
    tokens = [Token('Second'), Token('.')]
    indices = indexer.tokens_to_indices(tokens, Vocabulary(), "test-elmo")["test-elmo"]
    padded_tokens = indexer.pad_token_sequence({'test-elmo': indices},
                                               desired_num_tokens={'test-elmo': 3},
                                               padding_lengths={})
    expected_padded_tokens = [
            [259, 84, 102, 100, 112, 111, 101, 260] + [261] * 42,
            [259, 47, 260] + [261] * 47,
            [0] * 50
    ]
    assert padded_tokens['test-elmo'] == expected_padded_tokens
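The magic numbers asserted in the two variants above follow directly from AllenNLP's ELMo character encoding: each UTF-8 byte of the token is shifted by +1, the result is wrapped in beginning-of-word and end-of-word markers, and 261 pads every token out to 50 character slots. The all-zero third row is sequence-level padding added by pad_token_sequence, not character padding. A minimal sketch (the helper name is ours, not AllenNLP's) that reproduces the expected values:

def elmo_char_ids(word, max_chars=50):
    ids = [259]                                    # beginning-of-word marker (258 + 1)
    ids += [b + 1 for b in word.encode('utf-8')]   # each UTF-8 byte, shifted by +1
    ids.append(260)                                # end-of-word marker (259 + 1)
    ids += [261] * (max_chars - len(ids))          # character padding (260 + 1)
    return ids

assert elmo_char_ids('Second') == [259, 84, 102, 100, 112, 111, 101, 260] + [261] * 42
assert elmo_char_ids('.') == [259, 47, 260] + [261] * 47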
def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy=False)
    self.tokenizer_space = WhitespaceTokenizer()
    self.tokenizer_spacy = SpacyTokenizer(language="en_core_web_md",
                                          pos_tags=True,
                                          split_on_spaces=True)
    # Note: the token_indexers argument is accepted but ignored; the indexers are hard-coded below.
    self.token_indexers = {
        'elmo_tokens': ELMoTokenCharactersIndexer(),
        'token_characters': TokenCharactersIndexer(namespace='character_vocab',
                                                   min_padding_length=6),
        'pos_tags': SingleIdTokenIndexer(namespace='pos_tag_vocab',
                                         feature_name='tag_'),
        'ner_tags': SingleIdTokenIndexer(namespace='ner_tag_vocab',
                                         feature_name='ent_type_')
    }
    self.slot_indexers = {
        'elmo_tokens': ELMoTokenCharactersIndexer(),
        'token_characters': TokenCharactersIndexer(namespace='character_vocab',
                                                   min_padding_length=6)
    }
def test_elmo_empty_token_list(self):
    # Basic test
    indexer = ELMoTokenCharactersIndexer()
    assert {"elmo_tokens": []} == indexer.get_empty_token_list()

    # Real world test
    indexer = {"elmo": indexer}
    tokens_1 = TextField([Token("Apple")], indexer)
    targets_1 = ListField([TextField([Token("Apple")], indexer)])
    tokens_2 = TextField([Token("Screen"), Token("device")], indexer)
    targets_2 = ListField([
        TextField([Token("Screen")], indexer),
        TextField([Token("Device")], indexer)
    ])
    instance_1 = Instance({"tokens": tokens_1, "targets": targets_1})
    instance_2 = Instance({"tokens": tokens_2, "targets": targets_2})
    a_batch = Batch([instance_1, instance_2])
    a_batch.index_instances(Vocabulary())
    batch_tensor = a_batch.as_tensor_dict()
    elmo_target_token_indices = batch_tensor["targets"]["elmo"]["elmo_tokens"]
    # The TextField that is empty should have been created using the
    # `get_empty_token_list` and then padded with zeros.
    empty_target = elmo_target_token_indices[0][1].numpy()
    np.testing.assert_array_equal(np.zeros((1, 50)), empty_target)
    non_empty_targets = [
        elmo_target_token_indices[0][0],
        elmo_target_token_indices[1][0],
        elmo_target_token_indices[1][1],
    ]
    for non_empty_target in non_empty_targets:
        with pytest.raises(AssertionError):
            np.testing.assert_array_equal(np.zeros((1, 50)), non_empty_target)
def test_elmo_indexer_with_additional_tokens(self):
    indexer = ELMoTokenCharactersIndexer(tokens_to_add={"<first>": 1})
    tokens = [Token("<first>")]
    indices = indexer.tokens_to_indices(tokens, Vocabulary())
    expected_indices = [[259, 2, 260] + [261] * 47]
    assert indices["tokens"] == expected_indices
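Worth noting about the test above: tokens_to_add bypasses the byte-level encoding entirely. The whole token is mapped to the single raw character id you supply (here 1, which surfaces as 2 after the +1 shift), so `<first>` encodes as [259, 2, 260] plus padding rather than as the UTF-8 bytes of the string '<first>'.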
def test_unicode_to_char_ids(self):
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token(chr(256) + "t")], Vocabulary())
    expected_indices = [259, 197, 129, 117, 260] + [261] * 45
    assert indices == {"elmo_tokens": [expected_indices]}
def test_eos_to_char_ids(self):
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token("</S>")], Vocabulary(), "test-eos")
    expected_indices = [259, 258, 260] + [261] * 47
    assert indices == {"test-eos": [expected_indices]}
def test_bos_to_char_ids(self):
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token("<S>")], Vocabulary())
    expected_indices = [259, 257, 260] + [261] * 47
    assert indices == {"elmo_tokens": [expected_indices]}
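The `<S>` and `</S>` tests above never touch the UTF-8 path: the character mapper reserves dedicated ids for the sentence-boundary tokens, which surface as 257 and 258 after the same +1 shift. For reference, a sketch of the reserved raw values (as in AllenNLP's ELMoCharacterMapper; the uppercase constant names here are descriptive, not the library's attribute names):

BEGINNING_OF_SENTENCE_CHARACTER = 256  # '<S>'  -> [259, 257, 260, 261, ...]
END_OF_SENTENCE_CHARACTER = 257        # '</S>' -> [259, 258, 260, 261, ...]
BEGINNING_OF_WORD_CHARACTER = 258      # 259 after the +1 shift
END_OF_WORD_CHARACTER = 259            # 260 after the +1 shift
PADDING_CHARACTER = 260                # 261 after the +1 shift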
def test_bos_to_char_ids(self):
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.token_to_indices(Token('<S>'), Vocabulary())
    expected_indices = [259, 257, 260] + [261] * 47
    assert indices == expected_indices
def test_unicode_to_char_ids(self):
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.token_to_indices(Token(chr(256) + 't'), Vocabulary())
    expected_indices = [259, 197, 129, 117, 260] + [261] * 45
    assert indices == expected_indices
def test_bos_to_char_ids(self):
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token('<S>')], Vocabulary(), "test-elmo")
    expected_indices = [259, 257, 260] + [261] * 47
    assert indices == {"test-elmo": [expected_indices]}
def test_elmo_indexer_with_additional_tokens(self):
    indexer = ELMoTokenCharactersIndexer(tokens_to_add={'<first>': 1})
    tokens = [Token('<first>')]
    indices = indexer.tokens_to_indices(tokens, Vocabulary(), "test-elmo")["test-elmo"]
    expected_indices = [[259, 2, 260] + [261] * 47]
    assert indices == expected_indices
def test_unicode_to_char_ids(self):
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token(chr(256) + 't')], Vocabulary(), "test-unicode")
    expected_indices = [259, 197, 129, 117, 260] + [261] * 45
    assert indices == {"test-unicode": [expected_indices]}
def test_unicode_to_char_ids(self):
    # Python 2 variant of the test above (unichr / u-prefixed literals).
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token(unichr(256) + u't')], Vocabulary(), u"test-unicode")
    expected_indices = [259, 197, 129, 117, 260] + [261] * 45
    assert indices == {u"test-unicode": [expected_indices]}
def initialize(self):
    print('Data reader initialization ...')
    self.cursor = fever_db.get_cursor()

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')
    }
    self.fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=cfg.lazy)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / 'vocab_cache' / 'nli_basic')

    # This is important
    ns = 'selection_labels'
    vocab.add_token_to_namespace('true', namespace=ns)
    vocab.add_token_to_namespace('false', namespace=ns)
    vocab.add_token_to_namespace('hidden', namespace=ns)
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace=ns)

    # Label value
    vocab.get_index_to_token_vocabulary(ns)

    self.vocab = vocab
    self.weight_dict = weight_dict
    self.initialized = True
def test_on_test_set(self):
    # model_weight_file = os.path.join(os.path.dirname(__file__), '..', "output", "201905290138", "weights_best.th")
    # vocab_dir_path = os.path.join(os.path.dirname(__file__), '..', "output", "201905290138", "vocabulary")
    model_weight_file = "C:\\Data\\rumourDNN_models\\output\\bostonbombings-201906241245\\weights_best.th"
    vocab_dir_path = "C:\\Data\\rumourDNN_models\\output\\bostonbombings-201906241245\\vocabulary"
    model, rumor_dnn_predictor = load_classifier_from_archive(vocab_dir_path=vocab_dir_path,
                                                              model_weight_file=model_weight_file)

    evaluation_data_path = os.path.join(os.path.dirname(__file__), '..', "data", "test", "charliehebdo.csv")

    elmo_token_indexer = ELMoTokenCharactersIndexer()
    rumor_train_set_reader = RumorTweetsDataReader(token_indexers={'elmo': elmo_token_indexer})
    test_instances = rumor_train_set_reader.read(evaluation_data_path)

    from training_util import evaluate

    data_iterator = BucketIterator(batch_size=200, sorting_keys=[("sentence", "num_tokens")])
    data_iterator.index_with(model.vocab)
    metrics = evaluate(model, test_instances, data_iterator, -1, "")

    timestamped_print("Finished evaluating.")
    timestamped_print("Metrics:")
    for key, metric in metrics.items():
        timestamped_print("%s: %s" % (key, metric))
def build_elmo_dataset_reader(lower=False) -> DatasetReader:
    tokenizer = WhitespaceTokenizer()
    token_indexers = {'bert_tokens': ELMoTokenCharactersIndexer()}
    return ClassificationDatasetReader(tokenizer=tokenizer,
                                       token_indexers=token_indexers,
                                       max_tokens=300,
                                       lower=lower)
def build_indexers(args):
    indexers = {}
    if args.input_module in ["scratch", "glove", "fastText"]:
        indexers["words"] = SingleIdTokenIndexer()
    elif args.input_module in ["elmo", "elmo-chars-only"]:
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}

    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", (
            f"CoVe model expects Moses tokenization (MosesTokenizer);"
            f" you are using args.tokenizer = {args.tokenizer}")

    if input_module_uses_transformers(args.input_module):
        assert not indexers, (
            "transformers modules like BERT/XLNet are not supported alongside other "
            "indexers due to tokenization.")
        assert args.tokenizer == args.input_module, (
            "transformers models use custom tokenization for each model, so tokenizer "
            "must match the specified model.")
        tokenizer_name = input_module_tokenizer_name(args.input_module)
        indexers[tokenizer_name] = SingleIdTokenIndexer(tokenizer_name)
    return indexers
def build_indexers(args):
    indexers = {}
    if not args.input_module.startswith("bert") and args.input_module not in ["elmo", "gpt"]:
        indexers["words"] = SingleIdTokenIndexer()
    if args.input_module == "elmo":
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}

    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", (
            f"CoVe model expects Moses tokenization (MosesTokenizer);"
            f" you are using args.tokenizer = {args.tokenizer}")

    if args.input_module == "gpt":
        assert not indexers, (
            "OpenAI transformer is not supported alongside other indexers due to tokenization.")
        assert args.tokenizer == "OpenAI.BPE", (
            "OpenAI transformer uses custom BPE tokenization. Set tokenizer=OpenAI.BPE.")
        indexers["openai_bpe_pretokenized"] = SingleIdTokenIndexer("openai_bpe")

    if args.input_module.startswith("bert"):
        assert not indexers, "BERT is not supported alongside other indexers due to tokenization."
        assert args.tokenizer == args.input_module, (
            "BERT models use custom WPM tokenization for "
            "each model, so tokenizer must match the "
            "specified BERT model.")
        indexers["bert_wpm_pretokenized"] = SingleIdTokenIndexer(args.input_module)
    return indexers
def __init__(self, split, mode, only_use_relevant_dets=True, add_image_as_a_box=True):
    self.only_use_relevant_dets = only_use_relevant_dets
    self.mode = mode
    self.split = split
    self.add_image_as_a_box = add_image_as_a_box

    with open(os.path.join(VCR_ANNOTS_DIR, '{}.jsonl'.format(split)), 'r') as f:
        self.items = [json.loads(s) for s in f]

    self.token_indexers = {'elmo': ELMoTokenCharactersIndexer()}
    self.vocab = Vocabulary()

    with open(os.path.join(os.path.dirname(VCR_ANNOTS_DIR), 'cocoontology.json'), 'r') as f:
        coco = json.load(f)
    self.coco_objects = ['__background__'] + [x['name'] for k, x in sorted(coco.items(), key=lambda x: int(x[0]))]
    self.coco_obj_to_ind = {o: i for i, o in enumerate(self.coco_objects)}
def __init__(self, split, mode, only_use_relevant_dets=True, add_image_as_a_box=True,
             embs_to_load='bert_da', conditioned_answer_choice=0):
    """
    :param split: train, val, or test
    :param mode: answer or rationale
    :param only_use_relevant_dets: True if we will only use the detections mentioned
        in the question and answer; False if we should use all detections.
    :param add_image_as_a_box: True to add the image in as an additional 'detection'.
        It'll go first in the list of objects.
    :param embs_to_load: Which precomputed embeddings to load.
    :param conditioned_answer_choice: If you're in test mode, the answer labels aren't
        provided, which could be a problem for the QA->R task. Pass in
        `conditioned_answer_choice=i` to always condition on the i-th answer.
    """
    self.split = split
    self.mode = mode
    self.only_use_relevant_dets = only_use_relevant_dets
    print("Only relevant dets" if only_use_relevant_dets else "Using all detections", flush=True)

    self.add_image_as_a_box = add_image_as_a_box
    self.conditioned_answer_choice = conditioned_answer_choice

    # Note: because the second argument is an absolute path, os.path.join discards VCR_ANNOTS_DIR here.
    with open(os.path.join(VCR_ANNOTS_DIR,
                           '/media/ailab/songyoungtak/vcr_new/new/add_keyword/{}.jsonl'.format(split)), 'r') as f:
        self.items = [json.loads(s) for s in f]

    if split not in ('test', 'train_scene_version', 'val_scene_version'):
        raise ValueError("split must be test, train_scene_version, or val_scene_version. "
                         "Supplied {}".format(split))
    if mode not in ('answer', 'rationale'):
        raise ValueError("mode must be answer or rationale")

    self.token_indexers = {'elmo': ELMoTokenCharactersIndexer()}
    self.vocab = Vocabulary()

    with open(os.path.join(os.path.dirname(VCR_ANNOTS_DIR), 'cocoontology.json'), 'r') as f:
        coco = json.load(f)
    self.coco_objects = ['__background__'] + [x['name'] for k, x in sorted(coco.items(), key=lambda x: int(x[0]))]
    self.coco_obj_to_ind = {o: i for i, o in enumerate(self.coco_objects)}

    self.embs_to_load = embs_to_load
    self.h5fn = os.path.join(VCR_ANNOTS_DIR, f'{self.embs_to_load}_{self.mode}_{self.split}.h5')
    print("Loading embeddings from {}".format(self.h5fn), flush=True)
def build_indexers(args):
    indexers = {}
    if not args.word_embs == "none":
        indexers["words"] = SingleIdTokenIndexer()
    if args.elmo:
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}

    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", (
            f"CoVe model expects Moses tokenization (MosesTokenizer);"
            f" you are using args.tokenizer = {args.tokenizer}")

    if args.openai_transformer:
        assert not indexers, ("OpenAI transformer is not supported alongside"
                              " other indexers due to tokenization!")
        assert args.tokenizer == "OpenAI.BPE", ("OpenAI transformer is not supported alongside"
                                                " other indexers due to tokenization!")
        indexers["openai_bpe_pretokenized"] = SingleIdTokenIndexer("openai_bpe")

    if args.bert_model_name:
        assert not indexers, ("BERT is not supported alongside"
                              " other indexers due to tokenization!")
        assert args.tokenizer == args.bert_model_name, (
            "BERT models use custom WPM tokenization for "
            "each model, so tokenizer must match the "
            "specified BERT model.")
        indexers["bert_wpm_pretokenized"] = SingleIdTokenIndexer(args.bert_model_name)
    return indexers
def from_params(cls, params: Params) -> "PnetOntoDatasetReader":
    # token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    token_indexers = {
        "tokens": SingleIdTokenIndexer(lowercase_tokens=True),
        "token_characters": TokenCharactersIndexer(),
        "elmo": ELMoTokenCharactersIndexer(),
    }
    valid_class = params.pop("valid_class")
    random_seed = params.pop("random_seed")
    drop_empty = params.pop("drop_empty")
    valid_part = params.pop("valid_part")
    tag_label = params.pop("tag_label", None)
    feature_labels = params.pop("feature_labels", ())
    lazy = params.pop("lazy", False)
    params.assert_empty(cls.__name__)
    return PnetOntoDatasetReader(token_indexers=token_indexers,
                                 valid_class=valid_class,
                                 random_seed=random_seed,
                                 drop_empty=drop_empty,
                                 valid_part=valid_part,
                                 tag_label=tag_label,
                                 feature_labels=feature_labels,
                                 lazy=lazy)
def spectrum_eval_manual_check():
    batch_size = 64
    lazy = True

    SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-17-12:10:35_mesim_elmo/i(34800)_epoch(5)_dev(0.5563056305630563)_loss(1.6648460462434564)_seed(12)"
    # IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_17_15:52:19_r/dev_sent.jsonl"
    IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_17_16:34:19_r/dev_sent.jsonl"
    # IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_17_16-34-19_r/dev_sent.jsonl"
    dev_sent_result_list = common.load_jsonl(IN_FILE)

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')
    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))
    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0
    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300)
    model.load_state_dict(torch.load(SAVE_PATH))
    model.display()
    model.to(device)

    for sc_prob in [0.5, 0.7, 0.8, 0.9, 0.95, 0.98]:
        upstream_dev_list = score_converter_scaled(config.T_FEVER_DEV_JSONL,
                                                   dev_sent_result_list,
                                                   scale_prob=sc_prob,
                                                   delete_prob=False)
        dev_fever_data_reader = BasicReader(token_indexers=token_indexers, lazy=lazy)
        complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, upstream_dev_list)
        dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)
        eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
        builded_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

        print("------------------------------------")
        print("Scaling_prob:", sc_prob)
        eval_mode = {'check_sent_id_correct': True, 'standard': True}
        print(c_scorer.fever_score(builded_dev_data, config.T_FEVER_DEV_JSONL, mode=eval_mode))
        # del upstream_dev_list
        # del complete_upstream_dev_data
        del dev_fever_data_reader
        del dev_instances
        print("------------------------------------")
def eval_fever():
    # save_path = "/home/easonnie/projects/MiscEnc/saved_models/06-07-21:58:06_esim_elmo/i(60900)_epoch(4)_um_dev(80.03458096013019)_m_dev(79.174732552216)_seed(12)"
    save_path = "/home/easonnie/projects/MiscEnc/saved_models/07-02-14:40:01_esim_elmo_linear_amr_cs_score_filtering_0.5/i(5900)_epoch(3)_um_dev(39.73759153783564)_m_dev(40.18339276617422)_seed(12)"
    # save_path = "/home/easonnie/projects/MiscEnc/saved_models/07-02-14:42:34_esim_elmo_cs_score_filtering_0.7/i(1300)_epoch(4)_um_dev(32.55695687550855)_m_dev(32.42995415180846)_seed(12)"
    batch_size = 32

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    csnli_dataset_reader = CNLIReader(token_indexers=token_indexers,
                                      example_filter=lambda x: float(x['cs_score']) >= 0.7)

    # mnli_train_data_path = config.DATA_ROOT / "mnli/multinli_1.0_train.jsonl"
    mnli_m_dev_data_path = config.DATA_ROOT / "amrs/mnli_amr_ln/mnli_mdev.jsonl.cs"
    mnli_um_dev_data_path = config.DATA_ROOT / "amrs/mnli_amr_ln/mnli_umdev.jsonl.cs"

    # mnli_train_instances = csnli_dataset_reader.read(mnli_train_data_path)
    mnli_m_dev_instances = csnli_dataset_reader.read(mnli_m_dev_data_path)
    mnli_um_dev_instances = csnli_dataset_reader.read(mnli_um_dev_data_path)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')
    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))
    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0
    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300)
    model.load_state_dict(torch.load(save_path))
    model.display()
    model.to(device)

    # Create Log File
    criterion = nn.CrossEntropyLoss()

    eval_iter = biterator(mnli_m_dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    m_dev_score, m_dev_loss = eval_model(model, eval_iter, criterion)

    eval_iter = biterator(mnli_um_dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    um_dev_score, um_dev_loss = eval_model(model, eval_iter, criterion)

    print(f"Dev(M):{m_dev_score}/{m_dev_loss}")
    print(f"Dev(UM):{um_dev_score}/{um_dev_loss}")
def __init__(self, split, only_use_relevant_dets=True, add_image_as_a_box=True,
             embs_to_load='bert_da', conditioned_answer_choice=0):
    """
    :param split: train, val, or test
    :param only_use_relevant_dets: True if we will only use the detections mentioned
        in the question and answer; False if we should use all detections.
    :param add_image_as_a_box: True to add the image in as an additional 'detection'.
        It'll go first in the list of objects.
    :param embs_to_load: Which precomputed embeddings to load.
    :param conditioned_answer_choice: If you're in test mode, the answer labels aren't
        provided, which could be a problem for the QA->R task. Pass in
        `conditioned_answer_choice=i` to always condition on the i-th answer.
        What does this mean? Why is such a thing needed at test time? Explanation:
        https://groups.google.com/forum/?hl=en#!topic/visualcommonsense/lxEgFYRz5ho
    """
    if split not in ('test', 'train', 'val'):
        raise ValueError("split must be in test, train, or val. Supplied {}".format(split))
    print("Loading {} embeddings".format(split), flush=True)

    self.split = split
    self.only_use_relevant_dets = only_use_relevant_dets
    print("Only relevant dets" if only_use_relevant_dets else "Using all detections", flush=True)

    self.add_image_as_a_box = add_image_as_a_box
    self.conditioned_answer_choice = conditioned_answer_choice

    with open(os.path.join(VCR_ANNOTS_DIR, split, '{}.jsonl'.format(split)), 'r') as f:
        self.items = np.array(list(f))

    self.token_indexers = {'elmo': ELMoTokenCharactersIndexer()}
    self.vocab = Vocabulary()

    with open(os.path.join(VCR_ANNOTS_DIR, 'dataloaders', 'cocoontology.json'), 'r') as f:
        coco = json.load(f)
    self.coco_objects = ['__background__'] + [x['name'] for k, x in sorted(coco.items(), key=lambda x: int(x[0]))]
    # '__background__' shows up here; think about how to make use of the background class later.
    self.coco_obj_to_ind = {o: i for i, o in enumerate(self.coco_objects)}

    self.embs_to_load = embs_to_load
    self.h5fn_answer = os.path.join(VCR_ANNOTS_DIR, self.split,
                                    f'{self.embs_to_load}_answer_{self.split}.h5')
    self.h5fn_rationale = os.path.join(VCR_ANNOTS_DIR, self.split,
                                       f'{self.embs_to_load}_rationale_{self.split}.h5')
    self.h5fn_image = os.path.join(VCR_ANNOTS_DIR, self.split,
                                   f'image_feature_{self.split}.h5')
def run_ELMo_RSA(stim_file, header=False, filter_file=None):
    EXP = data.Stim(stim_file, header, filter_file, VOCAB_FILE)

    # Get tokenizer
    tokenizer = WhitespaceTokenizer()

    # Load model
    ## ELMo OG
    elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'
    elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json'
    ## ELMo Small
    # elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    # elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    ## ELMo Medium
    # elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5'
    # elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json'
    ## ELMo OG (5.5B)
    # elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
    # elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'

    elmo_embedding = ElmoTokenEmbedder(options_file=elmo_options_file,
                                       weight_file=elmo_weight_file,
                                       dropout=0.0)
    embedder = BasicTextFieldEmbedder(token_embedders={'elmo_tokens': elmo_embedding})

    for x in range(len(EXP.SENTS)):
        sentences = list(EXP.SENTS[x])
        target = sentences[0]
        sentence = sentences[1]

        # GET BASELINE
        token_indexer = ELMoTokenCharactersIndexer()
        vocab = Vocabulary()

        target_tokens = tokenizer.tokenize(target)
        target_text_field = TextField(target_tokens, {'elmo_tokens': token_indexer})
        target_text_field.index(vocab)
        target_token_tensor = target_text_field.as_tensor(target_text_field.get_padding_lengths())
        target_tensor_dict = target_text_field.batch_tensors([target_token_tensor])

        target_embedding = embedder(target_tensor_dict)[0]
        baseline = target_embedding[-1].data.cpu().squeeze()

        # GET SIMS
        sims = get_ELMo_sims(sentence, baseline, tokenizer, embedder)
        values = get_dummy_values(sentence)

        EXP.load_IT('elmo', x, values, False, sims)

    return EXP
def main():
    elmo_token_indexer = ELMoTokenCharactersIndexer()

    reader = StanfordSentimentTreeBankDatasetReader(token_indexers={'tokens': elmo_token_indexer})

    train_dataset = reader.read('data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset, min_count={'tokens': 3})

    # Use the 'Small' pre-trained model
    options_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                    '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json')
    weight_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                   '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5')

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # These two sizes were not defined in the original snippet, so they are assumed here:
    # the 'Small' ELMo model produces 256-dimensional word vectors.
    elmo_embedding_dim = 256
    HIDDEN_DIM = 128

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)

    trainer.train()
def hidden_eval_fever():
    batch_size = 64
    lazy = True

    SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-18-21:07:28_m_esim_wn_elmo_sample_fixed/i(57000)_epoch(8)_dev(0.5755075507550755)_loss(1.7175163737963839)_seed(12)"

    dev_upstream_file = config.RESULT_PATH / "sent_retri/2018_07_05_17:17:50_r/dev.jsonl"

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    p_dict = wn_persistent_api.persistence_load()

    dev_fever_data_reader = WNReader(token_indexers=token_indexers, lazy=lazy, wn_p_dict=p_dict, max_l=360)

    complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, dev_upstream_file)
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    # dev_biterator = BasicIterator(batch_size=batch_size * 2)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size, 1024 + 300),
                  weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300)
    print("Model Max length:", model.max_l)
    model.load_state_dict(torch.load(SAVE_PATH))
    model.display()
    model.to(device)

    eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    builded_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    eval_mode = {'check_sent_id_correct': True, 'standard': True}

    for item in builded_dev_data:
        del item['label']

    print(c_scorer.fever_score(builded_dev_data, common.load_jsonl(config.T_FEVER_DEV_JSONL), mode=eval_mode))
def __init__(self) -> None:
    # We use a simple word tokenizer to split sentences into words.
    self._tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    # Initialize indexers.
    singleIdIndexer = SingleIdTokenIndexer()
    elmoIndexer = ELMoTokenCharactersIndexer()
    self.indexers = {}
    self.indexers["tokens"] = singleIdIndexer
    self.indexers["elmo_characters"] = elmoIndexer
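A hedged sketch of how a reader like the one above typically consumes its indexers (the method name, field name, and sample sentence are assumed for illustration, not taken from the original project): both indexers attach to the same TextField, so a downstream model receives aligned word ids and ELMo character ids for identical tokens.

def text_to_instance(self, sentence: str) -> Instance:
    # Hypothetical companion method: one tokenization pass feeds both indexers.
    tokens = self._tokenizer.tokenize(sentence)
    return Instance({"sentence": TextField(tokens, self.indexers)})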
def get_score_multihop(t_data_file, additional_file, model_path,
                       item_key='prioritized_docids_aside', top_k=6):
    batch_size = 64
    lazy = True

    SAVE_PATH = model_path
    print("Model From:", SAVE_PATH)

    additional_sentence_list = get_additional_list(t_data_file, additional_file,
                                                   item_key=item_key, top_k=top_k)

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy)

    print("Additional Dev size:", len(additional_sentence_list))
    dev_instances = dev_fever_data_reader.read(additional_sentence_list)

    # Load Vocabulary
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")

    # This is important
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')

    # Label value
    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    dev_biterator.index_with(vocab)

    # exit(0)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300, num_of_class=2)
    model.load_state_dict(torch.load(SAVE_PATH))
    model.display()
    model.to(device)

    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    additional_sentence_list = hidden_eval(model, eval_iter, additional_sentence_list)

    return additional_sentence_list
def main(vocab_path: str,
         elmo_config_path: str,
         elmo_weights_path: str,
         output_dir: str,
         batch_size: int,
         device: int,
         use_custom_oov_token: bool = False):
    """
    Creates ELMo word representations from a vocabulary file. These word
    representations are _independent_ - they are the result of running the CNN
    and Highway layers of the ELMo model, but not the Bidirectional LSTM.
    ELMo requires 2 additional tokens: <S> and </S>. The first token in this
    file is assumed to be an unknown token.

    This script produces two artifacts: a new vocabulary file with the <S> and
    </S> tokens inserted, and a glove-formatted embedding file containing
    word : vector pairs, one per line, with all values separated by a space.
    """
    # Load the vocabulary words and convert to char ids
    with open(vocab_path, 'r') as vocab_file:
        tokens = vocab_file.read().strip().split('\n')

    # Insert the sentence boundary tokens which elmo uses at positions 1 and 2.
    if tokens[0] != DEFAULT_OOV_TOKEN and not use_custom_oov_token:
        raise ConfigurationError("ELMo embeddings require the use of an OOV token.")
    tokens = [tokens[0]] + ["<S>", "</S>"] + tokens[1:]

    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token(token) for token in tokens], Vocabulary(), "indices")["indices"]
    # Group the vocabulary into pseudo-sentences of 50 words each.
    sentences = []
    for k in range((len(indices) // 50) + 1):
        sentences.append(indexer.pad_token_sequence(indices[(k * 50):((k + 1) * 50)],
                                                    desired_num_tokens=50,
                                                    padding_lengths={}))

    last_batch_remainder = 50 - (len(indices) % 50)
    if device != -1:
        elmo_token_embedder = _ElmoCharacterEncoder(elmo_config_path, elmo_weights_path).cuda(device)
    else:
        elmo_token_embedder = _ElmoCharacterEncoder(elmo_config_path, elmo_weights_path)

    all_embeddings = []
    for i in range((len(sentences) // batch_size) + 1):
        array = numpy.array(sentences[i * batch_size: (i + 1) * batch_size])
        if device != -1:
            batch = torch.from_numpy(array).cuda(device)
        else:
            batch = torch.from_numpy(array)

        token_embedding = elmo_token_embedder(batch)['token_embedding'].data

        # Reshape back to a list of words of shape (batch_size * 50, encoding_dim)
        # We also need to remove the <S>, </S> tokens appended by the encoder.
        per_word_embeddings = token_embedding[:, 1:-1, :].contiguous().view(-1, token_embedding.size(-1))

        all_embeddings.append(per_word_embeddings)

    # Remove the embeddings associated with padding in the last batch.
    all_embeddings[-1] = all_embeddings[-1][:-last_batch_remainder, :]

    embedding_weight = torch.cat(all_embeddings, 0).cpu().numpy()

    # Write out the embedding in a glove format.
    os.makedirs(output_dir, exist_ok=True)
    with gzip.open(os.path.join(output_dir, "elmo_embeddings.txt.gz"), 'wb') as embeddings_file:
        for i, word in enumerate(tokens):
            string_array = " ".join([str(x) for x in list(embedding_weight[i, :])])
            embeddings_file.write(f"{word} {string_array}\n".encode('utf-8'))

    # Write out the new vocab with the <S> and </S> tokens.
    _, vocab_file_name = os.path.split(vocab_path)
    with open(os.path.join(output_dir, vocab_file_name), "w") as new_vocab_file:
        for word in tokens:
            new_vocab_file.write(f"{word}\n")