def __init__(self,
             lazy: bool = True,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             max_tokens: int = 768 * 4,
             listfile: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/in-hospital-mortality/train/listfile.csv",
             notes_dir: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/extracted_notes",
             skip_patients_file: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/extracted_notes/null_patients.txt",
             stats_write_dir: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/extracted_notes/",
             all_stays: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/root/all_stays.csv"):
    super().__init__(lazy)
    self.tokenizer = tokenizer or WhitespaceTokenizer()
    self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self.max_tokens = max_tokens
    self.listfile = listfile
    self.notes_dir = notes_dir
    logger.critical(f"we are getting the max tokens {self.max_tokens}")
    self.null_patients = []
    with open(skip_patients_file, "r") as file:
        for line in file:
            self.null_patients.append(line.strip())
    self.stats_write_dir = stats_write_dir
    self.all_stays_path = all_stays
    self.all_stays_df = self.get_all_stays()

def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer] = None,
    tokenizer: Tokenizer = None,
    sent_max_len: int = 100,
    max_sent_per_example: int = 20,
    use_sep: bool = True,
    sci_sum: bool = False,
    use_abstract_scores: bool = True,
    sci_sum_fake_scores: bool = True,
    predict: bool = False,
) -> None:
    super().__init__(manual_distributed_sharding=True,
                     manual_multiprocess_sharding=True)
    self._tokenizer = tokenizer or WhitespaceTokenizer()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self.sent_max_len = sent_max_len
    self.use_sep = use_sep
    self.predict = predict
    self.sci_sum = sci_sum
    self.max_sent_per_example = max_sent_per_example
    self.use_abstract_scores = use_abstract_scores
    self.sci_sum_fake_scores = sci_sum_fake_scores

def __init__(self,
             lazy: bool = True,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             max_tokens: int = 768 * 4,
             train_listfile: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/in-hospital-mortality/train/listfile.csv",
             test_listfile: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/in-hospital-mortality/test/listfile.csv",
             notes_dir: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/extracted_notes",
             skip_patients_file: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/extracted_notes/null_patients.txt",
             stats_write_dir: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/extracted_notes/",
             all_stays: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/root/all_stays.csv",
             limit_examples: int = None,
             use_preprocessing: bool = False,
             num_classes: int = 2,
             mode: str = 'train',
             data_type: str = "MORTALITY",
             args=None,
             hadm2eps_path: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/extracted_notes/hadm2episode.dict"):
    super().__init__(lazy)
    self.tokenizer = tokenizer or WhitespaceTokenizer()
    self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self.max_tokens = max_tokens
    self.train_listfile = train_listfile
    self.test_listfile = test_listfile
    self.notes_dir = notes_dir
    self.use_preprocessing = use_preprocessing
    logger.critical(f"we are getting the max tokens {self.max_tokens} "
                    f"and use_preproc is {self.use_preprocessing}")
    self.null_patients = []
    with open(skip_patients_file, "r") as file:
        for line in file:
            self.null_patients.append(line.strip())
    self.stats_write_dir = stats_write_dir
    self.all_stays_path = all_stays
    self.all_stays_df = self.get_all_stays()
    self.limit_examples = limit_examples
    self.cur_examples = 0
    self.lengths = []
    self.num_classes = num_classes
    self.mode = mode
    self.sampled_idx = {}
    self.data_type = data_type
    self.args = args
    self.get_idx()
    # Realistically, only the train_idx will be set, and we simply need to
    # compare against self.null_patients.
    self.vocab = None
    self.hadm2eps_path = hadm2eps_path
    self.listfile_df = pd.read_csv(train_listfile)
    if self.data_type == "PHENOTYPING" or self.data_type == "DECOMPENSATION":
        self.labels_start_idx = 2
    elif self.data_type == "MORTALITY":
        self.labels_start_idx = 1
    self.labels = list(self.listfile_df.columns[self.labels_start_idx:])

def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy=False)
    self.tokenizer_space = WhitespaceTokenizer()
    self.tokenizer_spacy = SpacyTokenizer(language="en_core_web_md",
                                          pos_tags=True,
                                          split_on_spaces=True)
    self.token_indexers = {
        'elmo_tokens': ELMoTokenCharactersIndexer(),
        'token_characters': TokenCharactersIndexer(namespace='character_vocab',
                                                   min_padding_length=6),
        'pos_tags': SingleIdTokenIndexer(namespace='pos_tag_vocab',
                                         feature_name='tag_'),
        'ner_tags': SingleIdTokenIndexer(namespace='ner_tag_vocab',
                                         feature_name='ent_type_')
    }
    self.slot_indexers = {
        'elmo_tokens': ELMoTokenCharactersIndexer(),
        'token_characters': TokenCharactersIndexer(namespace='character_vocab',
                                                   min_padding_length=6)
    }

def build_domain_dataset_reader(lower=False) -> DatasetReader:
    tokenizer = WhitespaceTokenizer()
    token_indexers = {'tokens': SingleIdTokenIndexer()}
    return DomainDatasetReader(tokenizer=tokenizer,
                               token_indexers=token_indexers,
                               max_tokens=None,
                               lower=lower)

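# Hedged usage sketch (not from the original repo): a builder like the one above is
# typically called once and the resulting reader is pointed at a data file. The path
# below is hypothetical; the expected file format is whatever DomainDatasetReader._read
# actually parses.
domain_reader = build_domain_dataset_reader(lower=True)
domain_instances = list(domain_reader.read("data/domain_train.txt"))  # hypothetical path
print(f"read {len(domain_instances)} instances")
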
def __init__(self,
             source_tokenizer: Tokenizer = None,
             target_tokenizer: Tokenizer = None,
             source_token_indexer: Dict[str, TokenIndexer] = None,
             target_token_indexer: Dict[str, TokenIndexer] = None,
             max_tokens: int = None,
             target_add_start_token: bool = True,
             target_add_end_token: bool = True,
             end_symbol: str = END_SYMBOL,
             start_symbol: str = START_SYMBOL,
             **kwargs) -> None:
    super().__init__(**kwargs)
    self._source_tokenizer = source_tokenizer or CharacterTokenizer(
        start_tokens=[START_SYMBOL],
        end_tokens=[END_SYMBOL],
        lowercase_characters=True)
    self._target_tokenizer = target_tokenizer or WhitespaceTokenizer()
    self._source_token_indexer = source_token_indexer or {'tokens': SingleIdTokenIndexer()}
    self._target_token_indexer = target_token_indexer or {'tokens': SingleIdTokenIndexer()}
    self._max_tokens = max_tokens
    self._start_symbol = start_symbol
    self._end_symbol = end_symbol
    self._target_add_start_token = target_add_start_token
    self._target_add_end_token = target_add_end_token

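# Minimal illustration (an assumption, not part of the reader above) of the asymmetric
# defaults it picks: the source side falls back to a character-level tokenizer with
# start/end symbols and lowercasing, while the target side falls back to plain
# whitespace splitting. Only AllenNLP calls already used in these snippets appear here.
from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data.tokenizers import CharacterTokenizer, WhitespaceTokenizer

char_tokenizer = CharacterTokenizer(start_tokens=[START_SYMBOL],
                                    end_tokens=[END_SYMBOL],
                                    lowercase_characters=True)
word_tokenizer = WhitespaceTokenizer()
print([t.text for t in char_tokenizer.tokenize("Go now")])  # e.g. ['@start@', 'g', 'o', ' ', 'n', 'o', 'w', '@end@']
print([t.text for t in word_tokenizer.tokenize("Go now")])  # ['Go', 'now']
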
def build_elmo_dataset_reader(lower=False) -> DatasetReader:
    tokenizer = WhitespaceTokenizer()
    token_indexers = {'bert_tokens': ELMoTokenCharactersIndexer()}
    return ClassificationDatasetReader(tokenizer=tokenizer,
                                       token_indexers=token_indexers,
                                       max_tokens=300,
                                       lower=lower)

def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             tokenizer: Tokenizer = None):
    super().__init__()
    self._tokenizer = tokenizer or WhitespaceTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

def run_ELMo_RSA(stim_file, header=False, filter_file=None):
    EXP = data.Stim(stim_file, header, filter_file, VOCAB_FILE)

    # Get tokenizer
    tokenizer = WhitespaceTokenizer()

    # Load model
    # ELMo OG
    elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'
    elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json'
    # ELMo Small
    # elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    # elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    # ELMo Medium
    # elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5'
    # elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json'
    # ELMo OG (5.5B)
    # elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
    # elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'

    elmo_embedding = ElmoTokenEmbedder(options_file=elmo_options_file,
                                       weight_file=elmo_weight_file,
                                       dropout=0.0)
    embedder = BasicTextFieldEmbedder(token_embedders={'elmo_tokens': elmo_embedding})

    for x in range(len(EXP.SENTS)):
        sentences = list(EXP.SENTS[x])
        target = sentences[0]
        sentence = sentences[1]

        # GET BASELINE
        token_indexer = ELMoTokenCharactersIndexer()
        vocab = Vocabulary()

        target_tokens = tokenizer.tokenize(target)
        target_text_field = TextField(target_tokens, {'elmo_tokens': token_indexer})
        target_text_field.index(vocab)
        target_token_tensor = target_text_field.as_tensor(target_text_field.get_padding_lengths())
        target_tensor_dict = target_text_field.batch_tensors([target_token_tensor])

        target_embedding = embedder(target_tensor_dict)[0]
        baseline = target_embedding[-1].data.cpu().squeeze()

        # GET SIMS
        sims = get_ELMo_sims(sentence, baseline, tokenizer, embedder)
        values = get_dummy_values(sentence)

        EXP.load_IT('elmo', x, values, False, sims)

    return EXP

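# Hypothetical helper sketch (assumption): get_ELMo_sims is defined elsewhere in the
# original project and is not reproduced here. A plausible version mirrors the baseline
# computation above, embedding each whitespace token of the sentence with ELMo and
# scoring it against the baseline vector with cosine similarity. Illustration only,
# not the repo's actual implementation.
import torch

def elmo_token_sims_sketch(sentence, baseline, tokenizer, embedder):
    token_indexer = ELMoTokenCharactersIndexer()
    vocab = Vocabulary()
    tokens = tokenizer.tokenize(sentence)
    text_field = TextField(tokens, {'elmo_tokens': token_indexer})
    text_field.index(vocab)
    token_tensor = text_field.as_tensor(text_field.get_padding_lengths())
    tensor_dict = text_field.batch_tensors([token_tensor])
    embeddings = embedder(tensor_dict)[0]  # (num_tokens, embedding_dim)
    return [torch.nn.functional.cosine_similarity(embedding.data.cpu().squeeze(), baseline, dim=0).item()
            for embedding in embeddings]
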
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             max_tokens: int = None,
             **kwargs):
    super().__init__(**kwargs)
    self.tokenizer = tokenizer or WhitespaceTokenizer()
    self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self.max_tokens = max_tokens

def __init__(self,
             max_length: int = None,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             fill_in_empty_labels: bool = False,
             clean_text: bool = False) -> None:
    super().__init__()
    self._max_sequence_length = max_length
    self.fill_in_empty_labels = fill_in_empty_labels
    self._tokenizer = tokenizer or WhitespaceTokenizer()
    self._token_indexer = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self._clean_text = clean_text

def build_dataset_reader(transformer_model=None, lower=False) -> DatasetReader:
    if transformer_model is not None:
        tokenizer = PretrainedTransformerTokenizer(transformer_model)
    else:
        tokenizer = WhitespaceTokenizer()
    token_indexers = {'bert_tokens': SingleIdTokenIndexer()}
    return ClassificationDatasetReader(tokenizer=tokenizer,
                                       token_indexers=token_indexers,
                                       max_tokens=None,
                                       lower=lower)

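# Hedged usage sketch (assumption): calling the builder with and without a transformer
# model name. 'bert-base-uncased' is only an example identifier; note that this builder
# keeps a SingleIdTokenIndexer under the 'bert_tokens' key in both cases.
whitespace_reader = build_dataset_reader()                   # falls back to WhitespaceTokenizer
subword_reader = build_dataset_reader("bert-base-uncased")   # PretrainedTransformerTokenizer
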
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             max_sequence_length: int = None) -> None:
    super().__init__()
    self._tokenizer = tokenizer or WhitespaceTokenizer()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self._max_sequence_length = max_sequence_length

def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             max_tokens: int = None):
    super().__init__(lazy)
    self.tokenizer = tokenizer or WhitespaceTokenizer()
    self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self.max_tokens = max_tokens

def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             **kwargs):
    """
    :param tokenizer: used to split text into a list of tokens
    :param token_indexers: defines how to map tokens to integer ids
    """
    super().__init__(**kwargs)
    self._tokenizer = tokenizer or WhitespaceTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             **kwargs):
    super().__init__(manual_distributed_sharding=True,
                     manual_multiprocess_sharding=True,
                     **kwargs)
    self.tokenizer = tokenizer or WhitespaceTokenizer()
    self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

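# Minimal end-to-end sketch (assumption) of what the tokenizer/indexer defaults shared by
# the readers above actually do: whitespace-split the text, wrap the tokens in a TextField
# keyed by the indexer name, build a Vocabulary from the instances, and index tokens to ids.
from allennlp.data import Instance, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
token_indexers = {"tokens": SingleIdTokenIndexer()}
tokens = tokenizer.tokenize("the quick brown fox")
instance = Instance({"text": TextField(tokens, token_indexers)})
vocab = Vocabulary.from_instances([instance])
instance.index_fields(vocab)
print(instance.as_tensor_dict())  # nested dict: {'text': {'tokens': {'tokens': tensor([...])}}}
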
def prepare_data():
    reader = TextClassificationJsonReader(
        token_indexers={"tokens": SingleIdTokenIndexer()},
        tokenizer=WhitespaceTokenizer(),
    )
    train_dataset = reader.read("https://s3-us-west-2.amazonaws.com/allennlp/datasets/imdb/train.jsonl")  # NOQA
    valid_dataset = reader.read("https://s3-us-west-2.amazonaws.com/allennlp/datasets/imdb/dev.jsonl")  # NOQA
    vocab = Vocabulary.from_instances(train_dataset)
    train_dataset.index_with(vocab)
    valid_dataset.index_with(vocab)
    return train_dataset, valid_dataset, vocab

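# Hedged follow-on sketch (assumption): a quick way to tensorize a few instances returned
# by prepare_data(), reusing the same Batch API the vision tests below use. Real training
# would go through an AllenNLP data loader and trainer instead.
from allennlp.data import Batch

train_dataset, valid_dataset, vocab = prepare_data()
smoke_batch = Batch(list(train_dataset)[:8])
smoke_batch.index_instances(vocab)
smoke_tensors = smoke_batch.as_tensor_dict()
print(smoke_tensors.keys())  # field names produced by TextClassificationJsonReader
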
def test_train_read(self):
    self.reader = Flickr30kReader(
        image_dir=FIXTURES_ROOT / "vision" / "images" / "flickr30k",
        image_loader=TorchImageLoader(),
        image_featurizer=Lazy(NullGridEmbedder),
        data_dir=FIXTURES_ROOT / "vision" / "flickr30k" / "sentences",
        region_detector=Lazy(RandomRegionDetector),
        tokenizer=WhitespaceTokenizer(),
        token_indexers={"tokens": SingleIdTokenIndexer()},
        featurize_captions=False,
        num_potential_hard_negatives=4,
    )
    instances = list(self.reader.read("test_fixtures/vision/flickr30k/test.txt"))
    assert len(instances) == 25

    instance = instances[5]
    assert len(instance.fields) == 5
    assert len(instance["caption"]) == 4
    assert len(instance["caption"][0]) == 12  # 16
    assert instance["caption"][0] != instance["caption"][1]
    assert instance["caption"][0] == instance["caption"][2]
    assert instance["caption"][0] == instance["caption"][3]
    question_tokens = [t.text for t in instance["caption"][0]]
    assert question_tokens == [
        "girl",
        "with",
        "brown",
        "hair",
        "sits",
        "on",
        "edge",
        "of",
        "concrete",
        "area",
        "overlooking",
        "water",
    ]

    batch = Batch(instances)
    batch.index_instances(Vocabulary())
    tensors = batch.as_tensor_dict()

    # (batch size, num images (3 hard negatives + gold image), num boxes (fake), num features (fake))
    assert tensors["box_features"].size() == (25, 4, 2, 10)

    # (batch size, num images (3 hard negatives + gold image), num boxes (fake), 4 coords)
    assert tensors["box_coordinates"].size() == (25, 4, 2, 4)

    # (batch size, num images (3 hard negatives + gold image), num boxes (fake),)
    assert tensors["box_mask"].size() == (25, 4, 2)

    # (batch size,)
    assert tensors["label"].size() == (25,)

def test_read_without_images(self):
    from allennlp_models.vision.dataset_readers.vqav2 import VQAv2Reader

    reader = VQAv2Reader(
        tokenizer=WhitespaceTokenizer(),
        token_indexers={"tokens": SingleIdTokenIndexer()},
    )
    instances = list(reader.read("unittest"))
    assert len(instances) == 3
    assert "box_coordinates" not in instances[0]
    assert "box_features" not in instances[0]
    assert "box_mask" not in instances[0]

def setup_method(self):
    from allennlp_models.vision.dataset_readers.gqa import GQAReader

    super().setup_method()
    self.reader = GQAReader(
        image_dir=FIXTURES_ROOT / "vision" / "images" / "gqa",
        image_loader=TorchImageLoader(),
        image_featurizer=Lazy(NullGridEmbedder),
        region_detector=Lazy(RandomRegionDetector),
        tokenizer=WhitespaceTokenizer(),
        token_indexers={"tokens": SingleIdTokenIndexer()},
    )

def __init__(
    self,
    discretizer_path: str,
    max_sequence_length: int = None,
    lazy: bool = False,
) -> None:
    super().__init__(lazy=lazy)
    self.discretizer = load_discretizer(discretizer_path)
    self._max_sequence_length = max_sequence_length or math.inf
    self._tokenizer = WhitespaceTokenizer()
    self._start_token = Token(START_TOKEN)
    self._end_token = Token(END_TOKEN)

def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer] = None,
    seg_threshold: bool = True,
    tokenizer: Tokenizer = None,
    read_entities: bool = False,
    **kwargs,
) -> None:
    super().__init__(**kwargs)
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self.seg_threshold = seg_threshold
    self.tokenizer = tokenizer
    self.whitespace_tokenizer = WhitespaceTokenizer()
    self.read_entities = read_entities

def __init__(self,
             source_tokenizer: Tokenizer = None,
             target_tokenizer: Tokenizer = None,
             source_token_indexers: Dict[str, TokenIndexer] = None,
             target_token_indexers: Dict[str, TokenIndexer] = None,
             **kwargs) -> None:
    super().__init__(**kwargs)
    self._source_tokenizer = source_tokenizer or WhitespaceTokenizer()
    self._target_tokenizer = target_tokenizer or self._source_tokenizer
    self._source_token_indexer = source_token_indexers or {"tokens": SingleIdTokenIndexer()}
    self._target_token_indexers = target_token_indexers or self._source_token_indexer

def __init__(self,
             source_tokenizer: Tokenizer = None,
             target_tokenizer: Tokenizer = None,
             source_token_indexers: Dict[str, TokenIndexer] = None,
             target_token_indexers: Dict[str, TokenIndexer] = None,
             **kwargs) -> None:
    super().__init__(manual_distributed_sharding=True,
                     manual_multiprocess_sharding=True,
                     **kwargs)
    self._source_tokenizer = source_tokenizer or WhitespaceTokenizer()
    self._target_tokenizer = target_tokenizer or self._source_tokenizer
    self.source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()}
    self._target_token_indexers = target_token_indexers or self.source_token_indexers

def __init__(self,
             dataset_name: str = 'cnn_dailymail',
             tokenizer: Tokenizer = None,
             source_token_indexers: Dict[str, TokenIndexer] = None,
             target_token_indexers: Dict[str, TokenIndexer] = None,
             source_max_tokens: int = 400,
             target_max_tokens: int = 100,
             seperate_namespaces: bool = False,
             target_namespace: str = "target_tokens",
             save_copy_fields: bool = False,
             save_pgn_fields: bool = False,
             lowercase: bool = True,
             max_instances=None):
    super().__init__()
    self._lowercase = lowercase
    self._max_instances = max_instances
    self._dataset_name = dataset_name
    self._source_max_tokens = source_max_tokens
    self._target_max_tokens = target_max_tokens
    self._tokenizer = tokenizer or WhitespaceTokenizer()

    tokens_indexer = {'tokens': SingleIdTokenIndexer(lowercase_tokens=lowercase)}
    self._source_token_indexers = source_token_indexers or tokens_indexer
    self._target_token_indexers = target_token_indexers or tokens_indexer

    self._save_copy_fields = save_copy_fields
    self._save_pgn_fields = save_pgn_fields
    self._target_namespace = 'tokens'

    self.train = datasets.load_dataset(dataset_name, '3.0.0', split='train')
    self.val = datasets.load_dataset(dataset_name, '3.0.0', split='validation[:5%]')

    if seperate_namespaces:
        self._target_namespace = target_namespace
        second_tokens_indexer = {'tokens': SingleIdTokenIndexer(namespace=target_namespace)}
        self._target_token_indexers = target_token_indexers or second_tokens_indexer

def test_read(self):
    from allennlp_models.vision.dataset_readers.vqav2 import VQAv2Reader

    reader = VQAv2Reader(
        image_dir=FIXTURES_ROOT / "vision" / "images" / "vqav2",
        image_loader=TorchImageLoader(),
        image_featurizer=Lazy(NullGridEmbedder),
        region_detector=Lazy(RandomRegionDetector),
        tokenizer=WhitespaceTokenizer(),
        token_indexers={"tokens": SingleIdTokenIndexer()},
    )
    instances = list(reader.read("unittest"))
    assert len(instances) == 3

    instance = instances[0]
    assert len(instance.fields) == 6
    assert len(instance["question"]) == 7
    question_tokens = [t.text for t in instance["question"]]
    assert question_tokens == ["What", "is", "this", "photo", "taken", "looking", "through?"]
    assert len(instance["labels"]) == 5
    labels = [field.label for field in instance["labels"].field_list]
    assert labels == ["net", "netting", "mesh", "pitcher", "orange"]
    assert torch.allclose(
        instance["label_weights"].tensor,
        torch.tensor([1.0, 1.0 / 3, 1.0 / 3, 1.0 / 3, 1.0 / 3]),
    )

    batch = Batch(instances)
    batch.index_instances(Vocabulary())
    tensors = batch.as_tensor_dict()

    # (batch size, num boxes (fake), num features (fake))
    assert tensors["box_features"].size() == (3, 2, 10)

    # (batch size, num boxes (fake), 4 coords)
    assert tensors["box_coordinates"].size() == (3, 2, 4)

    # (batch size, num boxes (fake),)
    assert tensors["box_mask"].size() == (3, 2)

    # Nothing should be masked out since the number of fake boxes is the same
    # for each item in the batch.
    assert tensors["box_mask"].all()

def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             max_tokens: int = None,
             balance: bool = False,
             text_col: str = "text",
             label_col: str = "label",
             data_dir: str = "data/CrisisNLP_volunteers_labeled_data",
             exclude: List[str] = ["MH370", "Respiratory", "ebola"],
             **kwargs):
    super().__init__(**kwargs)
    self.tokenizer = tokenizer or WhitespaceTokenizer()
    self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self.max_tokens = max_tokens
    self.balance = balance
    self.text_col = text_col
    self.label_col = label_col
    self.data_dir = Path(data_dir)
    self.exclude = exclude

def test_read(self):
    from allennlp_models.vision.dataset_readers.vgqa import VGQAReader

    reader = VGQAReader(
        image_dir=FIXTURES_ROOT / "vision" / "images" / "vgqa",
        image_loader=TorchImageLoader(),
        image_featurizer=Lazy(NullGridEmbedder),
        region_detector=Lazy(RandomRegionDetector),
        tokenizer=WhitespaceTokenizer(),
        token_indexers={"tokens": SingleIdTokenIndexer()},
    )
    instances = list(reader.read("test_fixtures/vision/vgqa/question_answers.json"))
    assert len(instances) == 8

    instance = instances[0]
    assert len(instance.fields) == 6
    assert len(instance["question"]) == 5
    question_tokens = [t.text for t in instance["question"]]
    assert question_tokens == ["What", "is", "on", "the", "curtains?"]
    assert len(instance["labels"]) == 1
    labels = [field.label for field in instance["labels"].field_list]
    assert labels == ["sailboats"]

    batch = Batch(instances)
    batch.index_instances(Vocabulary())
    tensors = batch.as_tensor_dict()

    # (batch size, num boxes (fake), num features (fake))
    assert tensors["box_features"].size() == (8, 2, 10)

    # (batch size, num boxes (fake), 4 coords)
    assert tensors["box_coordinates"].size() == (8, 2, 4)

    # (batch size, num boxes (fake))
    assert tensors["box_mask"].size() == (8, 2)

    # Nothing should be masked out since the number of fake boxes is the same
    # for each item in the batch.
    assert tensors["box_mask"].all()

def test_read(self):
    from allennlp_models.vision.dataset_readers.visual_entailment import VisualEntailmentReader

    reader = VisualEntailmentReader(
        image_dir=FIXTURES_ROOT / "vision" / "images" / "visual_entailment",
        image_loader=TorchImageLoader(),
        image_featurizer=Lazy(NullGridEmbedder),
        region_detector=Lazy(RandomRegionDetector),
        tokenizer=WhitespaceTokenizer(),
        token_indexers={"tokens": SingleIdTokenIndexer()},
    )
    instances = list(reader.read("test_fixtures/vision/visual_entailment/sample_pairs.jsonl"))
    assert len(instances) == 16

    instance = instances[0]
    assert len(instance.fields) == 5
    assert len(instance["hypothesis"]) == 4
    sentence_tokens = [t.text for t in instance["hypothesis"]]
    assert sentence_tokens == ["A", "toddler", "sleeps", "outside."]
    assert instance["labels"].label == "contradiction"

    batch = Batch(instances)
    vocab = Vocabulary()
    vocab.add_tokens_to_namespace(["entailment", "contradiction", "neutral"], "labels")
    batch.index_instances(vocab)
    tensors = batch.as_tensor_dict()

    # (batch size, num boxes (fake), num features (fake))
    assert tensors["box_features"].size() == (16, 2, 10)

    # (batch size, num boxes (fake), 4 coords)
    assert tensors["box_coordinates"].size() == (16, 2, 4)

    # (batch size, num boxes (fake),)
    assert tensors["box_mask"].size() == (16, 2)

def test_read(self):
    from allennlp_models.vision.dataset_readers.nlvr2 import Nlvr2Reader

    reader = Nlvr2Reader(
        image_dir=FIXTURES_ROOT / "vision" / "images" / "nlvr2",
        image_loader=TorchImageLoader(),
        image_featurizer=Lazy(NullGridEmbedder),
        region_detector=Lazy(RandomRegionDetector),
        tokenizer=WhitespaceTokenizer(),
        token_indexers={"tokens": SingleIdTokenIndexer()},
    )
    instances = list(reader.read("test_fixtures/vision/nlvr2/tiny-dev.json"))
    assert len(instances) == 8

    instance = instances[0]
    assert len(instance.fields) == 6
    assert instance["hypothesis"][0] == instance["hypothesis"][1]
    assert len(instance["hypothesis"][0]) == 18
    hypothesis_tokens = [t.text for t in instance["hypothesis"][0]]
    assert hypothesis_tokens[:6] == ["The", "right", "image", "shows", "a", "curving"]
    assert instance["label"].label == 0
    assert instances[1]["label"].label == 1
    assert instance["identifier"].metadata == "dev-850-0-0"

    batch = Batch(instances)
    batch.index_instances(Vocabulary())
    tensors = batch.as_tensor_dict()

    # (batch size, 2 images per instance, num boxes (fake), num features (fake))
    assert tensors["box_features"].size() == (8, 2, 2, 10)

    # (batch size, 2 images per instance, num boxes (fake), 4 coords)
    assert tensors["box_coordinates"].size() == (8, 2, 2, 4)

    # (batch size, 2 images per instance, num boxes (fake))
    assert tensors["box_mask"].size() == (8, 2, 2)