Example No. 1
def run_ELMo_RSA(stim_file, header=False, filter_file=None):

    EXP = data.Stim(stim_file, header, filter_file, VOCAB_FILE)

    #Get tokenizer
    tokenizer = WhitespaceTokenizer()

    #Load model
    ##ELMo OG
    elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'
    elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json'

    #ELMo Small
    #elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    #elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'

    #ELMo Medium
    #elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5'
    #elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json'

    #ELMo OG (5.5B)
    #elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
    #elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'

    elmo_embedding = ElmoTokenEmbedder(options_file=elmo_options_file,
                                       weight_file=elmo_weight_file,
                                       dropout=0.0)
    embedder = BasicTextFieldEmbedder(
        token_embedders={'elmo_tokens': elmo_embedding})

    for x in range(len(EXP.SENTS)):
        sentences = list(EXP.SENTS[x])
        target = sentences[0]
        sentence = sentences[1]

        #GET BASELINE
        token_indexer = ELMoTokenCharactersIndexer()
        vocab = Vocabulary()

        target_tokens = tokenizer.tokenize(target)
        target_text_field = TextField(target_tokens,
                                      {'elmo_tokens': token_indexer})
        target_text_field.index(vocab)
        target_token_tensor = target_text_field.as_tensor(
            target_text_field.get_padding_lengths())
        target_tensor_dict = target_text_field.batch_tensors(
            [target_token_tensor])

        target_embedding = embedder(target_tensor_dict)[0]
        baseline = target_embedding[-1].data.cpu().squeeze()

        #GET SIMS
        sims = get_ELMo_sims(sentence, baseline, tokenizer, embedder)
        values = get_dummy_values(sentence)

        EXP.load_IT('elmo', x, values, False, sims)

    return EXP
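A minimal, self-contained sketch of the same tokenize → TextField → embed pipeline on a single made-up sentence; it reuses the ELMo Small URLs that are commented out above, so the only assumption is that AllenNLP can download those files.

from allennlp.data import Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import ELMoTokenCharactersIndexer
from allennlp.data.tokenizers import WhitespaceTokenizer
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import ElmoTokenEmbedder

options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'

embedder = BasicTextFieldEmbedder(
    token_embedders={'elmo_tokens': ElmoTokenEmbedder(options_file=options_file,
                                                      weight_file=weight_file,
                                                      dropout=0.0)})

# Tokenize, index, and batch a single made-up sentence, then embed it.
tokens = WhitespaceTokenizer().tokenize("the dog chased the cat")
text_field = TextField(tokens, {'elmo_tokens': ELMoTokenCharactersIndexer()})
text_field.index(Vocabulary())
tensor = text_field.as_tensor(text_field.get_padding_lengths())
tensor_dict = text_field.batch_tensors([tensor])

embeddings = embedder(tensor_dict)[0]        # shape: (num_tokens, embedding_dim)
last_token_vector = embeddings[-1].detach()  # vector for the final token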
Example No. 2
class IOBDatasetReader(DatasetReader):
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        self.tokenizer_space = WhitespaceTokenizer()
        self.tokenizer_spacy = SpacyTokenizer(language="en_core_web_md",
                                              pos_tags=True,
                                              split_on_spaces=True)
        self.token_indexers = {
            'elmo_tokens': ELMoTokenCharactersIndexer(),
            'token_characters': TokenCharactersIndexer(namespace='character_vocab',
                                                      min_padding_length=2),
            'pos_tags': SingleIdTokenIndexer(namespace='pos_tag_vocab', default_value='NNP',
                                     feature_name='tag_')
        } 
        
        self.intent_indexers = {
            'elmo_tokens': ELMoTokenCharactersIndexer(),
            'token_characters': TokenCharactersIndexer(namespace='character_vocab',
                                                      min_padding_length=2),
            'pos_tags': SingleIdTokenIndexer(namespace='pos_tag_vocab', default_value='NNP',
                                     feature_name='tag_')
        }
        
        
    def text_to_instance(self, tokens: List[Token], intent: List[Token],
                         rmf: str = None,
                         label: str = None) -> Instance:
        sentence_field = TextField(tokens, self.token_indexers)
        intent_field = TextField(intent, self.intent_indexers)
        
        
        fields = {"utterance": sentence_field, 
                  "intent": intent_field
                 }

        if label:
            fields["label"] = LabelField(label)
            
        if rmf:
            rmf = np.fromstring(rmf, dtype=float, sep=' ')
            fields["rmf"] = ArrayField(rmf)
            
        
        return Instance(fields)
    
    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(file_path) as f:
            for line in f:
                sentence, intent, rmf, label = line.strip().split('\t')
                yield self.text_to_instance(self.tokenizer_space.tokenize(sentence),
                                            self.tokenizer_space.tokenize(intent),
                                            rmf,
                                            label
                                            )
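A minimal usage sketch for the reader above, assuming the IOBDatasetReader class is in scope and the spaCy model en_core_web_md is installed; the utterance, intent, rmf string, and label are made-up values.

reader = IOBDatasetReader()

tokens = reader.tokenizer_space.tokenize("book a flight to boston")
intent = reader.tokenizer_space.tokenize("BookFlight")
instance = reader.text_to_instance(tokens, intent,
                                   rmf="0.1 0.2 0.3",
                                   label="flight")
print(instance)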
Example No. 3
    def __init__(
        self,
        discretizer_path: str,
        max_sequence_length: int = None,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)

        self.discretizer = load_discretizer(discretizer_path)
        self._max_sequence_length = max_sequence_length or math.inf
        self._tokenizer = WhitespaceTokenizer()
        self._start_token = Token(START_TOKEN)
        self._end_token = Token(END_TOKEN)
Example No. 4
class CCGReader(SequenceTaggingDatasetReader):
    def __init__(self, *args, **kwargs):
        super(CCGReader, self).__init__(*args, **kwargs)
        self.tokenizer = WhitespaceTokenizer()

    def text_to_instances(self, text: str) -> Iterable[Instance]:
        instances = []
        for line in text.split('\n'):
            instance = self.text_to_instance(line)
            if instance:
                instances.append(instance)
        return instances

    def text_to_instance(self, line: str) -> Optional[Instance]:
        tokens = []
        tags = []
        toks_tags = self.tokenizer.tokenize(line)
        if not toks_tags:
            return None
        for tok_tag in toks_tags:
            tok, *tag = tok_tag.text.split(self._word_tag_delimiter)
            tokens.append(Token(tok))
            tags.append(tag[0] if tag else UNK)

        inst = Instance({'tokens': TextField(tokens, {})})
        inst.add_field('tags', SequenceLabelField(tags, inst['tokens']))
        return inst
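A minimal usage sketch, assuming the CCGReader class above is in scope, that a module-level UNK constant exists for untagged tokens, and that the default '###' word/tag delimiter inherited from SequenceTaggingDatasetReader is in effect; the sentence and supertags are made up.

reader = CCGReader()
instances = reader.text_to_instances("Time###N flies###(S\\NP)\n")
for instance in instances:
    print(instance["tokens"].tokens, instance["tags"].labels)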
Example No. 5
 def __init__(
     self,
     token_indexers: Dict[str, TokenIndexer] = None,
     tokenizer: Tokenizer = None,
     sent_max_len: int = 100,
     max_sent_per_example: int = 20,
     use_sep: bool = True,
     sci_sum: bool = False,
     use_abstract_scores: bool = True,
     sci_sum_fake_scores: bool = True,
     predict: bool = False,
 ) -> None:
     super().__init__(manual_distributed_sharding=True,
                      manual_multiprocess_sharding=True)
     self._tokenizer = tokenizer or WhitespaceTokenizer()
     self._token_indexers = token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     self.sent_max_len = sent_max_len
     self.use_sep = use_sep
     self.predict = predict
     self.sci_sum = sci_sum
     self.max_sent_per_example = max_sent_per_example
     self.use_abstract_scores = use_abstract_scores
     self.sci_sum_fake_scores = sci_sum_fake_scores
Example No. 6
    def __init__(self,
                 lazy: bool = True,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_tokens: int = 768*4,
                 listfile: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/in-hospital-mortality/train/listfile.csv",
                 notes_dir: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/extracted_notes",
                 skip_patients_file: str ="/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/extracted_notes/null_patients.txt",
                 stats_write_dir: str="/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/extracted_notes/",
                 all_stays: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/root/all_stays.csv"

    ):
        super().__init__(lazy)
        self.tokenizer = tokenizer or WhitespaceTokenizer()
        self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self.max_tokens = max_tokens
        self.listfile = listfile
        self.notes_dir = notes_dir
        logger.critical(f"we are getting the max tokens {self.max_tokens}")
        self.null_patients = []
        with open(skip_patients_file, "r") as file:
            for line in file:
                self.null_patients.append(line.strip())
        self.stats_write_dir = stats_write_dir
        self.all_stays_path = all_stays
        self.all_stays_df = self.get_all_stays()
Example No. 7
    def __init__(self,
                 lazy: bool = True,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_tokens: int = 768*4,
                 train_listfile: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/in-hospital-mortality/train/listfile.csv",
                 test_listfile: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/in-hospital-mortality/test/listfile.csv",
                 notes_dir: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/extracted_notes",
                 skip_patients_file: str ="/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/extracted_notes/null_patients.txt",
                 stats_write_dir: str="/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/extracted_notes/",
                 all_stays: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/root/all_stays.csv",
                 limit_examples: int = None,
                 use_preprocessing: bool = False,
                 num_classes: int=2,
                 mode: str='train',
                 data_type: str="MORTALITY",
                 args=None,
                 hadm2eps_path: str="/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/extracted_notes/hadm2episode.dict"


    ):
        super().__init__(lazy)
        self.tokenizer = tokenizer or WhitespaceTokenizer()
        self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self.max_tokens = max_tokens
        self.train_listfile = train_listfile
        self.test_listfile = test_listfile
        self.notes_dir = notes_dir
        self.use_preprocessing = use_preprocessing

        logger.critical(f"we are getting the max tokens {self.max_tokens} "
                        f"and use_preproc is {self.use_preprocessing}")
        self.null_patients = []
        with open(skip_patients_file, "r") as file:
            for line in file:
                self.null_patients.append(line.strip())
        self.stats_write_dir = stats_write_dir
        self.all_stays_path = all_stays
        self.all_stays_df = self.get_all_stays()
        self.limit_examples = limit_examples
        self.cur_examples = 0
        self.lengths = []
        self.num_classes = num_classes
        self.mode = mode
        self.sampled_idx = {}
        self.data_type = data_type
        self.args = args

        # Realistically, only the train_idx will be set; we simply need to
        # compare against self.null_patients.
        self.get_idx()
        self.vocab = None
        self.hadm2eps_path = hadm2eps_path
        self.listfile_df = pd.read_csv(train_listfile)

        if self.data_type == "PHENOTYPING" or self.data_type == "DECOMPENSATION":
            self.labels_start_idx = 2
        elif self.data_type == "MORTALITY":
            self.labels_start_idx = 1

        self.labels = list(self.listfile_df.columns[self.labels_start_idx:])
Example No. 8
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        self.tokenizer_space = WhitespaceTokenizer()
        self.tokenizer_spacy = SpacyTokenizer(language="en_core_web_md",
                                              pos_tags=True,
                                              split_on_spaces=True)
        self.token_indexers = {
            'elmo_tokens':
            ELMoTokenCharactersIndexer(),
            'token_characters':
            TokenCharactersIndexer(namespace='character_vocab',
                                   min_padding_length=6),
            'pos_tags':
            SingleIdTokenIndexer(namespace='pos_tag_vocab',
                                 feature_name='tag_'),
            'ner_tags':
            SingleIdTokenIndexer(namespace='ner_tag_vocab',
                                 feature_name='ent_type_')
        }

        self.slot_indexers = {
            'elmo_tokens':
            ELMoTokenCharactersIndexer(),
            'token_characters':
            TokenCharactersIndexer(namespace='character_vocab',
                                   min_padding_length=6)
        }
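A minimal sketch of how a TextField carrying two of the indexers above is turned into tensors; it uses only the indexers that need no spaCy annotations, and the sentence is made up.

from allennlp.data import Instance, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import (ELMoTokenCharactersIndexer,
                                          TokenCharactersIndexer)
from allennlp.data.tokenizers import WhitespaceTokenizer

tokens = WhitespaceTokenizer().tokenize("book a table for two people")
field = TextField(tokens, {
    'elmo_tokens': ELMoTokenCharactersIndexer(),
    'token_characters': TokenCharactersIndexer(namespace='character_vocab',
                                               min_padding_length=6),
})
instance = Instance({'utterance': field})

vocab = Vocabulary.from_instances([instance])   # populates 'character_vocab'
instance.index_fields(vocab)

tensors = instance.as_tensor_dict()
print(tensors['utterance'].keys())   # one entry per token indexer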
Example No. 9
def build_elmo_dataset_reader(lower=False) -> DatasetReader:
    tokenizer = WhitespaceTokenizer()
    token_indexers = {'bert_tokens': ELMoTokenCharactersIndexer()}
    return ClassificationDatasetReader(tokenizer=tokenizer,
                                       token_indexers=token_indexers,
                                       max_tokens=300,
                                       lower=lower)
Example No. 10
def build_domain_dataset_reader(lower=False) -> DatasetReader:
    tokenizer = WhitespaceTokenizer()
    token_indexers = {'tokens': SingleIdTokenIndexer()}
    return DomainDatasetReader(tokenizer=tokenizer,
                               token_indexers=token_indexers,
                               max_tokens=None,
                               lower=lower)
Example No. 11
 def __init__(self,
              source_tokenizer: Tokenizer = None,
              target_tokenizer: Tokenizer = None,
              source_token_indexer: Dict[str, TokenIndexer] = None,
              target_token_indexer: Dict[str, TokenIndexer] = None,
              max_tokens: int = None,
              target_add_start_token: bool = True,
              target_add_end_token: bool = True,
              end_symbol: str = END_SYMBOL,
              start_symbol: str = START_SYMBOL,
              **kwargs) -> None:
     super().__init__(**kwargs)
     self._source_tokenizer = source_tokenizer or CharacterTokenizer(
         start_tokens=[
             START_SYMBOL,
         ],
         end_tokens=[
             END_SYMBOL,
         ],
         lowercase_characters=True)
     self._target_tokenizer = target_tokenizer or WhitespaceTokenizer()
     self._source_token_indexer = source_token_indexer or {
         'tokens': SingleIdTokenIndexer()
     }
     self._target_token_indexer = target_token_indexer or {
         'tokens': SingleIdTokenIndexer()
     }
     self._max_tokens = max_tokens
     self._start_symbol = start_symbol
     self._end_symbol = end_symbol
     self._target_add_start_token = target_add_start_token
     self._target_add_end_token = target_add_end_token
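A small sketch of what the default source tokenizer above produces for a made-up word; START_SYMBOL and END_SYMBOL are the constants from allennlp.common.util.

from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data.tokenizers import CharacterTokenizer

source_tokenizer = CharacterTokenizer(start_tokens=[START_SYMBOL],
                                      end_tokens=[END_SYMBOL],
                                      lowercase_characters=True)
print([t.text for t in source_tokenizer.tokenize("Ship")])
# ['@start@', 's', 'h', 'i', 'p', '@end@']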
Example No. 12
 def __init__(self,
              token_indexers: Dict[str, TokenIndexer] = None,
              tokenizer: Tokenizer = None):
     super().__init__()
     self._tokenizer = tokenizer or WhitespaceTokenizer()
     self._token_indexers = token_indexers or {
         'tokens': SingleIdTokenIndexer()
     }
Example No. 13
 def __init__(self,
              tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None,
              max_tokens: int = None,
              **kwargs):
     super().__init__(**kwargs)
     self.tokenizer = tokenizer or WhitespaceTokenizer()
     self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
     self.max_tokens = max_tokens
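A minimal sketch of what the default WhitespaceTokenizer / SingleIdTokenIndexer combination used by readers like the one above produces for a made-up sentence.

from allennlp.data import Instance, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WhitespaceTokenizer

tokens = WhitespaceTokenizer().tokenize("the cat sat on the mat")
field = TextField(tokens, {'tokens': SingleIdTokenIndexer()})
instance = Instance({'text': field})

vocab = Vocabulary.from_instances([instance])
instance.index_fields(vocab)

# One id per whitespace-separated token; both occurrences of "the" share an id.
print(instance.as_tensor_dict()['text']['tokens']['tokens'])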
Example No. 14
 def __init__(self, max_length: int = None, tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None,
              fill_in_empty_labels: bool = False, clean_text: bool = False) -> None:
     super().__init__()
     self._max_sequence_length = max_length
     self.fill_in_empty_labels = fill_in_empty_labels
     self._tokenizer = tokenizer or WhitespaceTokenizer()
     self._token_indexer = token_indexers or {'tokens': SingleIdTokenIndexer()}
     self._clean_text = clean_text
Example No. 15
 def __init__(self, tokenizer: Tokenizer = None,
                    token_indexers: Dict[str, TokenIndexer] = None,
                    **kwargs):
     """
     :param tokenizer: used to split text into list of tokens
     :param token_indexers: it defines how to map tokens to integer
     """
     super().__init__(**kwargs)
     self._tokenizer = tokenizer or WhitespaceTokenizer()
     self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
Example No. 16
 def __init__(self,
              lazy: bool = False,
              tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None,
              max_tokens: int = None):
     super().__init__(lazy)
     self.tokenizer = tokenizer or WhitespaceTokenizer()
     self.token_indexers = token_indexers or {
         'tokens': SingleIdTokenIndexer()}
     self.max_tokens = max_tokens
Example No. 17
 def __init__(self,
              tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None,
              max_sequence_length: int = None) -> None:
     super().__init__()
     self._tokenizer = tokenizer or WhitespaceTokenizer()
     self._token_indexers = token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     self._max_sequence_length = max_sequence_length
Example No. 18
def build_dataset_reader(transformer_model=None, lower=False) -> DatasetReader:
    if transformer_model is not None:
        tokenizer = PretrainedTransformerTokenizer(transformer_model)
    else:
        tokenizer = WhitespaceTokenizer()
    token_indexers = {'bert_tokens': SingleIdTokenIndexer()}
    return ClassificationDatasetReader(tokenizer=tokenizer,
                                       token_indexers=token_indexers,
                                       max_tokens=None,
                                       lower=lower)
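A small sketch contrasting the two tokenizer branches above on a made-up sentence; the transformer tokenizer adds the model's special tokens and may split words into word pieces, and 'bert-base-uncased' is only an example model name.

from allennlp.data.tokenizers import (PretrainedTransformerTokenizer,
                                      WhitespaceTokenizer)

text = "Unbelievably good."
print([t.text for t in WhitespaceTokenizer().tokenize(text)])
print([t.text for t in PretrainedTransformerTokenizer("bert-base-uncased").tokenize(text)])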
Example No. 19
def prepare_data():
    reader = TextClassificationJsonReader(
        token_indexers={"tokens": SingleIdTokenIndexer()},
        tokenizer=WhitespaceTokenizer(),
    )
    train_dataset = reader.read("https://s3-us-west-2.amazonaws.com/allennlp/datasets/imdb/train.jsonl")  # NOQA
    valid_dataset = reader.read("https://s3-us-west-2.amazonaws.com/allennlp/datasets/imdb/dev.jsonl")  # NOQA
    vocab = Vocabulary.from_instances(train_dataset)
    train_dataset.index_with(vocab)
    valid_dataset.index_with(vocab)
    return train_dataset, valid_dataset, vocab
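A follow-up usage sketch, assuming an AllenNLP version where reader.read() returns an indexable dataset with index_with (as prepare_data above relies on); it batches a few of the indexed IMDB instances. Calling prepare_data() downloads the two jsonl files on first use.

from allennlp.data import Batch

train_dataset, valid_dataset, vocab = prepare_data()

batch = Batch(list(train_dataset)[:8])
batch.index_instances(vocab)
tensors = batch.as_tensor_dict()

# TextClassificationJsonReader stores the text under the "tokens" field.
print(tensors["tokens"]["tokens"]["tokens"].shape)   # (8, max_sequence_length)
print(tensors["label"].shape)                        # (8,)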
Example No. 20
 def __init__(self,
              tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None,
              **kwargs):
     super().__init__(manual_distributed_sharding=True,
                      manual_multiprocess_sharding=True,
                      **kwargs)
     self.tokenizer = tokenizer or WhitespaceTokenizer()
     self.token_indexers = token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
Example No. 21
    def test_train_read(self):
        self.reader = Flickr30kReader(
            image_dir=FIXTURES_ROOT / "vision" / "images" / "flickr30k",
            image_loader=TorchImageLoader(),
            image_featurizer=Lazy(NullGridEmbedder),
            data_dir=FIXTURES_ROOT / "vision" / "flickr30k" / "sentences",
            region_detector=Lazy(RandomRegionDetector),
            tokenizer=WhitespaceTokenizer(),
            token_indexers={"tokens": SingleIdTokenIndexer()},
            featurize_captions=False,
            num_potential_hard_negatives=4,
        )

        instances = list(self.reader.read("test_fixtures/vision/flickr30k/test.txt"))
        assert len(instances) == 25

        instance = instances[5]
        assert len(instance.fields) == 5
        assert len(instance["caption"]) == 4
        assert len(instance["caption"][0]) == 12  # 16
        assert instance["caption"][0] != instance["caption"][1]
        assert instance["caption"][0] == instance["caption"][2]
        assert instance["caption"][0] == instance["caption"][3]
        question_tokens = [t.text for t in instance["caption"][0]]
        assert question_tokens == [
            "girl",
            "with",
            "brown",
            "hair",
            "sits",
            "on",
            "edge",
            "of",
            "concrete",
            "area",
            "overlooking",
            "water",
        ]

        batch = Batch(instances)
        batch.index_instances(Vocabulary())
        tensors = batch.as_tensor_dict()

        # (batch size, num images (3 hard negatives + gold image), num boxes (fake), num features (fake))
        assert tensors["box_features"].size() == (25, 4, 2, 10)

        # (batch size, num images (3 hard negatives + gold image), num boxes (fake), 4 coords)
        assert tensors["box_coordinates"].size() == (25, 4, 2, 4)

        # (batch size, num images (3 hard negatives + gold image), num boxes (fake),)
        assert tensors["box_mask"].size() == (25, 4, 2)

        # (batch size)
        assert tensors["label"].size() == (25,)
Example No. 22
    def setup_method(self):
        from allennlp_models.vision.dataset_readers.gqa import GQAReader

        super().setup_method()
        self.reader = GQAReader(
            image_dir=FIXTURES_ROOT / "vision" / "images" / "gqa",
            image_loader=TorchImageLoader(),
            image_featurizer=Lazy(NullGridEmbedder),
            region_detector=Lazy(RandomRegionDetector),
            tokenizer=WhitespaceTokenizer(),
            token_indexers={"tokens": SingleIdTokenIndexer()},
        )
Example No. 23
    def test_read_without_images(self):
        from allennlp_models.vision.dataset_readers.vqav2 import VQAv2Reader

        reader = VQAv2Reader(
            tokenizer=WhitespaceTokenizer(),
            token_indexers={"tokens": SingleIdTokenIndexer()},
        )
        instances = list(reader.read("unittest"))
        assert len(instances) == 3
        assert "box_coordinates" not in instances[0]
        assert "box_features" not in instances[0]
        assert "box_mask" not in instances[0]
Example No. 24
 def __init__(self,
             source_tokenizer: Tokenizer = None,
             target_tokenizer: Tokenizer = None,
             source_token_indexers: Dict[str, TokenIndexer] = None,
             target_token_indexers: Dict[str, TokenIndexer] = None,
             **kwargs,
             ) -> None:
    super().__init__(**kwargs)
    self._source_tokenizer = source_tokenizer or WhitespaceTokenizer()
    self._target_tokenizer = target_tokenizer or self._source_tokenizer
    self._source_token_indexer = (source_token_indexers
                                  or {"tokens": SingleIdTokenIndexer()})
    self._target_token_indexers = (target_token_indexers
                                   or self._source_token_indexer)
Example No. 25
 def __init__(
     self,
     token_indexers: Dict[str, TokenIndexer] = None,
     seg_threshold: bool = True,
     tokenizer: Tokenizer = None,
     read_entities: bool = False,
     **kwargs,
 ) -> None:
     super().__init__(**kwargs)
     self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
     self.seg_threshold = seg_threshold
     self.tokenizer = tokenizer
     self.whitespace_tokenizer = WhitespaceTokenizer()
     self.read_entities = read_entities
Example No. 26
    def __init__(self,
                 source_tokenizer: Tokenizer = None,
                 target_tokenizer: Tokenizer = None,
                 source_token_indexers: Dict[str, TokenIndexer] = None,
                 target_token_indexers: Dict[str, TokenIndexer] = None,
                 **kwargs) -> None:
        super().__init__(manual_distributed_sharding=True,
                         manual_multiprocess_sharding=True,
                         **kwargs)
        self._source_tokenizer = source_tokenizer or WhitespaceTokenizer()
        self._target_tokenizer = target_tokenizer or self._source_tokenizer
        self.source_token_indexers = source_token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }

        self._target_token_indexers = target_token_indexers or self.source_token_indexers
Example No. 27
    def __init__(self,
                 dataset_name: str = 'cnn_dailymail',
                 tokenizer: Tokenizer = None,
                 source_token_indexers: Dict[str, TokenIndexer] = None,
                 target_token_indexers: Dict[str, TokenIndexer] = None,
                 source_max_tokens: int = 400,
                 target_max_tokens: int = 100,
                 seperate_namespaces: bool = False,
                 target_namespace: str = "target_tokens",
                 save_copy_fields: bool = False,
                 save_pgn_fields: bool = False,
                 lowercase: bool = True,
                 max_instances=None):
        super().__init__()
        self._lowercase = lowercase
        self._max_instances = max_instances
        self._dataset_name = dataset_name
        self._source_max_tokens = source_max_tokens
        self._target_max_tokens = target_max_tokens

        self._tokenizer = tokenizer or WhitespaceTokenizer()

        tokens_indexer = {
            'tokens': SingleIdTokenIndexer(lowercase_tokens=lowercase)
        }
        self._source_token_indexers = source_token_indexers or tokens_indexer
        self._target_token_indexers = target_token_indexers or tokens_indexer
        self._save_copy_fields = save_copy_fields
        self._save_pgn_fields = save_pgn_fields

        self._target_namespace = 'tokens'
        self.train = datasets.load_dataset(
            dataset_name,
            '3.0.0',
            split='train',
        )
        self.val = datasets.load_dataset(dataset_name,
                                         '3.0.0',
                                         split='validation[:5%]')

        if seperate_namespaces:
            self._target_namespace = target_namespace
            second_tokens_indexer = {
                'tokens': SingleIdTokenIndexer(namespace=target_namespace)
            }
            self._target_token_indexers = target_token_indexers or second_tokens_indexer
Example No. 28
    def test_read(self):
        from allennlp_models.vision.dataset_readers.vqav2 import VQAv2Reader

        reader = VQAv2Reader(
            image_dir=FIXTURES_ROOT / "vision" / "images" / "vqav2",
            image_loader=TorchImageLoader(),
            image_featurizer=Lazy(NullGridEmbedder),
            region_detector=Lazy(RandomRegionDetector),
            tokenizer=WhitespaceTokenizer(),
            token_indexers={"tokens": SingleIdTokenIndexer()},
        )
        instances = list(reader.read("unittest"))
        assert len(instances) == 3

        instance = instances[0]
        assert len(instance.fields) == 6
        assert len(instance["question"]) == 7
        question_tokens = [t.text for t in instance["question"]]
        assert question_tokens == [
            "What", "is", "this", "photo", "taken", "looking", "through?"
        ]
        assert len(instance["labels"]) == 5
        labels = [field.label for field in instance["labels"].field_list]
        assert labels == ["net", "netting", "mesh", "pitcher", "orange"]
        assert torch.allclose(
            instance["label_weights"].tensor,
            torch.tensor([1.0, 1.0 / 3, 1.0 / 3, 1.0 / 3, 1.0 / 3]),
        )

        batch = Batch(instances)
        batch.index_instances(Vocabulary())
        tensors = batch.as_tensor_dict()

        # (batch size, num boxes (fake), num features (fake))
        assert tensors["box_features"].size() == (3, 2, 10)

        # (batch size, num boxes (fake), 4 coords)
        assert tensors["box_coordinates"].size() == (3, 2, 4)

        # (batch size, num boxes (fake),)
        assert tensors["box_mask"].size() == (3, 2)

        # Nothing should be masked out since the number of fake boxes is the same
        # for each item in the batch.
        assert tensors["box_mask"].all()
Example No. 29
 def __init__(self,
              tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None,
              max_tokens: int = None,
              balance: bool = False,
              text_col: str = "text",
              label_col: str = "label",
              data_dir: str = "data/CrisisNLP_volunteers_labeled_data",
              exclude: List[str] = ["MH370", "Respiratory", "ebola"],
              **kwargs):
     super().__init__(**kwargs)
     self.tokenizer = tokenizer or WhitespaceTokenizer()
     self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
     self.max_tokens = max_tokens
     self.balance = balance
     self.text_col = text_col
     self.label_col = label_col
     self.data_dir = Path(data_dir)
     self.exclude = exclude
Example No. 30
    def test_read(self):
        from allennlp_models.vision.dataset_readers.vgqa import VGQAReader

        reader = VGQAReader(
            image_dir=FIXTURES_ROOT / "vision" / "images" / "vgqa",
            image_loader=TorchImageLoader(),
            image_featurizer=Lazy(NullGridEmbedder),
            region_detector=Lazy(RandomRegionDetector),
            tokenizer=WhitespaceTokenizer(),
            token_indexers={"tokens": SingleIdTokenIndexer()},
        )
        instances = list(
            reader.read("test_fixtures/vision/vgqa/question_answers.json"))
        assert len(instances) == 8

        instance = instances[0]
        assert len(instance.fields) == 6
        assert len(instance["question"]) == 5
        question_tokens = [t.text for t in instance["question"]]
        assert question_tokens == ["What", "is", "on", "the", "curtains?"]
        assert len(instance["labels"]) == 1
        labels = [field.label for field in instance["labels"].field_list]
        assert labels == ["sailboats"]

        batch = Batch(instances)
        batch.index_instances(Vocabulary())
        tensors = batch.as_tensor_dict()

        # (batch size, num boxes (fake), num features (fake))
        assert tensors["box_features"].size() == (8, 2, 10)

        # (batch size, num boxes (fake), 4 coords)
        assert tensors["box_coordinates"].size() == (8, 2, 4)

        # (batch size, num boxes (fake))
        assert tensors["box_mask"].size() == (8, 2)

        # Nothing should be masked out since the number of fake boxes is the same
        # for each item in the batch.
        assert tensors["box_mask"].all()