Example #1
#assumed imports: torch is required by the code below; `data` and
#`data_test` (providing Stim and TestSent) plus get_sims and test_IT
#are project-local
import torch
import torch.nn as nn
def run_RSA(stim_file, vocab_file, model_files, header=False, 
        multisent_flag = False, filter_file = None, verbose=False, 
        embedding=False):

    ''' Given a stimuli file, model vocabulary file, and model files,
    return information-theoretic measures and similarities.'''

    #hard code data_dir
    data_path = './'


    #set loss function to cross entropy (defined here but not used below)
    criterion = nn.CrossEntropyLoss()

    #Load experiments
    EXP = data.Stim(stim_file, header, filter_file, vocab_file)

    #Loop through the models
    for model_file in model_files:
        if verbose:
            print('testing model:', model_file)

        #load the model
        with open(model_file, 'rb') as f:
            #run on local cpu for now
            model = torch.load(f, map_location='cpu')

            # unwrap DataParallel if needed, then make the RNN weights
            # contiguous in memory for speed
            if isinstance(model, torch.nn.DataParallel):
                model = model.module
            model.rnn.flatten_parameters()

        model.eval()
        #loop through experimental items for EXP
        for x in range(len(EXP.UNK_SENTS)):
            sentences = list(EXP.UNK_SENTS[x])

            #first sentence is the target (kept as a one-element list)
            target = sentences[:1]
            sentences = sentences[1:]

            #Create corpus wrapper (one-hot encodes the data)
            corpus = data_test.TestSent(data_path, vocab_file,
                    target, False)
            #Get one-hots
            target_ids = corpus.get_data()

            #Create corpus wrapper (one-hot encodes the data)
            corpus = data_test.TestSent(data_path, vocab_file,
                    sentences, multisent_flag)
            #Get one-hots
            sent_ids = corpus.get_data()

            sims = get_sims(target_ids, sent_ids, corpus, model, embedding)

            values = test_IT(sent_ids, corpus, model)

            EXP.load_IT(model_file, x, values, multisent_flag, sims)

    return EXP
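
A minimal usage sketch for this entry; the file names and the checkpoint list are hypothetical placeholders, and data, data_test, get_sims, and test_IT must be supplied by the surrounding project:

#hypothetical paths and checkpoint name
exp = run_RSA('stimuli.csv', 'vocab.txt', ['lstm.pt'],
        header=True, verbose=True)
#exp now carries per-item information-theoretic values and similarities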
Example #2
#assumed allennlp (1.x-era) imports; VOCAB_FILE, data, get_ELMo_sims,
#and get_dummy_values come from the surrounding project
from allennlp.data import Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import ELMoTokenCharactersIndexer
from allennlp.data.tokenizers import WhitespaceTokenizer
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import ElmoTokenEmbedder
def run_ELMo_RSA(stim_file, header=False, filter_file=None):

    EXP = data.Stim(stim_file, header, filter_file, VOCAB_FILE)

    #Get tokenizer
    tokenizer = WhitespaceTokenizer()

    #Load model
    ##ELMo original
    elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'
    elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json'

    #ELMo Small
    #elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    #elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'

    #ELMo Medium
    #elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5'
    #elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json'

    #ELMo original (5.5B)
    #elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
    #elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'

    elmo_embedding = ElmoTokenEmbedder(options_file=elmo_options_file,
                                       weight_file=elmo_weight_file,
                                       dropout=0.0)
    embedder = BasicTextFieldEmbedder(
        token_embedders={'elmo_tokens': elmo_embedding})

    for x in range(len(EXP.SENTS)):
        sentences = list(EXP.SENTS[x])
        target = sentences[0]
        sentence = sentences[1]

        #GET BASELINE
        token_indexer = ELMoTokenCharactersIndexer()
        vocab = Vocabulary()

        target_tokens = tokenizer.tokenize(target)
        target_text_field = TextField(target_tokens,
                                      {'elmo_tokens': token_indexer})
        target_text_field.index(vocab)
        target_token_tensor = target_text_field.as_tensor(
            target_text_field.get_padding_lengths())
        target_tensor_dict = target_text_field.batch_tensors(
            [target_token_tensor])

        target_embedding = embedder(target_tensor_dict)[0]
        #use the final token's contextual vector as the baseline
        baseline = target_embedding[-1].data.cpu().squeeze()

        #GET SIMS
        sims = get_ELMo_sims(sentence, baseline, tokenizer, embedder)
        values = get_dummy_values(sentence)

        EXP.load_IT('elmo', x, values, False, sims)

    return EXP
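
A hypothetical call for this entry; the stimulus file is a placeholder, and VOCAB_FILE must already be defined at module level:

#hypothetical path; the ELMo weights are downloaded on first use
exp = run_ELMo_RSA('stimuli.csv', header=True)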
Example #3
def check_unk(stim_file, vocab_file, header=False, 
        filter_file=None, verbose=False):

    ''' Given a stimuli file and model vocabulary file,
    return the UNK'd stimuli.'''

    #hard code data_dir
    data_path = './'


    #Load experiments
    EXP = data.Stim(stim_file, header, filter_file, vocab_file)
    EXP.check_unks()

    return EXP
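
A hypothetical call, with placeholder paths, to inspect which stimuli fall outside the model vocabulary:

exp = check_unk('stimuli.csv', 'vocab.txt', header=True)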
Example #4
#assumed imports; data, get_BERT_sims, get_dummy_values, and VOCAB_FILE
#are project-local
import torch
from transformers import BertModel, BertTokenizer
def run_BERT_RSA(stim_file, layer, header=False, filter_file=None):

    EXP = data.Stim(stim_file, header, filter_file, VOCAB_FILE)

    #Load BERT uncased
    pretrained_weights = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    model = BertModel.from_pretrained(pretrained_weights,
                                      output_hidden_states=True)

    #tokenizer = AutoTokenizer.from_pretrained("nyu-mll/roberta-base-100M-3")
    #tokenizer = AutoTokenizer.from_pretrained("nyu-mll/roberta-base-1B-3")
    #model = RobertaForMaskedLM.from_pretrained("nyu-mll/roberta-base-1B-3", output_hidden_states=True)

    model.eval()
    model.zero_grad()

    for x in range(len(EXP.SENTS)):
        sentences = list(EXP.SENTS[x])

        target = sentences[0]
        sentences = sentences[1:]

        #GET BASELINE
        target_encoded = tokenizer.encode(target)
        target_ids = torch.tensor(target_encoded).unsqueeze(0)

        #hidden_states is (embedding output, layer 1, ..., layer 12)
        hidden_states = model(target_ids)[-1]
        embed, hidden_states = hidden_states[:1], hidden_states[1:]

        hidden_states = hidden_states[layer][0]

        #decode the penultimate token (the one before [SEP])
        baseline_word = tokenizer.decode(
            torch.tensor([target_encoded[-2]])).strip()

        #baseline is that token's hidden state at the chosen layer
        baseline = hidden_states[-2].data.cpu().squeeze()

        sims = get_BERT_sims(sentences[0], layer, baseline, tokenizer, model)
        values = get_dummy_values(sentences[0])

        EXP.load_IT('bert-uncased', x, values, False, sims)

    return EXP
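
A hypothetical call; because the embedding output is sliced off above, layer indexes the 12 encoder layers of bert-base-uncased as 0-11:

#hypothetical path; layer=11 takes the final encoder layer
exp = run_BERT_RSA('stimuli.csv', layer=11, header=True)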
Example #5
#assumed imports; `data`, `data_test`, and test_IT are project-local
import torch
import torch.nn as nn
def run_norming(stim_file, vocab_file, model_files, header=False, 
        multisent_flag = False, filter_file = None, verbose=False):

    ''' Given a stimuli file, model vocabulary file, and model files,
    return frequency and information-theoretic measures.'''

    #hard code data_dir
    data_path = './'


    #set loss function to cross entropy (defined here but not used below)
    criterion = nn.CrossEntropyLoss()

    #Load experiments
    EXP = data.Stim(stim_file, header, filter_file, vocab_file)

    #Loop through the models
    for model_file in model_files:
        if verbose:
            print('testing model:', model_file)

        #load the model
        with open(model_file, 'rb') as f:
            #run on local cpu for now
            model = torch.load(f, map_location='cpu')

        #assumed: switch to eval mode as in run_RSA (disables dropout)
        model.eval()

        #loop through experimental items for EXP
        for x in range(len(EXP.UNK_SENTS)):
            sentences = list(EXP.UNK_SENTS[x])

            #Create corpus wrapper (one-hot encodes the data)
            corpus = data_test.TestSent(data_path, vocab_file,
                    sentences, multisent_flag)
            #Get one-hots
            sent_ids = corpus.get_data()

            values = test_IT(sent_ids, corpus, model)

            EXP.load_IT(model_file, x, values, multisent_flag)

    return EXP
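
A minimal usage sketch, mirroring run_RSA above; the paths and checkpoint name are hypothetical placeholders:

exp = run_norming('stimuli.csv', 'vocab.txt', ['lstm.pt'], header=True)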
Example #6
#assumed imports; data, get_GPT_sims, get_dummy_values, and VOCAB_FILE
#are project-local
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
def run_GPT_RSA(stim_file, layer, header=False, filter_file=None):

    EXP = data.Stim(stim_file, header, filter_file, VOCAB_FILE)

    #Get tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl')

    #Load model
    model = GPT2LMHeadModel.from_pretrained(
        'gpt2-xl', output_hidden_states=True)  #, force_download=True)
    #inference only: clear any accumulated gradients
    model.zero_grad()

    for x in range(len(EXP.SENTS)):
        sentences = list(EXP.SENTS[x])
        target = sentences[0]
        sentence = sentences[1]

        #GET BASELINE
        target_encoded = tokenizer.encode(target,
                                          add_special_tokens=True,
                                          add_prefix_space=True)
        target_input_ids = torch.tensor(target_encoded).unsqueeze(0)

        #Get model outputs: (logits, past key/values, all hidden states)
        output = model(target_input_ids)
        predictions, mems, hidden_states = output

        #drop the embedding output so `layer` indexes transformer layers
        hidden_states = hidden_states[1:]

        baseline = hidden_states[layer][0][-1].data.cpu().squeeze()

        #GET SIMS
        sims = get_GPT_sims(sentence, layer, baseline, tokenizer, model)
        values = get_dummy_values(sentence)

        EXP.load_IT('gpt2', x, values, False, sims)

    return EXP
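
A hypothetical call; gpt2-xl exposes an embedding output plus 48 transformer layers, so after the slice above layer ranges over 0-47:

#hypothetical path; layer=47 takes the final transformer layer
exp = run_GPT_RSA('stimuli.csv', layer=47, header=True)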