def targs_to_idx(col_name):
    # This function builds the index to vocab (and its inverse) mapping
    values = set(rows[col_name].values)
    vocab = vocabulary.Vocabulary(counter=None, non_padded_namespaces=[col_name])
    for value in values:
        vocab.add_token_to_namespace(value, col_name)
    idx_to_word = vocab.get_index_to_token_vocabulary(col_name)
    word_to_idx = vocab.get_token_to_index_vocabulary(col_name)
    rows[col_name] = rows[col_name].apply(lambda x: [word_to_idx[x]] if x != "" else [])
    return word_to_idx, idx_to_word, rows[col_name]
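# A minimal usage sketch for targs_to_idx (not from the original file): it assumes that
# `rows` is a module-level pandas DataFrame (implied by the .values/.apply calls above) and
# that `vocabulary` refers to AllenNLP's allennlp.data.vocabulary module; the column name
# and values below are purely illustrative.
import pandas as pd
from allennlp.data import vocabulary  # assumed source of the Vocabulary class used above

rows = pd.DataFrame({"relation": ["per:spouse", "org:founded", "", "per:spouse"]})

word_to_idx, idx_to_word, encoded = targs_to_idx("relation")
print(word_to_idx)       # e.g. {'per:spouse': 0, 'org:founded': 1} (order depends on set iteration)
print(encoded.tolist())  # single-element index lists for labelled rows, [] for empty strings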
            if num_occurrences[i] == 0:
                continue
            token = vocab.get_token_from_index(i)
            to_dump = token + ' ' + ' '.join([str(v) for v in embeds[i, :]]) + '\n'
            f.write(to_dump)


if __name__ == '__main__':
    args = parse_config(args)

    if os.path.exists(args.out_dir):
        print("Output dir already exists: {}".format(args.out_dir))
        sys.exit(1)

    vocab = vocabulary.Vocabulary()
    vocab.set_from_file(args.vocab_file, oov_token='<UNK>')
    print("Loaded vocabulary of size {}".format(vocab.get_vocab_size()))

    anchors, norms, num_occurrences = run_elmo(args.txt_files, args.elmo_options_path,
                                               args.elmo_weights_path, vocab, args.layers,
                                               args.batch_size, args.cuda_device)

    os.makedirs(args.out_dir, exist_ok=True)
    norm_dict = {}
    print('Saving outputs to {}'.format(args.out_dir))
    for l in tqdm(args.layers):
        norm_key = 'avg_norm_layer_{}'.format(l)
        norm_dict[norm_key] = norms[l]
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--lang", default=None, type=str, required=True)
    parser.add_argument("--input_file", default=None, type=str, required=True)
    parser.add_argument("--out_dir", default=None, type=str, required=True)
    parser.add_argument(
        "--bert_model", default=None, type=str, required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")

    ## Other parameters
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--layers", default="0", type=str)
    parser.add_argument(
        "--max_seq_length", default=128, type=int,
        help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
        "than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--batch_size", default=32, type=int,
                        help="Batch size for predictions.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('-d', '--emb_dim', type=int, default=1024, help="Embeddings size")
    parser.add_argument(
        '--vocab_file', type=str, default='vocabs/en_50k.vocab',
        help="Path to vocab file with tokens (one per line) to include in output. "
        "Should also include <UNK> token. Can use $l as a placeholder for language")

    args = parser.parse_args()
    lang = args.lang

    tokenizer = AutoTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    vocab = vocabulary.Vocabulary()
    vocab.set_from_file(args.vocab_file, oov_token=tokenizer.unk_token)
    print("Loaded vocabulary of size {}".format(vocab.get_vocab_size()))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {} distributed training: {}".format(
        device, n_gpu, bool(args.local_rank != -1)))

    layer_indexes = [int(x) for x in args.layers.split(",")]

    examples = read_examples(args.input_file)
    features = convert_examples_to_features(examples=examples,
                                            seq_length=args.max_seq_length,
                                            tokenizer=tokenizer,
                                            lang=lang)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    model = AutoModel.from_pretrained(args.bert_model)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

    num_occurrences = [0] * vocab.get_vocab_size()
    anchors = {}
    norms = {}
    total_words = 0
    for l in layer_indexes:
        norms[l] = 0.0
        anchors[l] = np.zeros(shape=(vocab.get_vocab_size(), args.emb_dim))
    oov_ind = vocab.get_token_index(vocab._oov_token)

    model.eval()
    for input_ids, input_mask, example_indices in tqdm(eval_dataloader):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)

        # Collect hidden states from every layer for this batch; index 0 is the embedding
        # output and index k the output of the k-th transformer layer.
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=input_mask, output_hidden_states=True)
        all_encoder_layers = outputs.hidden_states

        for b, example_index in enumerate(example_indices):
            feature = features[example_index.item()]
            for (i, token) in enumerate(feature.tokens):
                w_id = vocab.get_token_index(token)
                if w_id == oov_ind:
                    continue
                n = num_occurrences[w_id]
                for (j, layer_index) in enumerate(layer_indexes):
                    layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
                    layer_output = layer_output[b]
                    l = layer_index
                    values = layer_output[i]
                    # Running mean over all occurrences of this token: the per-layer anchor.
                    anchors[l][w_id, :] = anchors[l][w_id, :] * (n / (n + 1)) + values[:] / (n + 1)
                    # Running mean of the embedding norm over all tokens seen so far.
                    norm = np.linalg.norm(values[:])
                    norms[l] = norms[l] * (total_words / (total_words + 1)) + norm / (total_words + 1)
                total_words += 1
                num_occurrences[w_id] += 1

    os.makedirs(args.out_dir, exist_ok=True)
    norm_dict = {}
    print('Saving outputs to {}'.format(args.out_dir))
    for l in tqdm(layer_indexes):
        norm_key = 'avg_norm_layer_{}'.format(l)
        norm_dict[norm_key] = norms[l]
        file_path = os.path.join(args.out_dir, 'avg_embeds_{}.txt'.format(l))
        save_embeds(file_path, anchors[l], vocab, num_occurrences, args.emb_dim)

    norm_dict['occurrences'] = num_occurrences
    file_path = os.path.join(args.out_dir, 'norms.json')
    with open(file_path, 'w') as f:
        json.dump(norm_dict, f)
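# A small follow-up sketch (hypothetical helper, not part of the scripts above) showing one
# way to read the per-layer anchor files back in. It assumes the plain "token v1 v2 ... vD"
# line format written by save_embeds; load_anchor_embeddings and the out_dir/avg_embeds_0.txt
# path are illustrative names only.
import numpy as np

def load_anchor_embeddings(path):
    # Each line holds a token followed by its averaged embedding values, space-separated.
    embeds = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip('\n').split(' ')
            embeds[parts[0]] = np.array([float(v) for v in parts[1:]], dtype=np.float32)
    return embeds

layer0_anchors = load_anchor_embeddings('out_dir/avg_embeds_0.txt')
print(len(layer0_anchors))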