import logging
import os
import sys
import time

# Project-specific imports; the module paths below are assumptions -- adjust
# them to wherever readCoNLL and ELMoWordEmbeddings live in your codebase.
from util.preprocessing import readCoNLL
from neuralnets.ELMoWordEmbeddings import ELMoWordEmbeddings


def create_cache(args):
    """Precompute ELMo embeddings for a dataset's splits and store them as a pickle cache."""
    datasetName = args.datasetName
    tokenColId = args.tokenColumnId
    cudaDevice = args.cuda_device
    elmo_options_file = args.elmo_options
    elmo_weight_file = args.elmo_weights
    # Example pretrained model paths:
    #elmo_options_file = 'pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
    #elmo_weight_file = 'pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
    #elmo_options_file = 'pretrained/velmo_options.json'
    #elmo_weight_file = 'pretrained/velmo_weights.hdf5'

    # :: Logging level ::
    loggingLevel = logging.INFO
    logger = logging.getLogger()
    logger.setLevel(loggingLevel)

    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(loggingLevel)
    formatter = logging.Formatter('%(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    # Only the token column is needed to build the embedding cache
    commentSymbol = None
    columns = {tokenColId: 'tokens'}

    #picklePath = "embeddings/elmo_cache_" + datasetName + ".pkl"
    #picklePath = "embeddings/velmo_cache_conll2000_data_perturbed_03.pkl"
    #picklePath = "embeddings/velmo_cache_conll2000_data_clean.pkl"
    picklePath = args.pkl_path

    embLookup = ELMoWordEmbeddings(None, elmo_options_file, elmo_weight_file, elmo_cuda_device=cudaDevice)

    print("ELMo Cache Generation")
    print("Output file:", picklePath)
    print("CUDA Device:", cudaDevice)

    # Embed every sentence from each split and add it to the in-memory cache
    splitFiles = ['train.txt', 'dev.txt', 'test.txt']
    for splitFile in splitFiles:
        inputPath = os.path.join('data', datasetName, splitFile)
        print("Adding file to cache: " + inputPath)
        sentences = readCoNLL(inputPath, columns, commentSymbol)
        tokens = [sentence['tokens'] for sentence in sentences]

        start_time = time.time()
        embLookup.addToCache(tokens)
        end_time = time.time()
        print("%s processed in %.1f seconds" % (splitFile, end_time - start_time))
        print("\n---\n")

    # Persist the cache to disk once all splits are processed
    print("Store file at:", picklePath)
    embLookup.storeCache(picklePath)
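
# A minimal command-line driver for create_cache(). This block is a sketch,
# not part of the original script: the argument names match the attributes
# the function reads (args.datasetName, args.tokenColumnId, args.cuda_device,
# args.elmo_options, args.elmo_weights, args.pkl_path), but the defaults and
# help texts are assumptions and may need adjusting.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Precompute an ELMo embedding cache for a dataset.')
    parser.add_argument('datasetName', help='Dataset directory under data/')
    parser.add_argument('tokenColumnId', type=int, help='Index of the token column in the CoNLL files')
    parser.add_argument('pkl_path', help='Output path for the pickled embedding cache')
    parser.add_argument('--cuda_device', type=int, default=-1, help='CUDA device id (-1 for CPU)')
    parser.add_argument('--elmo_options', required=True, help='Path to the ELMo options JSON file')
    parser.add_argument('--elmo_weights', required=True, help='Path to the ELMo weights HDF5 file')

    create_cache(parser.parse_args())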