def __init__(self, path, tables=None, enable_traces=True):
    """
    Open a connection to the database file at `path`.

    The parent directory of `path` is created first when `path` has a
    directory component, the requested tables are created, and
    `self.close` is registered to run at interpreter exit.

    Arguments:
        path (str): The path to the database file.
        tables (dictionary of {str: tuple of str}, optional): A
            dictionary of {name: schema} pairs, where a schema is a list
            of tuple pairs, of the form: (name, type). Defaults to no
            tables.
        enable_traces (bool, optional): Enable traces for user defined
            functions and aggregates.
    """
    self.path = fs.path(path)

    # Create directory if needed. A bare filename has no parent directory.
    parent_dir = fs.dirname(path)
    if parent_dir:
        fs.mkdir(parent_dir)

    self.connection = sql.connect(self.path)

    # `tables` defaults to None instead of a shared mutable `{}`; treat a
    # missing value as "no tables".
    for name, schema in six.iteritems(tables or {}):
        self.create_table(name, schema)

    io.debug("Opened connection to '{0}'".format(self.path))

    # Register exit handler
    atexit.register(self.close)

    # Enable traces for user defined functions and aggregates. See:
    #
    #   https://docs.python.org/2/library/sqlite3.html#sqlite3.enable_callback_tracebacks
    if enable_traces:
        sql.enable_callback_tracebacks(True)
def unpack_archive(*components, **kwargs):
    """
    Unpack a compressed archive into the directory that contains it.

    Arguments:
        *components (str[]): Absolute path.
        compression (str, optional): Archive compression type.
            Default: bz2.
    """
    path = fs.path(*components)
    compression = kwargs.get("compression", "bz2")

    # extract tar relative to its directory
    fs.cd(fs.dirname(path))
    try:
        # NOTE(review): extractall() writes whatever member paths the
        # archive contains; only use this on trusted archives.
        with tarfile.open(path, "r:" + compression) as tar:
            tar.extractall()
    finally:
        # Undo the fs.cd() above even when opening or extracting fails.
        fs.cdpop()
def unpack_archive(*components, **kwargs) -> str:
    """
    Unpack a compressed archive.

    Arguments:
        *components (str[]): Absolute path.
        **kwargs (dict, optional): Set "compression" to compression type.
            Default: bz2. Set "dir" to destination directory. Defaults to
            the directory of the archive.

    Returns:
        str: Path to directory.
    """
    path = fs.abspath(*components)
    compression = kwargs.get("compression", "bz2")
    # Local is named `dest_dir` so the builtin `dir` is not shadowed; the
    # keyword argument itself is still called "dir".
    dest_dir = kwargs.get("dir", fs.dirname(path))

    fs.cd(dest_dir)
    try:
        # NOTE(review): extractall() writes whatever member paths the
        # archive contains; only use this on trusted archives.
        with tarfile.open(path, "r:" + compression) as tar:
            tar.extractall()
    finally:
        # Undo the fs.cd() above even when opening or extracting fails.
        fs.cdpop()

    return dest_dir
def evaluate(model, embeddings, folder_data, samples_per_class, folder_results, dense_layer_size, print_summary,
             num_epochs, batch_size):
    """
    Train (or restore) a classifier and compare its test predictions to the labels.

    File names ending in '.rec' are collected per class from the folders
    `folder_data + '_train'`, `'_val'` and `'_test'`, each of which must
    contain one sub-folder per class named '1' .. '104'. Predictions and
    trained models are cached under `folder_results`; when a cached
    predictions file exists it is loaded and no model is built.

    Arguments:
        model: Object exposing `__name__`, `init`, `restore`, `save`,
            `train_gen`, `predict_gen` and `model.summary()`.
        embeddings: Embedding matrix, passed to `tf.nn.l2_normalize(axis=1)`.
        folder_data (str): Common prefix of the train/val/test folders.
        samples_per_class (int): Number of training files sampled per class.
        folder_results (str): Root folder for cached models and predictions.
        dense_layer_size: Forwarded to `model.init`.
        print_summary (bool): Print the model summary after initialization.
        num_epochs (int): Forwarded to `model.train_gen`.
        batch_size (int): Batch size used by the data generators.

    Returns:
        The result of `p == y_test`, where `p` holds the (cached or fresh)
        predictions and `y_test` the test labels.

    Also reads `FLAGS.vsamples` and `FLAGS.vocabulary_dir`.
    """
    # Set seed for reproducibility
    seed = 204

    # ------------------------------------------------------------------
    # Get data
    vsamples_per_class = FLAGS.vsamples

    # Data acquisition
    num_classes = 104
    y_train = np.empty(0)  # training
    X_train = list()
    folder_data_train = folder_data + '_train'
    y_val = np.empty(0)  # validation
    X_val = list()
    folder_data_val = folder_data + '_val'
    y_test = np.empty(0)  # testing
    X_test = list()
    folder_data_test = folder_data + '_test'
    print('Getting file names for', num_classes, 'classes from folders:')
    print(folder_data_train)
    print(folder_data_val)
    print(folder_data_test)
    for i in range(1, num_classes + 1):  # loop over classes

        # training: Read data file names
        folder = os.path.join(folder_data_train, str(i))
        assert os.path.exists(folder), "Folder: " + folder + ' does not exist'
        print('\ttraining : Read file names from folder ', folder)
        listing = os.listdir(folder + '/')
        seq_files = [os.path.join(folder, f) for f in listing if f[-4:] == '.rec']

        # training: Randomly pick programs
        assert len(seq_files) >= samples_per_class, "Cannot sample " + str(samples_per_class) + " from " + str(
            len(seq_files)) + " files found in " + folder
        X_train += resample(seq_files, replace=False, n_samples=samples_per_class, random_state=seed)
        # The class index i is used as the label of every sampled file.
        y_train = np.concatenate([y_train, np.array([int(i)] * samples_per_class, dtype=np.int32)])

        # validation: Read data file names
        folder = os.path.join(folder_data_val, str(i))
        assert os.path.exists(folder), "Folder: " + folder + ' does not exist'
        print('\tvalidation: Read file names from folder ', folder)
        listing = os.listdir(folder + '/')
        seq_files = [os.path.join(folder, f) for f in listing if f[-4:] == '.rec']

        # validation: Randomly pick programs
        # (a non-positive FLAGS.vsamples means "use every validation file")
        if vsamples_per_class > 0:
            assert len(seq_files) >= vsamples_per_class, "Cannot sample " + str(vsamples_per_class) + " from " + str(
                len(seq_files)) + " files found in " + folder
            X_val += resample(seq_files, replace=False, n_samples=vsamples_per_class, random_state=seed)
            y_val = np.concatenate([y_val, np.array([int(i)] * vsamples_per_class, dtype=np.int32)])
        else:
            assert len(seq_files) > 0, "No .rec files found in" + folder
            X_val += seq_files
            y_val = np.concatenate([y_val, np.array([int(i)] * len(seq_files), dtype=np.int32)])

        # test: Read data file names (all files are used, no sampling)
        folder = os.path.join(folder_data_test, str(i))
        assert os.path.exists(folder), "Folder: " + folder + ' does not exist'
        print('\ttest : Read file names from folder ', folder)
        listing = os.listdir(folder + '/')
        seq_files = [os.path.join(folder, f) for f in listing if f[-4:] == '.rec']
        assert len(seq_files) > 0, "No .rec files found in" + folder
        X_test += seq_files
        y_test = np.concatenate([y_test, np.array([int(i)] * len(seq_files), dtype=np.int32)])

    # Load dictionary and cutoff statements
    folder_vocabulary = FLAGS.vocabulary_dir
    dictionary_pickle = os.path.join(folder_vocabulary, 'dic_pickle')
    print('\tLoading dictionary from file', dictionary_pickle)
    with open(dictionary_pickle, 'rb') as f:
        dictionary = pickle.load(f)
    # Only the index of the unknown token is needed; drop the dictionary afterwards.
    unk_index = dictionary[rgx.unknown_token]
    del dictionary

    # Encode source codes and get max. sequence length
    X_seq_train, maxlen_train = encode_srcs(X_train, 'training', unk_index)
    X_seq_val, maxlen_val = encode_srcs(X_val, 'validation', unk_index)
    X_seq_test, maxlen_test = encode_srcs(X_test, 'testing', unk_index)
    maxlen = max(maxlen_train, maxlen_test, maxlen_val)
    print('Max. sequence length overall:', maxlen)
    print('Padding sequences')
    # All three splits are padded to the same length; unk_index is passed as the third argument.
    X_seq_train = pad_src(X_seq_train, maxlen, unk_index)
    X_seq_val = pad_src(X_seq_val, maxlen, unk_index)
    X_seq_test = pad_src(X_seq_test, maxlen, unk_index)

    # Get one-hot vectors for classification
    print('YTRAIN\n', y_train)
    y_1hot_train = get_onehot(y_train, num_classes)
    y_1hot_val = get_onehot(y_val, num_classes)

    # ------------------------------------------------------------------
    # Setup paths

    # Set up names paths
    model_name = model.__name__
    model_path = os.path.join(folder_results, "classifyapp/models/{}.model".format(model_name))
    predictions_path = os.path.join(folder_results, "classifyapp/predictions/{}.result".format(model_name))

    # If predictions have already been made with these embeddings, load them
    # NOTE(review): the cache is keyed by model name only, and the file is
    # unpickled as-is; make sure folder_results is trusted.
    if fs.exists(predictions_path):
        print("\tFound predictions in", predictions_path, ", skipping...")
        with open(predictions_path, 'rb') as infile:
            p = pickle.load(infile)

    else:  # could not find predictions already computed with these embeddings

        # Embeddings
        import tensorflow as tf  # for embeddings lookup
        embedding_matrix_normalized = tf.nn.l2_normalize(embeddings, axis=1)
        vocabulary_size, embedding_dimension = embedding_matrix_normalized.shape
        print('XSEQ:\n', X_seq_train)
        print('EMB:\n', embedding_matrix_normalized)

        gen_test = EmbeddingPredictionSequence(batch_size, X_seq_test, embedding_matrix_normalized)

        # If models have already been made with these embeddings, load them
        if fs.exists(model_path):
            print("\n\tFound trained model in", model_path, ", skipping...")
            model.restore(model_path)

        else:  # could not find models already computed with these embeddings

            gen_train = EmbeddingSequence(batch_size, X_seq_train, y_1hot_train, embedding_matrix_normalized)
            gen_val = EmbeddingSequence(batch_size, X_seq_val, y_1hot_val, embedding_matrix_normalized)

            # ----------------------------------------------------------
            # Train

            # Create a new model and train it
            print('\n--- Initializing model...')
            model.init(seed=seed,
                       maxlen=maxlen,
                       embedding_dim=int(embedding_dimension),
                       num_classes=num_classes,
                       dense_layer_size=dense_layer_size)
            if print_summary:
                model.model.summary()
            print('\n--- Training model...')
            model.train_gen(train_generator=gen_train, validation_generator=gen_val, verbose=True, epochs=num_epochs)

            # Save the model
            fs.mkdir(fs.dirname(model_path))
            model.save(model_path)
            print('\tsaved model to', model_path)

        # --------------------------------------------------------------
        # Test

        # Test model
        print('\n--- Testing model...')
        p = model.predict_gen(generator=gen_test)[0]

        # cache the prediction
        fs.mkdir(fs.dirname(predictions_path))
        with open(predictions_path, 'wb') as outfile:
            pickle.dump(p, outfile)
        print('\tsaved predictions to', predictions_path)

    # ------------------------------------------------------------------
    # Return accuracy
    accuracy = p == y_test  # prediction accuracy
    return accuracy
def test_dirname(self):
    """Check fs.dirname() on a bare file name and on a nested path."""
    # (expected directory, input path)
    cases = (
        ("", "foo"),
        ("/tmp", "/tmp/labm8.tmp"),
    )
    for expected, path in cases:
        self._test(expected, fs.dirname(path))
def test_dirname():
    """Check fs.dirname() on a bare file name and on a nested path."""
    bare_name_dir = fs.dirname("foo")
    assert bare_name_dir == ""

    nested_dir = fs.dirname("/tmp/labm8.tmp")
    assert nested_dir == "/tmp"
def test_must_exist():
    """fs.must_exist() returns the path of an existing file and raises fs.File404 otherwise."""
    with tempfile.NamedTemporaryFile(prefix='labm8_') as tmp:
        tmp_path = tmp.name
        # Whole path given as a single component.
        assert fs.must_exist(tmp_path) == tmp_path
        # Same path given as (directory, file name) components.
        parent, leaf = fs.dirname(tmp_path), fs.basename(tmp_path)
        assert fs.must_exist(parent, leaf) == tmp_path
    with pytest.raises(fs.File404):
        fs.must_exist("/not/a/real/path")
def inline_fs_headers(path: Path, stack: List[str],
                      lang: clgen.Language = clgen.Language.OPENCL,
                      topdir: Path = None) -> str:
    """
    Recursively inline headers in file.

    Each line matching the include regexp for `lang` is resolved by
    searching `topdir` (with the `find` command) for files with the same
    base name. A resolved include is replaced by the recursively inlined
    contents of that file, wrapped in `[FETCH]` marker comments.

    Parameters
    ----------
    path : str
        File.
    stack : List[str]
        File stack. Mutated in place: every visited file is appended and
        never removed, so a file is inlined at most once per stack.
    lang : clgen.Language
        Language used to pick the include regexp and the comment format.
    topdir : Path
        The top level directory to stop searching for includes in.
        Defaults to the directory of `path`.

    Returns
    -------
    str
        Inlined file.
    """
    stack.append(path)

    if topdir is None:
        topdir = fs.dirname(path)

    include_re = clgen.include_regexp(lang)

    with open(path, encoding="utf-8") as infile:
        src = infile.read()

    outlines = []
    for line in src.split('\n'):
        match = re.match(include_re, line)
        if not match:
            outlines.append(line)
            continue

        # We have an import to inline!
        include = match.group("path")

        # Search for files with that name in the repository. The command
        # is passed as an argument list (no shell) so that directory and
        # file names containing spaces, quotes or shell metacharacters are
        # treated as plain text.
        include_basename = fs.basename(include)
        find_output = subprocess.check_output(
            ['find', topdir, '-type', 'f', '-name', include_basename],
            universal_newlines=True)
        candidates = [found for found in find_output.split('\n') if found]

        # Select which file to inline:
        if len(candidates) == 1:
            # If there's exactly one match, then we're done:
            file_to_inline = candidates[0]
        elif len(candidates) > 1:
            # We have multiple candidates to inline, so we'll compare the
            # full paths (relative to the top directory) to select the one
            # whose name is the closest match:
            rel_matches = [
                candidate[len(topdir) + 1:] for candidate in candidates
            ]
            distances = [
                editdistance.eval(include, rel_match)
                for rel_match in rel_matches
            ]
            min_distance = min(distances)
            file_to_inline = candidates[distances.index(min_distance)]
            log.debug(
                f"Inferred include '{file_to_inline}' from '{line}' with distance {min_distance}"
            )
        else:
            # We didn't find anything suitable:
            file_to_inline = None

        # Process the inline file:
        if file_to_inline in stack:
            # We've already inlined this file, so ignore it:
            outlines.append(
                clgen.format_as_comment(
                    lang, f'[FETCH] ignored_include({line})'))
        elif file_to_inline:
            # Inline the file by recursively expanding its contents. The
            # language and top directory are passed down so nested includes
            # are handled the same way as the top-level file.
            outlines.append(
                clgen.format_as_comment(lang,
                                        f'[FETCH] begin_include({line})'))
            inline_src = inline_fs_headers(file_to_inline, stack,
                                           lang=lang, topdir=topdir)
            outlines.append(inline_src)
            outlines.append(
                clgen.format_as_comment(lang,
                                        f'[FETCH] end_include({line})'))
        else:
            # We didn't find anything suitable, so keep the original
            # include:
            outlines.append(
                clgen.format_as_comment(lang,
                                        f'[FETCH] not_found({line})'))
            outlines.append(line)

    return '\n'.join(outlines)
def write_file(path, contents):
    """
    Write `contents` to the text file at `path`.

    `fs.mkdir` is called on the directory component of `path` first.

    Arguments:
        path (str): Destination file path.
        contents (str): Text to write; any existing file is overwritten.
    """
    parent_dir = fs.dirname(path)
    # A bare file name has an empty directory component; there is nothing
    # to create in that case.
    if parent_dir:
        fs.mkdir(parent_dir)
    with open(path, 'w') as outfile:
        outfile.write(contents)
def evaluate(model, device, data_folder, out_folder, embeddings, dense_layer_size, print_summary, num_epochs,
             batch_size):
    """
    Evaluate `model` on coarsening-factor prediction with leave-one-out cross-validation.

    For each platform, the oracle and runtime CSV files are read from
    `data_folder`, the source sequences are embedded, and one model per
    cross-validation fold is trained (or restored from `out_folder`).
    Predictions are cached per fold under `out_folder`.

    Arguments:
        model: Object exposing `__name__`, `__basename__`, `init`,
            `restore`, `save`, `train`, `predict` and `model.summary()`.
        device (str): 'all' for the four built-in platforms, otherwise a
            single platform name used in the CSV column names.
        data_folder (str): Folder containing "pact-2014-oracles.csv" and
            "pact-2014-runtimes.csv".
        out_folder (str): Root folder for cached models and predictions.
        embeddings: Embedding matrix, passed to `tf.nn.l2_normalize(axis=1)`.
        dense_layer_size: Forwarded to `model.init`.
        print_summary (bool): Print the model summary after initialization.
        num_epochs (int): Forwarded to `model.train`.
        batch_size (int): Forwarded to `model.train` and `model.predict`.

    Returns:
        pd.DataFrame: One row per (platform, fold) with the columns
        "Model", "Platform", "Kernel", "Oracle-CF", "Predicted-CF",
        "Speedup" and "Oracle".

    NOTE(review): `seed` is not defined in this function; presumably a
    module-level global — confirm.
    """
    data = []

    # Create device list
    if device == 'all':
        device_list = ["Cypress", "Tahiti", "Fermi", "Kepler"]
    else:
        device_list = [device]

    for i, platform in enumerate(device_list):
        print(
            '\n------------------------------------------------------------------'
        )
        # NOTE(review): the total is hard-coded to 4 even when a single
        # device was requested.
        print('--- Platform', platform, '[', i + 1, '/ 4 ]')
        print(
            '------------------------------------------------------------------'
        )
        platform_name = platform2str(platform)

        # Read data
        oracle_file = os.path.join(data_folder, "pact-2014-oracles.csv")
        oracles = pd.read_csv(oracle_file)
        runtimes_file = os.path.join(data_folder, "pact-2014-runtimes.csv")
        df = pd.read_csv(runtimes_file)
        print('\tRead data from', oracle_file, '\n\tand', runtimes_file)

        # Extract data
        oracle_runtimes = np.array(
            [float(x) for x in oracles["runtime_" + platform]])
        y = np.array([int(x) for x in oracles["cf_" + platform]],
                     dtype=np.int32)
        y_1hot = get_onehot(oracles, platform)

        # Encode source codes
        X_seq, maxlen = encode_srcs(data_folder, df)

        # Embeddings
        import tensorflow as tf  # for embeddings lookup
        embedding_matrix_normalized = tf.nn.l2_normalize(embeddings, axis=1)
        vocabulary_size, embedding_dimension = embedding_matrix_normalized.shape
        seq_ = tf.placeholder(dtype=tf.int32)

        # Tensor of shape (num_input_files, sequence length, embbedding dimension)
        embedding_input_ = tf.nn.embedding_lookup(embedding_matrix_normalized,
                                                  seq_)

        # Make tf block less gpu memory
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        # Materialize the embedded sequences once per platform.
        with tf.Session(config=config) as sess:
            embedding_input = sess.run(embedding_input_,
                                       feed_dict={seq_: X_seq})

        # Leave-one-out cross-validation (one fold per entry of y)
        kf = KFold(n_splits=len(y), shuffle=False)

        for j, (train_index, test_index) in enumerate(kf.split(y)):
            print('--- Cross validation step [', j + 1, '/ ', len(y), ']')

            # presumably the rows of `oracles` follow the sorted kernel
            # names of `df`; verify against the data files
            kernel = sorted(set(df["kernel"]))[test_index[0]]
            X_cc, y_cc = get_magni_features(df, oracles, platform)

            model_name = model.__name__
            model_basename = model.__basename__

            # Cache locations are unique per model, platform and fold.
            model_path = os.path.join(
                out_folder,
                "models/{model_basename}-{platform}-{j}.model".format(
                    model_basename=model_basename, platform=platform, j=j))
            predictions_path = os.path.join(
                out_folder,
                "predictions/{model_basename}-{platform}-{j}.result".format(
                    model_basename=model_basename, platform=platform, j=j))

            if fs.exists(predictions_path):
                # load result from cache
                print("\tFound predictions in", predictions_path,
                      ", skipping...")
                with open(predictions_path, 'rb') as infile:
                    p = pickle.load(infile)
            else:
                if fs.exists(model_path):
                    # load a trained model from cache
                    print("\n\tFound trained model in", model_path,
                          ", skipping...")
                    model.restore(model_path)
                else:
                    # Initialize model and print summary
                    print('\n--- Training model...')
                    model.init(seed, maxlen, int(embedding_dimension),
                               dense_layer_size)
                    if print_summary:
                        model.model.summary()

                    # Train and cache a model
                    model.train(sequences=embedding_input[train_index, :, :],
                                verbose=True,
                                y_1hot=y_1hot[train_index],
                                epochs=num_epochs,
                                batch_size=batch_size)

                    # cache the model
                    fs.mkdir(fs.dirname(model_path))
                    model.save(model_path)
                    print('\tsaved model to', model_path)

                # test model
                print('\n--- Testing model...')
                p = model.predict(sequences=embedding_input[test_index, :, :],
                                  batch_size=batch_size)[0]

                # The runtimes of some coarsening factors are not recorded in the data table. If that is the case for
                # the predicted cf, clamp it down to the highest cf for which the runtime is recorded
                p = min(p, 2**(len(X_cc[test_index[0]]) - 1))

                # cache the prediction
                fs.mkdir(fs.dirname(predictions_path))
                with open(predictions_path, 'wb') as outfile:
                    pickle.dump(p, outfile)
                print('\tsaved predictions to', predictions_path)

            o = y[test_index[0]]  # oracle prediction (true value)
            correct = p == o  # predictions' correctness

            # get runtime without thread coarsening
            row = df[(df["kernel"] == kernel) & (df["cf"] == 1)]
            assert (len(row) == 1)  # sanity check
            nocf_runtime = float(row["runtime_" + platform])

            # get runtime of prediction
            row = df[(df["kernel"] == kernel) & (df["cf"] == p)]
            assert (len(row) == 1)  # sanity check
            p_runtime = float(row["runtime_" + platform])

            # get runtime of oracle coarsening factor
            o_runtime = oracle_runtimes[test_index[0]]

            # speedup and % oracle
            s_oracle = nocf_runtime / o_runtime
            p_speedup = nocf_runtime / p_runtime
            p_oracle = o_runtime / p_runtime

            # record result
            data.append({
                "Model": model_name,
                "Platform": platform_name,
                "Kernel": kernel,
                "Oracle-CF": o,
                "Predicted-CF": p,
                "Speedup": p_speedup,
                "Oracle": p_oracle
            })

    return pd.DataFrame(data,
                        columns=[
                            "Model", "Platform", "Kernel", "Oracle-CF",
                            "Predicted-CF", "Speedup", "Oracle"
                        ])
def evaluate(model, device, data_folder, out_folder, embeddings, dense_layer_size, print_summary, num_epochs,
             batch_size) -> pd.DataFrame:
    """
    Evaluate `model` on CPU/GPU device mapping with stratified 10-fold cross-validation.

    For each platform, "cgo17-<platform>.csv" is read from `data_folder`,
    the source sequences are embedded, and one model per fold is trained
    (or restored from `out_folder`). Predictions are cached per fold
    under `out_folder`.

    Arguments:
        model: Object exposing `__name__`, `__basename__`, `init`,
            `restore`, `save`, `train`, `predict` and `model.summary()`.
        device (str): 'all' for both "amd" and "nvidia", otherwise a single
            platform name.
        data_folder (str): Folder containing the per-platform CSV files.
        out_folder (str): Root folder for cached models, predictions and logs.
        embeddings: Embedding matrix, passed to `tf.nn.l2_normalize(axis=1)`.
        dense_layer_size: Forwarded to `model.init`.
        print_summary (bool): Print the model summary after initialization.
        num_epochs (int): Forwarded to `model.train`.
        batch_size (int): Forwarded to `model.train` and `model.predict`.

    Returns:
        pd.DataFrame: One row per test benchmark and fold, indexed from 1,
        with the columns "Model", "Platform", "Benchmark",
        "Benchmark Suite", "Oracle Mapping", "Predicted Mapping",
        "Correct?" and "Speedup".

    NOTE(review): `seed` is not defined in this function; presumably a
    module-level global — confirm.
    """
    from sklearn.model_selection import StratifiedKFold

    # Create device list
    if device == 'all':
        device_list = ["amd", "nvidia"]
    else:
        device_list = [device]

    data = []
    for i, platform in enumerate(device_list):
        platform_name = platform2str(platform)

        # Load runtime data
        data_file = os.path.join(data_folder, "cgo17-{}.csv".format(platform))
        print('\n--- Read data from', data_file)
        df = pd.read_csv(data_file)

        # Encode input source codes
        sequences, maxlen = encode_srcs(data_folder, df)

        # Load embeddings
        import tensorflow as tf  # for embeddings lookup
        embedding_matrix_normalized = tf.nn.l2_normalize(embeddings, axis=1)
        vocabulary_size, embedding_dimension = embedding_matrix_normalized.shape
        seq_ = tf.compat.v1.placeholder(dtype=tf.int32)

        # Tensor of shape (num_input_files, sequence length, embbedding dimension)
        embedding_input_ = tf.compat.v1.nn.embedding_lookup(
            params=embedding_matrix_normalized, ids=seq_)

        # Make tf block less gpu memory
        config = tf.compat.v1.ConfigProto()
        config.gpu_options.allow_growth = True

        # Materialize the embedded sequences once per platform.
        with tf.compat.v1.Session(config=config) as sess:
            embedding_input = sess.run(embedding_input_,
                                       feed_dict={seq_: sequences})

        # Values used for training & predictions
        aux_in = auxiliary_inputs(df)

        # Optimal mappings: 1 where the "oracle" column is "GPU", else 0
        y = np.array([1 if x == "GPU" else 0 for x in df["oracle"].values])
        y_1hot = encode_1hot(y)

        # 10-fold cross-validation
        n_splits = 10
        kf = StratifiedKFold(n_splits=n_splits,
                             shuffle=True,
                             random_state=seed)
        for j, (train_index, test_index) in enumerate(kf.split(sequences, y)):
            # NOTE(review): j is zero-based here, so the printed step runs
            # from 0 to n_splits - 1.
            print('--- Cross validation step [', j, '/ ', n_splits, ']')

            model_name = model.__name__
            model_basename = model.__basename__

            # Cache locations are unique per model, platform and fold.
            model_path = os.path.join(
                out_folder,
                "models/{model_basename}-{platform}-{j}.model".format(
                    model_basename=model_basename, platform=platform, j=j))
            predictions_path = os.path.join(
                out_folder,
                "predictions/{model_basename}-{platform}-{j}.result".format(
                    model_basename=model_basename, platform=platform, j=j))
            log_dir = os.path.join(out_folder, "logs")

            if fs.exists(predictions_path):
                # load result from cache
                print("\tFound predictions in", predictions_path,
                      ", skipping...")
                with open(predictions_path, 'rb') as infile:
                    p = pickle.load(infile)
            else:
                if fs.exists(model_path):
                    # restore trained model from cache
                    print("\n\tFound trained model in", model_path,
                          ", skipping...")
                    model.restore(model_path)
                else:
                    # Initialize model and print summary
                    model.init(seed=seed,
                               maxlen=maxlen,
                               embedding_dim=int(embedding_dimension),
                               dense_layer_size=dense_layer_size)
                    if print_summary:
                        model.model.summary()

                    # Train and cache a model
                    print('\n--- Training model... ')
                    model.train(df=df,
                                aux_in=aux_in[train_index],
                                sequences=embedding_input[train_index, :, :],
                                y=y[train_index],
                                y_1hot=y_1hot[train_index],
                                verbose=False,
                                epochs=num_epochs,
                                batch_size=batch_size,
                                log_dir=log_dir)
                    fs.mkdir(fs.dirname(model_path))
                    model.save(model_path)
                    print('\tsaved model to', model_path)

                # test model
                print('\n--- Testing model... ')
                p = model.predict(batch_size=batch_size,
                                  aux_in=aux_in[test_index],
                                  sequences=embedding_input[test_index, :, :],
                                  y=y[test_index],
                                  y_1hot=y_1hot[test_index],
                                  verbose=False)

                # cache results
                fs.mkdir(fs.dirname(predictions_path))
                with open(predictions_path, 'wb') as outfile:
                    pickle.dump(p, outfile)
                print('\tsaved predictions to', predictions_path)

            benchmarks = df['benchmark'].values[test_index]  # benchmarks names
            o = y[test_index]  # oracle device mappings (true values)
            correct = p == o  # predictions' correctness

            # runtimes of baseline mapping (CPU on AMD, GPU on NVIDIA)
            zero_r_dev = "runtime_cpu" if platform == "amd" else "runtime_gpu"
            zer_r_runtimes = df[zero_r_dev][test_index]

            # speedups of predictions: each prediction (0 or 1) indexes the
            # [runtime_cpu, runtime_gpu] pair of its benchmark
            runtimes = df[['runtime_cpu', 'runtime_gpu']].values[test_index]
            p_runtimes = [r[p_] for p_, r in zip(p, runtimes)]
            p_speedup = zer_r_runtimes / p_runtimes

            # sanity check
            assert (len(benchmarks) == len(o) == len(correct) == len(p) ==
                    len(p_speedup))

            # record results
            for benchmark_, o_, p_, correct_, p_speedup_ in zip(
                    benchmarks, o, p, correct, p_speedup):
                data.append({
                    "Model": model_basename,
                    "Platform": platform_name,
                    'Benchmark': escape_benchmark_name(benchmark_),
                    'Benchmark Suite': escape_suite_name(benchmark_),
                    "Oracle Mapping": o_,
                    "Predicted Mapping": p_,
                    "Correct?": correct_,
                    "Speedup": p_speedup_,
                })

    return pd.DataFrame(data,
                        index=range(1,
                                    len(data) + 1),
                        columns=[
                            "Model", "Platform", "Benchmark",
                            "Benchmark Suite", "Oracle Mapping",
                            "Predicted Mapping", "Correct?", "Speedup"
                        ])
def write_file(path: str, contents: str) -> None:
    """
    Write `contents` to the text file at `path`.

    When `path` has a directory component, `fs.mkdir` is called on it
    before the file is opened for writing.
    """
    parent = fs.dirname(path)
    if parent:
        fs.mkdir(parent)
    with open(path, 'w') as handle:
        handle.write(contents)
def evaluate(model):
    """
    Evaluate `model` on coarsening-factor prediction with leave-one-out cross-validation.

    For each of the four platforms, one model per fold is trained (or
    restored from "result_caseB/modelb_caseB/") on precomputed embeddings
    loaded from `caseb_128.npy`. Predictions are cached per fold under
    "result_caseB/predictionb_caseB/".

    Arguments:
        model: Object exposing `__name__`, `__basename__`, `init`,
            `restore`, `save`, `train` and `predict`.

    Returns:
        pd.DataFrame: One row per (platform, fold) with the columns
        "Model", "Platform", "Kernel", "Oracle-CF", "Predicted-CF",
        "Speedup" and "Oracle".

    NOTE(review): `oracles`, `df`, `data_path` and `seed` are not defined
    in this function; presumably module-level globals — confirm.
    """
    from progressbar import ProgressBar
    # [current count, bar]; presumably 68 = 4 platforms x 17 kernels — TODO confirm
    progressbar = [0, ProgressBar(maxval=68)]
    progressbar[1].start()
    data = []

    X_seq = None  # defer sequence encoding (it's expensive)

    for i, platform in enumerate(["Cypress", "Tahiti", "Fermi", "Kepler"]):
        platform_name = platform2str(platform)
        # Read the oracle (label) runtimes for this platform
        oracle_runtimes = np.array(
            [float(x) for x in oracles["runtime_" + platform]])
        # Read the labels (coarsening factors) for this platform
        y = np.array([int(x) for x in oracles["cf_" + platform]],
                     dtype=np.int32)
        # One-hot encode the labels (original comment: 6 possible values)
        y_1hot = get_onehot(oracles, platform)
        X_cc, y_cc = get_features(df, oracles, platform)
        embed = np.load(f"{data_path}caseb_128.npy")

        # Leave-one-out: one fold per entry of y
        kf = KFold(n_splits=len(y), shuffle=False)

        for j, (train_index, test_index) in enumerate(kf.split(y)):
            # presumably the rows of `oracles` follow the sorted kernel
            # names of `df`; verify against the data files
            kernel = sorted(set(df["kernel"]))[test_index[0]]

            model_name = model.__name__
            model_basename = model.__basename__

            # Cache locations are unique per model, platform and fold.
            model_path = f"result_caseB/modelb_caseB/{model_basename}-{platform}-{j}.model"
            predictions_path = f"result_caseB/predictionb_caseB/{model_basename}-{platform}-{j}.result"

            if fs.exists(predictions_path):
                # load result from cache
                with open(predictions_path, 'rb') as infile:
                    p = pickle.load(infile)
            else:
                if fs.exists(model_path):
                    # load a trained model from cache
                    model.restore(model_path)
                else:
                    # create a new model and train it
                    model.init(seed=seed)
                    model.train(
                        sequences=embed[train_index],
                        verbose=True,  # TODO
                        y_1hot=y_1hot[train_index])

                    # cache the model
                    fs.mkdir(fs.dirname(model_path))
                    model.save(model_path)

                # make prediction
                p = model.predict(sequences=np.array(embed[test_index[0]]))[0]
                # Clamp the prediction to 2**(number of features for this kernel - 1).
                p = min(p, 2**(len(X_cc[test_index[0]]) - 1))

                # cache the prediction
                fs.mkdir(fs.dirname(predictions_path))
                with open(predictions_path, 'wb') as outfile:
                    pickle.dump(p, outfile)

            # oracle prediction
            o = y[test_index[0]]
            correct = p == o

            # get runtime without thread coarsening
            row = df[(df["kernel"] == kernel) & (df["cf"] == 1)]
            assert (len(row) == 1)  # sanity check
            nocf_runtime = float(row["runtime_" + platform])

            # get runtime of prediction
            row = df[(df["kernel"] == kernel) & (df["cf"] == p)]
            assert (len(row) == 1)  # sanity check
            p_runtime = float(row["runtime_" + platform])

            # get runtime of oracle coarsening factor
            o_runtime = oracle_runtimes[test_index[0]]

            # speedup and % oracle
            s_oracle = nocf_runtime / o_runtime
            p_speedup = nocf_runtime / p_runtime
            p_oracle = o_runtime / p_runtime

            # record result
            data.append({
                "Model": model_name,
                "Platform": platform_name,
                "Kernel": kernel,
                "Oracle-CF": o,
                "Predicted-CF": p,
                "Speedup": p_speedup,
                "Oracle": p_oracle
            })

            progressbar[0] += 1  # update progress bar
            progressbar[1].update(progressbar[0])

    return pd.DataFrame(data,
                        columns=[
                            "Model", "Platform", "Kernel", "Oracle-CF",
                            "Predicted-CF", "Speedup", "Oracle"
                        ])
def features_dir(csv_path):
    """Return `fs.basename` of the directory component of `csv_path`."""
    parent = fs.dirname(csv_path)
    return fs.basename(parent)
def test_accuracy(model, embeddings, folder_data, samples_per_class,
                  folder_results, dense_layer_size, print_summary, num_epochs,
                  batch_size):
    """
    Print the top-k test accuracy of a trained model and save its confusion matrix.

    Test files ending in '.rec' are collected from
    `<folder_data>/seq_test/<class>` for the classes 1..104. Top-k
    predictions are loaded from a cache file under `folder_results` when
    present; otherwise the model is initialized, its weights are loaded
    from `FLAGS.out`, and the predictions are computed and cached. The
    confusion matrix of the top-1 predictions is saved as a PNG under
    `<folder_results>/models/`.

    Arguments:
        model: Object exposing `init`, `load_weights`, `predict_topk` and
            `model.summary()`. Its `__name__` is overwritten with
            `FLAGS.model_name`.
        embeddings: Embedding matrix, passed to `tf.nn.l2_normalize(axis=1)`.
        folder_data (str): Folder containing the 'seq_test' sub-folder.
        samples_per_class: Unused here.
        folder_results (str): Root folder for predictions and the plot.
        dense_layer_size: Forwarded to `model.init`.
        print_summary (bool): Print the model summary after initialization.
        num_epochs: Unused here.
        batch_size (int): Forwarded to `model.predict_topk`.

    Also reads `FLAGS.vocabulary_dir`, `FLAGS.maxlen`, `FLAGS.model_name`,
    `FLAGS.topk` and `FLAGS.out`.
    """
    seed = 204
    num_classes = 104

    # Collect test file names; the class index is the label of each file.
    y_test = np.array([], dtype=np.int32)
    X_test = list()
    folder_data_test = os.path.join(folder_data, 'seq_test')
    print('Getting file names for', num_classes, 'classes from folders:')
    print(folder_data_test)
    for i in range(1, num_classes + 1):
        folder = os.path.join(folder_data_test, str(i))
        assert os.path.exists(folder), "Folder: " + folder + ' does not exist'
        print('\ttest : Read file names from folder ', folder)
        listing = os.listdir(folder + '/')
        seq_files = [
            os.path.join(folder, f) for f in listing if f[-4:] == '.rec'
        ]
        assert len(seq_files) > 0, "No .rec files found in " + folder
        X_test += seq_files
        y_test = np.concatenate(
            [y_test,
             np.array([int(i)] * len(seq_files), dtype=np.int32)])

    # Only the index of the unknown token is needed from the dictionary.
    # NOTE(review): the dictionary file is unpickled as-is; make sure
    # FLAGS.vocabulary_dir is trusted.
    folder_vocabulary = FLAGS.vocabulary_dir
    dictionary_pickle = os.path.join(folder_vocabulary, 'dic_pickle')
    print('\tLoading dictionary from file', dictionary_pickle)
    with open(dictionary_pickle, 'rb') as f:
        dictionary = pickle.load(f)
    unk_index = dictionary[rgx.unknown_token]
    del dictionary

    # Encode and pad the test sequences; a positive FLAGS.maxlen overrides
    # the maximum length found in the data.
    X_seq_test, maxlen_test = encode_srcs(X_test, 'testing', unk_index)
    maxlen = maxlen_test
    print('Max. sequence length overall:', maxlen)
    if FLAGS.maxlen > 0:
        maxlen = FLAGS.maxlen
    print('Padding sequences to length', maxlen)
    X_seq_test = pad_src(X_seq_test, maxlen, unk_index)

    model.__name__ = FLAGS.model_name
    model_name = model.__name__
    predictions_path = os.path.join(
        folder_results,
        "predictions/{}_top{}.result".format(model_name, FLAGS.topk))

    if fs.exists(predictions_path):
        print("\tFound predictions in", predictions_path, ", skipping...")
        with open(predictions_path, 'rb') as infile:
            ind = pickle.load(infile)
    else:
        import tensorflow as tf
        embedding_matrix_normalized = tf.nn.l2_normalize(embeddings, axis=1)
        vocabulary_size, embedding_dimension = embedding_matrix_normalized.shape
        print('EMB:\n', embedding_matrix_normalized)

        print('\n--- Initializing model...')
        model.init(seed=seed,
                   maxlen=maxlen,
                   embedding_dim=int(embedding_dimension),
                   num_classes=num_classes,
                   dense_layer_size=dense_layer_size,
                   embedding_matrix=embedding_matrix_normalized)
        model.load_weights(
            os.path.join(FLAGS.out, model.__name__ + '_weights.h5'))
        if print_summary:
            model.model.summary()

        print('\n--- Testing model...')
        ind, prob = model.predict_topk(X_seq_test, batch_size, FLAGS.topk)
        del prob

        # cache the prediction
        fs.mkdir(fs.dirname(predictions_path))
        with open(predictions_path, 'wb') as outfile:
            pickle.dump(ind, outfile)
        print('\tsaved predictions to', predictions_path)

    # After the transpose, ind[i] holds the i-th ranked prediction of every
    # sample. A sample counts as correct when any of its top-k predictions
    # equals its label.
    accuracy = np.zeros_like(y_test)
    ind = np.transpose(np.array(ind))
    for i in range(FLAGS.topk):
        accuracy += np.array(ind[i]) == y_test
    print('\nTest top{} accuracy:'.format(FLAGS.topk),
          sum(accuracy) * 100.0 / len(accuracy), '%')

    # Confusion matrix of the top-1 predictions.
    from sklearn.metrics import confusion_matrix
    conf_matr = confusion_matrix(y_test, ind[0])

    import matplotlib.pyplot as plt
    fig, ax = plt.subplots()
    values = plt.imshow(conf_matr)
    ax.xaxis.tick_top()
    ax.xaxis.set_label_position('top')
    fig.colorbar(values)
    # confusion_matrix() puts true labels on rows and predictions on
    # columns, and imshow() draws rows along the y axis, so the x axis is
    # the predicted class and the y axis the true class.
    ax.set_xlabel('Предсказанные классы')
    ax.set_ylabel('Настоящие классы')

    conf_png = os.path.join(folder_results,
                            "models/conf_matr_{}.png".format(model_name))
    # The "models" folder may not exist yet (nothing above creates it).
    fs.mkdir(fs.dirname(conf_png))
    plt.savefig(conf_png)