def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', required=True)
    parser.add_argument('--model_state', help='Path to saved model state_dict.')
    parser.add_argument('--limit', type=int, default=None)
    args = parser.parse_args()

    print('Loading trained model')
    _, node_feature_mapping, edge_feature_mapping = sample.load_saved_model(args.model_state)

    print('Loading testing data')
    seqs = flat_array.load_dictionary_flat(np.load(args.dataset, mmap_mode='r'))['sequences']
    if args.limit is not None:
        seqs = seqs[:args.limit]

    total_bytes = compress_sequences(
        tqdm.tqdm(seqs, total=len(seqs)), node_feature_mapping, edge_feature_mapping)
    print('Total bytes: {0}'.format(total_bytes))
    print('Average entropy: {0:.3f} bits / graph'.format(total_bytes * 8 / len(seqs)))

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True)
    parser.add_argument('--mask', choices=list(MASK_FUNCTIONS.keys()), default='node_type')
    parser.add_argument('--output')
    parser.add_argument('--limit', type=int, default=None)
    parser.add_argument('--num_workers', type=int, default=16)
    args = parser.parse_args()

    print('Reading data')
    seqs = flat_array.load_dictionary_flat(np.load(args.input, mmap_mode='r'))['sequences']
    # Move the underlying buffers to shared memory so worker processes can read them.
    seqs.share_memory_()
    if args.limit is not None:
        seqs = seqs[:args.limit]

    print('Computing statistics')
    result = uniform_valid_perplexity(seqs, MASK_FUNCTIONS[args.mask], args.num_workers)

    if args.output is not None:
        print('Saving results')
        np.savez_compressed(args.output, **result)

    choices = np.average(result['choices'], weights=result['sequence_length'])
    entropy = np.average(result['entropy'], weights=result['sequence_length'])
    print('Average choices: {:.3f}'.format(choices))
    print('Average entropy: {:.3f}'.format(entropy))

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, required=True)
    parser.add_argument('--dataset', type=str, required=True)
    parser.add_argument('--device', type=str, default='cpu')
    parser.add_argument('--output', type=str)
    parser.add_argument('--max_predictions', type=int, default=None)
    args = parser.parse_args()

    device = torch.device(args.device)

    print('Loading trained model')
    model, node_feature_mapping = eval.load_sampling_model(args.model)
    model = model.eval().to(device)

    print('Loading testing data')
    seqs = flat_array.load_dictionary_flat(np.load(args.dataset, mmap_mode='r'))['sequences']

    likelihood_evaluation = EdgeLikelihoodEvaluator(model, node_feature_mapping, device)

    length = len(seqs)
    if args.max_predictions is not None:
        length = min(length, args.max_predictions)

    results = []
    average_likelihoods = np.empty(length)
    sequence_length = np.empty(length)

    for i in tqdm.trange(length):
        seq = seqs[i]
        result = likelihood_evaluation.edge_likelihood(seq)
        results.append({'seq': seq, 'likelihood': result})
        sequence_length[i] = result.shape[0]
        if result.shape[0] == 0:
            average_likelihoods[i] = 0.0
        else:
            average_likelihoods[i] = np.mean(np.sum(result, axis=-1), axis=0)

    # Likelihoods are in nats; divide by log(2) to report bits.
    print('Average bits per edge {0:.3f}'.format(
        np.average(average_likelihoods, weights=sequence_length) / np.log(2)))

    if args.output is None:
        return

    print('Saving to output {0}'.format(args.output))
    with gzip.open(args.output, 'wb') as f:
        pickle.dump(results, f, protocol=4)

def load_dataset_and_weights_with_mapping(dataset_file, node_feature_mapping,
                                          edge_feature_mapping, seed=None):
    data = flat_array.load_dictionary_flat(np.load(dataset_file, mmap_mode='r'))
    seqs = data['sequences']
    seqs.share_memory_()
    ds = dataset.GraphDataset(seqs, node_feature_mapping, edge_feature_mapping, seed)
    return ds, data['sequence_lengths']

def load_sequences_and_mappings(dataset_file, auxiliary_file, quantization,
                                entity_features=True, edge_features=True):
    data = flat_array.load_dictionary_flat(np.load(dataset_file, mmap_mode='r'))

    # Default to the statistics file stored alongside the dataset.
    if auxiliary_file is None:
        root, _ = os.path.splitext(dataset_file)
        auxiliary_file = root + '.stats.pkl.gz'

    if entity_features or edge_features:
        with gzip.open(auxiliary_file, 'rb') as f:
            auxiliary_dict = pickle.load(f)

    if entity_features:
        entity_feature_mapping = dataset.EntityFeatureMapping(auxiliary_dict['node'])
    else:
        entity_feature_mapping = None

    seqs = data['sequences']
    weights = data['sequence_lengths']

    if edge_features:
        # Accept either pre-built quantization maps, or settings from which to
        # build them using the saved edge-parameter counters.
        if isinstance(quantization['angle'], dataset.QuantizationMap):
            angle_map = quantization['angle']
        else:
            angle_map = dataset.QuantizationMap.from_counter(
                auxiliary_dict['edge']['angle'], quantization['angle'])

        if isinstance(quantization['length'], dataset.QuantizationMap):
            length_map = quantization['length']
        else:
            length_map = dataset.QuantizationMap.from_counter(
                auxiliary_dict['edge']['length'], quantization['length'])

        edge_feature_mapping = dataset.EdgeFeatureMapping(angle_map, length_map)
    else:
        edge_feature_mapping = None

    return {
        'sequences': seqs.share_memory_(),
        'entity_feature_mapping': entity_feature_mapping,
        'edge_feature_mapping': edge_feature_mapping,
        'weights': weights
    }

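# A minimal usage sketch for load_sequences_and_mappings. The dataset path and
# the quantization settings (passed through to QuantizationMap.from_counter)
# are hypothetical, not taken from the original code:
#
#   data = load_sequences_and_mappings(
#       'data/sequences.npy', None, quantization={'angle': 256, 'length': 256})
#   # data['sequences'] is shared-memory backed and safe to hand to workers;
#   # the two mappings can then be wired into a GraphDataset as in
#   # load_dataset_and_weights_with_mapping above.
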
def _worker_node(param_combination, filepath, num_centers, max_values=None):
    label, param_name = param_combination
    sequences = flat_array.load_dictionary_flat(
        np.load(filepath, mmap_mode='r'))['sequences']

    values = (op.parameters[param_name]
              for op in itertools.chain.from_iterable(sequences)
              if op.label == label and param_name in op.parameters)
    if max_values is not None:
        values = itertools.islice(values, max_values)

    values = np.array(list(values))
    centers = numerical_parameters.make_quantization(values, num_centers, 'cdf')
    return centers

def _worker_edges(dataset_path, worker_idx, num_workers, result_queue):
    # Load data
    data = flat_array.load_dictionary_flat(np.load(dataset_path, mmap_mode='r'))
    sequences = data['sequences']

    # Extract sub-sequence for worker: the first num_additional workers each
    # take one extra sequence. (The original used max() here, which skips the
    # first sequences and overlaps workers; min() yields a proper partition.)
    length_for_worker, num_additional = divmod(len(sequences), num_workers)
    offset = worker_idx * length_for_worker + min(worker_idx, num_additional)
    if worker_idx < num_additional:
        length_for_worker += 1
    seq_indices = range(offset, min(offset + length_for_worker, len(sequences)))

    # Process data
    expression_counters = {k: collections.Counter() for k in _EDGE_PARAMETER_IDS}
    num_processed = 0

    for seq_idx in seq_indices:
        seq = sequences[seq_idx]
        try:
            for op in seq:
                if not isinstance(op, EdgeOp):
                    continue
                for k in _EDGE_PARAMETER_IDS:
                    if k in op.parameters:
                        value = op.parameters[k]
                        value = numerical_parameters.normalize_expression(value, k)
                        expression_counters[k][value] += 1
        except Exception:
            print('Error processing sequence at index {0}'.format(seq_idx))
        num_processed += 1
        # Periodically report progress to the main process.
        if num_processed > 1000:
            result_queue.put(num_processed)
            num_processed = 0

    result_queue.put(num_processed)
    result_queue.put(expression_counters)

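# A small self-contained sanity check (not part of the original module) for the
# divmod-based split used in _worker_edges: every index in range(n) is covered
# exactly once across the workers.
def _split_bounds(n, num_workers, worker_idx):
    length, extra = divmod(n, num_workers)
    offset = worker_idx * length + min(worker_idx, extra)
    if worker_idx < extra:
        length += 1
    return offset, offset + length

# e.g. [_split_bounds(10, 3, w) for w in range(3)] == [(0, 4), (4, 7), (7, 10)]
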
def test_flat_dictionary():
    x = [2, 3, 4, 5]
    y = np.array([3, 5])
    z = ["A", "python", "list"]

    x_flat = flat_array.save_list_flat(x)
    dict_flat = flat_array.pack_dictionary_flat({'x': x_flat, 'y': y, 'z': z})
    result = flat_array.load_dictionary_flat(dict_flat)

    assert isinstance(result['x'], flat_array.FlatSerializedArray)
    assert len(result['x']) == len(x)
    assert result['z'] == z
    assert all(result['y'] == y)

def process_edges(dataset_path, num_threads):
    print('Checking total sketch dataset size.')
    total_sequences = len(
        flat_array.load_dictionary_flat(np.load(dataset_path, mmap_mode='r'))['sequences'])

    result_queue = multiprocessing.Queue()
    workers = []
    for worker_idx in range(num_threads):
        workers.append(
            multiprocessing.Process(
                target=_worker_edges,
                args=(dataset_path, worker_idx, num_threads, result_queue)))

    for worker in workers:
        worker.start()

    active_workers = len(workers)
    total_result = {}

    print('Processing sequences for edge statistics')
    with tqdm.tqdm(total=total_sequences) as pbar:
        while active_workers > 0:
            result = result_queue.get()
            # Workers emit integer progress updates, then one final counter
            # dictionary each; a dictionary signals that the worker is done.
            if isinstance(result, int):
                pbar.update(result)
                continue
            for k, v in result.items():
                total_result.setdefault(k, collections.Counter()).update(v)
            active_workers -= 1

    for worker in workers:
        worker.join()

    return total_result

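# Usage sketch for process_edges (the dataset path here is hypothetical).
# Because workers are launched with multiprocessing.Process, call it under the
# usual main guard so it also works with the 'spawn' start method:
#
#   if __name__ == '__main__':
#       counters = process_edges('data/sequences.npy', num_threads=8)
#       print({k: len(counter) for k, counter in counters.items()})
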
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', required=True)
    parser.add_argument('--model_state', help='Path to saved model state_dict.')
    parser.add_argument('--device', type=str, default='cuda')
    parser.add_argument('--limit', type=int, default=None)
    args = parser.parse_args()

    device = torch.device(args.device)

    print('Loading trained model')
    model, node_feature_mapping, edge_feature_mapping = sample.load_saved_model(args.model_state)
    model = model.eval().to(device)

    print('Loading testing data')
    seqs = flat_array.load_dictionary_flat(np.load(args.dataset, mmap_mode='r'))['sequences']
    if args.limit is not None:
        seqs = seqs[:args.limit]

    evaluator = GraphLikelihoodEvaluator(model, node_feature_mapping, edge_feature_mapping, device)

    losses = np.empty(len(seqs))
    length = np.empty(len(seqs), dtype=np.int64)

    for i, result in enumerate(
            tqdm.tqdm(evaluator.compute_likelihood(seqs), total=len(seqs))):
        losses[i], length[i] = result

    # Losses are in nats; divide by log(2) to report bits.
    print('Average bits per sketch: {:.2f}'.format(losses.mean() / np.log(2)))
    print('Average bits per step: {:.2f}'.format(losses.sum() / np.log(2) / length.sum()))

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, required=True)
    parser.add_argument('--dataset', type=str, required=True)
    parser.add_argument('--device', type=str, default='cpu')
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--output', type=str)
    parser.add_argument('--output_statistics', type=str)
    parser.add_argument('--max_edge', type=int, default=100)
    parser.add_argument('--max_predictions', type=int, default=None)
    parser.add_argument('--use_joint', action='store_true')
    parser.add_argument('--mask', default=None, choices=list(MASK_FUNCTIONS.keys()))
    args = parser.parse_args()

    device = torch.device(args.device)

    print('Loading trained model')
    model, node_feature_mapping = load_sampling_model(args.model)
    model = model.eval().to(device)

    print('Loading testing data')
    seqs = flat_array.load_dictionary_flat(np.load(args.dataset, mmap_mode='r'))['sequences']

    prediction = AutoConstraintPrediction(
        model, node_feature_mapping, batch_size=args.batch_size, device=device,
        mask_function=MASK_FUNCTIONS[args.mask] if args.mask is not None else None)

    length = len(seqs)
    if args.max_predictions is not None:
        length = min(length, args.max_predictions)

    # Duplicate the input iterator: one copy feeds prediction, the other verification.
    input_seq_prediction, input_seq_verification = itertools.tee(
        (seqs[i] for i in range(length)), 2)
    input_node_ops = ([op for op in seq if isinstance(op, datalib.NodeOp)]
                      for seq in input_seq_prediction)

    prediction_output = prediction.predict(
        input_node_ops, use_joint=args.use_joint, num_workers=4)

    # np.float was removed in NumPy 1.24; use an explicit dtype.
    precision = np.empty(length, dtype=np.float64)
    recall = np.empty(length, dtype=np.float64)
    ops = []

    for i, (predicted_edge_ops, original_ops) in enumerate(
            tqdm.tqdm(zip(prediction_output, input_seq_verification), total=length)):
        node_ops, edge_ops = split_ops(original_ops)
        ops.append({
            'node_ops': node_ops,
            'edge_ops': edge_ops,
            'predicted_edge_ops': predicted_edge_ops,
        })

        # Compare predicted and ground-truth edge sets, excluding Subnode constraints.
        predicted_edge_ops = set(
            (e.label, e.partner, e.current) for e in predicted_edge_ops
            if e.label != sketch.ConstraintType.Subnode)
        edge_ops = set(
            (e.label, e.partner, e.current) for e in edge_ops
            if e.label != sketch.ConstraintType.Subnode)

        num_correct_edge_ops = len(edge_ops & predicted_edge_ops)
        precision[i] = (num_correct_edge_ops / len(predicted_edge_ops)
                        if len(predicted_edge_ops) > 0 else 0)
        recall[i] = num_correct_edge_ops / len(edge_ops) if len(edge_ops) > 0 else 1

    if args.output is not None:
        with gzip.open(args.output, 'wb') as f:
            pickle.dump(ops, f, protocol=4)

    output_statistics_file = args.output_statistics
    if output_statistics_file is None and args.output is not None:
        # Derive a statistics filename from the output path, stripping a .gz
        # suffix and the remaining extension.
        output_basename, output_ext = os.path.splitext(args.output)
        if output_ext == '.gz':
            output_basename, _ = os.path.splitext(output_basename)
        output_statistics_file = output_basename + '_stat.npz'

    if output_statistics_file is not None:
        np.savez_compressed(output_statistics_file, precision=precision, recall=recall)

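# Worked example of the precision/recall computation above, on hypothetical
# (label, partner, current) tuples:
#
#   predicted = {('Coincident', 0, 1), ('Horizontal', 1, 1)}
#   actual = {('Coincident', 0, 1), ('Distance', 0, 2)}
#   correct = len(actual & predicted)      # 1
#   precision = correct / len(predicted)   # 0.5
#   recall = correct / len(actual)         # 0.5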