Example #1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', required=True)
    parser.add_argument('--model_state',
                        help='Path to saved model state_dict.')
    parser.add_argument('--limit', type=int, default=None)

    args = parser.parse_args()

    print('Loading trained model')
    _, node_feature_mapping, edge_feature_mapping = sample.load_saved_model(
        args.model_state)

    print('Loading testing data')
    seqs = flat_array.load_dictionary_flat(np.load(args.dataset,
                                                   mmap_mode='r'))['sequences']

    if args.limit is not None:
        seqs = seqs[:args.limit]

    total_bytes = compress_sequences(tqdm.tqdm(seqs, total=len(seqs)),
                                     node_feature_mapping,
                                     edge_feature_mapping)
    print('Total bytes: {0}'.format(total_bytes))
    print('Average Entropy: {0:.3f} bits / graph'.format(total_bytes * 8 /
                                                         len(seqs)))
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True)
    parser.add_argument('--mask',
                        choices=list(MASK_FUNCTIONS.keys()),
                        default='node_type')
    parser.add_argument('--output')
    parser.add_argument('--limit', type=int, default=None)
    parser.add_argument('--num_workers', type=int, default=16)

    args = parser.parse_args()

    print('Reading data')
    seqs = flat_array.load_dictionary_flat(np.load(args.input,
                                                   mmap_mode='r'))['sequences']
    seqs.share_memory_()

    if args.limit is not None:
        seqs = seqs[:args.limit]

    print('Computing statistics')
    result = uniform_valid_perplexity(seqs, MASK_FUNCTIONS[args.mask],
                                      args.num_workers)

    if args.output is not None:
        print('Saving results')
        np.savez_compressed(args.output, **result)

    choices = np.average(result['choices'], weights=result['sequence_length'])
    entropy = np.average(result['entropy'], weights=result['sequence_length'])
    print('Average choices: {:.3f}'.format(choices))
    print('Average entropy: {:.3f}'.format(entropy))
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, required=True)
    parser.add_argument('--dataset', type=str, required=True)
    parser.add_argument('--device', type=str, default='cpu')
    parser.add_argument('--output', type=str)
    parser.add_argument('--max_predictions', type=int, default=None)

    args = parser.parse_args()

    device = torch.device(args.device)

    print('Loading trained model')
    model, node_feature_mapping = eval.load_sampling_model(args.model)
    model = model.eval().to(device)

    print('Loading testing data')
    seqs = flat_array.load_dictionary_flat(np.load(args.dataset,
                                                   mmap_mode='r'))['sequences']

    likelihood_evaluation = EdgeLikelihoodEvaluator(model,
                                                    node_feature_mapping,
                                                    device)

    length = len(seqs)
    if args.max_predictions is not None:
        length = min(length, args.max_predictions)

    results = []
    average_likelihoods = np.empty(length)
    sequence_length = np.empty(length)

    for i in tqdm.trange(length):
        seq = seqs[i]
        result = likelihood_evaluation.edge_likelihood(seq)
        results.append({'seq': seq, 'likelihood': result})

        sequence_length[i] = result.shape[0]

        if result.shape[0] == 0:
            average_likelihoods[i] = 0.0
        else:
            average_likelihoods[i] = np.mean(np.sum(result, axis=-1), axis=0)

    print('Average bits per edge: {0:.3f}'.format(
        np.average(average_likelihoods, weights=sequence_length) / np.log(2)))

    if args.output is None:
        return

    print('Saving to output {0}'.format(args.output))
    with gzip.open(args.output, 'wb') as f:
        pickle.dump(results, f, protocol=4)
Example #4
def load_dataset_and_weights_with_mapping(dataset_file,
                                          node_feature_mapping,
                                          edge_feature_mapping,
                                          seed=None):
    data = flat_array.load_dictionary_flat(np.load(dataset_file,
                                                   mmap_mode='r'))
    seqs = data['sequences']
    seqs.share_memory_()

    ds = dataset.GraphDataset(seqs, node_feature_mapping, edge_feature_mapping,
                              seed)

    return ds, data['sequence_lengths']
Example #5
def load_sequences_and_mappings(dataset_file,
                                auxiliary_file,
                                quantization,
                                entity_features=True,
                                edge_features=True):
    data = flat_array.load_dictionary_flat(np.load(dataset_file,
                                                   mmap_mode='r'))

    if auxiliary_file is None:
        root, _ = os.path.splitext(dataset_file)
        auxiliary_file = root + '.stats.pkl.gz'

    if entity_features or edge_features:
        with gzip.open(auxiliary_file, 'rb') as f:
            auxiliary_dict = pickle.load(f)

    if entity_features:
        entity_feature_mapping = dataset.EntityFeatureMapping(
            auxiliary_dict['node'])
    else:
        entity_feature_mapping = None

    seqs = data['sequences']
    weights = data['sequence_lengths']

    if edge_features:
        if isinstance(quantization['angle'], dataset.QuantizationMap):
            angle_map = quantization['angle']
        else:
            angle_map = dataset.QuantizationMap.from_counter(
                auxiliary_dict['edge']['angle'], quantization['angle'])

        if isinstance(quantization['length'], dataset.QuantizationMap):
            length_map = quantization['length']
        else:
            length_map = dataset.QuantizationMap.from_counter(
                auxiliary_dict['edge']['length'], quantization['length'])
        edge_feature_mapping = dataset.EdgeFeatureMapping(
            angle_map, length_map)
    else:
        edge_feature_mapping = None

    return {
        'sequences': seqs.share_memory_(),
        'entity_feature_mapping': entity_feature_mapping,
        'edge_feature_mapping': edge_feature_mapping,
        'weights': weights
    }
Example #6
def _worker_node(param_combination, filepath, num_centers, max_values=None):
    label, param_name = param_combination
    sequences = flat_array.load_dictionary_flat(
        np.load(filepath, mmap_mode='r'))['sequences']

    values = (op.parameters[param_name]
              for op in itertools.chain.from_iterable(sequences)
              if op.label == label and param_name in op.parameters)

    if max_values is not None:
        values = itertools.islice(values, max_values)

    values = np.array(list(values))
    centers = numerical_parameters.make_quantization(values, num_centers,
                                                     'cdf')
    return centers
Example #7
def _worker_edges(dataset_path, worker_idx, num_workers, result_queue):
    # Load data
    data = flat_array.load_dictionary_flat(np.load(dataset_path,
                                                   mmap_mode='r'))
    sequences = data['sequences']

    # Extract sub-sequence for worker
    length_for_worker, num_additional = divmod(len(sequences), num_workers)
    offset = worker_idx * length_for_worker + min(worker_idx, num_additional)
    if worker_idx < num_additional:
        length_for_worker += 1

    seq_indices = range(offset,
                        min(offset + length_for_worker, len(sequences)))

    # Process data
    expression_counters = {
        k: collections.Counter()
        for k in _EDGE_PARAMETER_IDS
    }

    num_processed = 0

    for seq_idx in seq_indices:
        seq = sequences[seq_idx]

        try:
            for op in seq:
                if not isinstance(op, EdgeOp):
                    continue

                for k in _EDGE_PARAMETER_IDS:
                    if k in op.parameters:
                        value = op.parameters[k]
                        value = numerical_parameters.normalize_expression(
                            value, k)
                        expression_counters[k][value] += 1
        except Exception:
            print('Error processing sequence at index {0}'.format(seq_idx))

        num_processed += 1
        if num_processed > 1000:
            result_queue.put(num_processed)
            num_processed = 0
    result_queue.put(num_processed)

    result_queue.put(expression_counters)
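The slicing arithmetic at the top of `_worker_edges` partitions the dataset into contiguous, near-equal chunks: every worker gets `len(sequences) // num_workers` sequences, and the first `len(sequences) % num_workers` workers each take one extra. A minimal, self-contained sketch of that logic (the helper name `chunk_bounds` is illustrative and not part of the original code):

def chunk_bounds(n_items, worker_idx, num_workers):
    # Base chunk size, plus how many workers receive one extra item.
    length, extra = divmod(n_items, num_workers)
    # Workers before `extra` each hold one additional item, which shifts later offsets.
    offset = worker_idx * length + min(worker_idx, extra)
    if worker_idx < extra:
        length += 1
    return offset, min(offset + length, n_items)

# 10 sequences over 3 workers -> (0, 4), (4, 7), (7, 10)
assert [chunk_bounds(10, i, 3) for i in range(3)] == [(0, 4), (4, 7), (7, 10)]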
Example #8
def test_flat_dictionary():
    x = [2, 3, 4, 5]
    y = np.array([3, 5])
    z = ["A", "python", "list"]

    x_flat = flat_array.save_list_flat(x)

    dict_flat = flat_array.pack_dictionary_flat({
        'x': x_flat,
        'y': y,
        'z': z
    })

    result = flat_array.load_dictionary_flat(dict_flat)

    assert isinstance(result['x'], flat_array.FlatSerializedArray)
    assert len(result['x']) == len(x)

    assert result['z'] == z
    assert all(result['y'] == y)
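The test above round-trips a packed dictionary entirely in memory, while the other examples read the same structure from disk via `np.load(..., mmap_mode='r')`. Below is a hedged sketch of the on-disk variant, assuming `pack_dictionary_flat` returns a single flat NumPy array that `np.save` can write (which is consistent with how the loaders above are called); the file name is illustrative and `flat_array` is the same module used throughout.

import numpy as np

packed = flat_array.pack_dictionary_flat({
    'sequences': flat_array.save_list_flat([2, 3, 4, 5]),
    'sequence_lengths': np.array([1, 1, 1, 1]),
})

# Persist to disk, then reload memory-mapped exactly as the examples above do.
np.save('toy_dataset.npy', packed)  # illustrative path
data = flat_array.load_dictionary_flat(np.load('toy_dataset.npy', mmap_mode='r'))
seqs = data['sequences']
assert len(seqs) == 4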
Example #9
def process_edges(dataset_path, num_threads):
    print('Checking total sketch dataset size.')
    total_sequences = len(
        flat_array.load_dictionary_flat(np.load(dataset_path,
                                                mmap_mode='r'))['sequences'])

    result_queue = multiprocessing.Queue()

    workers = []

    for worker_idx in range(num_threads):
        workers.append(
            multiprocessing.Process(target=_worker_edges,
                                    args=(dataset_path, worker_idx,
                                          num_threads, result_queue)))

    for worker in workers:
        worker.start()

    active_workers = len(workers)

    total_result = {}

    print('Processing sequences for edge statistics')
    with tqdm.tqdm(total=total_sequences) as pbar:
        while active_workers > 0:
            result = result_queue.get()

            if isinstance(result, int):
                pbar.update(result)
                continue

            for k, v in result.items():
                total_result.setdefault(k, collections.Counter()).update(v)
            active_workers -= 1

    for worker in workers:
        worker.join()

    return total_result
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', required=True)
    parser.add_argument('--model_state',
                        help='Path to saved model state_dict.')
    parser.add_argument('--device', type=str, default='cuda')
    parser.add_argument('--limit', type=int, default=None)

    args = parser.parse_args()

    device = torch.device(args.device)

    print('Loading trained model')
    model, node_feature_mapping, edge_feature_mapping = sample.load_saved_model(
        args.model_state)
    model = model.eval().to(device)

    print('Loading testing data')
    seqs = flat_array.load_dictionary_flat(np.load(args.dataset,
                                                   mmap_mode='r'))['sequences']

    if args.limit is not None:
        seqs = seqs[:args.limit]

    evaluator = GraphLikelihoodEvaluator(model, node_feature_mapping,
                                         edge_feature_mapping, device)

    losses = np.empty(len(seqs))
    length = np.empty(len(seqs), dtype=np.int64)

    for i, result in enumerate(
            tqdm.tqdm(evaluator.compute_likelihood(seqs), total=len(seqs))):
        losses[i], length[i] = result

    print('Average bits per sketch: {:.2f}'.format(losses.mean() / np.log(2)))
    print('Average bits per step: {:.2f}'.format(losses.sum() / np.log(2) /
                                                 length.sum()))
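Examples #3 and #10 report entropies in bits even though the model losses are accumulated in nats, hence the division by `np.log(2)`. A tiny illustrative conversion (the numbers are made up):

import numpy as np

loss_nats = 45.0
# ln(2) nats per bit, so a 45-nat loss corresponds to roughly 64.9 bits.
print('{:.1f} nats = {:.1f} bits'.format(loss_nats, loss_nats / np.log(2)))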
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, required=True)
    parser.add_argument('--dataset', type=str, required=True)
    parser.add_argument('--device', type=str, default='cpu')
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--output', type=str)
    parser.add_argument('--output_statistics', type=str)
    parser.add_argument('--max_edge', type=int, default=100)
    parser.add_argument('--max_predictions', type=int, default=None)
    parser.add_argument('--use_joint', action='store_true')
    parser.add_argument('--mask',
                        default=None,
                        choices=list(MASK_FUNCTIONS.keys()))

    args = parser.parse_args()

    device = torch.device(args.device)

    print('Loading trained model')
    model, node_feature_mapping = load_sampling_model(args.model)
    model = model.eval().to(device)

    print('Loading testing data')
    seqs = flat_array.load_dictionary_flat(np.load(args.dataset,
                                                   mmap_mode='r'))['sequences']

    prediction = AutoConstraintPrediction(
        model,
        node_feature_mapping,
        batch_size=args.batch_size,
        device=device,
        mask_function=MASK_FUNCTIONS[args.mask]
        if args.mask is not None else None)

    length = len(seqs)
    if args.max_predictions is not None:
        length = min(length, args.max_predictions)

    input_seq_prediction, input_seq_verification = itertools.tee(
        (seqs[i] for i in range(length)), 2)

    input_node_ops = ([op for op in seq if isinstance(op, datalib.NodeOp)]
                      for seq in input_seq_prediction)
    prediction_output = prediction.predict(input_node_ops,
                                           use_joint=args.use_joint,
                                           num_workers=4)

    precision = np.empty(length, dtype=np.float64)
    recall = np.empty(length, dtype=np.float64)
    ops = []

    for i, (predicted_edge_ops, original_ops) in enumerate(
            tqdm.tqdm(zip(prediction_output, input_seq_verification),
                      total=length)):
        node_ops, edge_ops = split_ops(original_ops)
        ops.append({
            'node_ops': node_ops,
            'edge_ops': edge_ops,
            'predicted_edge_ops': predicted_edge_ops,
        })

        predicted_edge_ops = set((e.label, e.partner, e.current)
                                 for e in predicted_edge_ops
                                 if e.label != sketch.ConstraintType.Subnode)
        edge_ops = set((e.label, e.partner, e.current) for e in edge_ops
                       if e.label != sketch.ConstraintType.Subnode)

        num_correct_edge_ops = len(edge_ops & predicted_edge_ops)
        precision[i] = num_correct_edge_ops / len(predicted_edge_ops) if len(
            predicted_edge_ops) > 0 else 0
        recall[i] = num_correct_edge_ops / len(edge_ops) if len(
            edge_ops) > 0 else 1

    if args.output is not None:
        with gzip.open(args.output, 'wb') as f:
            pickle.dump(ops, f, protocol=4)

    output_statistics_file = args.output_statistics

    if output_statistics_file is None:
        if args.output is not None:
            output_basename, output_ext = os.path.splitext(args.output)
            if output_ext == '.gz':
                output_basename, _ = os.path.splitext(output_basename)

            output_statistics_file = output_basename + '_stat.npz'

    if output_statistics_file is not None:
        np.savez_compressed(output_statistics_file,
                            precision=precision,
                            recall=recall)
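Example #11 writes per-sketch precision and recall arrays with `np.savez_compressed`. A hedged sketch of reading such a statistics file back and summarizing it (the path and the F1 computation are illustrative additions, not part of the original pipeline):

import numpy as np

stats = np.load('predictions_stat.npz')  # hypothetical output path
precision = stats['precision']
recall = stats['recall']

print('Mean precision: {:.3f}'.format(precision.mean()))
print('Mean recall:    {:.3f}'.format(recall.mean()))

# Per-sketch F1, guarding against sketches where both quantities are zero.
denom = np.maximum(precision + recall, 1e-12)
f1 = 2 * precision * recall / denom
print('Mean F1:        {:.3f}'.format(f1.mean()))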