Пример #1
0
 def filter_samples(blacklist, data, name):
     """Drop every example of ``data`` whose ``id`` is in ``blacklist``.

     ``data.examples`` is replaced by the surviving examples, and the size
     before and after filtering is written to the debug log, labelled with
     ``name``.
     """
     size_before = len(data)
     survivors = []
     for sample in data:
         if sample.id in blacklist:
             continue
         survivors.append(sample)
     data.examples = survivors
     size_after = len(data)
     Logger.debug("{} size {} -> {}".format(name, size_before, size_after))
Пример #2
0
    def __accuracy(self, base_model, reject_threshold, valid_iter, adversarial=False):
        """Tell whether ``base_model`` makes no mistake on ``valid_iter``.

        The accuracy-with-reject statistics are always computed first. Without
        ``adversarial`` the answer is whether their ``mask_valid_noreject_acc``
        entry equals exactly 100.0. With ``adversarial`` the model is instead
        checked by the rename adversary in approximate mode and the answer is
        ``stats.is_sound()``.
        """
        valid_stats = base_model.accuracy_with_reject(
            valid_iter,
            self.dataset.TARGET,
            self.dataset.reject_token_id,
            reject_threshold,
        )

        if not adversarial:
            return valid_stats["mask_valid_noreject_acc"] == 100.0

        Logger.start_scope("adversarial accuracy")

        # First attack: subtree replacements applied on top of 2 renamed samples.
        few_renames = AdversaryBatchIter(
            self.rename_adversary, base_model, valid_iter, num_samples=2
        )
        subtree_on_renames = AdversaryBatchIter(
            self.subtree_adversary, base_model, few_renames
        )
        # Second attack: 40 renamed samples only.
        many_renames = AdversaryBatchIter(
            self.rename_adversary, base_model, valid_iter, num_samples=40
        )

        stats = self.rename_adversary.adversarial_accuracy(
            base_model,
            valid_iter,
            [subtree_on_renames, many_renames],
            threshold=reject_threshold,
            approximate=True,
        )
        Logger.end_scope()
        return stats.is_sound()
Пример #3
0
    def set_base_predictions(
        self,
        base_correct: torch.Tensor,
        base_preds: torch.Tensor,
        base_y=None,
        batch_ids=None,
    ):
        """Record the base model's results and seed the per-type masks.

        Stores the arguments on the instance, derives ``base_reject_mask``
        (positions where the prediction equals ``self.reject_token_id``) and
        initialises:

        * ``self.correct`` / ``self.reject`` with independent copies of the
          base masks for ``AdvType.SOUND_PRECISE`` and ``AdvType.SOUND``;
        * ``self.incorrect`` for ``AdvType.UNSOUND`` with the positions that
          are neither correct nor rejected.

        A summary of the mask sizes is written to the debug log.
        """
        self.base_correct = base_correct
        self.base_preds = base_preds
        reject_mask = base_preds == self.reject_token_id
        self.base_reject_mask = reject_mask
        self.base_y = base_y

        sound_types = (AdvType.SOUND_PRECISE, AdvType.SOUND)
        self.correct = {adv_type: base_correct.clone() for adv_type in sound_types}
        self.reject = {adv_type: reject_mask.clone() for adv_type in sound_types}
        self.incorrect = {AdvType.UNSOUND: ~(base_correct | reject_mask)}

        num_correct = base_correct.sum().item()
        num_reject = reject_mask.sum().item()
        num_incorrect = (~(base_correct | reject_mask)).sum().item()
        num_total = base_correct.numel()
        Logger.debug(
            "correct: {}, reject: {}, incorrect: {}, total: {}".format(
                num_correct, num_reject, num_incorrect, num_total
            ))
        self.batch_ids = batch_ids
Пример #4
0
 def filter_size(self, dataset, min_size=100):
     """Keep only the examples whose ``target`` is longer than ``min_size``.

     ``dataset.examples`` is replaced in place and the size change is
     written to the debug log.
     """
     size_before = len(dataset)
     dataset.examples = list(
         filter(lambda sample: len(sample.target) > min_size, dataset)
     )
     size_after = len(dataset)
     Logger.debug("Filter size {} -> {}".format(size_before, size_after))
Пример #5
0
def __save_paths(args, paths, name):
    """Merge per-project dumps into ``<args.out_dir>/<name>.json.gz``.

    Every path in ``paths`` is a gzip-compressed JSON list of entries. When a
    sibling ``<path>.opt`` file exists it is read instead. Entries are
    written one JSON document per line. Unless ``args.include_js`` is set,
    only entries whose ``filename`` ends in ``.ts`` are kept. For inputs that
    were not taken from an ``.opt`` file, ``source_files`` is copied to
    ``dependencies``; ``source_files`` is removed from every written entry.
    """
    out_file = os.path.join(args.out_dir, name + ".json.gz")
    num_entries = 0
    with gzip.open(out_file, "wb") as f_out:
        for path in paths:
            optimized = os.path.exists(path + ".opt")
            if optimized:
                path = path + ".opt"

            if is_file_empty(path):
                continue
            print(num_entries, path)

            with gzip.open(path, "rb") as f:
                entries = json.loads(f.read())

            for entry in entries:
                if not args.include_js and not entry["filename"].endswith(".ts"):
                    continue

                num_entries += 1
                if not optimized:
                    entry["dependencies"] = entry["source_files"]
                del entry["source_files"]
                assert None not in entry["dependencies"]

                serialized = json.dumps(entry)
                f_out.write(serialized.encode("utf-8"))
                f_out.write("\n".encode("utf-8"))

    Logger.debug("{}, num files: {}".format(name, num_entries))
Пример #6
0
def save_model(model: NeuralModelBase, args, model_id):
    """Write the state dict of ``model`` to its checkpoint file.

    The file location is ``checkpoint_dir(args)`` joined with
    ``checkpoint_name(args, model_id)``.
    """
    import torch

    target_dir = checkpoint_dir(args)
    file_name = checkpoint_name(args, model_id)
    checkpoint_file = os.path.join(target_dir, file_name)
    Logger.debug("Saving model to {}".format(checkpoint_file))
    state = model.state_dict()
    torch.save(state, checkpoint_file)
Пример #7
0
 def load_models(
     self,
     make_model,
     dataset: Dataset,
     adversary,
     subtree_adversary,
     args,
     model_id,
     max_models=None,
     last_base=True,
 ):
     """Load consecutive ``RobustModel`` checkpoints into ``self.models``.

     Models are created with increasing ``idx`` and loaded one after the
     other; loading stops at the first model whose ``load`` fails or once
     ``max_models`` models were loaded (no limit when ``max_models`` is
     ``None``). With ``last_base`` set, the model at position
     ``max_models - 1`` is created with ``base_model`` enabled.
     """
     self.models = []
     while True:
         position = len(self.models)
         if max_models is not None and position >= max_models:
             break

         is_base = last_base and position + 1 == max_models
         candidate = RobustModel(
             make_model,
             dataset,
             idx=position,
             rename_adversary=adversary,
             subtree_adversary=subtree_adversary,
             base_model=is_base,
         )
         if not candidate.load(args, model_id):
             break
         self.models.append(candidate)

     Logger.debug("Loaded {} models".format(len(self.models)))
Пример #8
0
def train_nonempty_model(
    model_fn,
    dataset: Dataset,
    train_iter,
    valid_iter,
    num_epochs=10,
    max_steps=10,
    step=0.1,
):
    """Train a fresh model and pick a rejection threshold for it.

    A model built by ``model_fn`` is trained, then rejection thresholds are
    computed on ``valid_iter`` for the precision levels 0.98, 0.95, 0.9 and
    0.8. Only thresholds that are set and cover more than 100 samples count;
    if none qualifies, the same search is repeated on ``train_iter``.

    ``max_steps`` and ``step`` are accepted but not used by this function.

    Returns:
        ``(model, threshold)`` using the first qualifying threshold, or
        ``(None, None)`` when no threshold qualifies.
    """
    precision_levels = (0.98, 0.95, 0.9, 0.8)

    model = model_fn()
    train_model(model, dataset, num_epochs, train_iter, valid_iter, target_o=1.1)

    # Validation data is tried first; training data is only the fallback.
    usable = []
    for source_iter in (valid_iter, train_iter):
        candidates = get_rejection_thresholds(
            source_iter, model, dataset, list(precision_levels)
        )
        usable = [t for t in candidates if t.h is not None and t.size > 100]
        if usable:
            break

    if not usable:
        return None, None

    chosen = usable[0]
    Logger.debug("Rejection Threshold: {}".format(chosen))
    model.accuracy_with_reject(
        valid_iter, dataset.TARGET, dataset.reject_token_id, chosen.h
    )
    return model, chosen
Пример #9
0
    def print(self, dataset=None, edge_gen=None):
        """Log a per-feature breakdown of this edge filter.

        One debug line is written for every entry of ``self.valid_features``
        (paired with ``self.costs`` and ``self.counts``): its cost and count
        with their share of the respective totals, the running share of the
        total count, and the running share of ``self.seen_counts``.

        Args:
            dataset: Optional. When given, the two node-type ids of a feature
                are translated through ``dataset.TYPES.vocab.itos``.
            edge_gen: Optional. When given, the edge-type id of a feature is
                translated through ``edge_gen.id_to_edge_type``.
        """
        Logger.debug("EdgeFilter")

        def percent(part, whole):
            # A zero total yields 0 instead of a division error; this is the
            # guard that was previously applied to the "seen" column only.
            return (part * 100.0 / whole) if whole != 0 else 0

        # The totals are loop invariants: compute them once rather than
        # re-summing the full lists for every printed row.
        total_cost = sum(self.costs)
        total_count = sum(self.counts)
        total_seen = sum(self.seen_counts.values())

        cumsum = 0
        cumsum_seen = 0
        for feature, cost, count in zip(self.valid_features, self.costs,
                                        self.counts):
            cumsum += count
            cumsum_seen += self.seen_counts[feature]

            if dataset is not None or edge_gen is not None:
                # A feature is "<node type u>_<node type v>_<edge type>";
                # each id is replaced by its name where a lookup is available.
                node_type_u, node_type_v, edge_type = feature.split("_")
                if dataset is not None:
                    node_type_u = dataset.TYPES.vocab.itos[int(node_type_u)]
                    node_type_v = dataset.TYPES.vocab.itos[int(node_type_v)]
                if edge_gen is not None:
                    edge_type = edge_gen.id_to_edge_type[int(edge_type)]

                feature = "{}_{}_{}".format(node_type_u, node_type_v,
                                            edge_type)

            Logger.debug(
                "\t{:>40s} cost: {:10.0f} ({:5.2f}%), count: {:10d} ({:5.2f}%), cumsum: {:6.2f}%, seen: {:6.2f}%"
                .format(
                    feature,
                    cost,
                    percent(cost, total_cost),
                    count,
                    percent(count, total_count),
                    percent(cumsum, total_count),
                    percent(cumsum_seen, total_seen),
                ))
Пример #10
0
def main():
    """Train or evaluate the robust models ``args.repeat`` times.

    After argument post-processing, logging and seeding, the dataset is
    loaded and de-duplicated. Each repetition re-seeds with
    ``args.seed + i``, runs either evaluation (``args.eval``) or
    ``robust_multi``, and rewrites ``results.csv`` in the checkpoint
    directory with all result frames collected so far.
    """
    args = parse_args()
    if not args.include_values:
        # When the values are not included renaming is a no-op
        args.n_renames = 0
    if not (args.adv_mode == "RANDOM" and args.train_adv_mode == "RANDOM"):
        args.dot_product_embedding = True

    args.tag = "{}/robust".format(args.tag)

    # ---- Debug initialization ----
    Logger.init(args.log_file)
    Logger.debug(" ".join(sys.argv))
    Random.seed(args.seed)

    cuda_enabled = torch.cuda.is_available() and args.use_cuda
    device = torch.device("cuda" if cuda_enabled else "cpu")

    # ---- Dataset loading and preprocessing ----
    edge_models = [
        Models.UGraphTransformer.name,
        Models.GCN.name,
        Models.GGNN.name,
    ]
    dataset = Dataset(args, include_edges=args.model in edge_models)
    dataset.remove_duplicates()

    masks = {"mask_valid": dataset.MASK_VALID}

    # ---- Training ----
    for option in ("display.max_rows", "display.max_columns"):
        pd.set_option(option, None)

    def save_results(frames):
        # Print all frames gathered so far and rewrite the CSV with them.
        combined = pd.concat(frames)
        print(combined)
        combined.to_csv(
            os.path.join(checkpoint_dir(args), "results.csv"),
            index=False,
            header=True,
        )

    dfs = []
    for i in range(args.repeat):
        Random.seed(args.seed + i)
        # NOTE(review): `eval` is presumably a function of this module that
        # shadows the builtin - confirm.
        run = eval if args.eval else robust_multi
        dfs.append(
            run(args, dataset, device, masks, max_models=args.max_models, model_id=i)
        )
        save_results(dfs)
Пример #11
0
def get_rejection_thresholds(
    it, model: NeuralModelBase, dataset: Dataset, precision_thresholds: Iterable[float]
):
    """Find, per target precision, a cut-off on the reject probability.

    The reject probabilities of all positions selected by the ``mask_valid``
    padding mask are histogrammed into 1000 equal bins over [0, 1], once for
    all predictions and once for the correct ones. The bins are then walked
    in increasing order; before bin ``i`` is added, the precision of the
    samples in bins ``0..i-1`` is compared with every target precision.

    Args:
        it: Iterable of batches; each batch provides ``Y``.
        model: Provides ``predict_probs_with_reject`` and ``padding_mask``.
        dataset: Provides ``reject_token_id``.
        precision_thresholds: Target precisions. Any iterable is accepted; it
            is materialized once because it is traversed for every bin.

    Returns:
        A list with one ``SimpleNamespace(h, size)`` per target precision, in
        the same order. ``h`` is the selected cut-off (``i / 1000``) or
        ``None`` when the precision was never reached; ``size`` is the number
        of samples below the cut-off.
    """
    # A one-shot iterable (e.g. a generator) would be exhausted after the
    # first traversal and silently produce no thresholds.
    precision_thresholds = list(precision_thresholds)
    num_bins = 1000

    num_correct = torch.zeros(num_bins)
    num_total = torch.zeros(num_bins)
    for batch in tqdm.tqdm(it, ncols=100, leave=False):
        _, best_predictions, reject_probs = model.predict_probs_with_reject(
            batch, reject_id=dataset.reject_token_id
        )
        mask = model.padding_mask(batch, mask_field="mask_valid")
        targets = batch.Y

        best_predictions = best_predictions.masked_select(mask)
        reject_probs = reject_probs.masked_select(mask).cpu()
        targets = targets.masked_select(mask)

        is_corrects = (targets == best_predictions).cpu()

        num_total.add_(torch.histc(reject_probs, bins=num_bins, min=0, max=1))
        num_correct.add_(
            torch.histc(
                reject_probs.masked_select(is_corrects), bins=num_bins, min=0, max=1
            )
        )

    def precision(stat):
        # No samples yet counts as precision 0, so no threshold is selected.
        if stat.total == 0:
            return 0
        return stat.correct * 1.0 / stat.total

    thresholds = [SimpleNamespace(h=None, size=0) for _ in precision_thresholds]
    rolling_stat = SimpleNamespace(correct=0, total=0)
    for i, correct, total in zip(
        itertools.count(), num_correct.numpy(), num_total.numpy()
    ):
        # rolling_stat covers bins 0..i-1 here; bin i is added afterwards.
        for t, precision_threshold in zip(thresholds, precision_thresholds):
            if precision_threshold <= precision(rolling_stat):
                # update threshold if it's not set or the number of samples increased
                if t.h is None or t.size * 1.01 < rolling_stat.total:
                    t.h = i / float(num_bins)
                    t.size = int(rolling_stat.total)

        rolling_stat.correct += correct
        rolling_stat.total += total

    Logger.debug(
        "Thresholds: {}, sizes: {}".format(
            [t.h for t in thresholds], [t.size for t in thresholds]
        )
    )
    return thresholds
Пример #12
0
def load_model(model: NeuralModelBase, args, model_id):
    """Load the checkpoint of ``model`` from disk, if there is one.

    The file location is ``checkpoint_dir(args)`` joined with
    ``checkpoint_name(args, model_id)``.

    Returns:
        ``True`` when the state dict was loaded into ``model``, ``False``
        when the checkpoint file does not exist.
    """
    import torch

    checkpoint_file = os.path.join(checkpoint_dir(args),
                                   checkpoint_name(args, model_id))
    print("checkpoint_file", checkpoint_file)
    if not os.path.exists(checkpoint_file):
        return False

    Logger.debug("Loading model from {}".format(checkpoint_file))
    # Deserialize onto the CPU so that a checkpoint saved from a CUDA device
    # can also be restored on a host without that device; load_state_dict
    # then copies the values into the model's own parameters.
    # NOTE: torch.load unpickles the file - only load trusted checkpoints.
    data = torch.load(checkpoint_file, map_location="cpu")
    model.load_state_dict(data)
    return True
Пример #13
0
def make_adversary(dataset: Dataset, make_iter):
    """Build the rename adversary and the subtree adversary for ``dataset``.

    The train, valid and test splits are converted to trees in two forms
    (``dataset_to_trees`` and ``dataset_to_trees_num``). The value indexes
    are built from the training trees only, while the replacement rules are
    made over the trees of all three splits.

    Returns:
        ``(adversary, subtree_adversary)``.
    """
    Logger.start_scope("Parsing Trees")
    train_trees_str = dataset_to_trees(dataset.dtrain, dataset.ID)
    valid_trees_str = dataset_to_trees(dataset.dvalid, dataset.ID)
    test_trees_str = dataset_to_trees(dataset.dtest, dataset.ID)
    # Later splits win on duplicate keys: train < valid < test.
    all_trees_str = dict(train_trees_str)
    all_trees_str.update(valid_trees_str)
    all_trees_str.update(test_trees_str)

    train_trees_num = dataset_to_trees_num(dataset.dtrain)
    valid_trees_num = dataset_to_trees_num(dataset.dvalid)
    test_trees_num = dataset_to_trees_num(dataset.dtest)
    all_trees_num = dict(train_trees_num)
    all_trees_num.update(valid_trees_num)
    all_trees_num.update(test_trees_num)
    Logger.end_scope()

    Logger.start_scope("Indexing Trees")
    value_index = NodeValueIndex(dataset, train_trees_num)
    value_index_str = NodeValueIndexStr(dataset, train_trees_str)
    expr_gen = ExpressionGenerator(value_index_str)

    node_replacement = AdversarialNodeReplacement(
        value_index, dataset.fixed_value_offset
    )
    rename_rules = node_replacement.make_rules(
        dataset, all_trees_str, all_trees_num
    )
    adversary = RenameAdversary(rename_rules, dataset)
    Logger.end_scope()

    subtree_replacement = AdversarialSubtreeReplacement(expr_gen)
    subtree_rules = subtree_replacement.make_rules(
        dataset, all_trees_str, all_trees_num
    )
    subtree_adversary = SubtreeAdversary(
        subtree_rules, dataset, all_trees_str, make_iter
    )

    return adversary, subtree_adversary
Пример #14
0
def optimize_project(path, pool, include_js=False):
    """Run ``optimize_file`` over a project dump and store it as ``<path>.opt``.

    Nothing is done when ``<path>.opt`` already exists or when ``path`` is
    empty. The input is a gzip-compressed JSON list of entries; unless
    ``include_js`` is set only entries whose ``filename`` ends in ``.ts`` are
    processed. Entries are handed to ``pool.imap_unordered``, so the output
    order may differ from the input order. Progress is written to stderr.
    """
    opt_path = path + ".opt"
    if os.path.exists(opt_path) or is_file_empty(path):
        return

    with gzip.open(path, "rb") as f:
        entries = json.loads(f.read())

    if not include_js:
        entries = [item for item in entries if item["filename"].endswith(".ts")]

    Logger.start_scope("Optimizing {}".format(path))
    total = len(entries)
    Logger.debug("#Entries: {}".format(total))

    opt_entries = []
    num_diffs = 0
    results = pool.imap_unordered(optimize_file, entries)
    for done, optimized in enumerate(results):
        sys.stderr.write("\r{}/{}".format(done, total))
        num_diffs += optimized["num_diffs"]
        opt_entries.append(optimized)
    sys.stderr.write("\r{}/{}\n".format(total, total))
    Logger.debug("#Diffs: {}".format(num_diffs))
    Logger.end_scope()

    print("write: ", opt_path)
    payload = json.dumps(opt_entries).encode("utf-8")
    with gzip.open(opt_path, "wb") as f:
        f.write(payload)
Пример #15
0
    def apply(
        self, it, filtered_it, mask_field="mask_valid", num_verbose=0, is_train=False
    ):
        """Run ``apply_batch`` over paired batches of ``it`` and ``filtered_it``.

        ``num_verbose`` is a budget shared by all batches: each batch receives
        what is left of it after the items shown by the earlier batches
        (never less than 0). The total number of predicted nodes is written
        to the debug log; nothing is returned.
        """
        total_predicted = 0
        shown_so_far = 0
        for batch, fbatch in zip(it, filtered_it):
            verbose_left = max(0, num_verbose - shown_so_far)
            batch_result = self.apply_batch(
                batch,
                fbatch,
                mask_field=mask_field,
                num_verbose=verbose_left,
                is_train=is_train,
            )
            predicted_here, shown_here = batch_result
            total_predicted += predicted_here
            shown_so_far += shown_here

        Logger.debug("Number of predicted nodes: {}".format(total_predicted))
Пример #16
0
    def init(self, in_path, out_path):
        """Preprocess the dataset into ``<out_path>/<self.name>`` when needed.

        The output directory is created if missing. If it already holds a
        ``config.json`` equal to this config, the dataset counts as
        preprocessed and nothing else happens. Otherwise the loader selected
        by ``self.loader`` preprocesses ``in_path`` into the output directory
        and this config is saved as ``config.json``.
        """
        out_path = os.path.join(out_path, self.name)
        # exist_ok avoids the race between an existence check and the
        # creation when several processes initialise the same dataset.
        os.makedirs(out_path, exist_ok=True)

        config_path = os.path.join(out_path, "config.json")
        if os.path.exists(config_path):
            existing_config = Config.load_from_file(config_path)
            if existing_config is not None and existing_config == self:
                Logger.debug("Dataset already preprocessed.")
                return
            Logger.debug(
                "Configs do not match. Overwriting existing dataset.")

        DATA_LOADERS[self.loader].preprocess_dataset(in_path, out_path, self)

        self.save_to_file(config_path)
Пример #17
0
def optimize_deps(filename, deps, base_deps, ref_json, base_time):
    """Shrink ``deps`` to a subset that still parses to ``ref_json``.

    Only dependencies that are not in ``base_deps`` are candidates for
    removal. Candidates are shuffled and removed chunk-wise: a chunk is
    dropped when parsing ``filename`` without it still yields ``ref_json``;
    otherwise, if the chunk has more than one element, it is queued again to
    be split into halves.

    Args:
        filename: File handed to ``parse_file_server``.
        deps: All dependencies of the file.
        base_deps: Dependencies that are never removed.
        ref_json: Parse result that has to be reproduced.
        base_time: Duration reported in the summary log next to the
            original size.

    Returns:
        The remaining dependencies as a list (in no particular order).
    """
    t = time.time()
    opt_deps = set(deps)
    removal_candidates = list(set(deps) - set(base_deps))
    random.shuffle(removal_candidates)

    # Duration of the most recent parse. NaN rather than None, so that the
    # "{:.2f}" summary below still formats when no parse was executed.
    opt_time = float("nan")
    queue = PriorityHeap()
    queue.add(removal_candidates)
    while len(queue) > 0:
        data = queue.pop()
        for to_remove in chunks(data, max(1, math.ceil(len(data) / 2))):
            # Set membership instead of a list scan per dependency.
            dropped = set(to_remove)
            start = time.time()
            ast_json = parse_file_server(
                filename,
                parser_name="typescript",
                data={
                    "remove_types": True,
                    "deps": sorted(d for d in opt_deps if d not in dropped),
                },
            )
            opt_time = time.time() - start
            assert ast_json is not None

            if ast_json == ref_json:
                # Same parse result without the chunk: it is not needed.
                print("\ttook: {}, remove: {}".format(time.time() - start,
                                                      len(to_remove)))
                opt_deps.difference_update(to_remove)
            elif len(to_remove) != 1:
                # Something in the chunk is needed: retry on smaller chunks.
                print("\ttook: {}, recurse".format(time.time() - start))
                queue.add(to_remove)

    Logger.debug(
        "Original Size: #{} ({:.2f}s), Base Size: #{}, Optimized Size: #{} ({:.2f}s), Total Time: {:.2f}"
        .format(
            len(deps),
            base_time,
            len(base_deps),
            len(opt_deps),
            opt_time,
            time.time() - t,
        ))
    return list(opt_deps)
Пример #18
0
def print_rejection_thresholds(it, model: NeuralModelBase, dataset: Dataset):
    """Log the accuracy of ``model`` for reject cut-offs 0.1, 0.2, ..., 1.0.

    For every cut-off ``h`` only the positions with a reject probability of
    at most ``h`` are counted. Positions are restricted to the ``mask_valid``
    padding mask. The overall accuracy is logged last.
    """
    thresholds = np.arange(0.1, 1.1, 0.1)
    stats = collections.defaultdict(lambda: SimpleNamespace(correct=0, total=0))
    overall = SimpleNamespace(correct=0, total=0)

    for batch in tqdm.tqdm(it, ncols=100, leave=False):
        _, best_predictions, reject_probs = model.predict_probs_with_reject(
            batch, reject_id=dataset.reject_token_id
        )
        mask = model.padding_mask(batch, mask_field="mask_valid")

        predicted = best_predictions.masked_select(mask)
        reject = reject_probs.masked_select(mask)
        expected = batch.Y.masked_select(mask)

        hits = expected == predicted
        overall.correct += hits.sum().item()
        overall.total += expected.numel()

        for h in thresholds:
            accepted = reject <= h
            bucket = stats[h]
            bucket.total += accepted.sum().item()
            bucket.correct += hits.masked_select(accepted).sum().item()

    for h in thresholds:
        bucket = stats[h]
        Logger.debug(
            "Threshold {:5.2f}: {:6d}/{:6d} ({:.2f}%)".format(
                h, bucket.correct, bucket.total, acc(bucket.correct, bucket.total)
            )
        )

    Logger.debug(
        "{:6d}/{:6d} ({:.2f}%)".format(
            overall.correct, overall.total, acc(overall.correct, overall.total)
        )
    )
Пример #19
0
def main():
    """Process every project below ``--repos`` and save the dataset.

    All top-level projects of every entry of the repos directory (except the
    entry named ``SAP``) are collected. The ``--repos_cleaned`` directory is
    deleted if present, recreated, and ``process_project`` is run for every
    collected project in a process pool. ``save_dataset`` is called last.
    """
    # The text is the description; passed positionally it would be taken as
    # ``prog`` and shown in place of the program name in the usage line.
    parser = argparse.ArgumentParser(
        description="Run TypeScript Type Checker on a dataset of project")
    parser.add_argument("--repos", default="data/Repos")
    parser.add_argument("--repos_cleaned", default="data/Repos-processed")
    parser.add_argument("--out_dir", default="data/out")
    # type=int: a value given on the command line would otherwise reach
    # multiprocessing.Pool as a string.
    parser.add_argument("--num_threads", default=12, type=int)
    parser.add_argument("--include_js", default=False, action="store_true")
    args = parser.parse_args()

    Logger.init()
    random.seed(42)

    args.repos = os.path.abspath(args.repos)
    args.repos_cleaned = os.path.abspath(args.repos_cleaned)
    args.out_dir = os.path.abspath(args.out_dir)

    paths = []
    for path in os.listdir(args.repos):
        if path == "SAP":
            continue

        for p in find_top_level_projects(os.path.join(args.repos, path)):
            paths.append((args.repos, args.repos_cleaned, p))

    # Always start from an empty output directory.
    if os.path.exists(args.repos_cleaned):
        shutil.rmtree(args.repos_cleaned)
    if not os.path.exists(args.repos_cleaned):
        os.makedirs(args.repos_cleaned)
        with multiprocessing.Pool(args.num_threads) as pool:
            pool.starmap(process_project, paths)

    # (optional) optimize dependencies
    # paths = glob.glob('{}/**/*.json.gz'.format(args.repos_cleaned), recursive=True)
    # with multiprocessing.Pool(args.num_threads) as pool:
    #     for path in paths:
    #         optimize_project(path, pool)

    save_dataset(args)
Пример #20
0
    def __refine_model_adversarial(
        self,
        model,
        train_iter,
        valid_iter,
        adv_train_iter,
        threshold,
        min_nonabstained=500,
    ):
        """Adversarially fine-tune ``model`` and re-select its rejection threshold.

        In every epoch the subtree adversary's ``fit_adversarial`` is run with
        the current threshold, the accuracy on ``valid_iter`` is evaluated,
        and a new threshold is selected from ``valid_iter``.

        Args:
            model: Model to refine; its ``loss_function.o`` is overwritten in
                every epoch.
            train_iter: Training batches passed to ``fit_adversarial``.
            valid_iter: Validation batches used for accuracy and thresholds.
            adv_train_iter: Passed through to ``fit_adversarial``.
            threshold: Starting threshold; only its ``h`` attribute is read.
            min_nonabstained: Minimum threshold ``size``. Twice this value is
                required inside the epoch loop, once for the final selection.

        Returns:
            The first qualifying threshold of the final selection, or ``None``
            when no threshold qualifies after some epoch.

        Raises:
            AssertionError: If the final selection yields no threshold.
        """
        Logger.debug("fit_adversarial")
        step = 1.0 / 4
        # Per-epoch values of `o`: interpolated from 1.1 towards 1.02 in
        # steps of 1/4 (f = 1.0, 0.75, 0.5, 0.25), then 12 epochs at 1.02.
        schedule = [
            f * 1.1 + (1 - f) * 1.02 for f in np.arange(start=1.0, stop=0.0, step=-step)
        ] + 12 * [1.02]

        # Results of fit_adversarial for the epochs that did not stop the loop.
        num_refined_all = []
        for epoch, o in enumerate(schedule):
            model.loss_function.o = o
            Logger.debug("Epoch {}, o={}".format(epoch, o))
            num_refined = self.subtree_adversary.fit_adversarial(
                model, train_iter, adv_train_iter, threshold.h
            )
            # Return value unused here; presumably called for its logging -
            # confirm.
            model.accuracy_with_reject(
                valid_iter,
                self.dataset.TARGET,
                self.dataset.reject_token_id,
                threshold.h,
            )

            # print_rejection_thresholds(valid_iter, model, self.dataset)

            # Re-select the threshold; the next epoch trains against it.
            thresholds = get_rejection_thresholds(
                valid_iter, model, self.dataset, [0.99, 0.98, 0.95, 0.9]
            )
            thresholds = [
                t
                for t in thresholds
                if t.h is not None and t.size > min_nonabstained * 2
            ]
            if not thresholds:
                # No threshold qualifies any more: give up on this model.
                return None
            threshold = thresholds[0]

            if num_refined == 0:
                break
            # After epoch 7, stop once this epoch's value is at least a third
            # of the sum over the previous (up to) three recorded epochs.
            if epoch > 7 and num_refined * 3 >= sum(num_refined_all[-3:]):
                break
            num_refined_all.append(num_refined)

        # Final selection with stricter precision targets but the plain
        # min_nonabstained size bound.
        thresholds = get_rejection_thresholds(
            valid_iter, model, self.dataset, [1.00, 0.99, 0.98, 0.95]
        )
        thresholds = [
            t for t in thresholds if t.h is not None and t.size > min_nonabstained
        ]
        Logger.debug("Selected Threshold: {}".format(thresholds))
        # NOTE(review): unlike the in-loop case (which returns None), an
        # empty selection here raises AssertionError, and the check is
        # skipped entirely under `python -O`.
        assert thresholds
        return thresholds[0]
Пример #21
0
def iter_to_trees(iter) -> Dict[int, AstTree]:
    """Convert every batch of ``iter`` to trees, keyed by sample id.

    The trees of a batch come from ``batch_to_trees`` and are paired with
    the entries of ``batch.id``. The running tree count is written to stderr
    as progress.
    """
    Logger.start_scope("Converting Iter to Trees")
    trees = {}
    for batch in iter:
        batch_trees = batch_to_trees(batch)
        trees.update(
            (idx.item(), tree) for tree, idx in zip(batch_trees, batch.id)
        )
        sys.stderr.write("\r{}".format(len(trees)))

    sys.stderr.write("\r")
    num_trees = len(trees)
    Logger.debug("# Trees: {}".format(num_trees))
    Logger.end_scope()
    return trees
Пример #22
0
def dataset_to_trees(dataset, ID, analyzer=None) -> Dict[int, AstTree]:
    """Build one ``AstTree`` per sample of ``dataset``.

    Each tree is created from the sample's ``types``, ``values`` and
    ``depth``, with the sample's ``target`` attached under the key
    ``"target"``, and gets ``analyzer`` assigned. Trees are keyed by
    ``ID.vocab.stoi[sample.id]``. The running tree count is written to
    stderr as progress.
    """
    Logger.start_scope("Converting Dataset to Trees")
    trees = {}
    for sample in dataset:
        tree = AstTree.fromTensor(
            sample.types,
            sample.values,
            sample.depth,
            {"target": sample.target},
        )
        tree.analyzer = analyzer
        key = ID.vocab.stoi[sample.id]
        trees[key] = tree
        sys.stderr.write("\r{}".format(len(trees)))

    sys.stderr.write("\r")
    num_trees = len(trees)
    Logger.debug("# Trees: {}".format(num_trees))
    Logger.end_scope()
    return trees
Пример #23
0
    def __train_inner(
        self,
        train_iter,
        valid_iter,
        num_epochs=10,
        train_adv_mode=AdversarialMode.RANDOM,
        min_nonabstained=500,
    ):
        """Train a model together with an edge filter and a rejection threshold.

        An initial model is trained with ``train_nonempty_model``. Then, in a
        loop, the iterators are wrapped with the current edge filter, a copy
        of the model is refined (plainly, then adversarially) and a new edge
        filter is computed from the refined model. A refinement is accepted
        only if both refinement steps return a threshold. The loop ends when
        a refinement fails or when the new edge filter, scaled by 1.04, is
        not smaller than the accepted one. If no refinement was ever
        accepted, the initial model is refined adversarially instead.

        On success ``self.edge_filter``, ``self.model`` and ``self.threshold``
        are set.

        Args:
            train_iter: Training batches.
            valid_iter: Validation batches.
            num_epochs: Forwarded to ``train_nonempty_model``.
            train_adv_mode: Forwarded to the adversary iterator factories.
            min_nonabstained: Forwarded to both refinement steps.

        Returns:
            ``False`` when the initial training yields no model, ``True``
            otherwise.
        """

        model, threshold = train_nonempty_model(
            self.model_fn, self.dataset, train_iter, valid_iter, num_epochs=num_epochs
        )

        if model is None:
            Logger.debug("Nonempty model failed!")
            return False

        # Last accepted state; replaced only after a fully successful round.
        best_model = model
        best_threshold = threshold
        best_edge_filter = None

        # Return value unused; presumably called for its logging - confirm.
        self.__accuracy(model, threshold.h, valid_iter, adversarial=False)

        edge_filter = compute_edge_filter(
            train_iter,
            best_model,
            self.dataset,
            best_model.loss_function,
            threshold=threshold.h,
            verbose=True,
        )

        while True:
            Logger.debug(
                "Model with #{} non-rejected predictions".format(threshold.size)
            )

            # Both iterators are re-wrapped in every round, so the filters of
            # successive rounds stack on top of each other.
            Logger.debug("Original Edges: #{}".format(number_of_edges(train_iter)))
            train_iter = FilteredGraphIterator.from_iter(train_iter, edge_filter)
            Logger.debug("Filtered Edges: #{}".format(number_of_edges(train_iter)))
            valid_iter = FilteredGraphIterator.from_iter(valid_iter, edge_filter)

            # Refine a copy so that best_model stays intact if this round fails.
            model = self.__copy_model(model)
            threshold = self.__refine_model(
                model, train_iter, valid_iter, min_nonabstained=min_nonabstained
            )
            if threshold is None:
                break

            threshold = self.__refine_model_adversarial(
                model,
                train_iter,
                valid_iter,
                [
                    self.make_rename_adversary_iter(
                        train_iter, model, train_adv_mode, num_samples=5
                    ),
                    self.make_adversary_iter(
                        train_iter, model, train_adv_mode, num_samples=5
                    ),
                ],
                threshold,
                min_nonabstained=min_nonabstained,
            )

            if threshold is None:
                break

            # The round succeeded: accept the model, threshold and the filter
            # it was trained with.
            best_model = model
            best_threshold = threshold
            best_edge_filter = edge_filter

            edge_filter = compute_edge_filter(
                train_iter,
                best_model,
                self.dataset,
                best_model.loss_function,
                threshold=threshold.h,
                verbose=False,
            )
            Logger.debug(
                "new edges: {} ({}), old edges: {}".format(
                    len(edge_filter), len(edge_filter) * 1.04, len(best_edge_filter)
                )
            )
            # Stop once the new filter, scaled by 1.04, is not smaller than
            # the accepted one.
            if len(edge_filter) * 1.04 >= len(best_edge_filter):
                # self.accuracy(model, threshold.h, valid_iter, adversarial=True)
                break

        # NOTE(review): after a failed round train_iter/valid_iter are still
        # wrapped with the filter of that rejected round, while best_model and
        # best_edge_filter belong to an earlier state - confirm this is
        # intended for the fallback and the evaluations below.
        if best_edge_filter is None:
            Logger.debug("No Edge Filter, training base model adversarially")
            # NOTE(review): the adversary iterators are built with `model`
            # (at this point the copy from the failed round) although
            # `best_model` is the model being refined - confirm.
            best_threshold = self.__refine_model_adversarial(
                best_model,
                train_iter,
                valid_iter,
                [
                    self.make_rename_adversary_iter(train_iter, model, train_adv_mode),
                    self.make_adversary_iter(train_iter, model, train_adv_mode),
                ],
                best_threshold,
                min_nonabstained=min_nonabstained,
            )
            # NOTE(review): __refine_model_adversarial can return None, in
            # which case `best_threshold.h` below raises AttributeError.

        Logger.debug("Train Accuracy:")
        train_stats = best_model.accuracy_with_reject(
            train_iter,
            self.dataset.TARGET,
            self.dataset.reject_token_id,
            best_threshold.h,
        )
        Logger.debug("Valid Accuracy:")
        valid_stats = best_model.accuracy_with_reject(
            valid_iter,
            self.dataset.TARGET,
            self.dataset.reject_token_id,
            best_threshold.h,
        )

        train_prec = train_stats["mask_valid_noreject_acc"]
        valid_prec = valid_stats["mask_valid_noreject_acc"]
        Logger.debug(f"train_prec: {train_prec}, valid_prec: {valid_prec}")

        # Publish the accepted state on the instance.
        self.edge_filter = best_edge_filter
        self.model = best_model
        self.threshold = best_threshold.h
        # Same model, iterator and threshold as the "Train Accuracy" call
        # above; the result is discarded.
        self.model.accuracy_with_reject(
            train_iter,
            self.dataset.TARGET,
            self.dataset.reject_token_id,
            self.threshold,
        )

        return True
Пример #24
0
    def solve(self, debug_info=None):
        """Solve a flow problem over ``self.samples`` and derive an edge filter.

        One integer ``cost`` variable (objective coefficient 1.0) is created
        per edge type, and one integer flow variable per edge of every
        sample. Each flow is bounded by the cost variable of its edge type,
        and for every node the incoming flow plus the node's inflow must
        equal the outgoing flow.

        Args:
            debug_info: Printed as-is when the solve fails or when no edge
                type ends up with a positive cost.

        Returns:
            An ``EdgeFilter`` built from ``(edge_type, summed flow, number of
            edges with positive flow)`` for every edge type whose cost
            variable is positive.

        Note:
            The model is written to ``file.lp`` on every call. If the solver
            status is not optimal, or no feature is selected, diagnostics are
            printed and the process ends through ``exit(0)``.
        """
        # Imported here rather than at module level; presumably to keep
        # gurobipy an optional dependency - confirm.
        import gurobipy as gb

        # Progress scopes and timers are only logged for multi-sample solves.
        verbose = len(self.samples) > 1
        if verbose:
            Logger.debug("Number of samples: #{}".format(len(self.samples)))
        self.build_edge_types(self.samples)
        # Create optimization model
        m = gb.Model("netflow")

        timers = collections.defaultdict(Timer)
        if verbose:
            Logger.start_scope("Encoding Solver Model")
        # One variable per edge type; NOTE(review): the objective is
        # presumably minimised (Gurobi's default sense) - confirm.
        cost = m.addVars(
            range(len(self.edge_types.values())),
            obj=1.0,
            name="cost",
            vtype=gb.GRB.INTEGER,
        )
        flows = []
        for idx, sample in enumerate(self.samples):
            timers["flow"].start()
            flow = m.addVars(sample.edges.keys(),
                             name="flow_{}".format(idx),
                             vtype=gb.GRB.INTEGER)
            timers["flow"].stop()
            flows.append(flow)

            # Arc-capacity constraints
            timers["cap"].start()
            m.addConstrs(
                (flow[i, j] <= cost[self.edge_types[e_type]]
                 for (i, j), e_type in sample.edges.items()),
                "cap_{}".format(idx),
            )
            timers["cap"].stop()

            # Flow-conservation constraints
            timers["node"].start()
            m.addConstrs(
                (flow.sum("*", j) + sample.inflow.get(j, 0) == flow.sum(
                    j, "*") for j in sample.nodes),
                "node_{}".format(idx),
            )
            timers["node"].stop()

        if verbose:
            for key, timer in timers.items():
                Logger.debug("{} {}".format(key, timer))
            Logger.end_scope()

            Logger.start_scope("Optimizing")
        # Written unconditionally, relative to the current working directory.
        m.write("file.lp")
        # disable logging
        m.Params.OutputFlag = 0
        m.optimize()
        if verbose:
            Logger.end_scope()

        # Print solution
        if m.status == gb.GRB.Status.OPTIMAL:
            # Per edge type: summed flow, and number of edges carrying flow.
            edge_costs = collections.Counter()
            edge_counts = collections.Counter()
            for flow, sample in zip(flows, self.samples):
                solution = m.getAttr("x", flow)
                # print('\nOptimal flows:')
                for (i, j), e_type in sample.edges.items():
                    if solution[i, j] > 0:
                        # print('%s -> %s: %g' % (i, j, solution[i, j]))
                        edge_costs[e_type] += solution[i, j]
                        edge_counts[e_type] += 1

            valid_features = []
            solution = m.getAttr("x", cost)
            # print('Costs')
            # `c` is used as the key into `solution` and as the edge-type id;
            # `idx` is not used.
            for idx, c in enumerate(solution):
                # print('\t{} {} -> {} {:.2f} ({:.2f}%)'.format(idx, c, solution[c],
                #                                   edge_costs[self.id_to_edge_type[c]],
                #                                   edge_costs[self.id_to_edge_type[c]] * 100.0 / sum(edge_costs.values()))
                #       )
                if solution[c] > 0:
                    edge_type = self.id_to_edge_type[c]
                    valid_features.append((edge_type, edge_costs[edge_type],
                                           edge_counts[edge_type]))
            if not valid_features:
                print("valid_features", valid_features)
                print(debug_info)
                # NOTE(review): ends the whole process with exit status 0
                # although no feature was selected - confirm intent.
                exit(0)

            return EdgeFilter(valid_features)
        else:
            # Any non-optimal status is reported as infeasible.
            print(debug_info)
            print(m.status)
            print("The model is infeasible; computing IIS")

            for sample in self.samples[:5]:
                print(sample.inflow)
                print(sample.edges)
                print(sample.nodes)

            m.computeIIS()
            if m.IISMinimal:
                print("IIS is minimal\n")
            else:
                print("IIS is not minimal\n")
            print("\nThe following constraint(s) cannot be satisfied:")
            for c in m.getConstrs():
                if c.IISConstr:
                    print("%s" % c.constrName)
            # NOTE(review): exit status 0 on a failed solve - confirm intent.
            exit(0)
Пример #25
0
    def train(
        self,
        train_iter,
        valid_iter,
        num_epochs=10,
        min_nonabstained=500,
        depth=None,
        test_iter=None,
        apply_model=True,  # base_model=False,
        train_adv_mode=AdversarialMode.RANDOM,
        loaded=False,
        model_eval=None,
    ):
        """Train the model (unless it was loaded), evaluate it and apply it.

        Args:
            train_iter: Iterator over the training samples.
            valid_iter: Iterator over the validation samples.
            num_epochs: Forwarded to ``__train_inner``; the base-model path
                always trains with a fixed epoch count of 10.
            min_nonabstained: Forwarded to ``__train_inner``.
            depth: Not used by this method.
            test_iter: Optional iterator over the test samples.
            apply_model: When true, ``self.apply`` is called on every
                provided iterator after evaluation.
            train_adv_mode: Adversarial mode used for training samples.
            loaded: When true, training is skipped and only the evaluation
                and application steps run.
            model_eval: Passed to ``RobustModelBatchIter`` on the base-model
                path.

        Returns:
            ``False`` if ``__train_inner`` reported a failure, ``True``
            otherwise. As a side effect ``self.valid_stats`` holds the
            validation statistics.
        """
        if not loaded:
            if self.base_model:
                adversarial_train_iter = AdversaryBatchIter(
                    self.subtree_adversary,
                    self.model,
                    AdversaryBatchIter(
                        self.rename_adversary,
                        self.model,
                        train_iter,
                        num_samples=1,
                        adv_mode=train_adv_mode,
                    ),
                )
                train_base_model(
                    self.model,
                    self.dataset,
                    10,
                    RobustModelBatchIter(model_eval, adversarial_train_iter),
                    [valid_iter],
                    verbose=False,
                )
                success = True
            else:
                success = self.__train_inner(
                    train_iter,
                    valid_iter,
                    num_epochs=num_epochs,
                    train_adv_mode=train_adv_mode,
                    min_nonabstained=min_nonabstained,
                )

            if not success:
                # Report the failure and return right away: waiting for a
                # key press here would hang unattended runs (or raise
                # EOFError when stdin is closed).
                Logger.debug("model train failed")
                return False

        # we cannot reuse the iterators from calling __train_inner as these are shuffled
        if self.edge_filter is not None:
            Logger.debug("Original Edges: #{}".format(number_of_edges(train_iter)))
            f_train_iter = FilteredGraphIterator.from_iter(train_iter, self.edge_filter)
            Logger.debug("Filtered Edges: #{}".format(number_of_edges(f_train_iter)))
            f_valid_iter = FilteredGraphIterator.from_iter(valid_iter, self.edge_filter)
            f_test_iter = (
                FilteredGraphIterator.from_iter(test_iter, self.edge_filter)
                if test_iter is not None
                else None
            )
        else:
            f_train_iter, f_valid_iter, f_test_iter = train_iter, valid_iter, test_iter

        Logger.debug("Train Accuracy:")
        train_stats = self.model.accuracy_with_reject(
            f_train_iter,
            self.dataset.TARGET,
            self.dataset.reject_token_id,
            self.threshold,
        )
        Logger.debug("Valid Accuracy:")
        self.valid_stats = self.model.accuracy_with_reject(
            f_valid_iter,
            self.dataset.TARGET,
            self.dataset.reject_token_id,
            self.threshold,
        )
        if test_iter is not None:
            # The test statistics are not stored; the call is made for
            # whatever accuracy_with_reject reports on its own.
            Logger.debug("Test Accuracy:")
            self.model.accuracy_with_reject(
                f_test_iter,
                self.dataset.TARGET,
                self.dataset.reject_token_id,
                self.threshold,
            )

        train_prec = train_stats["mask_valid_noreject_acc"]
        valid_prec = self.valid_stats["mask_valid_noreject_acc"]
        Logger.debug(f"train_prec: {train_prec}, valid_prec: {valid_prec}")

        if apply_model:
            self.apply(train_iter, f_train_iter, is_train=True)
            # NOTE(review): the validation iterator is also applied with
            # is_train=True, unlike the test iterator — confirm intended.
            self.apply(valid_iter, f_valid_iter, is_train=True)
            if test_iter is not None:
                self.apply(test_iter, f_test_iter)
        return True
Пример #26
0
 def print_stat(self):
     """Log each entry of ``self.stat`` as a percentage of the first entry.

     When the first entry is zero every value is reported as 0.
     """
     percentages = []
     for count in self.stat:
         total = self.stat[0]
         share = 100.0 * count / total if total != 0 else 0
         percentages.append("{:6.2f}%".format(share))
     Logger.debug("nodes entering step: {}".format(" ".join(percentages)))
Пример #27
0
def train_model(
    model: NeuralModelBase,
    dataset: Dataset,
    num_epochs,
    train_iter,
    valid_iter,
    lr=0.001,
    weight=None,
    target_o=1.0,
):
    """Train ``model`` with a rejection loss whose ``o`` parameter is annealed.

    The schedule has three phases of ``num_epochs // 2`` epochs each:
    ``o`` is interpolated from ``o_base`` towards 1.0, then from the midpoint
    between 1.0 and the last value of the first phase towards ``target_o``,
    and finally held at ``target_o``.

    Args:
        model: Model to train; its ``loss_function`` and ``opt`` attributes
            are overwritten.
        dataset: Dataset providing ``TARGET`` and ``reject_token_id``.
        num_epochs: Integer >= 2 controlling the length of each phase.
        train_iter: Iterator over the training samples.
        valid_iter: Iterator over the validation samples.
        lr: Learning rate of the Adam optimizer.
        weight: Passed through to ``RejectionCrossEntropyLoss``.
        target_o: Final value of the loss parameter ``o``.

    Returns:
        Tuple ``(train_prec, valid_prec)`` read from the
        ``"mask_valid_noreject_acc"`` entry of the accuracy statistics; the
        validation value is the one measured after the last epoch.

    Raises:
        ValueError: If ``num_epochs`` is smaller than 2, because every
            schedule phase would be empty.
    """
    phase_len = num_epochs // 2
    if phase_len < 1:
        # Fail before any side effect instead of dividing by zero later.
        raise ValueError(
            "num_epochs must be at least 2, got {}".format(num_epochs)
        )

    opt = optim.Adam(model.parameters(), lr=lr)
    Logger.start_scope("Training Model")

    # The original note lists 'reject', '<unk>', '<pad>' as the excluded
    # vocabulary entries, yet 4 are subtracted.
    # NOTE(review): confirm which fourth entry is meant.
    o_base = len(dataset.TARGET.vocab) - 4
    loss_function = RejectionCrossEntropyLoss(
        o_base,
        len(dataset.TARGET.vocab),
        dataset.reject_token_id,
        reduction="none",
        weight=weight,
    )
    model.loss_function = loss_function
    model.opt = opt

    # Interpolation factors 1.0, 1 - 1/phase_len, ... (0.0 excluded).
    # linspace guarantees exactly phase_len factors, whereas arange with a
    # fractional step may emit an extra element because of rounding.
    fractions = np.linspace(1.0, 0.0, num=phase_len, endpoint=False)
    schedule = [f * o_base + (1 - f) * 1.0 for f in fractions]
    second_phase_start = (1.0 + schedule[-1]) / 2
    schedule += [f * second_phase_start + (1 - f) * target_o for f in fractions]
    schedule += [target_o] * phase_len

    for epoch, o_upper in enumerate(schedule):
        Logger.start_scope("Epoch {}, o_upper={:.3f}".format(epoch, o_upper))
        loss_function.o = o_upper
        model.fit(train_iter, opt, loss_function, mask_field="mask_valid")

        valid_stats = model.accuracy(valid_iter, dataset.TARGET)
        valid_prec = valid_stats["mask_valid_noreject_acc"]
        Logger.debug(f"valid_prec: {valid_prec}")
        Logger.end_scope()

    train_stats = model.accuracy(train_iter, dataset.TARGET, verbose=False)
    train_prec = train_stats["mask_valid_noreject_acc"]
    Logger.debug(f"train_prec: {train_prec}, valid_prec: {valid_prec}")
    Logger.end_scope()
    return train_prec, valid_prec
Пример #28
0
def robust_multi(
    args, dataset: Dataset, device: torch.device, masks, max_models=20, model_id=0
):
    """Train (or load) the final "base" model of a robust model sequence.

    Although the loop iterates over ``max_models`` indices, every index except
    the last one is skipped, so at most one ``RobustModel`` is handled per
    call. If a stored model can be loaded it is only evaluated and applied;
    otherwise it is trained and, when ``model_id`` is not None, saved before
    the process exits.

    Args:
        args: Run configuration; ``model``, ``num_epochs``, ``train_adv_mode``
            and ``min_nonabstained`` are read here.
        dataset: Dataset the iterators and models are built from.
        device: Device passed to the iterator and model factories.
        masks: Passed to the iterator factories.
        max_models: Number of models in the sequence; the last index is the
            base model.
        model_id: Identifier used for loading/saving; None disables both.

    Returns:
        The result of ``eval(args, dataset, device, masks, ...)``.
        NOTE(review): ``eval`` is presumably a project function that shadows
        the builtin — confirm.
    """
    train_iter, valid_iter, test_iter = Iterators.make(
        args, Models[args.model], dataset, device, masks
    )

    # make_model looks up train_iter when it is called, so it sees the
    # iterator that is rebound further below, not only the one created above.
    def make_model():
        return Models.make(args, dataset, device, train_iter)

    adversary, subtree_adversary = make_adversary(
        dataset,
        functools.partial(
            Iterators.make_single,
            args,
            Models[args.model],
            device,
            masks,
            dataset.EDGES,
        ),
    )

    # Pre-seed the counter so the accumulation loop below iterates over
    # exactly these two keys.
    stats = collections.Counter()
    stats["mask_valid_noreject_correct"] = 0
    stats["mask_valid_noreject_predicted"] = 0
    models = []
    for idx in range(max_models):
        # last model is trained without threshold to predict all the remaining samples
        base_model = (idx + 1) == max_models
        # Every non-final index is skipped: only the base model is processed.
        if not base_model:
            continue
        Logger.debug("Training iter: {}, base_model: {}".format(idx, base_model))
        # Stop when mask_count reports 0 for the training iterator.
        if mask_count(train_iter) == 0:
            break

        model = RobustModel(
            make_model,
            dataset,
            idx=idx,
            rename_adversary=adversary,
            subtree_adversary=subtree_adversary,
            base_model=base_model,
        )

        if model_id is not None and model.load(args, model_id):
            # TODO: refactor, model is loaded but it needs to be applied on the iterator
            model.train(
                train_iter,
                valid_iter,
                num_epochs=args.num_epochs,
                test_iter=test_iter,
                apply_model=True,  # base_model=base_model,
                train_adv_mode=args.train_adv_mode,
                loaded=True,
            )
        else:
            Logger.debug(
                "Train positions to predict: {}".format(mask_count(train_iter))
            )
            Logger.debug(
                "Valid positions to predict: {}".format(mask_count(valid_iter))
            )

            model_eval = None
            if base_model:
                # reset iterators
                train_iter, valid_iter, test_iter = Iterators.make(
                    args, Models[args.model], dataset, device, masks
                )

                # Load the preceding max_models - 1 models; the evaluator is
                # handed to model.train below.
                model_eval = RobustModelEval(subtree_adversary)
                model_eval.load_models(
                    make_model,
                    dataset,
                    adversary,
                    subtree_adversary,
                    args,
                    model_id,
                    max_models=max_models - 1,
                    last_base=False,
                )

                # train_iter = RobustModelBatchIter(model_eval, train_iter)

            # A falsy result from train() aborts the loop without recording
            # the model.
            if not model.train(
                train_iter,
                valid_iter,
                num_epochs=args.num_epochs,
                test_iter=test_iter,
                apply_model=True,  # base_model=base_model,
                train_adv_mode=args.train_adv_mode,
                min_nonabstained=args.min_nonabstained,
                model_eval=model_eval,
            ):
                break

            # After saving a freshly trained model the whole process
            # terminates; the statistics and eval() below are not reached.
            if model_id is not None:
                model.save(args, model_id)
                exit(0)

        models.append(model)

        # Accumulate the validation counts of the handled model.
        for key in stats.keys():
            stats[key] += model.valid_stats[key]

        Logger.debug(
            "Valid Accuracy: {}/{} ({:.2f}%)".format(
                stats["mask_valid_noreject_correct"],
                stats["mask_valid_noreject_predicted"],
                acc(
                    stats["mask_valid_noreject_correct"],
                    stats["mask_valid_noreject_predicted"],
                ),
            )
        )

    return eval(args, dataset, device, masks, max_models=max_models, model_id=model_id)
Пример #29
0
def compute_edge_filter(
    it, model, dataset, loss_function, threshold=0.5, verbose=False
):
    """Build an edge filter by solving a flow problem over sampled nodes.

    For every result of ``each_node_grads`` a sub-graph around the source
    node is extracted, its edges are turned into arcs with features, and the
    sample is added to an ``EdgeOptimizer``. The optimizer is solved once at
    the end.

    Args:
        it: Iterator forwarded to ``each_node_grads``.
        model: Model forwarded to ``each_node_grads``.
        dataset: Dataset forwarded to ``each_node_grads`` and used when
            printing the resulting filter.
        loss_function: Loss forwarded to ``each_node_grads``.
        threshold: Threshold forwarded to ``each_node_grads``.
        verbose: When true, debug information is collected per sample and
            each sample is additionally solved on its own.

    Returns:
        The edge filter returned by ``EdgeOptimizer.solve``.
    """
    timers = collections.defaultdict(Timer)
    edge_optimizer = EdgeOptimizer()
    for node_grads in each_node_grads(
        it, model, dataset, loss_function, threshold=threshold, max_samples=30
    ):
        # Keep the targets whose probability exceeds 0.1; when there are
        # none, fall back to the first three entries.
        confident = node_grads.probs > 0.1
        if torch.any(confident):
            tgt_nodes = (
                torch.masked_select(node_grads.tgt_nodes, confident).cpu().numpy()
            )
            probs = torch.masked_select(node_grads.probs, confident).cpu().numpy()
        else:
            tgt_nodes = node_grads.tgt_nodes[:3].cpu().numpy()
            probs = node_grads.probs[:3].cpu().numpy()

        if len(tgt_nodes) == 0:
            Logger.debug(
                "Empty target nodes: src: {}, tgt_nodes: {}, {}".format(
                    node_grads.src_node, node_grads.tgt_nodes, node_grads.probs
                )
            )
            continue

        debug_info = "" if verbose else None

        timers["nodes"].start()
        # Collect all nodes reachable from the source within the distance of
        # the farthest selected target.
        depth = max(
            nx.shortest_path_length(
                node_grads.rev_tree_nx, source=node_grads.src_node, target=tgt
            )
            for tgt in tgt_nodes
        )
        nodes = [node_grads.src_node] + list(
            itertools.chain.from_iterable(
                successors
                for _, successors in nx.bfs_successors(
                    node_grads.rev_tree_nx,
                    source=node_grads.src_node,
                    depth_limit=depth,
                )
            )
        )
        # Set view of the same nodes for constant-time membership tests.
        node_set = set(nodes)
        assert all(tgt_node in node_set for tgt_node in tgt_nodes)
        if verbose:
            debug_info += "nodes: {}\n".format(nodes)
        timers["nodes"].stop()

        timers["edges"].start()
        # Keep only edges whose both endpoints are among the collected nodes.
        edges = [
            (i, j)
            for (i, j) in node_grads.tree_nx.edges(nodes)
            if i in node_set and j in node_set
        ]
        if verbose:
            debug_info += "edges: {}\n".format(edges)
        timers["edges"].stop()

        timers["arcs"].start()
        features = EdgeFilter.edge_features(
            edges, node_grads.tree, debug_info=debug_info
        )
        arcs = {}
        for (i, j), feature in zip(edges, features):
            if i == j:
                # split self-loops into new nodes
                # Needed when using self-loops as otherwise the same node both generates and consumes inflow
                i = "{}r".format(i)
            arcs[(str(i), str(j))] = feature  # '{}_{}'.format(node_type, edge_type)
        if verbose:
            debug_info += "arcs: {}\n".format(arcs)
            debug_info += "features: {}\n".format(features)
        timers["arcs"].stop()

        # update list of nodes with newly generated ones
        nodes = set()
        for i, j in arcs:
            nodes.add(i)
            nodes.add(j)
        if verbose:
            debug_info += "nodes: {}\n".format(nodes)

        # A target equal to the source is addressed through its split
        # "<id>r" node so it can receive inflow.
        tgt_nodes = [
            str(v) if v != node_grads.src_node else "{}r".format(v) for v in tgt_nodes
        ]
        inflow = {
            tgt_node: int(p * 100)
            for tgt_node, p in zip(tgt_nodes, probs)
            if tgt_node in nodes
        }
        # The source supplies exactly what the targets consume.
        inflow[str(node_grads.src_node)] = -sum(inflow.values())
        if verbose:
            debug_info += "inflow: {}\n".format(inflow)
        if not arcs:
            continue
        edge_optimizer.add_sample(nodes, arcs, inflow)

        if verbose:
            # Solve this sample in isolation, passing along the collected
            # debug information.
            edge_optimizer_tmp = EdgeOptimizer()
            edge_optimizer_tmp.add_sample(nodes, arcs, inflow)
            edge_optimizer_tmp.solve(debug_info=debug_info)

    for key, timer in timers.items():
        Logger.debug("{}: {}".format(key, timer))
    edge_filter = edge_optimizer.solve()
    edge_filter.print(dataset=dataset)  # , edge_gen=it.edge_gen)
    return edge_filter
Пример #30
0
 def __init__(self, it):
     """Cache a clone of every batch yielded by ``it`` in ``self.batches``."""
     Logger.debug("Caching Batches")
     # The source iterator is consumed once, here; tqdm wraps it while the
     # batches are being cloned.
     self.batches = [batch.clone() for batch in tqdm.tqdm(it)]