Example #1
def lbfgsb(loss,
           max_vals,
           min_vals=None,
           weights=None,
           deviation_tol=1.E-6,
           verbose=True,
           **kwargs):
    if min_vals is None:
        min_vals = [0] * len(max_vals)
    if weights is None:
        weights = [(min_val + max_val) / 2
                   for min_val, max_val in zip(min_vals, max_vals)]
    import scipy.optimize
    import scipy.sparse
    if verbose:
        utils.log("Optimizing with LBFGSB")
    ret = scipy.optimize.minimize(
        loss,
        bounds=[(min_val, max_val)
                for min_val, max_val in zip(min_vals, max_vals)],
        method='L-BFGS-B',
        x0=weights,
        options={"ftol": deviation_tol})
    if verbose:
        utils.log()
    return ret.x
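
A minimal usage sketch for the wrapper above, assuming the surrounding pygrank module (which provides utils.log) is importable and scipy is installed; the quadratic toy loss below is purely illustrative.

def example_quadratic_loss(p):
    # convex toy loss with its minimum at p = [1, 0.5]
    return (p[0] - 1) ** 2 + (p[1] - 0.5) ** 2

weights = lbfgsb(example_quadratic_loss, max_vals=[2, 2], deviation_tol=1.E-8)
# weights should approach [1, 0.5] within the requested tolerance
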
Example #2
def nelder_mead(loss,
                max_vals,
                min_vals=None,
                weights=None,
                deviation_tol=1.E-6,
                parameter_tol: float = float('inf'),
                verbose=True,
                **kwargs):
    if min_vals is None:
        min_vals = [0] * len(max_vals)
    if weights is None:
        weights = [(min_val + max_val) / 2
                   for min_val, max_val in zip(min_vals, max_vals)]
    import scipy.optimize
    import scipy.sparse
    if verbose:
        utils.log("Optimizing with Nelder-Mead")
    ret = scipy.optimize.minimize(
        loss,
        bounds=[(min_val, max_val)
                for min_val, max_val in zip(min_vals, max_vals)],
        method='Nelder-Mead',
        x0=weights,
        options={
            "fatol": deviation_tol,
            "xatol": parameter_tol
        })
    if verbose:
        utils.log()
    return ret.final_simplex[0][0]
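
The derivative-free variant is called the same way; this sketch (same assumptions as above) uses a non-smooth toy loss, for which Nelder-Mead is a reasonable choice.

def example_absolute_loss(p):
    # non-smooth toy loss with its minimum at p = [1, 0.5]
    return abs(p[0] - 1) + abs(p[1] - 0.5)

weights = nelder_mead(example_absolute_loss, max_vals=[2, 2], deviation_tol=1.E-8)
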
Example #3
def _gnn_train_torch(model,
                     features,
                     graph,
                     labels,
                     training,
                     validation,
                     optimizer=None,
                     patience=100,
                     epochs=10000,
                     test=None,
                     verbose=False):
    import torch
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=0.01) if optimizer is None else optimizer
    remaining_patience = patience
    test = validation if test is None else test
    labels = torch.FloatTensor(labels)
    features = torch.FloatTensor(features)
    training = torch.LongTensor(training)
    test = torch.LongTensor(test)
    validation = torch.LongTensor(validation)
    best_loss = float('inf')
    for epoch in range(epochs):
        optimizer.zero_grad()
        predictions = model(features, graph, training=True)
        loss = _gnn_cross_entropy_torch(labels, predictions,
                                        training) + model.loss
        loss.backward()
        optimizer.step()
        loss = _gnn_cross_entropy_torch(labels, predictions, validation)
        remaining_patience -= 1
        if loss < best_loss:
            remaining_patience = patience
            best_loss = loss
            torch.save(model.state_dict(), "_pygrank_torch_state.pt")
            if verbose:  # pragma: no cover
                utils.log(
                    f"Epoch {epoch} loss {loss} acc {float(_gnn_accuracy_torch(labels, predictions, test)):.3f}"
                )
        if remaining_patience == 0:
            break
    if verbose:
        utils.log()
    model.load_state_dict(torch.load("_pygrank_torch_state.pt"))
    model.eval()
    import os
    os.remove("_pygrank_torch_state.pt")
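
A hypothetical end-to-end sketch of how this trainer might be invoked. The TwoLayer model, the random data, and the index splits are placeholders, and _gnn_cross_entropy_torch, _gnn_accuracy_torch, and utils.log are assumed to be provided by the surrounding module.

import numpy as np
import torch

class TwoLayer(torch.nn.Module):
    # placeholder stand-in for a GNN; it ignores the graph argument
    def __init__(self, num_features, num_classes):
        super().__init__()
        self.linear1 = torch.nn.Linear(num_features, 16)
        self.linear2 = torch.nn.Linear(16, num_classes)
        self.loss = 0  # regularization term expected by the trainer

    def forward(self, features, graph, training=False):
        return torch.softmax(self.linear2(torch.relu(self.linear1(features))), dim=1)

features = np.random.rand(100, 8)                   # 100 nodes, 8 features each
labels = np.eye(3)[np.random.randint(0, 3, 100)]    # one-hot labels over 3 classes
training, validation = list(range(60)), list(range(60, 100))
_gnn_train_torch(TwoLayer(8, 3), features, graph=None, labels=labels,
                 training=training, validation=validation, epochs=50)
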
Example #4
def _gnn_train_tf(model,
                  features,
                  graph,
                  labels,
                  training,
                  validation,
                  optimizer=None,
                  patience=100,
                  epochs=10000,
                  test=None,
                  verbose=False):
    import tensorflow as tf
    optimizer = tf.optimizers.Adam(
        learning_rate=0.01) if optimizer is None else optimizer
    best_loss = float('inf')
    best_params = None
    test = validation if test is None else test
    remaining_patience = patience
    for epoch in range(epochs):
        with tf.GradientTape() as tape:
            predictions = model(features, graph, training=True)
            loss = _gnn_cross_entropy_tf(labels, predictions, training)
            loss = loss + tf.reduce_sum(model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        predictions = model(features, graph, training=False)
        loss = _gnn_cross_entropy_tf(labels, predictions, validation)
        remaining_patience -= 1
        if loss < best_loss:
            remaining_patience = patience
            best_loss = loss
            best_params = [
                tf.identity(param) for param in model.trainable_variables
            ]
            if verbose:  # pragma: no cover
                utils.log(
                    f"Epoch {epoch} loss {loss} acc {float(_gnn_accuracy_tf(labels, predictions, test)):.3f}"
                )
        if remaining_patience == 0:
            break
    if verbose:
        utils.log()
    for variable, best_value in zip(model.trainable_variables, best_params):
        variable.assign(best_value)
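
A mirror of the previous sketch for the tensorflow trainer; again the model and data are placeholders, and _gnn_cross_entropy_tf and _gnn_accuracy_tf are assumed to live in the same module and to accept the label array and index lists shown here.

import numpy as np
import tensorflow as tf

class TwoLayerTF(tf.keras.Model):
    # placeholder stand-in for a GNN; it ignores the graph argument
    def __init__(self, num_classes):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(16, activation="relu")
        self.dense2 = tf.keras.layers.Dense(num_classes, activation="softmax")

    def call(self, features, graph=None, training=False):
        return self.dense2(self.dense1(features))

features = np.random.rand(100, 8).astype(np.float32)
labels = np.eye(3)[np.random.randint(0, 3, 100)].astype(np.float32)
training, validation = list(range(60)), list(range(60, 100))
_gnn_train_tf(TwoLayerTF(3), features, graph=None, labels=labels,
              training=training, validation=validation, epochs=50)
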
Example #5
def optimize(loss,
             max_vals=[1 for _ in range(1)],
             min_vals=None,
             deviation_tol: float = 1.E-9,
             divide_range: float = 1.01,
             partitions=5,
             parameter_tol: float = float('inf'),
             depth: int = 1,
             coarse: float = 0,
             shrink_strategy: str = "divide",
             partition_strategy: str = "split",
             randomize: bool = False,
             weights=None,
             verbose: bool = True,
             validation_loss=None):
    """
    Implements a coordinate descent algorithm for optimizing the argument vector of the given loss function.
    Arguments:
        loss: The loss function. Could be an expression of the form `lambda p: f(p)` where f takes a list as an argument.
        max_vals: Optional. The maximum value for each parameter to search for. Helps determine the number of parameters.
            Default is a list of ones for one parameter.
        min_vals: Optional. The minimum value for each parameter to search for. If None (default) it becomes a list of
            zeros and equal length to max_vals.
        deviation_tol: Optional. The numerical tolerance of the loss to optimize to. Default is 1.E-9.
        divide_range: Optional. Value greater than 1 with which to divide the range at each iteration. Default is 1.01,
            which guarantees convergence even for difficult-to-optimize functions, but values such as 1.1, 1.2 or 2 may
            also be used for much faster, albeit a little coarser, convergence. If the *shrink_strategy* argument
            is set to "shrinking" instead, the range is scaled proportionally to
            *iteration<sup>divide_range</sup>/log(iteration)* per block coordinate descent.
        partitions: Optional. In how many pieces to break the search space on each iteration. Default is 5.
        parameter_tol: Optional. The numerical tolerance of parameter values to optimize to. **Both** this and
            deviation_tol need to be met. Default is infinity.
        depth: Optional. Declares the number of times to re-perform the optimization given the previous found solution.
            Default is 1, which only runs the optimization once. Larger depth values can help offset coarseness
            introduced by divide_range.
        coarse: Optional. Snaps the solution to this precision. If 0 (default) then this behavior is ignored.
        shrink_strategy: Optional. The shrinking strategy towards convergence. If "divide" (default), then
            the search range is divided by the argument *divide_range*, but if "shrinking" then it is
            scaled based on block coordinate descent.
        partition_strategy: Optional. Strategy with which to traverse partitions. If "split" (default), then
            the partition is split to *partitions* parts. If "step", then the *partitions* argument is used as a fixed
            step and however many splits are needed to achieve this are performed. This last strategy helps
            force block coordinate descent traverse a finite set of values, as long as it holds that
            **coarse==partitions**.
        randomize: Optional. If True, a random parameter is updated at each step instead of moving through
            them in a cyclic order. Default is False.
        weights: Optional. An estimation of parameters to start optimization from. The algorithm tries to center
            solution search around these - hence the usefulness of *depth* as an iterative scheme. If None (default),
            the center of the search range (max_vals+min_vals)/2 is used as a starting estimation.
        verbose: Optional. If True (default), optimization outputs its intermediate steps.
        validation_loss: Optional. If provided, the best parameters found during the search are selected
            according to this loss instead of the training loss. Default is None.
    Example:
        >>> import pygrank as pg
        >>> p = pg.optimize(loss=lambda p: (1.5-p[0]+p[0]*p[1])**2+(2.25-p[0]+p[0]*p[1]**2)**2+(2.625-p[0]+p[0]*p[1]**3)**2, max_vals=[4.5, 4.5], min_vals=[-4.5, -4.5])
        >>> # desired optimization point for the Beale function of this example is [3, 0.5]
        >>> print(p)
        [3.000000052836577, 0.5000000141895036]
    """
    if min_vals is None:
        min_vals = [0 for _ in max_vals]
    for min_val, max_val in zip(min_vals, max_vals):
        if min_val > max_val:
            raise Exception("Empty parameter range [" + str(min_val) + "," +
                            str(max_val) + "]")
    if str(divide_range) != "shrinking" and divide_range <= 1:
        raise Exception(
            "divide_range should be greater than 1, otherwise the search space never shrinks."
        )
    if weights is None:
        weights = [(min_val + max_val) / 2
                   for min_val, max_val in zip(min_vals, max_vals)]
    range_search = [(max_val - min_val) / 2
                    for min_val, max_val in zip(min_vals, max_vals)]
    curr_variable = 0
    iter = 0
    range_deviations = [float('inf')] * len(max_vals)
    best_weights = weights
    best_loss = float('inf')
    evals = 0
    while True:
        if randomize:
            curr_variable = int(random() * len(weights))
        if max(range_search) == 0:
            break
        assert max(
            range_search
        ) != 0, "Something went wrong and took too many iterations for optimizer to run (check for nans)"
        if shrink_strategy == "shrinking":
            range_search[curr_variable] = (
                max_vals[curr_variable] - min_vals[curr_variable]) / (
                    (iter + 1)**divide_range * log(iter + 2))
        elif shrink_strategy == "divide":
            range_search[curr_variable] /= divide_range
        else:
            raise Exception(
                "Invalid shrink strategy: either shrinking or divide expected")
        if range_search[curr_variable] == 0:
            range_deviations[curr_variable] = 0
            curr_variable += 1
            if curr_variable >= len(max_vals):
                curr_variable -= len(max_vals)
            continue
        if partition_strategy == "split":
            candidate_weights = [
                __add(weights,
                      curr_variable,
                      range_search[curr_variable] * (part * 2. /
                                                     (partitions - 1) - 1),
                      max_vals[curr_variable],
                      min_vals[curr_variable],
                      coarse=coarse) for part in range(partitions)
            ]
        elif partition_strategy == "step":
            candidate_weights = [
                __add(weights,
                      curr_variable,
                      part * partitions,
                      max_vals[curr_variable],
                      min_vals[curr_variable],
                      coarse=coarse) for part in range(
                          -int(range_search[curr_variable] / partitions), 1 +
                          int(range_search[curr_variable] / partitions))
            ]
        else:
            raise Exception(
                "Invalid partition strategy: either split or step expected")
        loss_pairs = [(w, loss(w)) for w in candidate_weights if w is not None]
        evals += len(loss_pairs)
        weights, weights_loss = min(loss_pairs, key=lambda pair: pair[1])
        prev_best_loss = best_loss
        if validation_loss is not None:
            weights_loss = validation_loss(weights)
            if weights_loss < best_loss:
                best_loss = weights_loss
                best_weights = weights
        else:
            best_loss = weights_loss
            best_weights = weights
        range_deviations[curr_variable] = abs(prev_best_loss - best_loss)
        if verbose:
            utils.log(
                f"Tuning evaluations {evals} loss {best_loss:.8f} +- {max(range_deviations):.8f}"
            )

        if max(range_deviations) <= deviation_tol and max(
                range_search) <= parameter_tol:
            break
        # move to next var
        iter += 1
        curr_variable += 1
        if curr_variable >= len(max_vals):
            curr_variable -= len(max_vals)
    #print("trained weights in", iter, "iterations", weights, "final loss", loss(weights))
    weights = best_weights
    if verbose:
        utils.log()
    if depth > 1:
        return optimize(loss, max_vals, min_vals, deviation_tol, divide_range,
                        partitions, parameter_tol, depth - 1, coarse,
                        shrink_strategy, partition_strategy, randomize,
                        weights, verbose, validation_loss)
    return weights
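
The docstring's Beale example covers the default settings. The sketch below is illustrative only; it exercises the coarse/step combination described above so that coordinate descent only visits points on a fixed 0.25 grid.

p = optimize(loss=lambda p: (p[0] - 0.4) ** 2 + (p[1] - 0.8) ** 2,
             max_vals=[1, 1],
             partitions=0.25,
             coarse=0.25,
             partition_strategy="step")
# the returned parameters lie on the 0.25 grid, near [0.5, 0.75] for this toy loss
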
Example #6
def import_snap_format_dataset(dataset: str,
                               path: Union[Iterable[str], str] = (os.path.join(os.path.expanduser('~'), '.pygrank/data'), ".", "data"),
                               pair_file: str = 'pairs.txt',
                               group_file: str = 'groups.txt',
                               directed: bool = False,
                               min_group_size: float = 0.01,
                               max_group_number: int = 20,
                               graph_api=nx,
                               verbose=True):
    """
    Imports a dataset of the SNAP format.
    Args:
        dataset: The name of the dataset to be loaded. If it matches a known dataset name, it is automatically
            downloaded to *path* when not found there.
        path: The dataset's path in which *dataset* is a folder, or a list of paths in which to search.
            The first of these will be set as the preferred download location if the dataset is not
            found and can be downloaded. Default is a list comprising the path where pygrank's settings file resides,
            "." and "data".
        pair_file: Optional. The rows of the file *[path]/[dataset]/pair_file* should contain pairs of whitespace-separated
            node names. Default is "pairs.txt".
        group_file: Optional. The rows of the file *[path]/[dataset]/group_file* should contain lists of whitespace-separated
            node names. Default is "groups.txt".
        directed: Whether a directed or undirected graph should be returned. Default is False.
        min_group_size: Optional. The minimum group size to be considered for inclusion in groups. Can be either a
            number less than 1 to indicate group size as a fraction of the graph's nodes, or an integer to denote an
            absolute number of nodes. Default is 0.01, meaning that groups comprising at least 1% of graph nodes are considered.
        max_group_number: Limits the numbers of found groups to be up to that number. Default is 20.
        graph_api: The library used to construct the graph. Either `networkx` or `pygrank.fastgraph` are supported.
        verbose: Whether to show intermediate status for lengthy loading. These messages use carriage return
            to eventually disappear. Default is True.

    Returns:
        graph: A graph of node relations.
        groups: A dictionary whose values are lists of group node members.
    """
    path = _select_path(path, dataset)
    download_dataset(dataset, path=path)
    if verbose:
        utils.log(f"Loading {dataset} graph")
    G = (graph_api.DiGraph() if hasattr(graph_api, "DiGraph") else graph_api.Graph(directed)) if directed else graph_api.Graph()
    groups = {}
    with open(path+'/'+dataset+'/'+pair_file, 'r', encoding='utf-8') as file:
        for line in file:
            if len(line) != 0 and line[0] != '#':
                splt = line[:-1].split()
                if len(splt) > 1:
                    G.add_edge(splt[0], splt[1])
    if min_group_size < 1:
        min_group_size *= len(G)
    if verbose:
        utils.log(f"Loading {dataset} communities")
    if group_file is not None and os.path.isfile(path+'/'+dataset+'/'+group_file):
        with open(path+'/'+dataset+'/'+group_file, 'r', encoding='utf-8') as file:
            for line in file:
                if line[0] != '#':
                    group = [item for item in line[:-1].split() if len(item) > 0 and item in G]
                    if len(group) >= min_group_size:
                        groups[len(groups)] = group
                        if verbose:
                            utils.log(f"Loaded {dataset} communities {len(groups)}/{max_group_number}")
                        if len(groups) >= max_group_number:
                            break
    if verbose:
        utils.log()
    return G, groups
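
An illustrative call of the loader above; the dataset name is hypothetical and stands for any local folder or downloadable dataset in the SNAP format.

graph, groups = import_snap_format_dataset("citeseer",
                                           min_group_size=0.05,
                                           max_group_number=10)
print(len(graph), "nodes and", len(groups), "communities")
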
Example #7
def download_dataset(dataset,
                     path: str = os.path.join(os.path.expanduser('~'),
                                              '.pygrank/data'),
                     verbose=True):  # pragma: no cover
    dataset = dataset.lower()
    if dataset not in datasets:
        return
    source = datasets[dataset] if isinstance(dataset, str) else dataset
    credentials = "REQUIRED CITATION: Please visit the url "+source["url"]\
                  + " for instructions on how to cite the dataset "+dataset+" in your research"
    print(credentials, file=sys.stderr)
    sys.stderr.flush()
    if verbose:
        utils.log("Downloading " + dataset + " into " + path)
    if not os.path.isdir(path):
        os.mkdir(path)
    download_path = os.path.join(path, dataset)
    if not os.path.isdir(download_path):
        os.mkdir(download_path)
        if "all" in source:
            all_path = download_path + "/all." + source["all"].split(".")[-1]
            wget.download(source["all"], all_path)
            try:
                tarfile.open(all_path, 'r').extractall(download_path + "/")
            except tarfile.ReadError:
                with gzip.open(all_path, 'rb') as f_in:
                    with open(download_path + "/all.txt", 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)
            os.remove(all_path)
        if "script" in source:
            source["script"](path)

        if "pairs" in source:
            if source["pairs"].startswith("http"):
                pairs_path = download_path + "/pairs." + source["pairs"].split(
                    ".")[-1]
                wget.download(source["pairs"], pairs_path)
                if pairs_path.split(".")[-1] not in ["txt", "csv"]:
                    with gzip.open(pairs_path, 'rb') as f_in:
                        with open(download_path + "/pairs.txt", 'wb') as f_out:
                            shutil.copyfileobj(f_in, f_out)
                    os.remove(pairs_path)
            else:
                shutil.move(download_path + "/" + source["pairs"],
                            download_path + "/pairs.txt")
        if "pair_process" in source:
            pairs = list()
            with open(download_path + "/pairs.txt", "r") as file:
                for line in file:
                    pair = source["pair_process"](line[:-1].split())
                    if pair is not None:
                        pairs.append(pair)
            os.remove(download_path + "/pairs.txt")
            with open(download_path + "/pairs.txt", "w") as file:
                for pair in pairs:
                    file.write(pair[0] + "\t" + pair[1] + "\n")

        if "node2group" in source:
            groups = dict()
            with open(download_path + "/pairs.txt", "r") as file:
                for line in file:
                    pair = line[:-1].split()
                    for node in pair[0:1]:
                        group = source["node2group"](node)
                        if group is not None:
                            if group not in groups:
                                groups[group] = list()
                            groups[group].append(node)
            with open(download_path + "/groups.txt", "w") as file:
                for group in groups.values():
                    if len(group) > 1:
                        file.write(" ".join(group) + "\n")

        if "features" in source and "groups" not in source:
            features_path = download_path + "/" + source["features"]
            groups = dict()
            features = dict()
            with open(features_path) as features_file:
                for line in features_file:
                    line = line[:-1].split()
                    if "feature_process" in source:
                        line = source["feature_process"](line)
                        if line is None:
                            continue
                    node_id = line[0]
                    group = line[-1]
                    if group not in groups:
                        groups[group] = list()
                    groups[group].append(node_id)
                    features[node_id] = [val.strip() for val in line[1:-1]]
            groups = {
                group: nodes
                for group, nodes in groups.items() if len(nodes) > 1
            }
            with open(download_path + '/groups.txt', 'w',
                      encoding='utf-8') as file:
                for g in groups.values():
                    for uid in g:
                        file.write(str(uid) + '\t')
                    file.write('\n')
            with open(download_path + '/features.txt', 'w',
                      encoding='utf-8') as file:
                for p in features:
                    file.write(str(p) + '\t' + '\t'.join(features[p]) + '\n')

        if "features" in source and "groups" in source:
            if source["features"].startswith("http"):
                pairs_path = download_path + "/features." + source[
                    "features"].split(".")[-1]
                wget.download(source["features"], pairs_path)
                if pairs_path.split(".")[-1] not in ["txt", "csv"]:
                    with gzip.open(pairs_path, 'rb') as f_in:
                        with open(download_path + "/features.txt",
                                  'wb') as f_out:
                            shutil.copyfileobj(f_in, f_out)
                    os.remove(pairs_path)
            else:
                shutil.move(download_path + "/" + source["features"],
                            download_path + "/features.txt")

        if "groups" in source:
            groups_path = download_path + "/groups." + source["groups"].split(
                ".")[-1]
            wget.download(source["groups"], groups_path)
            if groups_path.split(".")[-1] not in ["txt", "csv"]:
                with gzip.open(groups_path, 'rb') as f_in:
                    with open(download_path + "/groups.txt", 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)
                os.remove(groups_path)
        elif "labels" in source:
            labels_path = download_path + "/labels." + source["labels"].split(
                ".")[-1]
            wget.download(source["labels"], labels_path)
            with gzip.open(labels_path, 'rb') as f_in:
                with open(download_path + "/labels.txt", 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
            os.remove(labels_path)
            groups = dict()
            with open(download_path + "/labels.txt", 'r',
                      encoding='utf-8') as file:
                for line in file:
                    if line[0] != '#':
                        splt = line[:-1].split()
                        if len(splt) >= 2:
                            if splt[1] not in groups:
                                groups[splt[1]] = list()
                            groups[splt[1]].append(splt[0])
            with open(download_path + "/groups.txt", 'w',
                      encoding='utf-8') as file:
                for group in groups.values():
                    file.write((" ".join(group)) + "\n")

        if "remove" in source:
            shutil.rmtree(download_path + "/" + source["remove"])
    if verbose:
        utils.log()
    return credentials
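
For completeness, a sketch of calling the downloader directly (the dataset name is again hypothetical); when the name is known to the datasets registry, the call fetches the files into path and returns the citation notice it also prints to stderr.

citation = download_dataset("citeseer", path=os.path.join(".", "data"))
print(citation)
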