def split_cv_all_goterms(ann_obj, folds=5, seed=None, **kwargs):
    """
    Split the positives and negatives of every GO term into cross-validation folds.

    Each term (one row of the annotation matrix) is split on its own, which ensures
    that every fold gets an even share of that term's positives and of its negatives.
    The per-term splits are then written row by row into one training and one
    testing matrix per fold.

    *ann_obj*: annotation object; its *ann_matrix* and *prots* attributes are used
    *folds*: number of folds to split each term's annotations into
    *seed*: passed as the random_state of the KFold splitter. If None, the np.random RandomState will be used
    *kwargs*: accepted so callers can forward their options; not used here

    *returns*: a list with one (train_ann_mat, test_ann_mat) tuple per fold.
        Both are lil matrices with the shape of the original annotation matrix,
        holding 1 for positives, -1 for negatives and 0 elsewhere.
        Rows of terms with fewer than *folds* positives or negatives are left empty.
    """
    ann_matrix = ann_obj.ann_matrix
    prots = ann_obj.prots
    print(
        "Splitting all annotations into %d folds by splitting each GO terms annotations into folds, and then combining them"
        % (folds))
    # TODO there must be a better way to do this than getting the folds in each go term separately
    # but this at least ensures that each GO term has evenly split annotations
    # NOTE: the builtin float is used as the dtype because the np.float alias
    # was removed from NumPy
    ann_matrix_folds = [
        (sparse.lil_matrix(ann_matrix.shape, dtype=float),
         sparse.lil_matrix(ann_matrix.shape, dtype=float))
        for _ in range(folds)]

    for i in trange(ann_matrix.shape[0]):
        positives, negatives = alg_utils.get_goid_pos_neg(ann_matrix, i)
        # KFold gives an error if there are less samples than folds, so skip those terms
        if len(positives) < folds or len(negatives) < folds:
            continue
        # split the set of positives and the set of negatives into K folds separately
        kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
        paired_splits = zip(kf.split(positives), kf.split(negatives))

        # now combine the positive and negative sets into a single array,
        # and store it in the corresponding training or testing matrix
        for fold, ((pos_train_idx, pos_test_idx),
                   (neg_train_idx, neg_test_idx)) in enumerate(paired_splits):
            train_pos, test_pos = positives[pos_train_idx], positives[pos_test_idx]
            train_neg, test_neg = negatives[neg_train_idx], negatives[neg_test_idx]
            train_ann_mat, test_ann_mat = ann_matrix_folds[fold]
            # build an array of positive and negative assignments and set it in corresponding annotation matrix
            for pos, neg, mat in [(train_pos, train_neg, train_ann_mat),
                                  (test_pos, test_neg, test_ann_mat)]:
                pos_neg_arr = np.zeros(len(prots))
                pos_neg_arr[list(pos)] = 1
                pos_neg_arr[list(neg)] = -1
                mat[i] = pos_neg_arr

    return ann_matrix_folds
# Example #2
# 0
def run_cv_all_goterms(
        alg_runners, ann_obj, folds=5, num_reps=1,
        cv_seed=None, **kwargs):
    """
    Split the positives and negatives into folds across all GO terms
    and then run the algorithms on those folds.
    Algorithms are all run on the same split of data.

    *alg_runners*: list of runner objects. For every fold each runner gets its
        ann_obj, train_mat and test_mat replaced and is run through
        setupInputs(), run() and setupOutputs(). Afterwards its goid_scores is
        replaced by the scores combined over all folds.
    *ann_obj*: annotation object with the full annotation matrix, goids and prots
    *folds*: number of cross-validation folds
    *num_reps*: Number of times to repeat cross-validation.
        An output file will be written for each repeat
    *cv_seed*: Seed to use for the random number generator when splitting the annotations into folds
        If *num_reps* > 1, the seed will be incremented by 1 each time
    *kwargs*: forwarded to split_cv_all_goterms and to the evaluation.
        If 'forcealg' is True (or there is only a single term), runners are run
        even if their output file already exists. A missing 'forcealg' is
        treated as not forced.

    *returns*: None. Results are written to one file per runner and repetition.
    """
    ann_matrix = ann_obj.ann_matrix
    goids, prots = ann_obj.goids, ann_obj.prots
    reps = range(1, num_reps + 1)

    # NOTE 2019-06-26: the global np.random seed is deliberately not set here.
    # If there are a different number of terms, or the order of the terms changed,
    # the results would not be the same. The seed is passed to the splitter instead.

    # first check to see if the algorithms have already been run
    # and if the results should be overwritten
    if kwargs.get('forcealg') is True or len(goids) == 1:
        # every repetition runs all of the runners
        runners_to_run = {rep: alg_runners for rep in reps}
    else:
        runners_to_run = {}
        # a different file is stored for each repetition, so check each one
        for rep in reps:
            curr_seed = _cv_rep_seed(cv_seed, rep)
            curr_runners_to_run = []
            for run_obj in alg_runners:
                out_file = _cv_out_file(run_obj, folds, rep, curr_seed)
                if os.path.isfile(out_file):
                    print("%s already exists. Use --forcealg to overwite" % (out_file))
                else:
                    curr_runners_to_run.append(run_obj)
            runners_to_run[rep] = curr_runners_to_run

    # repeat the CV process the specified number of times
    for rep in reps:
        if len(runners_to_run[rep]) == 0:
            continue
        curr_seed = _cv_rep_seed(cv_seed, rep)
        # split the annotation matrix into training and testing matrices K times
        ann_matrix_folds = split_cv_all_goterms(ann_obj, folds=folds, seed=curr_seed, **kwargs)

        for run_obj in runners_to_run[rep]:
            # because each fold contains a different set of positives, and combined they contain all positives,
            # store all of the prediction scores from each fold in a matrix
            # (builtin float, because the np.float alias was removed from NumPy)
            combined_fold_scores = sparse.lil_matrix(ann_matrix.shape, dtype=float)
            for curr_fold, (train_ann_mat, test_ann_mat) in enumerate(ann_matrix_folds):
                print("*  "*20)
                print("Fold %d" % (curr_fold+1))

                # replace the ann_obj in the runner with the current fold's annotations
                run_obj.ann_obj = setup.Sparse_Annotations(train_ann_mat, goids, prots)
                run_obj.train_mat = train_ann_mat
                run_obj.test_mat = test_ann_mat
                # now setup the inputs for the runners
                run_obj.setupInputs()
                # run the alg
                run_obj.run()
                # parse the outputs. Only needed for the algs that write output files
                run_obj.setupOutputs()

                # store only the scores of the test (left out) positives and negatives
                for i in range(len(goids)):
                    test_pos, test_neg = alg_utils.get_goid_pos_neg(test_ann_mat, i)
                    curr_goid_scores = run_obj.goid_scores[i].toarray().flatten()
                    curr_comb_scores = combined_fold_scores[i].toarray().flatten()
                    curr_comb_scores[test_pos] = curr_goid_scores[test_pos]
                    curr_comb_scores[test_neg] = curr_goid_scores[test_neg]
                    combined_fold_scores[i] = curr_comb_scores

            # replace the goid_scores in the runner to combined_fold_scores to evaluate
            run_obj.goid_scores = combined_fold_scores

            # now evaluate the results against the full annotations and write to a file
            out_file = _cv_out_file(run_obj, folds, rep, curr_seed)
            utils.checkDir(os.path.dirname(out_file))
            eval_utils.evaluate_ground_truth(
                run_obj, ann_obj, out_file,
                alg=run_obj.name, append=False, **kwargs)

    print("Finished running cross-validation")
    return


def _cv_rep_seed(cv_seed, rep):
    """Return the seed of the 1-based repetition *rep*: cv_seed + rep - 1, or None if no seed was given."""
    if cv_seed is None:
        return None
    return cv_seed + rep - 1


def _cv_out_file(run_obj, folds, rep, seed):
    """Build the path of the cross-validation results file of *run_obj* for one repetition."""
    seed_str = "-seed%s" % seed if seed is not None else ""
    return "%s/cv-%dfolds-rep%d%s%s.txt" % (
        run_obj.out_dir, folds, rep, seed_str, run_obj.params_str)
# Example #3
# 0
def weight_SWSN(ann_matrix,
                sparse_nets=None,
                normalized_nets=None,
                net_names=None,
                out_file=None,
                nodes=None,
                verbose=False):
    """
    Combine multiple networks into one using the weights computed by combineNetworksSWSN.

    *ann_matrix*: annotation matrix with one row per term. Rows with at most
        1 positive or at most 1 negative are dropped before computing the weights
    *sparse_nets*: list of networks. If given, they are normalized here and
        *normalized_nets* is ignored
    *normalized_nets*: list of networks stored as scipy sparse matrices. Should already be normalized
    *net_names*: names of the networks, in the same order as the networks.
        If given, the computed weights are printed
    *out_file*: if given, the combined network is written to this file
        ('.txt' replaced by '.npz'), together with a node ids file
        (only if *nodes* is given) and a network weights file
    *nodes*: node ids, in the order of the rows of the networks
    *verbose*: print the memory usage and pass verbose on to combineNetworksSWSN

    *returns*:
        - (None, 0) if no networks were given
        - (network, 0) if only a single network was given; the network is
          returned as it was passed in
        - (combined_network, total_time, weights_list) otherwise, where
          weights_list holds the weight of each input network (0 if not chosen)
        NOTE(review): the early exits return 2 values while the normal path
        returns 3; kept as is so existing callers are not affected.
    """
    # UPDATED: normalize the networks
    if sparse_nets is not None:
        print("Normalizing the networks")
        normalized_nets = [_net_normalize(net) for net in sparse_nets]
    elif normalized_nets is None:
        print("No networks given. Nothing to do")
        return None, 0
    if len(normalized_nets) == 1:
        print("Only one network given to weight_SWSN. Nothing to do.")
        total_time = 0
        # sparse_nets is None when the caller passed only normalized_nets,
        # so fall back to the single normalized network in that case
        single_net = sparse_nets[0] if sparse_nets is not None else normalized_nets[0]
        return single_net, total_time
    if verbose:
        print("Removing rows with 0 annotations/positives")
        utils.print_memory_usage()
    # remove rows with 0 annotations/positives
    empty_rows = []
    for i in range(ann_matrix.shape[0]):
        pos, neg = alg_utils.get_goid_pos_neg(ann_matrix, i)
        # the combineWeightsSWSN method doesn't seem to
        # work if there's only 1 positive
        if len(pos) <= 1 or len(neg) <= 1:
            empty_rows.append(i)
    # don't modify the original annotation matrix to keep the rows matching the GO ids
    curr_ann_mat = delete_rows_csr(ann_matrix.tocsr(), empty_rows)

    if verbose:
        utils.print_memory_usage()
    print("Weighting networks for %d different GO terms" %
          (curr_ann_mat.shape[0]))
    print("Running simultaneous weights with specific negatives")
    start_time = time.process_time()
    # alpha[i] is the weight of the network at position indices[i]
    alpha, indices = combineNetworksSWSN(curr_ann_mat,
                                         normalized_nets,
                                         verbose=verbose)
    # print out the computed weights for each network
    if net_names is not None:
        print("network weights:")
        # networks that were not chosen keep the default weight of 0
        weights = defaultdict(int)
        for i in range(len(alpha)):
            weights[net_names[indices[i]]] = alpha[i]
        weights_table = ["%0.3e" % weights[net] for net in net_names]
        print('\t'.join(net_names))
        print('\t'.join(weights_table))

    # now add the networks together with the alpha weight applied
    weights_list = [0] * len(normalized_nets)
    weights_list[indices[0]] = alpha[0]
    combined_network = alpha[0] * normalized_nets[indices[0]]
    for i in range(1, len(alpha)):
        combined_network += alpha[i] * normalized_nets[indices[i]]
        weights_list[indices[i]] = alpha[i]
    total_time = time.process_time() - start_time

    if out_file is not None:
        # replace the .txt if present
        out_file = out_file.replace('.txt', '.npz')
        utils.checkDir(os.path.dirname(out_file))
        print("\twriting combined network to %s" % (out_file))
        sp.save_npz(out_file, combined_network)
        # also write the node ids so it's easier to access
        # TODO figure out a better way to store this
        # nodes defaults to None, in which case there is nothing to write
        if nodes is not None:
            node2idx_file = out_file + "-node-ids.txt"
            print("\twriting node ids to %s" % (node2idx_file))
            with open(node2idx_file, 'w') as out:
                out.write(''.join("%s\t%s\n" % (n, i)
                                  for i, n in enumerate(nodes)))

        # write the alpha/weight of the networks as well
        # if no names were given, the position of the network is written instead
        net_weight_file = out_file + "-net-weights.txt"
        print("\twriting network weights to %s" % (net_weight_file))
        with open(net_weight_file, 'w') as out:
            out.write(''.join(
                "%s\t%s\n" % (net_names[idx] if net_names is not None else idx,
                              str(alpha[i]))
                for i, idx in enumerate(indices)))

    return combined_network, total_time, weights_list
# Example #4
# 0
def run(run_obj):
    """
    Perform logistic regression by building a classifier for each term in the ontology.

    *run_obj*: runner object. Read from it: params_results, P (feature matrix
        whose rows are indexed by gene), params['num_iter'], ann_matrix,
        train_mat / test_mat, goids_to_run, hpoidx (term -> row index) and
        protidx (its values are used as gene indices).

    If both train_mat and test_mat are set, each classifier is trained on the
    training annotations and applied to the genes annotated in test_mat
    (cross validation). Otherwise it is trained on ann_matrix and applied to
    all genes that have no training annotation for the term.

    Sets run_obj.cv, run_obj.goid_scores (term x gene lil matrix of scores)
    and run_obj.params_results. Returns None.
    """
    params_results = run_obj.params_results
    P, params = run_obj.P, run_obj.params

    ann_mat = run_obj.ann_matrix
    max_iter = params['num_iter']
    # see if train and test annotation matrices from the cross validation pipeline exist
    # if not, set train and test to the original annotation matrix itself
    if run_obj.train_mat is not None and run_obj.test_mat is not None:
        print("Performing cross validation")
        run_obj.cv = True
        train_mat = run_obj.train_mat
        test_mat = run_obj.test_mat
    else:
        run_obj.cv = False
        train_mat = ann_mat
        test_mat = ann_mat

    # stores the scores for all the terms;  dim: term x genes
    # (builtin float, because the np.float alias was removed from NumPy)
    scores = sparse.lil_matrix(ann_mat.shape, dtype=float)

    # transpose the labels matrix once to have the terms as columns;
    # it does not change between terms
    train_mat_t = train_mat.transpose()

    for term in tqdm(run_obj.goids_to_run):

        idx = run_obj.hpoidx[term]
        # compute the train gene indices of the annotations for the given label
        train_pos, train_neg = alg_utils.get_goid_pos_neg(train_mat, idx)

        if len(train_pos) == 0:
            print("Skipping term, 0 positive examples")
            continue

        train_set = sorted(set(train_pos) | set(train_neg))

        if run_obj.cv:
            # if cross validation, then obtain the test gene set on which classifier should be tested
            test_pos, test_neg = alg_utils.get_goid_pos_neg(test_mat, idx)
            test_set = sorted(set(test_pos) | set(test_neg))
        else:
            # set all unlabeled genes to the test set
            test_set = sorted(set(run_obj.protidx.values()) - set(train_set))

        # obtain the feature vector only for the genes in the training set
        X_train = P[train_set, :]
        # obtain the feature vector only for the genes in the testing set
        X_test = P[test_set, :]
        # obtain the labels matrix corresponding to genes in the training set
        y_train = sparse.lil_matrix(train_mat_t[train_set, :])

        # get the column of training data for the given label
        lab = y_train[:, idx].toarray().flatten()
        # now train the model on the constructed training data and the column of labels
        clf = logReg.training(X_train, lab, max_iter)
        # make predictions on the constructed test set
        predict = logReg.testing(clf, X_test)
        predict = predict.tolist()

        # get the current scores for the given label term in the current fold
        curr_score = scores[idx].toarray().flatten()
        # for the test indices of the current label, set the scores
        curr_score[test_set] = predict
        # the training positives always get a score of 1
        curr_score[train_pos] = 1
        # add the scores produced by predicting on the current label of test set to a combined score matrix
        scores[idx] = curr_score

    run_obj.goid_scores = scores
    run_obj.params_results = params_results
# Example #5
# 0
def leave_out_taxon(t,
                    ann_obj,
                    species_to_uniprot_idx,
                    eval_ann_obj=None,
                    keep_ann=False,
                    non_pos_as_neg_eval=False,
                    eval_goterms_with_left_out_only=False,
                    oracle=False,
                    num_test_cutoff=10,
                    **kwargs):
    """
    Build the training and testing annotation matrices for leaving out the annotations of a species.

    Training positives are removed from testing positives, and train pos and neg are removed from test neg
        I don't remove training negatives from testing positives, because not all algorithms use negatives
    *t*: species to be left out. If t is None or 'all', then no species will be left out, and keep_ann must be True.
    *ann_obj*: annotation object; its ann_matrix, goids and prots are used
    *species_to_uniprot_idx*: for each species, the set of indices of its prots
    *eval_ann_obj*: if given, the positives/negatives used for evaluating a term are taken
        from this object's ann_matrix (row looked up through its goid2idx) instead of *ann_obj*.
        Terms that are not in it get no evaluation annotations
    *keep_ann*: if True, the annotations of the left-out species are kept in the training annotations
    *non_pos_as_neg_eval*: if True, every prot (of the left-out species, or of all
        species if t is None) that is not an evaluation positive is used as a test negative
    *eval_goterms_with_left_out_only*: currently not used (the code using it is disabled)
    *oracle*: remove train negatives that are actually test positives
    *num_test_cutoff*: minimum number of training positives and of test positives for a
        term to be included in the returned list of terms
    *kwargs*: accepted so callers can forward their options; not used here

    *returns*: (train_ann_mat, test_ann_mat, sp_goterms). The matrices are csr matrices
        with the shape of the original annotation matrix (1: positive, -1: negative).
        *sp_goterms* is the list of terms that pass *num_test_cutoff* and have at least
        one train and one test negative; the rows of the other terms are still set.
    """
    if t == "all":
        t = None
    # leave this taxon out by removing its annotations
    # rather than a dictionary, build a matrix
    ann_matrix, goids, prots = ann_obj.ann_matrix, ann_obj.goids, ann_obj.prots
    # builtin float, because the np.float alias was removed from NumPy
    train_ann_mat = sparse.lil_matrix(ann_matrix.shape, dtype=float)
    test_ann_mat = sparse.lil_matrix(ann_matrix.shape, dtype=float)
    sp_goterms = []
    # positives and negatives are sets of prot indices, so "all prots" must be a
    # set of indices as well. Only needed when nothing is left out
    all_prot_idx = None
    if t is None and non_pos_as_neg_eval:
        all_prot_idx = set(range(len(prots)))
    for idx, goid in enumerate(goids):
        pos, neg = alg_utils.get_goid_pos_neg(ann_matrix, idx)
        ann_pos = set(pos)
        ann_neg = set(neg)
        # first setup the training annotations (those used as positives/negatives for the algorithm)
        if keep_ann:
            train_pos = ann_pos
            train_neg = ann_neg
        else:
            train_pos = ann_pos - species_to_uniprot_idx[t]
            train_neg = ann_neg - species_to_uniprot_idx[t]
        # copies, because the test sets derived from these are modified in place below
        eval_pos = ann_pos.copy()
        eval_neg = ann_neg.copy()
        # setup the testing annotations (those used when evaluating the performance)
        if eval_ann_obj is not None:
            if goid not in eval_ann_obj.goid2idx:
                eval_pos, eval_neg = set(), set()
            else:
                eval_pos, eval_neg = alg_utils.get_goid_pos_neg(
                    eval_ann_obj.ann_matrix, eval_ann_obj.goid2idx[goid])
                eval_pos = set(eval_pos)
                eval_neg = set(eval_neg)
        if t is None:
            test_pos = eval_pos
            test_neg = eval_neg
            if non_pos_as_neg_eval:
                # everything minus the positives
                test_neg = all_prot_idx - test_pos
        else:
            test_pos = eval_pos & species_to_uniprot_idx[t]
            # UPDATE 2018-06-27: Only evaluate the species prots as negatives, not all prots
            if non_pos_as_neg_eval:
                test_neg = species_to_uniprot_idx[t] - eval_pos
                test_neg.discard(None)
            else:
                test_neg = eval_neg & species_to_uniprot_idx[t]
        # UPDATE 2018-06-30: Remove test positives/negatives that are part of the training positives/negatives
        # don't remove test positives if its a training negative because not all algorithms use negatives
        test_pos -= train_pos
        if oracle:
            train_neg -= test_pos
        test_neg -= train_pos | train_neg
        # build an array of the annotations and set it in the row of this term
        # UPDATE 2019-07: Some algorithms are node-based and could benefit from the extra annotations
        pos_neg_arr = np.zeros(len(prots))
        pos_neg_arr[list(train_pos)] = 1
        pos_neg_arr[list(train_neg)] = -1
        train_ann_mat[idx] = pos_neg_arr
        pos_neg_arr = np.zeros(len(prots))
        pos_neg_arr[list(test_pos)] = 1
        pos_neg_arr[list(test_neg)] = -1
        test_ann_mat[idx] = pos_neg_arr
        # UPDATE 2018-10: Add a cutoff on both the # of training positive and # of test pos
        if len(train_pos) < num_test_cutoff or len(test_pos) < num_test_cutoff or \
           (len(train_neg) == 0 or len(test_neg) == 0):
            continue
        sp_goterms.append(goid)

    return train_ann_mat.tocsr(), test_ann_mat.tocsr(), sp_goterms
def weight_SWSN(ann_matrix,
                sparse_nets,
                net_names=None,
                out_file=None,
                nodes=None):
    """
    Combine the given networks into one, weighted by the output of combineNetworksSWSN.

    *ann_matrix*: annotation matrix with one row per term. Rows with at most
        1 positive or at most 1 negative are left out when computing the weights
    *sparse_nets*: list of networks to combine
    *net_names*: names of the networks, in the order of *sparse_nets*
    *out_file*: if given, the combined network is stored there ('.txt' replaced
        by '.npz'), along with a node ids file and a network weights file
    *nodes*: node ids written to the node ids file

    *returns*: (combined_network, total_time). If only one network is given,
        that network is returned untouched with a time of 0.

    NOTE(review): the weights are computed on the normalized networks, but are
    applied to the networks as they were passed in.
    """
    if len(sparse_nets) == 1:
        print("Only one network given to weight_SWSN. Nothing to do.")
        return sparse_nets[0], 0

    # collect the rows that don't have enough annotations.
    # the combineWeightsSWSN method doesn't seem to work if there's only 1 positive
    rows_to_drop = []
    for row in range(ann_matrix.shape[0]):
        row_pos, row_neg = alg_utils.get_goid_pos_neg(ann_matrix, row)
        enough_annotations = len(row_pos) > 1 and len(row_neg) > 1
        if not enough_annotations:
            rows_to_drop.append(row)
    # work on a copy so the rows of the original matrix keep matching the GO ids
    filtered_ann_mat = delete_rows_csr(ann_matrix.tocsr(), rows_to_drop)

    print("Normalizing the networks")
    normalized_nets = [_net_normalize(net) for net in sparse_nets]
    num_terms = filtered_ann_mat.shape[0]
    print("Weighting networks for %d different GO terms" % (num_terms))
    print("Running simultaneous weights with specific negatives")
    start_time = time.process_time()
    alpha, indices = combineNetworksSWSN(filtered_ann_mat, normalized_nets)
    if net_names is not None:
        chosen_names = [net_names[i] for i in indices]
        print("\tnetworks chosen: %s" % (', '.join(chosen_names)))

    # weighted sum of the chosen networks
    combined_network = alpha[0] * sparse_nets[indices[0]]
    k = 1
    while k < len(alpha):
        combined_network += alpha[k] * sparse_nets[indices[k]]
        k += 1
    total_time = time.process_time() - start_time

    if out_file is None:
        return combined_network, total_time

    # the network is stored in the npz format, so swap the .txt if present
    out_file = out_file.replace('.txt', '.npz')
    utils.checkDir(os.path.dirname(out_file))
    print("\twriting combined network to %s" % (out_file))
    sparse.save_npz(out_file, combined_network)

    # store the node ids next to the network so it's easier to access
    # TODO figure out a better way to store this
    node2idx_file = out_file + "-node-ids.txt"
    print("\twriting node ids to %s" % (node2idx_file))
    with open(node2idx_file, 'w') as out:
        node_lines = ["%s\t%s\n" % (n, i) for i, n in enumerate(nodes)]
        out.write(''.join(node_lines))

    # and the weight each chosen network received
    net_weight_file = out_file + "-net-weights.txt"
    print("\twriting network weights to %s" % (net_weight_file))
    with open(net_weight_file, 'w') as out:
        weight_lines = ["%s\t%s\n" % (net_names[idx], str(alpha[i]))
                        for i, idx in enumerate(indices)]
        out.write(''.join(weight_lines))

    return combined_network, total_time
# Example #7
# 0
def setup_post_to_graphspace(config_map,
                             selected_goid,
                             alg='fastsinksource',
                             name_postfix='',
                             tags=None,
                             taxon=None,
                             goid_summary_file=None,
                             num_neighbors=1,
                             nodes_to_post=None,
                             **kwargs):

    input_settings, alg_settings, \
            output_settings, out_pref, kwargs = \
            plot_utils.setup_variables(
                    config_map, **kwargs)

    input_dir = input_settings['input_dir']
    dataset = input_settings['datasets'][0]
    for arg in [
            'ssn_target_only', 'ssn_target_ann_only', 'ssn_only',
            'string_target_only', 'string_nontarget_only',
            'limit_to_taxons_file', 'add_target_taxon', 'oracle_weights',
            'rem_neg_neighbors', 'youngs_neg', 'sp_leaf_terms_only'
    ]:
        kwargs[arg] = dataset.get(arg)
    uniprot_taxon_file = "%s/%s" % (input_dir, dataset['taxon_file'])

    # don't need it since we are re-running the alg anyway
    #    # predictions file:
    #    results_dir = "%s/%s/%s" % (
    #        output_settings['output_dir'], dataset['net_version'], dataset['exp_name'])
    #    alg_params = alg_settings[alg]
    #    combos = [dict(zip(alg_params.keys(), val))
    #        for val in itertools.product(
    #            *(alg_params[param] for param in alg_params))]
    #    # TODO allow for multiple
    #    if len(combos) > 1:
    #        print("%d combinations for %s. Using the first one" % (len(combos), alg))
    #    param_combo = combos[0]
    #    # first get the parameter string for this runner
    #    params_str = runner.get_runner_params_str(alg, dataset, param_combo)
    #    prec_rec_str = "prec-rec%s-%s" % (taxon, selected_goid)
    #    exp_type = 'loso'
    #    pred_file = "%s/%s/%s%s%s%s.txt" % (results_dir, alg, exp_type, params_str, kwargs.get('postfix',''), prec_rec_str)
    #    if not os.path.isfile(pred_file):
    #        print("\tPredictions file not found: %s. Quitting" % (pred_file))
    #        sys.exit(1)
    #    print("\treading %s" % (pred_file))
    #    df = pd.read_csv(pred_file, sep='\t')
    #    print(df.head())

    out_dir = "outputs/viz/graphspace/%s-%s/" % (dataset['net_version'].split(
        '/')[-1], dataset['exp_name'].split('/')[-1])
    os.makedirs(out_dir, exist_ok=True)
    print("storing net and ann files to %s" % (out_dir))

    # TODO allow posting without STRING
    net_obj, new_net_obj, ann_obj, eval_ann_obj, species_to_uniprot_idx = \
            load_net_ann_datasets(
        out_dir, taxon,
        dataset, input_settings, alg_settings,
        uniprot_taxon_file, **kwargs)
    W = new_net_obj.W
    prots = ann_obj.prots

    # also run the alg to get the full prediction scores
    # TODO get them from a file?
    alg_settings = {alg: alg_settings[alg]}
    alg_settings[alg]['should_run'] = [True]
    kwargs['verbose'] = True
    alg_runners = run_eval_algs.setup_runners(alg_settings, new_net_obj,
                                              ann_obj,
                                              output_settings['output_dir'],
                                              **kwargs)
    run_obj = alg_runners[0]
    run_obj.goids_to_run = [selected_goid]

    train_ann_mat, test_ann_mat, sp_goterms = eval_loso.leave_out_taxon(
        taxon,
        ann_obj,
        species_to_uniprot_idx,
        eval_ann_obj=eval_ann_obj,
        **kwargs)
    # now run the loso evaluation for this term, and get the scores back
    eval_loso.run_and_eval_algs(run_obj,
                                ann_obj,
                                train_ann_mat,
                                test_ann_mat,
                                taxon=taxon,
                                **kwargs)
    term_scores = np.ravel(
        run_obj.goid_scores[ann_obj.goid2idx[selected_goid]].toarray())
    print("top 10 scores for %s, %s:" % (taxon, selected_goid))
    taxon_prots_idx = list(species_to_uniprot_idx[taxon])
    taxon_prots = [prots[i] for i in taxon_prots_idx]
    taxon_term_scores = term_scores[taxon_prots_idx]
    print('\n'.join(["%s\t%0.4e" % (
        ann_obj.prots[taxon_prots_idx[i]], taxon_term_scores[i]) \
            for i in np.argsort(taxon_term_scores)[::-1][:10]]))

    pos_neg_file = "%s/%s" % (input_dir, dataset['pos_neg_file'])
    #selected_goid = "15643"  # toxic substance binding
    #selected_goid = "9405"  # pathogenesis
    #selected_goid = "98754"  # detoxification
    selected_goname = None
    # build a dictionary of the evidencecode for each prot
    uniprot_to_evidencecode = defaultdict(set)
    annotated_prots = set()
    neg_prots = set()
    if goid_summary_file is None:
        goid_summary_file = pos_neg_file.replace("bp-", '').replace("mf-", '')
        if '-list' in pos_neg_file:
            goid_summary_file = goid_summary_file.replace(
                "-list", "-summary-stats")
        elif '.gz' in pos_neg_file:
            goid_summary_file = goid_summary_file.replace(
                ".tsv.gz", "-summary-stats.tsv")
        else:
            goid_summary_file = goid_summary_file.replace(
                ".tsv", "-summary-stats.tsv")
    df_summary = pd.read_csv(goid_summary_file, sep='\t')
    goid_names = dict(zip(df_summary['GO term'], df_summary['GO term name']))
    #goid_num_anno = dict(zip(df_summary['GO term'], df_summary['# positive examples']))
    print("GO name: %s" % (goid_names[selected_goid]))
    selected_goname = goid_names[selected_goid].replace(' ', '-')[0:20]
    # load the GAIN propagation to get the evidence code
    ev_codes_file = dataset.get('ev_codes_file')
    if ev_codes_file is not None:
        for orf, goid, goname, hierarchy, evidencecode, annotation_type in utils.readColumns(
                ev_codes_file, 1, 2, 3, 4, 5, 6):
            if selected_goid[:3] == "GO:":
                goid = "GO:" + "0" * (7 - len(goid)) + goid
            if goid != selected_goid:
                continue
            selected_goname = goname.replace(' ', '-')[0:20]
            if annotation_type != '1':
                continue

            uniprot_to_evidencecode[orf].add(evidencecode)
    # limit it to the current taxon
    if taxon is not None:
        print("Getting species of each prot from %s" % (uniprot_taxon_file))
        #print("Limiting the prots to those for taxon %s (%s)" % (taxon, selected_species[taxon]))
        print("Limiting the prots to those for taxon %s" % (taxon))
        # for each of the 19 species, leave out their annotations
        # and see how well we can retrieve them
        uniprot_to_species = utils.readDict(uniprot_taxon_file, 1, 2)
        if taxon not in species_to_uniprot_idx:
            print("Error: taxon ID '%d' not found" % (taxon))
            sys.exit()
        # also limit the proteins to those in the network
        print("\t%d prots for taxon %s." % (len(taxon_prots_idx), taxon))
        goid_idx = ann_obj.goid2idx[selected_goid]
        pos, neg = alg_utils.get_goid_pos_neg(train_ann_mat, goid_idx)
        non_taxon_annotated_prots = set([prots[i] for i in pos])
        non_taxon_neg_prots = set([prots[i] for i in neg])
        print("\t%d non-taxon pos, %d non-taxon neg" %
              (len(non_taxon_annotated_prots), len(non_taxon_neg_prots)))
        pos, neg = alg_utils.get_goid_pos_neg(test_ann_mat, goid_idx)
        annotated_prots = set([prots[i] for i in pos])
        neg_prots = set([prots[i] for i in neg])
        print("\t%d taxon pos, %d taxon neg" %
              (len(annotated_prots), len(neg_prots)))

    print("\t%d annotated prots for %s (%s)" %
          (len(annotated_prots), selected_goname, selected_goid))

    #conf_cutoff = 0.2
    conf_cutoff = -1
    predicted_prots = set()
    ranks = {}
    scores = {}
    first_zero_rank = None
    for i, idx in enumerate(np.argsort(taxon_term_scores)[::-1]):
        rank = i + 1
        prot = prots[taxon_prots_idx[idx]]
        predicted_prots.add(prot)
        score = taxon_term_scores[idx]
        scores[prot] = score
        if taxon is not None:
            ranks[prot] = rank
            if score == 0 and first_zero_rank is None:
                first_zero_rank = rank
        else:
            ranks[prot] = rank

            # move the score between 0 and 1 if it's genemania (normally between -1 and 1)
            # as the score is used to set the opacity
            # TODO fix genemania
            #if alg == "genemania":
            #    pred_cut_conf[gene] = local_conf
            #    local_conf = ((float(local_conf) - -1) / float(1--1)) * (1-0) + 0
            #pred_local_conf[gene] = local_conf

    print("\t%d prots with a score" % (len(taxon_term_scores)))
    print("Rank of first zero score: %d" % (first_zero_rank))
    print("Ranks of left-out positives:")
    for gene in sorted(annotated_prots, key=ranks.get):
        print("%s\t%d" % (gene, ranks[gene]))
    print("Including top 30 ranked-proteins of left-out species")
    top_30 = sorted(set(taxon_prots) & set(ranks.keys()), key=ranks.get)[:30]
    if ev_codes_file is not None:
        print("Evidence codes of top 30:")
        for i, gene in enumerate(top_30):
            if gene in uniprot_to_evidencecode:
                print("%s\t%s\t%s" % (i, gene, uniprot_to_evidencecode[gene]))
    top_30 = set(top_30)

    if taxon is not None:
        print(
            "Getting the induced subgraph of the neighbors of the %d annotated nodes"
            % (len(annotated_prots)))
        prededges = set()
        if nodes_to_post is not None:
            print("Getting neighbors of %s" % (', '.join(nodes_to_post)))
            nodes_to_add_neighbors = set(nodes_to_post)
        else:
            nodes_to_add_neighbors = annotated_prots.copy() | top_30
        node2idx = ann_obj.node2idx
        for i in range(opts.num_neighbors):
            #print("Adding neighbors %d" % (i+1))
            curr_nodes_to_add_neighbors = nodes_to_add_neighbors.copy()
            nodes_to_add_neighbors = set()
            print("adding %sneighbors of %d nodes" %
                  ("positive ", len(curr_nodes_to_add_neighbors)))
            for u in curr_nodes_to_add_neighbors:
                #neighbors = set(nx.all_neighbors(G, u))
                neighbors = set(
                    [prots[v] for v in get_mat_neighbors(W, node2idx[u])])
                if opts.node_to_post is None:
                    # UPDATE 2018-10: try adding just the positive neighbors of the node
                    # TODO make this a command-line option
                    neighbors = neighbors & (non_taxon_annotated_prots
                                             | annotated_prots | top_30)
                #if len(neighbors) > 15 and nodes_to_post is None:
                #    print("\tskipping adding neighbors of %s. len(neighbors): %d" % (u, len(neighbors)))
                #    continue
                nodes_to_add_neighbors.update(neighbors)
                prededges.update(set([(u, v) for v in neighbors]))
    else:
        print(
            "Getting the induced subgraph of the %d annotated and %d predicted proteins"
            % (len(annotated_prots), len(predicted_prots)))
        print("not yet implemented. quitting")
        sys.exit()
    #    prededges = set(G.subgraph(annotated_prots.union(predicted_prots)).edges())
    prededges = set([tuple(sorted((u, v))) for u, v in prededges])
    # TODO I should also show the disconnected nodes
    prednodes = set([n for edge in prededges for n in edge])

    print("\t%d nodes, %d edges" % (len(prednodes), len(prededges)))
    if len(prededges) > 1000 or len(prednodes) > 500:
        print("\nToo many nodes/edges. Not posting to GraphSpace. Quitting")
        sys.exit()

    #graph_attr_file = ""
    #graph_attr, attr_desc = readGraphAttr()
    # add the edge weight from the network to attr_desc which will be used for the popup
    # set the edges as the neighbors of the annotated genes
    #prededges = set()
    # get the induced subgraph of the annotated nodes and predicted nodes
    #for n in func_prots:
    #    if not G.has_node(n):
    #        continue
    #    for neighbor in G.neighbors(n):
    #        prededges.add((n, neighbor))

    graph_attr = {n: {} for n in prednodes}
    attr_desc = {n: {} for n in prednodes}

    print("Reading gene names and species for each protein from %s" %
          (uniprot_taxon_file))
    #prot_species = utils.readDict(uniprot_taxon_file, 1, 2)
    uniprot_to_gene = utils.readDict(uniprot_taxon_file, 1, 4)
    # there can be multiple gene names. Just show the first one for now
    uniprot_to_gene = {
        n: gene.split(' ')[0]
        for n, gene in uniprot_to_gene.items()
    }
    node_labels = {}

    print("building graphspace object")
    # get the abbreviation of the species names
    species_names, net_taxons = eval_loso.get_selected_species(
        species_to_uniprot_idx, kwargs['limit_to_taxons_file'])
    sp_abbrv = {
        t: ''.join(subs[0] for subs in sp_name.split(' ')[:2])
        for t, sp_name in species_names.items()
    }
    # for each node, add the prediction values
    for n in tqdm(prednodes):
        # set the name of the node to be the gene name and add the k to the label
        gene_name = uniprot_to_gene.get(n, n)
        curr_taxon = uniprot_to_species[n]
        species_short_name = sp_abbrv[curr_taxon]
        # add the species to the front of the gene name
        label = "%s-%s" % (species_short_name, gene_name)
        uniprot_to_gene[n] = label
        #node_labels[n] = "%s\n%d" % (label, min(ranks[n], 43)) if n in annotated_prots else label
        node_labels[n] = "%s\n%d" % (
            label, ranks[n] if ranks[n] < first_zero_rank else
            first_zero_rank) if n in taxon_prots else label

        # maybe put the labels below the nodes?
        # helps with visualizing the background opacity
        graph_attr[n]['text-valign'] = 'bottom'
        # add the strain name to the popup
        attr_desc[n]['Strain'] = species_names[curr_taxon]
        if n in predicted_prots:
            # don't need to normalize because the confidence values are already between 0 and 1
            if taxon and (n in non_taxon_annotated_prots
                          or n in non_taxon_neg_prots):
                pass
            else:
                # UPDATE: use the node rank instead of the node score
                #graph_attr[n]['background-opacity'] = pred_local_conf[n]
                if n not in ranks:
                    graph_attr[n]['background-opacity'] = scores[n]
                else:
                    #graph_attr[n]['background-opacity'] = scores[n]
                    graph_attr[n]['background-opacity'] = max([
                        0.9 - (ranks[n] / float(first_zero_rank)),
                        float(scores[n])
                    ])
                    attr_desc[n]["%s rank" % (alg_names[alg])] = ranks[n]
            attr_desc[n]["%s prediction score" %
                         (alg_names[alg])] = "%0.4f" % (scores[n])
        #elif n in annotated_prots or (taxon and (n in non_taxon_annotated_prots or n in non_taxon_neg_prots)) \
        #     or n in neg_prots:
        #if n in pred_local_conf:
        #    graph_attr[n]['background-opacity'] = pred_local_conf[n]
        #    attr_desc[n]["Local prediction confidence"] = pred_local_conf[n]
        # also add the annotation to the popup
        if n in uniprot_to_evidencecode:
            codes = uniprot_to_evidencecode[n]
            # TODO add bullet points to the list
            #attr_desc[n]["Evidence code"] = ''.join(["%s (%s)\n" % (c, evidence_code_name[c]) for c in codes])
            # order it by exp, comp, then elec
            evidence_codes = ''.join([
                "<li>%s (%s)</li>" % (c, evidence_code_name[c]) for c in codes
                if evidence_code_type[c] == 'experimental'
            ])
            evidence_codes += ''.join([
                "<li>%s (%s)</li>" % (c, evidence_code_name[c]) for c in codes
                if evidence_code_type[c] == 'computational'
            ])
            evidence_codes += ''.join([
                "<li>%s (%s)</li>" % (c, evidence_code_name[c]) for c in codes
                if evidence_code_type[c] == 'electronic'
            ])
            attr_desc[n]["Evidence code"] = "<ul>%s</ul>" % (evidence_codes)

    # set the width of the edges by the network weight
    edge_weights = defaultdict(float)
    for u, v in tqdm(prededges):
        e = (u, v)
        if e not in attr_desc:
            attr_desc[e] = {}
        if e not in graph_attr:
            graph_attr[e] = {}
        #attr_desc[e]["edge weight"] = G.adj[u][v]]['weight']
        if net_obj.multi_net:
            #attr_desc[e]["Final edge weight"] = "%0.1f" % (W[node2idx[u]][:,node2idx[v]].A.flatten()[0])
            edge_type_weights = []
            # add the weights for the individual string networks
            for i in range(len(net_obj.net_names)):
                net_name = net_obj.net_names[i]
                net_name = "SSN (E-value <= 0.1)" if 'eval-e0_1' in net_name else net_name
                net = net_obj.sparse_networks[i]
                w = net[node2idx[u]][:, node2idx[v]].A.flatten()[0]

                if w != 0:
                    #attr_desc[e][net_name] = "%0.1f" % (w)
                    edge_type_weights.append("<li>%s: %0.1f</li>" %
                                             (net_name, w))
                    edge_weights[e] += w * net_obj.swsn_weights[i]
            attr_desc[e]["Edge weights by type"] = "<ul>%s</ul>" % (''.join(
                sorted(edge_type_weights)))
        else:
            attr_desc[e]["Edge weight"] = "%0.1f" % (
                W[node2idx[u]][:, node2idx[v]].A.flatten()[0])
        # make the edges somewhat opaque for a better visual style
        graph_attr[e]['opacity'] = 0.7

    # set the width of the edges by the network weight
    #edge_weights = {(u,v): float(W[node2idx[u]][:,node2idx[v]].A.flatten()[0]) for u,v in prededges}
    for e, w in edge_weights.items():
        attr_desc[e]["Final edge weight"] = "%0.1f" % (w)
    # TODO set the min and max as parameters or something
    #max_weight = 180
    if net_obj.multi_net:
        max_weight = net_obj.swsn_weights[0] * 180
        print(max_weight)
    else:
        max_weight = 180
    for e in edge_weights:
        if edge_weights[e] > max_weight:
            edge_weights[e] = max_weight
    graph_attr = gs.set_edge_width(prededges,
                                   edge_weights,
                                   graph_attr,
                                   a=1,
                                   b=12,
                                   min_weight=1,
                                   max_weight=max_weight)

    H = nx.Graph()
    H.add_edges_from(prededges)

    # see which DB the edge came from to set the edge color
    print("Getting the edge type from networks")
    if net_obj.multi_net:
        print("\tFrom both STRING and SEQ_SIM")
        seq_sim_edges = set()
        for u, v in prededges:
            # get the SSN weight of this edge. Should be the first network
            net = net_obj.sparse_networks[0]
            w = net[node2idx[u]][:, node2idx[v]].A.flatten()[0]
            if w != 0:
                # these are all undirected, so just store the sorted version
                u, v = tuple(sorted((u, v)))
                # give these the default color
                graph_attr[(u, v)]['color'] = edge_type_color['default']
                seq_sim_edges.add((u, v))

#        string_edges = set()
#        temp_version = '2017_10-string'
#        net = f_settings.NETWORK_template % (temp_version, temp_version)
#        for u,v in utils.readColumns(net, 1, 2):
#            #if (u,v) not in prededges:
#            if not H.has_edge(u,v):
#                continue
#            # give these the default color
#            u,v = tuple(sorted((u,v)))
#            graph_attr[(u,v)]['color'] = edge_type_color['string']
#            string_edges.add((u,v))
        string_edges = prededges.difference(seq_sim_edges)
        print("\t%d edges from seq-sim, %d edges from STRING" %
              (len(seq_sim_edges), len(string_edges)))
        # set the color to STRING if it didn't come from sequence similarity
        for e in string_edges:
            #if 'color' not in graph_attr[e]:
            graph_attr[e]['color'] = edge_type_color['string']

    #elif 'STRING' in f_settings.NETWORK_VERSION_INPUTS[version]:
    #    for e in graph_attr:
    #        graph_attr[e]['color'] = edge_type_color['string']
    else:
        for e in graph_attr:
            graph_attr[e]['color'] = edge_type_color['default']

    # apply the evidence code style to each protein
    for n in prednodes:
        if n in annotated_prots:
            graph_attr[n]['color'] = node_type_color['annotation']
        elif taxon and n in non_taxon_annotated_prots:
            graph_attr[n]['color'] = node_type_color['non-taxon-annotation']
        elif taxon and n in non_taxon_neg_prots:
            graph_attr[n]['color'] = node_type_color[
                'non-taxon-neg-annotation']
        elif n in neg_prots:
            graph_attr[n]['color'] = node_type_color['neg-annotation']
        elif n in predicted_prots:
            graph_attr[n]['color'] = node_type_color['prediction']
        if n in uniprot_to_evidencecode:
            curr_style = ""
            for evidencecode in uniprot_to_evidencecode[n]:
                curr_type = evidence_code_type[evidencecode]
                if curr_type == "experimental":
                    curr_style = annotation_type_styles[curr_type]
                    break
                elif curr_style == "computational":
                    continue
                else:
                    curr_style = annotation_type_styles[curr_type]
            graph_attr[n].update(curr_style)
        # temporary fix to get the non-target positive examples
        if n in non_taxon_annotated_prots:
            graph_attr[n].update(annotation_type_styles['experimental'])

    # TODO build the popups here. That way the popup building logic can be separated from the
    # GSGraph building logic
    popups = {}
    prednodes = set([n for edge in prededges for n in edge])
    for n in prednodes:
        popups[n] = gs.buildNodePopup(n, attr_val=attr_desc)
    for u, v in prededges:
        popups[(u, v)] = gs.buildEdgePopup(u,
                                           v,
                                           node_labels=uniprot_to_gene,
                                           attr_val=attr_desc)

    # Now post to graphspace!
    print("Building GraphSpace graph")
    G = gs.constructGraph(prededges,
                          node_labels=node_labels,
                          graph_attr=graph_attr,
                          popups=popups)

    # TODO add an option to build the 'graph information' tab legend/info
    # build the 'Graph Information' metadata
    #desc = gs.buildGraphDescription(opts.edges, opts.net)
    desc = ''
    metadata = {'description': desc, 'tags': [], 'title': ''}
    if tags is not None:
        metadata['tags'] = tags
    G.set_data(metadata)
    if 'graph_exp_name' in dataset:
        graph_exp_name = dataset['graph_exp_name']
    else:
        graph_exp_name = "%s-%s" % (dataset['exp_name'].split('/')[-1],
                                    dataset['net_version'].split('/')[-1])
    graph_name = "%s-%s-%s-%s%s" % (selected_goname, selected_goid, alg,
                                    graph_exp_name, name_postfix)
    G.set_name(graph_name)

    # rather than call it from here and repeat all the options, return G, and then call this after
    #post_graph_to_graphspace(G, opts.username, opts.password, opts.graph_name, apply_layout=opts.apply_layout, layout_name=opts.layout_name,
    #                         group=opts.group, make_public=opts.make_public)
    return G, graph_name
# Example #8
# 0
def run(run_obj):
    """
    Train an SVM for each term in *run_obj.goids_to_run* and store the scores.

    If both *run_obj.train_mat* and *run_obj.test_mat* are set, the run is
    treated as cross validation: the classifier is trained on the labeled
    examples of *train_mat* and scored on the labeled examples of *test_mat*.
    Otherwise *run_obj.ann_matrix* is used for training, and every index in
    *run_obj.protidx* that has no label for the term is scored.

    Terms with no positive training examples are skipped, so their row in
    the score matrix stays empty.

    Side effects on *run_obj*:
      *cv*: set to True/False depending on whether cross validation is used
      *goid_scores*: lil_matrix with the same shape as *ann_matrix*
      *params_results*: stored back unchanged
    """
    params_results = run_obj.params_results
    P, params = run_obj.P, run_obj.params

    ann_mat = run_obj.ann_matrix
    max_iter = params['num_iter']

    if run_obj.train_mat is not None and run_obj.test_mat is not None:
        print("Performing Cross validation")
        run_obj.cv = True
        train_mat = run_obj.train_mat
        test_mat = run_obj.test_mat
    else:
        run_obj.cv = False
        train_mat = ann_mat
        test_mat = ann_mat

    # stores the scores for all the terms.  dim: term x genes
    # NOTE: the builtin float is used because the `np.float` alias
    # was removed in NumPy 1.24 (it was only ever an alias of float)
    scores = sparse.lil_matrix(ann_mat.shape, dtype=float)

    # transpose the labels matrix once so that the label names are the columns.
    # It does not change between terms, so there is no need to redo it in the loop
    train_mat_t = train_mat.transpose()
    # without cross validation, the test set of a term is every index
    # which is not in the training set of that term
    all_prot_idx = None if run_obj.cv else set(run_obj.protidx.values())

    for term in tqdm(run_obj.goids_to_run):
        idx = run_obj.hpoidx[term]
        # get the training positive, negative sets for the current fold
        train_pos, train_neg = alg_utils.get_goid_pos_neg(train_mat, idx)
        if len(train_pos) == 0:
            print("Skipping term, 0 positive examples")
            continue
        train_set = sorted(set(train_pos) | set(train_neg))

        if run_obj.cv:
            # if cross validation, then obtain the test gene set on which the classifier should be tested
            test_pos, test_neg = alg_utils.get_goid_pos_neg(test_mat, idx)
            test_set = sorted(set(test_pos) | set(test_neg))
        else:
            # set all unlabeled genes to the test set
            test_set = sorted(all_prot_idx - set(train_set))

        # obtain the feature vectors only for the genes in the training set
        X_train = P[train_set, :]
        # obtain the feature vectors only for the genes in the testing set
        X_test = P[test_set, :]
        # obtain the labels of the current term for the genes in the training set
        y_train = sparse.lil_matrix(train_mat_t[train_set, :])
        y_train = y_train[:, idx].toarray().flatten()

        classifier = svm.training(X_train, y_train, max_iter)
        score_testSet = svm.testing(classifier, X_test)
        predict = score_testSet.tolist()

        # get the current scores for the given term in the current fold
        curr_score = scores[idx].toarray().flatten()
        # for the test indices of the current label, set the scores
        curr_score[test_set] = predict
        # the training positives always get a score of 1
        curr_score[train_pos] = 1

        # add the scores of the current term to the combined score matrix
        scores[idx] = curr_score

    run_obj.goid_scores = scores
    run_obj.params_results = params_results
def run(run_obj):
    """
    Function to run FastSinkSource, FastSinkSourcePlus, Local and LocalPlus
    *goids_to_run*: goids for which to run the method.
        Must be a subset of the goids present in the ann_obj

    For each goid in *run_obj.goids_to_run*, the positives (values > 0) and
    negatives (values < 0) are taken from the goid's row of *run_obj.ann_matrix*,
    and SinkSourceBounds is run with them. For the "plus" algorithms
    (fastsinksourceplus, sinksourceplus, localplus) the negatives are not used.

    These entries of *run_obj.params* are read for every term:
      *alpha*, *max_iters*: passed to SinkSourceBounds as *a* and *max_iters*
      *compare_ranks*: if True, the run is repeated while comparing the ranks
          of each iteration with the final ranks of the first run
      *rank_all*: T/F option passed to SinkSourceBounds
      *rank_pos_neg*: either the test/left-out ann matrix (sparse), from which
          the left-out pos/neg of the term are taken, or a false-y value.
          Passing True is an error and the program quits
      *lambda*: optional, only used if *run_obj.net_obj.weight_gmw* is True

    Side effects on *run_obj*:
      *goid_scores*: csr matrix with the scores of each term that was run
      *params_results*: the process, update (and weighting) time are added to it
      *goid_rank_stats*: the per-iteration stats of each goid (only filled if *compare_ranks*)
    """
    params_results = run_obj.params_results
    # make sure the goid_scores matrix is reset
    # because if it isn't empty, overwriting the stored scores seems to be time consuming
    # NOTE: the builtin float is used because the `np.float` alias was removed in NumPy 1.24
    goid_scores = sp.lil_matrix(run_obj.ann_matrix.shape, dtype=float)
    goid_rank_stats = {}
    P, alg, params = run_obj.P, run_obj.name, run_obj.params
    print("Running %s with these parameters: %s" % (alg, params))

    # these don't change between terms, so only get them once
    verbose = run_obj.kwargs.get('verbose', False)
    alg_name = "%s%s" % (alg, run_obj.params_str)

    # run FastSinkSource on each GO term individually
    for goid in tqdm(run_obj.goids_to_run):
        idx = run_obj.ann_obj.goid2idx[goid]
        # get the row corresponding to the current goids annotations
        y = run_obj.ann_matrix[idx, :]
        positives = (y > 0).nonzero()[1]
        negatives = (y < 0).nonzero()[1]
        # if this method uses positive examples only, then remove the negative examples
        if alg in ["fastsinksourceplus", "sinksourceplus", "localplus"]:
            negatives = None

        if run_obj.net_obj.weight_gmw is True:
            start_time = time.process_time()
            # weight the network for each GO term individually.
            # P is replaced here and stays replaced until the next term is weighted
            W, process_time = run_obj.net_obj.weight_GMW(y.toarray()[0], goid)
            P = alg_utils.normalizeGraphEdgeWeights(
                W, ss_lambda=params.get('lambda', None))
            params_results['%s_weight_time' %
                           (alg)] += time.process_time() - start_time

        a, max_iters = params['alpha'], params['max_iters']
        compare_ranks = params['compare_ranks']
        # rank_all is a T/F option, but 'rank_pos_neg' will be the test/left-out ann matrix
        # from which we can get the left-out pos/neg for this term
        rank_all, rank_pos_neg = params['rank_all'], params['rank_pos_neg']
        if sp.issparse(rank_pos_neg):
            pos, neg = alg_utils.get_goid_pos_neg(rank_pos_neg, idx)
            rank_pos_neg = (set(pos), set(neg))
        elif rank_pos_neg is True:
            print("ERROR: rank_pos_neg must be the test_ann_mat")
            sys.exit()

        # now actually run the algorithm
        ss_obj = ss_bounds.SinkSourceBounds(P,
                                            positives,
                                            negatives=negatives,
                                            max_iters=max_iters,
                                            a=a,
                                            rank_all=rank_all,
                                            rank_pos_neg=rank_pos_neg,
                                            verbose=verbose)

        scores_arr = ss_obj.runSinkSourceBounds()
        process_time, update_time, iters, comp = ss_obj.get_stats()

        if verbose is True:
            tqdm.write("\t%s converged after %d iterations " % (alg, iters) +
                       "(%0.4f sec) for %s" % (process_time, goid))

        if compare_ranks:
            # compare how long it takes for the ranks to match the previous run
            tqdm.write(
                "\tRepeating the run, but comparing the ranks from the previous run at each iteration"
            )
            # keep only the nodes with a non-zero score
            scores = {n: s for n, s in enumerate(scores_arr) if s > 0}
            # ranks is a list containing the ranked order of nodes.
            # The node with the highest score is first, the lowest is last
            if isinstance(rank_pos_neg, tuple):
                # only rank the left-out pos/neg nodes.
                # rank_pos_neg is only a tuple if the test ann matrix was given.
                # Checking the type (rather than "is not None") means a value
                # of False ranks all of the nodes instead of raising a TypeError
                pos_neg_nodes = rank_pos_neg[0] | rank_pos_neg[1]
                ranks = sorted(set(scores.keys()) & pos_neg_nodes,
                               key=scores.get,
                               reverse=True)
            else:
                ranks = sorted(scores, key=scores.get, reverse=True)

            # left off top-k for now
            ss_obj = ss_bounds.SinkSourceBounds(P,
                                                positives,
                                                negatives=negatives,
                                                max_iters=max_iters,
                                                a=a,
                                                rank_all=rank_all,
                                                rank_pos_neg=rank_pos_neg,
                                                ranks_to_compare=ranks,
                                                verbose=verbose)
            ss_obj.runSinkSourceBounds()

            # one tab-delimited line of stats per iteration of the repeated run
            rank_stats = [
                "%d\t%d\t%0.4e\t%d\t%d\t%0.2e\t%0.2e\t%0.4f\t%0.4f\t%0.4f" %
                (len(positives), i + 1, ss_obj.kendalltau_list[i],
                 ss_obj.num_unranked_list[i],
                 ss_obj.max_unranked_stretch_list[i], ss_obj.max_d_list[i],
                 ss_obj.UB_list[i], ss_obj.eval_stats_list[i][0],
                 ss_obj.eval_stats_list[i][1], ss_obj.eval_stats_list[i][2])
                for i in range(ss_obj.num_iters)
            ]
            goid_rank_stats[goid] = rank_stats

        # limit the scores to the target nodes by setting the score of every other node to 0
        if len(run_obj.target_prots) != len(scores_arr):
            # NOTE: the `np.bool` alias was also removed in NumPy 1.24
            mask = np.ones(len(scores_arr), dtype=bool)
            mask[run_obj.target_prots] = 0
            scores_arr[mask] = 0
        goid_scores[idx] = scores_arr

        # also keep track of the time it takes for each of the parameter sets
        params_results["%s_process_time" % alg_name] += process_time
        params_results["%s_update_time" % alg_name] += update_time

    run_obj.goid_scores = goid_scores.tocsr()
    run_obj.params_results = params_results
    run_obj.goid_rank_stats = goid_rank_stats
    return