Example #1
def load_annotations(prots, dataset, input_dir, **kwargs):
    # limit the terms to whatever is specified either in the only_functions_file,
    # or the --goterm command-line option
    only_functions_file = None
    # if specific goterms are passed in, then ignore the only_functions_file
    if kwargs['goterm'] is None and dataset.get('only_functions_file', '') != '':
        only_functions_file = "%s/%s" % (input_dir, dataset['only_functions_file'])
    selected_terms = alg_utils.select_goterms(
        only_functions_file=only_functions_file, goterms=kwargs['goterm'])

    # now build the annotation matrix
    pos_neg_file = "%s/%s" % (input_dir, dataset['pos_neg_file'])
    obo_file = "%s/%s" % (input_dir, dataset['obo_file'])
    dag_matrix, ann_matrix, goids, ann_prots = setup.create_sparse_ann_file(
        obo_file, pos_neg_file, **kwargs)
    #ann_matrix, goids = setup.setup_sparse_annotations(pos_neg_file, selected_terms, prots)
    ann_obj = setup.Sparse_Annotations(dag_matrix, ann_matrix, goids,
                                       ann_prots)
    # apply youngs_neg here, before limiting the terms, so that restricting the terms won't change the result
    if kwargs.get('youngs_neg'):
        ann_obj = setup.youngs_neg(ann_obj, **kwargs)
    if kwargs.get('leaf_terms_only'):
        terms = selected_terms if selected_terms is not None else goids
        # limit the terms to only those that are the most specific (i.e., leaf terms),
        # meaning remove the ancestors of all terms
        leaf_terms = go_utils.get_most_specific_terms(terms, ann_obj=ann_obj)
        print("\t%d / %d terms are most specific, or leaf terms" %
              (len(leaf_terms), len(terms)))
        if selected_terms is not None:
            selected_terms &= leaf_terms
        else:
            selected_terms = leaf_terms
    if selected_terms is not None:
        ann_obj.limit_to_terms(selected_terms)
    else:
        selected_terms = goids
    # align the ann_matrix prots with the prots in the network
    ann_obj.reshape_to_prots(prots)

    eval_ann_obj = None
    # also check if an evaluation pos_neg_file was given
    if dataset.get('pos_neg_file_eval', '') != '':
        pos_neg_file_eval = "%s/%s" % (input_dir, dataset['pos_neg_file_eval'])
        dag_matrix, ann_matrix, goids, ann_prots = setup.create_sparse_ann_file(
            obo_file, pos_neg_file_eval, **kwargs)
        #ann_matrix, goids = setup.setup_sparse_annotations(pos_neg_file_eval, selected_terms, prots)
        eval_ann_obj = setup.Sparse_Annotations(dag_matrix, ann_matrix, goids,
                                                ann_prots)
        if kwargs.get('youngs_neg'):
            eval_ann_obj = setup.youngs_neg(eval_ann_obj, **kwargs)
        # also limit the terms in the eval_ann_obj to those from the pos_neg_file
        eval_ann_obj.limit_to_terms(selected_terms)
        eval_ann_obj.reshape_to_prots(prots)
    return selected_terms, ann_obj, eval_ann_obj
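
A hedged usage sketch of how load_annotations might be called, based only on the dataset keys and keyword arguments the function itself reads; the module name, file names, and protein IDs below are hypothetical placeholders.

# Hedged usage sketch: import path, file names, and IDs are hypothetical.
from annotation_loader import load_annotations   # hypothetical module name

dataset = {
    'pos_neg_file': 'pos-neg-bp.tsv.gz',   # hypothetical annotation file
    'obo_file': 'go-basic.obo',            # GO hierarchy file
    'only_functions_file': '',             # empty -> no term restriction from a file
    'pos_neg_file_eval': '',               # empty -> no separate evaluation annotations
}
prots = ['P12345', 'Q67890']               # protein IDs in the network's node order
selected_terms, ann_obj, eval_ann_obj = load_annotations(
    prots, dataset, 'inputs/annotations',
    goterm=None, youngs_neg=False, leaf_terms_only=False)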
Example #2
def run_cv_all_goterms(
        alg_runners, ann_obj, folds=5, num_reps=1, 
        cv_seed=None, **kwargs):
    """
    Split the positives and negatives into folds across all GO terms
    and then run the algorithms on those folds.
    Algorithms are all run on the same split of data. 
    *num_reps*: Number of times to repeat cross-validation.
        An output file will be written for each repeat.
    *cv_seed*: Seed to use for the random number generator when splitting the annotations into folds.
        If *num_reps* > 1, the seed will be incremented by 1 for each repetition.
    """
    ann_matrix = ann_obj.ann_matrix
    goids, prots = ann_obj.goids, ann_obj.prots

    # set the cv_seed if specified
    # 2019-06-26 BUG: If there are a different number of terms, or the order of the terms changed, then the results won't be the same
    #if cv_seed is not None:
    #    print("\nSetting the Random State seed to %d" % (cv_seed))
    #    np.random.seed(cv_seed)

    # first check to see if the algorithms have already been run
    # and if the results should be overwritten
    if kwargs['forcealg'] is True or len(goids) == 1:
        # runners_to_run is a list of runners for each repetition
        runners_to_run = {i: alg_runners for i in range(1,num_reps+1)}
    else:
        runners_to_run = {}
        # a different file is stored for each repetition, so check each one
        for rep in range(1,num_reps+1):
            curr_runners_to_run = [] 
            curr_seed = cv_seed
            if curr_seed is not None:
                # offset the seed by the current repetition index
                curr_seed += rep-1
            for run_obj in alg_runners:
                out_file = "%s/cv-%dfolds-rep%d%s%s.txt" % (
                    run_obj.out_dir, folds, rep,
                    "-seed%s"%curr_seed if curr_seed is not None else "", run_obj.params_str)
                if os.path.isfile(out_file):
                    print("%s already exists. Use --forcealg to overwite" % (out_file))
                else:
                    curr_runners_to_run.append(run_obj)
            runners_to_run[rep] = curr_runners_to_run

    # repeat the CV process the specified number of times
    for rep in range(1,num_reps+1):
        if len(runners_to_run[rep]) == 0:
            continue
        curr_seed = cv_seed
        if curr_seed is not None:
            # offset the seed by the current repetition index
            curr_seed += rep-1
        # split the annotation matrix into training and testing matrices K times
        ann_matrix_folds = split_cv_all_goterms(ann_obj, folds=folds, seed=curr_seed, **kwargs)

        for run_obj in runners_to_run[rep]:
            # because each fold contains a different set of positives, and combined they contain all positives,
            # store all of the prediction scores from each fold in a matrix
            combined_fold_scores = sparse.lil_matrix(ann_matrix.shape, dtype=float)
            for curr_fold, (train_ann_mat, test_ann_mat) in enumerate(ann_matrix_folds):
                print("*  "*20)
                print("Fold %d" % (curr_fold+1))

                # change the annotation matrix to the current fold
                curr_ann_obj = setup.Sparse_Annotations(train_ann_mat, goids, prots)
                # replace the ann_obj in the runner with the current fold's annotations  
                run_obj.ann_obj = curr_ann_obj
                run_obj.train_mat = train_ann_mat
                run_obj.test_mat = test_ann_mat
                #alg_runners = run_eval_algs.setup_runners([alg], alg_settings, net_obj, curr_ann_obj, **kwargs)
                # now setup the inputs for the runners
                run_obj.setupInputs()
                # run the alg
                run_obj.run()
                # parse the outputs. Only needed for the algs that write output files
                run_obj.setupOutputs()

                # store only the scores of the test (left out) positives and negatives
                for i in range(len(goids)):
                    test_pos, test_neg = alg_utils.get_goid_pos_neg(test_ann_mat, i)
                    curr_goid_scores = run_obj.goid_scores[i].toarray().flatten()
                    curr_comb_scores = combined_fold_scores[i].toarray().flatten()
                    curr_comb_scores[test_pos] = curr_goid_scores[test_pos]
                    curr_comb_scores[test_neg] = curr_goid_scores[test_neg]
                    combined_fold_scores[i] = curr_comb_scores 

            # replace the goid_scores in the runner with the combined_fold_scores to evaluate
            run_obj.goid_scores = combined_fold_scores 

            #curr_goids = dag_goids if alg == 'birgrank' else goids
            # now evaluate the results and write to a file
            out_file = "%s/cv-%dfolds-rep%d%s%s.txt" % (
                run_obj.out_dir, folds, rep,
                "-seed%s"%curr_seed if curr_seed is not None else "", run_obj.params_str)
            utils.checkDir(os.path.dirname(out_file)) 
            eval_utils.evaluate_ground_truth(
                run_obj, ann_obj, out_file,
                #non_pos_as_neg_eval=opts.non_pos_as_neg_eval,
                alg=run_obj.name, append=False, **kwargs)

    print("Finished running cross-validation")
    return
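
The docstring above describes splitting each term's positives and negatives into folds so that, combined, the test folds cover every annotation. A minimal, self-contained sketch of that splitting idea is shown below; it is illustrative only and is not the repository's split_cv_all_goterms (which may additionally stratify positives and negatives per term).

# Minimal sketch of the fold-splitting idea (not the repo's implementation):
# shuffle each term's annotated columns and assign them round-robin to folds.
import numpy as np
from scipy import sparse

def split_folds_sketch(ann_matrix, folds=5, seed=None):
    rng = np.random.default_rng(seed)
    ann_matrix = sparse.csr_matrix(ann_matrix)
    fold_masks = [sparse.lil_matrix(ann_matrix.shape, dtype=ann_matrix.dtype)
                  for _ in range(folds)]
    for i in range(ann_matrix.shape[0]):        # one row per term
        cols = ann_matrix[i].nonzero()[1]       # annotated proteins for this term
        rng.shuffle(cols)
        for j, c in enumerate(cols):
            fold_masks[j % folds][i, c] = ann_matrix[i, c]
    for k in range(folds):                      # yield (train, test) matrix pairs
        test = fold_masks[k].tocsr()
        yield ann_matrix - test, test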
Example #3
def run_and_eval_algs(run_obj,
                      ann_obj,
                      train_ann_mat,
                      test_ann_mat,
                      taxon=None,
                      **kwargs):
    goids, prots = ann_obj.goids, ann_obj.prots
    dag_matrix = ann_obj.dag_matrix
    params_results = defaultdict(int)

    if kwargs.get('keep_ann', False) is True:
        print("Keeping all annotations when making predictions")
    elif kwargs.get('non_pos_as_neg_eval', False) is True:
        print(
            "Evaluating using all non-ground-truth positives for the taxon as false positives"
        )
    else:
        print(
            "Evaluating using only the ground-truth negatives predicted as positives as false positives"
        )

    # change the annotation matrix to the current training positive examples
    curr_ann_obj = setup.Sparse_Annotations(dag_matrix, train_ann_mat, goids,
                                            prots)
    # make an ann obj with the test ann mat
    test_ann_obj = setup.Sparse_Annotations(dag_matrix, test_ann_mat, goids,
                                            prots)
    # if this is a gene based method, then run it on only the nodes which have a pos/neg annotation
    # unless specified otherwise by the "run_all_nodes" flag
    if run_obj.get_alg_type() == 'gene-based' and not run_obj.kwargs.get("run_all_nodes"):
        # sum the boolean of the columns, then use nonzero to get the columns with a nonzero value
        run_obj.kwargs['nodes_to_run'] = (test_ann_mat != 0).sum(axis=0).nonzero()[1]
        print("\trunning %s using only the %d pos/neg nodes" %
              (run_obj.name, len(run_obj.kwargs['nodes_to_run'])))

    # setup the output file. Could be used by the runners to write temp files or other output files
    exp_type = "loso"
    postfix = kwargs.get("postfix", "")
    if kwargs['keep_ann']:
        exp_type = "eval-per-taxon"
    out_file = "%s/%s%s%s.txt" % (run_obj.out_dir, exp_type,
                                  run_obj.params_str, postfix)
    run_obj.out_pref = out_file.replace('.txt', '')
    utils.checkDir(os.path.dirname(out_file))

    # for sinksource_bounds, keep track of which nodes are either a left-out pos or left-out neg
    if run_obj.name in ['sinksource_bounds', 'sinksourceplus_bounds']:
        run_obj.params['rank_pos_neg'] = test_ann_mat

    # if predictions were already generated, and taxon is set to 'all', then use those.
    # otherwise, generate the prediction scores
    if kwargs['keep_ann'] and run_obj.goid_scores.getnnz() != 0:
        print("Using already computed scores")
    else:
        # replace the ann_obj in the runner with the current training annotations
        run_obj.ann_obj = curr_ann_obj
        #alg_runners = run_eval_algs.setup_runners([alg], alg_settings, curr_ann_obj, **kwargs)
        if kwargs.get('verbose'):
            utils.print_memory_usage()
        run_obj.setupInputs()
        if kwargs.get('verbose'):
            utils.print_memory_usage()
        run_obj.run()
        if kwargs.get('verbose'):
            utils.print_memory_usage()
        run_obj.setupOutputs(taxon=taxon)

    # now evaluate
    # this will write a file containing the fmax and other measures for each goterm
    # with the taxon name in the name of the file
    eval_utils.evaluate_ground_truth(
        run_obj,
        test_ann_obj,
        out_file,
        #non_pos_as_neg_eval=opts.non_pos_as_neg_eval,
        taxon=taxon,
        append=True,
        **kwargs)
    for key in run_obj.params_results:
        params_results[key] += run_obj.params_results[key]

    return params_results
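
As a small illustration of the nodes_to_run expression used above, the toy matrix below shows how summing the nonzero mask over the rows picks out the columns (nodes) with at least one positive or negative annotation; the data is made up for the example.

# Toy illustration of the nodes_to_run selection (made-up data).
import numpy as np
from scipy import sparse

test_ann_mat = sparse.csr_matrix(np.array([
    [1, 0, -1, 0],   # term 1: node 0 is a positive, node 2 is a negative
    [0, 0,  1, 0],   # term 2: node 2 is a positive
]))
nodes_to_run = (test_ann_mat != 0).sum(axis=0).nonzero()[1]
print(nodes_to_run)  # [0 2] -> only the annotated nodes are run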
Example #4
def load_net_ann_datasets(out_dir, taxon, dataset, input_settings,
                          alg_settings, uniprot_taxon_file, **kwargs):
    sparse_net_file = "%s/%s-net.npz" % (out_dir, taxon)
    node2idx_file = sparse_net_file + "-node-ids.txt"
    swsn_weights_file = sparse_net_file + "-swsn-weights.txt"
    sparse_ann_file = "%s/ann.npz" % (out_dir)
    if not kwargs.get('forcenet') and \
            (os.path.isfile(sparse_net_file) and os.path.isfile(node2idx_file)) and \
            os.path.isfile(sparse_ann_file):
        print("Reading network from %s" % (sparse_net_file))
        W = sp.load_npz(sparse_net_file)
        print("\t%d nodes and %d edges" % (W.shape[0], len(W.data) / 2))
        print("Reading node names from %s" % (node2idx_file))
        prots = utils.readItemList(node2idx_file, 1)
        new_net_obj = setup.Sparse_Networks(W, prots)
        if os.path.isfile(swsn_weights_file):
            print("Reading swsn weights file %s" % (swsn_weights_file))
            weights = [
                float(w) for w in utils.readItemList(swsn_weights_file, 1)
            ]
            # also load the original networks to get the edge weights for the STRING networks
            net_obj = run_eval_algs.setup_net(input_settings['input_dir'],
                                              dataset, **kwargs)
            net_obj.swsn_weights = weights
        else:
            net_obj = new_net_obj
        print("\nReading annotation matrix from %s" % (sparse_ann_file))
        loaded_data = np.load(sparse_ann_file, allow_pickle=True)
        dag_matrix = setup.make_csr_from_components(loaded_data['arr_0'])
        ann_matrix = setup.make_csr_from_components(loaded_data['arr_1'])
        goids, prots = loaded_data['arr_2'], loaded_data['arr_3']
        ann_obj = setup.Sparse_Annotations(dag_matrix, ann_matrix, goids,
                                           prots)
        species_to_uniprot_idx = eval_loso.get_uniprot_species(
            uniprot_taxon_file, ann_obj)
        # TODO eval ann obj
        eval_ann_obj = None
    else:
        # load the network
        # TODO if a subset of the network was run, need to get that subset
        net_obj, ann_obj, eval_ann_obj = run_eval_algs.setup_dataset(
            dataset, input_settings['input_dir'], alg_settings, **kwargs)
        species_to_uniprot_idx = eval_loso.get_uniprot_species(
            uniprot_taxon_file, ann_obj)
        new_net_obj = net_obj
        # run SWSN if needed
        #if net_obj.multi_net:
        # TODO if LOSO was run, need to leave out the taxon for edge weights to be accurate
        if taxon is not None:
            if kwargs.get('limit_to_taxons_file'):
                # limit the network to the specified species
                # read in the specified taxons from the file
                _, net_taxons = eval_loso.get_selected_species(
                    species_to_uniprot_idx, kwargs['limit_to_taxons_file'])
                net_taxon_prots = net_exp.get_taxon_prots(
                    net_obj.nodes, net_taxons, species_to_uniprot_idx)
                net_obj, ann_obj = net_exp.limit_to_taxons(net_taxon_prots,
                                                           net_obj=net_obj,
                                                           ann_obj=ann_obj,
                                                           **kwargs)
            # leave out the annotations for this taxon ID
            train_ann_mat, test_ann_mat, sp_goterms = eval_loso.leave_out_taxon(
                taxon,
                ann_obj,
                species_to_uniprot_idx,
                eval_ann_obj=eval_ann_obj,
                **kwargs)
            taxon_prots = net_exp.get_taxon_prots(net_obj.nodes, [taxon],
                                                  species_to_uniprot_idx)
            new_net_obj = net_exp.limit_net_to_target_taxon(
                train_ann_mat, taxon_prots, net_obj, ann_obj, **kwargs)
            W = new_net_obj.W
        #    else:
        #        W, _ = net_obj.weight_SWSN(ann_obj.ann_matrix)
        #        #new_net_obj =
        else:
            W = net_obj.W
        print("\twriting sparse matrix to %s" % (sparse_net_file))
        sp.save_npz(sparse_net_file, W)
        print("\twriting node2idx labels to %s" % (node2idx_file))
        with open(node2idx_file, 'w') as out:
            out.write(''.join([
                "%s\t%d\n" % (prot, i) for i, prot in enumerate(net_obj.nodes)
            ]))
        if net_obj.multi_net:
            print("\twriting swsn weights file to %s" % (swsn_weights_file))
            with open(swsn_weights_file, 'w') as out:
                out.write('\n'.join([str(w)
                                     for w in new_net_obj.swsn_weights]) +
                          '\n')
                net_obj.swsn_weights = new_net_obj.swsn_weights
        # now store them to a file
        print("\twriting sparse annotations to %s" % (sparse_ann_file))
        # store all the data in the same file
        dag_matrix_data = setup.get_csr_components(ann_obj.dag_matrix)
        ann_matrix_data = setup.get_csr_components(ann_obj.ann_matrix)
        #np.savez_compressed(
        #    sparse_ann_file, dag_matrix_data=dag_matrix_data,
        #    ann_matrix_data=ann_matrix_data, goids=goids, prots=prots)
        np.savez_compressed(sparse_ann_file, dag_matrix_data, ann_matrix_data,
                            ann_obj.goids, ann_obj.prots)
    return net_obj, new_net_obj, ann_obj, eval_ann_obj, species_to_uniprot_idx
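
A minimal sketch of the save/load pattern used for sparse_ann_file above, assuming setup.get_csr_components and setup.make_csr_from_components simply pack and unpack a SciPy CSR matrix's (data, indices, indptr, shape) pieces; the helper bodies here are illustrative stand-ins, not the repository's implementations.

# Illustrative stand-ins for get_csr_components / make_csr_from_components,
# assuming they pack/unpack a CSR matrix's (data, indices, indptr, shape).
import numpy as np
from scipy import sparse

def get_csr_components(A):
    A = sparse.csr_matrix(A)
    comp = np.empty(4, dtype=object)
    comp[0], comp[1], comp[2], comp[3] = A.data, A.indices, A.indptr, A.shape
    return comp

def make_csr_from_components(comp):
    data, indices, indptr, shape = comp
    return sparse.csr_matrix((data, indices, indptr), shape=shape)

A = sparse.random(5, 4, density=0.3, format='csr')
np.savez_compressed('ann.npz', get_csr_components(A))    # stored under 'arr_0'
loaded = np.load('ann.npz', allow_pickle=True)
B = make_csr_from_components(loaded['arr_0'])
assert (A != B).nnz == 0                                 # round-trips exactly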