Example #1
def run(config_map, **kwargs):
    input_settings = config_map['input_settings']
    input_dir = input_settings['input_dir']
    alg_settings = config_map['algs']
    output_settings = config_map['output_settings']
    postfix = kwargs.get("postfix")
    # combine the evaluation settings in the config file and the kwargs
    kwargs.update(config_map['eval_settings'])
    # if specified, use this postfix, meaning overwrite the postfix from the yaml file
    if postfix is not None:
        kwargs['postfix'] = postfix
    # otherwise use the default empty string
    elif kwargs.get('postfix') is None:
        kwargs['postfix'] = ""

    for dataset in input_settings['datasets']:
        # add options specified for this dataset to kwargs
        # youngs_neg: for a term t, a gene g cannot be a negative for t if g shares an annotation with any gene annotated to t
        kwargs['youngs_neg'] = dataset.get('youngs_neg')
        # leaf_terms_only: limit the terms to only those that are the most specific, meaning remove the ancestors of all terms
        kwargs['leaf_terms_only'] = dataset.get('leaf_terms_only')
        # sp_leaf_terms_only: like leaf_terms_only, but applied per species
        kwargs['sp_leaf_terms_only'] = dataset.get('sp_leaf_terms_only')

        net_obj, ann_obj, eval_ann_obj = setup_dataset(dataset, input_dir,
                                                       alg_settings, **kwargs)
        # if there are no annotations, then skip this dataset
        if len(ann_obj.goids) == 0:
            print("No terms found. Skipping this dataset")
            continue
        # the outputs will follow this structure:
        # outputs/<net_version>/<exp_name>/<alg_name>/output_files
        out_dir = "%s/%s/%s/" % (output_settings['output_dir'],
                                 dataset['net_version'], dataset['exp_name'])
        alg_runners = setup_runners(alg_settings, net_obj, ann_obj, out_dir,
                                    **kwargs)

        # first run prediction mode since it is the fastest
        if kwargs['only_eval'] is False:
            # run algorithms in "prediction" mode
            run_algs(alg_runners, **kwargs)
            # if specified, write the SWSN combined network to a file
            save_net = dataset['net_settings'].get(
                'save_net', None) if 'net_settings' in dataset else None
            if net_obj.weight_swsn is True and save_net is not None:
                out_file = "%s/%s/%s" % (input_dir, dataset['net_version'],
                                         save_net)
                # the SWSN network is part of the runner object. Need to organize that better
                net_obj.save_net(out_file)

            # if a pos_neg_file_eval was passed in (e.g., for temporal holdout validation),
            # use it to evaluate the predictions
            if eval_ann_obj is not None:
                exp_type = "eval"
                # For LOSO, 'all-sp-loso' was used in the past
                #if kwargs.get('keep_ann') is not None:
                #    exp_type="all-sp-loso"
                for run_obj in alg_runners:
                    out_file = "%s/%s%s%s.txt" % (run_obj.out_dir, exp_type,
                                                  run_obj.params_str,
                                                  kwargs.get("postfix", ""))
                    utils.checkDir(os.path.dirname(out_file))
                    eval_utils.evaluate_ground_truth(run_obj, eval_ann_obj,
                                                     out_file, **kwargs)

        if kwargs['cross_validation_folds'] is not None:
            # run cross validation
            cross_validation.run_cv_all_goterms(
                alg_runners,
                ann_obj,
                folds=kwargs['cross_validation_folds'],
                **kwargs)

        if kwargs['loso'] is True:
            # add the taxon file paths for this dataset to kwargs
            for arg in ['taxon_file', 'only_taxon_file']:
                kwargs[arg] = "%s/%s" % (input_dir, dataset[arg])
            # now run the leave-one-species-out eval
            eval_loso.eval_loso(alg_runners,
                                ann_obj,
                                eval_ann_obj=eval_ann_obj,
                                **kwargs)
Example #2
def run_cv_all_goterms(
        alg_runners, ann_obj, folds=5, num_reps=1, 
        cv_seed=None, **kwargs):
    """
    Split the positives and negatives into folds across all GO terms
    and then run the algorithms on those folds.
    Algorithms are all run on the same split of data. 
    *num_reps*: Number of times to repeat cross-validation. 
    An output file will be written for each repeat
    *cv_seed*: Seed to use for the random number generator when splitting the annotations into folds
        If *num_reps* > 1, the seed will be incremented by 1 each time
    """
    ann_matrix = ann_obj.ann_matrix
    goids, prots = ann_obj.goids, ann_obj.prots

    # set the cv_seed if specified
    # 2019-06-26 BUG: If there are a different number of terms, or the order of the terms changed, then the results won't be the same
    #if cv_seed is not None:
    #    print("\nSetting the Random State seed to %d" % (cv_seed))
    #    np.random.seed(cv_seed)

    # first check to see if the algorithms have already been run
    # and if the results should be overwritten
    if kwargs['forcealg'] is True or len(goids) == 1:
        # runners_to_run maps each repetition to the full list of runners
        runners_to_run = {i: alg_runners for i in range(1,num_reps+1)}
    else:
        runners_to_run = {}
        # a different output file is stored for each repetition, so check each one
        for rep in range(1,num_reps+1):
            curr_runners_to_run = [] 
            curr_seed = cv_seed
            if curr_seed is not None:
                # add the current repetition number to the seed
                curr_seed += rep-1
            for run_obj in alg_runners:
                out_file = "%s/cv-%dfolds-rep%d%s%s.txt" % (
                    run_obj.out_dir, folds, rep,
                    "-seed%s"%curr_seed if curr_seed is not None else "", run_obj.params_str)
                if os.path.isfile(out_file):
                    print("%s already exists. Use --forcealg to overwite" % (out_file))
                else:
                    curr_runners_to_run.append(run_obj)
            runners_to_run[rep] = curr_runners_to_run

    # repeat the CV process the specified number of times
    for rep in range(1,num_reps+1):
        if len(runners_to_run[rep]) == 0:
            continue
        curr_seed = cv_seed
        if curr_seed is not None:
            # add the current repetition number to the seed
            curr_seed += rep-1
        # split the annotation matrix into training and testing matrices K times
        ann_matrix_folds = split_cv_all_goterms(ann_obj, folds=folds, seed=curr_seed, **kwargs)

        for run_obj in runners_to_run[rep]:
            # because each fold contains a different set of positives, and combined they contain all positives,
            # store all of the prediction scores from each fold in a matrix
            combined_fold_scores = sparse.lil_matrix(ann_matrix.shape, dtype=float)
            for curr_fold, (train_ann_mat, test_ann_mat) in enumerate(ann_matrix_folds):
                print("*  "*20)
                print("Fold %d" % (curr_fold+1))

                # change the annotation matrix to the current fold
                curr_ann_obj = setup.Sparse_Annotations(train_ann_mat, goids, prots)
                # replace the ann_obj in the runner with the current fold's annotations  
                run_obj.ann_obj = curr_ann_obj
                run_obj.train_mat = train_ann_mat
                run_obj.test_mat = test_ann_mat
                #alg_runners = run_eval_algs.setup_runners([alg], alg_settings, net_obj, curr_ann_obj, **kwargs)
                # now setup the inputs for the runners
                run_obj.setupInputs()
                # run the alg
                run_obj.run()
                # parse the outputs. Only needed for the algs that write output files
                run_obj.setupOutputs()

                # store only the scores of the test (left out) positives and negatives
                for i in range(len(goids)):
                    test_pos, test_neg = alg_utils.get_goid_pos_neg(test_ann_mat, i)
                    curr_goid_scores = run_obj.goid_scores[i].toarray().flatten()
                    curr_comb_scores = combined_fold_scores[i].toarray().flatten()
                    curr_comb_scores[test_pos] = curr_goid_scores[test_pos]
                    curr_comb_scores[test_neg] = curr_goid_scores[test_neg]
                    combined_fold_scores[i] = curr_comb_scores 

            # replace the goid_scores in the runner to combined_fold_scores to evaluate
            run_obj.goid_scores = combined_fold_scores 

            #curr_goids = dag_goids if alg == 'birgrank' else goids
            # now evaluate the results and write to a file
            out_file = "%s/cv-%dfolds-rep%d%s%s.txt" % (
                run_obj.out_dir, folds, rep,
                "-seed%s"%curr_seed if curr_seed is not None else "", run_obj.params_str)
            utils.checkDir(os.path.dirname(out_file)) 
            eval_utils.evaluate_ground_truth(
                run_obj, ann_obj, out_file,
                #non_pos_as_neg_eval=opts.non_pos_as_neg_eval,
                alg=run_obj.name, append=False, **kwargs)

    print("Finished running cross-validation")
    return
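The docstring above describes the splitting scheme but not its mechanics. The sketch below illustrates the idea, assuming an annotation matrix with +1 for positives and -1 for negatives; the function name and details are illustrative only, not the repository's split_cv_all_goterms.

import numpy as np
from scipy import sparse

def split_folds_sketch(ann_matrix, folds=5, seed=None):
    """Illustration only (not the repository's split_cv_all_goterms):
    split each term's positives (+1) and negatives (-1) into `folds`
    disjoint test sets and yield (train, test) matrix pairs."""
    rng = np.random.default_rng(seed)
    ann_csr = sparse.csr_matrix(ann_matrix)
    num_terms = ann_csr.shape[0]
    fold_test = [sparse.lil_matrix(ann_csr.shape) for _ in range(folds)]
    for t in range(num_terms):
        row = ann_csr[t].toarray().flatten()
        for label in (1, -1):
            idx = np.where(row == label)[0]
            rng.shuffle(idx)
            # deal the shuffled examples out to the folds round-robin
            for f in range(folds):
                cols = idx[f::folds]
                if len(cols) > 0:
                    fold_test[f][t, cols] = label
    for f in range(folds):
        test_mat = fold_test[f].tocsr()
        train_mat = (ann_csr - test_mat).tocsr()
        train_mat.eliminate_zeros()
        yield train_mat, test_mat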
Example #3
def run_and_eval_algs(run_obj,
                      ann_obj,
                      train_ann_mat,
                      test_ann_mat,
                      taxon=None,
                      **kwargs):
    goids, prots = ann_obj.goids, ann_obj.prots
    dag_matrix = ann_obj.dag_matrix
    params_results = defaultdict(int)

    if kwargs.get('keep_ann', False) is True:
        print("Keeping all annotations when making predictions")
    elif kwargs.get('non_pos_as_neg_eval', False) is True:
        print(
            "Evaluating using all non-ground-truth positives for the taxon as false positives"
        )
    else:
        print(
            "Evaluating using only the ground-truth negatives predicted as positives as false positives"
        )

    # change the annotation matrix to the current training positive examples
    curr_ann_obj = setup.Sparse_Annotations(dag_matrix, train_ann_mat, goids,
                                            prots)
    # make an ann obj with the test ann mat
    test_ann_obj = setup.Sparse_Annotations(dag_matrix, test_ann_mat, goids,
                                            prots)
    # if this is a gene-based method, then run it on only the nodes which have a pos/neg annotation,
    # unless specified otherwise by the "run_all_nodes" flag
    if run_obj.get_alg_type() == 'gene-based' and not run_obj.kwargs.get("run_all_nodes"):
        # sum the boolean of the columns, then use nonzero to get the columns with a nonzero value
        run_obj.kwargs['nodes_to_run'] = (test_ann_mat != 0).sum(axis=0).nonzero()[1]
        print("\trunning %s using only the %d pos/neg nodes" % (
            run_obj.name, len(run_obj.kwargs['nodes_to_run'])))

    # setup the output file. Could be used by the runners to write temp files or other output files
    exp_type = "loso"
    postfix = kwargs.get("postfix", "")
    if kwargs['keep_ann']:
        exp_type = "eval-per-taxon"
    out_file = "%s/%s%s%s.txt" % (run_obj.out_dir, exp_type,
                                  run_obj.params_str, postfix)
    run_obj.out_pref = out_file.replace('.txt', '')
    utils.checkDir(os.path.dirname(out_file))

    # for sinksource_bounds, keep track of which nodes are either a left-out pos or left-out neg
    if run_obj.name in ['sinksource_bounds', 'sinksourceplus_bounds']:
        run_obj.params['rank_pos_neg'] = test_ann_mat

    # if predictions were already generated, and taxon is set to 'all', then use those.
    # otherwise, generate the prediction scores
    if kwargs['keep_ann'] and run_obj.goid_scores.getnnz() != 0:
        print("Using already computed scores")
    else:
        # replace the ann_obj in the runner with the current training annotations
        run_obj.ann_obj = curr_ann_obj
        #alg_runners = run_eval_algs.setup_runners([alg], alg_settings, curr_ann_obj, **kwargs)
        if kwargs.get('verbose'):
            utils.print_memory_usage()
        run_obj.setupInputs()
        if kwargs.get('verbose'):
            utils.print_memory_usage()
        run_obj.run()
        if kwargs.get('verbose'):
            utils.print_memory_usage()
        run_obj.setupOutputs(taxon=taxon)

    # now evaluate
    # this will write a file containing the fmax and other measures for each goterm
    # with the taxon name in the name of the file
    eval_utils.evaluate_ground_truth(
        run_obj,
        test_ann_obj,
        out_file,
        #non_pos_as_neg_eval=opts.non_pos_as_neg_eval,
        taxon=taxon,
        append=True,
        **kwargs)
    for key in run_obj.params_results:
        params_results[key] += run_obj.params_results[key]

    return params_results
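In the repository this function is presumably driven by eval_loso.eval_loso, which builds the per-taxon train/test split (Example #1 calls it with the taxon files added to kwargs). Below is a hedged sketch of what such a driver loop might look like; loso_driver_sketch and taxon_prots are made-up names, and the real eval_loso likely does more (e.g., skipping taxa with too few annotations).

import numpy as np
from scipy import sparse
from collections import defaultdict

def loso_driver_sketch(alg_runners, ann_obj, taxon_prots, **kwargs):
    """Hedged sketch only; the actual driver is eval_loso.eval_loso.
    `taxon_prots` (hypothetical) maps each taxon to the column indices
    of its proteins in ann_obj.ann_matrix."""
    ann_csr = ann_obj.ann_matrix.tocsr()
    num_prots = ann_csr.shape[1]
    total_params_results = defaultdict(int)
    for taxon, prot_idx in taxon_prots.items():
        # keep only this taxon's columns as the held-out test annotations
        mask = np.zeros(num_prots)
        mask[prot_idx] = 1.0
        test_ann_mat = (ann_csr @ sparse.diags(mask)).tocsr()
        train_ann_mat = (ann_csr - test_ann_mat).tocsr()
        for run_obj in alg_runners:
            curr_results = run_and_eval_algs(
                run_obj, ann_obj, train_ann_mat, test_ann_mat,
                taxon=taxon, **kwargs)
            for key, val in curr_results.items():
                total_params_results[key] += val
    return total_params_results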