def predict(seq, aa_cut=0, percent_peptide=0, model=None, model_file=None):
    # assumes module-level imports: pickle, pandas, plus this package's feat and util modules
    assert not (model is None and model_file is None), "you have to specify either a model or a model_file"

    if model is None:
        try:
            with open(model_file, 'rb') as f:
                model, learn_options = pickle.load(f)
        except IOError:
            raise Exception("could not find model stored to file %s" % model_file)
    else:
        model, learn_options = model

    learn_options["V"] = 2

    # Y, feature_sets, target_genes, learn_options, num_proc = setup(test=False, order=2, learn_options=learn_options, data_file=test_filename)
    # inputs, dim, dimsum, feature_names = pd.concatenate_feature_sets(feature_sets)

    Xdf = pandas.DataFrame(columns=[u'30mer', u'Strand'], data=[[seq, 'NA']])
    gene_position = pandas.DataFrame(columns=[u'Percent Peptide', u'Amino Acid Cut position'],
                                     data=[[percent_peptide, aa_cut]])
    feature_sets = feat.featurize_data(Xdf, learn_options, pandas.DataFrame(), gene_position)
    inputs, dim, dimsum, feature_names = util.concatenate_feature_sets(feature_sets)

    # call to scikit-learn, returns a vector of predicted values
    return model.predict(inputs)[0]
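# A minimal usage sketch for the single-sequence predict() above. The 30mer and the
# model path are hypothetical placeholders; the pickle is assumed to hold a
# (model, learn_options) tuple, which is what the loading code above unpacks.
score = predict(
    "TGGAGGCTGCTTTACCCGCTGTGGGGGCGC",  # 30mer: 4 nt context + 20 nt guide + NGG PAM + 3 nt context
    aa_cut=30,
    percent_peptide=50,
    model_file="saved_models/V3_model_full.pickle",  # hypothetical path
)
print(score)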
def setup(test=False, order=1, learn_options=None, data_file=None, pam_audit=True, length_audit=True):
    num_proc = shared_setup(learn_options, order, test)

    assert "testing_non_binary_target_name" in learn_options.keys(), "need this in order to get metrics, though used to be not needed, so you may newly see this error"
    if learn_options["testing_non_binary_target_name"] not in ['ranks', 'raw', 'thrs']:
        raise Exception('learn_options["testing_non_binary_target_name"] must be in ["ranks", "raw", "thrs"]')

    Xdf, Y, gene_position, target_genes = load_data.from_file(data_file, learn_options)
    learn_options['all_genes'] = target_genes

    if test:
        learn_options["order"] = 1

    if 'convert_30mer_to_31mer' in learn_options and learn_options['convert_30mer_to_31mer'] is True:
        print("WARNING!!! converting 30 mer to 31 mer (and then cutting off first nucleotide to go back to 30mer with a right shift)")
        for i in range(Xdf.shape[0]):
            Xdf['30mer'].iloc[i] = util.convert_to_thirty_one(Xdf.iloc[i]["30mer"], Xdf.index.values[i][1], Xdf.iloc[i]["Strand"])
        # to_keep = Xdf['30mer'].isnull() == False
        # Xdf = Xdf[to_keep]
        # gene_position = gene_position[to_keep]
        # Y = Y[to_keep]
        Xdf["30mer"] = Xdf["30mer"].apply(lambda x: x[1:])  # chop off the first nucleotide

    if 'left_right_guide_ind' in learn_options and learn_options['left_right_guide_ind'] is not None:
        seq_start, seq_end, expected_length = learn_options['left_right_guide_ind']
        Xdf['30mer'] = Xdf['30mer'].apply(lambda seq: seq[seq_start:seq_end])

    feature_sets = feat.featurize_data(Xdf, learn_options, Y, gene_position, pam_audit=pam_audit, length_audit=length_audit)
    np.random.seed(learn_options['seed'])

    return Y, feature_sets, target_genes, learn_options, num_proc
def predict(seq, aa_cut=0, percent_peptide=0, model=None, model_file=None):
    assert not (model is None and model_file is None), "you have to specify either a model or a model_file"

    if model is None:
        try:
            with open(model_file, 'rb') as f:
                model, learn_options = pickle.load(f)
        except IOError:
            raise Exception("could not find model stored to file %s" % model_file)
    else:
        model, learn_options = model

    learn_options["V"] = 2

    # Y, feature_sets, target_genes, learn_options, num_proc = setup(test=False, order=2, learn_options=learn_options, data_file=test_filename)
    # inputs, dim, dimsum, feature_names = pd.concatenate_feature_sets(feature_sets)

    Xdf = pandas.DataFrame(columns=[u'30mer', u'Strand'], data=[[seq, 'NA']])
    gene_position = pandas.DataFrame(columns=[u'Percent Peptide', u'Amino Acid Cut position'],
                                     data=[[percent_peptide, aa_cut]])
    feature_sets = feat.featurize_data(Xdf, learn_options, pandas.DataFrame(), gene_position)
    inputs, dim, dimsum, feature_names = concatenate_feature_sets(feature_sets)

    # call to scikit-learn, returns a vector of predicted values
    return model.predict(inputs)[0]
def get_proximal_5mer_feature(data_df):
    # get_5mer is assumed to be defined elsewhere in this module
    proximal_5mer = data_df['30mer'].apply(get_5mer)
    proximal_5mer.name = "proximal_5mers"
    proximal_5mer = pd.DataFrame(proximal_5mer)
    proximal_5mer_counts = proximal_5mer.groupby(["proximal_5mers"]).size().reset_index()
    proximal_5mer = proximal_5mer.merge(proximal_5mer_counts, on="proximal_5mers")
    proximal_5mer = proximal_5mer.rename(columns={0: 'proximal_5mer_counts'})
    return proximal_5mer


if __name__ == '__main__':
    feature_df = pd.read_csv("../../../../../results/cleaned_c_elegans_30mers_energies.csv")
    # learn_options is assumed to be defined at module level (e.g., imported alongside featurize_data)
    features = featurize_data(feature_df, learn_options=learn_options, Y=feature_df, gene_position=feature_df)
    features['proximal_5mer'] = get_proximal_5mer_feature(feature_df)
    inputs, dim, dimsum, feature_names = concatenate_feature_sets(features)
    doench_df = pd.DataFrame(inputs, columns=feature_names)
    feature_df = feature_df.join(doench_df)
    feature_df = feature_df.drop(axis=1, labels=['sgRNA', 'Gene target', '30mer', 'WormsInjected', 'SuccessfulInjections'])
    feature_df = pd.get_dummies(feature_df).dropna(axis=0)
    if any(feature_df.columns.duplicated()):
        feature_df = feature_df.loc[:, ~feature_df.columns.duplicated()]
    feature_df = feature_df.rename(columns={"SuccessRate": "target"})
    print(feature_df.shape)
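# An illustrative sketch of get_proximal_5mer_feature() on a toy DataFrame. The 30mers
# are made-up placeholders, and this stand-in get_5mer simply takes the 5 bases just
# 5' of the PAM; the module's real get_5mer (not shown above) may differ.
def get_5mer(seq):
    # placeholder: last 5 protospacer bases of a 30mer (4 nt context + 20 nt guide + PAM + 3 nt)
    return seq[19:24]

toy_df = pd.DataFrame({'30mer': [
    "TGGAGGCTGCTTTACCCGCTGTGGGGGCGC",
    "ACAGCTGATCTCCAGATATGACCATGGGTT",
    "TGGAGGCTGCTTTACCCGCTGTGGGGGCGC",
]})
print(get_proximal_5mer_feature(toy_df))  # one row per guide: its proximal 5-mer and that 5-mer's count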
def setup(test=False, order=1, learn_options=None, data_file=None):
    if 'num_proc' not in learn_options.keys():
        learn_options['num_proc'] = None
    if 'num_thread_per_proc' not in learn_options.keys():
        learn_options['num_thread_per_proc'] = None

    num_proc = local_multiprocessing.configure(TEST=test, num_proc=learn_options["num_proc"],
                                               num_thread_per_proc=learn_options["num_thread_per_proc"])
    learn_options["num_proc"] = num_proc

    learn_options["order"] = order  # gets used many places in code, not just here

    if "cv" not in learn_options.keys():
        # if no CV preference is specified, use leave-one-gene-out
        learn_options["cv"] = "gene"

    if "normalize_features" not in learn_options.keys():
        # default to normalizing features
        learn_options["normalize_features"] = True

    if "weighted" not in learn_options.keys():
        learn_options['weighted'] = None

    if "all pairs" not in learn_options.keys():
        learn_options["all pairs"] = False

    if "include_known_pairs" not in learn_options.keys():
        learn_options["include_known_pairs"] = False

    if "include_gene_guide_feature" not in learn_options.keys():
        learn_options["include_gene_guide_feature"] = 0  # used as window size, so 0 is none

    # these should default to true to match experiments before they were options:
    if "gc_features" not in learn_options.keys():
        learn_options["gc_features"] = True
    if "nuc_features" not in learn_options.keys():
        learn_options["nuc_features"] = True

    if 'train_genes' not in learn_options.keys():
        learn_options["train_genes"] = None
    if 'test_genes' not in learn_options.keys():
        learn_options["test_genes"] = None

    if "num_proc" not in learn_options:
        learn_options["num_proc"] = None
    if "num_thread_per_proc" not in learn_options:
        learn_options["num_thread_per_proc"] = None
    if 'seed' not in learn_options:
        learn_options['seed'] = 1
    if "flipV1target" not in learn_options:
        learn_options["flipV1target"] = False
    if 'num_genes_remove_train' not in learn_options:
        learn_options['num_genes_remove_train'] = None
    if "include_microhomology" not in learn_options:
        learn_options["include_microhomology"] = False

    assert "testing_non_binary_target_name" in learn_options.keys(), "need this in order to get metrics, though used to be not needed, so you may newly see this error"
    if learn_options["testing_non_binary_target_name"] not in ['ranks', 'raw', 'thrs']:
        raise Exception('learn_options["testing_non_binary_target_name"] must be in ["ranks", "raw", "thrs"]')

    Xdf, Y, gene_position, target_genes = load_data.from_file(data_file, learn_options)
    learn_options['all_genes'] = target_genes

    if test:
        learn_options["order"] = 1

    feature_sets = feat.featurize_data(Xdf, learn_options, Y, gene_position)
    np.random.seed(learn_options['seed'])

    return Y, feature_sets, target_genes, learn_options, num_proc
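# A hedged usage sketch for the setup() above. The data-file path and the learn_options
# values are placeholders; only "testing_non_binary_target_name" is checked explicitly here
# (everything else falls back to the defaults set above), though load_data.from_file may
# expect additional keys depending on the data file.
learn_options = {"testing_non_binary_target_name": "ranks"}
Y, feature_sets, target_genes, learn_options, num_proc = setup(
    test=False,
    order=2,
    learn_options=learn_options,
    data_file="path/to/training_data.csv",  # hypothetical path
)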
def predict(seq, aa_cut=None, percent_peptide=None, model=None, model_file=None, pam_audit=True, length_audit=False, learn_options_override=None):
    """
    if pam_audit==False, then it will not check for GG in the expected position
    this is useful if predicting on PAM mismatches, such as with off-target
    """
    # assumes module-level imports: os, pickle, numpy as np, pandas, plus this package's feat/util helpers and override_learn_options
    print("predict function running")
    # assert not (model is None and model_file is None), "you have to specify either a model or a model_file"

    assert isinstance(seq, np.ndarray), "Please ensure seq is a numpy array"
    assert len(seq[0]) > 0, "Make sure that seq is not empty"
    assert isinstance(seq[0], str), "Please ensure input sequences are in string format, i.e. 'AGAG' rather than ['A' 'G' 'A' 'G'] or alternate representations"

    if aa_cut is not None:
        assert len(aa_cut) > 0, "Make sure that aa_cut is not empty"
        assert isinstance(aa_cut, np.ndarray), "Please ensure aa_cut is a numpy array"
        assert np.all(np.isreal(aa_cut)), "amino-acid cut position needs to be a real number"

    if percent_peptide is not None:
        assert len(percent_peptide) > 0, "Make sure that percent_peptide is not empty"
        assert isinstance(percent_peptide, np.ndarray), "Please ensure percent_peptide is a numpy array"
        assert np.all(np.isreal(percent_peptide)), "percent_peptide needs to be a real number"

    if model_file is None:
        azimuth_saved_model_dir = os.path.join(os.path.dirname(__file__), 'saved_models')
        if np.any(percent_peptide == -1) or (percent_peptide is None and aa_cut is None):
            print("No model file specified, using V3_model_nopos")
            model_name = 'V3_model_nopos.pickle'
        else:
            print("No model file specified, using V3_model_full")
            model_name = 'V3_model_full.pickle'
        model_file = os.path.join(azimuth_saved_model_dir, model_name)

    if model is None:
        with open(model_file, 'rb') as f:
            model, learn_options = pickle.load(f)
        print(model_file)
        print(learn_options)
    else:
        model, learn_options = model

    learn_options["V"] = 2
    learn_options = override_learn_options(learn_options_override, learn_options)

    # Y, feature_sets, target_genes, learn_options, num_proc = setup(test=False, order=2, learn_options=learn_options, data_file=test_filename)
    # inputs, dim, dimsum, feature_names = pd.concatenate_feature_sets(feature_sets)

    Xdf = pandas.DataFrame(columns=[u'30mer', u'Strand'], data=list(zip(seq, ['NA' for x in range(len(seq))])))

    if np.all(percent_peptide != -1) and (percent_peptide is not None and aa_cut is not None):
        gene_position = pandas.DataFrame(columns=[u'Percent Peptide', u'Amino Acid Cut position'],
                                         data=list(zip(percent_peptide, aa_cut)))
    else:
        gene_position = pandas.DataFrame(columns=[u'Percent Peptide', u'Amino Acid Cut position'],
                                         data=list(zip(np.ones(seq.shape[0]) * -1, np.ones(seq.shape[0]) * -1)))

    feature_sets = feat.featurize_data(Xdf, learn_options, pandas.DataFrame(), gene_position, pam_audit=pam_audit, length_audit=length_audit)
    inputs, dim, dimsum, feature_names = util.concatenate_feature_sets(feature_sets)

    # call to scikit-learn, returns a vector of predicted values
    preds = model.predict(inputs)

    # also check that predictions are not 0/1 from a classifier.predict() (instead of predict_proba() or decision_function())
    assert not np.all(np.isin(preds, [0, 1])), "model returned only 0s and 1s"

    return preds
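# A minimal sketch of calling the array-based predict() above with no explicit model_file,
# assuming the bundled V3 pickles sit in saved_models/ next to this module. The 30mers are
# placeholders; leaving aa_cut and percent_peptide as None selects V3_model_nopos.
seqs = np.array([
    "TGGAGGCTGCTTTACCCGCTGTGGGGGCGC",
    "ACAGCTGATCTCCAGATATGACCATGGGTT",
])
scores = predict(seqs, aa_cut=None, percent_peptide=None)
print(scores)  # one predicted activity score per 30mer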