from sklearn.datasets import load_iris
from alipy.experiment import AlExperiment

# Get the data
X, y = load_iris(return_X_y=True)

# Iterate over the built-in alipy query-strategy names.
# NOTE(review): the original list contained both 'QureyExpectedErrorReduction'
# (misspelled) and 'QueryExpectedErrorReduction'; the misspelled duplicate was
# removed because it does not match alipy's class name and would fail when
# passed to set_query_strategy().
for strategy in [
        'QueryInstanceQBC', 'QueryInstanceUncertainty', 'QueryInstanceRandom',
        'QueryInstanceGraphDensity',
        'QueryInstanceQUIRE', 'QueryInstanceBMDR', 'QueryInstanceSPAL',
        'QueryInstanceLAL', 'QueryExpectedErrorReduction'
]:
    # init the AlExperiment: stop after 50 queries, evaluated on 5 splits
    al = AlExperiment(X,
                      y,
                      stopping_criteria='num_of_queries',
                      stopping_value=50)

    # split the data by using split_AL()
    al.split_AL(split_count=5)

    # al.set_query_strategy(strategy=strategy)

    # al.set_performance_metric('accuracy_score')

    # al.start_query(multi_thread=True)

    # or set the data split indexes by input the specific parameters
    from alipy.data_manipulate import split

    # NOTE(review): the statement below is truncated in this extract — the
    # remaining keyword arguments of split() are missing at the snippet
    # boundary that follows.
    train, test, lab, unlab = split(X=X,
# --- Exemplo n.º 2 (code-search snippet boundary marker; the split() call
# above is truncated at this boundary) ---
# NOTE(review): this snippet assumes `df`, `encode_and_bind`, `np`,
# `LogisticRegression` and `ExperimentAnalyser` are defined/imported in an
# earlier, unseen part of the file — confirm before running.
df = encode_and_bind(df, 'CodeSonar Rule')
df = encode_and_bind(df, 'Severity')
df = encode_and_bind(df, 'CWE')
# keep only rows whose 'True Positive' label is a finite number
df = df[np.isfinite(df['True Positive'])]
X = df.drop('True Positive', axis=1)
y = df.loc[:, 'True Positive']
#parameters for models defined here, simply change init_labels, trn_tst_split, splits to change experiment
stop = 300  #stopping value for number of queries
init_labels = 0.005  #initially labelled portion of the dataset
trn_tst_split = 0.2  #train test split to use for each fold
splits = 5  #number of k folds

# uncertainty-sampling experiment with an L1-regularised logistic regression
al_unc = AlExperiment(X,
                      y,
                      model=LogisticRegression(penalty='l1',
                                               solver='liblinear'),
                      performance_metric='accuracy_score',
                      stopping_criteria='num_of_queries',
                      stopping_value=stop)
al_unc.split_AL(test_ratio=trn_tst_split,
                initial_label_rate=init_labels,
                split_count=splits,
                all_class=True)
al_unc.set_query_strategy(strategy='QueryInstanceUncertainty')
al_unc.set_performance_metric(performance_metric='accuracy_score')
al_unc.start_query(multi_thread=False)
# print(al.get_experiment_result())
# al.plot_learning_curve()
analyser = ExperimentAnalyser(x_axis='num_of_queries')
analyser.add_method('uncertainty', al_unc.get_experiment_result())
# NOTE(review): the statement below is truncated in this extract — the
# call's remaining arguments are missing.
al_qbc = AlExperiment(X,
# NOTE(review): `temp1`, `temp2`, `pd`, `tfidfvec` and `scaler` are defined
# in an unseen part of the file — confirm before running.
data_cumh = pd.concat([temp2, temp1])

X, y = data_cumh['Review'], data_cumh['Rating (Star)']

print(y)
# TF-IDF features: top 5000 uni/bi-grams occurring in at least 10 documents
vectorizer = tfidfvec(max_features=5000, min_df=10, ngram_range=(1, 2))
vectorizer.fit(X)
X_Vect = vectorizer.transform(X)
scaler.fit(X_Vect)
X_Vect = scaler.transform(X_Vect)
print("vectorized")

# active-learning experiment: stop after 250 single-instance queries
al = AlExperiment(X_Vect,
                  y,
                  stopping_criteria='num_of_queries',
                  stopping_value=250,
                  batch_size=1)

print(data_cumh.shape)

print("constructed")
# split the data by using split_AL()
from alipy.data_manipulate import split

# seed the labelled pool with exactly 20 examples
x = 20 / X.shape[0]
print(x)
# NOTE(review): the split() call below is truncated in this extract; the
# indented lines and the stray ''' after it belong to a different,
# partially captured snippet.
train, test, lab, unlab = split(X=X,
                                y=y,
                                test_ratio=0.3,
                                initial_label_rate=x,
        print(uncertain_samples)
        return uncertain_samples
'''   

data_cumh = pd.read_pickle("./cumh_prep_tech_spor.pkl")
print(data_cumh)
data_cumh = data_cumh.sample(frac=1)
print(data_cumh)
X, y = data_cumh['text'], data_cumh['Category']
print(y)
vectorizer=tfidfvec(max_features=5000,min_df=10,ngram_range=(1, 2))
vectorizer.fit(X)
X_Vect = vectorizer.transform(X)
print("vectorized")

al = AlExperiment(X_Vect, y,model=LogisticRegression(), stopping_criteria='num_of_queries', stopping_value=100, batch_size=1)

print(data_cumh.shape)

print("constructed")
# split the data by using split_AL()
from alipy.data_manipulate import split
x=20/X.shape[0]
print(x)
train, test, lab, unlab = split(X=X, y=y,test_ratio=0.3, initial_label_rate=x, split_count=1)


al.set_data_split(train_idx=train, test_idx=test, label_idx=lab, unlabel_idx=unlab)
al.set_query_strategy(strategy='QueryInstanceUncertainty', measure='least_confident')
al.set_performance_metric('accuracy_score')
from sklearn.feature_extraction.text import TfidfVectorizer as tfidfvec

# Load the pickled posts and their (separately stored) class labels.
phy_data = pd.read_pickle("./phy_2go.pkl")
classes = np.load("./phy_2go_class.npy")

X, y = phy_data['Body'], classes

# Unigram TF-IDF features: 5000 most frequent terms seen in >= 10 documents.
vec = tfidfvec(max_features=5000, min_df=10, ngram_range=(1, 1))
vec.fit(X)
body_features = vec.transform(X)
print("vectorized")

# Active learner: Crammer-Singer linear SVM, five instances per query,
# stopping after 100 queries.
al = AlExperiment(body_features,
                  y,
                  model=LinearSVC(multi_class='crammer_singer'),
                  stopping_criteria='num_of_queries',
                  stopping_value=100,
                  batch_size=5)

print(classes)

print("constructed")
# split the data by using split_AL()
from alipy.data_manipulate import split

# Start with exactly 50 labelled examples.
init_rate = 50 / X.shape[0]
print(init_rate)
train, test, lab, unlab = split(X=X,
                                y=y,
                                test_ratio=0.3,
                                initial_label_rate=init_rate,
                                split_count=1)
# --- Exemplo n.º 6 (code-search snippet boundary marker) ---
# NOTE(review): relies on `pd`, `np`, `encode_and_bind`,
# `RandomForestClassifier` and `ExperimentAnalyser` from an unseen part of
# the file — confirm before running.
df = pd.read_excel('Juliet_Test_Suite/combined_data_table.xlsx')
df = encode_and_bind(df, 'Clang Rule')
df = encode_and_bind(df, 'CodeSonar Rule')
df = encode_and_bind(df, 'Severity')
df = encode_and_bind(df, 'CWE')
# keep only rows whose 'True Positive' label is a finite number
df = df[np.isfinite(df['True Positive'])]
X = df.drop('True Positive', axis=1)
y = df.loc[:, 'True Positive']
#parameters for models defined here, simply change init_labels, trn_tst_split, splits to change experiment
stop = 300  #stopping value for number of queries
init_labels = 0.005  #initially labelled portion of the dataset
trn_tst_split = 0.2  #train test split to use for each fold
splits = 5  #number of k folds
# uncertainty sampling with a 100-tree random forest
al_unc = AlExperiment(X,
                      y,
                      model=RandomForestClassifier(n_estimators=100),
                      performance_metric='accuracy_score',
                      stopping_criteria='num_of_queries',
                      stopping_value=stop)
al_unc.split_AL(test_ratio=trn_tst_split,
                initial_label_rate=init_labels,
                split_count=splits,
                all_class=True)
al_unc.set_query_strategy(strategy='QueryInstanceUncertainty')
al_unc.set_performance_metric(performance_metric='accuracy_score')
al_unc.start_query(multi_thread=False)
# print(al.get_experiment_result())
# al.plot_learning_curve()
analyser = ExperimentAnalyser(x_axis='num_of_queries')
analyser.add_method('uncertainty', al_unc.get_experiment_result())
# NOTE(review): the statement below is truncated in this extract — the
# call's remaining arguments are missing.
al_qbc = AlExperiment(X,
                      y,
# --- Exemplo n.º 7 (code-search snippet boundary marker; the AlExperiment
# call above is truncated at this boundary) ---
from sklearn.datasets import load_iris
from alipy.experiment import AlExperiment

# Load the iris dataset.
X, y = load_iris(return_X_y=True)

# Build the active-learning experiment wrapper.
al = AlExperiment(X, y)

# Option 1: let alipy create the data splits itself
# (10 folds, 30% held out for testing, 5% initially labelled).
al.split_AL(test_ratio=0.3, initial_label_rate=0.05, split_count=10)

# Option 2: compute the split indices explicitly and hand them over.
from alipy.data_manipulate import split

tr_idx, te_idx, lb_idx, ul_idx = split(X=X,
                                       y=y,
                                       test_ratio=0.3,
                                       initial_label_rate=0.05,
                                       split_count=10)
al.set_data_split(train_idx=tr_idx,
                  test_idx=te_idx,
                  label_idx=lb_idx,
                  unlabel_idx=ul_idx)

# Choose one of alipy's pre-defined query strategies...
al.set_query_strategy(strategy="QueryInstanceUncertainty")

# ...or plug in your own strategy class, e.g.:
# class my_qs_class: