def load_swbd_data(data_dir=os.getcwd() + "/data/switchboard"):
    # Requires `import os` and `import numpy as np` at module level, plus a
    # `clean_str` text-normalization helper defined elsewhere in the project.
    from statsmodels.tools import categorical
    prev_dir = os.getcwd()
    print(data_dir)
    os.chdir(data_dir)
    x_text = list(open(data_dir + "/swbd_utterance.csv", "r").readlines())
    x_text = [s.strip() for s in x_text]
    x_text = [clean_str(sent) for sent in x_text]
    y = list(open(data_dir + "/swbd_act.csv", "r").readlines())
    a = np.array([s.strip() for s in y])
    y = categorical(a, drop=True)  # one indicator column per dialogue act
    #y = y.argmax(1)
    #y = y.argmax(1)
    '''
    from statsmodels.tools import categorical

    In [61]: a = np.array( ['a', 'b', 'c', 'a', 'b', 'c'])
    
    In [62]: b = categorical(a, drop=True)
    
    In [63]: b.argmax(1)
    Out[63]: array([0, 1, 2, 0, 1, 2])
    '''
    os.chdir(prev_dir)  # restore the working directory saved above
    return [x_text, y]
Example #2
File: views.py  Project: valency/joker-api
def kmeans(request):
    if "header" in request.GET and "n_clusters" in request.GET and "set_id" in request.GET:
        # weight = [float(w) for w in request.GET["weight"].split(",")]
        if "metric" in request.GET:
            metric = request.GET["metric"]
        else:
            metric = "cosine"
        header = request.GET["header"].split(",")
        n_clusters = int(request.GET["n_clusters"])
        cust_set = CustomerSet.objects.filter(id=request.GET["set_id"])
        cust_matrix = numpy.array([])
        dbpk_list = numpy.array([entity.cust.dbpk for entity in cust_set])
        for h in header:
            # Choose header
            cust_column = numpy.array([getattr(entity.cust, h) for entity in cust_set])
            if h in CATEGORICAL_COLUMNS:
                cust_column = categorical(cust_column, drop=True)
            # Stack to matrix
            if cust_matrix.size == 0:
                cust_matrix = cust_column
            else:
                cust_matrix = numpy.column_stack((cust_matrix, cust_column))
        # Normalize
        cust_matrix = scale_linear_by_column(cust_matrix)
        # Weight
        # cust_matrix = numpy.nan_to_num(numpy.multiply(cust_matrix, numpy.array([numpy.array(weight)] * cust_set.count())))
        # Clustering
        kmeans_centres, kmeans_xtoc, kmeans_dist = joker_kmeans.kmeans(cust_matrix, joker_kmeans.randomsample(cust_matrix, n_clusters), metric=metric)
        # Output
        result = []
        for i in range(0, len(dbpk_list)):
            # Update cust set configurations
            cust = Customer.objects.get(dbpk=dbpk_list[i])
            cust_set_entity = cust_set.get(cust=cust)
            cust_set_entity.cluster = kmeans_xtoc[i]
            cust_set_entity.cluster_time = datetime.now()
            cust_set_entity.cluster_features = ";".join(header)
            cust_set_entity.cluster_count = n_clusters
            cust_set_entity.cluster_metric = metric
            cust_set_entity.save()
            # Construct response
            entity = {
                "id": cust.id,
                "cluster": kmeans_xtoc[i]
            }
            for h in header:
                entity[h] = cust.__dict__[h]
            result.append(entity)
        return Response(result)
    else:
        return Response(status=status.HTTP_400_BAD_REQUEST)
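# A hypothetical request that would exercise the clustering branch above
# (host, route, and field values are illustrative, not from the project):
import requests

resp = requests.get(
    "http://localhost:8000/kmeans",
    params={
        "header": "age,income",   # assumed column names on the Customer model
        "n_clusters": "3",
        "set_id": "1",
        "metric": "euclidean",    # optional; the view falls back to "cosine"
    },
)
print(resp.json())  # one entry per customer with its assigned cluster id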
Example #3
def load():
    """
    Loads the Grunfeld data and returns a Dataset class.

    Returns
    -------
    Dataset instance:
        See DATASET_PROPOSAL.txt for more information.

    Notes
    -----
    raw_data has the firm variable expanded to dummy variables for each
    firm (ie., there is no reference dummy)
    """
    data = _get_data()
    raw_data = categorical(data, col='firm', drop=True)
    ds = du.process_recarray(data, endog_idx=0, stack=False)
    ds.raw_data = raw_data
    return ds
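# A rough sketch of what the col='firm' call above does, using a toy
# structured array (field names are illustrative; assumes an older
# statsmodels where tools.categorical still exists):
import numpy as np
from statsmodels.tools import categorical

toy = np.array([(1.0, 'GM'), (2.0, 'GE'), (3.0, 'GM')],
               dtype=[('invest', float), ('firm', 'U2')])
expanded = categorical(toy, col='firm', drop=True)
# 'firm' is replaced by one dummy field per distinct firm; with drop=True the
# original string field is removed and no reference category is held out.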
Example #5
File: data_util.py  Project: ggaemo/vae
def load_kdd(seed, anomaly_type):
    # Assumes pandas/numpy, sklearn's normalize and train_test_split, and
    # statsmodels' categorical are imported at module level.
    data = pd.read_csv(os.path.join('data', 'kddcup.data.corrected'), header=None)

    for i in [1,2,3]:
        a = np.asarray(data.iloc[:,i])
        b = categorical(a, drop=True)
        data.iloc[:,i] = b.argmax(1)

    data.iloc[:,:-1] = normalize(data.iloc[:,:-1], axis=0)
    data_normal = data.iloc[np.where(data.iloc[:,-1]=='normal.')[0],:-1]
    mask = np.random.choice(np.arange(data_normal.shape[0]), data_normal.shape[0], replace=False)  # shuffle rows without replacement
    data_normal = data_normal.iloc[mask,:]
    data_anomaly = data.iloc[np.where(data.iloc[:,-1]==anomaly_type)[0],:-1]

    x_train, x_test, t_train, t_test = train_test_split(data_normal, [0] * data_normal.shape[0], test_size=0.2, random_state=seed)
    x_test = np.vstack((x_test, data_anomaly))
    t_test = np.hstack((t_test, [1] * data_anomaly.shape[0]))
    print(data_normal.shape)
    print(data_anomaly.shape)
    return [x_train, t_train, x_test, t_test]
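# Usage sketch: the anomaly label must match the raw KDD Cup naming, which
# ends with a period (the specific label chosen here is an assumption):
x_train, t_train, x_test, t_test = load_kdd(seed=0, anomaly_type='neptune.')
# t_train is all zeros (normal); t_test mixes 0 (normal) and 1 (anomaly).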
Example #6
def load_pandas():
    """
    Loads the Grunfeld data and returns a Dataset class.

    Returns
    -------
    Dataset instance:
        See DATASET_PROPOSAL.txt for more information.

    Notes
    -----
    raw_data has the firm variable expanded to dummy variables for each
    firm (ie., there is no reference dummy)
    """
    from pandas import DataFrame
    from statsmodels.tools import categorical
    data = _get_data()
    raw_data = categorical(data, col='firm', drop=True)
    ds = du.process_recarray_pandas(data, endog_idx=0)
    ds.raw_data = DataFrame(raw_data)
    return ds
Example #8
# Imputation (sklp is assumed to alias sklearn.preprocessing; Imputer is the
# pre-0.20 API, replaced by sklearn.impute.SimpleImputer in newer releases)
imp = sklp.Imputer(missing_values=0, strategy='mean', axis=0)
Dataset = imp.fit_transform(Dataset)  # keep the imputed array, not just fit

# PCA
import sklearn.decomposition as skd
pca = skd.PCA(n_components=n, whiten=False)
pca.fit(Dataset)
Dataset_Reduced_Dim = pca.transform(Dataset)

# Train and Test
x_train, x_test, y_train, y_test = sklm.train_test_split(x,y,test_size = 0.2)


# Dummy encoding
from statsmodels.tools import categorical
cat_encod = categorical(data, dictnames=False, drop=False) #may need reshape(-1,1)

### plot ------------------------------------------------------------------
plt.plot(x,y)
plt.title('Training Error by Iteration')
plt.xlabel('Iteration Number')
plt.ylabel('Error')

# plot data in 3D plot
plt.figure()
import sklearn.model_selection as skms
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import RandomForestClassifier
#from sklearn.model_selection import cross_val_score
#from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import KFold  # pre-0.20 sklearn; newer releases use sklearn.model_selection.KFold
import warnings
warnings.filterwarnings('ignore')
# ---------------------- Pre-Processing of Time Series with Pandas
dframe = pd.read_csv('Q1Data.csv',
                     header=None)  # header=None because the file has no column names
dframe.info()
numdframe = dframe.iloc[:, 1:]
catdframe = dframe.iloc[:, 0]
catdf_encod = categorical(catdframe.values, dictnames=False, drop=True)
numArr = np.asarray(numdframe.values)
catArr = np.asarray(catdf_encod)
Output = numArr[:, 5]
Inp_num = numArr[:, 0:5]
Input = np.c_[catArr, Inp_num]  # dummy-encoded category first, then numeric columns
print(Input.shape)

####Q1 (b)########

imp = skp.Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
Input_new = imp.fit_transform(Input)

####Q1 (c)######
Example #10
         if e[2] != 'non-event':
             labels.append(e[2])
             features.append(
                 featurize(
                     np.array(data[[
                         'ax', 'ay', 'az', 'g1', 'g2', 'g3', 'gx', 'gy',
                         'gz', 'r1', 'r2', 'r3'
                     ]]), e, templates))
             raw_data.append(
                 np.array(data[['ax', 'ay', 'az']].iloc[e[0]:e[1], ]))
         else:
             pass
 # Step 4: Classify Features
 features = np.array(features)
 labels = np.array(labels)
 labels_c = categorical(np.array(labels), drop=True)
 labels_num = np.argmax(labels_c, axis=1)
 features_complete = features[~np.isnan(features).any(axis=1), :]
 labels_complete = labels[~np.isnan(features).any(axis=1)]
 f_train, f_test, l_train, l_test = train_test_split(features_complete,
                                                     labels_complete,
                                                     test_size=0.2,
                                                     random_state=42)
 clf = RandomForestClassifier(n_estimators=250,
                              max_features=.33,
                              oob_score=True)
 clf = clf.fit(f_train, l_train)
 importances = clf.feature_importances_
 std = np.std([tree.feature_importances_ for tree in clf.estimators_],
              axis=0)
 y_predicted = clf.predict(f_test)
"""

import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from statsmodels.tools import categorical
from sklearn import tree
# ------------------ Loading Dataset --------------------------#
dataframe = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data")
dataframe = dataframe .drop(dataframe .index[:-1000])
numericdf = dataframe[dataframe .columns[1:9]]
categordf = dataframe[dataframe .columns[0]]
categordf_en = categorical(categordf.values , drop=True)
categordf_en = categordf_en[:, 0:2]
numeric_arr = np.asarray(numericdf.values)
categor_arr = np.asarray(categordf_en)
Output = numeric_arr[:, 7]
Input_numeric = numeric_arr[:, 0:6]
Input_categor = categor_arr
Input = np.concatenate((Input_numeric, Input_categor), axis=1)
#---------------------------------------------------------------#
RF = RandomForestClassifier(n_estimators=5, random_state=12)
RF.fit(Input, Output)
Z_RF = RF.predict(Input)
CM_RF= confusion_matrix(Output, Z_RF)
#---------------------------------------------------------------
DT = tree.DecisionTreeClassifier()
DT.fit(Input, Output)
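# Evaluating the decision tree the same way as the random forest above
# (a sketch following the existing pattern; Z_DT and CM_DT are new names):
Z_DT = DT.predict(Input)
CM_DT = confusion_matrix(Output, Z_DT)
print(CM_DT)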
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Exercise – Week VIII
Data Programming With Python – Fall / 2017
Multi-class Classification - Data Pre-processing, SVM, NB, Decision Tree,
    Random Forest, Classification Metrics - Confusion Matrix
"""

### Dummy Coding / Encoding

from statsmodels.tools import categorical
import numpy as np
a = np.array(['Type1', 'Type2', 'Type3', 'Type1', 'Type2', 'Type3'])
cat_encod = categorical(a, dictnames=False, drop=True)
print(a.reshape(-1, 1))
print(cat_encod)
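# For reference, the dummy matrix printed above has one indicator column per
# level, ordered Type1, Type2, Type3 (formatting varies by numpy version):
# [[1. 0. 0.]
#  [0. 1. 0.]
#  [0. 0. 1.]
#  [1. 0. 0.]
#  [0. 1. 0.]
#  [0. 0. 1.]]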

### Support Vector

from sklearn import svm
from sklearn.model_selection import train_test_split
import numpy as np

X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([1, 1, 2, 2])
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=0)
svc_linear = svm.SVC(kernel='linear', C=1)
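# The snippet stops before training; a natural continuation (a sketch, not
# part of the original exercise):
svc_linear.fit(X_train, y_train)
print(svc_linear.predict(X_test))  # compare against y_test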
# Classification of data
# Qualitative Data
# Quantitative Data 

# Dummy encoding (similar to one-hot encoding)
from statsmodels.tools import categorical
import numpy as np
a = np.array(['Type1', 'Type2', 'Type3', 'Type1', 'Type2', 'Type3'])
cat_encod = categorical(a, dictnames=False, drop=True)
cat_encod2 = categorical(a, dictnames=False, drop=False)
print(a.reshape(-1,1))
print(cat_encod)   # with drop=True
print(cat_encod2)  # with drop=False (original variable kept in the output)
Example #14
File: cat.py  Project: Zerowxm/kdd-cup2009
# Assumes X and Y (100 samples each) are defined upstream, along with:
#   from sklearn.ensemble import GradientBoostingClassifier
#   from sklearn.metrics import roc_curve, auc
# Divide data into 80% training, 20% testing.
train_indices = list(range(40)) + list(range(50, 90))
test_indices = list(range(40, 50)) + list(range(90, 100))
X_train = X[train_indices]
X_test = X[test_indices]
y_train = Y[train_indices]
y_test = Y[test_indices]


###########################################################################
###### Convert categorical variable to matrix and merge back with training
###### data.

# Fake categorical variable.
catVar = np.array(['a']*40 + ['b']*40)
catVar = categorical(catVar, drop=True)
X_train = np.concatenate((X_train, catVar), axis = 1)

catVar = np.array(['a']*10 + ['b']*10)
catVar = categorical(catVar, drop=True)
X_test = np.concatenate((X_test, catVar), axis = 1)
###########################################################################

# Model and test.
clf = GradientBoostingClassifier(learning_rate=0.01,max_depth=8,n_estimators=50).fit(X_train, y_train)

prob = clf.predict_proba(X_test)[:,1]   # Only look at P(y==1).

fpr, tpr, thresholds = roc_curve(y_test, prob)
roc_auc_prob = auc(fpr, tpr)
Example #15
def one_hot_encode(labels):
    # Expand a 1-D label array into an indicator matrix with one column per
    # distinct label (uses statsmodels.tools.categorical, imported elsewhere).
    labels = categorical(labels, drop=True)
    return labels
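# Usage sketch (assumes `import numpy as np`; the labels are illustrative):
labels = np.array(['bird', 'cat', 'dog', 'cat'])
encoded = one_hot_encode(labels)
# encoded has shape (4, 3): one indicator column per distinct label,
# with columns ordered by sorted label value ('bird', 'cat', 'dog').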
Example #16
            temp.append(tube_specs[key][i])
tube_specs_unique=list(set(temp))
tube_specs_unique.sort()

#dataframe of features
n=df_train_set.shape[0]
#generate month and year columns
X_time=np.zeros((n,3))
for i in range(n):
    X_time[i,:]=[int(y) for y in df_train_set['quote_date'][i].split('-')]

cols_year = [str(x) for x in sorted(set(X_time[:, 0]))]   # sorted to match categorical's column order
cols_month = [str(x) for x in sorted(set(X_time[:, 1]))]

from statsmodels.tools import categorical
df_year=pd.DataFrame(categorical(X_time[:,0],drop=True),columns=cols_year)
df_month=pd.DataFrame(categorical(X_time[:,1],drop=True),columns=cols_month)

# pd.Categorical(...).labels is the old accessor; newer pandas calls it .codes
df_tube['end_a_1x'] = pd.Categorical(df_tube['end_a_1x']).codes
df_tube['end_a_2x'] = pd.Categorical(df_tube['end_a_2x']).codes
df_tube['end_x_1x'] = pd.Categorical(df_tube['end_x_1x']).codes
df_tube['end_x_2x'] = pd.Categorical(df_tube['end_x_2x']).codes

df_train_set=pd.merge(df_train_set, df_tube, on ='tube_assembly_id')

df_train_set.drop('material_id', axis=1, inplace=True)
df_train_set.drop('end_a', axis=1, inplace=True)
df_train_set.drop('end_x', axis=1, inplace=True)

df=pd.DataFrame()
#fill out with zeros, one column for each specs category
Example #17

    'GB180': [400, 2, 5],
    'GB1440': [400, 2, 5]
}

pars = par_dict[model + str(resolution)]

file_df = pd.read_csv(working_directory + '\\' + fitting_file, header=0)

X = file_df.loc[:, 'MX.5P_up':]
y = file_df['MX.5P_brkpt']

month_labels = [
    'mo-' + str(item) for item in list(sorted(file_df['Month'].unique()))
]
month_list = np.array(file_df['Month'].tolist())
month_cat = categorical(month_list, drop=True)

columns = list(X.columns)
columns.extend(month_labels)

X_cat = np.concatenate((X, month_cat), axis=1)
X = pd.DataFrame(X_cat, columns=columns)

###############################################################################
# prediction data
###############################################################################
user_df = pd.read_csv(working_directory + '\\' + user_file, header=0)

X_user = user_df.loc[:, :]

month_labels_user = [
Example #18
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.tools as stools

from scipy import stats

from mpl_toolkits.mplot3d import Axes3D

def to_num(data):
    # Pack a 0/1 indicator row into one integer by treating it as a bit field.
    return sum([data[b] << b for b in range(len(data))])

raw = pd.read_csv("C:\\Temp\\Random\\Crime_Map.csv")

count = 10000

longi = raw["Longitude"][:count]
lat = raw["Latitude"][:count]
encoded = np.array([to_num(x) for x in stools.categorical(np.array(raw["Offense Type"]),drop=True).astype(int)])[:count]

kde = stats.gaussian_kde(np.row_stack((longi,lat)))

plt.scatter(longi,lat,c=kde(np.row_stack((longi,lat))))
plt.show()
Example #19
## 2. Modeling ######################################################################################################
# Focus only on **TRANSFER** and **CASH_OUT** transactions (the only types
# where fraud occurs): slice the data and keep just those transaction types.
print("## Focus only on **TRANSFER** and **CASH_OUT** (where there are fraud)")
tmpData = raw_data.loc[(raw_data['type'].isin(['TRANSFER', 'CASH_OUT'])), :]

# Data slicing - Drop unnecessary data ('step', 'nameOrig', 'nameDest', 'isFlaggedFraud')
print(
    "## Data slicing - Drop unnecessary data ('step', 'nameOrig', 'nameDest', 'isFlaggedFraud')"
)
# tmpData.drop(['step', 'nameOrig', 'nameDest', 'isFlaggedFraud'], axis=1, inplace=True)
tmpData.drop(['nameOrig', 'nameDest', 'isFlaggedFraud'], axis=1, inplace=True)
tmpData = tmpData.reset_index(drop=True)
# Convert categorical variables to numeric variable
a = np.array(tmpData['type'])
b = categorical(a, drop=True)
tmpData['type_num'] = b.argmax(1)  # indicator matrix -> integer type codes
tmpData.drop(['type'], axis=1, inplace=True)

## Plot Correlations of TRANSFER and CASH_OUT transactions and selected features
# print("## Plot Correlations of TRANSFER and CASH_OUT transactions and selected features")
# plotCorrelationHeatmap(tmpData, "TRANSFER and CASH_OUT Correlation")
# plotCorrelationHeatmap(raw_data.loc[(raw_data.type == 'TRANSFER'), :], "TRANSFER Correlation")
# plotCorrelationHeatmap(raw_data.loc[(raw_data.type == 'CASH_OUT'), :], "CASH_OUT Correlation")

## Quickly get the count and the target variable count.
# print("## Plot Transaction count by type")
# ax = tmpData.type.value_counts().plot(kind='bar', title="Transaction count by type", figsize=(6,6))
# for p in ax.patches:
# 	ax.annotate(str(format(int(p.get_height()), ',d')), (p.get_x(), p.get_height()*1.01))
# plt.show()
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 18 03:43:20 2018

@author: Ashtami
"""

from statsmodels.tools import categorical
import numpy as np
a = np.array(['Type1', 'Type2', 'Type3', 'Type1', 'Type2', 'Type3'])
cat_encod = categorical(a, dictnames=False, drop=True)
print(a.reshape(-1, 1))
print(cat_encod)
import numpy as np
# from keras.utils import np_utils
from statsmodels.tools import categorical

nb_classes = 30

test_X = np.load('test_features.npy')
test_labels = np.load('test_labels.npy')
# print(test_X.shape)
# print(test_labels.shape)

training_X = np.load('train_features.npy')
training_labels = np.load('train_labels.npy')
# print(training_X.shape)
# print(training_labels.shape)

test_Y = categorical(test_labels, drop=True)
training_Y = categorical(training_labels, drop=True)
# print(test_Y.shape)
# print(training_Y.shape)
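# If integer class ids are needed instead of indicator rows (e.g., for a
# sparse categorical loss), the dummy matrix converts back with argmax,
# mirroring the b.argmax(1) pattern used in earlier snippets:
test_y_ids = test_Y.argmax(1)        # column index doubles as the class id
training_y_ids = training_Y.argmax(1)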