Example #1

import sys
import warnings

import pandas as pd
from sklearn.model_selection import train_test_split

# summarize_data and fit_regression_models are assumed to come from this
# project's local utils module (Example #6 imports from utils the same way)
from utils import fit_regression_models, summarize_data

#Disabling Warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")

df = pd.read_csv('data/housing/Boston.csv', header=None)

#dropping index column
df.drop(0, axis=1, inplace=True)

df.columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT', 'MEDV'
]

#Sample of the data
print(df.head())

summarize_data(df, ['CHAS', 'RAD'], [
    'CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'B',
    'LSTAT'
], 'MEDV')

#splitting data into test and train
train, test = train_test_split(df, test_size=0.25, random_state=7)

train_X = train[train.columns[:-1]]
train_y = train['MEDV']
test_X = test[train.columns[:-1]]
test_y = test['MEDV']

#fitting regression models
fit_regression_models(train_X, train_y, test_X, test_y,
                      'scikit_learn_pkg/metrics/housing')
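
The summarize_data helper used throughout these examples is never shown on
this page; it appears to come from the project's local utils module. Given
how it is called (four positional arguments for regression, a fifth
'classification' flag that makes it return the class labels), a minimal
sketch of what such a function might do follows; the body is an assumption,
not the project's actual implementation:

def summarize_data(df, categorical_cols, numeric_cols, target,
                   task='regression'):
    #Hypothetical stand-in for the utils helper used on this page
    for col in categorical_cols:
        #category frequencies
        print(df[col].value_counts())
    for col in numeric_cols:
        #numeric distribution, plus correlation with a numeric target
        print(df[col].describe())
        if task == 'regression':
            print('corr(%s, %s): %.3f'
                  % (col, target, df[col].corr(df[target])))
    if task == 'classification':
        classes = sorted(df[target].dropna().unique())
        print('classes:', classes)
        return classes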
Example #2

import sys
import warnings

import numpy as np
import pandas as pd

# summarize_data is assumed to come from the project's local utils module
from utils import summarize_data
#Disabling Warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")

df = pd.read_csv('data/telco/WA_Fn-UseC_-Telco-Customer-Churn.csv')

#Converting text column to float
df.loc[df['TotalCharges'] == ' ', 'TotalCharges'] = np.nan
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])
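#Note: pd.to_numeric(df['TotalCharges'], errors='coerce') would fold both
#steps into one, coercing the blank strings straight to NaN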

print(df.head())

classes = summarize_data(df, [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod'
], ['tenure', 'MonthlyCharges', 'TotalCharges'], 'Churn', 'classification')

#Converting text to integers in columns
df['Partner'] = df['Partner'].map({'Yes': 1, 'No': 0})
df['PhoneService'] = df['PhoneService'].map({'Yes': 1, 'No': 0})
df['Dependents'] = df['Dependents'].map({'Yes': 1, 'No': 0})
df['MultipleLines'] = df['MultipleLines'].map({
    'Yes': 1,
    'No': 0,
    'No phone service': -1
})
df['OnlineSecurity'] = df['OnlineSecurity'].map({
    'Yes': 1,
    'No': 0,
    'No internet service': -1
})
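
The original goes on to repeat this .map call for each remaining service
column. A loop keeps that less error-prone; this sketch assumes the remaining
internet-service columns use the same three values as OnlineSecurity:

#Hypothetical consolidation of the repeated .map calls
service_cols = ['OnlineBackup', 'DeviceProtection', 'TechSupport',
                'StreamingTV', 'StreamingMovies']
mapping = {'Yes': 1, 'No': 0, 'No internet service': -1}
for col in service_cols:
    df[col] = df[col].map(mapping)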
Example #3

import pandas as pd
from sklearn.model_selection import train_test_split

# summarize_data is assumed to come from the project's local utils module
from utils import summarize_data
#Two different datasets for red wine and white wine
red_df = pd.read_csv('data/wine/winequality-red.csv', sep=';')
white_df = pd.read_csv('data/wine/winequality-white.csv', sep=';')

red_df['color'] = 'red'
white_df['color'] = 'white'

#combining red wine data and white wine data
df = pd.concat([red_df, white_df], ignore_index=True)

print(df.head())

summarize_data(df, ['color'], [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol'
], 'quality')

#one hot encoding
df = pd.concat([df, pd.get_dummies(df['color'])], axis=1)
df.drop('color', axis=1, inplace=True)

#predictor variables
independent_variables = list(df.columns)

#quality is the predicted variable
independent_variables.remove('quality')

#splitting data into test and train
train, test = train_test_split(df, test_size=0.25, random_state=7)
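
The listing stops right after the split, leaving independent_variables
unused; presumably the tail mirrors Example #1:

#Hypothetical continuation, following the pattern of Example #1
train_X = train[independent_variables]
train_y = train['quality']
test_X = test[independent_variables]
test_y = test['quality']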
Example #4

import numpy as np
import torch

# Local project modules: DataLoader is the repo's own batching class (not
# torch.utils.data.DataLoader), and its import path here is a guess
import utils
from data_loader import DataLoader


def process_data(data, opt):
    label_adj_matrix = None
    # An earlier variant (left commented out in the source) built this as a
    # hierarchy mask via utils.get_pairwise_adj / utils.get_pairwise_adj_rcv1
    # when opt.adj_matrix_lambda > 0
    if opt.label_mask == 'prior':
        print('using prior mask')
        # (a correlation-based construction of this prior, built with
        #  np.corrcoef over the label indicator matrix, was left commented
        #  out in the source; the co-occurrence version below is used)

        # Symmetric label co-occurrence matrix over the training targets.
        # Label ids are offset by the 4 special tokens; positions [1:-1] of
        # each sample skip what are likely the BOS/EOS markers.
        adj_matrix = torch.eye(len(data['dict']['tgt']) - 4)
        for sample in data['train']['tgt']:
            for i, idx1 in enumerate(sample[1:-1]):
                for idx2 in sample[i + 1:-1]:
                    if idx1 != idx2:
                        adj_matrix[idx1 - 4, idx2 - 4] = 1
                        adj_matrix[idx2 - 4, idx1 - 4] = 1

        label_adj_matrix = adj_matrix

    # Binary label-indicator matrix: one row per training sample, one column
    # per entry of the target vocabulary
    label_vals = torch.zeros(len(data['train']['tgt']), len(data['dict']['tgt']))
    for i in range(len(data['train']['tgt'])):
        indices = torch.from_numpy(np.array(data['train']['tgt'][i]))
        x = torch.zeros(len(data['dict']['tgt']))
        x.index_fill_(0, indices, 1)
        label_vals[i] = x

    # Rank labels by global frequency, then pin the special token ids:
    # 2 stays first, 0, 1 and 3 go last
    values, ranking = torch.sort(label_vals.sum(0), dim=0, descending=True)
    ranking_values = values[2:-2] / values[2:-2].sum()  # normalized frequencies (unused below)
    ranking = ranking.numpy().tolist()
    ranking = ranking[2:-2]
    ranking.insert(0, 2)
    ranking += [0, 1, 3]

    # Reorder every target sequence (in place) by global label frequency;
    # rebinding the loop variable, as the original did, would have no effect
    for split in ('train', 'valid', 'test'):
        for sample in data[split]['tgt']:
            sample.sort(key=ranking.index)

    opt.max_token_seq_len_e = data['settings'].max_seq_len
    opt.max_token_seq_len_d = opt.max_ar_length
    
    if opt.summarize_data:
        utils.summarize_data(data)

    if 'sider' not in opt.dataset:
        data['train']['adj'],data['valid']['adj'],data['test']['adj'] = None,None,None

    #========= Preparing DataLoader =========#
    train_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['train']['src'],
        adj_insts=data['train']['adj'],
        tgt_insts=data['train']['tgt'],
        batch_size=opt.batch_size,
        binary_relevance=opt.binary_relevance,
        cuda=opt.cuda,
        shuffle=True,
        drop_last=True)

    valid_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'], 
        src_insts=data['valid']['src'],
        adj_insts=data['valid']['adj'],
        tgt_insts=data['valid']['tgt'],
        batch_size=opt.test_batch_size,
        binary_relevance=opt.binary_relevance,
        shuffle=False,
        cuda=opt.cuda)

    test_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'], 
        src_insts=data['test']['src'],
        adj_insts=data['test']['adj'],
        tgt_insts=data['test']['tgt'],
        batch_size=opt.test_batch_size,
        binary_relevance=opt.binary_relevance,
        shuffle=False,
        cuda=opt.cuda)

    opt.src_vocab_size = train_data.src_vocab_size
    opt.tgt_vocab_size = train_data.tgt_vocab_size

    if opt.binary_relevance:
        opt.tgt_vocab_size = opt.tgt_vocab_size - 4
        opt.max_ar_length = opt.tgt_vocab_size

    return train_data, valid_data, test_data, label_adj_matrix, opt
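
How process_data is invoked is not shown on this page. A minimal
hypothetical driver, with opt fields matching the attributes the function
reads and a guessed path for the preprocessed data file:

import argparse
import torch

opt = argparse.Namespace(
    label_mask='prior', dataset='data/rcv1', batch_size=32,
    test_batch_size=32, max_ar_length=30, binary_relevance=True,
    cuda=False, summarize_data=False)

data = torch.load('data/rcv1/train_valid_test.pt')  #path is a guess
train_data, valid_data, test_data, label_adj_matrix, opt = \
    process_data(data, opt)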
Example #5

import pandas as pd
from sklearn.model_selection import train_test_split

# summarize_data is assumed to come from the project's local utils module
from utils import summarize_data

#df is loaded earlier in the original script; the listing starts mid-file
#Per-column overview: dtype, missing-value count, and a summary
for col in df.columns:
    print('col: %s' % col)
    col_dtype = df[col].dtype
    print('dtype: %s' % col_dtype)
    print('na values: %d' % df[col].isnull().sum())
    if col_dtype == int or col_dtype == float:
        print(df[col].describe())
    else:
        print(df[col].value_counts())
    print('\n')

print(df.head())

classes = summarize_data(
    df, ['Dead_Alive', 'Gear', 'Entangled'],
    ['SCL_notch', 'SCL_tip', 'SCW', 'CCL_notch', 'TestLevel_Before'],
    'Species', 'classification')

#Converting text to integers in columns
df['Dead_Alive'] = df['Dead_Alive'].map({'alive': 1, 'dead': 0})
df['Entangled'] = df['Entangled'].map({'free': 1, 'entangled': 0})

#one hot encoding
df = pd.concat([df, pd.get_dummies(df['Gear'])], axis=1)
df.drop('Gear', axis=1, inplace=True)

#splitting data into test and train
train, test = train_test_split(df, test_size=0.25, random_state=7)

train_X = train[train.columns[1:]]
train_y = train['Species']
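
The snippet is cut off after the training split; the rest presumably mirrors
Example #1, ending in the classification counterpart imported in Example #6
(the metrics path below is a placeholder):

#Hypothetical tail, modeled on Example #1's regression call
test_X = test[test.columns[1:]]
test_y = test['Species']

fit_classification_models(train_X, train_y, test_X, test_y,
                          'scikit_learn_pkg/metrics/turtles')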
Example #6

import sys
import warnings

import pandas as pd
from sklearn.model_selection import train_test_split

# summarize_data is assumed to live in the same local utils module
from utils import fit_classification_models, summarize_data

#Disabling Warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")

iris_df = pd.read_csv('data/iris/iris.csv', header=None)

iris_df.columns = [
    'sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'
]

print(iris_df.head())

classes = summarize_data(
    iris_df, [],
    ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], 'species',
    'classification')

#splitting data into test and train
iris_train, iris_test = train_test_split(iris_df,
                                         test_size=0.25,
                                         random_state=7)

iris_train_X = iris_train[[
    'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
]]
iris_train_y = iris_train['species']
iris_test_X = iris_test[[
    'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
]]
iris_test_y = iris_test['species']
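
The fit_classification_models import in this example goes unused in the
visible portion; the truncated tail presumably calls it on the splits, along
the lines of Example #1 (the metrics path is a placeholder):

#Hypothetical tail, mirroring Example #1
fit_classification_models(iris_train_X, iris_train_y, iris_test_X,
                          iris_test_y, 'scikit_learn_pkg/metrics/iris')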
Example #7

import sys
import warnings

import pandas as pd
from sklearn.model_selection import train_test_split

# summarize_data is assumed to come from the project's local utils module
from utils import summarize_data
if not sys.warnoptions:
    warnings.simplefilter("ignore")

#The UCI auto-mpg data marks missing horsepower values with '?'; passing
#na_values='?' turns them into NaN so the fillna below has something to fill
df = pd.read_csv('data/auto/auto-mpg.csv', header=None, na_values='?')

df.columns = [
    'mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
    'model_year', 'origin', 'car_name'
]

#filling missing values with mean
df['horsepower'].fillna(value=df['horsepower'].mean(), inplace=True)

print(df.head())

summarize_data(df, ['cylinders', 'model_year', 'origin'],
               ['displacement', 'horsepower', 'weight', 'acceleration'], 'mpg')

#dropping car name column
df.drop('car_name', axis=1, inplace=True)

#one hot encoding
df = pd.concat([
    df,
    pd.get_dummies(df['cylinders']),
    pd.get_dummies(df['origin']),
    pd.get_dummies(df['model_year'])
],
               axis=1)
df.drop(['cylinders', 'origin', 'model_year'], axis=1, inplace=True)

#splitting data into test and train
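
The listing ends at the split comment; the continuation almost certainly
matches the identical step in the other examples:

#Hypothetical completion, identical to the split used in Examples #1 and #3
train, test = train_test_split(df, test_size=0.25, random_state=7)

train_X = train[train.columns[1:]]
train_y = train['mpg']
test_X = test[train.columns[1:]]
test_y = test['mpg']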