示例#1
0
def random_forest():
    """Train a random forest on the iris data set and cross-tabulate
    predictions against actual species on a random hold-out split.

    Returns:
        pandas.DataFrame: crosstab with rows = actual species and
        columns = predicted species.
    """
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    # Flag roughly 75% of the rows for training, the rest for testing.
    df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75
    # pd.Factor was removed from pandas long ago; Categorical.from_codes
    # maps the integer targets onto their species names.
    df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

    # Boolean masks directly; comparing against True/False is redundant.
    train, test = df[df['is_train']], df[~df['is_train']]

    features = df.columns[:4]
    clf = RandomForestClassifier(n_jobs=2)
    # factorize returns (integer codes, unique values); fit on the codes.
    y, _ = pd.factorize(train['species'])
    clf.fit(train[features], y)

    preds = iris.target_names[clf.predict(test[features])]
    # The original computed this crosstab and discarded it; return it so
    # callers can inspect the confusion matrix.
    return pd.crosstab(test['species'], preds,
                       rownames=['actual'], colnames=['preds'])
示例#2
0
# __author__ = 'cjweffort'
# -*- coding: utf-8 -*-
"""Random-forest demo on the iris data set: train on a random ~75%
split and cross-tabulate predictions against the actual species."""

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Flag roughly 75% of the rows for training, the rest for testing.
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75
# pd.Factor was removed from pandas long ago; Categorical.from_codes
# maps the integer targets onto their species names.
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

# Boolean masks directly; comparing against True/False is redundant.
train, test = df[df['is_train']], df[~df['is_train']]

features = df.columns[:4]
clf = RandomForestClassifier(n_jobs=2)
# factorize returns (integer codes, unique values); fit on the codes.
y, _ = pd.factorize(train['species'])
clf.fit(train[features], y)

preds = iris.target_names[clf.predict(test[features])]
# The original built this crosstab and silently discarded it; print it so
# the script actually shows the confusion matrix.
print(pd.crosstab(test['species'], preds, rownames=['actual'], colnames=['preds']))
示例#3
0
"""pandasql demo: query pandas DataFrames with SQL using the iris data."""
from sklearn.datasets import load_iris
import pandas as pd
from pandasql import sqldf
from pandasql import load_meat, load_births
import re

births = load_births()
meat = load_meat()
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
# pd.Factor was removed from pandas long ago; Categorical.from_codes
# maps the integer targets onto their species names.
iris_df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
# Strip parentheses and spaces so the column names are valid SQL identifiers.
iris_df.columns = [re.sub("[() ]", "", col) for col in iris_df.columns]

# print is a function in Python 3 (the original used Python 2 statements).
print(sqldf("select * from iris_df limit 10;", locals()))
print(sqldf("select sepalwidthcm, species from iris_df limit 10;", locals()))

q = """
      select
        species
        , avg(sepalwidthcm)
        , min(sepalwidthcm)
        , max(sepalwidthcm)
      from
        iris_df
      group by
        species;
        
"""
print("*" * 80)
print("aggregation")
print("-" * 80)
# The aggregation query was built but never executed in the original.
print(sqldf(q, locals()))
示例#4
0
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier


def _min_max(col):
    """Min-max scale a Series linearly into [0, 1]."""
    return (col - col.min()) / (col.max() - col.min())


data = pd.read_excel("xiechengscore.xlsx")
# NOTE(review): the original re-wrapped `data` with columns=data.sheet_names
# and built a 'species' column from data.target / data.target_names — none of
# those attributes exist on a DataFrame (sheet_names belongs to ExcelFile),
# so both lines raised AttributeError at runtime and were dropped.
# TODO: confirm where the label column for this data set should come from.
data['is_train'] = np.random.uniform(0, 1, len(data)) <= .75

# Min-max normalize the score columns (hotel class, lowest price, comment
# count, user-recommended ratio, health score, surroundings score); the
# original repeated the same formula once per column.
for _col in ('hotelClass', 'hotelLowestprice', 'hotelComment',
             'userRecommended', 'healthScore', 'surroundingsScore'):
    data[_col] = _min_max(data[_col])
# NOTE(review): the original chunk cuts off at a 服务评分 (service score)
# comment; any further columns are not visible here — confirm against the
# full file.
示例#5
0
import matplotlib.pyplot as plt
import scipy.stats as stats

from sklearn import ensemble
from credit_card_data import read_data
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error, confusion_matrix, classification_report

df = read_data()

# Integer-encode the state names. The original built a pd.Factor (removed
# from pandas long ago, and `pd` was never imported in this snippet) just to
# call levels.get_loc per row; a name -> code dict over the sorted unique
# values yields the same codes with a single vectorized map.
state_names = sorted(df.state1.unique())
state_codes = {name: code for code, name in enumerate(state_names)}
df["state_factor"] = df.state1.map(state_codes)
df = df.drop("state1", axis=1)

# Same integer encoding for the domain names.
domain_names = sorted(df.domain1.unique())
domain_codes = {name: code for code, name in enumerate(domain_names)}
df["domain_factor"] = df.domain1.map(domain_codes)
df = df.drop("domain1", axis=1)
# NOTE(review): `bigdata`, `trainedData`, `svm` and `pandas` are not defined
# in this fragment — presumably earlier in the original file; confirm.
untrained = bigdata[bigdata['meta'] >= 15]
print('trained data')
print(trainedData[:5])
# Extract the two feature columns and convert to a numpy array.
# .ix and .as_matrix were removed from pandas; .loc / .to_numpy replace them.
features = trainedData.loc[:, ['ratio', 'area']].to_numpy()
test_features = untrained.loc[:, ['ratio', 'area']].to_numpy()
print('features')
print(features[:5])
print('features shape', features.shape)
print('features type', type(features))
# The label is a string: single, touching, nuclei, dust.
print('labels convertion')
lab1 = trainedData['type']
print('lab1', type(lab1))
# pandas.Factor was removed; factorize(sort=True) returns (codes, uniques)
# with codes assigned over sorted uniques, matching Factor's sorted levels.
labels, label_levels = pandas.factorize(lab1, sort=True)
print('labels', labels[:5])
print('labels type', type(labels))
print('labels shape', labels.shape)
#
# Classify with sklearn.
classifier = svm.SVC()
model = classifier.fit(features, labels)
predicted = classifier.predict(test_features)

# Match predicted codes against the hidden labels of the untrained rows.
# sort=True keeps the code assignment consistent with the training labels
# (two independent default factorizations could number categories by
# order of appearance and disagree).
hiddenlab1 = untrained['type']
hidden_codes, _ = pandas.factorize(hiddenlab1, sort=True)
match = (predicted == hidden_codes)
print("prediction")