# Exemplo n.º 1
# 0
import pandas as pd
import numpy as np
import basic_func as func

# Load the training data via the project-local helper (basic_func):
# listing ids, feature names, a row-by-feature value matrix, and a DataFrame.
(listing_id, features, values, data) = func.load_unicef_data("train3.json")

df = data
# Split listings by interest level (2 = high, 1 = medium, 0 = low) and
# keep the full rows belonging to each group.
high = df.loc[df["interest_level"] == 2, ["listing_id"]]
df_high = df.loc[df["listing_id"].isin(high["listing_id"])]

medium = df.loc[df["interest_level"] == 1, ["listing_id"]]
df_medium = df.loc[df["listing_id"].isin(medium["listing_id"])]

low = df.loc[df["interest_level"] == 0, ["listing_id"]]
df_low = df.loc[df["listing_id"].isin(low["listing_id"])]

# Re-extract per-group ids and value matrices via the project helper.
(listing_high, values_high) = func.load_by_data(df_high)
(listing_medium, values_medium) = func.load_by_data(df_medium)
(listing_low, values_low) = func.load_by_data(df_low)
# proportion of target variable values
p = features.index('interest_level')
# get the counts of different levels
# NOTE(review): `high`/`medium`/`low` are rebound here from DataFrames to
# integer counters — confusing shadowing; distinct names would be clearer.
high = 0
medium = 0
low = 0
for value in values[:, p]:
    if value == 2:
        high = high + 1
    elif value == 1:
        medium = medium + 1
    elif value == 0:  # truncated: the `low` increment is not in this view
# Exemplo n.º 2
# 0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import basic_func as func

# Load the training data via the project helper: listing ids, feature
# names, a row-by-feature value matrix, and the raw DataFrame.
(listing_id, features, values, train_df) = func.load_unicef_data("train.json")

# record the number of missing values
# initialize the counts
counts = dict()
for feature in features:
    counts[feature] = 0
# print(type(values[1, p]) is int)
print(train_df.isnull().sum())
# Scan every cell and tally missing values per feature.
for p in range(0, len(features)):
    for i in range(values[:, p].shape[0]):
        # missing value of string types
        # NOTE(review): the column's type is inferred from row 1 only
        # (`values[1, p]`) — assumes columns are homogeneously typed; confirm.
        if type(values[1, p]) is str:
            # NOTE(review): the first comparison is redundant — "" also
            # satisfies the whitespace-stripped check on the right.
            if values[i, p] == "" or values[i, p].replace(" ", "") == "":
                # Normalize whitespace-only strings to the empty string.
                values[i, p] = ""
                counts[features[p]] = counts[features[p]] + 1
        # missing value of list value
        if type(values[1, p]) is list:
            # An empty list counts as missing.
            if not values[i, p]:
                counts[features[p]] = counts[features[p]] + 1
# count is missing value
print(counts)

# handle missing value in street_address and display_address
# Column indices of the two address fields; the actual handling runs
# past this view (snippet truncated here).
p = features.index("display_address")
q = features.index("street_address")
# Exemplo n.º 3
# 0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import basic_func as func
import time
import seaborn as sns
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from tkinter import _flatten

# Load the training data: ids, all feature names, value matrix, DataFrame.
(listing_id, features_all, values,
 train_df) = func.load_unicef_data('train.json')
# print(features)
# print(values)
ps = PorterStemmer()
# Column holding the per-listing list of feature tags.
p = features_all.index('features')
features = values[:, p]
features = list(features)
# new_words = list()
# Flatten the list-of-lists of tags into one flat list.
# NOTE(review): `tkinter._flatten` is a private Tk helper —
# itertools.chain.from_iterable is the supported equivalent.
new_features = list(_flatten(features))
# Lowercase and stem every tag so near-duplicate spellings collapse.
for i in range(len(new_features)):
    new_features[i] = new_features[i].lower()
    new_features[i] = ps.stem(new_features[i])

# Frequency of each stemmed tag.
# NOTE(review): top-level pd.value_counts is deprecated (removed in
# pandas 2.x) — use pd.Series(new_features).value_counts() instead.
results = pd.value_counts(new_features)
# Keep only tags occurring at least 50 times. The isin() dance is
# equivalent to `results = results.loc[results >= 50]` (every count < 50
# is a member of temp's values), just more convoluted.
temp = results.loc[results < 50]
results = results.loc[~results.isin(temp)]
feature_name = list(results.index)
# transform the features
new_values = list()
for feature in values[:, p]:  # truncated: loop body not in this view
# Exemplo n.º 4
# 0
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, \
    StackingClassifier
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import basic_func as func
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn import tree
from sklearn.preprocessing import StandardScaler
# import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier

# Load the engineered train / test splits via the project helper.
(listing_id_train, features_train, values_train,
 data_train1) = func.load_unicef_data("new_train.json")
(listing_id_train2, features_train2, values_train2,
 data_train2) = func.load_unicef_data('test.json')
(listing_id_test, features_test, values_test,
 data_test1) = func.load_unicef_data("new_test.json")

# get target --> target
# Extract the interest_level column as the classification target.
p = features_train.index('interest_level')
target = values_train[:, p]
# print(target)

# get train data --> values_train
# Drop the target column by concatenating the slices on either side of
# it (equivalent to np.delete(values_train, p, axis=1)).
values_train_temp1 = values_train[:, :p]
values_train_temp2 = values_train[:, p + 1:]
values_train = np.append(values_train_temp1, values_train_temp2, axis=1)
train = values_train
# Exemplo n.º 5
# 0
import numpy as np
import pandas as pd
import basic_func as func
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

# Load the train / test splits via the project helper.
# NOTE(review): 'trainxxx.json' looks like a placeholder filename — confirm.
(listing_id_train, features_train, values_train,
 data_train1) = func.load_unicef_data('trainxxx.json')
(listing_id_train2, features_train2, values_train2,
 data_train2) = func.load_unicef_data('test.json')
(listing_id_test, features_test, values_test,
 data_test1) = func.load_unicef_data('new_test.json')
# print all features

print(features_train)
# get train data

# Extract the interest_level column as the classification target.
p = features_train.index('interest_level')
target = values_train[:, p]
# values_train = np.delete(values_train, p, 1)
# features_train.remove('interest_level')
# # get target data
# p = features_train.index('distance')
# values_train = np.delete(values_train, p, 1)
#
# p = features_test.index('distance')
# values_test = np.delete(values_test, p, 1)

# NOTE(review): the target column is NOT removed here (the np.delete
# lines above are commented out), so `train` still contains
# interest_level — verify downstream code expects that.
train = values_train
# Exemplo n.º 6
# 0
import pandas as pd
from sklearn import tree
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
import graphviz
import basic_func as func
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Load train / test data via the project helper.
# NOTE(review): 'trainxxx.json' looks like a placeholder filename — confirm.
(listing_id, features, values, data) = func.load_unicef_data("trainxxx.json")
(listing_id2, features2, values2, data2) = func.load_unicef_data("test.json")
(listing_id_test1, features_test1, values_test1,
 data_test1) = func.load_unicef_data("new_test.json")
# (listing_id_test2, features_test2, values_test2, data_test2) = func.load_unicef_data("new_test2.json")
# Split into X (all columns except interest_level) and y (that column).
p = features.index('interest_level')
X = values[:, :p]
X = np.append(X, values[:, p + 1:], axis=1)
y = values[:, p]
# Map numeric labels to string names (0 = low, 1 = medium, 2 = high).
t = list()
row = y.shape[0]
print(row)
count1 = 0
count2 = 0
for i in range(row):
    if y[i] == 0:
        t.append('low')
    elif y[i] == 2:
        t.append('high')
    elif y[i] == 1:  # truncated: the 'medium' branch body is not in this view
# Exemplo n.º 7
# 0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import basic_func as func
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from tkinter import _flatten

# Load the final training set: ids, feature names, value matrix, DataFrame.
(listing_id, features_all, values,
 train_df) = func.load_unicef_data(fname="final_train.json")

# Vocabulary of frequent description words (threshold 0.1) over all rows.
words_all = func.deal_with_text(features_all, values, "description", 0.1)
# Column index of the raw description text.
p = features_all.index("description")

descriptions = list()
n_rows = values[:, p].shape[0]
for row in range(n_rows):
    text = values[row, p]
    # Blank or placeholder descriptions contribute an empty token list;
    # everything else keeps only the tokens present in the vocabulary.
    is_blank = (text.replace(" ", "") == ""
                or text == "Must see!"
                or text.replace(".", "") == "")
    if is_blank:
        kept = list()
    else:
        tokens = func.deal_with_text(
            features_all,
            values[row, :].reshape((1, values.shape[1])),
            "description")
        kept = [tok for tok in tokens if tok in words_all]
    descriptions.append(kept)

# Replace the raw text column with the filtered token lists.
values[:, p] = descriptions
print("description = " + str(len(descriptions)))
# Exemplo n.º 8
# 0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import basic_func as func
import time
import seaborn as sns

# Load the training data; note this call takes no filename and returns
# only three values (no DataFrame), unlike the other scripts here.
(listing_id, features, values) = func.load_unicef_data()
# print(features)
# print(values)
train_df = pd.read_json('train.json')

# hist of price
# NOTE(review): the +1 offset suggests `values` carries an extra leading
# column (listing id?) relative to `features` — TODO confirm.
p = features.index('price') + 1
min_price = min(train_df['price'])
# Cap prices at the 99th percentile to tame outliers in the histogram.
max_price = int(np.percentile(train_df['price'], 99))
for i in range(values[:, p].shape[0]):
    if values[i, p] > max_price:
        values[i, p] = max_price
# $200-wide bins. NOTE(review): range() needs ints — assumes the price
# column is integer-typed so min_price is an int; confirm.
bins_price = range(min_price, max_price, 200)

# Left panel: sorted raw prices; right panel: capped-price histogram.
fig1 = plt.figure(num='fig1')
plt.figure(num='fig1')
plt.subplot(121)
plt.scatter(range(train_df['price'].shape[0]), np.sort(train_df['price']))
plt.xlabel('index', fontsize=12)
plt.ylabel('price', fontsize=12)
plt.subplot(122)
(counts, bins, patches) = plt.hist(values[:, p],
                                   bins_price,
                                   facecolor="blue",