Exemplo n.º 1
0
def p3():
    """Plot every column of finances.csv as a quarterly time series.

    The rows of the file are labelled with consecutive quarterly periods,
    starting with the quarter that contains September 1978, and all columns
    are drawn as lines on a single labelled matplotlib figure.
    """
    # load data
    data = pd.read_csv("finances.csv")
    # quarterly basis: one period per row, so the index always matches the
    # number of rows in the file instead of assuming exactly 84 of them
    data.index = pd.period_range("1978-09", periods=len(data), freq="Q")
    # plot data
    data.plot(linewidth=1)
    plt.xlabel("Date")
    plt.ylabel("Earnings and Expenses")
    plt.title("Earnings and Expenses per Quarter")
    plt.show()
Exemplo n.º 2
0
def p1():
    """Plot the VALUE column of DJIA.csv against its dates.

    Rows whose VALUE is missing or is the placeholder "." are discarded, the
    remaining values are converted to floats, and the result is drawn as a
    thin line.
    """
    # load data
    data = pd.read_csv("DJIA.csv")
    # set index as datetime
    data.index = pd.to_datetime(data["DATE"], format="%Y-%m-%d")
    # drop empty rows and change to floats; ``assign`` returns a new frame, so
    # the conversion never writes into a filtered slice (the pattern that
    # makes pandas emit SettingWithCopyWarning)
    df = (
        data.drop(columns=["DATE"])
        .dropna(subset=["VALUE"])
        .loc[lambda frame: frame["VALUE"] != "."]
        .assign(VALUE=lambda frame: frame["VALUE"].astype("float"))
    )
    plt.plot(df["VALUE"], lw=0.5)
    plt.show()
Exemplo n.º 3
0
def p6():
    """Plot the DJIA.csv values together with their rolling maximums.

    Rows whose VALUE is missing or is the placeholder "." are discarded and
    the rest converted to floats. The cleaned data is drawn once as-is and
    once per window size (30, 120 and 365 rows) as a rolling maximum, all on
    the same figure with a legend.
    """
    # load data
    data = pd.read_csv("DJIA.csv")
    # set index as datetime
    data.index = pd.to_datetime(data["DATE"], format="%Y-%m-%d")
    # drop empty rows and change to floats; ``assign`` returns a new frame, so
    # the conversion never writes into a filtered slice (the pattern that
    # makes pandas emit SettingWithCopyWarning)
    df = (
        data.drop(columns=["DATE"])
        .dropna(subset=["VALUE"])
        .loc[lambda frame: frame["VALUE"] != "."]
        .assign(VALUE=lambda frame: frame["VALUE"].astype("float"))
    )
    # plot data
    windows = [30, 120, 365]
    plt.figure(figsize=(10, 8))
    plt.plot(df, alpha=0.5, label='actual')
    for w in windows:
        # integer window: the maximum is taken over the last w rows
        plt.plot(df.rolling(window=w).max(), alpha=0.5, label=f'window = {w}')
    plt.title('Rolling maximums')
    plt.legend()
    plt.show()
Exemplo n.º 4
0
def p5():
    """Print the days and months with the largest DJIA gain and loss.

    Reads DJIA.csv, discards rows whose VALUE is missing or is the
    placeholder ".", and converts the rest to floats. A day's change is its
    value minus the value of the previous remaining row. A month's change is
    the first value of that month minus the first value of the previous
    month; months are labelled with their month-end date. The four extreme
    dates are printed in a single sentence-style message.
    """
    # load data
    data = pd.read_csv("DJIA.csv")
    # set index as datetime
    data.index = pd.to_datetime(data["DATE"], format="%Y-%m-%d")
    # drop empty rows and change to floats; ``assign`` returns a new frame, so
    # the conversion never writes into a filtered slice (the pattern that
    # makes pandas emit SettingWithCopyWarning)
    df = (
        data.drop(columns=["DATE"])
        .dropna(subset=["VALUE"])
        .loc[lambda frame: frame["VALUE"] != "."]
        .assign(VALUE=lambda frame: frame["VALUE"].astype("float"))
    )
    # find difference for each day; idxmax/idxmin skip the leading NaN and
    # locate each extreme in one pass instead of sorting the whole frame
    diff = df.diff()
    s_g = diff["VALUE"].idxmax()
    s_l = diff["VALUE"].idxmin()
    # find difference for each month; a MonthEnd offset object is used rather
    # than the string alias "M", which newer pandas releases deprecate
    m_data = df.resample(pd.offsets.MonthEnd()).first()
    m_diff = m_data.diff().dropna()
    m_g = m_diff["VALUE"].idxmax()
    m_l = m_diff["VALUE"].idxmin()
    print(
        f"The single day with the largest gain was {s_g}. "
        f"The single day with the largest loss was {s_l}. "
        f"The month with the largest gain was {m_g}. "
        f"The month with the largest loss was {m_l}."
    )
Exemplo n.º 5
0

#%% Case 3a: Unbalanced datasets for classification purposes. Following case 1, here is the equivalent solution:

from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing; stratify=y asks scikit-learn to
# keep the label proportions of y the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
)

#%%%
import pandas as pd
from sklearn.model_selection import train_test_split
# Scratch cell: separate the 'mpg' column as the prediction target and hold
# out 20% of the rows as a test set.
# NOTE(review): 'path_to_data_file' looks like a placeholder to be replaced
# with a real CSV path.
datafile_name = 'path_to_data_file'
data = pd.read_csv(datafile_name)
# NOTE(review): this rebinding discards the frame that was just read and uses
# `df` instead; `df` is not assigned anywhere above this line in the file, so
# confirm where it is expected to come from.
data=df
#target_attribute = data['column_name']
target_attribute = data['mpg']
# `columns=` already names the axis, so the extra `axis = 1` is redundant.
data = data.drop(columns = ['mpg'], axis = 1) 
X_train, X_test, y_train, y_test = train_test_split(data, target_attribute, test_size=0.2)
# Bare expressions: they only display a value in an interactive session and
# have no effect when the file is run as a script.
X_train
X_test
y_train
y_test
# Length of the string form of every cell in df; the result is not stored.
df.applymap(lambda x: len(str(x)))
# NOTE(review): the trailing `?` is IPython help syntax, not valid Python, so
# this line is a SyntaxError outside an IPython/Jupyter session.
pd.applymap?
# NOTE(review): `applymaplen` does not look like a pandas function; presumably
# leftover scratch input. Verify and remove if so.
pd.applymaplen(X_train,X_test)

#%%% To split into more than two subsets, such as train, test, and validation, one can do:
# The imports come first so that `np` is defined before np.random.rand is
# called below.
import numpy as np
import keras

# One uniform random draw in [0, 1) per row of df decides which subset the row
# belongs to, so the 70% / 15% / 15% proportions hold only approximately.
probs = np.random.rand(len(df))
training_mask = probs < 0.7
test_mask = (probs >= 0.7) & (probs < 0.85)
validation_mask = probs >= 0.85
# Bare expression: displays the draws in an interactive session only.
probs

# Columns kept from parkinsons.csv, in the order they appear in `df`.
column_names = [
    "subject#",
    "age",
    "sex",
    "test_time",
    "motor_UPDRS",
    "total_UPDRS",
    "Jitter(%)",
    "Jitter(Abs)",
    "Jitter:RAP",
    "Jitter:PPQ5",
    "Jitter:DDP",
    "Shimmer",
    "Shimmer(dB)",
    "Shimmer:APQ3",
    "Shimmer:APQ5",
    "Shimmer:APQ11",
    "Shimmer:DDA",
    "NHR",
    "HNR",
    "RPDE",
    "DFA",
    "PPE",
]

# Read the raw table, then rebuild it with exactly the columns listed above.
data = pd.read_csv("parkinsons.csv")
df = pd.DataFrame(data, columns=column_names)

# Column drops. data3 is cut from `data`, not from data2, so it still holds
# "age" and "sex"; data1 and data2 are not read again in this section.
data = df.drop(columns=["subject#"])
data1 = data.drop(columns=["age"])
data2 = data1.drop(columns=["sex"])
data3 = data.drop(columns=["test_time"])

# Predictors: motor_UPDRS followed by Jitter(%) through PPE, in file order.
X = data3[[
    name
    for name in column_names
    if name not in ("subject#", "age", "sex", "test_time", "total_UPDRS")
]]

# Target, kept as a single-column DataFrame.
y = data3[["total_UPDRS"]]

# Random 80/20 split into training and test partitions.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)


def get_model():