Example #1
def outputs(model, pred, df, csv_path_name):
    df = clean_data(df)
    pred = clean_data(pred)
    y = df['price']
    x = df.drop(columns='price')
    model = model()  # instantiate the passed-in estimator class
    model.fit(x, y)
    y_pred = model.predict(pred)
    return pd.DataFrame(y_pred, columns=["price"]).to_csv(csv_path_name)
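For context, a minimal usage sketch of the helper above; the estimator choice, the toy DataFrames, and the stand-in clean_data are assumptions for illustration, not part of the source project:

import pandas as pd
from sklearn.linear_model import LinearRegression

def clean_data(frame):
    # stand-in so the sketch runs; the project's real clean_data is not shown here
    return frame

train_df = pd.DataFrame({'sqft': [700, 900, 1100], 'price': [150, 200, 250]})
new_df = pd.DataFrame({'sqft': [800, 1000]})
outputs(LinearRegression, new_df, train_df, 'predictions.csv')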
Example #2
def main(argv):
    df = clean_data(argv[0])

    X = df.drop(['PassengerId', 'Survived'], axis=1)
    y = df['Survived']

    select_features(X, y)
Example #3
def test(testfile, outputfile, best=False):
    features_n = 14 if best else 18  # number of features used

    test_data = pd.read_csv(testfile)

    if best:
        test_data = test_data[test_data["測項"] != "RAINFALL"]
        test_data = test_data[test_data["測項"] != "THC"]
        test_data = test_data[test_data["測項"] != "WD_HR"]
        test_data = test_data[test_data["測項"] != "WIND_DIREC"]

    test_data = clean_data(test_data)  # assumed to return a 2-D numpy array here
    X_test = []
    for i in range(0, test_data.shape[0], features_n):
        X_test.append(test_data[i:i + features_n, :].ravel())
    X_test = np.array(X_test)
    # print(X_test)
    X_test = np.concatenate((np.ones((X_test.shape[0], 1)), X_test), axis=1)
    weightsfile = "./models/weights-best.npy" if best else "./models/weight.npy"
    Y_test = np.dot(X_test, np.load(weightsfile))

    # create the output directory if it does not exist
    dirname = os.path.dirname(outputfile)
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    with open(outputfile, "w", newline="") as csvfile:  # newline="" avoids blank rows from the csv module on Windows
        writer = csv.writer(csvfile)
        writer.writerow(["id", "value"])
        for i, y in enumerate(Y_test):
            writer.writerow(["id_%d" % i, y])
Example #4
def main(country, yeari, yearf):
    data = get_data()
    cldata = clean.clean_data(data)
    filfix = filter_fixer(cldata, country, yeari, yearf)
    pop_plot_r, gr_plot_r = analyse_this(filfix)
    nomb, altnomb, capi, regi, langu, cpopul = enrich_that(filfix)
    rpdf = make_PDF(nomb, altnomb, capi, regi, langu, cpopul, pop_plot_r,
                    gr_plot_r)
    return rpdf
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-source",
                        choices=["local", "remote"],
                        nargs=1,
                        help="where data should be gotten from")
    args = parser.parse_args()

    location = args.source[0]

    if location == "local":
        download()
        grab_data_from_downloaded_raw_files()
    else:
        grab_data_by_scraping_and_api_requests()

    future_dates, past_dates_format = clean_data()
    draw_picuture(future_dates, past_dates_format)
Example #6
def main():

    dir_path = './raw_html'
    #dir_path = input("Input data file path: ")

    df_record = []

    file_list = get_file_list(dir_path)
    #print(file_list)

    for file_name in file_list:
        file_path = os.path.join(dir_path, file_name)
        tree = read_html(file_path)

        record = parse_html(tree)
        df_record.append(record)

    df = pd.DataFrame(df_record)

    lagou = clean_data(df)
    #print(df.head())
    print(lagou)

    lagou.to_csv('./output/lagou.csv', encoding='gbk')
Example #7
    find_thresh(preds, y_test, 10)

    cross_val_recall_auc(X_train, y_train, best_rf_model)


####################################################

###############################################
############### MAIN ##########################

if __name__ == '__main__':
    df = pd.read_json('website/data/data.json')

    ## Clean the data
    cleaned = clean.clean_data(df)
    print(cleaned.columns)

    ## Getting targets and cleaned features
    y = clean.get_target(cleaned)

    # log_reg_cols = ['user_age', 'age_dummy']
    # fit_logreg(y, cleaned, log_reg_cols)

    #Cols: 'USD','GBP','CAD','AUD','EUR','NZD','MXN','age_dummy','user_age','payoutdiff', 'eventdiff', 0.0, 1.0, 3.0, 'gts', 'num_order', 'num_payouts','payee_exists'
    rf_cols = [
        'USD', 'GBP', 'CAD', 'AUD', 'EUR', 'NZD', 'MXN', 'age_dummy',
        'user_age', 'payoutdiff', 'gts', 'num_order', 'num_payouts',
        'payee_exists', 'dict_elements'
    ]
    rf_model = fit_rf(y, cleaned, rf_cols)
Example #8
def main(argv):
    df_train = clean_data(argv[0])
    df_test  = clean_data(argv[1])

    select_parameters(df_train, df_test)
Example #9
def main(argv):
    df_train = clean_data(argv[0])
    df_test  = clean_data(argv[1])

    make_prediction(df_train, df_test, fit_tree, './output/prediction_tree.csv')
Example #10
modules = ["CMY 127"]
#modules = ["CMY 382", "CMY 383", "CMY 384", "CMY 385"]
cohorts = [2011, 2012, 2013, 2014, 2015]
show_plans = [
    'Chemistry', 'Microbiology', 'Biochemistry', 'Physics', 'Geology',
    'Medical', 'Biological', 'Human Physiology', 'Genetics',
    'Veterinary Science'
]

names = ["Marks", "Plans", "Students"]
file_names = {
    name: [
        "../Data/{0}/{1}".format(name, f)
        for f in os.listdir('../Data/{0}'.format(name))
    ]
    for name in names
}
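# Equivalent with pathlib, arguably clearer (a readability suggestion, not the project's code):
#   from pathlib import Path
#   file_names = {name: [str(p) for p in Path('../Data', name).iterdir()]
#                 for name in names}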

plans = clean.read_plan(file_names["Plans"])

for cohort in cohorts:
    sn, marks = clean.read_marks(file_names["Marks"], modules, cohort, True)
    df = clean.read_student(file_names["Students"], cohort, sn, plans, marks)
    dfnew = clean.clean_data(df, show_plans, 15)

    for key, val in d3s.items():
        gen_html("base_bl.html", get_json.get(dfnew), modules[0], str(cohort),
                 val, "../Output/Dark/{}".format(key))
        gen_html("base_mbl.html", get_json2.get(dfnew), modules[0], str(cohort),
                 val, "../Output/Dark/{}".format(key), True)
Example #11
                   RandomForest]  #, LogisticRegression]

accuracy_dict = dict(
    zip([platform.__name__ for platform in model_platforms],
        [[] for i in range(len(model_platforms))]))
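# Note: the zip construction above is equivalent to the more direct dict
# comprehension (a readability suggestion, not the project's code):
#   accuracy_dict = {platform.__name__: [] for platform in model_platforms}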
raw_df = pd.read_csv(filepath)
'''
bin_map = {'age': linspace(0, 100, 11).tolist()}#,
           #'capital.gain': linspace(0, 100000, 21).tolist()}#,
           #'capital.loss': unique(raw_df['capital.loss']).tolist(),
           #'hours.per.week': linspace(0, 100, 11).tolist()}

raw_df = cont2cat(raw_df, bin_map)
'''
for platform in model_platforms:
    df = clean_data(raw_df.copy(), data_cleaning_dict)

    model = platform()

    df1 = model.format_attr(df)

    for it in range(100):
        X_train, X_test, y_train, y_test = split_data(df1, response_variable)

        trained_model = model.create_model(X_train, y_train)

        accuracy_dict[platform.__name__].append(
            model.get_results(X_test, y_test)['Accuracy'])
    '''
    for it in range(100):
        skf = StratifiedKFold(n_splits=10)
Example #12
def main(argv):
    df_train = clean_data(argv[0])
    df_test  = clean_data(argv[1])

    make_prediction(df_train, df_test, fit_linear_model, './output/prediction_logit.csv')
Example #13
codes_dict = dict_all  # all codes:plans for all years' worth of files
# Generate the master dataframe containing all data for all students
master_ = process.get_master(filelist, years)

pbar = ProgBar()

i = 1
for k, p in plans_desc.items():

    pbar.pvar.set(i / len(plans_desc) * 100)
    pbar.update()
    # Get raw data frame containing all student data for plan p (code k)
    df_raw = process.get_plan(k, master_, cohort, int(mg_dict[k]), codes_dict)

    # Check that there is data in the df before continuing (maybe change to check for multiple "terms" in the df)
    if len(df_raw) > 10:
        df_clean = clean.clean_data(df_raw, cohort, int(years[-1]))

        newname = p.replace(":", " ").replace(" ", "_")
        name = "{0}_{1}".format(str(cohort), newname)

        gen_html(get_json.get(df_clean), name, saveloc)

    if int(pbar.pvar.get()) == 100:
        pbar.button.config(state="normal", command=pbar.finish_button)

    i += 1

pbar.mainloop()
Example #14
File: main.py Project: GHIcarus/kgl-avoc
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from clean import clean_data
from train import get_scores

df = pd.read_csv('avocado.csv')

df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

df_train = clean_data(df_train)

avoc_labels = df_train['AveragePrice']
df_train.drop(['AveragePrice'], axis=1, inplace=True)

lin_reg = LinearRegression()
lin_scores = get_scores(lin_reg, df_train, avoc_labels)
print(lin_scores.mean())

tree_reg = DecisionTreeRegressor()
tree_scores = get_scores(tree_reg, df_train, avoc_labels)
print(tree_scores.mean())

rf_reg = RandomForestRegressor()
rf_scores = get_scores(rf_reg, df_train, avoc_labels)
print(rf_scores.mean())
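The imported get_scores is defined in the project's train module and is not shown here; a plausible stand-in based on sklearn's cross-validation API (an assumption about its behavior, not the actual code):

import numpy as np
from sklearn.model_selection import cross_val_score

def get_scores(model, X, y, cv=10):
    # cross-validated RMSE; sklearn reports negated MSE, so flip the sign
    mse = -cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv)
    return np.sqrt(mse)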
Example #15
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score

from clean import clean_data  # assumed import path, following the other examples

df = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

# clean dataset
clean = clean_data(df)
df_no_impact = clean[clean.Impact == 0]
df_impact = clean[clean.Impact == 1]
df_no_impact = df_no_impact.sample(frac=0.2, replace=False, random_state=1)
df_cleaned = pd.concat([df_impact, df_no_impact])
X = df_cleaned.drop(columns=['Impact'])
y = df_cleaned['Impact']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=0,
                                                    stratify=y)
X_train.head()

# logistic regression
model_lr = LogisticRegression(
Example #16
    'Express SocSci views', 'Science is difficult', 'Observe experiments',
    'Conduct experiments', 'Solve science problems', 'Express science views',
    'Watch TV', 'Read magazine', 'Read a book', 'Play games',
    'Help in household'
]

# target variables
target = ['performance', 'Maths %', 'Reading %', 'Science %', 'Social %']

# a new column "performance" is added to target as the
# average of ('Maths %', 'Reading %', 'Science %', 'Social %')
marks["performance"] = marks[['Maths %', 'Reading %', 'Science %',
                              'Social %']].mean(axis=1, skipna=True)

# cleaning data into featureset(X) and target(y)
X, y = clean_data(marks, category, 'performance')

# printing the stats of cleaned data
print("No. of null values in X:\n", X.isnull().sum())
print("No. of null values in y:\n", y.isnull().sum())

# Creating a pipeline
pipe = Pipeline(steps=[
    # ('poly',PolynomialFeatures(degree=2, interaction_only = False)), # Uncomment this line to include extra polynomial features
    # ('selK',SelectKBest(score_func = f_regression, k='all')),
    # ('pca',PCA(n_components = 10)),
    ('regression', LinearRegression(fit_intercept=True, normalize=True))
])
pipe.fit(X, y)

# Print the classifier
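Note that the normalize argument was removed from LinearRegression in scikit-learn 1.2. On current versions the usual replacement is an explicit scaling step in the pipeline; a hedged equivalent keeping the style above:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('regression', LinearRegression(fit_intercept=True)),
])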
Example #17
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

import clean

customers, transactions = clean.get_data("Datasource")

charities = clean.get_charities(transactions)

data = clean.clean_data(customers, transactions, charities)

cut = round(data.shape[0] * 0.7)

train = data.iloc[:cut, :]
test = data.iloc[cut:, :]

X_train, y_train = clean.sep_Xy(train, charities)
X_test, y_test = clean.sep_Xy(test, charities)

model = Sequential()
model.add(Dense(250, activation="relu", input_dim=X_train.shape[1]))
model.add(Dense(150, activation="relu"))
model.add(Dense(y_train.shape[1], activation="relu"))

model.compile(optimizer="adam",
              loss="mean_squared_error",
              metrics=["accuracy", "mae", "mse"])

model.fit(X_train, y_train, batch_size=4, epochs=5)
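A hedged follow-up, not in the original snippet, showing how the held-out slice prepared above would typically be scored:

# evaluate returns [loss, accuracy, mae, mse] given the metrics compiled above
loss, acc, mae, mse = model.evaluate(X_test, y_test, batch_size=4)
print("test MAE: %.3f" % mae)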
Example #18
def shape_element(element):
    if element.tag != "node" and element.tag != "way":
        return None             
    
    # Create an empty JSON object
    # Note: node is used as a generic name
    # here for "nodes" and "ways"
    node = {}
    node['created'] = {}
    node['type'] = element.tag
    
    # Create an empty list for GPS position
    if node['type'] == "node":
        node['pos'] = []
    
    # Insert all attributes of the element
    key_list = element.keys()    
    for key in key_list:
        
        # Insert "created" values into a nested dictionary
        if key in created:
            node['created'][key] = element.attrib[key]
       
        # Insert "pos" coordinates ordered into a list     
        elif key == 'lat' or key == 'lon':
            node['pos'] = [float(element.attrib['lat']), 
                            float(element.attrib['lon'])]
                            
        # Create a key, value entry for all other attributes
        else:
            node[key] = element.attrib[key]
    
    # Loop through all children of the element and include their attributes
    for child in element:
        if child.tag == 'tag':
            
            tag_key, tag_value = child.attrib['k'], child.attrib['v']

            # If the tag is all lowercase, clean the data if needed,
            # then insert it                                                            
            if lower.search(tag_key) is not None:
                if tag_key in needs_cleaning:
                    tag_value = clean.clean_data(tag_key, tag_value)
                node[tag_key] = tag_value

            # If there is a colon in the key, split it and create
            # a nested dictionary as the entry
            elif lower_colon.search(tag_key) is not None:
                colon_location = tag_key.find(":")
                main_key, nested_key = tag_key[:colon_location], tag_key[colon_location+1:]
                
                # name:(language) contains foreign characters
                # and is only present on a few entries. These are skipped.
                if main_key == "name":
                    # Continue to next child in element
                    continue
                    
                # Categories like building vs. building:levels conflict
                # because "building" holds a string while
                # "building" with a colon wants to hold a dictionary.
                # To preserve all data, "building" is converted to a
                # nested dictionary and the old data is stored under
                # ['building']['type']
                if main_key in node.keys() and isinstance(node[main_key], str):
                    temp = node[main_key]
                    node[main_key] = {}
                    node[main_key]["type"] = temp
                
                
                if main_key not in node.keys():
                    node[main_key] = {}
                
                if nested_key in needs_cleaning:
                    tag_value = clean.clean_data(nested_key, tag_value)
                
                node[main_key][nested_key] = tag_value                                        
                
        # Add node references in list format
        if child.tag == 'nd':
            if "node_refs" not in node:
                node["node_refs"] = []
            node["node_refs"].append(child.attrib['ref'])

    return node
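shape_element relies on several module-level names (created, lower, lower_colon, needs_cleaning) defined elsewhere in the project. A hedged sketch of what they plausibly look like; the regex patterns and list contents are assumptions:

import re
import clean  # the project's cleaning module, assumed to expose clean_data(key, value)

created = ["version", "changeset", "timestamp", "user", "uid"]
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
needs_cleaning = ["phone", "postcode"]  # illustrative guess at which keys get cleaned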
Example #19
    # temp = spam_emails[j].get_content()
    msg = spam_emails[j]
    print(j)

    for part in msg.walk():
        # gets only text part of email
        if part.get_content_type() == 'text/plain':
            try:
                temp = part.get_content()
            except LookupError:
                print('exception at ', j)

    spam.append(temp)

# cleans up email data
new_ham, new_spam = clean.clean_data(ham, spam)

# take all text emails and put them into individual text files
new_ham_location = "C:\\Users\\Student\\Desktop\\email_files\\ham"
new_spam_location = "C:\\Users\\Student\\Desktop\\email_files\\spam"

count = 1
for script in new_ham:
    filename = new_ham_location + '\\' + str(count) + '.txt'
    print(filename)
    with open(filename, "w", encoding='utf-8') as script_file:
        script_file.write(script)
    count += 1
Example #20
def train(X_train, Y_train, b=10.0):
    # add a constant bias column of value b (prediction code must use the same constant)
    X_train = np.concatenate((np.full((X_train.shape[0], 1), b), X_train), axis=1)
    w = np.zeros(X_train.shape[1])
    lr = 0.1
    iteration = 10000
    n = X_train.shape[0]
    grad_squared_sum = np.zeros(X_train.shape[1])  # running sum of squared gradients for AdaGrad

    for t in range(iteration):
        loss = Y_train - np.dot(X_train, w)
        RMSE = math.sqrt(np.sum(loss ** 2) / n)
        grad = -2 * np.dot(X_train.T, loss)

        grad_squared_sum += grad ** 2
        w = w - lr * grad / np.sqrt(grad_squared_sum)
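        # note: classic AdaGrad adds a small epsilon inside the square root,
        # e.g. np.sqrt(grad_squared_sum) + 1e-8, so a feature whose gradient is
        # identically zero does not produce a 0/0 division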

        print("Iteration %d: RMSE = %f" % (t, RMSE))

    np.save("./models/weights-best", w)
    return w

if __name__ == "__main__":
    features_n = 14 # number of features used
    train_data_list = []
    for i in range(1, len(sys.argv)):
        train_data_list.append(pd.read_csv(sys.argv[i]))
    data = clean_data(pd.concat(train_data_list))
    data = flatten_data_remove_outliers(data, features_n)
    X_train, Y_train = parse_train_data(data)
    w = train(X_train, Y_train)
Example #21
import pandas as pd
import clean

# reading and cleaning data
train = pd.read_csv('train.csv')
clean.clean_data(train)
test = pd.read_csv('test.csv')

# first model (all women survive, all men die)
# independent of training set
test["survived_prediction"] = 0
test.loc[test.Sex == "female", "survived_prediction"] = 1

submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": test["survived_prediction"]
})
submission.to_csv("titanic_gender.csv", index=False)
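As a quick sanity check, the same gender rule can be scored against the training labels; a hedged addition assuming the standard Titanic column names survive clean_data:

train["survived_prediction"] = 0
train.loc[train.Sex == "female", "survived_prediction"] = 1
print("train accuracy:", (train["survived_prediction"] == train["Survived"]).mean())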
Example #22
    driver.find_element_by_tag_name('body').send_keys(Keys.ARROW_RIGHT)
    try:
        video_time = driver.find_element_by_class_name('ytp-time-duration')
        if video_time.text == '':
            time_ad = int(current_ad.split('\n')[1][-2:])
            time.sleep(time_ad + 1)
            driver.find_element_by_tag_name('body').send_keys(Keys.ARROW_RIGHT)
            video_time = driver.find_element_by_class_name('ytp-time-duration')
            time_ls.append(str(video_time.text))
        else:
            time_ls.append(str(video_time.text))
        print("Time: ", video_time.text)
    except Exception:
        time_ls.append('0')
        print("Time: 0")
    i += 1
    print('------------------------------------------')

insert_to_df()

end = time.time()
print(f'Time needed for scraping {end_video - start_video} videos:',
      timer(start, end))
driver.close()

#--------------------------------------------------------------------------------------------------------------------------------------------------
# Data cleaning
clean_data(df_export, yt_channel, df_error)
# Create Dashboard
create_dashboard(df_export, yt_channel, sub)
Example #23
import logging
import os

import clean
import model
import scraper

# region SETTINGS
logging.basicConfig(level='INFO')
ROWS_TO_SCRAPE = 600
sb_url = 'https://sfbay.craigslist.org/search/sby/apa'
nb_url = 'https://sfbay.craigslist.org/search/nby/apa'
# endregion

# Scrape Craigslist if scraped data is not present
if not os.path.isfile('../data/northbay.csv') or not os.path.isfile(
        '../data/southbay.csv'):
    southbay = scraper.scrape_apts(ROWS_TO_SCRAPE, sb_url)
    northbay = scraper.scrape_apts(ROWS_TO_SCRAPE, nb_url)
    southbay.to_csv('../data/southbay.csv')
    northbay.to_csv('../data/northbay.csv')
if not os.path.isfile('../data/merge.csv'):
    clean.clean_data()

# Option to change to 'merge2.csv'
x, y = clean.prep_model('merge2.csv')

# Fit model and generate plots
lm, selected_x = model.model(x, y)

# List of model coefficients; no application at the moment
coefficient_list = model.return_coefficients(lm, selected_x)
model.plot_analyze(lm, selected_x, y)
Example #24
def preprocess():
    logging.debug('preprocess function in process')
    # Load dataset
    # location test data
    os.listdir('C:\\Users\\Student\\Desktop\\extension_data\\hamnspam')

    ham_filenames = [
        name for name in sorted(
            os.listdir(
                'C:\\Users\\Student\\Desktop\\extension_data\\hamnspam\\ham'))
        if len(name) > 20
    ]
    spam_filenames = [
        name for name in sorted(
            os.listdir(
                'C:\\Users\\Student\\Desktop\\extension_data\\hamnspam\\spam'))
        if len(name) > 20
    ]

    # making list to match index values with filenames
    ham_emails = [
        load_email(is_spam=False, filename=name) for name in ham_filenames
    ]
    spam_emails = [
        load_email(is_spam=True, filename=name) for name in spam_filenames
    ]

    #joey
    numTrainHam = int(round(len(ham_emails) * 0.8))
    numTrainSpam = int(round(len(spam_emails) * 0.8))
    train_emails = []
    test_emails = []
    trainHam = 0
    trainSpam = 0
    testHam = 0
    testSpam = 0

    logging.debug('Entering ham for loop of adding data to test/train lists')
    for i in range(len(ham_emails)):
        # temp = ham_emails[i].get_content()
        msg = ham_emails[i]

        for part in msg.walk():
            # gets only text part of email
            if part.get_content_type() == 'text/plain':
                temp = part.get_content()

        if i < numTrainHam:
            train_emails.append(temp)
            trainHam += 1
            logging.debug('current # ham emails for training: %d' % (trainHam))
        else:
            test_emails.append(temp)
            testHam += 1
            logging.debug('current # ham emails for testing: %d' % (testHam))

    logging.debug(
        'Finished ham - total # ham training emails: %d - total # ham test emails: %d'
        % (trainHam, testHam))
    logging.debug('Entering spam for loop of adding data to test/train lists')

    for j in range(len(spam_emails)):
        # temp = spam_emails[j].get_content()
        msg = spam_emails[j]

        for part in msg.walk():
            # gets only text part of email
            if part.get_content_type() == 'text/plain':
                temp = part.get_content()

        if j < numTrainSpam:
            train_emails.append(temp)
            trainSpam += 1
            logging.debug('current # spam emails for training: %d' %
                          (trainSpam))
        else:
            test_emails.append(temp)
            testSpam += 1

    logging.debug('Finished spam - total # spam training emails: %d' %
                  (trainSpam))
    # endJOey

    #train_labels
    train_labels = []
    for x in range(0, trainHam):
        train_labels.append("ham")
        logging.debug('current # ham labels: %d' % (x + 1))

    ham_labels_len = len(train_labels)

    for x in range(0, trainSpam):
        train_labels.append("spam")
        logging.debug('current # spam labels: %d' % (x + 1))

    logging.debug('total # ham labels: %d - total # spam labels: %d' %
                  (ham_labels_len, (len(train_labels) - ham_labels_len)))

    #test_labels
    test_labels = []
    for x in range(0, testHam):
        test_labels.append("ham")

    for x in range(0, testSpam):
        test_labels.append("spam")

    # cleans up email data
    logging.debug('going to clean.py')
    train_emails_cfd, test_emails_cfd = clean.clean_data(
        train_emails, test_emails)
    logging.debug('back from clean.py')

    # return data
    return train_emails_cfd, train_labels, test_emails_cfd, test_labels
Example #25
import matplotlib.pyplot as plt
import pandas as pd
from clean import clean_data

future_dates, past_dates_format = clean_data()


def future_picture(future_dates):
    # draw picture of future_temperature_from_url and future_temperature_from_api
    plt.style.use('ggplot')
    fig = plt.figure(figsize=(10, 6))
    colors1 = '#6D6D6D'

    # read each CSV once; the URL temperatures carry a trailing unit character to strip
    url_df = pd.read_csv('future_temperature_from_url.csv')
    data1 = [float(v[:-1]) for v in url_df['Day Temperature']]
    data3 = [float(v[:-1]) for v in url_df['Night Temperature']]

    api_df = pd.read_csv('future_temperature_from_api_cleaned.csv')
    data2 = api_df['Day Temperature'].values.tolist()
    data4 = api_df['Night Temperature'].values.tolist()

    plt.plot(future_dates, data1, label='day_temperature_url')
    plt.plot(future_dates, data2, label='day_temperature_api')
    plt.plot(future_dates, data3, label='night_temperature_url')
    plt.plot(future_dates, data4, label='night_temperature_api')