Example #1
    def parse_file(self, sgmfiles):
        '''
        reads the given .sgm files and extracts the content
        between the required tags
        '''

        for filename in sgmfiles:
            with open(filename, 'r') as fopen:
                lines = fopen.read()
            # each article is wrapped in a <REUTERS ...>...</REUTERS> block
            pattern = r"<REUTERS[^>]*>([\s\S]*?)</REUTERS>"
            reuters = re.findall(pattern, lines)
            for line in reuters:
                rec_dict = {}
                #removing special characters from the obtained text
                line = CleanData().clean_spc_articles(line)
                rec_dict['REUTERS'] = line
                t_pttrn = r"<TITLE>(.*?)</TITLE>"
                title = re.findall(t_pttrn, line)
                if len(title) != 0:
                    title = CleanData().clean_spc_articles(title[0])
                rec_dict['TITLE'] = title
                txt_pttrn = r"<TEXT[^>]*>([\s\S]*?)</TEXT>"
                text = re.findall(txt_pttrn, line)
                if len(text) != 0:
                    text = CleanData().clean_spc_articles(text[0])
                rec_dict['TEXT'] = text
                #inserting each article into the database into the Articles collection
                ConnectDb().insert_data(rec_dict, self.connect, 'ReuterDb',
                                        'Articles')
    def process_data(self):
        '''
        cleans the fetched tweets and inserts them into the ProcessDb database
        '''
        data = ConnectDb().find_data(self.connect, 'RawDb', 'RawTweets')
        for value in data:
            processed_data = {}
            for prop in value:

                # keep None values as-is and skip ObjectIds; clean everything else
                if isinstance(value[prop], bson.objectid.ObjectId) or value[prop] is None:
                    if value[prop] is None:
                        processed_data[prop] = value[prop]
                else:
                    try:
                        formatted = CleanData().clean_emoji_data(
                            value[prop])  #remove emoji
                        clean_url = CleanData().clean_url_data(
                            formatted)  #remove url
                        clean_spc = CleanData().clean_spc_chars(
                            clean_url)  #remove special characters
                        processed_data[prop] = clean_spc
                    except Exception:
                        # skip fields that cannot be cleaned
                        pass
            #insert cleaned data into process db
            ConnectDb().insert_data(processed_data, self.connect, 'ProcessDb',
                                    'Tweets')
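For readers who want to try the tag extraction outside the class, here is a standalone sketch using the same patterns as parse_file; the sample SGML string below is invented for illustration.

import re

# Hypothetical miniature SGML string; only the tag patterns mirror parse_file.
sample = ('<REUTERS TOPICS="YES" NEWID="1">'
          '<TITLE>Example headline</TITLE>'
          '<TEXT TYPE="NORM">Body of the article.</TEXT>'
          '</REUTERS>')

for article in re.findall(r"<REUTERS[^>]*>([\s\S]*?)</REUTERS>", sample):
    title = re.findall(r"<TITLE>(.*?)</TITLE>", article)
    body = re.findall(r"<TEXT[^>]*>([\s\S]*?)</TEXT>", article)
    print(title[0] if title else '', body[0] if body else '')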
Example #3
    def predict(self):
        # evaluate the fitted model on the held-out test set
        X = self.__X_test.drop('rent_amount_boxcox', axis=1)
        y = self.__X_test['rent_amount_boxcox']
        ypred = self.__xgbRegression.predict(X)
        print('MAE:', metrics.mean_absolute_error(y, ypred))
        print('MSE:', metrics.mean_squared_error(y, ypred))
        print('RMSE:', np.sqrt(metrics.mean_squared_error(y, ypred)))
        print('r2_score:', metrics.r2_score(y, ypred))

    """ def test(self, lambda_):
        test_data = np.array([2, 1, 4, 2, 3, 0, 1, 0, 1, 5, 3, 2, 10])
        ypred = self.__linearRegression.predict(test_data)
        scipy.special.inv_boxcox(ypred, lambda_) """


start = time.time()
clean_data = CleanData("house_price.csv")
data = clean_data.fit()
encode_data = EncodeData(data)  
data = encode_data.fit()
corr = Correlation(data)
data = corr.corr_fit()
split = SplitData(data)
X, x = split.fit()
para_x = split.getParameters()
print(para_x)
xgb = XGBReg(X, x)
xgb.fit_()
xgb.predict()

print("Total time taken:", time.time() - start)
Example #4
from download_data import vars_to_pull
from clean_data import CleanData
from analyze_data import Results, initial_data
from analyze_data_sklearn import SkLearnResults, init_data
from linear_regression_analysis import LinearData
if __name__ == '__main__':
    data_cleaner = CleanData(vars_to_pull, is_test=True)
    # Note: downloading all data can take 12+ hours.
    #   The download is skipped if the data already exists (if a partial download
    #   occurred, move the files aside to force a fresh download).
    data_cleaner.run_download()
    data_cleaner.clean_data()
    # Calculating residuals can take 30+ minutes
    results = Results(initial_data)
    results.run_analyze()
    sklresults = SkLearnResults(init_data)
    sklresults.run_sk_analysis()
    lrd = LinearData(init_data)
    lrd.run_linear_analysis()
Example #5
def plot_feature_size(num_iter):
    """Tests various feature sizes and plots the error.

    Args:
        num_iter: Number of times to test for each point.
    """
    points = [100, 500, 1000, 2000, 3000, 4000, 5000, 6000, 7000,
              8000, 9000, 10000]
    errors = []
    train_errors = []

    # Iterate over all points defined.
    for point in points:
        print "Testing for point", point, "features."
        error = 0
        train_error = 0

        # Repeat the test the desired number of times.
        for i in range(0, num_iter):
            cd = CleanData(tfidf=True, max_train_size=25000, max_features=point)

            try:
                # Get and train data.
                training_data = cd.bag_of_words(in_file="data/clean_train_input.csv")

                ids, X, y = get_numpy_matrices(training_data)

                del training_data

                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

                del X, y, ids

                nb = NaiveBayes()
                nb.train(X_train, y_train)

                # Calculate training and validation errors.
                out = nb.classify(X_test)
                error += nb.compute_error(out, y_test)

                train_out = nb.classify(X_train)
                train_error += nb.compute_error(train_out, y_train)
            except MemoryError:
                print "Memory error. Continuing."
                continue

            del X_train, X_test, y_train, y_test

        errors.append(error / num_iter)
        train_errors.append(train_error / num_iter)

    # PLOT.
    plt.figure(2)

    plt.title("Error vs Features")
    plt.xlabel("Number of features")
    plt.ylabel("Error")
    # plt.xscale('log')
    plt.plot(points, errors, '-ro')
    plt.plot(points, train_errors, '-bo')
    plt.show()
Example #6
    print "Training Naive Bayes classifier."
    nb.train(X_train, y_train)
    print "Done training."

    print "Classifying training input."
    out = nb.classify(X_train)

    print "Training error:", nb.compute_error(out, y_train)

    # Clean up unused arrays.
    del out
    del y_train
    del X_train


if __name__ == '__main__':
    cd = CleanData(tfidf=True, max_train_size=15000, max_features=7000)
    nb = NaiveBayes()

    train_naive_bayes(cd, nb)

    # Classify the test data.
    classify_test_data(cd, nb, "results/nb_predictions2.csv")

    # Tests.
    # plot_max_train_size(3)
    # plot_feature_size(3)

    print "Completed successfully."
Example #7
        columns.remove("rent_amount_boxcox")
        test_data = np.array([2, 1, 4, 2, 3, 0, 1, 0, 1, 5, 3, 2, 91]).T
        index = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

        df = pd.DataFrame(test_data, index=index, columns=columns)

        
        #for i in range(len(self.__X_train.columns)):
        #    df.loc[0, df.columns[i]] = test[i]
        #ypred = self.__model.predict(test_data)
        #return scipy.special.inv_boxcox(ypred, lambda_)
        return df """


start = time.time()
clean_data = CleanData("NO-CHANGES\hyd_v2.csv")
data = clean_data.fit()
encode_data = EncodeData(data)  
data = encode_data.fit()
corr = Correlation(data)
data = corr.corr_fit()
split = SplitData(data)
X, x = split.fit()
xgb_ = XGBReg(X, x)
print(xgb_.fit_())
choice = input("Enter do you want to save this model.....type 'yes' to save or 'no' to ignore: ")
choice = choice.lower()

if choice == 'yes':
    xgb_.save_XGBmodel()
else:
    pass  # model not saved
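save_XGBmodel is not shown in this snippet; if it wraps XGBoost's native save_model, reloading the fitted regressor later might look like this rough sketch (the filename is hypothetical).

import xgboost as xgb

# Rough sketch, not from the source: reload a regressor previously persisted
# with save_model; "xgb_model.json" is a made-up filename.
reg = xgb.XGBRegressor()
reg.load_model("xgb_model.json")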
Example #8
from sklearn import svm
from clean_data import CleanData

__author__ = "Yacine Sibous"

cd = CleanData()

print "Getting the training data."
training_data = cd.bag_of_words(in_file="data/clean_train_input.csv")
print "Done collecting data."

X = [row[1] for row in training_data]
y = [row[2] for row in training_data]

print X[0:5]
print y[0:5]
clf = svm.SVC()
clf.fit(X, y)
Example #9
        y = exp_smooth.exp_smooth_forecast(xs_fit_opt, True)[-forecast_periods:]
    else: # Consumption forecasting with elasticity and income
        y = 8

    # Mask any negative, zero, infinity, or n/a values before returning
    y = np.ma.masked_less_equal(y, 0)
    y = np.ma.fix_invalid(y)
    return y

# Format all rows
new_datum_xs = np.ma.masked_all(datum_xs.shape, float)
count = 0
for row in datum_xs:
    try:
        start, stop = np.ma.flatnotmasked_edges(row[VALUE_COLUMN:][np.newaxis, :])
        values = CleanData(row[VALUE_COLUMN:stop + VALUE_COLUMN + 1][np.newaxis, :], X)
        xs = np.ma.hstack((values.get_return_values().flatten(), np.ma.masked_all(X.shape[0] - stop - 1)))
    except TypeError: # Some GDP rows do not have any values, therefore remove them
        xs = np.ma.array([0])
    if np.ma.sum(xs):
        new_datum_xs[count] = np.ma.hstack((row[ID_SLICE], xs))
        count += 1

# Resize the array to remove blank rows of data
new_datum_xs = np.ma.resize(new_datum_xs, (count, new_datum_xs.shape[1]))

# Append population and population net change arrays to the formatted and forecasted datum table
count = 0
Q = "SELECT * FROM Datum WHERE element_id BETWEEN 511 AND 703"
pop_xs = np.ma.masked_equal(cursor.execute(Q).fetchall(), -1)[:, 1:]
pop_xs = np.ma.filled(np.ma.column_stack(
Example #10
from sklearn import svm
from sklearn.cross_validation import cross_val_score
from clean_data import CleanData
import numpy as np
import csv

__author__ = "Yacine Sibous, Jana Pavlasek"

# Initialize data for final submission.
cd = CleanData(tfidf=True, max_features=2500000, n_grams=3)

# Get features and output.
print 'Getting Training data.'
X, y = cd.bag_of_words(in_file="data/clean_train_input.csv", sparse=True)
print 'Done collecting data.'

# Train.
print 'Training the model.'
lin_clf = svm.LinearSVC()
lin_clf.fit(X, y)
print 'Done training.'

# 3-fold cross validation.
print 'Cross Validation'
c_validation = cross_val_score(lin_clf, X, y, scoring='accuracy')
print c_validation.mean()

# Get and predict on the final test data.
print 'Collecting test data.'
test = cd.get_x_in(sparse=True)
print 'Done collecting data.'
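The snippet stops before the prediction and output step; given the csv import at the top, writing the test predictions out might look roughly like the sketch below (the output path and column names are hypothetical; lin_clf and test come from above).

# Rough sketch, not from the source: predict on the collected test features
# and write an id,prediction CSV; the path and header names are invented.
predictions = lin_clf.predict(test)

with open('results/svm_predictions.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'prediction'])
    for i, p in enumerate(predictions):
        writer.writerow([i, p])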
Example #11
import folium
from folium import plugins
from folium.plugins import MarkerCluster
from clean_data import CleanData

met_df = CleanData()

m = folium.Map(zoom_start=4, tiles="OpenStreetMap")

cluster_map = MarkerCluster().add_to(m)

# Create an individual marker for each meteorite, adding it to a cluster
for coord in [tuple(x) for x in met_df.to_records(index=False)]:
    ID = coord[1]
    name = coord[0]
    year = coord[6]
    mass = coord[4]
    rec_class = coord[3]
    latitude = coord[7]
    longitude = coord[8]
    
    # Manually generate row index 
    #index = met_df[(met_df["reclat"] == latitude) & (met_df["reclong"] == longitude)].index.tolist()[0]    
    
    # Create custom marker icon | causes size increase of output html file 
    #meteorite_icon = folium.features.CustomIcon('Assets/meteorite.png', icon_size=(80,80))
    
    html = f"""
    <table border="1">
        <tr>
            <th> ID </th>