Example #1
    def __init__(self, nome, diretorio, nomeExibir=None):
        self.diretorio = diretorio
        self.nome = nome

        if nomeExibir is None:
            self.nomeExibir = nome
        else:
            self.nomeExibir = nomeExibir

        self.reader = DataUtils()
        self.imagens = self.reader.obterImagens(self.diretorio)
Example #2
    def train_threaded(self, thread_number, xtrain, ytrain):
        try:
            xtrain_shuffled, ytrain_shuffled = DataUtils.shuffle_data(xtrain, ytrain)
            weight = self.weights[thread_number]
            loss = []
            accuracy = []
            learning_rate = self.learning_rate

            last_checkpoint = 0
            for i in range(self.examples_per_thread):
                itemx, itemy = xtrain_shuffled[i], ytrain_shuffled[i]
                prediction = self.predict(itemx, weight)
                if itemy * prediction < 1:
                    weight -= learning_rate * self.hinge_gradient(weight, itemx, itemy, self.regularization)

                if self.collect_data:
                    # Integer progress percentage so the 5% checkpoint test below can actually match
                    current_percentage = (thread_number * self.examples_per_thread + i) // (xtrain_shuffled.shape[0] // 100)
                    if current_percentage != last_checkpoint and current_percentage % 5 == 0:
                        last_checkpoint = current_percentage
                        loss.append(self.loss_function(xtrain, ytrain, weight))
                        accuracy.append(self.accuracy_function(xtrain, ytrain, weight))
                else:
                    if i == self.examples_per_thread - 1:
                        loss.append(self.loss_function(xtrain, ytrain, weight))
                        accuracy.append(self.accuracy_function(xtrain, ytrain, weight))

            self.weights[thread_number] = weight
            self.losses[thread_number] = loss
            self.accuracies[thread_number] = accuracy
        except Exception as e:
            print(e)
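
# The example above never shows how train_threaded is launched; presumably one thread per
# slot in self.weights runs it and the per-thread weight vectors are combined afterwards.
# Below is a minimal, self-contained sketch of that "train per thread, then average" pattern.
# Everything in it (train_thread, the toy data, the averaging step) is an assumption for
# illustration, not the example's actual code.
import threading

import numpy as np


def train_thread(thread_number, weights, X, y, lr=0.01, reg=0.01):
    # Each thread runs plain SGD with the hinge sub-gradient on its own weight vector.
    w = weights[thread_number]
    for i in np.random.permutation(len(y)):
        if y[i] * np.dot(w, X[i]) < 1:                  # margin violated
            w -= lr * (-y[i] * X[i] + 2 * reg * w)
    weights[thread_number] = w


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 5))
    y = np.where(X[:, 0] > 0, 1.0, -1.0)                # toy linearly separable labels
    n_threads = 4
    weights = [np.zeros(X.shape[1]) for _ in range(n_threads)]
    threads = [threading.Thread(target=train_thread, args=(t, weights, X, y))
               for t in range(n_threads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    w_avg = np.mean(weights, axis=0)                    # combine the per-thread models
    print("train accuracy:", np.mean(np.sign(X @ w_avg) == y))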
Example #3
    def transform(self, X):
        numerical_data = DataUtils.get_numerical_data(X)
        full_numerical_data = self.numeric_imputer.transform(numerical_data)
        # Safe remove columns which are all empty
        successfully_imputed_columns = numerical_data.columns[
            ~np.isnan(self.numeric_imputer.statistics_)]
        full_numerical_data = pd.DataFrame(
            full_numerical_data, columns=successfully_imputed_columns)
        # Drop columns not in fit self.successfully_imputed_columns
        columns_to_drop = [
            c for c in full_numerical_data.columns
            if c not in self.successfully_imputed_columns
        ]
        if len(columns_to_drop) > 0:
            full_numerical_data = full_numerical_data.drop(columns_to_drop,
                                                           axis=1,
                                                           errors='ignore')
        # Add columns in self.successfully_imputed_columns and not in current
        columns_to_add = [
            c for c in self.successfully_imputed_columns
            if c not in full_numerical_data.columns
        ]
        if len(columns_to_add) > 0:
            full_numerical_data = full_numerical_data.reindex(columns=[
                *full_numerical_data.columns.tolist(), *columns_to_add
            ],
                                                              fill_value=0)

        return full_numerical_data
Example #4
def main():
	X, y = getXandY(DataUtils('data', 'input.txt').get_data())

	X_train, X_test, y_train, y_test = train_test_split(
		X, y, test_size=0.33, random_state=42
	)

	text_clf = get_pipeline()

	text_clf.fit(X_train, y_train)

	predicted = text_clf.predict(X_test)
	accuracy = np.mean(predicted == y_test)
	print('SGD Classifier Accuracy: ' + str(accuracy))

	dump(text_clf, 'data/clf.joblib')

	parameters = {
		'vect__ngram_range': [(1, 1), (1, 2)],
		'tfidf__use_idf': (True, False),
		'clf__alpha': (1e-2, 1e-3),
		'clf__loss': ('hinge', 'log', 'squared_hinge', 'perceptron'), 
	}

	gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
	gs_clf.fit(X_train, y_train)
	gs_predicted = gs_clf.predict(X_test)
	print('GridSearch Accuracy: ' + str(np.mean(gs_predicted == y_test)))

	dump(gs_clf, 'data/gs_clf.joblib')
Example #5
    def fit(self, X):
        numerical_data = DataUtils.get_numerical_data(X)
        full_numerical_data = self.numeric_imputer.fit_transform(
            numerical_data)
        self.successfully_imputed_columns = numerical_data.columns[
            ~np.isnan(self.numeric_imputer.statistics_)]
        # Return fit object
        return self
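
# The fit() above and the matching transform() earlier rely on one scikit-learn detail:
# the wrapped numeric_imputer appears to be a SimpleImputer, which records a NaN statistic
# for columns that are entirely empty and drops those columns from its output. That is why
# fit() stores successfully_imputed_columns and transform() re-aligns its output to them.
# A small, self-contained illustration of that bookkeeping (the toy frame is an assumption):
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

train = pd.DataFrame({"a": [1.0, np.nan, 3.0],
                      "b": [np.nan, np.nan, np.nan],   # entirely empty column
                      "c": [4.0, 5.0, 6.0]})

imputer = SimpleImputer(strategy="mean")
imputed = imputer.fit_transform(train)                 # column "b" is dropped here

# statistics_ is NaN for the all-empty column, so this recovers the surviving columns.
kept = train.columns[~np.isnan(imputer.statistics_)]
print(pd.DataFrame(imputed, columns=kept))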
Example #6
def main():
	du = DataUtils()
	du.start_session()
	usertrends = []
	try:
		usertrends = du.get_usertrends()
		for ut in usertrends:
			usertrend_id = ut.usertrend_id
			ntwts = du.get_num_tweets(usertrend_id)
			print ut.usertrend_id, ntwts
	finally: du.close_session()
Example #7
    def fit(self, X):
        categorical_data = DataUtils.get_categorical_data(X)
        # Categorical mode imputer
        self.frequent_vals = pd.Series([
            categorical_data[c].value_counts().index[0]
            for c in categorical_data
        ],
                                       index=categorical_data.columns)

        # Return fit object
        return self
Example #8
def main(filename, train_size):
    w2v = load_w2v()
    dataset = DataUtils.load_dataset(filename, w2v)
    train_len = int(len(dataset) * train_size)
    test_len = len(dataset) - train_len
    train_set, test_set = random_split(dataset, [train_len, test_len])
    net_model = Siamese(batch_size=1, output_size=5, hidden_size=hidden_layer,
                        vocab_size=len(w2v.wv.vocab), embedding_length=embedded_dim, weights=w2v.wv.vectors)
    train_dataloader = DataLoader(train_set, batch_size=1, shuffle=True)
    test_dataloader = DataLoader(test_set,batch_size=1,shuffle=True)
    iterate_model(net_model, train_dataloader, test_dataloader)
Example #9
    def run_problem_two_experiment(self):

        self.frozen_lake_jumbo_env = gym.make('FrozenLakeJumbo-v0')

        policy, V, iterations, theta, avg_deltas = self.value_iteration(
            env=self.frozen_lake_jumbo_env,
            discount_factor=self.discount_factor,
            theta=self.convergence_threshold)

        score = self.evaluate_policy(self.frozen_lake_jumbo_env, policy,
                                     self.discount_factor)

        print(
            "Value Iteration converged on frozen lake problem -> \n\titerations to converge: "
            + str(iterations) + "\n\tconvergence threshold: " + str(theta) +
            "\n\tdiscount factor: " + str(self.discount_factor) +
            "\n\t100 game avg score: " + str(score) + "\n")

        DataUtils.write_convergence_diffs(
            DataUtils.get_results_directory_name() +
            "/lake-value-iter-gamma-" + str(self.discount_factor) + ".csv",
            avg_deltas)
Example #10
    def run_problem_two_experiment(self):

        self.frozen_lake_jumbo_env = gym.make('FrozenLakeJumbo-v0')

        q_table, iterations, avg_deltas = self.q_learning(
            self.frozen_lake_jumbo_env, 2000000)

        score = self.evaluate_policy(env=self.frozen_lake_jumbo_env,
                                     q_table=q_table,
                                     gamma=self.discount_factor)

        print(
            "Q-Learning converged on frozen lake problem -> \n\titerations to converge: "
            + str(iterations) + "\n\tconvergence threshold: " +
            str(self.convergence_threshold) + "\n\tdiscount factor: " +
            str(self.discount_factor) + "\n\t100 game avg score: " +
            str(score) + "\n")

        DataUtils.write_convergence_diffs(
            DataUtils.get_results_directory_name() +
            "/lake-q-learning-gamma-" + str(self.discount_factor) + ".csv",
            avg_deltas)
Example #11
    def run_problem_one_experiment(self):

        self.taxi_v2_env = gym.make('Taxi-v2')
        self.taxi_v2_env._max_episode_seconds = 999999999

        q_table, iterations, avg_deltas = self.q_learning(self.taxi_v2_env)

        score = self.evaluate_policy(env=self.taxi_v2_env,
                                     q_table=q_table,
                                     gamma=self.discount_factor)

        print(
            "Q-Learning converged on taxi problem -> \n\titerations to converge: "
            + str(iterations) + "\n\tconvergence threshold: " +
            str(self.convergence_threshold) + "\n\tdiscount factor: " +
            str(self.discount_factor) + "\n\t100 game avg score: " +
            str(score) + "\n")

        DataUtils.write_convergence_diffs(
            DataUtils.get_results_directory_name() +
            "/taxi-q-learning-gamma-" + str(self.discount_factor) + ".csv",
            avg_deltas)
Example #12
def classify_custom(text_samples, author_name, test):
	X, y = getXandY(DataUtils('data', 'input.txt').get_data())
	y = list(y)
	X.extend(text_samples)
	y.extend([author_name for sample in text_samples])

	X_train, y_train = shuffle(X, y)

	clf = get_pipeline()
	clf.fit(X_train, y_train)

	predicted = clf.predict_proba([test])[0]
	return list(zip(list(clf.classes_), list(predicted)))
Example #13
def main():
    data, labels, idx2char, unique_chars, char2idx = DataUtils.character_encoding(
        './Dataset/lyrics15LIN.csv', 'Country', max_vec_len, step)
    num_of_chars = len(unique_chars)
    model = Sequential()
    model.add(LSTM(128, input_shape=(max_vec_len, num_of_chars)))
    model.add(Dense(num_of_chars))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizers.RMSprop(lr=0.001))
    model.fit(data, labels, batch_size=128, epochs=epochs)
    model.save('./Dataset/15k-30epoch')
    predict(model, 'country road take me', char2idx, idx2char, unique_chars)
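
# DataUtils.character_encoding is not shown; judging by its return values and the model's
# input shape, it presumably slides a window of max_vec_len characters over the lyrics and
# one-hot encodes each window together with the character that follows it. A self-contained
# sketch of that kind of preprocessing (the function below takes raw text instead of the CSV
# path and genre the example passes, and its body is an assumption):
import numpy as np


def character_encoding(text, max_vec_len, step):
    unique_chars = sorted(set(text))
    char2idx = {c: i for i, c in enumerate(unique_chars)}
    idx2char = {i: c for c, i in char2idx.items()}
    windows, next_chars = [], []
    for i in range(0, len(text) - max_vec_len, step):
        windows.append(text[i:i + max_vec_len])
        next_chars.append(text[i + max_vec_len])
    data = np.zeros((len(windows), max_vec_len, len(unique_chars)), dtype=bool)
    labels = np.zeros((len(windows), len(unique_chars)), dtype=bool)
    for n, window in enumerate(windows):
        for t, ch in enumerate(window):
            data[n, t, char2idx[ch]] = 1
        labels[n, char2idx[next_chars[n]]] = 1
    return data, labels, idx2char, unique_chars, char2idx


data, labels, idx2char, unique_chars, char2idx = character_encoding(
    "country roads take me home to the place i belong", max_vec_len=10, step=3)
print(data.shape, labels.shape)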
Example #14
    def run_problem_one_experiment(self):

        self.taxi_v2_env = gym.make('Taxi-v2')
        self.taxi_v2_env._max_episode_seconds = 999999999

        policy, V, iterations, theta, avg_deltas = self.value_iteration(
            env=self.taxi_v2_env.env,
            discount_factor=self.discount_factor,
            theta=self.convergence_threshold)

        score = self.evaluate_policy(self.taxi_v2_env, policy,
                                     self.discount_factor)

        print(
            "Value Iteration converged on taxi problem -> \n\titerations to converge: "
            + str(iterations) + "\n\tconvergence threshold: " + str(theta) +
            "\n\tdiscount factor: " + str(self.discount_factor) +
            "\n\t100 game avg score: " + str(score) + "\n")

        DataUtils.write_convergence_diffs(
            DataUtils.get_results_directory_name() +
            "/taxi-value-iter-gamma-" + str(self.discount_factor) + ".csv",
            avg_deltas)
Example #15
def predict(model, seed, char2idx, idx2char, unique_chars):
    pattern = DataUtils.translator(unique_chars, seed, char2idx)
    res = '' + seed
    for i in range(word_count):
        x = np.reshape(pattern, (1, len(pattern), len(unique_chars)))
        prediction = model.predict(x, verbose=0)
        index = np.argmax(prediction)
        result = idx2char[index]
        res += result
        seq = np.zeros((1, len(unique_chars)), dtype=bool)
        seq[0, index] = 1
        pattern = np.concatenate((pattern, seq))
        pattern = pattern[1:]
    print(res)
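
# predict() above treats the value returned by DataUtils.translator as a
# (len(seed), vocabulary_size) boolean one-hot matrix, since it keeps appending one-hot rows
# to it and trimming the front. A hypothetical sketch of such a helper:
import numpy as np


def translator(unique_chars, seed, char2idx):
    pattern = np.zeros((len(seed), len(unique_chars)), dtype=bool)
    for i, ch in enumerate(seed):
        pattern[i, char2idx[ch]] = 1                    # one row per seed character
    return pattern


chars = sorted(set("country road take me"))
char2idx = {c: i for i, c in enumerate(chars)}
print(translator(chars, "road", char2idx).shape)        # (4, vocabulary size)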
Example #16
    def fit(self, X, y=None):
        categorical_data = DataUtils.get_categorical_data(X)
        # Drop categorical with a lot of categories
        cat_sizes = pd.Series([
            categorical_data[c].value_counts().size for c in categorical_data
        ],
                              index=categorical_data.columns)
        sparse_categories = \
            cat_sizes.loc[cat_sizes >
                          self.max_categories_in_single_variable]
        skewed_categories = \
            cat_sizes.loc[cat_sizes <
                          self.min_categories_in_single_variable]
        self.categorical_variables_to_remove = pd.concat(
            [sparse_categories, skewed_categories])

        # Return fit object
        return self
Example #17
    def train(self, xtrain, ytrain):
        """ Calculates the average gradient for a given batch
            and updates the weights with these averages after the
            batch has been processed. """

        self.weight = np.zeros(xtrain.shape[1])
        learning_rate = self.learning_rate
        start = datetime.now()  # runtime

        losses, accuracies = [], []
        for epoch in range(self.epoch_count):
            learning_rate /= np.sqrt(epoch + 1)  # adaptive learning rate
            xtrain, ytrain = DataUtils.shuffle_data(xtrain, ytrain)

            # Loops through the batches
            for i in range(int(len(ytrain) / self.batch_size)):
                batch_start = i * self.batch_size
                batch_end = (i + 1) * self.batch_size

                # Sums up the gradients of the incorrectly classified samples
                # or the samples that lie within the margin
                grad = 0
                for sample, label in zip(xtrain[batch_start:batch_end],
                                         ytrain[batch_start:batch_end]):

                    prediction = self.predict(sample)
                    if label * prediction < 1:  # either within margin or incorrectly classified
                        grad += self.hinge_gradient(self.weight, sample, label,
                                                    self.regularization)

                # Weights are updated with average gradients after batch is completed
                self.weight -= learning_rate * grad / self.batch_size

            # Losses & accuracies after each epoch; we always need the latest value
            if self.collect_data or epoch == self.epoch_count - 1:
                losses.append(self.loss_function(xtrain, ytrain, self.weight))
                accuracies.append(
                    self.accuracy_function(xtrain, ytrain, self.weight))

        self.runtime = datetime.now() - start
        print(f"Finished in {self.runtime}")

        return np.array(losses), np.array(accuracies)
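
# Both training loops above call predict, hinge_gradient, loss_function and accuracy_function,
# none of which appear in these snippets (in the examples they are methods on the classifier).
# One plausible, self-contained set of definitions, assuming labels in {-1, +1} and an
# L2-regularized hinge loss:
import numpy as np


def predict(x, w):
    # Raw margin score; np.sign() of this gives the predicted class.
    return np.dot(w, x)


def hinge_gradient(w, x, y, regularization):
    # Sub-gradient of max(0, 1 - y * w.x) + regularization * ||w||^2
    # for a sample that violates the margin.
    return -y * x + 2 * regularization * w


def loss_function(X, y, w, regularization=0.01):
    margins = np.maximum(0, 1 - y * (X @ w))
    return margins.mean() + regularization * np.dot(w, w)


def accuracy_function(X, y, w):
    return np.mean(np.sign(X @ w) == y)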
Example #18
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from DataUtils import DataUtils
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cluster import KMeans
from classify import get_pipeline
from sklearn.pipeline import Pipeline

np.random.seed(5)

getXandY = lambda d: (list(map(lambda x: x[0], d[:, :-1])), d[:, -1])
X, y = getXandY(np.array(DataUtils('data', 'input.txt').get_data()))

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
])
X_ = pipeline.fit_transform(X).todense()

fig = plt.figure()
ax = Axes3D(fig)

pca = PCA(n_components=3).fit(X_)
data3D = pca.transform(X_)

from joblib import load
Example #19
    def __init__(self, data_set_file_name):

        self.data_set_file_name = data_set_file_name
        self.data_set = DataUtils.load_data_to_nd_array(data_set_file_name)
Example #20
def add_all_sample_tweets():
	sample_size = 100
	du = DataUtils()
	du.start_session()
	try:
		usertrends = du.get_usertrends()
		usertrend_ids = map(lambda u: u.usertrend_id, usertrends)
		for uid in usertrend_ids:
			if du.has_usertrend_for_uttl(uid):
				continue
			num_tweets = du.get_num_tweets(uid)
			if num_tweets < 1000:
				continue
			sample_tweet_ids = du.get_sample_tweet_ids(uid, sample_size)
			tweets = du.get_tweets(sample_tweet_ids)
			uttls = map(lambda tweet: UserTrendTweetLabel(
										usertrend_id=uid,
										tweet_id=tweet.id,
										text=tweet.text),
						tweets)
			du.add_usertrendtweetlabels(uttls)
	finally:
		du.close_session()
Example #21
# define paths to FBA dataset and FBA annotations
# NEED TO EDIT THE PATH HERE IF USING ON A DIFFERENT COMPUTER
if sys.version_info[0] < 3:
    PATH_FBA_ANNO = '/Data/FBA2013/'
    PATH_FBA_AUDIO = '/Data/FBA2013data/'
else:
    PATH_FBA_ANNO = '/home/apati/FBA2013/'
    PATH_FBA_AUDIO = '/home/apati/FBA2013/'

# create data holder
perf_assessment_data = []
req_audio = False
# instantiate the data utils object for different instruments and create the data
INSTRUMENT = 'Alto Saxophone'
utils = DataUtils(PATH_FBA_ANNO, PATH_FBA_AUDIO, BAND, INSTRUMENT)
for year in YEAR:
    perf_assessment_data += utils.create_data(year, SEGMENT, audio=req_audio)

INSTRUMENT = 'Bb Clarinet'
utils = DataUtils(PATH_FBA_ANNO, PATH_FBA_AUDIO, BAND, INSTRUMENT)
for year in YEAR:
    perf_assessment_data += utils.create_data(year, SEGMENT, audio=req_audio)

INSTRUMENT = 'Flute'
utils = DataUtils(PATH_FBA_ANNO, PATH_FBA_AUDIO, BAND, INSTRUMENT)
for year in YEAR:
    perf_assessment_data += utils.create_data(year, SEGMENT, audio=req_audio)

print(len(perf_assessment_data))
Example #22
    def __init__(self, data_set_file_name):

        self.data_set_file_name = data_set_file_name
        self.data_set = DataUtils.load_data_to_nd_array(data_set_file_name)
        self.data_set_feature_labels = DataUtils.load_feature_labels_from_file(data_set_file_name)
Example #23
def main():
    data_utils = DataUtils()

    # load the data from CSV file to pandas DataFrame (start from the project root directory)
    housing_data_frame = data_utils.load_csv_to_pandas_df(
        os.path.join("dataset", "housing", "housing.csv"))

    # Visualization of the data - geographically (by lat/long)
    DataVisualizationUtils.scatter_plot(
        housing_data_frame,
        "median_house_value",
        x_name="longitude",
        y_name="latitude",
        circle_radius=housing_data_frame["population"] / 100,
        label="population size (Expressed by the circle radius)")

    # cross correlation matrix
    DataVisualizationUtils.cross_correlation_matrix(housing_data_frame)

    # scatter plot matrix
    attribute_scatter_plot_matrix = [
        "median_house_value", "median_income", "total_rooms",
        "housing_median_age"
    ]
    DataVisualizationUtils.scatter_plot_matrix(housing_data_frame,
                                               attribute_scatter_plot_matrix)

    # Create new, more meaningful columns from the data that help with prediction
    # TODO: NEED TO MAKE IT AN ATTRIBUTE TRANSFORM
    housing_data_frame["rooms_per_household"] = housing_data_frame[
        "total_rooms"] / housing_data_frame["households"]
    housing_data_frame["bedrooms_per_rooms"] = housing_data_frame[
        "total_bedrooms"] / housing_data_frame["total_rooms"]
    housing_data_frame["population_per_household"] = housing_data_frame[
        "population"] / housing_data_frame["households"]

    # Re-examining the correlation matrix shows that the new features are more
    # strongly correlated with house prices.
    DataVisualizationUtils.cross_correlation_vector(housing_data_frame,
                                                    "median_house_value")

    # split to train set and test set using stratified sampling
    test_data_frame, train_data_frame = data_utils.split_test_train_set_by_stratified_sampling(
        housing_data_frame, "median_income", [0., 1.5, 3., 4.5, 6., np.inf])

    # create label data frame
    train_labels_data_frame = data_utils.copy_and_drop_column(
        train_data_frame, "median_house_value")
    test_labels_data_frame = data_utils.copy_and_drop_column(
        test_data_frame, "median_house_value")

    # Prepare the data for the ML algorithm
    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scalar', StandardScaler()),
    ])

    numerical_attribute = list(test_data_frame.columns)
    numerical_attribute.remove("ocean_proximity")
    categorical_attribute = ["ocean_proximity"]

    full_pipeline = ColumnTransformer([
        ('numerical', numerical_pipeline, numerical_attribute),
        ('categorical', OneHotEncoder(), categorical_attribute)
    ])

    housing_prepared = full_pipeline.fit_transform(housing_data_frame)
    housing_train_data_prepared = housing_prepared.take(train_data_frame.index,
                                                        axis=0)
    housing_test_data_prepared = housing_prepared.take(test_data_frame.index,
                                                       axis=0)

    # training the algorithm
    linear_regression = LinearRegression()
    linear_regression.fit(housing_train_data_prepared, train_labels_data_frame)
    prediction_result_linear_regression = linear_regression.predict(
        housing_test_data_prepared)
    mean_square_error_linear_regression = np.sqrt(
        mean_squared_error(prediction_result_linear_regression,
                           test_labels_data_frame))
    DataVisualizationUtils.print_with_title(
        "Linear Regression RMSE", mean_square_error_linear_regression)

    # Try a decision tree model in case the data contains a lot of non-linear relationships between the features
    tree_regression = DecisionTreeRegressor()
    tree_regression.fit(housing_train_data_prepared, train_labels_data_frame)
    prediction_result_decision_tree = tree_regression.predict(
        housing_test_data_prepared)
    mean_square_error_decision_tree = np.sqrt(
        mean_squared_error(prediction_result_decision_tree,
                           test_labels_data_frame))
    DataVisualizationUtils.print_with_title("Decision Tree MSE",
                                            mean_square_error_decision_tree)

    # Trying cross validation
    scores = cross_val_score(DecisionTreeRegressor(),
                             housing_train_data_prepared,
                             train_labels_data_frame,
                             scoring="neg_mean_squared_error",
                             cv=10)
    scores = np.sqrt(
        -scores
    )  # sklearn scoring uses a utility convention (higher is better), so the MSE values come back negated.
    # print(scores);
    DataVisualizationUtils.print_with_title(
        "Decision Tree Cross Validation RMSE", scores.mean())
    # print(scores.std()); # 2566.8761488982286

    # Trying random forest
    random_forest = RandomForestRegressor()
    random_forest.fit(housing_train_data_prepared,
                      train_labels_data_frame.values.ravel())
    prediction_result_random_forest = random_forest.predict(
        housing_test_data_prepared)
    mean_square_error_random_forest = np.sqrt(
        mean_squared_error(prediction_result_random_forest,
                           test_labels_data_frame))
    DataVisualizationUtils.print_with_title("Rnadom Forest MSE",
                                            mean_square_error_random_forest)

    # Trying grid search
    param_grid_search = [{
        'n_estimators': [3, 10, 30],
        'max_features': [2, 4, 6, 8]
    }, {
        'bootstrap': [False],
        'n_estimators': [3, 10],
        'max_features': [2, 3, 4]
    }]

    random_forest_grid_search = RandomForestRegressor()
    grid_search_results = GridSearchCV(random_forest_grid_search,
                                       param_grid_search,
                                       cv=5,
                                       scoring="neg_mean_squared_error",
                                       return_train_score=True)
    grid_search_results.fit(housing_train_data_prepared,
                            train_labels_data_frame.values.ravel())
    cv_results = grid_search_results.cv_results_
    DataVisualizationUtils.print_with_title(
        "Random Forest with Grid Search RMSE", "")
    for mean_score, param in zip(cv_results['mean_test_score'],
                                 cv_results['params']):
        print(np.sqrt(-mean_score), param)
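
# The TODO earlier about making the ratio columns "an attribute transform" could be answered
# with a small custom transformer placed at the front of the numerical pipeline. A sketch,
# assuming the same column names as above (RatioFeatureAdder is a made-up name):
from sklearn.base import BaseEstimator, TransformerMixin


class RatioFeatureAdder(BaseEstimator, TransformerMixin):
    """Add the rooms_per_household, bedrooms_per_rooms and population_per_household
    ratio columns inside a pipeline instead of mutating the DataFrame by hand."""

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X["rooms_per_household"] = X["total_rooms"] / X["households"]
        X["bedrooms_per_rooms"] = X["total_bedrooms"] / X["total_rooms"]
        X["population_per_household"] = X["population"] / X["households"]
        return X

# e.g. Pipeline([('ratios', RatioFeatureAdder()), ('imputer', SimpleImputer(strategy="median")),
#                ('std_scalar', StandardScaler())])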
Example #24
def run_q_learning_on_problem_one():
    q_learning_exp = QLearningExperiment()
    q_learning_exp.run_problem_one_experiment()


def run_q_learning_on_problem_two():
    q_learning_exp = QLearningExperiment()
    q_learning_exp.run_problem_two_experiment()


if __name__ == "__main__":

    print("Application running...\n")

    # Problem one is the small state space problem and problem two is the large state space problem.

    # Make sure that the 'Results' directory is present to save output files in.
    if not os.path.isdir(DataUtils.get_results_directory_name()):
        os.mkdir(DataUtils.get_results_directory_name())

    # Run value iteration on the first problem.
    run_value_iteration_on_problem_one()

    # Run policy iteration on the first problem.
    #run_policy_iteration_on_problem_one()

    # Run q learning on the first problem.
    #run_q_learning_on_problem_one()

    # Register the custom jumbo lake environment with open ai gym.
    gym.envs.registration.register(
        id='FrozenLakeJumbo-v0',
        entry_point='frozen_lake_jumbo:FrozenLakeJumboEnv',
Example #25
class DataLoader(object):
    def __init__(self, usertrend_id=None):
        DB_NAME = "sqlite:///../data/db.sqlite"
        self.du = DataUtils(DB_NAME)
        #self.utid = usertrend_id

    def get_sample_tweets_from_usertrendid(self, usertrend_id, sample_size=1000):
        tweet_ids = self.du.get_sample_tweet_ids(usertrend_id, sample_size)
        tweets = self.du.get_tweets_from_tweetids(tweet_ids)
        return tweets

    def label_generator(self, usertrend_id, sample_size=1000, limit=150):

        du = self.du
        du.start_session()

        if du.has_usertrend_for_uttl(usertrend_id):
            print "{} already has tweet labels".format(usertrend_id)
            return

        usertrend = du.get_usertrend(usertrend_id)
        user_id = usertrend.user_id
        user = du.get_user(user_id)
        bio = user.bio
        headlines = usertrend.related_queries

        #user_id = remove_non_ascii(user_id)
        #bio = remove_non_ascii(bio)
        #headlines = remove_non_ascii(headlines)

        label_question = 'Would the following tweets offend to the mentioned user: {}\n\n \
						  For context, here is the user\'s bio:\n\n \
						  {}\n\n \
						  Here are the headlines surrounding the user during the time of the tweet:\n \
						  {}\n\n'.decode(sys.stdin.encoding).format(user_id, bio, headlines)

        print label_question

        tweet_ids = du.get_sample_tweet_ids(usertrend_id, sample_size)

        if len(tweet_ids) == 0:
            print "{} has less than 1000 tweets".format(usertrend_id)
            return

        tweets = du.get_tweets_from_tweetids(tweet_ids)
        tweet_iter = iter(tweets)

        label_map = {'y': 1, 'n': 0, 'u': 2}
        tweet_labels = []
        while (len(tweet_labels) < limit and tweet_iter):
            tweet = tweet_iter.next()
            text = tweet.text
            text = remove_non_ascii(text)
            resp = raw_input(text + '\n')
            print
            while resp not in label_map:
                print "Please enter 'y': yes, 'n': no, 'u': unsure"
                resp = raw_input(text + '\n')
                print
            label = label_map[resp]
            if label == 0 or label == 1:
                tweet_label = UserTrendTweetLabel(usertrend_id=usertrend_id,
                                                  tweet_id=tweet.id,
                                                  text=tweet.text,
                                                  label=label)
                tweet_labels.append(tweet_label)
            print len(tweet_labels)

        du.add_usertrendtweetlabels(tweet_labels)
        du.close_session()

    def load_data(self, usertrend_id):
        du = self.du
        du.start_session()
        utid = usertrend_id
        tweet_texts = []
        try:
            tweet_texts = du.get_tweet_texts_from_usertrendid(utid)
        finally:
            du.close_session()
        return tweet_texts
Example #26
    def __init__(self, usertrend_id=None):
        DB_NAME = "sqlite:///../data/db.sqlite"
        self.du = DataUtils(DB_NAME)
Example #27
    def anomaly(y_train, y_test, anomaly_label):
        y_train = DataUtils.anomaly(y_train, anomaly_label)
        y_test = DataUtils.anomaly(y_test, anomaly_label)
        return y_train, y_test
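
# DataUtils.anomaly is not shown; given the name and how it is applied to both label vectors,
# it presumably collapses multi-class labels into a binary anomalous-vs-normal encoding.
# A hypothetical sketch of that idea:
import numpy as np


def anomaly(labels, anomaly_label):
    # 1 where the label matches the anomalous class, 0 otherwise (an assumption).
    return (np.asarray(labels) == anomaly_label).astype(int)


print(anomaly([0, 3, 3, 1], anomaly_label=3))           # -> [0 1 1 0]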
Example #28
    def transform(self, X):
        categorical_data = DataUtils.get_categorical_data(X)
        return categorical_data.fillna(self.null_value)
Example #29
    def transform(self, X):
        categorical_data = DataUtils.get_categorical_data(X)
        return categorical_data.fillna(self.frequent_vals)
Example #30
# define paths to FBA dataset and FBA annotations
# NEED TO EDIT THE PATH HERE IF USING ON A DIFFERENT COMPUTER
PATH_FBA_ANNO = '/media/SSD/FBA/MIG-FbaData/'
PATH_FBA_AUDIO = ''  # not including raw audio
PATH_FBA_MIDI = "/media/SSD/FBA/fall19/data/midi/"
PATH_FBA_DILL = "/media/SSD/FBA/save_dill/"

# create data holder
perf_assessment_data = []
req_audio = False
req_rating = True
# instantiate the data utils object for different instruments and create the data
for band in BAND:
    perf_assessment_data = []
    for instrument in INSTRUMENT:
        utils = DataUtils(PATH_FBA_ANNO, PATH_FBA_AUDIO, band, instrument)
        for year in YEAR:
            perf_assessment_data += utils.create_data(year,
                                                      SEGMENT,
                                                      audio=req_audio,
                                                      rating=req_rating)

    file_name = band + '_' + str(SEGMENT) + '_pc_' + str(len(YEAR))
    print("Saving to " + file_name)
    with open(PATH_FBA_DILL + file_name + '.dill', 'wb') as f:
        dill.dump(perf_assessment_data, f)

# create midi data (piano roll)
midi_score = {}
unit = "res12"
for band in BAND: