def load_from_raw_json(self, single_ticker=None, force_update=False, save=True):
    """Load cached stock dataframes from json files in the cache folder.

    :param single_ticker: str, ticker of the stock you need; may be given
        either as a bare ticker ("aapl") or as a filename ("aapl.json").
        When provided, only that ticker is loaded.
    :param force_update: bool, force a fetch from the remote source
    :param save: bool, whether to save the fetched data to a json file
    :return: self (fluent interface)
    """
    if isinstance(single_ticker, str):
        self._single_ticker_call = True
        # Accept a filename like "aapl.json" and strip the extension.
        # BUG FIX: the original tested endswith("json") but removed five
        # characters, which corrupted any ticker that merely ends in
        # "json" without the dot.
        if single_ticker.endswith(".json"):
            single_ticker = single_ticker[:-5]
        _, df = get_data(ticker=single_ticker,
                         force_update=force_update, save=save)
        # Keep the dataframe list free of None entries, consistent with
        # the multi-ticker path below (previously this could be [None]).
        self._dataframe = [] if df is None else [df]
        return self
    # No ticker given: load every cached "*.json" file.
    json_path = os.path.join(
        os.getcwd(), DataPreprocessor._cache_folder_name, "*.json")
    self._dataframe = []
    for path in glob.glob(pathname=json_path, recursive=False):
        # Filename (minus ".json") is the ticker symbol.
        ticker = os.path.split(path)[-1][:-5].lower()
        _, df = get_data(ticker=ticker, force_update=force_update, save=save)
        if df is None:
            continue
        self._dataframe.append(df)
    return self
def main():
    """Crawl the corpus and report how long feature computation took.

    BUG FIX: the original used Python 2 print statements, which are a
    syntax error under Python 3; converted to the print() function.
    """
    import time
    from data_extractor import get_data, corpus

    data = get_data(1)
    start = time.time()
    run_crawler(corpus, data)
    end = time.time()
    elapsed_minutes = (end - start) / 60.0
    print()
    print("Time Taken to Compute all Features : " + str(elapsed_minutes) + " minutes")
def create_dataset(start_year=2007, end_year=2018):
    """ Creates a dataset with the following properties
    x is a data matrix of size nxm where
        m is the number of features: feature_years * n_features
        n is the number of data vectors:
            n_municipalities * (n_years-feature_years+1)
    y is a vector of length n with booleans: True when the reported number
    of crimes is at or below the median across all observations.
    """
    years = [str(x) for x in range(start_year, end_year + 1)]
    # Getting municipality data from start_year until end_year
    data = get_data(aarstal=years, noegletal=FEATURES)
    data = _fix_nans(data)
    data_target = get_data(aarstal=years, noegletal=[TARGET_FEATURE])
    data_target = _fix_nans(data_target).ravel()
    # Median-ish boundary: middle element of the sorted targets.
    ordered = data_target.copy()
    ordered.sort()
    N = len(data_target)
    boundary = ordered[N // 2]
    # Create model data
    X = []
    y = []
    for i in range(len(data)):
        for j in range(data.shape[1] - 1):
            X.append(data[i, j].ravel())
    for i in range(len(X)):
        y.append(data_target[i] <= boundary)
    # Standardize
    X = np.array(X)
    # BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin bool is the documented replacement.
    y = np.array(y, dtype=bool)
    mean_X = X.mean(axis=0)
    std_X = X.std(axis=0) + EPS
    X = (X - mean_X) / std_X
    return X, y
def load_from_csv(self, csv_path: str):
    """Populate self._dataframe from a csv of ticker symbols.

    :param csv_path: path to a csv whose first column holds ticker strings
    :return: self (fluent interface)
    """
    self._dataframe = []
    with open(csv_path, "r") as fp:
        for row in csv.reader(fp):
            _, frame = get_data(ticker=row[0], force_update=False, save=True)
            # Tickers that could not be fetched yield None; skip them.
            if frame is not None:
                self._dataframe.append(frame)
    return self
def create_data_with_classes():
    '''
    Transformation of dataset adding new column with string labels for
    categories of each record
    Output: Data object; its y is a collection of string risk labels
    '''
    # Features to use for the models: every feature except the target.
    feature_idcs = np.arange(len(FEATURES))
    target_feature_idx = FEATURES.index(TARGET_FEATURE)
    feature_idcs = np.delete(feature_idcs, target_feature_idx)
    # Getting municipality data from 2007 until 2018
    data = get_data(aarstal=[str(x) for x in range(2007, 2018 + 1)])
    data = fix_nans(data)
    # Create model data.
    # DEAD CODE REMOVED: X and y were first bound to np.array([]) and then
    # immediately rebound to plain lists; the array initialisers did nothing.
    X = []
    y = []
    for i in range(len(data)):
        for j in range(len(data[0])):
            X.append(data[i, j, feature_idcs].ravel())
            y.append(data[i, j, target_feature_idx].ravel())
    # Standardize
    X = np.array(X)
    y = np.array(y)
    mean_X = X.mean(axis=0)
    std_X = X.std(axis=0) + EPS
    X = (X - mean_X) / std_X
    mean_y = y.mean()
    print('Mean of y: {}'.format(mean_y))
    std_y = y.std() + EPS
    print('Standard deviation of y: {}\n'.format(std_y))
    y = (y - mean_y) / std_y
    # Transform intervals into labels for categorization
    class_y = np.array([to_risk_cat(target) for target in y])
    classes, count = np.unique(class_y, return_counts=True)
    print('\nClass count after transformation:')
    for i in range(len(classes)):
        print('{0} risk: {1}'.format(classes[i], count[i]))
    print('\n')
    return Data(X, class_y, mean_X, mean_y, std_X, std_y)
def create_dataset(feature_years=4, predict_years=5, start_year=2007, end_year=2018):
    """Build a standardized sliding-window dataset of municipality features.

    Each row of x concatenates feature_years consecutive years of
    features (feature_years * n_features columns); one row exists per
    municipality per valid window start, giving
    n_municipalities * (n_years - feature_years + 1) rows.
    y holds the reported number of crimes predict_years after the end of
    each window. Both are standardized; the means/stds travel in the
    returned Data object.
    """
    crime_idx = FEATURES.index(TARGET_FEATURE)
    kept_features = np.arange(len(FEATURES))
    # Comment out this line to include previous crime rates in features
    # kept_features = kept_features[kept_features != crime_idx]
    raw = get_data(aarstal=[str(yr) for yr in range(start_year, end_year + 1)])
    log("Estimating nans")
    raw = fix_nans(raw)
    # Drop the last predict_years from features and the first
    # predict_years from targets so index t in both refers to a window
    # ending predict_years before its target year.
    features = raw[:, :-predict_years, kept_features]
    targets = raw[:, predict_years:, crime_idx]
    windows = []
    labels = []
    n_starts = features.shape[1] - feature_years + 1
    # Outer loop: window start year; inner loop: municipality.
    for start in range(n_starts):
        for muni in range(features.shape[0]):
            windows.append(features[muni, start:start + feature_years, :].ravel())
            labels.append(targets[muni, start + feature_years - 1])
    x = np.array(windows)
    mean_x = x.mean(axis=0)
    std_x = x.std(axis=0) + EPS
    x = (x - mean_x) / std_x
    y = np.array(labels)
    mean_y = y.mean()
    std_y = y.std() + EPS
    y = (y - mean_y) / std_y
    return Data(x, y, mean_x, mean_y, std_x, std_y)
# NOTE(review): 'logging' is used below but not imported in this chunk —
# presumably imported earlier in the file; confirm.
import data_extractor
import defines
import naive_bayes

# Set logger: everything at DEBUG and above goes to output.log.
logging.basicConfig(filename='output.log',level=logging.DEBUG)
#logging.info('This is an info log')
#logging.warning('This is a warning log')
#logging.error('This is an error log')

#*******************************************************************************
# DATA EXTRACTION
# Load the train and test corpora from the csv paths configured in defines.
TRAIN_DATA = data_extractor.get_data(defines.DATA_TRAIN_CSV_FILE)
TEST_DATA = data_extractor.get_data(defines.DATA_TEST_CSV_FILE)

#*******************************************************************************
# STOP WORDS FILTER
# NOTE(review): preprocessing step is currently disabled (commented out).
#logging.info('Prepare Data')
#nb_lib.nb_lib_prepare(TRAIN_DATA)

#*******************************************************************************
# FEATURE SELECTION
# NOTE(review): only the log line runs; the selection call is disabled.
logging.info('Feature selection')
#feature_select.get_selected_features(TRAIN_DATA)

#*******************************************************************************
# DERIVATION OF NAIVE BAYES CLASSIFIER
logging.info('Naive Bayes Classifier')
all_ops.update(set(op)) d = dict() for op in all_ops: d[op] = len(d) return d def transform(ops, tr): ret = [] for op in ops: ret.append(list(map(lambda x: tr[x], op))) return ret if True: train_data = get_nice_data(get_data('reviews.json')) train_data = list(map(lambda x: np.array(x), train_data)) scores = [] for train_idx, test_idx in KFold(len(train_data[0]), n_folds=7, \ shuffle=True): X_train = train_data[0][train_idx] Y_train = train_data[1][train_idx] X_test, Y_test = Solution._remove_differencies((train_data[0][test_idx],\ train_data[1][test_idx]), True) sol = Solution(True) sol.train((X_train, Y_train)) # sometimes it says "AttributeError: '_ConstantPredictor'
import csv

from data_extractor import get_data

# get sp500 data, store json file in work dir, skip if json file already exists
if __name__ == "__main__":
    # Collect the unique ticker symbols from the first csv column.
    tickers = set()
    with open("pop_stk.csv", 'r') as csv_file:
        for row in csv.reader(csv_file):
            tickers.add(row[0])
    # with open("index_list.csv", 'r') as csv_file:
    #     csv_reader = csv.reader(csv_file)
    #     for line in csv_reader:
    #         tickers.add(line[0])
    # Fetch (or reuse cached) data for each ticker.
    for symbol in tickers:
        get_data(ticker=symbol, force_update=False, save=True)
def test_get_data(self):
    """A forced remote fetch for AAPL yields a ticker and non-empty data."""
    ticker, df = get_data(ticker="AAPL", force_update=True, save=False)
    self.assertIsNotNone(df)
    self.assertIsNotNone(ticker)
    self.assertNotEqual(len(df), 0)
def test_cache_write_and_read(self):
    """A forced fetch written to cache must agree with the cached read-back."""
    fresh_ticker, fresh_df = get_data(ticker="AAPL", force_update=True, save=True)
    cached_ticker, cached_df = get_data(ticker="AAPL", force_update=False, save=False)
    self.assertEqual(fresh_ticker, cached_ticker)
    self.assertEqual(len(fresh_df), len(cached_df))