def loadData(dataPath):
    """
    Loads the Delhi Climate Dataset as a dataframe. If either of the
    dataset's CSV files is not present at the path, then the dataset is
    downloaded from Kaggle. Each of the two files is sorted by date
    through DatasetUtility.sortByDate, and the rows of the train file
    are placed before the rows of the test file

    :param dataPath: filepath of where to download the dataset (or where
    the dataset is located if the dataset is already downloaded)
    :return: complete dataframe of the loaded dataset
    """
    trainPath = os.path.join(dataPath, 'DailyDelhiClimateTrain.csv')
    testPath = os.path.join(dataPath, 'DailyDelhiClimateTest.csv')

    # Check for the CSV files themselves instead of only the directory:
    # an existing but empty (or partially populated) directory must still
    # trigger the download, otherwise reading the files below would fail.
    datasetPresent = os.path.isfile(trainPath) and os.path.isfile(testPath)
    if not datasetPresent:
        api = KaggleApi()
        api.authenticate()
        api.dataset_download_files(
            dataset='sumanthvrao/daily-climate-time-series-data',
            path=dataPath,
            quiet=True,
            unzip=True
        )

    trainDf = DatasetUtility.sortByDate(pd.read_csv(trainPath, header='infer'))
    testDf = DatasetUtility.sortByDate(pd.read_csv(testPath, header='infer'))

    # ignore_index=True gives the combined dataframe a fresh 0..n-1 index
    return pd.concat([trainDf, testDf], axis=0, ignore_index=True)
def loadData(dataPath):
    """
    Returns the Retail Sales Dataset as a dataframe, passed through
    DatasetUtility.sortByDate. The file 'mock_kaggle.csv' is downloaded
    from Kaggle when it does not already exist inside the given path.
    The columns 'data', 'venda', 'estoque' and 'preco' are renamed to
    'date', 'sale', 'stock' and 'price' respectively

    :param dataPath: filepath of where the dataset is located, or where
    it must be downloaded to if it is not already there
    :return: complete dataframe of the loaded dataset
    """
    csvName = 'mock_kaggle.csv'
    csvPath = os.path.join(dataPath, csvName)

    alreadyDownloaded = os.path.isfile(csvPath)
    if not alreadyDownloaded:
        kaggle = KaggleApi()
        kaggle.authenticate()
        kaggle.dataset_download_file(
            dataset='tevecsystems/retail-sales-forecasting',
            file_name=csvName,
            path=dataPath
        )

    # Mapping of the column names found in the file to the names used
    # by the rest of the code
    renamedColumns = {
        'data': 'date',
        'venda': 'sale',
        'estoque': 'stock',
        'preco': 'price'
    }
    rawDf = pd.read_csv(csvPath, header='infer')
    return DatasetUtility.sortByDate(rawDf.rename(columns=renamedColumns))
def loadData(filepath):
    """
    Reads the Apple Stock Data into a Pandas Dataframe and passes it
    through DatasetUtility.sortByDate. The data file lives in the
    'datasets/' directory of this repository under the name
    'apple-stock.csv'

    :param filepath: path where the data file is located
    :return: complete dataframe of the loaded dataset
    """
    # The first 14 lines of the file are skipped while parsing
    stockDf = pd.read_csv(filepath, skiprows=14)
    return DatasetUtility.sortByDate(stockDf)
def test_sortByDate(filename, dateFormat):
    """
    Checks the sortByDate static method of DatasetUtility against a
    test data file

    :param filename: filename of the file containing the test data
    :param dateFormat: format of the date present in the dataset
    """
    originalDf = pd.read_csv(os.path.join(testDataDir, filename))

    # Run the function under test before the original dataframe is
    # modified below
    sortedDf = DatasetUtility.sortByDate(originalDf, dateFormat=dateFormat)

    # The date column is the first column whose lowercased name is 'date'
    dateColumn = next(
        name for name in list(originalDf.columns) if name.lower() == 'date'
    )

    def parseDate(dateString):
        return datetime.datetime.strptime(dateString, dateFormat)

    # Turn the original date strings into datetime values so that rows
    # can be matched against the rows of the sorted dataframe
    originalDf[dateColumn] = originalDf[dateColumn].map(parseDate)

    # Dates of the sorted dataframe must be in increasing order
    assert sortedDf[dateColumn].is_monotonic_increasing

    # Index of the sorted dataframe must be increasing as well
    assert sortedDf.index.is_monotonic_increasing

    # Apart from the order of the rows, both dataframes must be equal
    assert sortedDf.shape == originalDf.shape

    for _, sortedRow in sortedDf.iterrows():
        hasSameDate = originalDf[dateColumn] == sortedRow[dateColumn]
        # Compare against the first original row carrying the same date
        matchingRow = originalDf.loc[hasSameDate].iloc[0]
        assert sortedRow.equals(matchingRow)
def loadData(dataPath):
    """
    Returns the Jaipur Weather Dataset as a dataframe, passed through
    DatasetUtility.sortByDate. The file 'JaipurFinalCleanData.csv' is
    downloaded from Kaggle when it does not already exist inside the
    given path

    :param dataPath: filepath of where the dataset is located, or where
    it must be downloaded to if it is not already there
    :return: complete dataframe of the loaded dataset
    """
    csvName = 'JaipurFinalCleanData.csv'
    csvPath = os.path.join(dataPath, csvName)

    alreadyDownloaded = os.path.isfile(csvPath)
    if not alreadyDownloaded:
        kaggle = KaggleApi()
        kaggle.authenticate()
        kaggle.dataset_download_file(
            dataset='rajatdey/jaipur-weather-forecasting',
            file_name=csvName,
            path=dataPath
        )

    weatherDf = pd.read_csv(csvPath, header='infer')
    return DatasetUtility.sortByDate(weatherDf)
def loadData(dataPath):
    """
    Returns the Amazon Stock Price Dataset as a dataframe, passed
    through DatasetUtility.sortByDate. The file 'Amazon.csv' is
    downloaded from Kaggle when it does not already exist inside the
    given path

    :param dataPath: filepath of where the dataset is located, or where
    it must be downloaded to if it is not already there
    :return: complete dataframe of the loaded dataset
    """
    csvName = 'Amazon.csv'
    csvPath = os.path.join(dataPath, csvName)

    alreadyDownloaded = os.path.isfile(csvPath)
    if not alreadyDownloaded:
        kaggle = KaggleApi()
        kaggle.authenticate()
        kaggle.dataset_download_file(
            dataset='salmanfaroz/amazon-stock-price-1997-to-2020',
            file_name=csvName,
            path=dataPath
        )

    stockDf = pd.read_csv(csvPath, header='infer')
    return DatasetUtility.sortByDate(stockDf)
def loadData(dataPath):
    """
    Returns the Bitcoin Stock Dataset as a dataframe, passed through
    DatasetUtility.sortByDate. The file 'BTC-USD.csv' is downloaded
    from Kaggle when it does not already exist inside the given path

    :param dataPath: filepath of where the dataset is located, or where
    it must be downloaded to if it is not already there
    :return: complete dataframe of the loaded dataset
    """
    csvName = 'BTC-USD.csv'
    csvPath = os.path.join(dataPath, csvName)

    alreadyDownloaded = os.path.isfile(csvPath)
    if not alreadyDownloaded:
        kaggle = KaggleApi()
        kaggle.authenticate()
        kaggle.dataset_download_file(
            dataset='deepakvedantam/bitcoin-stock-data',
            file_name=csvName,
            path=dataPath
        )

    bitcoinDf = pd.read_csv(csvPath, header='infer')
    return DatasetUtility.sortByDate(bitcoinDf)
def loadData(dataPath):
    """
    Returns the Yahoo Stock Dataset as a dataframe, passed through
    DatasetUtility.sortByDate. The file 'yahoo_stock.csv' is downloaded
    from Kaggle when it does not already exist inside the given path

    :param dataPath: filepath of where the dataset is located, or where
    it must be downloaded to if it is not already there
    :return: complete dataframe of the loaded dataset
    """
    csvName = 'yahoo_stock.csv'
    csvPath = os.path.join(dataPath, csvName)

    alreadyDownloaded = os.path.isfile(csvPath)
    if not alreadyDownloaded:
        kaggle = KaggleApi()
        kaggle.authenticate()
        kaggle.dataset_download_file(
            dataset='arashnic/time-series-forecasting-with-yahoo-stock-price',
            file_name=csvName,
            path=dataPath
        )

    stockDf = pd.read_csv(csvPath, header='infer')
    return DatasetUtility.sortByDate(stockDf)
def loadData(dataPath):
    """
    Returns the Seattle Rainfall Dataset as a dataframe, passed through
    DatasetUtility.sortByDate. The file 'seattleWeather_1948-2017.csv'
    is downloaded from Kaggle when it does not already exist inside the
    given path

    :param dataPath: filepath of where the dataset is located, or where
    it must be downloaded to if it is not already there
    :return: complete dataframe of the loaded dataset
    """
    csvName = 'seattleWeather_1948-2017.csv'
    csvPath = os.path.join(dataPath, csvName)

    alreadyDownloaded = os.path.isfile(csvPath)
    if not alreadyDownloaded:
        kaggle = KaggleApi()
        kaggle.authenticate()
        kaggle.dataset_download_file(
            dataset='rtatman/did-it-rain-in-seattle-19482017',
            file_name=csvName,
            path=dataPath
        )

    rainfallDf = pd.read_csv(csvPath, header='infer')
    return DatasetUtility.sortByDate(rainfallDf)