예제 #1
0
    def loadData(dataPath):
        """
        Loads the Delhi Climate Dataset as a dataframe. If either of the
        dataset's CSV files is not present at the path, then it downloads
        the dataset. Each file is passed through DatasetUtility.sortByDate
        and the train rows are followed by the test rows in the result

        :param dataPath: directory of where to download the dataset (or where
        the dataset is located if the dataset is already downloaded)
        :return: complete dataframe of the loaded dataset
        """

        trainPath = os.path.join(dataPath, 'DailyDelhiClimateTrain.csv')
        testPath = os.path.join(dataPath, 'DailyDelhiClimateTest.csv')

        # Look for the files themselves, not just the directory: a directory
        # that already exists but is empty (or only holds other datasets)
        # must still trigger the download, otherwise read_csv fails below
        if not (os.path.isfile(trainPath) and os.path.isfile(testPath)):
            api = KaggleApi()
            api.authenticate()
            api.dataset_download_files(
                dataset='sumanthvrao/daily-climate-time-series-data',
                path=dataPath,
                quiet=True,
                unzip=True
            )

        trainDf = DatasetUtility.sortByDate(
            pd.read_csv(trainPath, header='infer'))
        testDf = DatasetUtility.sortByDate(
            pd.read_csv(testPath, header='infer'))

        # ignore_index gives the combined dataframe a fresh 0..n-1 index
        return pd.concat([trainDf, testDf], axis=0, ignore_index=True)
예제 #2
0
    def loadData(dataPath):
        """
        Returns the Retail Sales Dataset as a dataframe, fetching the CSV
        from Kaggle first when it is not already stored under the given
        directory. The Portuguese column names are renamed to English and
        the result is passed through DatasetUtility.sortByDate

        :param dataPath: directory in which the dataset file is stored (it is
        downloaded there when the file is missing)
        :return: complete dataframe of the loaded dataset
        """

        csvName = 'mock_kaggle.csv'
        csvPath = os.path.join(dataPath, csvName)

        # Original (Portuguese) column name -> English column name
        englishColumns = {
            'data': 'date',
            'venda': 'sale',
            'estoque': 'stock',
            'preco': 'price'
        }

        alreadyDownloaded = os.path.isfile(csvPath)
        if not alreadyDownloaded:
            kaggle = KaggleApi()
            kaggle.authenticate()
            kaggle.dataset_download_file(
                dataset='tevecsystems/retail-sales-forecasting',
                file_name=csvName,
                path=dataPath)

        rawDf = pd.read_csv(csvPath, header='infer')
        renamedDf = rawDf.rename(columns=englishColumns)

        return DatasetUtility.sortByDate(renamedDf)
예제 #3
0
    def loadData(filepath):
        """
        Reads the Apple Stock Data CSV into a Pandas Dataframe and passes it
        through DatasetUtility.sortByDate.
        The file is located in the 'datasets/' directory of this repository
        with name 'apple-stock.csv'

        :param filepath: path where the data file is located
        :return: complete dataframe of the loaded dataset
        """

        # The first 14 lines of the file are skipped before parsing
        stockDf = pd.read_csv(filepath, skiprows=14)

        return DatasetUtility.sortByDate(stockDf)
예제 #4
0
def test_sortByDate(filename, dateFormat):
    """
    Checks the sortByDate static method of DatasetUtility against a
    test data file

    :param filename: name of the test data file inside testDataDir
    :param dateFormat: strptime-style format of the dates in the dataset
    """

    originalDf = pd.read_csv(os.path.join(testDataDir, filename))

    # Call under test
    sortedDf = DatasetUtility.sortByDate(originalDf, dateFormat=dateFormat)

    # The date column is the first column named 'date', ignoring case
    dateCol = next(
        name for name in list(originalDf.columns) if name.lower() == 'date')

    def parseDate(dateString):
        # Date string -> datetime, using the format given to the test
        return datetime.datetime.strptime(dateString, dateFormat)

    # Parse the original date strings so they can be matched against the
    # date values of the sorted dataframe further down
    originalDf[dateCol] = originalDf[dateCol].map(parseDate)

    # Dates must be in increasing order after sorting
    assert sortedDf[dateCol].is_monotonic_increasing

    # The index must be increasing as well
    assert sortedDf.index.is_monotonic_increasing

    # Sorting must neither add nor drop rows/columns
    assert sortedDf.shape == originalDf.shape

    # Every sorted row must equal the first original row with the same date
    for _, sortedRow in sortedDf.iterrows():
        sameDate = originalDf[dateCol] == sortedRow[dateCol]
        matchingRow = originalDf.loc[sameDate].iloc[0]

        assert sortedRow.equals(matchingRow)
예제 #5
0
    def loadData(dataPath):
        """
        Returns the Jaipur Weather Dataset as a dataframe, fetching the CSV
        from Kaggle first when it is not already stored under the given
        directory. The result is passed through DatasetUtility.sortByDate

        :param dataPath: directory in which the dataset file is stored (it is
        downloaded there when the file is missing)
        :return: complete dataframe of the loaded dataset
        """

        csvName = 'JaipurFinalCleanData.csv'
        csvPath = os.path.join(dataPath, csvName)

        alreadyDownloaded = os.path.isfile(csvPath)
        if not alreadyDownloaded:
            kaggle = KaggleApi()
            kaggle.authenticate()
            kaggle.dataset_download_file(
                dataset='rajatdey/jaipur-weather-forecasting',
                file_name=csvName,
                path=dataPath)

        weatherDf = pd.read_csv(csvPath, header='infer')
        return DatasetUtility.sortByDate(weatherDf)
예제 #6
0
    def loadData(dataPath):
        """
        Returns the Amazon Stock Price Dataset as a dataframe, fetching the
        CSV from Kaggle first when it is not already stored under the given
        directory. The result is passed through DatasetUtility.sortByDate

        :param dataPath: directory in which the dataset file is stored (it is
        downloaded there when the file is missing)
        :return: complete dataframe of the loaded dataset
        """

        csvName = 'Amazon.csv'
        csvPath = os.path.join(dataPath, csvName)

        # Only contact Kaggle when the file is not on disk yet
        if not os.path.isfile(csvPath):
            downloadArgs = {
                'dataset': 'salmanfaroz/amazon-stock-price-1997-to-2020',
                'file_name': csvName,
                'path': dataPath
            }
            kaggle = KaggleApi()
            kaggle.authenticate()
            kaggle.dataset_download_file(**downloadArgs)

        stockDf = pd.read_csv(csvPath, header='infer')
        return DatasetUtility.sortByDate(stockDf)
예제 #7
0
    def loadData(dataPath):
        """
        Returns the Bitcoin Stock Dataset as a dataframe, fetching the CSV
        from Kaggle first when it is not already stored under the given
        directory. The result is passed through DatasetUtility.sortByDate

        :param dataPath: directory in which the dataset file is stored (it is
        downloaded there when the file is missing)
        :return: complete dataframe of the loaded dataset
        """

        csvName = 'BTC-USD.csv'
        csvPath = os.path.join(dataPath, csvName)

        isMissing = not os.path.isfile(csvPath)
        if isMissing:
            kaggle = KaggleApi()
            kaggle.authenticate()
            kaggle.dataset_download_file(
                dataset='deepakvedantam/bitcoin-stock-data',
                file_name=csvName,
                path=dataPath)

        bitcoinDf = pd.read_csv(csvPath, header='infer')
        return DatasetUtility.sortByDate(bitcoinDf)
예제 #8
0
    def loadData(dataPath):
        """
        Returns the Yahoo Stock Dataset as a dataframe, fetching the CSV
        from Kaggle first when it is not already stored under the given
        directory. The result is passed through DatasetUtility.sortByDate

        :param dataPath: directory in which the dataset file is stored (it is
        downloaded there when the file is missing)
        :return: complete dataframe of the loaded dataset
        """

        csvName = 'yahoo_stock.csv'
        kaggleDataset = 'arashnic/time-series-forecasting-with-yahoo-stock-price'
        csvPath = os.path.join(dataPath, csvName)

        # Only contact Kaggle when the file is not on disk yet
        if not os.path.isfile(csvPath):
            kaggle = KaggleApi()
            kaggle.authenticate()
            kaggle.dataset_download_file(
                dataset=kaggleDataset,
                file_name=csvName,
                path=dataPath)

        stockDf = pd.read_csv(csvPath, header='infer')
        return DatasetUtility.sortByDate(stockDf)
예제 #9
0
    def loadData(dataPath):
        """
        Returns the Seattle Rainfall Dataset as a dataframe, fetching the
        CSV from Kaggle first when it is not already stored under the given
        directory. The result is passed through DatasetUtility.sortByDate

        :param dataPath: directory in which the dataset file is stored (it is
        downloaded there when the file is missing)
        :return: complete dataframe of the loaded dataset
        """

        csvName = 'seattleWeather_1948-2017.csv'
        csvPath = os.path.join(dataPath, csvName)

        alreadyDownloaded = os.path.isfile(csvPath)
        if not alreadyDownloaded:
            kaggle = KaggleApi()
            kaggle.authenticate()
            kaggle.dataset_download_file(
                dataset='rtatman/did-it-rain-in-seattle-19482017',
                file_name=csvName,
                path=dataPath)

        rainfallDf = pd.read_csv(csvPath, header='infer')
        return DatasetUtility.sortByDate(rainfallDf)