예제 #1
0
def test_download_file__exist_file():
    """Check that ``download_file`` copes with an already existing output file.

    A placeholder file is written to the output path first, then the download
    is requested for that same path. The call must not raise and the path must
    still exist afterwards. The placeholder is removed even when the download
    fails, so a broken run does not leave stray files in the working directory.
    """
    output = 'exist_file_test.test'
    url = 'https://github.com/duketemon/pyuplift/blob/master/LICENSE'
    try:
        with open(output, 'w') as f:
            f.write('test')
        download_file(url, output)
        # The output file must still be present once the call returns.
        assert os.path.exists(output)
    finally:
        # Guarded removal: never mask the real failure with a FileNotFoundError.
        if os.path.exists(output):
            os.remove(output)
예제 #2
0
def download_criteo_uplift_prediction(
    data_home=None,
    url='https://s3.us-east-2.amazonaws.com/criteo-uplift-dataset/criteo-uplift.csv.gz'
):
    """Download the Criteo Uplift Prediction dataset.

    The gzip archive is fetched from ``url`` and unpacked next to it.
    Nothing is done when the unpacked dataset file already exists, and an
    archive that is already on disk is reused instead of being downloaded again.

    ****************
    Data description
    ****************
    This dataset is constructed by assembling data resulting from several incrementality tests,
    a particular randomized trial procedure
    where a random part of the population is prevented from being targeted by advertising.
    It consists of 25M rows, each one representing a user with 11 features,
    a treatment indicator and 2 labels (visits and conversions).

    *******
    Privacy
    *******
    For privacy reasons the data has been sub-sampled non-uniformly so that the original incrementality level
    cannot be deduced from the dataset while preserving a realistic, challenging benchmark.
    Feature names have been anonymized and their values randomly projected so as to keep predictive power
    while making it practically impossible to recover the original features or user context.

    +--------------------------+------------+
    |Features                  |         11 |
    +--------------------------+------------+
    |Treatment                 |          2 |
    +--------------------------+------------+
    |Samples total             | 25,309,483 |
    +--------------------------+------------+
    |Average visit rate        |    0.04132 |
    +--------------------------+------------+
    |Average conversion rate   |    0.00229 |
    +--------------------------+------------+

    More information about the dataset can be found in
    the `official dataset description <http://ailab.criteo.com/criteo-uplift-prediction-dataset>`_.

    +-----------------+----------------------------------------------------------------------------------+
    | **Parameters**  | | **data_home: string**                                                          |
    |                 | |   Specify another download and cache folder for the dataset.                   |
    |                 | |   By default the dataset will be stored in the data folder in the same folder. |
    |                 | | **url: string**                                                                |
    |                 | |   The URL to file with data.                                                   |
    +-----------------+----------------------------------------------------------------------------------+
    | **Returns**     | **None**                                                                         |
    +-----------------+----------------------------------------------------------------------------------+
    """

    data_home, dataset_path = __get_data_home_dataset_file_paths(data_home)
    # exist_ok avoids the check-then-create race when the folder appears
    # between an isdir() test and the makedirs() call.
    os.makedirs(data_home, exist_ok=True)

    if os.path.exists(dataset_path):
        return

    # Swap only the trailing extension. A plain str.replace('.csv', '.gz')
    # would also rewrite any '.csv' occurring in a directory name and point
    # the archive at a different (possibly non-existent) location.
    root, ext = os.path.splitext(dataset_path)
    archive_path = (root if ext == '.csv' else dataset_path) + '.gz'

    if not os.path.exists(archive_path):
        download_file(url, archive_path)
    retrieve_from_gz(archive_path, dataset_path)
def download_hillstrom_email_marketing(
    data_home=None,
    url='http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv'
):
    """Download the Hillstrom Email Marketing dataset.

    The CSV file is fetched from ``url`` into the data folder.
    Nothing is downloaded when the dataset file already exists.

    ****************
    Data description
    ****************
    This dataset contains 64,000 customers who last purchased within twelve months.
    The customers were involved in an e-mail test.

     * 1/3 were randomly chosen to receive an e-mail campaign featuring Mens merchandise.
     * 1/3 were randomly chosen to receive an e-mail campaign featuring Womens merchandise.
     * 1/3 were randomly chosen to not receive an e-mail campaign.

    During a period of two weeks following the e-mail campaign, results were tracked.
    Your job is to tell the world if the Mens or Womens e-mail campaign was successful.

    +--------------------------+------------+
    |Features                  |          8 |
    +--------------------------+------------+
    |Treatment                 |          3 |
    +--------------------------+------------+
    |Samples total             |     64,000 |
    +--------------------------+------------+
    |Average spend rate        |    1.05091 |
    +--------------------------+------------+
    |Average visit rate        |    0.14678 |
    +--------------------------+------------+
    |Average conversion rate   |    0.00903 |
    +--------------------------+------------+

    More information about the dataset can be found in
    the `official paper <http://minethatdata.com/Stochastic_Solutions_E-Mail_Challenge_2008.04.30.pdf>`_.

    +-----------------+----------------------------------------------------------------------------------+
    | **Parameters**  | | **data_home: string**                                                          |
    |                 | |   Specify another download and cache folder for the dataset.                   |
    |                 | |   By default the dataset will be stored in the data folder in the same folder. |
    |                 | | **url: string**                                                                |
    |                 | |   The URL to file with data.                                                   |
    +-----------------+----------------------------------------------------------------------------------+
    | **Returns**     | **None**                                                                         |
    +-----------------+----------------------------------------------------------------------------------+
    """

    data_home, dataset_path = __get_data_home_dataset_file_paths(data_home)
    # exist_ok avoids the check-then-create race when the folder appears
    # between an isdir() test and the makedirs() call.
    os.makedirs(data_home, exist_ok=True)

    # A dataset file that is already on disk is treated as a valid cache entry.
    if os.path.exists(dataset_path):
        return
    download_file(url, dataset_path)
예제 #4
0
def test_download_file__success():
    """Check that ``download_file`` creates the output file for a valid URL.

    The downloaded file is removed afterwards, including when the download
    or the assertion fails, so no partial file is left behind.
    """
    url = 'https://github.com/duketemon/pyuplift/blob/master/LICENSE'
    # NOTE(review): the path is relative to the current working directory, so
    # a LICENSE file already present there is deleted by the cleanup below.
    output = 'LICENSE'
    try:
        download_file(url, output)
        # State the success condition explicitly rather than relying on
        # os.remove() raising for a missing file.
        assert os.path.exists(output)
    finally:
        # Guarded removal: never mask the real failure with a FileNotFoundError.
        if os.path.exists(output):
            os.remove(output)
예제 #5
0
def test_download_file__wrong_output_path():
    """Expect ``FileNotFoundError`` when the output path cannot be written.

    The target points into ``/data23``, which is presumably absent on the
    machine running the tests.
    """
    source_url = 'https://github.com/duketemon/pyuplift/blob/master/LICENSE'
    unreachable_target = '/data23/LICENSE'
    expected_failure = pytest.raises(FileNotFoundError)
    with expected_failure:
        download_file(source_url, unreachable_target)
예제 #6
0
def test_download_file__wrong_url():
    """Expect ``download_file`` to raise when the URL's host is bogus.

    Any ``Exception`` subclass satisfies the check; the concrete type is not
    pinned down here.
    """
    bad_url = 'https://githu404b.com/duketemon/pyuplift/blob/master/LICENSE'
    target = 'LICENSE12'
    expected_failure = pytest.raises(Exception)
    with expected_failure:
        download_file(bad_url, target)