Пример #1
0
def test_prequest_get_success_calls_aws_with_correct_url():
    """Check which request is issued last after a successful GET.

    Both the target URL and the parent API URL (formatted with the target
    URL and ``False``) are mocked; the last request seen by the mocker must
    be the one addressed to the parent API URL.
    """
    parent_api_url = prequest.Prequest.PARENT_API_URL.format(url, False)
    with requests_mock.Mocker() as mocker:
        mocker.get(url)
        mocker.get(parent_api_url)
        prequest.get(url)

        assert mocker.last_request.url == parent_api_url
    def execute(trial=False):
        """Download the MBTAPerformance JSON and write it out through Spark.

        The top-level entries of the downloaded object are dealt round-robin
        into three dictionaries; those three dictionaries are the rows handed
        to ``spark.createDataFrame`` before the result is written as JSON.

        Args:
            trial: Accepted for interface compatibility; not used here.

        Returns:
            dict: ``{'start': ..., 'end': ...}`` with the ``datetime`` values
            captured when the run began and when it finished.
        """
        start_time = datetime.datetime.now()

        print('Fetching MBTAPerformance data...')
        data_url = 'http://datamechanics.io/data/nathansw_rooday_sbajwa_shreyap/MBTAPerformance.json'
        response = requests.get(data_url).json()
        print('MBTAPerformance fetched!')

        # Entry i goes to bucket i % 3, in the order the response yields them.
        buckets = [{}, {}, {}]
        for index, (key, value) in enumerate(response.items()):
            buckets[index % 3][key] = value

        print('Saving MBTAPerformance data...')
        spark = SparkSession.builder.appName('save-mbta-performance').getOrCreate()
        try:
            df = spark.createDataFrame(buckets)
            df.write.json('hdfs://project/hariri/cs591/mbta-performance.json')
        finally:
            # Stop the session even when building or writing the frame fails.
            spark.stop()

        print('Done!')
        end_time = datetime.datetime.now()
        return {'start': start_time, 'end': end_time}
Пример #3
0
    def execute(trial=False):
        """Fetch the NYC open-data resource and load it into ``fjansen.nyc311``.

        Args:
            trial: When true, the request is sent with ``$limit`` 1000
                instead of 10000.

        Returns:
            dict: The ``datetime`` values captured at the beginning and end
            of the run, under the keys ``"start"`` and ``"end"``.
        """
        start_time = datetime.datetime.now()

        # Open the repository and read the portal token from the auth config.
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('fjansen', 'fjansen')
        auth_key = dml.auth['services']['nycportal']['token']

        url = 'https://data.cityofnewyork.us/resource/fhrw-4uyv.json'
        row_limit = 1000 if trial else 10000
        params = {'$limit': row_limit, '$$app_token': auth_key}

        reply = prequest.get(url, params=params)
        resp = json.loads(reply.text)

        # Recreate the collection so it only holds the rows fetched just now.
        repo.dropCollection('nyc311')
        repo.createCollection('nyc311')
        collection = repo['fjansen.nyc311']
        collection.insert_many(resp)

        repo.logout()

        end_time = datetime.datetime.now()
        return {"start": start_time, "end": end_time}
Пример #4
0
def test_get_success_returns_200():
    """A GET against a URL mocked with body ``'resp'`` yields that body and 200."""
    parent_api_url = prequest.Prequest.PARENT_API_URL.format(url, False)
    with requests_mock.Mocker() as mocker:
        mocker.get(url, text='resp')
        mocker.get(parent_api_url)
        response = prequest.get(url)

        assert response.text == 'resp'
        assert response.status_code == 200
Пример #5
0
def test_get_fail_and_cache_fail_returns_original_resp():
    """The original 500 response comes back when the parent API answers 404.

    The target URL is mocked to return 500 and the parent API URL (formatted
    with the target URL and ``True``) to return 404; the response handed back
    must still be the one for the target URL.
    """
    parent_api_url = prequest.Prequest.PARENT_API_URL.format(url, True)
    with requests_mock.Mocker() as mocker:
        mocker.get(url, status_code=500)
        mocker.get(parent_api_url, json={'url': url2}, status_code=404)
        mocker.get(url2)
        response = prequest.get(url)

        assert response.url == url
        assert response.status_code == 500
Пример #6
0
def test_get_500_calls_cache():
    """A 500 from the target URL results in the response for the fallback URL.

    The parent API URL (formatted with the target URL and ``True``) is mocked
    to return JSON pointing at ``url2``; the response handed back must be the
    200 response for ``url2``.
    """
    parent_api_url = prequest.Prequest.PARENT_API_URL.format(url, True)
    with requests_mock.Mocker() as mocker:
        mocker.get(url, status_code=500)
        mocker.get(parent_api_url, json={'url': url2})
        mocker.get(url2)
        response = prequest.get(url)

        assert response.url == url2
        assert response.status_code == 200
    def execute(trial=False):
        """Retrieve some data sets (not using the API here for the sake of simplicity).

        Three Boston datastore resources, labelled september, december and
        may, are downloaded. For every record a street address string is
        assembled and geocoded, and the result is stored under the record's
        incident number as ``(month, latlng)``. The whole mapping is then
        inserted as one document into the ``fires`` collection.

        Args:
            trial: Accepted for interface compatibility; not used here.

        Returns:
            dict: The ``datetime`` values captured at the beginning and end
            of the run, under the keys ``"start"`` and ``"end"``.
        """
        startTime = datetime.datetime.now()

        # Set up the database connection.
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('bemullen_crussack_dharmesh_vinwah',
                          'bemullen_crussack_dharmesh_vinwah')

        key = "fires"
        address = {}

        # Each resource carries its month label explicitly, so the label does
        # not depend on trailing characters of the parsed URL.
        sources = [
            ('september',
             'https://data.boston.gov/api/3/action/datastore_search?'
             'resource_id=14683ec2-c53a-46e0-b6de-67ec123629f0'),
            ('december',
             'https://data.boston.gov/api/3/action/datastore_search?'
             'resource_id=ce5cb864-bd01-4707-b381-9e204b4db73f'),
            ('may',
             'https://data.boston.gov/api/3/action/datastore_search?'
             'resource_id=9d91dbc7-9875-4cd9-a772-3b363a4b193f'),
        ]

        # Parse every URL up front, before any download starts.
        urls = [(month, RetrieveFire.parseURL(raw_url))
                for month, raw_url in sources]

        for month, url in urls:
            r = json.loads(prequest.get(url).text)
            # Tag each incident with the month of the resource it came from.
            for record in r['result']['records']:
                # Space-separated so the neighborhood and "MA" do not run
                # together in the geocoding query.
                streetAddress = " ".join([
                    record['Street Number'].strip(),
                    record['Street Name'].strip(),
                    record['Street Type'].strip(),
                    record['Neighborhood'].strip(),
                    "MA",
                    record['Zip'].strip(),
                ])
                g = geocoder.google(streetAddress)
                address[record['Incident Number']] = (month, g.latlng)

        repo.dropCollection(key)
        repo.createCollection(key)
        repo['bemullen_crussack_dharmesh_vinwah.' + key].insert_many([address])

        repo.logout()
        endTime = datetime.datetime.now()

        return {"start": startTime, "end": endTime}
    def execute(trial=False):
        """Download the stops JSON and write it out through Spark.

        Args:
            trial: Accepted for interface compatibility; not used here.

        Returns:
            dict: ``{'start': ..., 'end': ...}`` with the ``datetime`` values
            captured when the run began and when it finished.
        """
        start_time = datetime.datetime.now()

        print('Fetching stops data...')
        data_url = 'http://datamechanics.io/data/nathansw_rooday_sbajwa_shreyap/stops.json'
        response = requests.get(data_url).json()
        print('stops data fetched!')

        print('Saving stops data...')
        spark = SparkSession.builder.appName('save-stops').getOrCreate()
        try:
            df = spark.createDataFrame(response)
            df.write.json('hdfs://project/hariri/cs591/stops.json')
        finally:
            # Stop the session even when building or writing the frame fails.
            spark.stop()

        print('Done!')
        end_time = datetime.datetime.now()
        return {'start': start_time, 'end': end_time}
    def execute(trial=False):
        """Download Boston datastore records and load them into ``fjansen.fires``.

        Every URL is fetched first; only afterwards is the ``fires``
        collection dropped, recreated and filled, one ``insert_many`` call
        per downloaded resource.

        Args:
            trial: When true, a single resource is requested with
                ``limit=1000``; otherwise three resources are requested
                with ``limit=10000`` each.

        Returns:
            dict: The ``datetime`` values captured at the beginning and end
            of the run, under the keys ``"start"`` and ``"end"``.
        """
        start_time = datetime.datetime.now()

        # Open the repository.
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('fjansen', 'fjansen')

        if not trial:
            urls = [
                'https://data.boston.gov/api/3/action/datastore_search?resource_id=8608b9db-71e2-4acb-9691-75b3c66fdd17&limit=10000',
                'https://data.boston.gov/api/3/action/datastore_search?resource_id=d969a70d-2734-4e75-b2ae-e64aec289892&limit=10000',
                'https://data.boston.gov/api/3/action/datastore_search?resource_id=8f4f497e-d93c-4f2f-b754-bfc69e2700a0&limit=10000'
            ]
        else:
            urls = [
                'https://data.boston.gov/api/3/action/datastore_search?resource_id=8f4f497e-d93c-4f2f-b754-bfc69e2700a0&limit=1000'
            ]

        batches = []
        for endpoint in urls:
            payload = json.loads(prequest.get(endpoint).text)
            for record in payload['result']['records']:
                # The pre-assigned _id would clash with the one mongo assigns.
                del record['_id']
            batches.append(payload['result']['records'])

        repo.dropCollection("fires")
        repo.createCollection("fires")
        for batch in batches:
            repo['fjansen.fires'].insert_many(batch)

        repo.logout()

        end_time = datetime.datetime.now()
        return {"start": start_time, "end": end_time}
    def execute(trial=False):
        """Download four demographics JSON files and write each out via Spark.

        Race, MeansOfCommuting and PovertyRates are written as downloaded.
        HouseholdIncome first has every ``$`` removed from the keys of its
        nested dictionaries and is then written the same way.

        Args:
            trial: Accepted for interface compatibility; not used here.

        Returns:
            dict: The ``datetime`` values captured at the beginning and end
            of the run, under the keys ``"start"`` and ``"end"``.
        """
        start_time = datetime.datetime.now()

        spark = SparkSession.builder.appName('save-demographics').getOrCreate()
        try:
            # Files from datamechanics.io that are stored without changes,
            # paired with the location each one is written to.
            passthrough = [
                ('http://datamechanics.io/data/nathansw_sbajwa/Race.json',
                 'hdfs://project/hariri/cs591/race.json'),
                ('http://datamechanics.io/data/nathansw_sbajwa/MeansOfCommuting.json',
                 'hdfs://project/hariri/cs591/commuting.json'),
                ('http://datamechanics.io/data/nathansw_sbajwa/PovertyRates.json',
                 'hdfs://project/hariri/cs591/poverty-rates.json'),
            ]
            for url, target in passthrough:
                response = requests.get(url).json()
                df = spark.createDataFrame(response)
                df.write.json(target)

            # opens 'HouseholdIncome.json' file from datamechanics.io
            url = 'http://datamechanics.io/data/nathansw_sbajwa/HouseholdIncome.json'
            response = requests.get(url).json()

            # Removes $ from all of the nested keys (char forbidden by mongodb).
            # TODO Is this necessary for Spark?
            for brackets in response.values():
                renamed = {}
                stale_keys = []
                for old_key in brackets:
                    # ex: '$25,000-34,999' -> '25,000-34,999'
                    new_key = old_key.replace('$', '')
                    # Only keys that actually contained a $ need replacing.
                    if new_key != old_key:
                        renamed[new_key] = brackets[old_key]
                        stale_keys.append(old_key)
                # Add the cleaned keys first, then drop the originals, so the
                # dict is never modified while it is being iterated.
                brackets.update(renamed)
                for stale_key in stale_keys:
                    del brackets[stale_key]

            df = spark.createDataFrame(response)
            df.write.json('hdfs://project/hariri/cs591/household-income.json')
        finally:
            # Stop the Spark session even when a download or write fails.
            spark.stop()

        end_time = datetime.datetime.now()

        return {"start": start_time, "end": end_time}