Exemplo n.º 1
0
def main():
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)

    if CLEAR_TABLE_FIRST:
        if cartosql.tableExists(CARTO_TABLE):
            cartosql.deleteRows(CARTO_TABLE,
                                'cartodb_id IS NOT NULL',
                                user=os.getenv('CARTO_USER'),
                                key=os.getenv('CARTO_KEY'))

    ### 1. Check if table exists and create table
    checkCreateTable(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD, TIME_FIELD)
    existing_ids = getFieldAsList(CARTO_TABLE, UID_FIELD)
    num_existing = len(existing_ids)

    ### 2. Fetch data from FTP, dedupe, process
    num_new = processData(existing_ids)
    num_total = num_existing + num_new

    ### 3. Notify results
    logging.info('Total rows: {}, New rows: {}, Max: {}'.format(
        num_total, num_new, MAX_ROWS))
    deleteExcessRows(CARTO_TABLE, MAX_ROWS, TIME_FIELD)

    # Get most recent update date
    #if new rows were added to table, make today the most recent update date
    if num_new > 0:
        most_recent_date = get_most_recent_date(CARTO_TABLE)
        lastUpdateDate(DATASET_ID, most_recent_date)

    logging.info("SUCCESS")
Exemplo n.º 2
0
def main():
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')

    if CLEAR_TABLE_FIRST:
        if cartosql.tableExists(CARTO_TABLE):
            logging.info("Clearing table")
            cartosql.deleteRows(CARTO_TABLE, 'cartodb_id IS NOT NULL', user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY'))

    # 1. Check if table exists and create table
    existing_ids = []
    if cartosql.tableExists(CARTO_TABLE):
        existing_ids = getFieldAsList(CARTO_TABLE, UID_FIELD)
    else:
        createTableWithIndices(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD, TIME_FIELD)

    # 2. Iterively fetch, parse and post new data
    num_new = processNewData(existing_ids)

    existing_count = num_new + len(existing_ids)
    logging.info('Total rows: {}, New: {}, Max: {}'.format(existing_count, num_new, MAXROWS))

    # 3. Remove old observations
    deleteExcessRows(CARTO_TABLE, MAXROWS, TIME_FIELD, MAXAGE)

    # Get most recent update date
    most_recent_date = get_most_recent_date(CARTO_TABLE)
    lastUpdateDate(DATASET_ID, most_recent_date)

    ###
    logging.info('SUCCESS')
Exemplo n.º 3
0
def main():
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')

    if CLEAR_TABLE_FIRST:
        if cartosql.tableExists(CARTO_TABLE):
            cartosql.deleteRows(CARTO_TABLE, 'cartodb_id IS NOT NULL')

    # 1. Check if table exists and create table
    existing_ids = []
    if cartosql.tableExists(CARTO_TABLE):
        logging.info('Fetching existing ids')
        existing_ids = getIds(CARTO_TABLE, UID_FIELD)
    else:
        logging.info('Table {} does not exist, creating'.format(CARTO_TABLE))
        createTableWithIndex(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD, TIME_FIELD)

    # 2. Iterively fetch, parse and post new data
    new_ids = processNewData(existing_ids)

    new_count = len(new_ids)
    existing_count = new_count + len(existing_ids)
    logging.info('Total rows: {}, New: {}, Max: {}'.format(
        existing_count, new_count, MAXROWS))

    # 3. Remove old observations
    deleteExcessRows(CARTO_TABLE, MAXROWS, TIME_FIELD)

    # Get most recent update date
    most_recent_date = get_most_recent_date(CARTO_TABLE)
    logging.info(most_recent_date)
    lastUpdateDate(DATASET_ID, most_recent_date)

    logging.info('SUCCESS')
Exemplo n.º 4
0
def delete_carto_entries(id_list, column):
    '''
    Delete entries in Carto table based on values in a specified column
    INPUT   id_list: list of column values for which you want to delete entries in table (list of strings)
            column: column name where you should search for these values (string)
    '''
    # generate empty variable to store WHERE clause of SQL query we will send
    where = None
    # go through each ID in the list to be deleted
    for delete_id in id_list:
        # if we already have values in the SQL query, add the new value with an OR before it
        if where:
            where += f' OR {column} = {delete_id}'
        # if the SQL query is empty, create the start of the WHERE clause
        else:
            where = f'{column} = {delete_id}'
        # if where statement is long or we are on the last id, delete rows
        # the length of 15000 was chosen arbitrarily - all the IDs to be deleted could not be sent at once, but no
        # testing was done to optimize this value
        if len(where) > 15000 or delete_id == id_list[-1]:
            cartosql.deleteRows(CARTO_TABLE, where=where, user=CARTO_USER,
                                key=CARTO_KEY)
            # after we have deleted a set of rows, start over with a blank WHERE clause for the SQL query so we don't
            # try to delete rows we have already deleted
            where = None
Exemplo n.º 5
0
def main():
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
    logging.info('STARTING')

    # clear the table before starting, if specified
    if CLEAR_TABLE_FIRST:
        logging.info('Clearing Table')
        # if the table exists
        if cartosql.tableExists(CARTO_TABLE, user=CARTO_USER, key=CARTO_KEY):
            # delete all the rows
            cartosql.deleteRows(CARTO_TABLE, 'cartodb_id IS NOT NULL', user=CARTO_USER, key=CARTO_KEY)
            # note: we do not delete the entire table because this will cause the dataset visualization on Resource Watch
            # to disappear until we log into Carto and open the table again. If we simply delete all the rows, this
            # problem does not occur

    # Check if table exists, create it if it does not
    logging.info('Checking if table exists and getting existing IDs.')
    existing_ids = checkCreateTable(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD)

    # Fetch, process, and upload the new data
    logging.info('Fetching new data')
    num_new = processData(existing_ids)
    logging.info('Previous rows: {},  New rows: {}'.format(len(existing_ids), num_new))

    # Update Resource Watch
    updateResourceWatch(num_new)

    # Delete local files in Docker container
    delete_local()

    logging.info('SUCCESS')
Exemplo n.º 6
0
def processData():
    '''
    Function to download data and upload it to Carto
    Will first try to get the data for today three times
    Then decrease a day up until 8 tries until it finds one
    '''
    date = datetime.date.today() - datetime.timedelta(days=1)
    success = False
    tries = 0
    while tries < MAX_TRIES and success == False:
        logging.info("Fetching data for {}".format(str(date)))
        f = getFilename(date)
        url = SOURCE_URL.format(date=date.strftime('%Y%m%d'))
        try:
            urllib.request.urlretrieve(url, f)

        except Exception as inst:
            logging.info("Error fetching data for {}".format(str(date)))
            if tries >= 2:
                date = date - datetime.timedelta(days=1)
            tries = tries + 1
            if tries == MAX_TRIES:
                logging.error(
                    "Error fetching data for {}, and max tries reached. See source for last data update."
                    .format(str(datetime.date.today())))
            success = False
        else:
            df = pd.read_csv(f,
                             header=0,
                             usecols=[
                                 'Lat_DNB', 'Lon_DNB', 'Date_Mscan',
                                 'Date_LTZ', 'QF_Detect', 'EEZ', 'Land_Mask'
                             ])
            df = df.drop(df[df.QF_Detect == 999999].index)
            df['the_geom'] = df.apply(
                lambda row: getGeom(row['Lon_DNB'], row['Lat_DNB']), axis=1)

            df = df[[
                'the_geom', 'QF_Detect', 'Date_Mscan', 'Date_LTZ', 'Land_Mask',
                'Lon_DNB', 'Lat_DNB', 'EEZ'
            ]]
            if not cartosql.tableExists(CARTO_TABLE):
                logging.info('Table {} does not exist'.format(CARTO_TABLE))
                cartosql.createTable(CARTO_TABLE, CARTO_SCHEMA)
            else:
                cartosql.deleteRows(CARTO_TABLE, 'cartodb_id IS NOT NULL')
                cartosql.createTable(CARTO_TABLE, CARTO_SCHEMA)

                rows = df.values.tolist()
                logging.info('Success!')
                #logging.info('The following includes the first ten rows added to Carto:')
                #logging.info(rows[:10])
                if len(rows):
                    cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                             CARTO_SCHEMA.values(), rows)
            tries = tries + 1
            success = True
Exemplo n.º 7
0
def cleanOldRows(table, time_field, max_age, date_format='%Y-%m-%d %H:%M:%S'):
    ''' 
    Delete rows that are older than a certain threshold
    INPUT   table: name of table in Carto from which we will delete the old data (string)
            time_field: column that stores datetime information (string) 
            max_age: oldest date that can be stored in the Carto table (datetime object)
            date_format: format of dates in Carto table (string)
    RETURN  num_expired: number of rows that have been dropped from the table (integer)
    '''
    # initialize number of rows that will be dropped as 0
    num_expired = 0
    # if the table exists
    if cartosql.tableExists(table, CARTO_USER, CARTO_KEY):
        # check if max_age variable is a datetime object
        if isinstance(max_age, datetime.datetime):
            # convert datetime object to string formatted according to date_format
            max_age = max_age.strftime(date_format)
        elif isinstance(max_age, str):
            # raise an error if max_age is a string
            logging.error(
                'Max age must be expressed as a datetime.datetime object')
        # delete rows from table which are older than the max_age
        r = cartosql.deleteRows(table, "{} < '{}'".format(time_field, max_age),
                                CARTO_USER, CARTO_KEY)
        # get the number of rows that were dropped from the table
        num_expired = r.json()['total_rows']
    else:
        # raise an error if the table doesn't exist
        logging.error("{} table does not exist yet".format(table))

    return (num_expired)
Exemplo n.º 8
0
def deleteExcessRows(table, max_rows, time_field, max_age=''):
    '''Delete excess rows by age or count'''
    num_dropped = 0
    if isinstance(max_age, datetime.datetime):
        max_age = max_age.isoformat()

    # 1. delete by age
    if max_age:
        r = cartosql.deleteRows(table, "{} < '{}'".format(time_field, max_age))
        num_dropped = r.json()['total_rows']

    # 2. get sorted ids (old->new)
    r = cartosql.getFields('cartodb_id',
                           table,
                           order='{}'.format(time_field),
                           f='csv')
    ids = r.text.split('\r\n')[1:-1]

    # 3. delete excess
    if len(ids) > max_rows:
        r = cartosql.deleteRowsByIDs(table, ids[:-max_rows])
        num_dropped += r.json()['total_rows']
    if num_dropped:
        logging.info('Dropped {} old rows from {}'.format(num_dropped, table))
    return num_dropped
Exemplo n.º 9
0
def processData():
    '''
    Function to download data and upload it to Carto
    Will first try to get the data for MAX_TRIES then quits
    '''
    success = False
    tries = 0
    df = None
    while tries < MAX_TRIES and success == False:
        logging.info('Try running feeds, try number = {}'.format(tries))
        try:
            df = feeds()
            success = True
        except Exception as inst:
            logging.info(inst)
            logging.info("Error fetching data trying again")
            tries = tries + 1
            if tries == MAX_TRIES:
                logging.error(
                    "Error fetching data, and max tries reached. See source for last data update."
                )
            success = False
    if success == True:
        if not cartosql.tableExists(CARTO_TABLE,
                                    user=os.getenv('CARTO_USER'),
                                    key=os.getenv('CARTO_KEY')):
            logging.info('Table {} does not exist'.format(CARTO_TABLE))
            cartosql.createTable(CARTO_TABLE, CARTO_SCHEMA)
            # Send dataframe to Carto
            logging.info('Writing to Carto')
            cc = cartoframes.CartoContext(
                base_url="https://{user}.carto.com/".format(user=CARTO_USER),
                api_key=CARTO_KEY)
            cc.write(df, CARTO_TABLE, overwrite=True, privacy='public')
        else:
            cartosql.deleteRows(CARTO_TABLE,
                                'cartodb_id IS NOT NULL',
                                user=os.getenv('CARTO_USER'),
                                key=os.getenv('CARTO_KEY'))
            # Send dataframe to Carto
            logging.info('Writing to Carto')
            cc = cartoframes.CartoContext(
                base_url="https://{user}.carto.com/".format(user=CARTO_USER),
                api_key=CARTO_KEY)
            cc.write(df, CARTO_TABLE, overwrite=True, privacy='public')
Exemplo n.º 10
0
def main():
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
    logging.info('STARTING')

    # clear the table before starting, if specified
    if CLEAR_TABLE_FIRST:
        logging.info("clearing table")
        # if the table exists
        if cartosql.tableExists(CARTO_TABLE, user=CARTO_USER, key=CARTO_KEY):
            # delete all the rows
            cartosql.deleteRows(CARTO_TABLE,
                                'cartodb_id IS NOT NULL',
                                user=CARTO_USER,
                                key=CARTO_KEY)
            # note: we do not delete the entire table because this will cause the dataset visualization on Resource Watch
            # to disappear until we log into Carto and open the table again. If we simply delete all the rows, this
            # problem does not occur

    # Check if table exists, create it if it does not
    logging.info('Checking if table exists and getting existing IDs.')
    existing_ids = checkCreateTable(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD,
                                    TIME_FIELD)

    # Delete rows that are older than a certain threshold
    num_expired = cleanOldRows(CARTO_TABLE, TIME_FIELD, MAX_AGE)

    # Get the filename from source url for which we want to download data
    filename = fetchDataFileName(SOURCE_URL)

    # Fetch, process, and upload new data
    logging.info('Fetching new data')
    num_new = processData(SOURCE_URL, filename, existing_ids)
    logging.info('Previous rows: {},  New rows: {}'.format(
        len(existing_ids), num_new))

    # Delete data to get back to MAX_ROWS
    num_deleted = deleteExcessRows(CARTO_TABLE, MAX_ROWS, TIME_FIELD)

    # Update Resource Watch
    updateResourceWatch(num_new)

    logging.info("SUCCESS")
Exemplo n.º 11
0
def main():
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)

    if CLEAR_TABLE_FIRST:
        if cartosql.tableExists(CARTO_TABLE):
            cartosql.deleteRows(CARTO_TABLE,
                                'cartodb_id IS NOT NULL',
                                user=os.getenv('CARTO_USER'),
                                key=os.getenv('CARTO_KEY'))

    ### 1. Check if table exists, if not, create it
    checkCreateTable(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD, TIME_FIELD)

    ### 2. Retrieve existing data
    r = cartosql.getFields(UID_FIELD,
                           CARTO_TABLE,
                           order='{} desc'.format(TIME_FIELD),
                           f='csv')
    existing_ids = r.text.split('\r\n')[1:-1]
    num_existing = len(existing_ids)

    ### 3. Fetch data from FTP, dedupe, process
    num_new = processData(existing_ids)

    ### 4. Delete data to get back to MAX_ROWS
    num_dropped = deleteExcessRows(CARTO_TABLE, MAX_ROWS, TIME_FIELD, MAX_AGE)

    ### 5. Notify results
    total = num_existing + num_new - num_dropped

    # Get most recent update date
    most_recent_date = get_most_recent_date(CARTO_TABLE)
    lastUpdateDate(DATASET_ID, most_recent_date)

    logging.info('Existing rows: {},  New rows: {}, Max: {}'.format(
        total, num_new, MAX_ROWS))
    logging.info("SUCCESS")
Exemplo n.º 12
0
def main():
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')

    if CLEAR_TABLE_FIRST:
        if cartosql.tableExists(CARTO_TABLE):
            cartosql.deleteRows(CARTO_TABLE,
                                'cartodb_id IS NOT NULL',
                                user=os.getenv('CARTO_USER'),
                                key=os.getenv('CARTO_KEY'))

    ### 1. Check if table exists, if not, create it
    logging.info('Checking if table exists and getting existing IDs.')
    existing_ids = checkCreateTable(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD)
    num_existing = len(existing_ids)

    ### 2. Fetch data from FTP, dedupe, process
    logging.info('Fetching new data')
    num_new = processData(existing_ids)
    logging.info('Processing interactions')
    processInteractions()

    ### 3. Delete data to get back to MAX_ROWS
    logging.info('Deleting excess rows')
    num_dropped = deleteExcessRows(CARTO_TABLE, MAX_ROWS, AGE_FIELD)

    ### 4. Notify results
    total = num_existing + num_new - num_dropped

    # If updates, change update date on RW
    if num_new > 0:
        lastUpdateDate(DATASET_ID, datetime.datetime.utcnow())

    logging.info('Existing rows: {},  New rows: {}, Max: {}'.format(
        total, num_new, MAX_ROWS))
    logging.info("SUCCESS")
Exemplo n.º 13
0
def cleanOldRows(table, time_field, max_age, date_format='%Y-%m-%d %H:%M:%S'):
    '''
    Delete excess rows by age
    Max_Age should be a datetime object or string
    Return number of dropped rows
    '''
    num_expired = 0
    if cartosql.tableExists(table):
        if isinstance(max_age, datetime.datetime):
            max_age = max_age.strftime(date_format)
        elif isinstance(max_age, str):
            logging.error('Max age must be expressed as a datetime.datetime object')

        r = cartosql.deleteRows(table, "{} < '{}'".format(time_field, max_age))
        num_expired = r.json()['total_rows']
    else:
        logging.error("{} table does not exist yet".format(table))

    return(num_expired)
Exemplo n.º 14
0
def deleteExcessRows(table, max_rows, time_field, max_age=''):
    '''Delete excess rows by age or count'''
    num_dropped = 0
    if isinstance(max_age, datetime.datetime):
        max_age = max_age.isoformat()

    # 1. delete by age
    if max_age:
        r = cartosql.deleteRows(table, "{} < '{}'".format(time_field, max_age))
        num_dropped = r.json()['total_rows']

    # 2. get sorted ids (old->new)
    ids = getFieldAsList(CARTO_TABLE, 'cartodb_id', orderBy=''.format(TIME_FIELD))

    # 3. delete excess
    if len(ids) > max_rows:
        r = cartosql.deleteRowsByIDs(table, ids[:-max_rows])
        num_dropped += r.json()['total_rows']
    if num_dropped:
        logging.info('Dropped {} old rows from {}'.format(num_dropped, table))
Exemplo n.º 15
0
def main():
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')

    if CLEAR_TABLE_FIRST:
        if cartosql.tableExists(CARTO_MARKET_TABLE):
            cartosql.deleteRows(CARTO_MARKET_TABLE,
                                'cartodb_id IS NOT NULL',
                                user=os.getenv('CARTO_USER'),
                                key=os.getenv('CARTO_KEY'))
        if cartosql.tableExists(CARTO_ALPS_TABLE):
            cartosql.deleteRows(CARTO_ALPS_TABLE,
                                'cartodb_id IS NOT NULL',
                                user=os.getenv('CARTO_USER'),
                                key=os.getenv('CARTO_KEY'))
        if cartosql.tableExists(CARTO_INTERACTION_TABLE):
            cartosql.deleteRows(CARTO_INTERACTION_TABLE,
                                'cartodb_id IS NOT NULL',
                                user=os.getenv('CARTO_USER'),
                                key=os.getenv('CARTO_KEY'))

    # 1. Check if table exists and create table
    existing_markets = []
    if cartosql.tableExists(CARTO_MARKET_TABLE,
                            user=os.getenv('CARTO_USER'),
                            key=os.getenv('CARTO_KEY')):
        logging.info('Fetching existing market ids')
        existing_markets = getIds(CARTO_MARKET_TABLE, UID_FIELD)
    else:
        logging.info(
            'Table {} does not exist, creating'.format(CARTO_MARKET_TABLE))
        createTableWithIndex(CARTO_MARKET_TABLE, CARTO_MARKET_SCHEMA,
                             UID_FIELD)

    existing_alps = []
    if cartosql.tableExists(CARTO_ALPS_TABLE,
                            user=os.getenv('CARTO_USER'),
                            key=os.getenv('CARTO_KEY')):
        logging.info('Fetching existing ALPS ids')
        existing_alps = getIds(CARTO_ALPS_TABLE, UID_FIELD)
    else:
        logging.info(
            'Table {} does not exist, creating'.format(CARTO_ALPS_TABLE))
        createTableWithIndex(CARTO_ALPS_TABLE, CARTO_ALPS_SCHEMA, UID_FIELD,
                             TIME_FIELD)

    existing_interactions = []
    if cartosql.tableExists(CARTO_INTERACTION_TABLE,
                            user=os.getenv('CARTO_USER'),
                            key=os.getenv('CARTO_KEY')):
        logging.info('Fetching existing interaction ids')
        existing_interactions = getIds(CARTO_INTERACTION_TABLE, UID_FIELD)
    else:
        logging.info('Table {} does not exist, creating'.format(
            CARTO_INTERACTION_TABLE))
        createTableWithIndex(CARTO_INTERACTION_TABLE, CARTO_INTERACTION_SCHEMA,
                             UID_FIELD, INTERACTION_TIME_FIELD)

    # 2. Iterively fetch, parse and post new data
    num_new_markets, num_new_alps, markets_updated = processNewData(
        existing_markets, existing_alps)

    # Update Interaction table
    num_new_interactions = processInteractions(markets_updated)

    # Report new data count
    num_existing_markets = num_new_markets + len(existing_markets)
    logging.info('Total market rows: {}, New: {}, Max: {}'.format(
        num_existing_markets, num_new_markets, MAXROWS))

    num_existing_alps = num_new_alps + len(existing_alps)
    logging.info('Total alps rows: {}, New: {}, Max: {}'.format(
        num_existing_alps, num_new_alps, MAXROWS))

    num_existing_interactions = num_new_interactions + len(
        existing_interactions)
    logging.info('Total interaction rows: {}, New: {}, Max: {}'.format(
        num_existing_interactions, num_new_interactions, MAXROWS))

    # 3. Remove old observations
    deleteExcessRows(CARTO_ALPS_TABLE, MAXROWS, TIME_FIELD)  # MAXAGE)

    # Get most recent update date
    most_recent_date = get_most_recent_date(CARTO_ALPS_TABLE)
    lastUpdateDate(DATASET_ID, most_recent_date)

    logging.info('SUCCESS')
Exemplo n.º 16
0
def processNewData(url):
    '''
    Fetch, process and upload new data
    INPUT   url: url where you can find the download link for the source data (string)
    RETURN  num_new: number of rows of new data sent to Carto table (integer)
    '''
    # specify the starting page of source url we want to pull
    page = 1
    # generate the url and pull data for this page
    r = requests.get(url.format(page=page))
    # pull data from request response json
    raw_data = r.json()['data']
    # if data is available from source url
    if len(raw_data) > 0:
        # if the table exists
        if cartosql.tableExists(CARTO_TABLE, user=CARTO_USER, key=CARTO_KEY):
            # delete all the rows
            cartosql.deleteRows(CARTO_TABLE,
                                'cartodb_id IS NOT NULL',
                                user=CARTO_USER,
                                key=CARTO_KEY)
        logging.info('Updating {}'.format(CARTO_TABLE))
    else:
        # raise an error that data is not available from source url
        logging.error("Source data missing. Table will not update.")
    # create an empty list to store new data
    new_data = []
    # if data is available from source url
    while len(raw_data) > 0:
        logging.info('Processing page {}'.format(page))
        # read in source data as a pandas dataframe
        df = pd.DataFrame(raw_data)
        # go through each rows in the dataframe
        for row_num in range(df.shape[0]):
            # get the row of data
            row = df.iloc[row_num]
            # create an empty list to store data from this row
            new_row = []
            # go through each column in the Carto table
            for field in CARTO_SCHEMA:
                # if we are fetching data for unique id column
                if field == 'uid':
                    # add the unique id to the list of data from this row
                    new_row.append(row[UID_FIELD])
                # for any other column, check if there are values available from the source for this row
                else:
                    # if data available from source for this field, populate the field with the data
                    # else populate with None
                    val = row[field] if row[field] != '' else None
                    # add this value to the list of data from this row
                    new_row.append(val)
            # add the list of values from this row to the list of new data
            new_data.append(new_row)
        # go to the next page and check for data
        page += 1
        # generate the url and pull data for this page
        r = requests.get(url.format(page=page))
        # pull data from request response json
        raw_data = r.json()['data']

    # find the length (number of rows) of new_data
    num_new = len(new_data)
    # if we have found new dates to process
    if num_new:
        # insert new data into the carto table
        cartosql.blockInsertRows(CARTO_TABLE,
                                 CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(),
                                 new_data,
                                 user=CARTO_USER,
                                 key=CARTO_KEY)

    return num_new
Exemplo n.º 17
0
def processInteractions():
    r = cartosql.get(
        "SELECT * FROM {} WHERE current='True'".format(CARTO_TABLE),
        user=os.getenv('CARTO_USER'),
        key=os.getenv('CARTO_KEY'))
    interaction_data = r.json()['rows']
    try_num = 0
    #if we didn't get data back, wait a few minutes and try again
    while not len(interaction_data):
        logging.info('Sleeping and trying again.')
        try_num += 1
        time.sleep(300)
        interaction_data = r.json()['rows']
        if try_num > 5:
            logging.error('Problem fetching data to generate interactions')
            exit()

    countries_with_interaction = []
    for interaction in interaction_data:
        ctry = interaction['country_iso3']
        if ctry not in countries_with_interaction:
            countries_with_interaction.append(ctry)
    if cartosql.tableExists(CARTO_TABLE_INTERACTION,
                            user=os.getenv('CARTO_USER'),
                            key=os.getenv('CARTO_KEY')):
        cartosql.deleteRows(CARTO_TABLE_INTERACTION,
                            'cartodb_id IS NOT NULL',
                            user=os.getenv('CARTO_USER'),
                            key=os.getenv('CARTO_KEY'))
    #run to create new table
    #existing_interaction_ids = checkCreateTable(CARTO_TABLE_INTERACTION, CARTO_SCHEMA_INTERACTION, UID_FIELD)
    new_interactions = []
    for ctry in countries_with_interaction:
        r = cartosql.get(
            "SELECT * FROM {} WHERE current='True' AND country_iso3='{}'".
            format(CARTO_TABLE, ctry),
            user=os.getenv('CARTO_USER'),
            key=os.getenv('CARTO_KEY'))
        ctry_interaction_data = r.json()['rows']
        event_num = 1
        for interaction in ctry_interaction_data:
            event = interaction['event_name'].split(": ", 1)
            if event_num == 1:
                if len(event) == 1:
                    interaction_str = '{} ({})'.format(event[0],
                                                       interaction['url'])
                else:
                    interaction_str = '{} ({})'.format(event[1],
                                                       interaction['url'])
            else:
                if len(event) == 1:
                    interaction_str = interaction_str + '; ' + '{} ({})'.format(
                        event[0], interaction['url'])
                else:
                    interaction_str = interaction_str + '; ' + '{} ({})'.format(
                        event[1], interaction['url'])
            event_num += 1
        #uid = gen_interaction_uid(ctry)
        if ctry_interaction_data:
            row = []
            for key in CARTO_SCHEMA_INTERACTION.keys():
                try:
                    if key == 'the_geom':
                        lon = ctry_interaction_data[0]['lon']
                        lat = ctry_interaction_data[0]['lat']
                        item = {'type': 'Point', 'coordinates': [lon, lat]}
                    elif key == 'interaction':
                        item = interaction_str
                    else:
                        item = ctry_interaction_data[0][key]
                except KeyError:
                    item = None
                row.append(item)
            new_interactions.append(row)
    logging.info('Adding {} new interactions'.format(len(new_interactions)))
    cartosql.blockInsertRows(CARTO_TABLE_INTERACTION,
                             CARTO_SCHEMA_INTERACTION.keys(),
                             CARTO_SCHEMA_INTERACTION.values(),
                             new_interactions,
                             user=os.getenv('CARTO_USER'),
                             key=os.getenv('CARTO_KEY'))
Exemplo n.º 18
0
                                    ('year', 'numeric')])
        # Go through each type of "value" in this table
        # Add data column, unit, and indicator code to CARTO_SCHEMA, column_order, and dataset
        valnames = info['Carto Column'].split(";")
        for i in range(len(valnames)):
            CARTO_SCHEMA.update({valnames[i]: 'numeric'})
            # add the unit column name and type for this value  to the Carto Schema
            CARTO_SCHEMA.update({'unit' + str(i + 1): 'text'})
            # add the WB Indicator Code column name and type for this value to the Carto Schema
            CARTO_SCHEMA.update({'indicator_code' + str(i + 1): 'text'})
        # add the RW country name and country code columns to the table
        CARTO_SCHEMA.update({"rw_country_name": 'text'})
        CARTO_SCHEMA.update({"rw_country_code": 'text'})

        cartosql.deleteRows(table_name,
                            'cartodb_id IS NOT NULL',
                            user=CARTO_USER,
                            key=CARTO_KEY)

        # Insert new observations
        if len(all_world_bank_data):
            cartosql.blockInsertRows(table_name,
                                     CARTO_SCHEMA.keys(),
                                     CARTO_SCHEMA.values(),
                                     all_world_bank_data.values.tolist(),
                                     user=CARTO_USER,
                                     key=CARTO_KEY)
            logging.info('Success! New rows have been added to Carto.')
        else:
            logging.info('No rows to add to Carto.')
    '''
    Upload original data and processed data to Amazon S3 storage
Exemplo n.º 19
0
def processInteractions(markets_updated):
    num_new_interactions = 0
    #new_rows = []
    if PROCESS_HISTORY_INTERACTIONS == True:
        # get all markets
        logging.info('Processing interactions for all ALPS data')
        markets_to_process = getIds(CARTO_MARKET_TABLE, 'uid')

    else:
        logging.info('Getting IDs of interactions that should be updated')
        r = cartosql.getFields(
            ['region_id', 'market_id', 'market_name'],
            CARTO_INTERACTION_TABLE,
            where="{} < current_date - interval '{}' month".format(
                INTERACTION_TIME_FIELD, LOOKBACK),
            f='csv',
            user=os.getenv('CARTO_USER'),
            key=os.getenv('CARTO_KEY'))
        old_ids = [market.split(',') for market in r.text.split('\r\n')[1:-1]]
        old_market_uids = [
            genMarketUID(old_id[0], old_id[1], old_id[2]) for old_id in old_ids
        ]

        logging.info(
            'Processing interactions for new ALPS data and re-processing interactions that are out of date'
        )
        markets_to_process = np.unique(markets_updated + old_market_uids)
    #go through each market that was updated and create the correct rows for them
    num_markets = len(markets_to_process)
    market_num = 1
    logging.info('{} markets to update interactions for'.format(
        len(markets_to_process)))
    for m_uid in markets_to_process:
        logging.info('processing {} out of {} markets'.format(
            market_num, len(markets_to_process)))
        new_rows = []
        for food_category, sql_query in CATEGORIES.items():
            try_num = 1
            while try_num <= 3:
                try:
                    #logging.info('Processing interaction for {} at uid {}, try number {} (market {} of {})'.format(food_category, m_uid, try_num, market_num, num_markets))
                    # get information about market
                    r = cartosql.get("SELECT * FROM {} WHERE uid='{}'".format(
                        CARTO_MARKET_TABLE, m_uid),
                                     user=os.getenv('CARTO_USER'),
                                     key=os.getenv('CARTO_KEY'))
                    if r.json()['total_rows'] == 0:
                        #logging.info('No rows for interaction')
                        alps_entries = []
                        break
                    market_entry = r.json()['rows'][0]

                    # get information about food prices at market
                    # SQL gets most recent entry for each commodity at each market that is NOT a forecast
                    request = "SELECT DISTINCT ON (mktid, cmname) * FROM {table} WHERE mktid={market_id} AND mktname='{market_name}' AND adm1id={region_id} AND category LIKE '{cat_name}' AND date > current_date - interval '{x}' month AND forecast = 'False' ORDER  BY mktid, cmname, date desc".format(
                        table=CARTO_ALPS_TABLE,
                        market_id=market_entry['market_id'],
                        market_name=market_entry['market_name'].replace(
                            "'", "''"),
                        region_id=market_entry['region_id'],
                        cat_name=sql_query,
                        x=LOOKBACK)
                    r = cartosql.get(request,
                                     user=os.getenv('CARTO_USER'),
                                     key=os.getenv('CARTO_KEY'))
                    alps_entries = r.json()['rows']
                    break
                except:
                    try_num += 1
            uid = genInteractionUID(market_entry['region_id'],
                                    market_entry['market_id'],
                                    market_entry['market_name'], food_category)
            if alps_entries:
                commodity_num = 1
                for entry in alps_entries:
                    if commodity_num == 1:
                        interaction_string = INTERACTION_STRING_FORMAT.format(
                            num=commodity_num,
                            commodity=entry['cmname'],
                            alps=entry['alps'].lower(),
                            date=entry['date'][:10])
                    else:
                        interaction_string = interaction_string + '; ' + INTERACTION_STRING_FORMAT.format(
                            num=commodity_num,
                            commodity=entry['cmname'],
                            alps=entry['alps'].lower(),
                            date=entry['date'][:10])
                    commodity_num += 1
                # create new Carto row
                row = []
                for field in CARTO_INTERACTION_SCHEMA.keys():
                    if field == 'uid':
                        row.append(uid)
                    elif field == 'market_id':
                        row.append(int(market_entry['market_id']))
                    elif field == 'the_geom':
                        shapely_point = wkb.loads(market_entry['the_geom'],
                                                  hex=True)
                        json_point = json.loads(
                            json.dumps(
                                shapely.geometry.mapping(shapely_point)))
                        row.append(json_point)
                    elif field == 'region_name':
                        row.append(market_entry['region_name'])
                    elif field == 'region_id':
                        row.append(market_entry['region_id'])
                    elif field == 'market_name':
                        row.append(market_entry['market_name'])
                    elif field == 'market_interaction':
                        if len(alps_entries) == 0:
                            row.append(None)
                        else:
                            row.append(interaction_string)
                    elif field == 'category':
                        row.append(food_category)
                    elif field == 'highest_pewi':
                        if len(alps_entries) == 0:
                            row.append(None)
                        else:
                            highest_pewi = max(
                                [entry['pewi'] for entry in alps_entries])
                            row.append(highest_pewi)
                    elif field == 'highest_alps':
                        if len(alps_entries) == 0:
                            row.append(None)
                        else:
                            highest_alps_category = assignALPS(highest_pewi)
                            row.append(highest_alps_category)
                    elif field == INTERACTION_TIME_FIELD:
                        if len(alps_entries) == 0:
                            row.append(None)
                        else:
                            row.append(
                                min(entry['date'] for entry in alps_entries))
                new_rows.append(row)
                num_new_interactions += 1
            #delete old entries for the markets that were updated
            #logging.info('Deleting old interactions from Carto')
            try:
                cartosql.deleteRows(CARTO_INTERACTION_TABLE,
                                    "{} = '{}'".format(UID_FIELD, uid),
                                    user=os.getenv('CARTO_USER'),
                                    key=os.getenv('CARTO_KEY'))
            except:
                pass
        #cartosql.deleteRowsByIDs(CARTO_INTERACTION_TABLE, markets_to_process, id_field='market_id')
        #send new rows for these markets
        #logging.info('Sending new interactions to Carto')
        if new_rows:
            cartosql.insertRows(CARTO_INTERACTION_TABLE,
                                CARTO_INTERACTION_SCHEMA.keys(),
                                CARTO_INTERACTION_SCHEMA.values(),
                                new_rows,
                                blocksize=500,
                                user=os.getenv('CARTO_USER'),
                                key=os.getenv('CARTO_KEY'))
        market_num += 1
    return num_new_interactions