def harvest_citations(dois=None):
    """Harvest citations, optionally restricted to a comma-separated list of DOIs."""
    if dois is not None:
        dois = dois.split(',')
    db = db_connect()
    get_citations(db, dois)

def fetch_trips(args):
    """Fetches the trips from all o cells to all d cells that are used for the
    OD flow calculation.

    Args:
        args: a 2-tuple (o, d), where o is a list of the origin cell ids and
            d a list of the destination cell ids

    Returns:
        A list of ((origin, destination), (trip_scale_factor, start_time,
        end_time, start_interval_end)) tuples
    """
    o, d = args  # arguments are passed as a tuple due to pool.map() limitations
    start = time.time()
    conn = util.db_connect()
    cur = conn.cursor()
    # fetch all trips
    cur.execute(open("SQL/03_Scaling_OD/fetch_trips.sql", 'r').read(),
                {"weekdays": WEEKDAYS, "speed": SPEED, "orig_cells": o, "dest_cells": d})
    result = []
    for origin, destination, trip_scale_factor, start_time, end_time, start_interval_end in cur.fetchall():
        result.append(((origin, destination),
                       (trip_scale_factor, start_time, end_time, start_interval_end)))
    conn.close()
    return result

def harvest_all():
    """Harvest commits, citations, mentions, and metadata."""
    db = db_connect()
    dois = None
    get_commits()
    get_citations(db, dois)
    get_mentions(since_version=None)
    list_records(dois)

def build_df(adm_level, iso, extent_year):
    print('starting loss for adm level {}'.format(adm_level))
    conn = util.db_connect()

    field_list = util.level_lkp(adm_level)
    field_text = ', '.join(field_list)

    sql = 'SELECT {}, thresh, year, sum(area) as area FROM loss '.format(field_text)
    if iso:
        sql += "WHERE iso = '{}' ".format(iso)
    sql += 'GROUP BY {}, thresh, year'.format(field_text)

    df = pd.read_sql(sql, conn)
    df = util.add_lookup(df, adm_level, conn)

    # Create expression to come up with a combined field name
    # if iso, just Country, if adm1, Country_Adm1_Name, etc
    df['Country_Index'] = eval(util.country_text_lookup(adm_level))

    # rename the 'year' column to the name that we'll need for our summary output table
    column_name_dict = {0: 'Country', 1: 'Country_Subnat1', 2: 'Country_Subnat1_Subnat2'}
    summary_col_name = column_name_dict[adm_level]
    df.rename(columns={'year': summary_col_name}, inplace=True)

    is_first = True
    for thresh in [10, 15, 20, 25, 30, 50, 75]:
        df_subset = df[df.thresh == thresh].copy()
        df_subset['All areas are in hectares'] = 'TREE COVER LOSS (>{}% CANOPY COVER)'.format(thresh)

        df_pivot = df_subset.pivot_table(index=['Country_Index', 'All areas are in hectares'],
                                         columns=summary_col_name, values='area')
        df_pivot['TOTAL 2001-2017'] = df_pivot.sum(axis=1)

        df_pivot = df_pivot.unstack('All areas are in hectares')
        df_pivot = df_pivot.swaplevel(0, 1, axis=1)
        del df_pivot.index.name

        if is_first:
            output_df = df_pivot.copy()
            is_first = False
        else:
            output_df = pd.concat([output_df, df_pivot], axis=1, join_axes=[output_df.index])

    sheet_name_dict = {0: 'Country', 1: 'Subnat1', 2: 'Subnat2'}
    sheet_name = 'Loss (2001-2017) by {}'.format(sheet_name_dict[adm_level])

    return sheet_name, output_df

def build_df(adm_level, iso, extent_year):
    print('starting extent{} for adm level {}'.format(extent_year, adm_level))

    field_list = util.level_lkp(adm_level)
    field_text = ', '.join(field_list)

    sql = 'SELECT {}, thresh, sum(area) as area FROM extent{} '.format(field_text, extent_year)
    if iso:
        sql += "WHERE iso = '{}' ".format(iso)
    sql += 'GROUP BY {}, thresh'.format(field_text)

    conn = util.db_connect()
    df = pd.read_sql(sql, conn)
    df = util.add_lookup(df, adm_level, conn)

    # remove thresh 0 values
    df = df[df.thresh != 0]

    # Create expression to come up with a combined field name
    # if iso, just Country, if adm1, Country_Adm1_Name, etc
    df['Country_Index'] = eval(util.country_text_lookup(adm_level))

    # Group by Country and thresh, sum area
    df = df.groupby(['Country_Index', 'thresh'])['area'].sum().reset_index()

    # Add larger index for merged column in output excel sheet
    df['All areas are in hectares'] = 'TREE COVER ({}) BY PERCENT CANOPY COVER'.format(extent_year)

    # convert int thresh to labeled thresh percent
    df['thresh'] = df.apply(lambda row: '>{}%'.format(str(row['thresh'])), axis=1)

    # rename the 'thresh' column to the name that we'll need for our summary output table
    column_name_dict = {0: 'Country', 1: 'Country_Subnat1', 2: 'Country_Subnat1_Subnat2'}
    summary_col_name = column_name_dict[adm_level]
    df.rename(columns={'thresh': summary_col_name}, inplace=True)

    # pivot and remove column where thresh == 0
    df_pivot = df.pivot_table(index=['Country_Index', 'All areas are in hectares'],
                              columns=summary_col_name, values='area')

    df_pivot = df_pivot.unstack('All areas are in hectares')
    df_pivot = df_pivot.swaplevel(0, 1, axis=1)
    del df_pivot.index.name

    sheet_name_dict = {0: 'Country', 1: 'Subnat1', 2: 'Subnat2'}
    sheet_name = 'Extent ({}) by {}'.format(extent_year, sheet_name_dict[adm_level])

    return sheet_name, df_pivot

def build_df(adm_level, iso, extent_year):
    print('starting gain for adm level {}'.format(adm_level))

    field_list = util.level_lkp(adm_level)
    field_text = ', '.join(field_list)

    sql = 'SELECT {}, sum(area) as area FROM gain '.format(field_text)
    if iso:
        sql += "WHERE iso = '{}' ".format(iso)
    sql += 'GROUP BY {}'.format(field_text)

    conn = util.db_connect()
    df = pd.read_sql(sql, conn)
    df = util.add_lookup(df, adm_level, conn)

    # dealing with raw data, so need to convert it
    # also change column name for output spreadsheet
    df['area'] = df.area / 10000

    # Create expression to come up with a combined field name
    # if iso, just Country, if adm1, Country_Adm1_Name, etc
    df['Country_Index'] = eval(util.country_text_lookup(adm_level))

    # group and sum just in case, and to remove additional columns
    df = df.groupby(['Country_Index'])['area'].sum().reset_index()

    # Add larger index for merged column in output excel sheet
    df['All areas are in hectares'] = 'TREE COVER GAIN'
    df['Country'] = '(>50% CANOPY COVER)'

    # Pivot so that these columns (which all have the same values) go to the top of the DF as indices
    df = df.pivot_table(columns=['All areas are in hectares', 'Country'], index='Country_Index')

    # remove extraneous area level
    df.columns = df.columns.droplevel(0)

    # remove the index name level as well
    del df.index.name

    sheet_name_dict = {0: 'Country', 1: 'Subnat1', 2: 'Subnat2'}
    sheet_type = sheet_name_dict[adm_level]
    sheet_name = 'Gain (2001-2012) by {}'.format(sheet_type)

    return sheet_name, df

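# The three build_df variants above (loss, extent, gain) each return a (sheet_name, df)
# pair shaped for a merged-header Excel summary, per the "output excel sheet" comments.
# A minimal sketch of how such pairs might be written out with pandas; the driver function,
# the output filename, and the default extent_year are assumptions for illustration, not
# part of the original code.
import pandas as pd

def write_summary(build_fn, iso=None, extent_year=2000, out_path='summary.xlsx'):
    # one sheet per admin level: 0 = Country, 1 = Subnat1, 2 = Subnat2
    with pd.ExcelWriter(out_path) as writer:
        for adm_level in (0, 1, 2):
            sheet_name, df = build_fn(adm_level, iso, extent_year)
            # the multi-level columns from pivot_table/unstack become merged header rows
            df.to_excel(writer, sheet_name=sheet_name)
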
def upload_taz(feature):
    """Uploads a TAZ polygon to the database.

    Args:
        feature: a geojson feature dict containing a TAZ polygon
    """
    conn = util.db_connect()
    cur = conn.cursor()
    taz_id, linestr = util.parse_taz(feature)
    sql = "INSERT INTO taz (taz_id, geom) \
           SELECT %(taz_id)s, ST_SetSRID(ST_MakePolygon(ST_GeomFromText(%(linestr)s)), 4326);"
    cur.execute(sql, {"taz_id": taz_id, "linestr": linestr})
    conn.commit()

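# A sketch of calling upload_taz() with a GeoJSON feature. The feature layout shown here
# (a 'taz_id' property plus a closed Polygon ring) is an assumption for illustration only;
# the exact structure util.parse_taz() expects is not visible in the snippet above.
example_feature = {
    "type": "Feature",
    "properties": {"taz_id": 42},
    "geometry": {
        "type": "Polygon",
        "coordinates": [[[8.54, 47.37], [8.55, 47.37], [8.55, 47.38], [8.54, 47.37]]],
    },
}
upload_taz(example_feature)
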
def fetch_timedist():
    """Fetches the time distribution of trips from the database by counting the
    number of starting trips in every 10min interval of the day.

    Returns:
        A list of size 24*6 containing the number of trips that start in every
        10min interval of the day.
    """
    conn = util.db_connect()
    cur = conn.cursor()
    cur.execute(open("SQL/03_Scaling_OD/trip_timedist.sql", 'r').read(),
                {"weekdays": WEEKDAYS, "speed": SPEED, "maxinterval": MAX_INTERVAL})
    timedist = [None] * (24 * 6)
    for interval, count in cur.fetchall():
        timedist[int(interval)] = count
    conn.close()
    return timedist

def init():
    global conn, cur
    conn = util.db_connect()
    cur = conn.cursor()

def setUp(self):
    self.conn = util.db_connect()
    self.cur = self.conn.cursor()

import psycopg2
import util, config  # local modules

util.db_login()
conn = util.db_connect()
cur = conn.cursor()

# create backup copy eant_pos_original that keeps all antennas even when eant_pos is clustered
print("Creating backup table eant_pos_original (takes a while)...")
cur.execute("DROP TABLE IF EXISTS eant_pos_original CASCADE")
conn.commit()
cur.execute("CREATE TABLE eant_pos_original (LIKE eant_pos);")
cur.execute("ALTER TABLE eant_pos_original ADD CONSTRAINT eant_pos_original_pkey PRIMARY KEY (id);")
cur.execute("INSERT INTO eant_pos_original SELECT * FROM eant_pos;")
conn.commit()

# create backup copy ehomebase_original before clustering
print("Creating backup table ehomebase_original (takes a while)...")
cur.execute("DROP TABLE IF EXISTS ehomebase_original CASCADE")
conn.commit()
cur.execute("CREATE TABLE ehomebase_original (LIKE ehomebase);")
cur.execute("ALTER TABLE ehomebase_original ADD CONSTRAINT ehomebase_original_pkey PRIMARY KEY (id);")
cur.execute("INSERT INTO ehomebase_original SELECT * FROM ehomebase;")
conn.commit()

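# The two blocks above repeat the same DROP / CREATE ... (LIKE ...) / PRIMARY KEY /
# INSERT ... SELECT pattern. A possible refactoring sketch, not part of the original
# script; table names are interpolated into SQL, so they must come from trusted code
# rather than user input.
def backup_table(cur, conn, table):
    backup = "{}_original".format(table)
    print("Creating backup table {} (takes a while)...".format(backup))
    cur.execute("DROP TABLE IF EXISTS {} CASCADE".format(backup))
    conn.commit()
    cur.execute("CREATE TABLE {} (LIKE {});".format(backup, table))
    cur.execute("ALTER TABLE {} ADD CONSTRAINT {}_pkey PRIMARY KEY (id);".format(backup, backup))
    cur.execute("INSERT INTO {} SELECT * FROM {};".format(backup, table))
    conn.commit()

# backup_table(cur, conn, "eant_pos")
# backup_table(cur, conn, "ehomebase")
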
""" https://developer.github.com/v3/repos/commits/ """ import requests import concurrent.futures import os import re from dateutil import parser import datetime from util import db_connect import logging import pymongo db = db_connect() logger = logging.getLogger(__name__) class APIRateLimitExceeded(Exception): pass class Repo: def __init__(self, url): self.url = url self.name = re.match(r'https?://github.com/(.*?)/?$', url).group(1) self.new_commits = None try: last_commit_date = db.commit.find({'repositoryURL': url}).sort('date', pymongo.DESCENDING)[0]['date'] # add one second self.synced_until = (parser.parse(last_commit_date) + datetime.timedelta(0, 1)).isoformat()[0:-6] + 'Z' except IndexError:
def calculate_od(args):
    """Calculates the OD flows for one OD-pair and inserts into od table.

    Args:
        args: tuple (od, values) where od is a tuple of origin and destination
            cell ids (o, d) and values a list of tuples
            (trip_scale_factor, start_time, end_time, start_interval_end)
    """
    global timedist
    od, values = args
    o, d = od
    trip_scale_factors, start_times, end_times, start_interval_ends = [list(x) for x in zip(*values)]

    # calculate time distribution
    no_trips = [0] * 24*6  # 10min intervals
    for i in range(0, len(start_times)):
        start_interval_length = start_interval_ends[i] - start_times[i]  # uncertainty in start time in minutes
        if start_interval_length < 0:
            # This trip must have been made with a speed higher than
            # 50km/h since the computed end time is before the start
            # time. Skipping...
            continue
        elif start_interval_length < MAX_INTERVAL:
            # only count trips for time dist with precise trip start info
            interval = int(((start_interval_ends[i]/60 - start_interval_length/2) % (24*60))/10)  # 10min intervals
            no_trips[interval] += 1

    # calculate OD flows
    flows = numpy.array([0.0] * 24)
    for i in range(0, len(start_times)):
        start_interval = int(((start_times[i]/60) % (24*60))/10)  # 10min intervals
        end_interval = int(((start_interval_ends[i]/60) % (24*60))/10)  # 10min intervals
        scale_factor = trip_scale_factors[i]
        if scale_factor == 0:
            scale_factor = 1  # no scale factor for this user, count as 1 trip
        weight = [0.0] * 24*6  # weights for the trip in 10min intervals
        weight_function = lambda trips, dist: scale_factor*float(trips)/float(sum(dist[start_interval:end_interval+1]))  # scale * trips/total_trips
        if sum(no_trips[start_interval:end_interval+1]) >= OD_SPECIFIC_TIMEDIST_THRESHOLD:
            # enough time dist info available for this OD
            weight[start_interval:end_interval+1] = [weight_function(trips, no_trips) for trips in no_trips[start_interval:end_interval+1]]
        else:
            # otherwise use timedist for all trips
            weight[start_interval:end_interval+1] = [weight_function(trips, timedist) for trips in timedist[start_interval:end_interval+1]]
        weight_hours = numpy.array([sum(weight[6*hour:6*hour+6]) for hour in range(0, 24)])
        flows += weight_hours  # add this (scaled) trip to the od flow

    # Upload OD flows to DB
    data = []
    for interval in range(0, 24):
        if flows[interval] > 0:
            data.append((o, d, interval, flows[interval]))
    conn = util.db_connect()
    cur = conn.cursor()
    rows = [cur.mogrify("(%s, %s, %s, %s)", values) for values in data]
    if len(rows) > 0:
        sql = "INSERT INTO od (orig_cell, dest_cell, interval, flow) \
               VALUES " + ", ".join(rows) + ";"
        cur.execute(sql)
        conn.commit()
    cur.close()
    end = time.time()

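# Sketch of how the OD-scaling pieces above could be wired together: fetch_timedist()
# fills the global time distribution, fetch_trips() pulls trips for a pair of cell lists,
# the rows are grouped per OD pair, and calculate_od() is mapped over the pairs with a
# multiprocessing pool (the tuple arguments exist because pool.map() passes one argument).
# The driver function and the reliance on fork-based workers inheriting the timedist
# global are assumptions for illustration, not part of the original code.
from collections import defaultdict
from multiprocessing import Pool

def run_scaling(orig_cells, dest_cells):
    global timedist
    timedist = fetch_timedist()  # fallback start-time weighting for sparse OD pairs
    trips = fetch_trips((orig_cells, dest_cells))
    per_od = defaultdict(list)
    for od, values in trips:
        per_od[od].append(values)
    pool = Pool()  # workers inherit timedist on fork-based platforms
    pool.map(calculate_od, per_od.items())
    pool.close()
    pool.join()
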
def initiate_connection(username, cluster):
    """
    Initiate connection with Redshift cluster

    @param username: master username from replay.yaml
    @param cluster: cluster dictionary
    """
    response = None
    logger = logging.getLogger("SimpleReplayLogger")

    if cluster.get("is_serverless"):
        secret_name = get_secret(cluster.get('secret_name'), cluster.get("region"))
        response = {
            'DbUser': secret_name["admin_username"],
            'DbPassword': secret_name["admin_password"]
        }
    else:
        rs_client = client('redshift', region_name=cluster.get("region"))
        # get response from redshift to get cluster credentials using provided cluster info
        try:
            response = rs_client.get_cluster_credentials(
                DbUser=username,
                DbName=cluster.get("database"),
                ClusterIdentifier=cluster.get("id"),
                DurationSeconds=900,
                AutoCreate=False,
            )
        except rs_client.exceptions.ClusterNotFoundFault:
            logger.error(
                f"Cluster {cluster.get('id')} not found. Please confirm cluster endpoint, account, and region."
            )
            exit(-1)
        except Exception as e:
            logger.error(
                f"Unable to connect to Redshift. Confirm IAM permissions include Redshift::GetClusterCredentials."
                f" {e}")
            exit(-1)

    if response is None or response.get('DbPassword') is None:
        logger.error(f"Failed to retrieve credentials for user {username}")
        exit(-1)

    # define cluster string/dict
    cluster_string = {
        "username": response["DbUser"],
        "password": response["DbPassword"],
        "host": cluster.get("host"),
        "port": cluster.get("port"),
        "database": cluster.get("database"),
    }

    conn = None
    try:
        logger.info(f"Connecting to {cluster.get('id')}")
        conn = db_connect(
            host=cluster_string["host"],
            port=int(cluster_string["port"]),
            username=cluster_string["username"],
            password=cluster_string["password"],
            database=cluster_string["database"])
        # yield to reuse connection
        yield conn
    except redshift_connector.error.Error as e:
        logger.error(f"Unable to connect to Redshift. Please confirm credentials. {e}")
        exit(-1)
    except Exception as e:
        logger.error(f"Unable to connect to Redshift. {e}")
        exit(-1)
    finally:
        if conn is not None:
            conn.close()

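# Because initiate_connection() yields the connection rather than returning it, it is
# presumably consumed as a context manager so the finally block can close the connection.
# A usage sketch; wrapping with contextlib.contextmanager (the decorator is not visible
# in the snippet) and the cluster dict contents are assumptions for illustration.
from contextlib import contextmanager

connect = contextmanager(initiate_connection)

cluster = {
    "is_serverless": False,
    "id": "my-cluster",  # hypothetical cluster identifier
    "host": "my-cluster.example.redshift.amazonaws.com",
    "port": 5439,
    "database": "dev",
    "region": "us-east-1",
}

with connect("awsuser", cluster) as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT 1")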