def get_last_timestamp(results_table_name):
    """Return the most recent ``timestamp`` stored in the results table.

    :param results_table_name: name of the psql results table
    :return: latest timestamp value, or 0 when the table is empty
        (0 signals the caller to start a complete import)
    """
    # create db object
    p_con = psqlDB()
    # query last timestamp from psql
    timestamp_query = '''
        SELECT
          timestamp
        FROM
          {}
        ORDER BY timestamp DESC
        LIMIT 1
        '''
    # table names cannot be passed as query parameters -> sql.Identifier
    timestamp_query = sql.SQL(timestamp_query).format(
        sql.Identifier(results_table_name))
    data = p_con.retr_query(timestamp_query, None)
    # empty result set --> no timestamp stored yet --> full import from 0
    timestamp_psql = data[0][0] if data else 0
    # delete/close db connection
    p_con.close()
    return timestamp_psql
def create_all_results_per_user(tasks_per_user, raw_results):
    """Join per-user tasks with raw results into ``<raw_results>_all``.

    Tasks with no matching result row get result = 0 (LEFT JOIN + CASE).
    The new table is indexed on task_id and user_id.

    :param tasks_per_user: name of the per-user tasks table
    :param raw_results: name of the raw results table
    :return: name of the created table
    """
    p_con = psqlDB()
    input_table_name_a = tasks_per_user
    input_table_name_b = raw_results
    output_table_name = raw_results + '_all'
    index_task_id = raw_results + '_all_task_id_index'
    index_user_id = raw_results + '_all_user_id_index'
    # NOTE: the original also built an unused group_id index name; removed
    sql_insert = '''
        DROP TABLE IF EXISTS {};
        CREATE TABLE {} AS
        SELECT
          b.task_id
          ,b.user_id
          ,b.project_id
          --,b.group_id
          ,CASE WHEN r.result > 0 THEN r.result ELSE 0 END as result
          ,(b.group_timestamp / 1000)::int as group_timestamp
          ,b.task_geom
        FROM
          {} as b
          LEFT JOIN {} as r ON (
            b.user_id = r."userId"
            AND b.task_id = r."taskId"
            AND b.project_id = r."projectId");
        DROP INDEX IF EXISTS {};
        CREATE INDEX {} ON {} USING btree (task_id);
        DROP INDEX IF EXISTS {};
        CREATE INDEX {} ON {} USING btree (user_id);
        '''
    sql_insert = sql.SQL(sql_insert).format(
        sql.Identifier(output_table_name),
        sql.Identifier(output_table_name),
        sql.Identifier(input_table_name_a),
        sql.Identifier(input_table_name_b),
        sql.Identifier(index_task_id),
        sql.Identifier(index_task_id),
        sql.Identifier(output_table_name),
        sql.Identifier(index_user_id),
        sql.Identifier(index_user_id),
        sql.Identifier(output_table_name),
    )
    p_con.query(sql_insert, None)
    print('created: %s' % output_table_name)
    del p_con
    return output_table_name
def save_users_psql(users_filename, users_table_name):
    """Load user data from a ';'-separated csv file into psql.

    The target table is dropped and recreated, the csv is bulk-copied
    via COPY, and the csv file is deleted afterwards.

    :param users_filename: path of the csv file (deleted on success)
    :param users_table_name: name of the psql table to (re)create
    """
    columns = ('userid', 'distance', 'contributions', 'username')
    # create table for user data
    p_con = psqlDB()
    sql_insert = '''
        DROP TABLE IF EXISTS {} CASCADE;
        CREATE TABLE {} (
          userid character varying,
          distance integer DEFAULT 0,
          contributions integer DEFAULT 0,
          username character varying,
          CONSTRAINT pk_user_id PRIMARY KEY (userid)
        )
        '''
    sql_insert = sql.SQL(sql_insert).format(sql.Identifier(users_table_name),
                                            sql.Identifier(users_table_name))
    p_con.query(sql_insert, None)
    # copy data from csv file to psql table; the context manager makes sure
    # the file handle is closed even when copy_from raises
    with open(users_filename, 'r') as users_file:
        p_con.copy_from(users_file, users_table_name, sep=';',
                        columns=columns)
    os.remove(users_filename)
    p_con.close()
    return
def get_existing_projects(project_list, project_table_name):
    """Return stored project information for the given project ids.

    :param project_list: list of project ids to look up
    :param project_table_name: name of the psql projects table
    :return: dict mapping project id -> project record (keys become
        strings through the json round trip below)
    """
    existing_projects = {}
    p_con = psqlDB()
    # each row is converted to json format using psql function row_to_json
    sql_insert = '''
        SELECT
          row_to_json({})
        FROM
          {}
        WHERE id = ANY(%s)
        '''
    sql_insert = sql.SQL(sql_insert).format(sql.Identifier(project_table_name),
                                            sql.Identifier(project_table_name))
    data = (project_list, )
    retr_data = p_con.retr_query(sql_insert, data)
    p_con.close()
    for row in retr_data:
        project = row[0]
        existing_projects[project["id"]] = project
    # NOTE(review): the dumps/loads round trip normalizes dict keys to
    # strings and values to plain json types — presumably to match data
    # loaded from json elsewhere; confirm before removing it
    existing_projects = json.loads(json.dumps(existing_projects))
    return existing_projects
def check_tasks(project_id, task_table_name):
    """Check whether at least one task row exists for *project_id*.

    :param project_id: id of the project
    :param task_table_name: base name of the task table; the actual table
        is ``<task_table_name>_<project_id>``
    :return: True when a task row exists, False otherwise (including when
        the per-project table does not exist yet)
    """
    task_table_name = task_table_name + '_{}'.format(project_id)
    p_con = psqlDB()
    sql_insert = '''
        SELECT
          taskid
        FROM
          {}
        WHERE
          projectid = %s
        LIMIT 1
        '''
    sql_insert = sql.SQL(sql_insert).format(sql.Identifier(task_table_name))
    data = [project_id]
    try:
        task = p_con.retr_query(sql_insert, data)
        return len(task) == 1
    except Exception:
        # most likely the per-project table does not exist (yet)
        return False
    finally:
        # the original leaked the connection on the error path
        p_con.close()
def update_project_info(project_table_name, new_project):
    """Write the mutable project attributes back to the projects table.

    Only columns that can change over a project's lifetime are updated
    (contributors, progress, state, isFeatured, corrupt, lastcheck).

    :param project_table_name: name of the psql projects table
    :param new_project: dict holding the fresh project attributes
    """
    db_connection = psqlDB()
    update_statement = '''
        UPDATE {}
        SET
          contributors = %s
          ,progress = %s
          ,state = %s
          ,isFeatured = %s
          ,corrupt = %s
          ,lastcheck = %s
        WHERE
          id = %s
        '''
    update_statement = sql.SQL(update_statement).format(
        sql.Identifier(project_table_name))
    # parameter order must match the SET/WHERE placeholders above
    fields = ('contributors', 'progress', 'state', 'isFeatured',
              'corrupt', 'last_check', 'id')
    values = [new_project[key] for key in fields]
    db_connection.query(update_statement, values)
    db_connection.close()
    return
def create_projects_table():
    """One-time setup: enable the postgis extension and create the
    ``projects`` table (errors if either already exists)."""
    db_connection = psqlDB()
    ddl = '''
        CREATE EXTENSION postgis;
        CREATE TABLE projects (
          id INT NOT NULL
          ,contributors INT NOT NULL
          ,groupAverage DOUBLE PRECISION NOT NULL
          ,image CHARACTER VARYING NOT NULL
          ,importKey CHARACTER VARYING NOT NULL
          ,isFeatured BOOLEAN NOT NULL
          ,lookFor CHARACTER VARYING NOT NULL
          ,name CHARACTER VARYING NOT NULL
          ,progress INT NOT NULL
          ,projectDetails CHARACTER VARYING NOT NULL
          ,state INT NOT NULL
          ,verificationCount INT NOT NULL
          ,corrupt BOOLEAN NOT NULL
          ,lastCheck TIMESTAMP WITHOUT TIME ZONE
          ,extent geometry
          ,centroid geometry
          ,CONSTRAINT pk_project_id PRIMARY KEY (id)
        );
        '''
    db_connection.query(ddl, None)
    print('created table: projects')
    db_connection.close()
def select_data_for_project(table_name, projectid):
    """Copy all rows belonging to one project into a per-project table.

    Creates ``<table_name>_<projectid>`` (dropping any previous version)
    from the rows of *table_name* whose projectid matches.

    :return: name of the created per-project table
    """
    src_table = table_name
    dst_table = '{}_{}'.format(src_table, projectid)
    statement = '''
        DROP TABLE IF EXISTS {};
        CREATE TABLE {} AS
        SELECT
          *
        FROM
          {}
        WHERE
          projectid = %s'''
    statement = sql.SQL(statement).format(sql.Identifier(dst_table),
                                          sql.Identifier(dst_table),
                                          sql.Identifier(src_table))
    db_connection = psqlDB()
    db_connection.query(statement, [str(projectid)])
    print('created: %s' % dst_table)
    del db_connection
    return dst_table
def create_table_all_redundant_tasks():
    """(Re)create the empty ``all_redundant_tasks`` collector table."""
    db_connection = psqlDB()
    ddl = '''
        DROP TABLE IF EXISTS all_redundant_tasks;
        CREATE TABLE all_redundant_tasks (
          task_id character varying,
          project_id integer,
          real_completed_count bigint,
          task_geom geometry,
          agreement_1 numeric,
          agreement_2 numeric,
          agreement_3 numeric,
          agreement_4 numeric,
          agreement_5 numeric,
          agreement_6 numeric,
          msi_1 numeric,
          msi_2 numeric,
          msi_3 numeric,
          msi_4 numeric,
          msi_5 numeric,
          msi_6 numeric,
          no_si_1 numeric,
          no_si_2 numeric,
          no_si_3 numeric,
          no_si_4 numeric,
          no_si_5 numeric,
          no_si_6 numeric
        )
        '''
    db_connection.query(ddl, None)
    del db_connection
def calc_agreement_from_results(table_name):
    """Derive crowd answer class, agreement, msi and no_si per task.

    The input table must provide the columns:
    task_id, completed_count, yes_count, maybe_count,
    badimage_count, no_count

    class: 1 when yes+maybe share > 0.3, else 0 when "no" dominates,
    else 2 when "bad image" dominates, 9 as fallback.
    agreement: 1.0 for a single contributor, otherwise a per-task
    agreement score from the squared answer counts
    (NOTE(review): resembles the per-item term of Fleiss' kappa — confirm).
    msi / no_si: yes+maybe share and "no" share of completed_count.

    Creates ``<table_name>_agreement`` and returns its name.
    """
    p_con = psqlDB()
    sql_insert = '''
        DROP TABLE IF EXISTS {};
        CREATE TABLE {} AS
        SELECT
          b.*
          ,CASE
            WHEN (b.yes_count + b.maybe_count)/b.completed_count::numeric > 0.3 THEN 1
            WHEN b.no_count >= b.badimage_count THEN 0
            WHEN b.badimage_count > b.no_count THEN 2
            ELSE 9
          END as class
          ,CASE
            WHEN b.completed_count = 1 THEN 1.0
            ELSE (
              round(((1.0 / (b.completed_count::numeric * (b.completed_count::numeric - 1.0))) * (
                ((b.yes_count::numeric ^ 2.0) - b.yes_count::numeric)
                + ((b.maybe_count::numeric ^ 2.0) - b.maybe_count::numeric)
                + ((b.badimage_count::numeric ^ 2.0) - b.badimage_count::numeric)
                + ((b.no_count::numeric ^ 2.0) - b.no_count::numeric)
              )),3)
            )
          END as agreement
          ,round(((b.yes_count::numeric + b.maybe_count::numeric)/b.completed_count::numeric),3) as msi
          ,round((b.no_count::numeric/b.completed_count::numeric),3) as no_si
        FROM
          {} as b'''
    input_table_name = table_name
    output_table_name = table_name + '_agreement'
    # we need to pass table names using the psycopg2 SQL module
    sql_insert = sql.SQL(sql_insert).format(sql.Identifier(output_table_name),
                                            sql.Identifier(output_table_name),
                                            sql.Identifier(input_table_name))
    p_con.query(sql_insert, None)
    print('created: %s' % output_table_name)
    del p_con
    return output_table_name
def create_results_psql(results_csv_filename, results_table_name):
    """Bulk-load results from csv and upsert them into the results table.

    The csv is copied into a temporary ``raw_<results_table_name>`` table,
    the rows are upserted into *results_table_name*, the temporary table
    is dropped and the csv file deleted.

    :param results_csv_filename: path of the csv file (deleted on success)
    :param results_table_name: name of the target psql results table
    """
    columns = ('taskId', 'userId', 'projectId', 'timestamp', 'result',
               'duplicates')
    raw_results_table_name = 'raw_' + results_table_name
    p_con = psqlDB()
    # first, create the temporary raw results table
    sql_insert = '''
        DROP TABLE IF EXISTS {};
        CREATE TABLE {} (
          taskId VARCHAR NOT NULL
          ,userId VARCHAR NOT NULL
          ,projectId INT NOT NULL
          ,timestamp BIGINT NOT NULL
          ,result INT NOT NULL
          ,duplicates INT NOT NULL
          ,CONSTRAINT pk_result_id_raw PRIMARY KEY (taskId, userId, projectId)
        );
        '''
    sql_insert = sql.SQL(sql_insert).format(
        sql.Identifier(raw_results_table_name),
        sql.Identifier(raw_results_table_name))
    p_con.query(sql_insert, None)
    # copy the csv data to psql; the context manager closes the file even
    # when copy_from raises
    with open(results_csv_filename, 'r') as results_file:
        p_con.copy_from(results_file, raw_results_table_name, sep=',',
                        columns=columns)
    os.remove(results_csv_filename)
    print('copied results to temporary psql table.')
    # upsert into the final results table. BUGFIX: use EXCLUDED.* so a
    # conflicting row is overwritten with the NEW values — the original
    # assigned the existing row's own values back (a no-op) and hard-coded
    # the table name 'results'
    sql_insert = '''
        INSERT INTO {} (taskid, userid, projectid, timestamp, result, duplicates)
        SELECT * FROM {} as b
        ON CONFLICT ON CONSTRAINT "pk_result_id" DO UPDATE
        SET (taskid, userid, projectid, timestamp, result, duplicates)
          = (EXCLUDED.taskid, EXCLUDED.userid, EXCLUDED.projectid,
             EXCLUDED.timestamp, EXCLUDED.result, EXCLUDED.duplicates);
        DROP TABLE IF EXISTS {} CASCADE;
        '''
    sql_insert = sql.SQL(sql_insert).format(
        sql.Identifier(results_table_name),
        sql.Identifier(raw_results_table_name),
        sql.Identifier(raw_results_table_name))
    p_con.query(sql_insert, None)
    p_con.close()
    return
def update_completed_count_psql(completed_count_filename, project_id,
                                task_table_name):
    """Load per-group completed counts from csv and update the task table.

    The counts are bulk-copied into a temporary group table, transferred
    onto the project's task table (matched by groupid and projectid), and
    the temporary table and csv file are removed.

    :param completed_count_filename: path of the csv (deleted on success)
    :param project_id: id of the project
    :param task_table_name: base task table name; the actual table is
        ``<task_table_name>_<project_id>``
    """
    columns = ('groupid', 'projectid', 'completedcount')
    raw_group_table_name = 'groups_' + task_table_name + '_{}'.format(
        project_id)
    task_table_name = task_table_name + '_{}'.format(project_id)
    p_con = psqlDB()
    # first, create the temporary table with group id and completed count
    sql_insert = '''
        DROP TABLE IF EXISTS {};
        CREATE TABLE {} (
          groupid integer
          ,projectid integer
          ,completedcount integer
        )
        '''
    sql_insert = sql.SQL(sql_insert).format(
        sql.Identifier(raw_group_table_name),
        sql.Identifier(raw_group_table_name))
    p_con.query(sql_insert, None)
    # copy completed count data to psql; the context manager closes the
    # file even when copy_from raises
    with open(completed_count_filename, 'r') as completed_count_file:
        p_con.copy_from(completed_count_file, raw_group_table_name, sep=';',
                        columns=columns)
    os.remove(completed_count_filename)
    # transfer the counts onto the task table, then drop the temp table
    sql_insert = '''
        UPDATE {} as b
        SET completedcount = a.completedcount
        FROM {} as a
        WHERE
          a.groupid = b.groupid
          AND a.projectid = b.projectid;
        DROP TABLE IF EXISTS {}
        '''
    sql_insert = sql.SQL(sql_insert).format(
        sql.Identifier(task_table_name),
        sql.Identifier(raw_group_table_name),
        sql.Identifier(raw_group_table_name))
    p_con.query(sql_insert, None)
    p_con.close()
    return
def merge_all_results(project_id):
    """Append one project's ``<id>_results_all`` rows into the
    ``all_results_raw`` collector table."""
    source_table = '{}_results_all'.format(project_id)
    insert_statement = sql.SQL('''
        INSERT INTO
          all_results_raw (task_id, user_id, project_id, result,
                           group_timestamp, task_geom)
        SELECT
          s.*
        FROM
          {} as s
        ''').format(sql.Identifier(source_table))
    db_connection = psqlDB()
    db_connection.query(insert_statement, None)
    del db_connection
def merge_all_tasks(project_id):
    """Append one project's ``<id>_results_all_tasks`` rows into the
    ``all_tasks`` collector table."""
    source_table = '{}_results_all_tasks'.format(project_id)
    insert_statement = sql.SQL('''
        INSERT INTO
          all_tasks (task_id, project_id, completed_count, msi, no_si,
                     crowd_answer, agreement, task_geom)
        SELECT
          s.*
        FROM
          {} as s
        ''').format(sql.Identifier(source_table))
    db_connection = psqlDB()
    db_connection.query(insert_statement, None)
    del db_connection
def run_stats_export(table_name, path):
    """Export the content of *table_name* as json to *path*.

    Creates the ``stats_general`` view first when it does not exist yet.
    When *path* contains a directory component, the process changes into
    that directory before writing the file.

    :param table_name: psql table/view whose first row_to_json row is dumped
    :param path: output file path
    """
    # open db connection
    p_con = psqlDB()
    # check if stats_general view is already created and ready to be queried
    check_stats_view = '''
        SELECT EXISTS (
          SELECT 1
          FROM   information_schema.tables
          WHERE  table_schema = 'public'
          AND    table_name = 'stats_general'
        );'''
    check_view = p_con.retr_query(check_stats_view, None)
    if not check_view[0][0]:
        create_stats_general_view()
        print('view does not exist and will be created')
    # separate directory from filename. The original wrapped this in a
    # try/except whose handler printed head/tail before they were bound
    # (NameError inside the handler); ntpath.split needs no guard here.
    head, tail = ntpath.split(path)
    # change into the target directory when one is given
    if head:
        os.chdir(head)
        print('changed dir')
    # query the stats view/table as json
    sql_insert = '''
        SELECT
          row_to_json({})
        FROM
          {}
        '''
    sql_insert = sql.SQL(sql_insert).format(sql.Identifier(table_name),
                                            sql.Identifier(table_name))
    retr_data = p_con.retr_query(sql_insert, None)
    # delete db connection
    p_con.close()
    # write data to file as json
    with open(tail, 'w') as fo:
        json.dump(retr_data[0][0], fo, sort_keys=False, indent=2)
def clean_up_database(delete_table_list):
    """Drop every table named in *delete_table_list* (if it exists).

    :param delete_table_list: iterable of table names to drop
    """
    # one connection for all drops; the original opened a fresh
    # connection per table inside the loop
    p_con = psqlDB()
    for table_name in delete_table_list:
        drop_statement = sql.SQL('''
            DROP TABLE IF EXISTS {};
            ''').format(sql.Identifier(table_name))
        p_con.query(drop_statement, None)
        print('deleted: %s' % table_name)
    del p_con
def save_projects_psql(project_table_name, new_projects):
    """Persist new and updated project information in psql.

    Projects flagged ``isNew`` are inserted; projects flagged
    ``needUpdate`` get their mutable columns updated.

    :param project_table_name: name of the psql projects table
    :param new_projects: dict of project id -> project attribute dict
    """
    # the original created a psqlDB() connection here that was never used
    # and never closed (connection leak); the helpers below manage their
    # own connections
    for i in list(new_projects):
        if new_projects[i]['isNew'] == 1:
            insert_project_info(project_table_name, new_projects[i])
            print('insert data in psql for new project:', i)
        # we only delete and insert information for projects that need an update
        elif new_projects[i]['needUpdate'] == 1:
            update_project_info(project_table_name, new_projects[i])
            print('update data in psql for updated project:', i)
    return
def get_attributes_from_table(table):
    """Return (column_name, data_type) rows for *table* from the
    postgres system catalogs.

    :param table: name of the table to inspect
    :return: raw query result rows of column name and formatted type
    """
    p_con = psqlDB()
    sql_insert = '''
        SELECT
          a.attname as column_name
          ,format_type(a.atttypid, a.atttypmod) AS data_type
        FROM
          pg_attribute a
          JOIN pg_class b ON (a.attrelid = b.relfilenode)
        WHERE
          b.relname = %s
          AND a.attstattarget = -1;
        '''
    data = [table]
    attributes_raw = p_con.retr_query(sql_insert, data)
    # the original never closed the connection (leak)
    p_con.close()
    return attributes_raw
def get_all_user_ids():
    """Return the list of all user ids stored in the user_matrix table."""
    p_con = psqlDB()
    sql_insert = '''
        SELECT
          user_id
        FROM
          user_matrix
        '''
    content = p_con.retr_query(sql_insert, None)
    # the original never closed the connection (leak)
    p_con.close()
    # comprehension replaces the range(len(...)) append loop
    return [row[0] for row in content]
def create_table_all_results():
    """(Re)create the empty ``all_results_raw`` collector table."""
    db_connection = psqlDB()
    ddl = '''
        DROP TABLE IF EXISTS all_results_raw;
        CREATE TABLE all_results_raw (
          task_id character varying,
          user_id character varying,
          project_id integer,
          result integer,
          group_timestamp integer,
          task_geom geometry
        )
        '''
    db_connection.query(ddl, None)
    del db_connection
def create_table_all_tasks():
    """(Re)create the empty ``all_tasks`` collector table."""
    db_connection = psqlDB()
    ddl = '''
        DROP TABLE IF EXISTS all_tasks;
        CREATE TABLE all_tasks (
          task_id character varying,
          project_id integer,
          completed_count bigint,
          msi numeric,
          no_si numeric,
          crowd_answer integer,
          agreement numeric,
          task_geom geometry
        )
        '''
    db_connection.query(ddl, None)
    del db_connection
def create_user_contributions(project_results, user_tasks):
    """Left-join user tasks with project results into a per-project
    ``user_contributions_<id>`` table; tasks without a result get 0.

    :param project_results: results table name (its trailing ``_<id>``
        part names the output table)
    :param user_tasks: per-user tasks table name
    :return: name of the created table
    """
    project_suffix = project_results.split('_')[-1]
    contributions_table = '{}_{}'.format('user_contributions',
                                         project_suffix)
    statement = '''
        DROP TABLE IF EXISTS {};
        CREATE TABLE {} AS
        SELECT
          b.taskid
          ,b.userid
          ,b.projectid
          --,b.groupid
          ,CASE WHEN r.result > 0 THEN r.result ELSE 0 END as result
          ,(b.group_timestamp / 1000)::int as group_timestamp
          ,b.geo
        FROM
          {} as b
          LEFT JOIN {} as r ON (
            b.userid = r.userid
            AND b.taskid = r.taskid
            AND b.projectid::int = r.projectid)
        '''
    statement = sql.SQL(statement).format(
        sql.Identifier(contributions_table),
        sql.Identifier(contributions_table),
        sql.Identifier(user_tasks),
        sql.Identifier(project_results))
    db_connection = psqlDB()
    db_connection.query(statement, None)
    print('created: %s' % contributions_table)
    del db_connection
    return contributions_table
def create_all_contributions(project_id, unique_task_table_name,
                             user_contributions_table):
    """Join user contributions onto the full unique-task set.

    User contributions can omit tasks that nobody worked on, so the
    unique task table drives the join (LEFT JOIN) to keep every task.

    :return: name of the created ``contributions_<project_id>`` table
    """
    output_table = 'contributions_{}'.format(project_id)
    statement = '''
        DROP TABLE IF EXISTS {};
        CREATE TABLE {} AS
        SELECT
          t.taskid
          ,t.projectid
          ,t.completedCount
          ,c.userid
          ,c.group_timestamp
          ,c.result
          ,t.st_geomfromtext as geo
        FROM
          {} as t
          LEFT JOIN {} as c ON (
            t.taskid = c.taskid
            AND t.projectid::int = c.projectid::int)
        WHERE
          t.projectid = %s
        '''
    statement = sql.SQL(statement).format(
        sql.Identifier(output_table),
        sql.Identifier(output_table),
        sql.Identifier(unique_task_table_name),
        sql.Identifier(user_contributions_table))
    db_connection = psqlDB()
    db_connection.query(statement, [str(project_id)])
    print('created: %s' % output_table)
    del db_connection
    return output_table
def aggregate_results_using_array(results_per_user):
    """Collapse per-user results into one row per task, collecting the
    individual answers into an array column.

    Expects the input table (e.g. 'all_results_per_user') to provide the
    columns task_id, project_id, result, task_geom.

    :return: name of the created ``<input>_array`` table
    """
    source_table = results_per_user
    target_table = results_per_user + '_' + 'array'
    statement = '''
        DROP TABLE IF EXISTS {};
        CREATE TABLE {} AS
        SELECT
          b.task_id
          ,count(task_id) as completed_count
          ,array_agg(result) as results
          ,b.project_id
          ,b.task_geom
        FROM
          {} as b
        GROUP BY
          b.project_id
          ,b.task_id
          ,b.task_geom'''
    statement = sql.SQL(statement).format(sql.Identifier(target_table),
                                          sql.Identifier(target_table),
                                          sql.Identifier(source_table))
    db_connection = psqlDB()
    db_connection.query(statement, None)
    print('created: %s' % target_table)
    del db_connection
    return target_table
def get_layer_bbox(project_table_name, project_id):
    """Return the bounding box of a project's extent geometry.

    :param project_table_name: name of the psql projects table
    :param project_id: id of the project
    :return: list [minx, maxx, miny, maxy]
    """
    p_con = psqlDB()
    sql_insert = '''
        SELECT
          id
          ,st_XMin(extent)
          ,st_XMax(extent)
          ,st_YMin(extent)
          ,st_YMax(extent)
        FROM
          {}
        WHERE
          id = %s
        '''
    sql_insert = sql.SQL(sql_insert).format(sql.Identifier(project_table_name))
    data = [project_id]
    bbox_raw = p_con.retr_query(sql_insert, data)
    # the original never closed the connection (leak)
    p_con.close()
    # columns 1..4 are minx, maxx, miny, maxy (column 0 is the id)
    bbox = list(bbox_raw[0][1:5])
    return bbox
def get_psql_projects(project_table_name):
    """Return the ids of all non-corrupt projects stored in psql.

    :param project_table_name: name of the psql projects table
    :return: list of project ids
    """
    p_con = psqlDB()
    sql_insert = '''
        SELECT
          id
        FROM
          {}
        WHERE
          corrupt is False
        '''
    sql_insert = sql.SQL(sql_insert).format(sql.Identifier(project_table_name))
    retr_data = p_con.retr_query(sql_insert, None)
    p_con.close()
    # comprehension; the original loop shadowed the builtin `id`
    return [row[0] for row in retr_data]
def get_all_non_corrupt_projects(projects):
    """Filter *projects* down to ids that are usable in our database.

    A project qualifies when it is flagged non-corrupt in the projects
    table AND an enriched ``final_<id>`` table exists for it.

    :param projects: list of candidate project ids
    :return: list of project ids that are non-corrupt and enriched
    """
    p_con = psqlDB()
    sql_insert = '''
        SELECT
          p.id
          ,i.table_name
        FROM
          projects as p, information_schema.tables as i
        WHERE
          not p.corrupt
          AND i.table_schema = 'public'
          AND left(i.table_name, 6) = 'final_'
          AND ltrim(i.table_name, 'final_')::int = p.id
        ORDER BY p.id
        '''
    retr = p_con.retr_query(sql_insert, None)
    # the original never closed the connection (leak)
    p_con.close()
    existing_projects = [row[0] for row in retr]
    # intersect existing projects and input projects
    filtered_projects = list(
        set(existing_projects).intersection(set(projects)))
    print('filtered projects. original input: %s, remaining in list: %s'
          % (projects, filtered_projects))
    # lazy %-style args: formatting happens only if the record is emitted
    logging.warning(
        'filtered projects. original input: %s, remaining in list: %s',
        projects, filtered_projects)
    return filtered_projects
def create_results_table():
    """One-time setup: create the ``results`` table and its indexes
    (errors if the table already exists)."""
    p_con = psqlDB()
    sql_insert = '''
        CREATE TABLE results (
          taskId VARCHAR NOT NULL
          ,userId VARCHAR NOT NULL
          ,projectId INT NOT NULL
          ,timestamp BIGINT NOT NULL
          ,result INT NOT NULL
          ,duplicates INT NOT NULL
          ,CONSTRAINT pk_result_id PRIMARY KEY (taskId, userId, projectId)
        );
        CREATE INDEX results_taskId_index ON public.results USING BTREE (taskId);
        CREATE INDEX results_timestamp_index ON public.results USING BTREE (timestamp);
        CREATE INDEX results_projectId_index ON public.results USING BTREE (projectId);
        CREATE INDEX results_index ON public.results USING BTREE (result);
        '''
    p_con.query(sql_insert, None)
    # fixed message typo ('crated table' -> 'created table')
    print('created table: results')
    p_con.close()
def get_unique_tasks(project_id, task_table_name):
    """Merge duplicated tasks into a ``tasks_unique_<project_id>`` table.

    A task can appear in several groups; its completedcount values are
    summed so every task ends up as a single row.

    :return: name of the created table
    """
    unique_table = 'tasks_unique_{}'.format(project_id)
    statement = '''
        DROP TABLE IF EXISTS {};
        CREATE TABLE {} AS
        SELECT
          t.taskid
          ,t.projectid
          ,Sum(t.completedcount) as completedcount
          -- don't forget the geometry
          ,t.st_geomfromtext
        FROM
          {} as t
        WHERE
          projectid = %s
        GROUP BY
          t.taskid
          ,t.projectid
          ,t.st_geomfromtext
        '''
    statement = sql.SQL(statement).format(sql.Identifier(unique_table),
                                          sql.Identifier(unique_table),
                                          sql.Identifier(task_table_name))
    db_connection = psqlDB()
    db_connection.query(statement, [str(project_id)])
    print('created: %s' % unique_table)
    del db_connection
    return unique_table
def create_all_tasks_per_user(groups_per_user, task_geom):
    """Expand per-group user contributions to per-task rows.

    Joins group-level contributions with task geometries on group_id and
    groups by user/task to avoid duplicate rows; the earliest group
    timestamp per user/task is kept.

    :return: name of the created ``<task_geom>_per_user`` table
    """
    target_table = task_geom + '_per_user'
    statement = '''
        DROP TABLE IF EXISTS {};
        CREATE TABLE {} AS
        SELECT
          b."userId" as user_id
          ,t.task_id
          ,b.project_id
          ,Min(b.group_timestamp) as group_timestamp
          --,b.group_id
          --,b.count
          --,b.edge_count
          ,t.task_geom
        FROM
          {} as b, {} as t
        WHERE
          b.group_id = t.group_id
        GROUP BY
          -- we need to group by task_id so that we avoid duplicates
          user_id, t.task_id, b.project_id, t.task_geom
        -- ORDER BY user_id, t.task_id'''
    statement = sql.SQL(statement).format(sql.Identifier(target_table),
                                          sql.Identifier(target_table),
                                          sql.Identifier(groups_per_user),
                                          sql.Identifier(task_geom))
    db_connection = psqlDB()
    db_connection.query(statement, None)
    print('created: %s' % target_table)
    del db_connection
    return target_table