def rebuild_all_dbs(Session):
    '''
    If the tests are running on the same db, we have to make sure that
    the ckan tables are recreated.
    '''
    db_read_url_parts = cli.parse_db_config('ckan.datastore.write_url')
    db_ckan_url_parts = cli.parse_db_config('sqlalchemy.url')
    same_db = db_read_url_parts['db_name'] == db_ckan_url_parts['db_name']

    if same_db:
        model.repo.tables_created_and_initialised = False
    clear_db(Session)
    model.repo.rebuild_db()

def get_microservice_metadata():
    for config_option in ('ckan.spatialingestor.postgis_url',
                          'ckan.spatialingestor.internal_geoserver_url',):
        if not config.get(config_option):
            raise Exception(
                'Config option `{0}` must be set to use the SpatialIngestor.'.format(config_option))

    core_url = config.get('ckan.site_url', 'http://localhost:8000/')
    return {
        'postgis': cli.parse_db_config('ckan.spatialingestor.postgis_url'),
        'geoserver': cli.parse_db_config('ckan.spatialingestor.internal_geoserver_url'),
        'geoserver_public_url': config.get('ckan.spatialingestor.public_geoserver_url',
                                           core_url + '/geoserver'),
        'target_spatial_formats': list(set(
            [x.upper() for x in toolkit.aslist(config.get('ckan.spatialingestor.target_formats', []))]))
    }

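# Note (illustrative only, not taken from any of the snippets here): every example
# below indexes the dict returned by CKAN's parse_db_config() helper, which splits
# a SQLAlchemy-style URL from the config file into its components. A minimal sketch
# of the shape these snippets assume, with made-up example values:
example_db_config = {
    'db_type': 'postgresql',    # URL scheme
    'db_user': 'ckan_default',
    'db_pass': 'secret',
    'db_host': 'localhost',
    'db_port': '5432',          # may be empty when the URL omits a port
    'db_name': 'ckan_default',
}
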
def load_packages(self):
    # Get our CKAN and Drupal connection strings
    dbc = parse_db_config('sqlalchemy.url')
    ckan_conn_string = "host='%s' dbname='%s' user='%s' password='%s'" % (
        dbc['db_host'], dbc['db_name'], dbc['db_user'], dbc['db_pass'])
    dbd = parse_db_config('ckan.drupal.url')
    drupal_conn_string = "host='%s' dbname='%s' user='%s' password='%s'" % (
        dbd['db_host'], dbd['db_name'], dbd['db_user'], dbd['db_pass'])

    # get a connection; if a connection cannot be made an exception will be raised here
    ckan_conn = psycopg2.connect(ckan_conn_string)
    drupal_conn = psycopg2.connect(drupal_conn_string)

    # conn.cursor() returns a cursor object which is used to perform queries
    ckan_cursor = ckan_conn.cursor()
    drupal_cursor = drupal_conn.cursor()

    # execute our query
    ckan_cursor.execute("""select p.id, p.name, p.title,
       case when pe1.value is null then '' else pe1.value end,
       case when p.notes is null then '' else p.notes end,
       case when pe2.value is null then '' else pe2.value end
    from package p
    left join package_extra pe1 on p.id = pe1.package_id and pe1.key = 'title_fra'
    left join package_extra pe2 on p.id = pe2.package_id and pe2.key = 'notes_fra'""")

    # retrieve the records from the CKAN database and insert into the Drupal database
    for rec in ckan_cursor:
        drupal_cursor.execute(
            """select count(*) from opendata_package where pkg_id = %s""", (rec[0],))
        row = drupal_cursor.fetchone()
        if row[0] == 0:
            print "Inserting package %s" % (rec[0],)
            try:
                drupal_cursor.execute(
                    """insert into opendata_package (
                           pkg_id, pkg_name, pkg_title_en, pkg_title_fr,
                           pkg_description_en, pkg_description_fr
                       ) values (%s, %s, %s, %s, %s, %s)""",
                    (rec[0],
                     self.format_drupal_string(rec[1]),
                     self.format_drupal_string(rec[2]),
                     self.format_drupal_string(rec[3]),
                     self.format_drupal_string(rec[4]),
                     self.format_drupal_string(rec[5])))
            except psycopg2.DataError, e:
                self.logger.warn('Postgresql Database Exception %s', e.message)

def dataset_comment_count(pkg_id):
    count = 0
    try:
        dbd = parse_db_config('ckan.drupal.url')
        if (dbd):
            drupal_conn_string = "host='%s' dbname='%s' user='%s' password='%s'" % (
                dbd['db_host'], dbd['db_name'], dbd['db_user'], dbd['db_pass'])
            drupal_conn = pg2.connect(drupal_conn_string)
            drupal_cursor = drupal_conn.cursor()
            # add this to the SQL statement to limit comments to those that are published: 'and status = 0'
            drupal_cursor.execute(
                """select count(c.*) from comment c
                   inner join opendata_package o on o.pkg_node_id = c.nid
                   where o.pkg_id = %s""", (pkg_id, ))
            row = drupal_cursor.fetchone()
            count = row[0]
            drupal_cursor.close()
            drupal_conn.close()
    except KeyError:
        pass
    return count

def dataset_comments(pkg_id):
    # import pdb; pdb.set_trace()
    comment_list = []
    try:
        dbd = parse_db_config('ckan.drupal.url')
        if (dbd):
            drupal_conn_string = "host='%s' dbname='%s' user='%s' password='%s'" % (
                dbd['db_host'], dbd['db_name'], dbd['db_user'], dbd['db_pass'])
            drupal_conn = pg2.connect(drupal_conn_string)
            drupal_cursor = drupal_conn.cursor()
            # add this to the SQL statement to limit comments to those that are published: 'and status = 0'
            drupal_cursor.execute(
                """select c.subject, to_char(to_timestamp(c.changed), 'YYYY-MM-DD'), c.name, c.thread, f.comment_body_value
                   from comment c
                   inner join field_data_comment_body f on c.cid = f.entity_id
                   inner join opendata_package o on o.pkg_node_id = c.nid
                   where o.pkg_id = %s""", (pkg_id,))
            for comment in drupal_cursor:
                comment_body = clean_html(comment[4])
                comment_list.append({'subject': comment[0],
                                     'date': comment[1],
                                     'thread': comment[3],
                                     'comment_body': comment_body,
                                     'user': comment[2]})
            drupal_cursor.close()
            drupal_conn.close()
    except KeyError:
        pass
    return comment_list

def dataset_rating(pkg_id):
    rating = -1
    try:
        dbd = parse_db_config('ckan.drupal.url')
        if (dbd):
            drupal_conn_string = "host='%s' dbname='%s' user='%s' password='%s'" % (
                dbd['db_host'], dbd['db_name'], dbd['db_user'], dbd['db_pass'])
            drupal_conn = pg2.connect(drupal_conn_string)
            drupal_cursor = drupal_conn.cursor()
            # retrieve the average dataset rating from Drupal -- NB the parameter must be in the form (x,)
            drupal_cursor.execute(
                """select avg(v.value)/25+1 as rating from opendata_package p
                   inner join votingapi_vote v on p.pkg_node_id = v.entity_id
                   where p.pkg_id = %s""", (pkg_id, ))
            row = drupal_cursor.fetchone()
            rating = row[0]
            drupal_cursor.close()
            drupal_conn.close()
    except KeyError:
        pass
    return rating

def command(self):
    '''
    Parse command line arguments and call appropriate method.
    '''
    if not self.args or self.args[0] in ['--help', '-h', 'help']:
        print SetupDatastoreCommand.__doc__
        return

    cmd = self.args[0]
    self._load_config()

    self.db_write_url_parts = cli.parse_db_config(
        'ckan.datastore.write_url')
    self.db_read_url_parts = cli.parse_db_config(
        'ckan.datastore.read_url')
    self.db_ckan_url_parts = cli.parse_db_config(
        'sqlalchemy.url')

    write_db = self.db_write_url_parts['db_name']
    read_db = self.db_read_url_parts['db_name']
    assert write_db == read_db,\
        "write and read db have to be the same"

    if len(self.args) != 2:
        print self.usage
        return

    if cmd == 'set-permissions':
        setup.set_permissions(
            pguser=self.args[1],
            pgport=self.db_ckan_url_parts['db_port'],
            ckandb=self.db_ckan_url_parts['db_name'],
            datastoredb=self.db_write_url_parts['db_name'],
            ckanuser=self.db_ckan_url_parts['db_user'],
            writeuser=self.db_write_url_parts['db_user'],
            readonlyuser=self.db_read_url_parts['db_user']
        )
        if self.verbose:
            print 'Set permissions for read-only user: SUCCESS'
    else:
        print self.usage
        log.error('Command "%s" not recognized' % (cmd,))
    return

def set_permissions(ctx, config):
    load_config(config or ctx.obj['config'])
    write_url = parse_db_config(u'ckan.datastore.write_url')
    read_url = parse_db_config(u'ckan.datastore.read_url')
    db_url = parse_db_config(u'sqlalchemy.url')

    # Basic validation that read and write URLs reference the same database.
    # This obviously doesn't check they're the same database (the hosts/ports
    # could be different), but it's better than nothing, I guess.
    if write_url['db_name'] != read_url['db_name']:
        exit(u"The datastore write_url and read_url must refer to the same "
             u"database!")

    sql = permissions_sql(maindb=db_url['db_name'],
                          datastoredb=write_url['db_name'],
                          mainuser=db_url['db_user'],
                          writeuser=write_url['db_user'],
                          readuser=read_url['db_user'])
    print(sql)

def command(self):
    '''
    Parse command line arguments and call appropriate method.
    '''
    if not self.args or self.args[0] in ['--help', '-h', 'help']:
        print SetupDatastoreCommand.__doc__
        return

    cmd = self.args[0]
    self._load_config()

    self.db_write_url_parts = cli.parse_db_config(
        'ckan.datastore.write_url')
    self.db_read_url_parts = cli.parse_db_config('ckan.datastore.read_url')
    self.db_ckan_url_parts = cli.parse_db_config('sqlalchemy.url')

    write_db = self.db_write_url_parts['db_name']
    read_db = self.db_read_url_parts['db_name']
    assert write_db == read_db,\
        "write and read db have to be the same"

    if len(self.args) != 2:
        print self.usage
        return

    if cmd == 'set-permissions':
        setup.set_permissions(
            pguser=self.args[1],
            pgport=self.db_ckan_url_parts['db_port'],
            ckandb=self.db_ckan_url_parts['db_name'],
            datastoredb=self.db_write_url_parts['db_name'],
            ckanuser=self.db_ckan_url_parts['db_user'],
            writeuser=self.db_write_url_parts['db_user'],
            readonlyuser=self.db_read_url_parts['db_user'])
        if self.verbose:
            print 'Set permissions for read-only user: SUCCESS'
    else:
        print self.usage
        log.error('Command "%s" not recognized' % (cmd, ))
    return

def command(self):
    '''
    Parse command line arguments and call appropriate method.
    '''
    if not self.args or self.args[0] in ['--help', '-h', 'help']:
        print SetupDatastoreCommand.__doc__
        return

    cmd = self.args[0]
    self._load_config()

    self.db_write_url_parts = cli.parse_db_config(
        'ckan.datastore.write_url')
    self.db_read_url_parts = cli.parse_db_config('ckan.datastore.read_url')
    self.db_ckan_url_parts = cli.parse_db_config('sqlalchemy.url')

    assert self.db_write_url_parts['db_name'] == self.db_read_url_parts[
        'db_name'], "write and read db should be the same"

    if cmd == 'create-db':
        if len(self.args) != 2:
            print self.usage
            return
        self.sql_superuser = self.args[1]
        self.create_db()
        if self.verbose:
            print 'Creating DB: SUCCESS'
    elif cmd == 'create-read-only-user':
        if len(self.args) != 2:
            print self.usage
            return
        self.sql_superuser = self.args[1]
        self.create_read_only_user()
        if self.verbose:
            print 'Creating read-only user: SUCCESS'
    else:
        print self.usage
        log.error('Command "%s" not recognized' % (cmd, ))
    return

def _purge_legacy_all(self):
    geoserver_info = cli.parse_db_config('ckan.spatialingestor.postgis_url')
    geoserver_credentials = (geoserver_info['db_user'], geoserver_info['db_pass'])
    # build the GeoServer workspaces REST endpoint from the configured host
    geoserver_wsurl = 'http://' + geoserver_info['db_host'] + '/rest/workspaces'

    postgist_info = cli.parse_db_config('ckan.spatialingestor.postgis_url')

    def get_db_cursor():
        try:
            connection = psycopg2.connect(dbname=postgist_info['db_name'],
                                          user=postgist_info['db_user'],
                                          password=postgist_info['db_pass'],
                                          host=postgist_info['db_host'],
                                          port=postgist_info.get('db_port', None))
            connection.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
            return connection.cursor(), connection
        except Exception, e:
            log.error("Failed to connect with PostGIS with error {0}".format(str(e)))
            return None

def _get_db_settings():
    postgis_info = cli.parse_db_config(
        'ckanext.datagovau.spatialingestor.postgis.url')
    db_port = postgis_info.get('db_port', '')
    if db_port == '':
        db_port = None

    return dict(dbname=postgis_info.get('db_name'),
                user=postgis_info.get('db_user'),
                password=postgis_info.get('db_pass'),
                host=postgis_info.get('db_host'),
                port=db_port)

def set_permissions(ctx, config):
    load_config(config or ctx.obj['config'])
    write_url = parse_db_config(u'ckan.datastore.write_url')
    read_url = parse_db_config(u'ckan.datastore.read_url')
    db_url = parse_db_config(u'sqlalchemy.url')

    # Basic validation that read and write URLs reference the same database.
    # This obviously doesn't check they're the same database (the hosts/ports
    # could be different), but it's better than nothing, I guess.
    if write_url['db_name'] != read_url['db_name']:
        exit(u"The datastore write_url and read_url must refer to the same "
             u"database!")

    sql = permissions_sql(
        maindb=db_url['db_name'],
        datastoredb=write_url['db_name'],
        mainuser=db_url['db_user'],
        writeuser=write_url['db_user'],
        readuser=read_url['db_user'])
    print(sql)

def saveUser(self, user_id, provider, user_key, user_secret):
    pprint.pprint("save user")
    dbd = parse_db_config('ckan.drupal.url')
    drupal_conn_string = "host='%s' dbname='%s' port='%s' user='%s' password='%s'" % (
        dbd['db_host'], dbd['db_name'], dbd['db_port'], dbd['db_user'], dbd['db_pass'])
    drupal_conn = psycopg2.connect(drupal_conn_string)
    drupal_cursor = drupal_conn.cursor(
        cursor_factory=psycopg2.extras.DictCursor)
    drupal_cursor.execute(
        """INSERT INTO opendata_tokens_provider_user (id_usuario, provider, key, secret)
           VALUES (%s, %s, %s, %s)""",
        (user_id, provider, user_key, user_secret))
    drupal_conn.commit()

def _set_permissions(args):
    write_url = cli.parse_db_config('ckan.datastore.write_url')
    read_url = cli.parse_db_config('ckan.datastore.read_url')
    db_url = cli.parse_db_config('sqlalchemy.url')

    # Basic validation that read and write URLs reference the same database.
    # This obviously doesn't check they're the same database (the hosts/ports
    # could be different), but it's better than nothing, I guess.
    if write_url['db_name'] != read_url['db_name']:
        _abort("The datastore write_url and read_url must refer to the same "
               "database!")

    context = {
        'maindb': db_url['db_name'],
        'datastoredb': write_url['db_name'],
        'mainuser': db_url['db_user'],
        'writeuser': write_url['db_user'],
        'readuser': read_url['db_user'],
    }
    sql = _permissions_sql(context)
    print(sql)

def saveUserToken(self, user_token, user_id, provider):
    pprint.pprint("store user token")
    pprint.pprint(str(user_token))
    dbd = parse_db_config('ckan.drupal.url')
    drupal_conn_string = "host='%s' dbname='%s' port='%s' user='%s' password='%s'" % (
        dbd['db_host'], dbd['db_name'], dbd['db_port'], dbd['db_user'], dbd['db_pass'])
    drupal_conn = psycopg2.connect(drupal_conn_string)
    drupal_cursor = drupal_conn.cursor(
        cursor_factory=psycopg2.extras.DictCursor)
    drupal_cursor.execute(
        """UPDATE opendata_tokens_provider_user SET token=%s
           WHERE id_usuario=%s AND provider=%s""",
        (str(user_token), user_id, provider))
    drupal_conn.commit()

def command(self):
    '''
    Parse command line arguments and call appropriate method.
    '''
    if not self.args or self.args[0] in ['--help', '-h', 'help']:
        print SetupDatastoreCommand.__doc__
        return

    cmd = self.args[0]
    self._load_config()

    self.db_write_url_parts = cli.parse_db_config('ckan.datastore.write_url')
    self.db_read_url_parts = cli.parse_db_config('ckan.datastore.read_url')
    self.db_ckan_url_parts = cli.parse_db_config('sqlalchemy.url')

    assert self.db_write_url_parts['db_name'] == self.db_read_url_parts['db_name'], "write and read db should be the same"

    if cmd == 'create-db':
        if len(self.args) != 2:
            print self.usage
            return
        self.sql_superuser = self.args[1]
        self.create_db()
        if self.verbose:
            print 'Creating DB: SUCCESS'
    elif cmd == 'create-read-only-user':
        if len(self.args) != 2:
            print self.usage
            return
        self.sql_superuser = self.args[1]
        self.create_read_only_user()
        if self.verbose:
            print 'Creating read-only user: SUCCESS'
    else:
        print self.usage
        log.error('Command "%s" not recognized' % (cmd,))
    return

def addexception(self, id):
    # Adds an exception to the database.
    # Sometimes resources will contain valid-looking CPR numbers which are in
    # fact not personal data.
    d_port = config.get('ckan.cprvalidation.postgres_port', None)
    d_pass = config.get('ckan.cprvalidation.cprvalidation_password', None)
    db_name = config.get('ckan.cprvalidation.cprvalidation_db', None)

    add_exception = '''
    UPDATE {0}.status SET excepted = TRUE WHERE package_id = %s
    returning *
    ;'''

    if d_pass == None:
        print("Setup cprvalidation_password in /etc/ckan/default/production.ini")
        sys.exit(1)
    if d_port == None:
        print("Setup postgres_port in /etc/ckan/default/production.ini")
        sys.exit(1)

    try:
        db_config = parse_db_config()
        host = db_config.get('db_host')
        conn = psycopg2.connect(database=db_name, host=host, user="******",
                                password=d_pass, port=d_port)
        conn.autocommit = True
        print(" ")
    except Exception as e:
        print(e)
        sys.exit()

    cur = conn.cursor()
    cur.execute(add_exception.format(db_name), (id, ))
    count = len(cur.fetchall())
    if (count == 0):
        print("Could not find relation %s " % id)
    else:
        print("Added exception for %d resources in dataset with package_id: %s "
              % (count, id))
    conn.commit()
    conn.close()

def _get_geoserver_data():
    geoserver_info = cli.parse_db_config(
        'ckanext.datagovau.spatialingestor.geoserver.url')

    protocol = "http://"
    if geoserver_info.get('db_type') == 'sslgeoserver':
        protocol = "https://"

    geoserver_host = protocol + geoserver_info.get('db_host')

    port = geoserver_info.get('db_port', '')
    if port != '':
        geoserver_host += ':' + port

    geoserver_host += '/' + geoserver_info.get('db_name') + '/'

    return (
        geoserver_host,
        geoserver_info.get('db_user'),
        geoserver_info.get('db_pass'),
        config.get('ckanext.datagovau.spatialingestor.geoserver.public_url'))

def download(self):
    port = config.get('ckan.cprvalidation.postgres_port', None)
    password = config.get('ckan.cprvalidation.cprvalidation_password', None)
    db_name = config.get('ckan.cprvalidation.cprvalidation_db', None)
    db_config = parse_db_config()
    host = db_config.get('db_host')

    if port != None and password != None:
        try:
            conn = psycopg2.connect(database=db_name, host=host, user="******",
                                    password=password, port=port)
        except Exception as e:
            log.warn(e)
            sys.exit()
    else:
        log.warn("Config not setup properly! Missing either postgres_port or cprvalidation_password")
        sys.exit()

    select = """COPY (SELECT * FROM {0}.status) to STDOUT WITH CSV HEADER"""

    cur = conn.cursor()
    # Instead of using an actual file, we use a file-like string buffer
    text_stream = StringIO.StringIO()
    cur.copy_expert(select.format(db_name), text_stream)
    output = text_stream.getvalue()

    # Cleanup after ourselves
    text_stream.close()
    conn.commit()
    conn.close()

    pylons.response.headers['Content-Type'] = 'text/csv;charset=utf-8'
    pylons.response.headers['Content-Disposition'] = 'attachment; filename="cpr_report.csv"'
    return output

def _migrate_autogen_timestamp(old_name, new_name):
    write_url_obj = cli.parse_db_config('ckan.datastore.write_url')
    write_url = 'postgres://' + write_url_obj['db_user'] + ':'
    write_url = write_url + write_url_obj['db_pass'] + '@'
    write_url = write_url + write_url_obj['db_host']
    # append the port (prefixed with ':') only if one is configured
    write_url = write_url + ((':' + write_url_obj['db_port']) if write_url_obj['db_port'] else '') + '/'
    write_url = write_url + write_url_obj['db_name']

    conn = create_engine(write_url)

    sql_autogen_res = 'select table_name \
        from INFORMATION_SCHEMA.COLUMNS where column_name = %s'
    sql_rename_column = 'ALTER TABLE "{table_name}" RENAME {old_name} TO {new_name}'

    autogen_res = conn.execute(sql_autogen_res, old_name).fetchall()
    for ar in autogen_res:
        result = conn.execute(sql_rename_column.format(
            table_name=ar[0], old_name=old_name, new_name=new_name))

def dataset_rating(pkg_id):
    rating = None
    try:
        dbd = parse_db_config('ckan.drupal.url')
        if (dbd):
            drupal_conn_string = "host='%s' dbname='%s' user='%s' password='%s'" % (
                dbd['db_host'], dbd['db_name'], dbd['db_user'], dbd['db_pass'])
            drupal_conn = pg2.connect(drupal_conn_string)
            drupal_cursor = drupal_conn.cursor()
            # retrieve the average dataset rating from Drupal -- NB the parameter must be in the form (x,)
            drupal_cursor.execute(
                """select avg(v.value)/25+1 as rating from opendata_package p
                   inner join votingapi_vote v on p.pkg_node_id = v.entity_id
                   where p.pkg_id = %s""", (pkg_id,))
            row = drupal_cursor.fetchone()
            rating = row[0]
            drupal_cursor.close()
            drupal_conn.close()
    except KeyError:
        pass
    return int(0 if rating is None else rating)

def dataset_comment_count(pkg_id):
    count = 0
    try:
        dbd = parse_db_config('ckan.drupal.url')
        if (dbd):
            drupal_conn_string = "host='%s' dbname='%s' user='%s' password='%s'" % (
                dbd['db_host'], dbd['db_name'], dbd['db_user'], dbd['db_pass'])
            drupal_conn = pg2.connect(drupal_conn_string)
            drupal_cursor = drupal_conn.cursor()
            # add this to the SQL statement to limit comments to those that are published: 'and status = 0'
            drupal_cursor.execute(
                """select count(c.*) from comment c
                   inner join opendata_package o on o.pkg_node_id = c.nid
                   where o.pkg_id = %s""", (pkg_id,))
            row = drupal_cursor.fetchone()
            count = row[0]
            drupal_cursor.close()
            drupal_conn.close()
    except KeyError:
        pass
    return count

def dataset_comments(pkg_id):
    # import pdb; pdb.set_trace()
    comment_list = []
    try:
        dbd = parse_db_config('ckan.drupal.url')
        if (dbd):
            drupal_conn_string = "host='%s' dbname='%s' user='%s' password='%s'" % (
                dbd['db_host'], dbd['db_name'], dbd['db_user'], dbd['db_pass'])
            drupal_conn = pg2.connect(drupal_conn_string)
            drupal_cursor = drupal_conn.cursor()
            # add this to the SQL statement to limit comments to those that are published: 'and status = 0'
            drupal_cursor.execute(
                """select c.subject, to_char(to_timestamp(c.changed), 'YYYY-MM-DD'), c.name, c.thread, f.comment_body_value
                   from comment c
                   inner join field_data_comment_body f on c.cid = f.entity_id
                   inner join opendata_package o on o.pkg_node_id = c.nid
                   where o.pkg_id = %s""", (pkg_id, ))
            for comment in drupal_cursor:
                comment_body = clean_html(comment[4])
                comment_list.append({
                    'subject': comment[0],
                    'date': comment[1],
                    'thread': comment[3],
                    'comment_body': comment_body,
                    'user': comment[2]
                })
            drupal_cursor.close()
            drupal_conn.close()
    except KeyError:
        pass
    return comment_list

def scanDB():
    d_port = config.get('ckan.cprvalidation.postgres_port', None)
    d_pass = config.get('ckan.cprvalidation.cprvalidation_password', None)
    db_name = config.get('ckan.cprvalidation.cprvalidation_db', None)

    try:
        db_config = parse_db_config()
        host = db_config.get('db_host')
        conn = psycopg2.connect(database=db_name, host=host, user="******",
                                password=d_pass, port=d_port)
    except Exception as e:
        print(e)
        sys.exit()

    # TODO: PDF is really slow, so we need to fix that, removed for now
    select = """
    SELECT * FROM """ + db_name + """.status
    WHERE format = ANY('{csv,xlsx,json,geojson,ods,docx}')
    AND (last_updated::timestamp >= last_checked::timestamp OR last_checked IS NULL)
    AND (url_type IS NOT NULL OR datastore_active = 'true')
    AND excepted IS NULL;
    """
    print("Scanning for updates...")

    cur = conn.cursor()
    # Get the datasets we have to validate
    cur.execute(select)
    tmp_return = cur.fetchall()
    conn.commit()
    conn.close()

    # Return them
    print("Found %d updated resources to validate \n" % len(tmp_return))
    return tmp_return

def setup_protected_resources(**kwargs):
    template_filename = os.path.join(os.path.dirname(__file__),
                                     u'set_protected_resource_table.sql')
    with open(template_filename) as f:
        content = f.read()
    print(content.format(**parse_db_config()))

def load_docs(self):
    '''
    Load the Virtual Library datasets into the same Drupal table as the
    Open Data datasets.

    @return: nothing
    '''
    # Get our CKAN and Drupal connection strings
    dbc = parse_db_config('sqlalchemy.url')
    ckan_conn_string = "host='%s' dbname='%s' user='%s' password='%s'" % (
        dbc['db_host'], dbc['db_name'], dbc['db_user'], dbc['db_pass'])
    dbd = parse_db_config('ckan.drupal.url')
    drupal_conn_string = "host='%s' dbname='%s' user='%s' password='%s'" % (
        dbd['db_host'], dbd['db_name'], dbd['db_user'], dbd['db_pass'])

    # get a connection; if a connection cannot be made an exception will be raised here
    ckan_conn = psycopg2.connect(ckan_conn_string)
    drupal_conn = psycopg2.connect(drupal_conn_string)

    # conn.cursor() returns a cursor object which is used to perform queries
    ckan_cursor = ckan_conn.cursor()
    drupal_cursor = drupal_conn.cursor()

    # execute our query
    ckan_cursor.execute("""select p.id, p.name,
       case when pe1.value is null then '' else pe1.value end,
       case when pe2.value is null then '' else pe2.value end
    from package p
    left join package_extra pe1 on p.id = pe1.package_id and pe1.key = 'title_ml'
    left join package_extra pe2 on p.id = pe2.package_id and pe2.key = 'description_ml'
    where p.type = 'doc' and p.state = 'active'""")

    # retrieve the records from the CKAN database and insert into the Drupal database
    for rec in ckan_cursor:
        drupal_cursor.execute(
            """select count(*) from opendata_package where pkg_id = %s""", (rec[0],))
        row = drupal_cursor.fetchone()
        if row[0] == 0:
            titles = json.loads(rec[2])
            descriptions = json.loads(rec[3])
            title_en = ''
            if 'en' in titles:
                title_en = titles['en']
            title_fr = ''
            if 'fr' in titles:
                title_fr = titles['fr']
            desc_en = ''
            if 'en' in descriptions:
                desc_en = descriptions['en']
            desc_fr = ''
            if 'fr' in descriptions:
                desc_fr = descriptions['fr']
            print "Inserting package %s: %s %s %s: %s %s" % (
                rec[0], rec[1], title_en, title_fr, desc_en, desc_fr)
            try:
                drupal_cursor.execute(
                    """insert into opendata_package (
                           pkg_id, pkg_name, pkg_title_en, pkg_title_fr,
                           pkg_description_en, pkg_description_fr
                       ) values (%s, %s, %s, %s, %s, %s)""",
                    (rec[0], rec[1], title_en, title_fr, desc_en, desc_fr))
                drupal_conn.commit()
            except psycopg2.DataError, e:
                self.logger.warn('Postgresql Database Exception %s', e.message)

def notify(self, entity, operation=None):
    if operation == model.domain_object.DomainObjectOperation.new and isinstance(
            entity, (_package.Package)):
        reload(sys)
        sys.setdefaultencoding('utf-8')

        dbc = parse_db_config('ckan.drupal.url')
        drupal_conn_string = "host='%s' port='%s' dbname='%s' user='%s' password='%s'" % (
            dbc['db_host'], dbc['db_port'], dbc['db_name'], dbc['db_user'], dbc['db_pass'])
        drupal_conn = psycopg2.connect(drupal_conn_string)
        drupal_cursor = drupal_conn.cursor()

        titles = json.loads(entity.title_translated)
        descriptions = json.loads(entity.notes_translated)
        title_en = ''
        if 'en' in titles:
            title_en = titles['en']
        title_es = ''
        if 'es' in titles:
            title_es = titles['es']
        title_ca = ''
        if 'ca' in titles:
            title_ca = titles['ca']
        desc_en = ''
        if 'en' in descriptions:
            desc_en = descriptions['en']
        desc_es = ''
        if 'es' in descriptions:
            desc_es = descriptions['es']
        desc_ca = ''
        if 'ca' in descriptions:
            desc_ca = descriptions['ca']

        log.debug("Inserting package %s: %s %s %s: %s %s %s %s" %
                  (entity.id, entity.name, title_en, title_es, title_ca,
                   desc_en, desc_es, desc_ca))
        try:
            drupal_cursor.execute(
                """insert into opendata_package (pkg_id,pkg_name,pkg_title_en,pkg_title_es,pkg_title_ca,pkg_description_en,pkg_description_es,pkg_description_ca)
                   values (%s, %s, %s, %s, %s, %s, %s, %s)""",
                (entity.id,
                 self.format_drupal_string(entity.name),
                 self.format_drupal_string(title_en),
                 self.format_drupal_string(title_es),
                 self.format_drupal_string(title_ca),
                 self.format_drupal_string(desc_en),
                 self.format_drupal_string(desc_es),
                 self.format_drupal_string(desc_ca)))
            drupal_conn.commit()
        except psycopg2.DataError, e:
            log.warn('Postgresql Database Exception %s', e.message)

        drupal_conn.commit()
        drupal_cursor.close()
        drupal_conn.close()
    # Enable once the Drupal schema has been imported into CKAN
    '''

def resource_download(self, environ, id, resource_id, filename=None):
    context = {
        'model': model,
        'session': model.Session,
        'user': c.user,
        'auth_user_obj': c.userobj
    }

    try:
        rsc = t.get_action('resource_show')(context, {'id': resource_id})
    except (logic.NotFound, logic.NotAuthorized):
        base.abort(404, _('Resource not found'))

    headers = {
        'X-Forwarded-For': environ.get('REMOTE_ADDR'),
        'User-Agent': environ.get('HTTP_USER_AGENT'),
        'Accept-Language': environ.get('HTTP_ACCEPT_LANGUAGE', ''),
        'Accept-Encoding': environ.get('HTTP_ACCEPT_ENCODING', '')
    }

    if rsc.get('token_required') == 'Yes':
        authentication = environ.get('HTTP_AUTHORIZATION', '')
        url_redirect = "%s/tokens?resource_id=%s&package_id=%s" % (
            config.get('ckan.site_url'), resource_id, rsc['package_id'])
        if authentication == '':
            return redirect(url_redirect.encode('utf-8'))

        dbd = parse_db_config('ckan.drupal.url')
        drupal_conn_string = "host='%s' dbname='%s' port='%s' user='%s' password='%s'" % (
            dbd['db_host'], dbd['db_name'], dbd['db_port'], dbd['db_user'], dbd['db_pass'])
        drupal_conn = psycopg2.connect(drupal_conn_string)
        drupal_cursor = drupal_conn.cursor(
            cursor_factory=psycopg2.extras.DictCursor)

        if not rsc.get('token_type'):
            drupal_cursor.execute(
                """select id_usuario from opendata_tokens where tkn_usuario=%s""",
                (authentication, ))
        else:
            drupal_cursor.execute(
                """SELECT t.*, pu.*, p.*, u.name, u.mail, u.uid
                   FROM opendata_tokens t
                   LEFT JOIN opendata_tokens_provider_user pu ON pu.id_usuario=t.id_usuario
                   LEFT JOIN opendata_tokens_provider p ON (pu.provider = p.id OR p.id='bsm')
                   LEFT JOIN users u ON t.id_usuario = u.uid
                   WHERE t.tkn_usuario = %s AND (p.id IS NULL OR p.id = %s)""",
                (authentication, rsc.get('token_type')))

        if drupal_cursor.rowcount < 1:
            return redirect(url_redirect.encode('utf-8'))
        elif rsc.get('token_type'):
            record = drupal_cursor.fetchone()
            api = None
            if rsc.get('token_type') == 'bsm':
                api = bsm.BsmApi(rsc,
                                 app_token=record['app_token'],
                                 consumer_key=record['consumer_key'],
                                 consumer_secret=record['consumer_secret'],
                                 user_token=record['token'],
                                 user_id=record['uid'],
                                 user_key=record['key'],
                                 user_secret=record['secret'],
                                 username=record['name'],
                                 email=record['mail'])
                pprint.pprint(record['app_token'])
            api_content, status, headers = api.execute()

    # Save download to tracking_raw
    CustomTrackingController.update(environ['REQUEST_URI'], 'resource', environ)

    if rsc.get('url_type') == 'upload':
        # Internal redirect
        upload = uploader.get_resource_uploader(rsc)
        filepath = upload.get_path(rsc['id'])
        fileapp = paste.fileapp.FileApp(filepath)
        try:
            status, headers, app_iter = request.call_application(fileapp)
        except OSError:
            base.abort(404, _('Resource data not found'))
        response.headers.update(dict(headers))
        content_type, content_enc = m.guess_type(rsc.get('url', ''))
        if content_type and content_type == 'application/xml':
            response.headers['Content-Type'] = 'application/octet-stream'
        elif content_type:
            response.headers['Content-Type'] = content_type
        response.status = status
        return app_iter
        h.redirect_to(rsc['url'].encode('utf-8'))
    elif api_content:
        response.headers['Content-Type'] = headers['content-type']
        response.status = status
        return api_content
    elif 'url' not in rsc:
        base.abort(404, _('No download is available'))
    else:
        # External redirect
        return redirect(rsc['url'].encode('utf-8'))

def initdb(self):
    # For debugging purposes we delete the database every time we init.
    # This CLEANS the database.
    d_port = config.get('ckan.cprvalidation.postgres_port', None)
    d_pass = config.get('ckan.cprvalidation.cprvalidation_password', None)
    db_name = config.get('ckan.cprvalidation.cprvalidation_db', None)
    postgres_pass = config.get('ckan.cprvalidation.postgres_password', None)

    error_state = False
    if d_pass == None:
        print("Setup cprvalidation_password in /etc/ckan/default/production.ini")
        error_state = True
    if d_port == None:
        print("Setup postgres_port in /etc/ckan/default/production.ini")
        error_state = True
    if postgres_pass == None:
        print("Setup postgres_password in /etc/ckan/default/production.ini")
        error_state = True
    if (error_state):
        print("Exiting..")
        sys.exit(1)

    create_user = '''
    CREATE ROLE cprvalidation WITH PASSWORD %s;
    '''
    drop_db = '''DROP DATABASE IF EXISTS {0};'''
    create_db = '''
    CREATE DATABASE {0}
      WITH OWNER = cprvalidation
           ENCODING = 'UTF8'
           TABLESPACE = pg_default
           CONNECTION LIMIT = -1;
    '''
    create_schema = '''
    DROP SCHEMA IF EXISTS {0};
    CREATE SCHEMA {0} AUTHORIZATION cprvalidation;
    '''
    create_table = '''
    DROP TABLE IF EXISTS {0}.status;
    CREATE TABLE {0}.status
    (
      package_id character varying NOT NULL,
      resource_id character varying NOT NULL,
      status character varying, -- valid, invalid, pending
      format character varying NOT NULL,
      url character varying,
      url_type character varying,
      datastore_active character varying,
      last_checked timestamp,
      last_updated timestamp,
      cpr_number character varying,
      excepted boolean,
      error character varying,
      CONSTRAINT status_pkey PRIMARY KEY (resource_id)
    )
    WITH (
      OIDS=FALSE
    );
    ALTER TABLE {0}.status OWNER TO cprvalidation;
    COMMENT ON COLUMN {0}.status.status IS 'valid, invalid, pending';
    '''

    try:
        db_config = parse_db_config()
        host = db_config.get('db_host')
        conn = psycopg2.connect(database="postgres", host=host, user="******",
                                password=postgres_pass, port=d_port)
        conn.autocommit = True
        print("Connected as postgres user.")
    except Exception as e:
        print(e)
        sys.exit()

    cur = conn.cursor()
    try:
        # cur.execute(create_user, [d_pass])
        cur.execute(drop_db.format(db_name))
        cur.execute(create_db.format(db_name))
        print("Initialized Database")
        conn.commit()
        conn.close()
    except Exception as e:
        # TODO: Handle this sort of errors more gracefully
        print("Unexpected error")
        print(e.message)
        sys.exit(1)

    #
    # We need two different sessions to the database as we are changing user
    #
    try:
        db_config = parse_db_config()
        host = db_config.get('db_host')
        conn = psycopg2.connect(database=db_name, host=host, user="******",
                                password=d_pass, port=d_port)
        conn.autocommit = True
        print("Created the table and scheme")
    except Exception as e:
        print("Woops")
        print(e)
        sys.exit()

    cur = conn.cursor()
    try:
        cur.execute(create_schema.format(db_name))
        cur.execute(create_table.format(db_name))
        print("Created schema and table")
        conn.commit()
        conn.close()
        print("Done.")
    except:
        # TODO: Handle this sort of errors more gracefully
        print("Unexpected error")
        sys.exit(1)

def validateResource(resource):
    '''
    Overview of the tuple
    (
     0  package_id character varying NOT NULL,
     1  resource_id character varying NOT NULL,
     2  status character varying, -- valid, invalid, pending
     3  format character varying NOT NULL,
     4  url character varying,
     5  url_type character varying,
     6  datastore_active character varying,
     7  last_checked character varying,
     8  last_updated character varying,
     9  cpr_number character varying,
     10 excepted bool,
     11 error character varying
    )
    '''
    siteurl = config.get('ckan.site_url')
    email = config.get('ckan.cprvalidation.email', None)
    d_port = config.get('ckan.cprvalidation.postgres_port', None)
    d_pass = config.get('ckan.cprvalidation.cprvalidation_password', None)
    db_name = config.get('ckan.cprvalidation.cprvalidation_db', None)

    id = resource[1]
    format = str(resource[3]).lower()
    datastore = True if str(resource[6]).lower() == "true" else False
    filestore = True if resource[5] == "upload" else False
    file_url = resource[4]
    file_string = None
    file_path = None
    local = False
    error = False

    print("DEBUG INFO: ")
    print("Resource: " + str(resource))
    print("Datastore: " + str(datastore))
    print("Filestore: " + str(filestore))

    storage_path = config.get('ckan.storage_path')

    # Get the filepath, locally or externally, it should not matter
    if filestore:
        file_path = os.path.join(storage_path, 'resources', id[0:3], id[3:6], id[6:])
        local = True
    elif datastore:
        file_path = siteurl + "/datastore/dump/" + id + "?format=csv"
        format = "csv"  # Datastore will always be CSV, so this makes it easier

    print("Format: " + str(format))
    print("File_path: " + str(file_path))
    if file_path is None:
        print("Could not construct file_path")
        return None

    format = str(format).lower()
    # If the s3filestore plugin is enabled, always retrieve files from HTTP
    if ckan.plugins.plugin_loaded('s3filestore'):
        local = False

    if format == "csv":
        output = processCSV(file_path, file_url, local)
    elif format == "docx":
        output = processDOCX(file_url)
    elif format == "ods":
        output = processODS(file_url)
    elif format == "xlsx":
        output = processXLSX(file_url)
    elif format == "pdf":
        output = processPDF(file_url)
    elif format == "geojson" or format == "json":
        output = processJSON(file_url)
    else:
        print("Format %s can't be processed" % format)
        return

    error = output[0]
    file_string = output[1]

    insert_error = False
    if (file_string is None or error != None):
        insert_error = True
    else:
        iscpr = validcpr(file_string)

    if (insert_error):
        print(error)
        try:
            db_config = parse_db_config()
            host = db_config.get('db_host')
            conn = psycopg2.connect(database=db_name, host=host, user="******",
                                    password=d_pass, port=d_port)
        except Exception as e:
            print(e)
            sys.exit()
        # Timestamp is UTC as CKAN stores metadata_modified as UTC
        current_time = datetime.datetime.utcnow()
        insert = """ UPDATE {0}.status
        SET status='error', last_checked = %s, error = %s
        WHERE resource_id = %s
        returning *
        ;"""
        cur = conn.cursor()
        cur.execute(insert.format(db_name), [current_time, error, id])
        conn.commit()
        conn.close()
    else:
        if (not iscpr[0]):  # If we don't have a CPR in the resource
            try:
                db_config = parse_db_config()
                host = db_config.get('db_host')
                conn = psycopg2.connect(database=db_name, host=host, user="******",
                                        password=d_pass, port=d_port)
            except Exception as e:
                print(e)
                sys.exit()
            current_time = datetime.datetime.utcnow()  # Timestamp
            insert = """ UPDATE {0}.status
            SET status='valid', last_checked = %s
            WHERE resource_id = %s
            returning *
            ;"""
            cur = conn.cursor()
            cur.execute(insert.format(db_name), [current_time, id])
            conn.commit()
            conn.close()
        else:
            # We have a CPR number!
            print("Detected a CPR number, if an exception is made nothing will happen")
            try:
                db_config = parse_db_config()
                host = db_config.get('db_host')
                conn = psycopg2.connect(database=db_name, host=host, user="******",
                                        password=d_pass, port=d_port)
            except Exception as e:
                print(e)
                sys.exit()
            current_time = datetime.datetime.utcnow()  # Timestamp
            select = """ SELECT * FROM {0}.status
            WHERE package_id = %s AND excepted IS NOT NULL;
            """
            insert = """ UPDATE {0}.status
            SET status='invalid', last_checked = %s, cpr_number = %s
            WHERE resource_id = %s
            returning *
            ;"""
            cur = conn.cursor()
            cur.execute(select.format(db_name), [resource[0]])
            if (len(cur.fetchall()) > 0):
                # There was an exception made for this resource
                print("Exception was made for package with id: %s ignoring." % resource[0])
                return
            cur.execute(insert.format(db_name), [current_time, iscpr[1], id])
            conn.commit()
            conn.close()

            try:
                print("Making dataset private")
                package_id = resource[0]
                package = get_action('package_show')({}, {'id': package_id})
            except Exception as e:
                print("Could not show package")
                print(e.message)
                sys.exit(1)
            try:
                if (package["private"] == True):
                    # If the dataset is already private, we do not need to send
                    # an email, otherwise we spam
                    return
                package["private"] = True
                get_action('package_update')({}, package)
                print("Made dataset with package id: " + package_id +
                      " private as it contains CPR data. Either add an exception or remove it from the site")
                print("When an exception has been made or data altered, kindly mark data as public again")

                recipient = config.get('ckan.cprvalidation.email', None)
                subject = "CPR fundet i datasæt: %s" % resource[0]
                body = "CPR data er fundet i datasættet med id: %s specifikt resourcen med id: %s \n Data er gjort privat, tjek data igennem og " \
                       "publicer igen eller tilføj en exception hvis du mener data ikke indeholder CPR og kan stå" \
                       " inde for dette." % (resource[0], id)
                try:
                    process = subprocess.Popen(
                        ['mail', '-s', subject, '-r', "*****@*****.**", recipient],
                        stdin=subprocess.PIPE)
                except Exception, error:
                    print error
                process.communicate(body)
            except Exception as e:
                print("Could not update package")
                print(e.message)
                sys.exit(1)

def view(self):
    context = {
        'model': model,
        'session': model.Session,
        'user': c.user,
        'for_view': True,
        'auth_user_obj': c.userobj
    }

    # Get configuration parameters
    site_url = config.get('ckan.site_url') + config.get(
        'ckan.root_path').replace('{{LANG}}', '')

    # Initialise variables
    now = d.datetime.now()
    year_from = 1989
    year_to = now.year

    # Check whether the user is logged in or not
    logged_in = False
    if 'user' in context and context['user']:
        logged_in = True

    # If the public catalogue is requested, ignore the logged-in user
    if logged_in and 'public' in request.params and request.params.get('public') == 'true':
        logged_in = False

    if not logged_in:
        # Get the catalogue for anonymous users
        packages = t.get_action('package_search')(context, {
            'include_private': False,
            'rows': 1000,
            'sort': 'name asc'
        })
        packages = packages['results']
    elif c.userobj.sysadmin:
        # Get the catalogue for sysadmins
        packages = t.get_action('package_search')(context, {
            'include_private': True,
            'rows': 1000,
            'sort': 'name asc'
        })
        packages = packages['results']
    else:
        # Get the catalogue for logged-in users: only datasets from their organisations
        user_org = t.get_action('organization_list_for_user')(
            context, {'permission': 'create_dataset'})
        org_facets = ''
        for org in user_org:
            if org_facets != '':
                org_facets = org_facets + ' OR '
            org_facets = org_facets + org['name']
        packages = t.get_action('package_search')(context, {
            'fq': 'organization:(' + org_facets + ')',
            'include_private': True,
            'rows': 1000,
            'sort': 'name asc'
        })
        packages = packages['results']

    # Get the formats
    formats = t.get_action('format_autocomplete')(context, {'q': '', 'limit': 50})
    # it may return duplicate formats; convert it to a set (which removes the
    # duplicates) and back to a list
    formats = list(set(formats))
    for format in formats:
        format_strip = format.strip()
        if not format_strip:
            formats.remove(format)

    # Connect to the Drupal database to get the number of comments for each
    # dataset and store the values
    dbc = parse_db_config('ckan.drupal.url')
    ckan_conn_string = "host='%s' port='%s' dbname='%s' user='%s' password='%s'" % (
        dbc['db_host'], dbc['db_port'], dbc['db_name'], dbc['db_user'], dbc['db_pass'])
    ckan_conn = psycopg2.connect(ckan_conn_string)
    ckan_cursor = ckan_conn.cursor()
    ckan_cursor.execute(
        """SELECT OP.pkg_name, COUNT(*) FROM opendata_package OP
           INNER JOIN node N ON N.tnid = OP.pkg_node_id
           INNER JOIN comment C ON C.nid = N.nid
           WHERE N.tnid != 0
           GROUP BY OP.pkg_name;""")
    comments = {}
    for row in ckan_cursor:
        comments.update({row[0]: row[1]})
    ckan_cursor.close()
    ckan_conn.close()

    sql_downloads = '''select sum(count) AS downloads, sum(count_absolute) AS downloads_absolute, t.tracking_type, p.name
                       from tracking_summary t
                       inner join resource r ON r.id = t.resource_id
                       inner join package p ON p.id = r.package_id
                       GROUP BY p.name, t.tracking_type;'''
    results_downloads = model.Session.execute(sql_downloads)
    downloads = {}
    downloads_absolute = {}
    api_access_number = {}
    api_access_number_absolute = {}
    for row in results_downloads:
        if row.tracking_type == 'resource':
            downloads.update({row.name: row.downloads})
            downloads_absolute.update({row.name: row.downloads_absolute})
        else:
            api_access_number.update({row.name: row.downloads})
            api_access_number_absolute.update({row.name: row.downloads_absolute})

    sql_views = '''SELECT t.tracking_date, t.running_total, t.recent_views, t.package_id
                   FROM tracking_summary t
                   INNER JOIN (SELECT package_id, MAX(tracking_date) AS tracking_date
                               FROM tracking_summary GROUP BY package_id) t2
                       ON t.package_id = t2.package_id
                   INNER JOIN package p ON p.id = t.package_id AND t.tracking_date = t2.tracking_date;'''
    results_views = model.Session.execute(sql_views)
    tracking_total = {}
    tracking_recent = {}
    for row in results_views:
        tracking_total.update({row.package_id: row.running_total})
        tracking_recent.update({row.package_id: row.recent_views})

    # Add the information we need to display for each dataset
    for package in packages:
        for key in package['notes_translated']:
            if package['notes_translated'][key]:
                package['notes_translated'][key] = package['notes_translated'][key].replace('\n', ' ').replace('\r', ' ')

        # Build a string with the tags
        tags = ''
        for tag in package['tags']:
            tags = tags + ' ' + tag['display_name']
        package['flattened_tags'] = tags

        # Build a string with the formats of its resources, get the total downloads
        # and the dataset's openness_score, and work out whether the dataset is automated
        flattened_formats = ','
        qa = 0
        automatic = 'N'
        if 'update_string' in package and package['update_string']:
            automatic = 'S'
        for resource in package['resources']:
            if resource['format'].lower() not in flattened_formats:
                # Surround it with extra characters so that strings contained in
                # other strings do not match (e.g. XLS and XLSX)
                flattened_formats = flattened_formats + resource['format'].lower() + ','
            if automatic == 'N':
                if (not resource['url_type'] == 'upload'
                        and not '/resources/opendata/' in resource['url']
                        and not '/resource/' + resource['id'] + '/download/' in resource['url']):
                    automatic = 'S'
            if 'qa' in resource:
                resource_qa = ast.literal_eval(resource['qa'])
                if (resource_qa['openness_score'] > qa):
                    qa = int(resource_qa['openness_score'])
        package['flattened_formats'] = flattened_formats
        package['automatic'] = automatic
        package['qa'] = qa

        # Build the formats table for each dataset
        package['formats'] = OrderedDict()
        for format in formats:
            format_value = 'N'
            if ',' + format + ',' in flattened_formats:
                format_value = 'S'
            package['formats'][format] = format_value

        # Build the years table for each dataset
        package['years'] = OrderedDict()
        for year in range(year_from, year_to + 1):
            year_value = 'N'
            if 'Any ' + str(year) in package['flattened_tags']:
                year_value = 'S'
            package['years'][year] = year_value

        # Escape the text fields
        self.escape_text(package)
        self.escape_translated_text(package)

        # Get the number of comments
        if (package['name'] in comments):
            package['comments'] = comments[package['name']]
        else:
            package['comments'] = 0
        if (package['name'] in downloads):
            package['downloads'] = downloads[package['name']]
        else:
            package['downloads'] = 0
        if (package['name'] in downloads_absolute):
            package['downloads_absolute'] = downloads_absolute[package['name']]
        else:
            package['downloads_absolute'] = 0
        if (package['name'] in api_access_number):
            package['api_access_number'] = api_access_number[package['name']]
        else:
            package['api_access_number'] = 0
        if (package['name'] in api_access_number_absolute):
            package['api_access_number_absolute'] = api_access_number_absolute[package['name']]
        else:
            package['api_access_number_absolute'] = 0
        if (package['id'] in tracking_total):
            package['tracking_total'] = tracking_total[package['id']]
        else:
            package['tracking_total'] = 0
        if (package['id'] in tracking_recent):
            package['tracking_recent'] = tracking_recent[package['id']]
        else:
            package['tracking_recent'] = 0

    curdate = d.datetime.now().strftime('%Y-%m-%d_%H-%M')
    t.response.headers['Content-Type'] = 'application/csv; charset=utf-8'
    t.response.headers['Content-Disposition'] = 'attachment; filename=catalegBCN_' + curdate + '.csv'
    return t.render('cataleg.csv',
                    extra_vars={
                        'site_url': site_url,
                        'packages': packages,
                        'logged_in': logged_in,
                        'formats': formats,
                        'year_from': year_from,
                        'year_to': year_to,
                        'user': c.user,
                        'auth_user_obj': c.userobj,
                        'request': request
                    })

def _get_read_only_user(data_dict):
    parsed = cli.parse_db_config("ckan.datastore.read_url")
    return parsed["db_user"]

def _get_read_only_user(data_dict):
    parsed = cli.parse_db_config('ckan.datastore.read_url')
    return parsed['db_user']

def updateSchema(resources):
    # Connect to the database
    d_port = config.get('ckan.cprvalidation.postgres_port', None)
    d_pass = config.get('ckan.cprvalidation.cprvalidation_password', None)
    db_name = config.get('ckan.cprvalidation.cprvalidation_db', None)
    try:
        db_config = parse_db_config()
        host = db_config.get('db_host')
        conn = psycopg2.connect(database=db_name, host=host, user="******",
                                password=d_pass, port=d_port)
    except Exception as e:
        print(e)
        sys.exit()

    # Fetch all resources from the database
    print("Looking for new resources..")
    cur = conn.cursor()
    print db_name
    cur.execute("""SELECT resource_id, last_updated FROM {0}.status; """.format(db_name))
    database_resources = cur.fetchall()

    # These are new resources
    difference_insert = list(
        set([str(r['id']) for r in resources]) -
        set(r[0] for r in database_resources))
    difference_update = list(
        set([(str(r['metadata_modified']).replace("T", " ")) for r in resources]) -
        set(str(r[1]) for r in database_resources))

    insert = """ INSERT INTO {0}.status values %s
    ON CONFLICT (resource_id) DO UPDATE SET last_updated = %s
    returning *
    ;"""
    update = """ UPDATE {0}.status SET last_updated = %s
    WHERE resource_id = %s
    returning *
    ;"""
    '''
    Overview of the table order
    (
      package_id character varying NOT NULL,
      resource_id character varying NOT NULL,
      status character varying, -- valid, invalid, pending
      format character varying NOT NULL,
      url character varying,
      url_type character varying,
      datastore_active character varying,
      last_checked character varying,
      last_updated character varying,
      excepted BOOLEAN
      CONSTRAINT status_pkey PRIMARY KEY (resource_id)
    )
    '''

    # For each new resource, add them to the schema and set their status to pending
    count = 0
    for id in difference_insert:
        count += 1
        dict = find(resources, "id", id)
        i = (
            dict["package_id"],
            dict["id"],
            "pending",
            str(dict["format"]).lower(),
            dict["url"],
            dict["url_type"],
            dict["datastore_active"],
            None,
            dict["metadata_modified"],
        )
        u = dict["metadata_modified"]
        cur.execute(insert.format(db_name), (i, u))
    print("Inserted %d new resources to the database \n" % count)

    #
    # Update the information for last_updated
    #
    count = 0
    for date in difference_update:
        # Multiple resources can share the same metadata_modified, so check them all
        dicts = findall(resources, "metadata_modified", date.replace(" ", "T"))
        for dict in dicts:
            count += 1
            i = dict["metadata_modified"]
            try:
                cur.execute(update.format(db_name), (i, dict["id"]))
            except Exception as e:
                print(e.message)
    print("Updated %d new resources to the database \n" % count)

    try:
        conn.commit()
        conn.close()
    except Exception as e:
        print(e.message)