def run_job(job):
    """Worker function to index each row in each table in the database.

    Connects to ZMQ and SQL Server via the ``job`` object, then for every
    table: builds a schema entry, selects the configured columns (plus
    positional geometry helper columns), and sends one indexing entry per
    row.  When related tables are configured, related rows are emitted as
    separate "link" entries and attached to the parent entry.

    NOTE(review): all SQL here is built with str.format() on table/column
    names coming from configuration — acceptable only if the config is
    trusted; parameterization is not possible for identifiers but values
    should be reviewed.
    """
    job.connect_to_zmq()
    job.connect_to_database()
    tables = get_tables(job)
    for tbl in set(tables):
        geo = {}                 # geographic payload ('code', 'wkt', lat/lon or extent) for the current table
        has_shape = False        # True when the table has a geometry column
        is_point = False         # True when that geometry is of type Point
        shape_field_name = ''    # name of the geometry column, '' if none
        # --------------------------------------------------------------------------------------------------
        # Get the table schema.
        # --------------------------------------------------------------------------------------------------
        schema = {}
        schema['name'] = tbl
        # Get the primary key.
        qry = "SELECT K.COLUMN_NAME FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS C JOIN " \
              "INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS K ON " \
              "C.TABLE_NAME = K.TABLE_NAME AND C.CONSTRAINT_NAME = K.CONSTRAINT_NAME WHERE " \
              "C.CONSTRAINT_TYPE = 'PRIMARY KEY' AND K.TABLE_NAME = '{0}'".format(tbl)
        cols = job.execute_query(qry).fetchall()
        primary_key = ''
        if cols:
            # Only the first primary-key column is recorded (composite keys are not fully represented).
            primary_key = cols[0][0]
        # Get the foreign key.
        qry = "SELECT K.COLUMN_NAME FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS C JOIN " \
              "INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS K ON " \
              "C.TABLE_NAME = K.TABLE_NAME AND C.CONSTRAINT_NAME = K.CONSTRAINT_NAME WHERE " \
              "C.CONSTRAINT_TYPE = 'FOREIGN KEY' AND K.TABLE_NAME = '{0}'".format(tbl)
        cols = job.execute_query(qry).fetchall()
        foreign_key = ''
        if cols:
            foreign_key = cols[0][0]
        # Get the columns that have indexes.
        qry = "SELECT COL_NAME(ic.object_id,ic.column_id) AS column_name " \
              "FROM sys.indexes AS i INNER JOIN sys.index_columns AS ic ON i.object_id = ic.object_id " \
              "AND i.index_id = ic.index_id WHERE i.object_id = OBJECT_ID('{0}')".format(tbl)
        cols = job.execute_query(qry).fetchall()
        indexed_cols = []
        if cols:
            for col in cols:
                indexed_cols.append(col[0])
        # Build the per-column schema description (name, type, key/index/nullability properties).
        schema_columns = []
        for col in job.db_cursor.columns(table=tbl).fetchall():
            column = {}
            props = []
            column['name'] = col.column_name
            column['type'] = col.type_name
            if col.type_name == 'geometry':
                column['isGeo'] = True
                # STSrid of the first row is taken as the CRS for the whole column.
                column['crs'] = job.db_cursor.execute("select {0}.STSrid from {1}".format(col.column_name, tbl)).fetchone()[0]
            if col.column_name == primary_key:
                props.append('PRIMARY KEY')
            if col.column_name == foreign_key:
                props.append('FOREIGN KEY')
            if col.column_name in indexed_cols:
                props.append('INDEXED')
            if col.is_nullable == 'YES':
                props.append('NULLABLE')
            else:
                props.append('NOTNULLABLE')
            column['properties'] = props
            schema_columns.append(column)
        schema['fields'] = schema_columns
        # --------------------------------
        # Get the list of columns to keep.
        # --------------------------------
        if not job.fields_to_keep == ['*']:
            columns = []
            column_types = {}
            for col in job.fields_to_keep:
                # fields_to_keep entries may contain SQL LIKE wildcards.
                qry = "select column_name, data_type from INFORMATION_SCHEMA.columns where table_name = '{0}' and column_name like '{1}'".format(tbl, col)
                for c in job.execute_query(qry).fetchall():
                    if not c.type_name == 'geometry':
                        columns.append("{0}.{1}".format(tbl, c[0]))
                        column_types[c[0]] = c[1]
                    else:
                        shape_field_name = c.column_name
        else:
            columns = []
            column_types = {}
            for c in job.db_cursor.columns(table=tbl).fetchall():
                if not c.type_name == 'geometry':
                    columns.append("{0}.{1}".format(tbl, c.column_name))
                    column_types[c.column_name] = c.type_name
                else:
                    shape_field_name = c.column_name
        if job.fields_to_skip:
            for col in job.fields_to_skip:
                qry = "select column_name from INFORMATION_SCHEMA.columns where table_name = '{0}' and column_name like '{1}'".format(tbl, col)
                # List comprehension used purely for the remove() side effect.
                [columns.remove("{0}.{1}".format(tbl, c[0])) for c in job.execute_query(qry).fetchall()]
        # --------------------------------------------------------------------------------------------------------
        # Get the column names and types from the related tables.
        # --------------------------------------------------------------------------------------------------------
        related_columns = []
        if job.related_tables:
            for related_table in job.related_tables:
                for c in job.db_cursor.columns(table=related_table):
                    if not c.type_name == 'geometry':
                        related_columns.append("{0}.{1}".format(related_table, c.column_name))
        # --------------------------------------------------------------------------------------------------------
        # Check for a geometry column and pull out X,Y for points and extent coordinates for other geometry types.
        # --------------------------------------------------------------------------------------------------------
        geom_type = ''
        if shape_field_name:
            has_shape = True
            srid = job.db_cursor.execute("select {0}.STSrid from {1}".format(shape_field_name, tbl)).fetchone()[0]
            geo['code'] = srid
            geom_type = job.db_cursor.execute("select {0}.STGeometryType() from {1}".format(shape_field_name, tbl)).fetchone()[0]
            if geom_type == 'Point':
                is_point = True
                # Resulting column order for points: row[0]=Y, row[1]=X (each insert goes to position 0).
                columns.insert(0, "{0}.{1}.STPointN(1).STX as X".format(tbl, shape_field_name))
                columns.insert(0, "{0}.{1}.STPointN(1).STY as Y".format(tbl, shape_field_name))
            else:
                # Resulting column order for non-points: row[0]=WKT, row[1]=XMIN, row[2]=YMIN, row[3]=XMAX, row[4]=YMAX.
                columns.insert(0, "{0}.{1}.STEnvelope().STPointN((3)).STY as YMAX".format(tbl, shape_field_name))
                columns.insert(0, "{0}.{1}.STEnvelope().STPointN((3)).STX as XMAX".format(tbl, shape_field_name))
                columns.insert(0, "{0}.{1}.STEnvelope().STPointN((1)).STY as YMIN".format(tbl, shape_field_name))
                columns.insert(0, "{0}.{1}.STEnvelope().STPointN((1)).STX as XMIN".format(tbl, shape_field_name))
                columns.insert(0, "{0}.{1}.STAsText() as WKT".format(tbl, shape_field_name))
        # -----------------------------
        # Query the table for the rows.
        # -----------------------------
        sql_query = job.get_table_query(tbl)
        if not sql_query:
            row_count = float(job.db_cursor.execute("select Count(*) from {0}".format(tbl)).fetchone()[0])
            rows = job.db_cursor.execute("select {0} from {1}".format(','.join(columns), tbl))
        else:
            # Reuse everything from FROM onward so the count and the select share the same filter/joins.
            q = re.search('FROM(.*)', sql_query, re.IGNORECASE).group(0)
            try:
                row_count = float(job.db_cursor.execute("select Count(*) {0}".format(q)).fetchone()[0])
            except Exception:
                # Count(*) cannot be combined with ORDER BY — retry with the ORDER BY clause stripped.
                row_count = float(job.db_cursor.execute("select Count(*) {0}".format(q.split('ORDER BY')[0])).fetchone()[0])
            rows = job.execute_query("select {0} {1}".format(','.join(columns + related_columns), q))
        # -----------------------------------------------------------------------------
        # Index each row in the table. If there are relates, index the related records.
        # -----------------------------------------------------------------------------
        cur_id = -1              # id (row[0]) of the parent row currently being accumulated
        entry = {}               # entry being built across iterations (flushed when the parent id changes)
        link = {}                # most recent related-record link
        wkt_col = -1             # index of the WKT field within mapped_fields, -1 if absent
        action_type = job.action_type
        discovery_id = job.discovery_id
        location_id = job.location_id
        # Reduce "table.column" (or "table.shape.expr") to the bare column name for field mapping.
        columns = [c.split('.')[1] for c in columns]
        mapped_fields = job.map_fields(tbl, columns, column_types)
        increment = job.get_increment(row_count)
        if 'WKT' in columns:
            has_shape = True
            try:
                wkt_col = mapped_fields.index('fs_WKT')
            except ValueError:
                wkt_col = mapped_fields.index('WKT')
        geometry_ops = worker_utils.GeometryOps()
        generalize_value = job.generalize_value
        # -----------------------------------------------
        # Add an entry for the table itself with schema.
        # -----------------------------------------------
        mapped_cols = {}
        schema['rows'] = row_count
        table_entry = {}
        table_entry['id'] = '{0}_{1}'.format(location_id, tbl)
        table_entry['location'] = location_id
        table_entry['action'] = action_type
        table_entry['format_type'] = 'Schema'
        table_entry['entry'] = {'fields': {'_discoveryID': discovery_id, 'name': tbl, 'path': job.sql_server_connection_str, 'format_type': 'Schema'}}
        table_entry['entry']['fields']['schema'] = schema
        job.send_entry(table_entry)
        for i, row in enumerate(rows):
            # A new parent row starts whenever row[0] changes (or always, when no relates are configured).
            if not cur_id == row[0] or not job.related_tables:
                if entry:
                    try:
                        # Flush the previously accumulated entry before starting a new one.
                        job.send_entry(entry)
                    except Exception as ex:
                        # NOTE(review): send failures are silently swallowed and the row is skipped;
                        # `ex` is never logged — consider reporting via status_writer.
                        entry = {}
                        continue
                    entry = {}
                if has_shape:
                    if is_point:
                        # Column order set above: row[0]=Y (lat), row[1]=X (lon).
                        geo['lon'] = row[1]
                        geo['lat'] = row[0]
                        mapped_cols = dict(zip(mapped_fields[2:], row[2:]))
                        mapped_cols['geometry_type'] = 'Point'
                    else:
                        if generalize_value == 0 or generalize_value == 0.0:
                            # No generalization: index the full WKT.
                            if wkt_col >= 0:
                                geo['wkt'] = row[wkt_col]
                                mapped_cols = dict(zip(mapped_fields, row))
                            else:
                                geo['wkt'] = row[0]
                        elif generalize_value > 0.9:
                            # Heavy generalization: fall back to the bounding extent only.
                            if wkt_col >= 0:
                                geo['wkt'] = row[wkt_col]
                                mapped_cols = dict(zip(mapped_fields, row))
                            else:
                                geo['xmin'] = row[1]
                                geo['ymin'] = row[2]
                                geo['xmax'] = row[3]
                                geo['ymax'] = row[4]
                        else:
                            # Partial generalization of the geometry's WKT.
                            if wkt_col >= 0:
                                geo['wkt'] = geometry_ops.generalize_geometry(str(row[wkt_col]), generalize_value)
                                mapped_cols = dict(zip(mapped_fields, row))
                            else:
                                geo['wkt'] = geometry_ops.generalize_geometry(str(row[0]), generalize_value)
                        if not mapped_cols:
                            # Skip the 5 positional geometry columns (WKT, XMIN, YMIN, XMAX, YMAX).
                            mapped_cols = dict(zip(mapped_fields[5:], row[5:]))
                        if 'Polygon' in geom_type:
                            mapped_cols['geometry_type'] = 'Polygon'
                        elif 'Polyline' in geom_type:
                            mapped_cols['geometry_type'] = 'Polyline'
                        else:
                            mapped_cols['geometry_type'] = 'Point'
                else:
                    mapped_cols = dict(zip(mapped_fields, row))
                # Create an entry to send to ZMQ for indexing.
                mapped_cols['format_type'] = 'Record'
                mapped_cols['format'] = 'application/vnd.sqlserver.record'
                # Prefix a random number so ids stay unique across workers/runs.
                if 'id' in mapped_cols:
                    mapped_cols['id'] = '{0}{1}'.format(random.randint(0, 1000000), mapped_cols['id'])
                else:
                    mapped_cols['id'] = "{0}{1}".format(random.randint(0, 1000000), i)
                entry['id'] = '{0}_{1}_{2}'.format(location_id, tbl, i)
                entry['location'] = location_id
                entry['action'] = action_type
                # If the table supports relates/joins, handle them and add them as links.
                if job.related_tables:
                    links = []
                    # Related columns trail the parent's own columns in the row/cursor description.
                    related_field_names = [d[0] for d in row.cursor_description[len(columns):]]
                    related_field_types = dict(zip(related_field_names, [d[1] for d in row.cursor_description[len(columns):]]))
                    mapped_related_fields = []
                    for related_table in job.related_tables:
                        mapped_related_fields += job.map_fields(related_table, related_field_names, related_field_types)
                    # NOTE(review): 'relation' is set on the old link dict and then immediately
                    # discarded by the rebinding on the next line — likely the order is inverted.
                    link['relation'] = 'contains'
                    link = dict(zip(mapped_related_fields, row[len(columns):]))
                    try:
                        link['id'] = "{0}{1}".format(random.randint(0, 1000000), link['id'])
                    except KeyError:
                        link['id'] = "{0}{1}".format(random.randint(0, 1000000), i)
                    # Send this link as an entry and set extract to true.
                    link_entry = {}
                    link_entry['id'] = "{0}{1}".format(link['id'], location_id)
                    link_entry['action'] = action_type
                    link_entry['entry'] = {"fields": link}
                    if job.format:
                        link_entry['entry']['fields']['__to_extract'] = True
                    job.send_entry(link_entry)
                    # Append the link to a list that will be part of the main entry.
                    links.append(link)
                    if geo:
                        entry['entry'] = {'geo': geo, 'fields': mapped_cols, 'links': links}
                    else:
                        entry['entry'] = {'fields': mapped_cols, 'links': links}
                else:
                    if geo:
                        entry['entry'] = {'geo': geo, 'fields': mapped_cols}
                    else:
                        entry['entry'] = {'fields': mapped_cols}
                entry['entry']['fields']['_discoveryID'] = discovery_id
                # NOTE(review): duplicated assignment — the line above already set _discoveryID.
                entry['entry']['fields']['_discoveryID'] = discovery_id
                cur_id = row[0]
            else:
                # Same parent id as the previous row: this row carries another related record only.
                link['relation'] = 'contains'
                link = dict(zip(mapped_related_fields, row[len(columns):]))
                try:
                    link['id'] = "{0}{1}".format(random.randint(0, 1000000), link['id'])
                except KeyError:
                    link['id'] = "{0}{1}".format('0000', i)
                link_entry = {}
                link_entry['id'] = "{0}{1}".format(link['id'], location_id)
                link_entry['action'] = action_type
                link_entry['entry'] = {"fields": link}
                if job.format:
                    link_entry['entry']['fields']['__to_extract'] = True
                job.send_entry(link_entry)
                links.append(link)
                # Ensure the accumulated entry carries the (possibly grown) links list.
                entry['entry']['links'] = entry['entry'].pop('links', links)
            # Report status percentage.
            if (i % increment) == 0:
                status_writer.send_percent(i / row_count, '{0}: {1:%}'.format(tbl, i / row_count), 'sql_server')
        # Send final entry.
        job.send_entry(entry)
        status_writer.send_percent(1, '{0}: {1:%}'.format(tbl, 1), 'sql_server')
def run_job(mongodb_job):
    """Worker function to index each document in each collection in the database.

    Connects to ZMQ and MongoDB via the job object, iterates the configured
    collections, emits one indexing entry per document (with geo information
    extracted from a 'loc' field when present), and finally sends a schema
    entry describing the collection.

    NOTE(review): the stored table query is passed through eval() — this
    executes arbitrary Python from configuration and must only be used with
    trusted config.  Python 2 only (iteritems, dict.items() concatenation).
    """
    job = mongodb_job
    job.connect_to_zmq()
    job.connect_to_database()
    collection_names = get_collections(job)
    grid_fs = None
    for collection_name in collection_names:
        if job.has_gridfs:
            # GridFS buckets expose a "<bucket>.files" collection; open the bucket when we see one.
            if collection_name.find('.files') > 0:
                grid_fs = gridfs.GridFS(job.db_connection, collection_name.split('.')[0])
        col = job.db_connection[collection_name]
        query = job.get_table_query(col)
        if query:
            # NOTE(review): eval() on configuration text — see docstring.
            documents = col.find(eval(query))
        else:
            documents = col.find()
        # Index each document -- get a suitable base 10 increment for reporting percentage.
        increment = job.get_increment(documents.count())
        geometry_ops = worker_utils.GeometryOps()
        generalize_value = job.generalize_value
        for i, doc in enumerate(documents):
            fields = doc.keys()
            field_types = dict((k, type(v)) for k, v in doc.iteritems())
            if grid_fs:
                grid_out = grid_fs.get(doc['_id'])
                if hasattr(grid_out, 'metadata'):
                    #TODO: Determine how to ingest files stored in the database.
                    #with open(r"c:\temp\{0}".format(grid_out.filename), "wb") as fp:
                        #fp.write(grid_out.read())
                    # Flatten the GridFS metadata dict into the document's own fields/values.
                    fields += grid_out.metadata.keys()
                    field_types = dict(field_types.items() + dict((k, type(v)) for k, v in grid_out.metadata.iteritems()).items())
                    values = [doc[k] for k in doc.keys() if not k == 'metadata']
                    values += grid_out.metadata.values()
                    fields.remove('metadata')
            else:
                # NOTE(review): when grid_fs is set but grid_out has no 'metadata'
                # attribute, `values` is left unbound for this document — TODO confirm
                # against the original indentation of this else branch.
                values = doc.values()
            entry = {}
            geo = {}
            geo_json_converter = worker_utils.GeoJSONConverter()
            if 'loc' in doc:
                if 'type' in doc['loc']:
                    # GeoJSON-style loc ({'type': ..., 'coordinates': ..., optional 'bbox'}).
                    if 'bbox' in doc['loc']:
                        if job.include_wkt:
                            wkt = geo_json_converter.convert_to_wkt(doc['loc'], 3)
                            if generalize_value == 0:
                                geo['wkt'] = wkt
                            else:
                                geo['wkt'] = geometry_ops.generalize_geometry(wkt, generalize_value)
                        geo['xmin'] = doc['loc']['bbox'][0]
                        geo['ymin'] = doc['loc']['bbox'][1]
                        geo['xmax'] = doc['loc']['bbox'][2]
                        geo['ymax'] = doc['loc']['bbox'][3]
                    elif 'Point' in doc['loc']['type']:
                        if job.include_wkt:
                            geo['wkt'] = geo_json_converter.convert_to_wkt(doc['loc'], 3)
                        geo['lon'] = doc['loc']['coordinates'][0]
                        geo['lat'] = doc['loc']['coordinates'][1]
                    else:
                        status_writer.send_state(status.STAT_WARNING, 'No bbox information for {0}.'.format(doc['_id']))
                elif isinstance(doc['loc'][0], float):
                    # Legacy coordinate pair: [lon, lat].
                    geo['lon'] = doc['loc'][0]
                    geo['lat'] = doc['loc'][1]
                else:
                    # Legacy box: [[xmin, xmax], [ymin, ymax]].
                    geo['xmin'] = doc['loc'][0][0]
                    geo['xmax'] = doc['loc'][0][1]
                    geo['ymin'] = doc['loc'][1][0]
                    geo['ymax'] = doc['loc'][1][1]
                # The raw loc field is not indexed as a regular attribute.
                fields.remove('loc')
            mapped_fields = job.map_fields(col.name, fields, field_types)
            mapped_fields = dict(zip(mapped_fields, values))
            mapped_fields['_discoveryID'] = job.discovery_id
            mapped_fields['title'] = col.name
            mapped_fields['format_type'] = 'Record'
            mapped_fields['format'] = 'application/vnd.mongodb.record'
            entry['id'] = str(doc['_id'])
            entry['location'] = job.location_id
            entry['action'] = job.action_type
            entry['entry'] = {'geo': geo, 'fields': mapped_fields}
            job.send_entry(entry)
            if (i % increment) == 0:
                status_writer.send_percent(float(i) / documents.count(), '{0}: {1:%}'.format(collection_name, float(i)/documents.count()), 'MongoDB')
        # ------------------------------------------------------------------
        # Build and send the collection's schema entry (uses the fields/geo
        # observed on the LAST document iterated above).
        # ------------------------------------------------------------------
        schema = {}
        # NOTE(review): dead assignment — immediately overwritten by collection_name below.
        schema['name'] = col.full_name
        schema['name'] = collection_name
        schema['rows'] = documents.count()
        schema_columns = []
        index_info = col.index_information()
        indexes = []
        for v in index_info.values():
            # First key of each index spec, e.g. ('fieldname', direction).
            indexes.append(v['key'][0][0])
        for n, t in dict(zip(fields, field_types.values())).items():
            column = {}
            props = []
            column['name'] = n
            column['type'] = t
            if n in indexes:
                props.append('INDEXED')
            if 'ObjectId' in str(t) or n == '_id':
                props.append('NOTNULLABLE')
                props.append('PRIMARY KEY')
            else:
                props.append('NULLABLE')
            column['properties'] = props
            schema_columns.append(column)
        if geo:
            props = []
            if 'loc' in indexes:
                props.append('INDEXED')
            schema_columns.append({'name': 'loc', 'isGeo': True, 'properties': props})
        schema['fields'] = schema_columns
        # Add an entry for the table itself with schema.
        table_entry = {}
        table_entry['id'] = '{0}_{1}'.format(job.location_id, collection_name)
        table_entry['location'] = job.location_id
        table_entry['action'] = job.action_type
        table_entry['format_type'] = 'Schema'
        table_entry['entry'] = {'fields': {'_discoveryID': job.discovery_id, 'name': collection_name, 'path': job.mongodb_client_info}}
        table_entry['entry']['fields']['schema'] = schema
        job.send_entry(table_entry)
def run_job(oracle_job):
    """Worker function to do the indexing.

    Connects to ZMQ and Oracle via the job object, gathers tables, layers
    and views, and for each: builds a schema entry, selects the configured
    columns (with positional SDO_GEOMETRY / ST_GEOMETRY helper columns for
    spatial tables), then sends one indexing entry per row.

    NOTE(review): SQL statements are assembled with str.format() on
    identifiers from configuration — trusted-config only.  Python 2 only
    (izip).
    """
    job = oracle_job
    job.connect_to_zmq()
    job.connect_to_database()
    # Larger fetch batches to reduce round trips while indexing.
    job.db_cursor.arraysize = 250
    all_tables = []
    all_tables += get_tables(job)
    all_tables += get_layers(job)
    all_tables += get_views(job)
    if not all_tables:
        status_writer.send_state(
            status.STAT_FAILED,
            "No tables, views or layers found. Check the configuration.")
        return
    # Begin indexing.
    for tbl in set(all_tables):
        geo = {}                 # geographic payload for the current table
        columns = []             # select-list column names (geometry helpers are prepended)
        column_types = {}        # column name -> Oracle data type
        is_point = False
        has_shape = False
        geometry_field = None    # name of the geometry column, if any
        shape_type = None
        # ----------------------------------------------------------------------------
        # Check if the table is a layer/view and set name as "owner.table".
        # ----------------------------------------------------------------------------
        if isinstance(tbl, tuple):
            # Layers/views arrive as (table_name, owner).
            column_query = "select column_name, data_type from all_tab_cols where " \
                           "table_name = '{0}' and column_name like".format(tbl[0])
            query = job.get_table_query(tbl[0])
            tbl = "{0}.{1}".format(tbl[1], tbl[0])
        else:
            query = job.get_table_query(tbl)
            column_query = "select column_name, data_type from all_tab_cols where " \
                           "table_name = '{0}' and column_name like".format(tbl)
        # -----------------------------------------------------------------------------------------------------
        # Get the table schema.
        # -----------------------------------------------------------------------------------------------------
        table_schema = {}
        schema_columns = []
        table_schema['name'] = tbl
        # Get primary key and foreign keys (only the first column of each is recorded).
        primary_key_col = ''
        foreign_key_col = ''
        primary_key_qry = "select c.table_name, c.column_name from all_constraints cons, all_cons_columns c " \
                          "where c.table_name = '{0}' and cons.constraint_type like 'P%' " \
                          "and cons.constraint_name = c.constraint_name".format(tbl)
        primary_cols = job.execute_query(primary_key_qry).fetchone()
        if primary_cols:
            primary_key_col = primary_cols[1]
        foreign_key_qry = "select c.table_name, c.column_name from all_constraints cons, all_cons_columns c " \
                          "where c.table_name = '{0}' and cons.constraint_type like 'F%' " \
                          "and cons.constraint_name = c.constraint_name".format(tbl)
        foreign_cols = job.execute_query(foreign_key_qry).fetchone()
        if foreign_cols:
            foreign_key_col = foreign_cols[1]
        # Get columns that are indexed.
        if "." in tbl:
            owner, table = tbl.split('.')
            index_query = "select column_name from all_ind_columns where table_name = '{0}' and index_owner = '{1}'".format(
                table, owner)
        else:
            index_query = "select column_name from all_ind_columns where table_name = '{0}'".format(
                tbl)
        index_columns = [
            c[0] for c in job.execute_query(index_query).fetchall()
        ]
        # Describe every column via a select * and the cursor description tuples:
        # c[0]=name, c[1]=type, c[6]=null_ok.
        for i, c in enumerate(
                job.execute_query(
                    "select * from {0}".format(tbl)).description):
            schema_col = {}
            schema_props = []
            schema_col['name'] = c[0]
            try:
                # field_types is a module-level mapping of driver types to display names.
                schema_col['type'] = field_types[c[1]]
            except (AttributeError, KeyError):
                schema_col['type'] = 'OBJECTVAR'
            try:
                # Geometry detection: either directly from the described type, or by
                # inspecting the cursor fetch variables (SDE user-defined types).
                if c[1] in ('SDO_GEOMETRY', 'ST_GEOMETRY'):
                    schema_col['isGeo'] = True
                elif job.db_cursor.fetchvars[i].type.name == 'ST_GEOMETRY':
                    schema_col['isGeo'] = True
                elif job.db_cursor.fetchvars[i].type.name == 'SDO_GEOMETRY':
                    schema_col['isGeo'] = True
            except AttributeError:
                pass
            if c[6] == 1:
                schema_props.append('NULLABLE')
            else:
                schema_props.append('NOTNULLABLE')
            if c[0] == primary_key_col:
                schema_props.append('PRIMARY KEY')
            if c[0] == foreign_key_col:
                schema_props.append('FOREIGN KEY')
            if c[0] in index_columns:
                schema_props.append('INDEXED')
            if schema_props:
                schema_col['properties'] = schema_props
            schema_columns.append(schema_col)
        table_schema['fields'] = schema_columns
        # ---------------------------------------------------------------------------------------
        # Create the list of columns and column types to include in the index.
        # ---------------------------------------------------------------------------------------
        if not job.fields_to_keep == ['*']:
            for col in job.fields_to_keep:
                # column_query ends with "column_name like"; the LIKE pattern is appended here.
                for c in job.db_cursor.execute("{0} '{1}'".format(
                        column_query, col)).fetchall():
                    columns.append(c[0])
                    column_types[c[0]] = c[1]
                    if c[1] in ('SDO_GEOMETRY', 'ST_GEOMETRY'):
                        has_shape = True
                        geometry_field = c[0]
                        geometry_type = c[1]
        else:
            for i, c in enumerate(
                    job.db_cursor.execute(
                        "select * from {0}".format(tbl)).description):
                columns.append(c[0])
                column_types[c[0]] = c[1]
                try:
                    if c[1] in ('SDO_GEOMETRY', 'ST_GEOMETRY'):
                        has_shape = True
                        geometry_field = c[0]
                        geometry_type = c[1]
                    elif job.db_cursor.fetchvars[i].type.name == 'ST_GEOMETRY':
                        has_shape = True
                        geometry_field = c[0]
                        geometry_type = 'ST_GEOMETRY'
                    elif job.db_cursor.fetchvars[
                            i].type.name == 'SDO_GEOMETRY':
                        has_shape = True
                        geometry_field = c[0]
                        geometry_type = 'SDO_GEOMETRY'
                except AttributeError:
                    continue
        # -----------------------------------
        # Remove fields meant to be excluded.
        # -----------------------------------
        if job.fields_to_skip:
            for col in job.fields_to_skip:
                # Comprehension used purely for the remove() side effect.
                [
                    columns.remove(c[0])
                    for c in job.execute_query("{0} '{1}'".format(
                        column_query, col)).fetchall()
                ]
        # -----------------------------------------------------------
        # If there is a shape column, get the geographic information.
        # -----------------------------------------------------------
        if geometry_field:
            # The raw geometry object column is replaced by scalar helper columns below.
            columns.remove(geometry_field)
            if job.db_cursor.execute("select {0} from {1}".format(
                    geometry_field, tbl)).fetchone() is None:
                status_writer.send_status(
                    "Skipping {0} - no records.".format(tbl))
                continue
            else:
                # Owner schema of the geometry type (e.g. SDE), needed to qualify ST_* operators.
                schema = job.db_cursor.execute("select {0} from {1}".format(
                    geometry_field, tbl)).fetchone()[0].type.schema
            # Figure out if geometry type is ST or SDO.
            if geometry_type == 'SDO_GEOMETRY':
                geo['code'] = job.db_cursor.execute(
                    "select c.{0}.SDO_SRID from {1} c".format(
                        geometry_field, tbl)).fetchone()[0]
                # dimension = job.db_cursor.execute("select c.shape.Get_Dims() from {0} c".format(tbl)).fetchone()[0]
                # A non-null SDO_POINT attribute marks point geometry.
                if not job.db_cursor.execute(
                        "select c.{0}.SDO_POINT from {1} c".format(
                            geometry_field, tbl)).fetchone()[0] is None:
                    is_point = True
                    if geo['code'] == 4326:
                        columns.insert(
                            0, '{0}.{1}.SDO_POINT.Y'.format(
                                schema, geometry_field))
                        columns.insert(
                            0, '{0}.{1}.SDO_POINT.X'.format(
                                schema, geometry_field))
                    else:
                        # NOTE(review): this executes a transform but does not add X/Y
                        # columns — non-4326 SDO points appear to be left unprojected
                        # in the select list; TODO confirm intent.
                        job.db_cursor.execute(
                            "SDO_CS.TRANSFORM({0}.{1}, 4326)".format(
                                schema, geometry_field))
                else:
                    # Non-point SDO: select the minimum bounding rectangle ordinates.
                    columns.insert(
                        0, "sdo_geom.sdo_mbr({0}).sdo_ordinates".format(
                            geometry_field))
            else:
                # ST_GEOMETRY
                shape_type = job.db_cursor.execute(
                    "select {0}.ST_GEOMETRYTYPE({1}) from {2}".format(
                        schema, geometry_field, tbl)).fetchone()[0]
                geo['code'] = int(
                    job.db_cursor.execute(
                        "select {0}.ST_SRID({1}) from {2}".format(
                            schema, geometry_field, tbl)).fetchone()[0])
                if 'POINT' in shape_type:
                    is_point = True
                    # Prepended in reverse so the final order is: astext, x, y.
                    if geo['code'] == 4326 or geo['code'] == 3:
                        for x in ('y', 'x', 'astext'):
                            columns.insert(
                                0, '{0}.st_{1}({2})'.format(
                                    schema, x, geometry_field))
                    else:
                        for x in ('y', 'x', 'astext'):
                            columns.insert(
                                0, '{0}.st_{1}({0}.st_transform({2}, 4326))'.
                                format(schema, x, geometry_field))
                else:
                    # Non-point ST: final order is astext, minx, miny, maxx, maxy.
                    if geo['code'] == 4326:
                        for x in ('maxy', 'maxx', 'miny', 'minx', 'astext'):
                            columns.insert(
                                0, '{0}.st_{1}({2})'.format(
                                    schema, x, geometry_field))
                    else:
                        try:
                            # Probe whether st_transform works for this data before relying on it.
                            job.db_cursor.execute(
                                "select {0}.st_maxy(SDE.st_transform({1}, 4326)) from {2}"
                                .format(schema, geometry_field, tbl))
                            for x in ('maxy', 'maxx', 'miny', 'minx',
                                      'astext'):
                                columns.insert(
                                    0,
                                    '{0}.st_{1}({0}.st_transform({2}, 4326))'.
                                    format(schema, x, geometry_field))
                        except Exception:
                            for x in ('maxy', 'maxx', 'miny', 'minx',
                                      'astext'):
                                columns.insert(
                                    0, '{0}.st_{1}({2})'.format(
                                        schema, x, geometry_field))
        # -------------------------------------------------
        # Drop astext from columns if WKT is not requested.
        # -------------------------------------------------
        # include_wkt = job.include_wkt
        # if not include_wkt and not geometry_type == 'SDO_GEOMETRY':
        #     columns.pop(0)
        # ------------------------------------------------------------
        # Get the count of all the rows to use for reporting progress.
        # ------------------------------------------------------------
        if query:
            row_count = job.db_cursor.execute(
                "select count(*) from {0} where {1}".format(
                    tbl, query)).fetchall()[0][0]
        else:
            row_count = job.db_cursor.execute(
                "select count(*) from {0}".format(tbl)).fetchall()[0][0]
        if row_count == 0 or row_count is None:
            continue
        else:
            row_count = float(row_count)
        # ---------------------------
        # Get the rows to be indexed.
        # ---------------------------
        try:
            # NOTE(review): geometry_type may be unbound here for tables with no
            # geometry column on the fields_to_keep path — TODO confirm.
            if geometry_type == 'SDO_GEOMETRY':
                rows = job.db_cursor.execute("select {0} from {1} {2}".format(
                    ','.join(columns), tbl, schema))
            else:
                # Quick check to ensure ST_GEOMETRY operations are supported.
                row = job.db_cursor.execute("select {0} from {1}".format(
                    ','.join(columns), tbl)).fetchone()
                del row
                if query:
                    rows = job.db_cursor.execute(
                        "select {0} from {1} where {2}".format(
                            ','.join(columns), tbl, query))
                else:
                    rows = job.db_cursor.execute("select {0} from {1}".format(
                        ','.join(columns), tbl))
        except Exception:
            # This can occur for ST_GEOMETRY when spatial operators are un-available (See: http://tinyurl.com/lvvhwyl)
            # Fall back to selecting without the astext helper column.
            columns.pop(0)
            geo['wkt'] = None
            if query:
                rows = job.db_cursor.execute(
                    "select {0} from {1} where {2}".format(
                        ','.join(columns), tbl, query))
            else:
                rows = job.db_cursor.execute("select {0} from {1}".format(
                    ','.join(columns), tbl))
        # Continue if the table has zero records.
        if not rows:
            status_writer.send_status("Skipping {0} - no records.".format(tbl))
            continue
        # ---------------------------------------------------------
        # Index each row.
        # ---------------------------------------------------------
        mapped_fields = job.map_fields(tbl, columns, column_types)
        increment = job.get_increment(row_count)
        location_id = job.location_id
        action_type = job.action_type
        discovery_id = job.discovery_id
        entry = {}
        # First, add an entry for the table itself with schema.
        table_schema['rows'] = row_count
        table_entry = {}
        table_entry['id'] = '{0}_{1}'.format(location_id, tbl)
        table_entry['location'] = location_id
        table_entry['action'] = action_type
        table_entry['format_type'] = 'Schema'
        table_entry['entry'] = {
            'fields': {
                '_discoveryID': discovery_id,
                'name': tbl,
                'path': rows.connection.dsn
            }
        }
        table_entry['entry']['fields']['schema'] = table_schema
        job.send_entry(table_entry)
        if not has_shape:
            for i, row in enumerate(rows):
                try:
                    # Map column names to Voyager fields.
                    mapped_cols = dict(izip(mapped_fields, row))
                    # NOTE(review): dead branch — has_shape is False in this loop, and
                    # geom_fields is only defined in the else branch below.
                    if has_shape:
                        [mapped_cols.pop(name) for name in geom_fields]
                    mapped_cols['_discoveryID'] = discovery_id
                    mapped_cols['meta_table_name'] = tbl
                    mapped_cols['format_type'] = 'Record'
                    mapped_cols['format'] = 'application/vnd.oracle.record'
                    entry['id'] = '{0}_{1}_{2}'.format(location_id, tbl, i)
                    entry['location'] = location_id
                    entry['action'] = action_type
                    entry['entry'] = {'fields': mapped_cols}
                    job.send_entry(entry)
                    if (i % increment) == 0:
                        status_writer.send_percent(
                            i / row_count,
                            "{0}: {1:%}".format(tbl, i / row_count),
                            'oracle_worker')
                except Exception as ex:
                    status_writer.send_status(ex)
                    continue
        else:
            # Mapped field names derived from the geometry helper columns; these are
            # stripped from the record attributes before indexing.
            geom_fields = [
                name for name in mapped_fields
                if '{0}'.format(geometry_field) in name
            ]
            geometry_ops = worker_utils.GeometryOps()
            generalize_value = job.generalize_value
            for i, row in enumerate(rows):
                try:
                    # Positional geometry columns: row[0]=astext/WKT, then x/y (points)
                    # or minx/miny/maxx/maxy (extents) — see the insert order above.
                    # if include_wkt:
                    if is_point:
                        geo['lon'] = row[1]
                        geo['lat'] = row[2]
                    else:
                        if generalize_value == 0 or generalize_value == 0.0:
                            geo['wkt'] = row[0]
                        elif generalize_value > 0.9:
                            geo['xmin'] = row[1]
                            geo['ymin'] = row[2]
                            geo['xmax'] = row[3]
                            geo['ymax'] = row[4]
                        else:
                            geo['wkt'] = geometry_ops.generalize_geometry(
                                str(row[0]), generalize_value)
                    # else:
                    #     if is_point:
                    #         geo['lon'] = row[0]
                    #         geo['lat'] = row[1]
                    #     else:
                    #         if geometry_type == 'SDO_GEOMETRY':
                    #             if dimension == 3:
                    #                 geo['xmin'] = row[0][0]
                    #                 geo['ymin'] = row[0][1]
                    #                 geo['xmax'] = row[0][3]
                    #                 geo['ymax'] = row[0][4]
                    #             elif dimension == 2:
                    #                 geo['xmin'] = row[0][0]
                    #                 geo['ymin'] = row[0][1]
                    #                 geo['xmax'] = row[0][2]
                    #                 geo['ymax'] = row[0][3]
                    #         else:
                    #             geo['xmin'] = row[0]
                    #             geo['ymin'] = row[1]
                    #             geo['xmax'] = row[2]
                    #             geo['ymax'] = row[3]
                    # Map column names to Voyager fields.
                    mapped_cols = dict(izip(mapped_fields, row))
                    [mapped_cols.pop(name) for name in geom_fields]
                    mapped_cols['_discoveryID'] = discovery_id
                    mapped_cols['meta_table_name'] = tbl
                    mapped_cols['format_type'] = 'Record'
                    mapped_cols['format'] = 'application/vnd.oracle.record'
                    entry['id'] = '{0}_{1}_{2}'.format(location_id, tbl, i)
                    entry['location'] = location_id
                    entry['action'] = action_type
                    entry['entry'] = {'geo': geo, 'fields': mapped_cols}
                    job.send_entry(entry)
                    if (i % increment) == 0:
                        status_writer.send_percent(
                            i / row_count,
                            "{0}: {1:%}".format(tbl, i / row_count),
                            'oracle_worker')
                except Exception as ex:
                    status_writer.send_status(ex)
                    continue
def run_job(mysql_job):
    """Worker function to index each row in each table in the MySQL database.

    Connects to ZMQ and MySQL via the job object, then for every table:
    builds a schema entry, selects the configured columns (prepending
    AsText/X/Y geometry helper columns for spatial tables), and sends one
    indexing entry per row.

    NOTE(review): all SQL is built with str.format() on identifiers and
    LIKE patterns from configuration — trusted-config only.  Python 2 only
    (iteritems).
    """
    job = mysql_job
    job.connect_to_zmq()
    job.connect_to_database()
    tables = get_tables(job)
    processed = 0
    for table in tables:
        geo = {}            # geographic payload for the current table
        is_point = False
        has_shape = False
        # --------------------------------------------------------------------------------------------------
        # Get the table schema.
        # --------------------------------------------------------------------------------------------------
        schema = {}
        # Get the primary key.
        qry = "SELECT column_name FROM information_schema.table_constraints JOIN information_schema.key_column_usage USING (CONSTRAINT_NAME, TABLE_NAME) WHERE CONSTRAINT_TYPE = 'PRIMARY KEY' AND table_name = '{0}'".format(
            table)
        cols = job.execute_query(qry).fetchall()
        primary_key = ''
        if cols:
            # Only the first primary-key column is recorded.
            primary_key = cols[0][0]
        # Get the columns which are indexed.
        qry = "SELECT * FROM information_schema.statistics WHERE table_name = '{0}'".format(
            table)
        cols = job.execute_query(qry).fetchall()
        indexed_cols = []
        if cols:
            for col in cols:
                indexed_cols.append(col.COLUMN_NAME)
        # Per-column schema description (name, type, key/index/nullability properties).
        schema_columns = []
        for col in job.db_cursor.columns(table=table).fetchall():
            column = {}
            props = []
            column['name'] = col.column_name
            column['type'] = col.type_name
            if col.column_name == primary_key:
                props.append('PRIMARY KEY')
            if col.column_name in indexed_cols:
                props.append('INDEXED')
            if col.is_nullable == 'YES':
                props.append('NULLABLE')
            else:
                props.append('NOTNULLABLE')
            column['properties'] = props
            schema_columns.append(column)
        schema['fields'] = schema_columns
        schema['name'] = table
        # --------------------------------------------------------------------------------------------------
        # Set up the fields to index.
        # --------------------------------------------------------------------------------------------------
        columns = []
        column_types = {}
        if not job.fields_to_keep == ['*']:
            for col in job.fields_to_keep:
                # fields_to_keep entries may contain SQL LIKE wildcards.
                # NOTE(review): the loop variable `col` is rebound by the inner loop.
                qry = "select column_name, data_type from information_schema.columns where table_name = '{0}' and column_name like '{1}'".format(
                    table, col)
                for col in job.execute_query(qry).fetchall():
                    columns.append(col[0])
                    column_types[col[0]] = col[1]
        else:
            for col in job.db_cursor.columns(table=table).fetchall():
                columns.append(col.column_name)
                column_types[col.column_name] = col.type_name
        if job.fields_to_skip:
            for col in job.fields_to_skip:
                qry = "select column_name from information_schema.columns where table_name = '{0}' AND column_name like '{1}'".format(
                    table, col)
                # Comprehension used purely for the remove() side effect.
                [
                    columns.remove(col[0])
                    for col in job.execute_query(qry).fetchall()
                ]
        # ------------------------------------------------------------------------------------------------
        # Get the query information.
        # ------------------------------------------------------------------------------------------------
        query = job.get_table_query(table)
        constraint = job.get_table_constraint(table)
        if query and constraint:
            expression = """{0} AND {1}""".format(query, constraint)
        else:
            if query:
                expression = query
            else:
                expression = constraint
        # --------------------------------------------------------------------------------------------------------
        # Check for a geometry column and pull out X,Y for points and extent coordinates for other geometry types.
        # --------------------------------------------------------------------------------------------------------
        for col in job.db_cursor.columns(table=table).fetchall():
            if col.type_name == 'geometry':
                has_shape = True
                srid = job.db_cursor.execute(
                    "select SRID({0}) from {1}".format(col.column_name,
                                                       table)).fetchone()[0]
                geo['code'] = srid
                geom_type = job.db_cursor.execute(
                    "select GeometryType({0}) from {1}".format(
                        col.column_name, table)).fetchone()[0]
                if geom_type == 'POINT':
                    is_point = True
                    # Resulting column order: row[0]=AsText (WKT), row[1]=Y, row[2]=X.
                    columns.insert(0, "X({0})".format(col.column_name))
                    columns.insert(0, "Y({0})".format(col.column_name))
                    columns.insert(0, "AsText({0})".format(col.column_name))
                else:
                    # Non-points: row[0]=WKT of the bounding envelope.
                    columns.insert(
                        0, "AsText(Envelope({0}))".format(col.column_name))
                # The raw geometry column itself is not selected.
                columns.remove(col.column_name)
                column_types.pop(col.column_name)
                break
        # ------------------------------
        # Query the table for the rows.
        # ------------------------------
        if not expression:
            rows = job.db_cursor.execute("select {0} from {1}".format(
                ','.join(columns), table))
        else:
            rows = job.db_cursor.execute(
                "select {0} from {1} where {2}".format(','.join(columns),
                                                       table, expression))
        # --------------------------------------
        # Remove shape columns from field list.
        # --------------------------------------
        # NOTE(review): `col` here is whatever the geometry-detection loop left it
        # bound to — for tables with no geometry column this is the LAST column,
        # and the removals then silently no-op via the ValueError handler.
        for x in ("X({0})".format(col.column_name),
                  "Y({0})".format(col.column_name),
                  "AsText({0})".format(col.column_name)):
            try:
                columns.remove(x)
            except ValueError:
                continue
        # -----------------------------
        # Index each row in the table.
        # -----------------------------
        entry = {}
        location_id = job.location_id
        discovery_id = job.discovery_id
        action_type = job.action_type
        mapped_fields = job.map_fields(table, columns, column_types)
        new_fields = job.new_fields
        row_count = float(rows.rowcount)
        schema['rows'] = row_count
        increment = job.get_increment(row_count)
        geometry_ops = worker_utils.GeometryOps()
        generalize_value = job.generalize_value
        # Add an entry for the table itself with schema.
        table_entry = {}
        table_entry['id'] = '{0}_{1}'.format(location_id, table)
        table_entry['location'] = location_id
        table_entry['action'] = action_type
        table_entry['format_type'] = 'Schema'
        table_entry['entry'] = {
            'fields': {
                '_discoveryID': discovery_id,
                'name': table,
                # NOTE(review): attribute name says sql_server but this is the MySQL
                # worker — presumably the job object reuses the field; TODO confirm.
                'path': job.sql_server_connection_str
            }
        }
        table_entry['entry']['fields']['schema'] = schema
        job.send_entry(table_entry)
        for i, row in enumerate(rows):
            if has_shape:
                if is_point:
                    if job.include_wkt:
                        geo['wkt'] = row[0]
                    # Column order set above: row[1]=Y (lat), row[2]=X (lon).
                    geo['lon'] = row[2]
                    geo['lat'] = row[1]
                    mapped_cols = dict(zip(mapped_fields[3:], row[3:]))
                    mapped_cols['geometry_type'] = 'Point'
                    # NOTE(review): these add to mapped_fields (a list/dict from map_fields)
                    # rather than mapped_cols — likely intended to be mapped_cols.
                    for nf in new_fields:
                        if nf['name'] == '*' or nf['name'] == table:
                            for k, v in nf['new_fields'].iteritems():
                                mapped_fields[k] = v
                else:
                    if job.include_wkt:
                        if generalize_value == 0:
                            geo['wkt'] = row[0]
                        else:
                            geo['wkt'] = geometry_ops.generalize_geometry(
                                str(row[0]), generalize_value)
                    # Parse the envelope WKT numerically; vertices 0 and 2 of the
                    # polygon ring give the min and max corners.
                    nums = re.findall("-?(?:\.\d+|\d+(?:\.\d*)?)",
                                      row[0].rpartition(',')[0])
                    geo['xmin'] = float(nums[0])
                    geo['ymin'] = float(nums[1])
                    geo['xmax'] = float(nums[4])
                    geo['ymax'] = float(nums[5])
                    mapped_cols = dict(zip(mapped_fields[1:], row[1:]))
                    for nf in new_fields:
                        if nf['name'] == '*' or nf['name'] == table:
                            for k, v in nf['new_fields'].iteritems():
                                mapped_fields[k] = v
                    if 'POLYGON' in geom_type:
                        mapped_cols['geometry_type'] = 'Polygon'
                    else:
                        mapped_cols['geometry_type'] = 'Polyline'
            else:
                mapped_cols = dict(zip(mapped_fields, row))
                for nf in new_fields:
                    if nf['name'] == '*' or nf['name'] == table:
                        for k, v in nf['new_fields'].iteritems():
                            mapped_fields[k] = v
            # Create an entry to send to ZMQ for indexing.
            mapped_cols['title'] = table
            mapped_cols['format_type'] = 'Record'
            mapped_cols['format'] = 'application/vnd.mysql.record'
            entry['id'] = '{0}_{1}_{2}'.format(location_id, table, i)
            entry['location'] = location_id
            entry['action'] = action_type
            entry['entry'] = {'geo': geo, 'fields': mapped_cols}
            entry['entry']['fields']['_discoveryID'] = discovery_id
            job.send_entry(entry)
            if (i % increment) == 0:
                status_writer.send_percent(
                    i / row_count, '{0}: {1:%}'.format(table, i / row_count),
                    'MySql')
        # NOTE(review): accumulates the LAST row index per table (not the row count),
        # and `i` is unbound for empty tables — TODO confirm intent.
        processed += i
        status_writer.send_status("Processed: {0}".format(processed))
def worker(data_path, esri_service=False):
    """The worker function to index feature data and tabular data.

    Indexes one dataset at ``data_path`` via arcpy: describes it, sends one
    ZMQ entry per row/feature, then one schema entry for the whole table.
    Relies on module globals: ``job``, ``status_writer``, ``arcpy``,
    ``update_row``, ``index_service``, ``OrderedDict``, ``os``.

    :param data_path: path to the feature class or table to index.
    :param esri_service: when True, delegate to index_service() using
        job.service_connection instead of reading data_path.
    """
    if esri_service:
        index_service(job.service_connection)
    else:
        job.connect_to_zmq()
        geo = {}
        entry = {}
        schema = {}
        dsc = arcpy.Describe(data_path)
        # Optional geometry generalization support; absent in minimal installs.
        try:
            from utils import worker_utils
            geometry_ops = worker_utils.GeometryOps()
        except ImportError:
            geometry_ops = None
        # These Describe properties only exist for some dataset types.
        try:
            global_id_field = dsc.globalIDFieldName
        except AttributeError:
            global_id_field = None
        try:
            shape_field_name = dsc.shapeFieldName
        except AttributeError:
            shape_field_name = None

        # Get the table schema.
        table_entry = {}
        schema['name'] = dsc.name
        try:
            alias = dsc.aliasName
        except AttributeError:
            alias = dsc.name
        # Only record the alias when it differs from the dataset name.
        if not dsc.name == alias:
            schema['alias'] = alias
        schema['OIDFieldName'] = dsc.OIDFieldName
        if shape_field_name:
            schema['shapeFieldName'] = shape_field_name
            schema['wkid'] = dsc.spatialReference.factoryCode
        if global_id_field:
            schema['globalIDField'] = global_id_field
        schema_fields = []
        for fld in dsc.fields:
            field = {}
            props = []
            field['name'] = fld.name
            field['alias'] = fld.aliasName
            field['type'] = fld.type
            field['domain'] = fld.domain
            if fld.isNullable:
                props.append('nullable')
            else:
                props.append('notnullable')
            indexes = dsc.indexes
            if indexes:
                # NOTE(review): both branches break, so only the FIRST index is
                # consulted per field — confirm this is intended.
                for index in indexes:
                    if fld.name in [f.name for f in index.fields]:
                        props.append('indexed')
                        break
                    else:
                        props.append('notindexed')
                        break
            field['properties'] = props
            schema_fields.append(field)
        schema['fields'] = schema_fields

        if dsc.dataType == 'Table':
            # Get join information.
            table_join = job.get_join(dsc.name)
            if table_join:
                table_view = arcpy.MakeTableView_management(
                    dsc.catalogPath, 'view')
                arcpy.AddJoin_management(
                    table_view, table_join['field'],
                    os.path.join(job.path, table_join['table']),
                    table_join['field'], 'KEEP_COMMON')
            else:
                table_view = dsc.catalogPath
            # Get any query or constraint.
            query = job.get_table_query(dsc.name)
            constraint = job.get_table_constraint(dsc.name)
            if query and constraint:
                expression = """{0} AND {1}""".format(query, constraint)
            else:
                if query:
                    expression = query
                else:
                    expression = constraint
            field_types = job.search_fields(table_view)
            fields = field_types.keys()
            # float so the percent math below is true division under Python 2.
            row_count = float(
                arcpy.GetCount_management(table_view).getOutput(0))
            if row_count == 0.0:
                return
            with arcpy.da.SearchCursor(table_view, fields, expression) as rows:
                mapped_fields = job.map_fields(dsc.name, fields, field_types)
                new_fields = job.new_fields
                # Preserve mapped-field order for the zip() with each row below.
                ordered_fields = OrderedDict()
                for f in mapped_fields:
                    ordered_fields[f] = None
                increment = job.get_increment(row_count)
                for i, row in enumerate(rows, 1):
                    try:
                        if job.domains:
                            row = update_row(dsc.fields, rows, list(row))
                        mapped_fields = dict(zip(ordered_fields.keys(), row))
                        mapped_fields['_discoveryID'] = job.discovery_id
                        mapped_fields['meta_table_name'] = dsc.name
                        mapped_fields['format_category'] = 'GIS'
                        mapped_fields['format_type'] = "Record"
                        mapped_fields[
                            'format'] = "application/vnd.esri.{0}.record".format(
                                dsc.dataType.lower())
                        # Apply configured constant fields ('*' = every table).
                        for nf in new_fields:
                            if nf['name'] == '*' or nf['name'] == dsc.name:
                                for k, v in nf['new_fields'].iteritems():
                                    mapped_fields[k] = v
                        # Locate the OID column so the entry id is stable.
                        oid_field = filter(
                            lambda x: x in ('FID', 'OID', 'OBJECTID'),
                            rows.fields)
                        if oid_field:
                            fld_index = rows.fields.index(oid_field[0])
                        else:
                            # NOTE(review): falls back to the loop counter as a
                            # row index into `row` — looks suspicious; confirm.
                            fld_index = i
                        if global_id_field:
                            mapped_fields['meta_{0}'.format(
                                global_id_field)] = mapped_fields.pop(
                                    'fi_{0}'.format(global_id_field))
                        entry['id'] = '{0}_{1}_{2}'.format(
                            job.location_id, os.path.basename(data_path),
                            row[fld_index])
                        entry['location'] = job.location_id
                        entry['action'] = job.action_type
                        entry['entry'] = {'fields': mapped_fields}
                        job.send_entry(entry)
                        if (i % increment) == 0:
                            status_writer.send_percent(
                                i / row_count,
                                "{0} {1:%}".format(dsc.name, i / row_count),
                                'esri_worker')
                    except (AttributeError, RuntimeError):
                        # Skip rows arcpy cannot materialize; keep indexing.
                        continue
        else:
            # Feature class: reproject geometries to WGS84 for indexing.
            generalize_value = job.generalize_value
            sr = arcpy.SpatialReference(4326)
            geo['spatialReference'] = dsc.spatialReference.name
            geo['code'] = dsc.spatialReference.factoryCode
            # Get join information.
            table_join = job.get_join(dsc.name)
            if table_join:
                lyr = arcpy.MakeFeatureLayer_management(dsc.catalogPath, 'lyr')
                arcpy.AddJoin_management(
                    lyr, table_join['input_join_field'],
                    os.path.join(job.path, table_join['table']),
                    table_join['output_join_field'], 'KEEP_COMMON')
            else:
                lyr = dsc.catalogPath
            field_types = job.search_fields(lyr)
            fields = field_types.keys()
            query = job.get_table_query(dsc.name)
            constraint = job.get_table_constraint(dsc.name)
            if query and constraint:
                expression = """{0} AND {1}""".format(query, constraint)
            else:
                if query:
                    expression = query
                else:
                    expression = constraint
            # Drop the raw shape column; geometry is read via SHAPE@ instead.
            if dsc.shapeFieldName in fields:
                fields.remove(dsc.shapeFieldName)
                field_types.pop(dsc.shapeFieldName)
            elif table_join:
                fields.remove(arcpy.Describe(lyr).shapeFieldName)
                field_types.pop(arcpy.Describe(lyr).shapeFieldName)
            row_count = float(arcpy.GetCount_management(lyr).getOutput(0))
            if row_count == 0.0:
                return
            if dsc.shapeType == 'Point':
                with arcpy.da.SearchCursor(lyr, ['SHAPE@'] + fields,
                                           expression, sr) as rows:
                    # rows.fields[1:] skips the SHAPE@ token prepended above.
                    mapped_fields = job.map_fields(dsc.name,
                                                   list(rows.fields[1:]),
                                                   field_types)
                    new_fields = job.new_fields
                    ordered_fields = OrderedDict()
                    for f in mapped_fields:
                        ordered_fields[f] = None
                    increment = job.get_increment(row_count)
                    for i, row in enumerate(rows):
                        try:
                            if job.domains:
                                row = update_row(dsc.fields, rows, list(row))
                            if row[0]:
                                geo['lon'] = row[0].firstPoint.X
                                geo['lat'] = row[0].firstPoint.Y
                            mapped_fields = dict(
                                zip(ordered_fields.keys(), row[1:]))
                            mapped_fields['_discoveryID'] = job.discovery_id
                            mapped_fields['meta_table_name'] = dsc.name
                            mapped_fields['format_category'] = 'GIS'
                            mapped_fields['geometry_type'] = 'Point'
                            mapped_fields['format_type'] = 'Feature'
                            mapped_fields[
                                'format'] = "application/vnd.esri.{0}.feature".format(
                                    dsc.dataType.lower())
                            for nf in new_fields:
                                if nf['name'] == '*' or nf['name'] == dsc.name:
                                    for k, v in nf['new_fields'].iteritems():
                                        mapped_fields[k] = v
                            if global_id_field:
                                mapped_fields['meta_{0}'.format(
                                    global_id_field)] = mapped_fields.pop(
                                        'fi_{0}'.format(global_id_field))
                            entry['id'] = '{0}_{1}_{2}'.format(
                                job.location_id, os.path.basename(data_path),
                                i)
                            entry['location'] = job.location_id
                            entry['action'] = job.action_type
                            entry['entry'] = {
                                'geo': geo,
                                'fields': mapped_fields
                            }
                            job.send_entry(entry)
                            if (i % increment) == 0:
                                status_writer.send_percent(
                                    i / row_count,
                                    "{0} {1:%}".format(dsc.name,
                                                       i / row_count),
                                    'esri_worker')
                        except (AttributeError, RuntimeError):
                            continue
            else:
                # Polygon/polyline: index WKT (optionally generalized) or bbox.
                with arcpy.da.SearchCursor(lyr, ['SHAPE@'] + fields,
                                           expression, sr) as rows:
                    increment = job.get_increment(row_count)
                    mapped_fields = job.map_fields(dsc.name,
                                                   list(rows.fields[1:]),
                                                   field_types)
                    new_fields = job.new_fields
                    ordered_fields = OrderedDict()
                    for f in mapped_fields:
                        ordered_fields[f] = None
                    for i, row in enumerate(rows):
                        try:
                            if job.domains:
                                row = update_row(dsc.fields, rows, list(row))
                            if row[0]:
                                if generalize_value == 0 or generalize_value == 0.0:
                                    geo['wkt'] = row[0].WKT
                                else:
                                    if geometry_ops:
                                        geo['wkt'] = geometry_ops.generalize_geometry(
                                            row[0].WKT, generalize_value)
                                    else:
                                        # No generalizer available: fall back
                                        # to the feature's bounding box.
                                        geo['xmin'] = row[0].extent.XMin
                                        geo['xmax'] = row[0].extent.XMax
                                        geo['ymin'] = row[0].extent.YMin
                                        geo['ymax'] = row[0].extent.YMax
                            mapped_fields = dict(
                                zip(ordered_fields.keys(), row[1:]))
                            mapped_fields['_discoveryID'] = job.discovery_id
                            mapped_fields['meta_table_name'] = dsc.name
                            for nf in new_fields:
                                if nf['name'] == '*' or nf['name'] == dsc.name:
                                    for k, v in nf['new_fields'].iteritems():
                                        mapped_fields[k] = v
                            if global_id_field:
                                mapped_fields['meta_{0}'.format(
                                    global_id_field)] = mapped_fields.pop(
                                        'fi_{0}'.format(global_id_field))
                            mapped_fields['geometry_type'] = dsc.shapeType
                            mapped_fields['format_category'] = 'GIS'
                            mapped_fields['format_type'] = 'Feature'
                            mapped_fields[
                                'format'] = "application/vnd.esri.{0}.feature".format(
                                    dsc.dataType.lower())
                            entry['id'] = '{0}_{1}_{2}'.format(
                                job.location_id,
                                os.path.splitext(
                                    os.path.basename(data_path))[0], i)
                            entry['location'] = job.location_id
                            entry['action'] = job.action_type
                            entry['entry'] = {
                                'geo': geo,
                                'fields': mapped_fields
                            }
                            job.send_entry(entry)
                            if (i % increment) == 0:
                                status_writer.send_percent(
                                    i / row_count,
                                    "{0} {1:%}".format(dsc.name,
                                                       i / row_count),
                                    'esri_worker')
                        except (AttributeError, RuntimeError):
                            continue
        # Add an entry for the table and its schema.
        schema['rows'] = row_count
        table_entry['id'] = '{0}_{1}'.format(job.location_id, dsc.name)
        table_entry['location'] = job.location_id
        table_entry['action'] = job.action_type
        table_entry['format_type'] = 'Schema'
        table_entry['entry'] = {
            'fields': {
                '_discoveryID': job.discovery_id,
                'name': dsc.name,
                'path': dsc.catalogPath,
                'format': 'schema'
            }
        }
        table_entry['entry']['fields']['schema'] = schema
        job.send_entry(table_entry)
def index_service(connection_info):
    """Index the records in Map and Feature Services.

    Resolves the service URL, filters its layers/tables by the job's keep
    list (with ``*`` wildcards), then pages through each layer's features
    and sends one ZMQ entry per record plus progress updates.

    Relies on module globals: ``job``, ``status_writer``, ``status``,
    ``get_date``, ``make_feature``, ``OrderedDict``, ``copy``, ``os``.

    NOTE(review): a second ``index_service`` is defined later in this file
    and shadows this one at import time — confirm which is intended.

    :param connection_info: dict of connection parameters (server/portal
        url, credentials, token settings, service name/type, folder).
    """
    from utils import worker_utils
    geometry_ops = worker_utils.GeometryOps()
    job.connect_to_zmq()
    entry = {}
    items = {}
    url = ''
    if 'portal_url' in connection_info:
        connection_url = connection_info['portal_url']
    else:
        connection_url = connection_info['server_url']
    user_name = connection_info['user_name']
    password = connection_info['password']
    token = connection_info['token']
    generate_token = connection_info['generate_token']
    service_name = connection_info['service_name']
    service_type = connection_info['service_type']
    folder_name = connection_info['folder_name']
    if 'instance' in connection_info:
        instance = connection_info['instance']
    else:
        instance = 'arcgis'
    # Create the ArcGIS service helper and get the service url and the service items (layers/tables).
    ags_helper = worker_utils.ArcGISServiceHelper(connection_url, user_name,
                                                  password, instance=instance)
    try:
        if token == '' and generate_token == 'false':
            url, items = ags_helper.find_item_url(service_name, service_type,
                                                  folder_name)
        elif token and folder_name:
            url, items = ags_helper.find_item_url(service_name, service_type,
                                                  folder_name, token=token)
        elif token:
            url, items = ags_helper.find_item_url(service_name, service_type,
                                                  token=token)
        elif generate_token and folder_name:
            url, items = ags_helper.find_item_url(service_name, service_type,
                                                  folder_name, token=token)
        elif generate_token == 'true':
            url, items = ags_helper.find_item_url(service_name, service_type,
                                                  token=ags_helper.token)
    except IndexError:
        status_writer.send_state(status.STAT_FAILED,
                                 "Cannot locate {0}.".format(service_name))
        return
    except worker_utils.InvalidToken as invalid_token:
        status_writer.send_state(status.STAT_FAILED, invalid_token.message)
        return
    except Exception as ex:
        status_writer.send_state(status.STAT_FAILED, ex.message)
        return

    # Support wildcards for filtering layers and views in the service.
    layers = items['layers'] + items['tables']
    layers_to_keep = job.tables_to_keep()
    for layer in layers_to_keep:
        lk = layer.split('*')
        if len(lk) == 3 and layer.startswith('*') and layer.endswith('*'):
            # FIX: keep the layer dicts (was `l['id']`, a list of bare ids,
            # which broke the `layer['id']` / `layer['name']` reads below).
            layers = [l for l in layers if lk[1] in l['name']]
        elif layer.endswith('*'):
            layers = [l for l in layers if lk[0] in l['name']]
        elif layer.startswith('*'):
            layers = [l for l in layers if lk[1] in l['name']]
        else:
            layers = [l for l in layers if lk[0] == l['name']]

    # Index the records for each layer and table within a feature or map service.
    for layer in layers:
        i = 0.
        geo = {}
        layer_id = layer['id']
        layer_name = layer['name']
        mapped_attributes = OrderedDict()
        status_writer.send_status('Indexing {0}...'.format((url, layer_name)))
        # Get the list of fields and field types.
        fields_types = {}
        try:
            fields = ags_helper.get_item_fields(url, layer_id,
                                                ags_helper.token)
        except KeyError:
            status_writer.send_status(
                "Layer {0} has no fields.".format(layer_name))
            continue
        for f in fields:
            fields_types[f['name']] = f['type']
        # Check if the layer is empty and ensure to get all features, not just first 1000 (esri default).
        objectid_groups, row_count = ags_helper.get_item_row_count(
            url, layer_id, ags_helper.token)
        oid_field_name = ags_helper.oid_field_name
        if not row_count:
            status_writer.send_status(
                "Layer {0} has no features.".format(layer_name))
            continue
        else:
            increment = float(job.get_increment(row_count))
        for group in objectid_groups:
            group = [oid for oid in group if oid is not None]
            # FIX: build the IN list manually — `tuple(group)` rendered a
            # trailing comma for single-element groups ("IN (5,)"), which is
            # not a valid where clause.
            rows = ags_helper.get_item_rows(
                url, layer_id, ags_helper.token,
                where='{0} IN ({1})'.format(
                    oid_field_name, ','.join(str(oid) for oid in group)))
            features = None
            if 'features' in rows:
                features = rows['features']
            if not features:
                status_writer.send_status(
                    "Layer {0} has no features.".format(layer_name))
                continue
            if 'attributes' in features[0]:
                attributes = OrderedDict(features[0]['attributes'])
            else:
                status_writer.send_status(
                    "Layer {0} has no attributes.".format(layer_name))
            if 'geometryType' in rows:
                geometry_type = rows['geometryType']
            else:
                geometry_type = 'Table'
            if 'spatialReference' in rows:
                geo['srid'] = rows['spatialReference']['wkid']
            # Map the field and its value.
            if not job.fields_to_keep == ['*']:
                # NOTE(review): mapped_fields is rebuilt for every `fk`, so
                # only the last keep-pattern survives — confirm intended.
                for fk in job.fields_to_keep:
                    mapped_fields = dict((name, val)
                                         for name, val in attributes.items()
                                         if fk in name)
                    if job.fields_to_skip:
                        for fs in job.fields_to_skip:
                            [
                                mapped_fields.pop(name) for name in attributes
                                if name in fs
                            ]
            else:
                mapped_fields = copy.deepcopy(attributes)
            # This will generate the field mapping dictionary.
            job.tables_to_keep()
            date_fields = set()
            # Prefer a layer-specific field map; fall back to the '*' map.
            field_map = None
            for mapping in job.field_mapping:
                if mapping['name'] == layer_name:
                    field_map = mapping['map']
                    break
            if not field_map:
                for mapping in job.field_mapping:
                    if mapping['name'] == '*':
                        field_map = mapping['map']
                        break
            if field_map:
                for k, v in mapped_fields.items():
                    if k in field_map:
                        new_field = field_map[k]
                        mapped_attributes[new_field] = mapped_fields.pop(k)
                    else:
                        field_type = job.default_mapping(fields_types[k])
                        if field_type == 'fd_':
                            # Because dates are being returned as longs.
                            mapped_attributes[field_type + k] = v
                            date_fields.add(field_type + k)
                        else:
                            mapped_attributes[field_type +
                                              k] = mapped_fields.pop(k)
            else:
                for k, v in mapped_fields.items():
                    field_type = job.default_mapping(fields_types[k])
                    if field_type == 'fd_':
                        # Because dates are being returned as longs.
                        mapped_attributes[field_type + k] = v
                        date_fields.add(field_type + k)
                    else:
                        mapped_attributes[field_type +
                                          k] = mapped_fields.pop(k)
            i += len(features)
            if geometry_type == 'Table':
                for x, row in enumerate(features):
                    entry['id'] = '{0}_{1}_{2}_{3}'.format(
                        job.location_id, layer_name, i, x)
                    entry['location'] = job.location_id
                    entry['action'] = job.action_type
                    mapped_fields = dict(
                        zip(mapped_attributes.keys(),
                            row['attributes'].values()))
                    # Convert longs to datetime.
                    for df in date_fields:
                        mapped_fields[df] = get_date(mapped_fields[df])
                    mapped_fields['title'] = layer_name
                    mapped_fields['meta_table_name'] = layer_name
                    mapped_fields['_discoveryID'] = job.discovery_id
                    mapped_fields['format_category'] = 'GIS'
                    mapped_fields['format_type'] = 'Service Layer Record'
                    mapped_fields[
                        'format'] = 'application/vnd.esri.service.layer.record'
                    entry['entry'] = {'fields': mapped_fields}
                    job.send_entry(entry)
                    if (i % increment) == 0:
                        status_writer.send_percent(
                            i / row_count,
                            "{0} {1:%}".format(layer_name, i / row_count),
                            'esri_worker')
            else:
                # Faster to do one if check for geometry type than to condense code and check in every iteration.
                if geometry_type == 'esriGeometryPoint':
                    for x, feature in enumerate(features):
                        pt = feature['geometry']
                        geo['lon'] = pt['x']
                        geo['lat'] = pt['y']
                        entry['id'] = '{0}_{1}_{2}_{3}'.format(
                            job.location_id, layer_name, int(i), x)
                        entry['location'] = job.location_id
                        entry['action'] = job.action_type
                        mapped_fields = dict(
                            zip(mapped_attributes.keys(),
                                feature['attributes'].values()))
                        # Convert longs to datetime.
                        for df in date_fields:
                            mapped_fields[df] = get_date(mapped_fields[df])
                        mapped_fields['_discoveryID'] = job.discovery_id
                        mapped_fields['title'] = layer_name
                        mapped_fields['geometry_type'] = 'Point'
                        mapped_fields['meta_table_name'] = layer_name
                        mapped_fields['format_category'] = 'GIS'
                        mapped_fields['format_type'] = 'Service Layer Feature'
                        mapped_fields[
                            'format'] = 'application/vnd.esri.service.layer.record'
                        entry['entry'] = {'geo': geo, 'fields': mapped_fields}
                        job.send_entry(entry)
                        if (i % increment) == 0:
                            status_writer.send_percent(
                                i / row_count,
                                "{0} {1:%}".format(layer_name,
                                                   int(i) / row_count),
                                'esri_worker')
                else:
                    generalize_value = job.generalize_value
                    for x, feature in enumerate(features):
                        try:
                            geometry = make_feature(
                                feature)  # Catch possible null geometries.
                        except RuntimeError:
                            continue
                        if generalize_value > 0.9:
                            geo['xmin'], geo[
                                'xmax'] = geometry.extent.XMin, geometry.extent.XMax
                            geo['ymin'], geo[
                                'ymax'] = geometry.extent.YMin, geometry.extent.YMax
                        elif generalize_value == 0 or generalize_value == 0.0:
                            geo['wkt'] = geometry.WKT
                        else:
                            if geometry_ops:
                                geo['wkt'] = geometry_ops.generalize_geometry(
                                    geometry.WKT, generalize_value)
                            else:
                                # No generalizer: index the bounding box only.
                                geo['xmin'], geo[
                                    'xmax'] = geometry.extent.XMin, geometry.extent.XMax
                                geo['ymin'], geo[
                                    'ymax'] = geometry.extent.YMin, geometry.extent.YMax
                        entry['id'] = '{0}_{1}_{2}_{3}'.format(
                            job.location_id, layer_name, int(i), x)
                        entry['location'] = job.location_id
                        entry['action'] = job.action_type
                        mapped_fields = dict(
                            zip(mapped_attributes.keys(),
                                OrderedDict(feature['attributes']).values()))
                        try:
                            # Convert longs to datetime.
                            for df in date_fields:
                                mapped_fields[df] = get_date(
                                    mapped_fields[df])
                            mapped_fields['title'] = layer_name
                            mapped_fields['geometry_type'] = geometry.type
                            mapped_fields['meta_table_name'] = layer_name
                            mapped_fields['meta_table_path'] = layer['path']
                            mapped_fields[
                                'meta_table_location'] = os.path.dirname(
                                    layer['path'])
                            mapped_fields['format_category'] = 'GIS'
                            mapped_fields[
                                'format_type'] = 'Service Layer Feature'
                            mapped_fields[
                                'format'] = 'application/vnd.esri.service.layer.record'
                            mapped_fields['_discoveryID'] = job.discovery_id
                            entry['entry'] = {
                                'geo': geo,
                                'fields': mapped_fields
                            }
                            # FIX: the original only sent inside the KeyError
                            # handler, silently dropping every feature whose
                            # metadata was built successfully. Send on the
                            # success path too (matches the sibling
                            # implementation of this function).
                            job.send_entry(entry)
                        except KeyError:
                            # Partial entry (e.g. layer has no 'path'):
                            # best-effort send of what was assembled.
                            job.send_entry(entry)
                        if (i % increment) == 0:
                            status_writer.send_percent(
                                i / row_count,
                                "{0} {1:%}".format(layer_name, i / row_count),
                                'esri_worker')
def index_service(connection_info):
    """Index the records in Map and Feature Services.

    GeoJSON-aware variant: pages through each kept layer with
    resultOffset/resultRecordCount, converts ESRI JSON or GeoJSON
    geometries to WKT via worker_utils.GeoJSONConverter, and sends one
    ZMQ entry per record (ids are md5 hashes of url + record key).

    NOTE(review): this redefines ``index_service`` and shadows the earlier
    definition in this file at import time — confirm which is intended.

    Relies on module globals: ``job``, ``status_writer``, ``status``,
    ``get_date``, ``hashlib``, ``copy``, ``os``, ``OrderedDict``.

    :param connection_info: dict of connection parameters (server/portal
        url, credentials, token settings, service name/type, folder,
        optional 'instance', 'verify_ssl', 'query').
    """
    from utils import worker_utils
    geometry_ops = worker_utils.GeometryOps()
    geo_json_converter = worker_utils.GeoJSONConverter()
    job.connect_to_zmq()
    entry = {}
    items = {}
    url = ''
    if 'portal_url' in connection_info:
        connection_url = connection_info['portal_url']
    else:
        connection_url = connection_info['server_url']
    user_name = connection_info['user_name']
    password = connection_info['password']
    token = connection_info['token']
    generate_token = connection_info['generate_token']
    service_name = connection_info['service_name']
    service_type = connection_info['service_type']
    folder_name = connection_info['folder_name']
    if 'instance' in connection_info:
        instance = connection_info['instance']
    else:
        instance = 'arcgis'
    # SSL verification defaults to on; only 'false'/'n' disables it.
    if 'verify_ssl' in connection_info and connection_info['verify_ssl'].lower(
    ) in ['false', 'n']:
        verify_ssl = False
    else:
        verify_ssl = True
    # Create the ArcGIS service helper and get the service url and the service items (layers/tables).
    ags_helper = worker_utils.ArcGISServiceHelper(connection_url,
                                                  user_name,
                                                  password,
                                                  verify_ssl,
                                                  instance=instance)
    try:
        if token == '' and generate_token == 'false':
            url, items = ags_helper.find_item_url(service_name, service_type,
                                                  folder_name)
        elif token and folder_name:
            url, items = ags_helper.find_item_url(service_name, service_type,
                                                  folder_name, token=token)
        elif token:
            url, items = ags_helper.find_item_url(service_name, service_type,
                                                  token=token)
        elif generate_token and folder_name:
            url, items = ags_helper.find_item_url(service_name, service_type,
                                                  folder_name, token=token)
        elif generate_token == 'true':
            url, items = ags_helper.find_item_url(service_name, service_type,
                                                  token=ags_helper.token)
    except IndexError:
        status_writer.send_state(status.STAT_FAILED,
                                 "Cannot locate {0}.".format(service_name))
        return
    except worker_utils.InvalidToken as invalid_token:
        status_writer.send_state(status.STAT_FAILED, invalid_token.message)
        return
    except Exception as ex:
        status_writer.send_state(status.STAT_FAILED, ex.message)
        return
    # Support wildcards for filtering layers and views in the service.
    layers = items['layers'] + items['tables']
    layers_to_keep = job.tables_to_keep()
    layers_process = []
    for layer in layers_to_keep:
        lk = layer.split('*')
        if len(lk) == 3 and layer.startswith('*') and layer.endswith('*'):
            # NOTE(review): this branch appends bare ids while the others
            # append layer dicts; the loop below reads layer['name'] —
            # looks like it should append `l`, confirm.
            layers_process += [l['id'] for l in layers if lk[1] in l['name']]
        elif layer.endswith('*'):
            layers_process += [l for l in layers if lk[0] in l['name']]
        elif layer.startswith('*'):
            layers_process += [l for l in layers if lk[1] in l['name']]
        else:
            layers_process += [l for l in layers if lk[0] == l['name']]
    # Index the records for each layer and table within a feature or map service.
    skip_tables = job.tables_to_skip()
    for layer in layers_process:
        if layer['name'] in skip_tables:
            continue
        i = 0.
        geo = {}
        layer_id = layer['id']
        layer_name = layer['name']
        mapped_attributes = OrderedDict()
        status_writer.send_status('Indexing {0}...'.format((url, layer_name)))
        # Get the list of fields and field types.
        fields_types = {}
        try:
            fields = ags_helper.get_item_fields(url, layer_id,
                                                ags_helper.token)
        except KeyError:
            status_writer.send_status(
                "Layer {0} has no fields.".format(layer_name))
            continue
        for f in fields:
            fields_types[f['name']] = f['type']
        # Check if the layer is empty and ensure to get all features, not just first 1000 (esri default).
        # Layer-specific query wins over the connection-level query.
        query = job.get_table_query(layer_name)
        if query:
            where_clause = query
        elif 'query' in connection_info:
            where_clause = connection_info['query']
        else:
            where_clause = "1=1"
        try:
            groups, row_count = ags_helper.get_item_row_count(
                url, layer_id, ags_helper.token, where_clause)
        except Exception:
            # Retry without the custom filter if the server rejects it.
            where_clause = "1=1"
            groups, row_count = ags_helper.get_item_row_count(
                url, layer_id, ags_helper.token, where_clause)
        oid_field_name = ags_helper.oid_field_name
        if not row_count:
            status_writer.send_status(
                "Layer {0} has no features.".format(layer_name))
            continue
        else:
            increment = float(job.get_increment(row_count))
        for group in groups:
            # Page through the layer 1000 records at a time as GeoJSON.
            rows = ags_helper.get_item_rows(url,
                                            layer_id,
                                            ags_helper.token,
                                            where=where_clause,
                                            resultOffset=group,
                                            resultRecordCount=1000,
                                            response_format='geojson')
            features = None
            if 'features' in rows:
                features = rows['features']
            if not features:
                status_writer.send_status(
                    "Layer {0} has no features.".format(layer_name))
                continue
            # GeoJSON uses 'properties'; ESRI JSON uses 'attributes'.
            if 'properties' in features[0]:
                attributes = OrderedDict(features[0]['properties'])
            elif 'attributes' in features[0]:
                attributes = OrderedDict(features[0]['attributes'])
            else:
                status_writer.send_status(
                    "Layer {0} has no attributes.".format(layer_name))
            # Derive geometry type from the first feature of the page.
            # NOTE(review): if the geometry dict matches none of these keys,
            # geometry_type keeps its value from the previous page (or is
            # unset on the first page) — confirm that cannot occur.
            if 'geometry' in features[0] and features[0]['geometry']:
                if 'type' in features[0]['geometry']:
                    geometry_type = features[0]['geometry']['type']
                elif 'rings' in features[0]['geometry']:
                    geometry_type = 'Rings'
                elif 'paths' in features[0]['geometry']:
                    geometry_type = 'Paths'
                elif 'x' in features[0]['geometry'] and 'y' in features[0][
                        'geometry']:
                    geometry_type = 'Point'
            else:
                geometry_type = 'Table'
            # Spatial reference: GeoJSON 'crs' name (strip any authority
            # prefix), else ESRI 'spatialReference' wkid.
            if 'crs' in rows:
                if ':' in rows['crs']['properties']['name']:
                    geo['srid'] = rows['crs']['properties']['name'].split(
                        ':')[1]
                else:
                    geo['srid'] = rows['crs']['properties']['name']
            elif 'spatialReference' in rows:
                geo['srid'] = rows['spatialReference']['wkid']
            # Map the field and it's value.
            if not job.fields_to_keep == ['*']:
                # NOTE(review): mapped_fields is rebuilt for every `fk`, so
                # only the last keep-pattern survives — confirm intended.
                for fk in job.fields_to_keep:
                    mapped_fields = dict((name, val)
                                         for name, val in attributes.items()
                                         if fk in name)
                    if job.fields_to_skip:
                        for fs in job.fields_to_skip:
                            [
                                mapped_fields.pop(name) for name in attributes
                                if name in fs
                            ]
            else:
                mapped_fields = copy.deepcopy(attributes)
            # This will generate the field mapping dictionary.
            job.tables_to_keep()
            date_fields = set()
            # Prefer a layer-specific field map; fall back to the '*' map.
            field_map = None
            for mapping in job.field_mapping:
                if mapping['name'] == layer_name:
                    field_map = mapping['map']
                    break
            if not field_map:
                for mapping in job.field_mapping:
                    if mapping['name'] == '*':
                        field_map = mapping['map']
                        break
            if field_map:
                for k, v in mapped_fields.items():
                    if k in field_map:
                        new_field = field_map[k]
                        mapped_attributes[new_field] = mapped_fields.pop(k)
                    else:
                        field_type = job.default_mapping(fields_types[k])
                        if field_type == 'fd_':
                            # Because dates are being returned as longs.
                            mapped_attributes[field_type + k] = v
                            date_fields.add(field_type + k)
                        else:
                            mapped_attributes[field_type +
                                              k] = mapped_fields.pop(k)
            else:
                for k, v in mapped_fields.items():
                    field_type = job.default_mapping(fields_types[k])
                    if field_type == 'fd_':
                        # Because dates are being returned as longs.
                        mapped_attributes[field_type + k] = v
                        date_fields.add(field_type + k)
                    else:
                        mapped_attributes[field_type +
                                          k] = mapped_fields.pop(k)
            i += len(features)
            if geometry_type == 'Table':
                for x, row in enumerate(features):
                    # Stable id: md5 of url + location/layer/page/row key.
                    hash_obj = hashlib.md5(
                        os.path.join(
                            url,
                            '{0}_{1}_{2}_{3}'.format(job.location_id,
                                                     layer_name, i, x)))
                    entry['id'] = hash_obj.hexdigest()
                    entry['location'] = job.location_id
                    entry['action'] = job.action_type
                    if 'properties' in row:
                        mapped_fields = dict(
                            zip(mapped_attributes.keys(),
                                row['properties'].values()))
                    else:
                        mapped_fields = dict(
                            zip(mapped_attributes.keys(),
                                row['attributes'].values()))
                    # Convert longs to datetime.
                    for df in date_fields:
                        mapped_fields[df] = get_date(mapped_fields[df])
                    mapped_fields['id'] = entry['id']
                    mapped_fields['meta_table_name'] = layer_name
                    mapped_fields['_discoveryID'] = job.discovery_id
                    mapped_fields['format_category'] = 'GIS'
                    mapped_fields['format_type'] = 'Service Layer Record'
                    mapped_fields[
                        'format'] = 'application/vnd.esri.service.layer.record'
                    entry['entry'] = {'fields': mapped_fields}
                    job.send_entry(entry)
                    if (x % increment) == 0:
                        status_writer.send_percent(
                            i / row_count,
                            "{0} {1:%}".format(layer_name, i / row_count),
                            'agol_worker')
            else:
                # Faster to do one if check for geometry type than to condense code and check in every iteration.
                if geometry_type == 'Point':
                    for x, feature in enumerate(features, 1):
                        # ESRI JSON point has raw x/y; GeoJSON has 'type'
                        # and is converted to WKT instead.
                        if 'type' not in feature['geometry']:
                            pt = feature['geometry']
                            geo['lon'] = pt['x']
                            geo['lat'] = pt['y']
                        else:
                            geo['wkt'] = geo_json_converter.convert_to_wkt(
                                feature['geometry'], 14)
                        hash_obj = hashlib.md5(
                            os.path.join(
                                url, '{0}_{1}_{2}_{3}'.format(
                                    job.location_id, layer_name, int(i), x)))
                        entry['id'] = hash_obj.hexdigest()
                        entry['location'] = job.location_id
                        entry['action'] = job.action_type
                        if 'properties' in feature:
                            mapped_fields = dict(
                                zip(mapped_attributes.keys(),
                                    feature['properties'].values()))
                        else:
                            mapped_fields = dict(
                                zip(mapped_attributes.keys(),
                                    feature['attributes'].values()))
                        # Convert longs to datetime.
                        for df in date_fields:
                            mapped_fields[df] = get_date(mapped_fields[df])
                        mapped_fields['id'] = entry['id']
                        mapped_fields['_discoveryID'] = job.discovery_id
                        mapped_fields['geometry_type'] = 'Point'
                        mapped_fields['meta_table_name'] = layer_name
                        mapped_fields['format_category'] = 'GIS'
                        mapped_fields['format_type'] = 'Service Layer Feature'
                        mapped_fields[
                            'format'] = 'application/vnd.esri.service.layer.record'
                        entry['entry'] = {'geo': geo, 'fields': mapped_fields}
                        job.send_entry(entry)
                        if (x % increment) == 0:
                            status_writer.send_percent(
                                i / row_count,
                                "{0} {1:%}".format(layer_name, i / row_count),
                                'agol_worker')
                else:
                    generalize_value = job.generalize_value
                    for x, feature in enumerate(features, 1):
                        try:
                            # geometry = make_feature(feature)
                            # Catch possible null geometries.
                            # Build a geometry (or WKT string) from whichever
                            # JSON flavor this feature uses.
                            if geometry_type == 'Rings':
                                geometry = geo_json_converter.create_polygon(
                                    feature['geometry']['rings'][0])
                            elif geometry_type == 'Paths':
                                geometry = geo_json_converter.create_polyline(
                                    feature['geometry']['paths'][0])
                            elif geometry_type == 'LineString':
                                geometry = geo_json_converter.create_polyline(
                                    feature['geometry']['coordinates'])
                            else:
                                geometry = geo_json_converter.convert_to_wkt(
                                    feature['geometry'], 14)
                        except RuntimeError:
                            continue
                        # if generalize_value > 0.9:
                        #     geo['xmin'], geo['xmax'] = geometry.extent.XMin, geometry.extent.XMax
                        #     geo['ymin'], geo['ymax'] = geometry.extent.YMin, geometry.extent.YMax
                        if geometry:
                            if generalize_value == 0 or generalize_value == 0.0:
                                # convert_to_wkt yields a str; the converters
                                # yield objects exposing .WKT.
                                if isinstance(geometry, str):
                                    geo['wkt'] = geometry
                                else:
                                    geo['wkt'] = geometry.WKT
                            else:
                                if geometry_ops:
                                    geo['wkt'] = geometry_ops.generalize_geometry(
                                        geometry, generalize_value)
                                else:
                                    geo['wkt'] = geometry.WKT
                                    # geo['xmin'], geo['xmax'] = geometry.extent.XMin, geometry.extent.XMax
                                    # geo['ymin'], geo['ymax'] = geometry.extent.YMin, geometry.extent.YMax
                        hash_obj = hashlib.md5(
                            os.path.join(
                                url, '{0}_{1}_{2}_{3}'.format(
                                    job.location_id, layer_name, int(i), x)))
                        entry['id'] = hash_obj.hexdigest()
                        entry['location'] = job.location_id
                        entry['action'] = job.action_type
                        if 'properties' in feature:
                            mapped_fields = dict(
                                zip(
                                    mapped_attributes.keys(),
                                    OrderedDict(
                                        feature['properties']).values()))
                        else:
                            mapped_fields = dict(
                                zip(
                                    mapped_attributes.keys(),
                                    OrderedDict(
                                        feature['attributes']).values()))
                        try:
                            # Convert longs to datetime.
                            for df in date_fields:
                                mapped_fields[df] = get_date(
                                    mapped_fields[df])
                            # Normalize the wire geometry type to index terms.
                            if geometry_type in ('Paths', 'LineString'):
                                mapped_fields['geometry_type'] = 'Polyline'
                            elif geometry_type == 'Rings':
                                mapped_fields['geometry_type'] = 'Polygon'
                            else:
                                mapped_fields['geometry_type'] = geometry_type
                            mapped_fields['meta_table_name'] = layer_name
                            # Synthesize a path for layers that lack one.
                            try:
                                mapped_fields['meta_table_path'] = layer[
                                    'path']
                            except KeyError:
                                layer['path'] = url + '/' + str(layer_id)
                                mapped_fields['meta_table_path'] = layer[
                                    'path']
                            mapped_fields['id'] = entry['id']
                            mapped_fields[
                                'meta_table_location'] = os.path.dirname(
                                    layer['path'])
                            mapped_fields['format_category'] = 'GIS'
                            mapped_fields[
                                'format_type'] = 'Service Layer Feature'
                            mapped_fields[
                                'format'] = 'application/vnd.esri.service.layer.record'
                            mapped_fields['_discoveryID'] = job.discovery_id
                            entry['entry'] = {
                                'geo': geo,
                                'fields': mapped_fields
                            }
                            job.send_entry(entry)
                        except KeyError:
                            # Partial entry: best-effort send of what was
                            # assembled before the missing key.
                            job.send_entry(entry)
                        if (i % increment) == 0:
                            status_writer.send_percent(
                                i / row_count,
                                "{0} {1:%}".format(layer_name, i / row_count),
                                'agol_worker')