def execute(self, quals, columns, **kwargs): """ This method is invoked every time a SELECT is executed on the foreign table. Parses the quals argument searching for search criteria, contacts soundcloud using the official soundcloud python library and returns the search results inside the columns of the foreign table. Available columns are: title, url, search. :param list quals:a list of conditions which are used are used in the WHERE part of the select and can be used to restrict the number of the results :param list columns: the columns of the foreign table """ if not quals: msg = 'specify a search term' log_to_postgres(level=ERROR, message=msg) # create a client object using the apikey client = soundcloud.Client(client_id=self.apikey) for qual in quals: # Manage quals, pass it as search therm if the field is 'search' if qual.field_name == "search" or qual.operator == "=": # Perform a simple search using the qual value # and the soundcloud client (limit to 10 results) tracks = client.get('/tracks', q=qual.value, limit=10) for track in tracks: # Format the response line line = { 'title': track.title, 'url': track.permalink_url, 'search': qual.value } yield line
def get_token(self): # Do OAuth username/password token_url = '%s/services/oauth2/token' % self.login_server params = urllib.urlencode({ 'grant_type': 'password', 'client_id': self.client_id, 'client_secret': self.client_secret, 'username': self.username, 'password': self.password }) log_to_postgres('Getting token from %s' % token_url, DEBUG) try: data = urllib2.urlopen(token_url, params).read() except urllib2.URLError, e: if hasattr(e, 'code'): if e.code == 400: log_to_postgres( 'Bad Request', ERROR, 'Check the client_id, client_secret, username and password') else: log_to_postgres('HTTP status %d' % e.code, ERROR) elif hasattr(e, 'reason'): log_to_postgres('Error posting to URL %s: %d %s' % (token_url, e.reason[0], e.reason[1]), ERROR, 'Check the login_server') else: log_to_postgres('Unknown error %s' % e, ERROR)
def to_sargs(quals): log_to_postgres(str(quals), WARNING) good_quals = ['=', '>', '>=', '<=', '<>', ('=', True), ('<>', False)] converted = [to_sarg(q) for q in quals if q.operator in good_quals] sargs = " and " .join(["(%s)" % a[0] for a in converted if a]) params = [a[1] for a in converted if a] return (sargs, params)
def __init__(self, options, columns): super(DatabaseDotComForeignDataWrapper, self).__init__(options, columns) self.column_map = CaseInsensitiveDict(dict([(x, x) for x in columns])) self.obj_type = options.get('obj_type', None) if self.obj_type is None: log_to_postgres('You MUST set the obj_type', ERROR) self.client_id = options.get('client_id', None) if self.client_id is None: log_to_postgres('You MUST set the client_id', ERROR) self.client_secret = options.get('client_secret', None) if self.client_secret is None: log_to_postgres('You MUST set the client_secret', ERROR) self.username = options.get('username', None) if self.username is None: log_to_postgres('You MUST set the username', ERROR) self.password = options.get('password', None) if self.password is None: log_to_postgres('You MUST set the password', ERROR) self.login_server = options.get('login_server', 'https://login.salesforce.com') self.oauth = self.get_token()
def __init__(self, options, columns): """ Init method for the Foreign Data Wrapper. Used to manage the options necessary to run barman :type options: Options passed during the creation of the FDW :type columns: the columns of the foreign table """ super(BarmanEnhancedForeignDataWrapper, self).__init__(options, columns) if 'table_name' not in options: log_to_postgres('The table_name parameter is required', ERROR) if 'barman_user' not in options: log_to_postgres('The barman_user parameter is required', ERROR) if 'barman_host' not in options: log_to_postgres('Option barman_host is required for ' 'the Barman FDW setup.', ERROR) self.schema = options['schema'] if 'schema' in options else None self.table_name = options['table_name'] self.barman_host = options['barman_host'] self.barman_user = options['barman_user'] self._row_id_column = 'server' # The columns we'll be using (defaults to 'all'): self.columns = columns log_to_postgres('Barman FDW Config options: %s' % options, DEBUG) log_to_postgres('Barman FDW Config columns: %s' % columns, DEBUG)
def execute(self, quals, columns, sortkeys=None): log_to_postgres('Query Columns: %s' % columns, DEBUG) log_to_postgres('Query Filters: %s' % quals, DEBUG) statement = self.make_cypher(quals, columns, sortkeys) params = {} for qual in quals: params[unicode(qual.field_name)] = self.convert_to_neo4j(self.columns[qual.field_name], qual.value) log_to_postgres('Neo4j query is : ' + unicode(statement), DEBUG) log_to_postgres('With params : ' + unicode(params), DEBUG) # Execute & retrieve neo4j data session = self.driver.session() try: for record in session.run(statement, params): line = {} for column_name in columns: # TODO: from neo4j type to pg types line[column_name] = self.convert_to_pg(self.columns[column_name], record[column_name]) log_to_postgres("sending result item to PG : " + unicode(line), DEBUG) yield line except CypherError: raise RuntimeError("Bad cypher query : " + statement) finally: session.close()
def execute(self, quals, columns): if self.query: statement = self.query else: statement = "SELECT " + ",".join(self.columns.keys()) + " FROM " + self.table log_to_postgres('Hive query: ' + unicode(statement), DEBUG) try: transport = TSocket.TSocket(self.host, self.port) transport = TTransport.TBufferedTransport(transport) protocol = TBinaryProtocol.TBinaryProtocol(transport) client = ThriftHive.Client(protocol) transport.open() client.execute(statement) for row in client.fetchAll(): line = {} cols = row.split("\t"); idx = 0 for column_name in self.columns: line[column_name] = cols[idx] idx = idx + 1 yield line except Thrift.TException, tx: log_to_postgres(tx.message, ERROR)
def execute(self, quals, columns): conn = boto.connect_s3(self.aws_access_key, self.aws_secret_key) bucket = conn.get_bucket(self.bucket) stream = StringIO() key = bucket.get_key(self.filename) key.get_contents_to_file(stream) stream.seek(0) reader = csv.reader(stream, delimiter=self.delimiter, quotechar=self.quotechar) count = 0 checked = False for line in reader: if count >= self.skip_header: if not checked: # On first iteration, check if the lines are of the # appropriate length checked = True if len(line) > len(self.columns): log_to_postgres("There are more columns than " "defined in the table", WARNING) if len(line) < len(self.columns): log_to_postgres("There are less columns than " "defined in the table", WARNING) row=line[:len(self.columns)] nulled_row = [v if v else None for v in row] yield nulled_row count += 1
def execute(self, quals, columns ): line = {} for qual in quals : if qual.field_name == "fn_name": self.fn_name = qual.value elif qual.field_name == "cmd": self.cmd = qual.value if self.fn_name == "exec": try: res = commands.getstatusoutput(self.cmd) line["fn_name"] = self.fn_name line["val"] = "" line["result"] = res line["cmd"] = self.cmd yield(line) except Exception as e: line["fn_name"] = self.fn_name line["val"] = "" line["result"] = "Error %s " % e yield(line) log_to_postgres("There was an error executing docker command Error: %s" % e , ERROR,"Check your commands for errors")
def __init__(self, options, columns): # Calling super constructor super(Neo4jForeignDataWrapper, self).__init__(options, columns) # Managed server option if 'url' not in options: log_to_postgres('The Bolt url parameter is required and the default is "bolt://localhost:7687"', WARNING) self.url = options.get("url", "bolt://localhost:7687") if 'user' not in options: log_to_postgres('The user parameter is required and the default is "neo4j"', ERROR) self.user = options.get("user", "neo4j") if 'password' not in options: log_to_postgres('The password parameter is required for Neo4j', ERROR) self.password = options.get("password", "") if 'cypher' not in options: log_to_postgres('The cypher parameter is required', ERROR) self.cypher = options.get("cypher", "MATCH (n) RETURN n LIMIT 100") # Setting table columns self.columns = columns # Create a neo4j driver instance self.driver = GraphDatabase.driver( self.url, auth=basic_auth(self.user, self.password)) self.columns_stat = self.compute_columns_stat() self.table_stat = int(options.get("estimated_rows", -1)) if(self.table_stat < 0): self.table_stat = self.compute_table_stat() log_to_postgres('Table estimated rows : ' + unicode(self.table_stat), DEBUG)
def connect(self): # try to connect try: bucket = self.client.bucket(self.bucket) except Exception, e: log_to_postgres('Connection Falure: %s' % e, ERROR)
def execute(self, quals, columns, **kwargs): """ This method is invoked every time a SELECT is executed on the foreign table. :param list quals:a list of conditions which are used are used in the WHERE part of the select and can be used to restrict the number of the results :param list columns: the columns of the foreign table """ # create a client object using the apikey # Ports are handled in ~/.ssh/config since we use OpenSSH diagnose_cmd = "barman diagnose" ssh_cmd = "%s@%s" % (self.barman_user, self.barman_host) ssh = subprocess.Popen(["ssh", "%s" % ssh_cmd, diagnose_cmd], shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output = ssh.communicate() result = json.loads(output[0]) if output[1]: error = ssh.stderr.readlines() log_to_postgres("ERROR: %s" % error, DEBUG) else: servers = result['servers'] for server, values in servers.items(): line = { 'server': server, 'backups': len(values['backups']), 'description': values['config']['description'], 'config': json.dumps(values['config']) } yield line
def execute(self, quals, columns, retry = True): cols = ''; for column_name in list(columns): cols += ',%s' % column_name cols = cols[1:] where = '' parameters = [] for qual in quals: operator = 'LIKE' if qual.operator == '~~' else qual.operator if qual.value is None: where += ' AND %s %s NULL' % ( qual.field_name, operator ) else: where += ' AND %s %s \'%s\'' % ( qual.field_name, operator, qual.value ) where = where[5:] query = 'SELECT '+cols+' FROM '+self.obj_type if len(where) > 0: query += ' WHERE %s ' % where log_to_postgres('SOQL query is %s' % query) params = urllib.urlencode({ 'q': query.encode('utf8') }) query_url = (self.oauth['instance_url'] + '/services/data/' + self.api_version + '/query?%s' % params) headers = { 'Authorization': 'OAuth %s' % self.oauth['access_token'] } req = urllib2.Request(query_url, None, headers) queue = Queue() try: stream = urllib2.urlopen(req); except urllib2.URLError, e: if hasattr(e, 'code'): if e.code == 401 and retry: log_to_postgres('Invalid token %s - trying refresh' % self.oauth['access_token']) self.oauth = self.get_token() for line in self.execute(quals, columns, False): yield line return else: log_to_postgres('HTTP status %d' % e.code, ERROR) elif hasattr(e, 'reason'): log_to_postgres('Error posting to URL %s: %d %s' % (token_url, e.reason[0], e.reason[1]), ERROR) else: log_to_postgres('Unknown error %s' % e, ERROR)
def _report_pk_violation(self, item): keys = sorted(item.keys()) values = [item[key] for key in keys] log_to_postgres("Duplicate key value violates filesystem" " integrity.", detail="Key (%s)=(%s) already exists" % (', '.join(keys), ', '.join(values)), level=ERROR)
def __init__(self, options, columns): """ Initialize with options passed through the create foreign table statement """ super(EchoPulse, self).__init__(options, columns) # Resolve data files found in directory self.basedir = options['directory'] sources = (source for source in os.listdir(self.basedir) if subtree_pattern.match(source)) self.source_dirs = [ os.path.realpath(os.path.join(self.basedir, source)) for source in sources if os.path.isdir(os.path.join(self.basedir, source)) ] # default mapping for coordinates self.new_dimnames = { 'range': 'x', 'theta': 'y', 'phi': 'z' } # get custom mapping given in options varmapping = [ opt for opt in options.keys() if opt.startswith('map_') ] for var in varmapping: self.new_dimnames.update({var.strip('map_'): options[var]}) # get pointcloud structure from the directory tree self.ordered_dims = self.scan_structure() log_to_postgres('{} echo/pulse directories linked' .format(len(self.source_dirs)))
def execute(self, quals, columns): if self.query: statement = self.query else: statement = "SELECT " + ",".join(self.columns.keys()) + " FROM " + self.table log_to_postgres('Hive query: ' + unicode(statement), DEBUG) try: endpoint = 'http://{}:{}/kylin/api'.format(self.host, self.port) client = pykylin.connect(username=self.user, password=self.password, endpoint=endpoint, project=self.project, limit=self.limit) cursor = client.cursor() cursor.execute(statement) for row in cursor.fetchall(): line = {} idx = 0 for column_name in self.columns: line[column_name] = row[idx] idx = idx + 1 yield line except NotImplementedError, ix: log_to_postgres(ix.message, ERROR)
def convert_coltype(self, col): _type_map = { "BOOLEAN_TYPE" : "boolean", "TINYINT_TYPE" : "smallint", "SMALLINT_TYPE" : "smallint", "INT_TYPE" : "int", "BIGINT_TYPE" : "bigint", "FLOAT_TYPE" : "float4", "DOUBLE_TYPE" : "float8", "STRING_TYPE" : "text", "TIMESTAMP_TYPE" : "timestamp", "BINARY_TYPE" : "bytea", "ARRAY_TYPE" : "json", "MAP_TYPE" : "json", "STRUCT_TYPE" : "json", # "UNIONTYPE_TYPE" : "", "DECIMAL_TYPE" : "numeric", # "NULL_TYPE" : "", "DATE_TYPE" : "date", "VARCHAR_TYPE" : "varchar", "CHAR_TYPE" : "char", # "INTERVAL_YEAR_MONTH_TYPE" : "", # "INTERVAL_DAY_TIME_TYPE" : "", } (name, _type, size, _, precision, scale, _) = col if (_type in _type_map): _type = _type_map[_type] if (size): _type += "(%d)" % size if (precision): _type += "(%d,%d)" % (precision, scale) return ColumnDefinition(name, type_name=_type) else: log_to_postgres('Cannot handle type %s' % _type)
def build_query(self, quals, columns, sortkeys=None): source = self.options["table"] query = "select %s from `%s` " % ( ",".join(map(lambda a : '`%s`' % a, columns)), source) sargs, params = to_sargs(quals) if (sargs): query += " where %s" % (sargs) log_to_postgres(query, WARNING) return query,params
def __init__(self, fdw_options, fdw_columns): super(LlapFdw, self).__init__(fdw_options, fdw_columns) required_params = set(['table','schema','hostname']) for p in required_params: if p not in fdw_options: log_to_postgres('The %s parameter is required' % p, ERROR) self.conn = LlapConnection(**fdw_options) self.cols = fdw_columns self.options = fdw_options
def get_path_keys(self): """ This method must return a list of tuple of the form (column_name, expected_number_of_row). The expected_number_of_row must be computed as if a where column_name = some_value filter were applied. This helps the planner to estimate parameterized paths cost, and change the plan accordingly. For example, informing the planner that a filter on a column may return exactly one row, instead of the full billion, may help it on deciding to use a nested-loop instead of a full sequential scan. """ log_to_postgres('get_path_keys is called', DEBUG) return self.columns_stat
def parse_scope(self, scope=None): if scope in (None, "", "one"): return ldap3.SEARCH_SCOPE_SINGLE_LEVEL elif scope == "sub": return ldap3.SEARCH_SCOPE_WHOLE_SUBTREE elif scope == "base": return ldap3.SEARCH_SCOPE_BASE_OBJECT else: log_to_postgres("Invalid scope specified: %s" % scope, ERROR)
def parse_scope(self, scope=None): if scope in (None, "", "one"): return ldap.SCOPE_ONELEVEL elif scope == "sub": return ldap.SCOPE_SUBTREE elif scope == "base": return ldap.SCOPE_BASE else: log_to_postgres("Invalid scope specified: %s" % scope, ERROR)
def get_rel_size(self, quals, columns): """ This method must return a tuple of the form (expected_number_of_row, expected_mean_width_of_a_row (in bytes)). The quals and columns arguments can be used to compute those estimates. For example, the imapfdw computes a huge width whenever the payload column is requested. """ log_to_postgres('get_rel_size is called', DEBUG) # TODO: take the min of the columns stat based on the quals ? return (self.table_stat, len(columns)*100)
def __init__(self, options, columns): super(AccessLogFDW, self).__init__(options, columns) self._access_log_path = '/home/vagrant/access_log.gz' log_to_postgres("caching row data") self._rows = list(get_rows(self._access_log_path)) self._rows_by_error = defaultdict(list) for row in self._rows: self._rows_by_error[row['error']].append(row)
def _run_rethinkdb_action(self, action): # try to connect try: conn = r.connect(host=self.host, port=self.port, db=self.database, auth_key=self.auth_key) except Exception, e: log_to_postgres('Connection Falure: %s' % e, ERROR)
def delete(self, _key): log_to_postgres('Delete Request - id: %s' % _key, DEBUG) (client, bucket) = self.connect() try: bucket.delete(_key) except Exception, ex: log_to_postgres('Riak error: %s' % ex, ERROR)
def __init__(self, options, columns): super(PartitionMsgpackForeignDataWrapper, self).__init__(options, columns) self.columns = columns if 'filename' not in options: log_to_postgres( 'Filename is required option of the partition msgpack fdw.', ERROR, hint='Try adding the missing option in the table creation statement') # FIXME: raise RuntimeError('filename is required option of the partition msgpack fdw.') self.filename = options['filename']
def __init__(self, options, columns): super(DynamoFdw, self).__init__(options, columns) self.columns = columns try: self.aws_access_key_id = options['aws_access_key_id'] self.aws_secret_access_key = options['aws_secret_access_key'] self.aws_region = options['aws_region'] self.remote_table = options['remote_table'] except KeyError: log_to_postgres("You must specify these options when creating the FDW: aws_access_key_id,aws_secret_access_key,aws_region,remote_table",ERROR)
def execute(self, quals, columns): log_to_postgres("call execute()", DEBUG) for i in range(0, self.limit): line = {} for column in columns: line[column] = self.columns[column]() yield line
def generate_condition(self, qual, cypher_variable=None): """ Build a neo4j condition from a qual """ condition = '' if cypher_variable is not None: query_param_name = cypher_variable else: query_param_name = qual.field_name # quals is a list with ANY if qual.list_any_or_all == ANY: values = [ '( %s )' % self.generate_condition(Qual(qual.field_name, qual.operator[0], value), '`' + unicode(query_param_name) + '`' + '[' + unicode(array_index) + ']') for array_index, value in enumerate(qual.value) ] condition = ' ( ' + ' OR '.join(values) + ' ) ' # quals is a list with ALL elif qual.list_any_or_all == ALL: conditions = [ self.generate_condition(Qual(qual.field_name, qual.operator[0], value), '`' + unicode(query_param_name) + '`' + '[' + unicode(array_index) + ']') for array_index, value in enumerate(qual.value) ] # quals is just a string else: if qual.operator in ('~~', '!~~', '~~*', '!~~*'): # Convert to cypher regex regex = qual.value.replace('%', '.*') # For the negation, we prefix with NOT if qual.operator in ('!~~', '!~~*'): condition += ' NOT ' # Adding the variable name condition += qual.field_name + " =~ '" # If it's a ILIKE, we prefix regex with (?i) if qual.operator in ('~~*', '!~~*'): condition += '(?i)' # We add the regex condition += regex + "' " else: if query_param_name.startswith('`'): condition = qual.field_name + qual.operator + "$" + unicode(query_param_name) else: condition = qual.field_name + qual.operator + "$`" + unicode(query_param_name) + '`' log_to_postgres('Condition is : ' + unicode(condition), DEBUG) return condition
def execute(self, quals, columns): intrusion_set_list = [] conn_string = "host=127.0.0.1 port=5555 dbname=ctias user=ctias password=citas" query = "SELECT unnest(hash_list) FROM threat_miner_indicator" report_api = "http://www.threatcrowd.org/searchApi/v2/file/report" try: conn = ag.connect(conn_string) cur = conn.cursor() cur.execute(query) while True: records = cur.fetchall() if not records: break for i in range(0, len(records)): line = dict() #intrusion_set_id = records[i][0] #intrusion_set_name = records[i][1] indicator_hash = records[i][0] reports = json.loads( requests.get(report_api, { "resource": indicator_hash }).text) if (reports['response_code'] == '1'): for column_name in self.columns: if (column_name == 'md5'): line[column_name] = reports[column_name] elif (column_name == 'sha1'): line[column_name] = reports[column_name] elif (column_name == 'filename'): line[column_name] = reports['scans'] elif (column_name == 'exploit_ip'): line[column_name] = reports['ips'] elif (column_name == 'exploit_domain'): line[column_name] = reports['domains'] yield line except Exception, e: log_to_postgres(e)
def execute(self, quals, columns): intrusion_set_list = [] conn_string = _conn_string query = "MATCH (a:ioc) WHERE a.type=\'ip\' AND a.value <> \'-\' AND a.value <> \'\' " \ "AND a.value <> \'0.0.0.0\' AND a.value <> \'10.10.10.1\' RETURN DISTINCT a.value AS ip_value" try: conn = ag.connect(conn_string) cur = conn.cursor() cur.execute(query) while True: records = cur.fetchall() if not records: break for i in range(0, len(records)): line = dict() indicator_ip = records[i][0] obj = IPWhois(indicator_ip) try: res = obj.lookup_rdap(asn_methods=['whois', 'dns']) except: continue for column_name in self.columns: if (column_name == 'ip'): line[column_name] = indicator_ip elif (column_name == 'asn'): line[column_name] = res['asn'] elif (column_name == 'country_code'): line[column_name] = res['asn_country_code'] elif (column_name == 'date'): line[column_name] = res['asn_date'] elif (column_name == 'description'): line[column_name] = res['asn_description'] elif (column_name == 'registry'): line[column_name] = res['asn_registry'] yield line except Exception, e: log_to_postgres(e)
def compute_columns_stat(self): result = list() session = self.driver.session() try: for column_name in self.columns: quals = [Qual(column_name, '=', 'WHATEVER')] query = 'EXPLAIN ' + self.make_cypher(quals, self.columns, None) rs = session.run(query, {}) explain_summary = rs.summary().plan[2] stats = explain_summary['EstimatedRows'] log_to_postgres( 'Explain query for column ' + unicode(column_name) + ' is : ' + unicode(query), DEBUG) log_to_postgres( 'Stat for column ' + unicode(column_name) + ' is : ' + unicode(explain_summary['EstimatedRows']), DEBUG) result.append(((column_name, ), int(stats))) except CypherError: raise RuntimeError("Bad cypher query : " + query) finally: session.close() log_to_postgres('Columns stats are :' + unicode(result), DEBUG) return result
def __init__(self, fdw_options, fdw_columns): super(AIC_fdw, self).__init__(fdw_options, fdw_columns) self.MAX_RESULTS = 250 self.LIKE_WILDCARD = '%' self.table_type = fdw_options.get('table_type', 'thing') self.region = fdw_options.get("region") if self.region is None: log_to_postgres("Please set the AWS region", ERROR) self.aws_access_key = fdw_options.get('aws_access_key') if self.aws_access_key is None: log_to_postgres("Please set the AWS access key", ERROR) self.aws_secret_key = fdw_options.get('aws_secret_key') if self.aws_secret_key is None: log_to_postgres("Please set the AWS secret key", ERROR) self.endpoint_url = fdw_options.get("url") self.columns = fdw_columns # self._row_id_column = fdw_columns.keys()[0] if 'thing' == self.table_type.lower(): self._row_id_column = 'thing_name' elif 'thing-type' == self.table_type.lower(): self._row_id_column = 'thing_type_name' elif 'thing-group' == self.table_type.lower(): self._row_id_column = 'thing_group_name' else: log_to_postgres( "This FDW does not support table_type {}".format( self.table_type), ERROR) client_args = { 'aws_access_key_id': self.aws_access_key, 'aws_secret_access_key': self.aws_secret_key, 'region_name': self.region } if self.endpoint_url: client_args.update({'endpoint_url': self.endpoint_url}) self.core_client = boto3.client('iot', **client_args) self.data_client = boto3.client('iot-data', **client_args)
def compute_table_stat(self): stats = 100000000 if self.driver.supports_multi_db(): session = self.driver.session(database=self.database) else: session = self.driver.session() try: rs = session.run('EXPLAIN ' + self.cypher, {}) explain_summary = rs.consume().plan['args'] stats = explain_summary['EstimatedRows'] log_to_postgres( 'Stat for table is ' + str(explain_summary['EstimatedRows']), DEBUG) except CypherSyntaxError: raise RuntimeError("Bad cypher query : " + cypher) except CypherTypeError: raise RuntimeError("Bad cypher type in query : " + cypher) finally: session.close() log_to_postgres('Table stat is :' + str(stats), DEBUG) return stats
def setClient(self): """ Attempt to connect to BigQuery client """ try: # Attempt connection bq = BqClient() bq.setClient() # Verbose log if self.verbose: log_to_postgres( "Connection to BigQuery client with BqClient instance ID " + str(id(bq)), INFO) # Add to pool self.client = bq return bq except RuntimeError: log_to_postgres("Connection to BigQuery client failed", ERROR)
def execute(self, quals, columns): intrusion_set_list = [] conn_string = _conn_string query = "SELECT unnest(email_list),id,name FROM threat_miner_indicator" report_api = "http://www.threatcrowd.org/searchApi/v2/email/report" try: conn = ag.connect(conn_string) cur = conn.cursor() cur.execute(query) while True: records = cur.fetchall() if not records: break for i in range(0, len(records)): line = dict() intrusion_set_id = records[i][1] intrusion_set_name = records[i][2] indicator_email = records[i][0] reports = json.loads( requests.get(report_api, { "email": indicator_email }).text) if (reports['response_code'] == '1'): for column_name in self.columns: if (column_name == 'id'): line[column_name] = intrusion_set_id elif (column_name == 'name'): line[column_name] = intrusion_set_name elif (column_name == 'dtime'): line[column_name] = '2999-12-31 00:00:00' elif (column_name == 'email'): line[column_name] = indicator_email elif (column_name == 'exploit_domain'): line[column_name] = reports['domains'] yield line except Exception, e: log_to_postgres(e)
def execute(self, quals, columns): s3 = boto3.client( 's3', aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key, endpoint_url=self.endpoint, config=Config(signature_version='s3'), ) log_to_postgres(message=str(quals), level=logging.WARNING) log_to_postgres(message=str(columns), level=logging.WARNING) obj_list = s3.list_objects(Bucket=self.bucket, Prefix=self.prefix) for c in obj_list['Contents']: res = s3.get_object(Bucket=self.bucket, Key=c['Key']) body = res['Body'].read() if self.store_as == 'gzip': body = zlib.decompress(body) for row in body.split('\n'): if self.format == 'json': record = json.loads(row) else: record = json.loads(row.split('\t')[2]) yield record
def reader_lastools(file): log_to_postgres('read pc using lastools') attribute = 'xyzitcupRGB' """ x, y, z, a - scan angle, i - intensity, n - number of returns for given pulse, r - number of this return, c - classification, u - user data, p - point source ID, e - edge of flight line flagw d - direction of scan flag red, green, blue """ cmd = '/home/mutian/fdw/LAStools.o/bin/las2txt -i {} -parse {} -stdout'.format( file, attribute) for line in execute_lastools(cmd): yield line pass
def __init__(self, options, columns): super(pgFDW_mapd, self).__init__(options, columns) for attribute in connection_attrs.keys(): if attribute not in options: log_to_postgres( 'The ' + attribute + ' parameter should be defined. defaulting to :' + connection_attrs.get(attribute), WARNING) setattr(self, attribute, connection_attrs.get(attribute)) else: setattr(self, attribute, options.get(attribute)) self.connection = connect(user=self.user, password=self.password, host=self.host, port=self.port, dbname=self.dbname) self.limit = options.get('limit', 100000) self.query = options.get('query', None) if self.query != None: self.table_name = "(" + options.get('query') + ")" else: self.table_name = options.get('table_name', None)
def setOptions(self, options): """ Set table options at class level """ # Set options at class scope self.dataset = options.get('fdw_dataset') or options.get('schema') self.table = options.get('fdw_table') or options.get('tablename') self.convertToTz = options.get('fdw_convert_tz') # Set verbose option self.setOptionVerbose(options.get('fdw_verbose')) # Set SQL dialect self.setOptionSqlDialect(options.get('fdw_sql_dialect')) # Set grouping option self.setOptionGroupBy(options.get('fdw_group')) # Set casting rules self.setOptionCasting(options.get('fdw_casting')) # Set what to do if imported table has too many columns self.tooManyColumns = options.get("fdw_colcount") or "error" if self.tooManyColumns not in ("error", "trim", "skip"): log_to_postgres( "fdw_colcount must be one of 'error', 'trim', 'skip', if provided", ERROR) self.tooManyColumns = "error" # Set what to do if imported table columns share a 63 # character prefix self.sharedPrefix = options.get("fdw_colnames") or "error" if self.sharedPrefix not in ("error", "skip", "trim"): log_to_postgres( "fdw_colnames must be one of 'error', 'trim', 'skip', if provided", ERROR) self.sharedPrefix = "error"
def setOptionCasting(self, castingRules): """ Conversion rules are received as a string, for example: '{"key": "FLOAT64", "datetime": "DATE"}' The string will be converted to a dict """ if castingRules: # Cast string as a dict try: import ast self.castingRules = ast.literal_eval(castingRules) except Exception as e: log_to_postgres( "fdw_casting conversion failed: `" + str(e) + "`", ERROR) # For security reasons, ensure that the string was correctly casted as a dict try: if type(self.castingRules) is not dict: raise ValueError('fdw_casting format is incorrect.') except Exception as e: log_to_postgres( "fdw_casting conversion failed: `" + str(e) + "`", ERROR)
def __init__(self, options, columns): super(SubscriberFDW, self).__init__(options, columns) self.endpoint_url = 'http://api.moosend.com/v3/' self.api_key = options.get('api_key', None) self.list_id = options.get('list_id', None) self.primary_key_column_name = 'Email' self.page_size = options.get('page_size', 500) if self.api_key is None: log_to_postgres( "MoosendFDW: You must supply an API key to Moosend in the options.", ERROR) if self.list_id is None: log_to_postgres( "MoosendFDW: You must supply a mailing list ID in the options.", ERROR) self.columns = columns self.main_fields = ("ID", "Name", "Email", "CreatedOn", "UnsubscribedOn", "UnsubscribedFromID", "SubscribeType", "SubscribeMethod") self.custom_fields = [c for c in columns if c not in self.main_fields]
def commit(self): if len(self.buffer) == 0: return s3 = boto3.client( 's3', aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key, endpoint_url=self.endpoint, config=Config(signature_version='s3'), ) s3.create_bucket( Bucket=self.bucket, ) if self.format == 'json': payload = '\n'.join(map(lambda row: json.dumps(row), self.buffer)) log_to_postgres(message=payload, level=logging.WARNING) key = self.prefix + str(uuid.uuid4()) s3.put_object(Bucket=self.bucket, Key=key, Body=payload) self.buffer = [] else: raise NotImplementedError( "This FDW does not support the {} format".format(self.format))
def getOperator(self, operator): """ Validate operator """ # List of BigQuery operators supported # Exhaustive list: https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-and-operators#operators # Non listed operators may or may not work operators = ['=', '<', '>', '<=', '>=', '!=', '<>', 'LIKE', 'NOT LIKE'] # Mapping between multicorn operators and BigQuery operators mapping = {} mapping['~~'] = 'LIKE' mapping['!~~'] = 'NOT LIKE' if operator in operators: # Operator is natively supported return operator elif operator in mapping: # Multicorn operator has a BigQuery equivalent return mapping[operator] else: # Operator is not supported log_to_postgres( "Operator `" + operator + "` is not currently supported", ERROR)
def execute(self, quals, columns, sortkeys=None): log_to_postgres('Query Columns: %s' % columns, DEBUG) log_to_postgres('Query Filters: %s' % quals, DEBUG) statement = self.make_cypher(quals, columns, sortkeys) params = {} for qual in quals: params[str(qual.field_name)] = self.convert_to_neo4j( self.columns[qual.field_name], qual.value) log_to_postgres('Neo4j query is : ' + str(statement), DEBUG) log_to_postgres('With params : ' + str(params), DEBUG) # Execute & retrieve neo4j data if self.driver.supports_multi_db(): session = self.driver.session(database=self.database) else: session = self.driver.session() try: for record in session.run(statement, params): line = {} for column_name in columns: # TODO: from neo4j type to pg types line[column_name] = self.convert_to_pg( self.columns[column_name], record[column_name]) log_to_postgres("sending result item to PG : " + str(line), DEBUG) yield line except CypherSyntaxError: raise RuntimeError("Bad cypher query : " + statement) except CypherTypeError: raise RuntimeError("Bad cypher type in query : " + statement) finally: session.close()
def __init__(self, fdw_options, fdw_columns): super(SystemFdw, self).__init__(fdw_options, fdw_columns) log_to_postgres('options: %s' % fdw_options, DEBUG) log_to_postgres('columns: %s' % fdw_columns, DEBUG) if 'filepath' in fdw_options: self.filepath = fdw_options["filepath"] else: log_to_postgres('filepath parameter is required.', ERROR)
def execute(self, quals, columns): log_to_postgres("[CSVGZ_FDW] %s %s" % (quals, columns), logging.DEBUG) log_to_postgres("[CSVGZ_FDW] %s" % (self.file_name), logging.DEBUG) with gzip.open(self.file_name, 'rt') as f: reader = csv.reader(f) try: for row in reader: line = {} i = 0 for column_name in self.columns: if column_name in columns: line[column_name] = row[i] i = i + 1 yield line except OSError as e: log_to_postgres("[CSVGZ_FDW] OS Error (%s)" % (e), logging.ERROR) except csv.Error as e: log_to_postgres("[CSVGZ_FDW] Error {}" % (e), logging.ERROR) except: log_to_postgres( "[CSVGZ_FDW] Unexpected error: %s" % (sys.exc_info()[0]), logging.ERROR)
def _get_column_stats(self, columns): column_stats = {} # Temp solutions # Get total number of rows #total_rows = self.model.objects_in(self.db).count() # Get average rows per value in column (total divided by number of unique values) #exprs = ['intDiv(%d, uniqCombined(%s)) as %s' % (total_rows, c, c) for c in columns] #sql = "SELECT %s FROM $db.`%s`" % (', '.join(exprs), self.table_name) #for row in self.db.select(sql): # for c in columns: # column_stats[c] = dict(average_rows=getattr(row, c), size=4) # Get average size per column. This may fail because data_uncompressed_bytes is a recent addition sql = "SELECT * FROM system.columns WHERE database='%s' AND table='%s'" % ( self.db_name, self.table_name) for col_def in self.db.select(sql): column_stats[col_def.name] = dict(average_rows=100000, size=self._calc_col_size( col_def, 100000)) # Debug for c in columns: log_to_postgres(c + ': ' + repr(column_stats[c])) return column_stats
def report_errors(table_name: str): """Context manager that ignores exceptions and serializes them to JSON using PG's notice mechanism instead. The data source is meant to load these to report on partial failures (e.g. failed to load one table, but not others).""" try: yield except Exception as e: logging.error( "Error scanning %s, ignoring: %s: %s", table_name, get_exception_name(e), e, exc_info=e, ) log_to_postgres( "SPLITGRAPH: " + json.dumps( { "table_name": table_name, "error": get_exception_name(e), "error_text": str(e), } ) )
def __init__(self, options, columns): super(FakerForeignDataWrapper, self).__init__(options, columns) log_to_postgres("call __init__()", DEBUG) self.limit = 100 self.seed = None self.locale = 'en_US' self.columns = {} if 'max_results' in options: self.limit = int(options['max_results']) if 'locale' in options: self.locale = options['locale'] if 'seed' in options: self.seed = int(options['seed']) faker = Faker(self.locale) faker.seed(self.seed) for column in columns: func = getattr(faker, column, lambda: None) self.columns[column] = func
def _matches(self, quals, row): """ Returns True if row matches to all quals. Otherwise returns False. Args: quals (list of Qual): row (list or tuple): Returns: bool: True if row matches to all quals, False otherwise. """ for qual in quals: op = QUAL_OPERATOR_MAP.get(qual.operator) if op is None: log_to_postgres( 'Unknown operator {} in the {} qual. Row will be returned.' .format(qual.operator, qual), WARNING, hint='Implement that operator in the ambryfdw wrapper.') continue elem_index = self.columns.index(qual.field_name) if not op(row[elem_index], qual.value): return False return True
def delete(self, rowid): log_to_postgres('Delete Request - rowid: %s' % rowid, DEBUG) if not rowid: log_to_postgres('Update request requires rowid (PK).', ERROR) try: if self.type == 'ContentType': self.client.delete_content_type(rowid) elif self.type == 'Entry': self.client.delete_entry(rowid) elif self.type == 'Asset': self.client.delete_asset(rowid) else: log_to_postgres('Deleting entities of type %s is not supported yet.' % self.type, ERROR) except invalidContentfulResponse as e: log_to_postgres(e, ERROR) return {}
def insert(self, new_values): """ This method is invoked every time a INSERT is executed on the foreign table. """ log_to_postgres('Barman FDW INSERT output: %s' % new_values) backup_cmd = "barman backup %s" % new_values['server'] ssh_cmd = "%s@%s" % (self.barman_user, self.barman_host) log_to_postgres('Barman FDW INSERT output: %s' % backup_cmd) ssh = subprocess.Popen(["ssh", '-A', "%s" % ssh_cmd, backup_cmd], shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output = ssh.communicate() log_to_postgres('Barman FDW INSERT output: %s' % output[0]) log_to_postgres('Barman FDW INSERT errors: %s' % output[1]) return new_values
def execute(self, quals, columns): log_to_postgres('exec quals: %s' % quals, DEBUG) log_to_postgres('exec columns: %s' % columns, DEBUG) resp = {} try: resp = self._client.metadata(self._path, **self._metadata_params) except Exception as e: log_to_postgres( 'could not get metadata for path "%s": %s' % (self._path, str(e)), ERROR) for item in resp.get('contents', []): yield item
def execute(self, quals, columns): log_to_postgres('Query Columns: %s' % columns, DEBUG) log_to_postgres('Query Filters: %s' % quals, DEBUG) (client, bucket) = self.connect() filters = [] for qual in quals: try: operator_function = get_operator_function(qual.operator) except UnknownOperatorException, e: log_to_postgres(e, ERROR) filters.append( lambda x: operator_function(x[qual.field_name], qual.value))
def execute(self, quals, columns): log_to_postgres('exec quals: %s' % quals, DEBUG) log_to_postgres('exec columns: %s' % columns, DEBUG) for coll_name in self._db.collection_names(): try: res_rows = self._db.command({'collStats': coll_name}) utils.convertNestedItemsToJson(res_rows) yield res_rows except Exception as e: log_to_postgres( 'could not get stats for coll [%s]: %s' % (coll_name, str(e)), ERROR)
def __init__(self, options, columns): super(HiveServer2ForeignDataWrapper, self).__init__(options, columns) if 'host' not in options: log_to_postgres('The host parameter is required and the default is localhost.', WARNING) self.host = options.get("host", "localhost") if 'port' not in options: log_to_postgres('The host parameter is required and the default is 10000.', WARNING) self.port = options.get("port", "10000") if 'table' and 'query' not in options: log_to_postgres('table or query parameter is required.', ERROR) self.table = options.get("table", None) self.query = options.get("query", None) self.columns = columns
def __init__(self, options, columns): super(FilesystemFdw, self).__init__(options, columns) root_dir = options.get('root_dir') pattern = options.get('pattern') self.content_column = options.get('content_column', None) self.filename_column = options.get('filename_column', None) self.file_mode = int(options.get('file_mode', '700'), 8) self.structured_directory = StructuredDirectory( root_dir, pattern, file_mode=self.file_mode) self.folder_columns = [ key[0] for key in self.structured_directory._path_parts_properties if key ] # Keep a set of files that should not be seen inside the transaction, # because they have "logically" been deleted, but are not yet commited self.invisible_files = set() # Keep a dictionary of updated content. self.updated_content = dict() # Assume 100 files/folder per folder self.total_files = 100**len(pattern.split('/')) if self.filename_column: if self.filename_column not in columns: log_to_postgres( "The filename column (%s) does not exist" "in the column list" % self.filename_column, ERROR, "You should try to create your table with an " "additional column: \n" "%s character varying" % self.filename_column) else: columns.pop(self.filename_column) if self.content_column: if self.content_column not in columns: log_to_postgres( "The content column (%s) does not exist" "in the column list" % self.content_column, ERROR, "You should try to create your table with an " "additional column: \n" "%s bytea" % self.content_column) else: columns.pop(self.content_column) if len(self.structured_directory.properties) < len(columns): missing_columns = set(columns.keys()).difference( self.structured_directory.properties) log_to_postgres("Some columns are not mapped in the structured fs", level=WARNING, hint="Remove the following columns: %s " % missing_columns)
def execute(self, quals, columns): log_to_postgres('Query Columns: %s' % columns, DEBUG) log_to_postgres('Query Filters: %s' % quals, DEBUG) myQuery = r.table(self.table)\ .pluck(self.columns.keys()) for qual in quals: try: operatorFunction = getOperatorFunction(qual.operator) except unknownOperatorException, e: log_to_postgres(e, ERROR) myQuery = myQuery.filter( operatorFunction(r.row[qual.field_name], qual.value))