def run(self): """ Runs the clean algorythm. """ self.cc_log("INFO", "Data Processing Clean: Started") if self.format.lower() == "json": if self.drop and isinstance(self.drop, str): self.drop = [self.drop] if self.keep and isinstance(self.keep, str): self.keep = [self.keep] json_fr = json_file_reader(self.src) json_fw = json_file_writer(self.target) self.cc_log("INFO", "Started to clean line for line, please wait!") while not json_fr.isEOF(): data = json_fr.readRecord() keepLine, cleaned_line = self.clean_json(data) self.cc_log("DEBUG", cleaned_line) if keepLine: json_fw.writeRecord(cleaned_line) json_fr.close() json_fw.close() else: raise NotImplementedError( "The defined format is not implement yet. Please add!") self.cc_log("INFO", "Data Processing Clean: Finished") return True
def run(self): """ Runs the join algorythm. """ self.cc_log("INFO", "Data Processing Join: Started") if self.left_joinon and isinstance(self.left_joinon, str): self.left_joinon = [self.left_joinon] if self.right_joinon and isinstance(self.right_joinon, str): self.right_joinon = [self.right_joinon] # Create the B-Tree for quick and easy search b_tree = genBTree(self.joinwith, self.left_joinon) json_fr = json_file_reader(self.src) json_fw = json_file_writer(self.target) # Loop through all the left table failed_counter = 0 while not json_fr.isEOF(): data = json_fr.readRecord() key = keyGen(self.right_joinon, data) (data, b_tree, failed_counter) = self.join(b_tree, key, data, failed_counter) json_fw.writeRecord(data) json_fr.close() json_fw.close() self.cc_log( "INFO", "%i (btree) & %i (keyerror) records could not be mached" % (len(b_tree), failed_counter)) self.cc_log("INFO", "Data Processing Join: Finished") return True
def process_ip_lookup_data(self, lookup_data):
    """
    Writes an ip lookup dataset to the target file if it has not been
    processed already and marks it as processed afterwards.

    **Parameters**:
        lookup_data : dict
            single ip lookup dataset received via the shodan api.

    **Returns**:
        ``True`` if the ip lookup data was written.
        ``False`` if the lookup was already processed and not written.
    """
    data_ts = lookup_data["timestamp"]
    # Check if we already processed this timestamp, possibly under another target name
    processed_ts = self.kv_store.get("processed_ts", section=self.moduleName)
    if not processed_ts:
        processed_ts = []
    if data_ts not in processed_ts:
        self.cc_log(
            "INFO",
            "Banner data for TS %s has not been processed yet" % (data_ts))
        json_fw = json_file_writer(self.target)
        json_fw.writeRecord(lookup_data)
        json_fw.close()
        processed_ts.append(data_ts)
        self.kv_store.put("processed_ts", processed_ts,
                          section=self.moduleName, force=True)
        # Save the newest processed timestamp to be able to tell the time between the last run and the current run
        newest_processed_ts = self.kv_store.get("newest_processed_ts",
                                                section=self.moduleName)
        if not newest_processed_ts or self.shodan_ts_is_newer(
                data_ts, newest_processed_ts):
            self.kv_store.put("newest_processed_ts", data_ts,
                              section=self.moduleName, force=True)
        # Save the target file for an explicit timestamp so we can tell later whether the file was already processed
        self.kv_store.put(data_ts, self.target, section=self.moduleName,
                          force=True)
        return True
    else:
        original_target = self.kv_store.get(data_ts, section=self.moduleName)
        self.cc_log(
            "WARNING",
            "Dataset with the TS %s has already been processed with the target %s - skipping the rest of the path!"
            % (data_ts, original_target))
        return False
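# A minimal sketch of shodan_ts_is_newer, assuming Shodan timestamps follow
# the "%Y-%m-%dT%H:%M:%S.%f" format the API commonly returns. This helper is
# illustrative only, not the original implementation.
from datetime import datetime

def shodan_ts_is_newer_sketch(candidate_ts, reference_ts):
    """Return True if candidate_ts lies after reference_ts."""
    fmt = "%Y-%m-%dT%H:%M:%S.%f"
    return datetime.strptime(candidate_ts, fmt) > datetime.strptime(reference_ts, fmt)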
def run(self): """ Runs the classing algorythm. """ self.cc_log("INFO", "Data Processing Classing: Started") json_fr = json_file_reader(self.src) json_fw = json_file_writer(self.target) while not json_fr.isEOF(): record = json_fr.readRecord() classes = self.getClasses(record) record['classes'] = classes json_fw.writeRecord(record) json_fr.close() json_fw.close() self.cc_log("INFO", "Data Processing Classing: Finished") return True
def run(self): """ Runs the group algorythm. """ self.cc_log("INFO", "Data Processing Group: Started") data_dict = {} json_fr = json_file_reader(self.src) json_fw = json_file_writer(self.target) # load data self.cc_log("DEBUG", "Started to group, please wait...!") while not json_fr.isEOF(): data = json_fr.readRecord() for attribute in self.groupBy.split('.'): data = data.get(attribute, {}) if not data: self.cc_log("DEBUG", "Skip a line, attribute was not found!") continue # Skip as attribute seems to not be found # check if the groupRegex is set and get the first group of it if self.groupRegex: data = re.search(self.groupRegex, data) if not data or not data.group(0): data = "others" else: data = data.group(0) self.cc_log("DEBUG", "Regex grouped %s" % data) if data in data_dict: data_dict[data] += 1 else: data_dict[data] = 1 for entry in self.dictToList(data_dict): json_fw.writeRecord(entry) json_fr.close() json_fw.close() self.cc_log( "INFO", "Data Processing Group: Aggregated the data set into " + str(len(data_dict.keys())) + " data entries") self.cc_log("INFO", "Data Processing Group: Finished") return True
def run_shodan_search_query(self):
    """
    Runs the shodan api search query lookup.

    **Returns**:
        ``True`` if the search query lookup was successful and the data written.
        ``False`` if the lookup failed.
    """
    self.cc_log(
        "INFO",
        "Data Store Shodan: Started Search Query Lookup With Query '%s'" %
        self.query)
    s_api = shodan.Shodan(self.apiKey)
    json_fw = json_file_writer(self.target)
    counter = 0
    for banner in s_api.search_cursor(self.query, minify=self.minify,
                                      retries=self.retries):
        json_fw.writeRecord(banner)
        counter += 1
        self.cc_log("DEBUG", "Data amount: %s!" % (counter))
        if counter >= self.limit:
            break
    json_fw.close()
    if counter > 0:
        self.cc_log(
            "INFO",
            "A total of %s banner records were downloaded!" % (counter))
        return True
    self.cc_log("WARNING", "No data was downloaded via search cursor lookup!")
    return False
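# Standalone usage sketch of the Shodan search cursor this method relies on.
# The query and limit below are placeholders; a valid API key is required.
import shodan

api = shodan.Shodan("YOUR_API_KEY")
for i, banner in enumerate(api.search_cursor("port:22", minify=True, retries=5)):
    print(banner.get("ip_str"), banner.get("port"))
    if i >= 9:  # stop after 10 banners, mirroring the limit check above
        break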
def run(self): """ Runs the filter algorythm. """ self.cc_log("INFO", "Data Processing Filter: Started") count = 0 json_fr = json_file_reader(self.src) json_fw = json_file_writer(self.target) # load data self.cc_log("DEBUG", "Started to filter, please wait...!") while not json_fr.isEOF(): data = json_fr.readRecord() if self.filter(data): json_fw.writeRecord(data) else: count += 1 json_fr.close() json_fw.close() self.cc_log( "INFO", "Data Processing Filter: Filtered " + str(count) + " data sets") self.cc_log("INFO", "Data Processing Filter: Finished") return True
def run(self): """ Runs the diff algorythm. **Returns**: ``True`` if the run works fine. """ self.cc_log("INFO", "Data Processing Diff: Started") if self.attributes_diff and isinstance(self.attributes_diff, str): self.attributes_diff = [self.attributes_diff] if self.key_attributes and isinstance(self.key_attributes, str): self.key_attributes = [self.key_attributes] # if the target does not exist create the file and add all the data if not path.isfile(self.target): json_fr = json_file_reader(self.src) self.cc_log("DEBUG", "Opened source file") json_fw = json_file_writer(self.target) self.cc_log("DEBUG", "Opened target file - please have patience") while not json_fr.isEOF(): data = json_fr.readRecord() data = self.genDataSet(keyGen(self.key_attributes, data), data, self.attributes_diff) json_fw.writeRecord(data) json_fr.close() json_fw.close() # else create a B-Tree out of the src file with the nessecary data else: self.cc_log( "DEBUG", "Generating B-Tree for the diff - please have patience") b_tree = genBTree(self.src, self.key_attributes) # move the old target so it can be read from and does not collide with the writer old_target = self.target + '.old' move(self.target, old_target) json_fr = json_file_reader(old_target) json_fw = json_file_writer(self.target) self.cc_log("INFO", "Started to generate the diff - please have patience") while not json_fr.isEOF(): old_data = json_fr.readRecord() try: # update all the data new_data = b_tree.pop(old_data["cc_id"]) diff_data = self.getDataByAttributes( self.attributes_diff, new_data) old_data = self.compareData(old_data, diff_data) except KeyError: # if the id cannot be found it must be delete old_data["cc_status"] = "delete" old_data["cc_time_id"] = self.time_id json_fw.writeRecord(old_data) # add the left over data self.cc_log("INFO", "Adding leftover data...") while b_tree: key = b_tree.minKey() data = self.genDataSet(key, b_tree.pop(key), self.attributes_diff) json_fw.writeRecord(data) remove(old_target) json_fr.close() json_fw.close() self.kv_store.put(key="diff_last_src", value=(self.time_id), section=self.moduleName, force=True) self.cc_log("INFO", "Data Processing Diff: Finished") return True
def run(self): """ Runs the clean algorythm. **Returns**: ``True`` if this run succeeded. ``False`` if this run did not succeed. """ self.cc_log("INFO", "Data Processing Country: Started") self.cc_log( "DEBUG", "Trying to open the MaxMind GeoLite2-Country DB, please wait!") try: db = geoip2.database.Reader(self.max_mind_db_path) except Exception as e: self.logger.exception(e) self.cc_log( "ERROR", "Failed to open the MaxMind GeoLite2-Country DB at %s - please check the file!" % (self.max_mind_db_path)) return False self.cc_log("DEBUG", "Opened the MaxMindGeoLite2-Country DB!") json_fr = json_file_reader(self.src) json_fw = json_file_writer(self.target) self.cc_log( "INFO", "Started to lookup ips and write into the target, please wait!") while not json_fr.isEOF(): data = json_fr.readRecord() country_code = "-99" found_ip = data for attribute in self.ip_input_attribute.split('.'): found_ip = found_ip[attribute] if not found_ip or found_ip == data: self.cc_log( "WARNING", "No IP found at the give ipInputAttribute place - Add country code -99 to this dataset!" ) else: # Lookup ip for country try: ip_info = db.country(found_ip) if ip_info.country.iso_code: country_code = ip_info.country.iso_code self.cc_log( "DEBUG", "Found country code %s for ip %s" % (ip_info.country.iso_code, found_ip)) except Exception as e: self.cc_log( "WARNING", "No country code found for ip %s - add -99 to country code" % (found_ip)) data[self.output_attribute] = country_code json_fw.writeRecord(data) json_fr.close() json_fw.close() db.close() self.cc_log("INFO", "Data Processing Country: Finished") return True
def setUp(self):
    if not os.path.exists(TESTDATA_GEN_OUTPUT_FOLDER):
        os.makedirs(TESTDATA_GEN_OUTPUT_FOLDER)
    self.fw = json_file_writer(TESTDATA_TARGET_FILENAME)