def update_second_hop_latency(probe_latency_queue: mp.Queue, finish_event: threading.Event):
    Session = create_session_for_process(engine)
    db_session = Session()
    update_sql = 'UPDATE probes SET second_hop_latency = {} WHERE id = {} AND ' \
                 '(second_hop_latency IS NULL OR second_hop_latency > {});'
    probe_dct = {}

    while not finish_event.is_set() or not probe_latency_queue.empty():
        try:
            probe_id, latency = probe_latency_queue.get(timeout=1)

            if probe_dct.get(probe_id, sys.maxsize) > latency:
                probe_dct[probe_id] = latency
        except queue.Empty:
            if finish_event.is_set():
                break

    for probe_id, latency in probe_dct.items():
        db_session.execute(update_sql.format(latency, probe_id, latency))

    logger.debug("Committing updates to second hop latency")
    db_session.commit()
    db_session.close()
    Session.remove()
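
# The UPDATE above interpolates values with str.format(). Below is a minimal sketch of the
# same statement with SQLAlchemy bind parameters; the helper name is hypothetical and purely
# illustrative, the table and column names are taken from the statement above.
def _update_second_hop_latency_parameterised(db_session, probe_dct):
    from sqlalchemy import text  # local import keeps the sketch self-contained

    update_stmt = text('UPDATE probes SET second_hop_latency = :latency '
                       'WHERE id = :probe_id '
                       'AND (second_hop_latency IS NULL OR second_hop_latency > :latency)')
    for probe_id, latency in probe_dct.items():
        db_session.execute(update_stmt, {'latency': latency, 'probe_id': probe_id})
    db_session.commit()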
def main():
    """Main function"""
    parser = argparse.ArgumentParser()
    __create_parser_arguments(parser)
    args = parser.parse_args()

    global logger
    logger = util.setup_logger(args.logging_file, 'delete_measurements', args.log_level)

    global engine
    engine = create_engine(args.database_name)

    Session = create_session_for_process(engine)
    db_session = Session()

    oldest_date_allowed = datetime.date.today() - datetime.timedelta(days=args.days_in_past)

    db_session.query(MeasurementResult).filter(
        MeasurementResult.timestamp < oldest_date_allowed).delete()
    db_session.commit()
    db_session.close()
    Session.remove()

    logger.info('deleted all measurements before {}'.format(
        oldest_date_allowed.strftime('%Y-%m-%d')))
def create_trie(code_blacklist_filepath: str, word_blacklist_filepath: str):
    """
    Creates a RecordTrie with the marisa library

    :param code_blacklist_filepath: the path to the code blacklist file
    :param word_blacklist_filepath: the path to the word blacklist file
    :rtype: marisa_trie.RecordTrie
    """
    Session = create_session_for_process(engine)
    db_session = Session()
    try:
        locations = db_session.query(LocationInfo)

        code_blacklist_set = set()
        if code_blacklist_filepath:
            with open(code_blacklist_filepath) as code_blacklist_file:
                for line in code_blacklist_file:
                    line = line.strip()
                    # skip blank lines and comment lines
                    if line and line[0] != '#':
                        code_blacklist_set.add(line)

        word_blacklist_set = set()
        if word_blacklist_filepath:
            with open(word_blacklist_filepath) as word_blacklist_file:
                for line in word_blacklist_file:
                    line = line.strip()
                    if line and line[0] != '#':
                        word_blacklist_set.add(line)

        return create_trie_obj(locations, code_blacklist_set, word_blacklist_set)
    finally:
        db_session.close()
        Session.remove()
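
# Hypothetical usage of create_trie (a sketch; the file names are illustrative and the
# lookups use the standard marisa_trie.RecordTrie interface):
#
#     trie = create_trie('blacklists/codes.txt', 'blacklists/words.txt')
#     if 'fra' in trie:          # membership test for a location code
#         records = trie['fra']  # records stored for that code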
def main():
    """Main function"""
    parser = argparse.ArgumentParser()
    __create_parser_arguments(parser)
    args = parser.parse_args()

    global log
    log = setup_logger(args.logging_file, 'parse_ripe_probes')

    engine = create_engine(args.database_name)
    Session = create_session_for_process(engine)
    db_session = Session()

    ripe_sema = threading.BoundedSemaphore(50)
    stop_event = threading.Event()
    token_generator_thread = start_token_generating_thread(
        ripe_sema, args.ripe_requests_per_second, stop_event)

    probes = get_probes(db_session, ripe_sema)

    stop_event.set()
    token_generator_thread.join()

    log.info('writing probes to tmp')
    os.makedirs(os.path.dirname(PROBE_CACHING_PATH), exist_ok=True)
    with open(PROBE_CACHING_PATH, 'w') as ripe_temp_file:
        probe_info_to_write = {
            probe.id: is_in_nat for probe, is_in_nat in probes.values()
        }
        json.dump(probe_info_to_write, ripe_temp_file)
def measurement_results_saver(measurement_results_queue: queue.Queue,
                              stop_event: threading.Event):
    Session = create_session_for_process(engine)
    db_session = Session()
    results = []
    count_results = 0

    while not stop_event.is_set() or not measurement_results_queue.empty():
        try:
            measurement_result = measurement_results_queue.get(timeout=5)
            results.append(measurement_result)
            count_results += 1

            if count_results % 10**5 == 0:
                db_session.bulk_save_objects(results)
                db_session.commit()
                results.clear()
        except queue.Empty:
            pass

    db_session.bulk_save_objects(results)
    db_session.commit()
    db_session.close()
    Session.remove()
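
# A minimal sketch of how the saver above is driven (the real wiring lives in
# ripe_check_process; this helper is hypothetical and only illustrative):
def _run_saver_example(measurement_results):
    results_queue = queue.Queue()
    stop_event = threading.Event()
    saver = threading.Thread(target=measurement_results_saver,
                             args=(results_queue, stop_event))
    saver.start()

    for result in measurement_results:
        results_queue.put(result)

    stop_event.set()  # the saver drains the queue, bulk-saves, commits, and exits
    saver.join()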
def main():
    """Main function"""
    parser = argparse.ArgumentParser()
    __create_parser_arguments(parser)
    args = parser.parse_args()

    global logger
    logger = util.setup_logger(args.logging_file, 'parse_ripe_archive', args.log_level)

    logger.info('starting caida parsing for archive path %s', args.archive_path)

    if not os.path.isdir(args.archive_path):
        print('Archive path does not lead to a directory', file=sys.stderr)
        return 1

    global engine
    engine = create_engine(args.database_name)
    Session = create_session_for_process(engine)
    db_session = Session()

    parsed_file_name = '{}-parsed-caida-files.txt'.format(args.database_name)
    parsed_files = set()
    if not os.path.exists(parsed_file_name):
        logger.debug('Creating parsed files history file for database %s', args.database_name)
    else:
        with open(parsed_file_name) as parsed_files_history_file:
            for line in parsed_files_history_file:
                parsed_files.add(line.strip())

    filenames, probe_dct = get_filenames(args.archive_path, args.file_regex, args.days_in_past,
                                         parsed_files, db_session)

    if not filenames:
        logger.info('found no files to parse')
        return 0

    mp_manager = mp.Manager()
    new_parsed_files = mp_manager.Queue()

    with concurrent.ProcessPoolExecutor(max_workers=args.number_processes) as processing_executor:
        processing_results = processing_executor.map(
            functools.partial(parse_caida_data, not args.plaintext, args.days_in_past,
                              probe_dct, new_parsed_files),
            filenames)

        try:
            while True:
                try:
                    processing_results.__next__()
                except StopIteration:
                    break
        except Exception:
            logger.exception('process threw exception')
        finally:
            with open(parsed_file_name, 'a') as parsed_files_history_file:
                while not new_parsed_files.empty():
                    parsed_files_history_file.write(new_parsed_files.get(timeout=1) + '\n')
def parse_codes(args):
    """start real parsing"""
    Session = create_session_for_process(engine)
    db_session = Session()

    # time.clock() was removed in Python 3.8; process_time() measures the CPU time instead
    start_time = time.process_time()
    start_rtime = time.time()

    try:
        with db_session.no_autoflush:
            if args.airport_codes:
                parse_airport_codes(args, db_session)

            if args.metropolitan_file:
                metropolitan_locations = parse_metropolitan_codes(args.metropolitan_file,
                                                                  db_session)
                if args.merge_radius:
                    add_locations(AIRPORT_LOCATION_CODES, metropolitan_locations,
                                  args.merge_radius, db_session, create_new_locations=False)
                else:
                    add_locations(AIRPORT_LOCATION_CODES, metropolitan_locations, 100,
                                  db_session, create_new_locations=False)

            if args.locode:
                parse_locode_codes(args.locode, db_session)

            if args.clli:
                get_clli_codes(args.clli, db_session)
                logger.debug('Finished clli parsing')

            if args.geonames:
                get_geo_names(args.geonames, args.min_population, db_session)
                logger.debug('Finished geonames parsing')

            locations = merge_location_codes(args.merge_radius, db_session)
            sanitize_location_names(locations)

            db_session.add_all(locations)
            db_session.commit()
    finally:
        db_session.close()
        Session.remove()

    end_time = time.process_time()
    end_rtime = time.time()
    logger.debug('finished and needed {} seconds of the processor computation time\n'
                 'And {} seconds of the real world time.\n'
                 'Collected data on {} locations.'.format(
                     (end_time - start_time), int(end_rtime - start_rtime), len(locations)))
def main():
    """Main function"""
    parser = argparse.ArgumentParser()
    __create_parser_arguments(parser)
    args = parser.parse_args()

    global logger
    logger = util.setup_logger(args.logging_file, 'parse_zmap_results')

    global engine
    engine = create_engine(args.database_name)
    Session = create_session_for_process(engine)
    db_session = Session()

    filenames = get_filenames(args.zmap_results_dir, args.file_regex)

    locations = {}
    config_parser = configparser.ConfigParser()
    if os.path.isfile(args.locations_config_file):
        config_parser.read(args.locations_config_file)

        for filename in filenames:
            location_name = __get_location_name(filename)

            if location_name not in config_parser or \
                    'lat' not in config_parser[location_name] or \
                    'lon' not in config_parser[location_name]:
                logger.critical(
                    '{} is not defined in the config file or does not have the right format! '
                    'Aborting!'.format(location_name))
                return 3

            location = location_for_coordinates(config_parser[location_name]['lat'],
                                                config_parser[location_name]['lon'],
                                                db_session)

            probe = ZmapProbe(probe_id=location_name, location=location)
            db_session.add(probe)
            db_session.commit()

            locations[location_name] = probe.id
    else:
        raise ValueError('locations_config_file path does not lead to a file')

    parse(filenames, locations, db_session)

    db_session.close()
    Session.remove()
def preprocess_file_part(filepath: str, pnr: int, line_queue: mp.Queue, ip_encoding_filter: bool,
                         regex_strategy: RegexStrategy, tlds: typing.Set[str],
                         whitelist: typing.Set[str], parsed_ips: typing.Set[str],
                         parsed_ips_lock: mp.Lock, domain_label_queue: mp.Queue,
                         finished_reading_event: mp.Event):
    """
    Sanitizes the domain lines handed over via line_queue

    pnr is a number to recognize the process
    regex_strategy and ip_encoding_filter are used to filter ISP client domain names
    that encode the client IP
    """
    logger.info('starting')
    label_stats = collections.defaultdict(int)
    Session = create_session_for_process(engine)
    db_session = Session()
    try:
        bad_characters = collections.defaultdict(int)
        count_good_lines = 0
        count_isp_lines = 0

        while not finished_reading_event.is_set() or not line_queue.empty():
            try:
                line = line_queue.get(timeout=2)
            except queue.Empty:
                time.sleep(1)
                continue

            if len(line) == 0:
                continue

            line = line.strip()
            ip, domain = line.split(',', 1)

            if not domain:
                logger.info('Warning found empty domain for IP {} skipping'.format(ip))
                continue

            if ':' in ip:
                ip_version = constants.IPV6_IDENTIFIER
            else:
                ip_version = constants.IPV4_IDENTIFIER

            with parsed_ips_lock:
                parsed_ips.add(ip)

            n_good_lines_count, n_ip_lines_count = classify_domain(
                ip, domain, ip_version, ip_encoding_filter, regex_strategy, tlds, whitelist,
                db_session, domain_label_queue)

            count_good_lines += n_good_lines_count
            count_isp_lines += n_ip_lines_count

            db_session.commit()
        else:
            logger.info('finished no more lines')

        directory = os.path.dirname(filepath)
        filename = util.get_path_filename(filepath)

        logger.info('good lines: {} ips lines: {}'.format(count_good_lines, count_isp_lines))

        with open(os.path.join(directory, '{0}-{1}-character.stats'.format(filename, pnr)),
                  'w', encoding='utf-8') as character_stats_file:
            json.dump(bad_characters, character_stats_file)

        with open(os.path.join(directory, '{0}-{1}-domain-label.stats'.format(filename, pnr)),
                  'w', encoding='utf-8') as label_stat_file:
            json.dump(label_stats, label_stat_file)
    finally:
        db_session.close()
        Session.remove()
        domain_label_queue.close()
def handle_labels(labels_queue: mp.Queue, stop_event: threading.Event):
    """Handles the label results and saves them to the database without blocking
    the other db queries"""

    class DomainLabelHolder:
        def __init__(self, label):
            self.label = label
            self.label_id = 0
            self.domain_ids = set()
            self._handled_domain_ids = set()

        def add_domain_id(self, domain_id):
            if domain_id not in self.domain_ids and domain_id not in self._handled_domain_ids:
                self.domain_ids.add(domain_id)

        def get_insert_values(self):
            values = [{
                'domain_id': domain_id,
                'domain_label_id': self.label_id
            } for domain_id in self.domain_ids]
            return values

        def handled_domain_ids(self):
            self._handled_domain_ids = self._handled_domain_ids.union(self.domain_ids)
            self.domain_ids.clear()

        def __hash__(self):
            return hash(self.label)

    def save_labels(domain_labels_dct, labels_to_save, new_labels, db_sess):
        if new_labels:
            db_sess.bulk_save_objects([label.label for label in new_labels],
                                      return_defaults=True)
            for label_obj in new_labels:
                label_obj.label_id = label_obj.label.id

        values_to_insert = []
        for label_obj in labels_to_save:
            if label_obj.domain_ids:
                values_to_insert.extend(label_obj.get_insert_values())
                label_obj.handled_domain_ids()

        new_labels.clear()
        labels_to_save.clear()

        # only execute the insert if there is actually something to insert
        if values_to_insert:
            insert_expr = domain_to_label_table.insert().values(values_to_insert)
            db_sess.execute(insert_expr)

    Session = create_session_for_process(engine)
    db_session = Session()

    new_labels = []
    labels_to_save = set()
    domain_labels = {}
    counter = 0
    ccounter = 0

    while not stop_event.is_set() or not labels_queue.empty():
        try:
            domain_label_tuples = labels_queue.get(timeout=1)

            for label_name, domain_id in domain_label_tuples:
                try:
                    label = domain_labels[label_name]
                except KeyError:
                    label_obj = DomainLabel(label_name)
                    label = DomainLabelHolder(label_obj)
                    domain_labels[label_name] = label
                    new_labels.append(label)

                labels_to_save.add(label)
                label.add_domain_id(domain_id)
                counter += 1

                if counter >= 10**4:
                    save_labels(domain_labels, labels_to_save, new_labels, db_session)
                    counter = 0
                    ccounter += 1

                    if ccounter >= 10:
                        db_session.commit()
                        ccounter = 0
        except queue.Empty:
            pass

    save_labels(domain_labels, labels_to_save, new_labels, db_session)
    db_session.commit()
    # release the scoped session like the other queue workers do
    db_session.close()
    Session.remove()

    logger.info('stopped')
    labels_queue.close()
def main():
    """Main function"""
    parser = argparse.ArgumentParser()
    __create_parser_arguments(parser)
    args = parser.parse_args()

    global logger
    logger = util.setup_logger(args.log_file, 'validate-stats', loglevel=args.log_level)
    logger.debug('starting')

    engine = create_engine(args.database_name)
    db_session = create_session_for_process(engine)()

    if args.allowed_measurement_age:
        oldest_date_allowed = datetime.datetime.now() - datetime.timedelta(
            seconds=args.allowed_measurement_age)
        sql_args = 'TIMESTAMP \'' + oldest_date_allowed.strftime('%Y-%m-%d %H:%M:%S') + '\', '
    else:
        sql_args = 'NULL, '

    if args.minimum_measurement_age:
        sql_args += 'TIMESTAMP \'' + args.minimum_measurement_age + '\', '
    else:
        sql_args += 'NULL, '

    sql_args += str(args.exclude_traceroute) + ', ' + str(args.exclude_caida_measurements)

    sql_query = 'SELECT * from domainsWithDistanceRTTs({});'.format(sql_args)
    results = db_session.execute(sql_query)

    rtt_distances = []
    domains_count = collections.defaultdict(int)
    location_id_count = collections.defaultdict(int)
    probes_count = collections.defaultdict(int)

    for domain_id, domain_name, hint_location_id, hint_location_name, location_hint_id, \
            measurement_result_id, probe_id, distance, min_rtt in results:
        rtt_distances.append((domain_id, min_rtt, distance))
        domains_count[domain_base_name(domain_name)] += 1
        location_id_count[hint_location_id] += 1
        probes_count[probe_id] += 1

    with open(args.output_filename, 'w') as output_file:
        str_to_write = '\n'.join(['{}; {}; {}'.format(domain_id, rtt, dist)
                                  for domain_id, rtt, dist in rtt_distances])
        output_file.write(str_to_write)

    print('domains count: ')
    pprint.pprint(sorted(domains_count.items(), key=operator.itemgetter(1), reverse=True))
    print('location_id_count')
    pprint.pprint(sorted(location_id_count.items(), key=operator.itemgetter(1), reverse=True))
    print('probes_count')
    pprint.pprint(sorted(probes_count.items(), key=operator.itemgetter(1), reverse=True))
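
# A parameterised variant of the domainsWithDistanceRTTs() call above, shown as a
# hypothetical helper (a sketch; the bind-parameter names are illustrative, the stored
# function and its argument order are taken from main()):
def _query_domains_with_distance_rtts(db_session, oldest_date_allowed, minimum_measurement_age,
                                      exclude_traceroute, exclude_caida_measurements):
    from sqlalchemy import text  # local import keeps the sketch self-contained

    query = text('SELECT * FROM domainsWithDistanceRTTs(:oldest, :minimum, '
                 ':exclude_traceroute, :exclude_caida);')
    return db_session.execute(query, {
        'oldest': oldest_date_allowed,           # datetime or None
        'minimum': minimum_measurement_age,      # timestamp string or None
        'exclude_traceroute': exclude_traceroute,
        'exclude_caida': exclude_caida_measurements,
    })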
def main():
    """Main function"""
    parser = argparse.ArgumentParser()
    __create_parser_arguments(parser)
    args = parser.parse_args()

    global logger
    logger = util.setup_logger(args.logging_file, 'parse_ripe_archive')

    if not os.path.isdir(args.archive_path):
        print('Archive path does not lead to a directory', file=sys.stderr)
        return 1

    global engine
    engine = create_engine(args.database_name)

    parsed_file_name = '{}-parsed-ripe-files.txt'.format(args.database_name)
    parsed_files = set()
    if not os.path.exists(parsed_file_name):
        logger.debug('Creating parsed files history file for database {}'.format(
            args.database_name))
    else:
        with open(parsed_file_name) as parsed_files_history_file:
            for line in parsed_files_history_file:
                parsed_files.add(line.strip())

    file_names = get_filenames(args.archive_path, args.file_regex, parsed_files,
                               args.days_in_past)

    if not file_names:
        logger.info('No files found')
        return

    logger.info('%s files to parse', len(file_names))

    Session = create_session_for_process(engine)
    db_session = Session()
    probe_dct = load_probes_from_cache(db_session)
    for probe, _ in probe_dct.values():
        _ = probe.id
        db_session.expunge(probe)
    db_session.close()
    Session.remove()

    new_parsed_files = mp.Queue()
    probe_latency_queue = mp.Queue()
    finish_event = threading.Event()
    probe_latency_thread = threading.Thread(target=update_second_hop_latency,
                                            args=(probe_latency_queue, finish_event),
                                            name='update probe latency')
    probe_latency_thread.start()

    finished_reading_event = mp.Event()
    line_queue = mp.Queue(args.number_processes * buffer_lines_per_process)

    try:
        with concurrent.ThreadPoolExecutor(max_workers=args.workers) as read_thread_executor:
            read_thread_results = read_thread_executor.map(
                functools.partial(read_file, not args.plaintext, args.days_in_past,
                                  line_queue, new_parsed_files),
                file_names)

            time.sleep(1)

            processes = []
            for index in range(0, args.number_processes):
                process = mp.Process(target=parse_ripe_data,
                                     args=(line_queue, finished_reading_event, probe_dct,
                                           probe_latency_queue),
                                     name='ripe-parsing-{}'.format(index))
                processes.append(process)
                process.start()

            try:
                while True:
                    try:
                        read_thread_results.__next__()
                    except StopIteration:
                        break
            except Exception:
                logger.exception('read thread returned with exception')

            finished_reading_event.set()

            for process in processes:
                process.join()
    except KeyboardInterrupt:
        finished_reading_event.set()
        print('trying to do a graceful shutdown press Ctrl+C another time to force '
              'shutdown')
        for process in processes:
            process.join()
    finally:
        with open(parsed_file_name, 'a') as parsed_files_history_file:
            while not new_parsed_files.empty():
                filename = new_parsed_files.get()
                parsed_files_history_file.write(filename + '\n')

        finish_event.set()
        logger.debug('finish event set waiting for second hop latency thread')
        probe_latency_thread.join()
def search_process(index, trie, code_to_location_blacklist, limit, nr_processes,
                   location_match_queue, amount=1000, debug: bool = False):
    """
    Searches the domain labels assigned to this process for location hints.

    Set amount=0 to search all labels.
    """
    Session = create_session_for_process(engine)
    db_session = Session()

    match_count = collections.defaultdict(int)
    entries_count = 0
    label_count = 0
    entries_wl_count = 0
    label_wl_count = 0
    label_length = 0

    if debug:
        last_search = datetime.datetime.now() - datetime.timedelta(minutes=1)
    else:
        last_search = datetime.datetime.now() - datetime.timedelta(days=7)

    for domain_label in get_all_domain_labels(index, block_limit=limit,
                                              nr_processes=nr_processes,
                                              db_session=db_session):
        label_count += 1
        label_length += len(domain_label.name)

        if domain_label.last_searched:
            if domain_label.last_searched > last_search:
                if domain_label.hints:
                    label_wl_count += 1
                continue
            else:
                location_match_queue.put(domain_label.id)

        pm_count = collections.defaultdict(int)
        temp_gr_count = search_in_label(domain_label, trie, code_to_location_blacklist,
                                        location_match_queue)

        for key, value in temp_gr_count.items():
            match_count[key] += value
            pm_count[key] += value

        if temp_gr_count:
            label_wl_count += 1

        entries_count += 1
        if entries_count == amount:
            break

    def build_stat_string_for_logger():
        """
        Builds a string for the final output

        :returns str: a string with a lot of logging info
        """
        stats_string = 'Stats for this process following:'
        stats_string += '\n\ttotal entries: {}'.format(entries_count)
        stats_string += '\n\ttotal labels: {}'.format(label_count)
        stats_string += '\n\ttotal label length: {}'.format(label_length)
        stats_string += '\n\tentries with location found: {}'.format(entries_wl_count)
        stats_string += '\n\tlabel with location found: {}'.format(label_wl_count)
        stats_string += '\n\tmatches: {}'.format(sum(match_count.values()))
        stats_string += '\n\tmatch count:\n\t\t{}'.format(match_count)
        return stats_string

    logger.info(build_stat_string_for_logger())

    db_session.close()
    Session.remove()
    location_match_queue.close()
def ripe_check_process(pid: int, ripe_create_sema: mp.Semaphore, ripe_slow_down_sema: mp.Semaphore,
                       bill_to_address: str, wo_measurements: bool, allowed_measurement_age: int,
                       api_key: str, domain_block_limit: int, nr_processes: int,
                       include_ip_encoded: bool, measurement_strategy: MeasurementStrategy,
                       number_of_probes_per_measurement: int, buffer_time: float,
                       packets_per_measurement: int, use_efficient_probes: bool,
                       location_to_probes_dct: typing.Dict[
                           str, typing.Tuple[RipeAtlasProbe, float]],
                       stop_without_old_results: bool, ip_list: typing.List[str],
                       endless_measurements: bool, random_domains: bool):
    """Checks for all domains if the suspected locations are correct"""
    correct_type_count = collections.defaultdict(int)
    domain_type_count = collections.defaultdict(int)

    Session = create_session_for_process(engine)
    db_session = Session()
    db_session.expire_on_commit = False

    def increment_count_for_type(ctype: LocationCodeType):
        correct_type_count[ctype.name] += 1

    def increment_domain_type_count(dtype: DomainLocationType):
        """Append current domain in the domain dict to the dtype"""
        domain_type_count[dtype] += 1

    threads = []

    domain_types = [DomainType.valid]
    if include_ip_encoded:
        domain_types.append(DomainType.ip_encoded)

    measurement_results_queue = queue.Queue()
    stop_event = threading.Event()
    save_measurements_thread = threading.Thread(target=measurement_results_saver,
                                                args=(measurement_results_queue, stop_event))
    save_measurements_thread.start()

    try:
        if ip_list:
            domain_generator = get_domains_for_ips(ip_list, db_session, domain_block_limit,
                                                   endless_mode=endless_measurements)
        else:
            domain_generator = get_all_domains_splitted_efficient(
                pid, domain_block_limit, nr_processes, domain_types, db_session,
                use_random_order=random_domains, endless_mode=endless_measurements)

        generator_lock = threading.Lock()

        def next_domain_info():
            try:
                with generator_lock:
                    domain = domain_generator.__next__()
                    location_hints = domain.all_label_matches

                    location_hint_tuples = []
                    for location_hint in location_hints:
                        if isinstance(location_hint, CodeMatch):
                            _ = location_hint.location.city_name
                            _ = location_hint.code_type
                            location_hint_tuples.append(
                                (location_hint, location_hint.location))
                            try:
                                db_session.expunge(location_hint)
                                db_session.expunge(location_hint.location)
                            except InvalidRequestError:
                                pass

                    loc_ip_version = constants.IPV4_IDENTIFIER if domain.ipv4_address else \
                        constants.IPV6_IDENTIFIER

                    measurement_results_query = get_measurements_for_domain(
                        domain, loc_ip_version, allowed_measurement_age, sorted_return=True,
                        db_session=db_session, allow_all_zmap_measurements=True)

                    measurement_result_tuples = []
                    for res in measurement_results_query:
                        _ = res.probe.location
                        measurement_result_tuples.append((res, res.probe.location))
                        try:
                            db_session.expunge(res.probe.location)
                            db_session.expunge(res)
                        except InvalidRequestError:
                            pass

                    db_session.expunge(domain)

                return domain, location_hint_tuples, measurement_result_tuples
            except StopIteration:
                return None

        for _ in range(0, MAX_THREADS):
            # TODO use ThreadPoolExecutor
            thread = threading.Thread(
                target=domain_check_threading_manage,
                args=(next_domain_info, increment_domain_type_count, increment_count_for_type,
                      ripe_create_sema, ripe_slow_down_sema, bill_to_address, wo_measurements,
                      allowed_measurement_age, api_key, measurement_strategy,
                      number_of_probes_per_measurement, buffer_time, packets_per_measurement,
                      use_efficient_probes, location_to_probes_dct, measurement_results_queue,
                      stop_without_old_results))
            threads.append(thread)
            thread.start()

        for thread in threads:
            thread.join()
    except KeyboardInterrupt:
        logger.warning('SIGINT recognized stopping Process')
    finally:
        stop_event.set()
        save_measurements_thread.join()
        db_session.close()
        Session.remove()

    count_alive = 0
    for thread in threads:
        if thread.is_alive():
            count_alive += 1

    logger.info('correct_count {}'.format(correct_type_count))
def main():
    """Main function"""
    parser = argparse.ArgumentParser()
    __create_parser_arguments(parser)
    args = parser.parse_args()

    global engine
    engine = create_engine(args.database_name)

    global logger
    logger = util.setup_logger(args.log_file, 'check', loglevel=args.log_level,
                               hourly_log_rotation=True)
    logger.debug('starting')

    start_time = time.time()

    Session = create_session_for_process(engine)
    db_session = Session()
    db_session.expire_on_commit = False

    ripe_slow_down_sema = mp.BoundedSemaphore(args.ripe_request_burst_limit)
    ripe_create_sema = mp.Semaphore(args.measurement_limit)

    global MAX_THREADS
    if args.debug:
        MAX_THREADS = 1
    else:
        MAX_THREADS = int(args.measurement_limit / args.number_processes * 1.5)

    finish_event = threading.Event()
    generator_thread = util.start_token_generating_thread(
        ripe_slow_down_sema, args.ripe_request_limit, finish_event)

    locations = db_session.query(LocationInfo)

    if not locations.count():
        logger.error('No locations found! Aborting!')
        print('No locations found! Aborting!')
        return 1

    if not args.disable_probe_fetching:
        probe_distances = load_probes_from_cache(db_session).values()
        location_to_probes_dct = assign_location_probes(
            locations, [probe for probe, _ in probe_distances], db_session)
        db_session.commit()

        null_locations = [location for location in locations
                          if location.id not in location_to_probes_dct]
        logger.info('{} locations without nodes'.format(len(null_locations)))
    else:
        locations = db_session.query(LocationInfo)
        location_to_probes_dct = {}
        loc_without_probes = 0
        probes = set()

        for location in locations:
            if location.nearby_probes:
                location_to_probes_dct[location.id] = []
                for probe in location.nearby_probes:
                    probes.add(probe)
                    # touch lazily loaded attributes so they stay usable after expunging
                    _ = str(probe.location.lat + probe.location.lon) + probe.location.id + \
                        str(probe.second_hop_latency) + probe.probe_id + str(probe.id)
                    location_to_probes_dct[location.id].append(
                        (probe, location.gps_distance_haversine(probe.location),
                         probe.location))
            else:
                loc_without_probes += 1

        logger.debug('expunging probes')
        for probe in probes:
            try:
                db_session.expunge(probe.location)
                db_session.expunge(probe)
            except InvalidRequestError:
                pass

        logger.debug('updating probes')
        update_probes(probes)

        logger.info('{} locations without nodes'.format(loc_without_probes))

    measurement_strategy = MeasurementStrategy(args.measurement_strategy)

    logger.debug('finished ripe')

    processes = []
    process_count = args.number_processes
    if args.debug:
        process_count = 1

    if args.ip_filter_file:
        ip_set = set()
        with open(args.ip_filter_file) as ip_filter_file:
            for line in ip_filter_file:
                ip_set.add(line.strip())
        ips = list(ip_set)
    else:
        ips = None

    db_session.close()

    for pid in range(0, process_count):
        if ips:
            ips_count = len(ips)
            ips_start_index = int(pid * (ips_count / process_count))
            ips_end_index = int((pid + 1) * (ips_count / process_count))
            if pid + 1 == process_count:
                ips_end_index = ips_count
            ips_for_process = ips[ips_start_index:ips_end_index]
        else:
            ips_for_process = None

        process = mp.Process(
            target=ripe_check_process,
            args=(pid, ripe_create_sema, ripe_slow_down_sema, args.bill_to,
                  args.without_new_measurements, args.allowed_measurement_age, args.api_key,
                  args.domain_block_limit, process_count, args.include_ip_encoded,
                  measurement_strategy, args.probes_per_measurement, args.buffer_time,
                  args.measurement_packets, args.use_efficient_probes, location_to_probes_dct,
                  args.stop_without_old_results, ips_for_process, args.endless_measurements,
                  args.random_domains),
            name='domain_checking_{}'.format(pid))
        processes.append(process)

    for process in processes:
        process.start()

    alive = len(processes)
    while alive > 0:
        try:
            process_sts = [pro.is_alive() for pro in processes]
            if process_sts.count(True) != alive:
                alive = process_sts.count(True)
                logger.debug('{} processes alive'.format(alive))
            for process in processes:
                process.join()
        except KeyboardInterrupt:
            pass

    if finish_event:
        finish_event.set()
    if generator_thread:
        generator_thread.join()

    logger.debug('{} processes alive'.format(alive))

    end_time = time.time()
    logger.info('running time: {}'.format((end_time - start_time)))
    return 0
def main():
    """Main function"""
    parser = configargparse.ArgParser(default_config_files=['find_default.ini'])
    __create_parser_arguments(parser)
    args = parser.parse_args()

    global logger
    logger = util.setup_logger(args.logging_file, 'find', loglevel=args.log_level)

    global engine
    engine = create_engine(args.database_name)

    trie = create_trie(args.code_blacklist_file, args.word_blacklist_file)

    code_to_location_blacklist = {}
    if args.code_to_location_blacklist_file:
        with open(args.code_to_location_blacklist_file) as code_to_location_blacklist_file:
            json_txt = ""
            for line in code_to_location_blacklist_file:
                line = line.strip()
                # skip blank lines and comment lines
                if line and line[0] != '#':
                    json_txt += line
            code_to_location_blacklist = json.loads(json_txt)

    location_match_queue = mp.Queue()
    stop_event = threading.Event()
    handle_location_matches_thread = threading.Thread(target=handle_location_matches,
                                                      name='handle-location-matches',
                                                      args=(location_match_queue, stop_event))
    handle_location_matches_thread.start()

    processes = []
    for index in range(0, args.number_processes):
        process = mp.Process(target=search_process,
                             args=(index, trie, code_to_location_blacklist,
                                   args.domain_block_limit, args.number_processes,
                                   location_match_queue),
                             kwargs={'amount': args.amount,
                                     'debug': args.log_level == 'DEBUG'},
                             name='find_locations_{}'.format(index))
        process.start()
        processes.append(process)

    for process in processes:
        process.join()

    Session = create_session_for_process(engine)
    db_session = Session()
    update_query = update(DomainLabel).values(last_searched=datetime.datetime.now())
    db_session.execute(update_query)
    # commit the bulk update before the session is closed
    db_session.commit()
    db_session.close()

    stop_event.set()
    handle_location_matches_thread.join()
    location_match_queue.join_thread()
def parse_caida_data(bz2_compressed: bool, days_in_past: int,
                     probe_id_dct: typing.Dict[str, int], parsed_files_queue: mp.Queue,
                     filename):
    Session = create_session_for_process(engine)
    db_session = Session()
    try:
        logger.debug('parsing %s', filename)
        probe_id = get_probe_id_from_path(filename)
        probe_db_id = probe_id_dct[probe_id]
        modification_time = datetime.datetime.fromtimestamp(os.path.getmtime(filename))
        measurements = collections.defaultdict(dict)

        if abs((modification_time - datetime.datetime.now()).days) >= days_in_past:
            return

        if bz2_compressed:
            with bz2.open(filename, 'rt') as ripe_archive_file:
                for line in ripe_archive_file:
                    measurement = parse_measurement(line, probe_db_id, days_in_past)

                    if measurement:
                        if measurement.probe_id not in \
                                measurements[measurement.destination_address] or \
                                measurements[measurement.destination_address][
                                    measurement.probe_id].min_rtt > measurement.min_rtt:
                            measurements[measurement.destination_address][
                                measurement.probe_id] = measurement
        else:
            with open(filename) as caida_archive_filedesc, \
                    mmap.mmap(caida_archive_filedesc.fileno(), 0,
                              access=mmap.ACCESS_READ) as caida_archive_file:
                line = caida_archive_file.readline().decode('utf-8')

                while len(line) > 0:
                    measurement = parse_measurement(line, probe_db_id, days_in_past)

                    if measurement:
                        if measurement.probe_id not in \
                                measurements[measurement.destination_address] or \
                                measurements[measurement.destination_address][
                                    measurement.probe_id].min_rtt > measurement.min_rtt:
                            measurements[measurement.destination_address][
                                measurement.probe_id] = measurement

                    line = caida_archive_file.readline().decode('utf-8')

        measurement_results = []
        for probe_measurement_dct in measurements.values():
            measurement_results.extend(probe_measurement_dct.values())

        db_session.bulk_save_objects(measurement_results)
        logger.info('parsed and saved %s measurements', len(measurement_results))
        db_session.commit()
    except Exception:
        logger.exception('Error while parsing file: ')
    else:
        parsed_files_queue.put(filename)
        db_session.commit()

    db_session.close()
    Session.remove()
    logger.info('parse process for file %s finished', filename)
def parse_ripe_data(line_queue: mp.Queue, finished_reading: mp.Event,
                    probe_dct: typing.Dict[int, RipeAtlasProbe], probe_latency_queue: mp.Queue):
    Session = create_session_for_process(engine)
    db_session = Session()
    results = collections.defaultdict(dict)
    min_rtt_results = collections.defaultdict(dict)

    def line_generator():
        try:
            rline = line_queue.get(timeout=5)
        except queue.Empty:
            logger.exception('found empty queue')
            return

        status_msg = False
        read_fails = 0
        while True:
            if rline:
                yield rline

            if finished_reading.is_set() and (line_queue.empty() or read_fails == 5):
                return

            if finished_reading.is_set() and not status_msg:
                logger.debug('reading finished finishing processing')
                status_msg = True

            rline = None
            try:
                rline = line_queue.get(timeout=2)
                read_fails = 0
            except queue.Empty:
                logger.debug('failed reading')
                read_fails += 1

    def save_measurement_results(m_results: collections.defaultdict, db_sess):
        measurement_results = []
        for probe_measurement_dct in m_results.values():
            measurement_results.extend(probe_measurement_dct.values())

        db_sess.bulk_save_objects(measurement_results)
        logger.info('parsed and saved %s measurements', len(measurement_results))
        db_sess.commit()
        m_results.clear()

    failure_counter = 0
    parsed_lines = 0

    for line in line_generator():
        parsed_lines += 1
        try:
            measurement_result_dct = json.loads(line)
            measurement_result = parse_measurement(measurement_result_dct, probe_dct,
                                                   probe_latency_queue)
            if not measurement_result:
                continue

            probe_id = measurement_result.probe_id
            destination_address = measurement_result.destination_address
            current_min_result = min_rtt_results[destination_address].get(probe_id, None)

            if current_min_result is None or current_min_result > measurement_result.min_rtt:
                results[destination_address][probe_id] = measurement_result
                min_rtt_results[destination_address][probe_id] = measurement_result.min_rtt

            if len(results) >= 10**6:
                save_measurement_results(results, db_session)
        except Exception:
            failure_counter += 1
            logger.exception('Error while parsing line %s', line)

            if parsed_lines > 100 and failure_counter >= parsed_lines / 10:
                logger.critical('failure rate too high "%s" of "%s"! stopping!',
                                failure_counter, parsed_lines)
                save_measurement_results(results, db_session)
                break

    save_measurement_results(results, db_session)
    db_session.close()
    Session.remove()
    line_queue.close()
    logger.info('finished parsing')
def handle_location_matches(location_match_queue: mp.Queue, stop_event: threading.Event):

    class LocationMatch:
        def __init__(self, location_hint):
            self.id = 0
            self.location_hint = location_hint
            self.location_code = location_hint.code
            self.location_code_type = location_hint.code_type
            self.location_id = location_hint.location_id
            self.domain_label_ids = set()
            self.old_domain_label_ids = set()

        def get_insert_values(self):
            return [{
                'location_hint_id': self.id,
                'domain_label_id': domain_label_id
            } for domain_label_id in self.domain_label_ids]

        def add_domain_label_id(self, domain_label_id):
            if domain_label_id not in self.domain_label_ids and \
                    domain_label_id not in self.old_domain_label_ids:
                self.domain_label_ids.add(domain_label_id)

        def handled_domains(self):
            self.old_domain_label_ids = self.old_domain_label_ids.union(self.domain_label_ids)
            self.domain_label_ids.clear()

        def __hash__(self):
            return hash(make_location_hint_key(self.location_id, self.location_code,
                                               self.location_code_type))

    def make_location_hint_key(location_id, code, code_type):
        return '{},{},{}'.format(location_id, code, code_type)

    def save(matches_to_save, new_matches, db_sess):
        if new_matches:
            db_sess.bulk_save_objects([match.location_hint for match in new_matches],
                                      return_defaults=True)
            for new_match in new_matches:
                new_match.id = new_match.location_hint.id

        insert_values = []
        for match in matches_to_save:
            insert_values.extend(match.get_insert_values())
            match.handled_domains()

        new_matches.clear()
        matches_to_save.clear()

        if insert_values:
            insert_expr = location_hint_label_table.insert().values(insert_values)
            db_sess.execute(insert_expr)

    Session = create_session_for_process(engine)
    db_session = Session()

    new_matches = []
    matches_to_save = set()
    location_hints = {}
    counter = 0
    ccounter = 0

    while not (stop_event.is_set() and location_match_queue.empty()):
        try:
            location_matches_tuples = location_match_queue.get(timeout=1)

            if isinstance(location_matches_tuples, int):
                # a bare domain label id signals that its old hint links should be removed;
                # compare against the table column, not a string literal
                delete_expr = location_hint_label_table.delete().where(
                    location_hint_label_table.c.domain_label_id == location_matches_tuples)
                db_session.execute(delete_expr)
                logger.debug('saving for {}'.format(location_matches_tuples))
            else:
                for location_id, location_code, location_code_type, domain_label_id in \
                        location_matches_tuples:
                    location_hint_key = make_location_hint_key(location_id, location_code,
                                                               location_code_type)
                    try:
                        location_hint = location_hints[location_hint_key]
                    except KeyError:
                        real_code_type = LocationCodeType(location_code_type)
                        location_hint_obj = CodeMatch(location_id, code_type=real_code_type,
                                                      domain_label=None, code=location_code)
                        db_session.add(location_hint_obj)
                        location_hint = LocationMatch(location_hint_obj)
                        location_hints[location_hint_key] = location_hint
                        new_matches.append(location_hint)

                    matches_to_save.add(location_hint)
                    location_hint.add_domain_label_id(domain_label_id)
                    counter += 1

                    if counter >= 10**4:
                        logger.debug('saving')
                        save(matches_to_save, new_matches, db_session)
                        counter = 0
                        ccounter += 1

                        if ccounter >= 10:
                            db_session.commit()
                            ccounter = 0
        except queue.Empty:
            pass

    save(matches_to_save, new_matches, db_session)
    db_session.commit()
    db_session.close()
    Session.remove()
    location_match_queue.close()
    logger.info('stopped')