Example #1
def update_second_hop_latency(probe_latency_queue: mp.Queue,
                              finish_event: threading.Event):
    Session = create_session_for_process(engine)
    db_session = Session()

    update_sql = 'UPDATE probes SET second_hop_latency = {} WHERE id = {} AND ' \
                 '(second_hop_latency IS NULL OR second_hop_latency > {});'

    probe_dct = {}

    while not finish_event.is_set() or not probe_latency_queue.empty():
        try:
            probe_id, latency = probe_latency_queue.get(timeout=1)
            if probe_dct.get(probe_id, sys.maxsize) > latency:
                probe_dct[probe_id] = latency
        except queue.Empty:
            if finish_event.is_set():
                break

    for probe_id, latency in probe_dct.items():
        db_session.execute(update_sql.format(latency, probe_id, latency))

    logger.debug("Committing updates to second hop latency")
    db_session.commit()

    db_session.close()
    Session.remove()
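
The string-formatted UPDATE above only works safely because both values are numeric. A minimal sketch of the same statement with bound parameters, assuming SQLAlchemy's text() construct is available; the statement and column names are taken from the example above:

from sqlalchemy import text

# bind the values through the driver instead of formatting them into the SQL string
update_stmt = text(
    'UPDATE probes SET second_hop_latency = :latency '
    'WHERE id = :probe_id AND '
    '(second_hop_latency IS NULL OR second_hop_latency > :latency)')

for probe_id, latency in probe_dct.items():
    db_session.execute(update_stmt, {'latency': latency, 'probe_id': probe_id})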
Example #2
def main():
    """Main function"""
    parser = argparse.ArgumentParser()
    __create_parser_arguments(parser)
    args = parser.parse_args()

    global logger
    logger = util.setup_logger(args.logging_file, 'delete_measurements',
                               args.log_level)

    global engine
    engine = create_engine(args.database_name)

    Session = create_session_for_process(engine)
    db_session = Session()

    oldest_date_allowed = datetime.date.today() - datetime.timedelta(
        days=args.days_in_past)
    db_session.query(MeasurementResult).filter(
        MeasurementResult.timestamp < oldest_date_allowed).delete()

    db_session.commit()
    db_session.close()
    Session.remove()

    logger.info('deleted all measurements before {}'.format(
        oldest_date_allowed.strftime('%Y-%m-%d')))
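
Every example follows the same session lifecycle: create_session_for_process builds a session factory for the current process, the session is used and closed, and the factory registry is cleared with Session.remove(). A minimal sketch of what such a helper could look like, assuming it wraps SQLAlchemy's scoped_session; the real implementation is not shown in these examples:

from sqlalchemy.orm import scoped_session, sessionmaker


def create_session_for_process(engine):
    # one session registry per process, bound to the engine created for that process
    return scoped_session(sessionmaker(bind=engine))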
Example #3
def create_trie(code_blacklist_filepath: str, word_blacklist_filepath: str):
    """
    Creates a RecordTrie with the marisa library
    :param code_blacklist_filepath: the path to the code blacklist file
    :param word_blacklist_filepath: the path to the word blacklist file
    :rtype: marisa_trie.RecordTrie
    """
    Session = create_session_for_process(engine)
    db_session = Session()
    try:
        locations = db_session.query(LocationInfo)

        code_blacklist_set = set()
        if code_blacklist_filepath:
            with open(code_blacklist_filepath) as code_blacklist_file:
                for line in code_blacklist_file:
                    line = line.strip()
                    if line and not line.startswith('#'):
                        code_blacklist_set.add(line)

        word_blacklist_set = set()
        if word_blacklist_filepath:
            with open(word_blacklist_filepath) as word_blacklist_file:
                for line in word_blacklist_file:
                    line = line.strip()
                    if line and not line.startswith('#'):
                        word_blacklist_set.add(line)

        return create_trie_obj(locations, code_blacklist_set,
                               word_blacklist_set)
    finally:
        db_session.close()
        Session.remove()
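
The docstring above refers to marisa_trie.RecordTrie. A minimal sketch of how such a trie is built with the marisa-trie package; the record format and keys are illustrative and not taken from create_trie_obj:

import marisa_trie

# RecordTrie stores one fixed-size struct record per key; '<h' packs a single 16-bit integer
keys = ['nyc', 'fra', 'ber']
values = [(1,), (2,), (3,)]
trie = marisa_trie.RecordTrie('<h', zip(keys, values))

trie.get('fra')   # [(2,)]
'ber' in trie     # True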
Example #4
def main():
    """Main function"""
    parser = argparse.ArgumentParser()
    __create_parser_arguments(parser)
    args = parser.parse_args()

    global log
    log = setup_logger(args.logging_file, 'parse_ripe_probes')

    engine = create_engine(args.database_name)
    Session = create_session_for_process(engine)
    db_session = Session()

    ripe_sema = threading.BoundedSemaphore(50)
    stop_event = threading.Event()
    token_generator_thread = start_token_generating_thread(
        ripe_sema, args.ripe_requests_per_second, stop_event)
    probes = get_probes(db_session, ripe_sema)

    stop_event.set()
    token_generator_thread.join()

    log.info('writing probes to tmp')

    os.makedirs(os.path.dirname(PROBE_CACHING_PATH), exist_ok=True)

    with open(PROBE_CACHING_PATH, 'w') as ripe_temp_file:
        probe_info_to_write = {
            probe.id: is_in_nat
            for probe, is_in_nat in probes.values()
        }
        json.dump(probe_info_to_write, ripe_temp_file)
Example #5
def measurement_results_saver(measurement_results_queue: queue.Queue,
                              stop_event: threading.Event):
    Session = create_session_for_process(engine)
    db_session = Session()

    results = []
    count_results = 0

    while not stop_event.is_set() or not measurement_results_queue.empty():
        try:
            measurement_result = measurement_results_queue.get(timeout=5)
            results.append(measurement_result)
            count_results += 1

            if count_results % 10**5 == 0:
                db_session.bulk_save_objects(results)
                db_session.commit()

                results.clear()
        except queue.Empty:
            pass

    db_session.bulk_save_objects(results)
    db_session.commit()

    db_session.close()
    Session.remove()
Example #6
def main():
    """Main function"""
    parser = argparse.ArgumentParser()
    __create_parser_arguments(parser)
    args = parser.parse_args()

    global logger
    logger = util.setup_logger(args.logging_file, 'parse_ripe_archive', args.log_level)

    logger.info('starting caida parsing for archive path %s', args.archive_path)

    if not os.path.isdir(args.archive_path):
        print('Archive path does not lead to a directory', file=sys.stderr)
        return 1

    global engine
    engine = create_engine(args.database_name)

    Session = create_session_for_process(engine)
    db_session = Session()

    parsed_file_name = '{}-parsed-caida-files.txt'.format(args.database_name)
    parsed_files = set()
    
    if not os.path.exists(parsed_file_name):
        logger.debug('Creating parsed files history file for database %s', args.database_name)
    else:
        with open(parsed_file_name) as parsed_files_history_file:
            for line in parsed_files_history_file:
                parsed_files.add(line.strip())

    filenames, probe_dct = get_filenames(args.archive_path, args.file_regex, args.days_in_past,
                                         parsed_files, db_session)

    if not filenames:
        logger.info('found no files to parse')
        return 0

    mp_manager = mp.Manager()
    new_parsed_files = mp_manager.Queue()

    with concurrent.ProcessPoolExecutor(max_workers=args.number_processes) as processing_executor:
        processing_results = processing_executor.map(
            functools.partial(parse_caida_data, not args.plaintext, args.days_in_past,
                              probe_dct, new_parsed_files), filenames)

        try:
            while True:
                try:
                    next(processing_results)
                except StopIteration:
                    break
                except Exception:
                    logger.exception('process threw exception')
        finally:
            with open(parsed_file_name, 'a') as parsed_files_history_file:
                while not new_parsed_files.empty():
                    parsed_files_history_file.write(new_parsed_files.get(timeout=1) + '\n')
Example #7
def parse_codes(args):
    """start real parsing"""
    Session = create_session_for_process(engine)
    db_session = Session()
    start_time = time.process_time()
    start_rtime = time.time()
    try:
        with db_session.no_autoflush:
            if args.airport_codes:
                parse_airport_codes(args, db_session)
                if args.metropolitan_file:
                    metropolitan_locations = parse_metropolitan_codes(args.metropolitan_file,
                                                                      db_session)
                    if args.merge_radius:
                        add_locations(AIRPORT_LOCATION_CODES, metropolitan_locations, 
                                      args.merge_radius, db_session, create_new_locations=False)
                    else:
                        add_locations(AIRPORT_LOCATION_CODES, metropolitan_locations, 100,
                                      db_session, create_new_locations=False)

            if args.locode:
                parse_locode_codes(args.locode, db_session)

            if args.clli:
                get_clli_codes(args.clli, db_session)
                logger.debug('Finished clli parsing')

            if args.geonames:
                get_geo_names(args.geonames, args.min_population, db_session)
                logger.debug('Finished geonames parsing')

        locations = merge_location_codes(args.merge_radius, db_session)

        sanitize_location_names(locations)

        db_session.add_all(locations)
        db_session.commit()
    finally:
        db_session.close()
        Session.remove()

    end_time = time.process_time()
    end_rtime = time.time()
    logger.debug('finished and needed {} seconds of the processor computation time\n'
                 'And {} seconds of the real world time.\n'
                 'Collected data on {} locations.'.format((end_time - start_time),
                                                          int(end_rtime - start_rtime),
                                                          len(locations)))
Example #8
def main():
    """Main function"""
    parser = argparse.ArgumentParser()
    __create_parser_arguments(parser)
    args = parser.parse_args()

    global logger
    logger = util.setup_logger(args.logging_file, 'parse_zmap_results')

    global engine
    engine = create_engine(args.database_name)

    Session = create_session_for_process(engine)
    db_session = Session()

    filenames = get_filenames(args.zmap_results_dir, args.file_regex)
    locations = {}

    config_parser = configparser.ConfigParser()
    if os.path.isfile(args.locations_config_file):
        config_parser.read(args.locations_config_file)
        for filename in filenames:
            location_name = __get_location_name(filename)
            if location_name not in config_parser or \
                    'lat' not in config_parser[location_name] or \
                    'lon' not in config_parser[location_name]:
                logger.critical(
                    '{} is not defined in the config file or does not have the right '
                    'format! Aborting!'.format(location_name))
                return 3

            location = location_for_coordinates(
                config_parser[location_name]['lat'],
                config_parser[location_name]['lon'], db_session)
            probe = ZmapProbe(probe_id=location_name, location=location)
            db_session.add(probe)
            db_session.commit()
            locations[location_name] = probe.id
    else:
        raise ValueError('locations_config_file path does not lead to a file')

    parse(filenames, locations, db_session)

    db_session.close()
    Session.remove()
Example #9
def preprocess_file_part(filepath: str, pnr: int, line_queue: mp.Queue,
                         ip_encoding_filter: bool,
                         regex_strategy: RegexStrategy, tlds: typing.Set[str],
                         whitelist: typing.Set[str],
                         parsed_ips: typing.Set[str], parsed_ips_lock: mp.Lock,
                         domain_label_queue: mp.Queue,
                         finished_reading_event: mp.Event):
    """
    Sanitizes the file part from start to end.
    pnr is a number used to identify the process.
    ipregex should be a regex with 4 integers to filter the ISP client domain names.
    """
    logger.info('starting')
    label_stats = collections.defaultdict(int)

    Session = create_session_for_process(engine)
    db_session = Session()
    try:
        bad_characters = collections.defaultdict(int)
        count_good_lines = 0
        count_isp_lines = 0

        while not finished_reading_event.is_set() or not line_queue.empty():
            try:
                line = line_queue.get(timeout=2)
            except queue.Empty:
                time.sleep(1)
                continue

            if len(line) == 0:
                continue
            line = line.strip()
            ip, domain = line.split(',', 1)

            if not domain:
                logger.warning(
                    'found empty domain for IP {}, skipping'.format(ip))
                continue

            if ':' in ip:
                ip_version = constants.IPV6_IDENTIFIER
            else:
                ip_version = constants.IPV4_IDENTIFIER

            with parsed_ips_lock:
                parsed_ips.add(ip)

            n_good_lines_count, n_ip_lines_count = classify_domain(
                ip, domain, ip_version, ip_encoding_filter, regex_strategy,
                tlds, whitelist, db_session, domain_label_queue)
            count_good_lines += n_good_lines_count
            count_isp_lines += n_ip_lines_count

            db_session.commit()
        logger.info('finished, no more lines')

        directory = os.path.dirname(filepath)
        filename = util.get_path_filename(filepath)

        logger.info('good lines: {} ips lines: {}'.format(
            count_good_lines, count_isp_lines))
        with open(os.path.join(directory,
                               '{0}-{1}-character.stats'.format(filename,
                                                                pnr)),
                  'w',
                  encoding='utf-8') as characterStatsFile:
            json.dump(bad_characters, characterStatsFile)

        with open(os.path.join(
                directory, '{0}-{1}-domain-label.stats'.format(filename, pnr)),
                  'w',
                  encoding='utf-8') as labelStatFile:
            json.dump(label_stats, labelStatFile)
    finally:
        db_session.close()
        Session.remove()
        domain_label_queue.close()
Example #10
def handle_labels(labels_queue: mp.Queue, stop_event: threading.Event):
    """Handels the label results and saves them to the database not blocking the other db queries"""
    class DomainLabelHolder:
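        """Holds a DomainLabel together with the domain ids that still have to be linked to it."""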
        def __init__(self, label):
            self.label = label
            self.label_id = 0
            self.domain_ids = set()
            self._handled_domain_ids = set()

        def add_domain_id(self, domain_id):
            if domain_id not in self.domain_ids and domain_id not in self._handled_domain_ids:
                self.domain_ids.add(domain_id)

        def get_insert_values(self):
            values = [{
                'domain_id': domain_id,
                'domain_label_id': self.label_id
            } for domain_id in self.domain_ids]
            return values

        def handled_domain_ids(self):
            self._handled_domain_ids = self._handled_domain_ids.union(
                self.domain_ids)
            self.domain_ids.clear()

        def __hash__(self):
            return hash(self.label)

    def save_labels(domain_labels_dct, labels_to_save, new_labels, db_sess):
        if new_labels:
            db_sess.bulk_save_objects([label.label for label in new_labels],
                                      return_defaults=True)
            for label_obj in new_labels:
                label_obj.label_id = label_obj.label.id

        values_to_insert = []
        for label_obj in labels_to_save:
            if label_obj.domain_ids:
                values_to_insert.extend(label_obj.get_insert_values())
                label_obj.handled_domain_ids()

        new_labels.clear()
        labels_to_save.clear()

        if values_to_insert:
            insert_expr = domain_to_label_table.insert().values(values_to_insert)
            db_sess.execute(insert_expr)

    Session = create_session_for_process(engine)
    db_session = Session()

    new_labels = []
    labels_to_save = set()
    domain_labels = {}
    counter = 0
    ccounter = 0

    while not stop_event.is_set() or not labels_queue.empty():
        try:
            domain_label_tuples = labels_queue.get(timeout=1)
            for label_name, domain_id in domain_label_tuples:
                try:
                    label = domain_labels[label_name]
                except KeyError:
                    label_obj = DomainLabel(label_name)
                    label = DomainLabelHolder(label_obj)
                    domain_labels[label_name] = label
                    new_labels.append(label)

                labels_to_save.add(label)
                label.add_domain_id(domain_id)

                counter += 1
                if counter >= 10**4:
                    save_labels(domain_labels, labels_to_save, new_labels,
                                db_session)
                    counter = 0
                    ccounter += 1

                    if ccounter >= 10:
                        db_session.commit()
                        ccounter = 0

        except queue.Empty:
            pass

    save_labels(domain_labels, labels_to_save, new_labels, db_session)
    db_session.commit()
    logger.info('stopped')
    labels_queue.close()
Example #11
def main():
    """Main function"""
    parser = argparse.ArgumentParser()
    __create_parser_arguments(parser)
    args = parser.parse_args()

    global logger
    logger = util.setup_logger(args.log_file,
                               'validate-stats',
                               loglevel=args.log_level)
    logger.debug('starting')

    engine = create_engine(args.database_name)

    db_session = create_session_for_process(engine)()

    if args.allowed_measurement_age:
        oldest_date_allowed = datetime.datetime.now() - datetime.timedelta(
            seconds=args.allowed_measurement_age)
        sql_args = 'TIMESTAMP \'' + oldest_date_allowed.strftime(
            '%Y-%m-%d %H:%M:%S') + '\', '
    else:
        sql_args = 'NULL, '

    if args.minimum_measurement_age:
        sql_args += 'TIMESTAMP \'' + args.minimum_measurement_age + '\', '
    else:
        sql_args += 'NULL, '

    sql_args += str(args.exclude_traceroute) + ', ' + str(
        args.exclude_caida_measurements)

    sql_query = 'SELECT * from domainsWithDistanceRTTs({});'.format(sql_args)

    results = db_session.execute(sql_query)

    rtt_distances = []
    domains_count = collections.defaultdict(int)
    location_id_count = collections.defaultdict(int)
    probes_count = collections.defaultdict(int)

    for domain_id, domain_name, hint_location_id, hint_location_name, location_hint_id, \
            measurement_result_id, probe_id, distance, min_rtt in results:
        rtt_distances.append((domain_id, min_rtt, distance))
        domains_count[domain_base_name(domain_name)] += 1
        location_id_count[hint_location_id] += 1
        probes_count[probe_id] += 1

    with open(args.output_filename, 'w') as output_file:
        str_to_wrt = '\n'.join([
            '{}; {}; {}'.format(domain_id, rtt, dist)
            for domain_id, rtt, dist in rtt_distances
        ])
        output_file.write(str_to_wrt)

    print('domains count: ')
    pprint.pprint(
        sorted(domains_count.items(), key=operator.itemgetter(1),
               reverse=True))
    print('location_id_count')
    pprint.pprint(
        sorted(location_id_count.items(),
               key=operator.itemgetter(1),
               reverse=True))
    print('probes_count')
    pprint.pprint(
        sorted(probes_count.items(), key=operator.itemgetter(1), reverse=True))
Example #12
def main():
    """Main function"""
    parser = argparse.ArgumentParser()
    __create_parser_arguments(parser)
    args = parser.parse_args()

    global logger
    logger = util.setup_logger(args.logging_file, 'parse_ripe_archive')

    if not os.path.isdir(args.archive_path):
        print('Archive path does not lead to a directory', file=sys.stderr)
        return 1

    global engine
    engine = create_engine(args.database_name)

    parsed_file_name = '{}-parsed-ripe-files.txt'.format(args.database_name)
    parsed_files = set()

    if not os.path.exists(parsed_file_name):
        logger.debug(
            'Creating parsed files history file for database {}'.format(
                args.database_name))
    else:
        with open(parsed_file_name) as parsed_files_history_file:
            for line in parsed_files_history_file:
                parsed_files.add(line.strip())

    file_names = get_filenames(args.archive_path, args.file_regex,
                               parsed_files, args.days_in_past)

    if not file_names:
        logger.info('No files found')
        return

    logger.info('%s files to parse', len(file_names))

    Session = create_session_for_process(engine)
    db_session = Session()
    probe_dct = load_probes_from_cache(db_session)

    for probe, _ in probe_dct.values():
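        # touch the id attribute so it stays loaded after the probe is expunged from the session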
        _ = probe.id
        db_session.expunge(probe)

    db_session.close()
    Session.remove()

    new_parsed_files = mp.Queue()
    probe_latency_queue = mp.Queue()
    finish_event = threading.Event()

    probe_latency_thread = threading.Thread(target=update_second_hop_latency,
                                            args=(probe_latency_queue,
                                                  finish_event),
                                            name='update probe latency')
    probe_latency_thread.start()

    finished_reading_event = mp.Event()

    line_queue = mp.Queue(args.number_processes * buffer_lines_per_process)

    try:
        with concurrent.ThreadPoolExecutor(
                max_workers=args.workers) as read_thread_executor:
            read_thread_results = read_thread_executor.map(
                functools.partial(read_file, not args.plaintext,
                                  args.days_in_past, line_queue,
                                  new_parsed_files), file_names)
            time.sleep(1)

            processes = []

            for index in range(0, args.number_processes):
                process = mp.Process(target=parse_ripe_data,
                                     args=(line_queue, finished_reading_event,
                                           probe_dct, probe_latency_queue),
                                     name='ripe-parsing-{}'.format(index))
                processes.append(process)
                process.start()

            try:
                while True:
                    try:
                        next(read_thread_results)
                    except StopIteration:
                        break
                    except Exception:
                        logger.exception('read thread returned with exception')

                finished_reading_event.set()

                for process in processes:
                    process.join()
            except KeyboardInterrupt:
                finished_reading_event.set()
                print(
                    'trying to do a graceful shutdown press Ctrl+C another time to force '
                    'shutdown')

                for process in processes:
                    process.join()

    finally:
        with open(parsed_file_name, 'a') as parsed_files_history_file:
            while not new_parsed_files.empty():
                filename = new_parsed_files.get()
                parsed_files_history_file.write(filename + '\n')

    finish_event.set()
    logger.debug('finish event set waiting for second hop latency thread')

    probe_latency_thread.join()
Example #13
def search_process(index,
                   trie,
                   code_to_location_blacklist,
                   limit,
                   nr_processes,
                   location_match_queue,
                   amount=1000,
                   debug: bool = False):
    """
    Searches the assigned block of domain labels for location codes; amount=0 processes all labels.
    """
    Session = create_session_for_process(engine)
    db_session = Session()

    match_count = collections.defaultdict(int)
    entries_count = 0
    label_count = 0
    entries_wl_count = 0
    label_wl_count = 0
    label_length = 0

    if debug:
        last_search = datetime.datetime.now() - datetime.timedelta(minutes=1)
    else:
        last_search = datetime.datetime.now() - datetime.timedelta(days=7)

    for domain_label in get_all_domain_labels(index,
                                              block_limit=limit,
                                              nr_processes=nr_processes,
                                              db_session=db_session):
        label_count += 1
        label_length += len(domain_label.name)

        if domain_label.last_searched:
            if domain_label.last_searched > last_search:
                if domain_label.hints:
                    label_wl_count += 1

                continue
            else:
                location_match_queue.put(domain_label.id)

        pm_count = collections.defaultdict(int)

        temp_gr_count = search_in_label(domain_label, trie,
                                        code_to_location_blacklist,
                                        location_match_queue)

        for key, value in temp_gr_count.items():
            match_count[key] += value
            pm_count[key] += value

        if temp_gr_count:
            label_wl_count += 1

        entries_count += 1

        if entries_count == amount:
            break

    def build_stat_string_for_logger():
        """
        Builds a string for the final output
        :returns str: a string with a lot of logging info
        """
        stats_string = 'Stats for this process following:'
        stats_string += '\n\ttotal entries: {}'.format(entries_count)
        stats_string += '\n\ttotal labels: {}'.format(label_count)
        stats_string += '\n\ttotal label length: {}'.format(label_length)
        stats_string += '\n\tentries with location found: {}'.format(
            entries_wl_count)
        stats_string += '\n\tlabel with location found: {}'.format(
            label_wl_count)
        stats_string += '\n\tmatches: {}'.format(sum(match_count.values()))
        stats_string += '\n\tmatch count:\n\t\t{}'.format(match_count)
        return stats_string

    logger.info(build_stat_string_for_logger())

    db_session.close()
    Session.remove()
    location_match_queue.close()
Example #14
def ripe_check_process(
        pid: int, ripe_create_sema: mp.Semaphore,
        ripe_slow_down_sema: mp.Semaphore, bill_to_address: str,
        wo_measurements: bool, allowed_measurement_age: int, api_key: str,
        domain_block_limit: int, nr_processes: int, include_ip_encoded: bool,
        measurement_strategy: MeasurementStrategy,
        number_of_probes_per_measurement: int, buffer_time: float,
        packets_per_measurement: int, use_efficient_probes: bool,
        location_to_probes_dct: typing.Dict[str, typing.Tuple[RipeAtlasProbe,
                                                              float]],
        stop_without_old_results: bool, ip_list: typing.List[str],
        endless_measurements: bool, random_domains: bool):
    """Checks for all domains if the suspected locations are correct"""
    correct_type_count = collections.defaultdict(int)

    domain_type_count = collections.defaultdict(int)
    Session = create_session_for_process(engine)
    db_session = Session()
    db_session.expire_on_commit = False

    def increment_count_for_type(ctype: LocationCodeType):
        correct_type_count[ctype.name] += 1

    def increment_domain_type_count(dtype: DomainLocationType):
        """Append current domain in the domain dict to the dtype"""
        domain_type_count[dtype] += 1

    threads = []

    domain_types = [DomainType.valid]
    if include_ip_encoded:
        domain_types.append(DomainType.ip_encoded)

    measurement_results_queue = queue.Queue()
    stop_event = threading.Event()
    save_measurements_thread = threading.Thread(
        target=measurement_results_saver,
        args=(measurement_results_queue, stop_event))
    save_measurements_thread.start()

    try:
        if ip_list:
            domain_generator = get_domains_for_ips(
                ip_list,
                db_session,
                domain_block_limit,
                endless_mode=endless_measurements)
        else:
            domain_generator = get_all_domains_splitted_efficient(
                pid,
                domain_block_limit,
                nr_processes,
                domain_types,
                db_session,
                use_random_order=random_domains,
                endless_mode=endless_measurements)

        generator_lock = threading.Lock()

        def next_domain_info():
            try:
                with generator_lock:
                    domain = domain_generator.__next__()
                    location_hints = domain.all_label_matches
                    location_hint_tuples = []

                    for location_hint in location_hints:
                        if isinstance(location_hint, CodeMatch):
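                            # access the attributes so they are loaded before the objects are expunged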
                            _ = location_hint.location.city_name
                            _ = location_hint.code_type
                            location_hint_tuples.append(
                                (location_hint, location_hint.location))

                            try:
                                db_session.expunge(location_hint)
                                db_session.expunge(location_hint.location)
                            except InvalidRequestError:
                                pass

                    loc_ip_version = constants.IPV4_IDENTIFIER if domain.ipv4_address else \
                        constants.IPV6_IDENTIFIER

                    measurement_results_query = get_measurements_for_domain(
                        domain,
                        loc_ip_version,
                        allowed_measurement_age,
                        sorted_return=True,
                        db_session=db_session,
                        allow_all_zmap_measurements=True)

                    measurement_result_tuples = []

                    for res in measurement_results_query:
                        _ = res.probe.location
                        measurement_result_tuples.append(
                            (res, res.probe.location))
                        try:
                            db_session.expunge(res.probe.location)
                            db_session.expunge(res)
                        except InvalidRequestError:
                            pass

                    db_session.expunge(domain)
                    return domain, location_hint_tuples, measurement_result_tuples
            except StopIteration:
                return None

        for _ in range(0, MAX_THREADS):
            # TODO use ThreadPoolExecutor
            thread = threading.Thread(
                target=domain_check_threading_manage,
                args=(next_domain_info, increment_domain_type_count,
                      increment_count_for_type, ripe_create_sema,
                      ripe_slow_down_sema, bill_to_address, wo_measurements,
                      allowed_measurement_age, api_key, measurement_strategy,
                      number_of_probes_per_measurement, buffer_time,
                      packets_per_measurement, use_efficient_probes,
                      location_to_probes_dct, measurement_results_queue,
                      stop_without_old_results))

            threads.append(thread)
            thread.start()

        for thread in threads:
            thread.join()

    except KeyboardInterrupt:
        logger.warning('SIGINT recognized, stopping process')
    finally:
        stop_event.set()
        save_measurements_thread.join()

        db_session.close()
        Session.remove()

    count_alive = 0
    for thread in threads:
        if thread.is_alive():
            count_alive += 1

    logger.info('correct_count {}'.format(correct_type_count))
Example #15
def main():
    """Main function"""
    parser = argparse.ArgumentParser()
    __create_parser_arguments(parser)
    args = parser.parse_args()

    global engine
    engine = create_engine(args.database_name)

    global logger
    logger = util.setup_logger(args.log_file,
                               'check',
                               loglevel=args.log_level,
                               hourly_log_rotation=True)
    logger.debug('starting')

    start_time = time.time()
    Session = create_session_for_process(engine)
    db_session = Session()
    db_session.expire_on_commit = False

    ripe_slow_down_sema = mp.BoundedSemaphore(args.ripe_request_burst_limit)
    ripe_create_sema = mp.Semaphore(args.measurement_limit)
    global MAX_THREADS

    if args.debug:
        MAX_THREADS = 1
    else:
        MAX_THREADS = int(args.measurement_limit / args.number_processes * 1.5)

    finish_event = threading.Event()
    generator_thread = util.start_token_generating_thread(
        ripe_slow_down_sema, args.ripe_request_limit, finish_event)

    locations = db_session.query(LocationInfo)

    if not locations.count():
        logger.error('No locations found! Aborting!')
        print('No locations found! Aborting!')
        return 1

    if not args.disable_probe_fetching:
        probe_distances = load_probes_from_cache(db_session).values()

        location_to_probes_dct = assign_location_probes(
            locations, [probe for probe, _ in probe_distances], db_session)
        db_session.commit()

        null_locations = [
            location for location in locations
            if location.id not in location_to_probes_dct
        ]

        logger.info('{} locations without nodes'.format(len(null_locations)))
    else:
        locations = db_session.query(LocationInfo)
        location_to_probes_dct = {}

        loc_without_probes = 0
        probes = set()

        for location in locations:
            if location.nearby_probes:
                location_to_probes_dct[location.id] = []
                for probe in location.nearby_probes:
                    probes.add(probe)
                    # touch the attributes so they stay loaded after the probes are expunged
                    _ = (probe.location.lat, probe.location.lon, probe.location.id,
                         probe.second_hop_latency, probe.probe_id, probe.id)
                    location_to_probes_dct[location.id].append(
                        (probe,
                         location.gps_distance_haversine(probe.location),
                         probe.location))
            else:
                loc_without_probes += 1

        logger.debug('expunging probes')

        for probe in probes:
            try:
                db_session.expunge(probe.location)
                db_session.expunge(probe)
            except InvalidRequestError:
                pass

        logger.debug('updating probes')
        update_probes(probes)

        logger.info('{} locations without nodes'.format(loc_without_probes))

    measurement_strategy = MeasurementStrategy(args.measurement_strategy)

    logger.debug('finished ripe')

    processes = []

    process_count = args.number_processes

    if args.debug:
        process_count = 1

    if args.ip_filter_file:
        ip_set = set()
        with open(args.ip_filter_file) as ip_filter_file:
            for line in ip_filter_file:
                ip_set.add(line.strip())

        ips = list(ip_set)
    else:
        ips = None

    db_session.close()

    for pid in range(0, process_count):

        if ips:
            ips_count = len(ips)
            ips_start_index = int(pid * (ips_count / process_count))
            ips_end_index = int((pid + 1) * (ips_count / process_count))

            if pid + 1 == process_count:
                ips_end_index = ips_count

            ips_for_process = ips[ips_start_index:ips_end_index]
        else:
            ips_for_process = None

        process = mp.Process(
            target=ripe_check_process,
            args=(pid, ripe_create_sema, ripe_slow_down_sema, args.bill_to,
                  args.without_new_measurements, args.allowed_measurement_age,
                  args.api_key, args.domain_block_limit, process_count,
                  args.include_ip_encoded, measurement_strategy,
                  args.probes_per_measurement, args.buffer_time,
                  args.measurement_packets, args.use_efficient_probes,
                  location_to_probes_dct, args.stop_without_old_results,
                  ips_for_process, args.endless_measurements,
                  args.random_domains),
            name='domain_checking_{}'.format(pid))

        processes.append(process)

    for process in processes:
        process.start()

    alive = len(processes)
    while alive > 0:
        try:
            process_sts = [pro.is_alive() for pro in processes]
            if process_sts.count(True) != alive:
                alive = process_sts.count(True)
                logger.debug('{} processes alive'.format(alive))
            for process in processes:
                process.join()
        except KeyboardInterrupt:
            pass

    if finish_event:
        finish_event.set()

    if generator_thread:
        generator_thread.join()

    logger.debug('{} processes alive'.format(alive))
    end_time = time.time()
    logger.info('running time: {}'.format((end_time - start_time)))
    return 0
Example #16
def main():
    """Main function"""
    parser = configargparse.ArgParser(
        default_config_files=['find_default.ini'])

    __create_parser_arguments(parser)
    args = parser.parse_args()

    global logger
    logger = util.setup_logger(args.logging_file,
                               'find',
                               loglevel=args.log_level)

    global engine
    engine = create_engine(args.database_name)

    trie = create_trie(args.code_blacklist_file, args.word_blacklist_file)

    code_to_location_blacklist = {}
    if args.code_to_location_blacklist_file:
        with open(args.code_to_location_blacklist_file
                  ) as code_to_location_blacklist_file:
            json_txt = ""
            for line in code_to_location_blacklist_file:
                line = line.strip()
                if line and not line.startswith('#'):
                    json_txt += line
            code_to_location_blacklist = json.loads(json_txt)

    location_match_queue = mp.Queue()
    stop_event = threading.Event()
    handle_location_matches_thread = threading.Thread(
        target=handle_location_matches,
        name='handle-location-matches',
        args=(location_match_queue, stop_event))
    handle_location_matches_thread.start()

    processes = []
    for index in range(0, args.number_processes):
        process = mp.Process(target=search_process,
                             args=(index, trie, code_to_location_blacklist,
                                   args.domain_block_limit,
                                   args.number_processes,
                                   location_match_queue),
                             kwargs={
                                 'amount': args.amount,
                                 'debug': args.log_level == 'DEBUG'
                             },
                             name='find_locations_{}'.format(index))
        process.start()
        processes.append(process)

    for process in processes:
        process.join()

    Session = create_session_for_process(engine)
    db_session = Session()
    update_query = update(DomainLabel).values(
        last_searched=datetime.datetime.now())
    db_session.execute(update_query)
    db_session.commit()
    db_session.close()
    Session.remove()

    stop_event.set()
    handle_location_matches_thread.join()
    location_match_queue.join_thread()
Example #17
def parse_caida_data(bz2_compressed: bool, days_in_past: int, probe_id_dct: typing.Dict[str, int],
                     parsed_files_queue: mp.Queue, filename):
    Session = create_session_for_process(engine)
    db_session = Session()

    try:
        logger.debug('parsing %s', filename)

        probe_id = get_probe_id_from_path(filename)
        probe_db_id = probe_id_dct[probe_id]

        modification_time = datetime.datetime.fromtimestamp(os.path.getmtime(filename))

        measurements = collections.defaultdict(dict)

        if abs((modification_time - datetime.datetime.now()).days) >= days_in_past:
            return

        if bz2_compressed:
            with bz2.open(filename, 'rt') as ripe_archive_file:
                for line in ripe_archive_file:
                    measurement = parse_measurement(line, probe_db_id, days_in_past)

                    if measurement:
                        if measurement.probe_id not in \
                                measurements[measurement.destination_address] or \
                                measurements[measurement.destination_address][
                                    measurement.probe_id].min_rtt > \
                                measurement.min_rtt:
                            measurements[measurement.destination_address][measurement.probe_id] = \
                                measurement
        else:
            with open(filename) as caida_archive_filedesc, \
                    mmap.mmap(caida_archive_filedesc.fileno(), 0,
                              access=mmap.ACCESS_READ) as caida_archive_file:
                line = caida_archive_file.readline().decode('utf-8')

                while len(line) > 0:
                    measurement = parse_measurement(line, probe_db_id, days_in_past)

                    if measurement:
                        if measurement.probe_id not in \
                                measurements[measurement.destination_address] or \
                                measurements[measurement.destination_address][
                                    measurement.probe_id].min_rtt > \
                                measurement.min_rtt:
                            measurements[measurement.destination_address][
                                measurement.probe_id] = \
                                measurement

                    line = caida_archive_file.readline().decode('utf-8')

        measurement_results = []
        for probe_measurement_dct in measurements.values():
            measurement_results.extend(probe_measurement_dct.values())

        db_session.bulk_save_objects(measurement_results)
        logger.info('parsed and saved %s measurements', len(measurement_results))
        db_session.commit()

    except Exception:
        logger.exception('Error while parsing file: ')
    else:
        parsed_files_queue.put(filename)
    finally:
        # the finally clause also runs after the early return for files that are too old
        db_session.close()
        Session.remove()

    logger.info('parse process for file %s finished', filename)
Example #18
def parse_ripe_data(line_queue: mp.Queue, finished_reading: mp.Event,
                    probe_dct: typing.Dict[int, RipeAtlasProbe],
                    probe_latency_queue: mp.Queue):
    Session = create_session_for_process(engine)
    db_session = Session()

    results = collections.defaultdict(dict)
    min_rtt_results = collections.defaultdict(dict)

    def line_generator():
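        # yield lines from the queue until reading has finished and the queue is drained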
        try:
            rline = line_queue.get(timeout=5)
        except queue.Empty:
            logger.exception('found empty queue')
            return

        status_msg = False
        read_fails = 0
        while True:
            if rline:
                yield rline
            if finished_reading.is_set() and (line_queue.empty()
                                              or read_fails == 5):
                return
            if finished_reading.is_set() and not status_msg:
                logger.debug('reading finished finishing processing')
                status_msg = True
            rline = None
            try:
                rline = line_queue.get(timeout=2)
                read_fails = 0
            except queue.Empty:
                logger.debug('failed reading')
                read_fails += 1

    def save_measurement_results(m_results: collections.defaultdict, db_sess):
        measurement_results = []
        for probe_measurement_dct in m_results.values():
            measurement_results.extend(probe_measurement_dct.values())

        db_sess.bulk_save_objects(measurement_results)
        logger.info('parsed and saved %s measurements',
                    len(measurement_results))
        db_sess.commit()

        m_results.clear()

    failure_counter = 0
    parsed_lines = 0

    for line in line_generator():
        parsed_lines += 1
        try:
            measurement_result_dct = json.loads(line)

            measurement_result = parse_measurement(measurement_result_dct,
                                                   probe_dct,
                                                   probe_latency_queue)

            if not measurement_result:
                continue

            probe_id = measurement_result.probe_id
            destination_address = measurement_result.destination_address
            current_min_result = min_rtt_results[destination_address].get(
                probe_id, None)

            if current_min_result is None or current_min_result > measurement_result.min_rtt:
                results[destination_address][probe_id] = measurement_result
                min_rtt_results[destination_address][
                    probe_id] = measurement_result.min_rtt

            if len(results) >= 10**6:
                save_measurement_results(results, db_session)

        except Exception:
            failure_counter += 1
            logger.exception('Error while parsing line %s', line)

            if parsed_lines > 100 and failure_counter >= parsed_lines / 10:
                logger.critical(
                    'failure rate too high "%s" of "%s"! stopping!',
                    failure_counter, parsed_lines)
                save_measurement_results(results, db_session)
                break

    save_measurement_results(results, db_session)
    db_session.close()
    Session.remove()
    line_queue.close()

    logger.info('finished parsing')
Example #19
def handle_location_matches(location_match_queue: mp.Queue,
                            stop_event: threading.Event):
    class LocationMatch:
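        """Holds a location hint (CodeMatch) together with the domain label ids that still have to be linked to it."""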
        def __init__(self, location_hint):
            self.id = 0
            self.location_hint = location_hint
            self.location_code = location_hint.code
            self.location_code_type = location_hint.code_type
            self.location_id = location_hint.location_id
            self.domain_label_ids = set()
            self.old_domain_label_ids = set()

        def get_insert_values(self):
            return [{
                'location_hint_id': self.id,
                'domain_label_id': domain_label_id
            } for domain_label_id in self.domain_label_ids]

        def add_domain_label_id(self, domain_label_id):
            if domain_label_id not in self.domain_label_ids and \
                    domain_label_id not in self.old_domain_label_ids:
                self.domain_label_ids.add(domain_label_id)

        def handled_domains(self):
            self.old_domain_label_ids = self.old_domain_label_ids.union(
                self.domain_label_ids)
            self.domain_label_ids.clear()

        def __hash__(self):
            return hash(
                make_location_hint_key(self.location_id, self.location_code,
                                       self.location_code_type))

    def make_location_hint_key(location_id, code, code_type):
        return '{},{},{}'.format(location_id, code, code_type)

    def save(matches_to_save, new_matches, db_sess):
        if new_matches:
            db_sess.bulk_save_objects(
                [match.location_hint for match in new_matches],
                return_defaults=True)
            for new_match in new_matches:
                new_match.id = new_match.location_hint.id

        insert_values = []
        for match in matches_to_save:
            insert_values.extend(match.get_insert_values())
            match.handled_domains()

        new_matches.clear()
        matches_to_save.clear()

        if insert_values:
            insert_expr = location_hint_label_table.insert().values(
                insert_values)
            db_sess.execute(insert_expr)

    Session = create_session_for_process(engine)
    db_session = Session()

    new_matches = []
    matches_to_save = set()
    location_hints = {}
    counter = 0
    ccounter = 0

    while not (stop_event.is_set() and location_match_queue.empty()):
        try:
            location_matches_tuples = location_match_queue.get(timeout=1)
            if isinstance(location_matches_tuples, int):
                delete_expr = location_hint_label_table.delete().where(
                    location_hint_label_table.c.domain_label_id ==
                    location_matches_tuples)
                db_session.execute(delete_expr)
                logger.debug('saving for {}'.format(location_matches_tuples))
            else:
                for location_id, location_code, location_code_type, domain_label_id in \
                        location_matches_tuples:
                    location_hint_key = make_location_hint_key(
                        location_id, location_code, location_code_type)
                    try:
                        location_hint = location_hints[location_hint_key]
                    except KeyError:
                        real_code_type = LocationCodeType(location_code_type)
                        location_hint_obj = CodeMatch(location_id,
                                                      code_type=real_code_type,
                                                      domain_label=None,
                                                      code=location_code)
                        db_session.add(location_hint_obj)
                        location_hint = LocationMatch(location_hint_obj)
                        location_hints[location_hint_key] = location_hint
                        new_matches.append(location_hint)

                    matches_to_save.add(location_hint)
                    location_hint.add_domain_label_id(domain_label_id)

                    counter += 1
                    if counter >= 10**4:
                        logger.debug('saving')
                        save(matches_to_save, new_matches, db_session)
                        counter = 0

                        ccounter += 1
                        if ccounter >= 10:
                            db_session.commit()
                            ccounter = 0

        except queue.Empty:
            pass

    save(matches_to_save, new_matches, db_session)
    db_session.commit()
    db_session.close()
    Session.remove()
    location_match_queue.close()
    logger.info('stopped')