Example #1
 def timer_expired(sig, frame):
     progress_logger.info('Watchdog expired (more than {} seconds passed); {}'.format(Watchdog.duration, str(context)))
     memory_logger.info('Dumping all stack frames after watchdog expired:')
     Watchdog.log_all_stack_frames()
     MemTracker.logMemory('watchdog expired')
     Watchdog.duration *= 2
     progress_logger.info('Setting new watchdog timer, for {} seconds'.format(Watchdog.duration))
     Watchdog.reset()
Example #2
 def timer_expired(sig, frame):
     progress_logger.info(
         'Watchdog expired (more than {} seconds passed); {}'.format(
             Watchdog.duration, str(context)))
     memory_logger.info(
         'Dumping all stack frames after watchdog expired:')
     Watchdog.log_all_stack_frames()
     MemTracker.logMemory('watchdog expired')
     Watchdog.duration *= 2
     progress_logger.info(
         'Setting new watchdog timer, for {} seconds'.format(
             Watchdog.duration))
     Watchdog.reset()
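Note: timer_expired(sig, frame) above has the signature of a Unix signal handler, and the Watchdog.reset() call suggests an alarm-based watchdog. The project's Watchdog class is not shown in these examples; purely as an illustration (an assumption, not the repository's actual wiring), such a handler would typically be registered with the standard signal module like this:

import signal

def install_watchdog(duration):
    # Install the handler and arm a SIGALRM to fire after `duration` seconds (Unix only).
    signal.signal(signal.SIGALRM, timer_expired)
    signal.alarm(duration)

def reset_watchdog(duration):
    # Re-arm the alarm; signal.alarm() replaces any previously scheduled alarm.
    signal.alarm(duration)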
Example #3
 def build_detectors(self, inbox):
     logs.context['step'] = 'build_detectors'
     detectors = [Detector(inbox) for Detector in self.features]
     verbose = self.should_enable_extra_debugging(inbox, detectors)
     for i, detector in enumerate(detectors):
         logs.context['detector'] = type(detector).__name__
         detector.create_sender_profile(self.sender_profile_num_emails)
         logs.Watchdog.reset()
         if verbose:
             progress_logger.info('Finished creating {} sender profile, RSS = {}'.format(type(detector).__name__, MemTracker.cur_mem_usage()))
             MemTracker.logMemory('finished creating {} sender profile'.format(type(detector).__name__))
         else:
             logs.RateLimitedMemTracker.checkmem('finished creating {} sender profile'.format(type(detector).__name__))
     del logs.context['step']
     del logs.context['detector']
     return detectors
Example #4
 def build_detectors(self, inbox):
     logs.context['step'] = 'build_detectors'
     detectors = [Detector(inbox) for Detector in self.features]
     verbose = self.should_enable_extra_debugging(inbox, detectors)
     for i, detector in enumerate(detectors):
         logs.context['detector'] = type(detector).__name__
         detector.create_sender_profile(self.sender_profile_indeces)
         logs.Watchdog.reset()
         if verbose:
             progress_logger.info('Finished creating {} sender profile, RSS = {}'.format(type(detector).__name__, MemTracker.cur_mem_usage()))
             MemTracker.logMemory('finished creating {} sender profile'.format(type(detector).__name__))
         else:
             logs.RateLimitedMemTracker.checkmem('finished creating {} sender profile'.format(type(detector).__name__))
     del logs.context['step']
     del logs.context['detector']
     return detectors
Example #5
    def execute(self):
        detector_names = ', '.join([d.__name__ for d in self.detectors])
        progress_logger.info(
            "Config settings: use_name_in_from={}, parallel={}, detectors={}".
            format(self.use_name_in_from, self.parallel, detector_names))

        start_time = time.time()
        MemTracker.initialize(memory_logger)
        logs.Watchdog.initialize()
        logs.context = {'phase': 'generate_features'}
        if self.generate_data_matrix or self.generate_test_matrix:
            self.generate_features()
        logs.context = {}
        MemTracker.logMemory(
            "After generating features/Before generating model")
        logs.context = {'phase': 'generate_model_output'}
        if self.generate_model:
            self.generate_model_output()
        logs.context = {}
        MemTracker.logMemory("After generating model")
        end_time = time.time()
        min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int(
            (end_time - start_time) % 60)
        progress_logger.info(
            "Phish Detector took {} minutes, {} seconds to run.".format(
                min_elapsed, sec_elapsed))
        logs.RateLimitedLog.flushall()
Example #6
    def execute(self):
        detector_names = ', '.join([d.__name__ for d in self.detectors])
        progress_logger.info("Config settings: use_name_in_from={}, parallel={}, detectors={}".format(self.use_name_in_from, self.parallel, detector_names))

        start_time = time.time()
        MemTracker.initialize(memory_logger)
        logs.Watchdog.initialize()
        logs.context = {'phase': 'generate_features'}
        if self.generate_data_matrix or self.generate_test_matrix:
            self.generate_features()
        logs.context = {}
        MemTracker.logMemory("After generating features/Before generating model")
        logs.context = {'phase': 'generate_model_output'}
        if self.generate_model:
            self.generate_model_output()
        logs.context = {}
        MemTracker.logMemory("After generating model")
        end_time = time.time()
        min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int((end_time - start_time) % 60)
        progress_logger.info("Phish Detector took {} minutes, {} seconds to run.".format(min_elapsed, sec_elapsed))
        logs.RateLimitedLog.flushall()
Example #7
 def should_enable_extra_debugging(self, inbox, detectors):
     if len(inbox) < 10000:
         return False
     detector_names = ', '.join([type(d).__name__ for d in detectors])
     progress_logger.info('Enabling extra debugging for large inbox with {} messages, {}; RSS = {}, creating sender profiles for {}'.format(len(inbox), logs.context, MemTracker.cur_mem_usage(), detector_names))
     return True
Example #8
 def should_enable_extra_debugging(self, inbox, detectors):
     if len(inbox) < 10000:
         return False
     detector_names = ', '.join([type(d).__name__ for d in detectors])
     progress_logger.info('Enabling extra debugging for large inbox with {} messages, {}; RSS = {}, creating sender profiles for {}'.format(len(inbox), logs.context, MemTracker.cur_mem_usage(), detector_names))
     return True
Example #9
    def test_and_report(self):
        """ Assumptions:
         - test.mat exists in directory structure and
           clf is classifier trained on all data matrices.
         - test.mat has data['email_index']
        Results is [path, index, probability]
        """
        self.clean_all()
        if not os.path.exists(self.results_dir):
            os.makedirs(self.results_dir)

        # creates this file in common/output
        email_probabilities = open(os.path.join("output", "email_probabilities.txt"), "w")

        low_volume_top_10 = PriorityQueue()
        high_volume_top_10 = PriorityQueue()

        numPhish, testSize = 0, 0
        numEmails4Sender = {}

        logging_interval = 60 # TODO(matthew): Move to config.yaml
        progress_logger.info("Starting to test on data.")
        start_time = time.time()
        last_logged_time = start_time

        results = np.empty(shape=(0, TOTAL_SIZE), dtype='S200')

        end_of_last_memory_track = dt.datetime.now()
        num_senders_completed = 0

        for root, dirs, files in os.walk(self.email_path):
            curr_time = time.time()
            if (curr_time - last_logged_time) > logging_interval * 60:
                progress_logger.info('Exploring directory #{}'.format(num_senders_completed))
                progress_logger.info('Testing has run for {} minutes'.format(int((curr_time - start_time) / 60)))
                last_logged_time = curr_time
            if self.memlog_freq >= 0:
                now = dt.datetime.now()
                time_elapsed = now - end_of_last_memory_track
                minutes_elapsed = time_elapsed.seconds / 60.0
                if minutes_elapsed > self.memlog_freq:
                    MemTracker.logMemory("After completing " + str(num_senders_completed) + " iterations in test_and_report")
                    end_of_last_memory_track = dt.datetime.now()
            if 'test.mat' in files:
                path = os.path.join(root, "test.mat")
                data = sio.loadmat(path)
                test_X = data['test_data']
                sample_size = test_X.shape[0]
                if sample_size == 0:
                    continue
                test_indx = np.arange(sample_size).reshape(sample_size, 1)
                indx = data['email_index'].reshape(sample_size, 1)
                test_mess_id = data['message_id'].reshape(sample_size, 1).astype("S200")
                test_res = self.output_phish_probabilities(test_X, indx, root, test_indx, test_mess_id)
                if test_res is not None:
                    for email in test_res:
                        testSize += 1
                        sender = self.get_sender(email[0])
                        emailPath = email[0]
                        probability = float(email[2])
                        message_ID = email[4].strip(" ")
                        if probability > 0.5:
                            numPhish += 1

                        # caches the num_emails value for each sender
                        if sender not in numEmails4Sender:
                            num_emails = sum(1 for line in open(emailPath))
                            numEmails4Sender[sender] = num_emails
                        else:
                            num_emails = numEmails4Sender[sender]

                        # checks which priority queue to add item to
                        if num_emails < self.bucket_thres:
                            low_volume_top_10.push(email, probability)
                        else:
                            high_volume_top_10.push(email, probability)

                        # writes an email's message ID and phish probability to a file
                        email_probabilities.write(message_ID + "," + str(probability) + "\n")
    
        email_probabilities.close()
        self.num_phish, self.test_size = numPhish, testSize
        low_volume_output = low_volume_top_10.createOutput()
        high_volume_output = high_volume_top_10.createOutput()
        output = [low_volume_output, high_volume_output]

        # DEBUG information - don't print to main log
        # debug_logger.info(pp.pformat(output))

        self.d_name_per_feat = self.parse_feature_names()
        self.pretty_print(low_volume_output, "low_volume")
        self.pretty_print(high_volume_output, "high_volume")
        self.write_summary_output(output)

        end_time = time.time()
        min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int((end_time - start_time) % 60)
        progress_logger.info("Finished testing on data in {} minutes, {} seconds. {} directories tested.".format(min_elapsed, sec_elapsed, num_senders_completed))
Example #10
 def checkmem_rate_limited(self):
     cur = MemTracker.cur_mem_usage()
     if cur >= 2 * self.max_mem_logged:
         self.max_mem_logged = cur
         MemTracker.logMemory(self.task + "; " + str(context))
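Note: MemTracker is likewise project-specific; its cur_mem_usage() presumably returns the process RSS figure that shows up in the log lines above. On Linux, the current RSS can be read without extra dependencies, for example (a sketch under that assumption, not the project's implementation):

import os

def current_rss_mb():
    # /proc/self/statm: the second field is the resident set size in pages (Linux only).
    with open('/proc/self/statm') as f:
        resident_pages = int(f.read().split()[1])
    return resident_pages * os.sysconf('SC_PAGE_SIZE') / (1024.0 * 1024.0)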
Example #11
    def test_and_report(self):
        """ Assumptions:
         - test.mat exists in directory structure and
           clf is classifier trained on all data matrices.
         - test.mat has data['email_index']
        Results is [path, index, probability]
        """
        self.clean_all()
        if not os.path.exists(self.results_dir):
            os.makedirs(self.results_dir)

        # creates this file in common/output
        email_probabilities = open(os.path.join("output", "email_probabilities.txt"), "w")
        #low_volume_top_10 = PriorityQueue(self.bucket_size)
        #high_volume_top_10 = PriorityQueue(self.bucket_size)

        top_emails = PriorityQueue(self.bucket_size)

        numPhish, testSize = 0, 0

        logging_interval = 60 # TODO(matthew): Move to config.yaml
        progress_logger.info("Starting to test on data.")
        start_time = time.time()
        last_logged_time = start_time

        results = np.zeros(shape=(0, TOTAL_SIZE), dtype='S200')

        end_of_last_memory_track = dt.datetime.now()
        num_senders_completed = 0

        num_message_id_failed = 0
        total_completed = 0

        self.d_name_per_feat = self.parse_feature_names()
        
        if (self.filterRecipients):
            targetRecipientName, targetRecipientEmail = self.createTargetRecipientList()
 
        for root, dirs, files in os.walk(self.email_path):
            curr_time = time.time()
            if (curr_time - last_logged_time) > logging_interval * 60:
                progress_logger.info('Exploring directory #{}'.format(num_senders_completed))
                progress_logger.info('Testing has run for {} minutes'.format(int((curr_time - start_time) / 60)))
                last_logged_time = curr_time
            if self.memlog_freq >= 0:
                now = dt.datetime.now()
                time_elapsed = now - end_of_last_memory_track
                minutes_elapsed = time_elapsed.seconds / 60.0
                if minutes_elapsed > self.memlog_freq:
                    MemTracker.logMemory("After completing " + str(num_senders_completed) + " iterations in test_and_report")
                    end_of_last_memory_track = dt.datetime.now()
            if 'test.mat' in files:
                path = os.path.join(root, "test.mat")
                data = sio.loadmat(path)
                test_X = data['test_data']
                sample_size = test_X.shape[0]
                if sample_size == 0:
                    continue
                test_indx = np.arange(sample_size).reshape(sample_size, 1)
                indx = data['email_index'].reshape(sample_size, 1)
                try:
                    test_mess_id = data['message_id'].reshape(sample_size, 1).astype("S200")
                except ValueError as e:
                    progress_logger.info("Size mismatch of data['message_id'], data['test_data']: {}, {}".format(data['message_id'].shape, data['test_data'].shape))
                    debug_logger.info("Size mismatch in {}".format(path))
                    num_message_id_failed += 1
                    test_mess_id = np.zeros(shape=(sample_size, 1), dtype="S200")
                test_res = self.get_email_records(test_X, indx, root, test_indx, test_mess_id)
                if test_res is not None:
                    for email in test_res:
                        filtered = False
                        if self.filterRecipients:
                            filtered = not self.isTargetRecipient(targetRecipientName, targetRecipientEmail, email.email["TO"])
                        if not filtered:
                            testSize += 1
                            sender = email.path
                            emailPath = email.path
                            probability = float(email.probability_phish)
                            message_ID = email.email_message_id.strip(" ")
                            if probability > 0.5:
                                numPhish += 1

                            top_emails.push(email, probability)
                            # writes an email's message ID and phish probability to a file
                            email_probabilities.write(message_ID + "," + str(probability) + "\n")
                    total_completed += 1
                num_senders_completed += 1
        progress_logger.info("total # of times size of data['message_id'] != sample_size: " + str(num_message_id_failed))
        progress_logger.info("total # of successes: " + str(total_completed))

        email_probabilities.close()
        self.num_phish, self.test_size = numPhish, testSize
        top_emails_output = top_emails.createOutput()
        output = [top_emails_output]

        # DEBUG information - don't print to main log
        # debug_logger.info(pp.pformat(output))
        self.pretty_print(top_emails_output, "all_emails")
        self.write_summary_output(output)

        end_time = time.time()
        min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int((end_time - start_time) % 60)
        progress_logger.info("Finished testing on data in {} minutes, {} seconds. {} directories tested.".format(min_elapsed, sec_elapsed, num_senders_completed))
Example #12
    def generate_features(self):
        if self.use_name_in_from != 0:
            Detector.USE_NAME = True

        dir_to_generate = []

        if (self.filter_senders):
            targetSenderNames, targetSenderEmails = self.createTargetSendersSet(
            )

        progress_logger.info(
            'Starting directory aggregation in feature generation.')
        start_time = time.time()
        for dirpath, dirnames, filenames in os.walk(self.root_dir):
            if ((self.generate_data_matrix
                 and self.regular_filename in filenames
                 and self.phish_filename in filenames)
                    or (self.generate_test_matrix
                        and self.regular_filename in filenames)):
                command = [
                    "wc", "-l", "{}/{}".format(dirpath, self.regular_filename)
                ]
                filtered = False
                if (self.filter_senders):
                    lastPartofPath = os.path.basename(
                        os.path.normpath(dirpath))
                    targetMatch = self.isTargetSender(targetSenderNames,
                                                      targetSenderEmails,
                                                      lastPartofPath)
                    filtered = not targetMatch
                if filtered:
                    continue
                try:
                    wc_output = subprocess.check_output(command)
                    wc_output_split = wc_output.split()
                    line_count = int(wc_output_split[0])

                    if line_count < 50000 and not filtered:  # Ignore inboxes with more than 50,000 emails
                        dir_to_generate.append(dirpath)
                    logs.Watchdog.reset()
                except subprocess.CalledProcessError:
                    debug_logger.warn(
                        'Could not calculate line count for directory {}'.
                        format(dirpath))
                    continue
        end_time = time.time()
        min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int(
            (end_time - start_time) % 60)
        progress_logger.info(
            'Finished directory aggregation in feature generation in {} minutes, {} seconds'
            .format(min_elapsed, sec_elapsed))

        BATCH_SIZE = self.batch_threading_size
        if self.parallel:
            progress_logger.info(
                'Starting feature generation with {} threads in parallel with batch size {}...'
                .format(self.num_threads, BATCH_SIZE))
            start_time = time.time()
            feature_generators = []
            for directory in dir_to_generate:
                feature_generator = self.prep_features(directory)
                feature_generators.append(feature_generator)
                if len(feature_generators) == BATCH_SIZE:
                    p = Pool(self.num_threads)
                    p.map(run_generator, feature_generators)
                    p.close()
                    p.join()
                    feature_generators = []
            if len(feature_generators) > 0:
                p = Pool(self.num_threads)
                p.map(run_generator, feature_generators)
                p.close()
                p.join()
            end_time = time.time()
            min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int(
                (end_time - start_time) % 60)
            progress_logger.info(
                'Finished feature generation in {} minutes, {} seconds.'.
                format(min_elapsed, sec_elapsed))
        else:
            progress_logger.info(
                'Starting feature generation serially for {} directories'.
                format(len(dir_to_generate)))
            start_time = time.time()
            last_logged_time = start_time
            dir_count = 0
            end_of_last_memory_track = dt.datetime.now()
            for directory in dir_to_generate:
                dir_count += 1
                logs.context = {'feature gen': dir_count}
                curr_time = time.time()
                if (curr_time - last_logged_time) > self.logging_interval * 60:
                    progress_logger.info(
                        'Processing directory #{} of {}'.format(
                            dir_count, len(dir_to_generate)))
                    progress_logger.info(
                        'Feature generation has run for {} minutes'.format(
                            int((curr_time - start_time) / 60)))
                    last_logged_time = curr_time
                feature_generator = self.prep_features(directory)
                feature_generator.run()
                logs.Watchdog.reset()
                now = dt.datetime.now()
                time_elapsed = now - end_of_last_memory_track
                minutes_elapsed = time_elapsed.seconds / 60.0
                if minutes_elapsed > self.memlog_gen_features_frequency:
                    MemTracker.logMemory(
                        'After generating features for {}th sender'.format(
                            dir_count))
                    end_of_last_memory_track = dt.datetime.now()
                logs.context = {}
            end_time = time.time()
            min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int(
                (end_time - start_time) % 60)
            progress_logger.info(
                'Finished feature generation in {} minutes, {} seconds.'.
                format(min_elapsed, sec_elapsed))
Example #13
    def generate_features(self):
        if self.use_name_in_from != 0:
            Detector.USE_NAME = True

        dir_to_generate = []

        progress_logger.info('Starting directory aggregation in feature generation.')
        start_time = time.time()
        for dirpath, dirnames, filenames in os.walk(self.root_dir):
            if ((self.generate_data_matrix and self.regular_filename in filenames and self.phish_filename in filenames)
                or (self.generate_test_matrix and self.regular_filename in filenames)):
                dir_to_generate.append(dirpath)
                logs.Watchdog.reset()
        end_time = time.time()
        min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int((end_time - start_time) % 60)
        progress_logger.info('Finished directory aggregation in feature generation in {} minutes, {} seconds'.format(min_elapsed, sec_elapsed))
        
        BATCH_SIZE = self.batch_threading_size
        if self.parallel:
            progress_logger.info('Starting feature generation with {} threads in parallel with batch size {}...'.format(self.num_threads, BATCH_SIZE))
            start_time = time.time()
            feature_generators = []
            for directory in dir_to_generate:
                feature_generator = self.prep_features(directory)
                feature_generators.append(feature_generator)
                if len(feature_generators) == BATCH_SIZE:
                    p = Pool(self.num_threads)
                    p.map(run_generator, feature_generators)
                    p.close()
                    p.join()
                    feature_generators = []
            if len(feature_generators) > 0:
                p = Pool(self.num_threads)
                p.map(run_generator, feature_generators)
                p.close()
                p.join()
            end_time = time.time()
            min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int((end_time - start_time) % 60)
            progress_logger.info('Finished feature generation in {} minutes, {} seconds.'.format(min_elapsed, sec_elapsed))
        else:
            progress_logger.info('Starting feature generation serially for {} directories'.format(len(dir_to_generate)))
            start_time = time.time()
            last_logged_time = start_time
            dir_count = 0
            end_of_last_memory_track = dt.datetime.now()
            for directory in dir_to_generate:
                dir_count += 1
                logs.context = {'feature gen': dir_count}
                curr_time = time.time()
                if (curr_time - last_logged_time) > self.logging_interval * 60:
                    progress_logger.info('Processing directory #{} of {}'.format(dir_count, len(dir_to_generate)))
                    progress_logger.info('Feature generation has run for {} minutes'.format(int((curr_time - start_time) / 60)))
                    last_logged_time = curr_time
                feature_generator = self.prep_features(directory)
                feature_generator.run()
                logs.Watchdog.reset()
                now = dt.datetime.now()
                time_elapsed = now - end_of_last_memory_track
                minutes_elapsed = time_elapsed.seconds / 60.0
                if minutes_elapsed > self.memlog_gen_features_frequency:
                    MemTracker.logMemory('After generating features for {}th sender'.format(dir_count))
                    end_of_last_memory_track = dt.datetime.now()
                logs.context = {}
            end_time = time.time()
            min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int((end_time - start_time) % 60)
            progress_logger.info('Finished feature generation in {} minutes, {} seconds.'.format(min_elapsed, sec_elapsed))
Example #14
 def checkmem_rate_limited(self):
     cur = MemTracker.cur_mem_usage()
     if cur >= 2 * self.max_mem_logged:
         self.max_mem_logged = cur
         MemTracker.logMemory(self.task + "; " + str(context))