Example #1
class WCAggregatorBolt(Bolt):
    outputs = [
        Stream(fields=['info', 'wordcounts_list'], name="default"),
        Stream(fields=['info', 'keywords'], name="rec")
    ]

    def initialize(self, conf, ctx):
        self.pid = os.getpid()
        self.wcs = []
        self.count = 0

    def process(self, tup):
        info = tup.values[0]
        wordcount_dict = tup.values[1]

        self.wcs.append(wordcount_dict)
        self.count += 1
        if self.count == 10:
            self.count = 0
            global_wordcount = wcm.aggregate_wordcount_dicts(self.wcs)
            global_wordcount_dict_best = {
                k: wcm.take_firsts(v, n=3)
                for k, v in global_wordcount.items()
                if k in ["PERSON", "ORGANIZATION"]
            }
            global_wordcount_best = wcm.aggregate_subjects(
                global_wordcount_dict_best)
            self.logger.info("sending tokens to RecursiveBolt")
            for token in global_wordcount_best:
                self.emit([info, token[0]], stream="rec")
        self.emit([info, self.wcs])
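
For reference, a minimal topology sketch of how the two named streams could be subscribed to. It assumes the streamparse Topology DSL, where a specific stream is selected by indexing a component spec with the stream name (the same pattern the debug-hook helper in Example #4 uses); ArticleSpout, RecursiveBolt and ReportBolt are hypothetical names, not taken from the source.

from streamparse import Topology


class WordCountTopology(Topology):
    article_spout = ArticleSpout.spec()  # hypothetical spout
    wc_aggregator = WCAggregatorBolt.spec(inputs=[article_spout])
    # hypothetical bolt that consumes only the named "rec" stream
    recursive_bolt = RecursiveBolt.spec(inputs=[wc_aggregator['rec']])
    # hypothetical bolt that consumes the "default" stream
    report_bolt = ReportBolt.spec(inputs=[wc_aggregator])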
Example #2
class MultiBolt(Bolt):
    outputs = [
        Stream(['value'], 'word'),
        Stream(['value'], 'number'),
    ]

    def initialize(self, conf, ctx):
        self.pid = os.getpid()

    def process(self, tup):
        word = tup.values.word
        if isinstance(word, int):
            self.emit([word], stream='number')
        else:
            self.emit([word], stream='word')
Example #3
class TDMSParseBolt(Bolt):
    # channel properties as tuple fields
    tuple_fields = ('timestamp', 'time_offset', 'time_increment', 'samples',
                    'channel_name', 'module_name', 'data')

    # channel names, used as the names of the output streams
    channels = ('FCXF-X-02-T01', 'FCXF-X-02-T02', 'FCXF-X-02-T03',
                'FCXF-X-02-T04', 'FCXF-X-03-T01', 'FCXF-X-03-T02',
                'FCXF-X-03-T03', 'FCXF-X-03-T04', 'FCXF-X-03-T05',
                'FCXF-X-03-T06', 'FCXF-X-04-T01', 'FCXF-X-04-T02',
                'FCXF-X-04-T03', 'FCXF-X-04-T04', 'FCXF-X-02-S01',
                'FCXF-X-02-S02', 'FCXF-X-02-S03', 'FCXF-X-02-S04',
                'FCXF-X-02-A01', 'FCXF-X-03-A01', 'FCXF-X-03-A02',
                'FCXF-X-04-A01', 'FCXF-X-03-S05', 'FCXF-X-03-S06',
                'FCXF-X-03-S01', 'FCXF-X-03-S02', 'FCXF-X-03-S03',
                'FCXF-X-03-S04', 'FCXF-X-04-S01', 'FCXF-X-04-S02',
                'FCXF-X-04-S03', 'FCXF-X-04-S04')

    # declare one output stream per channel
    outputs = [Stream(tuple_fields, channel) for channel in channels]

    def process(self, tup):
        data = tup.values[0]

        # decode the base64 encoded data
        decoded_data = base64.b64decode(data)

        # the TDMS payload starts with the b'TDSm' tag (the decoded data is bytes)
        i = decoded_data.index(b'TDSm')

        data_stream = io.BytesIO(decoded_data[i:])
        tdms_file = TdmsFile(data_stream)

        for out_tup in self._parse(tdms_file):
            # route each tuple to the stream named after its channel
            # (field index -3 is 'channel_name')
            self.emit(out_tup, stream=out_tup[-3])

    def _parse(self, tdms_file):
        for group in tdms_file.groups():
            for channel in tdms_file.group_channels(group):
                if channel.channel in self.channels:
                    # acquire this channel's 'wf_start_time' property
                    # and get its timestamp value for JSON serialize
                    start_time = channel.property('wf_start_time')
                    timestamp = time.mktime(start_time.timetuple())
                    tup = [timestamp]

                    # acquire this channel's other properties
                    others = [
                        v for k, v in channel.properties.items()
                        if k != 'wf_start_time'
                    ]
                    tup.extend(others)

                    # acquire channel data
                    data = channel.data.tolist()
                    tup.append(data)

                    yield tup
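
Because the outputs list generates one stream per channel, a topology can route each channel group independently. A minimal sketch under the same streamparse Topology DSL assumption as above; RawTDMSSpout is a hypothetical spout, and DetrendBolt refers to the bolt shown in the later examples.

from streamparse import Topology


class BridgeMonitoringTopology(Topology):
    raw_spout = RawTDMSSpout.spec()  # hypothetical data source
    tdms_parser = TDMSParseBolt.spec(inputs=[raw_spout])
    # subscribe the detrend bolt only to the temperature channels ('-T' codes)
    detrend = DetrendBolt.spec(inputs=[
        tdms_parser[ch] for ch in TDMSParseBolt.channels if '-T' in ch
    ])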
Example #4
def setup_debug_hooks(topo_class):
    inputs = []
    graph = topo_class.to_flux_dict('')
  
    for spec in topo_class.specs:
        for source_name, output in spec.outputs.items():
            if source_name == "error_handler_bolt":
                continue

            inputs.append(spec[source_name])
  
    topo_class.debug_bolt = ShellBolt.spec(command='coffee', script='node/bolts/debug.coffee',
                                           outputs=[Stream(fields=['config'])],
                                           inputs=inputs)
  
    topo_class.debug_bolt.name = "debug_bolt"
  
    Topology.add_bolt_spec(topo_class.debug_bolt, topo_class.thrift_bolts)
    topo_class.specs.append(topo_class.debug_bolt)
  
    directory = 'src/topology_graphs'

    if not os.path.exists(directory):
        os.makedirs(directory)

    topo_name = topo_class.__module__
    split_topo_name = topo_name.split('.')

    if len(split_topo_name) > 1:
        topo_name = split_topo_name[1]
    
    path = "{}/{}.json".format(directory, topo_name)
    fullpath = os.path.abspath(path)
  
    with open(fullpath, 'w+') as outfile:
        json.dump(graph, outfile, indent=2)
Example #5
class Tokenizer(AbstractBolt):
    """Split the mail in token parts (body, attachments, etc.) and sends
    these parts to others bolts."""

    outputs = [
        Stream(
            fields=['sha256_random', 'mail', 'is_filtered'],
            name='mail'),
        Stream(
            fields=['sha256_random', 'raw_mail', 'mail_type', 'is_filtered'],
            name='raw_mail'),
        Stream(
            fields=['sha256_random', 'body', 'is_filtered'],
            name='body'),
        Stream(
            fields=['sha256_random', 'network', 'is_filtered'],
            name='network'),
        Stream(
            fields=['sha256_random', 'with_attachments', 'attachments'],
            name='attachments')]

    def __getattr__(self, name):
        return self.conf[name]

    def get_persistent_path(self, filter_name):
        return os.path.join(
            self.persistent_path, "{}.dump".format(filter_name))

    def initialize(self, stormconf, context):
        super(Tokenizer, self).initialize(stormconf, context)

        self.filter_types = ("mails", "attachments", "network")
        self.load_filters()

        self.mailparser = {
            MAIL_PATH: mailparser.parse_from_file,
            MAIL_PATH_OUTLOOK: mailparser.parse_from_file_msg,
            MAIL_STRING: mailparser.parse_from_string}

    def load_filters(self):
        for i in self.filter_types:
            if getattr(self, "filter_" + i):
                path = self.get_persistent_path(i)
                try:
                    obj = load_obj(path)
                    setattr(self, "analyzed_" + i, obj)
                except (IOError, EOFError, ValueError, BadPickleGet):
                    setattr(self, "analyzed_" + i, deque(
                        maxlen=getattr(self, "maxlen_" + i)))

    def dump_filters(self):
        for i in self.filter_types:
            if getattr(self, "filter_" + i):
                path = self.get_persistent_path(i)
                dump_obj(path, getattr(self, "analyzed_" + i))
                self.log("Dumped RAM filter {!r} in {!r}".format(i, path))

    def _make_mail(self, tup):
        raw_mail = tup.values[0]
        mail_type = tup.values[5]
        rand = '_' + ''.join(random.choice('0123456789') for i in range(10))
        self.parser = self.mailparser[mail_type](raw_mail)

        # get only the main headers because their number can explode;
        # Elasticsearch can't manage all possible headers
        mail = self.parser.mail_partial
        mail["headers"] = self.parser.headers_json

        # Data mail sources
        mail["mail_server"] = tup.values[1]
        mail["mailbox"] = tup.values[2]
        mail["priority"] = tup.values[3]
        mail["sender_ip"] = self.parser.get_server_ipaddress(tup.values[4])

        # Fingerprints of body mail
        (mail["md5"], mail["sha1"], mail["sha256"], mail["sha512"],
            mail["ssdeep"]) = fingerprints(self.parser.body.encode('utf-8'))
        sha256_rand = mail["sha256"] + rand

        if mail_type in (MAIL_PATH, MAIL_PATH_OUTLOOK):
            mail_string = raw_mail.split("/")[-1].replace(".processing", "")
            self.log("{}: {}".format(mail_string, mail["sha256"]))
            with open(raw_mail) as f:
                mail["size"] = len(f.read())
        elif mail_type == MAIL_STRING:
            mail["size"] = len(raw_mail)

        # Add path to result
        if mail_type == MAIL_PATH:
            mail["mail_file"] = raw_mail.split("/")[-1].replace(
                ".processing", "")

        # Dates
        if mail.get('date'):
            mail["date"] = mail.get('date').isoformat()
        else:
            mail["date"] = datetime.datetime.utcnow().isoformat()

        mail["analisys_date"] = datetime.datetime.utcnow().isoformat()

        # Adding custom headers
        for h in tup.values[6]:
            mail["custom_" + h] = get_header(self.parser.message, h)

        # Remove attachments
        mail.pop("attachments", None)

        return sha256_rand, mail

    def process_tick(self, freq):
        """Every freq seconds you reload configuration. """
        super(Tokenizer, self).process_tick(freq)
        self.dump_filters()

    def process(self, tup):
        try:
            sha256_rand, mail = self._make_mail(tup)
            sha256 = sha256_rand.split("_")[0]
            self.log("Processing started: {}".format(sha256))
            with_attachments = False
            attachments = []
            body = self.parser.body
            raw_mail = tup.values[0]
            mail_type = tup.values[5]

            # If filter network is enabled
            is_filtered_net = False
            if self.filter_network:
                if mail["sender_ip"] in self.analyzed_network:
                    is_filtered_net = True

                # Update database ip addresses analyzed
                self.analyzed_network.append(mail["sender_ip"])

            # If filter mails is enabled
            is_filtered_mail = False
            if self.filter_mails:
                if mail["sha1"] in self.analyzed_mails:
                    mail.pop("body", None)
                    body = six.text_type()
                    raw_mail = six.text_type()
                    is_filtered_mail = True

                # Update database mails analyzed
                self.analyzed_mails.append(mail["sha1"])

            if self.parser.attachments:
                with_attachments = True
                attachments = MailAttachments.withhashes(
                    self.parser.attachments)

                # If filter attachments is enabled
                if self.filter_attachments:
                    hashes = attachments.filter(self.analyzed_attachments)
                    self.analyzed_attachments.extend(hashes)

        except TypeError as e:
            self.raise_exception(e, tup)

        except UnicodeDecodeError as e:
            self.raise_exception(e, tup)
Example #6
class Tokenizer(AbstractBolt):
    """Split the mail in token parts (body, attachments, etc.) and sends
    these parts to others bolts."""

    outputs = [
        Stream(
            fields=['sha256_random', 'mail', 'is_filtered'],
            name='mail'),
        Stream(
            fields=['sha256_random', 'raw_mail', 'mail_type', 'is_filtered'],
            name='raw_mail'),
        Stream(
            fields=['sha256_random', 'body', 'is_filtered'],
            name='body'),
        Stream(
            fields=['sha256_random', 'network', 'is_filtered'],
            name='network'),
        Stream(
            fields=['sha256_random', 'with_attachments', 'attachments'],
            name='attachments')]

    def initialize(self, stormconf, context):
        super(Tokenizer, self).initialize(stormconf, context)

        self.mailparser = {
            MAIL_PATH: mailparser.parse_from_file,
            MAIL_PATH_OUTLOOK: mailparser.parse_from_file_msg,
            MAIL_STRING: mailparser.parse_from_string}

        self.mails_analyzed = deque(maxlen=self.conf["maxlen_mails"])
        self.network_analyzed = deque(maxlen=self.conf["maxlen_network"])
        self.attachments_analyzed = deque(
            maxlen=self.conf["maxlen_attachments"])

        self._load_filters()

    def _load_filters(self):
        self.filter_mails_enabled = self.conf["filter_mails"]
        self.filter_network_enabled = self.conf["filter_network"]
        self.filter_attachments_enabled = self.conf["filter_attachments"]

    def _make_mail(self, tup):
        raw_mail = tup.values[0]
        mail_type = tup.values[5]
        rand = '_' + ''.join(random.choice('0123456789') for i in range(10))
        self.parser = self.mailparser[mail_type](raw_mail)
        mail = self.parser.mail

        # Data mail sources
        mail["mail_server"] = tup.values[1]
        mail["mailbox"] = tup.values[2]
        mail["priority"] = tup.values[3]
        mail["sender_ip"] = self.parser.get_server_ipaddress(tup.values[4])

        # Fingerprints of body mail
        (mail["md5"], mail["sha1"], mail["sha256"], mail["sha512"],
            mail["ssdeep"]) = fingerprints(self.parser.body.encode('utf-8'))
        sha256_rand = mail["sha256"] + rand

        # Add path to result
        if mail_type == MAIL_PATH:
            mail["path_mail"] = raw_mail

        # Dates
        if mail.get('date'):
            mail["date"] = mail.get('date').isoformat()
        else:
            mail["date"] = datetime.datetime.utcnow().isoformat()

        mail["analisys_date"] = datetime.datetime.utcnow().isoformat()

        # Adding custom headers
        for h in tup.values[6]:
            mail["custom_" + h] = self.parser.message.get(h)

        # Remove attachments
        mail.pop("attachments", None)

        return sha256_rand, mail

    def process_tick(self, freq):
        """Every freq seconds you reload configuration. """
        super(Tokenizer, self).process_tick(freq)
        self._load_filters()

    def process(self, tup):
        try:
            sha256_rand, mail = self._make_mail(tup)
            with_attachments = False
            attachments = []
            body = self.parser.body
            raw_mail = tup.values[0]
            mail_type = tup.values[5]

            # If filter network is enabled
            is_filtered_net = False
            if self.filter_network_enabled:
                if mail["sender_ip"] in self.network_analyzed:
                    is_filtered_net = True

                # Update database ip addresses analyzed
                self.network_analyzed.append(mail["sender_ip"])

            # If filter mails is enabled
            is_filtered_mail = False
            if self.filter_mails_enabled:
                if mail["sha1"] in self.mails_analyzed:
                    mail.pop("body", None)
                    body = six.text_type()
                    raw_mail = six.text_type()
                    is_filtered_mail = True

                # Update database mails analyzed
                self.mails_analyzed.append(mail["sha1"])

            if self.parser.attachments:
                with_attachments = True
                attachments = MailAttachments.withhashes(
                    self.parser.attachments)

                # If filter attachments is enabled
                if self.filter_attachments_enabled:
                    hashes = attachments.filter(self.attachments_analyzed)
                    self.attachments_analyzed.extend(hashes)

        except TypeError as e:
            self.raise_exception(e, tup)

        except UnicodeDecodeError as e:
            self.raise_exception(e, tup)
Example #7
class Tokenizer(AbstractBolt):
    """Split the mail in token parts (body, attachments, etc.). """

    outputs = [
        Stream(fields=['sha256_random', 'mail', 'is_filtered'], name='mail'),
        Stream(fields=['sha256_random', 'body', 'is_filtered'], name='body'),
        Stream(fields=['sha256_random', 'network', 'is_filtered'],
               name='network'),
        Stream(fields=['sha256_random', 'with_attachments', 'attachments'],
               name='attachments')
    ]

    def initialize(self, stormconf, context):
        super(Tokenizer, self).initialize(stormconf, context)

        self._parser = MailParser()
        self._mails_analyzed = deque(maxlen=self.conf["maxlen_mails"])
        self._network_analyzed = deque(maxlen=self.conf["maxlen_network"])
        self._attachments_analyzed = deque(
            maxlen=self.conf["maxlen_attachments"])
        self._load_filters()

    def _load_filters(self):
        self._filter_mails_enabled = self.conf["filter_mails"]
        self._filter_network_enabled = self.conf["filter_network"]
        self._filter_attachments_enabled = self.conf["filter_attachments"]

    @property
    def filter_mails_enabled(self):
        return self._filter_mails_enabled

    @property
    def filter_network_enabled(self):
        return self._filter_network_enabled

    @property
    def filter_attachments_enabled(self):
        return self._filter_attachments_enabled

    @property
    def parser(self):
        return self._parser

    def _make_mail(self, tup):
        raw_mail = tup.values[0]
        mail_format = tup.values[5]
        rand = '_' + ''.join(random.choice('0123456789') for i in range(10))

        # Check that the mail format is valid
        if mail_format != STRING and mail_format != PATH:
            raise InvalidMailFormat(
                "Invalid mail format {!r}. Choose {!r} or {!r}".format(
                    mail_format, STRING, PATH))

        # Parsing mail
        if mail_format == PATH:
            if os.path.exists(raw_mail):
                self.parser.parse_from_file(raw_mail)
        else:
            self.parser.parse_from_string(raw_mail)

        # Getting all parts
        mail = self.parser.parsed_mail_obj

        # Data mail sources
        mail["mail_server"] = tup.values[1]
        mail["mailbox"] = tup.values[2]
        mail["priority"] = tup.values[3]
        mail["sender_ip"] = self.parser.get_server_ipaddress(tup.values[4])

        # Fingerprints of body mail
        (mail["md5"], mail["sha1"], mail["sha256"], mail["sha512"],
         mail["ssdeep"]) = fingerprints(self.parser.body.encode('utf-8'))
        sha256_rand = mail["sha256"] + rand

        # Add path to result
        if mail_format == PATH:
            mail["path_mail"] = raw_mail

        # Dates
        if mail.get('date'):
            mail["date"] = mail.get('date').isoformat()
        else:
            mail["date"] = datetime.datetime.utcnow().isoformat()

        mail["analisys_date"] = datetime.datetime.utcnow().isoformat()

        # Remove attachments
        mail.pop("attachments", None)

        return sha256_rand, mail

    def process_tick(self, freq):
        """Every freq seconds you reload configuration. """
        super(Tokenizer, self).process_tick(freq)
        self._load_filters()

    def process(self, tup):
        try:
            sha256_rand, mail = self._make_mail(tup)
            with_attachments = False
            attachments = []
            body = self.parser.body

            # If filter network is enabled
            is_filtered = False
            if self.filter_network_enabled:
                if mail["sender_ip"] in self._network_analyzed:
                    is_filtered = True

                # Update database of analyzed mails
                self._network_analyzed.append(mail["sender_ip"])

            # If filter mails is enabled
            is_filtered = False
            if self.filter_mails_enabled:
                if mail["sha1"] in self._mails_analyzed:
                    mail.pop("body", None)
                    body = six.text_type()
                    is_filtered = True

                # Update database of analyzed mails
                self._mails_analyzed.append(mail["sha1"])

            # Emit only attachments
            raw_attach = self.parser.attachments_list

            if raw_attach:
                with_attachments = True
                attachments = MailAttachments.withhashes(raw_attach)

                # If filter attachments is enabled
                if self.filter_attachments_enabled:
                    hashes = attachments.filter(self._attachments_analyzed)
                    self._attachments_analyzed.extend(hashes)

        except TypeError as e:
            self.raise_exception(e, tup)

        except UnicodeDecodeError as e:
            self.raise_exception(e, tup)
Example #8
class DetrendBolt(Bolt):
    # todo: try pyecharts
    tup_fields = ('timestamp', 'time_offset', 'time_increment', 'samples',
                  'channel_name', 'module_name', 'data')
    outputs = [Stream(fields=tup_fields, name='detrend')]

    def initialize(self, storm_conf, context):
        # String, dt1: how often to recalculate the mode value, e.g. '5s', '20ms'
        # String, dt2: how much data is used to calculate the mode value, e.g. '5s', '20ms'
        self.dt1 = pd.Timedelta(storm_conf.get('dt1', '5s'))
        self.dt2 = pd.Timedelta(storm_conf.get('dt2', '15s'))

    def process(self, tup):
        """
        step 1: accept tup and convert it to a pandas.DataFrame;
        step 2: call self._detrend() to remove the mode value from the original data;
        step 3: convert the resulting pandas.DataFrame to a list and emit it.
        :param streamparse.Tuple, tup: tup.values =
                [timestamp, time_offset, time_increment, samples, channel_name, module_name, data]
        :return streamparse.Tuple, tup: tup.values =
                [timestamp, time_offset, time_increment, samples, channel_name, module_name, data]
        """
        timestamp = tup.values[0]
        time_increment = tup.values[2]
        channel_name = tup.values[4]
        data = tup.values[6]

        index = pd.date_range(start=pd.Timestamp(timestamp, unit='s',
                                                 tz='UTC'),
                              periods=len(data),
                              freq='{}ms'.format(int(time_increment / 0.001)))
        df = pd.DataFrame(data=data, index=index, columns=[channel_name])

        try:
            self.history = self.history.combine_first(df)
        except AttributeError:
            self.history = df
            self.freq = df.index.freq
            self.res = list(tup.values)
        self.history = self.history.asfreq(self.freq)

        for df in self._detrend():
            self.res[0] = df.index[0].timestamp()
            self.res[6] = df[channel_name].values.tolist()
            self.emit(self.res, stream='detrend')

    def _detrend(self):
        """
        remove the mode value from the original data
        :return pandas.Dataframe df: de-trended signal segments
        """
        self.log('length of history data: {}'.format(len(self.history)),
                 level='info')

        n = int(self.dt1 / self.freq)
        m = int(self.dt2 / self.freq)

        while len(self.history) >= m and self.history.head(m).notnull().all().all() \
                or len(self.history) >= 2 * m and self.history.head(m).notnull().any().all():
            mode = self.history.head(m).mode()
            df = self.history[:n] - mode.loc[0].values
            self.history = self.history[n:]
            yield df
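
The heart of _detrend is estimating the mode over a window of m samples and subtracting it from the next n samples. A self-contained sketch of that idea outside Storm, using synthetic data and arbitrary window sizes:

import pandas as pd

# synthetic 1 Hz signal: a constant offset (the "trend") plus a few outliers
index = pd.date_range('2021-01-01', periods=30, freq='1s', tz='UTC')
signal = pd.DataFrame({'FCXF-X-02-T01': [20.0] * 30}, index=index)
signal.iloc[::7] += 0.5  # the outliers do not change the mode, which stays 20.0

n, m = 5, 15  # emit every n samples, estimate the mode over m samples
history = signal.copy()
while len(history) >= m:
    mode = history.head(m).mode()  # most frequent value per column
    detrended = history[:n] - mode.loc[0].values
    history = history[n:]
    print(detrended.round(3).iloc[:, 0].tolist())  # mostly zeros after detrending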
Example #9
class Tokenizer(AbstractBolt):
    """Split the mail in token parts (body, attachments, etc.). """

    outputs = [
        Stream(fields=['sha256_random', 'mail', 'is_filtered'], name='mail'),
        Stream(fields=['sha256_random', 'body', 'is_filtered'], name='body'),
        Stream(fields=['sha256_random', 'with_attachments', 'attachments'],
               name='attachments')
    ]

    def initialize(self, stormconf, context):
        super(Tokenizer, self).initialize(stormconf, context)

        self._parser = MailParser()
        self._mails_analyzed = deque(maxlen=self.conf["maxlen_mails"])
        self._attachments_analyzed = deque(
            maxlen=self.conf["maxlen_attachments"])
        self._load_filters()

    def _load_filters(self):
        self._filter_mails_enabled = self.conf["filter_mails"]
        self._filter_attachments_enabled = self.conf["filter_attachments"]

    @property
    def filter_mails_enabled(self):
        return self._filter_mails_enabled

    @property
    def filter_attachments_enabled(self):
        return self._filter_attachments_enabled

    @property
    def parser(self):
        return self._parser

    def _filter_attachments(self):
        """
        Filter out attachments that are already in memory (already analyzed).
        """
        attachments = self.parser.attachments_list
        new_attachments = []

        for i in attachments:
            if i.get("content_transfer_encoding") == "base64":
                f = fingerprints(i["payload"].decode('base64'))
            else:
                f = fingerprints(i["payload"])

            if self.filter_attachments_enabled and \
                    f[1] in self._attachments_analyzed:
                new_attachments.append({
                    "md5": f[0],
                    "sha1": f[1],
                    "sha256": f[2],
                    "sha512": f[3],
                    "ssdeep": f[4],
                    "is_filtered": True
                })
            else:
                i["is_filtered"] = False
                new_attachments.append(i)

            self._attachments_analyzed.append(f[1])

        return new_attachments

    def _make_mail(self, tup):
        raw_mail = tup.values[0]
        mail_format = tup.values[4]
        rand = '_' + ''.join(random.choice('0123456789') for i in range(10))

        # Check that the mail format is valid
        if mail_format != STRING and mail_format != PATH:
            raise InvalidMailFormat(
                "Invalid mail format '{}'. Choose '{}' or '{}'".format(
                    mail_format, STRING, PATH))

        # Parsing mail
        if mail_format == PATH:
            if os.path.exists(raw_mail):
                self.parser.parse_from_file(raw_mail)
        else:
            self.parser.parse_from_string(raw_mail)

        # Getting all parts
        mail = self.parser.parsed_mail_obj

        # Data mail sources
        mail['mail_server'] = tup.values[1]
        mail['mailbox'] = tup.values[2]
        mail['priority'] = tup.values[3]

        # Fingerprints of body mail
        (mail['md5'], mail['sha1'], mail['sha256'], mail['sha512'],
         mail['ssdeep']) = fingerprints(self.parser.body.encode('utf-8'))
        sha256_rand = mail['sha256'] + rand

        # Add path to result
        if mail_format == PATH:
            mail['path_mail'] = raw_mail

        # Dates
        if mail.get('date'):
            mail['date'] = mail.get('date').isoformat()
        else:
            mail['date'] = datetime.datetime.utcnow().isoformat()

        mail['analisys_date'] = datetime.datetime.utcnow().isoformat()

        # Remove attachments
        mail.pop("attachments", None)

        return sha256_rand, raw_mail, mail

    def process_tick(self, freq):
        """Every freq seconds you reload configuration. """
        super(Tokenizer, self).process_tick(freq)
        self._load_filters()

    def process(self, tup):
        sha256_rand, raw_mail, mail = self._make_mail(tup)
        with_attachments = False
        attachments = []

        # If mail is already analyzed
        if self.filter_mails_enabled and \
                mail["sha1"] in self._mails_analyzed:
            mail.pop("body", None)
            body = ""
            is_filtered = True
        else:
            body = self.parser.body
            is_filtered = False

        # Emit mail
        self.emit([sha256_rand, mail, is_filtered], stream="mail")

        # Emit body
        self.emit([sha256_rand, body, is_filtered], stream="body")

        # Update database of analyzed mails
        self._mails_analyzed.append(mail["sha1"])

        # Emit only attachments
        if self.parser.attachments_list:
            attachments = self._filter_attachments()
            with_attachments = True

        self.emit([sha256_rand, with_attachments, attachments],
                  stream="attachments")
Example #10
class DetrendBolt(Bolt):
    tup_fields = ('timestamp', 'time_offset', 'time_increment', 'samples',
                  'channel_name', 'module_name', 'data')
    outputs = [Stream(fields=tup_fields, name='detrend')]

    def initialize(self, storm_conf, context):
        """
        receive parameters set in topology definition from storm_conf argument
        :param dict storm_conf: the Storm configuration for this component
        :param dict context: information about the component’s place within the topology
        """
        self.min_tup_num = storm_conf.get('min_tup_num', 3)

    def process(self, tup):
        """
        step 1: receive tup and convert it to a pandas.DataFrame;
        step 2: call self._detrend() to remove the mode value from the original data;
        step 3: convert the resulting pandas.DataFrame to a list and emit it.
        :param streamparse.Tuple, tup: tup.values =
                [timestamp, time_offset, time_increment, samples, channel_name, module_name, data]
        :return streamparse.Tuple, tup: tup.values =
                [timestamp, time_offset, time_increment, samples, channel_name, module_name, data]
        """
        timestamp = tup.values[0]
        time_increment = tup.values[2]
        channel_name = tup.values[4]
        data = tup.values[6]

        start_time = pd.Timestamp(timestamp, unit='s', tz='UTC')
        periods = len(data)
        freq = '{}ms'.format(int(time_increment / 0.001))

        index = pd.MultiIndex.from_product(
            [[start_time],
             pd.date_range(start=start_time, periods=periods, freq=freq)],
            names=['start_time', 'timestamp'])
        df = pd.DataFrame(data=data, index=index, columns=[channel_name])

        try:
            self.history = self.history.combine_first(df)
        except AttributeError:
            self.history = df
            self.res = list(tup.values)

        for df in self._detrend():
            self.res[0] = df.index[0].timestamp()
            self.res[6] = df[channel_name].values.tolist()
            self.emit(self.res, stream='detrend')

    def _detrend(self):
        """
        remove the mode value from the original data
        :return pandas.Dataframe df: de-trended signal segments
        """
        self.log('\nlength of history data: {}'.format(len(self.history)),
                 level='info')

        start_time_index = self.history.index.get_level_values(
            level='start_time')
        unique_values = start_time_index.unique()
        self.log('\nstart time of tuples: {}'.format(unique_values))
        while len(unique_values) >= self.min_tup_num:
            mode = self.history.mode()
            df = self.history.loc[unique_values[0]] - mode.loc[0].values
            self.history.drop(index=unique_values[0],
                              level='start_time',
                              inplace=True)
            unique_values = unique_values.delete(0)
            yield df
Example #11
class NeutralAxisBolt(Bolt):
    tup_fields = ('timestamp_min', 'timestamp_max', 'neutral_axis')
    outputs = [Stream(fields=tup_fields, name='neutral_axis')]

    def initialize(self, storm_conf, context):
        """
        receive parameters set in topology definition from storm_conf argument
        :param dict storm_conf: the Storm configuration for this component
        :param dict context: information about the component’s place within the topology
        """

        threshold = storm_conf['threshold']
        group_freq = storm_conf['group_freq']
        height = storm_conf['height']

        height = dict(height)
        columns = height.keys()
        index = pd.MultiIndex.from_tuples([],
                                          names=['group_time', 'sample_time'])

        self.threshold = threshold
        self.group_freq = group_freq
        self.height = pd.Series(height, name='height')
        self.height.sort_values(ascending=True, inplace=True)
        self.history = pd.DataFrame(data=[], index=index, columns=columns)

    def process(self, tup):
        """
        step 1: receive tup and reorganize it into a pandas.DataFrame;
        step 2: call self._neutral_axis() to get the neutral-axis height of a section;
        step 3: convert the resulting pandas.DataFrame to a list and emit it.
        :param streamparse.Tuple, tup: tup.values =
                [timestamp, time_offset, time_increment, samples, channel_name, module_name, data]
        :return streamparse.Tuple, tup: tup.values =
                [minima_timestamp, maxima_timestamp, neutral_axis]
        """
        # convert tuple to pandas.DataFrame with pandas.MultiIndex
        timestamp = tup.values[0]
        time_increment = tup.values[2]
        channel_name = tup.values[4]
        data = tup.values[6]

        start = pd.Timestamp(timestamp, unit='s', tz='UTC')
        periods = len(data)
        freq = '{}ms'.format(int(time_increment / 0.001))

        sample_time = pd.date_range(start=start, periods=periods, freq=freq)
        group_time = sample_time.floor(self.group_freq)

        index = pd.MultiIndex.from_arrays([group_time, sample_time],
                                          names=['group_time', 'sample_time'])
        df = pd.DataFrame(data=data, index=index, columns=[channel_name])

        # self.history holds all history data
        self.history = self.history.combine_first(df)
        # todo: discard history data 10 minutes before the latest data

        # group self.history by index <group_time> level
        notnull_count = self.history.groupby(
            level='group_time').apply(lambda x: x.notnull().sum().sum())
        count = self.history.columns.size * pd.Timedelta(
            self.group_freq) / pd.Timedelta(freq)
        notnull_count = notnull_count[notnull_count >= count]

        if not notnull_count.empty:
            # extract data ready for processing and discard them from history data
            data_ready = self.history.loc[notnull_count.index]
            self.history.drop(index=notnull_count.index,
                              level='group_time',
                              inplace=True)

            res = data_ready.groupby(level='group_time').apply(
                self._neutral_axis)
            for idx in res.index:
                self.emit(res[idx], stream='neutral_axis')

    def _neutral_axis(self, df):
        self.log('---------------executing neutral_axis method---------------')
        master_channel = self.height.index[0]

        if df[master_channel].ptp() > self.threshold:
            idx_max = df[master_channel].idxmax()
            idx_min = df[master_channel].idxmin()
            ptp = df.loc[idx_max] - df.loc[idx_min]
            ptp.rename('ptp', inplace=True)

            xy = pd.concat([ptp, self.height], axis='columns')
            # deg = 1 stands for a linear fit
            # the lowest-order coefficient (the intercept) is the height of the neutral axis
            neutral_axis = polyfit(xy['ptp'], xy['height'], 1)[-1]

            return [
                idx_min[1].timestamp(), idx_max[1].timestamp(), neutral_axis
            ]
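
A worked sketch of the fit inside _neutral_axis, outside Storm: the peak-to-peak response at each known sensor height is fitted linearly, and because numpy.polyfit with deg=1 returns [slope, intercept], the last coefficient is the height at which the peak-to-peak response would be zero, i.e. the neutral axis. The sensor names, heights and responses below are made up.

import numpy as np
import pandas as pd

# hypothetical sensor heights (m) and peak-to-peak responses for one group
height = pd.Series({'S01': 0.2, 'S02': 0.6, 'S03': 1.0, 'S04': 1.4},
                   name='height')
ptp = pd.Series({'S01': 120.0, 'S02': 60.0, 'S03': 0.0, 'S04': -60.0},
                name='ptp')

xy = pd.concat([ptp, height], axis='columns')
# deg=1: fit height as a linear function of ptp; the intercept (the last
# coefficient) is the height where ptp == 0, i.e. the neutral axis
neutral_axis = np.polyfit(xy['ptp'], xy['height'], 1)[-1]
print(round(neutral_axis, 6))  # -> 1.0 for this made-up data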