Example #1
    def _get_training_data_y(self):
        data_getter = DataGetter()

        config.DF_BASE_START_DATE = config.TRAINING_DATE_START
        config.DF_BASE_END_DATE = config.TRAINING_DATE_END
        df_result = data_getter.get_deltas()
        df_result = df_result[config.TRAINING_DATA_TARGET]
        y = df_result.values
        if self.X is None:
            raise Exception(
                'X needs to be defined before defining Y. Run _get_training_data_x before this method.'
            )
        y = y[0:self.X.shape[0]]
        y = self._one_hot_encode(y, 2)

        config.DF_BASE_START_DATE = config.VALIDATION_DATE_START
        config.DF_BASE_END_DATE = config.VALIDATION_DATE_END
        df_result = data_getter.get_deltas()
        df_result = df_result[config.TRAINING_DATA_TARGET]
        y_val = df_result.values
        if self.X_val is None:
            raise Exception(
                'X_val needs to be defined before defining y_val. Run _get_training_data_x before this method.'
            )
        y_val = y_val[0:self.X_val.shape[0]]
        y_val = self._one_hot_encode(y_val, 2)

        return y, y_val
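These methods rely on a _one_hot_encode helper that none of the examples define. A minimal sketch of a two-class encoder with that call shape (an assumed implementation, not the author's):

import numpy as np

def _one_hot_encode(y, num_classes):
    # Assumed behavior: map integer class labels 0..num_classes-1 to rows
    # of a one-hot matrix of shape (len(y), num_classes).
    y = np.asarray(y, dtype=int)
    encoded = np.zeros((y.shape[0], num_classes))
    encoded[np.arange(y.shape[0]), y] = 1.0
    return encoded

print(_one_hot_encode([0, 1, 1], 2))  # [[1. 0.], [0. 1.], [0. 1.]]
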
Example #2
    def __init_model(self):
        self.train_manager = DataGetter(
            'D://nlp work//gwx//task//NER//dataset//CLUE-NER2020//train.json')
        '''
            In data_getter's bio_convert, the labels and sentences are expanded
            into lists of equal length: sentences shorter than the longest one
            are padded with [0] to match the longest sentence's length. Not
            sure whether this step is useful, but the original code's
            data_manager has it, and data_getter and data_manager should
            produce the same output objects.
        '''
        #self.total_size = len(self.train_manager.sentence)
        data = {
            "batch_size": 20,
            #"input_size": self.train_manager.input_size,
            "vocab": self.train_manager.vocab,
            "tag_map": self.train_manager.get_tag2idx,
        }
        self.save_params(data)
        dev_manager = DataGetter(
            'D://nlp work//gwx//task//NER//dataset//CLUE-NER2020//train.json')
        #self.dev_batch = dev_manager.iteration()
        '''
            Port the iteration code into data_getter.
        '''

        self.model = BiLSTMCRF(
            tag_map=self.train_manager.tag_map,
            batch_size=self.batch_size,
            vocab_size=Config.max_len,
            dropout=self.dropout,
            embedding_dim=self.embedding_size,
            hidden_dim=self.hidden_size,
        )
        self.restore_model()
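
The docstring above describes padding labels and sentences to the longest sentence's length. A minimal sketch of that step, assuming both are lists of integer-id lists (not the author's code):

def pad_to_longest(sequences, pad_value=0):
    # Pad every sequence with pad_value up to the longest sequence's length.
    max_len = max(len(seq) for seq in sequences)
    return [seq + [pad_value] * (max_len - len(seq)) for seq in sequences]

print(pad_to_longest([[5, 3, 8], [7]]))  # [[5, 3, 8], [7, 0, 0]]
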
Example #3
    def _get_testing_data_x(self):
        data_getter = DataGetter()
        config.DF_BASE_START_DATE = config.TESTING_DATE_START
        config.DF_BASE_END_DATE = config.TESTING_DATE_END
        reshaped_data_lstm = data_getter.get_reshaped_data_for_lstm()
        self.X = reshaped_data_lstm
        return reshaped_data_lstm
Example #4
    def _get_testing_data_y(self):
        data_getter = DataGetter()
        df_result = data_getter.get_deltas()
        df_result = df_result[config.TRAINING_DATA_TARGET]
        y = df_result.values
        if self.X is None:
            raise Exception(
                'X needs to be defined before defining Y. Run _get_testing_data_x before this method.'
            )
        y = y[0:self.X.shape[0]]
        y = self._one_hot_encode(y, 2)
        return y
Example #5
    def _get_training_data_x(self):
        data_getter = DataGetter()

        config.DF_BASE_START_DATE = config.TRAINING_DATE_START
        config.DF_BASE_END_DATE = config.TRAINING_DATE_END
        X_train = data_getter.get_reshaped_data_for_lstm()
        self.X = X_train

        config.DF_BASE_START_DATE = config.VALIDATION_DATE_START
        config.DF_BASE_END_DATE = config.VALIDATION_DATE_END
        X_val = data_getter.get_reshaped_data_for_lstm()
        self.X_val = X_val

        return X_train, X_val
Example #6
def can(roles, kwargs):
    event_id = kwargs.get('event_id', None)
    if event_id:
        for el in DataGetter.get_user_events_roles(event_id):
            if el.role in roles:
                return True
    return False
Example #7
def can(roles, kwargs):
    event_id = kwargs.get("event_id", None)
    if event_id:
        for el in DataGetter.get_user_events_roles(event_id):
            if el.role in roles:
                return True
    return False
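
A self-contained exercise of can() with a stubbed DataGetter; the role names and event id are invented for illustration:

class _Role:
    def __init__(self, role):
        self.role = role

class DataGetter:
    @staticmethod
    def get_user_events_roles(event_id):
        # Stub standing in for the real database lookup.
        return [_Role('organizer')]

def can(roles, kwargs):
    event_id = kwargs.get('event_id', None)
    if event_id:
        for el in DataGetter.get_user_events_roles(event_id):
            if el.role in roles:
                return True
    return False

print(can(['organizer', 'coorganizer'], {'event_id': 42}))  # True
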
Example #8
    def __init_model(self):
        self.train_manager = DataGetter(
            'D://nlp work//gwx//task//NER//dataset//CLUE-NER2020//train.json')
        #self.total_size = len(self.train_manager.sentence)
        data = {
            "batch_size": 20,
            #"input_size": self.train_manager.input_size,
            "vocab": self.train_manager.vocab,
            "tag_map": self.train_manager.get_tag2idx,
        }
        self.save_params(data)
        dev_manager = DataGetter(
            'D://nlp work//gwx//task//NER//dataset//CLUE-NER2020//train.json')
        #self.dev_batch = dev_manager.iteration()

        self.model = BiLSTMCRF(
            tag_map=self.train_manager.tag_map,
            batch_size=self.batch_size,
            vocab_size=Config.max_len,
            dropout=self.dropout,
            embedding_dim=self.embedding_size,
            hidden_dim=self.hidden_size,
        )
        self.restore_model()
Example #9
from data_getter import DataGetter

D = DataGetter()
D_name = D.get_name()
D_number = D.get_number()


class DuichengWindow():
    def __init__(self):
        self.L1_name = D_name[0]
        self.L2_name = D_name[1]
        self.L3_name = D_name[2]
        self.L4_name = D_name[3]
        self.L5_name = D_name[4]
        self.L = D_number[0]
        self.W = D_number[1]
        self.G = D_number[2]
        self.num1 = D_number[3]
        self.num2 = D_number[4]
        self.L_a = D_number[5]
        self.L_b = self.L - 2 * self.G - self.L_a
        self.W_a1 = (self.W - 2 * self.G) // self.num1
        self.W_a2 = (self.W - 2 * self.G) // self.num2

    def SumWindow(self):

        if self.num2 == 1:
            L1 = 2 * ((self.L - self.G * 2) + (self.W - self.G * 2))
            print('窗框{}:{}'.format(self.L1_name, L1))  # window frame (窗框) perimeter
            L2 = self.num1 * (2 * (self.L_a + self.W_a1))
            print('窗扇{}:{}'.format(self.L2_name, L2))  # window sash (窗扇) perimeter
Example #10
class Train(object):
    def __init__(self, entry="train"):
        self.load_config()
        self.__init_model()

    def __init_model(self):
        self.train_manager = DataGetter(
            'D://nlp work//gwx//task//NER//dataset//CLUE-NER2020//train.json')
        '''
            In data_getter's bio_convert, the labels and sentences are expanded
            into lists of equal length: sentences shorter than the longest one
            are padded with [0] to match the longest sentence's length. Not
            sure whether this step is useful, but the original code's
            data_manager has it, and data_getter and data_manager should
            produce the same output objects.
        '''
        # train() needs total_size for its progress bar
        self.total_size = len(self.train_manager.sentence)
        data = {
            "batch_size": 20,
            #"input_size": self.train_manager.input_size,
            "vocab": self.train_manager.vocab,
            "tag_map": self.train_manager.get_tag2idx,
        }
        self.save_params(data)
        dev_manager = DataGetter(
            'D://nlp work//gwx//task//NER//dataset//CLUE-NER2020//train.json')
        #self.dev_batch = dev_manager.iteration()
        '''
            Port the iteration code into data_getter.
        '''

        self.model = BiLSTMCRF(
            tag_map=self.train_manager.tag_map,
            batch_size=self.batch_size,
            vocab_size=Config.max_len,
            dropout=self.dropout,
            embedding_dim=self.embedding_size,
            hidden_dim=self.hidden_size,
        )
        # the tag list used by evaluate() comes from the training data,
        # which only exists once train_manager has been created
        self.tags = self.train_manager.label
        self.restore_model()

    def load_config(self):
        # Default hyperparameters; the original code only checked that
        # config.py exists and never read values from it, so the defaults
        # are defined up front and used unconditionally.
        config = {
            "embedding_size": 3,
            "hidden_size": 128,
            "batch_size": 20,
            "dropout": 0.5,
        }
        try:
            fopen = open("config.py", 'r', encoding='UTF-8')
            fopen.close()
        except Exception as error:
            print("Load config failed, using default config {}".format(error))
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = "models/"
        self.dropout = config.get("dropout")

    def restore_model(self):
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore faild! {}".format(error))

    def save_params(self, data):
        with open("models/data.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    def load_params(self):
        with open("models/data.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        optimizer = optim.Adam(self.model.parameters())
        # optimizer = optim.SGD(ner_model.parameters(), lr=0.01)
        for epoch in range(1):
            index = 0
            for batch in self.train_manager.get_batch():
                index += 1
                self.model.zero_grad()

                train_labels, train_sentences = self.train_manager.bio_converter()
                # NOTE: __getitem__ is called on the CustomData class itself;
                # this needs a CustomData instance, e.g. dataset[index].
                input_tensor = CustomData.__getitem__(index)

                sentences_tensor = torch.tensor(train_sentences,
                                                dtype=torch.long)  # convert to tensors
                tags_tensor = torch.tensor(train_labels, dtype=torch.long)
                length_tensor = torch.tensor(len(self.train_manager.sentence),
                                             dtype=torch.long)

                loss = self.model.neg_log_likelihood(sentences_tensor,
                                                     tags_tensor,
                                                     length_tensor)
                progress = ("█" * int(index * 25 / self.total_size)).ljust(25)
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, index, self.total_size,
                    loss.cpu().tolist()[0]))
                self.evaluate(input_tensor)
                print("-" * 50)
                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(),
                           self.model_path + 'params.pkl')

    def evaluate(self, input_tensor):
        train_labels, train_sentences = self.train_manager.bio_converter()
        _, paths = self.model(input_tensor, train_sentences)
        print("\teval")
        for label in self.tags:
            f1_score(train_labels, paths, label, self.model.tag_map)
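
train() consumes self.train_manager.get_batch(), which none of the examples show; a plain batching generator of the kind the docstring asks to port might look like this (an assumption about its shape, not the real DataGetter code):

def get_batch(samples, batch_size=20):
    # Yield consecutive slices of batch_size samples each.
    for start in range(0, len(samples), batch_size):
        yield samples[start:start + batch_size]

for batch in get_batch(list(range(7)), batch_size=3):
    print(batch)  # [0, 1, 2] then [3, 4, 5] then [6]
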
Example #11
    def runBacktestOffline(self, instrument, granularity, startep, endep):
        dg = DataGetter(MSSQLGetter(OandaGetter(instrument, granularity)))
        ti = BackTestTicker(dg, startep, endep)
        self._run(ti)
Example #12
"""
This module builds and executes the mailer system.
"""

import os
import sys
import csv
import requests
from mailer import Mailer
from data_getter import DataGetter
from parameters import (MAIL_SENDER, DATABASE_LOCATION, TARGET_DATABASE,
                        TEMPLATE_FILE, ATTACHMENTS_FOLDER, MAIL_COLUMN_NAME)

mailer = Mailer()
getter = DataGetter(DATABASE_LOCATION)


def get_mail_title():
    """Gets the title for the mail."""
    trailing = sys.argv[1:]
    return 'Sample Diffusion' if not trailing else " ".join(trailing)


def generate_message(template, subject, attachments, **kwargs):
    """Generates the adequate message depending on attachments."""
    text = template.format(**kwargs)
    if not attachments:
        return mailer.create_text_message(MAIL_SENDER,
                                          kwargs[MAIL_COLUMN_NAME], subject,
                                          text)
    # the source snippet was cut off mid-call; the trailing arguments are
    # assumed by analogy with create_text_message above
    return mailer.create_attachments_message(MAIL_SENDER,
                                             kwargs[MAIL_COLUMN_NAME], subject,
                                             text, attachments)
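
generate_message() fills a text template with per-recipient columns via str.format(**kwargs); a self-contained illustration with an invented template and row:

template = "Hello {name},\nyour sample {sample_id} has shipped."
row = {"name": "Ada", "sample_id": "S-17", "mail": "ada@example.org"}
print(template.format(**row))  # extra keys such as "mail" are simply ignored
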
Example #13
    def __init__(self, hostid):
        self.hostid = hostid
        self.load_items()
        self.dgetter = DataGetter()
        self.set_anom()
        self.alarm_on = False
Example #14
class ZabbixAnomaly(object):
    # NOTE: class-level attributes; the mutable items dict is shared across
    # instances because load_items() mutates it rather than reassigning it.
    items = {}
    anom = None
    datum = None
    hostid = ""
    dgetter = None
    alarm_on = False

    def __init__(self, hostid):
        self.hostid = hostid
        self.load_items()
        self.dgetter = DataGetter()
        self.set_anom()
        self.alarm_on = False

    def set_anom(self):
        host = confget("jubatus_server", "host")
        port = int(confget("jubatus_server", "port"))
        name = confget("jubatus_server", "name")
        self.anom = client.Anomaly(host, port, name)


    def load_items(self):
        strsql = "select itemid, valuesize from target_items where hostid = \'%s\' and enabled = 1;" % (self.hostid)
        data = self.exec_selsql(strsql)
        for row in data:
            itemid = str(row[0])
            valuesize = row[1]
            self.items[itemid] = Item(itemid, valuesize)


    def norm(self, itemid, value):
        return self.items[str(itemid)].norm_value(value)


    def learn(self, endclock):
        self._run_anomaly(endclock, "add")

    def detect(self, endclock):
        self._run_anomaly(endclock, "calc")


    # method="add"|"calc"
    def _run_anomaly(self, endclock, method="add"):
        # Prepare data
        if method == "add":
            g_data = self.dgetter.g_get_history(endclock)
        if method == "calc":
            g_data = self.dgetter.g_copy_history(endclock)
        datadict = {}

        for row in g_data:
            (itemid, clock, value) = row
            itemid = str(itemid)

            if datadict.has_key(itemid):
                self._juba_proc(clock, datadict, method)
                datadict = {}

            if not datadict.has_key("hostid"):
                (hour, weekday) = self.expand_clock(clock)
                datadict["hostid"] = self.hostid
                datadict["weekday"] = weekday*1.0/7
                datadict["hour"] = hour*1.0/24


            datadict[itemid] = value

        if len(datadict) > 0:
            self._juba_proc(clock, datadict, method)

        if method=="add":
            cf.log("Saving learned model")
            self.anom.save("latest")
            self.dgetter.remove_history(endclock)


    def _juba_proc(self, clock, datadict, method="add"):
        #if DEBUG:
        #    print datadict
        datum = Datum()
        for k in datadict.keys():
            #print "key:%s value:%s" % (str(k), str(datadict[k]))
            if k == "hostid":
                datum.add_number(str(k), int(datadict[k])*1.0/ZBX_ITEMID_DIGITS)
            elif k == "weekday" or k == "hour":
                datum.add_number(str(k), datadict[k])
            elif k != "hostid" and k != "weekday" and k != "hour":
                datum.add_number(str(k), self.norm(k, datadict[k]))
        #print datum

        retry_count = JUBA_RETRY_MAX
        while True:
            try:
                if method=="add":
                    print datum  # debug leftover
                    ret = self.anom.add(datum)
                    exit()  # debug leftover: aborts after the first datum is added
                if method=="calc":
                    print datum
                    score = self.anom.calc_score(datum)
                    if score == float('Inf') or score > ML_LIMIT:
                        #print datadict
                        if self.alarm_on == False:
                            self.alarm_on = True
                            cf.log("[%s] score=%f" % (cf.clock2strjst(clock), score))
                    else:
                        if self.alarm_on == True:
                            self.alarm_on = False
                            cf.log("[%s] score recovered to normal:score=%f" % (cf.clock2strjst(clock), score))

                break
            except (msgpackrpc.error.TransportError, msgpackrpc.error.TimeoutError) as e:
                retry_count -= 1
                if retry_count <= 0:
                    raise
                self.anom.get_client().close()
                self.set_anom()

                print e
                time.sleep(JUBA_RETRY_INTERVAL)
                continue


    def expand_clock(self, clock):
        d = cf.clock2datetime(clock)
        hour = d.hour
        weekday = d.weekday()
        return (hour, weekday)


    def exec_selsql(self, strsql):
        return sf.exec_selsql(MYSQL_JUBATUS, strsql)
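
_run_anomaly() turns each clock into normalized weekday/7 and hour/24 features before handing a Datum to Jubatus. A self-contained illustration of that expansion (local time here, while the original converts clocks via cf.clock2datetime):

from datetime import datetime

def expand_clock(clock):
    # Same idea as ZabbixAnomaly.expand_clock: Unix time -> (hour, weekday).
    d = datetime.fromtimestamp(clock)  # local time; the original uses JST
    return d.hour, d.weekday()

hour, weekday = expand_clock(1700000000)
print({"weekday": weekday / 7.0, "hour": hour / 24.0})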