def process(params):
    # load and run pipeline
    datasource = CSVDataSource(params.input_file)
    p = Pipeline('DEV')
    p.load_config(params.conf_file)
    p.load_model(params.model_file)
    p(datasource)
    # save, if necessary
    if params.output_file:
        datasource.save(open(params.output_file, 'w'))
    # push to elasticsearch
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}],
                       http_auth=('admin', 'admin'))
    data = [item for item in datasource]
    for item in data:
        item['model'] = p._scoring_model_name
        item['raw'] = str(item['labels'])
        # normalize NaN values so they serialize cleanly
        for key in item:
            if item[key] == 'NaN' or (is_numeric(item[key]) and np.isnan(item[key])):
                item[key] = None
    helpers.bulk(es, data, index="anomalies", doc_type="type")
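# Illustrative CLI wiring for the scoring process() above. This is a minimal
# sketch: the flag names and the argparse setup are assumptions, not the
# project's actual entry point. It only documents which attributes process()
# reads from its params object: input_file, conf_file, model_file, output_file.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Score a CSV file with a trained OSAS pipeline')
    parser.add_argument('--input-file', dest='input_file', required=True, help='CSV file to score')
    parser.add_argument('--conf-file', dest='conf_file', required=True, help='pipeline configuration file')
    parser.add_argument('--model-file', dest='model_file', required=True, help='pretrained pipeline model (JSON)')
    parser.add_argument('--output-file', dest='output_file', default=None, help='optional CSV output path')
    process(parser.parse_args())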
def process(params):
    # load configuration, build the pipeline model and persist it
    datasource = CSVDataSource(params.input_file)
    p = Pipeline('DEV')
    p.load_config(params.conf_file)
    model = p.build_pipeline(datasource)
    json.dump(model, open(params.model_file, 'w'))
def process(params):
    datasource = CSVDataSource(params.input_file)
    sys.stdout.write('Preprocessing')
    field_type = _detect_field_type(datasource)
    sys.stdout.write('\t::Detected field types:\n')
    for key in field_type:
        sys.stdout.write('\t\t"{0}": {1}\n'.format(key, field_type[key]))
    generators = _get_generators(datasource, field_type)
    sys.stdout.write('\t::Suggested generators:\n')
    for item in generators:
        sys.stdout.write('\t\t{0}: {1}\n'.format(item[0], item[1]))
    _write_conf(generators, params.output_file)
                label_list.append(label)
            all_labels.append(label_list)
        dataset[dest_field_labels] = all_labels
        dataset['_labels'] = all_labels
        if self._detect_anomalies is not None:
            scores = self._detect_anomalies(dataset)
            dataset[dest_field_score] = scores


if __name__ == '__main__':
    # quick smoke test: build a pipeline, apply it and report timings
    p = Pipeline('DEV')
    p.load_config('tests/pipeline_test.conf')
    import time

    ts1 = time.time()
    datasource = CSVDataSource('tests/test_small.csv')
    ts2 = time.time()
    pipeline_model = p.build_pipeline(datasource)
    ts3 = time.time()
    p(datasource)
    ts4 = time.time()
    json.dump(pipeline_model, open('tests/pipeline.json', 'w'), indent=4)
    for item in datasource[:10]:
        print(item)
    print()
    print()
    print(
        "Timing:\n\tLoad dataset: {0}\n\tBuild pipeline: {1}\n\tApply models: {2}\n\tDataset size: {3} entries\n".format(
            ts2 - ts1, ts3 - ts2, ts4 - ts3, len(datasource)))
    @staticmethod
    def from_pretrained(pretrained: str) -> AnomalyDetection:
        # restore the pickled classifier and its metadata from the JSON blob
        tmp = json.loads(pretrained)
        pre_model = pickle.loads(base64.b64decode(tmp['model']))
        model = SupervisedClassifierAnomaly()
        model._encoder = pre_model['encoder']
        model._ind_to_ground_truth = pre_model['ind_to_ground_truth']
        model._is_binary_preds = pre_model['is_binary_preds']
        model._model = pre_model['classifier']
        return model


if __name__ == "__main__":
    from osas.data.datasources import CSVDataSource

    data_source = CSVDataSource('corpus/hubble_test_tags.csv')

    # labels are stored as stringified Python lists in the CSV
    def converter(x):
        return ast.literal_eval(x)

    data_source._data['_labels'] = data_source._data['_labels'].apply(
        lambda x: converter(x))

    # round-trip the model through its pretrained representation
    model = StatisticalNGramAnomaly()
    tmp = model.build_model(data_source)
    tmp = json.dumps(tmp)
    model2 = StatisticalNGramAnomaly.from_pretrained(tmp)
    scores = model(data_source)
    scores2 = model2(data_source)
    import operator
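    # A minimal sanity check (an assumption, not part of the original test):
    # a correct pretrained round-trip should reproduce the scores, so the two
    # scoring passes are compared element-wise.
    for s1, s2 in zip(scores, scores2):
        assert abs(s1 - s2) < 1e-8, 'pretrained round-trip changed the scores'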
    reg_lab = [(regex, label) for regex, label in zip(regex_list, label_list)]
    kblg = KnowledgeBased(reg_lab, field_name)
    return kblg


if __name__ == '__main__':
    mfc = MultinomialFieldCombiner(['user', 'parent_process'],
                                   absolute_threshold=500,
                                   relative_threshold=0.005)
    nfc = NumericField('count')
    tf = TextField('command', lm_mode='token', ngram_range=(3, 5))
    klg = KeywordBased(
        keyword_list=['bash', 'java', 'netcat', 'sudo', 'apache2'],
        field_name='command')

    from osas.data.datasources import CSVDataSource

    dataset = CSVDataSource('corpus/test.csv')
    print("Building model")
    klg.build_model(dataset)
    print("Done")
    # rez = mfc.build_model(dataset)
    for item in dataset[:20]:
        print("\n\n")
        print(item)
        print("")
        print(klg(item))
        print("\n\n")
        print("=" * 20)