def vectorize_subset(subset):
    # Check that the raw EMBER dataset files are present
    data_dir = os.path.join(os.getcwd(), 'ember')
    if subset == 'train':
        paths = [
            os.path.join(data_dir, "train_features_{}.jsonl".format(i))
            for i in range(6)
        ]
        n_rows = 900000
    elif subset == 'test':
        paths = [
            os.path.join(data_dir, "test_features.jsonl"),
        ]
        n_rows = 200000
    else:
        logging.error('subset must be "train" or "test"')
        sys.exit(1)

    for p in paths:
        if not os.path.exists(p):
            logging.error('File not found: {}'.format(p))
            sys.exit(1)

    X_path = os.path.join(data_dir, "X_{}.dat".format(subset))
    y_path = os.path.join(data_dir, "y_{}.dat".format(subset))

    # Skip vectorization if the cached files match their stored checksums
    if os.path.exists(X_path + '.shd256') and os.path.exists(y_path + '.shd256'):
        with open(X_path + '.shd256', 'r') as f:
            X_checksum = f.read()
        with open(y_path + '.shd256', 'r') as f:
            y_checksum = f.read()
        if (X_checksum == sha256_checksum(X_path)
                and y_checksum == sha256_checksum(y_path)):
            logging.info('"{}" subset is already vectorized'.format(subset))
            return

    # Allocate storage space
    dim = FeatureExtractor.dim
    X = np.memmap(X_path, dtype=np.float32, mode="w+", shape=(n_rows, dim))
    y = np.memmap(y_path, dtype=np.float32, mode="w+", shape=n_rows)
    del X, y

    logging.info('Vectorizing samples in "{}" subset'.format(subset))
    pool = mp.Pool()
    arg_iterator = ((row, raw_data, X_path, y_path, n_rows)
                    for row, raw_data in enumerate(raw_feature_iterator(paths)))
    for _ in tqdm(pool.imap_unordered(vectorize_data, arg_iterator),
                  unit='row', unit_scale=True, ncols=96, miniters=1,
                  total=n_rows):
        pass
    pool.close()
    pool.join()

    # Record checksums so future runs can skip this step
    X_checksum = sha256_checksum(X_path)
    with open(X_path + '.shd256', 'w') as f:
        f.write(X_checksum)
    y_checksum = sha256_checksum(y_path)
    with open(y_path + '.shd256', 'w') as f:
        f.write(y_checksum)
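# vectorize_subset() -- like most of the snippets below -- calls a
# sha256_checksum() helper that is defined elsewhere in its project.
# A minimal sketch consistent with the file-hashing call sites follows;
# the 65536-byte block size is an assumption, not taken from the source.
import hashlib

def sha256_checksum(filename, block_size=65536):
    """Hex SHA-256 digest of a file, read in fixed-size chunks to bound memory use."""
    sha256 = hashlib.sha256()
    with open(filename, 'rb') as f:
        for block in iter(lambda: f.read(block_size), b''):
            sha256.update(block)
    return sha256.hexdigest()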
def scan(file_path, verbose, table, proxy, callback, eshost, timeout, dump, output):
    """Malice PDF Plugin."""
    try:
        # set up logging
        init_logging(verbose)

        # TODO: check if PDF is too big (max size 3000000 ??)
        # TODO: if PDFiD fails maybe build different response JSON with errors etc.
        pdfid_results = MalPDFiD(file_path).run()
        pdf_results = {
            'pdfid': pdfid_results,
            'streams': MalPdfParser(file_path,
                                    pdfid_results,
                                    should_dump=dump,
                                    dump_path=output,
                                    verbose=verbose).run(),
        }
        # pdf_dict['pdf']['peepdf'] = MalPeepdf(file_path).run()

        malice_scan = {
            'id': os.environ.get('MALICE_SCANID', sha256_checksum(file_path)),
            'name': 'pdf',
            'category': 'document',
            'results': pdf_results
        }
        malice_scan['results']['markdown'] = json2markdown(pdf_results)

        # write to elasticsearch
        if eshost:
            try:
                e = Elastic(eshost, timeout=timeout)
                e.write(results=malice_scan)
            except Exception:
                log.exception("failed to index malice/pdf results into elasticsearch")

        if table:
            print(malice_scan['results']['markdown'])
        else:
            print(json.dumps(pdf_results, indent=True))

        # POST dropped files as a JSON blob back to malice server/daemon
        if callback:
            proxies = None
            if proxy:
                proxies = {
                    'http': proxy,
                    'https': proxy,
                }
            malice_scan['parent'] = os.environ.get('MALICE_SCANID',
                                                   sha256_checksum(file_path))
            requests.post(callback, json=malice_scan, proxies=proxies)
    except Exception:
        log.exception("failed to run malice plugin: pdf")
        return
def scan(file_path, verbose, table, proxy, callback, eshost, timeout, dump, output, peid):
    """Malice PExecutable Scanner"""
    try:
        # set up logging
        init_logging(verbose)

        # TODO: check if EXE is too big (max size 3000000 ??)
        pe_results = MalPEFile(file_path,
                               peid_db_path=peid,
                               should_dump=dump,
                               dump_path=output).run()

        malice_scan = {
            'id': os.environ.get('MALICE_SCANID', sha256_checksum(file_path)),
            'name': 'pescan',
            'category': 'exe',
            'results': pe_results
        }
        try:
            malice_scan['results']['markdown'] = json2markdown(pe_results)
        except Exception as e:
            log.exception("failed to render jinja template")
            # e.message was removed in Python 3; str(e) is portable
            malice_scan['results']['markdown'] = str(e)

        # write to elasticsearch
        if eshost:
            try:
                e = Elastic(eshost, timeout=timeout)
                e.write(results=malice_scan)
            except Exception:
                log.exception("failed to index malice/pescan results into elasticsearch")

        if table:
            print(malice_scan['results']['markdown'])
        else:
            pe_results.pop('markdown')
            print(json.dumps(pe_results, indent=True))

        # POST dropped files as a JSON blob back to malice server/daemon
        if callback:
            proxies = None
            if proxy:
                proxies = {
                    'http': proxy,
                    'https': proxy,
                }
            malice_scan['parent'] = os.environ.get('MALICE_SCANID',
                                                   sha256_checksum(file_path))
            requests.post(callback, json=malice_scan, proxies=proxies)
    except Exception:
        log.exception("failed to run malice plugin: pescan")
        return
def scan(file_path, verbose, table, proxy, callback, eshost, timeout, dump, output):
    """Malice Office/OLE/RTF Scanner"""
    try:
        # set up logging
        init_logging(verbose)

        # TODO: check if DOC is too big (max size 3000000 ??)
        o_results = MalOffice(file_path, should_dump=dump, dump_path=output).run()

        malice_scan = {
            'id': os.environ.get('MALICE_SCANID', sha256_checksum(file_path)),
            'name': 'office',
            'category': 'document',
            'results': o_results
        }
        malice_scan['results']['markdown'] = json2markdown(o_results)

        # write to elasticsearch
        if eshost:
            try:
                e = Elastic(eshost, timeout=timeout)
                e.write(results=malice_scan)
            except Exception:
                log.exception("failed to index malice/office results into elasticsearch")

        if table:
            print(malice_scan['results']['markdown'])
        else:
            o_results.pop('markdown')
            print(json.dumps(o_results, indent=True))

        # POST dropped files as a JSON blob back to malice server/daemon
        if callback:
            proxies = None
            if proxy:
                proxies = {
                    'http': proxy,
                    'https': proxy,
                }
            malice_scan['parent'] = os.environ.get('MALICE_SCANID',
                                                   sha256_checksum(file_path))
            requests.post(callback, json=malice_scan, proxies=proxies)
    except Exception:
        log.exception("failed to run malice plugin: office")
        return
def addToIndex(cls, **kwargs):
    image_hash = util.sha256_checksum(kwargs['filename'])
    picture = session.query(cls).filter_by(file_hash=image_hash)
    if picture.scalar():
        util.logger.debug('Attempted to add existing image to index, skipping...')
        return False

    image = ColorThief(kwargs['filename'])
    dominant_color = image.get_color(quality=6)

    cls.create(
        filename=kwargs['filename'],
        file_hash=image_hash,  # reuse the hash computed above
        color=json.dumps(dominant_color),
        ts_indexed=datetime.datetime.now())
    session.commit()
def __calc_doubt_dest__(self, factor):
    if self.doubts is None:
        return None
    # name the file after its content hash, keeping the original extension
    _noext, ext = os.path.splitext(factor.src)
    digest = sha256_checksum(factor.src)
    return os.path.join(self.doubts, digest + ext)
def save_numpy_file(subset):
    data_dir = os.path.join(os.getcwd(), 'ember')
    if subset == 'train':
        n_rows = 900000
    elif subset == 'test':
        n_rows = 200000
    else:
        logging.error('subset must be "train" or "test"')
        sys.exit(1)

    X_npy = os.path.join(data_dir, "X_{}.npy".format(subset))
    y_npy = os.path.join(data_dir, "y_{}.npy".format(subset))

    # Skip if the .npy files already exist and match their stored checksums
    if os.path.exists(X_npy + '.shd256') and os.path.exists(y_npy + '.shd256'):
        with open(X_npy + '.shd256', 'r') as f:
            X_checksum = f.read()
        with open(y_npy + '.shd256', 'r') as f:
            y_checksum = f.read()
        if (X_checksum == sha256_checksum(X_npy)
                and y_checksum == sha256_checksum(y_npy)):
            logging.info('Numpy files for "{}" subset already exist'.format(subset))
            return

    logging.info('Saving numpy files for labeled samples in "{}" subset'.format(subset))
    dim = FeatureExtractor.dim
    X_dat = os.path.join(data_dir, "X_{}.dat".format(subset))
    y_dat = os.path.join(data_dir, "y_{}.dat".format(subset))
    X = np.memmap(X_dat, dtype=np.float32, mode="r", shape=(n_rows, dim))
    y = np.memmap(y_dat, dtype=np.float32, mode="r", shape=n_rows)

    # Keep only labeled samples (unlabeled rows are marked with -1)
    labeled_rows = (y != -1)
    np.save(X_npy, X[labeled_rows])
    np.save(y_npy, y[labeled_rows])

    X_checksum = sha256_checksum(X_npy)
    with open(X_npy + '.shd256', 'w') as f:
        f.write(X_checksum)
    y_checksum = sha256_checksum(y_npy)
    with open(y_npy + '.shd256', 'w') as f:
        f.write(y_checksum)
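# Minimal usage sketch (an assumption, not from the source): once
# vectorize_subset() and save_numpy_file() have run, the cached labeled
# arrays can be loaded directly, following the file-name convention above.
import os
import numpy as np

data_dir = os.path.join(os.getcwd(), 'ember')
X_train = np.load(os.path.join(data_dir, 'X_train.npy'))
y_train = np.load(os.path.join(data_dir, 'y_train.npy'))
assert X_train.shape[0] == y_train.shape[0]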
def __determine_dest__(self, src, pref_dest):
    src_dir = path.dirname(src)
    src_bn = path.basename(src)
    src_bn_noext, src_ext = path.splitext(src_bn)
    dst_dir = path.dirname(pref_dest)
    dst_bn = path.basename(pref_dest)
    dst_bn_noext, dst_ext = path.splitext(dst_bn)
    op_ctx = OpContext(src)

    if path.exists(pref_dest):
        if self.hash_check:
            # Look for an existing sibling file with identical content
            src_hash = sha256_checksum(src)
            for sibling in os.listdir(dst_dir):
                if sibling.startswith(dst_bn_noext) and sibling.endswith(dst_ext):
                    sibling_hash = sha256_checksum(path.join(dst_dir, sibling))
                    if sibling_hash == src_hash:
                        op_ctx.duplication = path.join(dst_dir, sibling)
                        return op_ctx
        # No identical file found: pick the first non-colliding renamed path
        idx = 1
        while True:
            new_dst_bn = dst_bn_noext + self.rename_pattern % (idx) + dst_ext
            new_dst = path.join(dst_dir, new_dst_bn)
            if not path.exists(new_dst):
                op_ctx.dest = new_dst
                return op_ctx
            idx += 1
    else:
        op_ctx.dest = pref_dest
        return op_ctx
def __init__(self, file_path, should_dump=False, dump_path=None):
    self.file = file_path
    # check existence before hashing/reading the file
    if not path.exists(self.file):
        raise Exception("file does not exist: {}".format(self.file))
    self.sha256 = sha256_checksum(self.file)
    with open(file_path, 'rb') as f:
        self.data = f.read()
    self.dump = None
    self.results = {}
    if should_dump:
        if path.isdir(dump_path):
            self.dump = dump_path
        else:
            log.error("folder does not exist: {}".format(dump_path))
            self.dump = None
print "------------------------------------" print "zookeeper configuratioins : %s %s" %(java64_home,zookeeper_hosts) # zookeeper configuratioins : /usr/jdk64/jdk1.8.0_112 [u'master', u'node1', u'node2'] print "------------------------------------" all_hosts = default("/clusterHostInfo/all_hosts", []) all_racks = default("/clusterHostInfo/all_racks", []) cluster_name = config["clusterName"] # clickhouse-config.xml clickhouse_config_json_template = config['configurations']['clickhouse-config'] tcp_port = config['configurations']['clickhouse-config']['tcp_port'] users_config = config['configurations']['clickhouse-config']['users_config'] clickhouse_data_path = config['configurations']['clickhouse-config']['path'] # clickhouse-metrika cluster configurations clickhouse_metrika_json_template = config['configurations']['clickhouse-metrika'] # clickhouse-user configurations clickhouse_users_json_template = config['configurations']['clickhouse-users']['clickhouse_users'] user_admin = config['configurations']['clickhouse-users']['user_admin'] user_admin_password = config['configurations']['clickhouse-users']['user_admin_password'] user_admin_password_sha256 = utils.sha256_checksum(user_admin_password) user_ck = config['configurations']['clickhouse-users']['user_ck'] user_ck_password = config['configurations']['clickhouse-users']['user_ck_password'] user_ck_password_sha256 = utils.sha256_checksum(user_ck_password)
        if args.verbose:
            print('Scanning %s' % dirName)
    except UnicodeEncodeError:
        continue

    for filename in fileList:
        full_path = os.path.join(dirName, filename)
        try:
            # identify the file type with file(1)
            p = subprocess.Popen(['file', '--brief', full_path],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            output, errors = p.communicate()
            file_type = str(output.strip().decode())
            if "exe" in file_type:
                file_hash = utils.sha256_checksum(full_path)
                file_info = dict()
                file_info["path"] = full_path
                file_info["type"] = file_type
                file_info["sha256"] = file_hash
                status = utils.check_file_status(file_info)
                if status == utils.FILE_UNKNOWN:
                    utils.add_file_to_db(file_info)
                elif status == utils.FILE_KNOWN_TOUCHED:
                    utils.add_alert_do_db(file_info)
                elif status == utils.FILE_KNOWN_UNTOUCHED:
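# An alternative to shelling out to file(1) is the python-magic binding.
# A hedged sketch, assuming the python-magic package is installed; this is
# not what the source does, just an equivalent in-process file-type check.
import magic

def is_executable(full_path):
    file_type = magic.from_file(full_path)  # e.g. "PE32 executable ..."
    return "exe" in file_type.lower()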
def __init__(self, file_path):
    self.file = file_path
    self.sha256 = sha256_checksum(self.file)
    self.oPDFiD = None