def _maybe_extract(data_dir, extracted_data, archive):
    # If data_dir/extracted_data does not exist, extract archive in data_dir
    if not gfile.Exists(path.join(data_dir, extracted_data)):
        print('Extracting "%s"...' % archive)
        with tarfile.open(archive) as tar:
            members = list(tar.getmembers())
            for i, member in enumerate(members):
                print_progress(i + 1, len(members))
                tar.extract(member, path=data_dir)
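# print_progress() is called throughout these snippets but its definition is not shown.
# Below is a minimal, hypothetical sketch of a compatible helper (the original may differ);
# it merely accepts every call signature used in this file and the ones below: a
# current/total pair, an optional prefix, and an optional estimated_time in seconds.
import sys

def print_progress(iteration, total, prefix='', estimated_time=None):
    # Render a single-line, carriage-return-based progress indicator
    percent = 100.0 * iteration / total if total else 100.0
    eta = '' if estimated_time is None else ' (~{}s left)'.format(estimated_time)
    sys.stdout.write('\r{} {}/{} ({:.1f}%){}'.format(prefix, iteration, total, percent, eta))
    sys.stdout.flush()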
def process(input, output, silence_length, silence_thresh, temp_dir, padding, language,
            ambient_noise, keep_temporary, silent, seek_step, max_workers):
    # open the audio file stored in file system
    speech = AudioSegment.from_file(input)
    if not silent:
        print('Splitting (this could take a while...)')
    # split track where silence is <silence-length> ms. or bigger
    chunks = split_on_silence(
        speech,
        # must be silent for at least <silence-length> ms.
        min_silence_len=silence_length,
        # consider it silent if quieter than <silence-thresh> dBFS
        silence_thresh=silence_thresh,
        seek_step=seek_step)
    total = len(chunks)
    # create temporary dir if it doesn't exist
    try:
        os.mkdir(temp_dir)
    except FileExistsError:
        pass
    # Create <padding> ms silence chunk
    silence = AudioSegment.silent(duration=padding)
    futures = []
    try:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # process each chunk
            for i, chunk in enumerate(chunks):
                futures.append(
                    executor.submit(
                        process_chunck,
                        silence + chunk + silence,
                        i,
                        os.path.basename(os.path.splitext(input)[0]),
                        temp_dir,
                        ambient_noise,
                        keep_temporary,
                        language))
            if not silent:
                print_progress(0, total, prefix='Converting:')
            for i, future in enumerate(as_completed(futures)):
                if future.exception():
                    # if exception was not handled abort as conversion won't be able to complete
                    executor._threads.clear()
                    thread._threads_queues.clear()
                    raise future.exception()
                if not silent:
                    print_progress(i + 1, total, prefix='Converting:')
    except Exception as e:
        sys.stderr.write('\nError: Canceling execution: {}\n'.format(e))
        sys.exit(1)
    if not silent:
        print('\nSaving...')
    with open(output, 'w+') as f:
        for text in map(lambda fut: fut.result(), futures):
            if text is not None:
                f.write('{}.\n'.format(text))
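# Hypothetical invocation of process() above, for illustration only; every argument value
# (file names, thresholds, worker count) is an assumption, not a default taken from the tool.
process(input='speech.mp3', output='transcript.txt',
        silence_length=500,      # a chunk ends after >= 500 ms of silence
        silence_thresh=-40,      # anything quieter than -40 dBFS counts as silence
        temp_dir='tmp_chunks', padding=300, language='en-US',
        ambient_noise=False, keep_temporary=False, silent=False,
        seek_step=1, max_workers=4)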
def _maybe_extract(target_dir, extracted_data, archive_path):
    # If target_dir/extracted_data does not exist, extract archive in target_dir
    extracted_path = path.join(target_dir, extracted_data)
    if not path.exists(extracted_path):
        print('No directory "%s" - extracting archive...' % extracted_path)
        with tarfile.open(archive_path) as tar:
            members = list(tar.getmembers())
            for i, member in enumerate(members):
                print_progress(i + 1, len(members))
                tar.extract(member, path=target_dir)
    else:
        print('Found directory "%s" - not extracting it from archive.' % extracted_path)
def _maybe_download(archive_name, target_dir, archive_url):
    # If archive file does not exist, download it...
    archive_path = path.join(target_dir, archive_name)
    if not path.exists(archive_path):
        print('No archive "%s" - downloading...' % archive_path)
        req = requests.get(archive_url, stream=True)
        total_size = int(req.headers.get('content-length', 0))
        done = 0
        with open(archive_path, 'wb') as f:
            for data in req.iter_content(1024 * 1024):
                done += len(data)
                f.write(data)
                print_progress(done, total_size)
    else:
        print('Found archive "%s" - not downloading.' % archive_path)
    return archive_path
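# A minimal usage sketch (not part of the original scripts): _maybe_download() and
# _maybe_extract() are meant to be chained - fetch the archive if it is missing, then
# unpack it if the extracted directory is missing. The URL and names below are placeholders.
target_dir = 'data/corpus'
archive_path = _maybe_download('corpus.tar.gz', target_dir,
                               'https://example.com/corpus.tar.gz')
_maybe_extract(target_dir, 'corpus', archive_path)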
def one_sample(sample):
    mp3_filename = path.join(*(sample[0].split('/')))
    mp3_filename = path.join(extracted_dir, mp3_filename)
    # Storing wav files next to the mp3 ones - just with a different suffix
    wav_filename = path.splitext(mp3_filename)[0] + ".wav"
    _maybe_convert_wav(mp3_filename, wav_filename)
    frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT))
    file_size = path.getsize(wav_filename)
    with lock:
        if int(frames / SAMPLE_RATE * 1000 / 10 / 2) < len(str(sample[1])):
            # Excluding samples that are too short to fit the transcript
            counter['too_short'] += 1
        elif frames / SAMPLE_RATE > MAX_SECS:
            # Excluding very long samples to keep a reasonable batch-size
            counter['too_long'] += 1
        else:
            # This one is good - keep it for the target CSV
            rows.append((wav_filename, file_size, sample[1]))
        print_progress(counter['all'], num_samples)
        counter['all'] += 1
def match(data_in, fp_in, ip=None, force=False, binary=False, log_path=None, test=False, print_report=False, latex=False):
    data = load_data(data_in)
    fps = joblib.load(fp_in)
    method_names = list(fps["method_fingerprints"].keys())
    print("Matching ...")
    if log_path:
        log_file = open(log_path, "w")
    results = []
    if not ip:
        for method_name in method_names:
            if not methods.methods.get(method_name):
                print("Warning: no such method '{}'".format(method_name))
                continue
            if not fps["method_fingerprints"].get(method_name):
                print("Warning: the fingerprint file does not contain a fingerprint for method '{}'".format(method_name))
                continue
            method = methods.methods[method_name]
            method.use_fingerprints(fps["method_fingerprints"][method_name])
            if test:
                configs = method.get_configs()
            else:
                configs = [method.get_default_config()]
            num_matched = 0
            for conf in configs:
                method.use_config(conf)
                start = time.time()
                num_matched = 0
                y_true = []
                y_pred = []
                labels = []
                print_progress(0, len(data))
                if test:
                    # cannot use a pool in test because the diff method needs to cache results,
                    # which won't work with multiprocessing
                    match_map = map(functools.partial(method.match, force=force, test=test), data.values())
                else:
                    pool = multiprocessing.Pool(4)
                    match_map = pool.imap_unordered(functools.partial(method.match, force=force, test=test), data.values())
                count = 0
                last_count = 0
                time_window_start = time.time()
                time_left = None
                for host, matches in match_map:
                    if count > 0:
                        if time.time() - time_window_start > 2:
                            elapsed = time.time() - time_window_start
                            avg_host_time = elapsed / (count - last_count)
                            time_left = int(avg_host_time * (len(data) - count))
                            last_count = count
                            time_window_start = time.time()
                    count += 1
                    print_progress(count, len(data), estimated_time=time_left)
                    host_labels = host.label_str()
                    if (method.is_binary_classifier() or binary) and host_labels != "unlabeled":
                        host_labels = "malicious"
                    if host_labels not in labels:
                        labels.append(host_labels)
                    match_labels = Label.to_str(matches)
                    if (method.is_binary_classifier() or binary) and match_labels != "unlabeled":
                        match_labels = "malicious"
                    if match_labels not in labels:
                        labels.append(match_labels)
                    if match_labels != "unlabeled" and not print_report:
                        print("\x1b[2K\r{}: {}".format(host.ip, match_labels))
                    y_true.append(labels.index(host_labels))
                    y_pred.append(labels.index(match_labels))
                if not test:
                    pool.close()
                end = time.time()
                if print_report:
                    report = classification_report(y_true, y_pred, target_names=labels, zero_division=0, digits=5,
                                                   output_dict=True if latex else False)
                    if latex:
                        report = report_to_latex_table(report)
                    perf_text = " ----- Performance result -----\n"
                    perf_text += "Method: {}\n".format(method_name)
                    perf_text += "Config: " + ", ".join("{} = {}".format(k, v) for k, v in conf.items()) + "\n"
                    perf_text += "Classification report:\n"
                    perf_text += str(report) + "\n"
                    perf_text += "Confusion Matrix (x-axis: guess, y-axis: true):\n"
                    perf_text += "Labels: {}\n".format(labels)
                    perf_text += str(confusion_matrix(y_true, y_pred)) + "\n"
                    perf_text += "Took {} seconds to perform".format(end - start)
                    perf_text += "\n\n"
                    precision = precision_score(y_true, y_pred, average="micro")
                    results.append({"method": method_name, "config": conf, "precision": precision})
                    if log_path:
                        log_file.write(perf_text)
                        log_file.flush()
                    print("")
                    print(perf_text)
            if hasattr(method, "post_match"):
                method.post_match()
        # if two or more methods were used, print precision ranking
        if len(results) > 1:
            result_text = " ----- Best performing method/config -----\n"
            for i, result in enumerate(sorted(results, key=lambda k: k["precision"], reverse=True)):
                result_text += "{}.\n".format(i + 1)
                result_text += "Method: {}\n".format(result["method"])
                result_text += "Config: " + ", ".join("{} = {}".format(k, v) for k, v in result["config"].items()) + "\n"
                result_text += "Precision: {}\n\n".format(result["precision"])
            if log_path:
                log_file.write(result_text)
            print(result_text)
    else:
        host = data.get(ip)
        if not host:
            print("Error: No host {} exists in data file.".format(ip))
            sys.exit(1)
    if log_path:
        log_file.close()
def database_extract(output, database, label_path, pcap_path, keep):
    host_map = {}
    tls_map = {}
    for db_file in database:
        print("Extracting data from {} ...".format(db_file))
        try:
            # fail early if the file does not exist (sqlite3.connect would silently create it)
            open(db_file, "r")
            dbh = sqlite3.connect(db_file)
        except:
            print("error: Failed opening database '{}'.".format(db_file))
            sys.exit(1)
        dbh.row_factory = sqlite3.Row
        curse = dbh.cursor()
        curse.execute("SELECT COUNT(*) FROM Probe;")
        total_rows = curse.fetchone()[0]
        curse.execute("SELECT * FROM Probe;")
        processed_rows = 0
        while True:
            row = curse.fetchone()
            print_progress(processed_rows, total_rows)
            processed_rows += 1
            if not row:
                break
            ip = row["ip"]
            uuid = row["uuid"]
            if not host_map.get(ip):
                host_map[ip] = modules.host.Host(ip, uuid)
            if keep != "both" and host_map[ip].uuid != uuid:
                if keep == "old":
                    # don't use the probe data that comes from the newer scan
                    continue
                elif keep == "new":
                    # keep the newer scan, trash the older probe data
                    host_map[ip] = modules.host.Host(ip, uuid)
                    if ip in tls_map:
                        del tls_map[ip]
            module_name = row["name"]
            port = row["port"]
            if port == 0:
                mod_obj = modules.get_module(module_name)
                if not mod_obj:
                    continue
                # ip module stuff
                mod_obj.add_data(row)
                if mod_obj.name == "geoip":
                    host_map[ip].geoip = mod_obj
                elif mod_obj.name == "rdns":
                    host_map[ip].rdns = mod_obj
            else:
                # port module stuff
                if module_name == "tls":
                    if ip not in tls_map:
                        tls_map[ip] = {}
                    port_obj = tls_map[ip].get(port)
                    if not port_obj:
                        port_obj = modules.get_port("tls", port)
                        tls_map[ip][port] = port_obj
                else:
                    port_obj = host_map[ip].ports.get(port)
                    if not port_obj:
                        port_obj = modules.get_port(module_name, port)
                        host_map[ip].insert_port(port_obj)
                try:
                    port_obj.add_data(row)
                except Exception as e:
                    print("Error adding data for {}:{}".format(ip, port))
                    import traceback
                    traceback.print_exc()
                    sys.exit(1)
        curse.close()
        print("")

    # adding tls module to ports
    for ip, port_map in tls_map.items():
        for port, tls in port_map.items():
            port_obj = host_map[ip].ports.get(port)
            if not port_obj:
                port_obj = modules.get_port("generic", port)
                host_map[ip].insert_port(port_obj)
            port_obj.tls = tls

    # remove IPs that don't have any ports open, or where no port gives any response
    print("Filtering hosts without any ports open")
    remove_ip = set()
    for ip in host_map:
        if len(host_map[ip].ports) == 0:
            # TODO: add a flag that decides whether to exclude this or not
            #print("{}: No ports open, omitting".format(ip))
            remove_ip.add(ip)
            continue
        """if len(host_map[ip].responsive_ports()) == 0:
            # TODO: add a flag that decides whether to exclude this or not
            print("{}: No ports responded, omitting".format(ip))
            remove_ip.append(ip)
            continue"""
    for ip in remove_ip:
        del host_map[ip]
    print("Filtered {} hosts".format(len(remove_ip)))

    # add labels to hosts
    if label_path:
        print("Adding labels to hosts")
        with open(label_path, "r") as f:
            line = f.readline()
            while line != "":
                csv = line.strip().split(",")
                line = f.readline()
                if len(csv) != 4:
                    continue
                mwdb_id, ip, port, family = csv
                if ip in host_map:
                    try:
                        port = int(port)
                    except:
                        # some C2s don't have a port specified in the label
                        port = None
                    host_map[ip].add_label(mwdb_id, family, port)

    # remove labels whose port is not open, and remove the IP if it loses all its labels,
    # since that means the relevant (C2-acting) port is closed
    print("Filtering hosts without any label ports open")
    remove_ip = set()
    for ip in host_map:
        if host_map[ip].filter_labels():
            remove_ip.add(ip)
    for ip in remove_ip:
        del host_map[ip]
    print("Filtered {} hosts".format(len(remove_ip)))

    if pcap_path:
        print("Adding pcap data...")
        pcap_extract(pcap_path, host_map)

    # TODO: serialize host object
    print("{} hosts processed".format(len(host_map)))
    print("Saving data to file {} ...".format(output))
    joblib.dump(host_map, output)
    dbh.close()
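# Hypothetical call to database_extract() above; the file names are placeholders. The
# `keep` argument controls how repeated probes of the same IP are handled ("old", "new"
# or "both", as read from the code above).
database_extract(output='hosts.joblib',
                 database=['scan.db'],
                 label_path='labels.csv',
                 pcap_path=None,
                 keep='both')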
def _maybe_convert_set(extracted_dir, source_csv, target_csv):
    print()
    if path.exists(target_csv):
        print('Found CSV file "%s" - not importing "%s".' % (target_csv, source_csv))
        return
    print('No CSV file "%s" - importing "%s"...' % (target_csv, source_csv))
    train_dir = path.join(extracted_dir, TRAIN_DIR_NAME)
    dev_dir = path.join(extracted_dir, DEV_DIR_NAME)
    test_dir = path.join(extracted_dir, TEST_DIR_NAME)
    train_files = glob(path.join(train_dir, '*.mp3'))
    dev_files = glob(path.join(dev_dir, '*.mp3'))
    test_files = glob(path.join(test_dir, '*.mp3'))
    samples = []
    with open(source_csv) as source_csv_file:
        reader = csv.DictReader(source_csv_file)
        for row in reader:
            if ((TRAIN_CSV_NAME in source_csv and
                 any(str(row['filename']) in train_file for train_file in train_files)) or
                (DEV_CSV_NAME in source_csv and
                 any(str(row['filename']) in dev_file for dev_file in dev_files)) or
                (TEST_CSV_NAME in source_csv and
                 any(str(row['filename']) in test_file for test_file in test_files))):
                samples.append((row['filename'], row['text']))

    # Mutable counters for the concurrent embedded routine
    counter = {'all': 0, 'too_short': 0, 'too_long': 0}
    lock = Lock()
    num_samples = len(samples)
    rows = []

    def one_sample(sample):
        mp3_filename = path.join(*(sample[0].split('/')))
        mp3_filename = path.join(extracted_dir, mp3_filename)
        # Storing wav files next to the mp3 ones - just with a different suffix
        wav_filename = path.splitext(mp3_filename)[0] + ".wav"
        _maybe_convert_wav(mp3_filename, wav_filename)
        frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT))
        file_size = path.getsize(wav_filename)
        with lock:
            if int(frames / SAMPLE_RATE * 1000 / 10 / 2) < len(str(sample[1])):
                # Excluding samples that are too short to fit the transcript
                counter['too_short'] += 1
            elif frames / SAMPLE_RATE > MAX_SECS:
                # Excluding very long samples to keep a reasonable batch-size
                counter['too_long'] += 1
            else:
                # This one is good - keep it for the target CSV
                rows.append((wav_filename, file_size, sample[1]))
            print_progress(counter['all'], num_samples)
            counter['all'] += 1

    print('Importing mp3 files...')
    pool = Pool(cpu_count())
    pool.map(one_sample, samples)
    pool.close()
    pool.join()
    print_progress(num_samples, num_samples)

    print('Writing "%s"...' % target_csv)
    with open(target_csv, 'w') as target_csv_file:
        writer = csv.DictWriter(target_csv_file, fieldnames=FIELDNAMES)
        writer.writeheader()
        for i, row in enumerate(rows):
            filename, file_size, transcript = row
            print_progress(i + 1, len(rows))
            writer.writerow({'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript})

    print('Imported %d samples.' % (counter['all'] - counter['too_short'] - counter['too_long']))
    if counter['too_short'] > 0:
        print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])
    if counter['too_long'] > 0:
        print('Skipped %d samples that were longer than %d seconds.' % (counter['too_long'], MAX_SECS))
def _maybe_convert_set(extracted_dir, source_csv, target_csv):
    print()
    if path.exists(target_csv):
        print('Found CSV file "%s" - not importing "%s".' % (target_csv, source_csv))
        return
    print('No CSV file "%s" - importing "%s"...' % (target_csv, source_csv))
    samples = []
    with open(source_csv) as source_csv_file:
        reader = csv.DictReader(source_csv_file)
        for row in reader:
            samples.append((row['filename'], row['text']))

    # Mutable counters for the concurrent embedded routine
    counter = {'all': 0, 'too_short': 0, 'too_long': 0}
    lock = Lock()
    num_samples = len(samples)
    rows = []

    def one_sample(sample):
        mp3_filename = path.join(*(sample[0].split('/')))
        mp3_filename = path.join(extracted_dir, mp3_filename)
        # Storing wav files next to the mp3 ones - just with a different suffix
        wav_filename = path.splitext(mp3_filename)[0] + ".wav"
        _maybe_convert_wav(mp3_filename, wav_filename)
        frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT))
        file_size = path.getsize(wav_filename)
        with lock:
            if int(frames / SAMPLE_RATE * 1000 / 10 / 2) < len(str(sample[1])):
                # Excluding samples that are too short to fit the transcript
                counter['too_short'] += 1
            elif frames / SAMPLE_RATE > MAX_SECS:
                # Excluding very long samples to keep a reasonable batch-size
                counter['too_long'] += 1
            else:
                # This one is good - keep it for the target CSV
                rows.append((wav_filename, file_size, sample[1]))
            print_progress(counter['all'], num_samples)
            counter['all'] += 1

    print('Importing mp3 files...')
    pool = Pool(cpu_count())
    pool.map(one_sample, samples)
    pool.close()
    pool.join()
    print_progress(num_samples, num_samples)

    print('Writing "%s"...' % target_csv)
    with open(target_csv, 'w') as target_csv_file:
        writer = csv.DictWriter(target_csv_file, fieldnames=FIELDNAMES)
        writer.writeheader()
        for i, row in enumerate(rows):
            filename, file_size, transcript = row
            print_progress(i + 1, len(rows))
            writer.writerow({'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript})

    print('Imported %d samples.' % (counter['all'] - counter['too_short'] - counter['too_long']))
    if counter['too_short'] > 0:
        print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])
    if counter['too_long'] > 0:
        print('Skipped %d samples that were longer than %d seconds.' % (counter['too_long'], MAX_SECS))