def performance_report(y_test, y_pred):
    """Print a classification report for the given predictions.

    The report is first requested with the module-level ``categories`` as
    ``target_names``. If that raises ``ValueError``, the problem is logged
    and the report is printed again without ``target_names``.

    Parameters
    ----------
    y_test
        Ground-truth labels.
    y_pred
        Predicted labels.
    """
    try:
        named_report = classification_report(
            y_test,
            y_pred,
            target_names=categories,
            zero_division=0,
        )
        print(named_report)
    except ValueError as err:
        logger.critical('Classification report: Invalid param `target_names`.')
        logger.error(err)
        plain_report = classification_report(y_test, y_pred, zero_division=0)
        print(plain_report)
def save_obj(obj, filepath):
    """Pickle ``obj`` to ``filepath``.

    Saving is best-effort: any exception is logged and swallowed, so the
    caller is not interrupted when the object cannot be written.

    Parameters
    ----------
    obj
        The object to serialize.
    filepath
        Destination path; opened in binary write mode.
    """
    try:
        with open(filepath, 'wb') as handle:
            pickle.dump(obj, handle)
        logger.debug(f'Pickled object `{filepath}` saved to disk.')
    except Exception as err:
        logger.critical(
            f'An unexpected error occurred while saving the object `{filepath}`.'
        )
        logger.error(err)
def get_all_tags():
    """Fetch every row of the ``tags`` table from the database at ``DBFILE``.

    Returns
    -------
    list of tuple
        One ``(filepath, tag, text, hosts)`` tuple per row.

    Notes
    -----
    On any failure the error is logged and the process exits with status 2.
    The connection is always closed, whether the query succeeds or not.
    """
    conn = None
    try:
        conn = sqlite3.connect(DBFILE)
        c = conn.cursor()
        c.execute('SELECT filepath, tag, text, hosts FROM tags')
        return c.fetchall()
    except Exception as e:
        logger.critical('Unable to fetch tags from database.')
        logger.error(e)
        sys.exit(2)
    finally:
        # `conn` is still None when `connect` itself failed.
        if conn is not None:
            conn.close()
def load_obj(filepath):
    """Load and return a pickled object from ``filepath``.

    Parameters
    ----------
    filepath
        Path of the pickle file; opened in binary read mode.

    Returns
    -------
    object
        The unpickled object.

    Notes
    -----
    Any failure is logged and terminates the process with exit status 2.
    A missing file gets its own message.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    try:
        with open(filepath, 'rb') as handle:
            loaded = pickle.load(handle, encoding='latin1')
        logger.debug(f'Pickled object loaded from `{filepath}`.')
        return loaded
    except Exception as err:
        if isinstance(err, FileNotFoundError):
            logger.critical(f'Pickled object `{filepath}` was not found. Exiting.')
        else:
            logger.critical(
                f'An unexpected error occurred while loading the object `{filepath}`.'
            )
            logger.error(err)
        sys.exit(2)
def trainv2(std=False, algo='mnb', n=None):
    """Train, persist and evaluate a Naive Bayes model (v2 pipeline).

    Parameters
    ----------
    std : bool
        Standardize the train/test features before fitting.
    algo : str
        The algorithm to use. Can be either `mnb` or `cnb`. Any other value
        is reported and replaced by `mnb`.
    n : int
        Select n samples from each category. (Default: All)

    Returns
    -------
    None
        The fitted model is saved via ``save_model``; metrics are printed
        and plotted.
    """
    df = create_dataframe(n=n)
    counts, df = process_dataframe(df, algo=algo)

    # Todo: Remove
    save_obj(df, 'v2_dataframe.p')
    save_obj(counts, 'v2_counts.p')

    x_train, x_test, y_train, y_test = train_test_split(
        counts, df['label'], test_size=0.3, random_state=69)

    if std:
        x_train, x_test = standardize(x_train, x_test)

    if algo == 'cnb':
        model = ComplementNB()
    else:
        if algo != 'mnb':
            logger.critical(
                'Parameter `algo` specifies unknown algorithm. Defaulting to `mnb`.'
            )
            # Keep the label in sync with the model actually trained, so the
            # saved model and the plot title are not tagged with a bogus name.
            algo = 'mnb'
        model = MultinomialNB()

    model.fit(x_train, y_train)
    save_model(model, version='v2', algo=algo)

    y_pred = model.predict(x_test)
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

    measure.evaluate(model, x_test, y_test)
    measure.performance_report(y_test, y_pred)
    measure.plot_confusion_mat(model, x_test, y_test)

    title = f'Learning Curves ({algo.upper()})'
    learning_curve.plot(model, x_test, y_test, title=title)
def warn_failed(failed):
    """Report failed files and write their names to ``failed.out``.

    Logs the total number of failures, then a per-category breakdown based on
    the filename prefix (`legit`, `spam`, `phish`, `malware`, `fraud`), and
    finally writes all filenames, one per line, to ``failed.out`` in the
    current working directory.

    Parameters
    ----------
    failed : list of str
        Filenames that could not be processed.
    """
    logger.critical(
        f'An error occurred with {len(failed)} files. See `failed.out` for filenames.'
    )

    try:
        fail_cat = {
            prefix: sum(1 for fn in failed if fn.startswith(prefix))
            for prefix in ('legit', 'spam', 'phish', 'malware', 'fraud')
        }
        logger.critical(f'Failed:\n{fail_cat}')
    except Exception as e:
        # The breakdown is best-effort: log the problem rather than hiding
        # it, and still write the list of filenames below.
        logger.error(e)

    with open('failed.out', 'w') as f:
        f.write('\n'.join(failed))
def train(std=False, algo='mnb'):
    """Train, persist and evaluate a Naive Bayes model (v1 pipeline).

    Parameters
    ----------
    std : bool
        Standardize the train/test features before fitting.
    algo : str
        The algorithm to use. Can be either `mnb` or `cnb`. Any other value
        is reported and replaced by `mnb`.

    Returns
    -------
    None
        The fitted model is saved via ``save_model``; metrics are printed
        and plotted.
    """
    dictionary = make_dictionary()
    features, labels = make_dataset(dictionary)

    # Todo: Remove
    save_obj(features, 'v1_features.p')
    save_obj(labels, 'v1_labels.p')

    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=69)

    if std:
        x_train, x_test = standardize(x_train, x_test)

    if algo == 'cnb':
        model = ComplementNB()
    else:
        if algo != 'mnb':
            logger.critical(
                'Parameter `algo` specifies unknown algorithm. Defaulting to `mnb`.'
            )
            # Keep the label in sync with the model actually trained, so the
            # saved model and the plot title are not tagged with a bogus name.
            algo = 'mnb'
        model = MultinomialNB()

    model.fit(x_train, y_train)
    save_model(model, version='v1', algo=algo)

    y_pred = model.predict(x_test)
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

    measure.evaluate(model, x_test, y_test)
    measure.performance_report(y_test, y_pred)
    measure.plot_confusion_mat(model, x_test, y_test)

    title = f'Learning Curves ({algo.upper()})'
    learning_curve.plot(model, x_test, y_test, title=title)
def get_n_tags(n):
    """Randomly select up to ``n`` rows per category from the ``tags`` table.

    Categories 0 through 4 are queried one after another. When a category
    holds fewer than ``n`` rows, a warning is logged and all of its rows are
    returned (in random order).

    Parameters
    ----------
    n : int
        Number of rows to sample from each category.

    Returns
    -------
    list of tuple
        ``(filepath, tag, text, hosts)`` tuples from all categories combined.

    Notes
    -----
    A failure for a single category is logged and that category is skipped.
    A failure to open the database is logged and exits the process with
    status 2. The connection is always closed.
    """
    cats = [0, 1, 2, 3, 4]
    res = []
    conn = None
    try:
        conn = sqlite3.connect(DBFILE)
        c = conn.cursor()
        for cat in cats:
            try:
                c.execute(
                    'SELECT filepath, tag, text, hosts FROM tags WHERE tag=?',
                    (cat,))
                tags = c.fetchall()
                if n > len(tags):
                    # `warning` replaces the deprecated `warn` alias.
                    logger.warning(
                        f'n={n} is higher than the number of samples in {cat}. '
                        f'Selecting {len(tags)} (all) samples.')
                res += random.sample(tags, min(n, len(tags)))
            except Exception as e:
                logger.critical(f'Unable to fetch {n} tags for category {cat}.')
                logger.error(e)
        return res
    except Exception as e:
        logger.critical('Unable to fetch tags from database.')
        logger.error(e)
        sys.exit(2)
    finally:
        # `conn` is still None when `connect` itself failed.
        if conn is not None:
            conn.close()
import sys from katatasso.helpers.const import CATEGORIES from katatasso.helpers.extraction import (get_tfidf_counts, make_dictionary, process_dataframe) from katatasso.helpers.logger import rootLogger as logger from katatasso.helpers.utils import load_model try: from sklearn.metrics import accuracy_score from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer import juicer import numpy as np import pandas as pd except ModuleNotFoundError as e: logger.critical(f'Module `{e.name}` not found. Please install before proceeding.') sys.exit(2) def classify(text, algo='mnb'): """Classify the text using a Naive Bayes model with word vector counts Parameters ---------- text : str The text input to classify algo : str The algorithm to use `mnb` for Multinomial Naïve Bayes,
def main():
    """Command-line entry point: parse options, then train or classify.

    Options are processed in the order given on the command line, so input
    (``-f``/``-s``) must precede ``-c`` and tuning options
    (``-n``, ``-a``, ``-l``) must precede ``-t``.

    Options
    -------
    -h, --help          Print the help message and exit.
    -f, --infile PATH   Read the text to classify from a file.
    -s, --stdin         Read the text to classify from standard input.
    -n, --std           Standardize the data when training.
    -a, --algo ALGO     `mnb` or `cnb`.
    -l, --limit N       Use N samples per category (v2 training).
    -t, --train VER     Train a `v1` or `v2` model.
    -c, --classify VER  Classify the input with a `v1` or `v2` model.
    -o, --outfile NAME  Write the result to NAME.txt / NAME.json.
    -d, --format FMT    `plain` (default) or `json`.
    -v, --verbose       Increase verbosity (up to four times).
    --log-file          Also log to a file (only with at least one -v).

    Exits with status 0 on success and 2 on invalid usage or I/O errors.
    """
    TEXT = None
    CONFIG = {}
    result = {}

    if len(sys.argv) < 2:
        print(HELPMSG)
        logger.critical('No input specified')
        sys.exit(2)

    argv = sys.argv[1:]

    try:
        # Long option names are given without leading dashes; a trailing `=`
        # marks options that take an argument.
        opts, _args = getopt.getopt(argv, 'hf:st:c:na:l:o:d:v', [
            'help', 'infile=', 'stdin', 'std', 'algo=', 'limit=', 'train=',
            'classify=', 'outfile=', 'format=', 'verbose', 'log-file'
        ])
    except getopt.GetoptError:
        print(HELPMSG)
        sys.exit(2)

    if not opts:
        print(HELPMSG)
        sys.exit(0)

    # Increase verbosity once per -v/--verbose, capped at four levels.
    verbosity = min(
        sum(1 for opt, _ in opts if opt in ('-v', '--verbose')), 4)
    for _ in range(verbosity):
        increase_log_level()

    # Log to file only when verbose and explicitly requested.
    if verbosity > 0 and any(opt == '--log-file' for opt, _ in opts):
        log_to_file()

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(HELPMSG)
            sys.exit(0)
        elif opt in ('-f', '--infile'):
            file_path = arg
            logger.debug(f'Using input file {file_path}')
            try:
                with open(file_path, 'r') as f:
                    TEXT = f.read()
            except FileNotFoundError:
                logger.critical(
                    f'The specified file {file_path} does not exist.')
                sys.exit(2)
            except Exception as e:
                logger.critical(
                    f'An error occurred while reading the file `{file_path}`.')
                logger.error(e)
                sys.exit(2)
        elif opt in ('-s', '--stdin'):
            try:
                logger.debug('Using input from STDIN')
                TEXT = sys.stdin.read()
            except Exception as e:
                logger.critical('An error occurred while reading from stdin.')
                logger.error(e)
                sys.exit(2)
        elif opt in ('-n', '--std'):
            logger.debug('OPTION: Standardizing data.')
            CONFIG['std'] = True
        elif opt in ('-a', '--algo'):
            if arg not in ['mnb', 'cnb']:
                print(HELPMSG)
                logger.critical(
                    f'The specified algorithm `{arg}` is not available.')
                sys.exit(2)
            logger.debug(f'OPTION: Using algorithm `{arg}`')
            CONFIG['algo'] = arg
        elif opt in ('-l', '--limit'):
            # `isdecimal` guarantees that `int(arg)` cannot fail.
            if arg.isdecimal():
                logger.debug(f'OPTION: Using n={arg} samples.')
                CONFIG['n'] = int(arg)
            else:
                print(HELPMSG)
                logger.critical(f'n={arg} is non-numeric.')
                sys.exit(2)
        elif opt in ('-t', '--train'):
            logger.debug('ACTION: Creating model from dataset')
            if arg == 'v1':
                katatasso.train(std=CONFIG.get('std', False),
                                algo=CONFIG.get('algo', 'mnb'))
            elif arg == 'v2':
                katatasso.trainv2(std=CONFIG.get('std', False),
                                  algo=CONFIG.get('algo', 'mnb'),
                                  n=CONFIG.get('n', None))
            else:
                logger.critical(
                    'Please specify either `v1` or `v2`. E.g. `katatasso -t v2`'
                )
                sys.exit(2)
        elif opt in ('-c', '--classify'):
            if TEXT:
                logger.debug('ACTION: Classifying input')
                # The algorithm chosen with -a is stored under the `algo` key.
                algo = CONFIG.get('algo', 'mnb')
                if arg == 'v1':
                    category = katatasso.classify(TEXT, algo=algo)
                elif arg == 'v2':
                    category = katatasso.classifyv2(TEXT, algo=algo)
                else:
                    logger.critical(
                        'Please specify either `v1` or `v2`. E.g. `katatasso -c v2`'
                    )
                    sys.exit(2)
                result = {
                    'category': category,
                    'accuracy': 'n/a',
                    'alias': CATEGORIES.get(category)
                }
            else:
                logger.critical('Missing input (specify using -f or -s)')
                sys.exit(2)
        elif opt in ('-o', '--outfile'):
            logger.debug(f'CONFIG: Setting output file to {arg}')
            CONFIG['outfile'] = arg
        elif opt in ('-d', '--format'):
            if arg in ['plain', 'json']:
                logger.debug(f'CONFIG: Setting output file format to {arg}')
                CONFIG['format'] = arg
            else:
                logger.critical('Invalid format. Must be one of [plain, json]')
                sys.exit(2)

    if result:
        # Default to plain text so that -o without -d still writes a file.
        outformat = CONFIG.get('format', 'plain')
        outfile = CONFIG.get('outfile')
        if outfile:
            ext = 'json' if outformat == 'json' else 'txt'
            fname = f'{outfile}.{ext}'
            if outformat == 'json':
                import json
                with open(fname, 'w', encoding='utf-8') as f:
                    json.dump(result, f, ensure_ascii=False, indent=4)
            else:
                with open(fname, 'w') as f:
                    # Values are not guaranteed to be strings.
                    f.write('\n'.join(str(value) for value in result.values()))
            logger.debug(f'Results saved to file `{fname}`')
            sys.exit(0)
        else:
            for k, v in result.items():
                print(f'{k}: {v}')
            sys.exit(0)