def __create_dataframe():
    """Build a labelled DataFrame by parsing and tokenising every tagged e-mail.

    Each row returned by `get_all_tags()` is read from disk with `emailyzer`,
    its text is passed through `juicer.extract_stanford`, and the result is
    stored together with the row's tag.

    Returns:
        A `pandas.DataFrame` with the columns `label` and `message`, or
        `None` when `get_all_tags()` returned no rows.
    """
    failed = []
    labels = []
    contents = []
    tags = get_all_tags()
    tagger = juicer.initStanfordNERTagger()

    if not tags:
        logger.error('No tags were found in the database.')
        return None

    # get_all_tags() selects (filepath, tag, text, hosts); only the first two
    # columns are needed here, so any trailing columns are discarded instead
    # of breaking the unpacking.
    for filepath, tag, *_ in progress_bar(tags):
        try:
            email = emailyzer.from_file(filepath)
            content = email.html_as_text
            # Preprocess, extract entities
            words = juicer.extract_stanford(
                content, named_only=False, stemming=False, tagger=tagger)
            contents.append(words)
            labels.append(tag)
        except AttributeError:
            # Remember the file (relative to the training data path) so that
            # all failures can be reported together after the loop.
            failed.append(filepath.replace(CLF_TRAININGDATA_PATH, ''))

    df = pd.DataFrame(list(zip(labels, contents)), columns=['label', 'message'])
    if failed:
        warn_failed(failed)
    return df
def make_dictionary():
    """Count the alphabetic words across all tagged texts.

    Returns:
        The `CLF_DICT_NUM` most common `(word, count)` pairs as produced by
        `Counter.most_common`, or `None` when `get_all_tags()` returned no
        rows.
    """
    skipped = []
    rows = get_all_tags()
    tokens = []
    logger.debug('Creating dictionary..')

    if not rows:
        logger.error('No tags were found in the database.')
        return None

    for filepath, tag, text, hosts in progress_bar(rows):
        try:
            tokens.extend(text.split())
        except AttributeError:
            # `text` has no `.split` (e.g. it is None); record the file,
            # relative to the training data path, and carry on.
            skipped.append(filepath.replace(CLF_TRAININGDATA_PATH, ''))

    # Only purely alphabetic tokens are counted.
    frequencies = Counter(token for token in tokens if token.isalpha())

    if skipped:
        warn_failed(skipped)
    return frequencies.most_common(CLF_DICT_NUM)
def performance_report(y_test, y_pred):
    """Print a classification report for the given true and predicted labels.

    The report is first built with `target_names=categories`. If that raises
    a `ValueError`, the error is logged and the report is printed again
    without `target_names`.
    """
    try:
        named_report = classification_report(
            y_test,
            y_pred,
            target_names=categories,
            zero_division=0,
        )
        print(named_report)
    except ValueError as err:
        logger.critical('Classification report: Invalid param `target_names`.')
        logger.error(err)
        plain_report = classification_report(y_test, y_pred, zero_division=0)
        print(plain_report)
def save_obj(obj, filepath):
    """Pickle `obj` to `filepath`.

    Any exception raised while writing is logged and swallowed; the function
    always returns `None`.
    """
    try:
        with open(filepath, 'wb') as sink:
            pickle.dump(obj, sink)
            logger.debug(f'Pickled object `{filepath}` saved to disk.')
    except Exception as err:
        logger.critical(
            f'An unexpected error occurred while saving the object `{filepath}`.'
        )
        logger.error(err)
def get_all_tags():
    """Fetch every row of the `tags` table.

    Returns:
        A list of `(filepath, tag, text, hosts)` tuples.

    Exits the process with status 2 (via `sys.exit`) if the database cannot
    be opened or queried.
    """
    try:
        conn = sqlite3.connect(DBFILE)
        try:
            c = conn.cursor()
            c.execute('SELECT filepath, tag, text, hosts FROM tags')
            return c.fetchall()
        finally:
            # Always release the database handle, including on query errors.
            conn.close()
    except Exception as e:
        logger.critical('Unable to fetch tags from database.')
        logger.error(e)
        sys.exit(2)
def load_obj(filepath):
    """Load and return a pickled object from `filepath`.

    Exits the process with status 2 (via `sys.exit`) when the file does not
    exist or any other exception occurs while loading.

    NOTE(review): unpickling can execute arbitrary code; this presumably only
    reads files written by this project -- confirm against callers.
    """
    try:
        with open(filepath, 'rb') as source:
            loaded = pickle.load(source, encoding='latin1')
            logger.debug(f'Pickled object loaded from `{filepath}`.')
            return loaded
    except FileNotFoundError:
        logger.critical(f'Pickled object `{filepath}` was not found. Exiting.')
        sys.exit(2)
    except Exception as err:
        logger.critical(
            f'An unexpected error occurred while loading the object `{filepath}`.'
        )
        logger.error(err)
        sys.exit(2)
def create_dataframe(n=None):
    """Build a labelled DataFrame from the texts stored in the database.

    Args:
        n: When truthy, rows are taken from `get_n_tags(n)`; otherwise all
            rows from `get_all_tags()` are used.

    Returns:
        A `pandas.DataFrame` with the columns `label` (the row's tag) and
        `message` (the row's text), or `None` when no rows were found.
    """
    rows = get_n_tags(n) if n else get_all_tags()

    if not rows:
        logger.error('No tags were found in the database.')
        return None

    # Keep only (tag, text) from each (filepath, tag, text, hosts) row.
    records = [(tag, text) for _filepath, tag, text, _hosts in progress_bar(rows)]
    return pd.DataFrame(records, columns=['label', 'message'])
def get_n_tags(n):
    """Fetch a random sample of up to `n` rows for each tag category.

    For every category in `[0, 1, 2, 3, 4]` the matching rows are read from
    the `tags` table and `n` of them are picked with `random.sample`. When a
    category has fewer than `n` rows, a warning is logged and all of its rows
    are used (in random order).

    Args:
        n: Number of rows to sample per category.

    Returns:
        A list of `(filepath, tag, text, hosts)` tuples. A category whose
        query or sampling fails is logged and contributes no rows.

    Exits the process with status 2 (via `sys.exit`) if the database cannot
    be opened.
    """
    cats = [0, 1, 2, 3, 4]
    res = []
    try:
        conn = sqlite3.connect(DBFILE)
        try:
            c = conn.cursor()
            for cat in cats:
                try:
                    c.execute(
                        'SELECT filepath, tag, text, hosts FROM tags WHERE tag=?',
                        (cat,))
                    tags = c.fetchall()
                    if n <= len(tags):
                        res += random.sample(tags, n)
                    else:
                        # `Logger.warn` is a deprecated alias of `warning`.
                        logger.warning(
                            f'n={n} is higher than the number of samples in {cat}. '
                            f'Selecting {len(tags)} (all) samples.')
                        res += random.sample(tags, len(tags))
                except Exception as e:
                    # A failing category is reported; the others still run.
                    logger.critical(f'Unable to fetch {n} tags for category {cat}.')
                    logger.error(e)
            return res
        finally:
            # Always release the database handle.
            conn.close()
    except Exception as e:
        logger.critical('Unable to fetch tags from database.')
        logger.error(e)
        sys.exit(2)
def main():
    """Command-line entry point: parse options, then train or classify.

    Options are handled in the order they appear on the command line, so
    configuration options (`-n`, `-a`, `-l`) and input options (`-f`, `-s`)
    must precede the action (`-t` / `-c`) they are meant to affect.

    When a classification result exists it is written to the file given with
    `-o` (format chosen with `-d`, `plain` by default) or printed to stdout,
    and the process exits with status 0. Invalid usage exits with status 2.
    """
    TEXT = None
    CONFIG = {}
    result = {}

    if len(sys.argv) < 2:
        print(HELPMSG)
        logger.critical('No input specified')
        sys.exit(2)

    argv = sys.argv[1:]

    try:
        # Long options are given without the leading dashes; a trailing `=`
        # marks an option that takes an argument.
        opts, args = getopt.getopt(argv, 'hf:st:c:na:l:o:d:v', [
            'help', 'infile=', 'stdin', 'std', 'algo=', 'limit=', 'train=',
            'classify=', 'outfile=', 'format=', 'verbose', 'log-file'
        ])
    except getopt.GetoptError:
        print(HELPMSG)
        sys.exit(2)

    if not opts:
        print(HELPMSG)
        sys.exit(0)

    # Increase verbosity: one log level per -v / --verbose, capped at 4.
    opts_v = min(sum(1 for opt, _ in opts if opt in ('-v', '--verbose')), 4)
    for _ in range(opts_v):
        increase_log_level()

    # Log to file (only takes effect together with at least one -v).
    if opts_v > 0 and any(opt == '--log-file' for opt, _ in opts):
        log_to_file()

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(HELPMSG)
            sys.exit(0)
        elif opt in ('-f', '--infile'):
            file_path = arg
            logger.debug(f'Using input file {file_path}')
            try:
                with open(file_path, 'r') as f:
                    TEXT = f.read()
            except FileNotFoundError:
                logger.critical(
                    f'The specified file {file_path} does not exist.')
                sys.exit(2)
            except Exception as e:
                logger.critical(
                    f'An error occurred while reading the file `{file_path}`.')
                logger.error(e)
                sys.exit(2)
        elif opt in ('-s', '--stdin'):
            try:
                logger.debug('Using input from STDIN')
                TEXT = sys.stdin.read()
            except Exception as e:
                logger.critical('An error occurred while reading from stdin.')
                logger.error(e)
                sys.exit(2)
        elif opt in ('-n', '--std'):
            logger.debug('OPTION: Standardizing data.')
            CONFIG['std'] = True
        elif opt in ('-a', '--algo'):
            if arg not in ['mnb', 'cnb']:
                print(HELPMSG)
                logger.critical(
                    f'The specified algorithm `{arg}` is not available.')
                sys.exit(2)
            # Log the algorithm that was actually selected.
            logger.debug(f'OPTION: Using algorithm `{arg}`')
            CONFIG['algo'] = arg
        elif opt in ('-l', '--limit'):
            # `isdecimal()` guarantees that `int(arg)` succeeds.
            if arg.isdecimal():
                logger.debug(f'OPTION: Using n={arg} samples.')
                CONFIG['n'] = int(arg)
            else:
                print(HELPMSG)
                logger.critical(f'n={arg} is non-numeric.')
                sys.exit(2)
        elif opt in ('-t', '--train'):
            logger.debug('ACTION: Creating model from dataset')
            if arg == 'v1':
                katatasso.train(
                    std=CONFIG.get('std', False),
                    algo=CONFIG.get('algo', 'mnb'))
            elif arg == 'v2':
                katatasso.trainv2(
                    std=CONFIG.get('std', False),
                    algo=CONFIG.get('algo', 'mnb'),
                    n=CONFIG.get('n', None))
            else:
                logger.critical(
                    'Please specify either `v1` or `v2`. E.g. `katatasso -t v2`'
                )
                sys.exit(2)
        elif opt in ('-c', '--classify'):
            if not TEXT:
                logger.critical('Missing input (specify using -f or -s)')
                sys.exit(2)
            logger.debug('ACTION: Classifying input')
            # The algorithm chosen with -a is stored under the 'algo' key.
            algo = CONFIG.get('algo', 'mnb')
            if arg == 'v1':
                category = katatasso.classify(TEXT, algo=algo)
            elif arg == 'v2':
                category = katatasso.classifyv2(TEXT, algo=algo)
            else:
                logger.critical(
                    'Please specify either `v1` or `v2`. E.g. `katatasso -c v2`'
                )
                sys.exit(2)
            result = {
                'category': category,
                'accuracy': 'n/a',
                'alias': CATEGORIES.get(category)
            }
        elif opt in ('-o', '--outfile'):
            logger.debug(f'CONFIG: Setting output file to {arg}')
            CONFIG['outfile'] = arg
        elif opt in ('-d', '--format'):
            if arg in ['plain', 'json']:
                logger.debug(f'CONFIG: Setting output file format to {arg}')
                CONFIG['format'] = arg
            else:
                logger.critical('Invalid format. Must be one of [plain, json]')
                sys.exit(2)

    if result:
        # Default to plain text so that -o without -d still writes a file.
        outformat = CONFIG.get('format', 'plain')
        outfile = CONFIG.get('outfile')
        if outfile:
            ext = 'json' if outformat == 'json' else 'txt'
            fname = f'{outfile}.{ext}'
            if outformat == 'json':
                import json
                with open(fname, 'w', encoding='utf-8') as f:
                    json.dump(result, f, ensure_ascii=False, indent=4)
            else:
                with open(fname, 'w') as f:
                    # Values are not necessarily strings; convert before joining.
                    f.write('\n'.join(str(value) for value in result.values()))
            logger.debug(f'Results saved to file `{fname}`')
            sys.exit(0)
        else:
            for key, value in result.items():
                print(f'{key}: {value}')
            sys.exit(0)