示例#1
0
def make_dictionary():
    """Build a word-frequency dictionary from every tagged entry.

    Each entry returned by ``get_all_tags()`` contributes the
    whitespace-separated tokens of its text. Tokens that are not purely
    alphabetic (``isalpha()``) are dropped before counting, which also
    removes tokens containing digits. Entries whose text raises
    ``AttributeError`` on ``.split()`` are skipped and reported through
    ``warn_failed``.

    Returns:
        The ``CLF_DICT_NUM`` most common ``(word, count)`` pairs, or ``None``
        when ``get_all_tags()`` returns nothing.
    """
    skipped = []
    entries = get_all_tags()
    tokens = []
    logger.debug('Creating dictionary..')

    # Guard clause: nothing to count without tagged entries.
    if not entries:
        logger.error('No tags were found in the database.')
        return None

    for filepath, tag, text, hosts in progress_bar(entries):
        try:
            tokens.extend(text.split())
        except AttributeError:
            # Record the path relative to the training-data directory.
            skipped.append(filepath.replace(CLF_TRAININGDATA_PATH, ''))

    # Keep purely alphabetic tokens only, then tally them.
    dictionary = Counter(token for token in tokens if token.isalpha())

    if skipped:
        warn_failed(skipped)

    return dictionary.most_common(CLF_DICT_NUM)
示例#2
0
def save_obj(obj, filepath):
    """Pickle ``obj`` to ``filepath`` on a best-effort basis.

    Any exception raised while writing is logged (a critical message followed
    by the exception itself) and then swallowed; nothing is re-raised and the
    function always returns ``None``.

    Args:
        obj: The object to serialize with ``pickle.dump``.
        filepath: Destination path, opened in binary write mode.
    """
    try:
        with open(filepath, mode='wb') as handle:
            pickle.dump(obj, handle)
        logger.debug(f'Pickled object `{filepath}` saved to disk.')
    except Exception as err:
        logger.critical(f'An unexpected error occurred while saving the object `{filepath}`.')
        logger.error(err)
示例#3
0
def load_obj(filepath):
    """Load and return a pickled object from ``filepath``.

    The file is unpickled with ``encoding='latin1'``. On any failure the
    problem is logged and the process terminates via ``sys.exit(2)``; a
    missing file gets its own log message.

    NOTE(security): ``pickle.load`` can execute arbitrary code, so only call
    this on files from a trusted source.

    Args:
        filepath: Path of the pickle file, opened in binary read mode.

    Returns:
        The unpickled object.
    """
    try:
        with open(filepath, mode='rb') as handle:
            loaded = pickle.load(handle, encoding='latin1')
        logger.debug(f'Pickled object loaded from `{filepath}`.')
        return loaded
    except Exception as err:
        if isinstance(err, FileNotFoundError):
            logger.critical(f'Pickled object `{filepath}` was not found. Exiting.')
        else:
            logger.critical(f'An unexpected error occurred while loading the object `{filepath}`.')
            logger.error(err)
        # Every failure path ends the process with the same status code.
        sys.exit(2)
示例#4
0
def get_file_paths():
    """Return the paths of all ``.msg``/``.eml`` files in the training dir.

    Only the direct children of ``CLF_TRAININGDATA_PATH`` are listed
    (``os.listdir``); sub-directories are not traversed. Counts of the files
    found are written to the debug log.

    Returns:
        A list of paths, each built by joining ``CLF_TRAININGDATA_PATH`` with
        a matching file name.

    Raises:
        OSError: Propagated from ``os.listdir`` if the directory cannot be
            listed.
    """
    logger.debug(f'Retrieving files from {CLF_TRAININGDATA_PATH}..')
    files = os.listdir(CLF_TRAININGDATA_PATH)
    logger.debug(f'     Found {len(files)} files in dir')
    # os.path.join gives the same result as plain concatenation when the
    # directory already ends with a separator, and a valid path when it
    # does not (concatenation would glue the directory and file name together).
    file_paths = [
        os.path.join(CLF_TRAININGDATA_PATH, filename)
        for filename in files
        if filename.endswith(('.msg', '.eml'))
    ]
    logger.debug(f'     Found {len(file_paths)} MSG/EML files in dir')
    # Every kept path ends in exactly one of the two extensions, so the EML
    # count is whatever is left after counting MSG files.
    msg_count = sum(1 for path in file_paths if path.endswith('.msg'))
    eml_count = len(file_paths) - msg_count
    logger.debug(f'         MSG: {msg_count} || EML: {eml_count}')
    return file_paths
示例#5
0
def make_dataset(dictionary):
    """Turn every tagged entry into a bag-of-words feature vector.

    For each entry returned by ``get_all_tags()`` the text is split on
    whitespace, and one count is produced per dictionary entry: the number of
    tokens equal to ``entry[0]``. Entries whose text raises ``AttributeError``
    on ``.split()`` are skipped and reported through ``warn_failed``.

    Args:
        dictionary: Iterable of entries whose first item is the word to count
            (for example the ``(word, count)`` pairs produced by
            ``Counter.most_common``). The words must be hashable.

    Returns:
        A ``(features, labels)`` tuple: ``features`` is a list of count lists
        (one per successfully processed entry, ordered like ``dictionary``)
        and ``labels`` holds the matching tags. Both are empty when
        ``get_all_tags()`` returns nothing.
    """
    # Function-scope import so the block does not rely on a file-level import.
    from collections import Counter

    failed = []
    features = []
    labels = []
    tags = get_all_tags()
    if tags:
        logger.debug(f'Creating dataset from {len(tags)} entries')
        # The vocabulary is the same for every entry, so extract it once.
        vocabulary = [entry[0] for entry in dictionary]
        for filepath, tag, text, hosts in progress_bar(tags):
            try:
                # Tally the tokens once per document instead of rescanning
                # the token list for every vocabulary word.
                counts = Counter(text.split())
            except AttributeError:
                failed.append(filepath.replace(CLF_TRAININGDATA_PATH, ''))
                continue
            # Counter returns 0 for words that do not occur in the document.
            features.append([counts[word] for word in vocabulary])
            labels.append(tag)
        if failed:
            warn_failed(failed)

    return features, labels
示例#6
0
def main():
    """Command-line entry point: parse ``sys.argv`` and train or classify.

    Options are handled strictly in the order given on the command line, so
    input options (``-f``/``-s``) and configuration options (``-n``, ``-a``,
    ``-l``) must appear before the action (``-t``/``-c``) they should affect.
    Output options (``-o``, ``-d``) are only consulted once every option has
    been processed.

    The process exits with status 2 on usage or input errors and with status
    0 after printing the help text or emitting a classification result. The
    function returns ``None`` when no classification result was produced
    (for example after training only).
    """
    text = None
    config = {}
    result = {}

    if len(sys.argv) < 2:
        print(HELPMSG)
        logger.critical('No input specified')
        sys.exit(2)

    try:
        # Long option names are listed without leading dashes; a trailing
        # `=` marks an option that requires an argument.
        opts, _args = getopt.getopt(sys.argv[1:], 'hf:st:c:na:l:o:d:v', [
            'help', 'infile=', 'stdin', 'std', 'algo=', 'limit=', 'train=',
            'classify=', 'outfile=', 'format=', 'verbose', 'log-file',
        ])
    except getopt.GetoptError:
        print(HELPMSG)
        sys.exit(2)

    if not opts:
        print(HELPMSG)
        sys.exit(0)

    # Increase verbosity: one step per -v/--verbose, capped at four steps.
    verbosity = min(
        sum(1 for opt, _ in opts if opt in ('-v', '--verbose')), 4)
    for _ in range(verbosity):
        increase_log_level()

    # Log to file: only honoured together with at least one verbosity flag.
    if verbosity > 0 and any(opt == '--log-file' for opt, _ in opts):
        log_to_file()

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(HELPMSG)
            sys.exit(0)
        elif opt in ('-f', '--infile'):
            file_path = arg
            logger.debug(f'Using input file {file_path}')
            try:
                with open(file_path, 'r') as f:
                    text = f.read()
            except FileNotFoundError:
                logger.critical(
                    f'The specified file {file_path} does not exist.')
                sys.exit(2)
            except Exception as e:
                logger.critical(
                    f'An error occurred while reading the file `{file_path}`.')
                logger.error(e)
                sys.exit(2)
        elif opt in ('-s', '--stdin'):
            try:
                logger.debug('Using input from STDIN')
                text = sys.stdin.read()
            except Exception as e:
                logger.critical('An error occurred while reading from stdin.')
                logger.error(e)
                sys.exit(2)
        elif opt in ('-n', '--std'):
            logger.debug('OPTION: Standardizing data.')
            config['std'] = True
        elif opt in ('-a', '--algo'):
            if arg not in ('mnb', 'cnb'):
                print(HELPMSG)
                logger.critical(
                    f'The specified algorithm `{arg}` is not available.')
                sys.exit(2)
            # Log only after validation, and name the algorithm actually
            # selected instead of a fixed one.
            logger.debug(f'OPTION: Using algorithm `{arg}`')
            config['algo'] = arg
        elif opt in ('-l', '--limit'):
            # isdecimal() guarantees that int() below cannot fail.
            if arg.isdecimal():
                logger.debug(f'OPTION: Using n={arg} samples.')
                config['n'] = int(arg)
            else:
                print(HELPMSG)
                logger.critical(f'n={arg} is non-numeric.')
                sys.exit(2)
        elif opt in ('-t', '--train'):
            logger.debug('ACTION: Creating model from dataset')
            if arg == 'v1':
                katatasso.train(std=config.get('std', False),
                                algo=config.get('algo', 'mnb'))
            elif arg == 'v2':
                katatasso.trainv2(std=config.get('std', False),
                                  algo=config.get('algo', 'mnb'),
                                  n=config.get('n', None))
            else:
                logger.critical(
                    'Please specify either `v1` or `v2`. E.g. `katatasso -t v2`'
                )
                sys.exit(2)
        elif opt in ('-c', '--classify'):
            if not text:
                logger.critical('Missing input (specify using -f or -s)')
                sys.exit(2)
            logger.debug('ACTION: Classifying input')
            # Use the algorithm stored by -a/--algo; default to `mnb`.
            algo = config.get('algo', 'mnb')
            if arg == 'v1':
                category = katatasso.classify(text, algo=algo)
            elif arg == 'v2':
                category = katatasso.classifyv2(text, algo=algo)
            else:
                logger.critical(
                    'Please specify either `v1` or `v2`. E.g. `katatasso -c v2`'
                )
                sys.exit(2)
            result = {
                'category': category,
                'accuracy': 'n/a',
                'alias': CATEGORIES.get(category)
            }
        elif opt in ('-o', '--outfile'):
            logger.debug(f'CONFIG: Setting output file to {arg}')
            config['outfile'] = arg
        elif opt in ('-d', '--format'):
            if arg in ('plain', 'json'):
                logger.debug(f'CONFIG: Setting output file format to {arg}')
                config['format'] = arg
            else:
                logger.critical('Invalid format. Must be one of [plain, json]')
                sys.exit(2)

    if result:
        outfile = config.get('outfile')
        if outfile:
            # Default to plain text so that -o without -d still writes a file.
            outformat = config.get('format', 'plain')
            if outformat == 'json':
                import json
                fname = f'{outfile}.json'
                with open(fname, 'w', encoding='utf-8') as f:
                    json.dump(result, f, ensure_ascii=False, indent=4)
            else:
                fname = f'{outfile}.txt'
                with open(fname, 'w') as f:
                    # Values are not guaranteed to be strings (the alias may
                    # be None), so convert them before joining.
                    f.write('\n'.join(str(value) for value in result.values()))
            logger.debug(f'Results saved to file `{fname}`')
        else:
            for key, value in result.items():
                print(f'{key}: {value}')
        sys.exit(0)