Example No. 1
    def __init__(self):
        """
        Initializer that loads all necessary configuration, logger and HTTP client instances.
        """
        config_parser = ConfigParser()
        self.config = config_parser.return_json(
            path_to_config='TOREPLACE/config/example.json')
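For orientation, here is a minimal sketch of what a JSON-backed ConfigParser with a return_json helper could look like. This is an assumption for illustration only, not the project's actual implementation; the class name and the path/path_to_config keyword arguments are taken from the calls in these examples.

import json


class ConfigParser:
    """Hypothetical JSON-backed config loader (sketch, not the real class)."""

    def __init__(self, path=None):
        self.path = path

    def return_json(self, path_to_config=None):
        # read the JSON config file and return it as a plain dict
        with open(path_to_config or self.path) as f:
            return json.load(f)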
Example No. 2
    def load_from(config_meta):
        with open(config_meta['path']) as f:
            config = yaml.load(f)
            parser = ConfigParser(config)
            parser._create_directories()

        task = Task.load_from(parser.task)
        dataset = Dataset.load_from(parser.dataset)
        model_config = config['model']
        label_helper = Label.load_from(parser.label)
        user = config['user']

        # Set up logger
        log_level = config_meta['log_level']
        logger = logging.getLogger('label_app')
        logger.setLevel(getattr(logging, log_level))

        ch = logging.StreamHandler(sys.stdout)
        ch.setFormatter(
            logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        logger.addHandler(ch)

        return LabelApp(task, dataset, label_helper, user, model_config,
                        parser, logger)
Example No. 3
    def __init__(self, theConfigFilePath, theLogger=Logger()):
        # parse config file
        self.configParser = ConfigParser()
        # self.configParser.parseFile(theConfigFilePath)
        self.configParser.generateDirectories()

        self.log = theLogger

        # keytool path to parse apk's signature
        self.keytoolPath = None

        # sanddroid directories
        self.mainDir = os.path.dirname(__file__)

        self.appList = []  # list of apk files (full paths)
        self.runningApps = []  # list of apk files currently being analyzed

        self.runHeadless = False
        self.emulatorStartPort = 5554

        self.numThreads = 1
        self.maxThreadRuntime = 600

        # control running threads
        self.threadLogFileList = []  # list to store thread log file path
        self.numFinishedApps = 0  # number of analyzed apps
        self.numRunningThreads = 0  # number of running threads
        self.threadList = []  # list of threads, size=numThreads
        self.threadActiveMask = []  # bitmask indicating whether each thread is active, size=numThreads

        self.avdsheartbeat = (0, 0, 0, 0)  # number of times each avd was used in one cycle
        self.avdheartbeat = 0
        self.startTime = datetime.datetime.now()
Example No. 4
    def __init__(self, config_filepath):
        """
        Class initializer.

        Inputs:
            config_filepath: (str) Configuration filepath.
        """
        self.configparser = ConfigParser(config_filepath)

        # read configuration file
        self.filepath = self.configparser.getstr('filepath')
        self.full_ontology_pkl = self.configparser.getstr('full_ontology_pkl')
        self.candidate_ontology_pkl = self.configparser.getstr(
            'candidate_ontology_pkl')
        self.skeleton_and_entities_pkl = self.configparser.getstr(
            'skeleton_and_entities_pkl')
        self.overwrite_pkl = self.configparser.getbool('overwrite_pickle_flag')
        self.outputFoodOn = self.configparser.getstr('outputFoodOn')

        self.num_seeds = self.configparser.getint('num_seeds')
        self.num_min_extracted_entities = self.configparser.getint(
            'num_min_extracted_entities')

        # generate pairs from csv file
        self.pd_foodon_pairs = self.generate_pairs()
        self.all_classes, self.all_entities = self.get_classes_and_entities()
        self.foodon_graph, self.graph_dict, self.graph_dict_flip = self.generate_graph(
        )
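Several of the examples here read values through getstr / getint / getfloat / getbool, optionally with a section argument. Below is a minimal sketch of such a wrapper around the standard library's configparser, assuming an .ini-style file and a default section; the method names mirror the calls in these examples, but the real implementation may differ.

import configparser


class ConfigParser:
    """Hypothetical .ini-backed wrapper (sketch, not the real class)."""

    def __init__(self, filepath, default_section='DEFAULT'):
        self.parser = configparser.ConfigParser()
        self.parser.read(filepath)
        self.default_section = default_section

    def getstr(self, key, section=None):
        return self.parser.get(section or self.default_section, key)

    def getint(self, key, section=None):
        return self.parser.getint(section or self.default_section, key)

    def getfloat(self, key, section=None):
        return self.parser.getfloat(section or self.default_section, key)

    def getbool(self, key, section=None):
        return self.parser.getboolean(section or self.default_section, key)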
Example No. 5
    def __init__(self,
                 theApkObj,
                 theAvdName,
                 decompressDir,
                 runHeadless,
                 theLogger=Logger()):
        Thread.__init__(self)
        # configParser
        self.configParser = ConfigParser()

        self.apkObj = theApkObj
        self.log = theLogger
        self.curDir = os.path.dirname(__file__)

        self.staticAnalyzer = None
        self.dynamicAnalyzer = None
        self.logcatAnalyzer = None

        self.startTimeStr = None
        self.endTimeStr = None

        self.emulator = None
        self.emulatorPort = 5554
        self.avdName = theAvdName
        self.runHeadless = runHeadless

        self.decompressPath = decompressDir
        self.logcatFile = None

        self.session = None

        self.cancelFlag = False  # Flag for canceling run
Example No. 6
    def __init__(self, config_filepath):
        """
        Class initializer.

        Inputs:
            config_filepath: (str) Configuration filepath.
        """
        self.configparser = ConfigParser(config_filepath)
Example No. 7
    def __init__(self, config_file):
        """
        Class initializer.

        Inputs:
        """
        self.configparser = ConfigParser(config_file)
        self.epoch_callback = EpochCallback()
        self.model = None
Example No. 8
    def load_from(config_path):
        with open(config_path) as f:
            config = yaml.load(f)
            parser = ConfigParser(config)
            parser._create_directories()

        task = Task.load_from(parser.task)
        dataset = Dataset.load_from(parser.dataset)
        model_config = config['model']
        label_helper = Label.load_from(parser.label)
        user = config['user']

        return LabelApp(task, dataset, label_helper, user, model_config,
                        parser)
Example No. 9
def main(config_path, epochs=3):
    # For pretraining, we do everything the same, except we replace the
    # dataset:judgements_file with model:pretrain_file.
    with open(config_path) as f:
        config = yaml.load(f)
        parser = ConfigParser(config)
        parser.dataset['judgements_file'] = parser.model['pretrain_file']

    task = Task.load_from(parser.task)
    dataset = PretrainJSONDataset(parser.dataset)
    model_config = config['model']
    label_helper = Label.load_from(parser.label)
    user = config['user']
    label_app = LabelApp(task, dataset, label_helper, user, model_config,
                         parser)
    label_app.trainer.load_existing()
    label_app.trainer.train_epochs(epochs=epochs)
Example No. 10
    def __init__(self, module: str, cli: str):
        self.current_env = os.environ.copy()
        self.json_data = ConfigParser(
            path="/home/vlad/infra/armature/armature/conf/modules.json"
        ).return_json()

        self.module = module
        self.cli = cli
        self.module_data = self.json_data['modules'][module]
Example No. 11
def main():
    """
    Main function.
    """
    # set log, parse args, and read configuration
    args = parse_argument()
    configparser = ConfigParser(args.config_file)
    set_logging(configparser.getstr('logfile'))

    # parse FoodOn
    parse_foodon = ParseFoodOn(configparser.getstr('foodon_parse_config'))
    classes_dict = parse_foodon.get_candidate_classes()
    classes_dict_skeleton, candidate_entities = parse_foodon.get_seeded_skeleton(
        classes_dict)

    # run
    scoring_manager = ScoringManager(classes_dict_skeleton, candidate_entities,
                                     configparser.getstr('scoring_config'))

    scoring_manager.run_iteration()
Example No. 12
def main():
    """
    Main function.
    """
    # parse args
    args = parse_argument()

    # load main config file and set logging
    main_config = ConfigParser(args.config_file)
    set_logging(log_file=main_config.get_str('log_file'))

    # initialize model manager object
    model_manager = ModelManager()

    # baseline / best classifiers
    baseline_classifier = main_config.get_str('baseline')
    best_classifier = main_config.get_str('classifier')

    # plot PR curve and print confusion matrix
    plot_pr_print_cm(baseline_classifier, best_classifier, main_config,
                     model_manager)
Example No. 13
def main():
    """
    Main function.
    """
    # parse args
    args = parse_argument()

    # load main config file and set logging
    main_config = ConfigParser(args.config_file)
    set_logging(log_file=main_config.get_str('log_file'))

    # initialize model manager object
    model_manager = ModelManager()

    # parse config
    classifier = main_config.get_str('classifier')
    pre_built_models_dir = os.path.join(main_config.get_str('pre_built_models_dir'), classifier)
    num_classifiers = main_config.get_int('num_classifiers')

    # we need to build the models first if they do not exist
    if not dir_exists(pre_built_models_dir):
        save_models(
            classifier,
            pre_built_models_dir,
            main_config,
            model_manager,
            num_classifiers)

    make_recommendation(
        classifier,
        pre_built_models_dir,
        main_config,
        model_manager,
        num_classifiers)
Example No. 14
def save_models(classifier,
                pre_built_models_dir,
                main_config,
                model_manager,
                num_classifiers):
    log.info('Pre-built model directory specified for %s does not exist.', classifier)
    log.info('Building models again.')

    # create directory
    create_dir(pre_built_models_dir)

    # load config parsers
    preprocess_config = ConfigParser(main_config.get_str('preprocess_config'))
    classifier_config = ConfigParser(main_config.get_str('classifier_config'))
    classifier_config.overwrite('classifier', classifier)

    # perform preprocessing
    X, y = model_manager.preprocess(preprocess_config, section=classifier)

    # select subset of features if requested
    selected_features = main_config.get_str_list('selected_features')
    if selected_features:
        log.info('Selecting subset of features: %s', selected_features)
        X = X[selected_features]

    # train multiple classifiers
    for i in range(num_classifiers):
        log.debug('Processing classifier %d/%s', i+1, num_classifiers)

        cmanager = ClassifierManager(classifier_config)
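        # CalibratedClassifierCV wraps the base classifier and calibrates its
        # predicted probabilities (Platt scaling via 'sigmoid') using 5-fold CV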
        clf = CalibratedClassifierCV(cmanager.get_classifier(), method='sigmoid', cv=5)
        clf.fit(X, y)

        save_pkl(
            clf,
            os.path.join(pre_built_models_dir, 'model_{}.pkl'.format(i)))
Example No. 15
def main():
    """
    Main function.
    """
    # set log, parse args, and read configuration
    set_logging()
    args = parse_argument()
    configparser = ConfigParser(args.config_file)

    # load data to train with
    sentence_column = configparser.getstr('sentence_column')

    pd_data = pd.read_csv(configparser.getstr('input_filepath'), sep='\t')

    pd_data.fillna('', inplace=True)
    pd_data = pd_data[pd_data[sentence_column] != '']

    # use specified column as sentences
    sentences = pd_data[sentence_column].tolist()
    sentences = [sentence.split() for sentence in sentences]

    # init word2vec manager
    w2vm = Word2VecManager(args.config_file)

    # start training and load pre-training data if prompted
    if configparser.getbool('pre_train'):
        pretrained = configparser.getstr('pre_trained_vectors')
    else:
        pretrained = None

    w2vm.train(sentences, pretrained=pretrained)

    # save word embeddings and model
    w2vm.save_model(configparser.getstr('model_saveto'))
    w2vm.save_vectors(configparser.getstr('vectors_saveto'))
    w2vm.save_loss(configparser.getstr('loss_saveto'))
Example No. 16
File: scoring.py Project: IBPA/LOVE
    def __init__(self, candidate_classes_info, candidate_entities,
                 scoring_config):
        """
        Class initializer.
        """
        # config parser
        if isinstance(scoring_config, str):
            scoring_config = ConfigParser(scoring_config)

        # save arguments
        self.candidate_classes_info = candidate_classes_info
        self.candidate_entities = candidate_entities

        # parse config file
        self.alpha = scoring_config.getfloat('alpha')
        self.num_mapping_per_iteration = scoring_config.getint(
            'num_mapping_per_iteration')
        self.initial_siblings_scores = scoring_config.getstr(
            'initial_siblings_scores')
        self.initial_parents_scores = scoring_config.getstr(
            'initial_parents_scores')
        self.pairs_filepath = scoring_config.getstr('pairs_filepath')
        self.populated_filepath = scoring_config.getstr('populated_filepath')
        self.preprocess_config_filepath = scoring_config.getstr(
            'preprocess_config')
        self.similarity_method = scoring_config.getstr('similarity_method')

        log.debug('alpha: %f', self.alpha)
        log.debug('num_mapping_per_iteration: %d',
                  self.num_mapping_per_iteration)
        log.debug('initial_siblings_scores: %s', self.initial_siblings_scores)
        log.debug('initial_parents_scores: %s', self.initial_parents_scores)
        log.debug('pairs_filepath: %s', self.pairs_filepath)
        log.debug('populated_filepath: %s', self.populated_filepath)
        log.debug('similarity_method: %s', self.similarity_method)

        # preprocess manager
        self.fpm = FdcPreprocessManager(self.preprocess_config_filepath)

        # number of candidate classes & entities
        self.num_candidate_classes = len(self.candidate_classes_info)
        self.num_candidate_entities = len(self.candidate_entities)

        log.debug('Number of candidate classes: %d',
                  self.num_candidate_classes)
        log.debug('Number of candidate entities: %d',
                  self.num_candidate_entities)

        # extract the seeded entities to make complete list of entities
        seed_entities = self._unpack_sublist(
            [x[1] for _, x in self.candidate_classes_info.items()])
        self.all_entity_labels = list(
            set(self.candidate_entities + seed_entities))

        # all labels of candidate class
        self.candidate_classes_label = list(self.candidate_classes_info.keys())

        # complete list of class labels
        other_classes = self._unpack_sublist(
            [x[0] for _, x in self.candidate_classes_info.items()], depth=2)
        self.all_class_labels = list(
            set(self.candidate_classes_label + other_classes))

        # calculate embedding lookup table for class / entity labels
        if 'we_' in self.similarity_method:
            self.keyed_vectors = KeyedVectors.load_word2vec_format(
                scoring_config.getstr('word_embeddings'))
            # self.keyed_vectors.save('./output/glove_wiki_embeddings.bin')
            # self.keyed_vectors = KeyedVectors.load('./output/glove_wiki_embeddings.bin')

            self.pd_class_label_embeddings = self._calculate_label_embeddings(
                self.all_class_labels)
            self.pd_entity_label_embeddings = self._calculate_label_embeddings(
                self.all_entity_labels)

            # save_pkl(self.pd_class_label_embeddings, './output/pd_class_label_embeddings.pkl')
            # save_pkl(self.pd_entity_label_embeddings, './output/pd_entity_label_embeddings.pkl')
            # sys.exit()

            # self.pd_class_label_embeddings = load_pkl('./output/pd_class_label_embeddings.pkl')
            # self.pd_entity_label_embeddings = load_pkl('./output/pd_entity_label_embeddings.pkl')

        # do initial calculation of the scores
        self.pd_siblings_scores, self.pd_parents_scores = self._calculate_initial_scores(
        )
Example No. 17
    print("==> Stitching frames to create final output videos")
    stitch_videos(model_output_path, cfg.paths.frames, predictions_dict)

    # Delete frame directories
    for video in predictions_dict.keys():
        directory_path = f"{model_output_path}/{video}"
        shutil.rmtree(directory_path)

    print("==> Generating confusion matrix")
    metrics.compute_confusion_matrix(predictions_dict, classes,
                                     model_output_path)

    # Upload to AWS S3 only if bucket name is given in config
    if cfg.bucket:
        print("==> Creating zip file")
        zip_videos(model_output_path, cfg.name)

        print("==> Uploading to AWS S3")
        response = upload_videos(model_output_path, cfg.name, cfg.bucket)

        if response:
            print(f"Output download link: {response}")

    total_time = datetime.datetime.now() - start_time
    print(f"Total time: {total_time.total_seconds()}")


if __name__ == "__main__":
    cfg = ConfigParser().config
    main(cfg)
Example No. 18
def make_recommendation(classifier,
                        pre_built_models_dir,
                        main_config,
                        model_manager,
                        num_classifiers):
    if not dir_exists(pre_built_models_dir):
        raise RuntimeError('Pre-built model directory does not exist!')

    log.info('Using pre-built model directory: %s', pre_built_models_dir)

    # load config parsers
    preprocess_config = ConfigParser(main_config.get_str('preprocess_recommender_config'))
    classifier_config = ConfigParser(main_config.get_str('classifier_config'))
    classifier_config.overwrite('classifier', classifier)

    # perform preprocessing
    X, y = model_manager.preprocess(preprocess_config, section=classifier, final_model=True)

    # select subset of features if requested
    selected_features = main_config.get_str_list('selected_features')
    if selected_features:
        log.info('Selecting subset of features: %s', selected_features)
        X = X[selected_features]

    def _revert_column(pd_data):
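        # assumes a binary column: replace each value with the other observed
        # value, effectively flipping the treatment assignment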
        values = list(set(pd_data.tolist()))

        replace_dict = {}
        for value in values:
            replace_dict[value] = list(filter(lambda a: a != value, values))[0]

        return pd_data.replace(to_replace=replace_dict)

    # get test data and its inverse for TRT column
    X_inv = X.copy()
    X_inv['TRT'] = _revert_column(X_inv['TRT'])
    pos_trt_idx = (X['TRT'] == 1.0)

    y_probs = []
    y_probs_inv = []
    for i in range(num_classifiers):
        log.debug('Processing classifier %d/%s', i+1, num_classifiers)

        classifier_filepath = os.path.join(pre_built_models_dir, 'model_{}.pkl'.format(i))
        log.debug('Loading classifier: %s', classifier_filepath)
        clf = load_pkl(classifier_filepath)

        y_probs.append(clf.predict_proba(X)[:, 1])
        y_probs_inv.append(clf.predict_proba(X_inv)[:, 1])

    y_probs = pd.DataFrame(y_probs).T
    y_probs.index = X.index
    y_probs_inv = pd.DataFrame(y_probs_inv).T
    y_probs_inv.index = X.index

    # make recommendation
    y_probs_avg = y_probs.mean(axis=1)
    y_probs_inv_avg = y_probs_inv.mean(axis=1)

    y_probs_avg_diff = y_probs_avg - y_probs_inv_avg
    inv_minus_pos = y_probs_inv_avg - y_probs_avg
    y_probs_avg_diff[~pos_trt_idx] = inv_minus_pos[~pos_trt_idx]

    pval = pd.Series(index=X.index)
    for index, _ in pval.items():
        _, pval[index] = ttest_rel(y_probs.loc[index], y_probs_inv.loc[index])

    # TODO: calculate y_probs_trt; right now it is y_probs
    pd_concat = pd.concat(
        [pos_trt_idx, y_probs_avg, y_probs_inv_avg, y_probs_avg_diff, pval], axis=1)
    pd_concat.columns = ['pos_trt', 'y_probs_avg', 'y_probs_inv_avg', 'y_probs_avg_diff', 'pval']

    print(pd_concat)
Example No. 19
# -*- coding: utf-8 -*-
from db.db_worker import DBWorker
from db.sql_requests import SQLRequests
from utils.config_parser import ConfigParser
from utils.settings_parser import SettingsParser
import logging

if __name__ == "__main__":

    config = ConfigParser().get_config_settings()
    settings = SettingsParser().get_test_settings()

    logging.basicConfig(filename=config['log_filename'],
                        level=logging.INFO,
                        format=config['log_format'])
    logger = logging.getLogger()

    db_worker = DBWorker(config['host'], config['user'], config['password'],
                         config['database'])
    sql_request = SQLRequests(db_worker, settings)

    try:
        db_worker.connect()

        logger.info('Step 1')

        sql_request.get_min_working_time()

        logger.info('-' * 200)

        logger.info('Step 2')
Example No. 20
def main():
    """
    Main function.
    """
    # parse args
    args = parse_argument()

    # load main config file and set logging
    main_config = ConfigParser(args.config_file)
    set_logging(log_file=main_config.get_str('log_file'))

    # initialize model manager object
    model_manager = ModelManager()

    # perform analysis on these classifiers
    classifiers = main_config.get_str_list('classifier')

    # do prediction
    classifiers_ys = {}
    for classifier in classifiers:
        log.info('Running model for classifier \'%s\'', classifier)

        # load config parsers
        preprocess_config = ConfigParser(
            main_config.get_str('preprocess_config'))
        classifier_config = ConfigParser(
            main_config.get_str('classifier_config'))

        # perform preprocessing
        X, y = model_manager.preprocess(preprocess_config, section=classifier)

        # run classification model
        classifier_config.overwrite('classifier', classifier)

        X = model_manager.feature_selector(X, y, classifier_config)

        score_avg, score_std, ys = model_manager.run_model_cv(
            X, y, 'f1', classifier_config)

        classifiers_ys[classifier] = ys

    # plot PR curve
    fig = plt.figure()

    lines = []
    labels = []
    for classifier, ys in classifiers_ys.items():
        y_trues, y_preds, y_probs = ys

        y_probs_1 = tuple(y_prob[1].to_numpy() for y_prob in y_probs)
        line, label = plot_pr(y_trues, y_probs_1, classifier)

        lines.append(line)
        labels.append(label)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('PR Curve')
    plt.legend(lines, labels, loc='lower right', prop={'size': 8})

    save_figure(
        fig,
        os.path.join(main_config.get_str('visualization_dir'), 'pr_curve.png'))

    # plot ROC curve
    fig = plt.figure()

    lines = []
    labels = []
    for classifier, ys in classifiers_ys.items():
        y_trues, y_preds, y_probs = ys

        y_probs_1 = tuple(y_prob[1].to_numpy() for y_prob in y_probs)
        line, label = plot_roc(y_trues, y_probs_1, classifier)

        lines.append(line)
        labels.append(label)

    # plt.plot([0, 1], [0, 1], color='k', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(lines, labels, loc='lower right', prop={'size': 8})

    save_figure(
        fig,
        os.path.join(main_config.get_str('visualization_dir'),
                     'roc_curve.png'))
Example No. 21
def main():
    """
    Main function.
    """
    # parse args
    args = parse_argument()

    # load main config file and set logging
    main_config = ConfigParser(args.config_file)
    set_logging(log_file=main_config.get_str('log_file'))

    # initialize model manager object
    model_manager = ModelManager()

    # run models for all possible combination of preprocessing
    scale_modes = main_config.get_str_list('scale_mode')
    mvi_modes = main_config.get_str_list('mvi_mode')
    outlier_modes = main_config.get_str_list('outlier_mode')
    classifiers = main_config.get_str_list('classifier')

    classifier_score_dict = {classifier: 0 for classifier in classifiers}
    classifier_best_combination_dict = {
        classifier: None
        for classifier in classifiers
    }
    all_combinations = [scale_modes, mvi_modes, outlier_modes, classifiers]
    all_combinations = list(itertools.product(*all_combinations))
    failed_combinations = []

    for idx, combination in enumerate(all_combinations):
        # unpack the tuple
        scale_mode = combination[0]
        mvi_mode = combination[1]
        outlier_mode = combination[2]
        classifier = combination[3]

        # log current combination
        combination_str_joined = ', '.join(list(combination))
        log.info('Running grid search %d/%d: (%s)', idx + 1,
                 len(all_combinations), combination_str_joined)

        # some classifiers must use minmax scaler
        if classifier in ['MultinomialNB', 'CategoricalNB'] \
                and scale_mode != 'minmax':
            log.info('Skipping this combination...')
            continue

        # overwrite the config file using the current combination
        preprocess_config = ConfigParser(
            main_config.get_str('preprocess_config'))
        classifier_config = ConfigParser(
            main_config.get_str('classifier_config'))

        preprocess_config.overwrite('scale_mode', scale_mode)
        preprocess_config.overwrite('mvi_mode', mvi_mode)
        preprocess_config.overwrite('outlier_mode', outlier_mode)
        classifier_config.overwrite('classifier', classifier)

        # perform preprocessing
        X, y = model_manager.preprocess(preprocess_config)

        # run classification model
        try:
            score = model_manager.grid_search(
                X, y, main_config.get_str('optimize_scoring'),
                classifier_config,
                main_config.get_str('updated_classifier_config'))
        except (IndexError, ValueError) as e:
            failed_combinations.append(combination_str_joined)
            log.error(e)
            continue

        # update the best preprocessing combination
        if classifier_score_dict[classifier] < score:
            classifier_score_dict[classifier] = score
            classifier_best_combination_dict[
                classifier] = combination_str_joined

    log.info('Best %s score for each classifier: %s',
             main_config.get_str('optimize_scoring'), classifier_score_dict)

    log.info(
        'Preprocessing combination of the best %s score for each classifier: %s',
        main_config.get_str('optimize_scoring'),
        classifier_best_combination_dict)

    log.info('%d failed combinations: %s', len(failed_combinations),
             failed_combinations)
Example No. 22
import click
from modules.executor import Executor
from utils.config_parser import ConfigParser

json_data = ConfigParser(
    path="/home/vlad/infra/armature/armature/conf/modules.json").return_json()

MODULE = "packer"


@click.group()
def cli():
    pass


@cli.command()
def prepare_template():
    """Validate configuration file"""
    click.echo('prepare_template')

    with Executor(module=MODULE, cli="prepare_template") as cli_executor:
        cli_executor.run(cli="prepare_template", use_docker_run_wrapper=True)


@cli.command()
def validate_template():
    """Validate configuration file"""
    click.echo('validate_template')

    with Executor(module=MODULE, cli="validate_template") as cli_executor:
        cli_executor.run(cli="validate_template", use_docker_run_wrapper=True)
Example No. 23
class FdcPreprocessManager:
    """
    Class for preprocessing the FDC data.
    """
    def __init__(self, config_filepath):
        """
        Class initializer.

        Inputs:
            config_filepath: (str) Configuration filepath.
        """
        self.configparser = ConfigParser(config_filepath)

    def _load_synonym_map(self, section='filter'):
        pd_map = pd.read_csv(self.configparser.getstr('synonym_map', section),
                             sep='\t',
                             index_col='from')

        return pd_map['to'].to_dict()

    def _map_synonyms(self, text, table):
        regex_str = '|'.join(r'\b%s\b' % re.escape(s) for s in table)
        return re.sub(regex_str, lambda x: table[x.group(0)], text)

    def _generate_custom_stopwords(self, section='filter'):
        """
        (Private) Generate custom stopwords by adding user-specified stopwords
        to, or removing them from, gensim's default stopwords.

        Inputs:
            section: (str, optional) Section name of the .ini file.

        Returns:
            (frozenset) New updated stopwords.
        """
        my_stopwords = list(gpp.STOPWORDS)

        # stopwords to add
        to_add_filename = self.configparser.getstr('stopwords_to_add', section)

        with open(to_add_filename, 'r') as file:
            to_add_list = file.read().splitlines()

        if len(to_add_list) > 0:
            log.info('Adding custom stopwords %s', to_add_list)
        else:
            log.info('Not adding any custom stopwords')

        # stopwords to remove
        to_remove_filename = self.configparser.getstr('stopwords_to_remove',
                                                      section)

        with open(to_remove_filename, 'r') as file:
            to_remove_list = file.read().splitlines()

        if len(to_remove_list) > 0:
            log.info('Removing stopwords %s', to_remove_list)
        else:
            log.info('Not removing any custom stopword')

        # add and remove stopwords
        my_stopwords.extend(to_add_list)
        my_stopwords = [x for x in my_stopwords if x not in to_remove_list]

        return frozenset(my_stopwords)

    def _custom_remove_stopwords(self, s, stopwords):
        """
        (Private) Custom remove stopwords function.

        Inputs:
            s: (str) String to process.
            stopwords: (frozenset) Custom stopwords.

        Returns:
            (str) Preprocessed string with stopwords removed.
        """
        s = gensim_utils.to_unicode(s)
        return " ".join(w for w in s.split() if w not in stopwords)

    def _custom_lemmatize(self, text):
        result = gensim_utils.lemmatize(text)
        result = b' '.join(result).decode('utf-8')
        result = re.sub(r'/[^\s]+', '', result)

        return result

    def _build_custom_filter_list(self, section='filter'):
        """
        (Private) Build list of filters based on the configuration file
        that will be applied by gpp.preprocess_string().

        Inputs:
            section: (str, optional) Section name of the .ini file.

        Returns:
            custom_filters: (list) List of functions.
        """
        custom_filters = []

        if self.configparser.getbool('lower', section):
            log.debug('Converting to lower cases')
            custom_filters.append(lambda x: x.lower())

        if self.configparser.getbool('map_synonym', section):
            log.debug('Mapping synonym')
            map_table = self._load_synonym_map(section)
            custom_filters.append(lambda x: self._map_synonyms(x, map_table))

        if self.configparser.getbool('strip_punctuation', section):
            log.debug('Stripping punctuation')
            custom_filters.append(gpp.strip_punctuation)

        if self.configparser.getbool('strip_multiple_whitespaces', section):
            log.debug('Stripping multiple whitespaces')
            custom_filters.append(gpp.strip_multiple_whitespaces)

        if self.configparser.getbool('strip_numeric', section):
            log.debug('Stripping numeric')
            custom_filters.append(gpp.strip_numeric)

        if self.configparser.getbool('remove_stopwords', section):
            log.debug('Removing stopwords')
            stopwords = self._generate_custom_stopwords(section)
            custom_filters.append(
                lambda x: self._custom_remove_stopwords(x, stopwords))

        if self.configparser.getbool('strip_short', section):
            minsize = self.configparser.getint('strip_short_minsize', section)
            log.debug('Stripping words shorter than %d', minsize)
            custom_filters.append(
                lambda x: gpp.strip_short(x, minsize=minsize))

        if self.configparser.getbool('lemmatize', section):
            log.debug('Lemmatizing text')
            custom_filters.append(self._custom_lemmatize)

        return custom_filters

    def _generate_phrase(self, pd_data, load_model=False, section='phrase'):
        """
        (Private) Generate phrase using the gensim Phrase detection module.

        Inputs:
            pd_data: (pd.Series) Data which will be used to generate phrases.
            section: (str, optional) Section name of the .ini file.

        Returns:
            pd_data: (pd.Series) Input data but using phrases.
        """
        if not self.configparser.getbool('generate_phrase', section):
            log.info('Skipping phrase generation...')
            return pd_data

        if load_model:
            model_filepath = self.configparser.getstr('phrase_model', section)
            model = Phraser.load(model_filepath)

            # apply phrase model
            log.info('Applying loaded phrase model...')
            pd_data = pd_data.apply(lambda x: model[x], convert_dtype=False)
        else:
            log.info('Generating new phrases...')

            # this is our training data
            sentences = pd_data.tolist()

            # detect phrases using the configuration
            model = Phrases(
                sentences,
                min_count=self.configparser.getint('min_count', section),
                threshold=self.configparser.getfloat('threshold', section),
                max_vocab_size=self.configparser.getint(
                    'max_vocab_size', section),
                progress_per=self.configparser.getint('progress_per', section),
                scoring=self.configparser.getstr('scoring', section))

            # apply trained model to generate phrase
            log.info('Applying phrase model...')
            pd_data = pd_data.apply(lambda x: model[x], convert_dtype=False)

            # save phrase model
            model_filepath = self.configparser.getstr('phrase_model', section)

            log.info('Saving phrase model to \'%s\'...', model_filepath)
            model.save(model_filepath)

            # dump phrase and its score as text
            phrase_score_list = []
            for phrase, score in model.export_phrases(sentences):
                phrase_score_list.append([phrase.decode('utf-8'), score])

            pd_phrase_score = pd.DataFrame(phrase_score_list,
                                           columns=['phrase', 'score'])
            pd_phrase_score.drop_duplicates(subset='phrase', inplace=True)

            export_filepath = self.configparser.getstr('phrase_dump_filename',
                                                       section)

            log.info('Dumping phrases to \'%s\'...', export_filepath)
            pd_phrase_score.to_csv(export_filepath, sep='\t', index=False)

        return pd_data

    def preprocess_column(self, pd_data, load_model=False):
        """
        Preprocess specified column.

        Inputs:
            pd_data: (pd.Series) Input data to preprocess.

        Returns:
            pd_data: (pd.Series) Preprocessed data.
        """
        # preprocess using set of filters
        custom_filters = self._build_custom_filter_list()

        log.info('Applying preprocess filters to the %s...', pd_data.name)
        pd_data = pd_data.apply(
            lambda x: gpp.preprocess_string(x, custom_filters),
            convert_dtype=False)

        # generate phrase based on the configuration
        pd_data = self._generate_phrase(pd_data, load_model=load_model)

        # join the list of words into space delimited string
        pd_data = pd_data.apply(lambda x: ' '.join(x))

        return pd_data

    def get_vocabs(self, pd_data):
        log.info('Getting all vocabs...')

        vocabs = []
        for row in pd_data.tolist():
            vocabs.extend(row.split(' '))

        vocabs = list(set(vocabs))
        vocabs = sorted(vocabs, key=str.lower)

        log.info('Got %d unique vocabularies', len(vocabs))

        return vocabs
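For reference, the filter list assembled by _build_custom_filter_list above is consumed by gensim's preprocess_string, which applies each filter in order and then splits the result into tokens. A small standalone usage sketch using only gensim's built-in filters:

from gensim.parsing import preprocessing as gpp

custom_filters = [
    lambda x: x.lower(),
    gpp.strip_punctuation,
    gpp.strip_multiple_whitespaces,
    gpp.strip_numeric,
    gpp.remove_stopwords,
]

tokens = gpp.preprocess_string('Raw FDC description, 100% Orange Juice!', custom_filters)
print(tokens)  # e.g. ['raw', 'fdc', 'description', 'orange', 'juice']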
Example No. 24
    def __init__(self, config_filepath):
        configparser = ConfigParser(config_filepath)
        gt_ontology_filename = configparser.getstr('gt_entitymapping')
        self.gt_ontology = load_pkl(gt_ontology_filename)
Example No. 25
class Word2VecManager():
    """
    """
    def __init__(self, config_file):
        """
        Class initializer.

        Inputs:
        """
        self.configparser = ConfigParser(config_file)
        self.epoch_callback = EpochCallback()
        self.model = None

    def train(self, sentences, pretrained=None):
        self.model = Word2Vec(
            size=self.configparser.getint('size'),
            window=self.configparser.getint('window'),
            min_count=self.configparser.getint('min_count'),
            workers=self.configparser.getint('workers'),
            callbacks=[self.epoch_callback])

        log.info('Building vocabularies...')
        self.model.build_vocab(sentences)
        total_examples = self.model.corpus_count

        if pretrained:
            original_vocabs = self.model.wv.vocab.keys()
            pretrained_vocabs = KeyedVectors.load_word2vec_format(pretrained).vocab.keys()
            common_vocabs = list(set(original_vocabs) & set(pretrained_vocabs))
            log.info('Intersecting %d common vocabularies for transfer learning', len(common_vocabs))

            if self.configparser.getbool('pre_train_update_vocab'):
                log.info('Updating vocabularies using vocabs from pre-trained data')
                self.model.build_vocab([list(pretrained_vocabs)], update=True, min_count=1)

            self.model.intersect_word2vec_format(pretrained, lockf=1.0)

        self.model.train(
            sentences,
            total_examples=total_examples,
            epochs=self.configparser.getint('epochs'),
            compute_loss=True)

    def save_model(self, filepath):
        assert self.model is not None

        log.info('Saving model to %s...', filepath)
        self.model.save(filepath)

    def save_vectors(self, filepath):
        assert self.model is not None

        log.info('Saving word embeddings to %s...', filepath)
        self.model.wv.save_word2vec_format(filepath)

    def save_loss(self, filepath):
        assert self.model is not None

        # sorted by key, return a list of tuples
        lists = sorted(self.epoch_callback.loss.items())

        # unpack a list of pairs into two tuples
        x, y = zip(*lists)

        plt.plot(x, y)
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.savefig(filepath)
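A version note, hedged: the calls above (Word2Vec(size=...), model.wv.vocab, intersect_word2vec_format) follow the gensim 3.x API. Under gensim 4.x at least the following renames would be expected; treat this as a sketch against an assumed gensim >= 4.0 install.

from gensim.models import Word2Vec

# gensim 4.x spelling of the constructor arguments used above
model = Word2Vec(
    vector_size=100,  # was `size` in gensim 3.x
    window=5,
    min_count=1,
    workers=4)

model.build_vocab([['example', 'sentence'], ['another', 'example']])
vocabs = model.wv.key_to_index.keys()  # was `model.wv.vocab.keys()` in 3.x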
Example No. 26
class SandDroid():
    def __init__(self, theConfigFilePath, theLogger=Logger()):
        # parse config file
        self.configParser = ConfigParser()
        # self.configParser.parseFile(theConfigFilePath)
        self.configParser.generateDirectories()

        self.log = theLogger

        # keytool path to parse apk's signature
        self.keytoolPath = None

        # sanddroid directories
        self.mainDir = os.path.dirname(__file__)

        self.appList = []  # list of apk files (full paths)
        self.runningApps = []  # list of apk files currently being analyzed

        self.runHeadless = False
        self.emulatorStartPort = 5554

        self.numThreads = 1
        self.maxThreadRuntime = 600

        # control running threads
        self.threadLogFileList = []  # list to store thread log file path
        self.numFinishedApps = 0  # number of analyzed apps
        self.numRunningThreads = 0  # number of running threads
        self.threadList = []  # list of threads, size=numThreads
        self.threadActiveMask = []  # bitmask indicating whether each thread is active, size=numThreads

        self.avdsheartbeat = (0, 0, 0, 0)  # number of times each avd was used in one cycle
        self.avdheartbeat = 0
        self.startTime = datetime.datetime.now()

    # ================================================================================
    # Helpers
    # ================================================================================
    def __isConfigValid(self):
        # check configure
        javaHome = os.environ.get('JAVA_HOME')
        if not javaHome:
            self.log.error('Java environment not detected')
            return False
        else:
            keytoolPath = os.path.join(javaHome, 'bin', 'keytool')
            if not os.path.exists(keytoolPath):
                self.log.error('Java keytool does not exist')
                return False
            else:
                self.keytoolPath = keytoolPath

        for desc, directory in self.configParser.getDirectoies().items():
            if not directory or not os.path.exists(directory):
                self.log.error('%s doesn\'t exist!' % desc)
                return False

        if (not self.configParser.getDbUsr()) and (not self.configParser.getDbHost()) \
                and (not self.configParser.getDbPort()) and (not self.configParser.getDbPswd()) \
                and (not self.configParser.getDbName()):
            return False

        return True

    def _getLogDir(self):
        """
        Get log directory
        """
        logRootDir = self.configParser.getLogDir()
        logDir = '%s/%s-%s' % (logRootDir, Utils.getDateAsString(
            self.startTime), Utils.getTimeAsString(self.startTime))
        return logDir

    def _createLogDir(self, logDir):
        """
        Create log directory
        """
        if not os.path.exists(logDir):
            try:
                os.makedirs(logDir)
            except OSError as e:
                print(e)
Example No. 27
class ParseFoodOn:
    """
    Class for parsing FoodOn.
    """
    def __init__(self, config_filepath):
        """
        Class initializer.

        Inputs:
            config_filepath: (str) Configuration filepath.
        """
        self.configparser = ConfigParser(config_filepath)

        # read configuration file
        self.filepath = self.configparser.getstr('filepath')
        self.full_ontology_pkl = self.configparser.getstr('full_ontology_pkl')
        self.candidate_ontology_pkl = self.configparser.getstr(
            'candidate_ontology_pkl')
        self.skeleton_and_entities_pkl = self.configparser.getstr(
            'skeleton_and_entities_pkl')
        self.overwrite_pkl = self.configparser.getbool('overwrite_pickle_flag')
        self.outputFoodOn = self.configparser.getstr('outputFoodOn')

        self.num_seeds = self.configparser.getint('num_seeds')
        self.num_min_extracted_entities = self.configparser.getint(
            'num_min_extracted_entities')

        # generate pairs from csv file
        self.pd_foodon_pairs = self.generate_pairs()
        self.all_classes, self.all_entities = self.get_classes_and_entities()
        self.foodon_graph, self.graph_dict, self.graph_dict_flip = self.generate_graph(
        )

    def generate_graph(self):
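        # graph_dict maps each class label to an integer node id and
        # graph_dict_flip maps the id back to its label; the DiGraph below is
        # built over these integer ids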
        graph_dict = {k: v for v, k in enumerate(self.all_classes)}
        graph_dict_flip = {v: k for v, k in enumerate(self.all_classes)}

        G = nx.DiGraph()

        for _, row in self.pd_foodon_pairs.iterrows():
            if row['Parent'] in self.all_classes and row[
                    'Child'] in self.all_classes:
                node_from = graph_dict[row['Parent']]
                node_to = graph_dict[row['Child']]
                G.add_edge(node_from, node_to)

        return G, graph_dict, graph_dict_flip

    def get_classes_and_entities(self):
        all_classes = self.pd_foodon_pairs['Parent'].tolist()
        all_classes = list(set(all_classes))
        all_classes.sort()
        log.debug('Found %d classes.', len(all_classes))

        child = self.pd_foodon_pairs['Child'].tolist()
        child = list(set(child))
        child.sort()
        all_entities = [c for c in child if c not in all_classes]
        log.debug('Found %d entities.', len(all_entities))

        return all_classes, all_entities

    def generate_pairs(self):
        log.info('Generating pairs of FoodOn.')

        if file_exists(self.outputFoodOn) and not self.overwrite_pkl:
            log.info('Using pre-generated pairs file.')
            return pd.read_csv(self.outputFoodOn, sep='\t')

        # 1.Read specified columns from FoodON.csv file
        foodon = pd.read_csv(
            self.filepath, usecols=['Class ID', 'Parents', 'Preferred Label'])

        # 2.Create dictionary of URI and ClassLabel
        labels_tmp = foodon[["Class ID", "Preferred Label"]].copy()
        self.labels = labels_tmp.set_index(
            'Class ID')['Preferred Label'].to_dict()

        # 3.Create data frame with columns - child and all its parents
        foodonOrigDF = (foodon[[
            "Class ID", "Parents"
        ]].copy()).rename(columns={'Class ID': 'Child'})

        # 4.Split above DF into pairs of Child-Parent
        pairs = []
        for _, row in foodonOrigDF.iterrows():
            parents = str(row['Parents'])
            parentList = parents.split("|")
            for pClass in parentList:
                child = str(row['Child'])
                pairs.append([child, pClass])
        foodonDF = pd.DataFrame(pairs, columns=['Child', 'Parent'])
        foodonDF = self.filter_ontology(
            foodonDF, 'http://purl.obolibrary.org/obo/FOODON_00001872')
        foodonDF = self.get_subtree(
            foodonDF, 'http://purl.obolibrary.org/obo/FOODON_00001002')

        # In foodonDF, replace URI by label
        for idx, pair in foodonDF.iterrows():
            pair['Child'] = self.labels[pair['Child']]
            if pair['Parent'] in self.labels:
                pair['Parent'] = self.labels[pair['Parent']]

        foodonDF.drop_duplicates(inplace=True, ignore_index=True)
        foodonDF.to_csv(self.outputFoodOn, sep='\t', index=False)

        return foodonDF

    def filter_ontology(self, dfObj, classname):
        # Remove class and its children from the ontology.
        # Works only if the children are leaf nodes.
        indexNames = dfObj[dfObj['Parent'] == classname].index
        dfObj.drop(indexNames, inplace=True)
        indexNames = dfObj[dfObj['Child'] == classname].index
        dfObj.drop(indexNames, inplace=True)

        return dfObj

    def get_subtree(self, df, rootclass):
        subtreeDF, nextlevelclasses = self.traverse_next_level(
            df, ['http://purl.obolibrary.org/obo/FOODON_00001002'])

        while (len(nextlevelclasses) > 0):
            pairsDF, nextlevelclasses = self.traverse_next_level(
                df, nextlevelclasses)
            subtreeDF = pd.concat([subtreeDF, pairsDF], ignore_index=True)

        return subtreeDF

    def traverse_next_level(self, df, classnames):
        nextlevel = []
        subtree_pairs = []
        for parent in classnames:
            selectedPairs = df[df['Parent'] == parent]
            for _, pair in selectedPairs.iterrows():
                subtree_pairs.append([pair['Child'], pair['Parent']])
                # check whether the child is itself a parent (i.e. not a leaf node)
                ifparent = df[df['Parent'] == pair['Child']]
                if not ifparent.empty:
                    nextlevel.append(pair['Child'])

        subtreeDF = pd.DataFrame(subtree_pairs, columns=['Child', 'Parent'])

        return (subtreeDF, nextlevel)

    def get_all_classes_dict(self):
        """
        Get all candidate classes.
        """
        log.info('Generating dictionary of all classes.')

        if file_exists(self.full_ontology_pkl) and not self.overwrite_pkl:
            log.info('Using pre-generated full classes dictionary file.')
            return load_pkl(self.full_ontology_pkl)

        full_classes_dict = {}
        for class_label in self.all_classes:
            pd_match = self.pd_foodon_pairs[self.pd_foodon_pairs['Parent'] ==
                                            class_label]
            children = pd_match['Child'].tolist()
            children_entities = [c for c in children if c in self.all_entities]

            node_from = self.graph_dict['foodon product type']
            node_to = self.graph_dict[class_label]

            paths = []
            if class_label == 'foodon product type':
                paths.append(tuple(['foodon product type']))
            else:
                for path in nx.all_simple_paths(self.foodon_graph,
                                                source=node_from,
                                                target=node_to):
                    translated_path = [self.graph_dict_flip[p] for p in path]
                    paths.append(tuple(translated_path[::-1]))

            full_classes_dict[class_label] = (paths, children_entities)

        save_pkl(full_classes_dict, self.full_ontology_pkl)

        return full_classes_dict

    def get_candidate_classes(self):
        """
        Get all candidate classes.
        """
        log.info('Generating dictionary of candidate classes.')

        if file_exists(self.candidate_ontology_pkl) and not self.overwrite_pkl:
            log.info(
                'Using pre-generated candidate classes dictionary file: %s',
                self.candidate_ontology_pkl)
            return load_pkl(self.candidate_ontology_pkl)

        candidate_classes_dict = {}
        for class_label in self.all_classes:
            pd_match = self.pd_foodon_pairs[self.pd_foodon_pairs['Parent'] ==
                                            class_label]
            children = pd_match['Child'].tolist()
            children_entities = [c for c in children if c in self.all_entities]

            if len(children_entities) > 0:
                node_from = self.graph_dict['foodon product type']
                node_to = self.graph_dict[class_label]

                paths = []
                if class_label == 'foodon product type':
                    paths.append(tuple(['foodon product type']))
                else:
                    for path in nx.all_simple_paths(self.foodon_graph,
                                                    source=node_from,
                                                    target=node_to):
                        translated_path = [
                            self.graph_dict_flip[p] for p in path
                        ]
                        paths.append(tuple(translated_path[::-1]))

                candidate_classes_dict[class_label] = (paths,
                                                       children_entities)

        log.info('Found %d candidate classes out of %d all classes.',
                 len(candidate_classes_dict.keys()), len(self.all_classes))

        save_pkl(candidate_classes_dict, self.candidate_ontology_pkl)

        return candidate_classes_dict

    def get_seeded_skeleton(self, candidate_classes_dict):
        log.info('Generating dictionary of skeleton candidate classes.')

        if file_exists(
                self.skeleton_and_entities_pkl) and not self.overwrite_pkl:
            log.info('Using pickled skeleton file: %s',
                     self.skeleton_and_entities_pkl)
            return load_pkl(self.skeleton_and_entities_pkl)

        skeleton_candidate_classes_dict = {}
        candidate_entities = []
        for candidate_class in candidate_classes_dict.keys():
            entities = candidate_classes_dict[candidate_class][1]

            if len(entities) <= self.num_seeds:
                temp_num_seeds = len(
                    entities) - self.num_min_extracted_entities

                if temp_num_seeds > 0:
                    seeds = random.sample(entities, temp_num_seeds)
                    candidate_entities.extend(list(set(entities) - set(seeds)))
                else:
                    seeds = entities.copy()
            else:
                seeds = random.sample(entities, self.num_seeds)
                candidate_entities.extend(list(set(entities) - set(seeds)))

            skeleton_candidate_classes_dict[candidate_class] = (
                candidate_classes_dict[candidate_class][0], seeds)

        candidate_entities = list(set(candidate_entities))
        candidate_entities.sort()

        log.info(
            'Found %d candidate entities to populate out of %d all entities.',
            len(candidate_entities), len(self.all_entities))

        return_value = (skeleton_candidate_classes_dict, candidate_entities)
        save_pkl(return_value, self.skeleton_and_entities_pkl)

        return return_value
Example No. 28
def main():
    """
    Main function.
    """
    # set log, parse args, and read configuration
    set_logging(log_level=log.INFO)
    args = parse_argument()
    configparser = ConfigParser(args.config_file)

    # need to apply preprocessing
    fpm = FdcPreprocessManager(configparser.getstr('preprocess_config'))

    # read FoodOn vocabs
    labels = []
    pd_foodon_pairs = pd.read_csv('./data/FoodOn/foodonpairs.txt', sep='\t')
    labels.extend(pd_foodon_pairs['Parent'].tolist())
    labels.extend(pd_foodon_pairs['Child'].tolist())
    labels = list(set(labels))

    log.info('Number of unique labels: %d', len(labels))

    processed_labels = fpm.preprocess_column(pd.Series(labels),
                                             load_model=True).tolist()
    queries = processed_labels.copy()
    for processed_label in processed_labels:
        queries.extend(processed_label.split())
    queries = list(set(queries))

    # get summaries of the wikipedia entry
    wm = WikipediaManager()

    # check whether to reuse the previous results
    if configparser.getbool('reuse_previous'):
        prev_summary = configparser.getstr('prev_summaries_filepath')
        prev_failed = configparser.getstr('prev_failed_filepath')
    else:
        prev_summary = None
        prev_failed = None

    pd_summary, pd_failed = wm.get_summary(queries,
                                           prev_summary=prev_summary,
                                           prev_failed=prev_failed)

    # save results
    log.info('Saving successfully pulled wiki summaries to %s',
             configparser.getstr('summaries_filepath'))

    pd_summary.to_csv(configparser.getstr('summaries_filepath'),
                      sep='\t',
                      index=False)

    log.info('Saving failed wiki queries to %s',
             configparser.getstr('failed_filepath'))

    pd_failed.to_csv(configparser.getstr('failed_filepath'),
                     sep='\t',
                     index=False)

    # preprocess columns
    pd_summary['summary_preprocessed'] = fpm.preprocess_column(
        pd_summary['summary'], load_model=True)

    output_filepath = configparser.getstr('preprocessed_output')

    log.info('Saving preprocessed wikipedia data to %s...', output_filepath)
    pd_summary.to_csv(output_filepath, sep='\t', index=False)
Example No. 29
def plot_pr_print_cm(baseline_classifier, best_classifier, main_config,
                     model_manager):
    classifiers_ys = {}

    for classifier in [baseline_classifier, best_classifier]:
        log.info('Running model for classifier \'%s\'', classifier)

        # load config parsers
        preprocess_config = ConfigParser(
            main_config.get_str('preprocess_config'))
        classifier_config = ConfigParser(
            main_config.get_str('classifier_config'))

        # perform preprocessing
        X, y = model_manager.preprocess(preprocess_config, section=classifier)

        # select subset of features if requested
        selected_features = main_config.get_str_list('selected_features')
        if selected_features:
            log.info('Selecting subset of features: %s', selected_features)
            X = X[selected_features]

        # run classification model
        classifier_config.overwrite('classifier', classifier)

        score_avg, score_std, ys = model_manager.run_model_cv(
            X, y, 'f1', classifier_config)

        classifiers_ys[classifier] = ys

    # confusion matrix
    (y_trues, y_preds, y_probs) = classifiers_ys[best_classifier]

    tn = []
    fp = []
    fn = []
    tp = []

    pred_pos = []
    pred_neg = []
    known_pos = []
    known_neg = []

    f1 = []
    precision = []
    recall = []
    specificity = []
    npv = []
    fdr = []
    accuracy = []
    for fold in range(len(y_trues)):
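        # sklearn's confusion_matrix(...).ravel() returns counts in the order
        # (tn, fp, fn, tp) for binary labels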
        cm_result = confusion_matrix(y_trues[fold], y_preds[fold]).ravel()
        tn.append(cm_result[0])
        fp.append(cm_result[1])
        fn.append(cm_result[2])
        tp.append(cm_result[3])

        pred_pos.append(cm_result[3] + cm_result[1])
        pred_neg.append(cm_result[2] + cm_result[0])
        known_pos.append(cm_result[3] + cm_result[2])
        known_neg.append(cm_result[1] + cm_result[0])

        f1.append(f1_score(y_trues[fold], y_preds[fold]))
        precision.append(
            precision_score(y_trues[fold], y_preds[fold], average='binary'))
        recall.append(
            recall_score(y_trues[fold], y_preds[fold], average='binary'))
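        # derived rates: specificity = tn / (tn + fp), NPV = tn / (tn + fn), FDR = fp / (fp + tp)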
        specificity.append(cm_result[0] / (cm_result[0] + cm_result[1]))
        npv.append(cm_result[0] / (cm_result[0] + cm_result[2]))
        fdr.append(cm_result[1] / (cm_result[1] + cm_result[3]))
        accuracy.append(accuracy_score(y_trues[fold], y_preds[fold]))

    tn_mean = np.mean(tn)
    fp_mean = np.mean(fp)
    fn_mean = np.mean(fn)
    tp_mean = np.mean(tp)
    pred_pos_mean = np.mean(pred_pos)
    pred_neg_mean = np.mean(pred_neg)
    known_pos_mean = np.mean(known_pos)
    known_neg_mean = np.mean(known_neg)
    f1_mean = np.mean(f1)
    precision_mean = np.mean(precision)
    recall_mean = np.mean(recall)
    specificity_mean = np.mean(specificity)
    npv_mean = np.mean(npv)
    fdr_mean = np.mean(fdr)
    accuracy_mean = np.mean(accuracy)

    tn_std = np.std(tn)
    fp_std = np.std(fp)
    fn_std = np.std(fn)
    tp_std = np.std(tp)
    pred_pos_std = np.std(pred_pos)
    pred_neg_std = np.std(pred_neg)
    known_pos_std = np.std(known_pos)
    known_neg_std = np.std(known_neg)
    f1_std = np.std(f1)
    precision_std = np.std(precision)
    recall_std = np.std(recall)
    specificity_std = np.std(specificity)
    npv_std = np.std(npv)
    fdr_std = np.std(fdr)
    accuracy_std = np.std(accuracy)

    log.info(
        'Confusion matrix (tp, fp, fn, tn): (%.2f±%.2f, %.2f±%.2f, %.2f±%.2f, %.2f±%.2f)',
        tp_mean, tp_std, fp_mean, fp_std, fn_mean, fn_std, tn_mean, tn_std)
    log.info('pred pos: %.2f±%.2f', pred_pos_mean, pred_pos_std)
    log.info('pred neg: %.2f±%.2f', pred_neg_mean, pred_neg_std)
    log.info('known pos: %.2f±%.2f', known_pos_mean, known_pos_std)
    log.info('known neg: %.2f±%.2f', known_neg_mean, known_neg_std)
    log.info('F1: %.2f±%.2f', f1_mean, f1_std)
    log.info('Precision: %.2f±%.2f', precision_mean, precision_std)
    log.info('Recall: %.2f±%.2f', recall_mean, recall_std)
    log.info('Specificity: %.2f±%.2f', specificity_mean, specificity_std)
    log.info('NPV: %.2f±%.2f', npv_mean, npv_std)
    log.info('FDR: %.2f±%.2f', fdr_mean, fdr_std)
    log.info('Accuracy: %.2f±%.2f', accuracy_mean, accuracy_std)

    # plot PR curve
    fig = plt.figure()

    lines = []
    labels = []
    for classifier, ys in classifiers_ys.items():
        y_trues, y_preds, y_probs = ys

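        # for the best classifier, mark its mean (recall, precision) across folds
        # as the operational point on the PR curve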
        if classifier == best_classifier:
            num_folds = len(y_trues)
            precision = 0
            recall = 0

            for fold in range(num_folds):
                precision += precision_score(y_trues[fold],
                                             y_preds[fold],
                                             average='binary')
                recall += recall_score(y_trues[fold],
                                       y_preds[fold],
                                       average='binary')

            precision /= num_folds
            recall /= num_folds

            arrowprops = {'arrowstyle': '->'}
            plt.scatter(recall, precision, s=30, marker='x', c='k', zorder=3)
            plt.annotate('Operational point', (recall, precision),
                         (recall - 0.05, precision + 0.05),
                         arrowprops=arrowprops)

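        # take the positive-class probability column from each fold
        # (assumes each y_prob is a two-column DataFrame of class probabilities)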
        y_probs_1 = tuple(y_prob[1].to_numpy() for y_prob in y_probs)
        line, label = plot_pr(y_trues, y_probs_1, classifier)

        lines.append(line)
        labels.append(label)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Best model ({}) PR curve'.format(best_classifier))
    plt.legend(lines, labels, loc='upper right', prop={'size': 10})

    save_figure(fig, main_config.get_str('pr_curve'))
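
A minimal usage sketch for plot_pr_print_cm follows; the classifier names, config path and the ModelManager constructor are illustrative assumptions inferred from how the arguments are used above.

# Illustrative only: names, paths and classifiers are assumptions, not project code.
main_config = ConfigParser('config/classification.ini')
model_manager = ModelManager()
plot_pr_print_cm(baseline_classifier='DummyClassifier',
                 best_classifier='RandomForestClassifier',
                 main_config=main_config,
                 model_manager=model_manager)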
Example No. 30
class RunnerThread(Thread):
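    """
    Worker thread that analyzes a single APK: it runs static analysis,
    drives the app in an emulator for dynamic analysis, parses the logcat
    output and serializes the static results to an XML report.
    """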
    def __createXml(self):
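        """
        Serialize the static analysis results (basic info, risk score,
        sensitive APIs/strings/files/codes, URLs, permissions and
        components) into an XML report.
        """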
        doc = xml.dom.minidom.Document()
        root = doc.createElement('static_info')
        doc.appendChild(root)

        ltime = doc.createElement('datetime')
        timeStr = time.strftime('%Y-%m-%d: %H:%M:%S',
                                time.localtime(time.time()))
        ltime.appendChild(doc.createTextNode(str(timeStr)))
        root.appendChild(ltime)

        basicInfo = doc.createElement('basicInfo')
        staticPojo = self.staticAnalyzer
        if staticPojo.basicInfo:
            # one child element per basic-info field; missing values default to 'None'
            for field in ('VersionCode', 'FileName', 'FileMD5', 'FileSize',
                          'Package', 'MinSDK', 'TargetSDK', 'Cert'):
                value = staticPojo.basicInfo.get(field)
                element = doc.createElement(field)
                element.appendChild(
                    doc.createTextNode(str(value) if value is not None else 'None'))
                basicInfo.appendChild(element)
        root.appendChild(basicInfo)

        isRepackaged = doc.createElement('isRepackaged')
        if staticPojo.isRepackaged is False:
            isRepackaged.appendChild(doc.createTextNode('0'))
        else:
            isRepackaged.appendChild(doc.createTextNode('1'))
        root.appendChild(isRepackaged)

        malware = doc.createElement('malware')
        if staticPojo.malware is not None:
            malware.appendChild(doc.createTextNode(staticPojo.malware))
        else:
            malware.appendChild(doc.createTextNode('0'))
        root.appendChild(malware)

        riskValue = doc.createElement('riskValue')
        riskValue.appendChild(doc.createTextNode(str(staticPojo.riskValue)))
        root.appendChild(riskValue)

        sensitiveAPIs = doc.createElement('sensitiveAPIs')
        if staticPojo.sensitiveAPIs:
            for key, value in staticPojo.sensitiveAPIs.items():
                sensitiveAPI = doc.createElement('sensitiveAPI')
                keyName = doc.createElement('name')
                keyName.appendChild(doc.createTextNode(str(key)))
                sensitiveAPI.appendChild(keyName)

                desc = doc.createElement('desc')
                desc.appendChild(doc.createTextNode(str(value)))
                sensitiveAPI.appendChild(desc)
                sensitiveAPIs.appendChild(sensitiveAPI)
        root.appendChild(sensitiveAPIs)

        sensitiveStrs = doc.createElement('sensitiveStrs')
        if staticPojo.sensitiveStrs:
            for key, value in staticPojo.sensitiveStrs.items():
                sensitiveStr = doc.createElement('sensitiveStr')
                keyName = doc.createElement('name')
                keyName.appendChild(doc.createTextNode(str(key)))
                sensitiveStr.appendChild(keyName)

                desc = doc.createElement('desc')
                desc.appendChild(doc.createTextNode(str(value)))
                sensitiveStr.appendChild(desc)
                sensitiveStrs.appendChild(sensitiveStr)
        root.appendChild(sensitiveStrs)

        sensitiveFiles = doc.createElement('sensitiveFiles')
        if staticPojo.sensitiveFiles:
            for key, value in staticPojo.sensitiveFiles.items():
                sensitiveFile = doc.createElement('sensitiveFile')
                keyName = doc.createElement('name')
                keyName.appendChild(doc.createTextNode(str(key)))
                sensitiveFile.appendChild(keyName)

                fileType = doc.createElement('type')
                fileType.appendChild(doc.createTextNode(str(value)))
                sensitiveFile.appendChild(fileType)
                sensitiveFiles.appendChild(sensitiveFile)
        root.appendChild(sensitiveFiles)

        sensitiveCodes = doc.createElement('sensitiveCodes')
        if staticPojo.sensitiveCodes:
            for key, value in staticPojo.sensitiveCodes.items():
                sensitiveCode = doc.createElement('sensitiveCode')
                keyName = doc.createElement('name')
                keyName.appendChild(doc.createTextNode(str(key)))
                sensitiveCode.appendChild(keyName)

                desc = doc.createElement('desc')
                desc.appendChild(doc.createTextNode(str(value)))
                sensitiveCode.appendChild(desc)
                sensitiveCodes.appendChild(sensitiveCode)
        root.appendChild(sensitiveCodes)

        urls_static = doc.createElement('urls')
        if staticPojo.urls:
            for item in staticPojo.urls:
                url_static = doc.createElement('url')
                url_static.appendChild(doc.createTextNode(str(item)))
                urls_static.appendChild(url_static)

        root.appendChild(urls_static)

        permissions = doc.createElement('permissions')
        if staticPojo.permissions:
            for key, value in staticPojo.permissions.items():
                permission = doc.createElement('permission')
                keyName = doc.createElement('name')
                keyName.appendChild(doc.createTextNode(str(key)))
                permission.appendChild(keyName)

                desc = doc.createElement('desc')
                desc.appendChild(doc.createTextNode(str(value)))
                permission.appendChild(desc)
                permissions.appendChild(permission)
        root.appendChild(permissions)

        mainActivity = doc.createElement('mainActivity')
        if staticPojo.mainActivity:
            mainActivity.appendChild(
                doc.createTextNode(str(staticPojo.mainActivity)))
        root.appendChild(mainActivity)

        activities = doc.createElement('activities')
        if staticPojo.activities:
            for value in staticPojo.activities:
                activity = doc.createElement('activity')
                activity.appendChild(doc.createTextNode(str(value)))
                activities.appendChild(activity)
        root.appendChild(activities)

        services = doc.createElement('services')
        if staticPojo.services:
            for value in staticPojo.services:
                service = doc.createElement('service')
                service.appendChild(doc.createTextNode(str(value)))
                services.appendChild(service)
        root.appendChild(services)

        receivers = doc.createElement('receivers')
        if staticPojo.receivers:
            for value in staticPojo.receivers:
                receiver = doc.createElement('receiver')
                receiver.appendChild(doc.createTextNode(str(value)))
                receivers.appendChild(receiver)
        root.appendChild(receivers)

        providers = doc.createElement('providers')
        if staticPojo.providers:
            for value in staticPojo.providers:
                provider = doc.createElement('provider')
                provider.appendChild(doc.createTextNode(str(value)))
                providers.appendChild(provider)
        root.appendChild(providers)

        exposedActivities = doc.createElement('exposedActivities')
        if staticPojo.exposedActivities:
            for value in staticPojo.exposedActivities:
                exposedActivity = doc.createElement('exposedActivity')
                exposedActivity.appendChild(doc.createTextNode(str(value)))
                exposedActivities.appendChild(exposedActivity)
        root.appendChild(exposedActivities)

        exposedServices = doc.createElement('exposedServices')
        if staticPojo.exposedServices:
            for value in staticPojo.exposedServices:
                exposedService = doc.createElement('exposedService')
                exposedService.appendChild(doc.createTextNode(str(value)))
                exposedServices.appendChild(exposedService)
        root.appendChild(exposedServices)

        exposedReceivers = doc.createElement('exposedReceivers')
        if staticPojo.exposedReceivers:
            for value in staticPojo.exposedReceivers:
                exposedReceiver = doc.createElement('exposedReceiver')
                exposedReceiver.appendChild(doc.createTextNode(str(value)))
                exposedReceivers.appendChild(exposedReceiver)
        root.appendChild(exposedReceivers)

        classifyInfo = doc.createElement('classifyInfo')
        if staticPojo.classifyInfo:
            classifyInfo.appendChild(
                doc.createTextNode(str(staticPojo.classifyInfo)))
        root.appendChild(classifyInfo)

        xmlFilename = '/home/mindmac/workspace/SandDroidIIE/staticAnaReports/' + staticPojo.basicInfo[
            'FileMD5'] + '.xml'
        with open(xmlFilename, 'w') as xmlFile:
            doc.writexml(xmlFile,
                         indent='\t',
                         addindent='\t',
                         newl='\n',
                         encoding="utf-8")

    def __init__(self,
                 theApkObj,
                 theAvdName,
                 decompressDir,
                 runHeadless,
                 theLogger=Logger()):
        Thread.__init__(self)
        # configParser
        self.configParser = ConfigParser()

        self.apkObj = theApkObj
        self.log = theLogger
        self.curDir = os.path.dirname(__file__)

        self.staticAnalyzer = None
        self.dynamicAnalyzer = None
        self.logcatAnalyzer = None

        self.startTimeStr = None
        self.endTimeStr = None

        self.emulator = None
        self.emulatorPort = 5554
        self.avdName = theAvdName
        self.runHeadless = runHeadless

        self.decompressPath = decompressDir
        self.logcatFile = None

        self.session = None

        self.cancelFlag = False  # Flag for canceling run

    def checkForCancelation(self):
        """
        Checks for the cancelation flag sent from the main program.
        If cancel flag is set, abort execution by raising KeyboardInterrupt.
        """
        if self.cancelFlag:
            self.log.info('Cancelation flag found, abort thread')
            traceback.print_stack(file=self.log.log)
            raise KeyboardInterrupt

    def __getLogcatFilePath(self):
        """
        Generates logcat file name
        """
        return os.path.join(self.decompressPath, 'logcat.log')

    def getLogger(self):
        return self.log

    def staticAnalyze(self):
        """
        Static Analysis
        """

        # Static Analyzer
        self.staticAnalyzer = StaticAnalyzer(self.apkObj, self.decompressPath,
                                             self.curDir, self.log)

        # Init
        self.log.info('Initialization...')
        self.staticAnalyzer.initEnv()

        # Parse smali files
        self.log.info('Parse smali files to get methods, urls...')
        self.staticAnalyzer.parseSmali()

        # APK basic information
        self.log.info('Get APK\'s basic information')
        self.staticAnalyzer.getBasicInfo()

        # APK permissions used
        self.log.info('Get APK\'s used permissions')
        self.staticAnalyzer.getPermissions()

        # APK components used
        self.log.info('Get APK\'s used components')
        self.staticAnalyzer.getComponents()

        # APK components exposed
        self.log.info('Get APK\'s exposed components')
        self.staticAnalyzer.getExposedComps()

        # APK classifier
        self.log.info('Get APK\'s classifier information')
        self.staticAnalyzer.classifyByPermission()

        # APK fuzzy risk value
        self.log.info('Get APK\'s fuzzy risk score')
        self.staticAnalyzer.getRisk()

        # APK gexf graph
        self.log.info('Get APK\'s gexf graph', setTime=True)
        #gexfOutFile = os.path.join(self.decompressPath, '%s.gexf' % self.apkObj.getMd5Hash().upper())
        #self.staticAnalyzer.getGexf(gexfOutFile)

        # APK Malware detection
        self.log.info('Get APK\'s malicious information', setTime=True)
        self.staticAnalyzer.getMal()

        # APK repackaged
        self.log.info('Check APK if repackaged', setTime=True)
        self.staticAnalyzer.checkRepackage(self.session)

    def dynamicAnalyze(self):
        """
        Dynamic Analysis
        """
        cur_dir = os.path.dirname(__file__)
        imageDir = os.path.join(cur_dir, 'resources', 'images')
        pcapFile = os.path.join(self.decompressPath,
                                '%s.pcap' % self.apkObj.getMd5Hash().upper())

        self.dynamicAnalyzer = DynamicAnalyzer(self.decompressPath,
                                               self.avdName, self.curDir,
                                               self.log)

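        # emulator wrapper: boots the AVD (optionally headless), captures
        # network traffic to the pcap file and handles logcat redirection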
        self.emulator = EmulatorClient(
            theSdkPath=self.configParser.getAndroidSdkDir(),
            thePort=self.emulatorPort,
            theImageDir=imageDir,
            thePcapFile=pcapFile,
            theRunHeadless=self.runHeadless,
            theAvdName=self.avdName,
            theLogger=self.log)

        self.checkForCancelation()

        # Start emulator
        self.log.info('Start emulator', setTime=True)
        self.emulator.start()

        # Run app
        isFinishedRunning = self.dynamicAnalyzer.runApp(self.emulator,
                                                        self.apkObj)

        # Store logcat file
        if isFinishedRunning:
            self.log.info('Store logcat file')
            self.emulator.stopLogcatRedirect()
            self.emulator.storeLogcatRedirectFile(
                self.dynamicAnalyzer.logcatRedirectFile, self.logcatFile)

        else:
            self.log.error('Run app failed!')

    def killEmulator(self):
        # shut the emulator down only if one was actually started
        if self.emulator:
            self.emulator.shutDown()

    def logcatAnalyze(self, logcatFile):
        """
        Analyze logcat file
        """
        try:
            if not os.path.exists(logcatFile):
                self.log.info('Logcat file %s doesn\'t exist!' % logcatFile)
                return
            else:
                # Build self.logcatAnalyzer
                self.logcatAnalyzer = LogcatAnalyzer(theLogger=self.log)
                self.logcatAnalyzer.setLogFile(logcatFile)
                self.logcatAnalyzer.extractLogEntries()

        except EmulatorClientError as ecErr:
            self.runnerThread.result['errorList'].append(ecErr)