Example #1
def main(targets):
    if 'clean' in targets:
        shutil.rmtree('data/raw', ignore_errors=True)
        shutil.rmtree('data/out', ignore_errors=True)
        shutil.rmtree('data/test', ignore_errors=True)

    if 'data' in targets:
        cfg = load_params(TOP_PATH + DATA_PARAMS)
        get_data(**cfg)

    if 'test' in targets:
        cfg = load_params(TOP_PATH + TEST_PARAMS)
        get_data(**cfg)

    if 'transform' in targets:
        if not os.path.exists(TOP_PATH + '/data/cleaned'):
            os.makedirs(TOP_PATH + '/data/cleaned')
        for filename in os.listdir(TOP_PATH + '/data/raw'):
            if 'STOPS' in filename:
                if '2018' in filename:
                    temp_df = cleaning.clean_2018_2019(TOP_PATH +
                                                       '/data/raw/' + filename)
                elif '2017' in filename:
                    temp_df = cleaning.clean_2017(TOP_PATH + '/data/raw/' +
                                                  filename)
                else:
                    temp_df = cleaning.clean_2014_2016(TOP_PATH +
                                                       '/data/raw/' + filename)
            elif 'csv' in filename:
                temp_df = cleaning.clean_trends(TOP_PATH + '/data/raw/' +
                                                filename)
    return
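Most of these run.py snippets call a load_params helper and module-level path constants (TOP_PATH, DATA_PARAMS, TEST_PARAMS, ...) that the excerpts do not show. A minimal sketch of what they assume, with hypothetical constant values; each project's run.py defines its own:

import json
import os

# Hypothetical values; each project defines its own paths.
TOP_PATH = os.path.dirname(os.path.abspath(__file__))
DATA_PARAMS = '/config/data-params.json'
TEST_PARAMS = '/config/test-params.json'


def load_params(fp):
    """Read a JSON config file into a dict of keyword arguments."""
    with open(fp) as fh:
        return json.load(fh)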
Example #2
def main(targets):
    
    if 'clean' in targets:
        shutil.rmtree('data/raw', ignore_errors=True)
        shutil.rmtree('data/cleaned', ignore_errors=True)
        shutil.rmtree('test/raw', ignore_errors=True)
        shutil.rmtree('test/cleaned', ignore_errors=True)
    
    if 'data' in targets:
        cfg = load_params(DATA_PARAMS)
        get_data(**cfg)
        
        cfg = load_params(CLEAN_PARAMS)
        clean_stops(**cfg)
        
    if 'test-project' in targets:
        cfg = load_params(TEST_DATA_PARAMS)
        get_data(**cfg)

        cfg = load_params(TEST_CLEAN_PARAMS)
        clean_stops(**cfg)

        cfg = load_params(TEST_MODEL_PARAMS)
        driver(**cfg)

    if 'model' in targets:
        cfg = load_params(MODEL_PARAMS)
        driver(**cfg)
     
    return 
Example #3
def main(targets):
    '''
    Runs the main project pipeline logic, given the targets.
    targets must contain: 'data', 'analysis', 'model'. 
    `main` runs the targets in order of data=>analysis=>model.
    '''
    with open('config/data-params.json') as fh:
        data_cfg = json.load(fh)
    get_data(**data_cfg)

    with open('config/eda-params.json') as fh:
        eda_cfg = json.load(fh)
    do_eda(**eda_cfg)

    with open('config/auto-params.json') as fh:
        auto_cfg = json.load(fh)
    autophrase(**auto_cfg)

    with open('config/visual-params.json') as fh:
        visual_cfg = json.load(fh)
    visual(**visual_cfg)

    # with open('config/example-params.json') as fh:
    #     example_cfg = json.load(fh)
    # example(**example_cfg)

    return
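These main(targets) functions are driven from the command line. A minimal sketch of the entry point a run.py like this typically pairs with:

import sys

if __name__ == '__main__':
    # e.g. `python run.py data analysis model`
    main(sys.argv[1:])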
Example #4
def main(targets):
    if not os.path.exists('data/'):
        os.mkdir('data/')
    if not os.path.exists('viz/'):
        os.mkdir('viz/')

    # make the clean target
    if 'clean' in targets:
        shutil.rmtree('data/raw', ignore_errors=True)
        shutil.rmtree('data/cleaned', ignore_errors=True)
        shutil.rmtree('viz', ignore_errors=True)

    # make the clean test target
    if 'clean-test' in targets:
        shutil.rmtree('test_data/cleaned', ignore_errors=True)
        shutil.rmtree('viz', ignore_errors=True)

    # make the data target
    if 'data' in targets:
        cfg = load_params(DATA_PARAMS)
        get_data(**cfg)

    if 'process' in targets:
        cfg = load_params(PROCESS_PARAMS)
        process(**cfg)

    if 'eda' in targets:
        if not os.path.exists('viz/EDA'):
            os.mkdir('viz/EDA')

        cfg = load_params(EDA_PARAMS)
        generate_viz(**cfg)

    if 'analyze' in targets:
        if not os.path.exists('viz/Analysis'):
            os.mkdir('viz/Analysis')

        cfg = load_params(ANALYZE_PARAMS)
        analyze(**cfg)

    if 'test-project' in targets:
        process_cfg = load_params(TEST_PROCESS_PARAMS)
        process(**process_cfg)
        if not os.path.exists('viz/EDA'):
            os.mkdir('viz/EDA')

        if not os.path.exists('viz/Analysis'):
            os.mkdir('viz/Analysis')

        eda_cfg = load_params(TEST_EDA_PARAMS)

        generate_viz(**eda_cfg)

        analyze_cfg = load_params(TEST_ANALYZE_PARAMS)
        analyze(**analyze_cfg)

    return
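The exists-then-mkdir pairs above leave a race window between the check and the creation. os.makedirs with exist_ok=True does both in one call and also creates missing parents:

import os

# One call replaces each `if not os.path.exists(...): os.mkdir(...)` pair.
os.makedirs('viz/EDA', exist_ok=True)
os.makedirs('viz/Analysis', exist_ok=True)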
Example #5
def main(targets):

    if 'data' in targets:
        with open('data-params.json') as fh:
            data_cfg = json.load(fh)

        # make the data target
        get_data(**data_cfg)

    return
Example #6
def main(targets):
    if not os.path.exists('data/'):
        os.mkdir('data/')
    if not os.path.exists('viz/'):
        os.mkdir('viz/')

    # make the clean target
    if 'clean' in targets:
        shutil.rmtree('data/raw', ignore_errors=True)
        shutil.rmtree('data/cleaned', ignore_errors=True)
        shutil.rmtree('viz', ignore_errors=True)

    if 'clean-test' in targets:
        shutil.rmtree('test_data/cleaned', ignore_errors=True)
        shutil.rmtree('viz', ignore_errors=True)

    # make the data target
    if 'data' in targets:
        cfg = load_params(DATA_PARAMS)
        get_data(**cfg)

    if 'process' in targets:
        cfg = load_params(PROCESS_PARAMS)
        process(**cfg)

    if 'eda' in targets:
        if not os.path.exists('viz/EDA'):
            os.mkdir('viz/EDA')

        cfg_stops = load_params(EDA_STOPS_PARAMS)
        cfg_crimes = load_params(EDA_CRIMES_PARAMS)
        cfg_arrests = load_params(EDA_ARRESTS_PARAMS)

        gv_stops(**cfg_stops)
        gv_crimes(**cfg_crimes)
        gv_arrests(**cfg_arrests)

    if 'test' in targets:
        process_cfg = load_params(TEST_PROCESS_PARAMS)
        process(**process_cfg)
        if not os.path.exists('viz/EDA'):
            os.mkdir('viz/EDA')

        cfg_stops = load_params(TEST_EDA_STOPS_PARAMS)
        cfg_crimes = load_params(TEST_EDA_CRIMES_PARAMS)
        cfg_arrests = load_params(TEST_EDA_ARRESTS_PARAMS)

        gv_stops(**cfg_stops)
        gv_crimes(**cfg_crimes)
        gv_arrests(**cfg_arrests)

    return
Example #7
def main(targets):

    # make the clean target
    if 'clean' in targets:
        shutil.rmtree('data/raw', ignore_errors=True)
        shutil.rmtree('data/cleaned', ignore_errors=True)
        shutil.rmtree('data/sunset', ignore_errors=True)
        shutil.rmtree('data/model', ignore_errors=True)
        shutil.rmtree('data/test', ignore_errors=True)

    if 'test-project' in targets:  # create project

        # make the data target
        #if 'data' in targets:
        cfg = load_params(DATA_PARAMS)
        data_years = cfg["year"]
        for year in data_years:
            get_data(year)

        # make VoD (veil of darkness) data
        for year in data_years:
            get_veil(year)
        for year in data_years:
            build_intertw(year)

        # make the test target
        #if 'test' in targets:
        cfg = load_params(TEST_DATA_PARAMS)
        test_year = cfg["year"][0]
        get_data_test(test_year)
Example #8
File: run.py Project: saveree/dsc180a_A3
def main(targets):

    # make the clean target
    if 'clean' in targets:
        shutil.rmtree('data/temp', ignore_errors=True)
        shutil.rmtree('data/out', ignore_errors=True)
        shutil.rmtree('data/test', ignore_errors=True)

    # make the data target
    if 'data' in targets:
        cfg = load_params(DATA_PARAMS)
        get_data(**cfg)

    # make the test target
    if 'test' in targets:
        cfg = load_params(TEST_PARAMS)
        get_data(**cfg)
    return
Example #9
def run(save_data: bool = True) -> None:
    """ Entry-point function to run liquidity cost calculations and save image and static data 
    to files.
    """
    ticker_df, lob_df = etl.get_data()
    asks_regression_df, bids_regression_df = get_processed_lob_time_series(lob_df)
    results_dict = compute_liquidity_cost(asks_regression_df, bids_regression_df)
    if save_data:
        save_liquidity_data(results_dict)
    return
Example #10
def main(targets):

    # make the clean target
    if 'clean' in targets:
        shutil.rmtree('data/', ignore_errors=True)

    # make the conversion target
    if 'convert' in targets:
        cfg = load_params(CONVERT_PARAMS)['data']
        convert_data(**cfg)

    # make the data target
    if 'data' in targets:
        cfg = load_params(DATA_PARAMS)['data']
        get_data(**cfg)

    # make the test target
    if 'data-test' in targets:
        cfg = load_params(TEST_PARAMS)['data']
        get_data(**cfg)

    # make the process target
    if 'process' in targets:
        cfg = load_params(TEST_PARAMS)['process']
        process_data(**cfg)

    # make the test project target
    if 'test-project' in targets:
        cfg_data = load_params(TEST_PARAMS)['data']
        get_data(**cfg_data)

        cfg_process = load_params(TEST_PARAMS)['process']
        process_data(**cfg_process, test=True)

    return
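Example #10 keys a single params file by pipeline stage (load_params(TEST_PARAMS)['data'], ['process']). A self-contained sketch of the layout that implies, with hypothetical keys and a stand-in process_data:

import json

# Hypothetical contents of a per-target params file.
raw = '''
{
    "data":    {"year": 2019, "outdir": "data/test"},
    "process": {"indir": "data/test", "outdir": "data/cleaned"}
}
'''
params = json.loads(raw)


def process_data(indir, outdir, test=False):
    # stand-in for the real process_data
    print('processing {} -> {} (test={})'.format(indir, outdir, test))


process_data(**params['process'], test=True)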
Example #11
def main(targets):
    if 'test-project' in targets:
        targets.append('test')
        targets.append('transform')

    if 'clean' in targets:
        shutil.rmtree('data/raw', ignore_errors=True)
        shutil.rmtree('data/out', ignore_errors=True)
        shutil.rmtree('data/test', ignore_errors=True)

    if 'data' in targets:
        cfg = load_params(DATA_PARAMS)
        get_data(**cfg)

    if 'test' in targets:
        cfg = load_params(TEST_PARAMS)
        get_data(**cfg)

    if 'transform' in targets:
        for directory in directories:
            if not os.path.exists(directory):
                continue
            for filename in os.listdir(directory):
                if not filename.endswith("csv"):
                    continue
                if '2018' in filename:
                    temp_df = cleaning.clean_2018_2019(directory + '/' +
                                                       filename)
                    df = calculations.get_inner_twilight_period(temp_df)
                    calculations.veil_of_darkness(df, 2018, notebook=False)
                else:
                    year = int(filename[0:4])
                    temp_df = cleaning.clean_2014_2017(directory + '/' +
                                                       filename)
                    df = calculations.get_inner_twilight_period(temp_df)
                    calculations.veil_of_darkness(df, year, notebook=False)
    return
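Example #11 expands 'test-project' by appending to the caller's targets list in place, which mutates the argument. A side-effect-free sketch of the same expansion:

def expand_targets(targets):
    """Return a new list with 'test-project' expanded into its sub-targets,
    leaving the caller's list untouched."""
    expanded = list(targets)
    if 'test-project' in expanded:
        expanded += ['test', 'transform']
    return expanded


# expand_targets(['test-project']) -> ['test-project', 'test', 'transform']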
Example #12
def main(targets):

    # to time the function
    start_time = time.time()
    # make the clean target
    if 'clean' in targets:
        shutil.rmtree('data/temp', ignore_errors=True)
        shutil.rmtree('data/out', ignore_errors=True)
        shutil.rmtree('data/raw', ignore_errors=True)

    # make the data target
    if 'data' in targets:
        cfg = load_params(DATA_PARAMS)

    # make the test target
    if 'test' in targets:
        cfg = load_params(TEST_PARAMS)

    # the pipeline below needs a config from one of the targets above
    if 'data' not in targets and 'test' not in targets:
        return

    get_data(**cfg)
    smali_dict = load_smali_dict(**cfg)

    app = smali_dict[list(smali_dict.keys())[0]]
    a_calc = a_matrix_calc(smali_dict)
    a_matrix = A_matrix(a_calc[0], a_calc[1])
    b_calc = b_matrix_calc(app)
    b_matrix = B_matrix(b_calc[0], b_calc[1])
    p_calc = p_matrix_calc(app)
    p_matrix = P_matrix(p_calc[0], p_calc[1])
    i_calc = i_matrix_calc(app)
    i_matrix = I_matrix(i_calc[0], i_calc[1])

    cleaned = preprocessing_feature_engineering(smali_dict)
    logreg(cleaned[0], cleaned[1], cleaned[2], cleaned[3])

    print('Finished in: {} seconds'.format(time.time() - start_time))
    return
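time.time() reads the wall clock and can jump if the system clock changes; for timing code, time.perf_counter() is the usual choice. A sketch:

import time

start = time.perf_counter()
# ... pipeline work ...
print('Finished in: {:.2f} seconds'.format(time.perf_counter() - start))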
Example #13
def main(targets):
    '''
    Runs the main project pipeline logic, given the targets.
    targets must contain: 'data', 'analysis', 'model'.

    `main` runs the targets in order of data=>analysis=>model.
    '''

    if 'data' in targets:

        data = get_data()

    if 'analysis' in targets:
        pass

    if 'model' in targets:
        pass

    return
Example #14
def get_data(sin_cos_transform=False) -> "asks_merged_df, bids_merged_df":
    """ Get data for ML modelling
    :return: asks_merged_df, bids_merged_df
    """
    # Get data
    ticker_df, lob_df = etl.get_data()
    asks_costs_df, bids_costs_df = etl.get_costs_data()
    asks_costs_df.dropna(axis=0, inplace=True)
    bids_costs_df.dropna(axis=0, inplace=True)

    ticker_df.rename({"Time_Hour": "Time"}, axis=1, inplace=True)
    for df in [asks_costs_df, bids_costs_df]:
        df.rename({"Time_Minute": "Time"}, axis=1, inplace=True)

    for df in [ticker_df, asks_costs_df, bids_costs_df]:
        df["Time"] = pd.to_datetime(df["Time"], utc=True)

    asks_merged_df = ticker_df.merge(asks_costs_df, on="Time")
    bids_merged_df = ticker_df.merge(bids_costs_df, on="Time")

    asks_merged_df, asks_new_time_cols = get_time_cols(
        input_df=asks_merged_df,
        time_cols=TIME_COLS,
        sin_cos_transform=sin_cos_transform)
    bids_merged_df, bids_new_time_cols = get_time_cols(
        input_df=bids_merged_df,
        time_cols=TIME_COLS,
        sin_cos_transform=sin_cos_transform)

    asks_merged_df["Instrument_Code"] = asks_merged_df[
        "Instrument_Code"].astype("category")
    bids_merged_df["Instrument_Code"] = asks_merged_df[
        "Instrument_Code"].astype("category")

    asks_merged_df["Instrument_Code_id"] = asks_merged_df[
        "Instrument_Code"].cat.codes
    bids_merged_df["Instrument_Code_id"] = bids_merged_df[
        "Instrument_Code"].cat.codes
    return asks_merged_df, bids_merged_df
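get_data threads a sin_cos_transform flag through get_time_cols, which is not shown. Sketched below is the usual way cyclical time features are encoded, on the assumption that this is what the flag toggles; the real get_time_cols may differ:

import numpy as np
import pandas as pd


def sin_cos_encode(series: pd.Series, period: int) -> pd.DataFrame:
    """Map a cyclical feature (e.g. hour of day, period=24) onto the unit
    circle so that values like 23 and 0 end up close together."""
    radians = 2 * np.pi * series / period
    return pd.DataFrame({series.name + '_sin': np.sin(radians),
                         series.name + '_cos': np.cos(radians)})


# e.g. sin_cos_encode(pd.Series(range(24), name='Hour'), period=24)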
Example #15
def main(targets):
    '''
    Runs the main project pipeline logic, given the targets.
    targets must contain: 'data', 'train', 'analysis', 'results'. 
    
    `main` runs the targets in order of data=>train=>analysis=>results.
    '''
    # Setup Logger
    logger = logging.getLogger('project_log')
    logger.setLevel(logging.DEBUG)
    file_handler = RotatingFileHandler('example.log', maxBytes=1000000,
                                       backupCount=0)
    file_handler.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s'
    )
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logger.info('STARTING PROGRAM')

    # Data Target
    if 'data' in targets or 'all' in targets:
        logger.info('Starting data target')
        with open('config/data-params.json') as fh:
            data_cfg = json.load(fh)
        with open('config/twitter-api-keys.json') as fh:
            twitter_cfg = json.load(fh)
        get_data(logger, **data_cfg, **twitter_cfg)
        logger.info('Finishing data target')

    # Train Model target
    if 'train' in targets or 'all' in targets:
        logger.info('Starting train target')
        with open('config/train-params.json') as fh:
            train_cfg = json.load(fh)
        df = pd.read_csv(
            os.path.join(train_cfg['training_data_path'],
                         'data.csv')).drop(columns=['Unnamed: 0'])
        train_model(logger, df, **train_cfg)

        convert_notebook('train', **train_cfg)
        logger.info('finished train target: wrote html file to {}'.format(
            os.path.join(train_cfg['outdir'], 'train.html')))

    # Analysis target: calculate user polarities
    if 'analysis' in targets or 'all' in targets:
        logger.info('Starting analysis target')
        with open('config/analysis-params.json') as fh:
            analysis_cfg = json.load(fh)
        # do user stats
        tweets = {}
        for tweet_id in analysis_cfg['tweet_ids']:
            path = os.path.join(analysis_cfg['user_data_path'],
                                'tweet_{}.csv'.format(tweet_id))
            tweet = pickle.load(open(path, 'rb'))
            tweets[tweet_id] = tweet
        for key, value in tweets.items():
            for user_id in list(value['user_ids'].keys()):
                value['user_ids'][user_id] = pd.read_csv(
                    os.path.join(analysis_cfg['user_data_path'],
                                 'user_{}_tweets.csv'.format(user_id)))
        mdls = []
        dims = analysis_cfg['dims']
        for dim in dims:
            path = os.path.join(analysis_cfg['model_path'],
                                '{}.mdl'.format(dim))
            mdl = pickle.load(open(path, 'rb'))
            mdls.append(mdl)
        compute_user_stats(logger, tweets, mdls, dims,
                           analysis_cfg['user_data_path'],
                           analysis_cfg['flagged'])

        convert_notebook('analysis', **analysis_cfg)
        logger.info('finished analysis target: wrote html file to {}'.format(
            os.path.join(analysis_cfg['outdir'], 'analysis.html')))

    # Results target: calculate results
    if 'results' in targets or 'all' in targets:
        logger.info('Starting results target')
        with open('config/results-params.json') as fh:
            results_cfg = json.load(fh)
        fp = os.path.join(results_cfg['user_data_path'], 'polarities.csv')
        polarities = pd.read_csv(fp, usecols=results_cfg['dims'] +
                                 ['flagged']).dropna()
        compute_results(logger, polarities, results_cfg['dims'],
                        results_cfg['outdir'])

        convert_notebook('results', **results_cfg)
        logger.info('finished results target: wrote html file to {}'.format(
            os.path.join(results_cfg['outdir'], 'results.html')))

    # Test target
    if 'test' in targets or 'all' in targets:
        logger.info('Starting TEST target')

        # Train target
        logger.info('Starting TEST train target')
        with open('config/train-params.json') as fh:
            train_cfg = json.load(fh)
        df = pd.read_csv(
            os.path.join(train_cfg['training_data_path'],
                         'data.csv')).drop(columns=['Unnamed: 0'])
        train_model(logger, df, **train_cfg)
        convert_notebook('train', **train_cfg)
        logger.info('finished TEST train target: wrote html file to {}'.format(
            os.path.join(train_cfg['outdir'], 'train.html')))

        # Analysis target
        logger.info('Starting TEST analysis target')
        with open('config/analysis-params.json') as fh:
            analysis_cfg = json.load(fh)
        # do user stats
        tweets = {}
        for tweet_id in analysis_cfg['tweet_ids']:
            path = os.path.join(analysis_cfg['user_data_path'],
                                'tweet_{}.csv'.format(tweet_id))
            tweet = pickle.load(open(path, 'rb'))
            tweets[tweet_id] = tweet
        for key, value in tweets.items():
            for user_id in list(value['user_ids'].keys()):
                value['user_ids'][user_id] = pd.read_csv(
                    os.path.join(analysis_cfg['user_data_path'],
                                 'user_{}_tweets.csv'.format(user_id)))
        mdls = []
        dims = analysis_cfg['dims']
        for dim in dims:
            path = os.path.join(analysis_cfg['model_path'],
                                '{}.mdl'.format(dim))
            mdl = pickle.load(open(path, 'rb'))
            mdls.append(mdl)
        compute_user_stats(logger, tweets, mdls, dims,
                           analysis_cfg['user_data_path'],
                           analysis_cfg['flagged'])

        convert_notebook('analysis', **analysis_cfg)
        logger.info(
            'finished TEST analysis target: wrote html file to {}'.format(
                os.path.join(analysis_cfg['outdir'], 'analysis.html')))

        # Results target: calculate results
        logger.info('Starting TEST results target')
        with open('config/results-params.json') as fh:
            results_cfg = json.load(fh)
        fp = os.path.join(results_cfg['user_data_path'], 'polarities.csv')
        polarities = pd.read_csv(fp, usecols=results_cfg['dims'] +
                                 ['flagged']).dropna()
        compute_results(logger, polarities, results_cfg['dims'],
                        results_cfg['outdir'])

        convert_notebook('results', **results_cfg)
        logger.info(
            'finished TEST results target: wrote html file to {}'.format(
                os.path.join(results_cfg['outdir'], 'results.html')))

        logger.info('finished TEST target')

    logger.info('ENDING PROGRAM')

    return
Example #16
File: app3.py Project: stayfu0705/web
import dash
import dash_html_components as html
import dash_table
import datetime
from etl import get_data

external_stylesheets = ["https://codepen.io/chriddyp/pen/bWLwgP.css"]

app = dash.Dash(__name__,
                external_stylesheets=external_stylesheets,
                requests_pathname_prefix='/app3/')

datenow = datetime.datetime.now()

df = get_data('SELECT * FROM result;', "SHOW columns FROM result")

def generate_table(dataframe, max_rows=10):
    return html.Table(

        # Header
        [html.Tr([html.Th(col) for col in dataframe.columns])] +

        # Body
        [html.Tr([
            html.Td(dataframe.iloc[i][col]) for col in dataframe.columns
        ]) for i in range(min(len(dataframe), max_rows))]
    )


app.layout = html.Div(children=[
Example #17
def load_params(fp):
    with open(fp) as fh:
        param = json.load(fh)
    return param


if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        description='PCA and visualization with Plink2')
    parser.add_argument('process',
                        type=str,
                        nargs=1,
                        help='the process to deal with')
    args = parser.parse_args()

    if args.process[0] == "get_data":
        cfg = load_params(DATA_PARAMS)
        get_data(cfg['files'], 'data/')
        get_metal(cfg['metal'])

    elif args.process[0] == "filter":
        cfg = load_params(FINAL_PARAMS)
        filter_recode(cfg['filename'], cfg['covar_file'], cfg['data_dir'],
                      cfg['filter_output'], cfg['hwe'], cfg['maf'],
                      cfg['geno'], cfg['mind'], cfg['chr'], cfg['min'])

    elif args.process[0] == 'pca':
        cfg = load_params(FINAL_PARAMS)
        pca(cfg['data_dir'], cfg['filter_output'])

    elif args.process[0] == 'plot_pca':
        cfg = load_params(FINAL_PARAMS)
        plot_pca(cfg['data_dir'] + 'pca.eigenvec', cfg['output_dir'])
Example #18
def main(targets):
    if 'data' in targets:
        with open('../config/data-params.json') as fh:
            data_cfg = json.load(fh)
        get_data(**data_cfg)
    if 'process' in targets:
        with open('../config/data-params.json') as fh:
            data_cfg = json.load(fh)
        with open('../config/env.json') as fh:
            env_cfg = json.load(fh)
        metapath, p, q = data_cfg['metapath'], data_cfg['p'], data_cfg['q']
        k, n = data_cfg['k'], data_cfg['n']
        algorithm = data_cfg['algorithm']
        malware_pos, benign_pos = data_cfg[
            'malware_position'], data_cfg['apk_out_path'] + '/decompiled/*'
        model_out_path = data_cfg['model_out_path']
        if not os.path.exists(model_out_path):
            os.makedirs(model_out_path)
        malware_positions = glob.glob(malware_pos)
        benign_positions = glob.glob(benign_pos)
        decompiled_apks = benign_positions + malware_positions
        train = np.random.choice(benign_positions, int(len(benign_positions)*0.8), replace = False).tolist() + \
        np.random.choice(malware_positions, int(len(malware_positions)*0.8), replace = False).tolist()
        test = [apk for apk in decompiled_apks if apk not in train]
        apk_names_train = [get_name(file) for file in train]
        apk_classes_train = [get_class(file) for file in train]
        apk_names_test = [get_name(file) for file in test]
        apk_classes_test = [get_class(file) for file in test]
        apk2idx_train = dict(zip(apk_names_train, range(len(apk_names_train))))
        apk2idx_test = dict(zip(apk_names_test, range(len(apk_names_test))))
        apk2node_train = dict(
            zip(apk_names_train, range(-len(apk_names_train), 0)))
        node2apk_train = dict(
            zip(range(-len(apk_names_train), 0), apk_names_train))
        idx2apk_train = dict(zip(apk2idx_train.values(), apk2idx_train.keys()))

        print('Collecting All APIs in Training Data')
        APIs = list(get_all_APIs(train))
        API2idx = dict(zip(APIs, range(len(APIs))))
        idx2API = dict(zip(range(len(APIs)), APIs))
        print('Processing Training Data...')
        apk2code_blocks_train, apk2call_train = apk_info_idx(
            train, API2idx, 'train')
        print('Processing Test Data...')
        apk2code_blocks_test, apk2call_test = apk_info_idx(
            test, API2idx, 'test')
        print('Building matrix_A_train...')
        matrix_A_train = build_matrix_A(API2idx, apk2call_train, apk2idx_train)
        print('Building matrix_A_test...')
        matrix_A_test = build_matrix_A(API2idx, apk2call_test, apk2idx_test)
        print('Building matrix_B_train...')
        matrix_B_train = build_matrix_B(API2idx, apk2code_blocks_train,
                                        apk2idx_train)
        print('Building matrix_P_train...')
        matrix_P_train = build_matrix_P(idx2API, apk2call_train, apk2idx_train)
        print('Building matrix_P_test...')
        matrix_P_test = build_matrix_P(idx2API, apk2call_test, apk2idx_test)
        matrix_BP_train = matrix_B_train + matrix_P_train

        print('generating random walks')
        walks = generate_walks(metapath, apk_names_train, apk2idx_train, idx2apk_train, \
        apk2node_train, node2apk_train, matrix_A_train, matrix_B_train, matrix_P_train, matrix_BP_train, p, q, k, n)
        walks = [list(map(str, walk)) for walk in walks]
        print('word2vec model')
        model = Word2Vec(walks,
                         size=128,
                         window=10,
                         min_count=0,
                         sg=1,
                         workers=8,
                         iter=5)
        model.wv.save_word2vec_format(
            model_out_path + '/{}_len{}_k{}_w2v.model'.format(metapath, n, k))

        apk2class_train = dict(zip(apk_names_train, apk_classes_train))
        X_train = [
            model.wv[str(apk2node_train[apk])] for apk in apk2idx_train
            if str(apk2node_train[apk]) in model.wv
        ]
        Y_train = [
            apk2class_train[apk] for apk in apk2idx_train
            if str(apk2node_train[apk]) in model.wv
        ]
        clf = svm.SVC(kernel='rbf', gamma='scale')
        clf.fit(X_train, Y_train)

        if algorithm == 'node2vec':
            X = [
                API_mean_embedding(model, apk2idx_test[apk], matrix_A_test)
                for apk in apk2idx_test
            ]
            # use a distinct name: main()'s `targets` parameter is checked
            # again below for the 'test' target and must not be overwritten
            target_embeddings = [
                API_mean_embedding(model, apk2idx_train[apk], matrix_A_train)
                for apk in apk2idx_train
            ]
        elif algorithm == 'metapath2vec':
            # TODO: Add dic
            X = [
                API_mean_embedding_metapath(apk2idx_test[apk], dic,
                                            matrix_A_test)
                for apk in apk2idx_test
            ]
            target_embeddings = [
                API_mean_embedding_metapath(apk2idx_train[apk], dic,
                                            matrix_A_train)
                for apk in apk2idx_train
            ]
        print('neural network')
        train_net(clf, out_path=model_out_path, epochs=20, inputs=X_train,
                  targets=target_embeddings, labels_train=Y_train,
                  labels_test=apk_classes_test, batch_size=1)
        net = torch.load(model_out_path + '/net.model')

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        X_test = net(torch.tensor(X).type(
            torch.DoubleTensor).to(device)).cpu().detach()
        Y_test = apk_classes_test
        acc = clf.score(X_test, Y_test)
        print('test accuracy: ', acc)

    if 'test' in targets:
        with open('../config/test-params.json') as fh:
            data_cfg = json.load(fh)
        with open('../config/env.json') as fh:
            env_cfg = json.load(fh)
        # malware_pos, benign_pos = data_cfg['malware_position'], data_cfg['benign_position']
        # malware_positions = glob.glob(malware_pos)
        # benign_positions = glob.glob(benign_pos)
        # benign_positions = get_data(**data_cfg)
        # malware_positions = glob.glob('/datasets/dsc180a-wi20-public/Malware/amd_data_smali/*/*/*')
        # malware_positions = list(np.random.choice(malware_positions, 5, replace = False))
        # decompiled_apks = benign_positions + malware_positions
        # decompiled_positions = get_data(**data_cfg)
        metapath, p, q = data_cfg['metapath'], data_cfg['p'], data_cfg['q']
        k, n = data_cfg['k'], data_cfg['n']
        algorithm = data_cfg['algorithm']
        model_out_path = data_cfg['model_out_path']
        if not os.path.exists(model_out_path):
            os.makedirs(model_out_path)
        benign_positions = glob.glob('../Data/benign/*')
        malware_positions = glob.glob('../Data/malwares/*')
        decompiled_apks = benign_positions + malware_positions
        # train = np.random.choice(benign_positions, int(len(benign_positions)*0.8), replace = False).tolist() + \
        # np.random.choice(malware_positions, int(len(malware_positions)*0.8), replace = False).tolist()
        train = benign_positions[:4] + malware_positions[:4]
        test = [apk for apk in decompiled_apks if apk not in train]
        apk_names_train = [get_name(file) for file in train]
        # apk_classes_train = [get_class(file) for file in train]
        apk_names_test = [get_name(file) for file in test]
        # apk_classes_test = [get_class(file) for file in test]
        apk_classes_train = [1] * int(len(benign_positions) * 0.8) + [0] * int(
            len(malware_positions) * 0.8)
        apk_classes_test = [1] * (len(benign_positions) - int(len(benign_positions)*0.8)) \
        + [0] * (len(malware_positions) - int(len(malware_positions)*0.8))
        apk2idx_train = dict(zip(apk_names_train, range(len(apk_names_train))))
        apk2idx_test = dict(zip(apk_names_test, range(len(apk_names_test))))
        apk2node_train = dict(
            zip(apk_names_train, range(-len(apk_names_train), 0)))
        node2apk_train = dict(
            zip(range(-len(apk_names_train), 0), apk_names_train))
        idx2apk_train = dict(zip(apk2idx_train.values(), apk2idx_train.keys()))

        print('Collecting All APIs in Training Data')
        APIs = list(get_all_APIs(train))
        API2idx = dict(zip(APIs, range(len(APIs))))
        idx2API = dict(zip(range(len(APIs)), APIs))
        print('Processing Training Data...')
        apk2code_blocks_train, apk2call_train = apk_info_idx(
            train, API2idx, 'train')
        print('Processing Test Data...')
        apk2code_blocks_test, apk2call_test = apk_info_idx(
            test, API2idx, 'test')
        print('Building matrix_A_train...')
        matrix_A_train = build_matrix_A(API2idx, apk2call_train, apk2idx_train)
        print('Building matrix_A_test...')
        matrix_A_test = build_matrix_A(API2idx, apk2call_test, apk2idx_test)
        print('Building matrix_B_train...')
        matrix_B_train = build_matrix_B(API2idx, apk2code_blocks_train,
                                        apk2idx_train)
        print('Building matrix_P_train...')
        matrix_P_train = build_matrix_P(idx2API, apk2call_train, apk2idx_train)
        print('Building matrix_P_test...')
        matrix_P_test = build_matrix_P(idx2API, apk2call_test, apk2idx_test)
        matrix_BP_train = matrix_B_train + matrix_P_train

        print('generating random walks')
        walks = generate_walks(metapath, apk_names_train, apk2idx_train, idx2apk_train, \
        apk2node_train, node2apk_train, matrix_A_train, matrix_B_train, matrix_P_train, matrix_BP_train, p, q, k, n)
        walks = [list(map(str, walk)) for walk in walks]
        print('word2vec model')
        model = Word2Vec(walks,
                         size=128,
                         window=10,
                         min_count=0,
                         sg=1,
                         workers=8,
                         iter=5)
        model.wv.save_word2vec_format(
            model_out_path + '/{}_len{}_k{}_w2v.model'.format(metapath, n, k))

        apk2class_train = dict(zip(apk_names_train, apk_classes_train))
        X_train = [
            model.wv[str(apk2node_train[apk])] for apk in apk2idx_train
            if str(apk2node_train[apk]) in model.wv
        ]
        Y_train = [
            apk2class_train[apk] for apk in apk2idx_train
            if str(apk2node_train[apk]) in model.wv
        ]
        clf = svm.SVC(kernel='rbf', gamma='scale')
        clf.fit(X_train, Y_train)

        if algorithm == 'node2vec':
            X = [
                API_mean_embedding(model, apk2idx_test[apk], matrix_A_test)
                for apk in apk2idx_test
            ]
            target_embeddings = [
                API_mean_embedding(model, apk2idx_train[apk], matrix_A_train)
                for apk in apk2idx_train
            ]
        elif algorithm == 'metapath2vec':
            # TODO: Add dic
            X = [
                API_mean_embedding_metapath(apk2idx_test[apk], dic,
                                            matrix_A_test)
                for apk in apk2idx_test
            ]
            target_embeddings = [
                API_mean_embedding_metapath(apk2idx_train[apk], dic,
                                            matrix_A_train)
                for apk in apk2idx_train
            ]
        print('neural network')
        train_net(clf, out_path=model_out_path, epochs=20, inputs=X_train,
                  targets=target_embeddings, labels_train=Y_train,
                  labels_test=apk_classes_test, batch_size=1)
        net = torch.load(model_out_path + '/net.model')

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        X_test = net(torch.tensor(X).type(
            torch.DoubleTensor).to(device)).cpu().detach()
        Y_test = apk_classes_test
        acc = clf.score(X_test, Y_test)
        print('test accuracy: ', acc)
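Both branches above rebuild the train/test split inline and filter the test set with a linear `not in train` scan, which is quadratic in the number of APKs. A self-contained sketch of the same 80/20 split using a set for membership tests:

import numpy as np


def split_apks(benign_positions, malware_positions, frac=0.8, seed=None):
    """Random frac/(1 - frac) split over benign + malware APK paths."""
    rng = np.random.default_rng(seed)
    train = (rng.choice(benign_positions, int(len(benign_positions) * frac),
                        replace=False).tolist() +
             rng.choice(malware_positions, int(len(malware_positions) * frac),
                        replace=False).tolist())
    train_set = set(train)  # O(1) membership instead of scanning a list
    test = [apk for apk in benign_positions + malware_positions
            if apk not in train_set]
    return train, test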
Example #19
    pca(filename=conf['temp_path'] + '/' + conf['name'])  #'data/interim/chr22'
elif args.process[0] == "remove_outlier_then_pca":
    remove_outlier(input='data/interim/chr22.eigenvec',
                   graph_output1='data/before_remove_outliers1.png',
                   graph_output2='data/before_remove_outliers2.png',
                   remove_output='data/outliers.txt',
                   filename=conf['temp_path'] + '/' + conf['name'])
elif args.process[0] == "graph_after_remove_outlier":
    after_removal(input='data/interim/chr22.eigenvec',
                  graph_output1='data/after_remove_outliers1.png',
                  graph_output2='data/after_remove_outliers2.png')
elif args.process[0] == "test-project":
    filter_recode(input_file=conf['input_file'],
                  output_dir=conf['temp_path'],
                  output_filename=conf['name'],
                  maf=conf['maf'],
                  geno=conf['geno'],
                  mind=conf['mind'])
    pca(filename=conf['temp_path'] + '/' + conf['name'])  #'data/interim/chr22'
    remove_outlier(input='data/interim/chr22.eigenvec',
                   graph_output1='data/before_remove_outliers1.png',
                   graph_output2='data/before_remove_outliers2.png',
                   remove_output='data/outliers.txt',
                   filename=conf['temp_path'] + '/' + conf['name'])
    after_removal(input='data/interim/chr22.eigenvec',
                  graph_output1='data/after_remove_outliers1.png',
                  graph_output2='data/after_remove_outliers2.png')
elif args.process[0] == 'get_data':
    conf = json.load(open(DATA_PARAMS))
    get_data(conf['person'], conf['files'], conf['config'])
Example #20
def main(targets):
    # make the clean target
    if 'clean' in targets:
        shutil.rmtree('data/', ignore_errors=True)
        #shutil.rmtree('data/Smali', ignore_errors=True)
        #shutil.rmtree('data/APKs', ignore_errors=True)
        #shutil.rmtree('data/test',ignore_errors=True)

    if 'test_project' in targets:
        print('Loading Benign Data...')
        #loaded = load_params(TEST_PARAMS)
        #urls, smali_folders = etl.get_data(**loaded)
        #etl.smali_mover(smali_folders)
        b_app_dict = etl.get_smali_files_per_app()
        malware_apps = etl.find_malware_filepaths()
        sampled = etl.sample_malware(malware_apps, 20)
        m_app_dict = etl.get_malware_smali_files_per_app(sampled)
        b_app_dict_train, b_app_dict_test = etl.split_dictionary(b_app_dict)
        m_app_dict_train, m_app_dict_test = etl.split_dictionary(m_app_dict)
        labels = pcs.get_labels(b_app_dict_train, m_app_dict_train)

        print('Calculating A Data')
        train_api_calls, train_apis_per_app = pcs.A_matrix_calc(
            b_app_dict_train, m_app_dict_train)
        A_matrix_train = pcs.A_matrix_func(train_api_calls,
                                           train_apis_per_app).tocsr()
        test_api_calls, test_apis_per_app = pcs.A_matrix_calc(
            b_app_dict_test, m_app_dict_test)
        A_matrix_test = pcs.A_matrix_func(test_api_calls,
                                          test_apis_per_app).tocsr()

        print('Calculating B Data')
        B_api_calls_train, B_code_blocks_train = pcs.B_matrix_calc(
            b_app_dict_train, m_app_dict_train)
        B_matrix_train = pcs.B_matrix_func(B_api_calls_train,
                                           B_code_blocks_train).tocsr()
        B_api_calls_test, B_code_blocks_test = pcs.B_matrix_calc(
            b_app_dict_test, m_app_dict_test)
        B_matrix_test = pcs.B_matrix_func(B_api_calls_test,
                                          B_code_blocks_test).tocsr()

        print('Calculating P Data')
        P_api_calls_train, P_packages_train = pcs.P_matrix_calc(
            b_app_dict_train, m_app_dict_train)
        P_matrix_train = pcs.P_matrix_func(P_api_calls_train,
                                           P_packages_train).tocsr()
        P_api_calls_test, P_packages_test = pcs.P_matrix_calc(
            b_app_dict_test, m_app_dict_test)
        P_matrix_test = pcs.P_matrix_func(P_api_calls_test,
                                          P_packages_test).tocsr()

        print('Calculating AAT')
        AAT = A_matrix_train.dot(A_matrix_train.T).todense()
        AAT_test = A_matrix_test.dot(A_matrix_test.T).todense()
        model = trn.train_model(AAT, labels)
        AAT_score = trn.assess_model(model, AAT_test, labels)
        print(
            'Classifier accuracy on A(A^T) metapath kernel with 10 apps: {0}'.
            format(round(AAT_score, 8)))

        print('Calculating ABAT')
        ABAT = (A_matrix_train.dot(B_matrix_train)).dot(
            A_matrix_train.T).todense()
        ABAT_test = ((A_matrix_test.dot(B_matrix_test)).dot(
            A_matrix_test.T)).todense()
        model = trn.train_model(ABAT, labels)
        ABAT_score = trn.assess_model(model, ABAT_test, labels)
        print(
            'Classifier accuracy on (AB(A^T)) metapath kernel with 10 apps: {0}'
            .format(round(ABAT_score, 8)))

        print('Calculating APAT')
        APAT = (A_matrix_train.dot(P_matrix_train)).dot(
            A_matrix_train.T).todense()
        APAT_test = ((A_matrix_test.dot(P_matrix_test)).dot(
            A_matrix_test.T)).todense()
        model = trn.train_model(APAT, labels)
        APAT_score = trn.assess_model(model, APAT_test, labels)
        print(
            'Classifier accuracy on (AP(A^T)) metapath kernel with 10 apps: {0}'
            .format(round(APAT_score, 8)))

    if 'test_data' in targets:
        loaded = load_params(DATA_PARAMS)
        urls, smali_folders = etl.get_data(**loaded)
        etl.smali_mover(smali_folders)
        b_app_dict = etl.get_smali_files_per_app()
        malware_apps = etl.find_malware_filepaths()
        sampled = etl.sample_malware(malware_apps, 20)
        m_app_dict = etl.get_malware_smali_files_per_app(sampled)
        b_app_dict_train, b_app_dict_test = etl.split_dictionary(b_app_dict)
        m_app_dict_train, m_app_dict_test = etl.split_dictionary(m_app_dict)

    if 'process' in targets:
        labels = pcs.get_labels(b_app_dict_train, m_app_dict_train)
        train_api_calls, train_apis_per_app = pcs.A_matrix_calc(
            b_app_dict_train, m_app_dict_train)
        A_matrix_train = pcs.A_matrix_func(train_api_calls,
                                           train_apis_per_app).tocsr()
        test_api_calls, test_apis_per_app = pcs.A_matrix_calc(
            b_app_dict_test, m_app_dict_test)
        A_matrix_test = pcs.A_matrix_func(test_api_calls,
                                          test_apis_per_app).tocsr()

        B_api_calls_train, B_code_blocks_train = pcs.B_matrix_calc(
            b_app_dict_train, m_app_dict_train)
        B_matrix_train = pcs.B_matrix_func(B_api_calls_train,
                                           B_code_blocks_train).tocsr()
        B_api_calls_test, B_code_blocks_test = pcs.B_matrix_calc(
            b_app_dict_test, m_app_dict_test)
        B_matrix_test = pcs.B_matrix_func(B_api_calls_test,
                                          B_code_blocks_test).tocsr()

        P_api_calls_train, P_packages_train = pcs.P_matrix_calc(
            b_app_dict_train, m_app_dict_train)
        P_matrix_train = pcs.P_matrix_func(P_api_calls_train,
                                           P_packages_train).tocsr()
        P_api_calls_test, P_packages_test = pcs.P_matrix_calc(
            b_app_dict_test, m_app_dict_test)
        P_matrix_test = pcs.P_matrix_func(P_api_calls_test,
                                          P_packages_test).tocsr()

        AAT = A_matrix_train.dot(A_matrix_train.T).todense()
        ABAT = (A_matrix_train.dot(B_matrix_train)).dot(
            A_matrix_train.T).todense()
        APAT = (A_matrix_train.dot(P_matrix_train)).dot(
            A_matrix_train.T).todense()
        APBPA = (((
            A_matrix_train.dot(P_matrix_train)).dot(B_matrix_train)).dot(
                P_matrix_train.T)).dot(A_matrix_train.T).todense()

        AAT_test = A_matrix_test.dot(A_matrix_test.T).todense()
        ABAT_test = ((A_matrix_test.dot(B_matrix_test)).dot(
            A_matrix_test.T)).todense()
        APAT_test = ((A_matrix_test.dot(P_matrix_test)).dot(
            A_matrix_test.T)).todense()
        APBPA_test = (((
            A_matrix_test.dot(P_matrix_test)).dot(B_matrix_test)).dot(
                P_matrix_test.T)).dot(A_matrix_test.T).todense()

    if 'train' in targets:

        AAT = A_matrix_train.dot(A_matrix_train.T).todense()
        AAT_test = A_matrix_test.dot(A_matrix_test.T).todense()
        model = trn.train_model(AAT, labels)
        AAT_score = trn.assess_model(model, AAT_test, labels)

        ABAT = (A_matrix_train.dot(B_matrix_train)).dot(
            A_matrix_train.T).todense()
        ABAT_test = ((A_matrix_test.dot(B_matrix_test)).dot(
            A_matrix_test.T)).todense()
        model = trn.train_model(ABAT, labels)
        ABAT_score = trn.assess_model(model, ABAT_test, labels)

        APAT = (A_matrix_train.dot(P_matrix_train)).dot(
            A_matrix_train.T).todense()
        APAT_test = ((A_matrix_test.dot(P_matrix_test)).dot(
            A_matrix_test.T)).todense()
        model = trn.train_model(APAT, labels)
        APAT_score = trn.assess_model(model, APAT_test, labels)

        APBPA = (((
            A_matrix_train.dot(P_matrix_train)).dot(B_matrix_train)).dot(
                P_matrix_train.T)).dot(A_matrix_train.T).todense()
        APBPA_test = (((
            A_matrix_test.dot(P_matrix_test)).dot(B_matrix_test)).dot(
                P_matrix_test.T)).dot(A_matrix_test.T).todense()
        model = trn.train_model(APBPA, labels)
        APBPA_score = trn.assess_model(model, APBPA_test, labels)

    if 'analysis' in targets:
        print('Starting analysis...')
        print(
            'Classifier accuracy on A(A^T) metapath kernel with 10 apps: {0}'.
            format(round(AAT_score, 8)))
        print(
            'Classifier accuracy on (AB(A^T)) metapath kernel with 10 apps: {0}'
            .format(round(ABAT_score, 8)))
        print(
            'Classifier accuracy on AP(A^T) metapath kernel with 10 apps: {0}'.
            format(round(APAT_score, 8)))
        #print('Classifier accuracy on APB(P^T)(A^T) metapath kernel with 10 apps: {0}'.format(round(APBPA_score,8)))
        print('...Done')

    return
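Every metapath kernel in Example #20 goes through the same train/assess/print shape, which invites copy-paste mistakes. A sketch that loops over the kernels instead, assuming the matrices, labels, and trn helpers defined above:

# Assumes AAT/ABAT/APAT (and the *_test versions), labels, and trn
# from Example #20 are in scope.
kernels = {
    'A(A^T)':  (AAT, AAT_test),
    'AB(A^T)': (ABAT, ABAT_test),
    'AP(A^T)': (APAT, APAT_test),
}
scores = {}
for name, (train_kernel, test_kernel) in kernels.items():
    model = trn.train_model(train_kernel, labels)
    scores[name] = trn.assess_model(model, test_kernel, labels)
    print('Classifier accuracy on {} metapath kernel: {}'.format(
        name, round(scores[name], 8)))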
Example #21
MEASURES = ['RevFreq', 'RevSpeed', 'RevDur']
pvlimit = 0.001
alpha = 0.05
independent_variable = 'groupname'

for strain_dir in strain_dirs:
    strain_name = os.path.basename(strain_dir)
    print(f'\n\nProcessing: {strain_name}')
    # data directory
    raw_data_dir = os.path.join(strain_dir, sub_dir)
    # make output folder
    output_dir = os.path.join(os.path.dirname(raw_data_dir), OUTPUT_DIR_NAME)
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    # get data
    rawdata, db = etl.get_data(raw_data_dir)
    df_transform = rawdata.pivot(index='mwtid', columns='tap', values=MEASURES)
    # calculate integral (requires stats and etl package)
    intobj = stats.Integral(df_transform)
    data_integral = intobj.bycolumns(MEASURES)
    data_integral = etl.merge_data_mwtdb(data_integral, db)

    # save excel graphing output by measures---
    iv = 'groupname'
    savefname = f'graph_data_{PRJ_TAG}.xlsx'
    savepath = os.path.join(output_dir, savefname)
    graphpack.save_excel_graphdata(data_integral, iv, MEASURES, savepath)

    # set up anova report ---
    filename = os.path.join(output_dir, 'anova.txt')