Example #1
import parsers

def parse_all_files():
    class_csv  = parsers.parse_csv('data/mcp/classes.csv',  3, ',', ['full',      'trashbin', 'notch_c',  'trashbin',  'notch_s', 'description'])
    method_csv = parsers.parse_csv('data/mcp/methods.csv',  4, ',', ['trashbin',  'searge_c', 'trashbin', 'searge_s',  'full', 'description'])    
    field_csv  = parsers.parse_csv('data/mcp/fields.csv',   3, ',', ['trashbin',  'trashbin', 'searge_c', 'trashbin',  'trashbin', 'searge_s', 'full', 'description'])    
    
    #client_rgs = parsers.parse_rgs(config['client_rgs']) #contains a list of notch_name to searge_name for the client
    server_rgs = parsers.parse_rgs('data/mcp/minecraft_server.rgs') #contains a list of notch_name to searge_name for the server

    #We want 3 dicts per side (client/server): one for classes, one for methods and one for fields. Each dict is keyed by the searge_name,
    #as it is the only unique identifier we are sure of for now. Each entry holds at least the notch_name, searge_name and full_name.
    #For classes, searge_name and full_name are identical. Method entries also carry the notch_signature and maybe the searge_signature.
    #Packages can also be stored somewhere as a value for the reobfuscation step.

    #Let's start with the class dictionary. For this one, we just need the rgs file.
    #class_dict_c = create_dic_class(client_rgs)
    class_dict_s = create_dic_class(server_rgs)

    #Now the fields, as they are easy to process. These need both the csv and the rgs; the 'c'/'s' argument selects the right CSV column.
    #field_dict_c = create_dic_member(client_rgs, field_csv, class_dict_c, 'c', 'field_map', config)
    field_dict_s = create_dic_member(server_rgs, field_csv, class_dict_s, 's', 'field_map')

    #And finally the methods. Same as before.
    #method_dict_c = create_dic_member(client_rgs, method_csv, class_dict_c, 'c', 'method_map', config)
    method_dict_s = create_dic_member(server_rgs, method_csv, class_dict_s, 's', 'method_map')

    nmethods = 0
    nfields = 0
    nclasses = 0
    for ckey in sorted(class_dict_s):
        nclasses += 1
        #print('* Post-processing class %s...' % ckey)
        for mkey in method_dict_s:
            method = method_dict_s[mkey]
            if method['class'] == ckey:
                nmethods += 1
                nmkey = method['csv']  # Prefer the CSV name as the method key.
                if nmkey is None:
                    nmkey = method['searge']
                class_dict_s[ckey]['methods'][nmkey] = method
        for fkey in field_dict_s:
            field = field_dict_s[fkey]
            if field['class'] == ckey:
                nfields += 1
                nfkey = field['csv']  # Prefer the CSV name as the field key.
                if nfkey is None:
                    nfkey = field['searge']
                class_dict_s[ckey]['fields'][nfkey] = field

    print('*** POST-PROCESSING COMPLETE ***')
    print(' + %d classes' % nclasses)
    print(' + %d methods' % nmethods)
    print(' + %d fields' % nfields)
    
    #solve_collisions(client_dic)
    #solve_collisions(server_dic)

    return class_dict_s
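
The comments above describe the intended layout of these dicts. Below is a minimal sketch of what create_dic_class and create_dic_member plausibly produce; every name and value is illustrative, not taken from the real mappings:

# Illustrative shapes only -- the real entries come from the rgs and csv files.
class_dict_s = {
    'net/minecraft/src/Block': {            # keyed by searge_name (== full_name for classes)
        'notch': 'pb',                      # obfuscated (Notch) name, hypothetical
        'searge': 'net/minecraft/src/Block',
        'full': 'net/minecraft/src/Block',
        'methods': {},                      # filled in during post-processing
        'fields': {},
    },
}
method_dict_s = {
    'func_123_a': {                         # keyed by searge_name, hypothetical
        'notch': 'a',
        'searge': 'func_123_a',
        'csv': 'getBlockName',              # human-readable name from methods.csv, or None
        'class': 'net/minecraft/src/Block', # owning class, used by the loop above
    },
}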
Example #2
import parsers

def parse_all_files(config_file):
    renamer_options = { 'class_csv'       : None,
                        'field_csv'       : None,
                        'method_csv'      : None,
                        'server_rgs'      : None,
                        'client_rgs'      : None,
                        'server_rgs_out'  : None,
                        'client_rgs_out'  : None,
                        'server_src'      : None,
                        'client_src'      : None,
                        'package_name'    : None,
                        'unknown_name'    : None,
                        'known_name'      : None,
                        'md5_file_client' : None,
                        'md5_file_server' : None,
                        }

    config     = parsers.parse_config(config_file, renamer_options)
    
    class_csv  = parsers.parse_csv(config['class_csv'],  3, ',', ['full',      'trashbin', 'notch_c',  'trashbin',  'notch_s', 'description'])
    method_csv = parsers.parse_csv(config['method_csv'], 4, ',', ['trashbin',  'searge_c', 'trashbin', 'searge_s',  'full', 'description'])    
    field_csv  = parsers.parse_csv(config['field_csv'],  3, ',', ['trashbin',  'trashbin', 'searge_c', 'trashbin',  'trashbin', 'searge_s', 'full', 'description'])    
    
    client_rgs = parsers.parse_rgs(config['client_rgs']) #contains a list of notch_name to searge_name for the client
    server_rgs = parsers.parse_rgs(config['server_rgs']) #contains a list of notch_name to searge_name for the server

    #We want 3 dicts per side (client/server): one for classes, one for methods and one for fields. Each dict is keyed by the searge_name,
    #as it is the only unique identifier we are sure of for now. Each entry holds at least the notch_name, searge_name and full_name.
    #For classes, searge_name and full_name are identical. Method entries also carry the notch_signature and maybe the searge_signature.
    #Packages can also be stored somewhere as a value for the reobfuscation step.

    #Let's start with the class dictionary. For this one, we just need the rgs file.
    class_dict_c = create_dic_class(client_rgs)
    class_dict_s = create_dic_class(server_rgs)

    #Now the fields, as they are easy to process. These need both the csv and the rgs; the 'c'/'s' argument selects the right CSV column.
    field_dict_c = create_dic_member(client_rgs, field_csv, class_dict_c, 'c', 'field_map', config)
    field_dict_s = create_dic_member(server_rgs, field_csv, class_dict_s, 's', 'field_map', config)

    #And finally the methods. Same as before.
    method_dict_c = create_dic_member(client_rgs, method_csv, class_dict_c, 'c', 'method_map', config)
    method_dict_s = create_dic_member(server_rgs, method_csv, class_dict_s, 's', 'method_map', config)

    client_dic = {'class':class_dict_c, 'method':method_dict_c, 'field':field_dict_c}
    server_dic = {'class':class_dict_s, 'method':method_dict_s, 'field':field_dict_s}

    #solve_collisions(client_dic)
    #solve_collisions(server_dic)

    return client_dic, server_dic, config
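
A hedged usage sketch: assuming parse_config reads the options listed in renamer_options from a flat config file (the file name and format here are assumptions), the function is driven like this:

# Hypothetical driver -- 'mcp.cfg' is an assumed file name, not the real config.
client_dic, server_dic, config = parse_all_files('mcp.cfg')

# Each side's dict bundles the three lookup tables built above.
print(len(server_dic['class']), 'server classes')
print(len(server_dic['method']), 'server methods')
print(len(server_dic['field']), 'server fields')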
Example #3
import json

def local_stats_sum(args):
    input, state, cache, cache_dir, id, owner = readin(args)
    base_dir = state["baseDirectory"]
    preprocess_method = input["preprocess_method"]
    use_CV = input["use_CV"]
    output = {}

    (X, y, name_features) = parse_csv(input)
    # split and store train / test dataset
    X_train, X_test, y_train, y_test = split_save_train_test(
        input, output, cache, id, owner, cache_dir, X, y)
    # split folds for CV
    if use_CV:
        fold_indices, valid_indices = split_folds_save_valid(
            input, output, cache, cache_dir, X_train, y_train)

    # cache dict
    cache["preprocess_method"] = preprocess_method
    cache["use_CV"] = use_CV
    cache["n_features"] = X.shape[1]
    if use_CV:
        cache["n_folds"] = input["n_folds"]

    # calculate stats: n_samples, sum
    stats = Stats(cache, preprocess_method)
    stats.cal_stats(X_train=X_train, y_train=y_train, y_test=y_test)
    stats.add_output(output)
    if use_CV:
        stats_CV = Stats_CV(cache, preprocess_method)
        stats_CV.cal_stats(
            X_train=X_train,
            y_train=y_train,
            fold_indices=fold_indices,
            valid_indices=valid_indices,
        )
        stats_CV.add_output(output)

    # output dict; only the owner node forwards the full run configuration
    if id == owner:
        output["msg"] = "to_agg_mean"
        output["label"] = input["label"]
        output["train_split_local"] = input["train_split_local"]
        output["train_split_owner"] = input["train_split_owner"]
        output["preprocess_method"] = preprocess_method
        output["max_iter"] = input["max_iter"]
        output["tol"] = input["tol"]
        output["positive"] = input["positive"]
        output["selection"] = input["selection"]
        output["lambdas"] = input["lambdas"]
        output["eps"] = input["eps"]
        output["n_lambdas"] = input["n_lambdas"]
        output["use_CV"] = use_CV

        if use_CV:
            output["n_folds"] = input["n_folds"]

        output["name_features"] = name_features

    result_dict = {"output": output, "cache": cache}
    return json.dumps(result_dict)
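
Since the function returns a JSON string, a caller can recover the two dicts directly. A sketch, with args left opaque because readin's expected payload is not shown in this excerpt:

# Hypothetical round-trip of the return value; 'args' is whatever readin expects.
result = json.loads(local_stats_sum(args))
output, cache = result["output"], result["cache"]
if output.get("msg") == "to_agg_mean":
    # only the owner node attaches the full run configuration
    print("owner output with", len(output["name_features"]), "features")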
Example #4
import tensorflow as tf

def csv_serving_input_fn():

    # Accept a batch of raw CSV rows, parse them into feature tensors,
    # and drop the target column (not available at serving time).
    csv_row = tf.placeholder(shape=[None], dtype=tf.string)

    features = parsers.parse_csv(csv_row)
    features.pop(metadata.TARGET_NAME)
    return tf.contrib.learn.InputFnOps(preprocess.process_features(features),
                                       None, {'csv_row': csv_row})
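
In the TF 1.x contrib API this function is handed to export_savedmodel when exporting the model for serving; a sketch, assuming estimator is a tf.contrib.learn.Estimator and export_dir is a writable path (both names are assumptions):

# Assumed names: 'estimator' (a tf.contrib.learn.Estimator) and 'export_dir'.
estimator.export_savedmodel(export_dir, csv_serving_input_fn)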
Example #5
def task():
    inp = input("Enter <<visual_descriptor_model model k imageID>> :")
    inpStringArray = inp.split()

    try:
        vd_model = inpStringArray[0]
        model = inpStringArray[1]
        k = int(inpStringArray[2])
        imageID = int(inpStringArray[3])
    except (IndexError, ValueError):
        print("Error : Please enter valid values of k and imageID")
        return

    location_dict = parseLocationFile()
    # dataframe containing every image and its feature values
    csv_df = parse_csv(location_dict, vd_model)
    # keep the locations in a list, then drop the column so it doesn't interfere with the distance calculation
    csv_df_locations = csv_df['location'].tolist()
    del csv_df['location']
    # csv_df_locations = [i[1] for i in csv_df.index.tolist()]

    # Reduce dimensionality with the chosen model, then rank by similarity
    if model.upper() == 'SVD':
        item_ids = csv_df.index.tolist()  #storing all image ids inside list
        u_df, sigma_df, v_df = perform_svd(csv_df, item_ids, k)
        sorted_df = cal_sorted_distance(u_df, imageID, item_ids,
                                        csv_df_locations)
        print(k, ' latent semantics: \n', v_df)
        print('\n')
        print('Top 5 matching images: \n', sorted_df[0].head(5))
        print('\n')
        print('Top 5 matching locations: \n',
              sorted_df.drop_duplicates('location').head(5))

    elif model.upper() == 'PCA':
        item_ids = csv_df.index.tolist()
        image_df, v_df = perform_pca(csv_df, item_ids, k)
        print(v_df)
        sorted_df = cal_sorted_distance(image_df, imageID, item_ids,
                                        csv_df_locations)
        print('\nTop 5 matching images:\n', sorted_df[0].head(5))
        print('\nTop 5 matching locations:\n',
              sorted_df.drop_duplicates('location').head(5))

    elif model.upper() == 'LDA':
        task3_lda.task_lda(vd_model, k, imageID)

    else:
        print('Please enter valid input.')
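
perform_svd itself is not shown. One plausible implementation uses a plain numpy thin SVD and keeps the top-k triplets; the DataFrame layout is a guess from how u_df and v_df are used above:

import numpy as np
import pandas as pd

def perform_svd(csv_df, item_ids, k):
    # Thin SVD of the image-by-feature matrix.
    U, s, Vt = np.linalg.svd(csv_df.values, full_matrices=False)
    u_df = pd.DataFrame(U[:, :k], index=item_ids)           # images in the k-dim latent space
    sigma_df = pd.DataFrame(np.diag(s[:k]))                 # top-k singular values
    v_df = pd.DataFrame(Vt[:k, :], columns=csv_df.columns)  # k latent semantics over the features
    return u_df, sigma_df, v_df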
Example #6
import tensorflow as tf

def csv_serving_input_fn():

    # Accept raw CSV rows and parse them into feature tensors; the target
    # column is dropped because it is not available at serving time.
    csv_row = tf.placeholder(shape=[None], dtype=tf.string)

    features = parsers.parse_csv(csv_row)
    features.pop(metadata.TARGET_NAME)

    # Custom estimators use the core tf.estimator serving API.
    if metadata.TASK_TYPE == "custom":
        return tf.estimator.export.ServingInputReceiver(
            features=preprocess.process_features(features),
            receiver_tensors={'csv_row': csv_row})

    return tf.contrib.learn.InputFnOps(preprocess.process_features(features),
                                       None, {'csv_row': csv_row})
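
For the "custom" branch, the same function works as a serving_input_receiver_fn with the core tf.estimator export API in TF 1.x; a sketch under the same assumed names as before:

# Assumed names: 'estimator' (here a tf.estimator.Estimator) and 'export_dir'.
estimator.export_savedmodel(export_dir, csv_serving_input_fn)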
Example #7
import parsers

# Parse each supported input format...
parsers.parse_xml()
parsers.parse_csv()
parsers.parse_json()

# ...and write XML output.
parsers.write_xml()
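
These calls take no arguments, unlike the parse_csv signatures in the earlier examples, so they presuppose a different parsers module. A minimal hypothetical stub that would let the snippet run:

# parsers.py -- hypothetical stub; the real module's behavior is not shown.
def parse_xml():
    print("parsing XML")

def parse_csv():
    print("parsing CSV")

def parse_json():
    print("parsing JSON")

def write_xml():
    print("writing XML")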