def parse_all_files():
    class_csv = parsers.parse_csv('data/mcp/classes.csv', 3, ',', ['full', 'trashbin', 'notch_c', 'trashbin', 'notch_s', 'description'])
    method_csv = parsers.parse_csv('data/mcp/methods.csv', 4, ',', ['trashbin', 'searge_c', 'trashbin', 'searge_s', 'full', 'description'])
    field_csv = parsers.parse_csv('data/mcp/fields.csv', 3, ',', ['trashbin', 'trashbin', 'searge_c', 'trashbin', 'trashbin', 'searge_s', 'full', 'description'])

    #client_rgs = parsers.parse_rgs(config['client_rgs'])           #maps notch_name to searge_name for the client
    server_rgs = parsers.parse_rgs('data/mcp/minecraft_server.rgs') #maps notch_name to searge_name for the server

    #We want 3 dicts per side (client/server): one each for classes, methods and fields. Each dict is keyed by
    #searge_name, as it is the only unique identifier we are sure of for now. Each entry holds at least
    #notch_name, searge_name and full_name. For classes, searge_name and full_name are identical. Methods also
    #carry the notch_signature and possibly the searge_signature.
    #Packages can also be stored somewhere for the reobfuscation step.

    #Start with the class dictionary. For this one, we only need the rgs file.
    #class_dict_c = create_dic_class(client_rgs)
    class_dict_s = create_dic_class(server_rgs)

    #Now the fields, as they are easy to process. Need both the csv and the rgs; the extra arguments select the right csv column.
    #field_dict_c = create_dic_member(client_rgs, field_csv, class_dict_c, 'c', 'field_map', config)
    field_dict_s = create_dic_member(server_rgs, field_csv, class_dict_s, 's', 'field_map')

    #And finally the methods. Same as before.
    #method_dict_c = create_dic_member(client_rgs, method_csv, class_dict_c, 'c', 'method_map', config)
    method_dict_s = create_dic_member(server_rgs, method_csv, class_dict_s, 's', 'method_map')

    nmethods = 0
    nfields = 0
    nclasses = 0
    for ckey in sorted(class_dict_s.keys()):
        nclasses += 1
        #print '* Post-processing class %s...' % ckey
        for mkey in method_dict_s:
            method = method_dict_s[mkey]
            if method['class'] == ckey:
                nmethods += 1
                nmkey = method['csv']  # Use the CSV name as the method key when available.
                if nmkey is None:
                    nmkey = method['searge']
                class_dict_s[ckey]['methods'][nmkey] = method
        for fkey in field_dict_s:
            field = field_dict_s[fkey]
            if field['class'] == ckey:
                nfields += 1
                nfkey = field['csv']  # Use the CSV name as the field key when available.
                if nfkey is None:
                    nfkey = field['searge']
                class_dict_s[ckey]['fields'][nfkey] = field

    print '*** POST-PROCESSING COMPLETE ***'
    print ' + %d classes' % nclasses
    print ' + %d methods' % nmethods
    print ' + %d fields' % nfields

    #solve_collisions(client_dic)
    #solve_collisions(server_dic)
    return class_dict_s
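# Hedged sketch (not the project's actual code): judging from the comments above and the
# post-processing loop, create_dic_class likely turns the rgs class mapping into the
# searge_name-keyed dict. The 'class_map' key and the (notch, searge) pair layout are
# assumptions; the name *_sketch marks this as hypothetical.
def create_dic_class_sketch(rgs):
    class_dict = {}
    for notch_name, searge_name in rgs['class_map']:
        class_dict[searge_name] = {
            'notch':   notch_name,
            'searge':  searge_name,
            'full':    searge_name,  # for classes, searge_name == full_name
            'methods': {},           # filled in during post-processing
            'fields':  {},           # filled in during post-processing
        }
    return class_dict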
def parse_all_files(config_file):
    renamer_options = {
        'class_csv':       None,
        'field_csv':       None,
        'method_csv':      None,
        'server_rgs':      None,
        'client_rgs':      None,
        'server_rgs_out':  None,
        'client_rgs_out':  None,
        'server_src':      None,
        'client_src':      None,
        'package_name':    None,
        'unknown_name':    None,
        'known_name':      None,
        'md5_file_client': None,
        'md5_file_server': None,
    }
    config = parsers.parse_config(config_file, renamer_options)

    class_csv = parsers.parse_csv(config['class_csv'], 3, ',', ['full', 'trashbin', 'notch_c', 'trashbin', 'notch_s', 'description'])
    method_csv = parsers.parse_csv(config['method_csv'], 4, ',', ['trashbin', 'searge_c', 'trashbin', 'searge_s', 'full', 'description'])
    field_csv = parsers.parse_csv(config['field_csv'], 3, ',', ['trashbin', 'trashbin', 'searge_c', 'trashbin', 'trashbin', 'searge_s', 'full', 'description'])

    client_rgs = parsers.parse_rgs(config['client_rgs']) #maps notch_name to searge_name for the client
    server_rgs = parsers.parse_rgs(config['server_rgs']) #maps notch_name to searge_name for the server

    #We want 3 dicts per side (client/server): one each for classes, methods and fields. Each dict is keyed by
    #searge_name, as it is the only unique identifier we are sure of for now. Each entry holds at least
    #notch_name, searge_name and full_name. For classes, searge_name and full_name are identical. Methods also
    #carry the notch_signature and possibly the searge_signature.
    #Packages can also be stored somewhere for the reobfuscation step.

    #Start with the class dictionary. For this one, we only need the rgs file.
    class_dict_c = create_dic_class(client_rgs)
    class_dict_s = create_dic_class(server_rgs)

    #Now the fields, as they are easy to process. Need both the csv and the rgs; the extra arguments select the right csv column.
    field_dict_c = create_dic_member(client_rgs, field_csv, class_dict_c, 'c', 'field_map', config)
    field_dict_s = create_dic_member(server_rgs, field_csv, class_dict_s, 's', 'field_map', config)

    #And finally the methods. Same as before.
    method_dict_c = create_dic_member(client_rgs, method_csv, class_dict_c, 'c', 'method_map', config)
    method_dict_s = create_dic_member(server_rgs, method_csv, class_dict_s, 's', 'method_map', config)

    client_dic = {'class': class_dict_c, 'method': method_dict_c, 'field': field_dict_c}
    server_dic = {'class': class_dict_s, 'method': method_dict_s, 'field': field_dict_s}

    #solve_collisions(client_dic)
    #solve_collisions(server_dic)
    return client_dic, server_dic, config
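# Hedged usage sketch: the config file is expected to define the keys listed in
# renamer_options above; the exact file format depends on parsers.parse_config.
# The file name 'mcp.cfg' is illustrative only.
# client_dic, server_dic, config = parse_all_files('mcp.cfg')
# server_classes = server_dic['class']  # searge_name -> class entry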
def local_stats_sum(args):
    input, state, cache, cache_dir, id, owner = readin(args)
    base_dir = state["baseDirectory"]
    preprocess_method = input["preprocess_method"]
    use_CV = input["use_CV"]
    output = {}

    (X, y, name_features) = parse_csv(input)

    # split and store train / test dataset
    X_train, X_test, y_train, y_test = split_save_train_test(
        input, output, cache, id, owner, cache_dir, X, y)

    # split folds for CV
    if use_CV:
        fold_indices, valid_indices = split_folds_save_valid(
            input, output, cache, cache_dir, X_train, y_train)

    # cache dict
    cache["preprocess_method"] = preprocess_method
    cache["use_CV"] = use_CV
    cache["n_features"] = X.shape[1]
    if use_CV:
        cache["n_folds"] = input["n_folds"]

    # calculate stats: n_samples, sum
    stats = Stats(cache, preprocess_method)
    stats.cal_stats(X_train=X_train, y_train=y_train, y_test=y_test)
    stats.add_output(output)
    if use_CV:
        stats_CV = Stats_CV(cache, preprocess_method)
        stats_CV.cal_stats(
            X_train=X_train,
            y_train=y_train,
            fold_indices=fold_indices,
            valid_indices=valid_indices,
        )
        stats_CV.add_output(output)

    # output dict: the owner forwards the run configuration to the aggregator
    if id == owner:
        output["msg"] = "to_agg_mean"
        output["label"] = input["label"]
        output["train_split_local"] = input["train_split_local"]
        output["train_split_owner"] = input["train_split_owner"]
        output["preprocess_method"] = preprocess_method
        output["max_iter"] = input["max_iter"]
        output["tol"] = input["tol"]
        output["positive"] = input["positive"]
        output["selection"] = input["selection"]
        output["lambdas"] = input["lambdas"]
        output["eps"] = input["eps"]
        output["n_lambdas"] = input["n_lambdas"]
        output["use_CV"] = use_CV
        if use_CV:
            output["n_folds"] = input["n_folds"]
        output["name_features"] = name_features

    result_dict = {"output": output, "cache": cache}
    return json.dumps(result_dict)
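# Hedged sketch of the message this function expects, inferred purely from the keys it
# reads; the real schema (and exactly what readin() unpacks) may differ, and parse_csv(input)
# likely reads further keys (e.g. data file paths) not shown here. All values are illustrative.
example_args = {
    "input": {
        "preprocess_method": "standardize",
        "use_CV": True,
        "n_folds": 5,
        "label": "y",
        "train_split_local": 0.8,
        "train_split_owner": 0.8,
        "max_iter": 1000,
        "tol": 1e-4,
        "positive": False,
        "selection": "cyclic",
        "lambdas": None,
        "eps": 1e-3,
        "n_lambdas": 100,
    },
    "state": {"baseDirectory": "/input/local0"},
}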
def csv_serving_input_fn():
    csv_row = tf.placeholder(shape=[None], dtype=tf.string)
    features = parsers.parse_csv(csv_row)
    features.pop(metadata.TARGET_NAME)
    return tf.contrib.learn.InputFnOps(preprocess.process_features(features),
                                       None,
                                       {'csv_row': csv_row})
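# Hedged usage sketch: a serving input fn returning InputFnOps plugs into the legacy
# tf.contrib.learn export path (TF 1.x); the estimator and export directory below
# are illustrative, not part of this snippet.
# estimator.export_savedmodel('exports/', serving_input_fn=csv_serving_input_fn)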
def task():
    inp = input("Enter <<visual_descriptor_model model k imageID>> :")
    inpStringArray = inp.split()
    vd_model = inpStringArray[0]
    model = inpStringArray[1]
    imageID = int(inpStringArray[3])
    try:
        k = int(inpStringArray[2])
    except (IndexError, ValueError):
        print("Error: please enter a valid value of k")
        return

    location_dict = parseLocationFile()
    csv_df = parse_csv(location_dict, vd_model)  # dataframe containing all images and their feature values
    csv_df_locations = csv_df['location'].tolist()  # store all locations in a list
    del csv_df['location']  # drop the location column so it does not interfere with the calculation
    # csv_df_locations = [i[1] for i in csv_df.index.tolist()]

    # Similarity calculation based on TF
    if model.upper() == 'SVD':
        item_ids = csv_df.index.tolist()  # store all image ids in a list
        u_df, sigma_df, v_df = perform_svd(csv_df, item_ids, k)
        sorted_df = cal_sorted_distance(u_df, imageID, item_ids, csv_df_locations)
        print(k, ' latent semantics: \n', v_df)
        print('\n')
        print('Top 5 matching images: \n', sorted_df[0].head(5))
        print('\n')
        print('Top 5 matching locations: \n', sorted_df.drop_duplicates('location').head(5))
    elif model.upper() == 'PCA':
        item_ids = csv_df.index.tolist()
        image_df, v_df = perform_pca(csv_df, item_ids, k)
        print(v_df)
        sorted_df = cal_sorted_distance(image_df, imageID, item_ids, csv_df_locations)
        print('\nTop 5 matching images:\n', sorted_df[0].head(5))
        print('\nTop 5 matching locations:\n', sorted_df.drop_duplicates('location').head(5))
    elif model.upper() == 'LDA':
        task3_lda.task_lda(vd_model, k, imageID)
    else:
        print('Please enter valid input.')
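# Example of the expected prompt input (values illustrative, format per the prompt above):
#   CM SVD 5 42
# parses to vd_model='CM', model='SVD', k=5, imageID=42.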
def csv_serving_input_fn():
    csv_row = tf.placeholder(shape=[None], dtype=tf.string)
    features = parsers.parse_csv(csv_row)
    features.pop(metadata.TARGET_NAME)
    if metadata.TASK_TYPE == "custom":
        return tf.estimator.export.ServingInputReceiver(
            features=preprocess.process_features(features),
            receiver_tensors={'csv_row': csv_row})
    return tf.contrib.learn.InputFnOps(preprocess.process_features(features),
                                       None,
                                       {'csv_row': csv_row})
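# Hedged note: the "custom" branch returns a tf.estimator ServingInputReceiver, which
# suits tf.estimator.Estimator exports, while the fallback keeps the legacy
# tf.contrib.learn InputFnOps. Example export call, names illustrative:
# estimator.export_savedmodel('exports/', csv_serving_input_fn)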
import parsers

parsers.parse_xml()
parsers.parse_csv()
parsers.parse_json()
parsers.write_xml()