def get_cv_grid(document, rid):
    """Return cross-validation results and display-ready training options.

    Args:
        document: dataset record; ``ml_has_cv`` and ``ml_opts`` are read.
        rid: dataset row id used in the Mongo filter. It may be a string or
            a number; it is converted with ``str()`` before being spliced
            into the filter.

    Returns:
        A ``(cv_grid_data, param_str, jopts)`` tuple. ``cv_grid_data`` and
        ``param_str`` come from the ``cv_result`` Mongo document and stay
        ``None`` when ``document.ml_has_cv`` is not ``"yes"`` or nothing is
        found. ``jopts`` is the parsed ``document.ml_opts`` with
        ``learning_algorithm`` title-cased and underscores replaced by
        spaces; it is only built when a ``cv_result`` document exists.
    """
    cv_grid_data = None
    param_str = None
    jopts = None
    doc = None

    if document.ml_has_cv == "yes":
        # get data from mongo.dataset_info
        try:
            doc = query_mongo.find_one(
                settings.MONGO_OUT_DNS, settings.MONGO_OUT_PORT,
                settings.MONGO_OUT_DB, settings.MONGO_OUT_TBL,
                settings.MONGO_OUT_USR, settings.MONGO_OUT_PWD,
                # str(): a numeric rid would otherwise raise TypeError here
                # and be swallowed by the handler below
                '{"rid":' + str(rid) + ',"key":"cv_result"}',
                '{"param_str":1,"cv_grid_data":1,"best_param":1,"_id":0}')
        except Exception as e:
            # best effort: a Mongo failure only hides the CV section
            print("Exception from MongoDB: %s" % (e,))

    # if CV exists
    if doc:
        cv_grid_data = doc["cv_grid_data"]
        param_str = doc["param_str"]

        # convert ml_opts to json object, and set 1st uppercase & remove _
        if document.ml_opts:
            jopts = json.loads(document.ml_opts)
            # options saved without an algorithm must not break the page
            if isinstance(jopts, dict) and jopts.get("learning_algorithm"):
                jopts["learning_algorithm"] = (
                    jopts["learning_algorithm"].title().replace("_", " "))

    return cv_grid_data, param_str, jopts
def get_feat_impo(request, rid, perm, disabled4reader):
    """Return the stored feature-importance array for dataset ``rid``.

    Args:
        request: incoming request (not read by this function).
        rid: dataset row id; string or number, converted with ``str()``
            before being spliced into the Mongo filter.
        perm: permission value forwarded to ``_list.get_ds_doc``.
        disabled4reader: not read by this function.

    Returns:
        ``Response`` wrapping the ``value`` field of the
        ``feature_importance`` document, or ``{"data not found": -1}`` when
        the dataset is not accessible or no document exists.
    """
    not_found = {"data not found": -1}

    # chk access
    document = _list.get_ds_doc(rid, perm)
    if not document:
        return Response(not_found)

    # get data from mongo.dataset_info
    doc = query_mongo.find_one(
        settings.MONGO_OUT_DNS, settings.MONGO_OUT_PORT,
        settings.MONGO_OUT_DB, settings.MONGO_OUT_TBL,
        settings.MONGO_OUT_USR, settings.MONGO_OUT_PWD,
        # str(): plain concatenation raises TypeError for a numeric rid
        '{"rid":' + str(rid) + ',"key":"feature_importance"}',
        '{"value":1,"_id":0}')
    if not doc:
        return Response(not_found)
    return Response(doc["value"])
# main(): command-line entry point for predicting a single sample.
#
# Visible flow:
#   1. Parse CLI options (input file, output dir, row/dataset/child ids,
#      n-gram and feature limits, ML options as JSON, Spark settings and
#      MongoDB connection/auth); most defaults are read from `config`.
#   2. Read the sample from `input_gz` and split it on spaces; the first
#      token(s) are treated as label / optional sample info and the rest as
#      "key:value" features collected into curr_dic (value parsed as float).
#   3. When verbose == "1" and features exist, write
#      <out_dir>/<cid>_feature_list.json, pairing each feature id with an
#      entry of coef_arr fetched from MongoDB (key "coef_arr").
#   4. When labelnameflag == 1, translate the predicted label through the
#      "dic_name_label" MongoDB document; otherwise print it as a string.
#   Returns -5 when the input file cannot be read.
#
# NOTE(review): the text of this function is damaged and not valid Python:
#   - `if input_gz = else:` has lost its condition and assignment
#     (presumably the --name argument -- confirm against the original).
#   - `f =, 'rb') file_content = f.close()` has lost the open/read calls.
#   - `sing_label_pred` is printed and used but never assigned in the visible
#     text, so the prediction step itself appears to be missing.
#   - The "2nd item is integer" check converts label_feature_array[0] again;
#     index 1 was presumably intended -- TODO confirm.
#   - `learning_algorithm in ('kmeans')` is a substring test on a string, not
#     membership in a one-element tuple.
#   - eval() is applied to command-line and config strings; do not feed it
#     untrusted input.
def main(): parser = ArgumentParser(description=__description__) parser.add_argument("-d", "--name", type=str, metavar="file name", help="file name for prediction", required=False) parser.add_argument("-o", "--out", type=str, metavar="learner output", help="out files for prediction", required=False) parser.add_argument("-r", "--row_id", type=str, metavar="row_id number", help="row_id number in the db", required=False) parser.add_argument("-i", "--cid", type=str, metavar="child row id", help="child row id for prediction", required=False) ####other parameters parser.add_argument("-nb", "--num", type=str, metavar="n gram", help="window size for n gram", required=False) parser.add_argument("-pa", "--para", type=str, metavar="param in 1 gram", help="number of parameters in 1 gram, if -1, no 1 gram", required=False) parser.add_argument("-x", "--max", type=str, metavar="max number of features", help="max number of features generated", required=False) parser.add_argument("-fw", "--fromweb", type=str, metavar="flag for web", help="flag for web", required=False) parser.add_argument("-pm", "--parameter", type=str, metavar="parameters in json", help="json string contains learning alg and parameter selection", required=False) parser.add_argument("-pp", "--pca_param", type=str, metavar="pca parameters in json", help="json string contains pca parameter selection", required=False) parser.add_argument("-lb", "--lib", type=str, metavar="spark mllib or scikit", help="learning library used", required=False) parser.add_argument("-sl", "--showlabelname", type=str, metavar="show label name", help="0: not shown; 1: show label name", required=False) parser.add_argument("-dsid", "--ds_id", type=str, metavar="source dataset id", help="source dataset id for training option", required=False) parser.add_argument("-ptn", "--pattern_str", type=str, metavar="regular express pattern to extract string" , help="regular express pattern to extract string", required=False) parser.add_argument("-vb", 
"--verbose", type=str, metavar="show detailed features", help="show detailed features", required=False) parser.add_argument("-ft", "--feat_cnt_threshold", type=str, dest='feat_cnt_threshold', help="feature count to allow prediction" , default =config.get('machine_learning', 'feature_count_threshold')) ###SPARK### parser.add_argument('-sp','--sp_master', type=str, dest='sp_master', help='spark.master' , default =config.get('spark', 'spark_master')) parser.add_argument('-em','--exe_memory', type=str, dest='exe_memory', help='spark.executor.memory' , default =config.get('spark', 'spark_executor_memory')) parser.add_argument('-cm','--core_max', type=str, dest='core_max', help='spark.cores.max' , default =config.get('spark', 'spark_cores_max')) #### database for output parser.add_argument('-ip','--ip_address', type=str, dest='ip_address', help='mongodb ip address' , default =config.get('mongo', 'out_ip_address')) parser.add_argument('-p','--port', type=str, dest='port', help='mongodb port' , default =eval(config.get('mongo', 'out_port'))) parser.add_argument('-dn','--db_name', type=str, dest='db_name', help='mongodb db name' , default =config.get('mongo', 'out_db')) parser.add_argument('-t','--tb_name', type=str, dest='tb_name', help='mongodb table name' , default =config.get('mongo', 'out_tb')) # auth parser.add_argument('-un','--username', type=str, dest='username', help='mongodb username' , default =config.get('mongo', 'out_username')) parser.add_argument('-pw','--password', type=str, dest='password', help='mongodb password' , default =config.get('mongo', 'out_password')) args = parser.parse_args() if input_gz = else: input_gz = '000d9941eaf04efb55e5d0ccff3d90ee.gz' if args.out: out_dir = args.out else: out_dir = '.' 
if args.row_id: row_id_str = args.row_id else: row_id_str = '553' if args.ds_id: ds_id = args.ds_id else: ds_id = '' if args.cid: cid_str = args.cid else: cid_str = '01' ################################################### if args.num: num_gram = eval(args.num) else: num_gram = eval(config.get("machine_learning","svm_num_gram")) if args.para: param_in_gram_1 = eval(args.para) else: param_in_gram_1 = eval(config.get("IN","param_in_gram_1_in")) if args.max: MAX_FEATURES = eval(args.max) else: MAX_FEATURES = eval(config.get("IN","MAX_FEATURES_IN")) if args.fromweb: fromweb = args.fromweb else: fromweb = None if args.parameter: j_str = args.parameter else: j_str='{"c":"1","iterations":"300","regularization":"l2","learning_algorithm":"logistic_regression_with_sgd"}' if args.lib: # mllib or scikit mode = args.lib else: mode='scikit' if args.showlabelname: # mllib or scikit labelnameflag = eval(args.showlabelname) else: labelnameflag = 0 if args.verbose: verbose = args.verbose else: verbose = "1" ######database######################################## if len(args.username)>0: username = args.username else: username = None if len(args.password)>0: password = args.password else: password = None t0 = time() coef_arr=None ml_opts = json.loads(j_str) # ML parameters ===================== input ml_opts ============== learning_algorithm=None try: if ml_opts is None: ml_opts = json.loads(j_str) learning_algorithm = ml_opts['learning_algorithm'] except Exception as e: print "WARNING: load learning_algorithm failed.",e # read raw data from .gz file ===================== input .gz ============== file_content=None try: f =, 'rb') file_content = f.close() except Exception as e: print "ERROR: load data file ["+input_gz+"] failed.",e return -5 # get data here; assume libsvm format label_feature_array=None feature_array=None label=None # optional sample_info=None if not file_content is None: label_features = file_content.strip() if not label_features is None: label_feature_array = 
label_features.split(' ') label = label_feature_array[0] # check if 1st item is integer int_1st="y" try: int(label_feature_array[0]) except ValueError: int_1st="n" # check if 2nd item is integer int_2nd="y" try: int(label_feature_array[0]) except ValueError: int_2nd="n" if int_1st=="n" and int_2nd=="y": feature_array = label_feature_array[2:len(label_feature_array)] elif int_1st=="y" and int_2nd=="n": feature_array = label_feature_array[1:len(label_feature_array)] #feature_array = label_feature_array[1:len(label_feature_array)] # if sample_info exists, 2nd item will be digit and 3rd item won't be digit if not label.isdigit() and label_feature_array[1].isdigit() and not label_feature_array[2].isdigit(): sample_info=label label=label_feature_array[1] feature_array=label_feature_array[2:len(label_feature_array)] else: print "ERROR: data format error!" else: print "ERROR: no data found!" #print label ### -1 means no label #print feature_array curr_dic = {} #print "feature_array=",feature_array for features in feature_array: if len(features)>0: key, value = features.split(':') curr_dic[key] = float(value) else: print "WARNING: data format error!" print "INFO: curr_dic len=",len(curr_dic) if curr_dic and verbose=="1": #print "INFO: *** Feature list: ====================================" # clean up feature file out_file=os.path.join(out_dir,cid_str+"_feature_list.json") print "INFO: feature file=",out_file if os.path.exists(out_file): try: os.remove(out_file) except OSError, e: print ("ERROR: %s - %s." 
% (e.strerror, out_file)) out_f=open(out_file, 'a') # get coef_arr ================================== if coef_arr is None and not learning_algorithm in ('kmeans'): key = "coef_arr" #{"123":"openFile"} jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}' jstr_proj='{"value":1}' # each model has its own coef_arr doc=query_mongo.find_one(args.ip_address, args.port, args.db_name, args.tb_name, username, password, jstr_filter, jstr_proj) coef_arr = doc['value'] fout_arr=[] len_coef=len(coef_arr) for k,v in curr_dic.items(): feat_out={} feat_out["ngram"]="" if int(k) < len_coef: feat_out["fid"]=k feat_out["coef"]=coef_arr[int(k)-1] feat_out["desc"]="" else: feat_out["fid"]="None" feat_out["coef"]=0 feat_out["desc"]=str(k) fout_arr.append(feat_out) if len(fout_arr) > 0: out_f.write(json.dumps(fout_arr)) out_f.close()
print "RESULT: predict output=", sing_label_pred ### generate label names (family names) ##### ### connect to database to get the column list which contains all column number of the corresponding feature#### pred_label=None if labelnameflag == 1: key = "dic_name_label" jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}' jstr_proj='{"value":1}' # get parent dataset's data if ds_id != row_id_str: jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}' doc=query_mongo.find_one(args.ip_address, args.port, args.db_name, args.tb_name, username, password, jstr_filter, jstr_proj) dic_list = doc['value'] label_dic = {} for i in range(0, len(dic_list)): for key in dic_list[i]: label_dic[dic_list[i][key]] = key.encode('UTF8') print "INFO: label_dic:", label_dic try: pred_label = label_dic[int(sing_label_pred)] except Exception as e: print "WARNING: Can't get label",e pred_label=str(sing_label_pred) else: pred_label = str(sing_label_pred) print "RESULT: prediction=", pred_label
# mrun(): train and test a Spark MLlib classifier `run_number` times on random
# splits and export the distribution of the resulting accuracies.
#
# Visible flow:
#   1. Create local_out_dir if needed, build the worker zip file and obtain a
#      Spark context from ml_util.ml_get_spark_context.
#   2. Read "has_excluded_feat" from ml_opts_jstr; when it is "1" and no
#      excluded_feat_cslist was passed, load the "feature_excluded" list from
#      MongoDB and join it into a comma-separated string.
#   3. Load the feature count and the sample RDD from
#      <hdfs_feat_dir>/libsvm_data and collect the distinct labels.
#   4. For each round: randomSplit by training_fraction, compute
#      regParam = C / training_sample_count, train the model selected by
#      ml_opts['learning_algorithm'] (linear_svm_with_sgd,
#      logistic_regression_with_lbfgs or logistic_regression_with_sgd) and
#      store accuracy = 1 - test error. An unknown algorithm prints an error
#      and returns None.
#   5. Plot a histogram of the accuracies (bin_number bins) with a normal
#      curve and save it as <row_id_str>_var_<run_number>.png in
#      local_out_dir; assemble bar/line series in all_json and delete any
#      existing <row_id_str>_mrun.json.
#
# NOTE(review): the text of this function is damaged and not valid Python:
#   - `labels_list_all = p: ...`, `p:p[0])` and `labelsAndPreds = p: (...)`
#     have lost the start of the expression (presumably an RDD map with a
#     lambda -- confirm against the original).
#   - The MongoDB lookup uses args.*, username and password, none of which is
#     a parameter or local here; mongo_tuples is never read in the visible
#     text.
#   - ml_opts is assigned only when ml_opts_jstr is not None but is indexed
#     unconditionally further down.
#   - `variance` holds np.std(...), i.e. a standard deviation.
#   - all_json is built but no write to mrun_jfile is visible; the function
#     may be truncated here.
#   - The last handler formats e.mrun_jfile, which is not a standard OSError
#     attribute.
def mrun(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr, excluded_feat_cslist , sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max , zipout_dir, zipcode_dir, zip_file_name , mongo_tuples, fromweb , training_fraction, jobname, run_number, bin_number ): ### generate data folder and out folder, clean up if needed if not os.path.exists(local_out_dir): os.makedirs(local_out_dir) # zip func in other files for Spark workers ================= ================ zip_file_path = ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util') print "INFO: zip_file_path=",zip_file_path # get_spark_context sc=ml_util.ml_get_spark_context(sp_master , spark_rdd_compress , spark_driver_maxResultSize , sp_exe_memory , sp_core_max , jobname , [zip_file_path]) t0 = time() # check if ml_opts.has_excluded_feat ==1 =================================== has_excluded_feat=0 if not ml_opts_jstr is None: ml_opts=json.loads(ml_opts_jstr) if "has_excluded_feat" in ml_opts: has_excluded_feat=ml_opts["has_excluded_feat"] #print "has_excluded_feat=",has_excluded_feat,",excluded_feat_cslist=",excluded_feat_cslist # get excluded feature list from mongo ========== === if str(has_excluded_feat) == "1" and excluded_feat_cslist is None: key = "feature_excluded" jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}' jstr_proj='{"value":1}' # get from own id (not from parent dataset id) #print "jstr_filter=",jstr_filter,",jstr_proj=",jstr_proj doc=query_mongo.find_one(args.ip_address, args.port, args.db_name, args.tb_name, username, password, jstr_filter, jstr_proj) #print "feature_excluded=",doc if not doc is None and 'value' in doc: excluded_feat_cslist = ','.join(str(i) for i in doc['value']) print "INFO: excluded_feat_cslist=",excluded_feat_cslist ### generate Labeled point #libsvm_data_file = data_folder + "libsvm_data" # filename for featured data libsvm_data_file = os.path.join(hdfs_feat_dir , "libsvm_data") print "INFO: 
libsvm_data_file=", libsvm_data_file # load feature count file feat_count_file=libsvm_data_file+"_feat_count" feature_count=zip_feature_util.get_feature_count(sc,feat_count_file) print "INFO: feature_count=",feature_count #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file) # load sample RDD from text file # also exclude selected features in sample ================ ===== # format (LabeledPoint,hash) from str2LabeledPoint_hash() samples_rdd, feature_count=zip_feature_util.get_sample_rdd(sc, libsvm_data_file, feature_count, excluded_feat_cslist) #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file) # get distinct label list labels_list_all = p: p[0].label).distinct().collect() #labels_list_all = p: p.label).collect() t1 = time() print "INFO: labels_list_all=",labels_list_all #print "INFO: training and testing samples generated!" print 'INFO: data generating time: %f' %(t1-t0) t0 = t1 ### generate label names (family names) ##### ### connect to database to get the column list which contains all column number of the corresponding feature#### label_set = set(labels_list_all) class_num = len(label_set) #class_num = len(labels_list) if class_num > 2: print "INFO:Number of classes=", class_num ############################################### ###########build learning model################ ############################################### ### get the parameters### print "INFO: ============Learning Algorithm and Parameters=============" #param_dict = json.loads(ml_opts_jstr) flag_model = ml_opts['learning_algorithm'] # 1: linear_svm_with_sgd; 2: logistic_regression_with_lbfgs; 3: logistic_regression_with_sgd C = eval(ml_opts['c']) iteration_num = ml_opts['iterations'] regularization = ml_opts['regularization'] print "INFO: Learning Algorithm: ", flag_model print "INFO: C = ", C print "INFO: iteration = ", iteration_num print "INFO: regType = ", regularization t0 = time() accuracy_array = np.zeros(run_number) for rnd in range (0, run_number): ### generate training 
and testing data training_rdd, testing_rdd = samples_rdd.randomSplit([training_fraction, 1-training_fraction]) p:p[0])# keep LabeledPoint only training_rdd.cache() testing_rdd.cache() training_sample_count = training_rdd.count() regP = C/float(training_sample_count) print "INFO: Calculated: regParam = ", regP ### build model ### if flag_model == "linear_svm_with_sgd": ### 1: linearSVM print "INFO: ====================1: Linear SVM=============" model_classification = SVMWithSGD.train(training_rdd, regParam=regP, iterations=iteration_num, regType=regularization) # regParam = 1/(sample_number*C) #print model_classification elif flag_model == "logistic_regression_with_lbfgs": ### 2: LogisticRegressionWithLBFGS print "INFO: ====================2: LogisticRegressionWithLBFGS=============" model_classification = LogisticRegressionWithLBFGS.train(training_rdd, regParam=regP, iterations=iteration_num, regType=regularization, numClasses=class_num) # regParam = 1/(sample_number*C) elif flag_model == "logistic_regression_with_sgd": ### 3: LogisticRegressionWithLBFGS print "INFO: ====================3: LogisticRegressionWithSGD=============" model_classification = LogisticRegressionWithSGD.train(training_rdd, regParam=regP, iterations=iteration_num, regType=regularization) # regParam = 1/(sample_number*C) else: print "ERROR: Training model selection error: no valid ML model selected!" 
return ### Evaluating the model on testing data labelsAndPreds = p: (p[0].label, model_classification.predict(p[0].features))) labelsAndPreds.cache() testing_sample_number = testing_rdd.count() testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testing_sample_number) accuracy = 1 - testErr accuracy_array[rnd] = accuracy print "INFO: current round=", rnd print "INFO: Accuracy=", accuracy ########################below: same as ############################################### #######plot distribution and variance########## ############################################### plt.figure(1) num_bins = bin_number ####10 is default n, bins, patches = plt.hist(accuracy_array, num_bins, normed=1, facecolor='green', alpha=0.5) ave = np.mean(accuracy_array) print "INFO: Accuracy mean=", ave variance = np.std(accuracy_array) print "INFO: Accuracy variance=", variance #print "INFO: bins: ", bins # add a 'best fit' line y = mlab.normpdf(bins, ave, variance) #print "INFO: y: ", y plt.plot(bins, y, 'r--') plt.title('Accuracy distribution of '+str(run_number)+' runs:') plt.xlabel('Accuracy Values') plt.ylabel('Probability / Accuracy bar width') #plt.savefig(local_out_dir+file_name_given+"_var_"+str(run_number)+".png") plt.savefig(os.path.join(local_out_dir, row_id_str+"_var_"+str(run_number)+".png")) # create ROC data for graph ==================== all_json=[] barp_arr=[] #n disp_arr=[] #y last_idx=0 for idx,ht in enumerate(n): # n is bar height #print "INFO: mrun bar=",idx, bins[idx], bins[idx+1],((bins[idx]+bins[idx+1])/2) barp_arr.append([ ((bins[idx]+bins[idx+1])/2.0),n[idx]]) # mid point for x axis if not math.isnan(y[idx]): disp_arr.append([bins[idx],y[idx]]) last_idx=idx #print "INFO: ",last_idx+1, bins[last_idx+1], y[last_idx+1] if not math.isnan(y[last_idx+1]): disp_arr.append([bins[last_idx+1],y[last_idx+1]]) #print "barp_arr=", barp_arr #print "disp_arr=", disp_arr #bar bar_json={} bar_json["values"]=barp_arr bar_json["key"]='Mutil-Run Accuracy' # 
bar_json["type"]="bar" # light blue bar_json["yAxis"]=1 all_json.append(bar_json) #distribution if len(disp_arr)>0: dis_json={} dis_json["values"]=disp_arr dis_json["key"]='Normal Distribution' # dis_json["type"]="line" # light blue dis_json["yAxis"]=1 all_json.append(dis_json) mrun_jfile = os.path.join(local_out_dir, row_id_str+"_mrun.json") #print "INFO: all_json=",all_json print "INFO: mrun_jfile=",mrun_jfile if os.path.exists(mrun_jfile): try: os.remove(mrun_jfile) except OSError, e: print ("Error: %s - %s." % (e.mrun_jfile,e.strerror))
def result2(request, rid, oid, perm, disabled4reader):
    """Render the training-result page (or the train-option page) for a dataset.

    Args:
        request: incoming request, forwarded to ``render``.
        rid: dataset row id.
        oid: train-option dataset id; when ``oid > 0`` it replaces ``rid``
            and the ``atdml/result_opts.html`` template is used.
        perm: permission value forwarded to ``_list.get_ds_doc`` and to the
            template.
        disabled4reader: forwarded to the template unchanged.

    Returns:
        A redirect to ``atdml.views.list`` when the dataset is not
        accessible, otherwise the rendered result page.
    """
    print("in result2, rid= %s , oid= %s" % (rid, oid))

    # get train option doc, if oid provided
    # NOTE(review): `oid > 0` is only meaningful for a numeric oid -- confirm
    # how the URL dispatcher passes it.
    if oid > 0:
        rid = oid
    document = _list.get_ds_doc(rid, perm)
    if not document:
        return HttpResponseRedirect(reverse('atdml.views.list'))

    # for return only
    predictions = []

    # get train option id
    train_id = document.train_id
    ml_lib = document.ml_lib
    status = document.status

    # get sample file list
    # NOTE(review): the second argument was lost in the source
    # (`document.filename,, document.file_type`); `document.id` is a
    # reconstruction -- confirm against _predict.get_sfile_list's signature.
    sflist = _predict.get_sfile_list(document.filename, document.id,
                                     document.file_type, train_id)

    # get cross validation info
    cv_grid_data, param_str, jopts = get_cv_grid(document, rid)
    print("************** ml_has_cv= %s %s"
          % (document.ml_has_cv, cv_grid_data))
    if jopts:
        print("rid= %s , jopts= %s" % (rid, jopts))
    else:
        print("rid= %s , jopts not found" % (rid,))

    has_roc = has_result_file(rid, str(rid) + "_roc.json")
    has_mrun = has_result_file(rid, str(rid) + "_mrun.json")
    has_score = has_result_file(rid, str(rid) + "_score_graph.json")
    print("has_roc= %s , has_mrun= %s , has_score= %s"
          % (has_roc, has_mrun, has_score))

    # check algorithm
    train_opt = {}
    if document.ml_opts:
        train_opt = json.loads(document.ml_opts)

    has_result = None
    if document.status_code >= 500:
        # "U" when clustering images exist for a kmeans run, "Y" otherwise.
        # The original tested `x in ('kmeans')`, a substring test on a plain
        # string, and indexed train_opt directly, which raises KeyError when
        # no options are stored.
        is_kmeans = train_opt.get("learning_algorithm") == 'kmeans'
        if has_result_file(rid, str(rid) + "_cluster*.png") and is_kmeans:
            has_result = "U"
        else:
            has_result = "Y"
    elif ml_lib == "dnn":
        # allow DNN to view status
        has_result = "Y"

    has_featc = has_result_file(rid, str(rid) + "_feat_coef.json")
    has_fp = has_result_file(rid, str(rid) + "_false_pred.json")

    # get ml_opts
    feature_excluded_list = None
    if train_opt.get("has_excluded_feat") == 1:
        # get data from mongo.dataset_info
        try:
            doc = query_mongo.find_one(
                settings.MONGO_OUT_DNS, settings.MONGO_OUT_PORT,
                settings.MONGO_OUT_DB, settings.MONGO_OUT_TBL,
                settings.MONGO_OUT_USR, settings.MONGO_OUT_PWD,
                # str(): rid may have been replaced by a numeric oid above
                '{"rid":' + str(rid) + ',"key":"feature_excluded"}',
                '{"value":1}')
            if doc is not None:
                feature_excluded_list = doc["value"]
                print("feature_excluded_list= %s" % (feature_excluded_list,))
        except Exception as e:
            # best effort: the page still renders without the excluded list
            print("Exception from MongoDB: %s" % (e,))

    rpage = 'atdml/result_opts.html' if oid > 0 else 'atdml/result.html'

    feat_str = ""
    if feature_excluded_list is not None:
        feat_str = ','.join(str(i) for i in feature_excluded_list)

    print("has_roc= %s , has_mrun= %s , has_result= %s rpage= %s"
          % (has_roc, has_mrun, has_result, rpage))

    # get perf and dataset info; both are stored as JSON text, "null" = empty
    if document.perf_measures and document.perf_measures != "null":
        perf_measures = json.loads(document.perf_measures)
    else:
        perf_measures = {}
    if document.dataset_info and document.dataset_info != "null":
        dataset_info = json.loads(document.dataset_info)
    else:
        dataset_info = {}

    def _fmt5(key):
        # five-decimal rendering of a measure, or "" when it is absent
        return '%0.5f' % perf_measures[key] if key in perf_measures else ""

    return render(
        request,
        rpage,
        {
            "document": document,
            "predictions": predictions,
            "sflist": sflist,
            "disabled4reader": disabled4reader,
            "perm": perm,
            "cv_grid_data": cv_grid_data,
            "param_str": param_str,
            "has_fp": has_fp,
            "jopts": jopts,
            "has_roc": has_roc,
            "has_mrun": has_mrun,
            "has_result": has_result,
            "has_featc": has_featc,
            "has_score": has_score,
            "feature_excluded": feat_str,
            "ml_lib": ml_lib,
            "status": status,
            "tp": perf_measures["tp"] if "tp" in perf_measures else "",
            "tn": perf_measures["tn"] if "tn" in perf_measures else "",
            "fp": perf_measures["fp"] if "fp" in perf_measures else "",
            "fn": perf_measures["fn"] if "fn" in perf_measures else "",
            "phi": _fmt5("phi"),
            "fscore": _fmt5("fscore"),
            "roc_auc": _fmt5("roc_auc"),
            "class_count": dataset_info["class_count"] if "class_count" in dataset_info else "",
            "training_fraction": dataset_info["training_fraction"] if "training_fraction" in dataset_info else "",
            "dataset_count": dataset_info["dataset_count"] if "dataset_count" in dataset_info else "",
            "MEDIA_URL": settings.MEDIA_URL,
        },
    )
# main(): command-line entry point that trains a scikit-learn linear SVM on
# libsvm features loaded through Spark and writes evaluation figures.
#
# Visible flow:
#   1. Parse CLI options (feature folder, output folder, row id, model folder,
#      test hash list file, Spark settings, MongoDB connection/auth); most
#      defaults are read from `config`.
#   2. Create out_dir, recreate model_data_folder, and remove list_file_test
#      if it exists (otherwise create it empty).
#   3. Load <feat_dir>/libsvm_data with MLUtils.loadLibSVMFile, collect all
#      samples to the driver, build a CSR matrix and split 60/40
#      (test_size=0.4).
#   4. For flag_model == "linear_svm_with_sgd": fit svm.LinearSVC, dump it to
#      <model_data_folder><row_id>.pkl, predict the test split and print the
#      accuracy.
#   5. Load "dic_name_label" from MongoDB to name labels, count test samples
#      per label, and save prediction / true-label images as
#      <out_dir><name>_1.png and _2.png.
#   6. For exactly two labels, one of which must be 'clean': compute ROC/AUC
#      from clf.decision_function, save <name>_ROC.png and
#      <name>_ROC_value.txt (fpr, tpr, threshold, accuracy per row) and print
#      FPR/TPR at the first threshold below `thr`.
#
# NOTE(review): the text of this function is damaged and not valid Python:
#   - `if file_name_given = else:` has lost its condition and assignment
#     (presumably the --name argument -- confirm against the original).
#   - `labels_and_features_rdd = p: (...)`, `clf = svm.LinearSVC(),
#     labels_train)` and both `ax.imshow(..., interpolation='nearest',` calls
#     have lost part of the expression.
#   - flag_model is read but never assigned in the visible text, and j_str is
#     never parsed.
#   - clf is only created inside the linear_svm_with_sgd branch (as far as the
#     collapsed layout allows telling) but is used unconditionally afterwards.
#   - If roc_curve raises ValueError, roc_auc/fpr/tpr are still used below.
#   - The ROC_file handler formats e.ROC_file, which is not a standard OSError
#     attribute.
#   - wide_len comes from math.ceil and is passed to np.reshape without an int
#     conversion.
def main(): parser = ArgumentParser(description=__description__) parser.add_argument("-f", "--folder", type=str, metavar="folder of features", help="hdfs folder contains features", required=False) parser.add_argument("-n", "--name", type=str, metavar="file name", help="file name for sample folder", required=False) parser.add_argument("-o", "--out", type=str, metavar="out figure folder", help="folder contains output", required=False) parser.add_argument("-r", "--row_id", type=str, metavar="row id", help="row_id number in the db", required=False) parser.add_argument("-mf", "--modelfolder", type=str, metavar="model folder", help="model for prediction", required=False) parser.add_argument( "-l", "--listfile", type=str, metavar="list file", help="list of testing data hashes for single file prediction", required=False) parser.add_argument("-u", "--uploadtype", type=str, metavar="upload type", help="data type", required=False) parser.add_argument("-w", "--fromweb", type=str, metavar="flag for web", help="flag for web", required=False) parser.add_argument( "-pm", "--parameter", type=str, metavar="parameters in json", help="json string contains learning alg and parameter selection", required=False) parser.add_argument('-sp', '--sp_master', type=str, dest='sp_master', help='spark.master', default=config.get('spark', 'spark_master')) parser.add_argument('-em', '--exe_memory', type=str, dest='exe_memory', help='spark.executor.memory', default=config.get('spark', 'spark_executor_memory')) parser.add_argument('-cm', '--core_max', type=str, dest='core_max', help='spark.cores.max', default=config.get('spark', 'spark_cores_max')) #### database parser.add_argument('-ip', '--ip_address', type=str, dest='ip_address', help='mongodb ip address', default=config.get('mongo', 'ip_address')) parser.add_argument('-p', '--port', type=str, dest='port', help='mongodb port', default=eval(config.get('mongo', 'port'))) parser.add_argument('-dn', '--db_name', type=str, dest='db_name', help='mongodb 
db name', default=config.get('mongo', 'out_db')) parser.add_argument('-t', '--tb_name', type=str, dest='tb_name', help='mongodb table name', default=config.get('mongo', 'out_feat_tb')) # auth parser.add_argument('-un', '--username', type=str, dest='username', help='mongodb username', default=config.get('mongo', 'username')) parser.add_argument('-pw', '--password', type=str, dest='password', help='mongodb password', default=config.get('mongo', 'password')) args = parser.parse_args() if args.folder: feat_dir = args.folder else: feat_dir = config.get( 'app', 'HADOOP_MASTER' ) + '/user/hadoop/yigai/sality_virut_zbot_backdoor_hash_000' if file_name_given = else: file_name_given = 'aaaa' if args.out: out_dir = args.out else: out_dir = 'out_result' if args.row_id: row_id_str = args.row_id else: row_id_str = '88' if args.modelfolder: model_data_folder = args.modelfolder else: model_data_folder = out_dir + '/' + row_id_str + '_model/' if args.listfile: list_file_test = args.listfile else: list_file_test = out_dir + '/' + row_id_str + '_testhashlist.txt' if args.uploadtype: uploadtype = args.uploadtype else: uploadtype = None if args.fromweb: fromweb = args.fromweb else: fromweb = None if args.parameter: j_str = args.parameter else: j_str = '{"learning_algorithm":"linear_svm_with_sgd", "c":"1", "iteration":"300", "regularization":"l2"}' if len(args.username) > 0: username = args.username else: username = None if len(args.password) > 0: password = args.password else: password = None data_folder = feat_dir + "/" out_dir = out_dir + "/" if not os.path.exists(out_dir): os.makedirs(out_dir) if os.path.exists(model_data_folder): shutil.rmtree(model_data_folder) if not os.path.exists(model_data_folder): os.makedirs(model_data_folder) if os.path.isfile(list_file_test): try: os.remove(list_file_test) except OSError: pass else: with open(list_file_test, "w") as myfile: pass SparkContext.setSystemProperty('spark.rdd.compress', config.get('spark', 'spark_rdd_compress')) 
SparkContext.setSystemProperty( 'spark.driver.maxResultSize', config.get('spark', 'spark_driver_maxResultSize')) SparkContext.setSystemProperty('spark.executor.memory', args.exe_memory) SparkContext.setSystemProperty('spark.cores.max', args.core_max) sc = SparkContext(args.sp_master, 'sk-learn-train:' + str(args.row_id)) t0 = time() ### load libsvm file ### libsvm_data_file = data_folder + "libsvm_data" print "libsvm_data_file:", libsvm_data_file samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file) #print samples_rdd.count() labels_and_features_rdd = p: (p.label, p.features)) all_data = labels_and_features_rdd.collect() features_list = [x.toArray() for _, x in all_data] labels_list = [x for x, _ in all_data] labels_list = array(labels_list) features_array = np.array(features_list) ### generate sparse matrix (csr) for all samples features_sparse_mtx = csr_matrix(features_array) ### randomly split the samples into training and testing data X_train_sparse, X_test_sparse, labels_train, labels_test = cross_validation.train_test_split( features_sparse_mtx, labels_list, test_size=0.4) t1 = time() print 'data generating time: %f' % (t1 - t0) ############################################### ###########build learning model################ ############################################### if flag_model == "linear_svm_with_sgd": ### 1: linearSVM print "====================1: Linear SVM=============" clf = svm.LinearSVC(), labels_train) #### save clf for future use #### joblib.dump(clf, model_data_folder + row_id_str + '.pkl') #print "**model:coef***" #print clf.coef_ #print "**model:intercept***" #print clf.intercept_ ### Evaluating the model on testing data labels_pred = clf.predict(X_test_sparse) #print "************results*********" #print "Predicting results:" #print labels_pred #print "True testing labels:" #print labels_test accuracy = clf.score(X_test_sparse, labels_test) print "Accuracy = ", accuracy ################################################### ### generate 
label names (family names) ########### ### connect to database to get the column list which contains all column number of the corresponding feature#### ################################################### key = "dic_name_label" jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}' jstr_proj = '{"value":1}' doc = query_mongo.find_one(args.ip_address, args.port, args.db_name, args.tb_name, username, password, jstr_filter, jstr_proj) dic_list = doc['value'] label_dic = {} for i in range(0, len(dic_list)): for key in dic_list[i]: label_dic[dic_list[i][key]] = key.encode('UTF8') print "label_dic:", label_dic labels_list = [] for key in sorted(label_dic): labels_list.append(label_dic[key]) ### generate sample numbers of each family in testing data### testing_sample_number = len(labels_test) print "testing_sample_number:", testing_sample_number test_cnt_dic = {} for key in label_dic: test_cnt_dic[key] = 0 for i in range(0, testing_sample_number): for key in label_dic: if labels_test[i] == key: test_cnt_dic[key] = test_cnt_dic[key] + 1 print "Number of samples in each label is:", test_cnt_dic ############################################### ###########plot prediction result figure####### ############################################### ### reorder labels so that labels are ordered according to the true label of the data len_pred = len(labels_pred) wide_len = math.ceil(math.sqrt(len_pred)) pred_list = labels_pred.tolist() test_list = labels_test.tolist() labels_true_pred = zip(test_list, pred_list) labels_true_pred.sort(key=lambda x: x[0]) test_ordered = [x for x, _ in labels_true_pred] pred_ordered = [x for _, x in labels_true_pred] last_value = test_ordered[len_pred - 1] for i in range(len_pred, int(wide_len * wide_len)): test_ordered.append(last_value) pred_ordered.append(last_value) mtx_testing = np.reshape(test_ordered, (wide_len, wide_len)) mtx_pred = np.reshape(pred_ordered, (wide_len, wide_len)) ### plot figues ### fig, ax = plt.subplots() cax = 
ax.imshow(mtx_pred, interpolation='nearest', num_labels = len(labels_list) tic = range(0, num_labels) labels_str = [] # append sample count at the end for key in sorted(test_cnt_dic): labels_str.append(labels_list[key] + "(" + str(test_cnt_dic[key]) + ")") cbar = fig.colorbar(cax, ticks=tic) # vertically oriented colorbar plt.xlabel('Prediction (Single Run)') plt.savefig(out_dir + file_name_given + "_1" + ".png") fig, ax = plt.subplots() cax = ax.imshow(mtx_testing, interpolation='nearest', #ax.set_title('Gaussian noise with vertical colorbar') # Add colorbar, make sure to specify tick locations to match desired ticklabels cbar = fig.colorbar(cax, ticks=tic) # vertically oriented colorbar plt.xlabel('True Labels (Single Run)') plt.savefig(out_dir + file_name_given + "_2" + ".png") ############################################################# ###################for 2 class only (plot ROC curve)######### ############################################################# if len(labels_list) == 2: reverse_label_dic = dict((v, k) for k, v in label_dic.items()) if 'clean' in reverse_label_dic: flag_clean = reverse_label_dic['clean'] else: print "No ROC curve generated: 'clean' must be a label for indicating negative class!" return confidence_score = clf.decision_function(X_test_sparse) if flag_clean == 0: scores = [x for x in confidence_score] s_labels = [x for x in labels_test] testing_N = test_cnt_dic[0] testing_P = test_cnt_dic[1] else: scores = [-x for x in confidence_score] s_labels = [1 - x for x in labels_test] testing_N = test_cnt_dic[1] testing_P = test_cnt_dic[0] ###########plot ROC figure####### try: fpr, tpr, thresholds = roc_curve(s_labels, scores, pos_label=1) roc_auc = auc(fpr, tpr) except ValueError as e: print "Error!! 
in ROC curve: ", print e print "ROC_AUC = ", roc_auc plt.figure() plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) plt.plot([0, 1], [0, 1], 'k--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC') plt.legend(loc="lower right") plt.savefig(out_dir + file_name_given + "_ROC" + ".png") print "Figure save!" #### generate fpr tpr ACC threshold results file### ROC_file = out_dir + file_name_given + "_ROC_value.txt" if os.path.exists(ROC_file): try: os.remove(ROC_file) except OSError, e: print("Error: %s - %s." % (e.ROC_file, e.strerror)) for i in range(0, len(fpr)): ACC = (testing_P * tpr[i] + testing_N * (1 - fpr[i])) / (testing_P + testing_N) with open(ROC_file, 'a') as f: f.write('%0.5f ' % (fpr[i])) f.write('%0.5f ' % (tpr[i])) f.write('%0.5f ' % (thresholds[i])) f.write('%0.5f\n' % (ACC)) ### print FPR, TPR, ACC ### if flag_model == "linear_svm_with_sgd": thr = 0 elif flag_model == "logistic_regression_with_lbfgs" or flag_model == "logistic_regression_with_sgd": thr = -0.5 for i in range(0, len(thresholds)): if thresholds[i] < thr: print "===Results Summary===" print "Accuracy: ", accuracy #print "Accuracy (calculate): ", (testing_P*tpr[i-1] + testing_N*(1-fpr[i-1]))/(testing_P + testing_N) print "False Positive Rate (FPR): ", fpr[i - 1] print "True Positive Rate (TPR): ", tpr[i - 1] print "=====================" break
def predict(row_id_str, ds_id, cid_str, input_gz, local_out_dir, num_gram
    , j_str, lib_mode
    , fromweb, verbose,label_idx=0, data_idx=3, metadata_count=3, pattern_str='(.*)', ln_delimitor = '\t', binary_flag=True, labelnameflag=1
    , model_filename=None, str_model_json=None, sample_txt=None
    ,pca_filename=None, pca_param=None
    , sp_master=config.get('spark', 'spark_master'), exe_memory=config.get('spark', 'spark_executor_memory')
    , core_max=config.get('spark', 'spark_cores_max')
    , MAX_FEATURES=eval(config.get("machine_learning","MAX_FEATURES"))
    , dic_name_label=None
    , feat_cnt_threshold=config.get('machine_learning', 'feature_count_threshold')
    , ip_address=config.get('mongo', 'out_ip_address'), port=eval(config.get('mongo', 'out_port'))
    , db_name=config.get('mongo', 'out_db'), tb_name=config.get('mongo', 'out_tb')
    , username=config.get('mongo', 'out_username'), password=config.get('mongo', 'out_password')
    , sc=None
    ):
    """Prepare a single-sample prediction from an n-gram pattern input file.

    Steps visible in this function:
      1. Optionally load model data (coefficients, hash dictionaries, PCA
         parameters, label names, ML options, n-gram size) from a JSON string
         (str_model_json) or from a JSON file (model_filename).
      2. Resolve learning_algorithm from ml_opts / j_str.
      3. Read the input, convert it with convert_to_line(), then run
         preprocess_pattern() and feature_extraction_ngram().
      4. For 'kmeans', resolve the PCA parameters "threshold" and "k".
      5. Fetch lookup tables (dic_seq_hashes, dic_hash_str, coef_arr,
         feat_sample_count_arr) through query_mongo.find_one(), falling back
         to pickle files under local_out_dir.
      6. Map the predicted label to a label name and print it.

    Returns (error paths only; no return is visible on the success path):
      -1  the model JSON string failed to parse, or no features were extracted
      -3  the model file could not be loaded
      -4  the model file does not exist
      -5  the input data file could not be read

    NOTE(review): this excerpt had lost its line breaks and indentation; the
    nesting below was reconstructed from the statement order and must be
    confirmed against the original file.
    NOTE(review): the step that computes sing_label_pred is not present in the
    visible code (see the note near the end of the function).
    """
    print "in"
    t0 = time()
    coef_arr=None
    dic_hash_str=None
    dic_seq_hashes=None
    dic_hashes_seq=None
    feat_sample_count_arr=None
    hashes_cnt_dic=None
    hash_str_dic=None
    data_rows=None
    data_cols=None
    ml_opts=None

    # ---- load model from a JSON string (offline, linear model) ----
    # Strings of 10 characters or fewer are ignored.
    if not str_model_json is None and len(str_model_json)>10:
        try:
            model_json=json.loads(str_model_json)
        except Exception as e:
            print "ERROR: model json load error." , e
            return -1
        if "coef_arr" in model_json:
            coef_arr=model_json["coef_arr"]
            col_num = len(coef_arr)
        if "coef_intercept" in model_json:
            coef_intercept=model_json["coef_intercept"]
        if "dic_hash_str" in model_json:
            dic_hash_str=model_json["dic_hash_str"]
        if "dic_seq_hashes" in model_json:
            dic_seq_hashes=model_json["dic_seq_hashes"]
        if "pca_param" in model_json:
            pca_param=model_json["pca_param"]
        if "feat_sample_count_arr" in model_json:
            feat_sample_count_arr=model_json["feat_sample_count_arr"]
        # Caller-supplied label names / options win over the model's copies.
        if dic_name_label is None:
            dic_name_label=model_json["dic_name_label"]
        if j_str is None:
            j_str=model_json["ml_opts"]
            ml_opts=json.loads(j_str)
        # NOTE(review): eval() on a value taken from the model JSON; confirm
        # the model string always comes from a trusted source.
        num_gram=eval(model_json["ml_n_gram"])
        lib_mode="offline"
    # ---- load model from a JSON file ----
    elif not model_filename is None:
        if os.path.exists(model_filename):
            print "INFO: model from file=",model_filename
            try:
                with open(model_filename) as jf:
                    model_json=json.load(jf)
                    if "coef_arr" in model_json:
                        coef_arr=model_json["coef_arr"]
                        col_num = len(coef_arr)
                    if "coef_intercept" in model_json:
                        coef_intercept=model_json["coef_intercept"]
                    if "dic_hash_str" in model_json:
                        dic_hash_str=model_json["dic_hash_str"]
                    if "dic_seq_hashes" in model_json:
                        dic_seq_hashes=model_json["dic_seq_hashes"]
                    if "pca_param" in model_json:
                        pca_param=model_json["pca_param"]
                    if "feat_sample_count_arr" in model_json:
                        feat_sample_count_arr=model_json["feat_sample_count_arr"]
                    # Unlike the string branch, these always override the arguments,
                    # and ml_opts is used as stored (no json.loads here).
                    dic_name_label=model_json["dic_name_label"]
                    ml_opts=model_json["ml_opts"]
                    # NOTE(review): eval() on model-file content; confirm it is trusted.
                    num_gram=eval(model_json["ml_n_gram"])
                    lib_mode="offline"
                    # NOTE(review): col_num is only bound when "coef_arr" is in the
                    # model; otherwise this print raises NameError, which the
                    # except below turns into return code -3.
                    print "INFO: model feature count=",col_num
                    print "INFO: model for num_gram=",num_gram
            except Exception as e:
                print "ERROR: loading model file ["+model_filename+"] error! ",e
                return -3
        else:
            print "ERROR: model file ["+model_filename+"] not found! "
            return -4

    # ---- ML parameters: resolve learning_algorithm from ml_opts ----
    learning_algorithm=None
    try:
        if ml_opts is None:
            ml_opts = json.loads(j_str)
        if 'learning_algorithm' in ml_opts:
            learning_algorithm = ml_opts['learning_algorithm']
    except Exception as e:
        # Best effort: on failure learning_algorithm stays None.
        print "WARNING: load learning_algorithm failed.",e
    print "INFO: learning_algorithm=",learning_algorithm

    # ---- read raw data from the .gz input ----
    # TBD: a faster convert_to_line_by_bash()
    try:
        # NOTE(review): the call on the right-hand side is missing in this
        # source (apparent extraction damage); presumably
        # gzip.open(input_gz, 'rb') -- confirm against the original file.
        f =, 'rb')
        # If the file is raw (multi-line), convert_to_line() turns it into one line.
        sample_txt = convert_to_line(f, metadata_count)
        f.close()
    except Exception as e:
        print "ERROR: load data file ["+input_gz+"] failed.",e
        return -5

    # ---- feature extraction ----
    raw_arr=None
    # NOTE(review): this reset discards any coef_arr loaded from the model above.
    coef_arr=None
    feat_arr=None
    # input: one line of text in n-gram pattern format
    # return array: [meta-data1, meta-data2, ..., hash_cnt_dic, hash_str_dic]
    raw_arr=preprocess_pattern(sample_txt, metadata_count, pattern_str, ln_delimitor, label_idx, label_arr=None )
    # input: array [meta-data1, meta-data2, ..., hash_cnt_dic, hash_str_dic]
    # return hashes_cnt_dic: {hash: count, ...}  hash_str_dic: {hash: 'str1', ...}
    feat_arr=feature_extraction_ngram(raw_arr, data_idx, MAX_FEATURES, num_gram)
    if feat_arr is None or len(feat_arr)==0:
        print "ERROR: Raw data format error or no feature found at predict_single_file_pattern."
        return -1

    # ---- PCA parameters (only used for kmeans; TBD for all algorithms) ----
    threshold=None
    n_component=None
    if learning_algorithm =='kmeans' :
        if pca_param is None: # not supplied by the caller/model: get from mongo
            key = "pca_param"
            jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
            jstr_proj='{"value":1}'
            # ??? get parent dataset's data
            #if ds_id != row_id_str:
            #    jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'
            doc=query_mongo.find_one(ip_address, port, db_name, tb_name, username, password, jstr_filter, jstr_proj)
            if doc and "value" in doc:
                pca_param = doc['value']
                print "INFO: pca_param=", pca_param
        # PCA model and transform expect both "threshold" and "k" in pca_param
        if not pca_param is None:
            if "threshold" in pca_param:
                threshold=pca_param["threshold"]
            if "k" in pca_param:
                n_component=pca_param["k"]
        print "INFO: n_component=",n_component,", threshold=",threshold

    # ---- {seq: hash} mapping, mongo key "dic_seq_hashes" ----
    if dic_seq_hashes is None:
        key = "dic_seq_hashes"
        jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
        jstr_proj='{"value":1}'
        # a child row reads the parent dataset's data
        if ds_id != row_id_str:
            jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'
        doc=query_mongo.find_one(ip_address, port, db_name, tb_name, username, password, jstr_filter, jstr_proj)
        if not doc is None:
            dic_seq_hashes = doc['value']
        else: # fall back to the local pickle file
            fn=os.path.join(local_out_dir,ds_id+"_dic_seq_hashes.pkl")
            print "INFO: get dic_seq_hashes from local", fn
            dic_seq_hashes=ml_util.ml_pickle_load(fn)
        print "INFO: len(dic_seq_hashes)=", len(dic_seq_hashes)
    if dic_seq_hashes:
        dic_len=len(dic_seq_hashes)
    else:
        dic_len=0

    # ---- optional feature-list output (verbose mode) ----
    out_f=None # handle for the feature list file; not used for kmeans
    # NOTE(review): ('kmeans') is a plain string, not a tuple, so this is a
    # substring test and raises TypeError when learning_algorithm is None;
    # ('kmeans',) was probably intended.
    if verbose=="1" and learning_algorithm not in ('kmeans'):
        # {hash: raw string} mapping, e.g. {"123": "openFile"}
        if dic_hash_str is None:
            key = "dic_hash_str"
            jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
            jstr_proj='{"value":1}'
            # a child row reads the parent dataset's data
            if ds_id != row_id_str:
                jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'
            doc=query_mongo.find_one(ip_address, port, db_name, tb_name, username, password, jstr_filter, jstr_proj)
            if not doc is None:
                dic_hash_str = doc['value']
            else: # fall back to the local pickle file
                fn=os.path.join(local_out_dir,ds_id+"_dic_hash_str.pkl")
                print "INFO: get dic_hash_str from local", fn
                dic_hash_str=ml_util.ml_pickle_load(fn)
            print "INFO: len(dic_hash_str)=", len(dic_hash_str)
        # remove any stale feature file before appending to a fresh one
        out_file=os.path.join(local_out_dir,cid_str+"_feature_list.json")
        if os.path.exists(out_file):
            try:
                os.remove(out_file)
            except OSError, e:
                print ("ERROR: %s - %s." % (e.strerror, out_file))
        if not learning_algorithm in ('kmeans','lstm','cnn'):
            print "INFO: feature file=",out_file
            # NOTE(review): out_f is not closed anywhere in the visible code.
            out_f=open(out_file, 'a')

    # local {hash: string} dict produced by feature extraction
    hash_str_dic=feat_arr[data_idx+1]
    # convert keys to strings
    hash_str_dic={str(k): v for k, v in hash_str_dic.items()}

    # NOTE(review): resetting coef_arr here makes the "coef_arr is None" test
    # below always true, so model-supplied coefficients are never used.
    coef_arr=None
    # ---- coef_arr: each model (row_id_str) has its own ----
    if coef_arr is None and not learning_algorithm in ('kmeans','lstm','cnn'):
        key = "coef_arr"
        jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
        jstr_proj='{"value":1}'
        doc=query_mongo.find_one(ip_address, port, db_name, tb_name, username, password, jstr_filter, jstr_proj)
        if not doc is None and 'value' in doc:
            coef_arr = doc['value']
        else: # fall back to the local pickle file
            fn=os.path.join(local_out_dir,ds_id+"_coef_arr.pkl")
            print "INFO: get coef_arr from local fn=", fn
            coef_arr=ml_util.ml_pickle_load(fn)
        print "INFO: len(coef_arr)=", len(coef_arr)

    # ---- feat_sample_count_arr ----
    # NOTE(review): ('kmeans') is a string, not a tuple (see above).
    if feat_sample_count_arr is None and not learning_algorithm in ('kmeans'):
        key = "feat_sample_count_arr"
        jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
        jstr_proj='{"value":1}'
        # a child row reads the parent dataset's data
        if ds_id != row_id_str:
            jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'
        doc=query_mongo.find_one(ip_address, port, db_name, tb_name, username, password, jstr_filter, jstr_proj)
        if not doc is None:
            feat_sample_count_arr = doc['value']
        else: # fall back to the local pickle file
            fn=os.path.join(local_out_dir,ds_id+"_feat_sample_count_arr.pkl")
            print "INFO: get feat_sample_count_arr from local", fn
            feat_sample_count_arr=ml_util.ml_pickle_load(fn)
        print "INFO: len(feat_sample_count_arr)=", len(feat_sample_count_arr)

    # NOTE(review): nothing in the visible code assigns sing_label_pred; the
    # step that runs the model appears to be missing from this excerpt.

    # ---- generate label names (family names) ----
    pred_label=None
    label_dic = {}
    if labelnameflag == 1:
        if dic_name_label is None:
            key = "dic_name_label"
            jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
            jstr_proj='{"value":1}'
            # a child row reads the parent dataset's data
            if ds_id != row_id_str:
                jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'
            doc=query_mongo.find_one(ip_address, port, db_name, tb_name, username, password, jstr_filter, jstr_proj)
            # NOTE(review): doc is not checked for None here, and dic_list is
            # only bound on this path (not when dic_name_label was supplied).
            dic_list = doc['value']
        # invert the list of {name: label} dicts into {label: name}
        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                label_dic[dic_list[i][key]] = key.encode('UTF8')
        print "INFO: label_dic=", label_dic
        if not sing_label_pred is None:
            pred_label = label_dic[int(sing_label_pred)]
    # NOTE(review): ("kmeans") is a string, so this is a substring test.
    if learning_algorithm in ("kmeans"):
        pred_label = "cluster# "+str(sing_label_pred)
    print "RESULT: prediction=", pred_label
    status="predicted"
def _split_libsvm_fields(fields):
    """Split the tokens of one libsvm line into (sample_info, label, features).

    Two layouts are recognised:
      "<label> <idx:val> ..."                -> (None, label, features)
      "<sample_info> <label> <idx:val> ..."  -> (sample_info, label, features)
    For any other layout features is None so the caller can report a format
    error.
    """
    def _is_int(token):
        try:
            int(token)
        except ValueError:
            return False
        return True

    if not fields:
        return None, None, None
    first_is_int = _is_int(fields[0])
    # The second token must be tested on its own; the length guard avoids an
    # IndexError on single-token lines.
    second_is_int = len(fields) > 1 and _is_int(fields[1])
    if first_is_int and not second_is_int:
        return None, fields[0], fields[1:]
    if second_is_int and not first_is_int:
        return fields[0], fields[1], fields[2:]
    return None, fields[0], None


def predict(row_id_str, ds_id, cid_str, input_gz, local_out_dir, num_gram,
            j_str, lib_mode, fromweb, verbose, labelnameflag=1,
            model_filename=None, str_model_json=None, sample_txt=None,
            pca_filename=None, pca_param=None,
            sp_master=config.get('spark', 'spark_master'),
            exe_memory=config.get('spark', 'spark_executor_memory'),
            core_max=config.get('spark', 'spark_cores_max'),
            MAX_FEATURES=eval(config.get("machine_learning", "MAX_FEATURES")),
            dic_name_label=None,
            feat_cnt_threshold=config.get('machine_learning',
                                          'feature_count_threshold'),
            ip_address=config.get('mongo', 'out_ip_address'),
            port=eval(config.get('mongo', 'out_port')),
            db_name=config.get('mongo', 'out_db'),
            tb_name=config.get('mongo', 'out_tb'),
            username=config.get('mongo', 'out_username'),
            password=config.get('mongo', 'out_password'),
            sc=None):
    """Read one libsvm-format sample and, in verbose mode, dump its features.

    The gzip file input_gz is expected to hold a single line, either
    "<label> <idx:val> ..." or "<sample_info> <label> <idx:val> ...".
    When verbose == "1" and features were parsed, a JSON list with one
    {"ngram", "fid", "coef", "desc"} entry per feature is written to
    <local_out_dir>/<cid_str>_feature_list.json. Coefficients come from the
    "coef_arr" document of row_id_str in Mongo (not fetched for kmeans).

    Returns -5 when the input file cannot be read.

    NOTE(review): no prediction step or success return is visible in this
    excerpt, so none is added here.
    NOTE(review): the MAX_FEATURES and port defaults eval() config values;
    confirm the config file is trusted.
    """
    # Function-scope import: the opener call was missing in this source and is
    # reconstructed as gzip.open(), so the module is imported here to be safe.
    import gzip

    t0 = time()
    coef_arr = None
    # Kept outside the try block as in the original: an invalid j_str still
    # raises to the caller.
    ml_opts = json.loads(j_str)

    # ---- ML parameters ----
    learning_algorithm = None
    try:
        learning_algorithm = ml_opts['learning_algorithm']
    except Exception as e:
        # Best effort: carry on with learning_algorithm = None.
        print("WARNING: load learning_algorithm failed. %s" % (e,))

    # ---- read raw data from the .gz file ----
    # The original assigned the result of f.close() (always None) to
    # file_content, so no sample was ever parsed; read the content instead and
    # let the context manager close the handle.
    file_content = None
    try:
        with gzip.open(input_gz, 'rb') as f:
            file_content = f.read()
    except Exception as e:
        print("ERROR: load data file [%s] failed. %s" % (input_gz, e))
        return -5

    # ---- parse the libsvm line ----
    label_feature_array = None
    feature_array = None
    label = None  # "-1" means no label
    sample_info = None  # optional leading token
    label_features = file_content.strip() if file_content is not None else ""
    if label_features:
        label_feature_array = label_features.split(' ')
        sample_info, label, feature_array = _split_libsvm_fields(
            label_feature_array)
        if feature_array is None:
            print("ERROR: data format error!")
    else:
        print("ERROR: no data found!")

    # {feature id (str): value}
    curr_dic = {}
    for features in feature_array or []:
        if len(features) > 0:
            key, value = features.split(':')
            curr_dic[key] = float(value)
        else:
            # empty token, e.g. produced by consecutive spaces
            print("WARNING: data format error!")
    print("INFO: curr_dic len= %s" % (len(curr_dic),))

    # ---- optional feature list output ----
    if curr_dic and verbose == "1":
        out_file = os.path.join(local_out_dir, cid_str + "_feature_list.json")
        print("INFO: feature file= %s" % (out_file,))
        # remove any stale feature file before appending to a fresh one
        if os.path.exists(out_file):
            try:
                os.remove(out_file)
            except OSError as e:
                print("ERROR: %s - %s." % (e.strerror, out_file))

        # Each model has its own coef_arr. The original compared against
        # ('kmeans'), a plain string, which is a substring test and raises
        # TypeError when learning_algorithm is None.
        if coef_arr is None and learning_algorithm != 'kmeans':
            key = "coef_arr"
            jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
            jstr_proj = '{"value":1}'
            # Use this function's own connection parameters; the original
            # referenced an undefined `args` object for the first four.
            doc = query_mongo.find_one(ip_address, port, db_name, tb_name,
                                       username, password,
                                       jstr_filter, jstr_proj)
            if doc and 'value' in doc:
                coef_arr = doc['value']

        # No coefficients (kmeans, or nothing stored): every feature falls
        # into the "None" branch below instead of crashing on len(None).
        len_coef = len(coef_arr) if coef_arr else 0
        fout_arr = []
        for k in curr_dic:
            fid = int(k)
            # NOTE(review): the bound is `fid < len_coef` while the lookup is
            # coef_arr[fid - 1]; kept as in the original -- confirm whether
            # fid == len_coef should also be accepted.
            if fid < len_coef:
                feat_out = {"ngram": "", "fid": k,
                            "coef": coef_arr[fid - 1], "desc": ""}
            else:
                feat_out = {"ngram": "", "fid": "None",
                            "coef": 0, "desc": str(k)}
            fout_arr.append(feat_out)

        with open(out_file, 'a') as out_f:
            if fout_arr:
                out_f.write(json.dumps(fout_arr))