def get_cv_grid(document, rid):
    cv_grid_data = None
    param_str = None
    jopts = None
    doc = None

    if document.ml_has_cv == "yes":
        # get data from mongo.dataset_info
        try:
            doc = query_mongo.find_one(
                settings.MONGO_OUT_DNS, settings.MONGO_OUT_PORT,
                settings.MONGO_OUT_DB, settings.MONGO_OUT_TBL,
                settings.MONGO_OUT_USR, settings.MONGO_OUT_PWD,
                '{"rid":' + rid + ',"key":"cv_result"}',
                '{"param_str":1,"cv_grid_data":1,"best_param":1,"_id":0}')
        except Exception as e:
            print "Exception from MongoDB:", e

        # if CV exists
        if doc:
            #print "cv_grid_data=",str(doc["cv_grid_data"])
            cv_grid_data = doc["cv_grid_data"]
            param_str = doc["param_str"]
    # convert ml_opts to a JSON object; title-case the algorithm name and replace "_" with spaces
    if doc and document.ml_opts:
        jopts = json.loads(document.ml_opts)
        jopts["learning_algorithm"] = jopts["learning_algorithm"].title().replace("_", " ")

    return cv_grid_data, param_str, jopts
def get_feat_impo(request, rid, perm, disabled4reader):
    # chk access
    document = _list.get_ds_doc(rid, perm)
    if not document:
        return Response({"data not found":-1})
    
    # get data from mongo.dataset_info
    doc=query_mongo.find_one(settings.MONGO_OUT_DNS, settings.MONGO_OUT_PORT, settings.MONGO_OUT_DB, settings.MONGO_OUT_TBL
        , settings.MONGO_OUT_USR, settings.MONGO_OUT_PWD
        , '{"rid":'+rid+',"key":"feature_importance"}', '{"value":1,"_id":0}')
        
    if doc:
        arr=doc["value"]    
        return Response(arr)
    else:
        return Response({"data not found":-1})
def main():

    parser = ArgumentParser(description=__description__)
    parser.add_argument("-d", "--name", type=str, metavar="file name", help="file name for prediction", required=False)
    parser.add_argument("-o", "--out", type=str, metavar="learner output", help="out files for prediction", required=False)
    parser.add_argument("-r", "--row_id", type=str, metavar="row_id number", help="row_id number in the db", required=False)
    parser.add_argument("-i", "--cid", type=str, metavar="child row id", help="child row id for prediction", required=False)
    ####other parameters
    parser.add_argument("-nb", "--num", type=str, metavar="n gram", help="window size for n gram", required=False)
    parser.add_argument("-pa", "--para", type=str, metavar="param in 1 gram", help="number of parameters in 1 gram, if -1, no 1 gram", required=False)
    parser.add_argument("-x", "--max", type=str, metavar="max number of features", help="max number of features generated", required=False)
    parser.add_argument("-fw", "--fromweb", type=str, metavar="flag for web", help="flag for web", required=False)
    parser.add_argument("-pm", "--parameter", type=str, metavar="parameters in json", help="json string contains learning alg and parameter selection", required=False)
    parser.add_argument("-pp", "--pca_param", type=str, metavar="pca parameters in json", help="json string contains pca parameter selection", required=False)
    parser.add_argument("-lb", "--lib", type=str, metavar="spark mllib or scikit", help="learning library used", required=False)
    parser.add_argument("-sl", "--showlabelname", type=str, metavar="show label name", help="0: not shown; 1: show label name", required=False)
    parser.add_argument("-dsid", "--ds_id", type=str, metavar="source dataset id", help="source dataset id for training option", required=False)

    parser.add_argument("-ptn", "--pattern_str", type=str, metavar="regular express pattern to extract string"
        , help="regular express pattern to extract string", required=False)
    parser.add_argument("-vb", "--verbose", type=str, metavar="show detailed features", help="show detailed features", required=False)
    parser.add_argument("-ft", "--feat_cnt_threshold", type=str, dest='feat_cnt_threshold', help="feature count to allow prediction"
            , default =config.get('machine_learning', 'feature_count_threshold'))
    
    ###SPARK###
    parser.add_argument('-sp','--sp_master', type=str, dest='sp_master', help='spark.master'
                , default =config.get('spark', 'spark_master'))
    parser.add_argument('-em','--exe_memory', type=str, dest='exe_memory', help='spark.executor.memory'
                , default =config.get('spark', 'spark_executor_memory'))
    parser.add_argument('-cm','--core_max', type=str, dest='core_max', help='spark.cores.max'
                , default =config.get('spark', 'spark_cores_max'))
    
    #### database for output
    parser.add_argument('-ip','--ip_address', type=str, dest='ip_address', help='mongodb ip address'
                , default =config.get('mongo', 'out_ip_address'))
    parser.add_argument('-p','--port', type=str, dest='port', help='mongodb port'
                , default =eval(config.get('mongo', 'out_port')))
    parser.add_argument('-dn','--db_name', type=str, dest='db_name', help='mongodb db name'
                , default =config.get('mongo', 'out_db'))
    parser.add_argument('-t','--tb_name', type=str, dest='tb_name', help='mongodb table name'
                , default =config.get('mongo', 'out_tb'))
    # auth
    parser.add_argument('-un','--username', type=str, dest='username', help='mongodb username'
                , default =config.get('mongo', 'out_username'))
    parser.add_argument('-pw','--password', type=str, dest='password', help='mongodb password'
                , default =config.get('mongo', 'out_password'))
    
    args = parser.parse_args()
    
    if args.name:
        input_gz = args.name
    else:
        input_gz  = '000d9941eaf04efb55e5d0ccff3d90ee.gz'
    if args.out:
        out_dir = args.out
    else:
        out_dir  = '.'
    if args.row_id:
        row_id_str = args.row_id
    else:
        row_id_str  = '553'
    if args.ds_id:
        ds_id = args.ds_id
    else:
        ds_id  = '' 
    if args.cid:
        cid_str = args.cid
    else:
        cid_str  = '01'

    ###################################################
    if args.num:
        num_gram = eval(args.num)
    else:
        num_gram  = eval(config.get("machine_learning","svm_num_gram"))
    if args.para:
        param_in_gram_1 = eval(args.para)
    else:
        param_in_gram_1  = eval(config.get("IN","param_in_gram_1_in"))
    if args.max:
        MAX_FEATURES = eval(args.max)
    else:
        MAX_FEATURES  = eval(config.get("IN","MAX_FEATURES_IN"))
    if args.fromweb:
        fromweb = args.fromweb
    else:
        fromweb  = None        
    if args.parameter:
        j_str = args.parameter
    else:
        j_str='{"c":"1","iterations":"300","regularization":"l2","learning_algorithm":"logistic_regression_with_sgd"}'
    if args.lib: # mllib or scikit
        mode = args.lib
    else:
        mode='scikit'
    if args.showlabelname: # show label name flag
        labelnameflag = eval(args.showlabelname)
    else:
        labelnameflag = 0
    if args.verbose: 
        verbose = args.verbose
    else:
        verbose = "1"    
    ######database########################################
    if len(args.username)>0:
        username = args.username
    else:
        username  = None
    if len(args.password)>0:
        password = args.password
    else:
        password  = None     
    
    
    
    t0 = time()
    coef_arr=None
    
    ml_opts = json.loads(j_str)
    
    # ML parameters ===================== input ml_opts ==============
    learning_algorithm=None
    try:
        if ml_opts is None:
            ml_opts = json.loads(j_str)
        learning_algorithm = ml_opts['learning_algorithm']        
    except Exception as e:
        print "WARNING: load learning_algorithm failed.",e
    
    
    # read raw data from .gz file ===================== input .gz ==============
    file_content=None
    try:
        f = gzip.open(input_gz, 'rb')
        file_content = f.read()
        f.close()
    except Exception as e:
        print "ERROR: load data file ["+input_gz+"] failed.",e
        return -5
        
    
    # get data here; assume libsvm format
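    # expected libsvm-style content: "<label> <fid>:<value> <fid>:<value> ...",
    # optionally prefixed by a sample_info token, e.g. (hypothetical sample):
    #   "sample_001 1 3:0.5 7:1.0"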
    label_feature_array=None
    feature_array=None
    label=None
    # optional
    sample_info=None
    if not file_content is None:
        label_features = file_content.strip()
        if label_features:
            label_feature_array = label_features.split(' ')
            label = label_feature_array[0]
            # check if 1st item is integer
            int_1st="y"
            try:
                int(label_feature_array[0])
            except ValueError:
                int_1st="n"
            # check if 2nd item is integer
            int_2nd="y"
            try:
                int(label_feature_array[1])
            except ValueError:
                int_2nd="n"
                
            if int_1st=="n" and int_2nd=="y":
                feature_array = label_feature_array[2:len(label_feature_array)]
            elif int_1st=="y" and int_2nd=="n":
                feature_array = label_feature_array[1:len(label_feature_array)]
                
            #feature_array = label_feature_array[1:len(label_feature_array)]
            # if sample_info exists, 2nd item will be digit and 3rd item won't be digit 
            if not label.isdigit() and label_feature_array[1].isdigit() and not label_feature_array[2].isdigit():
                sample_info=label
                label=label_feature_array[1]
                feature_array=label_feature_array[2:len(label_feature_array)] 
        else:
            print "ERROR: data format error!"
    else:
        print "ERROR: no data found!"
    #print label  ### -1 means no label
    #print feature_array
    curr_dic = {}
    #print "feature_array=",feature_array
    for features in feature_array:
        if len(features)>0:
            key, value = features.split(':')
            curr_dic[key] = float(value)
        else:
            print "WARNING: data format error!"
            
    print "INFO: curr_dic len=",len(curr_dic)

    if curr_dic and verbose=="1":

        #print "INFO: *** Feature list: ===================================="
        # clean up feature file
        out_file=os.path.join(out_dir,cid_str+"_feature_list.json")
        print "INFO: feature file=",out_file
        if os.path.exists(out_file):
            try:
                os.remove(out_file)
            except OSError as e:
                print ("ERROR: %s - %s." % (e.strerror, out_file))
        out_f=open(out_file, 'a')   
        
        # get coef_arr ==================================
        if coef_arr is None and learning_algorithm not in ('kmeans',):
            key = "coef_arr"  #{"123":"openFile"}
            jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
            jstr_proj='{"value":1}'
            # each model has its own coef_arr
                
            doc=query_mongo.find_one(args.ip_address, args.port, args.db_name, args.tb_name, username, password, jstr_filter, jstr_proj)
            coef_arr = doc['value']
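        # build one record per input feature; "coef" is looked up from the model's coef_arr
        # hypothetical example entry: {"ngram": "", "fid": "3", "coef": 0.12, "desc": ""}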
        
        fout_arr=[]
        len_coef=len(coef_arr)
        for k,v in curr_dic.items():
            feat_out={}
            feat_out["ngram"]=""
            if int(k) < len_coef:
                feat_out["fid"]=k
                feat_out["coef"]=coef_arr[int(k)-1]
                feat_out["desc"]=""
            else:
                feat_out["fid"]="None"
                feat_out["coef"]=0
                feat_out["desc"]=str(k)
                
            fout_arr.append(feat_out)

        if len(fout_arr) > 0:
            out_f.write(json.dumps(fout_arr))               
        out_f.close()
    print "RESULT: predict output=", sing_label_pred      

    ### generate label names (family names) #####
    ### connect to database to get the column list which contains all column number of the corresponding feature####
    pred_label=None
    if labelnameflag == 1:
        key = "dic_name_label"
        jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
        jstr_proj='{"value":1}'

        # get parent dataset's data
        if ds_id != row_id_str:
            jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'
        
        doc=query_mongo.find_one(args.ip_address, args.port, args.db_name, args.tb_name, username, password, jstr_filter, jstr_proj)
        dic_list = doc['value']
        
        label_dic = {}
        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                label_dic[dic_list[i][key]] = key.encode('UTF8')
        print "INFO: label_dic:", label_dic
        try:
            pred_label = label_dic[int(sing_label_pred)]
        except Exception as e:
            print "WARNING: Can't get label",e
            pred_label=str(sing_label_pred)
    else:
        pred_label = str(sing_label_pred)
    print "RESULT: prediction=", pred_label
def mrun(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr, excluded_feat_cslist
    , sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max
    , zipout_dir, zipcode_dir, zip_file_name
    , mongo_tuples, fromweb
    , training_fraction, jobname, run_number, bin_number ): 
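    # multi-run training: repeat a random train/test split run_number times, record each
    # run's accuracy, then plot the accuracy distribution (histogram + fitted normal) below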
        
    ### generate data folder and out folder, clean up if needed
    if not os.path.exists(local_out_dir):
        os.makedirs(local_out_dir)

    # zip func in other files for Spark workers ================= ================
    zip_file_path = ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util')
    print "INFO: zip_file_path=",zip_file_path       

    # get_spark_context
    sc=ml_util.ml_get_spark_context(sp_master
        , spark_rdd_compress
        , spark_driver_maxResultSize
        , sp_exe_memory
        , sp_core_max
        , jobname
        , [zip_file_path]) 
    
    
    t0 = time()

    # check if ml_opts.has_excluded_feat ==1 ===================================
    has_excluded_feat=0
    if not ml_opts_jstr is None:
        ml_opts=json.loads(ml_opts_jstr)
        if "has_excluded_feat" in ml_opts:
            has_excluded_feat=ml_opts["has_excluded_feat"]
    #print "has_excluded_feat=",has_excluded_feat,",excluded_feat_cslist=",excluded_feat_cslist
    
    # get excluded feature list from mongo ========== ===
    if str(has_excluded_feat) == "1" and excluded_feat_cslist is None:
        key = "feature_excluded"
        jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
        jstr_proj='{"value":1}'
        # get from own id (not from parent dataset id)
        #print "jstr_filter=",jstr_filter,",jstr_proj=",jstr_proj
        # assumption: mongo_tuples unpacks as (ip_address, port, db_name, tb_name, username, password)
        m_ip, m_port, m_db, m_tb, m_usr, m_pwd = mongo_tuples
        doc=query_mongo.find_one(m_ip, m_port, m_db, m_tb, m_usr, m_pwd, jstr_filter, jstr_proj)
        #print "feature_excluded=",doc
        if not doc is None and 'value' in doc:
            excluded_feat_cslist = ','.join(str(i) for i in doc['value'])
    print "INFO: excluded_feat_cslist=",excluded_feat_cslist
    
    
    ### generate Labeled point
    #libsvm_data_file = data_folder + "libsvm_data"
    # filename for featured data
    libsvm_data_file = os.path.join(hdfs_feat_dir , "libsvm_data")
    print "INFO: libsvm_data_file=", libsvm_data_file
    
    # load feature count file
    feat_count_file=libsvm_data_file+"_feat_count"
    feature_count=zip_feature_util.get_feature_count(sc,feat_count_file)
    print "INFO: feature_count=",feature_count

    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file)
    # load sample RDD from text file   
    #   also exclude selected features in sample ================ =====
    # format (LabeledPoint,hash) from str2LabeledPoint_hash() 
    samples_rdd, feature_count=zip_feature_util.get_sample_rdd(sc, libsvm_data_file, feature_count, excluded_feat_cslist)
    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file)
    
    # get distinct label list
    labels_list_all = samples_rdd.map(lambda p: p[0].label).distinct().collect()
    #labels_list_all = samples_rdd.map(lambda p: p.label).collect()

    t1 = time()
    print "INFO: labels_list_all=",labels_list_all
    #print "INFO: training and testing samples generated!"
    print 'INFO: data generating time: %f' %(t1-t0)
    t0 = t1
    
    ### generate label names (family names) #####
    ### connect to database to get the column list which contains all column number of the corresponding feature####
    label_set = set(labels_list_all)
    class_num = len(label_set)
    #class_num = len(labels_list)
    if class_num > 2:
        print "INFO: Number of classes=", class_num
    
    
    ###############################################
    ###########build learning model################
    ###############################################
    
    ### get the parameters###
    print "INFO: ============Learning Algorithm and Parameters============="
    #param_dict = json.loads(ml_opts_jstr)
    flag_model = ml_opts['learning_algorithm']     # 1: linear_svm_with_sgd; 2: logistic_regression_with_lbfgs; 3: logistic_regression_with_sgd
    C = eval(ml_opts['c'])
    iteration_num = ml_opts['iterations']
    regularization = ml_opts['regularization']
    print "INFO: Learning Algorithm: ", flag_model
    print "INFO: C = ", C
    print "INFO: iteration = ", iteration_num
    print "INFO: regType = ", regularization
    
    
    t0 = time()
    accuracy_array = np.zeros(run_number)
    for rnd in range (0, run_number):
    
        ### generate training and testing data
        training_rdd, testing_rdd = samples_rdd.randomSplit([training_fraction, 1-training_fraction])
        training_rdd=training_rdd.map(lambda p:p[0])# keep LabeledPoint only
        training_rdd.cache()
        testing_rdd.cache()
        training_sample_count = training_rdd.count()
                
        regP = C/float(training_sample_count)
        print "INFO: Calculated: regParam = ", regP
        
        ### build model ###
        
        if flag_model == "linear_svm_with_sgd":
            ### 1: linearSVM
            print "INFO: ====================1: Linear SVM============="
            model_classification = SVMWithSGD.train(training_rdd, regParam=regP, iterations=iteration_num, regType=regularization)   # regParam = 1/(sample_number*C)
            #print model_classification
        elif flag_model == "logistic_regression_with_lbfgs":
            ### 2: LogisticRegressionWithLBFGS
            print "INFO: ====================2: LogisticRegressionWithLBFGS============="
            model_classification = LogisticRegressionWithLBFGS.train(training_rdd, regParam=regP, iterations=iteration_num, regType=regularization, numClasses=class_num)   # regParam = 1/(sample_number*C)
        elif flag_model == "logistic_regression_with_sgd":
            ### 3: LogisticRegressionWithSGD
            print "INFO: ====================3: LogisticRegressionWithSGD============="
            model_classification = LogisticRegressionWithSGD.train(training_rdd, regParam=regP, iterations=iteration_num, regType=regularization)   # regParam = 1/(sample_number*C)    
        else:
            print "ERROR: Training model selection error: no valid ML model selected!"
            return
        
        ### Evaluating the model on testing data
        labelsAndPreds = testing_rdd.map(lambda p: (p[0].label, model_classification.predict(p[0].features)))
        labelsAndPreds.cache()
        testing_sample_number = testing_rdd.count()
        testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testing_sample_number)
        accuracy = 1 - testErr
        
        accuracy_array[rnd] = accuracy        
        print "INFO: current round=", rnd
        print "INFO: Accuracy=", accuracy
    

    ########################below: same as train_skLean_multi_run.py#####################################    
    ###############################################
    #######plot distribution and variance##########
    ###############################################

    plt.figure(1)
    
    num_bins = bin_number  ####10 is default
    n, bins, patches = plt.hist(accuracy_array, num_bins, normed=1, facecolor='green', alpha=0.5)
    ave = np.mean(accuracy_array)
    print "INFO: Accuracy mean=", ave
    variance = np.std(accuracy_array)
    print "INFO: Accuracy variance=", variance
    
    #print "INFO: bins: ", bins
    # add a 'best fit' line
    y = mlab.normpdf(bins, ave, variance)
    #print "INFO: y: ", y
    plt.plot(bins, y, 'r--')
    
    plt.title('Accuracy distribution of '+str(run_number)+' runs:')
    plt.xlabel('Accuracy Values')
    plt.ylabel('Probability / Accuracy bar width')
    
    #plt.savefig(local_out_dir+file_name_given+"_var_"+str(run_number)+".png")
    plt.savefig(os.path.join(local_out_dir, row_id_str+"_var_"+str(run_number)+".png"))

    # create ROC data for graph ====================
    all_json=[]
    barp_arr=[] #n
    disp_arr=[] #y
    last_idx=0
    for idx,ht in enumerate(n): # n is bar height
        #print "INFO: mrun bar=",idx, bins[idx], bins[idx+1],((bins[idx]+bins[idx+1])/2)
        barp_arr.append([ ((bins[idx]+bins[idx+1])/2.0),n[idx]]) # mid point for x axis
        if not math.isnan(y[idx]):
            disp_arr.append([bins[idx],y[idx]])
        last_idx=idx
    #print "INFO: ",last_idx+1, bins[last_idx+1], y[last_idx+1]
    if not math.isnan(y[last_idx+1]):
        disp_arr.append([bins[last_idx+1],y[last_idx+1]])
    #print "barp_arr=", barp_arr
    #print "disp_arr=", disp_arr
    #bar
    bar_json={}
    bar_json["values"]=barp_arr
    bar_json["key"]='Mutil-Run Accuracy' #
    bar_json["type"]="bar" # light blue
    bar_json["yAxis"]=1
    all_json.append(bar_json)
    #distribution
    if len(disp_arr)>0:
        dis_json={}
        dis_json["values"]=disp_arr
        dis_json["key"]='Normal Distribution' #
        dis_json["type"]="line" # light blue
        dis_json["yAxis"]=1
        all_json.append(dis_json)
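    # the mrun JSON written below is a list of chart series objects, e.g. (hypothetical):
    #   [{"key": "Multi-Run Accuracy", "type": "bar", "yAxis": 1, "values": [[mid, height], ...]},
    #    {"key": "Normal Distribution", "type": "line", "yAxis": 1, "values": [[x, y], ...]}]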
    
    mrun_jfile = os.path.join(local_out_dir, row_id_str+"_mrun.json")
    #print "INFO: all_json=",all_json
    print "INFO: mrun_jfile=",mrun_jfile
    if os.path.exists(mrun_jfile):
        try:
            os.remove(mrun_jfile)
        except OSError as e:
            print ("Error: %s - %s." % (e.strerror, mrun_jfile))
def result2(request, rid, oid, perm, disabled4reader):
    print 'in result2, rid=', rid, ', oid=', oid
    o_rid = rid
    # get train option doc, if oid provided
    if oid > 0:
        rid = oid
    document = _list.get_ds_doc(rid, perm)
    if not document:
        return HttpResponseRedirect(reverse('atdml.views.list'))
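    # collect everything the result page needs: prediction list, sample file list,
    # cross-validation grid, availability flags for the ROC/mrun/score/feature JSON
    # artifacts, excluded features, and performance / dataset measures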

    # for return only
    #form=DocumentForm()
    predictions = []  #Document.objects.all().filter(file_type="predict", train_id=rid).order_by('-id')[0:10]
    # get train option id
    train_id = document.train_id
    ml_lib = document.ml_lib
    status = document.status
    # get sample file list
    sflist = _predict.get_sfile_list(document.filename, document.id,
                                     document.file_type, train_id)
    # how to get dir?
    # get cross validation info
    cv_grid_data, param_str, jopts = get_cv_grid(document, rid)
    print "************** ml_has_cv=", document.ml_has_cv, cv_grid_data

    if jopts:
        print "rid=", rid, ", jopts=", jopts
    else:
        print "rid=", rid, ", jopts not found"

    has_roc = has_result_file(rid, str(rid) + "_roc.json")
    has_mrun = has_result_file(rid, str(rid) + "_mrun.json")
    has_score = has_result_file(rid, str(rid) + "_score_graph.json")
    print "has_roc=", has_roc, ", has_mrun=", has_mrun, ", has_score=", has_score
    has_result = None

    # check algorithm
    train_opt = {}
    if not document.ml_opts is None and len(document.ml_opts) > 0:
        train_opt = json.loads(document.ml_opts)

    #
    if document.status_code >= 500:
        # check if clustering data is in
        if has_result_file(rid, str(rid) + "_cluster*.png") \
                and train_opt.get("learning_algorithm") in ('kmeans',):
            has_result = "U"
        else:
            # check if png for classification exists?
            has_result = "Y"
    elif ml_lib == "dnn":  # allow DNN to view status
        has_result = "Y"

    has_featc = has_result_file(rid, str(rid) + "_feat_coef.json")
    has_fp = has_result_file(rid, str(rid) + "_false_pred.json")

    # get ml_opts
    feature_excluded_list = None
    if "has_excluded_feat" in train_opt and train_opt["has_excluded_feat"] == 1:
        # get data from mongo.dataset_info
        try:
            doc = query_mongo.find_one(
                settings.MONGO_OUT_DNS, settings.MONGO_OUT_PORT,
                settings.MONGO_OUT_DB, settings.MONGO_OUT_TBL,
                settings.MONGO_OUT_USR, settings.MONGO_OUT_PWD,
                '{"rid":' + rid + ',"key":"feature_excluded"}', '{"value":1}')
            if not doc is None:
                #print "doc type=", type(doc), ",doc=",doc
                feature_excluded_list = doc["value"]
                print "feature_excluded_list=", feature_excluded_list
        except Exception as e:
            print "Exception from MongoDB:", e

    rpage = 'atdml/result.html'
    if oid > 0:
        rpage = 'atdml/result_opts.html'
    feat_str = ""
    if not feature_excluded_list is None:
        feat_str = ','.join(str(i) for i in feature_excluded_list)
    print "has_roc=", has_roc, ", has_mrun=", has_mrun, ", has_result=", has_result, "rpage=", rpage

    # get perf and dataset info
    if document.perf_measures and document.perf_measures != "null":
        perf_measures = json.loads(document.perf_measures)
    else:
        perf_measures = {}
    if document.dataset_info and document.dataset_info != "null":
        dataset_info = json.loads(document.dataset_info)
    else:
        dataset_info = {}
    return render(
        request,
        #'atdml/result.html',
        rpage,
        {
            "document": document,
            "predictions": predictions,
            "sflist": sflist,  #, "form": form
            "disabled4reader": disabled4reader,
            "perm": perm,
            "cv_grid_data": cv_grid_data,
            "param_str": param_str,
            "has_fp": has_fp,
            "jopts": jopts,
            "has_roc": has_roc,
            "has_mrun": has_mrun,
            "has_result": has_result,
            "has_featc": has_featc,
            "has_score": has_score,
            "feature_excluded": feat_str,
            "ml_lib": ml_lib,
            "status": status,
            "tp": perf_measures["tp"] if "tp" in perf_measures else "",
            "tn": perf_measures["tn"] if "tn" in perf_measures else "",
            "fp": perf_measures["fp"] if "fp" in perf_measures else "",
            "fn": perf_measures["fn"] if "fn" in perf_measures else "",
            "phi": '%0.5f' % perf_measures["phi"] if "phi" in perf_measures else "",
            "fscore": '%0.5f' % perf_measures["fscore"] if "fscore" in perf_measures else "",
            "roc_auc": '%0.5f' % perf_measures["roc_auc"] if "roc_auc" in perf_measures else "",
            "class_count": dataset_info["class_count"] if "class_count" in dataset_info else "",
            "training_fraction": dataset_info["training_fraction"] if "training_fraction" in dataset_info else "",
            "dataset_count": dataset_info["dataset_count"] if "dataset_count" in dataset_info else "",
            "MEDIA_URL": settings.MEDIA_URL
        },
    )
def main():

    parser = ArgumentParser(description=__description__)
    parser.add_argument("-f",
                        "--folder",
                        type=str,
                        metavar="folder of features",
                        help="hdfs folder contains features",
                        required=False)
    parser.add_argument("-n",
                        "--name",
                        type=str,
                        metavar="file name",
                        help="file name for sample folder",
                        required=False)
    parser.add_argument("-o",
                        "--out",
                        type=str,
                        metavar="out figure folder",
                        help="folder contains output",
                        required=False)
    parser.add_argument("-r",
                        "--row_id",
                        type=str,
                        metavar="row id",
                        help="row_id number in the db",
                        required=False)
    parser.add_argument("-mf",
                        "--modelfolder",
                        type=str,
                        metavar="model folder",
                        help="model for prediction",
                        required=False)
    parser.add_argument(
        "-l",
        "--listfile",
        type=str,
        metavar="list file",
        help="list of testing data hashes for single file prediction",
        required=False)
    parser.add_argument("-u",
                        "--uploadtype",
                        type=str,
                        metavar="upload type",
                        help="data type",
                        required=False)
    parser.add_argument("-w",
                        "--fromweb",
                        type=str,
                        metavar="flag for web",
                        help="flag for web",
                        required=False)
    parser.add_argument(
        "-pm",
        "--parameter",
        type=str,
        metavar="parameters in json",
        help="json string contains learning alg and parameter selection",
        required=False)

    parser.add_argument('-sp',
                        '--sp_master',
                        type=str,
                        dest='sp_master',
                        help='spark.master',
                        default=config.get('spark', 'spark_master'))
    parser.add_argument('-em',
                        '--exe_memory',
                        type=str,
                        dest='exe_memory',
                        help='spark.executor.memory',
                        default=config.get('spark', 'spark_executor_memory'))
    parser.add_argument('-cm',
                        '--core_max',
                        type=str,
                        dest='core_max',
                        help='spark.cores.max',
                        default=config.get('spark', 'spark_cores_max'))

    #### database
    parser.add_argument('-ip',
                        '--ip_address',
                        type=str,
                        dest='ip_address',
                        help='mongodb ip address',
                        default=config.get('mongo', 'ip_address'))
    parser.add_argument('-p',
                        '--port',
                        type=str,
                        dest='port',
                        help='mongodb port',
                        default=eval(config.get('mongo', 'port')))
    parser.add_argument('-dn',
                        '--db_name',
                        type=str,
                        dest='db_name',
                        help='mongodb db name',
                        default=config.get('mongo', 'out_db'))
    parser.add_argument('-t',
                        '--tb_name',
                        type=str,
                        dest='tb_name',
                        help='mongodb table name',
                        default=config.get('mongo', 'out_feat_tb'))
    # auth
    parser.add_argument('-un',
                        '--username',
                        type=str,
                        dest='username',
                        help='mongodb username',
                        default=config.get('mongo', 'username'))
    parser.add_argument('-pw',
                        '--password',
                        type=str,
                        dest='password',
                        help='mongodb password',
                        default=config.get('mongo', 'password'))

    args = parser.parse_args()

    if args.folder:
        feat_dir = args.folder
    else:
        feat_dir = config.get(
            'app', 'HADOOP_MASTER'
        ) + '/user/hadoop/yigai/sality_virut_zbot_backdoor_hash_000'
    if args.name:
        file_name_given = args.name
    else:
        file_name_given = 'aaaa'
    if args.out:
        out_dir = args.out
    else:
        out_dir = 'out_result'
    if args.row_id:
        row_id_str = args.row_id
    else:
        row_id_str = '88'

    if args.modelfolder:
        model_data_folder = args.modelfolder
    else:
        model_data_folder = out_dir + '/' + row_id_str + '_model/'
    if args.listfile:
        list_file_test = args.listfile
    else:
        list_file_test = out_dir + '/' + row_id_str + '_testhashlist.txt'
    if args.uploadtype:
        uploadtype = args.uploadtype
    else:
        uploadtype = None
    if args.fromweb:
        fromweb = args.fromweb
    else:
        fromweb = None
    if args.parameter:
        j_str = args.parameter
    else:
        j_str = '{"learning_algorithm":"linear_svm_with_sgd", "c":"1", "iteration":"300", "regularization":"l2"}'
    if len(args.username) > 0:
        username = args.username
    else:
        username = None
    if len(args.password) > 0:
        password = args.password
    else:
        password = None

    data_folder = feat_dir + "/"
    out_dir = out_dir + "/"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    if os.path.exists(model_data_folder):
        shutil.rmtree(model_data_folder)
    if not os.path.exists(model_data_folder):
        os.makedirs(model_data_folder)

    if os.path.isfile(list_file_test):
        try:
            os.remove(list_file_test)
        except OSError:
            pass
    else:
        with open(list_file_test, "w") as myfile:
            pass

    SparkContext.setSystemProperty('spark.rdd.compress',
                                   config.get('spark', 'spark_rdd_compress'))
    SparkContext.setSystemProperty(
        'spark.driver.maxResultSize',
        config.get('spark', 'spark_driver_maxResultSize'))
    SparkContext.setSystemProperty('spark.executor.memory', args.exe_memory)
    SparkContext.setSystemProperty('spark.cores.max', args.core_max)

    sc = SparkContext(args.sp_master, 'sk-learn-train:' + str(args.row_id))

    t0 = time()

    ### load libsvm file ###
    libsvm_data_file = data_folder + "libsvm_data"
    print "libsvm_data_file:", libsvm_data_file
    samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file)
    #print samples_rdd.count()
    labels_and_features_rdd = samples_rdd.map(lambda p: (p.label, p.features))
    all_data = labels_and_features_rdd.collect()
    features_list = [x.toArray() for _, x in all_data]
    labels_list = [x for x, _ in all_data]
    labels_list = array(labels_list)

    features_array = np.array(features_list)

    ### generate sparse matrix (csr) for all samples
    features_sparse_mtx = csr_matrix(features_array)

    ### randomly split the samples into training and testing data
    X_train_sparse, X_test_sparse, labels_train, labels_test = cross_validation.train_test_split(
        features_sparse_mtx, labels_list, test_size=0.4)
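    # 60/40 split: test_size=0.4 holds out 40% of the samples (and their labels) for testing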

    t1 = time()
    print 'data generating time: %f' % (t1 - t0)

    ###############################################
    ###########build learning model################
    ###############################################

    # the learning algorithm is assumed to come from the ml_opts JSON string (j_str)
    ml_opts = json.loads(j_str)
    flag_model = ml_opts['learning_algorithm']

    if flag_model == "linear_svm_with_sgd":
        ### 1: linearSVM
        print "====================1: Linear SVM============="
        clf = svm.LinearSVC()
        clf.fit(X_train_sparse, labels_train)

    #### save clf for future use ####
    joblib.dump(clf, model_data_folder + row_id_str + '.pkl')

    #print "**model:coef***"
    #print clf.coef_
    #print "**model:intercept***"
    #print clf.intercept_

    ### Evaluating the model on testing data
    labels_pred = clf.predict(X_test_sparse)
    #print "************results*********"
    #print "Predicting results:"
    #print labels_pred
    #print "True testing labels:"
    #print labels_test

    accuracy = clf.score(X_test_sparse, labels_test)
    print "Accuracy = ", accuracy

    ###################################################
    ### generate label names (family names) ###########
    ### connect to database to get the column list which contains all column number of the corresponding feature####
    ###################################################
    key = "dic_name_label"
    jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
    jstr_proj = '{"value":1}'

    doc = query_mongo.find_one(args.ip_address, args.port, args.db_name,
                               args.tb_name, username, password, jstr_filter,
                               jstr_proj)
    dic_list = doc['value']

    label_dic = {}
    for i in range(0, len(dic_list)):
        for key in dic_list[i]:
            label_dic[dic_list[i][key]] = key.encode('UTF8')
    print "label_dic:", label_dic

    labels_list = []
    for key in sorted(label_dic):
        labels_list.append(label_dic[key])

    ### generate sample numbers of each family in testing data###
    testing_sample_number = len(labels_test)
    print "testing_sample_number:", testing_sample_number
    test_cnt_dic = {}
    for key in label_dic:
        test_cnt_dic[key] = 0
    for i in range(0, testing_sample_number):
        for key in label_dic:
            if labels_test[i] == key:
                test_cnt_dic[key] = test_cnt_dic[key] + 1
    print "Number of samples in each label is:", test_cnt_dic

    ###############################################
    ###########plot prediction result figure#######
    ###############################################

    ### reorder labels so that labels are ordered according to the true label of the data
    len_pred = len(labels_pred)
    wide_len = int(math.ceil(math.sqrt(len_pred)))

    pred_list = labels_pred.tolist()
    test_list = labels_test.tolist()
    labels_true_pred = zip(test_list, pred_list)
    labels_true_pred.sort(key=lambda x: x[0])

    test_ordered = [x for x, _ in labels_true_pred]
    pred_ordered = [x for _, x in labels_true_pred]

    last_value = test_ordered[len_pred - 1]
    for i in range(len_pred, int(wide_len * wide_len)):
        test_ordered.append(last_value)
        pred_ordered.append(last_value)

    mtx_testing = np.reshape(test_ordered, (wide_len, wide_len))
    mtx_pred = np.reshape(pred_ordered, (wide_len, wide_len))
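    # pad to a square grid so the label sequences can be rendered with imshow below;
    # mtx_pred holds the predicted labels, mtx_testing the true labels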

    ### plot figues ###
    fig, ax = plt.subplots()
    cax = ax.imshow(mtx_pred, interpolation='nearest', cmap=plt.cm.jet)
    num_labels = len(labels_list)
    tic = range(0, num_labels)

    labels_str = []
    # append sample count at the end
    for key in sorted(test_cnt_dic):
        labels_str.append(labels_list[key] + "(" + str(test_cnt_dic[key]) +
                          ")")

    cbar = fig.colorbar(cax, ticks=tic)
    cbar.ax.set_yticklabels(labels_str)  # vertically oriented colorbar
    cbar.ax.invert_yaxis()
    plt.xlabel('Prediction (Single Run)')
    plt.savefig(out_dir + file_name_given + "_1" + ".png")

    fig, ax = plt.subplots()
    cax = ax.imshow(mtx_testing, interpolation='nearest', cmap=plt.cm.jet)
    #ax.set_title('Gaussian noise with vertical colorbar')
    # Add colorbar, make sure to specify tick locations to match desired ticklabels
    cbar = fig.colorbar(cax, ticks=tic)
    cbar.ax.set_yticklabels(labels_str)  # vertically oriented colorbar
    cbar.ax.invert_yaxis()
    plt.xlabel('True Labels (Single Run)')
    plt.savefig(out_dir + file_name_given + "_2" + ".png")

    plt.show()

    #############################################################
    ###################for 2 class only (plot ROC curve)#########
    #############################################################
    if len(labels_list) == 2:

        reverse_label_dic = dict((v, k) for k, v in label_dic.items())
        if 'clean' in reverse_label_dic:
            flag_clean = reverse_label_dic['clean']
        else:
            print "No ROC curve generated: 'clean' must be a label for indicating negative class!"
            return

        confidence_score = clf.decision_function(X_test_sparse)

        if flag_clean == 0:
            scores = [x for x in confidence_score]
            s_labels = [x for x in labels_test]
            testing_N = test_cnt_dic[0]
            testing_P = test_cnt_dic[1]
        else:
            scores = [-x for x in confidence_score]
            s_labels = [1 - x for x in labels_test]
            testing_N = test_cnt_dic[1]
            testing_P = test_cnt_dic[0]
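        # scores and labels are flipped above whenever 'clean' is not label 0, so the
        # negative ('clean') class always maps to 0 before the ROC curve is computed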

        ###########plot ROC figure#######
        try:
            fpr, tpr, thresholds = roc_curve(s_labels, scores, pos_label=1)
            roc_auc = auc(fpr, tpr)
        except ValueError as e:
            print "Error!! in ROC curve: ",
            print e
            return

        print "ROC_AUC = ", roc_auc

        plt.figure()
        plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC')
        plt.legend(loc="lower right")
        plt.savefig(out_dir + file_name_given + "_ROC" + ".png")
        print "Figure save!"

        #### generate fpr tpr ACC threshold results file###
        ROC_file = out_dir + file_name_given + "_ROC_value.txt"
        if os.path.exists(ROC_file):
            try:
                os.remove(ROC_file)
            except OSError as e:
                print("Error: %s - %s." % (e.strerror, ROC_file))

        for i in range(0, len(fpr)):
            ACC = (testing_P * tpr[i] + testing_N *
                   (1 - fpr[i])) / (testing_P + testing_N)
            with open(ROC_file, 'a') as f:
                f.write('%0.5f  ' % (fpr[i]))
                f.write('%0.5f  ' % (tpr[i]))
                f.write('%0.5f  ' % (thresholds[i]))
                f.write('%0.5f\n' % (ACC))

        ### print FPR, TPR, ACC ###
        if flag_model == "linear_svm_with_sgd":
            thr = 0
        elif flag_model == "logistic_regression_with_lbfgs" or flag_model == "logistic_regression_with_sgd":
            thr = -0.5
        for i in range(0, len(thresholds)):
            if thresholds[i] < thr:
                print "===Results Summary==="
                print "Accuracy: ", accuracy
                #print "Accuracy (calculate): ", (testing_P*tpr[i-1] + testing_N*(1-fpr[i-1]))/(testing_P + testing_N)
                print "False Positive Rate (FPR): ", fpr[i - 1]
                print "True Positive Rate (TPR): ", tpr[i - 1]
                print "====================="
                break
def predict(row_id_str, ds_id, cid_str, input_gz, local_out_dir, num_gram
        , j_str, lib_mode
        , fromweb, verbose,label_idx=0, data_idx=3, metadata_count=3, pattern_str='(.*)', ln_delimitor = '\t', binary_flag=True, labelnameflag=1
        , model_filename=None, str_model_json=None, sample_txt=None ,pca_filename=None, pca_param=None
        , sp_master=config.get('spark', 'spark_master'), exe_memory=config.get('spark', 'spark_executor_memory')
        , core_max=config.get('spark', 'spark_cores_max')
        , MAX_FEATURES=eval(config.get("machine_learning","MAX_FEATURES")) , dic_name_label=None
        , feat_cnt_threshold=config.get('machine_learning', 'feature_count_threshold')
        , ip_address=config.get('mongo', 'out_ip_address'), port=eval(config.get('mongo', 'out_port'))
        , db_name=config.get('mongo', 'out_db'), tb_name=config.get('mongo', 'out_tb')
        , username=config.get('mongo', 'out_username'), password=config.get('mongo', 'out_password')
        , sc=None
        ):
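    # pattern-based single-sample prediction: unpack the .gz into one delimited line,
    # extract n-gram hash features, and resolve model data (coef_arr, hash dictionaries,
    # PCA params, label names) from the supplied model JSON/file or from MongoDB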

    print "in"
    t0 = time()
    coef_arr=None
    dic_hash_str=None
    dic_seq_hashes=None
    dic_hashes_seq=None
    feat_sample_count_arr=None
    hashes_cnt_dic=None
    hash_str_dic=None
    data_rows=None
    data_cols=None

    ml_opts=None

    # load model from strings ============ for offline IN, liner model ==============
    if not str_model_json is None and len(str_model_json)>10:
        try:
            model_json=json.loads(str_model_json)
        except Exception as e:
            print "ERROR: model json load error." , e
            return -1
      
        #print "model_json=",model_json
        if "coef_arr" in model_json:
            coef_arr=model_json["coef_arr"]
            col_num = len(coef_arr)   
        if "coef_intercept" in model_json:
            coef_intercept=model_json["coef_intercept"]
        if "dic_hash_str" in model_json:
            dic_hash_str=model_json["dic_hash_str"]
        if "dic_seq_hashes" in model_json:
            dic_seq_hashes=model_json["dic_seq_hashes"]    
        if "pca_param" in model_json:
            pca_param=model_json["pca_param"]
        if "feat_sample_count_arr" in model_json:
            feat_sample_count_arr=model_json["feat_sample_count_arr"]
        if dic_name_label is None:
            dic_name_label=model_json["dic_name_label"]
        if j_str is None:
            j_str=model_json["ml_opts"]
        ml_opts=json.loads(j_str)
        #print "j_str=",j_str
        num_gram=eval(model_json["ml_n_gram"])
        lib_mode="offline"
    # load model from a file ======
    elif not model_filename is None: 
        if os.path.exists(model_filename):
            print "INFO: model from file=",model_filename
            try:
                with open(model_filename) as jf:
                    model_json=json.load(jf)
                    if "coef_arr" in model_json:
                        coef_arr=model_json["coef_arr"]
                        col_num = len(coef_arr)   
                    if "coef_intercept" in model_json:
                        coef_intercept=model_json["coef_intercept"]
                    if "dic_hash_str" in model_json:
                        dic_hash_str=model_json["dic_hash_str"]
                    if "dic_seq_hashes" in model_json:
                        dic_seq_hashes=model_json["dic_seq_hashes"]    
                    if "pca_param" in model_json:
                        pca_param=model_json["pca_param"]
                    if "feat_sample_count_arr" in model_json:
                        feat_sample_count_arr=model_json["feat_sample_count_arr"]
                    dic_name_label=model_json["dic_name_label"]
                    ml_opts=model_json["ml_opts"]
                    num_gram=eval(model_json["ml_n_gram"])
                    lib_mode="offline"
                    #print "-- id=",id,",ds_id=",ds_id
                    print "INFO: model feature count=",col_num
                    print "INFO: model for num_gram=",num_gram
                    #print "model json=",model_json
            except Exception as e:
                print "ERROR: loading model file ["+model_filename+"] error! ",e
                return -3
        else:
            print "ERROR: model file ["+model_filename+"] not found! "
            return -4
            
    # ML parameters ===================== input ml_opts ==============
    learning_algorithm=None
    try:
        if ml_opts is None:
            ml_opts = json.loads(j_str)
        #print "INFO: ml_opts=",ml_opts
        if 'learning_algorithm' in ml_opts:
            learning_algorithm = ml_opts['learning_algorithm'] 
    except Exception as e:
        print "WARNING: load learning_algorithm failed.",e
    print "INFO: learning_algorithm=",learning_algorithm
    
    
    # read raw data from .gz file ===================== input .gz ==============
    #if json_str is None:
    # TBD by faster convert_to_line_by_bash()
    try:
        f = gzip.open(input_gz, 'rb')
        sample_txt = convert_to_line(f, metadata_count) # check if one line, if raw file then convert to 1 line
        #print "sample_txt=",sample_txt[:100].replace('\t',',')
        #print "sample_txt=",sample_txt.replace('\t',',')
        f.close()
    except Exception as e:
        print "ERROR: load data file ["+input_gz+"] failed.",e
        return -5

    #f = gzip.open(input_gz, 'rb')
    #file_content = f.readline() # assume only one line
    #file_content=convert_to_line(f, metadata_count) # check if one line, if raw file then convert to 1 line
    #f.close()

    # input: assume one line of ngram pattern format string ===========
    #       return an array [meta-data1,meta-data2,...,str_arr]
    raw_arr=None
    coef_arr=None
    feat_arr=None

    # input:  one line text
    #       return array: [meta-data1,meta-data2,..., hash_cnt_dic, hash_str_dic]
    raw_arr=preprocess_pattern(sample_txt, metadata_count, pattern_str, ln_delimitor, label_idx, label_arr=None )
    #print "*****************raw_arr=",raw_arr
    
    # input:  array: [meta-data1,meta-data2,..., hash_cnt_dic, hash_str_dic]
    #       return hashes_cnt_dic: {hash,hash:count),...}  hash_str_dic: {hash: 'str1',... }
    feat_arr=feature_extraction_ngram(raw_arr, data_idx, MAX_FEATURES, num_gram)
    #print "**************feat_arr=",feat_arr

    #
    if feat_arr is None or len(feat_arr)==0:
        print "ERROR: Raw data format error or no feature found at predict_single_file_pattern."
        return -1

    # load PCA params ================= ========
    threshold=None
    n_component=None
    # data for PCA; TBD for all algorithm
    if learning_algorithm =='kmeans' : #
        if pca_param is None:
            # get from mongo 
            key = "pca_param"
            jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
            jstr_proj='{"value":1}'  

            # ???get parent dataset's data
            #if ds_id != row_id_str:
            #    jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'
                    
            doc=query_mongo.find_one(ip_address, port, db_name, tb_name, username, password, jstr_filter, jstr_proj)
            if doc and "value" in doc:
                pca_param = doc['value']
                print "INFO: pca_param=", pca_param

        # param for PCA model and transform: expect both threshold and k in pca_param
        if not pca_param is None:
            if "threshold" in pca_param:
                threshold=pca_param["threshold"]
            if "k" in pca_param:
                n_component=pca_param["k"]
        print "INFO: n_component=",n_component,", threshold=",threshold        
        
    # get {seq :hash,hash } mapping from mongo  key=dic_seq_hashes ===================
    if dic_seq_hashes is None:
        key = "dic_seq_hashes"
        jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
        jstr_proj='{"value":1}'
        #print "************** ds_id=",ds_id,", rid=",row_id_str
        # get parent dataset's data
        if ds_id != row_id_str:
            jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'
        
        doc=query_mongo.find_one(ip_address, port, db_name, tb_name, username, password, jstr_filter, jstr_proj)
        if not doc is None:
            dic_seq_hashes = doc['value']
        else:
            # get from local file
            fn=os.path.join(local_out_dir,ds_id+"_dic_seq_hashes.pkl")
            print "INFO: get dic_seq_hashes from local", fn
            dic_seq_hashes=ml_util.ml_pickle_load(fn)
            print "INFO: len(dic_seq_hashes)=", len(dic_seq_hashes)
        
    if dic_seq_hashes:
        dic_len=len(dic_seq_hashes)
    else:
        dic_len=0

    # print feature for ref by a new optional param?
    out_f=None
    # for feature list. not for kmeans 
    if verbose=="1" and learning_algorithm not in ('kmeans'):
        # get {hash : raw string} mapping ==================================
        if dic_hash_str is None:
            key = "dic_hash_str"  #{"123":"openFile"}
            jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
            jstr_proj='{"value":1}'
            # get parent dataset's data
            if ds_id != row_id_str:
                jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'
                
            doc=query_mongo.find_one(ip_address, port, db_name, tb_name, username, password, jstr_filter, jstr_proj)
            if not doc is None:
                dic_hash_str = doc['value']
            else:
                # get from local file
                fn=os.path.join(local_out_dir,ds_id+"_dic_hash_str.pkl")
                print "INFO: get dic_hash_str from local", fn
                dic_hash_str=ml_util.ml_pickle_load(fn)
                print "INFO: len(dic_hash_str)=", len(dic_hash_str)
        
            #print "hashes_cnt_dic=",hashes_cnt_dic
            #print "dic_hash_str=",dic_hash_str
        
        # clean up feature file
        out_file=os.path.join(local_out_dir,cid_str+"_feature_list.json")
        if os.path.exists(out_file):
            try:
                os.remove(out_file)
            except OSError as e:
                print ("ERROR: %s - %s." % (e.strerror, out_file))
        if not learning_algorithm in ('kmeans','lstm','cnn'):
            print "INFO: feature file=",out_file
            out_f=open(out_file, 'a')
        
        # get local dict    
        hash_str_dic=feat_arr[data_idx+1]
        # convert key to string
        hash_str_dic={str(k): v for k, v in hash_str_dic.items()}

        coef_arr=None
        # get coef_arr ==================================
        if coef_arr is None and not learning_algorithm in ('kmeans','lstm','cnn'):
            key = "coef_arr"  #{"123":"openFile"}
            jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
            jstr_proj='{"value":1}'
            # each model has its own coef_arr
                
            doc=query_mongo.find_one(ip_address, port, db_name, tb_name, username, password, jstr_filter, jstr_proj)
            if not doc is None and 'value' in doc:
                coef_arr = doc['value']
            else:
                # get from local file
                fn=os.path.join(local_out_dir,ds_id+"_coef_arr.pkl")
                print "INFO: get coef_arr from local fn=", fn
                coef_arr=ml_util.ml_pickle_load(fn)
                print "INFO: len(coef_arr)=", len(coef_arr)

        # get feat_sample_count_arr ==================================
        if feat_sample_count_arr is None and learning_algorithm not in ('kmeans',):
            key = "feat_sample_count_arr"  
            jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
            jstr_proj='{"value":1}'
            # get parent dataset's data
            if ds_id != row_id_str:
                jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'
                
            doc=query_mongo.find_one(ip_address, port, db_name, tb_name, username, password, jstr_filter, jstr_proj)
            if not doc is None:
                feat_sample_count_arr = doc['value']
            else:
                # get from local file
                fn=os.path.join(local_out_dir,ds_id+"_feat_sample_count_arr.pkl")
                print "INFO: get feat_sample_count_arr from local", fn
                feat_sample_count_arr=ml_util.ml_pickle_load(fn)
                print "INFO: len(feat_sample_count_arr)=", len(feat_sample_count_arr)  
    ### generate label names (family names) #####
    ### connect to database to get the column list which contains all column number of the corresponding feature####
    pred_label=None
    label_dic = {}
    if labelnameflag == 1:
        if dic_name_label is None:
            key = "dic_name_label"
            jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
            jstr_proj='{"value":1}'

            # get parent dataset's data
            if ds_id != row_id_str:
                jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'
            
            doc=query_mongo.find_one(ip_address, port, db_name, tb_name, username, password, jstr_filter, jstr_proj)
            dic_list = doc['value']
        else:
            # dic_name_label provided by the caller (e.g. loaded from an offline model JSON)
            dic_list = dic_name_label

        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                label_dic[dic_list[i][key]] = key.encode('UTF8')
        print "INFO: label_dic=", label_dic
        

        if not sing_label_pred is None:
            pred_label = label_dic[int(sing_label_pred)]
        if learning_algorithm in ("kmeans",):
            pred_label = "cluster# "+str(sing_label_pred)
        print "RESULT: prediction=", pred_label
    
    status="predicted"
def predict(row_id_str,
            ds_id,
            cid_str,
            input_gz,
            local_out_dir,
            num_gram,
            j_str,
            lib_mode,
            fromweb,
            verbose,
            labelnameflag=1,
            model_filename=None,
            str_model_json=None,
            sample_txt=None,
            pca_filename=None,
            pca_param=None,
            sp_master=config.get('spark', 'spark_master'),
            exe_memory=config.get('spark', 'spark_executor_memory'),
            core_max=config.get('spark', 'spark_cores_max'),
            MAX_FEATURES=eval(config.get("machine_learning", "MAX_FEATURES")),
            dic_name_label=None,
            feat_cnt_threshold=config.get('machine_learning',
                                          'feature_count_threshold'),
            ip_address=config.get('mongo', 'out_ip_address'),
            port=eval(config.get('mongo', 'out_port')),
            db_name=config.get('mongo', 'out_db'),
            tb_name=config.get('mongo', 'out_tb'),
            username=config.get('mongo', 'out_username'),
            password=config.get('mongo', 'out_password'),
            sc=None):
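    # libsvm-format single-sample prediction: parse "<fid>:<value>" pairs from the .gz
    # input into curr_dic and, when verbose=="1", dump a per-feature coefficient list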

    t0 = time()
    coef_arr = None

    ml_opts = json.loads(j_str)

    # ML parameters ===================== input ml_opts ==============
    learning_algorithm = None
    try:
        if ml_opts is None:
            ml_opts = json.loads(j_str)
        learning_algorithm = ml_opts['learning_algorithm']
    except Exception as e:
        print "WARNING: load learning_algorithm failed.", e

    # read raw data from .gz file ===================== input .gz ==============
    file_content = None
    try:
        f = gzip.open(input_gz, 'rb')
        file_content = f.read()
        f.close()
    except Exception as e:
        print "ERROR: load data file [" + input_gz + "] failed.", e
        return -5

    # get data here; assume libsvm format
    label_feature_array = None
    feature_array = None
    label = None
    # optional
    sample_info = None
    if not file_content is None:
        label_features = file_content.strip()
        if label_features:
            label_feature_array = label_features.split(' ')
            label = label_feature_array[0]
            # check if 1st item is integer
            int_1st = "y"
            try:
                int(label_feature_array[0])
            except ValueError:
                int_1st = "n"
            # check if 2nd item is integer
            int_2nd = "y"
            try:
                int(label_feature_array[1])
            except ValueError:
                int_2nd = "n"

            if int_1st == "n" and int_2nd == "y":
                feature_array = label_feature_array[2:len(label_feature_array)]
            elif int_1st == "y" and int_2nd == "n":
                feature_array = label_feature_array[1:len(label_feature_array)]

            #feature_array = label_feature_array[1:len(label_feature_array)]
            # if sample_info exists, 2nd item will be digit and 3rd item won't be digit
            if not label.isdigit() and label_feature_array[1].isdigit(
            ) and not label_feature_array[2].isdigit():
                sample_info = label
                label = label_feature_array[1]
                feature_array = label_feature_array[2:len(label_feature_array)]
        else:
            print "ERROR: data format error!"
    else:
        print "ERROR: no data found!"
    #print label  ### -1 means no label
    #print feature_array
    curr_dic = {}
    #print "feature_array=",feature_array
    for features in feature_array:
        if len(features) > 0:
            key, value = features.split(':')
            curr_dic[key] = float(value)
        else:
            print "WARNING: data format error!"

    print "INFO: curr_dic len=", len(curr_dic)

    if curr_dic and verbose == "1":

        #print "INFO: *** Feature list: ===================================="
        # clean up feature file
        out_file = os.path.join(local_out_dir, cid_str + "_feature_list.json")
        print "INFO: feature file=", out_file
        if os.path.exists(out_file):
            try:
                os.remove(out_file)
            except OSError as e:
                print("ERROR: %s - %s." % (e.strerror, out_file))
        out_f = open(out_file, 'a')

        # get coef_arr ==================================
        if coef_arr is None and learning_algorithm not in ('kmeans',):
            key = "coef_arr"  #{"123":"openFile"}
            jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
            jstr_proj = '{"value":1}'
            # each model has its own coef_arr

            doc = query_mongo.find_one(ip_address, port, db_name, tb_name,
                                       username, password, jstr_filter,
                                       jstr_proj)
            coef_arr = doc['value']

        fout_arr = []
        len_coef = len(coef_arr)
        for k, v in curr_dic.items():
            feat_out = {}
            feat_out["ngram"] = ""
            if int(k) < len_coef:
                feat_out["fid"] = k
                feat_out["coef"] = coef_arr[int(k) - 1]
                feat_out["desc"] = ""
            else:
                feat_out["fid"] = "None"
                feat_out["coef"] = 0
                feat_out["desc"] = str(k)

            fout_arr.append(feat_out)

        if len(fout_arr) > 0:
            out_f.write(json.dumps(fout_arr))
        out_f.close()