def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr
    , sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max
    , zipout_dir, zipcode_dir, zip_file_name
    , mongo_tuples, labelnameflag, fromweb, src_filename
    , jobname ): 

    # create zip files for Spark workers ================= ================
    zip_file_path = ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util')
    print "INFO: zip_file_path=",zip_file_path
        
    #data_folder = hdfs_feat_dir + "/"
    #local_out_dir = local_out_dir + "/"
    #if os.path.exists(local_out_dir): 
    #    shutil.rmtree(local_out_dir) # left commented out to keep the sample list file
    if not os.path.exists(local_out_dir):
        os.makedirs(local_out_dir)
            
    # init Spark context ====
    sc=ml_util.ml_get_spark_context(sp_master
        , spark_rdd_compress
        , spark_driver_maxResultSize
        , sp_exe_memory
        , sp_core_max
        , jobname
        , [zip_file_path]) 

    # start here =================================================================== ===============
    t0 = time()
        
    
    ### Need to check if PCA available here ===========================
    libsvm_data_file = os.path.join(hdfs_feat_dir , src_filename) # need to encode the k number in the filename somehow
    print "INFO: libsvm_data_file=", libsvm_data_file
    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file).cache()
    # load sample RDD from text file   
    # format (LabeledPoint,hash) from str2LabeledPoint_hash() 
    feature_count=0
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(sc, libsvm_data_file, feature_count, '')
    
    # get label as a list
    labels_list_all = samples_rdd.map(lambda p: int(p[0].label)).collect()
    total_sample_count=len(labels_list_all)
    parsedData =samples_rdd.map(lambda p: p[0].features).cache()
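    # parsedData: RDD of feature vectors only (labels dropped); this is the clustering input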
    #for i in parsedData.collect(): #p.features: pyspark.mllib.linalg.SparseVector
    #    print "pd=",type(i),",i=",i

    t1 = time()
    print 'INFO: running time: %f' %(t1-t0)
    t0 = t1
    
    ###############################################
    ########## build learning model ###############
    ###############################################
    
    ### get the parameters###
    print "INFO: ============Learning Algorithm and Parameters============="
    para_dict = json.loads(ml_opts_jstr)
    flag_model = para_dict['learning_algorithm'] # kmeans
    iteration_num = eval(para_dict['iterations'])
    k=2
    if 'k' in para_dict:
        k = eval(para_dict['k'])
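    # Illustrative ml_opts_jstr for this path (assumed values, not from the source):
    #   {"learning_algorithm": "kmeans", "iterations": "100", "k": "3"}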

    print "INFO: Learning Algorithm:", flag_model
    print "INFO: iterations=", iteration_num
    #print "training_sample_number=", training_sample_number
    
    ### generate label names (family names) #####
    ### connect to database to get the column list which contains all column number of the corresponding feature####
    if labelnameflag == 1:
        key = "dic_name_label"
        jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
        jstr_proj='{"value":1}'
 
        # get parent dataset's data
        if ds_id != row_id_str:
            jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'
 
        doc=query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
        dic_list = doc['value']
        
        label_dic = {}
        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                label_dic[dic_list[i][key]] = key.encode('UTF8')
        print "INFO: label_dic:", label_dic
    else:
        label_dic = {}
        label_set = set(labels_list_all)
        for label_value in label_set:
            label_dic[int(label_value)] = str(int(label_value))
        print "INFO: generated label_dic:", label_dic 
        
    labels_list = []
    for key in sorted(label_dic):
        labels_list.append(label_dic[key])
    print "INFO: labels_list=", labels_list
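    # labels_list is ordered by numeric label key, e.g. {0:'clean', 1:'dirty'} -> ['clean', 'dirty'] (illustrative names)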
    
    ### build model ###
    
    if flag_model == "kmeans":
        print "=================== Kmeans ============"
        model = KMeans.train(parsedData, k, maxIterations=iteration_num)   
        t_cost= model.computeCost(parsedData)
        print "INFO: cost for training set =", str(t_cost)
        clusterCenters=model.clusterCenters
        print "INFO: clusterCenters t=", type(clusterCenters)  #list
    elif flag_model == "gaussian_mixture_model": # may fail due to a native library issue
        print "=================== Gaussian_Mixture_Model ============"
        model = GaussianMixture.train(parsedData, k, maxIterations=iteration_num)
        print "INFO: model.weights =", model.weights
        # use the Gaussian means as cluster centers so the plotting code below still has clusterCenters
        clusterCenters=[g.mu.toArray() for g in model.gaussians]
    else:
        print "INFO: Training model selection error: no valid ML model selected!"
        return
        
    ### Save model
    save_dir = config.get('app', 'HADOOP_MASTER')+config.get('app', 'HDFS_MODEL_DIR')+'/'+row_id_str
    try:
        hdfs.ls(save_dir)
        #print "find hdfs folder"
        hdfs.rmr(save_dir)
        #print "all files removed"
    except IOError as e:
        print "ERROR: I/O error({0}): {1}".format(e.errno, e.strerror)
    except:
        print "ERROR: Unexpected error:", sys.exc_info()[0] 
    
    print "INFO: model type=",type(model)," model=",model
    model.save(sc, save_dir)
    print "INFO: model saved at hdfs=",save_dir

    ###load model if needed
    #sameModel = KMeansModel.load(sc, save_dir)  # or GaussianMixtureModel.load for the GMM case

    ### 
    # (true label, kmeans label, features list, hash)
    all_data=samples_rdd.map(lambda t: ( t[0].label, model.predict(t[0].features), t[0].features, t[1] ) ).collect() 
    true_label_arr = np.asarray([int(x) for x,_,_,_ in all_data])
    labels_kmeans = np.asarray([int(x) for _,x,_,_ in all_data])
    hash_list = np.asarray([x for _,_,_,x in all_data])
    print "INFO: all_data len=",len(all_data),"all_data t=",type(all_data)
    print "INFO: true_label_arr.shape=",true_label_arr.shape,"labels_kmeans.shape=",labels_kmeans.shape
    print "INFO: true_label_arr t=",type(true_label_arr),"labels_kmeans t=",type(labels_kmeans)
    mtx_center=np.asarray(clusterCenters)
    features_array_reduced=np.asarray([x.toArray() for _,_,x,_ in all_data])
    print "INFO: mtx_center t=",type(mtx_center),"mtx_center.shape=",mtx_center.shape
    print "INFO: features_array_reduced t=",type(features_array_reduced),"features_array_reduced.shape",features_array_reduced.shape

    #Adjusted Mutual Information between two clusterings
    amis=adjusted_mutual_info_score(labels_list_all,labels_kmeans)
    print "INFO: Adjusted_mutual_info_score=", amis  
    #Similarity measure between two clusterings
    ars=adjusted_rand_score(labels_list_all,labels_kmeans)
    print "INFO: Adjusted_rand_score=", ars   
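    # both adjusted scores are close to 1.0 when clusters match the true labels and near 0.0 for random assignment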

    
    accuracy=0.0
   
    t1 = time()
    print 'INFO: training run time: %f' %(t1-t0)
    t0 = t1

    ###############################################
    ########## plot histogram               ######
    ###############################################
    n_clusters=k
    plot_col_num = int(math.ceil(math.sqrt(n_clusters)))
    figsize = (4*plot_col_num, 3*int(math.ceil(n_clusters*1.0/plot_col_num)))
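    # e.g. k=4 -> plot_col_num=2, figsize=(8, 6)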
    

    print "INFO: n_clusters=",n_clusters,",label_dic=",label_dic
    print "INFO: plot_col_num=",plot_col_num,",figsize=",figsize,",local_out_dir=",local_out_dir
    
    # kmeans histogram
    _, p_true = ml_plot_kmeans_histogram_subfigures(true_label_arr, labels_kmeans, n_clusters, names = label_dic
                        , plot_col_num = plot_col_num, figsize=figsize, folder = local_out_dir, rid=row_id_str)
    # normalized kmeans histogram
    _, p_true_norm = ml_plot_kmeans_histogram_subfigures(true_label_arr, labels_kmeans, n_clusters, names = label_dic
                        , plot_col_num = plot_col_num, figsize=figsize, normalize = True, folder = local_out_dir, rid=row_id_str)
    

    ####plot "reverse" histogram with labels ####
    num_bars = max(true_label_arr) + 1
    figsize = (4*plot_col_num, 3*int(math.ceil(num_bars*1.0/plot_col_num)))
    
    _, p_cluster = ml_plot_kmeans_histogram_subfigures(labels_kmeans, true_label_arr, num_bars, names = label_dic
                        , plot_col_num = plot_col_num, figsize=figsize, reverse = True, folder = local_out_dir, rid=row_id_str)


    #### plot dot figures ####
    # dot plot for Kmeans   ===========
    filename=os.path.join(local_out_dir ,row_id_str+'_cluster.png')   
    filename_3d=os.path.join(local_out_dir ,row_id_str+'_cluster_3d.json')  
    ml_plot_kmeans_dot_graph_save_file(features_array_reduced, labels_kmeans, mtx_center, n_clusters, figsize=(10,7), filename=filename
        , title='KMeans', filename_3d=filename_3d)
        
    # dot plot for True Labels  ===========
    filename=os.path.join(local_out_dir ,row_id_str+'_cluster_tl.png')      
    filename_3d=os.path.join(local_out_dir ,row_id_str+'_cluster_3d_tl.json')  
    ml_plot_kmeans_dot_graph_save_file(features_array_reduced, true_label_arr, mtx_center, n_clusters, figsize=(10,7), filename=filename
        , title='True Labels', filename_3d=filename_3d)

    dataset_info={"training_fraction":1, "class_count":n_clusters,"dataset_count":total_sample_count}
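    # e.g. {"training_fraction": 1, "class_count": 3, "dataset_count": 10000} (illustrative counts)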
    
    # only update db for web request
    if fromweb=="1": 
        #print "database update"
        str_sql="UPDATE atdml_document set "+"accuracy = '" \
            +"', status = 'learned', processed_date ='"+str(datetime.datetime.now()) \
            +"', total_feature_numb='"+str(feature_count) \
            +"', perf_measures='{}" \
            +"', dataset_info='"+json.dumps(dataset_info) \
            +"' where id="+row_id_str
        ret=exec_sqlite.exec_sql(str_sql)
        print "INFO: Data update done! ret=", str(ret)
    else:
        print "INFO: accuracy = "+str(accuracy*100)+"%"

    
    print 'INFO: Finished!'
    return 0
def train(row_id_str,
          ds_id,
          hdfs_feat_dir,
          local_out_dir,
          ml_opts_jstr,
          excluded_feat_cslist,
          sp_master,
          spark_rdd_compress,
          spark_driver_maxResultSize,
          sp_exe_memory,
          sp_core_max,
          zipout_dir,
          zipcode_dir,
          zip_file_name,
          mongo_tuples,
          labelnameflag,
          fromweb,
          training_fraction,
          jobname,
          model_data_folder,
          random_seed=None):

    # create zip files for Spark workers ================= ================
    zip_file_path = ml_build_zip_file(zipout_dir,
                                      zipcode_dir,
                                      zip_file_name,
                                      prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    # ML model filename ====
    model_fname = os.path.join(model_data_folder, row_id_str + '.pkl')
    print "INFO: model_data_folder=", model_data_folder
    # create out folders and clean up old model files ====
    ml_util.ml_prepare_output_dirs(row_id_str, local_out_dir,
                                   model_data_folder, model_fname)

    # init Spark context ====
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname,
                                      [zip_file_path])

    # start here =================================================================== ===============
    t0 = time()

    # check if ml_opts.has_excluded_feat ==1 ===================================
    has_excluded_feat = 0
    if not ml_opts_jstr is None:
        ml_opts = json.loads(ml_opts_jstr)
        if "has_excluded_feat" in ml_opts:
            has_excluded_feat = ml_opts["has_excluded_feat"]

    # get excluded feature list from mongo ========== ===
    if str(has_excluded_feat) == "1" and excluded_feat_cslist is None:
        excluded_feat_cslist = ml_util.ml_get_excluded_feat(
            row_id_str, mongo_tuples)
    print "INFO: excluded_feat_cslist=", excluded_feat_cslist

    # source libsvm filename
    libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data")
    print "INFO: libsvm_data_file=", libsvm_data_file

    # load feature count file
    feat_count_file = libsvm_data_file + "_feat_count"
    feature_count = zip_feature_util.get_feature_count(sc, feat_count_file)
    print "INFO: feature_count=", feature_count

    # load sample RDD from text file
    #   also exclude selected features in sample ================ =====
    # format (LabeledPoint,hash) from str2LabeledPoint_hash()
    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file)
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(
        sc, libsvm_data_file, feature_count, excluded_feat_cslist)

    # collect all data to local for processing ===============
    all_data = samples_rdd.collect()
    sample_count = len(all_data)

    if not random_seed is None and int(random_seed) > 0:
        np.random.seed(int(random_seed))
        all_data = sorted(all_data, key=lambda x: x[1])
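        # sorting by hash gives a deterministic sample order, so the split below is reproducible for a given seed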

    # 2-D array
    features_list = [x.features.toArray() for x, _ in all_data]
    # label array
    labels_list_all = [x.label for x, _ in all_data]
    # hash array
    hash_list_all = [x for _, x in all_data]

    # convert to np array
    labels_list_all = array(labels_list_all)
    features_array = np.array(features_list)
    hash_list_all = np.array(hash_list_all)
    #print "features_list=",features_list

    # generate sparse matrix (csr) for all samples
    features_sparse_mtx = csr_matrix(features_array)

    # if ensemble is on, do special split here
    ### randomly split the samples into training and testing data ===============
    X_train_sparse, X_test_sparse, labels_train, labels_test, train_hash_list, test_hash_list = \
            cross_validation.train_test_split(features_sparse_mtx, labels_list_all, hash_list_all, test_size=(1-training_fraction) )
    # X_test_sparse is scipy.sparse.csr.csr_matrix
    testing_sample_count = len(labels_test)
    training_sample_count = len(labels_train)
    training_lbl_cnt_list = Counter(labels_train)
    testing_lbl_cnt_list = Counter(labels_test)
    print "INFO: training sample count=", training_sample_count, ", testing sample count=", testing_sample_count, ",sample_count=", sample_count
    print "INFO: training label list=", training_lbl_cnt_list, ", testing label list=", testing_lbl_cnt_list
    print "INFO: train_hash_list count=", len(
        train_hash_list), ", test_hash_list count=", len(test_hash_list)

    # random_seed testing
    if not random_seed is None:
        cnt = 0
        for i in train_hash_list:
            print i
            cnt = cnt + 1
            if cnt > 3:
                break

    #print "INFO: labels_list_all=",labels_list_all # too big
    t1 = time()
    print 'INFO: running time: %f' % (t1 - t0)

    ###############################################
    ###########build learning model ==================================================== ===============
    ###############################################

    ### parse parameters and generate the model ###
    (clf, model_name) = parse_param_and_get_model(ml_opts)
    if model_name == "none":
        print "ERROR: model name not found!"
        return -1

    #####fit the model to training dataset  ===============
    try:
        clf.fit(X_train_sparse, labels_train)
    except:
        print "ERROR: clf.fit(): clf=", clf
        print "ERROR: sys.exc_info:", sys.exc_info()[0]
        return -1

    print "INFO: model type=", type(clf), " clf=", clf
    #### save clf for future use ================== ===============
    joblib.dump(clf, model_fname)

    # get data from model ================================
    coef = None
    intercept = None
    # get column size =====
    try:
        if type(clf) in (classes.SVC, classes.NuSVC):  # SVC/NuSVC expose coef_ only with a linear kernel
            col_num = clf.support_vectors_.shape[1]
        else:  #linear only
            # coef_ is only available when using a linear kernel
            col_num = len(clf.coef_[0])
            coef = clf.coef_[0]
            intercept = clf.intercept_[0]  # only get 1st item?
            #print "**model:clf.coef_[0] =",clf.coef_[0]
            # save coef_ to Mongo
    except Exception as e:
        print "WARNING: Can't get clf.coef_[0]. e=", e, ", get total features from meta-data"
        col_num = 0  #how to get feature number for sparse array?
    print "INFO: total feature # in the model: ", col_num

    jfeat_coef_dict = {}
    # create feature coefficient file ================================
    if coef is None:
        print "WARNING: model weights not found!"
    else:
        feat_filename = os.path.join(local_out_dir,
                                     row_id_str + "_feat_coef.json")
        print "INFO: feat_filename=", feat_filename
        # save coef_arr to mongo ===
        #jfeat_coef_dict=save_coef2db(row_id_str, mongo_tuples, coef, intercept, feat_filename, ds_id)
        jfeat_coef_dict = ml_util.ml_save_coef_build_feat_coef(
            row_id_str, mongo_tuples, coef, intercept, feat_filename, ds_id)
    #print "INFO: jfeat_coef_dict=", jfeat_coef_dict
    print "INFO: jfeat_coef_dict len=", len(jfeat_coef_dict)

    ### Evaluating the model on testing dataset  ===============
    labels_pred = clf.predict(X_test_sparse)
    accuracy = clf.score(X_test_sparse, labels_test)
    print "INFO: Accuracy = ", accuracy

    # filename for false prediction samples  ===============
    false_pred_fname = os.path.join(local_out_dir,
                                    row_id_str + "_false_pred.json")
    print "INFO: false_pred_fname=", false_pred_fname
    # build files for false pred & score graph
    (score_arr_0, score_arr_1, max_score,
     min_score) = ml_build_false_pred(X_test_sparse,
                                      coef,
                                      intercept,
                                      labels_test,
                                      labels_pred,
                                      test_hash_list,
                                      model_name,
                                      jfeat_coef_dict,
                                      false_pred_fname,
                                      row_id_str=row_id_str,
                                      ds_id=ds_id,
                                      mongo_tuples=mongo_tuples)

    # save pred output
    pred_out_arr = []
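    # each entry: (true_label, predicted_label, sample_hash)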
    for i in range(0, len(labels_test)):
        pred_out_arr.append(
            (labels_test[i], labels_pred[i], test_hash_list[i]))
    pred_ofname = os.path.join(local_out_dir, row_id_str + "_pred_output.pkl")
    print "INFO: pred_ofname=", pred_ofname
    ml_util.ml_pickle_save(pred_out_arr, pred_ofname)

    ###################################################
    ### generate label names (family names) ==================================================== ===============
    ###################################################
    if labelnameflag == 1:
        label_dic = ml_util.ml_get_label_dict(row_id_str, mongo_tuples, ds_id)
        print "INFO: label_dic =", label_dic
    else:
        label_dic = {}
        label_set = set(labels_list_all)
        for label_value in label_set:
            label_dic[int(label_value)] = str(int(label_value))
        print "INFO: label_dic=", label_dic

    labels_list = []
    for key in sorted(label_dic):
        labels_list.append(label_dic[key])

    ###############################################
    ###########plot prediction result figures ==================================================== ===============
    ###############################################
    pred_fname = os.path.join(local_out_dir, row_id_str + "_1" + ".png")
    true_fname = os.path.join(local_out_dir, row_id_str + "_2" + ".png")
    pred_xlabel = 'Prediction (Single Run)'
    true_xlabel = 'True Labels (Single Run)'
    test_cnt_dic = ml_util.ml_plot_predict_figures(labels_pred.tolist(),
                                                   labels_test.tolist(),
                                                   labels_list, label_dic,
                                                   testing_sample_count,
                                                   pred_xlabel, pred_fname,
                                                   true_xlabel, true_fname)
    print "INFO: figure files: ", pred_fname, true_fname
    #print "INFO: Number of samples in each label is=", test_cnt_dic

    roc_auc = None
    #fscore=None
    perf_measures = None
    class_count = len(labels_list)
    dataset_info = {
        "training_fraction": training_fraction,
        "class_count": class_count,
        "dataset_count": sample_count
    }
    #############################################################
    ###################for 2 class only (plot ROC curve) ==================================================== ===============
    #############################################################
    if class_count == 2:

        # build data file for score graph
        score_graph_fname = os.path.join(local_out_dir,
                                         row_id_str + "_score_graph.json")
        print "INFO: score_graph_fname=", score_graph_fname
        ml_build_pred_score_graph(score_arr_0, score_arr_1, model_name,
                                  score_graph_fname, max_score, min_score)

        do_ROC = True
        # clean is 0; dirty is 1
        reverse_label_dic = dict((v, k) for k, v in label_dic.items())
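        # e.g. label_dic {0:'clean', 1:'dirty'} -> reverse_label_dic {'clean':0, 'dirty':1} (illustrative)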
        if 'clean' in reverse_label_dic:
            flag_clean = reverse_label_dic['clean']
        elif 'benign' in reverse_label_dic:
            flag_clean = reverse_label_dic['benign']
        elif '0' in reverse_label_dic:
            flag_clean = 0
        else:
            print "WARNING: No ROC curve generated: 'clean' or '0' must be a label for indicating negative class!"
            do_ROC = False

        if do_ROC:
            # calculate fscore  ==========
            perf_measures = ml_util.calculate_fscore(labels_test, labels_pred)
            #fscore=perf_measures["fscore"]
            #acc=perf_measures["accuracy"]
            #phi=perf_measures["phi"]
            print "INFO: perf_measures=", perf_measures

            confidence_score = clf.decision_function(X_test_sparse)
            #print "INFO:confidence_score=",confidence_score
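            # orient scores/labels so the 'clean' (negative) class maps to 0 before building the ROC data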

            if flag_clean == 0:
                scores = [x for x in confidence_score]
                s_labels = [x for x in labels_test]
                testing_N = test_cnt_dic[0]
                testing_P = test_cnt_dic[1]
            else:
                scores = [-x for x in confidence_score]
                s_labels = [1 - x for x in labels_test]
                testing_N = test_cnt_dic[1]
                testing_P = test_cnt_dic[0]

            # create ROC data file ======== ====
            roc_auc = ml_create_roc_files(row_id_str, scores, s_labels,
                                          testing_N, testing_P, local_out_dir,
                                          row_id_str)

            perf_measures["roc_auc"] = roc_auc

    # only update db for web request ==================================================== ===============
    if fromweb == "1":
        #print "database update"
        str_sql="UPDATE atdml_document set "+"accuracy = '"+str(accuracy*100)+"%" \
            +"', status = 'learned', processed_date ='"+str(datetime.datetime.now()) \
            +"', perf_measures='"+json.dumps(perf_measures) \
            +"', dataset_info='"+json.dumps(dataset_info) \
            +"' where id="+row_id_str
        ret = exec_sqlite.exec_sql(str_sql)
        print "INFO: Sqlite update done! ret=", str(ret)
    else:
        print "INFO: accuracy = " + str(accuracy * 100) + "%"

    t1 = time()
    print 'INFO: running time: %f' % (t1 - t0)

    print 'INFO: Train Finished!'
    return 0
def save_inherited_ds(row_id_str,
                      hdfs_src_dir,
                      hdfs_src_fname,
                      hdfs_out_dir,
                      hdfs_out_fname,
                      sp_master=config.get('spark', 'spark_master'),
                      spark_rdd_compress=config.get('spark',
                                                    'spark_rdd_compress'),
                      spark_driver_maxResultSize=config.get(
                          'spark', 'spark_driver_maxResultSize'),
                      sp_exe_memory=config.get('spark',
                                               'spark_executor_memory'),
                      sp_core_max=config.get('spark', 'spark_cores_max'),
                      jobname='extract_dataset:',
                      join_field_name='md5',
                      zfname=None,
                      tblname='',
                      sql_script='',
                      hash_list=None):

    if zfname is not None and len(zfname) > 0:
        print "INFO: zip filename=", os.path.basename(zfname)
        hash_list = get_hash_list(zfname)

    if hash_list is None or len(hash_list) == 0:
        print "ERROR: hash list is required!"
        return -1

    if not row_id_str in jobname:
        jobname = jobname + row_id_str

    #hash_list=[ ('2','a'),('3','b'),('99','xx'),('101','yy') ]
    print 'INFO: hash_list len=', len(hash_list), ", type=", type(hash_list)

    # get_spark_context
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname)

    t0 = time()
    sqlCtx = SQLContext(sc)
    # convert input list into a DF

    # TBD hardcode "_label_"
    sch = StructType([
        StructField(join_field_name, StringType(), True),
        StructField("_label_", StringType(), True)
    ])
    plist = sc.parallelize(hash_list)
    list_df = sqlCtx.createDataFrame(plist, sch)

    # debug only
    #list_df.show()

    # source filename
    src_fname = os.path.join(hdfs_src_dir, hdfs_src_fname)
    print "INFO: src fname for source dataset=", os.path.basename(
        src_fname), ",join_field_name=", join_field_name

    src_df = sqlCtx.read.parquet(src_fname)
    #src_df.registerTempTable('pTbl')

    # get intersect DF
    src_df_rt = src_df.join(
        list_df, src_df[join_field_name] == list_df[join_field_name],
        'right_outer').cache()

    # existing data for reuse:
    existing_df = src_df_rt.select(src_df['*']).where(
        src_df[join_field_name].isNotNull()).cache()

    count_df = existing_df.count()
    print "INFO: existing_df count=", count_df
    ''' debug only
    print "INFO: src_df="
    src_df.show()
    print "INFO: existing_df="
    existing_df.show()
    print "INFO: src_df_rt="
    src_df_rt.show()
    '''

    # generate out folder, clean up if needed
    try:
        hdfs.mkdir(hdfs_out_dir)
    except:
        e = sys.exc_info()[0]
        print "WARNING: ", e
    # output hdfs file name
    out_fname = os.path.join(hdfs_out_dir, hdfs_out_fname)
    print "INFO: save data from inherited dataset to file=", os.path.basename(
        out_fname)
    # clean up existing hdfs file
    try:
        hdfs.rmr(out_fname)
    except:
        e = sys.exc_info()[0]
        print "WARNING: ", e

    print "INFO: tbl=", tblname, ",sql=", sql_script

    # convert to sql table
    existing_df.registerTempTable(tblname)
    df2 = sqlCtx.sql(sql_script)
    # save as parquet
    df2.write.parquet(out_fname)

    #df2.printSchema()

    new_list = []
    # new list of entries not present in the source DF, for new retrieval
    row_list = src_df_rt.select(list_df['*'], src_df[join_field_name]).where(
        src_df[join_field_name].isNull()).collect()
    new_list = [(i[0], i[1]) for i in row_list]
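    # each tuple: (join value, label) for samples not found in the source dataset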
    #print "row_list=",row_list
    #print "INFO: new_list len=",len(new_list)
    ''' debug only
    test4=sqlCtx.read.parquet(out_fname)
    print "reload parquet="
    test4.show()
    '''
    t1 = time()
    print 'INFO: running time: %f' % (t1 - t0)
    print 'INFO: save_inherited_ds finished!'

    return new_list
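# Illustrative call (assumed paths and values, not from this snippet):
#   new_list = save_inherited_ds('123', '/hdfs/src_dir', 'src.parquet', '/hdfs/out_dir', 'out.parquet',
#       tblname='srcTbl', sql_script='select * from srcTbl',
#       hash_list=[('2', 'a'), ('3', 'b')])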
def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr, excluded_feat_cslist
    , sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max
    , zipout_dir, zipcode_dir, zip_file_name
    , mongo_tuples, labelnameflag, fromweb
    , training_fraction, jobname, model_data_folder ): 
    

    # zip func in other files for Spark workers ================= ================
    zip_file_path = ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util')
    print "INFO: zip_file_path=",zip_file_path
    

    # ML model filename ====
    model_fname=os.path.join(model_data_folder, row_id_str+'.pkl')
    print "INFO: model_data_folder=",model_data_folder    
    # create out folders and clean up old model files ====
    ml_util.ml_prepare_output_dirs(row_id_str,local_out_dir,model_data_folder,model_fname)   

    # init Spark context ====
    sc=ml_util.ml_get_spark_context(sp_master
        , spark_rdd_compress
        , spark_driver_maxResultSize
        , sp_exe_memory
        , sp_core_max
        , jobname
        , [zip_file_path]) 

    
    t0 = time()
    t00 = t0
    
    # check if ml_opts.has_excluded_feat ==1 ===================================
    has_excluded_feat=0
    if not ml_opts_jstr is None:
        ml_opts=json.loads(ml_opts_jstr)
        if "has_excluded_feat" in ml_opts:
            has_excluded_feat=ml_opts["has_excluded_feat"]

    # get excluded feature list from mongo ========== ===
    if str(has_excluded_feat) == "1" and excluded_feat_cslist is None:
        excluded_feat_cslist=ml_util.ml_get_excluded_feat(row_id_str, mongo_tuples)
    print "INFO: excluded_feat_cslist=",excluded_feat_cslist
            
    # source libsvm filename  
    libsvm_data_file = os.path.join(hdfs_feat_dir , "libsvm_data")
    print "INFO: libsvm_data_file=", libsvm_data_file

    # load feature count file
    feat_count_file=libsvm_data_file+"_feat_count"
    feature_count=zip_feature_util.get_feature_count(sc,feat_count_file)
    print "INFO: feature_count=",feature_count

    
    # load sample RDD from text file   
    #   also exclude selected features in sample ================ =====
    # format (LabeledPoint,hash) from str2LabeledPoint_hash() 
    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file)
    samples_rdd,feature_count = zip_feature_util.get_sample_rdd(sc, libsvm_data_file, feature_count, excluded_feat_cslist)

    all_data = samples_rdd.collect()
    sample_count=len(all_data)
    # 2-D array
    features_list = [x.features.toArray() for x,_ in all_data]
    # label array
    labels_list_all = [x.label for x,_ in all_data]
    # hash array
    hash_list_all = [x for _,x in all_data]

    # convert to np array
    labels_list_all = array(labels_list_all)
    features_array = np.array(features_list)
    hash_list_all=np.array(hash_list_all)
    
    # generate sparse matrix (csr) for all samples
    features_sparse_mtx = csr_matrix(features_array)

    ### randomly split the samples into training and testing data ===============
    X_train_sparse, X_test_sparse, labels_train, labels_test, train_hash_list, test_hash_list = \
            cross_validation.train_test_split(features_sparse_mtx, labels_list_all, hash_list_all, test_size=(1-training_fraction) )
    # X_test_sparse is scipy.sparse.csr.csr_matrix
    testing_sample_count = len(labels_test)
    training_sample_count=len(labels_train)
    training_lbl_cnt_list=Counter(labels_train)
    testing_lbl_cnt_list=Counter(labels_test)
    
    print "INFO: training sample count=",training_sample_count,", testing sample count=",testing_sample_count,",sample_count=",sample_count
    print "INFO: training label list=",training_lbl_cnt_list,", testing label list=",testing_lbl_cnt_list
    print "INFO: train_hash_list count=",len(train_hash_list),", test_hash_list count=",len(test_hash_list)
    t1 = time()
    print 'INFO: running time: %f' %(t1-t0)
    
    ###############################################
    ###########build learning model################
    ###############################################
    
    ### parse parameters and generate the model ###
    (clf, model_name, api, cv, param_dic) = parse_param_and_get_model(ml_opts)
    if model_name == "none":
        print "ERROR: model name not found!"
        return -1

    #param_jobj=json.loads(ml_opts_jstr);
    #print "param_jobj=",param_jobj
        
    ########################################################
    ##########Grid Search with cross validation#############
    ########################################################    
    json2save={}
    json2save["rid"]=int(row_id_str)
    json2save["key"]="cv_result"
    #json2save["param_str"]=ml_opts_jstr
    json2save["param_dic"]=param_dic
    cv_grid=[]
    if api == "centralized":
        #########run with Scikit-learn API (for comparison)######
        print "INFO: ******************Grid Search with Scikit-learn API************"

        t0 = time()
        
        # Set the parameters by cross-validation
        #tuned_parameters = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}]
        #tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], \
        #                 'C': [1, 10, 100, 1000]}, \
        #                {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

        scores = ['accuracy']
        json2save["scores"]=scores
        #print json2save
        
        for score in scores: # for one item only? score=accuracy
            print("INFO: # Tuning hyper-parameters for %s" % score)
            #print()

            grid = grid_search.GridSearchCV(estimator = clf, param_grid = param_dic, cv=cv, scoring= score)
            grid.fit(X_train_sparse, labels_train)
            
            print "INFO: Best parameters set found on development set:"
            print "INFO: grid.best_params_=",grid.best_params_
            print "INFO: Grid scores on development set:" 
            for key in grid.best_params_:
                print "INFO: best_params["+key+"]=", grid.best_params_[key]
                if key.lower()=="regtype":
                    ml_opts['regularization']=str(grid.best_params_[key]) # add best param to ml_opts
                else:
                    ml_opts[key.lower()]=str(grid.best_params_[key]) # add best param to ml_opts
            # save best params to db as a json string
            j_str=json.dumps(ml_opts)
            json2save["param_str"]=j_str
            print "INFO: grid_scores_ with params:"
            for params, mean_score, scores in grid.grid_scores_:
                print "INFO: %0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params)
                #outstr='%s,%0.3f,%0.03f,%s' % (params,mean_score, scores.std() * 2,"Selected" if params==grid.best_params_ else "")
                outj={}
                outj["param"]=params
                outj["average_accuracy"]="%0.3f" % (mean_score)
                outj["std_deviation"]="%0.3f" % (scores.std() * 2)
                outj["selected"]="%s" % ("Selected" if params==grid.best_params_ else "")
                
                cv_grid.append(outj)
        
        clf_best = grid.best_estimator_
        t1 = time()
        ############# END run with SKlearn ######
        print 'INFO: Grid Search with SKlearn running time: %f' %(t1-t0)
        t0 = time()
    else:
    
        #############run with SPARK######
        
        print "INFO: ******************Grid Search with SPARK************"
            
        all_comb_list_of_dic = get_all_combination_list_of_dic(param_dic) 
        print "INFO: Total number of searching combinations=", len(all_comb_list_of_dic) 
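        # e.g. param_dic {'C': [1, 10], 'regType': ['l1', 'l2']} expands to 4 combination dicts (illustrative params)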
        #print "all_comb_list_of_dic: ", all_comb_list_of_dic
        params_rdd = sc.parallelize(all_comb_list_of_dic)
        
        ### broadcast clf, training data and testing data to all workers ###
        X_broadcast = sc.broadcast(X_train_sparse)
        y_broadcast = sc.broadcast(labels_train)
        clf_broadcast = sc.broadcast(clf)
        
        ### Grid Search with CV in multiple workers ###
        models = params_rdd.map(lambda x: learn_with_params(clf_broadcast.value, X_broadcast.value, y_broadcast.value, cv, x)).sortByKey(ascending = False).cache()
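        # each element: (average_cv_accuracy, (fitted_estimator, param_dict, std2)); sortByKey(ascending=False) puts the best accuracy first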
        
        (ave_accuracy, (clf_best, p_dic_best, std2))  = models.first()
        # output results #

        print "INFO: Best parameters set found for ", model_name, " is: "
        print "INFO: ",
        for key in p_dic_best:
            print key, " = ", p_dic_best[key],
            if key.lower()=="regtype":
                ml_opts['regularization']=str(p_dic_best[key]) 
            else:
                ml_opts[key.lower()]=str(p_dic_best[key]) # add best param to ml_opts
        print ""
        # save best params to db as a json string
        j_str=json.dumps(ml_opts)
        json2save["param_str"]=j_str

        print "INFO: Average accuracy with CV = ", cv, ": ", ave_accuracy
        
        ######## print complete report #######
        print "INFO: Grid scores on development set:"
        all_results = models.collect()
        for i in range(0, len(all_results)):
            (ave_accu_i, (clf_i, p_dic_i, std2_i)) = all_results[i]
            print "INFO: ",ave_accu_i, " for ", p_dic_i
            print "INFO: %0.3f (+/-%0.03f) for " % (ave_accu_i, std2_i), p_dic_i
            #outstr='%s,%0.3f,%0.03f,%s' % ( p_dic_i, ave_accu_i, std2_i, "Selected" if p_dic_i==p_dic_best else "")
            outj={}
            outj["param"]=p_dic_i
            outj["average_accuracy"]="%0.3f" % (ave_accu_i)
            outj["std_deviation"]="%0.3f" % (std2_i)
            outj["selected"]="%s" % ("Selected" if p_dic_i==p_dic_best else "")
            
            cv_grid.append(outj)
        print " "
        
        t1 = time()
        
        ############# END run with SPARK######
        print 'INFO: Grid search with SPARK running time: %f' %(t1-t0)
    
    ##################################################################################
    #print "cv_grid=",cv_grid
    #json2save["cv_grid_title"]='param,average_accuracy,std_deviation,selected' 
    json2save["cv_grid_data"]=cv_grid
    json2save['clf_best']=str(clf_best).replace("\n","").replace("    ","")
    cv_result=json.dumps(json2save)
    #print "INFO: cv_result=",cv_result
    filter='{"rid":'+row_id_str+',"key":"cv_result"}'
    upsert_flag=True
    ## write to mongoDB.myml.dataset_info, ignore doc with duplicated key
    # db.dataset_info.createIndex({"rid":1,"key":1},{unique:true})
    ret=query_mongo.upsert_doc_t(mongo_tuples,filter,cv_result,upsert_flag)
    print "INFO: Upsert count for cv_result: ret=",ret
 
    ##################################################################################
    ##########Retrain with best model for training set and output results#############
    ##################################################################################
    print "INFO: **********Retrain with best model for training set and output results************"
    
    clf_best.fit(X_train_sparse, labels_train)
    #### save clf_best for future use ####
    #joblib.dump(clf_best, model_data_folder + row_id_str+'.pkl')
    joblib.dump(clf_best, model_fname) 
    
    ### Evaluating the model on testing data
    labels_pred = clf_best.predict(X_test_sparse)
    accuracy = clf_best.score(X_test_sparse, labels_test)
    print "INFO: Accuracy = ", accuracy
    
    
    ######################################the rest of the code is the same as train_sklearn.py (replace clf with clf_best)#####################################################################
    clf=clf_best
    print "INFO: model type=",type(clf)," clf=",clf

    # get data from model ================================
    coef=None
    intercept=None
    try:
        if type(clf) in ( classes.SVC , classes.NuSVC) : # SVC/NuSVC expose coef_ only with a linear kernel
            col_num=clf.support_vectors_.shape[1]
        else: #linear only
            # coef_ is only available when using a linear kernel
            col_num = len(clf.coef_[0])
            coef=clf.coef_[0]
            intercept=clf.intercept_[0] # only get 1st item?
            #print "**model:clf.coef_[0] =",clf.coef_[0]
    except Exception as e:
        print "WARNING: Can't get clf.coef_[0]. e=",e,", get total features from meta-data"
        col_num = 0 #how to get feature number for sparse array? 
    print "INFO: total feature # in the model: ", col_num

    jfeat_coef_dict={}
    # create feature coefficient file ================================
    if coef is None:
        print "WARNING: model weights not found!"    
    else:
        feat_filename=os.path.join(local_out_dir,row_id_str+"_feat_coef.json")
        print "INFO: feat_filename=",feat_filename
        # save coef_arr to mongo & create jfeat_coef_dict===
        jfeat_coef_dict=ml_util.ml_save_coef_build_feat_coef(row_id_str, mongo_tuples, coef, intercept, feat_filename, ds_id)
    #print "INFO: jfeat_coef_dict=", jfeat_coef_dict
    print "INFO: jfeat_coef_dict len=", len(jfeat_coef_dict )


    # filename for false pred 
    false_pred_fname=os.path.join(local_out_dir,row_id_str+"_false_pred.json")
    print "INFO: false_pred_fname=", false_pred_fname

    # build files for false pred & score graph
    (score_arr_0, score_arr_1, max_score,min_score)=ml_build_false_pred(X_test_sparse,coef,intercept
        , labels_test, labels_pred, test_hash_list, model_name, jfeat_coef_dict, false_pred_fname) 

    # save pred output
    pred_out_arr=[]
    for i in range(0,len(labels_test)):
        pred_out_arr.append((labels_test[i], labels_pred[i], test_hash_list[i]))
    pred_ofname=os.path.join(local_out_dir,row_id_str+"_pred_output.pkl")
    print "INFO: pred_ofname=", pred_ofname
    ml_util.ml_pickle_save(pred_out_arr,pred_ofname)
    
    ###################################################
    ### generate label names (family names) ###########
    ### connect to database to get the column list which contains all column number of the corresponding feature####
    ###################################################
    
    if labelnameflag == 1:
        key = "dic_name_label"
        jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
        jstr_proj='{"value":1}'

        # get parent dataset's data
        if ds_id != row_id_str:
            jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'
        
        doc=query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
        dic_list = doc['value']
        
        label_dic = {}
        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                label_dic[dic_list[i][key]] = key.encode('UTF8')
        print "INFO: label_dic:", label_dic
    else:
        label_dic = {}
        label_set = set(labels_list_all)
        for label_value in label_set:
            label_dic[int(label_value)] = str(int(label_value))
        print "INFO: ******generated label_dic:", label_dic 
    
    labels_list = []
    for key in sorted(label_dic):
        labels_list.append(label_dic[key])
    
    ### generate sample numbers of each family in testing data###
    testing_sample_number = len(labels_test)
    print "INFO: testing_sample_number=", testing_sample_number
    test_cnt_dic = {}
    for key in label_dic:
        test_cnt_dic[key] = 0
    for i in range (0, testing_sample_number):
        for key in label_dic:
            if labels_test[i] == key:
                test_cnt_dic[key] = test_cnt_dic[key] + 1
    print "INFO: Number of samples in each label is=", test_cnt_dic
    
    ###############################################
    ###########plot prediction result figure#######
    ###############################################
    pred_fname=os.path.join(local_out_dir,row_id_str+"_1"+".png")
    true_fname=os.path.join(local_out_dir,row_id_str+"_2"+".png")
    pred_xlabel='Prediction (Single Run)'
    true_xlabel='True Labels (Single Run)'
    test_cnt_dic=ml_util.ml_plot_predict_figures(labels_pred.tolist(), labels_test.tolist(), labels_list, label_dic, testing_sample_count 
        , pred_xlabel, pred_fname, true_xlabel, true_fname)
    print "INFO: figure files: ", pred_fname, true_fname
    print "INFO: Number of samples in each label is=", test_cnt_dic

    roc_auc=None
    #fscore=None 
    perf_measures=None
    class_count=len(labels_list)
    dataset_info={"training_fraction":training_fraction, "class_count":class_count,"dataset_count":sample_count}
    #############################################################
    ###################for 2 class only (plot ROC curve)#########
    #############################################################
    if len(labels_list) == 2:

        # build data file for score graph
        score_graph_fname=os.path.join(local_out_dir,row_id_str+"_score_graph.json")
        print "INFO: score_graph_fname=", score_graph_fname
        ml_build_pred_score_graph(score_arr_0,score_arr_1,model_name, score_graph_fname,max_score,min_score)

            
        do_ROC=True
        reverse_label_dic = dict((v,k) for k, v in label_dic.items())
        if 'clean' in reverse_label_dic:
            flag_clean = reverse_label_dic['clean']
        elif 'benign' in reverse_label_dic:
            flag_clean = reverse_label_dic['benign']
        elif '0' in reverse_label_dic:
            flag_clean = 0
        else:
            print "WARNING: No ROC curve generated: 'clean' or '0' must be a label for indicating negative class!"
            do_ROC=False
            
        if do_ROC:
            # calculate fscore  ==========
            perf_measures=ml_util.calculate_fscore(labels_test, labels_pred)
            print "INFO: perf_measures=",perf_measures
            
            confidence_score = clf_best.decision_function(X_test_sparse)
                    
            if flag_clean == 0:
                scores = [x for x in confidence_score]
                s_labels = [x for x in labels_test]
                testing_N = test_cnt_dic[0]
                testing_P = test_cnt_dic[1]
            else:
                scores = [-x for x in confidence_score]
                s_labels = [1-x for x in labels_test]
                testing_N = test_cnt_dic[1]
                testing_P = test_cnt_dic[0]
                
            # create ROC data file ======== ==== 
            roc_auc=ml_create_roc_files(row_id_str, scores, s_labels, testing_N, testing_P
                , local_out_dir, row_id_str)
                
            perf_measures["roc_auc"]=roc_auc
            
                
    # only update db for web request
    if fromweb=="1": 
        #print "database update"
        str_sql="UPDATE atdml_document set "+"accuracy = '"+str(accuracy*100)+"%" \
            +"', status = 'learned', processed_date ='"+str(datetime.datetime.now()) \
            +"',ml_opts='"+j_str \
            +"', perf_measures='"+json.dumps(perf_measures) \
            +"', dataset_info='"+json.dumps(dataset_info) \
            +"' where id="+row_id_str
        ret=exec_sqlite.exec_sql(str_sql)
        print "INFO: Data update done! ret=", str(ret)
    else:
        print "INFO: accuracy = "+str(accuracy*100)+"%"
    
    t1 = time()
    print 'INFO: total running time: %f' %(t1-t00)
    
    print 'INFO: Finished!'
    return 0
def mrun(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr, excluded_feat_cslist
    , sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max
    , zipout_dir, zipcode_dir, zip_file_name
    , mongo_tuples, fromweb
    , training_fraction, jobname, run_number, bin_number ): 
        
    ### generate data folder and out folder, clean up if needed
    if not os.path.exists(local_out_dir):
        os.makedirs(local_out_dir)

    # zip func in other files for Spark workers ================= ================
    zip_file_path = ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util')
    print "INFO: zip_file_path=",zip_file_path       

    # get_spark_context
    sc=ml_util.ml_get_spark_context(sp_master
        , spark_rdd_compress
        , spark_driver_maxResultSize
        , sp_exe_memory
        , sp_core_max
        , jobname
        , [zip_file_path]) 
    
    
    t0 = time()

    # check if ml_opts.has_excluded_feat ==1 ===================================
    has_excluded_feat=0
    if not ml_opts_jstr is None:
        ml_opts=json.loads(ml_opts_jstr)
        if "has_excluded_feat" in ml_opts:
            has_excluded_feat=ml_opts["has_excluded_feat"]
    #print "has_excluded_feat=",has_excluded_feat,",excluded_feat_cslist=",excluded_feat_cslist
    
    # get excluded feature list from mongo ========== ===
    if str(has_excluded_feat) == "1" and excluded_feat_cslist is None:
        key = "feature_excluded"
        jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
        jstr_proj='{"value":1}'
        # get from own id (not from parent dataset id)
        #print "jstr_filter=",jstr_filter,",jstr_proj=",jstr_proj
        doc=query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
        #print "feature_excluded=",doc
        if not doc is None and 'value' in doc:
            excluded_feat_cslist = ','.join(str(i) for i in doc['value'])
    print "INFO: excluded_feat_cslist=",excluded_feat_cslist
    
    
    ### generate Labeled point
    #libsvm_data_file = data_folder + "libsvm_data"
    # filename for featured data
    libsvm_data_file = os.path.join(hdfs_feat_dir , "libsvm_data")
    print "INFO: libsvm_data_file=", libsvm_data_file
    
    # load feature count file
    feat_count_file=libsvm_data_file+"_feat_count"
    feature_count=zip_feature_util.get_feature_count(sc,feat_count_file)
    print "INFO: feature_count=",feature_count

    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file)
    # load sample RDD from text file   
    #   also exclude selected features in sample ================ =====
    # format (LabeledPoint,hash) from str2LabeledPoint_hash() 
    samples_rdd, feature_count=zip_feature_util.get_sample_rdd(sc, libsvm_data_file, feature_count, excluded_feat_cslist)
    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file)
    
    # get distinct label list
    labels_list_all = samples_rdd.map(lambda p: p[0].label).distinct().collect()
    #labels_list_all = samples_rdd.map(lambda p: p.label).collect()

    t1 = time()
    print "INFO: labels_list_all=",labels_list_all
    #print "INFO: training and testing samples generated!"
    print 'INFO: data generating time: %f' %(t1-t0)
    t0 = t1
    
    ### generate label names (family names) #####
    ### connect to database to get the column list which contains all column number of the corresponding feature####
    label_set = set(labels_list_all)
    class_num = len(label_set)
    #class_num = len(labels_list)
    if class_num > 2:
        print "INFO: Number of classes=", class_num
    
    
    ###############################################
    ###########build learning model################
    ###############################################
    
    ### get the parameters###
    print "INFO: ============Learning Algorithm and Parameters============="
    #param_dict = json.loads(ml_opts_jstr)
    flag_model = ml_opts['learning_algorithm']     # 1: linear_svm_with_sgd; 2: logistic_regression_with_lbfgs; 3: logistic_regression_with_sgd
    C = eval(ml_opts['c'])
    iteration_num = ml_opts['iterations']
    regularization = ml_opts['regularization']
    print "INFO: Learning Algorithm: ", flag_model
    print "INFO: C = ", C
    print "INFO: iteration = ", iteration_num
    print "INFO: regType = ", regularization
    
    
    t0 = time()
    accuracy_array = np.zeros(run_number)
    for rnd in range (0, run_number):
    
        ### generate training and testing data
        training_rdd, testing_rdd = samples_rdd.randomSplit([training_fraction, 1-training_fraction])
        training_rdd=training_rdd.map(lambda p:p[0])# keep LabeledPoint only
        training_rdd.cache()
        testing_rdd.cache()
        training_sample_count = training_rdd.count()
                
        regP = C/float(training_sample_count)
        print "INFO: Calculated: regParam = ", regP
        
        ### build model ###
        
        if flag_model == "linear_svm_with_sgd":
            ### 1: linearSVM
            print "INFO: ====================1: Linear SVM============="
            model_classification = SVMWithSGD.train(training_rdd, regParam=regP, iterations=iteration_num, regType=regularization)   # regParam = regP computed above
            #print model_classification
        elif flag_model == "logistic_regression_with_lbfgs":
            ### 2: LogisticRegressionWithLBFGS
            print "INFO: ====================2: LogisticRegressionWithLBFGS============="
            model_classification = LogisticRegressionWithLBFGS.train(training_rdd, regParam=regP, iterations=iteration_num, regType=regularization, numClasses=class_num)   # regParam = regP computed above
        elif flag_model == "logistic_regression_with_sgd":
            ### 3: LogisticRegressionWithSGD
            print "INFO: ====================3: LogisticRegressionWithSGD============="
            model_classification = LogisticRegressionWithSGD.train(training_rdd, regParam=regP, iterations=iteration_num, regType=regularization)   # regParam = regP computed above
        else:
            print "ERROR: Training model selection error: no valid ML model selected!"
            return
        
        ### Evaluating the model on testing data
        labelsAndPreds = testing_rdd.map(lambda p: (p[0].label, model_classification.predict(p[0].features)))
        labelsAndPreds.cache()
        testing_sample_number = testing_rdd.count()
        testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testing_sample_number)
        accuracy = 1 - testErr
        
        accuracy_array[rnd] = accuracy        
        print "INFO: current round=", rnd
        print "INFO: Accuracy=", accuracy
    

    ########################below: same as train_sklearn_multi_run.py#####################################
    ###############################################
    #######plot distribution and variance##########
    ###############################################

    plt.figure(1)
    
    num_bins = bin_number  ####10 is default
    n, bins, patches = plt.hist(accuracy_array, num_bins, normed=1, facecolor='green', alpha=0.5)
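    # n: normalized bar heights, bins: bin edges (length num_bins+1), patches: the drawn bars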
    ave = np.mean(accuracy_array)
    print "INFO: Accuracy mean=", ave
    std_dev = np.std(accuracy_array)
    print "INFO: Accuracy standard deviation=", std_dev

    #print "INFO: bins: ", bins
    # add a 'best fit' line
    y = mlab.normpdf(bins, ave, std_dev)
    #print "INFO: y: ", y
    plt.plot(bins, y, 'r--')
    
    plt.title('Accuracy distribution of '+str(run_number)+' runs:')
    plt.xlabel('Accuracy Values')
    plt.ylabel('Probability / Accuracy bar width')
    
    #plt.savefig(local_out_dir+file_name_given+"_var_"+str(run_number)+".png")
    plt.savefig(os.path.join(local_out_dir, row_id_str+"_var_"+str(run_number)+".png"))

    # create ROC data for graph ====================
    all_json=[]
    barp_arr=[] #n
    disp_arr=[] #y
    last_idx=0
    for idx,ht in enumerate(n): # n is bar height
        #print "INFO: mrun bar=",idx, bins[idx], bins[idx+1],((bins[idx]+bins[idx+1])/2)
        barp_arr.append([ ((bins[idx]+bins[idx+1])/2.0),n[idx]]) # mid point for x axis
        if not math.isnan(y[idx]):
            disp_arr.append([bins[idx],y[idx]])
        last_idx=idx
    #print "INFO: ",last_idx+1, bins[last_idx+1], y[last_idx+1]
    if not math.isnan(y[last_idx+1]):
        disp_arr.append([bins[last_idx+1],y[last_idx+1]])
    #print "barp_arr=", barp_arr
    #print "disp_arr=", disp_arr
    #bar
    bar_json={}
    bar_json["values"]=barp_arr
    bar_json["key"]='Multi-Run Accuracy'
    bar_json["type"]="bar" # light blue
    bar_json["yAxis"]=1
    all_json.append(bar_json)
    #distribution
    if len(disp_arr)>0:
        dis_json={}
        dis_json["values"]=disp_arr
        dis_json["key"]='Normal Distribution' #
        dis_json["type"]="line" # light blue
        dis_json["yAxis"]=1
        all_json.append(dis_json)
    
    mrun_jfile = os.path.join(local_out_dir, row_id_str+"_mrun.json")
    #print "INFO: all_json=",all_json
    print "INFO: mrun_jfile=",mrun_jfile
    if os.path.exists(mrun_jfile):
        try:
            os.remove(mrun_jfile)
        except OSError, e:
            print ("Error: %s - %s." % (mrun_jfile, e.strerror))
def feat_extr_ngram(row_id_str,
                    hdfs_dir_list,
                    hdfs_feat_dir,
                    model_data_folder,
                    sp_master,
                    spark_rdd_compress,
                    spark_driver_maxResultSize,
                    sp_exe_memory,
                    sp_core_max,
                    zipout_dir,
                    zipcode_dir,
                    zip_file_name,
                    mongo_tuples,
                    fromweb,
                    label_arr,
                    metadata_count,
                    label_idx,
                    data_idx,
                    pattern_str,
                    ln_delimitor,
                    data_field_list,
                    jkey_dict,
                    jobname,
                    num_gram,
                    feature_count_threshold,
                    token_dict=None,
                    HDFS_RETR_DIR=None,
                    remove_duplicated="N",
                    cust_featuring=None,
                    cust_featuring_params=None,
                    local_out_dir=None,
                    filter_ratio=None,
                    binary_flag=True):
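    """Extract n-gram (or custom) features from raw text samples in HDFS.

    Reads one or more HDFS directories of raw samples, extracts features
    (regex/n-gram based, JSON-field based, or via a user-supplied featuring
    module), remaps feature hashes to sequential ids, then writes the
    libsvm-formatted data, the feature count, and the label list back to
    hdfs_feat_dir; when num_gram == 1 it also writes tokenized data for DNN
    training.  (Summary inferred from the function body below.)
    """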

    # zip func in other files for Spark workers ================= ================
    zip_file_path = ml_util.ml_build_zip_file(zipout_dir,
                                              zipcode_dir,
                                              zip_file_name,
                                              user_custom=cust_featuring)
    # get_spark_context
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname,
                                      [zip_file_path])
    # log time ================================================================ ================
    t0 = time()

    # input filename
    input_filename = "*"
    ext_type = '.gz'
    gz_list = None
    convert2dirty = "N"
    if not ',' in hdfs_dir_list:  # single dir having *.gz ==== =========
        # read raw data from HDFS as .gz format ==========
        rdd_files = os.path.join(hdfs_dir_list, input_filename + ext_type)
        # check if gz files in hdfs ============
        try:
            gz_list = hdfs.ls(hdfs_dir_list)
            print "INFO: check hdfs folder=", hdfs_dir_list

        except IOError as e:
            print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
        except:
            print "WARNING: Error at checking HDFS file:", sys.exc_info()[0]
        # use whole folder
        if gz_list is None or len(gz_list) == 0:
            print "ERROR: No file found by ", input_filename + ext_type  #,", use",hdfs_dir_list,"instead"
            return -2
        elif len(gz_list) == 1:
            # use dir as filename
            rdd_files = hdfs_dir_list[0:-1]

    else:  # multiple dirs ==== =========
        rdd_files = ""
        cnt = 0
        temp_lbl_list = []
        comma = ""
        print "INFO: before label_arr=", label_arr

        # check each folder
        for dr in hdfs_dir_list.split(','):
            #print "****=",dr
            if not len(dr) > 0:
                continue
            try:
                # remove space etc.
                dr = dr.strip()
                fdr = os.path.join(HDFS_RETR_DIR, dr)
                #print "fdr=",fdr
                # ls didn't like "*"
                if '*' in fdr:
                    #gz_list=hdfs.ls(fdr.replace("*",""))
                    dn = os.path.dirname(fdr).strip()
                    bn = os.path.basename(fdr).strip()
                    #print "dn=",dn,",bn=",bn
                    # get all names under folder and do filtering
                    gz_list = fnmatch.filter(hdfs.ls(dn), '*' + bn)
                    #print "gz_list=",gz_list
                else:
                    gz_list = hdfs.ls(fdr)
                cnt = cnt + len(gz_list)

                if len(gz_list) > 0:
                    rdd_files = rdd_files + comma + fdr
                    comma = ","
            except IOError as e:
                print "WARNING: I/O error({0}): {1}".format(
                    e.errno, e.strerror)
            except:
                print "WARNING: Error at checking HDFS file:", sys.exc_info(
                )[0]
        # use whole folder
        if cnt is None or cnt == 0:
            print "ERROR: No file found at", rdd_files
            return -2
        else:
            print "INFO: total file count=", cnt
        # set convert flag only when multiple dir and label_arr has dirty label
        #if label_arr is None: # create label arr if None
        #    label_arr=temp_lbl_list
        if not label_arr is None and len(
                label_arr) == 2 and label_arr[1] == "dirty":
            convert2dirty = "Y"
    print "INFO: rdd_files=", rdd_files

    txt_rdd = sc.textFile(rdd_files)  #, use_unicode=False

    total_input_count = txt_rdd.count()
    print "INFO: Total input sample count=", total_input_count
    # debug only
    #for x in txt_rdd.collect():
    #    print "t=",x
    print "INFO: hdfs_dir_list=", hdfs_dir_list
    print "INFO: label_arr=", label_arr
    print "INFO: feature_count_threshold=", feature_count_threshold

    #jkey_dict={"meta_list":["label","md5","mdate"], "data_key":"logs"}
    #   this dict depends on the format of input data
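    # illustrative raw JSON record matching the sample jkey_dict above (field values are examples only):
    #   {"label": "dirty", "md5": "abc123...", "mdate": "2016-01-01", "logs": ["line1", "line2", ...]}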
    if not data_field_list is None:
        jkey_dict = json.loads(jkey_dict)

        data_key = jkey_dict["data_key"]
        meta_list = jkey_dict["meta_list"]

        metadata_count = len(meta_list)
        data_idx = metadata_count
        print "INFO: jkey_dict=", jkey_dict
        print "INFO: meta_list=", meta_list
        print "INFO: data_key=", data_key
        print "INFO: data_field_list=", data_field_list
        print "INFO: metadata_count=", metadata_count

        featured_rdd = txt_rdd \
            .map(lambda x: preprocess_json(x,meta_list,data_key,data_field_list)) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is list) \
            .map(lambda x: feature_extraction_ngram(x, data_idx, MAX_FEATURES, num_gram)) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is dict) \
            .filter(lambda x: type(x[metadata_count+1]) is dict) \
            .filter(lambda x: len(x[metadata_count])> int(feature_count_threshold) ) \
            .cache()
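        # each featured_rdd element is a list like
        #   [label, md5, mdate, {hash: count, ...}, {hash: 'n-gram string', ...}]
        # following the hash_cnt_dic / hash_str_dic format noted elsewhere in this function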

        #print "INFO: featured_rdd="
        #for x in featured_rdd.collect():
        #    print "INFO: **** f=",x
    # user custom code for featuring  ============================================= ==========
    #   input txt_rdd format (string):  each text row for each sample
    #   output featured_rdd format (list):[meta-data1,meta-data2,..., hash_cnt_dic, hash_str_dic]
    elif not cust_featuring is None and len(cust_featuring) > 0:
        user_module = None
        user_func = None
        user_func_dnn = None
        # load user module =======
        try:
            modules = map(__import__, [CUSTOM_PREFIX + cust_featuring])
            user_module = modules[0]
            user_func = getattr(user_module, CUSTOM_FUNC)
        except Exception as e:
            print "ERROR: module=", CUSTOM_PREFIX + cust_featuring
            print "ERROR: user module error.", e.__doc__, e.message
            return -101
        try:
            jparams = json.loads(cust_featuring_params)
            if jparams and 'n-gram' in jparams:
                num_gram = jparams['n-gram']
            elif jparams and 'ngram' in jparams:
                num_gram = jparams['ngram']
            if jparams and 'binary_flag' in jparams:
                binary_flag = eval(jparams['binary_flag'])
        except Exception as e:
            print "ERROR: user params error.", e.__doc__, e.message
            return -200
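        # illustrative params JSON (keys taken from the checks above; values are examples only):
        #   cust_featuring_params = '{"n-gram": 2, "binary_flag": "True"}'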

        # convert feats into an array. output format: [ meta1,meta2,..., [feat1,feat2,...]]
        tmp_rdd = txt_rdd.map(lambda x: user_func(x, cust_featuring_params)) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is list).cache()
        print " tmp_rdd cnt=", tmp_rdd.count(
        ), ",ix=", data_idx, ",max f=", MAX_FEATURES, "ngram=", num_gram
        print "take(1) rdd=", tmp_rdd.take(1)

        # TBD for multivariant output format: [ meta1,meta2,..., [[feat1,feat2,...],[feat1,feat2,...],...]]

        # TBD: only available when num_gram is set
        # for traditional ML, features go into a count dict
        # output format: [ meta1,meta2,..., hash_cnt_dic, hash_str_dic]
        featured_rdd = tmp_rdd \
            .map(lambda x: feature_extraction_ngram(x, data_idx, MAX_FEATURES, num_gram)) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is dict) \
            .filter(lambda x: type(x[metadata_count+1]) is dict) \
            .filter(lambda x: len(x[metadata_count])> int(feature_count_threshold) ) \
            .cache()

        all_hashes_cnt_dic = None
        all_hash_str_dic = None
        all_hashes_seq_dic = None
    else:
        print "INFO: pattern_str=", pattern_str + "<--"
        print "INFO: ln_delimitor=", ln_delimitor + "<--"
        print "INFO: label_idx=", label_idx
        print "INFO: data_idx=", data_idx
        print "INFO: metadata_count=", metadata_count
        print "INFO: filter_ratio=", filter_ratio

        # filter top and least percentage of feature
        if not filter_ratio is None and filter_ratio > 0 and filter_ratio < 1:
            # check total count here before continue
            upper_cnt = total_input_count * (1 - filter_ratio)
            lower_cnt = total_input_count * filter_ratio
            # set limit for lower bound. if total count is large, lower_cnt may exclude all features...
            # max lower count =  min( MAX_FILTER_LOWER_CNT, total_input_count/100 )
            if not MAX_FILTER_LOWER_CNT is None and lower_cnt > MAX_FILTER_LOWER_CNT:
                if MAX_FILTER_LOWER_CNT > total_input_count / 100:
                    lower_cnt = total_input_count / 100
                else:
                    lower_cnt = MAX_FILTER_LOWER_CNT

            print "INFO: filtering by count, upper bound=", upper_cnt, ",lower bound=", lower_cnt
            # find unique feature, count them, remove them if in highest and lowest % and then create a dict
            f_feat_set = Set (txt_rdd.map(lambda x:x.split(ln_delimitor)).flatMap(lambda x:Set(x[metadata_count:])) \
                .map(lambda x:(x,1)).reduceByKey(lambda a, b: a + b) \
                .filter(lambda x:x[1]<= upper_cnt and x[1]>= lower_cnt) \
                .map(lambda x:x[0]).collect() )

            print "INFO: f_feat_set len=", len(f_feat_set)
            broadcast_f_set = sc.broadcast(f_feat_set)

            #txt_rdd=txt_rdd.map(lambda x: filter_by_list(x, metadata_count,ln_delimitor, broadcast_f_list.value ))
            txt_rdd=txt_rdd.map(lambda x: x.split(ln_delimitor)) \
                        .map(lambda x: x[:metadata_count]+ [w for w in x[metadata_count:] if w and w in broadcast_f_set.value]) \
                        .map(lambda x: ln_delimitor.join(x))

        # preprocess by pattern matching and then extract n-gram features   #.encode('UTF8')
        #   input txt_rdd format (string):  meta-data1\tmeta-data2\t...\tdataline1\tdataline2\t...datalineN\n
        #   output featured_rdd format (list):[meta-data1,meta-data2,..., hash_cnt_dic, hash_str_dic]
        #       hash_cnt_dic: {hash,hash:count,...}  hash_str_dic: {hash: 'str1',... }
        tmp_rdd = txt_rdd \
            .map(lambda x: preprocess_pattern(x, metadata_count, pattern_str, ln_delimitor \
                                                , label_idx, label_arr, convert2dirty )) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is list) #.cache() memory issue...
        #tmp_rdd_count=tmp_rdd.count()
        #print "INFO: After preprocessing count=",tmp_rdd_count
        featured_rdd = tmp_rdd \
            .map(lambda x: feature_extraction_ngram(x, data_idx, MAX_FEATURES, num_gram)) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is dict) \
            .filter(lambda x: type(x[metadata_count+1]) is dict) \
            .filter(lambda x: len(x[metadata_count])> int(feature_count_threshold) ) \
            .cache()
        #feat_rdd_count=featured_rdd.count()
        #print "INFO: After featuring count=",feat_rdd_count

        all_hashes_cnt_dic = None
        all_hash_str_dic = None
        all_hashes_seq_dic = None

    #get all hashes and total occurring count ===============
    #   all_hashes_cnt_dic: {'hash,hash': total count,... }
    if all_hashes_cnt_dic is None:
        #all_hashes_cnt_dic = featured_rdd.map(lambda x: x[metadata_count]).reduce(lambda a, b: combine_dic_cnt(a, b))
        all_hashes_cnt_dic = dict(
            featured_rdd.flatMap(lambda x: x[metadata_count].items()).
            reduceByKey(lambda a, b: a + b).collect())

    #get all hashes and their extracted string  ===============
    #   all_hash_str_dic: {hash: 'str1', ...}
    if all_hash_str_dic is None:
        #all_hash_str_dic = featured_rdd.map(lambda x: x[metadata_count+1]).reduce(lambda a, b: combine_dic(a, b))
        all_hash_str_dic = dict(
            featured_rdd.flatMap(
                lambda x: x[metadata_count + 1].items()).distinct().collect())

    # get all labels into an array  =============== provided by parameter?
    if label_arr is None:
        # will force "clean" be 0 here
        label_arr = sorted(
            featured_rdd.map(
                lambda x: x[label_idx].lower()).distinct().collect())
        # debug only
        print "INFO: label_arr.=", json.dumps(sorted(label_arr))

    # save labels to hdfs as text file==================================== ============
    hdfs_folder = hdfs_feat_dir  #+ "/"   # "/" is needed to create the folder correctly
    print "INFO: hdfs_folder=", hdfs_folder
    try:
        hdfs.mkdir(hdfs_folder)
    except IOError as e:
        print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at mkdir:", sys.exc_info()[0]

    # clean up metadata_file
    metadata_file = os.path.join(hdfs_folder, metadata)  #"metadata"
    print "INFO: metadata_file=", metadata_file
    try:
        hdfs.rmr(metadata_file)
    except IOError as e:
        print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at rmr():", sys.exc_info()[0]
    sc.parallelize(label_arr, 1).saveAsTextFile(metadata_file)

    #remap all hash values to continuous key/feature number ==============
    #     all_hashes_seq_dic: { hash : sequential_numb }
    if all_hashes_seq_dic is None:
        all_hashes_seq_dic = {}
        remap2seq(
            all_hashes_cnt_dic,
            all_hashes_seq_dic)  #all_hashes_seq_dic has continuous key number
    #print "all_hashes_seq_dic=",all_hashes_seq_dic
    total_feature_numb = len(all_hashes_seq_dic)
    print "INFO: Total feature count=", len(all_hashes_seq_dic)

    # featured_rdd (list):    [meta-data1,meta-data2,..., hash_cnt_dic, hash_str_dic]
    # seq_featured_rdd(list): [meta-data1,meta-data2,..., hash_cnthsh_dict, hash_str_dic] (feat id in sorted sequence)
    # hash_cnt_dic: {hash: count}  hash_str_dic: {hash: 'str1,str2...' }
    #     set binary_flag to True, all feature:value will be 1
    broadcast_dic = sc.broadcast(all_hashes_seq_dic)
    seq_featured_rdd = featured_rdd.map(lambda x: convert2seq(
        x, label_idx, data_idx, broadcast_dic.value, binary_flag=binary_flag)
                                        ).cache()

    # get hash_cnthsh_dict then flatMap and reduce to (feat id, count)
    ct_rdd = seq_featured_rdd.flatMap(lambda x: [(i[0], i[1]) for i in x[
        data_idx].iteritems()]).reduceByKey(lambda a, b: a + b)
    # sorted by feature id as int
    feat_sample_count_arr = ct_rdd.sortBy(lambda x: int(x[0])).map(
        lambda x: x[1]).collect()
    # sort after collect may fail when rdd is huge
    #feat_sample_count_arr=[]
    #for i in sorted(ct_rdd.collect(), key=lambda t: int(t[0])):
    #    feat_sample_count_arr.append(i[1])
    print "INFO: feat_sample_count_arr len=", len(feat_sample_count_arr)

    # save feat_sample_count_arr data ==================================== ============
    filter = '{"rid":' + row_id_str + ',"key":"feat_sample_count_arr"}'
    upsert_flag = True
    jo_insert = {}
    jo_insert["rid"] = int(row_id_str)
    jo_insert["key"] = "feat_sample_count_arr"
    jo_insert["value"] = feat_sample_count_arr
    jstr_insert = json.dumps(jo_insert)
    ret = query_mongo.upsert_doc_t(mongo_tuples, filter, jstr_insert,
                                   upsert_flag)
    print "INFO: Upsert count for feat_sample_count_arr=", ret
    # insert failed, save to local
    if ret == 0:
        # drop old record in mongo
        ret = query_mongo.delete_many(mongo_tuples, None, filter)
        if not os.path.exists(local_out_dir):
            os.makedirs(local_out_dir)
        fsca_hs = os.path.join(local_out_dir, row_id_str,
                               row_id_str + "_feat_sample_count_arr.pkl")
        print "WARNING: save feat_sample_count_arr to local"
        ml_util.ml_pickle_save(feat_sample_count_arr, fsca_hs)

    # save feature data; TBD. not used. ==================================== ============

    #libsvm_rdd=seq_featured_rdd.map(lambda x: convert2libsvm(x,label_idx,data_idx,label_arr))
    # put hash to the front of each row, assume hash is after label
    libsvm_rdd = seq_featured_rdd.map(
        lambda x: x[label_idx + 1] + " " + convert2libsvm(
            x, label_idx, data_idx, label_arr))
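    # each output line presumably looks like:  <hash> <label_index> <fid>:<value> <fid>:<value> ...
    # (hash prefix added above; the rest is standard libsvm text produced by convert2libsvm)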
    # debug only
    #print "libsvm_rdd="
    #for i in libsvm_rdd.collect():
    #    print i

    # get rdd statistics info
    stats = featured_rdd.map(lambda p: len(p[metadata_count])).stats()
    feat_count_max = stats.max()
    feat_count_stdev = stats.stdev()
    feat_count_mean = stats.mean()
    sample_count = stats.count()
    print "INFO: libsvm data: sample count=", sample_count, ",Feat count mean=", feat_count_mean, ",Stdev=", feat_count_stdev
    print "INFO:   ,max feature count=", feat_count_max
    # find sample count
    lbl_arr = featured_rdd.map(lambda x: (x[label_idx], 1)).reduceByKey(
        add).collect()
    print "INFO: Sample count by label=", lbl_arr

    # remove duplicated libsvm string; only keep the first duplicated item, assume space following key_idx
    if remove_duplicated == "Y":
        libsvm_rdd=libsvm_rdd \
            .map(lambda x: ( ','.join(x.split(' ')[metadata_count:]), x)) \
            .groupByKey().map(lambda x: list(x[1])[0] ) \
            .cache()
        cnt_list = libsvm_rdd.map(lambda x: (x.split(' ')[1], 1)).reduceByKey(
            add).collect()
        stats = libsvm_rdd.map(
            lambda x: len(x.split(' ')[metadata_count:])).stats()
        feat_count_max = stats.max()
        feat_count_stdev = stats.stdev()
        feat_count_mean = stats.mean()
        sample_count = stats.count()
        print "INFO: Non-Duplicated libsvm data: sample count=", sample_count, ",Feat count mean=", feat_count_mean, ",Stdev=", feat_count_stdev
        print "INFO:   ,max feature count=", feat_count_max
        print "INFO: Non-Duplicated Label count list=", cnt_list

    # clean up libsvm data ==================================== ============
    libsvm_data_file = os.path.join(hdfs_folder,
                                    libsvm_alldata_filename)  #"libsvm_data"
    print "INFO: libsvm_data_file=", libsvm_data_file
    try:
        #hdfs.ls(save_dir)
        #print "find hdfs folder"
        hdfs.rmr(libsvm_data_file)
        #if num_gram == 1:
        #   hdfs.rmr(dnn_data_file)
        #print "all files removed"
    except IOError as e:
        print "WARNING: I/O error({0}): {1} at libsvm_data_file clean up".format(
            e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at libsvm file clean up:", sys.exc_info(
        )[0]

    #codec = "org.apache.hadoop.io.compress.GzipCodec"
    #libsvm_rdd.saveAsTextFile(libsvm_data_file, codec)
    libsvm_rdd.saveAsTextFile(libsvm_data_file)  # TBD encrypted

    feat_count_file = libsvm_data_file + "_feat_count"
    print "INFO: feat_count_file=", feat_count_file
    try:
        hdfs.rmr(feat_count_file)
    except IOError as e:
        print "WARNING: I/O error({0}): {1} at feat_count clean up".format(
            e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at libsvm feature count clean up:", sys.exc_info(
        )[0]
    sc.parallelize([total_feature_numb], 1).saveAsTextFile(feat_count_file)

    label_dic = {}
    # assign label a number
    for idx, label in enumerate(sorted(label_arr)):
        if not label in label_dic:
            label_dic[
                label] = idx  #starting from 0, value = idx, e.g., clean:0, dirty:1

    # output text for DNN:[meta-data1,meta-data2,..., [feature tokens]] ================= DNN ===========
    if num_gram == 1:  # special flag to tokenize and keep input orders
        print "INFO: processing data for DNN..."
        # create token dict
        # str_hash_dict: string to hash
        # all_hashes_seq_dic: hash to seq id
        if token_dict is None or len(token_dict) == 0:
            token_dict = {}
            str_hash_dict = {v: k for k, v in all_hash_str_dic.iteritems()}
            for k, v in str_hash_dict.iteritems():
                token_dict[k] = int(all_hashes_seq_dic[str(v)])
            #print "token_dict=",len(token_dict),token_dict

        dnn_rdd = tmp_rdd \
            .map(lambda x: tokenize_by_dict(x, data_idx, token_dict,label_idx, label_dic)) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is list)
        #.cache()
        # filter duplication here
        #print dnn_rdd.take(3)

        dnn_data_file = os.path.join(hdfs_folder,
                                     dnn_alldata_filename)  #"dnn_data"
        print "INFO: dnn_data_file=", dnn_data_file
        try:
            hdfs.rmr(dnn_data_file)
        except IOError as e:
            print "WARNING: I/O error({0}): {1} at dnn_data_file clean up".format(
                e.errno, e.strerror)
        except:
            print "WARNING: Unexpected error at libsvm file clean up:", sys.exc_info(
            )[0]

        # clean up data
        dnn_npy_gz_file = os.path.join(hdfs_folder, row_id_str + "_dnn_")
        print "INFO: dnn_npy_gz_file=", dnn_npy_gz_file
        try:
            hdfs.rmr(dnn_npy_gz_file + "data.npy.gz")
            hdfs.rmr(dnn_npy_gz_file + "label.npy.gz")
            hdfs.rmr(dnn_npy_gz_file + "info.npy.gz")
        except IOError as e:
            print "WARNING: I/O error({0}): {1} at dnn_npy clean up".format(
                e.errno, e.strerror)
        except:
            print "WARNING: Unexpected error at dnn_npy file clean up:", sys.exc_info(
            )[0]
        # save new data
        try:
            dnn_rdd.saveAsTextFile(dnn_data_file)
        except:
            print "WARNING: Unexpected error at saving dnn data:", sys.exc_info(
            )[0]
        # show data statistics
        try:
            stats = dnn_rdd.map(lambda p: len(p[metadata_count])).stats()
            feat_count_max = stats.max()
            feat_count_stdev = stats.stdev()
            feat_count_mean = stats.mean()
            sample_count = stats.count()
            print "INFO: DNN data: sample count=", sample_count, ",Feat count mean=", feat_count_mean, ",Stdev=", feat_count_stdev
            print "INFO:   ,max feature count=", feat_count_max
        except:
            print "WARNING: Unexpected error at getting stats of dnn_rdd:", sys.exc_info(
            )[0]

    # clean up pca data in hdfs ============ ========================
    pca_files = '*' + libsvm_alldata_filename + "_pca_*"
    #print "INFO: pca_files=", pca_files
    try:
        f_list = hdfs.ls(hdfs_folder)
        if len(f_list) > 0:
            df_list = fnmatch.filter(f_list, pca_files)
            for f in df_list:
                print "INFO: rm ", f
                hdfs.rmr(f)
    except IOError as e:
        print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at libsvm pca file clean up:", sys.exc_info(
        )[0]

    # clean up pca data in web local ============ ========================
    pca_fname = os.path.join(model_data_folder, row_id_str + '_pca_*.pkl*')
    print "INFO: pca_fname=", pca_fname

    try:
        for fl in glob.glob(pca_fname):
            print "INFO: remove ", fl
            os.remove(fl)
    except OSError, e:
        print("Error: %s - %s." % (pca_fname, e.strerror))

def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr,
          excluded_feat_cslist, sp_master, spark_rdd_compress,
          spark_driver_maxResultSize, sp_exe_memory, sp_core_max, zipout_dir,
          zipcode_dir, zip_file_name, mongo_tuples, labelnameflag, fromweb,
          training_fraction, jobname):

    if not os.path.exists(local_out_dir):
        os.makedirs(local_out_dir)

    # zip func in other files for Spark workers ================= ================
    zip_file_path = ml_build_zip_file(zipout_dir,
                                      zipcode_dir,
                                      zip_file_name,
                                      prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    # get_spark_context
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname,
                                      [zip_file_path])

    t0 = time()
    t00 = t0

    # check if ml_opts.has_excluded_feat ==1 ===================================
    has_excluded_feat = 0
    ml_opts = {}
    if not ml_opts_jstr is None:
        ml_opts = json.loads(ml_opts_jstr)
        if "has_excluded_feat" in ml_opts:
            has_excluded_feat = ml_opts["has_excluded_feat"]

    #print "has_excluded_feat=",has_excluded_feat,",excluded_feat_cslist=",excluded_feat_cslist

    # get excluded feature list from mongo ========== ===
    if str(has_excluded_feat) == "1" and excluded_feat_cslist is None:
        excluded_feat_cslist = ml_util.ml_get_excluded_feat(
            row_id_str, mongo_tuples)
    print "INFO: excluded_feat_cslist=", excluded_feat_cslist
    ### generate Labeled point
    libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data")
    print "INFO: libsvm_data_file:", libsvm_data_file

    # load feature count file
    feat_count_file = libsvm_data_file + "_feat_count"
    feature_count = zip_feature_util.get_feature_count(sc, feat_count_file)
    print "INFO: feature_count=", feature_count

    # load sample RDD from text file
    #   also exclude selected features in sample ================ =====
    # format (LabeledPoint,hash) from str2LabeledPoint_hash()
    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file)
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(
        sc, libsvm_data_file, feature_count, excluded_feat_cslist)
    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file)

    # get distinct label list
    labels_list_all = samples_rdd.map(
        lambda p: p[0].label).distinct().collect()

    ### generate training and testing data
    training_rdd, testing_rdd = samples_rdd.randomSplit(
        [training_fraction, 1 - training_fraction])
    training_rdd = training_rdd.map(lambda p: p[0])  # keep LabeledPoint only
    training_rdd.cache()
    training_sample_count = training_rdd.count()
    training_lbl_cnt_list = training_rdd.map(
        lambda p: (p.label, 1)).reduceByKey(add).collect()
    testing_rdd.cache()
    testing_sample_count = testing_rdd.count()
    testing_lbl_cnt_list = testing_rdd.map(
        lambda p: (p[0].label, 1)).reduceByKey(add).collect()
    sample_count = training_sample_count + testing_sample_count

    t1 = time()
    print "INFO: training sample count=", training_sample_count, ", testing sample count=", testing_sample_count
    print "INFO: training label list=", training_lbl_cnt_list, ", testing label list=", testing_lbl_cnt_list
    print "INFO: labels_list_all=", labels_list_all
    print "INFO: training and testing samples generated!"
    print 'INFO: running time: %f' % (t1 - t0)
    t0 = t1

    ##############################################
    ########### Grid Search with CV ##############
    ##############################################

    ### get the parameters for cross validation and grid search ###
    (cv, model_name, param_dict) = generate_param(ml_opts)
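    # illustrative only; actual values come from ml_opts via generate_param():
    #   cv=3, model_name="linear_svm_with_sgd",
    #   param_dict={"C": [0.01, 1.0, 100.0], "iterations": [100], "regType": ["l2"]}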

    ### generate label names (family names) #####
    ### connect to database to get the column list which contains all column number of the corresponding feature####
    if labelnameflag == 1:
        label_dic = ml_util.ml_get_label_dict(row_id_str, mongo_tuples, ds_id)
        print "INFO: label_dic:", label_dic

    else:
        label_dic = {}
        label_set = set(labels_list_all)
        for label_value in label_set:
            label_dic[int(label_value)] = str(int(label_value))
        print "INFO: generated label_dic:", label_dic

    labels_list = []
    for key in sorted(label_dic):
        labels_list.append(label_dic[key])
    #print "labels:", labels_list
    class_num = len(labels_list)
    if class_num > 2:
        print "INFO: Multi-class classification! Number of classes = ", class_num

    #### generate training and testing rdd(s) for CV#####
    split_prob = 1.0 / float(cv)
    split_prob_list = []
    for i in range(0, cv):
        split_prob_list.append(split_prob)

    list_rdd = training_rdd.randomSplit(split_prob_list)
    list_train_rdd = []
    list_test_rdd = []
    for i in range(0, cv):
        list_rdd[i].cache()
    for i in range(0, cv):
        tr_rdd = sc.emptyRDD()
        for j in range(0, cv):
            if j == i:
                pass
            else:
                tr_rdd = tr_rdd + list_rdd[j]
        tr_rdd.cache()
        list_train_rdd.append(tr_rdd)
        list_test_rdd.append(list_rdd[i])
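    # list_train_rdd[i] is the union of all folds except fold i;
    # list_test_rdd[i] (== list_rdd[i]) is the held-out fold for CV round i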

    all_comb_list_of_dic = get_all_combination_list_of_dic(param_dict)
    print "INFO: Total number of searching combinations:", len(
        all_comb_list_of_dic)

    ### loop for all parameter combinations and search the best parameters with CV###
    results = []
    for p in range(0, len(all_comb_list_of_dic)):
        params = all_comb_list_of_dic[p]
        C = params['C']
        iteration_num = params['iterations']
        regularization = params['regType']

        scores = []
        for i in range(0, cv):
            train_rdd = list_train_rdd[i]
            test_rdd = list_test_rdd[i]
            train_number = train_rdd.count()
            regP = C / float(train_number)

            ### build model ###
            if model_name == "linear_svm_with_sgd":
                #print "====================1: Linear SVM============="
                model_classification = SVMWithSGD.train(
                    train_rdd,
                    regParam=regP,
                    iterations=iteration_num,
                    regType=regularization)  # regParam = 1/(sample_number*C)
            elif model_name == "logistic_regression_with_lbfgs":
                #print "====================2: LogisticRegressionWithLBFGS============="
                model_classification = LogisticRegressionWithLBFGS.train(
                    train_rdd,
                    regParam=regP,
                    iterations=iteration_num,
                    regType=regularization,
                    numClasses=class_num)  # regParam = 1/(sample_number*C)
            elif model_name == "logistic_regression_with_sgd":
                #print "====================3: LogisticRegressionWithSGD============="
                model_classification = LogisticRegressionWithSGD.train(
                    train_rdd,
                    regParam=regP,
                    iterations=iteration_num,
                    regType=regularization)  # regParam = 1/(sample_number*C)
            else:
                print "ERROR: Training model selection error: no valid ML model selected!"
                return

            ### Evaluating the model on testing data
            labelsAndPreds = test_rdd.map(
                lambda p: (p.label, model_classification.predict(p.features)))
            labelsAndPreds.cache()
            test_sample_number = test_rdd.count()
            testErr = labelsAndPreds.filter(
                lambda (v, p): v != p).count() / float(test_sample_number)
            accuracy = 1 - testErr
            #print "Accuracy = ", accuracy
            scores.append(accuracy)

        ss = np.asarray(scores)
        #print "%0.3f (+/-%0.03f) for " % (ss.mean(), ss.std() * 2), params
        results.append((ss.mean(), ss.std() * 2, params))

    sorted_results = sorted(results, key=lambda x: x[0], reverse=1)
    (best_accuracy, best_std2, best_param) = sorted_results[0]
    print "INFO: ml_opts_jstr=", ml_opts_jstr
    print "INFO: best_param=", best_param

    #ml_opts=json.loads(ml_opts_jstr);
    print "INFO: ml_opts=", ml_opts

    ##############################################
    ######output Grid Search results##############
    ##############################################
    json2save = {}
    json2save["rid"] = int(row_id_str)
    json2save["key"] = "cv_result"
    #json2save["param_str"]=ml_opts_jstr
    json2save["param_dic"] = param_dict
    cv_grid = []
    print ""
    print "INFO: =====Grid Search Results for SPARK ======"
    print "INFO: Best parameters set found for ", model_name, " is: "
    for key in best_param:
        print "INFO:", key, "=", best_param[key]
        if key.lower() == "regtype":
            ml_opts['regularization'] = str(best_param[key])
        else:
            ml_opts[key.lower()] = str(best_param[key])  # add best param to
    ml_opts_jstr = json.dumps(ml_opts)
    json2save["param_str"] = ml_opts_jstr
    print "INFO: Average accuracy with CV = ", cv, ": ", best_accuracy
    print ""
    print "INFO: Grid scores on development set:"
    for i in range(0, len(sorted_results)):
        (ave_accu_i, std2_i, param_i) = sorted_results[i]
        print "%0.3f (+/-%0.03f) for " % (ave_accu_i, std2_i), param_i
        #outstr='%s,%0.3f,%0.03f,%s' % (param_i,ave_accu_i, std2_i,"Selected" if param_i==best_param else "")
        outj = {}
        outj["param"] = param_i
        outj["average_accuracy"] = "%0.3f" % (ave_accu_i)
        outj["std_deviation"] = "%0.3f" % (std2_i)
        outj["selected"] = "%s" % ("Selected" if param_i == best_param else "")
        cv_grid.append(outj)
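    # each cv_grid entry looks like (values illustrative):
    #   {"param": {"C": 1.0, "iterations": 100, "regType": "l2"},
    #    "average_accuracy": "0.953", "std_deviation": "0.012", "selected": "Selected"}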
    print " "

    t1 = time()
    print 'INFO: Grid Search with CV run time: %f' % (t1 - t0)
    t0 = time()

    ##################################################################################
    json2save["cv_grid_data"] = cv_grid
    cv_result = json.dumps(json2save)
    print "INFO: cv_result=", cv_result
    filter = '{"rid":' + row_id_str + ',"key":"cv_result"}'
    upsert_flag = True
    ## write to mongoDB.myml.dataset_info, ignore doc with duplicated key
    # db.dataset_info.createIndex({"rid":1,"key":1},{unique:true})
    ret = query_mongo.upsert_doc_t(mongo_tuples, filter, cv_result,
                                   upsert_flag)
    print "INFO: Upsert count for mllib cv_result: ret=", ret

    ############################################################################################
    ########### retrain with all training data and generate the final model with results #######
    ############################################################################################
    C = best_param['C']
    iteration_num = best_param['iterations']
    regularization = best_param['regType']
    regP = C / float(training_sample_count)

    ######################################the rest of the code is the same as train_MLlib.py #####################################################################

    if model_name == "linear_svm_with_sgd":
        ### 1: linearSVM
        print "INFO: ====================1: Linear SVM============="
        model_classification = SVMWithSGD.train(
            training_rdd,
            regParam=regP,
            iterations=iteration_num,
            regType=regularization)  # regParam = 1/(sample_number*C)
        #print model_classification
    elif model_name == "logistic_regression_with_lbfgs":
        ### 2: LogisticRegressionWithLBFGS
        print "INFO: ====================2: LogisticRegressionWithLBFGS============="
        model_classification = LogisticRegressionWithLBFGS.train(
            training_rdd,
            regParam=regP,
            iterations=iteration_num,
            regType=regularization,
            numClasses=class_num)  # regParam = 1/(sample_number*C)
    elif model_name == "logistic_regression_with_sgd":
        ### 3: LogisticRegressionWithSGD
        print "INFO: ====================3: LogisticRegressionWithSGD============="
        model_classification = LogisticRegressionWithSGD.train(
            training_rdd,
            regParam=regP,
            iterations=iteration_num,
            regType=regularization)  # regParam = 1/(sample_number*C)
    else:
        print "ERROR: Training model selection error: no valid ML model selected!"
        return

    print "INFO: model type=", type(model_classification)

    # create feature coefficient file ================================
    coef_arr = None
    intercept = None
    if model_classification.weights is None:
        print "WARNING: model weights not found!"
    else:
        coef_arr = model_classification.weights.toArray().tolist()
        # save to mongo
        key = "coef_arr"
        ret = ml_util.save_json_t(row_id_str, key, coef_arr, mongo_tuples)
        # save intercept to mongo
        key = "coef_intercept"
        intercept = model_classification.intercept
        ret = ml_util.save_json_t(row_id_str, key, intercept, mongo_tuples)

        # feature list + coef file =============
        feat_filename = os.path.join(local_out_dir,
                                     row_id_str + "_feat_coef.json")
        print "INFO: feat_filename=", feat_filename

        # create feature list + coef file =============================================== ============
        # expect a dict of {"fid":(coef, feature_raw_string)}
        jret = ml_util.build_feat_list_t(row_id_str, feat_filename, None, None,
                                         coef_arr, ds_id, mongo_tuples)

        # special featuring for IN or libsvm
        if jret is None:
            jret = ml_util.build_feat_coef_raw_list_t(row_id_str,
                                                      feat_filename, coef_arr,
                                                      ds_id, mongo_tuples)
        if jret is None:
            print "WARNING: Cannot create sample list for testing dataset. "

        jfeat_coef_dict = jret
        print "INFO: coef_arr len=", len(
            coef_arr), ", feature_count=", feature_count
        # for multi-class
        if len(coef_arr) != feature_count:
            jfeat_coef_dict = {}
            print "WARNING: feature list can't be shown for multi-class classification"

        # Calculate prediction and Save testing dataset
        bt_coef_arr = sc.broadcast(coef_arr)
        bt_intercept = sc.broadcast(intercept)
        bt_jfeat_coef_dict = sc.broadcast(jfeat_coef_dict)
        ### Evaluating the model on testing dataset: label, predict label, score, feature list
        print "INFO: intercept=", intercept
        print "INFO: coef_arr len=", len(coef_arr)
        print "INFO: jfeat_coef_dict len=", len(jfeat_coef_dict)

        # get prediction of testing dataset : (tlabel, plabel, score, libsvm, raw feat str, hash) ==============================
        if len(coef_arr) == feature_count:
            testing_pred_rdd = testing_rdd.map(lambda p: (
                 p[0].label \
                ,model_classification.predict(p[0].features) \
                ,zip_feature_util.calculate_hypothesis(p[0].features, bt_coef_arr.value, bt_intercept.value, model_name) \
                ,p[0].features \
                ,p[1] \
            ) ).cache()
        else:  # for multi-class, no prediction score;, TBD for better solution: how to display multiple weights for each class
            testing_pred_rdd = testing_rdd.map(lambda p: (
                 p[0].label \
                ,model_classification.predict(p[0].features) \
                ,0 \
                ,p[0].features \
                ,p[1] \
            ) ).cache()

        # save false prediction to local file
        false_pred_fname = os.path.join(local_out_dir,
                                        row_id_str + "_false_pred.json")
        print "INFO: false_pred_fname=", false_pred_fname
        false_pred_data=testing_pred_rdd.filter(lambda p: p[0] != p[1])\
            .map(lambda p: (p[0],p[1],p[2] \
            ,zip_feature_util.get_dict_coef_raw4feat(zip_feature_util.sparseVector2dict(p[3]), bt_jfeat_coef_dict.value)
            ,p[4]  ) ) \
            .collect()
        print "INFO: false predicted count=", len(false_pred_data)
        false_pred_arr = []
        with open(false_pred_fname, "w") as fp:
            for sp in false_pred_data:
                jsp = {
                    "tlabel": sp[0],
                    "plabel": sp[1],
                    "score": sp[2],
                    "feat": sp[3],
                    "hash": sp[4]
                }
                #print "jsp=",jsp
                false_pred_arr.append(jsp)
            fp.write(json.dumps(false_pred_arr))

        # save prediction results, format: label, prediction, hash
        pred_ofname = os.path.join(local_out_dir,
                                   row_id_str + "_pred_output.pkl")
        print "INFO: pred_ofname=", pred_ofname
        pred_out_arr = testing_pred_rdd.map(lambda p:
                                            (p[0], p[1], p[4])).collect()
        ml_util.ml_pickle_save(pred_out_arr, pred_ofname)

    ### Evaluating the model on testing data
    #labelsAndPreds = testing_rdd.map(lambda p: (p.label, model_classification.predict(p.features)))
    labelsAndPreds = testing_pred_rdd.map(lambda p: (p[0], p[1]))
    labelsAndPreds.cache()
    #testing_sample_count = testing_rdd.count()
    testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(
        testing_sample_count)
    accuracy = 1 - testErr
    print "INFO: Accuracy = ", accuracy

    ### Save model
    #save_dir = config.get('app', 'HADOOP_MASTER')+'/user/hadoop/yigai/row_6/'
    #save_dir = config.get('app', 'HADOOP_MASTER')+config.get('app', 'HDFS_MODEL_DIR')+'/'+row_id_str
    save_dir = os.path.join(config.get('app', 'HADOOP_MASTER'),
                            config.get('app', 'HDFS_MODEL_DIR'), row_id_str)
    try:
        hdfs.ls(save_dir)
        #print "find hdfs folder"
        hdfs.rmr(save_dir)
        #print "all files removed"
    except IOError as e:
        print "WARNING: I/O error({0}): {1}".format(
            e.errno, e.strerror), ". At HDFS=", save_dir
    except:
        print "WARNING: Unexpected error:", sys.exc_info(
        )[0], ". At HDFS=", save_dir

    model_classification.save(sc, save_dir)

    ###load model if needed
    #sameModel = SVMModel.load(sc, save_dir)

    t1 = time()
    print 'INFO: training run time: %f' % (t1 - t0)
    t0 = t1

    ###############################################
    ###########plot prediction result figure#######
    ###############################################

    labels = labelsAndPreds.collect()
    true_label_list = [x for x, _ in labels]
    pred_label_list = [x for _, x in labels]

    pred_fname = os.path.join(local_out_dir, row_id_str + "_1" + ".png")
    true_fname = os.path.join(local_out_dir, row_id_str + "_2" + ".png")
    pred_xlabel = 'Prediction (Single Run)'
    true_xlabel = 'True Labels (Single Run)'
    test_cnt_dic = ml_util.ml_plot_predict_figures(
        pred_label_list, true_label_list, labels_list, label_dic,
        testing_sample_count, pred_xlabel, pred_fname, true_xlabel, true_fname)

    plt.show()
    perf_measures = None
    dataset_info = {
        "training_fraction": training_fraction,
        "class_count": class_num,
        "dataset_count": sample_count
    }
    #############################################################
    ###################for 2 class only (plot ROC curve)#########
    #############################################################
    if len(labels_list) == 2:

        do_ROC = True
        reverse_label_dic = dict((v, k) for k, v in label_dic.items())
        if 'clean' in reverse_label_dic:
            flag_clean = reverse_label_dic['clean']
        elif 'benign' in reverse_label_dic:
            flag_clean = reverse_label_dic['benign']
        elif '0' in reverse_label_dic:
            flag_clean = 0
        else:
            print "WARNING: No ROC curve generated: 'clean' or '0' must be a label for indicating negative class!"
            do_ROC = False

        # build data file for score graph
        score_graph_fname = os.path.join(local_out_dir,
                                         row_id_str + "_score_graph.json")
        print "INFO: score_graph_fname=", score_graph_fname

        # build score_arr_0, score_arr_1
        #    format: tlabel, plabel, score, libsvm, raw feat str, hash
        graph_arr = testing_pred_rdd.map(lambda p:
                                         (int(p[0]), float(p[2]))).collect()
        score_arr_0 = []
        score_arr_1 = []
        max_score = 0
        min_score = 0
        for p in graph_arr:
            if p[0] == 0:
                score_arr_0.append(p[1])
            else:
                score_arr_1.append(p[1])
            # save max,min score
            if p[1] > max_score:
                max_score = p[1]
            elif p[1] < min_score:
                min_score = p[1]

        ml_build_pred_score_graph(score_arr_0, score_arr_1, model_name,
                                  score_graph_fname, max_score, min_score)
        #print "score_arr_0=",score_arr_0
        #print "score_arr_1=",score_arr_1
        #print "max_score=",max_score
        #print "min_score=",min_score

        if do_ROC:

            perf_measures = ml_util.calculate_fscore(true_label_list,
                                                     pred_label_list)
            print "RESULT: perf_measures=", perf_measures
            model_classification.clearThreshold()
            scoreAndLabels = testing_rdd.map(lambda p: (
                model_classification.predict(p[0].features), int(p[0].label)))
            #metrics = BinaryClassificationMetrics(scoreAndLabels)
            #areROC = metrics.areaUnderROC
            #print areROC
            scoreAndLabels_list = scoreAndLabels.collect()
            if flag_clean == 0:
                scores = [x for x, _ in scoreAndLabels_list]
                s_labels = [x for _, x in scoreAndLabels_list]
                testing_N = test_cnt_dic[0]
                testing_P = test_cnt_dic[1]
            else:
                scores = [-x for x, _ in scoreAndLabels_list]
                s_labels = [1 - x for _, x in scoreAndLabels_list]
                testing_N = test_cnt_dic[1]
                testing_P = test_cnt_dic[0]
            #print scores
            #print s_labels
            # create ROC data file ======== ====
            roc_auc = ml_create_roc_files(row_id_str, scores, s_labels,
                                          testing_N, testing_P, local_out_dir,
                                          row_id_str)

            perf_measures["roc_auc"] = roc_auc

    # only update db for web request
    if fromweb == "1":
        #print "database update"
        str_sql="UPDATE atdml_document set "+"accuracy = '"+str(accuracy*100)+"%" \
            +"', status = 'learned', processed_date ='"+str(datetime.datetime.now()) \
            +"',ml_opts='"+ml_opts_jstr \
            +"', perf_measures='"+json.dumps(perf_measures) \
            +"', dataset_info='"+json.dumps(dataset_info) \
            +"' where id="+row_id_str
        ret = exec_sqlite.exec_sql(str_sql)
        print "INFO: Data update done! ret=", str(ret)
    else:
        print "INFO: accuracy = '" + str(accuracy * 100) + "%"

    t1 = time()
    print 'INFO: total run time: %f' % (t1 - t00)

    print 'INFO: Finished!'
    return 0
def feat_importance_2way(row_id_str, ds_id, hdfs_feat_dir, score_file_IT,
                         score_file_prob, sp_master, spark_rdd_compress,
                         spark_driver_maxResultSize, sp_exe_memory,
                         sp_core_max, zipout_dir, zipcode_dir, zip_file_name,
                         mongo_tuples, jobname, uploadtype, description_file):

    # zip func in other files for Spark workers ================= ================
    zip_file_path = ml_util.ml_build_zip_file(zipout_dir,
                                              zipcode_dir,
                                              zip_file_name,
                                              prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    # get_spark_context
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname,
                                      [zip_file_path])

    t0 = time()

    # get folder list (labels) from hdfs data_out/<id>/metadata
    dirFile_loc = os.path.join(hdfs_feat_dir, "metadata")
    dirFolders = sc.textFile(dirFile_loc)

    hash_Folders = dirFolders.collect()
    #print "hash_Folders=",hash_Folders
    folder_list = [x.encode('UTF8') for x in hash_Folders]
    #print "INFO: dirFile_loc=",dirFile_loc,", folder_list=",folder_list

    row_num_training = 0

    sample_numbers = []
    sparse_mtx_list = []
    features_list = []
    row_list = []
    col_list = []
    max_feat_list = []

    for folder in folder_list:
        print "INFO: folder:", folder
        label = folder_list.index(folder) + 1
        print 'INFO: label=', label
        # md5 list
        logFile_name = os.path.join(hdfs_feat_dir, folder, mtx_name_list)
        # libsvm data
        logFile_data = os.path.join(hdfs_feat_dir, folder, mtx_libsvm)
        #logFile_data = hdfs_feat_dir + folder + mtx_stat
        #print "INFO: logFile_name=",logFile_name
        #print "INFO: logFile_data=",logFile_data

        logNames = sc.textFile(logFile_name).cache()
        logData = sc.textFile(logFile_data).cache()

        names = logNames.collect()
        data = logData.collect()
        name_l = [x.encode('UTF8') for x in names]
        feature_l = [x.encode('UTF8') for x in data]
        name_list = [name.strip() for name in name_l]
        feature_list = [feat.strip() for feat in feature_l]
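        # name_list holds one sample id (e.g. md5 hash) per row; feature_list holds the matching
        # libsvm-style rows such as "3:1 7:2 10:1" (illustrative values)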

        num_names = len(name_list)
        sample_numbers.append(num_names)
        print 'INFO: sample_numbers=', sample_numbers
        print 'INFO: name_list count=', num_names

        ########generate list for csc_matrix creation #########
        i = 0

        features_training = []
        row_training = []
        col_training = []
        labels_training = []
        max_feat = 0
        while i < num_names:

            features = feature_list[i]

            features = features.strip()
            feature_array = features.split(' ')
            labels_training.append(label)

            length = len(feature_array)
            j = 0
            while j < length:
                feature = feature_array[j]
                feat, value = feature.split(':', 1)
                row_training.append(i)
                col_training.append(int(feat) - 1)
                features_training.append(int(value))
                max_feat = max(max_feat, int(feat))
                j = j + 1
            i = i + 1

        #print "features_training=",features_training
        #print "row_training=",row_training
        #print "col_training=",col_training
        features_training = array(features_training)
        row_training = array(row_training)
        col_training = array(col_training)

        #print "type(row_training)=",type(row_training),",row_training=",row_training
        #print "type(col_training)=",type(col_training),",col_training=",col_training

        max_feat_list.append(max_feat)
        features_list.append(features_training)
        row_list.append(row_training)
        col_list.append(col_training)

        row_num_training = row_num_training + num_names
    #print " END for folder in folder_list: "

    col_num = max(max_feat_list)
    print "INFO: column number=", col_num, ", len(max_feat_list)=", len(
        max_feat_list)

    for i in range(0, len(max_feat_list)):
        sparse_mtx = csc_matrix((features_list[i], (row_list[i], col_list[i])),
                                shape=(sample_numbers[i], col_num))
        sparse_mtx_list.append(sparse_mtx)
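    # sparse_mtx_list[i] is a samples-by-features count matrix for label i
    # (rows = samples of that label, columns = feature ids, shared column count = col_num)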

    #print sparse_mtx_list[0]
    #print "sparse_mtx_list[0].shape=",sparse_mtx_list[0].shape
    #print sparse_mtx_list[1]
    #print "sparse_mtx_list[1].shape=",sparse_mtx_list[1].shape

    exclusive_feature_set_mal = []
    exclusive_feature_set_clean = []
    dic_feature_cnt_mal = {}
    dic_feature_cnt_clean = {}

    dic_score = {}
    dic_cnt_mal = {}
    dic_cnt_clean = {}

    dic_IT_gain = {}

    ####################################################
    ####feature importance algorithms: 2 methods ####### # Only for 2 classes ???
    ####################################################
    if len(sample_numbers) == 2:

        ###################################################
        ################## calculate probability ############
        ###################################################

        print "INFO: ======= Feature Importance(probability) ================"

        for j in range(0, col_num):

            curr_col_dirty = sparse_mtx_list[0].getcol(j)
            sum_col = curr_col_dirty.sum(0)
            cnt_mal = sum_col.tolist()[0][0]

            curr_col_clean = sparse_mtx_list[1].getcol(j)
            sum_col = curr_col_clean.sum(0)
            cnt_clean = sum_col.tolist()[0][0]

            percnt_mal = cnt_mal / float(sample_numbers[0])
            percnt_clean = cnt_clean / float(sample_numbers[1])
            score_j = (percnt_mal + 1 - percnt_clean) / 2
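            # score_j falls in [0, 1]; it approaches 1 when the feature is common in the
            # first (dirty) class and rare in the second (clean) class, and is 0.5 when equally common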

            dic_score[j + 1] = score_j
            dic_cnt_clean[j + 1] = cnt_clean
            dic_cnt_mal[j + 1] = cnt_mal

        sorted_score = sorted(dic_score.items(),
                              key=operator.itemgetter(1),
                              reverse=True)

        #print "sorted_score:", sorted_score
        #print "dic_cnt_clean", dic_cnt_clean
        #print "dic_cnt_mal", dic_cnt_mal

        if os.path.exists(score_file_prob):
            try:
                os.remove(score_file_prob)
            except OSError, e:
                print("Error: %s - %s." % (score_file_prob, e.strerror))

        for ii in range(0, len(sorted_score)):
            (feat, score) = sorted_score[ii]
            #print feat, score, dic_all_columns[feat]

            str01 = str(feat) + "\t" + str(
                score) + "\t" + description_str + "\n"
            with open(score_file_prob, "a") as f:
                f.write(str01)

        ########################################################
        ##################Information Gain (entropy)############
        ########################################################

        print "INFO: ======= Information Gain ================"
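        # For each feature, treat its presence as a binary split of all samples:
        # information gain = entropy of the class labels (parent) minus the weighted
        # entropy of the two child partitions (samples with / without the feature)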
        for j in range(0, col_num):
            cnt_mal = dic_cnt_mal[j + 1]
            cnt_clean = dic_cnt_clean[j + 1]

            total_samples = sample_numbers[0] + sample_numbers[1]

            p0 = float(sample_numbers[0]) / total_samples
            p1 = 1 - p0

            if p0 == 0 or p1 == 0:
                parent_entropy = 0
            else:
                parent_entropy = 0 - p0 * np.log2(p0) - p1 * np.log2(p1)

            if cnt_clean + cnt_mal == 0:
                information_gain = 0
            elif total_samples - cnt_clean - cnt_mal == 0:
                information_gain = 0
            else:
                p0 = float(cnt_mal) / (cnt_clean + cnt_mal)
                p1 = 1 - p0
                if p0 == 0 or p1 == 0:
                    child_left_entropy = 0
                else:
                    child_left_entropy = 0 - p0 * np.log2(p0) - p1 * np.log2(
                        p1)

                p0 = float(sample_numbers[0] - cnt_mal) / (total_samples -
                                                           cnt_clean - cnt_mal)
                p1 = 1 - p0
                if p0 == 0 or p1 == 0:
                    child_right_entropy = 0
                else:
                    child_right_entropy = 0 - p0 * np.log2(p0) - p1 * np.log2(
                        p1)

                weighted_child_entropy = child_left_entropy * float(
                    cnt_clean +
                    cnt_mal) / total_samples + child_right_entropy * float(
                        total_samples - cnt_clean - cnt_mal) / total_samples
                information_gain = parent_entropy - weighted_child_entropy

            dic_IT_grain[j + 1] = information_gain

        sorted_IT_gain = sorted(dic_IT_grain.items(),
                                key=operator.itemgetter(1),
                                reverse=True)

        if os.path.exists(score_file_IT):
            try:
                os.remove(score_file_IT)
            except OSError, e:
                print("ERROR: %s - %s." % (e.score_file_IT, e.strerror))
def pca(row_id_str, ds_id, hdfs_feat_dir, local_out_dir  
    , sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max
    , zipout_dir, zipcode_dir, zip_file_name
    , mongo_tuples, fromweb, pca_jstr
    , jobname, model_data_folder ): 
    
    # create zip files for Spark workers ================= ================
    zip_file_path = ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util')
    print "INFO: zip_file_path=",zip_file_path

    
    # init Spark context ====
    sc=ml_util.ml_get_spark_context(sp_master
        , spark_rdd_compress
        , spark_driver_maxResultSize
        , sp_exe_memory
        , sp_core_max
        , jobname
        , [zip_file_path])     
    

    
    pca_param=json.loads(pca_jstr)
    if "k" in pca_param:
        k=pca_param["k"]
    else:
        k=None
    if "threshold" in pca_param:
        threshold=pca_param["threshold"]
    else:
        threshold=None         
    if "lib" in pca_param:
        lib=pca_param["lib"]
    else:
        lib='mllib'
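    # pca_jstr is expected to carry either "k" (number of principal components to keep) or
    # "threshold" (a cutoff passed to PCA_transform, presumably on explained variance);
    # an illustrative value would be '{"k":50,"lib":"mllib"}' or '{"threshold":0.9,"lib":"mllib"}'.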
  
    ret=-1
    # start here =================================================================== ===============
    t0 = time()
    
    # source libsvm filename  
    libsvm_data_file = os.path.join(hdfs_feat_dir , "libsvm_data")
    print "INFO: libsvm_data_file=", libsvm_data_file
    
    # load sample RDD from text file   
    # format Row(label, features, hash) from get_sample_dataframe() 
    samples_df, feature_count = zip_feature_util.get_sample_dataframe(sc, libsvm_data_file, 0, None)
    print "INFO: feature_count=",feature_count

    #df_pcaed format: hash,label, features 
    (df_pcaed, k, pca_model)=PCA_transform(sc, samples_df,feature_count, threshold, k) 
    print "INFO: Doing PCA... threshold=",threshold,",k=",k
    #print "df_pcaed=",df_pcaed.first()
    #print "k=",k
    #print "pca_model=",pca_model
    #print "pc=",pca_model.pc

    # pca model filename ============================= ===============
    if model_data_folder is None:
        if row_id_str != ds_id:
            # get from parent dataset
            model_data_folder  = os.path.join(config.get('app', 'HADOOP_MASTER'),config.get('app', 'HDFS_MODEL_DIR'), ds_id+"_pca")
        else:
            model_data_folder  = os.path.join(config.get('app', 'HADOOP_MASTER'),config.get('app', 'HDFS_MODEL_DIR'), row_id_str+"_pca")
            
    # create HDFS folder
    try:
        hdfs.mkdir(model_data_folder)
    except IOError as e:
        print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror),". At HDFS=", save_dir
    except:
        print "WARNING: Unexpected error:", sys.exc_info()[0] ,". At HDFS=", save_dir
    
        
    if not threshold is None:
        #pca_fname=os.path.join(hdfs_feat_dir , row_id_str+'_pca_'+str(threshold)+'.ml')
        pca_fname=os.path.join(model_data_folder , 'pca_model_'+str(threshold))
        libsvm_data_pca = os.path.join(hdfs_feat_dir , "libsvm_data_pca_"+str(threshold)+'.ml')
    else:
        pca_fname=os.path.join(model_data_folder , 'pca_model_'+str(k))
        libsvm_data_pca = os.path.join(hdfs_feat_dir , "libsvm_data_pca_"+str(k)+'.ml')
    
    # save pca model to HDFS ===============
    print "INFO: pca_fname=",pca_fname
    pca_model.write().overwrite().save(pca_fname)
    
    # save pca data to HDFS ============================= ===============
    print "INFO: libsvm_data_pca=",libsvm_data_pca
    # construct libsvm string
    libsvm_rdd=df_pcaed.rdd.map(lambda p: p[0]+" "+str(int(p[1]))+zip_feature_util.dv2libsvm(p[2].toArray()))
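    # each output line becomes "<hash> <label><libsvm features>", where dv2libsvm() presumably
    # renders the dense PCA vector in sparse " index:value" libsvm notation.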
    
    # clean up old libsvm file ============================= ===============
    try:
        hdfs.rmr(libsvm_data_pca)
    except IOError as e:
        print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at rmr():", sys.exc_info()[0]     

    # overwrite pca file at hdfs
    libsvm_rdd.saveAsTextFile(libsvm_data_pca)

    
    t1 = time()
    print 'INFO: PCA processing time: %f' %(t1-t0)
    
    ### insert pca_param into mongoDB  ###
    filter='{"rid":'+row_id_str+',"key":"pca_param"}'
    if not threshold is None:
        pca_param["threshold"]=threshold
    if not k is None:
        pca_param["k"]=k
    
    print "INFO: pca_param=",pca_param
    upsert_flag=True
    jstr_insert = '{ "rid":'+row_id_str+',"key":"pca_param", "value":'+json.dumps(pca_param)+'}'
    ret=query_mongo.upsert_doc_t(mongo_tuples,filter,jstr_insert,upsert_flag)
    print "INFO: Upsert count for pca_param=",ret
    
    # only update db for web request   ===========
    if fromweb=="1": 
        #print "database update"
        str_sql="UPDATE atdml_document set "  \
            +" status = 'pca-ed', processed_date ='"+str(datetime.datetime.now()) \
            +"' , ml_pca_opts = '"+json.dumps(pca_param) \
            +"' where id="+row_id_str
        ret=exec_sqlite.exec_sql(str_sql)
        print "INFO: Update Sqlite DB done! ret=", str(ret)

    
    t1 = time()
    print 'INFO: running time: %f' %(t1-t0)
    
    #print 'Finished!'
    return 0
def train(row_id_str,
          ds_id,
          hdfs_feat_dir,
          local_out_dir,
          ml_opts_jstr,
          excluded_feat_cslist,
          sp_master,
          spark_rdd_compress,
          spark_driver_maxResultSize,
          sp_exe_memory,
          sp_core_max,
          zipout_dir,
          zipcode_dir,
          zip_file_name,
          mongo_tuples,
          labelnameflag,
          fromweb,
          training_fraction,
          jobname,
          random_seed=None):

    ### generate data folder and out folder, clean up if needed
    #local_out_dir = local_out_dir + "/"
    #if os.path.exists(local_out_dir):
    #    shutil.rmtree(local_out_dir) # to keep smaplelist file
    if not os.path.exists(local_out_dir):
        os.makedirs(local_out_dir)

    # create zip files for Spark workers ================= ================
    zip_file_path = ml_util.ml_build_zip_file(zipout_dir,
                                              zipcode_dir,
                                              zip_file_name,
                                              prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    # get_spark_context
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname,
                                      [zip_file_path])

    t0 = time()

    # check if ml_opts.has_excluded_feat ==1 ===================================
    has_excluded_feat = 0
    ml_opts = {}
    if not ml_opts_jstr is None:
        ml_opts = json.loads(ml_opts_jstr)
        if "has_excluded_feat" in ml_opts:
            has_excluded_feat = ml_opts["has_excluded_feat"]
    #print "has_excluded_feat=",has_excluded_feat,",excluded_feat_cslist=",excluded_feat_cslist

    # get excluded feature list from mongo ========== ===
    if str(has_excluded_feat) == "1" and excluded_feat_cslist is None:
        excluded_feat_cslist = ml_util.ml_get_excluded_feat(
            row_id_str, mongo_tuples)
    print "INFO: excluded_feat_cslist=", excluded_feat_cslist

    # filename for featured data
    libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data")
    print "INFO: libsvm_data_file:", libsvm_data_file

    # load feature count file
    feat_count_file = libsvm_data_file + "_feat_count"
    feature_count = zip_feature_util.get_feature_count(sc, feat_count_file)
    print "INFO: feature_count=", feature_count

    # load sample RDD from text file
    #   also exclude selected features in sample ================ =====
    # format (LabeledPoint,hash) from str2LabeledPoint_hash()
    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file)
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(
        sc, libsvm_data_file, feature_count, excluded_feat_cslist)

    # get distinct label list
    labels_list_all = samples_rdd.map(
        lambda p: p[0].label).distinct().collect()

    # split samples to training and testing data, format (LabeledPoint,hash)
    training_rdd, testing_rdd = samples_rdd.randomSplit(
        [training_fraction, 1 - training_fraction],
        seed=None if random_seed is None else int(random_seed))
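    # e.g. training_fraction=0.8 keeps roughly 80% of samples for training; passing the same
    # seed makes the split reproducible across runs.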
    training_rdd = training_rdd.map(lambda p: p[0])  # keep LabeledPoint only
    training_rdd.cache()
    training_sample_count = training_rdd.count()
    training_lbl_cnt_list = training_rdd.map(
        lambda p: (p.label, 1)).reduceByKey(add).collect()
    testing_rdd.cache()
    testing_sample_count = testing_rdd.count()
    testing_lbl_cnt_list = testing_rdd.map(
        lambda p: (p[0].label, 1)).reduceByKey(add).collect()
    sample_count = training_sample_count + testing_sample_count

    # random_seed testing
    if not random_seed is None:
        all_t = testing_rdd.collect()
        all_t = sorted(all_t, key=lambda x: x[1])
        cnt = 0
        for i in all_t:
            print i[1]
            cnt = cnt + 1
            if cnt > 3:
                break

    t1 = time()
    print "INFO: training sample count=", training_sample_count, ", testing sample count=", testing_sample_count
    print "INFO: training label list=", training_lbl_cnt_list, ", testing label list=", testing_lbl_cnt_list
    print "INFO: labels_list_all=", labels_list_all
    print "INFO: training and testing samples generated!"
    print 'INFO: running time: %f' % (t1 - t0)
    t0 = t1

    ###############################################
    ###########build learning model################
    ###############################################

    ### get the parameters###
    print "INFO: ======Learning Algorithm and Parameters============="
    #ml_opts = json.loads(ml_opts_jstr)
    model_name = ml_opts[
        'learning_algorithm']  # 1: linear_svm_with_sgd; 2: logistic_regression_with_lbfgs; 3: logistic_regression_with_sgd
    iteration_num = 0
    if 'iterations' in ml_opts:
        iteration_num = ml_opts['iterations']
    C = 0
    if 'c' in ml_opts:
        C = eval(ml_opts['c'])
    regularization = ""
    if 'regularization' in ml_opts:
        regularization = ml_opts['regularization']

    print "INFO: Learning Algorithm: ", model_name
    print "INFO: C = ", C
    print "INFO: iterations = ", iteration_num
    print "INFO: regType = ", regularization
    regP = C / float(training_sample_count)
    print "INFO: Calculated: regParam = ", regP

    ### generate label names (family names) #####
    ### connect to database to get the column list which contains all column number of the corresponding feature####
    if labelnameflag == 1:
        '''
        key = "dic_name_label"
        jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
        jstr_proj='{"value":1}'
 
        # get parent dataset's data
        if ds_id != row_id_str:
            jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'
 
        doc=query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
        dic_list = doc['value']
        print "INFO: dic_list=",dic_list
        
        label_dic = {}
        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                label_dic[dic_list[i][key]] = key.encode('UTF8')
        '''
        label_dic = ml_util.ml_get_label_dict(row_id_str, mongo_tuples, ds_id)
        print "INFO: label_dic:", label_dic
    else:
        label_dic = {}
        label_set = set(labels_list_all)
        for label_value in label_set:
            label_dic[int(label_value)] = str(int(label_value))
        print "INFO: generated label_dic:", label_dic

    labels_list = []
    for key in sorted(label_dic):
        labels_list.append(label_dic[key])
    print "INFO: labels:", labels_list
    class_num = len(labels_list)
    if class_num > 2:
        print "INFO: Multi-class classification! Number of classes = ", class_num

    ### build model ###

    if model_name == "linear_svm_with_sgd":
        ### 1: linearSVM
        print "INFO: ====================1: Linear SVM============="
        model_classification = SVMWithSGD.train(
            training_rdd,
            regParam=regP,
            iterations=iteration_num,
            regType=regularization)  # regParam = 1/(sample_number*C)
        #print model_classification
    elif model_name == "logistic_regression_with_lbfgs":
        ### 2: LogisticRegressionWithLBFGS
        print "INFO: ====================2: LogisticRegressionWithLBFGS============="
        model_classification = LogisticRegressionWithLBFGS.train(
            training_rdd,
            regParam=regP,
            iterations=iteration_num,
            regType=regularization,
            numClasses=class_num)  # regParam = 1/(sample_number*C)
    elif model_name == "logistic_regression_with_sgd":
        ### 3: LogisticRegressionWithSGD
        print "INFO: ====================3: LogisticRegressionWithSGD============="
        model_classification = LogisticRegressionWithSGD.train(
            training_rdd,
            regParam=regP,
            iterations=iteration_num,
            regType=regularization)  # regParam = 1/(sample_number*C)
    else:
        print "INFO: Training model selection error: no valid ML model selected!"
        return

    print "INFO: model type=", type(model_classification)

    # create feature coefficient file ================================
    coef_arr = None
    intercept = None
    if model_classification.weights is None:
        print "WARNING: model weights not found!"
    else:
        coef_weights = model_classification.weights
        #print "coef_weights=",coef_weights
        #print type(coef_weights),coef_weights.shape
        coef_arr = coef_weights.toArray().tolist()
        # save coef_arr to mongo
        key = "coef_arr"
        ret = ml_util.save_json_t(row_id_str, key, coef_arr, mongo_tuples)

        # save coef_arr to local file
        if ret == 0:
            # drop old record in mongo
            filter = '{"rid":' + row_id_str + ',"key":"coef_arr"}'
            ret = query_mongo.delete_many(mongo_tuples, None, filter)
            if not os.path.exists(local_out_dir):
                os.makedirs(local_out_dir)
            fn_ca = os.path.join(local_out_dir, row_id_str,
                                 row_id_str + "_coef_arr.pkl")
            print "INFO: fn_ca=", fn_ca
            ml_util.ml_pickle_save(coef_arr, fn_ca)

        # save intercept to mongo
        intercept = model_classification.intercept
        key = "coef_intercept"
        ret = ml_util.save_json_t(row_id_str, key, intercept, mongo_tuples)

        # feature list + coef file =============
        feat_filename = os.path.join(local_out_dir,
                                     row_id_str + "_feat_coef.json")
        print "INFO: feat_filename=", feat_filename

        # create feature, coef & raw string file =============================================== ============
        # expect a dict of {"fid":(coef, feature_raw_string)}
        jret = ml_util.build_feat_list_t(row_id_str, feat_filename, None, None,
                                         coef_arr, ds_id, mongo_tuples)

        # special featuring for IN or libsvm
        if jret is None:
            jret = ml_util.build_feat_coef_raw_list_t(row_id_str,
                                                      feat_filename, coef_arr,
                                                      ds_id, mongo_tuples)
        if jret is None:
            print "WARNING: Cannot create sample list for testing dataset. "

        jfeat_coef_dict = jret
        print "INFO: coef_arr len=", len(
            coef_arr), ", feature_count=", feature_count
        # for multi-class
        if len(coef_arr) != feature_count:
            jfeat_coef_dict = {}
            print "WARNING: coef count didn't match feature count.  multi-class classification was not supported"

        # Calculate prediction and Save testing dataset
        bt_coef_arr = sc.broadcast(coef_arr)
        bt_intercept = sc.broadcast(intercept)
        bt_jfeat_coef_dict = sc.broadcast(jfeat_coef_dict)
        ### Evaluating the model on testing dataset: label, predict label, score, feature list
        print "INFO: intercept=", intercept
        print "INFO: coef_arr len=", len(coef_arr), type(coef_arr)
        print "INFO: jfeat_coef_dict len=", len(
            jfeat_coef_dict)  #, jfeat_coef_dict
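        # calculate_hypothesis() presumably returns the raw decision value (roughly
        # coef.dot(features) + intercept, or the equivalent logistic score) so a per-sample
        # score can be reported next to the predicted label.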

        # get prediction of testing dataset : (tlabel, plabel, score, libsvm, raw feat str, hash) ==============================
        if len(coef_arr) == feature_count:
            testing_pred_rdd = testing_rdd.map(lambda p: (
                 p[0].label \
                ,model_classification.predict(p[0].features) \
                ,zip_feature_util.calculate_hypothesis(p[0].features, bt_coef_arr.value, bt_intercept.value, model_name) \
                ,p[0].features \
                ,p[1] \
            ) ).cache()
        else:  # for multi-class, no prediction score; TBD for better solution: how to display multiple weights for each class
            testing_pred_rdd = testing_rdd.map(lambda p: (
                 p[0].label \
                ,model_classification.predict(p[0].features) \
                ,"-" \
                ,p[0].features \
                ,p[1] \
            ) ).cache()
        ''',p[0].features.dot(bt_coef_arr.value)+bt_intercept.value \
        # Save testing dataset for analysis
        libsvm_testing_output = hdfs_feat_dir + "libsvm_testing_output_"+row_id_str
        print "INFO: libsvm_testing_output=", libsvm_testing_output
        try:
            hdfs.rmr(libsvm_testing_output)
        except IOError as e:
            print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
        except:
            print "WARNING: Unexpected error at libsvm_testing_output file clean up:", sys.exc_info()[0] 
        # save only false prediction?
        #testing_pred_rdd.filter(lambda p: p[0] != p[1]).saveAsTextFile(libsvm_testing_output)
        testing_pred_rdd.saveAsTextFile(libsvm_testing_output)
        
        '''
        #test_tmp=testing_pred_rdd.collect()

        # save false prediction to local file
        false_pred_fname = os.path.join(local_out_dir,
                                        row_id_str + "_false_pred.json")
        print "INFO: false_pred_fname=", false_pred_fname
        false_pred_data=testing_pred_rdd.filter(lambda p: p[0] != p[1])\
            .map(lambda p: (p[0],p[1],p[2] \
            ,zip_feature_util.get_dict_coef_raw4feat(zip_feature_util.sparseVector2dict(p[3]), bt_jfeat_coef_dict.value)
            ,p[4]  ) ) \
            .collect()
        print "INFO: false predicted count=", len(false_pred_data)
        false_pred_arr = []
        with open(false_pred_fname, "w") as fp:
            for sp in false_pred_data:
                jsp = {
                    "tlabel": sp[0],
                    "plabel": sp[1],
                    "score": sp[2],
                    "feat": sp[3],
                    "hash": sp[4]
                }
                #print "jsp=",jsp
                false_pred_arr.append(jsp)
            fp.write(json.dumps(false_pred_arr))

        # save prediction results, format: label, prediction, hash
        pred_ofname = os.path.join(local_out_dir,
                                   row_id_str + "_pred_output.pkl")
        print "INFO: pred_ofname=", pred_ofname
        pred_out_arr = testing_pred_rdd.map(lambda p:
                                            (p[0], p[1], p[4])).collect()
        ml_util.ml_pickle_save(pred_out_arr, pred_ofname)
        '''
        one_item= testing_pred_rdd.first()
        print "one_item=",one_item
        sparse_arr=one_item[3]

        dict_feat=zip_feature_util.sparseVector2dict(sparse_arr)
        print "len=",len(dict_feat),"dict_feat=",dict_feat
        dict_weit=zip_feature_util.add_coef2dict(coef_arr,dict_feat)
        print "len=",len(dict_weit),"dict_weit=",dict_weit
        '''
    # Calculate Accuracy. labelsAndPreds = (true_label,predict_label)
    labelsAndPreds = testing_pred_rdd.map(lambda p: (p[0], p[1]))
    labelsAndPreds.cache()
    testing_sample_number = testing_rdd.count()
    testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(
        testing_sample_number)
    accuracy = 1 - testErr
    print "INFO: Accuracy = ", accuracy

    ### Save model
    #save_dir = config.get('app', 'HADOOP_MASTER')+'/user/hadoop/yigai/row_6/'
    #save_dir = config.get('app', 'HADOOP_MASTER')+config.get('app', 'HDFS_MODEL_DIR')+'/'+row_id_str
    save_dir = os.path.join(config.get('app', 'HADOOP_MASTER'),
                            config.get('app', 'HDFS_MODEL_DIR'), row_id_str)
    try:
        hdfs.ls(save_dir)
        #print "find hdfs folder"
        hdfs.rmr(save_dir)
        #print "all files removed"
    except IOError as e:
        print "WARNING: I/O error({0}): {1}".format(
            e.errno, e.strerror), ". At HDFS=", save_dir
    except:
        print "WARNING: Unexpected error:", sys.exc_info(
        )[0], ". At HDFS=", save_dir
    model_classification.save(sc, save_dir)

    ###load model if needed
    #sameModel = SVMModel.load(sc, save_dir)

    t1 = time()
    print 'INFO: training run time: %f' % (t1 - t0)
    t0 = t1

    ###############################################
    ###########plot prediction result figure ==================================================== ===============
    ###############################################

    labels = labelsAndPreds.collect()
    true_label_list = [x for x, _ in labels]
    pred_label_list = [x for _, x in labels]

    pred_fname = os.path.join(local_out_dir, row_id_str + "_1" + ".png")
    true_fname = os.path.join(local_out_dir, row_id_str + "_2" + ".png")
    pred_xlabel = 'Prediction (Single Run)'
    true_xlabel = 'True Labels (Single Run)'
    test_cnt_dic = ml_util.ml_plot_predict_figures(
        pred_label_list, true_label_list, labels_list, label_dic,
        testing_sample_count, pred_xlabel, pred_fname, true_xlabel, true_fname)
    print "INFO: figure files: ", pred_fname, true_fname
    #print "INFO: Number of samples in each label is=", test_cnt_dic

    roc_auc = None
    perf_measures = None
    dataset_info = {
        "training_fraction": training_fraction,
        "class_count": class_num,
        "dataset_count": sample_count
    }
    #############################################################
    ###################for 2 class only (plot ROC curve) ==================================================== ===============
    #############################################################
    if len(labels_list) == 2:

        do_ROC = True
        reverse_label_dic = dict((v, k) for k, v in label_dic.items())
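        # e.g. label_dic={0:'clean', 1:'dirty'} becomes reverse_label_dic={'clean':0, 'dirty':1},
        # so the id of the negative ("clean"/"benign") class can be looked up by name below.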
        if 'clean' in reverse_label_dic:
            flag_clean = reverse_label_dic['clean']
        elif 'benign' in reverse_label_dic:
            flag_clean = reverse_label_dic['benign']
        elif '0' in reverse_label_dic:
            flag_clean = 0
        else:
            print "INFO: No ROC curve generated: 'clean','benign' or '0' must be a label for indicating negative class!"
            do_ROC = False

        # build data file for score graph
        score_graph_fname = os.path.join(local_out_dir,
                                         row_id_str + "_score_graph.json")
        print "INFO: score_graph_fname=", score_graph_fname

        # build score_arr_0, score_arr_1
        #    format: tlabel, plabel, score, libsvm, raw feat str, hash
        graph_arr = testing_pred_rdd.map(lambda p:
                                         (int(p[0]), float(p[2]))).collect()
        score_arr_0 = []
        score_arr_1 = []
        max_score = 0
        min_score = 0
        for p in graph_arr:
            if p[0] == 0:
                score_arr_0.append(p[1])
            else:
                score_arr_1.append(p[1])
            # save max,min score
            if p[1] > max_score:
                max_score = p[1]
            elif p[1] < min_score:
                min_score = p[1]

        ml_build_pred_score_graph(score_arr_0, score_arr_1, model_name,
                                  score_graph_fname, max_score, min_score)

        if do_ROC:

            perf_measures = ml_util.calculate_fscore(true_label_list,
                                                     pred_label_list)
            print "RESULT: perf_measures=", perf_measures
            '''
            # calculate fscore  ==========
            tp = labelsAndPreds.filter(lambda (v, p): v == 1 and p==1 ).count() 
            fp = labelsAndPreds.filter(lambda (v, p): v == 0 and p==1 ).count() 
            fn = labelsAndPreds.filter(lambda (v, p): v == 1 and p==0 ).count() 
            tn = labelsAndPreds.filter(lambda (v, p): v == 0 and p==0 ).count() 
            print "RESULT: tp=",tp,",fp=",fp,",fn=",fn,",tn=",tn
            precision=float(tp)/(tp+fp)
            recall=float(tp)/(tp+fn)
            print "RESULT: precision=",precision,",recall=",recall
            acc=(tp+tn)/(float(testing_sample_number))
            fscore=2*((precision*recall)/(precision+recall))
            print "RESULT: fscore=",fscore,",acc=",acc  
            '''
            model_classification.clearThreshold()
            scoreAndLabels = testing_rdd.map(lambda p: (
                model_classification.predict(p[0].features), int(p[0].label)))
            #metrics = BinaryClassificationMetrics(scoreAndLabels)
            #areROC = metrics.areaUnderROC
            #print areROC
            scoreAndLabels_list = scoreAndLabels.collect()

            if flag_clean == 0:
                scores = [x for x, _ in scoreAndLabels_list]
                s_labels = [x for _, x in scoreAndLabels_list]
                testing_N = test_cnt_dic[0]
                testing_P = test_cnt_dic[1]
            else:
                scores = [-x for x, _ in scoreAndLabels_list]
                s_labels = [1 - x for _, x in scoreAndLabels_list]
                testing_N = test_cnt_dic[1]
                testing_P = test_cnt_dic[0]

            # create ROC data file ======== ====
            roc_auc = ml_create_roc_files(row_id_str, scores, s_labels,
                                          testing_N, testing_P, local_out_dir,
                                          row_id_str)
            #, local_out_dir, file_name_given)

            perf_measures["roc_auc"] = roc_auc

    # only update db for web request ==================================================== ===============
    if fromweb == "1":
        #print "database update"
        str_sql="UPDATE atdml_document set "+"accuracy = '"+str(accuracy*100)+"%" \
            +"', status = 'learned', processed_date ='"+str(datetime.datetime.now()) \
            +"', perf_measures='"+json.dumps(perf_measures) \
            +"', dataset_info='"+json.dumps(dataset_info) \
            +"' where id="+row_id_str
        ret = exec_sqlite.exec_sql(str_sql)
        print "INFO: Data update done! ret=", str(ret)
    else:
        print "INFO: accuracy = '" + str(accuracy * 100) + "%"

    print 'INFO: Finished!'
    return 0
def feat_importance_firm(row_id_str, ds_id, hdfs_feat_dir, local_score_file,
                         sp_master, spark_rdd_compress,
                         spark_driver_maxResultSize, sp_exe_memory,
                         sp_core_max, zipout_dir, zipcode_dir, zip_file_name,
                         mongo_tuples, training_fraction, jobname, uploadtype,
                         description_file):

    # zip func in other files for Spark workers ================= ================
    zip_file_path = ml_util.ml_build_zip_file(zipout_dir,
                                              zipcode_dir,
                                              zip_file_name,
                                              prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    # get_spark_context
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname,
                                      [zip_file_path])

    t0 = time()

    # get feature seq mapping from mongo
    if uploadtype == "MD5 List IN-dynamic":
        ### connect to database to get the column list which contains all column number of the corresponding feature
        key = "dict_dynamic"
        jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
        jstr_proj = '{"value":1}'

        # get parent dataset's data
        if ds_id != row_id_str:
            jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'

        doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
        dic_list = doc['value']

        dic_all_columns = {}
        max_feature = 0
        # reverse dict{hashes:sequence number} ======
        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                dic_all_columns[eval(dic_list[i][key])] = key
                if eval(dic_list[i][key]) > max_feature:
                    max_feature = eval(dic_list[i][key])
        print "INFO: max_feature=", max_feature
        #print "dic_all_columns=",dic_all_columns # fid:numb,numb

    dirFile_loc = os.path.join(hdfs_feat_dir, "metadata")
    dirFolders = sc.textFile(dirFile_loc)

    hash_Folders = dirFolders.collect()
    #print "INFO: dirFile_loc=",dirFile_loc,", hash_Folders=",hash_Folders
    folder_list = [x.encode('UTF8') for x in hash_Folders]
    print "INFO: hdfs folder_list=", folder_list  #['dirty/', 'clean/']

    features_training = []
    labels_training = []
    names_training = []
    row_training = []
    col_training = []
    max_feat_training = 0
    row_num_training = 0
    features_testing = []
    labels_testing = []
    names_testing = []
    row_testing = []
    col_testing = []
    max_feat_testing = 0
    row_num_testing = 0

    # loop through hdfs folders; TBD
    for folder in folder_list:
        print "INFO: folder=", folder
        label = folder_list.index(folder) + 1
        print 'INFO: label=', label

        logFile_name = os.path.join(hdfs_feat_dir, folder, mtx_name_list)
        #print "logFile_name=",logFile_name
        logFile_data = os.path.join(hdfs_feat_dir, folder, mtx_libsvm)
        #print "logFile_data=",logFile_data

        logNames = sc.textFile(logFile_name).cache()
        logData = sc.textFile(logFile_data).cache()

        names = logNames.collect()
        data = logData.collect()
        name_l = [x.encode('UTF8') for x in names]
        feature_l = [x.encode('UTF8') for x in data]
        name_list = [names.strip() for names in name_l]
        feature_list = [features.strip() for features in feature_l]

        ##########data separation######
        id_perm = data_seperation_random(name_list)

        num_names = len(name_list)
        print 'INFO: num of samples=', num_names
        num_train = int(training_fraction * num_names)
        print 'INFO: num_train = ', num_train

        ########generate training data#########
        i = 0
        #print "INFO: generate training data"
        #print "INFO: len(id_perm)=",len(id_perm)
        while i < num_train:
            #print i, id_perm[i]
            features = feature_list[id_perm[i]]

            features = features.strip()
            feature_array = features.split(' ')
            labels_training.append(label)

            length = len(feature_array)
            j = 0
            while j < length:
                feature = feature_array[j]
                feat, value = feature.split(':', 2)
                row_training.append(i + row_num_training)
                col_training.append(int(feat) - 1)
                features_training.append(int(value))
                max_feat_training = max(max_feat_training, int(feat))
                j = j + 1
            i = i + 1
        row_num_training = row_num_training + num_train
        i = num_train
        ########generate testing data#########
        while i < num_names:

            ####for generating testing data folder####
            test_file_name = name_list[id_perm[i]]

            features = feature_list[id_perm[i]]

            features = features.strip()
            feature_array = features.split(' ')
            labels_testing.append(label)

            length = len(feature_array)
            j = 0
            while j < length:
                feature = feature_array[j]
                feat, value = feature.split(':', 2)
                row_testing.append(i - num_train + row_num_testing)
                col_testing.append(int(feat) - 1)
                features_testing.append(int(value))
                max_feat_testing = max(max_feat_testing, int(feat))
                j = j + 1
            i = i + 1
        row_num_testing = row_num_testing + (num_names - num_train)

    # end for loop here ========================

    col_num = max(max_feat_training, max_feat_testing)
    if max_feat_training < col_num:
        for i in range(0, row_num_training):
            for j in range(max_feat_training, col_num):
                features_training.append(0)
                row_training.append(i)
                col_training.append(j)
    elif max_feat_testing < col_num:
        for i in range(0, row_num_testing):
            for j in range(max_feat_testing, col_num):
                features_testing.append(0)
                row_testing.append(i)
                col_testing.append(j)

    features_training = array(features_training)
    row_training = array(row_training)
    col_training = array(col_training)
    #print "row_training:", row_training
    #print "INFO: col_training:", col_training
    len_col = len(col_training)
    print "INFO: col_num:", col_num
    labels_training = array(labels_training)

    features_testing = array(features_testing)
    row_testing = array(row_testing)

    col_testing = array(col_testing)
    labels_testing = array(labels_testing)

    sparse_mtx = csc_matrix((features_training, (row_training, col_training)),
                            shape=(row_num_training, col_num))
    #print "sparse_mtx.todense(), sparse_mtx.shape=",sparse_mtx.todense(), sparse_mtx.shape

    sparse_test = csc_matrix((features_testing, (row_testing, col_testing)),
                             shape=(row_num_testing, col_num))
    #print " sparse_test.todense(), sparse_test.shape=",sparse_test.todense(), sparse_test.shape

    clf = svm.LinearSVC()
    #clf = svm.SVC(C=0.1, kernel='rbf', degree=3, gamma=0.05, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None)
    #clf = svm.NuSVC(nu=0.3, kernel='rbf', degree=3, gamma=0.05, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, verbose=False, max_iter=-1, random_state=None)
    #print "labels_training=",labels_training
    #print "sparse_mtx=",sparse_mtx
    clf.fit(sparse_mtx, labels_training)

    #print "INFO: model:intercept=",clf.intercept_
    #print "INFO: model:coef=",clf.coef_

    labels_pred = clf.predict(sparse_test)
    #print "labels_pred:", labels_pred

    accuracy = clf.score(sparse_test, labels_testing)
    #print "INFO: data folder=", hdfs_feat_dir
    print "INFO: accuracy=", accuracy

    #####################################################################
    ##################calculate feature importance with prediction labels#######################
    #####################################################################
    AA = sparse_mtx.todense()
    BB = sparse_test.todense()
    labels_train_pred = clf.predict(sparse_mtx)
    labels_test_pred = labels_pred

    #print "###################################################################################"
    print "INFO: ======= Calculate feature importance with predication labels =================="
    #print "###################################################################################"
    dic_importance_label = {}
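    # FIRM-style importance per feature j, as computed in the loop below (assuming 0/1 feature values):
    #   q_pos = mean of (3 - predicted_label) over samples that have feature j
    #   q_neg = mean of (3 - predicted_label) over samples that lack it
    #   Q = (q_pos - q_neg) * sqrt(q_pos * q_neg) / n_total
    # labels are 1/2 here, so 3 - label swaps them and a larger Q leans toward the first folder's class;
    # sentinel scores -100 / -200 mark columns that are all zeros / present in every sample.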

    for j in range(0, col_num):  ###for all features in the loop

        ##############################
        #print "====new way with sparse matrix========="
        curr_col_train = sparse_mtx.getcol(j)
        sum_col = curr_col_train.sum(0)
        positive_feature_number = int(sum_col.tolist()[0][0])

        labels_value = 3 - labels_train_pred
        dot_product = csr_matrix(np.array(labels_value)).dot(curr_col_train)
        sum_product = dot_product.sum(1)
        labels_positive_sum = int(sum_product.tolist()[0][0])

        sum_label_values = sum(labels_value)
        labels_negitive_sum = sum_label_values - labels_positive_sum

        ##############################
        #print "====new way with sparse matrix========="
        curr_col_test = sparse_test.getcol(j)
        sum_col = curr_col_test.sum(0)
        positive_feature_number = positive_feature_number + int(
            sum_col.tolist()[0][0])

        labels_value = 3 - labels_test_pred
        dot_product = csr_matrix(np.array(labels_value)).dot(curr_col_test)
        sum_product = dot_product.sum(1)
        labels_positive_sum = labels_positive_sum + int(
            sum_product.tolist()[0][0])

        sum_label_values = sum(labels_value)
        labels_negitive_sum = labels_negitive_sum + sum_label_values - int(
            sum_product.tolist()[0][0])

        n_total = row_num_training + row_num_testing
        negitive_feature_number = n_total - positive_feature_number
        if positive_feature_number == 0:
            #print "feature ", j+1, "all 0s!"
            dic_importance_label[j + 1] = -100
        elif negitive_feature_number == 0:
            #print "feature ", j+1, "all 1s!"
            dic_importance_label[j + 1] = -200
        else:
            q_positive = float(labels_positive_sum) / positive_feature_number
            q_negitive = float(labels_negitive_sum) / negitive_feature_number

            Q = (q_positive - q_negitive) * sqrt(
                float(q_positive) * q_negitive / float(n_total) /
                float(n_total))
            dic_importance_label[j + 1] = Q

    sorted_importance = sorted(dic_importance_label.items(),
                               key=operator.itemgetter(1),
                               reverse=True)
    print "INFO: ======= Feature Importance(FIRM score) ================"

    if os.path.exists(local_score_file):
        try:
            os.remove(local_score_file)
        except OSError, e:
            print("ERROR: %s - %s." % (e.local_score_file, e.strerror))
def feat_importance_firm(row_id_str, ds_id, hdfs_feat_dir, local_score_file,
                         sp_master, spark_rdd_compress,
                         spark_driver_maxResultSize, sp_exe_memory,
                         sp_core_max, zipout_dir, zipcode_dir, zip_file_name,
                         mongo_tuples, training_fraction, jobname, uploadtype):

    # zip func in other files for Spark workers ================= ================
    zip_file_path = ml_util.ml_build_zip_file(zipout_dir,
                                              zipcode_dir,
                                              zip_file_name,
                                              prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    # get_spark_context
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname,
                                      [zip_file_path])
    '''
    SparkContext.setSystemProperty('spark.rdd.compress', config.get('spark', 'spark_rdd_compress'))
    SparkContext.setSystemProperty('spark.driver.maxResultSize', config.get('spark', 'spark_driver_maxResultSize'))
    SparkContext.setSystemProperty('spark.executor.memory', args.exe_memory)
    SparkContext.setSystemProperty('spark.cores.max', args.core_max)
    sc = SparkContext(args.sp_master, 'feature_importance_FRIM:'+str(args.row_id))
    '''

    t0 = time()

    # get folder list (labels) from hdfs data_out/<id>/metadata  ==============
    dirFile_loc = os.path.join(hdfs_feat_dir, "metadata")
    dirFolders = sc.textFile(dirFile_loc)

    hash_Folders = dirFolders.collect()
    print "INFO: dirFile_loc=", dirFile_loc, ", hash_Folders=", hash_Folders
    folder_list = [x.encode('UTF8') for x in hash_Folders]
    print "INFO: folder_list=", folder_list  #['dirty/', 'clean/']

    # get feature seq : ngram hash mapping ==================================
    key = "dic_seq_hashes"  #{"123":"136,345"}
    jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
    jstr_proj = '{"value":1}'

    # get parent dataset's data
    if ds_id != row_id_str:
        jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'

    doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
    dic_list = doc['value']

    dic_all_columns = dic_list
    feature_count = len(dic_list)

    #print "INFO: feature_count=",feature_count
    #print "dic_list=",dic_list #{u'123,345':u'136'}
    #print "dic_all_columns=",dic_all_columns # {1: u'8215,8216'}
    # end

    # get {hash : raw string} mapping ==================================
    key = "dic_hash_str"  #{"123":"openFile"}
    jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
    jstr_proj = '{"value":1}'
    # get parent dataset's data
    if ds_id != row_id_str:
        jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'

    doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
    dic_hash_str = doc['value']
    '''
    # get folder list (labels) from hdfs data_out/<id>/libsvm  ==============
    libsvm_loc = os.path.join(hdfs_feat_dir , "libsvm_data")    

    print "INFO: libsvm_loc=", libsvm_loc
    samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_loc)
    '''

    # filename for featured data
    libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data")
    print "INFO: libsvm_data_file=", libsvm_data_file

    # load feature count file
    #feat_count_file=libsvm_data_file+"_feat_count"
    #feature_count=zip_feature_util.get_feature_count(sc,feat_count_file)
    print "INFO: feature_count=", feature_count

    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file)
    # load sample RDD from text file
    #   also exclude selected features in sample ================ =====
    # format (LabeledPoint,hash) from str2LabeledPoint_hash()
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(
        sc, libsvm_data_file, feature_count, excluded_feat_cslist=None)

    labels_and_features_rdd = samples_rdd.map(lambda p:
                                              (p[0].label, p[0].features))

    all_data = labels_and_features_rdd.collect()
    features_list = [x.toArray() for _, x in all_data]
    labels_list_all = [x for x, _ in all_data]
    labels_list_all = np.array(labels_list_all)
    features_array = np.array(features_list)

    ### generate sparse matrix (csr) for all samples
    features_sparse_mtx = csr_matrix(features_array)

    ### randomly split the samples into training and testing data
    sparse_mtx, sparse_test, labels_training, labels_testing = \
        cross_validation.train_test_split(features_sparse_mtx, labels_list_all, test_size=(1-training_fraction))

    #print "INFO: sparse_mtx.shape=",sparse_mtx.shape
    #print "INFO: sparse_test.shape=",sparse_test.shape
    row_num_training = (sparse_mtx.shape)[0]
    row_num_testing = (sparse_test.shape)[0]

    # why use LinearSVC ?
    clf = svm.LinearSVC()
    #clf = svm.SVC(C=0.1, kernel='rbf', degree=3, gamma=0.05, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None)
    #clf = svm.NuSVC(nu=0.3, kernel='rbf', degree=3, gamma=0.05, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, verbose=False, max_iter=-1, random_state=None)
    #print "labels_training=",labels_training
    #print "sparse_mtx=",sparse_mtx
    clf.fit(sparse_mtx, labels_training)

    #print "**model:intercept***"
    #print clf.intercept_
    #print "**model:coef***"
    #print clf.coef_
    col_num = len(clf.coef_[0])  # for n_classes==2
    print "INFO: col_num=", col_num

    labels_pred = clf.predict(sparse_test)
    #print "labels_pred:", labels_pred

    accuracy = clf.score(sparse_test, labels_testing)
    print "INFO: data folder:", hdfs_feat_dir
    print "INFO: accuracy: ", accuracy

    #####################################################################
    ##################calculate feature importance with prediction labels#######################
    #####################################################################
    AA = sparse_mtx.todense()
    BB = sparse_test.todense()
    labels_train_pred = clf.predict(sparse_mtx)
    labels_test_pred = labels_pred

    print "INFO: ###################################################################################"
    print "INFO: ############calculate feature importance with predication labels###################"
    print "INFO: ###################################################################################"
    dic_importance_label = {}
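    # same FIRM-style score as in the previous function:
    #   Q = (q_pos - q_neg) * sqrt(q_pos * q_neg) / n_total, with q_pos/q_neg taken from the
    # remapped (3 - predicted_label) values over samples with / without feature j;
    # -100 / -200 again flag all-zero / always-present feature columns.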

    for j in range(0, col_num):  ###for all features in the loop

        ##############################
        #print "====new way with sparse matrix========="
        curr_col_train = sparse_mtx.getcol(j)
        sum_col = curr_col_train.sum(0)
        positive_feature_number = int(sum_col.tolist()[0][0])

        labels_value = 3 - labels_train_pred
        dot_product = csr_matrix(np.array(labels_value)).dot(curr_col_train)
        sum_product = dot_product.sum(1)
        labels_positive_sum = int(sum_product.tolist()[0][0])

        sum_label_values = sum(labels_value)
        labels_negitive_sum = sum_label_values - labels_positive_sum

        ##############################
        #print "====new way with sparse matrix========="
        curr_col_test = sparse_test.getcol(j)
        sum_col = curr_col_test.sum(0)
        positive_feature_number = positive_feature_number + int(
            sum_col.tolist()[0][0])

        labels_value = 3 - labels_test_pred
        dot_product = csr_matrix(np.array(labels_value)).dot(curr_col_test)
        sum_product = dot_product.sum(1)
        labels_positive_sum = labels_positive_sum + int(
            sum_product.tolist()[0][0])

        sum_label_values = sum(labels_value)
        labels_negitive_sum = labels_negitive_sum + sum_label_values - int(
            sum_product.tolist()[0][0])

        n_total = row_num_training + row_num_testing
        negitive_feature_number = n_total - positive_feature_number
        if positive_feature_number == 0:
            #print "feature ", j+1, "all 0s!"
            dic_importance_label[j + 1] = -100
        elif negitive_feature_number == 0:
            #print "feature ", j+1, "all 1s!"
            dic_importance_label[j + 1] = -200
        else:
            q_positive = float(labels_positive_sum) / positive_feature_number
            q_negitive = float(labels_negitive_sum) / negitive_feature_number

            Q = (q_positive - q_negitive) * sqrt(
                float(q_positive) * q_negitive / float(n_total) /
                float(n_total))
            dic_importance_label[j + 1] = Q

    sorted_importance = sorted(dic_importance_label.items(),
                               key=operator.itemgetter(1),
                               reverse=True)

    print "INFO: =======Feature Importance(FIRM score)================"

    if os.path.exists(local_score_file):
        try:
            os.remove(local_score_file)
        except OSError, e:
            print("ERROR: %s - %s." % (e.local_score_file, e.strerror))
def preprocess(
    row_id_str,
    ds_id,
    hdfs_feat_dir,
    local_out_dir,
    ml_opts_jstr  #, excluded_feat_cslist
    ,
    sp_master,
    spark_rdd_compress,
    spark_driver_maxResultSize,
    sp_exe_memory,
    sp_core_max,
    zipout_dir,
    zipcode_dir,
    zip_file_name,
    data_fname
    #, mongo_tuples, labelnameflag, fromweb
    ,
    jobname,
    dnn_data_suffix=config.get("machine_learning", "dnn_data_suffix"),
    dnn_label_suffix=config.get("machine_learning", "dnn_label_suffix"),
    dnn_info_suffix=config.get("machine_learning", "dnn_info_suffix")):

    ### generate data folder and out folder, clean up if needed

    if not os.path.exists(local_out_dir):
        os.makedirs(local_out_dir)

    # create zip files for Spark workers ================= ================
    zip_file_path = ml_util.ml_build_zip_file(zipout_dir,
                                              zipcode_dir,
                                              zip_file_name,
                                              prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path, ",ml_opts_jstr=", ml_opts_jstr

    # get_spark_context
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname,
                                      [zip_file_path])

    t0 = time()

    # Check option
    ml_opts = {}
    try:
        ml_opts = json.loads(ml_opts_jstr)
    except:
        print "ERROR: string ml_opts is invalid!"
        return -1

    learning_algorithm = ml_opts["learning_algorithm"]
    #return

    # create 3 arrays
    feat_list = []
    label_list = []
    info_list = []
    feature_count = None
    sample_count = None
    if learning_algorithm == "cnn":  #==================================================  ==========
        # use libsvm_data for featured data here, not "dnn_data"
        libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data")
        print "INFO: libsvm_data_file=", libsvm_data_file
        # load sample RDD from libsvm file
        # output format: [([features], label, info)] , feature_count, feat_max, feat_min
        all_list, feature_count, feat_max, feat_min =\
            zip_feature_util.get_sample_as_arr(sc, libsvm_data_file, None, None)

        c = 0
        # convert to 3 list  =========================  ==========
        for i in all_list:
            #print type(i[0]),type(i[1]), type(i[2])
            if c % 10000 == 0:
                print ".",
                sys.stdout.flush()
            c = c + 1
            # convert to float32 np array and normalize by max value (0-1.0)
            feat_list.append(np.array(i[0], dtype=np.float32) / feat_max)
            label_list.append(i[1])
            info_list.append(i[2])
        print "INFO: feat_list t=", type(feat_list), type(feat_list[0]), type(
            feat_list[0][0])
    elif learning_algorithm == "lstm":  #==================================================  ==========
        # filename for featured data
        dnn_data_file = os.path.join(hdfs_feat_dir, data_fname)
        print "INFO: dnn_data_file=", dnn_data_file
        # output format: [([features], label, info)] , chunked_sample_count
        all_list, chunked_sample_count=\
            zip_feature_util.get_sample_as_chunk(sc, dnn_data_file, None, None)
        c = 0
        # convert to 3 list  =========================  ==========
        for i in all_list:
            #print type(i[0]),type(i[1]), type(i[2])
            if c % 10000 == 0:
                print ".",
                sys.stdout.flush()
            c = c + 1
            # convert to int32 np array (no max-value normalization in the lstm path)
            feat_list.append(np.array(i[0], dtype=np.int32))
            label_list.append(i[1])
            info_list.append(i[2])
        print "INFO: feat_list t=", type(feat_list), type(
            feat_list[0]), feat_list[0][0]
    print "INFO: label_list t=", type(label_list), label_list[0]
    print "INFO: info_list t=", type(info_list), info_list[0]

    if not all_list is None and len(all_list) > 0:
        sample_count = len(all_list)
    else:
        sample_count = None
    print "INFO: sample_count=", sample_count

    tgt_prefix = row_id_str
    if (row_id_str != ds_id):
        tgt_prefix = ds_id

    # save data file   =========================  ==========
    data_fname = os.path.join(local_out_dir, tgt_prefix + dnn_data_suffix)
    print "INFO: data_fname=", data_fname
    # reshape for image
    #if not feature_count is None and feature_count >0 and not sample_count is None and sample_count > 0:
    #    nparr_feat=np.asarray(feat_list,dtype=np.float32).reshape(sample_count,feature_count)
    #else: # lstm didn't need to reshape
    nparr_feat = np.asarray(feat_list, dtype=np.int32)
    print "INFO: nparr_feat s=", nparr_feat.shape, nparr_feat[0].shape, type(
        nparr_feat[0][0]), nparr_feat[0][0]

    with gzip.open(data_fname, "wb") as fp:
        #pickle.dump(nparr_feat,fp, 2) # may overflow in pickle; limited to 2B elements; use numpy.save()
        np.save(fp, nparr_feat, allow_pickle=False)  #62.9M for (465724, 2967)
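    # the gzipped .npy written above can be reloaded later with, e.g.:
    #   with gzip.open(data_fname, "rb") as fp: nparr_feat = np.load(fp)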

    # save label file  =========================  ==========
    lbl_fname = os.path.join(local_out_dir, tgt_prefix + dnn_label_suffix)
    print "INFO: lbl_fname=", lbl_fname
    # convert to numpy array and int32 to save space
    nparr_lbl = np.asarray(label_list, dtype=np.int32)
    #print "label=",nparr_lbl[0]
    with gzip.open(lbl_fname, "wb") as fp:
        #pickle.dump(nparr_lbl,fp,  2)
        np.save(fp, nparr_lbl, allow_pickle=False)
    # save info file  =========================  ==========
    info_fname = os.path.join(local_out_dir, tgt_prefix + dnn_info_suffix)
    print "INFO: info_fname=", info_fname
    nparr_info = np.asarray(info_list)
    with gzip.open(info_fname, "wb") as fp:
        #pickle.dump(nparr_info,fp,2)
        np.save(fp, nparr_info, allow_pickle=False)

    t1 = time()
    print 'INFO: running time: %f' % (t1 - t0)
    print 'INFO: Finished!'
    return 0
def pca(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, sp_master,
        spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory,
        sp_core_max, zipout_dir, zipcode_dir, zip_file_name, mongo_tuples,
        fromweb, pca_jstr, jobname, model_data_folder):

    # create zip files for Spark workers ================= ================
    zip_file_path = ml_build_zip_file(zipout_dir,
                                      zipcode_dir,
                                      zip_file_name,
                                      prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    # init Spark context ====
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname,
                                      [zip_file_path])
    '''
    SparkContext.setSystemProperty('spark.rdd.compress', config.get('spark', 'spark_rdd_compress'))
    SparkContext.setSystemProperty('spark.driver.maxResultSize', config.get('spark', 'spark_driver_maxResultSize'))
    SparkContext.setSystemProperty('spark.executor.memory', args.exe_memory)
    SparkContext.setSystemProperty('spark.cores.max', args.core_max)

    sc = SparkContext(args.sp_master, 'pca_sklearn:'+str(args.row_id))
    '''

    pca_param = json.loads(pca_jstr)
    if "k" in pca_param:
        k = pca_param["k"]
    else:
        k = None
    if "threshold" in pca_param:
        threshold = pca_param["threshold"]
    else:
        threshold = None
    if "lib" in pca_param:
        lib = pca_param["lib"]
    else:
        lib = 'sklearn'
    #if "recreate" in pca_param:
    #    recreate=pca_param["recreate"]
    #else:
    #    recreate='0'
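    # Illustrative pca_jstr payload matching the parsing above (values are hypothetical):
    #   '{"k": 3, "threshold": 0.9, "lib": "sklearn"}'
    # "lib" defaults to 'sklearn' when absent; the model filename below is keyed by
    # "threshold" when present, otherwise by "k".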

    ret = -1
    # start here =================================================================== ===============
    t0 = time()

    # source libsvm filename
    libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data")
    print "INFO: libsvm_data_file=", libsvm_data_file

    # load feature count file
    feat_count_file = libsvm_data_file + "_feat_count"
    feature_count = zip_feature_util.get_feature_count(sc, feat_count_file)
    print "INFO: feature_count=", feature_count

    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file)

    # load sample RDD from text file
    # format (LabeledPoint,hash) from str2LabeledPoint_hash()
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(
        sc, libsvm_data_file, feature_count, '')

    # convert labeled point to tuple (label,features)
    #labels_and_features_rdd = samples_rdd.map(lambda p: (p.label, p.features))

    all_data = samples_rdd.collect()
    sample_count = len(all_data)

    # 2-D array
    features_list = [x.features.toArray() for x, _ in all_data]
    # label array
    labels_list_all = [x.label for x, _ in all_data]
    # hash array
    hash_list_all = [x for _, x in all_data]

    # convert to int to speed up
    features_array = np.array(features_list, dtype=np.int32)
    true_label_array = np.array(labels_list_all, dtype=np.int8)
    hash_list_all = np.array(hash_list_all)

    # PCA here, TBD for removal
    print "INFO: features_array.shape=", features_array.shape
    print "INFO: true_label_array.shape=", true_label_array.shape
    print "INFO: hash_list_all.shape=", hash_list_all.shape

    print "INFO: Doing PCA... threshold=", threshold, ",k=", k
    (features_array_reduced, k,
     pca) = sklearn_PCA_transform(features_array, threshold, k)
    #print "features_array_reduced t=", type(features_array_reduced), ",features_array_reduced=", features_array_reduced
    print "INFO: features_array_reduced.shape", features_array_reduced.shape, ",k=", k

    pca_fname3 = ""
    if not threshold is None:
        pca_fname = os.path.join(
            model_data_folder, row_id_str + '_pca_' + str(threshold) + '.pkl')
        pca_fname3 = os.path.join(
            model_data_folder, row_id_str + '_pca3_' + str(threshold) + '.pkl')
    else:
        pca_fname = os.path.join(model_data_folder,
                                 row_id_str + '_pca_' + str(k) + '.pkl')

    #pc_3=features_array_reduced[:,0:3]
    #print "type=",type(pc_3),"shape=",pc_3.shape,"pca_fname3=",pca_fname3
    #np.save(pca_fname3,pc_3)

    if not os.path.exists(model_data_folder):
        os.makedirs(model_data_folder)
    if os.path.exists(pca_fname):  # remove file if exists
        try:
            for fl in glob.glob(pca_fname + "*"):
                os.remove(fl)
        except OSError, e:
            print("Error: %s - %s." % (e.pca_fname, e.strerror))
def exclude_sample(hdfs_src_dir,
                   fname_list,
                   hdfs_out_dir,
                   hdfs_excl_dir,
                   hdfs_excl_fname_list,
                   sp_master,
                   spark_rdd_compress,
                   spark_driver_maxResultSize,
                   sp_exe_memory,
                   sp_core_max,
                   jobname,
                   delimitor="_"):

    # clean up output hdfs folder

    # get_spark_context
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname)

    t0 = time()

    # get exclusion data
    ex_files = os.path.join(hdfs_excl_dir, hdfs_excl_fname_list)
    if ',' in hdfs_excl_fname_list:
        ex_files = ""
        comma = ""
        for fn in hdfs_excl_fname_list.split(','):
            ex_files = ex_files + comma + os.path.join(hdfs_excl_dir, fn)
            comma = ","
    print "INFO: mc only files=", ex_files
    ex_rdd = sc.textFile(ex_files).map(lambda x: (x, x)).distinct().cache()

    # get source files by pattern: the 1st part of filename
    if fname_list == "_1st_part_":
        f_list = hdfs.ls(hdfs_src_dir)
        f_dict = {}
        for i in f_list:
            bname = os.path.basename(i)
            pattern = bname
            if bname.find('_') > 0:
                pattern = bname[:bname.find('_') + 1] + '*.gz'
            dname = os.path.dirname(i)
            pname = os.path.join(dname, pattern)
            #
            if not pname in f_dict:
                #print "p:",pname
                f_dict[pname] = 1
        # create list string
        fname_list = ','.join(f_dict)
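        # Illustrative example with hypothetical filenames: "famA_001.gz" and
        # "famA_002.gz" both collapse to the glob "famA_*.gz", so every family
        # prefix is read as a single source below.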

    # for compression
    codec = "org.apache.hadoop.io.compress.GzipCodec"
    # for each source files, exclude data and save to output
    for fn in sorted(fname_list.split(',')):
        if len(fn) == 0:
            continue
        ofn = None
        if not 'hdfs://' in fn:
            src_file = os.path.join(hdfs_src_dir, fn)
            # assume output name is the 1st part of input fname
            ofn = fn.partition(delimitor)[0]
            out_file = os.path.join(hdfs_out_dir, ofn)
        else:  # list from "_1st_part_"
            src_file = fn
            bname = os.path.basename(fn)
            ofn = bname.partition(delimitor)[0]
            out_file = os.path.join(hdfs_out_dir, ofn)

        print "INFO: out files=", out_file, "ofn=", ofn

        # clean up existing file
        ml_util.ml_clean_up_hdfs_file(out_file)

        #load data & set key; may have .<familyname>; left join, filter non-matching; remove key & save
        src_rdd = None
        cnts = 0
        label_list = None
        try:
            src_rdd = sc.textFile(src_file)
            #label_before=src_rdd.map(lambda x: x[:7]).distinct().collect()
            #print "INFO: label_before=",label_before
            src_rdd=src_rdd.filter(lambda x: not x is None) \
                .map(lambda x: (x.split('\t'),x)) \
                .filter(lambda x: len(x[0])>1) \
                .cache()
            #label_list=src_rdd.map(lambda x:x[0][0]).distinct().collect()
            #print "INFO: label_list=",label_list
            src_rdd = src_rdd.map(lambda x:
                                  (x[0][1].split('.')[0], x[1])).cache()

            #    .map(lambda x: (x.split('\t')[1].split('.')[0],x)).cache()
            cnts = src_rdd.count()
        except:
            print "WARNING:", sys.exc_info()[0]

        #print "INFO: input ["+ofn+"] count=",cnt
        if cnts > 0:
            f_rdd=src_rdd \
                .leftOuterJoin(ex_rdd) \
                .filter(lambda x:x[1][1] is None) \
                .map(lambda x:x[1][0]).cache()
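            # leftOuterJoin + "right side is None" acts as an anti-join: only keys
            # absent from ex_rdd survive, i.e. samples in the exclusion list are dropped.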
            cntf = f_rdd.count()
            print "INFO: [" + ofn + "] remain ratio=", cntf, "/", cnts, "=", str(
                cntf * 1.0 / cnts)
            if cntf > 0:
                f_rdd.saveAsTextFile(out_file, codec)
            else:
                print "WARNING: filter excluded all data for [" + ofn + "]"
        else:
            print "WARNING: [" + ofn + "] has no data"
    #End for loop

    t1 = time()
    print 'INFO: running time: %f' % (t1 - t0)
    print 'INFO: Finished!'
    return 0
def mrun(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr,
         excluded_feat_cslist, sp_master, spark_rdd_compress,
         spark_driver_maxResultSize, sp_exe_memory, sp_core_max, zipout_dir,
         zipcode_dir, zip_file_name, mongo_tuples, fromweb, training_fraction,
         jobname, run_number, bin_number):

    if not os.path.exists(local_out_dir):
        os.makedirs(local_out_dir)

    # zip func in other files for Spark workers ================= ================
    zip_file_path = ml_build_zip_file(zipout_dir,
                                      zipcode_dir,
                                      zip_file_name,
                                      prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    # init Spark context ====
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname,
                                      [zip_file_path])

    t0 = time()

    # check if ml_opts.has_excluded_feat ==1 ===================================
    has_excluded_feat = 0
    ml_opts = {}  # default so parse_para_and_get_model() below won't hit a NameError
    if not ml_opts_jstr is None:
        ml_opts = json.loads(ml_opts_jstr)
        if "has_excluded_feat" in ml_opts:
            has_excluded_feat = ml_opts["has_excluded_feat"]

    # get excluded feature list from mongo ========== ===
    if str(has_excluded_feat) == "1" and excluded_feat_cslist is None:
        excluded_feat_cslist = ml_util.ml_get_excluded_feat(
            row_id_str, mongo_tuples)
    print "INFO: excluded_feat_cslist=", excluded_feat_cslist

    ### load libsvm file ###
    #libsvm_data_file = data_folder + "libsvm_data"
    # source libsvm filename
    libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data")
    print "INFO: libsvm_data_file:", libsvm_data_file

    # load feature count file
    feat_count_file = libsvm_data_file + "_feat_count"
    feature_count = zip_feature_util.get_feature_count(sc, feat_count_file)
    print "INFO: feature_count=", feature_count

    # load sample RDD from text file
    #   also exclude selected features in sample ================ =====
    # format (LabeledPoint,hash) from str2LabeledPoint_hash()
    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file)
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(
        sc, libsvm_data_file, feature_count, excluded_feat_cslist)
    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file)
    #print samples_rdd.count()

    # collect all data to local for processing ===============
    all_data = samples_rdd.map(
        lambda p: p[0]).collect()  # keep LabeledPoint only
    # 2-D array
    features_list = [x.features.toArray() for x in all_data]
    # label array
    labels_list_all = [x.label for x in all_data]

    # convert to np array
    labels_list_all = array(labels_list_all)
    features_array = np.array(features_list)

    ### generate sparse matrix (csr) for all samples
    features_sparse_mtx = csr_matrix(features_array)
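    # CSR is chosen because train_test_split below slices the matrix row-wise
    # (per sample), which is efficient for row-major sparse storage.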

    t1 = time()
    print "INFO: labels_list_all=", labels_list_all
    print 'INFO: data generating time: %f' % (t1 - t0)

    label_set = set(labels_list_all)
    class_num = len(label_set)
    #class_num = len(labels_list)
    if class_num > 2:
        print "INFO: Number of classes =", class_num

    ###############################################
    ###########build learning model################
    ###############################################

    ### parse parameters and generate the model ###
    (clf, model_name) = parse_para_and_get_model(ml_opts)
    if model_name == "none":
        return

    t0 = time()
    accuracy_array = np.zeros(run_number)
    for rnd in range(0, run_number):

        ### randomly split the samples into training and testing data
        X_train_sparse, X_test_sparse, labels_train, labels_test = \
            cross_validation.train_test_split(features_sparse_mtx, labels_list_all, test_size=(1-training_fraction))
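        # Note: sklearn.cross_validation was later renamed; on newer sklearn the
        # equivalent call (assuming such a version is installed) would be:
        #   from sklearn.model_selection import train_test_split
        #   ... = train_test_split(features_sparse_mtx, labels_list_all,
        #                          test_size=(1 - training_fraction))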

        #####fit the model to training dataset ####
        try:
            clf.fit(X_train_sparse, labels_train)
        except ValueError as v:
            print "INFO: ValueError:", v
            #raise v didn't work
            return -1

        ### Evaluating the model on testing data
        labels_pred = clf.predict(X_test_sparse)

        accuracy = clf.score(X_test_sparse, labels_test)
        accuracy_array[rnd] = accuracy

        print "INFO: current round: ", rnd
        print "INFO: Accuracy = ", accuracy

    ###############################################
    #######plot distribution and variance##########
    ###############################################

    plt.figure(1)

    num_bins = bin_number  ####10 is default
    n, bins, patches = plt.hist(accuracy_array,
                                num_bins,
                                normed=1,
                                facecolor='green',
                                alpha=0.5)
    ave = np.mean(accuracy_array)
    print "INFO: num_bins=", num_bins
    print "INFO: accuracy_array=", accuracy_array
    print "INFO: Accuracy mean: ", ave
    std_dev = np.std(accuracy_array)
    print "INFO: Accuracy std dev: ", std_dev

    # add a 'best fit' normal curve; mlab.normpdf expects the standard deviation
    y = mlab.normpdf(bins, ave, std_dev)
    #print "INFO: y: ", y
    plt.plot(bins, y, 'r--')

    plt.title('Accuracy distribution of ' + str(run_number) + ' runs:')
    plt.xlabel('Accuracy Values')
    plt.ylabel('Probability / Accuracy bar width')
    mrun_fname = os.path.join(local_out_dir,
                              row_id_str + "_var_" + str(run_number) + ".png")
    plt.savefig(mrun_fname)

    # create data for graph ====================
    all_json = []
    barp_arr = []  #n
    disp_arr = []  #y
    last_idx = 0
    for idx, ht in enumerate(n):  # n is bar height
        #print "mrun bar=",idx, bins[idx], bins[idx+1],((bins[idx]+bins[idx+1])/2)
        barp_arr.append([((bins[idx] + bins[idx + 1]) / 2.0),
                         n[idx]])  # mid point for x axis, may shift 100% bar
        if not math.isnan(y[idx]):
            disp_arr.append([bins[idx], y[idx]])
        last_idx = idx
    #print last_idx+1, bins[last_idx+1], y[last_idx+1]
    if not math.isnan(y[last_idx + 1]):
        disp_arr.append([bins[last_idx + 1], y[last_idx + 1]])
    #print "barp_arr=", barp_arr
    #print "disp_arr=", disp_arr
    #bar
    bar_json = {}
    bar_json["values"] = barp_arr
    bar_json["key"] = 'Mutil-Run Accuracy'  #
    bar_json["type"] = "bar"  # light blue
    bar_json["yAxis"] = 1
    all_json.append(bar_json)
    #distribution
    if len(disp_arr) > 0:
        dis_json = {}
        dis_json["values"] = disp_arr
        dis_json["key"] = 'Normal Distribution'  #
        dis_json["type"] = "line"  # light blue
        dis_json["yAxis"] = 1
        all_json.append(dis_json)
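    # Illustrative shape of all_json consumed by the chart front end (numbers are made up):
    #   [{"key": "Multi-Run Accuracy",  "type": "bar",  "yAxis": 1, "values": [[0.91, 3.0], ...]},
    #    {"key": "Normal Distribution", "type": "line", "yAxis": 1, "values": [[0.90, 12.4], ...]}]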

    mrun_jfile = os.path.join(local_out_dir, row_id_str + "_mrun.json")
    #mrun_jfile = local_out_dir+row_id_str+"_mrun.json"
    #print "INFO: all_json=",all_json
    print "INFO: mrun_jfile=", mrun_jfile
    if os.path.exists(mrun_jfile):
        try:
            os.remove(mrun_jfile)
        except OSError, e:
            print("ERROR: %s - %s." % (e.mrun_jfile, e.strerror))
def feat_importance_ip(row_id_str, ds_id, hdfs_feat_dir, local_score_file,
                       score_file_IT, sp_master, spark_rdd_compress,
                       spark_driver_maxResultSize, sp_exe_memory, sp_core_max,
                       zipout_dir, zipcode_dir, zip_file_name, mongo_tuples,
                       jobname, uploadtype):

    # zip func in other files for Spark workers ================= ================
    zip_file_path = ml_util.ml_build_zip_file(zipout_dir,
                                              zipcode_dir,
                                              zip_file_name,
                                              prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    # get_spark_context
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname,
                                      [zip_file_path])
    '''    
    SparkContext.setSystemProperty('spark.rdd.compress', config.get('spark', 'spark_rdd_compress'))
    SparkContext.setSystemProperty('spark.driver.maxResultSize', config.get('spark', 'spark_driver_maxResultSize'))
    #SparkContext.setSystemProperty('spark.kryoserializer.buffer.mb', config.get('spark', 'spark_kryoserializer_buffer_mb'))
    SparkContext.setSystemProperty('spark.executor.memory', args.exe_memory)
    SparkContext.setSystemProperty('spark.cores.max', args.core_max)
    sc = SparkContext(args.sp_master, 'feature_importance_2ways:'+str(args.row_id))
    '''
    t0 = time()

    # get folder list (labels) from hdfs data_out/<id>/metadata  ==============
    dirFile_loc = os.path.join(hdfs_feat_dir, "metadata")
    dirFolders = sc.textFile(dirFile_loc)

    hash_Folders = dirFolders.collect()
    print "INFO: dirFile_loc=", dirFile_loc, ", hash_Folders=", hash_Folders
    folder_list = [x.encode('UTF8') for x in hash_Folders]
    print "INFO: folder_list=", folder_list

    # get feature seq : ngram hash mapping ==================================
    key = "dic_seq_hashes"  #{"123":"136,345"}
    jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
    jstr_proj = '{"value":1}'

    # get parent dataset's data
    if ds_id != row_id_str:
        jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'

    doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
    dic_list = doc['value']

    dic_all_columns = dic_list
    feature_count = len(dic_list)

    #print "INFO: feature_count=",feature_count
    #print "dic_list=",dic_list #{u'123,345':u'136'}
    #print "INFO: dic_all_columns=",dic_all_columns # {1: u'8215,8216'}
    # end

    # get hash : raw string mapping ==================================
    key = "dic_hash_str"  #{"123":"openFile"}
    jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
    jstr_proj = '{"value":1}'
    # get parent dataset's data
    if ds_id != row_id_str:
        jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'

    doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
    dic_hash_str = doc['value']
    '''
    # get folder list (labels) from hdfs data_out/<id>/libsvm  ==============
    libsvm_loc = os.path.join(hdfs_feat_dir , "libsvm_data") 
    
    # based on label, divide RDD into arrays
    f_rdd = sc.textFile(libsvm_loc).map(lambda x: libsvm2tuple_arr(x))
    
    arr_libsvm=sorted(f_rdd.collect(), key=lambda x:x[0]) # sorted by label
    '''
    # filename for featured data
    libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data")
    print "INFO: libsvm_data_file=", libsvm_data_file
    print "INFO: feature_count=",feature_count\

    # get sample array from hdfs
    arr_libsvm = zip_feature_util.get_sample_tuple_arr(sc, libsvm_data_file)
    # sorted by label
    arr_libsvm = sorted(arr_libsvm, key=lambda x: x[0])

    # convert libsvm to features_list, row_list, col_list, sample count, col_num
    lbl_flag = -1

    row_num_training = 0

    sparse_mtx_list = []  # for feat impor calculation
    features_list = []  # for csc_matrix
    row_list = []  # for csc_matrix
    col_list = []  # for csc_matrix
    sample_numbers = []  # for csc_matrix
    feature_arr = None

    for idx, i in enumerate(arr_libsvm):
        #print "idx=",idx,",l=",i[0],",d=",i[1:]

        if lbl_flag != i[0]:
            if feature_arr and len(feature_arr) > 0:
                features_list.append(np.array(feature_arr))
                row_list.append(np.array(row_arr))
                col_list.append(np.array(col_arr))
                sample_numbers.append(cnt)
            row_arr = []
            col_arr = []
            feature_arr = []
            cnt = 0
            lbl_flag += 1

        for j in i[1:]:
            row_arr.append(cnt)
            col_arr.append(j[0] - 1)
            feature_arr.append(j[1])
        cnt += 1
    # for last part
    if len(feature_arr) > 0:
        features_list.append(np.array(feature_arr))
        row_list.append(np.array(row_arr))
        col_list.append(np.array(col_arr))
        sample_numbers.append(cnt)

    #print ",features_list=",features_list
    #print ",row_list=",row_list
    #print ",col_list=",col_list
    print "INFO: sample_numbers=", sample_numbers

    col_num = len(dic_list)
    print "INFO: column number: ", col_num  #, ",len(max_feat_list)=",len(max_feat_list)

    for i in range(0, len(features_list)):
        #print "i=",i
        #print "features_list=",features_list[i]
        #print "row_list=",row_list[i]
        #print "col_list=",col_list[i]
        #print "sample_numbers=",sample_numbers[i]
        sparse_mtx = csc_matrix((features_list[i], (row_list[i], col_list[i])),
                                shape=(sample_numbers[i], col_num))
        sparse_mtx_list.append(sparse_mtx)
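    # csc_matrix (column-major) is used so the per-feature getcol(j) loops in the
    # scoring sections below stay cheap.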

    #print sparse_mtx_list[0]
    print "INFO: sparse_mtx_list[0].shape=", sparse_mtx_list[0].shape
    #print sparse_mtx_list[1]
    print "INFO: sparse_mtx_list[1].shape=", sparse_mtx_list[1].shape

    exclusive_feature_set_mal = []
    exclusive_feature_set_clean = []
    dic_feature_cnt_mal = {}
    dic_feature_cnt_clean = {}

    dic_score = {}
    dic_cnt_mal = {}
    dic_cnt_clean = {}
    dic_IT_gain = {}
    ####################################################
    #### feature importance algorithms: 2 methods (supported for 2-class datasets only) #######
    ####################################################
    if len(sample_numbers) == 2:

        ###################################################
        ################## calculate probability ############
        ###################################################

        print "INFO: =======Feature Importance(probability) ================ "

        for j in range(0, col_num):

            curr_col_dirty = sparse_mtx_list[0].getcol(j)
            sum_col = curr_col_dirty.sum(0)
            cnt_mal = sum_col.tolist()[0][0]

            curr_col_clean = sparse_mtx_list[1].getcol(j)
            sum_col = curr_col_clean.sum(0)
            cnt_clean = sum_col.tolist()[0][0]

            percnt_mal = cnt_mal / float(sample_numbers[0])
            percnt_clean = cnt_clean / float(sample_numbers[1])
            score_j = (percnt_mal + 1 - percnt_clean) / 2
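            # Worked example with hypothetical counts: a feature present in 90% of
            # dirty samples and 10% of clean samples scores (0.9 + 1 - 0.1) / 2 = 0.9;
            # scores near 1.0 indicate features that discriminate toward the dirty class.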

            dic_score[j + 1] = score_j
            dic_cnt_clean[j + 1] = cnt_clean
            dic_cnt_mal[j + 1] = cnt_mal

        sorted_score = sorted(dic_score.items(),
                              key=operator.itemgetter(1),
                              reverse=True)

        #print "sorted_score:", sorted_score
        #print "dic_cnt_clean", dic_cnt_clean
        #print "dic_cnt_mal", dic_cnt_mal

        ############output result########################

        if os.path.exists(local_score_file):
            try:
                os.remove(local_score_file)
            except OSError, e:
                print("Error: %s - %s." % (e.local_score_file, e.strerror))

        for ii in range(0, len(sorted_score)):
            (feat, score) = sorted_score[ii]
            #print feat, score, dic_all_columns[feat]

            if dic_hash_str:
                description_str = feats2strs(dic_all_columns[str(feat)],
                                             dic_hash_str)
            else:
                description_str = "N/A"
                print "Warning: No mapping found for feature number"

            str01 = str(feat) + "\t" + str(
                score) + "\t" + description_str + "\n"
            with open(local_score_file, "a") as f:
                f.write(str01)

        ########################################################
        ##################Information Gain (entropy)############
        ########################################################

        print "INFO: =======Information Gain================ "
        for j in range(0, col_num):
            cnt_mal = dic_cnt_mal[j + 1]
            cnt_clean = dic_cnt_clean[j + 1]

            total_samples = sample_numbers[0] + sample_numbers[1]

            p0 = float(sample_numbers[0]) / total_samples
            p1 = 1 - p0

            if p0 == 0 or p1 == 0:
                parent_entropy = 0
            else:
                parent_entropy = 0 - p0 * np.log2(p0) - p1 * np.log2(p1)

            if cnt_clean + cnt_mal == 0:
                information_gain = 0
            elif total_samples - cnt_clean - cnt_mal == 0:
                information_gain = 0
            else:
                p0 = float(cnt_mal) / (cnt_clean + cnt_mal)
                p1 = 1 - p0
                if p0 == 0 or p1 == 0:
                    child_left_entropy = 0
                else:
                    child_left_entropy = 0 - p0 * np.log2(p0) - p1 * np.log2(
                        p1)

                p0 = float(sample_numbers[0] - cnt_mal) / (total_samples -
                                                           cnt_clean - cnt_mal)
                p1 = 1 - p0
                if p0 == 0 or p1 == 0:
                    child_right_entropy = 0
                else:
                    child_right_entropy = 0 - p0 * np.log2(p0) - p1 * np.log2(
                        p1)

                weighted_child_entropy = child_left_entropy * float(
                    cnt_clean +
                    cnt_mal) / total_samples + child_right_entropy * float(
                        total_samples - cnt_clean - cnt_mal) / total_samples
                information_gain = parent_entropy - weighted_child_entropy

            dic_IT_gain[j + 1] = information_gain
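        # Information gain recap for splitting on feature j (matching the code above):
        #   IG(j) = H(labels) - [ n_with/N * H(labels | feature present)
        #                       + n_without/N * H(labels | feature absent) ]
        # where H is the binary entropy -p*log2(p) - (1-p)*log2(1-p).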

        sorted_IT_gain = sorted(dic_IT_gain.items(),
                                key=operator.itemgetter(1),
                                reverse=True)

        if os.path.exists(score_file_IT):
            try:
                os.remove(score_file_IT)
            except OSError, e:
                print("Error: %s - %s." % (e.score_file_IT, e.strerror))
def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr,
          sp_master, spark_rdd_compress, spark_driver_maxResultSize,
          sp_exe_memory, sp_core_max, zipout_dir, zipcode_dir, zip_file_name,
          mongo_tuples, labelnameflag, fromweb, src_filename, jobname,
          model_data_folder):

    # create zip files for Spark workers ================= ================
    zip_file_path = ml_build_zip_file(zipout_dir,
                                      zipcode_dir,
                                      zip_file_name,
                                      prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    #data_folder = hdfs_feat_dir + "/"
    #local_out_dir = local_out_dir + "/"
    if not os.path.exists(local_out_dir):
        os.makedirs(local_out_dir)

    # ML model filename ====
    model_fname = os.path.join(model_data_folder, row_id_str + '.pkl')
    print "INFO: model_data_folder=", model_data_folder
    # create out folders and clean up old model files ====
    ml_util.ml_prepare_output_dirs(row_id_str, local_out_dir,
                                   model_data_folder, model_fname)

    # init Spark context ====
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname,
                                      [zip_file_path])

    # start here =================================================================== ===============
    t0 = time()

    ### load libsvm file: may or may not be PCA-ed ###
    libsvm_data_file = os.path.join(hdfs_feat_dir, src_filename)
    print "INFO: libsvm_data_file=", libsvm_data_file

    # feature count is a variable if PCA
    feature_count = 0

    # samples_rdd may be from PCAed data
    # load sample RDD from text file
    # format (LabeledPoint,hash) from str2LabeledPoint_hash()
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(
        sc, libsvm_data_file, feature_count, '')

    # collect all data to local for processing ===============
    all_data = samples_rdd.collect()
    total_sample_count = len(all_data)
    # 2-D array, may be PCAed
    features_list = [x.features.toArray() for x, _ in all_data]
    # label array
    labels_list_all = [x.label for x, _ in all_data]
    # hash array
    hash_list_all = [x for _, x in all_data]
    # convert to np array
    features_array_reduced = np.array(features_list)
    hash_list_all = np.array(hash_list_all)
    labels_list_all = np.array(labels_list_all)
    true_label_array = np.array(labels_list_all, dtype=np.int8)

    print "INFO: total_sample_count=", total_sample_count
    print "INFO: features_array_reduced.shape=", features_array_reduced.shape
    print "INFO: labels_list_all.shape=", labels_list_all.shape
    print "INFO: true_label_array.shape=", true_label_array.shape

    t1 = time()
    print 'INFO: data generating time: %f' % (t1 - t0)

    ###############################################
    ########## build learning model ###############
    ###############################################

    ### parse parameters and generate the model ###
    (model, alg, n_clusters) = parse_para_and_get_model(ml_opts_jstr)
    if model is None:
        return

    labels_kmeans = None
    #### fit the model to training dataset ####
    try:
        model.fit(features_array_reduced)
        labels_kmeans = model.labels_  #'numpy.ndarray'

    except:
        print "ERROR: Error in model.fit(): ", "model=", model, ", sys.exc_info:", sys.exc_info(
        )[0]
        return

    #### save clf for future use ####
    #joblib.dump(model, model_data_folder + row_id_str+'.pkl')
    joblib.dump(model, model_fname)

    #print "**model:intercept***"
    #print clf.intercept_

    print "INFO: model type=", type(model), " model=", model

    ###################################################
    ### generate label names (family names) ###########
    ### connect to database to get the column list which contains all column number of the corresponding feature####
    ###################################################

    if labelnameflag == 1:
        key = "dic_name_label"
        jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
        jstr_proj = '{"value":1}'

        # get parent dataset's data
        if ds_id != row_id_str:
            jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'

        doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
        dic_list = doc['value']

        label_dic = {}
        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                label_dic[dic_list[i][key]] = key.encode('UTF8')
        print "INFO: label_dic:", label_dic
    else:
        label_dic = {}
        label_set = set(labels_list_all)
        for label_value in label_set:
            label_dic[int(label_value)] = str(int(label_value))
        print "INFO: generated label_dic:", label_dic

    labels_list = []
    for key in sorted(label_dic):
        labels_list.append(label_dic[key])
    print "INFO: labels_list=", labels_list

    #Adjusted Mutual Information between two clusterings
    amis = adjusted_mutual_info_score(labels_list_all, labels_kmeans)
    print "INFO: Adjusted_mutual_info_score=", amis
    #Similarity measure between two clusterings
    ars = adjusted_rand_score(labels_list_all, labels_kmeans)
    print "INFO: Adjusted_rand_score=", ars

    ###################################################
    #######plot histogram                       ####
    ###################################################
    plot_col_num = int(math.ceil(math.sqrt(n_clusters)))
    figsize = (4 * plot_col_num,
               3 * int(math.ceil(n_clusters * 1.0 / plot_col_num)))
    print "INFO: labels_list_all.shape=", labels_list_all.shape, "labels_kmeans.shape=", labels_kmeans.shape
    print "INFO: labels_list_all t=", type(
        labels_list_all), "labels_kmeans t=", type(labels_kmeans)
    print "INFO: n_clusters=", n_clusters, ",label_dic=", label_dic
    print "INFO: plot_col_num=", plot_col_num, ",figsize=", figsize, ",local_out_dir=", local_out_dir

    # kmeans histogram
    _, p_true = ml_plot_kmeans_histogram_subfigures(labels_list_all,
                                                    labels_kmeans,
                                                    n_clusters,
                                                    names=label_dic,
                                                    plot_col_num=plot_col_num,
                                                    figsize=figsize,
                                                    folder=local_out_dir,
                                                    rid=row_id_str)
    # normalized kmeans histogram
    _, p_true_norm = ml_plot_kmeans_histogram_subfigures(
        labels_list_all,
        labels_kmeans,
        n_clusters,
        names=label_dic,
        plot_col_num=plot_col_num,
        figsize=figsize,
        normalize=True,
        folder=local_out_dir,
        rid=row_id_str)

    ####plot "reverse" histogram with labels ####
    #num_bars = len(np.unique(labels_list_all))
    num_bars = int(max(labels_list_all)) + 1  # labels are floats; bar count must be an int
    figsize = (4 * plot_col_num,
               3 * int(math.ceil(num_bars * 1.0 / plot_col_num)))

    _, p_cluster = ml_plot_kmeans_histogram_subfigures(
        labels_kmeans,
        labels_list_all,
        num_bars,
        names=label_dic,
        plot_col_num=plot_col_num,
        figsize=figsize,
        reverse=True,
        folder=local_out_dir,
        rid=row_id_str)

    #### plot dot figures ####
    #mtx_label = model.labels_
    mtx_center = model.cluster_centers_
    # dot plot for Kmeans   ===========
    filename = os.path.join(local_out_dir, row_id_str + '_cluster.png')
    filename_3d = os.path.join(local_out_dir, row_id_str + '_cluster_3d.json')
    ml_plot_kmeans_dot_graph_save_file(features_array_reduced,
                                       labels_kmeans,
                                       mtx_center,
                                       n_clusters,
                                       figsize=(10, 7),
                                       filename=filename,
                                       title='KMeans',
                                       filename_3d=filename_3d)
    #print "features_array_reduced s=",features_array_reduced.shape

    # dot plot for True Labels  ===========
    filename = os.path.join(local_out_dir, row_id_str + '_cluster_tl.png')
    filename_3d = os.path.join(local_out_dir,
                               row_id_str + '_cluster_3d_tl.json')
    ml_plot_kmeans_dot_graph_save_file(features_array_reduced,
                                       true_label_array,
                                       mtx_center,
                                       n_clusters,
                                       figsize=(10, 7),
                                       filename=filename,
                                       title='True Labels',
                                       filename_3d=filename_3d)

    dataset_info = {
        "training_fraction": 1,
        "class_count": n_clusters,
        "dataset_count": total_sample_count
    }
    # only update db for web request   ===========
    if fromweb == "1":
        #print "database update"
        str_sql="UPDATE atdml_document set accuracy = '" \
            +"', status = 'learned', processed_date ='"+str(datetime.datetime.now()) \
            +"', total_feature_numb='"+str(feature_count) \
            +"', perf_measures='{}" \
            +"', dataset_info='"+json.dumps(dataset_info) \
            +"' where id="+row_id_str
        ret = exec_sqlite.exec_sql(str_sql)
        print "INFO: Data update done! ret=", str(ret)
    else:
        print "INFO: accuracy = '" + str(accuracy * 100) + "%"

    t1 = time()
    print 'INFO: running time: %f' % (t1 - t0)

    #print 'Finished!'
    return 0