Example #1
def replace_miss(data, features, label = "", for_train = True):
	delete_fea_index = []
	missing_num = []
	new_data = data.copy()
	new_features = features.copy()

	if for_train:
		SAVE_DIR = "resultData"
	else:
		SAVE_DIR = "resultData/test/"
	# start from range(1, ...) because the first feature is the id, which is useless
	for fea_pos in range(1, len(features)):
		fea_val_cla = feature_value_class(data, fea_pos, label)
		if fea_val_cla[-1]._present_num != 0:
			new_data = replace_miss_with_specialV(new_data, fea_pos, fea_val_cla, label, \
												delete_fea_index, missing_num)
	if for_train:
		new_data, new_features, deleted_feas = delete_features(new_data, new_features, \
															delete_fea_pos = delete_fea_index)

		save_result(new_data, "data_after_delete_too_many_missing_features.csv", new_features)

		save_result(np.array(missing_num), "deleted_features_with_too_many_missing.csv", \
					np.array(deleted_feas), dir_name = SAVE_DIR)


	return new_data, new_features
Example #2
def use_PCA_to_delete(data, features, needed_delete_featuers):
    stored_features = dict()
    for fea in needed_delete_featuers:

        stored = list()

        print("now!:", fea)
        fea_index = find_featuers_index(fea, features)
        print("finded: ", fea_index)
        delete_features_data = data[:, fea_index]
        from sklearn import decomposition
        pca = decomposition.PCA()
        pca.fit(delete_features_data)

        result = pca.explained_variance_
        print(result)
        mean = np.mean(result)
        print("mean:", mean)
        stored = [features[fea_index[i]] for i in range(len(result)) \
              if result[i] >= mean]
        #print(stored)

        save_result(stored, "after_deleted_by_pca.csv", style="a+")
        stored_features[fea] = stored
    print(stored_features)
    return stored_features
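
Note that pca.explained_variance_ has one entry per principal component, not per original column, so indexing features[fea_index[i]] by component position mixes the two spaces. If the intent is simply "keep the columns whose variance is above the group mean", a per-column variance does that directly; a minimal sketch (the helper name keep_high_variance is mine, not the project's):

import numpy as np

def keep_high_variance(data, features, fea_index):
    # one variance per original column, so the index lines up with fea_index
    sub = data[:, fea_index].astype(float)
    variances = np.var(sub, axis=0)
    mean_var = np.mean(variances)
    return [features[fea_index[i]]
            for i in range(len(fea_index)) if variances[i] >= mean_var]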
Example #3
def use_PCA_to_delete(data, features, needed_delete_featuers):
	stored_features = dict()
	for fea in needed_delete_featuers:

		stored = list()

		print("now!:", fea)
		fea_index = find_featuers_index(fea, features)
		print("finded: ", fea_index)
		delete_features_data = data[:, fea_index]
		from sklearn import decomposition
		pca = decomposition.PCA()
		pca.fit(delete_features_data)

		result = pca.explained_variance_
		print(result)
		mean = np.mean(result)
		print("mean:", mean)
		stored = [features[fea_index[i]] for i in range(len(result)) \
							 if result[i] >= mean]
		#print(stored)

		save_result(stored, "after_deleted_by_pca.csv", style = "a+")
		stored_features[fea] = stored
	print(stored_features)
	return stored_features 
Example #4
def replace_miss(data, features, label = "", for_train = True, is_round_two = False):
	delete_fea_index = []
	missing_num = []
	new_data = data.copy()
	new_features = features.copy()

	if for_train:
		if is_round_two:
			SAVE_DIR = "resultData_two"
		else:
			SAVE_DIR = "resultData"
	else:
		if is_round_two:
			SAVE_DIR = "resultData_two/test/"
		else:
			SAVE_DIR = "resultData/test/"
	threshold = int(data.shape[0] * 2 / 3)
	print("threshold: ", threshold)
	# start from range(1, ...) because the first feature is the id, which is useless
	for fea_pos in range(1, len(features)):
		fea_val_cla = feature_value_class(data, fea_pos, label)
		if fea_val_cla[-1]._present_num != 0:
			new_data = replace_miss_with_specialV(new_data, fea_pos, fea_val_cla, label, \
												delete_fea_index, missing_num, threshold)
	if for_train and not is_round_two:
		new_data, new_features, deleted_feas = delete_features(new_data, new_features, \
															delete_fea_pos = delete_fea_index)
		print("delete while training: ", deleted_feas)
		#save_result(new_data, "data_after_delete_too_many_missing_features.csv", new_features)

		save_result(np.array(missing_num), "deleted_features_with_too_many_missing.csv", \
					np.array(deleted_feas), dir_name = SAVE_DIR)


	return new_data, new_features
Example #5
def calculate_draw_roc(classifier, data, features, label, cv_Flod, original_data, original_label):
	mean_tpr = 0.0
	mean_fpr = np.linspace(0, 1, 100)
	all_tpr = []

	my_test = original_data[:3000]
	my_label = original_label[:3000]

	features_importance = dict()

	for i, (train, test) in enumerate(cv_Flod):
		fitted_classifier = classifier.fit(data[train], label[train])
		probas_ = fitted_classifier.predict_proba(data[test])
		if i == 1:
			save_result(probas_, "predict_result.csv")
			save_result(label[test], "original_result.csv")

		# Compute ROC curve and area under the curve
		fpr, tpr, thresholds = roc_curve(label[test], probas_[:, 1])
		mean_tpr += interp(mean_fpr, fpr, tpr)
		mean_tpr[0] = 0.0
		roc_auc = auc(fpr, tpr)
		plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

		importances = fitted_classifier.feature_importances_

		indices = np.argsort(importances)[::-1]

		print("Feature ranking: ")
		for f in range(data.shape[1]):
			print("%s. %d (%f)" % (features[indices[f]], indices[f], importances[indices[f]]))
			features_importance[features[indices[f]]] = importances[indices[f]]

	test_probs = fitted_classifier.predict_proba(my_test)
	test_fpr, test_tpr, test_thresholds = roc_curve(my_label, test_probs[:, 1])
	roc_auc = auc(test_fpr, test_tpr)
	plt.plot(test_fpr, test_tpr, lw=1, label='ROC test (area = %0.2f)' % (roc_auc))



	plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

	mean_tpr /= len(cv_Flod)
	mean_tpr[-1] = 1.0
	mean_auc = auc(mean_fpr, mean_tpr)
	plt.plot(mean_fpr, mean_tpr, 'k--',
	         label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

	plt.xlim([-0.05, 1.05])
	plt.ylim([-0.05, 1.05])
	plt.xlabel('False Positive Rate')
	plt.ylabel('True Positive Rate')
	plt.title('Receiver operating characteristic example')
	plt.legend(loc="lower right")
	plt.savefig("ROC_GB_user_all_solved_lr(0.05).png")

	return features_importance
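
This snippet relies on names imported elsewhere in its source file. One hedged, self-contained set of imports that satisfies it (interp was a SciPy alias of numpy.interp and is gone from recent SciPy releases, so NumPy's version is used here):

import numpy as np
import matplotlib.pyplot as plt
from numpy import interp  # replacement for the removed scipy.interp
from sklearn.metrics import roc_curve, auc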
Example #6
def combine_land_modify_infos(data,
                              log_info_data,
                              update_info_data,
                              saved_dir="resultData"):
    for_train = False
    # only two columns means this data carries no target column, i.e. it comes
    # from the test set; the flag is then True and the target below stays None
    if data.shape[1] == 2:
        for_train = True

    all_id_info = OrderedDict()
    for id_pos in range(len(data)):
        id_name = data[id_pos, 0]
        all_id_info[id_name] = OrderedDict()
        if for_train:
            all_id_info[id_name]["target"] = None
        else:
            all_id_info[id_name]["target"] = data[id_pos, 1]
        splited_date = data[id_pos, -1].split("/")
        # if the date's format is day-first, e.g. 02/3/2014
        if int(splited_date[-1]) > int(splited_date[0]):
            # convert it to 2014/3/02
            t = splited_date[-1]
            splited_date[-1] = splited_date[0]
            splited_date[0] = t
            data[id_pos, -1] = splited_date[0] + "/" + splited_date[
                1] + "/" + splited_date[-1]
        all_id_info[id_name]["borrow_success_date"] = data[id_pos, -1]
        # add the land info
        all_id_info[id_name]["land_info"] = OrderedDict()
        land_date = list()
        land_operate_code = list()
        land_operate_style = list()

        id_index_in_land = np.where(log_info_data[:, 0] == id_name)[0]
        for i in id_index_in_land:

            land_operate_code.append(log_info_data[i, 2])
            land_operate_style.append(log_info_data[i, 3])
            land_date.append(log_info_data[i, 4])
        all_id_info[id_name]["land_info"][
            "land_operate_code"] = land_operate_code
        all_id_info[id_name]["land_info"][
            "land_operate_style"] = land_operate_style
        all_id_info[id_name]["land_info"]["land_date"] = land_date

        # add the modify info
        all_id_info[id_name]["modify_info"] = OrderedDict()
        modify_info = list()
        modify_date = list()
        id_index_in_modify = np.where(update_info_data[:, 0] == id_name)[0]
        for i in id_index_in_modify:
            modify_info.append(update_info_data[i, 2])
            modify_date.append(update_info_data[i, 3])

        all_id_info[id_name]["modify_info"]["modify_things"] = modify_info
        all_id_info[id_name]["modify_info"]["modify_date"] = modify_date
    save_result(all_id_info, "all_id_info.pickle", dir_name=saved_dir)
Example #7
def combine_land_modify_infos(data, log_info_data, update_info_data, for_train, LU_info_file, saved_dir):


	all_id_info = OrderedDict()
	for id_pos in range(len(data)):
		id_name = data[id_pos, 0]
		all_id_info[id_name] = OrderedDict()
		if not for_train:
			all_id_info[id_name]["target"] = None
		else:
			all_id_info[id_name]["target"] = data[id_pos, 1]
		data[id_pos, -1] = rule_the_date_style(data[id_pos, -1])
		# splited_date = data[id_pos, -1].split("/")
		# # it is not splited by "/", so we try another "-"
		# if len(splited_date) == 1:
		# 	splited_date = data[id_pos, -1].split("-")
		# # is the date`s style is 02/3/2014 
		# if int(splited_date[-1]) > int(splited_date[0]):
		# 	# convert it to 2014/3/02
		# 	t = splited_date[-1]
		# 	splited_date[-1] = splited_date[0]
		# 	splited_date[0] = t
		# 	data[id_pos, -1] = splited_date[0] + "/" + splited_date[1] + "/" + splited_date[-1]
		all_id_info[id_name]["borrow_success_date"] = data[id_pos, -1]
		# add the land info
		all_id_info[id_name]["land_info"] = OrderedDict()
		land_date = list()
		land_operate_code = list()
		land_operate_style = list()

		id_index_in_land = np.where(log_info_data[:, 0] == id_name)[0]
		for i in id_index_in_land:

			land_operate_code.append(log_info_data[i, 2])
			land_operate_style.append(log_info_data[i, 3])
			land_date.append(rule_the_date_style(log_info_data[i, 4]))
		all_id_info[id_name]["land_info"]["land_operate_code"] = land_operate_code
		all_id_info[id_name]["land_info"]["land_operate_style"] = land_operate_style
		all_id_info[id_name]["land_info"]["land_date"] = land_date

		# add the modify info
		all_id_info[id_name]["modify_info"] = OrderedDict()
		modify_info = list()
		modify_date = list()
		id_index_in_modify = np.where(update_info_data[:, 0] == id_name)[0]
		for i in id_index_in_modify:
			modify_info.append(update_info_data[i, 2])
			modify_date.append(rule_the_date_style(update_info_data[i, 3]))

		all_id_info[id_name]["modify_info"]["modify_things"] = modify_info
		all_id_info[id_name]["modify_info"]["modify_date"] = modify_date
	save_result(all_id_info, LU_info_file, dir_name = saved_dir)
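
rule_the_date_style is not shown on this page; judging from the commented-out block above, it normalizes a day-first date such as 02/3/2014 (or 02-3-2014) to the year-first 2014/3/02 form. A minimal sketch under that assumption (a reconstruction, not the original helper):

def rule_the_date_style(date_str):
	parts = date_str.split("/")
	# it is not split by "/", so try "-"
	if len(parts) == 1:
		parts = date_str.split("-")
	# the year sits at the end, e.g. 02/3/2014 -> 2014/3/02
	if int(parts[-1]) > int(parts[0]):
		parts[0], parts[-1] = parts[-1], parts[0]
	return parts[0] + "/" + parts[1] + "/" + parts[-1]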
Example #8
def solve_weblog_info_package(data, features, saved_dir = "resultData/"):
	from map_features_to_digit import convert_to_numerical
	from solve_data import delete_features

	data, features = new_WI_19(data, features)
	data, features = new_WI_20_by_present(data, features)
	#data, features = new_WI_20_by_positive(data, features)
	data, features = new_WI_21(data, features)
#	save_result(data, "data_after_solve_WeblogInfo_21.csv", features, dir_name = saved_dir)
	
	save_result(data, "data_after_solved_weblog.csv", features, dir_name = saved_dir)

	return data, features
Example #9
File: main.py  Project: CSJLOVEJX/DataPigs
def pipeline_for_features_solved(for_train = True, is_round_two = False):
	if not for_train and not is_round_two:
		print("I f**k your mother, cao ni ma de !! SB!!!!!")
		return 0
	if for_train:
		print("**************** Train ************************")
		if is_round_two:
			print("******* Round Two *********")
			data_file_name = "Kesci_Master_9w_gbk_3_2.csv"
			saved_area = "resultData_two"
		else:
			data_file_name = "PPD_Training_Master_GBK_3_1_Training_Set.csv"
			saved_area = "resultData"
	else:
		print("**************** Test ************************")
		if is_round_two:
			print("******* Round Two *********")
			data_file_name = "Kesci_Master_9w_gbk_1_test_set.csv"
			saved_area = "resultData_two/test"
		else:
			data_file_name = "PPD_Master_GBK_2_Test_Set.csv"
			saved_area = "resultData/test"

	data, features, label = load_data_for_solve(data_file_name, for_train, is_round_two)
	data, features = replace_miss(data, features, label, for_train, is_round_two)

	if not for_train or is_round_two:
		print("all deleted: ")
		deleted_features_in_train = load_all_deleted_features_during_train(is_round_two = False, deleted_features_file_label = "deleted_")
		#print(deleted_features_in_train)
		data, features, deleted = delete_features(data, features, delete_feas_list = deleted_features_in_train)
		print(deleted)

	data, features = solve_user_info_package(data, features, saved_dir = saved_area)
	#save_result(data, "after_solve_user_info.csv", features, dir_name = saved_area)
	data, features = solve_weblog_info_package(data, features, saved_dir = saved_area)
	if for_train and not is_round_two:
		data, features = deleted_web_log_features(data, features, saved_dir = saved_area)

	data, features = solve_thirdparty_info_package(data, features, saved_dir = saved_area)


	data, features = extract_log_update_package(data, features, for_train, is_round_two)

	save_result(data, "data_after_features_processed.csv", features, dir_name = saved_area)

	print("****** all finished *********")
	print("size: (data, features)")
	print(data.shape)

	return data, features
Example #10
def combine_land_modify_infos(data, log_info_data, update_info_data, saved_dir = "resultData"):
	for_train = False
	# only two columns means this data carries no target column, i.e. it comes
	# from the test set; the flag is then True and the target below stays None
	if data.shape[1] == 2:
		for_train = True

	all_id_info = OrderedDict()
	for id_pos in range(len(data)):
		id_name = data[id_pos, 0]
		all_id_info[id_name] = OrderedDict()
		if for_train:
			all_id_info[id_name]["target"] = None
		else:
			all_id_info[id_name]["target"] = data[id_pos, 1]
		splited_date = data[id_pos, -1].split("/")
		# if the date's format is day-first, e.g. 02/3/2014
		if int(splited_date[-1]) > int(splited_date[0]):
			# convert it to 2014/3/02
			t = splited_date[-1]
			splited_date[-1] = splited_date[0]
			splited_date[0] = t
			data[id_pos, -1] = splited_date[0] + "/" + splited_date[1] + "/" + splited_date[-1]
		all_id_info[id_name]["borrow_success_date"] = data[id_pos, -1]
		# add the land info
		all_id_info[id_name]["land_info"] = OrderedDict()
		land_date = list()
		land_operate_code = list()
		land_operate_style = list()

		id_index_in_land = np.where(log_info_data[:, 0] == id_name)[0]
		for i in id_index_in_land:

			land_operate_code.append(log_info_data[i, 2])
			land_operate_style.append(log_info_data[i, 3])
			land_date.append(log_info_data[i, 4])
		all_id_info[id_name]["land_info"]["land_operate_code"] = land_operate_code
		all_id_info[id_name]["land_info"]["land_operate_style"] = land_operate_style
		all_id_info[id_name]["land_info"]["land_date"] = land_date

		# add the modify info
		all_id_info[id_name]["modify_info"] = OrderedDict()
		modify_info = list()
		modify_date = list()
		id_index_in_modify = np.where(update_info_data[:, 0] == id_name)[0]
		for i in id_index_in_modify:
			modify_info.append(update_info_data[i, 2])
			modify_date.append(update_info_data[i, 3])

		all_id_info[id_name]["modify_info"]["modify_things"] = modify_info
		all_id_info[id_name]["modify_info"]["modify_date"] = modify_date
	save_result(all_id_info, "all_id_info.pickle", dir_name = saved_dir)
Example #11
def solve_thirdparty_info_package(data, features, saved_dir = "resultData/"):
	data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"])
	data = convert_to_numerical(data, features)

	data, features = sta_start_missing_period(data, features)
	data, features = remove_thirdparty6(data, features)

	data, features = fill_thirdParty_miss(data, features)

	data, features = third_party_stable(data, features)

	data, features = third_party_level(data, features)
	save_result(data, "data_after_thirdparty_solved.csv", features, dir_name = saved_dir)
	return data, features 
Example #12
def correlation_between_properties(data, features):
	fixed_str_features = np.array(load_result("str_features.csv"))[0]
	indexs = get_known_features_index(features, fixed_str_features)

	title = list()
	title.append("features1")
	title.append("features2")
	title.append("calculate_method") 
	title.append("cor")
	title.append("pval")
	save_result(title, "pearsonr_spearmanr_results.csv")
	save_result(title, "pearsonr_spearmanr_Strong_correlation.csv")
	for fea_pos in range(len(features)):
		for fea_pos_add in range(fea_pos + 1, len(features)):
			info_result = list()
			info_result.append(features[fea_pos])
			info_result.append(features[fea_pos_add])
			a1 = data[:, fea_pos]
			a2 = data[:, fea_pos_add]
			# neither feature is a str-style feature
			if fea_pos not in indexs and fea_pos_add not in indexs:
				info_result.append("pearsonr")
				cor, pval = stats.pearsonr(a1, a2)
			else:  # at least one of the two is a str-style feature
				info_result.append("spearmanr")
				cor, pval = stats.spearmanr(a1, a2)
			cor = round(cor, 3)
			info_result.append(cor)
			info_result.append(pval)
			if abs(cor) >= 0.2:
				save_result(info_result, "pearsonr_spearmanr_results.csv", style = "a+")
			if abs(cor) >= 0.86:
				save_result(info_result, "pearsonr_spearmanr_Strong_correlation.csv", \
												style = "a+")
Example #13
def use_RandomForestRegressor_to_delete(data, features, label):
	from sklearn.cross_validation import cross_val_score, ShuffleSplit
	from sklearn.ensemble import RandomForestRegressor

	rf = RandomForestRegressor(n_estimators=50, max_depth=4)
	scores = []
	deleted_features = list()
	for i in range(1, data.shape[1]):
		score = cross_val_score(rf, data[:, i:i+1], label, scoring="r2",
								cv=ShuffleSplit(len(data), 3, .3))
		scores.append((round(np.mean(score), 3), features[i]))
		if round(np.mean(score), 3) < 0.01:
			deleted_features.append({features[i]: round(np.mean(score), 3)})
	save_result(deleted_features, "RandomForestRegressor_delete_result.csv")
	print(sorted(scores, reverse=True))
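
sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20. Under current scikit-learn, the per-column scoring above would be written against sklearn.model_selection; a sketch (the wrapper name is mine):

from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.ensemble import RandomForestRegressor

def single_feature_r2(data, label, i):
	# ShuffleSplit now takes keywords instead of the old (n, n_iter, test_size)
	rf = RandomForestRegressor(n_estimators=50, max_depth=4)
	cv = ShuffleSplit(n_splits=3, test_size=0.3)
	return cross_val_score(rf, data[:, i:i + 1], label, scoring="r2", cv=cv).mean()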
Example #14
def remove_no_discrimination(data, features, label):
	# bail out when there is no usable label array (label may be an empty string)
	try:
		label.shape
	except AttributeError:
		return data, features

	index_entroy = sort_features_with_entroy(data, features, label)
	new_data, new_features, deleted_features, delete_fea_entroy = delete_no_discrimination_features(data, features, index_entroy)
	save_result(np.array(delete_fea_entroy), "deleted_features_with_no_discrimination(entroy).csv", \
				np.array(deleted_features))
	# save_result(new_data, "data_after_delete_no_discrimination_features.csv", \
	# 			new_features)
	# save_features_info(new_data, new_features, label, "infos_after_delete_features.csv")
	#write_to_deleted_features_area(np.array(deleted_features))

	return new_data, new_features
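
sort_features_with_entroy is also defined elsewhere; a minimal sketch of the per-feature Shannon entropy such a ranking presumably builds on (the exact ranking logic is an assumption):

import numpy as np

def feature_entropy(column):
	# Shannon entropy of a discrete column; 0 means a constant value,
	# i.e. a feature with no discriminative power
	_, counts = np.unique(column, return_counts=True)
	p = counts / counts.sum()
	return float(-np.sum(p * np.log2(p)))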
Example #15
def solve_weblog_info_package(data, features, saved_dir="resultData/"):
    from map_features_to_digit import convert_to_numerical
    from solve_data import delete_features

    data, features = new_WI_19(data, features)
    data, features = new_WI_20_by_present(data, features)
    #data, features = new_WI_20_by_positive(data, features)
    data, features = new_WI_21(data, features)
    #	save_result(data, "data_after_solve_WeblogInfo_21.csv", features, dir_name = saved_dir)

    save_result(data,
                "data_after_solved_weblog.csv",
                features,
                dir_name=saved_dir)

    return data, features
Example #16
def solve_thirdparty_info_package(data, features, saved_dir="resultData/"):
    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)

    data, features = sta_start_missing_period(data, features)
    data, features = remove_thirdparty6(data, features)

    data, features = fill_thirdParty_miss(data, features)

    data, features = third_party_stable(data, features)

    data, features = third_party_level(data, features)
    save_result(data,
                "data_after_thirdparty_solved.csv",
                features,
                dir_name=saved_dir)
    return data, features
Example #17
def use_RandomForestRegressor_to_delete(data, features, label):
    from sklearn.cross_validation import cross_val_score, ShuffleSplit
    from sklearn.ensemble import RandomForestRegressor

    rf = RandomForestRegressor(n_estimators=50, max_depth=4)
    scores = []
    deleted_features = list()
    for i in range(1, data.shape[1]):
        score = cross_val_score(rf,
                                data[:, i:i + 1],
                                label,
                                scoring="r2",
                                cv=ShuffleSplit(len(data), 3, .3))
        scores.append((round(np.mean(score), 3), features[i]))
        if round(np.mean(score), 3) < 0.01:
            deleted_features.append({features[i]: round(np.mean(score), 3)})
    save_result(deleted_features, "RandomForestRegressor_delete_result.csv")
    print(sorted(scores, reverse=True))
Example #18
def load_data_for_solve(file_name, for_train = True):
	label = ""
	if for_train:
		SAVE_DIR = "resultData/"
		# for training, load the data from train directory
		original_features, original_data, original = load_data(file_name)
		original_features, original_data, train_label = extract_target(original_features, original_data)
		label = train_label.copy()
		save_result(label, "train_label_original.csv", dir_name = SAVE_DIR)
	else:
		SAVE_DIR = "resultData/test/"
		# for testing or else, load the data from other place
		original_features, original_data, original = load_data(file_name, data_style = "Test Set")
		#print(deleted_features)
	data = original_data.copy()
	features = original_features.copy()
	save_result(data, "withoutLabel_originalData.csv", features, dir_name = SAVE_DIR)

	return data, features, label
Example #19
File: main.py  Project: CSJLOVEJX/DataPigs
def submit(test_predict, save_dir):
	###################################### Idx #########################
	print(test_predict)
	test_predict = np.array([round(test_predict[i], 4) for i in range(test_predict.shape[0])])
	print(test_predict)
	contents = load_result("withoutLabel_originalData.csv", dir_name = "resultData_All/test")
	features = np.array(contents[0])

	sublime_features = np.array([features[0], "score"] )

	save_result(sublime_features, "sublime_data.csv", dir_name = save_dir)

	data = np.array(contents[1:])

	test_users = data[:, 0]
	test_users = test_users.reshape((test_users.size, 1))

	test_predict = test_predict.reshape((test_predict.size, 1))

	sublime_data = np.concatenate((test_users, test_predict), axis = 1)


	save_result(sublime_data, "sublime_data.csv", style = "a+", dir_name = save_dir)
Example #20
def deleted_web_log_features(data, features, saved_dir = "result"):
	from create_new_features import find_featuers_index
	features_name = "WeblogInfo"
	fea_indexs = find_featuers_index(features_name, features)
	# print(fea_indexs)
	weblog_data = data[:, fea_indexs]
	weblog_features = features[fea_indexs]

	correlation_between_properties(weblog_data, weblog_features)
	delete_result = according_properties_correlation_delete()
	#save_result(data, file_name, features, style, dir_name)
	save_result(delete_result, "deleted_weblog_features_with_strong_correlation.csv")

	weblog_delete_needed = ["WeblogInfo_10", "WeblogInfo_11", "WeblogInfo_12", "WeblogInfo_13",
						"WeblogInfo_23", "WeblogInfo_25", "WeblogInfo_26", "WeblogInfo_28",
						"WeblogInfo_31", "WeblogInfo_55", "WeblogInfo_58"]
	save_result(weblog_delete_needed, "deleted_useless_weblog.csv")

	delete_result.extend(weblog_delete_needed)
	
	data, features, deleted = delete_features(data, features, \
											delete_feas_list = delete_result)
	print("Train delete(weblog) : ", deleted)
	return data, features
Example #21
def correlation_between_properties(data, features):
    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    indexs = get_known_features_index(features, fixed_str_features)

    title = list()
    title.append("features1")
    title.append("features2")
    title.append("calculate_method")
    title.append("cor")
    title.append("pval")
    save_result(title, "pearsonr_spearmanr_results.csv")
    save_result(title, "pearsonr_spearmanr_Strong_correlation.csv")
    for fea_pos in range(len(features)):
        for fea_pos_add in range(fea_pos + 1, len(features)):
            info_result = list()
            info_result.append(features[fea_pos])
            info_result.append(features[fea_pos_add])
            a1 = data[:, fea_pos]
            a2 = data[:, fea_pos_add]
            # neither feature is a str-style feature
            if fea_pos not in indexs and fea_pos_add not in indexs:
                info_result.append("pearsonr")
                cor, pval = stats.pearsonr(a1, a2)
            else:  # at least one of the two is a str-style feature
                info_result.append("spearmanr")
                cor, pval = stats.spearmanr(a1, a2)
            cor = round(cor, 3)
            info_result.append(cor)
            info_result.append(pval)
            if abs(cor) >= 0.2:
                save_result(info_result,
                            "pearsonr_spearmanr_results.csv",
                            style="a+")
            if abs(cor) >= 0.86:
                save_result(info_result, "pearsonr_spearmanr_Strong_correlation.csv", \
                        style = "a+")
Example #22
def load_data_for_solve(file_name, for_train = True, is_round_two = False):
	label = ""
	if for_train:
		if is_round_two:
			SAVE_DIR = "resultData_two/"
			data_dir = "PPD-Second-Round-Data/"
			data_style = "Rematch Train/"
			# for training, load the data from train directory
			original_features, original_data, original = load_data(file_name, data_dir, data_style)
			original_features, original_data, train_label = extract_target(original_features, original_data)
			label = train_label.copy()
			save_result(label, "train_label_original_round_two.csv", dir_name = SAVE_DIR)
		else:
			SAVE_DIR = "resultData/"
			# for training, load the data from train directory
			original_features, original_data, original = load_data(file_name)
			original_features, original_data, train_label = extract_target(original_features, original_data)
			label = train_label.copy()
			save_result(label, "train_label_original.csv", dir_name = SAVE_DIR)
	else:
		if is_round_two:
			SAVE_DIR = "resultData_two/test"
			data_dir = "PPD-Second-Round-Data/"
			data_style = "Rematch Test/"
			# for testing, load the data from the rematch test directory
			original_features, original_data, original = load_data(file_name, data_dir, data_style)

		else: 
			SAVE_DIR = "resultData/test/"
			# for testing or else, load the data from other place
			original_features, original_data, original = load_data(file_name, data_style = "Test Set")
	
	data = original_data.copy()
	features = original_features.copy()
	save_result(data, "withoutLabel_originalData.csv", features, dir_name = SAVE_DIR)

	return data, features, label
Example #23
	data, features = new_UserInfo_23_education_level(data, features)
#	save_result(data, "data_after_solved_UserInfo23.csv", features, dir_name = saved_dir)

	data, features = new_UserInfo_24_resident_level(data, features)
#	save_result(data, "data_after_solved_UserInfo24.csv", features, dir_name = saved_dir)

	#data, features = new_UserInfo_22_23_combine1(data, features)
#	save_result(data, "data_after_solved1_UserInfo22_23.csv", features, dir_name = saved_dir)

	data, features = new_UserInfo_22_23_combine2(data, features)
	#save_result(data, "data_after_solved_user_info.csv", features, dir_name = saved_dir)

	return data, features

# new_UserInfo_7_num
if __name__ == '__main__':

	contents = load_result("withoutLabel_originalData.csv")
	features = np.array(contents[0])
	data = np.array(contents[1:])

	data, features = new_UserInfo_7_num(data, features)
	save_result(data, "test.csv", features)
	# deleted_features_in_train = load_all_deleted_features_during_train(deleted_features_file_label = "deleted_")
	# data, features, deleted = delete_features(data, features, delete_feas_list = deleted_features_in_train)

	# data, features = solve_user_info_package(data, features)

	# from create_features_from_weblog import solve_weblog_info_package
	# data, features = solve_weblog_info_package(data, features)
Example #24
def view_each_features_label(data, features, label):
    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    str_style_features = np.array(load_result("str_features.csv")[0])
    str_features_index = get_known_features_index(features, str_style_features)

    new_label = label.reshape((label.size, ))
    x = range(len(data))
    for fea_pos in range(len(features)):
        feature_name = features[fea_pos]
        if fea_pos in str_features_index:
            file_path = "view_data_area/after_all/with_label_under_mean/" + "(str" + str(
                fea_pos) + ")" + feature_name + ".png"
        else:
            file_path = "view_data_area/after_all/with_label_under_mean/" + str(
                fea_pos) + ")" + feature_name + ".png"
        features_info = feature_value_class(data, fea_pos, label,
                                            str_features_index)
        if features_info["num_of_value"] > 30:
            save_result([features[fea_pos]],
                        "complex_value_features.csv",
                        style="a+")
        else:
            if fea_pos not in str_features_index:
                save_result(
                    [features[fea_pos]],
                    "simple_discrete_value_features(nonestrfeatures).csv",
                    style="a+")

        y_positive = data[new_label == 1, fea_pos]
        y_negitive = data[new_label == 0, fea_pos]
        positive_index = np.array([
            index for index in range(len(new_label)) if new_label[index] == 1
        ])
        negitive_index = np.array([
            index for index in range(len(new_label)) if new_label[index] == 0
        ])
        plt.scatter(positive_index, y_positive, marker='o', color='r', s=10)
        plt.scatter(negitive_index, y_negitive, marker='x', color='g', s=10)

        plt.xlabel("instances(30000)")
        plt.ylabel("value")
        if features_info["num_of_value"] < 40:
            plt.title(feature_name + " value - label " + "distributed " + "in instances" + \
               "\n the arrow --> Proportion of positive in that value & in positive")
            for k, v in features_info.items():
                if isinstance(v, FeatureInData):
                    arrow_data = round(
                        v._respond_positive_num /
                        features_info["num_positive"], 4)
                    arrow_start_position_x = len(data) + 2000
                    arrow_start_position_y = int(k)
                    arrow_end_postion_x = arrow_start_position_x
                    arrow_end_postion_y = int(k)
                    plt.annotate(arrow_data, \
                       xy=(arrow_start_position_x,arrow_start_position_y), \
                       xytext=(arrow_end_postion_x,arrow_end_postion_y), \
                       arrowprops=dict(facecolor='blue', shrink=0.02))

                    arrow_data = round(
                        v._respond_positive_num / v._present_num, 4)
                    arrow_start_position_x = -4000
                    arrow_start_position_y = int(k)
                    arrow_end_postion_x = arrow_start_position_x
                    arrow_end_postion_y = int(k)
                    plt.annotate(arrow_data, \
                       xy=(arrow_start_position_x,arrow_start_position_y), \
                       xytext=(arrow_end_postion_x,arrow_end_postion_y), \
                       arrowprops=dict(facecolor='blue', shrink=0.02))

        else:
            fea_average = round(np.mean(data[:, fea_pos]), 4)
            fea_std = np.std(data[:, fea_pos])
            fea_oo = round(fea_std / fea_average, 4)
            max_v = np.amax(data[:, fea_pos])
            min_v = np.amin(data[:, fea_pos])
            plt.title(feature_name + " | mean & Proportion of positive under that mean" + \
             "\n degree of fluctuation --> " + str(fea_oo))
            x1 = np.array(range(-5000, 35000))
            y_mean = fea_average * np.ones((x1.size))
            #plt.plot(x1, y_mean, color = 'k', linestyle = "--")
            plt.annotate(fea_average, \
                 xy=(-4000,fea_average), \
                 xytext=(-4000,fea_average), \
                 arrowprops=dict(facecolor='blue', shrink=0.05))
            under_mean_positive = 0
            under_mean_num = 0

            for k, v in features_info.items():
                if isinstance(v, FeatureInData):
                    if k <= fea_average:
                        under_mean_num += v._present_num
                        under_mean_positive += v._respond_positive_num
            ave_posi = round(
                under_mean_positive / features_info["num_positive"], 4)
            plt.annotate(ave_posi, \
              xy=(31000,fea_average), \
              xytext=(31000,fea_average), \
              arrowprops=dict(facecolor='blue', shrink=0.05))
            pos_rat = 0
            pos_rat_whole = 0
            if -1 in features_info.keys():
                pos_rat = features_info[
                    -1]._respond_positive_num / features_info[-1]._present_num
                pos_rat_whole = features_info[
                    -1]._respond_positive_num / features_info["num_positive"]
                plt.annotate(round(pos_rat_whole, 4), \
                  xy=(31000,-1), \
                  xytext=(31000,-1))
                plt.annotate(round(pos_rat, 4), \
                  xy=(-4000,-1), \
                  xytext=(-4000,-1))
            plt.ylim(min_v - 10, fea_average * 2)
            #plt.ylim(fea_average - round(fea_average / 10), max_v + round(fea_average / 10))
        plt.savefig(file_path)
        plt.close()
Example #25
def solve_user_info_package(data, features, saved_dir="resultData"):
    data, features = count_missed_create_new_feature(data, features,
                                                     "UserInfo")

    # ################### solve the education info ####################
    data, features = new_EI_2(data, features)
    data, features = new_EI_4(data, features)
    data, features = new_EI_1_2_3_4(data, features)
    #	save_result(data, "data_after_combine_EI1234.csv", features, dir_name = saved_dir)
    data, features = new_EI_6(data, features)
    data, features = new_EI_8(data, features)
    data, features = new_EI_5_6_7_8(data, features)
    #	save_result(data, "data_after_combine_EI5678.csv", features, dir_name = saved_dir)

    data, features = new_UserInfo_miss_count(data, features)
    #	save_result(data, "data_after_count_UserInfo_miss.csv", features, dir_name = saved_dir)

    data, features = new_UserInfo_2_level(data, features)
    #	save_result(data, "data_after_solved_UserInfo2_level.csv", features, dir_name = saved_dir)

    data, features = new_UserInfo_7_level(data, features)
    #	save_result(data, "data_after_solved_UserInfo7_level.csv", features, dir_name = saved_dir)

    key_features = ["UserInfo_2", "UserInfo_4"]
    feature_name = "UserInfo_2_4_wrong_correspond_city)"
    data, features = new_UserInfo_differ(data, features, key_features,
                                         feature_name)
    #	save_result(data, "data_after_solved_UserInfo2_4.csv", features, dir_name = saved_dir)

    key_features = ["UserInfo_5", "UserInfo_6"]
    feature_name = "UserInfo_5_6_differ"
    data, features = new_UserInfo_differ(data,
                                         features,
                                         key_features,
                                         feature_name,
                                         deleted_all=False)
    #	save_result(data, "data_after_solved_UserInfo5_6.csv", features, dir_name = saved_dir)

    data, features = new_UserInfo_789(data, features)
    #	save_result(data, "data_after_solved_UserInfo789.csv", features, dir_name = saved_dir)

    data, features = new_UserInfo_11_12_13(data, features)
    key_features = ["UserInfo_11", "UserInfo_12", "UserInfo_13"]
    feature_name = "UserInfo_11_12_13_is_miss"
    data, features = new_UserInfo_differ(data, features, key_features,
                                         feature_name)
    #	save_result(data, "data_after_solved_UserInfo11_12_13.csv", features, dir_name = saved_dir)

    key_features = ["UserInfo_14", "UserInfo_15"]
    feature_name = "UserInfo_14_15_differ"
    data, features = new_UserInfo_differ(data,
                                         features,
                                         key_features,
                                         feature_name,
                                         deleted_all=False)
    #	save_result(data, "data_after_solved_UserInfo14_15.csv", features, dir_name = saved_dir)

    key_features = ["UserInfo_16", "UserInfo_17"]
    feature_name = "UserInfo_16_17_differ"
    data, features = new_UserInfo_differ(data,
                                         features,
                                         key_features,
                                         feature_name,
                                         deleted_all=False)
    #	save_result(data, "data_after_solved_UserInfo16_17.csv", features, dir_name = saved_dir)

    data, features = new_UserInfo_18(data, features)
    #	save_result(data, "data_after_solved_UserInfo18.csv", features, dir_name = saved_dir)

    data, features = new_UserInfo_19_20(data, features)
    #	save_result(data, "data_after_solved_UserInfo19_20.csv", features, dir_name = saved_dir)

    data, features = new_UserInfo_22_marrage(data, features)
    #	save_result(data, "data_after_solved_UserInfo22.csv", features, dir_name = saved_dir)

    data, features = new_UserInfo_23_education_level(data, features)
    #	save_result(data, "data_after_solved_UserInfo23.csv", features, dir_name = saved_dir)

    data, features = new_UserInfo_24_resident_level(data, features)
    #	save_result(data, "data_after_solved_UserInfo24.csv", features, dir_name = saved_dir)

    #data, features = new_UserInfo_22_23_combine1(data, features)
    #	save_result(data, "data_after_solved1_UserInfo22_23.csv", features, dir_name = saved_dir)

    data, features = new_UserInfo_22_23_combine2(data, features)
    save_result(data,
                "data_after_solved_user_info.csv",
                features,
                dir_name=saved_dir)

    return data, features
Example #26
def strStyle_features_to_digit(data, features, for_train = True, use_experience = False, save_dir = "resultData/"):

	city_features = ["UserInfo_2", "UserInfo_4", "UserInfo_8", "UserInfo_20"]
	privince_features = ["UserInfo_7", "UserInfo_19"]
	phone_features = ["UserInfo_9"]
	marry_features = ["UserInfo_22"]
	resident_features = ["UserInfo_24"]

	# features_with_simply_value = ["WeblogInfo_2", "WeblogInfo_5", "WeblogInfo_8",
	# 								"UserInfo_10", "UserInfo_18", "WeblogInfo_24",
	# 								"WeblogInfo_27", "WeblogInfo_30"]
	# the remaining str-style features only need mapping
	#	when they contain special characters
	# --> the list below contains all such features
	contain_special_features = ["UserInfo_23", "Education_Info2", "Education_Info3", \
								"Education_Info4", "Education_Info6", "Education_Info7", \
								"WeblogInfo_19", "WeblogInfo_20", "Education_Info8", \
								"WeblogInfo_21", "ListingInfo"]


	digited_special_str_features = list()

	# digit city features
	digit_city_data = digit_city_features(data, features, city_features, use_original_features = True)
	#save_result(digit_city_data, "digited_city_data.csv", features)
	digited_special_str_features.extend(city_features)
	# digit province features
	digited_province_data = digit_province_features(digit_city_data, features, privince_features, \
													use_original_features = True)
	#save_result(digited_province_data, "digited_province_data.csv", features)
	digited_special_str_features.extend(privince_features)
	# digit phone features
	digited_phone_data = digit_phone_features(digited_province_data, features, phone_features, \
												use_original_features = True)
	#save_result(digited_phone_data, "digited_phone_data.csv", features)
	digited_special_str_features.extend(phone_features)
	# digit marrage features
	digited_marrage_data = digit_marry_features(digited_phone_data, features, marry_features, \
												use_original_features = True)
	#save_result(digited_marrage_data, "digited_marrage_data.csv", features)
	digited_special_str_features.extend(marry_features)
	# digit resident features
	digited_residence_data = digit_resident_features(digited_marrage_data, features, resident_features, \
													use_original_features = True)
	save_result(digited_residence_data, "data_when_digited_residence.csv", features)
	digited_special_str_features.extend(resident_features)

	digited_special_features_data = digited_residence_data
	
	if not for_train:
		use_experience = True
		save_dir = "resultData/test"
	# when mapping for training only, there is no experience map to reuse yet
	if not use_experience:
		digited_data, features_map_info = map_str_to_digit(digited_special_features_data, \
															features, digited_special_str_features, \
															contain_special_features)
		save_result(features_map_info, \
							FEATURES_MAP_INFO_FILE_NAME, \
							dir_name = "resultData/features_map")

	else:
		digited_data = map_str_to_digit_with_experience(digited_special_features_data, \
													features, digited_special_str_features, \
													contain_special_features)

	#digited_data = convert_to_numerical(convert_to_digit)
	save_result(digited_data, "data_after_Str_features_digited.csv", features, dir_name = save_dir)


	# return the fully digitized data; returning digited_special_features_data
	# here would discard the str-to-digit mapping applied above
	return digited_data
Example #27
# @Author  : chensijia ([email protected])
# @Version : 0.0.0
# @Style   : Python3.5
#
# @Description: 


from main_for_process_data import load_data_for_solve, replace_miss, strStyle_features_to_digit
from save_load_result import save_features_info, save_result, load_all_deleted_features_during_train
from solve_data import delete_features
import numpy as np



if __name__ == '__main__':
	data, features, label = load_data_for_solve("PPD_Master_GBK_2_Test_Set.csv", for_train = False)

	data, features = replace_miss(data, features, label, for_train = False)
	#save_result(data, "test/data_after_filling_missing_.csv", features)

	deleted_features_in_train = load_all_deleted_features_during_train(deleted_features_file_label = "deleted_features_with_too_many_missing")
	data, features, deleted = delete_features(data, features, delete_feas_list = deleted_features_in_train)
	save_result(data, "test_data_after_deleted_features.csv", features, dir_name = "resultData/test/")

	data = strStyle_features_to_digit(data, features, for_train = False, use_experience = True)
	save_result(data, "data_after_digited.csv", features, dir_name= "resultData/test/")
	save_features_info(data, features, label, "info_after_digit_all_features.csv", \
						dir_name = "resultData/test/")


Example #28
    data, features, deleted = delete_features(
        data, features, delete_feas_list=deleted_features_in_train)
    print(deleted)

    data, features = solve_user_info_package(data,
                                             features,
                                             saved_dir=saved_area)

    data, features = solve_weblog_info_package(data,
                                               features,
                                               saved_dir=saved_area)

    data, features = solve_thirdparty_info_package(data,
                                                   features,
                                                   saved_dir=saved_area)

    data, features = extract_log_update_package(data, features, for_train)

    return data, features


if __name__ == '__main__':
    # "resultData/test"
    data, features = pipeline_for_features_solved(
        for_train=False, saved_area="resultData/test/")
    print(data.shape)
    save_result(data,
                "data_after_features_processed.csv",
                features,
                dir_name="resultData/test")
Example #29
    contents = load_result("data_after_delete_no_discrimination_features.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])

    from map_features_to_digit import convert_to_numerical
    from solve_data import delete_features

    data = convert_to_numerical(data, features)

    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])

    correlation_between_properties(data, features)

    delete_result = according_properties_correlation_delete()
    save_result(delete_result, "deleted_features_with_strong_correlation.csv")


    data, features, deleted_features = delete_features(data, features, \
                  delete_feas_list = delete_result)
    # print(deleted_features)
    save_result(data, "data_after_delete_strong_correlation_features.csv",
                features)
    print(data.shape)

    ############### use PCA to delete #####################

    # features_style = ["UserInfo", "WeblogInfo", "ThirdParty_Info_Period1", \
    # 				"ThirdParty_Info_Period2", "ThirdParty_Info_Period3", \
    # 				"ThirdParty_Info_Period4", "ThirdParty_Info_Period5", \
    # 				"ThirdParty_Info_Period6"]
Example #30
from save_load_result import save_features_info, save_result, load_all_deleted_features_during_train
from solve_data import delete_features
import numpy as np

if __name__ == '__main__':
    data, features, label = load_data_for_solve(
        "PPD_Master_GBK_2_Test_Set.csv", for_train=False)

    data, features = replace_miss(data, features, label, for_train=False)
    #save_result(data, "test/data_after_filling_missing_.csv", features)

    deleted_features_in_train = load_all_deleted_features_during_train(
        deleted_features_file_label="deleted_features_with_too_many_missing")
    data, features, deleted = delete_features(
        data, features, delete_feas_list=deleted_features_in_train)
    save_result(data,
                "test_data_after_deleted_features.csv",
                features,
                dir_name="resultData/test/")

    data = strStyle_features_to_digit(data,
                                      features,
                                      for_train=False,
                                      use_experience=True)
    save_result(data,
                "data_after_digited.csv",
                features,
                dir_name="resultData/test/")
    save_features_info(data, features, label, "info_after_digit_all_features.csv", \
         dir_name = "resultData/test/")
Example #31
	contents = load_result("data_after_delete_no_discrimination_features.csv")
	features = np.array(contents[0])
	data = np.array(contents[1:])

	from map_features_to_digit import convert_to_numerical
	from solve_data import delete_features

	data = convert_to_numerical(data, features)

	data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"])


	correlation_between_properties(data, features)

	delete_result = according_properties_correlation_delete()
	save_result(delete_result, "deleted_features_with_strong_correlation.csv")

	
	data, features, deleted_features = delete_features(data, features, \
	 													delete_feas_list = delete_result)
	# print(deleted_features)
	save_result(data, "data_after_delete_strong_correlation_features.csv", features)
	print(data.shape)

	############### use PCA to delete #####################

	# features_style = ["UserInfo", "WeblogInfo", "ThirdParty_Info_Period1", \
	# 				"ThirdParty_Info_Period2", "ThirdParty_Info_Period3", \
	# 				"ThirdParty_Info_Period4", "ThirdParty_Info_Period5", \
	# 				"ThirdParty_Info_Period6"]
	# #print(features)
Example #32
    weblog_data = data[:, fea_indexs]
    weblog_features = features[fea_indexs]

    print(weblog_data.shape)
    print(weblog_features.shape)
    #save_result(weblog_data, "weblog_data_view.csv", weblog_features)

    # # label_lines = np.array(load_result("train_label_original.csv"))
    # # #print(label_lines.shape)
    # # from save_load_result import convert_to_float
    # # label = convert_to_float(label_lines)

    # # label = label.reshape((label.size, ))
    correlation_between_properties(weblog_data, weblog_features)
    delete_result = according_properties_correlation_delete()
    save_result(delete_result,
                "deleted_weblog_features_with_strong_correlation.csv")
    weblog_data, weblog_features, deleted_features = delete_features(weblog_data, weblog_features, \
                 delete_feas_list = delete_result)
    save_result(weblog_data, "data_after_delete_strong_correlation_weblog.csv",
                weblog_features)

    weblog_delete_needed = [
        "WeblogInfo_10", "WeblogInfo_11", "WeblogInfo_12", "WeblogInfo_13",
        "WeblogInfo_23", "WeblogInfo_25", "WeblogInfo_26", "WeblogInfo_28",
        "WeblogInfo_31", "WeblogInfo_55", "WeblogInfo_58"
    ]
    save_result(weblog_delete_needed, "deleted_useless_weblog.csv")

    new_data, new_features, deleted = delete_features(weblog_data, weblog_features, \
           delete_feas_list = weblog_delete_needed)
    save_result(new_data, "data_after_delete_useless_weblog.csv", new_features)
Example #33
	print(grid.best_params_)
	print(grid.best_score_)

	return grid.best_params_, grid.best_score_


if __name__ == '__main__':
	contents = load_result("all_data_after_features_processed.csv", dir_name = "resultData_All")
	features = np.array(contents[0])
	data = np.array(contents[1:])
	print("data: ", data.shape)
	label_lines = np.array(load_result("all_train_label_original.csv", dir_name = "resultData_All"))
	print(label_lines.shape)
	from save_load_result import convert_to_int
	label = convert_to_int(label_lines)

	label = label.reshape((label.size, ))
	print("label: ", label.shape)

	data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"])
	data = convert_to_numerical(data, features)


	# test_preds = module_xgboost_pre(train_data, train_label, test_data)
	# calculate_draw_roc(test_label, test_preds, save_fig_name = "module_xgb_ROC.png")

	auc_result, paras_result = grid_search_xgboost_params(data, label)
	print(auc_result)
	print(paras_result)
	save_result(auc_result, "grid_search_aucs.pickle", dir_name = "resultData_All")
	save_result(paras_result, "grid_search_paras.pickle", dir_name = "resultData_All")
Example #34
def view_each_features_label(data, features, label):
	data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"])
	str_style_features = np.array(load_result("str_features.csv")[0])
	str_features_index = get_known_features_index(features, str_style_features)

	new_label = label.reshape((label.size,))
	x = range(len(data))
	for fea_pos in range(len(features)):
		feature_name = features[fea_pos]
		if fea_pos in str_features_index:
			file_path = "view_data_area/after_all/with_label_under_mean/" + "(str" + str(fea_pos) + ")" + feature_name +  ".png"
		else:
			file_path = "view_data_area/after_all/with_label_under_mean/" + str(fea_pos) + ")" + feature_name +  ".png"
		features_info = feature_value_class(data, fea_pos, label, str_features_index)
		if features_info["num_of_value"] > 30:
			save_result([features[fea_pos]], "complex_value_features.csv", style = "a+")
		else:
			if fea_pos not in str_features_index:
				save_result([features[fea_pos]], "simple_discrete_value_features(nonestrfeatures).csv", style = "a+")


		y_positive = data[new_label == 1, fea_pos]
		y_negitive = data[new_label == 0, fea_pos]
		positive_index = np.array([index for index in range(len(new_label)) if new_label[index] == 1])
		negitive_index = np.array([index for index in range(len(new_label)) if new_label[index] == 0])
		plt.scatter(positive_index, y_positive, marker = 'o', color = 'r', s = 10)
		plt.scatter(negitive_index, y_negitive, marker = 'x', color = 'g', s = 10)

		plt.xlabel("instances(30000)")
		plt.ylabel("value")
		if features_info["num_of_value"] < 40:
			plt.title(feature_name + " value - label " + "distributed " + "in instances" + \
						"\n the arrow --> Proportion of positive in that value & in positive")
			for k, v in features_info.items():
				if isinstance(v, FeatureInData):
					arrow_data = round(v._respond_positive_num / features_info["num_positive"] , 4)
					arrow_start_position_x = len(data) + 2000
					arrow_start_position_y = int(k)
					arrow_end_postion_x = arrow_start_position_x
					arrow_end_postion_y = int(k)
					plt.annotate(arrow_data, \
								xy=(arrow_start_position_x,arrow_start_position_y), \
								xytext=(arrow_end_postion_x,arrow_end_postion_y), \
								arrowprops=dict(facecolor='blue', shrink=0.02))

					arrow_data = round(v._respond_positive_num / v._present_num , 4)
					arrow_start_position_x = -4000
					arrow_start_position_y = int(k)
					arrow_end_postion_x = arrow_start_position_x
					arrow_end_postion_y = int(k)
					plt.annotate(arrow_data, \
								xy=(arrow_start_position_x,arrow_start_position_y), \
								xytext=(arrow_end_postion_x,arrow_end_postion_y), \
								arrowprops=dict(facecolor='blue', shrink=0.02))

		else:
			fea_average = round(np.mean(data[:, fea_pos]), 4)
			fea_std = np.std(data[:, fea_pos])
			fea_oo = round(fea_std / fea_average, 4)
			max_v = np.amax(data[:, fea_pos])
			min_v = np.amin(data[:, fea_pos])
			plt.title(feature_name + " | mean & Proportion of positive under that mean" + \
				"\n degree of fluctuation --> " + str(fea_oo))
			x1 = np.array(range(-5000, 35000))
			y_mean = fea_average * np.ones((x1.size))
			#plt.plot(x1, y_mean, color = 'k', linestyle = "--")
			plt.annotate(fea_average, \
								xy=(-4000,fea_average), \
								xytext=(-4000,fea_average), \
								arrowprops=dict(facecolor='blue', shrink=0.05))
			under_mean_positive = 0
			under_mean_num = 0

			for k, v in features_info.items():
				if isinstance(v, FeatureInData):
					if k <= fea_average:
						under_mean_num += v._present_num
						under_mean_positive += v._respond_positive_num
			ave_posi = round(under_mean_positive / features_info["num_positive"], 4)
			plt.annotate(ave_posi, \
					xy=(31000,fea_average), \
					xytext=(31000,fea_average), \
					arrowprops=dict(facecolor='blue', shrink=0.05))
			pos_rat = 0
			pos_rat_whole = 0
			if -1 in features_info.keys():
				pos_rat = features_info[-1]._respond_positive_num / features_info[-1]._present_num
				pos_rat_whole = features_info[-1]._respond_positive_num / features_info["num_positive"]
				plt.annotate(round(pos_rat_whole, 4), \
						xy=(31000,-1), \
						xytext=(31000,-1))
				plt.annotate(round(pos_rat, 4), \
						xy=(-4000,-1), \
						xytext=(-4000,-1))
			plt.ylim(min_v - 10, fea_average * 2)
			#plt.ylim(fea_average - round(fea_average / 10), max_v + round(fea_average / 10))
		plt.savefig(file_path)
		plt.close()