def get_respond_data(for_train = True):
	if for_train:
		log_file_name = "PPD_LogInfo_3_1_Training_Set.csv"
		update_file_name = "PPD_Userupdate_Info_3_1_Training_Set.csv"
		id_target_file_name = "PPD_Training_Master_GBK_3_1_Training_Set.csv"
		DATA_DIR = "PPD-First-Round-Data/Training Set/"
	else:
		log_file_name = "PPD_LogInfo_2_Test_Set.csv"
		update_file_name = "PPD_Userupdate_Info_2_Test_Set.csv"
		id_target_file_name = "PPD_Master_GBK_2_Test_Set.csv"
		DATA_DIR = "PPD-First-Round-Data/Test Set/"

	#################### load the needed data ####################################
	log_info = np.array(load_result(log_file_name, dir_name = DATA_DIR))
	log_info_features = log_info[0]
	log_info_data = log_info[1:]


	update_info = np.array(load_result(update_file_name, dir_name = DATA_DIR))
	update_info_features = update_info[0]
	update_info_data = update_info[1:]

	id_target = np.array(load_result(id_target_file_name, dir_name = DATA_DIR))

	if for_train:
		features = id_target[0, [0, -2, -1]]
		data = id_target[1:, [0, -2, -1]]
	else:
		features = id_target[0, [0, -1]]
		data = id_target[1:, [0, -1]]

	return data, log_info_data, update_info_data
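Note: a small standalone illustration (not repo code) of the fancy-indexing step above: one row index plus a column list pulls out Idx together with the last two master columns in one shot.

import numpy as np

id_target = np.array([["Idx", "ListingInfo", "target"],
                      ["10001", "2014/3/5", "0"],
                      ["10002", "2014/2/26", "1"]])
features = id_target[0, [0, -2, -1]]
data = id_target[1:, [0, -2, -1]]
print(features)  # ['Idx' 'ListingInfo' 'target']
print(data)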
Example #2
def view_each_features(data, features):
	data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"])
	str_style_features = np.array(load_result("str_features.csv")[0])
	str_features_index = get_known_features_index(features, str_style_features)

	x = range(len(data))

	for fea_pos in range(len(features)):
		feature_name = features[fea_pos]
		# mark string-style features in the file name
		if fea_pos in str_features_index:
			file_path = "view_data_area/csj/" + "(str" + str(fea_pos) + ")" + feature_name + ".png"
			# print(fea_pos)
			# print(features[fea_pos])
		else:
			file_path = "view_data_area/csj/" + "(" + str(fea_pos) + ")" + feature_name + ".png"
		y = data[:, fea_pos]
		plt.scatter(x, y)

		plt.xlabel("instances(30000)")
		plt.ylabel("value")
		plt.title(feature_name + " value distributed in instances")
		plt.ylim(-2)  # only fix the lower y-limit
		# rect = plt.bar(left = (0,1),height = (0.5,0.5),width = 0.15)
		# plt.legend((rect,),(feature_name + "`s value",))

		#print(file_path)
		plt.savefig(file_path)
		plt.close()
Example #3
def view_each_features(data, features):
    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    str_style_features = np.array(load_result("str_features.csv")[0])
    str_features_index = get_known_features_index(features, str_style_features)

    x = range(len(data))

    for fea_pos in range(len(features)):
        feature_name = features[fea_pos]
        # mark string-style features in the file name
        if fea_pos in str_features_index:
            file_path = "view_data_area/csj/" + "(str" + str(
                fea_pos) + ")" + feature_name + ".png"
            # print(fea_pos)
            # print(features[fea_pos])
        else:
            file_path = "view_data_area/csj/" + "(" + str(
                fea_pos) + ")" + feature_name + ".png"
        y = data[:, fea_pos]
        plt.scatter(x, y)

        plt.xlabel("instances(30000)")
        plt.ylabel("value")
        plt.title(feature_name + " value distributed in instances")
        plt.ylim(-2)  # only fix the lower y-limit
        # rect = plt.bar(left = (0,1),height = (0.5,0.5),width = 0.15)
        # plt.legend((rect,),(feature_name + "`s value",))

        #print(file_path)
        plt.savefig(file_path)
        plt.close()
Example #4
def new_UserInfo_7_num(data, features):

	province_features = ["UserInfo_7"]
	found_index = np.where(features == province_features[0])[0][0]
	#### create new features --> UserInfo_7_province_preIncome
	new_add_feature = np.array(["UserInfo_7_province_preIncome"])
	features = np.concatenate((features, new_add_feature))
	
	province_content = load_result("2014中国各省人均可支配收入排行.csv", dir_name = "material_data")
	province_info = np.array(province_content[0])
	province_data = np.array(province_content[1:])

	province_index = np.where( province_info == "省份名")[0][0]
	income_index = np.where( province_info == "可支配收入")[0][0]	

	provinces = list(province_data[:, province_index])
	feature_data = np.zeros((data.shape[0], 1))
	def is_contain(provinces, target):
		for i in range(len(provinces)):
			if target in provinces[i]:
				return i
		return -1
	for user in range(data.shape[0]):
		temp = is_contain(provinces, data[user, found_index])
		if temp != -1:
			feature_data[user, 0] = int(province_data[temp, income_index])
		else:
			feature_data[user, 0] = temp

	data = np.concatenate((data, feature_data), axis = 1)

	print("UserInfo_7_province_preIncome" + " solved")
	return data, features
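Note: a tiny self-contained demo (an illustration, not repo code) of the substring lookup that new_UserInfo_7_num relies on: province names in the master data and in the income table may differ by a suffix, so containment is tested instead of string equality.

def is_contain(provinces, target):
	for i in range(len(provinces)):
		if target in provinces[i]:
			return i
	return -1

provinces = ["Beijing", "Guangdong", "Zhejiang"]
print(is_contain(provinces, "Guang"))     # 1: a partial match still resolves
print(is_contain(provinces, "Shanghai"))  # -1: no match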
Example #5
def correlation_between_properties(data, features):
	fixed_str_features = np.array(load_result("str_features.csv"))[0]
	indexs = get_known_features_index(features, fixed_str_features)

	title = ["features1", "features2", "calculate_method", "cor", "pval"]
	save_result(title, "pearsonr_spearmanr_results.csv")
	save_result(title, "pearsonr_spearmanr_Strong_correlation.csv")
	for fea_pos in range(len(features)):
		for fea_pos_add in range(fea_pos + 1, len(features)):
			info_result = list()
			info_result.append(features[fea_pos])
			info_result.append(features[fea_pos_add])
			a1 = data[:, fea_pos]
			a2 = data[:, fea_pos_add]
			# neither feature is string-style: Pearson applies
			if fea_pos not in indexs and fea_pos_add not in indexs:
				info_result.append("pearsonr")
				cor, pval = stats.pearsonr(a1, a2)
			else: # at least one is string-style: fall back to rank correlation
				info_result.append("spearmanr")
				cor, pval = stats.spearmanr(a1, a2)
			cor = round(cor, 3)
			info_result.append(cor)
			info_result.append(pval)
			if abs(cor) >= 0.2:
				save_result(info_result, "pearsonr_spearmanr_results.csv", style = "a+")
			if abs(cor) >= 0.86:
				save_result(info_result, "pearsonr_spearmanr_Strong_correlation.csv", \
												style = "a+")
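Note: a small standalone illustration (assuming only numpy and scipy) of why correlation_between_properties switches methods: pearsonr measures linear association of numeric columns, while spearmanr compares ranks only and is therefore safer for ordinally coded string features.

import numpy as np
from scipy import stats

a1 = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
a2 = np.array([2.1, 3.9, 6.2, 8.1, 9.9])  # roughly linear in a1
print(stats.pearsonr(a1, a2))   # correlation close to 1

codes = np.array([1, 2, 3, 4, 5])  # ordinal codes of a string feature
print(stats.spearmanr(codes, a2))  # rank correlation is exactly 1.0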
Example #6
def map_str_to_digit(data,
                     features,
                     no_map_features,
                     only_map_features=" ",
                     label=" "):
    no_map_features_index = get_known_features_index(features, no_map_features)
    features_map_info = dict()

    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    fixed_str_features_index = get_known_features_index(
        features, fixed_str_features)

    only_map_features_index = range(len(features))
    if only_map_features != " ":
        only_map_features_index = get_known_features_index(
            features, only_map_features)
    for fea_pos in range(1, len(features)):
        if fea_pos not in no_map_features_index and fea_pos in only_map_features_index:
            map_info = OrderedDict()
            #feature_map_info = OrderedDict()
            fea_val_cla = feature_value_class(data, fea_pos, label,
                                              fixed_str_features_index)
            # if this feature holds string values, map them to numbers
            if fea_val_cla["str_feature"]:
                data, map_info = map_str_feature_to_value(
                    data, fea_pos, fea_val_cla)
                features_map_info[features[fea_pos]] = map_info
                #features_map_info[].append([feature_map_info])

    digited_data = convert_to_numerical(data, features)
    return digited_data, features_map_info
Example #7
def Integrate_Log_Update(for_train = True, is_round_two = False):
	if for_train:
		if is_round_two:
			DATA_DIR = "PPD-Second-Round-Data/Rematch Train/"
			LOG_FILE = "LogInfo_9w_3_2.csv"
			UPDATE_FILE = "Userupdate_Info_9w_3_2.csv"
			DATA_FILE = "Kesci_Master_9w_gbk_3_2.csv"
		else:
			DATA_DIR = "PPD-First-Round-Data/Training Set/"
			LOG_FILE = "PPD_LogInfo_3_1_Training_Set.csv"
			UPDATE_FILE = "PPD_Userupdate_Info_3_1_Training_Set.csv"
			DATA_FILE = "PPD_Training_Master_GBK_3_1_Training_Set.csv"
	else:
		if is_round_two:
			DATA_DIR = "PPD-Second-Round-Data/Rematch Test/"
			LOG_FILE = "LogInfo_9w_1.csv"
			UPDATE_FILE = "Userupdate_Info_9w_1.csv"
			DATA_FILE = "Kesci_Master_9w_gbk_1_test_set.csv"
		else:
			DATA_DIR = "PPD-First-Round-Data/Test Set/"
			LOG_FILE = "PPD_LogInfo_2_Test_Set.csv"
			UPDATE_FILE = "PPD_Userupdate_Info_2_Test_Set.csv"
			DATA_FILE = "PPD_Master_GBK_2_Test_Set.csv"

	# #################### load the needed data ####################################
	log_info = np.array(load_result(LOG_FILE, dir_name = DATA_DIR))
	log_info_features = log_info[0]
	log_info_data = log_info[1:]


	update_info = np.array(load_result(UPDATE_FILE, dir_name = DATA_DIR))
	update_info_features = update_info[0]
	update_info_data = update_info[1:]

	print("for_train: ", for_train)
	print("is_round_two: ", is_round_two)

	id_target = np.array(load_result(DATA_FILE, dir_name = DATA_DIR))
	if for_train:
		features = id_target[0, [0, -2, -1]]
		data = id_target[1:, [0, -2, -1]]
	else:
		features = id_target[0, [0, -1]]
		data = id_target[1:, [0, -1]]

	return data, log_info_data, update_info_data
Example #8
def digit_city_features(data, features, city_features, use_original_features = False):
	# get the map basis
	cei_record_content = load_result("2013中国直辖市 省会城市和计划单列市排名榜.csv", dir_name = "material_data")
	cei_features = np.array(cei_record_content[0])
	cei_record_data1 = np.array(cei_record_content[1:])

	cei_record_content = load_result("2013中国城市商业信用环境指数地级市排名榜.csv", dir_name = "material_data")
	cei_record_data2 = np.array(cei_record_content[1:])
	cei_record_data = np.concatenate((cei_record_data1, cei_record_data2))
	# create the map basis
	city_map_basis = create_city_map_basis(cei_record_data, cei_features)

	if use_original_features:
		data = replace_with_original(data, features, city_features)
	digited_city_data = use_map_basis_to_digit(data, features, city_map_basis, city_features)

	return digited_city_data
Example #9
def map_str_to_digit_with_experience(data, features, digited_special_str_features, \
         contain_special_features):
    map_experience = load_result(FEATURES_MAP_INFO_FILE_NAME, \
           dir_name = "resultData/features_map")

    print(map_experience)
    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    fixed_str_features_index = get_known_features_index(
        features, fixed_str_features)

    digited_special_str_features_index = get_known_features_index(features, \
               digited_special_str_features)
    contain_special_features_index = get_known_features_index(features, \
               contain_special_features)
    remember = list()
    for fea_pos in range(1, len(features)):
        # string-style features that are not yet digitized and that we want to digitize
        if fea_pos in fixed_str_features_index and \
         fea_pos not in digited_special_str_features_index and \
         fea_pos in contain_special_features_index:
            # the ListingInfo date may be stored reversed
            if features[fea_pos] == "ListingInfo" and int(
                    data[0, fea_pos].split("/")[0]) < 1000:
                data = reverse_date(data, fea_pos)
            if features[fea_pos] in map_experience.keys():
                for i in range(len(data)):
                    if data[i, fea_pos] == "-1":
                        continue
                    try:
                        data[i, fea_pos] = map_experience[features[fea_pos]][
                            data[i, fea_pos]]
                    except KeyError:
                        if i < 50:
                            print(features[fea_pos])
                            print(map_experience[features[fea_pos]])
                            # print the unmapped value itself; repeating the
                            # failed lookup here would raise KeyError again
                            print(data[i, fea_pos])
                        remember.append(i)  # this row holds an unmappable value
    #print(remember)
    data = np.delete(data, remember, 0)
    digited_data = convert_to_numerical(data, features)
    return digited_data
Example #10
def according_properties_correlation_delete():
	contents = load_result("pearsonr_spearmanr_Strong_correlation.csv")
	array_contents = np.array(contents)
	comp_fea1 = np.array(array_contents[1:, 0])
	comp_fea2 = np.array(array_contents[1:, 1])



	delete_features = [comp_fea2[i] for i in range(len(comp_fea2)) \
						if comp_fea1[i] not in comp_fea2]
	#print(set(delete_features))
	return np.array(list(set(delete_features)))
Example #11
def according_properties_correlation_delete():
    contents = load_result("pearsonr_spearmanr_Strong_correlation.csv")
    array_contents = np.array(contents)
    comp_fea1 = np.array(array_contents[1:, 0])
    comp_fea2 = np.array(array_contents[1:, 1])



    delete_features = [comp_fea2[i] for i in range(len(comp_fea2)) \
         if comp_fea1[i] not in comp_fea2]
    #print(set(delete_features))
    return np.array(list(set(delete_features)))
Example #12
def according_properties_correlation_delete():
	contents = load_result("pearsonr_spearmanr_Strong_correlation.csv")
	array_contents = np.array(contents)
	comp_fea1 = np.array(array_contents[1:, 0])
	comp_fea2 = np.array(array_contents[1:, 1])

	delete_features = [comp_fea2[i] for i in range(len(comp_fea2)) \
						if comp_fea1[i] not in comp_fea2]
	os.remove(os.path.join(os.getcwd(), "resultData/pearsonr_spearmanr_results.csv"))
	os.remove(os.path.join(os.getcwd(), "resultData/pearsonr_spearmanr_Strong_correlation.csv"))
	#print(set(delete_features))
	return list(set(delete_features))
Example #13
def fill_all_missing(data, features, label = None):
	fixed_str_features = np.array(load_result("str_features.csv"))[0]
	indexs = get_known_features_index(features, fixed_str_features)
	# start from index 1 because the first column is the user id, which needs no filling
	for fea_pos in range(1, len(features)):
		fea_val_cla = feature_value_class(data, fea_pos, label, indexs)
		if fea_val_cla[-1]._present_num != 0:
			if fea_pos == 5:  # debug output for one feature
				print(fea_val_cla)
			data = fill_the_missing(data, fea_pos, fea_val_cla, label)
	#write_to_deleted_features_area(np.array(deleted_feas))
	return data, features
Example #14
def according_coefficient_variation_delete(data, features):
	waiting_to_delete = np.array(load_result("complex_value_features.csv"))
	waiting_to_delete = waiting_to_delete.reshape((waiting_to_delete.size,))
	#print(waiting_to_delete)
	indexs = get_known_features_index(features, waiting_to_delete)
	coefficient_variation_info = OrderedDict()
	for fea_pos in indexs:
		try:
			coefficient_variation_fea = stats.variation(data[:, fea_pos])
			coefficient_variation_info[features[fea_pos]] = coefficient_variation_fea
		except Exception:
			pass  # skip columns that cannot be treated as numeric
	return coefficient_variation_info
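Note: a quick standalone check (illustration only) that stats.variation is the coefficient of variation std / mean, which is what the function above collects per candidate feature.

import numpy as np
from scipy import stats

col = np.array([10.0, 11.0, 9.0, 10.5, 9.5])
print(stats.variation(col))        # ~0.0707
print(np.std(col) / np.mean(col))  # identical by definition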
Example #15
def digit_province_features(data, features, province_features, use_original_features = False):
	province_content = load_result("2014中国各省人均可支配收入排行.csv", dir_name = "material_data")
	province_info = np.array(province_content[0])
	province_data = np.array(province_content[1:])

	province_map_basis = create_province_map_basis(province_data, province_info)
	if use_original_features:
		data = replace_with_original(data, features, province_features)

	digited_province_data = use_map_basis_to_digit(data, features, province_map_basis, province_features)


	return digited_province_data
Example #16
def map_str_to_digit_with_experience(data, features, digited_special_str_features, \
									contain_special_features):
	map_experience = load_result(FEATURES_MAP_INFO_FILE_NAME, \
								dir_name = "resultData/features_map")

	print(map_experience)
	fixed_str_features = np.array(load_result("str_features.csv"))[0]
	fixed_str_features_index = get_known_features_index(features, fixed_str_features)

	digited_special_str_features_index = get_known_features_index(features, \
												digited_special_str_features)
	contain_special_features_index = get_known_features_index(features, \
												contain_special_features)
	remember = list()
	for fea_pos in range(1, len(features)):
		# string-style features that are not yet digitized and that we want to digitize
		if fea_pos in fixed_str_features_index and \
			fea_pos not in digited_special_str_features_index and \
			fea_pos in contain_special_features_index:
			# the ListingInfo date may be stored reversed
			if features[fea_pos] == "ListingInfo" and int(data[0, fea_pos].split("/")[0]) < 1000:
				data = reverse_date(data, fea_pos)
			if features[fea_pos] in map_experience.keys():
				for i in range(len(data)):
					if data[i, fea_pos] == "-1":
						continue
					try:
						data[i, fea_pos] = map_experience[features[fea_pos]][data[i, fea_pos]]
					except KeyError:
						if i < 50:
							print(features[fea_pos])
							print(map_experience[features[fea_pos]])
							# print the unmapped value itself; repeating the
							# failed lookup here would raise KeyError again
							print(data[i, fea_pos])
						remember.append(i) # this row holds an unmappable value
	#print(remember)
	data = np.delete(data, remember, 0)
	digited_data = convert_to_numerical(data, features)
	return digited_data
Example #17
def digit_city_features(data,
                        features,
                        city_features,
                        use_original_features=False):
    # get the map basis
    cei_record_content = load_result("2013中国直辖市 省会城市和计划单列市排名榜.csv",
                                     dir_name="material_data")
    cei_features = np.array(cei_record_content[0])
    cei_record_data1 = np.array(cei_record_content[1:])

    cei_record_content = load_result("2013中国城市商业信用环境指数地级市排名榜.csv",
                                     dir_name="material_data")
    cei_record_data2 = np.array(cei_record_content[1:])
    cei_record_data = np.concatenate((cei_record_data1, cei_record_data2))
    # create the map basis
    city_map_basis = create_city_map_basis(cei_record_data, cei_features)

    if use_original_features:
        data = replace_with_original(data, features, city_features)
    digited_city_data = use_map_basis_to_digit(data, features, city_map_basis,
                                               city_features)

    return digited_city_data
Example #18
def according_coefficient_variation_delete(data, features):
    waiting_to_delete = np.array(load_result("complex_value_features.csv"))
    waiting_to_delete = waiting_to_delete.reshape((waiting_to_delete.size, ))
    #print(waiting_to_delete)
    indexs = get_known_features_index(features, waiting_to_delete)
    coefficient_variation_info = OrderedDict()
    for fea_pos in indexs:
        try:
            coefficient_variation_fea = stats.variation(data[:, fea_pos])
            coefficient_variation_info[
                features[fea_pos]] = coefficient_variation_fea
        except Exception:
            pass  # skip columns that cannot be treated as numeric
    return coefficient_variation_info
Example #19
def replace_with_original(data, features, replace_features, \
      original_name = "withoutLabel_originalData.csv"):
    original_contents = load_result(original_name, dir_name=SAVE_DIR)
    original_features = np.array(original_contents[0])
    original_data = np.array(original_contents[1:])

    for fea in replace_features:
        try:
            original_index = np.where(original_features == fea)[0][0]
            index = np.where(features == fea)[0][0]
            data[:, index] = original_data[:, original_index]
        except IndexError:  # np.where found no matching column
            print(str(fea) + " may not exist in the input features")
            continue
    return data
Example #20
def replace_with_original(data, features, replace_features, \
						original_name = "withoutLabel_originalData.csv"):
	original_contents = load_result(original_name, dir_name = SAVE_DIR)
	original_features = np.array(original_contents[0])
	original_data = np.array(original_contents[1:])

	for fea in replace_features:
		try:
			original_index = np.where(original_features == fea)[0][0]
			index = np.where(features == fea)[0][0]	
			data[:, index] = original_data[:, original_index]
		except IndexError:  # np.where found no matching column
			print(str(fea) + " may not exist in the input features")
			continue
	return data
Example #21
def digit_province_features(data,
                            features,
                            province_features,
                            use_original_features=False):
    province_content = load_result("2014中国各省人均可支配收入排行.csv",
                                   dir_name="material_data")
    province_info = np.array(province_content[0])
    province_data = np.array(province_content[1:])

    province_map_basis = create_province_map_basis(province_data,
                                                   province_info)
    if use_original_features:
        data = replace_with_original(data, features, province_features)

    digited_province_data = use_map_basis_to_digit(data, features,
                                                   province_map_basis,
                                                   province_features)

    return digited_province_data
Example #22
def combine_results(results_dir = "resultData/test/test_result/final_predict"):
	dir_path = results_dir

	upper_count = 0
	sum_upper_thresh = np.zeros((19999, 1))

	for result_file in os.listdir(dir_path):

		#result_file_path = os.path.join(dir_path, result_file)
		content = load_result(result_file, dir_path)
		test_scores = np.array(content[1:])
		test_fea = np.array(content[0])

		test_scores = np.array(list(map(float, list(test_scores[:, 1]))))
		test_scores = test_scores.reshape((test_scores.size, 1))

		# the 3-digit validation score is embedded in the file name after the first dot
		result_test_score = result_file.split(".")[1][:3]

		if int(result_test_score) > 770:
			print(result_file)
			print(test_scores)
			print(result_test_score)

			print("add+ sum_upper_thresh", result_file)
			sum_upper_thresh += test_scores
			upper_count += 1

	sum_upper_average = sum_upper_thresh / upper_count
	print("*** average ***")
	print(sum_upper_average)
	print(sum_upper_average.shape)

	sum_upper_average = sum_upper_average.reshape((sum_upper_average.size,))
	submit(sum_upper_average)
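Note: a compact restatement (illustration only; the file names are made up) of the blending rule in combine_results: keep only result files whose 3-digit score after the first dot exceeds 770, then average their prediction columns.

import numpy as np

results = {"model_a.772.csv": np.array([0.2, 0.8]),
           "model_b.765.csv": np.array([0.9, 0.1]),
           "model_c.780.csv": np.array([0.4, 0.6])}
kept = [v for name, v in results.items()
        if int(name.split(".")[1][:3]) > 770]
print(sum(kept) / len(kept))  # [0.3 0.7]: model_b is excluded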
Example #23
def submit(test_predict, save_dir):
	###################################### Idx #########################
	print(test_predict)
	test_predict = np.array([round(test_predict[i], 4) for i in range(test_predict.shape[0])])
	print(test_predict)
	contents = load_result("withoutLabel_originalData.csv", dir_name = "resultData_All/test")
	features = np.array(contents[0])

	sublime_features = np.array([features[0], "score"] )

	save_result(sublime_features, "sublime_data.csv", dir_name = save_dir)

	data = np.array(contents[1:])

	test_users = data[:, 0]
	test_users = test_users.reshape((test_users.size, 1))

	test_predict = test_predict.reshape((test_predict.size, 1))

	sublime_data = np.concatenate((test_users, test_predict), axis = 1)


	save_result(sublime_data, "sublime_data.csv", style = "a+", dir_name = save_dir)
Example #24
def map_str_to_digit(data, features, no_map_features, only_map_features = " ", label = " "):
	no_map_features_index = get_known_features_index(features, no_map_features)
	features_map_info = dict()

	fixed_str_features = np.array(load_result("str_features.csv"))[0]
	fixed_str_features_index = get_known_features_index(features, fixed_str_features)

	only_map_features_index = range(len(features))
	if only_map_features != " ":
		only_map_features_index = get_known_features_index(features, only_map_features)
	for fea_pos in range(1, len(features)):
		if fea_pos not in no_map_features_index and fea_pos in only_map_features_index:
			map_info = OrderedDict()
			#feature_map_info = OrderedDict()
			fea_val_cla = feature_value_class(data, fea_pos, label, fixed_str_features_index)
			# if this feature holds string values, map them to numbers
			if fea_val_cla["str_feature"]:
				data, map_info = map_str_feature_to_value(data, fea_pos, fea_val_cla)
				features_map_info[features[fea_pos]] = map_info
				#features_map_info[].append([feature_map_info])

	digited_data = convert_to_numerical(data, features)
	return digited_data, features_map_info
Example #25
def correlation_between_properties(data, features):
    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    indexs = get_known_features_index(features, fixed_str_features)

    title = ["features1", "features2", "calculate_method", "cor", "pval"]
    save_result(title, "pearsonr_spearmanr_results.csv")
    save_result(title, "pearsonr_spearmanr_Strong_correlation.csv")
    for fea_pos in range(len(features)):
        for fea_pos_add in range(fea_pos + 1, len(features)):
            info_result = list()
            info_result.append(features[fea_pos])
            info_result.append(features[fea_pos_add])
            a1 = data[:, fea_pos]
            a2 = data[:, fea_pos_add]
            # neither feature is string-style: Pearson applies
            if fea_pos not in indexs and fea_pos_add not in indexs:
                info_result.append("pearsonr")
                cor, pval = stats.pearsonr(a1, a2)
            else:  # at least one is string-style: fall back to rank correlation
                info_result.append("spearmanr")
                cor, pval = stats.spearmanr(a1, a2)
            cor = round(cor, 3)
            info_result.append(cor)
            info_result.append(pval)
            if abs(cor) >= 0.2:
                save_result(info_result,
                            "pearsonr_spearmanr_results.csv",
                            style="a+")
            if abs(cor) >= 0.86:
                save_result(info_result, "pearsonr_spearmanr_Strong_correlation.csv", \
                        style = "a+")
Example #26
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2016-03-06 09:00:08
# @Author  : chensijia ([email protected])
# @Version : 0.0.0
# @Style   : Python3.5
#
# @Description:

import os
import numpy as np

from save_load_result import save_result, load_result

if __name__ == '__main__':
    contents = load_result("after_Str_features_digited_data.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])

    from map_features_to_digit import convert_to_numerical

    data = convert_to_numerical(data, features)
Example #27
	random_forest = RandomForestClassifier(class_weight = {1: ratio})

	#print(random_forest.get_params().keys())
	cv = StratifiedKFold(train_target)
	grid = GridSearchCV(random_forest, parameters, scoring='roc_auc', cv=cv, verbose=10, n_jobs=-1)
	grid.fit(train_data, train_target)

	# print the best params and score
	print(grid.best_params_)
	print(grid.best_score_)

	return grid.best_params_, grid.best_score_


if __name__ == '__main__':
	contents = load_result("all_data_after_features_processed.csv", dir_name = "resultData_All")
	features = np.array(contents[0])
	data = np.array(contents[1:])
	print("data: ", data.shape)
	label_lines = np.array(load_result("all_train_label_original.csv", dir_name = "resultData_All"))
	print(label_lines.shape)
	from save_load_result import convert_to_int
	label = convert_to_int(label_lines)

	label = label.reshape((label.size, ))
	print("label: ", label.shape)

	data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"])
	data = convert_to_numerical(data, features)
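Note: the snippet above leaves parameters, ratio and the training arrays undefined. Below is a self-contained sketch of the same kind of grid search on synthetic data; the parameter grid and the class-weight ratio are assumptions, not the original values.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

rng = np.random.RandomState(0)
train_data = rng.rand(200, 5)
train_target = rng.randint(0, 2, 200)

# weight the positive class by the negative/positive ratio
ratio = float((train_target == 0).sum()) / max((train_target == 1).sum(), 1)
random_forest = RandomForestClassifier(class_weight={1: ratio})
parameters = {"n_estimators": [50, 100], "max_depth": [3, 5]}

cv = StratifiedKFold(n_splits=5)  # the old API above passed the labels directly
grid = GridSearchCV(random_forest, parameters, scoring='roc_auc', cv=cv, n_jobs=-1)
grid.fit(train_data, train_target)
print(grid.best_params_, grid.best_score_)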

Example #28
	lr.fit(data, label)
	#A helper method for pretty-printing linear models
	def pretty_print_linear(coefs, names = None, sort = False):
		if names is None:
			names = ["X%s" % x for x in range(len(coefs))]
		lst = zip(coefs, names)
		if sort:
			lst = sorted(lst, key = lambda x: -np.abs(x[0]))
		return " + ".join("%s * %s" % (round(coef, 3), name)
							for coef, name in lst)
	print("Linear model:", pretty_print_linear(lr.coef_, features[1:]))


if __name__ == '__main__':

	contents = load_result("after_delete_strong_correlation_features_data.csv")
	features = np.array(contents[0])
	data = np.array(contents[1:])
	label_lines = np.array(load_result("train_label_original.csv"))
	data = convert_to_numerical(data, features)

	label = convert_to_float(label_lines)
	label = label.reshape((label.size, ))

	#use_RandomForestRegressor_to_delete(data, features, label)
	use_LR_to_delete(data, features, label)
#################### first example #######################
	# testNum = 10
	# average = 0
	# for i in range(0, testNum):
	#     # load the data set, split it 80% train / 20% test
Example #29
	data = convert_to_numerical(data, features)

	data, features = sta_start_missing_period(data, features)
	data, features = remove_thirdparty6(data, features)

	data, features = fill_thirdParty_miss(data, features)

	data, features = third_party_stable(data, features)

	data, features = third_party_level(data, features)
	save_result(data, "data_after_thirdparty_solved.csv", features, dir_name = saved_dir)
	return data, features 

if __name__ == '__main__':

	contents = load_result("data_after_solved_weblog.csv")
	features = np.array(contents[0])
	data = np.array(contents[1:])
	data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"])

	data = convert_to_numerical(data, features)

	solve_thirdparty_info_package(data, features)

	# calculate_number = ["17"]
	# users_sta_name, users_stability = calculate_stability(data, features, calculate_number[0])
	# print(users_sta_name)
	# for i in range(10):
	# 	print(users_stability[i])
	# from create_new_features import find_featuers_index
	# features_name = "ThirdPart"
Example #30
	return user_value_info

def compare_features_info2(data, features, key_features):
	fea_indexs = get_known_features_index(features, key_features)
	compare_result = OrderedDict()

	for user in range(data.shape[0]):
		# user_id = data[user, 0]
		combine_data = reduce(lambda x, y: str(x) + '_' + str(y), list(data[user, fea_indexs]))  # functools.reduce in Python 3
		if combine_data not in compare_result:
			compare_result[combine_data] = 0
		compare_result[combine_data] += 1
	return compare_result

if __name__ == '__main__':
	contents = load_result("withoutLabel_originalData.csv", dir_name = "resultData/")
	features = np.array(contents[0])
	data = np.array(contents[1:])

	#data = convert_to_numerical(data, features)

	print(data.shape)
	print(features.shape)

	# label_lines = np.array(load_result("train_label_original.csv"))
	#print(label_lines.shape)
	# from save_load_result import convert_to_int
	# label = convert_to_int(label_lines)

	# label = label.reshape((label.size, ))
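Note: a tiny standalone demo (illustration only) of the key-building step in compare_features_info2; in Python 3, reduce must be imported from functools.

from functools import reduce

row = ["beijing", "3", "married"]
print(reduce(lambda x, y: str(x) + '_' + str(y), row))  # beijing_3_married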
Example #31
def view_each_features_label(data, features, label):
    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    str_style_features = np.array(load_result("str_features.csv")[0])
    str_features_index = get_known_features_index(features, str_style_features)

    new_label = label.reshape((label.size, ))
    x = range(len(data))
    for fea_pos in range(len(features)):
        feature_name = features[fea_pos]
        if fea_pos in str_features_index:
            file_path = "view_data_area/after_all/with_label_under_mean/" + "(str" + str(
                fea_pos) + ")" + feature_name + ".png"
        else:
            file_path = "view_data_area/after_all/with_label_under_mean/" + "(" + str(
                fea_pos) + ")" + feature_name + ".png"
        features_info = feature_value_class(data, fea_pos, label,
                                            str_features_index)
        if features_info["num_of_value"] > 30:
            save_result([features[fea_pos]],
                        "complex_value_features.csv",
                        style="a+")
        else:
            if fea_pos not in str_features_index:
                save_result(
                    [features[fea_pos]],
                    "simple_discrete_value_features(nonestrfeatures).csv",
                    style="a+")

        y_positive = data[new_label == 1, fea_pos]
        y_negative = data[new_label == 0, fea_pos]
        positive_index = np.array([
            index for index in range(len(new_label)) if new_label[index] == 1
        ])
        negative_index = np.array([
            index for index in range(len(new_label)) if new_label[index] == 0
        ])
        plt.scatter(positive_index, y_positive, marker='o', color='r', s=10)
        plt.scatter(negative_index, y_negative, marker='x', color='g', s=10)

        plt.xlabel("instances(30000)")
        plt.ylabel("value")
        if features_info["num_of_value"] < 40:
            plt.title(feature_name + " value - label " + "distributed " + "in instances" + \
               "\n the arrow --> Proportion of positive in that value & in positive")
            for k, v in features_info.items():
                if isinstance(v, FeatureInData):
                    arrow_data = round(
                        v._respond_positive_num /
                        features_info["num_positive"], 4)
                    arrow_start_position_x = len(data) + 2000
                    arrow_start_position_y = int(k)
                    arrow_end_postion_x = arrow_start_position_x
                    arrow_end_postion_y = int(k)
                    plt.annotate(arrow_data, \
                       xy=(arrow_start_position_x,arrow_start_position_y), \
                       xytext=(arrow_end_postion_x,arrow_end_postion_y), \
                       arrowprops=dict(facecolor='blue', shrink=0.02))

                    arrow_data = round(
                        v._respond_positive_num / v._present_num, 4)
                    arrow_start_position_x = -4000
                    arrow_start_position_y = int(k)
                    arrow_end_postion_x = arrow_start_position_x
                    arrow_end_postion_y = int(k)
                    plt.annotate(arrow_data, \
                       xy=(arrow_start_position_x,arrow_start_position_y), \
                       xytext=(arrow_end_postion_x,arrow_end_postion_y), \
                       arrowprops=dict(facecolor='blue', shrink=0.02))

        else:
            fea_average = round(np.mean(data[:, fea_pos]), 4)
            fea_std = np.std(data[:, fea_pos])
            fea_oo = round(fea_std / fea_average, 4)
            max_v = np.amax(data[:, fea_pos])
            min_v = np.amin(data[:, fea_pos])
            plt.title(feature_name + " | mean & Proportion of positive under that mean" + \
             "\n degree of fluctuation --> " + str(fea_oo))
            x1 = np.array(range(-5000, 35000))
            y_mean = fea_average * np.ones((x1.size))
            #plt.plot(x1, y_mean, color = 'k', linestyle = "--")
            plt.annotate(fea_average, \
                 xy=(-4000,fea_average), \
                 xytext=(-4000,fea_average), \
                 arrowprops=dict(facecolor='blue', shrink=0.05))
            under_mean_positive = 0
            under_mean_num = 0

            for k, v in features_info.items():
                if isinstance(v, FeatureInData):
                    if k <= fea_average:
                        under_mean_num += v._present_num
                        under_mean_positive += v._respond_positive_num
            ave_posi = round(
                under_mean_positive / features_info["num_positive"], 4)
            plt.annotate(ave_posi, \
              xy=(31000,fea_average), \
              xytext=(31000,fea_average), \
              arrowprops=dict(facecolor='blue', shrink=0.05))
            pos_rat = 0
            pos_rat_whole = 0
            if -1 in features_info.keys():
                pos_rat = features_info[
                    -1]._respond_positive_num / features_info[-1]._present_num
                pos_rat_whole = features_info[
                    -1]._respond_positive_num / features_info["num_positive"]
                plt.annotate(round(pos_rat_whole, 4), \
                  xy=(31000,-1), \
                  xytext=(31000,-1))
                plt.annotate(round(pos_rat, 4), \
                  xy=(-4000,-1), \
                  xytext=(-4000,-1))
            plt.ylim(min_v - 10, fea_average * 2)
            #plt.ylim(fea_average - round(fea_average / 10), max_v + round(fea_average / 10))
        plt.savefig(file_path)
        plt.close()
Example #32
	return user_value_info

def compare_features_info2(data, features, key_features):
	fea_indexs = get_known_features_index(features, key_features)
	compare_result = OrderedDict()

	for user in range(data.shape[0]):
		# user_id = data[user, 0]
		combine_data = reduce(lambda x, y: str(x) + '_' + str(y), list(data[user, fea_indexs]))  # functools.reduce in Python 3
		if combine_data not in compare_result:
			compare_result[combine_data] = 0
		compare_result[combine_data] += 1
	return compare_result

if __name__ == '__main__':
	contents = load_result("PPD_Userupdate_Info_3_1_Training_Set.csv", dir_name = "PPD-First-Round-Data/Training Set/")
	features = np.array(contents[0])
	data = np.array(contents[1:])

	#data = convert_to_numerical(data, features)

	print(data.shape)
	print(features.shape)

	# label_lines = np.array(load_result("train_label_original.csv"))
	#print(label_lines.shape)
	# from save_load_result import convert_to_int
	# label = convert_to_int(label_lines)

	# label = label.reshape((label.size, ))
Example #33
    model = xgb.train(plst,
                      xgtrain,
                      num_boost_round=num_rounds,
                      evals=watchlist,
                      early_stopping_rounds=120)
    preds2 = model.predict(xgtest, ntree_limit=model.best_iteration)

    model.save_model('0001_1.model')
    #combine predictions
    #since the metric only cares about relative rank we don't need to average
    preds = (preds1) * 1.4 + (preds2) * 8.6
    return preds


if __name__ == '__main__':
    contents = load_result("data_after_features_processed.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])

    label_lines = np.array(load_result("train_label_original.csv"))
    #print(label_lines.shape)
    from save_load_result import convert_to_int
    label = convert_to_int(label_lines)

    label = label.reshape((label.size, ))
    print(label.shape)

    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)
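Note: the fragment above depends on plst, xgtrain, xgtest and preds1 defined earlier in the original file. Below is a self-contained sketch of the same xgb.train call on synthetic data; the parameter values are assumptions.

import numpy as np
import xgboost as xgb

rng = np.random.RandomState(0)
X, y = rng.rand(300, 10), rng.randint(0, 2, 300)
xgtrain = xgb.DMatrix(X[:200], label=y[:200])
xgval = xgb.DMatrix(X[200:], label=y[200:])

params = {'objective': 'binary:logistic', 'eta': 0.1,
          'max_depth': 4, 'eval_metric': 'auc'}
watchlist = [(xgtrain, 'train'), (xgval, 'eval')]
model = xgb.train(list(params.items()), xgtrain, num_boost_round=500,
                  evals=watchlist, early_stopping_rounds=120)
# the original additionally passes ntree_limit=model.best_iteration to predict
preds = model.predict(xgval)
print(preds[:5])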
Example #34
    data, features = new_WI_19(data, features)
    data, features = new_WI_20_by_present(data, features)
    #data, features = new_WI_20_by_positive(data, features)
    data, features = new_WI_21(data, features)
    #	save_result(data, "data_after_solve_WeblogInfo_21.csv", features, dir_name = saved_dir)

    save_result(data,
                "data_after_solved_weblog.csv",
                features,
                dir_name=saved_dir)

    return data, features


if __name__ == '__main__':
    contents = load_result("data_after_solved_UserInfo22_23.csv",
                           dir_name=saved_dir)
    features = np.array(contents[0])
    data = np.array(contents[1:])

    #data = convert_to_numerical(data, features)

    print(data.shape)
    print(features.shape)

    from create_new_features import find_featuers_index
    features_name = "WeblogInfo"
    fea_indexs = find_featuers_index(features_name, features)
    print(fea_indexs)
    weblog_data = data[:, fea_indexs]
    weblog_features = features[fea_indexs]
Example #35
    #A helper method for pretty-printing linear models
    def pretty_print_linear(coefs, names=None, sort=False):
        if names is None:
            names = ["X%s" % x for x in range(len(coefs))]
        lst = zip(coefs, names)
        if sort:
            lst = sorted(lst, key=lambda x: -np.abs(x[0]))
        return " + ".join("%s * %s" % (round(coef, 3), name)
                          for coef, name in lst)

    print("Linear model:", pretty_print_linear(lr.coef_, features[1:]))


if __name__ == '__main__':

    contents = load_result("after_delete_strong_correlation_features_data.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])
    label_lines = np.array(load_result("train_label_original.csv"))
    data = convert_to_numerical(data, features)

    label = convert_to_float(label_lines)
    label = label.reshape((label.size, ))

    #use_RandomForestRegressor_to_delete(data, features, label)
    use_LR_to_delete(data, features, label)
#################### first example #######################
# testNum = 10
# average = 0
# for i in range(0, testNum):
#     # load the data set, split it 80% train / 20% test
Example #36
    data, features = fill_thirdParty_miss(data, features)

    data, features = third_party_stable(data, features)

    data, features = third_party_level(data, features)
    save_result(data,
                "data_after_thirdparty_solved.csv",
                features,
                dir_name=saved_dir)
    return data, features


if __name__ == '__main__':

    contents = load_result("data_after_solved_weblog.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])
    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])

    data = convert_to_numerical(data, features)

    solve_thirdparty_info_package(data, features)

    # calculate_number = ["17"]
    # users_sta_name, users_stability = calculate_stability(data, features, calculate_number[0])
    # print(users_sta_name)
    # for i in range(10):
    # 	print(users_stability[i])
    # from create_new_features import find_featuers_index
Example #37
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")

    for f in range(data.shape[1]):
        print("%s. %d (%f)" %
              (features[indices[f]], indices[f], importances[indices[f]]))


# if one value of a feature occurs more than 20000 times and
#	the positive rate within it is almost equal to that of the whole train data

if __name__ == '__main__':
    #################### used to calculate the correlation between properties #########
    contents = load_result("data_after_delete_no_discrimination_features.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])

    from map_features_to_digit import convert_to_numerical
    from solve_data import delete_features

    data = convert_to_numerical(data, features)

    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])

    correlation_between_properties(data, features)

    delete_result = according_properties_correlation_delete()
    save_result(delete_result, "deleted_features_with_strong_correlation.csv")
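Note: a minimal demo (illustration only) of the ranking idiom used above: np.argsort(...)[::-1] walks the importances from largest to smallest.

import numpy as np

importances = np.array([0.1, 0.5, 0.4])
indices = np.argsort(importances)[::-1]
print(indices)  # [1 2 0]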
Example #38
	# labels = labels.reshape((labels.size, ))
	# print(labels.shape)


	# key_words = ["Log_", "Update_"]
	# LU_features, LU_data = extract_features(key_words)

	# LU_data = convert_to_numerical(LU_data, LU_features)

	# print("***** UI analysis data *******")
	# print(LU_features)
	# print(LU_data.shape)
	# save_result(LU_data, "UI_analysis_data.csv", LU_features, dir_name = "resultData_All/")

	label_lines = np.array(load_result("all_train_label_original.csv", dir_name = "resultData_All"))
	#print(label_lines.shape)
	from save_load_result import convert_to_int
	labels = convert_to_int(label_lines)

	labels = labels.reshape((labels.size, ))
	print(labels.shape)

	contents = load_result("UI_analysis_data.csv", dir_name = "resultData_All")
	LU_features = np.array(contents[0])
	LU_data = np.array(contents[1:])
	print("LU_data: ", LU_data.shape)

	#LU_analysis_xgboost(LU_data, labels)
	# LU_analysis_LR(LU_data, labels)
	LU_analysis_SVC(LU_data, labels)
Example #39
    data, features = new_UserInfo_24_resident_level(data, features)
    #	save_result(data, "data_after_solved_UserInfo24.csv", features, dir_name = saved_dir)

    #data, features = new_UserInfo_22_23_combine1(data, features)
    #	save_result(data, "data_after_solved1_UserInfo22_23.csv", features, dir_name = saved_dir)

    data, features = new_UserInfo_22_23_combine2(data, features)
    save_result(data,
                "data_after_solved_user_info.csv",
                features,
                dir_name=saved_dir)

    return data, features


if __name__ == '__main__':

    contents = load_result("data_after_delete_too_many_missing_features.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])

    deleted_features_in_train = load_all_deleted_features_during_train(
        deleted_features_file_label="deleted_")
    data, features, deleted = delete_features(
        data, features, delete_feas_list=deleted_features_in_train)

    data, features = solve_user_info_package(data, features)

    from create_features_from_weblog import solve_weblog_info_package
    data, features = solve_weblog_info_package(data, features)
Example #40
	# Print the feature ranking
	print("Feature ranking:")

	for f in range(data.shape[1]):
	    print("%s. %d (%f)" % (features[indices[f]], indices[f], importances[indices[f]]))




# if one value of a feature occurs more than 20000 times and
#	the positive rate within it is almost equal to that of the whole train data

if __name__ == '__main__':
	#################### used to calculate the correlation between properties #########
	contents = load_result("data_after_delete_no_discrimination_features.csv")
	features = np.array(contents[0])
	data = np.array(contents[1:])

	from map_features_to_digit import convert_to_numerical
	from solve_data import delete_features

	data = convert_to_numerical(data, features)

	data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"])


	correlation_between_properties(data, features)

	delete_result = according_properties_correlation_delete()
	save_result(delete_result, "deleted_features_with_strong_correlation.csv")
Example #41
	print(bins)
	iteral = 0

	while iteral < 100:
		sta_result = stats.binned_statistic(sorted_to_bin_values, sorted_to_bin_values, \
										bin_label, bins)

		new_bins = compare_and_combine(sta_result)
		if bins == new_bins:
			break
		bins = new_bins
		iteral += 1  # bound the number of merging passes

	return sta_result

if __name__ == '__main__':
	contents = load_result("after_Str_features_digited_data.csv")
	features = np.array(contents[0])
	data = np.array(contents[1:])
	label_lines = np.array(load_result("train_label_original.csv"))
	print(label_lines.shape)

	label = convert_to_float(label_lines)

	from map_features_to_digit import convert_to_numerical

	data = convert_to_numerical(data, features)

	#index = np.where(features == "ThirdParty_Info_Period4_1")[0][0]
	index = np.where(features == "WeblogInfo_12")[0][0]
	fea_info = feature_value_class(data, index, label)
	#print(fea_info)
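Note: a small standalone example (illustration only) of the stats.binned_statistic call driving the merge loop above: it aggregates the second argument per bin of the first, here with the mean statistic.

import numpy as np
from scipy import stats

x = np.array([0.1, 0.4, 1.2, 1.8, 2.5])
result = stats.binned_statistic(x, x, statistic='mean', bins=[0, 1, 2, 3])
print(result.statistic)  # the per-bin means
print(result.bin_edges)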
Example #42
	data, features = new_UserInfo_23_education_level(data, features)
#	save_result(data, "data_after_solved_UserInfo23.csv", features, dir_name = saved_dir)

	data, features = new_UserInfo_24_resident_level(data, features)
#	save_result(data, "data_after_solved_UserInfo24.csv", features, dir_name = saved_dir)

	#data, features = new_UserInfo_22_23_combine1(data, features)
#	save_result(data, "data_after_solved1_UserInfo22_23.csv", features, dir_name = saved_dir)

	data, features = new_UserInfo_22_23_combine2(data, features)
	#save_result(data, "data_after_solved_user_info.csv", features, dir_name = saved_dir)

	return data, features

# new_UserInfo_7_num
if __name__ == '__main__':

	contents = load_result("withoutLabel_originalData.csv")
	features = np.array(contents[0])
	data = np.array(contents[1:])

	data, features = new_UserInfo_7_num(data, features)
	save_result(data, "test.csv", features)
	# deleted_features_in_train = load_all_deleted_features_during_train(deleted_features_file_label = "deleted_")
	# data, features, deleted = delete_features(data, features, delete_feas_list = deleted_features_in_train)

	# data, features = solve_user_info_package(data, features)

	# from create_features_from_weblog import solve_weblog_info_package
	# data, features = solve_weblog_info_package(data, features)
Example #43
    iteral = 0

    while iteral < 100:
        sta_result = stats.binned_statistic(sorted_to_bin_values, sorted_to_bin_values, \
                bin_label, bins)

        new_bins = compare_and_combine(sta_result)
        if bins == new_bins:
            break
        bins = new_bins
        iteral += 1  # bound the number of merging passes

    return sta_result


if __name__ == '__main__':
    contents = load_result("after_Str_features_digited_data.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])
    label_lines = np.array(load_result("train_label_original.csv"))
    print(label_lines.shape)

    label = convert_to_float(label_lines)

    from map_features_to_digit import convert_to_numerical

    data = convert_to_numerical(data, features)

    #index = np.where(features == "ThirdParty_Info_Period4_1")[0][0]
    index = np.where(features == "WeblogInfo_12")[0][0]
    fea_info = feature_value_class(data, index, label)
    #print(fea_info)
Example #44
	xgval = xgb.DMatrix(train[:offset,:], label=labels[:offset])

	watchlist = [(xgtrain, 'train'),(xgval, 'eval')]
	model = xgb.train(plst, xgtrain, num_boost_round = num_rounds, evals = watchlist, early_stopping_rounds=120)
	preds2 = model.predict(xgtest, ntree_limit=model.best_iteration)

	model.save_model('0001_1.model')
	#combine predictions
	#since the metric only cares about relative rank we don't need to average
	preds = (preds1)*1.4 + (preds2)*8.6
	return preds



if __name__ == '__main__':
	contents = load_result("data_after_features_processed.csv")
	features = np.array(contents[0])
	data = np.array(contents[1:])

	label_lines = np.array(load_result("train_label_original.csv"))
	#print(label_lines.shape)
	from save_load_result import convert_to_int
	label = convert_to_int(label_lines)

	label = label.reshape((label.size, ))
	print(label.shape)

	data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"])
	data = convert_to_numerical(data, features)

	test_data = data[:2000]
Example #45
        for i in id_index_in_land:

            land_operate_code.append(log_info_data[i, 2])
            land_operate_style.append(log_info_data[i, 3])
            land_date.append(log_info_data[i, 4])
        all_id_info[id_name]["land_info"][
            "land_operate_code"] = land_operate_code
        all_id_info[id_name]["land_info"][
            "land_operate_style"] = land_operate_style
        all_id_info[id_name]["land_info"]["land_date"] = land_date

        # add the modify info
        all_id_info[id_name]["modify_info"] = OrderedDict()
        modify_info = list()
        modify_date = list()
        id_index_in_modify = np.where(update_info_data[:, 0] == id_name)[0]
        for i in id_index_in_modify:
            modify_info.append(update_info_data[i, 2])
            modify_date.append(update_info_data[i, 3])

        all_id_info[id_name]["modify_info"]["modify_things"] = modify_info
        all_id_info[id_name]["modify_info"]["modify_date"] = modify_date
    save_result(all_id_info, "all_id_info.pickle", dir_name=saved_dir)


if __name__ == '__main__':

    #combine_land_modify_infos(data, log_info_data, update_info_data)
    combined = load_result("all_id_info.pickle", dir_name=SAVE_DIR)
    print(combined["10001"])
Example #46
def solve_weblog_info_package(data, features, saved_dir = "resultData/"):
	from map_features_to_digit import convert_to_numerical
	from solve_data import delete_features

	data, features = new_WI_19(data, features)
	data, features = new_WI_20_by_present(data, features)
	#data, features = new_WI_20_by_positive(data, features)
	data, features = new_WI_21(data, features)
#	save_result(data, "data_after_solve_WeblogInfo_21.csv", features, dir_name = saved_dir)
	
	#save_result(data, "data_after_solved_weblog.csv", features, dir_name = saved_dir)

	return data, features

if __name__ == '__main__':
	contents = load_result("data_after_solved_UserInfo22_23.csv", dir_name = saved_dir)
	features = np.array(contents[0])
	data = np.array(contents[1:])

	#data = convert_to_numerical(data, features)

	print(data.shape)
	print(features.shape)

	from create_new_features import find_featuers_index
	features_name = "WeblogInfo"
	fea_indexs = find_featuers_index(features_name, features)
	print(fea_indexs)
	weblog_data = data[:, fea_indexs]
	weblog_features = features[fea_indexs]
Example #47
def view_each_features_label(data, features, label):
	data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"])
	str_style_features = np.array(load_result("str_features.csv")[0])
	str_features_index = get_known_features_index(features, str_style_features)

	new_label = label.reshape((label.size,))
	x = range(len(data))
	for fea_pos in range(len(features)):
		feature_name = features[fea_pos]
		if fea_pos in str_features_index:
			file_path = "view_data_area/after_all/with_label_under_mean/" + "(str" + str(fea_pos) + ")" + feature_name + ".png"
		else:
			file_path = "view_data_area/after_all/with_label_under_mean/" + "(" + str(fea_pos) + ")" + feature_name + ".png"
		features_info = feature_value_class(data, fea_pos, label, str_features_index)
		if features_info["num_of_value"] > 30:
			save_result([features[fea_pos]], "complex_value_features.csv", style = "a+")
		else:
			if fea_pos not in str_features_index:
				save_result([features[fea_pos]], "simple_discrete_value_features(nonestrfeatures).csv", style = "a+")


		y_positive = data[new_label == 1, fea_pos]
		y_negative = data[new_label == 0, fea_pos]
		positive_index = np.array([index for index in range(len(new_label)) if new_label[index] == 1])
		negative_index = np.array([index for index in range(len(new_label)) if new_label[index] == 0])
		plt.scatter(positive_index, y_positive, marker = 'o', color = 'r', s = 10)
		plt.scatter(negative_index, y_negative, marker = 'x', color = 'g', s = 10)

		plt.xlabel("instances(30000)")
		plt.ylabel("value")
		if features_info["num_of_value"] < 40:
			plt.title(feature_name + " value - label " + "distributed " + "in instances" + \
						"\n the arrow --> Proportion of positive in that value & in positive")
			for k, v in features_info.items():
				if isinstance(v, FeatureInData):
					arrow_data = round(v._respond_positive_num / features_info["num_positive"] , 4)
					arrow_start_position_x = len(data) + 2000
					arrow_start_position_y = int(k)
					arrow_end_postion_x = arrow_start_position_x
					arrow_end_postion_y = int(k)
					plt.annotate(arrow_data, \
								xy=(arrow_start_position_x,arrow_start_position_y), \
								xytext=(arrow_end_postion_x,arrow_end_postion_y), \
								arrowprops=dict(facecolor='blue', shrink=0.02))

					arrow_data = round(v._respond_positive_num / v._present_num , 4)
					arrow_start_position_x = -4000
					arrow_start_position_y = int(k)
					arrow_end_postion_x = arrow_start_position_x
					arrow_end_postion_y = int(k)
					plt.annotate(arrow_data, \
								xy=(arrow_start_position_x,arrow_start_position_y), \
								xytext=(arrow_end_postion_x,arrow_end_postion_y), \
								arrowprops=dict(facecolor='blue', shrink=0.02))

		else:
			fea_average = round(np.mean(data[:, fea_pos]), 4)
			fea_std = np.std(data[:, fea_pos])
			fea_oo = round(fea_std / fea_average, 4)
			max_v = np.amax(data[:, fea_pos])
			min_v = np.amin(data[:, fea_pos])
			plt.title(feature_name + " | mean & Proportion of positive under that mean" + \
				"\n degree of fluctuation --> " + str(fea_oo))
			x1 = np.array(range(-5000, 35000))
			y_mean = fea_average * np.ones((x1.size))
			#plt.plot(x1, y_mean, color = 'k', linestyle = "--")
			plt.annotate(fea_average, \
								xy=(-4000,fea_average), \
								xytext=(-4000,fea_average), \
								arrowprops=dict(facecolor='blue', shrink=0.05))
			under_mean_positive = 0
			under_mean_num = 0

			for k, v in features_info.items():
				if isinstance(v, FeatureInData):
					if k <= fea_average:
						under_mean_num += v._present_num
						under_mean_positive += v._respond_positive_num
			ave_posi = round(under_mean_positive / features_info["num_positive"], 4)
			plt.annotate(ave_posi, \
					xy=(31000,fea_average), \
					xytext=(31000,fea_average), \
					arrowprops=dict(facecolor='blue', shrink=0.05))
			pos_rat = 0
			pos_rat_whole = 0
			if -1 in features_info.keys():
				pos_rat = features_info[-1]._respond_positive_num / features_info[-1]._present_num
				pos_rat_whole = features_info[-1]._respond_positive_num / features_info["num_positive"]
				plt.annotate(round(pos_rat_whole, 4), \
						xy=(31000,-1), \
						xytext=(31000,-1))
				plt.annotate(round(pos_rat, 4), \
						xy=(-4000,-1), \
						xytext=(-4000,-1))
			plt.ylim(min_v - 10, fea_average * 2)
			#plt.ylim(fea_average - round(fea_average / 10), max_v + round(fea_average / 10))
		plt.savefig(file_path)
		plt.close()
Example #48
		id_index_in_land = np.where(log_info_data[:, 0] == id_name)[0]
		for i in id_index_in_land:

			land_operate_code.append(log_info_data[i, 2])
			land_operate_style.append(log_info_data[i, 3])
			land_date.append(log_info_data[i, 4])
		all_id_info[id_name]["land_info"]["land_operate_code"] = land_operate_code
		all_id_info[id_name]["land_info"]["land_operate_style"] = land_operate_style
		all_id_info[id_name]["land_info"]["land_date"] = land_date

		# add the modify info
		all_id_info[id_name]["modify_info"] = OrderedDict()
		modify_info = list()
		modify_date = list()
		id_index_in_modify = np.where(update_info_data[:, 0] == id_name)[0]
		for i in id_index_in_modify:
			modify_info.append(update_info_data[i, 2])
			modify_date.append(update_info_data[i, 3])

		all_id_info[id_name]["modify_info"]["modify_things"] = modify_info
		all_id_info[id_name]["modify_info"]["modify_date"] = modify_date
	save_result(all_id_info, "all_id_info.pickle", dir_name = saved_dir)


if __name__ == '__main__':

	#combine_land_modify_infos(data, log_info_data, update_info_data)
	combined = load_result("all_id_info.pickle", dir_name = SAVE_DIR)
	print(combined["10001"])