def run_similarity_search(include_amen, amenities_list, regress_df, scaler,
                          dict2, cat_pref_dict, preference_dict):
    """Append the user's query listing to the listings frame, scale features by
    user preference weights, and fit a radius-based nearest-neighbour model.

    Parameters
    ----------
    include_amen : str
        'Y' to one-hot the query's amenities into the dummy columns.
    amenities_list : iterable of str
        Names of the amenity dummy columns to consider.
    regress_df : pandas.DataFrame
        Listings frame containing numeric features plus one-hot dummies and the
        text columns ListingTitle/Url/Description/Amenities.
    scaler : sklearn scaler
        Scaler applied to the full feature frame (fit_transform is called on it,
        matching the original behaviour — presumably intentional so the query
        row participates in the scaling; TODO confirm).
    dict2 : dict
        The query listing's raw attribute values.
    cat_pref_dict, preference_dict : dict
        Per-category / per-feature preference weights; each feature is rescaled
        into (0, weight) so heavier-weighted features dominate the distance.

    Returns
    -------
    (NearestNeighbors, DataFrame, Series, str)
        Fitted knn model, the scaled frame, the query row (with a 'Summary'
        field added), and the query's property-type dummy-column name.
    """
    df = regress_df.copy()
    # Sentinel row 9999 holds the user's query listing.
    df.loc[9999] = [0] * len(df.columns)

    # Direct fields copied from the query. df.loc[row, col] avoids the
    # chained-indexing (df.Col.loc[...]) of the original, which triggers
    # SettingWithCopyWarning and can silently fail to write.
    df.loc[9999, 'ListingTitle'] = dict2['ListingTitle']
    df.loc[9999, 'Area_Sqft'] = dict2['Area_Sqft']
    df.loc[9999, 'Price_psf'] = dict2['Price_psf']
    df.loc[9999, 'RentalRate'] = dict2['RentalRate']
    df.loc[9999, 'NumBeds'] = dict2['NumBeds']

    # One-hot dummy column names derived from the query's categorical values.
    district = 'District_D' + dict2['District']
    furnishing = 'Furnishing_' + dict2['Furnishing']
    property_type = 'PropertyType_' + dict2['PropertyType']
    lease_dur = 'Lease_' + dict2['Leasedur']

    # A missing column means the category never appeared in the training data;
    # skip it instead of the original's bare `except: pass`, which also hid
    # genuine errors.
    for col in (district, furnishing, property_type, lease_dur):
        if col in df.columns:
            df.loc[9999, col] = 1

    if include_amen == 'Y':
        df.loc[9999, 'Amenities'] = dict2['Amenities']
        for amenity in amenities_list:
            # Guard on column existence too, consistent with the dummy
            # handling above (an unknown amenity has no dummy column).
            if amenity in df.columns and amenity in df.loc[9999, 'Amenities']:
                df.loc[9999, amenity] = 1

    instance = df.loc[9999].copy()
    instance['Summary'] = ','.join([property_type.split('_')[1],
                                    district.split('_')[1],
                                    furnishing.split('_')[1],
                                    lease_dur.split('_')[1]])

    # Text columns carry no distance information.
    df.drop(['ListingTitle', 'Url', 'Description', 'Amenities'],
            axis=1, inplace=True)

    scaled = scaler.fit_transform(df)
    df_scaled = pd.DataFrame(data=scaled, columns=df.columns, index=df.index)

    # Re-scale each preference feature into (0, weight): larger weights widen
    # a feature's range and therefore its influence on neighbour distance.
    scaler_second = MinMaxScaler()
    for feature, weight in preference_dict.items():
        scaler_second.feature_range = (0, weight)
        df_scaled[feature] = scaler_second.fit_transform(
            np.array(df_scaled[feature]).reshape(-1, 1))

    # Categorical preferences apply to every dummy column whose name contains
    # the category prefix.
    for feature, weight in cat_pref_dict.items():
        scaler_second.feature_range = (0, weight)
        for column in df.columns:
            if feature in column:
                df_scaled[column] = scaler_second.fit_transform(
                    np.array(df_scaled[column]).reshape(-1, 1))

    knn_filtering = NearestNeighbors(radius=2)
    knn_filtering.fit(df_scaled)
    return knn_filtering, df_scaled, instance, property_type
def _min_max_scaler(ranges, feature_range=(0, 100)): res = MinMaxScaler() res.data_max_ = ranges[:, 1] res.data_min_ = ranges[:, 0] res.data_range_ = res.data_max_ - res.data_min_ res.scale_ = (feature_range[1] - feature_range[0]) / (ranges[:, 1] - ranges[:, 0]) res.min_ = -res.scale_ * res.data_min_ res.n_samples_seen_ = 1 res.feature_range = feature_range return res
def genetic_programming(pidata):
    """Fit a symbolic-regression model expressing PI1 in terms of PI2..PI7,
    print the recovered sympy expression and test score, and plot predicted
    vs. actual (normalised) PI1 on log-log axes.

    Runs only when the module-level flag `plot_gp` is truthy; otherwise this
    is a no-op. Relies on module-level `np`, `plt`, `make_function` and
    `SymbolicRegressor` (gplearn) being in scope.

    Parameters
    ----------
    pidata : pandas.DataFrame
        Must contain columns 'PI1'..'PI7'; column 0 ('PI1') is the target.
    """
    if plot_gp:
        from sklearn.model_selection import train_test_split
        from sklearn.preprocessing import MinMaxScaler
        from sympy import init_printing
        init_printing(use_latex=True, forecolor="White")
        # Y = np.array(pidata['PI1'])
        # X = np.array(pidata[['PI2','PI3','PI4','PI5','PI6','PI7']])

        # Scale target and features jointly into (0.1, 1); the lower bound is
        # 0.1 — presumably to keep 'div'/'inv' away from zero. TODO confirm.
        XY = np.array(pidata)
        min_max_scaler = MinMaxScaler()
        min_max_scaler.feature_range = (0.1, 1)
        XY_train_minmax = min_max_scaler.fit_transform(XY)
        Y = XY_train_minmax[:, 0]
        X = XY_train_minmax[:, 1:]
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                            test_size=0.1)
        # xm = np.min(X_train[:,2])
        # xM = np.max(X_train[:,2])
        ## X_train[:,2] = (X_train[:,2] - xm) / (xM-xm)
        ## X_train[:,2] *= 1e-12
        ## X_train[:,4] *= 1e12
        #
        # xm = np.min(X_train[:,4])
        # xM = np.max(X_train[:,4])
        ## X_train[:,4] = (X_train[:,4] - xm) / (xM-xm)

        def _powf(x1, x2):
            # Protected power: overflow-silenced, and any |x1**x2| >= 100 is
            # clamped to 0 so the GP search can't exploit huge magnitudes.
            with np.errstate(over='ignore'):
                return np.where(np.abs(x1**x2) < 100, x1**x2, 0.)

        # NOTE(review): powf is built but never added to function_set below,
        # so the regressor cannot actually use it — confirm if intentional.
        powf = make_function(function=_powf, name='powf', arity=2)
        function_set = ['add', 'sub', 'mul', 'div', 'neg', 'inv']
        est_gp = SymbolicRegressor(
            population_size=5000,
            generations=20,
            # tournament_size=50,
            stopping_criteria=0.01,
            const_range=(-10, 10),
            init_depth=(2, 10),
            function_set=function_set,
            # p_crossover=0.7,
            # p_subtree_mutation=0.05,
            # p_hoist_mutation=0.1,
            # p_point_mutation=0.1,
            # max_samples=0.9,
            verbose=1,
            # feature_names = ('\Pi_2','\Pi_3','\Pi_4','\Pi_5','\Pi_6','\Pi_7'),
            # parsimony_coefficient=0.01,
            random_state=0,
            n_jobs=1)
        est_gp.fit(X_train, Y_train)

        # Convert the winning gplearn program into a sympy expression by
        # mapping each primitive name onto a sympy-compatible callable.
        from sympy import symbols, Add, Mul, Lambda, sympify
        import sympy as sp
        x, y = symbols('x y')
        loc = {
            "add": Add,
            "mul": Mul,
            "sub": Lambda((x, y), x - y),
            "div": Lambda((x, y), x / y),
            "sqrt": Lambda(x, sp.sqrt(sp.Abs(x))),
            "neg": Lambda(x, -x),
            "inv": Lambda(x, 1 / x),
            "powf": Lambda((x, y), x**y),
            "log": Lambda(x, sp.log(x)),
            "abs": Lambda(x, sp.Abs(x))
        }
        # est_gp._program is a private gplearn attribute; its str() is the
        # prefix-form program that sympify parses here.
        exp = sympify(est_gp._program, locals=loc)
        score_gp = est_gp.score(X_test, Y_test)
        print(exp)
        print(score_gp)

        # Evaluate the recovered expression row by row on min-max-normalised
        # inputs (normalised to (0, 1) here, NOT the (0.1, 1) used for
        # training above — NOTE(review): confirm this mismatch is intended).
        fres = []
        for row in pidata.iterrows():
            drow = row[1]
            X0 = (drow['PI2'] - np.min(pidata[['PI2']])) / (
                np.max(pidata[['PI2']]) - np.min(pidata[['PI2']]))
            X1 = (drow['PI3'] - np.min(pidata[['PI3']])) / (
                np.max(pidata[['PI3']]) - np.min(pidata[['PI3']]))
            X2 = (drow['PI4'] - np.min(pidata[['PI4']])) / (
                np.max(pidata[['PI4']]) - np.min(pidata[['PI4']]))
            X3 = (drow['PI5'] - np.min(pidata[['PI5']])) / (
                np.max(pidata[['PI5']]) - np.min(pidata[['PI5']]))
            X4 = (drow['PI6'] - np.min(pidata[['PI6']])) / (
                np.max(pidata[['PI6']]) - np.min(pidata[['PI6']]))
            X5 = (drow['PI7'] - np.min(pidata[['PI7']])) / (
                np.max(pidata[['PI7']]) - np.min(pidata[['PI7']]))
            tmp = exp.subs({
                'X0': X0,
                'X1': X1,
                'X2': X2,
                'X3': X3,
                'X4': X4,
                'X5': X5
            })
            fres.append(tmp)
        fres = np.array(fres)

        # Predicted vs. actual normalised PI1; the dashed y=x line marks a
        # perfect fit.
        dPi = np.max(pidata[['PI1']]) - np.min(pidata[['PI1']])
        plt.loglog(fres, (pidata[['PI1']] - np.min(pidata[['PI1']])) / dPi,
                   'ko')
        # plt.xlim([0.5,1e5])
        # plt.ylim([0.5,1e5])
        plt.loglog(np.logspace(0, 5), np.logspace(0, 5), 'k--')
def scale(dataframe, scale=(0,1)): columns = dataframe.columns scaler = MinMaxScaler() scaler.feature_range = scale return pd.DataFrame(scaler.fit_transform(dataframe), columns=columns).dropna()