def test_pairwise_distance():
    """Check Euclidean distances between pairs of single-row arrays.

    Two equal rows must be at distance 0, and rows that differ by one
    in their first component must be at distance 1.
    """
    # Two separately built (but equal) arrays, so the helper is handed
    # distinct objects rather than the same array twice.
    left = np.array([1, 0]).reshape(1, -1)
    right = np.array([1, 0]).reshape(1, -1)
    zero_distance = sim.pairwise_distances(left, right, "euclidean")
    assert (zero_distance == 0)

    shifted = np.array([2, 0]).reshape(1, -1)
    reference = np.array([1, 0]).reshape(1, -1)
    unit_distance = sim.pairwise_distances(shifted, reference, "euclidean")
    assert unit_distance == 1
def update_graph(origin_name, move_type, range_km, n_most, max_price, target):
    """Build the figure dict that highlights areas similar to the origin.

    The module-level ``df`` is filtered by price, restricted to the
    geographic area selected by ``move_type``/``range_km``, projected with
    the module-level ``pipe`` and compared pairwise by Euclidean distance.
    The ``n_most`` closest areas to ``origin_name`` are then coloured on
    the map.

    Args:
        origin_name: Name of the origin area.
        move_type: Area selection mode passed to
            ``map_fi_plot.get_included_area``. When it equals
            ``'difference'`` the origin is also used as the target.
        range_km: Range passed to ``map_fi_plot.get_included_area``.
        n_most: How many similar areas to request.
        max_price: Price limit passed to ``similarity.filter_w_price``.
        target: Name of the target area.

    Returns:
        dict: ``{'data': ..., 'layout': ...}`` describing the figure.
    """
    # The price filter receives the caller's target, before the
    # 'difference' override below is applied.
    df_filtered = similarity.filter_w_price(
        df, max_price, [origin_name, target])

    if move_type == 'difference':
        target = origin_name

    _, included = map_fi_plot.get_included_area(
        df_filtered, move_type, origin_name, range_km, target)

    X, _, target_names = viz.get_pca_data(included, 2018, 5)
    # Re-index positionally so the names line up with the rows of X.
    target_names.index = range(len(target_names))
    X_pca = pipe.transform(X)
    d = similarity.pairwise_distances(X_pca, X_pca, 'euclidean')
    similar = similarity.get_similar_in_geo_area(
        included, origin_name, d, target_names, n_most)

    layout = define_layout()
    return {
        'data': set_fill_colors_for_origin_and_comp(
            make_graph_data(included), origin_name, similar, target),
        'layout': layout,
    }
def update_table(origin_name, move_type, range_km, n_most, max_price, target):
    """Build the figure dict for the table of areas similar to the origin.

    Similarity is computed on the price-filtered values of the
    module-level ``df``, while the table itself shows the rank-transformed
    values of the same areas.

    Args:
        origin_name: Name of the origin area.
        move_type: Area selection mode passed to
            ``map_fi_plot.get_included_area``. When it equals
            ``'difference'`` the origin is also used as the target.
        range_km: Range passed to ``map_fi_plot.get_included_area``.
        n_most: How many similar areas to request.
        max_price: Price limit passed to ``similarity.filter_w_price``.
        target: Name of the target area.

    Returns:
        dict: ``{'data': [table_trace], 'layout': ...}`` with zero margins.
    """
    # The price filter receives the caller's target, before the
    # 'difference' override below is applied.
    df_filtered = similarity.filter_w_price(
        df, max_price, [origin_name, target])
    # transform values to ranks for easy understanding
    df_filtered_ranks = similarity.full_df_to_ranks(df_filtered, bins=10)

    if move_type == 'difference':
        target = origin_name

    _, included = map_fi_plot.get_included_area(
        df_filtered, move_type, origin_name, range_km, target)
    _, included_ranks = map_fi_plot.get_included_area(
        df_filtered_ranks, move_type, origin_name, range_km, target)

    X, _, target_names = viz.get_pca_data(included, 2018, 5)
    # Re-index positionally so the names line up with the rows of X.
    target_names.index = range(len(target_names))
    X_pca = pipe.transform(X)
    d = similarity.pairwise_distances(X_pca, X_pca, 'euclidean')
    similar = similarity.get_similar_in_geo_area(
        included, origin_name, d, target_names, n_most)

    tb = viz.table_similar_with_names(
        included_ranks, origin_name, similar, target_names, X_pca,
        ['pono', 'nimi', 'he_kika', 'ra_asunn', 'te_laps', 'te_as_valj',
         'tp_tyopy', 'tr_mtu', 'yliopistot', 'amk'],
        tail=False)
    tb = tb.drop_duplicates()
    tb = format_numeric_table_cols(tb, numcols=['dist'])

    # NOTE(review): the table has ten columns but only five alignment
    # entries are supplied - confirm how plotly treats the remaining ones.
    trace = go.Table(
        header=dict(
            values=['Pono', 'Nimi', 'Keski-ikä', 'Asunnot', 'Lapsitaloudet',
                    'Työpaikat', 'Mediaanitulo', 'Yliopistot', 'AMK', 'Dist'],
            fill=dict(color='#C2D4FF'),
            align=['left'] * 5,
            height=40),
        cells=dict(
            values=[tb.pono, tb.nimi, tb.he_kika, tb.ra_asunn, tb.te_laps,
                    tb.tp_tyopy, tb.tr_mtu, tb.yliopistot, tb.amk, tb.dist],
            fill=dict(color='#F5F8FF'),
            align=['left'] * 5,
            height=30),
    )

    return {
        'data': [trace],
        'layout': dict(autosize=True, margin=dict(t=0, b=0, r=0, l=0)),
    }
def plot_similar_in_geo_area(data, orig_name, target, range_km, how, n_most,
                             pipe, figsize=(12, 10)):
    """Plot the areas most similar to ``orig_name`` within a geographic area.

    Args:
        data: Input data, merged to polygons for the year 2018.
        orig_name: Name of the origin area; must occur in column ``nimi_x``
            of the merged data.
        target: Name of the target area; ``None`` means "same as
            ``orig_name``".
        range_km: Range passed to ``get_included_area``.
        how: Either ``'intersection'`` or ``'difference'``.
        n_most: How many similar areas to request.
        pipe: Fitted transformer whose ``transform`` projects the features.
        figsize: Figure size forwarded to the plotting helper.

    Raises:
        ValueError: If ``how`` is not a supported mode, or if the origin or
            target name is missing from the merged data.
    """
    if how not in ('intersection', 'difference'):
        raise ValueError('how should be either "intersection" or "difference"')

    target = orig_name if target is None else target

    polygons = merge_to_polygons_for_year(data, 2018)
    known_names = list(polygons['nimi_x'])
    if orig_name not in known_names:
        raise ValueError('origin_name not in data!')
    if target not in known_names:
        raise ValueError('target not in data!')

    area, included = get_included_area(
        polygons, how, orig_name, range_km, target)

    features, _, area_names = viz.get_pca_data(included, 2018, 5)
    # Re-index positionally so the names line up with the feature rows.
    area_names.index = range(len(area_names))
    projected = pipe.transform(features)
    distances = similarity.pairwise_distances(
        projected, projected, 'euclidean')
    most_similar = similarity.get_similar_in_geo_area(
        included, orig_name, distances, area_names, n_most)

    map_with_highlights_names(data, '', orig_name, most_similar, 2018,
                              area=area, figsize=figsize)
def test_get_similar_in_geo_area(get_sample_geodata):
    """Check that the nearest area is picked only from the included area.

    Distances are computed over the whole sample; the candidate set is
    then narrowed by postal code. With '00200' included the closest match
    to "Vattuniemi" is expected to be 'Lauttasaari'; without it,
    "Kamppi - Ruoholahti".
    """
    geodata = get_sample_geodata

    def select_by_postcode(postcodes):
        # Rows of the sample whose postal code is in the given list.
        return geodata.loc[geodata.pono.isin(postcodes)]

    with_lauttasaari = select_by_postcode(['00180', '00200', '00210'])

    features, _, names = viz.get_pca_data(geodata, 2018, 5)
    names.index = range(len(names))
    components, _ = viz.do_pca(features, 5)
    distances = sim.pairwise_distances(components, components, 'euclidean')

    closest = sim.get_similar_in_geo_area(
        with_lauttasaari,
        orig_name="Vattuniemi",
        d=distances,
        target_names=names,
        n_most=1,
    )
    assert closest == ['Lauttasaari']

    without_lauttasaari = select_by_postcode(['00180', '00210'])
    closest_without = sim.get_similar_in_geo_area(
        without_lauttasaari,
        orig_name="Vattuniemi",
        d=distances,
        target_names=names,
        n_most=1,
    )
    assert closest_without == ["Kamppi - Ruoholahti"]
X, y, target_names = viz.get_pca_data(data, 2018, 5) target_names.index = range(len(target_names)) viz.exploratory_pca(X, 20) X_pca, pipe = viz.do_pca(X, 5) pca_comp = viz.generate_pca_report(pipe.named_steps['pca']) pca_comp['vars'] = viz.get_pca_cols(data) print(pca_comp) viz.pca_plot(X_pca, target_names, y.ravel()) # save pca to csv # pca_comp.to_csv('pca.csv') # similarity calculation d = similarity.pairwise_distances(X_pca, X_pca, 'euclidean') names = similarity.get_n_most_similar_with_name("Otaniemi", d, target_names, 10) print(names) data.reset_index(inplace=True, drop=True) data_l5 = data.loc[data['pono.level'] == 5, :].assign( max_factor=pd.DataFrame(X_pca.argmax(axis=1))) map_fi_plot.map_fi_postinumero(data_l5, "Highest factors per area", color_var='max_factor') map_fi_plot.map_with_highlights_names( data_l5, "How similar to Vattuniemi?", 'Vattuniemi', similarity.get_n_most_similar_with_name('Vattuniemi', d, target_names, 15)) map_fi_plot.map_with_highlights_names(