def normal_distribution():
    """Plot the normal curve fitted to the rental prices of all residences.

    Every residence returned by ``Residences.get_all_residences()`` is loaded
    into a DataFrame and the ``price`` column is summarised by its mean and
    spread.  The normal probability density with those parameters is drawn
    over the range mean +/- 3 sigma (50 sample points) using the ``ggplot``
    style.  The mean and the standard deviation, rounded to two decimals,
    are shown in the figure's suptitle.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    price_series = pd.DataFrame(Residences().get_all_residences())['price']

    mean_price = price_series.mean()
    # Standard deviation as reported in the suptitle.
    reported_std = price_series.std()
    # Spread used for the curve itself, derived from the variance.
    spread = math.sqrt(price_series.var())

    lower_bound = mean_price - 3 * spread
    upper_bound = mean_price + 3 * spread
    grid = np.linspace(lower_bound, upper_bound, 50)
    density = stats.norm.pdf(grid, mean_price, spread)

    summary = ''.join([
        'Valoarea medie: ',
        str(round(mean_price, 2)),
        '\nDeviația standard: ',
        str(round(reported_std, 2)),
    ])

    plt.style.use('ggplot')
    plt.plot(grid, density)
    plt.title('Distribuția normală a prețurilor')
    plt.xlabel('Prețul chiriei lunare')
    plt.ylabel('Probabilitatea de densitate')
    plt.suptitle(summary, x=0.75, y=0.85, fontsize=10)
    plt.show()
Пример #2
0
        def process_item(self, item, spider):
            """Attach a ``zone_id`` to *item* by matching its zone name.

            The item's ``zone`` value is compared case-insensitively against
            the names of the zones returned by ``Residences.get_zones()``:

            1. the first zone whose name contains the item's zone text wins;
            2. otherwise the first zone whose ``similar`` score against the
               item's zone text is above 0.75 wins.

            Args:
                item: mapping-like scraped item; ``zone`` is read and
                    ``zone_id`` may be written.
                spider: unused here.

            Returns:
                The same *item*, with ``zone_id`` set when a zone matched and
                unchanged otherwise.
            """
            # Nothing to match against: leave the item untouched and skip the
            # zone lookup entirely.
            if 'zone' not in item or not item['zone']:
                return item

            zone_from_item = item['zone'].strip().lower()
            # An empty (or whitespace-only) string is a substring of every
            # name, so without this guard it would always be assigned the
            # first zone's id.
            if not zone_from_item:
                return item

            zones = Residences().get_zones()
            # Lower-case each zone name once; both passes reuse the result.
            named_zones = [(zone.get('name').lower(), zone) for zone in zones]

            # Pass 1: substring match.
            for zone_name, zone in named_zones:
                if zone_from_item in zone_name:
                    item['zone_id'] = zone['zone_id']
                    return item

            # Pass 2: fuzzy match, only tried when no name contained the text.
            for zone_name, zone in named_zones:
                if similar(zone_from_item, zone_name) > 0.75:
                    item['zone_id'] = zone['zone_id']
                    return item

            return item
def plot_surface_and_rental_price():
    """Scatter-plot the monthly rental price against the livable area.

    Loads every residence via ``Residences.get_all_residences()``, drops the
    rows whose ``livable_area`` is null and plots ``livable_area`` (x axis)
    against ``price`` (y axis).

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    residences_df = pd.DataFrame(Residences().get_all_residences())
    residences_df = residences_df[residences_df.livable_area.notnull()]

    # Select the style before creating the figure so the figure itself is
    # created under the ggplot settings.
    plt.style.use('ggplot')
    plt.figure()
    plt.scatter(residences_df['livable_area'],
                residences_df['price'],
                s=15,
                edgecolor="black",
                c="darkorange")
    plt.xlabel("Suprafața locuibilă")
    plt.ylabel("Prețului chiriei lunare")
    plt.title("Suprafața locuibilă / Prețului chiriei lunare")
    # No plt.legend(): the single scatter series is given no label, so there
    # is nothing for a legend to show.
    plt.show()
def plot_rooms_vs_price():
    """Scatter-plot the monthly rental price against the number of rooms.

    Loads every residence via ``Residences.get_all_residences()``, drops the
    rows whose ``rooms`` is null and plots ``rooms`` (x axis) against
    ``price`` (y axis).

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    residences_df = pd.DataFrame(Residences().get_all_residences())
    residences_df = residences_df[residences_df.rooms.notnull()]

    # Select the style before creating the figure so the figure itself is
    # created under the ggplot settings.
    plt.style.use('ggplot')
    plt.figure()
    plt.scatter(residences_df['rooms'],
                residences_df['price'],
                s=15,
                edgecolor="black",
                c="darkorange")
    plt.xlabel("Număr camere")
    plt.ylabel("Prețului chiriei lunare")
    plt.title("Număr camere / Prețului chiriei lunare")
    # No plt.legend(): the single scatter series is given no label, so there
    # is nothing for a legend to show.
    plt.show()
def plot_surface_vs_price_per_sq_meter():
    """Scatter-plot the price per square metre against the livable area.

    Loads every residence via ``Residences.get_all_residences()``, keeps only
    the rows whose ``livable_area`` is non-null and strictly positive (so the
    division below is well defined) and plots ``livable_area`` (x axis)
    against ``price / livable_area`` (y axis).

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    residences_df = pd.DataFrame(Residences().get_all_residences())
    residences_df = residences_df[residences_df.livable_area.notnull()]
    residences_df = residences_df[residences_df.livable_area > 0]

    livable_area = residences_df['livable_area']
    price_per_sq_meter = residences_df['price'] / livable_area

    # Select the style before creating the figure so the figure itself is
    # created under the ggplot settings.
    plt.style.use('ggplot')
    plt.figure()
    plt.scatter(livable_area,
                price_per_sq_meter,
                s=15,
                edgecolor="black",
                c="darkorange")
    plt.xlabel("Suprafața locuibilă")
    plt.ylabel("Prețul pe metrul pătrat")
    # The title names both axes; it previously repeated the x-axis label.
    plt.title("Suprafața locuibilă / Prețul pe metrul pătrat")
    # No plt.legend(): the single scatter series is given no label, so there
    # is nothing for a legend to show.
    plt.show()
def predict():
    """Render the prediction page and, on POST, the predicted price interval.

    For any method other than POST the page ``layout.html`` is rendered with
    the list of zones only.

    On POST the apartment description comes from one of two sources:

    * ``url_to_crawl`` form field set: the description is whatever
      ``crawl_item`` returns for that URL.  A ``None`` result, or a result
      containing an ``error`` key, re-renders the page with an error message.
    * otherwise: every non-empty form field is converted to ``float``.  A
      value that is not numeric re-renders the page with an error message.

    The description is one-hot encoded, aligned to ``rnd_columns`` and passed
    to ``lr.predict``.  The rounded prediction is multiplied by 50 to get the
    lower bound of a 50-wide price interval.  When the description carries a
    truthy ``price`` it is classified against that interval as ``subevaluat``
    (below), ``supraevaluat`` (above) or ``evaluat corect`` (inside).

    Returns:
        The rendered ``layout.html`` template.
    """
    zones = Residences().get_zones()

    if request.method != 'POST':
        return render_template("layout.html", zones=zones)

    url_to_crawl = request.form.get('url_to_crawl')
    if url_to_crawl:
        item = crawl_item(url_to_crawl)

        # The result is already final once crawl_item has returned: nothing
        # can rebind it afterwards, so waiting on a None result would only
        # delay this same error response.
        if item is None:
            return render_template(
                "layout.html",
                zones=zones,
                error_msg=
                'Nu am putut face extrage detaliile despre apartament')

        if 'error' in item:
            return render_template("layout.html",
                                   zones=zones,
                                   error_msg=item.get('error'))
        specs = item
    else:
        specs = {}
        for key, val in request.form.items(multi=False):
            if not val:
                continue
            try:
                specs[key] = float(val)
            except ValueError:
                # Form input is user-controlled; report the problem on the
                # page instead of failing the request.
                return render_template(
                    "layout.html",
                    zones=zones,
                    error_msg='Valorile introduse trebuie să fie numerice')

    rows = [specs]
    print('Json:', rows)

    query = pd.get_dummies(pd.DataFrame(rows))
    # Align with the expected columns; columns missing from the query are
    # filled with 0.
    query = query.reindex(columns=rnd_columns, fill_value=0)

    # Named "prediction" so the local does not shadow this function's name.
    prediction = list(lr.predict(query))
    print('Prediction:', prediction)

    price_range_min = round(prediction[0]) * 50
    price_range_max = price_range_min + 50
    string_interval = "{} - {}".format(price_range_min, price_range_max)

    value = None
    listed_price = specs.get('price')
    if listed_price:
        if price_range_min > listed_price:
            value = 'subevaluat'
        elif price_range_max < listed_price:
            value = 'supraevaluat'
        else:
            value = 'evaluat corect'

    return render_template("layout.html",
                           zones=zones,
                           price_interval=string_interval,
                           specs=translate_specs(specs),
                           value=value)
from datetime import datetime
import joblib
import numpy as np
import math
from src.models.residences import Residences
import pandas as pd

from src.visualization.feature_importance import print_feature_importance

# Script settings.
residences_nr = 12000
save = False

# Columns removed from the feature matrix; 'price_interval' is the target.
list_features_to_drop = ['price', 'currency', 'price_interval']

residence_table = Residences()
residences = residence_table.get_all_residences()
print('Number of residences: {}'.format(len(residences)))

# Build the frame, replace missing values with -999 and shuffle the rows.
residences = pd.DataFrame(residences).fillna(-999).sample(frac=1)

# The model target is the price interval, not the raw price.
target = np.array(residences['price_interval'])

features = residences.drop(columns=list_features_to_drop)
features_columns = features.columns
feature_list = list(features_columns)
features = np.array(features)

# The first 90% of the shuffled rows form the training split; features and
# target have the same length, so one cut-off serves both.
train_count = math.floor(len(features) * 0.9)
data_train = features[:train_count]
target_train = target[:train_count]
Пример #8
0
import numpy as np

from src.models.residences import Residences
import pandas as pd

# Columns removed from the feature matrices; 'price' is the target.
list_features_to_drop = [
    'price', '_sa_instance_state', 'id', 'main_ad_id', 'conditioning',
    'heating', 'currency', 'availability', 'status', 'created_at'
]

residence_table = Residences()
residences = residence_table.get_residences(1600)
residences = pd.DataFrame(residences)
# Replace missing values with -1, then shuffle the rows.
residences = residences.fillna(-1)
residences = residences.sample(frac=1)

# 80% of the shuffled rows train the model and the remaining 20% test it.
# int() truncation equals floor for a non-negative length and avoids
# ``np.math``, which no longer exists in NumPy 2.0.
split_index = int(len(residences) * 0.8)
residences_train = residences.iloc[:split_index]
residences_test = residences.iloc[split_index:]

# Features are taken from the same subset as the matching target so the row
# counts agree and the two splits do not overlap.
target_train = np.array(residences_train['price'])
features_train = residences_train.drop(list_features_to_drop, axis=1)
feature_train_list = list(features_train.columns)
features_train = np.array(features_train)

target_test = np.array(residences_test['price'])
features_test = residences_test.drop(list_features_to_drop, axis=1)
feature_test_list = list(features_test.columns)
features_test = np.array(features_test)

## RANDOM FOREST - KFOLD AND MODEL
    def open_spider(self, spider):
        """Store the ad locations on *spider* as ``spider.ad_locations``.

        The locations are read through ``Residences.get_ad_locations()``.
        """
        spider.ad_locations = Residences().get_ad_locations()