Exemplo n.º 1
0
def main():
    """Train an undersampled RandomForest on the training set, persist it
    with joblib, and write predictions for the test set to CSV.
    """
    dataset_train = read_dataset(
        "https://raw.githubusercontent.com/dataminerdbm/test_data_scientist/main/treino.csv"
    )
    dataset_test = read_dataset(
        "https://raw.githubusercontent.com/dataminerdbm/test_data_scientist/main/teste.csv"
    )

    # Keep an untouched copy of the raw test set for the final CSV output.
    # BUG FIX: previously the file was downloaded a second time at the end.
    dataset_test_raw_df = dataset_test.copy()

    # Normalize missing markers so downstream imputation sees NaN.
    dataset_train.replace(to_replace=[None], value=np.nan, inplace=True)
    dataset_test.replace(to_replace=[None], value=np.nan, inplace=True)

    raw_dataset_values_train = dataset_train.drop(columns=['inadimplente'])

    transformed_values_train = input_data(raw_dataset_values_train)
    transformed_values_test = input_data(dataset_test)

    # The same scale must be used for training and test data:
    # https://datascience.stackexchange.com/questions/27615/should-we-apply-normalization-to-test-data-as-well
    scaler = StandardScaler()
    standardized_values_train = scaler.fit_transform(transformed_values_train)
    standardized_values_test = scaler.transform(transformed_values_test)

    standardized_values_train = pd.DataFrame(
        standardized_values_train, columns=raw_dataset_values_train.keys())
    standardized_values_test = pd.DataFrame(standardized_values_test,
                                            columns=dataset_test.keys())

    train_x = standardized_values_train
    train_y = dataset_train.inadimplente
    test_x = standardized_values_test

    # The base is imbalanced: undersample the majority class before fitting.
    undersample = RandomUnderSampler(sampling_strategy='majority')
    X_under, y_under = undersample.fit_resample(train_x, train_y)

    model = RandomForestClassifier()
    model.fit(X_under, y_under)

    # BUG FIX: the filename said "adaboost" but the model is a RandomForest.
    filename = 'test_data_scientist_dataminer/modelo-random-forest.joblib'
    dump(model, filename)

    # BUG FIX: the reloaded model was never used; predicting with it also
    # verifies the round-trip through joblib.
    loaded_model = load(filename)
    predictions = loaded_model.predict(test_x)

    dataset_test_raw_df['inadimplente'] = predictions
    dataset_test_raw_df.to_csv("test_data_scientist_dataminer/teste.csv",
                               index=False)
Exemplo n.º 2
0
def note():
    """Persist a love note, creating the notes file on first use."""
    entry = dict(note=raw_input())
    if os.path.isfile(LOVE_NOTES_FILE_PATH):
        # The file already exists: append this note to it.
        append_data_into_file(entry, LOVE_NOTES_FILE_PATH)
    else:
        # First note ever: create the file with a top-level "notes" list.
        util.input_data(dict(notes=[entry]), LOVE_NOTES_FILE_PATH)
Exemplo n.º 3
0
def complete_task():
    """Interactively mark one of today's open tasks as completed.

    Reads today's task entries from YAML, lists the still-open ones,
    prompts until a valid task number is given, then persists the update.
    """
    if not os.path.isfile(TODAYS_TASKS_ENTRY_FILE_PATH):
        chalk.red(
            'There are no tasks for today. Add a new task by entering "yoda diary nt"')
        return

    with open(TODAYS_TASKS_ENTRY_FILE_PATH, 'r') as todays_tasks_entry:
        # BUG FIX: yaml.load without an explicit Loader is deprecated and can
        # construct arbitrary objects; the file holds plain dicts, so
        # safe_load suffices.
        contents = yaml.safe_load(todays_tasks_entry)

    entries = contents['entries']

    # An entry with status == 0 is still open.
    if all(entry['status'] != 0 for entry in entries):
        # BUG FIX: typo "competed" -> "completed"; normalized the double
        # space in the "yoda diary nt" hint.
        chalk.green(
            'All tasks have been completed! Add a new task by entering "yoda diary nt"')
        return

    click.echo('Today\'s agenda:')
    click.echo('----------------')
    click.echo("Number |  Time   | Task")
    click.echo("-------|---------|-----")

    for i, entry in enumerate(entries, start=1):
        # Only open tasks are listed; completed ones are skipped.
        if entry['status'] == 0:
            click.echo("   " + str(i) + "   | " +
                       entry['time'] + ": " + entry['text'])

    while True:
        chalk.blue(
            'Enter the task number that you would like to set as completed')
        try:
            task_to_be_completed = int(raw_input())
        except ValueError:
            # Robustness: non-numeric input no longer crashes the command.
            chalk.red('Please Enter a valid task number!')
            continue
        # BUG FIX: the original only checked the upper bound, so 0 and
        # negative numbers silently completed the wrong entry.
        if not 1 <= task_to_be_completed <= len(entries):
            chalk.red('Please Enter a valid task number!')
        else:
            entries[task_to_be_completed - 1]['status'] = 1
            util.input_data(contents, TODAYS_TASKS_ENTRY_FILE_PATH)
            break
Exemplo n.º 4
0
def setup():
    """Interactively configure the money module: default currency and the
    initial balance, persisted to the money config file.
    """
    util.create_folder(MONEY_CONFIG_FOLDER_PATH)

    # Bail out if the user declines to overwrite an existing config.
    if util.ask_overwrite(MONEY_CONFIG_FILE_PATH):
        return

    chalk.blue('Enter default currency code:')
    currency_code = (raw_input().strip())
    # Echo rates/symbol/name so the user can confirm the code is valid.
    click.echo(currency_rates.get_rates(currency_code))
    click.echo(currency_codes.get_symbol(currency_code))
    click.echo(currency_codes.get_currency_name(currency_code))

    # BUG FIX: prompt typo "inital" -> "initial".
    chalk.blue('Enter initial amount:')
    initial_money = int(raw_input().strip())

    setup_data = dict(currency_code=currency_code, initial_money=initial_money)

    util.input_data(setup_data, MONEY_CONFIG_FILE_PATH)
Exemplo n.º 5
0
def setup():
    """Collect basic details about the loved one and persist them to the
    love config file.
    """
    util.create_folder(LOVE_CONFIG_FOLDER_PATH)

    # Bail out if the user declines to overwrite an existing config.
    if util.ask_overwrite(LOVE_CONFIG_FILE_PATH):
        return

    # Ask each question in turn, collecting stripped answers.
    prompts = (
        ('name', 'Enter their name:'),
        ('sex', 'Enter sex(M/F):'),
        ('place', 'Where do they live?'),
    )
    answers = {}
    for field, prompt in prompts:
        chalk.blue(prompt)
        answers[field] = raw_input().strip()

    setup_data = dict(
        name=answers['name'],
        place=answers['place'],
        sex=answers['sex']
    )

    util.input_data(setup_data, LOVE_CONFIG_FILE_PATH)
Exemplo n.º 6
0
def new_note():
    """Record a note entry for today, creating the notes file on first use."""
    today_entry_check()

    chalk.blue('Input your entry for note:')
    note = raw_input().strip()

    if os.path.isfile(TODAYS_NOTES_ENTRY_FILE_PATH):
        # BUG FIX: the original opened the file for reading here without ever
        # using the handle; the pointless open has been removed.
        setup_data = dict(
            time=now_time(),
            text=note
        )
        append_data_into_file(setup_data, TODAYS_NOTES_ENTRY_FILE_PATH)
    else:
        # First note today: create the file with an "entries" list.
        setup_data = dict(
            entries=[
                dict(
                    time=now_time(),
                    text=note
                )
            ]
        )
        util.input_data(setup_data, TODAYS_NOTES_ENTRY_FILE_PATH)
Exemplo n.º 7
0
def new_task():
    """Record a task entry for today, creating the tasks file on first use."""
    today_entry_check()

    chalk.blue('Input your entry for task:')
    task_text = raw_input().strip()

    # status 0 marks the task as not yet completed.
    entry = dict(time=now_time(), text=task_text, status=0)

    if os.path.isfile(TODAYS_TASKS_ENTRY_FILE_PATH):
        # Existing file: append the new task entry.
        append_data_into_file(entry, TODAYS_TASKS_ENTRY_FILE_PATH)
    else:
        # First task today: create the file with an "entries" list.
        util.input_data(dict(entries=[entry]), TODAYS_TASKS_ENTRY_FILE_PATH)
def main():
    """Compare several classifiers on the credit-default dataset, with and
    without the correlated features, using cross-validation and holdout.
    """
    dataset = read_dataset(
        "https://raw.githubusercontent.com/dataminerdbm/test_data_scientist/main/treino.csv"
    )

    # Normalize missing markers so downstream imputation sees NaN.
    dataset.replace(to_replace=[None], value=np.nan, inplace=True)

    raw_dataset_values = dataset.drop(columns=['inadimplente'])

    transformed_values = input_data(raw_dataset_values)

    standardized_values = rescale_data(transformed_values, raw_dataset_values)

    # calc_corr_fig(standardized_values)

    x = standardized_values
    # Drop the remaining correlated features, keeping only one per group.
    x_without_corr_feat = standardized_values.drop(columns=[
        'vezes_passou_de_30_59_dias', 'numero_de_vezes_que_passou_60_89_dias'
    ])
    y = dataset.inadimplente

    SEED = 7707
    np.random.seed(SEED)
    # Stratified split because the class distribution is imbalanced.
    train_x, test_x, train_y, test_y = train_test_split(x,
                                                        y,
                                                        test_size=0.3,
                                                        stratify=y)

    train_x_without_corr_feat, test_x_without_corr_feat, train_y_without_corr_feat, test_y_without_corr_feat = train_test_split(
        x_without_corr_feat, y, test_size=0.3, stratify=y)

    undersample = RandomUnderSampler(sampling_strategy='majority')

    X_without_corr_feat_under, y_without_corr_feat_under = undersample.fit_resample(
        x_without_corr_feat, y)
    x_under, y_under = undersample.fit_resample(x, y)
    train_x_under, train_y_under = undersample.fit_resample(train_x, train_y)
    train_x_without_corr_feat_under, train_y_without_corr_feat_under = undersample.fit_resample(
        train_x_without_corr_feat, train_y_without_corr_feat)

    #tsne_scatterplot(x_without_corr_feat, y)

    # Classifiers chosen for the dataset's characteristics: numeric features,
    # many instances, high dimensionality, not linearly separable.
    models = [
        DummyClassifier(),
        KNeighborsClassifier(),
        DecisionTreeClassifier(),
        GaussianNB(),
        AdaBoostClassifier(n_estimators=100),
        RandomForestClassifier(),
        BaggingClassifier(base_estimator=GaussianNB(), n_estimators=100)
    ]
    k_size = 5

    # Randomized age-based groups for the CV fold grouping (avoids repeated
    # groups; better suited to imbalanced bases).
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GroupKFold.html#sklearn.model_selection.GroupKFold
    # BUG FIX: the noise vector size was hard-coded to 14662; it now follows
    # the actual number of undersampled rows.
    x_under['idade_r'] = x_under.idade + np.random.randint(-2, 3,
                                                           size=len(x_under))
    # BUG FIX: the shift below was recomputed from the raw `idade`, silently
    # discarding the random noise added above; shift the noisy column instead.
    x_under.idade_r = x_under.idade_r + abs(x_under.idade_r.min()) + 1

    print("Validando modelos com todas as características")
    validate_models_cv(x_under, y_under, x_under.idade_r, models, k_size)
    validate_models_holdout(train_x_under, train_y_under, test_x, test_y,
                            models, k_size)

    print("Validando modelos sem as características correlacionadas")
    validate_models_cv(X_without_corr_feat_under, y_without_corr_feat_under,
                       x_under.idade_r, models, k_size)
    validate_models_holdout(train_x_without_corr_feat_under,
                            train_y_without_corr_feat_under,
                            test_x_without_corr_feat, test_y_without_corr_feat,
                            models, k_size)