def test_pickle_features_with_custom_primitive(es):
    NewMean = make_agg_primitive(
        np.nanmean,
        name="NewMean",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean, NewMean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()
    assert any([isinstance(feat, NewMean) for feat in features_no_pickle])
    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(es, es_filepath)

    ft.save_features(features_no_pickle, filepath)
    features_pickle = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < asizeof(es)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
def test_pickle_features(es):
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(es, es_filepath)

    ft.save_features(features_no_pickle, filepath)
    features_pickle = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < asizeof(es)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
Example #3
def test_s3_test_profile(es, s3_client, s3_bucket, setup_test_profile):
    features_original = ft.dfs(target_entity='sessions',
                               entityset=es,
                               features_only=True)

    ft.save_features(features_original, TEST_S3_URL, profile_name='test')

    obj = list(s3_bucket.objects.all())[0].key
    s3_client.ObjectAcl(BUCKET_NAME, obj).put(ACL='public-read-write')

    features_deserialized = ft.load_features(TEST_S3_URL, profile_name='test')
    assert_features(features_original, features_deserialized)
Example #4
def construct_retail_example(ftens_file='retail_binary_files/ftens.csv',
                             labels_file='retail_binary_files/labels.csv',
                             fl_file='retail_binary_files/fl.p'):
    es = ft.demo.load_retail()
    if os.path.exists(ftens_file):
        ftens = pd.read_csv(ftens_file,
                            index_col=['customer_id', 'time'],
                            parse_dates=['time'])
        labels = pd.read_csv(labels_file, index_col='customer_id')['label']
        fl = ft.load_features(fl_file, es)
    else:
        labels = create_labels(es,
                               min_training_data='8 days',
                               lead='7 days',
                               window='30 days',
                               reduce='sum',
                               binarize=None,
                               iterate_by=None)
        labels_binary = labels.copy()
        labels_binary['label'] = labels_binary['label'] > 300
        sampled = sample_labels(labels_binary, n=1)
        sampled = sampled[['customer_id', 'time', 'label']]
        sampled = sampled.sample(300)

        ftens, fl = ft.tdfs(target_entity='customers',
                            entityset=es,
                            cutoffs=sampled,
                            window_size='30d',
                            num_windows=5,
                            verbose=True)

        ftens = (ftens.reset_index(
            'customer_id', drop=False).reset_index(drop=False).merge(
                sampled[['customer_id', 'label']],
                on='customer_id',
                how='left').set_index('customer_id').set_index('time',
                                                               append=True))

        labels = (ftens['label'].reset_index(
            'customer_id',
            drop=False).drop_duplicates('customer_id').set_index('customer_id')
                  )
        del ftens['label']
        ftens.to_csv(ftens_file)
        labels.to_csv(labels_file)
        labels = labels['label']
        ft.save_features(fl, fl_file)
    return ftens, labels, fl
def test_pickle_features(es):
    features_original = ft.dfs(target_entity='sessions', entityset=es, features_only=True)

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')

    ft.save_features(features_original, filepath)
    features_deserialized = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_original, features_deserialized):
        assert feat_1.unique_name() == feat_2.unique_name()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < asizeof(es)

    os.remove(filepath)
Example #6
    def test_serialize(self, es):
        features = dfs(
            entityset=es,
            target_dataframe_name="log",
            trans_primitives=[self.primitive],
            max_features=-1,
            max_depth=3,
            features_only=True,
        )

        feat_to_serialize = None
        for feature in features:
            if feature.primitive.__class__ == self.primitive:
                feat_to_serialize = feature
                break
            for base_feature in feature.get_dependencies(deep=True):
                if base_feature.primitive.__class__ == self.primitive:
                    feat_to_serialize = base_feature
                    break
        assert feat_to_serialize is not None

        # Skip calculating feature matrix for long running primitives
        skip_primitives = ["elmo"]

        if self.primitive.name not in skip_primitives:
            df1 = calculate_feature_matrix([feat_to_serialize], entityset=es)

        new_feat = load_features(save_features([feat_to_serialize]))[0]
        assert isinstance(new_feat, ft.FeatureBase)

        if self.primitive.name not in skip_primitives:
            df2 = calculate_feature_matrix([new_feat], entityset=es)
            assert df1.equals(df2)
Example #7
def get_train_data(project,
                   train_file,
                   prediction_key,
                   prediction_target,
                   variable_types={},
                   drop_columns=None):

    # Read the training data
    print("==========Reading the training file {}".format(train_file))
    train_data = pd.read_csv(train_file)
    train_data.head(5)

    print("==========Preparing training labels for target {}".format(
        prediction_target))
    train_labels = train_data[prediction_target].values
    train_data = train_data.drop(prediction_target, axis=1)

    if drop_columns is not None:
        print("==========dropping columns {}".format(drop_columns))
        train_data = train_data.drop(drop_columns, axis=1)

    print("==========Generating the feature with featuretools")

    es = ft.EntitySet(project)

    entities = get_ft_entities(es=es,
                               project=project,
                               prediction_key=prediction_key,
                               data=train_data,
                               variable_types=variable_types)

    print("==========entities are:")
    print(entities)

    feature_matrix, feature_defs = ft.dfs(entityset=entities,
                                          target_entity=project)

    feature_matrix_enc, features_enc = ft.encode_features(
        feature_matrix, feature_defs)
    print("==========columns are:")
    print(feature_matrix_enc.columns)

    print("==========saving features to {}".format(project))
    ft.save_features(feature_defs, "data/{}/ft_features".format(project))

    return feature_matrix_enc, train_labels
Example #8
def build_card_one_hot():
    """ Reads in the raw data from train.csv and creates
        one-hot encodings for the feature and date fields.

        :return: Data frame with one-hot encoding
    """

    logger = logging.getLogger(__name__)
    logger.info("Reading in data.")
    df = pd.read_csv('data/raw/train.csv')
    df['first_active_month'] = pd.to_datetime(df['first_active_month'] + "-01")

    logger.info("Creating entity set")
    es_train = ft.EntitySet()
    es_train = es_train.entity_from_dataframe(entity_id='transactions',
                                              dataframe=df,
                                              index='card_id',
                                              time_index="first_active_month",
                                              variable_types=CARD_TYPES)

    feature_matrix, feature_defs = ft.dfs(entityset=es_train,
                                          target_entity="transactions")

    logger.info("Creating one-hot training data")
    train_feature_matrix_enc, features_enc = ft.encode_features(
        feature_matrix, feature_defs)

    ft.save_features(features_enc, "feature_definitions")
    saved_features = ft.load_features('feature_definitions')

    logger.info("Creating one-hot test data")
    df = pd.read_csv('data/raw/test.csv')
    df['first_active_month'] = pd.to_datetime(df['first_active_month'] + "-01")
    df['target'] = 0
    es_test = ft.EntitySet()
    es_test = es_test.entity_from_dataframe(entity_id='transactions',
                                            dataframe=df,
                                            index='card_id',
                                            time_index="first_active_month",
                                            variable_types=CARD_TYPES)

    test_feature_matrix_enc = ft.calculate_feature_matrix(
        saved_features, es_test)
    test_feature_matrix_enc.drop(columns='target', inplace=True)

    return train_feature_matrix_enc, test_feature_matrix_enc
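
# Minimal usage sketch (assumes the data/raw CSVs and CARD_TYPES referenced
# above are available); both frames are built from the same saved feature
# definitions.
train_fm, test_fm = build_card_one_hot()
print(train_fm.shape, test_fm.shape)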
Example #9
def test_custom_feature_names_retained_during_serialization(pd_es, tmpdir):
    class MultiCumulative(TransformPrimitive):
        name = "multi_cum_sum"
        input_types = [ColumnSchema(semantic_tags={"numeric"})]
        return_type = ColumnSchema(semantic_tags={"numeric"})
        number_output_features = 3

    multi_output_trans_feat = ft.Feature(
        pd_es["log"].ww["value"], primitive=MultiCumulative
    )
    groupby_trans_feat = ft.GroupByTransformFeature(
        pd_es["log"].ww["value"],
        primitive=MultiCumulative,
        groupby=pd_es["log"].ww["product_id"],
    )
    multi_output_agg_feat = ft.Feature(
        pd_es["log"].ww["product_id"],
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    slice = FeatureOutputSlice(multi_output_trans_feat, 1)
    stacked_feat = ft.Feature(slice, primitive=Negate)

    trans_names = ["cumulative_sum", "cumulative_max", "cumulative_min"]
    multi_output_trans_feat.set_feature_names(trans_names)
    groupby_trans_names = ["grouped_sum", "grouped_max", "grouped_min"]
    groupby_trans_feat.set_feature_names(groupby_trans_names)
    agg_names = ["first_most_common", "second_most_common"]
    multi_output_agg_feat.set_feature_names(agg_names)

    features = [
        multi_output_trans_feat,
        multi_output_agg_feat,
        groupby_trans_feat,
        stacked_feat,
    ]
    file = os.path.join(tmpdir, "features.json")
    ft.save_features(features, file)
    deserialized_features = ft.load_features(file)

    new_trans, new_agg, new_groupby, new_stacked = deserialized_features
    assert new_trans.get_feature_names() == trans_names
    assert new_agg.get_feature_names() == agg_names
    assert new_groupby.get_feature_names() == groupby_trans_names
    assert new_stacked.get_feature_names() == ["-(cumulative_max)"]
def pickle_features_test_helper(es_size, features_original, dir_path):
    filepath = os.path.join(dir_path, 'test_feature')

    ft.save_features(features_original, filepath)
    features_deserializedA = ft.load_features(filepath)
    assert os.path.getsize(filepath) < es_size
    os.remove(filepath)

    with open(filepath, "w") as f:
        ft.save_features(features_original, f)
    features_deserializedB = ft.load_features(open(filepath))
    assert os.path.getsize(filepath) < es_size
    os.remove(filepath)

    features = ft.save_features(features_original)
    features_deserializedC = ft.load_features(features)
    assert asizeof(features) < es_size

    features_deserialized_options = [features_deserializedA, features_deserializedB, features_deserializedC]
    for features_deserialized in features_deserialized_options:
        assert_features(features_original, features_deserialized)
def test_pickle_features_with_custom_primitive(es):
    NewMean = make_agg_primitive(
        np.nanmean,
        name="NewMean",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   filters=[],
                                   agg_primitives=[Last, Mean, NewMean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()
    assert any([isinstance(feat, NewMean) for feat in features_no_pickle])
    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(features_no_pickle[0].entityset, es_filepath)

    ft.save_features(features_no_pickle, filepath)
    features_pickle = ft.load_features(filepath, es)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < getsize(feat_1.entityset)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
Example #12
def pickle_features_test_helper(es_size, features_original):
    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')

    ft.save_features(features_original, filepath)
    features_deserializedA = ft.load_features(filepath)
    assert os.path.getsize(filepath) < es_size
    os.remove(filepath)

    with open(filepath, "w") as f:
        ft.save_features(features_original, f)
    features_deserializedB = ft.load_features(open(filepath))
    assert os.path.getsize(filepath) < es_size
    os.remove(filepath)

    features = ft.save_features(features_original)
    features_deserializedC = ft.load_features(features)
    assert asizeof(features) < es_size

    features_deserialized_options = [features_deserializedA, features_deserializedB, features_deserializedC]
    for features_deserialized in features_deserialized_options:
        for feat_1, feat_2 in zip(features_original, features_deserialized):
            assert feat_1.unique_name() == feat_2.unique_name()
            assert feat_1.entityset == feat_2.entityset
Example #13
def test_serialize_url(es):
    features_original = ft.dfs(target_entity='sessions', entityset=es, features_only=True)
    error_text = "Writing to URLs is not supported"
    with pytest.raises(ValueError, match=error_text):
        ft.save_features(features_original, URL)
Example #14
def deploy_features_create(features_enc, model_path):
    '''
    You can save self.features_def to feature_definitions.json for deploying.
    '''
    ft.save_features(features_enc, model_path)
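
# Deployment-side sketch (a hypothetical helper, not part of the original code):
# the definitions saved above can be reloaded with ft.load_features and applied
# to a fresh EntitySet with ft.calculate_feature_matrix.
def deploy_features_load(model_path, entityset):
    saved_features = ft.load_features(model_path)
    return ft.calculate_feature_matrix(saved_features, entityset=entityset)
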
feature_matrix, features = ft.dfs(
    target_entity="users",
    cutoff_time=label_times,
    training_window=ft.Timedelta("60 days"),
    entities=es,
    verbose=True)
fm_encode, f_encode = es.feature_encoder(feature_matrix=feature_matrix,
                                         features=features)
print("Number of features %s" % len(fm_encode))
print(fm_encode.head(10))

# Machine learning
X = merge_features_labels(fm_encode, label_times)
X.drop(["user_id", "time"], axis=1, inplace=True)
X = X.fillna(0)
y = X.pop("label")

# Random forest
clf = RandomForestClassifier(n_estimators=400, n_jobs=-1)  # number of trees
scores = cross_val_score(
    estimator=clf,
    X=X,
    y=y,
    cv=3,  # 3-fold cross-validation
    scoring="roc_auc",
    verbose=True)
print("AUC %.2f +/- %.2f" % (scores.mean(), scores.std()))

clf.fit(X, y)  # train the model
top_feature = feature_importtance(clf, f_encode, n=20)
ft.save_features(top_feature, "top_features")
Example #16
def main(users_from, users_till):
    # ### DEFINE THE PIPELINE PARAMETERS

    # In[2]:

    show_report = False
    save_model = True

    # the timeframe of extracted users

    # users_from = '2016-10-01'
    # users_till = '2017-09-30'
    cohort_size = 3000

    # the timeframe of extracted behavioral data
    interval = '3 weeks'

    # the type of the prediction problem
    # 'regression', 'binary classification', 'multiclass classification'
    prediction_problem_type = 'binary classification'

    # multiclass values
    medium_value = 5
    high_value = 50

    # number of the most important features to extract
    number_of_features = 20

    print("Pipeline parameters defined")

    # ### CONNECT TO THE DATABASE

    # In[3]:

    conn, cur = utils.connect_to_db()

    # ### BUILD ENTITY TABLES AND LABELS

    # #### Cohorts entity

    # In[4]:

    cohorts = utils_bux.build_cohorts_entity(cur=cur,
                                             users_from=users_from,
                                             users_till=users_till)

    # #### Users entity

    # In[5]:

    users = utils_bux.build_users_entity(cur=cur,
                                         users_from=users_from,
                                         users_till=users_till,
                                         interval=interval,
                                         cohorts=cohorts,
                                         cohort_size=cohort_size)

    # #### Transactions entity

    # In[6]:

    transactions = utils_bux.build_transactions_entity(cur=cur,
                                                       interval=interval)

    # #### Labels

    # In[7]:

    labels = utils_bux.build_target_values(cur=cur,
                                           medium_value=medium_value,
                                           high_value=high_value)

    # ### CREATE THE ENTITY SET

    # In[8]:

    es = utils_bux.create_bux_entity_set(cohorts, users, transactions)
    es

    # ### FEATURE ENGINEERING (DFS) FOR ALL FEATURES

    # In[9]:

    from featuretools.primitives import (Sum, Std, Max, Min, Mean, Count,
                                         PercentTrue, NUnique, Day, Week,
                                         Month, Weekday, Weekend)

    trans_primitives = [Day, Week, Month, Weekday, Weekend]
    agg_primitives = [Sum, Std, Max, Min, Mean, Count, PercentTrue, NUnique]

    fm_encoded, features_encoded = utils.calculate_feature_matrix(
        es,
        "users",
        trans_primitives=trans_primitives,
        agg_primitives=agg_primitives,
        max_depth=2)
    X = fm_encoded.reset_index().merge(labels)

    # ### TRAINING  ON ALL FEATURES

    # In[10]:

    # define the labels based on the prediction problem type
    X, y = utils.make_labels(X, prediction_problem_type)
    # split the data into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # train the model
    model = utils.rf_train(X_train, y_train, prediction_problem_type)
    # extract the most important features
    top_features = utils.feature_importances(model,
                                             features_encoded,
                                             n=number_of_features)
    # save the top features
    ft.save_features(top_features, "top_features")
    print("All features built and the most important features saved")

    # ### FEATURE ENGINEERING (DFS) FOR TOP FEATURES

    # In[11]:

    fm = utils.calculate_feature_matrix_top_features(es, top_features)
    X = fm.reset_index().merge(labels)
    print("Top features built")

    # ### TRAINING AND PREDICTION ON TOP FEATURES

    # In[12]:

    # define the labels based on the prediction problem type
    X, y = utils.make_labels(X, prediction_problem_type)
    # split the data into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # fit the model
    model = utils.rf_train(X_train, y_train, prediction_problem_type)
    print("Model trained on top features")

    # ### SAVE THE MODEL

    # In[13]:

    if save_model == True:
        joblib.dump(model, 'models/model.pkl')
        print("Model saved")
    else:
        print("Model not saved")

    # ### REPORT

    # In[ ]:

    if show_report:
        utils.show_report(model, X, y, X_train, y_train, X_test, y_test,
                          prediction_problem_type, top_features)
Example #17
def build_transaction_data():
    """ Builds a data set from raw card and transaction data
        using the featuretools package.

        The resulting data set will be strictly concerned
        with transactions shown in the historical transactions CSV,
        and linking them to the proper card.

        :return:    training, testing feature matrices
    """

    logger = logging.getLogger(__name__)
    logger.info("Reading in card data")
    customer_df = pd.read_csv("data/raw/train.csv")
    customer_df['first_active_month'] = pd.to_datetime(
        customer_df['first_active_month'] + "-01")

    customer_df.drop(columns='target', inplace=True)

    logger.info("Reading in transactions")
    transactions_df = pd.read_csv("data/raw/historical_transactions.csv",
                                  dtype=TRANSACTION_LOAD_DTYPES)
    transactions_df['authorized_flag'] = np.where(
        transactions_df['authorized_flag'] == 'Y', 1, 0)
    transactions_df.reset_index(inplace=True)

    logger.info("Creating training entity set")
    es_train = ft.EntitySet()
    es_train = es_train.entity_from_dataframe(entity_id='customer',
                                              dataframe=customer_df,
                                              index='card_id',
                                              time_index='first_active_month',
                                              variable_types=CARD_TYPES)

    es_train = es_train.entity_from_dataframe(entity_id='transactions',
                                              dataframe=transactions_df,
                                              index='index',
                                              variable_types=TRANSACTION_TYPES)

    del customer_df
    gc.collect()

    logger.info("Defining relationships")
    relationship = ft.Relationship(es_train['customer']['card_id'],
                                   es_train['transactions']['card_id'])

    es_train = es_train.add_relationship(relationship)

    feature_matrix, feature_defs = ft.dfs(entityset=es_train,
                                          target_entity='customer')

    train_feature_matrix_enc, features_enc = ft.encode_features(
        feature_matrix, feature_defs)

    ft.save_features(features_enc, "feature_definitions")
    saved_features = ft.load_features('feature_definitions')

    logger.info("Loading test data")
    customer_df = pd.read_csv("data/raw/test.csv")
    customer_df['first_active_month'] = pd.to_datetime(
        customer_df['first_active_month'] + "-01")

    logger.info("Creating testing entity set")
    es_test = ft.EntitySet()
    es_test = es_test.entity_from_dataframe(entity_id='customer',
                                            dataframe=customer_df,
                                            index='card_id',
                                            time_index='first_active_month',
                                            variable_types=CARD_TYPES)

    es_test = es_test.entity_from_dataframe(entity_id='transactions',
                                            dataframe=transactions_df,
                                            index='index',
                                            variable_types=TRANSACTION_TYPES)

    es_test = es_test.add_relationship(relationship)

    test_feature_matrix_enc = ft.calculate_feature_matrix(
        saved_features, es_test)

    for col in train_feature_matrix_enc.columns:
        logger.debug(f"Normalizing feature [{col}]")
        old_min, old_max = train_feature_matrix_enc[col].agg(['min', 'max'])

        if (old_min == old_max):
            logger.debug(f"Droping feature [{col}] due to lack of variation")
            train_feature_matrix_enc.drop(columns=col, inplace=True)
            test_feature_matrix_enc.drop(columns=col, inplace=True)

            continue

        train_feature_matrix_enc[col] = normalize_series(
            series=train_feature_matrix_enc[col], min_max=(old_min, old_max))

        assert col in test_feature_matrix_enc.columns

        test_feature_matrix_enc[col] = normalize_series(
            series=test_feature_matrix_enc[col], min_max=(old_min, old_max))

    logger.info("Dropping SKEW features.")
    # TODO: Determine why these have lower counts than other features
    drop_cols = [c for c in train_feature_matrix_enc.columns if "SKEW" in c]
    train_feature_matrix_enc.drop(columns=drop_cols, inplace=True)
    test_feature_matrix_enc.drop(columns=drop_cols, inplace=True)

    return train_feature_matrix_enc, test_feature_matrix_enc
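
# Usage sketch (assumes the raw train/test/historical_transactions CSVs exist);
# this can take a while since DFS runs over the full transaction history.
train_fm, test_fm = build_transaction_data()
print(train_fm.shape, test_fm.shape)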
Example #18
def save_features_to_json(self):
    ft.save_features(self.feature_names,
                     os.path.join(self.path, self.feature_names_file))
Example #19
es

primitives = ft.list_primitives()

pd.options.display.max_colwidth = 100

default_agg_primitives =  ["sum", "std", "max", "skew", "min", "mean", "count", "percent_true", "num_unique", "mode"]
default_trans_primitives =  ["day", "year", "month", "weekday", "haversine", "num_words", "num_characters"]
feature_names = ft.dfs(entityset = es, target_entity = 'app',
                       trans_primitives = default_trans_primitives,
                       agg_primitives=default_agg_primitives, 
                       where_primitives = [], seed_features = [],
                       max_depth = 2, n_jobs = -1, verbose = 1,
                       features_only=True)

ft.save_features(feature_names, '../input/features.txt')


# Run Deep Feature Synthesis - this will take a long time to process; use parallel processing

print('Total size of entityset: {:.5f} gb.'.format(sys.getsizeof(es) / 1e9))

import psutil

print('Total number of cpus detected: {}.'.format(psutil.cpu_count()))
print('Total size of system memory: {:.5f} gb.'.format(psutil.virtual_memory().total / 1e9))


# feature_matrix, feature_names = ft.dfs(entityset=es, target_entity='app',
#                                        agg_primitives = agg_primitives,
#                                        trans_primitives = trans_primitives,
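
# A hedged sketch of the full (non features_only) run hinted at above, reusing
# the entityset and primitive lists defined earlier; n_jobs=-1 parallelizes
# across all detected cores, so expect heavy CPU and memory use.
feature_matrix, feature_names = ft.dfs(entityset=es, target_entity='app',
                                       agg_primitives=default_agg_primitives,
                                       trans_primitives=default_trans_primitives,
                                       where_primitives=[], seed_features=[],
                                       max_depth=2, n_jobs=-1, verbose=1)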
Example #20
    def transform(
        self,
        groups: Optional[Dict[str, Sequence[str]]] = None,
        use_forgotten: bool = False,
        trans_primitives: Optional[Sequence[str]] = None,
        max_depth: int = 1,
        entity_set_folder_name: Optional[str] = None,
        features_file_name: Optional[str] = None,
        n_jobs: int = 1,
        verbose: bool = True,
    ) -> pd.DataFrame:
        """
        Create new features.

        Wraps Featuretools Deep Feature Synthesis.
        Default Featuretools trans primitives are:
            - "add_numeric"
            - "subtract_numeric"
            - "multiply_numeric"
            - "divide_numeric"
            - "greater_than"
            - "less_than"
            - "and"
            - "or"

        Use relationship groups to relate variables. This avoids wasting
        time creating features from totally unrelated features.

        This is especially useful when working with datasets with many
        features. Be careful with bias.

        This method does not support multiple entities (and consequently
        agg_primitives) yet. Groups are not entities, but only clusters
        of related features.

        Args:
            groups: Dict of related feature groups. None to not use
                relationships. (default: None)
            use_forgotten: Create a relationship group for the features
                left out of the groups arg. (default: False)
            trans_primitives: Featuretools trans primitives to use.
                None to use the defaults. (default: None)
            max_depth: Number of iterations in the feature creation
                process. (default: 1)
            entity_set_folder_name: Folder name to store the entity set with
                created features. (default: None)
            features_file_name: File name to store the created feature
                names. Must be JSON. (default: None)
            n_jobs: Number of parallel workers. (default: 1)
            verbose: Verbosity. (default: True)

        Returns:
            DataFrame with new features.
        """
        # Manage groups.
        if not groups:
            groups = self._set_group(self.features)
        groups = self._fix_groups(features=self.features,
                                  groups=groups,
                                  use_forgotten=use_forgotten)

        es = self._set_entity_set(data=self._x, groups=groups)

        old_n_features = self._x.shape[1]  # For comparing later.

        if not trans_primitives:
            trans_primitives = self._TRANS_PRIMITIVES

        index_name = self._index_name(self._x)

        # Define kwargs outside the function just to improve readability.
        dfs_kwargs = {
            "entityset": es,
            "ignore_variables": {group: [index_name]
                                 for group in groups},
            "trans_primitives": trans_primitives,
            "max_depth": max_depth,
            "n_jobs": n_jobs,
            "verbose": False,
        }

        # Create features for each group.
        dfs = [
            ft.dfs(target_entity=key, **dfs_kwargs) for key in groups.keys()
        ]
        # DFS returns a tuple (df and features). Split them.
        features = [feature for _, feature_list in dfs for feature in feature_list]
        dfs = [matrix for matrix, _ in dfs]

        # Concat all params from all groups to form the new dataset.
        self._x = pd.concat(dfs, axis=1)
        # Do a little cleaning just to remove useless features.
        self._x = selection.remove_low_information_features(self._x)

        # Keep only feature names that are still in the dataset.
        # noinspection PyProtectedMember
        features = [
            feature for feature in features if feature._name in self._x.columns
        ]
        # Update property.
        # noinspection PyProtectedMember
        self.features = [feature._name for feature in features]

        # Export params.
        if entity_set_folder_name:
            es.to_csv(entity_set_folder_name)
        if features_file_name:
            ft.save_features(features, features_file_name)

        # Compare number of features.
        n_new_features = self._x.shape[1] - old_n_features
        if verbose:
            print(f"{n_new_features} features created.")

        return self._x
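
# Hedged usage sketch: `fe` stands in for an instance of the class that defines
# transform() above, and the group and column names are invented for
# illustration only.
new_x = fe.transform(
    groups={"billing": ["amount", "balance"], "usage": ["clicks", "sessions"]},
    trans_primitives=["add_numeric", "divide_numeric"],
    max_depth=1,
)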
Example #21
def test_deserializer_uses_common_primitive_instances_with_args(es, tmp_path):
    # Single argument
    scalar1 = MultiplyNumericScalar(value=1)
    scalar5 = MultiplyNumericScalar(value=5)
    features = ft.dfs(
        entityset=es,
        target_dataframe_name="products",
        features_only=True,
        agg_primitives=["sum"],
        trans_primitives=[scalar1, scalar5],
    )

    scalar1_features = [
        f
        for f in features
        if f.primitive.name == "multiply_numeric_scalar" and " * 1" in f.get_name()
    ]
    scalar5_features = [
        f
        for f in features
        if f.primitive.name == "multiply_numeric_scalar" and " * 5" in f.get_name()
    ]

    # Make sure we have multiple features of each type
    assert len(scalar1_features) > 1
    assert len(scalar5_features) > 1

    # DFS should use the passed-in primitive instance for all features
    assert all([f.primitive is scalar1 for f in scalar1_features])
    assert all([f.primitive is scalar5 for f in scalar5_features])

    file = os.path.join(tmp_path, "features.json")
    ft.save_features(features, file)
    deserialized_features = ft.load_features(file)

    new_scalar1_features = [
        f
        for f in deserialized_features
        if f.primitive.name == "multiply_numeric_scalar" and " * 1" in f.get_name()
    ]
    new_scalar5_features = [
        f
        for f in deserialized_features
        if f.primitive.name == "multiply_numeric_scalar" and " * 5" in f.get_name()
    ]

    # After deserialization all features that share a primitive should use the same primitive instance
    new_scalar1_primitive = new_scalar1_features[0].primitive
    new_scalar5_primitive = new_scalar5_features[0].primitive
    assert all([f.primitive is new_scalar1_primitive for f in new_scalar1_features])
    assert all([f.primitive is new_scalar5_primitive for f in new_scalar5_features])
    assert new_scalar1_primitive.value == 1
    assert new_scalar5_primitive.value == 5

    # Test primitive with multiple args - pandas only due to primitive compatibility
    if es.dataframe_type == Library.PANDAS.value:
        distance_to_holiday = DistanceToHoliday(
            holiday="Victoria Day", country="Canada"
        )
        features = ft.dfs(
            entityset=es,
            target_dataframe_name="customers",
            features_only=True,
            agg_primitives=[],
            trans_primitives=[distance_to_holiday],
        )

        distance_features = [
            f for f in features if f.primitive.name == "distance_to_holiday"
        ]

        assert len(distance_features) > 1

        # DFS should use the passed-in primitive instance for all features
        assert all([f.primitive is distance_to_holiday for f in distance_features])

        file = os.path.join(tmp_path, "distance_features.json")
        ft.save_features(distance_features, file)
        new_distance_features = ft.load_features(file)

        # After deserialization all features that share a primitive should use the same primitive instance
        new_distance_primitive = new_distance_features[0].primitive
        assert all(
            [f.primitive is new_distance_primitive for f in new_distance_features]
        )
        assert new_distance_primitive.holiday == "Victoria Day"
        assert new_distance_primitive.country == "Canada"

    # Test primitive with list arg
    is_in = IsIn(list_of_outputs=[5, True, "coke zero"])
    features = ft.dfs(
        entityset=es,
        target_dataframe_name="customers",
        features_only=True,
        agg_primitives=[],
        trans_primitives=[is_in],
    )

    is_in_features = [f for f in features if f.primitive.name == "isin"]
    assert len(is_in_features) > 1

    # DFS should use the passed-in primitive instance for all features
    assert all([f.primitive is is_in for f in is_in_features])

    file = os.path.join(tmp_path, "distance_features.json")
    ft.save_features(is_in_features, file)
    new_is_in_features = ft.load_features(file)

    # After deserialization all features that share a primitive should use the same primitive instance
    new_is_in_primitive = new_is_in_features[0].primitive
    assert all([f.primitive is new_is_in_primitive for f in new_is_in_features])
    assert new_is_in_primitive.list_of_outputs == [5, True, "coke zero"]