def test_tokenize_entityset(es, int_es):
    dupe = make_ecommerce_entityset()

    # check identical entitysets hash to same token
    assert tokenize(es) == tokenize(dupe)

    # not same if product relationship is missing
    productless = make_ecommerce_entityset()
    productless.relationships.pop()
    assert tokenize(es) != tokenize(productless)

    # not same if integer entityset
    assert tokenize(es) != tokenize(int_es)

    # add row to cohorts; the token hashes the entityset's schema rather than
    # its data, so a data-only change leaves it unchanged
    cohorts_df = dupe['cohorts'].df
    new_row = pd.DataFrame(data={'cohort': [2],
                                 'cohort_name': ['On Time Adopters'],
                                 'cohort_end': [pd.Timestamp('2011-04-08 12:00:00')]},
                           columns=['cohort', 'cohort_name', 'cohort_end'],
                           index=[2])
    more_cohorts = cohorts_df.append(new_row, ignore_index=True, sort=True)
    dupe['cohorts'].update_data(more_cohorts)
    assert tokenize(es) == tokenize(dupe)
def test_tokenize_entityset(pd_es, int_es):
    dupe = make_ecommerce_entityset()

    # check identical entitysets hash to same token
    assert tokenize(pd_es) == tokenize(dupe)

    # not same if product relationship is missing
    productless = make_ecommerce_entityset()
    productless.relationships.pop()
    assert tokenize(pd_es) != tokenize(productless)

    # not same if integer entityset
    assert tokenize(pd_es) != tokenize(int_es)

    # add row to cohorts; the token is derived from the entityset's metadata,
    # so a data-only change leaves it unchanged
    cohorts_df = dupe["cohorts"]
    new_row = pd.DataFrame(
        data={
            "cohort": [2],
            "cohort_name": None,
            "cohort_end": [pd.Timestamp("2011-04-08 12:00:00")],
        },
        columns=["cohort", "cohort_name", "cohort_end"],
        index=[2],
    )
    more_cohorts = cohorts_df.append(new_row, ignore_index=True, sort=True)
    dupe.replace_dataframe(dataframe_name="cohorts", df=more_cohorts)
    assert tokenize(pd_es) == tokenize(dupe)
def test_eq(es):
    other_es = make_ecommerce_entityset()
    latlong = es['log'].df['latlong'].copy()

    # identical entities compare equal, and the comparison leaves data intact
    assert es['log'].__eq__(es['log'], deep=True)
    assert es['log'].__eq__(other_es['log'], deep=True)
    assert (es['log'].df['latlong'] == latlong).all()

    # not equal once one side has interesting values
    other_es['log'].add_interesting_values()
    assert not es['log'].__eq__(other_es['log'], deep=True)

    # renaming log to 'customers' is not enough: index, time index, and
    # secondary time index also have to match
    es['log'].id = 'customers'
    es['log'].index = 'notid'
    assert not es['customers'].__eq__(es['log'], deep=True)

    es['log'].index = 'id'
    assert not es['customers'].__eq__(es['log'], deep=True)

    es['log'].time_index = 'signup_date'
    assert not es['customers'].__eq__(es['log'], deep=True)

    es['log'].secondary_time_index = {
        'cancel_date': ['cancel_reason', 'cancel_date']
    }
    assert not es['customers'].__eq__(es['log'], deep=True)
def test_ecommerce():
    es = make_ecommerce_entityset()
    cutoffs = es['log'].df[['session_id', 'datetime']]
    cutoffs = cutoffs.rename(columns={'session_id': 'id'})

    # build a feature tensor per session at each cutoff time
    ftens, fl = ft.dfs(entityset=es,
                       cutoff_time=cutoffs,
                       target_entity="sessions",
                       cutoff_time_in_index=True)
    ftens.sort_index(inplace=True)
    ids = ftens.index.get_level_values('id').drop_duplicates()
    n_instances = ids.shape[0]

    # synthesize one label per instance for each task type
    labels_binary = [i % 2 for i in range(n_instances)]
    labels_multiclass = np.random.randint(10, size=(n_instances, ))
    labels_regression = np.random.random(size=(n_instances, ))
    labels = pd.DataFrame({'label_binary': labels_binary,
                           'label_multiclass': labels_multiclass,
                           'label_regression': labels_regression},
                          index=ids)

    ftens = (ftens.reset_index('id', drop=False)
                  .merge(labels, left_on='id', right_index=True, how='left')
                  .set_index('id', append=True))
    train_ftens, test_ftens = train_test_split(ftens, test_size=0.4,
                                               shuffle=False)
    train_labels = train_ftens[labels.columns]
    test_labels = test_ftens[labels.columns]
    for c in labels.columns:
        del train_ftens[c]
        del test_ftens[c]

    scores = {}
    # f1_macro is a module-level helper (see the sketch below)
    scoring_functions = {'label_regression': mean_absolute_error,
                         'label_binary': roc_auc_score,
                         'label_multiclass': f1_macro}
    for label_type in labels.columns:
        classes = labels[label_type].unique()
        dl_model = DLDB(regression=label_type == 'label_regression',
                        classes=classes,
                        categorical_max_vocab=10)
        dl_model.fit(train_ftens,
                     train_labels[label_type].values,
                     fl=fl,
                     epochs=1,
                     batch_size=4)
        predictions = dl_model.predict(test_ftens)
        score = scoring_functions[label_type](test_labels[label_type].values,
                                              predictions)
        scores[label_type] = score
    return scores
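# ``f1_macro`` is referenced by test_ecommerce above but not defined in the
# snippet; it is presumably a module-level helper. A minimal sketch, assuming
# the model's multiclass predictions are class labels rather than
# probabilities (the exact definition in the original module may differ):
from sklearn.metrics import f1_score


def f1_macro(y_true, y_pred):
    # macro-averaged F1: unweighted mean of the per-class F1 scores
    return f1_score(y_true, y_pred, average='macro')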
def test_eq(es):
    other_es = make_ecommerce_entityset()
    latlong = es['log'].df['latlong'].copy()

    assert es['log'].__eq__(es['log'], deep=True)
    assert es['log'].__eq__(other_es['log'], deep=True)
    assert all(to_pandas(es['log'].df['latlong']).eq(to_pandas(latlong)))

    # Test different index
    other_es['log'].index = None
    assert not es['log'].__eq__(other_es['log'])
    other_es['log'].index = 'id'
    assert es['log'].__eq__(other_es['log'])

    # Test different time index
    other_es['log'].time_index = None
    assert not es['log'].__eq__(other_es['log'])
    other_es['log'].time_index = 'datetime'
    assert es['log'].__eq__(other_es['log'])

    # Test different secondary time index
    other_es['customers'].secondary_time_index = {}
    assert not es['customers'].__eq__(other_es['customers'])
    other_es['customers'].secondary_time_index = {
        'cancel_date': ['cancel_reason', 'cancel_date']
    }
    assert es['customers'].__eq__(other_es['customers'])

    original_variables = es['sessions'].variables
    # Test different variable list length
    other_es['sessions'].variables = original_variables[:-1]
    assert not es['sessions'].__eq__(other_es['sessions'])

    # Test different variable list contents
    other_es['sessions'].variables = original_variables[:-1] + [
        original_variables[0]
    ]
    assert not es['sessions'].__eq__(other_es['sessions'])

    # Test different interesting values
    assert es['log'].__eq__(other_es['log'], deep=True)
    other_es['log'].add_interesting_values()
    assert not es['log'].__eq__(other_es['log'], deep=True)

    # Check one with last time index, one without
    other_es['log'].last_time_index = other_es['log'].df['datetime']
    assert not other_es['log'].__eq__(es['log'], deep=True)
    assert not es['log'].__eq__(other_es['log'], deep=True)

    # Both set, with different values
    es['log'].last_time_index = other_es['log'].last_time_index + pd.Timedelta('1h')
    assert not other_es['log'].__eq__(es['log'], deep=True)

    # Check different dataframes
    other_es['stores'].df = other_es['stores'].df.head(0)
    assert not other_es['stores'].__eq__(es['stores'], deep=True)
def create_feature_matrix():
    es = make_ecommerce_entityset()
    f1 = ft.Feature(es["log"]["product_id"])
    f2 = ft.Feature(es["log"]["value"])
    features = [f1, f2]
    ids = [0, 1, 2, 3, 4, 5]
    feature_matrix = ft.calculate_feature_matrix(features, es, instance_ids=ids)
    return feature_matrix, features, f1, f2, es, ids
def test_multi_output_selection():
    df1 = pd.DataFrame({"id": [0, 1, 2, 3]})
    df2 = pd.DataFrame(
        {
            "first_id": [0, 1, 1, 3],
            "all_nulls": [None, None, None, None],
            "quarter": ["a", "b", None, "c"],
        }
    )
    dataframes = {
        "first": (df1, "id"),
        "second": (df2, "index"),
    }
    relationships = [("first", "id", "second", "first_id")]
    es = ft.EntitySet("data", dataframes, relationships=relationships)
    es["second"].ww.set_types(
        logical_types={"all_nulls": "categorical", "quarter": "categorical"}
    )
    fm, features = ft.dfs(
        entityset=es,
        target_dataframe_name="first",
        trans_primitives=[],
        agg_primitives=["n_most_common"],
        max_depth=2,
    )

    # only the non-degenerate slice of the multi-output feature should survive
    multi_output, multi_output_features = ft.selection.remove_single_value_features(
        fm, features
    )
    assert multi_output.columns == ["N_MOST_COMMON(second.quarter)[0]"]
    assert len(multi_output_features) == 1
    assert multi_output_features[0].get_name() == multi_output.columns[0]

    es = make_ecommerce_entityset()
    fm, features = ft.dfs(
        entityset=es,
        target_dataframe_name="régions",
        trans_primitives=[],
        agg_primitives=["n_most_common"],
        max_depth=2,
    )
    matrix_with_slices, unsliced_features = ft.selection.remove_highly_null_features(
        fm, features
    )
    assert len(matrix_with_slices.columns) == 18
    assert len(unsliced_features) == 14

    # every name of every returned feature should still be a matrix column
    matrix_columns = set(matrix_with_slices.columns)
    for f in unsliced_features:
        for f_name in f.get_feature_names():
            assert f_name in matrix_columns
def es():
    return make_ecommerce_entityset()
def int_es():
    return make_ecommerce_entityset(with_integer_time_index=True)
def es(self):
    es = make_ecommerce_entityset()
    return es
# NOTE: the original notebook omits the imports and the ``string_count``
# function itself; they are filled in below (following the featuretools
# custom-primitives documentation) so the cells run end to end.
import featuretools as ft
from featuretools.primitives import Mean, Std, Sum, make_trans_primitive
from featuretools.variable_types import Numeric, Text


def string_count(column, string=None):
    # count how many times ``string`` occurs in each value of a text column
    assert string is not None, "string to count needs to be defined"
    return [element.lower().count(string) for element in column]


def string_count_get_name(self):
    return u"STRING_COUNT(%s, %s)" % (self.base_features[0].get_name(),
                                      '"' + str(self.kwargs['string']) + '"')

# %%
StringCount = make_trans_primitive(function=string_count,
                                   input_types=[Text],
                                   return_type=Numeric,
                                   cls_attributes={"get_name": string_count_get_name})

# %%
from featuretools.tests.testing_utils import make_ecommerce_entityset

es = make_ecommerce_entityset()
count_the_feat = StringCount(es['log']['comments'], string="the")

# Raw log data
# %%
es['log'].df.head()

# %% md
# Sum, mean, and standard deviation of the number of times "the" appears
# in the log table's comments field
# %%
feature_matrix, features = ft.dfs(entityset=es,
                                  target_entity="sessions",
                                  agg_primitives=[Sum, Mean, Std],
                                  seed_features=[count_the_feat])
feature_matrix[['STD(log.STRING_COUNT(comments, "the"))',
                'SUM(log.STRING_COUNT(comments, "the"))',
                'MEAN(log.STRING_COUNT(comments, "the"))']]
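# %%
# Sanity check (not in the original notebook): the custom ``get_name``
# defined above renders the base feature's name with the quoted argument,
# matching the column names selected from the feature matrix.
count_the_feat.get_name()  # expected: 'STRING_COUNT(comments, "the")'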
def es(feature_matrix):
    es = make_ecommerce_entityset()
    es.entity_from_dataframe('test', feature_matrix, index='test')
    return es