def test_transform_consistency():
    # Create dataframe
    df = pd.DataFrame({
        'a': [14, 12, 10],
        'b': [False, False, True],
        'b1': [True, True, False],
        'b12': [4, 5, 6],
        'P': [10, 15, 12]
    })
    es = ft.EntitySet(id='test')
    # Add dataframe to entityset
    es.entity_from_dataframe(entity_id='first',
                             dataframe=df,
                             index='index',
                             make_index=True)

    # Generate features
    feature_defs = ft.dfs(entityset=es,
                          target_entity='first',
                          trans_primitives=['and', 'add_numeric', 'or'],
                          features_only=True)

    # Check for correct ordering of features
    assert feature_with_name(feature_defs, 'a')
    assert feature_with_name(feature_defs, 'b')
    assert feature_with_name(feature_defs, 'b1')
    assert feature_with_name(feature_defs, 'b12')
    assert feature_with_name(feature_defs, 'P')
    assert feature_with_name(feature_defs, 'AND(b, b1)')
    assert not feature_with_name(
        feature_defs, 'AND(b1, b)')  # make sure it doesn't exist the other way
    assert feature_with_name(feature_defs, 'a + P')
    assert feature_with_name(feature_defs, 'b12 + P')
    assert feature_with_name(feature_defs, 'a + b12')
    assert feature_with_name(feature_defs, 'OR(b, b1)')
    assert feature_with_name(feature_defs, 'OR(AND(b, b1), b)')
    assert feature_with_name(feature_defs, 'OR(AND(b, b1), b1)')
Example #2
File: ftools.py, Project: mmaioli/tasks
    def parse_data(self):
        """
        Parse data from the dataframe into an EntitySet (Featuretools format).

        Parameters
        ----------
        None
        Returns
        ----------
        es: EntitySet
            The entity set of grouped data.
        """

        es = ft.EntitySet(id="Dados")

        columns = list(self.data.columns)
        variable_types = {}

        for indx, ftype in enumerate(self.feature_types[0].values):

            if ftype == 'Categorical':
                variable_types[columns[indx]] = vtypes.Categorical

        # Create EntitySet and load data
        es = es.entity_from_dataframe(entity_id="entity",
                                      dataframe=self.data,
                                      make_index=True,
                                      index=self.names,
                                      time_index=self.date_var,
                                      variable_types=variable_types)

        # Group the data if required
        # Note: comment this call out if the grouping variable is not present in the dataset
        es.normalize_entity(new_entity_id="normal",
                            base_entity_id="entity",
                            index=self.group_var[0])

        return es
Example #3
    def new_features(self):
        """
        Create new features from the current numeric features.
        The dataframe must contain only numeric values.

        Returns
        ----------
        feature_matrix : dataframe containing all the original features
        and the newly synthesized features
        """
        self.numeric_features()

        if (self.numeric_features.shape[1] ==
                self.numeric_features.select_dtypes(include=np.number).shape[1]):
            # Make an entityset and add the entity
            es = ft.EntitySet(id='id_1')
            es.entity_from_dataframe(entity_id='id_2',
                                     dataframe=self.numeric_features,
                                     make_index=True,
                                     index='new_index')
            # Run deep feature synthesis
            self.feature_matrix, self.feature_defs = ft.dfs(
                entityset=es,
                target_entity='id_2',
                agg_primitives=self.list_agg_primitives,
                trans_primitives=self.list_trans_primitives,
                max_depth=self.max_depth_value)

            # Add categorical features back to the features dataframe
            if self.categorical_col_name is not None:
                for col in self.categorical_col_name:
                    self.feature_matrix[col] = self.cat_features[col].values

            return self.feature_matrix
        else:
            raise ValueError("Data Frame contains non-numeric values")
Example #4
def get_cross_features(df, features, key='tim'):
    """
    :type df: DataFrame
    :type features: list[str]
    :rtype: DataFrame
    """
    use_df = copy.deepcopy(df.loc[:, [key] + features])
    es = ft.EntitySet(id='temperature_predict')
    es = es.entity_from_dataframe(entity_id='temp',
                                  dataframe=use_df,
                                  index=key)
    trans_primitives = [
        'add_numeric', 'subtract_numeric', 'multiply_numeric', 'divide_numeric'
    ]

    feature_matrix, _ = ft.dfs(
        entityset=es,
        target_entity='temp',
        max_depth=1,  # max_depth=1: only combine the original features to generate new ones
        verbose=1,
        trans_primitives=trans_primitives)
    features_df = pd.DataFrame(feature_matrix).reset_index()
    features_df.drop(columns=features, inplace=True)
    return features_df
Example #5
    def _set_entity_set(self, data: pd.DataFrame,
                        groups: Dict[str, Sequence[str]]) -> ft.EntitySet:
        """
        Set an entity set.

        Args:
            data: DataFrame
            groups: Dict of features groups.

        Returns:
            Featuretools entity set.
        """
        es = ft.EntitySet(id="main")

        index_name = self._index_name(data)

        for group, features in groups.items():
            es = es.entity_from_dataframe(
                entity_id=group,
                dataframe=data[features].reset_index(),
                index=index_name,
            )

        return es
Example #6
File: lbr_v2.py, Project: ikem55/HRsystem
    def _create_feature(self):
        """ マージしたデータから特徴量を生成する """
        raceuma_df = self.base_df.copy()[["競走コード", "馬番", "予想タイム指数", "予想展開", "クラス変動", "騎手評価", "調教師評価", "枠順評価", "脚質評価", "馬齢", "前走着順", "前走人気", "前走頭数", "騎手ランキング", "調教師ランキング"]]
        raceuma_df.loc[:, "競走馬コード"] = raceuma_df["競走コード"].astype(str).str.cat(raceuma_df["馬番"].astype(str))
        raceuma_df.drop("馬番", axis=1, inplace=True)
        # https://qiita.com/daigomiyoshi/items/d6799cc70b2c1d901fb5
        es = ft.EntitySet(id="race")
        es.entity_from_dataframe(entity_id='race', dataframe=raceuma_df, index="競走馬コード")
        es.normalize_entity(base_entity_id='race', new_entity_id='raceuma', index="競走コード")
        # Aggregation functions
        aggregation_list = ['count', 'min', 'max', 'mean']
        transform_list = []
        # run dfs
        feature_matrix, features_dfs = ft.dfs(entityset=es, target_entity='race', agg_primitives=aggregation_list, trans_primitives=transform_list, max_depth=2)
        print("_create_feature: feature_matrix", feature_matrix.shape)
        feature_matrix.head(3)

        # Get the data for the predicted first favorite
        ninki_df = self.base_df.query("予想人気==1")[["競走コード", "枠番", "性別コード", "予想タイム指数順位", "見習区分", "キャリア", "馬齢", "予想展開", "距離増減", "前走頭数", "前走人気", "テン乗り"]].add_prefix("人気_").rename(columns={"人気_競走コード":"競走コード"})
        # Get the data for the horses predicted to take the lead
        nige_df = self.base_df.query("予想展開==1")[["競走コード", "先行指数", "距離増減", "前走人気", "前走頭数", "テン乗り"]].add_prefix("逃げ_").rename(columns={"逃げ_競走コード":"競走コード"})
        self.base_df = pd.merge(feature_matrix, nige_df, on="競走コード")
        self.base_df = pd.merge(self.base_df, ninki_df, on="競走コード")
        self.base_df = pd.merge(self.base_df, self.ld.race_df, on="競走コード")
Example #7
def auto_adp_features(train, test, cols, entities):

    df_c = train[cols]
    es = ft.EntitySet(id='petfinder')
    es.entity_from_dataframe(entity_id="Pets", dataframe=df_c, index="PetID")
    ignored_variable =  {}
    ignored_variable.update({'Pets': entities})
    for e in entities:
        print(e)

        es.normalize_entity(base_entity_id='Pets', new_entity_id=e, index=e)
        feature_matrix, feature_names = ft.dfs(entityset=es,
                                               target_entity=e,
                                               max_depth=2,
                                               verbose=1,
                                               #n_jobs=3,
                                               ignore_variables=ignored_variable)
        fm = feature_matrix.add_prefix(e+"_")
        print(feature_names)
        fm.drop([e+"_COUNT(Pets)"], axis = 1, inplace=True)
        train = train.set_index(e).join(fm).reset_index()
        test = test.set_index(e).join(fm).reset_index()

    return train, test
Example #8
    def _make_entityset(self, input_df):
        es = ft.EntitySet()

        primary_key = find_primary_key(input_df)
        make_index = False

        if primary_key is None:
            primary_key = "D3M_INDEX"
            make_index = True

        cols_to_use = input_df.metadata.list_columns_with_semantic_types(
            [st.PRIMARY_KEY, st.ATTRIBUTE])

        input_df = input_df.select_columns(cols_to_use)

        variable_types = get_featuretools_variable_types(input_df)

        es.entity_from_dataframe(entity_id=TARGET_ENTITY,
                                 dataframe=pd.DataFrame(input_df.copy()),
                                 index=primary_key,
                                 make_index=make_index,
                                 variable_types=variable_types)

        return es
Example #9
def create_entity_set(data: pd.DataFrame, train_table: str,
                      test_table: str) -> ft.EntitySet:
    print(f"\nCreating entity set based on client data")
    start = time.monotonic()
    es = ft.EntitySet(id='clients')

    es = es.entity_from_dataframe(entity_id='combined_train_test',
                                  dataframe=data['combined_train_test'],
                                  index='SK_ID_CURR')

    es = es.entity_from_dataframe(entity_id='bureau',
                                  dataframe=data['bureau'],
                                  index='SK_ID_BUREAU')

    es = es.entity_from_dataframe(entity_id='bureau_balance',
                                  dataframe=data['bureau_balance'],
                                  make_index=True,
                                  index='bureaubalance_index')

    es = es.entity_from_dataframe(entity_id='previous_application',
                                  dataframe=data['previous_application'],
                                  index='SK_ID_PREV')

    es = es.add_relationships([
        ft.Relationship(es['combined_train_test']['SK_ID_CURR'],
                        es['bureau']['SK_ID_CURR']),
        ft.Relationship(es['bureau']['SK_ID_BUREAU'],
                        es['bureau_balance']['SK_ID_BUREAU']),
        ft.Relationship(es['combined_train_test']['SK_ID_CURR'],
                        es['previous_application']['SK_ID_CURR'])
    ])
    end = time.monotonic()

    print(f"  Entity set creation completed in {round(end - start)} seconds")

    return es
Example #10
    def generate_entity_set(self):
        """ 05. Define the entity set along with the table relations. """

        import featuretools as ft
        self.es = ft.EntitySet(id='clients')
        self.es = self.es.entity_from_dataframe(
            entity_id='users',
            dataframe=self.users.reset_index(),
            index='user_id',
            time_index='created_date')

        for d in self.feature_windows:
            self.es = self.es.entity_from_dataframe(
                entity_id=f'transactions_{d}d',
                dataframe=self.transactions.query(
                    f' {d}  > days_before_cutoff >= 0  ').reset_index(),
                index='transaction_id',
                time_index='created_date')
            # Add the relationship between customers and transactions
            self.es = self.es.add_relationship(
                ft.Relationship(self.es['users']['user_id'],
                                self.es[f'transactions_{d}d']['user_id']))

        self.next(self.generate_features)
Example #11
def create_entitysets(part_num, partition_name, dfs_params):
    es_dict = {}
    es = ft.EntitySet(id = partition_name)
    for target in dfs_params['target']: 
        for t_k, t_v in target.items():
            df_target = pd.read_csv(os.path.dirname(os.path.abspath(__file__)) +'/partition/'+partition_name+'/'+part_num+'/'+t_k+'.csv')
            index = index_needed(t_k,t_v)
            if index == True:
                es = es.entity_from_dataframe(entity_id = t_k, dataframe = df_target, make_index = True, index = t_v)
            else:
                es = es.entity_from_dataframe(entity_id = t_k, dataframe = df_target, index = t_v)
            for frame in dfs_params['frames']:
                for f_k, f_v in frame.items():
                    index = index_needed(f_k,f_v)
                    df = pd.read_csv(os.path.dirname(os.path.abspath(__file__)) +'/partition/'+partition_name+'/'+part_num+'/'+f_k+'.csv')
                    if index == True:
                        es = es.entity_from_dataframe(entity_id = f_k, dataframe = df, make_index = True, index = f_k+'_'+f_v)
                    else:
                        es = es.entity_from_dataframe(entity_id = f_k, dataframe = df, index = f_v)
                    r = ft.Relationship(es[t_k][t_v], es[f_k][f_v])
                    es = es.add_relationships([r])
            es_dict.update({t_k: es, 'num': part_num})
    return es_dict
Example #12
def dd_mock_customer(pd_mock_customer):
    dataframes = {}
    for df in pd_mock_customer.dataframes:
        dd_df = dd.from_pandas(df.reset_index(drop=True), npartitions=4)
        dd_df.ww.init(schema=df.ww.schema)
        dataframes[df.ww.name] = (
            dd_df,
            df.ww.index,
            df.ww.time_index,
            df.ww.logical_types,
        )
    relationships = [
        (
            rel._parent_dataframe_name,
            rel._parent_column_name,
            rel._child_dataframe_name,
            rel._child_column_name,
        )
        for rel in pd_mock_customer.relationships
    ]

    return ft.EntitySet(
        id=pd_mock_customer.id, dataframes=dataframes, relationships=relationships
    )
Example #13
def diamond_es():
    regions_df = pd.DataFrame({
        'id': range(3),
        'name': ['Northeast', 'Midwest', 'South'],
    })
    stores_df = pd.DataFrame({
        'id': range(5),
        'region_id': [0, 1, 2, 2, 1],
    })
    customers_df = pd.DataFrame({
        'id': range(5),
        'region_id': [1, 0, 0, 1, 1],
        'name': ['A', 'B', 'C', 'D', 'E'],
    })
    transactions_df = pd.DataFrame({
        'id': range(8),
        'store_id': [4, 4, 2, 3, 4, 0, 1, 1],
        'customer_id': [3, 0, 2, 4, 3, 3, 2, 3],
        'amount': [100, 40, 45, 83, 13, 94, 27, 81],
    })

    entities = {
        'regions': (regions_df, 'id'),
        'stores': (stores_df, 'id'),
        'customers': (customers_df, 'id'),
        'transactions': (transactions_df, 'id'),
    }
    relationships = [
        ('regions', 'id', 'stores', 'region_id'),
        ('regions', 'id', 'customers', 'region_id'),
        ('stores', 'id', 'transactions', 'store_id'),
        ('customers', 'id', 'transactions', 'customer_id'),
    ]
    return ft.EntitySet(id='ecommerce_diamond',
                        entities=entities,
                        relationships=relationships)
Example #14
def load_retail(id='demo_retail_data', nrows=None):
    '''
    Returns the retail entityset example.

    Args:
        id (str):  Id to assign to EntitySet.
        nrows (int):  Number of rows to load of item_purchases
            entity. If None, load all.

    Examples:

        .. ipython::
            :verbatim:

            In [1]: import featuretools as ft

            In [2]: es = ft.demo.load_retail()

            In [3]: es
            Out[3]:
            Entityset: demo_retail_data
              Entities:
                invoices (shape = [25900, 3])
                items (shape = [4070, 3])
                customers (shape = [4373, 3])
                item_purchases (shape = [541909, 6])

        Load in subset of data

        .. ipython::
            :verbatim:

            In [2]: es = ft.demo.load_retail(nrows=1000)

            In [3]: es
            Out[3]:
            Entityset: demo_retail_data
              Entities:
                invoices (shape = [66, 3])
                items (shape = [590, 3])
                customers (shape = [49, 3])
                item_purchases (shape = [1000, 6])

    '''
    demo_save_path = make_retail_pathname(nrows)

    es = ft.EntitySet(id)
    csv_s3 = "s3://featuretools-static/uk_online_retail.csv"

    if not os.path.isfile(demo_save_path):
        df = pd.read_csv(csv_s3, nrows=nrows, parse_dates=["InvoiceDate"])
        df.to_csv(demo_save_path)

    df = pd.read_csv(demo_save_path, nrows=nrows, parse_dates=["InvoiceDate"])

    df.rename(columns={"Unnamed: 0": 'item_purchase_id'}, inplace=True)

    es.entity_from_dataframe("item_purchases",
                             dataframe=df,
                             index="item_purchase_id",
                             time_index="InvoiceDate")

    es.normalize_entity(new_entity_id="items",
                        base_entity_id="item_purchases",
                        index="StockCode",
                        additional_variables=["Description"])

    es.normalize_entity(new_entity_id="invoices",
                        base_entity_id="item_purchases",
                        index="InvoiceNo",
                        additional_variables=["CustomerID", "Country"])

    es.normalize_entity(new_entity_id="customers",
                        base_entity_id="invoices",
                        index="CustomerID",
                        additional_variables=["Country"])
    es.add_last_time_indexes()

    return es
Example #15
# dataframe is instead loan_id.

# When we create an entity in featuretools, we have to identify which column of the dataframe is
# the index. If the data does not have a unique index we can tell featuretools to make an index
# for the entity by passing in make_index = True and specifying a name for the index (a minimal
# make_index sketch follows the loans entity below). If the data also has a uniquely identifying
# time index, we can pass that in as the time_index parameter.

# Featuretools will automatically infer the variable types (numeric, categorical, datetime) of
# the columns in our data, but we can also pass in specific datatypes to override this behavior.
# As an example, even though the repaid column in the loans dataframe is represented as an integer,
# we can tell featuretools that this is a categorical feature since it can only take on two
# discrete values. This is done using a dictionary with the variable names as keys and the
# variable types as values.

# Create new EntitySet
es = ft.EntitySet(id='clients')

# Create an entity from the client DataFrame
# This dataframe already has an index and a time index
es = es.entity_from_dataframe(entity_id='clients',
                              dataframe=clients,
                              index='client_id',
                              time_index='joined')
# Create an entity from the loans DataFrame
# This DataFrame already has an index and a time index
es = es.entity_from_dataframe(
    entity_id='loans',
    dataframe=loans,
    variable_types={'repaid': ft.variable_types.Categorical},
    index='loan_id',
    time_index='loan_start')
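
# A minimal sketch, not part of the original snippet: when a table has no unique
# id column, passing make_index=True tells featuretools to create one with the
# given name while the entity is added. The payments_demo dataframe below is
# purely illustrative.
payments_demo = pd.DataFrame({'loan_id': [10243, 10243, 10984],
                              'payment_amount': [1178, 2439, 1125],
                              'payment_date': pd.to_datetime(
                                  ['2002-05-31', '2002-06-18', '2003-03-05'])})
es_demo = ft.EntitySet(id='make_index_demo')
es_demo = es_demo.entity_from_dataframe(entity_id='payments',
                                        dataframe=payments_demo,
                                        make_index=True,
                                        index='payment_id',
                                        time_index='payment_date')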
    ("encode", OrdinalEncoder()),
])

numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])
# -

combine[["Sex", "Embarked", "Title"]] = categorical_transformer.fit_transform(
    combine[["Sex", "Embarked", "Title"]])

combine[["Age",
         "Fare"]] = numeric_transformer.fit_transform(combine[["Age", "Fare"]])

# +
es = ft.EntitySet(id="titanic_data")

es = es.entity_from_dataframe(
    entity_id="combine",
    dataframe=combine.drop(["Survived"], axis=1),
    variable_types={
        "Embarked": ft.variable_types.Categorical,
        "Sex": ft.variable_types.Boolean,
        "Title": ft.variable_types.Categorical,
    },
    index="PassengerId",
)

es
# -
Example #17
def datashop_to_entityset(filename):
    # Make an EntitySet called Dataset with the following structure
    #
    # schools       students     problems
    #        \        |         /
    #   classes   sessions   problem steps
    #          \     |       /
    #           transactions  -- attempts
    #

    # Convert the csv into a dataframe using pandas
    data = pd.read_csv(filename, sep='\t', parse_dates=True)

    # Make the Transaction Id the index column of the dataframe and clean other columns
    data.index = data['Transaction Id']
    data = data.drop(['Row'], axis=1)
    data['Outcome'] = data['Outcome'].map({'INCORRECT': 0, 'CORRECT': 1})

    # Make a new 'End Time' column which is start_time + duration
    # This is /super useful/ because you shouldn't be using outcome data at
    # any point before the student has attempted the problem.
    data['End Time'] = pd.to_datetime(
        data['Time']) + pd.to_timedelta(pd.to_numeric(data['Duration (sec)']), 's')

    # Make a list of all the KC and CF columns present
    kc_and_cf_cols = [x for x in data.columns if (
        x.startswith('KC ') or x.startswith('CF '))]

    # Now we start making an entityset. We make 'End Time' a secondary time index for 'Outcome'
    # even though the primary time index for a row is 'Time'; this prevents label leakage.
    es = ft.EntitySet('Dataset')
    es.entity_from_dataframe(entity_id='transactions',
                             index='Transaction Id',
                             dataframe=data,
                             variable_types={'Outcome': vtypes.Boolean, 'Attempt At Step': vtypes.Categorical},
                             time_index='Time',
                             secondary_time_index={'End Time': [
                                 'Outcome', 'Is Last Attempt', 'Duration (sec)']}
                             )

    # Every transaction has a `problem_step` which is associated to a problem
    es.normalize_entity(base_entity_id='transactions',
                        new_entity_id='problem_steps',
                        index='Step Name',
                        additional_variables=['Problem Name'] + kc_and_cf_cols,
                        make_time_index=True)

    es.normalize_entity(base_entity_id='problem_steps',
                        new_entity_id='problems',
                        index='Problem Name',
                        make_time_index=True)

    # Every transaction has a `session` associated to a student
    es.normalize_entity(base_entity_id='transactions',
                        new_entity_id='sessions',
                        index='Session Id',
                        additional_variables=['Anon Student Id'],
                        make_time_index=True)

    es.normalize_entity(base_entity_id='sessions',
                        new_entity_id='students',
                        index='Anon Student Id',
                        make_time_index=True)

    # Every transaction has a `class` associated to a school
    es.normalize_entity(base_entity_id='transactions',
                        new_entity_id='classes',
                        index='Class',
                        additional_variables=['School'],
                        make_time_index=False)

    es.normalize_entity(base_entity_id='classes',
                        new_entity_id='schools',
                        index='School',
                        make_time_index=False)

    # And because we might be interested in creating features grouped
    # by attempts we normalize by those as well.
#     es.normalize_entity(base_entity_id='transactions',
#                         new_entity_id='attempts',
#                         index='Attempt At Step',
#                         additional_variables=[],
#                         make_time_index=False)
    return es
Example #18
    def es_set(self):
        self.__es = ft.EntitySet(id="application_test")
        self.__es = self.__es.entity_from_dataframe(
            entity_id="application_test",
            dataframe=self.__application_test,
            index="SK_ID_CURR",
            variable_types=None if len(self.__application_test_categorical)
            == 0 else self.__application_test_categorical)
        self.__es = self.__es.entity_from_dataframe(
            entity_id="bureau",
            dataframe=self.__bureau,
            index="SK_ID_BUREAU",
            variable_types=None if len(
                self.__bureau_categorical) == 0 else self.__bureau_categorical)
        self.__es = self.__es.entity_from_dataframe(
            entity_id="bureau_balance",
            dataframe=self.__bureau_balance,
            make_index=True,
            index="bureau_balance_id",
            variable_types=None if len(self.__bureau_balance_categorical) == 0
            else self.__bureau_balance_categorical)
        self.__es = self.__es.entity_from_dataframe(
            entity_id="previous_application",
            dataframe=self.__previous_application,
            index="SK_ID_PREV",
            variable_types=None if len(self.__previous_application_categorical)
            == 0 else self.__previous_application_categorical)
        self.__es = self.__es.entity_from_dataframe(
            entity_id="pos_cash_balance",
            dataframe=self.__pos_cash_balance,
            make_index=True,
            index="pos_cash_balance_id",
            variable_types=None if len(self.__pos_cash_balance_categorical)
            == 0 else self.__pos_cash_balance_categorical)
        self.__es = self.__es.entity_from_dataframe(
            entity_id="credit_card_balance",
            dataframe=self.__credit_card_balance,
            make_index=True,
            index="credit_card_balance_id",
            variable_types=None if len(self.__credit_card_balance_categorical)
            == 0 else self.__credit_card_balance_categorical)
        self.__es = self.__es.entity_from_dataframe(
            entity_id="installments_payments",
            dataframe=self.__installments_payments,
            make_index=True,
            index="installments_payments_id",
            variable_types=None
            if len(self.__installments_payments_categorical) == 0 else
            self.__installments_payments_categorical)

        self.__es = self.__es.add_relationship(
            ft.Relationship(self.__es["application_test"]["SK_ID_CURR"],
                            self.__es["bureau"]["SK_ID_CURR"]))
        self.__es = self.__es.add_relationship(
            ft.Relationship(self.__es["bureau"]["SK_ID_BUREAU"],
                            self.__es["bureau_balance"]["SK_ID_BUREAU"]))
        self.__es = self.__es.add_relationship(
            ft.Relationship(self.__es["application_test"]["SK_ID_CURR"],
                            self.__es["previous_application"]["SK_ID_CURR"]))
        self.__es = self.__es.add_relationship(
            ft.Relationship(self.__es["previous_application"]["SK_ID_PREV"],
                            self.__es["pos_cash_balance"]["SK_ID_PREV"]))
        self.__es = self.__es.add_relationship(
            ft.Relationship(self.__es["previous_application"]["SK_ID_PREV"],
                            self.__es["credit_card_balance"]["SK_ID_PREV"]))
        self.__es = self.__es.add_relationship(
            ft.Relationship(self.__es["previous_application"]["SK_ID_PREV"],
                            self.__es["installments_payments"]["SK_ID_PREV"]))
        self.__es["previous_application"][
            "NAME_CONTRACT_STATUS_Refused"].interesting_values = [1]
        self.__es["previous_application"][
            "NAME_PRODUCT_TYPE_walk-in"].interesting_values = [1]
        self.__es["previous_application"][
            "CODE_REJECT_REASON_HC"].interesting_values = [1]
Example #19
		if feat_name != '':
			feature_list.append(feat_name)

	f.close()
	return feature_list

# finally let's import the data
df = pd.read_csv("creditcard.csv")
df = df.drop(['Time'], axis=1) #,'V28','V27','V26','V25','V24','V23','V22','V20','V15','V13','V8'], axis =1)
df = df.dropna()



# ok and then we'll do all the featuretools things that need to happen
es = ft.EntitySet(id = 'card')  # an EntitySet is a container for our dataframes (entities) and their relationships

# make an entity from the observations data
es = es.entity_from_dataframe(dataframe = df.drop('Class', axis=1),
								entity_id = 'obs',
								index = 'index')

feature_matrix, feature_names = ft.dfs(entityset=es, target_entity='obs',
										#agg_primitives = ['min', 'max', 'mean', 'count', 'sum', 'std', 'trend'],
										trans_primitives = ['divide_by_feature', 'add_numeric', 'less_than_equal_to', 'greater_than_equal_to_scalar', 'multiply_numeric', 'subtract_numeric_scalar', 'divide_numeric_scalar', 'add_numeric_scalar', 'subtract_numeric', 'divide_numeric', 'percentile', 'greater_than', 'less_than', 'multiply_numeric_scalar', 'greater_than_equal_to', 'modulo_by_feature', 'scalar_subtract_numeric_feature', 'absolute', 'modulo_numeric'],
										max_depth=1,
										n_jobs=1,
										verbose=1)

# alright here is where we're going to want to cut down all the variables
feature_list = fetch_feature_list()
Example #20
stats.columns = ['loan_amount','payment','missed']
clients2 = clients.merge(stats, left_on='client_id', right_index=True, how='left')

# Then add some features manually
clients2['join_month'] = clients2['joined'].dt.month
clients2['log_income'] = np.log(clients2['income'])

# 5 new features with 7 lines of code
clients2.head()
##########################################################################
# So far so good.
# OK Let's use featuretools
# 
# An ft entity is simply a data-frame. And ft uses sets of them - an entityset!
# Basically we're creating metadata. NB payments has no payment_id, so create one.
es = ft.EntitySet(id = 'myentityset')
es = es.entity_from_dataframe(entity_id = 'clients', 
                              dataframe=clients, 
                              index='client_id',
                              time_index='joined')

es = es.entity_from_dataframe(entity_id = 'loans', 
                              dataframe=loans, 
                              variable_types = {'repaid': ft.variable_types.Categorical},
                              index='loan_id',
                              time_index='loan_start')

es = es.entity_from_dataframe(entity_id = 'payments', 
                              dataframe=payments, 
                              variable_types = {'missed': ft.variable_types.Categorical},
                              make_index = True,
Example #21
# Let's merge the hit and miss dfs for modelling
df = pd.merge(hit_df, miss_df, how='outer')

# Get rid of duplicate values again
df = df.drop_duplicates(subset='CompoundSMILES', keep="first")

# Produce rdkit features from SMILES
df, properties = rdkit_utils.get_rdkit_properties(df)

# Get X, y and training and test data
y = df['Site_No']
X = df.drop(columns=['Site_No'])

# Let's try adding some feature engineering from featuretools
# Make an entityset and add the entity
es = ft.EntitySet(id='chem_features')
es.entity_from_dataframe(entity_id='data',
                         dataframe=X,
                         make_index=False,
                         index='CompoundSMILES')

# Run deep feature synthesis with transformation primitives
X, feature_defs = ft.dfs(entityset=es,
                         max_depth=1,
                         target_entity='data',
                         agg_primitives=["mean", "sum", "mode"],
                         trans_primitives=[
                             'add_numeric', 'multiply_numeric', 'cum_count',
                             'cum_mean', 'cum_sum', 'equal'
                         ])
Example #22
    def __init__(self, id_name=None):
        if id_name is None:
            id_name = 'auto_create'
        self.auto_create = ft.EntitySet(id=id_name)
Example #23
    def get_create_feature_race_df(self, base_df, race_df):
        """ マージしたデータから特徴量を生成する """
        print("_create_feature")
        raceuma_df = base_df[[
            "RACE_KEY", "UMABAN", "脚質", "距離適性", "父馬産駒連対平均距離", "母父馬産駒連対平均距離",
            "IDM", "テン指数", "ペース指数", "上がり指数", "位置指数", "IDM結果_1", "テン指数結果_1",
            "上がり指数結果_1", "ペース指数結果_1", "レースP指数結果_1", "先行率_1", "追込率_1", "fa_1_1",
            "fa_2_1", "fa_3_1", "fa_4_1", "fa_5_1"
        ]]
        raceuma_df.loc[:, "RACE_UMA_KEY"] = raceuma_df[
            "RACE_KEY"] + raceuma_df["UMABAN"]
        raceuma_df.drop("UMABAN", axis=1, inplace=True)
        # https://qiita.com/daigomiyoshi/items/d6799cc70b2c1d901fb5
        es = ft.EntitySet(id="race")
        es.entity_from_dataframe(entity_id='race',
                                 dataframe=race_df,
                                 index="RACE_KEY")
        es.entity_from_dataframe(entity_id='raceuma',
                                 dataframe=raceuma_df,
                                 index="RACE_UMA_KEY")
        relationship = ft.Relationship(es['race']["RACE_KEY"],
                                       es['raceuma']["RACE_KEY"])
        es = es.add_relationship(relationship)
        print(es)
        # Aggregation functions
        aggregation_list = ['min', 'max', 'mean', 'skew', 'percent_true']
        transform_list = []
        # run dfs
        print("un dfs")
        feature_matrix, features_dfs = ft.dfs(entityset=es,
                                              target_entity='race',
                                              agg_primitives=aggregation_list,
                                              trans_primitives=transform_list,
                                              max_depth=2)
        print("_create_feature: feature_matrix", feature_matrix.shape)

        # Get the data for the predicted first favorite
        ninki_df = base_df.query("基準人気順位==1")[[
            "RACE_KEY",
            "脚質",
            "距離適性",
            "上昇度",
            "激走指数",
            "蹄コード",
            "見習い区分",
            "枠番",
            "総合印",
            "IDM印",
            "情報印",
            "騎手印",
            "厩舎印",
            "調教印",
            "激走印",
            "展開記号",
            "輸送区分",
            "騎手期待単勝率",
            "騎手期待3着内率",
            #"激走タイプ", "休養理由分類コード", "芝ダ障害フラグ", "距離フラグ", "クラスフラグ", "転厩フラグ", "去勢フラグ", "乗替フラグ",
            "放牧先ランク",
            "厩舎ランク",
            "調教量評価",
            "仕上指数変化",
            "調教評価",
            "IDM",
            "騎手指数",
            "情報指数",
            "総合指数",
            "人気指数",
            "調教指数",
            "厩舎指数",
            "テン指数",
            "ペース指数",
            "上がり指数",
            "位置指数",
            "追切指数",
            "仕上指数",
            "IDM結果_1",
            "IDM結果_2"
        ]].add_prefix("人気_").rename(columns={"人気_RACE_KEY": "RACE_KEY"})
        # Get the data for the horses predicted to take the lead
        nige_df = base_df.query("展開記号=='1'")[[
            "RACE_KEY", "脚質", "距離適性", "上昇度", "激走指数", "蹄コード", "見習い区分", "枠番",
            "総合印", "IDM印", "基準人気順位", "輸送区分", "IDM", "騎手指数", "テン指数", "ペース指数",
            "上がり指数", "位置指数", "追切指数", "仕上指数", "テン指数結果_1", "上がり指数結果_1",
            "ペース指数結果_1", "レースP指数結果_1", "テン指数結果_2", "上がり指数結果_2", "ペース指数結果_2",
            "レースP指数結果_2", "先行率_1", "先行率_2", "距離", "距離_1"
        ]].add_prefix("逃げ_").rename(columns={"逃げ_RACE_KEY": "RACE_KEY"})
        # "激走タイプ", "休養理由分類コード", "芝ダ障害フラグ", "距離フラグ", "クラスフラグ", "転厩フラグ", "去勢フラグ", "乗替フラグ", "斤量_2","斤量_1",
        nige_df.loc[:, "逃げ_距離増減"] = nige_df["逃げ_距離"] - nige_df["逃げ_距離_1"]
        nige_df.drop(["逃げ_距離", "逃げ_距離_1"], axis=1, inplace=True)
        nige_ddf = nige_df.groupby("RACE_KEY")
        nige_df2 = nige_df.loc[nige_ddf["逃げ_テン指数"].idxmax(), :]
        # Get the data for the horses predicted to have the fastest closing pace
        agari_df = base_df.query("展開記号=='2'")[[
            "RACE_KEY", "脚質", "距離適性", "上昇度", "激走指数", "蹄コード", "見習い区分", "枠番",
            "総合印", "IDM印", "基準人気順位", "輸送区分", "IDM", "騎手指数", "テン指数", "ペース指数",
            "上がり指数", "位置指数", "追切指数", "仕上指数", "テン指数結果_1", "上がり指数結果_1",
            "ペース指数結果_1", "レースP指数結果_1", "テン指数結果_2", "上がり指数結果_2", "ペース指数結果_2",
            "レースP指数結果_2", "先行率_1", "先行率_2"
        ]].add_prefix("上り_").rename(columns={"上り_RACE_KEY": "RACE_KEY"})
        # "激走タイプ", "休養理由分類コード", "芝ダ障害フラグ", "距離フラグ", "クラスフラグ", "転厩フラグ", "去勢フラグ", "乗替フラグ","斤量_1", "斤量_2",

        base_df = pd.merge(feature_matrix, nige_df2, on="RACE_KEY", how="left")
        base_df = pd.merge(base_df, agari_df, on="RACE_KEY", how="left")
        base_df = pd.merge(base_df, ninki_df, on="RACE_KEY")
        return base_df
Example #24
import featuretools as ft
if __name__ == '__main__':
    # Base raw data
    data = ft.load_mock_customer()
    # Pull out the individual tables from the raw data
    # session
    sessions_df = data["sessions"]
    # products
    products_df = data["products"]
    # transactions
    transactions_df = data["transactions"]
    # customers
    customers_df = data["customers"]

    # Create the EntitySet
    es = ft.EntitySet(id="tests")
    # Add the entities
    es = es.entity_from_dataframe(
                entity_id="trans_entity",
                dataframe=transactions_df,
                index="transaction_id",
    )

    es = es.entity_from_dataframe(
                entity_id="session_entity",
                dataframe=sessions_df,
                index="session_id",
    )

    es = es.entity_from_dataframe(
Example #25
def test_empty_child_dataframe():
    parent_df = pd.DataFrame({"id": [1]})
    child_df = pd.DataFrame({
        "id": [1, 2, 3],
        "parent_id": [1, 1, 1],
        "time_index": pd.date_range(start='1/1/2018', periods=3),
        "value": [10, 5, 2]
    })

    es = ft.EntitySet(id="blah")
    es.entity_from_dataframe(entity_id="parent",
                             dataframe=parent_df,
                             index="id")
    es.entity_from_dataframe(entity_id="child",
                             dataframe=child_df,
                             index="id",
                             time_index="time_index")
    es.add_relationship(
        ft.Relationship(es["parent"]["id"], es["child"]["parent_id"]))

    # create regular agg
    count = ft.Feature(es["child"]['id'],
                       parent_entity=es["parent"],
                       primitive=Count)

    # create agg feature that requires multiple arguments
    trend = ft.Feature([es["child"]['value'], es["child"]['time_index']],
                       parent_entity=es["parent"],
                       primitive=Trend)

    # create aggs with where
    where = ft.Feature(es["child"]["value"]) == 1
    count_where = ft.Feature(es["child"]['id'],
                             parent_entity=es["parent"],
                             where=where,
                             primitive=Count)
    trend_where = ft.Feature([es["child"]['value'], es["child"]['time_index']],
                             parent_entity=es["parent"],
                             where=where,
                             primitive=Trend)

    # cutoff time before all rows
    fm = ft.calculate_feature_matrix(
        entityset=es,
        features=[count, count_where, trend, trend_where],
        cutoff_time=pd.Timestamp("12/31/2017"))
    names = [
        count.get_name(),
        count_where.get_name(),
        trend.get_name(),
        trend_where.get_name()
    ]
    assert_array_equal(fm[names], [[0, 0, np.nan, np.nan]])

    # cutoff time after all rows, but where clause filters all rows
    fm2 = ft.calculate_feature_matrix(entityset=es,
                                      features=[count_where, trend_where],
                                      cutoff_time=pd.Timestamp("1/4/2018"))
    names = [count_where.get_name(), trend_where.get_name()]
    assert_array_equal(fm2[names], [[0, np.nan]])
Example #26
    def __init__(self, sparse_feats=None, dense_feats=None):
        self.sparse_feats = sparse_feats
        self.dense_feats = dense_feats
        self.es = ft.EntitySet(id='MAIN')
Example #27
def load_mock_customer(n_customers=5,
                       n_products=5,
                       n_sessions=35,
                       n_transactions=500,
                       random_seed=0,
                       return_single_table=False,
                       return_entityset=False):
    """Return dataframes of mock customer data"""

    random.seed(random_seed)

    customers_df = pd.DataFrame({"customer_id": range(1, n_customers + 1)})
    customers_df["zip_code"] = choice(
        ["60091", "02139"],
        n_customers,
    )
    customers_df["join_date"] = pd.date_range(
        '1/1/2008', periods=n_customers,
        freq='50d')  # todo make these less regular

    products_df = pd.DataFrame({"product_id": range(1, n_products + 1)})
    products_df["brand"] = choice(["A", "B", "C"], n_products)

    sessions_df = pd.DataFrame({"session_id": range(1, n_sessions + 1)})
    sessions_df["customer_id"] = choice(customers_df["customer_id"],
                                        n_sessions)
    sessions_df["device"] = choice(["desktop", "mobile", "tablet"], n_sessions)

    transactions_df = pd.DataFrame(
        {"transaction_id": range(1, n_transactions + 1)})
    transactions_df["session_id"] = choice(sessions_df["session_id"],
                                           n_transactions)
    transactions_df = transactions_df.sort_values("session_id").reset_index(
        drop=True)
    transactions_df["transaction_time"] = pd.date_range(
        '1/1/2014', periods=n_transactions,
        freq='65s')  # todo make these less regular
    transactions_df["product_id"] = pd.Categorical(
        choice(products_df["product_id"], n_transactions))
    transactions_df["amount"] = random.randint(500, 15000,
                                               n_transactions) / 100.0

    # calculate and merge in session start
    # based on the times we came up with for transactions
    session_starts = transactions_df.drop_duplicates("session_id")[[
        "session_id", "transaction_time"
    ]].rename(columns={"transaction_time": "session_start"})
    sessions_df = sessions_df.merge(session_starts)

    if return_single_table:
        return transactions_df.merge(sessions_df).merge(customers_df).merge(
            products_df).reset_index(drop=True)

    elif return_entityset:
        es = ft.EntitySet(id="transactions")
        es = es.entity_from_dataframe(
            entity_id="transactions",
            dataframe=transactions_df,
            index="transaction_id",
            time_index="transaction_time",
            variable_types={"product_id": ft.variable_types.Categorical})

        es = es.entity_from_dataframe(entity_id="products",
                                      dataframe=products_df,
                                      index="product_id")

        es = es.entity_from_dataframe(entity_id="sessions",
                                      dataframe=sessions_df,
                                      index="session_id",
                                      time_index="session_start")

        es = es.entity_from_dataframe(entity_id="customers",
                                      dataframe=customers_df,
                                      index="customer_id",
                                      time_index="join_date")

        rels = [
            ft.Relationship(es["products"]["product_id"],
                            es["transactions"]["product_id"]),
            ft.Relationship(es["sessions"]["session_id"],
                            es["transactions"]["session_id"]),
            ft.Relationship(es["customers"]["customer_id"],
                            es["sessions"]["customer_id"])
        ]
        es = es.add_relationships(rels)
        es.add_last_time_indexes()
        return es

    return {
        "customers": customers_df,
        "sessions": sessions_df,
        "transactions": transactions_df,
        "products": products_df
    }
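
# A minimal usage sketch, not part of the original snippet, covering the three
# return modes of load_mock_customer defined above (this assumes the module's
# own imports, e.g. pandas and numpy.random's choice/randint, are in place):
tables = load_mock_customer(n_customers=3, n_sessions=10, n_transactions=50)
flat_df = load_mock_customer(return_single_table=True)
es_mock = load_mock_customer(return_entityset=True)
print(sorted(tables.keys()), flat_df.shape, es_mock)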
Example #28
# imports

from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
import featuretools as ft



if __name__ == "__main__":
    dataset = load_iris()
    X = dataset.data
    y = dataset.target
    iris_feature_names = dataset.feature_names
    df = pd.DataFrame(X, columns=iris_feature_names)
    es = ft.EntitySet(id='single_dataframe')  # the id identifies the entity set
    # Add a dataframe to the entity set and name it iris
    es.entity_from_dataframe(entity_id='iris',
                             dataframe=df,
                             index='index',
                             make_index=True)
    trans_primitives = ['add_numeric', 'subtract_numeric', 'multiply_numeric', 'divide_numeric']  # add/subtract/multiply/divide pairs of columns to generate new features
    feature_matrix, feature_names = ft.dfs(entityset=es,
                                            target_entity='iris',
                                            max_depth=1,    # max_depth=1: only operate on the original features to create new ones
                                            verbose=1,
                                            trans_primitives=trans_primitives
                                            )
    ft.list_primitives()  # list the available primitives
    # features_df = pd.DataFrame(feature_matrix, columns= feature_names)
    # print(features_df.head())
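
    # A minimal sketch, not part of the original snippet: ft.list_primitives()
    # returns a DataFrame (assumed here to expose 'name' and 'type' columns),
    # so its result can be filtered instead of being discarded as above.
    primitives = ft.list_primitives()
    trans_names = primitives[primitives['type'] == 'transform']['name']
    print(trans_names.head(10).tolist())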
Example #29
def test_empty_child_dataframe(parent_child):
    parent_df, child_df = parent_child
    if not isinstance(parent_df, pd.DataFrame):
        parent_vtypes = {'id': variable_types.Index}
        child_vtypes = {
            'id': variable_types.Index,
            'parent_id': variable_types.Numeric,
            'time_index': variable_types.Datetime,
            'value': variable_types.Numeric,
            'cat': variable_types.Categorical
        }
    else:
        parent_vtypes = None
        child_vtypes = None
    es = ft.EntitySet(id="blah")
    es.entity_from_dataframe(entity_id="parent",
                             dataframe=parent_df,
                             index="id",
                             variable_types=parent_vtypes)
    es.entity_from_dataframe(entity_id="child",
                             dataframe=child_df,
                             index="id",
                             time_index="time_index",
                             variable_types=child_vtypes)
    es.add_relationship(
        ft.Relationship(es["parent"]["id"], es["child"]["parent_id"]))

    # create regular agg
    count = ft.Feature(es["child"]['id'],
                       parent_entity=es["parent"],
                       primitive=Count)

    # create agg feature that requires multiple arguments
    trend = ft.Feature([es["child"]['value'], es["child"]['time_index']],
                       parent_entity=es["parent"],
                       primitive=Trend)

    # create multi-output agg feature
    n_most_common = ft.Feature(es["child"]['cat'],
                               parent_entity=es["parent"],
                               primitive=NMostCommon)

    # create aggs with where
    where = ft.Feature(es["child"]["value"]) == 1
    count_where = ft.Feature(es["child"]['id'],
                             parent_entity=es["parent"],
                             where=where,
                             primitive=Count)
    trend_where = ft.Feature([es["child"]['value'], es["child"]['time_index']],
                             parent_entity=es["parent"],
                             where=where,
                             primitive=Trend)
    n_most_common_where = ft.Feature(es["child"]['cat'],
                                     parent_entity=es["parent"],
                                     where=where,
                                     primitive=NMostCommon)

    if isinstance(parent_df, pd.DataFrame):
        features = [
            count, count_where, trend, trend_where, n_most_common,
            n_most_common_where
        ]
        names = [
            count.get_name(),
            count_where.get_name(),
            trend.get_name(),
            trend_where.get_name(), *n_most_common.get_feature_names(),
            *n_most_common_where.get_feature_names()
        ]
        values = [
            0, 0, np.nan, np.nan,
            *np.full(n_most_common.number_output_features, np.nan),
            *np.full(n_most_common_where.number_output_features, np.nan)
        ]
    else:
        features = [count, count_where]
        names = [count.get_name(), count_where.get_name()]
        values = [0, 0]

    # cutoff time before all rows
    fm = ft.calculate_feature_matrix(entityset=es,
                                     features=features,
                                     cutoff_time=pd.Timestamp("12/31/2017"))
    fm = to_pandas(fm)

    assert_array_equal(fm[names], [values])

    # cutoff time after all rows, but where clause filters all rows
    if isinstance(parent_df, pd.DataFrame):
        features = [count_where, trend_where, n_most_common_where]
        names = [
            count_where.get_name(),
            trend_where.get_name(), *n_most_common_where.get_feature_names()
        ]
        values = [
            0, np.nan,
            *np.full(n_most_common_where.number_output_features, np.nan)
        ]
    else:
        features = [count_where]
        names = [count_where.get_name()]
        values = [0]

    fm2 = ft.calculate_feature_matrix(entityset=es,
                                      features=features,
                                      cutoff_time=pd.Timestamp("1/4/2018"))
    fm2 = to_pandas(fm2)

    assert_array_equal(fm2[names], [values])
Example #30
numeric = []
for col in config.ordinal:
    num_col = f'{col}_num'
    numeric.append(num_col)
    X_joined[num_col] = X_joined[col]

# Build the variable type mappings for the entityset
categorical_ft = dict([col, ft.variable_types.Boolean]
                      for col in config.categorical.keys())
ordinal_ft = dict([col, ft.variable_types.Ordinal] for col in config.ordinal)
numeric_ft = dict([col, ft.variable_types.Numeric] for col in numeric)
variable_dtypes = {**categorical_ft, **ordinal_ft, **numeric_ft}

# Create an entity set
es = ft.EntitySet(id='flu')
es = es.entity_from_dataframe(
    entity_id='flu',
    dataframe=X_joined,
    index='respondent_id',
    variable_types=variable_dtypes,
)

agg_primitives = ['count', 'median', 'entropy']
trans_primitives = ['add_numeric']

# Run deep feature synthesis
dfs_feat, dfs_defs = ft.dfs(entityset=es,
                            target_entity='flu',
                            trans_primitives=trans_primitives,
                            agg_primitives=agg_primitives,