def computePeriod(X_df, orderedValues, suffix_feature="SumImports"):
    """
    Rebuild the hidden time ordering of the data as integer periods.

    The original authors note that the dataset contains 134 distinct periods
    and that columns 1 to 12 of any "Sum..." statistic take a finite number
    of values which overlap from one period to the next: the value found in
    column 1 for the next period is the value found in column 2 for the
    current one. Starting from the last known value, the chain is followed
    first through columns 1 -> 2 and then, once that chain is exhausted,
    through columns 11 -> 12.

    :param X_df: The original dataframe. It must contain the columns named
        by ``get_suffix(suffix_feature, k)[0]`` for k in 1, 2, 11 and 12.
    :param orderedValues: List of chained values already discovered, in
        period order. It is extended IN PLACE, so the caller's list keeps
        growing across calls. When empty it is seeded with
        ``getFirstValue(X_df, suffix_feature)``.
    :param suffix_feature: Name of the statistic used to walk through the
        data; passed as is to ``get_suffix``.
    :return: A new dataframe indexed by the column-1 feature name, holding
        every ordered value with its rank in a column named ``period``.
    :raises ValueError: If a value is followed by more than one distinct
        candidate value, i.e. the chain is ambiguous.
    """
    if len(orderedValues) == 0:
        orderedValues.append(getFirstValue(X_df, suffix_feature))

    for columnNumber in [1, 11]:
        # The two column names do not change while following one chain.
        current_column = get_suffix(suffix_feature, columnNumber)[0]
        next_column = get_suffix(suffix_feature, columnNumber + 1)[0]
        while True:
            prev_value = orderedValues[-1]
            # Rows whose current column equals the last known value carry
            # the value of the following period in the next column.
            candidates = X_df.loc[X_df[current_column] == prev_value, [next_column]]
            array_next = list(set(candidates.values.ravel()))
            if len(array_next) == 0:
                # No row continues the chain: every value reachable through
                # this pair of columns has been found.
                break
            if len(array_next) != 1:
                raise ValueError("There is not one unique value")
            orderedValues.append(array_next[0])

    index_feature = get_suffix(suffix_feature, 1)[0]
    period_df = pd.DataFrame(
        {index_feature: orderedValues, "period": range(0, len(orderedValues))}
    ).set_index(index_feature)
    return period_df
def getFirstValue(X_df, suffix_feature):
    """
    Find the value that starts the chain of periods.

    The starting value is the first unique value of column 1 (in order of
    appearance) that never appears in column 2: no earlier period announces
    it as its "next" value, so it belongs to the first period.

    :param X_df: The original dataframe; it must contain the columns named
        by ``get_suffix(suffix_feature, 1)[0]`` and
        ``get_suffix(suffix_feature, 2)[0]``.
    :param suffix_feature: Name of the statistic, passed to ``get_suffix``.
    :return: The first such value, or ``None`` when every value of column 1
        also appears in column 2.
    """
    first_column = X_df[get_suffix(suffix_feature, 1)[0]]
    # Resolved once: the column to search is the same for every candidate.
    second_column = X_df[get_suffix(suffix_feature, 2)[0]]
    for value in first_column.unique():
        if not (second_column == value).any():
            return value
    return None
def transform(self, X_df):
    """
    Add the engineered features to ``X_df`` and keep the selected columns.

    Steps, in order:

    1. compute the period table with ``computePeriod`` (which extends
       ``self.ordered_values`` in place) and register it through
       ``self.registerEngineeredFeatures`` under the name ``"period"``;
    2. merge every dataframe stored in ``self.engineered_df`` into ``X_df``
       with ``mergeDf``;
    3. add the feature built by ``createFeature``;
    4. restrict the result to the 'sumprod' columns 11-12, the 'exports' and
       'refinery' columns 10-12 and the engineered features.

    :param X_df: The dataframe to transform.
    :return: The dataframe restricted to the selected columns.
    """
    period_df = computePeriod(X_df, self.ordered_values, suffix_feature="SumImports")
    self.registerEngineeredFeatures(
        period_df, "period", left_on=get_suffix("SumImports", 1))

    for engineered_df in self.engineered_df.values():
        X_df = mergeDf(X_df, engineered_df)
    X_df = createFeature(X_df, self.engineered_features)

    # Column subset handed to the model; `.loc` replaces the `.ix` indexer,
    # which no longer exists in pandas >= 1.0.
    kept_columns = (
        get_suffix('sumprod', [11, 12])
        + get_suffix(["exports", 'refinery'], [10, 11, 12])
        + self.engineered_features
    )
    X_df = X_df.loc[:, kept_columns]
    return X_df
def computeCountryQuotient(X_df):
    """
    Compute, per country, the ratio of average import to average export size.

    For each of the 'Imports' and 'Exports' column groups the absolute value
    of the selected columns (including ``country`` itself) is taken, rows are
    grouped by ``country``, averaged per column and then averaged across
    columns. The result is the 'Imports' average divided by the 'Exports'
    average.

    :param X_df: Dataframe holding a ``country`` column plus the columns
        returned by ``get_suffix('Imports')`` and ``get_suffix('Exports')``.
    :return: The per-country quotient, indexed by ``country``.
    """

    def _mean_magnitude(group_name):
        # Average absolute value per country over every column of the group.
        selected = X_df[["country"] + get_suffix(group_name)]
        per_country = abs(selected).groupby("country").mean()
        return per_country.mean(axis=1)

    imports_level = _mean_magnitude('Imports')
    exports_level = _mean_magnitude('Exports')
    return imports_level / exports_level
def computeVariance(X_df):
    """
    Add a normalised log-variance feature of the 'Imports' columns 7 to 12.

    For every row the variance across the columns returned by
    ``get_suffix("Imports", range(7, 13))`` is computed, mapped through
    ``log(variance + 1)`` and divided by the largest such value, then stored
    in the column ``variance_diffImports(kmt)``.

    :param X_df: The dataframe to enrich; it is modified IN PLACE.
    :return: The same dataframe, with the new column added.
    """
    import_columns = get_suffix("Imports", range(7, 13))
    variance = np.log(X_df[import_columns].var(axis=1) + 1)
    # Series.max() skips NaN rows; the builtin max() is order-dependent when
    # NaN is present and raises on an empty frame.
    variance = variance / variance.max()
    X_df["variance_diff" + "Imports(kmt)"] = variance
    return X_df
def createFeature(X_df, engineered_features):
    """
    Add the ``Exports_10_11_12`` feature: the row-wise sum of the columns
    returned by ``get_suffix("exports", range(10, 13))``.

    :param X_df: The dataframe to enrich; it is modified IN PLACE.
    :param engineered_features: List of engineered feature names. The new
        feature name is appended IN PLACE, but only when it is not already
        listed, so calling this function repeatedly with the same list does
        not accumulate duplicate names.
    :return: The same dataframe, with the new column added.
    """
    feature_name = "Exports_10_11_12"
    if feature_name not in engineered_features:
        engineered_features.append(feature_name)
    X_df[feature_name] = X_df[get_suffix("exports", range(10, 13))].sum(axis=1)
    return X_df