예제 #1
0
 def resort(self):
     """Sort col_map in place by the key columns and refresh map_dict.

     After sorting, every row of col_map is re-hashed on self.columns and
     its hash is pointed at the row's new index in map_dict.
     """
     self.col_map.sort_values(by=self.columns, inplace=True, ignore_index=True)
     for position, entry in self.col_map.iterrows():
         # ignore_index=True above renumbers the rows, so the label yielded
         # by iterrows is the row's post-sort index.
         self.map_dict[get_row_hash(entry, self.columns)] = position
예제 #2
0
    def test_lookup_cols(self):
        """Keys hashed from both input files must resolve to a col_map row with the same 'type'."""
        key_map = KeyMap(self.key_cols, self.target_cols)
        map_df = pd.read_csv(self.stern_map_path, delimiter='\t', header=0,
                             keep_default_na=False, na_values=",null")
        key_map.update(map_df)
        lookup_table = key_map.col_map

        def check_rows(frame, message):
            # Hash each row on the key columns, find its position through
            # map_dict, and compare the stored 'type' with the row's own.
            for _, entry in frame.iterrows():
                position = key_map.map_dict[get_row_hash(entry, self.key_cols)]
                self.assertEqual(lookup_table.iloc[position]['type'],
                                 entry['type'], message)

        check_rows(map_df, "The key should be looked up for same map")

        other_df = pd.read_csv(self.stern_test1_path, delimiter='\t', header=0)
        check_rows(other_df, "The key should be looked up for other file")
예제 #3
0
 def test_get_row_hash(self):
     """Hashing every row on ('type', 'event_type') must yield one distinct key per row."""
     frame = read_csv(self.stern_map_path, delimiter='\t', header=0,
                      keep_default_na=False, na_values=",null")
     key_columns = ['type', 'event_type']
     # Colliding hashes overwrite one another, so the dict ends up smaller
     # than the frame whenever two rows share a hash.
     hash_to_index = {get_row_hash(entry, key_columns): label
                      for label, entry in frame.iterrows()}
     self.assertEqual(
         len(hash_to_index), len(frame),
         "get_row_hash should uniquely hash all of the keys in stern map")
예제 #4
0
    def _update(self, base_df):
        """ Adds unseen keys from base_df to col_map and counts every occurrence

        Each row is hashed on self.columns. A hash not yet in map_dict is
        assigned the next free position in col_map and its self.columns
        values are added as a new row there. The count for the hash in
        count_dict is incremented for every row, new or not.

        Args:
            base_df (DataFrame):       DataFrame of consisting of the columns in the KeyMap
        """
        # Imported here so the method does not rely on how the module
        # happens to import pandas.
        import pandas as pd

        start = len(self.col_map)
        new_rows = []
        for _, row in base_df.iterrows():
            key = get_row_hash(row, self.columns)
            if key not in self.map_dict:
                # Position the row will occupy once the batch is concatenated.
                self.map_dict[key] = start + len(new_rows)
                new_rows.append(row[self.columns])
                self.count_dict[key] = 0
            self.count_dict[key] += 1

        # DataFrame.append was removed in pandas 2.0; concatenating once also
        # avoids copying col_map for every new key.
        if new_rows:
            self.col_map = pd.concat([self.col_map, pd.DataFrame(new_rows)],
                                     ignore_index=True)
예제 #5
0
    def _update(self, base_df):
        """ Adds rows of base_df whose keys are not yet in the map

        Each row is hashed on self.key_cols. A hash not yet in map_dict is
        assigned the next free position in col_map and the whole row is
        added there. A row whose hash is already present (from an earlier
        call or earlier in base_df) is left out and its index is reported.

        Args:
            base_df (DataFrame):       DataFrame of consisting of the columns in the KeyMap

        Returns:
            duplicate_indices (list):         List of key positions that were duplicated
        """
        # Imported here so the method does not rely on how the module
        # happens to import pandas.
        import pandas as pd

        start = len(self.col_map)
        new_rows = []
        duplicate_indices = []
        for index, row in base_df.iterrows():
            key = get_row_hash(row, self.key_cols)
            if key in self.map_dict:
                duplicate_indices.append(index)
                continue
            # Position the row will occupy once the batch is concatenated.
            self.map_dict[key] = start + len(new_rows)
            new_rows.append(row)

        # DataFrame.append was removed in pandas 2.0; concatenating once also
        # avoids copying col_map for every new key.
        if new_rows:
            self.col_map = pd.concat([self.col_map, pd.DataFrame(new_rows)],
                                     ignore_index=True)
        return duplicate_indices
예제 #6
0
    def _remap(self, df):
        """ Utility method that iterates through df to do the replacements

        Each row is hashed on self.key_cols and looked up in map_dict. On a
        hit, the row's self.target_cols values are replaced by those of the
        matching col_map row and the row is written back into df in place.

        Args:
            df (DataFrame):         DataFrame in which to perform the mapping

        Returns:
            list                           List of row numbers that had no correspondence in the mapping
        """

        missing_indices = []
        for position, (index, row) in enumerate(df.iterrows()):
            key = get_row_hash(row, self.key_cols)
            key_value = self.map_dict.get(key)
            # Position 0 is a valid col_map entry, so a truthiness test would
            # wrongly treat the first mapped key as missing.
            if key_value is None:
                missing_indices.append(index)
                continue
            result = self.col_map.iloc[key_value]
            row[self.target_cols] = result[self.target_cols].values
            # iloc is positional: use the enumerated position, not the index
            # label, so frames without a default RangeIndex are handled too.
            df.iloc[position] = row
        return missing_indices
예제 #7
0
 def print(self, file=None):
     """Write the key columns, then one line per col_map row with its count.

     Each row line holds the row's values as a list, a tab, and the count
     stored in count_dict under the row's hash.

     Args:
         file: Passed through as the ``file`` argument of the built-in print.
     """
     header = f"Counts for key [{self.columns!s}]:"
     print(header, file=file)
     for _, entry in self.col_map.iterrows():
         values = list(entry.values)
         count = self.count_dict[get_row_hash(entry, self.columns)]
         print(f"{values!s}\t{count}", file=file)