def preprocess_table(dataframe, overlap_attr, id_attr, stopwords=None):
    """Load the processed overlap column of a table into a StringContainer.

    The table is projected onto ``overlap_attr`` and ``id_attr``, rows with
    a missing value in either column are dropped, the overlap column is
    run through ``process_col`` and each remaining row is pushed into the
    container as ``(id value, str2bytes(processed value))``.

    Args:
        dataframe: pandas DataFrame containing both columns.
        overlap_attr: name of the column whose values are processed.
        id_attr: name of the column holding the row identifier.
        stopwords: forwarded unchanged to ``process_col``.

    Returns:
        The populated ``StringContainer``.
    """
    # Take an explicit copy before assigning a column: writing into the
    # result of a selection followed by dropna() can otherwise trigger
    # pandas' SettingWithCopyWarning. The caller's frame is never modified.
    projdf = dataframe[[overlap_attr, id_attr]].dropna().copy()
    projdf[overlap_attr] = process_col(projdf[overlap_attr], stopwords)

    objsc = StringContainer()
    # Plain tuples without the index: first field is the processed value,
    # last field is the id.
    for row in projdf.itertuples(index=False, name=None):
        objsc.push_back(row[-1], str2bytes(row[0]))
    return objsc
def _preprocess_table(self, table, key_attr, block_attr, rem_stop_words):
    """Load the processed blocking column of a table into a StringContainer.

    The table is projected onto ``block_attr`` and ``key_attr``, the
    blocking column is run through ``self._process_column`` and each row
    is pushed into the container as ``(key value, str2bytes(processed
    value))``. Rows with missing values are not filtered here.

    Args:
        table: pandas DataFrame containing both columns.
        key_attr: name of the column holding the row key.
        block_attr: name of the column whose values are processed.
        rem_stop_words: forwarded unchanged to ``self._process_column``.

    Returns:
        The populated ``StringContainer``.
    """
    # Work on an explicit copy so the column assignment below neither
    # warns (SettingWithCopyWarning) nor depends on the `is_copy`
    # attribute, which newer pandas releases deprecate / no longer honour.
    tbl = table[[block_attr, key_attr]].copy()
    tbl[block_attr] = self._process_column(tbl[block_attr], rem_stop_words)

    objsc = StringContainer()
    # Plain tuples without the index: first field is the processed value,
    # last field is the key.
    for row in tbl.itertuples(index=False, name=None):
        objsc.push_back(row[-1], str2bytes(row[0]))
    return objsc
def preprocess_table(dataframe, idcol):
    """Concatenate the string columns of each row into a StringContainer.

    For every row, the non-null values of the columns returned by
    ``get_str_cols(dataframe)`` are stripped, joined with single spaces,
    lower-cased, passed through ``str2bytes`` and stripped of ASCII
    punctuation. The result is pushed into the container together with
    the row's ``idcol`` value.

    Args:
        dataframe: pandas DataFrame to preprocess.
        idcol: name of the column holding the row identifier.

    Returns:
        The populated ``StringContainer``.
    """
    strcols = list(get_str_cols(dataframe))
    strcols.append(idcol)  # id goes last so it can be read as row[-1]
    projdf = dataframe[strcols]

    # The delete-set for translate() must be bytes-like on Python 3; on
    # Python 2 encoding an ASCII str yields the same str, so this form works
    # on both. Assumes str2bytes returns a byte string -- TODO confirm.
    punctuation = string.punctuation.encode('ascii')

    objsc = StringContainer()
    for row in projdf.itertuples():
        uid = row[-1]
        # row[0] is the index and row[-1] the id; everything between is text.
        strings = [colvalue.strip() for colvalue in row[1:-1]
                   if not pd.isnull(colvalue)]
        concat_row = str2bytes(' '.join(strings).lower())
        objsc.push_back(uid, concat_row.translate(None, punctuation))
    return objsc