示例#1
0
def preprocess_table(dataframe, overlap_attr, id_attr, stopwords=None):
    """Build a StringContainer of processed ``overlap_attr`` values keyed by id.

    Projects ``dataframe`` onto ``overlap_attr`` and ``id_attr``, drops rows
    with a missing value in either column, runs ``process_col`` over the
    overlap column, and pushes each ``(id, bytes)`` pair into a new container.

    Args:
        dataframe: pandas DataFrame containing both columns. It is not
            modified; all work happens on a private copy.
        overlap_attr: Label of the column whose values are processed and
            stored.
        id_attr: Label of the column supplying the id pushed with each value.
        stopwords: Forwarded unchanged to ``process_col``.

    Returns:
        The populated ``StringContainer``.
    """
    # Take an explicit copy: the projection + dropna() result can still be
    # tracked by pandas as derived from ``dataframe``, so assigning a column
    # into it would raise SettingWithCopyWarning (or an error when
    # ``mode.chained_assignment`` is set to 'raise').
    projdf = dataframe[[overlap_attr, id_attr]].dropna().copy()
    projdf[overlap_attr] = process_col(projdf[overlap_attr], stopwords)

    objsc = StringContainer()
    # Columns are ordered (overlap_attr, id_attr); the index is not needed.
    for val, uid in projdf.itertuples(index=False, name=None):
        objsc.push_back(uid, str2bytes(val))
    return objsc
示例#2
0
 def _preprocess_table(self, table, key_attr, block_attr, rem_stop_words):
     """Build a StringContainer of processed ``block_attr`` values keyed by key.

     Projects ``table`` onto ``block_attr`` and ``key_attr``, runs
     ``self._process_column`` over the blocking column, and pushes each
     ``(key, bytes)`` pair into a new container.

     Args:
         table: pandas DataFrame containing both columns. It is not
             modified; all work happens on a private copy.
         key_attr: Label of the column supplying the key pushed with each
             value.
         block_attr: Label of the column whose values are processed and
             stored.
         rem_stop_words: Forwarded unchanged to ``self._process_column``.

     Returns:
         The populated ``StringContainer``.
     """
     # Copy explicitly instead of setting ``is_copy = False``: that attribute
     # was deprecated in pandas 0.23 and removed in 1.0, where assigning it
     # only creates a stray attribute and no longer silences
     # SettingWithCopyWarning.
     tbl = table[[block_attr, key_attr]].copy()
     tbl[block_attr] = self._process_column(tbl[block_attr], rem_stop_words)

     objsc = StringContainer()
     # Columns are ordered (block_attr, key_attr); the index is not needed.
     for val, key in tbl.itertuples(index=False, name=None):
         objsc.push_back(key, str2bytes(val))
     return objsc
示例#3
0
def preprocess_table(dataframe, idcol):
    """Concatenate each row's string columns into one normalized byte string.

    For every row, the non-null values of the columns reported by
    ``get_str_cols`` are stripped, joined with single spaces, lower-cased,
    converted with ``str2bytes``, and stripped of ASCII punctuation. The
    result is pushed into a new ``StringContainer`` together with the row's
    ``idcol`` value.

    Args:
        dataframe: pandas DataFrame to read from. It is not modified.
        idcol: Label of the column supplying the id pushed with each row.

    Returns:
        The populated ``StringContainer``.
    """
    strcols = list(get_str_cols(dataframe))
    # NOTE(review): if get_str_cols can also return ``idcol``, the id value
    # ends up in the concatenated text as well -- confirm against the helper.
    strcols.append(idcol)
    projdf = dataframe[strcols]

    # The delete argument of bytes.translate must be bytes-like on Python 3;
    # passing the str ``string.punctuation`` raises TypeError. The set is pure
    # ASCII, so encoding it is lossless (and still a plain str on Python 2).
    # Assumes str2bytes returns bytes -- TODO confirm.
    punctuation = string.punctuation.encode('ascii')

    objsc = StringContainer()
    for row in projdf.itertuples(index=False, name=None):
        # idcol was appended last, so it is always the final field.
        colvalues = row[:-1]
        uid = row[-1]
        strings = [colvalue.strip() for colvalue in colvalues
                   if not pd.isnull(colvalue)]
        concat_row = str2bytes(' '.join(strings).lower())
        concat_row = concat_row.translate(None, punctuation)
        objsc.push_back(uid, concat_row)
    return objsc