Пример #1
0
    def process_table(self, table, overlap_attr, q_val, rem_stop_words):
        """Tokenize every value of one column of ``table``.

        Each value is passed through ``helper.remove_non_ascii`` and
        ``self.rem_punctuations``, lower-cased and split on whitespace.
        Duplicate tokens are dropped while keeping the first occurrence, so
        the result (and the string handed to ``qgram``) does not depend on
        the arbitrary iteration order of a ``set``.

        Args:
            table: Object indexed as ``table[overlap_attr]`` to obtain an
                iterable of values (presumably a pandas DataFrame --
                TODO confirm against callers).
            overlap_attr: Key of the column to process.
            q_val: If not ``None``, the surviving tokens of each value are
                joined with single spaces and passed to ``qgram`` together
                with this value; otherwise the word tokens are returned.
            rem_stop_words: If truthy, each token list is filtered through
                ``self.rem_stopwords``.

        Returns:
            A list with one entry per column value: the unique word tokens
            of that value, or whatever ``qgram`` returns for it when
            ``q_val`` is given.
        """
        # Clean each raw value: strip non-ASCII, strip punctuation, lower-case.
        cleaned_values = [
            self.rem_punctuations(helper.remove_non_ascii(raw)).lower()
            for raw in table[overlap_attr]
        ]

        # Split into words and drop repeats. First-occurrence order is kept
        # on purpose: the tokens may be re-joined below, and a set's
        # iteration order would make the q-grams vary between runs.
        tokens_per_value = []
        for cleaned in cleaned_values:
            seen = set()
            unique_tokens = []
            for token in cleaned.split():
                if token not in seen:
                    seen.add(token)
                    unique_tokens.append(token)
            tokens_per_value.append(unique_tokens)

        if rem_stop_words:
            tokens_per_value = [
                self.rem_stopwords(tokens) for tokens in tokens_per_value
            ]

        # NOTE(review): process_val q-grams the full token sequence and
        # de-duplicates afterwards, whereas this method de-duplicates words
        # first -- confirm the two are meant to differ.
        if q_val is not None:
            tokens_per_value = [
                qgram(' '.join(tokens), q_val) for tokens in tokens_per_value
            ]

        return tokens_per_value
Пример #2
0
 def process_val(self, val, overlap_attr, q_val, rem_stop_words):
     """Tokenize a single value into a list of unique tokens.

     The value is passed through ``helper.remove_non_ascii`` and
     ``self.rem_punctuations``, lower-cased and split on whitespace.

     Args:
         val: The value to tokenize.
         overlap_attr: Not used by this method; kept so the signature
             stays unchanged for existing callers.
         q_val: If not ``None``, the word tokens are joined with single
             spaces and passed to ``qgram`` together with this value.
         rem_stop_words: If truthy, the word tokens are filtered through
             ``self.rem_stopwords`` first.

     Returns:
         A list of the distinct tokens (words, or the items returned by
         ``qgram``). The list is built from a ``set``, so its order is
         unspecified.
     """
     cleaned = self.rem_punctuations(helper.remove_non_ascii(val)).lower()
     tokens = cleaned.split()
     if rem_stop_words:
         tokens = self.rem_stopwords(tokens)
     # Identity test: None is a singleton and q_val may be any other object.
     if q_val is not None:
         tokens = qgram(' '.join(tokens), q_val)
     return list(set(tokens))