def transform(self, dirty_df: pd.DataFrame, col: str):
    """Featurize one column into a single dense tensor.

    Concatenates three count-vectorizer outputs along the feature axis:
    character n-gram counts, per-character symbolic-pattern counts, and
    whole-token symbolic-pattern counts.

    Args:
        dirty_df: DataFrame holding the column to featurize.
        col: Name of the column.

    Returns:
        A one-element list containing a torch.Tensor of shape
        (n_rows, n_char + n_regex + n_regex2 features).
    """
    # Read the column once instead of three times.
    values = dirty_df[col].values
    char_features = self.char_counter.transform(values.tolist()).todense()
    regex_features = self.regex_counter.transform(
        [str2regex(val, match_whole_token=False) for val in values]
    ).todense()
    regex_features2 = self.regex_counter2.transform(
        [str2regex(val, match_whole_token=True) for val in values]
    ).todense()
    return [
        torch.tensor(
            np.concatenate(
                [char_features, regex_features, regex_features2], axis=1
            )
        )
    ]
def fit(self, dirty_df: pd.DataFrame, col: str):
    """Fit the character and symbolic-pattern count-vectorizers on one column.

    Args:
        dirty_df: DataFrame holding the column to fit on.
        col: Name of the column.
    """
    values = dirty_df[col].values
    self.char_counter.fit(list(values))
    self.regex_counter.fit(
        [str2regex(v, match_whole_token=False) for v in values]
    )
    self.regex_counter2.fit(
        [str2regex(v, match_whole_token=True) for v in values]
    )
def get_coexist_counts(self, values):
    """For every pair of values, count the rows where their whole-token
    symbolic patterns co-occur, using the ES reversed index.

    Args:
        values: Iterable of cell values (duplicates allowed).

    Returns:
        Nested dict: coexist_count[val1][val2] -> int co-occurrence count
        (0 when either value has no index entry).
    """
    set_values = set(values)
    # msearch body: one empty header line ("{}") before each term query.
    query = "{}\n" + "\n{}\n".join(
        json.dumps(
            {
                "query": {
                    "term": {
                        "data": {
                            "value": str2regex(val, match_whole_token=True)
                        }
                    }
                }
            }
        )
        for val in set_values
    )
    mresult = self.es.msearch(query, index="n_reversed_indices")
    # BUG FIX: responses are aligned with set_values (one per query), but
    # the original indexed them with enumerate(values); with duplicate
    # values the lookup was misaligned. Map each distinct value directly
    # to its result instead.
    indices = {
        val: ESQuery.get_results(res, "idx")
        for val, res in zip(set_values, mresult["responses"])
    }
    coexist_count = defaultdict(dict)
    for val1 in values:
        for val2 in values:
            idx1, idx2 = indices[val1], indices[val2]
            if idx1 is None or idx2 is None:
                coexist_count[val1][val2] = 0
            else:
                # len(...) so both branches store an int, matching the
                # function's name; the original stored the raw set here.
                coexist_count[val1][val2] = len(set(idx1) & set(idx2))
    return coexist_count
def transform(self, dirty_df: pd.DataFrame, col: str):
    """Return the TF-IDF features of one column as a dense tensor.

    NOTE(review): the original also computed a symbolic-pattern TF-IDF
    matrix (the `sym_tfidf` fitted in `fit`) but never used it — only
    `[tfidf]` was concatenated. That dead computation is removed here.
    If the intent was to concatenate both feature sets, add the
    sym_tfidf matrix back into the np.concatenate list.

    Args:
        dirty_df: DataFrame holding the column to featurize.
        col: Name of the column.

    Returns:
        A one-element list containing a torch.Tensor of the dense
        TF-IDF matrix, shape (n_rows, n_tfidf_features).
    """
    tfidf = self.tfidf.transform(dirty_df[col].values.tolist()).todense()
    # np.concatenate kept from the original so the returned array type
    # is unchanged for callers.
    return [torch.tensor(np.concatenate([tfidf], axis=1))]
def fit(self, values):
    """Build frequency counters over raw values and their symbolic
    patterns, at both trigram and whole-value granularity, then map each
    scoring function to the counter it consumes.

    Args:
        values: Iterable of string values to profile.
    """
    # Flatten every value's character trigrams into one list.
    ngrams = [
        "".join(gram)
        for val in values
        for gram in xngrams(val, 3)
    ]
    self.trigram_counter = Counter(ngrams)
    self.sym_trigram_counter = Counter(str2regex(g, False) for g in ngrams)
    self.val_counter = Counter(values)
    self.sym_val_counter = Counter(str2regex(v, False) for v in values)
    # Dispatch table: scoring function -> fitted counter.
    self.func2counter = {
        val_trigrams: self.trigram_counter,
        sym_trigrams: self.sym_trigram_counter,
        value_freq: self.val_counter,
        sym_value_freq: self.sym_val_counter,
    }
def sym_value_freq(values, counter):
    """Whole-value frequency score computed over whole-token symbolic
    patterns rather than the raw values."""
    patterns = [str2regex(v, True) for v in values]
    return value_freq(patterns, counter)
def sym_trigrams(values, counter):
    """Trigram score computed over per-character symbolic patterns
    rather than the raw values."""
    patterns = [str2regex(v, False) for v in values]
    return val_trigrams(patterns, counter)
def transform(self, dirty_df: pd.DataFrame, col):
    """Relative frequency of each cell's whole-token symbolic pattern
    within the column.

    Args:
        dirty_df: DataFrame holding the column.
        col: Name of the column.

    Returns:
        ndarray of per-row pattern frequencies in [0, 1].
    """
    n_rows = len(dirty_df)

    def pattern_freq(value):
        return self.counter[str2regex(value, match_whole_token=True)] / n_rows

    return dirty_df[col].swifter.apply(pattern_freq).values
def fit(self, dirty_df: pd.DataFrame, col):
    """Count occurrences of each whole-token symbolic pattern in the column.

    Args:
        dirty_df: DataFrame holding the column to fit on.
        col: Name of the column.
    """
    patterns = dirty_df[col].swifter.apply(
        lambda value: str2regex(value, match_whole_token=True)
    )
    self.counter = patterns.value_counts().to_dict()
def fit(self, dirty_df: pd.DataFrame, col: str):
    """Fit both TF-IDF vectorizers on one column: one on raw values,
    one on their per-character symbolic patterns.

    Args:
        dirty_df: DataFrame holding the column to fit on.
        col: Name of the column.
    """
    raw_values = dirty_df[col].values.tolist()
    self.tfidf.fit(raw_values)
    symbolic = dirty_df[col].apply(
        lambda value: str2regex(value, match_whole_token=False)
    )
    self.sym_tfidf.fit(symbolic.values)
def clean_str(x):
    """Strip surrounding whitespace, drop any non-ASCII characters, and
    return the whole-token symbolic pattern of what remains."""
    ascii_only = x.strip().encode("ascii", "ignore").decode("ascii")
    return str2regex(ascii_only, True)