def transform_one(self, obs, target, id):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
    pos_list = _inter_norm_pos_list(obs_ngrams, target_ngrams)
    return pos_list
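# The module-level helpers `_inter_pos_list` and `_inter_norm_pos_list` are not shown in this
# section. Minimal sketches of the assumed behavior follow: positions (1-based) of elements of
# the first list that also occur in the second, optionally normalized by the first list's length.
# These are assumptions for illustration, not the project's exact implementation.
def _inter_pos_list(obs, target):
    """Positions in `obs` whose element also occurs in `target`; [0] if none."""
    pos_list = [0]
    if len(obs) != 0:
        pos_list = [i for i, o in enumerate(obs, start=1) if o in target]
        if len(pos_list) == 0:
            pos_list = [0]
    return pos_list

def _inter_norm_pos_list(obs, target):
    """Same positions, normalized into (0, 1] by the length of `obs`."""
    pos_list = _inter_pos_list(obs, target)
    N = len(obs)
    return [np_utils._try_divide(i, N) for i in pos_list]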
def transform_one(self, obs, target, id):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
    return np_utils._try_divide(
        self._get_match_count(obs_ngrams, target_ngrams, self.idx),
        len(target_ngrams))
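# `np_utils._try_divide` is used throughout as a zero-safe division. A minimal sketch of the
# assumed behavior (fall back to 0 when the denominator is zero) is given below.
def _try_divide(x, y, val=0.0):
    """Divide x by y, returning `val` when y is zero."""
    if y != 0.0:
        val = float(x) / y
    return val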
def transform_one(self, obs, target, id):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
    s = 0.
    for w1 in obs_ngrams:
        for w2 in target_ngrams:
            if dist_utils._is_str_match(w1, w2, self.str_match_threshold):
                s += 1.
    return np_utils._try_divide(s, len(obs_ngrams) * len(target_ngrams))
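# `dist_utils._is_str_match` decides whether two n-grams count as a match. A minimal sketch,
# assuming exact comparison at threshold 1.0 and a fuzzy comparison (Levenshtein ratio) below
# it; the exact fuzzy measure the project uses is an assumption here.
import Levenshtein

def _is_str_match(str1, str2, threshold=1.0):
    if threshold == 1.0:
        return str1 == str2
    return Levenshtein.ratio(str1, str2) >= threshold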
def _get_bleu_feat(s1, s2, ngrams=3):
    if isinstance(s1, str):
        s1 = s1.split(" ")
    if isinstance(s2, str):
        s2 = s2.split(" ")
    feat = []
    for ngram in range(ngrams + 1):
        s1_ngram = ngram_utils._ngrams(s1, ngram + 1, "_")
        s2_ngram = ngram_utils._ngrams(s2, ngram + 1, "_")
        feat.append(_get_bleu(s1_ngram, s2_ngram))
    return np.array(feat, dtype=np.float32)
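# `_get_bleu` is not defined in this section. A minimal sketch of the assumed behavior
# (clipped n-gram precision, as in the BLEU score) follows; the project's actual
# implementation may differ in smoothing details.
from collections import Counter

def _get_bleu(ngram1, ngram2):
    """Clipped precision of `ngram1` against reference `ngram2`."""
    counts1, counts2 = Counter(ngram1), Counter(ngram2)
    clipped = sum(min(c, counts2[g]) for g, c in counts1.items())
    return np_utils._try_divide(clipped, len(ngram1))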
def transform_one(self, obs, target, id):
    assert isinstance(obs, unicode)
    assert isinstance(target, unicode)
    # cache the observation n-grams: consecutive rows often share the same obs,
    # so they are recomputed only when obs changes
    if obs != self.last_obs:
        self.last_obs = obs
        obs_tokens = nlp_utils._tokenize(obs, token_pattern)
        self.last_obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
    return self.distance(self.last_obs_ngrams, target_ngrams)
def transform(self):
    # ngrams
    obs_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.obs_ngram, "_"), self.obs_corpus))
    target_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.target_ngram, "_"), self.target_corpus))
    # cooccurrence ngrams
    cooc_terms = list(map(lambda lst1, lst2: self._get_cooc_terms(lst1, lst2, "X"), obs_ngrams, target_ngrams))
    ## tfidf
    tfidf = self._init_word_ngram_tfidf(ngram=1)
    X = tfidf.fit_transform(cooc_terms)
    ## svd
    svd = TruncatedSVD(n_components=self.svd_dim,
                       n_iter=self.svd_n_iter,
                       random_state=config.RANDOM_SEED)
    return svd.fit_transform(X)
def transform_one(self, obs, target, id):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
    val_list = []
    for w1 in obs_ngrams:
        s = 0.
        for w2 in target_ngrams:
            if dist_utils._is_str_match(w1, w2, self.str_match_threshold):
                s += 1.
        val_list.append(np_utils._try_divide(s, len(target_ngrams)))
    if len(val_list) == 0:
        val_list = [config.MISSING_VALUE_NUMERIC]
    return val_list
def transform_one(self, obs, target, id):
    # other training rows with the same search term, excluding the current id and
    # restricted to the given relevance level
    df = self.dfTrain[self.dfTrain["search_term"] == obs].copy()
    val_list = [config.MISSING_VALUE_NUMERIC]
    if df is not None:
        df = df[df["id"] != id].copy()
        df = df[df["relevance"] == self.relevance].copy()
        if df is not None and df.shape[0] > 0:
            target_tokens = nlp_utils._tokenize(target, token_pattern)
            target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
            val_list = []
            for x in df[self.target_field]:
                x_tokens = nlp_utils._tokenize(x, token_pattern)
                x_ngrams = ngram_utils._ngrams(x_tokens, self.ngram)
                val_list.append(dist_utils._jaccard_coef(x_ngrams, target_ngrams))
    return val_list
def _get_avg_ngram_doc_len(self):
    # average number of n-grams per document in the target corpus (the avgdl term of BM25)
    lst = []
    for target in self.target_corpus:
        target_tokens = nlp_utils._tokenize(target, token_pattern)
        target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
        lst.append(len(target_ngrams))
    return np.mean(lst)
def transform_one(self, obs, target, id):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
    val_list = []
    for w1 in obs_ngrams:
        _val_list = []
        for w2 in target_ngrams:
            s = dist_utils._edit_dist(w1, w2)
            _val_list.append(s)
        if len(_val_list) == 0:
            _val_list = [config.MISSING_VALUE_NUMERIC]
        val_list.append(_val_list)
    if len(val_list) == 0:
        val_list = [[config.MISSING_VALUE_NUMERIC]]
    return val_list
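# `dist_utils._edit_dist` is assumed to return a normalized edit distance in [0, 1]
# (0 means identical strings). A minimal sketch based on the Levenshtein ratio follows;
# the project's actual normalization may differ.
import Levenshtein

def _edit_dist(str1, str2):
    return 1.0 - Levenshtein.ratio(str1, str2)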
def transform_one(self, obs, target, id):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
    # BM25 length-normalization factor
    K = self.k1 * (1 - self.b + self.b * np_utils._try_divide(len(target_ngrams), self.avg_ngram_doc_len))
    val_list = []
    for w1 in obs_ngrams:
        # (fuzzy) term frequency of the query n-gram w1 in the target
        s = 0.
        for w2 in target_ngrams:
            if dist_utils._is_str_match(w1, w2, self.str_match_threshold):
                s += 1.
        bm25 = s * self._get_idf(w1) * np_utils._try_divide(1 + self.k1, s + K)
        val_list.append(bm25)
    if len(val_list) == 0:
        val_list = [config.MISSING_VALUE_NUMERIC]
    return val_list
def _get_df_dict(self):
    # document frequency of each n-gram over the target corpus, with add-one smoothing
    d = defaultdict(lambda: 1)
    for target in self.target_corpus:
        target_tokens = nlp_utils._tokenize(target, token_pattern)
        target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
        for w in set(target_ngrams):
            d[w] += 1
    return d
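# `self._get_idf` (used by the BM25 feature above) is not shown in this section. A minimal
# sketch of a BM25-style smoothed IDF built on the document-frequency dictionary from
# `_get_df_dict` follows; the attribute name `self.df_dict` and the exact smoothing are
# assumptions for illustration.
def _get_idf(self, word):
    N = len(self.target_corpus)
    return np.log((N - self.df_dict[word] + 0.5) / (self.df_dict[word] + 0.5))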
def transform_one(self, obs, target, id):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
    return _inter_pos_list(target_ngrams, [obs_ngrams[self.idx]])
def _ngram(self, text):
    tokens = text.split(" ")
    tokens = [token for token in tokens if token not in self.stopwords]
    return ngram_utils._ngrams(tokens, self.ngram, " ")
def transform_one(self, obs, target, id):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
    return dist_utils._dice_dist(obs_ngrams, target_ngrams)
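# `dist_utils._jaccard_coef` and `dist_utils._dice_dist` are the usual set-overlap
# coefficients over n-gram lists. Minimal sketches of the assumed definitions follow.
def _jaccard_coef(A, B):
    A, B = set(A), set(B)
    return np_utils._try_divide(float(len(A & B)), len(A | B))

def _dice_dist(A, B):
    A, B = set(A), set(B)
    return np_utils._try_divide(2. * len(A & B), len(A) + len(B))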
def transform_one(self, obs, target, id):
    # ratio of unique n-grams to total n-grams in the observation
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
    return np_utils._try_divide(len(set(obs_ngrams)), len(obs_ngrams))