def has_relation(self, hypo, hyper):
    """Return the highest stored frequency for the hypo -> hyper relation.

    Raw, lowercased, and lemmatized variants of both words are tried;
    0 is returned when no variant pair is present in the dictionary.
    """
    hypo_u = unicode(hypo)
    hyper_u = unicode(hyper)
    # Every surface form under which the pair may have been indexed.
    hypo_forms = {hypo_u, hypo_u.lower(), lemmatize(hypo_u), lemmatize(hypo_u).lower()}
    hyper_forms = {hyper_u, hyper_u.lower(), lemmatize(hyper_u), lemmatize(hyper_u).lower()}

    best = 0
    for h in hypo_forms:
        hypers = self._hypo2hyper.get(h)
        if hypers is None:
            continue
        for hh in hyper_forms:
            if hh in hypers and hypers[hh] > best:
                best = hypers[hh]
    return best
def _str_in_str(self, substr, supstr):
    """Return ``(coverage, index)`` if *substr* occurs inside *supstr*.

    Matching is case-insensitive and falls back to lemmatized forms of
    either string. ``coverage`` is ``len(substr) / len(supstr)`` and
    ``index`` is the match position. Returns ``(0, -1)`` when there is
    no match or *substr* is shorter than 5 characters (too short to be
    a reliable containment signal).
    """
    substr = unicode(substr).lower()
    supstr = unicode(supstr).lower()
    if len(substr) < 5:
        # fixed: was a bare ``return 0`` — inconsistent with the
        # (value, index) tuples returned by every other path, which
        # would break callers that unpack the result
        return 0, -1
    # Try progressively more normalized combinations until one hits.
    index = supstr.find(substr)
    if index == -1:
        substr_l = lemmatize(substr)
        index = supstr.find(substr_l)
        if index == -1:
            supstr_l = lemmatize(supstr)
            index = supstr_l.find(substr_l)
            if index == -1:
                index = supstr_l.find(substr)
    if index == -1:
        return 0, index
    return float(len(substr)) / float(len(supstr)), index
# NOTE(review): this is an exact duplicate of the has_relation defined
# earlier in the file; whichever definition appears later in the class
# body is the one that takes effect. Consider removing one copy.
def has_relation(self, hypo, hyper):
    """Return the max stored frequency for the hypo -> hyper pair over
    raw/lowercased/lemmatized variants of both words (0 if absent)."""
    hypo = unicode(hypo)
    hyper = unicode(hyper)
    # All surface forms under which the pair may have been indexed.
    hypo_variants = set(
        [hypo, hypo.lower(), lemmatize(hypo), lemmatize(hypo).lower()])
    hyper_variants = set(
        [hyper, hyper.lower(), lemmatize(hyper), lemmatize(hyper).lower()])
    freqs = [0]  # seed with 0 so max() is safe when nothing matches
    for w in hypo_variants:
        for iw in hyper_variants:
            if w in self._hypo2hyper and iw in self._hypo2hyper[w]:
                freqs.append(self._hypo2hyper[w][iw])
    return max(freqs)
def _is_identical(self, str1, str2):
    """True when both strings share the same lemma after dashes are
    normalized to spaces."""
    lemma1 = lemmatize(re_dash.sub(u" ", str1))
    lemma2 = lemmatize(re_dash.sub(u" ", str2))
    return lemma1 == lemma2
def __init__(self, isas_fpath, min_freq=1, preprocess=True, sep='\t', strip_pos=True, use_pickle=True, lowercase=True): """ Provides access to a ISAs relations from a CSV file "hyponym<TAB>hypernym<TAB>freq" """ if not exists(isas_fpath): self._hypo2hyper = {} return isas_pkl_fpath = isas_fpath + ".pkl" if use_pickle and exists(isas_pkl_fpath): pkl = pickle.load(open(isas_pkl_fpath, "rb")) if "hypo2hyper" in pkl: hypo2hyper = pkl["hypo2hyper"] else: print "Error: cannot find hypo2hyper in ", isas_pkl_fpath hypo2hyper = {} if "hyper2hypo" in pkl: hyper2hypo = pkl["hyper2hypo"] else: print "Error: cannot find hyper2hypo in ", isas_pkl_fpath hyper2hypo = {} else: if preprocess: isas_cln_fpath = isas_fpath + ".cleaned" preprocess_pandas_csv(isas_fpath, isas_cln_fpath) isas_df = read_csv(isas_cln_fpath, sep, encoding='utf8', error_bad_lines=False) try_remove(isas_cln_fpath) else: isas_df = read_csv(isas_fpath, sep, encoding='utf8', error_bad_lines=False) isas_df = isas_df.drop(isas_df[isas_df["freq"] < min_freq].index) hypo2hyper = defaultdict(dict) hyper2hypo = defaultdict(dict) for i, row in isas_df.iterrows(): try: hypo = unicode(row["hyponym"]).split("#")[0].lower( ) if lowercase else unicode(row["hyponym"]).split("#")[0] hyper = unicode(row["hypernym"]).split("#")[0].lower( ) if lowercase else unicode(row["hypernym"]).split("#")[0] freq = int(row["freq"]) hypo_lemma = lemmatize(hypo).lower() hyper_lemma = lemmatize(hyper).lower() if hypo not in hypo2hyper or hyper not in hypo2hyper[hypo]: hypo2hyper[hypo][hyper] = freq else: hypo2hyper[hypo][hyper] += freq if hypo_lemma not in hypo2hyper or hyper_lemma not in hypo2hyper[ hypo_lemma]: hypo2hyper[hypo_lemma][hyper_lemma] = freq else: hypo2hyper[hypo_lemma][hyper_lemma] += freq if hyper not in hyper2hypo or hypo not in hyper2hypo[hyper]: hyper2hypo[hyper][hypo] = freq else: hyper2hypo[hyper][hypo] += freq if hyper_lemma not in hyper2hypo or hypo_lemma not in hyper2hypo[ hyper_lemma]: 
hyper2hypo[hyper_lemma][hypo_lemma] = freq else: hyper2hypo[hyper_lemma][hypo_lemma] += freq except: print "Bad row:", row print format_exc() print "dictionary is loaded:", len(hypo2hyper) if use_pickle: pkl = {"hypo2hyper": hypo2hyper, "hyper2hypo": hyper2hypo} pickle.dump(pkl, open(isas_pkl_fpath, "wb")) print "Pickled voc:", isas_pkl_fpath print "Loaded %d words from: %s" % (len(hypo2hyper), isas_pkl_fpath if isas_pkl_fpath else isas_fpath) self._hypo2hyper = hypo2hyper self._hyper2hypo = hyper2hypo
def __init__( self, isas_fpath, min_freq=1, preprocess=True, sep="\t", strip_pos=True, use_pickle=True, lowercase=True ): """ Provides access to a ISAs relations from a CSV file "hyponym<TAB>hypernym<TAB>freq" """ if not exists(isas_fpath): self._hypo2hyper = {} return isas_pkl_fpath = isas_fpath + ".pkl" if use_pickle and exists(isas_pkl_fpath): pkl = pickle.load(open(isas_pkl_fpath, "rb")) if "hypo2hyper" in pkl: hypo2hyper = pkl["hypo2hyper"] else: print "Error: cannot find hypo2hyper in ", isas_pkl_fpath hypo2hyper = {} if "hyper2hypo" in pkl: hyper2hypo = pkl["hyper2hypo"] else: print "Error: cannot find hyper2hypo in ", isas_pkl_fpath hyper2hypo = {} else: if preprocess: isas_cln_fpath = isas_fpath + ".cleaned" preprocess_pandas_csv(isas_fpath, isas_cln_fpath) isas_df = read_csv(isas_cln_fpath, sep, encoding="utf8", error_bad_lines=False) try_remove(isas_cln_fpath) else: isas_df = read_csv(isas_fpath, sep, encoding="utf8", error_bad_lines=False) isas_df = isas_df.drop(isas_df[isas_df["freq"] < min_freq].index) hypo2hyper = defaultdict(dict) hyper2hypo = defaultdict(dict) for i, row in isas_df.iterrows(): try: hypo = ( unicode(row["hyponym"]).split("#")[0].lower() if lowercase else unicode(row["hyponym"]).split("#")[0] ) hyper = ( unicode(row["hypernym"]).split("#")[0].lower() if lowercase else unicode(row["hypernym"]).split("#")[0] ) freq = int(row["freq"]) hypo_lemma = lemmatize(hypo).lower() hyper_lemma = lemmatize(hyper).lower() if hypo not in hypo2hyper or hyper not in hypo2hyper[hypo]: hypo2hyper[hypo][hyper] = freq else: hypo2hyper[hypo][hyper] += freq if hypo_lemma not in hypo2hyper or hyper_lemma not in hypo2hyper[hypo_lemma]: hypo2hyper[hypo_lemma][hyper_lemma] = freq else: hypo2hyper[hypo_lemma][hyper_lemma] += freq if hyper not in hyper2hypo or hypo not in hyper2hypo[hyper]: hyper2hypo[hyper][hypo] = freq else: hyper2hypo[hyper][hypo] += freq if hyper_lemma not in hyper2hypo or hypo_lemma not in hyper2hypo[hyper_lemma]: 
hyper2hypo[hyper_lemma][hypo_lemma] = freq else: hyper2hypo[hyper_lemma][hypo_lemma] += freq except: print "Bad row:", row print format_exc() print "dictionary is loaded:", len(hypo2hyper) if use_pickle: pkl = {"hypo2hyper": hypo2hyper, "hyper2hypo": hyper2hypo} pickle.dump(pkl, open(isas_pkl_fpath, "wb")) print "Pickled voc:", isas_pkl_fpath print "Loaded %d words from: %s" % (len(hypo2hyper), isas_pkl_fpath if isas_pkl_fpath else isas_fpath) self._hypo2hyper = hypo2hyper self._hyper2hypo = hyper2hypo