示例#1
0
    def has_relation(self, hypo, hyper):
        """Return the strongest stored frequency linking ``hypo`` to ``hyper``.

        Both terms are converted to unicode and expanded into up to four
        variants each: as given, lowercased, lemmatized, and lemmatized then
        lowercased. Every (hypo variant, hyper variant) pair is looked up in
        ``self._hypo2hyper``; the largest frequency found is returned, or 0
        when no pair is present.
        """
        hypo_text = unicode(hypo)
        hyper_text = unicode(hyper)
        hypo_lemma = lemmatize(hypo_text)
        hyper_lemma = lemmatize(hyper_text)

        hypo_forms = {hypo_text, hypo_text.lower(), hypo_lemma, hypo_lemma.lower()}
        hyper_forms = {hyper_text, hyper_text.lower(), hyper_lemma, hyper_lemma.lower()}

        matches = [
            self._hypo2hyper[hypo_form][hyper_form]
            for hypo_form in hypo_forms
            if hypo_form in self._hypo2hyper
            for hyper_form in hyper_forms
            if hyper_form in self._hypo2hyper[hypo_form]
        ]

        # The leading 0 is the result when nothing matched.
        return max([0] + matches)
示例#2
0
文件: taxo.py 项目: binarymax/taxi
    def _str_in_str(self, substr, supstr):
        """Score how much of ``supstr`` is covered by ``substr``.

        Both arguments are converted to unicode and lowercased. The match is
        attempted in this order, stopping at the first hit:

        1. ``substr`` inside ``supstr``
        2. lemmatized ``substr`` inside ``supstr``
        3. lemmatized ``substr`` inside lemmatized ``supstr``
        4. ``substr`` inside lemmatized ``supstr``

        Returns:
            A ``(ratio, index)`` tuple. ``ratio`` is
            ``len(substr) / len(supstr)`` computed on the lowercased
            (non-lemmatized) strings, and ``index`` is the position reported
            by the ``find`` call that matched (so it may refer to the
            lemmatized ``supstr``). When nothing matches, or when ``substr``
            is shorter than 5 characters, ``(0, -1)`` is returned.
        """
        substr = unicode(substr).lower()
        supstr = unicode(supstr).lower()

        # Short substrings are never scored. Return the same (score, index)
        # shape as every other path; previously this returned a bare 0.
        if len(substr) < 5:
            return 0, -1

        index = supstr.find(substr)
        if index == -1:
            substr_l = lemmatize(substr)
            index = supstr.find(substr_l)
        if index == -1:
            # Only lemmatize the longer string once the cheaper checks failed.
            supstr_l = lemmatize(supstr)
            index = supstr_l.find(substr_l)
            if index == -1:
                index = supstr_l.find(substr)
        if index == -1:
            return 0, index

        return float(len(substr)) / float(len(supstr)), index
示例#3
0
文件: taxo.py 项目: anukat2015/taxi
    def _str_in_str(self, substr, supstr):
        """Score how much of ``supstr`` is covered by ``substr``.

        Both arguments are converted to unicode and lowercased. The match is
        attempted in this order, stopping at the first hit:

        1. ``substr`` inside ``supstr``
        2. lemmatized ``substr`` inside ``supstr``
        3. lemmatized ``substr`` inside lemmatized ``supstr``
        4. ``substr`` inside lemmatized ``supstr``

        Returns:
            A ``(ratio, index)`` tuple. ``ratio`` is
            ``len(substr) / len(supstr)`` computed on the lowercased
            (non-lemmatized) strings, and ``index`` is the position reported
            by the ``find`` call that matched (so it may refer to the
            lemmatized ``supstr``). When nothing matches, or when ``substr``
            is shorter than 5 characters, ``(0, -1)`` is returned.
        """
        substr = unicode(substr).lower()
        supstr = unicode(supstr).lower()

        # Short substrings are never scored. Return the same (score, index)
        # shape as every other path; previously this returned a bare 0.
        if len(substr) < 5:
            return 0, -1

        index = supstr.find(substr)
        if index == -1:
            substr_l = lemmatize(substr)
            index = supstr.find(substr_l)
        if index == -1:
            # Only lemmatize the longer string once the cheaper checks failed.
            supstr_l = lemmatize(supstr)
            index = supstr_l.find(substr_l)
            if index == -1:
                index = supstr_l.find(substr)
        if index == -1:
            return 0, index

        return float(len(substr)) / float(len(supstr)), index
示例#4
0
文件: isas.py 项目: shannonyu/taxi
    def has_relation(self, hypo, hyper):
        """Look up the best frequency for a hyponym/hypernym pair.

        Each term is converted to unicode and tried as given, lowercased,
        lemmatized, and lemmatized then lowercased. The maximum frequency
        stored in ``self._hypo2hyper`` over all variant combinations is
        returned; 0 means no combination is known.
        """
        surface_hypo = unicode(hypo)
        surface_hyper = unicode(hyper)
        lemma_hypo = lemmatize(surface_hypo)
        lemma_hyper = lemmatize(surface_hyper)

        hypo_forms = {
            surface_hypo,
            surface_hypo.lower(),
            lemma_hypo,
            lemma_hypo.lower(),
        }
        hyper_forms = {
            surface_hyper,
            surface_hyper.lower(),
            lemma_hyper,
            lemma_hyper.lower(),
        }

        # Seeded with 0 so that max() has a value when nothing matches.
        found = [0]
        for hypo_form in hypo_forms:
            if hypo_form not in self._hypo2hyper:
                continue
            known_hypers = self._hypo2hyper[hypo_form]
            found.extend(
                known_hypers[hyper_form]
                for hyper_form in hyper_forms
                if hyper_form in known_hypers)

        return max(found)
示例#5
0
文件: taxo.py 项目: binarymax/taxi
 def _is_identical(self, str1, str2):
     """Tell whether two strings have the same lemmatized form.

     Every match of the ``re_dash`` pattern is replaced by a single space
     in both strings before they are lemmatized and compared.
     """
     spaced = [re_dash.sub(u" ", text) for text in (str1, str2)]
     first_lemma, second_lemma = [lemmatize(text) for text in spaced]
     return first_lemma == second_lemma
示例#6
0
文件: taxo.py 项目: anukat2015/taxi
 def _is_identical(self, str1, str2):
     """Compare two strings after dash replacement and lemmatization.

     Matches of ``re_dash`` in each string are substituted with one space;
     the results are lemmatized and the method returns whether the two
     lemmatized strings are equal.
     """
     undashed_first = re_dash.sub(u" ", str1)
     undashed_second = re_dash.sub(u" ", str2)
     lemma_first = lemmatize(undashed_first)
     lemma_second = lemmatize(undashed_second)
     return lemma_first == lemma_second
示例#7
0
文件: isas.py 项目: shannonyu/taxi
    def __init__(self,
                 isas_fpath,
                 min_freq=1,
                 preprocess=True,
                 sep='\t',
                 strip_pos=True,
                 use_pickle=True,
                 lowercase=True):
        """Provide access to ISA relations from a CSV file "hyponym<TAB>hypernym<TAB>freq".

        Builds two nested dictionaries, ``self._hypo2hyper`` (hyponym ->
        hypernym -> frequency) and ``self._hyper2hypo`` (the reverse), from
        the ``hyponym``, ``hypernym`` and ``freq`` columns. Each row is
        counted once under its surface form and once under its lowercased
        lemmatized form.

        Args:
            isas_fpath: path to the CSV file. If it does not exist, both
                dictionaries are left empty.
            min_freq: rows whose ``freq`` is below this value are dropped.
            preprocess: if true, ``preprocess_pandas_csv`` writes a cleaned
                copy to ``isas_fpath + ".cleaned"``, which is read and then
                removed.
            sep: field separator handed to ``read_csv``.
            strip_pos: currently unused; everything from the first "#" on is
                always removed from both terms.
            use_pickle: if true, load from ``isas_fpath + ".pkl"`` when that
                file exists, otherwise write it after parsing the CSV.
            lowercase: if true, lowercase both terms of every row.
        """

        if not exists(isas_fpath):
            # Define both maps so the instance has the same attributes as
            # one built from a real file.
            self._hypo2hyper = {}
            self._hyper2hypo = {}
            return

        isas_pkl_fpath = isas_fpath + ".pkl"
        if use_pickle and exists(isas_pkl_fpath):
            # NOTE(review): unpickling can execute arbitrary code; the cache
            # file must come from a trusted location.
            with open(isas_pkl_fpath, "rb") as pkl_file:
                pkl = pickle.load(pkl_file)

            if "hypo2hyper" in pkl:
                hypo2hyper = pkl["hypo2hyper"]
            else:
                print "Error: cannot find hypo2hyper in ", isas_pkl_fpath
                hypo2hyper = {}

            if "hyper2hypo" in pkl:
                hyper2hypo = pkl["hyper2hypo"]
            else:
                print "Error: cannot find hyper2hypo in ", isas_pkl_fpath
                hyper2hypo = {}

            loaded_from = isas_pkl_fpath

        else:
            if preprocess:
                isas_cln_fpath = isas_fpath + ".cleaned"
                preprocess_pandas_csv(isas_fpath, isas_cln_fpath)
                try:
                    isas_df = read_csv(isas_cln_fpath,
                                       sep,
                                       encoding='utf8',
                                       error_bad_lines=False)
                finally:
                    # Remove the temporary copy even if parsing failed.
                    try_remove(isas_cln_fpath)
            else:
                isas_df = read_csv(isas_fpath,
                                   sep,
                                   encoding='utf8',
                                   error_bad_lines=False)

            isas_df = isas_df.drop(isas_df[isas_df["freq"] < min_freq].index)
            hypo2hyper = defaultdict(dict)
            hyper2hypo = defaultdict(dict)
            for _, row in isas_df.iterrows():
                try:
                    hypo = unicode(row["hyponym"]).split("#")[0]
                    hyper = unicode(row["hypernym"]).split("#")[0]
                    if lowercase:
                        hypo = hypo.lower()
                        hyper = hyper.lower()
                    freq = int(row["freq"])
                    hypo_lemma = lemmatize(hypo).lower()
                    hyper_lemma = lemmatize(hyper).lower()

                    # NOTE(review): when a term already equals its lemma the
                    # same pair is counted twice (original behaviour kept).
                    for src, dst in ((hypo, hyper), (hypo_lemma, hyper_lemma)):
                        hypo2hyper[src][dst] = hypo2hyper[src].get(dst, 0) + freq
                        hyper2hypo[dst][src] = hyper2hypo[dst].get(src, 0) + freq

                except Exception:
                    # Best effort: report the malformed row and keep going.
                    print "Bad row:", row
                    print format_exc()

            print "dictionary is loaded:", len(hypo2hyper)

            if use_pickle:
                pkl = {"hypo2hyper": hypo2hyper, "hyper2hypo": hyper2hypo}
                with open(isas_pkl_fpath, "wb") as pkl_file:
                    pickle.dump(pkl, pkl_file)
                print "Pickled voc:", isas_pkl_fpath

            loaded_from = isas_fpath

        # Report the file the data actually came from.
        print "Loaded %d words from: %s" % (len(hypo2hyper), loaded_from)

        self._hypo2hyper = hypo2hyper
        self._hyper2hypo = hyper2hypo
示例#8
0
    def __init__(
        self, isas_fpath, min_freq=1, preprocess=True, sep="\t", strip_pos=True, use_pickle=True, lowercase=True
    ):
        """Provide access to ISA relations from a CSV file "hyponym<TAB>hypernym<TAB>freq".

        Builds two nested dictionaries, ``self._hypo2hyper`` (hyponym ->
        hypernym -> frequency) and ``self._hyper2hypo`` (the reverse), from
        the ``hyponym``, ``hypernym`` and ``freq`` columns. Each row is
        counted once under its surface form and once under its lowercased
        lemmatized form.

        Args:
            isas_fpath: path to the CSV file. If it does not exist, both
                dictionaries are left empty.
            min_freq: rows whose ``freq`` is below this value are dropped.
            preprocess: if true, ``preprocess_pandas_csv`` writes a cleaned
                copy to ``isas_fpath + ".cleaned"``, which is read and then
                removed.
            sep: field separator handed to ``read_csv``.
            strip_pos: currently unused; everything from the first "#" on is
                always removed from both terms.
            use_pickle: if true, load from ``isas_fpath + ".pkl"`` when that
                file exists, otherwise write it after parsing the CSV.
            lowercase: if true, lowercase both terms of every row.
        """

        if not exists(isas_fpath):
            # Define both maps so the instance has the same attributes as
            # one built from a real file.
            self._hypo2hyper = {}
            self._hyper2hypo = {}
            return

        isas_pkl_fpath = isas_fpath + ".pkl"
        if use_pickle and exists(isas_pkl_fpath):
            # NOTE(review): unpickling can execute arbitrary code; the cache
            # file must come from a trusted location.
            with open(isas_pkl_fpath, "rb") as pkl_file:
                pkl = pickle.load(pkl_file)

            if "hypo2hyper" in pkl:
                hypo2hyper = pkl["hypo2hyper"]
            else:
                print "Error: cannot find hypo2hyper in ", isas_pkl_fpath
                hypo2hyper = {}

            if "hyper2hypo" in pkl:
                hyper2hypo = pkl["hyper2hypo"]
            else:
                print "Error: cannot find hyper2hypo in ", isas_pkl_fpath
                hyper2hypo = {}

            loaded_from = isas_pkl_fpath

        else:
            if preprocess:
                isas_cln_fpath = isas_fpath + ".cleaned"
                preprocess_pandas_csv(isas_fpath, isas_cln_fpath)
                try:
                    isas_df = read_csv(isas_cln_fpath, sep, encoding="utf8", error_bad_lines=False)
                finally:
                    # Remove the temporary copy even if parsing failed.
                    try_remove(isas_cln_fpath)
            else:
                isas_df = read_csv(isas_fpath, sep, encoding="utf8", error_bad_lines=False)

            isas_df = isas_df.drop(isas_df[isas_df["freq"] < min_freq].index)
            hypo2hyper = defaultdict(dict)
            hyper2hypo = defaultdict(dict)
            for _, row in isas_df.iterrows():
                try:
                    hypo = unicode(row["hyponym"]).split("#")[0]
                    hyper = unicode(row["hypernym"]).split("#")[0]
                    if lowercase:
                        hypo = hypo.lower()
                        hyper = hyper.lower()
                    freq = int(row["freq"])
                    hypo_lemma = lemmatize(hypo).lower()
                    hyper_lemma = lemmatize(hyper).lower()

                    # NOTE(review): when a term already equals its lemma the
                    # same pair is counted twice (original behaviour kept).
                    for src, dst in ((hypo, hyper), (hypo_lemma, hyper_lemma)):
                        hypo2hyper[src][dst] = hypo2hyper[src].get(dst, 0) + freq
                        hyper2hypo[dst][src] = hyper2hypo[dst].get(src, 0) + freq

                except Exception:
                    # Best effort: report the malformed row and keep going.
                    print "Bad row:", row
                    print format_exc()

            print "dictionary is loaded:", len(hypo2hyper)

            if use_pickle:
                pkl = {"hypo2hyper": hypo2hyper, "hyper2hypo": hyper2hypo}
                with open(isas_pkl_fpath, "wb") as pkl_file:
                    pickle.dump(pkl, pkl_file)
                print "Pickled voc:", isas_pkl_fpath

            loaded_from = isas_fpath

        # Report the file the data actually came from.
        print "Loaded %d words from: %s" % (len(hypo2hyper), loaded_from)

        self._hypo2hyper = hypo2hyper
        self._hyper2hypo = hyper2hypo