Example #1
0
    def __init__(self, freq_fpath, min_freq=1, preprocess=True, sep='\t', strip_pos=True, use_pickle=True):
        """ Reads a word frequency list in CSV format "word<TAB>freq" """

        if not exists(freq_fpath):
            self._freq = {}
            return

        pkl_fpath = freq_fpath + ".pkl"
        if use_pickle and exists(pkl_fpath):
            voc = pickle.load(open(pkl_fpath, "rb"))
        else:
            # load words to datafame
            if preprocess:
                freq_cln_fpath = freq_fpath + "-cln"
                preprocess_pandas_csv(freq_fpath, freq_cln_fpath)
                word_df = read_csv(freq_cln_fpath, sep, encoding='utf8', error_bad_lines=False)
                try_remove(freq_cln_fpath)
            else:
                word_df = read_csv(freq_fpath, sep, encoding='utf8', error_bad_lines=False)

            # load from dataframe to dictionary
            word_df = word_df.drop(word_df[word_df["freq"] < min_freq].index)
            if strip_pos:
                voc = {}
                for i, row in word_df.iterrows():
                    try:
                        word = unicode(row["word"]).split("#")[0]
                        freq = int(row["freq"])
                        if word not in voc or voc[word] < freq: voc[word] = freq
                    except:
                        print "Bad row:", row
                        print format_exc()
            else:
                voc = { row["word"]: row["freq"] for i, row in word_df.iterrows() }

            print "dictionary is loaded:", len(voc)

            if use_pickle:
                pickle.dump(voc, open(pkl_fpath, "wb"))
                print "Pickled voc:", pkl_fpath

        print "Loaded %d words from: %s" % (len(voc), pkl_fpath if pkl_fpath else freq_fpath)

        self._freq = voc
Example #2
0
File: taxo.py  Project: binarymax/taxi
    def fill_direct_isas(self, subphrases=False):
        # get direct hypernyms of different isas: model_name -> (hypo, hyper) -> weight
        hypo2hyper_freq = defaultdict(dict)  # raw frequency
        hypo2hyper_inorm = defaultdict(
            dict)  # in-voc norm: divide by max invoc frequency
        hypo2hyper_anorm = defaultdict(
            dict)  # absolute norm: divide by max frequency per word

        for isa_name in self._isas:
            print isa_name, len(self._isas[isa_name].data)
            for hypo in self.voc:
                # find hypernyms
                hypers_list = self._isas[isa_name].all_hyper(hypo)
                hypers_dict = {hyper: freq for hyper, freq in hypers_list}
                invoc_hypers_dict = {
                    w: hypers_dict[w]
                    for w in set(hypers_dict.keys()).intersection(self.voc)
                }
                invoc_hypers_dict.pop(hypo, None)
                invoc_hypers_list = sorted(invoc_hypers_dict.items(),
                                           key=operator.itemgetter(1),
                                           reverse=True)

                if VERBOSE:
                    if len(invoc_hypers_list) > 0:
                        print hypo, len(hypers_dict), len(
                            invoc_hypers_list), ", ".join(
                                w + ":" + unicode(freq)
                                for w, freq in invoc_hypers_list)
                    # print len(invoc_hypers_list),

                # find hypernyms of subphrases
                if len(invoc_hypers_list) == 0 and subphrases:
                    for hypo_subphrase in self._subphrases(hypo):
                        hypers_list = self._isas[isa_name].all_hyper(
                            hypo_subphrase)
                        hypers_dict = {
                            hyper: freq
                            for hyper, freq in hypers_list
                        }
                        invoc_hypers_dict = {
                            w: hypers_dict[w]
                            for w in set(hypers_dict.keys()).intersection(
                                self.voc)
                        }
                        invoc_hypers_dict.pop(hypo_subphrase, None)
                        invoc_hypers_list = sorted(invoc_hypers_dict.items(),
                                                   key=operator.itemgetter(1),
                                                   reverse=True)
                        if (invoc_hypers_list) > 0: break
                    if len(invoc_hypers_list) == 0:
                        continue
                    elif VERBOSE:
                        print hypo, "-->", hypo_subphrase, ":", invoc_hypers_list
                elif len(invoc_hypers_list) == 0:
                    continue

                # normalize
                max_freq = float(hypers_list[0][1])
                invoc_max_freq = float(invoc_hypers_list[0][1])
                for hyper, freq in invoc_hypers_list:
                    hypo2hyper_freq[isa_name][(hypo, hyper)] = freq
                    hypo2hyper_anorm[isa_name][(hypo, hyper)] = freq / max_freq
                    hypo2hyper_inorm[isa_name][(hypo,
                                                hyper)] = freq / invoc_max_freq
        # average: (hypo, hyper) -> weight
        hypo2hyper_iavg = self._average(hypo2hyper_inorm)
        hypo2hyper_aavg = self._average(hypo2hyper_anorm)

        # initialize arrays
        hyper2hypo_iavg_arr = np.zeros(len(self._relations))
        hypo2hyper_iavg_arr = np.zeros(len(self._relations))
        hypo2hyper_iavg2_arr = np.zeros(len(self._relations))
        hyper2hypo_iavg2_arr = np.zeros(len(self._relations))
        hypo2hyper_aavg_arr = np.zeros(len(self._relations))
        hyper2hypo_aavg_arr = np.zeros(len(self._relations))
        hypo2hyper_arr = {}
        hyper2hypo_arr = {}
        for isa_name in hypo2hyper_inorm:
            hypo2hyper_arr[isa_name] = np.zeros(len(self._relations))
            hyper2hypo_arr[isa_name] = np.zeros(len(self._relations))

        # fill the arrays
        for i, row in self._relations.iterrows():
            if i != 0 and i % 100000 == 0: print i
            hypo2hyper_iavg_arr[i] = hypo2hyper_iavg.pop(
                (row.hyponym, row.hypernym), 0)
            hyper2hypo_iavg_arr[i] = hypo2hyper_iavg.pop(
                (row.hypernym, row.hyponym), 0)
            hypo2hyper_aavg_arr[i] = hypo2hyper_aavg.pop(
                (row.hyponym, row.hypernym), 0)
            hyper2hypo_aavg_arr[i] = hypo2hyper_aavg.pop(
                (row.hypernym, row.hyponym), 0)
            for isa_name in hypo2hyper_inorm:
                hypo2hyper_arr[isa_name][i] = hypo2hyper_freq[isa_name].pop(
                    (row.hyponym, row.hypernym), 0)
                hyper2hypo_arr[isa_name][i] = hypo2hyper_freq[isa_name].pop(
                    (row.hypernym, row.hyponym), 0)

        # insert arrays as columns
        s = "_s" if subphrases else ""
        for isa_name in hypo2hyper_inorm:
            col = "hypo2hyper_" + isa_name + s
            self._relations[col] = Series(hypo2hyper_arr[isa_name],
                                          index=self._relations.index)
            hypo2hyper_iavg2_arr += self._relations[col] / self._relations[
                col].max()

            col = "hyper2hypo_" + isa_name + s
            self._relations[col] = Series(hyper2hypo_arr[isa_name],
                                          index=self._relations.index)
            hyper2hypo_iavg2_arr += self._relations[col] / self._relations[
                col].max()

        self._relations["hypo2hyper" + s] = Series(hypo2hyper_iavg_arr,
                                                   index=self._relations.index)
        self._relations["hyper2hypo" + s] = Series(hyper2hypo_iavg_arr,
                                                   index=self._relations.index)
        self._relations["hypo2hyper2" + s] = Series(
            hypo2hyper_iavg2_arr, index=self._relations.index)
        self._relations["hyper2hypo2" + s] = Series(
            hyper2hypo_iavg2_arr, index=self._relations.index)
        self._relations["hypo2hyper3" + s] = Series(
            hypo2hyper_aavg_arr, index=self._relations.index)
        self._relations["hyper2hypo3" + s] = Series(
            hyper2hypo_aavg_arr, index=self._relations.index)
        self._save_relations()

        # debug info
        debug_fpath = self._relations_fpath + "-direct-hypo2hyper" + s + ".csv"
        tmp_fpath = debug_fpath + ".tmp"
        with codecs.open(tmp_fpath, "w", "utf-8") as out:
            print >> out, "hyponym\thypernym\tfreq"
            for hypo, hyper in hypo2hyper_iavg:
                print >> out, "%s\t%s\t%.3f" % (hypo, hyper,
                                                hypo2hyper_iavg[(hypo, hyper)])
        df = read_csv(tmp_fpath,
                      encoding='utf-8',
                      delimiter="\t",
                      error_bad_lines=False)
        df = df.sort(["hyponym", "freq"], ascending=[1, 0])
        df.to_csv(debug_fpath,
                  sep="\t",
                  encoding="utf-8",
                  float_format='%.3f',
                  index=False)
        try_remove(tmp_fpath)
        print "Direct hypernyms:", debug_fpath
Example #3
0
File: freq.py  Project: shannonyu/taxi
    def __init__(self,
                 freq_fpath,
                 min_freq=1,
                 preprocess=True,
                 sep='\t',
                 strip_pos=True,
                 use_pickle=True):
        """ Reads a word frequency list in CSV format "word<TAB>freq" """

        if not exists(freq_fpath):
            self._freq = {}
            return

        pkl_fpath = freq_fpath + ".pkl"
        if use_pickle and exists(pkl_fpath):
            voc = pickle.load(open(pkl_fpath, "rb"))
        else:
            # load words to datafame
            if preprocess:
                freq_cln_fpath = freq_fpath + "-cln"
                preprocess_pandas_csv(freq_fpath, freq_cln_fpath)
                word_df = read_csv(freq_cln_fpath,
                                   sep,
                                   encoding='utf8',
                                   error_bad_lines=False)
                try_remove(freq_cln_fpath)
            else:
                word_df = read_csv(freq_fpath,
                                   sep,
                                   encoding='utf8',
                                   error_bad_lines=False)

            # load from dataframe to dictionary
            word_df = word_df.drop(word_df[word_df["freq"] < min_freq].index)
            if strip_pos:
                voc = {}
                for i, row in word_df.iterrows():
                    try:
                        word = unicode(row["word"]).split("#")[0]
                        freq = int(row["freq"])
                        if word not in voc or voc[word] < freq:
                            voc[word] = freq
                    except:
                        print "Bad row:", row
                        print format_exc()
            else:
                voc = {
                    row["word"]: row["freq"]
                    for i, row in word_df.iterrows()
                }

            print "dictionary is loaded:", len(voc)

            if use_pickle:
                pickle.dump(voc, open(pkl_fpath, "wb"))
                print "Pickled voc:", pkl_fpath

        print "Loaded %d words from: %s" % (len(voc), pkl_fpath
                                            if pkl_fpath else freq_fpath)

        self._freq = voc
Example #4
0
File: taxo.py  Project: anukat2015/taxi
    def fill_direct_isas(self, subphrases=False):
        # get direct hypernyms of different isas: model_name -> (hypo, hyper) -> weight
        hypo2hyper_freq = defaultdict(dict)   # raw frequency
        hypo2hyper_inorm = defaultdict(dict)  # in-voc norm: divide by max invoc frequency
        hypo2hyper_anorm = defaultdict(dict)  # absolute norm: divide by max frequency per word

        for isa_name in self._isas:
            print isa_name, len(self._isas[isa_name].data)
            for hypo in self.voc:
                # find hypernyms
                hypers_list = self._isas[isa_name].all_hyper(hypo)
                hypers_dict = {hyper: freq for hyper, freq in hypers_list}
                invoc_hypers_dict = {w: hypers_dict[w] for w in set(hypers_dict.keys()).intersection(self.voc)}
                invoc_hypers_dict.pop(hypo, None)
                invoc_hypers_list = sorted(invoc_hypers_dict.items(), key=operator.itemgetter(1), reverse=True)
                
                if VERBOSE:
                    if len(invoc_hypers_list) > 0:
                        print hypo, len(hypers_dict), len(invoc_hypers_list), ", ".join(w + ":" + unicode(freq) for w, freq in invoc_hypers_list)
                    # print len(invoc_hypers_list),
                
                # find hypernyms of subphrases
                if len(invoc_hypers_list) == 0 and subphrases:
                    for hypo_subphrase in self._subphrases(hypo):
                        hypers_list = self._isas[isa_name].all_hyper(hypo_subphrase)
                        hypers_dict = {hyper: freq for hyper, freq in hypers_list}
                        invoc_hypers_dict = {w: hypers_dict[w] for w in set(hypers_dict.keys()).intersection(self.voc)}
                        invoc_hypers_dict.pop(hypo_subphrase, None)
                        invoc_hypers_list = sorted(invoc_hypers_dict.items(), key=operator.itemgetter(1), reverse=True)
                        if (invoc_hypers_list) > 0: break
                    if len(invoc_hypers_list) == 0:
                        continue
                    elif VERBOSE:
                        print hypo, "-->", hypo_subphrase, ":", invoc_hypers_list
                elif len(invoc_hypers_list) == 0:
                    continue
                
                # normalize
                max_freq = float(hypers_list[0][1])
                invoc_max_freq = float(invoc_hypers_list[0][1])
                for hyper, freq in invoc_hypers_list:
                    hypo2hyper_freq[isa_name][(hypo, hyper)] = freq
                    hypo2hyper_anorm[isa_name][(hypo, hyper)] = freq/max_freq
                    hypo2hyper_inorm[isa_name][(hypo, hyper)] = freq/invoc_max_freq
        # average: (hypo, hyper) -> weight
        hypo2hyper_iavg = self._average(hypo2hyper_inorm)
        hypo2hyper_aavg = self._average(hypo2hyper_anorm)

        # initialize arrays
        hyper2hypo_iavg_arr = np.zeros(len(self._relations))
        hypo2hyper_iavg_arr = np.zeros(len(self._relations))
        hypo2hyper_iavg2_arr = np.zeros(len(self._relations))
        hyper2hypo_iavg2_arr = np.zeros(len(self._relations))
        hypo2hyper_aavg_arr = np.zeros(len(self._relations))
        hyper2hypo_aavg_arr = np.zeros(len(self._relations))
        hypo2hyper_arr = {}
        hyper2hypo_arr = {}
        for isa_name in hypo2hyper_inorm:
            hypo2hyper_arr[isa_name] = np.zeros(len(self._relations))
            hyper2hypo_arr[isa_name] = np.zeros(len(self._relations))

        # fill the arrays
        for i, row in self._relations.iterrows():
            if i != 0 and i % 100000 == 0: print i
            hypo2hyper_iavg_arr[i] = hypo2hyper_iavg.pop((row.hyponym, row.hypernym), 0)
            hyper2hypo_iavg_arr[i] = hypo2hyper_iavg.pop((row.hypernym, row.hyponym), 0)
            hypo2hyper_aavg_arr[i] = hypo2hyper_aavg.pop((row.hyponym, row.hypernym), 0)
            hyper2hypo_aavg_arr[i] = hypo2hyper_aavg.pop((row.hypernym, row.hyponym), 0)
            for isa_name in hypo2hyper_inorm:
                hypo2hyper_arr[isa_name][i] = hypo2hyper_freq[isa_name].pop((row.hyponym, row.hypernym), 0)
                hyper2hypo_arr[isa_name][i] = hypo2hyper_freq[isa_name].pop((row.hypernym, row.hyponym), 0)

        # insert arrays as columns
        s = "_s" if subphrases else ""
        for isa_name in hypo2hyper_inorm:
            col = "hypo2hyper_" + isa_name + s
            self._relations[col] = Series(hypo2hyper_arr[isa_name], index=self._relations.index)
            hypo2hyper_iavg2_arr += self._relations[col] / self._relations[col].max()

            col = "hyper2hypo_" + isa_name + s
            self._relations[col] = Series(hyper2hypo_arr[isa_name], index=self._relations.index)
            hyper2hypo_iavg2_arr += self._relations[col] / self._relations[col].max()

        self._relations["hypo2hyper" + s] = Series(hypo2hyper_iavg_arr, index=self._relations.index)
        self._relations["hyper2hypo" + s] = Series(hyper2hypo_iavg_arr, index=self._relations.index)
        self._relations["hypo2hyper2" + s] = Series(hypo2hyper_iavg2_arr, index=self._relations.index)
        self._relations["hyper2hypo2" + s] = Series(hyper2hypo_iavg2_arr, index=self._relations.index)
        self._relations["hypo2hyper3" + s] = Series(hypo2hyper_aavg_arr, index=self._relations.index)
        self._relations["hyper2hypo3" + s] = Series(hyper2hypo_aavg_arr, index=self._relations.index)
        self._save_relations()        
        
        # debug info
        debug_fpath = self._relations_fpath + "-direct-hypo2hyper" + s + ".csv"
        tmp_fpath = debug_fpath + ".tmp"
        with codecs.open(tmp_fpath, "w", "utf-8") as out:
            print >> out, "hyponym\thypernym\tfreq"
            for hypo, hyper in hypo2hyper_iavg: print >> out, "%s\t%s\t%.3f" % (hypo, hyper, hypo2hyper_iavg[(hypo, hyper)])
        df = read_csv(tmp_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False)
        df = df.sort(["hyponym","freq"], ascending=[1,0])
        df.to_csv(debug_fpath, sep="\t", encoding="utf-8", float_format='%.3f', index=False)
        try_remove(tmp_fpath)
        print "Direct hypernyms:", debug_fpath
Example #5
0
File: isas.py  Project: shannonyu/taxi
    def __init__(self,
                 isas_fpath,
                 min_freq=1,
                 preprocess=True,
                 sep='\t',
                 strip_pos=True,
                 use_pickle=True,
                 lowercase=True):
        """ Provides access to a ISAs relations from a CSV file "hyponym<TAB>hypernym<TAB>freq".

        isas_fpath -- path to the CSV file; if it does not exist both
            mappings are left empty.
        min_freq -- rows with freq below this threshold are dropped.
        preprocess -- clean the CSV with preprocess_pandas_csv before parsing.
        sep -- column separator of the CSV file.
        strip_pos -- unused here; kept for interface compatibility.
        use_pickle -- load from / save to a pickled cache next to isas_fpath.
        lowercase -- lowercase hyponyms/hypernyms after stripping "#pos".
        """

        if not exists(isas_fpath):
            self._hypo2hyper = {}
            # bug fix: _hyper2hypo was never set on this path, which made
            # later attribute access fail with AttributeError
            self._hyper2hypo = {}
            return

        isas_pkl_fpath = isas_fpath + ".pkl"
        # Remember whether the cache was actually used so the summary below
        # reports the real source: the old "isas_pkl_fpath if isas_pkl_fpath"
        # check was always true because the path string is non-empty.
        loaded_from_pickle = use_pickle and exists(isas_pkl_fpath)
        if loaded_from_pickle:
            pkl = pickle.load(open(isas_pkl_fpath, "rb"))
            if "hypo2hyper" in pkl:
                hypo2hyper = pkl["hypo2hyper"]
            else:
                print "Error: cannot find hypo2hyper in ", isas_pkl_fpath
                hypo2hyper = {}

            if "hyper2hypo" in pkl:
                hyper2hypo = pkl["hyper2hypo"]
            else:
                print "Error: cannot find hyper2hypo in ", isas_pkl_fpath
                hyper2hypo = {}

        else:
            if preprocess:
                isas_cln_fpath = isas_fpath + ".cleaned"
                preprocess_pandas_csv(isas_fpath, isas_cln_fpath)
                isas_df = read_csv(isas_cln_fpath,
                                   sep,
                                   encoding='utf8',
                                   error_bad_lines=False)
                try_remove(isas_cln_fpath)
            else:
                isas_df = read_csv(isas_fpath,
                                   sep,
                                   encoding='utf8',
                                   error_bad_lines=False)

            isas_df = isas_df.drop(isas_df[isas_df["freq"] < min_freq].index)
            hypo2hyper = defaultdict(dict)
            hyper2hypo = defaultdict(dict)
            for i, row in isas_df.iterrows():
                try:
                    hypo = unicode(row["hyponym"]).split("#")[0].lower(
                    ) if lowercase else unicode(row["hyponym"]).split("#")[0]
                    hyper = unicode(row["hypernym"]).split("#")[0].lower(
                    ) if lowercase else unicode(row["hypernym"]).split("#")[0]
                    freq = int(row["freq"])
                    hypo_lemma = lemmatize(hypo).lower()
                    hyper_lemma = lemmatize(hyper).lower()

                    # index both the surface forms and the lemmas
                    # NOTE(review): when a word equals its lemma the same pair is
                    # inserted twice (set then +=), doubling its count -- confirm
                    # whether this aggregation is intended
                    if hypo not in hypo2hyper or hyper not in hypo2hyper[hypo]:
                        hypo2hyper[hypo][hyper] = freq
                    else:
                        hypo2hyper[hypo][hyper] += freq
                    if hypo_lemma not in hypo2hyper or hyper_lemma not in hypo2hyper[
                            hypo_lemma]:
                        hypo2hyper[hypo_lemma][hyper_lemma] = freq
                    else:
                        hypo2hyper[hypo_lemma][hyper_lemma] += freq

                    if hyper not in hyper2hypo or hypo not in hyper2hypo[hyper]:
                        hyper2hypo[hyper][hypo] = freq
                    else:
                        hyper2hypo[hyper][hypo] += freq
                    if hyper_lemma not in hyper2hypo or hypo_lemma not in hyper2hypo[
                            hyper_lemma]:
                        hyper2hypo[hyper_lemma][hypo_lemma] = freq
                    else:
                        hyper2hypo[hyper_lemma][hypo_lemma] += freq

                except:
                    print "Bad row:", row
                    print format_exc()

            print "dictionary is loaded:", len(hypo2hyper)

            if use_pickle:
                pkl = {"hypo2hyper": hypo2hyper, "hyper2hypo": hyper2hypo}
                pickle.dump(pkl, open(isas_pkl_fpath, "wb"))
                print "Pickled voc:", isas_pkl_fpath

        print "Loaded %d words from: %s" % (len(hypo2hyper), isas_pkl_fpath
                                            if loaded_from_pickle else isas_fpath)

        self._hypo2hyper = hypo2hyper
        self._hyper2hypo = hyper2hypo
Example #6
0
    def __init__(
        self, isas_fpath, min_freq=1, preprocess=True, sep="\t", strip_pos=True, use_pickle=True, lowercase=True
    ):
        """ Provides access to a ISAs relations from a CSV file "hyponym<TAB>hypernym<TAB>freq".

        isas_fpath -- path to the CSV file; if it does not exist both
            mappings are left empty.
        min_freq -- rows with freq below this threshold are dropped.
        preprocess -- clean the CSV with preprocess_pandas_csv before parsing.
        sep -- column separator of the CSV file.
        strip_pos -- unused here; kept for interface compatibility.
        use_pickle -- load from / save to a pickled cache next to isas_fpath.
        lowercase -- lowercase hyponyms/hypernyms after stripping "#pos".
        """

        if not exists(isas_fpath):
            self._hypo2hyper = {}
            # bug fix: _hyper2hypo was never set on this path, which made
            # later attribute access fail with AttributeError
            self._hyper2hypo = {}
            return

        isas_pkl_fpath = isas_fpath + ".pkl"
        # Remember whether the cache was actually used so the summary below
        # reports the real source: the old "isas_pkl_fpath if isas_pkl_fpath"
        # check was always true because the path string is non-empty.
        loaded_from_pickle = use_pickle and exists(isas_pkl_fpath)
        if loaded_from_pickle:
            pkl = pickle.load(open(isas_pkl_fpath, "rb"))
            if "hypo2hyper" in pkl:
                hypo2hyper = pkl["hypo2hyper"]
            else:
                print "Error: cannot find hypo2hyper in ", isas_pkl_fpath
                hypo2hyper = {}

            if "hyper2hypo" in pkl:
                hyper2hypo = pkl["hyper2hypo"]
            else:
                print "Error: cannot find hyper2hypo in ", isas_pkl_fpath
                hyper2hypo = {}

        else:
            if preprocess:
                isas_cln_fpath = isas_fpath + ".cleaned"
                preprocess_pandas_csv(isas_fpath, isas_cln_fpath)
                isas_df = read_csv(isas_cln_fpath, sep, encoding="utf8", error_bad_lines=False)
                try_remove(isas_cln_fpath)
            else:
                isas_df = read_csv(isas_fpath, sep, encoding="utf8", error_bad_lines=False)

            isas_df = isas_df.drop(isas_df[isas_df["freq"] < min_freq].index)
            hypo2hyper = defaultdict(dict)
            hyper2hypo = defaultdict(dict)
            for i, row in isas_df.iterrows():
                try:
                    hypo = (
                        unicode(row["hyponym"]).split("#")[0].lower()
                        if lowercase
                        else unicode(row["hyponym"]).split("#")[0]
                    )
                    hyper = (
                        unicode(row["hypernym"]).split("#")[0].lower()
                        if lowercase
                        else unicode(row["hypernym"]).split("#")[0]
                    )
                    freq = int(row["freq"])
                    hypo_lemma = lemmatize(hypo).lower()
                    hyper_lemma = lemmatize(hyper).lower()

                    # index both the surface forms and the lemmas
                    # NOTE(review): when a word equals its lemma the same pair is
                    # inserted twice (set then +=), doubling its count -- confirm
                    # whether this aggregation is intended
                    if hypo not in hypo2hyper or hyper not in hypo2hyper[hypo]:
                        hypo2hyper[hypo][hyper] = freq
                    else:
                        hypo2hyper[hypo][hyper] += freq
                    if hypo_lemma not in hypo2hyper or hyper_lemma not in hypo2hyper[hypo_lemma]:
                        hypo2hyper[hypo_lemma][hyper_lemma] = freq
                    else:
                        hypo2hyper[hypo_lemma][hyper_lemma] += freq

                    if hyper not in hyper2hypo or hypo not in hyper2hypo[hyper]:
                        hyper2hypo[hyper][hypo] = freq
                    else:
                        hyper2hypo[hyper][hypo] += freq
                    if hyper_lemma not in hyper2hypo or hypo_lemma not in hyper2hypo[hyper_lemma]:
                        hyper2hypo[hyper_lemma][hypo_lemma] = freq
                    else:
                        hyper2hypo[hyper_lemma][hypo_lemma] += freq

                except:
                    print "Bad row:", row
                    print format_exc()

            print "dictionary is loaded:", len(hypo2hyper)

            if use_pickle:
                pkl = {"hypo2hyper": hypo2hyper, "hyper2hypo": hyper2hypo}
                pickle.dump(pkl, open(isas_pkl_fpath, "wb"))
                print "Pickled voc:", isas_pkl_fpath

        print "Loaded %d words from: %s" % (len(hypo2hyper), isas_pkl_fpath if loaded_from_pickle else isas_fpath)

        self._hypo2hyper = hypo2hyper
        self._hyper2hypo = hyper2hypo