示例#1
0
文件: taxo.py 项目: anukat2015/taxi
    def __init__(self, taxonomy_resources, voc_fpath="", relations_fpath="", lang="en"):
        """Initialize from an existing relations file or from a vocabulary.

        If ``relations_fpath`` exists, it is read as a tab-separated file and
        the vocabulary becomes the union of its ``hyponym`` and ``hypernym``
        columns. Otherwise, if ``voc_fpath`` exists, the vocabulary is loaded
        with ``_load_voc`` and the relations are produced by
        ``_generate_relations`` for the path ``voc_fpath + "-relations.csv"``.

        Args:
            taxonomy_resources: object providing the ``isas`` and ``freqs``
                attributes stored on this instance.
            voc_fpath: path to the vocabulary file.
            relations_fpath: path to the relations file.
            lang: language code handed to ``load_stoplist``.

        Raises:
            Exception: if neither ``relations_fpath`` nor ``voc_fpath`` exists.
        """
        self._isas = taxonomy_resources.isas
        self._freqs = taxonomy_resources.freqs
        self.voc_name = fpath2filename(voc_fpath)
        self._voc_fpath = voc_fpath
        self._stopwords = load_stoplist(lang=lang)
        self._lang = lang

        # An existing relations file always wins; the vocabulary file is only
        # used to generate relations when no relations file is available.
        if exists(relations_fpath):
            print("Loading relations file: %s" % relations_fpath)
            self._relations_fpath = relations_fpath
            # NOTE: error_bad_lines was removed in pandas 2.0 (on_bad_lines
            # replaces it); kept for the pandas versions this code targets.
            self._relations = read_csv(relations_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False)
            print("Loaded %d relations from: %s" % (len(self._relations), relations_fpath))
            hypos_voc = set(self._relations["hyponym"])
            hyper_voc = set(self._relations["hypernym"])
            self.voc = hypos_voc.union(hyper_voc)
            print("Loaded %d voc from relations" % len(self.voc))
        elif exists(voc_fpath):
            self.voc = self._load_voc(voc_fpath)
            self._relations_fpath = voc_fpath + "-relations.csv"
            print("Generating new relations file: %s" % self._relations_fpath)
            self._relations = self._generate_relations(self.voc, self._relations_fpath)
        else:
            raise Exception("Error: cannot load relations or generate them. Specify either voc_fpath or relations_fpath.")
示例#2
0
文件: taxo.py 项目: anukat2015/taxi
    def __init__(self, freq_fpaths=None, isa_fpaths=None):
        """Load frequency and is-a dictionaries from the given files.

        Each dictionary is stored under the name that ``fpath2filename``
        returns for its path, in ``self._freqs`` and ``self._isas``.

        Args:
            freq_fpaths: iterable of paths, each loaded with
                ``FreqDictionary``. ``None`` (the default) loads nothing.
            isa_fpaths: iterable of paths, each loaded with ``ISAs``.
                ``None`` (the default) loads nothing.
        """
        # None instead of a mutable [] default, which would be shared
        # between all calls of this method.
        if freq_fpaths is None:
            freq_fpaths = ()
        if isa_fpaths is None:
            isa_fpaths = ()

        tic = time()

        self._freqs = {}
        for fpath in freq_fpaths:
            fname = fpath2filename(fpath)
            self._freqs[fname] = FreqDictionary(fpath)
            print("Loaded freq dictionary '%s': %s" % (fname, fpath))

        self._isas = {}
        for fpath in isa_fpaths:
            fname = fpath2filename(fpath)
            self._isas[fname] = ISAs(fpath)
            print("Loaded isa dictionary (%d words) '%s': %s" % (len(self._isas[fname].data), fname, fpath))

        print("Loaded resources in %d sec." % (time() - tic))
示例#3
0
文件: taxo.py 项目: binarymax/taxi
    def __init__(self, freq_fpaths=None, isa_fpaths=None):
        """Load frequency and is-a dictionaries from the given files.

        Each dictionary is stored under the name that ``fpath2filename``
        returns for its path, in ``self._freqs`` and ``self._isas``.

        Args:
            freq_fpaths: iterable of paths, each loaded with
                ``FreqDictionary``. ``None`` (the default) loads nothing.
            isa_fpaths: iterable of paths, each loaded with ``ISAs``.
                ``None`` (the default) loads nothing.
        """
        # None instead of a mutable [] default, which would be shared
        # between all calls of this method.
        if freq_fpaths is None:
            freq_fpaths = ()
        if isa_fpaths is None:
            isa_fpaths = ()

        tic = time()

        self._freqs = {}
        for fpath in freq_fpaths:
            fname = fpath2filename(fpath)
            self._freqs[fname] = FreqDictionary(fpath)
            print("Loaded freq dictionary '%s': %s" % (fname, fpath))

        self._isas = {}
        for fpath in isa_fpaths:
            fname = fpath2filename(fpath)
            self._isas[fname] = ISAs(fpath)
            print("Loaded isa dictionary (%d words) '%s': %s" % (len(
                self._isas[fname].data), fname, fpath))

        print("Loaded resources in %d sec." % (time() - tic))
示例#4
0
文件: taxo.py 项目: anukat2015/taxi
    def _load_voc(self, voc_fpath):
        if exists(voc_fpath):
            voc_df = read_csv(voc_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False)
            voc_name = fpath2filename(voc_fpath)

            voc = set()
            for i, row in voc_df.iterrows():
                if "term" in row: voc.add(row.term)
                elif "word" in row: voc.add(row.word)
            print "Loaded %d words vocabulary"  % len(voc) 
            return voc
        else:
            print "Warning: vocabulary is not loaded. This means hypo2hyper features cannot be extracted."
            return set()
示例#5
0
文件: taxo.py 项目: binarymax/taxi
    def _load_voc(self, voc_fpath):
        if exists(voc_fpath):
            voc_df = read_csv(voc_fpath,
                              encoding='utf-8',
                              delimiter="\t",
                              error_bad_lines=False)
            voc_name = fpath2filename(voc_fpath)

            voc = set()
            for i, row in voc_df.iterrows():
                if "term" in row: voc.add(row.term)
                elif "word" in row: voc.add(row.word)
            print "Loaded %d words vocabulary" % len(voc)
            return voc
        else:
            print "Warning: vocabulary is not loaded. This means hypo2hyper features cannot be extracted."
            return set()
示例#6
0
文件: taxo.py 项目: binarymax/taxi
    def __init__(self,
                 taxonomy_resources,
                 voc_fpath="",
                 relations_fpath="",
                 lang="en"):
        """Initialize from an existing relations file or from a vocabulary.

        If ``relations_fpath`` exists, it is read as a tab-separated file and
        the vocabulary becomes the union of its ``hyponym`` and ``hypernym``
        columns. Otherwise, if ``voc_fpath`` exists, the vocabulary is loaded
        with ``_load_voc`` and the relations are produced by
        ``_generate_relations`` for the path ``voc_fpath + "-relations.csv"``.

        Args:
            taxonomy_resources: object providing the ``isas`` and ``freqs``
                attributes stored on this instance.
            voc_fpath: path to the vocabulary file.
            relations_fpath: path to the relations file.
            lang: language code handed to ``load_stoplist``.

        Raises:
            Exception: if neither ``relations_fpath`` nor ``voc_fpath`` exists.
        """
        self._isas = taxonomy_resources.isas
        self._freqs = taxonomy_resources.freqs
        self.voc_name = fpath2filename(voc_fpath)
        self._voc_fpath = voc_fpath
        self._stopwords = load_stoplist(lang=lang)
        self._lang = lang

        # An existing relations file always wins; the vocabulary file is only
        # used to generate relations when no relations file is available.
        if exists(relations_fpath):
            print("Loading relations file: %s" % relations_fpath)
            self._relations_fpath = relations_fpath
            # NOTE: error_bad_lines was removed in pandas 2.0 (on_bad_lines
            # replaces it); kept for the pandas versions this code targets.
            self._relations = read_csv(relations_fpath,
                                       encoding='utf-8',
                                       delimiter="\t",
                                       error_bad_lines=False)
            print("Loaded %d relations from: %s" % (len(
                self._relations), relations_fpath))
            hypos_voc = set(self._relations["hyponym"])
            hyper_voc = set(self._relations["hypernym"])
            self.voc = hypos_voc.union(hyper_voc)
            print("Loaded %d voc from relations" % len(self.voc))
        elif exists(voc_fpath):
            self.voc = self._load_voc(voc_fpath)
            self._relations_fpath = voc_fpath + "-relations.csv"
            print("Generating new relations file: %s" % self._relations_fpath)
            self._relations = self._generate_relations(self.voc,
                                                       self._relations_fpath)
        else:
            raise Exception(
                "Error: cannot load relations or generate them. Specify either voc_fpath or relations_fpath."
            )