Пример #1
0
    def _all_pairs_similarity(self, machine1, machine2):
        """Score two machines from the pairwise similarity of their nodes.

        The node words of both machines are collected (``self.stopwords``
        excluded) and every word of ``machine1`` is scored against every word
        of ``machine2`` with ``self.word_similarity``.  Each word keeps the
        ``my_max`` of its scores; the result is the ``average`` of the two
        per-machine averages of those values.  A truthy result is logged
        together with the full score table.

        ``my_max`` and ``average`` are helpers defined outside this method
        (presumably a maximum and an arithmetic mean -- confirm).
        """
        left_words = set(MachineTraverser.get_nodes(
            machine1, exclude_words=self.stopwords))
        right_words = set(MachineTraverser.get_nodes(
            machine2, exclude_words=self.stopwords))

        # Every score is stored in both directions.
        scores = defaultdict(dict)
        for left in left_words:
            for right in right_words:
                score = self.word_similarity(
                    left, right, -1, -1, sim_type="strict_links_and_nodes")
                if not score:
                    # Falsy results are recorded as 0.0.
                    score = 0.0
                scores[left][right] = score
                scores[right][left] = score

        best_by_word = {}
        for word in left_words | right_words:
            best_by_word[word] = my_max(scores[word].itervalues())

        left_avg = average((best_by_word[w] for w in left_words))
        right_avg = average((best_by_word[w] for w in right_words))
        sim = average((left_avg, right_avg))
        # Alternative aggregation that was tried with my_max/max instead:
        # sim = max((my_max((best_by_word[w] for w in left_words)),
        #            my_max((best_by_word[w] for w in right_words))))
        if sim:
            self.log(
                "{0} - {1} all_pairs similarity: {2} based on: {3}".format(
                    machine1.printname(), machine2.printname(), sim, scores))
        return sim
Пример #2
0
    def _all_pairs_similarity(self, machine1, machine2):
        """Return an all-pairs similarity score for two machines.

        Node words (without ``self.stopwords``) of the two machines are
        compared pairwise through ``self.word_similarity``; for each word the
        ``my_max`` of its pair scores is kept, the kept values are averaged
        per machine, and the two averages are averaged again with
        ``average``.  When the score is truthy it is logged with the table of
        pair scores.

        ``my_max`` and ``average`` come from outside this method; presumably
        they are a maximum and a mean -- confirm.
        """
        nodes_a = set(
            MachineTraverser.get_nodes(machine1, exclude_words=self.stopwords))
        nodes_b = set(
            MachineTraverser.get_nodes(machine2, exclude_words=self.stopwords))

        table = defaultdict(dict)
        for word_a in nodes_a:
            for word_b in nodes_b:
                raw = self.word_similarity(
                    word_a, word_b, -1, -1,
                    sim_type="strict_links_and_nodes")
                # A falsy similarity is stored as 0.0, in both directions.
                value = raw or 0.0
                table[word_a][word_b] = value
                table[word_b][word_a] = value

        top_scores = {
            word: my_max(table[word].itervalues())
            for word in nodes_a | nodes_b}

        avg_a = average((top_scores[w] for w in nodes_a))
        avg_b = average((top_scores[w] for w in nodes_b))
        sim = average((avg_a, avg_b))
        # Alternative aggregation that was tried with my_max/max instead:
        # sim = max((my_max((top_scores[w] for w in nodes_a)),
        #            my_max((top_scores[w] for w in nodes_b))))
        if sim:
            self.log(
                "{0} - {1} all_pairs similarity: {2} based on: {3}".format(
                    machine1.printname(), machine2.printname(), sim, table))
        return sim
Пример #3
0
 def expand_definition(self, machine, stopwords=None):
     """Expand every machine found under ``machine`` except ``machine`` itself.

     The nodes returned by ``MachineTraverser.get_nodes(machine,
     names_only=False, keep_upper=True)`` are indexed by their printname and
     the resulting mapping is handed to ``self.expand``.  Nodes whose
     printname equals that of ``machine`` are left out; if several nodes
     share a printname, the one traversed last is kept.

     :param machine: machine whose definition graph is expanded.
     :param stopwords: forwarded to ``self.expand``; ``None`` (the default)
         is forwarded as a fresh empty list.
     """
     # Use a fresh list per call rather than a shared mutable default.
     if stopwords is None:
         stopwords = []
     own_name = machine.printname()
     def_machines = {}
     for node in MachineTraverser.get_nodes(
             machine, names_only=False, keep_upper=True):
         name = node.printname()
         if name != own_name:
             def_machines[name] = node
     self.expand(def_machines, stopwords=stopwords)
Пример #4
0
 def expand_definition(self, machine, stopwords=None):
     """Run ``self.expand`` on the machines reachable from ``machine``.

     Nodes come from ``MachineTraverser.get_nodes(machine, names_only=False,
     keep_upper=True)`` and are keyed by printname; nodes that carry the same
     printname as ``machine`` are skipped, and a later node replaces an
     earlier one with the same printname.

     :param machine: machine whose definition graph is expanded.
     :param stopwords: forwarded to ``self.expand``; ``None`` (the default)
         is forwarded as a fresh empty list.
     """
     # Avoid a mutable default argument: build the empty list per call.
     if stopwords is None:
         stopwords = []
     own_name = machine.printname()
     named_nodes = (
         (node.printname(), node)
         for node in MachineTraverser.get_nodes(
             machine, names_only=False, keep_upper=True))
     def_machines = {
         name: node for name, node in named_nodes if name != own_name}
     self.expand(def_machines, stopwords=stopwords)
Пример #5
0
 def get_def_words(self, stream):
     """Write the definition words of every headword to ``stream``.

     For each machine of each entry in ``self.definitions`` one line is
     written: the headword followed by the machine's node words, separated
     by tabs and encoded as UTF-8.  Headwords starting with '@' are skipped,
     as are node words starting with '=' or '@'.
     """
     for headword, machines in self.definitions.iteritems():
         if headword[0] != '@':
             for machine in machines:
                 kept = []
                 for word in MachineTraverser.get_nodes(machine):
                     if word[0] in '=@':
                         continue
                     kept.append(word)
                 line = u"{0}\t{1}\n".format(headword, u"\t".join(kept))
                 stream.write(line.encode("utf-8"))
Пример #6
0
 def get_def_words(self, stream):
     """Dump headwords with their definition words as tab-separated lines.

     Entries of ``self.definitions`` whose headword starts with '@' are
     ignored.  For every machine of the remaining entries, the node words
     that do not start with '=' or '@' are joined with tabs after the
     headword and the UTF-8 encoded line is written to ``stream``.
     """
     for headword, machines in self.definitions.iteritems():
         is_special = headword[0] == '@'
         if is_special:
             continue
         for machine in machines:
             columns = []
             for word in MachineTraverser.get_nodes(machine):
                 if word[0] not in '=@':
                     columns.append(word)
             text = u"{0}\t{1}\n".format(headword, u"\t".join(columns))
             stream.write(text.encode("utf-8"))
Пример #7
0
    def _get_links_nodes(self, machine, depth):
        if machine in self.seen_for_links or depth > 5:
            return
        self.seen_for_links.add(machine)
        for hypernym in machine.partitions[0]:
            name = hypernym.printname()
            if name == '=AGT' or not name.isupper():
            #    if depth == 0 and name not in ("lack", "to"):  # TMP!!!
                yield name, None

            for link, node in self._get_links_nodes(hypernym, depth=depth+1):
                yield link, node

        for link, node in self.get_binary_links_nodes(machine):
            yield link, node

        for node in MachineTraverser.get_nodes(machine):
            yield None, node
Пример #8
0
    def _get_links_nodes(self, machine, depth):
        if machine in self.seen_for_links or depth > 5:
            return
        self.seen_for_links.add(machine)
        for hypernym in machine.partitions[0]:
            name = hypernym.printname()
            if name == '=AGT' or not name.isupper():
                #    if depth == 0 and name not in ("lack", "to"):  # TMP!!!
                yield name, None

            for link, node in self._get_links_nodes(hypernym, depth=depth + 1):
                yield link, node

        for link, node in self.get_binary_links_nodes(machine):
            yield link, node

        for node in MachineTraverser.get_nodes(machine):
            yield None, node
Пример #9
0
 def build_from_4lang(cfg):
     """Build a ``Lexicon`` from the 4lang files named in ``cfg``.

     The ``definitions`` and ``primitives`` paths are read from the
     ``machine`` section of ``cfg``.  The definitions are parsed with
     ``read_defs``, the printname of every node of every parsed machine is
     lowercased, and the result is passed to ``Lexicon.create_from_dict``
     together with the set of primitives and ``cfg``.

     :param cfg: configuration object queried with ``cfg.get(section, key)``.
     :return: the object returned by ``Lexicon.create_from_dict``.
     """
     fn = cfg.get("machine", "definitions")
     primitive_fn = cfg.get("machine", "primitives")
     # Read the primitives inside ``with`` so the file handle is closed
     # deterministically instead of being leaked.
     with open(primitive_fn) as primitive_file:
         primitives = {
             line.decode('utf-8').strip() for line in primitive_file}
     logging.info('parsing 4lang definitions...')
     # 1 when the configured language is 'hu', 0 otherwise; how the index is
     # used is up to read_defs.
     pn_index = 1 if cfg.get("deps", "lang") == 'hu' else 0
     # NOTE(review): this handle is never closed explicitly.  It is left
     # untouched because it is not visible from here whether read_defs
     # consumes the stream eagerly -- confirm before wrapping it in ``with``.
     definitions = read_defs(file(fn), pn_index)
     logging.info('lowercasing binaries...')
     # NOTE(review): ``definitions`` is iterated directly and each item is
     # unpacked into two values; verify this matches what read_defs returns
     # (iterating a dict would yield keys only).
     for pn, machines in definitions:
         for m in machines:
             for node in MachineTraverser.get_nodes(
                     m, keep_upper=True, names_only=False):
                 node.printname_ = node.printname_.lower()
     logging.info('done!')
     return Lexicon.create_from_dict(definitions, primitives, cfg)
Пример #10
0
    def expand(self, words_to_machines, stopwords=None, cached=False):
        """Unify each given machine with a copy of its lexicon definition.

        :param words_to_machines: mapping from lemma to machine, iterated
            with ``iteritems()``.
        :param stopwords: lemmas that must not be expanded; when empty or
            ``None`` (the default), ``self.stopwords`` is used instead.
        :param cached: when true, lemmas already in ``self.expanded`` are
            skipped.

        A lemma is expanded only if it is in ``self.known_words()`` and not a
        stopword.  Its definition is deep-copied and unified into the
        machine; nodes of the copy named '=AGT' / '=PAT' are then unified
        with the first machine of partition 1 / 2 when that partition is not
        empty.  Expanded lemmas are added to ``self.expanded``.
        """
        # ``None`` replaces the former mutable ``[]`` default; both mean
        # "fall back to the lexicon's own stopwords".
        if not stopwords:
            stopwords = self.stopwords
        for lemma, machine in words_to_machines.iteritems():
            if cached and lemma in self.expanded:
                continue
            if lemma not in self.known_words() or lemma in stopwords:
                continue

            # deepcopy so that the version in the lexicon keeps its links
            definition = self.get_machine(lemma)
            copied_def = copy.deepcopy(definition)

            # Collect the case placeholders before unification.
            case_machines = [
                m for m in MachineTraverser.get_nodes(
                    copied_def, names_only=False, keep_upper=True)
                if m.printname().startswith('=')
            ]

            machine.unify(copied_def, exclude_0_case=True)

            for cm in case_machines:
                if cm.printname() == "=AGT" and machine.partitions[1]:
                    machine.partitions[1][0].unify(cm)
                if cm.printname() == "=PAT" and machine.partitions[2]:
                    machine.partitions[2][0].unify(cm)

            self.expanded.add(lemma)
Пример #11
0
    def expand(self, words_to_machines, stopwords=None, cached=False):
        """Expand machines in place with copies of their definitions.

        :param words_to_machines: mapping from lemma to machine, iterated
            with ``iteritems()``.
        :param stopwords: lemmas to leave unexpanded; an empty value or
            ``None`` (the default) selects ``self.stopwords``.
        :param cached: when true, lemmas found in ``self.expanded`` are not
            expanded again.

        For every lemma that is in ``self.known_words()`` and is not a
        stopword, the definition from ``self.get_machine`` is deep-copied and
        unified into the machine.  Nodes of the copy whose printname is
        '=AGT' or '=PAT' are unified with the first machine of partition 1 or
        2 respectively, if that partition is non-empty.  The lemma is then
        recorded in ``self.expanded``.
        """
        # A ``None`` default avoids sharing one mutable list between calls;
        # an empty argument still means "use the lexicon's stopwords".
        if not stopwords:
            stopwords = self.stopwords
        for lemma, machine in words_to_machines.iteritems():
            already_done = cached and lemma in self.expanded
            if already_done:
                continue
            if lemma not in self.known_words() or lemma in stopwords:
                continue

            # deepcopy so that the version in the lexicon keeps its links
            definition = self.get_machine(lemma)
            copied_def = copy.deepcopy(definition)

            # Case placeholders are gathered from the copy before unifying.
            case_machines = [
                m for m in MachineTraverser.get_nodes(
                    copied_def, names_only=False, keep_upper=True)
                if m.printname().startswith('=')]

            machine.unify(copied_def, exclude_0_case=True)

            for cm in case_machines:
                if cm.printname() == "=AGT" and machine.partitions[1]:
                    machine.partitions[1][0].unify(cm)
                if cm.printname() == "=PAT" and machine.partitions[2]:
                    machine.partitions[2][0].unify(cm)

            self.expanded.add(lemma)
Пример #12
0
 def build_from_4lang(cfg):
     """Build a ``Lexicon`` from the 4lang files named in ``cfg``.

     The ``definitions``, ``plurals`` and ``primitives`` paths are read from
     the ``machine`` section of ``cfg``.  The definitions are parsed with
     ``read_defs``, the printname of every node of every parsed machine is
     lowercased, and the result is passed to ``Lexicon.create_from_dict``
     together with the set of primitives and ``cfg``.

     :param cfg: configuration object queried with ``cfg.get(section, key)``.
     :return: the object returned by ``Lexicon.create_from_dict``.
     """
     fn = cfg.get("machine", "definitions")
     plural_fn = cfg.get("machine", "plurals")
     primitive_fn = cfg.get("machine", "primitives")
     # Both input files are opened with ``with`` so their handles are closed
     # deterministically instead of being leaked.
     with open(primitive_fn) as primitive_file:
         primitives = {
             line.decode('utf-8').strip() for line in primitive_file}
     logging.info('parsing 4lang definitions...')
     # 1 when the configured language is 'hu', 0 otherwise; how the index is
     # used is up to read_defs.
     pn_index = 1 if cfg.get("deps", "lang") == 'hu' else 0
     # The result supports len() and iteritems() below, so it is assumed to
     # be fully built by the time the file is closed.
     with open(fn) as definitions_file:
         definitions = read_defs(
             definitions_file, plural_fn, pn_index, three_parts=True)
     logging.info('parsed {0} entries, done!'.format(len(definitions)))
     logging.info('lowercasing binaries...')
     for pn, machines in definitions.iteritems():
         for m in machines:
             for node in MachineTraverser.get_nodes(
                     m, keep_upper=True, names_only=False):
                 node.printname_ = node.printname_.lower()
     logging.info('done!')
     return Lexicon.create_from_dict(definitions, primitives, cfg)
Пример #13
0
 def text_to_4lang_demo(self, text, expand, fn='pic', dep_fn='deps'):
     """Parse ``text``, build its machines and draw both graphs.

     The stripped, UTF-8 decoded text is preprocessed and parsed; machines
     are built from the resulting dependencies and coreferences.  The nodes
     present before any expansion are collected and passed to
     ``draw_text_graph`` as ``orig_machines``.  When ``expand`` is true the
     machines are expanded through the lexicon first.

     :return: ``(dependency graph file name, machine graph file name)`` --
         base names only, dependency graph first.
     """
     decoded = text.strip().decode('utf-8')
     preproc_sen = TextTo4lang.preprocess_text(decoded)
     deps, corefs = self.parser_wrapper.parse_text(preproc_sen)
     words2machines = self.dep_to_4lang.get_machines_from_deps_and_corefs(
         deps, corefs)

     # Snapshot of the nodes that exist before the optional expansion.
     orig_machines = set()
     for machine in words2machines.itervalues():
         orig_machines.update(MachineTraverser.get_nodes(
             machine, names_only=False, keep_upper=True))

     if expand:
         self.dep_to_4lang.lexicon.expand(words2machines)

     pic_path = draw_text_graph(
         words2machines, self.tmp_dir, fn=fn, orig_machines=orig_machines)
     # Only the dependencies at index 0 are drawn.
     dep_path = draw_dep_graph(deps[0], self.tmp_dir, dep_fn)
     dep_name = os.path.basename(dep_path)
     pic_name = os.path.basename(pic_path)
     return dep_name, pic_name
Пример #14
0
 def _get_all_nodes(self, machine):
     """Return, as a list, what ``MachineTraverser.get_nodes`` yields for
     ``machine`` with ``names_only=True`` and ``keep_upper=False``."""
     return list(MachineTraverser.get_nodes(
         machine, names_only=True, keep_upper=False))
Пример #15
0
    def expand(self,
               words_to_machines,
               stopwords=[],
               cached=False,
               abstract=False):
        """Expand the given machines with copies of their definitions.

        :param words_to_machines: mapping from lemma to machine, iterated
            with ``iteritems()``; machines that replace an expanded machine
            in ``abstract`` mode are added to it under their printname after
            the loop.
        :param stopwords: lemmas that are not expanded; when empty,
            ``self.stopwords`` is used.
        :param cached: when true, lemmas already in ``self.expanded`` are
            skipped.
        :param abstract: when ``True``, the definition is merged by rewiring
            partitions and parents (see inline comments) instead of a plain
            ``machine.unify``.

        A lemma is processed only if it is in ``self.known_words()`` and is
        not a stopword.  Processed lemmas are added to ``self.expanded``.

        NOTE(review): this method still contains debugging output (``print``
        calls) and a ``pdb.set_trace()`` breakpoint; the ``abstract`` branch
        looks like work in progress.
        """
        # NOTE(review): ``stopwords=[]`` is a mutable default; it is only
        # rebound here, never mutated, in the code visible in this method.
        if len(stopwords) == 0:
            stopwords = self.stopwords
        # Replacement machines collected in abstract mode; registered in
        # words_to_machines once the iteration below has finished.
        machines_to_append = []
        for lemma, machine in words_to_machines.iteritems():
            if ((not cached or lemma not in self.expanded)
                    and lemma in self.known_words()
                    and lemma not in stopwords):

                # deepcopy so that the version in the lexicon keeps its links
                definition = self.get_machine(lemma)

                copied_def = copy.deepcopy(definition)
                # Debug output.
                print("machine: " + str(machine))
                print("defintion: " + str(definition))
                if abstract is True:
                    # Flags recording whether partition 1 / 2 of the
                    # definition were merged into the machine's arguments.
                    part_one = False
                    part_two = False
                    if len(copied_def.partitions[1]) > 0:
                        if len(machine.partitions[1]) > 0:
                            part_one = True
                            print("machine partitions 1:")
                            # Every machine in partition 1 of ``machine``
                            # receives the children (partitions 0-2) and the
                            # parents of every machine in partition 1 of the
                            # definition.
                            for i in machine.partitions[1]:
                                print(i)
                                for j in copied_def.partitions[1]:
                                    for k in range(0, 3):
                                        for m in j.partitions[k]:
                                            i.append(m, k)
                                    # Parents are (machine, index) pairs.
                                    for p in j.parents:
                                        i.append(p[0], p[1])

                    # Same merge for partition 2.
                    if len(copied_def.partitions[2]) > 0:
                        if len(machine.partitions[2]) > 0:
                            part_two = True
                            print("machine partitions 2:")
                            for i in machine.partitions[2]:
                                for j in copied_def.partitions[2]:
                                    print(j)
                                    for k in range(0, 3):
                                        for m in j.partitions[k]:
                                            i.append(m, k)
                                    for p in j.parents:
                                        i.append(p[0], p[1])

                    # Pick the machine that will take the place of
                    # ``machine``: the first machine in partition 0 of the
                    # definition, otherwise the first parent that holds the
                    # definition in its partition 0.
                    machine_for_replace = None
                    def_parents = [
                        parent for parent in copied_def.parents
                        if parent[1] == 0
                    ]

                    if len(copied_def.partitions[0]) > 0:
                        machine_for_replace = copied_def.partitions[0][0]
                    elif len(def_parents) > 0:
                        machine_for_replace = def_parents[0][0]
                    if machine_for_replace is not None:
                        # Drop parent links whose machine's printname starts
                        # with the lemma; iterate over a copy because the
                        # collection is modified in the loop.
                        for m in machine_for_replace.parents.copy():
                            if m[0].printname().startswith(lemma):
                                machine_for_replace.parents.remove(m)

                        # Re-attach the replacement wherever ``machine`` was
                        # attached.
                        for i in machine.parents.copy():
                            i[0].remove(machine, i[1])
                            i[0].append(machine_for_replace, i[1])

                        # Move the children of ``machine`` over to the
                        # replacement.
                        # NOTE(review): machine.partitions[i] is iterated
                        # while machine.remove() is called on it; if remove()
                        # mutates that same container, elements may be
                        # skipped -- confirm.
                        for i in range(0, 3):
                            for m in machine.partitions[i]:
                                try:
                                    machine.remove(m, i)
                                except KeyError:
                                    pass
                                machine_for_replace.append(m, i)
                        machines_to_append.append(machine_for_replace)
                    if machine_for_replace is None and part_one is False and part_two is False:
                        # Nothing could be merged abstractly: stop in the
                        # debugger, dump both graphs, then fall back to a
                        # plain unification.
                        # NOTE(review): pdb.set_trace() blocks execution here.
                        pdb.set_trace()
                        machine_graph = [
                            m for m in MachineTraverser.get_nodes(
                                machine, names_only=False, keep_upper=True)
                        ]
                        def_graph = [
                            m for m in MachineTraverser.get_nodes(
                                copied_def, names_only=False, keep_upper=True)
                        ]
                        g1 = MachineGraph.create_from_machines(machine_graph)
                        g2 = MachineGraph.create_from_machines(def_graph)
                        # The messages below are Hungarian: "rossz machine" =
                        # "bad machine", "Definicio" = "definition".
                        print("rossz machine: " + str(machine))
                        print("Definicio: " + str(copied_def))
                        print("Machine")
                        print(g1.to_dot())
                        print("Definicio")
                        print(g2.to_dot())
                        machine.unify(copied_def, exclude_0_case=True)
                else:
                    # Non-abstract mode: plain unification with the copy.
                    machine.unify(copied_def, exclude_0_case=True)

                # The string literals below are disabled code kept as no-op
                # expression statements.
                #machine_for_replace.parents.remove((machine, 0))
                '''
                print("machine for replace childs")
                for i in range(0,3):
                   for m in machine_for_replace.partitions[i]:
                        print(m)
                        print(i)
                '''
                '''
                helpmachine = [
                    m for m in MachineTraverser.get_nodes(
                        copied_def, names_only=False, keep_upper=True)
                    ]
                '''
                """
                for parent, i in list(definition.parents):
                    copied_parent = copy.deepcopy(parent)
                    for m in list(copied_parent.partitions[i]):
                        if m.printname() == lemma:
                            copied_parent.remove(m, i)
                            break
                    else:
                        raise Exception()
                        # "can't find {0} in partition {1} of {2}: {3}".format(
                        # ))
                    copied_parent.append(copied_def, i)
                """

                # Nodes of the copied definition whose printname starts with
                # '=' (e.g. '=AGT', '=PAT').
                case_machines = [
                    m for m in MachineTraverser.get_nodes(
                        copied_def, names_only=False, keep_upper=True)
                    if m.printname().startswith('=')
                ]

                #machine.unify(copied_def, exclude_0_case=True)
                # Unify '=AGT' / '=PAT' with the first machine of partition
                # 1 / 2 of ``machine`` when that partition is not empty.
                for cm in case_machines:
                    if cm.printname() == "=AGT":
                        if machine.partitions[1]:
                            machine.partitions[1][0].unify(cm)
                    if cm.printname() == "=PAT":
                        if machine.partitions[2]:
                            machine.partitions[2][0].unify(cm)
                #for j in machine_for_replace.parents:
                #    print(j)
                self.expanded.add(lemma)
        # Register the replacement machines only now, so the mapping is not
        # modified while it is being iterated above.
        for m in machines_to_append:
            words_to_machines[m.printname()] = m