def _all_pairs_similarity(self, machine1, machine2):
    """Score the similarity of two machines by comparing every word
    node of one against every word node of the other.

    For each word, the best score it achieved against the opposite
    side is kept; the final score is the average of the two per-side
    averages of these maxima.  Logs and returns the score (falsy
    scores are returned but not logged).
    """
    # Word nodes of each machine, minus stopwords.
    words1 = set(MachineTraverser.get_nodes(machine1, exclude_words=self.stopwords))
    words2 = set(MachineTraverser.get_nodes(machine2, exclude_words=self.stopwords))
    # pair_sims_by_word[a][b] -> similarity of words a and b; stored in
    # both directions, i.e. the score is treated as symmetric.
    pair_sims_by_word = defaultdict(dict)
    for word1 in words1:
        for word2 in words2:
            sim = self.word_similarity(word1, word2, -1, -1, sim_type="strict_links_and_nodes")
            # None (and 0) similarities are normalized to 0.0
            pair_sims_by_word[word1][word2] = sim if sim else 0.0
            pair_sims_by_word[word2][word1] = sim if sim else 0.0
    # Best score each word achieved against the other machine.
    # NOTE(review): for words present in both machines the inner dict
    # mixes scores against both sides -- presumably intended; confirm.
    max_sims_by_word = dict((
        (word, my_max(pair_sims_by_word[word].itervalues()))
        for word in words1 | words2))
    sim = average((average((max_sims_by_word[w] for w in words1)),
                   average((max_sims_by_word[w] for w in words2))))
    # sim = max((my_max((max_sims_by_word[w] for w in words1)),
    #            my_max((max_sims_by_word[w] for w in words2))))
    if sim:
        self.log(
            "{0} - {1} all_pairs similarity: {2} based on: {3}".format(
                machine1.printname(), machine2.printname(), sim,
                pair_sims_by_word))
    return sim
def _all_pairs_similarity(self, machine1, machine2):
    """Average-of-maxima word similarity between two machines.

    Every word node of machine1 is scored against every word node of
    machine2; each word keeps its best score, and the per-machine
    averages of those maxima are averaged into the final score.
    """
    nodes_a = set(
        MachineTraverser.get_nodes(machine1, exclude_words=self.stopwords))
    nodes_b = set(
        MachineTraverser.get_nodes(machine2, exclude_words=self.stopwords))
    # symmetric pairwise scores; falsy similarities become 0.0
    sims = defaultdict(dict)
    for word_a in nodes_a:
        for word_b in nodes_b:
            score = self.word_similarity(
                word_a, word_b, -1, -1, sim_type="strict_links_and_nodes")
            score = score if score else 0.0
            sims[word_a][word_b] = score
            sims[word_b][word_a] = score
    # best score each word achieved against the opposite machine
    best = {}
    for word in nodes_a | nodes_b:
        best[word] = my_max(sims[word].itervalues())
    side_a = average((best[w] for w in nodes_a))
    side_b = average((best[w] for w in nodes_b))
    sim = average((side_a, side_b))
    if sim:
        self.log(
            "{0} - {1} all_pairs similarity: {2} based on: {3}".format(
                machine1.printname(), machine2.printname(), sim,
                sims))
    return sim
def expand_definition(self, machine, stopwords=None):
    """Expand the definition graph rooted at *machine*.

    Collects every node reachable from *machine* (keeping uppercase
    nodes), maps printname -> machine for all nodes other than the
    head itself, and runs self.expand on that mapping.

    The default for stopwords is None instead of the original [] to
    avoid the shared-mutable-default pitfall; None behaves exactly
    like an empty list did.
    """
    if stopwords is None:
        stopwords = []
    head_pn = machine.printname()
    # dict comprehension replaces the original nested list-comp pair;
    # duplicates still resolve to the last node seen, as dict() did.
    def_machines = {
        node.printname(): node
        for node in MachineTraverser.get_nodes(
            machine, names_only=False, keep_upper=True)
        if node.printname() != head_pn}
    self.expand(def_machines, stopwords=stopwords)
def expand_definition(self, machine, stopwords=[]):
    """Expand all nodes of *machine*'s definition graph except the
    head node itself, delegating to self.expand."""
    head = machine.printname()
    named_nodes = (
        (node.printname(), node)
        for node in MachineTraverser.get_nodes(
            machine, names_only=False, keep_upper=True))
    def_machines = dict(
        (pn, node) for pn, node in named_nodes if pn != head)
    self.expand(def_machines, stopwords=stopwords)
def get_def_words(self, stream):
    """Write tab-separated definition words to *stream*.

    Emits one UTF-8 encoded line per definition machine: the headword
    followed by the machine's word nodes, all tab-separated.
    """
    for headword, machines in self.definitions.iteritems():
        # entries whose headword starts with '@' are skipped
        # NOTE(review): assumes headword is non-empty; '' would raise
        if headword[0] == '@':
            continue
        for machine in machines:
            # drop deep-case ('=') and special ('@') nodes
            def_words = [
                word for word in MachineTraverser.get_nodes(machine)
                if word[0] not in '=@']
            stream.write(u"{0}\t{1}\n".format(
                headword, u"\t".join(def_words)).encode("utf-8"))
def get_def_words(self, stream):
    """Dump definition words as UTF-8 TSV lines onto *stream*:
    headword, then the word nodes of each of its machines."""
    for headword, machines in self.definitions.iteritems():
        if headword[0] == '@':
            # special '@'-prefixed entries carry no definition words
            continue
        for machine in machines:
            words = []
            for word in MachineTraverser.get_nodes(machine):
                # keep only plain words (no '=' case / '@' special nodes)
                if word[0] not in '=@':
                    words.append(word)
            line = u"{0}\t{1}\n".format(headword, u"\t".join(words))
            stream.write(line.encode("utf-8"))
def _get_links_nodes(self, machine, depth):
    """Recursively yield (link, node) pairs reachable from *machine*.

    Walks the 0th partition (hypernyms) up to a recursion depth of 5,
    yielding (name, None) for link-like hypernyms, then the machine's
    binary links as (link, node), then all plain nodes as (None, node).
    Cycles are broken via the self.seen_for_links set, which the
    caller is expected to reset between top-level invocations.
    """
    if machine in self.seen_for_links or depth > 5:
        return
    self.seen_for_links.add(machine)
    for hypernym in machine.partitions[0]:
        name = hypernym.printname()
        # '=AGT' and non-uppercase printnames are treated as links
        if name == '=AGT' or not name.isupper():
            # if depth == 0 and name not in ("lack", "to"):  # TMP!!!
            yield name, None
        # recurse into every hypernym regardless of the test above
        for link, node in self._get_links_nodes(hypernym, depth=depth+1):
            yield link, node
    for link, node in self.get_binary_links_nodes(machine):
        yield link, node
    for node in MachineTraverser.get_nodes(machine):
        yield None, node
def _get_links_nodes(self, machine, depth):
    """Generator of (link, node) pairs for *machine*: hypernym links
    (depth-limited, cycle-safe via self.seen_for_links), then binary
    links, then plain nodes as (None, node)."""
    if depth > 5 or machine in self.seen_for_links:
        return
    self.seen_for_links.add(machine)
    for hyp in machine.partitions[0]:
        hyp_name = hyp.printname()
        if hyp_name == '=AGT' or not hyp_name.isupper():
            # if depth == 0 and name not in ("lack", "to"):  # TMP!!!
            yield hyp_name, None
        for pair in self._get_links_nodes(hyp, depth=depth + 1):
            yield pair
    for pair in self.get_binary_links_nodes(machine):
        yield pair
    for plain_node in MachineTraverser.get_nodes(machine):
        yield None, plain_node
def build_from_4lang(cfg):
    """Build a Lexicon from the 4lang definition files named in *cfg*.

    Reads the primitives and definitions files, lowercases every node
    printname in the parsed machines, then delegates to
    Lexicon.create_from_dict.
    """
    fn = cfg.get("machine", "definitions")
    primitive_fn = cfg.get("machine", "primitives")
    # NOTE(review): this handle is never closed (leaks until GC)
    primitives = set(
        [line.decode('utf-8').strip() for line in open(primitive_fn)])
    logging.info('parsing 4lang definitions...')
    # Hungarian definition files carry the printname in column 1
    pn_index = 1 if cfg.get("deps", "lang") == 'hu' else 0
    definitions = read_defs(file(fn), pn_index)
    #logging.info('parsed {0} entries, done!'.format(len(definitions)))
    logging.info('lowercasing binaries...')
    # NOTE(review): iterates `definitions` directly as (pn, machines)
    # pairs, while a sibling variant of this function iterates
    # definitions.iteritems() -- confirm what read_defs returns here;
    # if it is a dict, this loop would unpack keys and fail.
    for pn, machines in definitions:
        for m in machines:
            for node in MachineTraverser.get_nodes(m, keep_upper=True,
                                                   names_only=False):
                node.printname_ = node.printname_.lower()
    logging.info('done!')
    lexicon = Lexicon.create_from_dict(definitions, primitives, cfg)
    return lexicon
def expand(self, words_to_machines, stopwords=None, cached=False):
    """Unify each word's definition graph into its machine.

    For every (lemma, machine) pair: if the lemma is known, not a
    stopword, and (when *cached*) not already expanded, a deep copy of
    its definition is unified into the machine, and any '=AGT'/'=PAT'
    case machines of the definition are unified into the machine's
    partition-1/partition-2 heads.

    The stopwords default is None instead of the original mutable []
    (shared-default pitfall); both None and an empty sequence fall
    back to self.stopwords, exactly as the old len()==0 check did.
    """
    if not stopwords:
        stopwords = self.stopwords
    for lemma, machine in words_to_machines.iteritems():
        if ((not cached or lemma not in self.expanded)
                and lemma in self.known_words()
                and lemma not in stopwords):
            # deepcopy so that the version in the lexicon keeps its links
            definition = self.get_machine(lemma)
            copied_def = copy.deepcopy(definition)
            # deep-case machines ('=AGT', '=PAT', ...) of the definition
            case_machines = [
                m for m in MachineTraverser.get_nodes(
                    copied_def, names_only=False, keep_upper=True)
                if m.printname().startswith('=')]
            machine.unify(copied_def, exclude_0_case=True)
            for cm in case_machines:
                if cm.printname() == "=AGT":
                    if machine.partitions[1]:
                        machine.partitions[1][0].unify(cm)
                if cm.printname() == "=PAT":
                    if machine.partitions[2]:
                        machine.partitions[2][0].unify(cm)
            self.expanded.add(lemma)
def expand(self, words_to_machines, stopwords=[], cached=False):
    """Unify each known, non-stopword lemma's definition graph into
    its machine, wiring '=AGT'/'=PAT' case machines into the
    machine's partition-1/partition-2 heads."""
    if len(stopwords) == 0:
        stopwords = self.stopwords
    for lemma, machine in words_to_machines.iteritems():
        # guard clauses replace the original compound condition
        if cached and lemma in self.expanded:
            continue
        if lemma not in self.known_words():
            continue
        if lemma in stopwords:
            continue
        # deep copy so the lexicon's own definition keeps its links
        definition = self.get_machine(lemma)
        expanded_def = copy.deepcopy(definition)
        case_machines = [
            node for node in MachineTraverser.get_nodes(
                expanded_def, names_only=False, keep_upper=True)
            if node.printname().startswith('=')]
        machine.unify(expanded_def, exclude_0_case=True)
        for case_m in case_machines:
            case_name = case_m.printname()
            if case_name == "=AGT" and machine.partitions[1]:
                machine.partitions[1][0].unify(case_m)
            if case_name == "=PAT" and machine.partitions[2]:
                machine.partitions[2][0].unify(case_m)
        self.expanded.add(lemma)
def build_from_4lang(cfg):
    """Build a Lexicon from the 4lang definition files named in *cfg*.

    Reads the primitives, definitions and plurals files, lowercases
    every node printname in the parsed machines, then delegates to
    Lexicon.create_from_dict.

    Fix: both input files were opened without ever being closed
    (open(primitive_fn) and the py2-only file(fn)); `with open(...)`
    closes them deterministically.  read_defs returns the fully
    parsed dict, so the definitions handle is consumed before the
    block exits.
    """
    fn = cfg.get("machine", "definitions")
    plural_fn = cfg.get("machine", "plurals")
    primitive_fn = cfg.get("machine", "primitives")
    with open(primitive_fn) as prim_f:
        primitives = set(
            [line.decode('utf-8').strip() for line in prim_f])
    logging.info('parsing 4lang definitions...')
    # Hungarian definition files carry the printname in column 1
    pn_index = 1 if cfg.get("deps", "lang") == 'hu' else 0
    with open(fn) as def_f:
        definitions = read_defs(
            def_f, plural_fn, pn_index, three_parts=True)
    logging.info('parsed {0} entries, done!'.format(len(definitions)))
    logging.info('lowercasing binaries...')
    for pn, machines in definitions.iteritems():
        for m in machines:
            for node in MachineTraverser.get_nodes(
                    m, keep_upper=True, names_only=False):
                node.printname_ = node.printname_.lower()
    logging.info('done!')
    lexicon = Lexicon.create_from_dict(definitions, primitives, cfg)
    return lexicon
def text_to_4lang_demo(self, text, expand, fn='pic', dep_fn='deps'):
    """Parse *text*, build its 4lang machines, and draw both the
    dependency graph and the machine graph.

    Returns (dep_graph_basename, machine_graph_basename) of the two
    image files written under self.tmp_dir.  When *expand* is truthy,
    the machines are expanded via the lexicon before drawing; the
    pre-expansion machines are passed to the drawer as orig_machines
    so they can be distinguished in the picture.
    """
    preproc_sen = TextTo4lang.preprocess_text(text.strip().decode('utf-8'))
    deps, corefs = self.parser_wrapper.parse_text(preproc_sen)
    words2machines = self.dep_to_4lang.get_machines_from_deps_and_corefs(
        deps, corefs)
    # TODO
    # collect all machines present before expansion
    orig_machines = set()
    for machine in words2machines.itervalues():
        orig_machines |= set(MachineTraverser.get_nodes(
            machine, names_only=False, keep_upper=True))
    # orig_machines = set([m.printname() for m in words2machines.values()])
    # logging.info(u'orig_machines: {0}'.format(
    #     [m.printname() for m in orig_machines]))
    if expand:
        self.dep_to_4lang.lexicon.expand(words2machines)
    pic_path = draw_text_graph(
        words2machines, self.tmp_dir, fn=fn, orig_machines=orig_machines)
    # NOTE(review): only the first sentence's dependencies are drawn
    dep_path = draw_dep_graph(deps[0], self.tmp_dir, dep_fn)
    # deps_table = self.get_dep_table(deps[0])
    return os.path.basename(dep_path), os.path.basename(pic_path)
def _get_all_nodes(self, machine):
    """Return the printnames of all nodes reachable from *machine*
    (lowercase-only, since keep_upper is False)."""
    # list(...) replaces the pass-through [m for m in ...] comprehension
    return list(MachineTraverser.get_nodes(
        machine, names_only=True, keep_upper=False))
def expand(self, words_to_machines, stopwords=[], cached=False,
           abstract=False):
    """Unify each known word's definition graph into its machine.

    With abstract=False this behaves like the plain expand(): the
    definition is deep-copied and unified into the machine, then
    '=AGT'/'=PAT' case machines are unified into the machine's
    partition-1/partition-2 heads.  With abstract=True the definition
    is instead grafted structurally: its partition-1/2 contents are
    appended under the machine's partition-1/2 heads, or the machine
    is wholly replaced by the definition's head/parent machine.

    NOTE(review): work-in-progress code -- contains debug print()s,
    a pdb.set_trace() that blocks execution when no graft strategy
    applies, and a mutable default argument (stopwords=[]).
    """
    if len(stopwords) == 0:
        stopwords = self.stopwords
    machines_to_append = []
    for lemma, machine in words_to_machines.iteritems():
        if ((not cached or lemma not in self.expanded)
                and lemma in self.known_words()
                and lemma not in stopwords):
            # deepcopy so that the version in the lexicon keeps its links
            definition = self.get_machine(lemma)
            copied_def = copy.deepcopy(definition)
            print("machine: " + str(machine))
            print("defintion: " + str(definition))
            if abstract is True:
                part_one = False
                part_two = False
                # graft the definition's partition-1 contents under the
                # machine's partition-1 members
                if len(copied_def.partitions[1]) > 0:
                    if len(machine.partitions[1]) > 0:
                        part_one = True
                        print("machine partitions 1:")
                        for i in machine.partitions[1]:
                            print(i)
                            for j in copied_def.partitions[1]:
                                for k in range(0, 3):
                                    for m in j.partitions[k]:
                                        i.append(m, k)
                                for p in j.parents:
                                    i.append(p[0], p[1])
                # same grafting for partition 2
                if len(copied_def.partitions[2]) > 0:
                    if len(machine.partitions[2]) > 0:
                        part_two = True
                        print("machine partitions 2:")
                        for i in machine.partitions[2]:
                            for j in copied_def.partitions[2]:
                                print(j)
                                for k in range(0, 3):
                                    for m in j.partitions[k]:
                                        i.append(m, k)
                                for p in j.parents:
                                    i.append(p[0], p[1])
                # pick a machine to replace this one with: the head of
                # the definition's 0th partition, else a 0-partition
                # parent of the definition
                machine_for_replace = None
                def_parents = [
                    parent for parent in copied_def.parents
                    if parent[1] == 0]
                if len(copied_def.partitions[0]) > 0:
                    machine_for_replace = copied_def.partitions[0][0]
                elif len(def_parents) > 0:
                    machine_for_replace = def_parents[0][0]
                if machine_for_replace is not None:
                    # detach the replacement from its own definition head
                    for m in machine_for_replace.parents.copy():
                        if m[0].printname().startswith(lemma):
                            machine_for_replace.parents.remove(m)
                    # re-point the machine's parents at the replacement
                    for i in machine.parents.copy():
                        i[0].remove(machine, i[1])
                        i[0].append(machine_for_replace, i[1])
                    # move the machine's children onto the replacement
                    # NOTE(review): machine.remove(m, i) presumably
                    # mutates partitions[i] while it is being iterated
                    # -- confirm this is safe / intended
                    for i in range(0, 3):
                        for m in machine.partitions[i]:
                            try:
                                machine.remove(m, i)
                            except KeyError:
                                pass
                            machine_for_replace.append(m, i)
                    machines_to_append.append(machine_for_replace)
                # no graft strategy applied: debug dump, then fall back
                # to plain unification
                # NOTE(review): pdb.set_trace() halts execution here
                if machine_for_replace is None and part_one is False and part_two is False:
                    pdb.set_trace()
                    machine_graph = [
                        m for m in MachineTraverser.get_nodes(
                            machine, names_only=False, keep_upper=True)]
                    def_graph = [
                        m for m in MachineTraverser.get_nodes(
                            copied_def, names_only=False, keep_upper=True)]
                    g1 = MachineGraph.create_from_machines(machine_graph)
                    g2 = MachineGraph.create_from_machines(def_graph)
                    print("rossz machine: " + str(machine))
                    print("Definicio: " + str(copied_def))
                    print("Machine")
                    print(g1.to_dot())
                    print("Definicio")
                    print(g2.to_dot())
                    machine.unify(copied_def, exclude_0_case=True)
            else:
                machine.unify(copied_def, exclude_0_case=True)
            #machine_for_replace.parents.remove((machine, 0))
            '''
            print("machine for replace childs")
            for i in range(0,3):
                for m in machine_for_replace.partitions[i]:
                    print(m)
                    print(i)
            '''
            '''
            helpmachine = [
                m for m in MachineTraverser.get_nodes(
                    copied_def, names_only=False, keep_upper=True)
            ]
            '''
            """
            for parent, i in list(definition.parents):
                copied_parent = copy.deepcopy(parent)
                for m in list(copied_parent.partitions[i]):
                    if m.printname() == lemma:
                        copied_parent.remove(m, i)
                        break
                else:
                    raise Exception()
                    # "can't find {0} in partition {1} of {2}: {3}".format(
                    # ))
                copied_parent.append(copied_def, i)
            """
            # deep-case machines ('=AGT', '=PAT', ...) of the definition
            case_machines = [
                m for m in MachineTraverser.get_nodes(
                    copied_def, names_only=False, keep_upper=True)
                if m.printname().startswith('=')]
            #machine.unify(copied_def, exclude_0_case=True)
            for cm in case_machines:
                if cm.printname() == "=AGT":
                    if machine.partitions[1]:
                        machine.partitions[1][0].unify(cm)
                if cm.printname() == "=PAT":
                    if machine.partitions[2]:
                        machine.partitions[2][0].unify(cm)
            #for j in machine_for_replace.parents:
            #    print(j)
            self.expanded.add(lemma)
    # replacement machines become addressable under their own printnames
    for m in machines_to_append:
        words_to_machines[m.printname()] = m