Пример #1
0
    def __init__(self, machine1, machine2, max_depth):
        """Collect shared-subgraph features for a pair of machines.

        Each machine is expanded into its graph up to max_depth; the
        3-node subgraph features are stored in self.subgraph_dict.
        """
        graph1 = MachineGraph.create_from_machines([machine1], max_depth=max_depth)
        graph2 = MachineGraph.create_from_machines([machine2], max_depth=max_depth)
        printname1 = machine1.printname()
        printname2 = machine2.printname()

        self.subgraph_dict = {}
        # NOTE: single-node and N_X_N features are currently disabled:
        # self.subgraph_dict.update(self._get_subgraph_N(graph1.G, graph2.G, printname1, printname2))
        # self.subgraph_dict.update(self._get_subgraph_N_X_N(graph1.G, graph2.G, printname1, printname2))
        self.subgraph_dict.update(
            self._get_subgraph_3_nodes(graph1.G, graph2.G, printname1, printname2))
Пример #2
0
    def run(self, sentence):
        """Parses a sentence, runs the spreading activation and returns the
        messages that have to be sent to the active plugins."""
        try:
            sp = SentenceParser()
            sa = SpreadingActivation(self.lexicon)
            machines = sp.parse(sentence)
            logging.debug('machines: {}'.format(machines))
            logging.debug('machines: {}'.format(
                [m for m in machines]))
            for machine_list in machines:
                for machine in machine_list:
                    if machine.control.kr['CAT'] == 'VERB':
                        logging.debug('adding verb construction for {}'.format(
                            machine))
                        self.lexicon.add_construction(VerbConstruction(
                            machine.printname(), self.lexicon, self.supp_dict))
            logging.info('constructions: {}'.format(
                self.lexicon.constructions))

            # results is a list of (url, data) tuples
            results = sa.activation_loop(machines)
            print 'results:', results
            print 'machines:', machines

            graph = MachineGraph.create_from_machines(
                [m[0] for m in machines], max_depth=1)
            f = open('machines.dot', 'w')
            f.write(graph.to_dot().encode('utf-8'))

            self.lexicon.clear_active()
        except Exception, e:
            import traceback
            traceback.print_exc(e)
            raise(e)
Пример #3
0
    def lemma_similarity(self, lemma1, lemma2, sim_type):
        """Return the best machine-pair similarity between two lemmas.

        Results are cached symmetrically in self.lemma_sim_cache;
        identical lemmas score 1 and negative scores are clamped to 0.
        """
        if (lemma1, lemma2) in self.lemma_sim_cache:
            return self.lemma_sim_cache[(lemma1, lemma2)]
        elif lemma1 == lemma2:
            return 1
        self.log(u'lemma1: {0}, lemma2: {1}'.format(lemma1, lemma2))

        machines1 = self.wrapper.definitions[lemma1]
        machines2 = self.wrapper.definitions[lemma2]

        # score every machine pair and keep the best-scoring one
        pairs_by_sim = sorted([
            (self.machine_similarity(machine1, machine2, sim_type),
             (machine1, machine2))
            for machine1 in machines1 for machine2 in machines2], reverse=True)

        sim, (machine1, machine2) = pairs_by_sim[0]

        draw_graphs = True  # use with caution
        if draw_graphs and not self.wrapper.batch:
            graph = MachineGraph.create_from_machines(
                [machine1, machine2])  # , max_depth=1)
            # context manager: the old code leaked the file handle
            with open('graphs/{0}_{1}.dot'.format(lemma1, lemma2), 'w') as f:
                f.write(graph.to_dot().encode('utf-8'))

        sim = sim if sim >= 0 else 0
        self.lemma_sim_cache[(lemma1, lemma2)] = sim
        self.lemma_sim_cache[(lemma2, lemma1)] = sim
        return sim
Пример #4
0
 def draw_single_graph(self, word, path):
     """Write one dot file per definition machine of *word* into *path*."""
     sanitized = Machine.d_clean(word)
     for idx, def_machine in enumerate(self.definitions[word]):
         dot_graph = MachineGraph.create_from_machines([def_machine])
         out_path = os.path.join(path, '{0}_{1}.dot'.format(sanitized, idx))
         with open(out_path, 'w') as out_file:
             out_file.write(dot_graph.to_dot().encode('utf-8'))
Пример #5
0
def print_text_graph(words_to_machines, graph_dir, fn='text'):
    """Dump the graph of all machines in *words_to_machines* to a dot file.

    Returns the path of the file written.
    """
    text_graph = MachineGraph.create_from_machines(
        words_to_machines.values())
    out_fn = os.path.join(graph_dir, '{0}.dot'.format(fn))
    with open(out_fn, 'w') as out_file:
        out_file.write(text_graph.to_dot().encode('utf-8'))
    return out_fn
Пример #6
0
def draw_text_graph(
        words_to_machines, out_dir, fn='text', orig_machines=None):
    """Render the machine graph of a text to a png via graphviz.

    Returns the path of the rendered picture.
    """
    # avoid the shared mutable default argument the old signature had
    if orig_machines is None:
        orig_machines = []
    graph = MachineGraph.create_from_machines(
        words_to_machines.values(), orig_machines=orig_machines)
    src_str = graph.to_dot().encode('utf-8')
    src = graphviz.Source(src_str, format='png')
    pic_path = src.render(filename=fn, directory=out_dir)
    return pic_path
Пример #7
0
def main():
    lex_fn, word = sys.argv[1:3]
    lex = Lexicon.load_from_binary(lex_fn)
    machines = lex.lexicon.get(word, lex.ext_lexicon.get(word))
    if machines is None:
        print '404 :('
    else:
        graph = MachineGraph.create_from_machines(machines)
        sys.stdout.write(graph.to_dot().encode('utf-8'))
Пример #8
0
def test_dep():
    print 'building wrapper...'
    w = Wrapper(sys.argv[1])
    for line in sys.stdin:
        w.add_dependency(line)

    active_machines = w.lexicon.active_machines()
    logging.debug('active machines: {}'.format(active_machines))
    graph = MachineGraph.create_from_machines(active_machines)
    f = open('machines.dot', 'w')
    f.write(graph.to_dot().encode('utf-8'))
Пример #9
0
 def draw_word_graphs(self):
     """Write a dot file for every machine of every defined word.

     Progress is logged every 1000 words; a leading 'X' from d_clean
     is stripped from the file name.
     """
     ensure_dir('graphs/words')
     for c, (word, machines) in enumerate(self.definitions.iteritems()):
         if c % 1000 == 0:
             logging.info("{0}...".format(c))
         for i, machine in enumerate(machines):
             graph = MachineGraph.create_from_machines([machine])
             clean_word = Machine.d_clean(word)
             if clean_word[0] == 'X':
                 clean_word = clean_word[1:]
             # context manager: the old code leaked one handle per graph
             with open('graphs/words/{0}_{1}.dot'.format(clean_word, i), 'w') as f:
                 f.write(graph.to_dot().encode('utf-8'))
Пример #10
0
 def run(self):
     logging.info('running QA...')
     input_file = self.cfg.get('qa', 'input_file')
     for entry in QAParser.parse_file(input_file):
         logging.info('processing text...')
         all_text = "\n".join([doc['text'] for doc in entry['docs']])
         model = self.text_to_4lang.process(
             all_text, dep_dir=self.dep_dir, fn='text')
         print_text_graph(model, self.graph_dir)
         model_graph = MachineGraph.create_from_machines(model.values())
         for question in entry['questions']:
             answer = self.answer_question(question, model, model_graph)
             print answer['text']
Пример #11
0
    def add_edges(self, word2machine):
        """Register plain (color 0) edges directly and expand binary
        machines into (subject, object) edge pairs.
        """
        graph = MachineGraph.create_from_machines(word2machine.values())
        graph.do_closure()
        # bin_index -> [subject indices, object indices]
        binaries = defaultdict(lambda: [set(), set()])
        for src, tgt, edata in graph.G.edges(data=True):
            src_index = self.get_w_index(src.split('_')[0])
            tgt_index = self.get_w_index(tgt.split('_')[0])
            color = edata['color']
            if color == 0:
                self.add_edge(0, src_index, tgt_index)
                continue
            self.add_binary(src.split('_')[0])
            if color == 1:
                binaries[src_index][0].add(tgt_index)
            elif color == 2:
                binaries[src_index][1].add(tgt_index)
            else:
                assert False

        # connect every subject of each binary to every one of its objects
        for bin_index, (subjs, objs) in binaries.iteritems():
            for subj_index in subjs:
                for obj_index in objs:
                    self.add_edge(bin_index, subj_index, obj_index)
Пример #12
0
import sys

from pymachine.utils import MachineGraph

from fourlang.lexicon import Lexicon

lexicon = Lexicon.load_from_binary(sys.argv[1])
total = 0
total_size = 0
smallest = 999
largest = 0
for word, machines in lexicon.ext_lexicon.iteritems():
    machine = next(iter(machines))
    graph = MachineGraph.create_from_machines([machine])
    size = len(graph.G) - 1
    if size < 1:
        continue
    total += 1
    total_size += size
    smallest = min(smallest, size)
    largest = max(largest, size)

print 'processed {0} graphs'.format(total)
print 'average size: {0} nodes'.format(total_size/float(total))
print 'smallest: {0}, largest: {1}'.format(smallest, largest)
Пример #13
0
def print_text_graph(words_to_machines, graph_dir, fn='text'):
    """Write the dot graph of the given machines into *graph_dir*.

    Returns the path of the file written.
    """
    dot_source = MachineGraph.create_from_machines(
        words_to_machines.values()).to_dot()
    fn = os.path.join(graph_dir, '{0}.dot'.format(fn))
    with open(fn, 'w') as f:
        f.write(dot_source.encode('utf-8'))
    return fn
Пример #14
0
def print_4lang_graph(word, machine, graph_dir, max_depth=None):
    """Dump the 4lang graph of *machine* to <graph_dir>/<word>.dot.

    max_depth optionally limits graph expansion; the default None keeps
    the old unlimited behavior (assumes create_from_machines treats
    max_depth=None as "no limit" -- it is the default elsewhere in this
    codebase; TODO confirm).
    """
    graph = MachineGraph.create_from_machines([machine], max_depth=max_depth)
    fn = os.path.join(graph_dir, u"{0}.dot".format(word)).encode('utf-8')
    with open(fn, 'w') as dot_obj:
        dot_obj.write(graph.to_dot().encode('utf-8'))
Пример #15
0
    def fullgraph(self, name1, name2, machine1, machine2):
        """Return {"shortest_path": length} between the two named machines.

        Depending on flags, the path is searched either in an on-the-fly
        union of the two machines' (optionally expanded) undirected graphs,
        in the precomputed full graph self.UG, or looked up via the
        lexicon's shortest-path file. length stays 0 when no path is found.
        """
        ####################
        # Only for calculating shortest path
        ####################
        if self.calc_path:
            logging.debug('name1 = {0}, name2 = {1}'.format(name1, name2))

            length = 0
            active_graph = None
            unified_machine = None
            if self.expand_path:
                logging.debug("calc active graph")
                # build the union of both machines' undirected graphs
                active_graph = MachineGraph.create_from_machines(
                    [machine1], machinegraph_options=self.machinegraph_options).G.to_undirected()
                G2 = MachineGraph.create_from_machines(
                    [machine2], machinegraph_options=self.machinegraph_options).G.to_undirected()
                active_graph.add_edges_from(G2.edges(data=True))
                # drop excluded words, but never the two endpoints
                for word in self.excluded_words:
                    if active_graph.has_node(word) and name1 != word and name2 != word:
                        active_graph.remove_node(word)

                # TODO: e.g. "take" is empty
                if name1 not in active_graph.nodes() or name2 not in G2.nodes():
                    return {"shortest_path": length}

                i = 0
                if self.debug_graph:
                    filename = 'test/temp_graphs/{0}_{1}_{2}.dot'.format(name1, name2, i)
                    nx.drawing.nx_agraph.write_dot(active_graph, filename)

                # expand both definitions until the endpoints get connected,
                # giving up (length stays 0) after 5 expansion rounds
                while not nx.has_path(active_graph, name1, name2):
                    if i > 5:
                        return {"shortest_path": length}
                    self.lexicon.expand_definition(machine1, self.stopwords)
                    self.lexicon.expand_definition(machine2, self.stopwords)
                    # rebuild the union graph from the expanded machines
                    active_graph = MachineGraph.create_from_machines(
                        [machine1], machinegraph_options=self.machinegraph_options).G.to_undirected()
                    G2 = MachineGraph.create_from_machines(
                        [machine2], machinegraph_options=self.machinegraph_options).G.to_undirected()
                    active_graph.add_edges_from(G2.edges(data=True))
                    for word in self.excluded_words:
                        if active_graph.has_node(word) and name1 != word and name2 != word:
                            active_graph.remove_node(word)
                    i += 1
                    if self.debug_graph:
                        filename = 'test/temp_graphs/{0}_{1}_{2}.dot'.format(name1, name2, i)
                        nx.drawing.nx_agraph.write_dot(active_graph, filename)

            else:
                # no expansion: search in the precomputed full graph
                active_graph = self.UG

            if name1 not in active_graph.nodes() or name2 not in active_graph.nodes():
                return {"shortest_path" : length}
            if nx.has_path(active_graph, name1, name2):
                if self.node_weights:
                    # move node weights onto edges so nx can use weight='weight'
                    old_graph = active_graph
                    active_graph = self._transform_node_weights_to_edge_weights(old_graph)
                path = nx.shortest_path(active_graph, name1, name2, weight='weight')
                if self.fullgraph_options.embedding_weighted:
                    length = nx.shortest_path_length(active_graph, name1, name2, weight='weight')
                elif self.node_weights:
                    # sum node frequencies along the path, then subtract the
                    # two endpoints so only inner nodes count
                    for w in path:
                        length += self.node_freqs[w]
                    length = length - self.node_freqs[name1] - self.node_freqs[name2]
                else:
                    # unweighted: path length is simply its node count
                    length = len(path)
                print "PATH: " + name1 + " " + name2
                print path
                print length
                self.shortest_path_res.write("\t".join(path))
                self.shortest_path_res.write("\n")
            else:
                logging.info("path does not exist between {0} and {1}".format(name1, name2))
                self.no_path_cnt += 1
        else:
            # delegate to a precomputed shortest-path file
            length = self.lexicon.get_shortest_path(name1, name2, self.shortest_path_file_name)
        # if length != 0:
        #     length = 1.0 / length
        # else:
        #     length = 1.0
        return {"shortest_path" : length}
Пример #16
0
 def score_answer(self, answer, model, model_graph):
     """Score *answer* against the model graph, storing the score and its
     supporting evidence on the answer dict in place.
     """
     candidate_graph = MachineGraph.create_from_machines(
         answer['machines'].values())
     score, evidence = GraphSimilarity.supported_score(
         candidate_graph, model_graph)
     answer['score'] = score
     answer['evidence'] = evidence
Пример #17
0
    def get_full_graph(self, fullgraph_options):
        """Build (once) and return the union graph of the whole lexicon.

        Every word from the three lexicons is expanded into a machine
        graph; the undirected edges of each are accumulated into
        self.full_graph (cached, so subsequent calls return it directly).
        Words read from the frequency file (up to the configured cutoffs)
        are removed from the result at the end.
        """
        if self.full_graph is not None:
            return self.full_graph
        allwords = set()
        allwords.update(
            self.lexicon.keys(), self.ext_lexicon.keys(),
            self.oov_lexicon.keys())
        self.full_graph = nx.MultiDiGraph()

        excluded_words = set()

        # get excluded words set
        # each line is "<freq>\t<word>"; stop once we are past freq_cnt
        # lines and the frequency cutoff (if any) no longer admits the word
        with open(fullgraph_options.freq_file) as f:
            for line_no, line in enumerate(f):
                fields = line.strip().decode('utf-8').split('\t')
                freq = int(fields[0])
                word = fields[1]
                if line_no > fullgraph_options.freq_cnt and (
                        fullgraph_options.freq_val == 0 or
                        fullgraph_options.freq_val > freq):
                    break
                excluded_words.add(word)

        machinegraph_options = MachineGraphOptions(
            fullgraph_options=fullgraph_options)

        # TODO: only for debugging
        # until = 10
        for i, word in enumerate(allwords):
            # TODO: only for debugging
            # if word not in ['dumb', 'intelligent', 'stupid']:
            #     continue
            # if i > until:
            #     break

            machine = self.get_machine(word)
            MG = MachineGraph.create_from_machines(
                [machine], machinegraph_options=machinegraph_options)
            # TODO: maybe directed is better
            G = MG.G.to_undirected()

            # TODO: to print out all graphs
            # try:
            #     fn = os.path.join(
            #   '/home/eszter/projects/4lang/data/graphs/allwords',
            #   u"{0}.dot".format(word)).encode('utf-8')
            #     with open(fn, 'w') as dot_obj:
            #         dot_obj.write(MG.to_dot_str_graph().encode('utf-8'))
            # except:
            #     print "EXCEPTION: " + word

            # TODO: words to test have nodes
            # if 'other' in G.nodes() and 'car' in G.nodes():
            #     print word
            #
            # if word == 'merry-go-round' or word == 'Klaxon':
            #     print G.edges()

            # accumulate this word's edges into the global graph
            self.full_graph.add_edges_from(G.edges(data=True))

            # TODO: only for debugging
            # MG.G = self.full_graph
            # fn = os.path.join(
            #   '/home/eszter/projects/4lang/test/graphs/full_graph',
            #   u"{0}.dot".format(i)).encode('utf-8')
            # with open(fn, 'w') as dot_obj:
            #     dot_obj.write(MG.to_dot_str_graph().encode('utf-8'))

        # finally prune the excluded (frequency-filtered) words
        for word in excluded_words:
            if self.full_graph.has_node(word):
                self.full_graph.remove_node(word)

        return self.full_graph
Пример #18
0
 def dump_definition_graph(machine, seen=None):
     """Return the definition graph of *machine* as a dict.

     *seen* is unused here but kept for interface compatibility; its
     default is None to avoid a shared mutable default set.
     """
     if seen is None:
         seen = set()
     graph = MachineGraph.create_from_machines([machine])
     return graph.to_dict()
Пример #19
0
 def dump_definition_graph(machine, seen=None):
     """Serialize *machine*'s definition graph to a dict.

     The old signature used the mutable default ``seen=set()``, shared
     across calls; None is the safe replacement (the argument is not
     read in this body).
     """
     if seen is None:
         seen = set()
     graph = MachineGraph.create_from_machines([machine])
     return graph.to_dict()
Пример #20
0
def print_4lang_graph(word, machine, graph_dir, max_depth=None):
    """Write the dot graph of *machine* (expanded up to *max_depth*)
    to <graph_dir>/<word>.dot.
    """
    dot_graph = MachineGraph.create_from_machines([machine], max_depth=max_depth)
    out_fn = os.path.join(graph_dir, u"{0}.dot".format(word)).encode('utf-8')
    with open(out_fn, 'w') as dot_obj:
        dot_obj.write(dot_graph.to_dot().encode('utf-8'))
Пример #21
0
 def score_answer(self, answer, model, model_graph):
     """Compute and attach the supported-score of *answer* relative to the
     model graph (mutates the answer dict).
     """
     graph_of_answer = MachineGraph.create_from_machines(
         answer['machines'].values())
     answer['score'], answer['evidence'] = GraphSimilarity.supported_score(
         graph_of_answer, model_graph)