Пример #1
0
def build(input_text, input_format, window, separator, keep):
    """Parse ``input_text`` and build a Termnet from its term co-occurrences.

    The input is parsed with ``workbench.parser.parse_input``.  Every
    (source, target) pair in ``parse.cooccurrences`` carries a collection of
    sentences; pairs with fewer sentences than a cutoff are dropped and the
    remaining pairs become edges of an undirected graph.

    As a side effect the graph is exported to ``graph-adjacency.csv``.

    Args:
        input_text: Iterable input, validated by ``check.check_iterable``.
        input_format: Forwarded unchanged to the parser.
        window: Forwarded to the parser; must be greater than zero.
        separator: Forwarded unchanged to the parser.
        keep: Number in ``[0, 100]``.  The cutoff is
            ``max(int(maximum * (100 - keep) / 100), 1)`` where ``maximum``
            is the largest per-pair sentence count.

    Returns:
        A ``Termnet`` for the built graph, or an empty ``Termnet`` (empty
        graph, fresh ``Inflections``) when the built graph has no nodes.

    Raises:
        AssertionError: If ``window`` or ``keep`` is out of range.
    """
    check.check_iterable(input_text)
    assert window > 0, window
    assert keep >= 0 and keep <= 100, keep
    parse = workbench.parser.parse_input(input_text, input_format, window,
                                         separator)
    builder = GraphBuilder(Graph.UNDIRECTED)

    # One list per source term holding the sentence count of each target.
    sub_lengths = [[len(sentences) for sentences in subd.values()]
                   for subd in parse.cooccurrences.values()]

    # Histogram: sentence count -> number of pairs with that count.
    count_histogram = {}
    for lengths in sub_lengths:
        for length in lengths:
            count_histogram[length] = count_histogram.get(length, 0) + 1

    logging.debug("count_histogram: %s", count_histogram)

    if sub_lengths:
        # A source without any target contributes 0 to both extremes.
        maximum = max(max(lengths) if lengths else 0
                      for lengths in sub_lengths)
        minimum = min(min(lengths) if lengths else 0
                      for lengths in sub_lengths)
        pair_count = sum(len(lengths) for lengths in sub_lengths)
        # Guard against sources that exist but have no targets at all;
        # dividing by the pair count would otherwise raise ZeroDivisionError.
        if pair_count:
            average = sum(sum(lengths) for lengths in sub_lengths) / pair_count
        else:
            average = 0.0
    else:
        maximum = 0
        minimum = 0
        average = 0.0

    bottom_percent = (100.0 - keep) / 100.0
    # Never let the cutoff drop below one sentence.
    cutoff = max(int(maximum * bottom_percent), 1)
    logging.debug("maximum: %s, cutoff: %s", maximum, cutoff)
    occurring_sentences = {}
    excluded_lemmas = set()
    included_lemmas = set()

    for source, target_sentences in sorted(parse.cooccurrences.items()):
        targets = {
            target: sentences
            for target, sentences in target_sentences.items()
            if len(sentences) >= cutoff
        }

        if not targets:
            # No pair of this source survived the cutoff.
            excluded_lemmas.add(source)
            continue

        builder.add(source, list(targets))
        included_lemmas.add(source)
        by_target = occurring_sentences.setdefault(source, {})

        for target, sentences in targets.items():
            included_lemmas.add(target)
            # Sentences are stored as space-joined strings.
            by_target.setdefault(target, set()).update(
                " ".join(sentence) for sentence in sentences)

    # A lemma that made it into the graph as a target must not also be
    # counted as excluded (compare_with applies the same rule), otherwise
    # the total below double-counts it.
    excluded_lemmas -= included_lemmas

    graph = builder.build()
    graph.export("graph-adjacency.csv",
                 name_fn=lambda identifier: identifier.name())
    properties = Properties(parse.inflections, minimum, maximum, average,
                            cutoff,
                            len(included_lemmas) + len(excluded_lemmas),
                            included_lemmas, excluded_lemmas)

    if len(graph) > 0:
        return Termnet(graph, {LEFT: RankedGraph(graph)}, parse.inflections,
                       occurring_sentences, properties)

    # Nothing survived: hand back a well-formed but empty Termnet.
    empty = GraphBuilder(Graph.UNDIRECTED).build()
    inflections = Inflections()
    return Termnet(empty, {LEFT: RankedGraph(empty)}, inflections, {},
                   Properties(inflections))
Пример #2
0
    def compare_with(self, other):
        """Merge this Termnet with ``other`` into a single comparison Termnet.

        Both nets must use the same display-graph kind and each must hold
        exactly one ranked graph.  The result contains:

        * a display graph built from the nodes and descendants of both
          display graphs,
        * the combined inflections (``self.inflections.combine``),
        * the per-pair union of both nets' sentences,
        * merged properties: smaller minimum, larger maximum, the plain
          (unweighted) mean of both averages, the cutoff only when both
          nets agree on it (otherwise ``None``), and lemma sets where any
          lemma included by either net is removed from the excluded set,
        * this net's ranked graph under ``LEFT`` and ``other``'s under
          ``RIGHT``.
        """
        assert self.display_graph.kind == other.display_graph.kind
        assert len(self.ranked_graphs) == 1
        assert len(other.ranked_graphs) == 1

        # Replay both display graphs into one builder, self first.
        merged_builder = GraphBuilder(self.display_graph.kind)

        for net in (self, other):
            for node in net.display_graph.all_nodes:
                children = [child.identifier for child in node.descendants]
                merged_builder.add(node.identifier, children)

        display_graph = merged_builder.build()
        inflections = self.inflections.combine(other.inflections)

        # Union the sentence sets of both nets, keyed by first then second term.
        occurring_sentences = {}

        for net in (self, other):
            for first, by_second in net.sentences.items():
                merged_row = occurring_sentences.setdefault(first, {})

                for second, sentences in by_second.items():
                    merged_row.setdefault(second, set()).update(sentences)

        mine = self.properties
        theirs = other.properties
        included_lemmas = mine.included_lemmas.union(theirs.included_lemmas)
        excluded_lemmas = mine.excluded_lemmas.union(theirs.excluded_lemmas)
        # Inclusion by either net wins over exclusion by the other.
        excluded_lemmas.difference_update(included_lemmas)

        lowest = min(mine.minimum_cooccurrence_count,
                     theirs.minimum_cooccurrence_count)
        highest = max(mine.maximum_cooccurrence_count,
                      theirs.maximum_cooccurrence_count)
        mean = (mine.average_cooccurrence_count +
                theirs.average_cooccurrence_count) / 2.0

        # A shared cutoff is only meaningful when both nets used the same one.
        if mine.cutoff_cooccurrence_count == theirs.cutoff_cooccurrence_count:
            cutoff = mine.cutoff_cooccurrence_count
        else:
            cutoff = None

        properties = Properties(inflections, lowest, highest, mean, cutoff,
                                len(display_graph), included_lemmas,
                                excluded_lemmas)

        # Each side holds exactly one ranked graph (asserted above).
        ranked = {
            LEFT: list(self.ranked_graphs.values())[0],
            RIGHT: list(other.ranked_graphs.values())[0],
        }
        return Termnet(display_graph, ranked, inflections,
                       occurring_sentences, properties)