def build(input_text, input_format, window, separator, keep):
    """Parse *input_text* into co-occurrence data and build a Termnet.

    Term pairs survive only when their co-occurrence sentence count meets a
    cutoff derived from *keep*; everything below the cutoff is dropped and its
    source lemma recorded as excluded.

    input_text: iterable of raw input handed to the workbench parser.
    input_format: format identifier understood by workbench.parser.parse_input.
    window: positive co-occurrence window size.
    separator: token separator forwarded to the parser.
    keep: percentage (0-100) of the top co-occurrence counts to retain.
    Returns a Termnet; when no edges survive the cutoff, an empty Termnet
    built from a fresh Inflections instance.
    """
    check.check_iterable(input_text)
    assert window > 0, window
    assert 0 <= keep <= 100, keep
    parse = workbench.parser.parse_input(input_text, input_format, window, separator)
    builder = GraphBuilder(Graph.UNDIRECTED)

    # Debug-only histogram: how many (source, target) pairs have each
    # co-occurrence sentence count.
    count_histogram = {}
    for subd in parse.cooccurrences.values():
        for term_sentences in subd.values():
            count = len(term_sentences)
            count_histogram[count] = count_histogram.get(count, 0) + 1

    logging.debug("count_histogram: %s", count_histogram)

    # Per-source lists of co-occurrence counts; an empty sub-dict yields an
    # empty list and (deliberately, as before) contributes 0 to min/max.
    sub_lengths = [[len(l) for l in subd.values()]
                   for subd in parse.cooccurrences.values()]
    flat_lengths = [i for l in sub_lengths for i in l]

    if sub_lengths:
        maximum = max((max(l) if l else 0) for l in sub_lengths)
        minimum = min((min(l) if l else 0) for l in sub_lengths)
        # Guard the division: every sub-dict may be empty even when
        # cooccurrences itself is not (previously a ZeroDivisionError).
        average = (sum(flat_lengths) / len(flat_lengths)) if flat_lengths else 0.0
    else:
        maximum = 0
        minimum = 0
        average = 0.0

    # keep=100 keeps everything above count 1; keep=0 keeps only counts at
    # the maximum. The cutoff is never below 1.
    bottom_percent = (100.0 - keep) / 100.0
    cutoff = max(int(maximum * bottom_percent), 1)
    logging.debug("maximum: %s, cutoff: %s", maximum, cutoff)

    occurring_sentences = {}
    excluded_lemmas = set()
    included_lemmas = set()

    for source, target_sentences in sorted(parse.cooccurrences.items()):
        # Assume excluded until at least one target survives the cutoff.
        excluded_lemmas.add(source)
        targets = {target: sentences
                   for target, sentences in target_sentences.items()
                   if len(sentences) >= cutoff}

        if targets:
            builder.add(source, list(targets.keys()))
            included_lemmas.add(source)
            excluded_lemmas.remove(source)
            source_map = occurring_sentences.setdefault(source, {})

            for target, sentences in targets.items():
                included_lemmas.add(target)
                bucket = source_map.setdefault(target, set())
                for sentence in sentences:
                    # Sentences arrive as token sequences; store display text.
                    bucket.add(" ".join(sentence))

    graph = builder.build()
    graph.export("graph-adjacency.csv", name_fn=lambda identifier: identifier.name())
    properties = Properties(parse.inflections, minimum, maximum, average, cutoff,
                            len(included_lemmas) + len(excluded_lemmas),
                            included_lemmas, excluded_lemmas)

    if len(graph) > 0:
        return Termnet(graph, {LEFT: RankedGraph(graph)}, parse.inflections,
                       occurring_sentences, properties)

    # Nothing survived the cutoff: return an empty net with fresh inflections.
    # (An unreachable `return net` referencing an undefined name was removed.)
    empty = GraphBuilder(Graph.UNDIRECTED).build()
    inflections = Inflections()
    return Termnet(empty, {LEFT: RankedGraph(empty)}, inflections, {},
                   Properties(inflections))
def compare_with(self, other):
    """Merge this net with *other* into a single comparison Termnet.

    Both nets must share a graph kind and hold exactly one ranked graph
    each; self's ranked graph becomes the LEFT view and other's the RIGHT.

    other: another Termnet-like object with the same attributes as self.
    Returns a new Termnet over the union of both display graphs.
    """
    assert self.display_graph.kind == other.display_graph.kind
    assert len(self.ranked_graphs) == 1
    assert len(other.ranked_graphs) == 1

    # Union of both adjacency structures (same loop applied to each side).
    builder = GraphBuilder(self.display_graph.kind)
    for source_graph in (self.display_graph, other.display_graph):
        for node in source_graph.all_nodes:
            builder.add(node.identifier,
                        [d.identifier for d in node.descendants])

    display_graph = builder.build()
    inflections = self.inflections.combine(other.inflections)

    # Merge the per-pair sentence sets from both nets. Previously this was
    # two copy-pasted loops; factored into one helper applied twice.
    occurring_sentences = {}

    def _merge(pair_sentences):
        for a, b_sentences in pair_sentences.items():
            merged = occurring_sentences.setdefault(a, {})
            for b, sentences in b_sentences.items():
                merged.setdefault(b, set()).update(sentences)

    _merge(self.sentences)
    _merge(other.sentences)

    included_lemmas = self.properties.included_lemmas.union(
        other.properties.included_lemmas)
    excluded_lemmas = self.properties.excluded_lemmas.union(
        other.properties.excluded_lemmas)
    # A lemma included in either net must not be reported as excluded.
    excluded_lemmas -= included_lemmas

    properties = Properties(
        inflections,
        min(self.properties.minimum_cooccurrence_count,
            other.properties.minimum_cooccurrence_count),
        max(self.properties.maximum_cooccurrence_count,
            other.properties.maximum_cooccurrence_count),
        (self.properties.average_cooccurrence_count
         + other.properties.average_cooccurrence_count) / 2.0,
        # Cutoffs are only meaningful when both sides agree; otherwise unknown.
        self.properties.cutoff_cooccurrence_count
        if (self.properties.cutoff_cooccurrence_count
            == other.properties.cutoff_cooccurrence_count)
        else None,
        len(display_graph),
        included_lemmas,
        excluded_lemmas)

    return Termnet(
        display_graph,
        {
            # Each side holds exactly one ranked graph (asserted above).
            LEFT: next(iter(self.ranked_graphs.values())),
            RIGHT: next(iter(other.ranked_graphs.values())),
        },
        inflections,
        occurring_sentences,
        properties)