def main(argv):
    """Connect stem nodes to the Spanish nodes of a translation graph.

    Reads a DOT graph from argv[1], stems every node tagged lang=="spa"
    (after stopword removal), links each stem node ("<stem>|stem") to the
    phrase node, and writes the result to argv[2].

    argv: [script_name, graph_file_in, graph_file_out, ("splitmultiwords")]

    Fix: the original read `sys.argv` directly even though it accepts an
    `argv` parameter, which broke any caller passing a different argument
    list; an unused counter `i` was also removed.
    """
    if len(argv) < 3:
        print("call: translations_spanish_graph_connectstemswithoutstopwords.py graph_file_in.dot graph_file_out.dot [splitmultiwords]")
        sys.exit(1)

    split_multiwords = False
    if len(argv) == 4 and argv[3] == "splitmultiwords":
        print("Will split multiwords.")
        split_multiwords = True

    IN = codecs.open(argv[1], "r", "utf-8")
    gr = read(IN.read())
    IN.close()
    print("Parse finished.", file=sys.stderr)

    # Snapshot the node list first: stem nodes are added while iterating.
    nodes = gr.nodes()

    stemmer = Stemmer.Stemmer('spanish')
    stopwords = qlc.utils.stopwords_from_file("data/stopwords/spa.txt")

    for n in nodes:
        if "lang" in gr.node[n] and gr.node[n]["lang"] == "spa":
            phrase_without_stopwords = qlc.utils.remove_stopwords(n, stopwords)
            phrase_stems = qlc.utils.stem_phrase(phrase_without_stopwords, stemmer, split_multiwords)
            for stem in phrase_stems:
                # "|stem" suffix marks stem nodes and avoids clashes with phrases.
                stem = stem + "|stem"
                gr.add_node(stem, is_stem=True)
                gr.add_edge(stem, n)

    OUT = codecs.open(argv[2], "w", "utf-8")
    OUT.write(write(gr))
    OUT.close()
def main(argv):
    """Build one Spanish translation graph per dictdata set.

    argv: [script_name, data_path, bibtex_key_or_component]
    For every dictdata set that pairs Spanish with exactly one other
    language, writes a DOT file named after the dictdata string id.
    """
    if len(argv) < 3:
        print("call: translations_spanish_graph.py data_path (bibtex_key|component)")
        sys.exit(1)

    cr = CorpusReaderDict(argv[1])

    # Interpret the argument as a bibtex key first, then as a component name.
    dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2])
    if len(dictdata_ids) == 0:
        dictdata_ids = cr.dictdata_ids_for_component(argv[2])
        if len(dictdata_ids) == 0:
            print("did not find any dictionary data for the bibtex_key or component {0}.".format(argv[2]))
            sys.exit(1)

    for dictdata_id in dictdata_ids:
        gr = Graph()
        src_language_iso = cr.src_languages_iso_for_dictdata_id(dictdata_id)
        tgt_language_iso = cr.tgt_languages_iso_for_dictdata_id(dictdata_id)

        # Skip sets where neither side is Spanish, or a side has several languages.
        if (src_language_iso != ['spa']) and (tgt_language_iso != ['spa']):
            continue
        if (len(src_language_iso) > 1) or (len(tgt_language_iso) > 1):
            continue

        # The non-Spanish language of this dictdata set.
        if tgt_language_iso == ['spa']:
            language_iso = src_language_iso[0]
        else:
            language_iso = tgt_language_iso[0]

        dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id)
        bibtex_key = dictdata_string.split("_")[0]

        for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id):
            # Normalize so that `head` is always the non-Spanish entry.
            if src_language_iso == ['spa']:
                (head, translation) = (translation, head)

            head_with_source = escape_string("{0}|{1}".format(head, bibtex_key))
            translation = escape_string(translation)

            gr.add_node(head_with_source, attr_dict={"lang": language_iso, "source": bibtex_key})
            gr.add_node(translation, attr_dict={"lang": "spa"})
            gr.add_edge(head_with_source, translation)

        output = codecs.open("{0}.dot".format(dictdata_string), "w", "utf-8")
        output.write(write(gr))
        output.close()
def main(argv):
    """Merge several DOT translation graphs into one output graph.

    argv: [script_name, in_1.dot (or a glob pattern), in_2.dot, ..., out.dot]
    Reads the first input graph as the accumulator, merges nodes and edges
    (with attributes) from the remaining inputs, and writes the union to
    the last argument.

    Fixes over the original:
    * When argv[1] was a glob pattern, the remaining glob matches were
      discarded by an unconditional `files = argv[2:len(argv)-1]` after the
      first file was read — only one matched file ever got merged.
    * Bare `except:` narrowed to IOError.
    * Uses the `argv` parameter consistently instead of `sys.argv`.
    """
    if len(argv) < 4:
        print("call: translations_spanish_graph_connectstemswithoutstopwords.py graph_file_in_1.dot graph_file_in_2.dot [...] graph_file_out.dot", file=sys.stderr)
        sys.exit(1)

    IN = None
    file = argv[1]
    if not os.path.exists(file):
        # Treat argv[1] as a glob pattern; the matches are the input set.
        files = glob.glob(argv[1])
        if len(files) == 0:
            print("No input files found.", file=sys.stderr)
            sys.exit(1)
        file = files.pop(0)
    else:
        # Explicit file list: everything between argv[1] and the output file.
        files = argv[2:len(argv)-1]

    print("Processing file {0}.".format(file), file=sys.stderr)
    try:
        IN = codecs.open(file, "r", "utf-8")
    except IOError:
        print("Could not open file {0}.".format(file), file=sys.stderr)
        sys.exit(1)
    gr = read(IN.read())
    IN.close()

    # Merge the remaining graphs into the accumulator.
    for f in files:
        print("Processing file {0}.".format(f), file=sys.stderr)
        IN = codecs.open(f, "r", "utf-8")
        gr2 = read(IN.read())
        for node in gr2:
            gr.add_node(node, gr2.node[node])
        for n1, n2 in gr2.edges_iter():
            gr.add_edge(n1, n2, gr2.edge[n1][n2])
        IN.close()

    OUT = codecs.open(argv[len(argv)-1], "w", "utf-8")
    OUT.write(write(gr))
    OUT.close()
def combine_graphs():
    """Merge all per-dictdata graphs into one file.

    For each id in loaded_data["dictdata_ids"] the corresponding job's
    job_id is the name of the DOT file written earlier; the first graph
    becomes the accumulator and the others are merged into it (nodes and
    edges with their attribute dicts). The union is written to
    `filename_combined_graph`.

    NOTE(review): depends on enclosing-scope names `loaded_data`,
    `generate_dictdata_graph_job`, `filename_combined_graph`, `read`,
    `write` — confirm they are defined at module level.

    Fix: `gr == None` replaced with the idiomatic identity test `gr is None`.
    """
    gr = None
    for dictdata_id in loaded_data["dictdata_ids"]:
        j = generate_dictdata_graph_job(dictdata_id)
        target_file = j.job_id
        IN = codecs.open(target_file, "r", "utf-8")
        if gr is None:
            # First graph becomes the accumulator.
            gr = read(IN.read())
        else:
            gr2 = read(IN.read())
            # Copy nodes and edges including their attributes.
            for node in gr2:
                gr.add_node(node, gr2.node[node])
            for n1, n2 in gr2.edges_iter():
                gr.add_edge(n1, n2, gr2.edge[n1][n2])
        IN.close()
    OUT = codecs.open(filename_combined_graph, "w", "utf-8")
    OUT.write(write(gr))
    OUT.close()
def generate_dictdata_graph():
    """Create the translation graph for one dictdata set and write it as DOT.

    Raises NoSpanishException when neither side of the dictionary is Spanish.
    NOTE(review): relies on enclosing-scope names `cr`, `dictdata_id`,
    `dictdata_string`, `target_file`, `escape_string`, `Graph`, `write`.
    """
    gr = Graph()

    src_language_iso = cr.src_language_iso_for_dictdata_id(dictdata_id)
    tgt_language_iso = cr.tgt_language_iso_for_dictdata_id(dictdata_id)
    if src_language_iso != 'spa' and tgt_language_iso != 'spa':
        raise NoSpanishException

    # The non-Spanish language of this dictionary.
    language_iso = src_language_iso if tgt_language_iso == 'spa' else tgt_language_iso

    bibtex_key = dictdata_string.split("_")[0]

    for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id):
        # Ensure `head` is always the non-Spanish entry.
        if src_language_iso == 'spa':
            head, translation = translation, head

        head_with_source = escape_string("{0}|{1}".format(head, bibtex_key))
        translation = escape_string(translation)

        gr.add_node(head_with_source, attr_dict={"lang": language_iso, "source": bibtex_key})
        gr.add_node(translation, attr_dict={"lang": "spa"})
        gr.add_edge(head_with_source, translation)

    output = codecs.open(target_file, "w", "utf-8")
    output.write(write(gr))
    output.close()
# Attach stem nodes to the Spanish nodes of the combined graph, export it,
# then group head words by their source dictionary.
# NOTE(review): depends on `combined_graph`, `combined_graph_stemmed`,
# `stemmer`, `split_multiwords` from earlier in the file — confirm upstream.
stopwords = qlc.utils.stopwords_from_file("../../src/qlc/data/stopwords/spa.txt")
for node in combined_graph.nodes():
    if "lang" in combined_graph.node[node] and combined_graph.node[node]["lang"] == "spa":
        phrase_without_stopwords = qlc.utils.remove_stopwords(node, stopwords)
        phrase_stems = qlc.utils.stem_phrase(phrase_without_stopwords, stemmer, split_multiwords)
        for stem in phrase_stems:
            # "|stem" suffix marks stem nodes in the graph.
            stem = stem + "|stem"
            combined_graph_stemmed.add_node(stem, is_stem=True)
            combined_graph_stemmed.add_edge(stem, node)

# Report how many connected components remain after stemming.
print(networkx.algorithms.components.number_connected_components(combined_graph_stemmed))

OUT = codecs.open("translation_graph_stemmed.dot", "w", "utf-8")
OUT.write(write(combined_graph_stemmed))
OUT.close()

matrix = {}
sources = set()
for node in combined_graph_stemmed:
    # Only process the stem nodes added above.
    if "is_stem" in combined_graph_stemmed.node[node] and combined_graph_stemmed.node[node]["is_stem"]:
        # Spanish phrases connected to this stem.
        spanish_nodes = [n for n in combined_graph_stemmed[node] if "lang" in combined_graph_stemmed.node[n] and combined_graph_stemmed.node[n]["lang"] == "spa"]
        # Head words: neighbours of those phrases that are neither Spanish nor stems.
        head_nodes = []
        for sp in spanish_nodes:
            head_nodes += [n for n in combined_graph_stemmed[sp] if ("lang" not in combined_graph_stemmed.node[n] or combined_graph_stemmed.node[n]["lang"] != "spa") and ("is_stem" not in combined_graph_stemmed.node[n] or not combined_graph_stemmed.node[n]["is_stem"])]
        head_nodes = set(head_nodes)
        # Group heads by source; node names are "head|source".
        # NOTE(review): loop body appears truncated at this chunk boundary —
        # presumably continues with heads[head].append(source); verify in full file.
        heads = collections.defaultdict(list)
        for head in head_nodes:
            (head, source) = head.split("|")
# <codecell> len(combined_graph_stemmed.nodes()) # <markdowncell> # ## Export the merged graph as DOT # # The graph may now be exported to the DOT format, to be used in other tools for graph analysis or visualization. For this we use a helper function from the [qlc library](https://github.com/pbouda/qlc): # <codecell> from qlc.translationgraph import read, write OUT = codecs.open("translation_graph_stemmed.dot", "w", "utf-8") OUT.write(write(combined_graph_stemmed)) OUT.close() # <markdowncell> # ## Extract a subgraph for the stem of "comer" # # As an example how to further process the graph we will extract the subgraph for the stem "comer" now. For this the graph is traversed again until the node "com|stem" is found. All the neighbours of this node are copied to a new graph. We will also remove the sources from the node strings to make the final visualization more readable: # <codecell> comer_graph = networkx.Graph() for node in combined_graph_stemmed: if node == "com|stem": comer_graph.add_node(node) # spanish nodes