def main(argv): if len(argv) < 3: print("call: translations_spanish_1.py data_path bibtex_key") sys.exit(1) cr = CorpusReaderDict(argv[1]) dictdata_ids = [] dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2]) if len(dictdata_ids) == 0: print("did not find any dictionary data for the bibtex_key.") sys.exit(1) for dictdata_id in dictdata_ids: translations = collections.defaultdict(int) heads_with_translations = cr.heads_with_translations_for_dictdata_id(dictdata_id) dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id) output = codecs.open("translations_subentries_for_%s.txt" % dictdata_string, "w", "utf-8") for entry_id in heads_with_translations: if heads_with_translations[entry_id]['is_subentry'] == 't': for t in heads_with_translations[entry_id]['translations']: translations[t] += 1 for w in sorted(translations.iteritems(), key=itemgetter(1), reverse=True): output.write("{0}\t{1}\n".format(w[0], w[1]))
def main(argv): if len(argv) < 2: print("call: heads_with_translations.py data_path [(bibtex_key|component)]") exit(1) cr = CorpusReaderDict(argv[1]) print("Data loaded", file=sys.stderr) dictdata_ids = [] if len(argv) == 3: dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2]) if len(dictdata_ids) == 0: dictdata_ids = cr.dictdata_ids_for_component(argv[2]) if len(dictdata_ids) == 0: print("did not find any dictionary data for the bibtex_key or component {0}.".format(argv[2])) sys.exit(1) else: dictdata_ids = cr.dictdata_string_ids for dictdata_id in dictdata_ids: #heads_with_translations = cr.heads_with_translations_for_dictdata_id(dictdata_id) dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id) print("Writing data for dictdata string ID {0}".format(dictdata_string), file=sys.stderr) output = codecs.open("heads_with_translations_%s.txt" % dictdata_string, "w", "utf-8") for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id): output.write("%s\t%s\n" % (head, translation)) output.close()
def main(argv): if len(argv) < 3: print("call: translations_spanish_graph.py data_path (bibtex_key|component)") sys.exit(1) cr = CorpusReaderDict(argv[1]) dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2]) if len(dictdata_ids) == 0: dictdata_ids = cr.dictdata_ids_for_component(argv[2]) if len(dictdata_ids) == 0: print("did not find any dictionary data for the bibtex_key or component {0}.".format(argv[2])) sys.exit(1) for dictdata_id in dictdata_ids: gr = Graph() src_language_iso = cr.src_languages_iso_for_dictdata_id(dictdata_id) tgt_language_iso = cr.tgt_languages_iso_for_dictdata_id(dictdata_id) if (src_language_iso != ['spa']) and (tgt_language_iso != ['spa']): continue if (len(src_language_iso) > 1) or (len(tgt_language_iso) > 1): continue language_iso = None if tgt_language_iso == [ 'spa' ]: language_iso = src_language_iso[0] else: language_iso = tgt_language_iso[0] dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id) bibtex_key = dictdata_string.split("_")[0] for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id): if src_language_iso == [ 'spa' ]: (head, translation) = (translation, head) head_with_source = escape_string("{0}|{1}".format(head, bibtex_key)) translation = escape_string(translation) #translation_with_language = "{0}|{1}".format(translation, language_iso) #if head_with_source not in gr: gr.add_node(head_with_source, attr_dict={ "lang": language_iso, "source": bibtex_key }) #if translation not in gr: gr.add_node(translation, attr_dict={ "lang": "spa" }) #if not gr.has_edge((head_with_source, translation)): gr.add_edge(head_with_source, translation) output = codecs.open("{0}.dot".format(dictdata_string), "w", "utf-8") output.write(write(gr)) output.close()
import networkx

# Fragment: cr, dictdata_ids and re_quotes (a compiled pattern that strips
# quote characters from node labels) are defined earlier in the script.
graphs = list()
for dictdata_id in dictdata_ids:
    gr = networkx.Graph()
    src_language_iso = cr.src_language_iso_for_dictdata_id(dictdata_id)
    tgt_language_iso = cr.tgt_language_iso_for_dictdata_id(dictdata_id)
    # Only process dictionaries that map to or from Spanish.
    if src_language_iso != 'spa' and tgt_language_iso != 'spa':
        continue

    if tgt_language_iso == 'spa':
        language_iso = src_language_iso
    else:
        language_iso = tgt_language_iso

    dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id)
    bibtex_key = dictdata_string.split("_")[0]

    for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id):
        if src_language_iso == 'spa':
            (head, translation) = (translation, head)
        head_with_source = re_quotes.sub('', "{0}|{1}".format(head, bibtex_key))
        translation = re_quotes.sub('', translation)
        # attr_dict is the networkx 1.x style of attaching node attributes.
        gr.add_node(head_with_source,
                    attr_dict={"lang": language_iso, "source": bibtex_key})
        gr.add_node(translation, attr_dict={"lang": "spa"})
        gr.add_edge(head_with_source, translation)

    graphs.append(gr)

print(networkx.algorithms.components.number_connected_components(graphs[0]))
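# The fragment above prints the component count only for the first graph; a
# small sketch that reports one count per dictionary graph instead:
for i, g in enumerate(graphs):
    print("graph {0}: {1} connected components".format(
        i, networkx.algorithms.components.number_connected_components(g)))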
def main(argv): if len(argv) < 2: print("call: translations_spanish_1.py data_path [component]") sys.exit(1) cr = CorpusReaderDict(argv[1]) dictdata_ids = [] if len(argv) == 3: dictdata_ids = cr.dictdata_ids_for_component(argv[2]) if len(dictdata_ids) == 0: print("did not find any dictionary data for the bibtex_key.") sys.exit(1) else: dictdata_ids = cr.dictdata_string_ids spanish_singleword_dict = {} languages_iso = [] spanish_len2_dict = collections.defaultdict(int) spanish_len3_dict = collections.defaultdict(int) spanish_lengreater3_dict = collections.defaultdict(int) stemmer = Stemmer.Stemmer('spanish') stopwords = spanish_stopwords() re_stopwords = re.compile(r"\b(?:{0})\b".format( "|".join(stopwords).encode("utf-8") )) for dictdata_id in dictdata_ids: src_language_iso = cr.src_language_iso_for_dictdata_id(dictdata_id) tgt_language_iso = cr.tgt_language_iso_for_dictdata_id(dictdata_id) if src_language_iso != 'spa' and tgt_language_iso != 'spa': continue heads_with_translations = cr.heads_with_translations_for_dictdata_id(dictdata_id) dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id) bibtex_key = dictdata_string.split("_")[0] language_iso = bibtex_key if bibtex_key not in languages_iso: languages_iso.append(bibtex_key) for entry_id in heads_with_translations: if tgt_language_iso == 'spa': heads = heads_with_translations[entry_id]['heads'] translations = heads_with_translations[entry_id]['translations'] else: heads = heads_with_translations[entry_id]['translations'] translations = heads_with_translations[entry_id]['heads'] for translation in translations: len_translation = len(translation.split(' ')) if len_translation > 1: translation_without_stopwords = re_stopwords.sub("", translation) translation_without_stopwords = translation_without_stopwords.strip(" ") translation_without_stopwords = re.sub(" +", " ", translation_without_stopwords) if translation_without_stopwords == " " or translation_without_stopwords == "": translation_without_stopwords = translation len_translation_without_stopwords = len_translation else: len_translation_without_stopwords = len(translation_without_stopwords.split(' ')) else: translation_without_stopwords = translation len_translation_without_stopwords = len_translation if len_translation_without_stopwords == 1: #print translation.encode("utf-8") translation_stem = stemmer.stemWord(translation_without_stopwords) if not translation_stem in spanish_singleword_dict: spanish_singleword_dict[translation_stem] = collections.defaultdict(set) for head in heads_with_translations[entry_id]['heads']: spanish_singleword_dict[translation_stem][language_iso].add(head) spanish_singleword_dict[translation_stem]["spa"].add(translation) elif len_translation == 2: #output2.write("%s\n" % (translation)) spanish_len2_dict[translation] += 1 elif len_translation == 3: #output3.write("%s\n" % (translation)) spanish_len3_dict[translation] += 1 else: #output4.write("%s\n" % (translation)) spanish_lengreater3_dict[translation] += 1 #output1 = codecs.open("spanish_len1.txt", "w", "utf-8") #for w in sorted(spanish_singleword_dict.iteritems(), key=itemgetter(0), reverse=True): # output1.write(u"{0}\n".format(w[0])) output2 = codecs.open("spanish_len2.txt", "w", "utf-8") for w in sorted(spanish_len2_dict.items(), key=itemgetter(1), reverse=True): output2.write("{0}\t{1}\n".format(w[0], w[1])) output3 = codecs.open("spanish_len3.txt", "w", "utf-8") for w in sorted(spanish_len3_dict.items(), key=itemgetter(1), reverse=True): output3.write("{0}\t{1}\n".format(w[0], w[1])) 
    output4 = codecs.open("spanish_len_greater3.txt", "w", "utf-8")
    for w in sorted(spanish_lengreater3_dict.items(), key=itemgetter(1), reverse=True):
        output4.write("{0}\t{1}\n".format(w[0], w[1]))
    output4.close()

    output = codecs.open("spanish_singlewords_matrix.txt", "w", "utf-8")
    output1 = codecs.open("spanish_len1.txt", "w", "utf-8")
    total_count = 0
    more_than_one_lang_count = 0
    # Header row: the Spanish column first, then one column per language.
    output.write("%s\t%s\n" % ("spa", "\t".join(languages_iso[1:])))
    for sp in sorted(spanish_singleword_dict):
        output.write("%s" % ('|'.join(sorted(spanish_singleword_dict[sp]["spa"]))))
        count_languages = 0
        for lang in languages_iso[1:]:
            if len(spanish_singleword_dict[sp][lang]) > 0:
                count_languages += 1
            # Write the cell even when empty, so every row has a full set of columns.
            output.write("\t%s" % ('|'.join(sorted(spanish_singleword_dict[sp][lang]))))
        output.write("\n")
        output1.write("{0}\n".format(sp))
        if count_languages > 1:
            more_than_one_lang_count += 1
        total_count += 1
    output.close()
    output1.close()

    print("total number of entries in single word matrix: {0}".format(total_count))
    print("number of entries with more than one language: {0}".format(more_than_one_lang_count))


if __name__ == "__main__":
    main(sys.argv)
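# Sketch of the resulting spanish_singlewords_matrix.txt layout: one header
# row, then one row per Spanish stem; cells are tab-separated and multiple
# values within a cell are joined by '|'. Keys and forms are placeholders:
#
#     spa                 bibtexkey1      bibtexkey2
#     forma|formas        head_a|head_b
#     palabra                             head_c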