def main(argv): if len(argv) < 3: print("call: translations_spanish_1.py data_path bibtex_key") sys.exit(1) cr = CorpusReaderDict(argv[1]) dictdata_ids = [] dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2]) if len(dictdata_ids) == 0: print("did not find any dictionary data for the bibtex_key.") sys.exit(1) for dictdata_id in dictdata_ids: translations = collections.defaultdict(int) heads_with_translations = cr.heads_with_translations_for_dictdata_id(dictdata_id) dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id) output = codecs.open("translations_subentries_for_%s.txt" % dictdata_string, "w", "utf-8") for entry_id in heads_with_translations: if heads_with_translations[entry_id]['is_subentry'] == 't': for t in heads_with_translations[entry_id]['translations']: translations[t] += 1 for w in sorted(translations.iteritems(), key=itemgetter(1), reverse=True): output.write("{0}\t{1}\n".format(w[0], w[1]))
def main(argv): if len(argv) < 2: print("call: heads_with_translations.py data_path [(bibtex_key|component)]") exit(1) cr = CorpusReaderDict(argv[1]) print("Data loaded", file=sys.stderr) dictdata_ids = [] if len(argv) == 3: dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2]) if len(dictdata_ids) == 0: dictdata_ids = cr.dictdata_ids_for_component(argv[2]) if len(dictdata_ids) == 0: print("did not find any dictionary data for the bibtex_key or component {0}.".format(argv[2])) sys.exit(1) else: dictdata_ids = cr.dictdata_string_ids for dictdata_id in dictdata_ids: #heads_with_translations = cr.heads_with_translations_for_dictdata_id(dictdata_id) dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id) print("Writing data for dictdata string ID {0}".format(dictdata_string), file=sys.stderr) output = codecs.open("heads_with_translations_%s.txt" % dictdata_string, "w", "utf-8") for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id): output.write("%s\t%s\n" % (head, translation)) output.close()
def main(argv): if len(argv) < 3: print("call: translations_spanish_graph.py data_path (bibtex_key|component)") sys.exit(1) cr = CorpusReaderDict(argv[1]) dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2]) if len(dictdata_ids) == 0: dictdata_ids = cr.dictdata_ids_for_component(argv[2]) if len(dictdata_ids) == 0: print("did not find any dictionary data for the bibtex_key or component {0}.".format(argv[2])) sys.exit(1) for dictdata_id in dictdata_ids: gr = Graph() src_language_iso = cr.src_languages_iso_for_dictdata_id(dictdata_id) tgt_language_iso = cr.tgt_languages_iso_for_dictdata_id(dictdata_id) if (src_language_iso != ['spa']) and (tgt_language_iso != ['spa']): continue if (len(src_language_iso) > 1) or (len(tgt_language_iso) > 1): continue language_iso = None if tgt_language_iso == [ 'spa' ]: language_iso = src_language_iso[0] else: language_iso = tgt_language_iso[0] dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id) bibtex_key = dictdata_string.split("_")[0] for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id): if src_language_iso == [ 'spa' ]: (head, translation) = (translation, head) head_with_source = escape_string("{0}|{1}".format(head, bibtex_key)) translation = escape_string(translation) #translation_with_language = "{0}|{1}".format(translation, language_iso) #if head_with_source not in gr: gr.add_node(head_with_source, attr_dict={ "lang": language_iso, "source": bibtex_key }) #if translation not in gr: gr.add_node(translation, attr_dict={ "lang": "spa" }) #if not gr.has_edge((head_with_source, translation)): gr.add_edge(head_with_source, translation) output = codecs.open("{0}.dot".format(dictdata_string), "w", "utf-8") output.write(write(gr)) output.close()
# create_initial_orthography_profile.py
import sys
import collections

import qlc.utils
from qlc.corpusreader import CorpusReaderDict  # assumed import path


def main(argv):
    # check for the right number of command line arguments
    if len(argv) < 3:
        print()
        print("Call: create_initial_orthography_profile.py data_path data_source")
        print()
        print("python create_initial_orthography_profile.py data/csv/ thiesen1998")
        sys.exit(1)

    data_path = argv[1]   # the original read sys.argv here despite taking argv
    data_source = argv[2]
    orthography_profile = open(data_source + "_initial_profile.txt", "w")  # output file

    cr = CorpusReaderDict(data_path)
    dictdata_ids = cr.dictdata_ids_for_bibtex_key(data_source)

    # make sure the resource is in the data
    if len(dictdata_ids) == 0:
        print("There is no dictionary source for the data source you provided: " + data_source)
        sys.exit(1)

    # count grapheme occurrences over all dictionary heads
    grapheme_frequency_dict = collections.defaultdict(int)
    grapheme_count = 0.0
    for dictdata_id in dictdata_ids:
        for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id):
            graphemes = qlc.utils.parseGraphemes(head)
            for grapheme in graphemes:
                grapheme_count += 1
                grapheme_frequency_dict[grapheme] += 1

    header = "grapheme" + "\t" + "count" + "\t" + "total frequency"
    print(header)
    orthography_profile.write(header + "\n")

    for k, v in grapheme_frequency_dict.items():
        if k == " ":  # skip space between words
            continue
        result = k + "\t" + str(v) + "\t" + str(v / grapheme_count * 100)
        print(result)
        orthography_profile.write(result + "\n")
    orthography_profile.close()
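
# A standard entry point is assumed here (not shown in the original):
if __name__ == "__main__":
    main(sys.argv)

# The profile lists each grapheme with its raw count and its share of all
# graphemes in the heads, e.g. (hypothetical values):
#
#     grapheme    count   total frequency
#     a           1021    12.4152...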
# cr = CorpusReaderWordlist("data/testcorpus") # cr = CorpusReaderDict("data/testcorpus") # cr = CorpusReaderWordlist("data/csv") cr = CorpusReaderDict("data/csv") o = OrthographyParser("data/orthography_profiles/"+source+".txt") rules_file_flag = 0 if os.path.isfile("data/orthography_profiles/"+"rules_"+source+".txt"): rules = OrthographyRulesParser("data/orthography_profiles/"+"rules_"+source+".txt") rules_file_flag = 1 # create a generator of corpus reader data wordlist_iterator = ( (wordlistdata_id, head, translation) for wordlistdata_id in cr.dictdata_ids_for_bibtex_key(source) for head, translation in cr.heads_with_translations_for_dictdata_id(wordlistdata_id) ) # print header if rules_file_flag: print("wordlist_id"+"\t"+"translation"+"\t"+"head"+"\t"+"graphemic_parse"+"\t"+"orthographic_rules_parse"+"\t"+"ipa_parse") else: print("wordlist_id"+"\t"+"translation"+"\t"+"head"+"\t"+"graphemic_parse"+"\t"+"orthographic_rules_parse"+"\t"+"ipa_parse") err_count = 0 errors = "" # print all the things! for wordlistdata_id, head, translation in wordlist_iterator: