# Shared imports for the snippets below. The qlc helpers (CorpusReaderDict,
# OrthographyParser, OrthographyRulesParser, Graph, write, escape_string)
# are assumed to come from the qlc package; adjust the import paths to your
# installation.
import codecs
import collections
import os
import sys
from operator import itemgetter

import qlc.utils


def main(argv):

    if len(argv) < 3:
        print("call: translations_spanish_1.py data_path bibtex_key")
        sys.exit(1)

    cr = CorpusReaderDict(argv[1])
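    # CorpusReaderDict provides access to the dictionary data below data_path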

    dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2])
    if len(dictdata_ids) == 0:
        print("did not find any dictionary data for the bibtex_key {0}.".format(argv[2]))
        sys.exit(1)

    
    for dictdata_id in dictdata_ids:
        translations = collections.defaultdict(int)
        heads_with_translations = cr.heads_with_translations_for_dictdata_id(dictdata_id)
        dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id)
        output = codecs.open("translations_subentries_for_%s.txt" % dictdata_string, "w", "utf-8")
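        # output: one "translation<TAB>count" line per distinct translation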
        
        for entry_id in heads_with_translations:
            if heads_with_translations[entry_id]['is_subentry'] == 't':
                for t in heads_with_translations[entry_id]['translations']:
                    translations[t] += 1

        # write translations sorted by descending frequency
        for translation, count in sorted(translations.items(), key=itemgetter(1), reverse=True):
            output.write("{0}\t{1}\n".format(translation, count))

        output.close()
def main(argv):

    if len(argv) < 2:
        print("call: heads_with_translations.py data_path [(bibtex_key|component)]")
        sys.exit(1)

    cr = CorpusReaderDict(argv[1])
    print("Data loaded", file=sys.stderr)
    
    # try the argument as a bibtex_key first, then as a component;
    # with no argument, process every dictionary in the corpus
    if len(argv) == 3:
        dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2])
        if len(dictdata_ids) == 0:
            dictdata_ids = cr.dictdata_ids_for_component(argv[2])
            if len(dictdata_ids) == 0:
                print("did not find any dictionary data for the bibtex_key or component {0}.".format(argv[2]))
                sys.exit(1)
    else:
        dictdata_ids = cr.dictdata_string_ids
        
    
    for dictdata_id in dictdata_ids:
        dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id)
        print("Writing data for dictdata string ID {0}".format(dictdata_string), file=sys.stderr)

        output = codecs.open("heads_with_translations_%s.txt" % dictdata_string, "w", "utf-8")
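        # output: one "head<TAB>translation" line per entry pair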
        
        for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id):
            output.write("%s\t%s\n" % (head, translation))
        
        output.close()
def main(argv):
    
    if len(argv) < 3:
        print("call: translations_spanish_graph.py data_path (bibtex_key|component)")
        sys.exit(1)

    cr = CorpusReaderDict(argv[1])

    dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2])
    if len(dictdata_ids) == 0:
        dictdata_ids = cr.dictdata_ids_for_component(argv[2])
        if len(dictdata_ids) == 0:
            print("did not find any dictionary data for the bibtex_key or component {0}.".format(argv[2]))
            sys.exit(1)
        

    for dictdata_id in dictdata_ids:
        gr = Graph()
        src_language_iso = cr.src_languages_iso_for_dictdata_id(dictdata_id)
        tgt_language_iso = cr.tgt_languages_iso_for_dictdata_id(dictdata_id)
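        # keep only dictionaries that pair Spanish with exactly one other language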
        if (src_language_iso != ['spa']) and (tgt_language_iso != ['spa']):
            continue
        
        if (len(src_language_iso) > 1) or (len(tgt_language_iso) > 1):
            continue
        
        # the remaining (non-Spanish) language labels the head side
        if tgt_language_iso == ['spa']:
            language_iso = src_language_iso[0]
        else:
            language_iso = tgt_language_iso[0]

        dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id)
        bibtex_key = dictdata_string.split("_")[0]

        for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id):
            if src_language_iso == [ 'spa' ]:
                (head, translation) = (translation, head)
                
            head_with_source = escape_string("{0}|{1}".format(head, bibtex_key))
            translation = escape_string(translation)
            
            # duplicate adds are assumed to be safe with this Graph
            # implementation, so no existence checks are needed
            gr.add_node(head_with_source, attr_dict={"lang": language_iso, "source": bibtex_key})
            gr.add_node(translation, attr_dict={"lang": "spa"})
            gr.add_edge(head_with_source, translation)

        output = codecs.open("{0}.dot".format(dictdata_string), "w", "utf-8")
        output.write(write(gr))
        output.close()
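
# Hypothetical invocation, mirroring the data layout of the other snippets:
#   python translations_spanish_graph.py data/csv/ thiesen1998
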
def main(argv):
    # check for the right number of command line arguments
    if len(argv) < 3:
        print()
        print("Call: create_initial_orthography_profile.py data_path data_source")
        print()
        print("python create_initial_orthography_profile.py data/csv/ thiesen1998")
        sys.exit(1)

    data_path = argv[1]
    data_source = argv[2]

    orthography_profile = codecs.open(data_source + "_initial_profile.txt", "w", "utf-8")  # output file
    cr = CorpusReaderDict(data_path) 
    dictdata_ids = cr.dictdata_ids_for_bibtex_key(data_source)

    # make sure the resource is in the data
    if len(dictdata_ids) == 0:
        print("There is no dictionary source for the data source you provided: "+data_source)
        sys.exit(1)


    grapheme_frequency_dict = collections.defaultdict(int)
    grapheme_count = 0.0
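    # grapheme_count is a float so the percentage computation below is a
    # true division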

    for dictdata_id in dictdata_ids:
        for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id):
            graphemes = qlc.utils.parseGraphemes(head)
            for grapheme in graphemes:
                grapheme_count += 1
                grapheme_frequency_dict[grapheme] += 1

    header = "grapheme" + "\t" + "count" + "\t" + "frequency (%)"
    print(header)
    orthography_profile.write(header+"\n")
    for k, v in grapheme_frequency_dict.items():
        if k == " ":  # skip space between words
            continue
        result = k + "\t" + str(v) + "\t" + str(v / grapheme_count * 100)
        print(result)
        orthography_profile.write(result + "\n")

    orthography_profile.close()
# cr = CorpusReaderWordlist("data/testcorpus")
# cr = CorpusReaderDict("data/testcorpus")
# cr = CorpusReaderWordlist("data/csv")
cr = CorpusReaderDict("data/csv")
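
# NOTE: "source" (a bibtex key such as "thiesen1998") is assumed to be
# defined earlier in the original script this fragment was taken from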

o = OrthographyParser("data/orthography_profiles/"+source+".txt")

rules_file_flag = False
rules_path = "data/orthography_profiles/" + "rules_" + source + ".txt"
if os.path.isfile(rules_path):
    rules = OrthographyRulesParser(rules_path)
    rules_file_flag = True

# create a generator over the corpus reader data
wordlist_iterator = (
    (wordlistdata_id, head, translation)
    for wordlistdata_id in cr.dictdata_ids_for_bibtex_key(source)
    for head, translation in cr.heads_with_translations_for_dictdata_id(wordlistdata_id)
)

# print header; without a rules file there is no orthographic_rules_parse column

if rules_file_flag:
    print("wordlist_id" + "\t" + "translation" + "\t" + "head" + "\t" + "graphemic_parse" + "\t" + "orthographic_rules_parse" + "\t" + "ipa_parse")
else:
    print("wordlist_id" + "\t" + "translation" + "\t" + "head" + "\t" + "graphemic_parse" + "\t" + "ipa_parse")

err_count = 0
errors = ""

# print all the things!
for wordlistdata_id, head, translation in wordlist_iterator: