Example #1
0
import sys
import codecs

from qlc.corpusreader import CorpusReaderDict

def main(argv):

    if len(argv) < 2:
        print("call: heads_with_translations.py data_path [(bibtex_key|component)]")
        sys.exit(1)

    cr = CorpusReaderDict(argv[1])
    print("Data loaded", file=sys.stderr)
    
    dictdata_ids = []    
    if len(argv) == 3:
        dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2])
        if len(dictdata_ids) == 0:
            dictdata_ids = cr.dictdata_ids_for_component(argv[2])
            if len(dictdata_ids) == 0:
                print("did not find any dictionary data for the bibtex_key or component {0}.".format(argv[2]))
                sys.exit(1)
    else:
        dictdata_ids = cr.dictdata_string_ids
        
    
    for dictdata_id in dictdata_ids:
        dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id)
        print("Writing data for dictdata string ID {0}".format(dictdata_string), file=sys.stderr)

        # One output file per dictdata set, tab-separated head/translation pairs.
        output = codecs.open("heads_with_translations_%s.txt" % dictdata_string, "w", "utf-8")
        
        for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id):
            output.write("%s\t%s\n" % (head, translation))
        
        output.close()
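# Minimal entry point, assumed for running the excerpt as a script
# (not part of the original fragment):
if __name__ == "__main__":
    main(sys.argv)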
Example #2
import sys
import codecs

from qlc.corpusreader import CorpusReaderDict
from qlc.translationgraph import write
from networkx import Graph

def escape_string(s):
    # Assumed helper (not shown in the fragment): make labels safe for DOT
    # output by escaping double quotes.
    return s.replace('"', '\\"')

def main(argv):
    
    if len(argv) < 3:
        print("call: translations_spanish_graph.py data_path (bibtex_key|component)")
        sys.exit(1)

    cr = CorpusReaderDict(argv[1])

    dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2])
    if len(dictdata_ids) == 0:
        dictdata_ids = cr.dictdata_ids_for_component(argv[2])
        if len(dictdata_ids) == 0:
            print("did not find any dictionary data for the bibtex_key or component {0}.".format(argv[2]))
            sys.exit(1)
        

    for dictdata_id in dictdata_ids:
        gr = Graph()
        src_language_iso = cr.src_languages_iso_for_dictdata_id(dictdata_id)
        tgt_language_iso = cr.tgt_languages_iso_for_dictdata_id(dictdata_id)
        if (src_language_iso != ['spa']) and (tgt_language_iso != ['spa']):
            continue
        
        if (len(src_language_iso) > 1) or (len(tgt_language_iso) > 1):
            continue
        
        language_iso = None
        if tgt_language_iso == [ 'spa' ]:
            language_iso = src_language_iso[0]
        else:
            language_iso = tgt_language_iso[0]
                        
        dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id)
        bibtex_key = dictdata_string.split("_")[0]

        for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id):
            if src_language_iso == [ 'spa' ]:
                (head, translation) = (translation, head)
                
            head_with_source = escape_string("{0}|{1}".format(head, bibtex_key))
            translation = escape_string(translation)
            
            # Membership checks are unnecessary: networkx merges duplicate
            # nodes and ignores duplicate edges in an undirected Graph.
            gr.add_node(head_with_source, attr_dict={ "lang": language_iso, "source": bibtex_key })
            gr.add_node(translation, attr_dict={ "lang": "spa" })
            gr.add_edge(head_with_source, translation)

        # Serialize the translation graph to DOT, one file per dictdata ID.
        output = codecs.open("{0}.dot".format(dictdata_string), "w", "utf-8")
        output.write(write(gr))
        output.close()
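# Minimal entry point (assumed; the excerpt omits it). The generated .dot
# files can then be rendered with Graphviz, e.g. `dot -Tsvg out.dot -o out.svg`.
if __name__ == "__main__":
    main(sys.argv)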
Example #3
0
import codecs, unicodedata
import collections
import re
import copy

from qlc.corpusreader import CorpusReaderDict
from qlc.translationgraph import read, write
import qlc.utils

from nltk.stem.snowball import SpanishStemmer

import networkx

cr = CorpusReaderDict("c:/data/qlc")

dictdata_ids = cr.dictdata_ids_for_component("Witotoan")
re_quotes = re.compile('"')

print(dictdata_ids)

graphs = list()
for dictdata_id in dictdata_ids:
    gr = networkx.Graph()

    src_language_iso = cr.src_language_iso_for_dictdata_id(dictdata_id)
    tgt_language_iso = cr.tgt_language_iso_for_dictdata_id(dictdata_id)
    if src_language_iso != 'spa' and tgt_language_iso != 'spa':
        continue
    
    language_iso = None
    if tgt_language_iso == 'spa':
        language_iso = src_language_iso
    else:
        language_iso = tgt_language_iso
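    # The fragment breaks off here. A sketch of how the loop body likely
    # continued, modeled on Example #2 (assumed, not from the original):
    dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id)
    bibtex_key = dictdata_string.split("_")[0]
    for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id):
        if src_language_iso == 'spa':
            head, translation = translation, head
        # re_quotes (defined above) presumably strips double quotes from labels
        head_with_source = re_quotes.sub("", "{0}|{1}".format(head, bibtex_key))
        translation = re_quotes.sub("", translation)
        gr.add_node(head_with_source, attr_dict={"lang": language_iso, "source": bibtex_key})
        gr.add_node(translation, attr_dict={"lang": "spa"})
        gr.add_edge(head_with_source, translation)
    graphs.append(gr)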
Example #4
import sys
import codecs
import collections
import re
from operator import itemgetter

import Stemmer  # PyStemmer; provides Stemmer.Stemmer('spanish')

from qlc.corpusreader import CorpusReaderDict
from qlc.utils import spanish_stopwords  # assumed import path for the stopword list

def main(argv):

    if len(argv) < 2:
        print("call: translations_spanish_1.py data_path [component]")
        sys.exit(1)

    cr = CorpusReaderDict(argv[1])

    dictdata_ids = []    
    if len(argv) == 3:
        dictdata_ids = cr.dictdata_ids_for_component(argv[2])
        if len(dictdata_ids) == 0:
            print("did not find any dictionary data for the component {0}.".format(argv[2]))
            sys.exit(1)
    else:
        dictdata_ids = cr.dictdata_string_ids
        
    spanish_singleword_dict = {}
    languages_iso = []
    spanish_len2_dict = collections.defaultdict(int)
    spanish_len3_dict = collections.defaultdict(int)
    spanish_lengreater3_dict = collections.defaultdict(int)
    
    stemmer = Stemmer.Stemmer('spanish')
    stopwords = spanish_stopwords()
    # Build a regex matching any stopword on word boundaries. Joining str
    # with bytes (the original .encode("utf-8")) would fail under Python 3.
    re_stopwords = re.compile(r"\b(?:{0})\b".format("|".join(stopwords)))

    for dictdata_id in dictdata_ids:
        src_language_iso = cr.src_language_iso_for_dictdata_id(dictdata_id)
        tgt_language_iso = cr.tgt_language_iso_for_dictdata_id(dictdata_id)
        if src_language_iso != 'spa' and tgt_language_iso != 'spa':
            continue

        heads_with_translations = cr.heads_with_translations_for_dictdata_id(dictdata_id)
        dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id)
        bibtex_key = dictdata_string.split("_")[0]

        # Note: the bibtex key, not an ISO code, labels the column for this source.
        language_iso = bibtex_key
        if bibtex_key not in languages_iso:
            languages_iso.append(bibtex_key)

        for entry_id in heads_with_translations:
            if tgt_language_iso == 'spa':
                heads = heads_with_translations[entry_id]['heads']
                translations = heads_with_translations[entry_id]['translations']
            else:
                heads = heads_with_translations[entry_id]['translations']
                translations = heads_with_translations[entry_id]['heads']
                
            for translation in translations:
                len_translation = len(translation.split(' '))
                if len_translation > 1:
                    # Strip stopwords, collapse runs of spaces, and trim.
                    translation_without_stopwords = re_stopwords.sub("", translation)
                    translation_without_stopwords = re.sub(" +", " ", translation_without_stopwords).strip(" ")
                    if translation_without_stopwords == "":
                        # everything was a stopword; fall back to the full phrase
                        translation_without_stopwords = translation
                        len_translation_without_stopwords = len_translation
                    else:
                        len_translation_without_stopwords = len(translation_without_stopwords.split(' '))

                else:
                    translation_without_stopwords = translation
                    len_translation_without_stopwords = len_translation

                
                if len_translation_without_stopwords == 1:
                    translation_stem = stemmer.stemWord(translation_without_stopwords)
                    if translation_stem not in spanish_singleword_dict:
                        spanish_singleword_dict[translation_stem] = collections.defaultdict(set)
                    for head in heads_with_translations[entry_id]['heads']:
                        spanish_singleword_dict[translation_stem][language_iso].add(head)
                    spanish_singleword_dict[translation_stem]["spa"].add(translation)
                        

                elif len_translation == 2:
                    spanish_len2_dict[translation] += 1

                elif len_translation == 3:
                    spanish_len3_dict[translation] += 1

                else:
                    spanish_lengreater3_dict[translation] += 1


    # Write frequency-sorted lists of the multi-word translations.
    output2 = codecs.open("spanish_len2.txt", "w", "utf-8")
    for w in sorted(spanish_len2_dict.items(), key=itemgetter(1), reverse=True):
        output2.write("{0}\t{1}\n".format(w[0], w[1]))

    output3 = codecs.open("spanish_len3.txt", "w", "utf-8")
    for w in sorted(spanish_len3_dict.items(), key=itemgetter(1), reverse=True):
        output3.write("{0}\t{1}\n".format(w[0], w[1]))

    output4 = codecs.open("spanish_len_greater3.txt", "w", "utf-8")
    for w in sorted(spanish_lengreater3_dict.items(), key=itemgetter(1), reverse=True):
        output4.write("{0}\t{1}\n".format(w[0], w[1]))

    output = codecs.open("spanish_singlewords_matrix.txt", "w", "utf-8")
    output1 = codecs.open("spanish_len1.txt", "w", "utf-8")
    total_count = 0
    more_than_one_lang_count = 0
    output.write("%s\t%s\n" % ("spa", "\t".join(languages_iso[1:])))
    for sp in sorted(spanish_singleword_dict):
        output.write("%s" % ('|'.join(sorted(spanish_singleword_dict[sp]["spa"]))))
        count_languages = 0
        for lang in languages_iso[1:]:
            if len(spanish_singleword_dict[sp][lang]) > 0:
                count_languages += 1
            output.write("\t%s" % ('|'.join(sorted(spanish_singleword_dict[sp][lang]))))
        output.write("\n")
        output1.write("{0}\n".format(sp))
        if count_languages > 1:
            more_than_one_lang_count += 1
        total_count += 1
        
    print("total number of entries in single word matrix: {0}".format(total_count))
    print("number of entries with more than one language: {0}".format(more_than_one_lang_count))