def main(argv):
    """Export concept/counterpart pairs, one tab-separated file per bibtex key.

    argv[1] is the path to the corpus data; the optional argv[2] restricts
    the export to a single bibtex key or component.  For each bibtex key a
    file ``concepts_with_counterparts_<key>.txt`` is written with one
    counterpart per row.
    """
    if len(argv) < 2:
        print("call: concepts_with_counterparts.py data_path [(bibtex_key|component)]")
        exit(1)

    cr = CorpusReaderWordlist(argv[1])
    print("Data loaded", file=sys.stderr)

    if len(argv) == 3:
        # Interpret the argument first as a bibtex key, then as a component.
        wordlistdata_ids = cr.wordlistdata_ids_for_bibtex_key(argv[2])
        if len(wordlistdata_ids) == 0:
            wordlistdata_ids = cr.wordlistdata_ids_for_component(argv[2])
            if len(wordlistdata_ids) == 0:
                print("did not find any dictionary data for the bibtex_key or component {0}.".format(argv[2]), file=sys.stderr)
                sys.exit(1)
    else:
        wordlistdata_ids = cr.wordlistdata_string_ids

    # Group wordlistdata ids by bibtex key; the key is the prefix of the
    # wordlistdata string before the first underscore.
    bibtex_keys = collections.defaultdict(list)
    for wid in wordlistdata_ids:
        wordlistdata_string = cr.wordlistdata_string_ids[wid]
        bibtex_key = wordlistdata_string.split("_")[0]
        bibtex_keys[bibtex_key].append(wid)

    for bibtex_key in bibtex_keys:

        print("Writing data for wordlistdata bibtex key {0}".format(bibtex_key), file=sys.stderr)

        # Context manager guarantees the file is closed even if writing fails.
        with codecs.open("concepts_with_counterparts_%s.txt" % bibtex_key, "w", "utf-8") as output:
            output.write("COUNTERPART\tCONCEPT\tLANGUAGE_BOOKNAME\tLANGUAGE_CODE\tBIBTEX_KEY\n")

            for wordlistdata_id in bibtex_keys[bibtex_key]:
                language_bookname = cr.get_language_bookname_for_wordlistdata_id(wordlistdata_id)
                language_code = cr.get_language_code_for_wordlistdata_id(wordlistdata_id)

                for concept, counterpart in cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id):
                    output.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(counterpart, concept, language_bookname, language_code, bibtex_key))
示例#2
0
def main(argv):
    """Export huber1992 concept/counterpart pairs to a tab-separated file.

    argv[1] is the path to the corpus data.  Writes one row per counterpart
    to ``counterparts_huber1992.txt``, including the language family looked
    up in the module-level ``families`` mapping.
    """
    if len(argv) < 2:
        print("call: counterparts_huber1992.py data_path")
        exit(1)

    cr = CorpusReaderWordlist(argv[1])

    # Context manager guarantees the file is closed even if writing fails.
    with codecs.open("counterparts_huber1992.txt", "w", "utf-8") as output:
        output.write("COUNTERPART\tCONCEPT\tLANGUAGE_BOOKNAME\tLANGUAGE_CODE\tFAMILY\tBIBTEX_KEY\n")

        # Consistency fix: the CorpusReaderWordlist API used elsewhere in
        # this project spells these names with "wordlistdata" (no extra
        # underscore): wordlistdata_ids_for_bibtex_key and
        # get_language_*_for_wordlistdata_id.
        for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key('huber1992'):
            language_bookname = cr.get_language_bookname_for_wordlistdata_id(wordlistdata_id)
            language_code = cr.get_language_code_for_wordlistdata_id(wordlistdata_id)
            # ``families`` is presumably a module-level dict mapping language
            # booknames to family names -- defined outside this view; confirm.
            family = families[language_bookname]

            for concept, counterpart in cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id):
                output.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (counterpart, concept, language_bookname, language_code, family, 'huber1992'))
示例#3
0
def main(argv):
    """Compare ngram/concept co-occurrence significance for Bora and Muinane.

    argv[1] is the path to the corpus data.  For each of the two languages a
    binary matrix (concepts x unique ngrams) marks which ngrams occur in the
    counterpart of each concept.  The per-pair significance scores are
    written to ``matrix_significance.txt``.
    """
    # log(k!) == gammaln(k + 1); this replaces the removed
    # scipy.misc.factorial and cannot overflow for large counts.
    from scipy.special import gammaln

    if len(argv) < 2:
        print("call: counterparts_huber1992.py data_path")
        exit(1)

    cr = CorpusReaderWordlist(argv[1])
    o = OrthographyParser(qlc.get_data("orthography_profiles/huber1992.txt"))

    # Lazily stream (wordlistdata_id, concept, counterpart) triples.
    wordlist_iterator = ( (wordlistdata_id, concept, counterpart)
        for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key('huber1992')
        for concept, counterpart in cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id)
    )

    wordlist = WordlistStoreWithNgrams(wordlist_iterator, o)

    matrix_dict = dict()

    for wordlistdata_id in wordlist.languages:

        language_bookname = cr.get_language_bookname_for_wordlistdata_id(wordlistdata_id)

        # Only the two languages being compared are needed.
        if language_bookname != "bora" and language_bookname != "muinane":
            continue

        print("Creating matrix for language {0}...".format(language_bookname))

        # matrix[i][j] == 1 iff ngram j occurs in the counterpart of concept i.
        matrix = numpy.zeros( (len(wordlist.concepts), len(wordlist.unique_ngrams)) )

        for i, concept in enumerate(wordlist.concepts):
            for j, n in enumerate(wordlist.unique_ngrams):
                if n in wordlist.counterpart_for_language_and_concept(wordlistdata_id, concept):
                    matrix[i][j] = 1

        matrix_dict[language_bookname] = matrix

    print('Begin comparison of two languages... Bora and Muninane!')
    print()  # bug fix: a bare ``print`` is a no-op in Python 3

    languages_tuples = [ ("bora", "muinane") ]

    for language1, language2 in languages_tuples:
        matrix1 = matrix_dict[language1]
        matrix2 = matrix_dict[language2]

        # Two example ngrams used for the spot-check printouts below.
        n1 = wordlist.unique_ngrams.index(('e', '#'))
        n2 = wordlist.unique_ngrams.index(('o', '#'))

        # Co-occurrence counts of every ngram pair across all concepts.
        matrix_cooccurrences = numpy.dot(numpy.transpose(matrix1), matrix2)

        # Per-ngram occurrence totals over all concepts.
        vector1 = numpy.sum(matrix1, 0)
        vector2 = numpy.sum(matrix2, 0)

        print(vector1[n1])
        print(vector2[n2])

        print(matrix_cooccurrences[n1][n2])

        # Expected co-occurrence counts under independence.
        matrix_expectations = numpy.outer(vector1, vector2) / len(wordlist.concepts)

        print(matrix_expectations[n1][n2])

        # Negative log of the Poisson likelihood of the observed counts.
        matrix_significance = matrix_expectations + \
                              gammaln(matrix_cooccurrences + 1) - \
                              matrix_cooccurrences * numpy.log(matrix_expectations)

        numpy.savetxt("matrix_significance.txt", matrix_significance)

        print(matrix_significance[n1][n2])
示例#4
0
File: matrix.py  Project: pombredanne/qlc
    # NOTE(review): fragment of a larger function from matrix.py -- the
    # enclosing ``def`` and the code after L181 are not visible here, so the
    # code below is left byte-identical and only annotated.
    source = sys.argv[1] # dictionary/wordlist source key
    output_dir = "output/"+source+"/"

    # get data from corpus reader
    cr = CorpusReaderWordlist("data/csv")          # real data
    # cr = CorpusReaderWordlist("data/testcorpus") # test data

    # initialize orthography parser for source
    o = OrthographyParser("data/orthography_profiles/"+source+".txt")
    # o = GraphemeParser() # or use the grapheme parser

    # create a generator of corpus reader data
    # Yields (wordlistdata_id, concept, counterpart) triples lazily.
    wordlist_iterator = ( (wordlistdata_id, concept, counterpart)
        for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key(source)
        for concept, counterpart in cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id)
    )

    # write the data to disk -- note it exhausts the generator, so either the generator
    # must be "regenerated" or run the following lines without the rest of the code below
    # move this into a method in the class
    """
    file = open(output_dir+source+"_data.txt", "w")    
    file.write("# wordlistdata_id"+"\t"+"language bookname"+"\t"+"concept"+"\t"+"counterpart"+"\n")
    for wordlistdata_id, concept, counterpart in wordlist_iterator:
        result = wordlistdata_id+"\t"+cr.get_language_bookname_for_wordlistdata_id(wordlistdata_id)+"\t"+concept+"\t"+counterpart+"\n"
        file.write(result)
    file.close()
    """

    # initialize matrix class