def main(argv):
    """Build an ngram-by-language count matrix for the huber1992 wordlist.

    Reads the counterpart forms of every huber1992 wordlist, parses them
    into graphemes, counts bigrams per language, and prints the combined
    language-by-ngram matrix.

    :param argv: command-line arguments; argv[1] is the data directory
        containing "csv" and "orthography_profiles" subdirectories.
    """
    if len(argv) < 2:
        print("call: counterparts_huber1992.py data_path")
        exit(1)

    cr = CorpusReaderWordlist(os.path.join(argv[1], "csv"))
    o = OrthographyParser(
        os.path.join(argv[1], "orthography_profiles", "huber1992.txt"))

    ngrams_by_language_count = []
    ngrams_set = set()

    for i, wordlistdata_id in enumerate(
            cr.wordlistdata_ids_for_bibtex_key('huber1992')):
        language_bookname = cr.get_language_bookname_for_wordlistdata_id(
            wordlistdata_id)

        # Lazily parse every counterpart of this wordlist into graphemes.
        counterpart_graphemes = (
            o.parse_string_to_graphemes(counterpart)
            for counterpart in
            cr.counterparts_for_wordlistdata_id(wordlistdata_id))

        matrix = qlc.ngram.words_ngrams_matrix_for_graphemes_list(
            counterpart_graphemes, 2)

        # Per-column totals: how often each bigram occurs in this language.
        # (Renamed from "sum", which shadowed the builtin.)
        col_sums = numpy.sum(matrix.matrix, 0)

        # Every column should have at least one non-zero entry; report
        # languages where that does not hold.
        if len(col_sums.nonzero()[0]) != matrix.number_of_columns:
            print("Error: ")
            # BUG FIX: the original referenced an undefined name "columns"
            # here (NameError); use the matrix's own column count.
            print("{0} != {1}".format(
                len(col_sums.nonzero()[0]), matrix.number_of_columns))
            print(language_bookname)

        ngrams_by_language_count.append(collections.defaultdict(int))
        for j, c in enumerate(matrix.column_names):
            ngrams_set.add(c)
            ngrams_by_language_count[i][c] = col_sums[j]

    # Stable, sorted ngram order shared by all languages.
    ngrams_list = sorted(ngrams_set)

    matrix = qlc.matrix.Matrix(ngrams_by_language_count, ngrams_list)
    for i in range(matrix.number_of_rows):
        for j, ngram in enumerate(ngrams_list):
            # defaultdict yields 0 for ngrams absent in a language.
            matrix.matrix[i][j] = ngrams_by_language_count[i][ngram]

    print(matrix.matrix)
def main(argv):
    """Export huber1992 counterparts as a tab-separated text file.

    Writes one row per (counterpart, concept) pair, with language and
    source metadata, to counterparts_huber1992.txt (UTF-8).

    :param argv: command-line arguments; argv[1] is the data directory.
    """
    if len(argv) < 2:
        print("call: counterparts_huber1992.py data_path")
        exit(1)

    cr = CorpusReaderWordlist(argv[1])

    output = codecs.open("counterparts_huber1992.txt", "w", "utf-8")
    output.write("COUNTERPART\tCONCEPT\tLANGUAGE_BOOKNAME\tLANGUAGE_CODE\tFAMILY\tBIBTEX_KEY\n")

    # CONSISTENCY FIX: every other script in this project uses the accessors
    # wordlistdata_ids_for_bibtex_key / get_*_for_wordlistdata_id; the
    # original called non-existent "wordlist_ids_for_bibtex_key" and
    # "get_*_for_wordlist_data_id" variants.
    for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key('huber1992'):
        language_bookname = cr.get_language_bookname_for_wordlistdata_id(
            wordlistdata_id)
        language_code = cr.get_language_code_for_wordlistdata_id(
            wordlistdata_id)

        # NOTE(review): "families" is a module-level mapping not visible in
        # this chunk -- presumably bookname -> language family; confirm it
        # is defined before this function runs.
        family = families[language_bookname]

        for concept, counterpart in \
                cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id):
            output.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (
                counterpart, concept, language_bookname, language_code,
                family, 'huber1992'))

    output.close()
def main(argv):
    """Dump concept/counterpart pairs, grouped per bibtex key, to text files.

    For each wordlist source (optionally restricted to one bibtex key or
    component given as argv[2]) writes a tab-separated file
    concepts_with_counterparts_<bibtex_key>.txt; empty output files are
    removed afterwards.

    :param argv: command-line arguments; argv[1] is the data directory,
        optional argv[2] is a bibtex key or component name.
    """
    if len(argv) < 2:
        print("call: concepts_with_counterparts.py data_path [(bibtex_key|component)]")
        exit(1)

    cr = CorpusReaderWordlist(argv[1])
    print("Data loaded", file=sys.stderr)

    # Resolve which wordlistdata ids to export. (The unused local
    # "dictdata_ids" from the original was removed.)
    if len(argv) == 3:
        wordlistdata_ids = cr.wordlistdata_ids_for_bibtex_key(argv[2])
        # Fall back to component lookup when the key matched nothing.
        if len(wordlistdata_ids) == 0:
            wordlistdata_ids = cr.wordlistdata_ids_for_component(argv[2])
            if len(wordlistdata_ids) == 0:
                print("did not find any dictionary data for the bibtex_key or component {0}.".format(argv[2]), file=sys.stderr)
                sys.exit(1)
    else:
        wordlistdata_ids = cr.wordlistdata_string_ids

    # Group wordlistdata ids by their bibtex key (the id string's prefix
    # before the first underscore).
    bibtex_keys = collections.defaultdict(list)
    for wid in wordlistdata_ids:
        wordlistdata_string = cr.wordlistdata_string_ids[wid]
        bibtex_key = wordlistdata_string.split("_")[0]
        bibtex_keys[bibtex_key].append(wid)

    for bibtex_key in bibtex_keys:
        print("Writing data for wordlistdata bibtex key {0}".format(bibtex_key), file=sys.stderr)

        output = codecs.open("concepts_with_counterparts_%s.txt" % bibtex_key, "w", "utf-8")
        output.write("COUNTERPART\tCONCEPT\tLANGUAGE_BOOKNAME\tLANGUAGE_CODE\tBIBTEX_KEY\n")

        for wordlistdata_id in bibtex_keys[bibtex_key]:
            language_bookname = cr.get_language_bookname_for_wordlistdata_id(wordlistdata_id)
            language_code = cr.get_language_code_for_wordlistdata_id(wordlistdata_id)

            for concept, counterpart in cr.data(wordlistdata_id):
                output.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(
                    counterpart, concept, language_bookname, language_code,
                    bibtex_key))

        output.close()

        # Remove files that ended up empty (no data for this key).
        if os.path.getsize("concepts_with_counterparts_%s.txt" % bibtex_key) == 0:
            os.remove("concepts_with_counterparts_%s.txt" % bibtex_key)
def main(argv):
    """Compare bigram distributions of two huber1992 languages.

    Builds a binary concept-by-ngram matrix for "bora" and "muinane",
    computes bigram co-occurrence counts, independence expectations, and a
    significance matrix, which is written to matrix_significance.txt.

    :param argv: command-line arguments; argv[1] is the data directory.
    """
    if len(argv) < 2:
        print("call: counterparts_huber1992.py data_path")
        exit(1)

    cr = CorpusReaderWordlist(argv[1])
    o = OrthographyParser(qlc.get_data("orthography_profiles/huber1992.txt"))

    wordlist_iterator = (
        (wordlistdata_id, concept, counterpart)
        for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key('huber1992')
        for concept, counterpart in
        cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id))

    wordlist = WordlistStoreWithNgrams(wordlist_iterator, o)

    matrix_dict = dict()
    for wordlistdata_id in wordlist.languages:
        language_bookname = cr.get_language_bookname_for_wordlistdata_id(wordlistdata_id)
        # Only the two languages being compared are materialized.
        if language_bookname != "bora" and language_bookname != "muinane":
            continue
        print("Creating matrix for language {0}...".format(language_bookname))

        # Binary concept-by-ngram matrix: 1 where the ngram occurs in the
        # language's counterpart for that concept.
        matrix = numpy.zeros((len(wordlist.concepts), len(wordlist.unique_ngrams)))
        for i, concept in enumerate(wordlist.concepts):
            for j, n in enumerate(wordlist.unique_ngrams):
                if n in wordlist.counterpart_for_language_and_concept(wordlistdata_id, concept):
                    matrix[i][j] = 1
        matrix_dict[language_bookname] = matrix

    # BUG FIX: the original string literal contained a raw line break inside
    # single quotes (a syntax error); it is now an explicit \n. A stray bare
    # "print" statement (a Python 2 leftover, a no-op expression in
    # Python 3) was also removed.
    print('Begin comparison of two languages... \nBora and Muninane!')

    languages_tuples = [("bora", "muinane")]

    # For each language pair, derive bigram-by-bigram statistics.
    for language1, language2 in languages_tuples:
        matrix1 = matrix_dict[language1]
        matrix2 = matrix_dict[language2]

        # Sample bigram indices used for the spot-check prints below.
        n1 = wordlist.unique_ngrams.index(('e', '#'))
        n2 = wordlist.unique_ngrams.index(('o', '#'))

        # Observed co-occurrence counts of bigram pairs across concepts.
        matrix_cooccurrences = numpy.dot(numpy.transpose(matrix1), matrix2)

        # Per-bigram totals within each language.
        vector1 = numpy.sum(matrix1, 0)
        vector2 = numpy.sum(matrix2, 0)
        print(vector1[n1])
        print(vector2[n2])
        print(matrix_cooccurrences[n1][n2])

        # Expected co-occurrences under independence.
        matrix_expectations = numpy.outer(vector1, vector2) / len(wordlist.concepts)
        print(matrix_expectations[n1][n2])

        # NOTE(review): scipy.misc.factorial was deprecated and then removed
        # from SciPy (use scipy.special.factorial, or
        # scipy.special.gammaln(x + 1) for log-factorials). Confirm the
        # pinned SciPy version before changing this call.
        matrix_significance = matrix_expectations + \
            numpy.log(scipy.misc.factorial(matrix_cooccurrences)) - \
            matrix_cooccurrences * numpy.log(matrix_expectations)
        numpy.savetxt("matrix_significance.txt", matrix_significance)
        print(matrix_significance[n1][n2])
if __name__ == "__main__":
    import sys
    from qlc.corpusreader import CorpusReaderWordlist
    from qlc.orthography import OrthographyParser, GraphemeParser
    from scipy.io import mmread, mmwrite  # write sparse matrices

    if len(sys.argv) != 2:
        print("call: python matrix.py source\n")
        print("python matrix.py huber1992\n")
        # BUG FIX: the original fell through after printing usage and then
        # crashed on sys.argv[1]; exit explicitly instead.
        sys.exit(1)

    source = sys.argv[1]  # dictionary/wordlist source key
    output_dir = "output/" + source + "/"

    # get data from corpus reader
    cr = CorpusReaderWordlist("data/csv")  # real data
    # cr = CorpusReaderWordlist("data/testcorpus")  # test data

    # initialize orthography parser for source
    o = OrthographyParser("data/orthography_profiles/" + source + ".txt")
    # o = GraphemeParser()  # or use the grapheme parser

    # create a generator of corpus reader data
    wordlist_iterator = (
        (wordlistdata_id, concept, counterpart)
        for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key(source)
        for concept, counterpart in
        cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id))

    # write the data to disk -- note it exhausts the generator, so either
    # the generator must be "regenerated" or run the following lines
    # without the rest of the code below
    # move this into a method in the class
# Collects every counterpart the orthography parser could not handle.
unparsables = open("unparsables.txt", "w")


def report_unparsables(wordlistdata_id, concept, counterpart, parsed_counterpart_tuple):
    """Append one tab-separated record describing a failed parse.

    :param parsed_counterpart_tuple: parser result; element [1] is expected
        to hold the invalid parse string.
    """
    invalid_parse_string = parsed_counterpart_tuple[1]
    error = wordlistdata_id + "\t" + concept + "\t" + counterpart + "\t" + invalid_parse_string
    # BUG FIX: terminate each record with a newline; the original wrote all
    # unparsable entries onto a single run-together line.
    unparsables.write(error + "\n")


if len(sys.argv) != 2:
    print("call: python parse_counterparts.py bibtex_key_source\n")
    # BUG FIX: exit after printing usage instead of crashing on sys.argv[1].
    sys.exit(1)

source = sys.argv[1]

# cr = CorpusReaderWordlist("data/testcorpus")
cr = CorpusReaderWordlist("data/csv")
o = OrthographyParser("data/orthography_profiles/" + source + ".txt")
rules = OrthographyRulesParser("data/orthography_profiles/" + "rules_" + source + ".txt")

# create a generator of corpus reader data
wordlist_iterator = (
    (wordlistdata_id, concept, counterpart)
    for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key(source)
    for concept, counterpart in
    cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id))

# print header
print("wordlist_id"+"\t"+"language_book_name"+"\t"+"concept"+"\t"+"counterpart"+"\t"+"graphemic_parse"+"\t"+"ipa_parse"+"\t"+"orthographic_rules_parse")
import sys from qlc.corpusreader import CorpusReaderWordlist from qlc.orthography import OrthographyParser, GraphemeParser from qlc.matrix import WordlistStoreWithNgrams from scipy.io import mmread, mmwrite # write sparse matrices from scipy.sparse import csr_matrix, lil_matrix, coo_matrix if len(sys.argv) != 2: print("call: python matrix.py source\n") print("python matrix.py huber1992\n") source = sys.argv[1] # dictionary/wordlist source key output_dir = source+"/" # get data from corpus reader cr = CorpusReaderWordlist("data/csv") # real data # cr = CorpusReaderWordlist("data/testcorpus") # test data # initialize orthography parser for source o = OrthographyParser("data/orthography_profiles/"+source+".txt") # o = GraphemeParser() # create a generator of corpus reader data wordlist_iterator = ( (wordlistdata_id, concept, counterpart) for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key(source) for concept, counterpart in cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id) ) """ # print all the things! for wordlistdata_id, concept, counterpart in wordlist_iterator: