Example #1
File: ngram.py  Project: pombredanne/qlc
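
For each language in the huber1992 source, this example counts all bigrams over the graphemes of its counterparts, then assembles the per-language counts into a single languages-by-ngrams matrix.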
import collections
import os
import sys

import numpy

import qlc.matrix
import qlc.ngram
# The module paths for these two classes are assumed from the qlc package layout:
from qlc.corpusreader import CorpusReaderWordlist
from qlc.orthography import OrthographyParser

def main(argv):

    if len(argv) < 2:
        print("call: ngram.py data_path")
        sys.exit(1)

    cr = CorpusReaderWordlist(os.path.join(argv[1], "csv"))
    o = OrthographyParser(os.path.join(argv[1], "orthography_profiles", "huber1992.txt"))
    
    ngrams_by_language_count = []
    ngrams_set = set()
    
    for i, wordlistdata_id in enumerate(cr.wordlistdata_ids_for_bibtex_key('huber1992')):
        language_bookname = cr.get_language_bookname_for_wordlistdata_id(wordlistdata_id)
        language_code = cr.get_language_code_for_wordlistdata_id(wordlistdata_id)

        # Parse each counterpart string into its graphemes (lazily, as a generator).
        counterpart_graphemes = (
            o.parse_string_to_graphemes(counterpart)
            for counterpart in cr.counterparts_for_wordlistdata_id(wordlistdata_id))

        # Build a words-by-bigrams count matrix for this language.
        matrix = qlc.ngram.words_ngrams_matrix_for_graphemes_list(counterpart_graphemes, 2)

        # Column sums: total count of each bigram across all counterparts.
        # (Renamed from "sum" to avoid shadowing the builtin.)
        column_sums = numpy.sum(matrix.matrix, 0)

        # Every bigram column should have at least one non-zero entry.
        if len(column_sums.nonzero()[0]) != matrix.number_of_columns:
            print("Error: ")
            print("{0} != {1}".format(len(column_sums.nonzero()[0]), matrix.number_of_columns))
            print(language_bookname)

        ngrams_by_language_count.append(collections.defaultdict(int))
        for j, c in enumerate(matrix.column_names):
            ngrams_set.add(c)
            ngrams_by_language_count[i][c] = column_sums[j]

    # Assemble the final languages-by-ngrams count matrix.
    ngrams_list = sorted(ngrams_set)
    matrix = qlc.matrix.Matrix(ngrams_by_language_count, ngrams_list)
    
    for i in range(matrix.number_of_rows):
        for j, ngram in enumerate(ngrams_list):
            matrix.matrix[i][j] = ngrams_by_language_count[i][ngram]
            
    print(matrix.matrix)
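
A minimal sketch of how this example would be run from the command line (the __main__ guard is an assumption, not part of the original file; the data_path argument is expected to contain csv/ and orthography_profiles/ subdirectories, as the code above shows):

if __name__ == "__main__":
    main(sys.argv)  # e.g. python ngram.py data_path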
Example #2

import codecs
import collections
import os
import sys

# Module path assumed from the qlc package layout:
from qlc.corpusreader import CorpusReaderWordlist

def main(argv):

    if len(argv) < 2:
        print("call: concepts_with_counterparts.py data_path [(bibtex_key|component)]")
        sys.exit(1)

    cr = CorpusReaderWordlist(argv[1])
    print("Data loaded", file=sys.stderr)
    
    # Select wordlist data: by bibtex_key or component if given, otherwise all of it.
    if len(argv) == 3:
        wordlistdata_ids = cr.wordlistdata_ids_for_bibtex_key(argv[2])
        if len(wordlistdata_ids) == 0:
            wordlistdata_ids = cr.wordlistdata_ids_for_component(argv[2])
            if len(wordlistdata_ids) == 0:
                print("did not find any wordlist data for the bibtex_key or component {0}.".format(argv[2]), file=sys.stderr)
                sys.exit(1)
    else:
        wordlistdata_ids = cr.wordlistdata_string_ids
        
    # Group the wordlist data IDs by bibtex key (the prefix of the string ID).
    bibtex_keys = collections.defaultdict(list)
    for wid in wordlistdata_ids:
        wordlistdata_string = cr.wordlistdata_string_ids[wid]
        bibtex_key = wordlistdata_string.split("_")[0]
        bibtex_keys[bibtex_key].append(wid)
        
    for bibtex_key in bibtex_keys:

        print("Writing data for wordlistdata bibtex key {0}".format(bibtex_key), file=sys.stderr)

        filename = "concepts_with_counterparts_%s.txt" % bibtex_key
        output = codecs.open(filename, "w", "utf-8")
        output.write("COUNTERPART\tCONCEPT\tLANGUAGE_BOOKNAME\tLANGUAGE_CODE\tBIBTEX_KEY\n")

        rows_written = 0
        for wordlistdata_id in bibtex_keys[bibtex_key]:
            language_bookname = cr.get_language_bookname_for_wordlistdata_id(wordlistdata_id)
            language_code = cr.get_language_code_for_wordlistdata_id(wordlistdata_id)

            for concept, counterpart in cr.data(wordlistdata_id):
                output.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(
                    counterpart, concept, language_bookname, language_code, bibtex_key))
                rows_written += 1

        output.close()

        # The header line is always written, so the file is never empty on disk;
        # remove files that contain no data rows beyond the header.
        if rows_written == 0:
            os.remove(filename)
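
As with the first example, a minimal invocation sketch (the __main__ guard is an assumption; the two argument forms follow the usage message in the code above):

if __name__ == "__main__":
    # e.g. python concepts_with_counterparts.py data_path
    # or   python concepts_with_counterparts.py data_path huber1992
    main(sys.argv)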