def main(argv):
    """Build an ngram-by-language count matrix for the huber1992 wordlist.

    Reads the counterpart forms of every huber1992 wordlist, parses them
    into graphemes, counts bigrams per language, and prints the combined
    language-by-ngram matrix.

    :param argv: command-line arguments; argv[1] is the data directory
        containing "csv" and "orthography_profiles" subdirectories.
    """
    if len(argv) < 2:
        print("call: counterparts_huber1992.py data_path")
        exit(1)

    cr = CorpusReaderWordlist(os.path.join(argv[1], "csv"))
    o = OrthographyParser(
        os.path.join(argv[1], "orthography_profiles", "huber1992.txt"))

    ngrams_by_language_count = []
    ngrams_set = set()

    for i, wordlistdata_id in enumerate(
            cr.wordlistdata_ids_for_bibtex_key('huber1992')):
        language_bookname = cr.get_language_bookname_for_wordlistdata_id(
            wordlistdata_id)

        # Lazily parse every counterpart of this wordlist into graphemes.
        counterpart_graphemes = (
            o.parse_string_to_graphemes(counterpart)
            for counterpart in
            cr.counterparts_for_wordlistdata_id(wordlistdata_id))

        matrix = qlc.ngram.words_ngrams_matrix_for_graphemes_list(
            counterpart_graphemes, 2)

        # Per-column totals: how often each bigram occurs in this language.
        # (Renamed from "sum", which shadowed the builtin.)
        col_sums = numpy.sum(matrix.matrix, 0)

        # Every column should have at least one non-zero entry; report
        # languages where that does not hold.
        if len(col_sums.nonzero()[0]) != matrix.number_of_columns:
            print("Error: ")
            # BUG FIX: the original referenced an undefined name "columns"
            # here (NameError); use the matrix's own column count.
            print("{0} != {1}".format(
                len(col_sums.nonzero()[0]), matrix.number_of_columns))
            print(language_bookname)

        ngrams_by_language_count.append(collections.defaultdict(int))
        for j, c in enumerate(matrix.column_names):
            ngrams_set.add(c)
            ngrams_by_language_count[i][c] = col_sums[j]

    # Stable, sorted ngram order shared by all languages.
    ngrams_list = sorted(ngrams_set)

    matrix = qlc.matrix.Matrix(ngrams_by_language_count, ngrams_list)
    for i in range(matrix.number_of_rows):
        for j, ngram in enumerate(ngrams_list):
            # defaultdict yields 0 for ngrams absent in a language.
            matrix.matrix[i][j] = ngrams_by_language_count[i][ngram]

    print(matrix.matrix)
def main(argv):
    """Export huber1992 counterparts as a tab-separated text file.

    Writes one row per (counterpart, concept) pair, with language and
    source metadata, to counterparts_huber1992.txt (UTF-8).

    :param argv: command-line arguments; argv[1] is the data directory.
    """
    if len(argv) < 2:
        print("call: counterparts_huber1992.py data_path")
        exit(1)

    cr = CorpusReaderWordlist(argv[1])

    output = codecs.open("counterparts_huber1992.txt", "w", "utf-8")
    output.write("COUNTERPART\tCONCEPT\tLANGUAGE_BOOKNAME\tLANGUAGE_CODE\tFAMILY\tBIBTEX_KEY\n")

    # CONSISTENCY FIX: every other script in this project uses the accessors
    # wordlistdata_ids_for_bibtex_key / get_*_for_wordlistdata_id; the
    # original called non-existent "wordlist_ids_for_bibtex_key" and
    # "get_*_for_wordlist_data_id" variants.
    for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key('huber1992'):
        language_bookname = cr.get_language_bookname_for_wordlistdata_id(
            wordlistdata_id)
        language_code = cr.get_language_code_for_wordlistdata_id(
            wordlistdata_id)

        # NOTE(review): "families" is a module-level mapping not visible in
        # this chunk -- presumably bookname -> language family; confirm it
        # is defined before this function runs.
        family = families[language_bookname]

        for concept, counterpart in \
                cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id):
            output.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (
                counterpart, concept, language_bookname, language_code,
                family, 'huber1992'))

    output.close()
def main(argv):
    """Dump concept/counterpart pairs, grouped per bibtex key, to text files.

    For each wordlist source (optionally restricted to one bibtex key or
    component given as argv[2]) writes a tab-separated file
    concepts_with_counterparts_<bibtex_key>.txt; empty output files are
    removed afterwards.

    :param argv: command-line arguments; argv[1] is the data directory,
        optional argv[2] is a bibtex key or component name.
    """
    if len(argv) < 2:
        print("call: concepts_with_counterparts.py data_path [(bibtex_key|component)]")
        exit(1)

    cr = CorpusReaderWordlist(argv[1])
    print("Data loaded", file=sys.stderr)

    # Resolve which wordlistdata ids to export. (The unused local
    # "dictdata_ids" from the original was removed.)
    if len(argv) == 3:
        wordlistdata_ids = cr.wordlistdata_ids_for_bibtex_key(argv[2])
        # Fall back to component lookup when the key matched nothing.
        if len(wordlistdata_ids) == 0:
            wordlistdata_ids = cr.wordlistdata_ids_for_component(argv[2])
            if len(wordlistdata_ids) == 0:
                print("did not find any dictionary data for the bibtex_key or component {0}.".format(argv[2]), file=sys.stderr)
                sys.exit(1)
    else:
        wordlistdata_ids = cr.wordlistdata_string_ids

    # Group wordlistdata ids by their bibtex key (the id string's prefix
    # before the first underscore).
    bibtex_keys = collections.defaultdict(list)
    for wid in wordlistdata_ids:
        wordlistdata_string = cr.wordlistdata_string_ids[wid]
        bibtex_key = wordlistdata_string.split("_")[0]
        bibtex_keys[bibtex_key].append(wid)

    for bibtex_key in bibtex_keys:
        print("Writing data for wordlistdata bibtex key {0}".format(bibtex_key), file=sys.stderr)

        output = codecs.open("concepts_with_counterparts_%s.txt" % bibtex_key, "w", "utf-8")
        output.write("COUNTERPART\tCONCEPT\tLANGUAGE_BOOKNAME\tLANGUAGE_CODE\tBIBTEX_KEY\n")

        for wordlistdata_id in bibtex_keys[bibtex_key]:
            language_bookname = cr.get_language_bookname_for_wordlistdata_id(wordlistdata_id)
            language_code = cr.get_language_code_for_wordlistdata_id(wordlistdata_id)

            for concept, counterpart in cr.data(wordlistdata_id):
                output.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(
                    counterpart, concept, language_bookname, language_code,
                    bibtex_key))

        output.close()

        # Remove files that ended up empty (no data for this key).
        if os.path.getsize("concepts_with_counterparts_%s.txt" % bibtex_key) == 0:
            os.remove("concepts_with_counterparts_%s.txt" % bibtex_key)
def main(argv):
    """Compare bigram distributions of two huber1992 languages.

    Builds a binary concept-by-ngram matrix for "bora" and "muinane",
    computes bigram co-occurrence counts, independence expectations, and a
    significance matrix, which is written to matrix_significance.txt.

    :param argv: command-line arguments; argv[1] is the data directory.
    """
    if len(argv) < 2:
        print("call: counterparts_huber1992.py data_path")
        exit(1)

    cr = CorpusReaderWordlist(argv[1])
    o = OrthographyParser(qlc.get_data("orthography_profiles/huber1992.txt"))

    wordlist_iterator = (
        (wordlistdata_id, concept, counterpart)
        for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key('huber1992')
        for concept, counterpart in
        cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id))

    wordlist = WordlistStoreWithNgrams(wordlist_iterator, o)

    matrix_dict = dict()
    for wordlistdata_id in wordlist.languages:
        language_bookname = cr.get_language_bookname_for_wordlistdata_id(wordlistdata_id)
        # Only the two languages being compared are materialized.
        if language_bookname != "bora" and language_bookname != "muinane":
            continue
        print("Creating matrix for language {0}...".format(language_bookname))

        # Binary concept-by-ngram matrix: 1 where the ngram occurs in the
        # language's counterpart for that concept.
        matrix = numpy.zeros((len(wordlist.concepts), len(wordlist.unique_ngrams)))
        for i, concept in enumerate(wordlist.concepts):
            for j, n in enumerate(wordlist.unique_ngrams):
                if n in wordlist.counterpart_for_language_and_concept(wordlistdata_id, concept):
                    matrix[i][j] = 1
        matrix_dict[language_bookname] = matrix

    # BUG FIX: the original string literal contained a raw line break inside
    # single quotes (a syntax error); it is now an explicit \n. A stray bare
    # "print" statement (a Python 2 leftover, a no-op expression in
    # Python 3) was also removed.
    print('Begin comparison of two languages... \nBora and Muninane!')

    languages_tuples = [("bora", "muinane")]

    # For each language pair, derive bigram-by-bigram statistics.
    for language1, language2 in languages_tuples:
        matrix1 = matrix_dict[language1]
        matrix2 = matrix_dict[language2]

        # Sample bigram indices used for the spot-check prints below.
        n1 = wordlist.unique_ngrams.index(('e', '#'))
        n2 = wordlist.unique_ngrams.index(('o', '#'))

        # Observed co-occurrence counts of bigram pairs across concepts.
        matrix_cooccurrences = numpy.dot(numpy.transpose(matrix1), matrix2)

        # Per-bigram totals within each language.
        vector1 = numpy.sum(matrix1, 0)
        vector2 = numpy.sum(matrix2, 0)
        print(vector1[n1])
        print(vector2[n2])
        print(matrix_cooccurrences[n1][n2])

        # Expected co-occurrences under independence.
        matrix_expectations = numpy.outer(vector1, vector2) / len(wordlist.concepts)
        print(matrix_expectations[n1][n2])

        # NOTE(review): scipy.misc.factorial was deprecated and then removed
        # from SciPy (use scipy.special.factorial, or
        # scipy.special.gammaln(x + 1) for log-factorials). Confirm the
        # pinned SciPy version before changing this call.
        matrix_significance = matrix_expectations + \
            numpy.log(scipy.misc.factorial(matrix_cooccurrences)) - \
            matrix_cooccurrences * numpy.log(matrix_expectations)
        numpy.savetxt("matrix_significance.txt", matrix_significance)
        print(matrix_significance[n1][n2])
if __name__ == "__main__":
    import sys
    from qlc.corpusreader import CorpusReaderWordlist
    from qlc.orthography import OrthographyParser, GraphemeParser
    from scipy.io import mmread, mmwrite  # write sparse matrices

    if len(sys.argv) != 2:
        print("call: python matrix.py source\n")
        print("python matrix.py huber1992\n")
        # BUG FIX: the original fell through after printing usage and then
        # crashed on sys.argv[1]; exit explicitly instead.
        sys.exit(1)

    source = sys.argv[1]  # dictionary/wordlist source key
    output_dir = "output/" + source + "/"

    # get data from corpus reader
    cr = CorpusReaderWordlist("data/csv")  # real data
    # cr = CorpusReaderWordlist("data/testcorpus")  # test data

    # initialize orthography parser for source
    o = OrthographyParser("data/orthography_profiles/" + source + ".txt")
    # o = GraphemeParser()  # or use the grapheme parser

    # create a generator of corpus reader data
    wordlist_iterator = (
        (wordlistdata_id, concept, counterpart)
        for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key(source)
        for concept, counterpart in
        cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id))

    # write the data to disk -- note it exhausts the generator, so either
    # the generator must be "regenerated" or run the following lines
    # without the rest of the code below
    # move this into a method in the class
# Collects every counterpart the orthography parser could not handle.
unparsables = open("unparsables.txt", "w")


def report_unparsables(wordlistdata_id, concept, counterpart, parsed_counterpart_tuple):
    """Append one tab-separated record describing a failed parse.

    :param parsed_counterpart_tuple: parser result; element [1] is expected
        to hold the invalid parse string.
    """
    invalid_parse_string = parsed_counterpart_tuple[1]
    error = wordlistdata_id + "\t" + concept + "\t" + counterpart + "\t" + invalid_parse_string
    # BUG FIX: terminate each record with a newline; the original wrote all
    # unparsable entries onto a single run-together line.
    unparsables.write(error + "\n")


if len(sys.argv) != 2:
    print("call: python parse_counterparts.py bibtex_key_source\n")
    # BUG FIX: exit after printing usage instead of crashing on sys.argv[1].
    sys.exit(1)

source = sys.argv[1]

# cr = CorpusReaderWordlist("data/testcorpus")
cr = CorpusReaderWordlist("data/csv")
o = OrthographyParser("data/orthography_profiles/" + source + ".txt")
rules = OrthographyRulesParser("data/orthography_profiles/" + "rules_" + source + ".txt")

# create a generator of corpus reader data
wordlist_iterator = (
    (wordlistdata_id, concept, counterpart)
    for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key(source)
    for concept, counterpart in
    cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id))

# print header
print("wordlist_id"+"\t"+"language_book_name"+"\t"+"concept"+"\t"+"counterpart"+"\t"+"graphemic_parse"+"\t"+"ipa_parse"+"\t"+"orthographic_rules_parse")
import sys from qlc.corpusreader import CorpusReaderWordlist from qlc.orthography import OrthographyParser, GraphemeParser from qlc.matrix import WordlistStoreWithNgrams from scipy.io import mmread, mmwrite # write sparse matrices from scipy.sparse import csr_matrix, lil_matrix, coo_matrix if len(sys.argv) != 2: print("call: python matrix.py source\n") print("python matrix.py huber1992\n") source = sys.argv[1] # dictionary/wordlist source key output_dir = source+"/" # get data from corpus reader cr = CorpusReaderWordlist("data/csv") # real data # cr = CorpusReaderWordlist("data/testcorpus") # test data # initialize orthography parser for source o = OrthographyParser("data/orthography_profiles/"+source+".txt") # o = GraphemeParser() # create a generator of corpus reader data wordlist_iterator = ( (wordlistdata_id, concept, counterpart) for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key(source) for concept, counterpart in cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id) ) """ # print all the things! for wordlistdata_id, concept, counterpart in wordlist_iterator: