class Tokenizer: """ takes as input a file with the QLC format: counterpart \t concept \t language and does things like - tokenizes the file into LINGPY format - tokenizes the data into ortographically parsed QLC format - locates unicorns """ def __init__(self): # deal with configuration file # configparser.read(default.cfg) cfg = SafeConfigParser() cfg.read("default.cfg") data = cfg.get("Paths", "data") orthography_profile = cfg.get("Paths", "orthography_profile") # set variables, e.g. source, orthography parser, etc. self.data = open(data, "r") self.o = OrthographyParser(orthography_profile) # self.o = GraphemeParser() self._languages = collections.defaultdict(int) # given unique ID to each unique language name self._concepts = collections.defaultdict(int) # ... self._counterparts = collections.defaultdict(int) # .. self._wordlist_iterator = self._process_input(self.data) # print(type(self.iterator)) # print(len(self.counterparts)) # words = self.get_qlc_tokenized_words() """ count = 0 for line in words: if line != "": print(line) count += 1 print(count) """ """ self.cr = CorpusReaderWordlist("data/csv") self.wordlist_iterator = ( (wordlistdata_id, concept, counterpart) for wordlistdata_id in self.cr.wordlistdata_ids_for_bibtex_key(source) for concept, counterpart in self.cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id) ) """ def _process_input(self, file): languages_id = 1 concepts_id = 1 counterparts_id = 1 header = file.readline() lines = [] for line in file: line = line.strip() line = line.replace(" ", " ") counterpart, concept, language = line.split("\t") result = (counterpart, concept, language) lines.append(result) if language not in self._languages: self._languages[language] = languages_id languages_id += 1 if concept not in self._concepts: self._concepts[concept] = concepts_id concepts_id += 1 if counterpart not in self._counterparts: counterparts_id += 1 self._counterparts[counterpart] = counterparts_id return ((concept, counterpart, language) for concept, counterpart, language in lines) def get_qlc_tokenized_words(self): unparasables = open("unparsables.txt", "w") tokenized_words = [] for counterpart, concept, language in self._wordlist_iterator: counterpart = unicodedata.normalize("NFD", counterpart) grapheme_parsed_counterpart_tuple = self.o.parse_string_to_graphemes_string(counterpart) if grapheme_parsed_counterpart_tuple[0] == False: unparsables.write(grapheme_parsed_counterpart_tuple[1]) continue grapheme_parse = grapheme_parsed_counterpart_tuple[1] tokenized_words.append(grapheme_parse) return tokenized_words def get_ipa_tokenized_words(self): tokenized_words = [] words = get_list_qlc_tokenized_words() for word in words: grapheme_parsed_counterpart_tuple = self.o.parse_string_to_graphemes_string(counterpart) def lingpy_output(self): row_id = 1 # given some data set from the corpusreader or somewhere else, output a lingpy format print("ID"+"\t"+"Taxa"+"\t"+"TaxonID"+"\t"+"Gloss"+"\t"+"GlossID"+"\t"+"IPA"+"\t"+"Orthography") # print("# LANGUAGE"+"\t"+"CONCEPT"+"\t"+"COUNTERPART"+"\t"+"ORTHO_PARSE") for counterpart, concept, language in self._wordlist_iterator: # counterpart, concept, language in self._wordlist_iterator: # skip for Mattis if counterpart == "?" or counterpart == "NONE": continue grapheme_parsed_counterpart_tuple = self.o.parse_string_to_graphemes_string(counterpart) if grapheme_parsed_counterpart_tuple[0] == False: continue ortho_parse = grapheme_parsed_counterpart_tuple[1] print(str(row_id)+"\t"+language+"\t"+str(self._languages[language])+"\t"+concept+"\t"+str(self._concepts[concept])+"\t"+counterpart+"\t"+grapheme_parsed_counterpart_tuple[1]) # print(language+"\t"+concept+"\t"+counterpart+"\t"+grapheme_parsed_counterpart_tuple[1]) row_id += 1 def matrix_output(self): # produce Jelena style output format with matrix pass def qlc_output_format(self): # produce counterpart \t concept \t language QLC output format print("COUNTERPART"+"\t"+"ORTHO_PARSE"+"\t"+"CONCEPT"+"\t"+"LANGUAGE") for counterpart, concept, language in self._wordlist_iterator: if counterpart == "?" or counterpart == "NONE": print(counterpart+"\t"+counterpart+"\t"+concept+"\t"+language) continue grapheme_parsed_counterpart_tuple = self.o.parse_string_to_graphemes_string(counterpart) # skip shit that doesn't parse if grapheme_parsed_counterpart_tuple[0] == False: continue ortho_parse = grapheme_parsed_counterpart_tuple[1] print(counterpart+"\t"+ortho_parse+"\t"+concept+"\t"+language)
wordlist_iterator = ( (wordlistdata_id, concept, counterpart) for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key(source) for concept, counterpart in cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id) ) # print header print("wordlist_id"+"\t"+"language_book_name"+"\t"+"concept"+"\t"+"counterpart"+"\t"+"graphemic_parse"+"\t"+"ipa_parse"+"\t"+"orthographic_rules_parse") err_count = 0 errors = "" # print all the things! for wordlistdata_id, concept, counterpart in wordlist_iterator: # counterpart = unicodedata.normalize("NFD", counterpart) grapheme_parsed_counterpart_tuple = o.parse_string_to_graphemes_string(counterpart) phoneme_parsed_counterpart_tuple = o.parse_string_to_ipa_string(counterpart) if grapheme_parsed_counterpart_tuple[0] == False: report_unparsables(wordlistdata_id, concept, counterpart, grapheme_parsed_counterpart_tuple) continue if phoneme_parsed_counterpart_tuple[0] == False: report_unparsables(wordlistdata_id, concept, counterpart, phoneme_parsed_counterpart_tuple) continue grapheme_parse = grapheme_parsed_counterpart_tuple[1] phoneme_parse = phoneme_parsed_counterpart_tuple[1] rule_parsed_grapheme_parse = rules.parse_string(grapheme_parse) print(wordlistdata_id+"\t"+cr.get_language_bookname_for_wordlistdata_id(wordlistdata_id)+"\t"+concept+"\t"+counterpart+"\t"+grapheme_parse+"\t"+phoneme_parse+"\t"+rule_parsed_grapheme_parse)