def autoComplete2(lemma1, etTag1, lemma2, etTag2, corpusWords):
    """Propose for lemma2 the edit trees that only lemma1's paradigm has.

    Each (edit tree, form) pair that is in ``etTag1`` but not in ``etTag2``
    is applied to ``lemma2``. A pair is accepted when the generated word
    occurs more than 3 times in ``corpusWords``. The whole proposal is
    abandoned when a tree yields '#error#' or as soon as a second generated
    word is missing from the corpus or not frequent enough.

    On success the accepted pairs are merged into the module-level
    ``toAdd[lemma2]`` and every generated word is recorded in the
    module-level ``uniquenessCheck[(lemma2, form)]``.

    ``lemma1`` is not used by this function. Nothing is returned.
    """
    accepted = set()
    misses = 0

    for tree, tag in etTag1.difference(etTag2):
        candidate = applyOnlyTree(lemma2, tree)
        if candidate == '#error#':
            # The tree cannot be applied to this lemma: give up entirely.
            return

        if candidate in corpusWords and corpusWords[candidate] > 3:
            accepted.add((tree, tag))
            continue

        # Unattested (or too rare) form; one such miss is tolerated.
        misses += 1
        if misses == 2:
            return

    if not accepted:
        return

    # Rebind to a fresh set (union) rather than mutating the stored one.
    toAdd[lemma2] = toAdd.get(lemma2, set()).union(accepted)
    for tree, tag in accepted:
        generated = applyOnlyTree(lemma2, tree)
        uniquenessCheck.setdefault((lemma2, tag), set()).add(generated)
def autoComplete(lemma1, etTag1, lemma2, etTag2, corpusWords):
    """Complete two paradigms from each other, but only if both sides fit.

    The pair is first registered in the module-level ``doneLemmas`` in both
    orders. Then the (edit tree, form) pairs that only lemma2 has are
    applied to lemma1, and the pairs that only lemma1 has are applied to
    lemma2. A pair is accepted when the generated word occurs more than
    3 times in ``corpusWords``.

    A pass is marked as failed when a tree yields '#error#' or when the
    miss counter reaches exactly 1. The counter is shared by both passes,
    so after a miss in the first pass a miss in the second pass no longer
    stops that loop; the overall result is rejected anyway because the
    first pass has already failed.

    Only when both passes succeed are the accepted pairs merged into the
    module-level ``toAdd`` and the generated words recorded in the
    module-level ``uniquenessCheck``. Nothing is returned.
    """
    doneLemmas.add((lemma1, lemma2))
    doneLemmas.add((lemma2, lemma1))

    misses = 0  # deliberately shared by both passes
    first_ok = True
    second_ok = True
    for_lemma1 = set()
    for_lemma2 = set()

    # Pass 1: trees known only for lemma2, tried on lemma1.
    for tree, tag in etTag2.difference(etTag1):
        candidate = applyOnlyTree(lemma1, tree)
        if candidate == '#error#':
            first_ok = False
            break
        if candidate in corpusWords and corpusWords[candidate] > 3:
            for_lemma1.add((tree, tag))
            continue
        misses += 1
        if misses == 1:
            first_ok = False
            break

    # Pass 2: trees known only for lemma1, tried on lemma2.
    for tree, tag in etTag1.difference(etTag2):
        candidate = applyOnlyTree(lemma2, tree)
        if candidate == '#error#':
            second_ok = False
            break
        if candidate in corpusWords and corpusWords[candidate] > 3:
            for_lemma2.add((tree, tag))
            continue
        misses += 1
        if misses == 1:
            second_ok = False
            break

    if not (first_ok and second_ok):
        return

    # Record the additions, lemma1 first and lemma2 second.
    for lemma, additions in ((lemma1, for_lemma1), (lemma2, for_lemma2)):
        if not additions:
            continue
        # Rebind to a fresh set (union) rather than mutating the stored one.
        toAdd[lemma] = toAdd.get(lemma, set()).union(additions)
        for tree, tag in additions:
            generated = applyOnlyTree(lemma, tree)
            uniquenessCheck.setdefault((lemma, tag), set()).add(generated)
def rightFormInEtSet(self, lemma, form, solution):
    """Tell whether some edit tree stored for ``form`` maps ``lemma`` to ``solution``.

    Returns False when ``form`` has no entry in ``self.formToEt``; otherwise
    returns True if at least one tree in ``self.formToEt[form]``, applied to
    ``lemma`` and converted with ``unicode``, equals ``solution``.
    """
    if form not in self.formToEt:
        return False

    # Build the full list first so every tree is applied, exactly like a
    # plain loop without an early exit.
    hits = [unicode(applyOnlyTree(lemma, tree)) == solution
            for tree in self.formToEt[form]]
    return any(hits)
def correctResult(self, lemma, form, word):
    """Try to repair ``word`` using the edit trees known for ``form``.

    If ``form`` has no entry in ``self.formToEt`` the word cannot be
    corrected and is returned unchanged. Otherwise the trees in
    ``self.formToEt[form]`` are applied to ``lemma`` one by one, and the
    first result whose ``edit_distance`` to ``word`` is exactly 1 is
    returned. When no tree yields such a result, ``word`` is returned.
    """
    if form not in self.formToEt:
        return word

    # Lazy pipeline: trees are applied only until the first fitting
    # candidate is found.
    # TODO: perform some choice here instead of taking the first match.
    candidates = (unicode(applyOnlyTree(lemma, tree))
                  for tree in self.formToEt[form])
    near_misses = (cand for cand in candidates
                   if edit_distance(cand, word) == 1)
    return next(near_misses, word)
def filterResult(self, lemma, form, word):
    """Decide whether ``word`` is an acceptable inflection of ``lemma``.

    The word is accepted without further checks when it equals the lemma or
    when ``form`` has no entry in ``self.formToEt``. Otherwise it is
    accepted only if at least one tree in ``self.formToEt[form]``, applied
    to ``lemma`` and converted with ``unicode``, reproduces ``word``.
    """
    if lemma == word:
        return True
    if form not in self.formToEt:
        return True

    # Build the full list first so every tree is applied, exactly like a
    # plain loop without an early exit.
    reproduced = [unicode(applyOnlyTree(lemma, tree)) == word
                  for tree in self.formToEt[form]]
    return any(reproduced)
def get_CNN_data(file_in, file_out, USE_CORPUS):
    """Turn tab-separated inflection data into edit-tree classification files.

    Every input line must have at least three tab-separated fields, read as
    source word, morphological tag and target word (in that order).

    Part 1 reads ``file_in + 'train'``, gives every distinct edit tree
    (keyed by its ``myprint()`` string) an index, counts how often it occurs
    and collects the tags it was seen with. The source vocabulary holds the
    characters of the source words plus one 'et<i>' symbol per tree and
    'etUNK'; the target vocabulary holds only the tree symbols and 'etUNK'.
    Vocabulary ids start at 3.

    Part 2 pickles both vocabularies to ``file_out + '_src_voc.pkl'`` and
    ``file_out + '_trg_voc.pkl'``, and both sizes (+3) to
    ``file_out + '_number_chars'``.

    Part 3 writes ``file_out + '_<part>_src'`` / ``'_<part>_trg'`` for
    'train' and 'dev', and for 'test' only when ``file_in`` contains
    'german' or 'arabic'. A source line is the space-separated characters
    of the word followed by the symbols of all trees that occurred more
    than once in training, were seen with the line's tag and are
    applicable to the word. The target line is the symbol of the gold tree,
    or 'etUNK' if that tree is unknown or occurred at most once. Examples
    are collected in a dict keyed by source line, so identical source lines
    collapse into one entry and the last target seen wins.

    Args:
        file_in: path prefix of the input files ('train'/'dev'/'test' is
            appended directly).
        file_out: path prefix of all output files.
        USE_CORPUS: must be falsy; a truthy value prints an error and exits.

    Raises:
        SystemExit: if ``USE_CORPUS`` is truthy.
        IndexError: if an input line has fewer than three fields.
    """
    if USE_CORPUS:
        print('\nERROR: USE_CORPUS is not implemented, yet.')
        exit()

    # Part 1: Find all edit trees in the training set.
    voc_src = {}
    voc_trg = {}
    printToIndex = {}
    indexToTree = {}
    indexToFrequ = {}
    indexToTags = {}  # all tags this et has been seen with

    # Context managers guarantee the handles are released even when a
    # malformed line raises in the middle of the loop.
    with io.open(file_in + 'train', 'r', encoding='utf-8') as in_file:
        for line in in_file:
            fields = line.strip().split('\t')
            w1, morph_tag, w2 = fields[0], fields[1], fields[2]

            for char in w1:
                if char not in voc_src:
                    voc_src[char] = len(voc_src) + 3

            new_et = get_only_tree(w1, w2)
            tree_key = new_et.myprint()
            if tree_key not in printToIndex:
                tree_index = len(printToIndex)
                printToIndex[tree_key] = tree_index
                indexToTree[tree_index] = new_et
                indexToFrequ[tree_index] = 0
                indexToTags[tree_index] = set()
            tree_index = printToIndex[tree_key]
            indexToFrequ[tree_index] += 1
            indexToTags[tree_index].add(morph_tag)

    for i in range(len(indexToTree)):
        voc_src['et' + str(i)] = len(voc_src) + 3
        voc_trg['et' + str(i)] = len(voc_trg) + 3
    voc_src['etUNK'] = len(voc_src) + 3
    voc_trg['etUNK'] = len(voc_trg) + 3

    # Part 2: Store the vocabulary files.
    with open(file_out + '_src_voc.pkl', 'wb') as outfile_src_voc:
        cPickle.dump(voc_src, outfile_src_voc)
    with open(file_out + '_trg_voc.pkl', 'wb') as outfile_trg_voc:
        cPickle.dump(voc_trg, outfile_trg_voc)

    print('Storing number characters')
    with open(file_out + '_number_chars', 'wb') as no_char_file:
        cPickle.dump(len(voc_src) + 3, no_char_file)
        cPickle.dump(len(voc_trg) + 3, no_char_file)

    print('vocabulary files done')
    print(len(voc_src) + 3)
    print(len(voc_trg) + 3)

    # Part 3: Make output files.
    for part in ['train', 'dev', 'test']:
        if part == 'test' and not ('german' in file_in or 'arabic' in file_in):
            continue

        output = {}
        with io.open(file_in + part, 'r', encoding='utf-8') as in_file:
            for line in in_file:
                fields = line.strip().split('\t')
                w1, morph_tag, w2 = fields[0], fields[1], fields[2]

                tree_key = get_only_tree(w1, w2).myprint()
                # Trees seen at most once in training are treated as unknown;
                # for dev and test the tree may not be known at all.
                if (tree_key in printToIndex
                        and indexToFrequ[printToIndex[tree_key]] > 1):
                    out_t = u'et' + str(printToIndex[tree_key]) + '\n'
                else:
                    out_t = u'etUNK\n'

                pieces = [u' '.join(list(w1))]
                # .items() yields the same order as the former .iteritems()
                # on Python 2 and also works on Python 3.
                for index, tree in indexToTree.items():
                    if (indexToFrequ[index] <= 1
                            or morph_tag not in indexToTags[index]):
                        continue
                    # Store every applicable tree.
                    # TODO: mark whether the resulting word appears in the
                    # corpus once USE_CORPUS is implemented.
                    if 'error' not in applyOnlyTree(w1, tree):
                        pieces.append(u' et' + str(index))
                out_s = u''.join(pieces) + u'\n'

                output[out_s] = out_t

        src_path = file_out + '_' + part + '_src'
        trg_path = file_out + '_' + part + '_trg'
        with io.open(src_path, 'w', encoding='utf-8') as out_src:
            with io.open(trg_path, 'w', encoding='utf-8') as out_trg:
                for out_s, out_t in output.items():
                    out_src.write(out_s)
                    out_trg.write(out_t)

        print(part + ' done')
if __name__ == "__main__": lang = sys.argv[1] if len(sys.argv) == 2: usePickle = True else: usePickle = False posToEt, lemmaToEtAndTag, formToEt = editTreesByPos(lang) for lemma, aSet in lemmaToEtAndTag.items(): for (et, form) in aSet: if (lemma, form) not in uniquenessCheck: uniquenessCheck[(lemma, form)] = set() uniquenessCheck[(lemma, form)].add(applyOnlyTree(lemma, et)) #print(applyOnlyTree(lemma, et)) #sys.exit(0) if not usePickle: # Read the bonus corpus. announce('Start reading corpus...') corpusWords = {} # word to its frequency with open(sys.argv[2], 'r') as corpus_file: for line in corpus_file: #tokens = tokenize.word_tokenize(line.strip()) tokens = line.strip().split(' ') for token in tokens: if token not in corpusWords: corpusWords[token] = 0 corpusWords[token] += 1