def autoComplete2(lemma1, etTag1, lemma2, etTag2, corpusWords):
    """Try to give lemma2 the (edit tree, form) pairs it lacks w.r.t. etTag1.

    Every pair that is in ``etTag1`` but not in ``etTag2`` is applied to
    ``lemma2`` with ``applyOnlyTree``.  A pair is kept when the produced word
    is a key of ``corpusWords`` whose value is greater than 3.  The whole
    attempt is abandoned (nothing is stored) as soon as one application
    returns ``'#error#'`` or a second produced word fails the corpus check.

    When the attempt succeeds and at least one pair was kept, the kept pairs
    are merged into the module-level ``toAdd[lemma2]`` and every produced
    word is recorded in the module-level ``uniquenessCheck`` under
    ``(lemma2, form)``.

    ``lemma1`` is accepted but not used.  The function returns ``None``.
    """
    accepted = set()
    misses = 0

    for (tree, tag) in etTag1.difference(etTag2):
        candidate = applyOnlyTree(lemma2, tree)

        # A tree that cannot be applied invalidates the whole transfer.
        if candidate == '#error#':
            return

        rare = candidate not in corpusWords or corpusWords[candidate] <= 3  # orig is 3
        if not rare:
            accepted.add((tree, tag))
            continue

        # One unsupported word is tolerated, the second one aborts.
        misses += 1
        if misses == 2:
            return

    if not accepted:
        return

    toAdd[lemma2] = toAdd.setdefault(lemma2, set()).union(accepted)
    for (tree, tag) in accepted:
        uniquenessCheck.setdefault((lemma2, tag), set()).add(
            applyOnlyTree(lemma2, tree))
def autoComplete(lemma1, etTag1, lemma2, etTag2, corpusWords):
    """Mutually complete two lemmas with each other's (edit tree, form) pairs.

    Both orderings of the lemma pair are first recorded in the module-level
    ``doneLemmas``.  Then the pairs lemma1 is missing (in ``etTag2`` but not
    in ``etTag1``) are applied to lemma1, and afterwards the pairs lemma2 is
    missing are applied to lemma2.  A pair is kept when the word produced by
    ``applyOnlyTree`` is a key of ``corpusWords`` with a value greater
    than 3.  A pass fails when an application returns ``'#error#'`` or when
    the miss counter reaches exactly 1; the counter is shared by both passes
    and is not reset in between.

    Only when both passes succeed are the kept pairs merged into the
    module-level ``toAdd`` and the produced words recorded in the
    module-level ``uniquenessCheck``.  The function returns ``None``.
    """
    for ordering in ((lemma1, lemma2), (lemma2, lemma1)):
        doneLemmas.add(ordering)

    def _probe(lemma, pairs, misses):
        # Apply every pair to `lemma`; return (passed, kept pairs, miss count).
        kept = set()
        for (tree, tag) in pairs:
            word = applyOnlyTree(lemma, tree)
            if word == '#error#':
                return False, kept, misses
            rare = word not in corpusWords or corpusWords[word] <= 3
            if not rare:
                kept.add((tree, tag))
                continue
            misses += 1
            if misses == 1:
                return False, kept, misses
        return True, kept, misses

    # Check for lemma 1, then for lemma 2 (the miss counter carries over).
    passed1, extra1, misses = _probe(lemma1, etTag2.difference(etTag1), 0)
    passed2, extra2, misses = _probe(lemma2, etTag1.difference(etTag2), misses)

    if not (passed1 and passed2):
        return

    for lemma, extra in ((lemma1, extra1), (lemma2, extra2)):
        if not extra:
            continue
        toAdd[lemma] = toAdd.setdefault(lemma, set()).union(extra)
        for (tree, tag) in extra:
            uniquenessCheck.setdefault((lemma, tag), set()).add(
                applyOnlyTree(lemma, tree))
  def rightFormInEtSet(self, lemma, form, solution):
    """Tell whether an edit tree stored for `form` maps `lemma` to `solution`.

    Returns False when `form` is not a key of `self.formToEt`.  Otherwise
    every edit tree in `self.formToEt[form]` is applied to `lemma` with
    `applyOnlyTree`, and the result is True if at least one application,
    converted with `unicode`, equals `solution`.
    """
    if form not in self.formToEt:
      return False

    # Build the full list first so that every tree is applied, as before.
    matches = [unicode(applyOnlyTree(lemma, tree)) == solution
               for tree in self.formToEt[form]]
    return any(matches)
 def correctResult(self, lemma, form, word):
   """Replace `word` by a near-identical form derived from `lemma`, if any.

   The edit trees in `self.formToEt[form]` are applied to `lemma` in order;
   the first produced word for which `edit_distance(produced, word)` is
   exactly 1 is returned.  `word` is returned unchanged when `form` is not
   a key of `self.formToEt` or when no produced word qualifies.
   """
   if form in self.formToEt:
     # Choose the form that has the mistake at an edit tree border
     # (and maybe for doubling).
     for tree in self.formToEt[form]:
       candidate = unicode(applyOnlyTree(lemma, tree))
       if edit_distance(candidate, word) == 1:
         return candidate  # TODO: perform some choice here

   # Either the form is unknown or no fitting solution has been found.
   return word
 def filterResult(self, lemma, form, word):
   """Decide whether `word` is an acceptable inflection of `lemma`.

   Returns True without further checks when `word` equals `lemma` or when
   `form` is not a key of `self.formToEt`.  Otherwise returns True only if
   applying one of the edit trees in `self.formToEt[form]` to `lemma`
   (via `applyOnlyTree`, converted with `unicode`) yields `word`.
   """
   nothing_to_check = lemma == word or form not in self.formToEt
   if nothing_to_check:
     return True

   # Build the full list first so that every tree is applied, as before.
   hits = [unicode(applyOnlyTree(lemma, tree)) == word
           for tree in self.formToEt[form]]
   return any(hits)
Exemplo n.º 6
0
def get_CNN_data(file_in, file_out, USE_CORPUS):
    """Build vocabularies and source/target files from tab-separated data.

    Input files are ``file_in + 'train'``, ``file_in + 'dev'`` and, only when
    ``file_in`` contains ``'german'`` or ``'arabic'``, ``file_in + 'test'``.
    Each line is split on tabs: column 0 is the input word, column 1 the
    morphological tag and column 2 the second word passed to
    ``get_only_tree`` (presumably the target form -- confirm with the data).

    Written files:
      * ``file_out + '_src_voc.pkl'`` / ``'_trg_voc.pkl'``: pickled
        symbol-to-index dicts (indices start at 3).
      * ``file_out + '_number_chars'``: two pickled ints,
        ``len(voc_src) + 3`` and ``len(voc_trg) + 3``.
      * ``file_out + '_' + part + '_src'`` / ``'_trg'`` for every processed
        part: the space-separated characters of the input word followed by
        every applicable edit-tree symbol, and the gold edit-tree symbol
        (or ``etUNK``).

    Args:
        file_in: Path prefix of the input files.
        file_out: Path prefix of all output files.
        USE_CORPUS: Must be falsy; a truthy value prints an error and calls
            ``exit()``.
    """
    if USE_CORPUS:
        print('\nERROR: USE_CORPUS is not implemented, yet.')
        exit()

    # Part 1: Find all edit trees in the training set.
    voc_src = {}
    voc_trg = {}
    printToIndex = {}
    indexToTree = {}
    indexToFrequ = {}
    indexToTags = {}  # all tags this et has been seen with

    # The context manager closes the file even if parsing a line fails.
    with io.open(file_in + 'train', 'r', encoding='utf-8') as in_file:
        for line in in_file:
            fields = line.strip().split('\t')
            w1, morph_tag, w2 = fields[0], fields[1], fields[2]

            for char in w1:
                if char not in voc_src:
                    voc_src[char] = len(voc_src) + 3

            new_et = get_only_tree(w1, w2)
            # The printed form identifies a tree; compute it once per line.
            tree_key = new_et.myprint()

            if tree_key not in printToIndex:
                tree_index = len(printToIndex)
                printToIndex[tree_key] = tree_index
                indexToTree[tree_index] = new_et
                indexToFrequ[tree_index] = 0
                indexToTags[tree_index] = set()
            tree_index = printToIndex[tree_key]
            indexToFrequ[tree_index] += 1
            indexToTags[tree_index].add(morph_tag)

    for i in range(len(indexToTree)):
        voc_src['et' + str(i)] = len(voc_src) + 3
        voc_trg['et' + str(i)] = len(voc_trg) + 3
    voc_src['etUNK'] = len(voc_src) + 3
    voc_trg['etUNK'] = len(voc_trg) + 3

    # Part 2: Store the vocabulary files.
    with open(file_out + '_src_voc.pkl', 'wb') as outfile_src_voc:
        cPickle.dump(voc_src, outfile_src_voc)
    with open(file_out + '_trg_voc.pkl', 'wb') as outfile_trg_voc:
        cPickle.dump(voc_trg, outfile_trg_voc)

    print('Storing number characters')
    with open(file_out + '_number_chars', 'wb') as no_char_file:
        cPickle.dump(len(voc_src) + 3, no_char_file)
        cPickle.dump(len(voc_trg) + 3, no_char_file)

    print('vocabulary files done')
    print(len(voc_src) + 3)
    print(len(voc_trg) + 3)

    # Part 3: Make output files.
    for part in ['train', 'dev', 'test']:
        if part == 'test' and not ('german' in file_in or 'arabic' in file_in):
            continue

        # Keyed by source line, so identical source lines are written once
        # (the last target seen wins).
        output = {}

        with io.open(file_in + part, 'r', encoding='utf-8') as in_file:
            for line in in_file:
                fields = line.strip().split('\t')
                w1, morph_tag, w2 = fields[0], fields[1], fields[2]

                new_et = get_only_tree(w1, w2)
                tree_key = new_et.myprint()

                # Trees unseen in training or seen only once map to etUNK;
                # this has to be tested for dev and test set.
                if (tree_key in printToIndex
                        and indexToFrequ[printToIndex[tree_key]] > 1):
                    out_t = (u'et' + str(printToIndex[tree_key]) + '\n')
                else:
                    out_t = (u'etUNK\n')
                out_s = (u' '.join(list(w1)))

                for index, tree in indexToTree.iteritems():
                    if (indexToFrequ[index] <= 1
                            or morph_tag not in indexToTags[index]):
                        continue
                    # Store every applicable tree.
                    if not 'error' in applyOnlyTree(w1, tree):
                        # TODO: substitute this 0 by a one if the word appears in the corpus
                        if not USE_CORPUS:
                            out_s += (u' et' + str(index))
                out_s += (u'\n')
                output[out_s] = out_t

        with io.open(file_out + '_' + part + '_src', 'w',
                     encoding='utf-8') as out_src, \
                io.open(file_out + '_' + part + '_trg', 'w',
                        encoding='utf-8') as out_trg:
            for out_s, out_t in output.iteritems():
                out_src.write(out_s)
                out_trg.write(out_t)
        print(part + ' done')

if __name__ == "__main__":
    lang = sys.argv[1]
    if len(sys.argv) == 2:
        usePickle = True
    else:
        usePickle = False

    posToEt, lemmaToEtAndTag, formToEt = editTreesByPos(lang)

    for lemma, aSet in lemmaToEtAndTag.items():
        for (et, form) in aSet:
            if (lemma, form) not in uniquenessCheck:
                uniquenessCheck[(lemma, form)] = set()
            uniquenessCheck[(lemma, form)].add(applyOnlyTree(lemma, et))
            #print(applyOnlyTree(lemma, et))
    #sys.exit(0)

    if not usePickle:
        # Read the bonus corpus.
        announce('Start reading corpus...')
        corpusWords = {}  # word to its frequency
        with open(sys.argv[2], 'r') as corpus_file:
            for line in corpus_file:
                #tokens = tokenize.word_tokenize(line.strip())
                tokens = line.strip().split(' ')
                for token in tokens:
                    if token not in corpusWords:
                        corpusWords[token] = 0
                    corpusWords[token] += 1