import sys

import common  # Apertium lex-tools helpers: tokenise_* and wrap

# NOTE: `wrap` and `THRESHOLD` are assumed to be module-level helpers in the
# original script: `wrap` wraps a token in the ^...$ stream delimiters and
# `THRESHOLD` drops low-frequency lexicon entries.


def ngram_count_patterns(freq_lexicon, candidates):

    MAX_NGRAMS = 3
    cur_line = 0

    sl_tl_defaults = {}
    sl_tl = {}
    ngrams = {}

    meevents = {}  # meevents[slword][counter] = [feat, feat, feat]
    meoutcomes = {}  # meoutcomes[slword][counter] = tlword
    event_counter = 0

    features = {}  # features[(slword, ['a', 'list'], tlword)] = 3
    feature_counter = 0

    indexes = {}
    trad_counter = {}
    for line in open(freq_lexicon, 'r').readlines():
        if len(line.strip()) < 1:
            continue

        w = int(line.split(' ')[0])
        if w < THRESHOLD:
            continue

        row = common.tokenise_tagger_line(line)
        sl = wrap(row[0]).lower()
        tl = wrap(row[1]).lower()
        if tl[1] == '*':
            tl = tl[:-3] + '$'

        if sl not in sl_tl:
            sl_tl[sl] = []
        
        if sl not in trad_counter:
            trad_counter[sl] = 0

        if line.count('@') > 0:
            sl_tl_defaults[sl] = tl
        sl_tl[sl].append(tl)
        indexes[(sl, tl)] = trad_counter[sl]
        trad_counter[sl] = trad_counter[sl] + 1

    cur_sl_row = []
    cur_tl_row = []
    cur_bt_row = []
    cur_al_row = []

    for line in open(candidates, 'r').readlines():
        line = line.strip()
        if not line:
            continue
        if line[0] == '-':
            # Read the corpus and note all ambiguous words, their frequency
            # and their possible translations.
            #
            # sl_tl[sl_word] = [tl_word, ...]; indexes[(sl, tl)] = class index
            i = 0
            for slword in cur_sl_row:
                if len(cur_bt_row[i]['tls']) > 1:
                    for al in cur_al_row:
                        if al == '':
                            continue
                        # alignment pairs are 'tl-sl' index pairs
                        al_sl = int(al.split('-')[1])
                        al_tl = int(al.split('-')[0])
                        if al_sl != i:
                            continue
                        

                        tlword = wrap(cur_tl_row[al_tl].lower())
                        slword = wrap(slword.lower())

                        if tlword[1] == '*' or slword[1] == '*':
                            continue

                        if slword not in sl_tl_defaults:
                            # WARNING: slword not in sl_tl_defaults, skip it
                            continue

                        if (slword, tlword) not in indexes:
                            # WARNING: (slword, tlword) not found in indexes
                            continue
                        for j in range(1, MAX_NGRAMS):
                            # left, right and surrounding context n-grams
                            pregram = ' '.join(map(wrap, cur_sl_row[i-j:i+1]))
                            postgram = ' '.join(map(wrap, cur_sl_row[i:i+j+1]))
                            roundgram = ' '.join(map(wrap, cur_sl_row[i-j:i+j+1]))

                            if slword not in ngrams:
                                ngrams[slword] = {}

                            for gram in (pregram, postgram, roundgram):
                                if gram not in ngrams[slword]:
                                    ngrams[slword][gram] = {}
                                if tlword not in ngrams[slword][gram]:
                                    ngrams[slword][gram][tlword] = 0
                                ngrams[slword][gram][tlword] += 1
                        
                        if slword not in meevents:
                            meevents[slword] = {}

                        if slword not in meoutcomes:
                            meoutcomes[slword] = {}

                        if event_counter not in meevents[slword]:
                            meevents[slword][event_counter] = []

                        if event_counter not in meoutcomes[slword]:
                            meoutcomes[slword][event_counter] = ''
                        for ni in ngrams[slword]:
                            if ni not in features:
                                feature_counter = feature_counter + 1
                                features[ni] = feature_counter

                            meevents[slword][event_counter].append(features[ni])
                            meoutcomes[slword][event_counter] = tlword

                        ngrams = {}
                        if len(sl_tl[slword]) < 2:
                            continue
                        
                        # Emit one training event per line: the index of the
                        # correct translation, then one feature:class block
                        # per candidate translation class.
                        for event in meevents[slword]:
                            outline = str(indexes[(slword, meoutcomes[slword][event])]) + ' # '
                            for j in range(0, len(sl_tl[slword])):
                                for feature in meevents[slword][event]:
                                    outline = outline + str(feature) + ':' + str(j) + ' '
                                outline = outline + ' # '

                            print(slword, '\t', len(sl_tl[slword]), '\t', outline)

                        meevents = {}
                        meoutcomes = {}

                i = i + 1

            cur_line = 0
            event_counter = event_counter + 1
            continue

        line = line.split('\t')[1].strip()

        if cur_line == 0:
            cur_sl_row = common.tokenise_tagger_line(line)
        elif cur_line == 1:
            cur_bt_row = common.tokenise_biltrans_line(line)
        elif cur_line == 2:
            cur_tl_row = common.tokenise_tagger_line(line)
        elif cur_line == 3:
            cur_al_row = line.split(' ')
        

        cur_line = cur_line + 1
    

    for feature in features:
        print(features[feature], '\t', feature, file=sys.stderr)
    return
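
A minimal usage sketch (hypothetical file names; assumes the module-level
THRESHOLD and wrap helpers noted above). The frequency lexicon has one
"<count> ^sl$ ^tl$ [@]" entry per line ('@' marks the default translation),
and the candidates file holds numbered 4-line blocks (sl, biltrans, tl,
alignments) separated by lines starting with '-':

    ngram_count_patterns('lex.freq', 'candidates.crp')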
Example #2
while reading:  #{
    try:
        lineno = lineno + 1
        pt_line = phrase_table.readline().strip()
        bt_line = biltrans_out.readline().strip()

        if bt_line == '' and pt_line == '':  #{
            reading = False
            continue
        #}

        row = pt_line.split('|||')
        sl = common.tokenise_tagger_line(row[1])
        tl = common.tokenise_tagger_line(row[0])
        alignments = row[2].strip()
        bt = common.tokenise_biltrans_line(bt_line)

        if not ambiguous(bt):  #{
            continue
        #}
        if len(sl) < 2 and len(tl) < 2:  #{
            continue
        #}

        # Here we collect a set of SL words, with their correspondences in the bilingual
        # dictionary, and the word they have been aligned with in the target.
        # e.g.  words[0] = ('sl', ['bt1', 'bt2', ...], 'tl')

        translations = {}
        i = 0
        for j in alignments.split(' '):  #{
Example #3

while reading:  #{
    try:
        lineno = lineno + 1
        pt_line = phrase_table.readline().strip()
        bt_line = biltrans_out.readline().strip()

        if not bt_line.strip() and not pt_line.strip():  #{
            reading = False
            break
        elif not bt_line.strip() or not pt_line.strip():  #{
            continue
        #}
        row = pt_line.split('|||')
        bt = common.tokenise_biltrans_line(bt_line.strip())
        sl = common.tokenise_tagger_line(row[1].strip())
        tl = common.tokenise_tagger_line(row[0].strip())

        if not ambiguous(bt):  #{
            not_ambiguous.append(str(lineno))
            if len(not_ambiguous) >= 10:  #{
                print("not ambiguous:",
                      ' '.join(not_ambiguous),
                      file=sys.stderr)
                not_ambiguous = []
            #}
            continue
        #}
        if len(sl) < 2 and len(tl) < 2:  #{
            continue
Example #4
                        sl_tl[slword][tlword] = sl_tl[slword][tlword] + 1
                        # print '+' , slword , tlword , sl_tl[slword][tlword], lineno;
                    #}
                #}
                i = i + 1
            #}
            cur_line = 0
            continue
        #}

        line = line.split('\t')[1]

        if cur_line == 0:  #{
            cur_sl_row = common.tokenise_tagger_line(line)
        elif cur_line == 1:  #{
            cur_bt_row = common.tokenise_biltrans_line(line)
        elif cur_line == 2:  #{
            cur_tl_row = common.tokenise_tagger_line(line)
        elif cur_line == 3:  #{
            cur_al_row = line.split(' ')
        #}

        cur_line = cur_line + 1
    #}
#}

for sl in sl_tl:  #{

    newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x])
    newtl.reverse()
    first = True
Example #5
import sys
import traceback

import common  # Apertium lex-tools helpers: tokenise_* and wrap


def extract_freq_lexicon(candidates):

    cur_line = 0
    lineno = 0
    sl_tl = {}

    cur_sl_row = []
    cur_tl_row = []
    cur_bt_row = []
    cur_al_row = []

    with open(candidates) as infile:
        for line in infile:
            line = line.strip()
            lineno += 1
            if lineno % 5000 == 0:
                sys.stderr.write('.')
                if lineno % 100000 == 0:
                    sys.stderr.write(str(lineno) + '\n')

                sys.stderr.flush()

            try:
                if line[0] == '-':
                    # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations
                    #
                    # sl_tl[sl_word][tl_word] = tl_freq
                    i = 0
                    for slword in cur_sl_row:
                        if len(cur_bt_row[i]['tls']) > 1:
                            for al in cur_al_row:
                                if al == '':
                                    continue
                                al_sl = int(al.split('-')[1])
                                al_tl = int(al.split('-')[0])
                                if al_sl != i:
                                    continue

                                if al_tl < len(cur_tl_row):
                                    tlword = cur_tl_row[al_tl]
                                else:
                                    traceback.print_stack()
                                    print("alignment out of range:",
                                          al_tl,
                                          "not in len(",
                                          cur_tl_row,
                                          ")",
                                          file=sys.stderr)
                                    exit(1)
                                if slword not in sl_tl:
                                    sl_tl[slword] = {}

                                if tlword not in sl_tl[slword]:
                                    sl_tl[slword][tlword] = 0

                                sl_tl[slword][tlword] = sl_tl[slword][tlword] + 1

                        i = i + 1

                    cur_line = 0
                    continue

                line = line.split('\t')[1]

                if cur_line == 0:
                    cur_sl_row = common.tokenise_tagger_line(line)
                elif cur_line == 1:
                    cur_bt_row = common.tokenise_biltrans_line(line)
                elif cur_line == 2:
                    cur_tl_row = common.tokenise_tagger_line(line)
                elif cur_line == 3:
                    cur_al_row = line.split(' ')

                cur_line = cur_line + 1
            except Exception:
                traceback.print_exc()
                exit(1)

    for sl in sl_tl:

        newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x])
        newtl.reverse()
        first = True
        for tl in newtl:
            if tl[0] == '*':
                print('Error: tl word unknown', tl, file=sys.stderr)
                continue

            first_tag_sl = sl.split('<')[1].split('>')[0].strip()
            first_tag_tl = tl.split('<')[1].split('>')[0].strip()
            if first_tag_sl != first_tag_tl:
                print('Error:',
                      first_tag_sl,
                      '!=',
                      first_tag_tl,
                      file=sys.stderr)
                continue

            if first:
                print(sl_tl[sl][tl], common.wrap(sl), common.wrap(tl), '@')
                first = False
            else:
                print(sl_tl[sl][tl], common.wrap(sl), common.wrap(tl))
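
A minimal usage sketch (hypothetical file name). The function reads the same
numbered 4-line candidate blocks as above and prints the frequency lexicon to
stdout, most frequent translation first, with '@' marking each word's default:

    extract_freq_lexicon('candidates.crp')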
Example #6
import sys

import common  # Apertium lex-tools helpers: ambiguous and tokenise_*


def extract_sentences(phrase_table_file, biltrans_out_file):
    lineno = 0
    total_valid = 0
    total_errors = 0

    not_ambiguous = []
    with open(phrase_table_file) as phrase_table, open(
            biltrans_out_file) as biltrans_out:
        while True:
            try:
                lineno = lineno + 1
                pt_line = phrase_table.readline().strip()
                bt_line = biltrans_out.readline().strip()

                if not bt_line.strip() and not pt_line.strip():
                    break
                elif not bt_line.strip() or not pt_line.strip():
                    continue

                row = pt_line.split('|||')
                bt = common.tokenise_biltrans_line(bt_line.strip())
                sl = common.tokenise_tagger_line(row[1].strip())
                tl = common.tokenise_tagger_line(row[0].strip())

                if not common.ambiguous(bt):
                    not_ambiguous.append(str(lineno))
                    if len(not_ambiguous) >= 10:
                        print("not ambiguous:",
                              ' '.join(not_ambiguous),
                              file=sys.stderr)
                        not_ambiguous = []

                    continue

                if len(sl) < 2 and len(tl) < 2:
                    continue

                # Check that the number of words in the lexical transfer
                # and in the phrase table match up
                if len(sl) != len(bt):
                    print("Error in line",
                          lineno,
                          ": len(sl) != len(bt)",
                          file=sys.stderr)
                    continue

                # check whether the alignments are empty
                if not row[2].strip():
                    print("In line",
                          lineno,
                          ", alignments are empty",
                          file=sys.stderr)
                    continue

                # Resumption<n> of<pr> the<def><def> session<n>
                # Resumption<n><sg>/Reanudación<n><f><sg> of<pr>/de<pr> the<det><def><sp>/el<det><def><GD><ND> session<n><sg>/sesión<n><f><sg>
                # Reanudación<n> de<pr> el<det><def> periodo<n> de<pr> sesión<n>
                # 0-0 1-1 2-2 5-3

                print(lineno, '\t' + row[1])
                print(lineno, '\t' + bt_line)
                print(lineno, '\t' + row[0])
                print(lineno, '\t' + row[2])
                print('-' * (len(bt_line) + 5))
                total_valid += 1
            except Exception as e:
                print("Error in line", lineno, ": ", e, file=sys.stderr)
                total_errors += 1
                continue

    print('total:', lineno, file=sys.stderr)
    print('valid:',
          total_valid,
          '(' + str((total_valid / lineno) * 100) + '%)',
          file=sys.stderr)
    print('errors:',
          total_errors,
          '(' + str((total_errors / lineno) * 100) + '%)',
          file=sys.stderr)
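
A minimal usage sketch (hypothetical file names). The phrase table is a
'|||'-separated file whose first field is the target side, second the source
side, and third the alignments, as parsed above; valid ambiguous sentences are
printed to stdout as the numbered 4-line blocks consumed by the other
functions:

    extract_sentences('europarl.phrase-table', 'europarl.biltrans')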
Example #7
import sys

import common  # Apertium lex-tools helpers: tokenise_* and wrap


def ngram_count_patterns(freq_lexicon, candidates, crisphold, max_rules):
    MAX_NGRAMS = 2
    cur_line = 0

    sl_tl_defaults = {}
    sl_tl = {}
    ngrams = {}

    lineno = 0
    for line in open(freq_lexicon).readlines():
        lineno += 1
        if lineno % 10000 == 0:
            print(lineno, file=sys.stderr)
        if len(line.strip()) < 1:
            continue

        row = common.tokenise_tagger_line(line)
        sl = common.wrap(row[0])
        tl = common.wrap(row[1])
        if tl[1] == '*':
            tl = tl[:-3] + '$'
        if line.count('@') > 0:
            sl_tl_defaults[sl] = tl
        else:
            sl_tl[sl] = tl

    cur_sl_row = []
    cur_tl_row = []
    cur_bt_row = []
    cur_al_row = []
    lineno = 0
    for line in open(candidates).readlines():
        lineno += 1
        line = line.strip()
        if lineno % 500 == 0:
            print(lineno, file=sys.stderr)
        if not line:
            continue
        if line[0] == '-':
            # Read the corpus and note all ambiguous words, their frequency
            # and their possible translations.
            i = 0
            for slword in cur_sl_row:
                if len(cur_bt_row[i]['tls']) > 1:
                    for al in cur_al_row:
                        if al == '':
                            continue
                        al_sl = int(al.split('-')[1])
                        al_tl = int(al.split('-')[0])
                        if al_sl != i:
                            continue

                        tlword = common.wrap(cur_tl_row[al_tl])
                        slword = common.wrap(slword)

                        if slword not in sl_tl_defaults:
                            print('!', file=sys.stderr)
                            continue

                        for j in range(1, MAX_NGRAMS):
                            pregram = ' '.join(
                                map(common.wrap, cur_sl_row[i - j:i + 1]))
                            postgram = ' '.join(
                                map(common.wrap, cur_sl_row[i:i + j + 1]))
                            roundgram = ' '.join(
                                map(common.wrap, cur_sl_row[i - j:i + j + 1]))

                            if slword not in ngrams:
                                ngrams[slword] = {}

                            for gram in (pregram, postgram, roundgram):
                                if gram not in ngrams[slword]:
                                    ngrams[slword][gram] = {}
                                if tlword not in ngrams[slword][gram]:
                                    ngrams[slword][gram][tlword] = 0
                                ngrams[slword][gram][tlword] += 1

                i = i + 1

            cur_line = 0
            continue

        line = line.split('\t')[1]

        if cur_line == 0:
            cur_sl_row = common.tokenise_tagger_line(line)
        elif cur_line == 1:
            cur_bt_row = common.tokenise_biltrans_line(line)
        elif cur_line == 2:
            cur_tl_row = common.tokenise_tagger_line(line)
        elif cur_line == 3:
            cur_al_row = line.split(' ')

        cur_line = cur_line + 1

    for sl in ngrams:
        for ngram in ngrams[sl]:
            total = 0
            max_freq = -1
            current_tl = ''
            newtl = sorted(ngrams[sl][ngram],
                           key=lambda x: ngrams[sl][ngram][x])
            newtl.reverse()
            newtl = newtl[:max_rules]
            for tl in newtl:
                if ngrams[sl][ngram][tl] > max_freq:
                    max_freq = ngrams[sl][ngram][tl]
                    current_tl = tl

                total = total + ngrams[sl][ngram][tl]

            # > If for each of the rules we include
            # > the amount of time the translation is seen with that pattern over the
            # > total, we get a number we can try as a threshold. e.g. > 0.6 >0.7 >0.8
            # > etc.  (>0.6 would be the same as 2/3 of the time the alternative
            # > translation is seen with that ngram, and 1/3 of the time the default
            # > translation is). I think this would be easier to explain than the magic
            # > number I came up with.
            #
            # I see this as a way to define how "crispy" the decisions are. I think it
            # would be better to express this as a ratio: the ratio of the times the
            # alternative translation is seen to the number of times the default
            # translation is seen with that n-gram.
            #
            # It would be "2" in this case: the alternative is seen twice as often as
            # the default.
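            #
            # A worked example with hypothetical counts: if for some n-gram
            # ngrams[sl][ngram] == {'alt': 4, 'default': 2}, then total == 6,
            # alt_crisp == 4/6, def_crisp == 2/6, and crispiness == 2.0: the
            # alternative is seen twice as often as the default, so the rule
            # is kept ('+') whenever crisphold <= 2.0.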

            for tl in newtl:
                default = sl_tl_defaults[sl]
                alt_crisp = float(ngrams[sl][ngram][tl]) / float(total)
                def_crisp = 1.0
                if default in ngrams[sl][ngram]:
                    def_crisp = float(ngrams[sl][ngram][default]) / float(total)

                weight = alt_crisp
                crispiness = alt_crisp / def_crisp

                if crispiness < crisphold:
                    print(
                        '-', crispiness, weight, total, max_freq,
                        ngrams[sl][ngram][tl], '\t' + sl + '\t' + ngram +
                        '\t' + tl + '\t' + str(ngrams[sl][ngram][tl]))
                else:
                    print(
                        '+', crispiness, weight, total, max_freq,
                        ngrams[sl][ngram][tl], '\t' + sl + '\t' + ngram +
                        '\t' + tl + '\t' + str(ngrams[sl][ngram][current_tl]))
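
A minimal usage sketch (hypothetical values). crisphold is the ratio threshold
discussed in the comment above, and max_rules caps how many candidate
translations are kept per n-gram:

    ngram_count_patterns('lex.freq', 'candidates.crp', 2.0, 3)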