Example no. 1
import sys
from collections import defaultdict

import common  # Apertium lex-tools helper module


def read_frequencies(fname):
    with open(fname) as fin:
        sl_tl = {}
        sl_tl_defaults = {}
        indexes = {}
        trad_counter = defaultdict(lambda: 0)
        for line_ in fin.readlines():
            line = line_.strip()
            if not line:
                continue
            row = common.tokenize_tagger_line(line)
            sl = row[0]
            tl = row[1]
            fr = float(line.split(' ')[0])
            indexes[(sl, tl)] = trad_counter[sl]
            trad_counter[sl] += 1
            if '@' in line:
                sl_tl_defaults[sl] = tl
                if fr == 0.0:
                    print(
                        '!!! Probably something went wrong here: the default has a frequency of 0.0',
                        file=sys.stderr)
                else:
                    print('    %s => %s = %.10f' % (sl, tl, fr),
                          file=sys.stderr)
            else:
                sl_tl[sl] = tl
        return sl_tl, sl_tl_defaults, indexes
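A minimal usage sketch for read_frequencies, assuming the input file holds lines of the form 'freq ^sl-unit$ ^tl-unit$', with a trailing '@' marking the default translation; the file name and sample data are hypothetical:

# freq.txt (hypothetical):
#   8.0 ^wine<n>$ ^vin<n>$ @
#   2.0 ^wine<n>$ ^pinard<n>$
sl_tl, sl_tl_defaults, indexes = read_frequencies('freq.txt')
# sl_tl_defaults maps each source unit to its '@'-marked default translation;
# indexes records the order in which each (sl, tl) pair was first seen.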
Example no. 2
import sys

import common  # Apertium lex-tools helper module


def extract_alig_lrx(lex_freq):
    with open(lex_freq) as d:
        print('<rules>')

        for line in d:
            sys.stdout.flush()
            if line.strip().endswith('@'):  # default-marked lines end in '@'
                row = common.tokenize_tagger_line(line)

                fq = line.split(' ')[0]
                sl = row[0]
                tl = row[1]

                if line.count('>') < 2:
                    continue

                print(sl, tl, file=sys.stderr)
                sl_lem = sl.split('<')[0]
                tl_lem = tl.split('<')[0]
                sl_lem = sl_lem.replace('-', '\\-').replace('~', ' ').replace(
                    '&', '&amp;')
                tl_lem = tl_lem.replace('-', '\\-').replace('~', ' ').replace(
                    '&', '&amp;')

                sl_tag = sl.replace('><', '.').split('<')[1].strip('>')
                tl_tag = tl.replace('><', '.').split('<')[1].strip('>')

                # Comment out rules whose first SL tag is not an open class.
                if sl_tag.split('.')[0] not in ['adj', 'vblex', 'n']:
                    cmb, cma = '<!--', '-->'
                else:
                    cmb, cma = '', ''

                rule = cmb + '<rule comment="' + fq + '">'
                # rule = rule + '<match lemma="' + sl_lem + '" tags="' + sl_tag + '"><select lemma="' + tl_lem + '" tags="' + tl_tag + '"/>'
                rule = rule + '<match lemma="' + sl_lem + '"><select lemma="' + tl_lem + '"/>'
                rule = rule + '</match>'
                rule = rule + '</rule>' + cma

                print(rule)

        print('</rules>')
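extract_alig_lrx streams an lrx rule file to stdout and diagnostics to stderr; a hedged invocation sketch with a hypothetical file name:

# lex.freq (hypothetical); default-marked lines end in '@':
#   12.5 ^prendre<vblex>$ ^take<vblex>$ @
extract_alig_lrx('lex.freq')
# The emitted rule would look roughly like:
# <rule comment="12.5"><match lemma="prendre"><select lemma="take"/></match></rule>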
sl_tl_defaults = {}
sl_tl = defaultdict(list)

features = {}  # features[(slword, ['a', 'list'], tlword)] = 3

indexes = {}
trad_counter = defaultdict(lambda: 0)

# First read in the frequency defaults

for line in open(sys.argv[1]):
    line = line.strip()
    if len(line) < 1:
        continue

    row = common.tokenize_tagger_line(line)
    sl = common.wrap(row[0])
    tl = common.wrap(row[1])
    if tl[1] == '*':
        tl = tl[:-3] + '$'

    indexes[(sl, tl)] = trad_counter[sl]
    trad_counter[sl] += 1
    sl_tl[sl].append(tl)

    if line.count('@') > 0:
        sl_tl_defaults[sl] = tl


class Counter(BCC.BiltransCounter):
    tokenizer = 'biltrans'
while reading: #{	
	try:
		lineno = lineno + 1;
		pt_line = phrase_table.readline().strip();	
		bt_line = biltrans_out.readline().strip();

		if not bt_line.strip() and not pt_line.strip(): #{
			reading = False;
			break
		#}
		elif not bt_line.strip() or not pt_line.strip():
			continue;

		row = pt_line.split(' ||| ');
		bt = common.tokenize_biltrans_line(bt_line);
		sl = common.tokenize_tagger_line(row[1]);
		tl = common.tokenize_tagger_line(row[0]);

		
		if not ambiguous(bt): #{
			print ("line", lineno, "not ambiguous", file=sys.stderr);
			continue;
		#}
		if len(sl) < 2 and len(tl) < 2: #{
			continue;
		#}


		# Check that the number of words in the lexical transfer, and in the phrasetable matches up
		if len(sl) != len(bt): #{
			print ("len(sl) != len(bt)", file=sys.stderr);
	dm_file = open(sys.argv[2]); # File with tagger output
	reading = True;
	lineno = 0;
	while reading: #{
		lineno += 1
		if (lineno % 1000 == 0):
			print ("at line no: ", lineno, file=sys.stderr);
		am_line = am_file.readline();
		dm_line = dm_file.readline();
		if am_line == '': #{
			reading = False;
			continue;
		#}
		try:
			am_row = common.tokenize_biltrans_line(am_line);
			dm_row = set(common.tokenize_tagger_line(dm_line));
		except:
			continue;

		cur_sl_row = [x['sl'] for x in am_row];

		for i in range(0, len(am_row)): #{
			sl = am_row[i]['sl'];
			tls = am_row[i]['tls'];
			if len(tls) < 2: #{
				continue;
			#}


			for tl in found_tls(tls, dm_row): #{
				for j in range(1, MAX_NGRAMS): #{
def wrap (x):
	return '^' + x + '$'

sl_tl_defaults = {}; 
sl_tl = {};

indexes = {};
trad_counter = {}; 
rindex = {};

with open(sys.argv[1]) as d:
	for line in d: #{
		if len(line) < 1: #{
			continue;
		#}
		row = common.tokenize_tagger_line(line);
		sl = wrap(row[0].strip());
		tl = wrap(row[1].strip());
		if tl[1] == '*':
			tl = tl[:-3] + '$'

		if sl not in sl_tl: #{
			sl_tl[sl] = [];
		#}
		if sl not in trad_counter: #{
			trad_counter[sl] = 0;
		#}
		if line.count('@') > 0: #{
			sl_tl_defaults[sl] = tl;
		#}
		sl_tl[sl].append(tl);
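Every snippet here leans on common.tokenize_tagger_line; a minimal stand-in, assuming the helper simply extracts the ^...$ lexical units from an Apertium stream line (the real helper in apertium-lex-tools may differ):

import re

def tokenize_tagger_line(line):
    # Assumed behaviour: return the lexical units enclosed in ^...$,
    # e.g. '0.5 ^gato<n><m>$ ^cat<n>$' -> ['gato<n><m>', 'cat<n>']
    return re.findall(r'\^(.*?)\$', line)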
def ngrams_to_rules(ngrams, crisphold):
    permitted_tags = ['n', 'vblex', 'adj', 'n.*', 'vblex.*', 'adj.*']

    print('<rules>')
    lineno = 1
    ruleno = 0
    for line in open(ngrams).readlines():
        #	print('\n')
        #	print(line)
        if len(line) < 2:
            continue

        line = line.strip()

        # + 0.571428571429 14 8 8 	troiñ<vblex>		tourner<vblex>	8
        row = line.split('\t')

        if len(row) == 3:
            row.insert(0, '')

    #	tipus = row[0].split(' ')[0]
        weight = row[0].split(' ')[1]
        sl = row[1].strip()[1:-1]
        tl = row[3][1:-1]
        tl_lema = tl.split('<')[0].lower()
        tl_tags = '<'.join(tl.split('<')[1:]).replace('><',
                                                      '.').replace('>', '')

        freq = row[4]
        pattern = common.tokenize_tagger_line(row[2])

        if row[2].count('<guio>') > 0 or row[2].count(
                '<sent>') > 0 or row[2].count('<cm>') > 0:
            print('PUNCTUATION_IN_PATTERN', line, file=sys.stderr)
            continue

        inpattern = False
        for w in pattern:
            if w.count(sl) > 0:
                inpattern = True

        if not inpattern:
            print('SL_NOT_IN_PATTERN', line, sl, tl, file=sys.stderr)
            continue

        if tl_tags.count('adj') > 0 and sl.count('adj') < 1:
            print("TAG_MISMATCH", line, file=sys.stderr)
            continue

        if tl_tags.count('vbmod') > 0 and sl.count('vbmod') < 1:
            print("TAG_MISMATCH", line, file=sys.stderr)
            continue

        if tl_tags.split('.')[0] not in permitted_tags:
            print("TAG_NOT_PERMITTED", tl_tags, '||', line, file=sys.stderr)
            continue

        if float(weight) <= float(crisphold):
            print("UNDER_THRESHOLD",
                  weight,
                  "<",
                  crisphold,
                  "||",
                  line,
                  file=sys.stderr)
            continue

        if any([x.startswith("*") for x in pattern]):
            print("UNKNOWN_WORD_IN_PATTERN", pattern, file=sys.stderr)
            continue

        sel = False
        ruleno = ruleno + 1
        lineno = lineno + 1

        print('  <rule c="' + str(ruleno) + ' ' + str(lineno) + ': ' + freq +
              '" weight="' + weight + '">')
        for word in pattern:
            sl_lema = word.split('<')[0].lower()
            if sl_lema.startswith('*'):  # avoids IndexError on empty lemmas
                continue

            if word.count('><') > 0:
                sl_tags = '<'.join(word.split('<')[1:]).replace('><',
                                                                '.').replace(
                                                                    '>', '')
            else:
                sl_tags = '<'.join(word.split('<')[1:]).strip('<>')

            # ======================================================================= #

            sl_lema = sl_lema.replace('~', ' ')
            tl_lema = tl_lema.replace('~', ' ')
            sl_lema = sl_lema.replace('-', '\\-')
            tl_lema = tl_lema.replace('-', '\\-')
            sl_lema = sl_lema.replace('(', '\\(')
            tl_lema = tl_lema.replace('(', '\\(')
            sl_lema = sl_lema.replace(')', '\\)')
            tl_lema = tl_lema.replace(')', '\\)')

            if word.lower().count(sl) > 0:
                lineno = lineno + 1
                if sl_lema == '':
                    print('    <match tags="' + sl_tags + '"><select lemma="' +
                          tl_lema + '" tags="' + tl_tags + '"/></match>')
                else:
                    print('    <match lemma="' + sl_lema + '" tags="' +
                          sl_tags + '"><select lemma="' + tl_lema +
                          '" tags="' + tl_tags + '"/></match>')

                sel = True
            else:
                lineno = lineno + 1
                if sl_lema == '':
                    print('    <match tags="' + sl_tags + '"/>')
                else:
                    print('    <match lemma="' + sl_lema + '" tags="' +
                          sl_tags + '"/>')

        if not sel:
            print('  </rule> <!-- Warning: No select operation ', line, '-->')
        else:
            print('  </rule>')

        lineno = lineno + 1
    print('</rules>')
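A hedged invocation sketch for ngrams_to_rules; 'ngrams.txt' is hypothetical, with tab-separated lines in the format shown in the comment above:

# Emits <rules>...</rules> on stdout, rejected candidates on stderr;
# only candidates whose weight exceeds the threshold survive.
ngrams_to_rules('ngrams.txt', 0.5)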
					#}
				#}	
				i = i + 1;
			#}

			cur_line = 0;
		except:
		print("error in line", lineno, file=sys.stderr);
		#print line;	
		continue;
	#}	
	
	line = line.split('\t')[1];

	if cur_line == 0: #{
		cur_sl_row = common.tokenize_tagger_line(line);
	elif cur_line == 1: #{
		cur_bt_row = common.tokenize_biltrans_line(line);
	elif cur_line == 2: #{
		cur_tl_row = common.tokenize_tagger_line(line);
	elif cur_line == 3:  #{
		cur_al_row = line.split(' ');
	#}

	cur_line = cur_line + 1;
#}

for sl in sl_tl: #{

	newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x])  # assumes sl_tl[sl] maps tl -> count
	newtl.reverse()
#}
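A self-contained illustration of that sort, assuming sl_tl maps each source unit to a dict of translation -> count (as in the fuller scripts; names and data are hypothetical):

sl_tl = {'^wine<n>$': {'^vin<n>$': 8, '^pinard<n>$': 2}}
for sl in sl_tl:
    newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x], reverse=True)
    print(sl, newtl)  # ^wine<n>$ ['^vin<n>$', '^pinard<n>$']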

MAX_NGRAMS = 3;

crisphold = float(sys.argv[3]);
cur_line = 0;

sl_tl_defaults = {}; 
sl_tl = {};
ngrams = {};

for line in open(sys.argv[1]).readlines(): #{
	if len(line) < 1: #{
		continue;
	#}
	row = common.tokenize_tagger_line(line);
	sl = wrap(row[0]);
	tl = wrap(row[1]);
	if tl[1] == '*':
		tl = tl[:-3] + '$'
	if line.count('@') > 0: #{
		sl_tl_defaults[sl] = tl;
	else: #{
		sl_tl[sl] = tl;
	#}
#}

cur_sl_row = [];
cur_tl_row = [];
cur_bt_row = [];
cur_al_row = [];
    #+ 0.571428571429 14 8 8 	troiñ<vblex>		tourner<vblex>	8
    row = line.split('\t')

    if len(row) == 3:
        row.insert(0, '')

#	tipus = row[0].split(' ')[0];
    weight = row[0].split(' ')[1]
    sl = row[1].strip()[1:-1]
    tl = row[3][1:-1]
    tl_lema = tl.split('<')[0].lower()
    tl_tags = '<'.join(tl.split('<')[1:]).replace('><', '.').replace('>', '')

    freq = row[4]
    pattern = common.tokenize_tagger_line(row[2])

    if row[2].count('<guio>') > 0 or row[2].count(
            '<sent>') > 0 or row[2].count('<cm>') > 0:  #{
        print('PUNCTUATION_IN_PATTERN', line, file=sys.stderr)
        continue
    #}

    inpattern = False
    for w in pattern:  #{
        if w.count(sl) > 0:  #{
            inpattern = True
        #}
    #}
    if not inpattern:  #{
        print('SL_NOT_IN_PATTERN', line, sl, tl, file=sys.stderr)
# File with disambiguated (tagger) output; the matching am_file holding the
# ambiguous biltrans output is opened earlier in the script.
dm_file = open(sys.argv[2])
reading = True

while reading:  #{
    am_line = am_file.readline()
    dm_line = dm_file.readline()

    if am_line == '' and dm_line == '':  #{
        reading = False
        continue
    #}
    try:
        am_row = common.tokenize_biltrans_line(am_line)
        dm_row = common.tokenize_tagger_line(dm_line)
    except:
        continue

    limit = len(am_row)
    for i in range(0, limit):  #{
        if len(am_row[i]['tls']) > 1:  #{
            sl = am_row[i]['sl']
            if sl not in sl_tl:  #{
                sl_tl[sl] = {}
            #}
            bts = am_row[i]['tls']
            valid_trads = set(bts)
while reading:  # {
    lineno = lineno + 1
    pt_line = phrase_table.readline().strip()
    bt_line = biltrans_out.readline().strip()

    if bt_line == "" and pt_line == "":  # {
        reading = False
        # }

    if not ambiguous(bt_line):  # {
        # 		print(lineno, ' not ambiguous.', file=sys.stderr);
        continue
        # }

    row = pt_line.split("|||")
    print(common.tokenize_tagger_line(row[0]))
    bt = bt_line.split()
    sl = row[1].strip()
    tl = row[0].strip()
    aliniaments = row[2].strip()

    bt_row = bt_line.split(" ")
    sl_row = sl.split(" ")
    tl_row = tl.split(" ")

    if len(sl_row) < 2 and len(tl_row) < 2:  # {
        continue
        # }

        # Check that the number of words in the lexical transfer, and in the phrasetable matches up
    if len(sl_row) != len(bt_row):  # {
	line = line.strip();
	#line = line.decode('utf-8').strip();
	print(line, file=sys.stderr)
	#+ 0.571428571429 14 8 8 	troiñ<vblex>		tourner<vblex>	8
	row = line.split('\t');

	tipus = row[0].split(' ')[0];
	weight = row[0].replace('  ', ' ').split(' ')[1];
	sl = row[1].strip()[1:-1];
	tl = row[3][1:-1];
	tl_lema = tl.split('<')[0].lower();
	tl_tags = ''.join(tl.split('<')[1:]).replace('>', '.').rstrip('.')
	freq = 1
#	freq = float(row[4]);

	pattern = common.tokenize_tagger_line(row[2]);

	if row[2].count('<guio>') > 0 or row[2].count('<sent>') > 0 or row[2].count('<cm>') > 0: #{
		print('PUNCTUATION_IN_PATTERN', line, file=sys.stderr);
		continue;
	#}

	if tipus == '-' or tipus == '~': #{
		print('DEFAULT_READING', line, file=sys.stderr);
		continue;
	#}

	# Hacks
#	if len(pattern) == 0: #{
#		print('ZERO_PATTERN' , line, file=sys.stderr);
#		continue;
Example no. 14
	pt_line = phrase_table.readline().strip();
	bt_line = biltrans_out.readline().strip();

	if bt_line == '' and pt_line == '': #{
		reading = False;
	#}

	if not ambiguous(bt_line): #{
#		print(lineno, ' not ambiguous.', file=sys.stderr);
		continue;
	#}



	row = pt_line.split('|||');
	print (common.tokenize_tagger_line(row[0]));
	bt = bt_line.split();
	sl = row[1].strip();
	tl = row[0].strip();
	aliniaments = row[2].strip();

	bt_row = bt_line.split(' ');
	sl_row = sl.split(' ');
	tl_row = tl.split(' ');

	if len(sl_row) < 2 and len(tl_row) < 2: #{
		continue;
	#}

	# Check that the number of words in the lexical transfer, and in the phrasetable matches up
	if len(sl_row) != len(bt_row): #{
Example no. 15
    # File with tagger output
    reading = True
    lineno = 0
    while reading:  #{
        lineno += 1
        if (lineno % 1000 == 0):
            print("at line no: ", lineno, file=sys.stderr)
        am_line = am_file.readline()
        dm_line = dm_file.readline()
        if am_line == '':  #{
            reading = False
            continue
        #}
        try:
            am_row = common.tokenize_biltrans_line(am_line)
            dm_row = set(common.tokenize_tagger_line(dm_line))
        except:
            continue

        cur_sl_row = [x['sl'] for x in am_row]

        for i in range(0, len(am_row)):  #{
            sl = am_row[i]['sl']
            tls = am_row[i]['tls']
            if len(tls) < 2:  #{
                continue
            #}

            for tl in found_tls(tls, dm_row):  #{
                for j in range(1, MAX_NGRAMS):  #{
                    pregram = ' '.join(cur_sl_row[i - j:i + 1])
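The pregram slice gathers the current SL word plus up to j preceding ones; a tiny standalone check:

cur_sl_row = ['I', 'drink', 'red', 'wine']
i, j = 3, 2
print(' '.join(cur_sl_row[i - j:i + 1]))  # -> drink red wine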
def ngrams_to_rules(ngrams):
    # FREQMIN = 8.0

    MINMATCH = 2

    permitted_tags = ['n', 'vblex', 'adj']

    print('<rules>')
    lineno = 1
    ruleno = 0

    with open(ngrams) as infile:
        for line in infile:
            #	print '\n'
            #	print line
            if len(line) < 2:
                continue

            line = line.strip()
            # line = line.decode('utf-8').strip()
            print(line, file=sys.stderr)
            # + 0.571428571429 14 8 8 	troiñ<vblex>		tourner<vblex>	8
            row = line.split('\t')

            tipus = row[0].split(' ')[0]
            weight = row[0].replace('  ', ' ').split(' ')[1]
            sl = row[1].strip()[1:-1]
            tl = row[3][1:-1]
            tl_lema = tl.split('<')[0].lower()
            tl_tags = ''.join(tl.split('<')[1:]).replace(
                '>', '.').rstrip('.')
            freq = 1
        #	freq = float(row[4])

            pattern = common.tokenize_tagger_line(row[2])

            if row[2].count('<guio>') > 0 or row[2].count('<sent>') > 0 or row[2].count('<cm>') > 0:
                print('PUNCTUATION_IN_PATTERN', line, file=sys.stderr)
                continue

            if tipus == '-' or tipus == '~':
                print('DEFAULT_READING', line, file=sys.stderr)
                continue

            # Hacks
        #	if len(pattern) == 0:
        #		print('ZERO_PATTERN' , line, file=sys.stderr);
        #		continue

            if len(pattern) < MINMATCH and len(pattern) > 0:
                print('BELOW_MINMATCH', line, file=sys.stderr)
                continue

            inpattern = False
            for w in pattern:
                if w.lower().count(sl) > 0:
                    inpattern = True

            if len(pattern) > 0 and not inpattern:
                print('SL_NOT_IN_PATTERN', line, file=sys.stderr)
                continue

            if tl_tags.count('adj') > 0 and sl.count('adj') < 1:
                print("TAG_MISMATCH", line, file=sys.stderr)
                continue

            if tl_tags.count('vbmod') > 0 and sl.count('vbmod') < 1:
                print("TAG_MISMATCH", line, file=sys.stderr)
                continue

            if tl_tags.split('.')[0] not in permitted_tags:
                print("TAG_NOT_PERMITTED", tl_tags,
                      '||', line, file=sys.stderr)
                continue

            sel = False
            ruleno = ruleno + 1
            lineno = lineno + 1

            commentb = ''
            commente = ''
        #	if freq < FREQMIN:
        #		commentb = '<!--'
        #		commente = '-->'

            print(commentb + '  <rule c="' + str(ruleno) + ' ' +
                  str(lineno) + ': ' + str(freq) + '" weight="' + weight + '">')
            for word in pattern:
                sl_lema = word.split('<')[0].lower()
                if word.count('><') > 0:
                    sl_tags = '<'.join(word.split('<')[1:]).replace(
                        '><', '.').replace('>', '')
                else:
                    sl_tags = '<'.join(word.split('<')[1:]).strip('<>')

                # ======================================================================= #

                sl_lema = sl_lema.replace('~', ' ')
                tl_lema = tl_lema.replace('~', ' ')
        #		sl_lema = sl_lema.replace('-', '\-')
        #		tl_lema = tl_lema.replace('-', '\-')
        #		sl_lema = sl_lema.replace('(', '\(')
        #		tl_lema = tl_lema.replace('(', '\(')
        #		sl_lema = sl_lema.replace(')', '\)')
        #		tl_lema = tl_lema.replace(')', '\)')
        #
                if word.lower().count(sl) > 0:
                    lineno = lineno + 1
                    if sl_lema == '':
                        print('    <match tags="' + sl_tags + '"><select lemma="' +
                              tl_lema + '" tags="' + tl_tags + '"/></match>')
                    else:
                        print('    <match lemma="' + sl_lema + '" tags="' + sl_tags +
                              '"><select lemma="' + tl_lema + '" tags="' + tl_tags + '"/></match>')

                    sel = True
                else:
                    lineno = lineno + 1
                    if sl_lema == '':
                        print('    <match tags="' + sl_tags + '"/>')
                    else:
                        print('    <match lemma="' + sl_lema +
                              '" tags="' + sl_tags + '"/>')

            if not sel and len(pattern) == 0:
                sl_lema = sl.split('<')[0]
                if sl.count('><') > 0:
                    sl_tags = '<'.join(sl.split('<')[1:]).replace(
                        '><', '.').replace('>', '')
                else:
                    sl_tags = '<'.join(sl.split('<')[1:]).strip('<>')

                if sl_lema == '':
                    print('    <match tags="' + sl_tags + '"><select lemma="' +
                          tl_lema + '" tags="' + tl_tags + '"/></match>')
                else:
                    print('    <match lemma="' + sl_lema + '" tags="' + sl_tags +
                          '"><select lemma="' + tl_lema + '" tags="' + tl_tags + '"/></match>')
                print('  </rule>' + commente)
            elif not sel:
                print('  </rule>'+commente +
                      '<!-- Warning: No select operation ', line, '-->')
            else:
                print('  </rule>' + commente)

            lineno = lineno + 1

    print('</rules>')
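As above, a hedged invocation sketch for this variant, which takes no threshold argument and instead drops patterns shorter than MINMATCH:

ngrams_to_rules('ngrams.txt')  # hypothetical file; rules to stdout, diagnostics to stderr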