示例#1
0
 def next_dm_line(self):
     """Advance ``self.dm_file`` by one line and parse that line.

     The line counter ``self.dm_linenum`` is bumped on every call and the
     raw text is stored in ``self.dm_line``.

     When the read yields an empty string, ``self.dm_id`` becomes ``None``,
     ``self.dm_row`` becomes an empty list, ``self.reading`` is set to
     ``False`` and the method returns.

     Otherwise:

     * if ``self.line_ids`` is truthy, the integer following the ``'.[]['``
       marker is stored in ``self.dm_id``;
     * if ``self.tokenizer`` is ``'regex'``, the second tab-separated field
       is whitespace-stripped, its first and last characters are dropped,
       and the remainder is split with ``self.lu_sep`` into ``self.dm_row``;
     * if ``self.tokenizer`` is ``'biltrans'``, the whole line is handed to
       ``common.tokenize_biltrans_line`` and the result is stored in
       ``self.dm_row``;
     * for any other tokenizer value ``self.dm_row`` is left unchanged.
     """
     self.dm_linenum += 1

     raw = self.dm_file.readline()
     self.dm_line = raw

     # Guard clause: an empty read means there is nothing left to parse.
     if not raw:
         self.dm_id = None
         self.dm_row = []
         self.reading = False
         return

     columns = raw.split('\t')

     if self.line_ids:
         # The id is the first whitespace-delimited token after '.[]['.
         after_marker = raw.split('.[][')[1]
         self.dm_id = int(after_marker.split()[0])

     tokenizer = self.tokenizer
     if tokenizer == 'regex':
         # Drop the first and last characters before splitting the units.
         payload = columns[1].strip()
         self.dm_row = self.lu_sep.split(payload[1:-1])
     elif tokenizer == 'biltrans':
         self.dm_row = common.tokenize_biltrans_line(raw)
		continue;
	#}
	current_am_line_id = int(am_line.split("\t")[0]);

#	# to skip lines in the frac corpus if we have a sub-corpus
	if current_dm_line_id != current_am_line_id: #{
		print('line_id_mismatch: %d != %d' % (current_am_line_id, current_dm_line_id), file=sys.stderr);
#		while current_dm_line_id != current_am_line_id: #{
#			dm_line = dm_file.readline();
#			current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0]);       
#			print('skipping %d ...' % (current_dm_line_id), file=sys.stderr);
#		#}
	#}
	while current_dm_line_id == current_am_line_id: #{

		am_row = common.tokenize_biltrans_line(am_line);
		dm_row = common.tokenize_biltrans_line(dm_line);

		if len(am_row) != len(dm_row): #{
			amc = len(am_row);
			dmc = len(dm_row);
			print('Mismatch in number of LUs between analysis and training', file=sys.stderr);
			print('am(',amc,'):\t' + am_line, file=sys.stderr);
			print('dm(',dmc,'):\t' + dm_line, file=sys.stderr);
			print('...skipping', file=sys.stderr);
			dm_line = dm_file.readline();
			if dm_line == '': #{
				reading = False;
				break;
			#}
			current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0]);
while reading: #{	
	try:
		lineno = lineno + 1;
		pt_line = phrase_table.readline().strip();	
		bt_line = biltrans_out.readline().strip();

		if not bt_line.strip() and not pt_line.strip(): #{
			reading = False;
			break
		#}
		elif not bt_line.strip() or not pt_line.strip():
			continue;

		row = pt_line.split(' ||| ');
		bt = common.tokenize_biltrans_line(bt_line);
		sl = common.tokenize_tagger_line(row[1]);
		tl = common.tokenize_tagger_line(row[0]);

		
		if not ambiguous(bt): #{
			print ("line", lineno, "not ambiguous", file=sys.stderr);
			continue;
		#}
		if len(sl) < 2 and len(tl) < 2: #{
			continue;
		#}


		# Check that the number of words in the lexical transfer, and in the phrasetable matches up
		if len(sl) != len(bt): #{
				i = i + 1;
			#}

			cur_line = 0;
		except:
			print >>sys.stderr, "error in line", lineno;
		#print line;	
		continue;
	#}	
	
	line = line.split('\t')[1];

	if cur_line == 0: #{
		cur_sl_row = common.tokenize_tagger_line(line);
	elif cur_line == 1: #{
		cur_bt_row = common.tokenize_biltrans_line(line);
	elif cur_line == 2: #{
		cur_tl_row = common.tokenize_tagger_line(line);
	elif cur_line == 3:  #{
		cur_al_row = line.split(' ');
	#}

	cur_line = cur_line + 1;
#}

for sl in sl_tl: #{

	newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x])
	newtl.reverse()
	first = True;
	for tl in newtl: #{
    current_am_line_id = int(am_line.split("\t")[0])

    #	# to skip lines in the frac corpus if we have a sub-corpus
    if current_dm_line_id != current_am_line_id:  #{
        print('line_id_mismatch: %d != %d' %
              (current_am_line_id, current_dm_line_id),
              file=sys.stderr)
#		while current_dm_line_id != current_am_line_id: #{
#			dm_line = dm_file.readline();
#			current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0]);
#			print('skipping %d ...' % (current_dm_line_id), file=sys.stderr);
#		#}
#}
    while current_dm_line_id == current_am_line_id:  #{

        am_row = common.tokenize_biltrans_line(am_line)
        dm_row = common.tokenize_biltrans_line(dm_line)

        if len(am_row) != len(dm_row):  #{
            amc = len(am_row)
            dmc = len(dm_row)
            print('Mismatch in number of LUs between analysis and training',
                  file=sys.stderr)
            print('am(', amc, '):\t' + am_line, file=sys.stderr)
            print('dm(', dmc, '):\t' + dm_line, file=sys.stderr)
            print('...skipping', file=sys.stderr)
            dm_line = dm_file.readline()
            if dm_line == '':  #{
                reading = False
                break
            #}