Exemplo n.º 1
0
def main():
    """Compare two splitter tables and print split recall, precision and F.

    Command-line arguments (read from ``sys.argv``):
        1. reference splitter table
        2. hypothesis splitter table

    Every row yielded by ``dlmread`` contributes the integer pair
    ``(int(row[0]), int(row[1]))``.  Pairs are collected into sets, so a
    pair repeated within one table is counted once.  A score whose
    denominator is zero is reported as 0 instead of raising
    ``ZeroDivisionError``.

    Prints a usage message and returns when the argument count is wrong.
    """
    if len(sys.argv) != 3:
        print('USAGE: python3 evalsplit.py \\')
        print('               <in-file: reference splitter>')
        print('               <in-file: hypothesis splitter>')
        return

    fname_ref = sys.argv[1]
    fname_hyp = sys.argv[2]

    sps_ref = {(int(x[0]), int(x[1])) for x in dlmread(fname_ref)}
    sps_hyp = {(int(x[0]), int(x[1])) for x in dlmread(fname_hyp)}

    # Split points present in both tables.
    match = len(sps_ref & sps_hyp)
    len_r = len(sps_ref)
    len_h = len(sps_hyp)

    # An empty table would otherwise divide by zero; report 0 in that case.
    recall = match / len_r if len_r else 0
    precision = match / len_h if len_h else 0
    f_score = 2 * match / (len_r + len_h) if len_r + len_h else 0

    print('SPLIT-R: %f [%d/%d]' % (recall, match, len_r))
    print('SPLIT-P: %f [%d/%d]' % (precision, match, len_h))
    print('SPLIT-F: %f [2*%d/(%d+%d)]' % (f_score, match, len_r, len_h))
Exemplo n.º 2
0
def main():
    """Score a hypothesis splitter table against a reference splitter table.

    Expects exactly two command-line arguments: the reference table file and
    the hypothesis table file.  With any other argument count a usage
    message is printed and the function returns.

    Rows come from ``dlmread``; the first two fields of each row are
    converted to ``int`` and stored as a pair in a set (duplicates inside a
    table therefore collapse).  Recall, precision and F are printed as
    ``SPLIT-R``, ``SPLIT-P`` and ``SPLIT-F``.  When a denominator is zero
    the corresponding score is printed as 0 rather than crashing.
    """
    if len(sys.argv) != 3:
        print('USAGE: python3 evalsplit.py \\')
        print('               <in-file: reference splitter>')
        print('               <in-file: hypothesis splitter>')
        return

    def ratio(numerator, denominator):
        # Guard against empty tables (zero denominator).
        return numerator / denominator if denominator else 0

    fname_ref, fname_hyp = sys.argv[1], sys.argv[2]

    sps_ref = {(int(x[0]), int(x[1])) for x in dlmread(fname_ref)}
    sps_hyp = {(int(x[0]), int(x[1])) for x in dlmread(fname_hyp)}

    len_r = len(sps_ref)
    len_h = len(sps_hyp)
    match = len(sps_hyp & sps_ref)

    print('SPLIT-R: %f [%d/%d]' % (ratio(match, len_r), match, len_r))
    print('SPLIT-P: %f [%d/%d]' % (ratio(match, len_h), match, len_h))
    print('SPLIT-F: %f [2*%d/(%d+%d)]' %
          (ratio(2 * match, len_r + len_h), match, len_r, len_h))
Exemplo n.º 3
0
def load(fnames):
    """Read the input corpus, the part-id/part-MT table and the splitter table.

    Args:
        fnames: mapping with the keys ``'IN'``, ``'PID'``, ``'PMT'`` and
            ``'SP'``, each holding a file name.

    Returns:
        A tuple ``(corpus_in, tab_pid_pmt, tab_sp)``:
        * ``corpus_in`` -- list of the rows ``dlmread`` yields for ``'IN'``
          with ``' '`` as delimiter;
        * ``tab_pid_pmt`` -- dict mapping a tuple of ints (one ``'PID'``
          line) to the list of space-separated tokens on the ``'PMT'`` line
          at the same position;
        * ``tab_sp`` -- defaultdict mapping the first int of each ``'SP'``
          line to the list of second ints seen for it.

    Progress and table sizes are printed to stdout.
    """
    print('--------')
    print('loading data ...')
    corpus_in = list(dlmread(fnames['IN'], ' '))

    # The PID and PMT files are read in lockstep, one line from each.
    tab_pid_pmt = {}
    with codecs.open(fnames['PID'], 'r', 'utf-8') as fp_pid, \
            codecs.open(fnames['PMT'], 'r', 'utf-8') as fp_pmt:
        for id_line, mt_line in zip(fp_pid, fp_pmt):
            part_id = tuple([int(tok) for tok in id_line.strip().split(' ')])
            tab_pid_pmt[part_id] = mt_line.strip().split(' ')

    tab_sp = defaultdict(list)
    with codecs.open(fnames['SP'], 'r', 'utf-8') as fp:
        for row in fp:
            lineno, wordno = [int(tok) for tok in row.strip().split(' ')]
            tab_sp[lineno].append(wordno)

    n_splits = sum(map(len, tab_sp.values()))
    print('  # of input sentence     : %d' % len(corpus_in))
    print('  # of part id-mt table   : %d' % len(tab_pid_pmt))
    print('  # of splitting table    : %d' % n_splits)
    return corpus_in, tab_pid_pmt, tab_sp
Exemplo n.º 4
0
def main():
    """Turn a LIBLINEAR prediction file into a splitter table.

    Command-line arguments (read from ``sys.argv``):
        1. LIBLINEAR prediction file (one integer label per line)
        2. input corpus (read with ``dlmread`` using ``' '`` as delimiter)
        3. output splitter table

    Prediction lines are matched, in order, to the positions
    ``0 .. len(sentence) - 2`` of each non-empty corpus sentence.  For every
    prediction equal to 1 the line ``"<sentence index> <position>"`` is
    written to the output file.

    Prints a usage message and returns when the argument count is wrong.
    """
    args = sys.argv[1:]
    if len(args) != 3:
        print('USAGE: python3 predicttosplit.py \\')
        print('               <in-file: LIBLINEAR prediction> \\')
        print('               <in-file: input corpus> \\')
        print('               <out-file: splitter table>')
        return

    fname_predict, fname_in, fname_splitter = args
    corpus_in = list(dlmread(fname_in, ' '))

    with codecs.open(fname_predict, 'r', 'utf-8') as fp_pred, \
            codecs.open(fname_splitter, 'w', 'utf-8') as fp_sp:
        line, pos = 0, 0
        for raw in fp_pred:
            # Advance past empty sentences and past the final word of a
            # sentence: neither has a prediction line of its own.
            while True:
                sentence = corpus_in[line]
                if sentence and pos != len(sentence) - 1:
                    break
                line, pos = line + 1, 0
            if int(raw.strip()) == 1:
                fp_sp.write('%d %d\n' % (line, pos))
            pos += 1
Exemplo n.º 5
0
def main():
    """Write a splitter table by looking up adjacent tag pairs in a model.

    Command-line arguments (read from ``sys.argv``):
        1. GreedyDP model file, passed to ``loadmodel`` together with ``mu``
        2. input corpus; every token is split on ``'_'``
        3. output splitter table
        4. ``mu`` as a float (the usage text calls it "mean of #words")

    For sentence ``i`` and position ``j``, the line ``"i j"`` is written when
    the pair formed by the second ``'_'``-field of token ``j`` and of token
    ``j + 1`` is contained in the loaded model.

    Returns:
        ``None`` after the usage message or on success; ``1`` when
        ``loadmodel`` raises, after reporting the error on stderr.
    """
    if len(sys.argv) != 5:
        print('USAGE: python3 gdptosplit.py')
        print('                 <[1] in-file: GreedyDP model>')
        print('                 <[2] in-file: input corpus with POS>')
        print('                 <[3] out-file: splitter table>')
        print('                 <[4] float: mean of #words>')
        return

    fname_model = sys.argv[1]
    fname_in = sys.argv[2]
    fname_sp = sys.argv[3]
    mu = float(sys.argv[4])

    try:
        model = loadmodel(fname_model, mu)
    except Exception as ex:
        # loadmodel's failure modes are not visible from here, so the
        # message names the operation and only hints at the mu cause.  The
        # trailing newline keeps the shell prompt off the error line.
        sys.stderr.write(
            'ERROR: failed to load model (mu may be too small): %s\n' % ex)
        return 1

    print(model)

    corpus_in = [[tuple(x.split('_')) for x in inp]
                 for inp in dlmread(fname_in, ' ')]

    with open(fname_sp, 'w', encoding='utf-8') as fp:
        for i, inp in enumerate(corpus_in):
            # Walk adjacent token pairs; j is the index of the left token.
            for j, (left, right) in enumerate(zip(inp, inp[1:])):
                if (left[1], right[1]) in model:
                    fp.write('%d %d\n' % (i, j))
Exemplo n.º 6
0
def main():
    """Convert per-position LIBLINEAR predictions into a splitter table.

    ``sys.argv`` must carry three arguments: the prediction file, the input
    corpus and the output splitter table.  Otherwise a usage message is
    printed and the function returns.

    The corpus is loaded with ``dlmread`` (delimiter ``' '``).  Each line of
    the prediction file is consumed in order and paired with the next
    position that is not the last word of a non-empty sentence.  When the
    predicted label is 1, ``"<sentence index> <position>"`` is appended to
    the output file.
    """
    if len(sys.argv) != 4:
        for usage_line in (
                'USAGE: python3 predicttosplit.py \\',
                '               <in-file: LIBLINEAR prediction> \\',
                '               <in-file: input corpus> \\',
                '               <out-file: splitter table>'):
            print(usage_line)
        return

    fname_predict, fname_in, fname_splitter = sys.argv[1:4]

    corpus_in = list(dlmread(fname_in, ' '))

    with codecs.open(fname_predict, 'r', 'utf-8') as fp_pred, \
            codecs.open(fname_splitter, 'w', 'utf-8') as fp_sp:
        line = 0
        pos = 0
        for prediction in fp_pred:
            sentence = corpus_in[line]
            # Move to the next sentence while the current one is empty or
            # only its last word remains.
            while not sentence or pos == len(sentence) - 1:
                line += 1
                pos = 0
                sentence = corpus_in[line]
            if int(prediction.strip()) == 1:
                fp_sp.write('%d %d\n' % (line, pos))
            pos += 1
Exemplo n.º 7
0
def main():
    """Print recall, precision and F of label 1 for LIBLINEAR predictions.

    Command-line arguments (read from ``sys.argv``):
        1. LIBLINEAR input file (reference labels in the first field)
        2. LIBLINEAR prediction result (predicted labels in the first field)

    Rows of both files are read with ``dlmread`` and paired in order.  A row
    counts as a split when its first field, converted to ``int``, equals 1.
    Scores with a zero denominator are printed as 0.

    Prints a usage message and returns when the argument count is wrong.
    """
    if len(sys.argv) != 3:
        print('USAGE: python3 evalliblin.py \\')
        print('               <in-file: LIBLINEAR input file>')
        print('               <in-file: LIBLINEAR prediction result>')
        return

    fname_ref, fname_hyp = sys.argv[1], sys.argv[2]

    sp_ref = sp_hyp = sp_match = 0
    for ls_ref, ls_hyp in zip(dlmread(fname_ref), dlmread(fname_hyp)):
        ref_is_split = int(ls_ref[0]) == 1
        hyp_is_split = int(ls_hyp[0]) == 1
        # Booleans add as 0/1, so each counter grows only on a split.
        sp_ref += ref_is_split
        sp_hyp += hyp_is_split
        sp_match += ref_is_split and hyp_is_split

    recall = sp_match / sp_ref if sp_ref != 0 else 0
    precision = sp_match / sp_hyp if sp_hyp != 0 else 0
    total = sp_ref + sp_hyp
    f_score = 2 * sp_match / total if total != 0 else 0

    print('LIBLIN-R: %f [%d/%d]' % (recall, sp_match, sp_ref))
    print('LIBLIN-P: %f [%d/%d]' % (precision, sp_match, sp_hyp))
    print('LIBLIN-F: %f [2*%d/(%d+%d)]' % (f_score, sp_match, sp_ref, sp_hyp))
Exemplo n.º 8
0
def load(fnames):
    """Read the input corpus, the reference corpus and the part-id/MT table.

    Args:
        fnames: mapping with the keys ``'IN'``, ``'REF'``, ``'PID'`` and
            ``'PMT'``, each holding a file name.

    Returns:
        A tuple ``(corpus_in, corpus_ref, tab_pid_pmt)``.  The two corpora
        are lists of the rows ``dlmread`` yields with ``' '`` as delimiter.
        ``tab_pid_pmt`` maps a tuple of ints (one ``'PID'`` line) to the
        list of space-separated tokens on the ``'PMT'`` line at the same
        position.

    Progress and the size of each loaded object are printed to stdout.
    """
    print('--------')
    print('loading data ...')
    corpus_in = list(dlmread(fnames['IN'], ' '))
    corpus_ref = list(dlmread(fnames['REF'], ' '))

    # The PID and PMT files are read in lockstep, one line from each.
    tab_pid_pmt = {}
    with codecs.open(fnames['PID'], 'r', 'utf-8') as fp_pid, \
            codecs.open(fnames['PMT'], 'r', 'utf-8') as fp_pmt:
        for lid, lmt in zip(fp_pid, fp_pmt):
            key = tuple(int(x) for x in lid.strip().split(' '))
            tab_pid_pmt[key] = lmt.strip().split(' ')

    print('  # of input sentence     : %d' % len(corpus_in))
    # The label previously read "reterence"; the width is unchanged.
    print('  # of reference sentence : %d' % len(corpus_ref))
    print('  # of part id-mt table   : %d' % len(tab_pid_pmt))
    return corpus_in, corpus_ref, tab_pid_pmt
Exemplo n.º 9
0
def load(fnames):
    """Load input sentences, reference sentences and the part-id/MT table.

    Args:
        fnames: mapping that provides file names under the keys ``'IN'``,
            ``'REF'``, ``'PID'`` and ``'PMT'``.

    Returns:
        ``(corpus_in, corpus_ref, tab_pid_pmt)`` where both corpora are
        lists built from ``dlmread(..., ' ')`` and ``tab_pid_pmt`` is a dict
        whose keys are tuples of ints parsed from the ``'PID'`` lines and
        whose values are the token lists of the ``'PMT'`` lines at the same
        position.

    A short summary of the loaded sizes is printed to stdout.
    """
    print('--------')
    print('loading data ...')
    corpus_in = list(dlmread(fnames['IN'], ' '))
    corpus_ref = list(dlmread(fnames['REF'], ' '))

    tab_pid_pmt = {}
    with codecs.open(fnames['PID'], 'r', 'utf-8') as fp_pid, \
            codecs.open(fnames['PMT'], 'r', 'utf-8') as fp_pmt:
        # Line k of the PID file is the key for line k of the PMT file.
        for id_line, mt_line in zip(fp_pid, fp_pmt):
            part_id = tuple(int(tok) for tok in id_line.strip().split(' '))
            part_mt = mt_line.strip().split(' ')
            tab_pid_pmt[part_id] = part_mt

    print('  # of input sentence     : %d' % len(corpus_in))
    # Spelling of "reference" corrected; column alignment is unchanged.
    print('  # of reference sentence : %d' % len(corpus_ref))
    print('  # of part id-mt table   : %d' % len(tab_pid_pmt))
    return corpus_in, corpus_ref, tab_pid_pmt
Exemplo n.º 10
0
def main():
    """Evaluate LIBLINEAR predictions against the labels of its input file.

    ``sys.argv`` must carry two arguments: the LIBLINEAR input file and the
    prediction result.  Otherwise a usage message is printed and the
    function returns.

    Both files are read with ``dlmread`` and their rows paired in order.
    The first field of each row is converted to ``int``; the value 1 marks a
    split.  Recall, precision and F for label 1 are printed as
    ``LIBLIN-R``, ``LIBLIN-P`` and ``LIBLIN-F``, using 0 whenever the
    denominator is zero.
    """
    if len(sys.argv) != 3:
        for usage_line in (
                'USAGE: python3 evalliblin.py \\',
                '               <in-file: LIBLINEAR input file>',
                '               <in-file: LIBLINEAR prediction result>'):
            print(usage_line)
        return

    fname_ref = sys.argv[1]
    fname_hyp = sys.argv[2]

    # tally[r][h]: number of rows with (reference is 1) == r and
    # (hypothesis is 1) == h; booleans index as 0/1.
    tally = [[0, 0], [0, 0]]
    for ls_ref, ls_hyp in zip(dlmread(fname_ref), dlmread(fname_hyp)):
        ref_flag = int(ls_ref[0]) == 1
        hyp_flag = int(ls_hyp[0]) == 1
        tally[ref_flag][hyp_flag] += 1

    sp_match = tally[True][True]
    sp_ref = sp_match + tally[True][False]
    sp_hyp = sp_match + tally[False][True]

    if sp_ref == 0:
        recall = 0
    else:
        recall = sp_match / sp_ref
    if sp_hyp == 0:
        precision = 0
    else:
        precision = sp_match / sp_hyp
    if sp_ref + sp_hyp == 0:
        f_score = 0
    else:
        f_score = 2 * sp_match / (sp_ref + sp_hyp)

    print('LIBLIN-R: %f [%d/%d]' % (recall, sp_match, sp_ref))
    print('LIBLIN-P: %f [%d/%d]' % (precision, sp_match, sp_hyp))
    print('LIBLIN-F: %f [2*%d/(%d+%d)]' % (f_score, sp_match, sp_ref, sp_hyp))
Exemplo n.º 11
0
def load(fnames):
    """Load the input corpus plus the part-id/MT and splitting tables.

    Args:
        fnames: mapping that provides file names under the keys ``'IN'``,
            ``'PID'``, ``'PMT'`` and ``'SP'``.

    Returns:
        ``(corpus_in, tab_pid_pmt, tab_sp)``.  ``corpus_in`` is the list of
        rows from ``dlmread(fnames['IN'], ' ')``.  ``tab_pid_pmt`` maps the
        int tuple parsed from each ``'PID'`` line to the token list of the
        ``'PMT'`` line at the same position.  ``tab_sp`` is a defaultdict
        that groups the second int of every ``'SP'`` line under the first.

    A short summary of the loaded sizes is printed to stdout.
    """
    print('--------')
    print('loading data ...')
    corpus_in = list(dlmread(fnames['IN'], ' '))

    tab_pid_pmt = {}
    with codecs.open(fnames['PID'], 'r', 'utf-8') as fp_pid, \
            codecs.open(fnames['PMT'], 'r', 'utf-8') as fp_pmt:
        # Line k of the PID file is the key for line k of the PMT file.
        for pid_text, pmt_text in zip(fp_pid, fp_pmt):
            ids = [int(field) for field in pid_text.strip().split(' ')]
            tokens = pmt_text.strip().split(' ')
            tab_pid_pmt[tuple(ids)] = tokens

    tab_sp = defaultdict(list)
    with codecs.open(fnames['SP'], 'r', 'utf-8') as fp:
        for entry in fp:
            numbers = tuple(int(field) for field in entry.strip().split(' '))
            lineno, wordno = numbers
            tab_sp[lineno].append(wordno)

    total_splits = 0
    for positions in tab_sp.values():
        total_splits += len(positions)

    print('  # of input sentence     : %d' % len(corpus_in))
    print('  # of part id-mt table   : %d' % len(tab_pid_pmt))
    print('  # of splitting table    : %d' % total_splits)
    return corpus_in, tab_pid_pmt, tab_sp
Exemplo n.º 12
0
def main():
    """Build LIBLINEAR input data from a POS-tagged corpus and a splitter table.

    Command-line arguments (read from ``sys.argv``):
        1. mode, ``"dev"`` or ``"test"``
        2. input sentences; every token is split on ``'_'`` and field 0 is
           used for the ``WORD`` features, field 1 for the ``POS`` features
        3. splitter table with two space-separated ints per line
        4. feature ID table: written in ``dev`` mode, read in ``test`` mode
        5. output file with the LIBLINEAR data

    One output line is produced for every token of a sentence except the
    last.  The line starts with label ``1`` when the token position is
    listed in the splitter table for that sentence and ``2`` otherwise,
    followed by the sorted feature IDs as ``"<id>:1"`` entries.  Feature
    IDs are collected through ``addfeature``.

    Prints a usage message (wrong argument count) or an error on stderr
    (unknown mode) and returns without doing any work.
    """
    if len(sys.argv) != 6:
        print('USAGE: python3 makeliblin_greedy.py \\')
        print('               <str: mode ("dev" or "test")>')
        print('               <in-file: input sentence with POS> \\')
        print('               <in-file: splitter table> \\')
        print(
            '               <(dev)out-file, (test)in-file: feature ID table> \\'
        )
        print('               <out-file: LIBLINEAR input data>')
        return

    mode, fname_pos, fname_splitter, fname_fid, fname_liblin = sys.argv[1:]

    if mode not in ('dev', 'test'):
        sys.stderr.write('ERROR: unknown mode.\n')
        return

    # Tokens become [field, field, ...] lists after splitting on '_'.
    corpus_in_pos = [[token.split('_') for token in sentence]
                     for sentence in dlmread(fname_pos, ' ')]

    # Splitter table: first int of a line -> list of second ints.
    tab_sp = defaultdict(list)
    with open(fname_splitter, 'r', encoding='utf-8') as fp:
        for row in fp:
            lineno, wordno = [int(field) for field in row.strip().split(' ')]
            tab_sp[lineno].append(wordno)

    # A missing key receives the next free ID (current size + 1).
    fid = defaultdict(lambda: len(fid) + 1)
    if mode == 'test':
        with open(fname_fid, 'r', encoding='utf-8') as fp:
            for row in fp:
                fields = row.split()
                feature_name = fields[0]
                fid[feature_name] = int(fields[1])

    # (format string, token field, offsets relative to the current token).
    # The order is significant: it is the order in which addfeature sees
    # the features.  No feature is generated for offset -2.
    templates = (
        # unigram words
        ('WORD[-1]=%s', 0, (-1,)),
        ('WORD[0]=%s', 0, (0,)),
        ('WORD[+1]=%s', 0, (1,)),
        ('WORD[+2]=%s', 0, (2,)),
        # unigram POSes
        ('POS[-1]=%s', 1, (-1,)),
        ('POS[0]=%s', 1, (0,)),
        ('POS[+1]=%s', 1, (1,)),
        ('POS[+2]=%s', 1, (2,)),
        # bigram words
        ('WORD[-1:0]=%s_%s', 0, (-1, 0)),
        ('WORD[0:+1]=%s_%s', 0, (0, 1)),
        ('WORD[+1:+2]=%s_%s', 0, (1, 2)),
        # bigram POSes
        ('POS[-1:0]=%s_%s', 1, (-1, 0)),
        ('POS[0:+1]=%s_%s', 1, (0, 1)),
        ('POS[+1:+2]=%s_%s', 1, (1, 2)),
        # trigram words
        ('WORD[-1:+1]=%s_%s_%s', 0, (-1, 0, 1)),
        ('WORD[0:+2]=%s_%s_%s', 0, (0, 1, 2)),
        # trigram POSes
        ('POS[-1:+1]=%s_%s_%s', 1, (-1, 0, 1)),
        ('POS[0:+2]=%s_%s_%s', 1, (0, 1, 2)),
    )

    # Two padding tokens on each side keep every offset inside the list.
    head_pad = [['<s>', '<s>']] * 2
    tail_pad = [['</s>', '</s>']] * 2

    with open(fname_liblin, 'w', encoding='utf-8') as fp:
        for i, sentence in enumerate(corpus_in_pos):
            data = head_pad + sentence + tail_pad
            # len(data) - 5 == len(sentence) - 1: the last token is skipped.
            for j in range(len(data) - 5):
                jj = j + 2
                features = []
                for fmt, field, offsets in templates:
                    parts = tuple(data[jj + off][field] for off in offsets)
                    addfeature(features, fid, fmt % parts, mode)

                label = '1' if j in tab_sp[i] else '2'
                cells = ['%d:1' % f for f in sorted(features)]
                fp.write(label + ' ' + ' '.join(cells) + '\n')

    # In dev mode, persist the feature ID table as "<name>\t<id>" lines.
    if mode == 'dev':
        with open(fname_fid, 'w', encoding='utf-8') as fp:
            for name, ident in fid.items():
                fp.write('%s\t%d\n' % (name, ident))