Exemplo n.º 1
0
#TODO: Check that the SLF and TLF files have the same number of lines.

#Command line params
min_fms = float(args.min_fms)
max_len = int(args.max_len)

#Training data: the source-language file (SLF) and the target-language file
#(TLF) are read in lockstep, one line from each per iteration.
src_sentences, tgt_sentences = [], []

#The context manager closes both files even if preprocess() raises.
with open(args.SLF) as file1, open(args.TLF) as file2:
	for raw_src, raw_tgt in zip(file1, file2):
		line = preprocess(raw_src)
		line1 = preprocess(raw_tgt)
		#Stop at the first pair where either side is empty after preprocessing.
		#NOTE(review): if preprocess() reduces a blank line to '', reading ends
		#at that line instead of skipping it - confirm this is intended.
		if not line or not line1:
			break
		#Drop pairs whose source side has more than max_len whitespace-separated tokens.
		if len(line.split()) > max_len:
			continue
		src_sentences.append(line)
		tgt_sentences.append(line1)

#Testing file pointers (they stay open here; the code that reads and closes
#them is outside this excerpt).
file3 = open(args.SLFT)
file4 = open(args.TLFT)
Exemplo n.º 2
0
min_len = int(args.min_len)
max_len = int(args.max_len)

all_files = os.listdir(path)
files_map = {}
test_sentences = 0
fmses = []

#Corpus files have names beginning like 'xx.xx-yy.test' or 'xx.xx-yy.train'.
corpus_file_re = re.compile(r'[a-z]{2}\.[a-z]{2}\-[a-z]{2}\.(test|train)')

for file1 in all_files:
	match = corpus_file_re.match(file1)
	if match:
		print(file1)
		src_sentences = []
		#Iterating the file detects EOF on the raw line, so a line that is
		#empty after preprocessing is skipped instead of ending the read early.
		with open(os.path.join(path, file1)) as f1:
			for raw_line in f1:
				line = preprocess(raw_line)
				if not line:
					continue
				src_sentences.append(line)

		#NOTE(review): presumably needed because the FMS computation recurses
		#deeply on long sentences - confirm against the FMS implementation.
		sys.setrecursionlimit(10000)

		#Score every unordered pair of distinct sentences exactly once.
		for i, s in enumerate(src_sentences):
			for s1 in src_sentences[i + 1:]:
				fms = FMS(s, s1).calculate_using_wanger_fischer()
				fmses.append(fms)
		#Only the first matching file is processed.
		break
Exemplo n.º 3
0
from lib.utilities import preprocess, assertion, is_subsegment

#Command line interface: three positional sentences plus the language pair.
parser = argparse.ArgumentParser(description='Generates set D.')
parser.add_argument('S', help='First Sentence')
parser.add_argument('T', help='First Sentence Translation')
parser.add_argument('S1', help='Second Sentence')
parser.add_argument('LP', help='Language Pair')

#Optional params. Values arrive as strings and are converted below.
#Note: --max-len has no default, so args.max_len is None unless it is given.
parser.add_argument('-d', help='Specify the language-pair installation directory')
parser.add_argument('--min-fms', help='Minimum value of fuzzy match score of S and S1.', default='0.8')
parser.add_argument('--min-len', help='Minimum length of sub-string allowed.', default='2')
parser.add_argument('--max-len', help='Maximum length of sub-string allowed.')
args = parser.parse_args()

#Applying some preprocessing on input data.
s_sentence = preprocess(args.S)
t_sentence = preprocess(args.T)
s1_sentence = preprocess(args.S1)

#The language pair is expected as two codes joined by '-', e.g. 'en-eo'.
lp = args.LP
lps = lp.split('-')

#Testing Input data
assertion(s_sentence != "", "S should not be blank.\nSee -h for help")
assertion(s1_sentence != "", "S1 should not be blank.\nSee -h for help")
assertion(len(lps) == 2, "LP should be of type a-b, eg, 'en-eo'")

#Read optional params
lp_dir = args.d
min_fms = float(args.min_fms)
min_len = int(args.min_len)
Exemplo n.º 4
0
parser.add_argument("LP", help="Language Pair for TM (for example en-eo)")

# Optional flags and tuning parameters. Numeric options arrive as strings.
parser.add_argument("-v", help="Verbose Mode", action="store_true")
parser.add_argument("-t", help="Show patching traces", action="store_true")
parser.add_argument("-c", help="Specify the sqlite3 db to be used for caching", default="")
parser.add_argument("-d", help="Specify the language-pair installation directory")
parser.add_argument("--cam", help="Only those patches which cover all the mismatches", action="store_true")
parser.add_argument("--go", help="To patch only grounded mismatches", action="store_true")
parser.add_argument("--bo", help="Prints the best possible translation only", action="store_true")
parser.add_argument("--min-fms", help="Minimum value of fuzzy match score of S and S1.", default="0.8")
parser.add_argument("--min-len", help="Minimum length of sub-segment allowed.", default="2")
parser.add_argument("--max-len", help="Maximum length of sub-segment allowed.", default="5")
args = parser.parse_args()

# Applying some preprocessing on input data.
s_sentence = preprocess(args.S)
# NOTE(review): the TM file path also goes through preprocess(); if that
# helper alters case or punctuation the path may no longer resolve - confirm.
tmxfile = preprocess(args.TM)
# The language pair is expected as two codes joined by "-", e.g. "en-eo".
lp = args.LP
lps = lp.split("-")


# Testing Input data
assertion(s_sentence != "", "S should not be blank. See -h for help")
assertion(os.path.isfile(tmxfile), "TM does not exist")
assertion(len(lps) == 2, "LP should be of type a-b, eg, 'en-eo'")

# Read optional params
cache = args.c
lp_dir = args.d
verbose = args.v
show_traces = args.t