Exemplo n.º 1
0
def main(argv):

	parameters = Parameters(argv)
	hostname = parameters.getHostname()
	port = parameters.getPort()
	dbname = parameters.getDBName()
	language_1, language_2 = parameters.getLanguage()
	collection = parameters.getCollection()
	filexml_1 = parameters.getInputFile_1()
	filexml_2 = parameters.getInputFile_2()
	type_corpus = parameters.getType()

	print 'Using parameters of configuration: '
	print '- Host : ',hostname
	print '- Port : ',port
	print '- Coll : ',collection
	print '- DBase: ',dbname
	print '- XML1 : ',filexml_1
	print '- XML2 : ',filexml_2

	database = Mongo(hostname, dbname, collection)	

	dic_content_1 = OrderedDict()
	parserxml_1 = XML(filexml_1, language_1)
	dic_content_1 = parserxml_1.getContent()
	size_1 = len(dic_content_1)
	del parserxml_1

	dic_content_2 = OrderedDict()
	parserxml_2 = XML(filexml_2, language_2)
	dic_content_2 = parserxml_2.getContent()
	size_2 = len(dic_content_2)
	del parserxml_2
	
	counter = 1
	if size_1 == size_2:
		#As both files come from WebAligner, they must have the same number of documents
		for id_order in dic_content_1:
			id_file_1 = dic_content_1[id_order]['id_file']
			language_1 = dic_content_1[id_order]['language']
			content_1 = dic_content_1[id_order]['content']
			
			id_file_2 = dic_content_2[id_order]['id_file']
			language_2 = dic_content_2[id_order]['language']
			content_2 = dic_content_2[id_order]['content']

			if database.exists(language_1, id_file_1):
				if not database.exists(language_2, id_file_2):
					database.insertInExisting(language_1, id_file_1, language_2, id_file_2, content_2)
			else:
				if database.exists(language_2, id_file_2):
					database.insertInExisting(language_2, id_file_2, language_1, id_file_1, content_1)
				else:
					database.insertNewData(language_1, id_file_1, content_1, language_2, id_file_2, content_2, type_corpus, counter)
			counter += 1
	else:
		#Files have different number of documents, so they are not aligned
		print '\nError: Files not aligned. Please align them with WebAligner.'
Exemplo n.º 2
0
def main(argv):
	parameters = Parameters(argv)
	hostname = parameters.getHostname()
	port = parameters.getPort()
	dbname = parameters.getDBName()
	language_1, language_2 = parameters.getLanguage()
	collection = parameters.getCollection()
	fileinput_1 = parameters.getInputFile_1()
	fileinput_2 = parameters.getInputFile_2()
	type_corpus = parameters.getType()

	print 'Using parameters of configuration: '
	print '- Host : ',hostname
	print '- Port : ',port
	print '- Coll : ',collection
	print '- DBase: ',dbname
	print '- File1: ',fileinput_1
	print '- File2: ',fileinput_2

	database = Mongo(hostname, dbname, collection)	

	id_file_1 = (fileinput_1.split('/'))[-1]
	id_file_2 = (fileinput_2.split('/'))[-1]

	try:
		file_1 = codecs.open(fileinput_1, 'r', 'utf-8')
	except IOError:
		print 'ERROR: System cannot open the '+fileinput_1+' file'
		sys.exit(2)
	try:
		file_2 = codecs.open(fileinput_2, 'r', 'utf-8')
	except IOError:
		print 'ERROR: System cannot open the '+fileinput_2+' file'
		sys.exit(2)
	
	#Sentences indexed by the number of the line : number_line = _id (sentence)
	line_number = 1
	lines_2 = file_2.readlines()

	for counter, content_1 in enumerate(file_1):
		content_2 = lines_2[counter]

		if not database.exists(language_1, id_file_1) and not database.exists(language_2, id_file_2):
			database.insertNewData(language_1, id_file_1, content_1, language_2, id_file_2, content_2, type_corpus, line_number)
		else:
			if database.existsSentence(language_1, id_file_1, line_number):
				if not database.existsSentence(language_2, id_file_2, line_number):
					database.insertInExistingSentence(language_1, id_file_1, language_2, id_file_2, content_2, line_number)
			else:
				if database.existsSentence(language_2, id_file_2, line_number):
					database.insertInExistingSentence(language_2, id_file_2, language_1, id_file_1, content_1, line_number)
				else:
					database.insertNewSentence(language_1, id_file_1, content_1, language_2, id_file_2, content_2, line_number)
		if (line_number % 1000 == 0):
			print 'Indexing line: ',line_number
		line_number += 1
Exemplo n.º 3
0
def main(argv):
	parameters = Parameters(argv)
	hostname = parameters.getHostname()
	port = parameters.getPort()
	dbname = parameters.getDBName()
	language_1, language_2 = parameters.getLanguage()
	collection = parameters.getCollection()
	input_folder = parameters.getInputFolder()
	type_corpus = parameters.getType()

	print 'Using parameters of configuration: '
	print '- Host : ',hostname
	print '- Port : ',port
	print '- Coll : ',collection
	print '- DBase: ',dbname
	print '- Input: ',input_folder

	database = Mongo(hostname, dbname, collection)	

	try:
		root, dirs, files = os.walk(input_folder+''+language_1+'/').next()[:3]
	except IOError:
		print 'ERROR: It was not possible to open the '+input_folder+'en/ folder'
		sys.exit(2)
		
	for corpus_file in files:
		#if (corpus_file ~ "/~/$"):
		if not '.txt~' in corpus_file:
			print 'Working on file: '+corpus_file
			id_file_1 = language_1+'_'+corpus_file[0:-4]
			id_file_2 = language_2+'_'+corpus_file[0:-4]

			try:
				file_1 = codecs.open(input_folder+''+language_1+'/'+corpus_file, 'r', 'utf-8')
			except IOError:
				print 'ERROR: System cannot open the '+root+''+corpus_file+' file'
				sys.exit(2)
			try:
				file_2 = codecs.open(input_folder+''+language_2+'/'+corpus_file, 'r', 'utf-8')
			except IOError:
				print 'ERROR: System cannot open the '+root+'../'+language_2+'/'+corpus_file+' file'
				sys.exit(2)
	
			#Sentences indexed by the number of the line : number_line = _id (sentence)
			line_number = 1
			lines_2 = file_2.readlines()
			content_1 = ''
			content_2 = ''
			for counter, line in enumerate(file_1):
				if re.match('(^<)', line):
					if content_1 != '' and content_2 != '':
						if not database.exists(language_1, id_file_1) and not database.exists(language_2, id_file_2):
							database.insertNewData(language_1, id_file_1, content_1, language_2, id_file_2, content_2, type_corpus, line_number)
						else:
							if database.existsSentence(language_1, id_file_1, line_number):
								if not database.existsSentence(language_2, id_file_2, line_number):
									database.insertInExistingSentence(language_1, id_file_1, language_2, id_file_2, content_2, line_number)
							else:
								if database.existsSentence(language_2, id_file_2, line_number):
									database.insertInExistingSentence(language_2, id_file_2, language_1, id_file_1, content_1, line_number)
								else:
									database.insertNewSentence(language_1, id_file_1, content_1, language_2, id_file_2, content_2, line_number)
						line_number += 1
						content_1 = ''
						content_2 = ''
					if (line_number % 100 == 0):
						print 'Indexing line: ',line_number
				else:
					content_1 += line
					content_2 += lines_2[counter]
	
		file_1.close()
		file_2.close()
Exemplo n.º 4
0
def main(argv):
	parameters = Parameters(argv)
	hostname = parameters.getHostname()
	port = parameters.getPort()
	dbname = parameters.getDBName()
	language_1, language_2 = parameters.getLanguage()
	collection = parameters.getCollection()
	input_folder = parameters.getInputFolder()
	type_corpus = parameters.getType()

	print 'Using parameters of configuration: '
	print '- Host : ',hostname
	print '- Port : ',port
	print '- Coll : ',collection
	print '- DBase: ',dbname
	print '- Input: ',input_folder

	database = Mongo(hostname, dbname, collection)	
	
	try:
		root, dirs, files = os.walk(input_folder).next()[:3]
	except IOError:
		print 'ERROR: It was not possible to open the '+input_folder+' folder'
		sys.exit(2)
		
	name_folder = (input_folder.split('/'))[-2]
	dic_files = {}
	for corpus_file in files:
		print 'Working on file: '+corpus_file
		if not re.match('~$', corpus_file):
			id_file = corpus_file[0:-7]
			language = corpus_file[-6:-4]
			if not dic_files.has_key(id_file):
				dic_files[id_file] = {'language_1': language}
			else:
				dic_files[id_file]['language_2'] = language

	counter = 1
	for filename in dic_files:
		language_1 = dic_files[filename]['language_1']
		language_2 = dic_files[filename]['language_2']
		id_file_1 = name_folder+'_'+filename+'_'+language_1
		id_file_2 = name_folder+'_'+filename+'_'+language_2

		try:
			file_1 = codecs.open(input_folder+''+filename+'_'+language_1+'.snt', 'r', 'utf-8')
		except IOError:
			print 'ERROR: System cannot open the '+input_folder+''+filename+'_'+language_1+'.snt file'
			sys.exit(2)
		try:
			file_2 = codecs.open(input_folder+''+filename+'_'+language_2+'.snt', 'r', 'utf-8')
		except IOError:
			print 'ERROR: System cannot open the '+input_folder+''+filename+'_'+language_2+'.snt file'
			sys.exit(2)
		
		content_1 = ''
		for line in file_1:
			#if line.strip():
			content_1 += line

		content_2 = ''
		for line in file_2:
			#if line.strip():
			content_2 += line

		if database.exists(language_1, id_file_1):
			if not database.exists(language_2, id_file_2):
				database.insertInExisting(language_1, id_file_1, language_2, id_file_2, content_2)
		else:
			if database.exists(language_2, id_file_2):
				database.insertInExisting(language_2, id_file_2, language_1, id_file_1, content_1)
			else:
				database.insertNewData(language_1, id_file_1, content_1, language_2, id_file_2, content_2, type_corpus, counter)
		counter += 1