def main(argv):
    """Index two aligned XML corpus files, document by document.

    Reads the run configuration from ``Parameters(argv)``, parses both XML
    inputs with ``XML(...).getContent()`` and, for every document id found in
    the first file, stores the matching pair of documents through the
    ``Mongo`` wrapper:

    * first document already stored, second missing -> attach the second;
    * second document already stored, first missing -> attach the first;
    * neither stored -> insert both as new data;
    * both stored -> nothing to do.

    Args:
        argv: Command-line arguments, passed unchanged to ``Parameters``.

    Raises:
        SystemExit: With status 2 when the two files are not aligned, i.e.
            they hold a different number of documents or a document id of
            the first file is absent from the second one.
    """
    # Imported locally so the block does not rely on a module-level
    # ``import sys`` that is not visible from here.
    import sys

    parameters = Parameters(argv)
    hostname = parameters.getHostname()
    port = parameters.getPort()
    dbname = parameters.getDBName()
    language_1, language_2 = parameters.getLanguage()
    collection = parameters.getCollection()
    filexml_1 = parameters.getInputFile_1()
    filexml_2 = parameters.getInputFile_2()
    type_corpus = parameters.getType()

    # Single-argument print calls give the same output on Python 2 and 3.
    print('Using parameters of configuration: ')
    print('- Host :  %s' % (hostname,))
    print('- Port :  %s' % (port,))
    print('- Coll :  %s' % (collection,))
    print('- DBase:  %s' % (dbname,))
    print('- XML1 :  %s' % (filexml_1,))
    print('- XML2 :  %s' % (filexml_2,))

    # NOTE(review): ``port`` is only echoed above; ``Mongo`` is built
    # without it -- confirm that is intended.
    database = Mongo(hostname, dbname, collection)

    parserxml_1 = XML(filexml_1, language_1)
    dic_content_1 = parserxml_1.getContent()
    del parserxml_1  # the parser is not needed once its content is extracted

    parserxml_2 = XML(filexml_2, language_2)
    dic_content_2 = parserxml_2.getContent()
    del parserxml_2

    # As both files come from WebAligner, they must have the same number of
    # documents.  The ids are checked as well, before anything is written,
    # so a mismatch cannot abort the run half-way with a KeyError.
    # Presumably getContent() returns a mapping keyed by document order
    # (it is indexed by the keys it iterates over) -- TODO confirm.
    aligned = (len(dic_content_1) == len(dic_content_2)
               and all(id_order in dic_content_2
                       for id_order in dic_content_1))
    if not aligned:
        print('\nError: Files not aligned. Please align them with WebAligner.')
        sys.exit(2)

    # ``counter`` is the 1-based position of the document pair; it is only
    # handed to insertNewData().
    for counter, id_order in enumerate(dic_content_1, 1):
        entry_1 = dic_content_1[id_order]
        entry_2 = dic_content_2[id_order]

        id_file_1 = entry_1['id_file']
        language_1 = entry_1['language']
        content_1 = entry_1['content']

        id_file_2 = entry_2['id_file']
        language_2 = entry_2['language']
        content_2 = entry_2['content']

        if database.exists(language_1, id_file_1):
            if not database.exists(language_2, id_file_2):
                database.insertInExisting(language_1, id_file_1,
                                          language_2, id_file_2, content_2)
        elif database.exists(language_2, id_file_2):
            database.insertInExisting(language_2, id_file_2,
                                      language_1, id_file_1, content_1)
        else:
            database.insertNewData(language_1, id_file_1, content_1,
                                   language_2, id_file_2, content_2,
                                   type_corpus, counter)
def _store_line_pair(database, language_1, id_file_1, content_1,
                     language_2, id_file_2, content_2,
                     type_corpus, line_number):
    """Store one aligned pair of lines, adding only what is still missing."""
    if (not database.exists(language_1, id_file_1)
            and not database.exists(language_2, id_file_2)):
        # Neither file is known yet: create the entry with this first pair.
        database.insertNewData(language_1, id_file_1, content_1,
                               language_2, id_file_2, content_2,
                               type_corpus, line_number)
    elif database.existsSentence(language_1, id_file_1, line_number):
        if not database.existsSentence(language_2, id_file_2, line_number):
            database.insertInExistingSentence(language_1, id_file_1,
                                              language_2, id_file_2,
                                              content_2, line_number)
    elif database.existsSentence(language_2, id_file_2, line_number):
        database.insertInExistingSentence(language_2, id_file_2,
                                          language_1, id_file_1,
                                          content_1, line_number)
    else:
        database.insertNewSentence(language_1, id_file_1, content_1,
                                   language_2, id_file_2, content_2,
                                   line_number)


def main(argv):
    """Index two line-aligned UTF-8 text files, one sentence per line.

    Reads the run configuration from ``Parameters(argv)``, opens both input
    files and stores line *n* of the first file together with line *n* of
    the second one through the ``Mongo`` wrapper.  The 1-based line number
    is used as the sentence identifier, and the file identifier is the last
    ``/``-separated component of each input path.

    Args:
        argv: Command-line arguments, passed unchanged to ``Parameters``.

    Raises:
        SystemExit: With status 2 when an input file cannot be opened or
            when the second file has fewer lines than the first.
    """
    parameters = Parameters(argv)
    hostname = parameters.getHostname()
    port = parameters.getPort()
    dbname = parameters.getDBName()
    language_1, language_2 = parameters.getLanguage()
    collection = parameters.getCollection()
    fileinput_1 = parameters.getInputFile_1()
    fileinput_2 = parameters.getInputFile_2()
    type_corpus = parameters.getType()

    # Single-argument print calls give the same output on Python 2 and 3.
    print('Using parameters of configuration: ')
    print('- Host :  %s' % (hostname,))
    print('- Port :  %s' % (port,))
    print('- Coll :  %s' % (collection,))
    print('- DBase:  %s' % (dbname,))
    print('- File1:  %s' % (fileinput_1,))
    print('- File2:  %s' % (fileinput_2,))

    # NOTE(review): ``port`` is only echoed above; ``Mongo`` is built
    # without it -- confirm that is intended.
    database = Mongo(hostname, dbname, collection)

    id_file_1 = fileinput_1.split('/')[-1]
    id_file_2 = fileinput_2.split('/')[-1]

    try:
        file_1 = codecs.open(fileinput_1, 'r', 'utf-8')
    except IOError:
        print('ERROR: System cannot open the ' + fileinput_1 + ' file')
        sys.exit(2)
    try:
        file_2 = codecs.open(fileinput_2, 'r', 'utf-8')
    except IOError:
        file_1.close()
        print('ERROR: System cannot open the ' + fileinput_2 + ' file')
        sys.exit(2)

    # try/finally guarantees both handles are released, including when the
    # run stops early through sys.exit() or a database error.
    try:
        lines_2 = file_2.readlines()
        # Sentences indexed by the number of the line:
        # number_line = _id (sentence)
        for line_number, content_1 in enumerate(file_1, 1):
            if line_number > len(lines_2):
                # Report the misalignment instead of a bare IndexError.
                print('ERROR: The ' + fileinput_2 + ' file has fewer lines '
                      'than the ' + fileinput_1 + ' file')
                sys.exit(2)
            content_2 = lines_2[line_number - 1]

            _store_line_pair(database, language_1, id_file_1, content_1,
                             language_2, id_file_2, content_2,
                             type_corpus, line_number)

            if line_number % 1000 == 0:
                print('Indexing line:  %s' % (line_number,))
    finally:
        file_1.close()
        file_2.close()
def _store_segment_pair(database, language_1, id_file_1, content_1,
                        language_2, id_file_2, content_2,
                        type_corpus, line_number):
    """Store one aligned pair of segments, adding only what is missing."""
    if (not database.exists(language_1, id_file_1)
            and not database.exists(language_2, id_file_2)):
        # Neither file is known yet: create the entry with this first pair.
        database.insertNewData(language_1, id_file_1, content_1,
                               language_2, id_file_2, content_2,
                               type_corpus, line_number)
    elif database.existsSentence(language_1, id_file_1, line_number):
        if not database.existsSentence(language_2, id_file_2, line_number):
            database.insertInExistingSentence(language_1, id_file_1,
                                              language_2, id_file_2,
                                              content_2, line_number)
    elif database.existsSentence(language_2, id_file_2, line_number):
        database.insertInExistingSentence(language_2, id_file_2,
                                          language_1, id_file_1,
                                          content_1, line_number)
    else:
        database.insertNewSentence(language_1, id_file_1, content_1,
                                   language_2, id_file_2, content_2,
                                   line_number)


def main(argv):
    """Index a folder of parallel text files split into tagged segments.

    Reads the run configuration from ``Parameters(argv)``.  Every file
    listed in ``<input_folder><language_1>/`` is paired with the file of the
    same name in ``<input_folder><language_2>/``.  Paths are built by plain
    concatenation, so ``input_folder`` must end with ``/``.

    Within a file, a line starting with ``<`` closes the current segment:
    the lines accumulated since the previous such line (and the lines at the
    same positions in the second file) are stored as one pair, identified by
    a 1-based running segment number.

    Args:
        argv: Command-line arguments, passed unchanged to ``Parameters``.

    Raises:
        SystemExit: With status 2 when the first-language folder cannot be
            listed, when a file cannot be opened, or when the second file
            has fewer lines than the first.
    """
    parameters = Parameters(argv)
    hostname = parameters.getHostname()
    port = parameters.getPort()
    dbname = parameters.getDBName()
    language_1, language_2 = parameters.getLanguage()
    collection = parameters.getCollection()
    input_folder = parameters.getInputFolder()
    type_corpus = parameters.getType()

    # Single-argument print calls give the same output on Python 2 and 3.
    print('Using parameters of configuration: ')
    print('- Host :  %s' % (hostname,))
    print('- Port :  %s' % (port,))
    print('- Coll :  %s' % (collection,))
    print('- DBase:  %s' % (dbname,))
    print('- Input:  %s' % (input_folder,))

    # NOTE(review): ``port`` is only echoed above; ``Mongo`` is built
    # without it -- confirm that is intended.
    database = Mongo(hostname, dbname, collection)

    folder_1 = input_folder + language_1 + '/'
    folder_2 = input_folder + language_2 + '/'
    try:
        # os.walk() ignores listing errors by default, so an unreadable or
        # missing folder shows up as an exhausted generator (StopIteration),
        # not as an IOError.
        root, _, files = next(os.walk(folder_1))
    except (StopIteration, IOError):
        print('ERROR: It was not possible to open the ' + folder_1 +
              ' folder')
        sys.exit(2)

    for corpus_file in files:
        if '.txt~' in corpus_file:
            continue  # editor backup copy, not corpus data

        print('Working on file: ' + corpus_file)
        # Strip the 4-character extension to build the identifiers.
        id_file_1 = language_1 + '_' + corpus_file[0:-4]
        id_file_2 = language_2 + '_' + corpus_file[0:-4]

        try:
            file_1 = codecs.open(folder_1 + corpus_file, 'r', 'utf-8')
        except IOError:
            print('ERROR: System cannot open the ' + root + corpus_file +
                  ' file')
            sys.exit(2)
        try:
            file_2 = codecs.open(folder_2 + corpus_file, 'r', 'utf-8')
        except IOError:
            file_1.close()
            print('ERROR: System cannot open the ' + root + '../' +
                  language_2 + '/' + corpus_file + ' file')
            sys.exit(2)

        # try/finally releases both handles even when the run stops early.
        try:
            lines_2 = file_2.readlines()
            # Sentences indexed by the number of the line:
            # number_line = _id (sentence)
            line_number = 1
            content_1 = ''
            content_2 = ''
            for index, line in enumerate(file_1):
                if not line.startswith('<'):
                    if index >= len(lines_2):
                        # Report the misalignment instead of an IndexError.
                        print('ERROR: The ' + folder_2 + corpus_file +
                              ' file has fewer lines than the ' + folder_1 +
                              corpus_file + ' file')
                        sys.exit(2)
                    content_1 += line
                    content_2 += lines_2[index]
                    continue

                # A tag line ends the current segment; store it only when
                # both sides actually collected some text.
                if content_1 != '' and content_2 != '':
                    _store_segment_pair(database,
                                        language_1, id_file_1, content_1,
                                        language_2, id_file_2, content_2,
                                        type_corpus, line_number)
                    line_number += 1
                    content_1 = ''
                    content_2 = ''
                    if line_number % 100 == 0:
                        print('Indexing line:  %s' % (line_number,))
            # NOTE(review): text collected after the last tag line is never
            # stored -- confirm every input file ends with a tag line.
        finally:
            file_1.close()
            file_2.close()
def main(argv):
    """Index a folder of paired ``<name>_<lang>.snt`` files as documents.

    Reads the run configuration from ``Parameters(argv)`` and lists
    ``input_folder``.  File names are split positionally: the last four
    characters are the extension, the two before them the language code, and
    everything before the separating character is the document name.  The
    first language seen for a name becomes ``language_1`` and a later one
    ``language_2``.  Each pair is read in full and stored through the
    ``Mongo`` wrapper, adding only the side that is still missing.

    Paths are built by plain concatenation and the folder name is taken from
    the second-to-last ``/``-separated component, so ``input_folder`` must
    end with ``/``.

    Args:
        argv: Command-line arguments, passed unchanged to ``Parameters``.

    Raises:
        SystemExit: With status 2 when the folder cannot be listed, when a
            document has no second-language file, or when a file cannot be
            opened.
    """
    parameters = Parameters(argv)
    hostname = parameters.getHostname()
    port = parameters.getPort()
    dbname = parameters.getDBName()
    # The configured languages are replaced below by the codes taken from
    # the file names; the call is kept for whatever checks it performs.
    language_1, language_2 = parameters.getLanguage()
    collection = parameters.getCollection()
    input_folder = parameters.getInputFolder()
    type_corpus = parameters.getType()

    # Single-argument print calls give the same output on Python 2 and 3.
    print('Using parameters of configuration: ')
    print('- Host :  %s' % (hostname,))
    print('- Port :  %s' % (port,))
    print('- Coll :  %s' % (collection,))
    print('- DBase:  %s' % (dbname,))
    print('- Input:  %s' % (input_folder,))

    # NOTE(review): ``port`` is only echoed above; ``Mongo`` is built
    # without it -- confirm that is intended.
    database = Mongo(hostname, dbname, collection)

    try:
        # os.walk() ignores listing errors by default, so an unreadable or
        # missing folder shows up as an exhausted generator (StopIteration),
        # not as an IOError.
        files = next(os.walk(input_folder))[2]
    except (StopIteration, IOError):
        print('ERROR: It was not possible to open the ' + input_folder +
              ' folder')
        sys.exit(2)

    name_folder = input_folder.split('/')[-2]

    dic_files = {}
    for corpus_file in files:
        print('Working on file: ' + corpus_file)
        # Skip editor backup copies.  The previous test, re.match('~$', ...),
        # is anchored at the start of the name and so never matched them.
        if corpus_file.endswith('~'):
            continue
        id_file = corpus_file[0:-7]
        language = corpus_file[-6:-4]
        if id_file not in dic_files:
            dic_files[id_file] = {'language_1': language}
        else:
            dic_files[id_file]['language_2'] = language

    # Refuse to start when a document lacks its counterpart, rather than
    # failing with a KeyError after part of the folder has been indexed.
    unpaired = sorted(name for name in dic_files
                      if 'language_2' not in dic_files[name])
    if unpaired:
        print('ERROR: No second-language file found for: ' +
              ', '.join(unpaired))
        sys.exit(2)

    counter = 1
    for filename in dic_files:
        language_1 = dic_files[filename]['language_1']
        language_2 = dic_files[filename]['language_2']
        id_file_1 = name_folder + '_' + filename + '_' + language_1
        id_file_2 = name_folder + '_' + filename + '_' + language_2
        path_1 = input_folder + filename + '_' + language_1 + '.snt'
        path_2 = input_folder + filename + '_' + language_2 + '.snt'

        try:
            file_1 = codecs.open(path_1, 'r', 'utf-8')
        except IOError:
            print('ERROR: System cannot open the ' + path_1 + ' file')
            sys.exit(2)
        try:
            file_2 = codecs.open(path_2, 'r', 'utf-8')
        except IOError:
            file_1.close()
            print('ERROR: System cannot open the ' + path_2 + ' file')
            sys.exit(2)

        # Read both documents in full, then release the handles at once.
        try:
            content_1 = ''.join(file_1)
            content_2 = ''.join(file_2)
        finally:
            file_1.close()
            file_2.close()

        if database.exists(language_1, id_file_1):
            if not database.exists(language_2, id_file_2):
                database.insertInExisting(language_1, id_file_1,
                                          language_2, id_file_2, content_2)
        elif database.exists(language_2, id_file_2):
            database.insertInExisting(language_2, id_file_2,
                                      language_1, id_file_1, content_1)
        else:
            database.insertNewData(language_1, id_file_1, content_1,
                                   language_2, id_file_2, content_2,
                                   type_corpus, counter)
        # 1-based position of the pair in iteration order; it is only
        # handed to insertNewData().
        counter += 1