def saveTranslateDocx():
    start_time = int(round(time.time() * 1000))
    log.info('saveTranslateDocx: started at ' + str(start_time))
    # request.form.getlist never returns None; an empty list means the parameter is missing.
    if not request.form.getlist('basename'):
        res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
        return res.getres(), Status.ERR_GLOBAL_MISSING_PARAMETERS.value['http']['status']
    basename = request.form.getlist('basename')[0]
    current_time = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
    f = request.files['file']
    filepath = os.path.join(app.config['UPLOAD_FOLDER'], basename + '_u.docx')
    index = 0
    # Pick a filename that does not collide with an existing upload.
    while os.path.exists(filepath):
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], basename + '_' + str(index) + '_u.docx')
        index = index + 1
    f.save(filepath)
    # Return the name of the file that was actually saved.
    res = CustomResponse(Status.SUCCESS.value, os.path.basename(filepath))
    translationProcess = TranslationProcess.objects(basename=basename)
    translationProcess.update(set__translate_uploaded=True)
    log.info('saveTranslateDocx: ended at ' + str(getcurrenttime()) +
             ', total time elapsed : ' + str(getcurrenttime() - start_time))
    return res.getres()
def write_document_basename(basename):
    log.info('write_document_basename : started for ' + basename)
    with app.app_context():
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], basename + '_s.docx')
        filepath_processed = os.path.join(app.config['UPLOAD_FOLDER'], basename + '_t' + '.docx')
        xml_content = docx_helper.get_document_xml(filepath)
        xmltree = docx_helper.get_xml_tree(xml_content)
        nodes = []
        for node, text in docx_helper.itertext_old(xmltree):
            nodes.append(node)
        # Replace each tagged node's text with the translated target text, when available.
        for node in nodes:
            node_id = node.attrib['id']
            if node.text is not None and node.text.strip() != '':
                text_node = TextNode.objects(node_id=node_id, basename=basename)
                text_node_len = get_text_node_len(text_node)
                if text_node is not None and text_node_len != 0:
                    tgt_text = get_tgt_text(text_node)
                    node.text = tgt_text
        docx_helper.save_docx(filepath, xmltree, filepath_processed, None)
        translationProcess = TranslationProcess.objects(basename=basename)
        translationProcess.update(set__status=STATUS_PROCESSED, set__feedback_pending=True)
    log.info('write_document_basename : ended for ' + basename)
def fetch_translation_process():
    log.info('fetch_translation_process : started at ' + str(getcurrenttime()))
    try:
        translationProcess = TranslationProcess.objects(
            created_by=request.headers.get('ad-userid')).order_by('-basename').to_json()
        res = CustomResponse(Status.SUCCESS.value, json.loads(translationProcess))
    except Exception as e:
        log.error('fetch_translation_process : ERROR occurred : ' + str(e))
        # Without this, res would be unbound when the query fails.
        res = CustomResponse(Status.FAILURE.value, None)
    log.info('fetch_translation_process : ended at ' + str(getcurrenttime()))
    return res.getres()
def delete_process():
    log.info('delete_process : started at ' + str(getcurrenttime()))
    basename = ''
    try:
        basename = request.form.getlist('processname')[0]
        log.info('delete_process : requested basename is : ' + basename)
        TranslationProcess.objects(basename=basename).delete()
        log.info('delete_process : ended at ' + str(getcurrenttime()))
        res = CustomResponse(Status.SUCCESS.value, basename)
    except Exception as e:
        log.error('delete_process : ERROR while processing basename : ' + basename + ' : ' + str(e))
        res = CustomResponse(Status.FAILURE.value, basename)
    return res.getres()
def translate():
    pool = mp.Pool(mp.cpu_count())
    basename = str(int(time.time()))
    current_time = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
    f = request.files['file']
    filepath = os.path.join(app.config['UPLOAD_FOLDER'], basename + '.pdf')
    translationProcess = TranslationProcess(
        status=STATUS_PROCESSING, name=f.filename, created_on=current_time, basename=basename)
    translationProcess.save()
    f.save(filepath)
    # Convert the PDF to page images and capture the Hindi OCR text when the worker finishes.
    pool.apply_async(converttoimage,
                     args=(filepath, app.config['UPLOAD_FOLDER'], basename, '_hin'),
                     callback=capturetext)
    pool.close()
    pool.join()
    filtertext(app.config['UPLOAD_FOLDER'] + '/' + basename + '_hin.txt',
               app.config['UPLOAD_FOLDER'] + '/' + basename + '_hin_filtered.txt')
    processenglish(app.config['UPLOAD_FOLDER'] + '/' + basename + '_hin_filtered.txt')
    translatewithanuvadaeng(app.config['UPLOAD_FOLDER'] + '/' + basename + '_hin_filtered.txt',
                            app.config['UPLOAD_FOLDER'] + '/' + basename + '_eng_tran.txt')
    english_res = []
    hindi_res = []
    with open(app.config['UPLOAD_FOLDER'] + '/' + basename + '_eng_tran.txt', 'r') as f_eng:
        for line in f_eng:
            english_res.append(line)
    with open(app.config['UPLOAD_FOLDER'] + '/' + basename + '_hin_filtered.txt', 'r') as f_hin:
        for line in f_hin:
            hindi_res.append(line)
    data = {'hindi': hindi_res, 'english': english_res}
    translations = []
    for i in range(0, len(hindi_res)):
        translation = Translation(basename=str(basename), source=hindi_res[i], target=english_res[i])
        translations.append(translation)
    Translation.objects.insert(translations)
    # Remove the intermediate files generated for this request.
    for f in glob.glob(app.config['UPLOAD_FOLDER'] + '/' + basename + '*'):
        os.remove(f)
    res = CustomResponse(Status.SUCCESS.value, data)
    translationProcess = TranslationProcess.objects(basename=basename)
    translationProcess.update(set__status=STATUS_PROCESSED)
    return res.getres()
def write_document():
    consumer = get_consumer(TOPIC_TO_PROCESS)
    if consumer is None:
        raise Exception('Kafka consumer not available, aborting process')
    try:
        for msg in consumer:
            basename = str(msg.value)
            log.info('write_document : started for ' + basename)
            with app.app_context():
                filepath = os.path.join(app.config['UPLOAD_FOLDER'], basename + '_s.docx')
                filepath_processed = os.path.join(app.config['UPLOAD_FOLDER'], basename + '_t' + '.docx')
                xml_content = docx_helper.get_document_xml(filepath)
                xmltree = docx_helper.get_xml_tree(xml_content)
                nodes = []
                for node, text in docx_helper.itertext_old(xmltree):
                    nodes.append(node)
                for node in nodes:
                    node_id = node.attrib['id']
                    if node.text is not None and node.text.strip() != '':
                        text_node = TextNode.objects(node_id=node_id, basename=basename)
                        log.info('write_document : text_node object is == ' +
                                 str(json.loads(text_node.to_json())))
                        text_node_len = get_text_node_len(text_node)
                        log.info('write_document : text_node object len is == ' + str(text_node_len))
                        if text_node is not None and text_node_len != 0:
                            tgt_text = get_tgt_text(text_node)
                            node.text = tgt_text
                docx_helper.save_docx(filepath, xmltree, filepath_processed, None)
                translationProcess = TranslationProcess.objects(basename=basename)
                translationProcess.update(set__status=STATUS_PROCESSED)
                log.info('write_document : ended for ' + basename)
    except Exception as e:
        log.error('write_document : ERROR OCCURRED : NMT SERVER ERROR ' + str(e))
        # Restart the consumer loop so one failed message does not stop processing.
        write_document()
def translateFile():
    pool = mp.Pool(mp.cpu_count())
    basename = str(int(time.time()))
    current_time = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
    f = request.files['file']
    filepath = os.path.join(app.config['UPLOAD_FOLDER'], basename + '.pdf')
    translationProcess = TranslationProcess(
        status=STATUS_PROCESSING, name=f.filename, created_on=current_time, basename=basename)
    translationProcess.save()
    f.save(filepath)
    # Convert the PDF to page images and capture the ALTO OCR output when the worker finishes.
    pool.apply_async(converttoimage,
                     args=(filepath, app.config['UPLOAD_FOLDER'], basename, ''),
                     callback=capturealtotext)
    pool.close()
    pool.join()
    res = CustomResponse(Status.SUCCESS.value, '')
    translationProcess = TranslationProcess.objects(basename=basename)
    translationProcess.update(set__status=STATUS_PROCESSED)
    return res.getres()
def download_docx():
    log.info('download-docx: started')
    filename = request.args.get('filename')
    # request.args.get returns None when the parameter is absent, so guard both cases.
    if not filename:
        return CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value, 'filename missing').getres()
    try:
        filename_without_docx = filename.split('.docx')[0]
        n_filename = filename_without_docx.split('_')
        try:
            log.info('download-docx: finding process from basename : ' + str(n_filename[0]))
            translationProcess = TranslationProcess.objects(basename=n_filename[0])
            if translationProcess is not None:
                data = translationProcess[0]['name']
                if len(n_filename) > 1:
                    data = data.split('.docx')[0] + '_translated.docx'
                log.info('download-docx: process found for basename with name = ' + str(data))
                result = flask.send_file(os.path.join('upload/', filename), as_attachment=True,
                                         attachment_filename=data)
                result.headers["x-suggested-filename"] = data
        except Exception as e:
            log.info('download-docx: error in finding process for basename : ' + str(n_filename))
            result = flask.send_file(os.path.join('upload/', filename), as_attachment=True,
                                     attachment_filename="default.docx")
            result.headers["x-suggested-filename"] = filename
        return result
    except Exception as e:
        return CustomResponse(Status.DATA_NOT_FOUND.value, 'file not found').getres()
def get_pending_nodes():
    no_of_nodes = 0
    node_received = 0
    try:
        # Sum sent/received node counts across every process that is still being translated.
        translationProcess = TranslationProcess.objects(status=STATUS_PROCESSING)
        for tp in translationProcess:
            doc_nodes = DocumentNodes.objects(basename=tp['basename'])
            try:
                no_of_nodes = no_of_nodes + doc_nodes[0]['nodes_sent']
                node_received = node_received + doc_nodes[0]['nodes_received']
            except Exception as e:
                log.info('get_pending_nodes : Exception occurred while counting nodes for basename = ' +
                         tp['basename'] + ' with error ' + str(e))
        log.info('get_pending_nodes : nodes details == total_nodes : ' + str(no_of_nodes) +
                 ', node_completed : ' + str(node_received))
        return no_of_nodes - node_received
    except Exception as e:
        log.info('get_pending_nodes : Exception occurred : error is = ' + str(e))
        return 0
def translate_docx_v2():
    start_time = int(round(time.time() * 1000))
    log.info('translate_docx_v2: started at ' + str(start_time))
    basename = str(int(time.time()))
    current_time = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
    iso_date = datetime.now().isoformat()
    f = request.files['file']
    filepath = os.path.join(app.config['UPLOAD_FOLDER'], basename + '.docx')
    sourceLang = request.form.getlist('sourceLang')[0]
    targetLang = request.form.getlist('targetLang')[0]
    sourceLang_code = request.form.getlist('sourceLangCode')[0]
    targetLang_code = request.form.getlist('targetLangCode')[0]
    model_meta_data = request.form.getlist('model')[0]
    log.info('translate_docx_v2 : model meta data : ' + model_meta_data)
    model_obj = json.loads(model_meta_data)
    model_id = int(model_obj['model_id'])
    url_end_point = 'translation_en'
    if 'url_end_point' in model_obj:
        url_end_point = model_obj['url_end_point']
    # Language codes from the model metadata take precedence over the form fields.
    targetLang_code = model_obj['target_language_code']
    sourceLang_code = model_obj['source_language_code']
    translationProcess = TranslationProcess(
        created_by=request.headers.get('ad-userid'), status=STATUS_PROCESSING, name=f.filename,
        created_on=current_time, basename=basename, sourceLang=sourceLang, targetLang=targetLang)
    translationProcess.save()
    f.save(filepath)
    filename_to_processed = f.filename
    filepath_processed = os.path.join(app.config['UPLOAD_FOLDER'], basename + '_t' + '.docx')
    filepath_processed_src_with_ids = os.path.join(app.config['UPLOAD_FOLDER'], basename + '_s' + '.docx')
    log.info('translate_docx_v2 : file name ' + filename_to_processed)
    xmltree = None
    try:
        xml_content = docx_helper.get_document_xml(filepath)
        xmltree = docx_helper.get_xml_tree(xml_content)
    except Exception as e:
        log.info('translate_docx_v2 : error while extracting docx, trying to convert it to docx from doc')
        try:
            docx_helper.convert_DOC_to_DOCX(filepath)
            xml_content = docx_helper.get_document_xml(filepath)
            xmltree = docx_helper.get_xml_tree(xml_content)
            log.info('translate_docx_v2 : doc to docx conversion successful')
        except Exception as e:
            log.error('translate_docx_v2 : error while extracting docx files, error is = ' + str(e))
            log.error('translate_docx_v2 : uploaded file is corrupt')
            translationProcess = TranslationProcess.objects(basename=basename)
            translationProcess.update(set__status=STATUS_FAILED)
            res = CustomResponse(Status.CORRUPT_FILE.value, 'uploaded file is corrupt')
            log.info('translate_docx_v2: ended at ' + str(getcurrenttime()) +
                     ', total time elapsed : ' + str(getcurrenttime() - start_time))
            return res.getres(), 500
    nodes = []
    texts = []
    if xmltree is None:
        res = CustomResponse(Status.CORRUPT_FILE.value, 'uploaded file is corrupt')
        log.info('translate_docx_v2: ended at ' + str(getcurrenttime()) +
                 ', total time elapsed : ' + str(getcurrenttime() - start_time))
        return res.getres(), 500
    try:
        docx_helper.add_identification_tag(xmltree, basename)
        docx_helper.pre_process_text(xmltree)
    except Exception as e:
        log.error('translate_docx_v2 : error occurred while pre-processing document, error is = ' + str(e))
        log.info('translate_docx_v2 : not pre-processing document')
    docx_helper.warp_original_with_identification_tags(filepath, xmltree, filepath_processed_src_with_ids)
    word_count = 0
    for node, text in docx_helper.itertext_old(xmltree):
        nodes.append(node)
        texts.append(text)
        if text is not None:
            word_count = word_count + len(text.split(' '))
    # Build the dashboard report that is pushed to Elasticsearch.
    doc_report = {}
    doc_report['word_count'] = word_count
    doc_report['sentence_count'] = len(texts)
    doc_report['source_lang'] = sourceLang
    doc_report['target_lang'] = targetLang
    doc_report['user_id'] = request.headers.get('ad-userid')
    userhighcourt_obj = Userhighcourt.objects(user_id=request.headers.get('ad-userid'))
    if userhighcourt_obj and len(userhighcourt_obj) > 0:
        userhighcourt_dict = json.loads(userhighcourt_obj.to_json())
        if 'high_court_code' in userhighcourt_dict[0]:
            high_court_obj = Highcourt.objects(high_court_code=userhighcourt_dict[0]['high_court_code'])
            if high_court_obj and len(high_court_obj) > 0:
                highcourt_dict = json.loads(high_court_obj.to_json())
                if 'high_court_name' in highcourt_dict[0]:
                    doc_report['high_court_name'] = highcourt_dict[0]['high_court_name']
                    doc_report['high_court_code'] = userhighcourt_dict[0]['high_court_code']
    try:
        profile = requests.get(PROFILE_REQ_URL + request.headers.get('ad-userid')).content
        profile = json.loads(profile)
        doc_report['username'] = profile['username']
    except Exception as e:
        log.error('translate_docx_v2 : error occurred while fetching profile, error is = ' + str(e))
    doc_report['document_id'] = basename
    doc_report['created_on'] = current_time
    doc_report['created_on_iso'] = iso_date
    log.info('translate_docx_v2 : sending data to elasticsearch == ' + str(doc_report))
    try:
        create_dashboard_report(doc_report, ELASTIC_INDEX)
    except Exception as e:
        log.error('translate_docx_v2 : error occurred while saving report, error is = ' + str(e))
    log.info('translate_docx_v2 : number of nodes = ' + str(len(nodes)) +
             ' and texts are : ' + str(len(texts)))
    translationProcess = TranslationProcess.objects(basename=basename)
    translationProcess.update(set__eta=TEXT_PROCESSING_TIME * (len(texts) + get_pending_nodes()) / 25)
    total_nodes = get_total_number_of_nodes_with_text(nodes)
    try:
        doc_nodes = DocumentNodes(basename=basename, created_date=current_time, total_nodes=total_nodes,
                                  nodes_sent=0, nodes_received=0, is_complete=False)
        doc_nodes.save()
        send_nodes(nodes, basename, model_id, url_end_point, targetLang_code, sourceLang_code, texts)
        res = CustomResponse(Status.SUCCESS.value, 'file has been queued')
        translationProcess = TranslationProcess.objects(basename=basename)
        translationProcess.update(set__status=STATUS_PROCESSING)
        log.info('translate_docx_v2: ended at ' + str(getcurrenttime()) +
                 ', total time elapsed : ' + str(getcurrenttime() - start_time))
        return res.getres()
    except Exception as e:
        log.error('translate_docx_v2 : error occurred, file not processing, error is = ' + str(e))
        translationProcess = TranslationProcess.objects(basename=basename)
        translationProcess.update(set__status=STATUS_FAILED)
        res = CustomResponse(Status.FAILURE.value, 'something went wrong')
        log.info('translate_docx_v2: ended at ' + str(getcurrenttime()) +
                 ', total time elapsed : ' + str(getcurrenttime() - start_time))
        return res.getres(), 500
def translateDocx():
    start_time = int(round(time.time() * 1000))
    log.info('translateDocx: started at ' + str(start_time))
    basename = str(int(time.time()))
    current_time = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
    f = request.files['file']
    filepath = os.path.join(app.config['UPLOAD_FOLDER'], basename + '.docx')
    sourceLang = request.form.getlist('sourceLang')[0]
    model_meta_data = request.form.getlist('model')[0]
    log.info('translateDocx: model meta data : ' + model_meta_data)
    model_obj = json.loads(model_meta_data)
    url_end_point = 'translation_en'
    model_id = int(model_obj['model_id'])
    if 'url_end_point' in model_obj:
        url_end_point = model_obj['url_end_point']
    targetLang = request.form.getlist('targetLang')[0]
    translationProcess = TranslationProcess(
        created_by=request.headers.get('ad-userid'), status=STATUS_PROCESSING, name=f.filename,
        created_on=current_time, basename=basename, sourceLang=sourceLang, targetLang=targetLang)
    translationProcess.save()
    f.save(filepath)
    filename_to_processed = f.filename
    filepath_processed = os.path.join(app.config['UPLOAD_FOLDER'], basename + '_t' + '.docx')
    filepath_processed_src_with_ids = os.path.join(app.config['UPLOAD_FOLDER'], basename + '_s' + '.docx')
    log.info('translateDocx : file name ' + filename_to_processed)
    xml_content = docx_helper.get_document_xml(filepath)
    xmltree = docx_helper.get_xml_tree(xml_content)
    nodes = []
    texts = []
    docx_helper.add_identification_tag(xmltree, basename)
    docx_helper.warp_original_with_identification_tags(filepath, xmltree, filepath_processed_src_with_ids)
    docx_helper.pre_process_text(xmltree)
    for node, text in docx_helper.itertext(xmltree):
        nodes.append(node)
        texts.append(text)
    log.info('translateDocx: number of nodes ' + str(len(nodes)) + ' and texts are : ' + str(len(texts)))
    translationProcess = TranslationProcess.objects(basename=basename)
    translationProcess.update(set__eta=TEXT_PROCESSING_TIME * (len(texts) + get_pending_nodes()) / 25)
    # Alternative path that does not use tokenization (kept for reference):
    # docx_helper.modify_text(nodes)
    # nodes_first_page = modify_first_page.get_first_page_nodes(nodes)
    # first_page_node_len = modify_first_page.get_size(nodes_first_page)
    # node_after_first_page = modify_first_page.get_nodes_after_f_page(nodes, first_page_node_len)
    # modify_first_page.modify_text_on_first_page_using_model(nodes_first_page, model_id, url_end_point)
    docx_helper.modify_text_with_tokenization(nodes, None, model_id, url_end_point)
    # xml_footer_list = translate_footer.translate_footer(filepath, model_id, url_end_point)
    docx_helper.save_docx(filepath, xmltree, filepath_processed, None)
    res = CustomResponse(Status.SUCCESS.value, basename + '_t' + '.docx')
    translationProcess = TranslationProcess.objects(basename=basename)
    translationProcess.update(set__status=STATUS_PROCESSED)
    log.info('translateDocx: ended at ' + str(getcurrenttime()) +
             ', total time elapsed : ' + str(getcurrenttime() - start_time))
    return res.getres()
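# NOTE: the translateFile definition below shares its name with the earlier translateFile above;
# if both live in the same module, this later definition silently shadows the earlier one.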
def translateFile():
    pool = mp.Pool(mp.cpu_count())
    basename = str(int(time.time()))
    current_time = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
    f = request.files['file']
    filepath = os.path.join(app.config['UPLOAD_FOLDER'], basename + '.pdf')
    translationProcess = TranslationProcess(
        status=STATUS_PROCESSING, name=f.filename, created_on=current_time, basename=basename)
    translationProcess.save()
    f.save(filepath)
    # Convert the PDF to page images and capture the Hindi OCR text when the worker finishes.
    pool.apply_async(converttoimage,
                     args=(filepath, app.config['UPLOAD_FOLDER'], basename, '_hin'),
                     callback=capturetext)
    pool.close()
    pool.join()
    filtertext(app.config['UPLOAD_FOLDER'] + '/' + basename + '_hin.txt',
               app.config['UPLOAD_FOLDER'] + '/' + basename + '_hin_filtered.txt')
    processenglish(app.config['UPLOAD_FOLDER'] + '/' + basename + '_hin_filtered.txt')
    translatewithanuvadaeng(app.config['UPLOAD_FOLDER'] + '/' + basename + '_hin_filtered.txt',
                            app.config['UPLOAD_FOLDER'] + '/' + basename + '_eng_tran.txt')
    english_res = []
    hindi_res = []
    with open(app.config['UPLOAD_FOLDER'] + '/' + basename + '_eng_tran.txt', 'r') as f_eng:
        for line in f_eng:
            english_res.append(line)
    index = 0
    previousY = 0
    previousX = 0
    previousH = 0
    previousP = ''
    text_y = {}
    text_x = 0
    f_hin = open(app.config['UPLOAD_FOLDER'] + '/' + basename + '_hin_filtered.txt', 'r')
    for line in f_hin:
        hindi_res.append(line)
        print(line)
        # Locate the hOCR word coordinates for this sentence so the English text
        # can be written back onto the corresponding page image.
        point = fetchwordhocrfromsentence(line, basename)
        english = english_res[index]
        words = english.split(' ')
        wordIndex = 0
        for word in words:
            try:
                if (point['values'] is not None
                        and point['values'][wordIndex] is not None
                        and point['values'][wordIndex]['height'] is not None):
                    previousY = point['values'][wordIndex]['left']
                    previousX = point['values'][wordIndex]['top']
                    previousH = point['values'][wordIndex]['height']
                    # Default the vertical write position for a page the first time it is seen.
                    if point['values'][wordIndex]['imagepath'] not in text_y:
                        text_y[point['values'][wordIndex]['imagepath']] = 200
                    (text_x, vertical) = puttext(point['values'][wordIndex]['height'], 200,
                                                 text_y[point['values'][wordIndex]['imagepath']],
                                                 english, point['values'][wordIndex]['imagepath'])
                    text_y[point['values'][wordIndex]['imagepath']] = vertical
                    # else:
                    #     (text_x, text_y) = puttext(point['values'][wordIndex]['height'],
                    #                                point['values'][wordIndex]['left'],
                    #                                point['values'][wordIndex]['top'],
                    #                                english, point['values'][wordIndex]['imagepath'])
                    previousP = point['values'][wordIndex]['imagepath']
                    break
            except Exception as e:
                previousY = previousY + 200
                # puttext(previousH, previousY, previousX, word, previousP)
            wordIndex = wordIndex + 1
            # puttext(point['values'][wordIndex]['left'], point['values'][wordIndex]['top'],
            #         word, point['values'][wordIndex]['imagepath'])
        index = index + 1
    f_hin.close()
    data = {'hindi': hindi_res, 'english': english_res}
    translations = []
    for i in range(0, len(hindi_res)):
        translation = Translation(basename=str(basename), source=hindi_res[i], target=english_res[i])
        translations.append(translation)
    Translation.objects.insert(translations)
    # for f in glob.glob(app.config['UPLOAD_FOLDER'] + '/' + basename + '*'):
    #     os.remove(f)
    res = CustomResponse(Status.SUCCESS.value, data)
    translationProcess = TranslationProcess.objects(basename=basename)
    translationProcess.update(set__status=STATUS_PROCESSED)
    return res.getres()