Example #1
File: service.py Project: AgHarsh/anuvaad
 def tokenisation_core(self, paragraph_data, text_locale):
     try:
         tokenised_text = []
         if text_locale == 'en':
             for paragraph in paragraph_data:
                 tokenised_sentence_data = AnuvaadEngTokenizer().tokenize(paragraph)
                 tokenised_text.extend(tokenised_sentence_data)
         elif text_locale == 'hi':
             for paragraph in paragraph_data:
                 tokenised_sentence_data = AnuvaadHinTokenizer().tokenize(paragraph)
                 tokenised_text.extend(tokenised_sentence_data)
         elif text_locale == 'kn':
             for paragraph in paragraph_data:
                 tokenised_sentence_data = AnuvaadKanTokenizer().tokenize(paragraph)
                 tokenised_text.extend(tokenised_sentence_data)
         # Unsupported locales fall through and return an empty list
         return tokenised_text
     except Exception:
         log_exception(
             "tokenisation_core : Error occurred during tokenising the paragraphs",
             self.input_json_data, None)
         raise ServiceError(
             400,
             "Tokenisation failed. Something went wrong during tokenisation."
         )
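Example #1 only handles 'en', 'hi', and 'kn'; any other locale silently yields an empty list. As a minimal, self-contained sketch of the same dispatch-and-extend shape, the snippet below substitutes a naive regex splitter for the Anuvaad tokenizer classes (the splitter is an illustration only, not the project's tokenisation logic, and the function name tokenise is hypothetical):

 import re

 def naive_tokenize(paragraph):
     # Stand-in for AnuvaadEngTokenizer().tokenize(): split on
     # sentence-final punctuation followed by whitespace.
     return [s for s in re.split(r'(?<=[.?!])\s+', paragraph.strip()) if s]

 def tokenise(paragraph_data, text_locale, supported=('en', 'hi', 'kn')):
     tokenised_text = []
     if text_locale in supported:
         for paragraph in paragraph_data:
             tokenised_text.extend(naive_tokenize(paragraph))
     return tokenised_text  # empty when the locale is unsupported

 print(tokenise(["One. Two? Three!"], 'en'))  # ['One.', 'Two?', 'Three!']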
Example #2
 def tokenisation_core(self, paragraph_data, text_locale):
     tokenised_text = []
     for paragraph in paragraph_data:
         if paragraph is not None:
             try:
                 paragraph = self.remove_extra_spaces(paragraph)
                 if text_locale == 'en':
                     tokenised_sentence_data = AnuvaadEngTokenizer().tokenize(paragraph)
                     tokenised_text.extend(tokenised_sentence_data)
                 elif text_locale in ('hi', 'mr'):
                     tokenised_sentence_data = AnuvaadHindiTokenizer().tokenize(paragraph)
                     tokenised_text.extend(tokenised_sentence_data)
                 elif text_locale == 'kn':
                     tokenised_sentence_data = AnuvaadKannadaTokenizer().tokenize(paragraph)
                     tokenised_text.extend(tokenised_sentence_data)
                 elif text_locale == 'ta':
                     tokenised_sentence_data = AnuvaadTamilTokenizer().tokenize(paragraph)
                     tokenised_text.extend(tokenised_sentence_data)
                 elif text_locale == 'ml':
                     tokenised_sentence_data = AnuvaadMalayalamTokenizer().tokenize(paragraph)
                     tokenised_text.extend(tokenised_sentence_data)
             except Exception:
                 log_exception(
                     "Received error in this text: %s" % paragraph,
                     self.input_json_data, None)
                 raise ServiceError(
                     400,
                     "Tokenisation failed. Something went wrong during tokenisation."
                 )
     return tokenised_text
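The if/elif chain in Example #2 repeats the same extend step for every locale; a mapping from locale code to tokenizer class removes that duplication. The sketch below uses a stub class because the real Anuvaad*Tokenizer import paths are not shown in these examples; the registry name TOKENIZER_BY_LOCALE and the stub are hypothetical, but the tokenize(paragraph) interface matches the usage above:

 class StubTokenizer:
     # Stand-in for the Anuvaad*Tokenizer classes, which (as used above)
     # expose tokenize(paragraph) and return a list of sentences.
     def tokenize(self, paragraph):
         return [s.strip() for s in paragraph.split('.') if s.strip()]

 # 'mr' maps to the same tokenizer as 'hi', mirroring Example #2.
 TOKENIZER_BY_LOCALE = {
     'en': StubTokenizer, 'hi': StubTokenizer, 'mr': StubTokenizer,
     'kn': StubTokenizer, 'ta': StubTokenizer, 'ml': StubTokenizer,
 }

 def tokenise_paragraphs(paragraph_data, text_locale):
     tokeniser_cls = TOKENIZER_BY_LOCALE.get(text_locale)
     if tokeniser_cls is None:
         return []  # unsupported locales fall through, as in the original
     tokenised_text = []
     for paragraph in paragraph_data:
         if paragraph is not None:
             tokenised_text.extend(tokeniser_cls().tokenize(paragraph))
     return tokenised_text

 print(tokenise_paragraphs(["First. Second."], 'en'))  # ['First', 'Second']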
Example #3
File: service.py Project: AgHarsh/anuvaad
 def tokenisation_response(self, input_file_data, in_locale, index):
     try:
         output_filepath, output_filename = file_ops.output_path(
             index, self.DOWNLOAD_FOLDER, '.txt')
         tokenised_data = self.tokenisation_core(input_file_data, in_locale)
         self.writing_tokenised_sentence_in_file(tokenised_data,
                                                 output_filepath)
         return output_filename
     except Exception:
         log_exception(
             "tokenisation_response : Error occurred during output file creation",
             None, None)
         raise ServiceError(
             400,
             "Tokenisation failed. Something went wrong during output file creation."
         )
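writing_tokenised_sentence_in_file is called in Example #3 but not reproduced in these examples. A plausible minimal writer, assuming one sentence per line and UTF-8 output (both are assumptions, and the helper name below is hypothetical):

 def write_tokenised_sentences(tokenised_data, output_filepath):
     # Assumption: one sentence per line, UTF-8 encoding; the real helper
     # in service.py may format the output differently.
     with open(output_filepath, 'w', encoding='utf-8') as out_file:
         for sentence in tokenised_data:
             out_file.write(sentence + '\n')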
Example #4
File: service.py Project: eagle-sb/anuvaad
 def pdf2html(self, input_pdf_file, jobid):
     try:
         output_htmlfiles_path, output_pngfiles_path = pdf_ops.pdf_to_html(
             self.DOWNLOAD_folder, input_pdf_file)
         log_info(
             "pdf2html",
             "successfully received output filepath for HTML and PNG files",
             jobid)
         return output_htmlfiles_path, output_pngfiles_path
     except Exception:
         log_exception("pdf2html",
                       "Error occurred during pdf to html conversion", jobid,
                       None)
         raise ServiceError(
             400,
             "pdf2html failed. Something went wrong during conversion.")
Example #5
File: service.py Project: AgHarsh/anuvaad
 def adding_tokenised_text_blockmerger(self, input_json_data_pagewise,
                                       in_locale, page_id):
     try:
         blocks = input_json_data_pagewise['text_blocks']
         for block_id, item in enumerate(blocks):
             text_data = item['text']
             tokenised_text = self.tokenisation_core([text_data], in_locale)
             item['tokenized_sentences'] = [
                 self.making_object_for_tokenised_text(
                     text, in_locale, i, block_id, page_id)
                 for i, text in enumerate(tokenised_text)
             ]
         return input_json_data_pagewise
     except Exception:
         log_error(
             "Keys in block merger response changed or tokenisation went wrong.",
             self.input_json_data, None)
         raise ServiceError(
             400,
             "Tokenisation failed. Keys in block merger response changed or tokenisation went wrong."
         )
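Example #5 assumes a specific shape for the block-merger response: a page-wise dict holding a text_blocks list whose items carry a text field, to which a tokenized_sentences list is attached. A sketch of that data shape (the text_blocks, text, and tokenized_sentences keys come from the code above; the per-sentence fields are illustrative, since making_object_for_tokenised_text is not reproduced here):

 page = {
     'text_blocks': [
         {'text': 'First sentence. Second sentence.'},
     ]
 }

 # After adding_tokenised_text_blockmerger runs, each block also carries
 # something like the following; the real object is whatever
 # making_object_for_tokenised_text(text, in_locale, i, block_id, page_id)
 # returns.
 page['text_blocks'][0]['tokenized_sentences'] = [
     {'src': 'First sentence.', 'sentence_id': 0, 'block_id': 0, 'page_id': 1},
     {'src': 'Second sentence.', 'sentence_id': 1, 'block_id': 0, 'page_id': 1},
 ]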