Пример #1
0
class Convert:
    """OCR a PDF into sentences, translate them, and write them to a text file.

    Typical use is ``extract_text()`` followed by ``convert_text()`` and then
    ``create_pdf()``; each step reads the state left behind by the previous
    one (``self.english`` and ``self.hindi``).
    """

    # Absolute path of the translation model. The MODEL_PATH on the right-hand
    # side is the module-level setting, resolved against the current working
    # directory at class-definition time.
    MODEL_PATH = os.path.abspath(os.path.join(os.getcwd(), MODEL_PATH))

    # Number of PDF pages rendered per pdf2image call, so that only a limited
    # number of page images is held in memory at once.
    _PAGES_PER_BATCH = 10

    def __init__(self, filename):
        """Load the translation model and prepare paths for *filename*.

        Args:
            filename: Path of the PDF file to process. The output text file is
                created in the same directory under a unique name.
        """
        self.filename = filename
        self.translator = Translator()
        self.translator.load_model(self.MODEL_PATH)
        # Scratch directory for the rendered page images.
        self.image_out_path = os.path.join(os.getcwd(), "./data/images/")
        self.outfilename = os.path.join(os.path.dirname(self.filename),
                                        "output" + str(uuid.uuid1()) + ".txt")
        self.english = []  # sentences extracted by extract_text()
        self.hindi = []    # translations produced by convert_text()

    def extract_text(self):
        """Render every PDF page to JPEG, OCR it, and fill ``self.english``.

        The page images are written below ``self.image_out_path`` in a folder
        named after the PDF; the OCR output of all pages is joined, split into
        sentences with TextBlob and appended to ``self.english``. The scratch
        image directory is removed afterwards.
        """
        pdf_path = self.filename
        page_dir = os.path.abspath(
            os.path.join(self.image_out_path,
                         os.path.basename(self.filename)))
        # Creates the scratch directory and the per-document folder in one go
        # and tolerates either of them already existing.
        os.makedirs(page_dir, exist_ok=True)

        image_paths = []
        # NOTE(review): _page_count is a private pdf2image helper; confirm it
        # exists in the installed pdf2image version.
        page_total = pdf2image._page_count(pdf_path)
        # pdf2image page numbers are 1-based and last_page is inclusive, so
        # the batches must start at 1; starting at 0 silently dropped the last
        # page whenever the page count was a multiple of the batch size.
        for first in range(1, page_total + 1, self._PAGES_PER_BATCH):
            last = min(first + self._PAGES_PER_BATCH - 1, page_total)
            batch = pdf2image.convert_from_path(pdf_path,
                                                dpi=200,
                                                first_page=first,
                                                last_page=last)
            for page_image in batch:
                target = os.path.join(page_dir,
                                      str(len(image_paths)) + ".jpg")
                page_image.save(target, 'JPEG')
                image_paths.append(target)

        print("Successfully saved images for each page for {}".format(
            self.image_out_path))

        # OCR exactly the images written above, in page order. This avoids
        # re-listing the directory, where a stray non-numeric file name would
        # break the numeric sort and stale images would be picked up.
        page_texts = []
        for image_path in image_paths:
            text = str(pytesseract.image_to_string(Image.open(image_path)))
            # Re-join words that were hyphenated across a line break.
            page_texts.append(text.replace('-\n', ''))

        corpus = " ".join(page_texts)
        corpus = re.sub(r'\n+', '\n', corpus).strip()
        for sentence in TextBlob(corpus).sentences:
            self.english.append(sentence.string.replace("\n", " "))
        print("English Text Extracted is : {}".format(self.english))
        # NOTE(review): this removes the whole scratch directory, not only the
        # folder of this document.
        shutil.rmtree(self.image_out_path)

    def convert_text(self):
        """Translate every sentence in ``self.english`` into ``self.hindi``.

        Only the first element of each ``translate`` result is kept.
        """
        for source_text in self.english:
            output = self.translator.translate(source_text, verbose=False)[0]
            self.hindi.append(output)

        print("Hindi Converted Text : {}".format(self.hindi))

    def create_pdf(self):
        """Write the translated sentences, one per line, to ``self.outfilename``.

        Despite its name this method writes a plain-text file. The file is
        written as UTF-8 so that non-ASCII translations do not depend on the
        platform default encoding.

        Returns:
            The path of the written file (``self.outfilename``).
        """
        with open(self.outfilename, 'w', encoding='utf-8') as out_file:
            out_file.writelines("%s\n" % item for item in self.hindi)

        return self.outfilename
import pickle as pkl
import torch
from language import Language
from translator import Translator


# Flask application object; debug mode is switched on through the config.
app = flask.Flask(__name__)
app.config['DEBUG'] = True

# Load the pickled language objects. The context managers make sure the file
# handles are closed once unpickling is done.
# NOTE: pickle.load can execute arbitrary code, so these files must come from
# a trusted source.
with open('./data/languages/english_language', 'rb') as language_file:
    english_language = pkl.load(language_file)

with open('./data/languages/norwegian_language', 'rb') as language_file:
    norwegian_language = pkl.load(language_file)

# NOTE(review): the model path starts with '../data' while the language files
# use './data' -- confirm that both resolve as intended from the working
# directory the service is started in.
file_load_name = '../data/model/translator_model_transformer.py'
translator = Translator(english_language, norwegian_language, cuda='cpu')
translator.load_model(file_load_name)


@app.route('/translate', methods=['GET'])
def translate():
    """Translate the ``english`` query parameter and return it as HTML.

    Reads ``english`` from the request's query string, passes it to the
    module-level ``translator`` and returns both the input and the translation
    as two ``<h1>`` elements.

    Returns:
        An HTML string containing the escaped source text and translation.
    """
    from html import escape

    english = request.args['english']
    norwegian = translator.translate(english)

    # Both values derive from the untrusted query string, so they are
    # HTML-escaped before being embedded in the markup; str() mirrors what
    # the original "%s" formatting did for non-string results.
    return "<h1>%s</h1><h1>%s</h1>" % (escape(str(english)),
                                       escape(str(norwegian)))


# Start the Flask application with its default run settings.
# NOTE(review): this call is not guarded by `if __name__ == "__main__":`, so
# it also executes whenever this module is imported -- confirm that is wanted.
app.run()