Exemplo n.º 1
0
 def test_ec(self):
     # Smoke test: push one sample sentence through correct() and log both
     # values it returns. Nothing is asserted; the log output is the result.
     logging.info("test_")
     # NOTE(review): the sample presumably contains an intentional typo for
     # the corrector to find - confirm against the corrector's test data.
     sentence = '我们现今所使用的大部分舒学符号'
     logging.info('input sentence is: %s', sentence)
     outcome = correct(sentence)
     # correct() must yield exactly two values: the sentence and the ranges.
     fixed_sentence, fixed_ranges = outcome
     logging.info('corrected_sent: %s', fixed_sentence)
     logging.info('correct_ranges: %s', fixed_ranges)
Exemplo n.º 2
0
def eval_bcmi_data(data_path, verbose=False):
    """Measure sentence-level accuracy of the corrector on a corpus file.

    Every line of the file is stripped and handed to ``get_bcmi_corpus`` to
    obtain an (erroneous sentence, reference sentence) pair. Lines that yield
    no erroneous sentence are skipped. Each remaining erroneous sentence is
    run through ``corrector.correct`` and the prediction is compared with the
    reference for exact equality.

    Args:
        data_path: Path of the corpus file; it is read as UTF-8.
        verbose: When true, print every input/prediction/reference triple and
            the final counts.

    Returns:
        A tuple ``(accuracy, right_result, wrong_result)``. ``accuracy`` is
        the fraction of evaluated sentences predicted exactly (``0.0`` when
        no sentence was evaluated). Both dicts map an erroneous sentence to
        ``[reference sentence, predicted sentence]``.
    """
    # Start at zero: counting from one inflated the denominator, so a fully
    # correct run could never reach an accuracy of 1.0.
    sentence_size = 0
    right_count = 0
    right_result = {}
    wrong_result = {}
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            error_sentence, right_sentence = get_bcmi_corpus(line.strip())
            if not error_sentence:
                continue
            pred_sentence, pred_detail = corrector.correct(error_sentence)
            if verbose:
                print('input sentence:', error_sentence)
                print('pred sentence:', pred_sentence)
                print('right sentence:', right_sentence)
            sentence_size += 1
            outcome = [right_sentence, pred_sentence]
            if right_sentence == pred_sentence:
                right_count += 1
                right_result[error_sentence] = outcome
            else:
                wrong_result[error_sentence] = outcome
    if verbose:
        print('right count:', right_count, ';sentence size:', sentence_size)
    # An empty corpus scores 0.0 instead of dividing by zero.
    accuracy = right_count / sentence_size if sentence_size else 0.0
    return accuracy, right_result, wrong_result
Exemplo n.º 3
0
def run(image_path):
    """Turn a text image into classified text, corrected words and a WAV file.

    Steps, in order: read the image with ``cv2``, preprocess it with
    ``segmenter`` and save the result next to the input with a ``_bw.jpg``
    suffix, segment it into lines and characters, classify every character,
    normalise whitespace, join modifiers, run the corrector, synthesize the
    corrected text and write the audio next to the input with a ``.wav``
    suffix.

    Args:
        image_path: Path of the input image. Everything before the last
            ``.`` is used as the base name for the generated files.

    Returns:
        A tuple ``(classified_text, corrected_words, image_path_, audio_path)``
        where ``image_path_`` is the input path without its extension.
    """
    image_path_ = image_path.rsplit('.', 1)[0]
    preprocess_image_path = image_path_ + '_bw.jpg'
    audio_path = image_path_ + '.wav'

    image = cv2.imread(image_path)
    image_bw = segmenter.preprocess(image)
    cv2.imwrite(preprocess_image_path, image_bw)

    # Classify character by character, line by line, then join once.
    pieces = []
    for i, line in enumerate(segmenter.segment_lines(image_bw)):
        for character_image in segmenter.segment_line(line, i):
            pieces.append(classifier.classify(character_image))
    classified_text = "".join(pieces)

    # Collapse whitespace runs to single spaces; split() with no argument
    # also discards leading and trailing whitespace.
    classified_text = " ".join(classified_text.split())

    classified_text = classifier.join_modifiers(classified_text)
    print(classified_text)

    corrected_words = corrector.correct(classified_text)

    # The first element of every corrector entry is appended, each followed
    # by a single space.
    # NOTE(review): concatenating encode() output with " " relies on
    # Python 2 str semantics - confirm before porting to Python 3.
    corrected_text = "".join(
        words[0].encode("utf-8") + " " for words in corrected_words)
    print(corrected_text)

    synthesized_data = synthesizer.synthesize(corrected_text)

    audio_outfile = wave.open(audio_path, 'wb')
    try:
        # The parameters of the first synthesized segment are applied to the
        # whole output file.
        audio_outfile.setparams(synthesized_data[0][0])
        for segment in synthesized_data:
            audio_outfile.writeframes(segment[1])
    finally:
        # Close explicitly so the file is finalised even when writing fails,
        # instead of relying on garbage collection.
        audio_outfile.close()

    return classified_text, corrected_words, image_path_, audio_path
Exemplo n.º 4
0
def ocr(warp_id):
    '''
    Run OCR on a stored warp image (optionally cropped), correct the
    recognised text and translate it.

    Input (JSON request body; a missing body means "no crop") -
    {
        [crop: {
            [left: int]
            [right: int]
            [top: int]
            [bottom: int]
        }]
    }

    Output (JSON) -
        content     annotated text returned by ja_ko_translator
        translated  translation returned by ja_ko_translator

    Aborts with 404 when the image cannot be read and with 400 when the
    crop description is malformed.
    '''

    img_path = get_warp_image_path(warp_id)
    img = cv2.imread(img_path, flags=cv2.IMREAD_GRAYSCALE)
    if img is None:
        fl.abort(404)

    # get_json() can return None when the request carries no JSON body;
    # fall back to an empty object so the optional crop is simply skipped
    # instead of failing with a TypeError.
    payload = request.get_json() or {}
    try:
        if "crop" in payload:
            crop_obj = payload["crop"]
            left = none_or_int(crop_obj.get("left"))
            right = none_or_int(crop_obj.get("right"))
            top = none_or_int(crop_obj.get("top"))
            bottom = none_or_int(crop_obj.get("bottom"))
            img = img[top:bottom, left:right]
    except (KeyError, ValueError, TypeError, AttributeError):
        # TypeError/AttributeError cover a body or crop value that is not a
        # JSON object; report it as a bad request rather than a server error.
        fl.abort(400)

    # The OCR binary reads its input from a file, so the (cropped) image is
    # written to a temporary .ppm that lives until the process has finished.
    with NamedTemporaryFile(suffix='.ppm') as f:
        cv2.imwrite(f.name, img)
        proc = Popen([os.path.join(CURRENT_DIRECTORY, "nhocr"),
                      f.name,
                      "-o",
                      "-",
                      "-block"],
                     stdout=PIPE)
        buf = u"".join([line.decode("utf-8")
                        for line in proc.stdout])
        proc.wait()
    # Normalise line endings and drop the trailing newline.
    buf = u"\n".join(buf.splitlines())
    content = correct(buf)
    print(content.encode("utf-8"))
    annotated, translated = ja_ko_translator(content)

    return fl.jsonify(
        content=annotated,
        translated=translated
    )
Exemplo n.º 5
0
def eval_sighan_corpus(pkl_path, verbose=False):
    """Walk a pickled corpus, run the corrector on it and report a ratio.

    The pickle is loaded with ``load_pkl`` and iterated as
    ``(error_sentence, right_detail)`` pairs; every ``right_detail`` item is
    unpacked as ``(location, wrong text, right text)``.

    NOTE(review): the comparison between the reference and the prediction is
    not implemented, so ``right_count`` is never incremented. The returned
    ratio is therefore always zero and both result dicts stay empty.
    ``total_count`` also starts at 1, not 0.

    Args:
        pkl_path: Path handed to ``load_pkl``.
        verbose: When true, print each input, prediction and reference item.

    Returns:
        A tuple ``(right_count / total_count, right_result, wrong_result)``.
    """
    right_result, wrong_result = dict(), dict()
    right_count = 0
    total_count = 1
    for error_sentence, right_detail in load_pkl(pkl_path):
        # The call must yield exactly two values; pred_detail is not used yet.
        pred_sentence, pred_detail = corrector.correct(error_sentence)
        if verbose:
            print('input sentence:', error_sentence)
            print('pred sentence:', pred_sentence)
        for right_loc, right_w, right_r in right_detail:
            total_count += 1
            if not verbose:
                continue
            print('right: {} => {} , index: {}'.format(
                right_w, right_r, right_loc))
    ratio = right_count / total_count
    return ratio, right_result, wrong_result
Exemplo n.º 6
0
        ("xsel --version", 'You need to install xsel'),
        ("xdotool --help", 'You need to install xdotool'),
        ('xte -help > /dev/null', 'You need to install xautomation'),
        ("xvkbd -no-sync -no-repeat -xsendevent -text '\C' > /dev/null",
         'You need to install xvkbd'),
    )
    try:
        for command, error_text in commands:
            ret_code = call(command, shell=True)
            if ret_code != 0:
                raise OSError(error_text)
    except OSError as e:
        print('\n' + '!' * 30)
        print("Test failed:\n" + str(e))
        print('!' * 30 + '\n')
        exit(1)


if __name__ == '__main__':
    # Script entry point: verify the environment first, then convert one
    # sample string and show the outcome.
    print('Keyboard layout corrector. \r\nTesting system environment...')
    check_env()

    print('Test example:')
    sample = 'ghbdtn'
    converted = correct(sample)
    report = '"%s" -> "%s"' % (sample, converted)
    print(report)

    # Success is announced only for the expected conversion; any other
    # result prints nothing further.
    if converted == 'привет':
        print('Test passed.')
Exemplo n.º 7
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from subprocess import call
from corrector import correct

__author__ = 'Odarchenko N.D.'


if __name__ == '__main__':
    # Script entry point: probe the required command-line tools, then run
    # one sample conversion.
    print('Keyboard layout corrector. \r\nTesting system environment...')
    # Each entry pairs a shell probe command with the message reported when
    # that command exits with a non-zero status.
    commands = (
        ("xsel --clear", 'You need to install xsel'),
        ('setxkbmap > /dev/null', 'You need to install setxkbmap'),
        # Raw string: "\C" has to reach xvkbd verbatim and is not a valid
        # Python escape sequence in a normal string literal.
        (r"xvkbd -xsendevent -text '\C'>/dev/null",
         'You need to install xvkbd'),
    )
    try:
        for command, error_text in commands:
            if call(command, shell=True) != 0:
                raise OSError(error_text)
    except OSError as e:
        print('\n' + '!' * 30)
        print("Test failed:\n" + str(e))
        print('!' * 30 + '\n')
        # SystemExit works even when the site-provided exit() helper is
        # unavailable.
        raise SystemExit(1)
    print('Test example:')
    test = 'ghbdtn'
    print('"%s" -> "%s"' % (test, correct(test)))
    # NOTE(review): printed without checking the conversion result.
    print('Test passed.')
Exemplo n.º 8
0
# -*- coding: utf-8 -*-

import numpy as np
# import segmenter
# import classifier
import corrector
import codecs
# import synthesizer

# Read the whole input file; the context manager closes the handle even if
# reading fails, and avoids shadowing the Python 2 builtin `file`.
with open('corrector/input.txt', 'r') as input_file:
    input_text = input_file.read()
print(input_text)

corrected_text = corrector.correct(input_text)

# Each element of the corrector output is joined with single spaces, and the
# resulting groups are separated by ' , '.
print(' , '.join(' '.join(e) for e in corrected_text))

# corrected_text = corrector.correct(classified_text)
#print "corrected text : " + corrected_text

# synthesized_voice = synthesizer.synthesize(corrected_text)
Exemplo n.º 9
0
        obj = json.loads(resp_payload)
        resultData = obj["resultData"]

        annotated = content
        hurigana = obj.get("hurigana", [])
        for item in hurigana:
            from_ = item["z"]
            to = item["h"]
            annotated = annotated.replace(from_, from_ + "(" + to + ")")
        return (annotated, resultData)

    return trans


if __name__ == '__main__':
    # Script entry point: read UTF-8 text from stdin, echo it, correct it,
    # translate it and print the annotated original plus the translation.
    trans = build_trans()

    src = sys.stdin.read().decode("utf-8")
    print("[Original]")
    print(src.encode("UTF-8"))
    src = correct(src)
    orig, result = trans(src)
    if result is None:
        sys.exit(1)

    print("[Translated]")
    # Encode both texts explicitly. Writing a unicode object straight to
    # stdout falls back to the ASCII codec when output is piped under
    # Python 2, which fails on non-ASCII text.
    # NOTE(review): assumes trans() returns unicode for both values, as the
    # existing encode() call on `result` implies - confirm.
    sys.stdout.write(orig.encode("utf-8"))
    sys.stdout.write("\n")
    sys.stdout.write(result.encode("utf-8"))
Exemplo n.º 10
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from subprocess import call
from corrector import correct

__author__ = 'Odarchenko N.D.'

if __name__ == '__main__':
    # Script entry point: check that the external tools are usable, then
    # convert one sample string.
    print('Keyboard layout corrector. \r\nTesting system environment...')
    # (shell probe command, message reported when the probe exits non-zero)
    commands = (
        ("xsel --clear", 'You need to install xsel'),
        ('setxkbmap > /dev/null', 'You need to install setxkbmap'),
        # Raw string keeps the backslash in "\C" literal; in a normal string
        # literal "\C" is an invalid escape sequence.
        (r"xvkbd -xsendevent -text '\C'>/dev/null",
         'You need to install xvkbd'),
    )
    try:
        for command, error_text in commands:
            if call(command, shell=True) != 0:
                raise OSError(error_text)
    except OSError as e:
        print('\n' + '!' * 30)
        print("Test failed:\n" + str(e))
        print('!' * 30 + '\n')
        # Does not depend on the exit() helper injected by the site module.
        raise SystemExit(1)
    print('Test example:')
    test = 'ghbdtn'
    print('"%s" -> "%s"' % (test, correct(test)))
    # NOTE(review): reported unconditionally; the converted text is not
    # compared with an expected value.
    print('Test passed.')
Exemplo n.º 11
0
    def correct_content(self, content, language):
        """Return the check results for ``content``, cached in a JSON file.

        If ``self.outpath`` already exists, the cached responses are loaded
        and returned without checking anything. Otherwise the content is
        checked - through ``api.check`` in size-limited chunks when
        ``self.online`` is true, through ``corrector`` when it is not - and
        the responses are written to ``self.outpath`` before being returned.

        Public API rate limits: http://wiki.languagetool.org/public-http-api

        Args:
            content: Text to check. In online mode it is split on ``'. '``
                and regrouped into request-sized chunks.
            language: Language code passed to the API. When a request fails,
                a language detected from the chunk replaces it for that
                request and all following ones.

        Returns:
            A dict with ``'title'`` and ``'results'`` keys (or whatever the
            cache file contains). In online mode every result is
            ``{'content': chunk, 'response': api response}``.
        """
        # TODO to be moved to LT processes class

        if os.path.isfile(self.outpath):
            msg = 'title exists in cache: %s'%self.title
            print(self.outpath)
            print(msg)
            logging.info(msg)
            with open(self.outpath) as f:
                return json.load(f)

        responses = {'title': self.title, 'results': []}
        if self.online:
            per_req_size_limit = 6e3  # bytes of text per request
            separator = '. '

            def byte_len(text):
                # Approximate the size of the text as transmitted. The
                # in-memory object size (sys.getsizeof) includes interpreter
                # overhead and varies with the internal string
                # representation, so it is not a stable measure.
                if isinstance(text, bytes):
                    return len(text)
                return len(text.encode('utf-8', 'replace'))

            payloads = []
            test_chunks = []
            chunk = []
            chunk_size = 0
            for sentence in content.split(separator):
                # Track the size of the joined chunk incrementally instead
                # of re-joining all its sentences on every iteration.
                if chunk:
                    chunk_size += len(separator)
                chunk.append(sentence)
                chunk_size += byte_len(sentence)
                if chunk_size > per_req_size_limit:
                    # The sentence that crosses the limit stays in the chunk.
                    payloads.append(separator.join(chunk))
                    test_chunks.append((chunk[0], chunk[-1]))
                    chunk = []
                    chunk_size = 0
            if chunk:
                # add last chunk
                payloads.append(separator.join(chunk))
                test_chunks.append((chunk[0], chunk[-1]))

            # send requests to api
            # TODO smarter rate limit control needed
            total_requests = len(payloads)
            for i, payload in enumerate(payloads):
                try:
                    response = api.check(payload,
                                         api_url=self.languagetool,
                                         lang=language)
                # TODO check language, if confidence lower than 0.90 resend
                except Exception:
                    # NOTE(review): every failure is treated as a language
                    # problem; the traceback is logged so that other causes
                    # remain visible.
                    msg = "%s language error. Trying to detect the language." % language
                    logging.warning(msg, exc_info=True)
                    # Probe the last and then the first sentence of the
                    # chunk to obtain detected language codes.
                    first_sentence, last_sentence = test_chunks[i]
                    response = api.check(last_sentence,
                                         api_url=self.languagetool,
                                         lang=language)
                    language_bottom = response['language']['detectedLanguage']['code']
                    response = api.check(first_sentence,
                                         api_url=self.languagetool,
                                         lang=language_bottom)
                    language_top = response['language']['detectedLanguage']['code']
                    if language != language_top:
                        language = language_top
                    else:
                        language = language_bottom
                    msg = "%s detected as new language"%language
                    logging.info(msg)
                    response = api.check(payload,
                                         api_url=self.languagetool,
                                         lang=language)
                message = '%i/%i response sent'%(i+1, total_requests)
                print(message)
                logging.info(message)
                if i+1 != total_requests:
                    # wait at all except the last LT api call
                    time.sleep(4)
                responses['results'].append({'content': payload,
                                             'response': response})
        else:
            chunks = corrector.get_chunks(content)
            corrector.correct(chunks, responses)

        with open(self.outpath, 'w') as out:
            json.dump(responses, out, indent=2)
        return responses
Exemplo n.º 12
0
    def correcttest(self):
        """Serve the test-upload page and grade an uploaded PDF test.

        Unauthenticated users are redirected to the login view. On a POST
        the uploaded file is validated, converted with ``PDF2jpg``, graded by
        ``corrector.correct`` against the answer key of the selected test,
        and the outcome is stored in the ``results`` collection. The graded
        image is moved to ``results/<inserted id>.png`` and that file name
        is saved on the stored document as ``HREF``.

        Returns:
            A redirect (login required or invalid upload) or the rendered
            ``uploadtest.html`` page.
        """
        if not login.current_user.is_authenticated:
            return redirect(url_for('.login_view'))

        found = tests.find()

        if request.method == 'POST':
            title = request.form.get('title')
            tfound = tests.find_one({"TITLE": title})
            # if request does not contain the file part
            if 'file' not in request.files:
                flash('No file was sent', category='danger')
                return redirect(request.url)
            upload = request.files['file']
            # if user does not select file, browser will
            # submit an empty part without filename
            if upload.filename == '':
                flash('No file was selected', category='danger')
                return redirect(request.url)
            # if file was selected but of the wrong type
            if upload and not self.allowed_file(upload.filename):
                flash('Please select a .pdf file', category='danger')
                return redirect(request.url)
            # if file was selected & is correct type
            if upload and self.allowed_file(upload.filename):
                as_jpeg = PDF2jpg.convert(upload)
                # fetches the answer key corresponding to the test
                key = self.getAnswerKey(tfound)
                print(key)
                # corrects the test image using the answer key
                # NOTE(review): unpacked as (location, correct, amount,
                # score, flag) - confirm this order against
                # corrector.correct.
                loc, corr, am, sc, flag = corrector.correct(as_jpeg, key)
                # The format string labels the value as GMT, so the
                # timestamp is taken in UTC rather than local time.
                ctime = time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                      time.gmtime())
                corrected = {
                    "TEST": title,
                    "SCORE": sc,
                    "CORRECT": corr,
                    "AMOUNT": am,
                    "FLAG": flag,
                    "CREATED": ctime
                }
                # insert into the db
                result = results.insert_one(corrected)
                # the MongoDB ObjectID in string form names the stored image
                stored_name = str(result.inserted_id) + '.png'
                # move and give a unique name to the test image for storage
                # destination = path to file
                destination = shutil.move(loc, 'results/' + stored_name)
                # update the document with test file location; update_one is
                # the non-deprecated counterpart of insert_one
                results.update_one({'_id': result.inserted_id},
                                   {"$set": {
                                       "HREF": stored_name
                                   }},
                                   upsert=False)
                print('NEW_FILE_SAVED={}'.format(destination))
                flash("File was corrected. Visit 'Test Results' to see scores",
                      category='success')
                return render_template('sb-admin/pages/uploadtest.html',
                                       tests=found,
                                       admin_view=self)

        # Reached on non-POST requests, and on a POST whose file object is
        # falsy.
        self.header = "Correct Test"
        return render_template('sb-admin/pages/uploadtest.html',
                               tests=found,
                               admin_view=self)
 def test_correct(self):
     # correct() is expected to fill in a non-empty 'results' entry on the
     # dict it is given.
     payload = {'title':'test'}
     correct(self.test_chunks, payload)
     result_count = len(payload['results'])
     self.assertNotEqual(result_count, 0)