Пример #1
0
def get_text_from_image(image_data, app_id, app_key, app_secret, api_version=0, timeout=3):
    """
    Recognise the text in an image with Baidu's custom-template OCR.

    :param image_data: image content passed straight to ``client.custom``
    :param app_id: Baidu AIP application id
    :param app_key: Baidu AIP API key
    :param app_secret: Baidu AIP secret key
    :param api_version: currently unused; kept so existing callers keep working
    :param timeout: connection timeout in seconds
    :return: list of recognised strings, or ``""`` when the API reports an error
    """
    client = AipOcr(appId=app_id, apiKey=app_key, secretKey=app_secret)
    client.setConnectionTimeoutInMillis(timeout * 1000)

    template_sign = "Nsdax2424asaAS791823112"
    result = client.custom(image_data, template_sign)

    # Template responses carry ``error_code: 0`` on success, so the mere
    # presence of the key is not an error -- only a non-zero code is.
    if result.get("error_code"):
        print("baidu api error: ", result.get("error_msg"))
        return ""

    # General OCR answers use ``words_result``; template answers put their
    # rows under ``data.ret`` with the text in ``word``.
    if "words_result" in result:
        return [entry["words"] for entry in result["words_result"]]
    rows = (result.get("data") or {}).get("ret") or []
    return [row["word"] for row in rows]
Пример #2
0
class BaiduOcr(object):
    """Small wrapper around a Baidu ``AipOcr`` client for template OCR."""

    def __init__(self):
        # Credentials are read from the project's ocrConfig mapping.
        config = ocrConfig.config
        self.client = AipOcr(config['APP_ID'], config['API_KEY'], config['SECRET_KEY'])

    def get_file_content(self, filePath):
        """Return the raw bytes of the file at ``filePath``."""
        with open(filePath, 'rb') as fp:
            return fp.read()

    def getData(self, url):
        """Download the image at ``url`` and run template OCR on it.

        The downloaded bytes are passed to the client directly instead of
        being written to (and re-read from) a fixed-name scratch file in the
        working directory, which collided between concurrent calls and left
        litter behind.

        :param url: address of the image to recognise
        :return: the response returned by ``client.custom`` (also printed)
        """
        response = requests.get(url)
        result = self.client.custom(response.content, 'f89f26bdb9a26b18447a025a0e9911a3')
        print(result)
        return result
Пример #3
0
        # Load the current image via get_file_content. The path is built as
        # <fatherUrl>/img/<pdf name without its last 4 characters>/<image name>.
        image = get_file_content(fatherUrl + '/img/' +
                                 str(pdfNames[pdfIndex])[:-4] + '/' +
                                 imgFileName[txtIndex])

        # Progress line: current image name and txtIndex out of imgFileNamesLen.
        print('正在读取【' + imgFileName[txtIndex] + '】' + '   进度为:' +
              str(txtIndex) + '/' + str(imgFileNamesLen))
        """ 如果有可选参数 """
        # NOTE(review): options and classifierId are assigned below but are not
        # passed to client.custom in the lines visible here -- confirm whether
        # they are used further down or are leftovers.
        options = {}
        options["detect_direction"] = "true"
        options["probability"] = "true"
        templateSign = "f3b35f4c3d36db6b89c9608ea288d8b6"
        classifierId = int(1)
        """ 带参数调用通用文字识别, 图片参数为本地图片 """

        # Template OCR call; on any exception print it (the message asks the
        # user to check the network) and move on to the next loop iteration.
        try:
            result = client.custom(image, templateSign)
        except Exception as e:
            print('请检查当前网络,问题如下:')
            print(e)
            continue
        # else:
        # print(result)

        # When the response says the image did not match the template
        # (data.isStructured is False), remember the image name in
        # ocrFalseImg and skip the rest of this iteration.
        try:
            if str(result['data']['isStructured']) == 'False':
                ocrFalseImg.append(imgFileName[txtIndex])
                print('------------------')
                print('【' + str(imgFileName[txtIndex]) + '】' +
                      '   结构化不匹配,跳过【文件写入】循环')
                print('------------------')
                continue
Пример #4
0
class OcrSta:
    """Run Baidu template OCR over a folder of screenshots and tabulate the rows.

    ``model`` (read from ``need_dict``, default 0) picks the template id, the
    picture folder and the row layout: 1 = shenyuan, 2 = fuben, 3 = tuanben.
    Recognised rows are collected in ``my_info_dict`` keyed by player name.

    The print calls use the single-argument parenthesised form so the class
    parses on both Python 2 and Python 3.
    """

    # Rank titles removed from recognised names. Order matters: a title that
    # contains a shorter one must be stripped before the shorter one.
    _TITLES = (
        u'\u526f\u65cf\u957f',
        u"\u65cf\u5458",
        u"\u65cf\u957f",
        u'\u957f\u8001',
        u'\u65b0\u4eba',
        u'\u7cbe\u82f1',
        u'\u8c6a\u6770',
    )

    def __init__(self, need_dict):
        """Build the OCR client and collect the pictures for the chosen model.

        :param need_dict: mapping with an optional ``"model"`` index and
            optional ``"appid"`` / ``"api_key"`` / ``"secret_key"`` overrides
        """
        # NOTE(review): the defaults are credentials embedded in source; the
        # need_dict overrides let callers supply their own instead.
        self.appid = need_dict.get("appid", "17190994")
        self.api_key = need_dict.get("api_key", "QG8ZlQll0h3Ue1sSau33oAXi")
        self.secret_key = need_dict.get("secret_key", "DQSYSiVelTsbX2SF9iY5qkRN8krpMsOM")
        self.model = need_dict.get("model", 0)
        # Template ids by model index: (none), shenyuan, fuben, tuanben.
        self.templateList = ["", "8bfde40b92dbc0327b9856eceb132881", "d13744efe98c81097165696ecb306ba0", "01a1dc76d0b37cd5540365026ea302f1"]
        self.table_kw = [set(), {"damage", "num", "name"}, {"name", "num"}, {"name", "num"}]
        self.picture_path = ["moren/", "shenyuan/", "fuben/", "tuanben/"]
        self.client = AipOcr(self.appid, self.api_key, self.secret_key)
        self.pic_set = self.get_all_picture()
        self.options = {"templateSign": self.templateList[self.model]}
        self.my_info_dict = {}

    def get_all_picture(self):
        """Return the set of paths under the model's folder whose name contains ".jpg"."""
        path = "./" + self.picture_path[self.model]
        return set(path + file_name for file_name in os.listdir(path) if ".jpg" in file_name)

    def get_file_content(self, filepath):
        """Return the raw bytes of ``filepath``."""
        with open(filepath, 'rb') as fp:
            return fp.read()

    def _strip_titles(self, word):
        """Remove every known rank title from a recognised name."""
        for title in self._TITLES:
            word = word.replace(title, "")
        return word

    def _as_int(self, value):
        """Sort key helper: ``int(value)``, or 0 when the text is not a number."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return 0

    def OcrPic(self):
        """Send every collected picture to the OCR service and fill ``my_info_dict``.

        Each returned row has a ``word_name`` of '#'-separated parts whose
        third part names the column ("name", "damage" or "num"). A "name" row
        starts a new record; following rows fill that record in.
        """
        pic_num = 0
        for pic in self.pic_set:
            pic_num += 1
            print(pic_num)
            image = self.get_file_content(pic)
            response = self.client.custom(image, self.options)
            print(response)
            name = ""
            for item in response["data"]["ret"]:
                # Walk through every row of this table.
                info_list = item["word_name"].strip().split('#')
                if len(info_list) < 3:
                    # No column part to dispatch on; ignore the row.
                    continue
                column = info_list[2]
                if self.model == 1:  # shenyuan layout
                    if column == "name":
                        name = self._strip_titles(item["word"])
                        self.my_info_dict[name] = {"damage": 0, "num": 0}
                    elif column in ("damage", "num"):
                        # setdefault: a value row may arrive before any name
                        # row, which used to raise KeyError.
                        record = self.my_info_dict.setdefault(name, {"damage": 0, "num": 0})
                        record[column] = item["word"]
                elif self.model == 2 or self.model == 3:  # fuben / tuanben layout
                    if column == "name":
                        name = self._strip_titles(item["word"])
                        if not name:
                            name = " "
                        self.my_info_dict[name] = {"num": 0}
                    elif column == "num":
                        digits = "".join(char for char in item["word"] if char.isdigit())
                        self.my_info_dict.setdefault(name, {"num": 0})["num"] = digits

    def PrintToScreen(self):
        """Print one tab-separated line per record: name, then label/value pairs."""
        for key in self.my_info_dict:
            out_str = key + '\t'
            for label in self.my_info_dict[key]:
                out_str += label + '\t' + str(self.my_info_dict[key][label]) + '\t'
            print(out_str)

    def PrintToFile(self):
        """Write the records, sorted by value in descending order, to ``<folder>_<YYYYMMDD>``.

        NOTE(review): header lines are written as plain string literals while
        data rows are GBK-encoded before writing -- this relies on Python 2
        file semantics; confirm the intended encoding of the header.
        """
        today = datetime.date.today().strftime("%Y%m%d")
        out_file = self.picture_path[self.model].strip('/') + '_' + str(today)
        # The with-block closes the file even when a write fails.
        with open(out_file, 'w') as f:
            if self.model == 1:  # shenyuan layout
                f.write("shenyuan" + '\t' + today + '\n')
                f.write("名字" + '\t' + "排名" + '\t' + "总伤害" + '\t' + "次数" + '\n')
                # Header first, then rows ranked by damage.
                rows = [[key, self.my_info_dict[key]['damage'], self.my_info_dict[key]['num']]
                        for key in self.my_info_dict]
                rows.sort(key=lambda row: self._as_int(row[1]), reverse=True)
                for index, row in enumerate(rows, 1):
                    out_str = row[0] + '\t' + str(index) + '\t' + str(row[1]) + '\t' + str(row[2]) + '\n'
                    f.write(out_str.encode('gbk', 'ignore'))
            if self.model == 2 or self.model == 3:  # fuben / tuanben layout
                f.write("副本名字" + '\t' + today + '\n')
                f.write("名字" + '\t' + "成绩" + '\n')
                # Header first, then rows ranked by score.
                rows = [[key, self.my_info_dict[key]['num']] for key in self.my_info_dict]
                rows.sort(key=lambda row: self._as_int(row[1]), reverse=True)
                for row in rows:
                    out_str = row[0] + '\t' + str(row[1]) + '\n'
                    f.write(out_str.encode('gbk', 'ignore'))
Пример #5
0
    # Fetch an access token through get_access_token(); without one there is
    # nothing to send, so return None.
    access_token = get_access_token()
    if access_token is None:
        return None
    print access_token
    # The token goes into the query string; the image and the template id are
    # sent as the form-encoded request body.
    params = {'access_token': access_token}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    rq = {'image': img, 'templateSign': '75728a7d5fbaa201049f6198f651305f2019'}
    res = requests.request(
        'POST',
        'https://aip.baidubce.com/rest/2.0/solution/v1/iocr/recognise',
        params=params,
        headers=headers,
        data=rq)
    # NOTE(review): the decoded response is only printed; every path visible
    # here returns None, so callers never receive the OCR result.
    if res:
        print res.json()
    return None


# Script section: read one sample image, run client.basicAccurate on it and
# pass the 'words_result' entry of the response to print_result.
image = get_file_content('test_aip.jpg')
data0 = client.basicAccurate(image)
print_result(data0['words_result'])
# get_img(image)
from jy_word.File import File
my_file = File()
# Run the template call on the same image and store the response in
# aip_test.json through the project's File helper.
data1 = client.custom(image, template_id)
# print data1
my_file.write('aip_test.json', data1)

# NOTE(review): the statements above execute at import time; this guard does
# nothing.
if __name__ == "__main__":
    pass
Пример #6
0
# Output workbook name: 'gene' + current timestamp (YYYYmmddHHMMSS) + '.xlsx'.
now = datetime.now().strftime("%Y%m%d%H%M%S")
excelPath = r'gene' + now + '.xlsx'
excelWriter = pd.ExcelWriter(excelPath, engine='openpyxl')
# file_name(file_dir) supplies the image names to process.
imgs = file_name(file_dir)
if len(imgs) == 0:
    print('not exist images')
else:
    for img in imgs:
        # file_dir and img are joined by plain concatenation.
        img_path = file_dir + img
        image = get_file_content(img_path)
        # From here on img holds the name with '.png' / '.jpg' removed.
        img = img.replace('.png', '').replace('.jpg', '')
        total = {}
        # NOTE(review): only t1, t2 and key are filled in the lines visible
        # here; t3..t7 and total are presumably used further down -- confirm.
        t1, t2, t3, t4, t5, t6, t7, key = [], [], [], [], [], [], [], []
        """ 调用自定义模板文字识别 """
        # NOTE(review): this call passes no template and its result is
        # discarded; it looks like a leftover extra request -- confirm.
        client.custom(image)
        """ 如果有可选参数 """
        options = {}
        options["templateSign"] = "3f8c7bd213fbe2e82d4f3881f450fdb1"  # 3-mutation template
        # options["classifierId"] = 31232   specify a classifier
        """ 带参数调用自定义模板文字识别 """
        # Template OCR with options; the recognised rows are under data.ret.
        result = client.custom(image, options)['data']['ret']
        for r in result:
            # The column label is the last '#'-separated part of word_name.
            word_name = r['word_name'].split("#")[-1]
            word = r['word']
            # key keeps each label once, in first-seen order.
            if word_name not in key:
                key.append(word_name)
            # Empty cells are stored as '/'.
            if word_name == '基因':
                t1.append(r['word'] if len(r['word']) > 0 else u'/')
            if word_name == '转录本编号':
                t2.append(r['word'] if len(r['word']) > 0 else u'/')
Пример #7
0
    def handler_ocr(self):
        """
        Handler: OCR text recognition.

        Sends every contract image listed in ``self.projects`` to the Baidu
        template OCR client. For each image the raw response is stored in
        ``data['ocr_text']`` and the extracted contract name and date in
        ``data['result']``; progress and errors are appended to ``self.cmd``.
        Returns early when there are no images or when ``self.stop_work`` is
        set during the scan.

        :return: None
        """
        if self.projects['count'] == 0:
            self.cmd.append(self.helper_color('<br>无合同图片,不能进行识别!', 'warning'))
            return

        # Connect to Baidu Cloud.
        self.cmd.append('\n正在连接到百度云 OCR 接口...')
        QApplication.processEvents()

        # NOTE(review): credentials are embedded in source; consider loading
        # them from configuration instead.
        app_id = "11407546"
        api_key = "lY6vQLFc1zMBCotBThZWgEPO"
        secret_key = "FhWvLVonrxbtKa4bKFgVLAFnsMyRKbTU"
        template_id1 = "2a9be4f38c0806b9f5dfd3c3616b7560"
        # template_id2 = "f61002a4a009e77a0cd53775b64a771b"

        client = AipOcr(app_id, api_key, secret_key)
        client.setConnectionTimeoutInMillis(20000)
        client.setSocketTimeoutInMillis(30000)

        # Text recognition.
        for project in self.projects['items']:
            # Scan cancelled midway.
            if self.stop_work:
                return

            self.cmd.append('<br>正在识别 ' + self.helper_color(project['unit'], 'info') + ' 的合同图片...')
            QApplication.processEvents()

            for index, data in enumerate(project['data'], 1):
                # Scan cancelled midway.
                if self.stop_work:
                    return

                data['result'] = {'name': '', 'date': ''}
                data['modified'] = False
                progress = str(index) + '/' + str(project['count'])
                try:
                    # Read the bytes first so the file is closed before the
                    # (slow) network call is made.
                    with open(data['origin'], 'rb') as fp:
                        image = fp.read()
                    data['ocr_text'] = client.custom(image, template_id1)
                    QApplication.processEvents()

                    if data['ocr_text']['error_code'] != 0:
                        self.cmd.append(progress + self.helper_color('  识别失败: ', 'warning')
                                        + self.helper_color(os.path.abspath(data['origin']), 'disabled'))
                        error_code = data['ocr_text']['error_code']
                        error_msg = data['ocr_text']['error_msg']
                        self.cmd.append('---- 错误代码:' + str(error_code))
                        self.cmd.append('---- 错误描述:' + str(error_msg))
                    else:
                        # The code reads the date from the first returned row
                        # and the name from the second.
                        rows = data['ocr_text']['data']['ret']
                        data['result'] = {
                            'name': rows[1]['word'],
                            'date': rows[0]['word']
                        }

                        self.cmd.append(progress + self.helper_color('  识别成功:', 'successful')
                                        + self.helper_color(os.path.abspath(data['origin']), 'disabled'))
                        self.cmd.append('---- 合同名称:' + self.helper_color(data['result']['name'], 'importance'))
                        self.cmd.append('---- 合同日期:' + data['result']['date'])
                except Exception as exc:
                    # Best effort per image: report the failure and its cause,
                    # then carry on. Unlike a bare ``except`` this does not
                    # swallow KeyboardInterrupt / SystemExit.
                    self.cmd.append(progress + self.helper_color('  识别出错: ', 'danger')
                                    + os.path.abspath(data['origin']))
                    self.cmd.append('---- 错误描述:' + str(exc))

                QApplication.processEvents()

        self.cmd.append(self.helper_color('<br>OCR 识别程序已完成!', 'successful'))
        self.handler_disable_all_btn(True)