예제 #1
0
def main():
    """Command-line entry point: run the requested Tika actions and print results.

    The jar location is taken from the ``jar`` argument, falling back to the
    ``TIKA_APP_JAR`` environment variable. Each enabled action (detect, text,
    language, all, metadata) runs in that order with the same input arguments
    (file path, payload, and standard input when requested). The ``all`` and
    ``metadata`` actions additionally switch on pretty printing. An
    ``IOError`` raised while running or printing an action is silently
    ignored.
    """
    options = get_args()
    jar_location = options.jar or os.environ.get("TIKA_APP_JAR", None)
    tika = TikaApp(jar_location)

    call_kwargs = dict(
        path=options.file,
        payload=options.payload,
        objectInput=sys.stdin if options.stdin else None,
    )

    # (command-line flag, TikaApp method name, switches pretty printing on)
    actions = (
        ("detect", "detect_content_type", False),
        ("text", "extract_only_content", False),
        ("language", "detect_language", False),
        ("all", "extract_all_content", True),
        ("metadata", "extract_only_metadata", True),
    )

    try:
        for flag, method_name, wants_pretty in actions:
            if not getattr(options, flag):
                continue
            if wants_pretty:
                # Once set, the key stays in the shared kwargs for the
                # actions that follow.
                call_kwargs["pretty_print"] = True
            action = getattr(tika, method_name)
            print(action(**call_kwargs))
    except IOError:
        pass
예제 #2
0
# -*- coding: utf-8 -*- 
# @Time : 2020/12/11 10:25 
# @Author : 王西亚 
# @File : c_doc.py

from tikapp import TikaApp

# Build a client around a local Tika application jar.
tika_client = TikaApp(
    file_jar="/usr/local/Cellar/tika/1.24.1_1/libexec/tika-app-1.24.1.jar"
)

# Extract only the metadata of one sample PDF.
metadata = tika_client.extract_only_metadata(
    "/Users/wangxiya/Downloads/000101020062805119-00.pdf"
)

# Show the type of the returned object first, then the object itself.
for _shown in (type(metadata), metadata):
    print(_shown)

# from tika import parser

# parsed = parser.from_file('/path/to/file')
# print(parsed["metadata"])
# print(parsed["content"])

# parsed = parser.from_file('/Users/wangxiya/Downloads/000101020062805119-00.pdf', 'http://localhost:9998/tika')
# metadata = parsed["metadata"]
# print(type(metadata))
# print(metadata)
예제 #3
0
from tikapp import TikaApp

# Client backed by a local Tika application jar.
tika_client = TikaApp(
    file_jar="/Users/yma2/Documents/_garage/python/cxm/tika/tika-app-1.20.jar"
)

# Document that every call below inspects.
analyzeFile = "/Users/yma2/Downloads/Azure_Developer_Guide_eBook_ja-JP.pdf"

# Print, in order: content type, language, text content, metadata.
for _method_name in (
    "detect_content_type",
    "detect_language",
    "extract_only_content",
    "extract_only_metadata",
):
    print(getattr(tika_client, _method_name)(analyzeFile))
    def process(self) -> str:
        """
        Extract the document's metadata with Tika and store it as a JSON file.

        The metadata file is written under self.file_content.work_root_dir
        with the fixed name self.FileName_MetaData. A successful result also
        carries the metadata format (self.MetaDataFormat_Json).

        Depending on the configured mode, the metadata is obtained either
        through TikaServer.from_file (server mode) or through a
        TikaApplication built on a local jar file (any other mode). The result
        of super().process() is returned unchanged when Tika is disabled in
        the settings, or when the server url / client application setting
        compares equal to ''.

        Note: if memory leaks show up, extract the metadata in a separate
        process, write it to a file, and parse that file in this process.

        :return: the default result from super().process(), or a success /
            failure result describing the outcome of the extraction
        """
        default_result = super().process()
        out_metadata_file_fullname = CFile.join_file(
            self.file_content.work_root_dir, self.FileName_MetaData)
        in_file_fullname = self.file_info.file_name_with_full_path

        # The second argument of xpath_one is presumably the fallback used
        # when the setting is absent -- TODO confirm against its definition.
        if not settings.application.xpath_one(
                self.Path_Setting_Dependence_Tika_Enable, True):
            return default_result

        tika_dependence_mode = settings.application.xpath_one(
            self.Path_Setting_Dependence_Tika_Mode, self.Name_Server)
        if CUtils.equal_ignore_case(tika_dependence_mode, self.Name_Server):
            tika_server_url = settings.application.xpath_one(
                self.Path_Setting_Dependence_Tika_Server_Url, None)
            tika_server_connect_timeout = settings.application.xpath_one(
                self.Path_Setting_Dependence_Tika_Server_Timeout, 30)
            if CUtils.equal_ignore_case(tika_server_url, ''):
                return default_result

            def fetch_metadata():
                # Server mode: let the Tika server parse the file.
                parsed = TikaServer.from_file(
                    in_file_fullname,
                    tika_server_url,
                    requestOptions={'timeout': tika_server_connect_timeout})
                return parsed["metadata"]
        else:
            tika_application = settings.application.xpath_one(
                self.Path_Setting_Dependence_Tika_Client_App, None)
            if CUtils.equal_ignore_case(tika_application, ''):
                return default_result

            if not CFile.file_or_path_exist(tika_application):
                return CResult.merge_result(
                    self.Failure,
                    '文档[{0}]的元数据无法提取, 详细原因为: [依赖中间件{1}文件不存在, 请修正后重试!]'.format(
                        in_file_fullname, tika_application))

            def fetch_metadata():
                # Client mode: run the local Tika application jar.
                tika_client = TikaApplication(file_jar=tika_application)
                return tika_client.extract_only_metadata(in_file_fullname)

        # Both modes share the same save-and-report tail. Any exception raised
        # while fetching, saving, or building the success result is reported
        # as a failure result instead of propagating.
        try:
            meta_data_dict = fetch_metadata()
            json_obj = CJson()
            json_obj.load_obj(meta_data_dict)
            json_obj.to_file(out_metadata_file_fullname)
            return CResult.merge_result_info(
                CResult.merge_result(
                    self.Success,
                    '文档[{0}]的元数据提取成功'.format(in_file_fullname)),
                self.Name_Format, self.MetaDataFormat_Json)
        except Exception as error:
            return CResult.merge_result(
                self.Failure, '文档[{0}]的元数据提取过程出现错误, 详细信息为: [{1}]'.format(
                    in_file_fullname, str(error)))