def tika(conf, attachments):
    """Attach Apache Tika reports to the whitelisted attachments, in place.

    Args:
        conf (dict): settings of this post processor; the keys read are
            "enabled", "path_jar", "memory_allocation" and
            "whitelist_cont_types"
        attachments (list): attachment dicts of the email

    Returns:
        None. Every analysed attachment gains a "tika" key.
    """
    if not conf["enabled"]:
        return

    # Imported lazily so tikapp is only required when the analysis is on.
    from tikapp import TikaApp

    client = TikaApp(
        file_jar=conf["path_jar"],
        memory_allocation=conf["memory_allocation"])

    for attachment in attachments:
        if attachment.get("is_filtered", False):
            continue
        if attachment["Content-Type"] not in conf["whitelist_cont_types"]:
            continue

        data = attachment["payload"]
        # tika-app only gets payload in base64
        if attachment["content_transfer_encoding"] != "base64":
            data = data.encode("base64")

        attachment["tika"] = client.extract_all_content(
            payload=data, convert_to_obj=True)
def getTextFile(nameOfPDF, jarPath):
    """Extract a PDF with Tika and store the result next to it as text.

    The output name is ``nameOfPDF`` with a trailing ".pdf" replaced by
    ".txt"; any other name simply gets ".txt" appended.

    Args:
        nameOfPDF: path of the document to read.
        jarPath: directory that contains "tika-app-1.22.jar".

    Returns:
        The path of the text file that was written.
    """
    extractor = TikaApp(file_jar=join(jarPath, "tika-app-1.22.jar"))

    # Only an exact, lower-case ".pdf" suffix is stripped.
    stem = nameOfPDF[:-4] if nameOfPDF.endswith(".pdf") else nameOfPDF
    target = stem + ".txt"

    # The open file object itself is handed to Tika as the input.
    with open(nameOfPDF) as source:
        extracted = extractor.extract_all_content(objectInput=source)

    with open(target, "w", encoding='utf-8', errors='replace') as sink:
        sink.write(extracted)

    return target
def run(message):
    """Extract the text of ``message["path"]`` and forward follow-up events.

    When Tika returns content, it is written UTF-8 encoded to a new work
    file under "./data/processing/" and announced as a child message via
    ``sendEvent``. Afterwards the images are handed to
    ``extract_images_pymupdf`` and the original message is re-sent.
    """
    extractor = TikaApp(file_jar="./tika-app/tika-app-1.21.jar")
    text = extractor.extract_only_content(message["path"])

    if text is not None:
        child_id = str(uuid.uuid4())
        child_path = "./data/processing/" + child_id
        with open(child_path, 'wb') as out:
            out.write(text.encode('UTF-8'))

        sendEvent({
            "identifier": child_id,
            "parent": message["identifier"],
            "path": child_path,
            "filename": "pdf.txt",
            "filetype": "unknown",
            "history": [],
            "metadata": {},
            "original_file": False
        })

    extract_images_pymupdf(message)
    sendEvent(message)
class pdf_parser:
    """Turn a document into a one-row DataFrame of (date, content) via Tika."""

    def __init__(self, tika_jar_path):
        """Create the Tika client from the jar at ``tika_jar_path``."""
        self.tika_client = TikaApp(file_jar=tika_jar_path)

    def parse(self, doc_path, file=False):
        """Parse one document.

        Args:
            doc_path: a path handed to Tika, or, when ``file`` is true, the
                raw data that is base64-encoded and sent as payload.
            file: selects the payload mode described above.

        Returns:
            A DataFrame with columns 'date' and 'content' and a single row
            at index 0.
        """
        if file:
            request = {"payload": base64.b64encode(doc_path)}
        else:
            request = {"path": doc_path}
        records = json.loads(self.tika_client.extract_all_content(**request))
        main_record = records[0]

        # Drop single newlines, then squeeze runs of newlines down to one.
        text = re.sub(r'\n(?![\n])', r'', main_record['X-TIKA:content'])
        text = re.sub(r'(\n)(\n+)', r'\1', text)

        # First available timestamp wins; "now" is the last resort.
        stamp = (main_record.get('Last-Modified')
                 or main_record.get('Last-Save-Date')
                 or main_record.get('Creation-Date')
                 or datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ'))
        if stamp:
            stamp = str(dateutil.parser.parse(stamp).date())

        frame = pd.DataFrame(columns=['date', 'content'])
        frame.loc[0] = [stamp, text]
        return frame
def __init__(self,
             index,
             es_host="localhost:9200",
             file_jar='/Users/laofeng/es_home/tika-app-1.23.jar',
             index_content=False,
             create_index=True,
             force_renew_index=False,
             schema_file=None):
    """Set up the Tika and Elasticsearch clients and prepare the index.

    Args:
        index: name of the Elasticsearch index.
        es_host: host passed to ``Elasticsearch``.
        file_jar: path of the tika-app jar.
        index_content: stored on the instance as ``index_content``.
        create_index: create the index from ``schema_file`` when it does
            not exist yet (only done when a schema file is given).
        force_renew_index: delete and re-create an existing index.
        schema_file: UTF-8 JSON file holding the index definition.

    Raises:
        ValueError: ``force_renew_index`` is set for an existing index but
            no ``schema_file`` was given.
    """
    self.tika_app = TikaApp(file_jar)
    self.es = Elasticsearch(es_host)
    self.index = index
    self.index_content = index_content

    def load_schema():
        with open(schema_file, 'r', encoding='utf-8') as f:
            return json.load(f)

    # Check whether the index already exists.
    index_exist = self.es.indices.exists(index)

    if not index_exist and create_index and schema_file:
        self.es.indices.create(index, load_schema())

    # Existing index and forced renewal: delete first, then re-create.
    if index_exist and force_renew_index:
        if schema_file is None:
            raise ValueError(
                "force_renew_index requires schema_file to re-create "
                "the index")
        # Read the schema BEFORE deleting, so an unreadable schema file
        # cannot leave the index deleted and never re-created.
        schema = load_schema()
        self.es.indices.delete(index)
        self.es.indices.create(index, schema)
def __init__(self,
             jar=None,
             memory_allocation=None,
             valid_content_types=None):
    """Create the Tika client and remember the configuration.

    Args:
        jar: path of the Apache Tika app jar.
        memory_allocation: forwarded to ``TikaApp``.
        valid_content_types: content types to analyse; ``None`` (the
            default) means a fresh empty set.
    """
    # A literal ``set()`` default would be one object shared by every
    # instance built without this argument; create it per instance.
    if valid_content_types is None:
        valid_content_types = set()

    # Init Tika
    self._tika_client = TikaApp(file_jar=jar,
                                memory_allocation=memory_allocation)
    self._jar = jar
    self._memory_allocation = memory_allocation
    self._valid_content_types = valid_content_types
class TikaReader(object):
    """Thin convenience wrapper around a ``TikaApp`` client."""

    def __init__(self, path):
        """Build the client from the tika-app jar located at ``path``."""
        client = TikaApp(file_jar=path)
        self.tika_client = client

    def detect_type(self, doc):
        """Return what the client's ``detect_content_type`` gives for ``doc``."""
        detected = self.tika_client.detect_content_type(doc)
        return detected

    def detect_language(self, doc):
        """Return what the client's ``detect_language`` gives for ``doc``."""
        language = self.tika_client.detect_language(doc)
        return language

    def content(self, doc):
        """Return what the client's ``extract_all_content`` gives for ``doc``."""
        extracted = self.tika_client.extract_all_content(doc)
        return extracted
def handle_backup_file(self, context, savepkt):
    """Index the Tika metadata/content of a backed-up regular file.

    For save packets of type ``FT_REG`` the file is analysed with Tika and
    the first returned record is indexed into Elasticsearch together with
    the job id, fd name, job level and directory. Every failure is
    reported through ``JobMessage`` and never aborts the backup.

    Returns:
        ``bRCs['bRC_OK']`` in all cases.
    """
    DebugMessage(context, 100,
                 "handle_backup_file called with " + str(savepkt) + "\n")
    DebugMessage(
        context, 100,
        "fname: " + savepkt.fname + " Type: " + str(savepkt.type) + "\n")
    if savepkt.type == bFileType['FT_REG']:
        DebugMessage(context, 100, "regulaer file, do something now...\n")
        # configure your Elasticsearch server here:
        es = Elasticsearch([{'host': '192.168.17.2', 'port': 9200}])
        # configure your TikaApp jar file here:
        try:
            tika_client = TikaApp(
                file_jar="/usr/local/bin/tika-app-1.20.jar")
        except Exception as ex:
            JobMessage(
                context, bJobMessageType['M_ERROR'],
                'Error indexing %s. Tika error: %s' %
                (savepkt.fname, str(ex)))
            return bRCs['bRC_OK']

        # tika_client has several parser options; extract_only_metadata
        # would return metadata only, this one includes the file contents
        # as text:
        try:
            result_payload = tika_client.extract_all_content(savepkt.fname)
        except Exception as ex:
            JobMessage(
                context, bJobMessageType['M_ERROR'],
                'Error extracting contents from %s. Tika error: %s' %
                (savepkt.fname, str(ex)))
            return bRCs['bRC_OK']

        # result_payload is a list of json strings. Nested structures like
        # tar files or emails with attachments or inline documents are
        # returned as distinct json strings; [0] describes the main file.
        # TODO: care about nested structures, for now only the first/main
        # file is taken.
        try:
            data = json.loads(result_payload)[0]
        except Exception as ex:
            JobMessage(
                context, bJobMessageType['M_ERROR'],
                'Error reading json fields delivered by Tika examining file %s. Json error: %s'
                % (savepkt.fname, str(ex)))
            return bRCs['bRC_OK']

        # Tika may add "Unknown tag (id)" keys with an increasing id, which
        # could exceed the keyword limit of Elasticsearch indices, so they
        # are removed. Iterate over a snapshot of the keys: deleting from a
        # dict while iterating its live key view raises RuntimeError on
        # Python 3.
        for data_keyword in list(data.keys()):
            if data_keyword.startswith("Unknown tag ("):
                del data[data_keyword]

        # Tika adds some empty lines at the beginning of the content.
        if 'X-TIKA:content' in data:
            data['X-TIKA:content'] = data['X-TIKA:content'].strip()

        data['bareos_jobId'] = self.jobId
        data['bareos_fdname'] = self.fdname
        # NOTE(review): unichr only exists on Python 2 - confirm the
        # interpreter this plugin is deployed on.
        data['bareos_joblevel'] = unichr(self.level)
        data['bareos_directory'] = os.path.dirname(savepkt.fname)
        try:
            es.index(index="bareos-test", doc_type='_doc', body=data)
        except Exception as ex:
            JobMessage(
                context, bJobMessageType['M_ERROR'],
                'Error indexing %s. Elastic error: %s' %
                (savepkt.fname, str(ex)))
    return bRCs['bRC_OK']
def run(message):
    """Split the document at ``message["path"]`` into text and images.

    Text: the content Tika extracts is written UTF-8 encoded to a new work
    file under "./data/processing/" and announced with ``sendEvent``.

    Images: the document is opened as a zip archive and unpacked into a
    temporary directory; every member whose name contains ".png", ".jpeg"
    or ".jpg" is moved to its own work file and announced with
    ``sendEvent``. The temporary directory is removed even when
    extraction or forwarding fails.

    Finally the original message is re-sent.
    """
    # Text
    tika_client = TikaApp(file_jar="./tika-app/tika-app-1.21.jar")
    tika_result = tika_client.extract_only_content(message["path"])
    if tika_result is not None:
        identifier = str(uuid.uuid4())
        workfile = "./data/processing/" + identifier
        with open(workfile, 'wb') as f:
            f.write(tika_result.encode('UTF-8'))
        sendEvent({
            "identifier": identifier,
            "parent": message["identifier"],
            "path": workfile,
            "filename": "doc.txt",
            "filetype": "unknown",
            "history": [],
            "metadata": {},
            "original_file": False
        })

    # Images
    image_markers = (".png", ".jpeg", ".jpg")
    with ZipFile(message["path"], 'r') as zip_obj:
        processing_dir = "./data/processing"
        tmp_identifier = os.path.join(processing_dir, str(uuid.uuid4()))
        try:
            zip_obj.extractall(tmp_identifier)
            for root, _dirs, files in os.walk(tmp_identifier):
                for filename in files:
                    # NOTE(review): substring match, so "a.png.bak" also
                    # qualifies and ".PNG" does not - confirm intent.
                    if not any(m in filename for m in image_markers):
                        continue
                    new_identifier = str(uuid.uuid4())
                    processing_dest = os.path.join(processing_dir,
                                                   new_identifier)
                    new_message = {
                        "identifier": new_identifier,
                        "parent": message["identifier"],
                        "path": processing_dest,
                        "filename": filename,
                        "filetype": "unknown",
                        "history": [],
                        "metadata": {},
                        "original_file": False
                    }
                    shutil.move(os.path.join(root, filename),
                                processing_dest)
                    sendEvent(new_message)
        finally:
            # Do not leak the unpacked tree when anything above raises.
            if os.path.isdir(tmp_identifier):
                shutil.rmtree(tmp_identifier)

    sendEvent(message)
def tika(conf, attachments):
    """Attach Apache Tika reports to the whitelisted attachments, in place.

    Args:
        conf (dict): settings of this post processor; the keys read are
            "enabled", "path_jar", "memory_allocation" and
            "whitelist_content_types"
        attachments (list): attachment dicts of the email

    Returns:
        None. Attachments with a non-empty Tika result gain a "tika" key.
    """
    if not conf["enabled"]:
        return

    # Imported lazily so tikapp is only required when the analysis is on.
    from tikapp import TikaApp

    client = TikaApp(
        file_jar=conf["path_jar"],
        memory_allocation=conf["memory_allocation"])

    allowed = conf.get("whitelist_content_types", [])
    if not allowed:
        log.warning(
            "Apache Tika analysis setted, without whitelist content types")
        return

    for item in attachments:
        if item.get("is_filtered", False) or \
                item["Content-Type"] not in allowed:
            continue

        data = item["payload"]
        if item["content_transfer_encoding"] != "base64":
            try:
                data = data.encode("base64")
            except UnicodeError:
                # e.g. content_transfer_encoding u'x-uuencode': it's not
                # binary with strange encoding, skip the attachment
                continue

        # tika-app only gets payload in base64
        try:
            report = client.extract_all_content(
                payload=data, convert_to_obj=True)
            if report:
                item["tika"] = report
        except JSONDecodeError:
            log.warning(
                "JSONDecodeError for {!r} in Tika analysis".format(
                    item["md5"]))
def main():
    """Command line entry point: run the requested Tika actions and print them.

    The jar comes from ``args.jar`` or the TIKA_APP_JAR environment
    variable. The input is described by ``args.file``, ``args.payload``
    and, when ``args.stdin`` is set, ``sys.stdin``. An ``IOError`` raised
    while running the actions is silently ignored.
    """
    args = get_args()
    tika = TikaApp(args.jar or os.environ.get("TIKA_APP_JAR", None))

    parameters = {
        "path": args.file,
        "payload": args.payload,
        "objectInput": sys.stdin if args.stdin else None
    }

    # (argument flag, TikaApp method, request pretty printed output)
    actions = (
        ("detect", "detect_content_type", False),
        ("text", "extract_only_content", False),
        ("language", "detect_language", False),
        ("all", "extract_all_content", True),
        ("metadata", "extract_only_metadata", True),
    )

    try:
        for flag, method_name, pretty in actions:
            if not getattr(args, flag):
                continue
            if pretty:
                parameters["pretty_print"] = True
            print(getattr(tika, method_name)(**parameters))
    except IOError:
        pass
def main():
    """Command line entry point: run the requested Tika actions and print them.

    The jar path is looked up, in this order, in ``args.jar``, the
    TIKA_APP_JAR environment variable and the built-in default. The input
    is ``args.file`` when given, otherwise ``args.payload``; with neither
    nothing is done. An ``IOError`` raised while running the actions is
    silently ignored.
    """
    args = get_args()

    command_line = {"TIKA_APP_JAR": args.jar} if args.jar else dict()
    defaults = {"TIKA_APP_JAR": "/opt/tika/tika-app-1.15.jar"}
    options = ChainMap(command_line, os.environ, defaults)

    tika = TikaApp(options['TIKA_APP_JAR'])

    try:
        if args.file:
            source = {"path": args.file}
        elif args.payload:
            source = {"payload": args.payload}
        else:
            source = None

        if source is not None:
            if args.detect:
                print(tika.detect_content_type(**source))
            if args.text:
                print(tika.extract_only_content(**source))
            if args.language:
                print(tika.detect_language(**source))
            if args.all:
                print(tika.extract_all_content(pretty_print=True, **source))
    except IOError:
        pass
class TikaAnalysis(object):
    """Add Apache Tika output to mail attachment results.

    Args:
        jar: path of the Apache Tika app jar.
        memory_allocation: forwarded to ``TikaApp``.
        valid_content_types: content types to analyse; ``None`` (the
            default) means a fresh empty set.
    """

    def __init__(self,
                 jar=None,
                 memory_allocation=None,
                 valid_content_types=None):
        # A literal ``set()`` default would be one object shared by every
        # instance built without this argument, and it is reachable (and
        # mutable) through the ``valid_content_types`` property.
        if valid_content_types is None:
            valid_content_types = set()

        # Init Tika
        self._tika_client = TikaApp(file_jar=jar,
                                    memory_allocation=memory_allocation)
        self._jar = jar
        self._memory_allocation = memory_allocation
        self._valid_content_types = valid_content_types

    # NOTE: assigning ``jar`` or ``memory_allocation`` only updates the
    # stored value; the Tika client created in __init__ is not rebuilt.
    @property
    def jar(self):
        return self._jar

    @jar.setter
    def jar(self, value):
        self._jar = value

    @property
    def memory_allocation(self):
        return self._memory_allocation

    @memory_allocation.setter
    def memory_allocation(self, value):
        self._memory_allocation = value

    @property
    def valid_content_types(self):
        return self._valid_content_types

    @valid_content_types.setter
    def valid_content_types(self, value):
        if not isinstance(value, set):
            raise InvalidContentTypes("Content types must be a set")
        self._valid_content_types = value

    def add_meta_data(self, attachment):
        """Store the Tika output under ``attachment['tika']``.

        Nothing happens when the attachment's 'Content-Type' is not one of
        ``valid_content_types``.

        Raises:
            InvalidAttachment: ``attachment`` is not a dict.
        """
        if not isinstance(attachment, dict):
            raise InvalidAttachment("Attachment result is not a dict")

        # The Apache Tika output of archive contains the contents and
        # metadata of all archived files.
        if attachment['Content-Type'] in self.valid_content_types:
            attachment['tika'] = self._tika_client.extract_all_content(
                payload=attachment['payload'], convert_to_obj=True)
class TikaReader:
    """Run Tika queries against one fixed document."""

    def __init__(self, file_process):
        """Create the Tika client and remember the document to process."""
        # Tika client that loads the client jar file.
        jar_name = "tika-app-1.20.jar"
        self.tika_client = TikaApp(file_jar=jar_name)
        self.file_process = file_process

    def detect_document_type(self):
        """Detect the MIME content type of the document."""
        client = self.tika_client
        return client.detect_content_type(self.file_process)

    def detect_language(self):
        """Detect the language used in the document."""
        client = self.tika_client
        return client.detect_language(self.file_process)

    def extract_complete_info(self, value=False):
        """Extract the complete document output.

        ``value`` is forwarded to Tika as ``convert_to_obj``.
        """
        client = self.tika_client
        return client.extract_all_content(self.file_process,
                                          convert_to_obj=value)

    def extract_content_info(self):
        """Extract only the content of the document."""
        client = self.tika_client
        return client.extract_only_content(self.file_process)
class ProcessJSONTika(object):
    """Read selected fields from the first record Tika returns for a document.

    Every accessor triggers its own Tika extraction of ``doc``.
    """

    def __init__(self, path):
        """Build the Tika client from the jar located at ``path``."""
        self.tika_client = TikaApp(file_jar=path)

    def jsonprocessor(self, doc):
        """Return the first record of the object-converted Tika output."""
        records = self.tika_client.extract_all_content(doc,
                                                       convert_to_obj=True)
        return records[0]

    def author(self, doc):
        """Return the 'Author' field, or None when it is missing."""
        record = self.jsonprocessor(doc)
        return record.get('Author', None)

    def creationdate(self, doc):
        """Return the 'Creation-Date' field, or None when it is missing."""
        record = self.jsonprocessor(doc)
        return record.get('Creation-Date', None)

    def lastmodified(self, doc):
        """Return the 'Last-Modified' field, or None when it is missing."""
        record = self.jsonprocessor(doc)
        return record.get('Last-Modified', None)

    def all_content(self, doc):
        """Return the 'X-TIKA:content' field; a missing key raises KeyError."""
        record = self.jsonprocessor(doc)
        return record['X-TIKA:content']

    def top_10_words(self, doc):
        """Return up to ten (word, count) pairs, most frequent first.

        Only purely alphabetic tokens that are not English stop words are
        counted.
        """
        tokens = word_tokenize(self.all_content(doc))
        # stopwords
        ignored = set(stopwords.words('english'))

        frequency = {}
        for token in tokens:
            if token.isalpha() and token not in ignored:
                frequency[token] = frequency.get(token, 0) + 1

        ranked = sorted(frequency.items(),
                        key=operator.itemgetter(1),
                        reverse=True)
        return ranked[:10]
def __init__(self, tika_jar_path):
    """Create the Tika client from the jar at ``tika_jar_path``."""
    client = TikaApp(file_jar=tika_jar_path)
    self.tika_client = client
from tikapp import TikaApp

# Ad-hoc script: print everything Tika reports about one local PDF.
tika_client = TikaApp(
    file_jar="/Users/yma2/Documents/_garage/python/cxm/tika/tika-app-1.20.jar")

analyzeFile = "/Users/yma2/Downloads/Azure_Developer_Guide_eBook_ja-JP.pdf"

# Content type, language, plain content and metadata, in that order.
for query in ("detect_content_type", "detect_language",
              "extract_only_content", "extract_only_metadata"):
    print(getattr(tika_client, query)(analyzeFile))
def process(self) -> str:
    """
    Extract the metadata of the document and store it as a metadata file
    under self.file_content.work_root_dir, using the fixed name
    self.FileName_MetaData. The returned result carries the format of the
    metadata.

    Depending on the configured mode the metadata is obtained from a Tika
    server or from a local tika-app jar. When Tika is disabled, or the
    server url / jar path is configured as an empty string, the result of
    the parent class is returned unchanged.

    Note: if a memory leak shows up, extract the metadata in a new
    process, put it into the file, and parse the metadata in this process.

    :return: the result string built through CResult
    """
    default_result = super().process()

    out_metadata_file_fullname = CFile.join_file(
        self.file_content.work_root_dir, self.FileName_MetaData)
    in_file_fullname = self.file_info.file_name_with_full_path

    def store_metadata(meta_data_dict):
        # Persist the metadata as JSON and build the success result.
        json_obj = CJson()
        json_obj.load_obj(meta_data_dict)
        json_obj.to_file(out_metadata_file_fullname)
        return CResult.merge_result_info(
            CResult.merge_result(
                self.Success,
                '文档[{0}]的元数据提取成功'.format(in_file_fullname)),
            self.Name_Format, self.MetaDataFormat_Json)

    def extraction_failed(error):
        # Build the failure result for an error raised during extraction.
        return CResult.merge_result(
            self.Failure,
            '文档[{0}]的元数据提取过程出现错误, 详细信息为: [{1}]'.format(
                in_file_fullname, error.__str__()))

    if not settings.application.xpath_one(
            self.Path_Setting_Dependence_Tika_Enable, True):
        return default_result

    tika_dependence_mode = settings.application.xpath_one(
        self.Path_Setting_Dependence_Tika_Mode, self.Name_Server)

    if CUtils.equal_ignore_case(tika_dependence_mode, self.Name_Server):
        tika_server_url = settings.application.xpath_one(
            self.Path_Setting_Dependence_Tika_Server_Url, None)
        tika_server_connect_timeout = settings.application.xpath_one(
            self.Path_Setting_Dependence_Tika_Server_Timeout, 30)

        if CUtils.equal_ignore_case(tika_server_url, ''):
            return default_result

        try:
            parsed = TikaServer.from_file(
                in_file_fullname,
                tika_server_url,
                requestOptions={'timeout': tika_server_connect_timeout})
            return store_metadata(parsed["metadata"])
        except Exception as error:
            return extraction_failed(error)

    tika_application = settings.application.xpath_one(
        self.Path_Setting_Dependence_Tika_Client_App, None)

    if CUtils.equal_ignore_case(tika_application, ''):
        return default_result

    if not CFile.file_or_path_exist(tika_application):
        return CResult.merge_result(
            self.Failure,
            '文档[{0}]的元数据无法提取, 详细原因为: [依赖中间件{1}文件不存在, 请修正后重试!]'.format(
                in_file_fullname, tika_application))

    try:
        tika_client = TikaApplication(file_jar=tika_application)
        return store_metadata(
            tika_client.extract_only_metadata(in_file_fullname))
    except Exception as error:
        return extraction_failed(error)
    # Every path above returns. The former trailing
    # ``return CResult.merge_result_info(result, ...)`` was unreachable and
    # referenced an undefined name, so it has been removed.
def analyse_pdf_archive(pdf_csv, keyword_csv, tika_file_jar, outfile_name):
    """Count keyword hits per PDF and write them to a CSV file.

    Args:
        pdf_csv: CSV file listing the PDF names without the ".pdf" suffix.
        keyword_csv: CSV file listing the keyword stems.
        tika_file_jar: path of the tika-app jar.
        outfile_name: CSV file that receives the columns "id" and
            "keywords".

    For each PDF the extracted text is split on whitespace, punctuation is
    removed, and the occurrences of every distinct word whose ASCII-folded
    lower-case form contains at least one keyword are summed. Processing
    stops after the first PDF that yields no words.
    """
    # Flatten the CSV rows into one list of names.
    with open(pdf_csv, 'r', encoding = 'utf-8-sig') as handle:
        pdf_names = list(itertools.chain(*list(csv.reader(handle))))

    pdf_list = [name + '.pdf' for name in pdf_names]

    # Same flattening for the keywords.
    with open(keyword_csv, 'r', encoding = 'utf-8-sig') as handle:
        keywords = list(itertools.chain(*list(csv.reader(handle))))

    tika_client = TikaApp(file_jar=tika_file_jar)

    keyword_counter = []
    for day, pdf in enumerate(pdf_list, start=1):
        raw_text = tika_client.extract_only_content(pdf)
        print("pdf {0} extracted".format(day))

        stripped = [
            token.translate(str.maketrans('', '', string.punctuation))
            for token in raw_text.split()
        ]
        ranked = Counter(stripped).most_common()

        total = 0
        for word, occurrences in ranked:
            # Normalise umlauts etc. to plain lower-case ASCII.
            folded = unicodedata.normalize('NFKD', word).encode(
                'ASCII', 'ignore').lower().decode()
            # A word counts once with its frequency, however many
            # keywords it contains.
            if any(keyword in folded for keyword in keywords):
                total += occurrences

        keyword_counter.append(total)
        print("day {0} complete".format(day))

        if not ranked:
            break

    df = pd.DataFrame({"id": pdf_names, "keywords": keyword_counter})
    df.to_csv(outfile_name, index=False)
def tika_extract_only_content(memory=None):
    """Extract only the content of ``test_zip`` with a fresh Tika client.

    ``memory`` is forwarded to ``TikaApp`` as ``memory_allocation``.
    """
    client = TikaApp(file_jar=TIKA_APP_JAR, memory_allocation=memory)
    return client.extract_only_content(path=test_zip)
def __init__(self, file_process):
    """Create the Tika client and remember the document to process."""
    # Tika client that loads the client jar file.
    jar_name = "tika-app-1.20.jar"
    self.tika_client = TikaApp(file_jar=jar_name)
    self.file_process = file_process
def tika_content_type():
    """Detect the content type of ``test_zip`` with a fresh Tika client."""
    client = TikaApp(file_jar=TIKA_APP_JAR)
    return client.detect_content_type(path=test_zip)
class TikaProcessing(AbstractProcessing):
    """ This class processes the output mail attachments to add
    Apache Tika analysis.

    Args:
        jar (string): path of Apache Tika App jar
        valid_content_types (list or set): list of contents types to analyze
        memory_allocation (string): memory to give to Apache Tika App
    """

    def __init__(self, **kwargs):
        super(TikaProcessing, self).__init__(**kwargs)

        # Init Tika
        self._tika_client = TikaApp(file_jar=self.jar,
                                    memory_allocation=self.memory_allocation)

    def __getattr__(self, name):
        """Resolve unknown attributes from the keyword arguments.

        ``memory_allocation`` is optional and defaults to None; any other
        missing name raises AttributeError.
        """
        try:
            return self._kwargs[name]
        except KeyError:
            # Default values. This must be an equality test: the previous
            # ``name in ("memory_allocation")`` was a substring test on a
            # plain string, so names such as "memory" or "_" silently
            # resolved to None instead of raising AttributeError.
            if name == "memory_allocation":
                return None
            msg = "'{0}' object has no attribute '{1}'"
            raise AttributeError(msg.format(type(self).__name__, name))

    def __setattr__(self, name, value):
        """Set the attribute; validate and record ``valid_content_types``.

        Raises:
            InvalidContentTypes: ``valid_content_types`` is neither a set
                nor a list.
        """
        super(TikaProcessing, self).__setattr__(name, value)

        if name == "valid_content_types":
            if not isinstance(value, (set, list)):
                raise InvalidContentTypes("Content types must be set or list")

            self._kwargs[name] = value

    def _check_arguments(self):
        """This method checks if all mandatory arguments are given.

        Raises:
            MissingArgument: 'jar' or 'valid_content_types' is missing.
        """
        msg = "Argument '{0}' not in object '{1}'"
        for mandatory in ('jar', 'valid_content_types'):
            if mandatory not in self._kwargs:
                raise MissingArgument(
                    msg.format(mandatory, type(self).__name__))

    def process(self, attachment):
        """This method updates the attachment result with the Tika output.

        Args:
            attachment (dict): dict with a raw attachment mail

        Returns:
            This method updates the attachment dict given
        """
        super(TikaProcessing, self).process(attachment)

        if attachment['Content-Type'] in self.valid_content_types:
            attachment['tika'] = self._tika_client.extract_all_content(
                payload=attachment['payload'], convert_to_obj=True)
def convert_Tika(self, fname):
    """Extract only the content of ``fname``.

    The jar "tika-app-1.20.jar" is looked up in the current working
    directory.
    """
    jar_location = '{}/tika-app-1.20.jar'.format(os.getcwd())
    return TikaApp(file_jar=jar_location).extract_only_content(fname)
def __init__(self, **kwargs):
    """Initialise the parent with ``kwargs``, then create the Tika client.

    The client is configured from ``self.jar`` and
    ``self.memory_allocation``.
    """
    super(TikaProcessing, self).__init__(**kwargs)

    # Init Tika
    client = TikaApp(
        file_jar=self.jar,
        memory_allocation=self.memory_allocation)
    self._tika_client = client
# -*- coding: utf-8 -*-
# @Time : 2020/12/11 10:25
# @Author : 王西亚
# @File : c_doc.py
from tikapp import TikaApp

# Ad-hoc check: print the type and value of the metadata that tika-app
# returns for one local PDF.
tika_client = TikaApp(
    file_jar="/usr/local/Cellar/tika/1.24.1_1/libexec/tika-app-1.24.1.jar")
metadata = tika_client.extract_only_metadata(
    "/Users/wangxiya/Downloads/000101020062805119-00.pdf")

for shown in (type(metadata), metadata):
    print(shown)

# Alternative kept for reference, going through the ``tika`` package
# (optionally against a Tika server) instead of the jar:
# from tika import parser
# parsed = parser.from_file('/Users/wangxiya/Downloads/000101020062805119-00.pdf', 'http://localhost:9998/tika')
# metadata = parsed["metadata"]
# print(parsed["content"])
def extract_text(filename):
    """Extract only the content of ``filename``.

    The jar is expected at "lib/tika-app-1.28.jar" relative to the current
    working directory.
    """
    jar_location = os.path.abspath(os.path.join("lib", "tika-app-1.28.jar"))
    return TikaApp(file_jar=jar_location).extract_only_content(filename)
def tika_detect_language():
    """Detect the language of ``test_zip`` with a fresh Tika client."""
    client = TikaApp(file_jar=TIKA_APP_JAR)
    return client.detect_language(path=test_zip)
class FileBeats:
    """Build an Elasticsearch index of the files below given directories."""

    # Only files with one of these extensions get their content indexed.
    export_content_exts = ('.md', '.html', '.htm', '.txt', '.ppt', '.pptx',
                           '.key', '.pdf', ".pages", ".doc", ".docx", '.py',
                           '.java')

    def __init__(self,
                 index,
                 es_host="localhost:9200",
                 file_jar='/Users/laofeng/es_home/tika-app-1.23.jar',
                 index_content=False,
                 create_index=True,
                 force_renew_index=False,
                 schema_file=None):
        """Set up the Tika and Elasticsearch clients and prepare the index.

        Args:
            index: name of the Elasticsearch index.
            es_host: host passed to ``Elasticsearch``.
            file_jar: path of the tika-app jar.
            index_content: also extract and index file contents.
            create_index: create the index from ``schema_file`` when it
                does not exist yet (only done when a schema file is given).
            force_renew_index: delete and re-create an existing index.
            schema_file: UTF-8 JSON file holding the index definition.

        Raises:
            ValueError: ``force_renew_index`` is set for an existing index
                but no ``schema_file`` was given.
        """
        self.tika_app = TikaApp(file_jar)
        self.es = Elasticsearch(es_host)
        self.index = index
        self.index_content = index_content

        def load_schema():
            with open(schema_file, 'r', encoding='utf-8') as f:
                return json.load(f)

        # Check whether the index already exists.
        index_exist = self.es.indices.exists(index)

        if not index_exist and create_index and schema_file:
            self.es.indices.create(index, load_schema())

        # Existing index and forced renewal: delete first, then re-create.
        if index_exist and force_renew_index:
            if schema_file is None:
                raise ValueError(
                    "force_renew_index requires schema_file to re-create "
                    "the index")
            # Read the schema BEFORE deleting, so an unreadable schema
            # file cannot leave the index deleted and never re-created.
            schema = load_schema()
            self.es.indices.delete(index)
            self.es.indices.create(index, schema)

    @staticmethod
    def second2date(second, style="%Y-%m-%d %H:%M:%S"):
        """Format ``second`` (epoch seconds) as local time using ``style``."""
        return time.strftime(style, time.localtime(second))

    def export_file_tags(self, abs_path):
        """Collect the indexable tags of one file.

        Returns:
            A dict with 'path', 'ext' (without the leading dot), 'name'
            and 'size'; plus 'content' when content indexing is enabled,
            the extension is whitelisted and Tika returned something.
        """
        tags = {"path": abs_path}
        ext = os.path.splitext(abs_path)[1]
        tags['ext'] = ext.lstrip('.')  # drop the dot of the suffix
        tags['name'] = os.path.basename(abs_path)
        tags['size'] = os.path.getsize(abs_path)

        if self.index_content and ext.lower() in FileBeats.export_content_exts:
            try:
                # NOTE(review): a literal payload is passed next to
                # ``path`` - confirm against tikapp which one is used.
                r = self.tika_app.extract_only_content(
                    path=abs_path, payload="base64_payload")
                if r:
                    tags['content'] = r
            except Exception:
                # Best effort: a file Tika cannot read is still indexed
                # without its content.
                traceback.print_exc()

        return tags

    def index_doc(self, tags, _type='_doc'):
        """Index one tags dict, using the full file path as document id."""
        tags['timestamp'] = datetime.now()
        self.es.index(index=self.index,
                      doc_type=_type,
                      body=tags,
                      id=tags['path'])

    def index_docs(self, docs):
        """Index every tags dict in ``docs``."""
        for doc in docs:
            self.index_doc(doc)

    def process_file(self, f):
        """Export the tags of one file, then index them."""
        tags = self.export_file_tags(f)
        self.index_doc(tags)

    def beats_more(self, folders, asynchronous=True):
        """Index every directory in ``folders``.

        ``asynchronous`` is forwarded to ``start_beats``; it used to be
        accepted but ignored.
        """
        print("开始索引文件", FileBeats.second2date(time.time()))
        for folder in folders:
            self.start_beats(folder, asynchronous=asynchronous)
        print("索引文件结束", FileBeats.second2date(time.time()))

    def start_beats(self,
                    source_dir='/Volumes/portable/sync/',
                    asynchronous=True):
        """Walk ``source_dir`` bottom-up and index every file found.

        Hidden files and some directories are skipped. With
        ``asynchronous`` the tags are exported in gevent greenlets and
        indexed in batches; otherwise each file is handled inline.
        """
        print("开始索引文件", source_dir, FileBeats.second2date(time.time()))
        greenlets = list()
        for folder, dirs, files in os.walk(source_dir, topdown=False):
            # Skip some directories.
            if '@' in folder or '.svn' in folder or folder.endswith(
                    '.app') or "迅雷" in folder:
                print('忽略目录', folder)
                continue

            for f in files:
                if f.startswith("."):
                    continue
                abs_path = os.path.join(folder, f)
                try:
                    if asynchronous:
                        greenlets.append(
                            gevent.spawn(self.export_file_tags, abs_path))
                    else:
                        tags = self.export_file_tags(abs_path)
                        self.index_doc(tags)

                    # Flush once 5 tasks are pending.
                    if len(greenlets) >= 5:
                        gevent.joinall(greenlets)
                        # Indexing stays sequential: concurrent ES calls
                        # produced read timeouts / socket errors.
                        self.index_docs([g.value for g in greenlets])
                        greenlets.clear()
                except Exception:
                    traceback.print_exc()

        # Flush the remaining tasks (fewer than 5).
        if asynchronous:
            gevent.joinall(greenlets)
            self.index_docs([g.value for g in greenlets])

        print("索引文件结束", source_dir, FileBeats.second2date(time.time()))
from tikapp import TikaApp
from image_mod import convert_image_to_string
import os

# The jar lives in the "src" directory next to this module.
_MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
tika_client = TikaApp(os.path.join(_MODULE_DIR, 'src', 'tika-app-1.22.jar'))


def receive_text_from_file(path: str, ext=None):
    """Return the text of the file at ``path``.

    When Tika yields an empty string and ``ext`` is 'pdf', the result of
    ``convert_image_to_string`` is returned instead.
    """
    extracted = tika_client.extract_only_content(path)
    needs_image_conversion = extracted == "" and ext == 'pdf'
    if not needs_image_conversion:
        return extracted
    return convert_image_to_string(path, ext=ext)
def handle_backup_file(self, context, savepkt):
    """Index the Tika metadata/content of a backed-up regular file.

    For save packets of type ``FT_REG`` the file is analysed with Tika and
    the first returned record is indexed into Elasticsearch together with
    the job id, fd name, job level and directory. Every failure is
    reported through ``JobMessage`` and never aborts the backup.

    Returns:
        ``bRCs['bRC_OK']`` in all cases.
    """
    DebugMessage(context, 100,
                 "handle_backup_file called with " + str(savepkt) + "\n")
    DebugMessage(
        context, 100,
        "fname: " + savepkt.fname + " Type: " + str(savepkt.type) + "\n")
    if savepkt.type == bFileType['FT_REG']:
        DebugMessage(context, 100, "regulaer file, do something now...\n")
        # configure your Elasticsearch server here:
        es = Elasticsearch([{'host': '192.168.17.2', 'port': 9200}])
        # configure your TikaApp jar file here:
        try:
            tika_client = TikaApp(
                file_jar="/usr/local/bin/tika-app-1.20.jar")
        except Exception as ex:
            JobMessage(
                context, bJobMessageType['M_ERROR'],
                'Error indexing %s. Tika error: %s' %
                (savepkt.fname, str(ex)))
            return bRCs['bRC_OK']

        # tika_client has several parser options; extract_only_metadata
        # would return metadata only, this one includes the file contents
        # as text:
        try:
            result_payload = tika_client.extract_all_content(savepkt.fname)
        except Exception as ex:
            JobMessage(
                context, bJobMessageType['M_ERROR'],
                'Error extracting contents from %s. Tika error: %s' %
                (savepkt.fname, str(ex)))
            return bRCs['bRC_OK']

        # result_payload is a list of json strings. Nested structures like
        # tar files or emails with attachments or inline documents are
        # returned as distinct json strings; [0] describes the main file.
        # TODO: care about nested structures, for now only the first/main
        # file is taken.
        try:
            data = json.loads(result_payload)[0]
        except Exception as ex:
            JobMessage(
                context, bJobMessageType['M_ERROR'],
                'Error reading json fields delivered by Tika examining file %s. Json error: %s'
                % (savepkt.fname, str(ex)))
            return bRCs['bRC_OK']

        # Tika may add "Unknown tag (id)" keys with an increasing id, which
        # could exceed the keyword limit of Elasticsearch indices, so they
        # are removed. Iterate over a snapshot of the keys: deleting from a
        # dict while iterating its live key view raises RuntimeError on
        # Python 3.
        for data_keyword in list(data.keys()):
            if data_keyword.startswith("Unknown tag ("):
                del data[data_keyword]

        # Tika adds some empty lines at the beginning of the content.
        if 'X-TIKA:content' in data:
            data['X-TIKA:content'] = data['X-TIKA:content'].strip()

        data['bareos_jobId'] = self.jobId
        data['bareos_fdname'] = self.fdname
        # NOTE(review): unichr only exists on Python 2 - confirm the
        # interpreter this plugin is deployed on.
        data['bareos_joblevel'] = unichr(self.level)
        data['bareos_directory'] = os.path.dirname(savepkt.fname)
        try:
            es.index(index="bareos-test", doc_type='_doc', body=data)
        except Exception as ex:
            JobMessage(
                context, bJobMessageType['M_ERROR'],
                'Error indexing %s. Elastic error: %s' %
                (savepkt.fname, str(ex)))
    return bRCs['bRC_OK']