def process(fullpath, config, rcontext, columns=None):
    """Parse a document with Tika and return selected metadata plus its text.

    Args:
        fullpath: Path of the file to parse; passed to ``tika.File``.
        config: Forwarded unchanged to ``extract.tika_extract``.
        rcontext: Forwarded unchanged to ``extract.tika_extract``.
        columns: Accepted for interface compatibility; not used here.

    Returns:
        A list holding the value of each metadata key in ``field_names``
        (in that order), followed by the body text as the last element.
    """
    # Metadata keys to report, in output order. The strings are looked up
    # verbatim, so "Character Count" intentionally keeps its space.
    field_names = (
        "Creation-Date",
        "Last-Modified",
        "Last-Save-Date",
        "Revision-Number",
        "Author",
        "Last-Author",
        "Template",
        "Word-Count",
        "title",
        "subject",
        "Company",
        "Keywords",
        "Page-Count",
        "Character Count",
    )

    parser = tika.AutoDetectParser()
    handler = tika.BodyContentHandler()
    metadata = tika.Metadata()
    context = tika.ParseContext()

    # The parser consumes the stream but does not close it, so release the
    # file handle here even when parsing raises.
    # NOTE(review): assumes tika.FileInputStream wraps java.io.FileInputStream
    # and therefore exposes close() - confirm against the binding in use.
    stream = tika.FileInputStream(tika.File(fullpath))
    try:
        parser.parse(stream, handler, metadata, context)
    finally:
        stream.close()

    processed = [metadata.get(name) for name in field_names]
    processed.append(handler.toString())

    # Hand the parse context and metadata on to the extract helper; what it
    # does with them is not visible from this function.
    extract.tika_extract(fullpath, context, metadata, config, rcontext)
    return processed
def process(fullpath, config, rcontext, columns=None):
    """Parse a document with Tika and return positional metadata plus text.

    Args:
        fullpath: Path of the file to parse; passed to ``tika.File``.
        config: Forwarded unchanged to ``extract.tika_extract``.
        rcontext: Forwarded unchanged to ``extract.tika_extract``.
        columns: Accepted for interface compatibility; not used here.

    Returns:
        A list of the metadata values found at positions 0, 3, 4, 7 and 9
        of ``metadata.names()`` (positions that do not exist are skipped),
        followed by the body text as the last element.
    """
    # Positions within metadata.names() whose values are reported.
    # NOTE(review): selection is by position, so the result depends on the
    # order in which names() yields keys - confirm that order is stable.
    wanted_positions = (0, 3, 4, 7, 9)

    parser = tika.AutoDetectParser()
    handler = tika.BodyContentHandler()
    metadata = tika.Metadata()
    context = tika.ParseContext()

    # The parser consumes the stream but does not close it, so release the
    # file handle here even when parsing raises.
    # NOTE(review): assumes tika.FileInputStream wraps java.io.FileInputStream
    # and therefore exposes close() - confirm against the binding in use.
    stream = tika.FileInputStream(tika.File(fullpath))
    try:
        parser.parse(stream, handler, metadata, context)
    finally:
        stream.close()

    results = [
        metadata.get(name)
        for position, name in enumerate(metadata.names())
        if position in wanted_positions
    ]
    results.append(handler.toString())

    # Hand the parse context and metadata on to the extract helper; what it
    # does with them is not visible from this function.
    extract.tika_extract(fullpath, context, metadata, config, rcontext)
    return results
def process(fullpath, config, rcontext, columns=None):
    """Run Tika over a file and collect a fixed set of metadata and its text.

    Args:
        fullpath: Path of the file to parse; passed to ``tika.File``.
        config: Forwarded unchanged to ``extract.tika_extract``.
        rcontext: Forwarded unchanged to ``extract.tika_extract``.
        columns: Accepted for interface compatibility; not used here.

    Returns:
        A 15-element list: the values of the fourteen metadata keys listed
        below, in that order, followed by the body text.
    """
    parser = tika.AutoDetectParser()
    body = tika.BodyContentHandler()
    metadata = tika.Metadata()
    context = tika.ParseContext()

    # Parsing consumes the stream without closing it; close it ourselves so
    # the file handle is released on both the success and the error path.
    # NOTE(review): assumes tika.FileInputStream wraps java.io.FileInputStream
    # and therefore exposes close() - confirm against the binding in use.
    source = tika.FileInputStream(tika.File(fullpath))
    try:
        parser.parse(source, body, metadata, context)
    finally:
        source.close()

    text = body.toString()
    lookup = metadata.get
    processed = [
        lookup("Creation-Date"),
        lookup("Last-Modified"),
        lookup("Last-Save-Date"),
        lookup("Revision-Number"),
        lookup("Author"),
        lookup("Last-Author"),
        lookup("Template"),
        lookup("Word-Count"),
        lookup("title"),
        lookup("subject"),
        lookup("Company"),
        lookup("Keywords"),
        lookup("Page-Count"),
        lookup("Character Count"),
        text,
    ]

    # Pass the populated metadata and parse context to the extract helper;
    # its behavior is defined outside this function.
    extract.tika_extract(fullpath, context, metadata, config, rcontext)
    return processed
def process(fullpath, config, rcontext, columns=None):
    """Run Tika over a file and pick metadata values by position, plus text.

    Args:
        fullpath: Path of the file to parse; passed to ``tika.File``.
        config: Forwarded unchanged to ``extract.tika_extract``.
        rcontext: Forwarded unchanged to ``extract.tika_extract``.
        columns: Accepted for interface compatibility; not used here.

    Returns:
        A list with the metadata values whose index in ``metadata.names()``
        is 0, 3, 4, 7 or 9 (missing indexes are simply absent), with the
        body text appended as the final element.
    """
    # Indexes into metadata.names() that are kept.
    # NOTE(review): picking by index ties the output to the iteration order
    # of names() - verify that order is what callers expect.
    keep = (0, 3, 4, 7, 9)

    parser = tika.AutoDetectParser()
    body = tika.BodyContentHandler()
    metadata = tika.Metadata()
    context = tika.ParseContext()

    # Parsing consumes the stream without closing it; close it ourselves so
    # the file handle is released on both the success and the error path.
    # NOTE(review): assumes tika.FileInputStream wraps java.io.FileInputStream
    # and therefore exposes close() - confirm against the binding in use.
    source = tika.FileInputStream(tika.File(fullpath))
    try:
        parser.parse(source, body, metadata, context)
    finally:
        source.close()

    results = []
    for index, key in enumerate(metadata.names()):
        if index in keep:
            results.append(metadata.get(key))
    results.append(body.toString())

    # Pass the populated metadata and parse context to the extract helper;
    # its behavior is defined outside this function.
    extract.tika_extract(fullpath, context, metadata, config, rcontext)
    return results