def edit(resnum):
    """Send the document at result index ``resnum`` as a file download.

    Builds the query with ``get_query()``, starts a search with
    ``recoll_initsearch()``, moves to ``resnum`` and fetches the document.
    The Content-Type and Content-Disposition response headers are set from
    the document before the file is opened.

    Args:
        resnum: Zero-based index of the wanted result.

    Returns:
        An open binary file object for the document, or an error message
        string when ``hasrclextract`` is false or ``resnum`` is out of range.
    """
    if not hasrclextract:
        return 'Sorry, needs recoll version 1.19 or later'
    query = get_query()
    # NOTE(review): qs is never used in this function; the call is kept in
    # case it validates the query -- confirm and drop.
    qs = query_to_recoll_string(query)
    rclq = recoll_initsearch(query)
    # Reject negative indexes as well as indexes past the last result.
    if resnum < 0 or resnum > rclq.rowcount - 1:
        return 'Bad result index %d' % resnum
    rclq.scroll(resnum)
    doc = rclq.fetchone()
    bottle.response.content_type = doc.mimetype
    # Quote the file name so that names containing spaces survive.
    bottle.response.headers['Content-Disposition'] = \
        'attachment; filename="%s"' % doc.filename
    pathismine = False
    if doc.ipath == '':
        # If ipath is null, we can just return the file
        path = doc.url.replace('file://', '')
    else:
        # Otherwise extract the document to a file that we own and must
        # remove ourselves.
        xt = rclextract.Extractor(doc)
        path = xt.idoctofile(doc.ipath, doc.mimetype)
        pathismine = True
    try:
        sys.stderr.write("Sending %s with mimetype %s\n"
                         % (path, doc.mimetype))
        # Binary mode: the document can have any mimetype.
        f = open(path, 'rb')
    finally:
        if pathismine:
            # Remove the extracted file even if open() failed. Reading from
            # the already-open handle afterwards relies on POSIX unlink
            # semantics.
            os.unlink(path)
    return f
def edit(resnum):
    """Send the document at result index ``resnum`` as a file download.

    Builds the query with ``get_query()``, starts a search with
    ``recoll_initsearch()``, moves to ``resnum``, fetches the document and
    extracts it to a file with ``rclextract.Extractor.idoctofile``. The
    Content-Type, Content-Disposition and Content-Length response headers
    are set before the file is returned.

    Args:
        resnum: Zero-based index of the wanted result.

    Returns:
        An open binary file object for the extracted document, or an error
        message string when ``resnum`` is out of range.
    """
    query = get_query()
    # NOTE(review): qs is never used in this function; the call is kept in
    # case it validates the query -- confirm and drop.
    qs = query_to_recoll_string(query)
    rclq = recoll_initsearch(query)
    # Reject negative indexes as well as indexes past the last result.
    if resnum < 0 or resnum > rclq.rowcount - 1:
        return 'Bad result index %d' % resnum
    rclq.scroll(resnum)
    doc = rclq.fetchone()
    bottle.response.content_type = doc.mimetype
    # The document is always extracted here, so the resulting file is always
    # ours to remove (presumably a temporary file -- confirm against the
    # recoll API).
    xt = rclextract.Extractor(doc)
    path = xt.idoctofile(doc.ipath, doc.mimetype)
    try:
        # Prefer the indexed file name for top-level documents; otherwise
        # fall back to the name of the extracted file.
        if (not doc.ipath) and "filename" in doc.keys():
            filename = doc.filename
        else:
            filename = os.path.basename(path)
        bottle.response.headers['Content-Disposition'] = \
            'attachment; filename="%s"' % filename
        bottle.response.headers['Content-Length'] = os.stat(path).st_size
        f = open(path, 'rb')
    finally:
        # Remove the extracted file even when stat()/open() fails. Reading
        # from the already-open handle afterwards relies on POSIX unlink
        # semantics.
        os.unlink(path)
    return f
def edit(resnum):
    """Send the document at result index ``resnum`` as a file download.

    Builds the query with ``get_query()``, starts a search with
    ``recoll_initsearch()``, moves to ``resnum`` and fetches the document.
    Top-level documents are served from their ``file://`` URL; subdocuments
    are first extracted to a temporary file. The Content-Type,
    Content-Disposition and Content-Length response headers are set before
    the file is returned.

    Args:
        resnum: Zero-based index of the wanted result.

    Returns:
        An open binary file object for the document, or an error message
        string when ``hasrclextract`` is false or ``resnum`` is out of range.
    """
    if not hasrclextract:
        return 'Sorry, needs recoll version 1.19 or later'
    query = get_query()
    # NOTE(review): qs is never used in this function; the call is kept in
    # case it validates the query -- confirm and drop.
    qs = query_to_recoll_string(query)
    rclq = recoll_initsearch(query)
    # Reject negative indexes as well as indexes past the last result.
    if resnum < 0 or resnum > rclq.rowcount - 1:
        return 'Bad result index %d' % resnum
    rclq.scroll(resnum)
    doc = rclq.fetchone()
    bottle.response.content_type = doc.mimetype
    pathismine = False
    if doc.ipath == '':
        # If ipath is null, we can just return the file
        path = doc.url.replace('file://', '')
    else:
        # Else this is a subdocument, extract to temporary file
        xt = rclextract.Extractor(doc)
        path = xt.idoctofile(doc.ipath, doc.mimetype)
        pathismine = True
    try:
        # NOTE(review): the explicit UTF-8 encoding below looks written for
        # Python 2 string semantics -- confirm before running on Python 3.
        bottle.response.headers['Content-Disposition'] = \
            'attachment; filename="%s"' % \
            os.path.basename(path).encode('utf-8')
        path = path.encode('utf-8')
        bottle.response.headers['Content-Length'] = os.stat(path).st_size
        # Binary mode: the document can have any mimetype.
        f = open(path, 'rb')
    finally:
        if pathismine:
            # Remove the temporary file even when stat()/open() fails.
            # Reading from the already-open handle afterwards relies on
            # POSIX unlink semantics.
            os.unlink(path)
    return f
async def recoll_packet_text(config, resnum, query, searchtype, dir, sort,
                             ascending, page):
    """Return the extracted text and packet name of one search result.

    Wraps ``query`` with ``wrap_query()``, starts a search with
    ``recoll_initsearch()``, moves to ``resnum`` and runs text extraction on
    the fetched document.

    Args:
        config: Passed through to ``recoll_initsearch``.
        resnum: Zero-based index of the wanted result.
        query: Query value, combined with ``searchtype`` by ``wrap_query``.
        searchtype: Passed to ``wrap_query``.
        dir: Passed through to ``recoll_initsearch``.
        sort: Passed through to ``recoll_initsearch``.
        ascending: Passed through to ``recoll_initsearch``.
        page: Not used by this function.

    Returns:
        A ``(text, packet_name)`` tuple on success, where ``packet_name`` is
        ``render_packet_name(doc.filename)``. A plain error message string
        is returned when ``hasrclextract`` is false or ``resnum`` is out of
        range.
    """
    if not hasrclextract:
        return 'Sorry, needs recoll version 1.19 or later'
    query = wrap_query(query, searchtype)
    q = recoll_initsearch(config, query, dir, sort, ascending)
    # Reject negative indexes as well as indexes past the last result.
    if resnum < 0 or resnum > q.rowcount - 1:
        return 'Bad result index %d' % resnum
    q.scroll(resnum)
    doc = q.fetchone()
    xt = rclextract.Extractor(doc)
    tdoc = xt.textextract(doc.ipath)
    return tdoc.text, render_packet_name(doc.filename)
def preview(resnum):
    """Return the extracted text of the document at result index ``resnum``.

    Builds the query with ``get_query()``, starts a search with
    ``recoll_initsearch()``, moves to ``resnum`` and runs text extraction on
    the fetched document. The response content type is set to HTML when the
    extracted document is ``text/html`` and to plain text otherwise, always
    with a UTF-8 charset.

    Args:
        resnum: Zero-based index of the wanted result.

    Returns:
        The extracted text, or an error message string when ``resnum`` is
        out of range.
    """
    query = get_query()
    # NOTE(review): qs is never used in this function; the call is kept in
    # case it validates the query -- confirm and drop.
    qs = query_to_recoll_string(query)
    rclq = recoll_initsearch(query)
    # Reject negative indexes as well as indexes past the last result.
    if resnum < 0 or resnum > rclq.rowcount - 1:
        return 'Bad result index %d' % resnum
    rclq.scroll(resnum)
    doc = rclq.fetchone()
    xt = rclextract.Extractor(doc)
    tdoc = xt.textextract(doc.ipath)
    if tdoc.mimetype == 'text/html':
        bottle.response.content_type = 'text/html; charset=utf-8'
    else:
        bottle.response.content_type = 'text/plain; charset=utf-8'
    return tdoc.text
def extractofile(doc, outfilename=""):
    """Extract ``doc`` to a file and return the resulting file name.

    Args:
        doc: Document whose ``ipath`` and ``mimetype`` drive the extraction.
        outfilename: Passed to ``idoctofile`` as ``ofilename``; defaults to
            the empty string.

    Returns:
        Whatever ``Extractor.idoctofile`` returns for the output file name.
    """
    return rclextract.Extractor(doc).idoctofile(
        doc.ipath, doc.mimetype, ofilename=outfilename)
def textextract(self, index):
    """Print the extracted text of the search result stored at ``index``.

    Args:
        index: Position of the document in ``self.searchResults``.
    """
    result = self.searchResults[index]
    extracted = rclextract.Extractor(result).textextract(result.ipath)
    print(extracted.text)
def utf8string(s):
    """Return ``s`` unchanged when ``ISP3`` is true, else UTF-8 encoded."""
    if ISP3:
        return s
    else:
        return s.encode('utf8')


db = recoll.connect()
query = db.query()

# This normally has only one result, a well-known html file
nres = query.execute("HtmlAttachment_uniqueTerm", stemming=0)
print("Result count: %d %d" % (nres, query.rowcount))
doc = query.fetchone()
xtrac = rclextract.Extractor(doc)
# Replace the result document with its text-extracted version.
doc = xtrac.textextract(doc.ipath)
print("Text length: %d" % len(doc.text))

refdigest = 'bfbb63f7a245c31767585b45014dbd07'

# This normally has 2 results, one of which is a pdf attachment.
nres = query.execute("population_size_cultural_transmission", stemming=0)
for doc in query:
    if doc.mimetype == 'application/pdf':
        xtrac = rclextract.Extractor(doc)
        filename = xtrac.idoctofile(doc.ipath, doc.mimetype)
        # Context manager closes the handle even if read() raises.
        with open(filename, 'rb') as f:
            data = f.read()
        # NOTE(review): the visible code stops here; data, m and refdigest
        # are presumably used by code that follows -- confirm.
        m = hashlib.md5()
def extract(doc):
    """Run text extraction on ``doc`` and return the resulting document.

    Args:
        doc: Document whose ``ipath`` is handed to the extractor.

    Returns:
        The object returned by ``Extractor.textextract``.
    """
    return rclextract.Extractor(doc).textextract(doc.ipath)