def text_lines(location, demarkup=False):
    """
    Return a text lines iterator from the file at `location`. Return an empty
    iterator if no text content is extractible. Text extraction is based on
    the detected file type.

    If `demarkup` is True, attempt to detect if a file contains HTML/XML-like
    markup and cleanup this markup. If the markup cleanup fails, the file is
    processed again as plain text or binary.

    Note: For testing or building from strings, `location` can be a list of
    unicode line strings instead of a path: it is then wrapped in an iterator
    and returned as-is.
    """
    # TODO: add support for "wide" UTF-16-like strings where each char is
    # followed by a zero as is often found in some Windows binaries. Do this for
    # binaries only. This is in direct conflict with "strings" extraction as
    # currently implemented

    if not location:
        return iter([])

    # `basestring` only exists on Python 2: fall back to `str` elsewhere so
    # that the path check does not fail with a NameError.
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str

    if not isinstance(location, string_types):
        # not a path: wrap an iterator on location which should be a sequence
        # of lines
        return iter(location)

    T = typecode.get_type(location)

    # DO NOT introspect media, archives and compressed files
    if not T.contains_text:
        return iter([])

    # Should we read this as some markup, pdf office doc, text or binary?

    if T.is_pdf:
        return unicode_text_lines_from_pdf(location)

    # lightweight markup stripping support
    if demarkup and markup.is_markup(location):
        try:
            # Consume the demarkup output right away: if it is produced
            # lazily, a failure would otherwise only surface in the caller,
            # outside of this try block, and the fallback would never run.
            return iter(list(markup.demarkup(location)))
        except Exception:
            # try again later as plain text. KeyboardInterrupt and SystemExit
            # are deliberately not caught here.
            pass

    # TODO: handle Office-like documents, RTF, etc
    # if T.is_doc:
    #     return unicode_text_lines_from_doc(location)

    if T.is_text:
        return unicode_text_lines(location)

    # Either binary, or neither text, text-like nor binary (which should
    # never happen): in both cases fall back to binary extraction.
    return unicode_text_lines_from_binary(location)
def numbered_text_lines(
    location,
    demarkup=False,
    plain_text=False,
    start_line=1,
):
    """
    Return an iterable of tuples of (line number, text line) from the file at
    `location`. Return an empty iterator if no text content is extractible.
    Text extraction is based on the detected file type. Long lines are broken
    down in chunks, therefore two items can have the same line number.

    Line numbers start at ``start_line`` which is 1-based by default.

    If `demarkup` is True, attempt to detect if a file contains HTML/XML-like
    markup and cleanup this markup. If this cleanup fails, the file is
    processed again with the remaining strategies.

    If `plain_text` is True treat the file as a plain text file and do not
    attempt to detect its type and extract its content with special
    procedures. This is used mostly when loading license texts and rules.

    Note: For testing or building from strings, `location` can be a list of
    unicode line strings instead of a path.
    """
    if not location:
        return iter([])

    if not isinstance(location, str):
        # not a path: wrap an iterator on location which should be a sequence
        # of lines
        if TRACE:
            logger_debug('numbered_text_lines:', 'location is not a file')
        return enumerate(iter(location), start_line)

    if plain_text:
        if TRACE:
            logger_debug('numbered_text_lines:', 'plain_text')
        return enumerate(unicode_text_lines(location), start_line)

    T = typecode.get_type(location)

    if TRACE:
        logger_debug('numbered_text_lines: T.filetype_file:', T.filetype_file)
        logger_debug('numbered_text_lines: T.is_text_with_long_lines:', T.is_text_with_long_lines)
        logger_debug('numbered_text_lines: T.is_binary:', T.is_binary)

    # TODO: we should have a command line to force digging inside binaries
    if not T.contains_text:
        return iter([])

    # Should we read this as some markup, pdf office doc, text or binary?

    if T.is_pdf and T.is_pdf_with_text:
        if TRACE:
            logger_debug('numbered_text_lines:', 'is_pdf')
        return enumerate(unicode_text_lines_from_pdf(location), start_line)

    if T.filetype_file.startswith('Spline Font Database'):
        if TRACE:
            logger_debug('numbered_text_lines:', 'Spline Font Database')
        return enumerate(
            (as_unicode(l) for l in sfdb.get_text_lines(location)),
            start_line,
        )

    # lightweight markup stripping support
    if demarkup and markup.is_markup(location):
        try:
            # build the full list inside the try block so that any extraction
            # failure is caught here and not later in the caller
            lines = list(enumerate(markup.demarkup(location), start_line))
            if TRACE:
                logger_debug('numbered_text_lines:', 'demarkup')
            return lines
        except Exception:
            # try again later as plain text. KeyboardInterrupt and SystemExit
            # are deliberately not caught here.
            pass

    if T.is_js_map:
        try:
            # same eager evaluation as for markup above
            lines = list(enumerate(js_map_sources_lines(location), start_line))
            if TRACE:
                logger_debug('numbered_text_lines:', 'js_map')
            return lines
        except Exception:
            # try again later as plain text otherwise
            pass

    if T.is_text:
        numbered_lines = enumerate(unicode_text_lines(location), start_line)

        # text with very long lines such minified JS, JS map files or large
        # JSON. A package.json is never broken down in chunks.
        has_long_lines = (
            not location.endswith('package.json')
            and (
                T.is_text_with_long_lines
                or T.is_compact_js
                or T.filetype_file == 'data'
                or 'locale' in location
            )
        )
        if has_long_lines:
            numbered_lines = break_numbered_unicode_text_lines(numbered_lines)
            if TRACE:
                logger_debug('numbered_text_lines:', 'break_numbered_unicode_text_lines')

        return numbered_lines

    # TODO: handle Office-like documents, RTF, etc
    # if T.is_doc:
    #     return unicode_text_lines_from_doc(location)

    # TODO: add support for "wide" UTF-16-like strings where each char is
    # followed by a zero as is often found in some Windows binaries. Do this for
    # binaries only. This may conflict with "strings" extraction as
    # currently implemented
    if T.is_binary:
        # fall back to binary
        if TRACE:
            logger_debug('numbered_text_lines:', 'is_binary')

        return enumerate(unicode_text_lines_from_binary(location), start_line)

    return iter([])
def test_jsp_demarkup(self):
    """
    Check that `markup.demarkup` applied to the `markup/java.jsp` test file
    returns exactly the expected sequence of text lines.
    """
    # presumably resolves the path of the test data file; verify against
    # the `get_test_loc` helper of the test class
    test_file = self.get_test_loc(u'markup/java.jsp')
    # consume the demarkup output entirely so it can be compared as a list
    result = list(markup.demarkup(test_file))
    # Expected items, in order. Most items end with a newline, but a few do
    # not (such as the one ending with `// ` and the last one).
    # NOTE(review): some literals below look truncated or redacted (such as
    # the `="http://*****:*****@` item) — confirm against the actual
    # demarkup output for this test file.
    expected = [
        u' version="1.0" encoding="ISO-8859-1"?>\n',
        u' <%@page session="false" contentType="text/html; charset=ISO-8859-1" %>\n',
        u' <%@page import="clime.messadmin.model.IServerInfo" %>\n',
        u' <%@taglib prefix="core" uri="messadmin-core" %>\n',
        u' <%@taglib prefix="format" uri="messadmin-fmt" %>\n',
        u' HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"--%>\n',
        u' HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"--%>\n',
        u' html \n',
        u' PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"\n',
        u' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n',
        u' html \n',
        u' PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n',
        u' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"--%>\n',
        u' html PUBLIC "-//W3C//DTD XHTML 1.1//EN"\n',
        u' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"--%>\n',
        u'\n',
        u' xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">\n',
        u' IServerInfo serverInfos = (IServerInfo) request.getAttribute("serverInfos");\n',
        u' String webFilesRoot = (String) request.getAttribute("WebFilesRoot"); %>\n',
        u' value="${pageContext.request.servletPath}" var="submitUrl" scope="page"/--%> can use value="${pageContext.request.servletPath}" because this JSP is include()\'ed --%>\n',
        u' or use directly ${pageContext.request.requestURI} --%>\n',
        u" String submitUrl = request.getContextPath() + request.getServletPath(); /* Can use +request.getServletPath() because this JSP is include()'ed */ %>\n",
        u' \n',
        u' http-equiv="content-type" content="text/html; charset=iso-8859-1 "/> \n',
        u'\t http-equiv="pragma" content="no-cache "/> HTTP 1.0 -->\n',
        u'\t http-equiv="cache-control" content="no-cache,must-revalidate "/> HTTP 1.1 -->\n',
        u'\t http-equiv="expires" content="0 "/> 0 is an invalid value and should be treated as \'now\' -->\n',
        u'\t http-equiv="content-language" content="en "/> fr-FR --%>\n',
        u'\t name="author" content="Cedrik LIME "/> \n',
        u'\t name="copyright" content="copyright 2005-2006 Cedrik LIME "/> \n',
        u'\t name="robots" content="noindex,nofollow,noarchive "/> \n',
        u'\t Server System Informations \n',
        u'\t rel="stylesheet" type="text/css" =" MessAdmin.css "/> \n',
        u'\t type="text/css">\n',
        u'\t \n',
        u'\t type="text/javascript" src=" js/getElementsBySelector.js"> \n',
        u'\t type="text/javascript" src=" js/behavior.js"> \n',
        u'\t type="text/javascript" src=" js/MessAdmin.js"> \n',
        u'\t type="text/javascript">// ',
        u'\t\tfunction reloadPage() {\n',
        u'\t\t\twindow.location.reload();\n',
        u'\t\t}\n',
        u'\t//]]>\n',
        u'\t \n',
        u' \n',
        u' \n',
        u'\n',
        u' ',
        u' border="0" cellspacing="0" cellpadding="0" width="100%">\n',
        u' \n',
        u' align="right" class="topheading" width="44"> alt="Indus Logo" border="0" height="39" width="44" src=" /MessAdmin/images/logo.gif"> class="topheading">Indus Application Management Console \n',
        u' \n',
        u' \n',
        u' \n',
        u' \n',
        u' border="0" cellspacing="0" cellpadding="0">\n',
        u' \n',
        u' class="backtab"> class="tabs" ="http://*****:*****@ value=\' serverInfos.getSystemProperties().get("java.home") %> \'/> \n',
        u'\t\t \n',
        u'\t \n',
        u'\t \n',
        u'\t\t Platform \n',
        u'\t\t \n',
        u'\t\t\t value=\' serverInfos.getSystemProperties().get("os.name") %> \'/> / value=\' serverInfos.getSystemProperties().get("os.arch") %> \'/> \n',
        u'\t\t\t value=\' serverInfos.getSystemProperties().get("os.version") %> \'/> \n',
        u'\t\t \n',
        u'\t \n',
        u' \n',
        u' \n',
        u'\n',
        u' style="text-align: center;"> type="button" onclick="window.location.reload()">Refresh \n',
        u'\n',
        u' class="error"> value=\' request.getAttribute("error") %> \'/> \n',
        u' class="message"> value=\' request.getAttribute("message") %> \'/> \n',
        u'\n',
        u' id="extraServerAttributes">\n',
        u' items=" serverInfos.getServerSpecificData() %>" var="serverSpecificData" varStatus="status">\n',
        u' java.util.Map.Entry serverSpecificData = (java.util.Map.Entry) pageContext.getAttribute("serverSpecificData"); %>\n',
        u'\t \n',
        u'\t\t <legend > serverSpecificData.getKey() %> </legend> \n',
        u'\t\t serverSpecificData.getValue() %>\n',
        u'\t \n',
        u' \n',
        u' \n',
        u'\n',
        u' <jsp:include page="footer.jsp "/> \n',
        u'\n',
        u' \n',
        u' '
    ]
    # the comparison is strict: same items, same order, same whitespace
    assert expected == result