import re
import warnings

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.python import flatten, to_unicode
from w3lib.html import replace_entities


def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using the
    following policies:

    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is returned
    """
    warnings.warn(
        "scrapy.utils.misc.extract_regex has moved to parsel.utils.extract_regex.",
        ScrapyDeprecationWarning,
        stacklevel=2
    )
    if isinstance(regex, str):
        regex = re.compile(regex, re.UNICODE)

    try:
        strings = [regex.search(text).group('extract')]   # named group
    except Exception:
        strings = regex.findall(text)    # full regex or numbered groups
    strings = flatten(strings)

    if isinstance(text, str):
        return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
    else:
        return [replace_entities(to_unicode(s, encoding), keep=['lt', 'amp'])
                for s in strings]
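# A minimal usage sketch of the three policies above; the sample text and
# patterns are illustrative only:
text = 'Price: &pound;42 &lt;today only&gt;'

# named group "extract": only that group is returned
extract_regex(r'Price: (?P<extract>&pound;\d+)', text)
# -> ['£42']

# numbered groups: all groups are returned, flattened
extract_regex(r'Price: (&pound;)(\d+)', text)
# -> ['£', '42']

# no groups: the entire match is returned (entities decoded, except the
# kept "lt" and "amp")
extract_regex(r'&pound;\d+', text)
# -> ['£42']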
def test_regular(self):
    # regular conversions
    self.assertEqual(replace_entities(u'As low as &#163;100!'),
                     u'As low as \xa3100!')
    self.assertEqual(replace_entities(b'As low as &#163;100!'),
                     u'As low as \xa3100!')
    self.assertEqual(
        replace_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant'),
        u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')
def test_returns_unicode(self):
    # make sure it always returns unicode
    assert isinstance(replace_entities(b'no entities'), six.text_type)
    assert isinstance(replace_entities(b'Price: &pound;100!'), six.text_type)
    assert isinstance(replace_entities(u'no entities'), six.text_type)
    assert isinstance(replace_entities(u'Price: &pound;100!'), six.text_type)
def test_illegal_entities(self):
    self.assertEqual(replace_entities('a &lt; b &illegal; c &#12345678; six',
                                      remove_illegal=False),
                     u'a < b &illegal; c &#12345678; six')
    self.assertEqual(replace_entities('a &lt; b &illegal; c &#12345678; six',
                                      remove_illegal=True),
                     u'a < b  c  six')
    self.assertEqual(replace_entities('x&#x2264;y'), u'x\u2264y')
    self.assertEqual(replace_entities('x&#157;y'), u'xy')
    self.assertEqual(replace_entities('x&#157;y', remove_illegal=False), u'xy')
def test_illegal_entities(self):
    self.assertEqual(
        replace_entities('a &lt; b &illegal; c &#12345678; six',
                         remove_illegal=False),
        u'a < b &illegal; c &#12345678; six')
    self.assertEqual(
        replace_entities('a &lt; b &illegal; c &#12345678; six',
                         remove_illegal=True),
        u'a < b  c  six')
    self.assertEqual(replace_entities('x&#x2264;y'), u'x\u2264y')
def test_missing_semicolon(self):
    for entity, result in (
        (
            '&lt&lt!',
            '<<!',
        ),
        (
            '&LT!',
            '<!',
        ),
        (
            '&#X41 ',
            'A ',
        ),
        (
            '&#x41!',
            'A!',
        ),
        (
            '&#x41h',
            'Ah',
        ),
        (
            '&#65!',
            'A!',
        ),
        (
            '&#65x',
            'Ax',
        ),
        (
            '&sup3!',
            '\u00B3!',
        ),
        (
            '&Aacute!',
            '\u00C1!',
        ),
        (
            '&#9731!',
            '\u2603!',
        ),
        (
            '&#153',
            '\u2122',
        ),
        (
            '&#x99',
            '\u2122',
        ),
    ):
        self.assertEqual(replace_entities(entity, encoding='cp1252'), result)
        self.assertEqual(
            replace_entities('x%sy' % entity, encoding='cp1252'),
            'x%sy' % result)
def test_keep_entities(self):
    # keep some entities
    self.assertEqual(
        replace_entities(b'<b>Low &lt; High &amp; Medium &pound; six</b>',
                         keep=['lt', 'amp']),
        '<b>Low &lt; High &amp; Medium \xa3 six</b>')
    self.assertEqual(
        replace_entities('<b>Low &lt; High &amp; Medium &pound; six</b>',
                         keep=['lt', 'amp']),
        '<b>Low &lt; High &amp; Medium \xa3 six</b>')
def test_missing_semicolon(self):
    for entity, result in (
        (
            "&lt&lt!",
            "<<!",
        ),
        (
            "&LT!",
            "<!",
        ),
        (
            "&#X41 ",
            "A ",
        ),
        (
            "&#x41!",
            "A!",
        ),
        (
            "&#x41h",
            "Ah",
        ),
        (
            "&#65!",
            "A!",
        ),
        (
            "&#65x",
            "Ax",
        ),
        (
            "&sup3!",
            "\u00B3!",
        ),
        (
            "&Aacute!",
            "\u00C1!",
        ),
        (
            "&#9731!",
            "\u2603!",
        ),
        (
            "&#153",
            "\u2122",
        ),
        (
            "&#x99",
            "\u2122",
        ),
    ):
        self.assertEqual(replace_entities(entity, encoding="cp1252"), result)
        self.assertEqual(
            replace_entities(f"x{entity}y", encoding="cp1252"), f"x{result}y"
        )
def test_keep_entities(self):
    # keep some entities
    self.assertEqual(
        replace_entities(
            b"<b>Low &lt; High &amp; Medium &pound; six</b>", keep=["lt", "amp"]
        ),
        "<b>Low &lt; High &amp; Medium \xa3 six</b>",
    )
    self.assertEqual(
        replace_entities(
            "<b>Low &lt; High &amp; Medium &pound; six</b>", keep=["lt", "amp"]
        ),
        "<b>Low &lt; High &amp; Medium \xa3 six</b>",
    )
def test_illegal_entities(self):
    self.assertEqual(
        replace_entities("a &lt; b &illegal; c &#12345678; six", remove_illegal=False),
        "a < b &illegal; c &#12345678; six",
    )
    self.assertEqual(
        replace_entities("a &lt; b &illegal; c &#12345678; six", remove_illegal=True),
        "a < b  c  six",
    )
    self.assertEqual(replace_entities("x&#x2264;y"), "x\u2264y")
    self.assertEqual(replace_entities("x&#157;y"), "xy")
    self.assertEqual(replace_entities("x&#157;y", remove_illegal=False), "xy")
def clean_url(url):
    # base_url and response_encoding come from the enclosing scope
    clean_url = ''
    try:
        clean_url = urljoin(
            base_url,
            replace_entities(clean_link(url.decode(response_encoding))))
    except ValueError:
        pass
    return clean_url
def process(self, data, url_object):
    """Process HTML data.

    Replaces entities and removes tags (except comments) before
    processing with TextProcessor.
    """
    logging.info("Process HTML %s" % url_object.url)
    try:
        encoding, data = get_codec_and_string(data)
        # Remove style tags to avoid false positives from inline styles
        data = remove_tags_with_content(data, which_ones=('style',))
    except UnicodeDecodeError as ude:
        logging.error('UnicodeDecodeError in handle_error_method: {}'.format(ude))
        logging.error('Error happened for file: {}'.format(url_object.url))
        return False

    # Convert HTML entities to their unicode representation
    entity_replaced_html = replace_entities(data)
    # Collapse whitespace (including newlines), since extra whitespace is
    # not significant in HTML (except inside comment tags)
    collapsed_html = _whitespace_re.sub(' ', entity_replaced_html)
    # Replace tags with <> character to make sure text processor
    # doesn't match across tag boundaries.
    replace_tags_text = _html_tag_re.sub('<>', collapsed_html)
    return self.text_processor.process(replace_tags_text, url_object)
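# _whitespace_re and _html_tag_re are module-level regexes in the original
# project; plausible definitions, inferred from how they are used above:
import re

_whitespace_re = re.compile(r'\s+')    # runs of whitespace, incl. newlines
_html_tag_re = re.compile(r'<[^>]*>')  # any single HTML tag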
def remove_garbage(val):
    val = replace_escape_chars(val)
    val = replace_entities(val)
    val = re.sub(r'\.', '. ', val)
    val = re.sub(r'\s+,\s{2,}', ', ', val)
    val = re.sub(r'\s{2,}', ' ', val)
    return val.strip()
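# An illustrative run (the sample string is made up): escape chars go first,
# then entities are decoded, then the spacing rules are applied:
remove_garbage('Fast&nbsp;shipping.Great &amp; cheap.\n')
# -> 'Fast\xa0shipping. Great & cheap.'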
def normalize_web_content(x, keep=('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong'),
                          token='____SECTION____'):
    """Normalize web content.

    Parameters
    ----------
    keep : tuple
        HTML tags to keep.
    token : str or None
        Token to use for replacing kept HTML tags. Do not replace if `None`.
    """
    try:
        x = strip_html5_whitespace(x)
        x = remove_comments(x)
        x = remove_tags(x, keep=keep)
        if token:
            x = replace_tags(x, token=token)
        x = replace_entities(x)
        x = replace_escape_chars(x)
    except (TypeError, AttributeError):
        pass
    for part in _rx_web_sectionize.split(x):
        if part:
            yield part
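# _rx_web_sectionize is module-level in the original project; a plausible
# definition, assuming parts are split on newlines and on the
# ____SECTION____ token that replace_tags() inserts above:
import re

_rx_web_sectionize = re.compile(r'\n|____SECTION____')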
from string import printable

def clearText(inputTextFile, outputTextFile, outputErr):
    input = open(inputTextFile, 'rb')
    sc = chardet.detect(input.read())
    input.close()
    # print(inputTextFile, sc)
    # if sc["encoding"] != None:
    #     outputErr.write(path + "\n")
    if sc["encoding"] is not None and sc["confidence"] > 0.5:
        input = open(inputTextFile, 'r', encoding=sc["encoding"])
        output = open(outputTextFile, 'w', encoding="utf-8")
        try:
            text = input.read()
        except Exception as e:
            print(e)
            outputErr = open("C:\\ErrorFiles.log", 'a', encoding="utf-8")
            outputErr.write(inputTextFile + " | " + outputTextFile + "\n")
            outputErr.close()
            return
        text = replace_entities(text)
        PUNCTUATION = ''  # u'…»«—№’'
        RU_ALPHABET = u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюя'
        final_new_text = re.sub(
            "[^{}]+".format(printable + PUNCTUATION + RU_ALPHABET), "", text)
        final_new_text = re.sub(r'\s+', ' ', final_new_text)
        for c in list(PUNCTUATION):
            # str.replace returns a new string, so reassign the result
            final_new_text = final_new_text.replace(c, ' ' + c + ' ')
        final_new_text = final_new_text.replace(u'\xa0', ' ')
        output.write(final_new_text)
        output.close()
        input.close()
    else:
        outputErr.write(inputTextFile + "\n")
def text(region):
    """Converts HTML to text. There is no attempt at formatting other than
    removing excessive whitespace.

    For example:
    >>> t = lambda s: text(htmlregion(s))
    >>> t(u'<h1>test</h1>')
    u'test'

    Leading and trailing whitespace are removed
    >>> t(u'<h1> test</h1> ')
    u'test'

    Comments are removed
    >>> t(u'test <!-- this is a comment --> me')
    u'test me'

    Text between script tags is ignored
    >>> t(u"scripts are<script>n't</script> ignored")
    u'scripts are ignored'

    HTML entities are converted to text
    >>> t(u"only &pound;42")
    u'only \\xa342'

    >>> t(u"<p>The text</p><?xml:namespace blabla/><p>is here</p>")
    u'The text is here'
    """
    text = replace_entities(region.text_content, encoding=region.htmlpage.encoding)
    return _WS.sub(u' ', text).strip()
def queryPreprocessing(query, args):
    # regular expressions
    p_tag_comment = re.compile(r'(<.*?>|<!--.*-->)')
    p_alpha_digit = re.compile(r'\b([a-z]+)-([0-9]+)\b', re.I)
    p_digit_alpha = re.compile(r'\b([0-9]+)-([a-z]+)\b', re.I)
    p_dot_acronym = re.compile(r'\b([a-z]+\.)+[a-z]+(\.|\b)', re.I)
    p_date = re.compile(
        r"""\b
        ([0-9]{1,2}/[0-9]{1,2}/[0-9]{2,4})
        |([0-9]{1,2}-[0-9]{1,2}-[0-9]{2,4})
        |(((Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[.]?
        |January|February|March|April|May|June|July|August
        |September|October|November|December)
        \ [0-9]{1,2}(st|nd|rd|th)?,\ [0-9]{2,4})
        \b""",
        re.VERBOSE | re.I)
    p_docno = re.compile(r'(?:<DOCNO>\s*)(.+)(?:\s*</DOCNO>)', re.I)
    p_num1 = re.compile(r',([0-9]{3})')
    p_num2 = re.compile(r'\b(\d+)[.]0+\b')
    p_file_extension = re.compile(
        r'([^\\\/:*?\"<>|\s]+)\.(aif|cda|mid|midi|mp3|mpa|ogg|wav|wma|wpl|7z|arj|deb|pkg|rar|rpm|tar\.gz|z|zip|bin|dmg|iso|toast|vcd|csv|dat|db|dbf|log|mdb|sav|sql|tar|xml|apk|bat|bin|cgi|pl|com|exe|gadget|jar|py|wsf|fnt|fon|otf|ttf|ai|bmp|gif|ico|jpeg|jpg|png|ps|psd|svg|tif|tiff|asp|aspx|cer|cfm|css|htm|html|js|jsp|part|php|rss|xhtml|key|odp|pps|ppt|pptx|class|cpp|cs|h|java|sh|swift|vb|ods|xlr|xls|xlsx|bak|cab|cfg|cpl|cur|dll|dmp|drv|icns|ico|ini|lnk|msi|sys|tmp|3g2|3gp|avi|flv|h264|m4v|mkv|mov|mp4|mpg|mpeg|rm|swf|vob|wmv|doc|docx|odt|pdf|rtf|tex|txt|wks|wps|wpd)',
        re.I)
    p_prefix = re.compile(
        r'\b(a|an|ante|anti|auto|circum|co|com|con|contra|contro|de|dis|en|em|ex|extra|fore|hetero|homo|homeo|hyper|il|im|in|ir|inter|intra|intro|macro|micro|mid|mis|mono|non|omni|over|post|pre|pro|re|semi|sub|super|sym|syn|trans|tri|un|under|uni)-([a-z])+\b',
        re.I)
    p_hyphen = re.compile(r'\b(\w+-)+\w+\b')

    # create a porter stemmer
    stemmer = PorterStemmer()

    # convert all character references (e.g. &gt;, &#62;, &#x3e;) to unicode
    query = replace_entities(query)
    query = html.unescape(query)
    # some queries have '/', need to be handled specifically
    query = query.replace('/', ' / ')
    # convert to lower case
    query = query.lower()
    # expand file extension
    query = p_file_extension.sub(r'\g<1>\g<2> \g<2>', query)
    # ph.D. -> phd
    query = p_dot_acronym.sub(dotAcronym, query)
    # convert date to mm/dd/yyyy format or remove it if invalid
    query = p_date.sub(dateReplace, query)
    # digit format
    query = p_num1.sub(r'\g<1>', query)  # remove ',' in 1,000
    query = p_num2.sub(r'\g<1>', query)  # remove '.00' in 1.00
    # expand digit-alpha format
    query = p_digit_alpha.sub(digitAlpha, query)
    # expand alpha-digit format
    query = p_alpha_digit.sub(alphaDigit, query)
    # expand stem with hyphen prefix
    query = p_prefix.sub(prefixReplace, query)
    # expand hyphenated word
    query = p_hyphen.sub(hyphenReplace, query)
    # tokenize query
    query = nltk.word_tokenize(query)
    # apply Porter Stemmer
    if args.index_type == 'stem':
        query = [stemmer.stem(word) for word in query]
    # remove term not in idx_table (value will be 0 for all retrieval)
    query = [x for x in query if x in idx_table]
    return query
def make_table():
    items = []
    for obj in obj_location_list:
        items.append(Item(obj.total_time))

    # Populate the table
    table = ItemTable(items)
    table_html = str(table.__html__().replace("<table>", '<table class="table">'))
    # print(table_html)
    table_html = replace_entities(table_html)
    # counter1 = count(1)
    # table_html = re.sub('data-target="#demo', lambda m: m.group() + str(next(counter1)), table_html)
    # table_html = table_html.replace("</td></tr>", '</td></tr> <tr> <td colspan="6" class="hiddenRow" style="padding:0!important;"><div class="accordian-body collapse" id="demo"> <ul class="list-group"> [cmmt] </ul> </div></td></tr>')
    # counter2 = count(1)
    # table_html = re.sub('id="demo', lambda m: m.group() + str(next(counter2)), table_html)
    # for key, value in theme_dict.items():
    #     for sub_theme in value:
    #         table_html = table_html.replace('[cmmt]', get_cmmts(sub_theme.theme, theme_dict), 1)
    # g.theme_dict = result_list
    return table_html
def replace_all_entities(string):
    """ replace all XML entities, even poorly encoded """
    # hack because BGG encodes 'Ü' as '&amp;#195;&amp;#156;' (d'oh!)
    # note that this may corrupt text that's actually encoded correctly!
    return replace_entities(
        replace_utf_entities(
            string.replace("&amp;", "&")
                  .replace("&#38;", "&")
                  .replace("&#x26;", "&")))
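# replace_utf_entities is project-specific, not part of w3lib; a rough sketch
# of what it plausibly does, assuming adjacent numeric references that are
# really UTF-8 bytes (e.g. '&#195;&#156;' -> 'Ü'):
import re

_utf_pair = re.compile(r'&#(\d+);&#(\d+);')

def replace_utf_entities(string):
    def _repl(match):
        raw = bytes(int(g) for g in match.groups())
        try:
            return raw.decode('utf-8')   # two numeric refs -> one character
        except UnicodeDecodeError:
            return match.group(0)        # not valid UTF-8: leave untouched
    return _utf_pair.sub(_repl, string)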
def clean_text(text):
    """Clean text from tags, replace entities and normalize whitespaces"""
    text = remove_tags(text)
    text = replace_entities(text)
    # Normalize whitespace
    text = re.sub(r'(\s)+', '\\1', text)
    # Strip whitespace
    return text.strip()
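# A quick demonstration of clean_text above (sample markup is made up):
clean_text('<b>Fish &amp; Chips\n\n &pound;7</b>')
# -> 'Fish & Chips £7'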
def test_missing_semicolon(self):
    for entity, result in (
            ('&lt&lt!', '<<!',),
            ('&LT!', '<!',),
            ('&#X41 ', 'A ',),
            ('&#x41!', 'A!',),
            ('&#x41h', 'Ah',),
            ('&#65!', 'A!',),
            ('&#65x', 'Ax',),
            ('&sup3!', u'\u00B3!',),
            ('&Aacute!', u'\u00C1!',),
            ('&#9731!', u'\u2603!',),
            ('&#153', u'\u2122',),
            ('&#x99', u'\u2122',),
    ):
        self.assertEqual(replace_entities(entity, encoding='cp1252'), result)
        self.assertEqual(replace_entities('x%sy' % entity, encoding='cp1252'),
                         u'x%sy' % result)
def re(self, regex):
    if isinstance(regex, types.StringTypes):
        regex = re.compile(regex, re.UNICODE)
    text = self.extract()
    try:
        lst = [regex.search(text).group('extract')]
    except:
        lst = regex.findall(text)
    return [replace_entities(s, keep=['lt', 'amp']) for s in flatten(lst)]
def sanitize(iterable):
    # TODO change name and add other options
    iterable = (x.strip() for x in iterable)
    iterable = (re.sub(r'[\n\t\r\s]+', ' ', x) for x in iterable)
    iterable = (x.encode('ascii', errors='ignore').decode('ascii') for x in iterable)
    iterable = (replace_entities(x) for x in iterable)
    iterable = (remove_tags(x) for x in iterable)
    return iterable
def sanitize(iterable):
    # TODO change name and add other options
    iterable = (x.strip() for x in iterable)
    iterable = (re.sub(r'[\n\t\r\s]+', ' ', x) for x in iterable)
    iterable = (unidecode(x) for x in iterable)
    iterable = (replace_entities(x) for x in iterable)
    iterable = (remove_tags(x) for x in iterable)
    return iterable
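# Usage sketch covering both sanitize() variants above: the ascii-ignore
# version drops the non-ASCII 'é' outright, while the unidecode version
# transliterates it; both decode entities before stripping tags:
list(sanitize(['  <p>Caf\xe9 &amp; bar</p>\n']))
# ascii-ignore variant -> ['Caf & bar']
# unidecode variant    -> ['Cafe & bar']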
def route():
    """Get all GET requests and dump them to logs."""
    # Unescape HTML and write to log.
    with open("cap.log", "a") as f:
        f.write(replace_entities(str(request.url)) + "\n")

    # If we think a keylogger (param: 'c') is trying to get us information...
    with open("key.log", "a") as f:
        if "c" in request.args:
            keys = loads(replace_entities(request.args.get('c')))
            # From the JSON list get only keys pressed. If impossible, stop.
            try:
                keys = "".join(keys)
                f.write(keys + '\n')
            except Exception:
                pass
    return message
def clean_text(text):
    # Helper function to clean text data and remove non-printable chars.
    text = text.strip()
    text = re.sub(r'[\n\t\r\s]+', ' ', text)
    text = text.encode('ascii', errors='ignore').decode('ascii')
    text = replace_entities(text)
    text = remove_tags(text)
    return text
def extract_raw_text(html):
    text = replace_entities(html)
    text = re_clean_blanks.sub(u' ', text)
    text = re_clean_comments.sub(u' ', text)
    text = re_clean_javascript.sub(u' ', text)
    text = re_clean_style.sub(u' ', text)
    text = re_clean_balises.sub(u' ', text)
    text = re_clean_blanks.sub(u' ', text).strip()
    text = re_clean_multiCR.sub(u'\n', text)
    return text
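# The re_clean_* patterns are module-level in the original project; rough
# sketches of plausible definitions ("balises" is French for "tags"):
import re

re_clean_blanks = re.compile(r'[ \t\f\v]+')
re_clean_comments = re.compile(r'<!--.*?-->', re.DOTALL)
re_clean_javascript = re.compile(r'<script.*?</script>', re.DOTALL | re.IGNORECASE)
re_clean_style = re.compile(r'<style.*?</style>', re.DOTALL | re.IGNORECASE)
re_clean_balises = re.compile(r'<[^>]+>')
re_clean_multiCR = re.compile(r'\n+')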
def clean_file(file_name, fields):
    res = []
    with jsonlines.open(file_name) as rdr:
        for line in rdr:
            for f in fields:
                if not line[f]:
                    continue
                line[f] = replace_entities(line[f].replace("\n", "").strip())
            res.append(copy.deepcopy(line))
    return res
def clean_text(text):
    """Clean text from tags, replace entities and normalize whitespaces"""
    if not isinstance(text, six.string_types):
        return text
    text = remove_tags(text)
    text = replace_entities(text)
    # Normalize whitespace
    text = re.sub(r'(\s)+', '\\1', text)
    # Strip whitespace
    return text.strip()
def image_url(txt):
    """convert text to a url

    this is quite conservative, since relative urls are supported
    Example:

    >>> image_url('')
    >>> image_url('   ')
    >>> image_url(' \\n\\n  ')
    >>> image_url('foo-bar.jpg')
    ['foo-bar.jpg']
    >>> image_url('/images/main_logo12.gif')
    ['/images/main_logo12.gif']
    >>> image_url("http://www.image.com/image.jpg")
    ['http://www.image.com/image.jpg']
    >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
    ['http://www.domain.com/path1/path2/path3/image.jpg']
    >>> image_url("/path1/path2/path3/image.jpg")
    ['/path1/path2/path3/image.jpg']
    >>> image_url("path1/path2/image.jpg")
    ['path1/path2/image.jpg']
    >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")')
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
    ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
    >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
    ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
    >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
    ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
    >>> image_url('../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg')
    ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait%5B1%5D.jpg']
    >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
    ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
    >>> image_url('http://www.site.com/image.php')
    ['http://www.site.com/image.php']
    >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom)')
    ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']

    """
    imgurl = extract_image_url(txt)
    return [safe_url_string(replace_entities(url(imgurl)))] if imgurl else None
def normalizeTool(cls, html):
    """
    :param html: string from which to strip HTML markup
    :return: the string with HTML markup removed
    """
    removeHtml = w3.replace_escape_chars(
        w3.replace_entities(w3.remove_tags(html)), replace_by=" ")
    # removeHtml = w3.replace_escape_chars(w3.replace_entities(w3.remove_tags(html)))
    removeEscapeChars = " ".join(removeHtml.split())
    return removeEscapeChars
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
    if base_url is None:
        base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
    clean_url = lambda u: urljoin(
        base_url, replace_entities(clean_link(u.decode(response_encoding))))
    clean_text = lambda t: replace_escape_chars(
        remove_tags(t.decode(response_encoding))).strip()
    links_text = linkre.findall(response_text)
    return [Link(clean_url(url).encode(response_encoding), clean_text(text))
            for url, _, text in links_text]
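# linkre is a module-level regex in the original extractor; a rough stand-in
# with the three groups (quoted href value, remaining attributes, anchor text)
# that the loop above unpacks -- note clean_link() strips the quotes later:
import re

linkre = re.compile(
    r'<a\s.*?href=("[^"]*"|\'[^\']*\'|[^\s>]*)(\s.*?)?>(.*?)</a>',
    re.DOTALL | re.IGNORECASE)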
def image_url(txt):
    """convert text to a url

    this is quite conservative, since relative urls are supported
    Example:

    >>> image_url('')
    >>> image_url('   ')
    >>> image_url(' \\n\\n  ')
    >>> image_url('foo-bar.jpg')
    ['foo-bar.jpg']
    >>> image_url('/images/main_logo12.gif')
    ['/images/main_logo12.gif']
    >>> image_url("http://www.image.com/image.jpg")
    ['http://www.image.com/image.jpg']
    >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
    ['http://www.domain.com/path1/path2/path3/image.jpg']
    >>> image_url("/path1/path2/path3/image.jpg")
    ['/path1/path2/path3/image.jpg']
    >>> image_url("path1/path2/image.jpg")
    ['path1/path2/image.jpg']
    >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")')
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
    ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
    >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
    ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
    >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
    ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
    >>> image_url('../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg')
    ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg']
    >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
    ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
    >>> image_url('http://www.site.com/image.php')
    ['http://www.site.com/image.php']
    >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom)')
    ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']

    """
    imgurl = extract_image_url(txt)
    return [safe_url_string(replace_entities(url(imgurl)))] if imgurl else None
def extract_regex(regex, text, encoding="utf-8"): """Extract a list of unicode strings from the given text/encoding using the following policies: * if the regex contains a named group called "extract" that will be returned * if the regex contains multiple numbered groups, all those will be returned (flattened) * if the regex doesn't contain any group the entire regex matching is returned """ if isinstance(regex, six.string_types): regex = re.compile(regex, re.UNICODE) try: strings = [regex.search(text).group("extract")] # named group except: strings = regex.findall(text) # full regex or numbered groups strings = flatten(strings) if isinstance(text, six.text_type): return [replace_entities(s, keep=["lt", "amp"]) for s in strings] else: return [replace_entities(to_unicode(s, encoding), keep=["lt", "amp"]) for s in strings]
def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using the
    following policies:

    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is returned
    """
    if isinstance(regex, basestring):
        regex = re.compile(regex, re.UNICODE)

    try:
        strings = [regex.search(text).group('extract')]   # named group
    except Exception:
        strings = regex.findall(text)    # full regex or numbered groups
    # flatten removes nested structures (lists of lists, etc.) and
    # returns a single flat list
    strings = flatten(strings)

    if isinstance(text, unicode):
        return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
    else:
        return [replace_entities(unicode(s, encoding), keep=['lt', 'amp'])
                for s in strings]
def parse_item(self, response):
    links = dict()
    link_titles = set()
    url = response.url.split('#')[0].lower()
    url_head = url.split('/pages/')[0] + '/pages/'
    title = response.xpath('//meta[@name="DC.title"]/@content').extract_first()
    if title and title.endswith('- NHS Choices'):
        title = title.rstrip(' NHS Choices').rstrip(' -')
    subjects = response.xpath(
        '//meta[@name="DC.Subject"][@scheme="NHSC.Ontology"]/@content'
    ).extract_first().split(', ')
    subjects = [s.lower() for s in subjects if s]
    if not subjects:
        subjects = [title.lower()]
    description = clean_text(
        response.xpath('//meta[@name="DC.description"]/@content').extract_first())
    raw_page_content = response.xpath(
        '//div[@class="main-content healthaz-content clear"]/.').extract_first()
    page_content = clean_text(replace_entities(remove_tags(raw_page_content)))

    for a in response.xpath(
            '//div[@class="main-content healthaz-content clear"]/descendant::a'):
        label = a.xpath('text()').extract_first()
        href = a.xpath('@href').extract_first()
        if href and label:
            href = self.base_url + href.lstrip('/')
            href = href.lower()
            label = clean_text(label)
            if '/conditions/' in href and url_head not in href:
                link_titles.add(label)
                if href in links:
                    links[href]['count'] += 1
                else:
                    links[href] = {
                        'count': 1,
                        'label': label
                    }
            if url_head in href and href != url:
                print("********************", href)
                yield scrapy.Request(href, self.parse_item)

    article = NhsItem()
    article['url'] = url
    article['title'] = title
    article['subjects'] = subjects
    article['description'] = description
    article['page_content'] = str(page_content)
    article['links'] = links
    article['link_titles'] = list(link_titles)
    yield article
def _has_ajaxcrawlable_meta(text):
    """
    >>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment" content="!"/></head><body></body></html>')
    True
    >>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
    True
    >>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment" content="!"/>--></head><body></body></html>')
    False
    >>> _has_ajaxcrawlable_meta('<html></html>')
    False
    """
    # Stripping scripts and comments is slow (about 20x slower than
    # just checking if a string is in text); this is a quick fail-fast
    # path that should work for most pages.
    if 'fragment' not in text:
        return False
    if 'content' not in text:
        return False

    text = html.remove_tags_with_content(text, ('script', 'noscript'))
    text = html.replace_entities(text)
    text = html.remove_comments(text)
    return _ajax_crawlable_re.search(text) is not None
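# _ajax_crawlable_re is module-level in Scrapy's ajaxcrawl middleware; it is
# approximately the following pattern, matching the "AJAX crawlable" marker
# <meta name="fragment" content="!"> after tags/entities/comments are cleaned:
import re

_ajax_crawlable_re = re.compile(
    r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>')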
import urllib
import urlparse
from urlparse import urljoin

from w3lib.html import replace_entities


def clean_link(link_text):
    return link_text.strip("\t\r\n '\"")

# return the first url in a list, or None if the list is empty
list_first_item = lambda x: x[0] if x else None

# join the url with the base url, removing whitespace, punctuation and entities
clean_url = lambda base_url, u, response_encoding: urljoin(
    base_url, replace_entities(clean_link(u.decode(response_encoding))))

# get a query-string parameter
def get_query(url, key):
    bits = list(urlparse.urlparse(url))
    query = urlparse.parse_qs(bits[4])
    return query[key][0]

# set query-string parameters
def set_query(url, **args):
    bits = list(urlparse.urlparse(url))
    query = urlparse.parse_qs(bits[4])
    query.update(args)
    bits[4] = urllib.urlencode(query, doseq=True)
    return urlparse.urlunparse(bits)
def _cleanup(value):
    return " ".join(replace_entities(replace_tags(value)).strip().split())
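# An illustrative call: replace_tags() strips the markup, replace_entities()
# decodes &amp;, and the strip/split/join collapses all runs of whitespace:
_cleanup('  <b>Fish &amp;\n Chips</b> ')
# -> 'Fish & Chips'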
def remove_entities(text, encoding):
    return replace_entities(text, keep=_ENTITIES_TO_KEEP, encoding=encoding)
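# _ENTITIES_TO_KEEP is defined elsewhere in the original module; a plausible
# value, assuming the markup-significant entities are the ones to preserve:
_ENTITIES_TO_KEEP = ('lt', 'amp')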
""" if type(arg) is types.ListType: return list(set(arg)) elif type(arg) is types.TupleType: return tuple(set(arg)) return arg def clean_link(link_text): """ Remove leading and trailing whitespace and punctuation """ return link_text.strip("\t\r\n '\"") clean_url = lambda base_url, u, response_encoding: urljoin( base_url, replace_entities(text=clean_link(u), encoding=response_encoding) ) # # clean_url = lambda base_url, u, response_encoding: urljoin(base_url, # replace_entities( # text=clean_link(u.decode(response_encoding, 'ignore')), # encoding=response_encoding) # ) """ remove leading and trailing whitespace and punctuation and entities from the given text. then join the base_url and the link that extract """
#!/usr/bin/python
# -*- coding: utf-8 -*-
# __author__ 'Hao LI'

import types

from w3lib.html import replace_entities
from urlparse import urlparse, urljoin

NULL = [None, 'null']
prefix = "www.bbc"
new_prefix = "http://www.bbc.com"


def clean_link(link_text):
    """ Remove leading and trailing whitespace and punctuation """
    return link_text.strip("\t\r\n '\"")

# remove leading and trailing whitespace, punctuation and entities from the
# given text, then join the base_url and the extracted link
clean_url = lambda base_url, u, response_encoding: urljoin(
    base_url, replace_entities(clean_link(u.decode(response_encoding))))
def test_encoding(self):
    self.assertEqual(replace_entities(b'x\x99&#153;&#8482;y', encoding='cp1252'),
                     u'x\u2122\u2122\u2122y')
def test_browser_hack(self):
    # check browser hack for numeric character references in the 80-9F range
    self.assertEqual(replace_entities('x&#153;y', encoding='cp1252'), u'x\u2122y')
    self.assertEqual(replace_entities('x&#x99;y', encoding='cp1252'), u'x\u2122y')
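# Background for the browser hack above: &#153; nominally names U+0099, a C1
# control character, but browsers historically decode references in the
# 128-159 range through cp1252, and w3lib mirrors that behaviour; the
# mapping is easy to confirm directly:
assert b'\x99'.decode('cp1252') == '\u2122'  # 0x99 is the ™ glyph in cp1252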
def test_keep_entities(self):
    # keep some entities
    self.assertEqual(replace_entities(b'<b>Low &lt; High &amp; Medium &pound; six</b>',
                                      keep=['lt', 'amp']),
                     u'<b>Low &lt; High &amp; Medium \xa3 six</b>')
    self.assertEqual(replace_entities(u'<b>Low &lt; High &amp; Medium &pound; six</b>',
                                      keep=[u'lt', u'amp']),
                     u'<b>Low &lt; High &amp; Medium \xa3 six</b>')