def processDataPrice(values):
    """Normalize a European-formatted price string to a plain decimal string.

    E.g. "1.234,56 EUR" -> "1234.56": strips the euro sign, the "EUR" token,
    spaces and thousands separators, then turns the decimal comma into a dot.
    """
    # NOTE: which_ones must be a tuple of substrings.  The original code
    # passed the bare string 'EUR', which w3lib iterates character by
    # character, deleting every 'E', 'U' and 'R' anywhere in the input
    # instead of the literal token "EUR".
    result = replace_escape_chars(values, which_ones=('€',), replace_by=u'.')
    result = replace_escape_chars(result, which_ones=('EUR',), replace_by=u'')
    result = replace_escape_chars(result, which_ones=(' ',), replace_by=u'')
    result = replace_escape_chars(result, which_ones=('.',), replace_by=u'')
    result = replace_escape_chars(result, which_ones=(',',), replace_by=u'.')
    return result
def process_item(self, item, spider):
    """Clean a scraped deck item: normalize fields, map card names to ids,
    expand the card list by per-card counts, and drop incomplete decks.

    Raises:
        DropItem: when the expanded deck does not contain exactly 30 cards.
    """
    # First token of the raw date is the date itself; the remainder is the
    # set name wrapped in parentheses.
    raw_date = item['date'].split()
    item['date'] = dateparser.parse(raw_date[0])
    # user
    item['user'] = replace_escape_chars(item['user'])
    # set name, parentheses stripped
    item['deck_set'] = re.sub('[()]', '', ' '.join(raw_date[1:]))
    # deck rating: first number in the raw string.  Raw string avoids the
    # invalid-escape DeprecationWarning the original '\d+' produced.
    item['rating'] = int(re.findall(r'\d+', item['rating'])[0])
    # per-card counts
    item['count'] = list(map(int, item['count']))
    # replace card names by unique ids (whitespace-normalized first)
    for i, card in enumerate(item['cards']):
        card_name_raw = replace_escape_chars(card)
        card_name_clean = ' '.join(card_name_raw.split())
        item['cards'][i] = self.get_card_id(card_name_clean)
    # expand each card id by its count
    id_list = []
    for card_id, count in zip(item['cards'], item['count']):
        id_list.extend([card_id] * count)
    item['cards'] = id_list
    # numeric fields
    item['deck_id'] = int(item['deck_id'])
    item['craft_cost'] = int(item['craft_cost'])
    # deck format: truthy flag means Wild, otherwise Standard
    item['deck_format'] = 'W' if item['deck_format'] else 'S'
    # a valid deck has exactly 30 cards
    if len(item['cards']) != 30:
        raise DropItem("Incomplete deck")
    # counts are folded into the card list; drop the raw field
    del item['count']
    return item
def step2_get_detail_html_save_csv(url):
    """Fetch the economic-calendar detail page at *url*, extract each event
    row and append it to index_data_total.csv (writing the header line first
    if the file does not exist yet)."""
    # Fetch the detail page HTML.
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the data rows.
    rows = soup.select('tr[class^="calendar__row calendar_row calendar__row--grey"]')
    for row in rows:
        date = row.select('td[class="calendar__cell calendar__date date"]')[0].text
        time = row.select('td[class="calendar__cell calendar__time time"]')[0].text
        currency = html.replace_escape_chars(row.select('td[class="calendar__cell calendar__currency currency "]')[0].text)
        impact = row.select('td[class^="calendar__cell calendar__impact impact calendar__impact calendar__impact--"] div span')[0].get('class')[0]
        index_name = row.select('.calendar__event-title')[0].text
        actual = row.select('td[class="calendar__cell calendar__actual actual"]')[0].text
        forecast = row.select('td[class="calendar__cell calendar__forecast forecast"]')[0].text
        # .text yields only the string between the tags.
        previous = row.select('td[class="calendar__cell calendar__previous previous"]')[0].text
        data_list = [date, time, currency, impact, index_name, actual, forecast, previous]
        # Write the header once, before the first data row.
        # (`not os.path.exists(...)` instead of `... == False`.)
        if not os.path.exists('index_data_total.csv'):
            with open('index_data_total.csv', 'w', newline='') as fp:
                writer = csv.writer(fp)
                writer.writerow(['date', 'time', 'currency', 'impact', 'index_name', 'actual', 'forecast', 'previous'])
        with open('index_data_total.csv', 'a', newline='') as fp2:
            writer2 = csv.writer(fp2)
            writer2.writerow(data_list)
def parse_news(self, response):
    """Extract the data from the news page and if the page is not in cache,
    this HTML request is counted, so the ip should be updated if necessary.
    The update ip needs to stay here unless you don't want HTTPCACHE.
    """
    loader = NewsLoader(item=NewsItem(), response=response)
    # The title may live under any of three layouts; all candidates are
    # added and the loader's output processor decides which one survives.
    loader.add_xpath('title', '//span[@class="lede-text-only__highlight"]/text()')
    loader.add_xpath(
        'title', '//span[@class="lede-large-content__highlight"]/text()')
    loader.add_xpath('title', '//article//h1/text()')
    # Authors: strip whitespace/escape chars, skip empties.
    authors = response.xpath('//div[@class="author"]/text()').extract()
    for author in authors:
        author = strip_html5_whitespace(author)
        author = replace_escape_chars(author)
        if len(author) != 0:
            loader.add_value('author', author)
    # NOTE(review): extract()[0] raises IndexError when the timestamp
    # element is missing — confirm the page always carries it.
    timestamp = response.xpath(
        '//time[@class="article-timestamp"]/@datetime').extract()[0]
    timestamp = du.normalize_timestamp(timestamp, hasTimezone=True)
    # Normalized timestamp is split into separate date and time fields.
    loader.add_value('date', timestamp.split(' ')[0])
    loader.add_value('time', timestamp.split(' ')[1])
    loader.add_xpath('content', '//div[@class="body-copy fence-body"]')
    loader.add_xpath('tags', '//meta[@name="keywords"]/@content')
    return loader.load_item()
def parse_item(self, response):
    """Parse a Douban group topic page into a DoubanItem."""
    DI = DoubanItem()
    DI['scheme'] = response.url
    sel = Selector(response, type='html')
    context = sel.css(
        '#content > div > div.article > div.topic-content.clearfix > div.topic-doc'
    )
    # Poster's user name
    DI['user'] = context.css("h3 > span.from > a::text").extract()[0]
    # Post timestamp
    DI['created_at'] = context.css(
        ' h3 > span.color-green::text').extract()[0]
    # Title: two possible locations; prefer the copy inside the body table,
    # falling back to the page heading.
    title = context.css(
        'table > tbody > tr:nth-child(2) > td.tablecc').extract()
    if not title:
        title = sel.css('#content > h1').extract()[0]
    else:
        title = title[0]
    DI['title'] = remove_tags(title).strip()
    # Body text: paragraphs joined with '。', whitespace escapes removed.
    DI['text'] = replace_escape_chars('。'.join(
        sel.css('#link-report > div >p ::text').extract()),
        which_ones=('\n', '\t', '\r', ' '))
    # Like count; defaults to 0 when the element is absent.
    lc = sel.css('#sep > div.sns-bar-fav > span > a::text').extract()
    if not lc:
        lc = 0
    else:
        # Drop the trailing suffix character of the counter text.
        lc = lc[0][:-1]
    DI['like_count'] = lc
    return DI
def normalize_web_content(x, keep=('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong'),
                          token='____SECTION____'):
    """Normalize web content and yield its non-empty sections.

    Parameters
    ----------
    x : str
        Raw HTML/text to normalize.
    keep : tuple
        HTML tags to keep.
    token : str or None
        Token to use for replacing kept HTML tags. Do not replace if `None`.
    """
    try:
        x = strip_html5_whitespace(x)
        x = remove_comments(x)
        x = remove_tags(x, keep=keep)
        if token:
            x = replace_tags(x, token=token)
        x = replace_entities(x)
        x = replace_escape_chars(x)
    except (TypeError, AttributeError):
        # Best effort: non-string input falls through unchanged to the
        # section splitter below.
        pass
    for part in _rx_web_sectionize.split(x):
        if part:
            yield part
def parseFunc(self, response, N):
    """Parse a Weibo card-list JSON response, yielding a WeibozItem for
    every card-group entry that carries an 'mblog' payload.

    N is the card index to start from (earlier cards are skipped).
    """
    # json.loads() lost its `encoding` argument in Python 3.9;
    # response.text is already a decoded str, so none is needed.
    js = json.loads(response.text)
    cardN = len(js['cards'])
    for i in range(N, cardN):
        for it in js['cards'][i]['card_group']:
            if 'mblog' in it:
                mblog = it['mblog']
                keys = mblog.keys()
                item = WeibozItem()
                item['mblogid'] = mblog['mblogid']
                item['created_at'] = mblog['created_at']
                # etl() guards against missing counter fields.
                item['comments_count'] = self.etl(mblog, 'comments_count', keys)
                item['like_count'] = self.etl(mblog, 'like_count', keys)
                item['reposts_count'] = self.etl(mblog, 'reposts_count', keys)
                # Strip tags, then whitespace escape chars, from the body.
                item['text'] = replace_escape_chars(
                    remove_tags(mblog['text']),
                    which_ones=('\n', '\t', '\r', ' '))
                item['scheme'] = it['scheme']
                item['user'] = {
                    'name': mblog['user']['screen_name'],
                    'fansNum': mblog['user']['fansNum'],
                    'statuses_count': mblog['user']['statuses_count'],
                }
                yield item
def remove_garbage(val):
    """Clean *val*: drop escape characters, decode HTML entities, ensure a
    space after each period, collapse spacing noise, and trim the result."""
    cleaned = replace_entities(replace_escape_chars(val))
    substitutions = (
        (r'\.', '. '),          # guarantee a space after every period
        (r'\s+,\s{2,}', ', '),  # tidy comma surrounded by whitespace runs
        (r'\s{2,}', ' '),       # collapse remaining whitespace runs
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def process_item(self, item, spider):
    """ Process content based on its type. """
    content_type = item.get('content_type', 'UNKNOWN')
    log = structlog.get_logger().bind(
        event = 'PROCESS_ITEM',
        content_type = content_type,
        source_url = item['source_url'])
    if content_type == 'HTML':
        # Drop <script> blocks including their contents, then every
        # remaining tag, then whitespace escape characters.
        # NOTE(review): replace_by='' with ' ' in which_ones also deletes
        # ordinary spaces, gluing adjacent words together — confirm intended.
        plain_content = html.replace_escape_chars(
            html.remove_tags(
                html.remove_tags_with_content(
                    item['content'],
                    which_ones = ('script',)
                )
            ),
            which_ones = ('\n','\t','\r',' '),
            replace_by = '')
        item['content'] = plain_content
        log.info(message = 'HTML content extracted')
    # @TODO deferred extraction for binary/office formats
    elif content_type in ['PDF','MS_WORD', 'LIBREOFFICE', 'POWERPOINT', 'CSV', 'XLSX', 'XLS']:
        log.info(
            event = 'QUEUE_CONTENT',
            message = 'Pushing content for deferred processing')
    elif content_type in [None, 'UNKNOWN']:
        log.warn(error = 'UNRECOGNIZED_CONTENT_TYPE')
    return item
def parseResponse(self, response, N):
    """Parse a Weibo card-list JSON response, yielding a PostItem for every
    card-group entry that carries an 'mblog' payload.

    N is the card index to start from (earlier cards are skipped).
    """
    # json.loads() lost its `encoding` argument in Python 3.9;
    # response.text is already a decoded str, so none is needed.
    js = json.loads(response.text)
    cardNum = len(js['cards'])
    for i in range(N, cardNum):
        for it in js['cards'][i]['card_group']:
            if 'mblog' in it:
                mblog = it['mblog']
                keys = mblog.keys()
                item = PostItem()
                item['mblogid'] = mblog['id']
                item['created_at'] = mblog['created_at']
                # etl() guards against missing counter fields.
                item['comments_count'] = self.etl(mblog, 'comments_count', keys)
                item['like_count'] = self.etl(mblog, 'attitudes_count', keys)
                item['reposts_count'] = self.etl(mblog, 'reposts_count', keys)
                # Strip tags, then whitespace escape chars, from the body.
                item['text'] = replace_escape_chars(
                    remove_tags(mblog['text']),
                    which_ones=('\n', '\t', '\r', ' '))
                item['scheme'] = it['scheme']
                item['user_name'] = mblog['user']['screen_name']
                item['user_followers'] = mblog['user']['followers_count']
                item['user_statuses'] = mblog['user']['statuses_count']
                item['user_gender'] = mblog['user']['gender']
                item['airline'] = self.airline_name
                yield item
def parse(self, response):
    """Parse an article listing page, yielding one record per article and
    a follow-up request for the next page when a "next" link exists."""
    articles = response.css('article')
    for article in articles:
        title = article.css('.entry-title > a::text').extract_first()
        date_published = article.css(
            '.entry-date.published::text').extract_first()
        link = article.xpath(
            './/*[@class="more-link"]/@href').extract_first()
        # Video posts carry their summary in a <p>; other posts use the
        # full entry-content div with boilerplate links stripped.
        if 'category-video' in article.attrib['class']:
            content_dirty = article.css('.entry-content p').extract_first()
            content = remove_tags(content_dirty).replace('|', ' | ').replace(
                '\n', ' ')
        else:
            content_dirty = article.xpath(
                './/*[@class="entry-content"]').extract_first()
            content = replace_tags(replace_escape_chars(content_dirty),
                                   ' ').replace('(more…)', '').replace(
                                       'Read More', '').strip()
        yield {
            'Title': title,
            'Published On': date_published,
            'Content': content,
            'Link': link,
        }
    next_page = response.xpath(
        './/a[contains(@class, "next")]/@href').extract_first()
    # extract_first() returns None on the last page; Request(None) would
    # raise, so only follow when a next link actually exists.
    if next_page:
        yield Request(next_page)
def parse_radiozet(self, response):
    """Parse a radiozet.pl article page, yielding a dict with url, date,
    time, title, lead, body text (lead prepended, ' || '-joined paragraphs),
    source and tags."""
    url = response.url
    date = response.css('article .info-header__date--published__date::text'
                        ).extract_first()
    # Normalize the Polish "DD.MM.YYYY" date to "DD-MM-YYYY".
    date = date.replace('.', '-')
    time = response.css('article .info-header__date--published__time::text'
                        ).extract_first()
    title = response.css(
        "article header .full__title.full__article__title::text").extract()
    title = ' '.join(title)
    title = replace_escape_chars(title)
    lead = response.css(".full__article__lead ::text").extract()
    lead = ' '.join(lead)
    lead = remove_tags(lead)
    # Raw string avoids the invalid-escape DeprecationWarning of '\s+'.
    lead = re.sub(r'\s+', ' ', lead)
    lead = re.sub(' \n', '', lead)
    # Exclude ads, embeds and social-media widgets from body paragraphs.
    exclude_selectors = (
        'not(ancestor::*[contains(@class, "advert")])'
        ' and not(ancestor::*[contains(@class, "embed__article")])'
        ' and not(ancestor::*[contains(@class, "SandboxRoot")])'
        ' and not(ancestor::*[contains(@class, "twitter-tweet")])'
        ' and not(ancestor::div[contains(@class, "cnnStoryElementBox")])'
        ' and not(descendant::*[starts-with(text(), "ZOBACZ TAKŻE:")])')
    selector = '//div[contains(@class, "full__article__body")]//p[%s]' % exclude_selectors
    text = response.xpath(selector)
    text = text.extract()
    # Last paragraph is the source line; first is dropped as boilerplate.
    # TODO: also drop image captions and Twitter embeds
    # (https://t.co/ or pic.twitter.com/).
    source = text[-1]
    text.pop(-1)
    text.pop(0)
    text = ' || '.join(text)
    text = remove_tags(text)
    source = remove_tags(source)
    # Prepend the lead to the body text.
    text = ' || '.join([lead, text])
    tags = response.css(
        'div.full__article__tags__list a::attr("title")').extract()
    yield {
        'url': url,
        'date': date,
        'time': time,
        'title': ''.join(title),
        'lead': lead,
        'text': text,
        'source': source,
        'tags': ', '.join(tags)
    }
def safe_html(response):
    """Extract the amenities panel from *response* and return its sanitized
    HTML with escape characters and empty paragraphs removed.

    NOTE(review): sanitization semantics live in the project helpers
    `safehtml` and `htmlregion` — not visible here; confirm their contract.
    """
    html_raw = response.xpath(
        '//div[contains(@class, "panel panel-default anenities")]'
    ).extract_first('').strip()
    # Drop escape chars first, then remove paragraphs emptied by cleaning.
    html = replace_escape_chars(safehtml(htmlregion(html_raw))).replace(
        '<p></p>', '').strip()
    return html
def parse_tvn24bis(self, response):
    """Parse a tvn24bis.pl article page, yielding a dict with id, url,
    date, time, title, lead, body text, author and source."""
    url = response.url
    # Article id is the last comma-separated token before the extension.
    art_id = url.split(',')[-1].split('.')[0]
    date = response.css(
        'article.detail header time::attr("datetime")').extract_first()
    date = date.split(' ')
    # NOTE(review): [0:4] keeps only "HH:M" of an "HH:MM" time — looks like
    # an off-by-one; confirm the raw datetime format before changing.
    time = date[1][0:4]
    date = date[0]
    title = response.css("article.detail header h1 ::text").extract_first()
    title = replace_escape_chars(title).strip()
    lead = response.css("div.content p.lead ::text").extract_first()
    lead = replace_escape_chars(lead).strip()
    # NOTE(review): the predicate uses @clas (sic) twice — unless the site
    # really emits a "clas" attribute, this filter never matches and rules/
    # footer paragraphs are NOT excluded; verify against live markup.
    text = response.xpath(
        '//div[@class="content"]/p[not(contains(@clas, "rules") or contains(@clas, "footer"))]/text()'
    ).extract()
    text = ' || '.join(text)
    text = remove_tags(text)
    text = replace_escape_chars(text)
    text = clear_text(text)
    # Footer holds "Autor: X/Źródło: Y"; split into author and source.
    autor = response.css(
        "div.content div.footer ::text").extract()[1].split('/')
    if len(autor) > 1:
        source = autor[1]
        source = source.strip().replace('Źródło: ', '')
        autor = autor[0].strip().replace('Autor: ', '')
    else:
        source = ''
        autor = autor[0].strip().replace('Autor: ', '')
    yield {
        'id': art_id,
        'url': url,
        'date': date,
        'time': time,
        'title': ''.join(title),
        'lead': lead,
        'text': text,
        'autor': autor,
        'source': source
    }
def strip_all(self, value):
    """Strip HTML tags, escape characters and surrounding whitespace.

    NOTE(review): the try/finally swallows nothing — if remove_tags() or
    replace_escape_chars() raises, the finally clause still runs but the
    exception then propagates and the return is never reached.  If the
    intent was best-effort cleaning, this should catch the exception;
    confirm with the author.
    """
    try:
        value = remove_tags(value)
        value = replace_escape_chars(value)
    finally:
        value = value.strip()
    return value
class ProductLoader(scrapy.loader.ItemLoader):
    """Item loader for product records: strips tags and whitespace noise
    from the product name and price, and parses the numeric price value."""
    default_output_processor = MapCompose(remove_tags)
    # '\xa0' is a non-breaking space, common in scraped price/name markup.
    product_in = MapCompose(remove_tags, lambda s: replace_escape_chars(s, which_ones=('\n', '\t', '\r', '\xa0', ' ')))
    product_out = TakeFirst()
    # Same cleaning as product, then extract_number pulls the numeric value.
    price_in = MapCompose(remove_tags, lambda s: replace_escape_chars(s, which_ones=('\n', '\t', '\r', '\xa0', ' ')), extract_number)
    price_out = TakeFirst()
    # retailer and date pass through unchanged; first value wins.
    retailer_in = Compose()
    retailer_out = TakeFirst()
    date_in = Compose()
    date_out = TakeFirst()
def test_returns_unicode(self):
    # replace_escape_chars must always return str regardless of whether the
    # input (and replace_by) are bytes or str.
    assert isinstance(replace_escape_chars(b"no ec"), str)
    assert isinstance(replace_escape_chars(b"no ec", replace_by="str"), str)
    # NOTE(review): this assertion is an exact duplicate of the previous
    # one — the six-based original distinguished replace_by='str' from
    # replace_by=u'str', which is meaningless on Python 3.
    assert isinstance(replace_escape_chars(b"no ec", replace_by="str"), str)
    assert isinstance(
        replace_escape_chars(
            b"no ec",
            which_ones=(
                "\n",
                "\t",
            ),
        ),
        str,
    )
    assert isinstance(replace_escape_chars("no ec"), str)
    assert isinstance(replace_escape_chars("no ec", replace_by="str"), str)
    assert isinstance(
        replace_escape_chars(
            "no ec",
            which_ones=(
                "\n",
                "\t",
            ),
        ),
        str,
    )
def parse_interia(self, response):
    '''Parser for Interia: yields a dict with id, url, date, time, title,
    lead, body text (' || '-joined paragraphs) and source.'''
    url = response.url
    # Article id follows the "nId," marker in the URL.
    art_id = url.split('nId,')[1]
    # ISO timestamp "YYYY-MM-DDTHH:MM" split into date and time parts.
    date = response.css('.article-date ::attr("content")').extract_first()
    date = date.split('T')
    time = date[1]
    date = date[0]
    title = response.css("h1.article-title::text").extract()
    title = ' '.join(title)
    title = replace_escape_chars(title)
    lead = response.css(".article-body .article-lead::text").extract()
    lead = ' '.join(lead)
    lead = remove_tags(lead)
    # Earlier, simpler xpath kept for reference:
    # art_path = '//div[@class = "article-container"]/' \
    #            'div[not(*/@class = "embed")]/' \
    #            'p[not(/aside[@class = "embed embed-photo embed-center"])]'
    # text = response.xpath(art_path)
    # Exclude ads, embeds, asides and cross-link teasers from the body.
    exclude_selectors = (
        'not(self::*[contains(@class, "advert")])'
        ' and not(self::*[starts-with(text(), "ZOBACZ RÓWNIEŻ:")])'
        ' and not(self::*[starts-with(text(), "SPRAWDŹ:")])'
        ' and not(descendant-or-self::*[contains(@class, "sub")])'
        ' and not(descendant-or-self::*[contains(@class, "embed")])'
        ' and not(ancestor-or-self::*[contains(@class, "embed")])'
        ' and not(descendant-or-self::*[contains(@class, "aside")])'
        ' and not(ancestor-or-self::*[contains(@class, "aside")])'
        ' and not(descendant-or-self::*[contains(@class, "aside")])'
        ' and not(descendant-or-self::u)'
        ' and (self::p[not(contains(@dir, "ltr"))])')
    selector = '//div[@class = "article-container"]/' \
               'div[not(*/@class = "embed")]/' \
               '*[%s]' % exclude_selectors
    text = response.xpath(selector)
    text = text.extract()
    text = ' || '.join(text)
    text = remove_tags(text)
    text = clear_text(text)
    source = response.css(
        ".article-footer .article-source ::attr('content')").extract()
    yield {
        'id': art_id,
        'url': url,
        'date': date,
        'time': time,
        'title': ''.join(title),
        'lead': lead,
        'text': text,
        'source': ', '.join(source)
    }
def _extract_links(self, response_text, response_url, response_encoding):
    # Extract de-duplicated (url, text) links from response_text using the
    # module-level `linkre` regex (expected to capture url, _, text).
    # NOTE(review): Python 2-era code — `t.decode(...)`, `urljoin_rfc` and
    # `remove_entities` no longer exist in modern Scrapy/w3lib.
    base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
    # Resolve the href against base_url after entity-decoding and cleaning.
    clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(u.decode(response_encoding))))
    # Anchor text: strip tags and escape chars, trim whitespace.
    clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()
    links_text = linkre.findall(response_text)
    urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])
    return [Link(url, text) for url, text in urlstext]
def test_returns_unicode(self):
    # replace_escape_chars must always return unicode (six.text_type covers
    # both Python 2 `unicode` and Python 3 `str`) regardless of whether the
    # input and replace_by are bytes or text.
    assert isinstance(replace_escape_chars(b'no ec'), six.text_type)
    assert isinstance(replace_escape_chars(b'no ec', replace_by='str'), six.text_type)
    assert isinstance(replace_escape_chars(b'no ec', replace_by=u'str'), six.text_type)
    assert isinstance(replace_escape_chars(b'no ec', which_ones=('\n', '\t',)), six.text_type)
    assert isinstance(replace_escape_chars(u'no ec'), six.text_type)
    assert isinstance(replace_escape_chars(u'no ec', replace_by=u'str'), six.text_type)
    assert isinstance(replace_escape_chars(u'no ec', which_ones=('\n', '\t',)), six.text_type)
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
    # Extract Link objects from response_text via the module-level `linkre`
    # regex (expected to capture url, _, text); URLs are re-encoded to the
    # response encoding.  NOTE(review): `t.decode(...)` implies bytes input —
    # Python 2-era code.
    if base_url is None:
        base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
    # Resolve the href against base_url after entity replacement and cleaning.
    clean_url = lambda u: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))
    # Anchor text: strip tags and escape chars, trim whitespace.
    clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()
    links_text = linkre.findall(response_text)
    return [Link(clean_url(url).encode(response_encoding), clean_text(text)) for url, _, text in links_text]
def normalizeTool(cls, html):
    '''Strip HTML markup from a string.

    :param html: str that may contain HTML markup to remove
    :return: plain-text string with tags, entities and escape characters
             removed and whitespace runs collapsed to single spaces
    '''
    # Remove tags, then decode entities, then replace escape chars by a
    # space so that adjacent words stay separated.
    removeHtml = w3.replace_escape_chars(w3.replace_entities(
        w3.remove_tags(html)), replace_by=" ")
    # Earlier variant without replace_by kept for reference:
    # removeHtml = w3.replace_escape_chars(w3.replace_entities(w3.remove_tags(html)))
    removeEscapeChars = " ".join(removeHtml.split())
    return removeEscapeChars
def parse_tvn24(self, response):
    """Parse a tvn24.pl article page, yielding a dict with id, url, date,
    time, title, lead, body text (lead prepended), author and source."""
    url = response.url
    # Article id is the last comma-separated token before the extension.
    art_id = url.split(',')[-1].split('.')[0]
    date = response.css(
        'div.articleDateContainer time::attr("datetime")').extract_first()
    date = date.split(' ')
    time = date[1]
    date = date[0]
    title = response.css("div.mainContainer h1 ::text").extract_first()
    title = replace_escape_chars(title).strip()
    lead = response.css("article h2 span.black ::text").extract_first()
    lead = replace_escape_chars(lead).strip()
    # NOTE(review): `contains(self, "em")` compares against the string value
    # of the context node, not its tag name — these predicates almost
    # certainly do not filter what was intended; verify against live markup.
    text = response.xpath(
        '//*[not(contains(self, "em") or contains(self, "figure") or contains(self, "aside") or contains(@class, "innerArticleModule.onRight.cols.externalContent.innerText") or contains(@class, "lead"))]/div[@class="textArticleDefault"]//article/p[not(contains(self, "em") or contains(self, "figure") or contains(self, "aside") or contains(@class, "innerArticleModule.onRight.cols.externalContent.innerText") or contains(@class, "innerText") or contains(@class, "lead") or contains(@class, "textHolder") or contains(self, "div") or contains(text(), "czytaj"))]/text()'
    ).extract()
    text = ' || '.join(text)
    text = remove_tags(text)
    text = replace_escape_chars(text)
    # Prepend the lead to the body text.
    text = ' || '.join([lead, text])
    # Author block: "Autor: X" then "Źródło: Y" at fixed positions.
    autor = response.css("div.articleAuthors ::text").extract()
    source = autor[2].strip().replace('Źródło: ', '')
    autor = autor[0].strip().replace('Autor: ', '')
    yield {
        'id': art_id,
        'url': url,
        'date': date,
        'time': time,
        'title': ''.join(title),
        'lead': lead,
        'text': text,
        'autor': autor,
        'source': source
    }
def parse_rmf(self, response):
    """Parse an rmf24.pl article page, yielding a dict with id, url, date,
    time, title, lead, body text (lead prepended, ' || '-joined paragraphs),
    author, source and tags."""
    url = response.url
    # Article id follows the "nId," marker in the URL.
    art_id = url.split('nId,')[1]
    # ISO timestamp "YYYY-MM-DDTHH:MM" split into date and time parts.
    date = response.css('.article-date ::attr("content")').extract_first()
    date = date.split('T')
    time = date[1]
    date = date[0]
    title = response.css(".article-header .article-title::text").extract()
    title = ' '.join(title)
    title = replace_escape_chars(title)
    lead = response.css(".article-body .article-lead::text").extract()
    art_path = '//div[@class = "article-container"]/div[@class = "article-body"]/div[@class = "articleContent"][not(*/@class = "embed")]/p[not(contains(descendant-or-self, "u") or contains(descendant-or-self, "sub") or contains(descendant-or-self, "b") or contains(ancestor-or-self, "aside") or contains(descendant-or-self, "aside") or contains(ancestor-or-self, "twitter-widget") or contains(@class, "Tweet-text"))]'
    text = response.xpath(art_path)
    text = text.extract()
    twitter = response.css(
        ".article-container .article-body .articleContent .embed-blockquote"
    ).extract()
    # Blank out paragraphs that are embedded tweets.
    # TODO: remove the tweets more robustly.
    if len(twitter) > 0:
        for a in range(0, len(twitter)):
            for t in range(0, len(text)):
                if text[t] == twitter[a]:
                    text[t] = ""
    text = ' || '.join(text)
    text = remove_tags(text)
    # Prepend the lead to the body text.
    lead = ' '.join(lead)
    text = ' || '.join([lead, text])
    # Raw string avoids the invalid-escape DeprecationWarning of '\s+'.
    text = re.sub(r'\s+', ' ', text)
    autor = response.css(".article-author-name::text").extract()
    source = response.css(
        ".article-footer .article-source ::attr('content')").extract()
    tags = response.css(".elementTagsList a::text").extract()
    yield {
        'id': art_id,
        'url': url,
        'date': date,
        'time': time,
        'title': ''.join(title),
        'lead': lead,
        'text': text,
        'autor': ', '.join(autor),
        'source': ', '.join(source),
        'tags': ', '.join(tags)
    }
def parse_car_rating(self, response):
    """Scrape per-category car ratings into a ZigwheelsItem.

    Builds a `rating` dict keyed by the first word of each category label
    (lower-cased), from two rating lists on the page.
    """
    loader = ItemLoader(item = ZigwheelsItem())
    rating = {}
    # First list: only the first three entries are rating rows.
    for i, li in enumerate(response.css('ul.p-list li')):
        if i > 2:
            break
        rating[replace_escape_chars(li.css('div.m-wl::text').get().lower().split(' ')[0])] = replace_escape_chars(li.css('div.m-wr::text').get())
    # Second list: all entries are rating rows.
    for li in response.css('div.rv-ls ul li'):
        rating[replace_escape_chars(li.css('div.m-wl::text').get().lower().split(' ')[0])] = replace_escape_chars(li.css('div.m-wr::text').get())
    # Breadcrumb entries 3 and 4 hold brand and model name.
    loader.add_value('brand', response.xpath('//div[@class="zw-cmn-containerColor"]/div/ol/li[3]/a/span/text()').get())
    loader.add_value('name', response.xpath('//div[@class="zw-cmn-containerColor"]/div/ol/li[4]/a/span/text()').get())
    # NOTE(review): raises KeyError if any expected label is missing from
    # the page — confirm the page always carries all six categories.
    loader.add_value('mileage', rating['mileage'])
    loader.add_value('performance', rating['performance'])
    loader.add_value('maintenance', rating['maintenance'])
    loader.add_value('comfort', rating['comfort'])
    loader.add_value('safety', rating['safety'])
    loader.add_value('features', rating['features'])
    yield loader.load_item()
def test_with_escape_chars(self):
    # Text containing escape chars: default behavior removes them.
    self.assertEqual(replace_escape_chars(u'escape\n\n'), u'escape')
    # Only the chars listed in which_ones are touched.
    self.assertEqual(replace_escape_chars(u'escape\n', which_ones=('\t',)), u'escape\n')
    # NOTE(review): ('\t') is a plain string, not a one-tuple — it only
    # works because replace_escape_chars iterates which_ones per character.
    self.assertEqual(replace_escape_chars(u'escape\tchars\n', which_ones=('\t')), 'escapechars\n')
    # replace_by substitutes instead of removing; bytes replace_by is decoded.
    self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=' '), 'escape chars ')
    self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=u'\xa3'), u'escape\xa3chars\xa3')
    self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=b'\xc2\xa3'), u'escape\xa3chars\xa3')
def _extract_links(self, response_text, response_url, response_encoding):
    # Extract de-duplicated (url, text) links from response_text using the
    # module-level `linkre` regex (expected to capture url, _, text).
    # NOTE(review): Python 2-era code — `t.decode(...)`, `urljoin_rfc` and
    # `remove_entities` no longer exist in modern Scrapy/w3lib.
    base_url = urljoin_rfc(
        response_url, self.base_url) if self.base_url else response_url
    # Resolve the href against base_url after entity-decoding and cleaning.
    clean_url = lambda u: urljoin_rfc(
        base_url, remove_entities(clean_link(u.decode(response_encoding))))
    # Anchor text: strip tags and escape chars, trim whitespace.
    clean_text = lambda t: replace_escape_chars(
        remove_tags(t.decode(response_encoding))).strip()
    links_text = linkre.findall(response_text)
    urlstext = set([(clean_url(url), clean_text(text))
                    for url, _, text in links_text])
    return [Link(url, text) for url, text in urlstext]
def clean_content(self, text):
    """
    Return a string of text cleaned up by tags, entities,
    escape chars, quotes and spaces
    """
    # Drop style/script/figcaption blocks together with their contents,
    # then run the remaining cleaners in sequence.
    cleaned = remove_tags_with_content(
        text, which_ones=('style', 'script', 'figcaption'))
    for cleaner in (remove_tags, remove_entities,
                    replace_escape_chars, unquote_markup):
        cleaned = cleaner(cleaned)
    # Collapse every whitespace run to a single space.
    return " ".join(cleaned.split())
def processText(value):
    """Strip escape characters and HTML tags from *value*.

    Arguments:
        value {string} -- input value

    Returns:
        string -- cleaned text, or '' for falsy input
    """
    if not value:
        return ''
    # Remove escape characters first, then any remaining markup tags.
    return remove_tags(replace_escape_chars(value))
def processHtml(value):
    """Strip escape characters from *value* and restore '/' characters that
    were encoded as the '#ft5_slash#' placeholder.

    Arguments:
        value {string} -- input value

    Returns:
        string -- cleaned text, or '' for falsy input
    """
    if not value:
        return ''
    return replace_escape_chars(value).replace('#ft5_slash#', '/')
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
    # Extract de-duplicated Link objects (with empty anchor text) from
    # response_text via the module-level `linkre` regex.
    # NOTE(review): unlike the (url, _, text) variants of this method,
    # findall results here are iterated as plain urls and `clean_text` is
    # defined but never used — confirm `linkre` captures a single group in
    # this module, otherwise this silently produces wrong urls.
    if base_url is None:
        base_url = urljoin(
            response_url, self.base_url) if self.base_url else response_url
    # Resolve the href against base_url after entity-decoding and cleaning.
    clean_url = lambda u: urljoin(
        base_url, remove_entities(clean_link(u.decode(response_encoding))))
    clean_text = lambda t: replace_escape_chars(
        remove_tags(t.decode(response_encoding))).strip()
    links_text = linkre.findall(response_text)
    urlstext = set([clean_url(url) for url in links_text])
    return [Link(url, "") for url in urlstext]
def parse_dziennik(self, response):
    '''Parser for dziennik.pl: yields a dict with id, url, date, time,
    title, lead, body text (lead prepended), source and tags.'''
    url = response.url
    # Article id sits between "artykuly/" and the first comma in the URL.
    art_id = url.split('artykuly/')[1]
    art_id = art_id.split(',')[0]
    # Raw date "DD.MM.YYYY, HH:MM" split and normalized to "DD-MM-YYYY".
    date = response.css('span.ap-date time::text').extract_first()
    date = date.split(', ')
    time = date[1]
    date = date[0]
    date = date.replace('.', '-')
    title = response.css(
        ".articlepage .single-article-title::text").extract()
    title = ' '.join(title)
    title = replace_escape_chars(title)
    lead = response.css("article h2::text").extract()
    lead = ' '.join(lead)
    lead = remove_tags(lead)
    text = response.css('div#dziennik_intext.articleBody p').extract()
    # TODO: drop image-caption paragraphs and Twitter embeds
    # (https://t.co/ or pic.twitter.com/).
    text = ' || '.join(text)
    text = remove_tags(text)
    # Prepend the lead to the body text.
    text = ' || '.join([lead, text])
    source = response.css("div.ps-line strong::text").extract()
    tags = response.css("div.ps-line.tags a::text").extract()
    yield {
        'id': art_id,
        'url': url,
        'date': date,
        'time': time,
        'title': ''.join(title),
        'lead': lead,
        'text': text,
        'source': ', '.join(source),
        'tags': ', '.join(tags)
    }
def clean_text(text):
    # Decode *text* (bytes), strip tags and escape chars, trim whitespace.
    # NOTE(review): `response_encoding` is a free variable, not a parameter —
    # this raises NameError unless it is defined at module scope.  The
    # function looks like it was lifted out of an _extract_links closure;
    # confirm where the encoding is supposed to come from.
    return replace_escape_chars(remove_tags(text.decode(response_encoding))).strip()
def test_replace_escape_chars(self):
    # NOTE(review): Python 2-only test — `unicode` does not exist on
    # Python 3.  It must always return unicode.
    assert isinstance(replace_escape_chars('no ec'), unicode)
    assert isinstance(replace_escape_chars('no ec', replace_by='str'), unicode)
    assert isinstance(replace_escape_chars('no ec', which_ones=('\n', '\t',)), unicode)
    # Text without escape chars is returned unchanged.
    self.assertEqual(replace_escape_chars(u'no ec'), u'no ec')
    self.assertEqual(replace_escape_chars(u'no ec', which_ones=('\n',)), u'no ec')
    # Text with escape chars: default removes; which_ones limits the set;
    # replace_by substitutes (bytes replace_by is decoded as UTF-8).
    self.assertEqual(replace_escape_chars(u'escape\n\n'), u'escape')
    self.assertEqual(replace_escape_chars(u'escape\n', which_ones=('\t',)), u'escape\n')
    self.assertEqual(replace_escape_chars(u'escape\tchars\n', which_ones=('\t')), 'escapechars\n')
    self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=' '), 'escape chars ')
    self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=u'\xa3'), u'escape\xa3chars\xa3')
    self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by='\xc2\xa3'), u'escape\xa3chars\xa3')
def test_replace_escape_chars(self):
    # NOTE(review): Python 2-only test — `unicode` does not exist on
    # Python 3.  It must always return unicode.
    assert isinstance(replace_escape_chars("no ec"), unicode)
    assert isinstance(replace_escape_chars("no ec", replace_by="str"), unicode)
    assert isinstance(replace_escape_chars("no ec", which_ones=("\n", "\t")), unicode)
    # Text without escape chars is returned unchanged.
    self.assertEqual(replace_escape_chars(u"no ec"), u"no ec")
    self.assertEqual(replace_escape_chars(u"no ec", which_ones=("\n",)), u"no ec")
    # Text with escape chars: default removes; which_ones limits the set;
    # replace_by substitutes (bytes replace_by is decoded as UTF-8).
    self.assertEqual(replace_escape_chars(u"escape\n\n"), u"escape")
    self.assertEqual(replace_escape_chars(u"escape\n", which_ones=("\t",)), u"escape\n")
    self.assertEqual(replace_escape_chars(u"escape\tchars\n", which_ones=("\t")), "escapechars\n")
    self.assertEqual(replace_escape_chars(u"escape\tchars\n", replace_by=" "), "escape chars ")
    self.assertEqual(replace_escape_chars(u"escape\tchars\n", replace_by=u"\xa3"), u"escape\xa3chars\xa3")
    self.assertEqual(replace_escape_chars(u"escape\tchars\n", replace_by="\xc2\xa3"), u"escape\xa3chars\xa3")
def test_without_escape_chars(self):
    # Text without escape chars is returned unchanged, with or without an
    # explicit which_ones restriction.
    self.assertEqual(replace_escape_chars(u'no ec'), u'no ec')
    self.assertEqual(replace_escape_chars(u'no ec', which_ones=('\n',)), u'no ec')