Example #1
def processDataPrice(values):
    # Normalise a price string such as "1.234,56 EUR" to "1234.56".
    result = replace_escape_chars(values, which_ones=('€',), replace_by=u'.')
    result = replace_escape_chars(result, which_ones=('EUR',), replace_by=u'')
    result = replace_escape_chars(result, which_ones=(' ',), replace_by=u'')
    result = replace_escape_chars(result, which_ones=('.',), replace_by=u'')
    result = replace_escape_chars(result, which_ones=(',',), replace_by=u'.')
    return result
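For reference, replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by='', encoding=None) simply runs one str.replace per entry of which_ones, decoding bytes input to unicode first. A minimal standalone sketch of that behaviour (not taken from any of the projects listed here):

from w3lib.html import replace_escape_chars

# Default which_ones strips newlines, tabs and carriage returns.
print(replace_escape_chars('escape\tchars\n'))  # -> 'escapechars'

# Each entry of which_ones is replaced by replace_by, one str.replace at a time,
# so multi-character entries act as plain substring replacements.
print(replace_escape_chars('1 234,56 EUR', which_ones=('EUR', ' '), replace_by=''))  # -> '1234,56'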
Example #2
    def process_item(self, item, spider):

        # clean date
        raw_date = item['date'].split()
        item['date'] = dateparser.parse(raw_date[0])

        # user
        item['user'] = replace_escape_chars(item['user'])

        # set
        item['deck_set'] = re.sub('[()]', '', ' '.join(raw_date[1:]))

        # deck rating
        item['rating'] = int(re.findall(r'\d+', item['rating'])[0])

        # deck count
        item['count'] = list(map(int, item['count']))

        # replace card names by unique ids
        for i, card in enumerate(item['cards']):
            card_name_raw = replace_escape_chars(card)
            card_name_clean = ' '.join(card_name_raw.split())
            item['cards'][i] = self.get_card_id(card_name_clean)

        # multiply ids by card counts
        id_list = []
        for i, j in zip(item['cards'], item['count']):
            id_list.extend([i] * j)

        # replace by list of ids
        item['cards'] = id_list

        # deck id
        item['deck_id'] = int(item['deck_id'])

        # craft cost
        item['craft_cost'] = int(item['craft_cost'])

        # deck format
        if item['deck_format']:
            item['deck_format'] = 'W'  # wild
        else:
            item['deck_format'] = 'S'  # standard

        # check if 30 cards in deck
        if len(item['cards']) != 30:
            raise DropItem("Incomplete deck")

        # remove count
        del item['count']

        return item
Example #3
def step2_get_detail_html_save_csv(url):
    # Fetch the HTML of the detail page.
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # print(soup)
    # Extract the data.
    rows = soup.select('tr[class^="calendar__row calendar_row calendar__row--grey"]')
    for row in rows:

        date = row.select('td[class="calendar__cell calendar__date date"]')[0].text
        time = row.select('td[class="calendar__cell calendar__time time"]')[0].text
        currency = html.replace_escape_chars(row.select('td[class="calendar__cell calendar__currency currency "]')[0].text)
        impact = row.select('td[class^="calendar__cell calendar__impact impact calendar__impact calendar__impact--"] div span')[0].get('class')[0]
        index_name = row.select('.calendar__event-title')[0].text
        actual = row.select('td[class="calendar__cell calendar__actual actual"]')[0].text
        forecast = row.select('td[class="calendar__cell calendar__forecast forecast"]')[0].text
        previous = row.select('td[class="calendar__cell calendar__previous previous"]')[0].text

        # .text returns only the text between the tags.

        data_list = [date, time, currency, impact, index_name, actual, forecast, previous]

        if not os.path.exists('index_data_total.csv'):
            # Write the header row first.
            with open('index_data_total.csv', 'w', newline='') as fp:
                writer = csv.writer(fp)
                writer.writerow(['date', 'time', 'currency', 'impact', 'index_name', 'actual', 'forecast', 'previous'])
        with open('index_data_total.csv', 'a', newline='') as fp2:
            writer2 = csv.writer(fp2)
            writer2.writerow(data_list)
Example #4
    def parse_news(self, response):
        """
        Extract the data from the news page and if the page is not in cache,
        this HTML request is counted, so the ip should be updated if necessary.
        The update ip needs to stay here unless you don't want HTTPCACHE
        """

        loader = NewsLoader(item=NewsItem(), response=response)
        loader.add_xpath('title',
                         '//span[@class="lede-text-only__highlight"]/text()')
        loader.add_xpath(
            'title', '//span[@class="lede-large-content__highlight"]/text()')
        loader.add_xpath('title', '//article//h1/text()')
        authors = response.xpath('//div[@class="author"]/text()').extract()
        for author in authors:
            author = strip_html5_whitespace(author)
            author = replace_escape_chars(author)
            if len(author) != 0:
                loader.add_value('author', author)
        timestamp = response.xpath(
            '//time[@class="article-timestamp"]/@datetime').extract()[0]
        timestamp = du.normalize_timestamp(timestamp, hasTimezone=True)
        loader.add_value('date', timestamp.split(' ')[0])
        loader.add_value('time', timestamp.split(' ')[1])
        loader.add_xpath('content', '//div[@class="body-copy fence-body"]')
        loader.add_xpath('tags', '//meta[@name="keywords"]/@content')
        return loader.load_item()
Example #5
    def parse_item(self, response):
        DI = DoubanItem()
        DI['scheme'] = response.url
        sel = Selector(response, type='html')
        context = sel.css(
            '#content > div > div.article > div.topic-content.clearfix > div.topic-doc'
        )
        # Poster
        DI['user'] = context.css("h3 > span.from > a::text").extract()[0]
        # Created-at time
        DI['created_at'] = context.css(
            ' h3 > span.color-green::text').extract()[0]
        # Title: two possible matches; prefer the one in the post body
        title = context.css(
            'table > tbody > tr:nth-child(2) > td.tablecc').extract()
        if not title:
            title = sel.css('#content > h1').extract()[0]
        else:
            title = title[0]
        DI['title'] = remove_tags(title).strip()
        # Body text
        DI['text'] = replace_escape_chars('。'.join(
            sel.css('#link-report > div >p ::text').extract()),
                                          which_ones=('\n', '\t', '\r', ' '))

        # Like count; 0 if not present
        lc = sel.css('#sep > div.sns-bar-fav > span > a::text').extract()
        if not lc:
            lc = 0
        else:
            lc = lc[0][:-1]
        DI['like_count'] = lc
        return DI
Example #6
File: utils.py Project: sztal/taukit
def normalize_web_content(x, keep=('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong'),
                          token='____SECTION____'):
    """Normalize web content.

    Parameters
    ----------
    x : str
        HTML content to normalize.
    keep : tuple
        HTML tags to keep.
    token : str or None
        Token to use for replacing kept HTML tags.
        Do not replace if `None`.
    """
    try:
        x = strip_html5_whitespace(x)
        x = remove_comments(x)
        x = remove_tags(x, keep=keep)
        if token:
            x = replace_tags(x, token=token)
        x = replace_entities(x)
        x = replace_escape_chars(x)
    except (TypeError, AttributeError):
        pass
    for part in _rx_web_sectionize.split(x):
        if part:
            yield part
Example #7
 def parseFunc(self, response, N):
     js = json.loads(response.text, encoding='utf-8')
     cardN = len(js['cards'])
     for i in range(N, cardN):
         cgN = len(js['cards'][i]['card_group'])
         for j in range(cgN):
             it = js['cards'][i]['card_group'][j]
             #logging.warning('i:%d,j:%d' % (i, j))
             if 'mblog' in it.keys():
                 mblog = it['mblog']
                 keys = mblog.keys()
                 item = WeibozItem()
                 item['mblogid'] = mblog['mblogid']
                 item['created_at'] = mblog['created_at']
                 item['comments_count'] = self.etl(mblog, 'comments_count',
                                                   keys)
                 item['like_count'] = self.etl(mblog, 'like_count', keys)
                 item['reposts_count'] = self.etl(mblog, 'reposts_count',
                                                  keys)
                 item['text'] = replace_escape_chars(
                     remove_tags(mblog['text']),
                     which_ones=('\n', '\t', '\r', ' '))
                 item['scheme'] = it['scheme']
                 item['user'] = {
                     'name': mblog['user']['screen_name'],
                     'fansNum': mblog['user']['fansNum'],
                     'statuses_count': mblog['user']['statuses_count']
                 }
                 yield item
Example #8
def remove_garbage(val):
    val = replace_escape_chars(val)
    val = replace_entities(val)
    val = re.sub(r'\.', '. ', val)
    val = re.sub(r'\s+,\s{2,}', ', ', val)
    val = re.sub(r'\s{2,}', ' ', val)
    return val.strip()
Example #9
    def process_item(self, item, spider):
        """
        Process content based on its type.
        """
        content_type = item.get('content_type', 'UNKNOWN')
        log = structlog.get_logger().bind(
            event = 'PROCESS_ITEM',
            content_type = content_type,
            source_url = item['source_url'])

        if content_type == 'HTML':
            plain_content = html.replace_escape_chars(
                html.remove_tags(
                    html.remove_tags_with_content(
                        item['content'],
                        which_ones = ('script',)
                    )
                ),
                which_ones = ('\n','\t','\r','   '),
                replace_by = '')
            item['content'] = plain_content
            log.info(message = 'HTML content extracted')
        # @TODO
        elif content_type in ['PDF','MS_WORD', 'LIBREOFFICE', 'POWERPOINT', 'CSV', 'XLSX', 'XLS']:
            log.info(
                event = 'QUEUE_CONTENT',
                message = 'Pushing content for deferred processing')
        elif content_type in [None, 'UNKNOWN']:
            log.warn(error = 'UNRECOGNIZED_CONTENT_TYPE')

        return item
Example #10
 def parseResponse(self, response, N):
     js = json.loads(response.text, encoding='utf-8')
     cardNum = len(js['cards'])
     for i in range(N, cardNum):
         cardGroupNum = len(js['cards'][i]['card_group'])
         for j in range(cardGroupNum):
             it = js['cards'][i]['card_group'][j]
             if 'mblog' in it.keys():
                 mblog = it['mblog']
                 keys = mblog.keys()
                 item = PostItem()
                 item['mblogid'] = mblog['id']
                 item['created_at'] = mblog['created_at']
                 item['comments_count'] = self.etl(mblog, 'comments_count',
                                                   keys)
                 item['like_count'] = self.etl(mblog, 'attitudes_count',
                                               keys)
                 item['reposts_count'] = self.etl(mblog, 'reposts_count',
                                                  keys)
                 item['text'] = replace_escape_chars(
                     remove_tags(mblog['text']),
                     which_ones=('\n', '\t', '\r', ' '))
                 item['scheme'] = it['scheme']
                 item['user_name'] = mblog['user']['screen_name']
                 item['user_followers'] = mblog['user']['followers_count']
                 item['user_statuses'] = mblog['user']['statuses_count']
                 item['user_gender'] = mblog['user']['gender']
                 item['airline'] = self.airline_name
                 yield item
Example #11
    def parse(self, response):
        articles = response.css('article')

        for article in articles:
            title = article.css('.entry-title > a::text').extract_first()
            date_published = article.css(
                '.entry-date.published::text').extract_first()
            link = article.xpath(
                './/*[@class="more-link"]/@href').extract_first()

            if 'category-video' in article.attrib['class']:
                content_dirty = article.css('.entry-content p').extract_first()
                content = remove_tags(content_dirty).replace('|',
                                                             ' | ').replace(
                                                                 '\n', ' ')
            else:
                content_dirty = article.xpath(
                    './/*[@class="entry-content"]').extract_first()
                content = replace_tags(replace_escape_chars(content_dirty),
                                       '  ').replace('(more…)', '').replace(
                                           'Read More', '').strip()

            yield {
                'Title': title,
                'Published On': date_published,
                'Content': content,
                'Link': link,
            }

        next_page = response.xpath(
            './/a[contains(@class, "next")]/@href').extract_first()
        yield Request(next_page)
Example #12
    def parse_radiozet(self, response):
        url = response.url

        date = response.css('article .info-header__date--published__date::text'
                            ).extract_first()
        #date = date.split(' ')
        #date = date[0]
        date = date.replace('.', '-')
        time = response.css('article .info-header__date--published__time::text'
                            ).extract_first()

        title = response.css(
            "article header .full__title.full__article__title::text").extract(
            )
        title = ' '.join(title)
        title = replace_escape_chars(title)

        lead = response.css(".full__article__lead ::text").extract()
        lead = ' '.join(lead)
        lead = remove_tags(lead)
        lead = re.sub(r'\s+', ' ', lead)
        lead = re.sub(' \n', '', lead)

        exclude_selectors = (
            'not(ancestor::*[contains(@class, "advert")])'
            ' and not(ancestor::*[contains(@class, "embed__article")])'
            ' and not(ancestor::*[contains(@class, "SandboxRoot")])'
            ' and not(ancestor::*[contains(@class, "twitter-tweet")])'
            ' and not(ancestor::div[contains(@class, "cnnStoryElementBox")])'
            ' and not(descendant::*[starts-with(text(), "ZOBACZ TAKŻE:")])')

        #text = response.css('div.full__article__body p:not([class^="embed__article"])').extract()
        selector = '//div[contains(@class, "full__article__body")]//p[%s]' % exclude_selectors
        text = response.xpath(selector)
        text = text.extract()

        # Remove paragraphs with photos and Twitter posts - https://t.co/ or pic.twitter.com/
        source = text[-1]
        text.pop(-1)
        text.pop(0)
        text = ' || '.join(text)
        text = remove_tags(text)
        source = remove_tags(source)

        # Joining lead with text
        text = ' || '.join([lead, text])

        tags = response.css(
            'div.full__article__tags__list a::attr("title")').extract()
        yield {
            'url': url,
            'date': date,
            'time': time,
            'title': ''.join(title),
            'lead': lead,
            'text': text,
            'source': source,
            'tags': ', '.join(tags)
        }
Example #13
    def safe_html(response):

        html_raw = response.xpath(
            '//div[contains(@class, "panel panel-default anenities")]'
        ).extract_first('').strip()
        html = replace_escape_chars(safehtml(htmlregion(html_raw))).replace(
            '<p></p>', '').strip()
        return html
Example #14
    def parse_tvn24bis(self, response):
        url = response.url
        art_id = url.split(',')[-1].split('.')[0]

        date = response.css(
            'article.detail header time::attr("datetime")').extract_first()
        date = date.split(' ')
        time = date[1][0:5]
        date = date[0]

        title = response.css("article.detail header h1 ::text").extract_first()
        title = replace_escape_chars(title).strip()

        lead = response.css("div.content p.lead ::text").extract_first()
        lead = replace_escape_chars(lead).strip()

        text = response.xpath(
            '//div[@class="content"]/p[not(contains(@class, "rules") or contains(@class, "footer"))]/text()'
        ).extract()

        text = ' || '.join(text)
        text = remove_tags(text)
        text = replace_escape_chars(text)
        text = clear_text(text)

        autor = response.css(
            "div.content div.footer ::text").extract()[1].split('/')
        if len(autor) > 1:
            source = autor[1]
            source = source.strip().replace('Źródło: ', '')
            autor = autor[0].strip().replace('Autor: ', '')
        else:
            source = ''
            autor = autor[0].strip().replace('Autor: ', '')

        yield {
            'id': art_id,
            'url': url,
            'date': date,
            'time': time,
            'title': ''.join(title),
            'lead': lead,
            'text': text,
            'autor': autor,
            'source': source
        }
Example #15
 def strip_all(self, value):
     # Strip whitespace, tags and newline characters
     try:
         value = remove_tags(value)
         value = replace_escape_chars(value)
     finally:
         value = value.strip()
     return value
Example #16
class ProductLoader(scrapy.loader.ItemLoader):
    default_output_processor = MapCompose(remove_tags)

    product_in = MapCompose(remove_tags,
                            lambda s: replace_escape_chars(s, which_ones=('\n', '\t', '\r', '\xa0', ' ')))
    product_out = TakeFirst()

    price_in = MapCompose(remove_tags,
                          lambda s: replace_escape_chars(s, which_ones=('\n', '\t', '\r', '\xa0', ' ')),
                          extract_number)
    price_out = TakeFirst()

    retailer_in = Compose()
    retailer_out = TakeFirst()

    date_in = Compose()
    date_out = TakeFirst()
Example #17
 def test_returns_unicode(self):
     # make sure it always return unicode
     assert isinstance(replace_escape_chars(b"no ec"), str)
     assert isinstance(replace_escape_chars(b"no ec", replace_by="str"), str)
     assert isinstance(replace_escape_chars(b"no ec", replace_by="str"), str)
     assert isinstance(
         replace_escape_chars(
             b"no ec",
             which_ones=(
                 "\n",
                 "\t",
             ),
         ),
         str,
     )
     assert isinstance(replace_escape_chars("no ec"), str)
     assert isinstance(replace_escape_chars("no ec", replace_by="str"), str)
     assert isinstance(
         replace_escape_chars(
             "no ec",
             which_ones=(
                 "\n",
                 "\t",
             ),
         ),
         str,
     )
Example #18
    def parse_interia(self, response):
        '''Parser for Interia'''
        url = response.url
        art_id = url.split('nId,')[1]

        date = response.css('.article-date ::attr("content")').extract_first()
        date = date.split('T')
        time = date[1]
        date = date[0]

        title = response.css("h1.article-title::text").extract()
        title = ' '.join(title)
        title = replace_escape_chars(title)

        lead = response.css(".article-body .article-lead::text").extract()
        lead = ' '.join(lead)
        lead = remove_tags(lead)

        # art_path = '//div[@class = "article-container"]/' \
        #            'div[not(*/@class = "embed")]/' \
        #            'p[not(/aside[@class = "embed embed-photo embed-center"])]'
        # text = response.xpath(art_path)

        exclude_selectors = (
            'not(self::*[contains(@class, "advert")])'
            ' and not(self::*[starts-with(text(), "ZOBACZ RÓWNIEŻ:")])'
            ' and not(self::*[starts-with(text(), "SPRAWDŹ:")])'
            ' and not(descendant-or-self::*[contains(@class, "sub")])'
            ' and not(descendant-or-self::*[contains(@class, "embed")])'
            ' and not(ancestor-or-self::*[contains(@class, "embed")])'
            ' and not(descendant-or-self::*[contains(@class, "aside")])'
            ' and not(ancestor-or-self::*[contains(@class, "aside")])'
            ' and not(descendant-or-self::*[contains(@class, "aside")])'
            ' and not(descendant-or-self::u)'
            ' and (self::p[not(contains(@dir, "ltr"))])')
        selector = '//div[@class = "article-container"]/' \
                   'div[not(*/@class = "embed")]/' \
                    '*[%s]' % exclude_selectors
        text = response.xpath(selector)

        text = text.extract()
        text = ' || '.join(text)
        text = remove_tags(text)
        text = clear_text(text)

        source = response.css(
            ".article-footer .article-source ::attr('content')").extract()
        yield {
            'id': art_id,
            'url': url,
            'date': date,
            'time': time,
            'title': ''.join(title),
            'lead': lead,
            'text': text,
            'source': ', '.join(source)
        }
Example #19
File: regex.py Project: bihicheng/scrapy
    def _extract_links(self, response_text, response_url, response_encoding):
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])

        return [Link(url, text) for url, text in urlstext]
Example #20
 def test_returns_unicode(self):
     # make sure it always return unicode
     assert isinstance(replace_escape_chars(b'no ec'), six.text_type)
     assert isinstance(replace_escape_chars(b'no ec', replace_by='str'), six.text_type)
     assert isinstance(replace_escape_chars(b'no ec', replace_by=u'str'), six.text_type)
     assert isinstance(replace_escape_chars(b'no ec', which_ones=('\n', '\t',)), six.text_type)
     assert isinstance(replace_escape_chars(u'no ec'), six.text_type)
     assert isinstance(replace_escape_chars(u'no ec', replace_by=u'str'), six.text_type)
     assert isinstance(replace_escape_chars(u'no ec', which_ones=('\n', '\t',)), six.text_type)
Example #21
 def test_returns_unicode(self):
     # make sure it always return unicode
     assert isinstance(replace_escape_chars(b'no ec'), six.text_type)
     assert isinstance(replace_escape_chars(b'no ec', replace_by='str'), six.text_type)
     assert isinstance(replace_escape_chars(b'no ec', replace_by=u'str'), six.text_type)
     assert isinstance(replace_escape_chars(b'no ec', which_ones=('\n', '\t',)), six.text_type)
     assert isinstance(replace_escape_chars(u'no ec'), six.text_type)
     assert isinstance(replace_escape_chars(u'no ec', replace_by=u'str'), six.text_type)
     assert isinstance(replace_escape_chars(u'no ec', which_ones=('\n', '\t',)), six.text_type)
Example #22
File: regex.py Project: 0326/scrapy
    def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        if base_url is None:
            base_url = urljoin(response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        return [Link(clean_url(url).encode(response_encoding),
                     clean_text(text))
                for url, _, text in links_text]
Example #23
 def normalizeTool(cls, html):
     '''
     :param html: str string from which the HTML markup should be removed
     :return: the string with HTML markup removed
     '''
     removeHtml = w3.replace_escape_chars(w3.replace_entities(
         w3.remove_tags(html)),
                                          replace_by=" ")
     # removeHtml = w3.replace_escape_chars(w3.replace_entities(w3.remove_tags(html)))
     removeEscapeChars = " ".join(removeHtml.split())
     return removeEscapeChars
Example #24
    def parse_tvn24(self, response):
        url = response.url
        art_id = url.split(',')[-1].split('.')[0]

        date = response.css(
            'div.articleDateContainer time::attr("datetime")').extract_first()
        date = date.split(' ')
        time = date[1]
        date = date[0]

        title = response.css("div.mainContainer h1 ::text").extract_first()
        title = replace_escape_chars(title).strip()

        lead = response.css("article h2 span.black ::text").extract_first()
        lead = replace_escape_chars(lead).strip()

        text = response.xpath(
            '//*[not(contains(self, "em") or contains(self, "figure") or contains(self, "aside") or contains(@class, "innerArticleModule.onRight.cols.externalContent.innerText") or contains(@class, "lead"))]/div[@class="textArticleDefault"]//article/p[not(contains(self, "em") or contains(self, "figure") or contains(self, "aside") or contains(@class, "innerArticleModule.onRight.cols.externalContent.innerText") or contains(@class, "innerText") or contains(@class, "lead") or contains(@class, "textHolder") or contains(self, "div") or contains(text(), "czytaj"))]/text()'
        ).extract()

        text = ' || '.join(text)
        text = remove_tags(text)
        text = replace_escape_chars(text)

        # Joining lead with text
        text = ' || '.join([lead, text])
        autor = response.css("div.articleAuthors ::text").extract()
        source = autor[2].strip().replace('Źródło: ', '')
        autor = autor[0].strip().replace('Autor: ', '')

        yield {
            'id': art_id,
            'url': url,
            'date': date,
            'time': time,
            'title': ''.join(title),
            'lead': lead,
            'text': text,
            'autor': autor,
            'source': source
        }
Example #25
    def parse_rmf(self, response):
        url = response.url
        art_id = url.split('nId,')[1]

        date = response.css('.article-date ::attr("content")').extract_first()
        date = date.split('T')
        time = date[1]
        date = date[0]

        title = response.css(".article-header .article-title::text").extract()
        title = ' '.join(title)
        title = replace_escape_chars(title)
        lead = response.css(".article-body .article-lead::text").extract()

        art_path = '//div[@class = "article-container"]/div[@class = "article-body"]/div[@class = "articleContent"][not(*/@class = "embed")]/p[not(contains(descendant-or-self, "u") or contains(descendant-or-self, "sub") or contains(descendant-or-self, "b") or contains(ancestor-or-self, "aside")  or contains(descendant-or-self, "aside") or contains(ancestor-or-self, "twitter-widget") or contains(@class, "Tweet-text"))]'
        text = response.xpath(art_path)
        text = text.extract()

        twitter = response.css(
            ".article-container .article-body .articleContent .embed-blockquote"
        ).extract()

        # Remove Twitter posts
        # TO DO - remove tweets more cleanly
        if len(twitter) > 0:
            for a in range(0, len(twitter)):
                for t in range(0, len(text)):
                    if text[t] == twitter[a]:
                        text[t] = ""

        text = ' || '.join(text)
        text = remove_tags(text)

        # Joining lead with text
        lead = ' '.join(lead)
        text = ' || '.join([lead, text])
        text = re.sub(r'\s+', ' ', text)
        autor = response.css(".article-author-name::text").extract()
        source = response.css(
            ".article-footer .article-source ::attr('content')").extract()
        tags = response.css(".elementTagsList a::text").extract()
        yield {
            'id': art_id,
            'url': url,
            'date': date,
            'time': time,
            'title': ''.join(title),
            'lead': lead,
            'text': text,
            'autor': ', '.join(autor),
            'source': ', '.join(source),
            'tags': ', '.join(tags)
        }
Example #26
    def parse_car_rating(self, response):
        loader = ItemLoader(item = ZigwheelsItem())

        rating = {}

        for i, li in enumerate(response.css('ul.p-list li')):
            if i > 2: break
            rating[replace_escape_chars(li.css('div.m-wl::text').get().lower().split(' ')[0])] = replace_escape_chars(li.css('div.m-wr::text').get())
        for li in response.css('div.rv-ls ul li'):
            rating[replace_escape_chars(li.css('div.m-wl::text').get().lower().split(' ')[0])] = replace_escape_chars(li.css('div.m-wr::text').get())

        loader.add_value('brand', response.xpath('//div[@class="zw-cmn-containerColor"]/div/ol/li[3]/a/span/text()').get())
        loader.add_value('name', response.xpath('//div[@class="zw-cmn-containerColor"]/div/ol/li[4]/a/span/text()').get())
        loader.add_value('mileage', rating['mileage'])
        loader.add_value('performance', rating['performance'])
        loader.add_value('maintenance', rating['maintenance'])
        loader.add_value('comfort', rating['comfort'])
        loader.add_value('safety', rating['safety'])
        loader.add_value('features', rating['features'])

        yield loader.load_item()
Example #27
 def test_with_escape_chars(self):
     # text with escape chars
     self.assertEqual(replace_escape_chars(u'escape\n\n'), u'escape')
     self.assertEqual(replace_escape_chars(u'escape\n', which_ones=('\t',)), u'escape\n')
     self.assertEqual(replace_escape_chars(u'escape\tchars\n', which_ones=('\t')), 'escapechars\n')
     self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=' '), 'escape chars ')
     self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=u'\xa3'), u'escape\xa3chars\xa3')
     self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=b'\xc2\xa3'), u'escape\xa3chars\xa3')
Example #28
 def test_with_escape_chars(self):
     # text with escape chars
     self.assertEqual(replace_escape_chars(u'escape\n\n'), u'escape')
     self.assertEqual(replace_escape_chars(u'escape\n', which_ones=('\t',)), u'escape\n')
     self.assertEqual(replace_escape_chars(u'escape\tchars\n', which_ones=('\t')), 'escapechars\n')
     self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=' '), 'escape chars ')
     self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=u'\xa3'), u'escape\xa3chars\xa3')
     self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=b'\xc2\xa3'), u'escape\xa3chars\xa3')
Example #29
    def _extract_links(self, response_text, response_url, response_encoding):
        base_url = urljoin_rfc(
            response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin_rfc(
            base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(
            remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        urlstext = set([(clean_url(url), clean_text(text))
                        for url, _, text in links_text])

        return [Link(url, text) for url, text in urlstext]
Example #30
    def clean_content(self, text):
        """
        Return a string of text cleaned up by tags, entities,
        escape chars, quotes and spaces
        """

        temp = remove_tags_with_content(text,
                                        which_ones=('style', 'script',
                                                    'figcaption'))
        temp = remove_tags(temp)
        temp = remove_entities(temp)
        temp = replace_escape_chars(temp)
        temp = unquote_markup(temp)
        temp = " ".join(temp.split())
        return temp
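Several of the older snippets above call remove_entities and urljoin_rfc; in current w3lib the former is replace_entities (w3lib.html) and the latter has been dropped in favour of the standard urllib.parse.urljoin. A minimal sketch of the same cleanup chain as Example #30 against the current API (clean_content_modern is an illustrative name, not from any project listed here):

from w3lib.html import (remove_tags, remove_tags_with_content,
                        replace_entities, replace_escape_chars, unquote_markup)

def clean_content_modern(text):
    # Same cleanup chain as clean_content above, with replace_entities
    # standing in for the removed remove_entities helper.
    temp = remove_tags_with_content(text, which_ones=('style', 'script', 'figcaption'))
    temp = remove_tags(temp)
    temp = replace_entities(temp)
    temp = replace_escape_chars(temp)
    temp = unquote_markup(temp)
    return " ".join(temp.split())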
Example #31
def processText(value):
    """process to get text, clean specifix character

    Arguments:
        value {string} -- input value

    Returns:
        string -- out put value
    """
    if value:
        value = replace_escape_chars(value)
        value = remove_tags(value)
        return value
    else:
        return ''
Example #32
def processHtml(value):
    """process to get text, clean specifix character
    
    Arguments:
        value {string} -- input value
    
    Returns:
        string -- out put value
    """
    if value:
        value = replace_escape_chars(value)
        value = value.replace('#ft5_slash#', '/')
        return value
    else:
        return ''
Example #33
    def _extract_links(self,
                       response_text,
                       response_url,
                       response_encoding,
                       base_url=None):
        if base_url is None:
            base_url = urljoin(
                response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin(
            base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(
            remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        urlstext = set([clean_url(url) for url in links_text])

        return [Link(url, "") for url in urlstext]
Example #34
    def parse_dziennik(self, response):
        '''Parser for dziennik.pl'''
        url = response.url
        art_id = url.split('artykuly/')[1]
        art_id = art_id.split(',')[0]

        date = response.css('span.ap-date time::text').extract_first()
        date = date.split(', ')
        time = date[1]
        date = date[0]
        date = date.replace('.', '-')

        title = response.css(
            ".articlepage .single-article-title::text").extract()
        title = ' '.join(title)
        title = replace_escape_chars(title)

        lead = response.css("article h2::text").extract()
        lead = ' '.join(lead)
        lead = remove_tags(lead)

        text = response.css('div#dziennik_intext.articleBody p').extract()

        # Remove paragraphs with photos and Twitter posts - https://t.co/ or pic.twitter.com/
        text = ' || '.join(text)
        text = remove_tags(text)

        # Joining lead with text
        text = ' || '.join([lead, text])
        source = response.css("div.ps-line strong::text").extract()
        tags = response.css("div.ps-line.tags a::text").extract()
        yield {
            'id': art_id,
            'url': url,
            'date': date,
            'time': time,
            'title': ''.join(title),
            'lead': lead,
            'text': text,
            'source': ', '.join(source),
            'tags': ', '.join(tags)
        }
Example #35
 def clean_text(text):
     return replace_escape_chars(remove_tags(text.decode(response_encoding))).strip()
Example #36
    def test_replace_escape_chars(self):
        # make sure it always return unicode
        assert isinstance(replace_escape_chars('no ec'), unicode)
        assert isinstance(replace_escape_chars('no ec', replace_by='str'), unicode)
        assert isinstance(replace_escape_chars('no ec', which_ones=('\n', '\t',)), unicode)

        # text without escape chars
        self.assertEqual(replace_escape_chars(u'no ec'), u'no ec')
        self.assertEqual(replace_escape_chars(u'no ec', which_ones=('\n',)), u'no ec')

        # text with escape chars
        self.assertEqual(replace_escape_chars(u'escape\n\n'), u'escape')
        self.assertEqual(replace_escape_chars(u'escape\n', which_ones=('\t',)), u'escape\n')
        self.assertEqual(replace_escape_chars(u'escape\tchars\n', which_ones=('\t')), 'escapechars\n')
        self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=' '), 'escape chars ')
        self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=u'\xa3'), u'escape\xa3chars\xa3')
        self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by='\xc2\xa3'), u'escape\xa3chars\xa3')
Example #37
File: test_html.py Project: kmike/w3lib
    def test_replace_escape_chars(self):
        # make sure it always return unicode
        assert isinstance(replace_escape_chars("no ec"), unicode)
        assert isinstance(replace_escape_chars("no ec", replace_by="str"), unicode)
        assert isinstance(replace_escape_chars("no ec", which_ones=("\n", "\t")), unicode)

        # text without escape chars
        self.assertEqual(replace_escape_chars(u"no ec"), u"no ec")
        self.assertEqual(replace_escape_chars(u"no ec", which_ones=("\n",)), u"no ec")

        # text with escape chars
        self.assertEqual(replace_escape_chars(u"escape\n\n"), u"escape")
        self.assertEqual(replace_escape_chars(u"escape\n", which_ones=("\t",)), u"escape\n")
        self.assertEqual(replace_escape_chars(u"escape\tchars\n", which_ones=("\t")), "escapechars\n")
        self.assertEqual(replace_escape_chars(u"escape\tchars\n", replace_by=" "), "escape chars ")
        self.assertEqual(replace_escape_chars(u"escape\tchars\n", replace_by=u"\xa3"), u"escape\xa3chars\xa3")
        self.assertEqual(replace_escape_chars(u"escape\tchars\n", replace_by="\xc2\xa3"), u"escape\xa3chars\xa3")
Example #38
 def test_without_escape_chars(self):
     # text without escape chars
     self.assertEqual(replace_escape_chars(u'no ec'), u'no ec')
     self.assertEqual(replace_escape_chars(u'no ec', which_ones=('\n',)), u'no ec')