示例#1
0
    def process_item(self, item, spider):
        """Clean the first ``summary`` and ``job_title`` entry of *item*.

        Each of the two values is passed through ``remove_tags`` and then
        ``replace_escape_chars`` and written back in place.  The same item
        object is returned.  ``spider`` is not used.
        """
        for field in ('summary', 'job_title'):
            item[field][0] = replace_escape_chars(remove_tags(item[field][0]))

        return item
示例#2
0
    def process_item(self, item, spider):
        """Clean the first ``summary`` and ``job_title`` entry, then print.

        Each of the two values is passed through ``remove_tags`` and then
        ``replace_escape_chars`` and stored back in place.  The cleaned item
        is written to standard output and returned.  ``spider`` is not used.
        """
        for field in ('summary', 'job_title'):
            text = remove_tags(item[field][0])
            item[field][0] = replace_escape_chars(text)

        # Call form of print: with a single argument it prints the same thing
        # on Python 2 and is valid syntax on Python 3, unlike ``print item``.
        print(item)

        return item
示例#3
0
    def parse_news(self, response):
        """Extract one article from *response* and yield it as a dict.

        The yielded dict has the keys ``article``, ``subtitle``, ``title``,
        ``date``, ``link`` (the response URL) and ``website`` (always
        ``'blasting'``).  ``date``, ``title`` and ``subtitle`` are whatever
        ``extract_first()`` returns for their selectors, so they may be
        ``None``.

        The article markup is taken from
        ``div.article-body.p402_premium.template-a`` when that element
        exists, otherwise from ``div#article-body-p1``.  Nested tags listed
        in ``which_ones`` are dropped together with their content, remaining
        tags are stripped, newlines are removed and anything matching
        ``http\\S+`` is deleted.
        """
        date = response.css(
            "div.content-time-published.margin .time-modified.margin::text"
        ).extract_first()
        title = response.css(
            "span#id-blasting-tv-masthead-video-title::text").extract_first()
        subtitle = response.css("h2.title-h2::text").extract_first()

        # Test for the missing element explicitly rather than relying on a
        # bare ``except`` to catch the error raised when None is passed on;
        # a bare except would also swallow unrelated failures.
        primary_html = response.css(
            "div.article-body.p402_premium.template-a").extract_first()
        if primary_html is not None:
            article = remove_tags_with_content(
                primary_html, which_ones=('div', 'script'))
        else:
            article = remove_tags_with_content(
                response.css("div#article-body-p1").extract_first(),
                which_ones=('div', 'a', 'script'))

        article = remove_tags(article)
        # One-element tuple: ('\n') without the comma is just the string.
        article = replace_escape_chars(article, which_ones=('\n',))
        article = re.sub(r'http\S+', '', article).strip()
        yield {
            'article': article,
            'subtitle': subtitle,
            'title': title,
            'date': date,
            'link': response.url,
            'website': 'blasting'
        }
示例#4
0
    def process_item(self, item, spider):
        """Reduce ``item['body']`` from an HTML document to plain text.

        Steps, in order: keep only the ``<body>`` element, drop ``<style>``
        and ``<script>`` elements together with their content, replace every
        remaining tag with a space, then replace escape characters with a
        space.  The result is stored back in ``item['body']``, the item's
        title is logged, and the same item is returned.  ``spider`` is not
        used.
        """
        body_only = Selector(text=item['body']).css('body').get()
        without_scripts = remove_tags_with_content(
            body_only, which_ones=('style', 'script'))
        tags_as_spaces = replace_tags(without_scripts, ' ')
        # The second positional parameter of replace_escape_chars is
        # ``which_ones``, not the replacement.  Passing ' ' positionally
        # deleted every space and left the escape characters untouched, so
        # the replacement has to be given as ``replace_by``.
        item['body'] = replace_escape_chars(tags_as_spaces, replace_by=' ')

        logging.info('Item cleaned up: %s', item["title"])
        return item
示例#5
0
文件: regex.py 项目: serkanh/scrapy
    def _extract_links(self, response_text, response_url, response_encoding):
        """Return one ``Link`` per distinct (url, text) pair in the text.

        ``linkre`` is run over *response_text*; each match supplies a raw
        URL and a raw link text, both decoded with *response_encoding*.
        URLs are cleaned and joined onto ``self.base_url`` when that is set,
        otherwise onto *response_url*.  Link texts have tags and escape
        characters removed and are stripped.  Duplicate pairs collapse
        because the pairs are collected in a set, so the order of the
        returned list is the set's iteration order.
        """
        base_url = self.base_url or response_url

        def clean_url(raw_url):
            decoded = raw_url.decode(response_encoding)
            return urljoin_rfc(base_url, remove_entities(clean_link(decoded)))

        def clean_text(raw_text):
            decoded = raw_text.decode(response_encoding)
            return replace_escape_chars(remove_tags(decoded)).strip()

        unique_pairs = {
            (clean_url(raw_url), clean_text(raw_text))
            for raw_url, _, raw_text in linkre.findall(response_text)
        }

        return [Link(url, text) for url, text in unique_pairs]
示例#6
0
 def process_item(self, item, spider):
   """Validate, de-duplicate and clean *item*.

   The item is dropped (``DropItem``) when any of ``title``, ``author``,
   ``date``, ``text`` or ``link`` is falsy, or when its ``link`` is already
   in ``self.urls_seen``.  Otherwise ``text`` (from index 25 onwards) and
   ``title`` are reduced to characters found in ``string.printable``;
   ``text`` additionally has escape characters removed, ``div``/``img``/
   ``script`` elements dropped with their content, and remaining tags
   stripped.  The link is recorded in ``self.urls_seen`` and the item is
   returned.  ``spider`` is not used.

   NOTE(review): assumes ``text`` and ``title`` are strings, not lists --
   confirm against the spider that fills them.
   """
   required = ('title', 'author', 'date', 'text', 'link')
   if not all(item[field] for field in required):
     raise DropItem('Missing fields %s' % item)
   if item['link'] in self.urls_seen:
     raise DropItem('Duplicate item %s' % item)

   # ''.join(...) instead of filter(): on Python 3 filter() returns an
   # iterator, not a string, which the w3lib helpers cannot process.  On
   # Python 2 the joined result is the same string filter() produced.
   body = ''.join(ch for ch in item['text'][25:] if ch in string.printable)
   body = replace_escape_chars(body)
   body = remove_tags_with_content(body, which_ones=('div', 'img', 'script'))
   item['text'] = remove_tags(body)
   item['title'] = ''.join(
     ch for ch in item['title'] if ch in string.printable)
   self.urls_seen.add(item['link'])
   return item
示例#7
0
    def _extract_links(self, response_text, response_url, response_encoding):
        """Build ``Link`` objects for the distinct links in the text.

        When ``self.base_url`` is set it is first joined onto
        *response_url*; otherwise *response_url* itself is the base.  Every
        ``linkre`` match supplies a raw URL and a raw link text, both decoded
        with *response_encoding*.  The URL is cleaned and joined onto the
        base; the text has tags and escape characters removed and is
        stripped.  Pairs are collected in a set, so duplicates collapse and
        the result follows the set's iteration order.
        """
        if self.base_url:
            base_url = urljoin_rfc(response_url, self.base_url)
        else:
            base_url = response_url

        seen_pairs = set()
        for raw_url, _, raw_text in linkre.findall(response_text):
            link_url = urljoin_rfc(
                base_url,
                remove_entities(clean_link(raw_url.decode(response_encoding))))
            link_text = replace_escape_chars(
                remove_tags(raw_text.decode(response_encoding))).strip()
            seen_pairs.add((link_url, link_text))

        return [Link(url, text) for url, text in seen_pairs]
示例#8
0
def replace_escape(value):
    """Return *value* with escape characters replaced by one space each.

    Thin wrapper around ``replace_escape_chars`` that fixes ``replace_by``
    to ``u' '`` and leaves every other argument at its default.
    """
    spaced = replace_escape_chars(value, replace_by=u' ')
    return spaced
示例#9
0
    def load_sectionitem(self, page1_selector, page2_selector, term, is_open,
                         clss, section_index, term_index, course_index):
        """Populate and return a ``SectionItem`` from the two selectors.

        An ``ItemLoader`` is created on *page2_selector* and the section
        detail fields are added by XPath.  *term* and *is_open* are added as
        plain values (``term`` and ``open``).  The loader's selector is
        switched to *page1_selector* before ``open`` is added, so the final
        ``component`` XPath (indexed by *section_index*) is added after that
        switch.

        When *page1_selector* has an element whose id starts with
        ``DERIVED_CLSRCH_DESCR200$<course_index>``, its cleaned text is split
        on whitespace: word 0 is used as the department code, word 1 as the
        course number and the remaining words as the title.  The session
        code is derived from the ``PSXLATITEM_XLATLONGNAME`` element of
        *page2_selector*, and the matching ``Course`` is added as ``clss``.

        The *clss* and *term_index* parameters are accepted but not used.

        Raises:
            IndexError: the description has fewer than two words.
            KeyError: the session label is not in the session table.
            Whatever the ``Department`` / ``Course`` lookups raise when no
            (or, for ``get``, more than one) row matches.
        """
        print("******* Begin loading section {} *******".format(section_index))

        section_loader = ItemLoader(item=SectionItem(),
                                    selector=page2_selector)

        schedule_xpath = '//*[@id="MTG_SCHED$0"]'
        # (field, xpath) pairs; they are added in exactly this order.
        detail_fields = (
            ('sid', '//*[@id="SSR_CLS_DTL_WRK_CLASS_NBR"]'),
            ('days', schedule_xpath),
            ('mon', schedule_xpath),
            ('tue', schedule_xpath),
            ('wed', schedule_xpath),
            ('thu', schedule_xpath),
            ('fri', schedule_xpath),
            ('start', schedule_xpath),
            ('ending', schedule_xpath),
            ('professor', '//*[@id="MTG_INSTR$0"]'),
            ('room', '//*[@id="MTG_LOC$0"]'),
            ('cap', '//*[@id="SSR_CLS_DTL_WRK_ENRL_CAP"]'),
            # can have individual and combined capacities
            ('enrolled', '//*[@id="SSR_CLS_DTL_WRK_ENRL_TOT"]'),
            ('wcap', '//*[@id="SSR_CLS_DTL_WRK_WAIT_CAP"]'),
            ('wenrolled', '//*[@id="SSR_CLS_DTL_WRK_WAIT_TOT"]'),
        )
        for field, xpath in detail_fields:
            section_loader.add_xpath(field, xpath)

        section_loader.add_value('term', term)

        section_loader.selector = page1_selector
        section_loader.add_value('open', is_open)

        # Evaluate the description query once and reuse the result.
        description = page1_selector.css(
            "[id^='DERIVED_CLSRCH_DESCR200$" + str(course_index) +
            "']").extract_first()
        if description is not None:
            words = replace_escape_chars(remove_tags(description)).split()

            # Every title word is followed by a space, including the last
            # one, so a non-empty title ends with a trailing space.
            title = ''.join(word + ' ' for word in words[2:])

            number = words[1]

            dept = Department.objects.get(code=words[0])

            input_str = replace_escape_chars(
                remove_tags(
                    page2_selector.css(
                        "[id='PSXLATITEM_XLATLONGNAME']").extract_first()))
            session_dict = {
                'University': 'un',
                'University Eligible/CPE': 'uc',
                'University Non-standard Dates': 'ud',
                'CPE (Continuing Education)': 'ce',
                'CPE Non-standard Dates': 'cu',
                'CPE Summer Session 1': 'c1',
                'CPE Summer Session 2': 'c2',
                'CPE Summer Session 3': 'c3',
            }

            # A label containing '*' is looked up without its first
            # character.
            label = input_str[1:] if '*' in input_str else input_str
            session = session_dict[label]

            section_loader.add_value(
                'clss',
                Course.objects.filter(title=title, session=session,
                                      dept=dept).get(number=number))
        section_loader.add_xpath(
            'component', '//*[@id="DERIVED_CLSRCH_SSR_CLASSNAME_LONG$' +
            str(section_index) + '"]')

        return section_loader.load_item()
示例#10
0
 def default_proc(input):
     """Return *input* passed through ``remove_tags`` and then
     ``replace_escape_chars`` (both with their default arguments)."""
     return replace_escape_chars(remove_tags(input))
示例#11
0
def replace_escape(value):
    """Replace the escape characters in *value* with a space.

    Delegates to ``replace_escape_chars`` with ``replace_by=u' '``; the set
    of characters treated as escapes is that function's default.
    """
    replacement = u' '
    return replace_escape_chars(value, replace_by=replacement)
示例#12
0
    def test_replace_escape_chars(self):
        # Table-driven check of replace_escape_chars: first the return type,
        # then the exact output for a list of inputs.

        # make sure it always returns unicode, whatever keywords are passed
        for kwargs in (
                {},
                {'replace_by': 'str'},
                {'which_ones': ('\n', '\t', )},
        ):
            assert isinstance(replace_escape_chars('no ec', **kwargs), unicode)

        # (input text, keyword arguments, expected result), checked in order
        cases = (
            # text without escape chars
            (u'no ec', {}, u'no ec'),
            (u'no ec', {'which_ones': ('\n', )}, u'no ec'),
            # text with escape chars
            (u'escape\n\n', {}, u'escape'),
            (u'escape\n', {'which_ones': ('\t', )}, u'escape\n'),
            # note: which_ones is a plain string here, not a 1-tuple
            (u'escape\tchars\n', {'which_ones': ('\t')}, 'escapechars\n'),
            (u'escape\tchars\n', {'replace_by': ' '}, 'escape chars '),
            (u'escape\tchars\n', {'replace_by': u'\xa3'},
             u'escape\xa3chars\xa3'),
            (u'escape\tchars\n', {'replace_by': '\xc2\xa3'},
             u'escape\xa3chars\xa3'),
        )
        for text, kwargs, expected in cases:
            self.assertEqual(replace_escape_chars(text, **kwargs), expected)
示例#13
0
    def test_replace_escape_chars(self):
        # Exercises replace_escape_chars through two small local helpers so
        # each case reads as "expected result, then the call arguments".
        clean = replace_escape_chars

        def expect(expected, *args, **kwargs):
            self.assertEqual(clean(*args, **kwargs), expected)

        # make sure it always returns unicode
        assert isinstance(clean('no ec'), unicode)
        assert isinstance(clean('no ec', replace_by='str'), unicode)
        assert isinstance(clean('no ec', which_ones=('\n', '\t',)), unicode)

        # text without escape chars
        expect(u'no ec', u'no ec')
        expect(u'no ec', u'no ec', which_ones=('\n',))

        # text with escape chars
        expect(u'escape', u'escape\n\n')
        expect(u'escape\n', u'escape\n', which_ones=('\t',))
        # note: which_ones is a plain string here, not a 1-tuple
        expect('escapechars\n', u'escape\tchars\n', which_ones=('\t'))
        expect('escape chars ', u'escape\tchars\n', replace_by=' ')
        expect(u'escape\xa3chars\xa3', u'escape\tchars\n', replace_by=u'\xa3')
        expect(u'escape\xa3chars\xa3', u'escape\tchars\n',
               replace_by='\xc2\xa3')