Example #1
    def parse(self, response):
        for cell in response.xpath('//td'):
            try:
                # Take the second (url, text) pair in the cell; the
                # first link is skipped on purpose.
                links = extract_a(cell)
                next(links)
                url, t = next(links)
            except StopIteration:
                continue
            if not t or '=' not in url:
                continue

            a = get_article(url, t)
            if a is None:
                continue

            desc = ''.join(cell.xpath(desc_xp).extract()).strip()
            if desc:
                a['description'] = desc

            article_json(a)
            yield a

        # Pagination: 'mono' pages use the module-level pagen XPath,
        # everything else uses the first paginationControl block.
        if get_type(response.url) == 'mono':
            pn = pagen
        else:
            pn = '(//div[@class="paginationControl"])[1]'

        for url, t in extract_a(response.xpath(pn)):
            try:
                # Only follow links whose text is a page number.
                yield response.follow(url, meta={'page': int(t)})
            except ValueError:
                pass
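
Every example in this listing leans on a module-level helper, extract_a, whose definition is not shown. Judging from the call sites (a generator of (url, text) pairs that can be advanced with next() and raises StopIteration when exhausted), a minimal sketch, assuming Scrapy/parsel selectors, could look like this:

def extract_a(sel):
    # Hypothetical sketch, not the project's actual helper: walk the
    # <a> descendants of a selector and yield (href, text) pairs.
    for a in sel.xpath('.//a'):
        url = a.xpath('@href').extract_first()
        text = a.xpath('string()').extract_first('').strip()
        if url:
            yield url, text
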
Example #2
    def parse(self, response):
        # Follow each thread link found in the "num" cells.
        for url, t in extract_a(response.xpath('//td[@class="num"]')):
            yield Request(response.urljoin(url), callback=self.thread_parse)

        # Follow pagination links; skip entries whose text is not a number.
        for url, t in extract_a(response.xpath('(//div[@class="pg"])[1]')):
            try:
                yield Request(response.urljoin(url), meta={'page': int(t)})
            except ValueError:
                pass
Example #3
    def parse(self, response):
        # Skip the first two and the last section blocks.
        for section in response.xpath('//div[@class="d-sect"]')[2:-1]:
            sname = extract_t(section.xpath('p'))
            for url, t in extract_a(section):
                g = get_article(url)
                if g is None:
                    continue
                g['category'] = sname
                yield response.follow(url, meta={'genre': g}, callback=m_parse)
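
Example #3 also introduces extract_t, again defined elsewhere. Its call sites (extract_t(sel), extract_t(li, p='text()[2]'), extract_t(ul, 'li/a/@href')) suggest a text-joining helper with an overridable sub-path; a minimal sketch, with the default path being an assumption:

def extract_t(sel, p='.//text()'):
    # Hypothetical sketch: join the texts selected by `p` (relative to
    # `sel`) into one stripped string.
    return ''.join(sel.xpath(p).extract()).strip()
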
Example #4
    def parse(self, response):
        if get_type(response.url) == 'mono':
            mora = '(//td[@class="makerlist-box-t2" or @class="initial"])'
            xp = {
                'main': '//td[@class="w50"]',
                'name': './/a[@class="bold"]',
                'description': './/div[@class="maker-text"]',
            }

            # 'mono' pages carry a second, simpler maker table that is
            # parsed with its own XPaths (subt_main is a module-level
            # constant).
            subt = {
                'main': subt_main,
                'name': 'td/a',
                'description': '(td)[2]',
            }
            yield from makers(response, subt)
        else:
            mora = '(//ul[starts-with(@class,"d-mod")])[position()>1]'
            xp = {
                'main': '//div[@class="d-unit"]',
                'name': './/span[@class="d-ttllarge"]',
                'description': './/p',
            }

        g = response.meta.get('genre')
        yield from makers(response, xp, g)
        # When parsing on behalf of a genre page, only the maker/genre
        # records are wanted; skip the index links below.
        if g:
            return

        for url, t in extract_a(response.xpath(mora)):
            yield response.follow(url)
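
Several examples branch on get_type(response.url). The listing never shows its body; the code only compares the result against 'mono' and 'digital', which suggests it keys off a URL path segment. A sketch under that assumption:

def get_type(url):
    # Hypothetical sketch: the real rule for classifying URLs is not
    # shown in this listing.
    return 'mono' if '/mono/' in url else 'digital'
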
Example #5
    def parse(self, response):
        v_type = get_type(response.url)

        for actress in response.css('div.act-box').xpath('.//li'):
            try:
                url, t = next(extract_a(actress))
            except StopIteration:
                # An empty <li> would otherwise abort this generator
                # (PEP 479 turns a leaked StopIteration into an error).
                continue
            if v_type == 'digital':
                # Digital listing URLs carry a fixed-length trailing
                # segment that is not part of the actress URL.
                url = url[:-13]

            a = get_article(url, t)
            if a is None:
                continue

            # alias_re captures a name plus an optional alias; the
            # alias group is None when absent.
            name, alias = alias_re.match(t).groups()
            if alias is not None:
                a['name'] = name
                a['alias'] = alias

            extra = actress.xpath('.//span/text()').extract()
            if extra:
                a['kana'], alias_kana = alias_re.match(extra[0]).groups()
                if alias_kana is not None:
                    a['alias_kana'] = alias_kana

                try:
                    # The second span looks like "label:count".
                    a['count'] = int(extra[1].split(':')[1])
                except (IndexError, ValueError):
                    pass

            a['image'] = actress.xpath('.//img/@src').extract_first()

            article_json(a)
            yield a

        # Pagination: page 1 is the page being parsed, so skip it.
        for url, t in extract_a(response.xpath(pagen)):
            try:
                page = int(t)
                if page == 1:
                    continue
                yield response.follow(url, meta={'page': page})
            except ValueError:
                continue

        # Follow the kana-index links (aiueo is a module-level XPath).
        for url, t in extract_a(response.xpath(aiueo)):
            yield response.follow(url)
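
Example #5 relies on a module-level regex, alias_re, whose pattern is not shown. The code expects .match(t).groups() to yield (name, alias) with the alias group None when the text has no parenthesised alias. A pattern consistent with that usage (the exact bracket handling is an assumption):

import re

# Hypothetical sketch: matches "name" or "name(alias)"; the real
# pattern may also handle fullwidth parentheses.
alias_re = re.compile(r'([^(]+)(?:\(([^)]+)\))?')
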
Example #6
    def parse(self, response):
        v_type = get_type(response.url)

        desc = response.css('div.mg-b20.lh4')
        if v_type == 'mono':
            desc = desc.xpath('p')

        item = {
            'type': v_type,
            'url': response.url.split('?')[0],  # strip the query string
            'title': extract_t(response.xpath('//h1')),
            'cover': response.xpath(cover_xp).extract_first(),
            'description': extract_t(desc),
        }

        urls = {}

        # Each info row is the parent of a "nw" label cell.
        for row in response.xpath('//td[@class="nw"]/..'):
            info = extract_t(row.xpath('td'))[:-1]  # drop the trailing delimiter
            try:
                info, parser = info_box[info]
            except KeyError:
                continue

            if parser == 'PERFORMER':
                item.update(get_performers(row.xpath('td'), urls))
            elif parser is None:
                item[info] = extract_t(row.xpath('td[2]'))
            else:
                try:
                    item[info] = parser(get_articles(row.xpath('td'), urls))
                except StopIteration:
                    pass

        sample = response.xpath('//a[starts-with(@id,"sample-image")]/img')
        if sample:
            item['samples'] = len(sample)
            item['sample_link'] = sample.xpath('@src').extract_first()

        # The mutual-link list is loaded by an inline script; rebuild
        # its URL from the script text and fetch it out of band.
        m_l = response.xpath('//script[contains(.,"#mutual-link")]/text()')
        if m_l:
            m_l = response.urljoin(mutual_l.format(*m_l.re(r":\s*'(.*)',")))
            item['mutual'] = {i[0] for i in extract_a(get_aux(m_l))}

        a_p = response.xpath('//script[contains(.,"#a_performer")]/text()')
        if a_p:
            a_p = response.urljoin(a_p.re_first(r"url: '(.*)',"))
            item.update(get_performers(get_aux(a_p), urls))

        for url, a in urls.items():
            a['type'] = v_type
            yield response.follow(url, meta={'article': a}, callback=a_parse)

        item['JSON_FILENAME'] = JSON_FILENAME

        yield item
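
Example #6 fetches two script-driven lists through get_aux, which is not defined in this listing. Since its result is fed straight to extract_a and get_performers, it presumably fetches the URL out of band and returns a selector. A blocking sketch, assuming requests and parsel (the real project may route this through Scrapy instead):

import requests
from parsel import Selector

def get_aux(url):
    # Hypothetical sketch: synchronous fetch that bypasses Scrapy's
    # scheduler; returns a selector over the response body.
    return Selector(text=requests.get(url, timeout=30).text)
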
Example #7
def get_articles(links, urls=None, only_id=True):
    for url, t in extract_a(links):
        a = get_article(url, t, _type=False)
        if a is None:
            continue

        # Optionally collect each article once, keyed by URL, so the
        # caller can crawl the article pages afterwards.
        if urls is not None and url not in urls:
            urls[url] = a

        if only_id:
            yield a['id']
        else:
            yield a['article'], a['id']
Example #8
def get_articles(links, urls=None, only_id=True):
    for url, t in extract_a(links):
        # Some anchors are script hooks rather than real links.
        if url.startswith('javascript:'):
            continue

        a = get_article(url, t)
        if a is None:
            continue

        if urls is not None and url not in urls:
            urls[url] = a

        if only_id:
            yield a['id']
        else:
            yield a['article'], a['id']
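
Examples #7 and #8 are two variants of the same helper and show most of what get_article must do: turn a listing URL into a dict with at least 'article' and 'id', keep the link text, and return None when the URL does not parse. A sketch consistent with those call sites; the 'article=<kind>/id=<n>' URL layout is an assumption:

import re

article_re = re.compile(r'article=(\w+)/id=(\d+)')

def get_article(url, t=None, _type=True):
    # Hypothetical sketch; _type is kept only for signature
    # compatibility, its effect is not visible in this listing.
    m = article_re.search(url)
    if m is None:
        return None
    a = {'article': m.group(1), 'id': int(m.group(2))}
    if t:
        a['name'] = t
    return a
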
Example #9
def makers(response, xp, genre=None):
    # 'main' selects the maker blocks; the remaining XPaths are applied
    # per block, so it is popped from the dict first.
    for mk in response.xpath(xp.pop('main')):
        try:
            url = next(extract_a(mk))[0]
        except StopIteration:
            # A block without links would otherwise abort the generator.
            continue

        m = get_article(url)
        if m is None:
            continue

        # When called from a genre page, only record the genre id.
        if genre is not None:
            m['genre'] = {genre['id']}
            yield m
            continue

        m.update({k: extract_t(mk.xpath(v)) for k, v in xp.items()})

        img = mk.xpath('.//img/@src').extract_first()
        if img:
            m['image'] = img

        article_json(m)
        yield m
Example #10
    def parse(self, response):
        # 'mono' and digital listings mark up their sections differently.
        if get_type(response.url) == 'mono':
            xp = '//div[@class="sect01"]'
            s_xp = 'table/@summary'
        else:
            xp = '//div[@class="d-area area-list"]'
            s_xp = 'div[@class="d-capt"]/text()'

        # Skip the first section.
        for section in response.xpath(xp)[1:]:
            sname = section.xpath(s_xp).extract_first()

            for url, t in extract_a(section):
                # Skip in-page anchors.
                if url.startswith('#'):
                    continue

                item = get_article(url, t)
                if item is None:
                    continue

                item['category'] = sname

                article_json(item)
                yield item
Example #11
    def parse(self, response):
        p_type, pid = get_pid(response.url)

        if not pid:
            self.logger.warning('no pid in %s', response.url)
            return

        desc = response.xpath('//div[@class="title2"]/following-sibling::p')
        if p_type == 'PPV':
            desc = response.xpath('//ul[@class="review"]/li[1]')

        item = {
            'pid': pid,
            'type': p_type,
            'url': response.url,
            'title': extract_t(response.xpath('//h2')),
            'description': extract_t(desc),
        }

        vid = extract_t(response.xpath('//div[@class="top-title"]'))
        if vid:
            # The header reads "label: value"; keep the value part.
            item['vid'] = vid.split(': ')[1]

        # Prefer the cover hosted on imgs.aventertainments.
        for src in response.xpath(cover_xp).extract():
            if 'imgs.aventertainments' in src:
                item['cover'] = src
                break

        urls = {}

        for li in response.xpath(main_xp):
            info = extract_t(li.xpath('span') or li)
            try:
                # Labels end with a delimiter character; drop it.
                info, parser = info_box[info[:-1]]
            except KeyError:
                continue

            if parser is None:
                item[info] = extract_t(li, p='text()[2]')
            else:
                try:
                    i = parser(get_articles(li, urls))
                except StopIteration:
                    i = None
                item[info] = i

        for details in response.xpath('//div[@id="detailbox"]'):
            info = extract_t(details.xpath('span'))
            try:
                info, parser = info_box[info[:-1]]
            except KeyError:
                continue

            # Only article-list fields are merged from the detail box;
            # plain-text fields (parser is None) are ignored here.
            if parser is not None:
                try:
                    item[info] += parser(get_articles(details, urls))
                except StopIteration:
                    pass

        try:
            item['keyword'] = sorted(set(item.pop('keyword')))
        except KeyError:
            pass

        sample = response.xpath('//div[@class="TabbedPanels"]//img')
        if sample:
            item['sample_link'] = sample.xpath('@src').extract_first()

        th = response.css('ul.thumbs')
        if th:
            item['gallery'] = tuple(extract_t(ul, 'li/a/@href') for ul in th)

        mutual = response.xpath('//div[@id="mini-tabs"]')
        if mutual:
            item['mutual'] = sorted(i[0] for i in extract_a(mutual))

        for url, a in urls.items():
            a['type'] = p_type
            yield response.follow(url, meta={'article': a}, callback=a_parse)

        item['JSON_FILENAME'] = JSON_FILENAME

        yield item
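
Examples #6 and #11 both look labels up in info_box, a label-to-(field, parser) table defined elsewhere. A parser of None stores the row's plain text, 'PERFORMER' routes the row through get_performers, and a callable consumes the id generator from get_articles (next for single-valued fields, hence the StopIteration handling). The labels and field names below are invented for illustration:

info_box = {
    # Hypothetical sketch of the mapping's shape only.
    'Maker': ('maker', next),        # single id; next() may raise StopIteration
    'Category': ('keyword', list),   # every id in the row
    'Starring': ('performer', 'PERFORMER'),
    'Release Date': ('date', None),  # keep the raw cell text
}
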
Example #12
def studios(links):
    for url, t in extract_a(links):
        studio = get_article(url, t)
        if studio is None:
            continue

        article_json(studio)
        yield studio
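
Finally, nearly every spider funnels its dicts through article_json before yielding them, and the item examples attach a JSON_FILENAME key, which suggests a JSON-lines side file. A minimal sketch, with the filename and set handling as assumptions:

import json

def article_json(a, path='articles.jsonl'):
    # Hypothetical sketch: append one record per line; default=list
    # lets json serialise the set-valued fields seen above.
    with open(path, 'a', encoding='utf-8') as fh:
        json.dump(a, fh, ensure_ascii=False, default=list)
        fh.write('\n')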