# Shared imports for the spider callbacks below (Python 2 / Scrapy).
import logging
import re
import urlparse

from scrapy.http import FormRequest
from scrapy.selector import Selector


def parse_items(self, response):
    sel = Selector(response)
    # Match enforcement-action and administrative PDF links by href and class.
    getpdfs = ('//a[(contains(@href, "enforcementactions.pdf") or '
               'contains(@href, "admin")) and contains(@class, "titlelink")]')
    for link in sel.xpath(getpdfs):
        item = default(FILEITEM)
        group = default(FILEGROUP)
        url = URL + link.xpath('@href').extract()[0]
        text = link.xpath('text()').extract()
        # The link text carries a four-digit year and, after a colon, the
        # document name.
        year = re.findall(r'\d{4}', text[0])[0].encode('ascii', 'ignore').strip()
        name = re.search(': (.*)', text[0].encode('ascii', 'ignore')).group(1)
        item['source'] = url
        item['name'] = name
        item['state'] = state
        item['year'] = year
        group['items'].append(item)
        logging.info('SCRAPED > ' + str(len(group['items'])))
        yield group
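# NOTE: These callbacks lean on several names defined elsewhere in the
# project: default, FILEITEM, FILEGROUP, and the module-level URL and
# state constants. Their real definitions are not shown here; the sketch
# below is a hypothetical reconstruction of the shape the usage implies
# (templates that are copied on use, so items never share state).
import copy

FILEITEM = {'source': '', 'name': '', 'state': '', 'year': ''}  # assumed fields
FILEGROUP = {'items': []}


def default(template):
    # Deep-copy the template so mutating one item/group never bleeds
    # into another.
    return copy.deepcopy(template)


URL = 'http://example.gov'  # assumed base URL for relative hrefs
state = 'XX'                # assumed two-letter state code for this spider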
def dostuff(self, response):
    # Debugging hook: drop into an interactive scrapy shell for this response.
    from scrapy.shell import inspect_response
    inspect_response(response)
    sel = Selector(response)
    rows = sel.xpath('//table[@id="GridView1"]//tr')
    logging.info('---------CALLED DOSTUFF*********')
    # Skip the header row and the trailing footer/pager rows.
    for r in range(1, len(rows) - 2):
        item = default(FILEITEM)
        group = default(FILEGROUP)
        cols = rows[r].xpath('td')
        name = cols[0].xpath('text()').extract()[0].encode('ascii', 'ignore').upper()
        disp = cols[1].xpath('text()').extract()[0].encode('ascii', 'ignore')      # unused
        date = cols[2].xpath('text()').extract()[0].encode('ascii', 'ignore')
        year = re.findall(r'\d{4}', date)[0]
        doc_type = cols[3].xpath('text()').extract()[0].encode('ascii', 'ignore')  # unused
        url = cols[4].xpath('a/@href').extract()[0]
        item['source'] = url
        item['name'] = name
        item['state'] = state
        item['year'] = year
        group['items'].append(item)
        logging.info(item['name'])
        yield group  # emit each row's group so paginated rows are not discarded
def parse(self, response): sel = Selector(response) """ parse should grab all the data on a given page, then create a formrequest for the next page with a callback of parse """ rowSelector = '//table[@id="GridView1"]//tr' rows = sel.xpath(rowSelector) logging.info('********CALLED PARSE*********') for r in range(1, len(rows)-2): # The last row is the footer item = default(FILEITEM) group = default(FILEGROUP) cols = rows[r].xpath('td') name = cols[0].xpath('text()').extract()[0].encode('ascii', 'ignore').upper() disp = cols[1].xpath('text()').extract()[0].encode('ascii', 'ignore') date = cols[2].xpath('text()').extract()[0].encode('ascii', 'ignore') y = re.compile('\d\d\d\d') year = y.findall(date)[0].encode('ascii', 'ignore') type = cols[3].xpath('text()').extract()[0].encode('ascii', 'ignore') link = cols[4].xpath('a') url = link.xpath('@href').extract()[0] title = 'Disposition' item['source'] = url item['name'] = name item['state'] = state item['year'] = year group['items'].append(item) logging.info(item['name']) item = default(FILEITEM) viewStateSelector = '//input[@name="__VIEWSTATE"]/@value' viewState = sel.xpath(viewStateSelector).extract()[0].encode('ascii','ignore') eventValidationSel = '//input[@name="__EVENTVALIDATION"]/@value' eventValidation = sel.xpath(eventValidationSel).extract()[0] # print viewState resp = FormRequest.from_response( response, formname="form1", formdata={ '__EVENTTARGET': 'GridView1', '__EVENTARGUMENT': 'Page$Next', '__EVENTVALIDATION': eventValidation, '__VIEWSTATE': viewState.replace('%2','/'), 'ctl00$jsCheck' : str(0) }, callback=self.dostuff, method='POST', dont_filter=True ) logging.info(resp.headers) logging.info(resp.body)
def parse(self, response):
    sel = Selector(response)
    year = ''
    rows = sel.xpath('//div//table//tbody//tr')
    for r in range(len(rows)):
        # 1. If the current row is a header row, read the year from it.
        # 2. Otherwise, grab the link and text from each cell.
        header = rows[r].xpath('th/text()')
        if header:
            found = re.findall(r'\d{4}', header[0].extract())
            if not found:
                found = ['2014']  # header rows without a year default to 2014
            year = found[0].encode('ascii', 'ignore').strip()
        else:
            group = default(FILEGROUP)
            for l in rows[r].xpath('td//a[contains(@href, "lib/cid")]'):
                text = l.xpath('text()').extract()
                if text:
                    item = default(FILEITEM)
                    url = l.xpath('@href').extract()[0]
                    url = urlparse.urljoin(response.url, url.strip()).encode('ascii', 'ignore')
                    name = text[0].encode('ascii', 'ignore')
                    name = re.sub(r'[\\\?\*"\.><\|\r\n]', '', name)
                    name = re.sub(r'[:/]', '-', name)
                    item['source'] = url
                    item['name'] = name
                    item['state'] = state
                    item['year'] = year
                    group['items'].append(item)
            logging.info('SCRAPED > ' + str(len(group['items'])))
            yield group
def parse_items(self, response):
    sel = Selector(response)
    # Find the name of the order
    title = sel.xpath('//h1[@id="page-title"]/text()').extract()[0].encode('ascii', 'ignore')
    # Find the date of the order
    date = sel.xpath('//span[@class="date-display-single"]/text()').extract()[0]
    year = re.findall(r'\d{4}', date)[0].encode('ascii', 'ignore')
    # Find the files we want to download
    files = sel.xpath('//span[@class="file"]//a')
    group = default(FILEGROUP)
    for f in files:
        item = default(FILEITEM)
        url = f.xpath('@href').extract()[0]
        # Reconstruct the absolute path to the file
        url = urlparse.urljoin(response.url, url.strip()).encode('ascii', 'ignore')
        name = f.xpath('text()').extract()[0].replace('.pdf', '')
        name = (re.sub(r'[\\\?\*"\.><\|\r\n]', '', title).replace('/', ' ') +
                ' - ' + re.sub(r'[\\\?\*"\.><\|\r\n]', '', name)).replace(', ', ',')
        item['source'] = url
        item['name'] = name
        item['state'] = state
        item['year'] = year
        group['items'].append(item)
    logging.info('SCRAPED > ' + str(len(group['items'])))
    yield group
def parse(self, response):
    sel = Selector(response)
    rows = sel.xpath('//table[@class="datatable"]//tr')
    for r in rows:
        col = r.xpath('td')
        if col:
            item = default(FILEITEM)
            group = default(FILEGROUP)
            name = col[0].xpath('text()').extract()[0]
            url = col[1].xpath('a//@href').extract()[0]
            year = col[2].xpath('text()').extract()[0]
            name = re.sub(' +', ' ', name).strip().encode('ascii', 'ignore')
            name = re.sub(r'[\\\?\*"\.><\|\r\n]', '', name)
            name = re.sub(r'[/:]', '-', name)
            name = name.replace(';', ',')
            url = urlparse.urljoin(response.url, url.strip())
            year = re.findall(r'\d{4}', year)[0].encode('ascii', 'ignore')
            if year == '1007':  # fix a known typo in the source data
                year = '2007'
            item['source'] = url.encode('ascii', 'ignore')
            item['name'] = name
            item['state'] = state
            item['year'] = year
            group['items'].append(item)
            logging.info('SCRAPED > ' + str(len(group['items'])))
            yield group
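# Nearly every callback repeats the same cleanup: collapse runs of
# spaces, strip characters that are illegal in filenames, and pull a
# four-digit year out of free text. A sketch of shared helpers that
# could consolidate the pattern (clean_name and extract_year are
# hypothetical names, not part of the original code):
import re

FILENAME_ILLEGAL = re.compile(r'[\\\?\*"\.><\|\r\n]')
YEAR = re.compile(r'\d{4}')


def clean_name(text):
    # Collapse whitespace, drop filename-hostile characters, and turn
    # path separators into hyphens.
    text = re.sub(' +', ' ', text).strip().encode('ascii', 'ignore')
    text = FILENAME_ILLEGAL.sub('', text)
    return re.sub(r'[:/]', '-', text)


def extract_year(text, fallback=None):
    # Return the first four-digit year found in text, or the fallback.
    match = YEAR.search(text)
    return match.group(0) if match else fallback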
def parse_items(self, response):
    sel = Selector(response)
    # Find the date of the order
    date = ' '.join(sel.xpath('//td[@class="title"]/text()').extract())
    year = re.findall(r'\d{4}', date)[0].encode('ascii', 'ignore').strip()
    # Find the files we want to download. Rows come in three shapes:
    # 1. A single link with the company name as the link text
    # 2. A line of text with the company name, plus one or more document links
    # 3. Multiple links, the top one carrying the company name
    rows = sel.xpath('//table[@id="blue_table"]//tr')
    for r in rows:
        group = default(FILEGROUP)
        col = r.xpath('td')
        if not col:
            continue  # nothing to scrape in an empty row
        # The company name can sit at several different depths; try each
        # selector in turn and keep the first match.
        compname = ''
        title_selectors = ('div//p/text()', 'div//text()', 'div//span/text()',
                           'p//span/text()', 'p/text()', 'span/text()', 'text()')
        for ts in title_selectors:
            titles = col[0].xpath(ts)
            if titles:
                compname = re.sub(' +', ' ', titles[0].extract()).strip().encode('ascii', 'ignore')
                compname = re.sub(r'[\\\?\*"\.><\|\r\n]', '', compname)
                break
        # If no title was found, fall back to the first link's text as the
        # company name.
        if compname == '':
            for ts in ('a/text()', 'div//a/text()'):
                titles = col[0].xpath(ts)
                if titles:
                    compname = re.sub(' +', ' ', titles[0].extract()).strip().encode('ascii', 'ignore')
                    compname = re.sub(r'[\\\?\*"\.><\|\r\n]', '', compname)
                    break
        # Links can also sit at several depths; keep the first non-empty set.
        links = col[0].xpath('a') or col[0].xpath('div//a') or col[0].xpath('p//a')
        for l in links:
            item = default(FILEITEM)
            name = l.xpath('text()').extract()[0]
            name = re.sub(' +', ' ', name).strip().encode('ascii', 'ignore')
            name = re.sub(r'[\\\?\*"\.><\|\r\n]', '', name)
            url = l.xpath('@href').extract()[0]
            url = urlparse.urljoin(response.url, url.strip()).encode('ascii', 'ignore')
            item['source'] = url
            item['name'] = compname + ' ' + name  # prefix each document with the company name
            item['state'] = state
            item['year'] = year
            group['items'].append(item)
        logging.info('SCRAPED > ' + str(len(group['items'])))
        yield group
def parse(self, response):
    sel = Selector(response)
    rows = sel.xpath('//body//td[@id="awt-middle-col"]//table//tr')
    curyear = 0
    for i in range(1, len(rows)):
        # Year headers appear as <strong> cells; carry the last one seen
        # forward to the rows beneath it.
        year = rows[i].xpath('td//strong/text()').extract()
        if year:
            curyear = year[0].encode('ascii', 'ignore')
        year = curyear
        # The rows of the table are structured three different ways:
        # 1. text directly within <td>
        # 2. within <td><div>
        # 3. within <td><p>
        # Try each selector and keep the first title that survives cleanup.
        comp = ''
        for selector in ('td/text()', 'td//div/text()', 'td//p/text()'):
            text = rows[i].xpath(selector).extract()
            if text:
                cleaned = re.sub(' +', ' ', text[0]).rstrip().encode('ascii', 'ignore')
                cleaned = re.sub(r'[\\\?\*"\.><\|\r\n]', '', cleaned)
                cleaned = re.sub(r'[:/]', '-', cleaned)
                cleaned = cleaned.replace(' - ', '').replace(' -', '')
                if cleaned:
                    comp = cleaned
                    break
        group = default(FILEGROUP)
        docs = rows[i].xpath('td//div/a')
        for l in docs:
            item = default(FILEITEM)
            lastpart = l.xpath('text()').extract()[0].rstrip().encode('ascii', 'ignore')
            if len(docs) == 1 or 'PDF' not in lastpart:
                url = l.xpath('@href').extract()[0]
                url = urlparse.urljoin(response.url, url.strip())
                name = comp + ' - ' + lastpart
                logging.info(lastpart + ': ' + name)
                item['source'] = url.encode('ascii', 'ignore').replace(' ', '%20')
                item['name'] = name
                item['state'] = state
                item['year'] = year
                group['items'].append(item)
        logging.info('SCRAPED > ' + str(len(group['items'])))
        yield group