# Shared imports for the spider callbacks below (Python 2 / Scrapy).
import logging
import re
import urlparse

from scrapy.http import FormRequest
from scrapy.selector import Selector


def parse_items(self, response):
    sel = Selector(response)
    # Match enforcement-action and administrative PDF links by href and class.
    getpdfs = ('//a[(contains(@href, "enforcementactions.pdf") or '
               'contains(@href, "admin")) and contains(@class, "titlelink")]')
    for link in sel.xpath(getpdfs):
        item = default(FILEITEM)
        group = default(FILEGROUP)
        url = URL + link.xpath('@href').extract()[0]
        text = link.xpath('text()').extract()
        # The link text carries a four-digit year and, after a colon, the
        # document name.
        year = re.findall(r'\d{4}', text[0])[0].encode('ascii', 'ignore').strip()
        name = re.search(': (.*)', text[0].encode('ascii', 'ignore')).group(1)
        item['source'] = url
        item['name'] = name
        item['state'] = state
        item['year'] = year
        group['items'].append(item)
        logging.info('SCRAPED > ' + str(len(group['items'])))
        yield group
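# NOTE: These callbacks lean on several names defined elsewhere in the
# project: default, FILEITEM, FILEGROUP, and the module-level URL and
# state constants. Their real definitions are not shown here; the sketch
# below is a hypothetical reconstruction of the shape the usage implies
# (templates that are copied on use, so items never share state).
import copy

FILEITEM = {'source': '', 'name': '', 'state': '', 'year': ''}  # assumed fields
FILEGROUP = {'items': []}


def default(template):
    # Deep-copy the template so mutating one item/group never bleeds
    # into another.
    return copy.deepcopy(template)


URL = 'http://example.gov'  # assumed base URL for relative hrefs
state = 'XX'                # assumed two-letter state code for this spider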
def dostuff(self, response):
    # Debugging hook: drop into an interactive scrapy shell for this response.
    from scrapy.shell import inspect_response
    inspect_response(response)
    sel = Selector(response)
    rows = sel.xpath('//table[@id="GridView1"]//tr')
    logging.info('---------CALLED DOSTUFF*********')
    # Skip the header row and the trailing footer/pager rows.
    for r in range(1, len(rows) - 2):
        item = default(FILEITEM)
        group = default(FILEGROUP)
        cols = rows[r].xpath('td')
        name = cols[0].xpath('text()').extract()[0].encode('ascii', 'ignore').upper()
        disp = cols[1].xpath('text()').extract()[0].encode('ascii', 'ignore')      # unused
        date = cols[2].xpath('text()').extract()[0].encode('ascii', 'ignore')
        year = re.findall(r'\d{4}', date)[0]
        doc_type = cols[3].xpath('text()').extract()[0].encode('ascii', 'ignore')  # unused
        url = cols[4].xpath('a/@href').extract()[0]
        item['source'] = url
        item['name'] = name
        item['state'] = state
        item['year'] = year
        group['items'].append(item)
        logging.info(item['name'])
        yield group  # emit each row's group so paginated rows are not discarded
def parse(self, response): sel = Selector(response) """ parse should grab all the data on a given page, then create a formrequest for the next page with a callback of parse """ rowSelector = '//table[@id="GridView1"]//tr' rows = sel.xpath(rowSelector) logging.info('********CALLED PARSE*********') for r in range(1, len(rows)-2): # The last row is the footer item = default(FILEITEM) group = default(FILEGROUP) cols = rows[r].xpath('td') name = cols[0].xpath('text()').extract()[0].encode('ascii', 'ignore').upper() disp = cols[1].xpath('text()').extract()[0].encode('ascii', 'ignore') date = cols[2].xpath('text()').extract()[0].encode('ascii', 'ignore') y = re.compile('\d\d\d\d') year = y.findall(date)[0].encode('ascii', 'ignore') type = cols[3].xpath('text()').extract()[0].encode('ascii', 'ignore') link = cols[4].xpath('a') url = link.xpath('@href').extract()[0] title = 'Disposition' item['source'] = url item['name'] = name item['state'] = state item['year'] = year group['items'].append(item) logging.info(item['name']) item = default(FILEITEM) viewStateSelector = '//input[@name="__VIEWSTATE"]/@value' viewState = sel.xpath(viewStateSelector).extract()[0].encode('ascii','ignore') eventValidationSel = '//input[@name="__EVENTVALIDATION"]/@value' eventValidation = sel.xpath(eventValidationSel).extract()[0] # print viewState resp = FormRequest.from_response( response, formname="form1", formdata={ '__EVENTTARGET': 'GridView1', '__EVENTARGUMENT': 'Page$Next', '__EVENTVALIDATION': eventValidation, '__VIEWSTATE': viewState.replace('%2','/'), 'ctl00$jsCheck' : str(0) }, callback=self.dostuff, method='POST', dont_filter=True ) logging.info(resp.headers) logging.info(resp.body)
def parse(self, response):
    sel = Selector(response)
    year = ''
    rows = sel.xpath('//div//table//tbody//tr')
    for r in range(len(rows)):
        # 1. If the current row is a header row, read the year from it.
        # 2. Otherwise, grab the link and text from each cell.
        header = rows[r].xpath('th/text()')
        if header:
            found = re.findall(r'\d{4}', header[0].extract())
            if not found:
                found = ['2014']  # header rows without a year default to 2014
            year = found[0].encode('ascii', 'ignore').strip()
        else:
            group = default(FILEGROUP)
            for l in rows[r].xpath('td//a[contains(@href, "lib/cid")]'):
                text = l.xpath('text()').extract()
                if text:
                    item = default(FILEITEM)
                    url = l.xpath('@href').extract()[0]
                    url = urlparse.urljoin(response.url, url.strip()).encode('ascii', 'ignore')
                    name = text[0].encode('ascii', 'ignore')
                    name = re.sub(r'[\\\?\*"\.><\|\r\n]', '', name)
                    name = re.sub(r'[:/]', '-', name)
                    item['source'] = url
                    item['name'] = name
                    item['state'] = state
                    item['year'] = year
                    group['items'].append(item)
            logging.info('SCRAPED > ' + str(len(group['items'])))
            yield group
def parse_items(self, response):
    sel = Selector(response)
    # Find the name of the order
    title = sel.xpath('//h1[@id="page-title"]/text()').extract()[0].encode('ascii', 'ignore')
    # Find the date of the order
    date = sel.xpath('//span[@class="date-display-single"]/text()').extract()[0]
    year = re.findall(r'\d{4}', date)[0].encode('ascii', 'ignore')
    # Find the files we want to download
    files = sel.xpath('//span[@class="file"]//a')
    group = default(FILEGROUP)
    for f in files:
        item = default(FILEITEM)
        url = f.xpath('@href').extract()[0]
        # Reconstruct the absolute path to the file
        url = urlparse.urljoin(response.url, url.strip()).encode('ascii', 'ignore')
        name = f.xpath('text()').extract()[0].replace('.pdf', '')
        name = (re.sub(r'[\\\?\*"\.><\|\r\n]', '', title).replace('/', ' ') +
                ' - ' + re.sub(r'[\\\?\*"\.><\|\r\n]', '', name)).replace(', ', ',')
        item['source'] = url
        item['name'] = name
        item['state'] = state
        item['year'] = year
        group['items'].append(item)
    logging.info('SCRAPED > ' + str(len(group['items'])))
    yield group
def parse(self, response):
    sel = Selector(response)
    rows = sel.xpath('//table[@class="datatable"]//tr')
    for r in rows:
        col = r.xpath('td')
        if col:
            item = default(FILEITEM)
            group = default(FILEGROUP)
            name = col[0].xpath('text()').extract()[0]
            url = col[1].xpath('a//@href').extract()[0]
            year = col[2].xpath('text()').extract()[0]
            name = re.sub(' +', ' ', name).strip().encode('ascii', 'ignore')
            name = re.sub(r'[\\\?\*"\.><\|\r\n]', '', name)
            name = re.sub(r'[/:]', '-', name)
            name = name.replace(';', ',')
            url = urlparse.urljoin(response.url, url.strip())
            year = re.findall(r'\d{4}', year)[0].encode('ascii', 'ignore')
            if year == '1007':  # fix a known typo in the source data
                year = '2007'
            item['source'] = url.encode('ascii', 'ignore')
            item['name'] = name
            item['state'] = state
            item['year'] = year
            group['items'].append(item)
            logging.info('SCRAPED > ' + str(len(group['items'])))
            yield group
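# Nearly every callback repeats the same cleanup: collapse runs of
# spaces, strip characters that are illegal in filenames, and pull a
# four-digit year out of free text. A sketch of shared helpers that
# could consolidate the pattern (clean_name and extract_year are
# hypothetical names, not part of the original code):
import re

FILENAME_ILLEGAL = re.compile(r'[\\\?\*"\.><\|\r\n]')
YEAR = re.compile(r'\d{4}')


def clean_name(text):
    # Collapse whitespace, drop filename-hostile characters, and turn
    # path separators into hyphens.
    text = re.sub(' +', ' ', text).strip().encode('ascii', 'ignore')
    text = FILENAME_ILLEGAL.sub('', text)
    return re.sub(r'[:/]', '-', text)


def extract_year(text, fallback=None):
    # Return the first four-digit year found in text, or the fallback.
    match = YEAR.search(text)
    return match.group(0) if match else fallback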
def parse_items(self, response):
    sel = Selector(response)
    # Find the date of the order
    date = ' '.join(sel.xpath('//td[@class="title"]/text()').extract())
    year = re.findall(r'\d{4}', date)[0].encode('ascii', 'ignore').strip()
    # Find the files we want to download. Rows come in three shapes:
    # 1. A single link with the company name as the link text
    # 2. A line of text with the company name, plus one or more document links
    # 3. Multiple links, the top one carrying the company name
    rows = sel.xpath('//table[@id="blue_table"]//tr')
    for r in rows:
        group = default(FILEGROUP)
        col = r.xpath('td')
        if not col:
            continue  # nothing to scrape in an empty row
        # The company name can sit at several different depths; try each
        # selector in turn and keep the first match.
        compname = ''
        title_selectors = ('div//p/text()', 'div//text()', 'div//span/text()',
                           'p//span/text()', 'p/text()', 'span/text()', 'text()')
        for ts in title_selectors:
            titles = col[0].xpath(ts)
            if titles:
                compname = re.sub(' +', ' ', titles[0].extract()).strip().encode('ascii', 'ignore')
                compname = re.sub(r'[\\\?\*"\.><\|\r\n]', '', compname)
                break
        # If no title was found, fall back to the first link's text as the
        # company name.
        if compname == '':
            for ts in ('a/text()', 'div//a/text()'):
                titles = col[0].xpath(ts)
                if titles:
                    compname = re.sub(' +', ' ', titles[0].extract()).strip().encode('ascii', 'ignore')
                    compname = re.sub(r'[\\\?\*"\.><\|\r\n]', '', compname)
                    break
        # Links can also sit at several depths; keep the first non-empty set.
        links = col[0].xpath('a') or col[0].xpath('div//a') or col[0].xpath('p//a')
        for l in links:
            item = default(FILEITEM)
            name = l.xpath('text()').extract()[0]
            name = re.sub(' +', ' ', name).strip().encode('ascii', 'ignore')
            name = re.sub(r'[\\\?\*"\.><\|\r\n]', '', name)
            url = l.xpath('@href').extract()[0]
            url = urlparse.urljoin(response.url, url.strip()).encode('ascii', 'ignore')
            item['source'] = url
            item['name'] = compname + ' ' + name  # prefix each document with the company name
            item['state'] = state
            item['year'] = year
            group['items'].append(item)
        logging.info('SCRAPED > ' + str(len(group['items'])))
        yield group
def parse(self, response):
    sel = Selector(response)
    rows = sel.xpath('//body//td[@id="awt-middle-col"]//table//tr')
    curyear = 0
    for i in range(1, len(rows)):
        # Year headers appear as <strong> cells; carry the last one seen
        # forward to the rows beneath it.
        year = rows[i].xpath('td//strong/text()').extract()
        if year:
            curyear = year[0].encode('ascii', 'ignore')
        year = curyear
        # The rows of the table are structured three different ways:
        # 1. text directly within <td>
        # 2. within <td><div>
        # 3. within <td><p>
        # Try each selector and keep the first title that survives cleanup.
        comp = ''
        for selector in ('td/text()', 'td//div/text()', 'td//p/text()'):
            text = rows[i].xpath(selector).extract()
            if text:
                cleaned = re.sub(' +', ' ', text[0]).rstrip().encode('ascii', 'ignore')
                cleaned = re.sub(r'[\\\?\*"\.><\|\r\n]', '', cleaned)
                cleaned = re.sub(r'[:/]', '-', cleaned)
                cleaned = cleaned.replace(' - ', '').replace(' -', '')
                if cleaned:
                    comp = cleaned
                    break
        group = default(FILEGROUP)
        docs = rows[i].xpath('td//div/a')
        for l in docs:
            item = default(FILEITEM)
            lastpart = l.xpath('text()').extract()[0].rstrip().encode('ascii', 'ignore')
            if len(docs) == 1 or 'PDF' not in lastpart:
                url = l.xpath('@href').extract()[0]
                url = urlparse.urljoin(response.url, url.strip())
                name = comp + ' - ' + lastpart
                logging.info(lastpart + ': ' + name)
                item['source'] = url.encode('ascii', 'ignore').replace(' ', '%20')
                item['name'] = name
                item['state'] = state
                item['year'] = year
                group['items'].append(item)
        logging.info('SCRAPED > ' + str(len(group['items'])))
        yield group