def next(self, parse_each=True):
    """Load the RFP list page and return the RFPs found on it.

    The page at ``self.get_list_uri()`` is loaded through ``self.load()``
    and parsed into ``self.doc``; ``self.parse_list()`` then extracts the
    summary dictionaries (the original author notes a page holds at most
    10 RFPs).

    If ``parse_each`` is false, the summaries from ``parse_list()`` are
    returned as they are. Otherwise each summary is passed through
    ``self.parse_details()`` to load that RFP's dedicated page, and the
    list of detailed dictionaries is returned.

    Raises IOError when ``self.request`` is still None after loading.
    A page that cannot be parsed is only logged; ``parse_list()`` still
    runs afterwards on whatever ``self.doc`` holds at that point.
    """
    # Fetch the HTML of the page listing the RFPs.
    self.load(self.get_list_uri())
    if self.request is None:
        raise IOError('Request object not initialized. Run load() first')

    try:
        self.doc = pq(self.request.read())
    except lxml.etree.XMLSyntaxError:
        logging.error('Could not parse URI: %s' % self.list_uri)

    summaries = self.parse_list()

    if parse_each:
        # Visit every RFP's own page to collect the detailed metadata.
        return [self.parse_details(summary) for summary in summaries]

    # Caller only wants the summaries taken from the list page.
    return summaries
def setup(self, uri):
    """Fetch ``uri`` and store the parsed page in ``self.doc``.

    The request is sent with ``self.headers``. If the response body
    cannot be parsed, the failure is logged and ``self.doc`` is not
    assigned by this call.
    """
    # retrieve search list
    request = urllib2.Request(uri, headers=self.headers)
    response = urllib2.urlopen(request)
    try:
        body = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    try:
        self.doc = pq(body)
    except lxml.etree.XMLSyntaxError:
        # Report the URI that was actually fetched by this call.
        logging.error('Could not parse URI: %s', uri)
def parse_list(self):
    """Extract RFP summaries from the list page held in ``self.doc``.

    Assumes ``self.doc`` already contains the parsed DOM of the page
    that lists the RFPs.

    Each summary is a dictionary with the keys ``title``, ``uri``,
    ``original_id`` and ``origin``. The summaries are stored in
    ``self.parsed_list`` and that same list is returned.

    Side effects: ``self.page`` is set to -1 when the last pagination
    link is not labelled "Next", otherwise it is incremented and
    ``self.pagination_uri`` is refreshed from that link's parent.
    ``self.pagination_data['search_profile']`` is always updated.
    """
    self.parsed_list = []

    # Table rows of the search result box; one row per RFP plus extras.
    rows = self.doc(".BoxBody_Center.FullSearchListBody_Center table tr")

    # The first row is the title row and the last one carries the
    # pagination controls; neither describes an RFP.
    rows.pop(0)
    pagination = rows.pop()

    logging.info("Got %s rows from Merx" % len(rows))

    # Pull the title and link out of every remaining row.
    for index in range(len(rows)):
        anchor = rows.eq(index).find('td').eq(5).find('a')
        uri = MerxParser.domain + anchor.attr("href")

        match = self.id_pattern.search(uri)
        if match is not None:
            original_id = match.group(1) or ""
        else:
            original_id = ""

        self.parsed_list.append({
            "title": anchor.text(),
            "uri": uri,
            "original_id": original_id,
            'origin': 'merx',
        })

    nav_links = pq(pagination).find('.NavLinkStyleLink')
    last_link = nav_links.eq(len(nav_links) - 1)

    # XXX: test stopping at last page
    if last_link.text().strip() == "Next":
        self.page += 1
        # The target is embedded in the parent's onclick attribute;
        # the slice trims the surrounding wrapper text.
        self.pagination_uri = last_link.parent().attr('onclick')[14:-3]
    else:
        # This is the last page. Mark it for future reference.
        self.page = -1
        logging.debug('Reached last self.parsed_lists page')

    # Value of the first <input> on the page, needed for pagination.
    self.pagination_data['search_profile'] = self.doc('input').eq(0).val()

    return self.parsed_list
def testParseList(self):
    """Load the RFP list page and check every summary parse_list() yields."""
    self.parser.load(self.parser.get_list_uri())
    try:
        s = self.parser.request.read()
        self.parser.doc = pq(s)
    except lxml.etree.XMLSyntaxError:
        # The URI is read from the parser; the test case itself shows no
        # list_uri attribute, so self.list_uri would fail here.
        logging.error('Could not parse URI: %s' % self.parser.list_uri)

    parsed_list = self.parser.parse_list()

    # 10 RFPs should have been parsed
    self.assertEqual(10, len(parsed_list))

    for rfp in parsed_list:
        self.assertEqual('merx', rfp['origin'])
        # The URI must start with the Merx domain.
        self.assertEqual(0, rfp['uri'].find('http://www.merx.com/'))
        self.assertNotEqual('', rfp['title'])
        self.assertNotEqual('', rfp['original_id'])
def testParseList(self):
    """Load the RFP list page and check every summary parse_list() yields."""
    self.parser.load(self.parser.get_list_uri())
    try:
        s = self.parser.request.read()
        self.parser.doc = pq(s)
    except lxml.etree.XMLSyntaxError:
        # The URI is read from the parser; the test case itself shows no
        # list_uri attribute, so self.list_uri would fail here.
        logging.error('Could not parse URI: %s' % self.parser.list_uri)

    parsed_list = self.parser.parse_list()

    # 10 RFPs should have been parsed
    self.assertEqual(10, len(parsed_list))

    for rfp in parsed_list:
        self.assertEqual('merx', rfp['origin'])
        # The URI must start with the Merx domain.
        self.assertEqual(0, rfp['uri'].find('http://www.merx.com/'))
        self.assertNotEqual('', rfp['title'])
        self.assertNotEqual('', rfp['original_id'])
def parse_details(self, l):
    """Load one RFP's dedicated page and return its parsed details.

    ``l`` is a summary dictionary providing the keys ``uri``, ``title``,
    ``original_id`` and ``origin``. The page at ``l['uri']`` is loaded,
    parsed into ``self.doc`` and handed to ``self.parse_rfp()``; the four
    summary values are then copied onto the resulting dictionary, which
    is returned.

    Raises lxml.etree.XMLSyntaxError (after logging the URI) when the
    page cannot be parsed.
    """
    try:
        self.load((l['uri'], ''))
        s = self.request.read()
        self.doc = pq(s)

        # Parse page's data and stash results in a dictionary
        rfp = self.parse_rfp()
        for key in ('title', 'original_id', 'origin', 'uri'):
            rfp[key] = l[key]
    except lxml.etree.XMLSyntaxError:
        # l is a dict: index it, attribute access would raise
        # AttributeError and hide the real parse error.
        logging.error('Could not parse RFP: %s' % l['uri'])
        # Bare raise keeps the original traceback intact.
        raise

    return rfp
def setup(self, uri):
    """Log in, keep the session cookie, then fetch and parse ``uri``.

    A login request built from ``self.domain + self.login_url`` and
    ``self.login_parameters`` is sent first. Any ``Set-Cookie`` header in
    its response is copied into ``self.headers['Cookie']``; when several
    are present the last one wins. ``uri`` is then requested with the
    updated headers and the parsed page is stored in ``self.doc``.

    If the page cannot be parsed, the failure is logged and ``self.doc``
    is not assigned by this call.
    """
    # login
    request = urllib2.Request(self.domain + self.login_url,
                              urllib.urlencode(self.login_parameters),
                              self.headers)
    response = urllib2.urlopen(request)
    try:
        raw_headers = response.info().headers
    finally:
        # Only the headers are needed from the login response.
        response.close()

    # get cookie
    for header in raw_headers:
        if header.startswith(('Set-Cookie', 'set-cookie')):
            # [12:] skips the 12-character "Set-Cookie: " prefix; [:-2]
            # presumably trims the trailing CRLF of the raw header line.
            self.headers['Cookie'] = header[12:].replace('; path=/', '')[:-2]

    # retrieve search list
    request = urllib2.Request(uri, headers=self.headers)
    response = urllib2.urlopen(request)
    try:
        body = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    try:
        self.doc = pq(body)
    except lxml.etree.XMLSyntaxError:
        # Report the URI that was actually fetched by this call.
        logging.error('Could not parse URI: %s', uri)