def scrape_bid_description(self, bid):
    self.br.open(bid.url)
    s = soupify(self.br.response().read())
    # Find the link to the bid document and fetch it.
    r = re.compile(r'showdocument\?id=\d+$')
    a = s.find('a', href=r)
    u = urlparse.urljoin(self.br.geturl(), a['href'])
    self.br.open(u)
    d = self.br.response().read()
    # The document is a PDF; convert it to HTML before extracting text.
    s = soupify(pdftohtml(d))
    bid.description = get_all_text(s.html.body)
    bid.save()
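# The scrapers in this file share a handful of helpers (soupify,
# get_all_text, pdftohtml, doctohtml) and the usual stdlib imports, all
# defined elsewhere in the project. Below is a minimal, hypothetical
# sketch of the helpers, inferred only from how they are called here;
# the real implementations may well differ.
import datetime
import re
import subprocess
import tempfile
import time
import urllib
import urlparse

import mechanize
from BeautifulSoup import BeautifulSoup  # bs4's BeautifulSoup also works

def soupify(markup):
    # Parse raw HTML into a BeautifulSoup tree.
    return BeautifulSoup(markup)

def get_all_text(tag):
    # Collapse a tag's text nodes into one whitespace-normalized string.
    if tag is None:
        return ''
    return ' '.join(' '.join(tag(text=True)).split())

def pdftohtml(data):
    # Convert PDF bytes to HTML with poppler's pdftohtml CLI, which
    # wants a real file on disk.
    with tempfile.NamedTemporaryFile(suffix='.pdf') as f:
        f.write(data)
        f.flush()
        return subprocess.check_output(
            ['pdftohtml', '-stdout', '-i', '-noframes', f.name])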
def scrape_bids(self):
    self.br.open(self.org.bids_page_url)
    s = soupify(self.br.response().read())
    # Match table cells whose text starts with "Solicitation".
    r = re.compile(r'^\s*Solicitation')
    f = lambda x: x.name == 'td' and re.search(r, x.text)
    self.org.bid_set.all().delete()
    for td in s.findAll(f):
        tr = td.findParent('tr')
        tds = tr.findAll('td')
        bid = Bid(org=self.org)
        bid.title = tds[0].text
        bid.url = self.br.geturl()
        bid.description = get_all_text(tds[1])
        z = re.search(self.date_regex, tds[-3].text)
        if z:
            m, d, y = z.groups()
            bid.due_date = datetime.date(day=int(d), month=int(m),
                                         year=int(y))
        bid.save()
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    r = re.compile(r'^index\.php\?m=procurement&id=\d+$')
    for a in s.findAll('a', href=r):
        tr = a.findParent('tr')
        td = tr.findAll('td')
        # Skip anything that is no longer open for bidding.
        if td[-1].text != 'OPEN':
            continue
        z = re.search(self.date_regex, td[1].contents[2])
        bid = Bid(org=self.org)
        bid.title = a.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        if z:
            m, d, y = z.groups()
            bid.due_date = datetime.date(day=int(d), month=int(m),
                                         year=int(y))
        bid.location = self.org.location
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    r = re.compile(r'^/purchasing/bid-form\?bidnumb=\d+')
    for a in s.findAll('a', href=r):
        bid = Bid(org=self.org)
        bid.title = a.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        bid.location = self.org.location
        # The due date follows the link as text like "Due: March 1, 2013".
        d = a.findNext(text=re.compile(r'Due:'))
        d = d.split(':')[1].strip()
        try:
            t = time.strptime(d, "%B %d, %Y")
            bid.due_date = datetime.date(day=t.tm_mday, month=t.tm_mon,
                                         year=t.tm_year)
        except ValueError:
            pass  # leave due_date unset if the date fails to parse
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    r = re.compile(r'bids\.aspx\?bidID=\d+$')
    v = re.compile(r'(\d{1,2})/(\d{1,2})/(\d{4})')  # date regex
    for a in s.findAll('a', href=r):
        # Skip links that carry inline styling.
        if a.get('style', False):
            continue
        tr = a.findParent('tr')
        td = tr.findAll('td')
        sp = td[-1].findAll('span')
        z = re.search(v, sp[-1].text)
        bid = Bid(org=self.org)
        bid.title = a.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        if z:
            m, d, y = z.groups()
            bid.due_date = datetime.date(day=int(d), month=int(m),
                                         year=int(y))
        bid.location = self.org.location
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    r = re.compile(r'viewbid\.aspx\?bid_id=\d+$')
    for a in s.findAll('a', href=r):
        tr = a.findParent('tr')
        td = tr.findAll('td')
        z = re.search(self.date_regex, td[-2].text)
        bid = Bid(org=self.org)
        bid.title = td[1].text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        if z:
            m, d, y = z.groups()
            bid.due_date = datetime.date(day=int(d), month=int(m),
                                         year=int(y))
        bid.location = self.org.location
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    r = re.compile(r'^files/[^.]+\.pdf$')
    for a in s.findAll('a', href=r):
        bid = Bid(org=self.org)
        bid.title = a.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        bid.location = self.org.location
        # The submission date lives in a nearby "lblSubDate" span.
        x = re.compile(r'lblSubDate')
        p = a.findNext('span', id=x)
        if p:
            z = re.search(self.date_regex, p.text)
            if z:
                m, d, y = z.groups()
                bid.due_date = datetime.date(day=int(d), month=int(m),
                                             year=int(y))
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    r = re.compile(r'viewbid\?id=\d+$')
    for a in s.findAll('a', href=r):
        tr = a.findParent('tr')
        td = tr.findAll('td')
        z = re.search(self.date_regex, td[-1].text)
        bid = Bid(org=self.org)
        bid.title = td[1].span.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        if z:
            m, d, y = z.groups()
            bid.due_date = datetime.date(day=int(d), month=int(m),
                                         year=int(y))
        bid.location = self.org.location
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    r = re.compile(r'\.pdf$')
    z = re.compile(r'^RFP -')
    # PDF links whose text starts with "RFP -".
    f = lambda x: (x.name == 'a' and re.search(r, x.get('href', ''))
                   and re.search(z, x.text))
    for a in s.findAll(f):
        bid = Bid(org=self.org)
        bid.title = a.strong.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        bid.location = self.org.location
        # Date text trails the <strong> title; drop the leading clause
        # before the first comma and the trailing time token.
        d = a.strong.nextSibling
        d = ' '.join(d.split())
        d = d.split(',', 1)[1].strip()
        d = d.rsplit(' ', 1)[0].strip()
        try:
            t = time.strptime(d, "%b %d, %Y")
            bid.due_date = datetime.date(day=t.tm_mday, month=t.tm_mon,
                                         year=t.tm_year)
        except ValueError:
            pass  # leave due_date unset if the date fails to parse
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    # Note the trailing space in the table's summary attribute.
    x = {'summary': 'Current Bid Postings '}
    t = s.find('table', attrs=x)
    r = re.compile(r'/Bids/')
    for a in t.findAll('a', href=r):
        tr = a.findParent('tr')
        td = tr.findAll('td')
        z = re.search(self.date_regex, td[-2].text)
        bid = Bid(org=self.org)
        bid.title = a.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        if z:
            m, d, y = z.groups()
            bid.due_date = datetime.date(day=int(d), month=int(m),
                                         year=int(y))
        bid.location = self.org.location
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    # Match anchors with either link text; the tag-name check must
    # apply to both, so test membership rather than chaining and/or.
    f = lambda x: x.name == 'a' and x.text in ('Invitation for Bid',
                                               'Summary')
    for a in s.findAll(f):
        # The bid title sits three siblings back from the link.
        t = a.previousSibling.previousSibling.previousSibling
        bid = Bid(org=self.org)
        bid.title = t
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        bid.location = self.org.location
        tr = a.findParent('tr')
        d = tr.find(text=re.compile(r'\bDue:'))
        if d:
            d = d.split(':')[1].strip()
            d = d.split('at')[0].strip()
            try:
                t = time.strptime(d, "%B %d, %Y")
                bid.due_date = datetime.date(day=t.tm_mday, month=t.tm_mon,
                                             year=t.tm_year)
            except ValueError:
                pass  # leave due_date unset if the date fails to parse
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    r = re.compile(r'bids\.aspx\?bidID=\d+$')
    v = re.compile(r'(\d{1,2})/(\d{1,2})/(\d{4})')  # date regex
    for a in s.findAll('a', href=r):
        # Skip links that carry inline styling.
        if a.get('style', False):
            continue
        tr = a.findParent('tr')
        td = tr.findAll('td')
        sp = td[-1].findAll('span')
        z = re.search(v, sp[-1].text)
        bid = Bid(org=self.org)
        bid.title = a.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        if z:
            m, d, y = z.groups()
            bid.due_date = datetime.date(day=int(d), month=int(m),
                                         year=int(y))
        bid.location = self.org.location
        bids.append(bid)
    return bids
def scrape_bid_description(self, bid):
    self.br.open(bid.url)
    s = soupify(self.br.response().read())
    x = {'summary': 'Bid Details'}
    t = s.find('table', attrs=x)
    # Contact info sits under a "BidListHeader" span reading "Contact...".
    c = re.compile(r'^Contact', re.IGNORECASE)
    f = lambda tag: (tag.name == 'span'
                     and 'BidListHeader' in tag.attrs.get('class', [])
                     and re.search(c, tag.text))
    p = t.find(f)
    if p:
        tr = p.findNext('tr')
        bid.contact = tr.text.strip()
        e = re.compile(r'\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b',
                       re.IGNORECASE)  # email regex
        v = tr.find(text=e)
        if v:
            m = re.search(e, v)
            bid.email = m.group(0)
    bid.description = get_all_text(t)
    bid.save()
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    r = re.compile(r'\bRFP[^.]+\.pdf$')
    for a in s.findAll('a', href=r):
        tr = a.findParent('tr')
        td = tr.findAll('td')
        bid = Bid(org=self.org)
        bid.title = a.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        bid.location = self.org.location
        # Drop the leading weekday clause and any trailing "at <time>".
        d = td[-1].text.split(',', 1)[1]
        d = d.split('at')[0].strip()
        try:
            t = time.strptime(d, "%B %d, %Y")
            bid.due_date = datetime.date(day=t.tm_mday, month=t.tm_mon,
                                         year=t.tm_year)
        except ValueError:
            pass  # leave due_date unset if the date fails to parse
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    # Note the trailing space in the table's summary attribute.
    x = {'summary': 'Current Bid Postings '}
    t = s.find('table', attrs=x)
    r = re.compile(r'/Bids/')
    for a in t.findAll('a', href=r):
        tr = a.findParent('tr')
        td = tr.findAll('td')
        z = re.search(self.date_regex, td[-2].text)
        bid = Bid(org=self.org)
        bid.title = a.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        if z:
            m, d, y = z.groups()
            bid.due_date = datetime.date(day=int(d), month=int(m),
                                         year=int(y))
        bid.location = self.org.location
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    r = re.compile(r'^files/[^.]+\.pdf$')
    for a in s.findAll('a', href=r):
        bid = Bid(org=self.org)
        bid.title = a.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        bid.location = self.org.location
        # The submission date lives in a nearby "lblSubDate" span.
        x = re.compile(r'lblSubDate')
        p = a.findNext('span', id=x)
        if p:
            z = re.search(self.date_regex, p.text)
            if z:
                m, d, y = z.groups()
                bid.due_date = datetime.date(day=int(d), month=int(m),
                                             year=int(y))
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    r = re.compile(r'^BidBoard\.cfm\?BidID=\d+$')
    for a in s.findAll('a', href=r):
        tr = a.findParent('tr')
        td = tr.findAll('td')
        bid = Bid(org=self.org)
        bid.title = a.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        bid.location = self.org.location
        # This site's dates may use dots as separators and 2-digit years.
        v = re.compile(r'(\d{1,2})[/.](\d{1,2})[/.](\d{2,4})')  # date regex
        z = re.search(v, td[-4].text)
        if z:
            m, d, y = z.groups()
            bid.due_date = datetime.date(day=int(d), month=int(m),
                                         year=int(y))
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    y = re.compile(r'^/LinkClick\.aspx\?')
    z = re.compile(r'^Bid:')
    f = lambda x: (x.name == 'a' and re.search(y, x.get('href', ''))
                   and re.search(z, x.text))
    for a in s.findAll(f):
        td = a.findParent('td')
        t = td.find(text='Title:')
        t = t.parent.nextSibling
        bid = Bid(org=self.org)
        bid.title = t
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        # LinkClick URLs can contain characters that need escaping.
        bid.url = urllib.quote(bid.url, ':/')
        bid.location = self.org.location
        d = a.findNext(text=re.compile(r'^Close Date:'))
        d = d.parent.nextSibling.strip()
        try:
            t = time.strptime(d, "%B %d, %Y")
            bid.due_date = datetime.date(day=t.tm_mday, month=t.tm_mon,
                                         year=t.tm_year)
        except ValueError:
            pass  # leave due_date unset if the date fails to parse
        bids.append(bid)
    return bids
def scrape_bid_description(self, bid):
    self.br.open(bid.url)
    d = self.br.response().read()
    # The bid document is a Word file; convert it to HTML first.
    s = soupify(doctohtml(d))
    bid.description = get_all_text(s.html.body)
    bid.save()
def scrape_bid_description(self, bid):
    self.br.open(bid.url)
    d = self.br.response().read()
    # The bid document is a PDF; convert it to HTML first.
    s = soupify(pdftohtml(d))
    bid.description = get_all_text(s.html.body)
    bid.save()
def scrape_bid_description(self, bid):
    self.br.open(bid.url)
    s = soupify(self.br.response().read())
    # The description is the table enclosing the page's <h1>.
    h = s.find('h1')
    t = h.findParent('table')
    bid.description = get_all_text(t)
    bid.save()
def scrape_bid_description(self, bid):
    self.br.open(bid.url)
    s = soupify(self.br.response().read())
    # SharePoint renders the details in an "ms-formtable" table.
    x = {'class': 'ms-formtable'}
    t = s.find('table', attrs=x)
    bid.description = get_all_text(t)
    bid.save()
def scrape_bid_description(self, bid):
    self.br.open(bid.url)
    s = soupify(self.br.response().read())
    r = re.compile(r'PurchasingBids')
    t = s.find('table', id=r)
    bid.description = get_all_text(t)
    bid.save()
def scrape_bid_description(self, bid):
    self.br.open(bid.url)
    s = soupify(self.br.response().read())
    x = {'class': 'bidViewResultsLeft'}
    t = s.find('td', attrs=x)
    bid.description = get_all_text(t)
    bid.save()
def scrape_bid_description(self, bid):
    self.br.open(bid.url)
    s = soupify(self.br.response().read())
    x = {'class': 'content'}
    d = s.find('div', attrs=x)
    bid.description = get_all_text(d)
    bid.save()
def scrape_bid_description(self, bid):
    self.br.open(bid.url)
    s = soupify(self.br.response().read())
    t = s.find(text=re.compile(r'^Contact:'))
    if t:
        p = t.findParent('p')
        bid.contact = get_all_text(p)
    # Follow the "Download Bid Package" link and extract the PDF's text.
    f = lambda x: x.name == 'a' and x.text == 'Download Bid Package'
    a = s.find(f)
    u = urlparse.urljoin(self.br.geturl(), a['href'])
    self.br.open(u)
    d = self.br.response().read()
    s = soupify(pdftohtml(d))
    bid.description = get_all_text(s.html.body)
    bid.save()
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    pageno = 2  # page 1 is already open; numbered links start at 2
    done = False
    while True:
        s = soupify(self.br.response().read())
        r = re.compile(r'^/node/\d+$')
        for a in s.findAll('a', href=r):
            li = a.findParent('li')
            x = {'class': 'views-field-title'}
            title_div = li.find('div', attrs=x)
            y = {'class': 'views-field-field-closing-date-value'}
            closing_date_div = li.find('div', attrs=y)
            z = re.compile(r'(\d{2})/(\d{2})/(\d{4})')
            x = re.search(z, closing_date_div.span.text)
            # The listing shows open bids first; stop at the first bid
            # whose closing date has already passed.
            m, d, yr = [int(n) for n in x.groups()]
            closing_date = datetime.date(yr, m, d)
            today = datetime.date.today()
            if today > closing_date:
                done = True
                break
            bid = Bid(org=self.org)
            bid.title = title_div.span.text
            bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
            bid.location = self.org.location
            bids.append(bid)
        if done:
            break
        # Follow the numbered pagination links until they run out.
        try:
            self.br.follow_link(self.br.find_link(text='%d' % pageno))
            pageno += 1
        except mechanize.LinkNotFoundError:
            break
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    # Every link in the "toplevel" list is a bid.
    ul = s.find('ul', id='toplevel')
    for a in ul.findAll('a'):
        bid = Bid(org=self.org)
        bid.title = a.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        bid.location = self.org.location
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    r = re.compile(r'bid[^.]+\.pdf$')
    for a in s.findAll('a', href=r):
        bid = Bid(org=self.org)
        bid.title = a.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        bid.location = self.org.location
        bids.append(bid)
    return bids
def scrape_bid_description(self, bid):
    self.br.open(bid.url)
    s = soupify(self.br.response().read())
    # Contact details live in the table headed by an "Agent..." label.
    r = re.compile(r'^Agent')
    f = lambda x: x.name == 'b' and re.search(r, x.text)
    b = s.find(f)
    if b:
        t = b.findParent('table')
        bid.contact = get_all_text(t)
    b = s.find('blockquote')
    bid.description = get_all_text(b)
    bid.save()
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    d = s.find('div', id=re.compile(r'^_ctl\d+_content$'))
    r = re.compile(r'^Modules/ShowDocument\.aspx\?documentid=\d+$')
    for a in d.findAll('a', href=r):
        bid = Bid(org=self.org)
        bid.title = a.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        bid.location = self.org.location
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    r = re.compile(r'^/DocumentView\.aspx\?DID=\d+$')
    # Only titled "Hyperlink" anchors point at bid documents.
    x = {'href': r, 'class': 'Hyperlink', 'title': True}
    for a in s.findAll('a', attrs=x):
        bid = Bid(org=self.org)
        bid.title = a['title']
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        bid.location = self.org.location
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    r1 = re.compile(r'\.pdf$')
    r2 = re.compile(r'window\.open')
    x = {'href': r1, 'onclick': r2}
    for a in s.findAll('a', attrs=x):
        # Only PDF links labeled "Bid Number:" are bids.
        if not a.previous.strip() == 'Bid Number:':
            continue
        title = a.findPrevious(text=re.compile(r'Title:'))
        title = re.sub(r'Title:', '', title)
        bid = Bid(org=self.org)
        bid.title = title
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        bid.location = self.org.location
        # Strip the "Due Date/Time:" label and any trailing "@ <time>".
        d = a.findNext(text=re.compile(r'^Due Date/Time:'))
        d = d.split(':')[1].strip()
        d = d.split('@')[0].strip()
        try:
            t = time.strptime(d, "%B %d, %Y")
            bid.due_date = datetime.date(day=t.tm_mday, month=t.tm_mon,
                                         year=t.tm_year)
        except ValueError:
            pass  # leave due_date unset if the date fails to parse
        tr = a.findParent('tr')
        p = tr.find(text=re.compile(r'Project Manager:'))
        if p:
            bid.contact = p.parent.text
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    r1 = re.compile(r'^RFP')
    r2 = re.compile(r'\.pdf$')
    # PDF links whose text starts with "RFP".
    f = lambda x: (x.name == 'a' and re.search(r2, x.get('href', ''))
                   and re.search(r1, x.text))
    for a in s.findAll(f):
        bid = Bid(org=self.org)
        bid.title = a.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        bid.location = self.org.location
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    r = re.compile(r'^_ctl\d+_listDataGrid$')
    t = s.find('table', id=r)
    r = re.compile(r'^index\.aspx\?recordid=\d+')
    for a in t.findAll('a', href=r):
        bid = Bid(org=self.org)
        bid.title = a.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        bid.location = self.org.location
        bids.append(bid)
    return bids
def scrape_bid_description(self, bid):
    self.br.open(bid.url)
    s = soupify(self.br.response().read())
    # Contact details sit in the page's "promo" section.
    y = {'class': 'promo'}
    n = s.find('section', attrs=y)
    bid.contact = n.text
    m = re.search(self.email_regex, n.text)
    if m:
        bid.email = m.group(0)
    x = {'class': 'body-content'}
    d = s.find('div', attrs=x)
    bid.description = get_all_text(d)
    bid.save()
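# Several scrapers above reference self.date_regex and self.email_regex
# without defining them. Plausible class-level definitions, inferred
# from the (month, day, year) unpacking and from the inline email
# pattern in the "Bid Details" scraper; the project's real attributes
# may differ.
date_regex = re.compile(r'(\d{1,2})/(\d{1,2})/(\d{4})')
email_regex = re.compile(r'\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b',
                         re.IGNORECASE)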
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    r = re.compile(r'\.pdf$')
    # "Read more" links point at the bid PDFs; the preceding <h3>
    # holds the title.
    f = lambda x: (x.name == 'a' and re.search(r, x.get('href', ''))
                   and x.text == 'Read more')
    for a in s.findAll(f):
        h3 = a.findPrevious('h3')
        bid = Bid(org=self.org)
        bid.title = h3.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        bid.location = self.org.location
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    d = s.find('div', id='mainContent')
    r = re.compile(r'\.pdf$')
    for a in d.findAll('a', href=r):
        bid = Bid(org=self.org)
        bid.title = a.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        # Escape spaces and other unsafe characters in the PDF URL.
        bid.url = urllib.quote(bid.url, ':/')
        bid.location = self.org.location
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    r = re.compile(r'center\.egov\?path=doc&id=\d+&id2=\d+&linked=0')
    for a in s.findAll('a', href=r):
        # Skip links with no text.
        if len(a.text) == 0:
            continue
        bid = Bid(org=self.org)
        bid.title = a.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        bid.location = self.org.location
        bids.append(bid)
    return bids
def scrape_bid_links(self, url):
    bids = []
    self.br.open(url)
    s = soupify(self.br.response().read())
    r = re.compile(r'/Currentbids/[^.]+\.pdf$')
    x = {'class': 'learn-more-content'}
    d = s.find('div', attrs=x)
    # Detach the div from the document; it remains searchable on its own.
    d.extract()
    for a in d.findAll('a', href=r):
        bid = Bid(org=self.org)
        bid.title = a.text
        bid.url = urlparse.urljoin(self.br.geturl(), a['href'])
        bid.location = self.org.location
        bids.append(bid)
    return bids
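# A hypothetical driver showing how these pieces fit together: collect
# the links, persist them, then fill in each description. It assumes
# each scraper instance carries an Org and a mechanize Browser, as the
# methods above do; the project's real runner may differ.
def run_scraper(scraper):
    scraper.org.bid_set.all().delete()  # drop stale bids, as scrape_bids does
    for bid in scraper.scrape_bid_links(scraper.org.bids_page_url):
        bid.save()
        try:
            scraper.scrape_bid_description(bid)
        except Exception:
            pass  # one bad detail page shouldn't abort the whole run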