def get_new_loan_pages(self, wait, N):
    """Fetch every new loan page in batches of N and upsert each into the DB.

    Parameters:
        wait: crawl delay, passed straight through to PageCrawler.
        N: int, maximum number of loan pages to fetch per crawler batch.

    Side effects:
        Upserts one document per loan into ``self.loans`` and prints a
        progress line for each insert.
    """
    parser = LoanPageParser()
    loanids = self.new_loans_set()
    num_loanids = len(loanids)
    counter = 0
    # Pull N loan pages at a time and insert into DB.
    while loanids:
        crawler = PageCrawler(self.loan_page_url, self.loan_page_login_str, wait)
        # Take up to N ids for this batch; bounding with min() replaces the
        # old pop-until-IndexError idiom and works for both lists and sets.
        loans_to_grab = [loanids.pop() for _ in range(min(N, len(loanids)))]
        crawler.crawl(loans_to_grab)
        loans = crawler.get_data()
        # Parse each fetched page and upsert it keyed on its loanID.
        for loanID in loans:
            db_doc = parser.parse_html(loans[loanID])
            self.loans.update({'loanID': db_doc['loanID']},
                              {'$set': db_doc},
                              upsert=True, safe=True)
            counter += 1
            # Parenthesized single-argument print works under Python 2 and 3.
            print('inserted loan %s of %s' % (counter, num_loanids))
def get_note_pages(self, note_tups, wait):
    """Crawl the note pages for the given note tuples and return the crawler's data."""
    crawler = PageCrawler(self.note_page_url, self.login_str, wait)
    crawler.crawl(note_tups)
    return crawler.get_data()