def scrape_page(carpool_page_url):
    """Scrape a carpool forum listing page and return the posts found on it.

    Fetches the listing page, walks every "normal" thread on it, extracts the
    carpool date, from/to locations and a phone number from each thread's
    post page, and collects them as Post objects.

    Args:
        carpool_page_url: URL of the forum listing page to scrape.

    Returns:
        list[Post]: one Post per thread that yielded a URL, a date and a
        location pair. Threads that fail to parse are skipped (best-effort).
    """
    carpool_page = parse_to_soup(carpool_page_url)
    threads = carpool_page.find_all(is_normal_thread)
    post_list = []
    # Reference date used both for relative-date extraction and as scrape_date.
    # NOTE(review): uses datetime.datetime.today() here but parse_request uses
    # datetime.today() — only one import style can be right; confirm the
    # file-level import.
    today = datetime.datetime.today()
    for thread in threads:
        try:
            title = thread.find('span', {'class': 'comiis_common'})
            print('Getting thread %s ' % title)
            link = title.find('a', {'onclick': 'atarget(this)'})
            normalized_txt = regex_util.normalize(link.text)
            date = date_extractor.extract_date_info(normalized_txt, today)
            location_pair = location_extractor.extract_location_info(normalized_txt)
            post_url = link['href']
            post_page = parse_to_soup(post_url)  # fetch the thread's own page
            # Phone number lives in the main (first) post of the thread.
            phone_num = scrape_phone_num(getMainPost(post_page))
            # Only keep threads where every required field was extracted.
            if post_url and date and location_pair:
                post_list.append(Post(carpool_date=date,
                                      from_location=location_pair[0],
                                      to_location=location_pair[1],
                                      phone=phone_num,
                                      url=post_url,
                                      original_title=link.text,
                                      scrape_date=today))
        except Exception as e:
            # Best-effort scraping: log the failure and move on to the next
            # thread. The original code dumped the traceback three redundant
            # ways using Python-2 print statements (a SyntaxError on Python 3);
            # a single print_exc() emits the same information once.
            print(e)
            traceback.print_exc(file=sys.stdout)
    return post_list
def parse_request(txt, reference_date=None):
    """Parse a free-text carpool request into a RequestInfo.

    Args:
        txt: raw request text to parse.
        reference_date: datetime used to resolve relative dates ("tomorrow").
            Defaults to the current datetime *at call time*. The original
            default ``datetime.today()`` was evaluated once at import time,
            so a long-running process would silently parse every request
            against a stale date — the classic mutable/early-bound default
            bug, fixed with the None sentinel idiom.

    Returns:
        RequestInfo(date, from_location, to_location) when a location pair
        was found in the text, otherwise None.
    """
    if reference_date is None:
        reference_date = datetime.today()
    normalized_txt = regex_util.normalize(txt)
    date = date_extractor.extract_date_info(normalized_txt, reference_date)
    location_info = location_extractor.extract_location_info(normalized_txt)
    if location_info:
        return RequestInfo(date, location_info[0], location_info[1])
    return None