def _store_data(self, year, match):
    """Record an exam PDF and link it to every course the match refers to.

    Args:
        year: Value stored in the exam's ``year`` attribute.
        match: Regex match object, or a falsy value when nothing matched.
            ``match.groups()[0]`` is upper-cased and used as the subject
            abbreviation, ``match.groups()[1]`` as the course-number prefix,
            and ``match.string`` is appended to the proxy host to form the
            PDF URL.

    Returns:
        None. Progress is reported through ``print``.
    """
    if not match:
        return

    groups = match.groups()

    # Check if there are any courses to attach the exam to
    try:
        subject = Subject.objects.get(abbreviation=groups[0].upper())
        courses = Course.objects.filter(subject=subject, number__istartswith=groups[1])
        course_total = courses.count()
    except ObjectDoesNotExist:
        # Unknown subject: treat it the same as "no matching courses".
        course_total = 0

    if course_total < 1:
        print("--No course '{0} {1}' in database".format(groups[0], groups[1]))
        return

    # Find/Create the course <-> data relation(s)
    relations = []
    for course in courses:
        relation = existing_or_new(CourseRelation, course=course)
        relation.save(was_scraped=True)
        relations.append(relation)

    exam = existing_or_new(
        Exam,
        year=year,
        pdf_url="http://library.queensu.ca.proxy.queensu.ca{0}".format(match.string),
    )
    for relation in relations:
        exam.course_rels.add(relation)
    exam.save(was_scraped=True)

    course_names = ", ".join(str(course) for course in courses)
    print("--Added exam pdf for {0} course(s): {1}".format(course_total, course_names))
def _store_data(self, year, match):
    """Store one exam PDF and attach it to the matching course relations.

    Args:
        year: Stored as the exam's ``year``.
        match: Regex match object (falsy means "nothing to store"). Its first
            group, upper-cased, selects the subject; its second group is the
            course-number prefix; ``match.string`` becomes the path part of
            the PDF URL.

    Returns:
        None. Outcome is reported via ``print``.
    """
    if not match:
        return

    abbreviation_and_number = match.groups()

    # Check if there are any courses to attach the exam to
    try:
        subject = Subject.objects.get(abbreviation=abbreviation_and_number[0].upper())
        courses = Course.objects.filter(
            subject=subject, number__istartswith=abbreviation_and_number[1]
        )
        found = courses.count()
    except ObjectDoesNotExist:
        # No such subject, so there is nothing to attach the exam to.
        found = 0

    if found < 1:
        message = "--No course '{0} {1}' in database"
        print(message.format(abbreviation_and_number[0], abbreviation_and_number[1]))
        return

    # Find/Create the course <-> data relation(s)
    linked = []
    for course in courses:
        course_relation = existing_or_new(CourseRelation, course=course)
        course_relation.save(was_scraped=True)
        linked.append(course_relation)

    pdf_url = "http://library.queensu.ca.proxy.queensu.ca{0}".format(match.string)
    exam = existing_or_new(Exam, **{"year": year, "pdf_url": pdf_url})
    for course_relation in linked:
        exam.course_rels.add(course_relation)
    exam.save(was_scraped=True)

    summary = "--Added exam pdf for {0} course(s): {1}"
    print(summary.format(found, ", ".join(map(str, courses))))
def scrape(self):
    """Scrape the bookstore booklists and store each course's textbooks.

    Steps:
      1. Fetch the booklist index and collect ``(subject, number, url)``
         tuples for course links whose subject passes the
         ``self.config.letters`` filter.
      2. For each tuple, look up the matching courses; skip the link when
         none exist in the database.
      3. Fetch the course page and read each book's labelled elements
         (title, authors, required flag, ISBNs, prices, availability,
         classifieds info).
      4. Save a ``Textbook`` linked to every matching course relation, but
         only when the book has at least one ISBN.

    Returns:
        None. Progress is reported through ``print``.
    """
    # Every per-book element id on the course page shares this shape; the
    # first field is the repeater index, the second the element suffix.
    element_id = "ctl00_ContentBody_ctl00_CourseBooksRepeater_ctl{:02d}_test_{}"

    def label_text(node, index, suffix, tag="span"):
        """Return ``.string`` of the ``suffix`` element for book ``index``."""
        return node.find(tag, {"id": element_id.format(index, suffix)}).string

    print("Starting textbook scrape")
    print("Getting a list of courses")

    r = requests.get("http://www.campusbookstore.com/Textbooks/Booklists/")
    b = BeautifulSoup(r.text)
    content = b.find("div", {"class": "thecontent"})

    course_links = []
    for link in content.find_all("a"):
        href = link.attrs.get("href", "")
        if "campusbookstore.com/Textbooks/Course/" not in href:
            continue
        # ``link.string`` is None when the anchor is empty or has more than
        # one child; fall back to "" so the search just fails to match.
        m = re.search(r"^(\D+)(\d+).*$", link.string or "")
        # Only parse letters in config
        # NOTE(review): this tests the SECOND character of the subject text
        # (index 1), and raises IndexError for a one-character prefix --
        # confirm whether index 0 was intended.
        if m and m.group(1)[1].upper() in self.config.letters:
            course_links.append((m.group(1), m.group(2), href))

    print("Parsing courses")

    for s, c, l in course_links:
        # Check if there are any courses to attach the book to
        try:
            subject = Subject.objects.get(abbreviation=s)
            courses = Course.objects.filter(subject=subject, number__istartswith=c)
            num_courses = courses.count()
        except ObjectDoesNotExist:
            num_courses = 0

        if num_courses < 1:
            print("--No course '{0} {1}' in database".format(s, c))
            continue

        # Find/Create the course <-> textbook relation(s)
        course_relations = []
        for course in courses:
            relation = existing_or_new(CourseRelation, course=course)
            relation.save(was_scraped=True)
            course_relations.append(relation)

        print(
            "--Parsing books from {0} course(s): {1}".format(
                num_courses, ", ".join([str(course) for course in courses])
            )
        )

        r = requests.get(l)
        b = BeautifulSoup(r.text)

        # Looking at the page source, 49 books seems to be the limit
        # (numbers padded to 2 digits); book indices are the even numbers.
        for i in range(0, 99, 2):
            book_id = element_id.format(i, "ModeFull")
            book = b.find("div", {"id": book_id})
            if not book:
                # First missing index marks the end of the book list.
                break

            details = book.find("table").find("table").find_all("td")[1]

            textbook_attrs = {"listing_url": l + "#" + book_id}

            # Title
            textbook_attrs["title"] = unicode(label_text(details, i, "BookTitle"))

            # Authors
            authors = label_text(details, i, "BookAuthor")
            if authors and authors[:4] == " by ":
                textbook_attrs["authors"] = authors[4:]

            # Required
            required = label_text(details, i, "StatusLabel")
            if required and "REQUIRED" in required.upper():
                textbook_attrs["required"] = True

            # ISBN 13 / ISBN 10
            # An empty label (None) must count as "no ISBN". Converting it
            # with unicode() would store the text "None", which is truthy
            # and would let the ISBN check below pass for a book without one.
            for key, suffix in (("isbn_13", "ISBN13Label"), ("isbn_10", "ISBN10Label")):
                isbn = label_text(details, i, suffix)
                if not isbn or "[N/A]" in isbn:
                    textbook_attrs[key] = None
                else:
                    textbook_attrs[key] = unicode(isbn)

            # New data, then used data
            for key, prefix in (("new", "New"), ("used", "Used")):
                price = self.price(label_text(details, i, prefix + "PriceLabel"))
                available = self.num_available(
                    label_text(details, i, prefix + "AvailabilityLabel")
                )
                if price:
                    textbook_attrs[key + "_price"] = price
                if available:
                    textbook_attrs[key + "_available"] = available

            # Classifieds info
            classified_info = label_text(details, i, "ClassifiedsLabel", tag="a")
            if classified_info:
                textbook_attrs["classified_info"] = classified_info

            # Add the textbook (only when it can be identified by an ISBN)
            if textbook_attrs["isbn_10"] or textbook_attrs["isbn_13"]:
                textbook = existing_or_new(Textbook, **textbook_attrs)
                for course_relation in course_relations:
                    textbook.course_rels.add(course_relation)
                textbook.save(was_scraped=True)
                print("----Parsed book: " + str(textbook))
def scrape(self):
    """Scrape textbook listings from the bookstore and attach them to courses.

    The booklist index page is fetched first and every course link whose
    subject passes the ``self.config.letters`` filter is queued as a
    ``(subject, number, url)`` tuple. Each queued course page is then
    fetched, its books are read element by element, and every book that has
    at least one ISBN is saved as a ``Textbook`` linked to all matching
    course relations.

    Returns:
        None. Progress is reported through ``print``.
    """
    # Shared shape of all per-book element ids: repeater index, then suffix.
    id_template = "ctl00_ContentBody_ctl00_CourseBooksRepeater_ctl{:02d}_test_{}"

    def field_text(cell, index, suffix, tag="span"):
        """Return ``.string`` of the element ``suffix`` for book ``index``."""
        return cell.find(tag, {"id": id_template.format(index, suffix)}).string

    # print(...) with one argument behaves the same as the bare print
    # statement and matches the other prints in this function.
    print("Starting textbook scrape")
    print("Getting a list of courses")

    r = requests.get("http://www.campusbookstore.com/Textbooks/Booklists/")
    b = BeautifulSoup(r.text)
    content = b.find("div", {"class": "thecontent"})

    queued = []
    for link in content.find_all("a"):
        href = link.attrs.get("href", "")
        if "campusbookstore.com/Textbooks/Course/" not in href:
            continue
        # ``link.string`` can be None (empty anchor or several children);
        # searching "" instead simply yields no match.
        m = re.search(r"^(\D+)(\d+).*$", link.string or "")
        # Only parse letters in config
        # NOTE(review): index 1 is the second character of the subject text
        # and fails with IndexError on a one-character prefix -- confirm
        # that index 0 was not the intent.
        if m and m.group(1)[1].upper() in self.config.letters:
            queued.append((m.group(1), m.group(2), href))

    print("Parsing courses")

    for s, c, l in queued:
        # Check if there are any courses to attach the book to
        try:
            subject = Subject.objects.get(abbreviation=s)
            courses = Course.objects.filter(subject=subject, number__istartswith=c)
            num_courses = courses.count()
        except ObjectDoesNotExist:
            num_courses = 0

        if num_courses < 1:
            print("--No course '{0} {1}' in database".format(s, c))
            continue

        # Find/Create the course <-> textbook relation(s)
        course_relations = []
        for course in courses:
            course_relation = existing_or_new(CourseRelation, course=course)
            course_relation.save(was_scraped=True)
            course_relations.append(course_relation)

        print("--Parsing books from {0} course(s): {1}".format(
            num_courses, ", ".join([str(course) for course in courses])))

        r = requests.get(l)
        b = BeautifulSoup(r.text)

        # Looking at the page source, 49 books seems to be the limit
        # (numbers padded to 2 digits); only even indices hold books.
        for i in range(0, 99, 2):
            book_id = id_template.format(i, "ModeFull")
            book = b.find("div", {"id": book_id})
            if not book:
                # Indices are contiguous, so the first gap ends the list.
                break

            cell = book.find("table").find("table").find_all("td")[1]

            textbook_attrs = {"listing_url": l + "#" + book_id}

            # Title
            textbook_attrs["title"] = unicode(field_text(cell, i, "BookTitle"))

            # Authors
            authors = field_text(cell, i, "BookAuthor")
            if authors and authors[:4] == " by ":
                textbook_attrs["authors"] = authors[4:]

            # Required
            required = field_text(cell, i, "StatusLabel")
            if required and "REQUIRED" in required.upper():
                textbook_attrs["required"] = True

            # ISBN 13 / ISBN 10
            # Treat an empty label (None) like "[N/A]". unicode(None) would
            # produce the truthy text "None" and defeat the ISBN check below.
            for attr, suffix in (("isbn_13", "ISBN13Label"), ("isbn_10", "ISBN10Label")):
                raw_isbn = field_text(cell, i, suffix)
                if raw_isbn and "[N/A]" not in raw_isbn:
                    textbook_attrs[attr] = unicode(raw_isbn)
                else:
                    textbook_attrs[attr] = None

            # New data, then used data
            for attr, prefix in (("new", "New"), ("used", "Used")):
                price = self.price(field_text(cell, i, prefix + "PriceLabel"))
                available = self.num_available(
                    field_text(cell, i, prefix + "AvailabilityLabel"))
                if price:
                    textbook_attrs[attr + "_price"] = price
                if available:
                    textbook_attrs[attr + "_available"] = available

            # Classifieds info
            classified_info = field_text(cell, i, "ClassifiedsLabel", tag="a")
            if classified_info:
                textbook_attrs["classified_info"] = classified_info

            # Add the textbook (only when at least one ISBN identifies it)
            if textbook_attrs["isbn_10"] or textbook_attrs["isbn_13"]:
                textbook = existing_or_new(Textbook, **textbook_attrs)
                for course_relation in course_relations:
                    textbook.course_rels.add(course_relation)
                textbook.save(was_scraped=True)
                print("----Parsed book: " + str(textbook))
def existing_or_new_with_time(model, **kwargs):
    """Call ``existing_or_new`` with ``last_encountered`` set to the current time.

    Args:
        model: Passed through unchanged as the first argument of
            ``existing_or_new``.
        **kwargs: Forwarded to ``existing_or_new``; any caller-supplied
            ``last_encountered`` is overwritten with ``datetime.datetime.now()``.

    Returns:
        Whatever ``existing_or_new`` returns.
    """
    stamped = dict(kwargs, last_encountered=datetime.datetime.now())
    return existing_or_new(model, **stamped)