def __init__(self):
    """Open an SSL-secured SMTP connection and authenticate the mailer.

    The server host and login credentials are read from project secrets;
    the fixed sender address is stored for later outgoing mail.
    """
    host = get_secret('MAILER_SERVER')
    self.server = smtplib.SMTP_SSL(host)
    user = get_secret('MAILER_USER')
    password = get_secret('MAILER_PASSWORD')
    self.server.login(user, password)
    # Address used as the "from" field on outgoing mail.
    self.sender = '*****@*****.**'
def __init__(self):
    """Connect to the mail server over SSL and log in.

    Host and credentials come from project secrets; the sender address
    used on outgoing mail is fixed.
    """
    self.server = smtplib.SMTP_SSL(get_secret('MAILER_SERVER'))
    credentials = (get_secret('MAILER_USER'), get_secret('MAILER_PASSWORD'))
    self.server.login(*credentials)
    # Fixed "from" address.
    self.sender = '*****@*****.**'
def get_google_credentials(self):
    """Build a GoogleCredentials object from the user's google-oauth2 link.

    Returns:
        GoogleCredentials for the associated account, or None when the
        user has no google-oauth2 social-auth association.
    """
    social_user = self.user.social_auth.filter(
        provider='google-oauth2').first()
    if social_user is None:
        return None
    extra = social_user.extra_data
    return GoogleCredentials(
        extra["access_token"],
        get_secret('SOCIAL_AUTH_GOOGLE_OAUTH2_KEY'),
        get_secret('SOCIAL_AUTH_GOOGLE_OAUTH2_SECRET'),
        extra.get("refresh_token"),
        extra["expires"],
        "https://accounts.google.com/o/oauth2/token",
        'my-user-agent/1.0',
    )
def __new__(cls, *args, **kwargs):
    """Set static variables within closure.

    Loads the Queens login credentials onto the class before the
    instance is handed to __init__.

    Returns:
        Parser
    """
    instance = object.__new__(cls)
    cls.CREDENTIALS = {
        'USERNAME': get_secret('QUEENS_USER'),
        'PASSWORD': get_secret('QUEENS_PASS'),
    }
    return instance
def __new__(cls, *args, **kwargs):
    """Set static variables within closure.

    Loads GW login credentials onto the class before instantiation.

    Returns:
        Parser
    """
    instance = object.__new__(cls)
    cls.CREDENTIALS = dict(
        username=get_secret('GW_USER'),
        password=get_secret('GW_PASS'),
        security_question_answer=get_secret('GW_SECURITY_ANSWER'),
    )
    return instance
def __new__(cls, *args, **kwargs):
    """Set static variables within closure.

    Populates the class-level GW credential map from project secrets.

    Returns:
        Parser
    """
    creds = {
        "username": get_secret("GW_USER"),
        "password": get_secret("GW_PASS"),
        "security_question_answer": get_secret("GW_SECURITY_ANSWER"),
    }
    cls.CREDENTIALS = creds
    return object.__new__(cls)
def __new__(cls, *args, **kwargs):
    """Set static variables within closure.

    Returns:
        Parser
    """
    # Attach GW login secrets to the class itself so every instance
    # shares one credential map.
    cls.CREDENTIALS = {
        'username': get_secret('GW_USER'),
        'password': get_secret('GW_PASS'),
        'security_question_answer': get_secret('GW_SECURITY_ANSWER'),
    }
    return object.__new__(cls)
def set_img_url_google(student, social_user, access_token):
    """Fetch the student's Google profile picture URL and store it.

    Args:
        student: Student model instance to update (caller is responsible
            for saving it).
        social_user: python-social-auth association (kept for interface
            compatibility; no longer read here).
        access_token: OAuth2 access token passed to the userinfo API.
    """
    # BUG FIX: the original called .format(uid, api_key) on a URL string
    # that contains no placeholders, so both arguments were silently
    # ignored — a no-op that only misleads readers. The plain URL plus
    # the access_token query parameter is what actually hits the API.
    response = requests.get(
        "https://www.googleapis.com/userinfo/v2/me",
        params={"access_token": access_token},
    )
    student.img_url = response.json()["picture"]
def _login(self):
    """Authenticate against Vanderbilt's CAS login service.

    Fetches the login form, replays the one-time 'lt' token with the
    secret credentials, then hits the app entry point so the session
    picks up its auth cookies.
    """
    login_url = 'https://login.mis.vanderbilt.edu'
    params = {'service': Parser.URL + '/j_spring_cas_security_check'}
    # GET the form first to obtain the one-time 'lt' security token.
    soup = self.requester.get(login_url + '/login', params=params)
    self.requester.post(
        login_url + soup.find('form', {'name': 'loginForm'})['action'],
        parse=False,
        params=params,
        data={
            'username': get_secret('VANDY_USER'),
            'password': get_secret('VANDY_PASS'),
            'lt': soup.find('input', {'name': 'lt'})['value'],
            '_eventId': 'submit',
            'submit': 'LOGIN'
        },
    )
    # Prime the session against the application entry page.
    self.requester.get(Parser.URL + '/Entry.action', parse=False)
def _login(self):
    """Authenticate against Vanderbilt's CAS login service.

    Fetches the login form, posts credentials together with the
    one-time 'lt' token, then visits the app entry point so the
    session collects its auth cookies.
    """
    login_url = "https://login.mis.vanderbilt.edu"
    params = {"service": Parser.URL + "/j_spring_cas_security_check"}
    # The GET provides the one-time 'lt' token required by the POST.
    soup = self.requester.get(login_url + "/login", params=params)
    self.requester.post(
        login_url + soup.find("form", {"name": "loginForm"})["action"],
        parse=False,
        params=params,
        data={
            "username": get_secret("VANDY_USER"),
            "password": get_secret("VANDY_PASS"),
            "lt": soup.find("input", {"name": "lt"})["value"],
            "_eventId": "submit",
            "submit": "LOGIN",
        },
    )
    # Prime the session against the application entry page.
    self.requester.get(Parser.URL + "/Entry.action", parse=False)
def __new__(cls, *args, **kwargs):
    """Set static variables within closure.

    Loads the JHU API key onto the class before instantiation.

    Returns:
        Parser
    """
    # BUG FIX: object.__new__ must be called with only the class.
    # Forwarding *args/**kwargs raises "object.__new__() takes exactly
    # one argument" on Python 3 whenever __new__ is overridden, as it
    # is here. (The other parsers in this codebase already call it
    # correctly as object.__new__(cls).)
    new_instance = object.__new__(cls)
    cls.KEY = get_secret('JHU_API_KEY')
    return new_instance
def __new__(cls, *args, **kwargs):
    """Set static variables within closure.

    Stores the JHU API key as a class attribute.

    Returns:
        Parser
    """
    # BUG FIX: do not forward *args/**kwargs to object.__new__ — with
    # an overridden __new__, Python 3 raises TypeError ("object.__new__()
    # takes exactly one argument"). Extra constructor arguments are
    # consumed by __init__, not here.
    new_instance = object.__new__(cls)
    cls.KEY = get_secret('JHU_API_KEY')
    return new_instance
def _login(self):
    """Authenticate against Vanderbilt's CAS login service.

    The GET obtains the one-time 'lt' token; the POST replays it with
    the secret credentials; the final GET primes the session cookies.
    """
    login_url = 'https://login.mis.vanderbilt.edu'
    params = {
        'service': Parser.URL + '/j_spring_cas_security_check'
    }
    soup = self.requester.get(login_url + '/login', params=params)
    self.requester.post(
        login_url + soup.find('form', {'name': 'loginForm'})['action'],
        parse=False,
        params=params,
        data={
            'username': get_secret('VANDY_USER'),
            'password': get_secret('VANDY_PASS'),
            # One-time token scraped from the login form above.
            'lt': soup.find('input', {'name': 'lt'})['value'],
            '_eventId': 'submit',
            'submit': 'LOGIN'
        },
    )
    self.requester.get(Parser.URL + '/Entry.action', parse=False)
def amazon_textbook_fields(isbn):
    """Look up textbook metadata on Amazon by ISBN.

    Args:
        isbn: ISBN string to look up.

    Returns:
        dict with detail_url/image_url/author/title keys, or None when
        the ISBN is not found on Amazon.
    """
    # BUG FIX: `amazon` is a module-level lazily-initialized client, but
    # without this declaration the assignment below makes it a local
    # name, so the `is None` check raises UnboundLocalError on every
    # call. `global` restores the intended lazy-singleton behavior.
    global amazon
    if amazon is None:
        amazon = AmazonAPI(get_secret('AMAZON_ACCESS_KEY'),
                           get_secret('AMAZON_SECRET_KEY'),
                           get_secret('AMAZON_ASSOC_TAG'))
    try:
        product = amazon.lookup(ItemId=isbn, IdType='ISBN',
                                SearchIndex='Books')
    except AsinNotFound:
        return
    # Some lookups return a list of matches; keep the first.
    if isinstance(product, list):
        product = product[0]
    return {
        'detail_url': product.detail_page_url,
        'image_url': product.medium_image_url,
        'author': product.author,
        'title': product.title,
    }
def amazon_textbook_fields(isbn):
    """Look up textbook metadata on Amazon by ISBN.

    Args:
        isbn: ISBN string to look up.

    Returns:
        dict with detail_url/image_url/author/title keys, or None when
        the ISBN is not found on Amazon.
    """
    # BUG FIX: without `global`, assigning to `amazon` below makes it a
    # function-local name and the `is None` test raises
    # UnboundLocalError. The client is meant to be a lazily-created
    # module-level singleton.
    global amazon
    if amazon is None:
        amazon = AmazonAPI(
            get_secret("AMAZON_ACCESS_KEY"),
            get_secret("AMAZON_SECRET_KEY"),
            get_secret("AMAZON_ASSOC_TAG"),
        )
    try:
        product = amazon.lookup(ItemId=isbn, IdType="ISBN", SearchIndex="Books")
    except AsinNotFound:
        return
    # Some lookups return a list of matches; keep the first.
    if isinstance(product, list):
        product = product[0]
    return {
        "detail_url": product.detail_page_url,
        "image_url": product.medium_image_url,
        "author": product.author,
        "title": product.title,
    }
# Copyright (C) 2017 Semester.ly Technologies, LLC # # Semester.ly is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Semester.ly is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. from semesterly.settings import get_secret USER = get_secret('QUEENS_USER') PASS = get_secret('QUEENS_PASS') OUTPUT_DIR = "./data-dump" PROFILE = None MAX_RETRIES = 10 RETRY_SLEEP_SECONDS = 10 LOG_DIR = "./parsing/schools/queens/qcumber_scraper/logs" SAVE_TO_DB = False # writes to JSON if False
class Parser(BaseParser):
    """Hopkins (JHU) course parser.

    Walks the ISIS classes API school-by-school and feeds each course,
    section, and meeting into the shared ingestor.

    Attributes:
        API_URL (str): Base URL of the ISIS classes API.
        KEY (str): API key, read from project secrets.
        DAY_TO_LETTER_MAP (dict): API day tokens -> single-letter codes.
        last_course (dict): Most recently ingested course.
        schools (list): School listing fetched from the API.
        semester: "<Term> <Year>" string for the current pass.
        verbosity: Verbosity level set by start().
    """

    API_URL = 'https://isis.jhu.edu/api/classes/'
    KEY = get_secret('JHU_API_KEY')
    DAY_TO_LETTER_MAP = {
        'm': 'M',
        't': 'T',
        'w': 'W',
        'th': 'R',
        'f': 'F',
        'sa': 'S',
        's': 'U'
    }

    def __init__(self, **kwargs):
        """Construct hopkins parser object."""
        self.schools = []
        self.last_course = {}
        super(Parser, self).__init__('jhu', **kwargs)

    def _get_schools(self):
        # Populate self.schools from the API's school-code listing.
        url = '{}/codes/schools'.format(Parser.API_URL)
        params = {'key': Parser.KEY}
        self.schools = self.requester.get(url, params=params)

    def _get_courses(self, school):
        # Fetch the course list for one school in the current semester.
        url = '{}/{}/{}'.format(Parser.API_URL, school['Name'], self.semester)
        params = {'key': Parser.KEY}
        return self.requester.get(url, params=params)

    def _get_section(self, course):
        return self.requester.get(self._get_section_url(course))

    def _get_section_url(self, course):
        # e.g. <API>/EN601226(01)/Fall 2017?key=...
        return Parser.API_URL + '/' \
            + course['OfferingName'].replace(".", "") + course['SectionName'] \
            + '/' + self.semester + '?key=' + Parser.KEY

    def _parse_schools(self):
        for school in self.schools:
            self._parse_school(school)

    def _parse_school(self, school):
        # Ingest every course of a school; courses whose section lookup
        # comes back empty are logged and skipped.
        courses = self._get_courses(school)
        for course in courses:
            section = self._get_section(course)
            if len(section) == 0:
                # FIXME - make this less hacky
                # NOTE(review): 'w' truncates, so only the last failing
                # URL survives — confirm append is not intended.
                hacky_log_file = \
                    'parsing/schools/jhu/logs/section_url_tracking.log'
                with open(hacky_log_file, 'w') as f:
                    print(self._get_section_url(course), file=f)
                continue
            self._load_ingestor(course, section)

    def _compute_size_enrollment(self, course):
        """Return (size, enrollment, waitlist) with defensive defaults."""
        try:
            section_size = int(course['MaxSeats'])
        except:
            section_size = 0
        try:
            # 'SeatsAvailable' looks like "<available>/<total>".
            section_enrolment = section_size \
                - int(course['SeatsAvailable'].split("/")[0])
            if section_enrolment < 0:
                section_enrolment = 0
        except:
            section_enrolment = 0
        try:
            waitlist = int(course.get('Waitlisted', -1))
        except ValueError:
            waitlist = -1
        return (section_size, section_enrolment, waitlist)

    def _load_ingestor(self, course, section):
        """Push one course (and its sections/meetings) into the ingestor."""
        section_details = section[0]['SectionDetails']
        try:
            num_credits = float(course['Credits'])
        except:
            num_credits = 0
        # Load core course fields
        # NOTE(review): filter/map below return lazy iterators on
        # Python 3, where `+= [..]` on them would fail — this code
        # appears to target Python 2; confirm before porting.
        self.ingestor['areas'] = filter(lambda a: a != "None",
                                        course['Areas'].split(','))
        if course['IsWritingIntensive'] == "Yes":
            self.ingestor['areas'] += ['Writing Intensive']
        if len(section_details[0]['Prerequisites']) > 0:
            prereqs = []
            for p in section_details[0]['Prerequisites']:
                prereqs.append(p['Description'])
            self.ingestor['prerequisites'] = ' '.join(prereqs)
        # Course level = first digit of the numeric part, e.g. "300".
        self.ingestor['level'] = re.findall(re.compile(r".+?\..+?\.(.{1}).+"),
                                            course['OfferingName'])[0] + "00"
        self.ingestor['name'] = titlize(course['Title'])
        self.ingestor['description'] = section_details[0]['Description']
        self.ingestor['code'] = course['OfferingName'].strip()
        self.ingestor['num_credits'] = num_credits
        self.ingestor['department_name'] = ' '.join(
            course['Department'].split()[1:])
        self.ingestor['campus'] = 1
        self.ingestor['exclusions'] = section_details[0].get(
            'EnrollmentRestrictedTo')
        # Add specialty areas for computer science department
        if course['Department'] == 'EN Computer Science':
            cs_areas_re = r'\bApplications|\bAnalysis|\bSystems|\bGeneral'
            for match in re.findall(cs_areas_re, self.ingestor['description']):
                self.ingestor['areas'] += [match]
        created_course = self.ingestor.ingest_course()
        if self.last_course \
                and created_course['code'] == course['OfferingName'].strip() \
                and created_course['name'] != course['Title']:
            self.ingestor['section_name'] = course['OfferingName'].strip()
        self.last_course = created_course
        for meeting in section_details[0]['Meetings']:
            # Load core section fields
            self.ingestor['section'] = "(" + section[0]['SectionName'] + ")"
            self.ingestor['semester'] = self.semester.split()[0]
            self.ingestor['instrs'] = map(lambda i: i.strip(),
                                          course['Instructors'].split(','))
            self.ingestor['size'], self.ingestor['enrollment'], self.ingestor[
                'waitlist'] = self._compute_size_enrollment(course)
            self.ingestor['year'] = self.semester.split()[1]
            created_section = self.ingestor.ingest_section(created_course)
            # Load offering fields.
            times = meeting['Times']
            for time in filter(lambda t: len(t) > 0, times.split(',')):
                time_pieces = re.search(
                    r'(\d\d:\d\d [AP]M) - (\d\d:\d\d [AP]M)', time)
                self.ingestor['time_start'] = time_12to24(time_pieces.group(1))
                self.ingestor['time_end'] = time_12to24(time_pieces.group(2))
                # Only ingest meetings that carry a concrete day-of-week.
                if (len(meeting['DOW'].strip()) > 0
                        and meeting['DOW'] != "TBA"
                        and meeting['DOW'] != "None"):
                    self.ingestor['days'] = map(
                        lambda d: Parser.DAY_TO_LETTER_MAP[d.lower()],
                        re.findall(r'([A-Z][a-z]*)+?', meeting['DOW']))
                    self.ingestor['location'] = {
                        'building': meeting['Building'],
                        'room': meeting['Room']
                    }
                    self.ingestor.ingest_meeting(created_section)

    def start(self, years=None, terms=None, years_and_terms=None,
              departments=None, textbooks=True, verbosity=3, **kwargs):
        """Start parse."""
        self.verbosity = verbosity
        # Default to hardcoded current years.
        if not years:
            years = ['2017', '2016']
        if not terms:
            terms = ['Spring', 'Fall', 'Summer']
        # Run parser for all semesters specified.
        for year in years:
            for term in terms:
                self.semester = '{} {}'.format(term, year)
                self._get_schools()
                self._parse_schools()
class Parser(QPeoplesoftParser):
    """Course parser for Queens University.

    Drives a headless PhantomJS browser through the my.queensu.ca SSO
    login, then hands the authenticated cookies to the requests-based
    PeopleSoft parser.
    """

    BASE_URL = 'https://saself.ps.queensu.ca/psc/saself/EMPLOYEE/HRMS/c/'\
               'SA_LEARNER_SERVICES.CLASS_SEARCH.GBL'
    # Login credentials, read from project secrets.
    CREDENTIALS = {
        'USERNAME': get_secret('QUEENS_USER'),
        'PASSWORD': get_secret('QUEENS_PASS')
    }

    def __init__(self, **kwargs):
        """Construct parsing object."""
        params = {
            'Page': 'SSR_CLSRCH_ENTRY',
            'Action': 'U',
            'ExactKeys': 'Y',
            'TargetFrameName': 'None'
        }
        # Configure PhantomJS: long resource timeout, no images, and a
        # desktop Firefox user agent.
        self.cap = webdriver.DesiredCapabilities.PHANTOMJS
        self.cap["phantomjs.page.settings.resourceTimeout"] = 50000000
        self.cap["phantomjs.page.settings.loadImages"] = False
        self.cap[
            "phantomjs.page.settings.userAgent"] = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0) Gecko/20121026 Firefox/16.0'
        self.driver = webdriver.PhantomJS(desired_capabilities=self.cap)
        # NOTE: comment being saved in case this is important for local dev.
        # self.driver = webdriver.PhantomJS(
        #     './node_modules/phantomjs-prebuilt/bin/phantomjs',
        #     desired_capabilities=self.cap
        # )
        # self.driver = webdriver.Chrome()  # FOR DEBUG PURPOSES ONLY
        super(Parser, self).__init__('queens', Parser.BASE_URL,
                                     url_params=params, **kwargs)

    def seleni_run(self, execute):
        """Run selenium routine.

        Retries the callable until it succeeds. NOTE(review): the bare
        except swallows every error, so a permanently-failing callable
        loops forever.
        """
        while True:
            try:
                return execute()
            except:
                continue

    def login(self):
        """Login to Queens course listings website."""
        socket.setdefaulttimeout(60)
        self.driver.set_page_load_timeout(30)
        self.driver.implicitly_wait(30)
        self.driver.get('https://my.queensu.ca/')
        # Fill the SSO form and submit.
        self.seleni_run(lambda: self.driver.find_element_by_id('username').
                        send_keys(Parser.CREDENTIALS['USERNAME']))
        self.seleni_run(lambda: self.driver.find_element_by_id('password').
                        send_keys(Parser.CREDENTIALS['PASSWORD']))
        self.seleni_run(lambda: self.driver.find_element_by_class_name(
            'form-button').click())
        self.seleni_run(
            lambda: self.driver.find_element_by_link_text("SOLUS").click())
        # Focus iframe
        iframe = self.seleni_run(lambda: self.driver.find_element_by_xpath(
            "//iframe[@id='ptifrmtgtframe']"))
        self.driver.switch_to_frame(iframe)
        self.seleni_run(
            lambda: self.driver.find_element_by_link_text("Search").click())
        # transfer Selenium cookies to Requester cookies
        for cookie in self.driver.get_cookies():
            c = {cookie['name']: cookie['value']}
            self.requester.session.cookies.update(c)
        # Close Selenium/PhantomJS process.
        # REF: http://stackoverflow.com/questions/25110624/how-to-properly-stop-phantomjs-execution
        # NOTE: update selenium version after fix released
        # (https://github.com/hydroshare/hydroshare/commit/f7ef2a867250aac86b3fd12821cabf5524c2cb17)
        self.driver.close()
        self.driver.service.process.send_signal(signal.SIGTERM)
        self.driver.quit()
        # Browser-like headers for the subsequent requests session.
        headers = {
            'Pragma': 'no-cache',
            'Accept-Encoding': 'gzip, deflate, sdch, br',
            'Accept-Language': 'en-US,en;q=0.8',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Referer': 'https://saself.ps.queensu.ca/psc/saself/EMPLOYEE/HRMS/c/SA_LEARNER_SERVICES.SSS_STUDENT_CENTER.GBL?PortalActualURL=https%3a%2f%2fsaself.ps.queensu.ca%2fpsc%2fsaself%2fEMPLOYEE%2fHRMS%2fc%2fSA_LEARNER_SERVICES.SSS_STUDENT_CENTER.GBL&PortalContentURL=https%3a%2f%2fsaself.ps.queensu.ca%2fpsc%2fsaself%2fEMPLOYEE%2fHRMS%2fc%2fSA_LEARNER_SERVICES.SSS_STUDENT_CENTER.GBL&PortalContentProvider=HRMS&PortalCRefLabel=Student%20Center&PortalRegistryName=EMPLOYEE&PortalServletURI=https%3a%2f%2fsaself.ps.queensu.ca%2fpsp%2fsaself%2f&PortalURI=https%3a%2f%2fsaself.ps.queensu.ca%2fpsc%2fsaself%2f&PortalHostNode=HRMS&NoCrumbs=yes&PortalKeyStruct=yes',
            'Connection': 'keep-alive',
            'Cache-Control': 'no-cache',
        }
        self.requester.headers = headers
        # NOTE: get request will update CookieJar
        self.requester.get(Parser.BASE_URL, params={
            'Page': 'SSR_CLSRCH_ENTRY',
            'Action': 'U',
            'ExactKeys': 'Y',
            'TargetFrameName': 'None'
        })

    def start(self, verbosity=3, **kwargs):
        """Start parse."""
        self.login()
        super(Parser, self).start(verbosity=verbosity, **kwargs)
from rest_framework.views import APIView from analytics.models import SharedTimetable from analytics.views import save_analytics_timetable from courses.serializers import CourseSerializer from student.utils import get_student from timetable.serializers import DisplayTimetableSerializer from timetable.models import Semester, Course, Section from timetable.utils import ( update_locked_sections, courses_to_timetables, ) from helpers.mixins import ValidateSubdomainMixin, FeatureFlowView, CsrfExemptMixin from semesterly.settings import get_secret hashids = Hashids(salt=get_secret("HASHING_SALT")) logger = logging.getLogger(__name__) class TimetableView(CsrfExemptMixin, ValidateSubdomainMixin, APIView): """ This view is responsible for responding to any requests dealing with the generation of timetables and the satisfaction of constraints provided by the frontend/user. """ def post(self, request): """Generate best timetables given the user's selected courses""" school = request.subdomain params = request.data student = get_student(request) course_ids = list(params["courseSections"].keys())
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. from rest_framework.views import APIView from rest_framework.response import Response from rest_framework import status from django.shortcuts import get_object_or_404 from hashids import Hashids from jhu_final_exam_scheduler import JHUFinalExamScheduler from helpers.mixins import FeatureFlowView, CsrfExemptMixin from exams.models import FinalExamShare from student.utils import get_student from semesterly.settings import get_secret hashids = Hashids(salt=get_secret('HASHING_SALT')) # TODO: use new request shape class ExamView(CsrfExemptMixin, APIView): def post(self, request): final_exam_schedule = JHUFinalExamScheduler().make_schedule(request.data) return Response(final_exam_schedule, status=status.HTTP_200_OK) class ExamLink(FeatureFlowView): feature_name = 'SHARE_EXAM' def get_feature_flow(self, request, slug): exam_id = hashids.decrypt(slug)[0]
class Parser(CourseParser):
    """George Washington University course parser.

    Logs into Banner, walks every department of every configured term,
    and feeds courses/sections/meetings into the shared ingestor.

    NOTE: GW cannot support multiple login!
    NOTE(review): `iteritems`, len() on filter results, and the eager
    use of map() for side effects indicate this module targets
    Python 2 — confirm before running under Python 3.
    """

    URL = 'https://banweb.gwu.edu/PRODCartridge'
    # Banner login secrets.
    CREDENTIALS = {
        'USERNAME': get_secret('GW_USER'),
        'PASSWORD': get_secret('GW_PASS'),
        'SECURITY_QUESTION_ANSWER': get_secret('GW_SECURITY_ANSWER')
    }
    # Supported year -> {term name: Banner term code}.
    YEARS_AND_TERMS = {
        2017: {
            'Fall': '201703',
            'Spring': '201701',
        },
        2016: {
            'Fall': '201603',
        }
    }

    def __init__(self, **kwargs):
        """Construct GW parser object.

        Args:
            **kwargs: pass-through
        """
        super(Parser, self).__init__('gw', **kwargs)

    def start(self, years=None, terms=None, years_and_terms=None,
              departments=None, verbosity=3, **kwargs):
        """Start parse."""
        self._login()
        self._direct_to_search_page()
        years_and_terms = filter_years_and_terms(
            Parser.YEARS_AND_TERMS,
            years_filter=years,
            terms_filter=terms,
            years_and_terms_filter=years_and_terms)
        for year, terms in years_and_terms.items():
            self.ingestor['year'] = year
            for term_name in terms:
                term_code = Parser.YEARS_AND_TERMS[year][term_name]
                self.ingestor['term'] = term_name
                # Retrieve term search page.
                soup = self.requester.get(
                    Parser.URL + '/bwckgens.p_proc_term_date',
                    params={
                        'p_calling_proc': 'P_CrseSearch',
                        'p_term': term_code
                    })
                # Create search param list.
                input_options_soup = soup.find(
                    'form',
                    action='/PRODCartridge/bwskfcls.P_GetCrse').find_all(
                        'input')
                query = {}
                for input_option in input_options_soup:
                    query[input_option['name']] = input_option.get('value', '')
                query.update({
                    'begin_hh': '0',
                    'begin_mi': '0',
                    'end_hh': '0',
                    'end_mi': '0',
                    'sel_ptrm': '%',
                    'SUB_BTN': 'Section Search'
                })
                # Construct list of departments.
                depts = {}
                depts_soup = soup.find('select',
                                       id='subj_id').find_all('option')
                for dept_soup in depts_soup:
                    depts[dept_soup.text.strip()] = dept_soup['value']
                for dept_name, dept_code in depts.iteritems():
                    self.ingestor['department'] = {
                        'name': dept_name,
                        'code': dept_code
                    }
                    query['sel_subj'] = ['dummy', dept_code]
                    rows = self.requester.post(
                        Parser.URL + '/bwskfcls.P_GetCrse', params=query)
                    Parser._check_errorpage(rows)
                    try:
                        rows = rows.find(
                            'table',
                            class_='datadisplaytable').find_all('tr')[2:]
                    except AttributeError:
                        print('message: no results for department',
                              dept_name, file=sys.stderr)
                        continue  # no results for department
                    # collect offered courses in department
                    for row in rows:
                        info = row.find_all('td')
                        if info[1].find('a'):
                            # general info
                            self.ingestor.update({
                                'ident': info[1].text,
                                'code': info[2].text + ' ' + info[3].text,
                                'href': info[1].find('a')['href'],
                                'dept': dept_name,
                                'selec': info[3].text,
                                'section': info[4].text,
                                'credits': safe_cast(info[6].text, float,
                                                     default=0.),
                                'name': info[7].text,
                                'size': int(info[10].text),
                                'enrollment': int(info[11].text),
                                'waitlist': safe_cast(info[14].text, int,
                                                      default=-1),
                                'attr': '; '.join(
                                    info[22].text.split(' and '))
                                if len(info) == 23 else ''  # FIXME - hacky fix
                            })
                            # Query course catalog to obtain description.
                            catalog = self.requester.get(
                                Parser.URL + '/bwckctlg.p_display_courses',
                                params={
                                    'term_in': term_code,
                                    'one_subj': dept_code,
                                    'sel_crse_strt': self.ingestor['selec'],
                                    'sel_crse_end': self.ingestor['selec'],
                                    'sel_subj': '',
                                    'sel_levl': '',
                                    'sel_schd': '',
                                    'sel_coll': '',
                                    'sel_divs': '',
                                    'sel_dept': '',
                                    'sel_attr': ''
                                })
                            if catalog:
                                self.ingestor.update(
                                    Parser._parse_catalogentrypage(catalog))
                            course = self.ingestor.ingest_course()
                            section_soup = self.requester.get(
                                Parser.URL + '/bwckschd.p_disp_listcrse',
                                params={
                                    'term_in': term_code,
                                    'subj_in': dept_code,
                                    'crse_in': self.ingestor['selec'],
                                    'crn_in': self.ingestor['ident']
                                })
                            meetings_soup = Parser._extract_meetings(
                                section_soup)
                            """Example of a meeting entry
                            <tr>
                            <td class="dddefault">Class</td>
                            <td class="dddefault">4:00 pm - 6:00 pm</td>
                            <td class="dddefault">T</td>
                            <td class="dddefault">See Department DEPT</td>
                            <td class="dddefault">08/28/17 - 12/11/17</td>
                            <td class="dddefault">Lecture</td>
                            <td class="dddefault">Timothy A. McCaffrey (<abbr title="Primary">P</abbr>), David Leitenberg </td>
                            </tr>
                            """
                            self._parse_instructors(meetings_soup)
                            if len(meetings_soup) > 0:
                                self.ingestor['section_type'] = meetings_soup[
                                    0].find_all('td')[5].text
                                section_model = self.ingestor.ingest_section(
                                    course)
                                self._parse_meetings(meetings_soup,
                                                     section_model)

    def _login(self):
        """Log into Banner, raising on failure."""
        # Collect necessary cookies
        self.requester.get(Parser.URL + '/twbkwbis.P_WWWLogin', parse=False)
        self.requester.headers['Referer'] = '{}/twbkwbis.P_WWWLogin'.format(
            Parser.URL)
        logged_in = self.requester.post(Parser.URL + '/twbkwbis.P_ValLogin',
                                        parse=False,
                                        data={
                                            'sid': Parser.CREDENTIALS[
                                                'USERNAME'],
                                            'PIN': Parser.CREDENTIALS[
                                                'PASSWORD']
                                        })
        if logged_in.status_code != 200:
            print('Unexpected error: login unsuccessful',
                  sys.exc_info()[0],
                  file=sys.stderr)
            raise Exception('GW Parser, failed login')
        # Deal with security question page.
        self.requester.post('{}/twbkwbis.P_ProcSecurityAnswer'.format(
            Parser.URL),
            parse=False,
            data={
                'RET_CODE': '',
                'SID': Parser.CREDENTIALS['USERNAME'],
                'QSTN_NUM': 1,
                'answer': Parser.CREDENTIALS['SECURITY_QUESTION_ANSWER']
            })

    def _direct_to_search_page(self):
        """Click through the Banner menus to reach the course search."""
        genurl = Parser.URL + '/twbkwbis.P_GenMenu'
        actions = ['bmenu.P_MainMnu', 'bmenu.P_StuMainMnu', 'bmenu.P_RegMnu']
        # NOTE(review): map() used for side effects — lazy on Python 3,
        # where these GETs would never fire.
        map(lambda n: self.requester.get(genurl, params={'name': n}), actions)
        self.requester.get(Parser.URL + '/bwskfcls.P_CrseSearch',
                           parse=False,
                           params={'term_in': ''})

    def _parse_meetings(self, meetings_soup, section_model):
        """Ingest each timed meeting row of a section."""
        for meeting_soup in meetings_soup:
            col = meeting_soup.find_all('td')
            time = re.match(r'(.*) - (.*)', col[1].text)
            if not time:
                continue
            self.ingestor['time_start'] = self.extractor.time_12to24(
                time.group(1))
            self.ingestor['time_end'] = self.extractor.time_12to24(
                time.group(2))
            self.ingestor['days'] = [col[2].text]
            # Drop rows whose day cell is only a non-breaking space.
            filtered_days = filter(lambda x: x.replace(u'\xa0', u''),
                                   self.ingestor['days'])
            if len(filtered_days) == 0:
                break
            self.ingestor['location'] = col[3].text
            self.ingestor.ingest_meeting(section_model)

    def _parse_instructors(self, meetings):
        """Collect instructor names across all meeting rows."""
        self.ingestor['instrs'] = []
        for meeting in meetings:
            instructors = meeting.find_all('td')[6].text.split(',')
            # NOTE: must constrain instructor length LAW 6683
            for instructor in instructors[:20]:
                # Remove extra internal spaces.
                instructor = ' '.join(instructor.split())
                # Remove primary tag from instructor name.
                instructor = re.match(r'(.*?)(?: \(P\))?$',
                                      instructor).group(1)
                self.ingestor['instrs'].append(instructor)

    @staticmethod
    def _parse_catalogentrypage(soup):
        """Extract description and attribute fields from a catalog page."""
        fields = {}
        meat = soup.find('body').find('table', class_='datadisplaytable')
        if meat is None:
            return {}
        fields.update({'descr': Parser._extract_description(meat)})
        fields.update(Parser._extract_info(meat.find('td',
                                                     class_='ntdefault')))
        return fields

    @staticmethod
    def _extract_description(soup):
        # Pull the free-text description out of the second catalog row;
        # any parse failure degrades to an empty string.
        try:
            meat = soup.find_all('tr', recursive=False)[1].find('td')
            descr = re.match(r'<td .*?>\n([^<]+)<[^$]*</td>',
                             meat.prettify())
            return ' '.join(descr.group(1).strip().splitlines())
        except:
            return ''

    @staticmethod
    def _extract_info(soup):
        # Link field in <span> tag to text proceeding it.
        fields = {}
        for t in soup.find_all('span', class_='fieldlabeltext'):
            data = t.next_sibling
            # Skip newline tags.
            while data and isinstance(data, Tag) and data.name == 'br':
                data = data.next_sibling
            if not isinstance(data, NavigableString):
                data = data.text
            fields[t.text.strip()[:-1]] = data
        # Map of recognized header -> (ingestor key, transform).
        extraction = {
            'Schedule Types': ('section_type', lambda s: s[0].upper()),
            'Levels': ('info', lambda s: 'Levels: ' + s.strip()),
            'Course Attributes': ('areas', lambda x: x.strip().split(','))
        }
        # Filter and map over (header, content) pairs.
        extracted = {}
        for name, data in fields.items():
            if extraction.get(name):
                extracted[extraction[name][0]] = extraction[name][1](data)
        return extracted

    @staticmethod
    def _extract_meetings(soup):
        # Meetings live in a nested datadisplaytable; header row skipped.
        meetings = soup.find('table', class_='datadisplaytable')
        if meetings:
            meetings = meetings.find('table', class_='datadisplaytable')
            if meetings:
                meetings = meetings.find_all('tr')[1:]
        if meetings:
            return meetings
        else:
            return []

    @staticmethod
    def _check_errorpage(soup):
        error = soup.find('span', class_='errortext')
        if not error:
            return
        raise CourseParseError('Error on page request, message: ' +
                               error.text)
# Copyright (C) 2017 Semester.ly Technologies, LLC # # Semester.ly is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Semester.ly is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. from semesterly.settings import get_secret USER = get_secret('QUEENS_USER') PASS = get_secret('QUEENS_PASS') OUTPUT_DIR = "./data-dump" PROFILE = None MAX_RETRIES = 10 RETRY_SLEEP_SECONDS = 10 LOG_DIR = "./parsing/schools/queens/qcumber_scraper/logs" SAVE_TO_DB = False # writes to JSON if False
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

import json
import urllib.request, urllib.error, urllib.parse

import requests
from django.conf import settings
from django.contrib.auth.models import User
from django.core.signing import TimestampSigner, BadSignature, SignatureExpired
from hashids import Hashids

from student.models import Student
from semesterly.settings import get_secret

hashids = Hashids(salt=get_secret('HASHING_SALT'))


def check_student_token(student, token):
    """
    Validates a token: checks that it is at most 2 days old and that it
    matches the currently authenticated student.
    """
    signed_value = '%s:%s' % (student.id, token)
    try:
        # 48 hours, expressed in seconds.
        TimestampSigner().unsign(signed_value, max_age=60 * 60 * 48)
    except (BadSignature, SignatureExpired):
        return False
    return True
def create_student(strategy, details, response, user, *args, **kwargs):
    """
    Part of the Python Social Auth pipeline which creates a student
    upon signup. If student already exists, updates information from
    Facebook or Google (depending on the backend). Saves friends and
    other information to fill database.
    """
    backend_name = kwargs['backend'].name
    # Reuse an existing Student for this user, or create one.
    if Student.objects.filter(user=user).exists():
        new_student = Student.objects.get(user=user)
    else:
        new_student = Student(user=user)
        new_student.save()
    social_user = user.social_auth.filter(
        provider=backend_name,
    ).first()
    if backend_name == 'google-oauth2' and not user.social_auth.filter(
            provider='facebook').exists():
        # extra_data may be a dict or a JSON string depending on backend
        # version; normalize either way.
        try:
            access_token = social_user.extra_data["access_token"]
        except TypeError:
            access_token = json.loads(social_user.extra_data)["access_token"]
        # NOTE(review): the .format() call is a no-op — the URL contains
        # no placeholders, so the uid and API key are silently ignored.
        response = requests.get(
            'https://www.googleapis.com/userinfo/v2/me'.format(
                social_user.uid, get_secret('GOOGLE_API_KEY')),
            params={'access_token': access_token}
        )
        new_student.img_url = response.json()['picture']
        new_student.save()
    elif backend_name == 'facebook':
        try:
            access_token = social_user.extra_data["access_token"]
        except TypeError:
            access_token = json.loads(social_user.extra_data)["access_token"]
        if social_user:
            new_student.img_url = 'https://graph.facebook.com/v9.0/' + social_user.uid + '/picture?type=normal'
            url = 'https://graph.facebook.com/v9.0/{0}/' \
                  '&access_token={1}'.format(
                      social_user.uid,
                      access_token,
                  )
            # NOTE(review): this Request is never sent and is overwritten
            # below — looks like dead code.
            request = urllib.request.Request(url)
            new_student.fbook_uid = social_user.uid
            new_student.save()
            # Fetch the user's friend ids to link existing students.
            url = 'https://graph.facebook.com/{0}/' \
                  'friends?fields=id' \
                  '&access_token={1}'.format(
                      social_user.uid,
                      access_token,
                  )
            request = urllib.request.Request(url)
            friends = json.loads(
                urllib.request.urlopen(request).read().decode('utf-8')
            ).get('data')
            for friend in friends:
                if Student.objects.filter(fbook_uid=friend['id']).exists():
                    friend_student = Student.objects.get(
                        fbook_uid=friend['id'])
                    if not new_student.friends.filter(
                            user=friend_student.user).exists():
                        new_student.friends.add(friend_student)
                        new_student.save()
                        friend_student.save()
    return kwargs
class Parser(BaseParser):
    """Vanderbilt course parser.

    Scrapes courses, sections and meeting times from Vanderbilt's "More"
    web application, using ``self.requester`` for HTTP and
    ``self.ingestor`` for persistence (both provided by ``BaseParser``).

    Attributes:
        API_URL (str): base URL of the course-search web application.
        CREDENTIALS (dict): login credentials pulled from secrets.
        course (dict): scratch map of fields for the course being parsed.
        departments (dict): department code -> department title.
        verbosity (int): logging verbosity, set in ``start``.
    """

    API_URL = 'https://webapp.mis.vanderbilt.edu/more'
    CREDENTIALS = {
        'USERNAME': get_secret('VANDY_USER'),
        'PASSWORD': get_secret('VANDY_PASS')
    }

    def __init__(self, **kwargs):
        """Construct parser instance.

        Args:
            **kwargs: pass-through to BaseParser.
        """
        self.departments = {}
        self.course = {
            'description': '',
            'cancelled': False
        }
        super(Parser, self).__init__('vandy', **kwargs)

    def login(self):
        """Authenticate with Vanderbilt's CAS single-sign-on service."""
        if self.verbosity > 2:
            print("Logging in...")
        login_url = 'https://login.mis.vanderbilt.edu'
        get_login_url = login_url + '/login'
        params = {
            'service': Parser.API_URL + '/j_spring_cas_security_check'
        }
        soup = self.requester.get(get_login_url, params)
        # CAS embeds the POST target and a one-time token ('lt') in the form.
        post_suffix_url = soup.find('form', {'name': 'loginForm'})['action']
        sec_block = soup.find('input', {'name': 'lt'})['value']
        login_info = {
            'username': Parser.CREDENTIALS['USERNAME'],
            'password': Parser.CREDENTIALS['PASSWORD'],
            'lt': sec_block,
            '_eventId': 'submit',
            'submit': 'LOGIN'
        }
        self.requester.post(login_url + post_suffix_url,
                            login_info,
                            params,
                            parse=False)
        self.requester.get(Parser.API_URL + '/Entry.action', parse=False)

    def start(self,
              years=None,
              terms=None,
              departments=None,
              textbooks=True,
              verbosity=3):
        """Entry point: log in, then parse each year/term/department.

        Args:
            years: optional iterable of year strings to restrict parsing.
            terms: optional iterable of term names to restrict parsing.
            departments: optional iterable of department codes to restrict.
            textbooks: unused here; kept for interface compatibility.
            verbosity: 0 silences progress output; >2 is most verbose.
        """
        self.verbosity = verbosity
        self.login()

        # TODO - read from site and filter based on kwargs
        years_and_terms = {
            '2016': {
                'Fall': '0875'
            },
            '2017': {
                'Spring': '0880',
                'Fall': '0895',
                'Summer': '0885',
            }
        }
        years_and_terms = self.extractor.filter_term_and_year(
            years_and_terms, years, terms
        )

        for year, semesters in years_and_terms.items():
            if self.verbosity >= 1:
                print('> Parsing year ' + year)
            self.ingestor['year'] = year

            for semester_name, semester_code in semesters.items():
                if self.verbosity >= 1:
                    print('>> Parsing semester ' + semester_name)
                self.ingestor['semester'] = semester_name

                # Load environment for targeted semester (server-side
                # session state determines which term search hits).
                self.requester.get(
                    '{}{}'.format(
                        Parser.API_URL, '/SelectTerm!selectTerm.action'),
                    {'selectedTermCode': semester_code},
                    parse=False)
                self.requester.get(
                    '{}{}'.format(
                        Parser.API_URL, '/SelectTerm!updateSessions.action'),
                    parse=False)

                # Get a list of all the department codes
                department_codes = self.extract_department_codes()
                department_codes = self.extractor.filter_departments(
                    department_codes, departments
                )

                # Payload to request course list: open, waitlisted, closed.
                payload = {
                    'searchCriteria.classStatusCodes': [
                        'O', 'W', 'C'
                    ],
                    '__checkbox_searchCriteria.classStatusCodes': [
                        'O', 'W', 'C'
                    ]
                }

                for department_code in department_codes:
                    if self.verbosity >= 1:
                        print('>>> Parsing courses in',
                              self.departments[department_code])

                    # Construct payload with department code
                    payload.update({
                        'searchCriteria.subjectAreaCodes': department_code
                    })

                    # GET html for department course listings
                    html = self.requester.get(
                        '{}{}'.format(
                            Parser.API_URL,
                            '/SearchClassesExecute!search.action'
                        ),
                        payload
                    )

                    # Parse courses in department
                    self.parse_courses_in_department(html)

                # return to search page for next iteration
                self.requester.get(Parser.API_URL + '/Entry.action',
                                   parse=False)

    def create_course(self):
        """Push the accumulated ``self.course`` fields into the ingestor."""
        self.ingestor['school'] = 'vandy'
        self.ingestor['campus'] = 1
        self.ingestor['code'] = self.course.get('code')
        self.ingestor['name'] = self.course.get('name')
        self.ingestor['description'] = self.course.get('description', '')
        self.ingestor['num_credits'] = safe_cast(self.course.get('Hours'),
                                                 float,
                                                 default=0.)
        # FIX: was a lazy ``filter`` object (single-use iterator, a Py2
        # leftover); materialize the non-empty attribute names as a list.
        self.ingestor['areas'] = [
            a for a in self.course.get('Attributes', '').split(',') if a
        ]
        self.ingestor['prerequisites'] = self.course.get('Requirement(s)')
        self.ingestor['department_name'] = self.departments.get(
            self.course.get('department')
        )
        self.ingestor['level'] = '0'
        created_course = self.ingestor.ingest_course()
        return created_course

    @staticmethod
    def is_float(f):
        """Return True iff ``f`` can be converted to ``float``."""
        try:
            float(f)
            return True
        except (TypeError, ValueError):
            # FIX: float('abc') raises ValueError, which previously
            # escaped because only TypeError was caught.
            return False

    def create_section(self, created_course):
        """Ingest the current section, or None if it was cancelled."""
        if self.course.get('cancelled'):
            # Reset the flag for the next course pass.
            self.course['cancelled'] = False
            return None
        else:
            self.ingestor['section'] = self.course.get('section')
            self.ingestor['instructors'] = self.course.get('Instructor(s)',
                                                           '')
            self.ingestor['size'] = int(self.course.get('Class Capacity'))
            self.ingestor['enrolment'] = int(
                self.course.get('Total Enrolled'))
            created_section = self.ingestor.ingest_section(created_course)
            return created_section

    def create_offerings(self, created_section):
        """Ingest one meeting per day letter for the current section."""
        if self.course.get('days'):
            for day in list(self.course.get('days')):
                self.ingestor['day'] = day
                self.ingestor['time_start'] = self.course.get('time_start')
                self.ingestor['time_end'] = self.course.get('time_end')
                self.ingestor['location'] = self.course.get('Location')
                self.ingestor.ingest_meeting(created_section)

    def print_course(self):
        """Debug helper: dump the current course map to stdout."""
        for label in self.course:
            try:
                print(label + "::" + self.course[label] + '::')
            except Exception:
                # FIX: was a bare ``except:`` which also swallowed
                # SystemExit/KeyboardInterrupt; handling is unchanged.
                sys.stderr.write("error: UNICODE ERROR\n")
                print(sys.exc_info()[0])

    def update_current_course(self, label, value):
        """Store ``value.strip()`` under ``label`` in the course map."""
        try:
            self.course[label] = value.strip()
        except Exception:
            # FIX: was a bare ``except:``; handling is unchanged.
            print('label:', label, sys.exc_info()[0])
            sys.stderr.write("UNICODE ERROR\n")

    def extract_department_codes(self):
        """Return all department codes; also fill ``self.departments``."""
        # Query Vandy class search website
        soup = self.requester.get(
            Parser.API_URL + '/SearchClasses!input.action', parse=True)

        # Retrieve all departments from dropdown in advanced search
        department_entries = soup.find_all(
            id=re.compile(r"subjAreaMultiSelectOption[0-9]"))

        # Extract department codes from parsed department entries
        department_codes = [de['value'] for de in department_entries]
        for de in department_entries:
            self.departments[de['value']] = de['title']
        return department_codes

    def parse_courses_in_department(self, html):
        """Parse a department's result set, refusing truncated results."""
        # Check number of results isn't over max
        num_hits_search = re.search(r"totalRecords: ([0-9]*),", str(html))
        num_hits = 0
        if num_hits_search is not None:
            num_hits = int(num_hits_search.group(1))

        # 300 is the server's page cap: hitting it means results were
        # truncated and a more targeted search would be needed.
        if num_hits == 300:
            raise CourseParseError('vandy num_hits greater than 300')
        else:
            self.parse_set_of_courses(html)

    def parse_set_of_courses(self, html):
        """Walk result pages until the last class number stops advancing."""
        prev_course_number = 0
        page_count = 1
        while True:
            # Parse page by page
            last_class_number = self.parse_page_of_courses(html)

            # Condition met when reached last page
            if last_class_number == prev_course_number:
                break

            page_count = page_count + 1
            next_page_url = '{}{}{}'.format(
                Parser.API_URL,
                '/SearchClassesExecute!switchPage.action?pageNum=',
                page_count)
            html = self.requester.get(next_page_url)
            prev_course_number = last_class_number

    def parse_page_of_courses(self, html):
        """Parse every course row on a page; return the last class number."""
        courses = html.find_all('tr', {'class': 'classRow'})
        last_class_number = 0
        for course in courses:
            # Flag cancelled classes so create_section can skip them.
            if course.find('a', {'class': 'cancelledStatus'}):
                self.course['cancelled'] = True
            last_class_number = self.parse_course(course)
        return last_class_number

    def parse_course(self, soup):
        """Fetch and ingest one course's detail page; return its number."""
        # Extract course code and term number to generate access to
        # more info. NOTE(review): the second argument is a *set*
        # literal, not a dict — BeautifulSoup treats a non-dict attrs
        # value as a CSS-class filter; left untouched to preserve the
        # original matching behavior.
        details = soup.find('td', {'class', 'classSection'})['onclick']

        # Extract course number and term code from the onclick handler.
        search = re.search(
            r"showClassDetailPanel.fire\({classNumber : '([0-9]*)', "
            r"termCode : '([0-9]*)',",
            details)
        course_number, term_code = search.group(1), search.group(2)

        # Base URL to retrieve detailed course info
        course_details_url = Parser.API_URL \
            + '/GetClassSectionDetail.action'

        # Create payload to request course from server
        payload = {
            'classNumber': course_number,
            'termCode': term_code
        }

        try:
            self.parse_course_details(
                self.requester.get(course_details_url, payload))

            # Create models
            created_section = self.create_section(self.create_course())
            if created_section:
                self.create_offerings(created_section)

            # Clear course map for next pass
            self.course.clear()
        except ParseError:
            print('invalid course, parse exception')
        return course_number

    def parse_course_details(self, html):
        """Populate the course map from a course-detail dialog page.

        Raises:
            ParseError: if the course abbreviation is not DEPT-ID-SECTION.
        """
        # Extract course name and abbreviation details
        search = re.search(
            r"(.*):.*\n(.*)",
            html.find(id='classSectionDetailDialog').find('h1').text)
        courseName, abbr = search.group(2), search.group(1)

        # Extract department code, catalog ID, and section number from abbr
        title = re.match(r"(\S*)-(\S*)-(\S*)", abbr)
        if not title:
            raise ParseError()
        department_code = title.group(1)
        catalog_id = title.group(2)
        section_number = title.group(3)

        if self.verbosity > 2:
            print('\t-', department_code, catalog_id,
                  section_number.strip(), '-')

        self.update_current_course("name", courseName)
        self.update_current_course("code",
                                   department_code + '-' + catalog_id)
        self.update_current_course("department", department_code)
        self.update_current_course("Catalog ID", catalog_id)
        self.update_current_course(
            'section', '(' + section_number.strip() + ')')

        # in case no description for course
        self.update_current_course('description', '')

        # Deal with course details as subgroups seen on details page
        detail_headers = html.find_all('div', {'class': 'detailHeader'})
        detail_panels = html.find_all('div', {'class': 'detailPanel'})

        # NOTE: there should be equal detail headers and detail panels
        assert(len(detail_headers) == len(detail_panels))

        for detail_header, detail_panel in zip(detail_headers,
                                               detail_panels):
            # Choose parsing strategy dependent on header
            header = detail_header.text.strip()
            if header == "Details" or header == "Availability":
                self.parse_labeled_table(detail_panel)
            elif header == "Description":
                self.parse_description(detail_panel)
            elif header == "Notes":
                self.parse_notes(detail_panel)
            elif header == "Meeting Times":
                self.parse_meeting_times(detail_panel)
            elif header == "Cross Listings":
                pass
            elif header == "Attributes":
                self.parse_attributes(detail_panel)
            elif header == "Ad Hoc Meeting Times":
                pass

    def parse_attributes(self, soup):
        """Store the comma-joined list of attribute labels."""
        labels = [l.text.strip()
                  for l in soup.find_all('div', {'class': 'listItem'})]
        self.update_current_course("Attributes", ', '.join(labels))

    def parse_labeled_table(self, soup):
        """Parse a label/value table into the course map."""
        # Gather all labeled table entries
        labels = soup.find_all('td', {'class': 'label'})
        for label in labels:
            siblings = label.find_next_siblings()

            # Check if label value exists
            if len(siblings) != 0:
                # Extract pure label from html (drop trailing ':')
                key = label.text[:-1].strip()

                # Extract label's value(s)
                # [deals with multiline multi-values]
                values = [l for l in
                          (line.strip()
                           for line in siblings[0].text.splitlines())
                          if l]

                # Edge cases
                if key == "Books":
                    # Book URL intentionally not scraped; placeholder kept.
                    values = ["<long bn url>"]
                elif key == "Hours":
                    values[0] = str(safe_cast(values[0], float, default=0.))
                self.update_current_course(key, ', '.join(values))

    def parse_meeting_times(self, soup):
        """Parse the meeting-times table into the course map."""
        # Gather all labeled table entries
        labels = soup.find_all('th', {'class': 'label'})

        values = []
        if len(labels) > 0:
            values = soup.find(
                'tr', {'class': 'courseHeader'}
            ).find_next_siblings()[0].find_all('td')
        else:
            # Create empty times slots
            self.update_current_course('days', '')
            self.update_current_course('time_start', '')
            self.update_current_course('time_end', '')

        # NOTE: number of labels and values should be the same
        assert(len(labels) == len(values))

        for i in range(len(labels)):
            label = labels[i].text.strip()
            value = values[i].text.strip()

            if len(label) > 0 and len(value) > 0:
                if label == "Instructor(s)":
                    self.update_current_course(
                        label,
                        ', '.join(self.extract_instructors(value)))
                elif label == "Time":
                    self.parse_time_range(value)
                elif label == "Days":
                    self.parse_days(value)
                else:
                    self.update_current_course(label, value)

    def parse_days(self, unformatted_days):
        """Normalize a days string; 'TBA' collapses to empty."""
        if unformatted_days == "TBA" or unformatted_days == "":
            self.update_current_course("days", "")
        else:
            self.update_current_course("days", unformatted_days)

    def parse_time_range(self, unformatted_time_range):
        """Split 'start - end' into 24h time_start / time_end fields."""
        if unformatted_time_range == "TBA" or unformatted_time_range == "":
            # Create empty time slots
            self.update_current_course('days', '')
            self.update_current_course('time_start', '')
            self.update_current_course('time_end', '')
        else:
            search = re.match(r"(.*) \- (.*)", unformatted_time_range)
            if search is not None:
                self.update_current_course(
                    'time_start',
                    self.extractor.time_12to24(search.group(1)))
                self.update_current_course(
                    'time_end',
                    self.extractor.time_12to24(search.group(2)))
            else:
                print('ERROR: invalid time format', file=sys.stderr)

    def extract_instructors(self, string):
        """Return instructor names, stripping the '(Primary)' marker."""
        instructors = string.splitlines()
        for i in range(len(instructors)):
            # Deal with instance of primary instructor
            search = re.match(r"(.*) \(Primary\)", instructors[i])
            if search is not None:
                instructors[i] = search.group(1)
        return instructors

    def parse_notes(self, soup):
        """Append the Notes panel text to the course description."""
        notes = ' '.join(
            [l for l in (p.strip() for p in soup.text.splitlines()) if l]
        ).strip()
        self.update_current_course(
            'description',
            self.course.get('description') + '\nNotes: ' + notes)

    def parse_description(self, soup):
        """Store the Description panel text verbatim (stripped)."""
        self.update_current_course('description', soup.text.strip())