def __init__(self, coursecode, connection, cycle, term, requisitiontype):
    self.coursecode = coursecode
    self.crawler = Crawler()
    self.connection = connection
    self.cycle = cycle
    self.term = term
    self.requisitiontype = requisitiontype
def __init__(self):
    self.connection = MySQLConnection()
    self.timeperiod = None
    self.cycle = None
    self.coursereader = None
    self.offerreader = None
    self.crawler = Crawler()
    self.offerreader = OfferReader(self.connection, self.cycle,
                                   self.timeperiod)
def testItReturnsTheCrawledUrls(self):
    crawler = Crawler()
    urlsToCrawl = ['http://google.se', 'http://aftonbladet.se']
    for url in urlsToCrawl:
        crawler.add_to_crawl(url)
    result = crawler.crawl()
    self.assertEqual(
        urlsToCrawl, result,
        'Not all urls that were supposed to be crawled were crawled.')
def main():
    """An example of how the search engine could be used."""
    seed = [
        'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d01.html',
        'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d06.html',
        'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d08.html'
    ]

    # Instantiate the crawler.
    crawler = Crawler()

    # Start the crawler with the seed.
    crawler.start_crawling(seed)

    # Access the pages.
    pages = crawler.pages

    # Print the content of the pages.
    print(pages)

    # Print the link structure.
    link_structure_txt = pages.get_link_structure_text()
    print(link_structure_txt)

    # Creation and printing of the index.
    indexer = Indexer()
    indexer.index_pages(pages)
    index = indexer.index
    print(index)

    # Calculation and printing of the page rank.
    pagerank = Page_Rank()
    pagerank.fill_matrix(crawler)
    pagerank.calculate_probabilities(0.05, 0.95)
    pagerank.calculate_page_rank(0.04)
    print()

    # Scoring.
    example_queries = ['tokens', 'index', 'classification',
                       'tokens classification']
    analyzer = CosinusAnalyzer(index, pages)
    print(analyzer.get_length_of_pages_text())

    # Cosine scoring.
    print(StringUtil.header('cosine_scores.txt'))
    for query in example_queries:
        hits = analyzer.analyze(query)
        print(hits)
        print()

    # Cosine scoring combined with the page rank.
    print(StringUtil.header('Cosine combined with Page Rank'))
    for query in example_queries:
        hits = analyzer.analyze(query, combine_with_page_rank=True)
        print(hits)
        print()
def test_get_children(self):
    page = "http://www.taste.com.au/"
    # Exclude depth for now
    depth = 3
    # children = 3
    crawler = Crawler(page, depth)
    crawler.run()
    # Should have depth*children urls from the crawl result
    self.assertGreater(len(crawler.urls), 1)
    print(crawler.urls)
class CourseReader(object):
    """Object that reads the page of a course with a given coursecode and
    inserts all the relevant information in the database"""

    def __init__(self, coursecode, connection, cycle, term, requisitiontype):
        self.coursecode = coursecode
        self.crawler = Crawler()
        self.connection = connection
        self.cycle = cycle
        self.term = term
        self.requisitiontype = requisitiontype

    def scancoursepage(self):
        match = Course.find(courseCode_equal=self.coursecode)
        urlstart = 'https://uspdigital.usp.br/jupiterweb/obterDisciplina'
        parameters = {'sgldis': str(self.coursecode)}
        completeurl = appendparameters(urlstart, parameters)
        self.crawler.loadpage(completeurl)
        name = self.findname()
        startdate = self.findstartdate()
        if match:
            course = match[0]
        else:
            course = Course(self.coursecode, name, startdate)
            course.store()
        idealtermmatch = IdealTermCourse.find(
            idCycle=self.cycle.idCycle, term=self.term,
            idCourse=course.idCourse, requisitionType=self.requisitiontype)
        if not idealtermmatch:
            idealterm = IdealTermCourse(self.cycle.idCycle, self.term,
                                        startdate, self.requisitiontype,
                                        course)
            idealterm.store()
        else:
            idealterm = idealtermmatch[0]
        return course

    def findname(self):
        namedata = self.crawler.htmlpage.findAll('b', {})
        # The name will be on the third font tag found
        completename = getwhatisbetweenthetags(unicode(namedata[3]))
        name = completename[22:]  # Disciplina: XXXXXXX - name
        return name

    def findstartdate(self):
        startdatedict = {'class': 'txt_arial_8pt_gray'}
        startdatedata = self.crawler.htmlpage.findAll('span', startdatedict)
        # The startdate will be on the fourth tag found
        completedata = getwhatisbetweenthetags(unicode(startdatedata[4]))
        data = removewhitespaces(completedata)
        data = convertdate(data)
        return data
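# A minimal usage sketch, not from the original sources: it mirrors how
# CycleReader.startreadingoffers drives a CourseReader. The `connection`,
# `cycle`, `term` and `requisitiontype` values are assumed to be supplied by
# the surrounding application.
def read_single_course(coursecode, connection, cycle, term, requisitiontype):
    reader = CourseReader(coursecode, connection, cycle, term,
                          requisitiontype)
    # scancoursepage() crawls the JupiterWeb page, stores the Course and its
    # IdealTermCourse entry when they are new, and returns the Course object.
    return reader.scancoursepage()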
def start():
    es = create_es_connection(
        config['database']['host'],
        config['database']['port'],
        config['database']['access_key'],
        config['database']['secret_key'],
        config['database']['region'])
    threads = []
    for search in config['searches']:
        ed = ElasticDriver(es, search['name'])
        i = 1
        for twitterAccount in search['twitterAccounts']:
            td = TwitterDriver(
                search['keywords'], ed, search['sensitivity'],
                twitterAccount['consumer_key'],
                twitterAccount['consumer_secret'],
                twitterAccount['access_token_key'],
                twitterAccount['access_token_secret']
            )
            threads.append(Crawler(f'{search["name"]}-{i}',
                                   search['keywords'], es, td))
            i += 1
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
def test_crawler_recurses(self):
    # Arrange
    html = """
    <html><body><a href="http://testurl.com/testpage.html">Link text</a></body></html>
    """
    initial_url = 'http://www.initialurl.com/'
    mock_urllib = create_autospec(urllib2)
    crawler = Crawler(mock_urllib)

    # Act
    crawler.crawl([initial_url])

    # Assert
    expected_calls = [call.urlopen(initial_url),
                      call.urlopen('http://testurl.com/testpage.html')]
    mock_urllib.assert_has_calls(expected_calls)
def scrape_documents(
        min_count=0, url_seeds=GlobalConfiguration.DEFAULT_URL_SEEDS):
    doc_count = 0
    s = Crawler(url_seeds)
    docs = s.crawl(min_count)
    while min_count <= 0 or doc_count < min_count:
        for doc in docs:
            temp_file = get_new_file()
            pickle.dump(doc, temp_file)
            temp_file.close()
            log.debug('saved image doc from %s', doc.url)
            doc_count += 1
            if doc_count % 100 == 0:
                log.info('%d images and counting...', doc_count)
    log.info('finished indexing images.')
    log.info('%d documents indexed', doc_count)
class FacultyReader(object):
    """A reader object which will use a crawler to scan through the cycles
    of a faculty"""

    def __init__(self):
        self.connection = None
        self.timeperiod = None
        self.faculty = None
        self.crawler = Crawler()

    @staticmethod
    def initwithconnection(host, user, password, database):
        """Returns a new FacultyReader object with its connection already
        configured using the host, user, password and database name provided
        as parameters. Additionally, it configures the charset to be used as
        unicode"""
        facultyreader = FacultyReader()
        facultyreader.connection = MySQLdb.connect(host=host, user=user,
                                                   passwd=password,
                                                   db=database,
                                                   use_unicode=True,
                                                   charset='utf8')
        return facultyreader

    def settimeperiod(self, idtimeperiod):
        "Sets the timeperiod of this cycle by providing its id"
        self.timeperiod = idtimeperiod

    def setfaculty(self, idfaculty):
        "Sets the faculty of this reader by searching for it in the bank"
        self.faculty = Faculty.pickById(idfaculty)

    def startreading(self):
        """Starts scanning through the Faculty's page and iterates through
        each of its cycles"""
        urlstart = 'https://uspdigital.usp.br/jupiterweb/jupCursoLista'
        parameters = {'codcg': self.faculty.idFaculty, 'tipo': 'N'}
        completeurl = appendparameters(urlstart, parameters)
        self.crawler.loadpage(completeurl)
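# A minimal usage sketch, assuming the FacultyReader above is importable; the
# database credentials, faculty id and time-period id are placeholders chosen
# by the caller and are not part of the original snippet.
def read_faculty(host, user, password, database, idfaculty, idtimeperiod):
    facultyreader = FacultyReader.initwithconnection(host, user, password,
                                                     database)
    facultyreader.setfaculty(idfaculty)
    facultyreader.settimeperiod(idtimeperiod)
    # Loads the JupiterWeb page listing the faculty's cycles.
    facultyreader.startreading()
    return facultyreader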
from crawler.Crawler import Crawler

if __name__ == '__main__':
    crawler = Crawler()
    crawler.crawl('http://www.prestigetime.com/')
class CycleReader(object):
    """A reader object which will use a crawler to scan through the page
    of a cycle"""

    def __init__(self):
        self.connection = MySQLConnection()
        self.timeperiod = None
        self.cycle = None
        self.coursereader = None
        self.offerreader = None
        self.crawler = Crawler()
        self.offerreader = OfferReader(self.connection, self.cycle,
                                       self.timeperiod)

    def settimeperiod(self, idtimeperiod):
        "Sets the timeperiod of this cycle by providing its id"
        self.timeperiod = idtimeperiod
        self.offerreader.timeperiod = TimePeriod.pickById(idtimeperiod)

    def setcycle(self, idcycle):
        "Sets the cycle of this reader by searching for the cycle in the bank"
        self.cycle = Cycle.pickById(idcycle)
        self.offerreader.cycle = self.cycle

    def startreadingcycles(self):
        """Starts scanning through the Cycle's page and iterates through each
        of its courses. This function will not read 'Ciclo Básico' or
        'Grande Área'"""
        urlstart = 'https://uspdigital.usp.br/jupiterweb/listarGradeCurricular'
        codcg = self.findidfaculty()
        codcurlist = self.findcodcur()
        codhab = str(self.cycle.cycleCode)
        for codcur in codcurlist:
            parameters = {'codcg': str(codcg), 'codcur': str(codcur),
                          'codhab': codhab, 'tipo': 'N'}
            completeurl = appendparameters(urlstart, parameters)
            self.crawler.loadpage(completeurl)
            coursecodedata = self.crawler.htmlpage.findAll('table', {})
            coursecodedata = coursecodedata[1]  # The table with the courses
            codes = getcoursecodes(coursecodedata)
            if codes:
                break
        return self.startreadingoffers(codes)

    def findidfaculty(self):
        "Returns the id of the faculty that has this cycle"
        relations = 'rel_courseCoordination_cycle r1, '\
                    'rel_courseCoordination_faculty r2 '
        conditions = 'WHERE r1.idCourseCoordination = r2.idCourseCoordination'\
                     ' AND r1.idCycle = ' + str(self.cycle.idCycle)
        query = 'SELECT idFaculty FROM ' + relations + conditions
        results = self.connection.execute(query)
        try:
            idfaculty = results[0][0]
        except IndexError:
            raise IndexError('Could not find idFaculty; check '
                             'rel_courseCoordination_cycle and '
                             'rel_courseCoordination_faculty with idCycle = '
                             + str(self.cycle.idCycle))
        return idfaculty

    def findcodcur(self):
        """Returns the idAcademicProgram representing the code for this
        object's cycle"""
        query = 'SELECT idAcademicProgram FROM rel_academicProgram_cycle '\
                'WHERE idCycle = ' + str(self.cycle.idCycle)
        codcurall = self.connection.execute(query)
        listcodcur = []
        for codcur in codcurall:
            listcodcur.append(codcur[0])
        return listcodcur

    def startreadingoffers(self, coursecodes):
        "Using the coursecodes list, reads all the offers from each code"
        requisitiontypetranslationdict = {0: 1, 1: 2, 2: 3}
        # In the bank: 1 - Obrigatória (mandatory), 2 - Eletiva (elective),
        # 3 - Livre (free)
        index = 0
        offers = []
        while index < len(coursecodes):
            for period in coursecodes[index]:
                for code in coursecodes[index][period]:
                    if code in COURSES_TO_IGNORE:
                        continue
                    reader = CourseReader(
                        code, self.connection, self.cycle, period,
                        requisitiontypetranslationdict[index])
                    course = reader.scancoursepage()
                    self.offerreader.setcourse(course)
                    offers.extend(self.offerreader.scancourseofferpage())
            index += 1
        return offers
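# A minimal usage sketch, assuming the CycleReader above is importable; the
# cycle and time-period ids are placeholders chosen by the caller and are not
# part of the original snippet.
def read_cycle_offers(idcycle, idtimeperiod):
    cyclereader = CycleReader()
    cyclereader.setcycle(idcycle)
    cyclereader.settimeperiod(idtimeperiod)
    # Crawls the curriculum page and returns the offers found for the cycle.
    return cyclereader.startreadingcycles()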
def __init__(self, connection, cycle, timeperiod):
    self.connection = connection
    self.timeperiod = timeperiod
    self.cycle = cycle
    self.crawler = Crawler()
    self.course = None
class OfferReader(object):
    """A reader object which will use a crawler to scan through the page
    describing the offers of a course"""

    def __init__(self, connection, cycle, timeperiod):
        self.connection = connection
        self.timeperiod = timeperiod
        self.cycle = cycle
        self.crawler = Crawler()
        self.course = None

    def setcourse(self, course):
        "Sets the course which will be scanned"
        if not isinstance(course, Course):
            raise Exception('Invalid course parameter')
        self.course = course

    def scancourseofferpage(self):
        "Scans the page of the course, adding new offers to the bank"
        urlstart = 'https://uspdigital.usp.br/jupiterweb/obterTurma'
        fullurl = appendparameters(urlstart,
                                   {'sgldis': self.course.courseCode})
        self.crawler.loadpage(fullurl)
        offers = []
        if re.match(r'[\S\s]*Horário[\S\s]*', str(self.crawler.htmlpage)):
            offersdata = self.getrelevantdata()
            offers.extend(self.createoffers(offersdata))
        return offers

    def checkprofessor(self, professorname):
        """Checks if the professor with the name passed as argument is
        already in the bank and, if it isn't, stores a new professor.
        After that, returns the Professor object.
        If the professor name is empty, there is a special entry in the bank
        for this case"""
        professor = Professor.find(name_equal=professorname)
        if not professor:
            if not professorname:
                professor = Professor.find(name_equal=NO_PROFESSOR_STRING)[0]
            else:
                professor = Professor(professorname)
                professor.store()
        else:
            professor = professor[0]
        return professor

    def checkschedule(self, sched):
        """Checks if the schedule object passed as sched is already in the
        bank and, if it isn't, stores a new schedule"""
        if not sched.idSchedule:
            sched.store()

    def generateoffer(self, classn_pract, prof_sched_info, professorname):
        """Creates a new Offer object with the information about the class
        number and practical (classn_pract), the professor and the schedules
        (prof_sched_info); note that the professor that will be associated
        with this offer has its name in professorname"""
        professor = self.checkprofessor(professorname)
        offer = Offer(self.timeperiod, self.course, classn_pract[0],
                      classn_pract[1], professor)
        if not isinstance(prof_sched_info[1], list):
            prof_sched_info[1] = [prof_sched_info[1]]
        for sched in prof_sched_info[1]:
            self.checkschedule(sched)
        offer.setSchedules(prof_sched_info[1])
        return offer

    def createoffers(self, offersdata):
        "Returns a list of offers to treat"
        offerslist = []
        for offerdata in offersdata:
            classn_pract = getclassnunmberandpractical(offerdata[0])
            prof_sched = getprofessorandschedule(offerdata[1])
            if not prof_sched:
                continue
            prof_sched = organize(prof_sched)
            for prof_sched_info in prof_sched:
                for professorname in prof_sched_info[0]:
                    offer = self.generateoffer(classn_pract, prof_sched_info,
                                               professorname)
                    offerslist.append(offer)
        return offerslist

    def getrelevantdata(self):
        """Returns a list of tuples with tag objects, each one containing
        information of one offer related to a course. The first element of
        the tuple has the classnumber and practical information, the second
        element has the professor and schedule data"""
        tables = self.crawler.htmlpage.findAll('table')[1]
        tables = tables.findAll('table')[0]
        tables = tables.findAll('table')[5]
        tables = tables.findAll('table')
        for table in tables:
            if re.match(r'[\S\s]*Didáticas[\S\s]*', str(table)):
                tables.remove(table)
        # Some irrelevant data will appear with strides of size 3 in this list
        tables = removefromlistwithstridesize(tables, 2, 2)
        return group(tables, 2)
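# A minimal usage sketch, assuming the OfferReader above is importable and
# that `connection`, `cycle`, `timeperiod` and `course` are objects already
# loaded by the surrounding application (assumptions, not part of the
# original snippet). It mirrors how CycleReader drives an OfferReader.
def read_offers_for_course(connection, cycle, timeperiod, course):
    offerreader = OfferReader(connection, cycle, timeperiod)
    offerreader.setcourse(course)  # must be a Course instance
    # Returns the list of Offer objects scraped from the course's offer page.
    return offerreader.scancourseofferpage()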
def __init__(self):
    db = get_session()
    truncate_db(db)
    config = ProjectConfig(
        "A:/Development/magistrska/DependencyDiff/configs/fri.json",
        "A:/Development/magistrska/DependencyDiff/configs/fri.results.json")
    for old, new in zip(self.examples, self.examples[1:]):
        self.old_hash, self.old_url = old
        self.new_hash, self.new_url = new
        old_page = Page.get_or_create(db, self.project_name, self.old_hash,
                                      "https://www.fri.uni-lj.si/en/")
        old_page.url = self.old_url
        new_page = Page.get_or_create(db, self.project_name, self.new_hash,
                                      self.new_url)
        old_crawler = Crawler(old_page, "443")
        new_crawler = Crawler(new_page, "443")
        old_crawler.get_page(db, old_page, config, self.old_hash)
        new_crawler.get_page(db, new_page, config, self.new_hash)
        old_content = (old_page.contents[0].content
                       if len(old_page.contents) > 0 else "")
        new_content = (new_page.contents[0].content
                       if len(new_page.contents) > 0 else "")
        compare_result = Compare.compare(old_content, new_content)
        if compare_result:
            element_diff = Compare.extract_differences(compare_result)
            for element in element_diff:
                old_diff = old_crawler.screenshot(element)
                new_diff = new_crawler.screenshot(element)
                DbDiff.get_or_create(db, old_page.id, new_page.id, element,
                                     old_diff[1], new_diff[1],
                                     old_diff[0], new_diff[0])
        for old_action in old_page.actions:
            for new_action in new_page.actions:
                if (old_action.element == new_action.element
                        and old_action.type == new_action.type):
                    old_action_content = old_action.content
                    new_action_content = new_action.content
                    if (old_content != old_action_content
                            or new_content != new_action_content):
                        compare_result = Compare.compare(old_action_content,
                                                         new_action_content)
                        if compare_result:
                            element_diff = Compare.extract_differences(
                                compare_result)
                            if element_diff:
                                old_crawler.visit_and_action(old_page,
                                                             old_action,
                                                             config)
                                new_crawler.visit_and_action(new_page,
                                                             new_action,
                                                             config)
                                for element in element_diff:
                                    old_diff = old_crawler.screenshot(element)
                                    new_diff = new_crawler.screenshot(element)
                                    DbDiff.get_or_create(
                                        db, old_page.id, new_page.id, element,
                                        old_diff[1], new_diff[1],
                                        old_diff[0], new_diff[0],
                                        new_action.id)
        old_page.url = "https://www.fri.uni-lj.si/en/"
        new_page.url = "https://www.fri.uni-lj.si/en/"
        db.commit()
if __name__ == '__main__':
    logger = Logger()
    logger.log(Logger.INFO, "Starting Drei")
    print("Starting Drei")

    # Initializing database with tables (if necessary)
    InitTables.main()

    # Initialize the message queue for crawler and manager
    manager_queue = multiprocessing.Queue()

    # Initialize the message queue for manager and webserver
    manager_webserver_queue = multiprocessing.Queue()

    # Start the crawler
    crawler = Crawler(manager_queue, platform.machine() != Const.PI_PLATFORM)
    crawler.start()

    # Start the manager
    manager = Manager(manager_queue, manager_webserver_queue)
    manager.start()

    # Start the webserver
    webserver = Webserver(manager_webserver_queue, manager_queue)
    webserver.start()

    # Wait until both processes end
    manager_queue.close()
    manager_queue.join_thread()
    crawler.join()
    manager.join()
def __init__(self):
    self.connection = None
    self.timeperiod = None
    self.faculty = None
    self.crawler = Crawler()
def __init__(self, base_page, crawl_depth, children):
    Crawler.__init__(self, base_page, crawl_depth, children)
    # super(self).__init__()
    # super.__init__(self, base_page, crawl_depth, children)
    self.scrapable_urls = []
def compare_pages(self):
    success = self.setup_project(self.old.hash)
    if not success:
        print(f"failed diff deploying {self.old.hash}")
        return
    success = self.setup_project(self.new.hash)
    if not success:
        print(f"failed diff deploying {self.new.hash}")
        return
    for old in self.old.pages:
        for new in self.new.pages:
            if new.url == old.url:
                print("diff pages: ", new.url, old.url)
                old_crawler = None
                new_crawler = None
                old_content = (old.contents[0].content
                               if len(old.contents) > 0 else "")
                new_content = (new.contents[0].content
                               if len(new.contents) > 0 else "")
                exists = PageDiff.exists(self.db, old.id, new.id)
                if not exists:
                    compare_result = Compare.compare(old_content, new_content)
                    if compare_result:
                        PageDiff.get_or_create(self.db, old.id, new.id,
                                               compare_result)
                        element_diff = Compare.extract_differences(
                            compare_result)
                        if element_diff:
                            old_crawler = Crawler(
                                Page.get_or_create(
                                    self.db,
                                    self.project_config.project_name,
                                    self.old.hash,
                                    Url.clean_url(Constants.DOCKER_URL)),
                                self.projects[self.old.hash].port)
                            new_crawler = Crawler(
                                Page.get_or_create(
                                    self.db,
                                    self.project_config.project_name,
                                    self.new.hash,
                                    Url.clean_url(Constants.DOCKER_URL)),
                                self.projects[self.new.hash].port)
                            old_crawler.visit_page(old, self.project_config)
                            new_crawler.visit_page(new, self.project_config)
                            for element in element_diff:
                                old_diff = old_crawler.screenshot(element)
                                new_diff = new_crawler.screenshot(element)
                                DbDiff.get_or_create(
                                    self.db, old.id, new.id, element,
                                    old_diff[1], new_diff[1],
                                    old_diff[0], new_diff[0])
                if old_crawler is None:
                    old_crawler = Crawler(
                        Page.get_or_create(
                            self.db, self.project_config.project_name,
                            self.old.hash,
                            Url.clean_url(Constants.DOCKER_URL)),
                        self.projects[self.old.hash].port)
                if new_crawler is None:
                    new_crawler = Crawler(
                        Page.get_or_create(
                            self.db, self.project_config.project_name,
                            self.new.hash,
                            Url.clean_url(Constants.DOCKER_URL)),
                        self.projects[self.new.hash].port)
                visited = False
                for old_action in self.windowed_query(old.actions, Action.id,
                                                      1000):
                    for new_action in self.windowed_query(new.actions,
                                                          Action.id, 10):
                        if (old_action.element == new_action.element
                                and old_action.type == new_action.type):
                            old_action_content = old_action.content
                            new_action_content = new_action.content
                            if (old_content != old_action_content
                                    or new_content != new_action_content):
                                exists = DbDiff.exists(self.db, old.id,
                                                       new.id, new_action.id)
                                if exists:
                                    continue
                                compare_result = Compare.compare(
                                    old_action_content, new_action_content)
                                if not compare_result:
                                    continue
                                element_diff = Compare.extract_differences(
                                    compare_result)
                                if not element_diff:
                                    continue
                                old_crawler.visit_and_action(
                                    old, old_action, self.project_config)
                                new_crawler.visit_and_action(
                                    old, new_action, self.project_config)
                                for element in element_diff:
                                    old_diff = old_crawler.screenshot(element)
                                    new_diff = new_crawler.screenshot(element)
                                    DbDiff.get_or_create(
                                        self.db, old.id, new.id, element,
                                        old_diff[1], new_diff[1],
                                        old_diff[0], new_diff[0],
                                        new_action.id)
    return True
def testItCanCrawl(self):
    crawler = Crawler()
    crawler.add_to_crawl('http://google.se')
    crawler.crawl()