def populateCurrentSemesters(semesters, url, maxSemesters): response = utils.getHTML(url, "current semesters") print "Parsing for current semesters..." htmlComment = "<!--" fall = "fall-class-schedules.php" spring = "spring-class-schedules.php" summer = "summer-class-schedules.php" currentSemester = False for line in response: line = re.sub("(<!--.*?-->)", "", line) if htmlComment in line: continue if len(semesters) == maxSemesters: break if "<h3>" in line: if "Current Semester" in line: currentSemester = True else: currentSemester = False if fall in line: name = utils.extractInfoFromLine(line, '', '-class-schedules.php">', '</a>') endpoint = "fall" semesters.append({"name" : name, "endpoint" : endpoint, "current" : currentSemester}) elif spring in line: name = utils.extractInfoFromLine(line, '', '-class-schedules.php">', '</a>') endpoint = "spring" semesters.append({"name" : name, "endpoint" : endpoint, "current" : currentSemester}) elif summer in line: name = utils.extractInfoFromLine(line, '', '-class-schedules.php">', '</a>') endpoint = "summer" semesters.append({"name" : name, "endpoint" : endpoint, "current" : currentSemester}) print "Successfully parsed current semesters."
def populateEvents(events, url): response = utils.getHTML(url, "events") print "Parsing event data..." eventCount = 0 while True: eventItem = utils.extractCourseInfo(response, '<div class="calendarentry">', '</div>') if not eventItem: break title = utils.extractInfo(eventItem, '<h4>', '<h4>', '</h4>') date = utils.extractInfo(eventItem, 'Date:', 'Date:</b>', '<br>') location = utils.extractInfo(eventItem, 'Location:', 'Location:</b>', '</p>') for line in eventItem: if '<p><p>' in line and '</p></p>' in line: description = utils.extractInfo(eventItem, '<p><p>', '<p><p>', '</p></p>') break elif '<p><p>' in line: description = line break else: description = None if not filter(lambda event: event['description'] == description, events): events.append({"title": title}) events[-1]["id"] = str(eventCount) events[-1]["date"] = date events[-1]["location"] = location events[-1]["description"] = description eventCount += 1 print "Successfully parsed", eventCount, "events.\n"
def getAllProductsOfType(productType, logfn):
    """Return a Product for every tease-product tile on the South London
    Gallery product-tag page for `productType`."""
    url = f"https://www.southlondongallery.org/product-tag/{productType}/"
    logfn(url)
    page = getHTML(url)
    products = []
    for tile in page.find_all(class_="tease-product"):
        products.append(Product(createID(), tile.a["href"], tile.a.div.img["src"]))
    return products
def _acknowledge(query):
    # Answer the callback query (clears the client-side spinner) and show a
    # "typing" chat action before the reply is sent.
    bot.answer_callback_query(query.id)
    bot.send_chat_action(query.message.chat.id, 'typing')


def iq_callback(query):
    """Dispatch an inline-keyboard button press to the matching handler.

    The 'bal'/'refreshbal' and 'trans'/'refreshtrans' pairs were previously
    duplicated branch-for-branch; they are merged here. A stray debug print
    of the transactions HTML has been removed. Unknown callback data is
    ignored, as before.
    """
    data = query.data
    chat_id = query.message.chat.id
    if data in ('bal', 'refreshbal'):
        items = get_user_balances(query)
        _acknowledge(query)
        bot.send_message(chat_id, getHTML('balances', items),
                         reply_markup=get_back_keyboard("bal"),
                         parse_mode='HTML')
    elif data in ('trans', 'refreshtrans'):
        items = get_user_transactions(query)
        _acknowledge(query)
        bot.send_message(chat_id, getHTML('transactions', items),
                         reply_markup=get_back_keyboard("trans"),
                         parse_mode='HTML')
    elif data == 'defi':
        _acknowledge(query)
        bot.send_message(chat_id, "Choose a Defi Platform",
                         reply_markup=get_defi_keyboard("send"),
                         parse_mode='HTML')
    elif data == 'setAddress':
        _acknowledge(query)
        set_user_address_handler(query.message)
    elif data == "home":
        _acknowledge(query)
        exchange_command(query.message)
def getAllProductsOfType(productType, logfn):
    """Return a Product for every Item--product tile in the Nottingham
    Contemporary shop collection `productType`."""
    root = "https://nottinghamcontemporary.shop"
    pageUrl = f"{root}/{productType}"
    logfn(pageUrl)
    page = getHTML(pageUrl)
    return [
        Product(createID(),
                f"{root}{tile.a['href']}",
                f"https:{extractImageUrlFromStyle(tile.a.div.div.div['style'])}")
        for tile in page.find_all(class_="Item--product")
    ]
def getPageOfProducts(productType, pageNum, logfn):
    """Fetch one page of National Gallery shop products and return them as
    Product objects."""
    time.sleep(0.5)  # throttle successive page requests
    root = "https://www.nationalgallery.co.uk"
    pageUrl = f"{root}/products/{productType}?orderBy=&VIEW_SIZE=100&VIEW_INDEX={pageNum}"
    logfn(pageUrl)
    page = getHTML(pageUrl)
    return [
        Product(createID(),
                f"{root}{wrap.a['href']}",
                f"{root}{wrap.a.div.img['src']}")
        for wrap in page.find_all(class_="productwrap")
    ]
def getPageOfProducts(pageNum, logfn):
    """Fetch one page of the ICA bookstore frontpage collection and return
    the items as Product objects."""
    time.sleep(0.5)  # throttle successive page requests
    root = "https://ica-bookstore.myshopify.com"
    pageUrl = f"{root}/collections/frontpage?page={pageNum}"
    logfn(pageUrl)
    page = getHTML(pageUrl)
    return [
        Product(createID(),
                f"{root}{tile.a['href']}",
                f"https:{tile.div.img['src']}")
        for tile in page.find_all(class_="grid-view-item")
    ]
def getPageOfProducts(pageNum, logfn):
    """Fetch one page of the Design Museum shop catalogue and return the
    items as Product objects."""
    time.sleep(0.5)  # throttle successive page requests
    root = "https://designmuseumshop.com"
    pageUrl = f"{root}/collections/all?page={pageNum}"
    logfn(pageUrl)
    page = getHTML(pageUrl)
    # Products live inside the #product-loop container.
    tiles = page.find(id="product-loop").find_all(class_="product-index-inner")
    return [
        Product(createID(),
                f"{root}{tile.a['href']}",
                f"https:{tile.a.img['src']}")
        for tile in tiles
    ]
def populateSubjects(subjects, url): response = utils.getHTML(url, "subjects") print "Parsing the subjects..." subjectCount = 0 lindex = "</b></b><center><h3>" rindex = "</h3>" for line in response: if lindex in line and rindex in line: subject = line[line.index(lindex) + len(lindex):line.rindex(rindex)] subjectAbbrev = subject[subject.find("(") + 1:subject.find(")")] subjectReadable = subject[0:subject.find("(")].rstrip(" ") subjects[subjectAbbrev] = subjectReadable subjectCount += 1 print "Successfully added ", subjectCount, " subjects.\n"
def getPageOfProducts(productType, pageNum, logfn):
    """Fetch one page of Royal Academy shop products and return them as
    Product objects."""
    time.sleep(0.5)  # throttle successive page requests
    root = "https://shop.royalacademy.org.uk"
    pageUrl = f"{root}/{productType}?p={pageNum}"
    logfn(pageUrl)
    page = getHTML(pageUrl)
    return [
        Product(createID(), photo["href"], photo.span.span.img["data-src"])
        for photo in page.find_all(class_="product-item-photo")
    ]
def getPageOfProducts(productType, pageNum, logfn):
    """Fetch one 100-item page of Tate shop products and return them as
    Product objects."""
    time.sleep(0.5)  # throttle successive page requests
    root = "https://shop.tate.org.uk"
    offset = (pageNum - 1) * 100  # the site paginates via a start offset
    pageUrl = f"{root}/{productType}/view-all-{productType}?start={offset}&sz=100"
    logfn(pageUrl)
    page = getHTML(pageUrl)
    return [
        Product(createID(),
                f"{root}{tile.div.a['href']}",
                tile.div.img["src"])
        for tile in page.find_all(class_="product-tile")
    ]
def getPageOfProducts(productType, pageNum, logfn):
    """Fetch one page of Southbank Centre shop products and return them as
    Product objects."""
    time.sleep(0.5)  # throttle successive page requests
    root = "https://shop.southbankcentre.co.uk"
    pageUrl = f"{root}/{productType}?page={pageNum}"
    logfn(pageUrl)
    page = getHTML(pageUrl)
    # Each product is an <a class="grid__image"> wrapping its image.
    return [
        Product(createID(),
                f"{root}{link['href']}",
                f"https:{link.img['src']}")
        for link in page.find_all("a", class_="grid__image")
    ]
def getPageOfProducts(pageNum, logfn):
    """Fetch one page of the Manchester Art Gallery shop and return the
    items as Product objects."""
    time.sleep(0.5)  # throttle successive page requests
    root = "https://shop.manchesterartgallery.org"
    pageUrl = f"{root}/collections/all?page={pageNum}"
    logfn(pageUrl)
    page = getHTML(pageUrl)
    products = []
    for card in page.find_all(class_="product-card"):
        # The lazy-load data-src contains a literal "{width}" placeholder;
        # substitute the 200px rendition.
        imgSrc = f"https:{card.div.div.div.img['data-src']}".replace(r"{width}", "200")
        products.append(Product(createID(), f"{root}{card.a['href']}", imgSrc))
    return products
def getPageOfProducts(productType, page, logfn):
    """Fetch one page of a Camden Arts Centre shop collection and return the
    items as Product objects."""
    time.sleep(0.5)  # throttle successive page requests
    root = "https://shop.camdenartscentre.org"
    pageUrl = f"{root}/collections/{productType}?page={page}"
    logfn(pageUrl)
    soup = getHTML(pageUrl)
    products = []
    for card in soup.find_all(class_="product-card"):
        image = card.find(class_="grid-view-item__image")
        # data-widths looks like '[180, 360, 540, 720]'; take the smallest.
        smallest = image["data-widths"][1:-1].split(",")[0]
        imageUrl = f"https:{image['data-src'].replace('{width}', smallest)}"
        products.append(Product(createID(), f'{root}{card.a["href"]}', imageUrl))
    return products
def populateArchivedSemesters(semesters, url, semestersToAdd):
    """Append up to `semestersToAdd` archived semesters, parsed from the
    Los Rios archive page at `url`, to `semesters`.

    Each entry is {"name", "endpoint", "current": False}; endpoints look
    like "f12" / "su12" / "sp12".
    """
    if semestersToAdd <= 0:
        return
    response = utils.getHTML(url, "archived semesters")
    semestersRemainingToAdd = semestersToAdd
    classScheduleString = '<a href="http://www.losrios.edu/class_schedules_reader.php?loc=flc/'
    for line in response:
        if semestersRemainingToAdd <= 0:
            break
        if classScheduleString not in line:
            continue
        endpoint = utils.extractInfoFromLine(line, classScheduleString,
                                             classScheduleString,
                                             "_schd/index.html")
        # Order preserved from the original: the bare "f" test is safe first
        # because neither "su.." nor "sp.." endpoints contain an "f".
        if "f" in endpoint:
            name = "Fall 20" + endpoint[-2:]
        elif "su" in endpoint:
            name = "Summer 20" + endpoint[-2:]
        elif "sp" in endpoint:
            name = "Spring 20" + endpoint[-2:]
        else:
            # Bug fix: an unrecognized endpoint previously reused the name
            # from the prior iteration (or raised NameError on the first
            # match). Skip it instead.
            continue
        semesters.append({"name": name, "endpoint": endpoint, "current": False})
        semestersRemainingToAdd -= 1
def populateCourses(courses, url): response = utils.getHTML(url, "courses") print "Parsing the courses..." coursesCount = 0 while True: courseInfo = utils.extractCourseInfo(response, "<!--Course Title-->", "<center><hr width=60%></center>") if not courseInfo: break courseTitle = utils.extractInfo(courseInfo, "Course Title", "<b>", " ") if not courseTitle: continue coursesCount += 1 courseName = utils.extractInfo(courseInfo, "Course Title", " ", " ") units = extractUnits(courseInfo) description = utils.extractInfo(courseInfo, "Description:", "</em>", "<br />") prerequisite = utils.extractInfo(courseInfo, "Prerequisite:", "</em>", "<br />") corequisite = utils.extractInfo(courseInfo, "Corequisite:", "</em>", "<br />") hours = utils.extractInfo(courseInfo, "Hours:", "</em>", "<br />") transferableTo = utils.extractInfo(courseInfo, "Transferable to", "Course Transferable to ", "</em>") advisory = utils.extractInfo(courseInfo, "Advisory:", "</em>", "<br />") generalEducation = utils.extractInfo(courseInfo, "General Education: ", "</em>", "<br />") enrollmentLimitation = utils.extractInfo(courseInfo, "Enrollment Limitation:", "</em>", "<br />") sameAs = utils.extractInfo(courseInfo, "Same As:", "</em>", "<br />") courseFamily = utils.extractInfo(courseInfo, "Course Family:", "</em>", "<br />") courses.append({"courseTitle" : courseTitle}) courses[-1]["courseName"] = courseName courses[-1]["units"] = units courses[-1]["description"] = description courses[-1]["prerequisite"] = prerequisite courses[-1]["corequisite"] = corequisite courses[-1]["hours"] = hours courses[-1]["transferableTo"] = transferableTo courses[-1]["advisory"] = advisory courses[-1]["generalEducation"] = generalEducation courses[-1]["enrollmentLimitation"] = enrollmentLimitation courses[-1]["sameAs"] = sameAs courses[-1]["courseFamily"] = courseFamily print "Successfully added ", coursesCount, " courses.\n"
def populateSports(sports, url): response = utils.getHTML(url, "sports") print "Parsing sports data..." sportCount = 0 while True: sportItem = utils.extractCourseInfo( response, '<div class="event-info clearfix">', '<div class="event-box clearfix">') if not sportItem: break month = utils.extractInfo(sportItem, '<span class="month">', '<span class="month">', '</span>') day = utils.extractInfo(sportItem, '<span class="dd">', '<span class="dd">', '</span>') date = month + ' ' + day sport = utils.extractInfo(sportItem, '<div class="sport">', '<div class="sport">', '</div>') time = utils.extractInfo(sportItem, '<span class="status">', '<span class="status">', '</span>') neutralSite = utils.extractInfo(sportItem, '<div class="neutral-site">', '<div class="neutral-site">', '</div>') opponentInfo = utils.extractCourseInfo(sportItem, '<div class="opponent">', '</div>') opponent = parseOpponentInfo(opponentInfo) sports.append({"date": date}) sports[-1]["id"] = str(sportCount) sports[-1]["sport"] = sport sports[-1]["time"] = time sports[-1]["opponent"] = opponent sports[-1]["neutralSite"] = neutralSite sportCount += 1 print "Successfully parsed", sportCount, "sports.\n"
def populateClasses(classes, url):
    """Parse the class-schedule page at `url` and append one dict per class
    section to `classes`.

    Each entry carries schedule, days, lec/lab times and rooms, instructor,
    class number, course title/name, a classType string, and a sequential id.
    Sections sharing a class number are merged into one entry.

    NOTE(review): this block was reconstructed from a line-wrapped source;
    the nesting of the flush/reset logic below should be confirmed against
    the original file.
    """
    response = utils.getHTML(url, "class schedule")
    classCount = 0
    print "Parsing the classes..."
    # Per-section accumulators, filled line by line and flushed into
    # `classes` once enough fields have been collected.
    schedule = None
    days = None
    lecTime = None
    labTime = None
    instructor = None
    lecRoom = None
    labRoom = None
    classNum = None
    classType = 'On Campus'
    while True:
        # One course block per iteration; falsy result means the response
        # stream is exhausted.
        courseInfo = utils.extractCourseInfo(
            response, "<!--Course Title-->", "<center><hr width=60%></center>")
        if not courseInfo:
            break
        courseTitle = utils.extractInfo(courseInfo, "Course Title", "<b>", " ")
        if not courseTitle:
            continue
        courseName = utils.extractInfo(courseInfo, "Course Title", " ", " ")
        for line in courseInfo:
            # NOTE(review): looks like a non-breaking-space normalization
            # (the first argument may have been mangled in transit) — confirm.
            line = line.replace(" ", " ")
            # Keep the most recent schedule seen; `or` preserves the old
            # value when this line has none.
            schedule = utils.extractInfoFromLine(line, "Schedule:", "</em><b>", "</b>") or schedule
            if "font face=Courier" in line:
                # Courier-font lines carry the meeting rows (days, times,
                # instructor, room).
                days = getDays(line)
                if "LEC" in line:
                    lecTime = getLecTime(line)
                    # Instructor name sits between "LEC" and the next
                    # double-space run.
                    instructor = line[line.index("LEC") + len("LEC"):line.
                                      index("  ", line.index("LEC"))].lstrip(
                                          " ").rstrip(" ")
                    lecRoom = getRoom(instructor, line)
                elif "LAB" in line:
                    labTime = getLabTime(line)
                    instructor = line[line.index("LAB") + len("LAB"):line.
                                      index("  ", line.index("LAB"))].lstrip(
                                          " ").rstrip(" ")
                    labRoom = getRoom(instructor, line)
                instructor = instructor.replace(".", ". ")
                # Class number sits between the room and the textbook link.
                classNum = utils.extractInfoFromLine(line, "Textbook", lecRoom or labRoom, "<a href") or classNum
            if '<!--Meetings Notes-->' in line and '<b>' in line:
                # Meeting notes follow the appended class; patch the last
                # appended entry's classType accordingly.
                classTypeNote = utils.extractInfoFromLine(
                    line, '<!--Meetings Notes-->', '<b>', '</b>') or ''
                if 'Interactive Television' in classTypeNote:
                    classes[-1]["classType"] = 'Television'
                elif 'hybrid' in classTypeNote:
                    classes[-1]["classType"] = 'Hybrid'
                elif 'online' in classTypeNote:
                    classes[-1]["classType"] = 'Online'
                else:
                    classes[-1]["classType"] = 'On Campus'
            # NOTE(review): `and` binds tighter than `or`, so a bare room
            # match alone satisfies this condition — confirm intended.
            if schedule and days and instructor and classNum and (
                    lecTime or labTime) or (lecRoom or labRoom):
                if len(classes) > 1 and classNum == classes[-1]["classNum"]:
                    # Same class number as the previous entry: merge the
                    # lab/room details instead of appending a duplicate.
                    classes[-1]["labTime"] = classes[-1]["labTime"] or labTime
                    classes[-1]["lecRoom"] = classes[-1]["lecRoom"] or lecRoom
                    classes[-1]["labRoom"] = classes[-1]["labRoom"] or labRoom
                else:
                    classes.append({"schedule": schedule})
                    classes[-1]["days"] = days
                    classes[-1]["lecTime"] = lecTime
                    classes[-1]["labTime"] = labTime
                    classes[-1]["instructor"] = instructor
                    classes[-1]["lecRoom"] = lecRoom
                    classes[-1]["labRoom"] = labRoom
                    classes[-1]["classNum"] = classNum
                    classes[-1]["courseTitle"] = courseTitle
                    classes[-1]["courseName"] = courseName
                    # classType is reset to None after each flush, so later
                    # sections default back to "On Campus" via the `or`.
                    classes[-1]["classType"] = classType or "On Campus"
                    classes[-1]["id"] = str(classCount)
                    classCount += 1
                # Reset the per-section accumulators; schedule and classNum
                # intentionally persist so follow-up rows can be merged.
                days = None
                lecTime = None
                labTime = None
                instructor = None
                lecRoom = None
                labRoom = None
                classType = None
    print "Successfully added", classCount, "classes.\n"
def getInstructorDetails(instructors, url):
    """Crawl the FLC faculty pages under `url` and enrich `instructors`
    in place with full names, emails, and phone numbers.

    First builds a temporary `facultyMembers` list from each subject's
    faculty page, then matches it against `instructors` by last name and
    first initial, and finally converts each instructor's classHours to a
    readable format.
    """
    # Assemble list of professors and their respective subjects from FLC Faculty page
    # These are stored as a temporary results to combine with the full instructor data
    print "Fetching faculty information..."
    facultyMembersNum = 0
    facultyMembers = []
    response = utils.getHTML(url, None)
    for line in response:
        # Each subject is linked as <li><a href="academics/...">.
        subjectURLStr = utils.extractInfoFromLine(line, '<li><a href="academics/',
                                                  '<li><a href="academics/', '">')
        if subjectURLStr and "catalog" not in subjectURLStr and "bustec-courses" not in subjectURLStr:
            subjectURL = url + '/' + subjectURLStr
            subjectHTML = utils.getHTML(subjectURL, None)
            # Subject name comes from the page's meta description; trim any
            # trailing parenthesized abbreviation.
            subject = utils.extractInfo(subjectHTML,
                                        '<meta name="description" content="',
                                        '<meta name="description" content="', '">')
            if "(" in subject:
                subject = subject[0:subject.index("(")]
            subject = subject.rstrip(' ')
            utils.clearLine()
            print "\rSearching", subject + "...",
            # Locate the subject's "-faculty" sub-page link, if any.
            facultyURLStr = utils.extractInfo(
                subjectHTML, '-faculty">',
                '<li><a href="academics/' + subjectURLStr, '-faculty">')
            if facultyURLStr:
                facultyURL = subjectURL + '/' + facultyURLStr + '-faculty'
                facultyHTML = utils.getHTML(facultyURL, None)
                facultyNamesHTML = utils.extractCourseInfo(
                    facultyHTML, '<div class="calendarcontent">', "</div>")
                for line2 in facultyNamesHTML:
                    # Only lines with a mailto link describe a faculty member.
                    if '<a href="mailto:' not in line2:
                        continue
                    # Name precedes either a <br /> or the closing </span>.
                    if '<br />' in line2:
                        facultyName = utils.extractInfoFromLine(
                            line2, '', '', '<br />')
                    else:
                        facultyName = utils.extractInfoFromLine(
                            line2, '', '', '</span>')
                    # Drop a trailing parenthesized note from the name.
                    if "(" in facultyName:
                        facultyName = facultyName[0:facultyName.
                                                  index("(")].rstrip(' ')
                    # Drop a "Dean ..." suffix (but keep names starting with
                    # "Dean").
                    if facultyName.find("Dean") != -1 and facultyName.find(
                            "Dean") != 0:
                        facultyName = facultyName[0:facultyName.index("Dean")]
                    # Email sits inside the mailto href; delimiter varies.
                    if '">Email' in line2:
                        facultyEmail = utils.extractInfoFromLine(
                            line2, '', '<a href="mailto:', '">Email')
                    else:
                        facultyEmail = utils.extractInfoFromLine(
                            line2, '', '<a href="mailto:', '">')
                    if ' ' in facultyEmail:
                        facultyEmail = facultyEmail[0:facultyEmail.find(' ')]
                    # Phone: last parenthesized run, fixed 14-char width,
                    # e.g. "(916) 608-6687".
                    if '(' in line2:
                        facultyPhone = line2[line2.rindex('('
                                             ):line2.rindex('(') + 14].rstrip('<')
                    else:
                        facultyPhone = None
                    # Deduplicate by name before recording the member.
                    if not filter(lambda person: person['name'] == facultyName,
                                  facultyMembers):
                        facultyMembers.append({"name": facultyName})
                        facultyMembers[-1]["email"] = facultyEmail
                        facultyMembers[-1]["phone"] = facultyPhone
                        facultyMembersNum += 1
    utils.clearLine()
    # Populate professors email and phone numbers
    for professor in instructors:
        for faculty in facultyMembers:
            # Instructor names are abbreviated ("X. Lastname"); match by
            # last name substring plus first initial.
            lastName = professor["name"][3:]
            if lastName in faculty["name"] and professor["name"][0] == faculty[
                    "name"][0]:
                professor["name"] = faculty["name"]
                professor["email"] = faculty["email"]
                professor["phone"] = faculty["phone"]
    # Convert class hours to easy-to-read format
    for professor in instructors:
        professor["classHours"] = utils.convertTime(professor["classHours"])
    print "\rSuccessfully added details to ", facultyMembersNum, " faculty members.\n"
def main():
    """Fetch the CU Facebook feed and process any new posts."""
    feedHTML = utils.getHTML(FBFeeds["cu"])
    getNewFBPosts(feedHTML)