def write_ttl(self):
    """Scrape the faculty table at ``self._link`` and write one TTL entry per row.

    Returns:
        The closed ttl.TtlFile written to ``self.ttl_filename``.
    """
    ttl_file = ttl.TtlFile(self.ttl_filename)
    webpage = requests.get(self._link)
    try:
        webpage.raise_for_status()
    except Exception as exc:
        # Best-effort: report the HTTP error but still parse whatever came back.
        print('There was a problem: %s' % (exc))
    soup = BeautifulSoup(webpage.text, "html.parser")
    # One <tr> per professor; iterate rows directly instead of by index.
    for row in soup.select('tr'):
        img_src = row.select('img')[0]['src']
        # Row text is newline-separated: name, title, phone, email.
        fields = row.getText().strip().split('\n')
        prof = ttl.TtlFileEntry()
        prof.picture = self._flink + img_src
        prof.property = "faculty"
        prof.name = fields[0]
        prof.title = fields[1]
        # Phone/email cells may hold only whitespace; skip those.
        if not fields[2].isspace():
            prof.phone = fields[2]
        if not fields[3].isspace():
            prof.email = fields[3]
        prof.write_to(ttl_file)
    ttl_file.close()
    return ttl_file
def write_ttl(self):
    """Scrape the faculty table at ``self._link`` and write one TTL entry per row.

    Returns:
        The closed ttl.TtlFile written to ``self.ttl_filename``.
    """
    ttl_file = ttl.TtlFile(self.ttl_filename)
    webpage = requests.get(self._link)
    try:
        webpage.raise_for_status()
    except Exception as exc:
        # Best-effort: report the HTTP error but still parse whatever came back.
        print('There was a problem: %s' % (exc))
    soup = BeautifulSoup(webpage.text, "html.parser")
    # First two <tr> rows are table headers.
    for row in soup.select('tr')[2:]:
        name_str = row.find('strong').getText()
        # The title is the text node immediately after the first <br>.
        title_str = row.find('br').next_sibling
        # Third <p> holds newline-separated contact info: email, phone, room.
        contact_info = row.select('p')[2].getText().split('\n')
        interests_str = row.select('p')[3].getText()
        prof = ttl.TtlFileEntry()
        prof.name = name_str
        prof.property = "faculty"
        prof.title = title_str
        prof.email = contact_info[0]
        prof.phone = contact_info[1]
        prof.room = contact_info[2]
        # BUG FIX: was `prof.Interests` (capital I), inconsistent with the
        # lowercase `interests` attribute every sibling scraper sets.
        prof.interests = interests_str
        prof.write_to(ttl_file)
    ttl_file.close()
    return ttl_file
def write_ttl(self):
    """Scrape per-department faculty tables at ``self._link`` into a TTL file.

    Returns:
        The closed ttl.TtlFile written to ``self.ttl_filename``.
    """
    ttl_file = ttl.TtlFile(self.ttl_filename)
    webpage = requests.get(self._link)
    try:
        webpage.raise_for_status()
    except Exception as exc:
        # Best-effort: report the HTTP error but still parse whatever came back.
        print('There was a problem: %s' % (exc))
    soup = BeautifulSoup(webpage.text, "html.parser")
    # Skip the first <article> (page header); each remaining one holds a table.
    for table in soup.select('article')[1:]:
        # Skip each table's header row.
        for row in table.select('tr')[1:]:
            cells = row.select('td')
            # A <br> splits the "title / department" and "email / phone"
            # cells; hoist each lookup instead of re-querying per field.
            title_br = cells[1].select('br')[0]
            contact_br = cells[2].select('br')[0]
            prof = ttl.TtlFileEntry()
            prof.name = cells[0].getText()
            prof.property = "faculty"
            prof.title = title_br.previous_sibling
            prof.department = title_br.next_sibling
            prof.email = contact_br.previous_sibling.getText()
            prof.phone = contact_br.next_sibling
            prof.write_to(ttl_file)
    ttl_file.close()
    return ttl_file
def _parse_prof_site(self, website):
    """Parse one professor's profile page into a ttl.TtlFileEntry.

    Args:
        website: URL of the professor's profile page.

    Returns:
        A populated ttl.TtlFileEntry (not yet written to any file).
    """
    webpage = requests.get(website)
    try:
        webpage.raise_for_status()
    except Exception as exc:
        # Best-effort: report the HTTP error but still parse whatever came back.
        print('There was a problem: %s' % (exc))
    soup = BeautifulSoup(webpage.text, "html.parser")
    prof_div = soup.find('div', {"class": "faculty-profile"})
    name = prof_div.getText()
    h2_list = soup.findAll('h2')
    title1 = h2_list[0].getText()
    title2 = h2_list[1].getText()
    # BUG FIX: `title` was unbound (NameError) when both headings were empty;
    # the original if/elif chain had no else branch.
    title = ""
    if title1 and title2:
        title = title1 + "; " + title2
    elif title1:
        title = title1
    elif title2:
        title = title2
    grey_box = soup.find('div', {"class": "grey-box"})
    a_href = grey_box.select('a')[0]['href']
    email = a_href.split(':')[1]  # "mailto:addr" -> addr
    text_list = grey_box.getText().split('\n')
    # Grey box lines are positional: office on line 2, phone on line 4,
    # each formatted "Label: value".
    office = text_list[2].split(':')[1]
    phone = text_list[4].split(':')[1]
    prof = ttl.TtlFileEntry()
    prof.property = "faculty"
    prof.name = name
    prof.title = title
    prof.email = email
    prof.room = office
    prof.phone = phone
    return prof
def _refreshFromSoup(self, soup, ttl_file):
    """Write one faculty TTL entry per FacultyTableRow found in *soup*.

    Args:
        soup: BeautifulSoup document containing the faculty table.
        ttl_file: open ttl.TtlFile the entries are appended to.
    """
    for row in soup.findAll('tr', {"class": "FacultyTableRow"}):
        # Flatten the row to its text nodes, dropping bare newline nodes.
        data = [t for t in row.findAll(text=True) if t != "\n"]
        # "Name, PhD" -> name plus optional trailing credentials.
        name_edu = data[0].split(',', 1)
        prof = ttl.TtlFileEntry()
        prof.name = name_edu[0]
        prof.property = "faculty"
        if len(name_edu) > 1:
            prof.education = name_edu[1]
        prof.title = data[1]
        prof.room = data[2]
        # The email column is optional; interests shift left when absent.
        if "@" in data[3]:
            prof.email = data[3]
            prof.interests = data[4]
        else:
            prof.interests = data[3]
        prof.write_to(ttl_file)
def _parse_prof_site(self, website):
    """Parse one professor's profile page into a ttl.TtlFileEntry.

    The page layout is positional: the faculty-profile div's text lines map
    to fixed fields (0=name, 1=education, 2=title, 4=department, 6/7=phones,
    8=email, 9=room, 12-14=bio candidates).

    Args:
        website: URL of the professor's profile page.

    Returns:
        A populated ttl.TtlFileEntry (not yet written to any file).
    """
    webpage = requests.get(website)
    try:
        webpage.raise_for_status()
    except Exception as exc:
        # Best-effort: report the HTTP error but still parse whatever came back.
        print('There was a problem: %s' % (exc))
    soup = BeautifulSoup(webpage.text, "html.parser")
    prof_div = soup.find('div', {"class": "faculty-profile"})
    data = prof_div.getText().strip().split("\n")
    name = data[0]
    education = data[1]
    title = data[2]
    department = data[4]
    # Two phone slots; join when both are present.
    if data[6] and data[7]:
        phone = data[6] + "; " + data[7]
    elif data[6]:
        phone = data[6]
    elif data[7]:
        phone = data[7]
    else:
        phone = ""
    email = data[8]
    room = data[9]
    bio = ""
    # BUG FIX: data[12..14] were indexed without length guards, raising
    # IndexError on short profile pages.
    if len(data) > 12 and data[12]:
        bio = data[12]
    elif len(data) > 13 and data[13]:
        bio = data[13]
    elif len(data) > 14 and data[14]:
        bio = data[14]
    else:
        # Debug aid kept from the original: dump the fields when the page
        # layout doesn't match expectations.
        for j, field in enumerate(data):
            print(str(j) + ":" + field)
    interests = ""
    if len(data) > 14:
        # Last matching "Research Interests: ..." line wins.
        for d in data[14:]:
            if d.startswith("Research Interests"):
                interests = d.split(":")[1]
    prof = ttl.TtlFileEntry()
    prof.property = "faculty"
    prof.name = name
    prof.education = education
    prof.title = title
    prof.department = department
    prof.email = email
    prof.room = room
    prof.phone = phone
    prof.bio = bio
    prof.interests = interests
    return prof
def _parse_prof_site(self, website):
    """Scrape a professor's profile page for bio, education, publications, and website.

    Args:
        website: URL of the professor's profile page; also used as the
            fallback website value when no personal site is listed.

    Returns:
        A ttl.TtlFileEntry with bio, education, publications, and website set
        (note: name/title/contact fields are NOT set here).
    """
    webpage = requests.get(website)
    try:
        webpage.raise_for_status()
    except Exception as exc:
        # Best-effort: report the HTTP error but still parse whatever came back.
        print('There was a problem: %s' % (exc))
    soup = BeautifulSoup(webpage.text, "html.parser")
    center_rail = soup.find('div', {"id": "center-rail"})
    divs = center_rail.findAll('div')
    # Layout is positional: divs[3] is the bio; education's <ul> lives in
    # divs[4] or divs[5]; publications (if any) in divs[6] or divs[7].
    bio_block = divs[3]
    edu_block = divs[4].find("ul")
    # if that isnt the block, the next is
    educations = ''
    if not edu_block:
        edu_block = divs[5].find("ul")
    if edu_block:
        educations = edu_block.findAll("li")
    # NOTE: if no <ul> is found, `educations` stays '' and the loop below
    # simply iterates zero characters... actually an empty string, so no items.
    edu_str = ""
    for e in educations:
        edu_str += e.getText() + ". \n"
    pub_str = ""
    if len(divs) > 6:
        pub_block = divs[6].find("ul")
        # if that isnt the block, either the next is or this professor doesn't have one
        if not pub_block and len(divs) > 7:
            pub_block = divs[7].find("ul")
        if pub_block:
            publications = pub_block.findAll("li")
            # Keep only the first three publications.
            pub_number = 3
            count = 0
            for p in publications:
                pub_str += p.getText() + ". \n"
                count += 1
                if count == pub_number:
                    break
    website_block = soup.find("div", {"id": "bodytag_2_rightrail_0_pnlSite"})
    # Fall back to the directory URL when no personal site is listed.
    website_str = website
    if website_block:
        website_str = website_block.select("a")[0].getText()
    prof = ttl.TtlFileEntry()
    prof.bio = bio_block.getText()
    prof.education = edu_str
    prof.publications = pub_str
    prof.website = website_str
    return prof
def write_ttl(self):
    """Scrape the faculty table at ``self._link`` and write one TTL entry per row.

    Returns:
        The closed ttl.TtlFile written to ``self.ttl_filename``.
    """
    ttl_file = ttl.TtlFile(self.ttl_filename)
    webpage = requests.get(self._link)
    try:
        webpage.raise_for_status()
    except Exception as exc:
        print('There was a problem: %s' % (exc))
    soup = BeautifulSoup(webpage.text, "html.parser")
    rows = soup.select('tr')
    # Rows 0-1 are headers and the final 14 rows are non-faculty boilerplate.
    for idx in range(2, len(rows) - 14):
        row = rows[idx]
        entry = ttl.TtlFileEntry()
        # Contact cell lines are positional and optional: email, phone, room.
        contact_lines = row.select('td')[1].getText().splitlines()
        if contact_lines[1]:
            entry.email = contact_lines[1].strip()
        if len(contact_lines) > 2 and contact_lines[2]:
            entry.phone = contact_lines[2].strip()
        if len(contact_lines) > 3 and contact_lines[3]:
            entry.room = contact_lines[3].strip()
        entry.name = row.find('strong').getText().strip()
        entry.property = "faculty"
        # Title is the text node right after the first <br>.
        entry.title = row.br.next_sibling.strip()
        entry.interests = row.select('td')[2].getText().strip()
        entry.picture = self._flink + row.select('img')[0]['src'].strip()
        entry.write_to(ttl_file)
    ttl_file.close()
    return ttl_file
def write_ttl(self):
    """Convert the buildings CSV at ``self.csv_filename`` into a TTL file.

    Returns:
        The closed ttl.TtlFile written to ``self.ttl_filename``.
    """
    with open(self.csv_filename, 'r') as f:
        ttl_file = ttl.TtlFile(self.ttl_filename)
        reader = csv.DictReader(f)
        for row in reader:
            building = ttl.TtlFileEntry()
            building.property = "building"
            # First name is canonical; the rest (plus derived common names)
            # become alternates.
            names = row["Name"].split(';')
            names = names + self.filter_common_names(names)
            building.name = names[0]
            building.altnames = names[1:]
            if row["Code"]:
                building.altnames.append(row["Code"])
                building.altnames.append(row["Code"].title())
            building.department = row["Function"]
            building.address = row["Address"]
            building.picture = row["Picture"]
            building.website = row["Website"]
            # Per-day opening hours map 1:1 from CSV column to entry
            # attribute; loop instead of 14 copy-pasted assignments.
            for day in ("m", "t", "w", "th", "f", "sa", "su"):
                col = day.title()  # "m" -> "M", "th" -> "Th"
                setattr(building, day + "StartTime", row[col + "StartTime"])
                setattr(building, day + "EndTime", row[col + "EndTime"])
            building.write_to(ttl_file)
        ttl_file.close()
        return ttl_file
def write_ttl(self):
    """Scrape the faculty table at ``self._link`` and write one TTL entry per row.

    Returns:
        The closed ttl.TtlFile written to ``self.ttl_filename``.
    """
    ttl_file = ttl.TtlFile(self.ttl_filename)
    webpage = requests.get(self._link)
    try:
        webpage.raise_for_status()
    except Exception as exc:
        # Best-effort: report the HTTP error but still parse whatever came back.
        print('There was a problem: %s' % (exc))
    soup = BeautifulSoup(webpage.text, "html.parser")
    rows = soup.select('tr')
    # Rows 0-1 are headers and the last 6 rows are non-faculty boilerplate;
    # the max() guard keeps the slice empty when the table is short, matching
    # range(2, len(rows) - 6).
    for row in rows[2:max(2, len(rows) - 6)]:
        name_str = row.find('strong').getText()
        # Title is the text node right after the first <br>.
        title_str = row.br.next_sibling.strip()
        # <p> tags are positional: 2=email, 3=phone, 4=room, 5=interests.
        email_str = row.select('p')[2].getText()
        phone_str = row.select('p')[3].getText()
        room_str = row.select('p')[4].getText()
        interests_str = row.select('p')[5].getText().strip()
        prof = ttl.TtlFileEntry()
        prof.name = name_str
        prof.property = "faculty"
        prof.title = title_str
        prof.email = email_str
        prof.phone = phone_str
        prof.room = room_str
        # BUG FIX: was `prof.Interests` (capital I), inconsistent with the
        # lowercase `interests` attribute every sibling scraper sets.
        prof.interests = interests_str
        prof.write_to(ttl_file)
    ttl_file.close()
    return ttl_file
def write_ttl(self):
    """Scrape the Drexel faculty table at ``self._link`` into a TTL file.

    Returns:
        The closed ttl.TtlFile written to ``self.ttl_filename``.
    """
    ttl_file = ttl.TtlFile(self.ttl_filename)
    webpage = requests.get(self._link)
    try:
        webpage.raise_for_status()
    except Exception as exc:
        # Best-effort: report the HTTP error but still parse whatever came back.
        print('There was a problem: %s' % (exc))
    soup = BeautifulSoup(webpage.text, "html.parser")
    for row in soup.findAll('tr', {"class": "FacultyTableRow"}):
        headshot_div = row.find('div', {"class": "facultyHeadshot"})
        image = headshot_div.img
        info_div = row.find('div', {"class": "fac-info"})
        # "Name, Degree" -> name plus optional trailing degree.
        name_parts = info_div.find('h2', {"class": "fname"}).getText().split(",", 1)
        contact_div = info_div.find('div', {"class": "fcontact"})
        # contents[3] holds newline-separated office / email / phone.
        location_lines = contact_div.contents[3].getText().split("\n")
        prof = ttl.TtlFileEntry()
        if image is not None:
            # src is site-relative; prefix the host.
            prof.picture = "http://drexel.edu" + image['src']
        prof.name = name_parts[0]
        prof.property = "faculty"
        if len(name_parts) > 1:
            prof.degree = name_parts[1]
        prof.title = contact_div.contents[1].getText()
        prof.office = location_lines[0]
        prof.email = location_lines[1]
        prof.phone = location_lines[2]
        prof.department = row.find_all('td')[1].next_element
        prof.write_to(ttl_file)
    ttl_file.close()
    return ttl_file
def _parse_prof_site(self, website):
    """Parse one professor's profile header into a ttl.TtlFileEntry.

    Args:
        website: URL of the professor's profile page.

    Returns:
        A populated ttl.TtlFileEntry (not yet written to any file).
    """
    webpage = requests.get(website)
    try:
        webpage.raise_for_status()
    except Exception as exc:
        # Best-effort: report the HTTP error but still parse whatever came back.
        print('There was a problem: %s' % (exc))
    soup = BeautifulSoup(webpage.text, "html.parser")
    header = soup.find("div", {"class": "profile-header"})
    img_src = header.find("img")["src"]
    picture = self._flink + img_src
    faculty_txt_div = header.find("div", {"class": "faculty-txt"})
    # <li> items are positional: room, phone, email.
    info = faculty_txt_div.select("li")
    room = info[0].getText()
    phone = info[1].getText()
    email = info[2].getText()
    name_div = header.find("div", {"class": "faculty-name"})
    # Drop trailing credentials after the comma ("Name, PhD" -> "Name").
    name = name_div.getText().split(",")[0]
    edu_div = header.find("div", {"class": "faculty-edu"})
    # Walk the siblings after the <h2>, collecting text nodes and skipping
    # <br/> tags.
    next_sib = edu_div.find("h2").next_sibling
    education = ""
    # BUG FIX (idiom): compare against None with `is not`, never `!=`.
    while next_sib is not None:
        if '<br/>' not in str(next_sib):
            education += str(next_sib) + "; "
        next_sib = next_sib.next_sibling
    prof = ttl.TtlFileEntry()
    prof.property = "faculty"
    prof.website = website
    prof.picture = picture
    prof.room = room
    prof.phone = phone
    prof.email = email
    prof.name = name
    prof.education = education
    return prof
def write_ttl(self):
    """Scrape the faculty table body at ``self._link`` into a TTL file.

    Returns:
        The closed ttl.TtlFile written to ``self.ttl_filename``.
    """
    ttl_file = ttl.TtlFile(self.ttl_filename)
    webpage = requests.get(self._link)
    try:
        webpage.raise_for_status()
    except Exception as exc:
        # Best-effort: report the HTTP error but still parse whatever came back.
        print('There was a problem: %s' % (exc))
    soup = BeautifulSoup(webpage.text, "html.parser")
    for row in soup.find('tbody').select('tr'):
        cells = row.select('td')
        img = cells[0].find('img')
        picture = ""
        if img:
            # Reuse the already-found tag instead of re-querying for it.
            picture = self._flink + img['src']
        # Cell text lines are positional: 1="Name, Degree", 2=title,
        # 3=phone, 4=email.
        info = cells[1].getText().split("\n")
        name = info[1].split(',')[0]
        title = info[2]
        phone = info[3]
        email = info[4]
        department = cells[2].getText()
        prof = ttl.TtlFileEntry()
        prof.name = name
        prof.property = "faculty"
        prof.picture = picture
        prof.title = title
        prof.phone = phone
        prof.email = email
        prof.department = department
        prof.write_to(ttl_file)
    ttl_file.close()
    return ttl_file
def write_ttl(self):
    """Scrape the faculty table at ``self._link`` and write one TTL entry per row.

    Returns:
        The closed ttl.TtlFile written to ``self.ttl_filename``.
    """
    ttl_file = ttl.TtlFile(self.ttl_filename)
    webpage = requests.get(self._link)
    try:
        webpage.raise_for_status()
    except Exception as exc:
        # Best-effort: report the HTTP error but still parse whatever came back.
        print('There was a problem: %s' % (exc))
    soup = BeautifulSoup(webpage.text, "html.parser")
    table = soup.select('tbody')[0]
    # First two rows are table headers.
    for row in table.select('tr')[2:]:
        cells = row.select("td")
        picture = self._flink + cells[0].find('img')['src']
        name_str = cells[0].find('h1').getText()
        title_str = cells[0].find('h2').getText()
        email_str = cells[1].find('a').getText()
        # Phone sits after the <br>, formatted "Phone: number"; drop the
        # label and anything past the first newline.
        phone_str = cells[1].find('br').next_sibling
        phone_str = phone_str.split(":")[1]
        phone_str = phone_str.split('\n')[0]
        interests_str = cells[2].getText()
        prof = ttl.TtlFileEntry()
        prof.name = name_str
        prof.property = "faculty"
        prof.picture = picture
        prof.title = title_str
        prof.email = email_str
        prof.phone = phone_str
        # BUG FIX: was `prof.Interests` (capital I), inconsistent with the
        # lowercase `interests` attribute every sibling scraper sets.
        prof.interests = interests_str
        prof.write_to(ttl_file)
    ttl_file.close()
    return ttl_file
def write_ttl(self):
    """Scrape the faculty table at ``self._link`` and write one TTL entry per row.

    Returns:
        The closed ttl.TtlFile written to ``self.ttl_filename``.
    """
    ttl_file = ttl.TtlFile(self.ttl_filename)
    webpage = requests.get(self._link)
    try:
        webpage.raise_for_status()
    except Exception as exc:
        # Best-effort: report the HTTP error but still parse whatever came back.
        print('There was a problem: %s' % (exc))
    soup = BeautifulSoup(webpage.text, "html.parser")
    # First three <tr> rows are headers.
    for row in soup.select('tr')[3:]:
        prof = ttl.TtlFileEntry()
        prof.name = row.find('h1').getText()
        prof.property = 'faculty'
        prof.title = row.find('p').getText()
        # Second anchor is the relative profile link; third holds the email.
        prof.website = self._flink + row.select('a')[1]['href']
        prof.email = row.select('a')[2].getText()
        # BUG FIX: the picture URL was computed but never attached to the
        # entry (and the website was redundantly assigned twice).
        imgs = row.select('img')
        if imgs:
            prof.picture = self._flink + imgs[0]['src']
        # Phone sits after the <br> in the third cell, formatted "Label: number".
        phone_str = row.select('td')[2].find('br').next_sibling
        if not phone_str.isspace():
            phone_str = phone_str.split(":")[1]
        prof.phone = phone_str
        prof.write_to(ttl_file)
    ttl_file.close()
    return ttl_file
def write_ttl(self):
    """Scrape the faculty profile stubs at ``self._link`` into a TTL file.

    Returns:
        The closed ttl.TtlFile written to ``self.ttl_filename``.
    """
    ttl_file = ttl.TtlFile(self.ttl_filename)
    webpage = requests.get(self._link)
    try:
        webpage.raise_for_status()
    except Exception as exc:
        # Best-effort: report the HTTP error but still parse whatever came back.
        print('There was a problem: %s' % (exc))
    soup = BeautifulSoup(webpage.text, "html.parser")
    for profile in soup.findAll('div', {"class": "user-profile-stub clearfix"}):
        # Flatten the stub to its text nodes, dropping bare newline nodes.
        data = [t for t in profile.findAll(text=True) if t != "\n"]
        # "Name, PhD" -> name plus optional trailing credentials.
        name_edu = data[0].split(',', 1)
        prof = ttl.TtlFileEntry()
        prof.name = name_edu[0]
        prof.property = "faculty"
        if len(name_edu) > 1:
            prof.education = name_edu[1]
        # Remaining text nodes are positional: 1=title, 2=department,
        # 4=room, 6=phone, 8=email (odd indices are labels).
        prof.title = data[1]
        prof.department = data[2]
        prof.room = data[4]
        prof.phone = data[6]
        prof.email = data[8]
        if "Areas of Expertise" in data:
            aoe = data.index("Areas of Expertise")
            prof.interests = ", ".join(data[aoe + 1:])
        prof.write_to(ttl_file)
    ttl_file.close()
    return ttl_file
def _parse_prof_site(self, website):
    """Parse one professor's profile page into a ttl.TtlFileEntry.

    Pulls picture, name, title, department, bio, interests, and education
    from the profile column, and email/room/phone from the grey contact box.

    Args:
        website: URL of the professor's profile page.

    Returns:
        A populated ttl.TtlFileEntry (not yet written to any file).
    """
    webpage = requests.get(website)
    try:
        webpage.raise_for_status()
    except Exception as exc:
        # Best-effort: report the HTTP error but still parse whatever came back.
        print('There was a problem: %s' % (exc))
    soup = BeautifulSoup(webpage.text, "html.parser")
    prof_div = soup.find('div', {"class": "profile-col1"})
    img = prof_div.select('img')
    picture = ""
    if img:
        img_src = img[0]['src']
        picture = self._flink + img_src
    name = prof_div.select('h1')[0].getText()
    title = prof_div.select('h2')[0].getText()
    department = ''
    department_p = prof_div.find(
        "p", {"id": "bodytag_2_centerrail_0_pDepartment"})
    if department_p:
        # "Department: X" -> X
        department = department_p.getText().split(":")[1]
    bio = ''
    bio_div = prof_div.find(
        "div", {"id": "bodytag_2_centerrail_0_divPatientCareBio"})
    if bio_div:
        # Prefer the first <p>; fall back to the whole div's text.
        bio_p = bio_div.select("p")
        if bio_p:
            bio = bio_p[0].getText()
        else:
            bio = bio_div.getText()
    interests = ''
    interests_div = prof_div.find(
        "div", {"id": "bodytag_2_centerrail_0_divResearchOverviewText"})
    if interests_div:
        interests = interests_div.getText()
    education = ''
    education_div = prof_div.find(
        "ul", {"id": "bodytag_2_centerrail_0_ulEducation"})
    if education_div:
        # Join the list items with "; ".
        for e in education_div:
            if education:
                education = education + "; " + e.getText()
            else:
                education = e.getText()
    email = ""
    room = ""
    phone = ""
    grey_box = soup.find('div', {"class": "grey-box"})
    if grey_box:
        a = grey_box.select('a')
        if a:
            a_href = a[0]['href']
            email = a_href.split(":")[1]  # "mailto:addr" -> addr
        grey_box_br = grey_box.select('br')
        # NOTE(review): each entry appears to be the text from one <br> to the
        # end of the box, so stripping the next entry's text as a suffix
        # leaves just the segment between consecutive <br> tags — presumably;
        # verify against the actual page markup.
        grey_box_text = list(map(lambda x: x.getText(), grey_box_br))
        for j in range(0, len(grey_box_text) - 1):
            g = grey_box_text[j]
            g_next = grey_box_text[j + 1]
            grey_box_text[j] = g[:-len(g_next)]
        # Locate the "Phone..." segment; everything before it is room info.
        phone_index = -1
        for j in range(0, len(grey_box_text)):
            e = grey_box_text[j]
            if e.startswith("Phone"):
                phone_index = j
                break
        # NOTE(review): when no "Phone" segment exists, phone_index stays -1
        # and this slice is [:-1] (all but the LAST segment), not "all
        # segments" — confirm that is the intended fallback.
        for e in grey_box_text[:phone_index]:
            if room:
                room += ", " + e
            else:
                room = e
        if phone_index != -1:
            phone = grey_box_text[phone_index].split(":")[1]
    prof = ttl.TtlFileEntry()
    prof.property = "faculty"
    prof.picture = picture
    prof.name = name
    prof.education = education
    prof.title = title
    prof.department = department
    prof.email = email
    prof.room = room
    prof.phone = phone
    prof.bio = bio
    prof.interests = interests
    return prof
def write_ttl(self):
    """Follow each /directory/ link on the index page and write one TTL entry
    per professor profile page.

    Returns:
        The closed ttl.TtlFile written to ``self.ttl_filename``.
    """
    ttl_file = ttl.TtlFile(self.ttl_filename)
    webpage = requests.get(self._link)
    try:
        webpage.raise_for_status()
    except Exception as exc:
        # Best-effort: report the HTTP error but still parse whatever came back.
        print('There was a problem: %s' % (exc))
    soup = BeautifulSoup(webpage.text, "html.parser")
    for anchor in soup.findAll('a'):
        # BUG FIX: anchor["href"] raised KeyError on anchors without href.
        href = anchor.get("href", "")
        # Only follow individual profile pages, not the directory index.
        if "/directory/" not in href or href == "/westphal/about/directory/":
            continue
        fpage = requests.get(self._flink + href)
        try:
            fpage.raise_for_status()
        except Exception as exc:
            print('There was a problem: %s' % (exc))
        fsoup = BeautifulSoup(fpage.text, "html.parser")
        name_str = fsoup.find('div', {"class": "faculty-name"}).getText()
        title_str = fsoup.find('div', {"class": "title"}).getText()
        contact_lines = fsoup.find('div', {"class": "contact"}).getText().splitlines()
        # BUG FIX: phone/email/website/office were only assigned inside the
        # conditionals, so a missing field raised NameError on the first
        # professor and leaked the previous professor's value into later ones.
        phone_str = ""
        email_str = ""
        website_str = ""
        office_str = ""
        if len(contact_lines) > 3 and "PH:" in contact_lines[3]:
            phone_str = contact_lines[3].split(": ")[1].replace(".", "")
        if len(contact_lines) > 4 and "Email:" in contact_lines[4]:
            email_str = contact_lines[4].split(": ")[1]
        if len(contact_lines) > 5 and "Website:" in contact_lines[5]:
            website_str = contact_lines[5]
            # The URL may wrap onto the following line.
            if website_str.split(": ")[1]:
                website_str = website_str.split(": ")[1]
            else:
                website_str = contact_lines[6].strip()
        location_lines = fsoup.find('div', {"class": "location"}).getText().splitlines()
        if len(location_lines) > 2 and location_lines[2]:
            office_str = location_lines[2]
        prof = ttl.TtlFileEntry()
        prof.name = name_str.split(',', 1)[0]
        prof.property = "faculty"
        if title_str:
            prof.title = title_str
        prof.phone = phone_str
        prof.email = email_str
        prof.website = website_str
        prof.room = office_str
        prof.write_to(ttl_file)
    ttl_file.close()
    return ttl_file