Пример #1
0
    def write_ttl(self):
        """Scrape the faculty table at ``self._link`` into a TTL file.

        Every ``<tr>`` on the page is treated as one faculty member: the
        row's first image supplies the picture, and the row text split on
        newlines supplies name, title, phone and email (in that order).

        Returns:
            The ``ttl.TtlFile`` that was written, already closed.
        """
        out = ttl.TtlFile(self.ttl_filename)

        response = requests.get(self._link)
        try:
            response.raise_for_status()
        except Exception as exc:
            # The HTTP error is only reported; the body is parsed regardless.
            print('There was a problem: %s' % (exc))
        page = BeautifulSoup(response.text, "html.parser")

        for row in page.select('tr'):
            img_src = row.select('img')[0]['src']
            fields = row.getText().strip().split('\n')

            entry = ttl.TtlFileEntry()
            entry.picture = self._flink + img_src
            entry.property = "faculty"
            entry.name = fields[0]
            entry.title = fields[1]
            # Phone and email are skipped when the cell is whitespace only.
            if not fields[2].isspace():
                entry.phone = fields[2]
            if not fields[3].isspace():
                entry.email = fields[3]

            entry.write_to(out)

        out.close()
        return out
Пример #2
0
    def write_ttl(self):
        """Scrape the faculty listing at ``self._link`` into a TTL file.

        The first two ``<tr>`` rows of the page are skipped. Each remaining
        row yields one faculty entry: the ``<strong>`` text is the name, the
        node after the first ``<br>`` is the title, the third ``<p>`` holds
        email / phone / room on separate lines and the fourth ``<p>`` holds
        the research interests.

        Returns:
            The ``ttl.TtlFile`` that was written, already closed.
        """
        ttl_file = ttl.TtlFile(self.ttl_filename)

        webpage = requests.get(self._link)
        try:
            webpage.raise_for_status()
        except Exception as exc:
            # The HTTP error is only reported; the body is parsed regardless.
            print('There was a problem: %s' % (exc))
        soup = BeautifulSoup(webpage.text, "html.parser")

        for row in soup.select('tr')[2:]:
            paragraphs = row.select('p')
            contact_info = paragraphs[2].getText().split('\n')

            prof = ttl.TtlFileEntry()
            prof.name = row.find('strong').getText()
            prof.property = "faculty"
            prof.title = row.find('br').next_sibling
            prof.email = contact_info[0]
            prof.phone = contact_info[1]
            prof.room = contact_info[2]
            # Lower-case attribute, matching the other scrapers' entries.
            prof.interests = paragraphs[3].getText()

            prof.write_to(ttl_file)

        ttl_file.close()

        return ttl_file
Пример #3
0
    def write_ttl(self):
        """Scrape the faculty tables at ``self._link`` into a TTL file.

        The first ``<article>`` on the page and the first ``<tr>`` of every
        remaining article are skipped. In each other row, cell 0 is the
        name, cell 1 holds title and department on either side of a
        ``<br>``, and cell 2 holds email and phone on either side of a
        ``<br>``.

        Returns:
            The ``ttl.TtlFile`` that was written, already closed.
        """
        out = ttl.TtlFile(self.ttl_filename)

        response = requests.get(self._link)
        try:
            response.raise_for_status()
        except Exception as exc:
            # The HTTP error is only reported; the body is parsed regardless.
            print('There was a problem: %s' % (exc))
        page = BeautifulSoup(response.text, "html.parser")

        for article in page.select('article')[1:]:
            for row in article.select('tr')[1:]:
                cells = row.select('td')
                role_break = cells[1].select('br')[0]
                contact_break = cells[2].select('br')[0]

                entry = ttl.TtlFileEntry()
                entry.name = cells[0].getText()
                entry.property = "faculty"
                entry.title = role_break.previous_sibling
                entry.department = role_break.next_sibling
                entry.email = contact_break.previous_sibling.getText()
                entry.phone = contact_break.next_sibling
                entry.write_to(out)

        out.close()
        return out
Пример #4
0
    def _parse_prof_site(self, website):
        """Build a faculty entry from a single professor profile page.

        Args:
            website: URL of the profile page to download and parse.

        Returns:
            A ``ttl.TtlFileEntry`` with name, title, email, room and phone
            filled in. The title is the non-empty text of the first two
            ``<h2>`` headings joined with ``"; "`` (empty when neither has
            text).
        """
        webpage = requests.get(website)
        try:
            webpage.raise_for_status()
        except Exception as exc:
            # The HTTP error is only reported; the body is parsed regardless.
            print('There was a problem: %s' % (exc))
        soup = BeautifulSoup(webpage.text, "html.parser")

        prof_div = soup.find('div', {"class": "faculty-profile"})
        name = prof_div.getText()

        # Join whichever of the first two headings carry text. This always
        # binds ``title`` (previously unbound when both were empty) and
        # tolerates pages with fewer than two <h2> elements.
        heading_texts = [h2.getText() for h2 in soup.findAll('h2')[:2]]
        title = "; ".join(text for text in heading_texts if text)

        grey_box = soup.find('div', {"class": "grey-box"})
        # The first link's href is split on ':' and the part after it is
        # taken as the address.
        a_href = grey_box.select('a')[0]['href']
        email = a_href.split(':')[1]
        text_list = grey_box.getText().split('\n')
        # Lines 2 and 4 of the box text are "<label>:<value>" pairs.
        office = text_list[2].split(':')[1]
        phone = text_list[4].split(':')[1]

        prof = ttl.TtlFileEntry()
        prof.property = "faculty"
        prof.name = name
        prof.title = title
        prof.email = email
        prof.room = office
        prof.phone = phone

        return prof
Пример #5
0
    def _refreshFromSoup(self, soup, ttl_file):
        """Write one faculty entry per ``FacultyTableRow`` row in *soup*.

        Args:
            soup: Parsed page to search for ``<tr class="FacultyTableRow">``.
            ttl_file: Destination passed to each entry's ``write_to``.

        The row's text nodes (bare ``"\\n"`` nodes dropped) are read
        positionally: "name, education", title, room, then either
        email + interests or interests alone.
        """
        for row in soup.findAll('tr', {"class": "FacultyTableRow"}):
            texts = [node for node in row.findAll(text=True) if node != "\n"]
            name_and_edu = texts[0].split(',', 1)

            entry = ttl.TtlFileEntry()

            entry.name = name_and_edu[0]
            entry.property = "faculty"
            if len(name_and_edu) > 1:
                entry.education = name_and_edu[1]
            entry.title = texts[1]
            entry.room = texts[2]

            # The email line is optional; without it the interests move up.
            fourth = texts[3]
            if "@" not in fourth:
                entry.interests = fourth
            else:
                entry.email = fourth
                entry.interests = texts[4]

            entry.write_to(ttl_file)
Пример #6
0
    def _parse_prof_site(self, website):
        """Build a faculty entry from a professor's profile page.

        Args:
            website: URL of the profile page to download and parse.

        Returns:
            A ``ttl.TtlFileEntry`` populated from the newline-separated text
            of the ``faculty-profile`` div, read by fixed line positions.
        """
        response = requests.get(website)
        try:
            response.raise_for_status()
        except Exception as exc:
            # The HTTP error is only reported; the body is parsed regardless.
            print('There was a problem: %s' % (exc))
        page = BeautifulSoup(response.text, "html.parser")

        profile = page.find('div', {"class": "faculty-profile"})
        lines = profile.getText().strip().split("\n")

        name = lines[0]
        education = lines[1]
        title = lines[2]
        department = lines[4]

        # Lines 6 and 7 may each hold a phone number; keep the non-empty ones.
        phone = "; ".join(number for number in (lines[6], lines[7]) if number)

        email = lines[8]
        room = lines[9]

        # The bio is the first non-empty line among 12, 13 and 14.
        bio = lines[12] or lines[13] or lines[14]
        if not bio:
            # No bio found: dump the indexed lines to stdout.
            for idx, line in enumerate(lines):
                print(str(idx) + ":" + line)

        # The last "Research Interests" line from index 14 onward wins.
        interests = ""
        for line in lines[14:]:
            if line.startswith("Research Interests"):
                interests = line.split(":")[1]

        entry = ttl.TtlFileEntry()
        entry.property = "faculty"
        entry.name = name
        entry.education = education
        entry.title = title
        entry.department = department
        entry.email = email
        entry.room = room
        entry.phone = phone
        entry.bio = bio
        entry.interests = interests

        return entry
Пример #7
0
    def _parse_prof_site(self, website):
        """Collect bio, education, publications and website from a profile.

        Args:
            website: URL of the profile page to download and parse.

        Returns:
            A ``ttl.TtlFileEntry`` carrying ``bio``, ``education``,
            ``publications`` (at most three) and ``website``.
        """
        response = requests.get(website)
        try:
            response.raise_for_status()
        except Exception as exc:
            # The HTTP error is only reported; the body is parsed regardless.
            print('There was a problem: %s' % (exc))
        page = BeautifulSoup(response.text, "html.parser")

        blocks = page.find('div', {"id": "center-rail"}).findAll('div')
        bio_block = blocks[3]

        # The education list sits in div 4, or in div 5 when div 4 has no <ul>.
        edu_list = blocks[4].find("ul") or blocks[5].find("ul")
        edu_items = edu_list.findAll("li") if edu_list else []
        edu_str = "".join(item.getText() + ". \n" for item in edu_items)

        # Publications are optional: look in div 6, then div 7, and keep
        # only the first few entries.
        max_publications = 3
        pub_str = ""
        if len(blocks) > 6:
            pub_list = blocks[6].find("ul")
            if not pub_list and len(blocks) > 7:
                pub_list = blocks[7].find("ul")
            if pub_list:
                shown = pub_list.findAll("li")[:max_publications]
                pub_str = "".join(item.getText() + ". \n" for item in shown)

        # Use the text of the first link in the site panel when the panel
        # exists, otherwise the profile URL itself.
        site_panel = page.find("div",
                               {"id": "bodytag_2_rightrail_0_pnlSite"})
        if site_panel:
            website_str = site_panel.select("a")[0].getText()
        else:
            website_str = website

        entry = ttl.TtlFileEntry()
        entry.bio = bio_block.getText()
        entry.education = edu_str
        entry.publications = pub_str
        entry.website = website_str

        return entry
Пример #8
0
    def write_ttl(self):
        """Scrape the faculty table at ``self._link`` into a TTL file.

        The first two and the last fourteen ``<tr>`` rows of the page are
        skipped. For each remaining row the ``<strong>`` text is the name,
        the node after the first ``<br>`` is the title, the second cell
        holds email / phone / room on lines 1-3, the third cell holds the
        interests and the first image is the picture.

        Returns:
            The ``ttl.TtlFile`` that was written, already closed.
        """
        ttl_file = ttl.TtlFile(self.ttl_filename)

        webpage = requests.get(self._link)
        try:
            webpage.raise_for_status()
        except Exception as exc:
            # The HTTP error is only reported; the body is parsed regardless.
            print('There was a problem: %s' % (exc))
        soup = BeautifulSoup(webpage.text, "html.parser")

        elems = soup.select('tr')
        # range() (not a slice) so a short page yields no rows instead of a
        # negative-index slice.
        for i in range(2, len(elems) - 14):
            row = elems[i]
            cells = row.select('td')

            prof = ttl.TtlFileEntry()

            nameStr = row.find('strong').getText().strip()
            titleStr = row.br.next_sibling.strip()

            # Each contact line is optional; guard every index the same way
            # so a short contact cell does not raise IndexError.
            contactList = cells[1].getText().splitlines()
            if len(contactList) > 1 and contactList[1]:
                prof.email = contactList[1].strip()
            if len(contactList) > 2 and contactList[2]:
                prof.phone = contactList[2].strip()
            if len(contactList) > 3 and contactList[3]:
                prof.room = contactList[3].strip()

            interestsStr = cells[2].getText().strip()
            img_src = row.select('img')[0]['src'].strip()
            pictureStr = self._flink + img_src

            prof.name = nameStr
            prof.property = "faculty"
            prof.title = titleStr
            prof.interests = interestsStr
            prof.picture = pictureStr

            prof.write_to(ttl_file)

        ttl_file.close()

        return ttl_file
    def write_ttl(self):
        """Convert the CSV at ``self.csv_filename`` into building entries.

        Each CSV row becomes one ``ttl.TtlFileEntry`` with property
        ``"building"``. The ``Name`` column is a ``;``-separated list,
        extended with ``self.filter_common_names``; the first name is the
        primary one and the rest (plus ``Code`` and its title-cased form,
        when present) become alternative names. Opening and closing hours
        are copied from the ``<Day>StartTime`` / ``<Day>EndTime`` columns.

        Returns:
            The ``ttl.TtlFile`` that was written, already closed.
        """
        # newline='' is what the csv module expects of its input file, so
        # quoted fields containing line breaks are read correctly.
        with open(self.csv_filename, 'r', newline='') as f:
            ttl_file = ttl.TtlFile(self.ttl_filename)
            for row in csv.DictReader(f):
                building = ttl.TtlFileEntry()
                building.property = "building"

                names = row["Name"].split(';')
                names = names + self.filter_common_names(names)

                building.name = names[0]
                building.altnames = names[1:]
                if row["Code"]:
                    building.altnames.append(row["Code"])
                    building.altnames.append(row["Code"].title())

                building.department = row["Function"]
                building.address = row["Address"]
                building.picture = row["Picture"]
                building.website = row["Website"]

                # Column "MStartTime" maps to attribute "mStartTime", etc.;
                # all start times are set first, then all end times.
                for suffix in ("StartTime", "EndTime"):
                    for day in ("M", "T", "W", "Th", "F", "Sa", "Su"):
                        setattr(building, day.lower() + suffix,
                                row[day + suffix])

                building.write_to(ttl_file)

            ttl_file.close()
            return ttl_file
Пример #10
0
    def write_ttl(self):
        """Scrape the faculty table at ``self._link`` into a TTL file.

        The first two and the last six ``<tr>`` rows of the page are
        skipped. For each remaining row the ``<strong>`` text is the name,
        the node after the first ``<br>`` is the title, and the third to
        sixth ``<p>`` elements hold email, phone, room and interests.

        Returns:
            The ``ttl.TtlFile`` that was written, already closed.
        """
        ttl_file = ttl.TtlFile(self.ttl_filename)

        webpage = requests.get(self._link)
        try:
            webpage.raise_for_status()
        except Exception as exc:
            # The HTTP error is only reported; the body is parsed regardless.
            print('There was a problem: %s' % (exc))
        soup = BeautifulSoup(webpage.text, "html.parser")

        elems = soup.select('tr')
        # range() (not a slice) so a short page yields no rows instead of a
        # negative-index slice.
        for i in range(2, len(elems) - 6):
            row = elems[i]
            paragraphs = row.select('p')

            prof = ttl.TtlFileEntry()

            prof.name = row.find('strong').getText()
            prof.property = "faculty"
            prof.title = row.br.next_sibling.strip()
            prof.email = paragraphs[2].getText()
            prof.phone = paragraphs[3].getText()
            prof.room = paragraphs[4].getText()
            # Lower-case attribute, matching the other scrapers' entries.
            prof.interests = paragraphs[5].getText().strip()

            prof.write_to(ttl_file)

        ttl_file.close()

        return ttl_file
Пример #11
0
    def write_ttl(self):
        """Scrape ``FacultyTableRow`` rows at ``self._link`` into a TTL file.

        Each row supplies an optional headshot, a "name, degree" heading,
        a contact block whose children 1 and 3 hold the title and the
        newline-separated office / email / phone, and a department read
        from the node following the row's second ``<td>``.

        Returns:
            The ``ttl.TtlFile`` that was written, already closed.
        """
        out = ttl.TtlFile(self.ttl_filename)

        response = requests.get(self._link)
        try:
            response.raise_for_status()
        except Exception as exc:
            # The HTTP error is only reported; the body is parsed regardless.
            print('There was a problem: %s' % (exc))
        page = BeautifulSoup(response.text, "html.parser")

        for row in page.findAll('tr', {"class": "FacultyTableRow"}):
            headshot = row.find('div', {"class": "facultyHeadshot"}).img
            info = row.find('div', {"class": "fac-info"})
            heading = info.find('h2', {"class": "fname"})
            name_and_degree = heading.getText().split(",", 1)
            contact = info.find('div', {"class": "fcontact"})
            location_lines = contact.contents[3].getText().split("\n")

            entry = ttl.TtlFileEntry()

            if headshot is not None:
                entry.picture = "http://drexel.edu" + headshot['src']
            entry.name = name_and_degree[0]
            entry.property = "faculty"
            # NOTE(review): other scrapers in this project store these as
            # `education` and `room`; confirm `degree`/`office` are intended.
            if len(name_and_degree) > 1:
                entry.degree = name_and_degree[1]
            entry.title = contact.contents[1].getText()
            entry.office = location_lines[0]
            entry.email = location_lines[1]
            entry.phone = location_lines[2]
            entry.department = row.find_all('td')[1].next_element

            entry.write_to(out)

        out.close()
        return out
Пример #12
0
    def _parse_prof_site(self, website):
        """Build a faculty entry from a professor's profile header.

        Args:
            website: URL of the profile page to download and parse.

        Returns:
            A ``ttl.TtlFileEntry`` with website, picture, room, phone,
            email, name and education taken from the ``profile-header``
            div.
        """
        response = requests.get(website)
        try:
            response.raise_for_status()
        except Exception as exc:
            # The HTTP error is only reported; the body is parsed regardless.
            print('There was a problem: %s' % (exc))
        page = BeautifulSoup(response.text, "html.parser")

        header = page.find("div", {"class": "profile-header"})

        picture = self._flink + header.find("img")["src"]

        # The list items are read as room, phone, email in that order.
        contact_items = header.find(
            "div", {"class": "faculty-txt"}).select("li")
        room = contact_items[0].getText()
        phone = contact_items[1].getText()
        email = contact_items[2].getText()

        # Only the text before the first comma is kept as the name.
        name = header.find(
            "div", {"class": "faculty-name"}).getText().split(",")[0]

        # Education is every sibling after the <h2> in the edu div,
        # skipping the <br/> separators, each followed by "; ".
        edu_div = header.find("div", {"class": "faculty-edu"})
        parts = []
        node = edu_div.find("h2").next_sibling
        while node is not None:
            rendered = str(node)
            if '<br/>' not in rendered:
                parts.append(rendered + "; ")
            node = node.next_sibling
        education = "".join(parts)

        entry = ttl.TtlFileEntry()
        entry.property = "faculty"
        entry.website = website
        entry.picture = picture
        entry.room = room
        entry.phone = phone
        entry.email = email
        entry.name = name
        entry.education = education

        return entry
Пример #13
0
    def write_ttl(self):
        """Scrape the first ``<tbody>`` at ``self._link`` into a TTL file.

        For every row, cell 0 optionally holds the picture, cell 1 holds
        newline-separated text whose lines 1-4 are "name, ..." / title /
        phone / email, and cell 2 is the department.

        Returns:
            The ``ttl.TtlFile`` that was written, already closed.
        """
        out = ttl.TtlFile(self.ttl_filename)

        response = requests.get(self._link)
        try:
            response.raise_for_status()
        except Exception as exc:
            # The HTTP error is only reported; the body is parsed regardless.
            print('There was a problem: %s' % (exc))
        page = BeautifulSoup(response.text, "html.parser")

        for row in page.find('tbody').select('tr'):
            cells = row.select('td')

            # The picture stays empty when the first cell has no image.
            img = cells[0].find('img')
            picture = self._flink + img['src'] if img else ""

            info = cells[1].getText().split("\n")
            name = info[1].split(',')[0]
            title = info[2]
            phone = info[3]
            email = info[4]

            department = cells[2].getText()

            entry = ttl.TtlFileEntry()
            entry.name = name
            entry.property = "faculty"
            entry.picture = picture
            entry.title = title
            entry.phone = phone
            entry.email = email
            entry.department = department
            entry.write_to(out)

        out.close()
        return out
Пример #14
0
    def write_ttl(self):
        """Scrape the first ``<tbody>`` at ``self._link`` into a TTL file.

        The first two rows are skipped. In each remaining row, cell 0
        holds the picture, name (``<h1>``) and title (``<h2>``); cell 1
        holds the email link and, after the first ``<br>``, a
        "label:number" phone line; cell 2 holds the interests.

        Returns:
            The ``ttl.TtlFile`` that was written, already closed.
        """
        ttl_file = ttl.TtlFile(self.ttl_filename)

        webpage = requests.get(self._link)
        try:
            webpage.raise_for_status()
        except Exception as exc:
            # The HTTP error is only reported; the body is parsed regardless.
            print('There was a problem: %s' % (exc))
        soup = BeautifulSoup(webpage.text, "html.parser")

        table = soup.select('tbody')[0]
        for row in table.select('tr')[2:]:
            cells = row.select("td")

            picture = self._flink + cells[0].find('img')['src']
            nameStr = cells[0].find('h1').getText()
            titleStr = cells[0].find('h2').getText()
            emailStr = cells[1].find('a').getText()
            # Keep what follows the first ':' up to the end of that line.
            phoneStr = cells[1].find('br').next_sibling
            phoneStr = phoneStr.split(":")[1].split('\n')[0]
            interestsStr = cells[2].getText()

            prof = ttl.TtlFileEntry()

            prof.name = nameStr
            prof.property = "faculty"
            prof.picture = picture
            prof.title = titleStr
            prof.email = emailStr
            prof.phone = phoneStr
            # Lower-case attribute, matching the other scrapers' entries.
            prof.interests = interestsStr

            prof.write_to(ttl_file)

        ttl_file.close()

        return ttl_file
Пример #15
0
    def write_ttl(self):
        """Scrape the faculty table at ``self._link`` into a TTL file.

        The first three ``<tr>`` rows are skipped. In each remaining row
        the ``<h1>`` is the name, the first ``<p>`` is the title, the
        second link's href (prefixed with ``self._flink``) is the website,
        the third link's text is the email, the first image (if any) is
        the picture, and the node after the first ``<br>`` of the third
        cell is the phone line.

        Returns:
            The ``ttl.TtlFile`` that was written, already closed.
        """
        ttl_file = ttl.TtlFile(self.ttl_filename)

        webpage = requests.get(self._link)
        try:
            webpage.raise_for_status()
        except Exception as exc:
            # The HTTP error is only reported; the body is parsed regardless.
            print('There was a problem: %s' % (exc))
        soup = BeautifulSoup(webpage.text, "html.parser")

        for row in soup.select('tr')[3:]:
            links = row.select('a')
            nameStr = row.find('h1').getText()
            titleStr = row.find('p').getText()
            websiteStr = self._flink + links[1]['href']
            emailStr = links[2].getText()
            images = row.select('img')

            # A whitespace-only phone node is kept as is; otherwise the
            # part after the first ':' is used.
            phoneStr = row.select('td')[2].find('br').next_sibling
            if not phoneStr.isspace():
                phoneStr = phoneStr.split(":")[1]

            prof = ttl.TtlFileEntry()
            prof.name = nameStr
            prof.property = 'faculty'
            # The picture URL used to be computed and then dropped; store
            # it on the entry when the row actually has an image.
            if images:
                prof.picture = self._flink + images[0]['src']
            prof.website = websiteStr
            prof.title = titleStr
            prof.email = emailStr
            prof.phone = phoneStr

            prof.write_to(ttl_file)

        ttl_file.close()
        return ttl_file
Пример #16
0
    def write_ttl(self):
        """Scrape the profile stubs at ``self._link`` into a TTL file.

        Every ``<div class="user-profile-stub clearfix">`` becomes one
        faculty entry. Its text nodes (bare ``"\\n"`` nodes dropped) are
        read positionally: 0 is "name, education", 1 title, 2 department,
        4 room, 6 phone, 8 email; everything after an "Areas of Expertise"
        node is joined into the interests.

        Returns:
            The ``ttl.TtlFile`` that was written, already closed.
        """
        out = ttl.TtlFile(self.ttl_filename)

        response = requests.get(self._link)
        try:
            response.raise_for_status()
        except Exception as exc:
            # The HTTP error is only reported; the body is parsed regardless.
            print('There was a problem: %s' % (exc))
        page = BeautifulSoup(response.text, "html.parser")

        stubs = page.findAll('div', {"class" : "user-profile-stub clearfix"})
        for stub in stubs:
            texts = [node for node in stub.findAll(text=True) if node != "\n"]
            name_and_edu = texts[0].split(',', 1)

            entry = ttl.TtlFileEntry()

            entry.name = name_and_edu[0]
            entry.property = "faculty"
            if len(name_and_edu) > 1:
                entry.education = name_and_edu[1]
            entry.title = texts[1]
            entry.department = texts[2]
            entry.room = texts[4]
            entry.phone = texts[6]
            entry.email = texts[8]
            if "Areas of Expertise" in texts:
                first_interest = texts.index("Areas of Expertise") + 1
                entry.interests = ", ".join(texts[first_interest:])

            entry.write_to(out)

        out.close()
        return out
Пример #17
0
    def _parse_prof_site(self, website):
        """Build a faculty entry from a professor's profile page.

        Downloads *website*, then reads picture, name, title, department,
        bio, interests and education from the ``profile-col1`` div, and
        email, room and phone from the ``grey-box`` div when it exists.
        Every optional field defaults to an empty string.

        Args:
            website: URL of the profile page to download and parse.

        Returns:
            A populated ``ttl.TtlFileEntry`` with property ``"faculty"``.
        """
        webpage = requests.get(website)
        try:
            webpage.raise_for_status()
        except Exception as exc:
            # The HTTP error is only reported; the body is parsed regardless.
            print('There was a problem: %s' % (exc))
        soup = BeautifulSoup(webpage.text, "html.parser")

        prof_div = soup.find('div', {"class": "profile-col1"})
        # Picture is optional: first <img> in the profile column, if any.
        img = prof_div.select('img')
        picture = ""
        if img:
            img_src = img[0]['src']
            picture = self._flink + img_src
        name = prof_div.select('h1')[0].getText()
        title = prof_div.select('h2')[0].getText()

        # Department text is "<label>:<value>"; keep the part after ':'.
        department = ''
        department_p = prof_div.find(
            "p", {"id": "bodytag_2_centerrail_0_pDepartment"})
        if department_p:
            department = department_p.getText().split(":")[1]

        # Bio: first <p> of the bio div, or the div's whole text when it
        # contains no <p>.
        bio = ''
        bio_div = prof_div.find(
            "div", {"id": "bodytag_2_centerrail_0_divPatientCareBio"})
        if bio_div:
            bio_p = bio_div.select("p")
            if bio_p:
                bio = bio_p[0].getText()
            else:
                bio = bio_div.getText()

        interests = ''
        interests_div = prof_div.find(
            "div", {"id": "bodytag_2_centerrail_0_divResearchOverviewText"})
        if interests_div:
            interests = interests_div.getText()

        # Education: texts of the list's direct children joined with "; ".
        # NOTE(review): iterating the <ul> yields every child node, which
        # may include whitespace text nodes between <li> items - confirm.
        education = ''
        education_div = prof_div.find(
            "ul", {"id": "bodytag_2_centerrail_0_ulEducation"})
        if education_div:
            for e in education_div:
                if education:
                    education = education + "; " + e.getText()
                else:
                    education = e.getText()

        email = ""
        room = ""
        phone = ""
        grey_box = soup.find('div', {"class": "grey-box"})
        if grey_box:
            # Email is the part of the first link's href after ':'.
            a = grey_box.select('a')
            if a:
                a_href = a[0]['href']
                email = a_href.split(":")[1]

            # Take the text of every <br> and strip from each one a suffix
            # as long as the next <br>'s text.
            # NOTE(review): presumably each <br> is parsed as containing
            # the content that follows it, so its text ends with the next
            # <br>'s text - TODO confirm against the real markup.
            # NOTE(review): when g_next is empty, g[:-0] evaluates to ''
            # rather than g - confirm that is intended.
            grey_box_br = grey_box.select('br')
            grey_box_text = list(map(lambda x: x.getText(), grey_box_br))
            for j in range(0, len(grey_box_text) - 1):
                g = grey_box_text[j]
                g_next = grey_box_text[j + 1]
                grey_box_text[j] = g[:-len(g_next)]

            # Index of the first line starting with "Phone" (-1 if none).
            phone_index = -1
            for j in range(0, len(grey_box_text)):
                e = grey_box_text[j]
                if e.startswith("Phone"):
                    phone_index = j
                    break

            # Lines before the phone line are joined into the room.
            # NOTE(review): with phone_index == -1 this slice is every
            # line except the last one - confirm that is intended.
            for e in grey_box_text[:phone_index]:
                if room:
                    room += ", " + e
                else:
                    room = e

            if phone_index != -1:
                phone = grey_box_text[phone_index].split(":")[1]

        prof = ttl.TtlFileEntry()
        prof.property = "faculty"
        prof.picture = picture
        prof.name = name
        prof.education = education
        prof.title = title
        prof.department = department
        prof.email = email
        prof.room = room
        prof.phone = phone
        prof.bio = bio
        prof.interests = interests

        return prof
Пример #18
0
    def write_ttl(self):
        """Crawl the directory page at ``self._link`` into a TTL file.

        Every link whose href contains ``/directory/`` (other than the
        directory index itself) is fetched as a profile page. Name and
        title come from the ``faculty-name`` and ``title`` divs, phone /
        email / website from lines 3-6 of the ``contact`` div, and the
        office from line 2 of the ``location`` div. A contact field is
        written only when the current profile actually provides it.

        Returns:
            The ``ttl.TtlFile`` that was written, already closed.
        """
        ttl_file = ttl.TtlFile(self.ttl_filename)

        webpage = requests.get(self._link)
        try:
            webpage.raise_for_status()
        except Exception as exc:
            # The HTTP error is only reported; the body is parsed regardless.
            print('There was a problem: %s' % (exc))
        soup = BeautifulSoup(webpage.text, "html.parser")

        for anchor in soup.findAll('a'):
            # Anchors without an href cannot point at a profile; .get()
            # avoids the KeyError that indexing would raise for them.
            href = anchor.get("href", "")
            if "/directory/" not in href:
                continue
            if href == "/westphal/about/directory/":
                continue

            fpage = requests.get(self._flink + href)
            try:
                fpage.raise_for_status()
            except Exception as exc:
                print('There was a problem: %s' % (exc))
            fsoup = BeautifulSoup(fpage.text, "html.parser")

            nameStr = fsoup.find('div', {"class": "faculty-name"}).getText()
            titleStr = fsoup.find('div', {"class": "title"}).getText()

            # Reset for every profile so a missing field is never unbound
            # and never inherits the previous professor's value.
            phoneStr = emailStr = websiteStr = officeStr = ""

            contactStr = fsoup.find('div', {"class": "contact"}).getText()
            contactList = contactStr.splitlines()
            if len(contactList) > 3 and "PH:" in contactList[3]:
                phoneStr = contactList[3].split(": ")[1].replace(".", "")
            if len(contactList) > 4 and "Email:" in contactList[4]:
                emailStr = contactList[4].split(": ")[1]
            if len(contactList) > 5 and "Website:" in contactList[5]:
                # The URL is either on the "Website: " line itself or on
                # the line right after it.
                websiteStr = contactList[5].split(": ")[1]
                if not websiteStr and len(contactList) > 6:
                    websiteStr = contactList[6].strip()

            locationStr = fsoup.find('div', {"class": "location"}).getText()
            locationList = locationStr.splitlines()
            if len(locationList) > 2 and locationList[2]:
                officeStr = locationList[2]

            prof = ttl.TtlFileEntry()
            prof.name = nameStr.split(',', 1)[0]
            prof.property = "faculty"
            if titleStr:
                prof.title = titleStr
            if phoneStr:
                prof.phone = phoneStr
            if emailStr:
                prof.email = emailStr
            if websiteStr:
                prof.website = websiteStr
            if officeStr:
                prof.room = officeStr

            prof.write_to(ttl_file)

        ttl_file.close()
        return ttl_file