def save_info(name, email, web_url, img_url, major, org):
    """Save one scholar to the DB and download their profile picture.

    Commits a ``People`` row, then (when both ``img_url`` and ``email`` are
    present) fetches the image to ``pic/<email>.jpg``; failed downloads are
    appended to ``timeout.txt`` for later retry.
    """
    # 'orginazation' matches the (misspelled) column name on the People model.
    user = People(email=email, name=name, major=major, web=web_url,
                  orginazation=org)
    session.add(user)
    try:
        session.commit()
    except Exception:  # e.g. duplicate key -- keep the session usable
        session.rollback()
    time.sleep(1)  # throttle DB/HTTP activity
    if img_url is not None and email is not None:
        print(img_url)
        try:
            pic = requests.Session().get(img_url, timeout=30)
            with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                      + email + ".jpg", "wb") as f:
                f.write(pic.content)  # 'with' closes f; explicit close removed
        except Exception:
            with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                      "a") as f:
                # trailing newline keeps one failure per line in the log
                f.write(email + " : " + img_url + "\n")
def getInfo(url):
    """Scrape UNM engineering directory entries and store non-emeritus faculty."""
    # Retry until the listing page loads (was unbounded recursion).
    while True:
        try:
            res = fetch(url)
            break
        except Exception:
            continue
    tmp = extract("//div[starts-with(@class,'single-person-entry')]", res, True)
    for each in tmp:
        source = str(etree.tostring(each))
        title = extract("//span[@class='personlist-title']/text()", source)
        if title is not None and "Emeritus" in title:
            continue  # skip emeritus faculty
        # renamed from 'url' to avoid shadowing the function parameter
        img_src = extract("//img[@class='left person-list']/@src", source)
        if img_src is not None:
            pic_url = "http://engineering.unm.edu/faculty/directory/" + img_src
        else:
            pic_url = ""
        email = extract("//table/tr[1]/td[2]/a/@href", source)
        if email is not None:
            email = email.split(':')[-1]  # strip the 'mailto:' scheme
        else:
            continue  # email is required; skip entries without one
        name = extract("//h4/a/text()", source)
        web_url = extract("//h4/a/@href", source)
        if web_url is not None:
            web = "http://engineering.unm.edu/faculty/directory/" + web_url
        else:
            web = ""
        major = extract("//p[@class='areas']/text()", source)
        if major:
            major = "Department of " + major
        print(pic_url, " ", name, " ", email, " ", web, " ", major)
        try:
            pic = requests.Session().get(pic_url, timeout=30)
            with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                      + email + ".jpg", "wb") as f:
                f.write(pic.content)
        except Exception:
            with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                      "a") as f:
                f.write(email + " : " + pic_url + "\n")
        user = People(email=email, name=name, major=major, web=web)
        session.add(user)
        try:
            session.commit()
        except Exception:
            session.rollback()
        time.sleep(1)
def get_info(url, major):
    """Scrape LSU directory rows, visit each personal page, store the result."""
    global img_url
    # Retry until the listing page loads (was unbounded recursion).
    while True:
        try:
            html = fetch(url)
            tmp = extract('//table[@id="scholarships"]/tbody/tr', html, True)
            break
        except Exception:
            continue
    for i in tmp:
        each = str(etree.tostring(i))
        web = extract("//a/@href", each)
        if web:
            web_url = "https://www.lsu.edu/" + web
            try:
                browser = webdriver.Chrome(
                    chrome_options=SelenuimParser(stopimage=2))
                browser.get(web_url)
                # NOTE(review): set after get(), so it cannot limit this
                # load -- presumably intended to go before; kept as-is.
                browser.set_page_load_timeout(3)
            except TimeoutException as e:
                print(e)
                browser.execute_script('window.stop()')  # keep the partial DOM
            finally:
                source = browser.page_source
                email = extract("//a[contains(@href, '@')]/text()", source)
                name = extract("//h1[@class='fac-name']/text()", source)
                img_url = extract("//div[@class='fac-photo']/img/@src", source)
                print(name, email, major, web_url, img_url)
                user = People(email=email, name=name, major=major, web=web_url)
                session.add(user)
                try:
                    session.commit()
                except Exception:
                    session.rollback()
                time.sleep(1)
                if img_url is not None and email is not None:
                    print(img_url)
                    try:
                        pic = requests.Session().get(img_url, timeout=30)
                        with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                                  + email + ".jpg", "wb") as f:
                            f.write(pic.content)
                    except Exception:
                        with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                                  "a") as f:
                            f.write(email + " : " + img_url + "\n")
                browser.quit()
        else:
            continue
def getInfo(url, frontstr, org="Kansas State University"): try: res = fetch(url) except: return getInfo(url, frontstr, org) tmp = extract("//tbody/tr/td", res, True) # print(tmp) major = extract("//div[@id='ksu-unitbar']/h2/a/text()", res) for each in tmp: source = str(etree.tostring(each)) # print(source) email = extract("//a[contains(@href, '@')]/text()", source) if not email: continue name = extract("//strong/text()", source) if not name: name = extract("//strong/a/span/text()", source) web_url = extract("//a[contains(@href, '/people')]/@href", source) # print(web_url) img_url = "" if web_url: web_url = frontstr + web_url try: text = fetch(web_url) img_url = extract("//img[contains(@src, '/docs/people')]/@src", text) except: pass else: continue print(frontstr + str(img_url), " ", name, " ", email, " ", web_url, " ", major, " ", org) if img_url: try: img_url = frontstr + img_url pic = requests.Session().get(img_url, timeout=30) with open( "/Users/sunlei/scholar-private/out_of_data_module/pic/" + email + ".jpg", "wb") as f: f.write(pic.content) f.close() except: with open( "/Users/sunlei/scholar-private/out_of_data_module/timeout.txt", "a") as f: f.write(email + " : " + img_url + "\n") user = People(email=email, name=name, major=major, web=web_url, orginazation=org) session.add(user) try: session.commit() except: session.rollback() time.sleep(1)
def get_info(url, major, org="San Diego State University"): global img_url try: browser = webdriver.Chrome() # html = fetch(url) browser.get(url) # tmp = extract("//div[@class='container']/table[@width='100%']/tbody/tr", html, True) tmp = browser.find_elements_by_xpath("//table[@width='100%']/tbody/tr") # print(tmp) except Exception as e: print(e) return get_info(url, major) for i in tmp: # each = str(etree.tostring(i)) each = i.get_attribute('innerHTML') # print(each) img = extract("//td[2]/img/@src", each) if img: img_url = "http://ccee.sdsu.edu" + img # print(img) web_url = extract("//td[5]/a/@href", each) if not web_url: web_url = "" name = extract("//td[3]/p/text()", each) email = extract("//td[6]/a/@href", each) if email: email = email.split(':')[1] else: continue print(name, email, major, web_url, img_url, org) user = People(email=email, name=name, major=major, web=web_url, orginazation=org) session.add(user) try: session.commit() except: session.rollback() time.sleep(1) if img_url is not None and email is not None: # img = "http://be.utdallas.edu" + img_url # print(img_url) try: pic = requests.Session().get(img_url, timeout=300) with open("/Users/sunlei/scholar-private/out_of_data_module/pic/" + email + ".jpg", "wb") as f: f.write(pic.content) f.close() except Exception as e: print(e) print(img_url) with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt", "a") as f: f.write(email + " : " + img_url + "\n") f.close()
def get_info(url, org="CUNY--City College (Grove)"): global img_url # browser = webdriver.Chrome(chrome_options=SelenuimParser(stopimage=2, stopjs=2)) browser = webdriver.Chrome() try: browser.get(url) except Exception as e: print(e) browser.execute_script('window.stop()') finally: tmp = browser.find_elements_by_xpath("//ul[@id='list-staff']/li") for i in tmp: each = i.get_attribute('innerHTML') web = extract("//h3[@class='desktop']/a/@href", each) if web: web_url = "https://www.ccny.cuny.edu" + web else: web_url = "" email = extract("//a[contains(@href, '@')]/text()", each) # print(email) name = extract("//h3[@class='desktop']/a/text()", each) img_url = extract("//div[@class='lFloatGraphic']/a/img/@src", each) major = extract("//div[@class='inner2 floatLeft']/h4/text()", each) print(name, email, major, web_url, img_url, org) user = People(email=email, name=name, major=major, web=web_url, orginazation=org) session.add(user) try: session.commit() except: session.rollback() time.sleep(1) if img_url is not None and email is not None: # img = "http://be.utdallas.edu" + img_url print(img_url) try: pic = requests.Session().get(img_url, timeout=30) with open( "/Users/sunlei/scholar-private/out_of_data_module/pic/" + email + ".jpg", "wb") as f: f.write(pic.content) f.close() except: with open( "/Users/sunlei/scholar-private/out_of_data_module/timeout.txt", "a") as f: f.write(email + " : " + img_url) f.close() browser.quit()
def getInfo(url, major):
    """Scrape UB engineering profile pages and store each person."""
    # Retry until the listing page loads (was unbounded recursion).
    while True:
        try:
            res = fetch(url)
            break
        except Exception:
            continue
    tmp = extract("//div[@class='profilepage unstructuredpage page']", res, True)
    for each in tmp:
        source = str(etree.tostring(each))
        try:
            # renamed from 'url' to avoid shadowing the function parameter
            img_src = extract(
                "//div[@class='profileinfo-teaser-photo']/picture/img/@src",
                source)
            if img_src is not None:
                pic_url = "http://engineering.buffalo.edu/" + img_src
            else:
                pic_url = ""
            # .split(':')[-1] strips 'mailto:'; raises (and is caught
            # below) when no email link is present
            email = extract("//a[@class='longtext']/@href",
                            source).split(':')[-1]
            name = extract("//a[@class='title']/b/text()", source)
            web_url = extract("//a[@class='title']/@href", source)
            if web_url is not None:
                web = "http://engineering.buffalo.edu/" + web_url
            else:
                web = ""
            print(pic_url, " ", name, " ", email, " ", web, " ", major)
            try:
                pic = requests.Session().get(pic_url, timeout=20)
                with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                          + email + ".jpg", "wb") as f:
                    f.write(pic.content)
            except Exception:
                with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                          "a") as f:
                    f.write(email + " : " + pic_url + "\n")
            user = People(email=email, name=name, major=major, web=web)
            session.add(user)
            try:
                session.commit()
            except Exception:
                session.rollback()
            time.sleep(1)
        except Exception as e:
            print(e)  # skip malformed entries but keep scraping
def get_info(url, major):
    """Scrape the LSU BAE faculty table and store professor-rank rows."""
    global img_url
    # Retry until the page loads (was unbounded recursion).
    while True:
        try:
            html = fetch(url)
            tmp = extract('//*[@id="maincontent"]/div/div/table/tbody/tr',
                          html, True)
            break
        except Exception:
            continue
    for i in tmp:
        each = str(etree.tostring(i))
        title = extract("//td[3]/div[2]/text()", each)
        if title and "Professor" not in str(title):
            continue  # only keep professor-rank rows
        name = extract("//a[contains(@href, '/eng/bae')]/text()", each)
        img = extract("//img/@src", each)
        if img:
            # NOTE(review): when img is missing, img_url keeps its previous
            # (global) value from an earlier row -- kept as-is.
            img_url = "https://www.lsu.edu" + img
        email = extract("//a[contains(@href, '@')]/text()", each)
        if not email:
            continue
        web = extract("//a[contains(@href, '/eng/bae')]/@href", each)
        if web:
            web_url = "https://www.lsu.edu" + web
        else:
            web_url = ""
        print(name, email, major, web_url, img_url, title)
        user = People(email=email, name=name, major=major, web=web_url)
        session.add(user)
        try:
            session.commit()
        except Exception:
            session.rollback()
        time.sleep(1)
        if img_url is not None and email is not None:
            print(img_url)
            try:
                pic = requests.Session().get(img_url, timeout=30)
                with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                          + email + ".jpg", "wb") as f:
                    f.write(pic.content)
            except Exception:
                with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                          "a") as f:
                    f.write(email + " : " + img_url + "\n")
def getInfo(url, major, org="New Mexico State University"): try: res = fetch(url) except: return getInfo(url, major, org) tmp = extract("//div[@class='entry-content']/table[1]/tbody/tr", res, True) # print(tmp) for each in tmp: source = str(etree.tostring(each)) # print(source) email = extract("//a[contains(@href, '@')]/@href", source) if not email: continue else: email = email.split(':')[1] fullname = extract("//td[1]/a[text()]/text()", source) if fullname: web_url = extract("//td[1]/a/@href", source) name = fullname.split(',')[1] + " " + fullname.split(',')[0] else: # fullname = extract("//td[1]/a[text()]/text()", source) name = "" # name = fullname.split(',')[1] + " " + fullname.split(',')[0] # print(web_url) img_url = extract("//td[1]//img/@src", source) # print(name) print(img_url, " ", name, " ", email, " ", web_url, " ", major, " ", org) if img_url: try: pic = requests.Session().get(img_url, timeout=30) with open( "/Users/sunlei/scholar-private/out_of_data_module/pic/" + email + ".jpg", "wb") as f: f.write(pic.content) f.close() except: with open( "/Users/sunlei/scholar-private/out_of_data_module/timeout.txt", "a") as f: f.write(email + " : " + img_url + "\n") user = People(email=email, name=name, major=major, web=web_url, orginazation=org) session.add(user) try: session.commit() except: session.rollback() time.sleep(1)
def get_info(url, major):
    """Scrape LSU card listings, follow each personal page, store the result."""
    global img_url
    # Retry until the page loads (was unbounded recursion).
    while True:
        try:
            html = fetch(url)
            tmp = extract('//div[@class="col-md-9"]/div[@class="col-md-3"]',
                          html, True)
            break
        except Exception:
            continue
    for i in tmp:
        each = str(etree.tostring(i))
        web = extract("//p/a[1]/@href", each)
        if web:
            name = extract("//p/a[1]/@title", each)
            web_url = "https://www.lsu.edu" + web
            img = extract("//img/@src", each)
            if img:
                # NOTE(review): when img is missing, img_url keeps the
                # previous card's (global) value -- kept as-is.
                img_url = "https://www.lsu.edu" + img
            try:
                source = fetch(web_url)
                email = extract("//a[contains(@href, '@')]/text()", source)
                if not email:
                    continue
                print(name, email, major, web_url, img_url)
                user = People(email=email, name=name, major=major, web=web_url)
                session.add(user)
                try:
                    session.commit()
                except Exception:
                    session.rollback()
                time.sleep(1)
                if img_url is not None and email is not None:
                    try:
                        pic = requests.Session().get(img_url, timeout=30)
                        with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                                  + email + ".jpg", "wb") as f:
                            f.write(pic.content)
                    except Exception:
                        with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                                  "a") as f:
                            f.write(email + " : " + img_url + "\n")
            except Exception:
                print(name + "Failed")  # personal page failed; skip
        else:
            continue
def getInfo(url, major):
    """Scrape the UTA EE faculty directory and store each person."""
    # Retry until the page loads (was unbounded recursion).
    while True:
        try:
            res = fetch(url)
            break
        except Exception:
            continue
    tmp = extract("//div[@class='faculty-directory']/p", res, True)
    for each in tmp:
        source = str(etree.tostring(each))
        img_src = extract("//strong/img/@src", source)
        if img_src is not None:
            # src is relative like './photo.jpg'; keep only the file part
            pic_url = "http://www.uta.edu/ee/" + img_src.split("./")[-1]
        else:
            pic_url = ""
        email = re.search(r'<br/> <a href="(.*?)">', source, re.S)
        if email is not None:
            # FIX: parse the captured href ('mailto:addr') instead of the
            # Match object's repr, which truncates long matches.
            email = email.group(1).split(':')[-1].split('"')[0]
        else:
            continue
        name = extract("//strong/img/@alt", source)
        if name:
            # alt text is like 'Dr. First Last, Title'
            name = str(name).split('Dr. ')[-1].split(",")[0]
        else:
            continue
        web_url = re.findall(r'\| <a href="(.*?)">', source, re.S)
        if web_url:
            web = str(web_url[0])
        else:
            web = ""
        print(pic_url, " ", name, " ", email, " ", web, " ", major)
        try:
            pic = requests.Session().get(pic_url, timeout=30)
            with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                      + email + ".jpg", "wb") as f:
                f.write(pic.content)
        except Exception:
            with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                      "a") as f:
                f.write(email + " : " + pic_url + "\n")
        user = People(email=email, name=name, major=major, web=web)
        session.add(user)
        try:
            session.commit()
        except Exception:
            session.rollback()
        time.sleep(1)
def getInfo(url, major):
    """Scrape the WPI directory (rendered page) and store each person."""
    # Retry until the page loads (was unbounded recursion).
    while True:
        try:
            browser = webdriver.PhantomJS()
            browser.get(url)
            break
        except Exception:
            continue
    tmp = browser.find_elements_by_xpath(
        "//section[@aria-label='directory-table']/div")
    print(len(tmp))
    for each in tmp:
        source = each.get_attribute("innerHTML")
        pic_url = extract("//div[@class='member-photo']/img/@src", source)
        # FIX: the original tested the always-truthy page 'url', so pic_url
        # could stay None; test the extracted value instead.
        if not pic_url:
            pic_url = ""
        email = extract("//*[contains(@href, '@')]/text()", source)
        if email is not None:
            email = str(email).split(':')[-1].split('"')[0]
        else:
            continue
        name = extract("//div[@class='member-photo']/img/@alt", source)
        if not name:
            continue
        web_url = extract("//*[@class='name']/a/@href", source)
        if web_url:
            web = "https://www.wpi.edu/" + web_url
        else:
            web = ""
        print(pic_url, " ", name, " ", email, " ", web, " ", major)
        try:
            pic = requests.Session().get(pic_url, timeout=30)
            with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                      + email + ".jpg", "wb") as f:
                f.write(pic.content)
        except Exception as e:
            print(e)
            with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                      "a") as f:
                f.write(email + " : " + pic_url + "\n")
        user = People(email=email, name=name, major=major, web=web)
        session.add(user)
        try:
            session.commit()
        except Exception:
            session.rollback()
        time.sleep(1)
def getInfo(url, major):
    """Scrape UB staff-directory sections and store each person."""
    # Retry until the page loads (was unbounded recursion).
    while True:
        try:
            res = fetch(url)
            break
        except Exception:
            continue
    tmp = extract("//div[@class='staffdirectory imagebase section']", res, True)
    for each in tmp:
        source = str(etree.tostring(each))
        # renamed from 'url' to avoid shadowing the function parameter
        img_src = extract("//picture/img/@src", source)
        if img_src is not None:
            pic_url = "http://engineering.buffalo.edu/" + img_src
        else:
            pic_url = ""
        email = extract("//a[@class='longtext']/@href", source)
        if email is not None:
            email = email.split(':')[-1]  # strip 'mailto:'
        else:
            continue
        name = extract("//span[@class='staff_name_bolded']/a/text()", source)
        if name is None:
            # some entries have no profile link; name is plain text
            name = extract("//span[@class='staff_name_bolded']/text()", source)
        web_url = extract("//span[@class='staff_name_bolded']/a/@href", source)
        if web_url is not None:
            web = "http://engineering.buffalo.edu/" + web_url
        else:
            web = ""
        print(pic_url, " ", name, " ", email, " ", web, " ", major)
        try:
            pic = requests.Session().get(pic_url, timeout=30)
            with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                      + email + ".jpg", "wb") as f:
                f.write(pic.content)
        except Exception:
            with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                      "a") as f:
                f.write(email + " : " + pic_url + "\n")
        user = People(email=email, name=name, major=major, web=web)
        session.add(user)
        try:
            session.commit()
        except Exception:
            session.rollback()
        time.sleep(1)
def get_info(url, org="University of Georgia"): global img_url try: html = fetch(url) tmp = extract('//div[@class="people-list"]/article', html, True) # print(tmp) except: return get_info(url) for i in tmp: each = str(etree.tostring(i)) img = extract("//div[@class='photo']/@style", each) # print(img) if not img: img_url = "" else: # name = extract("//a/img/@alt", each).split(' photo')[0] img_url = img.split('url(')[1].split(');')[0] web_url = "http://www.engr.uga.edu" + extract("//a[@class='content']/@href", each) # print(web_url) try: source = fetch(web_url) email = extract("//a[contains(@href, '@')]/text()", source) name = extract("//div[@class='col-sm-8']/h1/text()", source).split(',')[0] major = extract("//ul[@class='categories']/li[1]/a/text()", source) print(name, email, major, web_url, img_url, org) user = People(email=email, name=name, major=major, web=web_url, orginazation=org) session.add(user) try: session.commit() except: session.rollback() time.sleep(1) if img_url is not None and email is not None: # img = "http://be.utdallas.edu" + img_url print(img_url) try: pic = requests.Session().get(img_url, timeout=30) with open("/Users/sunlei/scholar-private/out_of_data_module/pic/" + email + ".jpg", "wb") as f: f.write(pic.content) f.close() except: with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt", "a") as f: f.write(email + " : " + img_url) f.close() except Exception as e: print(web_url) print(e) pass
def get_info(url, major):
    """Scrape LSU photo listings, follow each profile, store the result."""
    global img_url
    # Retry until the page loads (was unbounded recursion).
    while True:
        try:
            html = fetch(url)
            tmp = extract('//div[@class="col-md-12"]/p', html, True)
            break
        except Exception:
            continue
    for i in tmp:
        each = str(etree.tostring(i))
        img = extract("//a/img/@src", each)
        if not img:
            continue  # entries without a photo link are skipped entirely
        else:
            img_url = "https://www.lsu.edu" + img
        web_url = "https://www.lsu.edu" + extract("//a/@href", each)
        try:
            source = fetch(web_url)
            email = extract("//a[contains(@href, '@')]/text()", source)
            name = extract("//div[@class='col-md-12']/h2/text()",
                           source).split(',')[0]
            print(name, email, major, web_url, img_url)
            user = People(email=email, name=name, major=major, web=web_url)
            session.add(user)
            try:
                session.commit()
            except Exception:
                session.rollback()
            time.sleep(1)
            if img_url is not None and email is not None:
                print(img_url)
                try:
                    pic = requests.Session().get(img_url, timeout=30)
                    with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                              + email + ".jpg", "wb") as f:
                        f.write(pic.content)
                except Exception:
                    with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                              "a") as f:
                        f.write(email + " : " + img_url + "\n")
        except Exception as e:
            print(web_url)
            print(e)  # profile page failed; move on
def get_info(url, major):
    """Scrape IUPUI people links, follow each profile, store the result."""
    global img_url
    # Retry until the page loads (was unbounded recursion).
    while True:
        try:
            html = fetch(url)
            tmp = extract("//div[@class='people-text']/h4/a/@href", html, True)
            break
        except Exception:
            continue
    for each in tmp:
        if each:
            web_url = "http://www.engr.iupui.edu" + each
            res = fetch(web_url)
            img = extract("//div[@class='inset-right faculty-photo']/img/@src",
                          res)
            if img:
                # NOTE(review): when img is missing, img_url keeps the
                # previous profile's (global) value -- kept as-is.
                img_url = "http://www.engr.iupui.edu" + img
            name = extract("//div[@class='caption']/h5/text()", res)
            email = extract("//div[@class='caption']/p/text()[3]", res)
            if email:
                email = email.strip()
            else:
                continue
            print(name, email, major, web_url, img_url)
            user = People(email=email, name=name, major=major, web=web_url)
            session.add(user)
            try:
                session.commit()
            except Exception:
                session.rollback()
            time.sleep(1)
            if img_url is not None and email is not None:
                print(img_url)
                try:
                    pic = requests.Session().get(img_url, timeout=30)
                    with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                              + email + ".jpg", "wb") as f:
                        f.write(pic.content)
                except Exception:
                    with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                              "a") as f:
                        f.write(email + " : " + img_url + "\n")
def get_info(url, major, org):
    """Scrape the U. Arkansas directory grid (rendered page) and store people."""
    global img_url
    # Retry until the page loads (was unbounded recursion, which also
    # leaked one Chrome instance per failed attempt).
    while True:
        try:
            browser = webdriver.Chrome(
                chrome_options=SelenuimParser(stopimage=2, stopjs=2))
            browser.get(url)
            tmp = browser.find_elements_by_xpath(
                "//div[@class='col-sm-3 col-md-3 col-xs-6 uark-unify-heights']")
            break
        except Exception:
            continue
    for i in tmp:
        source = i.get_attribute("innerHTML")
        email = extract("//div[@class='email']/a/text()", source)
        if not email:
            continue
        web = extract("//a[1]/@href", source)
        name = extract("//div[@class='name']/text()", source)
        if web:
            web_url = "https:" + web  # hrefs are protocol-relative
        else:
            web_url = ""
        img = extract("//img[@class='img-responsive img-thumbnail']/@src",
                      source)
        if img:
            img_url = "https:" + img
        else:
            img_url = ""
        print(name, email, major, web_url, img_url, org)
        user = People(email=email, name=name, major=major, web=web_url,
                      orginazation=org)
        session.add(user)
        try:
            session.commit()
        except Exception as e:
            print(e)
            session.rollback()
        time.sleep(1)
        if img_url is not None and email is not None:
            try:
                pic = requests.Session().get(img_url, timeout=30)
                with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                          + email + ".jpg", "wb") as f:
                    f.write(pic.content)
            except Exception:
                with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                          "a") as f:
                    f.write(email + " : " + img_url + "\n\n")
    browser.quit()
    time.sleep(random.choice(range(2, 5)))  # polite delay between pages
def get_info(url, major, org="Virginia Commonwealth University"): global img_url browser = webdriver.Chrome( chrome_options=SelenuimParser(stopimage=2, stopjs=1)) # chrome_options = Options() # prefs = { # 'profile.default_content_setting_values': { # 'images': 2, # 'javascript': 1 # } # } # chrome_options.add_experimental_option('prefs', prefs) # chrome_options.add_argument('--headless') # browser = webdriver.Chrome(chrome_options=chrome_options) # browser = webdriver.Chrome() try: browser.get(url) # print(tmp) except Exception as e: print(e) browser.execute_script('window.stop()') finally: tmp = browser.find_elements_by_xpath('//div[@class="expert clearfix"]') # print(browser.page_source) for i in tmp: each = i.get_attribute('innerHTML') # print(each) img_url = extract("//img/@src", each) # print(img) web_url = extract("//h4/a/@href", each) name = extract("//h4/a/text()", each) if name: name = name.split(',')[0] else: continue email = extract("//a[contains(@href, '@')]/text()", each) print(name, email, major, web_url, img_url, org) user = People(email=email, name=name, major=major, web=web_url, orginazation=org) session.add(user) try: session.commit() except: session.rollback() time.sleep(1) if img_url is not None and email is not None: # img = "http://be.utdallas.edu" + img_url # print(img_url) try: pic = requests.Session().get(img_url, timeout=300) with open( "/Users/sunlei/scholar-private/out_of_data_module/pic/" + email + ".jpg", "wb") as f: f.write(pic.content) f.close() except Exception as e: print(e) print(img_url) with open( "/Users/sunlei/scholar-private/out_of_data_module/timeout.txt", "a") as f: f.write(email + " : " + img_url + "\n") f.close() browser.quit() time.sleep(3)
def get_info(url):
    """Scrape the UAH engineering staff page, department by department."""
    global img_url, browser
    try:
        browser = webdriver.Chrome()
        browser.get(url)
    except Exception as e:
        browser.execute_script('window.stop()')  # stop a hung load, keep the DOM
        print(e)
    finally:
        # Each department's staff cards live under a different articleBody div.
        dean_tmp = browser.find_elements_by_xpath(
            '//div[@class="row multi layout-staff"][1]/div[@class="col-sm-4"]')
        cue_tmp = browser.find_elements_by_xpath(
            '//*[@id="articleBody"]/div[2]/div[@class="col-sm-4"]')
        cme_tmp = browser.find_elements_by_xpath(
            '//*[@id="articleBody"]/div[3]/div[@class="col-sm-4"]')
        cee_tmp = browser.find_elements_by_xpath(
            '//*[@id="articleBody"]/div[5]/div[@class="col-sm-4"]')
        ece_tmp = browser.find_elements_by_xpath(
            '//*[@id="articleBody"]/div[7]/div[@class="col-sm-4"]')
        iseem_tmp = browser.find_elements_by_xpath(
            '//*[@id="articleBody"]/div[9]/div[@class="col-sm-4"]')
        mae_tmp = browser.find_elements_by_xpath(
            '//*[@id="articleBody"]/div[11]/div[@class="col-sm-4"]')
        # Map each department name (stored as 'major') to its staff cards.
        info = {
            "Office of the Dean": dean_tmp,
            "Center for Undergraduate Engineering Education": cue_tmp,
            "Chemical & Materials Engineering Department": cme_tmp,
            "Civil & Environmental Engineering Department": cee_tmp,
            "Electrical & Computer Engineering Department": ece_tmp,
            "Industrial & Systems Engineering and Engineering Management Department": iseem_tmp,
            "Mechanical & Aerospace Engineering Department": mae_tmp
        }
        # NOTE(review): 'exception' is built but never consulted -- presumably
        # a planned skip list; kept for parity with the original.
        exception = [
            'Office of the Dean',
            'Center for Undergraduate Engineering Education',
            'Mechanical & Aerospace Engineering Department'
        ]
        for each in info.keys():
            for i in info[each]:
                res = i.get_attribute("innerHTML")
                major = each
                web = extract("//a[contains(@href,'departments')]/@href", res)
                if web:
                    if "https://" in web:
                        web_url = web
                    else:
                        web_url = "https://www.uah.edu" + web
                else:
                    web_url = ""
                img = extract("//div[@class='image-holder']/img/@src", res)
                if img:
                    # NOTE(review): when img is missing, img_url keeps the
                    # previous card's (global) value -- kept as-is.
                    img_url = "https://www.uah.edu" + img
                name = extract("//div[@class='image-holder']/img/@alt", res)
                if not name:
                    continue
                email = extract("//a[contains(@href,'@uah.edu')]/text()", res)
                if not email:
                    continue
                print(name, " ", email, " ", web_url, " ", img_url, " ", major)
                user = People(email=email, name=name, major=major, web=web_url)
                session.add(user)
                try:
                    session.commit()
                except Exception:
                    session.rollback()
                time.sleep(1)
                if img_url is not None and email is not None:
                    print(img_url)
                    try:
                        pic = requests.Session().get(img_url, timeout=30)
                        with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                                  + email + ".jpg", "wb") as f:
                            f.write(pic.content)
                    except Exception:
                        with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                                  "a") as f:
                            f.write(email + " : " + img_url + "\n")
def get_info(url, major, org="Embry-Riddle Aeronautical University"): browser = webdriver.Chrome( chrome_options=SelenuimParser(stopimage=2, stopjs=1)) try: browser.get(url) except: browser.execute_script('window.stop()') finally: tmp = browser.find_elements_by_class_name('media') # print(tmp) for each in tmp: res = each.get_attribute('innerHTML') # print(res) img_url = extract("//img[@class='media-object']/@src", res) if not img_url: name = extract("//h4[@class='media-heading name']/text()", res) else: name = extract("//img[@class='media-object']/@alt", res) web = "" id = extract('//div/a[@data-toggle="modal"]/@data-faculty-uid', res) if id: js_url = 'https://webforms.erau.edu/common/services/peoplesearch/faculty.cfc?callback=jQuery1113072479908569549_1541990949502&method=getFacultyByUid&returnformat=plain&uidList=' + id try: text = fetch(js_url) email = re.findall(r'"EMAIL":"(.*?)",', text, re.S)[0] print(email, name, web, img_url, major, org) user = People(email=email, name=name, major=major, web=web, orginazation=org) session.add(user) try: session.commit() except Exception as e: print(e) session.rollback() time.sleep(1) if img_url is not None and email is not None: # img = "http://be.utdallas.edu" + img_url # print(img_url) try: pic = requests.Session().get(img_url, timeout=30) with open( "/Users/sunlei/scholar-private/out_of_data_module/pic/" + email + ".jpg", "wb") as f: f.write(pic.content) f.close() except: with open( "/Users/sunlei/scholar-private/out_of_data_module/timeout.txt", "a") as f: f.write(email + " : " + img_url + "\n\n") f.close() except Exception as e: print(e) pass else: continue browser.quit() time.sleep(random.choice(range(2, 5)))
def get_info(url, org="Florida A&M University - Florida State University"): browser = webdriver.Chrome( chrome_options=SelenuimParser(stopimage=2, stopjs=2)) # browser = webdriver.Chrome() try: browser.get(url) except: browser.execute_script('window.stop()') finally: tmp = browser.find_elements_by_xpath( "//div[contains(@class, 'col-xs-6 col-sm-4 col-md-3 col-lg-2 views-col')]" ) # print(tmp) for each in tmp: res = each.get_attribute('innerHTML') # print(res) web_url = extract("//div[@class='bio-photo']/div/a/@href", res) # print(web_url) if web_url: web = "https://eng.famu.fsu.edu" + web_url else: continue major = extract( "//div[@class='views-field views-field-field-department-s-']/div/text()", res) print(major) if major not in major_list: continue img_url = extract("//div[@class='bio-photo']/div/a/img/@src", res) if not img_url: name = extract( "//h4[@class='views-field views-field-title faculty-name'/]span/a/text()", res).split(',')[0] else: name = extract("//div[@class='bio-photo']/div/a/img/@title", res).split(',')[0] try: text = fetch(web) email = extract('//div[contains(text(), "@")]/text()', text) if not email: continue print(email, name, web, major, org) user = People(email=email, name=name, major=major, web=web, orginazation=org) session.add(user) try: session.commit() except Exception as e: print(e) session.rollback() time.sleep(random.choice(range(0, 3))) if img_url is not None and email is not None: img = "https://eng.famu.fsu.edu" + img_url print(img) try: pic = requests.Session().get(img, timeout=30) with open( "/Users/sunlei/scholar-private/out_of_data_module/pic/" + email + ".jpg", "wb") as f: f.write(pic.content) f.close() except: with open( "/Users/sunlei/scholar-private/out_of_data_module/timeout.txt", "a") as f: f.write(email + " : " + img_url + "\n\n") f.close() except Exception as e: print(e) pass browser.quit()