def get_data(browser=browser, major=None, first_name=None, layer=0):
    """Scrape the names listed for one query and cache them in a CSV file.

    The query is identified by ``major`` and, optionally, ``first_name``.
    Results are written to ``clemson/clemson_<major>[_<first_name>].csv``.
    If that file already exists the query is treated as done and nothing
    is fetched.

    When the results pane contains a ``.partialResults`` element
    (presumably the site's marker for a truncated result list -- confirm)
    and this is the top-level call (``layer == 0``), the query is repeated
    once per value from ``get_character_permutations(num_characters=1)``,
    passed as ``first_name``, and the combined rows replace the partial
    ones.

    NOTE(review): a sub-query whose CSV already exists returns ``[]``, so
    its rows are not included in the combined parent CSV.

    Args:
        browser: Selenium WebDriver used to load the page.
        major: Major to search for; passed to ``build_url``.
        first_name: Optional first-name filter; passed to ``build_url``.
        layer: Recursion depth; drill-down only happens at ``0``.

    Returns:
        A list of ``{"Name": ...}`` dicts, or ``[]`` when the query was
        skipped because its CSV already exists.
    """
    filename = "clemson/clemson_{}{}.csv".format(
        major, "_{}".format(first_name) if first_name else ""
    )
    if os.path.exists(filename):
        print("Skipped {} {}".format(major, first_name))
        return []

    print("{} {}".format(major, first_name))
    browser.get(build_url(major=major, first_name=first_name))
    # Fixed pause kept from the original; presumably throttling or giving
    # the page time to render before the explicit wait -- confirm.
    time.sleep(5)
    WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "advResultsPane"))
    )
    pane = browser.find_element_by_class_name("advResultsPane")
    jq = pq(pane.get_attribute("outerHTML"))
    is_partial = len(jq(".partialResults")) > 0
    rows = [
        {"Name": pq(node).text().strip()}
        for node in jq(".resultsList li a p.name")
    ]

    if is_partial and layer == 0:
        # Materialise the combined rows as a list: the previous code
        # returned an itertools.chain that had already been consumed while
        # building the DataFrame, so callers always received nothing.
        rows = list(
            itertools.chain.from_iterable(
                get_data(
                    browser=browser,
                    major=major,
                    first_name=prefix,
                    layer=layer + 1,
                )
                for prefix in get_character_permutations(num_characters=1)
            )
        )

    pd.DataFrame(data=rows, columns=["Name"]).to_csv(filename, index=False)
    return rows
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select


def get_last_names(names):
    """Search the directory for each last name and save the hits to CSV.

    A Firefox session is started with a SOCKS proxy at ``127.0.0.1:9050``.
    For every entry in ``names`` the search form at ``base_url`` is
    submitted with the ``SIS`` option selected, and the rows of the
    ``#gvDirectory`` result grid (header row excluded) are written to
    ``<folder>/<name>.csv`` with columns ``def_col``.

    The first error aborts the remaining names: it is printed and not
    re-raised. The browser window is closed in every case.

    Args:
        names: Iterable of last-name search strings.
    """
    profile = webdriver.FirefoxProfile()
    profile.set_preference('network.proxy.type', 1)
    profile.set_preference('network.proxy.socks', '127.0.0.1')
    profile.set_preference('network.proxy.socks_port', 9050)
    browser = webdriver.Firefox(profile)
    try:
        for name in names:
            filename = "{}/{}.csv".format(folder, name)
            browser.get(base_url)
            browser.find_element_by_id("txtLastName").send_keys(name)
            browser.find_element_by_css_selector("input[value=SIS]").click()
            browser.find_element_by_css_selector("#btnSearch").click()
            WebDriverWait(browser, 3).until(
                EC.presence_of_element_located((By.ID, "gvDirectory"))
            )
            grid = browser.find_element_by_css_selector('#gvDirectory')
            jq = pq(grid.get_attribute("outerHTML"))

            rows = []
            for tr in jq("table tr:gt(0)"):
                row = pq(tr)
                # A row without a mailto link yields None here; treat it as
                # an empty email instead of crashing the whole run.
                href = row("td:eq(4) a").attr("href") or ""
                rows.append({
                    "Name": row("td").eq(0).text().encode("utf-8").strip(),
                    "Email": href.replace("mailto:", "").encode("utf-8").strip(),
                })
            pd.DataFrame(data=rows, columns=def_col).to_csv(filename, index=False)
    except Exception as e:
        print(e)
    finally:
        # Single cleanup point; also runs on KeyboardInterrupt and similar.
        browser.close()


get_last_names([x.upper() for x in get_character_permutations(num_characters=1)])