# url_pages is a list of values (strings or integers) substituted into the {NUM}
# placeholder of the url template; [""] + list(range(1, 3)) yields ["", 1, 2]
url_pages = [""] + list(range(1, 3))

# xpath rules to locate the data fields.
# Note: "ip" and "port" share the same xpath because the cell text is "ip:port";
# the extractor lambdas below split it into the two fields.
xpath = {
    "ip": "/html/body/table[2]/tbody/tr[4]/td/table/tbody/tr[*]/td[1]/font[2]",
    "port": "/html/body/table[2]/tbody/tr[4]/td/table/tbody/tr[*]/td[1]/font[2]",
    "protocol": "/html/body/table[2]/tbody/tr[4]/td/table/tbody/tr[*]/td[2]/a/font[1]",
    "country": "/html/body/table[2]/tbody/tr[4]/td/table/tbody/tr[*]/td[4]/a",
}

# Lambda functions to further extract the information from the located text
extractor = {
    "ip": lambda text: text.split(":")[0],
    "port": lambda text: text.split(":")[1],
    "protocol": lambda text: text,
    "country": lambda text: text,
}

while True:
    proxy_list_of_dicts = scrape(url_template, url_pages, xpath, extractor)
    for proxy_dict in proxy_list_of_dicts:
        save_new_proxy_record(proxy_dict)
    log_print("Finished one round of scraping, sleep for " + str(time_interval) + " seconds")
    sleep(time_interval)
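# The loop above assumes url_template and time_interval are defined earlier in this
# script; neither is shown in this excerpt. The assignments below are only a hypothetical
# sketch of their expected shape, not the real configuration: url_template must contain
# the literal {NUM} marker that scrape() replaces with each entry of url_pages, and with
# url_pages = ["", 1, 2] three page URLs are produced per round.

url_template = "http://example.com/proxylist/{NUM}"  # hypothetical placeholder URL
time_interval = 3600                                  # hypothetical: rescrape once per hour

# With the placeholder template above, the generated URLs would be:
#   http://example.com/proxylist/
#   http://example.com/proxylist/1
#   http://example.com/proxylist/2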
def scrape(url_template, url_pages, xpath, extractor, sleep_before_scrape=0):
    '''Scrape the target website for proxy information according to the given rules.

    Parameters
    ----------
    url_template : string
        Url base string, with {NUM} indicating the placeholder for different pages
    url_pages : list
        A list of strings or integers to insert into url_template at {NUM}
    xpath : dict
        Containing xpath rules for ip, port, protocol, country
    extractor : dict
        Lambda functions for further extraction of the strings located by the xpath rules
    sleep_before_scrape : integer
        Time in seconds to wait after page load before starting the xpath extraction

    Return
    ---------
    Proxy information as a list of dict objects
    '''
    log_print("Scraping starts...")

    # Generate all urls to iterate through
    urls = [
        url_template.replace("{NUM}", str(page_num))
        for page_num in url_pages
    ]

    ips, ports, protocols, countries = [], [], [], []

    # Init the PhantomJS headless webdriver
    driver = webdriver.PhantomJS(service_args=["--webdriver-loglevel=NONE"])
    # Set viewport size
    driver.set_window_size(1920, 1080)

    for url in urls:
        log_print("Fetching " + url)
        driver.get(url)
        sleep(sleep_before_scrape)
        ips += [
            extractor["ip"](ip_element.text)
            for ip_element in driver.find_elements_by_xpath(xpath["ip"])
        ]
        ports += [
            extractor["port"](port_element.text)
            for port_element in driver.find_elements_by_xpath(xpath["port"])
        ]
        protocols += [
            extractor["protocol"](protocol_element.text)
            for protocol_element in driver.find_elements_by_xpath(xpath["protocol"])
        ]
        countries += [
            extractor["country"](country_element.text)
            for country_element in driver.find_elements_by_xpath(xpath["country"])
        ]

    html = driver.page_source

    # Quit the selenium driver to terminate the PhantomJS process and prevent memory leaking
    driver.quit()

    # All four lists must have the same length, otherwise the xpath rules are out of sync
    if not (len(ips) == len(ports) == len(protocols) == len(countries)):
        log_print("Error! Number of data fields collected mismatch: " +
                  str(len(ips)) + " " + str(len(ports)) + " " +
                  str(len(protocols)) + " " + str(len(countries)))
        exit()

    if len(ips) == 0:
        log_print("Something went wrong, there are no proxies fetched...")
        log_print(html)
        exit()

    index_to_be_deleted = []
    # Filter out the proxies that only support the SOCKS protocol
    for i in range(len(ips)):
        protocol = check_protocol(protocols[i])
        if not protocol:
            index_to_be_deleted.append(i)
        else:
            protocols[i] = protocol

    # Delete from the highest index down so earlier deletions do not shift later indexes
    for i in sorted(index_to_be_deleted, reverse=True):
        del ips[i]
        del ports[i]
        del protocols[i]
        del countries[i]

    log_print("Fetched total " + str(len(ips)) + " proxies")
    return _make_dicts(ips, ports, protocols, countries)
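# check_protocol() and _make_dicts() are called above but their implementations are not
# part of this excerpt. The sketches below are assumptions inferred purely from how they
# are used here: check_protocol() appears to normalize the protocol text and return a
# falsy value for SOCKS-only entries (which the caller then drops), and _make_dicts()
# appears to zip the four parallel lists into the per-proxy dicts consumed by
# save_new_proxy_record(). The actual project code may differ.

def check_protocol(protocol_text):
    '''Hypothetical sketch: map the scraped protocol text to "http"/"https",
    or return None for SOCKS-only proxies so the caller can discard them.'''
    text = protocol_text.strip().lower()
    if "socks" in text:
        return None
    if "https" in text:
        return "https"
    if "http" in text:
        return "http"
    return None


def _make_dicts(ips, ports, protocols, countries):
    '''Hypothetical sketch: combine the parallel lists into a list of dicts with the
    keys that save_new_proxy_record() expects ("ip", "port", "protocol", "country").'''
    return [
        {"ip": ip, "port": port, "protocol": protocol, "country": country}
        for ip, port, protocol, country in zip(ips, ports, protocols, countries)
    ]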
from pymongo import MongoClient

from loglib import log_print

log_print("Initialize MongoDB Connection...")
client = MongoClient("", 27017, authSource='admin', username="", password="")


def save_new_proxy_record(one_proxy_dict_data):
    '''Examine the given single proxy dict object and store it into the "new" collection
    '''
    db = client["proxypool"]
    new_storage = db["new"]
    all_storage = db["all"]
    # If the historical collection already contains this proxy (key: ip), then ignore it
    if all_storage.find_one({"ip": one_proxy_dict_data["ip"]}) is not None:
        log_print("Found duplicate proxy for " + one_proxy_dict_data["ip"] + ", ignore...")
    else:
        log_print("Store " + one_proxy_dict_data["ip"])
        new_storage.insert_one(one_proxy_dict_data)
        all_storage.insert_one({"ip": one_proxy_dict_data["ip"]})
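# log_print() is imported from loglib throughout this project, but loglib itself is not
# included in this excerpt. Below is a minimal sketch of what it is assumed to do: print
# the message prefixed with a timestamp. The real loglib may format or persist logs
# differently.

from datetime import datetime

def log_print(message):
    '''Hypothetical sketch of loglib.log_print: print a timestamped log line.'''
    print("[" + datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "] " + str(message))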