def _feed_info_queue(self, url):
    self.logger.info("processing page %s", url)
    html = fetch(url, proxies=None, logger=self.logger)
    item = extract(RULES["item_url"], html, multi=True)
    # Only the first 88 item links are queued.
    for i in item[:88]:
        self.info_queue.put_nowait(BASE_URL + i)
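# --- Hedged sketch (not repo code): what the `extract` helper used above
# could look like, assuming it is a thin lxml XPath wrapper. The calls to
# etree.tostring() and .xpath('string(.)') on its results elsewhere in this
# repo suggest it returns lxml nodes for element-valued rules.
from lxml import etree

def extract_sketch(rule, html, multi=False):
    # Parse the HTML string and apply the XPath rule; return all matches
    # when multi=True, otherwise the first match or None.
    tree = etree.HTML(html)
    if tree is None:
        return [] if multi else None
    matches = tree.xpath(rule)
    if multi:
        return matches
    return matches[0] if matches else None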
def _crawl_info(self, item_url):
    self.logger.info("processing info %s", item_url)
    from CustomParser.ece_utexas_parser import ECEUtexasClass
    sec = fetch(item_url, proxies=None, logger=self.logger)
    tmp = ECEUtexasClass(sec)
    parm = tmp.set_value()
    tmp.terminal_monitoring()
    self.parm_queue.put_nowait(parm)
def crawl_info(self):
    from CustomParser.cs_utexas_parser import CSUtexasClass
    html = fetch(self.base_url, logger=self.logger)
    sec = extract(RULES["item"], html, multi=True)
    for i in sec:
        if i is not None:
            tmp = CSUtexasClass(str(etree.tostring(i)))
            parm = tmp.set_value()
            tmp.terminal_monitoring()
            self.parm_queue.put_nowait(parm)
def download(url, user_id, logger):
    try:
        tmp = fetch(url, decode=False)
        try:
            with open('../PhotoTemp/{}.jpg'.format(user_id), 'wb') as f:
                f.write(tmp)
        except Exception as e:
            logger.warning("{} {} save failed! {}".format(url, user_id, e))
    except Exception as e:
        logger.warning("{} {} download failed! {}".format(url, user_id, e))
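# --- Hedged note (assumption, not repo code): download() writes to the
# relative '../PhotoTemp' directory and logs a "save failed" when open()
# raises, e.g. because that directory is missing. A small guard like this,
# called once at startup, would avoid that failure mode.
import os

def ensure_photo_dir(path='../PhotoTemp'):
    # Create the output directory if it is absent; no-op when it exists.
    os.makedirs(path, exist_ok=True)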
def _crawl_info(self, item_url):
    self.logger.info("processing info %s", item_url)
    from ScholarConfig.me_utexas_rule import RULES
    from CustomParser.me_utexas_parser import MeUtexasClass
    from lxml import etree
    html = fetch(item_url, proxies=None, logger=self.logger)
    sec = extract(RULES["item"], html, multi=True)
    for i in sec:
        tmp = MeUtexasClass(str(etree.tostring(i)))
        parm = tmp.set_value()
        tmp.terminal_monitoring()
        self.parm_queue.put_nowait(parm)
def _feed_info_queue(self, url):
    self.logger.info("processing page %s", url)
    html = fetch(url, requests_session=self.requsts_session,
                 proxies=None, logger=self.logger)
    item = extract(self.item_url_rule, html, multi=True)
    if not self.is_url_joint:
        for i in item:
            self.info_queue.put_nowait(i)
    else:
        for i in item:
            self.info_queue.put_nowait(self.default_url + i)
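# --- Hedged alternative (not repo code): plain concatenation
# self.default_url + i produces double slashes when the href starts with
# '/' and the base also ends with one, and mangles absolute hrefs.
# urljoin covers both cases; names below mirror the attributes used above.
from urllib.parse import urljoin

def join_item_url(default_url, href):
    # urljoin('https://example.edu/people/', '/faculty/x')
    #   -> 'https://example.edu/faculty/x'
    return urljoin(default_url, href)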
def _crawl_info(self, item_url):
    self.logger.info("processing info %s", item_url)
    self.parse_data = auto_generate(sampleurl=self.sample_url,
                                    data=self.data,
                                    common_url=item_url)
    sec = fetch(item_url, requests_session=self.requsts_session,
                proxies=None, logger=self.logger)
    tmp = self.custom_parser(sec=sec, parse_data=self.parse_data)
    parm = tmp.set_value()
    tmp.terminal_monitoring()
    self.parm_queue.put_nowait(parm)
        with open(self.proxies_or_path) as f:
            self.proxies = f.readlines()
        self.logging.info("reload %s proxies ...", len(self.proxies))

    def get_proxy(self):
        """Return an available proxy, rate-limited per host."""
        if self.is_single:
            return self.proxies
        proxy = random.choice(self.proxies).strip()
        host, _ = proxy.split(':')
        latest_time = self.host_time_map.get(host, 0)
        interval = time.time() - latest_time
        if interval < self.interval:
            self.logging.info("%s waiting", proxy)
            time.sleep(self.interval)
        self.host_time_map[host] = time.time()
        return "http://%s" % proxy


if __name__ == '__main__':
    from utils.connection import fetch
    from utils.logger import get_logger

    logger = get_logger("ScienceDirectTask")
    p = ProxyManager("./1.txt", logger)
    a = p.get_proxy()
    b = fetch(url="http://icanhazip.com", proxies=a, logger=logger)
    print(b)
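# --- Hedged refinement (not repo code): get_proxy() above always sleeps a
# full self.interval even when most of it has already elapsed. Sleeping
# only the remainder keeps the same per-host rate limit with less waiting;
# the names mirror the attributes of the class above.
import time

def throttle_host(host_time_map, host, interval):
    # Sleep just long enough so that consecutive uses of the same proxy
    # host are at least `interval` seconds apart.
    remaining = interval - (time.time() - host_time_map.get(host, 0))
    if remaining > 0:
        time.sleep(remaining)
    host_time_map[host] = time.time()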
        self.email = tmp.group()

    def _generate_website(self):
        pass

    def _generate_cooperation(self):
        pass

    def _generate_bio(self):
        pass

    def _generate_keywords(self):
        pass

    def _generate_city(self):
        pass

    def _generate_time(self):
        pass

    def _generate_keywordKeys(self):
        self.keywordKeys = list(range(1, len(self.keywords) + 1))

    def _generate_cityKeys(self):
        self.cityKeys = list(range(1, len(self.city) + 1))

    def _generate_timeKeys(self):
        # Bug fix: the key count should come from self.time (matching the
        # keyword/city pattern above), not from self.timeKeys itself,
        # which is still unset here.
        self.timeKeys = list(range(1, len(self.time) + 1))


if __name__ == '__main__':
    from utils.connection import fetch

    html = fetch("http://www.me.berkeley.edu/people/faculty")
    a = extract(RULES["item_url"], html, multi=True)
    print(a)
    def _generate_keywords(self):
        self.keywords = extract(RULES["keyword"], self.sec, multi=True)

    def _generate_city(self):
        pass

    def _generate_time(self):
        pass

    def _generate_keywordKeys(self):
        self.keywordKeys = list(range(1, len(self.keywords) + 1))

    def _generate_cityKeys(self):
        self.cityKeys = list(range(1, len(self.city) + 1))

    def _generate_timeKeys(self):
        # Bug fix: count self.time entries, not self.timeKeys itself.
        self.timeKeys = list(range(1, len(self.time) + 1))


if __name__ == '__main__':
    from utils.connection import fetch

    html = fetch("https://www.cs.utexas.edu/faculty")
    sec = extract(RULES["item"], html, multi=True)
    for i in sec:
        if i is not None:
            tmp = extract(RULES["keyword"], str(etree.tostring(i)), multi=True)
            print(tmp)
    def _generate_email(self):
        self.email = extract(RULES["email"], self.sec)

    def _generate_website(self):
        self.website = extract(RULES["website"], self.sec)

    def _generate_cooperation(self):
        if "Research Interest" in self.sec:
            self.cooperation = extract(RULES["cooperation"], self.sec, multi=True)

    def _generate_bio(self):
        pass

    def _generate_keywords(self):
        a = extract(RULES["keywords"], self.sec)
        if a is not None:
            self.keywords.append(a)

    def _generate_city(self):
        pass

    def _generate_time(self):
        pass

    def _generate_keywordKeys(self):
        self.keywordKeys = list(range(1, len(self.keywords) + 1))

    def _generate_cityKeys(self):
        self.cityKeys = list(range(1, len(self.city) + 1))

    def _generate_timeKeys(self):
        # Bug fix: count self.time entries, not self.timeKeys itself.
        self.timeKeys = list(range(1, len(self.time) + 1))


if __name__ == '__main__':
    from utils.connection import fetch

    html = fetch("http://tmi.utexas.edu/people/type/faculty/")
    a = extract(RULES["item_url"], html, multi=True)
    print(a)
        self.cooperation = extract(RULES["cooperation"], self.sec, multi=True)

    def _generate_bio(self):
        self.bio = extract(RULES["bio"], self.sec)

    def _generate_keywords(self):
        self.keywords = extract(RULES["keywords"], self.sec, multi=True)

    def _generate_city(self):
        pass

    def _generate_time(self):
        pass

    def _generate_keywordKeys(self):
        self.keywordKeys = list(range(1, len(self.keywords) + 1))

    def _generate_cityKeys(self):
        self.cityKeys = list(range(1, len(self.city) + 1))

    def _generate_timeKeys(self):
        # Bug fix: count self.time entries, not self.timeKeys itself.
        self.timeKeys = list(range(1, len(self.time) + 1))


if __name__ == '__main__':
    from utils.connection import fetch

    html = fetch("http://www.ece.utexas.edu/people/faculty/david-soloveichik")
    a = extract(RULES["keywords"], html)
    print(a)
    def _generate_time(self):
        pass

    def _generate_keywordKeys(self):
        self.keywordKeys = list(range(1, len(self.keywords) + 1))

    def _generate_cityKeys(self):
        self.cityKeys = list(range(1, len(self.city) + 1))

    def _generate_timeKeys(self):
        # Bug fix: count self.time entries, not self.timeKeys itself.
        self.timeKeys = list(range(1, len(self.time) + 1))


if __name__ == '__main__':
    from utils.connection import fetch

    html = fetch("http://me.utexas.edu/faculty/faculty-directory")
    sec = extract(RULES["item"], html, multi=True)
    for i in sec:
        if i is not None:
            tmp = extract(RULES["info"], str(etree.tostring(i)), multi=True)[-1]
            if tmp is not None:
                a = tmp.xpath('string(.)')
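# --- Quick standalone illustration of the string(.) XPath used just above:
# it concatenates every piece of descendant text under the node. The HTML
# snippet here is made up for the example.
from lxml import etree

node = etree.HTML("<div>Office: <b>Room 101</b></div>")
print(node.xpath('string(.)'))  # -> 'Office: Room 101'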
    def _generate_cooperation(self):
        self.cooperation.append(extract(RULES["cooperation"], self.sec))

    def _generate_bio(self):
        self.bio = extract(RULES["bio"], self.sec)

    def _generate_keywords(self):
        self.keywords.append(extract(RULES["keywords"], self.sec))

    def _generate_city(self):
        pass

    def _generate_time(self):
        pass

    def _generate_keywordKeys(self):
        self.keywordKeys = list(range(1, len(self.keywords) + 1))

    def _generate_cityKeys(self):
        self.cityKeys = list(range(1, len(self.city) + 1))

    def _generate_timeKeys(self):
        # Bug fix: count self.time entries, not self.timeKeys itself.
        self.timeKeys = list(range(1, len(self.time) + 1))


if __name__ == '__main__':
    from utils.connection import fetch

    html = fetch("http://www.me.berkeley.edu/people/faculty/m-reza-alam")
    a = extract(RULES["phone"], html)
    print(a)
                self.keywords = tmp.split(",")
            else:
                self.keywords.append(tmp)
        else:
            self.keywords = []

    def _generate_city(self):
        pass

    def _generate_time(self):
        pass

    def _generate_keywordKeys(self):
        self.keywordKeys = list(range(1, len(self.keywords) + 1))

    def _generate_cityKeys(self):
        self.cityKeys = list(range(1, len(self.city) + 1))

    def _generate_timeKeys(self):
        # Bug fix: count self.time entries, not self.timeKeys itself.
        self.timeKeys = list(range(1, len(self.time) + 1))


if __name__ == '__main__':
    from utils.connection import fetch

    html = fetch("http://www.caee.utexas.edu/faculty/directory/wright")
    sec = extract(RULES["item_url"], html, multi=True)
    html_1 = fetch("http://www.caee.utexas.edu/faculty/directory/gloyna")
    a = extract(RULES["info"], html_1, multi=True)[-1]
    b = extract(RULES["cooperation"], html)
    c = []
    print(html)
    def _generate_bio(self):
        self.bio = extract(RULES["bio"], self.sec)

    def _generate_keywords(self):
        self.keywords = extract(RULES["keywords"], self.sec, multi=True)

    def _generate_city(self):
        pass

    def _generate_time(self):
        pass

    def _generate_keywordKeys(self):
        self.keywordKeys = list(range(1, len(self.keywords) + 1))

    def _generate_cityKeys(self):
        self.cityKeys = list(range(1, len(self.city) + 1))

    def _generate_timeKeys(self):
        # Bug fix: count self.time entries, not self.timeKeys itself.
        self.timeKeys = list(range(1, len(self.time) + 1))


if __name__ == '__main__':
    from utils.connection import fetch

    html = fetch(
        "https://www2.eecs.berkeley.edu/Faculty/Lists/list.html?_ga=2.157267064.273235644.1500137616-1698500221.1500137605"
    )
    a = extract(RULES["item_url"], html, multi=True)
    print(a)
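# --- Hedged end-to-end sketch of how these parser classes appear to be
# driven elsewhere in this repo (see the _crawl_info methods above): fetch
# a profile page, hand it to the parser, run set_value() to populate the
# fields, and collect the result. The flow is inferred, not verbatim.
from utils.connection import fetch

def parse_profile(parser_cls, url, logger=None):
    html = fetch(url, logger=logger)
    parser = parser_cls(html)
    parm = parser.set_value()       # runs the _generate_* steps
    parser.terminal_monitoring()    # prints/logs the collected fields
    return parm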