def _generate_website(self):
    """Populate ``self.website`` when the section mentions a personal site.

    The primary rule is tried first; the secondary rule is the fallback
    when the primary yields nothing. If 'Personal Website' is absent the
    attribute is left untouched.
    """
    if 'Personal Website' not in self.sec:
        return
    primary = extract(RULES["website-1"], self.sec)
    if primary is not None:
        self.website = primary
    else:
        self.website = extract(RULES["website-2"], self.sec)
def _generate_website(self): if "website" in self.parse_data.keys(): if self.parse_data["website"]: regex = '"(.*?)"' res = re.search(regex, str(self.parse_data["website"])) self.website = res.group() else: if extract(website_rule, self.sec): self.website = extract(website_rule, self.sec)
def _generate_website(self):
    """Populate ``self.website`` from the second anchor of the info node.

    The site URL is taken as the second ``href`` when the info node holds
    exactly two links; otherwise ``self.website`` is set to None.
    """
    info = extract(RULES["info"], self.sec, multi=True)[1]
    if info is None:
        return
    if info.xpath('string(.)') is None:
        return
    hrefs = extract("//a/@href", str(etree.tostring(info)), multi=True)
    self.website = hrefs[1] if len(hrefs) == 2 else None
def _generate_title(self):
    """Concatenate the three title fragments into ``self.title``.

    Fragments whose rule matched nothing (``extract`` returned None)
    contribute an empty string.
    """
    fragments = [
        extract(RULES[key], self.sec)
        for key in ("title_1", "title_2", "title_3")
    ]
    self.title = "".join(f if f is not None else "" for f in fragments)
def _generate_phone(self): if "phone" in self.parse_data.keys(): if self.parse_data["phone"]: self.phone = self.parse_data["phone"] tmp = extract(phone_rule, self.sec) if len(tmp): self.phone = tmp.xpath('string(.)').strip().replace('Phone:','')
def _generate_avatar(self): if "avatar" in self.parse_data.keys(): if self.parse_data["avatar"]: regex = '[a-zA-z]+://[^\s]*' res = re.search(regex, str(self.parse_data["avatar"])) self.avatar = res.group() self.avatar = extract(avatar_rule, self.sec)
def _generate_phone(self): if "phone" in self.parse_data.keys(): if self.parse_data["phone"]: self.phone = self.parse_data["phone"] self.phone = extract(phone_rule, self.sec).xpath('string(.)').strip().replace( 'Office Phone:', '').strip()
def _generate_avatar(self): if "avatar" in self.parse_data.keys(): if self.parse_data["avatar"]: regex = '[a-zA-z]+://[^\s]*' res = re.search(regex, str(self.parse_data["avatar"])) self.avatar = res.group() self.avatar = "http://jacobsschool.ucsd.edu/faculty/faculty_bios/" \ + extract(avatar_rule, self.sec)
def _feed_info_queue(self, url):
    """Fetch a listing page and enqueue up to 88 absolute item URLs."""
    self.logger.info("processing page %s", url)
    page = fetch(url, proxies=None, logger=self.logger)
    # Keep only the first 88 entries, matching the original listing cap.
    for path in extract(RULES["item_url"], page, multi=True)[:88]:
        self.info_queue.put_nowait(BASE_URL + path)
def _generate_lastName(self):
    """Populate ``self.lastName`` via nameparser's ``HumanName``.

    Fixes: the parse_data-derived value was a dead store (always
    clobbered by the rule extraction); ``extract`` returning None
    crashed on ``.xpath``.
    """
    if self.parse_data.get("name"):
        self.lastName = HumanName(self.parse_data["name"]).last
        return
    node = extract(name_rule, self.sec)
    if node is not None:
        raw = str(node.xpath('string(.)').replace('Faculty Directory', ''))
        self.lastName = HumanName(raw).last
def _generate_email(self):
    """Populate ``self.email`` from the last info node, or None.

    The node text looks like 'Email: someone@host ...'; the labels are
    stripped and the value after the colon is kept.
    """
    last_info = extract(RULES["info"], self.sec, multi=True)[-1]
    text = last_info.xpath('string(.)')
    if "Email" not in text:
        self.email = None
        return
    for label in ('Email', 'Phone', 'Office'):
        text = text.replace(label, '')
    self.email = text.strip().split(":")[1]
def _generate_cooperation(self):
    """Populate ``self.cooperation`` as a list of collaborator entries.

    Fix: the single-value branch appended to ``self.cooperation``, which
    crashed (or accumulated stale values) unless the attribute was
    already a list; every branch now assigns a fresh list.
    """
    raw = extract(RULES["cooperation"], self.sec)
    if raw is None:
        self.cooperation = []
    elif ";" in raw:
        self.cooperation = raw.split(";")
    else:
        self.cooperation = [raw]
def _generate_keywords(self):
    """Populate ``self.keywords`` as a list of keyword strings.

    Fix: the single-value branch appended to ``self.keywords``, which
    crashed (or accumulated stale values) unless the attribute was
    already a list; every branch now assigns a fresh list.
    """
    raw = extract(RULES["keywords"], self.sec)
    if raw is None:
        self.keywords = []
    elif "," in raw:
        self.keywords = raw.split(",")
    else:
        self.keywords = [raw]
def crawl_info(self):
    """Parse every faculty item on the listing page and enqueue its params."""
    from CustomParser.cs_utexas_parser import CSUtexasClass
    page = fetch(self.base_url, logger=self.logger)
    for node in extract(RULES["item"], page, multi=True):
        if node is None:
            continue
        parser = CSUtexasClass(str(etree.tostring(node)))
        params = parser.set_value()
        parser.terminal_monitoring()
        self.parm_queue.put_nowait(params)
def _crawl_info(self, item_url):
    """Parse every faculty item at ``item_url`` and enqueue its params.

    Fix: skip None items before serializing, consistent with the sibling
    crawl_info implementation (which guards with ``if i is not None``).
    """
    self.logger.info("processing info %s", item_url)
    from ScholarConfig.me_utexas_rule import RULES
    from CustomParser.me_utexas_parser import MeUtexasClass
    from lxml import etree
    page = fetch(item_url, proxies=None, logger=self.logger)
    for node in extract(RULES["item"], page, multi=True):
        if node is None:
            continue
        parser = MeUtexasClass(str(etree.tostring(node)))
        params = parser.set_value()
        parser.terminal_monitoring()
        self.parm_queue.put_nowait(params)
def _feed_info_queue(self, url):
    """Fetch a listing page and enqueue each extracted item URL.

    When ``self.is_url_joint`` is set, each item is prefixed with
    ``self.default_url`` before being queued.

    Fixes: list comprehensions used purely for side effects replaced by
    plain loops; dead commented-out debug line removed.
    NOTE(review): ``self.requsts_session`` is kept as-is — the attribute
    appears to be defined elsewhere with that (misspelled) name.
    """
    self.logger.info("processing page %s", url)
    page = fetch(url, requests_session=self.requsts_session,
                 proxies=None, logger=self.logger)
    for item in extract(self.item_url_rule, page, multi=True):
        if self.is_url_joint:
            self.info_queue.put_nowait(self.default_url + item)
        else:
            self.info_queue.put_nowait(item)
def _generate_email(self):
    """Populate ``self.email`` by regex-matching the info node, else None.

    The serialized markup contains literal two-character ``\\n``
    sequences, so the split is on that token rather than a real newline.
    """
    import re
    node = extract(RULES["info"], self.sec, multi=True)[1]
    if node is None:
        return
    text = node.xpath('string(.)')
    if text is None:
        return
    line = text.split('\\n')[1].strip()
    found = re.search(r"([\w\.\-]+@[\w\.\-]+)", str(line))
    self.email = found.group(0) if found is not None else None
def _generate_phone(self):
    """Populate ``self.phone`` from the comma-separated info line.

    The serialized text contains literal two-character ``\\n`` sequences;
    the second such segment holds comma-separated fields whose count
    decides which field is the phone number.
    """
    node = extract(RULES["info"], self.sec, multi=True)[1]
    if node is None:
        return
    text = node.xpath('string(.)')
    if text is None:
        return
    # Hoisted: the original recomputed this split for every comparison.
    fields = text.split('\\n')[1].strip().split(',')
    if len(fields) == 4:
        self.phone = fields[2]
    elif len(fields) == 3:
        if '@' not in fields[1]:
            self.phone = fields[1]
        else:
            self.phone = None
def _generate_title(self):
    """Populate ``self.title`` from the third literal-``\\n`` segment."""
    node = extract(RULES["info"], self.sec, multi=True)[0]
    if node is None:
        return
    text = node.xpath('string(.)')
    if text is not None:
        self.title = text.split('\\n')[2].strip()
def _generate_lastName(self):
    """Populate ``self.lastName`` from the ``<strong><a>`` name text.

    Fix: guard against ``extract`` returning None before serializing
    with ``etree.tostring`` — the sibling ``_generate_*`` methods all
    check ``is not None`` first.
    """
    node = extract(RULES["info"], self.sec, multi=True)[0]
    if node is None:
        return
    full_name = extract("//strong/a/text()", str(etree.tostring(node)))
    self.lastName = HumanName(full_name).last
def _generate_keywords(self):
    """Populate ``self.keywords`` with all matches of the keyword rule."""
    matches = extract(RULES["keyword"], self.sec, multi=True)
    self.keywords = matches
self.keywords = extract(RULES["keyword"],self.sec,multi=True) def _generate_city(self): pass def _generate_time(self): pass def _generate_keywordKeys(self): self.keywordKeys = [i for i in range(1,len(self.keywords)+1)] def _generate_cityKeys(self): self.cityKeys = [i for i in range(1,len(self.city)+1)] def _generate_timeKeys(self): self.timeKeys = [i for i in range(1,len(self.timeKeys)+1)] if __name__ == '__main__': from utils.connection import fetch html=fetch("https://www.cs.utexas.edu/faculty") sec = extract(RULES["item"],html,multi=True) for i in sec: if i is not None: # tmp=extract(RULES["info"],str(etree.tostring(i)),multi=True)[1] # if tmp is not None: # a=extract("//a/@href",str(etree.tostring(tmp)),multi=True) # if len(a)==2: # print(a[1]) # else: # print(None) tmp=extract(RULES["keyword"],str(etree.tostring(i)),multi=True) print(tmp)
def _generate_email(self): if "email" in self.parse_data.keys(): if self.parse_data["email"]: self.email = self.parse_data["email"] else: self.email = extract(email_rule, self.sec).replace('mailto:', '')
def _generate_lastName(self):
    """Populate ``self.lastName`` from the name rule via ``HumanName``."""
    full_name = extract(RULES["name"], self.sec)
    self.lastName = HumanName(full_name).last
self.email=tmp.group() def _generate_website(self): pass def _generate_cooperation(self): pass def _generate_bio(self): pass def _generate_keywords(self): pass def _generate_city(self): pass def _generate_time(self): pass def _generate_keywordKeys(self): self.keywordKeys = [i for i in range(1,len(self.keywords)+1)] def _generate_cityKeys(self): self.cityKeys = [i for i in range(1,len(self.city)+1)] def _generate_timeKeys(self): self.timeKeys = [i for i in range(1,len(self.timeKeys)+1)] if __name__ == '__main__': from utils.connection import fetch html= fetch("http://www.me.berkeley.edu/people/faculty") #print(html) a=extract(RULES["item_url"],html,multi=True) print(a)
def _generate_avatar(self):
    """Populate ``self.avatar`` with the absolute UMN image URL."""
    image_path = extract(avatar_rule, self.sec)
    self.avatar = "http://www.me.umn.edu/people/{}".format(image_path)
def _generate_avatar(self):
    """Populate ``self.avatar`` from the avatar rule."""
    image_url = extract(RULES["avatar"], self.sec)
    self.avatar = image_url
def _generate_bio(self): if "bio" in self.parse_data.keys(): if self.parse_data["bio"]: self.bio = self.parse_data["bio"] if len(bio_rule): self.bio = extract(bio_rule, self.sec).xpath('string(.)')
def _generate_title(self):
    """Populate ``self.title`` from the title rule."""
    extracted_title = extract(RULES["title"], self.sec)
    self.title = extracted_title
def _generate_avatar(self):
    """Populate ``self.avatar`` with the absolute Northwestern image URL.

    The extracted path starts with a '../../..' relative prefix that is
    dropped before joining onto the site root.

    Fix: ``extract`` returning None crashed on ``.replace``.
    """
    path = extract(avatar_rule, self.sec)
    if path is not None:
        self.avatar = "http://www.mccormick.northwestern.edu{}".format(
            path.replace('../../..', ''))