def parse(self, response):
    """Parse one publication-list page: persist every publication row and
    queue its article-list page for crawling, carrying context in meta."""
    rows = response.xpath('//*[@id="gs_cit_list_table"]/tr[position()>1]')
    for row in rows:
        item = GooglePublicationItem()
        item['cate1_id'] = response.meta['cate1_id']
        item['cate2_id'] = response.meta['cate2_id']
        item['name'] = DataFilter.simple_format(
            row.xpath('td[position()=2]').extract())
        item['desc'] = ''
        item['h5_idx'] = DataFilter.simple_format(
            row.xpath('td[position()=3]').extract())
        item['h5_med'] = DataFilter.simple_format(
            row.xpath('td[position()=4]').extract())
        item['rank'] = DataFilter.simple_format(
            row.xpath('td[position()=1]').extract())
        item['create_time'] = mysql_datetime()
        # The h5-index cell links to the publication's article list.
        href = DataFilter.simple_format(
            row.xpath('td[position()=3]/a/@href').extract())
        article_list_url = "%s%s" % (self.domain, href)
        publication_id = MYSQLUtils.save(self, "google_publication", item)[0]
        response.meta['publication_id'] = publication_id
        response.meta['h5_idx'] = item['h5_idx']
        yield Request(article_list_url,
                      callback=self.parse_article_list,
                      meta=response.meta)
def parse(self, response):
    """Collect author and affiliation entries from an article page and
    bulk-save both batches to MySQL."""
    pub_id = response.meta['publication_id']
    art_id = response.meta['article_id']

    authors = []
    for node in response.xpath('//*[contains(@class, "authorName")]'):
        rec = GoogleAuthorsItem()
        rec['publication_id'] = pub_id
        rec['article_id'] = art_id
        rec['affiliation_id'] = ''
        rec['fullname'] = DataFilter.simple_format(node.xpath('.').extract())
        rec['create_time'] = mysql_datetime()
        authors.append(rec)

    affiliations = []
    for node in response.xpath('//*[contains(@class, "affiliation")]'):
        rec = GoogleAffiliationItem()
        rec['publication_id'] = pub_id
        rec['article_id'] = art_id
        rec['desc'] = DataFilter.simple_format(node.xpath('.').extract())
        rec['create_time'] = mysql_datetime()
        affiliations.append(rec)

    MYSQLUtils.save(self, "google_authors", authors)
    MYSQLUtils.save(self, "google_affiliations", affiliations)
def parse_cate2(self, response):
    """Parse second-level category links and save them under the parent
    category id carried in response.meta."""
    parent_id = response.meta['cate1_id']
    records = []
    for link in response.xpath('//*[@id="gs_m_rbs"]/descendant::a'):
        rec = GoogleCategoryItem()
        rec['fid'] = parent_id
        rec['name'] = DataFilter.simple_format(link.xpath('.').extract())
        rec['cate_url'] = "%s%s" % (
            self.domain,
            DataFilter.simple_format(link.xpath("./@href").extract()))
        rec['create_time'] = mysql_datetime()
        records.append(rec)
    MYSQLUtils.save(self, "google_category", records)
def parse_candidate_research_item(self, response, cb_id):
    """Return a one-element list holding the candidate's research fields
    scraped from the profile page (fields are empty when absent)."""
    now_time = mysql_datetime()
    item = CandidateResearchItem()
    item['cb_id'] = cb_id
    # Each target field maps 1:1 to a page section identified by its id.
    for key, dom_id in (('interests', 'field-research-interests'),
                        ('current_research', 'field-current-research'),
                        ('research_summary', 'field-research-summary')):
        item[key] = DataFilter.simple_format(
            response.xpath('//*[@id="%s"]' % dom_id).extract())
    item['create_time'] = now_time
    return [item]
def parse(self, response):
    """Save each top-level category, then schedule its page for
    second-level category parsing with the new row id in meta."""
    for link in response.xpath('//*[@id="gs_m_broad"]/descendant::a'):
        url = "%s%s" % (
            self.domain,
            DataFilter.simple_format(link.xpath("./@href").extract()))
        rec = GoogleCategoryItem()
        rec['fid'] = 0  # top-level categories have no parent
        rec['name'] = DataFilter.simple_format(link.xpath('.').extract())
        rec['cate_url'] = url
        rec['create_time'] = mysql_datetime()
        cate1_id = MYSQLUtils.save(self, "google_category", rec)[0]
        yield Request(url,
                      callback=self.parse_cate2,
                      meta={"cate1_id": cate1_id})
def parse_staff_profile_url(self, staff):
    """Return the staff member's profile URL (normalised text) extracted
    from the contact-profile-url field of one staff card.

    Cleanup: the original ended with an unreachable ``pass`` after the
    ``return`` statement; it has been removed.
    """
    return DataFilter.simple_format(
        staff.xpath(
            'descendant::div[contains(@class, "field-name-field-contact-profile-url")]/descendant::a/@href'
        ).extract())
def parse_candidate_publications_item(self, response, cb_id):
    """Return CandidatePublicationsItem records from the recent-pubs
    paragraphs of a candidate profile page.

    Only the raw publication text is available on the page, so each
    record carries just the formatted paragraph plus bookkeeping fields.
    Cleanup: removed leftover debug prints (MySQLdb.escape_string loop),
    a duplicated xpath extraction, commented-out code and an unreachable
    ``pass`` after the ``return``.
    """
    now_time = mysql_datetime()
    items = []
    for pub in response.xpath('//*[@id="field-recent-pubs"]/descendant::p'):
        item = CandidatePublicationsItem()
        item['cb_id'] = cb_id
        # Use .//text() so text nested inside child tags (links, <em>, ...)
        # is kept, unlike the plain ./text() variant.
        item['publications'] = DataFilter.simple_format(
            pub.xpath(".//text()[normalize-space(.)]").extract())
        if not item['publications']:
            continue  # skip empty paragraphs
        item['create_time'] = now_time
        items.append(item)
    return items
def parse_text_by_multi_content(content, delimiter):
    """Join the formatted text of each selector in *content* with
    *delimiter* and return the resulting string.

    Fixes two defects of the original loop version: quadratic string
    concatenation, and a wrong result for an empty delimiter — the old
    ``text[:-len(delimiter)]`` evaluated to ``text[:-0]`` which is the
    empty string.
    """
    return delimiter.join(
        DataFilter.simple_format(part.xpath('.').extract())
        for part in content)
def parse_item(self, response):
    """Scrape phone number and email from the detail page and merge them
    into the basic item handed over through response.meta."""
    print(response.body)
    # Continue filling the item passed from the previous request.
    item = response.meta['basic_item']
    item['phonenumber'] = DataFilter.simple_format(
        response.xpath(
            '//*[@itemprop="telephone"]/text()[normalize-space(.)]'
        ).extract())
    item['email'] = DataFilter.simple_format(
        response.xpath(
            '//*[@itemprop="email"]/text()[normalize-space(.)]').extract())
    # Dump the assembled basic item for inspection.
    print(item)
    # Saving the basic/research-interest items to MySQL is omitted here.
def parse_article(self, response):
    """Parse each article row of the list page, stash its fields in
    response.meta and queue the article page so ``insert_article`` can
    persist it.

    Cleanup: the original re-assigned ``publication_id`` / ``cate1_id`` /
    ``cate2_id`` to themselves — pure no-ops, since those keys already
    travel in response.meta; the self-assignments have been removed.
    """
    for row in response.xpath(
            '//*[@id="gs_cit_list_table"]/tr[position()>1]'):
        article_sel = row.xpath('td[position()=1]')
        ref_sel = row.xpath('td[position()=2]')
        meta = response.meta
        meta['article_title'] = DataFilter.simple_format(
            article_sel.xpath('descendant::span[1]').extract())
        meta['article_link'] = DataFilter.simple_format(
            article_sel.xpath('descendant::a/@href').extract())
        meta['article_authors'] = DataFilter.simple_format(
            article_sel.xpath('descendant::span[2]').extract())
        meta['publish_info'] = DataFilter.simple_format(
            article_sel.xpath('descendant::span[3]').extract())
        meta['ref_link'] = "%s%s" % (
            self.domain,
            DataFilter.simple_format(
                ref_sel.xpath('descendant::a/@href').extract()))
        meta['ref_count'] = DataFilter.simple_format(
            ref_sel.xpath('descendant::a').extract())
        meta['publish_date'] = DataFilter.simple_format(
            row.xpath('td[position()=3]').extract())
        meta['create_time'] = mysql_datetime()
        yield Request(meta['article_link'],
                      callback=self.insert_article,
                      meta=meta)
def parse_article_list(self, response):
    """Fan out paginated article-list requests, one per page of 20
    articles, using the publication's h5-index as the total count.

    Bug fix: under Python 2, ``int(h5_idx) / page_size`` is integer
    (floor) division, so ``math.ceil`` never rounded up and the final
    partial page was silently skipped. Divide as floats before ceil.
    """
    page_size = 20
    total = int(response.meta['h5_idx'])
    page_count = int(math.ceil(total / float(page_size)))
    for page_number in range(1, page_count + 1):
        cstart = (page_number - 1) * page_size
        url = DataFilter.add_url_parameter(response.url,
                                           "cstart=%d" % cstart)
        yield Request(url, callback=self.parse_article, meta=response.meta)
def insert_article(self, response):
    """Persist one article; when the page is a meta-refresh redirect,
    resolve the real link first, then record the new row id in meta."""
    article_link = response.url
    refresh = DataFilter.simple_format(
        response.xpath('//meta[@http-equiv="refresh"]/@content').extract())
    match = re.search(self.article_link_pattern, refresh)
    if match:
        article_link = match.group(1)
    item = GoogleArticlesItem()
    # Copy every column the item declares straight from the carried meta.
    for column in MYSQLUtils.get_columns_by_item(item):
        item[column] = response.meta[column]
    item['article_link'] = article_link
    response.meta['article_id'] = MYSQLUtils.save(
        self, "google_articles", item)[0]
def parse_guesser(self, response):
    """Run the Bayes classifier over candidate "last" nodes and print
    the class probabilities of every meaningful text node.

    Exploration/debug routine: results are printed, not returned or
    stored. Cleanup: removed the large commented-out blocks that
    repeated the same loop for "last2"/"last3" nodes, a hard-coded
    xpath experiment, and the redundant trailing ``pass``.
    """
    nodes = extract_guesser_nodes(response)
    for xpath in nodes["last_nodes"]:
        node = nodes["last_nodes"][xpath]["node"]
        text = DataFilter.simple_format(node.xpath(".").extract())
        unknown_data, analyzer_result = BayesUtils.vectorize_unknown(
            [text], self.vectorizer)
        # Skip nodes whose text the analyzer deems noise.
        is_meaningful = check_text_meaningful(text, analyzer_result)
        if not is_meaningful:
            continue
        proba = self.clf.predict_proba(unknown_data)
        print(nodes["last_nodes"][xpath]["node"])
        print(text)
        print(proba)
def parse_candidate_publications_item(self, response, cb_id):
    """Return CandidatePublicationsItem records built from the non-empty
    recent-publications paragraphs of the profile page."""
    created = mysql_datetime()
    results = []
    paragraphs = response.xpath('//*[@id="field-recent-pubs"]/descendant::p')
    for paragraph in paragraphs:
        text = DataFilter.simple_format(
            paragraph.xpath('./text()[normalize-space(.)]').extract())
        if not text:
            continue  # nothing usable in this paragraph
        record = CandidatePublicationsItem()
        record['cb_id'] = cb_id
        record['publications'] = text
        record['create_time'] = created
        results.append(record)
    return results
def parse_candidate_courses_item(self, response, cb_id):
    """Return CandidateCoursesItem records, one per non-empty <li> in
    the courses-taught section of the profile page."""
    created = mysql_datetime()
    results = []
    for entry in response.xpath(
            '//*[@id="field-courses-taught"]/descendant::li'):
        text = DataFilter.simple_format(
            entry.xpath('./text()[normalize-space(.)]').extract())
        if not text:
            continue  # skip empty bullet points
        record = CandidateCoursesItem()
        record['cb_id'] = cb_id
        record['courses_no'] = '0'  # course number is not on the page
        record['courses_desc'] = text
        record['create_time'] = created
        results.append(record)
    return results
def parse_candidate_publications_item(self, summary, cb_id):
    """Return CandidatePublicationsItem records parsed from the
    "Representative Publications" section of the summary node.

    Only the free-text publication string is available here; structured
    fields are left for later analysis. Cleanup: removed the debug
    ``print items`` and the unreachable ``pass`` after ``return``.
    """
    now_time = mysql_datetime()
    items = []
    for pub in self.parse_content(summary, "Representative Publications"):
        item = CandidatePublicationsItem()
        item['cb_id'] = cb_id
        item['publications'] = DataFilter.simple_format(
            pub.xpath('.').extract())
        if not item['publications']:
            continue
        item['create_time'] = now_time
        items.append(item)
    return items
def parse_content(self, summary, head_str):
    """Return the <p> nodes belonging to the section headed by the <h2>
    whose text equals *head_str*.

    When a following <h2> exists, only paragraphs strictly between the
    two headings are selected; otherwise every trailing paragraph after
    the heading belongs to the section. Cleanup: removed a debug
    ``print content``, a commented-out xpath draft and the unreachable
    ``pass`` after ``return``.
    """
    next_h2 = summary.xpath(
        'descendant::h2[text()="%s"]/following-sibling::h2[1]' % head_str)
    if not next_h2:
        # Last section on the page: everything after the heading is ours.
        return summary.xpath(
            'descendant::h2[text()="%s"]/following-sibling::p' % head_str)
    next_h2_txt = DataFilter.simple_format(next_h2.extract())
    return summary.xpath(
        'descendant::p[preceding-sibling::h2/text()="%s" '
        'and following-sibling::h2/text()="%s"]' % (head_str, next_h2_txt))
def parse_candidate_education_item(self, response, cb_id):
    """Return CandidateEducationItem records; only the free-text desc is
    derivable from the page, so the structured fields stay empty."""
    created = mysql_datetime()
    results = []
    for entry in response.xpath('//*[@id="field-education"]/descendant::li'):
        desc = DataFilter.simple_format(
            entry.xpath('./text()[normalize-space(.)]').extract())
        if not desc:
            continue  # skip list items with no usable text
        record = CandidateEducationItem()
        record['cb_id'] = cb_id
        # Structured education fields are not present on the page.
        for blank in ('college', 'discipline', 'start_time', 'end_time',
                      'duration', 'degree'):
            record[blank] = ''
        record['desc'] = desc
        record['create_time'] = created
        results.append(record)
    return results
def parse_candidate_basic_item(self, response):
    """Build and return the CandidateBasicItem for one faculty profile
    page.

    Contact fields are read from labelled blocks and icon siblings;
    fields the page cannot provide stay empty. Cleanup: removed
    commented-out code and the unreachable ``pass`` after ``return``.
    """
    item = CandidateBasicItem()
    item['country_id'] = self.country_id
    item['college_id'] = self.college_id
    item['discipline_id'] = '0'
    item['fullname'] = DataFilter.simple_format(
        response.xpath(
            '//h1[@id="page-title"]/text()[normalize-space(.)]').extract())
    item['academic_title'] = DataFilter.simple_format(
        response.xpath(
            '//div[contains(@class, "field-label") and contains(text(), "Academic Title")]/following-sibling::*'
        ).extract())
    item['other_title'] = DataFilter.simple_format(
        response.xpath(
            '//div[contains(@class, "field-label") and contains(text(), "Other Titles")]/following-sibling::*'
        ).extract())
    # Guess nationality by matching the surname against the Chinese
    # surname list.
    item['nationality'] = get_chinese_by_fullname(item['fullname'],
                                                  surname_list)
    item['email'] = DataFilter.simple_format(
        response.xpath(
            '//a[contains(@href, "mailto:")]/text()[normalize-space(.)]'
        ).extract())
    item['phonenumber'] = DataFilter.simple_format(
        response.xpath(
            '//*[contains(@class, "fa-phone")]/parent::*/following-sibling::*'
        ).extract())
    item['external_link'] = DataFilter.simple_format(
        response.xpath(
            '//*[contains(@class, "fa-external-link")]/parent::*/following-sibling::*'
        ).extract())
    item['experience'] = ''
    item['desc'] = ''
    item['avatar_url'] = DataFilter.simple_format(
        response.xpath(
            '//div[contains(@class, "field-name-field-profile-photo")]/descendant::img/@src'
        ).extract())
    item['create_time'] = mysql_datetime()
    item['extra'] = ''
    item['url'] = response.url
    return item
def parse_candidate_workexperience_item(self, response, cb_id):
    """Return CandidateWorkexperienceItem records; only the free-text
    desc is available, so the structured job fields remain empty."""
    created = mysql_datetime()
    results = []
    for entry in response.xpath(
            '//*[@id="field-professional-experience"]/descendant::p'):
        desc = DataFilter.simple_format(
            entry.xpath('./text()[normalize-space(.)]').extract())
        if not desc:
            continue  # skip paragraphs with no usable text
        record = CandidateWorkexperienceItem()
        record['cb_id'] = cb_id
        # Structured experience fields are not present on the page.
        for blank in ('job_title', 'company', 'start_time', 'end_time',
                      'duration'):
            record[blank] = ''
        record['desc'] = desc
        record['create_time'] = created
        results.append(record)
    return results
def parse_candidate_education_item(self, response, cb_id):
    """Return CandidateEducationItem records from the education list.

    Stanford pages expose no structured education fields, so only the
    free-text ``desc`` is populated; the remaining fields are left empty
    for later analysis. Cleanup: removed the unreachable ``pass`` after
    ``return`` and a commented-out debug print.
    """
    now_time = mysql_datetime()
    items = []
    for edu in response.xpath('//*[@id="field-education"]/descendant::li'):
        item = CandidateEducationItem()
        item['cb_id'] = cb_id
        item['college'] = ''
        item['discipline'] = ''
        item['start_time'] = ''
        item['end_time'] = ''
        item['duration'] = ''
        item['degree'] = ''
        item['desc'] = DataFilter.simple_format(
            edu.xpath("./text()[normalize-space(.)]").extract())
        if not item['desc']:
            continue
        item['create_time'] = now_time
        items.append(item)
    return items
def parse_candidate_basic_item(self, staff):
    """Return a one-element list with the CandidateBasicItem built from
    one staff card (schema.org-annotated markup).

    Cleanup: removed the debug ``print items`` and the unreachable
    ``pass`` after ``return``.
    """
    item = CandidateBasicItem()
    item['country_id'] = self.country_id
    item['college_id'] = self.college_id
    item['discipline_id'] = '0'
    item['fullname'] = DataFilter.simple_format(
        staff.xpath('descendant::*[@property="schema:name"]').extract())
    item['academic_title'] = DataFilter.simple_format(
        staff.xpath('descendant::*[@property="schema:jobTitle"]').extract())
    item['other_title'] = ''
    item['nationality'] = get_chinese_by_fullname(item['fullname'],
                                                  surname_list)
    item['email'] = DataFilter.simple_format(
        staff.xpath('descendant::*[@property="schema:email"]').extract())
    item['phonenumber'] = DataFilter.simple_format(
        staff.xpath(
            'descendant::*[@property="schema:telephone"]').extract())
    item['external_link'] = DataFilter.simple_format(
        staff.xpath(
            'descendant::*[contains(@class, "field-name-field-contact-website-url")]/descendant::a/@href'
        ).extract())
    item['experience'] = ''
    item['desc'] = ''
    item['avatar_url'] = DataFilter.simple_format(
        staff.xpath(
            'descendant::*[contains(@class, "field-name-field-contact-image")]/descendant::img/@src'
        ).extract())
    item['create_time'] = mysql_datetime()
    location = DataFilter.simple_format(
        staff.xpath(
            'descendant::*[@property="schema:workLocation"]').extract())
    group = parse_text_by_multi_content(
        staff.xpath('descendant::*[@rel="schema:affiliation"]'), ",")
    # NOTE(review): hand-built JSON breaks if location/group contain
    # quotes — consider json.dumps; kept as-is to preserve output format.
    item['extra'] = '{"location": "%s", "group": "%s"}' % (location, group)
    return [item]
def parse(self, response):
    """Demo parse for a dynamic page: assemble a partial basic item,
    extract the hcard URL with a regex, and follow it while carrying
    the item in the request meta (scrapy's context-passing mechanism)."""
    print(response.body)
    item = CandidateBasicItem()
    item['country_id'] = self.country_id
    item['college_id'] = self.college_id
    item['discipline_id'] = '0'
    item['avatar_url'] = DataFilter.simple_format(
        response.xpath('//*[@id="faculty_image"]/img/@src').extract())
    # Example only — not every field is scraped here. Research-interest
    # and publication items would travel in meta the same way as the
    # basic item below.
    hcard_match = re.search(self.hcard_pattern, response.body)
    meta = {"basic_item": item}
    if hcard_match:
        hcard_url = hcard_match.group(1)
        print(hcard_url)
        return Request(hcard_url, callback=self.parse_item, meta=meta)
    # No hcard found: nothing to follow for this professor (acts like a
    # skipped loop iteration).