def parse(self, response):
    """Store the authors and affiliations listed on an article page.

    Elements whose ``class`` contains "authorName" are turned into
    GoogleAuthorsItem records and elements whose ``class`` contains
    "affiliation" into GoogleAffiliationItem records. Every record carries
    the ``publication_id`` and ``article_id`` read from ``response.meta``.
    Both lists are handed to ``MYSQLUtils.save`` (authors first, then
    affiliations); nothing is returned.
    """
    def tag_with_article(record):
        # Copy the identifiers of the article this page belongs to.
        record['publication_id'] = response.meta['publication_id']
        record['article_id'] = response.meta['article_id']
        return record

    authors = []
    for author_node in response.xpath('//*[contains(@class, "authorName")]'):
        author = tag_with_article(GoogleAuthorsItem())
        # No affiliation id is assigned here; the field is left blank.
        author['affiliation_id'] = ''
        raw_name = author_node.xpath('.').extract()
        author['fullname'] = DataFilter.simple_format(raw_name)
        author['create_time'] = mysql_datetime()
        authors.append(author)

    affiliations = []
    for affiliation_node in response.xpath(
            '//*[contains(@class, "affiliation")]'):
        affiliation = tag_with_article(GoogleAffiliationItem())
        raw_desc = affiliation_node.xpath('.').extract()
        affiliation['desc'] = DataFilter.simple_format(raw_desc)
        affiliation['create_time'] = mysql_datetime()
        affiliations.append(affiliation)

    MYSQLUtils.save(self, "google_authors", authors)
    MYSQLUtils.save(self, "google_affiliations", affiliations)
def parse_article(self, response):
    """Yield one request per article row of the "gs_cit_list_table" table.

    The first table row is skipped (``tr[position()>1]``). For every other
    row the title, link, authors, publish info, reference link, reference
    count and publish date are extracted and passed to
    ``self.insert_article`` through the request ``meta``.

    Each request receives its own copy of ``response.meta`` (so inherited
    keys such as ``publication_id``, ``cate1_id`` and ``cate2_id`` travel
    along unchanged) instead of the row fields being written into
    ``response.meta`` itself. Rows for which no article link could be
    extracted are logged and skipped: a request cannot be built from an
    empty URL, and the exception would otherwise end this generator before
    the remaining rows are processed.
    """
    import logging  # function-scope import: only needed for the skip warning

    def cell_text(node, path):
        # Extract ``path`` below ``node`` and pass it through simple_format.
        return DataFilter.simple_format(node.xpath(path).extract())

    for row in response.xpath('//*[@id="gs_cit_list_table"]/tr[position()>1]'):
        article_cell = row.xpath('td[position()=1]')
        ref_cell = row.xpath('td[position()=2]')

        article_title = cell_text(article_cell, 'descendant::span[1]')
        article_link = cell_text(article_cell, 'descendant::a/@href')
        if not article_link:
            logging.getLogger(__name__).warning(
                "Skipping article row without a link: %r", article_title)
            continue

        # Per-row copy: the response's own meta stays untouched.
        row_meta = dict(response.meta)
        row_meta['article_title'] = article_title
        row_meta['article_link'] = article_link
        row_meta['article_authors'] = cell_text(
            article_cell, 'descendant::span[2]')
        row_meta['publish_info'] = cell_text(
            article_cell, 'descendant::span[3]')
        # The reference href is appended to the spider's domain prefix.
        row_meta['ref_link'] = "%s%s" % (
            self.domain, cell_text(ref_cell, 'descendant::a/@href'))
        row_meta['ref_count'] = cell_text(ref_cell, 'descendant::a')
        row_meta['publish_date'] = cell_text(row, 'td[position()=3]')
        row_meta['create_time'] = mysql_datetime()

        yield Request(article_link,
                      callback=self.insert_article,
                      meta=row_meta)
def parse_candidate_publications_item(self, response, cb_id):
    """Build one publications record per paragraph of the recent-pubs field.

    Args:
        response: page exposing ``xpath``; every ``p`` below the element
            with id "field-recent-pubs" is inspected.
        cb_id: value stored in each item's ``cb_id`` field.

    Returns:
        A list of CandidatePublicationsItem objects; paragraphs whose
        formatted text is empty are left out. All items share one
        ``create_time`` taken at the start of the call.
    """
    # Translated from the original comment: "Stanford University does not
    # expose the individual education-experience fields directly, so only
    # the desc field has a value; the other fields are left for later
    # analysis."
    # NOTE(review): that comment appears to have been copied from the
    # education parser - this item has no desc field.
    now_time = mysql_datetime()
    items = []
    paragraphs = response.xpath('//*[@id="field-recent-pubs"]/descendant::p')
    for paragraph in paragraphs:
        item = CandidatePublicationsItem()
        item['cb_id'] = cb_id
        # ".//text()" also collects text inside nested elements of the
        # paragraph; whitespace-only text nodes are filtered out.
        item['publications'] = DataFilter.simple_format(
            paragraph.xpath(".//text()[normalize-space(.)]").extract())
        if not item['publications']:
            continue
        item['create_time'] = now_time
        items.append(item)
    return items
def parse(self, response):
    """Save every ranked publication row and request its article list.

    All rows after the first of the "gs_cit_list_table" table are read.
    Each row becomes a GooglePublicationItem that is stored through
    ``MYSQLUtils.save``; the first element of the value returned by the
    save call is written to ``response.meta`` as ``publication_id``
    together with the row's ``h5_idx``. A request for the article-list link
    found in the third cell is then yielded with ``response.meta`` and
    ``self.parse_article_list`` as callback.
    """
    def cell_text(node, path):
        # Extract ``path`` below ``node`` and pass it through simple_format.
        return DataFilter.simple_format(node.xpath(path).extract())

    table_rows = response.xpath(
        '//*[@id="gs_cit_list_table"]/tr[position()>1]')
    for table_row in table_rows:
        publication = GooglePublicationItem()
        publication['cate1_id'] = response.meta['cate1_id']
        publication['cate2_id'] = response.meta['cate2_id']
        publication['name'] = cell_text(table_row, 'td[position()=2]')
        publication['desc'] = ''
        publication['h5_idx'] = cell_text(table_row, 'td[position()=3]')
        publication['h5_med'] = cell_text(table_row, 'td[position()=4]')
        publication['rank'] = cell_text(table_row, 'td[position()=1]')
        publication['create_time'] = mysql_datetime()

        # The link in the third cell is appended to the domain prefix.
        article_list_url = "%s%s" % (
            self.domain, cell_text(table_row, 'td[position()=3]/a/@href'))

        saved = MYSQLUtils.save(self, "google_publication", publication)
        response.meta['publication_id'] = saved[0]
        response.meta['h5_idx'] = publication['h5_idx']
        yield Request(article_list_url,
                      callback=self.parse_article_list,
                      meta=response.meta)
def parse_cate2(self, response):
    """Store the sub-categories linked from a first-level category page.

    Every ``a`` below the element with id "gs_m_rbs" becomes a
    GoogleCategoryItem whose ``fid`` is the ``cate1_id`` found in
    ``response.meta``. The collected items are passed to
    ``MYSQLUtils.save`` in a single call; nothing is returned.
    """
    parent_id = response.meta['cate1_id']
    categories = []
    links = response.xpath('//*[@id="gs_m_rbs"]/descendant::a')
    for link in links:
        category = GoogleCategoryItem()
        category['fid'] = parent_id
        link_text = link.xpath('.').extract()
        category['name'] = DataFilter.simple_format(link_text)
        # The href is appended to the spider's domain prefix.
        category['cate_url'] = "%s%s" % (
            self.domain,
            DataFilter.simple_format(link.xpath("./@href").extract()))
        category['create_time'] = mysql_datetime()
        categories.append(category)
    MYSQLUtils.save(self, "google_category", categories)
def parse_candidate_research_item(self, response, cb_id):
    """Build the research record for one candidate page.

    Args:
        response: page exposing ``xpath``; the elements with ids
            "field-research-interests", "field-current-research" and
            "field-research-summary" are read.
        cb_id: value stored in the item's ``cb_id`` field.

    Returns:
        A list containing the single CandidateResearchItem.
    """
    now_time = mysql_datetime()
    research = CandidateResearchItem()
    research['cb_id'] = cb_id

    # Item field -> XPath of the page section that fills it.
    field_paths = (
        ('interests', '//*[@id="field-research-interests"]'),
        ('current_research', '//*[@id="field-current-research"]'),
        ('research_summary', '//*[@id="field-research-summary"]'),
    )
    for field_name, path in field_paths:
        research[field_name] = DataFilter.simple_format(
            response.xpath(path).extract())

    research['create_time'] = now_time
    return [research]
def parse_candidate_publications_item(self, response, cb_id):
    """Build one publications record per paragraph of the recent-pubs field.

    Only the paragraph's own, non-blank text nodes are used
    (``./text()[normalize-space(.)]``). Paragraphs whose formatted text is
    empty produce no record.

    Args:
        response: page exposing ``xpath``.
        cb_id: value stored in each item's ``cb_id`` field.

    Returns:
        A list of CandidatePublicationsItem objects sharing one
        ``create_time``.
    """
    stamp = mysql_datetime()
    publications = []
    for paragraph in response.xpath(
            '//*[@id="field-recent-pubs"]/descendant::p'):
        record = CandidatePublicationsItem()
        record['cb_id'] = cb_id
        own_text = paragraph.xpath('./text()[normalize-space(.)]').extract()
        record['publications'] = DataFilter.simple_format(own_text)
        if record['publications']:
            record['create_time'] = stamp
            publications.append(record)
    return publications
def parse(self, response):
    """Save each first-level category and request its category page.

    Every ``a`` below the element with id "gs_m_broad" is stored as a
    GoogleCategoryItem with ``fid`` 0. The first element of the value
    returned by ``MYSQLUtils.save`` is forwarded as ``cate1_id`` in the
    meta of the request that is yielded for the category URL, with
    ``self.parse_cate2`` as callback.
    """
    for link in response.xpath('//*[@id="gs_m_broad"]/descendant::a'):
        # The href is appended to the spider's domain prefix.
        category_url = "%s%s" % (
            self.domain,
            DataFilter.simple_format(link.xpath("./@href").extract()))
        category_name = DataFilter.simple_format(link.xpath('.').extract())

        category = GoogleCategoryItem()
        category['fid'] = 0
        category['name'] = category_name
        category['cate_url'] = category_url
        category['create_time'] = mysql_datetime()

        saved = MYSQLUtils.save(self, "google_category", category)
        follow_up_meta = {"cate1_id": saved[0]}
        yield Request(category_url,
                      callback=self.parse_cate2,
                      meta=follow_up_meta)
def parse_candidate_research_item(self, summary, cb_id):
    """Build the research record from the "Research Interests" section.

    Args:
        summary: value handed to ``self.parse_content`` together with the
            section title.
        cb_id: value stored in the item's ``cb_id`` field.

    Returns:
        A list containing the single CandidateResearchItem. Only
        ``interests`` is filled from the page; ``current_research`` and
        ``research_summary`` are set to empty strings.
    """
    now_time = mysql_datetime()
    research = CandidateResearchItem()
    research['cb_id'] = cb_id

    section = self.parse_content(summary, "Research Interests")
    research['interests'] = parse_text_by_multi_content(section, "\n")
    for blank_field in ('current_research', 'research_summary'):
        research[blank_field] = ''
    research['create_time'] = now_time

    items = [research]
    # Dump of the parsed record to stdout.
    print(items)
    return items
def parse_candidate_courses_item(self, response, cb_id):
    """Build one course record per list entry of the courses-taught field.

    Only the entry's own, non-blank text nodes are used
    (``./text()[normalize-space(.)]``). Entries whose formatted text is
    empty produce no record.

    Args:
        response: page exposing ``xpath``; every ``li`` below the element
            with id "field-courses-taught" is inspected.
        cb_id: value stored in each item's ``cb_id`` field.

    Returns:
        A list of CandidateCoursesItem objects sharing one ``create_time``;
        ``courses_no`` is always the string '0'.
    """
    stamp = mysql_datetime()
    courses = []
    for entry in response.xpath(
            '//*[@id="field-courses-taught"]/descendant::li'):
        course = CandidateCoursesItem()
        course['cb_id'] = cb_id
        course['courses_no'] = '0'
        own_text = entry.xpath('./text()[normalize-space(.)]').extract()
        course['courses_desc'] = DataFilter.simple_format(own_text)
        if course['courses_desc']:
            course['create_time'] = stamp
            courses.append(course)
    return courses
def parse_candidate_publications_item(self, summary, cb_id):
    """Build publication records from "Representative Publications".

    Args:
        summary: value handed to ``self.parse_content`` together with the
            section title; the result is iterated and each element must
            expose ``xpath``.
        cb_id: value stored in each item's ``cb_id`` field.

    Returns:
        A list of CandidatePublicationsItem objects sharing one
        ``create_time``; entries whose formatted text is empty are left out.
    """
    # Translated from the original comment: "Stanford University does not
    # expose the individual education-experience fields directly, so only
    # the desc field has a value; the other fields are left for later
    # analysis."
    # NOTE(review): the comment looks copied from the education parser -
    # this item has no desc field.
    stamp = mysql_datetime()
    publications = []
    for node in self.parse_content(summary, "Representative Publications"):
        record = CandidatePublicationsItem()
        record['cb_id'] = cb_id
        record['publications'] = DataFilter.simple_format(
            node.xpath('.').extract())
        if record['publications']:
            record['create_time'] = stamp
            publications.append(record)
    # Dump of the parsed records to stdout.
    print(publications)
    return publications
def parse_candidate_workexperience_item(self, summary, cb_id):
    """Build the work-experience record from the sabbatical section.

    Args:
        summary: value handed to ``self.parse_content`` together with the
            title "Industrial and Sabbatical Experience".
        cb_id: value stored in the item's ``cb_id`` field.

    Returns:
        A list containing the single CandidateWorkexperienceItem. Only
        ``desc`` is filled from the page; the structured fields are set to
        empty strings.
    """
    now_time = mysql_datetime()
    section = self.parse_content(summary,
                                 "Industrial and Sabbatical Experience")

    experience = CandidateWorkexperienceItem()
    experience['cb_id'] = cb_id
    for blank_field in ('job_title', 'company', 'start_time', 'end_time',
                        'duration'):
        experience[blank_field] = ''
    experience['desc'] = parse_text_by_multi_content(section, "\n")
    experience['create_time'] = now_time

    items = [experience]
    # Dump of the parsed record to stdout.
    print(items)
    return items
def parse_candidate_education_item(self, summary, cb_id):
    """Build the education record from the "Degrees and Awards" section.

    Args:
        summary: value handed to ``self.parse_content`` together with the
            section title.
        cb_id: value stored in the item's ``cb_id`` field.

    Returns:
        A list containing the single CandidateEducationItem. Only ``desc``
        is filled from the page; the structured fields are set to empty
        strings.
    """
    now_time = mysql_datetime()
    section = self.parse_content(summary, "Degrees and Awards")

    education = CandidateEducationItem()
    education['cb_id'] = cb_id
    for blank_field in ('college', 'discipline', 'start_time', 'end_time',
                        'duration', 'degree'):
        education[blank_field] = ''
    education['desc'] = parse_text_by_multi_content(section, "\n")
    education['create_time'] = now_time

    items = [education]
    # Dump of the parsed record to stdout.
    print(items)
    return items
def parse_candidate_basic_item(self, response):
    """Build the basic-profile record from a candidate's profile page.

    Name, titles, e-mail, phone number, external link and avatar URL are
    taken from the page through XPath; ``country_id`` and ``college_id``
    come from the spider instance, and ``nationality`` is derived from the
    full name via ``get_chinese_by_fullname``. ``experience``, ``desc`` and
    ``extra`` are left empty and ``url`` is set to ``response.url``.

    Returns:
        The populated CandidateBasicItem (a single item, not a list).
    """
    def page_text(path):
        # Extract ``path`` from the page and pass it through simple_format.
        return DataFilter.simple_format(response.xpath(path).extract())

    profile = CandidateBasicItem()
    profile['country_id'] = self.country_id
    profile['college_id'] = self.college_id
    profile['discipline_id'] = '0'
    profile['fullname'] = page_text(
        '//h1[@id="page-title"]/text()[normalize-space(.)]')
    profile['academic_title'] = page_text(
        '//div[contains(@class, "field-label") and contains(text(), "Academic Title")]/following-sibling::*')
    profile['other_title'] = page_text(
        '//div[contains(@class, "field-label") and contains(text(), "Other Titles")]/following-sibling::*')
    profile['nationality'] = get_chinese_by_fullname(profile['fullname'],
                                                     surname_list)
    profile['email'] = page_text(
        '//a[contains(@href, "mailto:")]/text()[normalize-space(.)]')
    profile['phonenumber'] = page_text(
        '//*[contains(@class, "fa-phone")]/parent::*/following-sibling::*')
    profile['external_link'] = page_text(
        '//*[contains(@class, "fa-external-link")]/parent::*/following-sibling::*')
    profile['experience'] = ''
    profile['desc'] = ''
    profile['avatar_url'] = page_text(
        '//div[contains(@class, "field-name-field-profile-photo")]/descendant::img/@src')
    profile['create_time'] = mysql_datetime()
    profile['extra'] = ''
    profile['url'] = response.url
    return profile
def parse_candidate_education_item(self, response, cb_id):
    """Build one education record per list entry of the education field.

    Only the entry's own, non-blank text nodes are used
    (``./text()[normalize-space(.)]``) and stored in ``desc``; the
    structured fields are set to empty strings. Entries whose formatted
    text is empty produce no record.

    Args:
        response: page exposing ``xpath``; every ``li`` below the element
            with id "field-education" is inspected.
        cb_id: value stored in each item's ``cb_id`` field.

    Returns:
        A list of CandidateEducationItem objects sharing one
        ``create_time``.
    """
    stamp = mysql_datetime()
    records = []
    for entry in response.xpath('//*[@id="field-education"]/descendant::li'):
        education = CandidateEducationItem()
        education['cb_id'] = cb_id
        for blank_field in ('college', 'discipline', 'start_time',
                            'end_time', 'duration', 'degree'):
            education[blank_field] = ''
        own_text = entry.xpath('./text()[normalize-space(.)]').extract()
        education['desc'] = DataFilter.simple_format(own_text)
        if education['desc']:
            education['create_time'] = stamp
            records.append(education)
    return records
def parse_candidate_workexperience_item(self, response, cb_id):
    """Build one work-experience record per paragraph of the experience field.

    Only the paragraph's own, non-blank text nodes are used
    (``./text()[normalize-space(.)]``) and stored in ``desc``; the
    structured fields are set to empty strings. Paragraphs whose formatted
    text is empty produce no record.

    Args:
        response: page exposing ``xpath``; every ``p`` below the element
            with id "field-professional-experience" is inspected.
        cb_id: value stored in each item's ``cb_id`` field.

    Returns:
        A list of CandidateWorkexperienceItem objects sharing one
        ``create_time``.
    """
    stamp = mysql_datetime()
    records = []
    paragraphs = response.xpath(
        '//*[@id="field-professional-experience"]/descendant::p')
    for paragraph in paragraphs:
        experience = CandidateWorkexperienceItem()
        experience['cb_id'] = cb_id
        for blank_field in ('job_title', 'company', 'start_time',
                            'end_time', 'duration'):
            experience[blank_field] = ''
        own_text = paragraph.xpath('./text()[normalize-space(.)]').extract()
        experience['desc'] = DataFilter.simple_format(own_text)
        if experience['desc']:
            experience['create_time'] = stamp
            records.append(experience)
    return records
def parse_candidate_basic_item(self, staff):
    """Build the basic-profile record for one staff node.

    Name, job title, e-mail, telephone, website link and image URL are read
    from the node through XPath; ``country_id`` and ``college_id`` come from
    the spider instance, and ``nationality`` is derived from the full name
    via ``get_chinese_by_fullname``. The work location and affiliation
    group are stored together in ``extra`` as a JSON object with the keys
    "location" and "group".

    The two ``extra`` values are JSON-escaped before being placed into the
    object, so text containing double quotes, backslashes or control
    characters still produces valid JSON. For text that needs no escaping
    the resulting string is the same as plain "%s" interpolation.

    Args:
        staff: selector for one staff entry (must expose ``xpath``).

    Returns:
        A one-element list holding the CandidateBasicItem.
    """
    import json  # function-scope import: only needed for the extra field

    def staff_text(path):
        # Extract ``path`` below the staff node and pass it through
        # simple_format.
        return DataFilter.simple_format(staff.xpath(path).extract())

    def json_string(value):
        # "%s" yields the same text the plain template would interpolate;
        # json.dumps then adds the quotes and escapes special characters.
        # ensure_ascii=False leaves non-ASCII characters as they are.
        return json.dumps('%s' % (value,), ensure_ascii=False)

    items = []
    item = CandidateBasicItem()
    item['country_id'] = self.country_id
    item['college_id'] = self.college_id
    item['discipline_id'] = '0'
    item['fullname'] = staff_text('descendant::*[@property="schema:name"]')
    item['academic_title'] = staff_text(
        'descendant::*[@property="schema:jobTitle"]')
    item['other_title'] = ''
    item['nationality'] = get_chinese_by_fullname(item['fullname'],
                                                  surname_list)
    item['email'] = staff_text('descendant::*[@property="schema:email"]')
    item['phonenumber'] = staff_text(
        'descendant::*[@property="schema:telephone"]')
    # Adjacent literals instead of a backslash continuation inside the
    # string, so no stray characters end up in the XPath expression.
    item['external_link'] = staff_text(
        'descendant::*[contains(@class, "field-name-field-contact-website-url")]'
        '/descendant::a/@href')
    item['experience'] = ''
    item['desc'] = ''
    item['avatar_url'] = staff_text(
        'descendant::*[contains(@class, "field-name-field-contact-image")]'
        '/descendant::img/@src')
    item['create_time'] = mysql_datetime()

    location = staff_text('descendant::*[@property="schema:workLocation"]')
    group = parse_text_by_multi_content(
        staff.xpath('descendant::*[@rel="schema:affiliation"]'), ",")
    item['extra'] = '{"location": %s, "group": %s}' % (
        json_string(location), json_string(group))

    items.append(item)
    # Dump of the parsed record to stdout.
    print(items)
    return items
def parse_candidate_education_item(self, response, cb_id):
    """Build one education record per list entry of the education field.

    Args:
        response: page exposing ``xpath``; every ``li`` below the element
            with id "field-education" is inspected.
        cb_id: value stored in each item's ``cb_id`` field.

    Returns:
        A list of CandidateEducationItem objects sharing one
        ``create_time``; entries whose formatted text is empty are left out.
    """
    stamp = mysql_datetime()
    records = []
    for entry in response.xpath('//*[@id="field-education"]/descendant::li'):
        education = CandidateEducationItem()
        # Translated from the original comment: Stanford University does
        # not expose the individual education-experience fields directly,
        # so only the desc field has a value; the other fields are left
        # for later analysis.
        education['cb_id'] = cb_id
        for blank_field in ('college', 'discipline', 'start_time',
                            'end_time', 'duration', 'degree'):
            education[blank_field] = ''
        # Only the entry's own, non-blank text nodes are used.
        own_text = entry.xpath("./text()[normalize-space(.)]").extract()
        education['desc'] = DataFilter.simple_format(own_text)
        if education['desc']:
            education['create_time'] = stamp
            records.append(education)
    return records