def parse(self, response):
    # Collect every author name on the article page.
    author_items = []
    for author in response.xpath('//*[contains(@class, "authorName")]'):
        author_item = GoogleAuthorsItem()
        author_item['publication_id'] = response.meta['publication_id']
        author_item['article_id'] = response.meta['article_id']
        author_item['affiliation_id'] = ''  # filled in once affiliations are linked
        author_item['fullname'] = DataFilter.simple_format(
            author.xpath('.').extract())
        author_item['create_time'] = mysql_datetime()
        author_items.append(author_item)

    # Collect every affiliation block on the same page.
    affiliation_items = []
    for affiliation in response.xpath('//*[contains(@class, "affiliation")]'):
        affiliation_item = GoogleAffiliationItem()
        affiliation_item['publication_id'] = response.meta['publication_id']
        affiliation_item['article_id'] = response.meta['article_id']
        affiliation_item['desc'] = DataFilter.simple_format(
            affiliation.xpath('.').extract())
        affiliation_item['create_time'] = mysql_datetime()
        affiliation_items.append(affiliation_item)

    MYSQLUtils.save(self, "google_authors", author_items)
    MYSQLUtils.save(self, "google_affiliations", affiliation_items)
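# The two helpers used above are project-internal and not shown in this
# section. A minimal sketch of what they are assumed to do (names match
# their usage here; the bodies are guesses, not the project's real code):
# simple_format() flattens an XPath extract() list into clean text, and
# mysql_datetime() formats "now" for a MySQL DATETIME column.
import re
from datetime import datetime


class DataFilter(object):
    @staticmethod
    def simple_format(extracted):
        text = ' '.join(extracted)            # extract() returns a list of strings
        text = re.sub(r'<[^>]+>', '', text)   # strip residual HTML tags
        return re.sub(r'\s+', ' ', text).strip()


def mysql_datetime():
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')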
def __init__(self, mode=None, **kwargs):
    self.db = mysql_connection
    self.mode = mode
    if mode == "init":
        # Rebuild the category tree first, then drop stale article rows.
        os.system("scrapy crawl GoogleScholarCategorySpider")
        MYSQLUtils.cleanup_google_publication_articles(self)
    super(GoogleScholarSpider, self).__init__(**kwargs)
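# Usage note (assumed, via Scrapy's standard spider-argument mechanism): the
# mode flag is passed with -a on the command line, e.g.
#   scrapy crawl GoogleScholarSpider -a mode=init
# so an "init" run re-crawls the category tree and clears stale
# publication/article rows before the main crawl starts.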
def read(self):
    self.data['courses'] = MYSQLUtils.fetch_courses_data(self.db)
    self.data['education'] = MYSQLUtils.fetch_education_data(self.db)
    self.data['publications'] = MYSQLUtils.fetch_publications_data(self.db)
    self.data['research'] = MYSQLUtils.fetch_research_data(self.db)
    self.data['workexperience'] = MYSQLUtils.fetch_workexperience_data(self.db)
    return self
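# Usage sketch (the enclosing reader class name below is hypothetical):
# read() returns self, so fetching and accessing the data can be chained.
#   reader = CandidateDataReader()
#   snapshot = reader.read().data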
def __init__(self, fmt="mysql", **kwargs):
    self.fmt = fmt
    if fmt == "mysql":
        self.db = mysql_connection
        MYSQLUtils.cleanup_data(self)
    else:
        # Non-MySQL output: delegate storage to a file-based handler.
        self.fh = FileHandler()
        self.fh.cleanup_data(self, fmt)
    super(StanfordSpider, self).__init__(**kwargs)
def parse_cate2(self, response):
    cate1_id = response.meta['cate1_id']
    items = []
    for a in response.xpath('//*[@id="gs_m_rbs"]/descendant::a'):
        item = GoogleCategoryItem()
        item['fid'] = cate1_id
        item['name'] = DataFilter.simple_format(a.xpath('.').extract())
        item['cate_url'] = "%s%s" % (
            self.domain,
            DataFilter.simple_format(a.xpath("./@href").extract()))
        item['create_time'] = mysql_datetime()
        items.append(item)
    MYSQLUtils.save(self, "google_category", items)
def insert_article(self, response):
    # If the page is a <meta http-equiv="refresh"> redirect, prefer the
    # redirect target as the article link; otherwise keep response.url.
    article_link = response.url
    content = DataFilter.simple_format(
        response.xpath('//meta[@http-equiv="refresh"]/@content').extract())
    article_link_match = re.search(self.article_link_pattern, content)
    if article_link_match:
        article_link = article_link_match.group(1)

    item = GoogleArticlesItem()
    for key in MYSQLUtils.get_columns_by_item(item):
        item[key] = response.meta[key]
    item['article_link'] = article_link

    article_id = MYSQLUtils.save(self, "google_articles", item)[0]
    response.meta['article_id'] = article_id  # pass the new id downstream
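# Hedged sketch of self.article_link_pattern, which is defined elsewhere in
# the spider. A refresh tag's content typically looks like
# "0;URL=http://example.org/x", so one capture group after "URL=" recovers
# the redirect target. The exact pattern below is an assumption.
article_link_pattern = r'URL=(\S+)'

content = '0;URL=http://example.org/paper.pdf'
match = re.search(article_link_pattern, content)
if match:
    print(match.group(1))  # http://example.org/paper.pdf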
def parse(self, response):
    for row in response.xpath(
            '//*[@id="gs_cit_list_table"]/tr[position()>1]'):
        item = GooglePublicationItem()
        item['cate1_id'] = response.meta['cate1_id']
        item['cate2_id'] = response.meta['cate2_id']
        item['name'] = DataFilter.simple_format(
            row.xpath('td[position()=2]').extract())
        item['desc'] = ''
        item['h5_idx'] = DataFilter.simple_format(
            row.xpath('td[position()=3]').extract())
        item['h5_med'] = DataFilter.simple_format(
            row.xpath('td[position()=4]').extract())
        item['rank'] = DataFilter.simple_format(
            row.xpath('td[position()=1]').extract())
        item['create_time'] = mysql_datetime()
        article_list_url = "%s%s" % (
            self.domain,
            DataFilter.simple_format(
                row.xpath('td[position()=3]/a/@href').extract()))
        publication_id = MYSQLUtils.save(self, "google_publication", item)[0]
        # Copy the meta dict per request: mutating response.meta inside the
        # loop would make every yielded request share the last publication_id.
        meta = dict(response.meta)
        meta['publication_id'] = publication_id
        meta['h5_idx'] = item['h5_idx']
        yield Request(article_list_url,
                      callback=self.parse_article_list,
                      meta=meta)
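# Hedged sketch of MYSQLUtils.save as these spiders use it (the real helper
# lives elsewhere in the project): insert one item or a list of items into
# the named table, using the item keys as column names, and return the new
# primary keys so that save(...)[0] is the id of a single inserted row.
class MYSQLUtils(object):
    @staticmethod
    def save(spider, table, items):
        if not isinstance(items, list):
            items = [items]
        ids = []
        cursor = spider.db.cursor()
        for item in items:
            keys = sorted(item.keys())
            sql = "INSERT INTO %s (%s) VALUES (%s)" % (
                table, ', '.join(keys), ', '.join(['%s'] * len(keys)))
            cursor.execute(sql, [item[k] for k in keys])
            ids.append(cursor.lastrowid)
        spider.db.commit()
        return ids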
def start_requests(self):
    cate_list = MYSQLUtils.fetch_cate_list(self)
    for cate in cate_list:
        meta = {"cate1_id": cate['fid'], "cate2_id": cate['cate_id']}
        cate_url = get_google_spider_url(cate['cate_url'])
        yield Request(cate_url, callback=self.parse, meta=meta)
def parse_staff_profile(self, response):
    cb_id = response.meta['cb_id']
    summary = response.xpath(
        '//div[contains(@class, "field-type-text-with-summary")]')

    ce_items = self.parse_candidate_education_item(summary, cb_id)
    MYSQLUtils.save(self, "candidate_education", ce_items)

    cr_items = self.parse_candidate_research_item(summary, cb_id)
    MYSQLUtils.save(self, "candidate_research", cr_items)

    cp_items = self.parse_candidate_publications_item(summary, cb_id)
    MYSQLUtils.save(self, "candidate_publications", cp_items)

    cc_items = self.parse_candidate_courses_item(summary, cb_id)
    MYSQLUtils.save(self, "candidate_courses", cc_items)

    cw_items = self.parse_candidate_workexperience_item(summary, cb_id)
    MYSQLUtils.save(self, "candidate_workexperience", cw_items)
def start_requests(self):
    article_list = MYSQLUtils.fetch_article_list(self, self.domain)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/55.0.2883.95 Safari/537.36'
    }
    for article in article_list:
        # Forward every DB column of the article row to the callback.
        meta = dict(article)
        yield Request(article['article_link'],
                      callback=self.parse,
                      meta=meta,
                      headers=headers)
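# Design note: setting the header per request works, but Scrapy also lets a
# project set it once via the standard USER_AGENT setting (an alternative,
# not something this spider currently does):
#   # settings.py
#   USER_AGENT = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) '
#                 'AppleWebKit/537.36 (KHTML, like Gecko) '
#                 'Chrome/55.0.2883.95 Safari/537.36')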
def parse(self, response):
    for a in response.xpath('//*[@id="gs_m_broad"]/descendant::a'):
        cate1_url = "%s%s" % (
            self.domain,
            DataFilter.simple_format(a.xpath("./@href").extract()))
        cate1_name = DataFilter.simple_format(a.xpath('.').extract())
        item = GoogleCategoryItem()
        item['fid'] = 0  # 0 marks a top-level category; children get fid=cate1_id
        item['name'] = cate1_name
        item['cate_url'] = cate1_url
        item['create_time'] = mysql_datetime()
        cate1_id = MYSQLUtils.save(self, "google_category", item)[0]
        yield Request(cate1_url,
                      callback=self.parse_cate2,
                      meta={"cate1_id": cate1_id})
def parse(self, response):
    for staff in response.xpath('//div[contains(@class, "staff-contact")]'):
        cb_items = self.parse_candidate_basic_item(staff)
        cb_id = MYSQLUtils.save(self, "candidate_basic", cb_items)[0]
        staff_profile_url = self.parse_staff_profile_url(staff)
        if staff_profile_url:
            yield Request(staff_profile_url,
                          callback=self.parse_staff_profile,
                          meta={"cb_id": cb_id})
def __init__(self, **kwargs):
    self.db = mysql_connection
    MYSQLUtils.cleanup_data(self)
    super(StanfordSpider, self).__init__(**kwargs)

def __init__(self, **kwargs):
    self.db = mysql_connection
    MYSQLUtils.cleanup_data(self)
    super(ComputerScienceOfWaterLooSpider, self).__init__(**kwargs)

def __init__(self, **kwargs):
    self.db = mysql_connection
    MYSQLUtils.cleanup_google_category(self)
    super(GoogleScholarCategorySpider, self).__init__(**kwargs)
def parse_item(self, response):
    cb_item = self.parse_candidate_basic_item(response)
    cb_id = MYSQLUtils.save(self, 'candidate_basic', cb_item)[0]

    ce_items = self.parse_candidate_education_item(response, cb_id)
    MYSQLUtils.save(self, 'candidate_education', ce_items)

    cr_items = self.parse_candidate_research_item(response, cb_id)
    MYSQLUtils.save(self, 'candidate_research', cr_items)

    cp_items = self.parse_candidate_publications_item(response, cb_id)
    MYSQLUtils.save(self, 'candidate_publications', cp_items)

    cc_items = self.parse_candidate_courses_item(response, cb_id)
    MYSQLUtils.save(self, 'candidate_courses', cc_items)

    cw_items = self.parse_candidate_workexperience_item(response, cb_id)
    MYSQLUtils.save(self, 'candidate_workexperience', cw_items)
def parse_item(self, response):
    cb_item = self.parse_candidate_basic_item(response)
    if self.fmt == "mysql":
        cb_id = MYSQLUtils.save(self, "candidate_basic", cb_item)[0]
    else:
        # File output has no AUTO_INCREMENT id; derive one from name + URL.
        cb_id = self.fh.generate_id(cb_item['fullname'] + cb_item['url'])

    ce_items = self.parse_candidate_education_item(response, cb_id)
    cr_items = self.parse_candidate_research_item(response, cb_id)
    cp_items = self.parse_candidate_publications_item(response, cb_id)
    cc_items = self.parse_candidate_courses_item(response, cb_id)
    cw_items = self.parse_candidate_workexperience_item(response, cb_id)

    if self.fmt == "mysql":
        MYSQLUtils.save(self, "candidate_education", ce_items)
        MYSQLUtils.save(self, "candidate_research", cr_items)
        MYSQLUtils.save(self, "candidate_publications", cp_items)
        MYSQLUtils.save(self, "candidate_courses", cc_items)
        MYSQLUtils.save(self, "candidate_workexperience", cw_items)
    else:
        self.fh.data['candidate_basic']['item'] = cb_item
        self.fh.data['candidate_education']['item'] = ce_items
        self.fh.data['candidate_research']['item'] = cr_items
        self.fh.data['candidate_publications']['item'] = cp_items
        self.fh.data['candidate_courses']['item'] = cc_items
        self.fh.data['candidate_workexperience']['item'] = cw_items
        self.fh.write(self.fmt)
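# Hedged sketch of the FileHandler fallback used when fmt != "mysql".
# Assumed behaviour, inferred only from the calls above: generate_id()
# derives a stable surrogate key from name + URL, items accumulate in a
# table-keyed dict, and write() dumps everything in the requested format.
# The real class is defined elsewhere in the project.
import hashlib
import json


class FileHandler(object):
    TABLES = ('candidate_basic', 'candidate_education', 'candidate_research',
              'candidate_publications', 'candidate_courses',
              'candidate_workexperience')

    def __init__(self):
        self.data = {t: {'item': None} for t in self.TABLES}

    def cleanup_data(self, spider, fmt):
        # Reset any previously accumulated output before a fresh crawl.
        self.data = {t: {'item': None} for t in self.TABLES}

    def generate_id(self, seed):
        return hashlib.md5(seed.encode('utf-8')).hexdigest()

    def write(self, fmt):
        if fmt == 'json':
            with open('candidates.json', 'w') as fp:
                json.dump(self.data, fp, default=str)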
def __init__(self, mode=None, **kwargs):
    self.db = mysql_connection
    MYSQLUtils.cleanup_google_author_affiliations(self, self.domain)
    super(ScienceDirectSpider, self).__init__(**kwargs)