예제 #1
0
    def parse(self, response):
        """Persist the authors and affiliations listed on an article page.

        Each element whose class contains "authorName" yields one
        GoogleAuthorsItem, and each element whose class contains
        "affiliation" yields one GoogleAffiliationItem. The two batches are
        handed to MYSQLUtils.save; the method itself returns nothing.

        :param response: response whose ``meta`` carries ``publication_id``
            and ``article_id`` (read once per matched element).
        """
        authors = []
        author_nodes = response.xpath('//*[contains(@class, "authorName")]')
        for node in author_nodes:
            record = GoogleAuthorsItem()
            record['publication_id'] = response.meta['publication_id']
            record['article_id'] = response.meta['article_id']
            # affiliation_id is always stored as an empty string here.
            record['affiliation_id'] = ''
            name_markup = node.xpath('.').extract()
            record['fullname'] = DataFilter.simple_format(name_markup)
            record['create_time'] = mysql_datetime()
            authors.append(record)

        affiliations = []
        affiliation_nodes = response.xpath(
            '//*[contains(@class, "affiliation")]')
        for node in affiliation_nodes:
            record = GoogleAffiliationItem()
            record['publication_id'] = response.meta['publication_id']
            record['article_id'] = response.meta['article_id']
            desc_markup = node.xpath('.').extract()
            record['desc'] = DataFilter.simple_format(desc_markup)
            record['create_time'] = mysql_datetime()
            affiliations.append(record)

        # Authors are saved first, then affiliations.
        MYSQLUtils.save(self, "google_authors", authors)
        MYSQLUtils.save(self, "google_affiliations", affiliations)
예제 #2
0
 def parse_article(self, response):
     """Yield one article-page request per row of the citation table.

     For every row after the first in the table with id
     "gs_cit_list_table", the article and reference cells are scraped and
     the results are attached to the request meta under the keys
     ``article_title``, ``article_link``, ``article_authors``,
     ``publish_info``, ``ref_link``, ``ref_count``, ``publish_date`` and
     ``create_time``. Everything already present in ``response.meta``
     (e.g. ``publication_id``, ``cate1_id``, ``cate2_id``) is carried over
     unchanged.

     :param response: article-list page response.
     :returns: generator of Request objects whose callback is
         ``self.insert_article``.
     """
     def text_of(selector, path):
         # Extract the nodes matched by ``path`` and pass them through
         # DataFilter.simple_format.
         return DataFilter.simple_format(selector.xpath(path).extract())

     rows = response.xpath('//*[@id="gs_cit_list_table"]/tr[position()>1]')
     for row in rows:
         article_selector = row.xpath('td[position()=1]')
         ref_selector = row.xpath('td[position()=2]')

         # Work on a per-row copy so the incoming response.meta is never
         # mutated and no row can observe values written for another row.
         meta = dict(response.meta)
         meta['article_title'] = text_of(article_selector,
                                         'descendant::span[1]')
         meta['article_link'] = text_of(article_selector,
                                        'descendant::a/@href')
         meta['article_authors'] = text_of(article_selector,
                                           'descendant::span[2]')
         meta['publish_info'] = text_of(article_selector,
                                        'descendant::span[3]')
         # The reference href is prefixed with the spider's domain; the
         # article href is used as extracted.
         meta['ref_link'] = "%s%s" % (
             self.domain, text_of(ref_selector, 'descendant::a/@href'))
         meta['ref_count'] = text_of(ref_selector, 'descendant::a')
         meta['publish_date'] = text_of(row, 'td[position()=3]')
         meta['create_time'] = mysql_datetime()

         yield Request(meta['article_link'],
                       callback=self.insert_article,
                       meta=meta)
예제 #3
0
    def parse_candidate_publications_item(self, response, cb_id):
        """Build one CandidatePublicationsItem per recent-publication paragraph.

        Every ``<p>`` under the element with id "field-recent-pubs" is
        inspected; all of its non-blank descendant text nodes are passed
        through DataFilter.simple_format, and paragraphs whose formatted
        text is empty are skipped.

        :param response: profile page response.
        :param cb_id: value stored in each item's ``cb_id`` field.
        :returns: list of CandidatePublicationsItem (possibly empty), all
            sharing a single ``create_time`` timestamp.
        """
        now_time = mysql_datetime()
        items = []
        paragraphs = response.xpath(
            '//*[@id="field-recent-pubs"]/descendant::p')
        for paragraph in paragraphs:
            # ".//text()" (rather than "./text()") also collects text that
            # sits inside nested inline elements of the paragraph.
            text_nodes = paragraph.xpath(
                ".//text()[normalize-space(.)]").extract()

            item = CandidatePublicationsItem()
            item['cb_id'] = cb_id
            item['publications'] = DataFilter.simple_format(text_nodes)
            if not item['publications']:
                continue
            item['create_time'] = now_time
            items.append(item)
        return items
예제 #4
0
 def parse(self, response):
     """Save each publication row and follow its article-list link.

     For every row after the first in the table with id
     "gs_cit_list_table" a GooglePublicationItem is filled from the row's
     cells and saved through MYSQLUtils.save. The first element of the
     save result is written to ``response.meta['publication_id']``
     (together with ``h5_idx``), and a Request for the article list is
     yielded with that meta and ``self.parse_article_list`` as callback.

     :param response: listing page; ``response.meta`` must contain
         ``cate1_id`` and ``cate2_id``.
     """
     def cell(row, path):
         # Extract the nodes matched by ``path`` and pass them through
         # DataFilter.simple_format.
         return DataFilter.simple_format(row.xpath(path).extract())

     table_rows = response.xpath(
         '//*[@id="gs_cit_list_table"]/tr[position()>1]')
     for row in table_rows:
         publication = GooglePublicationItem()
         publication['cate1_id'] = response.meta['cate1_id']
         publication['cate2_id'] = response.meta['cate2_id']
         publication['name'] = cell(row, 'td[position()=2]')
         publication['desc'] = ''
         publication['h5_idx'] = cell(row, 'td[position()=3]')
         publication['h5_med'] = cell(row, 'td[position()=4]')
         publication['rank'] = cell(row, 'td[position()=1]')
         publication['create_time'] = mysql_datetime()

         # The link in the third cell leads to the article list.
         href = cell(row, 'td[position()=3]/a/@href')
         article_list_url = "%s%s" % (self.domain, href)

         saved = MYSQLUtils.save(self, "google_publication", publication)
         response.meta['publication_id'] = saved[0]
         response.meta['h5_idx'] = publication['h5_idx']
         yield Request(article_list_url,
                       callback=self.parse_article_list,
                       meta=response.meta)
 def parse_cate2(self, response):
     """Save the sub-category links found on a top-level category page.

     One GoogleCategoryItem is created per ``<a>`` under the element with
     id "gs_m_rbs"; its ``fid`` is the ``cate1_id`` taken from
     ``response.meta``. The whole batch is passed to MYSQLUtils.save and
     nothing is returned.

     :param response: category page; ``response.meta['cate1_id']`` is
         required.
     """
     parent_id = response.meta['cate1_id']
     categories = []
     for link in response.xpath('//*[@id="gs_m_rbs"]/descendant::a'):
         category = GoogleCategoryItem()
         category['fid'] = parent_id
         label_markup = link.xpath('.').extract()
         category['name'] = DataFilter.simple_format(label_markup)
         href = DataFilter.simple_format(link.xpath("./@href").extract())
         # The extracted href is prefixed with the spider's domain.
         category['cate_url'] = "%s%s" % (self.domain, href)
         category['create_time'] = mysql_datetime()
         categories.append(category)
     MYSQLUtils.save(self, "google_category", categories)
 def parse_candidate_research_item(self, response, cb_id):
     """Build the research item for one candidate page.

     Three page sections, each located by element id, are extracted and
     passed through DataFilter.simple_format.

     :param response: profile page response.
     :param cb_id: value stored in the item's ``cb_id`` field.
     :returns: a one-element list holding the CandidateResearchItem.
     """
     now_time = mysql_datetime()

     # (item field, XPath of the section that feeds it), in fill order.
     sections = (
         ('interests', '//*[@id="field-research-interests"]'),
         ('current_research', '//*[@id="field-current-research"]'),
         ('research_summary', '//*[@id="field-research-summary"]'),
     )

     research = CandidateResearchItem()
     research['cb_id'] = cb_id
     for field, path in sections:
         markup = response.xpath(path).extract()
         research[field] = DataFilter.simple_format(markup)
     research['create_time'] = now_time
     return [research]
 def parse_candidate_publications_item(self, response, cb_id):
     """Build one CandidatePublicationsItem per recent-publication paragraph.

     Only the direct, non-blank text nodes of each ``<p>`` under the
     element with id "field-recent-pubs" are used; paragraphs whose
     formatted text is empty produce no item.

     :param response: profile page response.
     :param cb_id: value stored in each item's ``cb_id`` field.
     :returns: list of CandidatePublicationsItem (possibly empty).
     """
     now_time = mysql_datetime()
     collected = []
     paragraphs = response.xpath(
         '//*[@id="field-recent-pubs"]/descendant::p')
     for paragraph in paragraphs:
         record = CandidatePublicationsItem()
         record['cb_id'] = cb_id
         texts = paragraph.xpath('./text()[normalize-space(.)]').extract()
         record['publications'] = DataFilter.simple_format(texts)
         # Keep the record only when some text survived formatting.
         if record['publications']:
             record['create_time'] = now_time
             collected.append(record)
     return collected
 def parse(self, response):
     """Save every top-level category link and follow it.

     Each ``<a>`` under the element with id "gs_m_broad" becomes a
     GoogleCategoryItem with ``fid`` 0 that is saved through
     MYSQLUtils.save. The first element of the save result is forwarded
     as ``cate1_id`` in the meta of a Request handled by
     ``self.parse_cate2``.

     :param response: page containing the top-level category links.
     """
     for link in response.xpath('//*[@id="gs_m_broad"]/descendant::a'):
         href = DataFilter.simple_format(link.xpath("./@href").extract())
         # The extracted href is prefixed with the spider's domain.
         cate1_url = "%s%s" % (self.domain, href)
         cate1_name = DataFilter.simple_format(link.xpath('.').extract())

         category = GoogleCategoryItem()
         category['fid'] = 0
         category['name'] = cate1_name
         category['cate_url'] = cate1_url
         category['create_time'] = mysql_datetime()

         saved = MYSQLUtils.save(self, "google_category", category)
         yield Request(cate1_url,
                       callback=self.parse_cate2,
                       meta={"cate1_id": saved[0]})
    def parse_candidate_research_item(self, summary, cb_id):
        """Build the research item from the "Research Interests" section.

        The section is obtained with ``self.parse_content`` and flattened
        with ``parse_text_by_multi_content`` using a newline separator.
        ``current_research`` and ``research_summary`` are stored as empty
        strings.

        :param summary: value handed to ``self.parse_content``
            (presumably the profile's summary selector -- confirm against
            the caller).
        :param cb_id: value stored in the item's ``cb_id`` field.
        :returns: a one-element list holding the CandidateResearchItem;
            the list is also printed to stdout.
        """
        now_time = mysql_datetime()
        research = CandidateResearchItem()
        research['cb_id'] = cb_id
        section = self.parse_content(summary, "Research Interests")
        research['interests'] = parse_text_by_multi_content(section, "\n")
        for blank_field in ('current_research', 'research_summary'):
            research[blank_field] = ''
        research['create_time'] = now_time

        items = [research]
        print(items)
        return items
 def parse_candidate_courses_item(self, response, cb_id):
     """Build one CandidateCoursesItem per taught-course list entry.

     Only the direct, non-blank text nodes of each ``<li>`` under the
     element with id "field-courses-taught" are used; entries whose
     formatted text is empty produce no item. ``courses_no`` is always
     the string '0'.

     :param response: profile page response.
     :param cb_id: value stored in each item's ``cb_id`` field.
     :returns: list of CandidateCoursesItem (possibly empty).
     """
     now_time = mysql_datetime()
     courses = []
     entries = response.xpath(
         '//*[@id="field-courses-taught"]/descendant::li')
     for entry in entries:
         course = CandidateCoursesItem()
         course['cb_id'] = cb_id
         course['courses_no'] = '0'
         texts = entry.xpath('./text()[normalize-space(.)]').extract()
         course['courses_desc'] = DataFilter.simple_format(texts)
         # Keep the course only when some text survived formatting.
         if course['courses_desc']:
             course['create_time'] = now_time
             courses.append(course)
     return courses
 def parse_candidate_publications_item(self, summary, cb_id):
     """Build publication items from "Representative Publications".

     ``self.parse_content`` supplies the section's entries; each entry is
     extracted as a whole and passed through DataFilter.simple_format.
     Entries whose formatted text is empty produce no item.

     :param summary: value handed to ``self.parse_content``.
     :param cb_id: value stored in each item's ``cb_id`` field.
     :returns: list of CandidatePublicationsItem (possibly empty); the
         list is also printed to stdout.
     """
     now_time = mysql_datetime()
     publications = []
     entries = self.parse_content(summary, "Representative Publications")
     for entry in entries:
         record = CandidatePublicationsItem()
         record['cb_id'] = cb_id
         markup = entry.xpath('.').extract()
         record['publications'] = DataFilter.simple_format(markup)
         # Keep the record only when some text survived formatting.
         if record['publications']:
             record['create_time'] = now_time
             publications.append(record)
     print(publications)
     return publications
 def parse_candidate_workexperience_item(self, summary, cb_id):
     """Build the work-experience item from the summary.

     The "Industrial and Sabbatical Experience" section is obtained with
     ``self.parse_content`` and flattened into ``desc`` with a newline
     separator. All structured fields are stored as empty strings.

     :param summary: value handed to ``self.parse_content``.
     :param cb_id: value stored in the item's ``cb_id`` field.
     :returns: a one-element list holding the
         CandidateWorkexperienceItem; the list is also printed to stdout.
     """
     now_time = mysql_datetime()
     section = self.parse_content(summary,
                                  "Industrial and Sabbatical Experience")

     experience = CandidateWorkexperienceItem()
     experience['cb_id'] = cb_id
     # Structured fields are not parsed here; they stay blank.
     for blank_field in ('job_title', 'company', 'start_time', 'end_time',
                         'duration'):
         experience[blank_field] = ''
     experience['desc'] = parse_text_by_multi_content(section, "\n")
     experience['create_time'] = now_time

     items = [experience]
     print(items)
     return items
 def parse_candidate_education_item(self, summary, cb_id):
     """Build the education item from the "Degrees and Awards" section.

     The section is obtained with ``self.parse_content`` and flattened
     into ``desc`` with a newline separator. All structured fields are
     stored as empty strings.

     :param summary: value handed to ``self.parse_content``.
     :param cb_id: value stored in the item's ``cb_id`` field.
     :returns: a one-element list holding the CandidateEducationItem;
         the list is also printed to stdout.
     """
     now_time = mysql_datetime()
     section = self.parse_content(summary, "Degrees and Awards")

     education = CandidateEducationItem()
     education['cb_id'] = cb_id
     # Structured fields are not parsed here; they stay blank.
     for blank_field in ('college', 'discipline', 'start_time', 'end_time',
                         'duration', 'degree'):
         education[blank_field] = ''
     education['desc'] = parse_text_by_multi_content(section, "\n")
     education['create_time'] = now_time

     items = [education]
     print(items)
     return items
예제 #14
0
    def parse_candidate_basic_item(self, response):
        """Build the basic-profile item for one candidate page.

        Scraped fields are located by XPath on the page and passed through
        DataFilter.simple_format. ``nationality`` is derived from the
        scraped full name via ``get_chinese_by_fullname``; ``experience``,
        ``desc`` and ``extra`` are stored as empty strings.

        :param response: profile page response; its ``url`` is stored in
            the item.
        :returns: a single CandidateBasicItem (not a list).
        """
        def scraped(xpath):
            # Run ``xpath`` against the page and pass the extracted nodes
            # through DataFilter.simple_format.
            return DataFilter.simple_format(response.xpath(xpath).extract())

        profile = CandidateBasicItem()
        profile['country_id'] = self.country_id
        profile['college_id'] = self.college_id
        profile['discipline_id'] = '0'
        profile['fullname'] = scraped(
            '//h1[@id="page-title"]/text()[normalize-space(.)]')
        # Titles are the siblings that follow their "field-label" div.
        profile['academic_title'] = scraped(
            '//div[contains(@class, "field-label") and contains(text(), "Academic Title")]/following-sibling::*')
        profile['other_title'] = scraped(
            '//div[contains(@class, "field-label") and contains(text(), "Other Titles")]/following-sibling::*')
        profile['nationality'] = get_chinese_by_fullname(
            profile['fullname'], surname_list)
        profile['email'] = scraped(
            '//a[contains(@href, "mailto:")]/text()[normalize-space(.)]')
        # Phone and external link are found via their icon classes.
        profile['phonenumber'] = scraped(
            '//*[contains(@class, "fa-phone")]/parent::*/following-sibling::*')
        profile['external_link'] = scraped(
            '//*[contains(@class, "fa-external-link")]/parent::*/following-sibling::*')
        profile['experience'] = ''
        profile['desc'] = ''
        profile['avatar_url'] = scraped(
            '//div[contains(@class, "field-name-field-profile-photo")]/descendant::img/@src')
        profile['create_time'] = mysql_datetime()
        profile['extra'] = ''
        profile['url'] = response.url
        return profile
 def parse_candidate_education_item(self, response, cb_id):
     """Build one CandidateEducationItem per education list entry.

     Only the direct, non-blank text nodes of each ``<li>`` under the
     element with id "field-education" are used, and they go into
     ``desc``. All structured fields are stored as empty strings. Entries
     whose formatted text is empty produce no item.

     :param response: profile page response.
     :param cb_id: value stored in each item's ``cb_id`` field.
     :returns: list of CandidateEducationItem (possibly empty).
     """
     now_time = mysql_datetime()
     blank_fields = ('college', 'discipline', 'start_time', 'end_time',
                     'duration', 'degree')
     educations = []
     for entry in response.xpath(
             '//*[@id="field-education"]/descendant::li'):
         education = CandidateEducationItem()
         education['cb_id'] = cb_id
         for name in blank_fields:
             education[name] = ''
         texts = entry.xpath('./text()[normalize-space(.)]').extract()
         education['desc'] = DataFilter.simple_format(texts)
         # Keep the entry only when some text survived formatting.
         if education['desc']:
             education['create_time'] = now_time
             educations.append(education)
     return educations
 def parse_candidate_workexperience_item(self, response, cb_id):
     """Build one CandidateWorkexperienceItem per experience paragraph.

     Only the direct, non-blank text nodes of each ``<p>`` under the
     element with id "field-professional-experience" are used, and they
     go into ``desc``. All structured fields are stored as empty strings.
     Paragraphs whose formatted text is empty produce no item.

     :param response: profile page response.
     :param cb_id: value stored in each item's ``cb_id`` field.
     :returns: list of CandidateWorkexperienceItem (possibly empty).
     """
     now_time = mysql_datetime()
     blank_fields = ('job_title', 'company', 'start_time', 'end_time',
                     'duration')
     experiences = []
     paragraphs = response.xpath(
         '//*[@id="field-professional-experience"]/descendant::p')
     for paragraph in paragraphs:
         experience = CandidateWorkexperienceItem()
         experience['cb_id'] = cb_id
         for name in blank_fields:
             experience[name] = ''
         texts = paragraph.xpath('./text()[normalize-space(.)]').extract()
         experience['desc'] = DataFilter.simple_format(texts)
         # Keep the paragraph only when some text survived formatting.
         if experience['desc']:
             experience['create_time'] = now_time
             experiences.append(experience)
     return experiences
 def parse_candidate_basic_item(self, staff):
     """Build the basic-profile item for one staff listing entry.

     Scraped fields are located relative to ``staff`` (mostly through
     schema.org ``property`` attributes) and passed through
     DataFilter.simple_format. ``nationality`` is derived from the scraped
     full name via ``get_chinese_by_fullname``. The work location and
     affiliation group are stored together in ``extra`` as a JSON object
     string with the keys ``location`` and ``group``.

     :param staff: selector for a single staff entry.
     :returns: a one-element list holding the CandidateBasicItem; the
         list is also printed to stdout.
     """
     # Local import: json is only needed to escape the ``extra`` values.
     import json

     def text_of(xpath):
         # Run ``xpath`` relative to the staff entry and pass the
         # extracted nodes through DataFilter.simple_format.
         return DataFilter.simple_format(staff.xpath(xpath).extract())

     item = CandidateBasicItem()
     item['country_id'] = self.country_id
     item['college_id'] = self.college_id
     item['discipline_id'] = '0'
     item['fullname'] = text_of('descendant::*[@property="schema:name"]')
     item['academic_title'] = text_of(
         'descendant::*[@property="schema:jobTitle"]')
     item['other_title'] = ''
     item['nationality'] = get_chinese_by_fullname(item['fullname'],
                                                   surname_list)
     item['email'] = text_of('descendant::*[@property="schema:email"]')
     item['phonenumber'] = text_of(
         'descendant::*[@property="schema:telephone"]')
     # Adjacent literals keep the XPath free of source indentation, which
     # a backslash continuation inside the string would embed.
     item['external_link'] = text_of(
         'descendant::*[contains(@class, "field-name-field-contact-website-url")]'
         '/descendant::a/@href')
     item['experience'] = ''
     item['desc'] = ''
     item['avatar_url'] = text_of(
         'descendant::*[contains(@class, "field-name-field-contact-image")]'
         '/descendant::img/@src')
     item['create_time'] = mysql_datetime()

     location = text_of('descendant::*[@property="schema:workLocation"]')
     group = parse_text_by_multi_content(
         staff.xpath('descendant::*[@rel="schema:affiliation"]'), ",")
     # json.dumps quotes and escapes each value, so quotes or backslashes
     # in the scraped text can no longer produce malformed JSON.
     item['extra'] = '{"location": %s, "group": %s}' % (
         json.dumps(location), json.dumps(group))

     items = [item]
     print(items)
     return items
예제 #18
0
 def parse_candidate_education_item(self, response, cb_id):
     """Build one CandidateEducationItem per education list entry.

     Only the direct, non-blank text nodes of each ``<li>`` under the
     element with id "field-education" are used. Entries whose formatted
     text is empty produce no item.

     :param response: profile page response.
     :param cb_id: value stored in each item's ``cb_id`` field.
     :returns: list of CandidateEducationItem (possibly empty).
     """
     now_time = mysql_datetime()
     results = []
     entries = response.xpath('//*[@id="field-education"]/descendant::li')
     for entry in entries:
         record = CandidateEducationItem()
         record['cb_id'] = cb_id
         # The individual education fields cannot be obtained directly
         # for Stanford University, so only ``desc`` gets a value; the
         # other fields are left blank for later analysis.
         for blank_field in ('college', 'discipline', 'start_time',
                             'end_time', 'duration', 'degree'):
             record[blank_field] = ''
         texts = entry.xpath("./text()[normalize-space(.)]").extract()
         record['desc'] = DataFilter.simple_format(texts)
         if record['desc']:
             record['create_time'] = now_time
             results.append(record)
     return results