Example #1
def parse(self, response):
    for row in response.xpath(
            '//*[@id="gs_cit_list_table"]/tr[position()>1]'):
        item = GooglePublicationItem()
        item['cate1_id'] = response.meta['cate1_id']
        item['cate2_id'] = response.meta['cate2_id']
        item['name'] = DataFilter.simple_format(
            row.xpath('td[position()=2]').extract())
        item['desc'] = ''
        item['h5_idx'] = DataFilter.simple_format(
            row.xpath('td[position()=3]').extract())
        item['h5_med'] = DataFilter.simple_format(
            row.xpath('td[position()=4]').extract())
        item['rank'] = DataFilter.simple_format(
            row.xpath('td[position()=1]').extract())
        item['create_time'] = mysql_datetime()
        article_list_url = "%s%s" % (
            self.domain,
            DataFilter.simple_format(
                row.xpath('td[position()=3]/a/@href').extract()))
        publication_id = MYSQLUtils.save(self, "google_publication",
                                         item)[0]
        # Build a fresh meta dict per row: mutating the shared
        # response.meta would make every yielded Request see only the
        # last row's values.
        meta = dict(response.meta,
                    publication_id=publication_id,
                    h5_idx=item['h5_idx'])
        yield Request(article_list_url,
                      callback=self.parse_article_list,
                      meta=meta)
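The examples on this page all lean on a few project helpers whose definitions are not included: DataFilter.simple_format, mysql_datetime, and MYSQLUtils.save. A minimal sketch of what the first two plausibly do, offered as an assumption rather than the project's actual code:

import re
from datetime import datetime

class DataFilter(object):
    @staticmethod
    def simple_format(extracted):
        # Assumed behavior: join the extracted fragments, strip any
        # markup, and collapse whitespace into one clean string.
        if isinstance(extracted, list):
            extracted = ' '.join(extracted)
        text = re.sub(r'<[^>]+>', '', extracted or '')
        return re.sub(r'\s+', ' ', text).strip()

def mysql_datetime():
    # Assumed behavior: the current time in MySQL DATETIME format.
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

MYSQLUtils.save(self, table, item_or_items) appears to INSERT a single item or a list of items and return the new primary keys, which is why the snippets index its result with [0].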
Example #2
    def parse(self, response):

        author_items = []
        for author in response.xpath('//*[contains(@class, "authorName")]'):
            author_item = GoogleAuthorsItem()
            author_item['publication_id'] = response.meta['publication_id']
            author_item['article_id'] = response.meta['article_id']
            author_item['affiliation_id'] = ''
            author_item['fullname'] = DataFilter.simple_format(
                author.xpath('.').extract())
            author_item['create_time'] = mysql_datetime()
            author_items.append(author_item)

        affiliation_items = []
        for affiliation in response.xpath(
                '//*[contains(@class, "affiliation")]'):
            affiliation_item = GoogleAffiliationItem()
            affiliation_item['publication_id'] = response.meta[
                'publication_id']
            affiliation_item['article_id'] = response.meta['article_id']
            affiliation_item['desc'] = DataFilter.simple_format(
                affiliation.xpath('.').extract())
            affiliation_item['create_time'] = mysql_datetime()
            affiliation_items.append(affiliation_item)

        MYSQLUtils.save(self, "google_authors", author_items)
        MYSQLUtils.save(self, "google_affiliations", affiliation_items)
Example #3

def parse_cate2(self, response):
    cate1_id = response.meta['cate1_id']
    items = []
    for a in response.xpath('//*[@id="gs_m_rbs"]/descendant::a'):
        item = GoogleCategoryItem()
        item['fid'] = cate1_id
        item['name'] = DataFilter.simple_format(a.xpath('.').extract())
        item['cate_url'] = "%s%s" % (self.domain,
                                     DataFilter.simple_format(
                                         a.xpath("./@href").extract()))
        item['create_time'] = mysql_datetime()
        items.append(item)
    MYSQLUtils.save(self, "google_category", items)
Example #4

def parse_candidate_research_item(self, response, cb_id):
    now_time = mysql_datetime()
    items = []
    item = CandidateResearchItem()
    item['cb_id'] = cb_id
    item['interests'] = DataFilter.simple_format(
        response.xpath('//*[@id="field-research-interests"]').extract())
    item['current_research'] = DataFilter.simple_format(
        response.xpath('//*[@id="field-current-research"]').extract())
    item['research_summary'] = DataFilter.simple_format(
        response.xpath('//*[@id="field-research-summary"]').extract())
    item['create_time'] = now_time
    items.append(item)
    return items
Example #5

def parse(self, response):
    for a in response.xpath('//*[@id="gs_m_broad"]/descendant::a'):
        cate1_url = "%s%s" % (self.domain,
                              DataFilter.simple_format(
                                  a.xpath("./@href").extract()))
        cate1_name = DataFilter.simple_format(a.xpath('.').extract())
        item = GoogleCategoryItem()
        item['fid'] = 0
        item['name'] = cate1_name
        item['cate_url'] = cate1_url
        item['create_time'] = mysql_datetime()
        cate1_id = MYSQLUtils.save(self, "google_category", item)[0]
        yield Request(cate1_url,
                      callback=self.parse_cate2,
                      meta={"cate1_id": cate1_id})
Example #6

def parse_staff_profile_url(self, staff):
    profile = DataFilter.simple_format(
        staff.xpath(
            'descendant::div[contains(@class, "field-name-field-contact-profile-url")]'
            '/descendant::a/@href').extract())
    return profile
Example #7
    def parse_candidate_publications_item(self, response, cb_id):
        now_time = mysql_datetime()
        items = []
        pub_items = response.xpath(
            '//*[@id="field-recent-pubs"]/descendant::p')
        for pub_item in pub_items:
            item = CandidatePublicationsItem()
            # Stanford's pages do not expose these fields in a structured
            # form, so only the free-text field is populated; the remaining
            # fields are left for later analysis.
            item['cb_id'] = cb_id
            item['publications'] = DataFilter.simple_format(
                pub_item.xpath(".//text()[normalize-space(.)]").extract())
            if not item['publications']:
                continue
            item['create_time'] = now_time
            items.append(item)
        return items
Example #8
def parse_text_by_multi_content(content, delimiter):
    # Join the formatted text of each selector with the delimiter.
    # delimiter.join() avoids the original trailing-slice bug, which
    # returned an empty string whenever the delimiter was empty.
    return delimiter.join(
        DataFilter.simple_format(c.xpath('.').extract()) for c in content)
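A quick usage sketch, assuming the DataFilter.simple_format stub above (which strips tags and collapses whitespace):

from scrapy.selector import Selector

spans = Selector(text='<div><span>Alpha</span><span>Beta</span></div>').xpath('//span')
print parse_text_by_multi_content(spans, ", ")  # -> "Alpha, Beta"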
Example #9
    def parse_item(self, response):
        # On the new page, crawl the phone number and email and merge
        # them into the item passed along via meta.
        item = response.meta['basic_item']
        item['phonenumber'] = DataFilter.simple_format(
            response.xpath(
                '//*[@itemprop="telephone"]/text()[normalize-space(.)]').
            extract())
        item['email'] = DataFilter.simple_format(
            response.xpath(
                '//*[@itemprop="email"]/text()[normalize-space(.)]').extract())

        # Next, save the basic item, the research-interests item, and so
        # on to MySQL; that code is omitted here.
        pass
Example #10
def parse_article(self, response):
    for row in response.xpath(
            '//*[@id="gs_cit_list_table"]/tr[position()>1]'):
        article_selector = row.xpath('td[position()=1]')
        ref_selector = row.xpath('td[position()=2]')
        # Build a fresh meta dict per row instead of mutating the shared
        # response.meta, so each Request carries its own row's values.
        # publication_id, cate1_id and cate2_id already ride along in
        # response.meta and are preserved by the copy.
        meta = dict(response.meta)
        meta['article_title'] = DataFilter.simple_format(
            article_selector.xpath('descendant::span[1]').extract())
        meta['article_link'] = DataFilter.simple_format(
            article_selector.xpath('descendant::a/@href').extract())
        meta['article_authors'] = DataFilter.simple_format(
            article_selector.xpath('descendant::span[2]').extract())
        meta['publish_info'] = DataFilter.simple_format(
            article_selector.xpath('descendant::span[3]').extract())
        meta['ref_link'] = "%s%s" % (
            self.domain,
            DataFilter.simple_format(
                ref_selector.xpath('descendant::a/@href').extract()))
        meta['ref_count'] = DataFilter.simple_format(
            ref_selector.xpath('descendant::a').extract())
        meta['publish_date'] = DataFilter.simple_format(
            row.xpath('td[position()=3]').extract())
        meta['create_time'] = mysql_datetime()
        yield Request(meta['article_link'],
                      callback=self.insert_article,
                      meta=meta)
Example #11
def parse_article_list(self, response):
    page_size = 20
    h5_idx = response.meta['h5_idx']
    # Use float division: in Python 2, int / int floors first, which
    # would make math.ceil a no-op and drop the final partial page.
    page_count = int(math.ceil(int(h5_idx) / float(page_size)))
    for page_number in range(1, page_count + 1):
        cstart = (page_number - 1) * page_size
        url = DataFilter.add_url_parameter(response.url,
                                           "cstart=%d" % cstart)
        yield Request(url, callback=self.parse_article, meta=response.meta)
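The float(page_size) cast above matters under Python 2, where dividing two ints floors before math.ceil ever runs, silently dropping the final partial page:

import math

h5_idx, page_size = 45, 20
print int(math.ceil(h5_idx / page_size))         # 2 (wrong: 45 / 20 floors to 2 first)
print int(math.ceil(h5_idx / float(page_size)))  # 3 (correct page count)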
Example #12
def insert_article(self, response):
    article_link = response.url
    # Follow a <meta http-equiv="refresh"> redirect if present: the real
    # article URL is embedded in the tag's content attribute.
    content = DataFilter.simple_format(
        response.xpath('//meta[@http-equiv="refresh"]/@content').extract())
    article_link_match = re.search(self.article_link_pattern, content)
    if article_link_match:
        article_link = article_link_match.group(1)
    item = GoogleArticlesItem()
    for key in MYSQLUtils.get_columns_by_item(item):
        item[key] = response.meta[key]
    item['article_link'] = article_link
    article_id = MYSQLUtils.save(self, "google_articles", item)[0]
    response.meta['article_id'] = article_id
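self.article_link_pattern is defined elsewhere in the spider and not shown. The content attribute of a meta refresh typically looks like 0;url=http://example.com/paper, so a hypothetical definition could be:

import re

# Hypothetical pattern; the spider's real article_link_pattern is not shown.
article_link_pattern = re.compile(r'url=(\S+)', re.IGNORECASE)

match = article_link_pattern.search("0;url=http://example.com/paper")
if match:
    print match.group(1)  # http://example.com/paper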
Example #13
    def parse_guesser(self, response):
        nodes = extract_guesser_nodes(response)
        # Walk the leaf-level nodes and score each node's text with the
        # trained classifier.
        for xpath in nodes["last_nodes"]:
            node = nodes["last_nodes"][xpath]["node"]
            text = DataFilter.simple_format(node.xpath(".").extract())
            unknown_data, analyzer_result = BayesUtils.vectorize_unknown(
                [text], self.vectorizer)
            is_meaningful = check_text_meaningful(text, analyzer_result)
            if not is_meaningful:
                continue
            proba = self.clf.predict_proba(unknown_data)
            print nodes["last_nodes"][xpath]["node"]
            print text
            print proba
        # The same loop can be repeated for nodes["last2_nodes"] and
        # nodes["last3_nodes"] to score the two enclosing levels.
Example #14

def parse_candidate_publications_item(self, response, cb_id):
    now_time = mysql_datetime()
    items = []
    pub_items = response.xpath(
        '//*[@id="field-recent-pubs"]/descendant::p')
    for pub_item in pub_items:
        item = CandidatePublicationsItem()
        item['cb_id'] = cb_id
        item['publications'] = DataFilter.simple_format(
            pub_item.xpath('./text()[normalize-space(.)]').extract())
        if not item['publications']:
            continue
        item['create_time'] = now_time
        items.append(item)
    return items
Example #15

def parse_candidate_courses_item(self, response, cb_id):
    now_time = mysql_datetime()
    items = []
    course_items = response.xpath(
        '//*[@id="field-courses-taught"]/descendant::li')
    for course_item in course_items:
        item = CandidateCoursesItem()
        item['cb_id'] = cb_id
        item['courses_no'] = '0'
        item['courses_desc'] = DataFilter.simple_format(
            course_item.xpath('./text()[normalize-space(.)]').extract())
        if not item['courses_desc']:
            continue
        item['create_time'] = now_time
        items.append(item)
    return items
Example #16

def parse_candidate_publications_item(self, summary, cb_id):
    now_time = mysql_datetime()
    items = []
    pub_items = self.parse_content(summary, "Representative Publications")
    for pub_item in pub_items:
        item = CandidatePublicationsItem()
        # Stanford's pages do not expose these fields in a structured
        # form, so only the free-text field is populated; the remaining
        # fields are left for later analysis.
        item['cb_id'] = cb_id
        item['publications'] = DataFilter.simple_format(
            pub_item.xpath('.').extract())
        if not item['publications']:
            continue
        item['create_time'] = now_time
        items.append(item)
    return items
Example #17

def parse_content(self, summary, head_str):
    # Collect the <p> siblings that follow the <h2> titled head_str,
    # stopping at the next <h2> when there is one.
    next_h2 = summary.xpath(
        'descendant::h2[text()="%s"]/following-sibling::h2[1]' % head_str)
    if not next_h2:
        content = summary.xpath(
            'descendant::h2[text()="%s"]/following-sibling::p' % head_str)
    else:
        next_h2_txt = DataFilter.simple_format(next_h2.extract())
        content = summary.xpath(
            'descendant::p[preceding-sibling::h2/text()="%s" and following-sibling::h2/text()="%s"]'
            % (head_str, next_h2_txt))
    return content
Example #18

def parse_candidate_education_item(self, response, cb_id):
    now_time = mysql_datetime()
    items = []
    edu_items = response.xpath('//*[@id="field-education"]/descendant::li')
    for edu_item in edu_items:
        item = CandidateEducationItem()
        item['cb_id'] = cb_id
        item['college'] = ''
        item['discipline'] = ''
        item['start_time'] = ''
        item['end_time'] = ''
        item['duration'] = ''
        item['degree'] = ''
        item['desc'] = DataFilter.simple_format(
            edu_item.xpath('./text()[normalize-space(.)]').extract())
        if not item['desc']:
            continue
        item['create_time'] = now_time
        items.append(item)
    return items
Example #19
    def parse_candidate_basic_item(self, response):

        item = CandidateBasicItem()
        item['country_id'] = self.country_id
        item['college_id'] = self.college_id
        item['discipline_id'] = '0'
        item['fullname'] = DataFilter.simple_format(
            response.xpath(
                '//h1[@id="page-title"]/text()[normalize-space(.)]').extract())
        item['academic_title'] = DataFilter.simple_format(
            response.xpath(
                '//div[contains(@class, "field-label") and contains(text(), "Academic Title")]/following-sibling::*'
            ).extract())
        item['other_title'] = DataFilter.simple_format(
            response.xpath(
                '//div[contains(@class, "field-label") and contains(text(), "Other Titles")]/following-sibling::*'
            ).extract())
        item['nationality'] = get_chinese_by_fullname(item['fullname'],
                                                      surname_list)
        item['email'] = DataFilter.simple_format(
            response.xpath(
                '//a[contains(@href, "mailto:")]/text()[normalize-space(.)]').
            extract())
        item['phonenumber'] = DataFilter.simple_format(
            response.xpath(
                '//*[contains(@class, "fa-phone")]/parent::*/following-sibling::*'
            ).extract())
        item['external_link'] = DataFilter.simple_format(
            response.xpath(
                '//*[contains(@class, "fa-external-link")]/parent::*/following-sibling::*'
            ).extract())
        item['experience'] = ''
        item['desc'] = ''
        item['avatar_url'] = DataFilter.simple_format(
            response.xpath(
                '//div[contains(@class, "field-name-field-profile-photo")]/descendant::img/@src'
            ).extract())
        item['create_time'] = mysql_datetime()
        item['extra'] = ''
        item['url'] = response.url
        return item
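get_chinese_by_fullname and surname_list also come from elsewhere in the project. Judging by the call sites, the helper guesses whether a professor is Chinese by matching the name against a pinyin surname list; a hypothetical sketch:

# Hypothetical sketch; the project's real helper is not shown.
surname_list = ['wang', 'li', 'zhang', 'liu', 'chen']

def get_chinese_by_fullname(fullname, surnames):
    # Flag the name as Chinese if its first or last token matches a
    # known pinyin surname.
    parts = fullname.lower().split()
    if parts and (parts[0] in surnames or parts[-1] in surnames):
        return 'China'
    return ''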
Example #20

def parse_candidate_workexperience_item(self, response, cb_id):
    now_time = mysql_datetime()
    items = []
    workexperience_items = response.xpath(
        '//*[@id="field-professional-experience"]/descendant::p')
    for workexperience_item in workexperience_items:
        item = CandidateWorkexperienceItem()
        item['cb_id'] = cb_id
        item['job_title'] = ''
        item['company'] = ''
        item['start_time'] = ''
        item['end_time'] = ''
        item['duration'] = ''
        item['desc'] = DataFilter.simple_format(
            workexperience_item.xpath(
                './text()[normalize-space(.)]').extract())
        if not item['desc']:
            continue
        item['create_time'] = now_time
        items.append(item)
    return items
Example #21
def parse_candidate_education_item(self, response, cb_id):
    now_time = mysql_datetime()
    items = []
    edu_items = response.xpath('//*[@id="field-education"]/descendant::li')
    for edu_item in edu_items:
        item = CandidateEducationItem()
        # Stanford's pages do not expose the education fields in a
        # structured form, so only the desc field is populated; the
        # remaining fields are left for later analysis.
        item['cb_id'] = cb_id
        item['college'] = ''
        item['discipline'] = ''
        item['start_time'] = ''
        item['end_time'] = ''
        item['duration'] = ''
        item['degree'] = ''
        item['desc'] = DataFilter.simple_format(
            edu_item.xpath("./text()[normalize-space(.)]").extract())
        if not item['desc']:
            continue
        item['create_time'] = now_time
        items.append(item)
    return items
Example #22

def parse_candidate_basic_item(self, staff):
    items = []
    item = CandidateBasicItem()
    item['country_id'] = self.country_id
    item['college_id'] = self.college_id
    item['discipline_id'] = '0'
    item['fullname'] = DataFilter.simple_format(
        staff.xpath('descendant::*[@property="schema:name"]').extract())
    item['academic_title'] = DataFilter.simple_format(
        staff.xpath(
            'descendant::*[@property="schema:jobTitle"]').extract())
    item['other_title'] = ''
    item['nationality'] = get_chinese_by_fullname(item['fullname'],
                                                  surname_list)
    item['email'] = DataFilter.simple_format(
        staff.xpath('descendant::*[@property="schema:email"]').extract())
    item['phonenumber'] = DataFilter.simple_format(
        staff.xpath(
            'descendant::*[@property="schema:telephone"]').extract())
    item['external_link'] = DataFilter.simple_format(
        staff.xpath(
            'descendant::*[contains(@class, "field-name-field-contact-website-url")]'
            '/descendant::a/@href').extract())
    item['experience'] = ''
    item['desc'] = ''
    item['avatar_url'] = DataFilter.simple_format(
        staff.xpath(
            'descendant::*[contains(@class, "field-name-field-contact-image")]'
            '/descendant::img/@src').extract())
    item['create_time'] = mysql_datetime()
    location = DataFilter.simple_format(
        staff.xpath(
            'descendant::*[@property="schema:workLocation"]').extract())
    group = parse_text_by_multi_content(
        staff.xpath('descendant::*[@rel="schema:affiliation"]'), ",")
    # json.dumps (requires import json) escapes quotes safely; the
    # original "%s" formatting would produce broken JSON whenever
    # location or group contained a double quote.
    item['extra'] = json.dumps({"location": location, "group": group})
    items.append(item)
    return items
Example #23
def parse(self, response):
    # This dynamic page works by carrying context along: Scrapy's meta
    # is used to pass the partially-filled item to the next request.
    # 1. Extract the profile URL with a regex.
    item = CandidateBasicItem()
    item['country_id'] = self.country_id
    item['college_id'] = self.college_id
    item['discipline_id'] = '0'
    item['avatar_url'] = DataFilter.simple_format(
        response.xpath('//*[@id="faculty_image"]/img/@src').extract())
    # As this is only an example, not every field is crawled.
    hcard_match = re.search(self.hcard_pattern, response.body)
    meta = {"basic_item": item}
    # Note: besides the basic item, the research-interests and
    # publications items should be passed along in meta as well; this
    # example only passes the basic item.
    if hcard_match:
        hcard_url = hcard_match.group(1)
        return Request(hcard_url, callback=self.parse_item, meta=meta)
    else:
        # Nothing to follow: the professor's detail page could not be
        # found, so skip this profile (the equivalent of a continue).
        pass
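self.hcard_pattern is likewise not shown. Since it is matched against the raw response body to pull out a profile URL, a hypothetical definition might be:

import re

# Hypothetical pattern; the spider's real hcard_pattern is not shown.
hcard_pattern = re.compile(r'href="(https?://[^"]+/hcard[^"]*)"')

body = '<a href="http://example.edu/profiles/hcard/jdoe">profile</a>'
match = hcard_pattern.search(body)
if match:
    print match.group(1)  # http://example.edu/profiles/hcard/jdoe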