Example No. 1
    def parse_article(self, response):
        lalitem = LalItem()
        lalitem['url'] = response.url
        data = response.meta['data']
        lalitem['title'] = data['title'][0]
        lalitem['journal'] = data['source'][0]
        lalitem['doi'] = data['doi'][0]
        lalitem['authors'] = ', '.join(data['authors'])
        lalitem['year'] = int(data['pubyear'][0])

        abstract_text = response.css('.section.abstract p').extract()
        if abstract_text:
            abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text[0],
                                      re.S)
            lalitem['abstract'] = abstract_match.group(1)
        else:
            lalitem['abstract'] = ''

        lalitem['abs_img_url'] = response.url + '/F1.large.jpg'
        lalitem['keywords'] = ''
        lalitem['_id'] = get_md5(lalitem['url'])
        lalitem['company'] = self.name
        # Request glgoo (a Google Scholar mirror) to fetch the citation count
        glgoo_url = 'https://xs.glgoo.top/scholar?'
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Referer': 'https://gf1.jwss.site/'
        }
        yield Request(url=glgoo_url +
                      urlencode({'q': remove_tags(lalitem['title'])}) +
                      '&hl=zh-CN&as_sdt=0&as_vis=1&oi=scholart',
                      headers=headers,
                      meta={'lalitem': lalitem},
                      dont_filter=True,
                      callback=self.get_citation)
Example No. 2
File: iop.py Project: QiliWu/lalsci
    def parse_article(self, response):
        # Parse the article's main page
        lalitem = LalItem()
        lalitem['url'] = response.url
        # Keep sub-tags inside the title
        title = response.css('.wd-jnl-art-title').extract()[0]
        title_match = re.match(r'<.+?>(.+)</.+>', title, re.S)
        lalitem['title'] = title_match.group(1)

        lalitem['journal'] = response.css(
            '.wd-jnl-art-breadcrumb-title a::text').extract()[0]

        doi_link = response.css('.wd-jnl-art-doi a::text').extract()[0]
        lalitem['doi'] = re.sub(r'https://doi\.org/', '', doi_link)

        try:
            abstract_text = response.css('.wd-jnl-art-abstract p').extract()[0]
        except IndexError:
            abstract_text = response.css('.wd-jnl-art-abstract').extract()[0]
        abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text, re.S)
        lalitem['abstract'] = abstract_match.group(1)

        img_url = response.css('img[alt="Fig. 1."]::attr(src)').extract_first(
            default='')
        if img_url:
            lalitem['abs_img_url'] = img_url
        else:
            lalitem['abs_img_url'] = ''

        lalitem['keywords'] = ''
        year_info = response.css(
            '.wd-jnl-art-article-info-citation p::text').extract()
        if year_info:
            year_match = re.match(r'.*\s(\d{4})\s.*', ' '.join(year_info))
            lalitem['year'] = int(year_match.group(1))
        else:
            lalitem['year'] = None

        lalitem['authors'] = ', '.join(
            response.css('.mb-0 span[itemprop="name"]::text').extract())
        lalitem['_id'] = get_md5(lalitem['url'])
        lalitem['company'] = self.name

        # Request glgoo (a Google Scholar mirror) to fetch the citation count
        glgoo_url = 'https://xs.glgoo.top/scholar?'
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Referer': 'https://gf1.jwss.site/'
        }
        yield Request(url=glgoo_url +
                      urlencode({'q': remove_tags(lalitem['title'])}) +
                      '&hl=zh-CN&as_sdt=0&as_vis=1&oi=scholart',
                      headers=headers,
                      meta={'lalitem': lalitem},
                      dont_filter=True,
                      callback=self.get_citation)
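The r'<.+?>(.+)</.+>' pattern that recurs throughout these examples strips the outermost tag pair while keeping any inner sub-tags (e.g. <sub>, <i>) intact. A minimal illustration, using a made-up title string rather than one from the project:

    import re

    title = '<h2 class="wd-jnl-art-title">Growth of In<sub>2</sub>O<sub>3</sub> thin films</h2>'
    match = re.match(r'<.+?>(.+)</.+>', title, re.S)
    print(match.group(1))  # Growth of In<sub>2</sub>O<sub>3</sub> thin films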
Example No. 3
    def parse_article(self, response):
        # Parse the article's main page
        lalitem = LalItem()
        lalitem['url'] = response.url
        # Some titles contain sub-tags
        try:  # Some articles have two titles: the first in German, the second in English
            title = response.css('.citation__title--second').extract()[0]
        except IndexError:
            title = response.css('.citation__title').extract()[0]

        title_match = re.match(r'<.+?>(.+)</.+>', title, re.S)
        title = title_match.group(1)
        lalitem['title'] = re.sub('\n', ' ', title)

        lalitem['journal'] = response.css(
            '.article-citation h1 a::text').extract()[0]

        doi_link = response.css('.epub-doi::text').extract()[0]
        lalitem['doi'] = re.sub(r'https://doi\.org/', '', doi_link)

        abstract_text = response.css(
            '.article-section__content p').extract()[0]
        abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text, re.S)
        abstract_text = abstract_match.group(1)
        lalitem['abstract'] = re.sub('\n', ' ', abstract_text)

        lalitem['keywords'] = ', '.join(
            response.css(
                'meta[name="citation_keywords"]::attr(content)').extract())
        lalitem['year'] = int(
            response.css('.epub-date::text').extract()[0][-4:])

        author_group = response.css(
            '.accordion-tabbed .accordion-tabbed__tab-mobile').extract()
        commun_author = [
            author for author in author_group
            if 'Corresponding Author' in author
        ]
        authors = []
        for author in author_group:
            match = re.match(r'.*<a href=.*?><span>(.+?)<.*', author, re.S)
            name = match.group(1)
            if author in commun_author:
                name = name + '*'
            authors.append(name)
        lalitem['authors'] = ', '.join(authors)

        lalitem['_id'] = get_md5(lalitem['url'])
        lalitem['company'] = self.name

        toc_url = 'https://onlinelibrary.wiley.com' + response.css(
            'a.volume-issue::attr(href)').extract()[0]
        yield Request(url=toc_url,
                      callback=self.parse_toc,
                      meta={'item': lalitem})
Example No. 4
File: acs.py Project: QiliWu/lalsci
    def parse(self, response):
        # Parse the article's main page
        lalitem = LalItem()
        lalitem['url'] = response.url
        # Keep sub-tags inside the title
        title = response.css('.hlFld-Title').extract()[0]
        title_match = re.match(r'<.+?>(.+)</.+>', title, re.S)
        lalitem['title'] = title_match.group(1)

        lalitem['journal'] = response.css('#citation cite::text').extract_first(default='')
        lalitem['doi'] = response.css('#doi::text').extract()[0]
        # Keep sub-tags inside the abstract
        abstract_text = response.css('.articleBody_abstractText').extract_first(default='')
        if abstract_text:
            abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text, re.S)
            lalitem['abstract'] = abstract_match.group(1)
        else:
            lalitem['abstract'] = ''

        abs_img_url = response.css('#absImg img::attr(src)').extract_first(default='')
        if abs_img_url:
            abs_img_url = urljoin('https://pubs.acs.org', abs_img_url)
        lalitem['abs_img_url'] = abs_img_url
        # lalitem['citing_num'] = len(response.css('#citedBy li'))
        lalitem['keywords'] = ''
        try:
            lalitem['year'] = int(response.css('.citation_year::text').extract()[0])
        except (IndexError, ValueError):
            lalitem['year'] = int(response.css('#pubDate::text').extract()[0][-4:])

        author_group = response.css('#authors > span').extract()
        commun_author = [author for author in author_group if '#cor1' in author]
        authors = []
        for author in author_group:
            match = re.match(r'.*<a id="authors".*?>(.+?)</a.*', author, re.S)
            if match:
                name = match.group(1)
                if author in commun_author:
                    name = name + '*'
                authors.append(name)
        lalitem['authors'] = ', '.join(authors)
        lalitem['_id'] = get_md5(lalitem['url'])
        lalitem['company'] = self.name
        # Request glgoo (a Google Scholar mirror) to fetch the citation count
        glgoo_url = 'https://xs.glgoo.top/scholar?'
        headers = {'User-Agent': 'Mozilla/5.0', 'Referer': 'https://gf1.jwss.site/'}
        yield Request(
            url=glgoo_url + urlencode({'q': remove_tags(lalitem['title'])}) + '&hl=zh-CN&as_sdt=0&as_vis=1&oi=scholart',
            headers=headers,
            meta={'lalitem': lalitem},
            dont_filter=True,
            callback=self.get_citation)
Example No. 5
File: osa.py Project: QiliWu/lalsci
    def parse_article(self, response):
        lalitem = LalItem()
        data = response.meta['data']
        lalitem['url'] = response.url
        lalitem['title'] = data['title']
        # lalitem['authors'] = data['author'].split('; ')
        lalitem['authors'] = data['author'].replace('; ', ', ')
        lalitem['doi'] = data['doi']
        lalitem['journal'] = response.css(
            '.article-journal-name li strong::text').extract_first('')
        if not lalitem['journal']:
            lalitem['journal'] = data['name'].split(',')[0]

        lalitem['year'] = int(data['years'])
        lalitem['keywords'] = ''
        lalitem['abs_img_url'] = response.css(
            'img[alt="Fig. 1"]::attr(data-src)').extract_first(default='')

        abstract_text = response.css('#articleBody p').extract()
        abstract_list = []
        if abstract_text:
            # Some articles have no abstract; when an abstract is present, its format varies.
            for element in abstract_text:
                if '©' in element:
                    break
                else:
                    abstract_match = re.match(r'<.+?>(.*)</.+>', element, re.S)
                    abstract_list.append(abstract_match.group(1))
        lalitem['abstract'] = ''.join(abstract_list)

        lalitem['_id'] = get_md5(lalitem['url'])
        lalitem['company'] = self.name

        # Request glgoo (a Google Scholar mirror) to fetch the citation count
        glgoo_url = 'https://xs.glgoo.top/scholar?'
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Referer': 'https://gf1.jwss.site/'
        }
        yield Request(url=glgoo_url +
                      urlencode({'q': remove_tags(lalitem['title'])}) +
                      '&hl=zh-CN&as_sdt=0&as_vis=1&oi=scholart',
                      headers=headers,
                      meta={'lalitem': lalitem},
                      dont_filter=True,
                      callback=self.get_citation)
Example No. 6
    def parse_article(self, response):
        lalitem = LalItem()
        lalitem['url'] = response.url
        # The title may contain sub-tags
        title = response.css('.title-text').extract()[0]
        title_match = re.match(r'<.+?>(.+)</.+>', title, re.S)
        lalitem['title'] = title_match.group(1)

        lalitem['journal'] = response.css(
            '.publication-title-link::text').extract()[0]

        doi_link = response.css('.DoiLink .doi::text').extract()[0]
        lalitem['doi'] = re.sub(r'https://doi\.org/', '', doi_link)

        abstract_text = response.css('.abstract.author p').extract()
        abstract_text = '\n'.join(abstract_text)
        abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text, re.S)
        lalitem['abstract'] = abstract_match.group(1)

        img_url = response.css(
            '.abstract.graphical img::attr(src)').extract_first(default='')
        if img_url:
            lalitem['abs_img_url'] = img_url
        else:
            lalitem['abs_img_url'] = ('https://ars.els-cdn.com/content/image/1-s2.0-'
                                      + response.url.split('/')[-1] + '-gr1.jpg')

        lalitem['keywords'] = ', '.join(
            response.css('.keywords-section .keyword span::text').extract())

        year = response.css(
            '.publication-volume .size-m::text, .publication-volume .text-xs::text'
        ).extract()
        year = ''.join(year)
        year_match = re.match(r'.*\s(\d{4}),.*', year)
        lalitem['year'] = int(year_match.group(1))

        author_group = response.css('.AuthorGroups .author').extract()
        commun_author = [author for author in author_group if '<svg' in author]
        authors = []

        for author in author_group:
            match = re.match(
                r'.*"text given-name">(.+?)<.*"text surname">(.+?)<.*', author)
            name = match.group(1) + ' ' + match.group(2)
            if author in commun_author:
                name = name + '*'
            authors.append(name)
        lalitem['authors'] = ', '.join(authors)

        lalitem['_id'] = get_md5(lalitem['url'])
        lalitem['company'] = self.name
        # Request glgoo (a Google Scholar mirror) to fetch the citation count
        glgoo_url = 'https://xs.glgoo.top/scholar?'
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Referer': 'https://gf1.jwss.site/'
        }
        yield Request(url=glgoo_url +
                      urlencode({'q': remove_tags(lalitem['title'])}) +
                      '&hl=zh-CN&as_sdt=0&as_vis=1&oi=scholart',
                      headers=headers,
                      meta={'lalitem': lalitem},
                      dont_filter=True,
                      callback=self.get_citation)
Example No. 7
File: rsc.py Project: QiliWu/lalsci
    def parse_article(self, response):
        # Parse the article's main page
        lalitem = LalItem()
        lalitem['url'] = response.url
        # Some titles contain sub-tags

        title = response.css(
            '.article__title h2 p, .article__title p, .article__title h2, .article-control h2'
        ).extract()[0]
        title_match = re.match(r'<.+?>(.+)</.*?>', title, re.S)
        lalitem['title'] = title_match.group(1).strip()

        lalitem['journal'] = response.css(
            '.h--heading3.no-heading::text').extract_first(default='')
        lalitem['doi'] = response.css('.list__item-data::text')[1].extract()

        abstract_text = response.css('.capsule__text p').extract_first(
            default='')
        if abstract_text:
            abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text, re.S)
            lalitem['abstract'] = abstract_match.group(1)
        else:
            lalitem['abstract'] = ''

        img_url = response.css(
            '.capsule__article-image img::attr(src)').extract_first(default='')
        if img_url:
            lalitem['abs_img_url'] = 'https://pubs.rsc.org' + img_url
        else:
            lalitem['abs_img_url'] = ''

        lalitem['keywords'] = ''
        year_info = response.css(
            '.article-nav__issue.autopad--h a::text').extract_first(default='')
        if year_info:
            year_match = re.match(r'.*Issue \d+, (\d{4}).*', year_info)
            lalitem['year'] = int(year_match.group(1))
        else:
            lalitem['year'] = None

        author_group = response.css('.article__author-link').extract()
        commun_author = [author for author in author_group if '>*</' in author]
        authors = []
        for author in author_group:
            match = re.match(r'.*<a href=.*?>(.+?)</a.*', author, re.S)
            name = match.group(1)
            if author in commun_author:
                name = name + '*'
            authors.append(name)
        lalitem['authors'] = ', '.join(authors)
        lalitem['_id'] = get_md5(lalitem['url'])
        lalitem['company'] = self.name

        # Request glgoo (a Google Scholar mirror) to fetch the citation count
        glgoo_url = 'https://xs.glgoo.top/scholar?'
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Referer': 'https://gf1.jwss.site/'
        }
        yield Request(url=glgoo_url +
                      urlencode({'q': remove_tags(lalitem['title'])}) +
                      '&hl=zh-CN&as_sdt=0&as_vis=1&oi=scholart',
                      headers=headers,
                      meta={'lalitem': lalitem},
                      dont_filter=True,
                      callback=self.get_citation)
Example No. 8
File: aip.py Project: QiliWu/lalsci
    def parse_article(self, response):
        # Parse the article's main page
        lalitem = LalItem()
        lalitem['url'] = response.url
        # Keep sub-tags inside the title
        title = response.css('.publicationContentTitle h3').extract()[0]
        title_match = re.match(r'<h3>(.*?)<span.*', title, re.S)
        lalitem['title'] = title_match.group(1).strip()

        lalitem['journal'] = response.css(
            '.publicationContentCitation::text').extract()[0].strip()
        doi_link = response.css(
            '.publicationContentCitation a::text').extract()[0]
        lalitem['doi'] = re.sub(r'https://doi\.org/', '', doi_link)

        year_info = response.css(
            '.publicationContentCitation::text').extract()[1].strip()
        year_match = re.match(r'.*\((\d{4})\).*', year_info)
        lalitem['year'] = int(year_match.group(1))

        abstract_text = response.css('div.NLM_paragraph').extract()[0]
        lalitem['abstract'] = re.sub(r'(<|</)(div|named).*?>', '',
                                     abstract_text, flags=re.S)

        img_url = response.css('.figure-no-f1 img::attr(src)').extract_first(
            default='')
        if img_url:
            lalitem['abs_img_url'] = 'https://aip.scitation.org' + img_url
        else:
            info_match = re.match(
                r".*journal=(.+?)&volume=(\d+?)&issue=(\d+?)&doi=10.1063/(.+?)\';.*",
                response.text, re.S)
            if info_match:
                jname, vol, issue, doiend = info_match.groups()
                img_url = 'https://aip.scitation.org/na101/home/literatum/publisher/aip/journals/content/{0}/{4}/{0}.{4}.{1}.issue-{2}/{3}/production/images/small/{3}.figures.f1.gif'
                lalitem['abs_img_url'] = img_url.format(
                    jname, vol, issue, doiend, lalitem['year'])
                # This approach does not work for 2017-2018 articles (the most recent two years).
            else:
                lalitem['abs_img_url'] = ''

        lalitem['keywords'] = ', '.join(
            response.css('.topicTags a::text').extract())
        author_group = response.css('.contrib-author').extract()
        authors = ''.join([remove_tags(author) for author in author_group])
        lalitem['authors'] = authors.replace('a)', '*').strip()

        lalitem['_id'] = get_md5(lalitem['url'])
        lalitem['company'] = self.name
        # Request glgoo (a Google Scholar mirror) to fetch the citation count
        glgoo_url = 'https://xs.glgoo.top/scholar?'
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Referer': 'https://gf1.jwss.site/'
        }
        yield Request(url=glgoo_url +
                      urlencode({'q': remove_tags(lalitem['title'])}) +
                      '&hl=zh-CN&as_sdt=0&as_vis=1&oi=scholart',
                      headers=headers,
                      meta={'lalitem': lalitem},
                      dont_filter=True,
                      callback=self.get_citation)
Example No. 9
    def parse_article(self, response):
        # Parse the article's main page
        lalitem = LalItem()
        lalitem['url'] = response.url
        # Some titles contain sub-tags
        title = response.css('.ArticleTitle').extract()[0]
        title_match = re.match(r'<.+?>(.+)</.+>', title, re.S)
        lalitem['title'] = title_match.group(1)

        lalitem['journal'] = response.css('.JournalTitle::text').extract()[0]

        doi_link = response.css('#doi-url::text').extract()[0]
        lalitem['doi'] = re.sub(r'https://doi\.org/', '', doi_link)

        lalitem['year'] = int(
            response.css('.ArticleCitation_Year time::text').extract()[0][-4:])

        abstract_text = response.css('p.Para').extract_first(default='')
        try:
            abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text, re.S)
            lalitem['abstract'] = abstract_match.group(1)
        except AttributeError:
            lalitem['abstract'] = ''

        img_url = response.css('div.Para img::attr(src)').extract_first(
            default='')
        if img_url:
            lalitem['abs_img_url'] = img_url
        else:
            try:
                journalid = re.match(r".*\'Journal Id\':\'(.+?)\'.*",
                                     response.text, re.S).group(1)
                part_1 = lalitem['doi'].split('/')[0]
                part_2 = lalitem['doi'].split('/')[1]
                part_3 = part_2.split('-')[2].replace('0', '')
                img_url = 'https://media.springernature.com/original/springer-static/image/art%3A{0}%2F{1}/MediaObjects/{2}_{3}_{4}_Fig1_HTML.jpg'
                lalitem['abs_img_url'] = img_url.format(
                    part_1, part_2, journalid, lalitem['year'], part_3)
            except (AttributeError, IndexError):
                lalitem['abs_img_url'] = ''

        lalitem['keywords'] = ', '.join(
            response.css('.KeywordGroup .Keyword::text').extract())

        author_group = response.css('.authors__list li').extract()
        commun_author = [
            author for author in author_group if 'authors__contact' in author
        ]
        authors = []
        for author in author_group:
            match = re.match(r'.*class="authors__name">(.+?)<.*', author)
            name = match.group(1)
            if author in commun_author:
                name = name + '*'
            authors.append(name)
        lalitem['authors'] = ', '.join(authors)

        lalitem['_id'] = get_md5(lalitem['url'])
        lalitem['company'] = self.name
        # yield lalitem
        glgoo_url = 'https://xs.glgoo.top/scholar?'
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Referer': 'https://gf1.jwss.site/'
        }
        yield Request(url=glgoo_url +
                      urlencode({'q': remove_tags(lalitem['title'])}) +
                      '&hl=zh-CN&as_sdt=0&as_vis=1&oi=scholart',
                      headers=headers,
                      meta={'lalitem': lalitem},
                      dont_filter=True,
                      callback=self.get_citation)
Example No. 10
    def parse_article(self, response):
        lalitem = LalItem()

        lalitem['url'] = response.url
        title = response.css(
            'header .tighten-line-height.small-space-below').extract()[0]
        title_match = re.match(r'<.+?>(.+)</.+>', title, re.S)
        lalitem['title'] = title_match.group(1)
        lalitem['journal'] = response.css(
            'meta[name="citation_journal_title"]::attr(content)').extract()[0]
        lalitem['doi'] = response.css(
            'meta[name="citation_doi"]::attr(content)').extract()[0]
        lalitem['year'] = int(response.css(
            'meta[name="citation_online_date"]::attr(content)').extract(
            )[0].split('/')[0])
        abstract_text = response.css(
            '.pl20.mq875-pl0.js-collapsible-section p').extract()[0]
        abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text, re.S)
        lalitem['abstract'] = abstract_match.group(1)
        img_match = re.match(
            r'.*?"index" : 1.*?"imagePaths" : \[ "(.*?jpg)" \].*',
            response.text, re.S)
        if img_match:
            lalitem['abs_img_url'] = (
                'https:' + img_match.group(1)) if not img_match.group(
                    1).startswith('http') else img_match.group(1)
        else:
            lalitem['abs_img_url'] = ''

        lalitem['keywords'] = ', '.join(
            response.css('.subject-tag-link::text').extract())
        author_group = response.css('li[itemprop="author"]').extract()
        commun_author = [
            author for author in author_group if 'data-corresp-id' in author
        ]
        authors = []
        for author in author_group:
            match = re.match(
                r'.*<span itemprop="name".*?>(?:<a data-test="author-name".*?>)?(.+?)(?:</a>)?</span.*',
                author, re.S)
            name = match.group(1)
            if author in commun_author:
                name = name + '*'
            authors.append(name)
        lalitem['authors'] = ', '.join(authors)

        lalitem['_id'] = get_md5(lalitem['url'])
        lalitem['company'] = self.name
        # Request glgoo (a Google Scholar mirror) to fetch the citation count
        glgoo_url = 'https://xs.glgoo.top/scholar?'
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Referer': 'https://gf1.jwss.site/'
        }
        yield Request(url=glgoo_url +
                      urlencode({'q': remove_tags(lalitem['title'])}) +
                      '&hl=zh-CN&as_sdt=0&as_vis=1&oi=scholart',
                      headers=headers,
                      meta={'lalitem': lalitem},
                      dont_filter=True,
                      callback=self.get_citation)
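Every example ends by yielding a Request to the Scholar mirror with callback=self.get_citation and carries the item along in meta, but the get_citation callback and the get_md5 helper are not shown on this page. A minimal sketch of what they might look like, assuming the mirror's result page exposes a Chinese "被引用次数" (cited-by) count and that the item has a citing_num field; both are assumptions, not code from the QiliWu/lalsci project:

    import hashlib
    import re

    def get_md5(value):
        # Hypothetical helper: hash the article URL to obtain a stable _id.
        return hashlib.md5(value.encode('utf-8')).hexdigest()

    def get_citation(self, response):
        # Hypothetical callback: read the cited-by count from the Scholar-mirror
        # result page and attach it to the item carried in meta.
        lalitem = response.meta['lalitem']
        cited = re.search(r'被引用次数[::]\s*(\d+)', response.text)
        lalitem['citing_num'] = int(cited.group(1)) if cited else 0
        yield lalitem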