Exemplo n.º 1
0
 def test_author(self):
     with db_session:
         a = Author(author_id='3333')
         a.name = 'given'
         p = Paper(paper_id='qqqq')
         p.title = 'titttttttle'
         a.papers.add(p)
Exemplo n.º 2
0
def upsert_paper(info, author_id):
    try:
        with db_session:
            publisher_id = info['publisher_id']
            if publisher_id:
                publisher = Publisher.get(publisher_id=publisher_id)
                if publisher:
                    pass
                else:
                    publisher = Publisher(publisher_id=publisher_id)
                    publisher.name = info['publishername']
    except:
        pass
    with db_session:
        p = Paper.get(paper_id=info['paper_id'])
        if p:
            logger.debug('paper has existed, paper_id=%s', info['paper_id'])
            return
        paper = Paper(paper_id=info['paper_id'])
        paper.title = info['title']
        paper.abstract = info['abstract']
        paper.cite_num = info['cite_num']
        paper.cited_num = info['cited_num']

        publisher_id = info['publisher_id']
        if publisher_id:
            publisher = Publisher.get(publisher_id=publisher_id)
            if publisher:
                paper.publisher = publisher

        if author_id is None:
            return
        a = Author.get(author_id=author_id)
        if a:
            paper.authors.add(a)
        else:
            a_info = api.get_author(author_id)
            author = Author(author_id=a_info['author_id'])
            author.name = a_info['name']
            author.image_url = a_info['image_url']
            author.organization = a_info['organization']
            author.home_page = a_info['home_page']
            author.paper_count = a_info['paper_count']
            author.citied_count = a_info['cited_count']
            paper.authors.add(author)
Exemplo n.º 3
0
    def write_db(self):

        print "len of entry list " + str(len(self.entry_list))

        for entry in self.entry_list:
            paper = Paper()
            if entry.has_key("id"):
                paper.id = entry["id"]
            if entry.has_key("type"):
                paper.type = entry["type"]
            if entry.has_key("title"):
                paper.title = entry["title"]
            if entry.has_key("author"):
                paper.authors = entry["author"]
            if entry.has_key("year"):
                paper.year = int(entry["year"])
            if entry.has_key("journal"):
                paper.journal = entry["journal"]
            if entry.has_key("booktitle"):
                paper.book_title = entry["booktitle"]
            if entry.has_key("publisher"):
                paper.publisher = entry["publisher"]
            if entry.has_key("institution"):
                paper.institution = entry["institution"]
            if entry.has_key("volume"):
                paper.volume = int(entry["volume"])
            if entry.has_key("number"):
                paper.number = int(entry["number"])
            if entry.has_key("pages"):
                paper.pages = entry["pages"]
            if entry.has_key("url"):
                paper.url = entry["url"]
            if entry.has_key("doi"):
                paper.doi = entry["doi"]
            if entry.has_key("isbn"):
                paper.isbn = entry["isbn"]

            paper.save()
Exemplo n.º 4
0
def add(search_query, author, title):
    fl = [
        'id', 'author', 'first_author', 'bibcode', 'id', 'year', 'title',
        'abstract', 'doi', 'pubdate', "pub", "keyword", "doctype",
        "identifier", "links_data"
    ]
    if author:
        search_query += "author:" + author
    if title:
        search_query += "title:" + title
    papers = list(ads.SearchQuery(q=search_query, fl=fl))
    if len(papers) == 0:
        selection = ads.search.Article
        exit()
    elif len(papers) == 1:
        selection = papers[0]  # type:ads.search.Article
    else:
        # first_ten = itertools.islice(papers, 10)
        first_ten = papers[:10]
        single_paper: ads.search.Article
        for index, single_paper in enumerate(first_ten):
            print(index, single_paper.title[0], single_paper.first_author)
        selected_index = click.prompt('select paper', type=int)
        selection = papers[selected_index]  # type:ads.search.Article

    assert len(selection.doi) == 1
    doi = selection.doi[0]

    try:

        paper = Paper.get(Paper.doi == doi)
        print("this paper has already been added")
        exit(1)

    except peewee.DoesNotExist:
        pass

    print("fetching bibcode")
    q = ads.ExportQuery([selection.bibcode])
    bibtex = q.execute()

    print("saving in db")

    paper = Paper()
    assert len(selection.title) == 1
    paper.doi = doi
    paper.title = selection.title[0]
    paper.abstract = selection.abstract
    paper.bibcode = selection.bibcode
    paper.year = selection.year
    paper.pubdate = selection.pubdate
    paper.pdf_downloaded = False
    paper.first_author = Author.get_or_create(name=selection.first_author)[0]
    paper.publication = Publication.get_or_create(name=selection.pub)[0]
    paper.doctype = Doctype.get_or_create(name=selection.doctype)[0]
    paper.arxiv_identifier = [
        ident for ident in selection.identifier if "arXiv:" in ident
    ][0].split("arXiv:")[-1]
    paper.bibtex = bibtex
    links = [json.loads(string) for string in selection.links_data]
    print(links)
    paper.save()
    authors = [Author.get_or_create(name=name)[0] for name in selection.author]
    for author in db.batch_commit(authors, 100):
        PaperAuthors.create(author=author, paper=paper)
    keywords = [
        Keyword.get_or_create(keyword=keyword)[0]
        for keyword in selection.keyword
    ]
    for keyword in db.batch_commit(keywords, 100):
        PaperKeywords.create(keyword=keyword, paper=paper)
    print("fetching PDF")
    arxiv_url = "https://arxiv.org/pdf/{id}".format(id=paper.arxiv_identifier)
    r = requests.get(arxiv_url, stream=True)
    print(arxiv_url)
    with open('library/{filename}.pdf'.format(filename=paper.id), 'wb') as f:
        chunk_size = 1024  # bytes
        file_size = int(r.headers.get('content-length', 0))
        progress_length = math.ceil(file_size // chunk_size)
        with click.progressbar(r.iter_content(chunk_size=20),
                               length=progress_length) as progress_chunks:
            for chunk in progress_chunks:
                f.write(chunk)
    paper.pdf_downloaded = True
    paper.save()
Exemplo n.º 5
0
    def post(self, request):
        param = QueryDict(request.body)

        uuid = param.get('uuid')
        title = param.get('title')
        time = param.get('time')
        origin = param.get('origin')
        _authors = param.getlist('authors')
        link = param.get('link')
        _tags = param.getlist('tags')
        content = param.get('content')
        refer_to = param.getlist('reference')
        score = param.get('score')

        try:
            year, month = time.split('-')
            year, month = int(year), int(month)
            publish_time = datetime.date(year, month, 1)
        except Exception as e:
            logger.error(traceback.format_exc(e))
            return JsonResponse({'msg': '提供的日期{}有误'.format(time)}, status=500)

        for _tag in _tags:
            try:
                _tag = int(_tag)
                _ = ResearchTag.objects.get(research_tag_id=_tag)
            except Exception as e:
                logger.error(traceback.format_exc(e))
                return JsonResponse({'msg': '错误的标签{}'.format(_tag)},
                                    status=500)
        tags = ResearchTag.objects.filter(
            research_tag_id__in=[int(_t) for _t in _tags])

        author_ids = []
        for _author in _authors:
            if _author.isdigit():
                author_ids.append(int(_author))
            elif Author.objects.filter(name=_author).exists():
                a = Author.objects.get(name=_author).author_id
                author_ids.append(a)
            else:
                a = Author(name=_author)
                a.save()
                author_ids.append(a.author_id)

        authors = Author.objects.filter(author_id__in=author_ids)

        try:
            score = int(score)
        except Exception as e:
            logger.error(traceback.format_exc(e))
            return JsonResponse({'msg': '错误的评分分数格式'}, status=500)

        if not Paper.objects.filter(paper_uuid=uuid).exists():
            # 新建的场合
            try:
                comment = PaperComment(content=content)
                comment.save()
                paper = Paper(paper_uuid=uuid,
                              title=title,
                              publish_origin=origin,
                              publish_time=publish_time,
                              author=authors,
                              link=link,
                              tag=tags,
                              comment=comment,
                              self_score=score)
                paper.save()
                redis.set(self.LATEST_KEY, str(uuid_gen.uuid4()))
            except Exception as e:
                logger.error(traceback.format_exc(e))
                return JsonResponse({'msg': '保存失败'}, status=500)
            else:
                return JsonResponse({
                    'next':
                    reverse('paperdb.detail',
                            kwargs={'paper_uuid': paper.paper_uuid})
                })

        try:
            # 编辑的场合
            paper = Paper.objects.get(paper_uuid=uuid)
        except Exception as e:
            logger.error(traceback.format_exc(e))
            return JsonResponse({'msg': '错误的uuid/未找到相关论文记录'}, status=404)
        else:
            paper.title = title
            paper.publish_time = publish_time
            paper.publish_origin = origin
            paper.author = authors
            paper.link = paper.link
            paper.tag = tags
            paper.self_score = score

            try:
                paper.save()
            except Exception as e:
                logger.error(traceback.format_exc(e))
                return JsonResponse({'msg': '保存失败'}, status=500)

            if paper.comment is None:
                if content != '':
                    comment = PaperComment(content=content)
                    comment.save()
                    paper.comment = comment
                    paper.save()
            elif content != paper.comment.content.replace(
                    '\r\n', '\n'):  # traditional下的换行符出入
                paper.comment.content = content
                paper.comment.save()

        for refer_to_paper in Paper.objects.filter(paper_uuid__in=refer_to):
            if not Reference.objects.filter(
                    reference_src=paper,
                    reference_trg=refer_to_paper).exists():
                reference = Reference(reference_src=paper,
                                      reference_trg=refer_to_paper)
                reference.save()

        return JsonResponse({
            'next':
            reverse('paperdb.detail', kwargs={'paper_uuid': paper.paper_uuid})
        })
Exemplo n.º 6
0
def get_references_citations_by_id(profile_id):
    if isinstance(profile_id, dict):
        profile_id = profile_id.get('profile_id')
        if MONGO:
            if data_collection.find({"id": profile_id}).count() > 0:
                # 说明这个数据已经被爬取过了
                return []
    print('func2')
    if not profile_id:
        return -1
    headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36',
        'accept-language': 'zh-CN,zh;q=0.9'
    }
    session = requests.Session()
    while True:
        try:
            response = session.get(
                'https://cn.bing.com/academic/profile?id={}&encoded=0&v=paper_preview&mkt=zh-cn'
                .format(profile_id),
                headers=headers)
            response.raise_for_status()
            response.encoding = 'utf-8'
            break
        except KeyboardInterrupt:
            raise KeyboardInterrupt
        except Exception as e:
            time.sleep(3.0)
            print(e)
    result = re.search(r'IG:"(.*?)"', response.text)
    if result:
        ig = result.group(1)
    result = re.search(
        r'被 引 量</span></span><span class="aca_content"><div>(\d*)</div>',
        response.text)
    if result:
        citation_num = result.group(1)

    html = etree.HTML(response.text)

    paper = Paper(save2mongo=MONGO)
    try:
        paper.title = html.xpath('//li[@class="aca_title"]/text()')[0]
        paper.id = profile_id
        paper.citation_num = citation_num
        result = re.search(
            r'<span class="aca_label">DOI</span></span><span class="aca_content"><div>(.*?)</div>',
            response.text)
        if result:
            paper.doi = result.group(1)
        paper.authors = html.xpath(
            '//div[@class="aca_desc b_snippet"]/span//a/text()')
        paper.abstract = html.xpath(
            '//div[@class="aca_desc b_snippet"]/span[1]//text()')[-1]
        result = re.search(
            r'<span class="aca_label">发表日期</span></span><span class="aca_content"><div>(\d*)</div>',
            response.text)
        if result:
            paper.publish_year = result.group(1)

        base_url = 'https://cn.bing.com/academic/papers?ajax=scroll&infscroll=1&id={id}&encoded=0&v=paper_preview&mkt=zh-cn&first={first}&count={count}&IG={ig}&IID=morepage.{num}&SFX={num}&rt={rt}'

        count = 9
        citation_links = list()
        for i in range(1, int(citation_num) // count):
            ajax_url = base_url.format(id=profile_id,
                                       first=i * (count + 1),
                                       count=count + 1,
                                       ig=ig,
                                       num=i,
                                       rt='2')
            while True:
                try:
                    response = session.get(ajax_url, headers=headers)
                    response.raise_for_status()
                    response.encoding = 'utf-8'
                    break
                except KeyboardInterrupt:
                    raise KeyboardInterrupt
                except Exception as e:
                    time.sleep(3.0)
                    print(e)
            html = etree.HTML(response.text)
            citation_links.extend(html.xpath('//a[@target="_blank"]/@href'))
        print('number of citation_links', len(citation_links), 'citation_num',
              citation_num)
        if len(citation_links) >= 0:
            for i, citation_link in enumerate(citation_links):
                profile_id = get_profile_id(citation_link)
                if profile_id.get('title', False):
                    paper.citations.append(profile_id)
                print('get_profile_id: {}/{}\r'.format(i + 1,
                                                       len(citation_links)),
                      end='')
        print('\nnumber of ids:', len(paper.citations))
    except KeyboardInterrupt:
        raise KeyboardInterrupt
    except Exception as e:
        print(e)
    paper.save()
    # for profile_id in paper.citations:
    #     get_references_citations_by_id(profile_id)
    return paper.citations
Exemplo n.º 7
0
 def test_paper(self):
     with db_session:
         p = Paper(paper_id='666aaaaa66')
         p.title = 'hehehehhehe'