Example #1
# Standard-library imports used below; GSData/GSException, db, and the
# Paper model are assumed to come from this project's own modules.
import hashlib
from datetime import datetime
from time import sleep


def run():
    gs = GSData()
    db.connect()

    with open('data.txt', encoding='utf-8') as f:
        for line in f:
            wx_name = line.split(' ')[0]
            try:
                data = gs.query(wx_name)
            except GSException as ex:
                print(str(ex))
                continue
            else:
                for item in data:
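                    # Normalize the scheme to https so the md5 dedupe key is
                    # stable across http/https variants of the same URL.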
                    url = 'https://{0}'.format(item['url'].split('://', 1)[1])
                    md5s = hashlib.md5(url.encode('utf-8')).hexdigest()
                    if Paper.select().where(Paper.url_hash == md5s).count():
                        continue
                    print(item)
                    p = Paper.create(
                        wx_name=item['wx_name'],
                        name=item['name'],
                        title=item['title'],
                        author=item['author'],
                        content=item['content'],
                        url=url,
                        url_hash=md5s,
                        post_time=datetime.strptime(item['posttime'],
                                                    '%Y-%m-%d %H:%M:%S'),
                        add_time=datetime.strptime(item['add_time'],
                                                   '%Y-%m-%d %H:%M:%S'))
                    if isinstance(item['readnum_newest'], int):
                        p.read_num = item['readnum_newest']
                    if isinstance(item['likenum_newest'], int):
                        p.like_num = item['likenum_newest']
                    if item['picurl']:
                        p.pic_url = item['picurl']

                    p.save()
            sleep(3)

    db.close()
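
For context, here is a minimal sketch of the Paper model this snippet assumes. The ORM is peewee (implied by the select()/where()/create() calls); field names are taken from the code above, while the field types and database backend are guesses:

# Hypothetical model definition; adjust field types and the database backend
# to your actual setup.
from peewee import (Model, SqliteDatabase, CharField, TextField,
                    IntegerField, DateTimeField)

db = SqliteDatabase('papers.db')  # assumed backend

class Paper(Model):
    wx_name = CharField()
    name = CharField()
    title = CharField()
    author = CharField()
    content = TextField()
    url = CharField()
    url_hash = CharField(unique=True, index=True)  # md5 of the normalized URL
    post_time = DateTimeField()
    add_time = DateTimeField()
    read_num = IntegerField(null=True)
    like_num = IntegerField(null=True)
    pic_url = CharField(null=True)

    class Meta:
        database = db
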
Example #2
# Assumed imports: parse_xml_file, get_arvixpaper_semantic_scholar, and the
# Paper model come from this project's own modules; the rest are third-party
# or standard library.
import glob
import logging
import time

from tqdm import tqdm


def update_paper():
    idx = 0
    for filename in tqdm(glob.glob("oai/*.xml")):
        article = parse_xml_file(filename)
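        # 346728 looks like a resume checkpoint: records below this index are
        # assumed to have been processed in an earlier run and are skipped.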
        if article is None or idx < 346728:
            idx += 1
            continue
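        # Strip the version suffix, e.g. '1234.5678v2' -> '1234.5678'.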
        arvixID = article['id'].split('v')[0]
        query = Paper.select().where(Paper.arvixID == arvixID)
        if query.exists():
            continue
        success, article_meta = get_arvixpaper_semantic_scholar(arvixID)
        if not success:
            logging.debug(
                "Paper does not exist in Semantic Scholar, arvixID: %s",
                arvixID)
            continue
        authorIDList = [
            int(author['authorId']) if author['authorId'] is not None else -1
            for author in article_meta['authors']
        ]
        authorNames = [article['main_author']]
        authorCount = len(article_meta['authors'])
        if authorCount > 1:
            other_author = [
                name.strip() for name in article['authors'].split(',')
                if len(name) > 1 and name != article['main_author']
            ]
            authorNames += other_author
        paper_category = [article['term']]
        try:
            paper = Paper.create(
                indexID=idx,
                arvixID=arvixID,
                paperId=article_meta['paperId'],
                doiID=str(article_meta['doi']),
                title=article['title'],
                summary=article['abstract'],
                category=paper_category,
                comments=article['comment'],
                journal_ref=article['journal_ref'],
                url=article['url'],
                authorID=authorIDList,
                authorName=authorNames,
                authorCount=authorCount,
                publishedDate=article['publish_date'],
                citationVelocity=article_meta['citationVelocity'],
                referencesCount=len(article_meta['references']),
                topics=article_meta['topics'],
                venue=str(article_meta['venue']),
                year=article_meta['year'],
                influentialCitationCount=article_meta[
                    'influentialCitationCount'],
                citationCount=len(article_meta['citations']),
                citations=article_meta['citations'],
            )
            try:
                for meta in ['page', 'figure', 'table']:
                    if meta in article['comment']:
                        comment = article['comment'].replace(';', ',')
                        for segment in comment.split(','):
                            if meta in segment:
                                page_prefix = segment.split(meta)[0]
                                if meta == 'page':
                                    paper.pages = int(page_prefix.strip())
                                elif meta == 'figure':
                                    paper.figures = int(page_prefix.strip())
                                elif meta == 'table':
                                    paper.table = int(page_prefix.strip())
                                break
            except (ValueError, TypeError):
                logging.debug("Error parsing page/figure/table counts "
                              "from comment", exc_info=True)
            paper.save()
        except Exception as e:
            logging.warning("Error in arxiv id %s, error: %s", arvixID, e)
        time.sleep(0.2)
        idx += 1
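
The page/figure/table extraction above is duplicated verbatim in Example #4 and silently swallows all errors. A hedged sketch of how it could be factored into a helper (the function name parse_comment_counts is hypothetical):

def parse_comment_counts(comment):
    """Best-effort extraction of page/figure/table counts from an arXiv
    comment string such as '12 pages, 5 figures; 2 tables'."""
    counts = {}
    if not comment:
        return counts
    for segment in comment.replace(';', ',').split(','):
        for meta in ('page', 'figure', 'table'):
            if meta in segment:
                prefix = segment.split(meta)[0].strip()
                try:
                    counts[meta] = int(prefix)
                except ValueError:
                    pass  # no leading number, e.g. 'camera-ready pages'
                break
    return counts

The caller would then set paper.pages, paper.figures, and paper.table from the returned dict instead of repeating the nested loops inline.
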
Example #3
# Assumed imports: db and the Paper model come from this project's own
# modules; the rest are third-party or standard library.
import datetime
import time

import requests


def paper():
    url = 'http://ieeexplore.ieee.org/rest/search'
    page = 31501  # assumed resume point from an earlier crawl
    n_paper = 0
    print(f'Started populate paper at: {datetime.datetime.now()}')

    print("Request search page for get cookies.")
    response = requests.get(
        'http://ieeexplore.ieee.org/search/searchresult.jsp')
    cookies = response.cookies
    while True:
        try:
            page += 1
            print(f'Request page {page}')
            payload = {
                'pageNumber': str(page),
            }
            headers = {
                'Accept': 'application/json, text/plain, */*',
                'Content-Type': 'application/json;charset=UTF-8',
                # Content-Length is computed by requests automatically;
                # hard-coding it breaks for payloads of other sizes.
                'Accept-Language': 'en-US,en;q=0.8,pt;q=0.6',
                'Referer': 'http://ieeexplore.ieee.org/search/searchresult.jsp',
                'Origin': 'http://ieeexplore.ieee.org',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/60.0.3112.113 Safari/537.36',
            }

            response = requests.post(url,
                                     json=payload,
                                     headers=headers,
                                     cookies=cookies,
                                     timeout=15)

            papers = response.json()['records']
            if (page % 150) == 0:
                db.commit()
                print(f'committed at: {page}')
                print(f'inserted {n_paper} rows')
                print("Requesting the search page to refresh cookies.")
                response = requests.get(
                    'http://ieeexplore.ieee.org/search/searchresult.jsp')
                cookies = response.cookies
            if not papers:
                db.commit()
                print(
                    f'{datetime.datetime.now()}- Finished, populated with {n_paper} papers'
                )
                break

            for p in papers:

                if p.get('title'):
                    Paper.create(title=p['title'],
                                 abstract=p.get('abstract', ''),
                                 finalScore=0.,
                                 accepted=True)
                    n_paper += 1

        except Exception as e:
            print(
                f'{datetime.datetime.now()} - Error: {e} \nretry in 30 seconds'
            )
            page -= 1  # retry the same page instead of silently skipping it
            time.sleep(30)
            continue
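
Since the cookie juggling above is manual, here is a sketch of the same flow with requests.Session, which stores cookies from earlier responses automatically (URL and header values are taken from the snippet above):

import requests

session = requests.Session()
session.headers.update({
    'Accept': 'application/json, text/plain, */*',
    'Referer': 'http://ieeexplore.ieee.org/search/searchresult.jsp',
    'Origin': 'http://ieeexplore.ieee.org',
})
# Priming request: the search page sets the cookies the REST endpoint expects.
session.get('http://ieeexplore.ieee.org/search/searchresult.jsp')
response = session.post('http://ieeexplore.ieee.org/rest/search',
                        json={'pageNumber': '1'}, timeout=15)
records = response.json().get('records', [])
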
Example #4
# Assumed imports: get_arvixpaper_semantic_scholar and the Paper model come
# from this project's own modules; start_index and end_index are assumed to
# be module-level globals.
import logging
import time

import arxivpy
from tqdm import tqdm


def crawl_category(term='cs.LG'):
    index_iteration = 500
    logging.info("Crawling category : %s", term)
    for index in range(start_index, end_index, index_iteration):
        logging.info("\nBatch : %d-%d" % (index, index + index_iteration))
        articles = arxivpy.query(search_query=[term],
                                 start_index=index,
                                 max_index=index + index_iteration,
                                 results_per_iteration=index_iteration,
                                 wait_time=0.2,
                                 sort_by='lastUpdatedDate')
        article_batch_count = len(articles)
        if article_batch_count == 0:
            logging.warning('No articles found in batch %d - %d',
                            index, index + index_iteration)
        for idx, article in tqdm(enumerate(articles),
                                 total=article_batch_count):
            arvixID = article['id'].split('v')[0]
            query = Paper.select().where(Paper.arvixID == arvixID)
            if query.exists():
                paper = Paper.get(Paper.arvixID == arvixID)
                categories = paper.category
                if term not in categories:
                    categories.append(term)
                Paper.update(category=categories).where(
                    Paper.arvixID == arvixID).execute()
                continue
            success, article_meta = get_arvixpaper_semantic_scholar(arvixID)
            if not success:
                logging.debug(
                    "Paper does not exist in Semantic Scholar, arvixID: %s",
                    arvixID)
                continue
            authorIDList = [
                int(author['authorId'])
                if author['authorId'] is not None else -1
                for author in article_meta['authors']
            ]
            authorNames = [article['main_author']]
            authorCount = len(article_meta['authors'])
            if authorCount > 1:
                other_author = [
                    name.strip() for name in article['authors'].split(',')
                    if len(name) > 1 and name != article['main_author']
                ]
                authorNames += other_author
            paper_category = [article['term']]
            if article['term'] != term:
                paper_category.append(term)
            try:
                paper = Paper.create(
                    indexID=idx + index,
                    arvixID=arvixID,
                    paperId=article_meta['paperId'],
                    doiID=str(article_meta['doi']),
                    title=article['title'],
                    summary=article['abstract'],
                    category=paper_category,
                    comments=article['comment'],
                    journal_ref=article['journal_ref'],
                    url=article['url'],
                    authorID=authorIDList,
                    authorName=authorNames,
                    authorCount=authorCount,
                    publishedDate=article['publish_date'],
                    citationVelocity=article_meta['citationVelocity'],
                    referencesCount=len(article_meta['references']),
                    topics=article_meta['topics'],
                    venue=str(article_meta['venue']),
                    year=article_meta['year'],
                    influentialCitationCount=article_meta[
                        'influentialCitationCount'],
                    citationCount=len(article_meta['citations']),
                    citations=article_meta['citations'],
                )
                try:
                    for meta in ['page', 'figure', 'table']:
                        if meta in article['comment']:
                            comment = article['comment'].replace(';', ',')
                            for segment in comment.split(','):
                                if meta in segment:
                                    page_prefix = segment.split(meta)[0]
                                    if meta == 'page':
                                        paper.pages = int(page_prefix.strip())
                                    elif meta == 'figure':
                                        paper.figures = int(
                                            page_prefix.strip())
                                    elif meta == 'table':
                                        paper.table = int(page_prefix.strip())
                                    break
                except (ValueError, TypeError):
                    logging.debug("Error parsing page/figure/table counts "
                                  "from comment", exc_info=True)
                paper.save()
            except Exception as e:
                logging.warning("Error in arxiv id %s, error: %s", arvixID, e)
            time.sleep(0.3)
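
A hypothetical driver for this crawler, showing how crawl_category might be invoked for several arXiv categories (the category list and index bounds are illustrative, not from the original):

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    start_index, end_index = 0, 10000  # assumed module-level bounds
    for category in ['cs.LG', 'cs.CV', 'cs.CL']:
        crawl_category(term=category)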