Example #1
# Imports assumed by this snippet (they live at module level in the original
# repo); `store_path`, `get_figure_count`, `extract_text`, and the peewee
# model `Paper` are likewise defined elsewhere.
import os
import re
import urllib.request

import arxiv                # legacy arxiv package exposing arxiv.query()
from tabula import wrapper  # tabula-py 1.x exposed read_pdf via its wrapper module


def download_extract(paper, extract_figure=False, extract_table=False):
    # Skip papers whose page and table counts were already extracted.
    if paper.pages >= 0 and paper.table >= 0:
        return False
    api_paper = arxiv.query(id_list=[paper.arvixID])[0]
    if 'pdf_url' not in api_paper:
        return False
    pdf_url = api_paper['pdf_url']
    file_path = os.path.join(store_path, paper.paperId + '.pdf')
    urllib.request.urlretrieve(pdf_url, file_path)

    # Default to the stored value so `table_count` is always defined, even
    # when extract_table is False (the original raised NameError in that case).
    table_count = paper.table
    if extract_table:
        # Let tabula parse every page; only the number of tables is kept.
        df = wrapper.read_pdf(file_path, multiple_tables=True, pages='all')
        table_count = len(df)
        del df

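    # Count figures and pages, persist any missing counts, and remove the
    # downloaded PDF once it has been measured.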
    if extract_figure:
        figure_count, page_count = get_figure_count(file_path)
        modified = False
        if paper.pages == -1:
            modified = True
            paper.pages = page_count
        else:
            page_count = paper.pages
        if paper.table == -1:
            modified = True
            paper.table = table_count
        if os.path.exists(file_path):
            os.remove(file_path)
        if modified:
            Paper.update(table=table_count, pages=page_count).where(
                Paper.arvixID == paper.arvixID).execute()
            return modified
    texts = extract_text(file_path, pdf_url)
    if texts is None:
        print("PDF either do not exists or failed : ", paper.url)
        return False
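    # Harvest affiliation hints from e-mail domains found in the text.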
    affiliation = []
    for text in texts.split():
        if re.match(r"[^@]+@[^@]+\.[^@]+", text):
            domain_name = text.split('@')[-1]
            affiliation.append(domain_name)
    if len(affiliation) > 0:
        Paper.update(affiliation=affiliation).where(
            Paper.arvixID == paper.arvixID).execute()

    return False
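
A minimal usage sketch (hypothetical: assumes a populated peewee Paper table and a writable store_path, neither of which is shown in the snippet above):

for paper in Paper.select().where((Paper.pages == -1) | (Paper.table == -1)):
    if download_extract(paper, extract_figure=True, extract_table=True):
        print('Updated page/table counts for', paper.arvixID)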
Example #2
# Imports assumed by this snippet; `start_index`, `end_index`, the peewee
# model `Paper`, and `get_arvixpaper_semantic_scholar()` are defined at
# module level in the original repo.
import logging
import time

import arxivpy
from tqdm import tqdm


def crawl_category(term='cs.LG'):
    index_iteration = 500  # results fetched per arXiv API call
    logging.info("Crawling category : %s", term)
    for index in range(start_index, end_index, index_iteration):
        logging.info("\nBatch : %d-%d" % (index, index + index_iteration))
        articles = arxivpy.query(search_query=[term],
                                 start_index=index,
                                 max_index=index + index_iteration,
                                 results_per_iteration=index_iteration,
                                 wait_time=0.2,
                                 sort_by='lastUpdatedDate')
        article_batch_count = len(articles)
        if article_batch_count == 0:
            logging.warning('No articles found in batch %d-%d',
                            index, index + index_iteration)
        for idx, article in tqdm(enumerate(articles),
                                 total=article_batch_count):
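            # arXiv IDs carry a version suffix (e.g. '1234.5678v2');
            # strip it so database lookups use the bare ID.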
            arvixID = article['id'].split('v')[0]
            query = Paper.select().where(Paper.arvixID == arvixID)
            if query.exists():
                paper = Paper.get(Paper.arvixID == arvixID)
                categories = paper.category
                if term not in categories:
                    categories.append(term)
                Paper.update(category=categories).where(
                    Paper.arvixID == arvixID).execute()
                continue
            success, article_meta = get_arvixpaper_semantic_scholar(arvixID)
            if not success:
                logging.debug(
                    "Paper does not exist in Semantic Scholar, arvixID: %s",
                    arvixID)
                continue
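            # Semantic Scholar may return null author IDs; -1 marks those.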
            authorIDList = [
                int(author['authorId'])
                if author['authorId'] is not None else -1
                for author in article_meta['authors']
            ]
            authorNames = [article['main_author']]
            authorCount = len(article_meta['authors'])
            if authorCount > 1:
                other_author = [
                    name.strip() for name in article['authors'].split(',')
                    if len(name) > 1 and name != article['main_author']
                ]
                authorNames += other_author
            paper_category = [article['term']]
            if article['term'] != term:
                paper_category.append(term)
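            # Merge the arxivpy record and the Semantic Scholar metadata
            # into a single Paper row.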
            try:
                paper = Paper.create(
                    indexID=idx + index,
                    arvixID=arvixID,
                    paperId=article_meta['paperId'],
                    doiID=str(article_meta['doi']),
                    title=article['title'],
                    summary=article['abstract'],
                    category=paper_category,
                    comments=article['comment'],
                    journal_ref=article['journal_ref'],
                    url=article['url'],
                    authorID=authorIDList,
                    authorName=authorNames,
                    authorCount=authorCount,
                    publishedDate=article['publish_date'],
                    citationVelocity=article_meta['citationVelocity'],
                    referencesCount=len(article_meta['references']),
                    topics=article_meta['topics'],
                    venue=str(article_meta['venue']),
                    year=article_meta['year'],
                    influentialCitationCount=article_meta[
                        'influentialCitationCount'],
                    citationCount=len(article_meta['citations']),
                    citations=article_meta['citations'],
                )
                try:
                    # Best-effort parse of counts such as
                    # "10 pages, 3 figures, 2 tables" from the comment field.
                    for meta in ['page', 'figure', 'table']:
                        if article['comment'] and meta in article['comment']:
                            comment = article['comment'].replace(';', ',')
                            for segment in comment.split(','):
                                if meta in segment:
                                    page_prefix = segment.split(meta)[0]
                                    if meta == 'page':
                                        paper.pages = int(page_prefix.strip())
                                    elif meta == 'figure':
                                        paper.figures = int(
                                            page_prefix.strip())
                                    elif meta == 'table':
                                        paper.table = int(page_prefix.strip())
                                    break
                except Exception:
                    logging.debug("Error parsing page/figure/table counts "
                                  "from comment: %s", article['comment'])
                paper.save()
            except Exception as e:
                logging.warning("Error for arXiv ID %s: %s", arvixID, str(e))
            time.sleep(0.3)  # throttle requests between papers
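
A minimal usage sketch (hypothetical: the module-level start_index and end_index globals and an initialised peewee database are assumed, since neither appears in the snippet above):

logging.basicConfig(level=logging.INFO)
start_index, end_index = 0, 5000
for category in ['cs.LG', 'cs.CV', 'cs.CL']:
    crawl_category(term=category)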