Example #1
* https://arxiv.org/help/oa/index
"""

import os
import gzip
import glob
import json
import time
import hashlib
import datetime
import requests
import xml.etree.ElementTree as ET

from arxiv_public_data.config import LOGGER, DIR_BASE

log = LOGGER.getChild('metadata')

URL_ARXIV_OAI = 'https://export.arxiv.org/oai2'
URL_CITESEER_OAI = 'http://citeseerx.ist.psu.edu/oai2'
OAI_XML_NAMESPACES = {
    'OAI': 'http://www.openarchives.org/OAI/2.0/',
    'arXiv': 'http://arxiv.org/OAI/arXivRaw/'
}
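
# --- Illustrative sketch, not part of the original module ---
# The namespace map above is what lets ElementTree resolve the 'OAI:' and
# 'arXiv:' prefixes when navigating an OAI-PMH response. A hypothetical helper
# showing the intended findall usage:
def _count_records_example(xml_text):
    """Hypothetical: count <record> elements in a raw ListRecords reply."""
    root = ET.fromstring(xml_text)
    records = root.findall('OAI:ListRecords/OAI:record', OAI_XML_NAMESPACES)
    return len(records)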


def get_list_record_chunk(resumptionToken=None,
                          harvest_url=URL_ARXIV_OAI,
                          metadataPrefix='arXivRaw'):
    """
    Query the OAI API for the metadata of 1000 arXiv articles
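
# --- Illustrative sketch, not part of the original function ---
# The body of get_list_record_chunk is truncated above. A minimal sketch of the
# request its docstring describes, assuming the standard OAI-PMH parameters
# (a resumptionToken and a metadataPrefix are mutually exclusive):
def _list_records_sketch(resumptionToken=None, harvest_url=URL_ARXIV_OAI):
    params = {'verb': 'ListRecords'}
    if resumptionToken:
        params['resumptionToken'] = resumptionToken
    else:
        params['metadataPrefix'] = 'arXivRaw'
    response = requests.get(harvest_url, params=params)
    response.raise_for_status()
    return response.text  # raw XML; parse with ET and OAI_XML_NAMESPACES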
Example #2
import shutil
import tarfile
import boto3
import hashlib
import requests
import subprocess

from functools import partial
from multiprocessing import Pool
from collections import defaultdict
import xml.etree.ElementTree as ET

from arxiv_public_data import fulltext
from arxiv_public_data.config import DIR_FULLTEXT, DIR_PDFTARS, LOGGER

logger = LOGGER.getChild('s3')

CHUNK_SIZE = 2**20  # 1MB
BUCKET_NAME = 'arxiv'
S3_PDF_MANIFEST = 'pdf/arXiv_pdf_manifest.xml'
S3_TEX_MANIFEST = 'src/arXiv_src_manifest.xml'
HEADERS = {'x-amz-request-payer': 'requester'}

s3 = boto3.client('s3', region_name='us-east-1')
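
# --- Illustrative sketch, not part of the original module ---
# The manifests above live in arXiv's requester-pays bucket, so every request
# must declare that the caller pays. A minimal sketch using boto3's get_object
# with RequestPayer; the <file>/<filename> element names are assumptions about
# the manifest layout:
def _list_pdf_tars_sketch():
    obj = s3.get_object(Bucket=BUCKET_NAME, Key=S3_PDF_MANIFEST,
                        RequestPayer='requester')
    root = ET.fromstring(obj['Body'].read())
    return [f.findtext('filename') for f in root.findall('file')]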

def download_file(filename, outfile, chunk_size=CHUNK_SIZE, redownload=False,
                  dryrun=False):
    """
    Download `filename` from the arXiv AWS S3 bucket and return the streaming
    md5 sum of its content
    Parameters
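
# --- Illustrative sketch, not part of the original function ---
# download_file is truncated above. A minimal sketch of the behaviour its
# docstring describes: stream the object in CHUNK_SIZE pieces, write them to
# disk, and update an md5 digest as the bytes arrive (names are hypothetical):
def _streaming_md5_download_sketch(key, outfile):
    md5 = hashlib.md5()
    obj = s3.get_object(Bucket=BUCKET_NAME, Key=key, RequestPayer='requester')
    with open(outfile, 'wb') as fout:
        for chunk in iter(lambda: obj['Body'].read(CHUNK_SIZE), b''):
            md5.update(chunk)
            fout.write(chunk)
    return md5.hexdigest()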
Example #3
from arxiv_public_data.oai_metadata import load_metadata
from arxiv_public_data.authors import parse_authorline_parallel
from arxiv_public_data.config import LOGGER

logger = LOGGER.getChild('authorsplit')

if __name__ == "__main__":
    import sys

    processes = int(sys.argv[1]) if len(sys.argv) > 1 else None

    logger.info('Loading OAI metadata...')
    metadata = load_metadata()
    article_authors = [[md.get('id'), md.get('authors')] for md in metadata]
    parse_authorline_parallel(article_authors, processes)
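
# --- Illustrative sketch, not part of the original script ---
# parse_authorline_parallel comes from the package imported above; this only
# sketches the multiprocessing pattern the script relies on, using a
# hypothetical per-article parser rather than the package's real one:
def _parse_one_sketch(article):
    """Hypothetical: split one raw author line into individual names."""
    article_id, authorline = article
    return article_id, [a.strip() for a in authorline.split(',') if a.strip()]

def _parse_parallel_sketch(article_authors, processes=None):
    from multiprocessing import Pool
    with Pool(processes) as pool:   # processes=None uses every available core
        return pool.map(_parse_one_sketch, article_authors)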
Example #4
#! /usr/bin/env python
import time
import re
import sys
import glob
import os
import gzip
import json
import math
from multiprocessing import Pool, cpu_count

from arxiv_public_data.regex_arxiv import REGEX_ARXIV_FLEXIBLE, clean
from arxiv_public_data.config import DIR_FULLTEXT, DIR_OUTPUT, LOGGER

log = LOGGER.getChild('fulltext')
RE_FLEX = re.compile(REGEX_ARXIV_FLEXIBLE)
RE_OLDNAME_SPLIT = re.compile(r"([a-z\-]+)(\d+)")


def path_to_id(path):
    """ Convert filepath name of ArXiv file to ArXiv ID """
    name = os.path.splitext(os.path.basename(path))[0]
    if '.' in name:  # new-style ID
        return name
    split = [a for a in RE_OLDNAME_SPLIT.split(name) if a]
    return "/".join(split)
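
# Illustrative examples (not in the original source) of the two ID styles that
# path_to_id handles:
#   path_to_id('data/1703.04514.txt')    -> '1703.04514'      (new-style ID)
#   path_to_id('data/hep-th9901001.txt') -> 'hep-th/9901001'  (old-style ID)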


def all_articles(directory=DIR_FULLTEXT):
    """ Find all *.txt files in directory """
    out = []
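
# --- Illustrative sketch, not part of the original function ---
# all_articles is truncated above; a minimal sketch of what its docstring
# describes (collect every *.txt file below `directory`), assuming the files
# may sit in nested subdirectories:
def _all_articles_sketch(directory=DIR_FULLTEXT):
    out = []
    for root, _, files in os.walk(directory):
        out.extend(os.path.join(root, f) for f in files if f.endswith('.txt'))
    return out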
Example #5
"""
tf_hub.py

Find text embeddings using pre-trained TensorFlow Hub models
"""

import os
import pickle
import numpy as np

from arxiv_public_data.config import DIR_OUTPUT, LOGGER
from arxiv_public_data.embeddings.util import batch_fulltext

logger = LOGGER.getChild('embds')

try:
    import tensorflow as tf
    import tensorflow_hub as hub
    import sentencepiece as spm
except ImportError as e:
    logger.warning("This module requires 'tensorflow', 'tensorflow-hub', and "
                   "'sentencepiece'\n"
                   'Please install these modules to use tf_hub.py')


UNIV_SENTENCE_ENCODER_URL = ('https://tfhub.dev/google/'
                             'universal-sentence-encoder/2')

ELMO_URL = "https://tfhub.dev/google/elmo/2"
ELMO_KWARGS = dict(signature='default', as_dict=True)
ELMO_MODULE_KWARGS = dict(trainable=True)
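
# --- Illustrative sketch, not part of the original module ---
# A minimal TF1-style sketch of how the hub modules named above are usually
# loaded and run; the '/2' module versions predate TF2, so this assumes
# graph-mode sessions (the sentences are placeholders):
def _encode_sentences_sketch(sentences):
    embed = hub.Module(UNIV_SENTENCE_ENCODER_URL)
    vectors = embed(sentences)          # Tensor of shape [len(sentences), 512]
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        return sess.run(vectors)

# ELMo is driven the same way, but through the keyword constants defined above:
#   elmo = hub.Module(ELMO_URL, **ELMO_MODULE_KWARGS)
#   token_embeddings = elmo(sentences, **ELMO_KWARGS)['elmo']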