def setup(self):
        self.heuristic = LastModified()
        self.time_now = time.time()
        day_in_seconds = 86400
        self.year_ago = self.last_modified(day_in_seconds * 365)
        self.week_ago = self.last_modified(day_in_seconds * 7)
        self.day_ago = self.last_modified(day_in_seconds)
        self.now = self.last_modified(0)

        # NOTE: We pass in a negative to get a positive... Probably
        #       should refactor.
        self.day_ahead = self.last_modified(-day_in_seconds)
Example #2
    def __init__(self):
        self._API_SERVER = "https://data.nba.net"

        self._cache_control_adapter = CacheControlAdapter(heuristic=LastModified())
        self._requests_session = requests.Session()
        # Mount the configured adapter so the LastModified heuristic is actually used.
        self._requests_session.mount('http://', self._cache_control_adapter)
        self._requests_session.mount('https://', self._cache_control_adapter)

        self._TEAM_TRICODES = frozenset(('CHA', 'ATL', 'IND', 'MEM', 'DET',
                                         'UTA', 'CHI', 'TOR', 'CLE', 'OKC',
                                         'DAL', 'MIN', 'BOS', 'SAS', 'MIA',
                                         'DEN', 'LAL', 'PHX', 'NOP', 'MIL',
                                         'HOU', 'NYK', 'ORL', 'SAC', 'PHI',
                                         'BKN', 'POR', 'GSW', 'LAC', 'WAS'))

        self._STAT_CATEGORIES = frozenset(('ppg', 'trpg', 'apg', 'fgp', 'ftp',
                                           'tpp', 'bpg', 'spg', 'tpg', 'pfpg'))

        self._CONFERENCES = frozenset(('west', 'east'))

        self._EASTERN_DIVISIONS = frozenset(('southeast', 'atlantic', 'central'))
        self._WESTERN_DIVISIONS = frozenset(('southwest', 'pacific', 'northwest'))
        self._DIVISIONS = {'west': self._WESTERN_DIVISIONS,
                           'east': self._EASTERN_DIVISIONS}

        # Cached dictionaries. Saving these copies avoids having to
        # re-parse JSONs when they are returned from the HTTP cache.
        self._person_ids = None
        self._team_ids_to_tricodes = None
        self._team_tricodes_to_ids = None
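
The "cached dictionaries" comment above describes a second, in-process layer on top of the HTTP cache: parse the JSON once and reuse the parsed mappings thereafter. A minimal sketch of that memoization pattern (the endpoint path and payload shape are illustrative assumptions, not the real data.nba.net API):

    def _get_team_mappings(self):
        # Parse the JSON only once; later calls reuse the cached dicts even
        # though the HTTP layer would also serve the body from its cache.
        if self._team_ids_to_tricodes is None:
            # Hypothetical endpoint and JSON shape, for illustration only.
            resp = self._requests_session.get(self._API_SERVER + '/prod/v2/2023/teams.json')
            teams = resp.json()['league']['standard']
            self._team_ids_to_tricodes = {t['teamId']: t['tricode'] for t in teams}
            self._team_tricodes_to_ids = {v: k for k, v in self._team_ids_to_tricodes.items()}
        return self._team_ids_to_tricodes, self._team_tricodes_to_ids
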
Example #3
class Settings:
    do_update_wikidata = True
    # Don't activate this, it's most likely broken
    do_update_wikipedia = False

    sparql_file = "free_software_items.rq"
    oauth_token_file = "github_oauth_token.txt"

    # pywikibot is too stupid to cache the calendar model, so let's do this manually
    calendarmodel = pywikibot.Site().data_repository().calendarmodel()
    wikidata_repo = pywikibot.Site("wikidata", "wikidata").data_repository()

    repo_regex = re.compile(r"https://github.com/[^/]+/[^/]+")
    version_regex = re.compile(r"\d+(\.\d+)+")
    unmarked_prerelease_regex = re.compile(
        r"[ -._\d](b|r|rc|beta|alpha)([ .\d].*)?$", re.IGNORECASE)

    cached_session = CacheControl(requests.Session(),
                                  cache=FileCache('cache', forever=True),
                                  heuristic=LastModified())

    properties = {
        "software version": "P348",
        "publication date": "P577",
        "retrieved": "P813",
        "reference URL": "P854",
        "official website": "P856",
        "source code repository": "P1324",
    }
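
For a quick sense of what the patterns above accept and reject, here is a self-contained check (the regexes are repeated verbatim so the snippet runs on its own):

import re

repo_regex = re.compile(r"https://github.com/[^/]+/[^/]+")
version_regex = re.compile(r"\d+(\.\d+)+")
unmarked_prerelease_regex = re.compile(
    r"[ -._\d](b|r|rc|beta|alpha)([ .\d].*)?$", re.IGNORECASE)

assert repo_regex.match("https://github.com/psf/requests")
assert version_regex.search("release v1.2.3").group() == "1.2.3"
assert unmarked_prerelease_regex.search("2.0rc1")   # prerelease without an explicit marker
assert not unmarked_prerelease_regex.search("2.0")  # a plain release does not match
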
Example #4
class Settings:
    do_update_wikidata = True
    # Don't activate this, it's most likely broken
    do_update_wikipedia = False

    normalize_url = True

    sparql_file = "free_software_items.rq"

    # pywikibot is too stupid to cache the calendar model, so let's do this manually
    calendarmodel = pywikibot.Site().data_repository().calendarmodel()
    wikidata_repo = pywikibot.Site("wikidata", "wikidata").data_repository()

    repo_regex = re.compile(r"^[a-z]+://github.com/[^/]+/[^/]+/?$")

    cached_session = CacheControl(
        requests.Session(),
        cache=FileCache("cache", forever=True),
        heuristic=LastModified(),
    )

    properties = {
        "software version": "P348",
        "publication date": "P577",
        "retrieved": "P813",
        "reference URL": "P854",
        "official website": "P856",
        "source code repository": "P1324",
        "title": "P1476",
        "protocol": "P2700",
    }
Example #5
    def __init__(self,
                 uri: str = None,
                 session: requests.Session = None,
                 seed: str = None):

        # Airtable and gssutils use slightly different field names.
        self.meta_field_mapping = {"published": "issued"}

        # Add an explicit on/off for temp scraping (based on presence of dataURL)
        self.temp_scrape = False

        # Use seed if provided
        if seed is not None:
            with open(seed, "r") as f:
                self.seed = json.load(f)
                if "dataURL" in self.seed:
                    logging.warning(
                        "A temporary dataURL has been specified; proceeding with a temp scrape."
                    )
                    self.temp_scrape = True
                if "landingPage" not in self.seed.keys():
                    raise MetadataError(
                        'We always need to provide a "landingPage" via the seed. Either'
                        " on its own or alongside a dataURL for temporary scrapes."
                    )
                uri = self.seed["landingPage"]
        else:
            self.seed = None

        self.uri = uri
        self.dataset = pmdcat.Dataset(uri)
        self.catalog = dcat.Catalog()
        self.dataset.modified = datetime.now(timezone.utc).astimezone()
        self.distributions = []

        if session:
            self.session = session
        elif "RECORD_MODE" in os.environ:
            # don't use cachecontrol, but we'll need to patch the session when used.
            self.session = requests.Session()
        else:
            self.session = CacheControl(
                requests.Session(),
                cache=FileCache(".cache"),
                serializer=BiggerSerializer(),
                heuristic=LastModified(),
            )

        if "JOB_NAME" in os.environ:
            self._base_uri = URIRef("http://gss-data.org.uk")
            self._dataset_id = pathify(os.environ["JOB_NAME"])
        else:
            self._base_uri = BNode()
            parsed_scrape_uri = urlparse(self.uri)
            self._dataset_id = (parsed_scrape_uri.netloc.replace(".", "/") +
                                parsed_scrape_uri.path)
        self.update_dataset_uris()
        self._run()
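
The else branch above derives a dataset id by flattening the landing page's host and path. A standalone sketch of that derivation, using an illustrative URI rather than one from the original project:

from urllib.parse import urlparse

uri = "https://www.example.org/statistics/alcohol"
parsed = urlparse(uri)
dataset_id = parsed.netloc.replace(".", "/") + parsed.path
assert dataset_id == "www/example/org/statistics/alcohol"
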
Example #6
    def __init__(self,
                 name: str = None,
                 description: str = None,
                 version: str = None):
        self.app_id = {'X-TBA-App-Id': ""}
        self.session = requests.Session()
        self.session = CacheControl(self.session, heuristic=LastModified())
        self.session.headers.update(self.app_id)
        if name is not None:
            self.set_api_key(name, description, version)
Example #7
    def setup(self):
        self.heuristic = LastModified()
        self.time_now = time.time()
        day_in_seconds = 86400
        self.year_ago = self.last_modified(day_in_seconds * 365)
        self.week_ago = self.last_modified(day_in_seconds * 7)
        self.day_ago = self.last_modified(day_in_seconds)
        self.now = self.last_modified(0)

        # NOTE: We pass in a negative to get a positive... Probably
        #       should refactor.
        self.day_ahead = self.last_modified(-day_in_seconds)
Example #8
    def __init__(self, uri: str = None, session: requests.Session = None, seed: str = None):

        # Airtable and gssutils use slightly different field names.
        self.meta_field_mapping = {
            "published": "issued"
        }

        # Add an explicit on/off for temp scraping (based on presence of dataURL)
        self.temp_scrape = False

        # Use seed if provided
        if seed is not None:
            with open(seed, "r") as f:
                self.seed = json.load(f)
                if "dataURL" in self.seed:
                    logging.warning("A temporary dataURL has been specified; proceeding with a temp scrape.")
                    uri = self.seed["dataURL"]
                    self.temp_scrape = True
                elif "landingPage" not in self.seed:
                    raise MetadataError("Aborting, insufficient seed data. No landing page supplied via "
                                        "info.json and no dataURL to use as a fallback.")
                else:
                    uri = self.seed["landingPage"]
        else:
            self.seed = None

        self.uri = uri
        self.dataset = pmdcat.Dataset(uri)
        self.catalog = dcat.Catalog()
        self.dataset.modified = datetime.now(timezone.utc).astimezone()
        self.distributions = []

        if session:
            self.session = session
        else:
            self.session = CacheControl(requests.Session(),
                                        cache=FileCache('.cache'),
                                        serializer=BiggerSerializer(),
                                        heuristic=LastModified())

        if 'JOB_NAME' in os.environ:
            self._base_uri = URIRef('http://gss-data.org.uk')
            self._dataset_id = pathify(os.environ['JOB_NAME'])
        else:
            self._base_uri = BNode()
            parsed_scrape_uri = urlparse(self.uri)
            self._dataset_id = parsed_scrape_uri.netloc.replace('.', '/') + parsed_scrape_uri.path
        self.update_dataset_uris()
        self._run()
Example #9
def csv_dialect(fd):
    snippet = fd.read(1024).encode('utf-8') if PY2 else fd.read(1024)
    fd.seek(0)
    return csv.Sniffer().sniff(snippet)


### HTTP utils ###

try:
    import requests
    from cachecontrol import CacheControl, CacheControlAdapter
    from cachecontrol.caches import FileCache
    from cachecontrol.heuristics import LastModified

    cache_dir = '%s/Library/Caches/PlotDevice'%os.environ['HOME']
    HTTP = CacheControl(requests.Session(), cache=FileCache(cache_dir), heuristic=LastModified())
except ImportError:
    class Decoy(object):
        def get(self, url):
            unsupported = 'could not find the "requests" library (try running "python setup.py build" first)'
            raise RuntimeError(unsupported)
    HTTP = Decoy()

def binaryish(content, format):
    bin_types = ('pdf','eps','png','jpg','jpeg','gif','tiff','tif','zip','tar','gz')
    bin_formats = ('raw','bytes','img','image')
    if any(b in content for b in bin_types):
        return True
    if format:
        return any(b in format for b in bin_types+bin_formats)
    return False
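
The try/except block above degrades gracefully: HTTP is always bound, and a missing dependency only surfaces when a download is actually attempted. A short sketch of how a caller might exercise it (the URL is illustrative):

try:
    resp = HTTP.get("https://example.com/data.csv")  # served from the on-disk cache when possible
except RuntimeError as err:
    print(err)  # Decoy's message when the optional "requests" stack is absent
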
Example #10
).downloadURL


# In[119]:


if is_interactive():
    import requests
    from cachecontrol import CacheControl
    from cachecontrol.caches.file_cache import FileCache
    from cachecontrol.heuristics import LastModified
    from pathlib import Path

    session = CacheControl(requests.Session(),
                           cache=FileCache('.cache'),
                           heuristic=LastModified())

    sourceFolder = Path('in')
    sourceFolder.mkdir(exist_ok=True)

    inputURL = 'https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/Alcohol_Tables_17.xls'
    inputFile = sourceFolder / 'Alcohol_Tables_17.xls'
    response = session.get(inputURL)
    with open(inputFile, 'wb') as f:
        f.write(response.content)
    tab = loadxlstabs(inputFile, sheetids='Table 2')[0]


# In[120]:

Example #11
class TestModifiedUnitTests(object):

    def last_modified(self, period):
        return time.strftime(TIME_FMT, time.gmtime(self.time_now - period))

    def setup(self):
        self.heuristic = LastModified()
        self.time_now = time.time()
        day_in_seconds = 86400
        self.year_ago = self.last_modified(day_in_seconds * 365)
        self.week_ago = self.last_modified(day_in_seconds * 7)
        self.day_ago = self.last_modified(day_in_seconds)
        self.now = self.last_modified(0)

        # NOTE: We pass in a negative to get a positive... Probably
        #       should refactor.
        self.day_ahead = self.last_modified(-day_in_seconds)

    def test_no_expiry_is_inferred_when_no_last_modified_is_present(self):
        assert self.heuristic.update_headers(DummyResponse(200, {})) == {}

    def test_expires_is_not_replaced_when_present(self):
        resp = DummyResponse(200, {"Expires": self.day_ahead})
        assert self.heuristic.update_headers(resp) == {}

    def test_last_modified_is_used(self):
        resp = DummyResponse(200, {"Date": self.now, "Last-Modified": self.week_ago})
        modified = self.heuristic.update_headers(resp)
        assert ["expires"] == list(modified.keys())
        assert datetime(*parsedate(modified["expires"])[:6]) > datetime.now()

    def test_last_modified_is_not_used_when_cache_control_present(self):
        resp = DummyResponse(
            200,
            {
                "Date": self.now,
                "Last-Modified": self.week_ago,
                "Cache-Control": "private",
            },
        )
        assert self.heuristic.update_headers(resp) == {}

    def test_last_modified_is_not_used_when_status_is_unknown(self):
        resp = DummyResponse(299, {"Date": self.now, "Last-Modified": self.week_ago})
        assert self.heuristic.update_headers(resp) == {}

    def test_last_modified_is_used_when_cache_control_public(self):
        resp = DummyResponse(
            200,
            {
                "Date": self.now,
                "Last-Modified": self.week_ago,
                "Cache-Control": "public",
            },
        )
        modified = self.heuristic.update_headers(resp)
        assert ["expires"] == list(modified.keys())
        assert datetime(*parsedate(modified["expires"])[:6]) > datetime.now()

    def test_warning_not_added_when_response_more_recent_than_24_hours(self):
        resp = DummyResponse(200, {"Date": self.now, "Last-Modified": self.week_ago})
        assert self.heuristic.warning(resp) is None

    def test_warning_is_not_added_when_heuristic_was_not_used(self):
        resp = DummyResponse(200, {"Date": self.now, "Expires": self.day_ahead})
        assert self.heuristic.warning(resp) is None

    def test_expiry_is_no_more_that_twenty_four_hours(self):
        resp = DummyResponse(200, {"Date": self.now, "Last-Modified": self.year_ago})
        modified = self.heuristic.update_headers(resp)
        assert ["expires"] == list(modified.keys())
        assert self.day_ahead == modified["expires"]
Example #12
class TestModifiedUnitTests(object):
    def last_modified(self, period):
        return time.strftime(TIME_FMT, time.gmtime(self.time_now - period))

    def setup(self):
        self.heuristic = LastModified()
        self.time_now = time.time()
        day_in_seconds = 86400
        self.year_ago = self.last_modified(day_in_seconds * 365)
        self.week_ago = self.last_modified(day_in_seconds * 7)
        self.day_ago = self.last_modified(day_in_seconds)
        self.now = self.last_modified(0)

        # NOTE: We pass in a negative to get a positive... Probably
        #       should refactor.
        self.day_ahead = self.last_modified(-day_in_seconds)

    def test_no_expiry_is_inferred_when_no_last_modified_is_present(self):
        assert self.heuristic.update_headers(DummyResponse(200, {})) == {}

    def test_expires_is_not_replaced_when_present(self):
        resp = DummyResponse(200, {"Expires": self.day_ahead})
        assert self.heuristic.update_headers(resp) == {}

    def test_last_modified_is_used(self):
        resp = DummyResponse(200, {
            "Date": self.now,
            "Last-Modified": self.week_ago
        })
        modified = self.heuristic.update_headers(resp)
        assert ["expires"] == list(modified.keys())
        assert datetime(*parsedate(modified["expires"])[:6]) > datetime.now()

    def test_last_modified_is_not_used_when_cache_control_present(self):
        resp = DummyResponse(
            200,
            {
                "Date": self.now,
                "Last-Modified": self.week_ago,
                "Cache-Control": "private",
            },
        )
        assert self.heuristic.update_headers(resp) == {}

    def test_last_modified_is_not_used_when_status_is_unknown(self):
        resp = DummyResponse(299, {
            "Date": self.now,
            "Last-Modified": self.week_ago
        })
        assert self.heuristic.update_headers(resp) == {}

    def test_last_modified_is_used_when_cache_control_public(self):
        resp = DummyResponse(
            200,
            {
                "Date": self.now,
                "Last-Modified": self.week_ago,
                "Cache-Control": "public",
            },
        )
        modified = self.heuristic.update_headers(resp)
        assert ["expires"] == list(modified.keys())
        assert datetime(*parsedate(modified["expires"])[:6]) > datetime.now()

    def test_warning_not_added_when_response_more_recent_than_24_hours(self):
        resp = DummyResponse(200, {
            "Date": self.now,
            "Last-Modified": self.week_ago
        })
        assert self.heuristic.warning(resp) is None

    def test_warning_is_not_added_when_heuristic_was_not_used(self):
        resp = DummyResponse(200, {
            "Date": self.now,
            "Expires": self.day_ahead
        })
        assert self.heuristic.warning(resp) is None

    def test_expiry_is_no_more_that_twenty_four_hours(self):
        resp = DummyResponse(200, {
            "Date": self.now,
            "Last-Modified": self.year_ago
        })
        modified = self.heuristic.update_headers(resp)
        assert ["expires"] == list(modified.keys())
        assert self.day_ahead == modified["expires"]
Example #13
    def setup(self):
        self.sess = Session()
        self.cached_sess = CacheControl(self.sess, heuristic=LastModified())
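
The test classes shown above reference a TIME_FMT constant and a DummyResponse helper that the excerpts never include. Judging purely from how they are used here, minimal stand-ins might look like this (a sketch, not the project's actual fixtures):

TIME_FMT = "%a, %d %b %Y %H:%M:%S GMT"  # RFC 1123 dates, as produced by time.strftime above

class DummyResponse:
    # Just enough of a response surface for LastModified to inspect:
    # an integer status code and a mapping of headers.
    def __init__(self, status, headers):
        self.status = status
        self.headers = headers
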
Example #14
import os, datetime
import requests
from .data.rfeed import Item, Feed
from flask import Flask, jsonify, request as flask_request
from cachecontrol import CacheControlAdapter
from cachecontrol.heuristics import LastModified

app = Flask(__name__)

adapter = CacheControlAdapter(heuristic=LastModified())

sess = requests.Session()
sess.mount('http://', adapter)
sess.mount('https://', adapter)

SERVICE_NAME = os.path.splitext(os.path.basename(__file__))[0]


@app.route("/rss/summary", methods=['GET'])
def latest_articles():
    if flask_request.method == 'GET':
        response = sess.get('http://localhost/article/collect/10')
        article_collection = []
        if response.status_code == requests.codes.ok:
            articles = response.json()['success']
            for article in articles:
                article_collection.append(
                    Item(
                        title=article['title'],
                        author=article['author'],
                        pubDate=datetime.datetime.strptime(