Example #1
    def scrape(self, session):
        """Iterates through all the sites in url_dict to extract new documents.

        This is implemented as follows:
          1. Download each of the pages.
          2. Extract the URLs from the pages.
          3. Check which of those URLs are not yet in our database.
          4. For each of the URLs that are not yet in our database, 
             add them as a new Bid object to the database.
        """
        scraper = scrapelib.Scraper()
        for url, xpaths in self.url_dict.items():
            page = scraper.get(URL_PREFIX + url)
            # doc_ids is dictionary: relative URL => title of doc
            doc_ids = \
                results_page_scraper.scrape_results_page(page.content, xpaths)
            log.info("Found docs: {}".format(doc_ids))
            new_urls = get_new_urls(
                session,
                doc_ids.keys(),  # relative URL is the identifier
                self.get_site()
            )
            log.info("New docs: {}".format(new_urls))
            new_docs = self.add_new_documents(new_urls, doc_ids)
            session.add_all(new_docs)
            # Save all the new docs from this results page in one db call.
            session.commit()
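
Example #1 delegates the "not yet in our database" check (step 3 of its docstring) to a get_new_urls helper that is not shown here. A minimal sketch of what such a helper might look like, assuming a SQLAlchemy session and a Bid model with site and identifier columns (these names are assumptions, not the project's actual code):

# Hypothetical sketch only; the real get_new_urls and Bid model are not shown above.
def get_new_urls(session, candidate_urls, site):
    """Return the subset of candidate_urls with no Bid row for this site."""
    known = {
        row.identifier
        for row in session.query(Bid.identifier).filter(Bid.site == site)
    }
    return [url for url in candidate_urls if url not in known]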
Example #2
def import_versions(state, rpm=60):
    scraper = scrapelib.Scraper(requests_per_minute=rpm)

    for bill in db.bills.find({'state': state}):
        logging.info("Importing %s" % bill['bill_id'])

        bill_changed = False
        for version in bill['versions']:
            if 'document_id' in version or 'url' not in version:
                continue

            doc = scraper.urlopen(version['url'])

            metadata = {'bill': {'state': bill['state'],
                                 'chamber': bill['chamber'],
                                 'session': bill['session'],
                                 'bill_id': bill['bill_id'],
                                 'title': bill['title']},
                        'name': version['name'],
                        'url': version['url']}

            content_type = doc.response.headers['content-type']

            version['document_id'] = put_document(doc, content_type,
                                                  metadata)
            bill_changed = True

        if bill_changed:
            db.bills.save(bill, safe=True)
Example #3
    def scrape_legislator_list(self, session_num):
        url = ("http://www.legis.state.ak.us/publicservice/basis/members"
               "?minifyresult=false&session=" + session_num)
        xml = scrapelib.Scraper().get(url).content
        for line in lxml.etree.fromstring(xml).xpath("//Member/MemberDetails"):
            person = self.handle_list_item(line, session_num)
            yield person
Example #4
def start_data(search_url, params=None):
    browser = webdriver.Chrome(options=OPTIONS)
    browser.set_page_load_timeout(-1)
    browser.implicitly_wait(15)

    if params is None:
        browser.get(BASE_URL + search_url)
    else:
        browser.get(BASE_URL + search_url + "?" +
                    urllib.parse.urlencode(params))

    download_link = browser.find_element_by_id('download-button')
    payload = {
        'cacheId': download_link.get_attribute('data-cacheid'),
        'typeOfReport': download_link.get_attribute('data-typeofreport'),
        'token': str(datetime.datetime.now())
    }

    scraper = scrapelib.Scraper(retry_attempts=20)
    response = scraper.post(
        'https://www.nlrb.gov/nlrb-downloads/start-download/' +
        payload['typeOfReport'] + '/' + payload['cacheId'] + '/' +
        payload['token'])

    result = response.json()['data']
    return result
Example #5
    def __init__(self, username, password):
        self.headers = {'Referer': self._REFERER, 'User-Agent': self._USER_AGENT}
        self.cookies = dict()
        self.client_data = {'client_id': self._CLIENT_ID, 'client_secret': self._CLIENT_SECRET}
        self.scraper = scrapelib.Scraper(retry_attempts=3)

        self._authenticate(username=username, password=password)
Example #6
File: dump.py  Project: showerst/billy
    def dump(self, abbr, filename, validate, schema_dir):
        scraper = scrapelib.Scraper(requests_per_minute=600, retry_attempts=3)

        zip = zipfile.ZipFile(filename, 'w', zipfile.ZIP_DEFLATED, allowZip64=True)

        if not schema_dir:
            cwd = os.path.split(__file__)[0]
            schema_dir = os.path.join(cwd, "../../schemas/api/")

        with open(os.path.join(schema_dir, "bill.json")) as f:
            bill_schema = json.load(f)

        with open(os.path.join(schema_dir, "legislator.json")) as f:
            legislator_schema = json.load(f)

        with open(os.path.join(schema_dir, "committee.json")) as f:
            committee_schema = json.load(f)

        # write out metadata
        response = scraper.get(api_url('metadata/%s' % abbr)).content
        zip.writestr('metadata.json', response)

        logging.info('exporting %s legislators...' % abbr)
        for legislator in db.legislators.find({settings.LEVEL_FIELD: abbr}):
            path = 'legislators/%s' % legislator['_id']
            url = api_url(path)

            response = scraper.get(url).content
            if validate:
                validictory.validate(json.loads(response), legislator_schema,
                                     validator_cls=APIValidator)

            zip.writestr(path, response)

        logging.info('exporting %s committees...' % abbr)
        for committee in db.committees.find({settings.LEVEL_FIELD: abbr}):
            path = 'committees/%s' % committee['_id']
            url = api_url(path)

            response = scraper.get(url).content
            if validate:
                validictory.validate(json.loads(response), committee_schema,
                                     validator_cls=APIValidator)

            zip.writestr(path, response)

        logging.info('exporting %s bills...' % abbr)
        for bill in db.bills.find({settings.LEVEL_FIELD: abbr}, timeout=False):
            path = "bills/%s/%s/%s/%s" % (abbr, bill['session'],
                                          bill['chamber'], bill['bill_id'])
            url = api_url(path)

            response = scraper.get(url).content
            if validate:
                validictory.validate(json.loads(response), bill_schema,
                                     validator_cls=APIValidator)

            zip.writestr(path, response)

        zip.close()
Example #7
def dump_json(abbr, filename, validate, schema_dir):
    scraper = scrapelib.Scraper(requests_per_minute=600, follow_robots=False)
    level = metadata(abbr)['level']

    zip = zipfile.ZipFile(filename, 'w', zipfile.ZIP_DEFLATED)

    if not schema_dir:
        cwd = os.path.split(__file__)[0]
        schema_dir = os.path.join(cwd, "../schemas/api/")

    with open(os.path.join(schema_dir, "bill.json")) as f:
        bill_schema = json.load(f)

    with open(os.path.join(schema_dir, "legislator.json")) as f:
        legislator_schema = json.load(f)

    with open(os.path.join(schema_dir, "committee.json")) as f:
        committee_schema = json.load(f)

    logging.info('dumping %s bills...' % abbr)
    for bill in db.bills.find({'level': level, level: abbr}, timeout=False):
        path = "bills/%s/%s/%s/%s" % (abbr, bill['session'], bill['chamber'],
                                      bill['bill_id'])
        url = api_url(path)

        response = scraper.urlopen(url)
        if validate:
            validictory.validate(json.loads(response),
                                 bill_schema,
                                 validator_cls=APIValidator)

        zip.writestr(path, response)

    logging.info('dumping %s legislators...' % abbr)
    for legislator in db.legislators.find({'level': level, level: abbr}):
        path = 'legislators/%s' % legislator['_id']
        url = api_url(path)

        response = scraper.urlopen(url)
        if validate:
            validictory.validate(json.loads(response),
                                 legislator_schema,
                                 validator_cls=APIValidator)

        zip.writestr(path, response)

    logging.info('dumping %s committees...' % abbr)
    for committee in db.committees.find({'level': level, level: abbr}):
        path = 'committees/%s' % committee['_id']
        url = api_url(path)

        response = scraper.urlopen(url)
        if validate:
            validictory.validate(json.loads(response),
                                 committee_schema,
                                 validator_cls=APIValidator)

        zip.writestr(path, response)

    zip.close()
Example #8
def get_latest():
    """
    Get and load the latest SQL dumps from the California legislature.
    """
    scraper = scrapelib.Scraper()

    meta = db.metadata.find_one({'_id': 'ca'})
    last_update = meta['_last_update']

    base_url = "ftp://www.leginfo.ca.gov/pub/bill/"
    with scraper.urlopen(base_url) as page:
        next_day = last_update + datetime.timedelta(days=1)

        while next_day.date() < datetime.date.today():
            for f in parse_directory_listing(page):
                if (re.match(r'pubinfo_(Mon|Tue|Wed|Thu|Fri|Sat|Sun)\.zip',
                             f['filename'])
                        and f['mtime'].date() == next_day.date()):

                    url = base_url + f['filename']
                    print "Getting %s" % url
                    get_and_load(url)

                    meta['_last_update'] = next_day
                    db.metadata.save(meta, safe=True)
                    break
            else:
                print "Couldn't find entry for %s" % str(next_day.date())
                break

            next_day = next_day + datetime.timedelta(days=1)
Example #9
def session_list():
    html = scrapelib.Scraper().get('http://www.sdlegislature.gov/'
                                   'Legislative_Session/Menu.aspx').text
    doc = lxml.html.fromstring(html)
    sessions = doc.xpath('//div[@id="ctl00_ContentPlaceHolder1_BlueBoxLeft"]//ul/li'
        '/a/div/text()')
    return sessions
Example #10
    def run(self):
        # We first will ensure the cache and data directories exist
        if not os.path.exists(CACHE_DIR):
            os.makedirs(CACHE_DIR)
        if not os.path.exists(DATA_DIR):
            os.makedirs(DATA_DIR)

        # We use scrapelib as we are unsure of the integrity of the server we will be pulling from
        s = scrapelib.Scraper(retry_wait_seconds=5, retry_attempts=10)

        # Enable caching so we don't repeat downloads
        s.cache_storage = scrapelib.FileCache(CACHE_DIR)
        s.cache_write_only = False

        # Simple download function
        def download_entity(s, filename):
            """ Download an asset """
            logging.info('Downloading %s from %s' %
                         (filename, join(SOURCE_URL, filename)))
            s.urlretrieve('%s/%s' % (SOURCE_URL, filename),
                          '%s/%s' % (self.output().path, filename))

        # Download the data!
        os.system('mkdir -p "%s"' % self.output().path)
        for filename in CANDIDATE_SOURCE_FILES.values():
            download_entity(s, filename)

        for filename in COMMITTEE_SOURCE_FILES.values():
            download_entity(s, filename)

        for filename in META_SOURCE_FILES.values():
            download_entity(s, filename)
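
The cache settings in Example #10 are what make re-runs cheap; a minimal standalone sketch of the same scrapelib file-cache pattern (the URL below is a placeholder, and CACHE_DIR is assumed from the task above):

# Sketch of scrapelib's file cache; url is illustrative, CACHE_DIR assumed above.
s = scrapelib.Scraper(retry_wait_seconds=5, retry_attempts=10)
s.cache_storage = scrapelib.FileCache(CACHE_DIR)
s.cache_write_only = False           # also read from the cache, not just write to it
url = 'https://example.com/data.csv'
first = s.get(url)                   # network request; response stored in CACHE_DIR
second = s.get(url)                  # answered from the cache, no second request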
Example #11
def session_list():
    import scrapelib
    import lxml.html
    html = scrapelib.Scraper().get(
        'http://legis.sd.gov/Legislative_Session/Menu.aspx').text
    doc = lxml.html.fromstring(html)
    return doc.xpath(
        '//div[@id="ContentPlaceHolder1_BlueBoxLeft"]//ul/li/a/div/text()')
Example #12
    def get_session_list(self):
        import scrapelib
        import lxml.html

        url = 'http://www.legis.nd.gov/assembly/'
        html = scrapelib.Scraper().get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        return doc.xpath("//div[@class='view-content']//a/text()")
Example #13
def session_list():
    import scrapelib
    text = scrapelib.Scraper().get('ftp://ftp.cga.ct.gov').text
    sessions = [line.split()[-1] for line in text.splitlines()]

    for not_session_name in ('incoming', 'pub', 'CGAAudio', 'rba', 'NCSL',
                             "apaac"):
        sessions.remove(not_session_name)
    return sessions
Example #14
def download(url):
    scraper = scrapelib.Scraper()
    with scraper.urlopen(url) as resp:
        (fd, path) = tempfile.mkstemp('.zip')

        with os.fdopen(fd, 'wb') as w:
            w.write(resp)

        return path
Example #15
    def get_session_list(self):
        html = scrapelib.Scraper().get('http://www.sdlegislature.gov/'
                                       'Legislative_Session/archive.aspx').text
        doc = lxml.html.fromstring(html)
        sessions = [
            x.strip()
            for x in doc.xpath('//table//td[@data-title="Year"]/text()')
        ]
        return sessions
Example #16
def dump_json(state, filename, validate, schema_dir):
    scraper = scrapelib.Scraper(requests_per_minute=600)

    zip = zipfile.ZipFile(filename, 'w')

    if not schema_dir:
        cwd = os.path.split(__file__)[0]
        schema_dir = os.path.join(cwd, "../schemas/api/")

    with open(os.path.join(schema_dir, "bill.json")) as f:
        bill_schema = json.load(f)

    with open(os.path.join(schema_dir, "legislator.json")) as f:
        legislator_schema = json.load(f)

    with open(os.path.join(schema_dir, "committee.json")) as f:
        committee_schema = json.load(f)

    for bill in db.bills.find({'state': state}):
        path = "bills/%s/%s/%s/%s" % (state, bill['session'], bill['chamber'],
                                      bill['bill_id'])
        url = api_url(path)

        response = scraper.urlopen(url)
        if validate:
            validictory.validate(json.loads(response),
                                 bill_schema,
                                 validator_cls=APIValidator)

        zip.writestr(path, response)

    for legislator in db.legislators.find({'state': state}):
        path = 'legislators/%s' % legislator['_id']
        url = api_url(path)

        response = scraper.urlopen(url)
        if validate:
            validictory.validate(json.loads(response),
                                 legislator_schema,
                                 validator_cls=APIValidator)

        zip.writestr(path, response)

    for committee in db.committees.find({'state': state}):
        path = 'committees/%s' % committee['_id']
        url = api_url(path)

        response = scraper.urlopen(url)
        if validate:
            validictory.validate(json.loads(response),
                                 committee_schema,
                                 validator_cls=APIValidator)

        zip.writestr(path, response)

    zip.close()
Example #17
File: core.py  Project: resistbot/people
    def __init__(self,
                 initial_page,
                 page_processors=None,
                 *,
                 scraper: scrapelib.Scraper = None):
        self.initial_page = initial_page
        if not isinstance(page_processors, (list, tuple)):
            self.page_processors = [page_processors]
        else:
            self.page_processors = page_processors
        if not scraper:
            self.scraper = scrapelib.Scraper()
        else:
            self.scraper = scraper
Example #18
    def do_scrape(
        self,
        scraper: typing.Optional[scrapelib.Scraper] = None
    ) -> typing.Iterable[typing.Any]:
        """
        yield results from this page and any subpages

        :param scraper: Optional `scrapelib.Scraper` instance to use for running scrape.
        :returns: Generator yielding results from the scrape.
        """
        if scraper is None:
            scraper = scrapelib.Scraper()
        yield from self._to_items(scraper)
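
As a usage illustration (not taken from the source above), a concrete page subclass could be driven with a shared, pre-configured Scraper so rate limits and retries apply to every request; ExamplePage here is an assumed subclass:

# Hypothetical usage; ExamplePage stands in for a concrete subclass of this page type.
scraper = scrapelib.Scraper(requests_per_minute=60, retry_attempts=3)
for item in ExamplePage().do_scrape(scraper=scraper):
    print(item)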
Example #19
    def get_session_list(self):
        import scrapelib
        import lxml.html

        url = "http://www.legis.nd.gov/assembly/"
        html = scrapelib.Scraper().get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        sessions = doc.xpath("//div[@class='view-content']//a/text()")
        sessions = [
            session for session in sessions if "Territorial Assembly" not in session
        ]
        return sessions
Example #20
    def download_files(self):
        s = scrapelib.Scraper(requests_per_minute=self.req_per_min,
                              retry_attempts=2)

        # enable caching on scrapelib
        cache_dir = os.path.join(os.getcwd(), 'cache')
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)

        s.cache_storage = scrapelib.FileCache(cache_dir)
        s.cache_write_only = False
        # TODO : update scrapelib to check last modified header

        with closing(
                shelve.open(os.path.join(self.import_dir,
                                         self.shelf_file))) as db:
            for key in db.keys():
                dir_for_solnbr = self.create_dir_by_solnbr(key)

                attachments = db[key]['attachments']

                for (i, a) in enumerate(attachments):
                    self.log.info("Downloading file ({}: {}) from {}".format(
                        a['filename'], a['desc'], a['url']))

                    # parse URL into components
                    u = urlparse(a['url'])

                    # match main portion to dict of special cases, get function to use
                    downloader_func = downloaders.func_map.get(
                        u.netloc, downloaders.default)

                    try:
                        local_file_path = downloader_func(s,
                                                          a['url'],
                                                          dir_for_solnbr,
                                                          solnbr=key)
                        a.update({'local_file_path': local_file_path})
                    except Exception:
                        self.log.exception(
                            "Attachment couldn't be retrieved for unknown reasons. URL: {} Continuing."
                            .format(a['url']))
                        a.update({'exception': 1})
                        continue
                    finally:
                        attachments[i] = a

                meta = {'dl_complete': True, 'num_dl': len(attachments)}
                db[key] = {'attachments': attachments, 'meta': meta}
Example #21
    def get_session_list(self):
        html = scrapelib.Scraper().get('http://www.sdlegislature.gov/'
                                       'Legislative_Session/archive.aspx').text
        doc = lxml.html.fromstring(html)
        sessions = [x.strip() for x in doc.xpath('//table//td[@data-title="Year"]/text()')]

        # Archive page lacks the latest session
        current_session_url = doc.xpath(
            '//*[@id="ctl00_divHeader_mnuMain"]/li[6]/ul/li[1]/a/@href')[0]
        current_session = current_session_url.replace(
            '/Legislative_Session/Bills/Default.aspx?Session=', '')
        if current_session not in sessions:
            sessions.append(current_session)

        return sessions
Example #22
def get_data(file_name, search_url, params=None):

    attempts = 0
    while True:
        print("Attempt " + str(attempts + 1))
        try:
            result = start_data(search_url=search_url, params=params)
            break
        except:
            attempts += 1
            sleep(10)
            if attempts > 40:
                print("Unable to download")
                raise

    previous = 0
    s = scrapelib.Scraper(retry_attempts=20)

    with tqdm.tqdm(total=result['total'],
                   desc='NLRB.gov preparing download') as pbar:
        while not result['finished']:
            response = s.get(BASE_URL + '/nlrb-downloads/progress/' +
                             str(result['id']))
            result = response.json()['data']

            # update progress bar
            current = result['processed']
            pbar.update(current - previous)
            previous = current
            #sleep(2)

    print(BASE_URL + result['filename'])
    attempts = 0
    while True:
        file_out = s.get(BASE_URL + result['filename'])
        if file_out.content[0:6] == b'Region' or file_out.content[
                0:5] == b'"Case':
            break
        else:
            if attempts > 30:
                raise Exception("Cannot download")
            attempts += 1
            sleep(10)

    with open(DOWNLOAD_FOLDER + "/" + file_name, 'wb') as f:
        f.write(file_out.content)
Example #23
def main(abbr):

    request_defaults = {
        # 'proxies': {"http": "localhost:8888"},
        'timeout': 5.0,
        'headers': {
            'Accept': ('text/html,application/xhtml+xml,application/'
                       'xml;q=0.9,*/*;q=0.8'),
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'en-us,en;q=0.5',
            'User-Agent': ('Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0.2) '
                           'Gecko/20100101 Firefox/10.0.2'),
        },
        'follow_robots': False,

        # Note, this script needs run in the same dir as billy_settings.py
    }

    logger = logbook.Logger()
    DATA = join(settings.BILLY_DATA_DIR, abbr, 'billtext')

    try:
        os.makedirs(DATA)
    except OSError:
        pass
    logger.info('writing files to %r' % DATA)

    session = scrapelib.Scraper(cache_obj=scrapelib.FileCache('cache'),
                                cache_write_only=False,
                                use_cache_first=True,
                                requests_per_minute=0,
                                **request_defaults)

    for bill in db.bills.find({'state': abbr}):
        if len(bill['versions']):
            bill_id = bill['bill_id']
            url = bill['versions'][0]['url']
            logger.info('trying %r: %r' % (bill_id, url))
            text = session.get(url).text
            with open(join(DATA, bill['_id']), 'w') as f:
                f.write(text.encode('utf-8'))
Example #24
def scrape(old_data_path, no_download):
    """
    Main function -- read in existing data, scrape new data, merge the two
    sets, and save to the output location.
    """
    scraper = scrapelib.Scraper(requests_per_minute=180,
                                retry_attempts=5,
                                retry_wait_seconds=15)

    if old_data_path:
        existing_parolees = get_existing_parolees(old_data_path)
    else:
        existing_parolees = {}

    if no_download:
        new_parolees = []
    else:
        new_parolees = scrape_interviews(scraper)
        new_parolees = scrape_details(scraper, new_parolees)

    for parolee in new_parolees:
        din = parolee[u"din"]
        interview_date = parolee[u"parole board interview date"]
        key = (din, interview_date)

        # Clear out any hearing date that corresponds to a hearing that hadn't
        # yet happened.  This occurs because specific dates aren't set in
        # advance -- only a month and year.  This is notated via the date
        # "YYYY-MM-*".  However, once the interview has happened, we have
        # a date and should replace that row.
        scheduled_date = '-'.join(interview_date.split('-')[0:2]) + '-*'
        scheduled_key = (din, scheduled_date)
        if key != scheduled_key:
            if scheduled_key in existing_parolees:
                del existing_parolees[scheduled_key]

        existing_parolees[key] = parolee

    print_data(existing_parolees.values())
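
The "YYYY-MM-*" placeholder logic in Example #24 is easiest to see with concrete values (the DIN and dates below are made up for illustration):

# Illustration only; values are invented.
din = "11A2222"
interview_date = "2016-04-19"                       # the now-known real date
key = (din, interview_date)                         # ("11A2222", "2016-04-19")
scheduled_date = '-'.join(interview_date.split('-')[0:2]) + '-*'
scheduled_key = (din, scheduled_date)               # ("11A2222", "2016-04-*")
# If the placeholder row ("2016-04-*") exists, it is dropped in favor of the dated row.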
Example #25
    def dump(self, abbr, filename):
        scraper = scrapelib.Scraper(requests_per_minute=600, retry_attempts=3)

        zip = zipfile.ZipFile(filename, 'w', zipfile.ZIP_DEFLATED, allowZip64=True)

        # write out metadata
        response = scraper.get(api_url('metadata/%s' % abbr)).content
        zip.writestr('metadata.json', response)

        logging.info('exporting %s legislators...' % abbr)
        for legislator in db.legislators.find({settings.LEVEL_FIELD: abbr}):
            path = 'legislators/%s' % legislator['_id']
            url = api_url(path)

            response = scraper.get(url).content

            zip.writestr(path, response)

        logging.info('exporting %s committees...' % abbr)
        for committee in db.committees.find({settings.LEVEL_FIELD: abbr}):
            path = 'committees/%s' % committee['_id']
            url = api_url(path)

            response = scraper.get(url).content

            zip.writestr(path, response)

        logging.info('exporting %s bills...' % abbr)
        for bill in db.bills.find({settings.LEVEL_FIELD: abbr}, timeout=False):
            path = "bills/%s/%s/%s/%s" % (abbr, bill['session'],
                                          bill['chamber'], bill['bill_id'])
            url = api_url(path)

            response = scraper.get(url).content

            zip.writestr(path, response)

        zip.close()
Example #26
    def handle(self, *args, **options):
        self.scraper = scrapelib.Scraper()
        self.base_url = "https://efile.fara.gov/pls/apex/"
        # Right now we have no way of displaying anything but 15 at a time;
        # this shouldn't be changed without a much better understanding of
        # how they're passing parameters.
        self.per_page = 15
        start_record = 0
        end_record = 0
        total_records = 1
        while end_record < total_records:
            page_param = "pgR_min_row={}max_rows={}rows_fetched={}"
            page_param = page_param.format(end_record + 1, self.per_page,
                                           self.per_page)
            next_page = self.get_page(page_param)
            data_table = next_page.find('div', {'id': 'apexir_DATA_PANEL'})
            self.process_records(data_table.findAll("tr"))
            page_info = next_page.find("td", {"class": "pagination"}).text
            page_info = page_info.replace('of', '-')
            record_info = page_info.split('-')
            start_record, end_record, total_records = [
                int(r.strip()) for r in record_info
            ]
            num_records_on_page = end_record - start_record + 1
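
The pagination parsing in Example #26 is clearer with a concrete value; the exact wording of FARA's pagination cell is assumed here, purely for illustration:

# Illustration only; assumes the pagination cell reads "16 - 30 of 2283".
page_info = "16 - 30 of 2283"
page_info = page_info.replace('of', '-')            # "16 - 30 - 2283"
start_record, end_record, total_records = [
    int(r.strip()) for r in page_info.split('-')
]
# start_record == 16, end_record == 30, total_records == 2283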
Example #27
# read in an opt-in config file for changing directories and supplying email settings
# returns None if it's not there, and this should always be handled gracefully
path = "config.yml"
if os.path.exists(path):
    # Don't use a cached config file, just in case, and direct_yaml_load is not yet defined.
    import yaml
    config = yaml.load(open(path))
else:
    config = None

eastern_time_zone = timezone('US/Eastern')

# scraper should be instantiated at class-load time, so that it can rate limit appropriately
scraper = scrapelib.Scraper(requests_per_minute=120,
                            follow_robots=False,
                            retry_attempts=3)
scraper.user_agent = "unitedstates/congress (https://github.com/unitedstates/congress)"


def format_datetime(obj):
    if isinstance(obj, datetime.datetime):
        return eastern_time_zone.localize(
            obj.replace(microsecond=0)).isoformat()
    elif isinstance(obj, datetime.date):
        return obj.isoformat()
    elif isinstance(obj, (str, unicode)):
        return obj
    else:
        return None
Example #28
import logging
import re
import yaml
from bs4 import BeautifulSoup
from datetime import datetime
import ssl
import requests
import urllib.parse
import io
import gzip
import certifi

from . import admin

# scraper should be instantiated at class-load time, so that it can rate limit appropriately
import scrapelib
scraper = scrapelib.Scraper(requests_per_minute=120, retry_attempts=3)
scraper.user_agent = "unitedstates/inspectors-general (https://github.com/unitedstates/inspectors-general)"


class Soft404HttpAdapter(requests.adapters.HTTPAdapter):
    """Transport adapter that checks all responses against a blacklist of "file
  not found" pages that are served with 200 status codes."""

    SOFT_404_URLS_RE = re.compile(
        r"^(http://www\.dodig\.mil/errorpages/index\.html|http://www\.fec\.gov/404error\.shtml|http://www\.gpo\.gov/maintenance/error\.htm)$"
    )
    SOFT_404_BODY_SIGNATURES = {
        "cftc.gov": b"<title>404 Page Not Found - CFTC</title>",
        "cpb.org": b"<title>CPB: Page Not Found</title>",
        "ncua.gov": b"Redirect.aspx?404",
        "si.edu": b"<title>Page Not Found Smithsonian</title>",
Example #29
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'en-us,en;q=0.5',
        'Connection':
        'keep-alive',
    },
    'user_agent': ('Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0.2) '
                   'Gecko/20100101 Firefox/10.0.2'),
    'follow_robots':
    False,
}

if __name__ == '__main__':

    session = scrapelib.Scraper(**request_defaults)

    def fetch(url):
        logger.info('trying %r' % url)
        try:
            return session.get(url)
        except Exception as e:
            logger.exception(e)

    filenames = os.listdir(join(PATH, 'urls'))
    filenames = filter(lambda s: '~' not in s, filenames)
    for urls_filename in filenames:
        abbr = urls_filename.lower().replace('.txt', '')
        if sys.argv[1:] and (abbr not in sys.argv[1:]):
            continue
        with open(join(PATH, 'urls', urls_filename)) as urls:
Example #30
import csv
import math
import warnings
import click
import dj_database_url
import django
import scrapelib
from django.contrib.postgres.search import SearchVector
from django.db import transaction
from django.db.models import Count

from extract.utils import jid_to_abbr, abbr_to_jid
from extract import get_extract_func, DoNotDownload, CONVERSION_FUNCTIONS

# disable SSL validation and ignore warnings
scraper = scrapelib.Scraper(verify=False)
scraper.user_agent = "Mozilla"
warnings.filterwarnings("ignore", module="urllib3")


MIMETYPES = {
    "application/pdf": "pdf",
    "text/html": "html",
    "application/msword": "doc",
    "application/rtf": "rtf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
}


def init_django():
    from django.conf import settings