Exemplo n.º 1
0
def get_links(content, session):
    """Collect downloadable attachment links from an HTML fragment.

    Every ``<a>`` element with an ``href`` in *content* is examined.
    Site-relative hrefs (starting with ``/``) are prefixed with
    ``SITE_URL``; hrefs that still do not start with ``http`` are skipped,
    as are links whose last path segment is empty or ends in ``.html`` or
    ``.aspx``.  Each remaining file is stored in object storage under
    ``government_decisions/<filename>`` (fetched through *session* unless
    an object with that name already exists).

    Args:
        content: HTML markup to scan.
        session: Object whose ``get(url)`` returns a response exposing
            ``status_code`` and ``content``.

    Returns:
        A list of ``{'href': ..., 'title': ...}`` dicts in document order,
        one per distinct source URL.  ``href`` is whatever
        ``object_storage.write`` / ``object_storage.urlfor`` returned
        (presumably the public storage URL) and ``title`` is the anchor
        text.
    """
    links = []
    if '<a' not in content:
        return links

    # Source URLs that were already emitted.  ``links`` holds dicts whose
    # ``href`` is the storage value, so testing ``href in links`` can never
    # detect a repeated link; membership is tracked separately here.
    seen = set()
    for link in pq(content)('a'):
        source_url = link.attrib.get('href')
        if source_url is None:
            continue
        if source_url.startswith('/'):
            source_url = SITE_URL + source_url
        if not source_url.startswith('http'):
            continue
        if source_url in seen:
            continue
        filename = source_url.rpartition('/')[2]
        if filename == '' or filename.endswith(('.html', '.aspx')):
            continue

        s3_object_name = 'government_decisions/' + filename
        if not object_storage.exists(s3_object_name):
            try:
                conn = session.get(source_url)
                if conn.status_code != requests.codes.ok:
                    continue
                href = object_storage.write(s3_object_name,
                                            data=conn.content,
                                            public_bucket=True,
                                            create_bucket=True)
            except Exception:
                # Best effort: a failed download or upload only drops this
                # link.  ``Exception`` (not a bare ``except``) lets
                # KeyboardInterrupt / SystemExit propagate.
                continue
        else:
            href = object_storage.urlfor(s3_object_name)
        links.append(dict(href=href, title=pq(link).text()))
        seen.add(source_url)
    return links
Exemplo n.º 2
0
def process_row(row, *_):
    """Fetch the page at ``row['url']`` and store it in object storage.

    The response body is written under ``row['s3_object_name']`` with an
    HTML content type whose charset is the value returned by
    ``get_charset`` for the response.

    Args:
        row: Mapping with at least the keys ``'url'`` and
            ``'s3_object_name'``.
        *_: Extra positional arguments; ignored.

    Returns:
        *row* itself once the body has been written, or ``None`` when the
        response status is not OK.
    """
    s3_object_name = row['s3_object_name']
    url = row['url']
    conn = session.get(url)
    # Pause after every request, whatever its outcome -- presumably to
    # throttle traffic to the remote site.
    time.sleep(3)
    if conn.status_code != requests.codes.ok:
        return None

    charset = get_charset(conn)
    # The response attribute is ``encoding``; assigning to ``encode`` only
    # created an unused attribute.  Only ``conn.content`` (raw bytes) is
    # stored below, so this does not alter what gets written.
    conn.encoding = charset
    object_storage.write(s3_object_name,
                         data=conn.content,
                         public_bucket=True,
                         create_bucket=True,
                         content_type="text/html; charset={}".format(charset))
    return row
Exemplo n.º 3
0
 def write_to_object_storage(self, object_name, data):
     """Store *data* under *object_name* unless that object already exists.

     Returns the result of ``object_storage.urlfor`` when the object is
     already present, otherwise the result of ``object_storage.write``
     (called with ``public_bucket=True`` and ``create_bucket=True``).
     """
     logging.error('write_to_object_storage %s', object_name)
     if object_storage.exists(object_name):
         # Already stored: do not upload again.
         return object_storage.urlfor(object_name)
     return object_storage.write(object_name,
                                 data=data,
                                 public_bucket=True,
                                 create_bucket=True)
Exemplo n.º 4
0
            obj_name = os.path.join('spending-reports', obj_name)
            if not object_storage.exists(obj_name):
                tmp = tempfile.NamedTemporaryFile()
                try:
                    stream = requests.get(url_to_use,
                                          stream=True,
                                          verify=False).raw
                except:
                    logging.exception('Failed to load data from %s',
                                      url_to_use)
                stream.read = functools.partial(stream.read,
                                                decode_content=True)
                shutil.copyfileobj(stream, tmp)
                tmp.flush()
                url_to_use = object_storage.write(obj_name,
                                                  file_name=tmp.name,
                                                  create_bucket=False)
                tmp.close()
                del tmp
            else:
                url_to_use = object_storage.urlfor(obj_name)

        report['report-sheets'] = 0
        report['report-headers-row'] = None
        report['report-rows'] = None
        report['report-bad-rows'] = None
        report['load-error'] = None

        with tempfile.NamedTemporaryFile(
                suffix=os.path.splitext(url_to_use)[1]) as tmp:
            if url_to_use.startswith('http'):