def handle_noargs(self, **options):
        """Print a summary of every entry in the CBO "latest 10" RSS feed.

        Nothing is archived or saved here: each feed entry is parsed and the
        resulting field dict is printed.  ``options`` is accepted but unused.
        """
        doc_type = "CBO CE"
        file_type = "pdf"
        feed = feedparser.parse("http://www.cbo.gov/rss/latest10.xml")

        for entry in feed.entries:
            title_dict = split_title(entry.title)
            # Keep only the year/month/day of the feed's parsed update time.
            updated = entry.updated_parsed
            release_date = datetime.datetime(updated[0], updated[1],
                                             updated[2])
            congress = congress_from_year(release_date.year)
            add_date = datetime.datetime.now()
            title = title_dict['title']
            bill_list = extract_legislation(title)
            # The id is derived from the first bill found in the title, with
            # dots and spaces stripped.  It is not part of the printed summary.
            if bill_list:
                bill_num = bill_list[0]
                gov_id = "%s-%s" % (congress, bill_num.replace(
                    '.', '').replace(' ', ''))
            else:
                bill_num = None
                gov_id = None
            # Always bind ``description``: without the fallback an entry that
            # lacks one raised NameError (first entry) or silently reused the
            # previous entry's text (later entries).
            if 'description' in entry:
                description = entry.description
            else:
                description = ''
            original_url = entry.link
            # Separate name so the loop variable is not rebound to the dict.
            summary = {
                'release_date': release_date,
                'congress': congress,
                'add_date': add_date,
                'title': title,
                'description': description,
                'bill_list': bill_list,
                'original_url': original_url
            }
            print(summary)
Пример #2
0
    def handle_noargs(self, **options):
        """Archive and record new "DPC LB" documents for the current year.

        Fetches the dpc.senate.gov report listing for the current year and
        walks its ``<p class="doclist">`` rows.  A row whose link has the
        standard ``dpcdoc.cfm?doc_name=`` form is archived and saved as a
        ``Document`` (plus one ``DocumentLegislation`` per bill found in the
        title) unless a ``Document`` with the same type, id and release date
        already exists.  ``options`` is accepted but unused.
        """
        doc_type = "DPC LB"
        file_type = "html"
        add_date = datetime.datetime.now()
        year = add_date.year
        congress = congress_from_year(year)
        url_prefix = "http://dpc.senate.gov/"
        url = "%sdpcreports.cfm?cf_year=%s&doctype=lb" % (url_prefix, year)
        page = urllib2.urlopen(url)
        soup = BeautifulSoup(page)

        # Loop-invariant, so compiled once; raw string keeps the backslashes
        # as regex escapes rather than (invalid) string escapes.
        standard_link = re.compile(r'dpcdoc\.cfm\?doc_name=')

        for row in soup.findAll('p', {"class": "doclist"}):
            anchor = row('a')[0]
            file_name = anchor['href'].strip()
            if standard_link.search(file_name):
                gov_id = file_name.replace('dpcdoc.cfm?doc_name=', '').upper()
            else:
                gov_id = None
            original_url = "%s%s" % (url_prefix, file_name)
            title = anchor.string
            description = ''
            bill_list = extract_legislation(title)
            # The fourth child node of the row carries the date as "(m/d/yy)".
            date_str = row.contents[3].string.replace('(', '').replace(')', '')
            release_date = time.strftime('%Y-%m-%d',
                                         time.strptime(date_str, '%m/%d/%y'))

            # Rows without an id are never saved, so skip them before issuing
            # the duplicate-check query.
            if not gov_id:
                continue
            matches = Document.objects.filter(doc_type=doc_type,
                                              gov_id=gov_id,
                                              release_date=release_date)
            if matches:
                continue

            local_file = archive_file(original_url, gov_id, doc_type,
                                      file_type)
            # Pause after each download (presumably to be polite to the
            # remote server).
            time.sleep(2)
            full_text = None
            doc = Document(gov_id=gov_id,
                           release_date=release_date,
                           add_date=add_date,
                           title=title,
                           description=description,
                           doc_type=doc_type,
                           original_url=original_url,
                           local_file=local_file,
                           full_text=full_text)
            doc.save()
            for bill_num in bill_list:
                bill_dupe = DocumentLegislation.objects.filter(
                    congress=congress).filter(
                        bill_num=bill_num).filter(document=doc)
                if not bill_dupe:
                    bill = DocumentLegislation(congress=congress,
                                               bill_num=bill_num,
                                               document=doc)
                    bill.save()
    def handle_noargs(self, **options):
        """Print a summary dict for each entry on the repcloakroom news page.

        Fetches page 1 of the DocumentTypeID=1501 listing and, for every
        ``<span class="middlecopy">`` that contains a
        ``<span class="middleheadline">``, extracts title, date, description,
        link and bill list, then prints them.  Nothing is archived or saved.
        ``gov_id`` is ``None`` when no bill could be extracted.  ``options``
        is accepted but unused.
        """
        doc_type = "RCR SRP"
        file_type = "html"
        base_url = 'http://repcloakroom.house.gov/news/'
        page = urllib2.urlopen(
            "http://repcloakroom.house.gov/news/DocumentQuery.aspx?DocumentTypeID=1501&Page=1"
        )
        add_date = datetime.datetime.now()

        soup = BeautifulSoup(page)
        for row in soup.findAll('span', {"class": "middlecopy"}):
            # Look the headline up once instead of once per extracted field.
            headline = row.find('span', {"class": "middleheadline"})
            if not headline:
                continue
            # Date, description and link are read from fixed positions among
            # the children of the headline's parent.
            siblings = headline.parent.contents

            title = str(headline.contents[1]).replace('<b>', '').replace(
                '</b>', '').strip()
            bill_list = extract_legislation(title)
            date_str = siblings[5].contents[0].replace('&nbsp;-', '').strip()
            # Parse the date once and derive both the ISO string and the year.
            parsed_date = time.strptime(date_str, '%b %d, %Y')
            release_date = time.strftime('%Y-%m-%d', parsed_date)
            year = parsed_date[0]
            congress = congress_from_year(year)
            description = unicode(siblings[6]).strip()
            # Fall back to the description when the title names no bill.
            if not bill_list:
                bill_list = extract_legislation(description)
            if title == "":
                title = "".join(bill_list)
            file_name = siblings[7]['href']
            original_url = "%s%s" % (base_url, file_name)
            # The id is built from the first bill; with no bill at all the
            # original indexed an empty list and raised IndexError.
            if bill_list:
                gov_id = "SRP-%s-%s-%s" % (congress, bill_list[0].replace(
                    ' ', '').replace('.', ''), release_date)
            else:
                gov_id = None

            doc = {
                'gov_id': gov_id,
                'original_url': original_url,
                'file_name': file_name,
                'title': title,
                'description': description,
                'congress': congress,
                'year': year,
                'release_date': release_date,
                'bill_list': bill_list
            }
            print(doc)
Пример #4
0
 def handle_noargs(self, **options):
     """Archive and record new "OMB SAP" documents from whitehouse.gov.

     Fetches the by-date listing for the current congress and year, locates
     the table that holds the first ``.pdf`` link, and processes each of its
     rows that has ``<td>`` cells.  Each such row is archived and saved as a
     ``Document`` plus a ``DocumentLegislation`` for its bill, unless a
     ``Document`` with the same type, id and release date already exists.
     Rows whose date cell does not parse as ``"Month D, YYYY"`` are skipped.
     ``options`` is accepted but unused.
     """
     doc_type = "OMB SAP"
     file_type = "pdf"
     base_url = "http://whitehouse.gov"
     current_congress = current_congress_session()['congress']
     page = urllib2.urlopen(
         "http://www.whitehouse.gov/omb/%s/legislative_sap_date_%s"
         % (current_congress, datetime.datetime.now().year))
     add_date = datetime.datetime.now()

     soup = BeautifulSoup(page)
     # Escape the dot so only a literal ".pdf" matches (the unescaped form
     # also matched e.g. "/pdf").
     link = soup.find('a', href=re.compile(r'\.pdf'))
     # Climb four levels from the link to the element whose <tr> rows are
     # processed below.
     rows = link.parent.parent.parent.parent.findAll('tr')
     for row in rows:
         cols = row.findAll('td')
         if not cols:
             continue

         anchor = cols[0].find('a')
         # Normalise the bill label to dotted form with no spaces
         # ("HR 1234" -> "H.R.1234", "S 12" -> "S.12").
         bill = anchor.string.replace('&nbsp;', ' ').replace(
             'S ', 'S.').replace('HR ', 'H.R.').replace(' ', '')
         clean_bill = bill.replace('.', '').replace(' ', '')
         original_url = "%s%s" % (base_url, anchor['href'])
         title = cols[1].contents[0].replace('&nbsp;', ' ').strip()
         date_str = cols[2].string.replace('&nbsp;', ' ').strip()

         # Parse once.  The original caught a failed parse with a bare
         # ``except`` and then re-parsed the same string outside the ``try``,
         # so a bad date crashed the whole run anyway.  Congress, session and
         # id all depend on the date, so such a row is skipped instead.
         try:
             parsed_date = time.strptime(date_str, '%B %d, %Y')
         except ValueError:
             continue
         release_date = time.strftime('%Y-%m-%d', parsed_date)
         year = parsed_date[0]
         congress = congress_from_year(year)
         session = session_from_year(year)
         bill_list = [bill]
         description = ""
         gov_id = "%s-%s-SAP%s-%s" % (congress, session, clean_bill,
                                      release_date)

         matches = Document.objects.filter(doc_type=doc_type, gov_id=gov_id,
                                           release_date=release_date)
         if matches:
             continue

         print(release_date)
         local_file = archive_file(original_url, gov_id, doc_type, file_type)
         # Full-text extraction is disabled; full_text is stored as None.
         full_text = None
         doc = Document(gov_id=gov_id, release_date=release_date,
                        add_date=add_date, title=title,
                        description=description, doc_type=doc_type,
                        original_url=original_url, local_file=local_file,
                        full_text=full_text)
         doc.save()
         for bill_num in bill_list:
             bill_dupe = DocumentLegislation.objects.filter(
                 congress=congress).filter(bill_num=bill_num).filter(
                     document=doc)
             if not bill_dupe:
                 # Distinct name: ``bill`` above is the bill label string.
                 bill_link = DocumentLegislation(congress=congress,
                                                 bill_num=bill_num,
                                                 document=doc)
                 bill_link.save()
Пример #5
0
 def handle_noargs(self, **options):
     """Create missing DocumentLegislation rows for every stored Document.

     For each ``Document``, the bills extracted from its title are linked to
     it (for the congress of its release year) when no identical link exists
     yet.  Every newly saved link is reported on stdout.  ``options`` is
     accepted but unused.
     """
     for document in Document.objects.all():
         bills = extract_legislation(document.title)
         congress = congress_from_year(document.release_date.year)
         if not bills:
             continue
         for number in bills:
             existing = DocumentLegislation.objects.filter(
                 congress=congress).filter(bill_num=number).filter(
                     document=document)
             if existing:
                 continue
             DocumentLegislation(congress=congress,
                                 bill_num=number,
                                 document=document).save()
             print("%s %s" % (document.gov_id, bills))
Пример #6
0
    def handle_noargs(self, **options):
        """Archive and record new "RCR SRP" documents from repcloakroom.

        Fetches page 1 of the DocumentTypeID=1501 listing and, for every
        ``<span class="middlecopy">`` that contains a
        ``<span class="middleheadline">``, extracts title, date, description,
        link and bill list.  Entries not already stored (same type, id and
        release date) are archived from their print URL and saved as a
        ``Document`` plus one ``DocumentLegislation`` per bill.  Entries in
        which no bill can be found are skipped, since the id is built from
        the first bill.  ``options`` is accepted but unused.
        """
        doc_type = "RCR SRP"
        file_type = "html"
        base_url = 'http://repcloakroom.house.gov/news/'
        page = urllib2.urlopen("http://repcloakroom.house.gov/news/DocumentQuery.aspx?DocumentTypeID=1501&Page=1")
        add_date = datetime.datetime.now()

        soup = BeautifulSoup(page)
        for row in soup.findAll('span', {"class": "middlecopy"}):
            # Look the headline up once instead of once per extracted field.
            headline = row.find('span', {"class": "middleheadline"})
            if not headline:
                continue
            # Date, description and link are read from fixed positions among
            # the children of the headline's parent.
            siblings = headline.parent.contents

            title = str(headline.contents[1]).replace('<b>', '').replace('</b>', '').strip()
            bill_list = extract_legislation(title)
            date_str = siblings[5].contents[0].replace('&nbsp;-', '').strip()
            # Parse the date once and derive both the ISO string and the year.
            parsed_date = time.strptime(date_str, '%b %d, %Y')
            release_date = time.strftime('%Y-%m-%d', parsed_date)
            year = parsed_date[0]
            congress = congress_from_year(year)
            description = unicode(siblings[6]).strip()
            # Fall back to the description when the title names no bill.
            if not bill_list:
                bill_list = extract_legislation(description)
            # Without any bill the original raised IndexError on
            # ``bill_list[0]`` below; skip the entry instead.
            if not bill_list:
                continue
            if title == "":
                title = "".join(bill_list)
            file_name = siblings[7]['href']
            original_url = "%s%s" % (base_url, file_name)
            gov_id = "SRP-%s-%s-%s" % (congress, bill_list[0].replace(' ', '').replace('.', ''), release_date)

            matches = Document.objects.filter(doc_type=doc_type, gov_id=gov_id, release_date=release_date)
            if matches:
                continue

            # The printer-friendly page is what gets archived.
            print_url = original_url.replace('DocumentSingle', 'DocumentPrint')
            # Full-text extraction is disabled; full_text is stored as None.
            full_text = None
            local_file = archive_file(print_url, gov_id, doc_type, file_type)
            doc = Document(gov_id=gov_id, release_date=release_date, add_date=add_date, title=title,
                           description=description, doc_type=doc_type, original_url=original_url,
                           local_file=local_file, full_text=full_text)
            doc.save()
            for bill in bill_list:
                bill_link = DocumentLegislation(congress=congress, bill_num=clean_bill_num(bill), document=doc)
                bill_link.save()
Пример #7
0
    def handle_noargs(self, **options):
        """Archive and record new "CBO CE" documents from the CBO RSS feed.

        Parses the CBO "latest 10" feed.  An entry whose title names at least
        one bill gets an id built from the congress and the first bill; if no
        ``Document`` with the same type, id and release date exists, the
        linked file is archived and a ``Document`` plus one
        ``DocumentLegislation`` per bill are saved.  Entries without a bill
        in the title are skipped.  ``options`` is accepted but unused.
        """
        doc_type = "CBO CE"
        file_type = "pdf"
        feed = feedparser.parse("http://www.cbo.gov/rss/latest10.xml")

        for entry in feed.entries:
            title_dict = split_title(entry.title)
            # Keep only the year/month/day of the feed's parsed update time.
            updated = entry.updated_parsed
            release_date = datetime.datetime(updated[0], updated[1], updated[2])
            congress = congress_from_year(release_date.year)
            add_date = datetime.datetime.now()
            title = title_dict['title']
            bill_list = extract_legislation(title)
            if bill_list:
                gov_id = "%s-%s" % (congress, bill_list[0].replace('.', '').replace(' ', ''))
            else:
                gov_id = None
            # Always bind ``description``: without the fallback an entry that
            # lacks one raised NameError (first entry) or was saved with the
            # previous entry's text (later entries).
            if 'description' in entry:
                description = entry.description
            else:
                description = ''
            original_url = entry.link

            # Entries without an id are never saved, so skip them before
            # issuing the duplicate-check query.
            if not gov_id:
                continue
            matches = Document.objects.filter(doc_type=doc_type, gov_id=gov_id, release_date=release_date)
            if matches:
                continue

            local_file = archive_file(original_url, gov_id, doc_type, file_type)
            # Full-text extraction is disabled; full_text is stored as None.
            full_text = None
            doc = Document(gov_id=gov_id, release_date=release_date, add_date=add_date, title=title,
                           description=description, doc_type=doc_type, original_url=original_url,
                           local_file=local_file, full_text=full_text)
            doc.save()
            for bill_num in bill_list:
                bill_dupe = DocumentLegislation.objects.filter(congress=congress).filter(bill_num=bill_num).filter(document=doc)
                if not bill_dupe:
                    bill = DocumentLegislation(congress=congress, bill_num=bill_num, document=doc)
                    bill.save()
    def handle_noargs(self, **options):
        """Print a summary dict for each row of this year's DPC report listing.

        Fetches the dpc.senate.gov report listing for the current year and,
        for every ``<p class="doclist">`` row, prints its URL, title, bill
        list and release date.  Nothing is archived or saved.  ``options`` is
        accepted but unused.
        """
        doc_type = "DPC LB"
        file_type = "html"
        add_date = datetime.datetime.now()
        year = add_date.year
        congress = congress_from_year(year)
        url_prefix = "http://dpc.senate.gov/"
        listing_url = "%sdpcreports.cfm?cf_year=%s" % (url_prefix, year)
        soup = BeautifulSoup(urllib2.urlopen(listing_url))

        standard_link = re.compile(r'dpcdoc\.cfm\?doc_name=')

        for row in soup.findAll('p', {"class": "doclist"}):
            anchor = row('a')[0]
            file_name = anchor['href'].strip()
            # The id is computed only for links in the standard form; it is
            # not part of the printed summary.
            gov_id = None
            if standard_link.search(file_name):
                gov_id = file_name.replace('dpcdoc.cfm?doc_name=', '').upper()
            title = anchor.string
            # The fourth child node of the row carries the date as "(m/d/yy)".
            raw_date = row.contents[3].string
            date_str = raw_date.replace('(', '').replace(')', '')
            parsed_date = time.strptime(date_str, '%m/%d/%y')
            summary = {
                'original_url': "%s%s" % (url_prefix, file_name),
                'local_file': '',
                'title': title,
                'description': '',
                'bill_list': extract_legislation(title),
                'release_date': time.strftime('%Y-%m-%d', parsed_date)
            }
            print(summary)