Python get_or_create_source_doc示例，model_helpers.get_or_create_source_doc Python示例

示例#1

0

显示文件

文件： get_elections_pdfs.py 项目： ringwraith/access_mo

					if 'county' not in url.lower():

						# set up the file name
						elec_name = re.search('\/.+\/(.+\.pdf)', urlparse(url).path).group(1).strip()
						file_name = f_path + elec_name.replace(' ', '')

						# check to see if we already have the file
						if path.isfile(file_name):
							print "  Already downloaded {}.".format(file_name)
						else:
							print "  Downloading {}.".format(file_name)
						
							# request the pdf
							sleep(3)
							response = requests_session.get(url)

							# save the file
							with open(file_name, 'w') as f:
								f.write(response.content)

						# whether we have the file or not, try creating a source_doc
						doc = get_or_create_source_doc(
								  source = 'SoS'
								, name = elec_name
								, file_name = file_name
								, url = url
								, parent = past_results_page
							)

print 'fin.'

示例#2

0

显示文件

文件： parse_elections.py 项目： ringwraith/access_mo

	if '.txt' in i:

		print 'Getting data from {}'.format(i)

		# set up an election for each file
		election = Election(
			  date = None
			, races = []
		)

		# this particular election was not available in pdf
		# had to copy the text from the 2001 Blue Book
		if i == 'AllRacesSpecialMarch2000SD5.txt':
			election.source_doc = get_or_create_source_doc(
					  name = 'All Races Special March 2000 SD5'
					, file_name = f_path + i
					, url = 'http://s1.sos.mo.gov/cmsimages/bluebook/2001-2002/0711-0717.pdf#p715'
					, parent = None
				)
		else:
			source_doc_file = f_path + i
			election.source_doc = Source_Doc.get(Source_Doc.file_name == source_doc_file.replace('txt', 'pdf'))

		# determine which type of election it is (based on file name)
		for elec_type in Election_Type.select():
			if elec_type.name in i:
				election.election_type = elec_type

		# open the file
		with io.open(f_path + i, mode = 'r', encoding='UTF-8') as f:

			# declare a line reader so that we can reference line numbers (i.e., index position)

示例#3

0

显示文件

	for bill in sb_q:
		
		# define the file path
		file_path = 'past_content/S/' +  bill.session.name.replace(' ', '_') + '/co_sponsors/'

		print 'Getting co-sponsors for {0.bill_type.id} {0.number} from {0.session.year}'.format(bill)
		print bill.source_doc.url

		# define the file
		the_file = file_path+str(bill.bill_type.id)+'_'+ str(bill.number)+'_co_sponsors.htm'

		# get or create a source_doc record for the co-sponsor page
		source_doc = get_or_create_source_doc(
				  file_name=the_file
				, name = '{0.bill_type.id} {0.number} co-sponsors'.format(bill)
				, session = bill.session
				, url = bill.co_sponsor_link
				, parent = bill.source_doc
				, chamber = 'S'
			)

		content = None

		# load the content from the co-sponsor page
		while content == None:
			try:
				content = get_content(source_doc, requests_session)
			except requests.exceptions.ConnectionError as e:
				print e
				print '   Connection failed. Retrying...'
				requests_session = requests.session()
			except Exception as e:

示例#4

0

显示文件

    # URLs of the current session's House and Senate roster and bill-list pages.
    current_session_links = [
        'http://house.mo.gov/member.aspx',
        'http://house.mo.gov/billlist.aspx',
        'http://www.senate.mo.gov/16info/SenateRoster.htm',
        'http://www.senate.mo.gov/16info/BTS_Web/BillList.aspx?SessionType=R',
    ]

    # Get or create one source_doc per link.
    for link in current_session_links:
        lowered = link.lower()

        doc_data = {'parent': None, 'url': link, 'session': current_session}

        # The chamber code is taken from the URL text.
        if 'house' in lowered:
            doc_data['chamber'] = 'H'
        elif 'senate' in lowered:
            doc_data['chamber'] = 'S'

        # Roster pages and bill-list pages get different document names.
        if 'member' in lowered or 'roster' in lowered:
            doc_data['name'] = '{} Roster'.format(doc_data['chamber'])
        elif 'bill' in lowered:
            doc_data['name'] = '{} bills'.format(doc_data['chamber'])

        # House files are placed under h_dir, Senate files under s_dir.
        chamber = doc_data['chamber']
        if chamber in ('H', 'S'):
            target_dir = h_dir if chamber == 'H' else s_dir
            doc_data['file_name'] = '{0}/{1}.html'.format(
                target_dir, doc_data['name'].replace(' ', '_'))

        get_or_create_source_doc(**doc_data)

print 'fin.'

示例#5

0

显示文件

文件： get_senate_co_sponsors.py 项目： gordonje/access_mo

    for bill in sb_q:

        # define the file path
        file_path = "past_content/S/" + bill.session.name.replace(" ", "_") + "/co_sponsors/"

        print "Getting co-sponsors for {0.bill_type.id} {0.number} from {0.session.year}".format(bill)
        print bill.source_doc.url

        # define the file
        the_file = file_path + str(bill.bill_type.id) + "_" + str(bill.number) + "_co_sponsors.htm"

        # get or create a source_doc record for the co-sponsor page
        source_doc = get_or_create_source_doc(
            file_name=the_file,
            name="{0.bill_type.id} {0.number} co-sponsors".format(bill),
            session=bill.session,
            url=bill.co_sponsor_link,
            parent=bill.source_doc,
            chamber="S",
        )

        content = None

        # load the content from the co-sponsor page
        while content == None:
            try:
                content = get_content(source_doc, requests_session)
            except requests.exceptions.ConnectionError as e:
                print e
                print "   Connection failed. Retrying..."
                requests_session = requests.session()
            except Exception as e:

示例#6

0

显示文件

文件： scrape_recent_elections.py 项目： ringwraith/access_mo

		# The option text is split on ' - '; the last piece is used as the election date.
		spl_txt = opt.text.split(' - ')

		# Set up a new election for this option.
		election = Election(
			name=opt.text.strip(),
			election_date=spl_txt[-1].strip(),
			opt_value=opt['value'],
			races=[],
		)

		# Local HTML file name: the election name with '-' and ',' removed, spaces
		# turned into underscores, and one pass collapsing '__' into '_'.
		html_name = election.name.replace('-', '').replace(',', '').replace(' ', '_').replace('__', '_')

		# Get or create the source doc for the election.
		election.source_doc = get_or_create_source_doc(
			source='SoS',
			name=election.name,
			file_name='source_docs/SoS/election_results/html/{}.html'.format(html_name),
			url='http://enrarchives.sos.mo.gov/enrnet/Default.aspx?eid={}'.format(election.opt_value),
			parent=None,
		)

		# Assign the election type whose name occurs in the first piece of the
		# option text; if several match, the last one checked wins.
		for elec_type in Election_Type.select():
			if elec_type.name in spl_txt[0]:
				election.election_type = elec_type

		# if it's a general election...
		if election.election_type.name == 'General':
			# assume it's for the assembly starting next year
			election.assembly = Assembly.get(start_year = int(re.search('\d{4}', election.election_date).group()) + 1)
		# if it's a special election...

示例#7

0

显示文件

文件： get_past_session_urls.py 项目： gordonje/access_mo

		, 'http://www.senate.mo.gov/16info/SenateRoster.htm'
		, 'http://www.senate.mo.gov/16info/BTS_Web/BillList.aspx?SessionType=R'
	]

	# Get or create one source_doc per current-session link.
	for link in current_session_links:
		link_lc = link.lower()

		doc_data = {'parent': None, 'url': link, 'session': current_session}

		# Work out the chamber code from the URL text.
		if 'house' in link_lc:
			doc_data['chamber'] = 'H'
		elif 'senate' in link_lc:
			doc_data['chamber'] = 'S'

		# Roster pages and bill-list pages get different document names.
		if 'member' in link_lc or 'roster' in link_lc:
			doc_data['name'] = '{} Roster'.format(doc_data['chamber'])
		elif 'bill' in link_lc:
			doc_data['name'] = '{} bills'.format(doc_data['chamber'])

		# House files are placed under h_dir, Senate files under s_dir.
		chamber = doc_data['chamber']
		if chamber in ('H', 'S'):
			base_dir = h_dir if chamber == 'H' else s_dir
			doc_data['file_name'] = '{0}/{1}.html'.format(base_dir, doc_data['name'].replace(' ', '_'))

		get_or_create_source_doc(**doc_data)
		
print 'fin.'