def update_ministerials(self, person, ministerials):
    """Parse ministerial-position paragraphs and persist them.

    person -- the models.Person the appointments belong to
    ministerials -- XML element whose <para> children each describe one
        position, e.g. "Minister for X from 1.1.1990 to 2.2.1992"
    """
    print('Updating ministerial positions')
    for p in ministerials.findall('para'):
        # clean up text: trim whitespace / trailing dot, collapse space runs
        text = p.text.strip().strip('.')
        text = re.sub(r"\s\s+", " ", text)
        try:
            position, dates = text.split(' from ', 1)
            dates = dates.split(' to ', 1)
            if len(dates) == 2:
                start, end = dates
            else:
                start = dates[0]
                end = None  # still serving, or end date missing from source
            ministerial_position, c = models.MinisterialPosition.objects.get_or_create(name=position)
            # (person, position, start_date) is the natural key; end_date is
            # only filled in on first creation.
            models.MinisterialAppointment.objects.get_or_create(
                person=person,
                position=ministerial_position,
                start_date=import_utils.str_to_date(start),
                defaults={
                    'end_date': import_utils.str_to_date(end)
                })
        # BUG FIX: was a bare `except:` which also swallows SystemExit and
        # KeyboardInterrupt; narrow to Exception and keep the best-effort log.
        except Exception:
            print("ERROR: ---- ", "Failed on text-", text)
def make_party_membership(self, person, data):
    """Attach a PartyMembership to *person*.

    data layout: [party_name, party_code, _, start, end]
    (indices 0, 1, 3 and 4 are used).
    """
    # Party is keyed by code; the name is only applied on first creation.
    party = models.Party.objects.get_or_create(code=data[1], defaults={"name": data[0]})[0]
    membership, _created = models.PartyMembership.objects.get_or_create(
        person=person,
        party=party,
        start_date=import_utils.str_to_date(data[3]),
        end_date=import_utils.str_to_date(data[4]),
    )
def make_party_membership(self, person, data):
    """Record that *person* belonged to the party described by *data*.

    data layout: [party_name, party_code, _, start, end].
    """
    party, _created = models.Party.objects.get_or_create(
        code=data[1], defaults={'name': data[0]})
    # Idempotent: re-running the import will not duplicate memberships.
    models.PartyMembership.objects.get_or_create(
        person=person,
        party=party,
        start_date=import_utils.str_to_date(data[3]),
        end_date=import_utils.str_to_date(data[4]))
def update_committees(self, phid, committees):
    """Parse committee-membership paragraphs for person *phid* and append
    rows to the 'committees' and 'committeememberships' output files.

    committees -- XML element whose <para> children look like
        "Type: Name from 1.1.1990 to 2.2.1992; Other Name from ..."
    """
    # print('Updating committee positions')
    # If you don't understand regular expressions, view the one below in
    # https://regex101.com/ for a full breakdown.  The short version is it
    # finds most possible dates for a committee membership from a
    # relatively free-text string.
    date_regex = r"(?:from (?P<start>[\d\.]+)(?:(?P<notes>.*?)?(?: to (?P<end>[\d\.]+)))?)"
    # Need an array as many committees might be on the same line
    for p in committees.findall("para"):
        # clean up text
        text = p.text.strip().strip(".")
        text = re.sub(r"\s\s+", " ", text)
        try:
            comm_type, comms = text.split(": ", 1)
        except ValueError:
            print("ERROR: ----", "Committee failed for whole line:", text)
            # BUG FIX: without this continue, `comms` below was stale or unbound.
            continue
        for comm in comms.split(";"):
            comm = comm.strip()
            comm_name = comm.split("from", 1)[0].strip()
            try:
                dates = "from" + comm.split("from", 1)[1]
            except IndexError:
                # BUG FIX: was a Python 2 print statement (SyntaxError on py3);
                # trailing comma meant "no newline" -> end=" ".
                print("(", phid, "-", comm, ")", end=" ")
                continue
                # raise
            dates = re.findall(date_regex, dates)
            # BUG FIX: renamed from `committees`, which shadowed the parameter.
            committee_file = self.output_files["committees"]
            committee_file.add(
                {"pk": len(committee_file.data), "name": comm_name, "type": comm_type},
                check=("name", "type")
            )
            committee = committee_file.get_as_dict(name=comm_name, type=comm_type)
            for date in dates:
                start = date[0]
                notes = date[1].strip()
                end = date[2]
                # For reference, this is the header:
                # ['parlhand.person.phid','parlhand.committee.name','parlhand.committee.type',
                # 'start_date','end_date','notes']
                com_mem = self.output_files["committeememberships"]
                start_date = import_utils.str_to_date(start)
                if start_date:
                    start_date = start_date.date()
                end_date = import_utils.str_to_date(end)
                if end_date:
                    end_date = end_date.date()
                com_mem.append(
                    {
                        "parlhand.person.phid": phid,
                        "parlhand.committee.name": committee["name"],
                        # BUG FIX: this key was duplicated as
                        # "parlhand.committee.name", silently dropping the
                        # committee name and mislabelling the type.
                        "parlhand.committee.type": committee["type"],
                        "start_date": start_date,
                        "end_date": end_date,
                        "notes": notes,
                    }
                )
def mine_prime_ministers(self):
    """Scrape Wikipedia's list of Australian Prime Ministers.

    Matches each PM row to a models.Person (first on first_names, then on
    preferred_name), scrapes a portrait when the person has none, and
    records each prime-ministership via self.make_pm.
    """
    print("Mining Prime Ministers")
    http = httplib2.Http()
    status, response = http.request(
        'https://en.wikipedia.org/wiki/List_of_Prime_Ministers_of_Australia'
    )
    soup = BeautifulSoup(response, "lxml")
    tables = soup.findAll('table', class_="wikitable")
    rows = [
        row for row in tables[0].findAll('tr', {'style': "background:#EEEEEE"})
    ]
    # BUG FIX: pm/start/end were referenced before assignment whenever the
    # first scraped row turned out to be a continuation row.
    pm = None
    start, end = None, None
    for row in rows:
        cols = row.findAll('td')
        # Damn you Billy Hughes
        if row.findAll('th') and (row.findAll('th')[0].text or "Billy" in cols[0].text):
            #This row is a new starter row, find the PM and do stuff
            pm = cols[0].text.replace('Sir ', '').split('(')[0]
            # NOTE(review): assumes exactly two name parts — TODO confirm
            # against source data.
            first_name, last_name = pm.split(' ')
            print(first_name, last_name)
            first_name = first_name.strip()
            last_name = last_name.strip()
            qs = models.Person.objects.filter(
                surname=last_name, first_names__icontains=first_name)
            if qs.count() == 0:
                qs = models.Person.objects.filter(
                    surname=last_name, preferred_name__icontains=first_name)
            if qs.count() != 1:
                if qs.count() < 1:
                    print("ERROR - no PM for -", pm)
                if qs.count() > 1:
                    print("ERROR - too many PM's for -", pm)
                    print(" found these: ", qs.all())
                continue
            pm = qs.first()
            if not pm.picture:
                picture = cols[1].findAll('img')[0].get('src')
                self.scrape_image_for_person(picture, pm)
            start = import_utils.str_to_date(cols[3].text)
            end = import_utils.str_to_date(cols[4].text)
            #self.make_pm(pm,start,end)
        else:
            # Continuation row use prior pm
            if not pm:
                print(" -- No PM?")
                continue
            # This is a different prime ministership
            if len(cols) > 2:
                print(cols)
                start = import_utils.str_to_date(cols[1].text)
                end = import_utils.str_to_date(cols[2].text)
        # Record the ministership with the most recently parsed dates.
        self.make_pm(pm, start, end)
def mine_prime_ministers(self):
    """Scrape the Wikipedia PM list and store prime-ministerships.

    Looks up each PM in models.Person (surname + first_names, falling back
    to preferred_name), grabs a portrait if missing, then calls
    self.make_pm with the parsed term dates.
    """
    print("Mining Prime Ministers")
    http = httplib2.Http()
    status, response = http.request("https://en.wikipedia.org/wiki/List_of_Prime_Ministers_of_Australia")
    soup = BeautifulSoup(response, "lxml")
    tables = soup.findAll("table", class_="wikitable")
    rows = [row for row in tables[0].findAll("tr", {"style": "background:#EEEEEE"})]
    # BUG FIX: pm/start/end could be read before assignment if the first
    # row encountered was a continuation row; initialise them up front.
    pm = None
    start, end = None, None
    for row in rows:
        cols = row.findAll("td")
        # Damn you Billy Hughes
        if row.findAll("th") and (row.findAll("th")[0].text or "Billy" in cols[0].text):
            # This row is a new starter row, find the PM and do stuff
            pm = cols[0].text.replace("Sir ", "").split("(")[0]
            first_name, last_name = pm.split(" ")
            print(first_name, last_name)
            first_name = first_name.strip()
            last_name = last_name.strip()
            qs = models.Person.objects.filter(surname=last_name, first_names__icontains=first_name)
            if qs.count() == 0:
                qs = models.Person.objects.filter(surname=last_name, preferred_name__icontains=first_name)
            if qs.count() != 1:
                if qs.count() < 1:
                    print("ERROR - no PM for -", pm)
                if qs.count() > 1:
                    print("ERROR - too many PM's for -", pm)
                    print(" found these: ", qs.all())
                continue
            pm = qs.first()
            if not pm.picture:
                picture = cols[1].findAll("img")[0].get("src")
                self.scrape_image_for_person(picture, pm)
            start = import_utils.str_to_date(cols[3].text)
            end = import_utils.str_to_date(cols[4].text)
            # self.make_pm(pm,start,end)
        else:
            # Continuation row use prior pm
            if not pm:
                print(" -- No PM?")
                continue
            # This is a different prime ministership
            if len(cols) > 2:
                print(cols)
                start = import_utils.str_to_date(cols[1].text)
                end = import_utils.str_to_date(cols[2].text)
        self.make_pm(pm, start, end)
def make_member(self, person, data):
    """Record a House of Representatives service entry for *person*.

    data layout (9-column CSV slice):
      [.., .., electorate_name, state_code, start_reason, start, end, .., end_reason]
    """
    state = models.State.objects.get_or_create(code=data[3])[0]
    electorate = models.Electorate.objects.get_or_create(
        name=data[2],
        state=state,
    )[0]
    # Idempotent: keyed on person/electorate/seat type/dates; the reasons
    # are only written when the record is first created.
    service, _created = models.Service.objects.get_or_create(
        person=person,
        electorate=electorate,
        seat_type=models.Service.SEAT_TYPES.Member,
        start_date=import_utils.str_to_date(data[5]),
        end_date=import_utils.str_to_date(data[6]),
        defaults={
            'start_reason': data[4],
            'end_reason': data[8],
        })
def make_senator(self, person, data):
    """Record a Senate service entry for *person*.

    data layout (9-column CSV slice):
      [.., .., name, state_code, start_reason, start, end, .., end_reason]
    Note: data[2] doubles as the default State name and the Electorate
    name — presumably because a senator's "electorate" is the state itself;
    TODO confirm against the CSV schema.
    """
    state, _ = models.State.objects.get_or_create(
        code=data[3], defaults={'name': data[2]})
    electorate, _ = models.Electorate.objects.get_or_create(
        name=data[2], state=state)
    models.Service.objects.get_or_create(
        person=person,
        electorate=electorate,
        seat_type=models.Service.SEAT_TYPES.Senator,
        start_date=import_utils.str_to_date(data[5]),
        end_date=import_utils.str_to_date(data[6]),
        defaults={'start_reason': data[4], 'end_reason': data[8]},
    )
def mine_deputy_prime_ministers(self):
    """Scrape Wikipedia's Deputy Prime Minister page.

    Matches each table row to a models.Person, scrapes a portrait when
    missing, and logs the parsed term dates (dtstart/dtend spans).
    """
    print("Mining Deputy Prime Ministers")
    http = httplib2.Http()
    status, response = http.request(
        'https://en.wikipedia.org/wiki/Deputy_Prime_Minister_of_Australia')
    soup = BeautifulSoup(response, "lxml")
    tables = soup.findAll('table', class_="wikitable")
    rows = [row for row in tables[0].findAll('tr')]
    for row in rows[1:]:  # rows[0] is the header
        cols = row.findAll('td')
        #This row is a new starter row, find the Deputy PM and do stuff
        dpm = cols[1].text.replace('Sir ', '').split('(')[0]
        first, last = dpm.split(' ')
        # BUG FIX: was `qs = qs.filter(...)` — a NameError, since qs is
        # unbound on the first iteration; start from the Person manager.
        qs = models.Person.objects.filter(
            surname__icontains=last, first_names__icontains=first)
        if qs.count() == 0:
            qs = models.Person.objects.filter(
                surname__icontains=last, preferred_name__icontains=first)
        if qs.count() != 1:
            if qs.count() < 1:
                print("ERROR - no Deputy PM for -", dpm)
            if qs.count() > 1:
                print("ERROR - too many Deputy PM's for -", dpm)
                print(" found these: ", qs.all())
            continue
        dpm = qs.first()
        if not dpm.picture:
            picture = cols[2].findAll('img')
            if picture:
                self.scrape_image_for_person(picture[0].get('src'), dpm)
        start, end = None, None
        start_loc = row.findAll('span', class_='dtstart')
        end_loc = row.findAll('span', class_='dtend')
        if start_loc:
            start = import_utils.str_to_date(start_loc[0].text)
        if end_loc:
            end = import_utils.str_to_date(end_loc[0].text)
        print("-----", dpm, start_loc, end_loc, start, end)
def mine_deputy_prime_ministers(self):
    """Scrape the Deputy PM Wikipedia page and report term dates.

    Each data row is matched to a models.Person; a portrait is scraped
    if the person lacks one, and the dtstart/dtend dates are printed.
    """
    print("Mining Deputy Prime Ministers")
    http = httplib2.Http()
    status, response = http.request("https://en.wikipedia.org/wiki/Deputy_Prime_Minister_of_Australia")
    soup = BeautifulSoup(response, "lxml")
    tables = soup.findAll("table", class_="wikitable")
    rows = [row for row in tables[0].findAll("tr")]
    for row in rows[1:]:  # skip the header row
        cols = row.findAll("td")
        # This row is a new starter row, find the Deputy PM and do stuff
        dpm = cols[1].text.replace("Sir ", "").split("(")[0]
        first, last = dpm.split(" ")
        # BUG FIX: originally `qs = qs.filter(...)`, which raises NameError
        # because qs has never been assigned; query the manager instead.
        qs = models.Person.objects.filter(surname__icontains=last, first_names__icontains=first)
        if qs.count() == 0:
            qs = models.Person.objects.filter(surname__icontains=last, preferred_name__icontains=first)
        if qs.count() != 1:
            if qs.count() < 1:
                print("ERROR - no Deputy PM for -", dpm)
            if qs.count() > 1:
                print("ERROR - too many Deputy PM's for -", dpm)
                print(" found these: ", qs.all())
            continue
        dpm = qs.first()
        if not dpm.picture:
            picture = cols[2].findAll("img")
            if picture:
                self.scrape_image_for_person(picture[0].get("src"), dpm)
        start, end = None, None
        start_loc = row.findAll("span", class_="dtstart")
        end_loc = row.findAll("span", class_="dtend")
        if start_loc:
            start = import_utils.str_to_date(start_loc[0].text)
        if end_loc:
            end = import_utils.str_to_date(end_loc[0].text)
        print("-----", dpm, start_loc, end_loc, start, end)
def update_committees(self, person, committees):
    """Parse committee paragraphs and persist Committee /
    CommitteeMembership records for *person*.

    committees -- XML element whose <para> children look like
        "Type: Name from 1.1.1990 to 2.2.1992; Other Name from ..."
    """
    print('Updating committee positions')
    # If you don't understand regular expressions, view the one below in
    # https://regex101.com/ for a full breakdown.  The short version is it
    # finds most possible dates for a committee membership from a
    # relatively free-text string.
    date_regex = r"(?:from (?P<start>[\d\.]+)(?:(?P<notes>.*?)?(?: to (?P<end>[\d\.]+)))?)"
    #Need an array as many committees might be on the same line
    for p in committees.findall('para'):
        # clean up text
        text = p.text.strip().strip('.')
        text = re.sub(r"\s\s+", " ", text)
        try:
            comm_type, comms = text.split(': ', 1)
        except ValueError:
            print("ERROR: ----", "Committee failed for whole line:", text)
            # BUG FIX: without this continue, `comms` was stale or unbound below.
            continue
        for comm in comms.split(';'):
            comm = comm.strip()
            try:
                comm_name = comm.split('from', 1)[0].strip()
                dates = "from" + comm.split('from', 1)[1]
                dates = re.findall(date_regex, dates)
                committee, c = models.Committee.objects.get_or_create(name=comm_name, type=comm_type)
                for date in dates:
                    start = date[0]
                    notes = date[1].strip()
                    end = date[2]
                    # Keyed on person/committee/start; notes and end date
                    # are refreshed on every run.
                    models.CommitteeMembership.objects.update_or_create(
                        person=person,
                        committee=committee,
                        start_date=import_utils.str_to_date(start),
                        defaults={
                            'notes': notes,
                            'end_date': import_utils.str_to_date(end)
                        })
            # BUG FIX: `except Exception, e:` is Python 2 syntax and a
            # SyntaxError under Python 3.
            except Exception as e:
                print("ERROR: ----", "Committee failed at:", comm)
def update_ministerials(self, person, row):
    """Create ministry/position records for one ministerial CSV row.

    row layout (columns used): [.., .., .., position, .., cabinet_flag,
    type, start, end, .., ministry_name(last column)].
    """
    #print('Updating ministerial positions')
    ministry = models.Ministry.objects.get_or_create(name=row[-1])[0]
    position_name = row[3]
    start, end = row[7], row[8]
    # A "Yes" in column 5 marks a Cabinet appointment, overriding column 6.
    appointment_type = "Cabinet" if row[5] == "Yes" else row[6]
    position = models.MinisterialPosition.objects.get_or_create(name=position_name)[0]
    models.MinisterialAppointment.objects.get_or_create(
        person=person,
        position=position,
        ministry=ministry,
        type=models.MinisterialAppointment.TYPES[appointment_type],
        end_date=import_utils.str_to_date(end),
        start_date=import_utils.str_to_date(start),
    )
def handle(self, *args, **options):
    """Import the people CSV named by args[0].

    Each row upserts a models.Person keyed on phid (column 0), then
    creates up to four seat records and three party memberships from
    fixed column offsets.
    """
    csv_path = args[0]
    print("importing - ", csv_path)
    with open(csv_path, 'r') as imported_csv:
        reader = csv.reader(imported_csv)  # reader over the raw file
        next(reader, None)  # drop the header line
        # iterate the data rows in file order
        for i, row in enumerate(reader):
            if i % 25 == 1:
                print('.', end="")  # progress heartbeat
            phid = row[0].replace("\"", "")
            person, created = models.Person.objects.get_or_create(
                phid=phid)
            # Column 3 is "Surname, First Names"
            name_bits = row[3].split(',', 1)
            person.sen_id = row[1].strip() or None
            person.rep_id = row[2].strip() or None
            person.first_names = name_bits[1].strip()
            person.surname = name_bits[0].strip()
            person.honorifics = row[5]
            person.preferred_name = row[6]
            person.postnomials = row[8]
            person.biography = row[18]
            person.gender = row[11]
            person.date_of_birth = import_utils.str_to_date(row[12])
            person.place_of_birth = row[13]
            person.date_of_death = import_utils.str_to_date(row[14])
            person.save()
            # Make senators and members seats (9-column blocks)
            for offset in (20, 29, 38, 47):
                if row[offset] != "":
                    self.make_division(person, row[offset:offset + 9])
            # Make parties (5-column blocks)
            for offset in (57, 62, 67):
                if row[offset] != "":
                    self.make_party_membership(person, row[offset:offset + 5])
    print("Done")
def handle(self, *args, **options):
    """Load the people CSV given as the first positional argument.

    Upserts Person records (keyed on phid) and attaches their seat and
    party-membership history from fixed column blocks.
    """
    source = args[0]
    print("importing - ", source)
    with open(source, 'r') as fh:
        rows = csv.reader(fh)  # creates the reader object
        next(rows, None)  # skip the headers
        row_number = 0
        for row in rows:  # iterates the rows of the file in order
            if row_number % 25 == 1:
                print('.', end="")  # progress dot
            row_number += 1
            person, created = models.Person.objects.get_or_create(
                phid=row[0].replace("\"", ""))
            # Column 3 holds "Surname, First Names"
            split_name = row[3].split(',', 1)
            person.sen_id = row[1].strip() or None
            person.rep_id = row[2].strip() or None
            person.first_names = split_name[1].strip()
            person.surname = split_name[0].strip()
            person.honorifics = row[5]
            person.preferred_name = row[6]
            person.postnomials = row[8]
            person.biography = row[18]
            person.gender = row[11]
            person.date_of_birth = import_utils.str_to_date(row[12])
            person.place_of_birth = row[13]
            person.date_of_death = import_utils.str_to_date(row[14])
            person.save()
            # Make senators and members seats: four possible 9-column blocks
            for start_col in [20, 29, 38, 47]:
                if row[start_col] != "":
                    self.make_division(person, row[start_col:start_col + 9])
            # Make parties: three possible 5-column blocks
            for start_col in [57, 62, 67]:
                if row[start_col] != "":
                    self.make_party_membership(person, row[start_col:start_col + 5])
    print("Done")
def update_committees(self, phid, committees):
    """Parse committee-membership paragraphs for person *phid* into the
    'committees' and 'committeememberships' output files.

    committees -- XML element whose <para> children look like
        "Type: Name from 1.1.1990 to 2.2.1992; Other Name from ..."
    """
    #print('Updating committee positions')
    # If you don't understand regular expressions, view the one below in
    # https://regex101.com/ for a full breakdown.  The short version is it
    # finds most possible dates for a committee membership from a
    # relatively free-text string.
    date_regex = r"(?:from (?P<start>[\d\.]+)(?:(?P<notes>.*?)?(?: to (?P<end>[\d\.]+)))?)"
    #Need an array as many committees might be on the same line
    for p in committees.findall('para'):
        # clean up text
        text = p.text.strip().strip('.')
        text = re.sub(r"\s\s+", " ", text)
        try:
            comm_type, comms = text.split(': ', 1)
        except ValueError:
            print("ERROR: ----", "Committee failed for whole line:", text)
            # BUG FIX: without this continue, `comms` below was stale or unbound.
            continue
        for comm in comms.split(';'):
            comm = comm.strip()
            comm_name = comm.split('from', 1)[0].strip()
            try:
                dates = "from" + comm.split('from', 1)[1]
            except IndexError:
                # BUG FIX: was a Python 2 print statement (SyntaxError on
                # py3); the trailing comma meant "no newline" -> end=" ".
                print('(', phid, '-', comm, ')', end=" ")
                continue
                #raise
            dates = re.findall(date_regex, dates)
            # BUG FIX: renamed from `committees`, which shadowed the parameter.
            committee_file = self.output_files['committees']
            committee_file.add(
                {
                    'pk': len(committee_file.data),
                    'name': comm_name,
                    'type': comm_type
                },
                check=('name', 'type'))
            committee = committee_file.get_as_dict(name=comm_name, type=comm_type)
            for date in dates:
                start = date[0]
                notes = date[1].strip()
                end = date[2]
                # For reference, this is the header:
                # ['parlhand.person.phid','parlhand.committee.name','parlhand.committee.type',
                # 'start_date','end_date','notes']
                com_mem = self.output_files['committeememberships']
                start_date = import_utils.str_to_date(start)
                if start_date:
                    start_date = start_date.date()
                end_date = import_utils.str_to_date(end)
                if end_date:
                    end_date = end_date.date()
                com_mem.append({
                    'parlhand.person.phid': phid,
                    'parlhand.committee.name': committee['name'],
                    # BUG FIX: this key duplicated 'parlhand.committee.name',
                    # silently overwriting the name with the type.
                    'parlhand.committee.type': committee['type'],
                    'start_date': start_date,
                    'end_date': end_date,
                    'notes': notes,
                })