示例#1
0
    def update_ministerials(self, person, ministerials):
        """Record the ministerial appointments listed under *ministerials*.

        Every ``para`` child is read as ``"<position> from <start>[ to <end>]"``.
        The position is fetched or created by name, and an appointment is
        fetched or created for (person, position, start date); the end date is
        only supplied as a default when the appointment is created.

        Paragraphs that cannot be parsed or stored are reported on stdout and
        skipped, so a single bad entry does not abort the import.

        :param person: value passed as ``person`` to the appointment lookup.
        :param ministerials: element exposing ``findall('para')``.
        """
        print('Updating ministerial positions')
        for p in ministerials.findall('para'):
            # An element without text has ``text`` set to None; there is
            # nothing to import from it.
            if p.text is None:
                continue

            # clean up text: trim, drop the trailing full stop, collapse
            # runs of whitespace into a single space
            text = p.text.strip().strip('.')
            text = re.sub(r"\s\s+", " ", text)

            try:
                position, dates = text.split(' from ', 1)
                start, separator, end = dates.partition(' to ')
                if not separator:
                    # Open-ended appointment: no end date was given.
                    end = None

                ministerial_position, c = models.MinisterialPosition.objects.get_or_create(name=position)
                models.MinisterialAppointment.objects.get_or_create(
                    person=person,
                    position=ministerial_position,
                    start_date=import_utils.str_to_date(start),
                    defaults={
                        'end_date': import_utils.str_to_date(end)
                    })
            except Exception:
                # Best-effort import: report the entry and carry on, but let
                # KeyboardInterrupt/SystemExit propagate.
                print("ERROR: ---- ", "Failed on text-", text)
示例#2
0
 def make_party_membership(self, person, data):
     """Ensure *person* has a membership record for the party in *data*.

     The party is fetched by its code (``data[1]``) and created with the
     name ``data[0]`` when it does not exist yet.  The membership is then
     fetched or created using the dates parsed from ``data[3]`` (start) and
     ``data[4]`` (end).
     """
     party_lookup = models.Party.objects.get_or_create(
         code=data[1],
         defaults={"name": data[0]},
     )
     party, _party_created = party_lookup

     joined = import_utils.str_to_date(data[3])
     left = import_utils.str_to_date(data[4])
     models.PartyMembership.objects.get_or_create(
         person=person, party=party, start_date=joined, end_date=left
     )
示例#3
0
 def make_party_membership(self,person,data):
     """Fetch or create the party named in *data* and the person's membership of it.

     ``data[1]`` is the party code used for the lookup and ``data[0]`` the
     name given to a newly created party.  ``data[3]`` and ``data[4]`` are
     parsed into the membership's start and end dates.
     """
     party = models.Party.objects.get_or_create(
         code=data[1], defaults=dict(name=data[0])
     )[0]

     # Collect the lookup fields one at a time, then do a single get_or_create.
     membership = dict(person=person, party=party)
     membership['start_date'] = import_utils.str_to_date(data[3])
     membership['end_date'] = import_utils.str_to_date(data[4])
     models.PartyMembership.objects.get_or_create(**membership)
    def update_committees(self, phid, committees):
        """Parse one person's committee service into the output tables.

        Each ``para`` child of *committees* is read as
        ``"<type>: <name> from <dates>; <name> from <dates>; ..."``.  Every
        committee is added to ``self.output_files["committees"]`` (checked on
        name and type), and one row per date range found is appended to
        ``self.output_files["committeememberships"]``.

        Lines without a ``": "`` separator and entries without a ``from``
        clause are reported on stdout and skipped.

        :param phid: identifier written to the ``parlhand.person.phid`` column.
        :param committees: element exposing ``findall("para")``.
        """
        # If you don't understand regular expressions, view the one below in
        # https://regex101.com/ for a full breakdown.  The short version is it
        # finds most possible dates for a committee membership from a
        # relatively freetext string.  Compiled once, outside the loops.
        date_pattern = re.compile(
            r"(?:from (?P<start>[\d\.]+)(?:(?P<notes>.*?)?(?: to (?P<end>[\d\.]+)))?)"
        )

        # Many committees might be on the same line, hence the nested loop.
        for p in committees.findall("para"):
            # An element without text has ``text`` set to None.
            if p.text is None:
                continue

            # clean up text
            text = p.text.strip().strip(".")
            text = re.sub(r"\s\s+", " ", text)

            try:
                comm_type, comms = text.split(": ", 1)
            except ValueError:
                print("ERROR: ----", "Committee failed for whole line:", text)
                # Skip the line: ``comms`` would otherwise be unbound, or still
                # hold the previous paragraph's committees.
                continue

            for comm in comms.split(";"):
                comm = comm.strip()
                comm_name, marker, remainder = comm.partition("from")
                comm_name = comm_name.strip()
                if not marker:
                    # No dates at all for this committee; note it and move on.
                    print("(", phid, "-", comm, ")", end=" ")
                    continue

                dates = date_pattern.findall("from" + remainder)

                # Named ``committee_table`` so the ``committees`` argument is
                # not shadowed while it is still being iterated.
                committee_table = self.output_files["committees"]
                committee_table.add(
                    {"pk": len(committee_table.data), "name": comm_name, "type": comm_type},
                    check=("name", "type"),
                )
                committee = committee_table.get_as_dict(name=comm_name, type=comm_type)

                # findall yields (start, notes, end) tuples, with "" for any
                # group that did not take part in the match.
                for start, notes, end in dates:
                    # For reference, this is the header:
                    # ['parlhand.person.phid','parlhand.committee.name','parlhand.committee.type',
                    #  'start_date','end_date','notes']
                    com_mem = self.output_files["committeememberships"]
                    start_date = import_utils.str_to_date(start)
                    if start_date:
                        start_date = start_date.date()
                    end_date = import_utils.str_to_date(end)
                    if end_date:
                        end_date = end_date.date()
                    com_mem.append(
                        {
                            "parlhand.person.phid": phid,
                            "parlhand.committee.name": committee["name"],
                            # Previously written under the ``name`` key, which
                            # overwrote the name and left this column empty.
                            "parlhand.committee.type": committee["type"],
                            "start_date": start_date,
                            "end_date": end_date,
                            "notes": notes.strip(),
                        }
                    )
示例#5
0
    def mine_prime_ministers(self):
        """Scrape Wikipedia's list of Australian Prime Ministers.

        Only rows of the first ``wikitable`` styled ``background:#EEEEEE`` are
        read.  A row with a non-empty header cell (or whose first cell
        mentions "Billy") starts a new Prime Minister: the person is looked up
        by surname and first names, falling back to the preferred name, and a
        portrait is scraped when the person has no picture yet.  Any other row
        with more than two cells is treated as a further term of the most
        recently matched person and handed to ``self.make_pm``.

        Lookups that match no person, or more than one, are reported on stdout
        and that Prime Minister's rows are skipped.
        """
        print("Mining Prime Ministers")
        http = httplib2.Http()
        status, response = http.request(
            'https://en.wikipedia.org/wiki/List_of_Prime_Ministers_of_Australia'
        )
        soup = BeautifulSoup(response, "lxml")
        tables = soup.findAll('table', class_="wikitable")

        rows = tables[0].findAll('tr', {'style': "background:#EEEEEE"})

        # Person matched by the latest starter row.  Stays None until a lookup
        # succeeds, so continuation rows never see an unbound name or a bare
        # name string.
        pm = None

        for row in rows:
            cols = row.findAll('td')
            headers = row.findAll('th')

            # Damn you Billy Hughes
            if headers and (headers[0].text or "Billy" in cols[0].text):
                # This row is a new starter row, find the PM and do stuff.
                # Forget the previous PM first so a failed lookup cannot leak
                # into the continuation rows that follow.
                pm = None
                pm_name = cols[0].text.replace('Sir ', '').split('(')[0]
                first_name, last_name = pm_name.split(' ')
                print(first_name, last_name)
                first_name = first_name.strip()
                last_name = last_name.strip()

                qs = models.Person.objects.filter(
                    surname=last_name, first_names__icontains=first_name)
                matches = qs.count()
                if matches == 0:
                    qs = models.Person.objects.filter(
                        surname=last_name,
                        preferred_name__icontains=first_name)
                    matches = qs.count()
                if matches != 1:
                    if matches < 1:
                        print("ERROR - no PM for -", pm_name)
                    else:
                        print("ERROR - too many PM's for -", pm_name)
                        print("   found these: ", qs.all())
                    continue
                pm = qs.first()

                if not pm.picture:
                    # Not every row is guaranteed to carry an image.
                    images = cols[1].findAll('img')
                    if images:
                        self.scrape_image_for_person(images[0].get('src'), pm)

                # NOTE(review): the term on the starter row is parsed but not
                # recorded; the original make_pm call here was commented out.
                # Confirm whether it should be re-enabled.
                start = import_utils.str_to_date(cols[3].text)
                end = import_utils.str_to_date(cols[4].text)
            else:
                # Continuation row use prior pm
                if not pm:
                    print(" -- No PM?")
                    continue
                # This is a different prime ministership
                if len(cols) > 2:
                    print(cols)
                    start = import_utils.str_to_date(cols[1].text)
                    end = import_utils.str_to_date(cols[2].text)
                    self.make_pm(pm, start, end)
示例#6
0
    def mine_prime_ministers(self):
        """Scrape Wikipedia's list of Australian Prime Ministers.

        Only rows of the first ``wikitable`` styled ``background:#EEEEEE`` are
        read.  A row with a non-empty header cell (or whose first cell
        mentions "Billy") starts a new Prime Minister: the person is looked up
        by surname and first names, falling back to the preferred name, and a
        portrait is scraped when the person has no picture yet.  Any other row
        with more than two cells is treated as a further term of the most
        recently matched person and handed to ``self.make_pm``.

        Lookups that match no person, or more than one, are reported on stdout
        and that Prime Minister's rows are skipped.
        """
        print("Mining Prime Ministers")
        http = httplib2.Http()
        status, response = http.request("https://en.wikipedia.org/wiki/List_of_Prime_Ministers_of_Australia")
        soup = BeautifulSoup(response, "lxml")
        tables = soup.findAll("table", class_="wikitable")

        rows = tables[0].findAll("tr", {"style": "background:#EEEEEE"})

        # Person matched by the latest starter row.  Stays None until a lookup
        # succeeds, so continuation rows never see an unbound name or a bare
        # name string.
        pm = None

        for row in rows:
            cols = row.findAll("td")
            headers = row.findAll("th")

            # Damn you Billy Hughes
            if headers and (headers[0].text or "Billy" in cols[0].text):
                # This row is a new starter row, find the PM and do stuff.
                # Forget the previous PM first so a failed lookup cannot leak
                # into the continuation rows that follow.
                pm = None
                pm_name = cols[0].text.replace("Sir ", "").split("(")[0]
                first_name, last_name = pm_name.split(" ")
                print(first_name, last_name)
                first_name = first_name.strip()
                last_name = last_name.strip()

                qs = models.Person.objects.filter(surname=last_name, first_names__icontains=first_name)
                matches = qs.count()
                if matches == 0:
                    qs = models.Person.objects.filter(surname=last_name, preferred_name__icontains=first_name)
                    matches = qs.count()
                if matches != 1:
                    if matches < 1:
                        print("ERROR - no PM for -", pm_name)
                    else:
                        print("ERROR - too many PM's for -", pm_name)
                        print("   found these: ", qs.all())
                    continue
                pm = qs.first()

                if not pm.picture:
                    # Not every row is guaranteed to carry an image.
                    images = cols[1].findAll("img")
                    if images:
                        self.scrape_image_for_person(images[0].get("src"), pm)

                # NOTE(review): the term on the starter row is parsed but not
                # recorded; the original make_pm call here was commented out.
                # Confirm whether it should be re-enabled.
                start = import_utils.str_to_date(cols[3].text)
                end = import_utils.str_to_date(cols[4].text)
            else:
                # Continuation row use prior pm
                if not pm:
                    print(" -- No PM?")
                    continue
                # This is a different prime ministership
                if len(cols) > 2:
                    print(cols)
                    start = import_utils.str_to_date(cols[1].text)
                    end = import_utils.str_to_date(cols[2].text)
                    self.make_pm(pm, start, end)
示例#7
0
 def make_member(self, person, data):
     """Fetch or create the Member service record described by *data*.

     ``data[3]`` is the state code and ``data[2]`` the electorate name; both
     records are created on demand.  The service is looked up by person,
     electorate, seat type and the dates parsed from ``data[5]`` / ``data[6]``;
     ``data[4]`` and ``data[8]`` are only used as the start and end reasons of
     a newly created record.
     """
     state = models.State.objects.get_or_create(code=data[3])[0]
     electorate = models.Electorate.objects.get_or_create(name=data[2], state=state)[0]

     seat_type = models.Service.SEAT_TYPES.Member
     term_start = import_utils.str_to_date(data[5])
     term_end = import_utils.str_to_date(data[6])
     reasons = {'start_reason': data[4], 'end_reason': data[8]}

     models.Service.objects.get_or_create(
         person=person,
         electorate=electorate,
         seat_type=seat_type,
         start_date=term_start,
         end_date=term_end,
         defaults=reasons,
     )
示例#8
0
 def make_senator(self,person,data):
     """Fetch or create the Senator service record described by *data*.

     The state is looked up by the code in ``data[3]`` and, when missing,
     created with ``data[2]`` as its name; ``data[2]`` is also the electorate
     name.  The service is looked up by person, electorate, seat type and the
     dates parsed from ``data[5]`` / ``data[6]``; ``data[4]`` and ``data[8]``
     are only used as the start and end reasons of a newly created record.
     """
     state_lookup = models.State.objects.get_or_create(
         code=data[3], defaults={'name': data[2]})
     state = state_lookup[0]
     electorate = models.Electorate.objects.get_or_create(name=data[2], state=state)[0]

     # Build the lookup step by step, in the order the fields are evaluated.
     service = dict(person=person, electorate=electorate)
     service['seat_type'] = models.Service.SEAT_TYPES.Senator
     service['start_date'] = import_utils.str_to_date(data[5])
     service['end_date'] = import_utils.str_to_date(data[6])
     service['defaults'] = {'start_reason': data[4], 'end_reason': data[8]}
     models.Service.objects.get_or_create(**service)
示例#9
0
    def mine_deputy_prime_ministers(self):
        """Scrape Wikipedia's Deputy Prime Minister table.

        Every row after the first of the first ``wikitable`` is read.  The
        name in the second cell is matched against ``Person`` by surname and
        first names, falling back to the preferred name.  A portrait from the
        third cell is scraped when the person has no picture yet, and the
        ``dtstart`` / ``dtend`` spans are parsed into dates and printed.

        Lookups that match no person, or more than one, are reported on stdout
        and the row is skipped.
        """
        print("Mining Deputy Prime Ministers")
        http = httplib2.Http()
        status, response = http.request(
            'https://en.wikipedia.org/wiki/Deputy_Prime_Minister_of_Australia')
        soup = BeautifulSoup(response, "lxml")
        tables = soup.findAll('table', class_="wikitable")

        # The first row is skipped (presumably the table header; verify
        # against the page layout).
        for row in tables[0].findAll('tr')[1:]:
            cols = row.findAll('td')

            # This row is a new starter row, find the Deputy PM and do stuff
            dpm_name = cols[1].text.replace('Sir ', '').split('(')[0]

            first, last = dpm_name.split(' ')
            # Strip stray whitespace/newlines from the cell text, as the
            # Prime Minister scraper does, so the lookups can match.
            first = first.strip()
            last = last.strip()

            # Query the Person table directly; the original filtered ``qs``
            # before it had ever been assigned.
            qs = models.Person.objects.filter(surname__icontains=last,
                                              first_names__icontains=first)
            matches = qs.count()
            if matches == 0:
                qs = models.Person.objects.filter(
                    surname__icontains=last, preferred_name__icontains=first)
                matches = qs.count()
            if matches != 1:
                if matches < 1:
                    print("ERROR - no Deputy PM for -", dpm_name)
                else:
                    print("ERROR - too many Deputy PM's for -", dpm_name)
                    print("   found these: ", qs.all())
                continue
            dpm = qs.first()

            if not dpm.picture:
                picture = cols[2].findAll('img')
                if picture:
                    self.scrape_image_for_person(picture[0].get('src'), dpm)

            start, end = None, None
            start_loc = row.findAll('span', class_='dtstart')
            end_loc = row.findAll('span', class_='dtend')
            if start_loc:
                start = import_utils.str_to_date(start_loc[0].text)
            if end_loc:
                end = import_utils.str_to_date(end_loc[0].text)
            print("-----", dpm, start_loc, end_loc, start, end)
示例#10
0
    def mine_deputy_prime_ministers(self):
        """Scrape Wikipedia's Deputy Prime Minister table.

        Every row after the first of the first ``wikitable`` is read.  The
        name in the second cell is matched against ``Person`` by surname and
        first names, falling back to the preferred name.  A portrait from the
        third cell is scraped when the person has no picture yet, and the
        ``dtstart`` / ``dtend`` spans are parsed into dates and printed.

        Lookups that match no person, or more than one, are reported on stdout
        and the row is skipped.
        """
        print("Mining Deputy Prime Ministers")
        http = httplib2.Http()
        status, response = http.request("https://en.wikipedia.org/wiki/Deputy_Prime_Minister_of_Australia")
        soup = BeautifulSoup(response, "lxml")
        tables = soup.findAll("table", class_="wikitable")

        # The first row is skipped (presumably the table header; verify
        # against the page layout).
        for row in tables[0].findAll("tr")[1:]:
            cols = row.findAll("td")

            # This row is a new starter row, find the Deputy PM and do stuff
            dpm_name = cols[1].text.replace("Sir ", "").split("(")[0]

            first, last = dpm_name.split(" ")
            # Strip stray whitespace/newlines from the cell text, as the
            # Prime Minister scraper does, so the lookups can match.
            first = first.strip()
            last = last.strip()

            # Query the Person table directly; the original filtered ``qs``
            # before it had ever been assigned.
            qs = models.Person.objects.filter(surname__icontains=last, first_names__icontains=first)
            matches = qs.count()
            if matches == 0:
                qs = models.Person.objects.filter(surname__icontains=last, preferred_name__icontains=first)
                matches = qs.count()
            if matches != 1:
                if matches < 1:
                    print("ERROR - no Deputy PM for -", dpm_name)
                else:
                    print("ERROR - too many Deputy PM's for -", dpm_name)
                    print("   found these: ", qs.all())
                continue
            dpm = qs.first()

            if not dpm.picture:
                picture = cols[2].findAll("img")
                if picture:
                    self.scrape_image_for_person(picture[0].get("src"), dpm)

            start, end = None, None
            start_loc = row.findAll("span", class_="dtstart")
            end_loc = row.findAll("span", class_="dtend")
            if start_loc:
                start = import_utils.str_to_date(start_loc[0].text)
            if end_loc:
                end = import_utils.str_to_date(end_loc[0].text)
            print("-----", dpm, start_loc, end_loc, start, end)
示例#11
0
    def update_committees(self, person, committees):
        """Store one person's committee memberships in the database.

        Each ``para`` child of *committees* is read as
        ``"<type>: <name> from <dates>; <name> from <dates>; ..."``.  The
        committee is fetched or created by name and type, and a membership is
        updated or created for every date range found, keyed on person,
        committee and start date.

        Lines without a ``": "`` separator and committees that fail to parse
        or save are reported on stdout and skipped.

        :param person: value passed as ``person`` to the membership lookup.
        :param committees: element exposing ``findall('para')``.
        """
        print('Updating committee positions')

        # If you don't understand regular expressions, view the one below in
        # https://regex101.com/ for a full breakdown.  The short version is it
        # finds most possible dates for a committee membership from a
        # relatively freetext string.  Compiled once, outside the loops.
        date_pattern = re.compile(
            r"(?:from (?P<start>[\d\.]+)(?:(?P<notes>.*?)?(?: to (?P<end>[\d\.]+)))?)"
        )

        # Many committees might be on the same line, hence the nested loop.
        for p in committees.findall('para'):
            # An element without text has ``text`` set to None.
            if p.text is None:
                continue

            # clean up text
            text = p.text.strip().strip('.')
            text = re.sub(r"\s\s+", " ", text)

            try:
                comm_type, comms = text.split(': ', 1)
            except ValueError:
                print("ERROR: ----", "Committee failed for whole line:", text)
                # Skip the line: ``comms`` would otherwise be unbound, or still
                # hold the previous paragraph's committees.
                continue

            for comm in comms.split(';'):
                comm = comm.strip()
                try:
                    comm_name = comm.split('from', 1)[0].strip()
                    dates = "from" + comm.split('from', 1)[1]
                    dates = date_pattern.findall(dates)

                    committee, c = models.Committee.objects.get_or_create(name=comm_name, type=comm_type)
                    # findall yields (start, notes, end) tuples, with "" for
                    # any group that did not take part in the match.
                    for start, notes, end in dates:
                        models.CommitteeMembership.objects.update_or_create(
                            person=person,
                            committee=committee,
                            start_date=import_utils.str_to_date(start),
                            defaults={
                                'notes': notes.strip(),
                                'end_date': import_utils.str_to_date(end)
                            })
                except Exception as e:
                    # Best-effort import: report the committee with its cause
                    # and keep going with the next one.
                    print("ERROR: ----", "Committee failed at:", comm, "-", e)
示例#12
0
    def update_ministerials(self,person,row):
        """Fetch or create the ministerial appointment described by *row*.

        Columns read: ``row[-1]`` ministry name, ``row[3]`` position name,
        ``row[6]`` appointment type, ``row[5]`` flag that forces the type to
        ``"Cabinet"`` when it equals ``"Yes"``, ``row[7]`` start and ``row[8]``
        end.  The ministry and position are created on demand.
        """
        ministry = models.Ministry.objects.get_or_create(name=row[-1])[0]

        start_text, end_text = row[7], row[8]
        position_name = row[3]
        appointment_type = row[6]
        if row[5] == "Yes":
            # The flag in column 5 overrides whatever column 6 says.
            appointment_type = "Cabinet"

        position = models.MinisterialPosition.objects.get_or_create(name=position_name)[0]

        # Every field takes part in the lookup; nothing is passed as a default.
        lookup = {
            'person': person,
            'position': position,
            'ministry': ministry,
            'type': models.MinisterialAppointment.TYPES[appointment_type],
            'end_date': import_utils.str_to_date(end_text),
            'start_date': import_utils.str_to_date(start_text),
        }
        models.MinisterialAppointment.objects.get_or_create(**lookup)
示例#13
0
    def handle(self, *args, **options):
        """Import people, their seats and their party memberships from a CSV file.

        The path is taken from ``args[0]``.  The first line is skipped as a
        header.  For every remaining row the person identified by column 0 is
        fetched or created, updated from the row and saved; each non-empty
        9-column seat group and 5-column party group is then passed to
        ``self.make_division`` / ``self.make_party_membership``.
        """
        csv_path = args[0]
        print("importing - ", csv_path)
        with open(csv_path, 'r') as imported_csv:
            records = csv.reader(imported_csv)
            next(records, None)  # skip the headers
            for index, row in enumerate(records):
                # Progress marker, printed without a newline.
                if index % 25 == 1:
                    print('.', end="")

                person, created = models.Person.objects.get_or_create(
                    phid=row[0].replace('"', ''))

                # Column 3 is split once on the first comma: surname first,
                # first names after.
                name_parts = row[3].split(',', 1)

                person.sen_id = row[1].strip() or None
                person.rep_id = row[2].strip() or None
                person.first_names = name_parts[1].strip()
                person.surname = name_parts[0].strip()
                person.honorifics = row[5]
                person.preferred_name = row[6]
                person.postnomials = row[8]
                person.biography = row[18]
                person.gender = row[11]
                person.date_of_birth = import_utils.str_to_date(row[12])
                person.place_of_birth = row[13]
                person.date_of_death = import_utils.str_to_date(row[14])
                person.save()

                # Make senators and members seats
                for offset in (20, 29, 38, 47):
                    if row[offset] == "":
                        continue
                    self.make_division(person, row[offset:offset + 9])

                # Make parties
                for offset in (57, 62, 67):
                    if row[offset] == "":
                        continue
                    self.make_party_membership(person, row[offset:offset + 5])
        print("Done")
示例#14
0
 def handle(self, *args, **options):
     """Load people, seats and party memberships from the CSV named in ``args[0]``.

     The header line is discarded.  Each data row updates (or creates) the
     person whose ``phid`` is in column 0, then hands every non-empty seat
     group (9 columns wide) to ``self.make_division`` and every non-empty
     party group (5 columns wide) to ``self.make_party_membership``.
     """
     source = args[0]
     print("importing - ", source)

     seat_starts = [20, 29, 38, 47]
     party_starts = [57, 62, 67]

     with open(source, 'r') as imported_csv:
         reader = csv.reader(imported_csv)
         next(reader, None)  # the first line only holds column titles
         for row_number, row in enumerate(reader):
             if row_number % 25 == 1:
                 print('.', end="")

             phid = row[0].replace("\"", "")
             person = models.Person.objects.get_or_create(phid=phid)[0]

             person.sen_id = row[1].strip() or None
             person.rep_id = row[2].strip() or None

             # Column 3 is "surname, first names"; split on the first comma only.
             full_name = row[3].split(',', 1)
             person.first_names = full_name[1].strip()
             person.surname = full_name[0].strip()

             person.honorifics = row[5]
             person.preferred_name = row[6]
             person.postnomials = row[8]
             person.biography = row[18]
             person.gender = row[11]
             person.date_of_birth = import_utils.str_to_date(row[12])
             person.place_of_birth = row[13]
             person.date_of_death = import_utils.str_to_date(row[14])
             person.save()

             # Seats held as senator or member
             for first_col in seat_starts:
                 if row[first_col] != "":
                     self.make_division(person, row[first_col:first_col + 9])

             # Party memberships
             for first_col in party_starts:
                 if row[first_col] != "":
                     self.make_party_membership(person, row[first_col:first_col + 5])
     print("Done")
    def update_committees(self, phid, committees):
        """Parse one person's committee service into the output tables.

        Each ``para`` child of *committees* is read as
        ``"<type>: <name> from <dates>; <name> from <dates>; ..."``.  Every
        committee is added to ``self.output_files['committees']`` (checked on
        name and type), and one row per date range found is appended to
        ``self.output_files['committeememberships']``.

        Lines without a ``": "`` separator and entries without a ``from``
        clause are reported on stdout and skipped.

        :param phid: identifier written to the ``parlhand.person.phid`` column.
        :param committees: element exposing ``findall('para')``.
        """
        # If you don't understand regular expressions, view the one below in
        # https://regex101.com/ for a full breakdown.  The short version is it
        # finds most possible dates for a committee membership from a
        # relatively freetext string.  Compiled once, outside the loops.
        date_pattern = re.compile(
            r"(?:from (?P<start>[\d\.]+)(?:(?P<notes>.*?)?(?: to (?P<end>[\d\.]+)))?)"
        )

        # Many committees might be on the same line, hence the nested loop.
        for p in committees.findall('para'):
            # An element without text has ``text`` set to None.
            if p.text is None:
                continue

            # clean up text
            text = p.text.strip().strip('.')
            text = re.sub(r"\s\s+", " ", text)

            try:
                comm_type, comms = text.split(': ', 1)
            except ValueError:
                print("ERROR: ----", "Committee failed for whole line:", text)
                # Skip the line: ``comms`` would otherwise be unbound, or still
                # hold the previous paragraph's committees.
                continue

            for comm in comms.split(';'):
                comm = comm.strip()
                comm_name, marker, remainder = comm.partition('from')
                comm_name = comm_name.strip()
                if not marker:
                    # No dates at all for this committee; note it and move on.
                    print('(', phid, '-', comm, ')', end=" ")
                    continue

                dates = date_pattern.findall("from" + remainder)

                # Named ``committee_table`` so the ``committees`` argument is
                # not shadowed while it is still being iterated.
                committee_table = self.output_files['committees']
                committee_table.add(
                    {
                        'pk': len(committee_table.data),
                        'name': comm_name,
                        'type': comm_type
                    },
                    check=('name', 'type'))
                committee = committee_table.get_as_dict(name=comm_name,
                                                        type=comm_type)

                # findall yields (start, notes, end) tuples, with "" for any
                # group that did not take part in the match.
                for start, notes, end in dates:
                    # For reference, this is the header:
                    # ['parlhand.person.phid','parlhand.committee.name','parlhand.committee.type',
                    #  'start_date','end_date','notes']
                    com_mem = self.output_files['committeememberships']
                    start_date = import_utils.str_to_date(start)
                    if start_date:
                        start_date = start_date.date()
                    end_date = import_utils.str_to_date(end)
                    if end_date:
                        end_date = end_date.date()
                    com_mem.append({
                        'parlhand.person.phid': phid,
                        'parlhand.committee.name': committee['name'],
                        # Previously written under the ``name`` key, which
                        # overwrote the name and left this column empty.
                        'parlhand.committee.type': committee['type'],
                        'start_date': start_date,
                        'end_date': end_date,
                        'notes': notes.strip(),
                    })