예제 #1
0
파일: parse.py 프로젝트: wilbrodn/aleph
def parse_country(country, guess=True):
    """Resolve an input value to a lower-case country code.

    When ``guess`` is true the input is first run through
    ``countrynames.to_code``. The lower-cased candidate is returned only
    when ``is_country_code`` accepts it; in every other case the result
    is ``None``.
    """
    candidate = countrynames.to_code(country) if guess else country
    if candidate is None:
        return None
    candidate = candidate.lower()
    return candidate if is_country_code(candidate) else None
예제 #2
0
    def clean_text(self, country, fuzzy=False, **kwargs):
        """Return a lower-case country code for the given input.

        The stripped, lower-cased input is returned directly when it is
        already one of ``self.codes``. Otherwise the raw input is handed
        to ``countrynames.to_code`` and a hit is returned lower-cased;
        a miss yields ``None``.
        """
        normalized = country.lower().strip()
        if normalized not in self.codes:
            resolved = countrynames.to_code(country, fuzzy=fuzzy)
            return None if resolved is None else resolved.lower()
        return normalized
예제 #3
0
def test_GB():
    """Check UK subdivision names and the full UK name, exact and fuzzy."""
    uk_long_name = "United Kingdom of Great Britain and Northern Ireland"
    # (input, keyword options for to_code, expected code)
    expectations = (
        ("Scotland", {}, "GB-SCT"),
        ("Wales", {}, "GB-WLS"),
        ("Northern Ireland", {}, "GB-NIR"),
        ("Northern Ireland", {"fuzzy": True}, "GB-NIR"),
        (uk_long_name, {}, "GB"),
        (uk_long_name, {"fuzzy": True}, "GB"),
    )
    for name, options, expected in expectations:
        assert to_code(name, **options) == expected
예제 #4
0
    def clean_text(
        self,
        text: str,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        """Resolve ``text`` to a lower-case country code known to this type.

        ``text`` is looked up with ``countrynames.to_code``; the result is
        lower-cased and returned only if it is a member of ``self.codes``.
        ``None`` is returned when the lookup fails or the code is unknown.
        ``format`` and ``proxy`` are accepted but not used here.
        """
        resolved = countrynames.to_code(text, fuzzy=fuzzy)
        if resolved is None:
            return None
        candidate = resolved.lower()
        return candidate if candidate in self.codes else None
예제 #5
0
    def emit_entity(self, data):
        """Validate, normalise and upsert one entity record.

        ``data`` is a dict that is modified in place: ``weight`` is coerced
        to ``int``, ``country`` (when present) is replaced by the result of
        ``countrynames.to_code``, ``name`` is stripped, keys whose value is
        ``None`` are removed and ``aliases`` is popped off. The record is
        upserted into ``self.entities`` keyed on ``origin`` and ``uid``, and
        every alias is forwarded to ``self.emit_alias``.

        Returns the cleaned ``data`` dict.

        Raises ``ValueError`` when neither ``uid`` nor ``uid_canonical`` is
        set, when ``type`` is not in ``TYPES``, or when ``weight`` cannot be
        converted to an integer.
        """
        uid = data.get('uid') or data.get('uid_canonical')
        if uid is None:
            # Interpolate explicitly: exception constructors do not apply
            # %-formatting to their arguments the way logging calls do.
            raise ValueError("No UID for entity: %r" % (data,))

        if data.get('type') not in TYPES:
            raise ValueError("Invalid entity type: %r" % (data,))

        try:
            data['weight'] = int(data.get('weight', 0))
        except Exception:
            raise ValueError("Invalid weight: %r" % (data,))

        if 'country' in data:
            data['country'] = countrynames.to_code(data['country'])

        name = data.get('name')
        if name is not None:
            # NOTE(review): `unicode` is the Python 2 builtin; kept because
            # the surrounding code appears to target Python 2.
            name = unicode(name).strip()
            # An empty name is treated as missing and dropped below.
            data['name'] = name or None

        # Collect the keys first: removing entries while iterating the live
        # items view raises RuntimeError on Python 3.
        for key in [k for k, v in data.items() if v is None]:
            data.pop(key)

        # TODO: partial dates
        aliases = data.pop('aliases', [])
        self.entities.upsert(data, ['origin', 'uid'])
        for alias in aliases:
            self.emit_alias({
                'name': alias,
                'origin': data.get('origin'),
                'uid': data.get('uid'),
                'uid_canonical': data.get('uid_canonical'),
            })
        return data
예제 #6
0
def test_non_standard_codes():
    """Entities without a standard ISO entry still resolve to codes."""
    # (name, expected 2-letter code, expected 3-letter code)
    expectations = (
        ("European Union", "EU", "EUU"),
        ("Kosovo", "XK", "XKX"),
    )
    for name, alpha2, alpha3 in expectations:
        assert to_code(name) == alpha2
        assert to_code_3(name) == alpha3
예제 #7
0
def test_fuzzy_matching():
    """Misspelt or transliterated names resolve when fuzzy matching is on."""
    expectations = (
        ('Rossiyskaya Federatsiya', "RU"),
        ("Falklands Islands", "FK"),
        ("TGermany", "DE"),
    )
    for name, expected in expectations:
        assert to_code(name, fuzzy=True) == expected
예제 #8
0
def test_unicode():
    """A country name written in Cyrillic resolves to its code."""
    russian_federation = u'Российская Федерация'
    expected = "RU"
    assert to_code(russian_federation) == expected
def parse_officer(line):
    """Parse one fixed-width officer record into a dict of fields.

    The first 76 characters of ``line`` are sliced into fixed-position
    fields; the remainder is a ``<``-delimited variable section holding the
    officer's name, service address, occupation and nationality. Three
    ``*_norm`` country fields (via ``to_code`` with fuzzy matching) and a
    ``name_fp`` fingerprint (via ``generate``) are derived from the parsed
    values.

    Returns the dict of parsed and derived fields. A ``line`` shorter than
    25 characters raises ``IndexError`` from the single-character lookups.
    """
    results = dict()

    # number of the company to which this officer is appointed to.
    # The majority of company numbers are 8 digit numeric; however, some
    # consist of a prefix of 2 alphanum characters
    # followed by 6 digits.
    results['appointed_to_company_number'] = line[0:8]

    # nature of the officer.
    # 1 is person (as in officer, could be legal or natural person)
    # 2 is company (as in companies to which the officer is appointed to)
    results['record_type'] = line[8]

    # source document of the appointment date.
    results['appointment_date_origin_code'] = line[9:10]

    # role of the appointed officer.
    results['officer_role_code'] = line[10:12]

    # personal number: as of 2009 pnr are composed of 12 digits. The first
    # 8 uniquely identify the person.  A person is composed of a name and a
    # usual residence address (URA) which is *not* public. If director with
    # many appointments changes URA or name for an appointment then the pnr
    # last 4 digits will be incremented from 0000 to 0001.
    results['pnr'] = line[12:24]

    # indicator for record being a company.  An officer can be either a
    # natural person (flag != 'Y') or a legal person, i.e. a corporation
    # (flag == 'Y').
    results['is_company'] = line[24] == 'Y'

    # filler, can throw away.
    results['filler_a'] = line[25:32]

    # appointment dates.  If a date is provided for officer_role_code 11,
    # 12, or 13 this refers to the date that the form was registered.
    # Resigned appointments are not normally included in a snapshot so this
    # field will usually be blank.  date format: CCYYMMDD (C for century, Y
    # for year, M for month, D for day).
    results['start_date_text'] = line[32:40]
    results['end_date_text'] = line[40:48]

    # postal code.
    results['service_address_post_code'] = line[48:56]

    # dob.  partial_dob field will contain either all spaces, or a partial
    # dob followed by 2 space chars 'CCYYMM  '.  If full_dob is provided
    # then partial_dob will also be provided, but partial_dob may be
    # provided w/out full_dob.
    results['partial_dob'] = line[56:64]

    # full_dob could be thrown away but we keep for completeness.
    # tested on 1000 records, always '        '.
    results['full_dob'] = line[64:72]

    # holds the length of the variable data bit (incl. "<" chars), used for
    # validation, do not insert in database.
    results['unwanted_variable_data_length'] = line[72:76]

    # variable_data: contains officer's name, service address, occupation,
    # and nationality, formatted as below:
    # TITLE                      |-> 'title'
    # <FORENAMES                 |-> 'name'
    # <SURNAME                   |-> 'surname'
    # <HONOURS                   |-> 'honours'
    # <CARE OF                   |-> 'service_address_care_of'
    # <PO BOX                    |-> 'service_address_po_box'
    # <ADDRESS LINE 1            |-> 'service_address_line_1'
    # <ADDRESS LINE 2            |-> 'service_address_line_2'
    # <POST TOWN                 |-> 'service_address_post_town'
    # <COUNTY                    |-> 'service_address_county'
    # <COUNTRY                   |-> 'service_address_country'
    # <OCCUPATION                |-> 'occupation'
    # <NATIONALITY               |-> 'nationality'
    # <USUAL RESIDENTIAL COUNTRY |-> 'ura_country'
    # <                          |-> 'filler_b'

    # Each variable data field will contain 14 "<" delimiters.  Consecutive
    # "<" delimiters indicates that the particular element of the variable
    # data is not present.

    variable_data = line[76:].rstrip(' \n')
    vardata = variable_data.split('<')
    vardata_components = (
        'title',
        'name',
        'surname',
        'honours',
        'service_address_care_of',
        'service_address_po_box',
        'service_address_line_1',
        'service_address_line_2',
        'service_address_post_town',
        'service_address_county',
        'service_address_country',
        'occupation',
        'nationality',
        'ura_country',
        'filler_b',
    )  # after the last '<' there's just a bunch of
    # white spaces till end of line, can throw away.

    # zip() stops at the shorter sequence, so a record with fewer fields
    # simply leaves the trailing components unset.
    results.update(zip(vardata_components, vardata))

    # Derived fields are computed once, after every component is in place.
    # The usual-residence country is normalised from 'ura_country' (the
    # parsed field), not from the derived key itself.
    results["ura_country_norm"] = to_code(results.get("ura_country", None),
                                          fuzzy=True)
    results["nationality_norm"] = to_code(results.get("nationality", None),
                                          fuzzy=True)
    results["service_address_country_norm"] = to_code(
        results.get("service_address_country", None), fuzzy=True)

    results["name_fp"] = generate(
        results.get("name", "") + " " + results.get("surname", ""))

    return results
예제 #10
0
 def country(self, name):
     """Store the country name and the code countrynames resolves it to."""
     self.country_name = name
     resolved = countrynames.to_code(name)
     self.country_code = resolved
예제 #11
0
def normalize_country(name):
    """Return whatever ``countrynames.to_code`` yields for ``name``."""
    code = countrynames.to_code(name)
    return code
예제 #12
0
def main(f, failed_out, new_out, log_out):
    """Annotate YAML documents with country codes and centers.

    Reads every YAML document from the file ``f``. For each document the
    affiliations returned by ``has_affiliation`` are resolved to country
    codes (``countrynames.to_code`` with fuzzy matching) and to centers
    (``get_centers``). Documents that resolve are stored with ``countries``
    and ``centers`` keys; the others are collected as failures.

    Whatever happens while processing, the failed documents are written to
    ``failed_out`` and the annotated ones to ``new_out``. Log records go to
    ``log_out`` and to the default stream handler.
    """
    logging.basicConfig(filename=log_out, level=logging.INFO)
    logging.getLogger().addHandler(logging.StreamHandler())
    new_docs = []
    failed_docs = []
    try:

        with open(f) as fd:
            # NOTE(review): yaml.load_all without an explicit Loader can
            # construct arbitrary objects from untrusted input; consider
            # yaml.safe_load_all if the input is not fully trusted.
            docs = list(yaml.load_all(fd))
            N = len(docs)
            for K, d in enumerate(docs):
                url = d.get('url')
                affs = has_affiliation(d)
                success_rate = 100.0 - float(len(failed_docs)) / N * 100
                logging.info(
                    "{c.Fore.CYAN}[{s:.1f}%]: {K}/{N}. {t}>>"
                    "{c.Fore.YELLOW}Trying {0}{c.Style.RESET_ALL}".format(
                        url,
                        s=success_rate,
                        K=K,
                        N=N,
                        t=time.ctime(),
                        c=colorama))
                try:
                    if not affs:
                        raise Exception('No affiliations')
                    countries = list(
                        filter(bool, [
                            countrynames.to_code(aff, fuzzy=True)
                            for aff in affs
                        ]))
                    centers = list(
                        filter(bool, [get_centers(aff) for aff in affs]))
                    # NOTE(review): the message mentions countries but the
                    # check is on centers - confirm which one is intended.
                    if not centers:
                        raise Exception(
                            'No countries ({aff!s})'.format(aff=affs))
                except Exception as e:
                    logging.info("{c.Fore.RED}\tFailed ({e})"
                                 "{c.Style.RESET_ALL}".format(e=e, c=colorama))
                    failed_docs.append(d)
                else:
                    logging.info("{c.Fore.GREEN}\tsuccess "
                                 "{codes!s}\n"
                                 "\t\t{affs!s}"
                                 "{c.Style.RESET_ALL}".format(codes=countries,
                                                              affs=affs,
                                                              c=colorama))
                    d['countries'] = countries
                    # Store the resolved centers; the country list was
                    # previously written under this key as well.
                    d['centers'] = centers
                    new_docs.append(d)

    except Exception as e:
        logging.error(e)
    finally:
        # Always flush both result sets, even after a failure above.
        with open(failed_out, 'w+') as fd:
            logging.info('writing ' + failed_out)
            yaml.dump_all(failed_docs,
                          fd,
                          allow_unicode=True,
                          default_flow_style=False)

        with open(new_out, 'w+') as fd:
            logging.info('writing ' + new_out)
            yaml.dump_all(new_docs,
                          fd,
                          allow_unicode=True,
                          default_flow_style=False)
예제 #13
0
def process_file(filepath):
    """Load a line-delimited JSON snapshot file into the database tables.

    Every non-blank line is parsed as JSON and classified with
    ``determine_line``:

    * ``"psc"`` lines are unpacked into a psc record plus optional address,
      identification and control parts; country-like fields get a
      ``*_norm`` companion via ``to_code`` and each part is inserted into
      its own table, linked through ``psc_serial_id``.
    * ``"exemptions"`` lines are expanded by ``unpack_exemptions_line`` and
      inserted record by record.
    * ``"summary_line"`` lines have their ``"data"`` payload inserted into
      the summary table.

    Progress is printed for every line, blank ones included.
    """
    with open(filepath) as f:
        for ix, line in enumerate(f):

            print(f"Inserting line {ix} of file {filepath}")

            # Blank lines carry no record; move on to the next one.
            if not line.strip():
                continue

            jsonline = json.loads(line)
            line_type = determine_line(jsonline)

            if line_type == "psc":

                psc, address, identification, control = unpack_psc_line(
                    jsonline)

                # Normalise some fields, then insert the psc record.
                psc["name_fp"] = generate(psc["name"])
                psc["country_of_residence_norm"] = to_code(
                    psc.get("country_of_residence", None), fuzzy=True)
                psc["nationality_norm"] = to_code(
                    psc.get("nationality", None), fuzzy=True)
                psc_id = psc_table.insert(psc)

                if address:
                    # Normalise the country field, then insert.
                    address["country_norm"] = to_code(
                        address.get("country", None), fuzzy=True)
                    address_table.insert(
                        {**address, "psc_serial_id": psc_id})

                if identification:
                    identification["country_registered_norm"] = to_code(
                        identification.get("country_registered", None),
                        fuzzy=True)
                    identification_table.insert(
                        {**identification, "psc_serial_id": psc_id})

                if control:
                    # Stack the array of control types, one row each:
                    #
                    # company_number | nature_of_control
                    # -----------------------------------
                    # OC123456       | sometypeofcontrol_1
                    # OC123456       | sometypeofcontrol_2
                    # OC123456       | sometypeofcontrol_3
                    company_number = control["company_number"]
                    for nature in control["natures_of_control"]:
                        control_table.insert({
                            "company_number": company_number,
                            "psc_serial_id": psc_id,
                            "nature_of_control": nature,
                        })

            elif line_type == "exemptions":
                # The exempted psc JSON has a different layout and needs
                # its own processing. The line is parsed afresh and all
                # records are materialised before the first insert.
                exemption_records = list(
                    unpack_exemptions_line(json.loads(line)))
                for exemption_dict in exemption_records:
                    exemptions_table.insert(exemption_dict)

            elif line_type == "summary_line":
                # Example of a summary line:
                #
                # {"data":
                #   {
                # "kind": "totals#persons-of-significant-control-snapshot",
                # "persons_of_significant_control_count": 7131880,
                # "statements_count": 564130,
                # "exemptions_count": 92,
                # "generated_at"    : "2020-03-25T03:39:38Z"}
                # }
                summary_table.insert(jsonline.pop("data"))
예제 #14
0
 def normalize_value(self, value, prop, record):
     """Return a one-element list with the country code for ``value``."""
     code = countrynames.to_code(value)
     return [code]
예제 #15
0
 def clean(self, value, prop, record):
     """Clean ``value`` via the parent class, then map it to a country code.

     Falls back to the parent-cleaned value when ``countrynames.to_code``
     returns nothing truthy.
     """
     cleaned = super(CountryProperty, self).clean(value, prop, record)
     code = countrynames.to_code(cleaned)
     return code if code else cleaned
예제 #16
0
def test_to_code():
    """Known names resolve to their codes; an unknown name yields None."""
    assert to_code("Germany") == "DE"
    assert to_code("UK") == "GB"
    # Identity check: None is a singleton and `==` can be overridden.
    assert to_code("Nothing") is None
예제 #17
0
def test_fuzzy_matching():
    """Fuzzy matching resolves near-miss names for 2- and 3-letter codes."""
    two_letter_cases = (
        ("Rossiyskaya Federacia", "RU"),
        ("Falklands Islands", "FK"),
        ("TGermany", "DE"),
    )
    for name, expected in two_letter_cases:
        assert to_code(name, fuzzy=True) == expected
    assert to_code_3("State of Palestine", fuzzy=True) == "PSE"
예제 #18
0
def test_to_code():
    """Known names resolve to their codes; an unknown name yields None."""
    known = (
        ("Germany", "DE"),
        ("UK", "GB"),
        ("North Macedonia", "MK"),
    )
    for name, expected in known:
        assert to_code(name) == expected
    assert to_code("Nothing") is None
예제 #19
0
# coding: utf-8
import countrynames

# Sample inputs: plain names, a code, an abbreviation, a Cyrillic name,
# a transliteration, a misspelling and None.
tests = [
    'Germany', 'DE', 'UK', u'Российская Федерация', 'Rossiyskaya Federatsiya',
    'Tgermany', None
]

for test in tests:
    # Parenthesised so the line works on Python 2 and 3 alike: the former
    # `print[...]` subscripts the print function on Python 3 and raises
    # TypeError. Both spellings print the same list on Python 2.
    print([test,
           countrynames.to_code(test, fuzzy=False),
           countrynames.to_code(test)])