Пример #1
0
  def testSNameStandardiser(self):
    """Exercise the name standardiser on surname-first input strings.

    A NameStandardiser is created with first_name_c set to 'sname' and is
    handed to a RecordStandardiser.  Every input string in
    self.names_snames is then passed through clean_component() and
    standardise(), and finally the standardiser's count_dict is printed.

    The comparison against the expected results stored in
    self.names_snames is commented out, so as written this test can only
    fail if one of the calls raises an exception.
    """

    # NOTE(review): several keyword names look truncated ('output_fiel',
    # 'female_t', 'hmm_train_fi', ...); presumably the constructors match
    # keywords by prefix -- confirm before "correcting" any of them.
    #
    ns = standardisation.NameStandardiser(
        descript='Test name standardiser',
        input_fields=['in_sname'],
        output_fiel=['title',
                     'gender_guess',
                     'given_name',
                     'alt_given_name',
                     'surname',
                     'alt_surname'],
        female_t=self.name_female_titles,
        male_t=self.name_male_titles,
        tag_t=self.name_tag_table,
        corr_l=self.name_corr_list,
        first_name_c='sname',
        hmm_train_fi='test-hmm-train.txt')

    # The record standardiser is not used any further in this test; it is
    # still constructed so that its constructor is run with 'ns'.
    #
    rs = standardisation.RecordStandardiser(
        descr='Test record standardiser',
        input_dataset=self.in_ds,
        output_dataset=self.out_ds,
        comp_stand_list=[ns])

    for (name_str, name_res) in self.names_snames:

      clean_name_str = ns.clean_component(name_str)
      test_name_res = ns.standardise(name_str, clean_name_str)

      # Result check disabled in the original test, kept for reference:
      #
      # assert name_res == test_name_res, \
      #        'Wrong surname first standardisation: %s, should be: %s' % \
      #        (str(test_name_res), str(name_res))

    # Formatted into a single string so that the same text is printed no
    # matter whether 'print' is the Python 2 statement or the Python 3
    # function (the former statement form is a syntax error on Python 3).
    #
    print('Count dict: %s' % (ns.count_dict,))
Пример #2
0
    def testDateStandardiser(
            self):  # - - - - - - - - - - - - - - - - - - - - -
        """Test date standardiser routines"""

        return

        ds = standardisation.DateStandardiser(
            descript="Test date standardiser",
            parse_form=self.date_parse_formats,
            input_fields=["in_date"],
            output_fiel=["day", "month", "year"],
        )

        rs = standardisation.RecordStandardiser(
            descr="Test record standardiser",
            input_dataset=self.in_ds,
            output_dataset=self.out_ds,
            comp_stand_list=[ds],
            pass_fiel=[("pass1", "out_pass1"), ("pass2", "out_pass2")],
        )

        for (date_str, date_res) in self.dates:

            clean_date_str = ds.clean_component(date_str)
            test_date_res = ds.standardise(date_str, clean_date_str)

            assert date_res == test_date_res, (
                "Wrong date standardisation: %s, should be: %s" %
                (str(test_date_res), str(date_res)))

        rs.standardise()  # Use record standardiser and write output file

        # Test the content of the output data set
        #
        test_ds = dataset.DataSetCSV(
            description="Test standardised data set",
            access_mode="read",
            rec_ident="rec_id",
            field_list=[],
            header_line=True,
            write_header=True,
            file_name="test-standardised-dataset.csv",
        )

        i = 0
        for (rec_id, rec_list) in test_ds.readall():
            test_day = rec_list[0]
            test_month = rec_list[1]
            test_year = rec_list[2]

            true_day = self.dates[i][1][0]
            true_month = self.dates[i][1][1]
            true_year = self.dates[i][1][2]

            assert test_day == true_day, (i, rec_list[0:3], self.dates[i][1])
            assert test_month == true_month, (i, rec_list[0:3],
                                              self.dates[i][1])
            assert test_year == true_year, (i, rec_list[0:3], self.dates[i][1])

            i += 1
Пример #3
0
    def testDateStandardiser(
            self):  # - - - - - - - - - - - - - - - - - - - - -
        """Test date standardiser routines"""

        return

        ds = standardisation.DateStandardiser(
            descript='Test date standardiser',
            parse_form=self.date_parse_formats,
            input_fields=['in_date'],
            output_fiel=['day', 'month', 'year'])

        rs = standardisation.RecordStandardiser(
            descr='Test record standardiser',
            input_dataset=self.in_ds,
            output_dataset=self.out_ds,
            comp_stand_list=[ds],
            pass_fiel=[('pass1', 'out_pass1'), ('pass2', 'out_pass2')])

        for (date_str, date_res) in self.dates:

            clean_date_str = ds.clean_component(date_str)
            test_date_res = ds.standardise(date_str, clean_date_str)

            assert date_res == test_date_res, \
                   'Wrong date standardisation: %s, should be: %s' % \
                   (str(test_date_res), str(date_res))

        rs.standardise()  # Use record standardiser and write output file

        # Test the content of the output data set
        #
        test_ds = dataset.DataSetCSV(description='Test standardised data set',
                                     access_mode='read',
                                     rec_ident='rec_id',
                                     field_list=[],
                                     header_line=True,
                                     write_header=True,
                                     file_name='test-standardised-dataset.csv')

        i = 0
        for (rec_id, rec_list) in test_ds.readall():
            test_day = rec_list[0]
            test_month = rec_list[1]
            test_year = rec_list[2]

            true_day = self.dates[i][1][0]
            true_month = self.dates[i][1][1]
            true_year = self.dates[i][1][2]

            assert test_day == true_day, (i, rec_list[0:3], self.dates[i][1])
            assert test_month == true_month, (i, rec_list[0:3],
                                              self.dates[i][1])
            assert test_year == true_year, (i, rec_list[0:3], self.dates[i][1])

            i += 1
Пример #4
0
    def testGNameStandardiser(self):
        """Exercise the name standardiser on given-name-first input strings.

        A NameStandardiser is created and handed to a RecordStandardiser.
        Every input string in self.names_gnames is then passed through
        clean_component() and standardise(), and finally the standardiser's
        count_dict is printed.

        Both the comparison against the expected results and the call of
        the record standardiser are commented out, so as written this test
        can only fail if one of the remaining calls raises an exception.
        """

        output_field_names = [
            "title",
            "gender_guess",
            "given_name",
            "alt_given_name",
            "surname",
            "alt_surname",
        ]

        name_stand = standardisation.NameStandardiser(
            descript="Test name standardiser",
            input_fields=["in_gname"],
            output_fiel=output_field_names,
            female_t=self.name_female_titles,
            male_t=self.name_male_titles,
            tag_t=self.name_tag_table,
            corr_l=self.name_corr_list,
            hmm_train_fil="test-hmm-train.txt",
        )

        # Not used any further below (its standardise() call is disabled),
        # but still constructed so that its constructor is run.
        #
        rec_stand = standardisation.RecordStandardiser(
            descr="Test record standardiser",
            input_dataset=self.in_ds,
            output_dataset=self.out_ds,
            comp_stand_list=[name_stand],
        )

        for raw_name, expected_res in self.names_gnames:
            cleaned_name = name_stand.clean_component(raw_name)
            actual_res = name_stand.standardise(raw_name, cleaned_name)

            # Result check disabled in the original test:
            #
            # assert expected_res == actual_res, \
            #     'Wrong given name first standardisation: %s, should be: %s' % \
            #     (str(actual_res), str(expected_res))

        # Disabled in the original test:
        #
        # rec_stand.standardise()  # Use record standardiser, write output

        print("Count dict:", name_stand.count_dict)
Пример #5
0
  def testPhoneNumStandardiserNone(self):
    """Test phone number standardiser routines"""

    return

    ps = standardisation.PhoneNumStandardiser(descript = \
                                              'Test phone number standardiser',
                                          input_fields = ['in_phonenum'],
                                          output_fiel = ['country_code',
                                                         None,
                                                         'area_code', 'number',
                                                         None])

    rs = standardisation.RecordStandardiser(descr = 'Test record standardiser',
                                            input_dataset = self.in_ds,
                                            output_dataset = self.out_ds,
                                            comp_stand_list = [ps])

    for (phonenum_str, phonenum_res) in self.phonenums:

      clean_phonenum_str = ps.clean_component(phonenum_str)
      test_phonenum_res =  ps.standardise(phonenum_str, clean_phonenum_str)

      assert phonenum_res == test_phonenum_res, \
             'Wrong phone number standardisation: %s, should be: %s' % \
             (str(test_phonenum_res), str(phonenum_res))

    rs.standardise()  # Use record standardiser and write output file

    # Test the content of the output data set
    #
    test_ds = dataset.DataSetCSV(description='Test standardised data set',
                                 access_mode='read',
                                 rec_ident = 'rec_id',
                                 field_list = [],
                                 header_line=True,
                                 write_header=True,
                                 file_name='test-standardised-dataset.csv')

    i = 0
    for (rec_id, rec_list) in test_ds.readall():
      test_country_code = rec_list[3]
      test_country_name = rec_list[4]
      test_area_code =    rec_list[5]
      test_number =       rec_list[6]
      test_extension =    rec_list[7]

      true_country_code = self.phonenums[i][1][0]
      true_area_code =    self.phonenums[i][1][2]
      true_number =       self.phonenums[i][1][3]

      assert test_country_code == true_country_code, \
             (i, rec_list[3:8], self.phonenums[i][1])
      assert test_country_name == '', \
             (i, rec_list[3:8], self.phonenums[i][1])
      assert test_area_code == true_area_code, \
             (i, rec_list[3:8], self.phonenums[i][1])
      assert test_number == true_number, \
             (i, rec_list[3:8], self.phonenums[i][1])
      assert test_extension == '', \
             (i, rec_list[3:8], self.phonenums[i][1])

      i += 1
Пример #6
0
    def testPhoneNumStandardiserNone(self):
        """Test phone number standardiser routines"""

        return

        ps = standardisation.PhoneNumStandardiser(
            descript="Test phone number standardiser",
            input_fields=["in_phonenum"],
            output_fiel=["country_code", None, "area_code", "number", None],
        )

        rs = standardisation.RecordStandardiser(
            descr="Test record standardiser",
            input_dataset=self.in_ds,
            output_dataset=self.out_ds,
            comp_stand_list=[ps],
        )

        for (phonenum_str, phonenum_res) in self.phonenums:

            clean_phonenum_str = ps.clean_component(phonenum_str)
            test_phonenum_res = ps.standardise(phonenum_str,
                                               clean_phonenum_str)

            assert phonenum_res == test_phonenum_res, (
                "Wrong phone number standardisation: %s, should be: %s" %
                (str(test_phonenum_res), str(phonenum_res)))

        rs.standardise()  # Use record standardiser and write output file

        # Test the content of the output data set
        #
        test_ds = dataset.DataSetCSV(
            description="Test standardised data set",
            access_mode="read",
            rec_ident="rec_id",
            field_list=[],
            header_line=True,
            write_header=True,
            file_name="test-standardised-dataset.csv",
        )

        i = 0
        for (rec_id, rec_list) in test_ds.readall():
            test_country_code = rec_list[3]
            test_country_name = rec_list[4]
            test_area_code = rec_list[5]
            test_number = rec_list[6]
            test_extension = rec_list[7]

            true_country_code = self.phonenums[i][1][0]
            true_area_code = self.phonenums[i][1][2]
            true_number = self.phonenums[i][1][3]

            assert test_country_code == true_country_code, (
                i,
                rec_list[3:8],
                self.phonenums[i][1],
            )
            assert test_country_name == "", (i, rec_list[3:8],
                                             self.phonenums[i][1])
            assert test_area_code == true_area_code, (
                i,
                rec_list[3:8],
                self.phonenums[i][1],
            )
            assert test_number == true_number, (i, rec_list[3:8],
                                                self.phonenums[i][1])
            assert test_extension == "", (i, rec_list[3:8],
                                          self.phonenums[i][1])

            i += 1