class ReadInputData:
    """
    Reads the persons raw data from a csv file and returns it as a list of rows.
    """

    log = cl(log_level=logging.INFO)

    def read_data(self) -> list:
        """
        Reads every row of './persons_raw_data.csv' using the 'csv' library.

        No row is filtered here: the first row of the file is returned like
        any other row, and a blank line in the file comes back as an empty
        list (that is how csv.reader reports it) rather than raising.

        :return: list of rows, each row being a list of strings.
        An empty list is returned (and an error is logged) when the file does not exist
        """
        try:
            # newline='' is what the csv module asks for, so that newlines
            # embedded inside quoted fields are parsed correctly.
            with open("./persons_raw_data.csv", newline='') as test_data:
                person_details = list(csv.reader(test_data))
        except FileNotFoundError:
            self.log.error(msg="The input csv file does not exist")
            # Honour the declared return type instead of implicitly
            # returning None, which callers would trip over with len().
            return []
        self.log.info(
            msg="Raw data has been successfully read from csv file")
        return person_details
# Example #2
# 0
class GetFirstNRecordsFromCSVFile:
    """
    Reads persons raw data from a csv file and returns the leading records as a list.
    """

    log = cl(log_level=logging.INFO)

    # Generator
    def read_data_from_csv(self) -> list:
        """
        Generator which yields the data rows of './persons_raw_data.csv' one at a time.

        The first row is treated as the header and is skipped.
        Stops at the first blank line or at EOF.
        An error is logged (and iteration simply ends) if the file cannot be accessed.

        NOTE: the signature is annotated with 'list', but because the body uses
        'yield' a call returns a generator; iterate over it or wrap it in list().

        :return: generator of rows, each row being a list of strings
        """
        try:
            # newline='' is what the csv module asks for, so that newlines
            # embedded inside quoted fields are parsed correctly.
            with open("./persons_raw_data.csv", newline='') as test_data:
                csv_reader = csv.reader(test_data)
                # The None default matters: on an empty file a bare next()
                # raises StopIteration, which inside a generator is turned
                # into RuntimeError.
                next(csv_reader, None)  # Filters header line
                for line in csv_reader:
                    if not line:
                        break
                    yield line
        except OSError:
            self.log.error(msg='Unable to access input data file')

    def get_first_n_records(self, count: int) -> list:
        """
        Gets the first 'count' records or all records if less than 'count' from input file

        Reading stops as soon as 'count' records have been collected, so the
        rest of the file is never parsed.

        :param count: int  number of person records; zero or a negative value gives an empty list
        :return: list of maximum 'count' items. Each item is a list i.e. each person details
        """
        person_details = []
        records = self.read_data_from_csv()
        try:
            if count > 0:
                for row in records:
                    person_details.append(row)
                    if len(person_details) >= count:
                        break
        finally:
            # Closing the generator leaves its 'with' block, which releases
            # the file handle even though the file was not read to the end.
            records.close()
        self.log.info(msg=f'Collected {len(person_details)} records')
        return person_details
# Example #3
# 0
class FormatAndWriteRelatedNamesToAFile:
    """
    FormatAndWriteRelatedNamesToAFile class gets the matched names data,
    formats it one person per line and writes the lines into a txt file
    """

    log = cl(log_level=logging.INFO)

    # Generator
    @staticmethod
    def build_format_for_related_names(related_names_data: dict) -> str:
        """
        Generator function which yields one formatted line per entry of the dictionary

        Each line looks like 'key: value1, value2 ' followed by a newline.
        NOTE: the signature is annotated with 'str', but because the body uses
        'yield' a call returns a generator of such strings.

        :param related_names_data: dict (received from get_related_names_data)
        whose values are iterables of matched names
        :return: generator of formatted strings
        """
        for key, value in related_names_data.items():
            matched_names = ', '.join(str(x) for x in value)
            yield f'{key}: {matched_names} \n'

    def write_related_names_data_to_text_file(self):
        """
        Writes the lines received from the generator to 'related_persons_info_solution2.txt'

        The file is opened in 'w' mode, so earlier content is replaced.
        Failures to open or write the file are logged, not raised, and the
        closing "check out the output" message is logged only when every
        line was written.

        :return: None
        """
        related_names_data = GetRelatedPersons().get_related_names_data()
        matches = FormatAndWriteRelatedNamesToAFile.build_format_for_related_names(
            related_names_data)
        try:
            with open('related_persons_info_solution2.txt',
                      'w') as output_file:
                try:
                    output_file.writelines(matches)
                except OSError:
                    self.log.error(
                        msg="Error while writing to the Output text file")
                    return
        except OSError:
            self.log.error(msg='Unable to access output txt file')
            return
        self.log.info(
            msg="Check out the output file for Related Persons details")
# Example #4
# 0
class FilterFields:
    """
    FilterFields class performs the user validations on the persons records
    and returns list items where each item is a list consisting of a person's first_name and last_name
    """

    log = cl(log_level=logging.INFO)

    def get_data_with_fields_length_less_than_257(self, data: list) -> list:
        """
        Filters the person record if any of the person field length is more than 256

        :param data: list of records, each record being a list of strings
        :return: list of items. Each item is a person details with every field at most 256 chars long
        """
        # The limit is 256 inclusive; comparing against 257 would let
        # 257-character fields slip through.
        rows_with_fields_len_257_max = [
            row for row in data
            if all(len(field) <= 256 for field in row)
        ]
        self.log.info(
            msg=
            f'{len(rows_with_fields_len_257_max)} out of {len(data)} records have fields less than the length of 257'
        )
        return rows_with_fields_len_257_max

    # Helper
    def get_data_with_only_first_lastname_email(self, data: list) -> list:
        """
        Drops every field except first_name, last_name and email of each person

        The three values are taken from columns 0, 1 and 9 of each record, so
        every record must have at least 10 columns (IndexError otherwise).

        :param data: list (received from function 'get_data_with_fields_length_less_than_257')
        :return: list of items. Each item is a person details with only first_name, last_name and email
        """
        data_with_required_fields_only = [[row[0], row[1], row[9]]
                                          for row in data]
        self.log.info(
            msg=
            'Filtered all fields keeping first_name, last_name and email only')
        return data_with_required_fields_only

    def get_fields_with_valid_email_format(self, data: list) -> list:
        """
        Filters the person records consisting of invalid formatted emails
        Uses 'validate_email' library
        The email is read from the last field of each record. A record is kept when
        'validate_email' accepts the whole address and the local part
        (the text before the first '@') is shorter than 65 chars

        :param data: list (received from function 'get_first_last_name_email_notblank_combination')
        :return: list of items
        Each item is a person details with first_name, last_name, email where emails are in valid format only

        ..note:: filters following email which are valid as per https://en.wikipedia.org/wiki/Email_address
        " "@example.org,  "john..doe"@example.org
        """
        data_with_valid_email_format = []
        for row in data:
            email = row[-1]
            local_part = email.split('@')[0]
            if validate_email(email) and len(local_part) < 65:
                data_with_valid_email_format.append(row)
        self.log.info(
            f'{len(data_with_valid_email_format)} out of {len(data)} records have valid emails'
        )
        return data_with_valid_email_format

    def get_first_last_name_email_notblank_combination(self,
                                                       data: list) -> list:
        """
        Filters the person record if it contain blank first_name or last_name or email

        :param data: list (received from function 'get_data_with_only_first_lastname_email')
        :return: list of items.
        Each item is a person details with first_name, last_name and email which are not blank
        """
        # all(row) is true only when no field of the record is an empty string.
        data_with_nonblank_fields = [row for row in data if all(row)]
        self.log.info(
            msg=
            f'{len(data_with_nonblank_fields)} out of {len(data)} records do not have blank emails'
        )
        return data_with_nonblank_fields

    # Helper
    def get_first_and_lastname_details_and_remove_email(self,
                                                        data: list) -> list:
        """
        Keeps only the first two fields (first_name and last_name) of each person record
        :param data: list (received from function 'get_fields_with_valid_email_format')
        :return: list of items
        Each item is a person details with first_name, last_name only
        """
        data_first_lastname_filtered = [row[0:2] for row in data]
        self.log.info(
            msg='Filtered email keeping the first_name and last_name only')
        return data_first_lastname_filtered

    def get_names_containing_atleast_one_alpha(self, data: list) -> list:
        """
        Filters the person record if any of its fields does not contain atleast one alpha character

        Every field of the record is checked, so this is meant to run once the
        email has been removed and only first_name and last_name are left.

        :param data: list (received from function 'get_first_and_lastname_details_and_remove_email')
        :return: list of items
        Each item is a person details where first_name and last_name
        contain atleast one alpha character
        Example: it filters any first_name or last_name with only '--' or '  '
        """
        data_with_atleast_one_alpha = [
            row for row in data
            if all(re.search('[a-zA-Z]', field) for field in row)
        ]
        self.log.info(
            msg=
            f'{len(data_with_atleast_one_alpha)} out of {len(data)} records have atleast one alpha in their first_name and last_name'
        )
        return data_with_atleast_one_alpha

    def get_names_containing_alpha_or_space_hyphen_only(self,
                                                        data: list) -> list:
        """
        Filters the person record if any of its fields contains a character other than
        the allowed alpha, space, hyphen

        Every field of the record is checked, so this is meant to run once the
        email has been removed and only first_name and last_name are left.

        :param data: list (received from function 'get_names_containing_atleast_one_alpha')
        :return: list of items
        Each item is a person details where first_name and last_name
        does not contain any character other than allowed
        """
        data_with_alpha_or_space_hypen = [
            row for row in data
            if all(re.fullmatch('^[a-zA-Z- ]*$', field) for field in row)
        ]
        self.log.info(
            f'{len(data_with_alpha_or_space_hypen)} out of {len(data)} records do not chars other than alpha or space or hyphen'
        )
        return data_with_alpha_or_space_hypen

    # All filter actions methods calling
    def get_filtered_first_lastname_details(self) -> list:
        """
        This function reads at most 1000 records from the input file, runs all the
        user validation functions above on them and returns persons details with
        first_name and last_name
        Uses functoolz.compose from toolz library

        :return:list of items
        Each item is a person details which have gone through all user validations
        Each item is a list consisting of first_name, last_name only
        """
        data = GetFirstNRecordsFromCSVFile().get_first_n_records(count=1000)
        # compose applies its functions right to left, so the steps below run
        # from the bottom of the list (length check) to the top.
        name_details_after_fields_filtering = functoolz.compose(
            self.get_names_containing_alpha_or_space_hyphen_only,
            self.get_names_containing_atleast_one_alpha,
            self.get_first_and_lastname_details_and_remove_email,
            self.get_fields_with_valid_email_format,
            self.get_first_last_name_email_notblank_combination,
            self.get_data_with_only_first_lastname_email,
            self.get_data_with_fields_length_less_than_257)(data)
        self.log.info(
            msg=
            f'{len(name_details_after_fields_filtering)} records passed filtering'
        )
        return name_details_after_fields_filtering
# Example #5
# 0
class GetRelatedPersons:
    """
    GetRelatedPersons class finds whether a person is related to any another person by
    checking whether their last names (or the hyphen separated parts of them) match
    """

    log = cl(log_level=logging.INFO)

    def split_last_name(self, last_name: str, split_char: str) -> list:
        """
        Splits a last name on the split char and returns the parts as a list.
        A name which does not contain the split char comes back as a one item list

        :param last_name: a string normally, ex: "William-Scott" or "William"
        :param split_char: separator to split on, in our project it is hyphen
        :return: list of strings
        :exception ValueError if split_char is an empty string
        """
        # str.split already returns [last_name] when the separator is absent,
        # and unlike a hard-coded '-' test it honours whatever split_char is.
        return last_name.split(split_char)

    def filter_keys_with_empty_values(self, names: dict) -> dict:
        """
        It receives all the persons as keys and value as list of matching persons.
        When a person does not have a matching then that value will be empty list.
        This function drops those dictionary items having empty lists as the values

        :param names: dictionary with a key as a person first_name, last_name and value as list of matching persons.
        :return: new dictionary holding only the items whose value is not empty
        """
        filtered_names = {k: v for k, v in names.items() if v}
        self.log.info(
            msg=
            f'{len(filtered_names)} keys filtered out of {len(names)} keys have values'
        )
        return filtered_names

    def get_related_names_data(self) -> dict:
        """
        Takes one name at a time and compares its last name with the last names next in the order in the list
        Two persons are related when their last names, split on hyphen, share at least one part.
        Each pair is compared only once and the match is recorded for both persons

        :return: dict with key as 'first_name last_name' and value as the list of
        related persons; persons without any related person are left out
        """
        items = FilterFields().get_filtered_first_lastname_details()
        # Build the display name and the last name parts once per person
        # instead of re-splitting inside the pairwise loop.
        full_names = [' '.join([item[0], item[1]]) for item in items]
        last_name_parts = [
            set(self.split_last_name(last_name=item[1], split_char='-'))
            for item in items
        ]

        related_names_dict = {}
        for i, name in enumerate(full_names):
            related_names_dict.setdefault(name, [])
            for j in range(i + 1, len(full_names)):
                if last_name_parts[i].isdisjoint(last_name_parts[j]):
                    continue
                another_name = full_names[j]
                related_names_dict[name].append(another_name)
                related_names_dict.setdefault(another_name, []).append(name)
        self.log.info(
            "Completed comparing last names but result dictionary may contain keys with empty values"
        )
        related_names_final_dict = self.filter_keys_with_empty_values(
            names=related_names_dict)
        self.log.info(
            f'Found {len(related_names_final_dict)} names in total having last name similar to others'
        )
        return related_names_final_dict
class FilterFields:
    """
    FilterFields class performs the user validations on the persons records
    and returns list items where each item is a list consisting of a person's first_name and last_name
    """

    log = cl(log_level=logging.INFO)

    def get_first_1000_records_max(self, data) -> list:
        """
        Skips the first row (the header) and gets at most the first 1000 records after it

        :param data: list of rows, the first one being the header
        :return: list of maximum 1000 items. Each item is a list i.e. each person details
        """
        # Row 0 is the header, so records 1..1000 live at indexes 1..1000 and
        # the slice has to end at 1001 to include the 1000th record.
        records = data[1:1001]
        if len(data) - 1 <= 1000:
            self.log.info(msg=f'Collected all {len(records)} records')
        else:
            self.log.info(msg="Collected first 1000 records")
        return records

    def get_data_with_fields_length_less_than_257(self, data: list) -> list:
        """
        Filters the person record if any of the person field length is more than 256

        :param data: list (received from function 'get_first_1000_records_max')
        :return: list of items. Each item is a person details with every field at most 256 chars long
        """
        # The limit is 256 inclusive; comparing against 257 would let
        # 257-character fields slip through.
        rows_with_fields_len_257_max = [
            row for row in data
            if all(len(field) <= 256 for field in row)
        ]
        self.log.info(
            msg=
            f'{len(rows_with_fields_len_257_max)} out of {len(data)} records have fields less than the length of 257'
        )
        return rows_with_fields_len_257_max

    # Helper
    def get_data_with_only_first_lastname_email(self, data: list) -> list:
        """
        Drops every field except first_name, last_name and email of each person

        The three values are taken from columns 0, 1 and 9 of each record, so
        every record must have at least 10 columns (IndexError otherwise).

        :param data: list (received from function 'get_data_with_fields_length_less_than_257')
        :return: list of items. Each item is a person details with only first_name, last_name and email
        """
        data_with_required_fields_only = [[row[0], row[1], row[9]]
                                          for row in data]
        self.log.info(
            msg=
            'Filtered all fields keeping first_name, last_name and email only')
        return data_with_required_fields_only

    def get_first_last_name_email_notblank_combination(self,
                                                       data: list) -> list:
        """
        Filters the person record if it contain blank first_name or last_name or email

        :param data: list (received from function 'get_data_with_only_first_lastname_email')
        :return: list of items.
        Each item is a person details with first_name, last_name and email which are not blank
        """
        # all(row) is true only when no field of the record is an empty string.
        data_with_nonblank_fields = [row for row in data if all(row)]
        self.log.info(
            msg=
            f'{len(data_with_nonblank_fields)} out of {len(data)} records do not have blank emails'
        )
        return data_with_nonblank_fields

    def get_names_containing_atleast_one_alpha(self, data: list) -> list:
        """
        Filters the person record if it does not contain atleast one alpha character in first_name or last_name
        Only the first two fields of the record are checked, the email is left alone

        :param data: list (received from function 'get_first_last_name_email_notblank_combination')
        :return: list of items
        Each item is a person details with first_name, last_name, email where first_name and last_name
        contain atleast one alpha character
        """
        data_with_atleast_one_alpha = [
            row for row in data
            if all(re.search('[a-zA-Z]', field) for field in row[0:2])
        ]
        self.log.info(
            msg=
            f'{len(data_with_atleast_one_alpha)} out of {len(data)} records have atleast one alpha in their first_name and last_name'
        )
        return data_with_atleast_one_alpha

    def get_names_containing_alpha_or_space_hyphen_only(self,
                                                        data: list) -> list:
        """
        Filters the person record if it contains any character other than allowed alpha, space, hyphen
        in first_name or last_name
        Only the first two fields of the record are checked, the email is left alone

        :param data: list (received from function 'get_names_containing_atleast_one_alpha')
        :return: list of items
        Each item is a person details with first_name, last_name, email where first_name and last_name
        does not contain any character other than allowed
        """
        data_with_alpha_or_space_hypen = [
            row for row in data
            if all(
                re.fullmatch('^[a-zA-Z- ]*$', field) for field in row[0:2])
        ]
        self.log.info(
            f'{len(data_with_alpha_or_space_hypen)} out of {len(data)} records do not chars other than alpha or space or hyphen'
        )
        return data_with_alpha_or_space_hypen

    def get_fields_with_valid_email_format(self, data: list) -> list:
        """
        Filters the person records consisting of invalid formatted emails
        Uses 'validate_email' library
        The email is read from the last field of each record. A record is kept when
        'validate_email' accepts the whole address and the local part
        (the text before the first '@') is shorter than 65 chars

        :param data: list (received from function 'get_names_containing_alpha_or_space_hyphen_only')
        :return: list of items
        Each item is a person details with first_name, last_name, email where emails are in valid format only

        ..note:: filters following email which are valid as per https://en.wikipedia.org/wiki/Email_address
        " "@example.org,  "john..doe"@example.org
        """
        data_with_valid_email_format = []
        for row in data:
            email = row[-1]
            local_part = email.split('@')[0]
            if validate_email(email) and len(local_part) < 65:
                data_with_valid_email_format.append(row)
        self.log.info(
            f'{len(data_with_valid_email_format)} out of {len(data)} records have valid emails'
        )
        return data_with_valid_email_format

    # Helper
    def get_first_and_lastname_details_and_remove_email(self,
                                                        data: list) -> list:
        """
        Keeps only the first two fields (first_name and last_name) of each person record
        :param data: list (received from function 'get_fields_with_valid_email_format')
        :return: list of items
        Each item is a person details with first_name, last_name only
        """
        data_first_lastname_filtered = [row[0:2] for row in data]
        self.log.info(
            msg='Filtered email keeping the first_name and last_name only')
        return data_first_lastname_filtered

    # All filter actions methods calling
    def get_filtered_first_lastname_details(self) -> list:
        """
        This function reads the raw input data and calls all the user validation functions above
        in an order and return persons details with first_name and last_name
        :return:list of items
        Each item is a person details which have gone through all user validations
        Each item is a list consisting of first_name, last_name only
        """
        raw_input_data = ReadInputData().read_data()
        # Each step consumes the output of the previous one; the order below
        # is the order in which the validations are applied.
        pipeline = (
            self.get_first_1000_records_max,
            self.get_data_with_fields_length_less_than_257,
            self.get_data_with_only_first_lastname_email,
            self.get_first_last_name_email_notblank_combination,
            self.get_names_containing_atleast_one_alpha,
            self.get_names_containing_alpha_or_space_hyphen_only,
            self.get_fields_with_valid_email_format,
            self.get_first_and_lastname_details_and_remove_email,
        )
        filtered_first_lastname_details = raw_input_data
        for step in pipeline:
            filtered_first_lastname_details = step(
                data=filtered_first_lastname_details)
        self.log.info(
            msg=
            f'{len(filtered_first_lastname_details)} records passed filtering')
        return filtered_first_lastname_details
class RelatedPersons:
    """
    RelatedPersons class finds whether a person is related to any another person by
    matching their last names, either as a whole or by their hyphen separated parts
    """

    log = cl(log_level=logging.INFO)

    def get_lastname_that_is_same_as_lastname_of_another(
            self, lastname: str, another_lastname: str) -> bool:
        """
        Compares two strings whether they are exactly same and logs the outcome
        :param lastname: string
        :param another_lastname: string
        :return: boolean
        True if strings are exactly same
        """
        if lastname == another_lastname:
            self.log.info(
                msg=f'{lastname} and {another_lastname} are exactly similar')
            return True
        self.log.info(
            msg=f'{lastname} and {another_lastname} are NOT exactly similar')
        return False

    def get_a_part_of_hyphenated_name_that_appears_as_hyphenated_part_of_another(
            self, lastname: str, another_lastname: str) -> bool:
        """
        Finds whether a part of hyphenated string matches with any part of another hyphenated string
        Both strings must contain a hyphen, otherwise the result is False
        :param lastname: string
        :param another_lastname: string
        :return: boolean
        True if matches else False
        """
        if ("-" in lastname) and ("-" in another_lastname):
            hyphenated_other_lastname = another_lastname.split("-")
            if any(part in hyphenated_other_lastname
                   for part in lastname.split("-")):
                self.log.info(
                    msg=
                    f'Part of the {lastname} matches with part of {another_lastname} '
                )
                return True
        # Reached when either name has no hyphen or when no part is shared,
        # so the "does not match" message is accurate in both cases.
        self.log.info(
            msg=
            f'Part of the {lastname} does not match with part of {another_lastname} '
        )
        return False

    def get_lastname_of_one_that_appears_as_part_of_hyphenated_lastname_of_another(
            self, lastname: str, another_lastname: str) -> bool:
        """
        Finds whether a string is exactly same as any part of hyphenated string
        When 'another_lastname' contains a hyphen, 'lastname' is looked up among its parts;
        otherwise, when 'lastname' contains a hyphen, 'another_lastname' is looked up among its parts
        :param lastname: string
        :param another_lastname: string
        :return: boolean
        True if same else False
        """
        if "-" in another_lastname:
            if lastname in another_lastname.split("-"):
                self.log.info(
                    msg=f'{lastname} matches with part of {another_lastname}')
                return True
        elif "-" in lastname:
            if another_lastname in lastname.split("-"):
                self.log.info(
                    msg=f'Part of {another_lastname} matches with {lastname}')
                return True
        # Single exit for every "no match" path so the method always
        # returns a bool and never falls through to None.
        return False

    def get_related_names_data(self) -> list:
        """
        Call the search pattern functions above and returns a list of related persons
        Every person is compared with every other person
        :return: list
        each list item is a dictionary with a single entry.
        In each dictionary:
            key = person first_name, last_name
            value: list of persons with first_name, last_name whose last_name matches with last_name in the key person
        Persons without any match are left out
        """
        data = FilterFields().get_filtered_first_lastname_details()
        related_names = []
        for i, record in enumerate(data):
            matching_lastnames = []
            for j, another_record in enumerate(data):
                if i == j:
                    continue
                # 'or' short-circuits, so the checks run in this order and
                # stop at the first one that reports a match.
                is_related = (
                    self.get_lastname_that_is_same_as_lastname_of_another(
                        lastname=record[1],
                        another_lastname=another_record[1])
                    or self.
                    get_a_part_of_hyphenated_name_that_appears_as_hyphenated_part_of_another(
                        lastname=record[1],
                        another_lastname=another_record[1])
                    or self.
                    get_lastname_of_one_that_appears_as_part_of_hyphenated_lastname_of_another(
                        lastname=record[1],
                        another_lastname=another_record[1]))
                if is_related:
                    matching_lastnames.append(another_record[0] + " " +
                                              another_record[1])
            if matching_lastnames:
                related_names.append(
                    {record[0] + " " + record[1]: matching_lastnames})
        self.log.info(f'{len(related_names)} are related')
        return related_names

    # Generator
    @staticmethod
    def build_format_for_related_names(related_names_data: list) -> str:
        """
        Generator function which yields one formatted line per dictionary entry

        Each line looks like 'key: value1, value2 ' followed by a newline.
        NOTE: the signature is annotated with 'str', but because the body uses
        'yield' a call returns a generator of such strings.

        :param related_names_data: list of dictionaries (received from get_related_names_data)
        :return: generator of formatted strings
        """
        for record in related_names_data:
            for key, value in record.items():
                matched_names = ', '.join(str(x) for x in value)
                yield f'{key}: {matched_names} \n'

    def write_related_names_data_to_text_file(self):
        """
        Writes the lines received from the generator to 'related_persons_info_solution1.txt'

        The file is opened in 'w' mode, so earlier content is replaced.
        A FileNotFoundError on opening and an OSError while writing are logged,
        not raised, and the closing "check out the output" message is logged
        only when every line was written.

        :return: None
        """
        related_names_data = self.get_related_names_data()
        matches = RelatedPersons.build_format_for_related_names(
            related_names_data)
        try:
            with open('related_persons_info_solution1.txt',
                      'w') as output_file:
                try:
                    output_file.writelines(matches)
                except OSError:
                    self.log.error(
                        msg="Error while writing to the Output text file")
                    return
        except FileNotFoundError:
            self.log.error(msg="The output txt file does not exist")
            return
        self.log.info(msg="Check out the output for Related Persons")