Example #1
    def createSourceDataSQLQuery(self):
        """
        Create query for pulling source data from db
        """

        logging.info("Obtaining source data...")
        query = \
            """
            SELECT DISTINCT
              t.aw_supplier_name as src_name,
              '' as src_tag,
              '' as src_address_locality,
              '' as src_address_postalcode,
              '' as src_address_countryname,
              '' as src_address_streetaddress,
              t.source as source
            FROM {0} as t
            WHERE TRUE
              AND t.countryname IN ('United Kingdom','England','Scotland','Northern Ireland','Wales')
              AND t.aw_supplier_name NOTNULL
              AND t.releasedate >= '{1}'
              AND t.releasedate <= '{2}'
            ;
            """.format(self.src_data_source, self.in_args.data_from_date,
                       self.in_args.data_to_date)

        return query
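For comparison, a minimal sketch of the same query built with psycopg2's sql module, so the date bounds are bound as parameters rather than concatenated into the string. The table and column names are taken from the example above; the table name and connection string in the trailing comment are hypothetical.

import psycopg2
from psycopg2 import sql

def build_source_query(src_data_source):
    # src_data_source may be schema-qualified, e.g. 'public.tenders' (hypothetical name)
    return sql.SQL("""
        SELECT DISTINCT t.aw_supplier_name AS src_name, t.source AS source
        FROM {table} AS t
        WHERE t.aw_supplier_name IS NOT NULL
          AND t.releasedate BETWEEN %(from_date)s AND %(to_date)s
        """).format(table=sql.Identifier(*src_data_source.split('.')))

# with psycopg2.connect("dbname=... user=...") as conn, conn.cursor() as cur:
#     cur.execute(build_source_query('public.tenders'),
#                 {'from_date': '2019-01-01', 'to_date': '2019-12-31'})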
Example #2
    def upload_file(self, file_name, bucket, object_name=None):
        """Upload a file to an S3 bucket

        :param file_name: File to upload
        :param bucket: Bucket to upload to
        :param object_name: S3 object name. If not specified then file_name is used
        :return: True if file was uploaded, else False
        """

        # If S3 object_name was not specified, use file_name
        if object_name is None:
            object_name = file_name

        # Upload the file
        s3_client = boto3.client(
            's3',
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key)
        try:
            s3_client.upload_file(file_name, bucket, object_name)
        except ClientError as e:
            logging.exception(e)
            return False
        logging.info("Upload to S3 bucket complete!")

        return True
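A hypothetical call site for the method above; the wrapper object, bucket name and key are assumptions for illustration only.

import logging

def push_matched_output(uploader, matched_fp):
    # `uploader` is any object exposing the upload_file method shown above
    key = "matches/" + matched_fp.split("/")[-1]
    if not uploader.upload_file(matched_fp, "dedupe-outputs", object_name=key):
        logging.error("S3 upload failed for %s", matched_fp)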
Example #3
    def dedupeCluster(self):
        if not os.path.exists(self.clustered_fp):

            # Copy training file from first clustering session if recycle mode
            if self.in_args.recycle:
                copyfile(
                    self.directories['cluster_training_backup'].format(
                        self.region_dir), self.cluster_training_file)

            logging.info("Starting clustering...")

            sys.argv = [
                'csvdedupe', self.matched_fp, '--field_names',
                ' '.join(self.src_fields), '--training_file',
                self.cluster_training_file, '--settings_file',
                self.learned_settings_file, '--output_file', self.clustered_fp,
                str(self.ctrain[0])
            ]
            if self.ctrain[0] == '':
                sys.argv = sys.argv[:-1]
            launch_clustering()

            if not self.in_args.recycle:
                # Copy training file to backup, so it can be found and copied into recycle phase clustering
                copyfile(
                    self.cluster_training_file,
                    self.directories['cluster_training_backup'].format(
                        self.region_dir))
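The sys.argv list built above mimics the csvdedupe command line before launch_clustering() is called. A rough equivalent as a standalone subprocess is sketched below; the file and field names are placeholders, and the final positional argument would correspond to self.ctrain[0] when it is non-empty.

import subprocess

subprocess.run([
    'csvdedupe', 'matched.csv',
    '--field_names', 'src_name_adj src_address_adj',
    '--training_file', 'cluster_training.json',
    '--settings_file', 'learned_settings',
    '--output_file', 'clustered.csv',
], check=True)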
Example #4
    def fetchdata(self, query):
        """
        Retrieve data from the db using query
        """
        conn, _ = self.db_calls.DbCalls.createConnection(self)
        logging.info('Importing data...')
        df = pd.read_sql(query, con=conn)
        conn.close()
        return df
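# Hypothetical tie-in between the query builders above and fetchdata, assuming
# both methods live on the same pipeline object (not shown in these snippets):
def load_source_data(pipeline):
    query = pipeline.createSourceDataSQLQuery()
    return pipeline.fetchdata(query)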
    def assign(self, df, assigned_file=None):
        df.sort_values(by=['Cluster_ID'], inplace=True, axis=0, ascending=True)
        df.reset_index(drop=True, inplace=True)
        tqdm.pandas()
        logging.info("Assigning close matches within clusters...")

        df = df.groupby(['Cluster_ID'
                         ]).progress_apply(AssignRegDataToClusters.getMaxId)
        df.to_csv(assigned_file, index=False)
        return df
Example #6
    def dedupeSplitMatch(self):
        '''
        Takes all files in Adj_Data/Splits and iteratively runs them through the matching
        process before combining the results into one output file
        :return: None (the combined matches are written to self.matched_fp)
        '''

        # Define filepaths and vars to be passed to dedupe
        splits_input_dir = self.directories["splits_inputs_dir"].format(
            self.region_dir, self.proc_type)
        splits_output_dir = self.directories['splits_outputs_dir'].format(
            self.region_dir, self.proc_type)
        files = glob.glob(os.path.join(splits_input_dir, '*'))
        numfiles = len(files)
        fileno = 0

        for src_fp in files:
            fileno += 1

            if not os.path.exists(self.matched_fp):
                logging.info(
                    f"Starting matching of split file {str(fileno)} / {str(numfiles)}"
                )

                sys.argv = [
                    'csvlink',
                    str(src_fp),
                    str(self.reg_fp), '--field_names_1',
                    ' '.join(self.src_fields), '--field_names_2',
                    ' '.join(self.reg_fields), '--training_file',
                    self.matches_training_file, '--settings_file',
                    self.learned_settings_file, '--output_file',
                    os.path.join(splits_output_dir,
                                 str(fileno) + '.csv'),
                    str(self.mtrain[0])
                ]

                if self.mtrain[0] == '':
                    sys.argv = sys.argv[:-1]

                launch_matching()

        files = glob.glob(os.path.join(splits_output_dir, '*'))
        frames = []
        # Load in each file and add to frames list
        for file in files:
            df = pd.read_csv(file)
            frames.append(df)
        # Concatenate files in list into one df
        df = pd.concat(frames)
        df = df.drop_duplicates()

        # Save as normal to match outputs folder
        df.to_csv(self.matched_fp, index=False)
Example #7
    def createSourceDataSQLQuery(self):
        """
        Create query for pulling source data from db
        """

        logging.info("Obtaining source data...")
        if self.in_args.prodn:
            query = \
                """
                SELECT DISTINCT
                  t.buyer as src_name,
                  t.json -> 'releases' -> 0 -> 'tag' as src_tag,
                  t.json -> 'releases' -> 0 -> 'buyer' -> 'address' ->> 'locality' as src_address_locality,
                  t.json -> 'releases' -> 0 -> 'buyer' -> 'address' ->> 'postalCode' as src_address_postalcode,
                  t.json -> 'releases' -> 0 -> 'buyer' -> 'address' ->> 'countryName' as src_address_countryname,
                  t.json -> 'releases' -> 0 -> 'buyer' -> 'address' ->> 'streetAddress' as src_address_streetaddress,
                  t.source as source
                FROM {0} as t
                WHERE TRUE
                  AND t.countryname IN ('United Kingdom','England','Scotland','Northern Ireland','Wales')
                  AND t.buyer NOTNULL
                  AND t.releasedate >= '{1}'
                  AND t.releasedate <= '{2}'
                ;
                """.format(self.src_data_source, self.in_args.data_from_date,
                           self.in_args.data_to_date)
        else:
            query = \
                """
                SELECT DISTINCT
                  t.buyer as src_name,
                  t.json -> 'releases' -> 0 -> 'tag' as src_tag,
                  t.json -> 'releases' -> 0 -> 'buyer' -> 'address' ->> 'locality' as src_address_locality,
                  t.json -> 'releases' -> 0 -> 'buyer' -> 'address' ->> 'postalCode' as src_address_postalcode,
                  t.json -> 'releases' -> 0 -> 'buyer' -> 'address' ->> 'countryName' as src_address_countryname,
                  t.json -> 'releases' -> 0 -> 'buyer' -> 'address' ->> 'streetAddress' as src_address_streetaddress,
                  t.source as source
                FROM {0} as t
                WHERE TRUE
                  AND t.buyer NOTNULL
                  AND t.releasedate >= '2009-09-11 00:00:00.000000'
                  AND t.releasedate <= '2015-09-11 00:00:00.000000'
                  AND t.countryname IN ('United Kingdom','England','Scotland','Northern Ireland','Wales')
                LIMIT 2000
                ;
                """.format(self.src_data_source)

        return query
Example #8
    def createConnection(self):
        '''
        Creates a connection to the database specified in .env

        :return conn : the database connection object
        :return cur : the cursor (temporary storage for retrieved data)
        '''
        logging.info('Connecting to database...')
        conn = psy.connect(host=self.host_remote,
                           dbname=self.dbname_remote,
                           user=self.user_remote,
                           password=self.password_remote)
        cur = conn.cursor()
        return conn, cur
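A sketch of where the connection settings might come from, assuming the python-dotenv package; the environment variable names below are my own and the real attribute names are set elsewhere in the class.

import os
import psycopg2 as psy
from dotenv import load_dotenv

load_dotenv()  # read .env into the process environment

conn = psy.connect(host=os.environ['DB_HOST_REMOTE'],
                   dbname=os.environ['DB_NAME_REMOTE'],
                   user=os.environ['DB_USER_REMOTE'],
                   password=os.environ['DB_PASSWORD_REMOTE'])
cur = conn.cursor()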
Example #9
    def dedupeMatch(self):
        """
        Deduping - first the registry and source data are matched using dedupe's csvlink,
        then the matched file is put into clusters.

        Uses instance attributes for the file/folder locations, the main config files,
        the 'type' of the process (Name, Name & Address) and the individual process
        number within the config file.

        :return: None
        :output: matched output file
        :output: matched and clustered output file
        """

        src_fp = self.directories['adj_dir'].format(
            self.region_dir) + self.directories['adj_src_data'].format(
                self.in_args.src_adj)

        # Matching:
        if not os.path.exists(self.matched_fp):
            if self.in_args.recycle:
                # Copy manual matching file over to build on for clustering
                copyfile(
                    self.directories['manual_matching_train_backup'].format(
                        self.region_dir), self.matches_training_file)

            logging.info("Starting matching...")

            sys.argv = [
                'csvlink',
                str(src_fp),
                str(self.reg_fp), '--field_names_1', ' '.join(self.src_fields),
                '--field_names_2', ' '.join(self.reg_fields),
                '--training_file', self.matches_training_file,
                '--settings_file', self.learned_settings_file, '--output_file',
                self.matched_fp,
                str(self.mtrain[0])
            ]
            if self.mtrain[0] == '':
                sys.argv = sys.argv[:-1]

            launch_matching()

            df = pd.read_csv(self.matched_fp, dtype=self.df_dtypes)
            df = df[pd.notnull(df['src_name'])]

            df = df.drop_duplicates()
            df.to_csv(self.matched_fp, index=False)
Example #10
    def transferMatches(self):
        logging.info(
            f"Transferring new matches from {self.upload_table} to {self.transfer_table}."
        )
        query = \
            """
            INSERT INTO {}
                SELECT DISTINCT src_name, reg_scheme, reg_id, reg_name, match_source, match_date, match_by FROM {} m
                WHERE
                    NOT EXISTS (SELECT 1 FROM {} t WHERE m.src_name = t.org_string)
                    AND m.manual_match_n LIKE 'Y'
                ON CONFLICT DO NOTHING
            """.format(self.transfer_table, self.upload_table, self.transfer_table)
        return query
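A worked illustration of the statement above with concrete, hypothetical table names (uploads as the upload table, orgs_lookup as the transfer table, as referenced elsewhere in these snippets):

example_transfer_sql = """
INSERT INTO orgs_lookup
    SELECT DISTINCT src_name, reg_scheme, reg_id, reg_name, match_source, match_date, match_by
    FROM uploads m
    WHERE NOT EXISTS (SELECT 1 FROM orgs_lookup t WHERE m.src_name = t.org_string)
      AND m.manual_match_n LIKE 'Y'
    ON CONFLICT DO NOTHING
"""
# Rows whose src_name already appears as org_string in orgs_lookup are filtered
# out by the anti-join; ON CONFLICT DO NOTHING additionally ignores rows that
# would violate a unique constraint on insert.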
Example #11
    def createRegistryDataSQLQuery(self):
        """
        Create query for downloading registry data from db
        """
        logging.info("Obtaining registry data...")
        query = \
            """
            SELECT
              legalname as reg_name,
              id as reg_id,
              '' as reg_address,
              scheme as reg_scheme,
              'dedupe_script' as match_source
            FROM {}
            """.format(self.reg_data_source)
        return query
Example #12
    def addDataToTable(self):
        '''
        Adds the confirmed matches data to the upload table.

        Reads every file in the verified matches directory and copies its rows into
        self.upload_table, then removes exact duplicates and transfers new matches to
        the transfer table.

        :return: None
        '''

        conn, cur = self.createConnection()
        logging.info(f"Connected to {self.upload_table}")

        files = glob.glob(
            os.path.join(
                self.directories['verified_matches_dir'].format(
                    self.region_dir, self.proc_type), '*'))
        for upload_file in files:
            with open(upload_file, 'r', encoding='utf-8-sig') as f:
                # Get headers dynamically; reading them also advances the file
                # pointer past the header row, so the COPY below starts at the data
                reader = csv.reader(f)
                headers = next(reader, None)
                headers = ", ".join(headers)
                self.headers = headers
                # Input the data into the dedupe table
                # copy_expert allows access to csv methods (i.e. char escaping)
                cur.copy_expert(
                    """COPY {}({}) from stdin (format csv)""".format(
                        self.upload_table, headers), f)
                conn.commit()

        # Remove any exact duplicates from db table
        try:
            query = self.removeTableDuplicates()
            cur.execute(query)
            conn.commit()
        except Exception:
            logging.exception("Failed to remove duplicates from table")
            conn.rollback()

        # Also transfer matches to transfer table (orgs_lookup, where doesn't exist already)
        query = self.transferMatches()
        cur.execute(query)
        conn.commit()
Example #13
    def removeTableDuplicates(self):
        """
        Remove any exact duplicates from db table

        :param table_name: the database table containing duplicates
        :param headers: the csv headers
        :return: the sql query to be executed
        """

        logging.info("Removing duplicates from table...")
        query = \
            """
            WITH dups AS
                (SELECT DISTINCT ON ({}) * FROM {})

            DELETE FROM {} WHERE ({}.id) NOT IN
            (SELECT id FROM dups);
            """.format(self.headers, self.upload_table, self.upload_table, self.upload_table)
        return query
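# Worked illustration of the pattern above, assuming a hypothetical table
# uploads(id, src_name, reg_id) whose CSV headers were "src_name, reg_id".
# DISTINCT ON keeps one arbitrary row per (src_name, reg_id) group; the DELETE
# then removes every row whose id was not kept.
example_dedupe_sql = """
    WITH dups AS
        (SELECT DISTINCT ON (src_name, reg_id) * FROM uploads)
    DELETE FROM uploads WHERE (uploads.id) NOT IN
    (SELECT id FROM dups);
"""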
    def manualMatching(self):
        """
        Provides user-input functionality for manual matching based on the extracted records
        :return manual_match_file: extracted file with added column (Y/N/Unsure)
        """

        best_filtered = pd.read_csv(self.best_filtered,
                                    index_col=None,
                                    dtype=self.df_dtypes)
        best_filtered['Manual_Match_N'] = ''
        best_filtered['Manual_Match_NA'] = ''

        if self.in_args.terminal_matching:
            # Iterate over the file, shuffled with sample, as best matches otherwise would show first:
            for index, row in best_filtered.sample(frac=1).iterrows():
                logging.info("\nSource name: " + str(row.src_name_adj))
                logging.info("\nRegistry name: " + str(row.reg_name_adj))
                logging.info("\nLevenshtein distance: " +
                             str(row.leven_dist_N))
                match_options = ["y", "n", "u", "f"]
                match = input("\nMatch? Yes, No, Unsure, Finished (Y/N/U/F):")
                while match.lower() not in match_options:
                    match = input(
                        "\nMatch? Yes, No, Unsure, Finished (Y/N/U/F):")

                if str(match).lower() != "f":
                    best_filtered.at[index, 'Manual_Match_N'] = str(
                        match).capitalize()
                    continue
                else:
                    break

            best_filtered.sort_values(by=['Cluster_ID'],
                                      inplace=True,
                                      axis=0,
                                      ascending=True)

            logging.info("Saving...")
            best_filtered.to_csv(self.unverified_file,
                                 index=False,
                                 columns=self.dbUpload_cols)
        else:
            best_filtered.to_csv(self.unverified_file,
                                 index=False,
                                 columns=self.dbUpload_cols)
    def clean(self):

        raw_data = self.directories['raw_dir'].format(
            self.region_dir) + self.directories['raw_reg_data'].format(
                self.in_args.reg)
        adj_data = self.directories['adj_dir'].format(
            self.region_dir) + self.directories['adj_reg_data'].format(
                self.in_args.reg_adj)

        if not os.path.exists(adj_data):
            logging.info("Re-organising registry data...")
            df = pd.read_csv(raw_data, dtype=self.df_dtypes, chunksize=500000)

            dffullmerge = pd.DataFrame([])
            for chunk in df:

                # Remove punctuation and double spacing
                adj_col = str('reg_name_adj')
                orig_col = str('reg_name')

                # chunk[adj_col] = ''
                chunk = DataProcessing.remvPunct(self, chunk, orig_col,
                                                 adj_col)

                # # Duplicate rows containing 'and' then convert to &
                # chunk1 = DataProcessing.duplicaterowscontainingand(self, df, adj_col)
                #
                # # Duplicate rows containing '&' and convert to 'and'
                # chunk2 = DataProcessing.duplicaterowscontainingampersand(self, df, adj_col)
                #
                # # Concatenate the two together
                # chunk = pd.concat([chunk1, chunk2]).sort_index().reset_index(drop=True)

                # The block above raises "Uncaught exception: 'TextFileReader' object has
                # no attribute 'copy'" because it passes `df` (the chunked TextFileReader
                # returned by read_csv) instead of the current `chunk`.

                # Replace organisation suffixes with standardised version
                chunk[adj_col].replace(self.org_suffixes.org_suffixes_dict,
                                       regex=True,
                                       inplace=True)

                # Remove punctuation and double spacing in address
                adj_col = str('reg_address_adj')
                orig_col = str('reg_address')

                # dfmerge = self.remvPunct(dfmerge, orig_col, adj_col)
                dfmerge = DataProcessing.remvPunct(self, chunk, orig_col,
                                                   adj_col)

                # dfmerge = self.remvStreetNumber(dfmerge, adj_col)
                dfmerge = DataProcessing.joinFields(self, dfmerge, 'reg')

                dffullmerge = pd.concat([dffullmerge, dfmerge],
                                        ignore_index=True)

            dffullmerge.drop_duplicates(inplace=True)
            logging.info("...done")

            dffullmerge['reg_joinfields'] = dffullmerge[
                'reg_joinfields'].astype(str)
            dffullmerge['match_source'] = dffullmerge['match_source'].astype(
                str)

            dffullmerge['reg_scheme'] = dffullmerge['reg_scheme'].astype(str)
            dffullmerge['reg_id'] = dffullmerge['reg_id'].astype(str)
            dffullmerge['reg_address_adj'] = dffullmerge[
                'reg_address_adj'].astype(str)
            dffullmerge.to_csv(adj_data, index=False)
            return dffullmerge
        else:
            return pd.read_csv(adj_data, dtype=self.df_dtypes)
    def clean(self):
        raw_data = self.directories['raw_dir'].format(
            self.region_dir) + self.directories['raw_src_data'].format(
                self.in_args.src)
        adj_data = self.directories['adj_dir'].format(
            self.region_dir) + self.directories['adj_src_data'].format(
                self.in_args.src_adj)

        if not os.path.exists(adj_data):
            df = pd.read_csv(raw_data,
                             usecols=self.raw_src_data_cols,
                             dtype=self.df_dtypes)

            logging.info("Re-organising source data...")
            # Remove punctuation and double spacing in name
            adj_col = str('src_name_adj')
            orig_col = str('src_name')

            df = df.dropna(subset=[orig_col])

            # Decode html entities i.e. "&amp;" into "&"
            df[adj_col] = df[orig_col].apply(html.unescape)

            # Duplicate rows containing 'and' then convert to &
            df1 = DataProcessing.duplicaterowscontainingand(self, df, adj_col)

            # Duplicate rows containing '&' and convert to 'and'
            df2 = DataProcessing.duplicaterowscontainingampersand(
                self, df, adj_col)

            # Concatenate the two together
            df = pd.concat([df1, df2]).sort_index().reset_index(drop=True)

            # df = DataProcessing.remvPunct(self, df, orig_col, adj_col)
            df = DataProcessing.remvPunct(self, df, adj_col, adj_col)

            # Replace organisation suffixes with standardised version
            df[adj_col].replace(self.org_suffixes.org_suffixes_dict,
                                regex=True,
                                inplace=True)

            # # Remove punctuation and double spacing in address
            adj_col = str('src_address_adj')
            df[adj_col] = df['src_address_streetaddress'] + ', ' + df[
                'src_address_locality'] + ', ' + df[
                    'src_address_postalcode'] + ', ' + df[
                        'src_address_countryname']
            df = DataProcessing.remvPunct(self, df, adj_col, adj_col)
            df = DataProcessing.joinFields(self, df, 'src')
            df = df.drop([
                'src_address_streetaddress', 'src_address_locality',
                'src_address_postalcode', 'src_address_countryname'
            ],
                         axis=1)
            logging.info("...done")

            # Remove blacklisted entities
            df = self.filter_blacklisted(df)

            df.to_csv(adj_data, index=False)

            # If flag 'split' has been used, split the source file into smaller files
            if self.in_args.split:
                chunksize = self.in_args.splitsize
                numberofchunks = len(df) // chunksize + 1
                for i in range(numberofchunks):
                    split_fp = os.path.join(
                        os.getcwd(), 'Regions', str(self.in_args.region),
                        'Data_Inputs', 'Adj_Data', 'Splits',
                        str(self.in_args.src)[:-4] + str(i) + '.csv')
                    df[chunksize * i:(i + 1) * chunksize].to_csv(split_fp,
                                                                 index=False)
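# Note on the chunk count above: len(df) // chunksize + 1 produces one extra,
# empty split file whenever len(df) is an exact multiple of chunksize. A sketch
# of an alternative, if that trailing empty file is unwanted:
import math

def number_of_chunks(n_rows, chunksize):
    # round up, but produce zero chunks for an empty frame
    return math.ceil(n_rows / chunksize) if n_rows else 0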