# Exemplo n.º 1
# 0
def diff_csv(file1, file2, separator=None, nrows=None):
    """Compare two CSV files and raise if their contents differ.

    Args:
        file1: Path to the first CSV file.
        file2: Path to the second CSV file.
        separator: Field separator for both files. When None, the separator
            is inferred per-file via ``get_separator_from_filename``.
        nrows: Optional cap on the number of rows read from each file.

    Returns:
        None when the files are an exact match.

    Raises:
        DifferingRows: when rows are present in one file but not the other.
        DifferingColumns: when the two files have different column sets.
        Exception: when the same rows exist in both files but in a
            different order.
    """
    logger = Logger("CSV File Diff")

    # PEP 8: compare against the None sentinel with `is`, not `==`.
    if separator is None:
        sep1 = get_separator_from_filename(file1)
        sep2 = get_separator_from_filename(file2)
    else:
        sep1 = separator
        sep2 = separator

    df1 = pd.read_csv(file1, sep=sep1, nrows=nrows)
    df2 = pd.read_csv(file2, sep=sep2, nrows=nrows)

    # Fast path: identical frames (values, dtypes, index) -> nothing to do.
    if df1.equals(df2):
        return

    # Rows that appear in exactly one of the two frames survive the
    # drop_duplicates(keep=False) pass over the concatenation.
    df = pd.concat([df1, df2]).drop_duplicates(keep=False)

    if len(df) > 0:
        logger.error(" ======== Differing Rows ========== ")
        logger.error(df)
        # Outer-merge on the first differing column; `indicator=True` adds a
        # `_merge` categorical telling which side each row came from.
        m = df1.merge(df2, on=df.columns[0], how='outer',
                      suffixes=['', '_'], indicator=True)[['_merge']]
        # Keep only one-sided rows. Exact comparison is safer than the
        # original `.str.contains('both')` substring test on the categorical.
        m = m[m['_merge'] != 'both']
        file1 = file1.split('/')[-1]
        file2 = file2.split('/')[-1]

        m['_merge'] = m['_merge'].map({'left_only': file1, 'right_only': file2})
        m = m.rename(columns={'_merge': 'Only Contained Within'})
        m.index.name = 'Row Number'
        logger.error(m.reset_index().to_dict(orient='records'))
        raise DifferingRows("Something not right with the rows, changes detected.")

    elif len(df1.columns) != len(df2.columns):

        raise DifferingColumns('in df1 but not df2', list(set(df1.columns) - set(df2.columns)), '\n',
                               'in df2 but not df1', list(set(df2.columns) - set(df1.columns)))

    else:
        logger.error(" ======= Rows are likely in a different order ====== ")
        for i in range(len(df1)):
            # BUG FIX: the original used `.any()`, which only flagged rows
            # sharing *no* values at all; a row differing in a single cell
            # was silently skipped. `.all()` flags any positional mismatch.
            if not (df1.iloc[i] == df2.iloc[i]).all():
                # Route through the logger like every other message
                # (the original used a bare print here).
                logger.error(f'Row {i} is in a different location')
        raise Exception("differences detected")
# Exemplo n.º 2
# 0
def _process_list_data(ctx):
    """Run the ETL over a list of datasets, optionally watching for changes.

    Loads the config referenced by ``ctx.obj['conf']``, executes every
    dataset entry against the transform rules, and then — unless
    ``ctx.obj['listen_for_changes']`` is falsy — polls every 5 seconds for:
      * a different rules file being referenced by the config,
      * a modification to the current rules file,
      * new dataset entries in the config's ``data`` list,
    re-executing the affected work when any of these occur.

    Args:
        ctx: click context whose ``obj`` dict provides 'clean', 'rules',
            'bclink_helpers', 'conf' and 'listen_for_changes'.
    """
    logger = Logger("_process_list_data")
    logger.info("ETL process has begun")

    data = []
    clean = ctx.obj['clean']
    rules = ctx.obj['rules']
    bclink_helpers = ctx.obj['bclink_helpers']
    config_file = ctx.obj['conf']
    conf = _load_config(config_file)
    rules_file = conf['rules']
    rules_file_last_modified = os.path.getmtime(rules_file)

    bclink_helpers.print_summary()
    display_msg = True
    # Only the very first execution performs a clean; subsequent re-runs
    # within the watch loop must not wipe already-loaded data.
    _clean = clean

    while True:

        re_execute = False
        # Bind up-front: previously this was only set on the `else` branch
        # below and L139's check relied on `or` short-circuiting to avoid
        # a NameError.
        change_in_rules = False
        try:
            conf = _load_config(config_file)
        except Exception as e:
            # Report a broken config once per breakage: display_msg is False
            # after a successful cycle, so the first failure logs and the
            # flag then suppresses repeats until the config parses again.
            if not display_msg:
                logger.critical(e)
                logger.error(
                    f"You've misconfigured your file '{config_file}'!! Please fix!"
                )
            time.sleep(5)
            display_msg = True
            continue

        current_rules_file = conf['rules']
        new_rules_file = rules_file != current_rules_file
        if new_rules_file:
            # The config now points at a different rules file.
            logger.info(
                f"Detected a new rules file.. old was '{rules_file}' and new is '{current_rules_file}'"
            )
            rules_file = current_rules_file
            rules = coconnect.tools.load_json_delta(rules_file, rules)
            rules_file_last_modified = os.path.getmtime(rules_file)
            re_execute = True
        else:
            # Otherwise check for in-place edits to the existing file.
            new_rules_file_last_modified = os.path.getmtime(current_rules_file)
            change_in_rules = rules_file_last_modified != new_rules_file_last_modified
            if change_in_rules:
                logger.info(
                    f"Detected a change/update in the rules file '{rules_file}'"
                )
                rules = coconnect.tools.load_json_delta(
                    current_rules_file, rules)
                # BUG FIX: record the new mtime. The original never updated
                # rules_file_last_modified here, so a single edit to the
                # rules file re-triggered the ETL on every 5-second poll
                # forever.
                rules_file_last_modified = new_rules_file_last_modified
                re_execute = True

        current_data = conf['data']
        if data != current_data:
            logger.debug(f"old {data}")
            logger.debug(f"new {current_data}")
            # Only entries not already processed.
            new_data = [obj for obj in current_data if obj not in data]
            logger.info(f"New data found! {new_data}")
            re_execute = True
        else:
            # A rules change re-executes everything already loaded.
            new_data = data

        logger.debug(f"re-execute {re_execute}")
        if re_execute:
            current_data = copy.deepcopy(new_data)
            # Loop over any new data items.
            for item in new_data:
                if isinstance(item['input'], list):
                    inputs = item['input']
                else:
                    input_folder = item['input']
                    if not os.path.isdir(input_folder):
                        raise Exception(
                            f"{input_folder} is not a directory containing files!"
                        )
                    inputs = coconnect.tools.get_files(input_folder,
                                                       type='csv')
                # Drop rule sources that have no matching input file.
                filtered_rules = coconnect.tools.remove_missing_sources_from_rules(
                    rules, inputs)

                _execute(ctx, data=item, rules=filtered_rules, clean=_clean)
                _clean = False

            # Remember what has now been processed.
            data += [x for x in current_data if x not in data]
            display_msg = True

        if new_rules_file or change_in_rules:
            # If there's a new rules file or a rules delta, pick up the
            # full rules for the next loop in case we insert new data
            # --> we don't want to just apply the delta to the new data.
            rules = coconnect.tools.load_json(current_rules_file)

        if not ctx.obj['listen_for_changes']:
            break

        if display_msg:
            logger.info(
                f"Finished!... Listening for changes to data in {config_file}")
            # Suppress the message until the next re-execution.
            display_msg = False

        time.sleep(5)