Example #1
def make_filter_only_column(whitelist):
    log_info("only columns [{whitelist}]".format(whitelist=','.join(whitelist)))
    def filter_only_column(_, acc):
        if whitelist:
            acc['columns'] = whitelist
        return True, acc
    return filter_only_column
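
These filter factories all share one protocol: the returned filter takes (table, acc) and returns (ok, acc), where acc is an accumulator dict. A quick call sketch, assuming the log_info helper from the surrounding codebase is in scope:

f = make_filter_only_column(['id', 'created'])
ok, acc = f(None, {'table_name': ['users']})  # the first argument is ignored here
assert ok and acc['columns'] == ['id', 'created']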
Example #2
    def sub_batch_run(self,
                      lines,
                      batch_number,
                      sub_batch_number,
                      total_line_nb,
                      check=False):
        success = False

        st = time()
        try:
            success = self._send_rpc(lines,
                                     batch_number,
                                     sub_batch_number,
                                     check=check)
        except Fault as e:
            log_error("Batch %s, sub-batch %s failed" % (batch_number, sub_batch_number))
            log_error(e.faultString)
        except ValueError as e:
            log_error("Batch %s, sub-batch %s failed with a value error" %
                      (batch_number, sub_batch_number))
            log_error(str(e))
        except Exception as e:
            log_info("Unknown Problem")
            exc_type, exc_value, _ = sys.exc_info()
            # traceback.print_tb(exc_traceback, file=sys.stdout)
            log_error(exc_type)
            log_error(exc_value)

        if not success:
            self.writer.writerows(lines)

        log_info("time for batch %s - %s of %s : %s" %
                 (batch_number, (sub_batch_number + 1) * self.batch_size,
                  total_line_nb, time() - st))
Example #3
def make_filter_ignore(blacklist):
    log_info("skip tables [{blacklist}]".format(blacklist=','.join(blacklist)))
    def filter_ignore(table, acc):
        if table.name not in blacklist:
            return True, acc
        else:
            return False, acc
    return filter_ignore
Example #4
def rs_make_filter_daily(time_columns):
    log_info("use time-columns [{time}]".format(time=','.join(time_columns)))
    def filter_daily(_, acc):
        if any(c in time_columns for c in acc['columns']):
            return True, acc
        else:
            return False, acc
    return filter_daily
Example #5
def make_filter_only(whitelist):
    log_info("only tables [{whitelist}]".format(whitelist=','.join(whitelist)))
    def filter_only(table, acc):
        if whitelist and table.name in whitelist:
            return True, acc
        elif whitelist:
            return False, acc
        else:
            return True, acc
    return filter_only
Example #6
def run_stats(table_column):
    table, column, start_day, end_day = table_column
    log_info("read stats for {table}.{column}".format(table=table,
                                                      column=column))
    df = read_stats(table, column, start_day, end_day)
    df['table_name'] = table
    df['column_name'] = column
    df = process_result(df)
    return df
Example #7
def process(tables, func_list):
    result = []
    for table in tables:
        acc = {'table_name': [table.name]}
        for f in func_list:
            ok, acc = f(table, acc)
            if not ok:
                log_info("skip {table}".format(table=table.name))
                acc = {}
                break
        if acc:
            log_info("processed {table}".format(table=table.name))
            result.append(acc)
    return result
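
process drives the whole filter pipeline: each table starts with a fresh accumulator and is passed through every filter until one rejects it. A minimal composition sketch, assuming the factories from Examples #1, #3 and #5 are in scope, and using a hypothetical namedtuple in place of the real table object:

from collections import namedtuple

Table = namedtuple('Table', ['name'])  # hypothetical stand-in for the real table type
tables = [Table('users'), Table('events'), Table('tmp_events')]

filters = [
    make_filter_only(['users', 'events']),       # keep only whitelisted tables
    make_filter_ignore(['tmp_events']),          # drop blacklisted tables
    make_filter_only_column(['id', 'created']),  # restrict the column set
]
result = process(tables, filters)
# e.g. [{'table_name': ['users'], 'columns': ['id', 'created']}, ...]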
Example #8
def launch_batch_fun(data_ids, batch_number, check=False):
    # Nested worker function: `self` is captured from the enclosing method's scope.
    st = time()
    try:
        self.result[batch_number] = self.model.export_data(
            data_ids, self.header, context=self.context)['datas']
    except Fault as e:
        log_error("export %s failed" % batch_number)
        log_error(e.faultString)
    except Exception as e:
        log_info("Unknown Problem")
        exc_type, exc_value, _ = sys.exc_info()
        # traceback.print_tb(exc_traceback, file=sys.stdout)
        log_error(exc_type)
        log_error(exc_value)
    log_info("time for batch %s: %s" % (batch_number, time() - st))
Example #9
def export_list(table_results, csv_out, csv_columns):
    frames = []
    for result in table_results:
        log_info("exporting {table}".format(table=result['table_name']))
        frame_columns = []
        for key, value in result.items():
            if key in csv_columns:
                tmp_df = pd.DataFrame({key: value})
                tmp_df['_key'] = 1  # constant key used to cross-join the columns
                frame_columns.append(tmp_df)
        # merging on the constant '_key' produces the cartesian product of the columns
        frames.append(reduce(lambda x, y: pd.merge(x, y, on='_key'), frame_columns))

    if frames:
        df = pd.concat(frames)
        df = df.sort_values(csv_columns)
        df.to_csv(csv_out, header=False, index=False,
                  columns=csv_columns)
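
The '_key' column above is the usual pandas cross-join trick: merging two frames on a constant column pairs every row of one with every row of the other. A small self-contained illustration with made-up values:

import pandas as pd
from functools import reduce

a = pd.DataFrame({'table_name': ['users'], '_key': 1})
b = pd.DataFrame({'columns': ['id', 'created'], '_key': 1})
df = reduce(lambda x, y: pd.merge(x, y, on='_key'), [a, b])
print(df[['table_name', 'columns']])
#   table_name  columns
# 0      users       id
# 1      users  created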
Example #10
def export_data(config,
                model,
                domain,
                header,
                context=None,
                output=None,
                max_connection=1,
                batch_size=100,
                separator=';',
                encoding='utf-8-sig'):

    object_registry = get_server_connection(config).get_model(model)

    if output:
        file_result = open(output, "wb")
        writer = UnicodeWriter(file_result,
                               delimiter=separator,
                               encoding=encoding,
                               quoting=csv.QUOTE_ALL)
    else:
        writer = ListWriter()

    rpc_thread = RPCThreadExport(int(max_connection), object_registry, header,
                                 writer, batch_size, context)
    st = time()

    ids = object_registry.search(domain, context=context)
    for i, b in enumerate(batch(ids, batch_size)):
        rpc_thread.launch_batch(list(b), i)

    rpc_thread.wait()
    log_info("%s %s exported, total time %s second(s)" % (len(ids), model,
                                                          (time() - st)))
    log_info("Writing file")
    rpc_thread.write_file(writer)
    if output:
        file_result.close()
        return False, False
    else:
        return writer.header, writer.data
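
export_data relies on a batch(ids, batch_size) helper that is not shown in this listing; any chunking iterator will do. A minimal sketch of such a helper (the real implementation may differ):

def batch(sequence, size):
    # Yield successive slices of at most `size` items.
    for start in range(0, len(sequence), size):
        yield sequence[start:start + size]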
Example #11
def import_data(config,
                model,
                header=None,
                data=None,
                fobj_read=None,
                context=None,
                fobj_fail=False,
                encoding='utf-8-sig',
                separator=";",
                ignore=False,
                split=False,
                check=True,
                max_connection=1,
                batch_size=10,
                skip=0):
    """
        header and data mandatory in fobj_read is not provided

    """
    ignore = ignore or []
    context = context or {}

    if fobj_read:
        header, data = read_file(fobj_read,
                                 delimiter=separator,
                                 encoding=encoding,
                                 skip=skip)
        fobj_fail = fobj_fail or open(fobj_read.name + ".fail", 'wb')

    if not header or data is None:
        raise ValueError(
            "Please provide either a data file or a header and data")

    object_registry = get_server_connection(config).get_model(model)

    if fobj_read:
        writer = UnicodeWriter(fobj_fail,
                               delimiter=separator,
                               encoding=encoding,
                               quoting=csv.QUOTE_ALL)
    else:
        writer = ListWriter()

    writer.writerow(filter_header_ignore(ignore, header))
    if fobj_read:
        fobj_fail.flush()
    rpc_thread = RPCThreadImport(int(max_connection), object_registry,
                                 filter_header_ignore(ignore, header), writer,
                                 batch_size, context)
    st = time()

    data, split_index = split_sort(split, header, data)

    i = 0
    previous_split_value = False
    while i < len(data):
        lines = []
        j = 0
        while i < len(data) and (j < batch_size or do_not_split(
                split, previous_split_value, split_index, data[i])):
            line = data[i][:len(header)]
            lines.append(filter_line_ignore(ignore, header, line))
            previous_split_value = line[split_index]
            j += 1
            i += 1
        if split:
            batch_number = "[%s] - [%s]" % (rpc_thread.thread_number(),
                                            previous_split_value)
        else:
            batch_number = "[%s]" % rpc_thread.thread_number()
        rpc_thread.launch_batch(lines, batch_number, check)

    rpc_thread.wait()
    if fobj_read:
        fobj_fail.close()

    log_info("%s %s imported, total time %s second(s)" % (len(data), model,
                                                          (time() - st)))
    if fobj_read:
        return False, False
    else:
        return writer.header, writer.data
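
A minimal call sketch for import_data, feeding in-memory header/data rather than a file. The config value, model name and row contents are placeholders, not part of the original code:

header = ['id', 'name']
data = [
    ['base.res_partner_1', 'Deco Addict'],
    ['base.res_partner_2', 'Gemini Furniture'],
]
failed_header, failed_rows = import_data(config,         # placeholder connection config
                                         'res.partner',  # placeholder model name
                                         header=header,
                                         data=data,
                                         max_connection=4,
                                         batch_size=50)
# rows that failed to import are returned via the internal ListWriter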
Example #12
def skip_line(reader):
    # `skip` is captured from the enclosing scope; lines 1..skip-1 are consumed.
    log_info("Skipping until line %s excluded" % skip)
    for _ in range(1, skip):
        next(reader)
Example #13
def make_filter_ignore_column(blacklist):
    log_info("ignore columns [{blacklist}]".format(blacklist=','.join(blacklist)))
    def filter_ignore_column(_, acc):
        acc['columns'] = [col for col in acc['columns'] if col not in blacklist]
        return True, acc
    return filter_ignore_column
Example #14
def count_daily(table_info):
    (table, start_day, end_day) = table_info
    log_info("read row count per day for {table}".format(table=table))
    result = count_rows_daily(table, start_day, end_day)
    for r in result:
        yield {'tablename': table, 'on_day': r[0], 'total_rows': r[1]}
Example #15
def make_filter_tables(ignore):
    log_info("skip tables {ignore}".format(ignore=','.join(ignore)))
    filter_ignore = lambda tables: (table for table in tables if table not in ignore)
    return filter_ignore
Example #16
def load(dest_tables, source_tables):
    for dest, source in zip(dest_tables, source_tables):
        log_info("copy from {source} to {dest}".format(source=source.name,
                                                       dest=dest.name))
        insert_data = make_insert(source)
        insert_data(dest, source)
Example #17
def count_whole(table_info):
    table, column = table_info
    log_info("read row count for {table}".format(table=table))
    result = count_rows(table, column)
    for r in result:
        yield {'tablename': table, 'total_rows': r[0]}