def make_filter_only_column(whitelist):
    log_info("only columns [{whitelist}]".format(whitelist=','.join(whitelist)))

    def filter_only_column(_, acc):
        if whitelist:
            acc['columns'] = whitelist
        return True, acc
    return filter_only_column
def sub_batch_run(self, lines, batch_number, sub_batch_number, total_line_nb, check=False):
    success = False
    st = time()
    try:
        success = self._send_rpc(lines, batch_number, sub_batch_number, check=check)
    except Fault as e:
        log_error("Line %s %s failed" % (batch_number, sub_batch_number))
        log_error(e.faultString)
    except ValueError:
        log_error("Line %s %s failed value error" % (batch_number, sub_batch_number))
    except Exception:
        log_info("Unknown Problem")
        exc_type, exc_value, _ = sys.exc_info()
        # traceback.print_tb(exc_traceback, file=sys.stdout)
        log_error(exc_type)
        log_error(exc_value)
    if not success:
        self.writer.writerows(lines)
    log_info("time for batch %s - %s of %s : %s" % (batch_number, (sub_batch_number + 1) * self.batch_size, total_line_nb, time() - st))
def make_filter_ignore(blacklist):
    log_info("skip tables [{blacklist}]".format(blacklist=','.join(blacklist)))

    def filter_ignore(table, acc):
        if table.name not in blacklist:
            return True, acc
        else:
            return False, acc
    return filter_ignore
def rs_make_filter_daily(time_columns):
    log_info("use time-columns [{time}]".format(time=','.join(time_columns)))

    def filter_daily(_, acc):
        if any(c in time_columns for c in acc['columns']):
            return True, acc
        else:
            return False, acc
    return filter_daily
def make_filter_only(whitelist):
    log_info("only tables [{whitelist}]".format(whitelist=','.join(whitelist)))

    def filter_only(table, acc):
        if whitelist and table.name in whitelist:
            return True, acc
        elif whitelist:
            return False, acc
        else:
            return True, acc
    return filter_only
def run_stats(table_column):
    table, column, start_day, end_day = table_column
    # the original log line referenced an undefined `name`; log the target instead
    log_info("read stats for {table}.{column}".format(table=table, column=column))
    df = read_stats(table, column, start_day, end_day)
    df['table_name'] = table
    df['column_name'] = column
    df = process_result(df)
    return df
def process(tables, func_list):
    result = []
    for table in tables:
        acc = {'table_name': [table.name]}
        for f in func_list:
            ok, acc = f(table, acc)
            if not ok:
                log_info("skip {table}".format(table=table.name))
                acc = {}
                break
        if acc:
            log_info("processed {table}".format(table=table.name))
            result.append(acc)
    return result
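# Illustrative sketch (not part of the original module): one way the make_filter_*
# factories above can be chained through process(). The Table stand-in, table names
# and column names are assumptions for the example; the filters only rely on a
# `.name` attribute and on the 'columns' key seeded by make_filter_only_column.
def _example_process_pipeline():
    class Table(object):
        def __init__(self, name):
            self.name = name

    tables = [Table('sales'), Table('events'), Table('audit_log')]
    func_list = [
        make_filter_only(['sales', 'events']),            # keep only whitelisted tables
        make_filter_only_column(['id', 'created_at']),    # seed acc['columns']
        rs_make_filter_daily(['created_at']),             # keep tables that have a time column
    ]
    # Each surviving accumulator carries the table name and its retained columns.
    return process(tables, func_list)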
def launch_batch_fun(data_ids, batch_number, check=False):
    # nested helper: `self` (and self.model, self.header, self.context, self.result)
    # is assumed to come from the enclosing RPC-thread method in which it is defined
    st = time()
    try:
        self.result[batch_number] = self.model.export_data(data_ids, self.header, context=self.context)['datas']
    except Fault as e:
        log_error("export %s failed" % batch_number)
        log_error(e.faultString)
    except Exception:
        log_info("Unknown Problem")
        exc_type, exc_value, _ = sys.exc_info()
        # traceback.print_tb(exc_traceback, file=sys.stdout)
        log_error(exc_type)
        log_error(exc_value)
    log_info("time for batch %s: %s" % (batch_number, time() - st))
def export_list(table_results, csv_out, csv_columns):
    frames = []
    for result in table_results:
        log_info("exporting {table}".format(table=result['table_name']))
        frame_columns = []
        for key, value in result.items():
            if key in csv_columns:
                tmp_df = pd.DataFrame({key: value})
                tmp_df['_key'] = 1
                frame_columns.append(tmp_df)
        frames.append(reduce(lambda x, y: pd.merge(x, y, on='_key'), frame_columns))
    if frames:
        df = reduce(lambda x, y: pd.concat([x, y]), frames)
        df = df.sort_values(csv_columns)
        df.to_csv(csv_out, header=False, index=False, columns=csv_columns)
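# Illustrative sketch (assumption): export_list expects one dict per table where
# each csv_column key maps to a list; the lists of a single table are cross-joined
# via the '_key' column before all tables are concatenated and sorted. The table
# and column names below are placeholders.
def _example_export_list():
    results = [
        {'table_name': ['sales'], 'columns': ['id', 'created_at']},
        {'table_name': ['events'], 'columns': ['id', 'ts']},
    ]
    export_list(results, 'tables.csv', ['table_name', 'columns'])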
def export_data(config, model, domain, header, context=None, output=None,
                max_connection=1, batch_size=100, separator=';', encoding='utf-8-sig'):
    object_registry = get_server_connection(config).get_model(model)
    if output:
        file_result = open(output, "wb")
        writer = UnicodeWriter(file_result, delimiter=separator, encoding=encoding, quoting=csv.QUOTE_ALL)
    else:
        writer = ListWriter()

    rpc_thread = RPCThreadExport(int(max_connection), object_registry, header, writer, batch_size, context)
    st = time()

    ids = object_registry.search(domain, context=context)
    i = 0
    for b in batch(ids, batch_size):
        batch_ids = [l for l in b]
        rpc_thread.launch_batch(batch_ids, i)
        i += 1

    rpc_thread.wait()
    log_info("%s %s exported, total time %s second(s)" % (len(ids), model, (time() - st)))
    log_info("Writing file")
    rpc_thread.write_file(writer)
    if output:
        file_result.close()
        return False, False
    else:
        return writer.header, writer.data
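# Illustrative sketch (assumption): a typical export_data call. The connection
# file name, model, domain and header are placeholders; `config` is only passed
# through to get_server_connection(), and with `output` set the rows are written
# to the CSV file instead of being returned.
def _example_export_data():
    export_data(
        'connection.conf',
        'res.partner',
        [('active', '=', True)],
        ['id', 'name', 'email'],
        output='res_partner.csv',
        max_connection=4,
        batch_size=200,
    )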
def import_data(config, model, header=None, data=None, fobj_read=None, context=None,
                fobj_fail=False, encoding='utf-8-sig', separator=";", ignore=False,
                split=False, check=True, max_connection=1, batch_size=10, skip=0):
    """header and data are mandatory if fobj_read is not provided"""
    ignore = ignore or []
    context = context or {}
    if fobj_read:
        header, data = read_file(fobj_read, delimiter=separator, encoding=encoding, skip=skip)
        fobj_fail = fobj_fail or open(fobj_read.name + ".fail", 'wb')

    if not header or data is None:
        raise ValueError("Please provide either a data file or a header and data")

    object_registry = get_server_connection(config).get_model(model)

    if fobj_read:
        writer = UnicodeWriter(fobj_fail, delimiter=separator, encoding=encoding, quoting=csv.QUOTE_ALL)
    else:
        writer = ListWriter()

    writer.writerow(filter_header_ignore(ignore, header))
    if fobj_read:
        fobj_fail.flush()

    rpc_thread = RPCThreadImport(int(max_connection), object_registry,
                                 filter_header_ignore(ignore, header), writer, batch_size, context)
    st = time()

    data, split_index = split_sort(split, header, data)

    i = 0
    previous_split_value = False
    while i < len(data):
        lines = []
        j = 0
        while i < len(data) and (j < batch_size or do_not_split(split, previous_split_value, split_index, data[i])):
            line = data[i][:len(header)]
            lines.append(filter_line_ignore(ignore, header, line))
            previous_split_value = line[split_index]
            j += 1
            i += 1
        batch_number = split and "[%s] - [%s]" % (rpc_thread.thread_number(), previous_split_value) or "[%s]" % rpc_thread.thread_number()
        rpc_thread.launch_batch(lines, batch_number, check)

    rpc_thread.wait()
    if fobj_read:
        fobj_fail.close()

    log_info("%s %s imported, total time %s second(s)" % (len(data), model, (time() - st)))
    if fobj_read:
        return False, False
    else:
        return writer.header, writer.data
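# Illustrative sketch (assumption): importing a previously exported CSV. File and
# column names are placeholders. When a file object is passed, lines that fail are
# written to a sibling '<filename>.fail' file built from fobj_read.name.
def _example_import_data():
    with open('res_partner.csv', 'rb') as fobj:
        import_data(
            'connection.conf',
            'res.partner',
            fobj_read=fobj,
            max_connection=4,
            batch_size=50,
            ignore=['email'],   # columns dropped before import
        )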
def skip_line(reader):
    # `skip` is not a parameter here: this helper is assumed to be nested inside
    # the file-reading function, where `skip` is available from the enclosing scope.
    log_info("Skipping until line %s excluded" % skip)
    for _ in range(1, skip):
        next(reader)
def make_filter_ignore_column(blacklist):
    log_info("ignore columns [{blacklist}]".format(blacklist=','.join(blacklist)))

    def filter_ignore_column(_, acc):
        acc['columns'] = [col for col in acc['columns'] if col not in blacklist]
        return True, acc
    return filter_ignore_column
def count_daily(table_info):
    table, start_day, end_day = table_info
    log_info("read row count per day for {table}".format(table=table))
    result = count_rows_daily(table, start_day, end_day)
    for r in result:
        yield {'tablename': table, 'on_day': r[0], 'total_rows': r[1]}
def make_filter_tables(ignore):
    log_info("skip tables {ignore}".format(ignore=','.join(ignore)))

    def filter_ignore(tables):
        return (table for table in tables if table not in ignore)
    return filter_ignore
def load(dest_tables, source_tables):
    for dest, source in zip(dest_tables, source_tables):
        log_info("copy from {source} to {dest}".format(source=source.name, dest=dest.name))
        insert_data = make_insert(source)
        insert_data(dest, source)
def count_whole(table_info):
    table, column = table_info
    log_info("read row count for {table}".format(table=table))
    result = count_rows(table, column)
    for r in result:
        yield {'tablename': table, 'total_rows': r[0]}
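# Illustrative sketch (assumption): both count_* generators yield plain dicts, so
# their output can be collected straight into a DataFrame for reporting. The table
# identifier, column and date range below are placeholders; count_rows and
# count_rows_daily define what arguments they actually accept.
def _example_counts():
    rows = list(count_whole(('sales', 'id')))
    rows += list(count_daily(('sales', '2021-01-01', '2021-01-31')))
    return pd.DataFrame(rows)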