def like(df):
    """Build an empty DataFrame that mirrors the column layout of another.

    For every Column of ``df`` an empty Column with the same type code is
    created (in the same order). If ``df`` has column names, they are
    applied to the result as well. No row data is copied.

    Args:
        df: The DataFrame whose Column structure should be mirrored

    Returns:
        An empty DataFrame of the same kind (nullable or default) as ``df``
        with the same Column structure and names, or None if ``df`` is None
    """
    if df is None:
        return None

    def _new_frame(*args):
        # pick the implementation that matches the source DataFrame
        frame_type = (dataframe.NullableDataFrame
                      if df.is_nullable()
                      else dataframe.DefaultDataFrame)
        return frame_type(*args)

    n_cols = df.columns()
    if n_cols == 0:
        # nothing to mirror, hand back an uninitialized instance
        return _new_frame()

    empty_cols = [
        column.Column.of_type(df.get_column(idx).type_code())
        for idx in range(n_cols)
    ]
    mirrored = _new_frame(empty_cols)
    if df.has_column_names():
        mirrored.set_column_names(df.get_column_names())

    return mirrored
def copy_of(df):
    """Return an independent copy of a DataFrame.

    The source DataFrame is flushed first, then every internal Column is
    cloned and the clones are wrapped in a new DataFrame of the same
    kind (nullable or default) as the source.

    Args:
        df: The DataFrame instance to copy

    Returns:
        A copy of the specified DataFrame, or None if the argument is None
    """
    if df is None:
        return None

    df.flush()
    cloned = []
    for source_col in df._internal_columns():
        cloned.append(source_col.clone())

    if df.is_nullable():
        return dataframe.NullableDataFrame(cloned)

    return dataframe.DefaultDataFrame(cloned)
def _deserialize_v2(buffer):
    """Deserialization from the binary-based version 2 format (v2).

    The buffer consists of a header followed by the column payload.
    As read by this function the header is laid out as::

        {v:2;irrrrccccName1.Name2.ttllllbbb}

    where ``i`` is the implementation code (0x64 = DefaultDataFrame,
    0x6e = NullableDataFrame), ``rrrr`` and ``cccc`` are the row and column
    counts (unsigned 32 bit, big endian), followed by the zero-terminated
    UTF-8 column names and one type code byte per column. For a
    NullableDataFrame the header additionally carries a lookup list
    (``llll`` = its length in bytes, ``bbb`` = its bits).

    The payload stores the columns one after another. In a nullable
    DataFrame a zero/empty payload value is ambiguous, so every such value
    consumes the next bit of the lookup list: an unset bit denotes a real
    zero/empty value, a set bit denotes null (the slot is left as None).

    Args:
        buffer: The bytearray representing the DataFrame to deserialize

    Returns:
        A DataFrame from the given bytearray

    Raises:
        DataFrameException: If the implementation code is not supported,
            the closing brace of the header is missing or a column has
            an unknown type code
    """
    # HEADER
    ptr = 5  # index pointer
    dftype = buffer[ptr]
    if dftype not in (0x64, 0x6e):
        raise dataframe.DataFrameException(
            "Unsupported DataFrame implementation")

    # header format is {v:2;irrrrccccName1.Name2.ttllllbbb}0x...
    # code of the DataFrame implementation
    impl_default = (dftype == 0x64)
    ptr += 1
    rows = int.from_bytes(buffer[ptr:ptr + 4], byteorder="big", signed=False)
    ptr += 4
    cols = int.from_bytes(buffer[ptr:ptr + 4], byteorder="big", signed=False)
    ptr += 4

    # column labels (zero-terminated strings)
    names = []
    for _ in range(cols):
        c0 = ptr  # first char
        while buffer[ptr] != 0x00:
            ptr += 1

        ptr += 1
        names.append(buffer[c0:ptr - 1].decode("utf-8"))

    # column types
    types = []
    for _ in range(cols):
        types.append(buffer[ptr])
        ptr += 1

    # number of bytes needed to hold one bit per row
    bool_bytes = (rows + 7) // 8

    df = None
    columns = []
    if not impl_default:  # NullableDataFrame
        # first read the entire lookup list into memory
        lookup_length = int.from_bytes(buffer[ptr:ptr + 4],
                                       byteorder="big",
                                       signed=False)
        ptr += 4
        lookup_bits = BitVector(buffer[ptr:ptr + lookup_length])
        # list index pointing to the next readable bit within the lookup list
        li = 0
        ptr += lookup_length
        if buffer[ptr] != 0x7d:  # header closing brace '}' missing
            raise dataframe.DataFrameException("Invalid format")

        # END HEADER

        # PAYLOAD
        # NOTE: ptr always rests on the last consumed byte, so every read
        # first advances it and then addresses the value relative to it
        for i in range(cols):
            # np.object was removed from NumPy, the builtin is equivalent.
            # An empty object array is pre-filled with None, i.e. null
            val = np.empty(rows, dtype=object)
            if types[i] == bytecolumn.NullableByteColumn.TYPE_CODE:
                for j in range(rows):
                    ptr += 1
                    b = int.from_bytes(buffer[ptr:ptr + 1],
                                       byteorder="big",
                                       signed=True)
                    if b == 0:
                        if not lookup_bits.get(li):
                            val[j] = 0

                        li += 1
                    else:
                        val[j] = b

                columns.append(bytecolumn.NullableByteColumn(names[i], val))

            elif types[i] == shortcolumn.NullableShortColumn.TYPE_CODE:
                for j in range(rows):
                    ptr += 2
                    b = int.from_bytes(buffer[ptr - 1:ptr + 1],
                                       byteorder="big",
                                       signed=True)
                    if b == 0:
                        if not lookup_bits.get(li):
                            val[j] = 0

                        li += 1
                    else:
                        val[j] = b

                columns.append(shortcolumn.NullableShortColumn(names[i], val))

            elif types[i] == intcolumn.NullableIntColumn.TYPE_CODE:
                for j in range(rows):
                    ptr += 4
                    b = int.from_bytes(buffer[ptr - 3:ptr + 1],
                                       byteorder="big",
                                       signed=True)
                    if b == 0:
                        if not lookup_bits.get(li):
                            val[j] = 0

                        li += 1
                    else:
                        val[j] = b

                columns.append(intcolumn.NullableIntColumn(names[i], val))

            elif types[i] == longcolumn.NullableLongColumn.TYPE_CODE:
                for j in range(rows):
                    ptr += 8
                    b = int.from_bytes(buffer[ptr - 7:ptr + 1],
                                       byteorder="big",
                                       signed=True)
                    if b == 0:
                        if not lookup_bits.get(li):
                            val[j] = 0

                        li += 1
                    else:
                        val[j] = b

                columns.append(longcolumn.NullableLongColumn(names[i], val))

            elif types[i] == stringcolumn.NullableStringColumn.TYPE_CODE:
                for j in range(rows):
                    ptr += 1
                    c0 = ptr  # marks the first character of each string
                    while buffer[ptr] != 0x00:
                        ptr += 1

                    if (ptr - c0) == 0:
                        if not lookup_bits.get(li):
                            val[j] = ""

                        li += 1
                    else:
                        val[j] = buffer[c0:ptr].decode("utf-8")

                columns.append(stringcolumn.NullableStringColumn(
                    names[i], val))

            elif types[i] == floatcolumn.NullableFloatColumn.TYPE_CODE:
                for j in range(rows):
                    ptr += 4
                    # since Python does not have float32, we need to do a
                    # conversion over numpy and str to get the same
                    # precision as the original value
                    f = float(
                        str(
                            np.float32(
                                unpack(">f", buffer[ptr - 3:ptr + 1])[0])))
                    if f == 0.0:
                        if not lookup_bits.get(li):
                            val[j] = 0.0

                        li += 1
                    else:
                        val[j] = f

                columns.append(floatcolumn.NullableFloatColumn(names[i], val))

            elif types[i] == doublecolumn.NullableDoubleColumn.TYPE_CODE:
                for j in range(rows):
                    ptr += 8
                    f = unpack(">d", buffer[ptr - 7:ptr + 1])[0]
                    if f == 0.0:
                        if not lookup_bits.get(li):
                            val[j] = 0.0

                        li += 1
                    else:
                        val[j] = f

                columns.append(doublecolumn.NullableDoubleColumn(
                    names[i], val))

            elif types[i] == charcolumn.NullableCharColumn.TYPE_CODE:
                # chars do not use the lookup list, a zero byte is null
                for j in range(rows):
                    ptr += 1
                    c = int.from_bytes(buffer[ptr:ptr + 1],
                                       byteorder="big",
                                       signed=False)
                    if c == 0:
                        val[j] = None
                    else:
                        val[j] = chr(c)

                columns.append(charcolumn.NullableCharColumn(names[i], val))

            elif types[i] == booleancolumn.NullableBooleanColumn.TYPE_CODE:
                length = bool_bytes
                ptr += 1  # focus on next readable position
                bits = BitVector(buffer[ptr:ptr + length])
                for j in range(rows):
                    if not bits.get(j):
                        if not lookup_bits.get(li):
                            val[j] = False

                        li += 1
                    else:
                        val[j] = True

                # let the base pointer jump forward to the last read byte
                ptr += (length - 1)
                columns.append(
                    booleancolumn.NullableBooleanColumn(names[i], val))

            elif types[i] == binarycolumn.NullableBinaryColumn.TYPE_CODE:
                for j in range(rows):
                    ptr += 1
                    length = int.from_bytes(buffer[ptr:ptr + 4],
                                            byteorder="big",
                                            signed=False)
                    ptr += 3
                    # a length of zero denotes null
                    if length != 0:
                        data = bytearray(length)
                        for k in range(length):
                            ptr += 1
                            data[k] = buffer[ptr]

                        val[j] = data

                columns.append(binarycolumn.NullableBinaryColumn(
                    names[i], val))

            else:
                raise dataframe.DataFrameException(
                    ("Unknown column with type code {}").format(types[i]))

        # END PAYLOAD
        if cols == 0:  # uninitialized instance
            df = dataframe.NullableDataFrame()
        else:
            df = dataframe.NullableDataFrame(columns)

    else:  # DefaultDataFrame
        if buffer[ptr] != 0x7d:  # header closing brace '}'
            raise dataframe.DataFrameException("Invalid format")

        # END HEADER

        # PAYLOAD
        for i in range(cols):
            if types[i] == bytecolumn.ByteColumn.TYPE_CODE:
                val = np.empty(rows, dtype=np.int8)
                for j in range(rows):
                    ptr += 1
                    val[j] = int.from_bytes(buffer[ptr:ptr + 1],
                                            byteorder="big",
                                            signed=True)

                columns.append(bytecolumn.ByteColumn(names[i], val))

            elif types[i] == shortcolumn.ShortColumn.TYPE_CODE:
                val = np.empty(rows, dtype=np.int16)
                for j in range(rows):
                    ptr += 2
                    val[j] = int.from_bytes(buffer[ptr - 1:ptr + 1],
                                            byteorder="big",
                                            signed=True)

                columns.append(shortcolumn.ShortColumn(names[i], val))

            elif types[i] == intcolumn.IntColumn.TYPE_CODE:
                val = np.empty(rows, dtype=np.int32)
                for j in range(rows):
                    ptr += 4
                    val[j] = int.from_bytes(buffer[ptr - 3:ptr + 1],
                                            byteorder="big",
                                            signed=True)

                columns.append(intcolumn.IntColumn(names[i], val))

            elif types[i] == longcolumn.LongColumn.TYPE_CODE:
                val = np.empty(rows, dtype=np.int64)
                for j in range(rows):
                    ptr += 8
                    val[j] = int.from_bytes(buffer[ptr - 7:ptr + 1],
                                            byteorder="big",
                                            signed=True)

                columns.append(longcolumn.LongColumn(names[i], val))

            elif types[i] == stringcolumn.StringColumn.TYPE_CODE:
                val = np.empty(rows, dtype=object)
                for j in range(rows):
                    ptr += 1
                    c0 = ptr  # marks the first character of each string
                    while buffer[ptr] != 0x00:
                        ptr += 1

                    if (ptr - c0) == 0:
                        val[j] = stringcolumn.StringColumn.DEFAULT_VALUE
                    else:
                        val[j] = buffer[c0:ptr].decode("utf-8")

                columns.append(stringcolumn.StringColumn(names[i], val))

            elif types[i] == floatcolumn.FloatColumn.TYPE_CODE:
                val = np.empty(rows, dtype=np.float32)
                for j in range(rows):
                    ptr += 4
                    # since Python does not have float32, we need to do a
                    # conversion over numpy and str to get the same
                    # precision as the original value
                    val[j] = float(
                        str(
                            np.float32(
                                unpack(">f", buffer[ptr - 3:ptr + 1])[0])))

                columns.append(floatcolumn.FloatColumn(names[i], val))

            elif types[i] == doublecolumn.DoubleColumn.TYPE_CODE:
                val = np.empty(rows, dtype=np.float64)
                for j in range(rows):
                    ptr += 8
                    val[j] = unpack(">d", buffer[ptr - 7:ptr + 1])[0]

                columns.append(doublecolumn.DoubleColumn(names[i], val))

            elif types[i] == charcolumn.CharColumn.TYPE_CODE:
                val = np.empty(rows, dtype=np.uint8)
                for j in range(rows):
                    ptr += 1
                    c = int.from_bytes(buffer[ptr:ptr + 1],
                                       byteorder="big",
                                       signed=False)
                    val[j] = c

                columns.append(charcolumn.CharColumn(names[i], val))

            elif types[i] == booleancolumn.BooleanColumn.TYPE_CODE:
                # np.bool was removed from NumPy, the builtin is equivalent
                val = np.empty(rows, dtype=bool)
                length = bool_bytes
                ptr += 1  # focus on next readable position
                bits = BitVector(buffer[ptr:ptr + length])
                for j in range(rows):
                    val[j] = bits.get(j)

                # let the base pointer jump forward to the last read byte
                ptr += (length - 1)
                columns.append(booleancolumn.BooleanColumn(names[i], val))

            elif types[i] == binarycolumn.BinaryColumn.TYPE_CODE:
                val = np.empty(rows, dtype=object)
                for j in range(rows):
                    ptr += 1
                    length = int.from_bytes(buffer[ptr:ptr + 4],
                                            byteorder="big",
                                            signed=False)
                    ptr += 3
                    data = bytearray(length)
                    for k in range(length):
                        ptr += 1
                        data[k] = buffer[ptr]

                    val[j] = data

                columns.append(binarycolumn.BinaryColumn(names[i], val))

            else:
                raise dataframe.DataFrameException(
                    ("Unknown column with type code {}").format(types[i]))

        # END PAYLOAD
        if cols == 0:  # uninitialized instance
            df = dataframe.DefaultDataFrame()
        else:
            df = dataframe.DefaultDataFrame(columns)

    return df
def _df_from_csv_format(buffer, separator, header, types):
    """Internal function for deserializing the content of the
    specified string buffer to a DataFrame.

    If ``types`` is given, one Column per type name is created and every
    value is converted to the type of its Column. Otherwise all Columns are
    StringColumns and the literal text ``null`` is read as a null value.
    The DataFrame starts out as a DefaultDataFrame and is converted to a
    NullableDataFrame as soon as a row cannot be added to it.

    Args:
        buffer: The CSV content as an indexable sequence of lines
        separator: The separator between values. It is matched literally
        header: Whether the first line holds the column names
        types: An optional list or tuple of column type names. The
            argument itself is never modified

    Returns:
        A DataFrame holding the content of the specified buffer

    Raises:
        IOError: If the buffer is improperly formatted. The reported line
            number is 1-based
    """
    # local import: only needed to escape the separator. Without escaping,
    # separators like '|' or '.' would be interpreted as regex syntax
    import re

    # regex which splits a string by the provided
    # separator if it is not enclosed by double quotes
    pattern = regex_matcher.compile(
        re.escape(separator) + "(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)")

    def _add_row(target, row):
        """Adds the row to the DataFrame and returns the DataFrame
        to use from then on (it is converted when the row is refused)"""
        try:
            target.add_row(row)
        except dataframe.DataFrameException:  # null value in row
            target = dataframe.DataFrame.convert_to(
                target, "NullableDataFrame")
            target.add_row(row)

        return target

    df = dataframe.DefaultDataFrame()
    line_index = 0
    try:
        if types is not None:
            # work on a lower-cased copy so that the
            # caller's argument is left untouched
            types = [t.lower() for t in types]
            for type_name in types:
                df.add_column(_column_from_type(type_name))

            if header:
                h = [_normalize(elem)
                     for elem in pattern.split(buffer[line_index], 0)]
                df.set_column_names(h)
                line_index += 1

            while line_index < len(buffer):
                if not buffer[line_index]:  # skip empty lines
                    line_index += 1
                    continue

                blocks = pattern.split(
                    _process(buffer[line_index], separator), 0)
                converted = [None] * len(blocks)
                for i, block in enumerate(blocks):
                    try:
                        # more values than types raises an IndexError
                        # which is handled by the outer handler
                        converted[i] = _convert_type(types[i],
                                                     _normalize(block))
                    except (ValueError, TypeError) as ex:
                        raise IOError(("Improperly formatted CSV "
                                       "file at line: {}").format(
                                           line_index + 1)) from ex

                df = _add_row(df, converted)
                line_index += 1

        else:
            first = [_normalize(elem)
                     for elem in pattern.split(buffer[line_index], 0)]
            # the first line determines the number of columns
            for _ in first:
                df.add_column(stringcolumn.StringColumn())

            if header:
                df.set_column_names(first)
            else:
                df.add_row(first)

            line_index += 1
            while line_index < len(buffer):
                if not buffer[line_index]:  # skip empty lines
                    line_index += 1
                    continue

                blocks = pattern.split(
                    _process(buffer[line_index], separator), 0)
                for i, block in enumerate(blocks):
                    blocks[i] = None if block == "null" else _normalize(block)

                df = _add_row(df, blocks)
                line_index += 1

    except IndexError as ex:
        # line_index is 0-based, so report it the
        # same (1-based) way as conversion errors
        raise IOError(("Improperly formatted CSV "
                       "file at line: {}").format(line_index + 1)) from ex

    return df
def setitem_impl(arg, position, value):
    """Implementation of the __setitem__() function.

    Supported positions are a single column position (int or str) or a
    tuple ``(cols, rows)``. ``cols`` may be an int, str, tuple or slice.
    ``rows`` may be None, an int, a str (regex, single column only), a
    tuple or a slice. Negative int indices are counted from the end.

    Args:
        arg: The DataFrame instance on which the function was called upon
        position: The position argument passed to the function
        value: The value argument passed to the function

    Raises:
        DataFrameException: If the position or value argument is invalid
    """

    def _wrap(index, size_of, kind):
        """Maps a negative index to its non-negative equivalent.
        The size is only queried for negative indices"""
        if index >= 0:
            return index

        size = size_of()
        if abs(index) > size:
            raise dataframe.DataFrameException(
                "Invalid {} index: {}".format(kind, index))

        return index % size

    if isinstance(position, tuple):
        if len(position) > 2:
            raise dataframe.DataFrameException(
                ("Invalid position argument. Too many "
                 "positions specified: {}").format(len(position)))

        cols = position[0]
        rows = position[1]
        if isinstance(cols, (int, str)):
            # check for negative column indices
            if isinstance(cols, int):
                cols = _wrap(cols, arg.columns, "column")

            if rows is None:
                # implements df[x, :] = Column
                #        and df["x", :] = Column
                arg.set_column(cols, value)

            elif isinstance(rows, int):
                # implements df[x, y] = v
                #        and df["x", y] = v
                rows = _wrap(rows, arg.rows, "row")
                arg.get_column(cols).set_value(rows, value)

            elif isinstance(rows, str):
                # implements df[x, "y_regex"] = v | func | lamda
                #        and df["x", "y_regex"] = v | func | lamda
                arg.replace(cols, rows, replacement=value)

            elif isinstance(rows, tuple):
                col = arg.get_column(cols)
                if isinstance(value, (list, tuple)):
                    # implements df[x, (y0, y1, ..., yn)] = (v0, v1, ..., vn)
                    #        and df["x", (y0, y1, ..., yn)] = (v0, v1, ..., vn)
                    if len(rows) != len(value):
                        raise dataframe.DataFrameException((
                            "Invalid value argument. The specified "
                            "list/tuple has a size of {} but the row position "
                            "argument has a size of {}").format(
                                len(value), len(rows)))

                    for index, item in zip(rows, value):
                        col.set_value(index, item)

                else:
                    # implements df[x, (y0, y1, ..., yn)] = v
                    #        and df["x", (y0, y1, ..., yn)] = v
                    for index in rows:
                        col.set_value(index, value)

            elif isinstance(rows, slice):
                indices = range(*rows.indices(arg.rows()))
                col = arg.get_column(cols)
                if isinstance(value, (list, tuple)):
                    # implements df[x, y0:y1:y2] = (v0, v1, ..., vn)
                    #        and df["x", y0:y1:y2] = (v0, v1, ..., vn)
                    #
                    # The range knows the exact number of selected rows.
                    # Computing it as (stop - start) // step is wrong
                    # whenever the span is not a multiple of the step
                    if len(indices) != len(value):
                        raise dataframe.DataFrameException((
                            "Invalid value argument. The specified "
                            "list/tuple has a size of {} but the row position "
                            "argument has a size of {}").format(
                                len(value), len(indices)))

                    for index, item in zip(indices, value):
                        col.set_value(index, item)

                else:
                    # implements df[x, y0:y1:y2] = v
                    #        and df["x", y0:y1:y2] = v
                    for index in indices:
                        col.set_value(index, value)

            else:
                # invalid type for row position arg
                raise dataframe.DataFrameException(
                    ("Invalid row position type. "
                     "Expected int or str but found {}").format(type(rows)))

        elif isinstance(cols, (tuple, slice)):
            # prefetch the selected columns as a DataFrame
            if isinstance(cols, tuple):
                cols_selected = arg.get_columns(cols=cols)
            else:  # is slice
                cols_selected = (dataframe.NullableDataFrame(
                    arg._internal_columns()[cols])
                                 if arg.is_nullable()
                                 else dataframe.DefaultDataFrame(
                                     arg._internal_columns()[cols]))

            if isinstance(rows, int):
                rows = _wrap(rows, arg.rows, "row")
                if isinstance(value, (tuple, list)):
                    # implements df[(x0, x1, ..., xn), y] = [v0, v1, ..., vn]
                    #        and df[x0:x1:x2, y] = [v0, v1, ..., vn]
                    cols_selected.set_row(rows, value)
                else:
                    # implements df[(x0, x1, ..., xn), y] = v
                    #        and df[x0:x1:x2, y] = v
                    cols_selected.set_row(
                        rows, [value] * cols_selected.columns())

            elif isinstance(rows, tuple):
                if isinstance(value, (list, tuple)):
                    # implements
                    # df[(x0, x1, ..., xn), (y0, y1, ..., ym)] = [[ ], ..., [ ]]
                    # and df[x0:x1:x2, (y0, y1, ..., ym)] = [[ ], ..., [ ]]
                    if len(value) == 0:
                        raise dataframe.DataFrameException((
                            "Invalid value argument. The specified list/tuple "
                            "of row values is empty"))

                    if isinstance(value[0], (list, tuple)):
                        # one row of values per selected row
                        if len(rows) != len(value):
                            raise dataframe.DataFrameException((
                                "Invalid value argument. The specified list/tuple "
                                "has a size of {} but the row position argument "
                                "has a size of {}").format(
                                    len(value), len(rows)))

                        for index, row_values in zip(rows, value):
                            cols_selected.set_row(index, row_values)

                    else:
                        # the same row of values for every selected row
                        for index in rows:
                            cols_selected.set_row(index, value)

                elif isinstance(value, dataframe.DataFrame):
                    # implements
                    # df[(x0, x1, ..., xn), (y0, y1, ..., ym)] = vDataFrame
                    # and df[x0:x1:x2, (y0, y1, ..., ym)] = vDataFrame
                    if len(rows) != value.rows():
                        rmsg1 = "rows" if value.rows() != 1 else "row"
                        rmsg2 = "rows" if len(rows) != 1 else "row"
                        raise dataframe.DataFrameException(
                            ("Invalid value argument. The specified "
                             "DataFrame has {} {} but the row position "
                             "argument specified {} {}").format(
                                 value.rows(), rmsg1, len(rows), rmsg2))

                    for i, index in enumerate(rows):
                        cols_selected.set_row(index, value.get_row(i))

                else:
                    # implements df[(x0, x1, ..., xn), (y0, y1, ..., ym)] = v
                    #        and df[x0:x1:x2, (y0, y1, ..., ym)] = v
                    value = [value] * cols_selected.columns()
                    for index in rows:
                        cols_selected.set_row(index, value)

            elif isinstance(rows, slice):
                indices = range(*rows.indices(cols_selected.rows()))
                if isinstance(value, (list, tuple)):
                    # implements df[(x0, x1, ..., xn), y0:y1:y2] = [ .. ]
                    #        and df[x0:x1:x2, y0:y1:y2] = [ .. ]
                    for index in indices:
                        cols_selected.set_row(index, value)

                elif isinstance(value, dataframe.DataFrame):
                    # implements df[(x0, x1, ..., xn), y0:y1:y2] = vDataFrame
                    #        and df[x0:x1:x2, y0:y1:y2] = vDataFrame
                    for i, index in enumerate(indices):
                        cols_selected.set_row(index, value.get_row(i))

                else:
                    # implements df[(x0, x1, ..., xn), y0:y1:y2] = v
                    #        and df[x0:x1:x2, y0:y1:y2] = v
                    value = [value] * cols_selected.columns()
                    for index in indices:
                        cols_selected.set_row(index, value)

            elif isinstance(rows, str):
                raise dataframe.DataFrameException(
                    ("Invalid column position type. A replacement operation "
                     "must only specify a single column "
                     "but found {}").format(type(cols)))

            else:
                # invalid type for row position arg
                raise dataframe.DataFrameException(
                    ("Invalid row position type. "
                     "Expected int or str but found {}").format(type(rows)))

        else:
            # invalid type for column position arg
            raise dataframe.DataFrameException(
                ("Invalid column position type. "
                 "Expected int or str but found {}").format(type(cols)))

    elif isinstance(position, int):
        # check for negative column indices
        position = _wrap(position, arg.columns, "column")
        # implements df[x] = Column
        if position == arg.columns():
            # one past the last column appends the column
            arg.add_column(value)
        else:
            arg.set_column(position, value)

    elif isinstance(position, str):
        # and df["x"] = Column
        arg.set_column(position, value)

    else:
        # invalid type for entire position arg
        raise dataframe.DataFrameException(("Invalid position type. "
                                            "Expected int or str but "
                                            "found {}").format(type(position)))
def getitem_impl(arg, position):
    """Implementation of the __getitem__() function

    Supported positions are a single column position (int or str) or a
    tuple ``(cols, rows)``. ``cols`` may be an int, str, tuple or slice.
    ``rows`` may be None, an int, a str (regex, single column only), a
    tuple or a slice. Negative int indices are counted from the end.

    Args:
        arg: The DataFrame instance on which the function was called upon
        position: The position argument passed to the function

    Returns:
        The value at the specified position

    Raises:
        DataFrameException: If the position argument is invalid
    """

    def _wrap(index, size_of, kind):
        """Maps a negative index to its non-negative equivalent.
        The size is only queried for negative indices"""
        if index >= 0:
            return index

        size = size_of()
        if abs(index) > size:
            raise dataframe.DataFrameException(
                "Invalid {} index: {}".format(kind, index))

        return index % size

    def _as_dataframe(content):
        """Wraps the Column(s) in a DataFrame of the same kind as arg"""
        return (dataframe.NullableDataFrame(content)
                if arg.is_nullable()
                else dataframe.DefaultDataFrame(content))

    def _sliced_values(col, n_rows, row_slice):
        """Returns an independent copy of the sliced Column values"""
        # Bound the values by the number of rows before slicing, in the
        # same way __setitem__ bounds its slices by arg.rows().
        # NOTE(review): presumably the internal array can hold unused
        # capacity beyond the last row (until flush() is called) which
        # must not leak into the result. This is a no-op otherwise.
        #
        # numpy returns an array view when slicing
        # so we have to copy the array explicitly
        # to get an independent instance
        return col._values[:n_rows][row_slice].copy()

    if isinstance(position, tuple):
        if len(position) > 2:
            raise dataframe.DataFrameException(
                ("Invalid position argument. Too many "
                 "positions specified: {}").format(len(position)))

        cols = position[0]
        rows = position[1]
        if isinstance(cols, (int, str)):
            # check for negative column indices
            if isinstance(cols, int):
                cols = _wrap(cols, arg.columns, "column")

            if rows is None:
                # implements df[x, :] and df["x", :]
                return arg.get_columns(cols=cols)

            if isinstance(rows, int):
                # implements df[x, y] and df["x", y]
                rows = _wrap(rows, arg.rows, "row")
                return arg.get_column(cols).get_value(rows)

            if isinstance(rows, str):
                # implements df[x, "y_regex"] and df["x", "y_regex"]
                return arg.filter(cols, rows)

            if isinstance(rows, tuple):
                # implements df[x, (y0, y1, ..., yn)]
                #        and df["x", (y0, y1, ..., yn)]
                col_selected = arg.get_column(cols)
                col = column.Column.like(col_selected, length=len(rows))
                df = _as_dataframe(col)
                for i, row_index in enumerate(rows):
                    col[i] = col_selected[row_index]

                return df

            if isinstance(rows, slice):
                # implements df[x, y0:y1:y2]
                #        and df["x", y0:y1:y2]
                col_selected = arg.get_column(cols)
                col_values = _sliced_values(col_selected, arg.rows(), rows)
                col = column.Column.like(col_selected, length=0)
                col._values = col_values
                return _as_dataframe(col)

            # invalid type for row position arg
            raise dataframe.DataFrameException(
                ("Invalid row position type. "
                 "Expected int or str but found {}").format(type(rows)))

        if isinstance(cols, (tuple, slice)):
            # prefetch the selected columns as a DataFrame
            if isinstance(cols, tuple):
                cols_selected = arg.get_columns(cols=cols)
            else:  # is slice
                cols_selected = _as_dataframe(arg._internal_columns()[cols])

            if rows is None:
                # implements df[(x0, x1, ..., xn), ]
                #        and df[x0:x1:x2, ]
                return cols_selected

            if isinstance(rows, int):
                # implements df[(x0, x1, ..., xn), y]
                #        and df[x0:x1:x2, y]
                rows = _wrap(rows, arg.rows, "row")
                return cols_selected.get_row(rows)

            if isinstance(rows, tuple):
                # implements df[(x0, x1, ..., xn), (y0, y1, ..., ym)]
                #        and df[x0:x1:x2, (y0, y1, ..., ym)]
                cols = [
                    column.Column.like(col, length=len(rows))
                    for col in cols_selected._internal_columns()
                ]
                df = _as_dataframe(cols)
                for i, row_index in enumerate(rows):
                    df.set_row(i, cols_selected.get_row(row_index))

                return df

            if isinstance(rows, slice):
                # implements df[(x0, x1, ..., xn), y0:y1:y2]
                #        and df[x0:x1:x2, y0:y1:y2]
                n_rows = arg.rows()
                cols = [None] * cols_selected.columns()
                for i, col in enumerate(cols_selected._internal_columns()):
                    col_values = _sliced_values(col, n_rows, rows)
                    col_sliced = column.Column.like(
                        col, length=col_values.shape[0])

                    col_sliced._values = col_values
                    cols[i] = col_sliced

                return _as_dataframe(cols)

            if isinstance(rows, str):
                raise dataframe.DataFrameException(
                    ("Invalid column position type. A filter operation "
                     "must only specify a single column "
                     "but found {}").format(type(cols)))

            # invalid type for row position arg
            raise dataframe.DataFrameException(
                ("Invalid row position type. "
                 "Expected int or str but found {}").format(type(rows)))

        # invalid type for column position arg
        raise dataframe.DataFrameException(
            ("Invalid column position type. "
             "Expected int or str but found {}").format(type(cols)))

    if isinstance(position, int):
        # implements df[x]
        position = _wrap(position, arg.columns, "column")
        return arg.get_column(position)

    if isinstance(position, str):
        # implements df["x"]
        return arg.get_column(position)

    # invalid type for entire position arg
    raise dataframe.DataFrameException(("Invalid position type. "
                                        "Expected int or str but "
                                        "found {}").format(type(position)))
def _group_operation(df, col, operation):
    """Performs a group_by operation for the specified DataFrame and Column.

    The rows of the DataFrame are grouped by the unique values of the
    specified Column. The result has one row per group. Its first Column
    holds the group values and every other Column holds the aggregate of
    one numeric Column of the DataFrame. If the DataFrame is nullable and
    the specified Column contains null values, the null group comes last.

    Operation codes:
    * 1 = Minimum
    * 2 = Maximum
    * 3 = Average
    * 4 = Sum

    Args:
        df: The DataFrame to use for the group operation
        col: The index of the Column to use for the group operation
        operation: The operation code to use

    Returns:
        A DataFrame representing the result of the group operation

    Raises:
        DataFrameException: If the column index is invalid, if any
            Column is unlabeled or if the operation code is unknown
    """
    if df._internal_next() == -1 or col < 0 or col >= df.columns():
        raise dataframe.DataFrameException(
            "Invalid column index: {}".format(col))

    key_col = df.get_column(col)
    # every column must be labeled. The numeric ones
    # (other than the key column) are the ones to aggregate
    numeric_cols = []
    for idx in range(df.columns()):
        candidate = df.get_column(idx)
        if not candidate._name:
            raise dataframe.DataFrameException(
                "All columns must be labeled for group operations")

        if candidate is not key_col and candidate.is_numeric():
            numeric_cols.append(candidate)

    uniques = df.unique(col)
    contains_null = df.contains(col, "None") if df.is_nullable() else False
    # the null group needs an additional row
    n_groups = len(uniques) + (1 if contains_null else 0)

    cols = [column.Column.of_type(key_col.type_code(), n_groups)]
    col_names = [key_col._name]
    for source in numeric_cols:
        if operation in (3, 4):  # average or sum op
            if df.is_nullable():
                target = doublecolumn.NullableDoubleColumn(values=n_groups)
            else:
                target = doublecolumn.DoubleColumn(values=n_groups)
        else:
            target = column.Column.of_type(source.type_code(), n_groups)

        cols.append(target)
        col_names.append(source._name)

    if df.is_nullable():
        result = dataframe.NullableDataFrame(cols)
    else:
        result = dataframe.DefaultDataFrame(cols)

    result.set_column_names(col_names)

    def _aggregate(group, name):
        """Computes the requested statistic of the named
        Column within the given group of rows"""
        if operation == 1:
            return group.minimum(name)
        if operation == 2:
            return group.maximum(name)
        if operation == 3:
            return group.average(name)
        if operation == 4:
            return group.sum(name)

        raise dataframe.DataFrameException(
            "Unknown group operation: {}".format(operation))

    # pairs of (value in the result row, regex selecting the group)
    groups = [(elem, str(elem)) for elem in uniques]
    if contains_null:
        groups.append((None, "None"))

    width = len(cols)
    for index, (key, key_regex) in enumerate(groups):
        row = [None] * width
        row[0] = key
        filtered = df.filter(key_col._name, key_regex)
        for pos in range(1, width):
            row[pos] = _cast_to_numeric_type(
                cols[pos], _aggregate(filtered, col_names[pos]))

        result.set_row(index, row)

    return result
def join(df1, col1, df2, col2):
    """Joins the rows of two DataFrames on matching key column values.

    Every row of the first DataFrame is combined with every row of the
    second DataFrame that has the same value in its respective key Column.
    Both key Columns must have an identical element type and all Columns
    of both DataFrames must be labeled. The specified DataFrames may be
    of any types.

    The result holds all Columns of the first DataFrame, followed by the
    Columns of the second DataFrame. The key Column of the second
    DataFrame and every Column whose name is already present in the first
    DataFrame are left out, so in the case of duplicate Columns only the
    Column from the first DataFrame is part of the result.

    Args:
        df1: The first DataFrame to join. Must not be None
        col1: The name of the key Column in the first DataFrame.
            Must be a str
        df2: The second DataFrame to join. Must not be None
        col2: The name of the key Column in the second DataFrame.
            Must be a str

    Returns:
        A DataFrame with the joined rows of both specified DataFrames.
        It is nullable if at least one of the arguments is nullable

    Raises:
        DataFrameException: If any argument is invalid
    """
    if df1 is None or df2 is None:
        raise dataframe.DataFrameException(
            "DataFrame argument must not be None")

    if df1 is df2:
        raise dataframe.DataFrameException(
            "Join operation is self-referential")

    if not col1:
        raise dataframe.DataFrameException(
            "First column name argument must not be None or empty")

    if not col2:
        raise dataframe.DataFrameException(
            "Second column name argument must not be None or empty")

    if not df1.has_column_names():
        raise dataframe.DataFrameException("DataFrame must has column labels")

    if not df2.has_column_names():
        raise dataframe.DataFrameException(
            "DataFrame argument must have column labels")

    if not df2.has_column(col2):
        raise dataframe.DataFrameException(
            "Invalid column name for DataFrame argument: '{}'".format(col2))

    key1 = df1.get_column(col1)
    key2 = df2.get_column(col2)
    if key1.type_name() != key2.type_name():
        raise dataframe.DataFrameException(
            ("Column '{}' in DataFrame argument has "
             "a different type. "
             "Expected {} but found {}").format(
                 key2.get_name(), key1.type_name(), key2.type_name()))

    # names of all columns from df2 that must be bypassed
    # in the result because they already exist in df1
    duplicates = {
        name for name in df2.get_column_names() if df1.has_column(name)
    }
    # the key column of df2 is never part of the result
    duplicates.add(col2)

    df1.flush()
    df2.flush()
    # find the key values common to both DataFrames
    intersec = df1.get_columns(col1).intersection_rows(df2.get_columns(col2))
    use_nullable = df1.is_nullable() or df2.is_nullable()
    if use_nullable:
        result = dataframe.NullableDataFrame()
    else:
        result = dataframe.DefaultDataFrame()

    # all columns from df1
    for idx in range(df1.columns()):
        source = df1.get_column(idx)
        empty = column.Column.of_type(source.type_code())
        result.add_column(col=empty.as_nullable() if use_nullable else empty,
                          name=source.get_name())

    # all columns from df2 which are neither 'col2' nor a duplicate
    for idx in range(df2.columns()):
        source = df2.get_column(idx)
        if source.get_name() in duplicates:
            continue

        empty = column.Column.of_type(source.type_code())
        result.add_column(col=empty.as_nullable() if use_nullable else empty,
                          name=source.get_name())

    width_left = df1.columns()
    width_right = df2.columns() - len(duplicates)
    # the row list is reused for all added rows
    row = [None] * (width_left + width_right)

    # for every common key value, combine each matching
    # row of df1 with each matching row of df2
    for idx in range(intersec.rows()):
        filter_key = str(intersec.get_column(0).get_value(idx))
        left = df1.filter(col1, filter_key)
        right = df2.filter(col2, filter_key)
        # remove 'col2' and any column already existent in df1
        for name in duplicates:
            right.remove_column(name)

        for left_row in range(left.rows()):
            for right_row in range(right.rows()):
                for pos in range(width_left):
                    row[pos] = left.get_column(pos).get_value(left_row)

                for pos in range(width_right):
                    row[width_left + pos] = (
                        right.get_column(pos).get_value(right_row))

                result.add_row(row)

    result.flush()
    return result
def _copy_values_with_default(col, rows, dtype, default):
    """Copies the values of a nullable Column into a new numpy array and
    replaces every None entry by the given default value.

    Args:
        col: The nullable Column to read the values from
        rows: The number of rows to copy
        dtype: The numpy dtype of the array to create
        default: The value stored in place of None. The same object is used
            for all replaced entries, so it must be immutable

    Returns:
        A numpy array of length 'rows' which holds no None entries
    """
    vals = np.empty(rows, dtype=dtype)
    for i in range(rows):
        val = col.get_value(i)
        vals[i] = default if val is None else val

    return vals


def _convert_to_default_column(col, rows):
    """Creates the non-nullable counterpart of the given nullable Column.

    All None entries are replaced by the default value of the
    corresponding non-nullable Column type.

    Args:
        col: The nullable Column to convert
        rows: The number of rows to convert

    Returns:
        A new non-nullable Column with the same name as the given Column

    Raises:
        DataFrameException: If the type code of the Column is not recognized
    """
    tc = col.type_code()
    name = col.get_name()

    # Column types whose None entries map to one fixed, immutable default.
    # Maps the nullable type code to (constructor, numpy dtype, default)
    simple_types = {
        bytecolumn.NullableByteColumn.TYPE_CODE:
            (bytecolumn.ByteColumn, np.int8, 0),
        shortcolumn.NullableShortColumn.TYPE_CODE:
            (shortcolumn.ShortColumn, np.int16, 0),
        intcolumn.NullableIntColumn.TYPE_CODE:
            (intcolumn.IntColumn, np.int32, 0),
        longcolumn.NullableLongColumn.TYPE_CODE:
            (longcolumn.LongColumn, np.int64, 0),
        floatcolumn.NullableFloatColumn.TYPE_CODE:
            (floatcolumn.FloatColumn, np.float32, 0.0),
        doublecolumn.NullableDoubleColumn.TYPE_CODE:
            (doublecolumn.DoubleColumn, np.float64, 0.0),
        # builtin bool instead of the removed 'np.bool' alias
        booleancolumn.NullableBooleanColumn.TYPE_CODE:
            (booleancolumn.BooleanColumn, bool, False),
    }
    if tc in simple_types:
        ctor, dtype, default = simple_types[tc]
        return ctor(name, _copy_values_with_default(col, rows, dtype, default))

    if tc == stringcolumn.NullableStringColumn.TYPE_CODE:
        # builtin object instead of the removed 'np.object' alias
        vals = np.empty(rows, dtype=object)
        for i in range(rows):
            val = col.get_value(i)
            # empty strings are replaced by the default value as well
            vals[i] = (stringcolumn.StringColumn.DEFAULT_VALUE
                       if val is None or val == ""
                       else val)

        return stringcolumn.StringColumn(name, vals)

    if tc == charcolumn.NullableCharColumn.TYPE_CODE:
        # This branch reads the internal array instead of get_value() and
        # stores the code point of the default char for None entries.
        # NOTE(review): presumably '_values' holds numeric char codes
        # which fit into uint8 -- confirm against the CharColumn classes
        default_val = ord(charcolumn.CharColumn.DEFAULT_VALUE)
        vals = np.empty(rows, dtype=np.uint8)
        for i in range(rows):
            val = col._values[i]
            vals[i] = default_val if val is None else val

        return charcolumn.CharColumn(name, vals)

    if tc == binarycolumn.NullableBinaryColumn.TYPE_CODE:
        vals = np.empty(rows, dtype=object)
        for i in range(rows):
            val = col.get_value(i)
            # create a fresh bytearray for every replaced entry so that
            # the rows never share one mutable buffer
            vals[i] = bytearray.fromhex("00") if val is None else val

        return binarycolumn.BinaryColumn(name, vals)

    # undefined type
    raise dataframe.DataFrameException(
        ("Unable to convert dataframe. Unrecognized "
         "column type {}".format(type(col))))


def _convert_to_nullable_column(col, rows):
    """Creates the nullable counterpart of the given non-nullable Column.

    All values are copied unchanged.

    Args:
        col: The non-nullable Column to convert
        rows: The number of rows to convert

    Returns:
        A new nullable Column with the same name as the given Column

    Raises:
        DataFrameException: If the type code of the Column is not recognized
    """
    # builtin object instead of the removed 'np.object' alias
    vals = np.empty(rows, dtype=object)
    for i in range(rows):
        vals[i] = col.get_value(i)

    # maps the non-nullable type code to the nullable Column constructor
    nullable_types = {
        bytecolumn.ByteColumn.TYPE_CODE: bytecolumn.NullableByteColumn,
        shortcolumn.ShortColumn.TYPE_CODE: shortcolumn.NullableShortColumn,
        intcolumn.IntColumn.TYPE_CODE: intcolumn.NullableIntColumn,
        longcolumn.LongColumn.TYPE_CODE: longcolumn.NullableLongColumn,
        stringcolumn.StringColumn.TYPE_CODE:
            stringcolumn.NullableStringColumn,
        floatcolumn.FloatColumn.TYPE_CODE: floatcolumn.NullableFloatColumn,
        doublecolumn.DoubleColumn.TYPE_CODE:
            doublecolumn.NullableDoubleColumn,
        charcolumn.CharColumn.TYPE_CODE: charcolumn.NullableCharColumn,
        booleancolumn.BooleanColumn.TYPE_CODE:
            booleancolumn.NullableBooleanColumn,
        binarycolumn.BinaryColumn.TYPE_CODE:
            binarycolumn.NullableBinaryColumn,
    }
    ctor = nullable_types.get(col.type_code())
    if ctor is None:
        # undefined type
        raise dataframe.DataFrameException(
            ("Unable to convert dataframe. Unrecognized "
             "column type {}".format(type(col))))

    return ctor(col.get_name(), vals)


def convert(df, target_type):
    """Converts the given DataFrame from a DefaultDataFrame to a
    NullableDataFrame or vice versa.

    Converting a DefaultDataFrame to a NullableDataFrame will not change
    any internal values, except that now you can add/insert null values to it.
    Converting a NullableDataFrame to a DefaultDataFrame will convert all None
    occurrences to the primitive defaults according to the Column they
    are located. Empty strings in string Columns are replaced by the
    default string value as well.

    If the given DataFrame already is of the requested type, then
    a copy of it is returned.

    Args:
        df: The DataFrame instance to convert. Must not be None
        target_type: The type to convert the given DataFrame to, as a
            string. May be 'default' or 'nullable' (alternatively
            'defaultdataframe' or 'nullabledataframe'). The value is
            matched case-insensitively

    Returns:
        A DataFrame converted from the type of the argument passed to
        this method to the type specified

    Raises:
        ValueError: If any argument is None, or if the target type is
            not a string or not one of the accepted values
        DataFrameException: If a Column has an unrecognized type code
    """
    if df is None or target_type is None:
        raise ValueError("Arg must not be null")

    if not isinstance(target_type, str):
        raise ValueError("Target type argument must be specified as a string")

    target_type = target_type.lower()
    if target_type not in ("defaultdataframe", "default",
                           "nullabledataframe", "nullable"):

        raise ValueError("Unable to convert to '" + str(target_type)
                         + "'. Must be either 'default' or 'nullable'")

    # normalize the long form of the type names
    if target_type == "defaultdataframe":
        target_type = "default"
    elif target_type == "nullabledataframe":
        target_type = "nullable"

    source_type = "nullable" if df.is_nullable() else "default"
    if target_type == source_type:
        # nothing to convert
        return copy_of(df)

    rows = df.rows()
    if target_type == "default":
        # convert from Nullable to Default
        converted = dataframe.DefaultDataFrame()
        for col in df:
            converted.add_column(_convert_to_default_column(col, rows))
    else:
        # convert from Default to Nullable
        converted = dataframe.NullableDataFrame()
        for col in df:
            converted.add_column(_convert_to_nullable_column(col, rows))

    return converted
def merge(*dataframes):
    """Merges all given DataFrame instances into one DataFrame.

    All DataFrames are merged by columns. All DataFrames must have an
    equal number of rows but may be of any type. All columns are added to
    the returned DataFrame in the order of the arguments passed to this
    method. Only passing one DataFrame to this method will simply
    return that instance.

    Columns with duplicate names are included in the returned DataFrame
    and a postfix is added to each duplicate column name.

    All columns of the returned DataFrame are backed by their origin,
    which means that changes to the original DataFrame are reflected in
    the merged DataFrame and vice versa. This does not apply, however,
    if columns need to be converted to a nullable type. For example, if
    one DataFrame argument is nullable, then all columns from non-nullable
    DataFrame arguments are converted to their corresponding
    nullable equivalent.

    If columns should be independent from their origin, then simply pass
    a clone (copy) of each DataFrame argument to this method.

    Example:
        merged = DataFrame.merge(DataFrame.copy(df1), DataFrame.copy(df2))

    Args:
        dataframes: The DataFrames to be merged

    Returns:
        A DataFrame composed of all columns of the given DataFrames

    Raises:
        DataFrameException: If no argument is given, if any of several
            arguments is None, or if the arguments do not all have
            the same number of rows
    """
    if dataframes is None or len(dataframes) == 0:
        raise dataframe.DataFrameException("Arg must not be None or empty")

    if len(dataframes) == 1:
        return dataframes[0]

    # The first argument is dereferenced before the validation loop runs,
    # so it must be checked here to fail with the proper exception type
    if dataframes[0] is None:
        raise dataframe.DataFrameException(
            "DataFrame argument must not be None")

    rows = dataframes[0].rows()
    cols = 0
    has_nullable = False
    has_names = False
    for i, df in enumerate(dataframes):
        if df is None:
            raise dataframe.DataFrameException(
                "DataFrame argument must not be None")

        cols += df.columns()
        if df.rows() != rows:
            raise dataframe.DataFrameException(
                ("Size missmatch for DataFrame argument at index {}. "
                 "Expected {} rows but found {}").format(i, rows, df.rows()))

        if df.is_nullable():
            has_nullable = True

        if df.has_column_names():
            has_names = True

    for df in dataframes:
        df.flush()

    names = None
    if has_names:
        # unnamed columns are labeled with their index in the merged result
        names = [str(i) for i in range(cols)]
        k = 0
        for df in dataframes:
            for j in range(df.columns()):
                name = df.get_column(j).get_name()
                if name:
                    names[k] = name

                k += 1

        # Add a numeric postfix to every name which occurs more than once.
        # Names are compared against the current (possibly already
        # postfixed) state of the list, so the statement order matters
        for i in range(cols):
            k = 0
            already_set = False
            n = names[i]
            for j in range(cols):
                if i != j and n == names[j]:
                    if not already_set:
                        names[i] = names[i] + "_" + str(k)
                        k += 1
                        already_set = True

                    names[j] = names[j] + "_" + str(k)
                    k += 1

    columns = []
    for df in dataframes:
        for j in range(df.columns()):
            col = df.get_column(j)
            # one nullable argument makes all merged columns nullable
            columns.append(col.as_nullable() if has_nullable else col)

    if has_nullable:
        merged = dataframe.NullableDataFrame(columns)
    else:
        merged = dataframe.DefaultDataFrame(columns)

    if has_names:
        merged.set_column_names(names)

    return merged