def test_consistent_coerce_for_shapes(self):
    # we want column names to NOT be propagated
    # just because the shape matches the input shape
    df = DataFrame(np.random.randn(4, 3), columns=['A', 'B', 'C'])

    result = df.apply(lambda x: [1, 2, 3], axis=1)
    expected = Series([[1, 2, 3] for t in df.itertuples()])
    assert_series_equal(result, expected)

    result = df.apply(lambda x: [1, 2], axis=1)
    expected = Series([[1, 2] for t in df.itertuples()])
    assert_series_equal(result, expected)
class Iteration(object):

    def setup(self):
        N = 1000
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.df2 = DataFrame(np.random.randn(N * 50, 10))
        self.df3 = DataFrame(np.random.randn(N, 5 * N),
                             columns=['C' + str(c) for c in range(N * 5)])

    def time_iteritems(self):
        # (monitor no-copying behaviour)
        if hasattr(self.df, '_item_cache'):
            self.df._item_cache.clear()
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_cached(self):
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_indexing(self):
        for col in self.df3:
            self.df3[col]

    def time_itertuples(self):
        for row in self.df2.itertuples():
            pass

    def time_iterrows(self):
        for row in self.df.iterrows():
            pass
def test_with_dictlike_columns(self):
    # GH 17602
    df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
    result = df.apply(lambda x: {'s': x['a'] + x['b']}, axis=1)
    expected = Series([{'s': 3} for t in df.itertuples()])
    assert_series_equal(result, expected)

    df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'),
                pd.Timestamp('2017-05-02 00:00:00')]
    result = df.apply(lambda x: {'s': x['a'] + x['b']}, axis=1)
    assert_series_equal(result, expected)

    # compose a series
    result = (df['a'] + df['b']).apply(lambda x: {'s': x})
    expected = Series([{'s': 3}, {'s': 3}])
    assert_series_equal(result, expected)

    # GH 18775
    df = DataFrame()
    df["author"] = ["X", "Y", "Z"]
    df["publisher"] = ["BBC", "NBC", "N24"]
    df["date"] = pd.to_datetime(['17-10-2010 07:15:30',
                                 '13-05-2011 08:20:35',
                                 '15-01-2013 09:09:09'])
    result = df.apply(lambda x: {}, axis=1)
    expected = Series([{}, {}, {}])
    assert_series_equal(result, expected)
def test_infer_output_shape_columns(self):
    # GH 18573
    df = DataFrame({'number': [1., 2.],
                    'string': ['foo', 'bar'],
                    'datetime': [pd.Timestamp('2017-11-29 03:30:00'),
                                 pd.Timestamp('2017-11-29 03:45:00')]})
    result = df.apply(lambda row: (row.number, row.string), axis=1)
    expected = Series([(t.number, t.string) for t in df.itertuples()])
    assert_series_equal(result, expected)
def test_itertuples(self):
    for i, tup in enumerate(self.frame.itertuples()):
        s = Series(tup[1:])
        s.name = tup[0]
        expected = self.frame.iloc[i, :].reset_index(drop=True)
        assert_series_equal(s, expected)

    df = DataFrame({'floats': np.random.randn(5),
                    'ints': lrange(5)}, columns=['floats', 'ints'])

    for tup in df.itertuples(index=False):
        assert isinstance(tup[1], np.integer)

    df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]})
    dfaa = df[['a', 'a']]
    self.assertEqual(list(dfaa.itertuples()),
                     [(0, 1, 1), (1, 2, 2), (2, 3, 3)])
    self.assertEqual(repr(list(df.itertuples(name=None))),
                     '[(0, 1, 4), (1, 2, 5), (2, 3, 6)]')

    tup = next(df.itertuples(name='TestName'))

    # no support for field renaming in Python 2.6, regular tuples are
    # returned
    if sys.version >= LooseVersion('2.7'):
        self.assertEqual(tup._fields, ('Index', 'a', 'b'))
        self.assertEqual((tup.Index, tup.a, tup.b), tup)
        self.assertEqual(type(tup).__name__, 'TestName')

    df.columns = ['def', 'return']
    tup2 = next(df.itertuples(name='TestName'))
    self.assertEqual(tup2, (0, 1, 4))

    if sys.version >= LooseVersion('2.7'):
        self.assertEqual(tup2._fields, ('Index', '_1', '_2'))

    df3 = DataFrame(dict(('f' + str(i), [i]) for i in range(1024)))
    # will raise SyntaxError if trying to create namedtuple
    tup3 = next(df3.itertuples())
    self.assertFalse(hasattr(tup3, '_fields'))
    assert isinstance(tup3, tuple)
def test_infer_output_shape_listlike_columns(self):
    # GH 16353
    df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C'])

    result = df.apply(lambda x: [1, 2, 3], axis=1)
    expected = Series([[1, 2, 3] for t in df.itertuples()])
    assert_series_equal(result, expected)

    result = df.apply(lambda x: [1, 2], axis=1)
    expected = Series([[1, 2] for t in df.itertuples()])
    assert_series_equal(result, expected)

    # GH 17970
    df = DataFrame({"a": [1, 2, 3]}, index=list('abc'))

    result = df.apply(lambda row: np.ones(1), axis=1)
    expected = Series([np.ones(1) for t in df.itertuples()],
                      index=df.index)
    assert_series_equal(result, expected)

    result = df.apply(lambda row: np.ones(2), axis=1)
    expected = Series([np.ones(2) for t in df.itertuples()],
                      index=df.index)
    assert_series_equal(result, expected)

    # GH 17892
    df = pd.DataFrame({'a': [pd.Timestamp('2010-02-01'),
                             pd.Timestamp('2010-02-04'),
                             pd.Timestamp('2010-02-05'),
                             pd.Timestamp('2010-02-06')],
                       'b': [9, 5, 4, 3],
                       'c': [5, 3, 4, 2],
                       'd': [1, 2, 3, 4]})

    def fun(x):
        return (1, 2)

    result = df.apply(fun, axis=1)
    expected = Series([(1, 2) for t in df.itertuples()])
    assert_series_equal(result, expected)
def test_consistency_for_boxed(self, box):
    # passing an array or list should not affect the output shape
    df = DataFrame(
        np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1,
        columns=['A', 'B', 'C'])

    result = df.apply(lambda x: box([1, 2]), axis=1)
    expected = Series([box([1, 2]) for t in df.itertuples()])
    assert_series_equal(result, expected)

    result = df.apply(lambda x: box([1, 2]), axis=1, result_type='expand')
    expected = DataFrame(
        np.tile(np.arange(2, dtype='int64'), 6).reshape(6, -1) + 1)
    assert_frame_equal(result, expected)
def job_status(self, df: pd.DataFrame, job_opts: JobOpts, progressbar=True):
    """Read the status and results of each submitted job.

    Notes:
        - Multithreading does not make it faster :(.
    """
    # Refresh NFS:
    os.listdir(job_opts.working_dir.joinpath(job_opts.job_id))  # type: ignore
    results = [
        self._read_results(row, job_opts)
        for row in tqdm(df.itertuples(), total=len(df), ncols=100,
                        disable=not progressbar)
    ]
    if not results:
        return pd.DataFrame(columns=['status', 'Index'])
    else:
        return pd.DataFrame(results).set_index('Index')
def test_itertuples(self):
    for i, tup in enumerate(self.frame.itertuples()):
        s = self.klass._constructor_sliced(tup[1:])
        s.name = tup[0]
        expected = self.frame.iloc[i, :].reset_index(drop=True)
        self._assert_series_equal(s, expected)

    df = self.klass({'floats': np.random.randn(5),
                     'ints': lrange(5)}, columns=['floats', 'ints'])

    for tup in df.itertuples(index=False):
        assert isinstance(tup[1], (int, long))

    df = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]})
    dfaa = df[['a', 'a']]
    assert (list(dfaa.itertuples()) ==
            [(0, 1, 1), (1, 2, 2), (2, 3, 3)])

    # repr will be int/long on 32-bit/windows
    if not (compat.is_platform_windows() or compat.is_platform_32bit()):
        assert (repr(list(df.itertuples(name=None))) ==
                '[(0, 1, 4), (1, 2, 5), (2, 3, 6)]')

    tup = next(df.itertuples(name='TestName'))

    if sys.version >= LooseVersion('2.7'):
        assert tup._fields == ('Index', 'a', 'b')
        assert (tup.Index, tup.a, tup.b) == tup
        assert type(tup).__name__ == 'TestName'

    df.columns = ['def', 'return']
    tup2 = next(df.itertuples(name='TestName'))
    assert tup2 == (0, 1, 4)

    if sys.version >= LooseVersion('2.7'):
        assert tup2._fields == ('Index', '_1', '_2')

    df3 = DataFrame({'f' + str(i): [i] for i in range(1024)})
    # will raise SyntaxError if trying to create namedtuple
    tup3 = next(df3.itertuples())
    assert not hasattr(tup3, '_fields')
    assert isinstance(tup3, tuple)
def test_sequence_like_with_categorical(self):

    # GH 7839
    # make sure can iterate
    df = DataFrame({"id": [1, 2, 3, 4, 5, 6],
                    "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
    df['grade'] = Categorical(df['raw_grade'])

    # basic sequencing testing
    result = list(df.grade.values)
    expected = np.array(df.grade.values).tolist()
    tm.assert_almost_equal(result, expected)

    # iteration
    for t in df.itertuples(index=False):
        str(t)

    for row, s in df.iterrows():
        str(s)

    for c, col in df.iteritems():
        str(col)
def get_amazon_sample(
        df: pd.DataFrame,
        load_tiff: bool = False) -> Iterator[Tuple[np.ndarray, np.ndarray]]:
    """Generator that iterates through the labels and gets us the image
    (JPG or TIFF) and the label.

    Args:
        df (pd.DataFrame): Dataframe containing the image file names and
            their associated labels.
        load_tiff (bool, optional): Indicates whether to load the image in
            the TIFF (True) or the JPG (False) format. Defaults to False.

    Yields:
        Iterator[Tuple[np.ndarray, np.ndarray]]: Returns the current image
            data and the tags (i.e. the labels as in the original data).
    """
    for row in df.itertuples():
        if load_tiff:
            img_data = imread(
                f"{DATA_PATH}{PLANET_PATH}{TIFF_PATH}{row[1]}.tif")
        else:
            img_data = imread(
                f"{DATA_PATH}{PLANET_PATH}{IMG_PATH}{row[1]}.jpg")
        yield img_data, np.array(row[2:])
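A minimal usage sketch for the generator above. The label table is hypothetical, and it assumes the DATA_PATH/PLANET_PATH/IMG_PATH constants point at real image files:

import pandas as pd

# Hypothetical label table: the first column is the image file stem,
# the remaining columns are the per-tag indicators.
labels_df = pd.DataFrame({"image_name": ["train_0", "train_1"],
                          "clear": [1, 0],
                          "primary": [1, 1]})

for img_data, tags in get_amazon_sample(labels_df):
    print(img_data.shape, tags)  # e.g. (256, 256, 3) [1 1]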
def FromDataFrame(gr_def_df: pd.DataFrame) -> object:
    """Initialize a GroupData from a DataFrame."""
    list_of_groups = []
    for row in gr_def_df.itertuples(index=True):
        logging.debug(f"Reading group definition for {row.NAME}")

        # Check that the smarts are good.
        if not Molecule.VerifySmarts(row.SMARTS):
            raise GroupsDataError("Cannot parse SMARTS expression: %s"
                                  % row.SMARTS)

        group = Group(
            row.NAME,
            hydrogens=row.PROTONS,
            charge=row.CHARGE,
            nMg=row.MAGNESIUMS,
            smarts=row.SMARTS,
            focal_set=FocalSet(row.FOCAL_ATOMS),
        )
        list_of_groups.append(group)

    logging.debug("Done reading groups data.")
    return GroupsData(list_of_groups)
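A minimal usage sketch. The column values below are hypothetical, and Group, FocalSet, Molecule, and GroupsData are assumed importable from the surrounding project:

import pandas as pd

# Hypothetical group-definition table with the columns read above.
gr_def_df = pd.DataFrame({
    "NAME": ["hydroxyl"],
    "SMARTS": ["[OX2H]"],
    "PROTONS": [1],
    "CHARGE": [0],
    "MAGNESIUMS": [0],
    "FOCAL_ATOMS": ["0"],
})
groups_data = FromDataFrame(gr_def_df)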
def test_sequence_like_with_categorical(self):
    # GH 7839
    # make sure can iterate
    df = DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "raw_grade": ["a", "b", "b", "a", "a", "e"]
    })
    df["grade"] = Categorical(df["raw_grade"])

    # basic sequencing testing
    result = list(df.grade.values)
    expected = np.array(df.grade.values).tolist()
    tm.assert_almost_equal(result, expected)

    # iteration
    for t in df.itertuples(index=False):
        str(t)

    for row, s in df.iterrows():
        str(s)

    for c, col in df.items():
        str(col)
def check_batch(batch: pd.DataFrame, target_pc: str) -> pd.DataFrame:
    """Check a batch of addresses online. Return a DataFrame with results.

    If the target postcode is among the results, return as soon as it is
    found.
    """
    data = pd.DataFrame()
    for guid, address in batch.itertuples(index=False):
        print(f'--- {guid[:8]}… {address[-50:]:<51}: ', end='', flush=True)
        return_pc = get_postcode(address)
        print(return_pc)
        data_dict = {
            'target': target_pc,
            'guid': guid,
            'address': address,
            'pc': return_pc
        }
        data_row = pd.DataFrame(data_dict, index=[0])
        data = data.append(data_row, ignore_index=True)
        if target_pc == return_pc:
            print('--- ✓ Match!')
            break
    return data
# In[6]:

df.drop('index', axis=1, inplace=True)

# In[7]:

df['topic_id'] = 0
temp = df.urlkey.at[0]
count = 0
index = 0

# In[8]:

for line in df.itertuples():
    curr_url = line.urlkey
    if curr_url == temp:
        df['topic_id'].at[line.Index] = count
    else:
        count += 1
        df['topic_id'].at[line.Index] = count
        temp = curr_url
    #if count == 20:
    #    break

# In[9]:

df.head(50)
def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing.List[dict]]:
    messages_out = []
    users_out = []
    for message in df.itertuples():
        message_dict = {'message_id': message.id,
                        'date': message.date,
                        'from_user': None,
                        'forward_from_message_id': None,
                        'forward_from': None,
                        'forward_from_chat': None,
                        'caption': "",
                        'text': "",
                        'sticker_set_name': "",
                        'new_chat_title': "",
                        'reply_to_message': None,
                        'file_id': None,
                        'type': None,
                        }
        user_event_dict = {}
        if message.type == 'message':
            if pd.notnull(message.from_id):
                message_dict['from_user'] = message.from_id
            if pd.notnull(message.forwarded_from):
                try:
                    message_dict['forward_from'] = int(message.forwarded_from)
                except ValueError:
                    pass
            if pd.notnull(message.reply_to_message_id):
                message_dict['reply_to_message'] = message.reply_to_message_id
            if pd.notnull(message.photo):
                message_dict['type'] = 'photo'
                if message.text != "":
                    message_dict['caption'] = text_list_parser(message.text)
            elif pd.notnull(message.media_type):
                if message.text != "":
                    message_dict['caption'] = text_list_parser(message.text)
                message_dict['type'] = media_dict[message.media_type]
                if message.media_type == 'sticker' and '.webp' not in message.file:
                    message_dict['file_id'] = message.file
            elif message.text != "":
                message_dict['type'] = 'text'
                message_dict['text'] = text_list_parser(message.text)
            elif pd.notnull(message.poll):
                message_dict['type'] = 'poll'
            elif pd.notnull(message.location_information):
                message_dict['type'] = 'location'
        elif message.type == 'service':
            if pd.notnull(message.actor_id):
                message_dict['from_user'] = message.actor_id
            if message.action == 'edit_group_title':
                message_dict['type'] = 'new_chat_title'
                message_dict['new_chat_title'] = message.title
            elif message.action == 'pin_message':
                message_dict['type'] = 'pinned_message'
            elif message.action == 'edit_group_photo':
                message_dict['type'] = 'new_chat_photo'
            elif message.action == 'invite_members' or message.action == 'join_group_by_link':
                message_dict['type'] = 'new_chat_members'
                try:
                    for i in message.members:
                        users_out.append({'message_id': message.id,
                                          'user_id': i,
                                          'date': message.date,
                                          'event': 'join'})
                except TypeError:
                    user_event_dict = {'message_id': message.id,
                                       'user_id': message.actor_id,
                                       'date': message.date,
                                       'event': 'join'}
            elif message.action == 'remove_members':
                message_dict['type'] = 'left_chat_member'
                for i in message.members:
                    users_out.append({'message_id': message.id,
                                      'user_id': i,
                                      'date': message.date,
                                      'event': 'left'})
            else:
                message_dict['type'] = message.action
        messages_out.append(message_dict)
        if user_event_dict != {}:
            users_out.append(user_event_dict)
    return messages_out, users_out
def test_itertuples(self, float_frame):
    for i, tup in enumerate(float_frame.itertuples()):
        s = DataFrame._constructor_sliced(tup[1:])
        s.name = tup[0]
        expected = float_frame.iloc[i, :].reset_index(drop=True)
        tm.assert_series_equal(s, expected)

    df = DataFrame({
        "floats": np.random.randn(5),
        "ints": range(5)
    }, columns=["floats", "ints"])

    for tup in df.itertuples(index=False):
        assert isinstance(tup[1], int)

    df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]})
    dfaa = df[["a", "a"]]

    assert list(dfaa.itertuples()) == [(0, 1, 1), (1, 2, 2), (2, 3, 3)]

    # repr will be int on 32-bit/windows
    if not (compat.is_platform_windows() or compat.is_platform_32bit()):
        assert (repr(list(df.itertuples(name=None))) ==
                "[(0, 1, 4), (1, 2, 5), (2, 3, 6)]")

    tup = next(df.itertuples(name="TestName"))
    assert tup._fields == ("Index", "a", "b")
    assert (tup.Index, tup.a, tup.b) == tup
    assert type(tup).__name__ == "TestName"

    df.columns = ["def", "return"]
    tup2 = next(df.itertuples(name="TestName"))
    assert tup2 == (0, 1, 4)
    assert tup2._fields == ("Index", "_1", "_2")

    df3 = DataFrame({"f" + str(i): [i] for i in range(1024)})
    # will raise SyntaxError if trying to create namedtuple
    tup3 = next(df3.itertuples())
    assert isinstance(tup3, tuple)
    if PY37:
        assert hasattr(tup3, "_fields")
    else:
        assert not hasattr(tup3, "_fields")

    # GH 28282
    df_254_columns = DataFrame(
        [{f"foo_{i}": f"bar_{i}" for i in range(254)}])
    result_254_columns = next(df_254_columns.itertuples(index=False))
    assert isinstance(result_254_columns, tuple)
    assert hasattr(result_254_columns, "_fields")

    df_255_columns = DataFrame(
        [{f"foo_{i}": f"bar_{i}" for i in range(255)}])
    result_255_columns = next(df_255_columns.itertuples(index=False))
    assert isinstance(result_255_columns, tuple)

    # DataFrames with >= 255 columns will fall back to regular tuples on Python < 3.7
    if PY37:
        assert hasattr(result_255_columns, "_fields")
    else:
        assert not hasattr(result_255_columns, "_fields")
def remove_white_spaces(data_set: pd.DataFrame):
    blanks = []
    # unpacking assumes a two-column frame: (index, label, review text)
    for i, lb, rv in data_set.itertuples():
        if rv.isspace():
            blanks.append(i)
    data_set.drop(blanks, inplace=True)
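A minimal usage sketch with hypothetical column names matching the tuple unpacking above:

import pandas as pd

data_set = pd.DataFrame({"label": ["pos", "neg", "pos"],
                         "review": ["great", "   ", "fine"]})
remove_white_spaces(data_set)  # drops the whitespace-only row in place
print(len(data_set))           # 2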
def _get_ticker(self, processed: DataFrame) -> List:
    processed.drop(processed.head(1).index, inplace=True)
    return [x for x in processed.itertuples()]
def render_hist(df: pd.DataFrame, x: str, meta: ColumnMetadata,
                plot_width: int, plot_height: int) -> Figure:
    """
    Render a histogram
    """
    if is_categorical(meta["dtype"]):
        tooltips = [
            (x, "@x"),
            ("Count", "@count"),
            ("Label", "@label"),
        ]
    else:
        df = df.copy()
        df["repr"] = [
            f"[{row.lower_bound:.0f}~{row.upper_bound:.0f})"
            for row in df.itertuples()
        ]
        tooltips = [
            (x, "@repr"),
            ("Frequency", "@count"),
            ("Label", "@label"),
        ]
    cmapper = CategoricalColorMapper(palette=Category10[3], factors=LABELS)
    if is_categorical(df["x"].dtype):
        radius = 0.99
        x_range = FactorRange(*df["x"].unique())
    else:
        radius = df["x"][1] - df["x"][0]
        x_range = Range1d(df["x"].min() - radius, df["x"].max() + radius)
    y_range = Range1d(0, df["count"].max() * 1.05)
    fig = tweak_figure(
        Figure(
            x_range=x_range,
            y_range=y_range,
            plot_width=plot_width,
            plot_height=plot_height,
            tools="hover",
            toolbar_location=None,
            tooltips=tooltips,
        ))
    fig.vbar(
        x="x",
        width=radius,
        top="count",
        source=df,
        fill_alpha=0.3,
        color={
            "field": "label",
            "transform": cmapper
        },
        legend_field="label",
    )
    relocate_legend(fig, "right")
    return fig
def right_size_engine(context: SolidExecutionContext,
                      cpu_utilization: DataFrame,
                      mem_utilization: DataFrame,
                      disk_utilization: DataFrame,
                      compute_specs: AzureComputeSpecifications,
                      resources: DataFrame) -> Dict[str, RightSizeAnalysis]:
    cpu_utilization = cpu_utilization.set_index('resource_id')
    mem_utilization = mem_utilization.set_index('resource_id')
    disk_utilization = disk_utilization.set_index('resource_id').sort_index()

    annual_sql_2core_cost = 10.0
    annual_win_server = 10.0
    location = "eastus2"
    prices = pandas.read_json('prices202006.eastus2.json')

    def find_vm_billables(vm_size: str):
        s = compute_specs.virtual_machine_by_name(vm_size)
        bill_sku = s.capabilities.parent_size if s.capabilities.parent_size else vm_size
        bill_sku = bill_sku.replace('s_', '_').replace('_DS', '_D')
        cores = s.capabilities.d_vcpus_available
        return (bill_sku, cores)

    def find_vm_record(vm_size: str, payg: bool):
        nonlocal location
        parts = prices[(prices.armSkuName == vm_size)
                       & (prices.armRegionName == location)
                       & (prices.type == 'Consumption')
                       & (prices.serviceName == 'Virtual Machines')
                       & ~pandas.isna(prices.partNumber)
                       & ~prices.skuName.str.contains('Low Priority')]
        if parts.shape[0] != 2:
            print(f'failed to locate price for {vm_size}')
            return None
        if payg:
            record = parts[parts.productName.str.endswith('Windows')].iloc[0]
        else:
            record = parts[~parts.productName.str.endswith('Windows')].iloc[0]
        return record

    def price_sku(sku: VirtualMachineSku):
        billing_sku_name, billing_cores = find_vm_billables(sku.name)
        vm_record = find_vm_record(billing_sku_name, payg=False)
        if vm_record is None:
            return None
        if vm_record.unitOfMeasure != '1 Hour':
            # raising a bare string is invalid in Python 3; use an exception type
            raise ValueError('Unhandled UOM')
        sql_cost = max(4, billing_cores) / 2 * annual_sql_2core_cost
        win_cost = (0.5 if billing_cores <= 8 else float(
            idivceil(billing_cores, 16))) * annual_win_server
        vm_cost = vm_record.unitPrice * 24 * 365
        return sql_cost + vm_cost + win_cost

    skus = ((price_sku(s), s) for s in compute_specs.virtual_machine_skus
            if s.family.startswith('standardD') or s.family.startswith(
                'standardES') or s.family.startswith('standardMS'))
    skus_hash = {s[1].name.lower(): s for s in skus if s[0] is not None}
    new_sku_families = {
        'standardDSv2Family', 'standardDSv3Family', 'standardESv3Family',
        'standardMSFamily'
    }
    new_skus = (
        s for s in skus_hash.values()
        if s[1].family in new_sku_families and not (
            s[1].capabilities.d_vcpus_available < 4
            and s[1].capabilities.vcpus > s[1].capabilities.d_vcpus_available))
    new_skus_list = sorted(new_skus, key=lambda x: x[0])

    results: Dict[str, RightSizeAnalysis] = {}
    for resource in resources.itertuples():
        resource_id = resource.resource_id
        is_database = resource.role_code == 'DBS'
        data = select_vm_data(resource_id, cpu_utilization, mem_utilization,
                              disk_utilization)
        if data is None:
            continue
        vm_size = resource.vm_size.lower()
        sku_current_cost, sku_current = skus_hash[vm_size]
        for test_cost, test_sku in new_skus_list:
            if test_cost > sku_current_cost:
                break
            should_flex_mem_down = is_database and evaluate_low_cached_usage(
                data.disk, test_sku)
            fitness = evaluate_overall_fitness(test_sku, data,
                                               should_flex_mem_down)
            if fitness.cpu and fitness.memory and fitness.disk:
                break
            mem_equity = test_sku.capabilities.memory_gb == sku_current.capabilities.memory_gb
            if fitness.cpu and fitness.disk and mem_equity:
                break
        if resource.vm_size == test_sku.name:
            analysis = RightSizeAnalysis(test_sku.name, False,
                                         "Reduction not possible.")
        elif test_cost <= sku_current_cost:
            savings = sku_current_cost - test_cost
            analysis = RightSizeAnalysis(test_sku.name, True, None, savings)
        else:
            reason = f"{'CPU ' if not fitness.cpu else ''}{'Memory ' if not fitness.memory else ''}{'I/O ' if not fitness.disk else ''} suggests increase."
            analysis = RightSizeAnalysis(resource.vm_size, False, reason)
        results[resource_id] = analysis
    return results
def df_grans_to_score(
        df_grans: pd.DataFrame,
        parts: List[str],
        type_equality='default'
) -> music21.stream.Score:
    score = music21.stream.Score()
    for i_part, name_part in enumerate(parts):
        part = music21.stream.Part()
        part.id = name_part

        obj_first = df_grans.loc[df_grans.index[0]][0]
        offset_first = df_grans.index[0][0]

        counter = 0
        obj_last = obj_first
        offset_last = offset_first

        for row in df_grans.itertuples():
            counter = counter + 1
            if counter == 1:
                continue
            index = row[0]
            index_beat = index[0]
            obj = row[1]
            if type_equality == 'absolute':
                if not utils.b_absolutely_equal(obj, obj_last):
                    dur = music21.duration.Duration(index_beat - offset_last)
                    offset = offset_last
                    part.insert(
                        offset,
                        get_struct_score(
                            obj_last,
                            name_part,
                            dur
                        )
                    )
                    obj_last = obj
                    offset_last = index_beat
            else:
                if obj != obj_last:
                    dur = music21.duration.Duration(index_beat - offset_last)
                    offset = offset_last
                    part.insert(
                        offset,
                        get_struct_score(
                            obj_last,
                            name_part,
                            dur
                        )
                    )
                    obj_last = obj
                    offset_last = index_beat

        # insert last
        part.insert(
            offset_last,
            get_struct_score(
                obj_last,
                name_part,
                music21.duration.Duration(
                    list(df_grans.itertuples())[-1][0][0] - offset_last
                )
            )
        )
        score.insert(i_part, part)
    return score
def generate_causal_graph(place_change_events: DataFrame,
                          transition_events: DataFrame,
                          time_per_step: float):
    g = nx.DiGraph()  # Nodes are occasions and edges leading in their prehensions

    # Add the initial state for each node as an occasion with no past
    initial_occasions = place_change_events.query('tstep == 0')
    for occ in initial_occasions.itertuples():
        g.add_node(Occasion(int(occ.num), occ.name, occ.time))  # unit, state, time

    # Visit each transition and identify i) its output node and its 2 input nodes
    for trans in transition_events.itertuples():
        # row has: tstep, time, name, unit, neighbour & count
        # TODO: IS IT SAFE TO IGNORE THIS?
        # assert trans.count == 1
        # Statistically likely to happen as simulations get more complex or are
        # undersampled. Consider what to do if this occurs --Rob

        # Create new occasion in graph for this transition
        # output_state = trans.name[1]  # ab -> b
        prefix, input_state, output_state = expand_transition_name(trans.name)  # strings
        if math.isnan(trans.unit):
            print(f"*** {trans.unit} {output_state} {trans.time}")
            continue
        output_occasion = Occasion(int(trans.unit), output_state, trans.time)
        g.add_node(output_occasion)

        def choose_best_upstream_occasion(target_unit, target_state_name, source_time):
            query = f"num=={target_unit} & name=='{target_state_name}' & time<{source_time}"
            last_transition_time = place_change_events.query(query)['time'].max()
            if math.isnan(last_transition_time):
                # Try including the source time
                query = f"num=={target_unit} & name=='{target_state_name}' & time=={source_time}"
                last_transition_time = place_change_events.query(query)['time'].min()
            if math.isnan(last_transition_time):
                # Try including the step after
                query = f"num=={target_unit} & name=='{target_state_name}' & time<={source_time + time_per_step}"
                last_transition_time = place_change_events.query(query)['time'].min()
            return Occasion(target_unit, target_state_name, last_transition_time)

        # Determine local input node from same unit
        # state_name = trans.name[0]  # ab -> a
        local_input_occasion = choose_best_upstream_occasion(
            trans.unit, input_state, trans.time)
        g.add_edge(local_input_occasion, output_occasion)

        # Determine input node from neighbour
        # state_name = trans.name[1]  # ab -> b
        neighbour_input_occasion = choose_best_upstream_occasion(
            trans.neighbour, output_state, trans.time)
        g.add_edge(neighbour_input_occasion, output_occasion)

        # Determine input node from neighbour2 if set
        if not math.isnan(trans.neighbour2):
            # state_name = trans.name[1]  # ab -> b
            # neighbour2 assumed pulling state forward (like neighbour)
            neighbour2_input_occasion = choose_best_upstream_occasion(
                trans.neighbour2, output_state, trans.time)
            g.add_edge(neighbour2_input_occasion, output_occasion)

    return g
def extractor(df: pd.DataFrame) -> dict:
    """
    Extract date and email address from dataframe using regular expression

    Args:
        ``df``: dataframe
            dataframe obtained from image ocr

    Returns:
        {
            "date": {
                "text": ,
                "bbox": [x0, y0, x2, y2]
            },
            "email": {
                "text":
                "bbox": [x0, y0, x2, y2]
            }
        }
    """
    data = []
    empty_dummy = {
        "date": {
            "text": None,
            "bbox": None
        },
        "email": {
            "text": None,
            "bbox": None
        },
    }
    if df.empty:
        # list.append returns None, so append first and return the list itself
        data.append(empty_dummy)
        return data

    # possible date and email patterns
    email_pattern = r"(^[a-zA-Z0-9_.+-]+[@.][a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)"
    date_pattern = [
        r"([12]\d{3}[-/.](0[1-9]|1[0-2])[-/.](0[1-9]|[12]\d|3[01]))",
        r"(\d{2}[-/.]\d{2}[-/.]\d{4})",
    ]
    mobile_number_pattern = r'''(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\) [-\.\s]*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{3,4})'''

    for row in df.itertuples():
        # check if matches with date pattern
        for dp in date_pattern:
            date_match = re.match(dp, row.Text)
            if date_match:
                logger.debug(f'Date match : {date_match}')
                d = {
                    "date": {
                        "text": date_match[0],
                        "bbox": [row.x0, row.y0, row.x2, row.y2],
                    }
                }
                data.append(d)

        # check if matches with email pattern
        email_match = re.match(email_pattern, row.Text)
        if email_match:
            d = {
                "email": {
                    "text": email_match[0],
                    "bbox": [row.x0, row.y0, row.x2, row.y2],
                }
            }
            data.append(d)

        # check if matches number pattern
        number_match = re.findall(re.compile(mobile_number_pattern), row.Text)
        if number_match:
            d = {
                "number": {
                    "text": number_match[0],
                    "bbox": [row.x0, row.y0, row.x2, row.y2],
                }
            }
            data.append(d)

    return data
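A minimal usage sketch. The OCR table is hypothetical but uses the column names the function reads (Text, x0, y0, x2, y2), and it assumes the module-level logger used above is configured:

import pandas as pd

ocr_df = pd.DataFrame({
    "Text": ["2021-03-15", "jane.doe@example.com", "hello"],
    "x0": [10, 10, 10], "y0": [5, 25, 45],
    "x2": [90, 150, 60], "y2": [20, 40, 60],
})
matches = extractor(ocr_df)  # -> [{'date': ...}, {'email': ...}]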
def analyze_df(df: pd.DataFrame, strategy: dict):
    """Analyzes the dataframe and runs sort of a market simulation, entering and exiting positions

    Parameters
    ----------
    df, dataframe from process_dataframe after the actions have been added
    strategy: dict, contains instructions on when to enter/exit trades

    Returns
    -------
    df, returns a dataframe with the new rows processed
    """
    in_trade = False
    last_base = float(strategy["base_balance"])
    commission = float(strategy["commission"])
    last_aux = 0.0
    new_total_value = last_base

    aux_list = []
    base_list = []
    total_value_list = []
    in_trade_list = []
    fee_list = []

    for row in df.itertuples():
        close = row.close
        curr_action = row.action
        fee = 0

        if curr_action == "e" and not in_trade:
            # this means we should enter the trade
            last_aux = convert_base_to_aux(last_base, close)
            fee = calculate_fee(last_aux, commission)
            last_aux = last_aux - fee
            new_total_value = convert_aux_to_base(last_aux, close)
            # should be extremely close to 0
            last_base = round(last_base - new_total_value, 8)
            in_trade = True

        if curr_action == "x" and in_trade:
            last_base = convert_aux_to_base(last_aux, close)
            fee = calculate_fee(last_base, commission)
            last_base = last_base - fee
            last_aux = convert_base_to_aux(last_base, close)
            new_total_value = last_base
            in_trade = False

        aux_list.append(last_aux)
        base_list.append(last_base)
        total_value_list.append(new_total_value)
        in_trade_list.append(in_trade)
        fee_list.append(fee)

    if strategy.get("exit_on_end") and in_trade:
        last_base = convert_aux_to_base(last_aux, close)
        last_aux = convert_base_to_aux(last_base, close)
        new_date = df.index[-1] + timedelta(minutes=1)
        df = df.append(pd.DataFrame(index=[new_date]))
        aux_list.append(last_aux)
        base_list.append(last_base)
        total_value_list.append(new_total_value)
        in_trade_list.append(in_trade)
        fee_list.append(False)

    df["aux"] = aux_list
    df["base"] = base_list
    df["total_value"] = total_value_list
    df["in_trade"] = in_trade_list
    df["fee"] = fee_list
    return df
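A minimal usage sketch (assumes the convert_base_to_aux/convert_aux_to_base/calculate_fee helpers used above are importable from the surrounding module); the strategy keys mirror the ones the function reads:

import pandas as pd

strategy = {"base_balance": 1000.0, "commission": 0.001, "exit_on_end": True}
df = pd.DataFrame(
    {"close": [100.0, 105.0, 102.0], "action": ["e", "", "x"]},
    index=pd.date_range("2021-01-01", periods=3, freq="T"),
)
result = analyze_df(df, strategy)
print(result[["total_value", "in_trade", "fee"]])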
def df2doc_gen(self: Document, df: pd.DataFrame):
    for item in df.itertuples():
        self.__init__(*item)
        yield self
def convert_df_to_conv_ai_dict(df: pd.DataFrame,
                               personality: List[str],
                               response_columns: List[str],
                               tokenizer: Callable[[str], List[str]],
                               max_tokens: Optional[int] = None,
                               n_candidates: int = 6
                               ) -> Dict[str, List[Any]]:
    """
    Each entry in personachat is a dict with two keys personality and utterances, the dataset is a list of entries.

    personality: list of strings containing the personality of the agent
    utterances: list of dictionaries, each of which has two keys which are lists of strings.
        candidates: [next_utterance_candidate_1, ..., next_utterance_candidate_19]
            The last candidate is the ground truth response observed in the conversational data
        history: [dialog_turn_0, ... dialog_turn N], where N is an odd number since the other user starts every conversation.

    Preprocessing:
        - Spaces before periods at end of sentences
        - everything lowercase

    Process each row of a DataFrame. For each row:
    1. Grab the conversational input text
    2. Grab all the responses
    3. Create a unique data entry for each response to the question.
    4. Sample random response sentences from the dataset.
    5. Combine the random responses into a candidate list.

    Args:
        df: The counsel chat pandas dataframe
        personality: The personality we would like to use during training
        response_columns: Columns which contain valid responses to the question. For example,
            the answerText column is the complete response of the therapist
        tokenizer: The transformers library tokenizer associated with the model we will be
            training. It is used for setting the maximum sequence length
        max_tokens: The maximum number of tokens that any candidate, response, or question should be.
        n_candidates: The number of candidate phrases to include in the dataset for training.
            The last member of candidates is the ground truth response

    Returns:
        A dictionary with a train and validation key.
    """
    # Add one because the index of the dataframe is the 0th position.
    tuple_map = {
        name: index + 1
        for index, name in enumerate(df.columns.tolist())
    }

    train = []
    val = []
    # Step through every row in the dictionary
    for row in df.itertuples():
        # Get the question name and title
        # TODO:: MAKE THIS GENERAL YOU DUMB DUMB
        question_title = row[tuple_map["questionTitle"]]
        question_text = row[tuple_map["questionText"]]
        question_combined = question_title + " " + question_text

        # Step through every response column in the row
        for response_column in response_columns:
            # Get the true response
            true_response = row[tuple_map[response_column]]

            # We only want to add data if a good response exists
            if len(true_response) > 1:
                # Get candidate alternate sentences by sampling from all other questions
                candidates = sample_candidates(df, row[tuple_map["questionID"]],
                                               "questionID", "answerText",
                                               n_candidates)

                # Add the correct response to the end
                candidates.append(true_response)

                # We want to trim the size of the tokens
                if max_tokens is not None:
                    # Use the provided tokenizer to tokenize the input and truncate at max_tokens
                    question_combined = tokenizer.convert_tokens_to_string(
                        tokenizer.tokenize(question_combined)[:max_tokens])
                    candidates = [
                        tokenizer.convert_tokens_to_string(
                            tokenizer.tokenize(candidate)[:max_tokens])
                        for candidate in candidates
                    ]

                if len(candidates) != n_candidates + 1:
                    print(true_response)
                    assert False

                # Define the personality and the history
                d = {
                    "personality": personality,
                    "utterances": [{
                        "history": [question_combined],
                        "candidates": candidates
                    }]
                }
                if getattr(row, "split") == "train":
                    train.append(d)
                elif getattr(row, "split") == "val":
                    val.append(d)

    data = {"train": train, "valid": val}
    return data
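A minimal usage sketch, assuming sample_candidates is importable from the surrounding module and using a Hugging Face tokenizer; the two-row frame is hypothetical:

import pandas as pd
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
df = pd.DataFrame({
    "questionID": [1, 2],
    "questionTitle": ["Feeling anxious", "Trouble sleeping"],
    "questionText": ["How do I cope?", "What can I do?"],
    "answerText": ["Try breathing exercises.", "Keep a regular schedule."],
    "split": ["train", "val"],
})
data = convert_df_to_conv_ai_dict(
    df,
    personality=["i am a therapist ."],
    response_columns=["answerText"],
    tokenizer=tokenizer,
    max_tokens=128,
    n_candidates=1,
)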
def load_years(years):
    for year in years:
        print 'loading %s' % year
        pop = DataFrame(index=['state', 'county'])
        column = 'popestimate%s' % year

        #create a DataFrame for each series of population estimates:
        #total, male, and female
        query = ("PopulationEst%sRaw.objects.values('state')"
                 ".filter(gender='0',ethnic_origin='0')"
                 ".annotate(population=Sum(column))" % args[0])
        total_pop = eval(query)
        total_pop = DataFrame.from_records(total_pop, index=['state'])
        total_pop.columns = ['total']

        if np.isnan(total_pop.sum()):
            #No data yet for the current year, which means no data yet
            #for future years in the decade, so stop right here
            print 'No data for year %s. Stopping load.' % year
            return 0

        query = ("PopulationEst%sRaw.objects.values('state')"
                 ".filter(gender='1',ethnic_origin='0')"
                 ".annotate(population=Sum(column))" % args[0])
        male_pop = eval(query)
        male_pop = DataFrame.from_records(male_pop, index=['state'])
        male_pop.columns = ['male']

        query = ("PopulationEst%sRaw.objects.values('state')"
                 ".filter(gender='2',ethnic_origin='0')"
                 ".annotate(population=Sum(column))" % args[0])
        female_pop = eval(query)
        female_pop = DataFrame.from_records(female_pop, index=['state'])
        female_pop.columns = ['female']

        #merge the total, male, and female DataFrames into final, master df
        pop = pd.merge(pop, total_pop, how='right',
                       left_index=True, right_index=True)
        pop = pd.merge(pop, male_pop, how='right',
                       left_index=True, right_index=True)
        pop = pd.merge(pop, female_pop, how='right',
                       left_index=True, right_index=True)

        #calculate male and female percentages and merge those in, too
        male_percent = DataFrame(pop.apply(
            lambda row: row['male'] * 1.0 / row['total'] * 100, axis=1),
            columns=['male_percent'])
        pop = pd.merge(pop, male_percent, left_index=True, right_index=True)
        female_percent = DataFrame(pop.apply(
            lambda row: row['female'] * 1.0 / row['total'] * 100, axis=1),
            columns=['female_percent'])
        pop = pd.merge(pop, female_percent, left_index=True, right_index=True)

        #add DataFrame contents to database
        #DataFrame is indexed by state code
        #i.e., p[0] = state code
        for p in pop.itertuples():
            state_id = states['id'][p[0]]
            try:
                record = PopulationGenderState.objects.get(
                    state=state_id, year=year)
            except:
                record = PopulationGenderState()
            record.state_id = state_id
            record.year = year
            record.total = p[1]
            record.male = p[2]
            record.female = p[3]
            record.male_percent = str(p[4])
            record.female_percent = str(p[5])
            record.save()
        db.reset_queries()
def plot_genes(self, ax, gr: GenomeRange, ov_genes: pd.DataFrame,
               dry_run=False, fig_width=None):
    properties = self.properties
    self.__set_plot_params(gr, ov_genes)

    assert (not dry_run) or (fig_width is not None)
    if dry_run:
        self.__get_length_w(fig_width, gr.start, gr.end)
    else:
        self.__get_length_w(ax.get_figure().get_figwidth(), gr.start, gr.end)

    num_rows = properties['num_rows']
    max_num_row_local = 1
    max_ypos = 0
    # check for the number of other intervals that overlap
    #    with the given interval
    #            1         2
    #  012345678901234567890123456
    #  1=========       4=========
    #       2=========
    #         3============
    #
    # for 1 row_last_position = [9]
    # for 2 row_last_position = [9, 14]
    # for 3 row_last_position = [9, 14, 19]
    # for 4 row_last_position = [26, 14, 19]

    row_last_position = []
    # each entry in this list contains the end position
    # of genomic interval. The list index is the row
    # in which the genomic interval was plotted.
    # Any new genomic interval that wants to be plotted,
    # knows the row to use by finding the list index that
    # is larger than its start

    # check for overlapping genes including
    # label size (if plotted)

    for bed in ov_genes.itertuples():
        """
        BED12 gene format with exon locations at the end
        chrX 20850 23076 CG17636-RA 0 - 20850 23017 0 3 946,765,64, 0,1031,2162,

        BED9
        bed with rgb at end
        chr2L 0 70000 ID_5 0.26864549832 . 0 70000 51,160,44

        BED6
        bed without rgb
        chr2L 0 70000 ID_5 0.26864549832 .

        BED3
        bed with only intervals
        chr2L 0 70000
        """
        self.counter += 1

        if self.is_draw_labels:
            num_name_characters = len(bed.name) + 2  # +2 to account for an space before and after the name
            bed_extended_end = int(bed.end + (num_name_characters * self.len_w))
        else:
            bed_extended_end = (bed.end + 2 * self.small_relative)

        # get smallest free row
        if not row_last_position:
            free_row = 0
            row_last_position.append(bed_extended_end)
        else:
            # get list of rows that are less than bed.start, then take the min
            idx_list = [idx for idx, value in enumerate(row_last_position)
                        if value < bed.start]
            if len(idx_list):
                free_row = min(idx_list)
                row_last_position[free_row] = bed_extended_end
            else:
                free_row = len(row_last_position)
                row_last_position.append(bed_extended_end)

        rgb, edgecolor = self.get_rgb_and_edge_color(bed)

        ypos = self.get_y_pos(free_row)

        # do not plot if the maximum interval rows to plot is reached
        if num_rows and free_row >= float(num_rows):
            continue

        if free_row > max_num_row_local:
            max_num_row_local = free_row
        if ypos > max_ypos:
            max_ypos = ypos

        if not dry_run:
            if properties['bed_type'] == 'bed12':
                if properties['gene_style'] == 'flybase':
                    self.draw_gene_with_introns_flybase_style(ax, bed, ypos, rgb, edgecolor)
                else:
                    self.draw_gene_with_introns(ax, bed, ypos, rgb, edgecolor)
            else:
                self.draw_gene_simple(ax, bed, ypos, rgb, edgecolor)

            if self.is_draw_labels and bed.start > gr.start and bed.end < gr.end:
                ax.text(bed.end + self.small_relative,
                        ypos + (float(properties['interval_height']) / 2),
                        bed.name,
                        horizontalalignment='left',
                        verticalalignment='center',
                        fontproperties=self.fp)

    if self.counter == 0:
        log.debug(f"*Warning* No intervals were found for file {properties['file']} "
                  f"in Track \'{properties['name']}\' for the interval plotted ({gr}).\n")

    ymax = 0
    if num_rows:
        ymin = float(num_rows) * self.row_scale
        self.current_row_num = num_rows
    else:
        ymin = max_ypos + properties['interval_height']
        self.current_row_num = len(row_last_position)
    log.debug("ylim {},{}".format(ymin, ymax))

    # the axis is inverted (thus, ymax < ymin)
    if not dry_run:
        ax.set_ylim(ymin, ymax)

        if properties['display'] == 'collapsed':
            ax.set_ylim(-5, 105)

        ax.set_xlim(gr.start, gr.end)
def _build_graphviz_obj(self, show_ifnames: bool, df: pd.DataFrame):
    '''Return a graphviz object'''

    graph_attr = {'splines': 'polyline', 'layout': 'dot'}
    if show_ifnames:
        graph_attr.update({'nodesep': '1.0'})

    g = graphviz.Digraph(graph_attr=graph_attr,
                         name='Hover over arrow head for edge info')

    hostset = set()
    for hostgroup in df.groupby(by=['hopCount']) \
                       .hostname.unique().tolist():
        with g.subgraph() as s:
            s.attr(rank='same')
            for hostname in hostgroup:
                if hostname in hostset:
                    continue
                hostset.add(hostname)
                debugURL = '&'.join([
                    f'{get_base_url()}?page={quote("Path-Debug")}',
                    'lookupType=hop',
                    f'namespace={quote(df.namespace[0])}',
                    f'session={quote(get_session_id())}',
                    f'hostname={quote(hostname)}',
                ])
                tooltip, color = self._get_node_tooltip_color(hostname)
                s.node(hostname, tooltip=tooltip, color=color, URL=debugURL,
                       target='_graphviz', shape='box')

    pathid = 0
    prevrow = None
    connected_set = set()

    df['nextPathid'] = df.pathid.shift(-1).fillna('0').astype(int)
    for row in df.itertuples():
        if row.pathid != pathid:
            prevrow = row
            pathid = row.pathid
            continue
        conn = (prevrow.hostname, row.hostname)
        if conn not in connected_set:
            if row.overlay:
                path_type = 'underlay'
                color = 'purple'
            elif prevrow.isL2:
                path_type = 'l2'
                color = 'blue'
            else:
                path_type = 'l3'
                color = 'black'

            if not row.mtuMatch:
                color = 'red'
                error = 'MTU mismatch'
                err_pfx = ', '
            else:
                error = ''
                err_pfx = ''

            tdf = pd.DataFrame({
                'pathType': path_type,
                'protocol': [prevrow.protocol],
                'ipLookup': [prevrow.ipLookup],
                'vtepLookup': [prevrow.vtepLookup],
                'macLookup': [prevrow.macLookup],
                'nexthopIp': [prevrow.nexthopIp],
                'vrf': [prevrow.vrf],
                'mtu': [f'{prevrow.outMtu} -> {row.inMtu}'],
                'oif': [prevrow.oif],
                'iif': [row.iif]
            })
            rowerr = getattr(prevrow, 'error', '')
            if rowerr:
                error += f"{err_pfx}{rowerr}"
            if error:
                err_pfx = ', '
            if row.nextPathid != row.pathid:
                # We need to capture any errors on the dest node as well
                destnode_error = getattr(row, 'error', '')
                if destnode_error:
                    error += f'{err_pfx}{destnode_error}'

            if error:
                tdf['error'] = error
                color = 'red'
            tooltip = '\n'.join(
                tdf.T.to_string(justify='right').split('\n')[1:])
            debugURL = '&'.join([
                f'{get_base_url()}?page={quote("Path-Debug")}',
                'lookupType=edge',
                f'namespace={quote(row.namespace)}',
                f'session={quote(get_session_id())}',
                f'hostname={quote(prevrow.hostname)}',
                f'vrf={quote(prevrow.vrf)}',
                f'vtepLookup-{prevrow.vtepLookup}',
                f'ifhost={quote(row.hostname)}',
                f'ipLookup={quote(prevrow.ipLookup)}',
                f'oif={quote(prevrow.oif)}',
                f'macaddr={quote(prevrow.macLookup or "")}',
                f'nhip={quote(prevrow.nexthopIp)}',
            ])
            if show_ifnames:
                g.edge(
                    prevrow.hostname,
                    row.hostname,
                    color=color,
                    label=str(row.hopCount),
                    URL=debugURL,
                    edgetarget='_graphviz',
                    tooltip=tooltip,
                    taillabel=prevrow.oif,
                    headlabel=row.iif,
                    penwidth='2.0',
                )
            else:
                g.edge(prevrow.hostname, row.hostname,
                       color=color, label=str(row.hopCount),
                       URL=debugURL, edgetarget='_graphviz',
                       penwidth='2.0', tooltip=tooltip)

            connected_set.add(conn)
        prevrow = row

    df.drop(columns=['nextPathid'], inplace=True, errors='ignore')
    return g
def c_backtester(
    data: pd.DataFrame,
    sl_atr: float = 50,
    trailing_sl: bool = True,
    active_close: bool = False,
    block_stop: bool = True,
    take_profit: int = 0,
) -> pd.DataFrame:
    """
    Consecutive (event driven) backtester.

    Given df with 'signal' (-1 short, 0 out, 1 long) return df with
    'position', taking into account:
    - position can be taken on the next row after signal is generated
    - stop-loss (either relative to entry or high water mark)
    - filtered_signal if given allows for additional condition that must be
      met to initiate position. Positions are closed regardless of filter.

    Args:
        data: must have columns: 'price', 'close', 'signal', 'atr';
            'filtered_signal' is optional, if not given, filtered_signal = signal
            'price' used for transactions
            'close' to decide whether stop-loss has been triggered
        sl_atr: stop-loss distance in multiples of ATRs (if no stop loss
            required use very high number, default 50)
        trailing_sl: if True, stop-loss calculated off high watermark,
            if False, entry price
        active_close: if True close signal is the signal opposite to the
            direction of the position, if False close signal is lack of
            signal in the direction of the position
        block_stop: if True, after stop loss no position will be entered in
            the same direction as the stopped out position until opposite
            signal is generated
        take_profit: take profit distance expressed as multiple of stop-loss
            distance, 0 means no take profit

    Returns:
        DataFrame with column 'position' to be processed by another function.
    """
    for c in ['price', 'close', 'signal', 'atr']:
        assert c in data.columns, f"'{c}' is a required column"

    data = data.copy()
    # while in position maintain open price and transaction direction
    data['position'] = 0
    # flag to execute transaction at next data point
    data['mark'] = False
    # note the reason for transaction at next data point
    data['reason'] = ''
    # record transaction price
    data['t_price'] = 0
    # entry price for stop loss calculation
    data['entry'] = 0
    # for stop-loss calculation
    data['high_water'] = 0
    # whether stop loss is trailing or fixed
    trailing_sl = trailing_sl
    # restrict re-entering positions after stop loss
    # (1=long positions blocked, -1=short positions blocked)
    block = 0

    if trailing_sl:
        sl_field = 'high_water'
    else:
        sl_field = 'entry'

    if 'date' not in data.columns:
        data.reset_index(inplace=True)
    if 'filtered_signal' not in data.columns:
        data['filtered_signal'] = data['signal']

    for item in data.itertuples():
        # first row doesn't have to check for positions or execute transactions
        if not item.Index == 0:
            # starting position is the same as previous day's position
            data.loc[item.Index, 'position'] = data.loc[(item.Index - 1), 'position']
            data.loc[item.Index, 'entry'] = data.loc[(item.Index - 1), 'entry']
            # execute transactions
            if data.loc[(item.Index - 1), 'mark']:
                # close position
                if data.loc[item.Index, 'position']:
                    data.loc[item.Index, 'position'] = 0
                    data.loc[item.Index, 'entry'] = 0
                    # record transaction price
                    data.loc[item.Index, 't_price'] = item.price * \
                        np.sign(data.loc[(item.Index - 1), 'entry']) * -1
                # open position
                else:
                    data.loc[item.Index, 'position'] = data.loc[(item.Index - 1), 'signal']
                    data.loc[item.Index, 'entry'] = item.price * \
                        data.loc[(item.Index - 1), 'signal']
                    # record transaction price and high water mark
                    data.loc[item.Index, 't_price'] = item.price * \
                        data.loc[(item.Index - 1), 'signal']
                    data.loc[item.Index, 'high_water'] = data.loc[item.Index, 't_price']

        # update high water mark
        if not item.Index == 0:  # skip first row
            if data.loc[item.Index - 1, 'position'] != 0:
                data.loc[item.Index, 'high_water'] = max(
                    data.loc[item.Index - 1, 'high_water'],
                    item.close * data.loc[item.Index, 'position'])

        # check for close signal
        if active_close:
            if data.loc[item.Index, 'position'] != 0 and np.sign(item.signal) != 0:
                if np.sign(data.loc[item.Index, 'position']) != np.sign(item.signal):
                    data.loc[item.Index, 'mark'] = True
                    data.loc[item.Index, 'reason'] = 'close'

        # check for stop-loss signal
        # long positions
        if data.loc[item.Index, 'position'] > 0:
            if item.close <= (data.loc[item.Index, sl_field]
                              - (item.atr * sl_atr)):
                data.loc[item.Index, 'mark'] = True
                data.loc[item.Index, 'reason'] = 'stop-out'
                if block_stop:
                    block = 1
        # short positions
        if data.loc[item.Index, 'position'] < 0:
            if item.close >= abs(
                    (data.loc[item.Index, sl_field] - (item.atr * sl_atr))):
                data.loc[item.Index, 'mark'] = True
                data.loc[item.Index, 'reason'] = 'stop-out'
                if block_stop:
                    block = -1

        # check for take profit
        if take_profit:
            # long positions
            if data.loc[item.Index, 'position'] > 0:
                if item.close >= (data.loc[item.Index, 'entry']
                                  + (item.atr * sl_atr * take_profit)):
                    data.loc[item.Index, 'mark'] = True
                    data.loc[item.Index, 'reason'] = 'take-profit'
                    block = 1
            # short positions
            if data.loc[item.Index, 'position'] < 0:
                if item.close <= abs((data.loc[item.Index, 'entry']
                                      + (item.atr * sl_atr * take_profit))):
                    data.loc[item.Index, 'mark'] = True
                    data.loc[item.Index, 'reason'] = 'take-profit'
                    block = -1

        # check for entry signal
        if data.loc[item.Index, 'position'] == 0:
            if item.filtered_signal != 0 and item.filtered_signal != block:
                data.loc[item.Index, 'mark'] = True
                data.loc[item.Index, 'reason'] = 'entry'
                block = 0

    data.set_index('date', inplace=True, drop=True)
    return data
def read_from_db():
    #pan = c.execute('select artist,avg(score) from reviews group by artist order by avg(score) desc')
    #pan1 = c.execute('select reviewid, genre from genres group by genre ')
    #pan2 = c.execute('select artist, avg(score) from reviews group by artist having count(reviewid) > 4 order by avg(score) desc')
    #print(pan1.fetchall())
    #pan = c.execute('select * from reviews')
    #print(type(pan))
    #df = DataFrame(pan.fetchall())
    #df.columns = ['reviewid','title','artist','url','score','best_new_music','author','author_type','pub_date','pub_weekday','pub_day','pub_month','pub_year']
    #print(df.dtypes)
    '''df1 = df.groupby('artist').agg({'score':np.mean}).sort_values(by='score', ascending=False)
    df2 = df1.head(10)
    df3 = df1.tail(10)
    df4 = pd.concat([df2,df3])'''
    #data = c.fetchall()
    #print(df4)
    #plt.plot(arti)
    #print(df.head(10))
    #print(type(df))
    #print(data[0][1])
    #print("reviewid")
    #for row in c.fetchall():
    #    print(row)
    #    if(row[4]>9):
    #        print(row[4])
    #for row1 in pan2.fetchall():
    #    print(row1)
    #g = plt.bar(pan2.tail(10)['artist'], pan2.tail(10)['avg(score)'])
    #h = DataFrame(pan2.fetchall())
    #g = h.head(10)
    #plt.bar(g.artist, g.avg(score))
    #plt.xlabel('artist', fontsize=5)
    #plt.ylabel('avg(score)', fontsize=5)
    #plt.xticks(index, label, fontsize=5, rotation=30)
    #plt.show()

    q1 = c.execute('select r.pub_year,g.genre,avg(r.score) from genres g, reviews r where g.reviewid = r.reviewid group by g.genre,r.pub_year order by r.pub_year,avg(r.score) desc')
    #print(q1.fetchall())
    df1 = DataFrame(q1.fetchall())
    #print(df1)
    q2 = c.execute('select g.genre,avg(r.score) from genres g, reviews r where g.reviewid = r.reviewid group by g.genre order by avg(r.score) desc')
    df2 = DataFrame(q2.fetchall())
    df2 = df2.drop(index=4)
    l = df2[0].tolist()
    y_pos = [i for i, _ in enumerate(l)]
    m = df2[1].tolist()
    g = sns.barplot(m, l, data=df2)
    #plt.xlim(6.5, 7.5)
    r = np.linspace(0, 9, 10)
    j = 0
    for row in df2.itertuples():
        g.text(x=row[2] + 0.2, y=j, s='{:4.2f}'.format(row[2]),
               color='black', ha='center')
        j += 1
    #plt.bar(m,y_pos)
    #plt.yticks(y_pos, l)
    #plt.legend()
    #plt.xlabel('bar number')
    #plt.ylabel('bar height')
    #plt.title('Epic Graph\nAnother Line! Whoa')
    plt.tight_layout()
    plt.show()
def backtest(df: pd.DataFrame, settings: dict, price_precisions: dict = {}):
    start_quot = 1.0
    ppctminus = 1 - settings['profit_pct']
    ppctplus = 1 + settings['profit_pct']
    symbols = [c.replace('_low', '') for c in df.columns if 'low' in c]
    if not price_precisions:
        price_precisions = {s: 8 for s in symbols}
    lows = {s: f'{s}_low' for s in symbols}
    highs = {s: f'{s}_high' for s in symbols}
    means = {s: f'{s}_mean' for s in symbols}
    min_emas = {s: f'{s}_mean_min_ema' for s in symbols}
    max_emas = {s: f'{s}_mean_max_ema' for s in symbols}
    min_delay_millis = settings['min_seconds_between_same_side_entries'] * 1000
    rolling_millis = settings['max_memory_span_days'] * 24 * 60 * 60 * 1000
    s2c = {s: s.split('_')[0] for s in symbols}
    quot = symbols[0].split('_')[1]
    balance = {s2c[s]: 0.0 for s in s2c}
    balance[quot] = 1.0
    acc_equity_quot = 1.0
    acc_debt_quot = 0.0
    long_entries = {s: [] for s in symbols}
    shrt_entries = {s: [] for s in symbols}
    long_exits = {s: [] for s in symbols}
    shrt_exits = {s: [] for s in symbols}
    long_exit_price_list = {s: [] for s in symbols}
    shrt_exit_price_list = {s: [] for s in symbols}
    past_rolling_long_entries = {s: [] for s in symbols}
    past_rolling_shrt_entries = {s: [] for s in symbols}
    entry_bid = {s: round(df.iloc[0][means[s]], 8) for s in symbols}
    entry_ask = {s: round(df.iloc[0][means[s]], 8) for s in symbols}
    exit_bid = {s: entry_bid[s] for s in symbols}
    exit_ask = {s: entry_ask[s] for s in symbols}
    long_cost = {s: 0.0 for s in symbols}
    long_amount = {s: 0.0 for s in symbols}
    shrt_cost = {s: 0.0 for s in symbols}
    shrt_amount = {s: 0.0 for s in symbols}
    fee = 1 - 0.000675  # vip 1
    margin_level = 3 - 1
    balance_list = []
    do_shrt = {s for s in symbols if s2c[s] in settings['coins_shrt']}
    do_long = {s for s in symbols if s2c[s] in settings['coins_long']}
    exponent = settings['entry_vol_modifier_exponent']
    start_ts, end_ts = df.index[0], df.index[-1]
    ts_range = end_ts - start_ts
    for row in df.itertuples():
        cost = acc_equity_quot * settings['account_equity_pct_per_trade']
        min_exit_cost = cost * settings['min_big_trade_cost_multiplier']
        credit_avbl_quot = max(0.0, acc_equity_quot * margin_level - acc_debt_quot)
        age_limit = row.Index - rolling_millis
        for s in symbols:
            # rolling longs: drop entries that have aged out of the window
            long_i = get_cutoff_index(past_rolling_long_entries[s], age_limit)
            if long_i > 0:
                slc = past_rolling_long_entries[s][:long_i]
                past_rolling_long_entries[s] = past_rolling_long_entries[s][long_i:]
                long_amount[s] -= sum([e['amount'] for e in slc])
                long_cost[s] -= sum([e['amount'] * e['price'] for e in slc])
            if long_cost[s] <= 0.0 or long_amount[s] <= 0.0:
                long_cost[s] = 0.0
                long_amount[s] = 0.0
                past_rolling_long_entries[s] = []
                exit_ask[s] = getattr(row, means[s])
            else:
                exit_ask[s] = (long_cost[s] / long_amount[s]) * ppctplus
            # rolling shrts: same pruning for the short side
            shrt_i = get_cutoff_index(past_rolling_shrt_entries[s], age_limit)
            if shrt_i > 0:
                slc = past_rolling_shrt_entries[s][:shrt_i]
                past_rolling_shrt_entries[s] = past_rolling_shrt_entries[s][shrt_i:]
                shrt_cost[s] -= sum([e['amount'] * e['price'] for e in slc])
                shrt_amount[s] -= sum([e['amount'] for e in slc])
            if shrt_cost[s] <= 0.0 or shrt_amount[s] <= 0.0:
                shrt_cost[s] = 0.0
                shrt_amount[s] = 0.0
                past_rolling_shrt_entries[s] = []
                exit_bid[s] = getattr(row, means[s])
            else:
                exit_bid[s] = (shrt_cost[s] / shrt_amount[s]) * ppctminus
            if s in do_long and getattr(row, lows[s]) < entry_bid[s] and \
                    (not long_entries[s] or
                     (row.Index - long_entries[s][-1]['timestamp'] >= min_delay_millis)):
                # long buy
                long_modifier = max(
                    1.0, min(settings['min_big_trade_cost_multiplier'] - 1,
                             (exit_ask[s] / getattr(row, means[s]))**exponent))
                buy_cost = cost * long_modifier
                if balance[quot] >= buy_cost:
                    # long buy normal
                    buy_amount = (buy_cost / entry_bid[s])
                    balance[quot] -= buy_cost
                    balance[s2c[s]] += buy_amount * fee
                    long_entries[s].append({'price': entry_bid[s],
                                            'amount': buy_amount,
                                            'timestamp': row.Index})
                    past_rolling_long_entries[s].append(long_entries[s][-1])
                    long_amount[s] += buy_amount
                    long_cost[s] += buy_cost
                    exit_ask[s] = (long_cost[s] / long_amount[s]) * ppctplus
                elif credit_avbl_quot > 0.0:
                    # long buy with credit
                    quot_avbl = max(0.0, balance[quot])
                    to_borrow = min(credit_avbl_quot, buy_cost - quot_avbl)
                    credit_avbl_quot -= to_borrow
                    partial_buy_cost = quot_avbl + to_borrow
                    buy_amount = (partial_buy_cost / entry_bid[s])
                    balance[quot] -= partial_buy_cost
                    balance[s2c[s]] += buy_amount * fee
                    long_entries[s].append({'price': entry_bid[s],
                                            'amount': buy_amount,
                                            'timestamp': row.Index})
                    past_rolling_long_entries[s].append(long_entries[s][-1])
                    long_amount[s] += buy_amount
                    long_cost[s] += partial_buy_cost
                    exit_ask[s] = (long_cost[s] / long_amount[s]) * ppctplus
            if s in do_shrt and getattr(row, highs[s]) > entry_ask[s] and \
                    (not shrt_entries[s] or
                     (row.Index - shrt_entries[s][-1]['timestamp'] >= min_delay_millis)):
                # shrt sel
                shrt_modifier = max(
                    1.0, min(settings['min_big_trade_cost_multiplier'] - 1,
                             (getattr(row, means[s]) / exit_bid[s])**exponent))
                sel_cost = cost * shrt_modifier
                sel_amount = sel_cost / entry_ask[s]
                if balance[s2c[s]] >= sel_amount:
                    # shrt sel normal
                    balance[s2c[s]] -= sel_amount
                    balance[quot] += sel_cost * fee
                    shrt_entries[s].append({'price': entry_ask[s],
                                            'amount': sel_amount,
                                            'timestamp': row.Index})
                    past_rolling_shrt_entries[s].append(shrt_entries[s][-1])
                    shrt_amount[s] += sel_amount
                    shrt_cost[s] += sel_cost
                    exit_bid[s] = (shrt_cost[s] / shrt_amount[s]) * ppctminus
                elif credit_avbl_quot > 0.0:
                    # shrt sel with credit
                    coin_avbl = max(0.0, balance[s2c[s]])
                    to_borrow = min(credit_avbl_quot / entry_ask[s], sel_amount - coin_avbl)
                    credit_avbl_quot -= (to_borrow * entry_ask[s])
                    partial_sel_amount = coin_avbl + to_borrow
                    balance[s2c[s]] -= partial_sel_amount
                    partial_sel_cost = partial_sel_amount * entry_ask[s]
                    balance[quot] += partial_sel_cost * fee
                    shrt_entries[s].append({'price': entry_ask[s],
                                            'amount': partial_sel_amount,
                                            'timestamp': row.Index})
                    past_rolling_shrt_entries[s].append(shrt_entries[s][-1])
                    shrt_amount[s] += partial_sel_amount
                    shrt_cost[s] += partial_sel_cost
                    exit_bid[s] = (shrt_cost[s] / shrt_amount[s]) * ppctminus
            exit_ask[s] = round_up(exit_ask[s], price_precisions[s])
            exit_bid[s] = round_dn(exit_bid[s], price_precisions[s])
            if long_cost[s] > min_exit_cost:
                # long sel
                long_exit_price_list[s].append({'price': exit_ask[s],
                                                'timestamp': row.Index})
                if getattr(row, highs[s]) > exit_ask[s]:
                    if balance[s2c[s]] >= long_amount[s]:
                        # long sel normal
                        long_sel_amount = max(balance[s2c[s]], long_amount[s])
                        long_exits[s].append({'price': exit_ask[s],
                                              'amount': long_sel_amount,
                                              'timestamp': row.Index})
                        quot_acquired = long_sel_amount * exit_ask[s]
                        balance[s2c[s]] -= long_sel_amount
                        balance[quot] += quot_acquired * fee
                        long_amount[s] = 0.0
                        long_cost[s] = 0.0
                    else:
                        # partial long sel
                        coin_avbl = max(0.0, balance[s2c[s]])
                        to_borrow = min(credit_avbl_quot / exit_ask[s],
                                        long_amount[s] - coin_avbl)
                        partial_sel_amount = coin_avbl + to_borrow
                        if partial_sel_amount > 0.0:
                            credit_avbl_quot -= (to_borrow * exit_ask[s])
                            balance[s2c[s]] -= partial_sel_amount
                            partial_sel_cost = partial_sel_amount * exit_ask[s]
                            balance[quot] += partial_sel_cost * fee
                            long_exits[s].append({'price': exit_ask[s],
                                                  'amount': partial_sel_amount,
                                                  'timestamp': row.Index})
                            long_amount[s] -= partial_sel_amount
                            long_cost[s] -= partial_sel_cost
                            if long_amount[s] <= 0.0 or long_cost[s] <= 0.0:
                                long_amount[s] = 0.0
                                long_cost[s] = 0.0
                                past_rolling_long_entries[s] = []
            if shrt_cost[s] > min_exit_cost:
                shrt_exit_price_list[s].append({'price': exit_bid[s],
                                                'timestamp': row.Index})
                if getattr(row, lows[s]) < exit_bid[s]:
                    # shrt buy
                    shrt_buy_cost = shrt_amount[s] * exit_bid[s]
                    if balance[quot] >= shrt_buy_cost:
                        # shrt buy normal
                        shrt_buy_cost = max(shrt_buy_cost,
                                            min(balance[quot],
                                                -balance[s2c[s]] * exit_bid[s]))
                        shrt_buy_amount = shrt_buy_cost / exit_bid[s]
                        shrt_exits[s].append({'price': exit_bid[s],
                                              'amount': shrt_buy_amount,
                                              'timestamp': row.Index})
                        balance[quot] -= shrt_buy_cost
                        balance[s2c[s]] += shrt_buy_amount * fee
                        shrt_amount[s] = 0.0
                        shrt_cost[s] = 0.0
                    else:
                        # partial shrt buy
                        quot_avbl = max(0.0, balance[quot])
                        to_borrow = min(credit_avbl_quot, shrt_buy_cost - quot_avbl)
                        partial_sel_cost = quot_avbl + to_borrow
                        if partial_sel_cost > 0.0:
                            coin_acquired = partial_sel_cost / exit_bid[s]
                            shrt_exits[s].append({'price': exit_bid[s],
                                                  'amount': coin_acquired,
                                                  'timestamp': row.Index})
                            credit_avbl_quot -= to_borrow
                            balance[quot] -= partial_sel_cost
                            balance[s2c[s]] += coin_acquired * fee
                            shrt_amount[s] -= coin_acquired
                            shrt_cost[s] -= partial_sel_cost
                            if shrt_amount[s] <= 0.0 or shrt_cost[s] <= 0.0:
                                shrt_amount[s] = 0.0
                                shrt_cost[s] = 0.0
                                past_rolling_shrt_entries[s] = []
            entry_bid[s] = round_dn(min(getattr(row, means[s]), getattr(row, min_emas[s])),
                                    price_precisions[s])
            entry_ask[s] = round_up(max(getattr(row, means[s]), getattr(row, max_emas[s])),
                                    price_precisions[s])
        acc_equity_quot = \
            balance[quot] + sum([balance[s2c[s]] * getattr(row, means[s]) for s in symbols])
        balance_list.append({**{s2c[s]: balance[s2c[s]] * getattr(row, means[s])
                                for s in symbols},
                             **{'acc_equity_quot': acc_equity_quot,
                                'timestamp': row.Index,
                                quot: balance[quot]}})
        acc_debt_quot = -sum([balance_list[-1][c] for c in balance
                              if balance_list[-1][c] < 0.0])
        balance_list[-1]['acc_debt_quot'] = acc_debt_quot
        if row.Index % 86400000 == 0 or row.Index >= end_ts:
            n_millis = row.Index - start_ts
            line = f'\r{(n_millis / ts_range) * 100:.2f}% '
            line += f'acc equity quot: {acc_equity_quot:.6f} '
            n_days = n_millis / 1000 / 60 / 60 / 24
            if n_days > 0.0:  # guard against division by zero on the very first print
                line += f'avg daily gain: {acc_equity_quot**(1 / n_days):.6f} '
            line += f'cost {cost:.8f} '
            sys.stdout.write(line)
            sys.stdout.flush()
    return balance_list, long_entries, shrt_entries, long_exits, shrt_exits, \
        long_exit_price_list, shrt_exit_price_list
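# `get_cutoff_index`, `round_up`, and `round_dn` are referenced above but not
# defined in this snippet. The following is a minimal sketch of what they are
# assumed to do, inferred purely from the call sites in `backtest`; the real
# helpers may differ.
from math import ceil, floor


def get_cutoff_index(entries: list, age_limit: float) -> int:
    # index of the first entry whose 'timestamp' is >= age_limit; entries are
    # appended in chronological order, so everything before it has aged out
    for i, e in enumerate(entries):
        if e['timestamp'] >= age_limit:
            return i
    return len(entries)


def round_up(n: float, d: int = 0) -> float:
    # round n up to d decimal places
    return ceil(n * 10**d) / 10**d


def round_dn(n: float, d: int = 0) -> float:
    # round n down to d decimal places
    return floor(n * 10**d) / 10**d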
def get_matches(item, config_sheet: pd.DataFrame):
    return (Match(rule, rule.input_re.match(item))
            for rule in config_sheet.itertuples())
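# `Match` and the `input_re` column come from elsewhere; a minimal usage
# sketch under the assumption that `Match` simply pairs a rule row with its
# regex match and that `input_re` holds pre-compiled patterns:
import re
from collections import namedtuple

import pandas as pd

Match = namedtuple('Match', ['rule', 'match'])  # hypothetical definition

config_sheet = pd.DataFrame({
    'input_re': [re.compile(r'\d+'), re.compile(r'[a-z]+')],
    'output': ['number', 'word'],
})
# keep only the rules whose pattern actually matched the item
hits = [m for m in get_matches('abc', config_sheet) if m.match is not None]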
def df_rich_text(df: pd.DataFrame) -> str:
    return rich_text_table(
        df.itertuples(index=False),
        column_headers=df.columns,
        row_headers=df.index,
    )
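# `rich_text_table` is imported from elsewhere; this is only a stand-in with
# the same signature, assuming it renders a plain tab-separated table. The
# real renderer presumably produces richer markup.
def rich_text_table(rows, column_headers, row_headers) -> str:
    lines = ['\t' + '\t'.join(str(c) for c in column_headers)]
    for header, row in zip(row_headers, rows):
        lines.append(str(header) + '\t' + '\t'.join(str(v) for v in row))
    return '\n'.join(lines)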
from xml.dom import minidom

# df: coefficient table with feature names (plus an 'intercept' row) as the
# index and the fitted coefficients in the first column -- defined elsewhere

# pmml = minidom.parse('single_audit_logreg.pmml')
pmml = minidom.parse('lr.pmml')
root = pmml.documentElement
model = root.getElementsByTagName('GeneralRegressionModel')[0]
nameNodeList = model.getElementsByTagName("RegressionTable")
# drop the existing NumericPredictor elements before rewriting them
for node in nameNodeList[0].getElementsByTagName("NumericPredictor"):
    node.parentNode.removeChild(node)

# copy the intercept onto the RegressionTable attribute
for row in df.itertuples():
    # print(type(row.Index))
    # print(row._1)
    if row.Index == 'intercept':
        nameNodeList[0].setAttribute('intercept', str(row._1))
        break

# append one NumericPredictor element per remaining coefficient
for index, row in df.iterrows():
    if index == 'intercept':
        continue
    newEle = pmml.createElement("NumericPredictor")
    newEle.setAttribute("name", index)
    newEle.setAttribute("exponent", "1")
    newEle.setAttribute("coefficient", str(row[0]))
    nameNodeList[0].appendChild(newEle)
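# A sketch of the coefficient frame the script above expects. The feature
# names here are illustrative; the single column is deliberately left with
# its default integer name so that `row._1` (itertuples) and `row[0]`
# (iterrows) resolve as the script assumes.
import pandas as pd

df = pd.DataFrame(
    [[0.42], [-1.3], [2.7]],
    index=['age', 'income', 'intercept'],
)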
class Iteration:
    # mem_itertuples_* benchmarks are slow
    timeout = 120

    def setup(self):
        N = 1000
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.df2 = DataFrame(np.random.randn(N * 50, 10))
        self.df3 = DataFrame(np.random.randn(N, 5 * N),
                             columns=['C' + str(c) for c in range(N * 5)])
        self.df4 = DataFrame(np.random.randn(N * 1000, 10))

    def time_iteritems(self):
        # (monitor no-copying behaviour)
        if hasattr(self.df, '_item_cache'):
            self.df._item_cache.clear()
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_cached(self):
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_indexing(self):
        for col in self.df3:
            self.df3[col]

    def time_itertuples_start(self):
        self.df4.itertuples()

    def time_itertuples_read_first(self):
        next(self.df4.itertuples())

    def time_itertuples(self):
        for row in self.df4.itertuples():
            pass

    def time_itertuples_to_list(self):
        list(self.df4.itertuples())

    def mem_itertuples_start(self):
        return self.df4.itertuples()

    def peakmem_itertuples_start(self):
        self.df4.itertuples()

    def mem_itertuples_read_first(self):
        return next(self.df4.itertuples())

    def peakmem_itertuples(self):
        for row in self.df4.itertuples():
            pass

    def mem_itertuples_to_list(self):
        return list(self.df4.itertuples())

    def peakmem_itertuples_to_list(self):
        list(self.df4.itertuples())

    def time_itertuples_raw_start(self):
        self.df4.itertuples(index=False, name=None)

    def time_itertuples_raw_read_first(self):
        next(self.df4.itertuples(index=False, name=None))

    def time_itertuples_raw_tuples(self):
        for row in self.df4.itertuples(index=False, name=None):
            pass

    def time_itertuples_raw_tuples_to_list(self):
        list(self.df4.itertuples(index=False, name=None))

    def mem_itertuples_raw_start(self):
        return self.df4.itertuples(index=False, name=None)

    def peakmem_itertuples_raw_start(self):
        self.df4.itertuples(index=False, name=None)

    def peakmem_itertuples_raw_read_first(self):
        next(self.df4.itertuples(index=False, name=None))

    def peakmem_itertuples_raw(self):
        for row in self.df4.itertuples(index=False, name=None):
            pass

    def mem_itertuples_raw_to_list(self):
        return list(self.df4.itertuples(index=False, name=None))

    def peakmem_itertuples_raw_to_list(self):
        list(self.df4.itertuples(index=False, name=None))

    def time_iterrows(self):
        for row in self.df.iterrows():
            pass
def plot_genes(self, ax, gr: GenomeRange, ov_genes: pd.DataFrame):
    properties = self.properties
    # bed_type
    self.properties['bed_type'] = properties['bed_type'] or self.infer_bed_type(ov_genes)
    # as min_score and max_score change every plot, we compute them for every plot
    min_score, max_score = properties['min_score'], properties['max_score']
    has_score_col = properties['bed_type'] in ('bed6', 'bed9', 'bed12')
    if has_score_col and len(ov_genes):
        # fall back to the data range when no explicit limits were given
        min_score = min_score if min_score != 'inf' else ov_genes['score'].min()
        max_score = max_score if max_score != '-inf' else ov_genes['score'].max()
    min_score, max_score = float(min_score), float(max_score)

    # set colormap
    if self.colormap is not None:
        norm = matplotlib.colors.Normalize(vmin=min_score, vmax=max_score)
        cmap = matplotlib.cm.get_cmap(properties['color'])
        self.colormap = matplotlib.cm.ScalarMappable(norm=norm, cmap=cmap)

    if properties['color'] == 'bed_rgb' and properties['bed_type'] not in ['bed12', 'bed9']:
        log.warning(
            "*WARNING* Color set to 'bed_rgb', but bed file does not have the rgb field. "
            "The color has been set to {}".format(self.COLOR))
        self.properties['color'] = self.COLOR
        self.colormap = None

    self.counter = 0
    self.small_relative = 0.004 * (gr.end - gr.start)
    self.get_length_w(ax.get_figure().get_figwidth(), gr.start, gr.end)
    # turn labels off when too many intervals are visible.
    if properties['labels'] == 'on' and len(ov_genes) > 60:
        self.is_draw_labels = False

    num_rows = properties['num_rows']
    max_num_row_local = 1
    max_ypos = 0
    # check for the number of other intervals that overlap
    # with the given interval
    #            1         2
    #  012345678901234567890123456
    #  1=========       4=========
    #       2=========
    #         3============
    #
    # for 1 row_last_position = [9]
    # for 2 row_last_position = [9, 14]
    # for 3 row_last_position = [9, 14, 19]
    # for 4 row_last_position = [26, 14, 19]
    row_last_position = []  # each entry in this list contains the end position
    # of a genomic interval. The list index is the row
    # in which the genomic interval was plotted.
    # Any new genomic interval that wants to be plotted
    # knows the row to use by finding the list index whose
    # value is smaller than its start.

    # check for overlapping genes including
    # label size (if plotted)

    for bed in ov_genes.itertuples():
        """
        BED12 gene format with exon locations at the end
        chrX  20850  23076  CG17636-RA  0  -  20850  23017  0  3  946,765,64,  0,1031,2162,

        BED9
        bed with rgb at end
        chr2L  0  70000  ID_5  0.26864549832  .  0  70000  51,160,44

        BED6
        bed without rgb
        chr2L  0  70000  ID_5  0.26864549832  .

        BED3
        bed with only intervals
        chr2L  0  70000
        """
        self.counter += 1

        if self.is_draw_labels:
            # +2 to account for a space before and after the name
            num_name_characters = len(bed.name) + 2
            bed_extended_end = int(bed.end + (num_name_characters * self.len_w))
        else:
            bed_extended_end = (bed.end + 2 * self.small_relative)

        # get smallest free row
        if not row_last_position:
            free_row = 0
            row_last_position.append(bed_extended_end)
        else:
            # get list of rows that end before bed.start, then take the min
            idx_list = [idx for idx, value in enumerate(row_last_position)
                        if value < bed.start]
            if len(idx_list):
                free_row = min(idx_list)
                row_last_position[free_row] = bed_extended_end
            else:
                free_row = len(row_last_position)
                row_last_position.append(bed_extended_end)

        rgb, edgecolor = self.get_rgb_and_edge_color(bed)

        ypos = self.get_y_pos(free_row)

        # do not plot if the maximum number of interval rows is reached
        if num_rows and free_row >= float(num_rows):
            continue

        if free_row > max_num_row_local:
            max_num_row_local = free_row
        if ypos > max_ypos:
            max_ypos = ypos

        if properties['bed_type'] == 'bed12':
            if properties['gene_style'] == 'flybase':
                self.draw_gene_with_introns_flybase_style(ax, bed, ypos, rgb, edgecolor)
            else:
                self.draw_gene_with_introns(ax, bed, ypos, rgb, edgecolor)
        else:
            self.draw_gene_simple(ax, bed, ypos, rgb, edgecolor)

        if self.is_draw_labels and bed.start > gr.start and bed.end < gr.end:
            ax.text(bed.end + self.small_relative,
                    ypos + (float(properties['interval_height']) / 2),
                    bed.name,
                    horizontalalignment='left',
                    verticalalignment='center',
                    fontproperties=self.fp)

    if self.counter == 0:
        log.warning(
            f"*Warning* No intervals were found for file {properties['file']} "
            f"in Track '{properties['name']}' for the interval plotted ({gr}).\n")

    ymax = 0
    if num_rows:
        ymin = float(num_rows) * self.row_scale
    else:
        ymin = max_ypos + properties['interval_height']

    log.debug("ylim {},{}".format(ymin, ymax))
    # the axis is inverted (thus, ymax < ymin)
    ax.set_ylim(ymin, ymax)

    if properties['display'] == 'domain':
        ax.set_ylim(-5, 205)
    elif properties['display'] == 'collapsed':
        ax.set_ylim(-5, 105)

    ax.set_xlim(gr.start, gr.end)
def pandas_to_ped(ped_pd: pd.DataFrame):
    """
    Creates a Hail Pedigree object from trios stored as rows in a DataFrame.
    Input columns should contain 'fam_id', 's', 'is_female', 'pat_id', 'mat_id'.

    :param DataFrame ped_pd: Input DataFrame
    :return: Pedigree
    :rtype: Pedigree
    """
    return hl.Pedigree([hl.Trio(s=row.s,
                                is_female=row.is_female,
                                pat_id=row.pat_id,
                                mat_id=row.mat_id,
                                fam_id=str(row.fam_id))
                        for row in ped_pd.itertuples()])
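# A minimal usage sketch with an illustrative one-trio table; the sample and
# family identifiers below are made up:
import pandas as pd

ped_pd = pd.DataFrame({
    'fam_id': [1],
    's': ['child1'],
    'is_female': [True],
    'pat_id': ['dad1'],
    'mat_id': ['mom1'],
})
pedigree = pandas_to_ped(ped_pd)  # hl.Pedigree with a single hl.Trio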
class Iteration:
    # mem_itertuples_* benchmarks are slow
    timeout = 120

    def setup(self):
        N = 1000
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.df2 = DataFrame(np.random.randn(N * 50, 10))
        self.df3 = DataFrame(np.random.randn(N, 5 * N),
                             columns=["C" + str(c) for c in range(N * 5)])
        self.df4 = DataFrame(np.random.randn(N * 1000, 10))

    def time_items(self):
        # (monitor no-copying behaviour)
        if hasattr(self.df, "_item_cache"):
            self.df._item_cache.clear()
        for name, col in self.df.items():
            pass

    def time_items_cached(self):
        for name, col in self.df.items():
            pass

    def time_iteritems_indexing(self):
        for col in self.df3:
            self.df3[col]

    def time_itertuples_start(self):
        self.df4.itertuples()

    def time_itertuples_read_first(self):
        next(self.df4.itertuples())

    def time_itertuples(self):
        for row in self.df4.itertuples():
            pass

    def time_itertuples_to_list(self):
        list(self.df4.itertuples())

    def mem_itertuples_start(self):
        return self.df4.itertuples()

    def peakmem_itertuples_start(self):
        self.df4.itertuples()

    def mem_itertuples_read_first(self):
        return next(self.df4.itertuples())

    def peakmem_itertuples(self):
        for row in self.df4.itertuples():
            pass

    def mem_itertuples_to_list(self):
        return list(self.df4.itertuples())

    def peakmem_itertuples_to_list(self):
        list(self.df4.itertuples())

    def time_itertuples_raw_start(self):
        self.df4.itertuples(index=False, name=None)

    def time_itertuples_raw_read_first(self):
        next(self.df4.itertuples(index=False, name=None))

    def time_itertuples_raw_tuples(self):
        for row in self.df4.itertuples(index=False, name=None):
            pass

    def time_itertuples_raw_tuples_to_list(self):
        list(self.df4.itertuples(index=False, name=None))

    def mem_itertuples_raw_start(self):
        return self.df4.itertuples(index=False, name=None)

    def peakmem_itertuples_raw_start(self):
        self.df4.itertuples(index=False, name=None)

    def peakmem_itertuples_raw_read_first(self):
        next(self.df4.itertuples(index=False, name=None))

    def peakmem_itertuples_raw(self):
        for row in self.df4.itertuples(index=False, name=None):
            pass

    def mem_itertuples_raw_to_list(self):
        return list(self.df4.itertuples(index=False, name=None))

    def peakmem_itertuples_raw_to_list(self):
        list(self.df4.itertuples(index=False, name=None))

    def time_iterrows(self):
        for row in self.df.iterrows():
            pass
def _write_dataframe_kafka(
    self,
    feature_group: FeatureGroup,
    dataframe: pd.DataFrame,
    offline_write_options: dict,
):
    # setup kafka producer
    producer = Producer(self._get_kafka_config(offline_write_options))
    # setup complex feature writers
    feature_writers = {
        feature: self._get_encoder_func(
            feature_group._get_feature_avro_schema(feature)
        )
        for feature in feature_group.get_complex_features()
    }
    # setup row writer function
    writer = self._get_encoder_func(feature_group._get_encoded_avro_schema())

    def acked(err, msg):
        if err is not None:
            print("Failed to deliver message: %s: %s" % (str(msg), str(err)))

    # loop over rows
    for r in dataframe.itertuples(index=False):
        # itertuples returns a Python namedtuple; to be able to serialize it
        # with avro, create a copy of the row by converting it to a dict,
        # which preserves the datatypes
        row = r._asdict()

        # transform special data types
        # here we might need to handle also timestamps and other complex types
        # possible optimization: make it based on type so we don't need to
        # loop over all keys in the row
        for k in row.keys():
            # for avro to be able to serialize them, they need to be python data types
            if isinstance(row[k], np.ndarray):
                row[k] = row[k].tolist()
            if isinstance(row[k], pd.Timestamp):
                row[k] = row[k].to_pydatetime()

        # encode complex features
        row = self._encode_complex_features(feature_writers, row)

        # encode feature row
        with BytesIO() as outf:
            writer(row, outf)
            encoded_row = outf.getvalue()

        # assemble key
        key = "".join([str(row[pk]) for pk in sorted(feature_group.primary_key)])

        while True:
            # if a BufferError is thrown, we can be sure the message hasn't
            # been sent, so we retry
            try:
                # produce
                producer.produce(
                    topic=feature_group._online_topic_name,
                    key=key,
                    value=encoded_row,
                    callback=acked
                    if offline_write_options.get("debug_kafka", False)
                    else None,
                )
                # trigger internal callbacks to empty the op queue
                producer.poll(0)
                break
            except BufferError as e:
                if offline_write_options.get("debug_kafka", False):
                    print("Caught: {}".format(e))
                # back off for 1 second
                producer.poll(1)

    # make sure the producer blocks until everything is delivered
    producer.flush()

    # start backfilling job
    job_name = "{fg_name}_{version}_offline_fg_backfill".format(
        fg_name=feature_group.name, version=feature_group.version
    )
    job = self._job_api.get(job_name)

    if offline_write_options is not None and offline_write_options.get(
        "start_offline_backfill", True
    ):
        print("Launching offline feature group backfill job...")
        self._job_api.launch(job_name)
        print(
            "Backfill Job started successfully, you can follow the progress at \n{}".format(
                self._get_job_url(job.href)
            )
        )
        self._wait_for_job(job, offline_write_options)

    return job
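# Illustrative options dict for the method above; the two keys shown are the
# ones the method actually reads (any engine/feature-group setup is assumed
# to exist elsewhere):
options = {
    "debug_kafka": False,           # log delivery callbacks and buffer errors
    "start_offline_backfill": True,  # launch the backfill job after producing
}
# job = engine._write_dataframe_kafka(fg, df, options)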
def load_guess_score_map(guess_df: pd.DataFrame) -> defaultdict:
    guess_score_map = defaultdict(dict)
    for row in guess_df.itertuples():
        guess_score_map[row.guesser][(row.qnum, row.sentence, row.token, row.guess)] = row.score
    return guess_score_map
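# A small usage sketch with an illustrative frame; the column names are the
# ones the loader above expects, the values are made up:
from collections import defaultdict

import pandas as pd

guess_df = pd.DataFrame({
    'guesser': ['dan', 'dan'],
    'qnum': [0, 0],
    'sentence': [0, 1],
    'token': [5, 12],
    'guess': ['Paris', 'Lyon'],
    'score': [0.9, 0.4],
})
scores = load_guess_score_map(guess_df)
assert scores['dan'][(0, 0, 5, 'Paris')] == 0.9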