Example #1
    def test_consistent_coerce_for_shapes(self):
        # we want column names to NOT be propagated
        # just because the shape matches the input shape
        df = DataFrame(np.random.randn(4, 3), columns=['A', 'B', 'C'])

        result = df.apply(lambda x: [1, 2, 3], axis=1)
        expected = Series([[1, 2, 3] for t in df.itertuples()])
        assert_series_equal(result, expected)

        result = df.apply(lambda x: [1, 2], axis=1)
        expected = Series([[1, 2] for t in df.itertuples()])
        assert_series_equal(result, expected)
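A minimal standalone sketch (not part of the test suite) of the behaviour asserted above: with axis=1, a list-returning function yields a Series of lists by default, and the input's column names are not reused even when result_type='expand' is requested.

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(4, 3), columns=['A', 'B', 'C'])

default = df.apply(lambda x: [1, 2, 3], axis=1)
print(type(default).__name__)       # Series -- one list per row

expanded = df.apply(lambda x: [1, 2, 3], axis=1, result_type='expand')
print(expanded.columns.tolist())    # [0, 1, 2] -- positional, not ['A', 'B', 'C']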
Example #2
class Iteration(object):

    def setup(self):
        N = 1000
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.df2 = DataFrame(np.random.randn(N * 50, 10))
        self.df3 = DataFrame(np.random.randn(N, 5 * N),
                             columns=['C' + str(c) for c in range(N * 5)])

    def time_iteritems(self):
        # (monitor no-copying behaviour)
        if hasattr(self.df, '_item_cache'):
            self.df._item_cache.clear()
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_cached(self):
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_indexing(self):
        for col in self.df3:
            self.df3[col]

    def time_itertuples(self):
        for row in self.df2.itertuples():
            pass

    def time_iterrows(self):
        for row in self.df.iterrows():
            pass
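The class above follows the asv (airspeed velocity) convention: setup builds the fixtures and each time_* method is timed by the framework. A rough standalone timing sketch of the same comparison, using only the standard-library timer (assumed, not part of any benchmark suite):

import time

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(50_000, 10))

start = time.perf_counter()
for row in df.itertuples():
    pass
print('itertuples:', time.perf_counter() - start)

start = time.perf_counter()
for _, row in df.iterrows():
    pass
print('iterrows:  ', time.perf_counter() - start)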
Example #3
    def test_with_dictlike_columns(self):
        # GH 17602
        df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
        result = df.apply(lambda x: {'s': x['a'] + x['b']},
                          axis=1)
        expected = Series([{'s': 3} for t in df.itertuples()])
        assert_series_equal(result, expected)

        df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'),
                    pd.Timestamp('2017-05-02 00:00:00')]
        result = df.apply(lambda x: {'s': x['a'] + x['b']},
                          axis=1)
        assert_series_equal(result, expected)

        # compose a series
        result = (df['a'] + df['b']).apply(lambda x: {'s': x})
        expected = Series([{'s': 3}, {'s': 3}])
        assert_series_equal(result, expected)

        # GH 18775
        df = DataFrame()
        df["author"] = ["X", "Y", "Z"]
        df["publisher"] = ["BBC", "NBC", "N24"]
        df["date"] = pd.to_datetime(['17-10-2010 07:15:30',
                                     '13-05-2011 08:20:35',
                                     '15-01-2013 09:09:09'])
        result = df.apply(lambda x: {}, axis=1)
        expected = Series([{}, {}, {}])
        assert_series_equal(result, expected)
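Sketch of the behaviour the test covers (outside the test suite): a dict returned from an axis=1 apply stays a Series of dicts; expanding it into columns can be done explicitly afterwards.

import pandas as pd

df = pd.DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
as_series = df.apply(lambda x: {'s': x['a'] + x['b']}, axis=1)
print(as_series.iloc[0])                     # {'s': 3}
expanded = pd.DataFrame(as_series.tolist())  # DataFrame with a single 's' column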
Example #4
    def test_infer_output_shape_columns(self):
        # GH 18573

        df = DataFrame({'number': [1., 2.],
                        'string': ['foo', 'bar'],
                        'datetime': [pd.Timestamp('2017-11-29 03:30:00'),
                                     pd.Timestamp('2017-11-29 03:45:00')]})
        result = df.apply(lambda row: (row.number, row.string), axis=1)
        expected = Series([(t.number, t.string) for t in df.itertuples()])
        assert_series_equal(result, expected)
Example #5
    def test_itertuples(self):
        for i, tup in enumerate(self.frame.itertuples()):
            s = Series(tup[1:])
            s.name = tup[0]
            expected = self.frame.iloc[i, :].reset_index(drop=True)
            assert_series_equal(s, expected)

        df = DataFrame({'floats': np.random.randn(5),
                        'ints': lrange(5)}, columns=['floats', 'ints'])

        for tup in df.itertuples(index=False):
            assert isinstance(tup[1], np.integer)

        df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]})
        dfaa = df[['a', 'a']]
        self.assertEqual(list(dfaa.itertuples()), [
                         (0, 1, 1), (1, 2, 2), (2, 3, 3)])

        self.assertEqual(repr(list(df.itertuples(name=None))),
                         '[(0, 1, 4), (1, 2, 5), (2, 3, 6)]')

        tup = next(df.itertuples(name='TestName'))

        # no support for field renaming in Python 2.6, regular tuples are
        # returned
        if sys.version >= LooseVersion('2.7'):
            self.assertEqual(tup._fields, ('Index', 'a', 'b'))
            self.assertEqual((tup.Index, tup.a, tup.b), tup)
            self.assertEqual(type(tup).__name__, 'TestName')

        df.columns = ['def', 'return']
        tup2 = next(df.itertuples(name='TestName'))
        self.assertEqual(tup2, (0, 1, 4))

        if sys.version >= LooseVersion('2.7'):
            self.assertEqual(tup2._fields, ('Index', '_1', '_2'))

        df3 = DataFrame(dict(('f' + str(i), [i]) for i in range(1024)))
        # will raise SyntaxError if trying to create namedtuple
        tup3 = next(df3.itertuples())
        self.assertFalse(hasattr(tup3, '_fields'))
        assert isinstance(tup3, tuple)
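A standalone sketch (outside the test suite) of the itertuples() behaviours this test exercises: namedtuple output, the index/name options, and the renaming of column names that are not valid Python identifiers.

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})

print(next(df.itertuples()))              # Pandas(Index=0, a=1, b=3)
print(next(df.itertuples(index=False)))   # Pandas(a=1, b=3)
print(next(df.itertuples(name=None)))     # (0, 1, 3) -- plain tuple

df.columns = ['def', 'return']            # invalid identifiers get renamed
print(next(df.itertuples())._fields)      # ('Index', '_1', '_2')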
Example #6
    def test_infer_output_shape_listlike_columns(self):
        # GH 16353

        df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C'])

        result = df.apply(lambda x: [1, 2, 3], axis=1)
        expected = Series([[1, 2, 3] for t in df.itertuples()])
        assert_series_equal(result, expected)

        result = df.apply(lambda x: [1, 2], axis=1)
        expected = Series([[1, 2] for t in df.itertuples()])
        assert_series_equal(result, expected)

        # GH 17970
        df = DataFrame({"a": [1, 2, 3]}, index=list('abc'))

        result = df.apply(lambda row: np.ones(1), axis=1)
        expected = Series([np.ones(1) for t in df.itertuples()],
                          index=df.index)
        assert_series_equal(result, expected)

        result = df.apply(lambda row: np.ones(2), axis=1)
        expected = Series([np.ones(2) for t in df.itertuples()],
                          index=df.index)
        assert_series_equal(result, expected)

        # GH 17892
        df = pd.DataFrame({'a': [pd.Timestamp('2010-02-01'),
                                 pd.Timestamp('2010-02-04'),
                                 pd.Timestamp('2010-02-05'),
                                 pd.Timestamp('2010-02-06')],
                           'b': [9, 5, 4, 3],
                           'c': [5, 3, 4, 2],
                           'd': [1, 2, 3, 4]})

        def fun(x):
            return (1, 2)

        result = df.apply(fun, axis=1)
        expected = Series([(1, 2) for t in df.itertuples()])
        assert_series_equal(result, expected)
Example #7
    def test_consistency_for_boxed(self, box):
        # passing an array or list should not affect the output shape
        df = DataFrame(
            np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1,
            columns=['A', 'B', 'C'])

        result = df.apply(lambda x: box([1, 2]), axis=1)
        expected = Series([box([1, 2]) for t in df.itertuples()])
        assert_series_equal(result, expected)

        result = df.apply(lambda x: box([1, 2]), axis=1, result_type='expand')
        expected = DataFrame(
            np.tile(np.arange(2, dtype='int64'), 6).reshape(6, -1) + 1)
        assert_frame_equal(result, expected)
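The result_type options contrasted above, as a quick sketch (assumed, outside the test suite); 'broadcast' only works here because the returned list length matches the number of columns.

import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(6).reshape(3, 2), columns=['A', 'B'])

df.apply(lambda x: [1, 2], axis=1)                           # Series of lists
df.apply(lambda x: [1, 2], axis=1, result_type='expand')     # columns 0 and 1
df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')  # keeps columns A and B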
Example #8
    def job_status(self, df: pd.DataFrame, job_opts: JobOpts, progressbar=True):
        """Read the status and results of each submitted job.

        Notes:
            - Multithreading does not make it faster :(.
        """
        # Refresh NFS:
        os.listdir(job_opts.working_dir.joinpath(job_opts.job_id))  # type: ignore
        results = [
            self._read_results(row, job_opts)
            for row in tqdm(df.itertuples(), total=len(df), ncols=100, disable=not progressbar)
        ]
        if not results:
            return pd.DataFrame(columns=['status', 'Index'])
        else:
            return pd.DataFrame(results).set_index('Index')
Example #9
    def test_itertuples(self):
        for i, tup in enumerate(self.frame.itertuples()):
            s = self.klass._constructor_sliced(tup[1:])
            s.name = tup[0]
            expected = self.frame.iloc[i, :].reset_index(drop=True)
            self._assert_series_equal(s, expected)

        df = self.klass({'floats': np.random.randn(5),
                         'ints': lrange(5)}, columns=['floats', 'ints'])

        for tup in df.itertuples(index=False):
            assert isinstance(tup[1], (int, long))

        df = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]})
        dfaa = df[['a', 'a']]

        assert (list(dfaa.itertuples()) ==
                [(0, 1, 1), (1, 2, 2), (2, 3, 3)])

        # repr will be int/long on 32-bit/windows
        if not (compat.is_platform_windows() or compat.is_platform_32bit()):
            assert (repr(list(df.itertuples(name=None))) ==
                    '[(0, 1, 4), (1, 2, 5), (2, 3, 6)]')

        tup = next(df.itertuples(name='TestName'))

        if sys.version >= LooseVersion('2.7'):
            assert tup._fields == ('Index', 'a', 'b')
            assert (tup.Index, tup.a, tup.b) == tup
            assert type(tup).__name__ == 'TestName'

        df.columns = ['def', 'return']
        tup2 = next(df.itertuples(name='TestName'))
        assert tup2 == (0, 1, 4)

        if sys.version >= LooseVersion('2.7'):
            assert tup2._fields == ('Index', '_1', '_2')

        df3 = DataFrame({'f' + str(i): [i] for i in range(1024)})
        # will raise SyntaxError if trying to create namedtuple
        tup3 = next(df3.itertuples())
        assert not hasattr(tup3, '_fields')
        assert isinstance(tup3, tuple)
Example #10
    def test_sequence_like_with_categorical(self):

        # GH 7839
        # make sure can iterate
        df = DataFrame({"id": [1, 2, 3, 4, 5, 6],
                        "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
        df['grade'] = Categorical(df['raw_grade'])

        # basic sequencing testing
        result = list(df.grade.values)
        expected = np.array(df.grade.values).tolist()
        tm.assert_almost_equal(result, expected)

        # iteration
        for t in df.itertuples(index=False):
            str(t)

        for row, s in df.iterrows():
            str(s)

        for c, col in df.iteritems():
            str(col)
Example #11
def get_amazon_sample(
        df: pd.DataFrame,
        load_tiff: bool = False) -> Iterator[Tuple[np.ndarray, np.ndarray]]:
    """Generator that iterates through the labels and gets us the image (JPG or TIFF)
    and the label.

    Args:
        df (pd.DataFrame): Dataframe containing the image file names and their
        associated labels.
        load_tiff (bool, optional): Indicates whether to load the image in the TIFF (True)
        or the JPG (False) format. Defaults to False.

    Yields:
        Iterator[Tuple[np.ndarray, np.ndarray]]: Returns the current image data and the
        tags (i.e. the labels as in the original data).
    """
    for row in df.itertuples():
        if load_tiff:
            img_data = imread(
                f"{DATA_PATH}{PLANET_PATH}{TIFF_PATH}{row[1]}.tif")
        else:
            img_data = imread(
                f"{DATA_PATH}{PLANET_PATH}{IMG_PATH}{row[1]}.jpg")
        yield img_data, np.array(row[2:])
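Hypothetical usage sketch: labels_df is an assumed DataFrame whose first column holds the image file name and whose remaining columns hold the tags, matching what the generator reads via row[1] and row[2:].

# labels_df is a hypothetical example input, not defined in the original code
for img, tags in get_amazon_sample(labels_df, load_tiff=False):
    print(img.shape, tags)
    break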
Example #12
    def FromDataFrame(gr_def_df: pd.DataFrame) -> object:
        """Initialize a GroupData from a DataFrame."""
        list_of_groups = []
        for row in gr_def_df.itertuples(index=True):
            logging.debug(f"Reading group definition for {row.NAME}")

            # Check that the smarts are good.
            if not Molecule.VerifySmarts(row.SMARTS):
                raise GroupsDataError("Cannot parse SMARTS expression: %s" %
                                      row.SMARTS)

            group = Group(
                row.NAME,
                hydrogens=row.PROTONS,
                charge=row.CHARGE,
                nMg=row.MAGNESIUMS,
                smarts=row.SMARTS,
                focal_set=FocalSet(row.FOCAL_ATOMS),
            )
            list_of_groups.append(group)

        logging.debug("Done reading groups data.")

        return GroupsData(list_of_groups)
Example #13
    def test_sequence_like_with_categorical(self):

        # GH 7839
        # make sure can iterate
        df = DataFrame({
            "id": [1, 2, 3, 4, 5, 6],
            "raw_grade": ["a", "b", "b", "a", "a", "e"]
        })
        df["grade"] = Categorical(df["raw_grade"])

        # basic sequencing testing
        result = list(df.grade.values)
        expected = np.array(df.grade.values).tolist()
        tm.assert_almost_equal(result, expected)

        # iteration
        for t in df.itertuples(index=False):
            str(t)

        for row, s in df.iterrows():
            str(s)

        for c, col in df.items():
            str(col)
Example #14
def check_batch(batch: pd.DataFrame, target_pc: str) -> pd.DataFrame:
    """Check a batch of addresses online.

    Return a DataFrame with results.
    If the target postcode appears in the results, return as soon as it is
    found.
    """
    data = pd.DataFrame()
    for guid, address in batch.itertuples(index=False):
        print(f'--- {guid[:8]}… {address[-50:]:<51}: ', end='', flush=True)
        return_pc = get_postcode(address)
        print(return_pc)
        data_dict = {
            'target': target_pc,
            'guid': guid,
            'address': address,
            'pc': return_pc
        }
        data_row = pd.DataFrame(data_dict, index=[0])
        data = data.append(data_row, ignore_index=True)
        if target_pc == return_pc:
            print('--- ✓ Match!')
            break
    return data
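DataFrame.append, used above, was deprecated in pandas 1.4 and removed in 2.0; a sketch of the same loop that collects plain dicts and builds the frame once at the end (reuses the get_postcode helper from the original):

rows = []
for guid, address in batch.itertuples(index=False):
    return_pc = get_postcode(address)
    rows.append({'target': target_pc, 'guid': guid,
                 'address': address, 'pc': return_pc})
    if target_pc == return_pc:
        break
data = pd.DataFrame(rows)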
Example #15
# In[6]:

df.drop('index', axis=1, inplace=True)

# In[7]:

df['topic_id'] = 0

temp = df.urlkey.at[0]
count = 0
index = 0

# In[8]:

for line in df.itertuples():

    curr_url = line.urlkey
    if curr_url == temp:
        df.at[line.Index, 'topic_id'] = count

    else:
        count += 1
        df.at[line.Index, 'topic_id'] = count
        temp = curr_url
    #if count == 20:
    #    break

# In[9]:

df.head(50)
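The loop above can usually be replaced by a vectorised equivalent (a sketch, assuming consecutive rows sharing a urlkey belong to the same topic):

df['topic_id'] = (df['urlkey'] != df['urlkey'].shift()).cumsum() - 1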
Example #16
def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing.List[dict]]:
    messages_out = []
    users_out = []
    for message in df.itertuples():
        message_dict = {'message_id': message.id,
                        'date': message.date,
                        'from_user': None,
                        'forward_from_message_id': None,
                        'forward_from': None,
                        'forward_from_chat': None,
                        'caption': "",
                        'text': "",
                        'sticker_set_name': "",
                        'new_chat_title': "",
                        'reply_to_message': None,
                        'file_id': None,
                        'type': None,
                        }
        user_event_dict = {}
        if message.type == 'message':
            if pd.notnull(message.from_id):
                message_dict['from_user'] = message.from_id

            if pd.notnull(message.forwarded_from):
                try:
                    message_dict['forward_from'] = int(message.forwarded_from)
                except ValueError:
                    pass

            if pd.notnull(message.reply_to_message_id):
                message_dict['reply_to_message'] = message.reply_to_message_id

            if pd.notnull(message.photo):
                message_dict['type'] = 'photo'
                if message.text != "":
                    message_dict['caption'] = text_list_parser(message.text)
            elif pd.notnull(message.media_type):
                if message.text != "":
                    message_dict['caption'] = text_list_parser(message.text)
                message_dict['type'] = media_dict[message.media_type]
                if message.media_type == 'sticker' and '.webp' not in message.file:
                    message_dict['file_id'] = message.file
            elif message.text != "":
                message_dict['type'] = 'text'
                message_dict['text'] = text_list_parser(message.text)
            elif pd.notnull(message.poll):
                message_dict['type'] = 'poll'
            elif pd.notnull(message.location_information):
                message_dict['type'] = 'location'

        elif message.type == 'service':
            if pd.notnull(message.actor_id):
                message_dict['from_user'] = message.actor_id

            if message.action == 'edit_group_title':
                message_dict['type'] = 'new_chat_title'
                message_dict['new_chat_title'] = message.title
            elif message.action == 'pin_message':
                message_dict['type'] = 'pinned_message'
            elif message.action == 'edit_group_photo':
                message_dict['type'] = 'new_chat_photo'
            elif message.action == 'invite_members' or message.action == 'join_group_by_link':
                message_dict['type'] = 'new_chat_members'
                try:
                    for i in message.members:
                        users_out.append({'message_id': message.id,
                                          'user_id': i,
                                          'date': message.date,
                                          'event': 'join'})
                except TypeError:
                    user_event_dict = {'message_id': message.id,
                                       'user_id': message.actor_id,
                                       'date': message.date,
                                       'event': 'join'}
            elif message.action == 'remove_members':
                message_dict['type'] = 'left_chat_member'
                for i in message.members:
                    users_out.append({'message_id': message.id,
                                      'user_id': i,
                                      'date': message.date,
                                      'event': 'left'})
            else:
                message_dict['type'] = message.action
        messages_out.append(message_dict)
        if user_event_dict != {}:
            users_out.append(user_event_dict)
    return messages_out, users_out
Example #17
    def test_itertuples(self, float_frame):
        for i, tup in enumerate(float_frame.itertuples()):
            s = DataFrame._constructor_sliced(tup[1:])
            s.name = tup[0]
            expected = float_frame.iloc[i, :].reset_index(drop=True)
            tm.assert_series_equal(s, expected)

        df = DataFrame({
            "floats": np.random.randn(5),
            "ints": range(5)
        },
                       columns=["floats", "ints"])

        for tup in df.itertuples(index=False):
            assert isinstance(tup[1], int)

        df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]})
        dfaa = df[["a", "a"]]

        assert list(dfaa.itertuples()) == [(0, 1, 1), (1, 2, 2), (2, 3, 3)]

        # repr will be int on 32-bit/windows
        if not (compat.is_platform_windows() or compat.is_platform_32bit()):
            assert (repr(list(df.itertuples(
                name=None))) == "[(0, 1, 4), (1, 2, 5), (2, 3, 6)]")

        tup = next(df.itertuples(name="TestName"))
        assert tup._fields == ("Index", "a", "b")
        assert (tup.Index, tup.a, tup.b) == tup
        assert type(tup).__name__ == "TestName"

        df.columns = ["def", "return"]
        tup2 = next(df.itertuples(name="TestName"))
        assert tup2 == (0, 1, 4)
        assert tup2._fields == ("Index", "_1", "_2")

        df3 = DataFrame({"f" + str(i): [i] for i in range(1024)})
        # will raise SyntaxError if trying to create namedtuple
        tup3 = next(df3.itertuples())
        assert isinstance(tup3, tuple)
        if PY37:
            assert hasattr(tup3, "_fields")
        else:
            assert not hasattr(tup3, "_fields")

        # GH 28282
        df_254_columns = DataFrame(
            [{f"foo_{i}": f"bar_{i}"
              for i in range(254)}])
        result_254_columns = next(df_254_columns.itertuples(index=False))
        assert isinstance(result_254_columns, tuple)
        assert hasattr(result_254_columns, "_fields")

        df_255_columns = DataFrame(
            [{f"foo_{i}": f"bar_{i}"
              for i in range(255)}])
        result_255_columns = next(df_255_columns.itertuples(index=False))
        assert isinstance(result_255_columns, tuple)

        # Dataframes with >=255 columns will fallback to regular tuples on python < 3.7
        if PY37:
            assert hasattr(result_255_columns, "_fields")
        else:
            assert not hasattr(result_255_columns, "_fields")
Example #18
def remove_white_spaces(data_set: pd.DataFrame):
    blanks = []
    for i, lb, rv in data_set.itertuples():
        if rv.isspace():
            blanks.append(i)
    data_set.drop(blanks, inplace=True)
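A sketch of the same cleanup that does not depend on unpacking exactly two columns; 'review' is a hypothetical column name standing in for whichever column holds the text.

# 'review' is an assumed column name, not taken from the original snippet
blanks = [row.Index for row in data_set.itertuples() if row.review.isspace()]
data_set.drop(blanks, inplace=True)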
Example #19
    def _get_ticker(self, processed: DataFrame) -> List:
        processed.drop(processed.head(1).index, inplace=True)

        return [x for x in processed.itertuples()]
Example #20
def render_hist(df: pd.DataFrame, x: str, meta: ColumnMetadata,
                plot_width: int, plot_height: int) -> Figure:
    """
    Render a histogram
    """
    if is_categorical(meta["dtype"]):
        tooltips = [
            (x, "@x"),
            ("Count", "@count"),
            ("Label", "@label"),
        ]
    else:
        df = df.copy()
        df["repr"] = [
            f"[{row.lower_bound:.0f}~{row.upper_bound:.0f})"
            for row in df.itertuples()
        ]

        tooltips = [
            (x, "@repr"),
            ("Frequency", "@count"),
            ("Label", "@label"),
        ]

    cmapper = CategoricalColorMapper(palette=Category10[3], factors=LABELS)

    if is_categorical(df["x"].dtype):
        radius = 0.99
        x_range = FactorRange(*df["x"].unique())
    else:
        radius = df["x"][1] - df["x"][0]
        x_range = Range1d(df["x"].min() - radius, df["x"].max() + radius)

    y_range = Range1d(0, df["count"].max() * 1.05)

    fig = tweak_figure(
        Figure(
            x_range=x_range,
            y_range=y_range,
            plot_width=plot_width,
            plot_height=plot_height,
            tools="hover",
            toolbar_location=None,
            tooltips=tooltips,
        ))

    fig.vbar(
        x="x",
        width=radius,
        top="count",
        source=df,
        fill_alpha=0.3,
        color={
            "field": "label",
            "transform": cmapper
        },
        legend_field="label",
    )

    relocate_legend(fig, "right")

    return fig
Example #21
def right_size_engine(context: SolidExecutionContext,
                      cpu_utilization: DataFrame, mem_utilization: DataFrame,
                      disk_utilization: DataFrame,
                      compute_specs: AzureComputeSpecifications,
                      resources: DataFrame) -> Dict[str, RightSizeAnalysis]:
    cpu_utilization = cpu_utilization.set_index('resource_id')
    mem_utilization = mem_utilization.set_index('resource_id')
    disk_utilization = disk_utilization.set_index('resource_id').sort_index()

    annual_sql_2core_cost = 10.0
    annual_win_server = 10.0
    location = "eastus2"
    prices = pandas.read_json('prices202006.eastus2.json')

    def find_vm_billables(vm_size: str):
        s = compute_specs.virtual_machine_by_name(vm_size)
        bill_sku = s.capabilities.parent_size if s.capabilities.parent_size else vm_size
        bill_sku = bill_sku.replace('s_', '_').replace('_DS', '_D')
        cores = s.capabilities.d_vcpus_available
        return (bill_sku, cores)

    def find_vm_record(vm_size: str, payg: bool):
        nonlocal location
        parts = prices[(prices.armSkuName == vm_size)
                       & (prices.armRegionName == location) &
                       (prices.type == 'Consumption') &
                       (prices.serviceName == 'Virtual Machines')
                       & ~pandas.isna(prices.partNumber)
                       & ~prices.skuName.str.contains('Low Priority')]
        if parts.shape[0] != 2:
            print(f'failed to locate price for {vm_size}')
            return None
        if payg:
            record = parts[parts.productName.str.endswith('Windows')].iloc[0]
        else:
            record = parts[~parts.productName.str.endswith('Windows')].iloc[0]
        return record

    def price_sku(sku: VirtualMachineSku):
        billing_sku_name, billing_cores = find_vm_billables(sku.name)
        vm_record = find_vm_record(billing_sku_name, payg=False)
        if vm_record is None:
            return None
        if vm_record.unitOfMeasure != '1 Hour':
            raise ValueError('Unhandled UOM')
        sql_cost = max(4, billing_cores) / 2 * annual_sql_2core_cost
        win_cost = (0.5 if billing_cores <= 8 else float(
            idivceil(billing_cores, 16))) * annual_win_server
        vm_cost = vm_record.unitPrice * 24 * 365
        return sql_cost + vm_cost + win_cost

    skus = ((price_sku(s), s) for s in compute_specs.virtual_machine_skus
            if s.family.startswith('standardD') or s.family.startswith(
                'standardES') or s.family.startswith('standardMS'))
    skus_hash = {s[1].name.lower(): s for s in skus if s[0] is not None}

    new_sku_families = {
        'standardDSv2Family', 'standardDSv3Family', 'standardESv3Family',
        'standardMSFamily'
    }
    new_skus = (
        s for s in skus_hash.values()
        if s[1].family in new_sku_families and not (
            s[1].capabilities.d_vcpus_available < 4
            and s[1].capabilities.vcpus > s[1].capabilities.d_vcpus_available))
    new_skus_list = sorted(new_skus, key=lambda x: x[0])

    results: Dict[str, RightSizeAnalysis] = {}
    for resource in resources.itertuples():
        resource_id = resource.resource_id
        is_database = resource.role_code == 'DBS'
        data = select_vm_data(resource_id, cpu_utilization, mem_utilization,
                              disk_utilization)
        if data is None:
            continue

        vm_size = resource.vm_size.lower()
        sku_current_cost, sku_current = skus_hash[vm_size]

        for test_cost, test_sku in new_skus_list:
            if test_cost > sku_current_cost:
                break

            should_flex_mem_down = is_database and evaluate_low_cached_usage(
                data.disk, test_sku)
            fitness = evaluate_overall_fitness(test_sku, data,
                                               should_flex_mem_down)
            if fitness.cpu and fitness.memory and fitness.disk:
                break

            mem_equity = test_sku.capabilities.memory_gb == sku_current.capabilities.memory_gb
            if fitness.cpu and fitness.disk and mem_equity:
                break

        if resource.vm_size == test_sku.name:
            analysis = RightSizeAnalysis(test_sku.name, False,
                                         "Reduction not possible.")
        elif test_cost <= sku_current_cost:
            savings = sku_current_cost - test_cost
            analysis = RightSizeAnalysis(test_sku.name, True, None, savings)
        else:
            reason = f"{'CPU ' if not fitness.cpu else ''}{'Memory ' if not fitness.memory else ''}{'I/O ' if not fitness.disk else ''} suggests increase."
            analysis = RightSizeAnalysis(resource.vm_size, False, reason)
        results[resource_id] = analysis

    return results
Example #22
def df_grans_to_score(
        df_grans: pd.DataFrame,
        parts: List[str],
        type_equality='default'
) -> music21.stream.Score:

    score = music21.stream.Score()

    for i_part, name_part in enumerate(parts):

        part = music21.stream.Part()

        part.id = name_part

        obj_first = df_grans.loc[df_grans.index[0]][0]

        offset_first = df_grans.index[0][0]

        counter = 0

        obj_last = obj_first

        offset_last = offset_first

        for row in df_grans.itertuples():

            counter = counter + 1

            if counter == 1:
                continue

            index = row[0]
            index_beat = index[0]
            obj = row[1]

            if type_equality == 'absolute':

                if not utils.b_absolutely_equal(obj, obj_last):

                    dur = music21.duration.Duration(index_beat - offset_last)

                    offset = offset_last

                    part.insert(
                        offset,
                        get_struct_score(
                            obj_last,
                            name_part,
                            dur
                        )
                    )

                    obj_last = obj

                    offset_last = index_beat
            else:

                if obj != obj_last:
                    dur = music21.duration.Duration(index_beat - offset_last)

                    offset = offset_last

                    part.insert(
                        offset,
                        get_struct_score(
                            obj_last,
                            name_part,
                            dur
                        )
                    )

                    obj_last = obj

                    offset_last = index_beat

        # insert last

        part.insert(
            offset_last,
            get_struct_score(
                obj_last,
                name_part,
                music21.duration.Duration(
                    list(df_grans.itertuples())[-1][0][0] - offset_last
                )
            )
        )

        score.insert(i_part, part)

    return score
Example #23
def generate_causal_graph(place_change_events: DataFrame,
                          transition_events: DataFrame, time_per_step: float):
    # Nodes are occasions; edges lead in their prehensions
    g = nx.DiGraph()

    # Add the initial state for each node as an occasion with no past
    initial_occasions = place_change_events.query('tstep == 0')
    for occ in initial_occasions.itertuples():
        g.add_node(Occasion(int(occ.num), occ.name,
                            occ.time))  # unit, state, time

    # Visit each transition and identify i) its output node and its 2 input nodes
    for trans in transition_events.itertuples():
        # row has: tstep, time, name, unit, neighbour & count

        # TODO: IS IT SAFE TO IGNORE THIS?
        # assert trans.count == 1  # Statistically likely to happen as simulations get more complex or are undersampled. Consider what to do if this occurs --Rob

        # Create new occasion in graph for this transition
        # output_state = trans.name[1]  # ab -> b
        prefix, input_state, output_state = expand_transition_name(
            trans.name)  # strings
        if math.isnan(trans.unit):
            print(f"*** {trans.unit} {output_state} {trans.time}")
            continue
        output_occasion = Occasion(int(trans.unit), output_state, trans.time)
        g.add_node(output_occasion)

        def choose_best_upstream_occasion(target_unit, target_state_name,
                                          source_time):
            query = f"num=={target_unit} & name=='{target_state_name}' & time<{source_time}"
            last_transition_time = place_change_events.query(
                query)['time'].max()
            if math.isnan(last_transition_time):
                #  Try including the source time
                query = f"num=={target_unit} & name=='{target_state_name}' & time=={source_time}"
                last_transition_time = place_change_events.query(
                    query)['time'].min()
                if math.isnan(last_transition_time):
                    #  Try including the step after
                    query = f"num=={target_unit} & name=='{target_state_name}' & time<={source_time + time_per_step}"
                    last_transition_time = place_change_events.query(
                        query)['time'].min()
            return Occasion(target_unit, target_state_name,
                            last_transition_time)

        # Determine local input node from same unit
        # state_name = trans.name[0]  # ab -> a
        local_input_occasion = choose_best_upstream_occasion(
            trans.unit, input_state, trans.time)
        g.add_edge(local_input_occasion, output_occasion)

        # Determine input node from neighbour
        # state_name = trans.name[1]  # ab -> b
        neighbour_input_occasion = choose_best_upstream_occasion(
            trans.neighbour, output_state, trans.time)
        g.add_edge(neighbour_input_occasion, output_occasion)

        # Determine input node from neighbour2 if set
        if not math.isnan(trans.neighbour2):
            # state_name = trans.name[1]  # ab -> b  # neighbour2 assumed pulling state forward (like neighbour)
            neighbour2_input_occasion = choose_best_upstream_occasion(
                trans.neighbour2, output_state, trans.time)
            g.add_edge(neighbour2_input_occasion, output_occasion)

    return g
Example #24
def extractor(df: pd.DataFrame) -> list:
    """
    Extract date and email address from dataframe using regular expression

    Args:
        ``df``: dataframe
            dataframe obtained from image ocr
    Returns:
        A list of dicts, each of the form:
            {
                "date":
                        {
                            "text" : ,
                            "bbox" : [x0, y0, x2, y2]
                        } ,

                "email" :
                    {
                        "text" :
                        "bbox": [x0, y0, x2, y2]
                    }
            }
    """
    data = []
    empty_dummy = {
        "date": {
            "text": None,
            "bbox": None
        },
        "email": {
            "text": None,
            "bbox": None
        },
    }

    if df.empty:
        data.append(empty_dummy)
        return data

    # possible date and email patterns
    email_pattern = r"(^[a-zA-Z0-9_.+-]+[@.][a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)"
    date_pattern = [
        r"([12]\d{3}[-/.](0[1-9]|1[0-2])[-/.](0[1-9]|[12]\d|3[01]))",
        r"(\d{2}[-/.]\d{2}[-/.]\d{4})",
    ]
    mobile_number_pattern = r'''(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)
                    [-\.\s]*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{3,4})'''

    for row in df.itertuples():
        # check if matches with date pattern
        for dp in date_pattern:
            date_match = re.match(dp, row.Text)
            if date_match:
                logger.debug(f'Date match : {date_match}')
                d = {
                    "date": {
                        "text": date_match[0],
                        "bbox": [row.x0, row.y0, row.x2, row.y2],
                    }
                }
                data.append(d)
        # check if matches with email pattern
        email_match = re.match(email_pattern, row.Text)
        if email_match:
            d = {
                "email": {
                    "text": email_match[0],
                    "bbox": [row.x0, row.y0, row.x2, row.y2],
                }
            }
            data.append(d)

        # check if matches number pattern
        number_match = re.findall(re.compile(mobile_number_pattern), row.Text)
        if number_match:
            d = {
                "number": {
                    "text": number_match[0],
                    "bbox": [row.x0, row.y0, row.x2, row.y2],
                }
            }
            data.append(d)

    return data
Example #25
def analyze_df(df: pd.DataFrame, strategy: dict):
    """Analyzes the dataframe and runs sort of a market simulation, entering and exiting positions

    Parameters
    ----------
        df, dataframe from process_dataframe after the actions have been added
        strategy: dict, contains instructions on when to enter/exit trades

    Returns
    -------
        df, returns a dataframe with the new rows processed
    """
    in_trade = False
    last_base = float(strategy["base_balance"])
    commission = float(strategy["commission"])
    last_aux = 0.0
    new_total_value = last_base

    aux_list = []
    base_list = []
    total_value_list = []
    in_trade_list = []
    fee_list = []

    for row in df.itertuples():
        close = row.close
        curr_action = row.action
        fee = 0

        if curr_action == "e" and not in_trade:
            # this means we should enter the trade
            last_aux = convert_base_to_aux(last_base, close)
            fee = calculate_fee(last_aux, commission)

            last_aux = last_aux - fee
            new_total_value = convert_aux_to_base(last_aux, close)

            # should be extremely close to 0
            last_base = round(last_base - new_total_value, 8)
            in_trade = True

        if curr_action == "x" and in_trade:
            last_base = convert_aux_to_base(last_aux, close)
            fee = calculate_fee(last_base, commission)
            last_base = last_base - fee
            last_aux = convert_base_to_aux(last_base, close)
            new_total_value = last_base

            in_trade = False

        aux_list.append(last_aux)
        base_list.append(last_base)
        total_value_list.append(new_total_value)
        in_trade_list.append(in_trade)
        fee_list.append(fee)

    if strategy.get("exit_on_end") and in_trade:
        last_base = convert_aux_to_base(last_aux, close)
        last_aux = convert_base_to_aux(last_base, close)
        new_date = df.index[-1] + timedelta(minutes=1)

        df = df.append(pd.DataFrame(index=[new_date]))

        aux_list.append(last_aux)
        base_list.append(last_base)
        total_value_list.append(new_total_value)
        in_trade_list.append(in_trade)
        fee_list.append(False)

    df["aux"] = aux_list
    df["base"] = base_list
    df["total_value"] = total_value_list
    df["in_trade"] = in_trade_list
    df["fee"] = fee_list

    return df
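Hypothetical call, assuming candles_df already carries the 'close' and 'action' columns produced upstream ('e' to enter, 'x' to exit); the strategy keys are the ones read by the function above.

strategy = {"base_balance": 1000.0, "commission": 0.001, "exit_on_end": True}
result = analyze_df(candles_df, strategy)   # candles_df is an assumed input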
Example #26
    def df2doc_gen(self: Document, df: pd.DataFrame):
        # Re-initialises this Document from each row and yields it;
        # note that every yielded value is the same (mutated) instance.
        for item in df.itertuples():
            self.__init__(*item)
            yield self
Example #27
def convert_df_to_conv_ai_dict(df: pd.DataFrame,
                               personality: List[str],
                               response_columns: List[str],
                               tokenizer: Callable[[str], List[str]],
                               max_tokens: Optional[int] = None,
                               n_candidates: int = 6) -> Dict[str, List[Any]]:
    """
    Each entry in personachat is a dict with two keys, personality and utterances; the dataset is a list of entries.
    personality:  list of strings containing the personality of the agent
    utterances: list of dictionaries, each of which has two keys which are lists of strings.
        candidates: [next_utterance_candidate_1, ..., next_utterance_candidate_19]
            The last candidate is the ground truth response observed in the conversational data
        history: [dialog_turn_0, ... dialog_turn N], where N is an odd number since the other user starts every conversation.
    Preprocessing:
        - Spaces before periods at end of sentences
        - everything lowercase

    Process each row of a DataFrame.  For each row:
    1. Grab the conversational input text
    2. Grab the responses
    3. Create a unique data entry for each response to the question.
    4. Sample random response sentences from the dataset.
    5. Combine the random responses into a candidate list.

    Args:
        df: The counsel chat pandas dataframe
        personality: The personality we would like to use during training
        response_columns: Columns which contain valid responses to the question.  For example,
            the answerText column is the complete response of the therapist
        tokenizer: The transformers library tokenizer associated with the model we will be
            training.  It is used for setting the maximum sequence length
        max_tokens: The maximum number of tokens that any candidate, response, or question should be.
        n_candidates: The number of candidate phrases to include in the dataset for training.
            The last member of candidates is the ground truth response

    Returns:
        A dictionary with a train and validation key.
    """
    # Add one because the index of the dataframe is the 0th position.
    tuple_map = {
        name: index + 1
        for index, name in enumerate(df.columns.tolist())
    }

    train = []
    val = []
    # Step through every row in the dictionary
    for row in df.itertuples():

        # Get the question name and title
        # TODO:: MAKE THIS GENERAL YOU DUMB DUMB
        question_title = row[tuple_map["questionTitle"]]
        question_text = row[tuple_map["questionText"]]
        question_combined = question_title + " " + question_text

        # Step through every response column in the row
        for response_column in response_columns:

            # Get the true response
            true_response = row[tuple_map[response_column]]

            # We only want to add data if a good response exists
            if len(true_response) > 1:
                # Get candidate alternate sentences by sampling from all other questions
                candidates = sample_candidates(df,
                                               row[tuple_map["questionID"]],
                                               "questionID", "answerText",
                                               n_candidates)

                # Add the correct response to the end
                candidates.append(true_response)

                # We want to trim the size of the tokens
                if max_tokens is not None:
                    # Use the provided tokenizer to tokenize the input and truncate at max_tokens
                    question_combined = tokenizer.convert_tokens_to_string(
                        tokenizer.tokenize(question_combined)[:max_tokens])
                    candidates = [
                        tokenizer.convert_tokens_to_string(
                            tokenizer.tokenize(candidate)[:max_tokens])
                        for candidate in candidates
                    ]

                if len(candidates) != n_candidates + 1:
                    print(true_response)
                    assert False

                # Define the personality and the history
                d = {
                    "personality":
                    personality,
                    "utterances": [{
                        "history": [question_combined],
                        "candidates": candidates
                    }]
                }
                if getattr(row, "split") == "train":
                    train.append(d)
                elif getattr(row, "split") == "val":
                    val.append(d)

    data = {"train": train, "valid": val}

    return data
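For orientation, the shape of a single produced entry, per the docstring above (values are illustrative only):

entry = {
    "personality": ["I am a licensed therapist."],
    "utterances": [{
        "history": ["How do I cope with stress? I feel overwhelmed."],
        "candidates": ["sampled answer 1", "sampled answer 2", "the ground-truth answerText"],
    }],
}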
Example #28
        def load_years(years):
            for year in years:
                print('loading %s' % year)
                pop = DataFrame(index=['state', 'county'])
                column = 'popestimate%s' % year

                #create a DataFrame for each series of population estimates:
                #total, male, and female
                query = ("PopulationEst%sRaw.objects.values('state')"
                         ".filter(gender='0',ethnic_origin='0')"
                         ".annotate(population=Sum(column))" % args[0])
                total_pop = eval(query)
                total_pop = DataFrame.from_records(total_pop, index=['state'])
                total_pop.columns = ['total']
                if np.isnan(total_pop.sum()):
                    #No data yet for the current year, which means no data yet
                    #for future years in the decade, so stop right here
                    print('No data for year %s. Stopping load.' % year)
                    return 0
                query = ("PopulationEst%sRaw.objects.values('state')"
                         ".filter(gender='1',ethnic_origin='0')"
                         ".annotate(population=Sum(column))" % args[0])
                male_pop = eval(query)
                male_pop = DataFrame.from_records(male_pop, index=['state'])
                male_pop.columns = ['male']
                query = ("PopulationEst%sRaw.objects.values('state')"
                         ".filter(gender='2',ethnic_origin='0')"
                         ".annotate(population=Sum(column))" % args[0])
                female_pop = eval(query)
                female_pop = DataFrame.from_records(female_pop,
                                                    index=['state'])
                female_pop.columns = ['female']

                #merge the total, male, and female DataFrames into final, master df
                pop = pd.merge(pop,
                               total_pop,
                               how='right',
                               left_index=True,
                               right_index=True)
                pop = pd.merge(pop,
                               male_pop,
                               how='right',
                               left_index=True,
                               right_index=True)
                pop = pd.merge(pop,
                               female_pop,
                               how='right',
                               left_index=True,
                               right_index=True)

                #calculate male and female percentages and merge those in, too
                male_percent = DataFrame(pop.apply(
                    lambda row: row['male'] * 1.0 / row['total'] * 100,
                    axis=1),
                                         columns=['male_percent'])
                pop = pd.merge(pop,
                               male_percent,
                               left_index=True,
                               right_index=True)
                female_percent = DataFrame(pop.apply(
                    lambda row: row['female'] * 1.0 / row['total'] * 100,
                    axis=1),
                                           columns=['female_percent'])
                pop = pd.merge(pop,
                               female_percent,
                               left_index=True,
                               right_index=True)

                #add DataFrame contents to database
                #DataFrame is indexed by state code
                #i.e., p[0] = state code
                for p in pop.itertuples():
                    state_id = states['id'][p[0]]
                    try:
                        record = PopulationGenderState.objects.get(
                            state=state_id, year=year)
                    except:
                        record = PopulationGenderState()
                        record.state_id = state_id
                        record.year = year
                    record.total = p[1]
                    record.male = p[2]
                    record.female = p[3]
                    record.male_percent = str(p[4])
                    record.female_percent = str(p[5])
                    record.save()
                    db.reset_queries()
Example #29
    def plot_genes(self, ax, gr: GenomeRange, ov_genes: pd.DataFrame, dry_run = False, fig_width = None):
        properties = self.properties
        self.__set_plot_params(gr, ov_genes)

        assert (not dry_run) or (fig_width is not None)
        if dry_run:
            self.__get_length_w(fig_width, gr.start, gr.end)
        else:
            self.__get_length_w(ax.get_figure().get_figwidth(), gr.start, gr.end)

        num_rows = properties['num_rows']
        max_num_row_local = 1
        max_ypos = 0
        # check for the number of other intervals that overlap
        #    with the given interval
        #            1         2
        #  012345678901234567890123456
        #  1=========       4=========
        #       2=========
        #         3============
        #
        # for 1 row_last_position = [9]
        # for 2 row_last_position = [9, 14]
        # for 3 row_last_position = [9, 14, 19]
        # for 4 row_last_position = [26, 14, 19]

        row_last_position = []  # each entry in this list contains the end position
        # of genomic interval. The list index is the row
        # in which the genomic interval was plotted.
        # Any new genomic interval that wants to be plotted,
        # knows the row to use by finding the list index that
        # is larger than its start

        # check for overlapping genes including
        # label size (if plotted)

        for bed in ov_genes.itertuples():
            """
            BED12 gene format with exon locations at the end
            chrX    20850   23076   CG17636-RA      0       -       20850   23017   0       3       946,765,64,     0,1031,2162,

            BED9
            bed with rgb at end
            chr2L   0       70000   ID_5    0.26864549832   .       0       70000   51,160,44

            BED6
            bed without rgb
            chr2L   0       70000   ID_5    0.26864549832   .

            BED3
            bed with only intervals
            chr2L  0        70000
            """
            self.counter += 1

            if self.is_draw_labels:
                num_name_characters = len(bed.name) + 2  # +2 to account for an space before and after the name
                bed_extended_end = int(bed.end + (num_name_characters * self.len_w))
            else:
                bed_extended_end = (bed.end + 2 * self.small_relative)

            # get smallest free row
            if not row_last_position:
                free_row = 0
                row_last_position.append(bed_extended_end)
            else:
                # get list of rows that are less than bed.start, then take the min
                idx_list = [idx for idx, value in enumerate(row_last_position) if value < bed.start]
                if len(idx_list):
                    free_row = min(idx_list)
                    row_last_position[free_row] = bed_extended_end
                else:
                    free_row = len(row_last_position)
                    row_last_position.append(bed_extended_end)

            rgb, edgecolor = self.get_rgb_and_edge_color(bed)

            ypos = self.get_y_pos(free_row)

            # do not plot if the maximum interval rows to plot is reached
            if num_rows and free_row >= float(num_rows):
                continue

            if free_row > max_num_row_local:
                max_num_row_local = free_row
            if ypos > max_ypos:
                max_ypos = ypos
            
            if not dry_run:
                if properties['bed_type'] == 'bed12':
                    if properties['gene_style'] == 'flybase':
                        self.draw_gene_with_introns_flybase_style(ax, bed, ypos, rgb, edgecolor)
                    else:
                        self.draw_gene_with_introns(ax, bed, ypos, rgb, edgecolor)
                else:
                    self.draw_gene_simple(ax, bed, ypos, rgb, edgecolor)

                if self.is_draw_labels and bed.start > gr.start and bed.end < gr.end:
                    ax.text(bed.end + self.small_relative,
                            ypos + (float(properties['interval_height']) / 2),
                            bed.name,
                            horizontalalignment='left',
                            verticalalignment='center',
                            fontproperties=self.fp)

        if self.counter == 0:
            log.debug(f"*Warning* No intervals were found for file {properties['file']} "
                        f"in Track \'{properties['name']}\' for the interval plotted ({gr}).\n")

        ymax = 0
        if num_rows:
            ymin = float(num_rows) * self.row_scale
            self.current_row_num = num_rows
        else:
            ymin = max_ypos + properties['interval_height']
            self.current_row_num = len(row_last_position)

        log.debug("ylim {},{}".format(ymin, ymax))
        # the axis is inverted (thus, ymax < ymin)
        if not dry_run:
            ax.set_ylim(ymin, ymax)

            if properties['display'] == 'collapsed':
                ax.set_ylim(-5, 105)

            ax.set_xlim(gr.start, gr.end)
Example #30
    def _build_graphviz_obj(self, show_ifnames: bool, df: pd.DataFrame):
        '''Return a graphviz object'''

        graph_attr = {'splines': 'polyline', 'layout': 'dot'}
        if show_ifnames:
            graph_attr.update({'nodesep': '1.0'})

        g = graphviz.Digraph(graph_attr=graph_attr,
                             name='Hover over arrow head for edge info')

        hostset = set()
        for hostgroup in df.groupby(by=['hopCount']) \
                           .hostname.unique().tolist():
            with g.subgraph() as s:
                s.attr(rank='same')
                for hostname in hostgroup:
                    if hostname in hostset:
                        continue
                    hostset.add(hostname)
                    debugURL = '&amp;'.join([
                        f'{get_base_url()}?page={quote("Path-Debug")}',
                        'lookupType=hop',
                        f'namespace={quote(df.namespace[0])}',
                        f'session={quote(get_session_id())}',
                        f'hostname={quote(hostname)}',
                    ])
                    tooltip, color = self._get_node_tooltip_color(hostname)
                    s.node(hostname,
                           tooltip=tooltip,
                           color=color,
                           URL=debugURL,
                           target='_graphviz',
                           shape='box')

        pathid = 0
        prevrow = None
        connected_set = set()

        df['nextPathid'] = df.pathid.shift(-1).fillna('0').astype(int)
        for row in df.itertuples():
            if row.pathid != pathid:
                prevrow = row
                pathid = row.pathid
                continue
            conn = (prevrow.hostname, row.hostname)
            if conn not in connected_set:
                if row.overlay:
                    path_type = 'underlay'
                    color = 'purple'
                elif prevrow.isL2:
                    path_type = 'l2'
                    color = 'blue'
                else:
                    path_type = 'l3'
                    color = 'black'

                if not row.mtuMatch:
                    color = 'red'
                    error = 'MTU mismatch'
                    err_pfx = ', '
                else:
                    error = ''
                    err_pfx = ''

                tdf = pd.DataFrame({
                    'pathType': path_type,
                    'protocol': [prevrow.protocol],
                    'ipLookup': [prevrow.ipLookup],
                    'vtepLookup': [prevrow.vtepLookup],
                    'macLookup': [prevrow.macLookup],
                    'nexthopIp': [prevrow.nexthopIp],
                    'vrf': [prevrow.vrf],
                    'mtu': [f'{prevrow.outMtu} -> {row.inMtu}'],
                    'oif': [prevrow.oif],
                    'iif': [row.iif]
                })
                rowerr = getattr(prevrow, 'error', '')
                if rowerr:
                    error += f"{err_pfx}{rowerr}"
                if error:
                    err_pfx = ', '
                if row.nextPathid != row.pathid:
                    # We need to capture any errors on the dest node as well
                    destnode_error = getattr(row, 'error', '')
                    if destnode_error:
                        error += f'{err_pfx}{destnode_error}'

                if error:
                    tdf['error'] = error
                    color = 'red'
                tooltip = '\n'.join(
                    tdf.T.to_string(justify='right').split('\n')[1:])
                debugURL = '&amp;'.join([
                    f'{get_base_url()}?page={quote("Path-Debug")}',
                    'lookupType=edge',
                    f'namespace={quote(row.namespace)}',
                    f'session={quote(get_session_id())}',
                    f'hostname={quote(prevrow.hostname)}',
                    f'vrf={quote(prevrow.vrf)}',
                    f'vtepLookup={prevrow.vtepLookup}',
                    f'ifhost={quote(row.hostname)}',
                    f'ipLookup={quote(prevrow.ipLookup)}',
                    f'oif={quote(prevrow.oif)}',
                    f'macaddr={quote(prevrow.macLookup or "")}',
                    f'nhip={quote(prevrow.nexthopIp)}',
                ])
                if show_ifnames:
                    g.edge(
                        prevrow.hostname,
                        row.hostname,
                        color=color,
                        label=str(row.hopCount),
                        URL=debugURL,
                        edgetarget='_graphviz',
                        tooltip=tooltip,
                        taillabel=prevrow.oif,
                        headlabel=row.iif,
                        penwidth='2.0',
                    )
                else:
                    g.edge(prevrow.hostname,
                           row.hostname,
                           color=color,
                           label=str(row.hopCount),
                           URL=debugURL,
                           edgetarget='_graphviz',
                           penwidth='2.0',
                           tooltip=tooltip)

                connected_set.add(conn)
            prevrow = row
        df.drop(columns=['nextPathid'], inplace=True, errors='ignore')
        return g
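
The edge-drawing loop above pairs each hop with the previous row of the same path, and uses pathid.shift(-1) to spot the last hop of every path. A minimal standalone sketch of just that pairing pattern; the column names pathid/hostname mirror the code above, and the data is made up:

import pandas as pd

# Toy hop table; the real df above carries many more columns.
hops = pd.DataFrame({
    'pathid':   [1, 1, 1, 2, 2],
    'hostname': ['leaf01', 'spine01', 'leaf02', 'leaf01', 'exit01'],
})
# Flag the last hop of each path: the next row belongs to a different pathid.
hops['nextPathid'] = hops.pathid.shift(-1).fillna(0).astype(int)

prevrow = None
for row in hops.itertuples():
    if prevrow is None or row.pathid != prevrow.pathid:
        prevrow = row        # first hop of a new path: nothing to connect yet
        continue
    print(f'edge {prevrow.hostname} -> {row.hostname} (path {row.pathid})')
    if row.nextPathid != row.pathid:
        print(f'  last hop of path {row.pathid}')
    prevrow = row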
Пример #31
0
def c_backtester(
    data: pd.DataFrame,
    sl_atr: float = 50,
    trailing_sl: bool = True,
    active_close: bool = False,
    block_stop: bool = True,
    take_profit: int = 0,
) -> pd.DataFrame:
    """
    Consecutive (event driven) backtester.

    Given df with 'signal' (-1 short, 0 out, 1 long) return df with 'position',
    taking into account:
       - position can be taken on the next row after the signal is generated
       - stop-loss (either relative to the entry price or to the high water
         mark)
       - 'filtered_signal', if given, adds a condition that must be met to
         initiate a position; positions are closed regardless of the filter.

    Args:
        data:        must have columns: 'price', 'close', 'signal', 'atr';
                     'filtered_signal' is optional; if not given,
                     filtered_signal = signal
                     'price' is used for transactions
                     'close' is used to decide whether stop-loss has been
                     triggered
        sl_atr:      stop-loss distance in multiples of ATR
                     (if no stop-loss is required use a very high number,
                      default 50)
        trailing_sl: if True, stop-loss is calculated off the high water mark,
                     if False, off the entry price
        active_close: if True, the close signal is the signal opposite to the
                      direction of the position; if False, the close signal is
                      the lack of signal in the direction of the position
        block_stop:  if True, after a stop-loss no position will be entered in
                     the same direction as the stopped-out position until an
                     opposite signal is generated
        take_profit: take-profit distance expressed as a multiple of the
                     stop-loss distance; 0 means no take profit

    Returns:
        DataFrame with column 'position' to be processed by another function.
    """

    for c in ['price', 'close', 'signal', 'atr']:
        assert c in data.columns, f"'{c}' is a required column"

    data = data.copy()

    # while in position maintain open price and transaction direction
    data['position'] = 0
    # flag to execute transaction at next data point
    data['mark'] = False
    # note the reason for transaction at next data point
    data['reason'] = ''
    # record transaction price
    data['t_price'] = 0
    # entry price for stop loss calculation
    data['entry'] = 0
    # for stop-loss calculation
    data['high_water'] = 0
    # restrict re-entering positions after stop loss
    # (1=long positions blocked, -1=short positions blocked)
    block = 0

    # stop-loss reference field: trailing (high water mark) or fixed (entry price)
    if trailing_sl:
        sl_field = 'high_water'
    else:
        sl_field = 'entry'

    if 'date' not in data.columns:
        data.reset_index(inplace=True)

    if 'filtered_signal' not in data.columns:
        data['filtered_signal'] = data['signal']

    for item in data.itertuples():
        # first row doesn't have to check for positions or execute transactions
        if item.Index != 0:
            # starting position is the same as previous day's position
            data.loc[item.Index, 'position'] = data.loc[(item.Index - 1),
                                                        'position']
            data.loc[item.Index, 'entry'] = data.loc[(item.Index - 1), 'entry']
            # execute transactions
            if data.loc[(item.Index - 1), 'mark']:
                # close position
                if data.loc[item.Index, 'position']:
                    data.loc[item.Index, 'position'] = 0
                    data.loc[item.Index, 'entry'] = 0
                    # record transaction price
                    data.loc[item.Index, 't_price'] = item.price * \
                        np.sign(data.loc[(item.Index - 1), 'entry']) * -1
                # open position
                else:
                    data.loc[item.Index,
                             'position'] = data.loc[(item.Index - 1), 'signal']
                    data.loc[item.Index, 'entry'] = item.price * \
                        data.loc[(item.Index - 1), 'signal']
                    # record transaction price and high water mark
                    data.loc[item.Index, 't_price'] = item.price * \
                        data.loc[(item.Index - 1), 'signal']
                    data.loc[item.Index, 'high_water'] = data.loc[item.Index,
                                                                  't_price']

        # update high water mark
        if item.Index != 0:  # skip first row
            if data.loc[item.Index - 1, 'position'] != 0:
                data.loc[item.Index, 'high_water'] = max(
                    data.loc[item.Index - 1, 'high_water'],
                    item.close * data.loc[item.Index, 'position'])

        # check for close signal
        if active_close:
            if data.loc[item.Index, 'position'] != 0 and np.sign(
                    item.signal) != 0:
                if np.sign(data.loc[item.Index, 'position']) != np.sign(
                        item.signal):
                    data.loc[item.Index, 'mark'] = True
                    data.loc[item.Index, 'reason'] = 'close'

        # check for stop-loss signal
        # long positions
        if data.loc[item.Index, 'position'] > 0:
            if item.close <= (data.loc[item.Index, sl_field] -
                              (item.atr * sl_atr)):
                data.loc[item.Index, 'mark'] = True
                data.loc[item.Index, 'reason'] = 'stop-out'
                if block_stop:
                    block = 1
        # short positions
        if data.loc[item.Index, 'position'] < 0:
            if item.close >= abs(
                (data.loc[item.Index, sl_field] - (item.atr * sl_atr))):
                data.loc[item.Index, 'mark'] = True
                data.loc[item.Index, 'reason'] = 'stop-out'
                if block_stop:
                    block = -1

        # check for take profit
        if take_profit:
            # long positions
            if data.loc[item.Index, 'position'] > 0:
                if item.close >= (data.loc[item.Index, 'entry'] +
                                  (item.atr * sl_atr * take_profit)):
                    data.loc[item.Index, 'mark'] = True
                    data.loc[item.Index, 'reason'] = 'take-profit'
                    block = 1
            # short positions
            if data.loc[item.Index, 'position'] < 0:
                if item.close <= abs((data.loc[item.Index, 'entry'] +
                                      (item.atr * sl_atr * take_profit))):
                    data.loc[item.Index, 'mark'] = True
                    data.loc[item.Index, 'reason'] = 'take-profit'
                    block = -1

        # check for entry signal
        if data.loc[item.Index, 'position'] == 0:
            if item.filtered_signal != 0 and item.filtered_signal != block:
                data.loc[item.Index, 'mark'] = True
                data.loc[item.Index, 'reason'] = 'entry'
                block = 0

    data.set_index('date', inplace=True, drop=True)
    return data
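
A minimal, hypothetical usage sketch for c_backtester above; the prices, signals and ATR values are made up purely to show the required columns and the call signature.

import pandas as pd

# Toy input: required columns 'price', 'close', 'signal', 'atr' (values illustrative only).
idx = pd.date_range('2021-01-01', periods=6, freq='D', name='date')
toy = pd.DataFrame({
    'price':  [100.0, 101.0, 103.0, 102.0, 99.0, 98.0],
    'close':  [100.0, 102.0, 103.0, 101.0, 98.0, 97.0],
    'signal': [0, 1, 1, 0, -1, 0],
    'atr':    [1.0, 1.0, 1.1, 1.2, 1.1, 1.0],
}, index=idx)

result = c_backtester(toy, sl_atr=2, trailing_sl=True)
print(result[['price', 'signal', 'position', 'reason']])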
Пример #32
0
 def load_years(years):
     for year in years:
         print('loading %s' % year)
         pop = DataFrame(index=['state','county'])
         column = 'popestimate%s' % year
         
         #create a DataFrame for each series of population estimates:
         #total, male, and female
         query = ("PopulationEst%sRaw.objects.values('state')" 
             ".filter(gender='0',ethnic_origin='0')"  
             ".annotate(population=Sum(column))" % args[0])
         total_pop = eval(query) 
         total_pop = DataFrame.from_records(
             total_pop,
             index=['state'])
         total_pop.columns = ['total']
         if np.isnan(total_pop.sum()):
             #No data yet for the current year, which means no data yet
             #for future years in the decade, so stop right here
             print('No data for year %s. Stopping load.' % year)
             return 0
         query = ("PopulationEst%sRaw.objects.values('state')"
             ".filter(gender='1',ethnic_origin='0')"
             ".annotate(population=Sum(column))" % args[0])
         male_pop = eval(query)
         male_pop = DataFrame.from_records(
             male_pop,
             index=['state'])
         male_pop.columns = ['male']
         query = ("PopulationEst%sRaw.objects.values('state')"
             ".filter(gender='2',ethnic_origin='0')"
             ".annotate(population=Sum(column))" % args[0])
         female_pop = eval(query)
         female_pop = DataFrame.from_records(
             female_pop,
             index=['state'])
         female_pop.columns = ['female']
         
         #merge the total, male, and female DataFrames into final, master df
         pop = pd.merge(pop,
             total_pop,
             how='right',
             left_index=True,
             right_index=True)
         pop = pd.merge(pop,
             male_pop,
             how='right',
             left_index=True,
             right_index=True)
         pop = pd.merge(pop,
             female_pop,
             how='right',
             left_index=True,
             right_index=True)
         
         #calculate male and female percentages and merge those in, too
         male_percent = DataFrame(pop.apply(
             lambda row: row['male']*1.0/row['total']*100,axis=1),
             columns=['male_percent'])
         pop = pd.merge(pop,
             male_percent,
             left_index=True,
             right_index=True)
         female_percent = DataFrame(pop.apply(
             lambda row: row['female']*1.0/row['total']*100,axis=1),
             columns=['female_percent'])
         pop = pd.merge(pop,
             female_percent,
             left_index=True,
             right_index=True)
         
         #add DataFrame contents to database
         #DataFrame is indexed by state code
         #i.e., p[0] = state code
         for p in pop.itertuples():
             state_id = states['id'][p[0]]
             try:
                 record = PopulationGenderState.objects.get(
                     state = state_id,
                     year = year)
             except:
                 record = PopulationGenderState()
                 record.state_id = state_id
                 record.year = year
             record.total = p[1]
             record.male = p[2]
             record.female = p[3]
             record.male_percent = str(p[4])
             record.female_percent = str(p[5])
             record.save()
             db.reset_queries()
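
The male/female percentage columns above are produced with a row-wise apply followed by a merge; a vectorized equivalent, sketched here on a toy frame with the same columns as the merged pop frame above (the numbers are made up), would be:

import pandas as pd

# Toy frame shaped like the merged 'pop' above (illustrative values only).
pop = pd.DataFrame({'total': [100, 200], 'male': [48, 98], 'female': [52, 102]},
                   index=['01', '02'])
pop['male_percent'] = pop['male'] / pop['total'] * 100
pop['female_percent'] = pop['female'] / pop['total'] * 100
print(pop)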
Пример #33
0
def read_from_db():
    #pan = c.execute('select artist,avg(score) from reviews group by artist order by avg(score) desc')
    #pan1 = c.execute('select reviewid, genre from genres group by genre ')
    #pan2 = c.execute('select artist, avg(score) from reviews group by artist having count(reviewid) > 4 order by avg(score) desc')
    #print(pan1.fetchall())
    #pan = c.execute('select * from reviews')
    #print(type(pan))
    #df = DataFrame(pan.fetchall())
    #df.columns = ['reviewid','title','artist','url','score','best_new_music','author','author_type','pub_date','pub_weekday','pub_day','pub_month','pub_year']
    #print(df.dtypes)
    '''df1 = df.groupby('artist').agg({'score':np.mean}).sort_values(by='score', ascending=False)
    df2 = df1.head(10)
    df3 = df1.tail(10)
    df4 = pd.concat([df2,df3])'''
    #data = c.fetchall()
    #print(df4)
    #plt.plot(arti)
    #print(df.head(10))
    #print(type(df))
    #print(data[0][1])

    #print("reviewid")
    #for row in c.fetchall():
        #print(row)
        #if(row[4]>9):
            #print(row[4])'''
    #for row1 in pan2.fetchall():
     #   print(row1)
    #g = plt.bar(pan2.tail(10)['artist'], pan2.tail(10)['avg(score)'])
    #h = DataFrame(pan2.fetchall())
    #g = h.head(10)
    #plt.bar(g.artist, g. avg(score))
    #plt.xlabel('artist', fontsize=5)
    #plt.ylabel('avg(score)', fontsize=5)
    #plt.xticks(index, label, fontsize=5, rotation=30)
    #plt.show()

    q1 = c.execute('select r.pub_year,g.genre,avg(r.score) from genres g, reviews r where g.reviewid = r.reviewid group by g.genre,r.pub_year order by r.pub_year,avg(r.score) desc')
    #print(q1.fetchall())
    df1 = DataFrame(q1.fetchall())
    #print(df1)
    q2 = c.execute('select g.genre,avg(r.score) from genres g, reviews r where g.reviewid = r.reviewid group by g.genre order by avg(r.score) desc')
    df2 = DataFrame(q2.fetchall())
    df2 = df2.drop(index=4)
    l = df2[0].tolist()
    y_pos = [i for i, _ in enumerate(l)]
    m = df2[1].tolist()

    g = sns.barplot(x=m, y=l)
    #plt.xlim(6.5, 7.5)
    r = np.linspace(0, 9, 10)
    j = 0

    for row in df2.itertuples():
        g.text(x=row[2]+0.2, y=j, s='{:4.2f}'.format(row[2]), color='black', ha='center')
        j += 1


    #plt.bar(m,y_pos)
    #plt.yticks(y_pos, l)
    #plt.legend()
    #plt.xlabel('bar number')
    #plt.ylabel('bar height')

    #plt.title('Epic Graph\nAnother Line! Whoa')
    plt.tight_layout()
    plt.show()
Пример #34
0
def backtest(df: pd.DataFrame, settings: dict, price_precisions: dict = {}):
    start_quot = 1.0
    ppctminus = 1 - settings['profit_pct']
    ppctplus = 1 + settings['profit_pct']
    symbols = [c.replace('_low', '') for c in df.columns if 'low' in c]
    if not price_precisions:
        price_precisions = {s: 8 for s in symbols}
    lows = {s: f'{s}_low' for s in symbols}
    highs = {s: f'{s}_high' for s in symbols}
    means = {s: f'{s}_mean' for s in symbols}

    min_emas = {s: f'{s}_mean_min_ema' for s in symbols}
    max_emas = {s: f'{s}_mean_max_ema' for s in symbols}

    min_delay_millis = settings['min_seconds_between_same_side_entries'] * 1000

    rolling_millis = settings['max_memory_span_days'] * 24 * 60 * 60 * 1000

    s2c = {s: s.split('_')[0] for s in symbols}
    quot = symbols[0].split('_')[1]
    balance = {s2c[s]: 0.0 for s in s2c}
    balance[quot] = 1.0
    acc_equity_quot = 1.0
    acc_debt_quot = 0.0
    long_entries = {s: [] for s in symbols}
    shrt_entries = {s: [] for s in symbols}
    long_exits = {s: [] for s in symbols}
    shrt_exits = {s: [] for s in symbols}
    long_exit_price_list = {s: [] for s in symbols}
    shrt_exit_price_list = {s: [] for s in symbols}

    past_rolling_long_entries = {s: [] for s in symbols}
    past_rolling_shrt_entries = {s: [] for s in symbols}

    entry_bid = {s: round(df.iloc[0][means[s]], 8) for s in symbols}
    entry_ask = {s: round(df.iloc[0][means[s]], 8) for s in symbols}

    exit_bid = {s: entry_bid[s] for s in symbols}
    exit_ask = {s: entry_ask[s] for s in symbols}

    long_cost = {s: 0.0 for s in symbols}
    long_amount = {s: 0.0 for s in symbols}
    shrt_cost = {s: 0.0 for s in symbols}
    shrt_amount = {s: 0.0 for s in symbols}

    fee = 1 - 0.000675 # vip 1

    margin_level = 3 - 1

    balance_list = []

    do_shrt = {s for s in symbols if s2c[s] in settings['coins_shrt']}
    do_long = {s for s in symbols if s2c[s] in settings['coins_long']}

    exponent = settings['entry_vol_modifier_exponent']

    start_ts, end_ts = df.index[0], df.index[-1]
    ts_range = end_ts - start_ts

    for row in df.itertuples():
        cost = acc_equity_quot * settings['account_equity_pct_per_trade']
        min_exit_cost = cost * settings['min_big_trade_cost_multiplier']
        credit_avbl_quot = max(0.0, acc_equity_quot * margin_level - acc_debt_quot)
        age_limit = row.Index - rolling_millis
        for s in symbols:
            # rolling longs
            long_i = get_cutoff_index(past_rolling_long_entries[s], age_limit)
            if long_i > 0:
                slc = past_rolling_long_entries[s][:long_i]
                past_rolling_long_entries[s] = past_rolling_long_entries[s][long_i:]
                long_amount[s] -= sum([e['amount'] for e in slc])
                long_cost[s] -= sum([e['amount'] * e['price'] for e in slc])
                if long_cost[s] <= 0.0 or long_amount[s] <= 0.0:
                    long_cost[s] = 0.0
                    long_amount[s] = 0.0
                    past_rolling_long_entries[s] = []
                    exit_ask[s] = getattr(row, means[s])
                else:
                    exit_ask[s] = (long_cost[s] / long_amount[s]) * ppctplus

            # rolling shrts
            shrt_i = get_cutoff_index(past_rolling_shrt_entries[s], age_limit)
            if shrt_i > 0:
                slc = past_rolling_shrt_entries[s][:shrt_i]
                past_rolling_shrt_entries[s] = past_rolling_shrt_entries[s][shrt_i:]
                shrt_cost[s] -= sum([e['amount'] * e['price'] for e in slc])
                shrt_amount[s] -= sum([e['amount'] for e in slc])
                if shrt_cost[s] <= 0.0 or shrt_amount[s] <= 0.0:
                    shrt_cost[s] = 0.0
                    shrt_amount[s] = 0.0
                    past_rolling_shrt_entries[s] = []
                    exit_bid[s] = getattr(row, means[s])
                else:
                    exit_bid[s] = (shrt_cost[s] / shrt_amount[s]) * ppctminus

            if s in do_long and getattr(row, lows[s]) < entry_bid[s] and \
                    (not long_entries[s] or
                     (row.Index - long_entries[s][-1]['timestamp'] >= min_delay_millis)):
                # long buy
                long_modifier = max(
                    1.0, min(settings['min_big_trade_cost_multiplier'] - 1,
                             (exit_ask[s] / getattr(row, means[s]))**exponent))
                buy_cost = cost * long_modifier
                if balance[quot] >= buy_cost:
                    # long buy normal
                    buy_amount = (buy_cost / entry_bid[s])
                    balance[quot] -= buy_cost
                    balance[s2c[s]] += buy_amount * fee
                    long_entries[s].append({'price': entry_bid[s], 'amount': buy_amount,
                                            'timestamp': row.Index})
                    past_rolling_long_entries[s].append(long_entries[s][-1])
                    long_amount[s] += buy_amount
                    long_cost[s] += buy_cost
                    exit_ask[s] = (long_cost[s] / long_amount[s]) * ppctplus
                elif credit_avbl_quot > 0.0:
                    # long buy with credit
                    quot_avbl = max(0.0, balance[quot])
                    to_borrow = min(credit_avbl_quot, buy_cost - quot_avbl)
                    credit_avbl_quot -= to_borrow
                    partial_buy_cost = quot_avbl + to_borrow
                    buy_amount = (partial_buy_cost / entry_bid[s])
                    balance[quot] -= partial_buy_cost
                    balance[s2c[s]] += buy_amount * fee
                    long_entries[s].append({'price': entry_bid[s], 'amount': buy_amount,
                        'timestamp': row.Index})
                    past_rolling_long_entries[s].append(long_entries[s][-1])
                    long_amount[s] += buy_amount
                    long_cost[s] += partial_buy_cost
                    exit_ask[s] = (long_cost[s] / long_amount[s]) * ppctplus
            if s in do_shrt and getattr(row, highs[s]) > entry_ask[s] and \
                    (not shrt_entries[s] or
                     (row.Index - shrt_entries[s][-1]['timestamp'] >= min_delay_millis)):
                # shrt sel
                shrt_modifier = max(
                    1.0, min(settings['min_big_trade_cost_multiplier'] - 1,
                             (getattr(row, means[s]) / exit_bid[s])**exponent))
                sel_cost = cost * shrt_modifier
                sel_amount = sel_cost / entry_ask[s]
                if balance[s2c[s]] >= sel_amount:
                    # shrt sel normal
                    balance[s2c[s]] -= sel_amount
                    balance[quot] += sel_cost * fee
                    shrt_entries[s].append({'price': entry_ask[s], 'amount': sel_amount,
                                            'timestamp': row.Index})
                    past_rolling_shrt_entries[s].append(shrt_entries[s][-1])
                    shrt_amount[s] += sel_amount
                    shrt_cost[s] += sel_cost
                    exit_bid[s] = (shrt_cost[s] / shrt_amount[s]) * ppctminus
                elif credit_avbl_quot > 0.0:
                    # shrt sel with credit
                    coin_avbl = max(0.0, balance[s2c[s]])
                    to_borrow = min(credit_avbl_quot / entry_ask[s], sel_amount - coin_avbl)
                    credit_avbl_quot -= (to_borrow * entry_ask[s])
                    partial_sel_amount = coin_avbl + to_borrow
                    balance[s2c[s]] -= partial_sel_amount
                    partial_sel_cost = partial_sel_amount * entry_ask[s]
                    balance[quot] += partial_sel_cost * fee
                    shrt_entries[s].append({'price': entry_ask[s], 'amount': partial_sel_amount,
                                            'timestamp': row.Index})
                    past_rolling_shrt_entries[s].append(shrt_entries[s][-1])
                    shrt_amount[s] += partial_sel_amount
                    shrt_cost[s] += partial_sel_cost
                    exit_bid[s] = (shrt_cost[s] / shrt_amount[s]) * ppctminus

            exit_ask[s] = round_up(exit_ask[s], price_precisions[s])
            exit_bid[s] = round_dn(exit_bid[s], price_precisions[s])

            if long_cost[s] > min_exit_cost:
                # long sel
                long_exit_price_list[s].append({'price': exit_ask[s], 'timestamp': row.Index})
                if getattr(row, highs[s]) > exit_ask[s]:
                    if balance[s2c[s]] >= long_amount[s]:
                        # long sel normal
                        long_sel_amount = max(balance[s2c[s]], long_amount[s])
                        long_exits[s].append({'price': exit_ask[s], 'amount': long_sel_amount,
                                              'timestamp': row.Index})
                        quot_acquired = long_sel_amount * exit_ask[s]
                        balance[s2c[s]] -= long_sel_amount
                        balance[quot] += quot_acquired * fee
                        long_amount[s] = 0.0
                        long_cost[s] = 0.0
                    else:
                        # partial long sel
                        coin_avbl = max(0.0, balance[s2c[s]])
                        to_borrow = min(credit_avbl_quot / exit_ask[s], long_amount[s] - coin_avbl)
                        partial_sel_amount = coin_avbl + to_borrow
                        if partial_sel_amount > 0.0:
                            credit_avbl_quot -= (to_borrow * exit_ask[s])
                            balance[s2c[s]] -= partial_sel_amount
                            partial_sel_cost = partial_sel_amount * exit_ask[s]
                            balance[quot] += partial_sel_cost * fee
                            long_exits[s].append({'price': exit_ask[s],
                                                  'amount': partial_sel_amount,
                                                  'timestamp': row.Index})
                            long_amount[s] -= partial_sel_amount
                            long_cost[s] -= partial_sel_cost
                    if long_amount[s] <= 0.0 or long_cost[s] <= 0.0:
                        long_amount[s] = 0.0
                        long_cost[s] = 0.0
                        past_rolling_long_entries[s] = []
            if shrt_cost[s] > min_exit_cost:
                shrt_exit_price_list[s].append({'price': exit_bid[s], 'timestamp': row.Index})
                if getattr(row, lows[s]) < exit_bid[s]:
                    # shrt buy
                    shrt_buy_cost = shrt_amount[s] * exit_bid[s]
                    if balance[quot] >= shrt_buy_cost:
                        # shrt buy normal
                        shrt_buy_cost = max(shrt_buy_cost,
                                            min(balance[quot], -balance[s2c[s]] * exit_bid[s]))
                        shrt_buy_amount = shrt_buy_cost / exit_bid[s]
                        shrt_exits[s].append({'price': exit_bid[s], 'amount': shrt_buy_amount,
                                              'timestamp': row.Index})
                        balance[quot] -= shrt_buy_cost
                        balance[s2c[s]] += shrt_buy_amount * fee
                        shrt_amount[s] = 0.0
                        shrt_cost[s] = 0.0
                    else:
                        # partial shrt buy
                        quot_avbl = max(0.0, balance[quot])
                        to_borrow = min(credit_avbl_quot, shrt_buy_cost - quot_avbl)
                        partial_sel_cost = quot_avbl + to_borrow
                        if partial_sel_cost > 0.0:
                            coin_acquired = partial_sel_cost / exit_bid[s]
                            shrt_exits[s].append({'price': exit_bid[s], 'amount': coin_acquired,
                                                  'timestamp': row.Index})
                            credit_avbl_quot -= to_borrow
                            balance[quot] -= partial_sel_cost
                            balance[s2c[s]] += coin_acquired * fee
                            shrt_amount[s] -= coin_acquired
                            shrt_cost[s] -= partial_sel_cost
                    if shrt_amount[s] <= 0.0 or shrt_cost[s] <= 0.0:
                        shrt_amount[s] = 0.0
                        shrt_cost[s] = 0.0
                        past_rolling_shrt_entries[s] = []

            entry_bid[s] = round_dn(
                min(getattr(row, means[s]), getattr(row, min_emas[s])), price_precisions[s])
            entry_ask[s] = round_up(
                max(getattr(row, means[s]), getattr(row, max_emas[s])), price_precisions[s])

        acc_equity_quot = \
            balance[quot] + sum([balance[s2c[s]] * getattr(row, means[s]) for s in symbols])
        balance_list.append({**{s2c[s]: balance[s2c[s]] * getattr(row, means[s]) for s in symbols},
                             **{'acc_equity_quot': acc_equity_quot, 'timestamp': row.Index,
                                quot: balance[quot]}})
        acc_debt_quot = -sum([balance_list[-1][c] for c in balance if balance_list[-1][c] < 0.0])
        balance_list[-1]['acc_debt_quot'] = acc_debt_quot
        if row.Index % 86400000 == 0 or row.Index >= end_ts:
            n_millis = row.Index - start_ts
            line = f'\r{(n_millis / ts_range) * 100:.2f}% '
            line += f'acc equity quot: {acc_equity_quot:.6f}  '
            n_days = n_millis / 1000 / 60 / 60 / 24
            line += f'avg daily gain: {acc_equity_quot**(1/n_days):.6f} '
            line += f'cost {cost:.8f} '
            sys.stdout.write(line)
            sys.stdout.flush()
    return balance_list, long_entries, shrt_entries, long_exits, shrt_exits, \
        long_exit_price_list, shrt_exit_price_list
Пример #35
0
def get_matches(item, config_sheet: pd.DataFrame):
    return (Match(rule, rule.input_re.match(item))
            for rule in config_sheet.itertuples())
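
get_matches above assumes each rule row of config_sheet carries a pre-compiled regex in an 'input_re' column (Match itself is defined elsewhere). A hypothetical config sheet of that shape, with made-up rule names and patterns:

import re
import pandas as pd

# Hypothetical rules: 'input_re' holds pre-compiled patterns; other columns are up to the caller.
config_sheet = pd.DataFrame({
    'name': ['invoice', 'receipt'],
    'pattern': [r'INV-(\d+)', r'RCPT-(\d+)'],
})
config_sheet['input_re'] = config_sheet['pattern'].map(re.compile)

matches = list(get_matches('INV-42', config_sheet))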
Пример #36
0
def df_rich_text(df: pd.DataFrame) -> str:
    return rich_text_table(
        df.itertuples(index=False),
        column_headers=df.columns,
        row_headers=df.index
    )
Пример #37
0
#pmml = minidom.parse('single_audit_logreg.pmml')
pmml = minidom.parse('lr.pmml')

root = pmml.documentElement

model = root.getElementsByTagName('GeneralRegressionModel')[0]
nameNodeList = model.getElementsByTagName("RegressionTable")

predictors = nameNodeList[0].getElementsByTagName("NumericPredictor")

for node in predictors:
    parent = node.parentNode
    parent.removeChild(node)

for row in df.itertuples():
    #print(type(row.Index))
    #print(row._1)
    if row.Index == 'intercept':
        nameNodeList[0].setAttribute('intercept', str(row._1))
        break

for index, row in df.iterrows():
    if index == 'intercept':
        continue
    newEle = pmml.createElement("NumericPredictor")
    newEle.setAttribute("name", index)
    newEle.setAttribute("exponent", "1")
    newEle.setAttribute("coefficient", str(row[0]))
    nameNodeList[0].appendChild(newEle)
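
The two loops above expect df to be a coefficient frame indexed by predictor name, with an extra 'intercept' row and a single unnamed column (so itertuples exposes the value as row._1 and iterrows as row[0]). A hypothetical frame of that shape, with invented coefficients:

import pandas as pd

# Hypothetical regression coefficients; the single positional column matches
# the row._1 / row[0] access used above.
df = pd.DataFrame([0.35, -1.2, 4.7], index=['age', 'income', 'intercept'])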
Пример #38
0
class Iteration:
    # mem_itertuples_* benchmarks are slow
    timeout = 120

    def setup(self):
        N = 1000
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.df2 = DataFrame(np.random.randn(N * 50, 10))
        self.df3 = DataFrame(np.random.randn(N, 5 * N),
                             columns=['C' + str(c) for c in range(N * 5)])
        self.df4 = DataFrame(np.random.randn(N * 1000, 10))

    def time_iteritems(self):
        # (monitor no-copying behaviour)
        if hasattr(self.df, '_item_cache'):
            self.df._item_cache.clear()
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_cached(self):
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_indexing(self):
        for col in self.df3:
            self.df3[col]

    def time_itertuples_start(self):
        self.df4.itertuples()

    def time_itertuples_read_first(self):
        next(self.df4.itertuples())

    def time_itertuples(self):
        for row in self.df4.itertuples():
            pass

    def time_itertuples_to_list(self):
        list(self.df4.itertuples())

    def mem_itertuples_start(self):
        return self.df4.itertuples()

    def peakmem_itertuples_start(self):
        self.df4.itertuples()

    def mem_itertuples_read_first(self):
        return next(self.df4.itertuples())

    def peakmem_itertuples(self):
        for row in self.df4.itertuples():
            pass

    def mem_itertuples_to_list(self):
        return list(self.df4.itertuples())

    def peakmem_itertuples_to_list(self):
        list(self.df4.itertuples())

    def time_itertuples_raw_start(self):
        self.df4.itertuples(index=False, name=None)

    def time_itertuples_raw_read_first(self):
        next(self.df4.itertuples(index=False, name=None))

    def time_itertuples_raw_tuples(self):
        for row in self.df4.itertuples(index=False, name=None):
            pass

    def time_itertuples_raw_tuples_to_list(self):
        list(self.df4.itertuples(index=False, name=None))

    def mem_itertuples_raw_start(self):
        return self.df4.itertuples(index=False, name=None)

    def peakmem_itertuples_raw_start(self):
        self.df4.itertuples(index=False, name=None)

    def peakmem_itertuples_raw_read_first(self):
        next(self.df4.itertuples(index=False, name=None))

    def peakmem_itertuples_raw(self):
        for row in self.df4.itertuples(index=False, name=None):
            pass

    def mem_itertuples_raw_to_list(self):
        return list(self.df4.itertuples(index=False, name=None))

    def peakmem_itertuples_raw_to_list(self):
        list(self.df4.itertuples(index=False, name=None))

    def time_iterrows(self):
        for row in self.df.iterrows():
            pass
Пример #39
0
    def plot_genes(self, ax, gr: GenomeRange, ov_genes: pd.DataFrame):
        properties = self.properties
        # bed_type
        self.properties['bed_type'] = properties[
            'bed_type'] or self.infer_bed_type(ov_genes)
        # as min_score and max_score change every plot, we compute them for every plot
        min_score, max_score = properties['min_score'], properties['max_score']
        has_score_col = properties['bed_type'] in ('bed6', 'bed9', 'bed12')
        if has_score_col and len(ov_genes):
            min_score = ov_genes['score'].min() if min_score == 'inf' else min_score
            max_score = ov_genes['score'].max() if max_score == '-inf' else max_score
        min_score, max_score = float(min_score), float(max_score)

        # set colormap
        if self.colormap is not None:
            norm = matplotlib.colors.Normalize(vmin=min_score, vmax=max_score)
            cmap = matplotlib.cm.get_cmap(properties['color'])
            self.colormap = matplotlib.cm.ScalarMappable(norm=norm, cmap=cmap)
        if properties['color'] == 'bed_rgb' and properties['bed_type'] not in [
                'bed12', 'bed9'
        ]:
            log.warning(
                "*WARNING* Color set to 'bed_rgb', but bed file does not have the rgb field. The color has "
                "been set to {}".format(self.COLOR))
            self.properties['color'] = self.COLOR
            self.colormap = None

        self.counter = 0
        self.small_relative = 0.004 * (gr.end - gr.start)
        self.get_length_w(ax.get_figure().get_figwidth(), gr.start, gr.end)
        # turn labels off when too many intervals are visible.
        if properties['labels'] == 'on' and len(ov_genes) > 60:
            self.is_draw_labels = False

        num_rows = properties['num_rows']
        max_num_row_local = 1
        max_ypos = 0
        # check for the number of other intervals that overlap
        #    with the given interval
        #            1         2
        #  012345678901234567890123456
        #  1=========       4=========
        #       2=========
        #         3============
        #
        # for 1 row_last_position = [9]
        # for 2 row_last_position = [9, 14]
        # for 3 row_last_position = [9, 14, 19]
        # for 4 row_last_position = [26, 14, 19]

        row_last_position = [
        ]  # each entry in this list contains the end position
        # of genomic interval. The list index is the row
        # in which the genomic interval was plotted.
        # Any new genomic interval that wants to be plotted,
        # knows the row to use by finding the list index that
        # is larger than its start

        # check for overlapping genes including
        # label size (if plotted)

        for bed in ov_genes.itertuples():
            """
            BED12 gene format with exon locations at the end
            chrX    20850   23076   CG17636-RA      0       -       20850   23017   0       3       946,765,64,     0,1031,2162,

            BED9
            bed with rgb at end
            chr2L   0       70000   ID_5    0.26864549832   .       0       70000   51,160,44

            BED6
            bed without rgb
            chr2L   0       70000   ID_5    0.26864549832   .

            BED3
            bed with only intervals
            chr2L  0        70000
            """
            self.counter += 1

            if self.is_draw_labels:
                num_name_characters = len(
                    bed.name
                ) + 2  # +2 to account for a space before and after the name
                bed_extended_end = int(bed.end +
                                       (num_name_characters * self.len_w))
            else:
                bed_extended_end = (bed.end + 2 * self.small_relative)

            # get smallest free row
            if not row_last_position:
                free_row = 0
                row_last_position.append(bed_extended_end)
            else:
                # get list of rows that are less than bed.start, then take the min
                idx_list = [
                    idx for idx, value in enumerate(row_last_position)
                    if value < bed.start
                ]
                if len(idx_list):
                    free_row = min(idx_list)
                    row_last_position[free_row] = bed_extended_end
                else:
                    free_row = len(row_last_position)
                    row_last_position.append(bed_extended_end)

            rgb, edgecolor = self.get_rgb_and_edge_color(bed)

            ypos = self.get_y_pos(free_row)

            # do not plot if the maximum interval rows to plot is reached
            if num_rows and free_row >= float(num_rows):
                continue

            if free_row > max_num_row_local:
                max_num_row_local = free_row
            if ypos > max_ypos:
                max_ypos = ypos

            if properties['bed_type'] == 'bed12':
                if properties['gene_style'] == 'flybase':
                    self.draw_gene_with_introns_flybase_style(
                        ax, bed, ypos, rgb, edgecolor)
                else:
                    self.draw_gene_with_introns(ax, bed, ypos, rgb, edgecolor)
            else:
                self.draw_gene_simple(ax, bed, ypos, rgb, edgecolor)

            if self.is_draw_labels and bed.start > gr.start and bed.end < gr.end:
                ax.text(bed.end + self.small_relative,
                        ypos + (float(properties['interval_height']) / 2),
                        bed.name,
                        horizontalalignment='left',
                        verticalalignment='center',
                        fontproperties=self.fp)

        if self.counter == 0:
            log.warning(
                f"*Warning* No intervals were found for file {properties['file']} "
                f"in Track \'{properties['name']}\' for the interval plotted ({gr}).\n"
            )

        ymax = 0
        if num_rows:
            ymin = float(num_rows) * self.row_scale
        else:
            ymin = max_ypos + properties['interval_height']

        log.debug("ylim {},{}".format(ymin, ymax))
        # the axis is inverted (thus, ymax < ymin)
        ax.set_ylim(ymin, ymax)

        if properties['display'] == 'domain':
            ax.set_ylim(-5, 205)
        elif properties['display'] == 'collapsed':
            ax.set_ylim(-5, 105)

        ax.set_xlim(gr.start, gr.end)
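
The row_last_position bookkeeping above is a small greedy packing scheme: each interval goes to the lowest row whose last end lies left of the interval's start. A standalone sketch of just that placement step, using intervals that roughly match the 1..4 example in the comment block above:

# Greedy row placement (intervals are illustrative).
intervals = [(0, 9), (5, 14), (7, 19), (16, 26)]

row_last_position = []      # last occupied end position per row
placement = []
for start, end in intervals:
    free_rows = [i for i, last_end in enumerate(row_last_position) if last_end < start]
    if free_rows:
        row = min(free_rows)
        row_last_position[row] = end
    else:
        row = len(row_last_position)
        row_last_position.append(end)
    placement.append(row)

print(placement)            # [0, 1, 2, 0]
print(row_last_position)    # [26, 14, 19], as in the comment above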
Пример #40
0
def pandas_to_ped(
        ped_pd: pd.DataFrame
):
    """
    Creates a Hail Pedigree object from trios stored as rows in a DataFrame.
    Input columns should contain 'fam_id', 's', 'is_female', 'pat_id', 'mat_id'

    :param DataFrame ped_pd: Input DataFrame
    :return: Pedigree
    :rtype: Pedigree
    """
    return hl.Pedigree([hl.Trio(s=row.s, is_female=row.is_female, pat_id=row.pat_id, mat_id=row.mat_id, fam_id=str(row.fam_id)) for row in ped_pd.itertuples()])
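
A minimal, hypothetical call of pandas_to_ped above; the trio values are invented and only illustrate the required columns (Hail must be importable as hl for the call to work).

import pandas as pd

# Illustrative trios; column names match the docstring above.
trios_pd = pd.DataFrame({
    'fam_id': ['fam1', 'fam1'],
    's': ['child1', 'child2'],
    'is_female': [True, False],
    'pat_id': ['dad1', 'dad1'],
    'mat_id': ['mom1', 'mom1'],
})
pedigree = pandas_to_ped(trios_pd)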
Пример #41
0
class Iteration:
    # mem_itertuples_* benchmarks are slow
    timeout = 120

    def setup(self):
        N = 1000
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.df2 = DataFrame(np.random.randn(N * 50, 10))
        self.df3 = DataFrame(np.random.randn(N, 5 * N),
                             columns=["C" + str(c) for c in range(N * 5)])
        self.df4 = DataFrame(np.random.randn(N * 1000, 10))

    def time_items(self):
        # (monitor no-copying behaviour)
        if hasattr(self.df, "_item_cache"):
            self.df._item_cache.clear()
        for name, col in self.df.items():
            pass

    def time_items_cached(self):
        for name, col in self.df.items():
            pass

    def time_iteritems_indexing(self):
        for col in self.df3:
            self.df3[col]

    def time_itertuples_start(self):
        self.df4.itertuples()

    def time_itertuples_read_first(self):
        next(self.df4.itertuples())

    def time_itertuples(self):
        for row in self.df4.itertuples():
            pass

    def time_itertuples_to_list(self):
        list(self.df4.itertuples())

    def mem_itertuples_start(self):
        return self.df4.itertuples()

    def peakmem_itertuples_start(self):
        self.df4.itertuples()

    def mem_itertuples_read_first(self):
        return next(self.df4.itertuples())

    def peakmem_itertuples(self):
        for row in self.df4.itertuples():
            pass

    def mem_itertuples_to_list(self):
        return list(self.df4.itertuples())

    def peakmem_itertuples_to_list(self):
        list(self.df4.itertuples())

    def time_itertuples_raw_start(self):
        self.df4.itertuples(index=False, name=None)

    def time_itertuples_raw_read_first(self):
        next(self.df4.itertuples(index=False, name=None))

    def time_itertuples_raw_tuples(self):
        for row in self.df4.itertuples(index=False, name=None):
            pass

    def time_itertuples_raw_tuples_to_list(self):
        list(self.df4.itertuples(index=False, name=None))

    def mem_itertuples_raw_start(self):
        return self.df4.itertuples(index=False, name=None)

    def peakmem_itertuples_raw_start(self):
        self.df4.itertuples(index=False, name=None)

    def peakmem_itertuples_raw_read_first(self):
        next(self.df4.itertuples(index=False, name=None))

    def peakmem_itertuples_raw(self):
        for row in self.df4.itertuples(index=False, name=None):
            pass

    def mem_itertuples_raw_to_list(self):
        return list(self.df4.itertuples(index=False, name=None))

    def peakmem_itertuples_raw_to_list(self):
        list(self.df4.itertuples(index=False, name=None))

    def time_iterrows(self):
        for row in self.df.iterrows():
            pass
Пример #42
0
    def _write_dataframe_kafka(
        self,
        feature_group: FeatureGroup,
        dataframe: pd.DataFrame,
        offline_write_options: dict,
    ):
        # setup kafka producer
        producer = Producer(self._get_kafka_config(offline_write_options))

        # setup complex feature writers
        feature_writers = {
            feature: self._get_encoder_func(
                feature_group._get_feature_avro_schema(feature)
            )
            for feature in feature_group.get_complex_features()
        }

        # setup row writer function
        writer = self._get_encoder_func(feature_group._get_encoded_avro_schema())

        def acked(err, msg):
            if err is not None:
                print("Failed to deliver message: %s: %s" % (str(msg), str(err)))

        # loop over rows
        for r in dataframe.itertuples(index=False):
            # itertuples returns a Python NamedTuple; to be able to serialize it using
            # avro, create a copy of the row by converting it to a dict, which preserves datatypes
            row = r._asdict()

            # transform special data types
            # here we might need to handle also timestamps and other complex types
            # possible optimization: make it based on type so we don't need to loop over
            # all keys in the row
            for k in row.keys():
                # for avro to be able to serialize them, they need to be python data types
                if isinstance(row[k], np.ndarray):
                    row[k] = row[k].tolist()
                if isinstance(row[k], pd.Timestamp):
                    row[k] = row[k].to_pydatetime()

            # encode complex features
            row = self._encode_complex_features(feature_writers, row)

            # encode feature row
            with BytesIO() as outf:
                writer(row, outf)
                encoded_row = outf.getvalue()

            # assemble key
            key = "".join([str(row[pk]) for pk in sorted(feature_group.primary_key)])

            while True:
                # if BufferError is thrown, we can be sure the message hasn't been sent, so we retry
                try:
                    # produce
                    producer.produce(
                        topic=feature_group._online_topic_name,
                        key=key,
                        value=encoded_row,
                        callback=acked
                        if offline_write_options.get("debug_kafka", False)
                        else None,
                    )

                    # Trigger internal callbacks to empty op queue
                    producer.poll(0)
                    break
                except BufferError as e:
                    if offline_write_options.get("debug_kafka", False):
                        print("Caught: {}".format(e))
                    # backoff for 1 second
                    producer.poll(1)

        # make sure producer blocks and everything is delivered
        producer.flush()

        # start backfilling job
        job_name = "{fg_name}_{version}_offline_fg_backfill".format(
            fg_name=feature_group.name, version=feature_group.version
        )
        job = self._job_api.get(job_name)

        if offline_write_options is not None and offline_write_options.get(
            "start_offline_backfill", True
        ):
            print("Launching offline feature group backfill job...")
            self._job_api.launch(job_name)
            print(
                "Backfill Job started successfully, you can follow the progress at \n{}".format(
                    self._get_job_url(job.href)
                )
            )
            self._wait_for_job(job, offline_write_options)

        return job
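
The per-row type handling above (numpy arrays to lists, pandas Timestamps to datetimes) can be read as a generic normalization step before Avro encoding; a standalone sketch of just that conversion, independent of the Kafka plumbing (the column names are made up):

import numpy as np
import pandas as pd

def normalize_row(row: dict) -> dict:
    # Convert numpy/pandas values to plain Python types (sketch of the loop above).
    out = {}
    for key, value in row.items():
        if isinstance(value, np.ndarray):
            out[key] = value.tolist()
        elif isinstance(value, pd.Timestamp):
            out[key] = value.to_pydatetime()
        else:
            out[key] = value
    return out

# One itertuples() row converted to a dict and normalized.
df = pd.DataFrame({'id': [1], 'ts': [pd.Timestamp('2021-01-01')], 'vec': [np.array([1, 2])]})
print(normalize_row(next(df.itertuples(index=False))._asdict()))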
Пример #43
0
    def load_guess_score_map(guess_df: pd.DataFrame) -> defaultdict:
        guess_score_map = defaultdict(dict)
        for row in guess_df.itertuples():
            guess_score_map[row.guesser][(row.qnum, row.sentence, row.token, row.guess)] = row.score

        return guess_score_map