def test_var_std(self): string_series = tm.makeStringSeries().rename('series') datetime_series = tm.makeTimeSeries().rename('ts') alt = lambda x: np.std(x, ddof=1) self._check_stat_op('std', alt, string_series) alt = lambda x: np.var(x, ddof=1) self._check_stat_op('var', alt, string_series) result = datetime_series.std(ddof=4) expected = np.std(datetime_series.values, ddof=4) tm.assert_almost_equal(result, expected) result = datetime_series.var(ddof=4) expected = np.var(datetime_series.values, ddof=4) tm.assert_almost_equal(result, expected) # 1 - element series with ddof=1 s = datetime_series.iloc[[0]] result = s.var(ddof=1) assert pd.isna(result) result = s.std(ddof=1) assert pd.isna(result)
def enrich_company(company_dedup_list, company_scrapy_result, company_colnames):
    company_scrapy_verify = pd.DataFrame(columns=company_colnames)
    for index, company in company_dedup_list.iterrows():
        if company['db_New'] == False:
            continue
        sourceid = company['Source_ID']
        scrapy_list = company_scrapy_result[company_scrapy_result['Source_ID'] == sourceid]
        scrapy_best = scrapy_list[scrapy_list['Confidence'] == 0]
        # If there are multiple best matches, take the first one that has an address
        if len(scrapy_best) > 1:
            if len(scrapy_best[scrapy_best['地址'].notnull()]) > 1:
                scrapy_best = scrapy_best[scrapy_best['地址'].notnull()].iloc[0].to_frame().transpose()
            else:
                scrapy_best = scrapy_best.iloc[0].to_frame().transpose()
            company = enrich_scrapy(company, scrapy_best)
        # If there is no best match, collect companies without an address for manual verification
        elif len(scrapy_best) < 1:
            if pd.isna(company['Billing_Address']) and pd.isna(company['Billing_Address_CN']):
                company_scrapy_verify = company_scrapy_verify.append(company.to_frame().transpose())
        # Exactly one best match: enrich directly
        else:
            company = enrich_scrapy(company, scrapy_best)
        company_dedup_list[company_dedup_list['Source_ID'] == company['Source_ID']] = company.to_frame().transpose()
    company_dedup_list = validate_company(company_dedup_list)
    company_scrapy_verify = validate_company(company_scrapy_verify)
    return company_dedup_list, company_scrapy_verify
def test_constructor(self): # explicit construction index = Float64Index([1, 2, 3, 4, 5]) assert isinstance(index, Float64Index) expected = np.array([1, 2, 3, 4, 5], dtype='float64') tm.assert_numpy_array_equal(index.values, expected) index = Float64Index(np.array([1, 2, 3, 4, 5])) assert isinstance(index, Float64Index) index = Float64Index([1., 2, 3, 4, 5]) assert isinstance(index, Float64Index) index = Float64Index(np.array([1., 2, 3, 4, 5])) assert isinstance(index, Float64Index) assert index.dtype == float index = Float64Index(np.array([1., 2, 3, 4, 5]), dtype=np.float32) assert isinstance(index, Float64Index) assert index.dtype == np.float64 index = Float64Index(np.array([1, 2, 3, 4, 5]), dtype=np.float32) assert isinstance(index, Float64Index) assert index.dtype == np.float64 # nan handling result = Float64Index([np.nan, np.nan]) assert pd.isna(result.values).all() result = Float64Index(np.array([np.nan])) assert pd.isna(result.values).all() result = Index(np.array([np.nan])) assert pd.isna(result.values).all()
def test_isna_behavior(idx):
    # should not segfault GH5123
    # NOTE: if MI representation changes, may make sense to allow
    # isna(MI)
    msg = "isna is not defined for MultiIndex"
    with pytest.raises(NotImplementedError, match=msg):
        pd.isna(idx)
def test_constructor_inferred_fill_value(self, data, fill_value):
    result = SparseArray(data).fill_value
    if pd.isna(fill_value):
        assert pd.isna(result)
    else:
        assert result == fill_value
def test_idxmin(self): # test idxmin # _check_stat_op approach can not be used here because of isna check. string_series = tm.makeStringSeries().rename('series') # add some NaNs string_series[5:15] = np.NaN # skipna or no assert string_series[string_series.idxmin()] == string_series.min() assert pd.isna(string_series.idxmin(skipna=False)) # no NaNs nona = string_series.dropna() assert nona[nona.idxmin()] == nona.min() assert (nona.index.values.tolist().index(nona.idxmin()) == nona.values.argmin()) # all NaNs allna = string_series * np.nan assert pd.isna(allna.idxmin()) # datetime64[ns] s = Series(pd.date_range('20130102', periods=6)) result = s.idxmin() assert result == 0 s[0] = np.nan result = s.idxmin() assert result == 1
def test_minmax(self): for tz in self.tz: # monotonic idx1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], tz=tz) assert idx1.is_monotonic # non-monotonic idx2 = pd.DatetimeIndex(['2011-01-01', pd.NaT, '2011-01-03', '2011-01-02', pd.NaT], tz=tz) assert not idx2.is_monotonic for idx in [idx1, idx2]: assert idx.min() == Timestamp('2011-01-01', tz=tz) assert idx.max() == Timestamp('2011-01-03', tz=tz) assert idx.argmin() == 0 assert idx.argmax() == 2 for op in ['min', 'max']: # Return NaT obj = DatetimeIndex([]) assert pd.isna(getattr(obj, op)()) obj = DatetimeIndex([pd.NaT]) assert pd.isna(getattr(obj, op)()) obj = DatetimeIndex([pd.NaT, pd.NaT, pd.NaT]) assert pd.isna(getattr(obj, op)())
def test_inferred_dtype(dtype, fill_value):
    sparse_dtype = SparseDtype(dtype)
    result = sparse_dtype.fill_value
    if pd.isna(fill_value):
        assert pd.isna(result) and type(result) == type(fill_value)
    else:
        assert result == fill_value
def _check_fill(meth, op, a, b, fill_value=0): exp_index = a.index.union(b.index) a = a.reindex(exp_index) b = b.reindex(exp_index) amask = isna(a) bmask = isna(b) exp_values = [] for i in range(len(exp_index)): with np.errstate(all='ignore'): if amask[i]: if bmask[i]: exp_values.append(np.nan) continue exp_values.append(op(fill_value, b[i])) elif bmask[i]: if amask[i]: exp_values.append(np.nan) continue exp_values.append(op(a[i], fill_value)) else: exp_values.append(op(a[i], b[i])) result = meth(a, b, fill_value=fill_value) expected = Series(exp_values, exp_index) assert_series_equal(result, expected)
def test_minmax(self): # monotonic idx1 = TimedeltaIndex(['1 days', '2 days', '3 days']) assert idx1.is_monotonic # non-monotonic idx2 = TimedeltaIndex(['1 days', np.nan, '3 days', 'NaT']) assert not idx2.is_monotonic for idx in [idx1, idx2]: assert idx.min() == Timedelta('1 days') assert idx.max() == Timedelta('3 days') assert idx.argmin() == 0 assert idx.argmax() == 2 for op in ['min', 'max']: # Return NaT obj = TimedeltaIndex([]) assert pd.isna(getattr(obj, op)()) obj = TimedeltaIndex([pd.NaT]) assert pd.isna(getattr(obj, op)()) obj = TimedeltaIndex([pd.NaT, pd.NaT, pd.NaT]) assert pd.isna(getattr(obj, op)())
def test_replace2(self): N = 100 ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), dtype=object) ser[:5] = np.nan ser[6:10] = 'foo' ser[20:30] = 'bar' # replace list with a single value rs = ser.replace([np.nan, 'foo', 'bar'], -1) assert (rs[:5] == -1).all() assert (rs[6:10] == -1).all() assert (rs[20:30] == -1).all() assert (pd.isna(ser[:5])).all() # replace with different values rs = ser.replace({np.nan: -1, 'foo': -2, 'bar': -3}) assert (rs[:5] == -1).all() assert (rs[6:10] == -2).all() assert (rs[20:30] == -3).all() assert (pd.isna(ser[:5])).all() # replace with different values with 2 lists rs2 = ser.replace([np.nan, 'foo', 'bar'], [-1, -2, -3]) tm.assert_series_equal(rs, rs2) # replace inplace ser.replace([np.nan, 'foo', 'bar'], -1, inplace=True) assert (ser[:5] == -1).all() assert (ser[6:10] == -1).all() assert (ser[20:30] == -1).all()
def diff_small():
    df_ret = pd.read_excel('ret.xlsx')
    df_src = pd.read_excel('src.xlsx')
    # sort_values returns a new frame; assign the result so the rows are actually sorted
    df_ret = df_ret.sort_values(by='id_card_number')
    df_ret_cols = list(df_ret.columns.values)
    id_nums = df_ret['id_card_number']
    df_src_valid = df_src[df_src['id_card_number'].isin(id_nums)]
    df_src_valid = df_src_valid.sort_values(by='id_card_number')
    exp = 0.00001
    discard_cols = [
        'id_card_number',
        'score',
        'gender',
        'id_number_province',
    ]
    float_cols = [
        'pdl_credit_24'
    ]
    cols_dict = {
        'tongdun': ['tongdun_status', 'tongdun_25', 'tongdun_4', 'tongdun_14',
                    'tongdun_41', 'tongdun_259', 'tongdun_120', 'tongdun_152',
                    'tongdun_87', 'tongdun_136'],
        'call_record': ['call_record_600', 'call_record_441'],
        'hj': ['hj_3y_xfnl_5', 'hj_3y_xfnl_score'],
        'contact': ['contact_10', 'contact_11'],
        'ei': ['education'],
        'pdl': ['pdl_credit_24'],
    }
    col_data_dict = {col: [] for cols in cols_dict.values() for col in cols}
    for col in df_ret_cols:
        if col in discard_cols:
            continue
        ret_vals = list(df_ret[col])
        src_vals = list(df_src_valid[col])
        for i, id_num in enumerate(id_nums):
            is_ret_vals_na = isna(ret_vals[i])
            is_src_vals_na = isna(src_vals[i])
            if is_ret_vals_na and is_src_vals_na:
                continue
            if is_ret_vals_na or is_src_vals_na:
                col_data_dict.get(col).append(id_num)
            else:
                if col in float_cols:
                    if abs(ret_vals[i] - src_vals[i]) > exp:
                        col_data_dict.get(col).append(id_num)
                else:
                    if ret_vals[i] != src_vals[i]:
                        col_data_dict.get(col).append(id_num)
    for key, cols in cols_dict.items():
        diff_set = set()
        for col in cols:
            diff_set |= set(col_data_dict[col])
        print('ids of %s is %s' % (key, sorted(list(diff_set))))
        print('count of %s is %s' % (key, len(diff_set)))
def test_NaT_scalar(self):
    series = Series([0, 1000, 2000, pd._libs.iNaT], dtype='period[D]')
    val = series[3]
    assert pd.isna(val)
    series[2] = val
    assert pd.isna(series[2])
def test_NaT_scalar(self):
    series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]')
    val = series[3]
    assert isna(val)
    series[2] = val
    assert isna(series[2])
def test_where_unsafe(): # see gh-9731 s = Series(np.arange(10), dtype="int64") values = [2.5, 3.5, 4.5, 5.5] mask = s > 5 expected = Series(lrange(6) + values, dtype="float64") s[mask] = values assert_series_equal(s, expected) # see gh-3235 s = Series(np.arange(10), dtype='int64') mask = s < 5 s[mask] = lrange(2, 7) expected = Series(lrange(2, 7) + lrange(5, 10), dtype='int64') assert_series_equal(s, expected) assert s.dtype == expected.dtype s = Series(np.arange(10), dtype='int64') mask = s > 5 s[mask] = [0] * 4 expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype='int64') assert_series_equal(s, expected) s = Series(np.arange(10)) mask = s > 5 def f(): s[mask] = [5, 4, 3, 2, 1] pytest.raises(ValueError, f) def f(): s[mask] = [0] * 5 pytest.raises(ValueError, f) # dtype changes s = Series([1, 2, 3, 4]) result = s.where(s > 2, np.nan) expected = Series([np.nan, np.nan, 3, 4]) assert_series_equal(result, expected) # GH 4667 # setting with None changes dtype s = Series(range(10)).astype(float) s[8] = None result = s[8] assert isna(result) s = Series(range(10)).astype(float) s[s > 8] = None result = s[isna(s)] expected = Series(np.nan, index=[9]) assert_series_equal(result, expected)
def test_minmax_nat(self, op): # Return NaT obj = DatetimeIndex([]) assert pd.isna(getattr(obj, op)()) obj = DatetimeIndex([pd.NaT]) assert pd.isna(getattr(obj, op)()) obj = DatetimeIndex([pd.NaT, pd.NaT, pd.NaT]) assert pd.isna(getattr(obj, op)())
def scalar_add(a, b):
    # TODO: should really be a type-specific NA
    if pd.isna(a) or pd.isna(b):
        return np.nan
    if is_integer(a):
        a = int(a)
    elif is_integer(b):
        b = int(b)
    return a + b
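# Minimal, hypothetical usage sketch for scalar_add above. Assumptions: pandas and
# numpy are imported as pd/np, and `is_integer` (used by scalar_add) comes from
# pandas.api.types.
import numpy as np
import pandas as pd
from pandas.api.types import is_integer

assert scalar_add(2, 3) == 5            # plain addition when both operands are present
assert np.isnan(scalar_add(np.nan, 3))  # any missing operand collapses to np.nan
assert np.isnan(scalar_add(1, None))    # None is also treated as missing by pd.isna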
def test_operators_na_handling(self): from decimal import Decimal from datetime import date s = Series([Decimal('1.3'), Decimal('2.3')], index=[date(2012, 1, 1), date(2012, 1, 2)]) result = s + s.shift(1) result2 = s.shift(1) + s assert isna(result[0]) assert isna(result2[0])
def NanCleanerApply(x):
    # @param x is a column of the dataset
    maskNan = pd.isna(x)
    maskNotNan = pd.notna(x)
    notNan = x[maskNotNan]
    nan = x[maskNan]
    avg = int(np.average(notNan))
    for i in range(0, len(x)):
        if pd.isna(x[i]):
            x[i] = avg
    return x
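# A vectorized alternative sketch for the cleaner above, using pandas built-ins.
# The helper name is ours (not from the original); it assumes x is a numeric pd.Series.
# fillna replaces every missing value with the truncated mean in a single call.
import numpy as np
import pandas as pd

def nan_cleaner_vectorized(x: pd.Series) -> pd.Series:
    avg = int(np.average(x.dropna()))
    return x.fillna(avg)

# Example: pd.Series([1.0, np.nan, 4.0]) -> [1.0, 2.0, 4.0] (mean of 1 and 4 is 2.5, truncated to 2)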
def test_map_int(self): left = Series({'a': 1., 'b': 2., 'c': 3., 'd': 4}) right = Series({1: 11, 2: 22, 3: 33}) assert left.dtype == np.float_ assert issubclass(right.dtype.type, np.integer) merged = left.map(right) assert merged.dtype == np.float_ assert isna(merged['d']) assert not isna(merged['c'])
def test_reindex_boolean(self): frame = DataFrame(np.ones((10, 2), dtype=bool), index=np.arange(0, 20, 2), columns=[0, 2]) reindexed = frame.reindex(np.arange(10)) assert reindexed.values.dtype == np.object_ assert isna(reindexed[0][1]) reindexed = frame.reindex(columns=lrange(3)) assert reindexed.values.dtype == np.object_ assert isna(reindexed[1]).all()
def test_series_setitem( self, multiindex_year_month_day_dataframe_random_data): ymd = multiindex_year_month_day_dataframe_random_data s = ymd['A'] s[2000, 3] = np.nan assert isna(s.values[42:65]).all() assert notna(s.values[:42]).all() assert notna(s.values[65:]).all() s[2000, 3, 10] = np.nan assert isna(s[49])
def getConfidence(company_scrapy): company_search_key = company_scrapy['搜索词'] company_response_name = company_scrapy['公司名称'] if pd.isna(company_search_key) or pd.isna(company_response_name): return None elif hasCHN(company_search_key) and hasCHN(company_response_name): company_search_key = vd.extract_keyword(company_search_key) company_response_name = vd.extract_keyword(company_response_name) return lv.distance(company_search_key,company_response_name) elif not hasCHN(company_search_key) and not hasCHN(company_response_name): return lvEN(company_search_key, company_response_name) else: return None
def test_max_min(self, start, stop, step): # GH17607 idx = RangeIndex(start, stop, step) expected = idx._int64index.max() result = idx.max() assert result == expected expected = idx._int64index.min() result = idx.min() assert result == expected # empty idx = RangeIndex(start, stop, -step) assert isna(idx.max()) assert isna(idx.min())
def test_na_handling(self): arr = np.arange(0, 0.75, 0.01) arr[::3] = np.nan result = cut(arr, 4) result_arr = np.asarray(result) ex_arr = np.where(isna(arr), np.nan, result_arr) tm.assert_almost_equal(result_arr, ex_arr) result = cut(arr, 4, labels=False) ex_result = np.where(isna(arr), np.nan, result) tm.assert_almost_equal(result, ex_result)
def test_isna(self, data_missing): expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value)) expected = SparseArray([True, False], dtype=expected_dtype) result = pd.isna(data_missing) self.assert_equal(result, expected) result = pd.Series(data_missing).isna() expected = pd.Series(expected) self.assert_series_equal(result, expected) # GH 21189 result = pd.Series(data_missing).drop([0, 1]).isna() expected = pd.Series([], dtype=expected_dtype) self.assert_series_equal(result, expected)
def value_counts(self, dropna=True): """ Returns a Series containing counts of unique values. Parameters ---------- dropna : boolean, default True Don't include counts of NaN, even if NaN is in sp_values. Returns ------- counts : Series """ keys, counts = algos._value_counts_arraylike(self.sp_values, dropna=dropna) fcounts = self.sp_index.ngaps if fcounts > 0: if self._null_fill_value and dropna: pass else: if self._null_fill_value: mask = pd.isna(keys) else: mask = keys == self.fill_value if mask.any(): counts[mask] += fcounts else: keys = np.insert(keys, 0, self.fill_value) counts = np.insert(counts, 0, fcounts) if not isinstance(keys, pd.Index): keys = pd.Index(keys) result = pd.Series(counts, index=keys) return result
def _try_convert_to_date(self, data):
    """
    try to parse an ndarray-like into a date column

    try to coerce object in epoch/iso formats and
    integer/float in epoch formats, return a boolean if parsing
    was successful
    """
    # no conversion on empty
    if not len(data):
        return data, False

    new_data = data
    if new_data.dtype == 'object':
        try:
            new_data = data.astype('int64')
        except (TypeError, ValueError, OverflowError):
            pass

    # ignore numbers that are out of range
    if issubclass(new_data.dtype.type, np.number):
        in_range = (isna(new_data.values) | (new_data > self.min_stamp) |
                    (new_data.values == iNaT))
        if not in_range.all():
            return data, False

    date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
    for date_unit in date_units:
        try:
            new_data = to_datetime(new_data, errors='raise', unit=date_unit)
        except ValueError:
            continue
        except Exception:
            break
        return new_data, True
    return data, False
def test_cut_out_of_bounds():
    arr = np.random.randn(100)
    result = cut(arr, [-1, 0, 1])
    mask = isna(result)
    ex_mask = (arr < -1) | (arr > 1)
    tm.assert_numpy_array_equal(mask, ex_mask)
def test_identity(klass): assert klass(None) is NaT result = klass(np.nan) assert result is NaT result = klass(None) assert result is NaT result = klass(iNaT) assert result is NaT result = klass(np.nan) assert result is NaT result = klass(float('nan')) assert result is NaT result = klass(NaT) assert result is NaT result = klass('NaT') assert result is NaT assert isna(klass('nat'))
""" import pandas as pd import os import matplotlib.pyplot as plt import scipy.stats as spst results = pd.read_csv(os.path.join('datasets', 'bitzounis_results.csv'), decimal='.') stages = ['circuit', 'carmona', 'sprint', 'laguna'] for s in stages: # max fast_max = results[s + '_' + 'fast_max'] fast_max = fast_max[~pd.isna(fast_max)] slow_max = results[s + '_' + 'slow_max'] slow_max = slow_max[~pd.isna(slow_max)] w_max = spst.wilcoxon(fast_max, slow_max)[1] plt.clf() plt.boxplot([slow_max, fast_max]) plt.title(str(w_max)) plt.savefig(os.path.join('figs_bitz', s + '_' + 'max.png'), dpi=300) # avg fast_avg = results[s + '_' + 'fast_avg'] fast_avg = fast_avg[~pd.isna(fast_avg)]
from SVM import dataLoading, data_preprocessing, missing_data_processing
from math import exp
import math
import tensorflow as tf
import pandas as pd
import numpy as np
import os

# Read in the files
from sklearn.model_selection import train_test_split

train, test = dataLoading()
x = data_preprocessing(train, test)

print(test[pd.isna(test["Fare"])])  # test[]

feature_names = [
    "Pclass", "Sex", "Fare", "Embarked_C", "Embarked_Q", "Embarked_S"
]
X_train = train[feature_names]
Y_train = train["Survived"]
X_test = test[feature_names]

# ########################################################################################################################
# # Build the learning model
# # DT = basic example
def test_value_counts_datetime64(self): klasses = [Index, Series] for klass in klasses: # GH 3002, datetime64[ns] # don't test names though txt = "\n".join([ 'xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG', 'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM' ]) f = StringIO(txt) df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"]) s = klass(df['dt'].copy()) s.name = None idx = pd.to_datetime([ '2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z', '2009-01-01 00:00:00X' ]) expected_s = Series([3, 2, 1], index=idx) tm.assert_series_equal(s.value_counts(), expected_s) expected = np_array_datetime64_compat([ '2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z', '2008-09-09 00:00:00Z' ], dtype='datetime64[ns]') if isinstance(s, Index): tm.assert_index_equal(s.unique(), DatetimeIndex(expected)) else: tm.assert_numpy_array_equal(s.unique(), expected) assert s.nunique() == 3 # with NaT s = df['dt'].copy() s = klass([v for v in s.values] + [pd.NaT]) result = s.value_counts() assert result.index.dtype == 'datetime64[ns]' tm.assert_series_equal(result, expected_s) result = s.value_counts(dropna=False) expected_s[pd.NaT] = 1 tm.assert_series_equal(result, expected_s) unique = s.unique() assert unique.dtype == 'datetime64[ns]' # numpy_array_equal cannot compare pd.NaT if isinstance(s, Index): exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT]) tm.assert_index_equal(unique, exp_idx) else: tm.assert_numpy_array_equal(unique[:3], expected) assert pd.isna(unique[3]) assert s.nunique() == 3 assert s.nunique(dropna=False) == 4 # timedelta64[ns] td = df.dt - df.dt + timedelta(1) td = klass(td, name='dt') result = td.value_counts() expected_s = Series([6], index=[Timedelta('1day')], name='dt') tm.assert_series_equal(result, expected_s) expected = TimedeltaIndex(['1 days'], name='dt') if isinstance(td, Index): tm.assert_index_equal(td.unique(), expected) else: tm.assert_numpy_array_equal(td.unique(), expected.values) td2 = timedelta(1) + (df.dt - df.dt) td2 = klass(td2, name='dt') result2 = td2.value_counts() tm.assert_series_equal(result2, expected_s)
for row in dt[col]:
    vocab.append(str(row))
    # Create the list of tokens:
    token = sorted(set(vocab))
    # Define a dictionary with a numeric value for each token:
    dictionary = {elem: ind for ind, elem in enumerate(token)}
    # Apply the dictionary to the original column:
    return dictionary


# ### Handling NaN
# Create columns recording where values were missing.

# In[579]:

dt['cuis_nan'] = pd.isna(dt['cuis_style']).astype('uint8')
dt['price_nan'] = pd.isna(dt['price']).astype('uint8')


# ### Price:
# Determine all possible values in the column and assign them numeric identifiers.

# In[580]:

dt['price'].value_counts()

# In[581]:

# Create numeric features for the Price column:
token = tokenizer('price')
# Apply the resulting dictionary to the column
def test_dti_tz_nat(self, tzstr):
    idx = DatetimeIndex([Timestamp("2013-1-1", tz=tzstr), pd.NaT])
    assert isna(idx[1])
    assert idx[0].tzinfo is not None
def _highlight_null(v, null_color):
    return ('background-color: {color}'.format(
        color=null_color) if pd.isna(v) else '')
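# Illustrative (hypothetical) usage of the styling helper above with the pandas Styler API:
# Styler.applymap calls the function on every cell and expects a CSS string back.
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan], "b": [np.nan, 2.0]})
styled = df.style.applymap(lambda v: _highlight_null(v, "red"))
html = styled.to_html()  # NaN cells carry 'background-color: red' (to_html needs pandas >= 1.3)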
def dumper(self, collection): if collection == 'DXYArea': structured_results = list() results = self.db.dump(collection=collection) for province_dict in results: if province_dict.get('cities', None): for city_counter in range(len(province_dict['cities'])): city_dict = province_dict['cities'][city_counter] result = dict() result['provinceName'] = province_dict['provinceName'] result['cityName'] = city_dict['cityName'] result['province_confirmedCount'] = province_dict['confirmedCount'] result['province_suspectedCount'] = province_dict['suspectedCount'] result['province_curedCount'] = province_dict['curedCount'] result['province_deadCount'] = province_dict['deadCount'] result['city_confirmedCount'] = city_dict['confirmedCount'] result['city_suspectedCount'] = city_dict['suspectedCount'] result['city_curedCount'] = city_dict['curedCount'] result['city_deadCount'] = city_dict['deadCount'] result['updateTime'] = datetime.datetime.fromtimestamp(province_dict['updateTime']/1000) structured_results.append(result) df = pd.DataFrame(structured_results) df.to_csv( path_or_buf=os.path.join( os.path.split(os.path.realpath(__file__))[0], collection + '.csv'), index=False, encoding='utf_8_sig' ) else: df = pd.DataFrame(data=self.db.dump(collection=collection)) for time_type in time_types: if time_type in df.columns: df[time_type] = df[time_type].apply(lambda x: datetime.datetime.fromtimestamp(x / 1000) if not pd.isna(x) else '') df.to_csv( path_or_buf=os.path.join( os.path.split(os.path.realpath(__file__))[0], collection + '.csv'), index=False, encoding='utf_8_sig' )
def _ohlc(group):
    if isna(group).all():
        return np.repeat(nan, 4)
    return [group[0], group.max(), group.min(), group[-1]]
tqdm.write("Loading Data...") df_tt_raw = pd.read_csv(os.path.join(in_root, "train_test.csv")) df_tt_raw.sort_values(by=['id', 'date'], inplace=True) df_tt_raw.set_index(['id'], inplace=True) df_pred = pd.read_csv(os.path.join(in_root, "pred.csv")) df_pred.sort_values(by=['id', 'date'], inplace=True) df_pred.set_index(['id'], inplace=True) # parse command line arguments, loop over targets for target_name in argv[2:]: # display target variables tqdm.write("Target: {}...".format(target_name)) # remove rows missing target df_tt = df_tt_raw[~pd.isna(df_tt_raw["target_" + target_name])] # xgboost hyperparams with open( os.path.join(out_root, "params/", "params_" + target_name + ".txt"), 'r') as f: params = literal_eval(f.read()) # collect unique station_ids station_ids = df_tt.index.unique() # only process a sub sample station_ids = pd.Series(station_ids).sample(frac=float(argv[1]), replace=False).tolist() # initialize k-fold
def prepare_file_form_scraped_data(self): all_files = [] path = r'{0}{1}_{2}*{3}'.format(self.scrape_path, self.scrape_file, self.version, self.csv_ext) all_files += glob.glob(path, recursive=True) li = [] for filename in all_files: df = pd.read_csv(filename, delimiter=self.csv_delimiter, encoding=self.csv_encoding, dtype={'ID': object}) li.append(df) try: frame = pd.concat(li, axis=0, ignore_index=True) except Exception as e: frame = pd.DataFrame() print('No sach files to concatenate {0}'.format(path)) print(str(e)) else: self.save_results(frame, '_final') scrape = frame self.df_scrape = scrape try: df = pd.read_csv(r'{0}{1}'.format(self.start_path, self.obec_words_file), delimiter=self.csv_delimiter, encoding=self.csv_encoding) except Exception as e: df = pd.DataFrame() print( 'Something went wrong on reading ' \ '{0}{1}'.format(self.start_path, self.obec_words_file) ) print(str(e)) else: self.df_words = df try: dfc = pd.read_csv(r'{0}{1}'.format( self.start_path, self.obec_known_characteristics), delimiter=self.csv_delimiter, encoding=self.csv_encoding, dtype={'ID': object}) except Exception as e: dfc = pd.DataFrame() print( 'Something went wrong on reading ' \ '{0}{1}'.format(self.start_path, self.obec_known_characteristics) ) print(str(e)) else: self.df_obec = dfc if not 'Link position' in frame.columns: frame = frame[['ID', 'Suggested URL', 'URL to scrape'] + df['Word'].unique().tolist()] frame = frame.groupby([ 'ID', 'URL' ])[df['Word'].unique().tolist()].sum().reset_index() else: frame = frame[[ 'ID', 'Name', 'Suggested URL', 'URL to scrape', 'Link position', 'Has equal domain', 'Has Simple Suggested URL' ] + df['Word'].unique().tolist()] frame = frame.groupby([ 'ID', 'Name', 'Suggested URL', 'Link position' ])[df['Word'].unique().tolist() \ + [ 'Has equal domain', 'Has Simple Suggested URL' ]].sum().reset_index() frame[[ 'Has equal domain', 'Has Simple Suggested URL' ]] = frame[[ 'Has equal domain', 'Has Simple Suggested URL' ]].where(~(frame[[ 'Has equal domain', 'Has Simple Suggested URL' ]] > 0), other=1) # count to 1 frame[df['Word'].unique().tolist()] = frame[ df['Word'].unique().tolist()].where( ~(frame[df['Word'].unique().tolist()] > 0), other=1) # word count to 1 try: frame = frame.join(dfc.set_index('ID'), on='ID') except Exception as e: frame = pd.DataFrame() print( 'Something went wrong on joining ' \ '{0}{1}_{2}{3} and {4}{5}'.format( self.machine_learning_path, self.scrape_file, self.version, self.csv_ext, self.start_path, self.obec_known_characteristics) ) print(str(e)) else: frame['Known OBEC'] = frame['OBEC'].apply(\ lambda x: 0 if pd.isna(x) else 1) if not 'Link position' in frame.columns: frame['Link position'] = frame.apply( lambda row: 1 if '.'.join( urlparse(row['URL'].lower()).netloc.split( '.')[-2:]) == '.'.join( urlparse(str(row['OBEC']).lower()). netloc.split('.')[-2:]) else 2, axis=1) frame['sum'] = 0 for x in df['Word'].unique().tolist(): frame['sum'] = frame['sum'] + frame[x] if 'Has equal domain' in frame.columns: frame['sum'] = frame['sum'] \ + frame['Has equal domain'] if 'Has Simple Suggested URL' in frame.columns: frame['sum'] = frame['sum'] \ + frame['Has Simple Suggested URL'] frame['Score'] = frame['sum'] \ - frame['sum'] \ * frame['Link position'] / 100 # frame['Score'] = frame['sum'] - frame['sum']/100 self.df_ml = frame self.save_results(frame, '_ml_ready') return [frame, df, dfc, scrape]
def test_reindex_bool_pad(datetime_series):
    # fail
    ts = datetime_series[5:]
    bool_ts = Series(np.zeros(len(ts), dtype=bool), index=ts.index)
    filled_bool = bool_ts.reindex(datetime_series.index, method="pad")
    assert isna(filled_bool[:5]).all()
import os

import pandas as pd

SERVER_PORT = int(os.getenv("PORT") or 3000)
SLACK_BOT_TOKEN = os.getenv("SLACK_BOT_TOKEN")
SLACK_SIGNING_SECRET = os.getenv("SLACK_SIGNING_SECRET")
BOT_SERVICE_CHANNEL = os.getenv("BOT_SERVICE_CHANNEL")

slack_users = pd.read_csv(os.getenv("SLACK_USER_IDS"), index_col="tfs_name")
slack_users = {
    name: uid
    for name, uid in slack_users["id"].to_dict().items()
    if not pd.isna(name)
}

slack_ims = {}
def isna(self):
    nas = pd.isna(self._data.to_pandas())
    return type(self).from_scalars(nas)
def get_unique_values(series: pd.Series) -> List:
    """Returns a list of unique values in a series, including NaNs."""
    vals = list(sorted(series.dropna().unique()))
    if any(pd.isna(series)):
        vals.append(pd.NA)
    return vals
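# Quick illustrative check of get_unique_values above (the sample series is ours;
# assumes pandas >= 1.0 for pd.NA):
import numpy as np
import pandas as pd

s = pd.Series([3.0, 1.0, np.nan, 3.0])
print(get_unique_values(s))  # [1.0, 3.0, <NA>] -- sorted non-null uniques plus one NA marker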
def get_pyreadr_column_types(df): """ From a pandas data frame, get an OrderedDict with column name as key and pyreadr column type as value, and also a list with boolean values indicating if the column has missing values (np.nan). The pyreadr column types are needed for downstream processing. """ types = df.dtypes.values.tolist() columns = df.columns.values.tolist() result = OrderedDict() has_missing_values = [False] * len(columns) for indx, (col_name, col_type) in enumerate(zip(columns, types)): # recover original type for categories if type(col_type) is pd.core.dtypes.dtypes.CategoricalDtype: col_type = np.asarray(df[col_name]).dtype if col_type in int_types: result[col_name] = "INTEGER" elif col_type in float_types: result[col_name] = "NUMERIC" elif col_type == np.bool: result[col_name] = "LOGICAL" # np.datetime64[ns] elif col_type == np.dtype('<M8[ns]') or col_type == np.datetime64: result[col_name] = "DATETIME" missing = pd.isna(df[col_name]) if np.any(missing): has_missing_values[indx] = True elif col_type == np.object or col_type in int_mixed_types: missing = pd.isna(df[col_name]) if np.any(missing): has_missing_values[indx] = True if col_type in int_mixed_types: result[col_name] = "INTEGER" continue col = df[col_name].dropna() if len(col): curtype = type(col[0]) equal = col.apply(lambda x: type(x) == curtype) if not np.all(equal): result[col_name] = "OBJECT" continue else: result[col_name] = "LOGICAL" continue else: if col_type in int_mixed_types: result[col_name] = "INTEGER" continue curtype = type(df[col_name][0]) equal = df[col_name].apply(lambda x: type(x) == curtype) if not np.all(equal): result[col_name] = "OBJECT" continue if curtype in int_types: result[col_name] = "INTEGER" elif curtype in float_types: result[col_name] = "NUMERIC" elif curtype == np.bool: result[col_name] = "LOGICAL" elif curtype == str: result[col_name] = "CHARACTER" elif curtype == datetime.date: result[col_name] = "DATE" elif curtype == datetime.datetime: result[col_name] = "DATETIME" else: result[col_name] = "OBJECT" else: # generic object result[col_name] = "OBJECT" return result, has_missing_values
def opbyg_punktoversigt(
    navn: str,
    nyetablerede: pd.DataFrame,
    alle_punkter: Tuple[str, ...],
) -> pd.DataFrame:
    punktoversigt = pd.DataFrame(columns=list(ARKDEF_PUNKTOVERSIGT))
    fire.cli.print("Opbygger punktoversigt")

    # Extend punktoversigt so there is room for all points
    punktoversigt = punktoversigt.reindex(range(len(alle_punkter)))
    punktoversigt["Punkt"] = alle_punkter
    # Re-install the 'Punkt' column as the index column
    punktoversigt = punktoversigt.set_index("Punkt")

    nye_punkter = tuple(sorted(set(nyetablerede.index)))

    try:
        DVR90 = fire.cli.firedb.hent_srid("EPSG:5799")
    except KeyError:
        fire.cli.print("DVR90 (EPSG:5799) ikke fundet i srid-tabel",
                       bg="red", fg="white", err=True)
        sys.exit(1)

    for punkt in alle_punkter:
        if not pd.isna(punktoversigt.at[punkt, "Kote"]):
            continue
        if punkt in nye_punkter:
            continue

        fire.cli.print(f"Finder kote for {punkt}", fg="green")
        pkt = fire.cli.firedb.hent_punkt(punkt)

        # Dig out the current elevation (kote)
        kote = None
        for koord in pkt.koordinater:
            if koord.srid != DVR90:
                continue
            if koord.registreringtil is None:
                kote = koord
                break

        punktoversigt.at[punkt, "Fasthold"] = ""
        punktoversigt.at[punkt, "System"] = "DVR90"
        punktoversigt.at[punkt, "uuid"] = ""
        punktoversigt.at[punkt, "Udelad publikation"] = ""

        if kote is None:
            fire.cli.print(
                f"Ingen aktuel DVR90-kote fundet for {punkt}",
                bg="red",
                fg="white",
                err=True,
            )
            punktoversigt.at[punkt, "Kote"] = None
            punktoversigt.at[punkt, "σ"] = None
            punktoversigt.at[punkt, "Hvornår"] = None
        else:
            punktoversigt.at[punkt, "Kote"] = kote.z
            punktoversigt.at[punkt, "σ"] = kote.sz
            punktoversigt.at[punkt, "Hvornår"] = kote.t

        if pd.isna(punktoversigt.at[punkt, "Nord"]):
            punktoversigt.at[punkt, "Nord"] = pkt.geometri.koordinater[1]
            punktoversigt.at[punkt, "Øst"] = pkt.geometri.koordinater[0]

    # Newly established points are not in the database, so fetch any missing
    # elevations and location coordinates from the 'Nyetablerede punkter' sheet
    for punkt in nye_punkter:
        if pd.isna(punktoversigt.at[punkt, "Kote"]):
            punktoversigt.at[punkt, "Kote"] = None
        if pd.isna(punktoversigt.at[punkt, "Nord"]):
            punktoversigt.at[punkt, "Nord"] = nyetablerede.at[punkt, "Nord"]
        if pd.isna(punktoversigt.at[punkt, "Øst"]):
            punktoversigt.at[punkt, "Øst"] = nyetablerede.at[punkt, "Øst"]

    # Sanity-check the location coordinates
    for punkt in alle_punkter:
        λ, φ = normaliser_lokationskoordinat(punktoversigt.at[punkt, "Øst"],
                                             punktoversigt.at[punkt, "Nord"])
        punktoversigt.at[punkt, "Nord"] = φ
        punktoversigt.at[punkt, "Øst"] = λ

    # Reformat the dataframe so it is suitable for output
    return punktoversigt.reset_index()
columns=['kim','park','jung']) print('df:\n',df) print() filename = 'mynan.csv' table = pd.read_csv(filename,encoding='euc-kr', index_col=0) print('table:\n',table) print(type(table)) print(table.size) print(table.isna()) print() print(pd.isna(table)) print() print(table.notnull()) print() table2 = table.dropna() print('table2:\n', table2) print() table2 = table.dropna(how='any') print('table2(any):\n', table2) print() table2 = table.dropna(how='all') print('table2(all):\n', table2)
Goal: build a delivery business using robots that follow the routes found by our
algorithm, so the routes need to be optimized.
Files: Fourmiam.py
"""
import pandas
import networkx as nx
import matplotlib.pyplot as plt

map = pandas.read_csv('VOIES_NM.csv', nrows=5300, sep=",", encoding='latin-1')

for i in range(0, len(map)):
    # Check NaN
    if pandas.isna(map['TENANT'][i]) and not pandas.isna(map['ABOUTISSANT'][i]):
        map.loc[map.index[i], 'TENANT'] = i
    if not pandas.isna(map['TENANT'][i]) and pandas.isna(map['ABOUTISSANT'][i]):
        map.loc[map.index[i], 'ABOUTISSANT'] = i
    if not pandas.isna(map['BI_MIN'][i]):
        bi_min = map['BI_MIN'][i]
    else:
        bi_min = 1
    if not pandas.isna(map['BP_MIN'][i]):
        bp_min = map['BP_MIN'][i]
    else:
def preprocess(train): ## GameClock train['GameClock_sec'] = train['GameClock'].apply(strtoseconds) train["GameClock_minute"] = train["GameClock"].apply( lambda x: x.split(":")[0]).astype("object") ## Height train['PlayerHeight_dense'] = train['PlayerHeight'].apply( lambda x: 12 * int(x.split('-')[0]) + int(x.split('-')[1])) ## Time train['TimeHandoff'] = train['TimeHandoff'].apply( lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ")) train['TimeSnap'] = train['TimeSnap'].apply( lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ")) train['TimeDelta'] = train.apply( lambda row: (row['TimeHandoff'] - row['TimeSnap']).total_seconds(), axis=1) train['PlayerBirthDate'] = train['PlayerBirthDate'].apply( lambda x: datetime.datetime.strptime(x, "%m/%d/%Y")) ## Age seconds_in_year = 60 * 60 * 24 * 365.25 train['PlayerAge'] = train.apply(lambda row: (row['TimeHandoff'] - row[ 'PlayerBirthDate']).total_seconds() / seconds_in_year, axis=1) train["PlayerAge_ob"] = train['PlayerAge'].astype(np.int).astype("object") ## WindSpeed # train['WindSpeed_ob'] = train['WindSpeed'].apply(lambda x: x.lower().replace('mph', '').strip() if not pd.isna(x) else x) # train['WindSpeed_ob'] = train['WindSpeed_ob'].apply(lambda x: (int(x.split('-')[0])+int(x.split('-')[1]))/2 if not pd.isna(x) and '-' in x else x) # train['WindSpeed_ob'] = train['WindSpeed_ob'].apply(lambda x: (int(x.split()[0])+int(x.split()[-1]))/2 if not pd.isna(x) and type(x)!=float and 'gusts up to' in x else x) # train['WindSpeed_dense'] = train['WindSpeed_ob'].apply(strtofloat) ## Weather train['GameWeather_process'] = train['GameWeather'].str.lower() train['GameWeather_process'] = train['GameWeather_process'].apply( lambda x: "indoor" if not pd.isna(x) and "indoor" in x else x) train['GameWeather_process'] = train['GameWeather_process'].apply( lambda x: x.replace('coudy', 'cloudy').replace('clouidy', 'cloudy'). 
replace('party', 'partly') if not pd.isna(x) else x) train['GameWeather_process'] = train['GameWeather_process'].apply( lambda x: x.replace('clear and sunny', 'sunny and clear') if not pd.isna(x) else x) train['GameWeather_process'] = train['GameWeather_process'].apply( lambda x: x.replace('skies', '').replace("mostly", "").strip() if not pd.isna(x) else x) train['GameWeather_dense'] = train['GameWeather_process'].apply( map_weather) ## Rusher train['IsRusher'] = (train['NflId'] == train['NflIdRusher']) train['IsRusher_ob'] = ( train['NflId'] == train['NflIdRusher']).astype("object") temp = train[train["IsRusher"]][["Team", "PlayId" ]].rename(columns={"Team": "RusherTeam"}) train = train.merge(temp, on="PlayId") train["IsRusherTeam"] = train["Team"] == train["RusherTeam"] ## dense -> categorical train["Quarter_ob"] = train["Quarter"].astype("object") train["Down_ob"] = train["Down"].astype("object") train["JerseyNumber_ob"] = train["JerseyNumber"].astype("object") train["YardLine_ob"] = train["YardLine"].astype("object") # train["DefendersInTheBox_ob"] = train["DefendersInTheBox"].astype("object") # train["Week_ob"] = train["Week"].astype("object") # train["TimeDelta_ob"] = train["TimeDelta"].astype("object") ## Orientation and Dir train["Orientation_ob"] = train["Orientation"].apply( lambda x: orientation_to_cat(x)).astype("object") train["Dir_ob"] = train["Dir"].apply( lambda x: orientation_to_cat(x)).astype("object") train["Orientation_sin"] = train["Orientation"].apply( lambda x: np.sin(x / 360 * 2 * np.pi)) train["Orientation_cos"] = train["Orientation"].apply( lambda x: np.cos(x / 360 * 2 * np.pi)) train["Dir_sin"] = train["Dir"].apply( lambda x: np.sin(x / 360 * 2 * np.pi)) train["Dir_cos"] = train["Dir"].apply( lambda x: np.cos(x / 360 * 2 * np.pi)) ## diff Score train["diffScoreBeforePlay"] = train["HomeScoreBeforePlay"] - train[ "VisitorScoreBeforePlay"] train["diffScoreBeforePlay_binary_ob"] = ( train["HomeScoreBeforePlay"] > train["VisitorScoreBeforePlay"]).astype("object") ## Turf Turf = { 'Field Turf': 'Artificial', 'A-Turf Titan': 'Artificial', 'Grass': 'Natural', 'UBU Sports Speed S5-M': 'Artificial', 'Artificial': 'Artificial', 'DD GrassMaster': 'Artificial', 'Natural Grass': 'Natural', 'UBU Speed Series-S5-M': 'Artificial', 'FieldTurf': 'Artificial', 'FieldTurf 360': 'Artificial', 'Natural grass': 'Natural', 'grass': 'Natural', 'Natural': 'Natural', 'Artifical': 'Artificial', 'FieldTurf360': 'Artificial', 'Naturall Grass': 'Natural', 'Field turf': 'Artificial', 'SISGrass': 'Artificial', 'Twenty-Four/Seven Turf': 'Artificial', 'natural grass': 'Natural' } train['Turf'] = train['Turf'].map(Turf) ## OffensePersonnel temp = train["OffensePersonnel"].iloc[np.arange( 0, len(train), 22)].apply(lambda x: pd.Series(OffensePersonnelSplit(x))) temp.columns = ["Offense" + c for c in temp.columns] temp["PlayId"] = train["PlayId"].iloc[np.arange(0, len(train), 22)] train = train.merge(temp, on="PlayId") ## DefensePersonnel temp = train["DefensePersonnel"].iloc[np.arange( 0, len(train), 22)].apply(lambda x: pd.Series(DefensePersonnelSplit(x))) temp.columns = ["Defense" + c for c in temp.columns] temp["PlayId"] = train["PlayId"].iloc[np.arange(0, len(train), 22)] train = train.merge(temp, on="PlayId") ## sort # train = train.sort_values(by = ['X']).sort_values(by = ['Dis']).sort_values(by=['PlayId', 'Team', 'IsRusher']).reset_index(drop = True) train = train.sort_values(by=['X']).sort_values(by=['Dis']).sort_values( by=['PlayId', 'IsRusherTeam', 'IsRusher']).reset_index(drop=True) return 
train
def stringify(s):
    return str(s) if not pd.isna(s) else None
if_gdp = _env.odir_root + '/summary_' + ds + '/country_specific_statistics_GDP_' + ds + '_' + p_scen + '_Burke.xls' if_ctrylist = _env.idir_root + '/regioncode/Country_List.xls' if_ctryshp = (_env.idir_root + '/shape/country/country1.shp') itbl_gdp = pd.read_excel(if_gdp, 'country-lag0') itbl_gdp.set_index('iso', inplace=True) ishp_ctry = gp.read_file(if_ctryshp) #correct country code ishp_ctry.loc[ishp_ctry['GMI_CNTRY'] == 'ROM', 'GMI_CNTRY'] = 'ROU' ishp_ctry.loc[ishp_ctry['GMI_CNTRY'] == 'ZAR', 'GMI_CNTRY'] = 'COD' ishp_ctry.set_index('GMI_CNTRY', inplace=True) ishp_ctry['GDP_median'] = itbl_gdp['GDP_median_benefit_ratio'] ishp_ctry.loc[pd.isna(ishp_ctry['GDP_median']), 'GDP_median'] = -999 _env.mkdirs(_env.odir_root + 'gdp_map_' + ds) ishp_ctry.to_file(_env.odir_root + 'gdp_map_' + ds + '/gdp_country_' + p_scen + '.shp') ishp_ctry.drop('geometry', axis=1).to_csv(_env.odir_root + 'gdp_map_' + ds + '/country_gdp_ratio_median_' + ds + '.csv') ax = fig.add_subplot(414) m = Basemap(ellps='WGS84', llcrnrlon=-180, llcrnrlat=-90, urcrnrlon=177.5, urcrnrlat=90., suppress_ticks=False)
def check_fun_data(self, testfunc, targfunc, testarval, targarval, targarnanval, check_dtype=True, empty_targfunc=None, **kwargs): for axis in list(range(targarval.ndim)) + [None]: for skipna in [False, True]: targartempval = targarval if skipna else targarnanval if skipna and empty_targfunc and isna(targartempval).all(): targ = empty_targfunc(targartempval, axis=axis, **kwargs) else: targ = targfunc(targartempval, axis=axis, **kwargs) try: res = testfunc(testarval, axis=axis, skipna=skipna, **kwargs) self.check_results(targ, res, axis, check_dtype=check_dtype) if skipna: res = testfunc(testarval, axis=axis, **kwargs) self.check_results(targ, res, axis, check_dtype=check_dtype) if axis is None: res = testfunc(testarval, skipna=skipna, **kwargs) self.check_results(targ, res, axis, check_dtype=check_dtype) if skipna and axis is None: res = testfunc(testarval, **kwargs) self.check_results(targ, res, axis, check_dtype=check_dtype) except BaseException as exc: exc.args += ('axis: %s of %s' % (axis, testarval.ndim - 1), 'skipna: %s' % skipna, 'kwargs: %s' % kwargs) raise if testarval.ndim <= 1: return try: testarval2 = np.take(testarval, 0, axis=-1) targarval2 = np.take(targarval, 0, axis=-1) targarnanval2 = np.take(targarnanval, 0, axis=-1) except ValueError: return self.check_fun_data(testfunc, targfunc, testarval2, targarval2, targarnanval2, check_dtype=check_dtype, empty_targfunc=empty_targfunc, **kwargs)
def _calculate_divisions( df: DataFrame, partition_col: Series, repartition: bool, npartitions: int, upsample: float = 1.0, partition_size: float = 128e6, ) -> Tuple[List, List, List]: """ Utility function to calculate divisions for calls to `map_partitions` """ sizes = df.map_partitions(sizeof) if repartition else [] divisions = partition_col._repartition_quantiles(npartitions, upsample=upsample) mins = partition_col.map_partitions(M.min) maxes = partition_col.map_partitions(M.max) try: divisions, sizes, mins, maxes = compute(divisions, sizes, mins, maxes) except TypeError as e: # When there are nulls and a column is non-numeric, a TypeError is sometimes raised as a result of # 1) computing mins/maxes above, 2) every null being switched to NaN, and 3) NaN being a float. # Also, Pandas ExtensionDtypes may cause TypeErrors when dealing with special nulls such as pd.NaT or pd.NA. # If this happens, we hint the user about eliminating nulls beforehand. if not is_numeric_dtype(partition_col.dtype): obj, suggested_method = ( ("column", f"`.dropna(subset=['{partition_col.name}'])`") if any( partition_col._name == df[c]._name for c in df) else ("series", "`.loc[series[~series.isna()]]`")) raise NotImplementedError( f"Divisions calculation failed for non-numeric {obj} '{partition_col.name}'.\n" f"This is probably due to the presence of nulls, which Dask does not entirely support in the index.\n" f"We suggest you try with {suggested_method}.") from e # For numeric types there shouldn't be problems with nulls, so we raise as-it-is this particular TypeError else: raise e divisions = methods.tolist(divisions) if type(sizes) is not list: sizes = methods.tolist(sizes) mins = methods.tolist(mins) maxes = methods.tolist(maxes) empty_dataframe_detected = pd.isna(divisions).all() if repartition or empty_dataframe_detected: total = sum(sizes) npartitions = max(math.ceil(total / partition_size), 1) npartitions = min(npartitions, df.npartitions) n = len(divisions) try: divisions = np.interp( x=np.linspace(0, n - 1, npartitions + 1), xp=np.linspace(0, n - 1, n), fp=divisions, ).tolist() except (TypeError, ValueError): # str type indexes = np.linspace(0, n - 1, npartitions + 1).astype(int) divisions = [divisions[i] for i in indexes] else: # Drop duplicate divisions returned by partition quantiles divisions = list(toolz.unique(divisions[:-1])) + [divisions[-1]] mins = remove_nans(mins) maxes = remove_nans(maxes) if pd.api.types.is_categorical_dtype(partition_col.dtype): dtype = partition_col.dtype mins = pd.Categorical(mins, dtype=dtype).codes.tolist() maxes = pd.Categorical(maxes, dtype=dtype).codes.tolist() return divisions, mins, maxes
def test_value_counts_unique_nunique_null(self): for null_obj in [np.nan, None]: for orig in self.objs: o = orig.copy() klass = type(o) values = o._ndarray_values if not self._allow_na_ops(o): continue # special assign to the numpy array if is_datetimetz(o): if isinstance(o, DatetimeIndex): v = o.asi8 v[0:2] = iNaT values = o._shallow_copy(v) else: o = o.copy() o[0:2] = iNaT values = o._values elif needs_i8_conversion(o): values[0:2] = iNaT values = o._shallow_copy(values) else: values[0:2] = null_obj # check values has the same dtype as the original assert values.dtype == o.dtype # create repeated values, 'n'th element is repeated by n+1 # times if isinstance(o, (DatetimeIndex, PeriodIndex)): expected_index = o.copy() expected_index.name = None # attach name to klass o = klass(values.repeat(range(1, len(o) + 1))) o.name = 'a' else: if is_datetimetz(o): expected_index = orig._values._shallow_copy(values) else: expected_index = Index(values) expected_index.name = None o = o.repeat(range(1, len(o) + 1)) o.name = 'a' # check values has the same dtype as the original assert o.dtype == orig.dtype # check values correctly have NaN nanloc = np.zeros(len(o), dtype=np.bool) nanloc[:3] = True if isinstance(o, Index): tm.assert_numpy_array_equal(pd.isna(o), nanloc) else: exp = Series(nanloc, o.index, name='a') tm.assert_series_equal(pd.isna(o), exp) expected_s_na = Series(list(range(10, 2, -1)) + [3], index=expected_index[9:0:-1], dtype='int64', name='a') expected_s = Series(list(range(10, 2, -1)), index=expected_index[9:1:-1], dtype='int64', name='a') result_s_na = o.value_counts(dropna=False) tm.assert_series_equal(result_s_na, expected_s_na) assert result_s_na.index.name is None assert result_s_na.name == 'a' result_s = o.value_counts() tm.assert_series_equal(o.value_counts(), expected_s) assert result_s.index.name is None assert result_s.name == 'a' result = o.unique() if isinstance(o, Index): tm.assert_index_equal(result, Index(values[1:], name='a')) elif is_datetimetz(o): # unable to compare NaT / nan vals = values[2:].astype(object).values tm.assert_numpy_array_equal(result[1:], vals) assert result[0] is pd.NaT else: tm.assert_numpy_array_equal(result[1:], values[2:]) assert pd.isna(result[0]) assert result.dtype == orig.dtype assert o.nunique() == 8 assert o.nunique(dropna=False) == 9
def integrify(x):
    return int(float(x)) if not pd.isna(x) else None
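# Illustrative behaviour of the integrify coercion helper above
# (a small, hypothetical check; assumes numpy is imported as np and pandas as pd):
import numpy as np

print(integrify("3.7"))   # 3 -- parses the string, then truncates toward zero
print(integrify(np.nan))  # None -- missing values are passed through as None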
def set_partition( df: DataFrame, index: Union[str, Series], divisions: Sequence, max_branch: int = 32, drop: bool = True, shuffle: Optional[str] = None, compute: Optional[bool] = None, ) -> DataFrame: """Group DataFrame by index Sets a new index and partitions data along that index according to divisions. Divisions are often found by computing approximate quantiles. The function ``set_index`` will do both of these steps. Parameters ---------- df: DataFrame/Series Data that we want to re-partition index: string or Series Column to become the new index divisions: list Values to form new divisions between partitions drop: bool, default True Whether to delete columns to be used as the new index shuffle: str (optional) Either 'disk' for an on-disk shuffle or 'tasks' to use the task scheduling framework. Use 'disk' if you are on a single machine and 'tasks' if you are on a distributed cluster. max_branch: int (optional) If using the task-based shuffle, the amount of splitting each partition undergoes. Increase this for fewer copies but more scheduler overhead. See Also -------- set_index shuffle partd """ meta = df._meta._constructor_sliced([0]) if isinstance(divisions, tuple): # pd.isna considers tuples to be scalars. Convert to a list. divisions = list(divisions) if not isinstance(index, Series): dtype = df[index].dtype else: dtype = index.dtype if pd.isna(divisions).any() and pd.api.types.is_integer_dtype(dtype): # Can't construct a Series[int64] when any / all of the divisions are NaN. divisions = df._meta._constructor_sliced(divisions) elif (pd.api.types.is_categorical_dtype(dtype) and UNKNOWN_CATEGORIES in dtype.categories): # If categories are unknown, leave as a string dtype instead. divisions = df._meta._constructor_sliced(divisions) else: divisions = df._meta._constructor_sliced(divisions, dtype=dtype) if not isinstance(index, Series): partitions = df[index].map_partitions(set_partitions_pre, divisions=divisions, meta=meta) df2 = df.assign(_partitions=partitions) else: partitions = index.map_partitions(set_partitions_pre, divisions=divisions, meta=meta) df2 = df.assign(_partitions=partitions, _index=index) df3 = rearrange_by_column( df2, "_partitions", max_branch=max_branch, npartitions=len(divisions) - 1, shuffle=shuffle, compute=compute, ignore_index=True, ) if not isinstance(index, Series): df4 = df3.map_partitions( set_index_post_scalar, index_name=index, drop=drop, column_dtype=df.columns.dtype, ) else: df4 = df3.map_partitions( set_index_post_series, index_name=index.name, drop=drop, column_dtype=df.columns.dtype, ) df4.divisions = tuple(methods.tolist(divisions)) return df4.map_partitions(M.sort_index)
def sort_values( df: DataFrame, by: Union[str, List[str]], npartitions: Optional[Union[int, Literal["auto"]]] = None, ascending: Union[bool, List[bool]] = True, na_position: Union[Literal["first"], Literal["last"]] = "last", upsample: float = 1.0, partition_size: float = 128e6, sort_function: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None, sort_function_kwargs: Optional[Mapping[str, Any]] = None, **kwargs, ) -> DataFrame: """See DataFrame.sort_values for docstring""" if na_position not in ("first", "last"): raise ValueError("na_position must be either 'first' or 'last'") if not isinstance(by, list): by = [by] if len(by) > 1 and df.npartitions > 1 or any(not isinstance(b, str) for b in by): raise NotImplementedError( "Dataframes only support sorting by named columns which must be passed as a " "string or a list of strings; multi-partition dataframes only support sorting " "by a single column.\n" "You passed %s" % str(by)) sort_kwargs = { "by": by, "ascending": ascending, "na_position": na_position, } if sort_function is None: sort_function = M.sort_values if sort_function_kwargs is not None: sort_kwargs.update(sort_function_kwargs) if df.npartitions == 1: return df.map_partitions(sort_function, **sort_kwargs) if npartitions == "auto": repartition = True npartitions = max(100, df.npartitions) else: if npartitions is None: npartitions = df.npartitions repartition = False sort_by_col = df[by[0]] divisions, mins, maxes = _calculate_divisions(df, sort_by_col, repartition, npartitions, upsample, partition_size) if len(divisions) == 2: return df.repartition(npartitions=1).map_partitions( sort_function, **sort_kwargs) if not isinstance(ascending, bool): # support [True] as input if (isinstance(ascending, list) and len(ascending) == 1 and isinstance(ascending[0], bool)): ascending = ascending[0] else: raise NotImplementedError( f"Dask currently only supports a single boolean for ascending. You passed {str(ascending)}" ) if (all(not pd.isna(x) for x in divisions) and mins == sorted(mins, reverse=not ascending) and maxes == sorted(maxes, reverse=not ascending) and all(mx < mn for mx, mn in zip( maxes[:-1] if ascending else maxes[1:], mins[1:] if ascending else mins[:-1], )) and npartitions == df.npartitions): # divisions are in the right place return df.map_partitions(sort_function, **sort_kwargs) df = rearrange_by_divisions( df, by, divisions, ascending=ascending, na_position=na_position, duplicates=False, ) df = df.map_partitions(sort_function, **sort_kwargs) return df
def control_loading(): d_control_data = {} df = Facture.load_db() # Load table manager table_man = TableLoader(Facture.l_index, Facture.l_fields()) # App 1 table of bill waiting for visa ref_date = pd.Timestamp('1970-01-01') df['date_visa'] = df.date_visa.apply(lambda x: pd.Timestamp(x)) df['date_payed'] = df.date_payed.apply(lambda x: pd.Timestamp(x)) df_, d_footer, kwargs = table_man.load_full_table( df.loc[df.date_visa.apply( lambda x: pd.isna(x) or x == '' or x == ref_date)]) d_control_data['tablenovisa'] = { 'table': { 'df': df_.copy(), 'd_footer': d_footer, 'kwargs': kwargs, 'key': 'nothing' }, 'rows': [('title', [{ 'content': 'title', 'value': u'Factures en attente de visa', 'cls': 'text-center' }]), ('Table', [{ 'content': 'table' }])], 'rank': 0 } # App 2 table of bill waiting for payment df_, d_footer, kwargs = table_man.load_full_table( df.loc[~df.date_visa.apply( lambda x: pd.isna(x) or x == '' or x == ref_date) & df.date_payed.apply( lambda x: pd.isna(x) or x == '' or x == ref_date)]) d_control_data['tablenopayement'] = { 'table': { 'df': df_.copy(), 'd_footer': d_footer, 'kwargs': kwargs, 'key': 'visa' }, 'rows': [('title', [{ 'content': 'title', 'value': u'Factures en attente de paiement', 'cls': 'text-center' }]), ('Table', [{ 'content': 'table' }])], 'rank': 1 } # App 3 table of bill payed df_, d_footer, kwargs = table_man.load_full_table( df.loc[df.date_payed > ref_date]) d_control_data['tablepayment'] = { 'table': { 'df': df_, 'd_footer': d_footer, 'kwargs': kwargs, 'key': 'payement' }, 'rows': [('title', [{ 'content': 'title', 'value': u'Factures encaissées', 'cls': 'text-center' }]), ('Table', [{ 'content': 'table' }])], 'rank': 2 } return d_control_data
def __row_isna_check(self, row):
    return pd.isna(row[1]['Solar Index'])
def compute_confison_matrix(predicted_df, ground_truth_df, iou_intersection_th = 0.4 ,debug = False, calculate_distance = False, minimum_distance = None): """ This piece of code essentially computes the confusion matrix between Ground Truth and predicted Video :param predicted_df -- prediction dataframe :param ground_truth_df -- groundtruth dataframe :param iou_intersection_th -- IOU iou_intersection_th threshold :param calculate_distance -- If True , compute the centroid distance between true and predicted boxes :param minimum_distance -- Distance threshold between two pred and true boxes """ # Defining the Output variables which has to be returned result = {} # Defining the variables which wil be used in the computation tp = 0 tn = 0 unique_all_gt_box = [] unique_all_pred_box= [] df_pred = predicted_df.copy() # df_pred = alter_predicted_csv(predicted_df = df_pred) GT = ground_truth_df.copy() # Setting up an dataframe to store the details of matched predicted box with GroundTruth Box count_correctness = pd.DataFrame(columns=['NAME','IOU_THRESHOLD', 'GT_BOX', 'MATCHED_PRED_BOX', 'IOU_BTWN_GT_AND_PRED_BOX', 'COUNT_OF_OTHER_BOXES_MATCHED_WITH_GT_FOR_GIVEN_IOU']) # Processing the frames in the csv to compute its confusion matrix for frame_name in GT['name'].unique().tolist(): # Values changing w.r.t. every frame in the given video file bbox_num_mapping = {} bbox_with_iou = {} matched_pred_box = {} # Getting all the GroundTruth Frames for the given FRAME NAME or FRAME NO from GroundTruth dataframe cur_frame_gt_boxes = GT[GT['name'] == frame_name].values.tolist() for box in cur_frame_gt_boxes: bbox_num_mapping[box[1]] = box[2:10] bbox_with_iou [box[1]] = 0.0 unique_all_gt_box.append(box[2:10]) # Getting all the prediction for the current frame from the predictions csv cur_frame_predictions = df_pred[df_pred['name'] == frame_name].values.tolist() if cur_frame_predictions: cur_frame_pred_boxes = np.array(cur_frame_predictions[0][1:]).reshape(-1,4).tolist() cur_frame_pred_boxes = [box for box in cur_frame_pred_boxes if not pd.isna(box[0])] for pred_box in cur_frame_pred_boxes : unique_all_pred_box.append(pred_box) pred_box_poly = make_polygon_from_4_coordinates(box = pred_box) per_pred_box_iou = {} for box_no , gt_box in bbox_num_mapping.items(): gt_box_poly = make_polygon_from_8_coordinates(box = gt_box) iou = gt_box_poly.intersection(pred_box_poly).area / gt_box_poly.union(pred_box_poly).area if calculate_distance: x_centroid, y_centroid = get_centroid(box = gt_box) x_centroid_p, y_centroid_p = get_centroid_from_4_coordinates(box = pred_box) centroid_dis = centroid_distance(x_centroid, y_centroid, x_centroid_p, y_centroid_p) if (iou >= iou_intersection_th and centroid_dis < minimum_distance): per_pred_box_iou[box_no] = iou else: if (iou >= iou_intersection_th): per_pred_box_iou[box_no] = iou if per_pred_box_iou: matched_gt_box_no = max(per_pred_box_iou, key = per_pred_box_iou.get) if per_pred_box_iou[matched_gt_box_no] > bbox_with_iou[matched_gt_box_no]: bbox_with_iou[matched_gt_box_no] = per_pred_box_iou[matched_gt_box_no] matched_pred_box[matched_gt_box_no] = pred_box tp += len(matched_pred_box) for gt_box_no , mat_box in matched_pred_box.items(): count_correctness = count_correctness.append({'NAME' : frame_name, 'IOU_THRESHOLD' : iou_intersection_th, 'GT_BOX' : bbox_num_mapping[gt_box_no], 'MATCHED_PRED_BOX' : mat_box, 'IOU_BTWN_GT_AND_PRED_BOX' : bbox_with_iou[gt_box_no]}, ignore_index = True) # False Positives - unique all predicted boxes - true predicted boxes ( TRUE POSITIVES) fp = 
len(unique_all_pred_box) - tp assert len(GT) == len(unique_all_gt_box) # False Negatives - unique all GroundTruth boxes - true predicted boxes ( TRUE POSITIVES) fn = len(unique_all_gt_box) - tp # Computation of Confusion Matrix if tp == 0: acc = 0 recall = 0 precision = 0 f_measure = 0 else: acc = (tp+tn)/(tp+tn+fp+fn) recall = tp/(tp+fn) precision = tp/(tp+fp) f_measure = (2*recall*precision) / (recall+precision) # Assigning the results to the result['tp'] = tp result['fp'] = fp result['fn'] = fn result['tn'] = tn result['acc'] = acc * 100 result['precision'] = precision result['recall'] = recall result['f_measure'] = f_measure if debug: result['count_correctness'] = count_correctness return result
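# Standalone sketch of the IoU computation used in the confusion-matrix code above,
# with plain shapely boxes (the coordinates here are hypothetical;
# shapely.geometry.box builds an axis-aligned rectangle):
from shapely.geometry import box

gt = box(0, 0, 10, 10)
pred = box(5, 0, 15, 10)
iou = gt.intersection(pred).area / gt.union(pred).area
print(iou)  # 0.333... -- overlap area of 50 over a union area of 150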