示例#1
0
 def test_string_no_dates(self):
     # Round-trip a frame with a string column and a float column through
     # the Stata format and check nothing changes.
     frame = DataFrame({'s1': Series(['a', 'A longer string']),
                        's2': Series([1.0, 2.0], dtype=np.float64)})
     frame.index.name = 'index'
     with tm.ensure_clean() as path:
         frame.to_stata(path)
         reread = self.read_dta(path).set_index('index')
         tm.assert_frame_equal(reread, frame)
示例#2
0
 def test_nan_to_missing_value(self):
     # NaN entries must survive a Stata round trip as missing values.
     col_a = Series(np.arange(4.0), dtype=np.float32)
     col_b = Series(np.arange(4.0), dtype=np.float64)
     col_a[::2] = np.nan    # even positions missing
     col_b[1::2] = np.nan   # odd positions missing
     frame = DataFrame({'s1': col_a, 's2': col_b})
     frame.index.name = 'index'
     with tm.ensure_clean() as path:
         frame.to_stata(path)
         reread = self.read_dta(path).set_index('index')
         tm.assert_frame_equal(reread, frame)
示例#3
0
    def test_read_write_dta13(self):
        # int64 values above the 32-bit range cannot be represented in a
        # Stata integer column, so the writer stores them as float64.
        s1 = Series(2**9, dtype=np.int16)
        s2 = Series(2**17, dtype=np.int32)
        s3 = Series(2**33, dtype=np.int64)
        original = DataFrame({'int16': s1, 'int32': s2, 'int64': s3})
        original.index.name = 'index'

        # Take a real copy: the previous ``formatted = original`` aliased
        # the two frames, so the astype below also mutated ``original`` and
        # the final comparison checked the written frame against itself
        # instead of exercising the writer's int64 -> float64 conversion.
        formatted = original.copy()
        formatted['int64'] = formatted['int64'].astype(np.float64)

        with tm.ensure_clean() as path:
            original.to_stata(path)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  formatted)
示例#4
0
    def readFromFileMl1m(behavioursFile: str):
        """Read a behaviours TSV file and return a DataFrame whose
        behaviour column is converted from its string encoding to a list
        of booleans per row.

        :param behavioursFile: path of the tab-separated behaviours file
        :return: DataFrame with user id, movie id, repetition and the
            converted behaviour column
        """
        behavioursDF: DataFrame = pd.read_csv(behavioursFile,
                                              sep='\t',
                                              header=0,
                                              encoding="ISO-8859-1")
        behavioursDF.columns = [
            Behaviours.COL_USERID, Behaviours.COL_MOVIEID,
            Behaviours.COL_REPETITION, Behaviours.COL_BEHAVIOUR
        ]

        # The original had an early ``return behavioursDF`` here, which
        # made the conversion below unreachable dead code; the sibling
        # readFromFileMl1m implementation performs this conversion, so the
        # leftover return was removed.
        behaviour: List[List[bool]] = []
        for indexI, rowI in behavioursDF.iterrows():
            behaviourI: List[bool] = Behaviours.__convertToListOfBoolean(
                str(rowI[Behaviours.COL_BEHAVIOUR]))
            behaviour.append(behaviourI)

        behavioursConvertedDF: DataFrame = pd.concat(
            [
                behavioursDF[Behaviours.COL_USERID],
                behavioursDF[Behaviours.COL_MOVIEID],
                behavioursDF[Behaviours.COL_REPETITION],
                Series(behaviour)
            ],
            axis=1,
            keys=[
                Behaviours.COL_USERID, Behaviours.COL_MOVIEID,
                Behaviours.COL_REPETITION, Behaviours.COL_BEHAVIOUR
            ])

        return behavioursConvertedDF
示例#5
0
    def test_read_dta4(self):
        """Categorical columns read back identically from dta 113-117."""
        parsed_113 = self.read_dta(self.dta4_113)
        parsed_114 = self.read_dta(self.dta4_114)
        parsed_115 = self.read_dta(self.dta4_115)
        parsed_117 = self.read_dta(self.dta4_117)

        expected = DataFrame.from_records(
            [["one", "ten", "one", "one", "one"],
             ["two", "nine", "two", "two", "two"],
             ["three", "eight", "three", "three", "three"],
             ["four", "seven", 4, "four", "four"],
             ["five", "six", 5, np.nan, "five"],
             ["six", "five", 6, np.nan, "six"],
             ["seven", "four", 7, np.nan, "seven"],
             ["eight", "three", 8, np.nan, "eight"],
             ["nine", "two", 9, np.nan, "nine"],
             ["ten", "one", "ten", np.nan, "ten"]],
            columns=[
                'fully_labeled', 'fully_labeled2', 'incompletely_labeled',
                'labeled_with_missings', 'float_labelled'
            ])

        # these are all categoricals; pass name=col so the concatenated
        # frame keeps the column names -- an unnamed Series would leave the
        # result with a 0..4 RangeIndex (the loop variable ``col`` was
        # previously unused) and the frame comparisons below would fail
        expected = pd.concat([
            Series(pd.Categorical(value), name=col)
            for col, value in compat.iteritems(expected)
        ],
                             axis=1)

        tm.assert_frame_equal(parsed_113, expected)
        tm.assert_frame_equal(parsed_114, expected)
        tm.assert_frame_equal(parsed_115, expected)
        tm.assert_frame_equal(parsed_117, expected)
示例#6
0
    def test_bool_uint(self):
        """Bool/unsigned columns are written as the smallest signed type
        wide enough; uint32 values above the int32 range fall back to
        float64."""
        # np.bool was deprecated in NumPy 1.20 and removed in 1.24;
        # np.bool_ is the supported spelling and behaves identically.
        s0 = Series([0, 1, True], dtype=np.bool_)
        s1 = Series([0, 1, 100], dtype=np.uint8)
        s2 = Series([0, 1, 255], dtype=np.uint8)
        s3 = Series([0, 1, 2**15 - 100], dtype=np.uint16)
        s4 = Series([0, 1, 2**16 - 1], dtype=np.uint16)
        s5 = Series([0, 1, 2**31 - 100], dtype=np.uint32)
        s6 = Series([0, 1, 2**32 - 1], dtype=np.uint32)

        original = DataFrame({
            's0': s0,
            's1': s1,
            's2': s2,
            's3': s3,
            's4': s4,
            's5': s5,
            's6': s6
        })
        original.index.name = 'index'
        expected = original.copy()
        # dtypes the reader is expected to hand back, column by column
        expected_types = (np.int8, np.int8, np.int16, np.int16, np.int32,
                          np.int32, np.float64)
        for c, t in zip(expected.columns, expected_types):
            expected[c] = expected[c].astype(t)

        with tm.ensure_clean() as path:
            original.to_stata(path)
            written_and_read_again = self.read_dta(path)
            written_and_read_again = written_and_read_again.set_index('index')
            tm.assert_frame_equal(written_and_read_again, expected)
示例#7
0
 def test_excessively_long_string(self):
     # Stata string columns are limited to 244 characters; writing a
     # longer one must raise ValueError.
     columns = {}
     for width in (1, 244, 500):
         columns['s' + str(width)] = Series([c * width for c in 'abc'])
     frame = DataFrame(columns)
     with tm.assertRaises(ValueError):
         with tm.ensure_clean() as path:
             frame.to_stata(path)
示例#8
0
  def readFromFileMl1m():
    """Read the ml-1m behaviours file and convert the five behaviour
    columns from their string encoding to lists of booleans.

    :return: DataFrame with user id, movie id, repetition and the five
        converted behaviour columns
    """
    behavioursFile: str = ".." + os.sep + "datasets" + os.sep + "ml-1m" + os.sep + "behaviours.dat"

    behavioursDF: DataFrame = pd.read_csv(behavioursFile, sep='\t', header=0, encoding="ISO-8859-1")
    behavioursDF.columns = [Behaviours.COL_USERID, Behaviours.COL_MOVIEID, Behaviours.COL_REPETITION,
                            Behaviours.COL_STATIC08, Behaviours.COL_STATIC06, Behaviours.COL_STATIC04,
                            Behaviours.COL_STATIC02, Behaviours.COL_LINEAR0109]

    # All five behaviour columns get the same string -> List[bool]
    # conversion, so loop over them instead of repeating the stanza five
    # times.  (The original also annotated the accumulators as List[float]
    # although List[bool] values were appended.)
    behaviourCols: List[str] = [Behaviours.COL_STATIC08, Behaviours.COL_STATIC06,
                                Behaviours.COL_STATIC04, Behaviours.COL_STATIC02,
                                Behaviours.COL_LINEAR0109]
    converted = {colI: [] for colI in behaviourCols}
    for indexI, rowI in behavioursDF.iterrows():
        for colI in behaviourCols:
            converted[colI].append(Behaviours.__convertToListOfBoolean(str(rowI[colI])))

    behavioursConvertedDF: DataFrame = pd.concat(
        [behavioursDF[Behaviours.COL_USERID], behavioursDF[Behaviours.COL_MOVIEID],
         behavioursDF[Behaviours.COL_REPETITION]] +
        [Series(converted[colI]) for colI in behaviourCols],
        axis=1,
        keys=[Behaviours.COL_USERID, Behaviours.COL_MOVIEID,
              Behaviours.COL_REPETITION] + behaviourCols)

    return behavioursConvertedDF
示例#9
0
 def test_minimal_size_col(self):
     # The writer should choose the smallest string format wide enough
     # for each column; each column name encodes its string width, so the
     # width parsed from the name must match the one in the format.
     data = {}
     for width in (1, 100, 244):
         data['s' + str(width)] = Series([c * width for c in 'abc'])
     frame = DataFrame(data)
     with tm.ensure_clean() as path:
         frame.to_stata(path, write_index=False)
         reader = StataReader(path)
         for name, fmt in zip(reader.varlist, reader.fmtlist):
             self.assertTrue(int(name[1:]) == int(fmt[1:-1]))
示例#10
0
    def test_read_dta10(self):
        # Round trip mixed dtypes, writing the datetime column with the
        # Stata 'tc' format and the index included on write.
        frame = DataFrame(
            [["string", "object", 1, 1.1, np.datetime64('2003-12-25')]],
            columns=['string', 'object', 'integer', 'float', 'datetime'])
        frame["object"] = Series(frame["object"], dtype=object)
        frame.index.name = 'index'

        with ensure_clean(self.dta10) as path:
            frame.to_stata(path, {'datetime': 'tc'}, False)
            reread = self.read_dta(path)
            tm.assert_frame_equal(reread.set_index('index'), frame)
示例#11
0
    def test_read_write_dta10(self):
        # Mixed-dtype round trip with an explicit int32 index and an int32
        # column; the reader hands the index back as int64, hence the
        # check_index_type=False below.
        frame = DataFrame([["string", "object", 1, 1.1,
                            np.datetime64('2003-12-25')]],
                          columns=['string', 'object', 'integer',
                                   'floating', 'datetime'])
        frame["object"] = Series(frame["object"], dtype=object)
        frame.index.name = 'index'
        frame.index = frame.index.astype(np.int32)
        frame['integer'] = frame['integer'].astype(np.int32)

        with tm.ensure_clean() as path:
            frame.to_stata(path, {'datetime': 'tc'})
            reread = self.read_dta(path)
            tm.assert_frame_equal(reread.set_index('index'), frame,
                                  check_index_type=False)
示例#12
0
    def test_read_write_dta10(self):
        # Byte-order sensitive: dta writing is only validated on
        # little-endian hosts.
        if not is_little_endian():
            raise nose.SkipTest("known failure of test_write_dta10 on "
                                "non-little endian")

        frame = DataFrame([["string", "object", 1, 1.1,
                            np.datetime64('2003-12-25')]],
                          columns=['string', 'object', 'integer', 'float',
                                   'datetime'])
        frame["object"] = Series(frame["object"], dtype=object)
        frame.index.name = 'index'

        with tm.ensure_clean() as path:
            frame.to_stata(path, {'datetime': 'tc'}, False)
            reread = self.read_dta(path)
            tm.assert_frame_equal(reread.set_index('index'), frame)
示例#13
0
def countAggrBanditsResponsibility(methodsResult: List[tuple],
                                   modelDF: DataFrame):
    """Score each recommended item by its producing method's empirical
    success rate and return the scores normalised as a unit vector.

    :param methodsResult: list of (itemId, methodId) tuples
    :param modelDF: frame indexed by method id with 'r' (successes) and
        'n' (trials) columns -- assumed n > 0; TODO confirm with callers
    :return: list of (itemId, normalisedScore) tuples
    """
    result: List[tuple] = []
    for itemIdI, methodIdI in methodsResult:
        # success ratio of the method that produced this item
        wIJ: float = modelDF.loc[methodIdI, 'r'] / modelDF.loc[methodIdI, 'n']
        result.append((itemIdI, wIJ))

    itemsIDs: List[int] = [x[0] for x in result]
    scores: List[float] = [x[1] for x in result]
    resultSer: Series = Series(scores, index=itemsIDs)

    # normalize() expects a 2-D array, hence the expand_dims/[0, :] pair
    finalScores = normalize(np.expand_dims(resultSer.values, axis=0))[0, :]
    # Materialise the pairing: the original returned the lazy zip object
    # itself, which is exhausted after one pass and does not satisfy the
    # declared List[tuple] contract.
    resultNorm: List[tuple] = list(zip(resultSer.index, finalScores.tolist()))

    return resultNorm
示例#14
0
    def test_large_value_conversion(self):
        s0 = Series([1, 99], dtype=np.int8)
        s1 = Series([1, 127], dtype=np.int8)
        s2 = Series([1, 2**15 - 1], dtype=np.int16)
        s3 = Series([1, 2**63 - 1], dtype=np.int64)
        original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3})
        original.index.name = 'index'
        with tm.ensure_clean() as path:
            with tm.assert_produces_warning(PossiblePrecisionLoss):
                original.to_stata(path)

            written_and_read_again = self.read_dta(path)
            modified = original.copy()
            modified['s1'] = Series(modified['s1'], dtype=np.int16)
            modified['s2'] = Series(modified['s2'], dtype=np.int32)
            modified['s3'] = Series(modified['s3'], dtype=np.float64)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  modified)
示例#15
0
#!/usr/bin/env python
"""
Demonstrate pandas Series basics: construction from a numpy array,
label-based indexing, aggregation and scalar broadcasting.

@author: jstrick
Created on Sat May 18 10:46:25 2013

"""
import numpy as np
# Import from the public pandas namespace; the original imported Series
# from the private module ``pandas.core.frame``, which is not a stable
# API and breaks across pandas versions.
from pandas import Series

NUM_VALUES = 10
# labels 'a'..'j'
index = [chr(i) for i in range(97, 97 + NUM_VALUES)]
print(index)

# evenly spaced floats 1..5, once with labels and once with the default
# integer index
s1 = Series(np.linspace(1, 5, NUM_VALUES), index=index)
s2 = Series(np.linspace(1, 5, NUM_VALUES))

print(s1, "\n")
print(s2, "\n")

# fancy indexing with a list of labels, in any order
print(s1[['h', 'b']], "\n")

print(s1[['a', 'b', 'c']], "\n")

# aggregations
print(s1.sum(), s1.mean(), s1.min(), s1.max(), "\n")
print(s1.cumsum(), s1.cumprod(), s1.std(), "\n")
# ``in`` tests the index labels, not the values
print('a' in s1)
print('m' in s1)

# arithmetic broadcasts elementwise
s3 = s1 * 10
print(s3, "\n")
示例#16
0
#!/usr/bin/env python
"""
Demonstrate the three common ways to build a pandas Series: from a list,
from a list plus an explicit index, and from a dictionary.

@author: jstrick
Created on Sat May 18 16:20:49 2013

"""
# Import from the public pandas namespace; the original imported Series
# from the private module ``pandas.core.frame``, which is not a stable
# API and breaks across pandas versions.
from pandas import Series

# create from list (default integer index 0..2)
s1 = Series([5, 10, 15])
print(s1, "\n")
print("s1[0]:", s1[0], "\n")
print('-' * 60)

# create from list with index
s2 = Series([5, 10, 15], ['a', 'b', 'c'])
print(s2, "\n")
print("s2['a']:", s2['a'])
print('-' * 60)

# create from dictionary (keys are indices)
s3 = Series({'b': 10, 'a': 5, 'c': 15})
print(s3, "\n")
print("s3.sum(), s3.mean():", s3.sum(), s3.mean())
print('-' * 60)
print(index2 & index3)
print(index2.intersection(index3))
print()

print_header("index2 | index3", 70)
# these are the same
print(index2 | index3)
print(index2.union(index3))
print()

print_header("index1.difference(index3)", 70)
print(index1.difference(index3))
print()

print_header("Series([10,20,30], index=index1)", 70)
series1 = Series([10, 20, 30], index=index1)
print(series1)
print()

print_header(
    "DataFrame([(1,2,3),(4,5,6),(7,8,9)], index=index1, columns=index4)", 70)
dataframe1 = DataFrame([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
                       index=index1,
                       columns=index4)
print(dataframe1)
print()

print_header(
    "DataFrame([(1,2,3),(4,5,6),(7,8,9)], index=index4, columns=index1)", 70)
dataframe2 = DataFrame([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
                       index=index4,
示例#18
0
    def generateGraphData(self, stats=None):
        """Regenerate the graph data pages on the wiki.

        Pivots traffic rows (type == DATA, site == wikipedia) into CSV text
        and uploads one ``RawData:...`` page per table through the wiki
        edit API returned by ``self.getWiki()``.

        :param stats: optional pre-computed result rows; when None the
            combined TSV file (``self.combinedFile``) is read back instead.
        """
        safePrint('Generating data files to %s' % self.pathGraphs)

        wiki = self.getWiki()

        if stats is None:
            allData = read_table(self.combinedFile, sep='\t')
        else:
            allData = DataFrame(stats, columns=columnHdrResult)

        # filter type==DATA and site==wikipedia
        allData = allData[(allData['type'] == 'DATA') & (allData['site'] == 'wikipedia')]
        # filter out last date (presumably the still-incomplete current
        # day -- TODO confirm)
        lastDate = allData.date.max()
        df = allData[allData.date < lastDate]

        # per-day / per-carrier totals for enabled zero-rated traffic
        allEnabled = df[(df.ison == 'on') & (df.iszero == 'yes')]
        s = StringIO.StringIO()
        pivot_table(allEnabled, 'count', ['date', 'xcs'], aggfunc=np.sum).to_csv(s, header=True)
        result = s.getvalue()

        wiki(
            'edit',
            title='RawData:AllEnabled',
            summary='refreshing data',
            text=result,
            token=wiki.token()
        )

        # one RawData page per carrier (xcs code)
        for xcs in list(df.xcs.unique()):

            xcsDf = df[df.xcs == xcs]

            # create an artificial yes/opera value
            # NOTE(review): these column assignments write into filtered
            # slices; newer pandas emits SettingWithCopyWarning here --
            # the code relies on the slice being a copy
            opera = xcsDf[(xcsDf.via == 'OPERA') & (xcsDf.iszero == 'yes')]
            opera['str'] = 'zero-opera'

            yes = xcsDf[xcsDf.iszero == 'yes']
            yes['str'] = 'zero-all'

            no = xcsDf[xcsDf.iszero == 'no']
            no['str'] = 'non-zero'

            # stack the three labelled subsets into one frame
            combined = opera.append(yes).append(no)

            s = StringIO.StringIO()
            pivot_table(combined, 'count', ['date', 'str'], aggfunc=np.sum).to_csv(s, header=False)
            result = 'date,iszero,count\n' + s.getvalue()

            wiki(
                'edit',
                title='RawData:' + xcs,
                summary='refreshing data',
                text=result,
                token=wiki.token()
            )

            # language totals: top five languages plus an 'other' bucket.
            # NOTE(review): Series.order() only exists in pandas < 0.20
            # (sort_values is the modern equivalent); together with the
            # py2-style StringIO module this file targets an old stack.
            byLang = pivot_table(xcsDf, 'count', ['lang'], aggfunc=np.sum).order('count', ascending=False)
            top = byLang.head(5)
            other = byLang.sum() - top.sum()
            s = StringIO.StringIO()
            # unbound-method call, equivalent to top.to_csv(s)
            Series.to_csv(top, s)
            result = 'lang,count\n' + s.getvalue() + ('other,%d\n' % other)

            wiki(
                'edit',
                title='RawData:' + xcs + '-langTotal',
                summary='refreshing data',
                text=result,
                token=wiki.token()
            )
示例#19
0
    names = "names"
    indices = "indices"
    range = "range"


# Indices=NewType("indices",List[int])
# Names=NewType("names",List[str])
# Start=NewType("start",PositiveInt)
# End=NewType("end",PositiveInt)
# Range=NewType("range",Tuple[Start,End])


class SelectRow(BasicOperator):
    """Operator declaration: selects rows of ``data`` using ``bool_array``.

    NOTE(review): field semantics inferred from the names and the
    ``__main__`` example below -- confirm against BasicOperator.
    """
    # input table port
    data: DFPort
    # boolean-mask port, marked required via optional=False
    bool_array: series_port(optional=False)
    # row-selection mode (the file defines names/indices/range values)
    mode: ModeEnum


if __name__ == "__main__":
    # Smoke-test SelectRow validation: config=(9, 9) is expected to be
    # rejected, and the ValidationError message is printed.
    frame = DataFrame(np.arange(15).reshape(3, 5),
                      index=['one', 'two', 'three'],
                      columns=['a', 'b', 'c', 'd', 'e'])
    mask = Series([True, False, True, False])
    try:
        SelectRow(data=frame,
                  bool_array=mask,
                  mode="indices",
                  config=(9, 9))
    except ValidationError as err:
        print(str(err))