Пример #1
0
    def test_csv_roaring_import(self):
        # Imports row/column pairs with fast_import=True, reads rows 2, 7 and
        # 10 back, then repeats the import with clear=True and expects those
        # rows to come back with no columns.
        client = self.get_client()
        text = u"""
            10, 7
            10, 5
            2, 3
            7, 1
        """
        field = self.index.field("importfield-fast")
        client.ensure_field(field)

        def fetch_rows():
            # One batched query over rows 2, 7 and 10, in that order.
            row_queries = [field.row(row_id) for row_id in (2, 7, 10)]
            return client.query(self.index.batch_query(*row_queries))

        client.import_field(field, csv_column_reader(StringIO(text)),
                            fast_import=True)
        imported = fetch_rows()
        self.assertEqual(3, len(imported.results))
        first_columns = [res.row.columns[0] for res in imported.results]
        self.assertEqual([3, 1, 5], first_columns)

        # test clear import
        client.import_field(field, csv_column_reader(StringIO(text)),
                            fast_import=True, clear=True)
        cleared = fetch_rows()
        self.assertEqual(3, len(cleared.results))
        for res in cleared.results:
            self.assertEqual([], res.row.columns)
Пример #2
0
    def test_csvbititerator_customtimefunc(self):
        # Passes a custom timefunc to the CSV reader and checks that the
        # "YYYY-MM-DDTHH:MM:SS" cells come back as integer epoch seconds.
        class UtcTzinfo(datetime.tzinfo):
            # Fixed timezone with a zero offset and no DST.
            ZERO = datetime.timedelta(0)

            def utcoffset(self, dt):
                return self.ZERO

            def dst(self, dt):
                return self.ZERO

            def tzname(self, dt):
                return "UTC"

        def timefunc_utcstr(timeval):
            naive = datetime.datetime.strptime(timeval, '%Y-%m-%dT%H:%M:%S')
            aware = naive.replace(tzinfo=UtcTzinfo())
            return calendar.timegm(aware.timetuple())

        csv_text = u"""
            1,10,1991-09-02T06:33:20
            5,20,1991-09-02T06:35:00
            3,41,1991-09-02T06:36:25
            10,10485760,1991-09-02T06:36:25
        """
        rows = list(csv_column_reader(StringIO(csv_text),
                                      timefunc=timefunc_utcstr))

        expected_rows = [
            (1, 10, 683793200),
            (5, 20, 683793300),
            (3, 41, 683793385),
            (10, 10485760, 683793385),
        ]
        self.assertEqual(len(rows), 4)
        for actual, wanted in zip(rows, expected_rows):
            self.assertEqual(actual, wanted)
Пример #3
0
    def test_csv_roaring_import_time_field(self):
        # Imports timestamped pairs with fast_import=True into a field that
        # has a time quantum, checks plain row reads and a range query on
        # row 10, then re-imports with clear=True and expects empty rows.
        client = self.get_client()
        text = u"""
            10, 7, 1542199376
            10, 5, 1483273800
            2, 3, 1520268300
            7, 1, 1330965900
        """
        field = self.index.field(
            "importfield-fast-time",
            time_quantum=TimeQuantum.YEAR_MONTH_DAY_HOUR,
        )
        client.ensure_field(field)

        def fetch_rows():
            # One batched query over rows 2, 7 and 10, in that order.
            row_queries = [field.row(row_id) for row_id in (2, 7, 10)]
            return client.query(self.index.batch_query(*row_queries))

        client.import_field(field, csv_column_reader(StringIO(text)),
                            fast_import=True)
        imported = fetch_rows()
        self.assertEqual(3, len(imported.results))
        first_columns = [res.row.columns[0] for res in imported.results]
        self.assertEqual([3, 1, 5], first_columns)

        # Range query on row 10 between 2016-01-01 and 2019-01-01.
        range_start = datetime(2016, 1, 1)
        range_end = datetime(2019, 1, 1)
        ranged = client.query(field.range(10, range_start, range_end))
        self.assertEqual([5, 7], ranged.result.row.columns)

        # test clear import
        client.import_field(field, csv_column_reader(StringIO(text)),
                            fast_import=True, clear=True)
        cleared = fetch_rows()
        self.assertEqual(3, len(cleared.results))
        for res in cleared.results:
            self.assertEqual([], res.row.columns)
Пример #4
0
 def test_csv_import2(self):
     # Checks against encoding errors on Python 2.x
     # (import-only test: the unicode CSV text is imported, nothing is
     # queried back).
     text = u"""
         1,10,683793200
         5,20,683793300
         3,41,683793385        
         10,10485760,683793385        
     """
     column_reader = csv_column_reader(StringIO(text))
     client = self.get_client()
     schema = client.schema()
     index = schema.index(self.index.name)
     field = index.field(
         "importfield",
         time_quantum=TimeQuantum.YEAR_MONTH_DAY_HOUR,
     )
     client.sync_schema(schema)
     client.import_field(field, column_reader)
Пример #5
0
    def test_invalid_input(self):
        # Each malformed line must make the reader raise PilosaError once it
        # is consumed.
        bad_lines = (
            u"155",         # less than 2 columns
            u"a5,155",      # invalid row ID
            u"155,a5",      # invalid column ID
            u"155,255,a5",  # invalid timestamp
        )

        for line in bad_lines:
            reader = csv_column_reader(StringIO(line))
            with self.assertRaises(PilosaError):
                list(reader)
Пример #6
0
    def test_csv_column_reader_row_key_column_id(self):
        # Reads CSV whose first cell is a row key and second a column ID,
        # using the csv_row_key_column_id format function.
        csv_text = u"""
            one,10,683793200
            five,20,683793300
            three,41,683793385
            ten,10485760,683793385
        """
        parsed = list(csv_column_reader(StringIO(csv_text),
                                        formatfunc=csv_row_key_column_id))

        expected = [
            Column(row_key=key, column_id=col, timestamp=ts)
            for key, col, ts in (
                ("one", 10, 683793200),
                ("five", 20, 683793300),
                ("three", 41, 683793385),
                ("ten", 10485760, 683793385),
            )
        ]
        self.assertEqual(expected, parsed)
Пример #7
0
    def test_csv_column_reader_row_id_column_key(self):
        # Reads CSV whose first cell is a row ID and second a column key,
        # using the csv_row_id_column_key format function.
        csv_text = u"""
            1,ten,683793200
            5,twenty,683793300
            3,forty-one,683793385
            10,a-big-number,683793385
        """
        parsed = list(csv_column_reader(StringIO(csv_text),
                                        formatfunc=csv_row_id_column_key))

        expected = [
            Column(row_id=row, column_key=key, timestamp=ts)
            for row, key, ts in (
                (1, "ten", 683793200),
                (5, "twenty", 683793300),
                (3, "forty-one", 683793385),
                (10, "a-big-number", 683793385),
            )
        ]
        self.assertEqual(expected, parsed)
Пример #8
0
    def test_csv_field_value_column_key(self):
        # Reads "column key, value" CSV lines into FieldValue objects using
        # the csv_column_key_value format function.
        csv_text = u"""
            ten,1
            twenty,5
            forty-one,3
            a-big-number,10
        """
        parsed = list(csv_column_reader(StringIO(csv_text),
                                        formatfunc=csv_column_key_value))

        expected = [
            FieldValue(column_key=key, value=val)
            for key, val in (
                ("ten", 1),
                ("twenty", 5),
                ("forty-one", 3),
                ("a-big-number", 10),
            )
        ]
        self.assertEqual(expected, parsed)
Пример #9
0
 def test_csv_import_time_field(self):
     # Imports timestamped pairs into a field with a time quantum and checks
     # the first column of rows 1, 5, 3 and 10.
     text = u"""
         1,10,683793200
         5,20,683793300
         3,41,683793385
         10,10485760,683793385
     """
     column_reader = csv_column_reader(StringIO(text))
     client = self.get_client()
     schema = client.schema()
     index = schema.index(self.index.name)
     field = index.field(
         "importfield",
         time_quantum=TimeQuantum.YEAR_MONTH_DAY_HOUR,
     )
     client.sync_schema(schema)
     client.import_field(field, column_reader)

     row_queries = [field.row(row_id) for row_id in (1, 5, 3, 10)]
     response = client.query(self.index.batch_query(*row_queries))
     expected = [10, 20, 41, 10485760]
     first_columns = [res.row.columns[0] for res in response.results]
     self.assertEqual(expected, first_columns)
Пример #10
0
    def test_csv_column_reader_row_key_column_key(self):
        # Reads CSV whose row and column cells are both string keys, using
        # the csv_row_key_column_key format function.
        csv_text = u"""
            one,ten,683793200
            five,twenty,683793300
            three,forty-one,683793385
            ten,a-big-number,683793385
        """
        parsed = list(csv_column_reader(StringIO(csv_text),
                                        formatfunc=csv_row_key_column_key))

        expected = [
            Column(row_key=row, column_key=col, timestamp=ts)
            for row, col, ts in (
                ("one", "ten", 683793200),
                ("five", "twenty", 683793300),
                ("three", "forty-one", 683793385),
                ("ten", "a-big-number", 683793385),
            )
        ]
        self.assertEqual(expected, parsed)
Пример #11
0
 def test_csv_import_row_keys_manual_address(self):
     # Imports pairs addressed by row key into a keys=True field through the
     # manual-address client, then reads rows "two", "seven" and "ten" back.
     client = self.get_client_manual_address()
     text = u"""
         ten, 7
         ten, 5
         two, 3
         seven, 1
     """
     column_reader = csv_column_reader(
         StringIO(text),
         formatfunc=csv_row_key_column_id,
     )
     field = self.index.field("importfield-keys", keys=True)
     client.ensure_field(field)
     client.import_field(field, column_reader)

     row_queries = [field.row(key) for key in ("two", "seven", "ten")]
     response = client.query(self.index.batch_query(*row_queries))
     expected = [3, 1, 5]
     self.assertEqual(3, len(response.results))
     first_columns = [res.row.columns[0] for res in response.results]
     self.assertEqual(expected, first_columns)
Пример #12
0
    def test_csvbititerator(self):
        # Batches four parsed columns with a batch size of 2 and checks the
        # shard number and batch length of each of the three resulting groups.
        csv_text = u"""
            1,10,683793200
            5,20,683793300
            3,41,683793385        
            10,10485760,683793385        
        """
        reader = csv_column_reader(StringIO(csv_text))
        groups = list(batch_columns(reader, 2))
        self.assertEqual(3, len(groups))

        # (expected shard, expected number of columns in the batch)
        expectations = ((0, 2), (0, 1), (10, 1))
        for group, (want_shard, want_size) in zip(groups, expectations):
            shard, batch = group
            self.assertEqual(shard, want_shard)
            self.assertEqual(want_size, len(list(batch)))
Пример #13
0
    def test_csv_column_reader_row_id_column_id(self):
        # Batches four parsed columns with a batch size of 2 and an explicit
        # DEFAULT_SHARD_WIDTH, then checks the shard number and batch length
        # of each of the three resulting groups.
        from pilosa.client import DEFAULT_SHARD_WIDTH

        csv_text = u"""
            1,10,683793200
            5,20,683793300
            3,41,683793385        
            10,10485760,683793385        
        """
        reader = csv_column_reader(StringIO(csv_text))
        groups = list(batch_columns(reader, 2, DEFAULT_SHARD_WIDTH))
        self.assertEqual(3, len(groups))

        # (expected shard, expected number of columns in the batch)
        expectations = ((0, 2), (0, 1), (10, 1))
        for group, (want_shard, want_size) in zip(groups, expectations):
            shard, batch = group
            self.assertEqual(shard, want_shard)
            self.assertEqual(want_size, len(list(batch)))
Пример #14
0
# Create the schema: a "repository" index with a time-quantized "stargazer"
# field and a plain "language" field.
client = pilosa.Client()
schema = client.schema()
repository = schema.index("repository")
stargazer = repository.field("stargazer",
                             time_quantum=pilosa.TimeQuantum.YEAR_MONTH_DAY)
language = repository.field("language")
client.sync_schema(schema)


def time_func(s):
    """Convert a ``YYYY-MM-DDTHH:MM`` string to an integer Unix timestamp.

    ``time.mktime`` interprets the parsed time as local time, so the result
    depends on the timezone of the machine running the import.
    """
    return int(time.mktime(time.strptime(s, "%Y-%m-%dT%H:%M")))


# Load the data into the stargazer field; timestamps in the CSV are converted
# with time_func.
with open("stargazer.csv") as f:
    stargazer_reader = csv_column_reader(f, timefunc=time_func)
    client.import_field(stargazer, stargazer_reader)

# Load the data into the language field. The format function is passed by
# keyword so it cannot be mistaken for another positional argument.
with open("language.csv") as f:
    language_reader = csv_column_reader(f, formatfunc=csv_row_id_column_id)
    client.import_field(language, language_reader)

# Now let's run some queries on the imported CSV data to measure the
# performance of Pilosa's bitmap technique.

# Query 1: Which repositories did user 14 star?
response = client.query(stargazer.row(14))
print("User 14 starred: ", response.result.row.columns)


# Query 2: What are the top 5 programming languages in the sample data?