def test_twitter(self):
    """Pull the Twitter source, keep a fixed set of columns and render it.

    No assertion is made: the test passes when the pipeline raises nothing.
    """
    kept_columns = [
        "author_name",
        "author_id",
        "author_screen_name",
        "created_at",
        "hashtags",
        "text",
        "in_reply_to_status_id_str",
    ]
    tweets = Babe().pull_twitter()
    tweets = tweets.filterColumns(keep_fields=kept_columns)
    tweets.typedetect().to_string()
def test_buzzdata(self):
    """Pull an xls dataset through the buzzdata protocol and render two rows.

    No assertion is made: the test passes when the pipeline raises nothing.
    """
    pull_options = {
        'protocol': 'buzzdata',
        'dataroom': 'best-city-contest-worldwide-cost-of-living-index',
        'uuid': 'aINAPyLGur4y37yAyCM7w3',
        'username': '******',
        'format': 'xls',
    }
    Babe().pull(**pull_options).head(2).to_string()
def test_buzzdata(self):
    """Render the first two rows of an xls dataset pulled via buzzdata.

    Nothing is asserted; the test only requires the calls not to raise.
    """
    dataset = Babe().pull(
        protocol="buzzdata",
        dataroom="best-city-contest-worldwide-cost-of-living-index",
        uuid="aINAPyLGur4y37yAyCM7w3",
        username="******",
        format="xls",
    )
    preview = dataset.head(2)
    preview.to_string()
def test_twitter(self):
    """Run pull_twitter -> filterColumns -> typedetect and render the result.

    Nothing is asserted; the test only requires the calls not to raise.
    """
    fields_to_keep = [
        "author_name",
        "author_id",
        "author_screen_name",
        "created_at",
        "hashtags",
        "text",
        "in_reply_to_status_id_str",
    ]
    (
        Babe()
        .pull_twitter()
        .filterColumns(keep_fields=fields_to_keep)
        .typedetect()
        .to_string()
    )
def test_user_agent(self):
    """Check user_agent() against the expected rendering in ``self.s2``.

    The CSV in ``self.s`` is pulled, its ``useragent`` field is handed to
    user_agent() with output fields ``os``, ``browser`` and
    ``browser_version``, and the rendered stream is compared to ``self.s2``.
    """
    stream = Babe().pull(string=self.s, format="csv")
    stream = stream.user_agent(
        field="useragent",
        output_os="os",
        output_browser="browser",
        output_browser_version="browser_version",
    )
    # assertEqual is the supported spelling; assertEquals is a deprecated alias.
    self.assertEqual(stream.to_string(), self.s2)
def test_bulk(self):
    """Check bulkMapTo() with ``bulk_size=2`` and an inserted ``b`` field.

    The mapper returns one ``[total]`` entry per row it receives, where
    ``total`` is the sum of ``a`` over those rows. The rendered stream is
    compared with ``self.s2``.
    """
    def bulk_total(rows):
        # Parameter is named ``rows`` so the builtin ``list`` is not shadowed.
        return [[sum(r.a for r in rows)]] * len(rows)

    stream = Babe().pull(stream=StringIO(self.s), format="csv")
    stream = stream.typedetect()
    stream = stream.bulkMapTo(bulk_total, bulk_size=2, insert_fields=["b"])
    self.assertEqual(stream.to_string(), self.s2)
def test_html(self):
    """Render the CSV in ``self.s`` as HTML and compare with a fixed table."""
    # NOTE(review): the whitespace between tags is reproduced exactly as it
    # appears in this literal; confirm it matches the separators the HTML
    # writer really emits.
    expected_html = (
        "<h2></h2><table> "
        "<tr><th>a</th><th>b</th></tr> "
        "<tr><td>1</td><td>2</td></tr> "
        "</table> "
    )
    rendered = Babe().pull(string=self.s, format="csv").to_string(format="html")
    self.assertEqual(rendered, expected_html)
def test_join_none(self):
    """Check join() when called with ``on_error=Babe.ON_ERROR_NONE``.

    ``self.s1`` is joined on ``country`` with ``self.s2_bis`` keyed by
    ``country_code``; the rendered result is compared to ``self.sjoined_bis``.
    """
    right = Babe().pull(string=self.s2_bis, format='csv')
    left = Babe().pull(string=self.s1, format='csv')
    joined = left.join(
        join_stream=right,
        key='country',
        join_key='country_code',
        on_error=Babe.ON_ERROR_NONE,
    )
    self.assertEqual(joined.to_string(), self.sjoined_bis)
def test_bulk(self):
    """Check bulkMapTo() on bulks of two rows, inserting a ``b`` field.

    For each bulk the mapper yields one single-element list per row, each
    holding the sum of ``a`` across the bulk. Output is compared to
    ``self.s2``.
    """
    stream = Babe().pull(stream=StringIO(self.s), format="csv").typedetect()
    stream = stream.bulkMapTo(
        # ``rows`` rather than ``list``: avoid shadowing the builtin.
        lambda rows: [[sum(r.a for r in rows)]] * len(rows),
        bulk_size=2,
        insert_fields=["b"],
    )
    self.assertEqual(stream.to_string(), self.s2)
def test_user_agent(self):
    """Check that user_agent() output for ``self.s`` renders as ``self.s2``.

    user_agent() reads the ``useragent`` field and is asked to write to the
    ``os``, ``browser`` and ``browser_version`` fields.
    """
    output_fields = {
        "output_os": "os",
        "output_browser": "browser",
        "output_browser_version": "browser_version",
    }
    parsed = Babe().pull(string=self.s, format="csv").user_agent(
        field="useragent", **output_fields
    )
    self.assertEqual(parsed.to_string(), self.s2)
def test_join_none(self):
    """Join ``self.s1`` with ``self.s2_bis`` using ``Babe.ON_ERROR_NONE``.

    The join matches ``country`` against ``country_code``; the rendered
    output must equal ``self.sjoined_bis``.
    """
    joined = Babe().pull(string=self.s1, format='csv').join(
        join_stream=Babe().pull(string=self.s2_bis, format='csv'),
        key='country',
        join_key='country_code',
        on_error=Babe.ON_ERROR_NONE,
    )
    self.assertEqual(joined.to_string(), self.sjoined_bis)
def test_http(self):
    """Pull remote/files/test.csv over HTTP from localhost and check its text.

    The port comes from ``self.port``.
    """
    expected = 'foo,bar,f,d\n1,2,3.2,2010/10/02\n3,4,1.2,2011/02/02\n'
    remote = Babe().pull(
        protocol='http',
        host='localhost',
        name='Test',
        filename='remote/files/test.csv',
        port=self.port,
    )
    self.assertEqual(remote.to_string(), expected)
def test_groupby(self):
    """Check group() keyed on ``a`` with a reducer that sums ``b``."""
    def sum_b(key, rows):
        # Reducer returns the (key, total) pair for one group.
        return (key, sum(row.b for row in rows))

    stream = Babe().pull(string='a,b\n1,2\n3,4\n1,4\n', format="csv").typedetect()
    grouped = stream.group(key="a", reducer=sum_b)
    self.assertEqual(grouped.to_string(), "a,b\n1,6\n3,4\n")
def test_tuple(self):
    """Check mapTo() with a function that returns a modified row.

    Every row of tests/test.csv gets ``foo`` incremented by one through
    ``_replace``; the rendered CSV is compared with the expected text.
    """
    stream = Babe().pull(filename='tests/test.csv', name='Test').typedetect()
    stream = stream.mapTo(lambda obj: obj._replace(foo=obj.foo + 1))
    # Rendered CSV rows are newline-terminated, so the expected text spells
    # the line breaks out explicitly.
    expected = "foo,bar,f,d\n2,2,3.2,2010/10/02\n4,4,1.2,2011/02/02\n"
    self.assertEqual(stream.to_string(), expected)
def test_insert(self):
    """Check mapTo() with ``insert_fields`` adding a ``fooplus`` column.

    The mapper returns ``row.foo + 1`` for each row of tests/test.csv; the
    rendered CSV is compared with the expected text.
    """
    stream = Babe().pull(filename='tests/test.csv', name='Test').typedetect()
    stream = stream.mapTo(lambda row: row.foo + 1, insert_fields=['fooplus'])
    # Rendered CSV rows are newline-terminated, so the expected text spells
    # the line breaks out explicitly.
    expected = (
        "foo,bar,f,d,fooplus\n"
        "1,2,3.2,2010/10/02,2\n"
        "3,4,1.2,2011/02/02,4\n"
    )
    self.assertEqual(stream.to_string(), expected)
def test_insert(self):
    """Check that mapTo() can append a computed ``fooplus`` field.

    ``fooplus`` is ``foo + 1`` for every row pulled from tests/test.csv.
    """
    # One newline-terminated line per CSV row, header first.
    expected_lines = [
        "foo,bar,f,d,fooplus",
        "1,2,3.2,2010/10/02,2",
        "3,4,1.2,2011/02/02,4",
    ]
    expected = "".join(line + "\n" for line in expected_lines)

    source = Babe().pull(filename='tests/test.csv', name='Test').typedetect()
    extended = source.mapTo(lambda row: row.foo + 1, insert_fields=['fooplus'])
    self.assertEqual(extended.to_string(), expected)
def test_tuple(self):
    """Check that mapTo() accepts a mapper returning a ``_replace``d row.

    ``foo`` is incremented by one on every row pulled from tests/test.csv.
    """
    # One newline-terminated line per CSV row, header first.
    expected_lines = [
        "foo,bar,f,d",
        "2,2,3.2,2010/10/02",
        "4,4,1.2,2011/02/02",
    ]
    expected = "".join(line + "\n" for line in expected_lines)

    source = Babe().pull(filename='tests/test.csv', name='Test').typedetect()
    bumped = source.mapTo(lambda obj: obj._replace(foo=obj.foo + 1))
    self.assertEqual(bumped.to_string(), expected)
def test_parse(self):
    """Check parse_time() with a CET input and GMT output timezone.

    The ``time`` field of ``self.s`` is parsed into ``time``, ``date`` and
    ``hour`` outputs; the rendered stream is compared with ``self.s2``.
    """
    stream = Babe().pull(stream=StringIO(self.s), format='csv')
    stream = stream.parse_time(
        field="time",
        output_time="time",
        output_date="date",
        output_hour="hour",
        input_timezone="CET",
        output_timezone="GMT",
    )
    self.assertEqual(stream.to_string(), self.s2)
def test_s3(self):
    """Push a small CSV to the ``florian-test`` S3 bucket and pull it back.

    The text pulled back from ``test3.csv`` must equal the text pushed.
    """
    payload = "a,b\n1,2\n3,4\n"
    outgoing = Babe().pull(string=payload, format='csv', name='Test')
    outgoing.push(filename='test3.csv', bucket='florian-test', protocol="s3")
    incoming = Babe().pull(
        filename='test3.csv',
        name='Test',
        bucket='florian-test',
        protocol="s3",
    )
    self.assertEqual(incoming.to_string(), payload)
def test_parse(self):
    """Parse the ``time`` field of ``self.s`` from CET to GMT and compare.

    parse_time() is asked for ``time``, ``date`` and ``hour`` outputs; the
    rendered result must equal ``self.s2``.
    """
    parse_options = dict(
        field="time",
        output_time="time",
        output_date="date",
        output_hour="hour",
        input_timezone="CET",
        output_timezone="GMT",
    )
    source = Babe().pull(stream=StringIO(self.s), format='csv')
    parsed = source.parse_time(**parse_options)
    self.assertEqual(parsed.to_string(), self.s2)
def test_pushsqlite(self):
    """Round-trip ``self.s`` through a sqlite table and compare the text.

    The table ``test_table`` in tests/files/test.sqlite is dropped and
    re-created by the push, then read back with pull_sql().
    """
    db_path = 'tests/files/test.sqlite'
    source = Babe().pull(string=self.s, format='csv').typedetect()
    source.push_sql(
        table='test_table',
        database_kind='sqlite',
        database=db_path,
        drop_table=True,
        create_table=True,
    )
    stored = Babe().pull_sql(
        database_kind='sqlite', database=db_path, table='test_table'
    )
    self.assertEqual(stored.to_string(), self.s)
def test_pull_bigquery(self):
    """Run a fixed query through pull_bigquery() and print the result.

    No assertion is made: the test passes when the calls raise nothing.
    """
    dataset_id = 'ladata'
    day = '20151010'
    table_name = 'crazy_{}'.format(day)
    query = (
        " SELECT uid, count(1) FROM [{}.{}] WHERE name='pgr'"
        " GROUP BY 1 ORDER BY 2 DESC;"
    ).format(dataset_id, table_name)
    result = Babe().pull_bigquery(
        project_id='bigquery-testing-1098',
        query=query,
        timeout=1000,
        num_retries=2,
    )
    # Single-argument call form is valid on both Python 2 and Python 3.
    print(result.to_string())
def test_vectorwise(self):
    """Round-trip ``self.s`` through a vectorwise table and compare the text.

    ``test_table`` in database ``pybabe_test`` is dropped and re-created by
    the push, then read back with pull_sql().
    """
    source = Babe().pull(string=self.s, format='csv').typedetect()
    source.push_sql(
        table='test_table',
        database_kind='vectorwise',
        database='pybabe_test',
        drop_table=True,
        create_table=True,
    )
    stored = Babe().pull_sql(
        database_kind='vectorwise', database='pybabe_test', table='test_table'
    )
    self.assertEqual(stored.to_string(), self.s)
def test_s3(self):
    """Write ``test3.csv`` to S3 and verify that reading it returns the same text.

    Both the push and the pull target the ``florian-test`` bucket.
    """
    location = dict(filename='test3.csv', bucket='florian-test', protocol="s3")
    text = "a,b\n1,2\n3,4\n"
    Babe().pull(string=text, format='csv', name='Test').push(**location)
    fetched = Babe().pull(name='Test', **location)
    self.assertEqual(fetched.to_string(), text)
def test_pushsqlite_partition(self):
    """Check push_sql() with ``delete_partition=True`` on a sqlite table.

    ``self.s`` is pushed into a freshly created ``test_table``; ``self.s2``
    is then partitioned on ``id`` and pushed with ``delete_partition=True``.
    The table, sorted by ``id``, must render as ``self.sr``.
    """
    # First push: drop and re-create the table with the initial rows.
    initial = Babe().pull(string=self.s, format='csv').typedetect()
    initial.push_sql(
        table='test_table',
        database_kind='sqlite',
        database='test.sqlite',
        drop_table=True,
        create_table=True,
    )

    # Second push: rows partitioned on ``id``.
    update = Babe().pull(string=self.s2, format='csv').typedetect()
    update = update.partition(field='id')
    update.push_sql(
        table='test_table',
        database_kind='sqlite',
        database='test.sqlite',
        delete_partition=True,
    )

    stored = Babe().pull_sql(
        database_kind='sqlite', database='test.sqlite', table='test_table'
    )
    stored = stored.sort(field="id")
    self.assertEqual(stored.to_string(), self.sr)
def test_vectorwise(self):
    """Push ``self.s`` to vectorwise, pull it back and compare the rendering.

    The push drops and re-creates ``test_table`` in ``pybabe_test``.
    """
    target = dict(
        table='test_table',
        database_kind='vectorwise',
        database='pybabe_test',
    )
    Babe().pull(string=self.s, format='csv').typedetect().push_sql(
        drop_table=True, create_table=True, **target
    )
    read_back = Babe().pull_sql(**target)
    self.assertEqual(read_back.to_string(), self.s)
def test_pushsqlite_partition(self):
    """Push twice to sqlite, the second time with ``delete_partition=True``.

    After pushing ``self.s`` (table re-created) and then ``self.s2``
    partitioned on ``id``, the table sorted by ``id`` must equal ``self.sr``.
    """
    target = dict(
        table='test_table',
        database_kind='sqlite',
        database='test.sqlite',
    )

    first = Babe().pull(string=self.s, format='csv').typedetect()
    first.push_sql(drop_table=True, create_table=True, **target)

    second = Babe().pull(string=self.s2, format='csv').typedetect()
    second.partition(field='id').push_sql(delete_partition=True, **target)

    result = Babe().pull_sql(**target).sort(field="id")
    self.assertEqual(result.to_string(), self.sr)
def test_groupby(self):
    """Group an inline CSV on ``a`` and sum ``b`` within each group."""
    source = Babe().pull(string='a,b\n1,2\n3,4\n1,4\n', format="csv")
    grouped = source.typedetect().group(
        key="a",
        reducer=lambda key, rows: (key, sum(row.b for row in rows)),
    )
    self.assertEqual(grouped.to_string(), "a,b\n1,6\n3,4\n")
def test_sqldump(self):
    """Pull ``self.s`` in ``sql`` format for table ``foobar`` and compare.

    The pull names the fields ``id``, ``number``, ``title`` and
    ``datetime``; the rendered stream must equal ``self.s2``.
    """
    columns = ['id', 'number', 'title', 'datetime']
    dump = Babe().pull(
        string=self.s, format='sql', table='foobar', fields=columns
    )
    self.assertEqual(dump.to_string(), self.s2)
def test_html(self):
    """Render the CSV in ``self.s`` as HTML and print it.

    No assertion is made: the test passes when the calls raise nothing.
    """
    stream = Babe().pull(string=self.s, format="csv")
    # Single-argument call form is valid on both Python 2 and Python 3.
    print(stream.to_string(format="html"))
def test_replace(self):
    """Check replace_in_string("cou", "bar") applied to field ``b``.

    Input comes from ``self.s``; the rendered result must equal ``self.sr``.
    """
    stream = Babe().pull(stream=StringIO(self.s), format="csv")
    replaced = stream.replace_in_string("cou", "bar", field="b")
    self.assertEqual(replaced.to_string(), self.sr)
def test_http(self):
    """Pull remote/test.csv over HTTP from localhost and check its text.

    The port comes from ``self.port``.
    """
    expected = "foo,bar,f,d\n1,2,3.2,2010/10/02\n3,4,1.2,2011/02/02\n"
    remote = Babe().pull(
        protocol="http",
        host="localhost",
        name="Test",
        filename="remote/test.csv",
        port=self.port,
    )
    self.assertEqual(remote.to_string(), expected)
def test_filter2(self):
    """Check filterColumns(remove_fields=['a']) leaves only column ``b``."""
    csv_text = 'a,b\n1,2\n3,4\n1,4\n'
    stream = Babe().pull(stream=StringIO(csv_text), format="csv").typedetect()
    stream = stream.filterColumns(remove_fields=['a'])
    self.assertEqual(stream.to_string(), "b\n2\n4\n4\n")
def test_replace(self):
    """Check mapTo() with ``fields`` replacing the whole row layout.

    Each row of tests/test.csv becomes ``[foo + 1, bar * 2]`` under the
    field names ``a`` and ``b``.
    """
    expected = "a,b\n2,4\n4,8\n"
    stream = Babe().pull(filename='tests/test.csv', name='Test').typedetect()
    stream = stream.mapTo(
        lambda row: [row.foo + 1, row.bar * 2],
        fields=['a', 'b'],
    )
    self.assertEqual(stream.to_string(), expected)
def test_dedup2(self):
    """Check dedup() with no arguments: ``self.s2`` must render as ``self.s3``."""
    stream = Babe().pull(stream=StringIO(self.s2), format="csv")
    deduplicated = stream.dedup()
    self.assertEqual(deduplicated.to_string(), self.s3)
def test_windowMap(self):
    """Check windowMap() with a window argument of 3 and a summing mapper.

    The mapper builds a new row, of the same type as the last row it is
    given, holding the sum of ``a`` over the rows it receives.
    """
    def window_sum(rows):
        # ``_make`` on the last row keeps the row type of the stream.
        return rows[-1]._make([sum(row.a for row in rows)])

    csv_text = 'a\n1\n2\n3\n4\n5\n6\n7\n'
    stream = Babe().pull(stream=StringIO(csv_text), format="csv").typedetect()
    stream = stream.windowMap(3, window_sum)
    self.assertEqual(stream.to_string(), 'a\n1\n3\n6\n9\n12\n15\n18\n')
def test_min(self):
    """Check minN(column='a', n=2) on a three-row inline CSV."""
    csv_text = 'a,b\n1,2\n3,4\n1,4\n'
    stream = Babe().pull(stream=StringIO(csv_text), format="csv").typedetect()
    smallest = stream.minN(column='a', n=2)
    self.assertEqual(smallest.to_string(), 'a,b\n1,2\n1,4\n')
def test_filter_values(self):
    """Check filter_values(a=3, b=4) keeps only the matching row."""
    csv_text = 'a,b\n1,2\n3,4\n1,4\n'
    stream = Babe().pull(stream=StringIO(csv_text), format="csv").typedetect()
    matching = stream.filter_values(a=3, b=4)
    self.assertEqual(matching.to_string(), "a,b\n3,4\n")
def test_http(self):
    """Fetch remote/test.csv from localhost over HTTP and compare its text.

    ``self.port`` supplies the port to connect to.
    """
    request = dict(
        protocol='http',
        host='localhost',
        name='Test',
        filename='remote/test.csv',
        port=self.port,
    )
    fetched = Babe().pull(**request)
    self.assertEqual(
        fetched.to_string(),
        'foo,bar,f,d\n1,2,3.2,2010/10/02\n3,4,1.2,2011/02/02\n',
    )
def test_dedup4(self):
    """Check dedup(fields=['value']): ``self.s`` must render as ``self.s4``."""
    stream = Babe().pull(stream=StringIO(self.s), format="csv")
    deduplicated = stream.dedup(fields=['value'])
    self.assertEqual(deduplicated.to_string(), self.s4)
def test_groupAll(self):
    """Check group_all() with a reducer returning the maximum of ``b``.

    The reducer returns a one-element tuple, exposed as field ``max``.
    """
    def max_b(rows):
        # Trailing comma: the reducer result is a 1-tuple, not a scalar.
        return (max(row.b for row in rows),)

    stream = Babe().pull(string='a,b\n1,2\n3,4\n1,4\n', format="csv").typedetect()
    reduced = stream.group_all(reducer=max_b, fields=['max'])
    self.assertEqual(reduced.to_string(), "max\n4\n")
def test_pushpull(self):
    """Round-trip ``self.s2`` through a MongoDB collection and compare.

    The stream is pushed to collection ``test_pushpull`` in db
    ``pybabe_test`` with ``drop_collection=True``, then pulled back asking
    for fields ``rown``, ``f`` and ``s``.
    """
    source = Babe().pull(string=self.s2, format="csv", primary_key="rown")
    source = source.typedetect()
    source.push_mongo(
        db="pybabe_test",
        collection="test_pushpull",
        drop_collection=True,
    )
    stored = Babe().pull_mongo(
        db="pybabe_test",
        fields=["rown", "f", "s"],
        collection="test_pushpull",
    )
    self.assertEqual(stored.to_string(), self.s2)
def test_tuple(self):
    """Check flatMap() expanding each row into one row per ``b`` component.

    ``b`` is split on ':' and every piece yields a copy of the row with
    ``b`` replaced by that piece.
    """
    def explode_b(row):
        return [row._replace(b=piece) for piece in row.b.split(':')]

    stream = Babe().pull(stream=StringIO("a,b\n1,2:3\n4,5:6\n"), format="csv")
    exploded = stream.flatMap(explode_b)
    self.assertEqual(exploded.to_string(), "a,b\n1,2\n1,3\n4,5\n4,6\n")
def test_filter(self):
    """Check filter() with a predicate keeping rows where ``a == 3``."""
    csv_text = 'a,b\n1,2\n3,4\n1,4\n'
    stream = Babe().pull(stream=StringIO(csv_text), format="csv").typedetect()
    kept = stream.filter(function=lambda x: x.a == 3)
    self.assertEqual(kept.to_string(), 'a,b\n3,4\n')
def test_filter(self):
    """Keep only the rows whose ``a`` equals 3 and compare the rendering."""
    def a_is_three(row):
        return row.a == 3

    source = Babe().pull(stream=StringIO('a,b\n1,2\n3,4\n1,4\n'), format="csv")
    filtered = source.typedetect().filter(function=a_is_three)
    self.assertEqual(filtered.to_string(), 'a,b\n3,4\n')
def test_load(self):
    """Pull one hour of Kontagent data in sample mode and print ten rows.

    No assertion is made: the test passes when the calls raise nothing.
    """
    start_time = "2012-04-23 11:00"
    end_time = "2012-04-23 12:00"
    stream = Babe().pull_kontagent(start_time, end_time, sample_mode=True)
    stream = stream.head(n=10)
    # Single-argument call form is valid on both Python 2 and Python 3.
    print(stream.to_string())
def test_zip(self):
    """Push ``self.s`` to tests/files/test.zip and pull it back unchanged."""
    archive = 'tests/files/test.zip'
    Babe().pull(string=self.s, format="csv").push(filename=archive)
    restored = Babe().pull(filename=archive)
    self.assertEqual(restored.to_string(), self.s)
def test_primarykey3(self):
    """Check that primary_key_detect() leaves the rendering of ``self.s3`` unchanged."""
    stream = Babe().pull(stream=StringIO(self.s3), format='csv')
    detected = stream.primary_key_detect()
    self.assertEqual(detected.to_string(), self.s3)
def test_country_code(self):
    """Check geoip_country_code(): ``self.s`` must render as ``self.s2``."""
    stream = Babe().pull(string=self.s, format='csv')
    located = stream.geoip_country_code()
    self.assertEqual(located.to_string(), self.s2)
def test_transpose(self):
    """Check transpose() on ``self.s`` pulled with primary key ``city``.

    The rendered result must equal ``self.s2``.
    """
    stream = Babe().pull(string=self.s, format='csv', primary_key='city')
    transposed = stream.transpose()
    self.assertEqual(transposed.to_string(), self.s2)
def test_transpose(self):
    """Transpose the ``city``-keyed CSV in ``self.s`` and compare to ``self.s2``."""
    rendered = (
        Babe()
        .pull(string=self.s, format='csv', primary_key='city')
        .transpose()
        .to_string()
    )
    self.assertEqual(rendered, self.s2)
def test_airport(self):
    """Pull data/airports.csv, detect its primary key and render ten rows.

    No assertion is made: the test passes when the pipeline raises nothing.
    """
    airports = Babe().pull(filename='data/airports.csv')
    sample = airports.primary_key_detect().head(n=10)
    sample.to_string()
def test_replace(self):
    """Replace "cou" with "bar" in field ``b`` and compare to ``self.sr``."""
    rendered = (
        Babe()
        .pull(stream=StringIO(self.s), format="csv")
        .replace_in_string("cou", "bar", field="b")
        .to_string()
    )
    self.assertEqual(rendered, self.sr)
def test_filter2(self):
    """Remove column ``a`` with filterColumns() and compare the rendering."""
    source = Babe().pull(stream=StringIO('a,b\n1,2\n3,4\n1,4\n'), format="csv")
    remaining = source.typedetect().filterColumns(remove_fields=['a'])
    self.assertEqual(remaining.to_string(), "b\n2\n4\n4\n")
def test_groupAll(self):
    """Reduce the whole stream to the maximum of ``b`` via group_all().

    The reducer's 1-tuple result is exposed under the field name ``max``.
    """
    source = Babe().pull(string='a,b\n1,2\n3,4\n1,4\n', format="csv")
    reduced = source.typedetect().group_all(
        reducer=lambda rows: (max(row.b for row in rows),),
        fields=['max'],
    )
    self.assertEqual(reduced.to_string(), "max\n4\n")
def test_tuple(self):
    """Split ``b`` on ':' with flatMap(), producing one row per piece."""
    source = Babe().pull(stream=StringIO("a,b\n1,2:3\n4,5:6\n"), format="csv")
    flattened = source.flatMap(
        lambda row: [row._replace(b=i) for i in row.b.split(':')]
    )
    self.assertEqual(flattened.to_string(), "a,b\n1,2\n1,3\n4,5\n4,6\n")
def test_join(self):
    """Check join() of ``self.s1`` with ``self.s2``.

    Rows are matched on ``country`` against ``country_code`` in the joined
    stream; the rendered result must equal ``self.sjoined``.
    """
    right = Babe().pull(string=self.s2, format='csv')
    left = Babe().pull(string=self.s1, format='csv')
    joined = left.join(
        join_stream=right,
        key='country',
        join_key='country_code',
    )
    self.assertEqual(joined.to_string(), self.sjoined)
def test_replace(self):
    """Map each row of tests/test.csv to ``[foo + 1, bar * 2]`` as ``a,b``.

    mapTo() is given ``fields=['a', 'b']`` so the output has only those
    two fields.
    """
    def remap(row):
        return [row.foo + 1, row.bar * 2]

    source = Babe().pull(filename='tests/test.csv', name='Test').typedetect()
    remapped = source.mapTo(remap, fields=['a', 'b'])
    self.assertEqual(remapped.to_string(), "a,b\n2,4\n4,8\n")
def test_rename(self):
    """Check rename(a="c") renames the header and leaves the rows as they are."""
    csv_text = 'a,b\n1,2\n3,4\n1,4\n'
    stream = Babe().pull(stream=StringIO(csv_text), format="csv").typedetect()
    renamed = stream.rename(a="c")
    self.assertEqual(renamed.to_string(), 'c,b\n1,2\n3,4\n1,4\n')
def test_filter_values(self):
    """Filter on ``a == 3`` and ``b == 4`` via filter_values() and compare."""
    wanted = dict(a=3, b=4)
    source = Babe().pull(stream=StringIO('a,b\n1,2\n3,4\n1,4\n'), format="csv")
    matching = source.typedetect().filter_values(**wanted)
    self.assertEqual(matching.to_string(), "a,b\n3,4\n")