def test_RowSetTable_len():
    schema = Schema(parentId="syn123", id='syn456',
                    columns=[Column(name='column_name', id='123')])
    rowset = RowSet(schema=schema, rows=[Row(['first row']), Row(['second row'])])
    row_set_table = RowSetTable(schema, rowset)
    assert_equals(2, len(row_set_table))

def dontruntest_big_tables():
    cols = []
    cols.append(Column(name='name', columnType='STRING', maximumSize=1000))
    cols.append(Column(name='foo', columnType='STRING', enumValues=['foo', 'bar', 'bat']))
    cols.append(Column(name='x', columnType='DOUBLE'))
    cols.append(Column(name='n', columnType='INTEGER'))
    cols.append(Column(name='is_bogus', columnType='BOOLEAN'))

    table1 = syn.store(Schema(name='Big Table', columns=cols, parent=project))

    rows_per_append = 10
    for i in range(1000):
        rows = []
        for j in range(rows_per_append):
            foo = cols[1].enumValues[random.randint(0, 2)]
            rows.append(Row(('Robot ' + str(i * rows_per_append + j),
                             foo,
                             random.random() * 200.0,
                             random.randint(0, 100),
                             random.random() >= 0.5)))
        rowset1 = syn.store(RowSet(columns=cols, schema=table1, rows=rows))

    results = syn.tableQuery("select * from %s" % table1.id)

    results = syn.tableQuery("select n, COUNT(n), MIN(x), AVG(x), MAX(x), SUM(x) from %s group by n"
                             % table1.id)
    df = results.asDataFrame()

def test_Row_forward_compatibility():
    row = Row("2, 3, 4", rowId=1, versionNumber=1, etag=None, new_field="new")
    assert_equals("2, 3, 4", row.get("values"))
    assert_equals(1, row.get("rowId"))
    assert_equals(1, row.get("versionNumber"))
    assert_is_none(row.get("etag"))
    assert_equals("new", row.get("new_field"))

def test_store_table_datetime():
    current_datetime = datetime.fromtimestamp(round(time.time(), 3))
    schema = syn.store(Schema("testTable", [Column(name="testerino", columnType='DATE')], project))
    rowset = RowSet(rows=[Row([current_datetime])], schema=schema)
    rowset_table = syn.store(Table(schema, rowset))

    query_result = syn.tableQuery("select * from %s" % id_of(schema), resultsAs="rowset")
    assert_equals(current_datetime, query_result.rowset['rows'][0]['values'][0])

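# DATE columns travel over the wire as unix epoch milliseconds, which is why the
# test above rounds time.time() to millisecond precision before comparing. A minimal
# sketch of the conversion, using the same utils helper that test_rowset_tables
# calls further below:
epoch_millis = utils.to_unix_epoch_time(datetime(2008, 1, 1))  # 1199145600000 for a UTC datetime
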
def _table_setup(cls):
    # set up a table
    cols = [
        Column(name='foo', columnType='STRING', maximumSize=1000),
        Column(name='bar', columnType='STRING')
    ]
    schema = syn.store(Schema(name='PartialRowTest' + str(uuid.uuid4()),
                              columns=cols, parent=project))
    data = [['foo1', None], [None, 'bar2']]
    syn.store(RowSet(schema=schema, rows=[Row(r) for r in data]))
    return schema

def _table_setup(cls):
    # set up a table
    cols = [
        Column(name='foo', columnType='INTEGER'),
        Column(name='bar', columnType='INTEGER')
    ]
    schema = syn.store(Schema(name='PartialRowTest' + str(uuid.uuid4()),
                              columns=cols, parent=project))
    data = [[1, None], [None, 2]]
    syn.store(RowSet(schema=schema, rows=[Row(r) for r in data]))
    return schema

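# The two _table_setup variants above leave complementary None cells, the shape a
# partial-row update test needs. A minimal sketch of filling in only the missing
# values with synapseclient's PartialRow/PartialRowset classes, assuming the schema
# returned by the setup above and an illustrative rowId taken from a prior query:
from synapseclient.table import PartialRow, PartialRowset

bar_id = [c.id for c in syn.getTableColumns(schema) if c.name == 'bar'][0]
partial_rows = [PartialRow({bar_id: 2}, rowId=1)]  # rowId is illustrative
syn.store(PartialRowset(schema, partial_rows))
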
def test_download_table_files():
    cols = [
        Column(name='artist', columnType='STRING', maximumSize=50),
        Column(name='album', columnType='STRING', maximumSize=50),
        Column(name='year', columnType='INTEGER'),
        Column(name='catalog', columnType='STRING', maximumSize=50),
        Column(name='cover', columnType='FILEHANDLEID')
    ]
    schema = syn.store(Schema(name='Jazz Albums', columns=cols, parent=project))
    schedule_for_cleanup(schema)

    data = [["John Coltrane", "Blue Train", 1957, "BLP 1577", "coltraneBlueTrain.jpg"],
            ["Sonny Rollins", "Vol. 2", 1957, "BLP 1558", "rollinsBN1558.jpg"],
            ["Sonny Rollins", "Newk's Time", 1958, "BLP 4001", "rollinsBN4001.jpg"],
            ["Kenny Burrel", "Kenny Burrel", 1956, "BLP 1543", "burrellWarholBN1543.jpg"]]

    ## upload files and store file handle ids
    original_files = []
    for row in data:
        path = utils.make_bogus_data_file()
        original_files.append(path)
        schedule_for_cleanup(path)
        file_handle = syn._chunkedUploadFile(path)
        row[4] = file_handle['id']

    row_reference_set = syn.store(RowSet(columns=cols, schema=schema,
                                         rows=[Row(r) for r in data]))

    ## retrieve the files for each row and verify that they are identical to the originals
    results = syn.tableQuery('select artist, album, year, catalog, cover from %s' % schema.id,
                             resultsAs="rowset")
    for i, row in enumerate(results):
        print("%s_%s" % (row.rowId, row.versionNumber), row.values)
        file_info = syn.downloadTableFile(results, rowId=row.rowId,
                                          versionNumber=row.versionNumber,
                                          column='cover', downloadLocation='.')
        assert filecmp.cmp(original_files[i], file_info['path'])
        schedule_for_cleanup(file_info['path'])

def test_rowset_tables():
    cols = [
        Column(name='name', columnType='STRING', maximumSize=1000),
        Column(name='foo', columnType='STRING', enumValues=['foo', 'bar', 'bat']),
        Column(name='x', columnType='DOUBLE'),
        Column(name='age', columnType='INTEGER'),
        Column(name='cartoon', columnType='BOOLEAN'),
        Column(name='description', columnType='LARGETEXT')
    ]

    schema1 = syn.store(Schema(name='Foo Table', columns=cols, parent=project))

    data1 = [['Chris', 'bar', 11.23, 45, False, 'a'],
             ['Jen', 'bat', 14.56, 40, False, 'b'],
             ['Jane', 'bat', 17.89, 6, False, 'c' * 1002],
             ['Henry', 'bar', 10.12, 1, False, 'd']]
    row_reference_set1 = syn.store(RowSet(schema=schema1, rows=[Row(r) for r in data1]))
    assert_equals(len(row_reference_set1['rows']), 4)

def test_download_table_files():
    cols = [
        Column(name='artist', columnType='STRING', maximumSize=50),
        Column(name='album', columnType='STRING', maximumSize=50),
        Column(name='year', columnType='INTEGER'),
        Column(name='catalog', columnType='STRING', maximumSize=50),
        Column(name='cover', columnType='FILEHANDLEID')
    ]
    schema = syn.store(Schema(name='Jazz Albums', columns=cols, parent=project))
    schedule_for_cleanup(schema)

    data = [["John Coltrane", "Blue Train", 1957, "BLP 1577", "coltraneBlueTrain.jpg"],
            ["Sonny Rollins", "Vol. 2", 1957, "BLP 1558", "rollinsBN1558.jpg"],
            ["Sonny Rollins", "Newk's Time", 1958, "BLP 4001", "rollinsBN4001.jpg"],
            ["Kenny Burrel", "Kenny Burrel", 1956, "BLP 1543", "burrellWarholBN1543.jpg"]]

    ## upload files and store file handle ids
    original_files = []
    for row in data:
        path = utils.make_bogus_data_file()
        original_files.append(path)
        schedule_for_cleanup(path)
        file_handle = syn.uploadFileHandle(path, project)
        row[4] = file_handle['id']

    row_reference_set = syn.store(RowSet(schema=schema, rows=[Row(r) for r in data]))

    ## retrieve the files for each row and verify that they are identical to the originals
    results = syn.tableQuery("select artist, album, 'year', 'catalog', cover from %s" % schema.id,
                             resultsAs="rowset")
    for i, row in enumerate(results):
        path = syn.downloadTableFile(results, rowId=row.rowId,
                                     versionNumber=row.versionNumber, column='cover')
        assert filecmp.cmp(original_files[i], path)
        schedule_for_cleanup(path)

    ## test that cached copies are returned for already downloaded files
    original_downloadFile_method = syn._downloadFileHandle
    with patch("synapseclient.Synapse._downloadFileHandle") as _downloadFile_mock:
        _downloadFile_mock.side_effect = original_downloadFile_method

        results = syn.tableQuery("select artist, album, 'year', 'catalog', cover from %s"
                                 " where artist = 'John Coltrane'" % schema.id,
                                 resultsAs="rowset")
        for i, row in enumerate(results):
            file_path = syn.downloadTableFile(results, rowId=row.rowId,
                                              versionNumber=row.versionNumber, column='cover')
            assert filecmp.cmp(original_files[i], file_path)

        assert not _downloadFile_mock.called, \
            "Should have used cached copy of file and not called _downloadFile"

    ## test download table column
    results = syn.tableQuery('select * from %s' % schema.id)
    ## uncache 2 out of 4 files
    for i, row in enumerate(results):
        if i % 2 == 0:
            syn.cache.remove(row[6])
    file_map = syn.downloadTableColumns(results, ['cover'])
    assert len(file_map) == 4
    for i, row in enumerate(results):
        assert filecmp.cmp(original_files[i], file_map[row[6]])

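# downloadTableColumns returns a dict mapping each file handle id to its local
# download path, which is why file_map can be indexed by row[6] (the 'cover'
# FILEHANDLEID cell) in the test above. A minimal usage sketch, assuming the same
# query result:
file_map = syn.downloadTableColumns(results, ['cover'])
for file_handle_id, local_path in file_map.items():
    print(file_handle_id, local_path)
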
def test_rowset_tables():
    cols = []
    cols.append(Column(name='name', columnType='STRING', maximumSize=1000))
    cols.append(Column(name='foo', columnType='STRING', enumValues=['foo', 'bar', 'bat']))
    cols.append(Column(name='x', columnType='DOUBLE'))
    cols.append(Column(name='age', columnType='INTEGER'))
    cols.append(Column(name='cartoon', columnType='BOOLEAN'))
    cols.append(Column(name='description', columnType='LARGETEXT'))

    schema1 = syn.store(Schema(name='Foo Table', columns=cols, parent=project))

    ## Get columns associated with the given table
    retrieved_cols = list(syn.getTableColumns(schema1))

    ## Test that the columns we get are the same as the ones we stored
    assert len(retrieved_cols) == len(cols)
    for retrieved_col, col in zip(retrieved_cols, cols):
        assert retrieved_col.name == col.name
        assert retrieved_col.columnType == col.columnType

    data1 = [['Chris', 'bar', 11.23, 45, False, 'a'],
             ['Jen', 'bat', 14.56, 40, False, 'b'],
             ['Jane', 'bat', 17.89, 6, False, 'c' * 1002],
             ['Henry', 'bar', 10.12, 1, False, 'd']]
    row_reference_set1 = syn.store(RowSet(schema=schema1, rows=[Row(r) for r in data1]))
    assert len(row_reference_set1['rows']) == 4

    ## add more new rows
    data2 = [['Fred', 'bat', 21.45, 20, True, 'e'],
             ['Daphne', 'foo', 27.89, 20, True, 'f'],
             ['Shaggy', 'foo', 23.45, 20, True, 'g'],
             ['Velma', 'bar', 25.67, 20, True, 'h']]
    syn.store(RowSet(schema=schema1, rows=[Row(r) for r in data2]))

    results = syn.tableQuery("select * from %s order by name" % schema1.id, resultsAs="rowset")

    assert results.count == 8
    assert results.tableId == schema1.id

    ## test that the values made the round trip
    expected = sorted(data1 + data2)
    for expected_values, row in zip(expected, results):
        assert expected_values == row['values'], \
            'got %s but expected %s' % (row['values'], expected_values)

    ## To modify rows, we have to select them first.
    result2 = syn.tableQuery('select * from %s where age>18 and age<30' % schema1.id,
                             resultsAs="rowset")

    ## make a change
    rs = result2.asRowSet()
    for row in rs['rows']:
        row['values'][2] = 88.888

    ## store it
    row_reference_set = syn.store(rs)

    ## check if the change sticks
    result3 = syn.tableQuery('select name, x, age from %s' % schema1.id, resultsAs="rowset")
    for row in result3:
        if int(row['values'][2]) == 20:
            assert row['values'][1] == 88.888

    ## Add a column
    bday_column = syn.store(Column(name='birthday', columnType='DATE'))

    column = syn.getColumn(bday_column.id)
    assert column.name == "birthday"
    assert column.columnType == "DATE"

    schema1.addColumn(bday_column)
    schema1 = syn.store(schema1)

    results = syn.tableQuery('select * from %s where cartoon=false order by age' % schema1.id,
                             resultsAs="rowset")
    rs = results.asRowSet()

    ## put data in new column
    bdays = ('2013-3-15', '2008-1-3', '1973-12-8', '1969-4-28')
    for bday, row in zip(bdays, rs.rows):
        row['values'][6] = bday
    row_reference_set = syn.store(rs)

    ## query by date and check that we get back two kids
    date_2008_jan_1 = utils.to_unix_epoch_time(datetime(2008, 1, 1))
    results = syn.tableQuery('select name from %s where birthday > %d order by birthday'
                             % (schema1.id, date_2008_jan_1), resultsAs="rowset")
    assert ["Jane", "Henry"] == [row['values'][0] for row in results]

    try:
        import pandas as pd
        df = results.asDataFrame()
        assert all(df.loc[:, "name"] == ["Jane", "Henry"])
    except ImportError:
        sys.stderr.write('Pandas is apparently not installed, skipping part of test_rowset_tables.\n\n')

    results = syn.tableQuery('select birthday from %s where cartoon=false order by age' % schema1.id,
                             resultsAs="rowset")
    for bday, row in zip(bdays, results):
        assert row['values'][0] == datetime.strptime(bday, "%Y-%m-%d"), \
            "got %s but expected %s" % (row['values'][0], bday)

    try:
        import pandas as pd
        results = syn.tableQuery("select foo, MAX(x), COUNT(foo), MIN(age) from %s group by foo order by foo"
                                 % schema1.id, resultsAs="rowset")
        df = results.asDataFrame()
        assert df.shape == (3, 4)
        assert all(df.iloc[:, 0] == ["bar", "bat", "foo"])
        assert all(df.iloc[:, 1] == [88.888, 88.888, 88.888])
        assert all(df.iloc[:, 2] == [3, 3, 2])
    except ImportError:
        sys.stderr.write('Pandas is apparently not installed, skipping part of test_rowset_tables.\n\n')

    ## test delete rows by deleting cartoon characters
    syn.delete(syn.tableQuery('select name from %s where cartoon = true' % schema1.id,
                              resultsAs="rowset"))

    results = syn.tableQuery('select name from %s order by birthday' % schema1.id,
                             resultsAs="rowset")
    assert ["Chris", "Jen", "Jane", "Henry"] == [row['values'][0] for row in results]

    ## check what happens when query result is empty
    results = syn.tableQuery('select * from %s where age > 1000' % schema1.id, resultsAs="rowset")
    assert len(list(results)) == 0

    try:
        import pandas as pd
        results = syn.tableQuery('select * from %s where age > 1000' % schema1.id,
                                 resultsAs="rowset")
        df = results.asDataFrame()
        assert df.shape[0] == 0
    except ImportError:
        sys.stderr.write('Pandas is apparently not installed, skipping part of test_rowset_tables.\n\n')

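# The row-update idiom exercised above, distilled: query with resultsAs="rowset",
# mutate the returned RowSet (each row carries its rowId and etag, so storing it
# back is an update rather than an append), then store it. A minimal sketch against
# the same schema1 table:
results = syn.tableQuery('select name, x, age from %s' % schema1.id, resultsAs="rowset")
rs = results.asRowSet()
for row in rs['rows']:
    row['values'][1] = 0.0  # overwrite the DOUBLE column 'x'
syn.store(rs)
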
def files_to_synapse_table(in_files, synapse_project_id, table_name,
                           column_name='fileID', username='', password=''):
    """
    Upload files and file handle IDs to Synapse.

    Parameters
    ----------
    in_files : list of strings
        paths to files to upload to Synapse
    synapse_project_id : string
        Synapse ID for project to which table is to be written
    table_name : string
        schema name of table
    column_name : string
        header for column of fileIDs
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    synapse_project_id : string
        Synapse ID for project

    Examples
    --------
    >>> from mhealthx.io_data import files_to_synapse_table
    >>> in_files = ['/Users/arno/Local/wav/test1.wav']
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Test to store files and file handle IDs'
    >>> column_name = 'fileID1'
    >>> username = ''
    >>> password = ''
    >>> synapse_project_id = files_to_synapse_table(in_files, synapse_project_id, table_name, column_name, username, password)
    >>> #column_name = 'fileID2'
    >>> #in_files = ['/Users/arno/Local/wav/test2.wav']
    >>> #synapse_project_id = files_to_synapse_table(in_files, synapse_project_id, table_name, column_name, username, password)
    """
    import synapseclient
    from synapseclient import Schema
    from synapseclient.table import Column, RowSet, Row

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Store file handle IDs:
    files_handles = []
    for in_file in in_files:
        file_handle = syn._chunkedUploadFile(in_file)
        files_handles.append([file_handle['id']])

    # New column headers:
    new_column_header = Column(name=column_name, columnType='FILEHANDLEID')

    # See if Synapse table exists:
    # tex = list(syn.chunkedQuery("select id from Table where parentId=='{0}'"
    #                             " and name=='{1}'".format(synapse_project_id,
    #                                                       table_name)))
    # If Synapse table does not exist, create table schema:
    # if not tex:

    # Create table schema:
    schema = syn.store(Schema(name=table_name,
                              columns=[new_column_header],
                              parent=synapse_project_id))

    # Upload files and file handle IDs with new schema:
    syn.store(RowSet(columns=[new_column_header],
                     schema=schema,
                     rows=[Row(r) for r in files_handles]))

    return synapse_project_id
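
# Note: _chunkedUploadFile is a private client method that newer synapseclient
# releases drop in favor of uploadFileHandle, the call test_download_table_files
# above already uses. A minimal sketch of the equivalent upload loop, assuming a
# current client and the same in_files/synapse_project_id arguments:
files_handles = []
for in_file in in_files:
    file_handle = syn.uploadFileHandle(in_file, synapse_project_id)
    files_handles.append([file_handle['id']])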