def _copyTable(syn, entity, destinationId, setAnnotations=False):
    """
    Copies synapse Tables

    :param entity:          A synapse ID of a Table Schema
    :param destinationId:   Synapse ID of the project to copy the Table to
    :param setAnnotations:  Set the annotations of the copied table to be the
                            annotations of the entity. Defaults to False
    """
    print("Getting table %s" % entity)
    myTableSchema = syn.get(entity)

    # CHECK: if a table with this name already exists in the destination, raise a ValueError
    search = syn.query('select name from table where projectId == "%s"' % destinationId)
    for i in search['results']:
        if i['table.name'] == myTableSchema.name:
            raise ValueError('A table named "%s" already exists in this location. '
                             'Table could not be copied' % myTableSchema.name)

    d = syn.tableQuery('select * from %s' % myTableSchema.id)
    d = d.asDataFrame()
    d = d.reset_index()
    del d['index']

    colIds = myTableSchema.columnIds
    newTableSchema = Schema(name=myTableSchema.name,
                            parent=destinationId,
                            columns=colIds)
    if setAnnotations:
        newTableSchema.annotations = myTableSchema.annotations

    if len(d) > 0:
        print("Created new table using schema %s" % newTableSchema.name)
        newTable = Table(schema=newTableSchema, values=d)
        newTable = syn.store(newTable)
        return newTable.schema.id
    else:
        print("No data, so storing schema %s" % newTableSchema.name)
        newTableSchema = syn.store(newTableSchema)
        return newTableSchema.id
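# A minimal usage sketch for _copyTable, written as comments because the IDs
# are hypothetical: it assumes a logged-in `syn` client and that Schema and
# Table are imported from synapseclient, as the function body requires.
#
#   import synapseclient
#   from synapseclient import Schema, Table
#
#   syn = synapseclient.Synapse()
#   syn.login()
#   # copy table 'syn123' into project 'syn456', carrying annotations along
#   new_table_id = _copyTable(syn, 'syn123', 'syn456', setAnnotations=True)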
def test_store_table_datetime():
    current_datetime = datetime.fromtimestamp(round(time.time(), 3))
    schema = syn.store(Schema("testTable", [Column(name="testerino", columnType='DATE')], project))
    rowset = RowSet(rows=[Row([current_datetime])], schema=schema)
    syn.store(Table(schema, rowset))

    query_result = syn.tableQuery("select * from %s" % id_of(schema), resultsAs="rowset")
    assert_equals(current_datetime, query_result.rowset['rows'][0]['values'][0])
def challenge_demo(number_of_submissions=NUM_OF_SUBMISSIONS_TO_CREATE, cleanup=True):
    try:
        # create a Challenge project, evaluation queue, etc.
        objects = set_up()
        evaluation = objects['evaluation']

        ## import challenge *after* we write the config file
        ## 'cause challenge.py imports the config file
        import challenge

        ## a dirty hack to share the same synapse connection object
        challenge.syn = syn

        # create leaderboard wiki page
        leaderboard_columns = challenge.conf.leaderboard_columns[evaluation.id]
        create_wiki(evaluation, objects['challenge_project'],
                    objects['participants_team'], leaderboard_columns)

        # create leaderboard table
        schema = syn.store(Schema(name=evaluation.name,
                                  columns=challenge.to_column_objects(leaderboard_columns),
                                  parent=objects['challenge_project']))
        # stash a reference to the table in the challenge config
        challenge.conf.leaderboard_tables[evaluation.id] = schema.id

        # create submissions on behalf of a team
        submit_to_challenge(evaluation, objects['participant_file'],
                            team=objects['my_team'], n=number_of_submissions)

        # validate correctness
        # (this can be done at the same time as scoring, below, but we
        # demonstrate doing the two tasks separately)
        challenge.validate(evaluation)

        # score the validated submissions
        challenge.score(evaluation)

        # query the results (this is the action used by dynamic leader boards
        # viewable in challenge web pages). The process of indexing submission
        # annotations for query is asynchronous. Wait a second to give it a
        # fighting chance of finishing.
        time.sleep(1)
        challenge.query(evaluation, columns=leaderboard_columns)

    finally:
        if cleanup and "objects" in locals() and objects:
            tear_down(objects)
def _table_setup(cls):
    # set up a table
    cols = [Column(name='foo', columnType='INTEGER'),
            Column(name='bar', columnType='INTEGER')]
    schema = syn.store(Schema(name='PartialRowTest' + str(uuid.uuid4()),
                              columns=cols, parent=project))
    data = [[1, None],
            [None, 2]]
    syn.store(RowSet(schema=schema, rows=[Row(r) for r in data]))
    return schema
def _table_setup(cls):
    # set up a table
    cols = [Column(name='foo', columnType='STRING', maximumSize=1000),
            Column(name='bar', columnType='STRING')]
    schema = syn.store(Schema(name='PartialRowTest' + str(uuid.uuid4()),
                              columns=cols, parent=project))
    data = [['foo1', None],
            [None, 'bar2']]
    syn.store(RowSet(schema=schema, rows=[Row(r) for r in data]))
    return schema
def test_download_table_files():
    cols = [
        Column(name='artist', columnType='STRING', maximumSize=50),
        Column(name='album', columnType='STRING', maximumSize=50),
        Column(name='year', columnType='INTEGER'),
        Column(name='catalog', columnType='STRING', maximumSize=50),
        Column(name='cover', columnType='FILEHANDLEID')]

    schema = syn.store(Schema(name='Jazz Albums', columns=cols, parent=project))
    schedule_for_cleanup(schema)

    data = [["John Coltrane", "Blue Train", 1957, "BLP 1577", "coltraneBlueTrain.jpg"],
            ["Sonny Rollins", "Vol. 2", 1957, "BLP 1558", "rollinsBN1558.jpg"],
            ["Sonny Rollins", "Newk's Time", 1958, "BLP 4001", "rollinsBN4001.jpg"],
            ["Kenny Burrel", "Kenny Burrel", 1956, "BLP 1543", "burrellWarholBN1543.jpg"]]

    ## upload files and store file handle ids
    original_files = []
    for row in data:
        path = utils.make_bogus_data_file()
        original_files.append(path)
        schedule_for_cleanup(path)
        file_handle = syn._chunkedUploadFile(path)
        row[4] = file_handle['id']

    row_reference_set = syn.store(RowSet(columns=cols, schema=schema,
                                         rows=[Row(r) for r in data]))

    ## retrieve the files for each row and verify that they are identical to the originals
    results = syn.tableQuery('select artist, album, year, catalog, cover from %s' % schema.id,
                             resultsAs="rowset")
    for i, row in enumerate(results):
        print("%s_%s" % (row.rowId, row.versionNumber), row.values)
        file_info = syn.downloadTableFile(results,
                                          rowId=row.rowId,
                                          versionNumber=row.versionNumber,
                                          column='cover',
                                          downloadLocation='.')
        assert filecmp.cmp(original_files[i], file_info['path'])
        schedule_for_cleanup(file_info['path'])
def test_tables_pandas():
    # create a pandas DataFrame
    df = pd.DataFrame({
        'A': ("foo", "bar", "baz", "qux", "asdf"),
        'B': tuple(0.42 * i for i in range(5)),
        'C': (101, 202, 303, 404, 505),
        'D': (False, True, False, True, False),
        # additional data types supported since SYNPY-347
        'int64': tuple(np.int64(range(5))),
        'datetime64': tuple(np.datetime64(d) for d in
                            ['2005-02-01', '2005-02-02', '2005-02-03',
                             '2005-02-04', '2005-02-05']),
        'string_': tuple(np.string_(s) for s in
                         ['urgot', 'has', 'dark', 'mysterious', 'past'])})

    cols = as_table_columns(df)
    cols[0].maximumSize = 20
    schema = Schema(name="Nifty Table", columns=cols, parent=project)

    # store in Synapse
    table = syn.store(Table(schema, df))

    # retrieve the table and verify
    results = syn.tableQuery('select * from %s' % table.schema.id, resultsAs='csv')
    df2 = results.asDataFrame(convert_to_datetime=True)

    # simulate rowId-version rownames for comparison
    df.index = ['%s_0' % i for i in range(5)]
    # for python3 we need to convert from numpy.bytes_ to str or the equivalence comparison fails
    if six.PY3:
        df['string_'] = df['string_'].transform(str)

    # SYNPY-717
    df['datetime64'] = df['datetime64'].apply(lambda x: pd.Timestamp(x).tz_localize('UTC'))

    # compare the retrieved DataFrame against the original, element-wise
    assert_frame_equal(df2, df)
def test_syncFromSynapse__children_contain_non_file():
    proj = syn.store(Project(name="test_syncFromSynapse_children_non_file" + str(uuid.uuid4())))
    schedule_for_cleanup(proj)

    temp_file = utils.make_bogus_data_file()
    schedule_for_cleanup(temp_file)
    file_entity = syn.store(
        File(temp_file,
             name="temp_file_test_syncFromSynapse_children_non_file" + str(uuid.uuid4()),
             parent=proj))

    table_schema = syn.store(Schema(name="table_test_syncFromSynapse", parent=proj))

    temp_folder = tempfile.mkdtemp()
    schedule_for_cleanup(temp_folder)

    files_list = synapseutils.syncFromSynapse(syn, proj, temp_folder)
    assert_equals(1, len(files_list))
    assert_equals(file_entity, files_list[0])
def get_or_create_schema(self, **kwargs) -> Schema:
    """Gets an existing table schema by name and parent or creates a new one.

    Args:
        Same arguments as synapseclient.Schema

    Returns:
        A synapseclient.Schema.
    """
    schema = Schema(**kwargs)
    schema = self._find_by_obj_or_create(schema)
    self.logger.info('{} Schema {} ({})'.format(self._update_str, schema.name, schema.id))
    return schema
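# A minimal usage sketch (hypothetical wrapper and IDs): get_or_create_schema
# is a method, so this assumes a helper object exposing it alongside a
# logged-in Synapse client and a _find_by_obj_or_create implementation.
#
#   creator = SynapseCreator()  # hypothetical wrapper class
#   schema = creator.get_or_create_schema(
#       name='My Table',
#       parent='syn123',        # hypothetical project ID
#       columns=[Column(name='foo', columnType='STRING')])
#   print(schema.id)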
def dontruntest_big_csvs():
    cols = []
    cols.append(Column(name='name', columnType='STRING', maximumSize=1000))
    cols.append(Column(name='foo', columnType='STRING', enumValues=['foo', 'bar', 'bat']))
    cols.append(Column(name='x', columnType='DOUBLE'))
    cols.append(Column(name='n', columnType='INTEGER'))
    cols.append(Column(name='is_bogus', columnType='BOOLEAN'))

    schema1 = syn.store(Schema(name='Big Table', columns=cols, parent=project))

    print("Created table:", schema1.id)
    print("with columns:", schema1.columnIds)

    ## write rows to CSV file
    with tempfile.NamedTemporaryFile(delete=False) as temp:
        schedule_for_cleanup(temp.name)
        filename = temp.name

    with io.open(filename, mode='w', encoding="utf-8", newline='') as temp:
        writer = csv.writer(temp, quoting=csv.QUOTE_NONNUMERIC, lineterminator=str(os.linesep))
        writer.writerow([col.name for col in cols])

        for i in range(10):
            for j in range(100):
                foo = cols[1].enumValues[random.randint(0, 2)]
                writer.writerow(('Robot ' + str(i * 100 + j), foo, random.random() * 200.0,
                                 random.randint(0, 100), random.random() >= 0.5))
            print("wrote 100 rows to disk")

    ## upload CSV
    upload_to_table_result = syn._uploadCsv(filepath=temp.name, schema=schema1)

    from synapseclient.table import CsvFileTable
    results = CsvFileTable.from_table_query(syn, "select * from %s" % schema1.id)
    print("etag:", results.etag)
    print("tableId:", results.tableId)

    for row in results:
        print(row)
def dontruntest_big_tables():
    cols = []
    cols.append(Column(name='name', columnType='STRING', maximumSize=1000))
    cols.append(Column(name='foo', columnType='STRING', enumValues=['foo', 'bar', 'bat']))
    cols.append(Column(name='x', columnType='DOUBLE'))
    cols.append(Column(name='n', columnType='INTEGER'))
    cols.append(Column(name='is_bogus', columnType='BOOLEAN'))

    table1 = syn.store(Schema(name='Big Table', columns=cols, parent=project))

    print("Created table:", table1.id)
    print("with columns:", table1.columnIds)

    rows_per_append = 10
    for i in range(1000):
        rows = []
        for j in range(rows_per_append):
            foo = cols[1].enumValues[random.randint(0, 2)]
            rows.append(Row(('Robot ' + str(i * rows_per_append + j), foo,
                             random.random() * 200.0, random.randint(0, 100),
                             random.random() >= 0.5)))
        print("added %d rows" % rows_per_append)
        rowset1 = syn.store(RowSet(columns=cols, schema=table1, rows=rows))

    results = syn.tableQuery("select * from %s" % table1.id)
    print("etag:", results.etag)
    print("tableId:", results.tableId)

    for row in results:
        print(row)

    results = syn.tableQuery("select n, COUNT(n), MIN(x), AVG(x), MAX(x), SUM(x) from %s group by n"
                             % table1.id)
    df = results.asDataFrame()
    print(df.shape)
    print(df)
def test_table_query(test_state):
    """Test command line ability to do table query."""

    cols = [Column(name='name', columnType='STRING', maximumSize=1000),
            Column(name='foo', columnType='STRING', enumValues=['foo', 'bar', 'bat']),
            Column(name='x', columnType='DOUBLE'),
            Column(name='age', columnType='INTEGER'),
            Column(name='cartoon', columnType='BOOLEAN')]

    project_entity = test_state.project

    schema1 = test_state.syn.store(Schema(name=str(uuid.uuid4()), columns=cols, parent=project_entity))
    test_state.schedule_for_cleanup(schema1.id)

    data1 = [['Chris', 'bar', 11.23, 45, False],
             ['Jen', 'bat', 14.56, 40, False],
             ['Jane', 'bat', 17.89, 6, False],
             ['Henry', 'bar', 10.12, 1, False]]

    test_state.syn.store(RowSet(schema=schema1, rows=[Row(r) for r in data1]))

    # Test query
    output = run(test_state,
                 'synapse',
                 '--skip-checks',
                 'query',
                 'select * from %s' % schema1.id)

    output_rows = output.rstrip("\n").split("\n")

    # Check the length of the output
    assert len(output_rows) == 5, "got %s rows" % (len(output_rows),)

    # Check that headers are correct.
    # Should be column names in schema plus the ROW_ID and ROW_VERSION
    my_headers_set = output_rows[0].split("\t")
    expected_headers_set = ["ROW_ID", "ROW_VERSION"] + list(map(lambda x: x.name, cols))
    assert my_headers_set == expected_headers_set, "%r != %r" % (my_headers_set, expected_headers_set)
def test_rowset_tables(syn, project):
    cols = [Column(name='name', columnType='STRING', maximumSize=1000),
            Column(name='foo', columnType='STRING', enumValues=['foo', 'bar', 'bat']),
            Column(name='x', columnType='DOUBLE'),
            Column(name='age', columnType='INTEGER'),
            Column(name='cartoon', columnType='BOOLEAN'),
            Column(name='description', columnType='LARGETEXT')]

    schema1 = syn.store(Schema(name='Foo Table', columns=cols, parent=project))

    data1 = [['Chris', 'bar', 11.23, 45, False, 'a'],
             ['Jen', 'bat', 14.56, 40, False, 'b'],
             ['Jane', 'bat', 17.89, 6, False, 'c' * 1002],
             ['Henry', 'bar', 10.12, 1, False, 'd']]
    row_reference_set1 = syn.store(RowSet(schema=schema1, rows=[Row(r) for r in data1]))
    assert len(row_reference_set1['rows']) == 4
def update_global_scores_table(global_data):
    import challenge_config as config
    from synapseclient import Schema, Column, Table, Row, RowSet, as_table_columns

    # 'principalId', 'name', 'score_lb', 'score_mean', 'score_ub', 'rank'
    cols = [
        Column(name='UserID', columnType='STRING', maximumSize=100),
        Column(name='Name', columnType='STRING', maximumSize=100),
        Column(name='score_lb', columnType='DOUBLE'),
        Column(name='score_mean', columnType='DOUBLE'),
        Column(name='score_ub', columnType='DOUBLE'),
        Column(name='rank', columnType='DOUBLE'),
    ]
    schema = Schema(name='Global Scores', columns=cols, parent=config.CHALLENGE_SYN_ID)

    # clear out any existing rows before storing the new data
    results = syn.tableQuery("select * from {}".format('syn7237020'))
    if len(results) > 0:
        syn.delete(results.asRowSet())

    table = syn.store(Table(schema, global_data))
    results = syn.tableQuery("select * from {}".format(table.tableId))
    for row in results:
        print(row)
    return
def test_download_table_files():
    cols = [
        Column(name='artist', columnType='STRING', maximumSize=50),
        Column(name='album', columnType='STRING', maximumSize=50),
        Column(name='year', columnType='INTEGER'),
        Column(name='catalog', columnType='STRING', maximumSize=50),
        Column(name='cover', columnType='FILEHANDLEID')]

    schema = syn.store(Schema(name='Jazz Albums', columns=cols, parent=project))
    schedule_for_cleanup(schema)

    data = [["John Coltrane", "Blue Train", 1957, "BLP 1577", "coltraneBlueTrain.jpg"],
            ["Sonny Rollins", "Vol. 2", 1957, "BLP 1558", "rollinsBN1558.jpg"],
            ["Sonny Rollins", "Newk's Time", 1958, "BLP 4001", "rollinsBN4001.jpg"],
            ["Kenny Burrel", "Kenny Burrel", 1956, "BLP 1543", "burrellWarholBN1543.jpg"]]

    ## upload files and store file handle ids
    original_files = []
    for row in data:
        path = utils.make_bogus_data_file()
        original_files.append(path)
        schedule_for_cleanup(path)
        file_handle = syn.uploadFileHandle(path, project)
        row[4] = file_handle['id']

    row_reference_set = syn.store(RowSet(schema=schema, rows=[Row(r) for r in data]))

    ## retrieve the files for each row and verify that they are identical to the originals
    results = syn.tableQuery("select artist, album, 'year', 'catalog', cover from %s" % schema.id,
                             resultsAs="rowset")
    for i, row in enumerate(results):
        path = syn.downloadTableFile(results, rowId=row.rowId,
                                     versionNumber=row.versionNumber, column='cover')
        assert filecmp.cmp(original_files[i], path)
        schedule_for_cleanup(path)

    ## test that cached copies are returned for already downloaded files
    original_downloadFile_method = syn._downloadFileHandle
    with patch("synapseclient.Synapse._downloadFileHandle") as _downloadFile_mock:
        _downloadFile_mock.side_effect = original_downloadFile_method

        results = syn.tableQuery("select artist, album, 'year', 'catalog', cover from %s "
                                 "where artist = 'John Coltrane'" % schema.id,
                                 resultsAs="rowset")
        for i, row in enumerate(results):
            file_path = syn.downloadTableFile(results, rowId=row.rowId,
                                              versionNumber=row.versionNumber, column='cover')
            assert filecmp.cmp(original_files[i], file_path)

        assert not _downloadFile_mock.called, \
            "Should have used cached copy of file and not called _downloadFile"

    ## test download table column
    results = syn.tableQuery('select * from %s' % schema.id)
    ## uncache 2 out of 4 files
    for i, row in enumerate(results):
        if i % 2 == 0:
            syn.cache.remove(row[6])
    file_map = syn.downloadTableColumns(results, ['cover'])
    assert len(file_map) == 4
    for i, row in enumerate(results):
        filecmp.cmp(original_files[i], file_map[row[6]])
def test_tables_csv():

    ## Define schema
    cols = []
    cols.append(Column(name='Name', columnType='STRING'))
    cols.append(Column(name='Born', columnType='INTEGER'))
    cols.append(Column(name='Hipness', columnType='DOUBLE'))
    cols.append(Column(name='Living', columnType='BOOLEAN'))

    schema = Schema(name='Jazz Guys', columns=cols, parent=project)

    data = [["John Coltrane", 1926, 8.65, False],
            ["Miles Davis", 1926, 9.87, False],
            ["Bill Evans", 1929, 7.65, False],
            ["Paul Chambers", 1935, 5.14, False],
            ["Jimmy Cobb", 1929, 5.78, True],
            ["Scott LaFaro", 1936, 4.21, False],
            ["Sonny Rollins", 1930, 8.99, True],
            ["Kenny Burrel", 1931, 4.37, True]]

    ## the following creates a CSV file and uploads it to create a new table
    table = syn.store(Table(schema, data))

    ## Query and download an identical CSV
    results = syn.tableQuery("select * from %s" % table.schema.id, resultsAs="csv",
                             includeRowIdAndRowVersion=False)

    ## Test that CSV file came back as expected
    for expected_row, row in zip(data, results):
        assert expected_row == row, "expected %s but got %s" % (expected_row, row)

    try:
        ## check if we have pandas
        import pandas as pd

        df = results.asDataFrame()
        assert all(df.columns.values == ['Name', 'Born', 'Hipness', 'Living'])
        assert list(df.iloc[1, [0, 1, 3]]) == ['Miles Davis', 1926, False]
        assert df.iloc[1, 2] - 9.87 < 0.0001
    except ImportError as e1:
        sys.stderr.write('Pandas is apparently not installed, skipping test of .asDataFrame '
                         'for CSV tables.\n\n')

    ## Aggregate query
    expected = {
        True: [True, 1929, 3, 6.38],
        False: [False, 1926, 5, 7.104]}

    results = syn.tableQuery('select Living, min(Born), count(Living), avg(Hipness) from %s group by Living'
                             % table.schema.id, resultsAs="csv", includeRowIdAndRowVersion=False)
    for row in results:
        living = row[0]
        assert expected[living][1] == row[1]
        assert expected[living][2] == row[2]
        assert abs(expected[living][3] - row[3]) < 0.0001

    ## Aggregate query results to DataFrame
    try:
        ## check if we have pandas
        import pandas as pd

        df = results.asDataFrame()
        assert all(expected[df.iloc[0, 0]][0:3] == df.iloc[0, 0:3])
        assert abs(expected[df.iloc[1, 0]][3] - df.iloc[1, 3]) < 0.0001
    except ImportError as e1:
        sys.stderr.write('Pandas is apparently not installed, skipping test of .asDataFrame '
                         'for aggregate queries as CSV tables.\n\n')

    ## Append rows
    more_jazz_guys = [["Sonny Clark", 1931, 8.43, False],
                      ["Hank Mobley", 1930, 5.67, False],
                      ["Freddie Hubbard", 1938, float('nan'), False],
                      ["Thelonious Monk", 1917, float('inf'), False]]
    table = syn.store(Table(table.schema, more_jazz_guys))

    ## test that CSV file now has more jazz guys
    results = syn.tableQuery("select * from %s" % table.schema.id, resultsAs="csv")
    for expected_row, row in zip(data + more_jazz_guys, results):
        for field, expected_field in zip(row[2:], expected_row):
            if type(field) is float and math.isnan(field):
                assert type(expected_field) is float and math.isnan(expected_field)
            elif type(expected_field) is float and math.isnan(expected_field):
                assert type(field) is float and math.isnan(field)
            else:
                assert expected_field == field

    ## Update as a RowSet
    rowset = results.asRowSet()
    for row in rowset['rows']:
        if row['values'][1] == 1930:
            row['values'][2] = 8.5
    row_reference_set = syn.store(rowset)

    ## aggregate queries won't return row id and version, so we need to
    ## handle this correctly
    results = syn.tableQuery('select Born, COUNT(*) from %s group by Born order by Born'
                             % table.schema.id, resultsAs="csv")
    assert results.includeRowIdAndRowVersion == False
    for i, row in enumerate(results):
        assert row[0] == [1917, 1926, 1929, 1930, 1931, 1935, 1936, 1938][i]
        assert row[1] == [1, 2, 2, 2, 2, 1, 1, 1][i]

    try:
        import pandas as pd

        results = syn.tableQuery("select * from %s where Born=1930" % table.schema.id,
                                 resultsAs="csv")
        df = results.asDataFrame()
        assert all(df['Born'].values == 1930)
        assert all(df['Hipness'].values == 8.5)

        ## Update via a Data Frame
        df['Hipness'] = 9.75
        table = syn.store(Table(table.tableId, df, etag=results.etag))

        results = syn.tableQuery("select * from %s where Born=1930" % table.tableId,
                                 resultsAs="csv")
        for row in results:
            assert row[4] == 9.75
    except ImportError as e1:
        sys.stderr.write('Pandas is apparently not installed, skipping part of test_tables_csv.\n\n')

    ## check what happens when query result is empty
    results = syn.tableQuery('select * from %s where Born=2013' % table.tableId, resultsAs="csv")
    assert len(list(results)) == 0

    try:
        import pandas as pd

        results = syn.tableQuery('select * from %s where Born=2013' % table.tableId, resultsAs="csv")
        df = results.asDataFrame()
        assert df.shape[0] == 0
    except ImportError as e1:
        sys.stderr.write('Pandas is apparently not installed, skipping part of test_tables_csv.\n\n')

    ## delete some rows
    results = syn.tableQuery('select * from %s where Hipness < 7' % table.tableId, resultsAs="csv")
    syn.delete(results)
def test_rowset_tables():
    cols = []
    cols.append(Column(name='name', columnType='STRING', maximumSize=1000))
    cols.append(Column(name='foo', columnType='STRING', enumValues=['foo', 'bar', 'bat']))
    cols.append(Column(name='x', columnType='DOUBLE'))
    cols.append(Column(name='age', columnType='INTEGER'))
    cols.append(Column(name='cartoon', columnType='BOOLEAN'))
    cols.append(Column(name='description', columnType='LARGETEXT'))

    schema1 = syn.store(Schema(name='Foo Table', columns=cols, parent=project))

    ## Get columns associated with the given table
    retrieved_cols = list(syn.getTableColumns(schema1))

    ## Test that the columns we get are the same as the ones we stored
    assert len(retrieved_cols) == len(cols)
    for retrieved_col, col in zip(retrieved_cols, cols):
        assert retrieved_col.name == col.name
        assert retrieved_col.columnType == col.columnType

    data1 = [['Chris', 'bar', 11.23, 45, False, 'a'],
             ['Jen', 'bat', 14.56, 40, False, 'b'],
             ['Jane', 'bat', 17.89, 6, False, 'c' * 1002],
             ['Henry', 'bar', 10.12, 1, False, 'd']]
    row_reference_set1 = syn.store(RowSet(schema=schema1, rows=[Row(r) for r in data1]))
    assert len(row_reference_set1['rows']) == 4

    ## add more new rows
    data2 = [['Fred', 'bat', 21.45, 20, True, 'e'],
             ['Daphne', 'foo', 27.89, 20, True, 'f'],
             ['Shaggy', 'foo', 23.45, 20, True, 'g'],
             ['Velma', 'bar', 25.67, 20, True, 'h']]
    syn.store(RowSet(schema=schema1, rows=[Row(r) for r in data2]))

    results = syn.tableQuery("select * from %s order by name" % schema1.id, resultsAs="rowset")

    assert results.count == 8
    assert results.tableId == schema1.id

    ## test that the values made the round trip
    expected = sorted(data1 + data2)
    for expected_values, row in zip(expected, results):
        assert expected_values == row['values'], \
            'got %s but expected %s' % (row['values'], expected_values)

    ## To modify rows, we have to select them first.
    result2 = syn.tableQuery('select * from %s where age>18 and age<30' % schema1.id,
                             resultsAs="rowset")

    ## make a change
    rs = result2.asRowSet()
    for row in rs['rows']:
        row['values'][2] = 88.888

    ## store it
    row_reference_set = syn.store(rs)

    ## check if the change sticks
    result3 = syn.tableQuery('select name, x, age from %s' % schema1.id, resultsAs="rowset")
    for row in result3:
        if int(row['values'][2]) == 20:
            assert row['values'][1] == 88.888

    ## Add a column
    bday_column = syn.store(Column(name='birthday', columnType='DATE'))

    column = syn.getColumn(bday_column.id)
    assert column.name == "birthday"
    assert column.columnType == "DATE"

    schema1.addColumn(bday_column)
    schema1 = syn.store(schema1)

    results = syn.tableQuery('select * from %s where cartoon=false order by age' % schema1.id,
                             resultsAs="rowset")
    rs = results.asRowSet()

    ## put data in new column
    bdays = ('2013-3-15', '2008-1-3', '1973-12-8', '1969-4-28')
    for bday, row in zip(bdays, rs.rows):
        row['values'][6] = bday
    row_reference_set = syn.store(rs)

    ## query by date and check that we get back two kids
    date_2008_jan_1 = utils.to_unix_epoch_time(datetime(2008, 1, 1))
    results = syn.tableQuery('select name from %s where birthday > %d order by birthday'
                             % (schema1.id, date_2008_jan_1), resultsAs="rowset")
    assert ["Jane", "Henry"] == [row['values'][0] for row in results]

    try:
        import pandas as pd
        df = results.asDataFrame()
        assert all(df.loc[:, "name"] == ["Jane", "Henry"])
    except ImportError as e1:
        sys.stderr.write('Pandas is apparently not installed, skipping part of test_rowset_tables.\n\n')

    results = syn.tableQuery('select birthday from %s where cartoon=false order by age' % schema1.id,
                             resultsAs="rowset")
    for bday, row in zip(bdays, results):
        assert row['values'][0] == datetime.strptime(bday, "%Y-%m-%d"), \
            "got %s but expected %s" % (row['values'][0], bday)

    try:
        import pandas as pd
        results = syn.tableQuery("select foo, MAX(x), COUNT(foo), MIN(age) from %s group by foo order by foo"
                                 % schema1.id, resultsAs="rowset")
        df = results.asDataFrame()
        assert df.shape == (3, 4)
        assert all(df.iloc[:, 0] == ["bar", "bat", "foo"])
        assert all(df.iloc[:, 1] == [88.888, 88.888, 88.888])
        assert all(df.iloc[:, 2] == [3, 3, 2])
    except ImportError as e1:
        sys.stderr.write('Pandas is apparently not installed, skipping part of test_rowset_tables.\n\n')

    ## test delete rows by deleting cartoon characters
    syn.delete(syn.tableQuery('select name from %s where cartoon = true' % schema1.id,
                              resultsAs="rowset"))

    results = syn.tableQuery('select name from %s order by birthday' % schema1.id,
                             resultsAs="rowset")
    assert ["Chris", "Jen", "Jane", "Henry"] == [row['values'][0] for row in results]

    ## check what happens when query result is empty
    results = syn.tableQuery('select * from %s where age > 1000' % schema1.id, resultsAs="rowset")
    assert len(list(results)) == 0

    try:
        import pandas as pd
        results = syn.tableQuery('select * from %s where age > 1000' % schema1.id,
                                 resultsAs="rowset")
        df = results.asDataFrame()
        assert df.shape[0] == 0
    except ImportError as e1:
        sys.stderr.write('Pandas is apparently not installed, skipping part of test_rowset_tables.\n\n')
def test_copy():
    """Tests the copy function"""
    # Create a Project
    project_entity = syn.store(Project(name=str(uuid.uuid4())))
    schedule_for_cleanup(project_entity.id)

    # Create three Folders in Project
    folder_entity = syn.store(Folder(name=str(uuid.uuid4()), parent=project_entity))
    second_folder = syn.store(Folder(name=str(uuid.uuid4()), parent=project_entity))
    third_folder = syn.store(Folder(name=str(uuid.uuid4()), parent=project_entity))
    schedule_for_cleanup(folder_entity.id)
    schedule_for_cleanup(second_folder.id)
    schedule_for_cleanup(third_folder.id)

    # Annotations and provenance
    repo_url = 'https://github.com/Sage-Bionetworks/synapsePythonClient'
    annos = {'test': ['hello_world']}
    prov = Activity(name="test", used=repo_url)

    # Create, upload, and set annotations/provenance on a file in Folder
    filename = utils.make_bogus_data_file()
    schedule_for_cleanup(filename)
    file_entity = syn.store(File(filename, parent=folder_entity))
    externalURL_entity = syn.store(File(repo_url, name='rand', parent=folder_entity,
                                        synapseStore=False))
    syn.setAnnotations(file_entity, annos)
    syn.setAnnotations(externalURL_entity, annos)
    syn.setProvenance(externalURL_entity.id, prov)
    schedule_for_cleanup(file_entity.id)
    schedule_for_cleanup(externalURL_entity.id)

    # ------------------------------------
    # TEST COPY FILE
    # ------------------------------------
    output = synapseutils.copy(syn, file_entity.id, destinationId=project_entity.id)
    output_URL = synapseutils.copy(syn, externalURL_entity.id, destinationId=project_entity.id,
                                   skipCopyAnnotations=True)

    # Verify that our copied files are identical
    copied_ent = syn.get(output[file_entity.id])
    copied_URL_ent = syn.get(output_URL[externalURL_entity.id], downloadFile=False)

    copied_ent_annot = syn.getAnnotations(copied_ent)
    copied_url_annot = syn.getAnnotations(copied_URL_ent)
    copied_prov = syn.getProvenance(copied_ent)
    copied_url_prov = syn.getProvenance(copied_URL_ent)
    schedule_for_cleanup(copied_ent.id)
    schedule_for_cleanup(copied_URL_ent.id)

    # TEST: setProvenance = Traceback
    assert_equals(copied_prov['used'][0]['reference']['targetId'], file_entity.id)
    assert_equals(copied_url_prov['used'][0]['reference']['targetId'], externalURL_entity.id)

    # TEST: Make sure copied files are the same
    assert_equals(copied_ent_annot, annos)
    assert_equals(copied_ent.dataFileHandleId, file_entity.dataFileHandleId)

    # TEST: Make sure copied URLs are the same
    assert_equals(copied_url_annot, {})
    assert_equals(copied_URL_ent.externalURL, repo_url)
    assert_equals(copied_URL_ent.name, 'rand')
    assert_equals(copied_URL_ent.dataFileHandleId, externalURL_entity.dataFileHandleId)

    # TEST: Throw error if file is copied to a folder/project that has a file with the same filename
    assert_raises(ValueError, synapseutils.copy, syn, project_entity.id,
                  destinationId=project_entity.id)
    assert_raises(ValueError, synapseutils.copy, syn, file_entity.id,
                  destinationId=project_entity.id)
    assert_raises(ValueError, synapseutils.copy, syn, file_entity.id,
                  destinationId=third_folder.id, setProvenance="gib")
    assert_raises(ValueError, synapseutils.copy, syn, file_entity.id,
                  destinationId=file_entity.id)

    # Test: setProvenance = None
    output = synapseutils.copy(syn, file_entity.id, destinationId=second_folder.id,
                               setProvenance=None)
    assert_raises(SynapseHTTPError, syn.getProvenance, output[file_entity.id])
    schedule_for_cleanup(output[file_entity.id])

    # Test: setProvenance = Existing
    output_URL = synapseutils.copy(syn, externalURL_entity.id, destinationId=second_folder.id,
                                   setProvenance="existing")
    output_prov = syn.getProvenance(output_URL[externalURL_entity.id])
    schedule_for_cleanup(output_URL[externalURL_entity.id])
    assert_equals(output_prov['name'], prov['name'])
    assert_equals(output_prov['used'], prov['used'])

    # ------------------------------------
    # TEST COPY LINKS
    # ------------------------------------
    second_file = utils.make_bogus_data_file()
    # schedule_for_cleanup(filename)
    second_file_entity = syn.store(File(second_file, parent=project_entity))
    link_entity = Link(second_file_entity.id, parent=folder_entity.id)
    link_entity = syn.store(link_entity)

    copied_link = synapseutils.copy(syn, link_entity.id, destinationId=second_folder.id)
    old = syn.get(link_entity.id, followLink=False)
    new = syn.get(copied_link[link_entity.id], followLink=False)
    assert_equals(old.linksTo['targetId'], new.linksTo['targetId'])

    schedule_for_cleanup(second_file_entity.id)
    schedule_for_cleanup(link_entity.id)
    schedule_for_cleanup(copied_link[link_entity.id])

    time.sleep(3)

    assert_raises(ValueError, synapseutils.copy, syn, link_entity.id,
                  destinationId=second_folder.id)

    # ------------------------------------
    # TEST COPY TABLE
    # ------------------------------------
    second_project = syn.store(Project(name=str(uuid.uuid4())))
    schedule_for_cleanup(second_project.id)
    cols = [Column(name='n', columnType='DOUBLE', maximumSize=50),
            Column(name='c', columnType='STRING', maximumSize=50),
            Column(name='i', columnType='INTEGER')]
    data = [[2.1, 'foo', 10],
            [2.2, 'bar', 20],
            [2.3, 'baz', 30]]

    schema = syn.store(Schema(name='Testing', columns=cols, parent=project_entity.id))
    syn.store(RowSet(schema=schema, rows=[Row(r) for r in data]))

    table_map = synapseutils.copy(syn, schema.id, destinationId=second_project.id)
    copied_table = syn.tableQuery('select * from %s' % table_map[schema.id])
    rows = copied_table.asRowSet()['rows']

    # TEST: Check if all values are the same
    for i, row in enumerate(rows):
        assert_equals(row['values'], data[i])

    assert_raises(ValueError, synapseutils.copy, syn, schema.id,
                  destinationId=second_project.id)

    schedule_for_cleanup(schema.id)
    schedule_for_cleanup(table_map[schema.id])

    # ------------------------------------
    # TEST COPY FOLDER
    # ------------------------------------
    mapping = synapseutils.copy(syn, folder_entity.id, destinationId=second_project.id)
    for i in mapping:
        old = syn.get(i, downloadFile=False)
        new = syn.get(mapping[i], downloadFile=False)
        assert_equals(old.name, new.name)
        assert_equals(old.annotations, new.annotations)
        assert_equals(old.concreteType, new.concreteType)

    assert_raises(ValueError, synapseutils.copy, syn, folder_entity.id,
                  destinationId=second_project.id)

    # TEST: Throw error if excludeTypes isn't in file, link and table or isn't a list
    assert_raises(ValueError, synapseutils.copy, syn, second_folder.id,
                  destinationId=second_project.id, excludeTypes=["foo"])
    assert_raises(ValueError, synapseutils.copy, syn, second_folder.id,
                  destinationId=second_project.id, excludeTypes="file")

    # TEST: excludeType = ["file"], only the folder is created
    second = synapseutils.copy(syn, second_folder.id, destinationId=second_project.id,
                               excludeTypes=["file", "table", "link"])
    copied_folder = syn.get(second[second_folder.id])
    assert_equals(copied_folder.name, second_folder.name)
    assert_equals(len(second), 1)

    # TEST: Make sure error is thrown if foldername already exists
    assert_raises(ValueError, synapseutils.copy, syn, second_folder.id,
                  destinationId=second_project.id)

    # ------------------------------------
    # TEST COPY PROJECT
    # ------------------------------------
    third_project = syn.store(Project(name=str(uuid.uuid4())))
    schedule_for_cleanup(third_project.id)

    mapping = synapseutils.copy(syn, project_entity.id, destinationId=third_project.id)
    for i in mapping:
        old = syn.get(i, downloadFile=False)
        new = syn.get(mapping[i], downloadFile=False)
        if not isinstance(old, Project):
            assert_equals(old.name, new.name)
        assert_equals(old.annotations, new.annotations)
        assert_equals(old.concreteType, new.concreteType)

    # TEST: Can't copy project to a folder
    assert_raises(ValueError, synapseutils.copy, syn, project_entity.id,
                  destinationId=second_folder.id)
def concatenate_tables_to_synapse_table(frames, synapse_project_id, table_name,
                                        username='', password=''):
    """
    Concatenate multiple dataframes and store as a Synapse table.

    Reuse the indices from the original DataFrame,
    increasing number of columns.

    Parameters
    ----------
    frames : list of pandas DataFrames
        DataFrames to concatenate and upload to Synapse
    synapse_project_id : string
        Synapse ID for project to which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    table_data : Pandas DataFrame
        output table
    synapse_project_id : string
        Synapse ID for project

    Examples
    --------
    >>> import pandas as pd
    >>> from mhealthx.io_data import concatenate_tables_to_synapse_table
    >>> df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
    >>>                     'B': ['B0', 'B1', 'B2', 'B3'],
    >>>                     'C': ['C0', 'C1', 'C2', 'C3'],
    >>>                     'D': ['D0', 'D1', 'D2', 'D3']},
    >>>                    index=[0, 1, 2, 3])
    >>> df2 = pd.DataFrame({'E': ['A4', 'A5', 'A6', 'A7'],
    >>>                     'F': ['B4', 'B5', 'B6', 'B7'],
    >>>                     'G': ['C4', 'C5', 'C6', 'C7'],
    >>>                     'H': ['D4', 'D5', 'D6', 'D7']},
    >>>                    index=[0, 1, 2, 3])
    >>> frames = [df1, df2]
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Test to join tables'
    >>> username = ''
    >>> password = ''
    >>> table_data, synapse_project_id = concatenate_tables_to_synapse_table(frames, synapse_project_id, table_name, username, password)
    """
    import pandas as pd
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Concatenate dataframes: reuse the indices from the original DataFrame,
    # increasing number of columns:
    table_data = pd.concat(frames, axis=1)  # , join_axes=[frames[0].index])

    # Create table schema:
    schema = Schema(name=table_name, columns=as_table_columns(table_data),
                    parent=synapse_project_id)

    # Store as Synapse table:
    table = syn.store(Table(schema, table_data))

    return table_data, synapse_project_id
def files_to_synapse_table(in_files, synapse_project_id, table_name,
                           column_name='fileID', username='', password=''):
    """
    Upload files and file handle IDs to Synapse.

    Parameters
    ----------
    in_files : list of strings
        paths to files to upload to Synapse
    synapse_project_id : string
        Synapse ID for project to which table is to be written
    table_name : string
        schema name of table
    column_name : string
        header for column of fileIDs
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    synapse_project_id : string
        Synapse ID for project

    Examples
    --------
    >>> from mhealthx.io_data import files_to_synapse_table
    >>> in_files = ['/Users/arno/Local/wav/test1.wav']
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Test to store files and file handle IDs'
    >>> column_name = 'fileID1'
    >>> username = ''
    >>> password = ''
    >>> table_data, synapse_project_id = files_to_synapse_table(in_files, synapse_project_id, table_name, column_name, username, password)
    >>> #column_name = 'fileID2'
    >>> #in_files = ['/Users/arno/Local/wav/test2.wav']
    >>> #table_data, synapse_project_id = files_to_synapse_table(in_files, synapse_project_id, table_name, column_name, username, password)
    """
    import synapseclient
    from synapseclient import Schema
    from synapseclient.table import Column, RowSet, Row

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Store file handle IDs:
    files_handles = []
    for in_file in in_files:
        file_handle = syn._chunkedUploadFile(in_file)
        files_handles.append([file_handle['id']])

    # New column headers:
    new_column_header = Column(name=column_name, columnType='FILEHANDLEID')

    # See if Synapse table exists:
    # tex = list(syn.chunkedQuery("select id from Table where parentId=='{0}'"
    #                             " and name=='{1}'".format(synapse_project_id,
    #                                                       table_name)))
    # If Synapse table does not exist, create table schema:
    # if not tex:

    # Create table schema:
    schema = syn.store(Schema(name=table_name,
                              columns=[new_column_header],
                              parent=synapse_project_id))

    # Upload files and file handle IDs with new schema:
    syn.store(RowSet(columns=[new_column_header], schema=schema,
                     rows=[Row(r) for r in files_handles]))
def copy_synapse_table(synapse_table_id, synapse_project_id, table_name='',
                       remove_columns=[], username='', password=''):
    """
    Copy Synapse table to another Synapse project.

    Parameters
    ----------
    synapse_table_id : string
        Synapse ID for table to copy
    synapse_project_id : string
        copy table to project with this Synapse ID
    table_name : string
        schema name of table
    remove_columns : list of strings
        column headers for columns to be removed
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    table_data : Pandas DataFrame
        Synapse table contents
    table_name : string
        schema name of table
    synapse_project_id : string
        Synapse ID for project within which table is to be written

    Examples
    --------
    >>> from mhealthx.io_data import copy_synapse_table
    >>> synapse_table_id = 'syn4590865'
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Copy of ' + synapse_table_id
    >>> remove_columns = ['audio_audio.m4a', 'audio_countdown.m4a']
    >>> username = ''
    >>> password = ''
    >>> table_data, table_name, synapse_project_id = copy_synapse_table(synapse_table_id, synapse_project_id, table_name, remove_columns, username, password)
    """
    import synapseclient
    from synapseclient import Schema
    from synapseclient.table import Table, as_table_columns

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Download Synapse table as a dataframe:
    results = syn.tableQuery("select * from {0}".format(synapse_table_id))
    table_data = results.asDataFrame()

    # Remove specified columns:
    if remove_columns:
        for remove_column in remove_columns:
            del table_data[remove_column]

    # Upload to Synapse table:
    table_data.index = range(table_data.shape[0])
    schema = Schema(name=table_name, columns=as_table_columns(table_data),
                    parent=synapse_project_id,
                    includeRowIdAndRowVersion=False)
    table = syn.store(Table(schema, table_data))

    return table_data, table_name, synapse_project_id
def opensmile_features_to_synapse(in_files, synapse_project_id, table_name,
                                  username, password):
    """
    Save openSMILE's SMILExtract audio features to a Synapse table.

    Parameters
    ----------
    in_files : list of strings
        full path to the input files
    synapse_project_id : string
        Synapse ID for project to which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    table_data : Pandas DataFrame
        output table
    table_name : string
        schema name of table
    synapse_table_id : string
        Synapse table ID

    Examples
    --------
    >>> from mhealthx.features import opensmile_features_to_synapse
    >>> in_files = ['/home/arno/smile/test1.wav.csv', '/home/arno/smile/test2.wav.csv', '/home/arno/smile/test3.wav.csv']
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Phonation openSMILE feature table'
    >>> username = ''
    >>> password = ''
    >>> table_data, table_name, synapse_table_id = opensmile_features_to_synapse(in_files, synapse_project_id, table_name, username, password)
    """
    import pandas as pd
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns
    from mhealthx.io_data import concatenate_tables_to_synapse_table as cat

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Store each file as a row in a Synapse table:
    # read every input CSV into a DataFrame, then concatenate them
    frames = [pd.read_csv(in_file) for in_file in in_files]
    table_data, project_id = cat(frames, synapse_project_id, table_name,
                                 username, password)

    # Create table schema:
    schema = Schema(name=table_name, columns=as_table_columns(table_data),
                    parent=synapse_project_id)

    # Store as Synapse table:
    table = syn.store(Table(schema, table_data))
    synapse_table_id = str(table.tableId)

    return table_data, table_name, synapse_table_id
def createAMPADTable(keyFile, clinicalFile):
    """
    Create the AMP AD table with merged data from keyFile with clinicalFile.

    If any of the supplementary files exist for a particular dataset, change
    the binary classifiers to the synapse ID holding the data and reset 0 to
    null for the table.

    Input:
        keyFile: Dataframe with the keys and information regarding what
            exists for each patient
        clinicalFile: Dataframe with clinical data for various patients
    """
    toUpload = []
    clinicalHeader = clinicalFile.columns.values
    # seenList = []
    # Iterate through each project within keyFile
    for i, row in keyFile.iterrows():
        # Create empty list for new row to be added to synapse table
        newRow = []
        # Ignore binary variables which all end in '_data'
        for item in row.iteritems():
            if (item[0] == 'niagas_data'):
                if (not pd.isnull(row.niagas_data)):
                    newRow.append(arrayExpressionSynID)
                else:
                    newRow.append(float('nan'))
            elif (not item[0].endswith('_data')):
                newRow.append(item[1])
        # Check if row has clinical data
        if (row.clinical_data):
            # Create reference to clinicalFile project ID
            clinicalKeyList = clinicalFile['projid']
            # get the index of the projID in the clinical file
            index = clinicalKeyList[clinicalKeyList == row.projid].index.tolist()
            if (len(index) == 1):
                index = index[0]
                # seenList.append(row.projid)
                for entry in clinicalFile.iloc[index][1:]:
                    newRow.append(entry)
            # If the length of the index is 0, it means the key file thinks
            # there is clinical information for this patient but it does
            # not exist in the clinical file
            elif (len(index) == 0):
                print("Key file indicates that projID %s should have "
                      "clinical information, but it does not exist in "
                      "the clinical information file" % row.projid)
                for _ in range(1, len(clinicalHeader)):
                    newRow.append(float('nan'))
            # If the length of the index list is greater than 1, that means
            # projID appears more than once in the file. Send warning to user
            else:
                print("projID %s appears more than once in clinical file at "
                      "positions %s" % (row.projid, index))
                for _ in range(1, len(clinicalHeader)):
                    newRow.append(float('nan'))
        else:
            for _ in range(1, len(clinicalHeader)):
                newRow.append(float('nan'))
        # Check if row has gwas data
        if (row.gwas_data):
            newRow.append(genotypeSynID)
            newRow.append(imputedGenotypeSynID)
        else:
            newRow.append(float('nan'))
            newRow.append(float('nan'))
        if (row.mwas_data):
            newRow.append(methylationSynID)
        else:
            newRow.append(float('nan'))
        if (row.mirna_data):
            newRow.append(mirnaSynID)
        else:
            newRow.append(float('nan'))
        if (row.mrna_data):
            newRow.append(rnaseqSynID)
        else:
            newRow.append(float('nan'))
        toUpload.append(newRow)

    df = pd.DataFrame(toUpload)
    columns = list(keyFile.columns.values)
    index = columns.index('clinical_data') - 1
    columns.remove('clinical_data')
    columns.remove('gwas_data')
    columns.insert(index + 1, 'genotype data')
    columns.insert(index + 2, 'imputed genotype data')
    for i in range(1, len(clinicalHeader)):
        columns.insert(index + i, clinicalHeader[i])
    df.columns = columns
    df.to_csv('mergedTables.csv', encoding='utf-8', index=False)

    print("Uploading to Synapse")
    schema = Schema(name='AMP AD Samples Table', columns=as_table_columns(df),
                    parent='syn2580853')
    syn.store(Table(schema, df))
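# A minimal usage sketch (hypothetical file names): createAMPADTable assumes
# module-level globals (syn, arrayExpressionSynID, genotypeSynID, etc.) are
# already defined, so only the two DataFrames are passed in.
#
#   keyFile = pd.read_csv('amp_ad_key.csv')            # hypothetical path
#   clinicalFile = pd.read_csv('amp_ad_clinical.csv')  # hypothetical path
#   createAMPADTable(keyFile, clinicalFile)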
def test_copy():
    """Tests the copy function"""
    # Create a Project
    project_entity = syn.store(Project(name=str(uuid.uuid4())))
    schedule_for_cleanup(project_entity.id)
    acl = syn.setPermissions(project_entity, other_user['principalId'],
                             accessType=['READ', 'CREATE', 'UPDATE', 'DOWNLOAD'])

    # Create three Folders in Project
    folder_entity = syn.store(Folder(name=str(uuid.uuid4()), parent=project_entity))
    second_folder = syn.store(Folder(name=str(uuid.uuid4()), parent=project_entity))
    third_folder = syn.store(Folder(name=str(uuid.uuid4()), parent=project_entity))
    schedule_for_cleanup(folder_entity.id)
    schedule_for_cleanup(second_folder.id)
    schedule_for_cleanup(third_folder.id)

    # Annotations and provenance
    repo_url = 'https://github.com/Sage-Bionetworks/synapsePythonClient'
    annots = {'test': ['hello_world']}
    prov = Activity(name="test", used=repo_url)

    # Create, upload, and set annotations/provenance on a file in Folder
    filename = utils.make_bogus_data_file()
    schedule_for_cleanup(filename)
    file_entity = syn.store(File(filename, parent=folder_entity))
    externalURL_entity = syn.store(File(repo_url, name='rand', parent=folder_entity,
                                        synapseStore=False))
    syn.setAnnotations(file_entity, annots)
    syn.setAnnotations(externalURL_entity, annots)
    syn.setProvenance(externalURL_entity.id, prov)
    schedule_for_cleanup(file_entity.id)
    schedule_for_cleanup(externalURL_entity.id)

    # ------------------------------------
    # TEST COPY FILE
    # ------------------------------------
    output = synapseutils.copy(syn, file_entity.id, destinationId=project_entity.id)
    output_URL = synapseutils.copy(syn, externalURL_entity.id, destinationId=project_entity.id,
                                   skipCopyAnnotations=True)

    # Verify that our copied files are identical
    copied_ent = syn.get(output[file_entity.id])
    copied_URL_ent = syn.get(output_URL[externalURL_entity.id], downloadFile=False)

    copied_ent_annot = syn.getAnnotations(copied_ent)
    copied_url_annot = syn.getAnnotations(copied_URL_ent)
    copied_prov = syn.getProvenance(copied_ent)
    copied_url_prov = syn.getProvenance(copied_URL_ent)
    schedule_for_cleanup(copied_ent.id)
    schedule_for_cleanup(copied_URL_ent.id)

    # TEST: setProvenance = Traceback
    print("Test: setProvenance = Traceback")
    assert copied_prov['used'][0]['reference']['targetId'] == file_entity.id
    assert copied_url_prov['used'][0]['reference']['targetId'] == externalURL_entity.id

    # TEST: Make sure copied files are the same
    assert copied_ent_annot == annots
    assert copied_ent.dataFileHandleId == file_entity.dataFileHandleId

    # TEST: Make sure copied URLs are the same
    assert copied_url_annot == {}
    assert copied_URL_ent.externalURL == repo_url
    assert copied_URL_ent.name == 'rand'
    assert copied_URL_ent.dataFileHandleId == externalURL_entity.dataFileHandleId

    # TEST: Throw error if file is copied to a folder/project that has a file with the same filename
    assert_raises(ValueError, synapseutils.copy, syn, project_entity.id,
                  destinationId=project_entity.id)
    assert_raises(ValueError, synapseutils.copy, syn, file_entity.id,
                  destinationId=project_entity.id)
    assert_raises(ValueError, synapseutils.copy, syn, file_entity.id,
                  destinationId=third_folder.id, setProvenance="gib")
    assert_raises(ValueError, synapseutils.copy, syn, file_entity.id,
                  destinationId=file_entity.id)

    print("Test: setProvenance = None")
    output = synapseutils.copy(syn, file_entity.id, destinationId=second_folder.id,
                               setProvenance=None)
    assert_raises(SynapseHTTPError, syn.getProvenance, output[file_entity.id])
    schedule_for_cleanup(output[file_entity.id])

    print("Test: setProvenance = Existing")
    output_URL = synapseutils.copy(syn, externalURL_entity.id, destinationId=second_folder.id,
                                   setProvenance="existing")
    output_prov = syn.getProvenance(output_URL[externalURL_entity.id])
    schedule_for_cleanup(output_URL[externalURL_entity.id])
    assert output_prov['name'] == prov['name']
    assert output_prov['used'] == prov['used']

    if 'username' not in other_user or 'password' not in other_user:
        sys.stderr.write('\nWarning: no test-authentication configured. skipping testing copy '
                         'function when trying to copy file made by another user.\n')
        return

    try:
        print("Test: Other user copy should result in different data file handle")
        syn_other = synapseclient.Synapse(skip_checks=True)
        syn_other.login(other_user['username'], other_user['password'])

        output = synapseutils.copy(syn_other, file_entity.id, destinationId=third_folder.id)
        new_copied_ent = syn.get(output[file_entity.id])
        new_copied_ent_annot = syn.getAnnotations(new_copied_ent)
        schedule_for_cleanup(new_copied_ent.id)

        copied_URL_ent.externalURL = "https://www.google.com"
        copied_URL_ent = syn.store(copied_URL_ent)
        output = synapseutils.copy(syn_other, copied_URL_ent.id, destinationId=third_folder.id,
                                   version=1)
        new_copied_URL = syn.get(output[copied_URL_ent.id], downloadFile=False)
        schedule_for_cleanup(new_copied_URL.id)

        assert new_copied_ent_annot == annots
        assert new_copied_ent.dataFileHandleId != copied_ent.dataFileHandleId
        # Test if copying different versions gets you the correct file
        assert new_copied_URL.versionNumber == 1
        assert new_copied_URL.externalURL == repo_url
        assert new_copied_URL.dataFileHandleId != copied_URL_ent.dataFileHandleId
    finally:
        syn_other.logout()

    # ------------------------------------
    # TEST COPY LINKS
    # ------------------------------------
    print("Test: Copy Links")
    second_file = utils.make_bogus_data_file()
    # schedule_for_cleanup(filename)
    second_file_entity = syn.store(File(second_file, parent=project_entity))
    link_entity = Link(second_file_entity.id, parent=folder_entity.id)
    link_entity = syn.store(link_entity)

    # function under test uses queries which are eventually consistent but not
    # immediately after creating the entities
    start_time = time.time()
    while syn.query("select id from entity where id=='%s'"
                    % link_entity.id).get('totalNumberOfResults') <= 0:
        assert_less(time.time() - start_time, QUERY_TIMEOUT_SEC)
        time.sleep(2)

    copied_link = synapseutils.copy(syn, link_entity.id, destinationId=second_folder.id)
    old = syn.get(link_entity.id, followLink=False)
    new = syn.get(copied_link[link_entity.id], followLink=False)
    assert old.linksTo['targetId'] == new.linksTo['targetId']
    assert old.linksTo['targetVersionNumber'] == new.linksTo['targetVersionNumber']

    schedule_for_cleanup(second_file_entity.id)
    schedule_for_cleanup(link_entity.id)
    schedule_for_cleanup(copied_link[link_entity.id])

    time.sleep(3)

    assert_raises(ValueError, synapseutils.copy, syn, link_entity.id,
                  destinationId=second_folder.id)

    # ------------------------------------
    # TEST COPY TABLE
    # ------------------------------------
    second_project = syn.store(Project(name=str(uuid.uuid4())))
    schedule_for_cleanup(second_project.id)
    print("Test: Copy Tables")
    cols = [Column(name='n', columnType='DOUBLE', maximumSize=50),
            Column(name='c', columnType='STRING', maximumSize=50),
            Column(name='i', columnType='INTEGER')]
    data = [[2.1, 'foo', 10],
            [2.2, 'bar', 20],
            [2.3, 'baz', 30]]

    schema = syn.store(Schema(name='Testing', columns=cols, parent=project_entity.id))
    row_reference_set = syn.store(RowSet(columns=cols, schema=schema,
                                         rows=[Row(r) for r in data]))

    table_map = synapseutils.copy(syn, schema.id, destinationId=second_project.id)
    copied_table = syn.tableQuery('select * from %s' % table_map[schema.id])
    rows = copied_table.asRowSet()['rows']

    # TEST: Check if all values are the same
    for i, row in enumerate(rows):
        assert row['values'] == data[i]

    assert_raises(ValueError, synapseutils.copy, syn, schema.id,
                  destinationId=second_project.id)

    schedule_for_cleanup(schema.id)
    schedule_for_cleanup(table_map[schema.id])

    # ------------------------------------
    # TEST COPY FOLDER
    # ------------------------------------
    print("Test: Copy Folder")
    mapping = synapseutils.copy(syn, folder_entity.id, destinationId=second_project.id)
    for i in mapping:
        old = syn.get(i, downloadFile=False)
        new = syn.get(mapping[i], downloadFile=False)
        assert old.name == new.name
        assert old.annotations == new.annotations
        assert old.concreteType == new.concreteType

    assert_raises(ValueError, synapseutils.copy, syn, folder_entity.id,
                  destinationId=second_project.id)

    # TEST: Throw error if excludeTypes isn't in file, link and table or isn't a list
    assert_raises(ValueError, synapseutils.copy, syn, second_folder.id,
                  destinationId=second_project.id, excludeTypes=["foo"])
    assert_raises(ValueError, synapseutils.copy, syn, second_folder.id,
                  destinationId=second_project.id, excludeTypes="file")

    # TEST: excludeType = ["file"], only the folder is created
    second = synapseutils.copy(syn, second_folder.id, destinationId=second_project.id,
                               excludeTypes=["file", "table", "link"])
    copied_folder = syn.get(second[second_folder.id])
    assert copied_folder.name == second_folder.name
    assert len(second) == 1

    # TEST: Make sure error is thrown if foldername already exists
    start_time = time.time()
    while syn.query("select id from entity where id=='%s'"
                    % copied_folder.id).get('totalNumberOfResults') <= 0:
        assert_less(time.time() - start_time, QUERY_TIMEOUT_SEC)
        time.sleep(2)

    assert_raises(ValueError, synapseutils.copy, syn, second_folder.id,
                  destinationId=second_project.id)

    # ------------------------------------
    # TEST COPY PROJECT
    # ------------------------------------
    print("Test: Copy Project")
    third_project = syn.store(Project(name=str(uuid.uuid4())))
    schedule_for_cleanup(third_project.id)

    mapping = synapseutils.copy(syn, project_entity.id, destinationId=third_project.id)
    for i in mapping:
        old = syn.get(i, downloadFile=False)
        new = syn.get(mapping[i], downloadFile=False)
        if not isinstance(old, Project):
            assert old.name == new.name
        assert old.annotations == new.annotations
        assert old.concreteType == new.concreteType

    # TEST: Can't copy project to a folder
    assert_raises(ValueError, synapseutils.copy, syn, project_entity.id,
                  destinationId=second_folder.id)
def test_command_get_recursive_and_query(test_state):
    """Tests the 'synapse get -r' and 'synapse get -q' functions"""

    project_entity = test_state.project

    # Create Folders in Project
    folder_entity = test_state.syn.store(Folder(name=str(uuid.uuid4()), parent=project_entity))
    folder_entity2 = test_state.syn.store(Folder(name=str(uuid.uuid4()), parent=folder_entity))

    # Create and upload two files in sub-Folder
    uploaded_paths = []
    file_entities = []

    for i in range(2):
        f = utils.make_bogus_data_file()
        uploaded_paths.append(f)
        test_state.schedule_for_cleanup(f)
        file_entity = File(f, parent=folder_entity2)
        file_entity = test_state.syn.store(file_entity)
        file_entities.append(file_entity)
        test_state.schedule_for_cleanup(f)

    # Add a file in the Folder as well
    f = utils.make_bogus_data_file()
    uploaded_paths.append(f)
    test_state.schedule_for_cleanup(f)
    file_entity = File(f, parent=folder_entity)
    file_entity = test_state.syn.store(file_entity)
    file_entities.append(file_entity)

    # get -r uses syncFromSynapse() which uses getChildren(), which is not immediately consistent,
    # but faster than chunked queries.
    time.sleep(2)
    # Test recursive get
    run(test_state,
        'synapse',
        '--skip-checks',
        'get',
        '-r',
        folder_entity.id)

    # Verify that we downloaded files:
    new_paths = [os.path.join('.', folder_entity2.name, os.path.basename(f))
                 for f in uploaded_paths[:-1]]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    test_state.schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        test_state.schedule_for_cleanup(downloaded)

    # Test query get using a Table with an entity column
    # This should be replaced when Table File Views are implemented in the client
    cols = [Column(name='id', columnType='ENTITYID')]

    schema1 = test_state.syn.store(Schema(name='Foo Table', columns=cols, parent=project_entity))
    test_state.schedule_for_cleanup(schema1.id)

    data1 = [[x.id] for x in file_entities]

    test_state.syn.store(RowSet(schema=schema1, rows=[Row(r) for r in data1]))

    time.sleep(3)  # get -q are eventually consistent
    # Test Table/View query get
    run(test_state,
        'synapse',
        '--skip-checks',
        'get',
        '-q',
        "select id from %s" % schema1.id)

    # Verify that we downloaded files:
    new_paths = [os.path.join('.', os.path.basename(f)) for f in uploaded_paths[:-1]]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    test_state.schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        test_state.schedule_for_cleanup(downloaded)

    test_state.schedule_for_cleanup(new_paths[0])
# ----------------------------------------------------------------------------------------------------------------------
# Model
# ----------------------------------------------------------------------------------------------------------------------
Base = declarative_base()

SYN_SCHEMA = Schema(
    name=secrets.CONSENTS_TABLE_NAME,
    columns=[
        SynColumn(name='study_id', columnType='STRING', maximumSize=31),
        SynColumn(name='internal_id', columnType='STRING'),
        SynColumn(name='consent_dt', columnType='STRING', maximumSize=63),
        SynColumn(name='location_sid', columnType='STRING', maximumSize=127),
        SynColumn(name='search_sid', columnType='STRING', maximumSize=127),
        SynColumn(name='notes', columnType='STRING', maximumSize=1000),
    ],
    parent=secrets.PROJECT_SYNID)
BLANK_CONSENT = ('blank', 0, 'blank', 'blank', 'blank', 'blank')


class AppWrap(object):
    """a class used to wrap the application configuration options required
    to initialize the encryption cypher"""

    def __init__(self, config):
def partial_rowset_test_state(syn, project):
    cols = [Column(name='foo', columnType='INTEGER'),
            Column(name='bar', columnType='INTEGER')]
    table_schema = syn.store(Schema(name='PartialRowTest' + str(uuid.uuid4()),
                                    columns=cols, parent=project))
    data = [[1, None],
            [None, 2]]
    syn.store(RowSet(schema=table_schema, rows=[Row(r) for r in data]))

    # set up a file view
    folder = syn.store(Folder(name="PartialRowTestFolder" + str(uuid.uuid4()), parent=project))
    syn.store(File("~/path/doesnt/matter", name="f1", parent=folder, synapseStore=False))
    syn.store(File("~/path/doesnt/matter/again", name="f2", parent=folder, synapseStore=False))

    cols = [Column(name='foo', columnType='INTEGER'),
            Column(name='bar', columnType='INTEGER')]
    view_schema = syn.store(EntityViewSchema(name='PartialRowTestViews' + str(uuid.uuid4()),
                                             columns=cols,
                                             addDefaultViewColumns=False,
                                             parent=project,
                                             scopes=[folder]))

    table_changes = [{'foo': 4}, {'bar': 5}]
    view_changes = [{'bar': 6}, {'foo': 7}]

    expected_table_cells = pd.DataFrame({'foo': [4.0, float('NaN')],
                                         'bar': [float('NaN'), 5.0]})
    expected_view_cells = pd.DataFrame({'foo': [float('NaN'), 7.0],
                                        'bar': [6.0, float('NaN')]})

    class TestState:
        def __init__(self):
            self.syn = syn
            self.project = project
            self.table_schema = table_schema
            self.view_schema = view_schema
            self.table_changes = table_changes
            self.view_changes = view_changes
            self.expected_table_cells = expected_table_cells
            self.expected_view_cells = expected_view_cells

    return TestState()
def test_syncFromSynapse__non_file_Entity():
    table_schema = "syn12345"
    with patch.object(syn, "getChildren", return_value=[]), \
            patch.object(syn, "get", return_value=Schema(name="asssdfa", parent="whatever")):
        assert_raises(ValueError, synapseutils.syncFromSynapse, syn, table_schema)