Example #1
def _copyTable(syn, entity, destinationId, setAnnotations=False):
    """
    Copies synapse Tables

    :param entity:          A synapse ID of Table Schema

    :param destinationId:   Synapse ID of a project that the Table wants to be copied to

    :param setAnnotations:  Set the annotations of the copied table to be the annotations of the entity
                            Defaults to False
    """

    print("Getting table %s" % entity)
    myTableSchema = syn.get(entity)
    #CHECK: If Table name already exists, raise value error
    search = syn.query('select name from table where projectId == "%s"' % destinationId)
    for i in search['results']:
        if i['table.name'] == myTableSchema.name:
            raise ValueError('A table named "%s" already exists in this location. Table could not be copied' % myTableSchema.name)

    d = syn.tableQuery('select * from %s' % myTableSchema.id)
    d = d.asDataFrame()
    d = d.reset_index(drop=True)

    colIds = myTableSchema.columnIds

    newTableSchema = Schema(name=myTableSchema.name,
                            parent=destinationId,
                            columns=colIds)
    if setAnnotations:
        newTableSchema.annotations = myTableSchema.annotations

    if len(d) > 0:
        print("Created new table using schema %s" % newTableSchema.name)
        newTable = Table(schema=newTableSchema, values=d)
        newTable = syn.store(newTable)
        return newTable.schema.id
    else:
        print("No data, so storing schema %s" % newTableSchema.name)
        newTableSchema = syn.store(newTableSchema)
        return newTableSchema.id
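A minimal usage sketch for the helper above (assuming a logged-in client; the Synapse IDs below are hypothetical placeholders):

import synapseclient

syn = synapseclient.login()
# copy the table behind 'syn1234567' into project 'syn7654321',
# carrying the source table's annotations along
new_schema_id = _copyTable(syn, 'syn1234567', 'syn7654321', setAnnotations=True)
print(new_schema_id)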
Example #2
def test_store_table_datetime():
    current_datetime = datetime.fromtimestamp(round(time.time(), 3))
    schema = syn.store(
        Schema("testTable", [Column(name="testerino", columnType='DATE')],
               project))
    rowset = RowSet(rows=[Row([current_datetime])], schema=schema)
    syn.store(Table(schema, rowset))

    query_result = syn.tableQuery("select * from %s" % id_of(schema),
                                  resultsAs="rowset")
    assert_equals(current_datetime,
                  query_result.rowset['rows'][0]['values'][0])
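Synapse DATE columns hold millisecond-precision epoch timestamps, which is why the test rounds the current time to three decimal places before storing it. A sketch of the equivalent manual conversion (assuming the to_unix_epoch_time helper in synapseclient.utils, which the rowset example further down also uses):

from datetime import datetime
from synapseclient import utils

millis = utils.to_unix_epoch_time(datetime(2008, 1, 1))  # milliseconds since the epoch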
Example #3
def challenge_demo(number_of_submissions=NUM_OF_SUBMISSIONS_TO_CREATE,
                   cleanup=True):
    try:
        # create a Challenge project, evaluation queue, etc.
        objects = set_up()
        evaluation = objects['evaluation']

        ## import challenge *after* we write the config file,
        ## because challenge.py imports the config file
        import challenge

        ## a dirty hack to share the same synapse connection object
        challenge.syn = syn

        # create leaderboard wiki page
        leaderboard_columns = challenge.conf.leaderboard_columns[evaluation.id]
        create_wiki(evaluation, objects['challenge_project'],
                    objects['participants_team'], leaderboard_columns)

        # create leaderboard table
        schema = syn.store(
            Schema(name=evaluation.name,
                   columns=challenge.to_column_objects(leaderboard_columns),
                   parent=objects['challenge_project']))

        # stash a reference to the table in the challenge config
        challenge.conf.leaderboard_tables[evaluation.id] = schema.id

        # create submissions on behalf of a team
        submit_to_challenge(evaluation,
                            objects['participant_file'],
                            team=objects['my_team'],
                            n=number_of_submissions)

        # validate correctness
        # (this can be done at the same time as scoring, below, but we
        # demonstrate doing the two tasks separately)
        challenge.validate(evaluation)

        # score the validated submissions
        challenge.score(evaluation)

        # query the results (this is the action used by dynamic leader boards
        # viewable in challenge web pages). The process of indexing submission
        # annotations for query is asynchronous. Wait a second to give it a
        # fighting chance of finishing.
        time.sleep(1)
        challenge.query(evaluation, columns=leaderboard_columns)

    finally:
        if cleanup and "objects" in locals() and objects:
            tear_down(objects)
Example #4
def _table_setup(cls):
    # set up a table
    cols = [
        Column(name='foo', columnType='INTEGER'),
        Column(name='bar', columnType='INTEGER')
    ]
    schema = syn.store(
        Schema(name='PartialRowTest' + str(uuid.uuid4()),
               columns=cols,
               parent=project))
    data = [[1, None], [None, 2]]
    syn.store(RowSet(schema=schema, rows=[Row(r) for r in data]))
    return schema
Example #5
def _table_setup(cls):
    # set up a table
    cols = [
        Column(name='foo', columnType='STRING', maximumSize=1000),
        Column(name='bar', columnType='STRING')
    ]
    schema = syn.store(
        Schema(name='PartialRowTest' + str(uuid.uuid4()),
               columns=cols,
               parent=project))
    data = [['foo1', None], [None, 'bar2']]
    syn.store(RowSet(schema=schema, rows=[Row(r) for r in data]))
    return schema
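Both fixtures above seed rows with None cells so that partial row updates can be exercised. A hedged sketch of such an update (assuming PartialRowset.from_mapping from synapseclient.table, available in newer client releases; 'syn123' is a placeholder):

from synapseclient.table import PartialRowset

results = syn.tableQuery("select * from syn123")
# map row id -> {column name: new value}; cells not mentioned are left untouched
changes = {1: {'foo': 3}, 2: {'bar': 4}}
syn.store(PartialRowset.from_mapping(changes, results))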
Example #6
def test_download_table_files():
    cols = [
        Column(name='artist', columnType='STRING', maximumSize=50),
        Column(name='album', columnType='STRING', maximumSize=50),
        Column(name='year', columnType='INTEGER'),
        Column(name='catalog', columnType='STRING', maximumSize=50),
        Column(name='cover', columnType='FILEHANDLEID')
    ]

    schema = syn.store(Schema(name='Jazz Albums', columns=cols,
                              parent=project))
    schedule_for_cleanup(schema)

    data = [
        ["John Coltrane", "Blue Train", 1957, "BLP 1577", "coltraneBlueTrain.jpg"],
        ["Sonny Rollins", "Vol. 2", 1957, "BLP 1558", "rollinsBN1558.jpg"],
        ["Sonny Rollins", "Newk's Time", 1958, "BLP 4001", "rollinsBN4001.jpg"],
        ["Kenny Burrel", "Kenny Burrel", 1956, "BLP 1543", "burrellWarholBN1543.jpg"]
    ]

    ## upload files and store file handle ids
    original_files = []
    for row in data:
        path = utils.make_bogus_data_file()
        original_files.append(path)
        schedule_for_cleanup(path)
        file_handle = syn._chunkedUploadFile(path)
        row[4] = file_handle['id']

    row_reference_set = syn.store(
        RowSet(columns=cols, schema=schema, rows=[Row(r) for r in data]))

    ## retrieve the files for each row and verify that they are identical to the originals
    results = syn.tableQuery(
        'select artist, album, year, catalog, cover from %s' % schema.id,
        resultsAs="rowset")
    for i, row in enumerate(results):
        print "%s_%s" % (row.rowId, row.versionNumber), row.values
        file_info = syn.downloadTableFile(results,
                                          rowId=row.rowId,
                                          versionNumber=row.versionNumber,
                                          column='cover',
                                          downloadLocation='.')
        assert filecmp.cmp(original_files[i], file_info['path'])
        schedule_for_cleanup(file_info['path'])
Example #7
def test_tables_pandas():
    # create a pandas DataFrame
    df = pd.DataFrame({
        'A': ("foo", "bar", "baz", "qux", "asdf"),
        'B': tuple(0.42 * i for i in range(5)),
        'C': (101, 202, 303, 404, 505),
        'D': (False, True, False, True, False),
        # additional data types supported since SYNPY-347
        'int64': tuple(np.int64(range(5))),
        'datetime64': tuple(np.datetime64(d) for d in
                            ['2005-02-01', '2005-02-02', '2005-02-03',
                             '2005-02-04', '2005-02-05']),
        'string_': tuple(np.string_(s) for s in
                         ['urgot', 'has', 'dark', 'mysterious', 'past'])
    })

    cols = as_table_columns(df)
    cols[0].maximumSize = 20
    schema = Schema(name="Nifty Table", columns=cols, parent=project)

    # store in Synapse
    table = syn.store(Table(schema, df))

    # retrieve the table and verify
    results = syn.tableQuery('select * from %s' % table.schema.id,
                             resultsAs='csv')
    df2 = results.asDataFrame(convert_to_datetime=True)

    # simulate rowId-version rownames for comparison
    df.index = ['%s_0' % i for i in range(5)]

    # for python3 we need to convert from numpy.bytes_ to str or the equivalence comparison fails
    if six.PY3:
        df['string_'] = df['string_'].transform(str)

    # SYNPY-717
    df['datetime64'] = df['datetime64'].apply(
        lambda x: pd.Timestamp(x).tz_localize('UTC'))

    # assert_frame_equal compares the two frames cell by cell, with tolerance
    # for floating point values, and raises with a useful diff on mismatch

    assert_frame_equal(df2, df)
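The convenience here is as_table_columns, which infers a Synapse Column type from each pandas dtype. A minimal sketch of the same round trip with a tiny frame (assuming a logged-in syn and an existing project, as in the test above):

import pandas as pd
from synapseclient import Schema, Table, as_table_columns

df = pd.DataFrame({'name': ['a', 'b'], 'n': [1, 2]})
cols = as_table_columns(df)  # yields a STRING and an INTEGER column
schema = Schema(name='Tiny Table', columns=cols, parent=project)
table = syn.store(Table(schema, df))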
Example #8
def test_syncFromSynapse__children_contain_non_file():
    proj = syn.store(Project(name="test_syncFromSynapse_children_non_file" + str(uuid.uuid4())))
    schedule_for_cleanup(proj)

    temp_file = utils.make_bogus_data_file()
    schedule_for_cleanup(temp_file)
    file_entity = syn.store(File(temp_file, name="temp_file_test_syncFromSynapse_children_non_file" + str(uuid.uuid4()), parent=proj))

    table_schema = syn.store(Schema(name="table_test_syncFromSynapse", parent=proj))

    temp_folder = tempfile.mkdtemp()
    schedule_for_cleanup(temp_folder)

    files_list = synapseutils.syncFromSynapse(syn, proj, temp_folder)
    assert_equals(1, len(files_list))
    assert_equals(file_entity, files_list[0])
Example #9
    def get_or_create_schema(self, **kwargs) -> Schema:
        """Gets an existing table schema by name and parent or
        creates a new one.

        Args:
            Same arguments as synapseclient.Schema

        Returns:
            A synapseclient.Schema.

        """

        schema = Schema(**kwargs)
        schema = self._find_by_obj_or_create(schema)
        self.logger.info('{} Schema {} ({})'.format(self._update_str,
                                                    schema.name, schema.id))
        return schema
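The _find_by_obj_or_create helper is internal to the class above; one way to approximate get-or-create by name and parent with public client calls (a sketch, assuming syn.getChildren is available, as in recent synapseclient releases):

def get_or_create_schema(syn, name, parent, **kwargs):
    # look for an existing table with the same name under the parent
    for child in syn.getChildren(parent, includeTypes=['table']):
        if child['name'] == name:
            return syn.get(child['id'])
    # none found: create and store a fresh schema
    return syn.store(Schema(name=name, parent=parent, **kwargs))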
Example #10
def dontruntest_big_csvs():
    cols = []
    cols.append(Column(name='name', columnType='STRING', maximumSize=1000))
    cols.append(
        Column(name='foo',
               columnType='STRING',
               enumValues=['foo', 'bar', 'bat']))
    cols.append(Column(name='x', columnType='DOUBLE'))
    cols.append(Column(name='n', columnType='INTEGER'))
    cols.append(Column(name='is_bogus', columnType='BOOLEAN'))

    schema1 = syn.store(Schema(name='Big Table', columns=cols, parent=project))

    print("Created table:", schema1.id)
    print("with columns:", schema1.columnIds)

    ## write rows to CSV file
    with tempfile.NamedTemporaryFile(delete=False) as temp:
        schedule_for_cleanup(temp.name)
        filename = temp.name

    with io.open(filename, mode='w', encoding="utf-8", newline='') as temp:
        writer = csv.writer(temp,
                            quoting=csv.QUOTE_NONNUMERIC,
                            lineterminator=str(os.linesep))
        writer.writerow([col.name for col in cols])

        for i in range(10):
            for j in range(100):
                foo = cols[1].enumValues[random.randint(0, 2)]
                writer.writerow(
                    ('Robot ' + str(i * 100 + j), foo, random.random() * 200.0,
                     random.randint(0, 100), random.random() >= 0.5))
            print("wrote 100 rows to disk")

    ## upload CSV
    upload_result = syn._uploadCsv(filepath=temp.name, schema=schema1)

    from synapseclient.table import CsvFileTable
    results = CsvFileTable.from_table_query(syn,
                                            "select * from %s" % schema1.id)
    print("etag:", results.etag)
    print("tableId:", results.tableId)

    for row in results:
        print(row)
Example #11
def dontruntest_big_tables():
    cols = []
    cols.append(Column(name='name', columnType='STRING', maximumSize=1000))
    cols.append(
        Column(name='foo',
               columnType='STRING',
               enumValues=['foo', 'bar', 'bat']))
    cols.append(Column(name='x', columnType='DOUBLE'))
    cols.append(Column(name='n', columnType='INTEGER'))
    cols.append(Column(name='is_bogus', columnType='BOOLEAN'))

    table1 = syn.store(Schema(name='Big Table', columns=cols, parent=project))

    print "Created table:", table1.id
    print "with columns:", table1.columnIds

    rows_per_append = 10

    for i in range(1000):
        rows = []
        for j in range(rows_per_append):
            foo = cols[1].enumValues[random.randint(0, 2)]
            rows.append(
                Row(('Robot ' + str(i * rows_per_append + j), foo,
                     random.random() * 200.0, random.randint(0, 100),
                     random.random() >= 0.5)))
        print "added %d rows" % rows_per_append
        rowset1 = syn.store(RowSet(columns=cols, schema=table1, rows=rows))

    results = syn.tableQuery("select * from %s" % table1.id)
    print "etag:", results.etag
    print "tableId:", results.tableId

    for row in results:
        print(row)

    results = syn.tableQuery(
        "select n, COUNT(n), MIN(x), AVG(x), MAX(x), SUM(x) from %s group by n"
        % table1.id)
    df = results.asDataFrame()

    print(df.shape)
    print(df)
Example #12
def test_table_query(test_state):
    """Test command line ability to do table query."""

    cols = [
        Column(name='name', columnType='STRING', maximumSize=1000),
        Column(name='foo',
               columnType='STRING',
               enumValues=['foo', 'bar', 'bat']),
        Column(name='x', columnType='DOUBLE'),
        Column(name='age', columnType='INTEGER'),
        Column(name='cartoon', columnType='BOOLEAN')
    ]

    project_entity = test_state.project

    schema1 = test_state.syn.store(
        Schema(name=str(uuid.uuid4()), columns=cols, parent=project_entity))
    test_state.schedule_for_cleanup(schema1.id)

    data1 = [['Chris', 'bar', 11.23, 45, False],
             ['Jen', 'bat', 14.56, 40, False],
             ['Jane', 'bat', 17.89, 6, False],
             ['Henry', 'bar', 10.12, 1, False]]

    test_state.syn.store(RowSet(schema=schema1, rows=[Row(r) for r in data1]))

    # Test query
    output = run(test_state, 'synapse', '--skip-checks', 'query',
                 'select * from %s' % schema1.id)

    output_rows = output.rstrip("\n").split("\n")

    # Check the length of the output
    assert len(output_rows) == 5, "got %s rows" % (len(output_rows), )

    # Check that headers are correct.
    # Should be column names in schema plus the ROW_ID and ROW_VERSION
    my_headers_set = output_rows[0].split("\t")
    expected_headers_set = ["ROW_ID", "ROW_VERSION"] + list(
        map(lambda x: x.name, cols))
    assert my_headers_set == expected_headers_set, "%r != %r" % (
        my_headers_set, expected_headers_set)
Example #13
def test_rowset_tables(syn, project):
    cols = [
        Column(name='name', columnType='STRING', maximumSize=1000),
        Column(name='foo',
               columnType='STRING',
               enumValues=['foo', 'bar', 'bat']),
        Column(name='x', columnType='DOUBLE'),
        Column(name='age', columnType='INTEGER'),
        Column(name='cartoon', columnType='BOOLEAN'),
        Column(name='description', columnType='LARGETEXT')
    ]

    schema1 = syn.store(Schema(name='Foo Table', columns=cols, parent=project))

    data1 = [['Chris', 'bar', 11.23, 45, False, 'a'],
             ['Jen', 'bat', 14.56, 40, False, 'b'],
             ['Jane', 'bat', 17.89, 6, False, 'c' * 1002],
             ['Henry', 'bar', 10.12, 1, False, 'd']]
    row_reference_set1 = syn.store(
        RowSet(schema=schema1, rows=[Row(r) for r in data1]))
    assert len(row_reference_set1['rows']) == 4
Example #14
def update_global_scores_table(global_data):
    import challenge_config as config
    from synapseclient import Schema, Column, Table, Row, RowSet, as_table_columns
    # 'principalId', 'name', 'score_lb', 'score_mean', 'score_ub', 'rank'
    cols = [
        Column(name='UserID', columnType='STRING', maximumSize=100),
        Column(name='Name', columnType='STRING', maximumSize=100),
        Column(name='score_lb', columnType='DOUBLE'),
        Column(name='score_mean', columnType='DOUBLE'),
        Column(name='score_ub', columnType='DOUBLE'),
        Column(name='rank', columnType='DOUBLE'),
    ]
    schema = Schema(name='Global Scores',
                    columns=cols,
                    parent=config.CHALLENGE_SYN_ID)

    results = syn.tableQuery("select * from {}".format('syn7237020'))
    if len(results) > 0:
        a = syn.delete(results.asRowSet())
    table = syn.store(Table(schema, global_data))
    results = syn.tableQuery("select * from {}".format(table.tableId))
    for row in results:
        print row
    return
Example #15
def test_download_table_files():
    cols = [
        Column(name='artist', columnType='STRING', maximumSize=50),
        Column(name='album', columnType='STRING', maximumSize=50),
        Column(name='year', columnType='INTEGER'),
        Column(name='catalog', columnType='STRING', maximumSize=50),
        Column(name='cover', columnType='FILEHANDLEID')
    ]

    schema = syn.store(Schema(name='Jazz Albums', columns=cols,
                              parent=project))
    schedule_for_cleanup(schema)

    data = [
        ["John Coltrane", "Blue Train", 1957, "BLP 1577", "coltraneBlueTrain.jpg"],
        ["Sonny Rollins", "Vol. 2", 1957, "BLP 1558", "rollinsBN1558.jpg"],
        ["Sonny Rollins", "Newk's Time", 1958, "BLP 4001", "rollinsBN4001.jpg"],
        ["Kenny Burrel", "Kenny Burrel", 1956, "BLP 1543", "burrellWarholBN1543.jpg"]
    ]

    ## upload files and store file handle ids
    original_files = []
    for row in data:
        path = utils.make_bogus_data_file()
        original_files.append(path)
        schedule_for_cleanup(path)
        file_handle = syn.uploadFileHandle(path, project)
        row[4] = file_handle['id']

    row_reference_set = syn.store(
        RowSet(schema=schema, rows=[Row(r) for r in data]))

    ## retrieve the files for each row and verify that they are identical to the originals
    results = syn.tableQuery(
        "select artist, album, 'year', 'catalog', cover from %s" % schema.id,
        resultsAs="rowset")
    for i, row in enumerate(results):
        path = syn.downloadTableFile(results,
                                     rowId=row.rowId,
                                     versionNumber=row.versionNumber,
                                     column='cover')
        assert filecmp.cmp(original_files[i], path)
        schedule_for_cleanup(path)

    ## test that cached copies are returned for already downloaded files
    original_downloadFile_method = syn._downloadFileHandle
    with patch(
            "synapseclient.Synapse._downloadFileHandle") as _downloadFile_mock:
        _downloadFile_mock.side_effect = original_downloadFile_method

        results = syn.tableQuery(
            "select artist, album, 'year', 'catalog', cover from %s where artist = 'John Coltrane'"
            % schema.id,
            resultsAs="rowset")
        for i, row in enumerate(results):
            file_path = syn.downloadTableFile(results,
                                              rowId=row.rowId,
                                              versionNumber=row.versionNumber,
                                              column='cover')
            assert filecmp.cmp(original_files[i], file_path)

        assert not _downloadFile_mock.called, "Should have used cached copy of file and not called _downloadFile"

    ## test download table column
    results = syn.tableQuery('select * from %s' % schema.id)
    ## uncache 2 out of 4 files
    for i, row in enumerate(results):
        if i % 2 == 0:
            syn.cache.remove(row[6])
    file_map = syn.downloadTableColumns(results, ['cover'])
    assert len(file_map) == 4
    for i, row in enumerate(results):
        assert filecmp.cmp(original_files[i], file_map[row[6]])
Example #16
def test_tables_csv():

    ## Define schema
    cols = []
    cols.append(Column(name='Name', columnType='STRING'))
    cols.append(Column(name='Born', columnType='INTEGER'))
    cols.append(Column(name='Hipness', columnType='DOUBLE'))
    cols.append(Column(name='Living', columnType='BOOLEAN'))

    schema = Schema(name='Jazz Guys', columns=cols, parent=project)

    data = [["John Coltrane", 1926, 8.65, False],
            ["Miles Davis", 1926, 9.87, False],
            ["Bill Evans", 1929, 7.65, False],
            ["Paul Chambers", 1935, 5.14, False],
            ["Jimmy Cobb", 1929, 5.78, True],
            ["Scott LaFaro", 1936, 4.21, False],
            ["Sonny Rollins", 1930, 8.99, True],
            ["Kenny Burrel", 1931, 4.37, True]]

    ## the following creates a CSV file and uploads it to create a new table
    table = syn.store(Table(schema, data))

    ## Query and download an identical CSV
    results = syn.tableQuery("select * from %s" % table.schema.id,
                             resultsAs="csv",
                             includeRowIdAndRowVersion=False)

    ## Test that CSV file came back as expected
    for expected_row, row in zip(data, results):
        assert expected_row == row, "expected %s but got %s" % (expected_row,
                                                                row)

    try:
        ## check if we have pandas
        import pandas as pd

        df = results.asDataFrame()
        assert all(df.columns.values == ['Name', 'Born', 'Hipness', 'Living'])
        assert list(df.iloc[1, [0, 1, 3]]) == ['Miles Davis', 1926, False]
        assert df.iloc[1, 2] - 9.87 < 0.0001
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping test of .asDataFrame for CSV tables.\n\n'
        )

    ## Aggregate query
    expected = {True: [True, 1929, 3, 6.38], False: [False, 1926, 5, 7.104]}

    results = syn.tableQuery(
        'select Living, min(Born), count(Living), avg(Hipness) from %s group by Living'
        % table.schema.id,
        resultsAs="csv",
        includeRowIdAndRowVersion=False)
    for row in results:
        living = row[0]
        assert expected[living][1] == row[1]
        assert expected[living][2] == row[2]
        assert abs(expected[living][3] - row[3]) < 0.0001

    ## Aggregate query results to DataFrame
    try:
        ## check if we have pandas
        import pandas as pd

        df = results.asDataFrame()
        assert all(expected[df.iloc[0, 0]][0:3] == df.iloc[0, 0:3])
        assert abs(expected[df.iloc[1, 0]][3] - df.iloc[1, 3]) < 0.0001
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping test of .asDataFrame for aggregate queries as CSV tables.\n\n'
        )

    ## Append rows
    more_jazz_guys = [["Sonny Clark", 1931, 8.43, False],
                      ["Hank Mobley", 1930, 5.67, False],
                      ["Freddie Hubbard", 1938, float('nan'), False],
                      ["Thelonious Monk", 1917, float('inf'), False]]
    table = syn.store(Table(table.schema, more_jazz_guys))

    ## test that CSV file now has more jazz guys
    results = syn.tableQuery("select * from %s" % table.schema.id,
                             resultsAs="csv")
    for expected_row, row in zip(data + more_jazz_guys, results):
        for field, expected_field in zip(row[2:], expected_row):
            if type(field) is float and math.isnan(field):
                assert type(expected_field) is float and math.isnan(
                    expected_field)
            elif type(expected_field) is float and math.isnan(expected_field):
                assert type(field) is float and math.isnan(field)
            else:
                assert expected_field == field

    ## Update as a RowSet
    rowset = results.asRowSet()
    for row in rowset['rows']:
        if row['values'][1] == 1930:
            row['values'][2] = 8.5
    row_reference_set = syn.store(rowset)

    ## aggregate queries won't return row id and version, so we need to
    ## handle this correctly
    results = syn.tableQuery(
        'select Born, COUNT(*) from %s group by Born order by Born' %
        table.schema.id,
        resultsAs="csv")
    assert results.includeRowIdAndRowVersion is False
    for i, row in enumerate(results):
        assert row[0] == [1917, 1926, 1929, 1930, 1931, 1935, 1936, 1938][i]
        assert row[1] == [1, 2, 2, 2, 2, 1, 1, 1][i]

    try:
        import pandas as pd
        results = syn.tableQuery("select * from %s where Born=1930" %
                                 table.schema.id,
                                 resultsAs="csv")
        df = results.asDataFrame()
        all(df['Born'].values == 1930)
        all(df['Hipness'].values == 8.5)

        ## Update via a Data Frame
        df['Hipness'] = 9.75
        table = syn.store(Table(table.tableId, df, etag=results.etag))

        results = syn.tableQuery("select * from %s where Born=1930" %
                                 table.tableId,
                                 resultsAs="csv")
        for row in results:
            assert row[4] == 9.75
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_tables_csv.\n\n'
        )

    ## check what happens when query result is empty
    results = syn.tableQuery('select * from %s where Born=2013' %
                             table.tableId,
                             resultsAs="csv")
    assert len(list(results)) == 0

    try:
        import pandas as pd
        results = syn.tableQuery('select * from %s where Born=2013' %
                                 table.tableId,
                                 resultsAs="csv")
        df = results.asDataFrame()
        assert df.shape[0] == 0
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_tables_csv.\n\n'
        )

    ## delete some rows
    results = syn.tableQuery('select * from %s where Hipness < 7' %
                             table.tableId,
                             resultsAs="csv")
    syn.delete(results)
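Note that Table() also accepts a CSV file path for its values argument, so the big-CSV flow in Example #10 could avoid the private _uploadCsv call (a sketch, assuming the schema1 and filename from that example):

table = syn.store(Table(schema1, filename))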
Example #17
def test_rowset_tables():
    cols = []
    cols.append(Column(name='name', columnType='STRING', maximumSize=1000))
    cols.append(
        Column(name='foo',
               columnType='STRING',
               enumValues=['foo', 'bar', 'bat']))
    cols.append(Column(name='x', columnType='DOUBLE'))
    cols.append(Column(name='age', columnType='INTEGER'))
    cols.append(Column(name='cartoon', columnType='BOOLEAN'))
    cols.append(Column(name='description', columnType='LARGETEXT'))

    schema1 = syn.store(Schema(name='Foo Table', columns=cols, parent=project))

    ## Get columns associated with the given table
    retrieved_cols = list(syn.getTableColumns(schema1))

    ## Test that the columns we get are the same as the ones we stored
    assert len(retrieved_cols) == len(cols)
    for retrieved_col, col in zip(retrieved_cols, cols):
        assert retrieved_col.name == col.name
        assert retrieved_col.columnType == col.columnType

    data1 = [['Chris', 'bar', 11.23, 45, False, 'a'],
             ['Jen', 'bat', 14.56, 40, False, 'b'],
             ['Jane', 'bat', 17.89, 6, False, 'c' * 1002],
             ['Henry', 'bar', 10.12, 1, False, 'd']]
    row_reference_set1 = syn.store(
        RowSet(schema=schema1, rows=[Row(r) for r in data1]))
    assert len(row_reference_set1['rows']) == 4

    ## add more new rows
    data2 = [['Fred', 'bat', 21.45, 20, True, 'e'],
             ['Daphne', 'foo', 27.89, 20, True, 'f'],
             ['Shaggy', 'foo', 23.45, 20, True, 'g'],
             ['Velma', 'bar', 25.67, 20, True, 'h']]
    syn.store(RowSet(schema=schema1, rows=[Row(r) for r in data2]))

    results = syn.tableQuery("select * from %s order by name" % schema1.id,
                             resultsAs="rowset")

    assert results.count == 8
    assert results.tableId == schema1.id

    ## test that the values made the round trip
    expected = sorted(data1 + data2)
    for expected_values, row in zip(expected, results):
        assert expected_values == row['values'], 'got %s but expected %s' % (
            row['values'], expected_values)

    ## To modify rows, we have to select them first.
    result2 = syn.tableQuery('select * from %s where age>18 and age<30' %
                             schema1.id,
                             resultsAs="rowset")

    ## make a change
    rs = result2.asRowSet()
    for row in rs['rows']:
        row['values'][2] = 88.888

    ## store it
    row_reference_set = syn.store(rs)

    ## check if the change sticks
    result3 = syn.tableQuery('select name, x, age from %s' % schema1.id,
                             resultsAs="rowset")
    for row in result3:
        if int(row['values'][2]) == 20:
            assert row['values'][1] == 88.888

    ## Add a column
    bday_column = syn.store(Column(name='birthday', columnType='DATE'))

    column = syn.getColumn(bday_column.id)
    assert column.name == "birthday"
    assert column.columnType == "DATE"

    schema1.addColumn(bday_column)
    schema1 = syn.store(schema1)

    results = syn.tableQuery(
        'select * from %s where cartoon=false order by age' % schema1.id,
        resultsAs="rowset")
    rs = results.asRowSet()

    ## put data in new column
    bdays = ('2013-3-15', '2008-1-3', '1973-12-8', '1969-4-28')
    for bday, row in zip(bdays, rs.rows):
        row['values'][6] = bday
    row_reference_set = syn.store(rs)

    ## query by date and check that we get back two kids
    date_2008_jan_1 = utils.to_unix_epoch_time(datetime(2008, 1, 1))
    results = syn.tableQuery(
        'select name from %s where birthday > %d order by birthday' %
        (schema1.id, date_2008_jan_1),
        resultsAs="rowset")
    assert ["Jane", "Henry"] == [row['values'][0] for row in results]

    try:
        import pandas as pd
        df = results.asDataFrame()
        assert all(df.ix[:, "name"] == ["Jane", "Henry"])
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_rowset_tables.\n\n'
        )

    results = syn.tableQuery(
        'select birthday from %s where cartoon=false order by age' %
        schema1.id,
        resultsAs="rowset")
    for bday, row in zip(bdays, results):
        assert row['values'][0] == datetime.strptime(bday, "%Y-%m-%d"), \
            "got %s but expected %s" % (row['values'][0], bday)

    try:
        import pandas as pd
        results = syn.tableQuery(
            "select foo, MAX(x), COUNT(foo), MIN(age) from %s group by foo order by foo"
            % schema1.id,
            resultsAs="rowset")
        df = results.asDataFrame()
        assert df.shape == (3, 4)
        assert all(df.iloc[:, 0] == ["bar", "bat", "foo"])
        assert all(df.iloc[:, 1] == [88.888, 88.888, 88.888])
        assert all(df.iloc[:, 2] == [3, 3, 2])
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_rowset_tables.\n\n'
        )

    ## test delete rows by deleting cartoon characters
    syn.delete(
        syn.tableQuery('select name from %s where cartoon = true' % schema1.id,
                       resultsAs="rowset"))

    results = syn.tableQuery('select name from %s order by birthday' %
                             schema1.id,
                             resultsAs="rowset")
    assert ["Chris", "Jen", "Jane",
            "Henry"] == [row['values'][0] for row in results]

    ## check what happens when query result is empty
    results = syn.tableQuery('select * from %s where age > 1000' % schema1.id,
                             resultsAs="rowset")
    assert len(list(results)) == 0

    try:
        import pandas as pd
        results = syn.tableQuery('select * from %s where age > 1000' %
                                 schema1.id,
                                 resultsAs="rowset")
        df = results.asDataFrame()
        assert df.shape[0] == 0
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_rowset_tables.\n\n'
        )
Example #18
def test_copy():
    """Tests the copy function"""
    # Create a Project
    project_entity = syn.store(Project(name=str(uuid.uuid4())))
    schedule_for_cleanup(project_entity.id)
    # Create two Folders in Project
    folder_entity = syn.store(
        Folder(name=str(uuid.uuid4()), parent=project_entity))
    second_folder = syn.store(
        Folder(name=str(uuid.uuid4()), parent=project_entity))
    third_folder = syn.store(
        Folder(name=str(uuid.uuid4()), parent=project_entity))
    schedule_for_cleanup(folder_entity.id)
    schedule_for_cleanup(second_folder.id)
    schedule_for_cleanup(third_folder.id)

    # Annotations and provenance
    repo_url = 'https://github.com/Sage-Bionetworks/synapsePythonClient'
    annos = {'test': ['hello_world']}
    prov = Activity(name="test", used=repo_url)
    # Create, upload, and set annotations/provenance on a file in Folder
    filename = utils.make_bogus_data_file()
    schedule_for_cleanup(filename)
    file_entity = syn.store(File(filename, parent=folder_entity))
    externalURL_entity = syn.store(
        File(repo_url, name='rand', parent=folder_entity, synapseStore=False))
    syn.setAnnotations(file_entity, annos)
    syn.setAnnotations(externalURL_entity, annos)
    syn.setProvenance(externalURL_entity.id, prov)
    schedule_for_cleanup(file_entity.id)
    schedule_for_cleanup(externalURL_entity.id)
    # ------------------------------------
    # TEST COPY FILE
    # ------------------------------------
    output = synapseutils.copy(syn,
                               file_entity.id,
                               destinationId=project_entity.id)
    output_URL = synapseutils.copy(syn,
                                   externalURL_entity.id,
                                   destinationId=project_entity.id,
                                   skipCopyAnnotations=True)

    # Verify that our copied files are identical
    copied_ent = syn.get(output[file_entity.id])
    copied_URL_ent = syn.get(output_URL[externalURL_entity.id],
                             downloadFile=False)

    copied_ent_annot = syn.getAnnotations(copied_ent)
    copied_url_annot = syn.getAnnotations(copied_URL_ent)
    copied_prov = syn.getProvenance(copied_ent)
    copied_url_prov = syn.getProvenance(copied_URL_ent)
    schedule_for_cleanup(copied_ent.id)
    schedule_for_cleanup(copied_URL_ent.id)

    # TEST: setProvenance = traceback (the default)
    assert_equals(copied_prov['used'][0]['reference']['targetId'],
                  file_entity.id)
    assert_equals(copied_url_prov['used'][0]['reference']['targetId'],
                  externalURL_entity.id)

    # TEST: Make sure copied files are the same
    assert_equals(copied_ent_annot, annos)
    assert_equals(copied_ent.dataFileHandleId, file_entity.dataFileHandleId)

    # TEST: Make sure copied URLs are the same
    assert_equals(copied_url_annot, {})
    assert_equals(copied_URL_ent.externalURL, repo_url)
    assert_equals(copied_URL_ent.name, 'rand')
    assert_equals(copied_URL_ent.dataFileHandleId,
                  externalURL_entity.dataFileHandleId)

    # TEST: Throw error if file is copied to a folder/project that has a file with the same filename
    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  project_entity.id,
                  destinationId=project_entity.id)
    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  file_entity.id,
                  destinationId=project_entity.id)
    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  file_entity.id,
                  destinationId=third_folder.id,
                  setProvenance="gib")
    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  file_entity.id,
                  destinationId=file_entity.id)

    # Test: setProvenance = None
    output = synapseutils.copy(syn,
                               file_entity.id,
                               destinationId=second_folder.id,
                               setProvenance=None)
    assert_raises(SynapseHTTPError, syn.getProvenance, output[file_entity.id])
    schedule_for_cleanup(output[file_entity.id])

    # Test: setProvenance = Existing
    output_URL = synapseutils.copy(syn,
                                   externalURL_entity.id,
                                   destinationId=second_folder.id,
                                   setProvenance="existing")
    output_prov = syn.getProvenance(output_URL[externalURL_entity.id])
    schedule_for_cleanup(output_URL[externalURL_entity.id])
    assert_equals(output_prov['name'], prov['name'])
    assert_equals(output_prov['used'], prov['used'])

    # ------------------------------------
    # TEST COPY LINKS
    # ------------------------------------
    second_file = utils.make_bogus_data_file()
    # schedule_for_cleanup(filename)
    second_file_entity = syn.store(File(second_file, parent=project_entity))
    link_entity = Link(second_file_entity.id, parent=folder_entity.id)
    link_entity = syn.store(link_entity)

    copied_link = synapseutils.copy(syn,
                                    link_entity.id,
                                    destinationId=second_folder.id)
    old = syn.get(link_entity.id, followLink=False)
    new = syn.get(copied_link[link_entity.id], followLink=False)
    assert_equals(old.linksTo['targetId'], new.linksTo['targetId'])

    schedule_for_cleanup(second_file_entity.id)
    schedule_for_cleanup(link_entity.id)
    schedule_for_cleanup(copied_link[link_entity.id])

    time.sleep(3)

    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  link_entity.id,
                  destinationId=second_folder.id)

    # ------------------------------------
    # TEST COPY TABLE
    # ------------------------------------
    second_project = syn.store(Project(name=str(uuid.uuid4())))
    schedule_for_cleanup(second_project.id)
    cols = [
        Column(name='n', columnType='DOUBLE', maximumSize=50),
        Column(name='c', columnType='STRING', maximumSize=50),
        Column(name='i', columnType='INTEGER')
    ]
    data = [[2.1, 'foo', 10], [2.2, 'bar', 20], [2.3, 'baz', 30]]

    schema = syn.store(
        Schema(name='Testing', columns=cols, parent=project_entity.id))
    syn.store(RowSet(schema=schema, rows=[Row(r) for r in data]))

    table_map = synapseutils.copy(syn,
                                  schema.id,
                                  destinationId=second_project.id)
    copied_table = syn.tableQuery('select * from %s' % table_map[schema.id])
    rows = copied_table.asRowSet()['rows']
    # TEST: Check if all values are the same
    for i, row in enumerate(rows):
        assert_equals(row['values'], data[i])

    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  schema.id,
                  destinationId=second_project.id)

    schedule_for_cleanup(schema.id)
    schedule_for_cleanup(table_map[schema.id])

    # ------------------------------------
    # TEST COPY FOLDER
    # ------------------------------------
    mapping = synapseutils.copy(syn,
                                folder_entity.id,
                                destinationId=second_project.id)
    for i in mapping:
        old = syn.get(i, downloadFile=False)
        new = syn.get(mapping[i], downloadFile=False)
        assert_equals(old.name, new.name)
        assert_equals(old.annotations, new.annotations)
        assert_equals(old.concreteType, new.concreteType)

    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  folder_entity.id,
                  destinationId=second_project.id)
    # TEST: Throw error if excludeTypes contains anything other than "file", "link", or "table", or isn't a list
    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  second_folder.id,
                  destinationId=second_project.id,
                  excludeTypes=["foo"])
    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  second_folder.id,
                  destinationId=second_project.id,
                  excludeTypes="file")
    # TEST: excludeType = ["file"], only the folder is created
    second = synapseutils.copy(syn,
                               second_folder.id,
                               destinationId=second_project.id,
                               excludeTypes=["file", "table", "link"])

    copied_folder = syn.get(second[second_folder.id])
    assert_equals(copied_folder.name, second_folder.name)
    assert_equals(len(second), 1)
    # TEST: Make sure error is thrown if folder name already exists

    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  second_folder.id,
                  destinationId=second_project.id)

    # ------------------------------------
    # TEST COPY PROJECT
    # ------------------------------------
    third_project = syn.store(Project(name=str(uuid.uuid4())))
    schedule_for_cleanup(third_project.id)

    mapping = synapseutils.copy(syn,
                                project_entity.id,
                                destinationId=third_project.id)
    for i in mapping:
        old = syn.get(i, downloadFile=False)
        new = syn.get(mapping[i], downloadFile=False)
        if not isinstance(old, Project):
            assert_equals(old.name, new.name)
        assert_equals(old.annotations, new.annotations)
        assert_equals(old.concreteType, new.concreteType)

    # TEST: Can't copy project to a folder
    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  project_entity.id,
                  destinationId=second_folder.id)
Example #19
def concatenate_tables_to_synapse_table(frames,
                                        synapse_project_id,
                                        table_name,
                                        username='',
                                        password=''):
    """
    Concatenate multiple dataframes and store as a Synapse table.

    Reuse the indices from the original DataFrame,
    increasing number of columns.

    Parameters
    ----------
    frames : list of pandas DataFrames
        dataframes to concatenate and store in Synapse
    synapse_project_id : string
        Synapse ID for project to which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    table_data : Pandas DataFrame
        output table
    synapse_project_id : string
        Synapse ID for project

    Examples
    --------
    >>> import pandas as pd
    >>> from mhealthx.io_data import concatenate_tables_to_synapse_table
    >>> df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
    >>>                     'B': ['B0', 'B1', 'B2', 'B3'],
    >>>                     'C': ['C0', 'C1', 'C2', 'C3'],
    >>>                     'D': ['D0', 'D1', 'D2', 'D3']},
    >>>                    index=[0, 1, 2, 3])
    >>> df2 = pd.DataFrame({'E': ['A4', 'A5', 'A6', 'A7'],
    >>>                     'F': ['B4', 'B5', 'B6', 'B7'],
    >>>                     'G': ['C4', 'C5', 'C6', 'C7'],
    >>>                     'H': ['D4', 'D5', 'D6', 'D7']},
    >>>                     index=[0, 1, 2, 3])
    >>> frames = [df1, df2]
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Test to join tables'
    >>> username = ''
    >>> password = ''
    >>> table_data, synapse_project_id = concatenate_tables_to_synapse_table(frames, synapse_project_id, table_name, username, password)
    """
    import pandas as pd
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Concatenate dataframes: reuse the indices from the original DataFrame,
    # increasing number of columns:
    table_data = pd.concat(frames, axis=1)  #, join_axes=[frames[0].index])

    # Create table schema:
    schema = Schema(name=table_name,
                    columns=as_table_columns(table_data),
                    parent=synapse_project_id)

    # Store as Synapse table:
    table = syn.store(Table(schema, table_data))

    return table_data, synapse_project_id
Example #20
def files_to_synapse_table(in_files,
                           synapse_project_id,
                           table_name,
                           column_name='fileID',
                           username='',
                           password=''):
    """
    Upload files and file handle IDs to Synapse.

    Parameters
    ----------
    in_files : list of strings
        paths to files to upload to Synapse
    synapse_project_id : string
        Synapse ID for project to which table is to be written
    table_name : string
        schema name of table
    column_name : string
        header for column of fileIDs
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    synapse_project_id : string
        Synapse ID for project

    Examples
    --------
    >>> from mhealthx.io_data import files_to_synapse_table
    >>> in_files = ['/Users/arno/Local/wav/test1.wav']
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Test to store files and file handle IDs'
    >>> column_name = 'fileID1'
    >>> username = ''
    >>> password = ''
    >>> table_data, synapse_project_id = files_to_synapse_table(in_files, synapse_project_id, table_name, column_name, username, password)
    >>> #column_name = 'fileID2'
    >>> #in_files = ['/Users/arno/Local/wav/test2.wav']
    >>> #table_data, synapse_project_id = files_to_synapse_table(in_files, synapse_project_id, table_name, column_name, username, password)

    """
    import synapseclient
    from synapseclient import Schema
    from synapseclient.table import Column, RowSet, Row

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Store file handle IDs:
    file_handles = []
    for in_file in in_files:
        file_handle = syn._chunkedUploadFile(in_file)
        file_handles.append([file_handle['id']])

    # New column headers:
    new_column_header = Column(name=column_name, columnType='FILEHANDLEID')

    # See if Synapse table exists:
    # tex = list(syn.chunkedQuery("select id from Table where parentId=='{0}'"
    #                             " and name=='{1}'".format(synapse_project_id,
    #                                                       table_name)))
    # If Synapse table does not exist, create table schema:
    # if not tex:

    # Create table schema:
    schema = syn.store(
        Schema(name=table_name,
               columns=[new_column_header],
               parent=synapse_project_id))

    # Upload files and file handle IDs with new schema:
    syn.store(
        RowSet(columns=[new_column_header],
               schema=schema,
               rows=[Row(r) for r in file_handles]))

    return synapse_project_id
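The _chunkedUploadFile call above is an older private API; newer clients expose syn.uploadFileHandle, as used in Example #15. A sketch of the same loop with the public call (same assumptions as the function above):

file_handles = []
for in_file in in_files:
    file_handle = syn.uploadFileHandle(in_file, synapse_project_id)
    file_handles.append([file_handle['id']])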
Example #21
def copy_synapse_table(synapse_table_id,
                       synapse_project_id,
                       table_name='',
                       remove_columns=[],
                       username='',
                       password=''):
    """
    Copy Synapse table to another Synapse project.

    Parameters
    ----------
    synapse_table_id : string
        Synapse ID for table to copy
    synapse_project_id : string
        copy table to project with this Synapse ID
    table_name : string
        schema name of table
    remove_columns : list of strings
        column headers for columns to be removed
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    table_data : Pandas DataFrame
        Synapse table contents
    table_name : string
        schema name of table
    synapse_project_id : string
        Synapse ID for project within which table is to be written

    Examples
    --------
    >>> from mhealthx.io_data import copy_synapse_table
    >>> synapse_table_id = 'syn4590865'
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Copy of ' + synapse_table_id
    >>> remove_columns = ['audio_audio.m4a', 'audio_countdown.m4a']
    >>> username = ''
    >>> password = ''
    >>> table_data, table_name, synapse_project_id = copy_synapse_table(synapse_table_id, synapse_project_id, table_name, remove_columns, username, password)

    """
    import synapseclient
    from synapseclient import Schema
    from synapseclient.table import Table, as_table_columns

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Download Synapse table as a dataframe:
    results = syn.tableQuery("select * from {0}".format(synapse_table_id))
    table_data = results.asDataFrame()

    # Remove specified columns:
    if remove_columns:
        for remove_column in remove_columns:
            del table_data[remove_column]

    # Upload to Synapse table:
    table_data.index = range(table_data.shape[0])
    schema = Schema(name=table_name,
                    columns=as_table_columns(table_data),
                    parent=synapse_project_id)
    table = syn.store(Table(schema, table_data))

    return table_data, table_name, synapse_project_id
Example #22
def opensmile_features_to_synapse(in_files, synapse_project_id,
                                  table_name, username, password):
    """
    Save openSMILE's SMILExtract audio features to a Synapse table.

    Parameters
    ----------
    in_files : list of strings
        full path to the input files
    synapse_project_id : string
        Synapse ID for project to which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    table_data : Pandas DataFrame
        output table
    table_name : string
        schema name of table
    synapse_table_id : string
        Synapse table ID

    Examples
    --------
    >>> from mhealthx.features import opensmile_features_to_synapse
    >>> in_files = ['/home/arno/smile/test1.wav.csv','/home/arno/smile/test2.wav.csv','/home/arno/smile/test3.wav.csv']
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Phonation openSMILE feature table'
    >>> username = ''
    >>> password = ''
    >>> table_data, table_name, synapse_table_id = opensmile_features_to_synapse(in_files, synapse_project_id, table_name, username, password)

    """
    import pandas as pd
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    from mhealthx.io_data import concatenate_tables_to_synapse_table as cat

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Read each input file into a dataframe and collect them:
    frames = [pd.read_csv(in_file) for in_file in in_files]

    table_data, project_id = cat(frames, synapse_project_id, table_name,
                                 username, password)

    # Create table schema:
    schema = Schema(name=table_name, columns=as_table_columns(table_data),
                    parent=synapse_project_id)

    # Store as Synapse table:
    table = syn.store(Table(schema, table_data))
    synapse_table_id = str(table.tableId)

    return table_data, table_name, synapse_table_id
Example #23
def createAMPADTable(keyFile, clinicalFile):
    """
    Create the AMP AD table with merged data from keyFile with clinicalFile.
    If any of the supplementary files exist for a particular dataset, change
    the binary classifiers to the synapse ID holding the data and reset 0
    to null for the table.

    Input:
        keyFile: Dataframe with the keys and information regarding what
            exists for each patient
        clinicalFile: Dataframe with clinical data for various patients

    """

    toUpload = []

    clinicalHeader = clinicalFile.columns.values

    #seenList = []
    # Iterate through each project within keyFile
    for i, row in keyFile.iterrows():
        # Create empty list for new row to be added to synapse table
        newRow = []

        # Ignore binary variables, which all end in '_data'
        for item in row.iteritems():
            if (item[0] == 'niagas_data'):
                if (not pd.isnull(row.niagas_data)):
                    newRow.append(arrayExpressionSynID)
                else:
                    newRow.append(float('nan'))

            elif (not item[0].endswith('_data')):
                newRow.append(item[1])

        # Check if row has clinical data
        if (row.clinical_data):
            # Create reference to clinicalFile project ID
            clinicalKeyList = clinicalFile['projid']

            # get the index of the projID in the clinical file
            index = clinicalKeyList[clinicalKeyList ==
                                    row.projid].index.tolist()

            if (len(index) == 1):
                index = index[0]
                #seenList.append(row.projid)
                for entry in clinicalFile.iloc[index][1:]:
                    newRow.append(entry)

            # If the length of the index list is 0, it means the key file
            # thinks there is clinical information for this patient but it
            # does not exist in the clinical file
            elif (len(index) == 0):
                print("Key file indicates that projID %s should have "\
                    "clinical information, but it does not exist in "\
                    "the clinical information file" % row.projid)
                for _ in range(1, len(clinicalHeader)):
                    newRow.append(float('nan'))

            # If the length of the index list is greater than 1, the projID
            # appears more than once in the file. Send warning to user
            else:
                print("projID %s appears more than once in clinical file at "\
                    "positions %s" % (row.projid, index))
                for _ in range(1, len(clinicalHeader)):
                    newRow.append(float('nan'))

        else:
            for _ in range(1, len(clinicalHeader)):
                newRow.append(float('nan'))

        # Check if row has gwas data
        if (row.gwas_data):
            newRow.append(genotypeSynID)
            newRow.append(imputedGenotypeSynID)
        else:
            newRow.append(float('nan'))
            newRow.append(float('nan'))

        if (row.mwas_data):
            newRow.append(methylationSynID)
        else:
            newRow.append(float('nan'))

        if (row.mirna_data):
            newRow.append(mirnaSynID)
        else:
            newRow.append(float('nan'))

        if (row.mrna_data):
            newRow.append(rnaseqSynID)
        else:
            newRow.append(float('nan'))

        toUpload.append(newRow)

    df = pd.DataFrame(toUpload)
    columns = list(keyFile.columns.values)
    index = columns.index('clinical_data') - 1
    columns.remove('clinical_data')

    columns.remove('gwas_data')
    # Insert the genotype columns at the clinical offset first; the clinical
    # headers inserted below at the same offset shift them to just after the
    # clinical columns.
    columns.insert(index + 1, 'genotype data')
    columns.insert(index + 2, 'imputed genotype data')

    for i in range(1, len(clinicalHeader)):
        columns.insert(index + i, clinicalHeader[i])

    df.columns = columns

    df.to_csv('mergedTables.csv', encoding='utf-8', index=False)

    print("Uploading to Synapse")
    schema = Schema(name='AMP AD Samples Table',
                    columns=as_table_columns(df),
                    parent='syn2580853')
    syn.store(Table(schema, df))
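A minimal usage sketch for createAMPADTable, assuming the key and clinical data live in local CSV files (the file names here are illustrative, not from the source):

import pandas as pd

# Hypothetical input files; the real files must carry a 'projid' column
# plus the *_data indicator columns that createAMPADTable inspects.
keyFile = pd.read_csv('ampad_key.csv')
clinicalFile = pd.read_csv('ampad_clinical.csv')

# Writes mergedTables.csv locally and stores the merged table in Synapse
createAMPADTable(keyFile, clinicalFile)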
Example #24
0
def test_copy():
    """Tests the copy function"""
    # Create a Project
    project_entity = syn.store(Project(name=str(uuid.uuid4())))
    schedule_for_cleanup(project_entity.id)
    acl = syn.setPermissions(
        project_entity,
        other_user['principalId'],
        accessType=['READ', 'CREATE', 'UPDATE', 'DOWNLOAD'])
    # Create two Folders in Project
    folder_entity = syn.store(
        Folder(name=str(uuid.uuid4()), parent=project_entity))
    second_folder = syn.store(
        Folder(name=str(uuid.uuid4()), parent=project_entity))
    third_folder = syn.store(
        Folder(name=str(uuid.uuid4()), parent=project_entity))
    schedule_for_cleanup(folder_entity.id)
    schedule_for_cleanup(second_folder.id)
    schedule_for_cleanup(third_folder.id)

    # Annotations and provenance
    repo_url = 'https://github.com/Sage-Bionetworks/synapsePythonClient'
    annots = {'test': ['hello_world']}
    prov = Activity(name="test", used=repo_url)
    # Create, upload, and set annotations/provenance on a file in Folder
    filename = utils.make_bogus_data_file()
    schedule_for_cleanup(filename)
    file_entity = syn.store(File(filename, parent=folder_entity))
    externalURL_entity = syn.store(
        File(repo_url, name='rand', parent=folder_entity, synapseStore=False))
    syn.setAnnotations(file_entity, annots)
    syn.setAnnotations(externalURL_entity, annots)
    syn.setProvenance(externalURL_entity.id, prov)
    schedule_for_cleanup(file_entity.id)
    schedule_for_cleanup(externalURL_entity.id)
    # ------------------------------------
    # TEST COPY FILE
    # ------------------------------------
    output = synapseutils.copy(syn,
                               file_entity.id,
                               destinationId=project_entity.id)
    output_URL = synapseutils.copy(syn,
                                   externalURL_entity.id,
                                   destinationId=project_entity.id,
                                   skipCopyAnnotations=True)

    # Verify that our copied files are identical
    copied_ent = syn.get(output[file_entity.id])
    copied_URL_ent = syn.get(output_URL[externalURL_entity.id],
                             downloadFile=False)

    copied_ent_annot = syn.getAnnotations(copied_ent)
    copied_url_annot = syn.getAnnotations(copied_URL_ent)
    copied_prov = syn.getProvenance(copied_ent)
    copied_url_prov = syn.getProvenance(copied_URL_ent)
    schedule_for_cleanup(copied_ent.id)
    schedule_for_cleanup(copied_URL_ent.id)

    # TEST: setProvenance = "traceback" (the default)
    print("Test: setProvenance = traceback (default)")
    assert copied_prov['used'][0]['reference']['targetId'] == file_entity.id
    assert copied_url_prov['used'][0]['reference'][
        'targetId'] == externalURL_entity.id

    # TEST: Make sure copied files are the same
    assert copied_ent_annot == annots
    assert copied_ent.dataFileHandleId == file_entity.dataFileHandleId

    # TEST: Make sure copied URLs are the same
    assert copied_url_annot == {}
    assert copied_URL_ent.externalURL == repo_url
    assert copied_URL_ent.name == 'rand'
    assert copied_URL_ent.dataFileHandleId == externalURL_entity.dataFileHandleId

    # TEST: Throw error if file is copied to a folder/project that has a file with the same filename
    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  project_entity.id,
                  destinationId=project_entity.id)
    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  file_entity.id,
                  destinationId=project_entity.id)
    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  file_entity.id,
                  destinationId=third_folder.id,
                  setProvenance="gib")
    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  file_entity.id,
                  destinationId=file_entity.id)

    print("Test: setProvenance = None")
    output = synapseutils.copy(syn,
                               file_entity.id,
                               destinationId=second_folder.id,
                               setProvenance=None)
    assert_raises(SynapseHTTPError, syn.getProvenance, output[file_entity.id])
    schedule_for_cleanup(output[file_entity.id])

    print("Test: setProvenance = Existing")
    output_URL = synapseutils.copy(syn,
                                   externalURL_entity.id,
                                   destinationId=second_folder.id,
                                   setProvenance="existing")
    output_prov = syn.getProvenance(output_URL[externalURL_entity.id])
    schedule_for_cleanup(output_URL[externalURL_entity.id])
    assert output_prov['name'] == prov['name']
    assert output_prov['used'] == prov['used']

    if 'username' not in other_user or 'password' not in other_user:
        sys.stderr.write(
            '\nWarning: no test-authentication configured. skipping testing copy function when trying to copy file made by another user.\n'
        )
        return

    try:
        print(
            "Test: Other user copy should result in different data file handle"
        )
        syn_other = synapseclient.Synapse(skip_checks=True)
        syn_other.login(other_user['username'], other_user['password'])

        output = synapseutils.copy(syn_other,
                                   file_entity.id,
                                   destinationId=third_folder.id)
        new_copied_ent = syn.get(output[file_entity.id])
        new_copied_ent_annot = syn.getAnnotations(new_copied_ent)
        schedule_for_cleanup(new_copied_ent.id)

        copied_URL_ent.externalURL = "https://www.google.com"
        copied_URL_ent = syn.store(copied_URL_ent)
        output = synapseutils.copy(syn_other,
                                   copied_URL_ent.id,
                                   destinationId=third_folder.id,
                                   version=1)
        new_copied_URL = syn.get(output[copied_URL_ent.id], downloadFile=False)
        schedule_for_cleanup(new_copied_URL.id)

        assert new_copied_ent_annot == annots
        assert new_copied_ent.dataFileHandleId != copied_ent.dataFileHandleId
        # Test that copying a specific version retrieves the correct file
        assert new_copied_URL.versionNumber == 1
        assert new_copied_URL.externalURL == repo_url
        assert new_copied_URL.dataFileHandleId != copied_URL_ent.dataFileHandleId
    finally:
        syn_other.logout()

    # ------------------------------------
    # TEST COPY LINKS
    # ------------------------------------
    print("Test: Copy Links")
    second_file = utils.make_bogus_data_file()
    second_file_entity = syn.store(File(second_file, parent=project_entity))
    link_entity = Link(second_file_entity.id, parent=folder_entity.id)
    link_entity = syn.store(link_entity)

    # The function under test relies on queries, which are eventually
    # consistent but not immediately after creating the entities
    start_time = time.time()
    while syn.query("select id from entity where id=='%s'" %
                    link_entity.id).get('totalNumberOfResults') <= 0:
        assert_less(time.time() - start_time, QUERY_TIMEOUT_SEC)
        time.sleep(2)

    copied_link = synapseutils.copy(syn,
                                    link_entity.id,
                                    destinationId=second_folder.id)
    old = syn.get(link_entity.id, followLink=False)
    new = syn.get(copied_link[link_entity.id], followLink=False)
    assert old.linksTo['targetId'] == new.linksTo['targetId']
    assert old.linksTo['targetVersionNumber'] == new.linksTo[
        'targetVersionNumber']

    schedule_for_cleanup(second_file_entity.id)
    schedule_for_cleanup(link_entity.id)
    schedule_for_cleanup(copied_link[link_entity.id])

    time.sleep(3)

    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  link_entity.id,
                  destinationId=second_folder.id)

    # ------------------------------------
    # TEST COPY TABLE
    # ------------------------------------
    second_project = syn.store(Project(name=str(uuid.uuid4())))
    schedule_for_cleanup(second_project.id)
    print("Test: Copy Tables")
    cols = [
        Column(name='n', columnType='DOUBLE', maximumSize=50),
        Column(name='c', columnType='STRING', maximumSize=50),
        Column(name='i', columnType='INTEGER')
    ]
    data = [[2.1, 'foo', 10], [2.2, 'bar', 20], [2.3, 'baz', 30]]

    schema = syn.store(
        Schema(name='Testing', columns=cols, parent=project_entity.id))
    row_reference_set = syn.store(
        RowSet(columns=cols, schema=schema, rows=[Row(r) for r in data]))

    table_map = synapseutils.copy(syn,
                                  schema.id,
                                  destinationId=second_project.id)
    copied_table = syn.tableQuery('select * from %s' % table_map[schema.id])
    rows = copied_table.asRowSet()['rows']
    # TEST: Check if all values are the same
    for i, row in enumerate(rows):
        assert row['values'] == data[i]

    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  schema.id,
                  destinationId=second_project.id)

    schedule_for_cleanup(schema.id)
    schedule_for_cleanup(table_map[schema.id])

    # ------------------------------------
    # TEST COPY FOLDER
    # ------------------------------------
    print("Test: Copy Folder")
    mapping = synapseutils.copy(syn,
                                folder_entity.id,
                                destinationId=second_project.id)
    for i in mapping:
        old = syn.get(i, downloadFile=False)
        new = syn.get(mapping[i], downloadFile=False)
        assert old.name == new.name
        assert old.annotations == new.annotations
        assert old.concreteType == new.concreteType

    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  folder_entity.id,
                  destinationId=second_project.id)
    # TEST: Throw error if excludeTypes isn't in file, link and table or isn't a list
    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  second_folder.id,
                  destinationId=second_project.id,
                  excludeTypes=["foo"])
    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  second_folder.id,
                  destinationId=second_project.id,
                  excludeTypes="file")
    # TEST: excludeType = ["file"], only the folder is created
    second = synapseutils.copy(syn,
                               second_folder.id,
                               destinationId=second_project.id,
                               excludeTypes=["file", "table", "link"])

    copied_folder = syn.get(second[second_folder.id])
    assert copied_folder.name == second_folder.name
    assert len(second) == 1
    # TEST: Make sure error is thrown if foldername already exists
    start_time = time.time()
    while syn.query("select id from entity where id=='%s'" %
                    copied_folder.id).get('totalNumberOfResults') <= 0:
        assert_less(time.time() - start_time, QUERY_TIMEOUT_SEC)
        time.sleep(2)

    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  second_folder.id,
                  destinationId=second_project.id)

    # ------------------------------------
    # TEST COPY PROJECT
    # ------------------------------------
    print("Test: Copy Project")
    third_project = syn.store(Project(name=str(uuid.uuid4())))
    schedule_for_cleanup(third_project.id)

    mapping = synapseutils.copy(syn,
                                project_entity.id,
                                destinationId=third_project.id)
    for i in mapping:
        old = syn.get(i, downloadFile=False)
        new = syn.get(mapping[i], downloadFile=False)
        if not isinstance(old, Project):
            assert old.name == new.name
        assert old.annotations == new.annotations
        assert old.concreteType == new.concreteType

    # TEST: Can't copy project to a folder
    assert_raises(ValueError,
                  synapseutils.copy,
                  syn,
                  project_entity.id,
                  destinationId=second_folder.id)
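The test above exercises the setProvenance modes of synapseutils.copy. A condensed sketch of the three options, assuming a logged-in client syn and placeholder entity IDs (each call is shown in isolation, since copying twice to the same destination raises ValueError):

import synapseutils

# 'traceback' (the default): the copy's provenance points back at the source entity
synapseutils.copy(syn, 'syn111', destinationId='syn222', setProvenance='traceback')

# 'existing': the copy keeps the source entity's own provenance record
synapseutils.copy(syn, 'syn111', destinationId='syn222', setProvenance='existing')

# None: the copy is stored with no provenance at all
synapseutils.copy(syn, 'syn111', destinationId='syn222', setProvenance=None)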
Example #25
0
def test_command_get_recursive_and_query(test_state):
    """Tests the 'synapse get -r' and 'synapse get -q' functions"""

    project_entity = test_state.project

    # Create Folders in Project
    folder_entity = test_state.syn.store(Folder(name=str(uuid.uuid4()),
                                                parent=project_entity))

    folder_entity2 = test_state.syn.store(Folder(name=str(uuid.uuid4()),
                                                 parent=folder_entity))

    # Create and upload two files in sub-Folder
    uploaded_paths = []
    file_entities = []

    for _ in range(2):
        f = utils.make_bogus_data_file()
        uploaded_paths.append(f)
        test_state.schedule_for_cleanup(f)
        file_entity = File(f, parent=folder_entity2)
        file_entity = test_state.syn.store(file_entity)
        file_entities.append(file_entity)

    # Add a file in the Folder as well
    f = utils.make_bogus_data_file()
    uploaded_paths.append(f)
    test_state.schedule_for_cleanup(f)
    file_entity = File(f, parent=folder_entity)
    file_entity = test_state.syn.store(file_entity)
    file_entities.append(file_entity)

    # get -r uses syncFromSynapse() which uses getChildren(), which is not immediately consistent,
    # but faster than chunked queries.
    time.sleep(2)
    # Test recursive get
    run(test_state,
        'synapse', '--skip-checks', 'get', '-r', folder_entity.id)
    # Verify that we downloaded files:
    new_paths = [os.path.join('.', folder_entity2.name, os.path.basename(f)) for f in uploaded_paths[:-1]]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    test_state.schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        test_state.schedule_for_cleanup(downloaded)

    # Test query get using a Table with an entity column
    # This should be replaced when Table File Views are implemented in the client
    cols = [Column(name='id', columnType='ENTITYID')]

    schema1 = test_state.syn.store(Schema(name='Foo Table', columns=cols, parent=project_entity))
    test_state.schedule_for_cleanup(schema1.id)

    data1 = [[x.id] for x in file_entities]

    test_state.syn.store(RowSet(schema=schema1, rows=[Row(r) for r in data1]))

    time.sleep(3)  # get -q are eventually consistent
    # Test Table/View query get
    run(test_state,
        'synapse', '--skip-checks', 'get', '-q',
        "select id from %s" % schema1.id)
    # Verify that we downloaded files:
    new_paths = [os.path.join('.', os.path.basename(f)) for f in uploaded_paths[:-1]]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    test_state.schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        test_state.schedule_for_cleanup(downloaded)

    test_state.schedule_for_cleanup(new_paths[0])
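The same "query get" can be done directly from Python rather than through the CLI; a rough sketch, assuming a logged-in client syn and a table with an ENTITYID column (the table ID is a placeholder):

results = syn.tableQuery("select id from syn12345")
for synapse_id in results.asDataFrame()['id']:
    syn.get(synapse_id)  # downloads each referenced file into the Synapse cache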
Example #26
0
# ----------------------------------------------------------------------------------------------------------------------
# Model
# ----------------------------------------------------------------------------------------------------------------------
Base = declarative_base()

SYN_SCHEMA = Schema(name=secrets.CONSENTS_TABLE_NAME,
                    columns=[
                        SynColumn(name='study_id',
                                  columnType='STRING',
                                  maximumSize=31),
                        SynColumn(name='internal_id', columnType='STRING'),
                        SynColumn(name='consent_dt',
                                  columnType='STRING',
                                  maximumSize=63),
                        SynColumn(name='location_sid',
                                  columnType='STRING',
                                  maximumSize=127),
                        SynColumn(name='search_sid',
                                  columnType='STRING',
                                  maximumSize=127),
                        SynColumn(name='notes',
                                  columnType='STRING',
                                  maximumSize=1000),
                    ],
                    parent=secrets.PROJECT_SYNID)
BLANK_CONSENT = ('blank', 0, 'blank', 'blank', 'blank', 'blank')
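A sketch of how this schema might be materialized and populated, assuming a logged-in client syn (the row values are illustrative):

from synapseclient import Table

stored_schema = syn.store(SYN_SCHEMA)
row = ['study_001', 'internal_42', '2020-01-01 00:00:00',
       'location_sid_value', 'search_sid_value', 'no notes']
syn.store(Table(stored_schema, [row]))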


class AppWrap(object):
    """a class used to wrap the application configuration options required to initialize the encryption cypher"""
    def __init__(self, config):
Example #27
0
def partial_rowset_test_state(syn, project):
    cols = [
        Column(name='foo', columnType='INTEGER'),
        Column(name='bar', columnType='INTEGER')
    ]
    table_schema = syn.store(
        Schema(name='PartialRowTest' + str(uuid.uuid4()),
               columns=cols,
               parent=project))
    data = [[1, None], [None, 2]]
    syn.store(RowSet(schema=table_schema, rows=[Row(r) for r in data]))

    # set up a file view
    folder = syn.store(
        Folder(name="PartialRowTestFolder" + str(uuid.uuid4()),
               parent=project))
    syn.store(
        File("~/path/doesnt/matter",
             name="f1",
             parent=folder,
             synapseStore=False))
    syn.store(
        File("~/path/doesnt/matter/again",
             name="f2",
             parent=folder,
             synapseStore=False))

    cols = [
        Column(name='foo', columnType='INTEGER'),
        Column(name='bar', columnType='INTEGER')
    ]
    view_schema = syn.store(
        EntityViewSchema(name='PartialRowTestViews' + str(uuid.uuid4()),
                         columns=cols,
                         addDefaultViewColumns=False,
                         parent=project,
                         scopes=[folder]))

    table_changes = [{'foo': 4}, {'bar': 5}]
    view_changes = [{'bar': 6}, {'foo': 7}]

    expected_table_cells = pd.DataFrame({
        'foo': [4.0, float('NaN')],
        'bar': [float('NaN'), 5.0]
    })
    expected_view_cells = pd.DataFrame({
        'foo': [float('NaN'), 7.0],
        'bar': [6.0, float('NaN')]
    })

    class TestState:
        def __init__(self):
            self.syn = syn
            self.project = project
            self.table_schema = table_schema
            self.view_schema = view_schema
            self.table_changes = table_changes
            self.view_changes = view_changes
            self.expected_table_cells = expected_table_cells
            self.expected_view_cells = expected_view_cells

    return TestState()
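The fixture's table_changes and view_changes are meant to be applied as partial row updates. A sketch of applying the table changes with PartialRowset.from_mapping, which pairs each change dict with a concrete rowId taken from a fresh query:

from synapseclient.table import PartialRowset

results = syn.tableQuery("select * from %s" % table_schema.id)
row_ids = [row['rowId'] for row in results.asRowSet()['rows']]
partial_changes = dict(zip(row_ids, table_changes))
syn.store(PartialRowset.from_mapping(partial_changes, results))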
Example #28
0
def test_syncFromSynapse__non_file_Entity():
    table_schema = "syn12345"
    with patch.object(syn, "getChildren", return_value=[]),\
         patch.object(syn, "get", return_value=Schema(name="asssdfa", parent="whatever")):
        assert_raises(ValueError, synapseutils.syncFromSynapse, syn,
                      table_schema)
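By contrast, a sketch of the intended usage: syncFromSynapse expects a Project or Folder (or a File) and recursively downloads its contents. The folder ID and path here are placeholders:

import synapseutils

files = synapseutils.syncFromSynapse(syn, 'syn67890', path='./downloads')
for f in files:
    print(f.path)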