def test_list_of_rows_table():
    data = [["John Coltrane",  1926, 8.65, False],
            ["Miles Davis",    1926, 9.87, False],
            ["Bill Evans",     1929, 7.65, False],
            ["Paul Chambers",  1935, 5.14, False],
            ["Jimmy Cobb",     1929, 5.78, True],
            ["Scott LaFaro",   1936, 4.21, False],
            ["Sonny Rollins",  1930, 8.99, True],
            ["Kenny Burrel",   1931, 4.37, True]]

    cols = [Column(id='1', name='Name', columnType='STRING'),
            Column(id='2', name='Born', columnType='INTEGER'),
            Column(id='3', name='Hipness', columnType='DOUBLE'),
            Column(id='4', name='Living', columnType='BOOLEAN')]

    schema1 = Schema(name='Jazz Guys', columns=cols, id="syn1000002", parent="syn1000001")

    # need columns to do cast_values w/o storing
    table = Table(schema1, data, headers=[SelectColumn.from_column(col) for col in cols])

    for table_row, expected_row in zip(table, data):
        assert table_row == expected_row

    rowset = table.asRowSet()
    for rowset_row, expected_row in zip(rowset.rows, data):
        assert rowset_row['values'] == expected_row

    table.columns = cols

    df = table.asDataFrame()
    assert list(df['Name']) == [r[0] for r in data]
Example #2
def test_list_of_rows_table():
    data = [["John Coltrane",  1926, 8.65, False],
            ["Miles Davis",    1926, 9.87, False],
            ["Bill Evans",     1929, 7.65, False],
            ["Paul Chambers",  1935, 5.14, False],
            ["Jimmy Cobb",     1929, 5.78, True],
            ["Scott LaFaro",   1936, 4.21, False],
            ["Sonny Rollins",  1930, 8.99, True],
            ["Kenny Burrel",   1931, 4.37, True]]

    cols = [Column(id='1', name='Name', columnType='STRING'),
            Column(id='2', name='Born', columnType='INTEGER'),
            Column(id='3', name='Hipness', columnType='DOUBLE'),
            Column(id='4', name='Living', columnType='BOOLEAN')]

    schema1 = Schema(name='Jazz Guys', columns=cols, id="syn1000002", parent="syn1000001")

    # need columns to do cast_values w/o storing
    table = Table(schema1, data, headers=[SelectColumn.from_column(col) for col in cols])

    for table_row, expected_row in zip(table, data):
        assert_equals(table_row, expected_row)

    rowset = table.asRowSet()
    for rowset_row, expected_row in zip(rowset.rows, data):
        assert_equals(rowset_row['values'], expected_row)

    table.columns = cols

    df = table.asDataFrame()
    assert_equals(list(df['Name']), [r[0] for r in data])
Example #3
def test_synapse_integer_columns_with_missing_values_from_dataframe():
    # SYNPY-267
    cols = [
        Column(name='x', columnType='STRING'),
        Column(name='y', columnType='INTEGER'),
        Column(name='z', columnType='DOUBLE')
    ]
    schema = syn.store(Schema(name='Big Table', columns=cols, parent=project))

    ## write rows to CSV file
    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv",
                                     delete=False) as temp:
        schedule_for_cleanup(temp.name)
        # 2nd row is missing a value in its integer column
        temp.write('x,y,z\na,1,0.9\nb,,0.8\nc,3,0.7\n')
        temp.flush()
        filename = temp.name

    # create a table from the CSV
    table = Table(schema, filename)
    df = table.asDataFrame()

    table_from_dataframe = Table(schema, df)
    assert_not_equal(table.filepath, table_from_dataframe.filepath)
    # compare to make sure no .0's were appended to the integers
    assert filecmp.cmp(table.filepath, table_from_dataframe.filepath)
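
A minimal pandas-only sketch (no Synapse involved) of why this test exists: pandas upcasts an integer column containing a missing value to float64, so a naive CSV round trip can turn 1 into 1.0.

import pandas as pd

# plain pandas behavior, shown for illustration; synapseclient is not needed here
df = pd.DataFrame({'y': [1, None, 3]})
print(df['y'].dtype)            # float64 -- the int column was upcast
print(df.to_csv(index=False))   # y / 1.0 / (blank) / 3.0 -- note the appended .0's
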
Example #4
def test_pandas_to_table():
    pd = _try_import_pandas('test_pandas_to_table')

    df = pd.DataFrame(dict(a=[1, 2, 3], b=["c", "d", "e"]))
    schema = Schema(name="Baz",
                    parent="syn12345",
                    columns=as_table_columns(df))
    print("\n", df, "\n\n")

    ## A dataframe with no row id and version
    table = Table(schema, df)

    for i, row in enumerate(table):
        print(row)
        assert row[0] == (i + 1)
        assert row[1] == ["c", "d", "e"][i]

    assert len(table) == 3

    ## If includeRowIdAndRowVersion=True, include empty row ids and versions
    ## ROW_ID,ROW_VERSION,a,b
    ## ,,1,c
    ## ,,2,d
    ## ,,3,e
    table = Table(schema, df, includeRowIdAndRowVersion=True)
    for i, row in enumerate(table):
        print(row)
        assert row[0] is None
        assert row[1] is None
        assert row[2] == (i + 1)

    ## A dataframe with row id and version encoded in the index
    df = pd.DataFrame(index=["1_7", "2_7", "3_8"],
                      data=dict(a=[100, 200, 300], b=["c", "d", "e"]))
    print("\n", df, "\n\n")

    table = Table(schema, df)
    for i, row in enumerate(table):
        print(row)
        assert row[0] == ["1", "2", "3"][i]
        assert row[1] == ["7", "7", "8"][i]
        assert row[2] == (i + 1) * 100
        assert row[3] == ["c", "d", "e"][i]

    ## A dataframe with row id and version in columns
    df = pd.DataFrame(
        dict(ROW_ID=["0", "1", "2"],
             ROW_VERSION=["8", "9", "9"],
             a=[100, 200, 300],
             b=["c", "d", "e"]))
    print("\n", df, "\n\n")

    table = Table(schema, df)
    for i, row in enumerate(table):
        print(row)
        assert row[0] == ["0", "1", "2"][i]
        assert row[1] == ["8", "9", "9"][i]
        assert row[2] == (i + 1) * 100
        assert row[3] == ["c", "d", "e"][i]
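
A small sketch of the index convention the assertions above rely on: a row label like "1_7" splits into a row id of "1" and a row version of "7".

# hedged illustration of the "<rowId>_<rowVersion>" index convention
row_id, row_version = "1_7".split("_")
assert (row_id, row_version) == ("1", "7")
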
Example #5
def test_pandas_to_table():
    df = pd.DataFrame(dict(a=[1, 2, 3], b=["c", "d", "e"]))
    schema = Schema(name="Baz",
                    parent="syn12345",
                    columns=as_table_columns(df))

    # A dataframe with no row id and version
    table = Table(schema, df)

    for i, row in enumerate(table):
        assert_equals(row[0], (i + 1))
        assert_equals(row[1], ["c", "d", "e"][i])

    assert_equals(len(table), 3)

    # If includeRowIdAndRowVersion=True, include empty row ids and versions
    # ROW_ID,ROW_VERSION,a,b
    # ,,1,c
    # ,,2,d
    # ,,3,e
    table = Table(schema, df, includeRowIdAndRowVersion=True)
    for i, row in enumerate(table):
        assert_is_none(row[0])
        assert_is_none(row[1])
        assert_equals(row[2], (i + 1))

    # A dataframe with row id and version encoded in the index
    df = pd.DataFrame(index=["1_7", "2_7", "3_8"],
                      data=dict(a=[100, 200, 300], b=["c", "d", "e"]))

    table = Table(schema, df)
    for i, row in enumerate(table):
        assert_equals(row[0], ["1", "2", "3"][i])
        assert_equals(row[1], ["7", "7", "8"][i])
        assert_equals(row[2], (i + 1) * 100)
        assert_equals(row[3], ["c", "d", "e"][i])

    # A dataframe with row id and version in columns
    df = pd.DataFrame(
        dict(ROW_ID=["0", "1", "2"],
             ROW_VERSION=["8", "9", "9"],
             a=[100, 200, 300],
             b=["c", "d", "e"]))

    table = Table(schema, df)
    for i, row in enumerate(table):
        assert_equals(row[0], ["0", "1", "2"][i])
        assert_equals(row[1], ["8", "9", "9"][i])
        assert_equals(row[2], (i + 1) * 100)
        assert_equals(row[3], ["c", "d", "e"][i])
Example #6
def test_RowSetTable():
    row_set_json = {
        'etag': 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee',
        'headers': [
         {'columnType': 'STRING', 'id': '353', 'name': 'name'},
         {'columnType': 'DOUBLE', 'id': '355', 'name': 'x'},
         {'columnType': 'DOUBLE', 'id': '3020', 'name': 'y'},
         {'columnType': 'INTEGER', 'id': '891', 'name': 'n'}],
        'rows': [{
          'rowId': 5,
          'values': ['foo', '1.23', '2.2', '101'],
          'versionNumber': 3},
         {'rowId': 6,
          'values': ['bar', '1.34', '2.4', '101'],
          'versionNumber': 3},
         {'rowId': 7,
          'values': ['foo', '1.23', '2.2', '101'],
          'versionNumber': 4},
         {'rowId': 8,
          'values': ['qux', '1.23', '2.2', '102'],
          'versionNumber': 3}],
        'tableId': 'syn2976298'}

    row_set = RowSet.from_json(row_set_json)

    assert row_set.etag == 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'
    assert row_set.tableId == 'syn2976298'
    assert len(row_set.headers) == 4
    assert len(row_set.rows) == 4

    schema = Schema(id="syn2976298", name="Bogus Schema", columns=[353, 355, 3020, 891], parent="syn1000001")

    table = Table(schema, row_set)

    assert table.etag == 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'
    assert table.tableId == 'syn2976298'
    assert len(table.headers) == 4
    assert len(table.asRowSet().rows) == 4

    try:
        import pandas as pd

        df = table.asDataFrame()
        assert df.shape == (4, 4)
        assert all(df['name'] == ['foo', 'bar', 'foo', 'qux'])

    except ImportError as e1:
        sys.stderr.write('Pandas is apparently not installed, skipping part of test_RowSetTable.\n\n')
Example #7
def test_tables_csv():

    # Define schema
    cols = [
        Column(name='Name', columnType='STRING'),
        Column(name='Born', columnType='INTEGER'),
        Column(name='Hipness', columnType='DOUBLE'),
        Column(name='Living', columnType='BOOLEAN')
    ]

    schema = Schema(name='Jazz Guys', columns=cols, parent=project)

    data = [["John Coltrane", 1926, 8.65, False],
            ["Miles Davis", 1926, 9.87, False],
            ["Bill Evans", 1929, 7.65, False],
            ["Paul Chambers", 1935, 5.14, False],
            ["Jimmy Cobb", 1929, 5.78, True],
            ["Scott LaFaro", 1936, 4.21, False],
            ["Sonny Rollins", 1930, 8.99, True],
            ["Kenny Burrel", 1931, 4.37, True]]

    # the following creates a CSV file and uploads it to create a new table
    table = syn.store(Table(schema, data))

    # Query and download an identical CSV
    results = syn.tableQuery("select * from %s" % table.schema.id,
                             resultsAs="csv",
                             includeRowIdAndRowVersion=False)

    # Test that CSV file came back as expected
    for expected_row, row in zip(data, results):
        assert_equals(expected_row, row,
                      "expected %s but got %s" % (expected_row, row))
Example #8
def test_tables_pandas():
    try:
        ## check if we have pandas
        import pandas as pd

        ## create a pandas DataFrame
        df = pd.DataFrame({
            'A': ("foo", "bar", "baz", "qux", "asdf"),
            'B': tuple(math.pi * i for i in range(5)),
            'C': (101, 202, 303, 404, 505),
            'D': (False, True, False, True, False)
        })

        cols = as_table_columns(df)
        cols[0].maximumSize = 20
        schema = Schema(name="Nifty Table", columns=cols, parent=project)

        ## store in Synapse
        table = syn.store(Table(schema, df))

        ## retrieve the table and verify
        results = syn.tableQuery('select * from %s' % table.schema.id)
        df2 = results.asDataFrame()

        ## simulate rowId-version rownames for comparison
        df.index = ['%s_0' % i for i in range(5)]
        assert all(df2 == df)

    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping test_tables_pandas.\n\n'
        )
Example #9
def test_build_table_download_file_handle_list__repeated_file_handles():
    syn = synapseclient.client.Synapse(debug=True, skip_checks=True)

    # patch the cache so we don't look there in case FileHandle ids actually exist there
    # (the patcher must be started for the patch to take effect)
    patch.object(syn.cache, "get", return_value=None).start()

    cols = [
        Column(name='Name', columnType='STRING', maximumSize=50),
        Column(name='filehandle', columnType='FILEHANDLEID')
    ]

    schema = Schema(name='FileHandleTest', columns=cols, parent='syn420')

    # using some large file handle numbers so they don't collide with anything real
    data = [["ayy lmao", 5318008], ["large numberino", 0x5f3759df],
            ["repeated file handle", 5318008],
            ["repeated file handle also", 0x5f3759df]]

    ## need columns to do cast_values w/o storing
    table = Table(schema,
                  data,
                  headers=[SelectColumn.from_column(col) for col in cols])

    file_handle_associations, file_handle_to_path_map = syn._build_table_download_file_handle_list(
        table, ['filehandle'])

    # verify only 2 file handles are added (repeats were ignored)
    assert_equals(2, len(file_handle_associations))
    assert_equals(0, len(file_handle_to_path_map))  # might as well check anyway
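
A self-contained sketch of the invariant being verified: duplicate FILEHANDLEID values should collapse to a single download association each, which is just set semantics over the handle column.

# the same file handle values as the test data above
handles = [5318008, 0x5f3759df, 5318008, 0x5f3759df]
assert len(set(handles)) == 2  # repeats collapse to two unique handles
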
Example #10
def test_list_of_rows_table():
    data = [["John Coltrane", 1926, 8.65, False],
            ["Miles Davis", 1926, 9.87, False],
            ["Bill Evans", 1929, 7.65, False],
            ["Paul Chambers", 1935, 5.14, False],
            ["Jimmy Cobb", 1929, 5.78, True],
            ["Scott LaFaro", 1936, 4.21, False],
            ["Sonny Rollins", 1930, 8.99, True],
            ["Kenny Burrel", 1931, 4.37, True]]

    cols = []
    cols.append(Column(id='1', name='Name', columnType='STRING'))
    cols.append(Column(id='2', name='Born', columnType='INTEGER'))
    cols.append(Column(id='3', name='Hipness', columnType='DOUBLE'))
    cols.append(Column(id='4', name='Living', columnType='BOOLEAN'))

    schema1 = Schema(name='Jazz Guys',
                     columns=cols,
                     id="syn1000002",
                     parent="syn1000001")

    ## need columns to do cast_values w/o storing
    table = Table(schema1,
                  data,
                  headers=[SelectColumn.from_column(col) for col in cols])

    for table_row, expected_row in zip(table, data):
        assert table_row == expected_row

    rowset = table.asRowSet()
    for rowset_row, expected_row in zip(rowset.rows, data):
        assert rowset_row['values'] == expected_row

    table.columns = cols

    ## test asDataFrame
    try:
        import pandas as pd

        df = table.asDataFrame()
        assert all(df['Name'] == [r[0] for r in data])

    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping asDataFrame portion of test_list_of_rows_table.\n\n'
        )
Example #11
def test_table_file_view_csv_update_annotations__includeEntityEtag():
    folder = syn.store(
        synapseclient.Folder(name="updateAnnoFolder" + str(uuid.uuid4()),
                             parent=project))
    anno1_name = "annotationColumn1"
    anno2_name = "annotationColumn2"
    initial_annotations = {
        anno1_name: "initial_value1",
        anno2_name: "initial_value2"
    }
    file_entity = syn.store(
        File(name=
             "test_table_file_view_csv_update_annotations__includeEntityEtag",
             path="~/fakepath",
             synapseStore=False,
             parent=folder,
             annotations=initial_annotations))

    annotation_columns = [
        Column(name=anno1_name, columnType='STRING'),
        Column(name=anno2_name, columnType='STRING')
    ]
    entity_view = syn.store(
        EntityViewSchema(name="TestEntityViewSchemaUpdateAnnotation" +
                         str(uuid.uuid4()),
                         parent=project,
                         scopes=[folder],
                         columns=annotation_columns))

    query_str = "SELECT {anno1}, {anno2} FROM {proj_id}".format(
        anno1=anno1_name, anno2=anno2_name, proj_id=utils.id_of(entity_view))

    # modify the first annotation using a rowset
    rowset_query_result = syn.tableQuery(query_str, resultsAs="rowset")
    rowset = rowset_query_result.asRowSet()
    rowset_changed_anno_value = "rowset_value_change"
    rowset.rows[0].values[0] = rowset_changed_anno_value
    syn.store(rowset)

    # modify the second annotation using a csv
    csv_query_result = syn.tableQuery(query_str, resultsAs="csv")
    dataframe = csv_query_result.asDataFrame()
    csv_changed_anno_value = "csv_value_change"
    dataframe.loc[dataframe.index[0], anno2_name] = csv_changed_anno_value  # .ix was removed from pandas
    syn.store(Table(utils.id_of(entity_view), dataframe))

    # check annotations on the file entity; they may not be updated immediately, so poll in a loop
    expected_annotations = {
        anno1_name: [rowset_changed_anno_value],
        anno2_name: [csv_changed_anno_value]
    }
    start_time = time.time()
    while expected_annotations != file_entity.annotations:
        assert_less(time.time() - start_time, QUERY_TIMEOUT_SEC)
        time.sleep(2)
        file_entity = syn.get(file_entity, downloadFile=False)
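
The while loop above is a poll-until-timeout pattern; here is a reusable sketch of it (wait_until and its parameters are illustrative, not part of synapseclient):

import time

def wait_until(predicate, timeout_sec, interval_sec=2):
    # retry until the predicate holds, failing loudly on timeout
    start = time.time()
    while not predicate():
        assert time.time() - start < timeout_sec, "timed out waiting for condition"
        time.sleep(interval_sec)

wait_until(lambda: True, timeout_sec=5)  # trivially satisfied example
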
Example #12
def test_tables_pandas():
    try:
        ## check if we have pandas
        import pandas as pd

        #import numpy for datatypes
        import numpy as np

        ## create a pandas DataFrame
        df = pd.DataFrame({
            'A': ("foo", "bar", "baz", "qux", "asdf"),
            'B':
            tuple(0.42 * i for i in range(5)),
            'C': (101, 202, 303, 404, 505),
            'D': (False, True, False, True, False),
            # additional data types supported since SYNPY-347
            'int64':
            tuple(np.int64(range(5))),
            'datetime64':
            tuple(
                np.datetime64(d) for d in [
                    '2005-02-01', '2005-02-02', '2005-02-03', '2005-02-04',
                    '2005-02-05'
                ]),
            'string_':
            tuple(
                np.string_(s)
                for s in ['urgot', 'has', 'dark', 'mysterious', 'past'])
        })

        cols = as_table_columns(df)
        cols[0].maximumSize = 20
        schema = Schema(name="Nifty Table", columns=cols, parent=project)

        ## store in Synapse
        table = syn.store(Table(schema, df))

        ## retrieve the table and verify
        results = syn.tableQuery('select * from %s' % table.schema.id,
                                 resultsAs='csv')
        df2 = results.asDataFrame(convert_to_datetime=True)

        ## simulate rowId-version rownames for comparison
        df.index = ['%s_0' % i for i in range(5)]

        # for python3 we need to convert from numpy.bytes_ to str or the equivalence comparison fails
        if six.PY3:
            df['string_'] = df['string_'].transform(str)

        # df2 == df gives a DataFrame of booleans; the first .all() ANDs each column into a Series,
        # and the second .all() ANDs that Series into a single bool
        assert (df2 == df).all().all()

    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping test_tables_pandas.\n\n'
        )
Example #13
def write_synapse_table(table_data,
                        synapse_project_id,
                        table_name='',
                        username='',
                        password=''):
    """
    Write data to a Synapse table.

    Parameters
    ----------
    table_data : Pandas DataFrame
        Synapse table contents
    synapse_project_id : string
        Synapse ID for project within which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Examples
    --------
    >>> from mhealthx.io_data import read_synapse_table_files, write_synapse_table
    >>> in_synapse_table_id = 'syn4590865'
    >>> synapse_project_id = 'syn4899451'
    >>> column_names = []
    >>> download_limit = None
    >>> out_path = '.'
    >>> username = ''
    >>> password = ''
    >>> table_data, files = read_synapse_table_files(in_synapse_table_id, column_names, download_limit, out_path, username, password)
    >>> table_name = 'Contents of ' + in_synapse_table_id
    >>> write_synapse_table(table_data, synapse_project_id, table_name, username, password)

    """
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    table_data.index = range(table_data.shape[0])

    schema = Schema(name=table_name,
                    columns=as_table_columns(table_data),
                    parent=synapse_project_id,
                    includeRowIdAndRowVersion=False)

    syn.store(Table(schema, table_data))
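
One detail worth noting in write_synapse_table: resetting the index keeps stale "rowId_version" labels from a previously downloaded table out of the upload. A quick sketch:

import pandas as pd

df = pd.DataFrame({'a': [1, 2]}, index=['5_2', '6_2'])  # labels left over from a download
df.index = range(df.shape[0])
assert list(df.index) == [0, 1]  # plain positional index again
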
Example #14
def test_RowSetTable():
    row_set_json = {
        'etag': 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee',
        'headers': [
         {'columnType': 'STRING', 'id': '353', 'name': 'name'},
         {'columnType': 'DOUBLE', 'id': '355', 'name': 'x'},
         {'columnType': 'DOUBLE', 'id': '3020', 'name': 'y'},
         {'columnType': 'INTEGER', 'id': '891', 'name': 'n'}],
        'rows': [{
          'rowId': 5,
          'values': ['foo', '1.23', '2.2', '101'],
          'versionNumber': 3},
         {'rowId': 6,
          'values': ['bar', '1.34', '2.4', '101'],
          'versionNumber': 3},
         {'rowId': 7,
          'values': ['foo', '1.23', '2.2', '101'],
          'versionNumber': 4},
         {'rowId': 8,
          'values': ['qux', '1.23', '2.2', '102'],
          'versionNumber': 3}],
        'tableId': 'syn2976298'}

    row_set = RowSet.from_json(row_set_json)

    assert_equals(row_set.etag, 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee')
    assert_equals(row_set.tableId, 'syn2976298')
    assert_equals(len(row_set.headers), 4)
    assert_equals(len(row_set.rows), 4)

    schema = Schema(id="syn2976298", name="Bogus Schema", columns=[353, 355, 3020, 891], parent="syn1000001")

    table = Table(schema, row_set)

    assert_equals(table.etag, 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee')
    assert_equals(table.tableId, 'syn2976298')
    assert_equals(len(table.headers), 4)
    assert_equals(len(table.asRowSet().rows), 4)

    df = table.asDataFrame()
    assert_equals(df.shape, (4, 4))
    assert_equals(list(df['name']), ['foo', 'bar', 'foo', 'qux'])
Example #15
def test_store_table_datetime():
    current_datetime = datetime.fromtimestamp(round(time.time(), 3))
    schema = syn.store(
        Schema("testTable", [Column(name="testerino", columnType='DATE')],
               project))
    rowset = RowSet(rows=[Row([current_datetime])], schema=schema)
    rowset_table = syn.store(Table(schema, rowset))

    query_result = syn.tableQuery("select * from %s" % id_of(schema),
                                  resultsAs="rowset")
    assert_equals(current_datetime,
                  query_result.rowset['rows'][0]['values'][0])
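
Synapse DATE columns carry millisecond precision, which is presumably why the test rounds time.time() to three decimals before building the datetime; sub-millisecond digits would not survive the round trip. A sketch of the rounding:

import time
from datetime import datetime

now = time.time()
current_datetime = datetime.fromtimestamp(round(now, 3))
assert current_datetime.microsecond % 1000 == 0  # nothing finer than a millisecond
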
Example #16
def test_synapse_integer_columns_with_missing_values_from_dataframe():
    # SYNPY-267
    cols = [
        Column(name='x', columnType='STRING'),
        Column(name='y', columnType='INTEGER'),
        Column(name='z', columnType='DOUBLE')
    ]
    schema = syn.store(Schema(name='Big Table', columns=cols, parent=project))

    line_terminator = str(os.linesep)
    # write rows to CSV file
    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv",
                                     delete=False) as temp:
        schedule_for_cleanup(temp.name)
        # 2nd row is missing a value in its integer column
        temp.write('x,y,z' + line_terminator + 'a,1,0.9' + line_terminator +
                   'b,,0.8' + line_terminator + 'c,3,0.7' + line_terminator)
        temp.flush()
        filename = temp.name

    # create a table from csv
    table = Table(schema, filename)
    df = table.asDataFrame()

    table_from_dataframe = Table(schema, df)
    assert_not_equal(table.filepath, table_from_dataframe.filepath)
    df2 = table_from_dataframe.asDataFrame()
    assert_frame_equal(df, df2)
Example #17
def test_dict_to_table():
    d = dict(a=[1, 2, 3], b=["c", "d", "e"])
    df = pd.DataFrame(d)
    schema = Schema(name="Baz", parent="syn12345", columns=as_table_columns(df))

    with patch.object(CsvFileTable, "from_data_frame") as mocked_from_data_frame:
        Table(schema, d)

    # call_args is an (args, kwargs) pair; take the positional args
    args_list = mocked_from_data_frame.call_args[0]
    # the DataFrame is the second positional argument
    df_arg = args_list[1]
    assert df_arg.equals(df)
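
A small sketch of the call_args introspection used above, with a bare MagicMock so it runs standalone: call_args is an (args, kwargs) pair, so call_args[0][1] is the second positional argument.

from unittest.mock import MagicMock

m = MagicMock()
m('first', 'second')
assert m.call_args[0][1] == 'second'  # positional args live in call_args[0]
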
Example #18
def test_list_of_rows_table():
    data = [["John Coltrane",  1926, 8.65, False],
            ["Miles Davis",    1926, 9.87, False],
            ["Bill Evans",     1929, 7.65, False],
            ["Paul Chambers",  1935, 5.14, False],
            ["Jimmy Cobb",     1929, 5.78, True],
            ["Scott LaFaro",   1936, 4.21, False],
            ["Sonny Rollins",  1930, 8.99, True],
            ["Kenny Burrel",   1931, 4.37, True]]

    cols = []
    cols.append(Column(id='1', name='Name', columnType='STRING'))
    cols.append(Column(id='2', name='Born', columnType='INTEGER'))
    cols.append(Column(id='3', name='Hipness', columnType='DOUBLE'))
    cols.append(Column(id='4', name='Living', columnType='BOOLEAN'))

    schema1 = Schema(name='Jazz Guys', columns=cols, id="syn1000002", parent="syn1000001")

    ## need columns to do cast_values w/o storing
    table = Table(schema1, data, headers=[SelectColumn.from_column(col) for col in cols])

    for table_row, expected_row in zip(table, data):
        assert table_row == expected_row

    rowset = table.asRowSet()
    for rowset_row, expected_row in zip(rowset.rows, data):
        assert rowset_row['values'] == expected_row

    table.columns = cols

    ## test asDataFrame
    try:
        import pandas as pd

        df = table.asDataFrame()
        assert all(df['Name'] == [r[0] for r in data])

    except ImportError as e1:
        sys.stderr.write('Pandas is apparently not installed, skipping asDataFrame portion of test_list_of_rows_table.\n\n')
Example #19
def test_tables_pandas():
    # create a pandas DataFrame
    df = pd.DataFrame({
        'A': ("foo", "bar", "baz", "qux", "asdf"),
        'B':
        tuple(0.42 * i for i in range(5)),
        'C': (101, 202, 303, 404, 505),
        'D': (False, True, False, True, False),
        # additional data types supported since SYNPY-347
        'int64':
        tuple(np.int64(range(5))),
        'datetime64':
        tuple(
            np.datetime64(d) for d in [
                '2005-02-01', '2005-02-02', '2005-02-03', '2005-02-04',
                '2005-02-05'
            ]),
        'string_':
        tuple(
            np.string_(s)
            for s in ['urgot', 'has', 'dark', 'mysterious', 'past'])
    })

    cols = as_table_columns(df)
    cols[0].maximumSize = 20
    schema = Schema(name="Nifty Table", columns=cols, parent=project)

    # store in Synapse
    table = syn.store(Table(schema, df))

    # retrieve the table and verify
    results = syn.tableQuery('select * from %s' % table.schema.id,
                             resultsAs='csv')
    df2 = results.asDataFrame(convert_to_datetime=True)

    # simulate rowId-version rownames for comparison
    df.index = ['%s_0' % i for i in range(5)]

    # for python3 we need to convert from numpy.bytes_ to str or the equivalence comparison fails
    if six.PY3:
        df['string_'] = df['string_'].transform(str)

    # SYNPY-717
    df['datetime64'] = df['datetime64'].apply(
        lambda x: pd.Timestamp(x).tz_localize('UTC'))

    # assert_frame_equal checks both values and dtypes, replacing the older
    # (df2 == df).all().all() idiom used in earlier versions of this test

    assert_frame_equal(df2, df)
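
A sketch of the SYNPY-717 normalization above: the client hands back UTC-aware Timestamps, while the locally built column is naive, so it has to be localized before assert_frame_equal will match.

import pandas as pd

ts = pd.Timestamp('2005-02-01')
assert ts.tz is None                           # naive as constructed
assert str(ts.tz_localize('UTC').tz) == 'UTC'  # aware after localizing
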
Example #20
def test_RowSetTable():
    row_set_json = {
        'etag': 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee',
        'headers': [
            {'columnType': 'STRING', 'id': '353', 'name': 'name'},
            {'columnType': 'DOUBLE', 'id': '355', 'name': 'x'},
            {'columnType': 'DOUBLE', 'id': '3020', 'name': 'y'},
            {'columnType': 'INTEGER', 'id': '891', 'name': 'n'}],
        'rows': [{
            'rowId': 5,
            'values': ['foo', '1.23', '2.2', '101'],
            'versionNumber': 3},
            {'rowId': 6,
             'values': ['bar', '1.34', '2.4', '101'],
             'versionNumber': 3},
            {'rowId': 7,
             'values': ['foo', '1.23', '2.2', '101'],
             'versionNumber': 4},
            {'rowId': 8,
             'values': ['qux', '1.23', '2.2', '102'],
             'versionNumber': 3}],
        'tableId': 'syn2976298'}

    row_set = RowSet.from_json(row_set_json)

    assert row_set.etag == 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'
    assert row_set.tableId == 'syn2976298'
    assert len(row_set.headers) == 4
    assert len(row_set.rows) == 4

    schema = Schema(id="syn2976298", name="Bogus Schema", columns=[353, 355, 3020, 891], parent="syn1000001")

    table = Table(schema, row_set)

    assert table.etag == 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'
    assert table.tableId == 'syn2976298'
    assert len(table.headers) == 4
    assert len(table.asRowSet().rows) == 4

    df = table.asDataFrame()
    assert df.shape == (4, 4)
    assert list(df['name']) == ['foo', 'bar', 'foo', 'qux']
Example #21
def test_tables_csv():

    ## Define schema
    cols = []
    cols.append(Column(name='Name', columnType='STRING'))
    cols.append(Column(name='Born', columnType='INTEGER'))
    cols.append(Column(name='Hipness', columnType='DOUBLE'))
    cols.append(Column(name='Living', columnType='BOOLEAN'))

    schema = Schema(name='Jazz Guys', columns=cols, parent=project)

    data = [["John Coltrane", 1926, 8.65, False],
            ["Miles Davis", 1926, 9.87, False],
            ["Bill Evans", 1929, 7.65, False],
            ["Paul Chambers", 1935, 5.14, False],
            ["Jimmy Cobb", 1929, 5.78, True],
            ["Scott LaFaro", 1936, 4.21, False],
            ["Sonny Rollins", 1930, 8.99, True],
            ["Kenny Burrel", 1931, 4.37, True]]

    ## the following creates a CSV file and uploads it to create a new table
    table = syn.store(Table(schema, data))

    ## Query and download an identical CSV
    results = syn.tableQuery("select * from %s" % table.schema.id,
                             resultsAs="csv",
                             includeRowIdAndRowVersion=False)

    ## Test that CSV file came back as expected
    for expected_row, row in zip(data, results):
        assert expected_row == row, "expected %s but got %s" % (expected_row,
                                                                row)

    try:
        ## check if we have pandas
        import pandas as pd

        df = results.asDataFrame()
        assert all(df.columns.values == ['Name', 'Born', 'Hipness', 'Living'])
        assert list(df.iloc[1, [0, 1, 3]]) == ['Miles Davis', 1926, False]
        assert abs(df.iloc[1, 2] - 9.87) < 0.0001
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping test of .asDataFrame for CSV tables.\n\n'
        )

    ## Aggregate query
    expected = {True: [True, 1929, 3, 6.38], False: [False, 1926, 5, 7.104]}

    results = syn.tableQuery(
        'select Living, min(Born), count(Living), avg(Hipness) from %s group by Living'
        % table.schema.id,
        resultsAs="csv",
        includeRowIdAndRowVersion=False)
    for row in results:
        living = row[0]
        assert expected[living][1] == row[1]
        assert expected[living][2] == row[2]
        assert abs(expected[living][3] - row[3]) < 0.0001

    ## Aggregate query results to DataFrame
    try:
        ## check if we have pandas
        import pandas as pd

        df = results.asDataFrame()
        assert all(expected[df.iloc[0, 0]][0:3] == df.iloc[0, 0:3])
        assert abs(expected[df.iloc[1, 0]][3] - df.iloc[1, 3]) < 0.0001
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping test of .asDataFrame for aggregate queries as CSV tables.\n\n'
        )

    ## Append rows
    more_jazz_guys = [["Sonny Clark", 1931, 8.43, False],
                      ["Hank Mobley", 1930, 5.67, False],
                      ["Freddie Hubbard", 1938, float('nan'), False],
                      ["Thelonious Monk", 1917, float('inf'), False]]
    table = syn.store(Table(table.schema, more_jazz_guys))

    ## test that CSV file now has more jazz guys
    results = syn.tableQuery("select * from %s" % table.schema.id,
                             resultsAs="csv")
    for expected_row, row in zip(data + more_jazz_guys, results):
        for field, expected_field in zip(row[2:], expected_row):
            if type(field) is float and math.isnan(field):
                assert type(expected_field) is float and math.isnan(
                    expected_field)
            elif type(expected_field) is float and math.isnan(expected_field):
                assert type(field) is float and math.isnan(field)
            else:
                assert expected_field == field

    ## Update as a RowSet
    rowset = results.asRowSet()
    for row in rowset['rows']:
        if row['values'][1] == 1930:
            row['values'][2] = 8.5
    row_reference_set = syn.store(rowset)

    ## aggregate queries won't return row id and version, so we need to
    ## handle this correctly
    results = syn.tableQuery(
        'select Born, COUNT(*) from %s group by Born order by Born' %
        table.schema.id,
        resultsAs="csv")
    assert results.includeRowIdAndRowVersion is False
    for i, row in enumerate(results):
        assert row[0] == [1917, 1926, 1929, 1930, 1931, 1935, 1936, 1938][i]
        assert row[1] == [1, 2, 2, 2, 2, 1, 1, 1][i]

    try:
        import pandas as pd
        results = syn.tableQuery("select * from %s where Born=1930" %
                                 table.schema.id,
                                 resultsAs="csv")
        df = results.asDataFrame()
        assert all(df['Born'].values == 1930)
        assert all(df['Hipness'].values == 8.5)

        ## Update via a Data Frame
        df['Hipness'] = 9.75
        table = syn.store(Table(table.tableId, df, etag=results.etag))

        results = syn.tableQuery("select * from %s where Born=1930" %
                                 table.tableId,
                                 resultsAs="csv")
        for row in results:
            assert row[4] == 9.75
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_tables_csv.\n\n'
        )

    ## check what happens when query result is empty
    results = syn.tableQuery('select * from %s where Born=2013' %
                             table.tableId,
                             resultsAs="csv")
    assert len(list(results)) == 0

    try:
        import pandas as pd
        results = syn.tableQuery('select * from %s where Born=2013' %
                                 table.tableId,
                                 resultsAs="csv")
        df = results.asDataFrame()
        assert df.shape[0] == 0
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_tables_csv.\n\n'
        )

    ## delete some rows
    results = syn.tableQuery('select * from %s where Hipness < 7' %
                             table.tableId,
                             resultsAs="csv")
    syn.delete(results)
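
The symmetric NaN checks inside test_tables_csv look redundant but are not: NaN never compares equal to anything, including itself, so each side has to be tested with math.isnan.

import math

nan = float('nan')
assert nan != nan       # equality can never confirm a NaN match
assert math.isnan(nan)  # so both sides get the explicit check
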
Example #22
def test_csv_table():
    # Maybe not truly a unit test, but here because it doesn't do
    # network IO to synapse
    data = [["1", "1", "John Coltrane",  1926, 8.65, False],
            ["2", "1", "Miles Davis",    1926, 9.87, False],
            ["3", "1", "Bill Evans",     1929, 7.65, False],
            ["4", "1", "Paul Chambers",  1935, 5.14, False],
            ["5", "1", "Jimmy Cobb",     1929, 5.78, True],
            ["6", "1", "Scott LaFaro",   1936, 4.21, False],
            ["7", "1", "Sonny Rollins",  1930, 8.99, True],
            ["8", "1", "Kenny Burrel",   1931, 4.37, True]]

    filename = None

    cols = [Column(id='1', name='Name', columnType='STRING'),
            Column(id='2', name='Born', columnType='INTEGER'),
            Column(id='3', name='Hipness', columnType='DOUBLE'),
            Column(id='4', name='Living', columnType='BOOLEAN')]

    schema1 = Schema(id='syn1234', name='Jazz Guys', columns=cols, parent="syn1000001")

    # TODO: use io.StringIO(data) rather than writing files
    try:
        # create CSV file
        with tempfile.NamedTemporaryFile(delete=False) as temp:
            filename = temp.name

        with io.open(filename, mode='w', encoding="utf-8", newline='') as temp:
            writer = csv.writer(temp, quoting=csv.QUOTE_NONNUMERIC, lineterminator=str(os.linesep))
            headers = ['ROW_ID', 'ROW_VERSION'] + [col.name for col in cols]
            writer.writerow(headers)
            for row in data:
                writer.writerow(row)

        table = Table(schema1, filename)
        assert_is_instance(table, CsvFileTable)

        # need to set column headers to read a CSV file
        table.setColumnHeaders(
            [SelectColumn(name="ROW_ID", columnType="STRING"),
             SelectColumn(name="ROW_VERSION", columnType="STRING")] +
            [SelectColumn.from_column(col) for col in cols])

        # test iterator
        for table_row, expected_row in zip(table, data):
            assert_equals(table_row, expected_row)

        # test asRowSet
        rowset = table.asRowSet()
        for rowset_row, expected_row in zip(rowset.rows, data):
            assert_equals(rowset_row['values'], expected_row[2:])
            assert_equals(rowset_row['rowId'], expected_row[0])
            assert_equals(rowset_row['versionNumber'], expected_row[1])

        df = table.asDataFrame()
        assert_equals(list(df['Name']), [row[2] for row in data])
        assert_equals(list(df['Born']), [row[3] for row in data])
        assert_equals(list(df['Living']), [row[5] for row in data])
        assert_equals(list(df.index), ['%s_%s' % tuple(row[0:2]) for row in data])
        assert_equals(df.shape, (8, 4))

    except Exception:
        if filename:
            try:
                if os.path.isdir(filename):
                    shutil.rmtree(filename)
                else:
                    os.remove(filename)
            except Exception as ex:
                print(ex)
        raise
Example #23
def test_RowSetTable():
    row_set_json = {
        'etag':
        'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee',
        'headers': [{
            'columnType': 'STRING',
            'id': '353',
            'name': 'name'
        }, {
            'columnType': 'DOUBLE',
            'id': '355',
            'name': 'x'
        }, {
            'columnType': 'DOUBLE',
            'id': '3020',
            'name': 'y'
        }, {
            'columnType': 'INTEGER',
            'id': '891',
            'name': 'n'
        }],
        'rows': [{
            'rowId': 5,
            'values': ['foo', '1.23', '2.2', '101'],
            'versionNumber': 3
        }, {
            'rowId': 6,
            'values': ['bar', '1.34', '2.4', '101'],
            'versionNumber': 3
        }, {
            'rowId': 7,
            'values': ['foo', '1.23', '2.2', '101'],
            'versionNumber': 4
        }, {
            'rowId': 8,
            'values': ['qux', '1.23', '2.2', '102'],
            'versionNumber': 3
        }],
        'tableId':
        'syn2976298'
    }

    row_set = RowSet.from_json(row_set_json)

    assert row_set.etag == 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'
    assert row_set.tableId == 'syn2976298'
    assert len(row_set.headers) == 4
    assert len(row_set.rows) == 4

    schema = Schema(id="syn2976298",
                    name="Bogus Schema",
                    columns=[353, 355, 3020, 891],
                    parent="syn1000001")

    table = Table(schema, row_set)

    assert table.etag == 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'
    assert table.tableId == 'syn2976298'
    assert len(table.headers) == 4
    assert len(table.asRowSet().rows) == 4

    try:
        import pandas as pd

        df = table.asDataFrame()
        assert df.shape == (4, 4)
        assert all(df['name'] == ['foo', 'bar', 'foo', 'qux'])

    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_RowSetTable.\n\n'
        )
Example #24
def concatenate_tables_to_synapse_table(frames,
                                        synapse_project_id,
                                        table_name,
                                        username='',
                                        password=''):
    """
    Concatenate multiple dataframes and store as a Synapse table.

    Reuse the indices from the original DataFrames,
    increasing the number of columns.

    Parameters
    ----------
    frames : list of pandas DataFrames
        DataFrames to concatenate and store as a Synapse table
    synapse_project_id : string
        Synapse ID for project to which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    table_data : Pandas DataFrame
        output table
    synapse_project_id : string
        Synapse ID for project

    Examples
    --------
    >>> import pandas as pd
    >>> from mhealthx.io_data import concatenate_tables_to_synapse_table
    >>> df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
    >>>                     'B': ['B0', 'B1', 'B2', 'B3'],
    >>>                     'C': ['C0', 'C1', 'C2', 'C3'],
    >>>                     'D': ['D0', 'D1', 'D2', 'D3']},
    >>>                    index=[0, 1, 2, 3])
    >>> df2 = pd.DataFrame({'E': ['A4', 'A5', 'A6', 'A7'],
    >>>                     'F': ['B4', 'B5', 'B6', 'B7'],
    >>>                     'G': ['C4', 'C5', 'C6', 'C7'],
    >>>                     'H': ['D4', 'D5', 'D6', 'D7']},
    >>>                     index=[0, 1, 2, 3])
    >>> frames = [df1, df2]
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Test to join tables'
    >>> username = ''
    >>> password = ''
    >>> table_data, synapse_project_id = concatenate_tables_to_synapse_table(frames, synapse_project_id, table_name, username, password)
    """
    import pandas as pd
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Concatenate dataframes: reuse the indices from the original DataFrame,
    # increasing number of columns:
    table_data = pd.concat(frames, axis=1)  # join_axes was removed from pandas; concat aligns on the index

    # Create table schema:
    schema = Schema(name=table_name,
                    columns=as_table_columns(table_data),
                    parent=synapse_project_id)

    # Store as Synapse table:
    table = syn.store(Table(schema, table_data))

    return table_data, synapse_project_id
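
A sketch of the concat step in concatenate_tables_to_synapse_table: axis=1 aligns frames on their index and widens the table. The commented-out join_axes option was removed from pandas (in 1.0); reindexing beforehand is the usual replacement.

import pandas as pd

df1 = pd.DataFrame({'A': ['A0', 'A1']})
df2 = pd.DataFrame({'B': ['B0', 'B1']})
wide = pd.concat([df1, df2], axis=1)     # same index, more columns
assert list(wide.columns) == ['A', 'B']
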
Example #25
def copy_synapse_table(synapse_table_id,
                       synapse_project_id,
                       table_name='',
                       remove_columns=None,
                       username='',
                       password=''):
    """
    Copy Synapse table to another Synapse project.

    Parameters
    ----------
    synapse_table_id : string
        Synapse ID for table to copy
    synapse_project_id : string
        copy table to project with this Synapse ID
    table_name : string
        schema name of table
    remove_columns : list of strings
        column headers for columns to be removed
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    table_data : Pandas DataFrame
        Synapse table contents
    table_name : string
        schema name of table
    synapse_project_id : string
        Synapse ID for project within which table is to be written

    Examples
    --------
    >>> from mhealthx.io_data import copy_synapse_table
    >>> synapse_table_id = 'syn4590865'
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Copy of ' + synapse_table_id
    >>> remove_columns = ['audio_audio.m4a', 'audio_countdown.m4a']
    >>> username = ''
    >>> password = ''
    >>> table_data, table_name, synapse_project_id = copy_synapse_table(synapse_table_id, synapse_project_id, table_name, remove_columns, username, password)

    """
    import synapseclient
    from synapseclient import Schema
    from synapseclient.table import Table, as_table_columns

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Download Synapse table as a dataframe:
    results = syn.tableQuery("select * from {0}".format(synapse_table_id))
    table_data = results.asDataFrame()

    # Remove specified columns:
    if remove_columns:
        for remove_column in remove_columns:
            del table_data[remove_column]

    # Upload to Synapse table:
    table_data.index = range(table_data.shape[0])
    schema = Schema(name=table_name,
                    columns=as_table_columns(table_data),
                    parent=synapse_project_id,
                    includeRowIdAndRowVersion=False)
    table = syn.store(Table(schema, table_data))

    return table_data, table_name, synapse_project_id
Example #26
def test_csv_table():
    ## Maybe not truly a unit test, but here because it doesn't do
    ## network IO to synapse
    data = [["1", "1", "John Coltrane", 1926, 8.65, False],
            ["2", "1", "Miles Davis", 1926, 9.87, False],
            ["3", "1", "Bill Evans", 1929, 7.65, False],
            ["4", "1", "Paul Chambers", 1935, 5.14, False],
            ["5", "1", "Jimmy Cobb", 1929, 5.78, True],
            ["6", "1", "Scott LaFaro", 1936, 4.21, False],
            ["7", "1", "Sonny Rollins", 1930, 8.99, True],
            ["8", "1", "Kenny Burrel", 1931, 4.37, True]]

    filename = None

    cols = []
    cols.append(Column(id='1', name='Name', columnType='STRING'))
    cols.append(Column(id='2', name='Born', columnType='INTEGER'))
    cols.append(Column(id='3', name='Hipness', columnType='DOUBLE'))
    cols.append(Column(id='4', name='Living', columnType='BOOLEAN'))

    schema1 = Schema(id='syn1234',
                     name='Jazz Guys',
                     columns=cols,
                     parent="syn1000001")

    # TODO: use io.StringIO(data) rather than writing files
    try:
        ## create CSV file
        with tempfile.NamedTemporaryFile(delete=False) as temp:
            filename = temp.name

        with io.open(filename, mode='w', encoding="utf-8", newline='') as temp:
            writer = csv.writer(temp,
                                quoting=csv.QUOTE_NONNUMERIC,
                                lineterminator=str(os.linesep))
            headers = ['ROW_ID', 'ROW_VERSION'] + [col.name for col in cols]
            writer.writerow(headers)
            for row in data:
                writer.writerow(row)

        table = Table(schema1, filename)
        assert isinstance(table, CsvFileTable)

        ## need to set column headers to read a CSV file
        table.setColumnHeaders([
            SelectColumn(name="ROW_ID", columnType="STRING"),
            SelectColumn(name="ROW_VERSION", columnType="STRING")
        ] + [SelectColumn.from_column(col) for col in cols])

        ## test iterator
        for table_row, expected_row in zip(table, data):
            assert table_row == expected_row

        ## test asRowSet
        rowset = table.asRowSet()
        for rowset_row, expected_row in zip(rowset.rows, data):
            assert rowset_row['values'] == expected_row[2:]
            assert rowset_row['rowId'] == expected_row[0]
            assert rowset_row['versionNumber'] == expected_row[1]

        ## test asDataFrame
        try:
            import pandas as pd

            df = table.asDataFrame()
            assert all(df['Name'] == [row[2] for row in data])
            assert all(df['Born'] == [row[3] for row in data])
            assert all(df['Living'] == [row[5] for row in data])
            assert all(df.index == ['%s_%s' % tuple(row[0:2]) for row in data])
            assert df.shape == (8, 4)

        except ImportError as e1:
            sys.stderr.write(
                'Pandas is apparently not installed, skipping asDataFrame portion of test_csv_table.\n\n'
            )

    except Exception as ex1:
        if filename:
            try:
                if os.path.isdir(filename):
                    shutil.rmtree(filename)
                else:
                    os.remove(filename)
            except Exception as ex:
                print(ex)
        raise
Example #27
def feature_file_to_synapse_table(feature_file,
                                  raw_feature_file,
                                  source_file_id,
                                  provenance_activity_id,
                                  command,
                                  command_line,
                                  synapse_table_id,
                                  username='',
                                  password=''):
    """
    Upload files and file handle IDs to Synapse.

    Parameters
    ----------
    feature_file : string
        path to file to upload to Synapse
    raw_feature_file : string
        path to file to upload to Synapse
    source_file_id : string
        Synapse file handle ID to source file used to generate features
    provenance_activity_id : string
        Synapse provenance activity ID
    command : string
        name of command run to generate raw feature file
    command_line : string
        full command line run to generate raw feature file
    synapse_table_id : string
        Synapse table ID for table to store file handle IDs, etc.
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Examples
    --------
    >>> from mhealthx.xtra import feature_file_to_synapse_table
    >>> feature_file = '/Users/arno/Local/wav/test1.wav'
    >>> raw_feature_file = '/Users/arno/Local/wav/test1.wav'
    >>> source_file_id = ''
    >>> provenance_activity_id = ''
    >>> command = 'SMILExtract'
    >>> command_line = 'SMILExtract -C blah -I blah -O blah'
    >>> synapse_table_id = 'syn4899451'
    >>> username = ''
    >>> password = ''
    >>> feature_file_to_synapse_table(feature_file, raw_feature_file, source_file_id, provenance_activity_id, command, command_line, synapse_table_id, username, password)

    """
    import synapseclient
    from synapseclient.table import Table

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Store feature and raw feature files and get file handle IDs:
    file_handle = syn._chunkedUploadFile(feature_file)
    file_id = file_handle['id']
    raw_file_handle = syn._chunkedUploadFile(raw_feature_file)
    raw_file_id = raw_file_handle['id']

    # Add new row to Synapse table:
    new_rows = [[
        file_id, raw_file_id, source_file_id, provenance_activity_id, command,
        command_line
    ]]
    schema = syn.get(synapse_table_id)
    table = syn.store(Table(schema, new_rows))

    return synapse_table_id
Example #28
def test_csv_table():
    ## Maybe not truly a unit test, but here because it doesn't do
    ## network IO to synapse
    data = [["1", "1", "John Coltrane",  1926, 8.65, False],
            ["2", "1", "Miles Davis",    1926, 9.87, False],
            ["3", "1", "Bill Evans",     1929, 7.65, False],
            ["4", "1", "Paul Chambers",  1935, 5.14, False],
            ["5", "1", "Jimmy Cobb",     1929, 5.78, True],
            ["6", "1", "Scott LaFaro",   1936, 4.21, False],
            ["7", "1", "Sonny Rollins",  1930, 8.99, True],
            ["8", "1", "Kenny Burrel",   1931, 4.37, True]]

    filename = None

    cols = []
    cols.append(Column(id='1', name='Name', columnType='STRING'))
    cols.append(Column(id='2', name='Born', columnType='INTEGER'))
    cols.append(Column(id='3', name='Hipness', columnType='DOUBLE'))
    cols.append(Column(id='4', name='Living', columnType='BOOLEAN'))

    schema1 = Schema(id='syn1234', name='Jazz Guys', columns=cols, parent="syn1000001")

    # TODO: use io.StringIO(data) rather than writing files

    try:
        ## create CSV file
        with tempfile.NamedTemporaryFile(mode="w", delete=False, newline='') as temp:
            writer = csv.writer(temp, quoting=csv.QUOTE_NONNUMERIC, lineterminator=os.linesep)
            writer.writerow(['ROW_ID', 'ROW_VERSION'] + [col.name for col in cols])
            filename = temp.name
            for row in data:
                writer.writerow(row)

        table = Table(schema1, filename)
        assert isinstance(table, CsvFileTable)

        ## need to set column headers to read a CSV file
        table.setColumnHeaders(
            [SelectColumn(name="ROW_ID", columnType="STRING"),
             SelectColumn(name="ROW_VERSION", columnType="STRING")] +
            [SelectColumn.from_column(col) for col in cols])

        ## test iterator
        for table_row, expected_row in zip(table, data):
            assert table_row == expected_row

        ## test asRowSet
        rowset = table.asRowSet()
        for rowset_row, expected_row in zip(rowset.rows, data):
            assert rowset_row['values'] == expected_row[2:]
            assert rowset_row['rowId'] == expected_row[0]
            assert rowset_row['versionNumber'] == expected_row[1]

        ## test asDataFrame
        try:
            import pandas as pd

            df = table.asDataFrame()
            assert all(df['Name'] == [row[2] for row in data])
            assert all(df['Born'] == [row[3] for row in data])
            assert all(df['Living'] == [row[5] for row in data])
            assert all(df.index == ['%s_%s' % tuple(row[0:2]) for row in data])
            assert df.shape == (8, 4)

        except ImportError as e1:
            sys.stderr.write('Pandas is apparently not installed, skipping asDataFrame portion of test_csv_table.\n\n')

    except Exception as ex1:
        if filename:
            try:
                if os.path.isdir(filename):
                    shutil.rmtree(filename)
                else:
                    os.remove(filename)
            except Exception as ex:
                print(ex)
        raise