Example #1
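These snippets are integration tests for the synapseclient Python package. They rely on module-level fixtures and helpers that are not shown (syn, project, schedule_for_cleanup, run, and, in the older examples, nose.tools asserts and a QUERY_TIMEOUT_SEC constant). A minimal sketch of that assumed setup, with hypothetical helper implementations, might look like this:

import subprocess
import uuid

import synapseclient

syn = synapseclient.login()  # authenticated Synapse client shared by the tests
project = syn.store(synapseclient.Project(name=str(uuid.uuid4())))  # scratch project

_cleanup_items = []

def schedule_for_cleanup(item):
    # Hypothetical helper: remember an entity id or local path so the test
    # harness can delete it once the test finishes.
    _cleanup_items.append(item)

def run(*command):
    # Hypothetical helper: invoke the `synapse` command line tool and return
    # its captured stdout as a string.
    return subprocess.run(command, capture_output=True, text=True,
                          check=True).stdout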
def test_table_query():
    """Test command line ability to do table query.

    """

    cols = []
    cols.append(
        synapseclient.Column(name='name',
                             columnType='STRING',
                             maximumSize=1000))
    cols.append(
        synapseclient.Column(name='foo',
                             columnType='STRING',
                             enumValues=['foo', 'bar', 'bat']))
    cols.append(synapseclient.Column(name='x', columnType='DOUBLE'))
    cols.append(synapseclient.Column(name='age', columnType='INTEGER'))
    cols.append(synapseclient.Column(name='cartoon', columnType='BOOLEAN'))

    project_entity = project

    schema1 = syn.store(
        synapseclient.Schema(name=str(uuid.uuid4()),
                             columns=cols,
                             parent=project_entity))
    schedule_for_cleanup(schema1.id)

    data1 = [['Chris', 'bar', 11.23, 45, False],
             ['Jen', 'bat', 14.56, 40, False],
             ['Jane', 'bat', 17.89, 6, False],
             ['Henry', 'bar', 10.12, 1, False]]

    row_reference_set1 = syn.store(
        synapseclient.RowSet(schema=schema1,
                             rows=[synapseclient.Row(r) for r in data1]))

    # Test query
    output = run('synapse', '--skip-checks', 'query',
                 'select * from %s' % schema1.id)

    output_rows = output.rstrip("\n").split("\n")

    # Check the number of output rows: 1 header row plus the 4 data rows
    assert len(output_rows) == 5, "got %s rows" % (len(output_rows), )

    # Check that headers are correct.
    # Should be column names in schema plus the ROW_ID and ROW_VERSION
    my_headers_set = output_rows[0].split("\t")
    expected_headers_set = (["ROW_ID", "ROW_VERSION"] +
                            [col.name for col in cols])
    assert my_headers_set == expected_headers_set, "%r != %r" % (
        my_headers_set, expected_headers_set)
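For comparison, the same data could be queried through the Python API rather than the command line (using the syn and schema1 objects from the test above); a minimal sketch, not part of the original test, and asDataFrame() requires pandas:

results = syn.tableQuery("select * from %s" % schema1.id)
df = results.asDataFrame()
assert len(df) == 4  # the four data rows stored above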
Example #2
def test_migrate_project(request, syn, schedule_for_cleanup,
                         storage_location_id):
    test_name = request.node.name
    project_name = "{}-{}".format(test_name, uuid.uuid4())
    project = synapseclient.Project(name=project_name)
    project_entity = syn.store(project)

    file_0_path = _create_temp_file()
    schedule_for_cleanup(file_0_path)
    file_0_name = "{}-{}".format(test_name, 1)
    file_0 = synapseclient.File(name=file_0_name,
                                path=file_0_path,
                                parent=project_entity)
    file_0_entity = syn.store(file_0)
    default_storage_location_id = file_0_entity._file_handle[
        'storageLocationId']

    folder_1_name = "{}-{}-{}".format(test_name, 1, uuid.uuid4())
    folder_1 = synapseclient.Folder(parent=project_entity, name=folder_1_name)
    folder_1_entity = syn.store(folder_1)

    file_1_path = _create_temp_file()
    schedule_for_cleanup(file_1_path)
    file_1_name = "{}-{}".format(test_name, 1)
    file_1 = synapseclient.File(name=file_1_name,
                                path=file_1_path,
                                parent=folder_1_entity)
    file_1_entity = syn.store(file_1)

    file_2_path = _create_temp_file()
    schedule_for_cleanup(file_2_path)
    file_2_name = "{}-{}".format(test_name, 2)
    file_2 = synapseclient.File(name=file_2_name,
                                path=file_2_path,
                                parent=folder_1_entity)
    file_2_entity = syn.store(file_2)

    # file 3 shares the same file handle id as file 1
    file_3_path = file_1_path
    file_3_name = "{}-{}".format(test_name, 3)
    file_3 = synapseclient.File(name=file_3_name,
                                path=file_3_path,
                                parent=folder_1_entity)
    file_3.dataFileHandleId = file_1_entity.dataFileHandleId
    file_3_entity = syn.store(file_3)

    table_1_cols = [
        synapseclient.Column(name='file_col_1', columnType='FILEHANDLEID'),
        synapseclient.Column(name='num', columnType='INTEGER'),
        synapseclient.Column(name='file_col_2', columnType='FILEHANDLEID'),
    ]
    table_1 = syn.store(
        synapseclient.Schema(name=test_name,
                             columns=table_1_cols,
                             parent=folder_1_entity))
    table_1_file_col_1_1 = _create_temp_file()
    table_1_file_handle_1 = syn.uploadFileHandle(table_1_file_col_1_1, table_1)
    table_1_file_col_1_2 = _create_temp_file()
    table_1_file_handle_2 = syn.uploadFileHandle(table_1_file_col_1_2, table_1)
    table_1_file_col_2_1 = _create_temp_file()
    table_1_file_handle_3 = syn.uploadFileHandle(table_1_file_col_2_1, table_1)
    table_1_file_col_2_2 = _create_temp_file()
    table_1_file_handle_4 = syn.uploadFileHandle(table_1_file_col_2_2, table_1)

    data = [
        [table_1_file_handle_1['id'], 1, table_1_file_handle_2['id']],
        [table_1_file_handle_3['id'], 2, table_1_file_handle_4['id']],
    ]

    table_1_entity = syn.store(
        synapseclient.RowSet(schema=table_1,
                             rows=[synapseclient.Row(r) for r in data]))

    db_path = tempfile.NamedTemporaryFile(delete=False).name
    schedule_for_cleanup(db_path)

    index_result = synapseutils.index_files_for_migration(
        syn,
        project_entity,
        storage_location_id,
        db_path,
        file_version_strategy='new',
        include_table_files=True,
    )

    counts_by_status = index_result.get_counts_by_status()
    assert counts_by_status['INDEXED'] == 8
    assert counts_by_status['ERRORED'] == 0

    migration_result = synapseutils.migrate_indexed_files(syn,
                                                          db_path,
                                                          force=True)

    file_0_entity_updated = syn.get(utils.id_of(file_0_entity),
                                    downloadFile=False)
    file_1_entity_updated = syn.get(utils.id_of(file_1_entity),
                                    downloadFile=False)
    file_2_entity_updated = syn.get(utils.id_of(file_2_entity),
                                    downloadFile=False)
    file_3_entity_updated = syn.get(utils.id_of(file_3_entity),
                                    downloadFile=False)
    file_handles = [
        f['_file_handle'] for f in (
            file_0_entity_updated,
            file_1_entity_updated,
            file_2_entity_updated,
            file_3_entity_updated,
        )
    ]

    table_1_id = utils.id_of(table_1_entity)
    results = syn.tableQuery(
        "select file_col_1, file_col_2 from {}".format(table_1_id))
    table_file_handles = []
    for row in results:
        for file_handle_id in row[2:]:
            file_handle = syn._getFileHandleDownload(
                file_handle_id, table_1_id,
                objectType='TableEntity')['fileHandle']
            table_file_handles.append(file_handle)
    file_handles.extend(table_file_handles)

    _assert_storage_location(file_handles, storage_location_id)
    assert storage_location_id != default_storage_location_id

    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        query_result = cursor.execute(
            "select status, count(*) from migrations where type in (?, ?) group by status",
            (_MigrationType.FILE.value,
             _MigrationType.TABLE_ATTACHED_FILE.value)).fetchall()

        counts = {r[0]: r[1] for r in query_result}

        # should only be one status and they should all be migrated:
        # 4 migrated file entities + 4 migrated table attached files
        assert len(counts) == 1
        assert counts[_MigrationStatus.MIGRATED.value] == 8

    csv_file = tempfile.NamedTemporaryFile(delete=False)
    schedule_for_cleanup(csv_file.name)
    migration_result.as_csv(csv_file.name)
    with open(csv_file.name, 'r') as csv_file_in:
        csv_contents = csv_file_in.read()

    table_1_id = table_1_entity['tableId']

    # assert the content of the csv. we don't assert any particular order of the lines,
    # just that each expected line is present
    csv_lines = csv_contents.split('\n')
    assert "id,type,version,row_id,col_name,from_storage_location_id,from_file_handle_id,to_file_handle_id,status,exception" in csv_lines  # noqa
    assert f"{file_0_entity.id},file,,,,{default_storage_location_id},{file_0_entity.dataFileHandleId},{file_0_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{file_1_entity.id},file,,,,{default_storage_location_id},{file_1_entity.dataFileHandleId},{file_1_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{file_2_entity.id},file,,,,{default_storage_location_id},{file_2_entity.dataFileHandleId},{file_2_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{file_3_entity.id},file,,,,{default_storage_location_id},{file_3_entity.dataFileHandleId},{file_3_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,1,file_col_1,{default_storage_location_id},{table_1_file_handle_1['id']},{table_file_handles[0]['id']},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,1,file_col_2,{default_storage_location_id},{table_1_file_handle_2['id']},{table_file_handles[1]['id']},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,2,file_col_1,{default_storage_location_id},{table_1_file_handle_3['id']},{table_file_handles[2]['id']},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,2,file_col_2,{default_storage_location_id},{table_1_file_handle_4['id']},{table_file_handles[3]['id']},MIGRATED," in csv_lines  # noqa
    assert "" in csv_lines  # expect trailing newline in a csv
Example #3
def test_command_get_recursive_and_query():
    """Tests the 'synapse get -r' and 'synapse get -q' functions"""

    project_entity = project

    # Create Folders in Project
    folder_entity = syn.store(
        synapseclient.Folder(name=str(uuid.uuid4()), parent=project_entity))

    folder_entity2 = syn.store(
        synapseclient.Folder(name=str(uuid.uuid4()), parent=folder_entity))

    # Create and upload two files in sub-Folder
    uploaded_paths = []
    file_entities = []

    for i in range(2):
        f = utils.make_bogus_data_file()
        uploaded_paths.append(f)
        schedule_for_cleanup(f)
        file_entity = synapseclient.File(f, parent=folder_entity2)
        file_entity = syn.store(file_entity)
        file_entities.append(file_entity)
        schedule_for_cleanup(f)

    # Add a file in the Folder as well
    f = utils.make_bogus_data_file()
    uploaded_paths.append(f)
    schedule_for_cleanup(f)
    file_entity = synapseclient.File(f, parent=folder_entity)
    file_entity = syn.store(file_entity)
    file_entities.append(file_entity)

    # get -r uses syncFromSynapse(), which uses getChildren(); getChildren() is not
    # immediately consistent, but it is faster than chunked queries.
    # (A direct syncFromSynapse call is sketched after this example.)
    time.sleep(2)
    # Test recursive get
    run('synapse', '--skip-checks', 'get', '-r', folder_entity.id)
    # Verify that we downloaded files:
    new_paths = [
        os.path.join('.', folder_entity2.name, os.path.basename(f))
        for f in uploaded_paths[:-1]
    ]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        assert_true(os.path.exists(downloaded))
        assert_true(filecmp.cmp(downloaded, uploaded))
        schedule_for_cleanup(downloaded)

    # Test query get using a Table with an entity column
    # This should be replaced when Table File Views are implemented in the client
    cols = [synapseclient.Column(name='id', columnType='ENTITYID')]

    schema1 = syn.store(
        synapseclient.Schema(name='Foo Table',
                             columns=cols,
                             parent=project_entity))
    schedule_for_cleanup(schema1.id)

    data1 = [[x.id] for x in file_entities]

    syn.store(
        synapseclient.RowSet(schema=schema1,
                             rows=[synapseclient.Row(r) for r in data1]))

    time.sleep(3)  # get -q queries are eventually consistent
    # Test Table/View query get
    output = run('synapse', '--skip-checks', 'get', '-q',
                 "select id from %s" % schema1.id)
    # Verify that we downloaded files:
    new_paths = [
        os.path.join('.', os.path.basename(f)) for f in uploaded_paths[:-1]
    ]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        assert_true(os.path.exists(downloaded))
        assert_true(filecmp.cmp(downloaded, uploaded))
        schedule_for_cleanup(downloaded)

    schedule_for_cleanup(new_paths[0])
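The recursive get exercised above can also be done directly from Python with the syncFromSynapse call mentioned in the comment; a minimal sketch (not part of the test, using the syn and folder_entity objects from it):

import synapseutils

# Download the folder tree rooted at folder_entity into ./local_copy and list
# the local paths of the downloaded files.
files = synapseutils.syncFromSynapse(syn, folder_entity.id, path='./local_copy')
for f in files:
    print(f.path)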
Example #4
def test_command_get_recursive_and_query():
    """Tests the 'synapse get -r' and 'synapse get -q' functions"""

    project_entity = project

    # Create Folders in Project
    folder_entity = syn.store(synapseclient.Folder(name=str(uuid.uuid4()),
                                                   parent=project_entity))

    folder_entity2 = syn.store(synapseclient.Folder(name=str(uuid.uuid4()),
                                                    parent=folder_entity))

    # Create and upload two files in sub-Folder
    uploaded_paths = []
    file_entities = []

    for i in range(2):
        f = utils.make_bogus_data_file()
        uploaded_paths.append(f)
        schedule_for_cleanup(f)
        file_entity = synapseclient.File(f, parent=folder_entity2)
        file_entity = syn.store(file_entity)
        file_entities.append(file_entity)
        schedule_for_cleanup(f)


    # Add a file in the Folder as well
    f = utils.make_bogus_data_file()
    uploaded_paths.append(f)
    schedule_for_cleanup(f)
    file_entity = synapseclient.File(f, parent=folder_entity)
    file_entity = syn.store(file_entity)
    file_entities.append(file_entity)

    # The function under test relies on queries, which are eventually consistent and may
    # not see entities immediately after they are created.
    start_time = time.time()
    while syn.query("select id from entity where id=='%s'" % file_entity.id).get('totalNumberOfResults') <= 0:
        assert_less(time.time() - start_time, QUERY_TIMEOUT_SEC)
        time.sleep(2)

    ### Test recursive get
    output = run('synapse', '--skip-checks',
                 'get', '-r',
                 folder_entity.id)
    #Verify that we downloaded files:
    new_paths = [os.path.join('.', folder_entity2.name, os.path.basename(f)) for f in uploaded_paths[:-1]]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        print(uploaded, downloaded)
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        schedule_for_cleanup(downloaded)


    ### Test query get
    ### Note: We're not querying on annotations because tests can fail if there
    ###       are lots of jobs queued as happens when staging is syncing
    output = run('synapse', '--skip-checks',
                 'get', '-q', "select id from file where parentId=='%s'" %
                 folder_entity2.id)
    #Verify that we downloaded files from folder_entity2
    new_paths = [os.path.join('.', os.path.basename(f)) for f in uploaded_paths[:-1]]
    for downloaded, uploaded in zip(new_paths, uploaded_paths[:-1]):
        print(uploaded, downloaded)
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        schedule_for_cleanup(downloaded)

    schedule_for_cleanup(new_paths[0])

    ### Test query get using a Table with an entity column
    ### This should be replaced when Table File Views are implemented in the client
    ### (a file-view-based sketch follows at the end of this example)
    cols = []
    cols.append(synapseclient.Column(name='id', columnType='ENTITYID'))

    schema1 = syn.store(synapseclient.Schema(name='Foo Table', columns=cols, parent=project_entity))
    schedule_for_cleanup(schema1.id)

    data1 = [[x.id] for x in file_entities]

    print(data1)

    row_reference_set1 = syn.store(synapseclient.RowSet(columns=cols, schema=schema1,
                                   rows=[synapseclient.Row(r) for r in data1]))

    ### Test Table/View query get
    output = run('synapse', '--skip-checks', 'get', '-q',
                 "select id from %s" % schema1.id)
    #Verify that we downloaded files:
    new_paths = [os.path.join('.', os.path.basename(f)) for f in uploaded_paths[:-1]]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        print(uploaded, downloaded)
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        schedule_for_cleanup(downloaded)

    schedule_for_cleanup(new_paths[0])
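Both versions of this test note that the entity-column Table is a stand-in until Table File Views are supported in the client. With a client version that exposes EntityViewSchema and EntityViewType, a file view over the sub-folder might look roughly like the sketch below (an assumption-laden sketch, not part of either test):

from synapseclient.table import EntityViewType

view = syn.store(
    synapseclient.EntityViewSchema(name='Foo File View',
                                   parent=project_entity,
                                   scopes=[folder_entity2.id],
                                   includeEntityTypes=[EntityViewType.FILE]))
schedule_for_cleanup(view.id)

# Views are populated asynchronously; once built, the same CLI pattern applies:
# run('synapse', '--skip-checks', 'get', '-q', "select id from %s" % view.id)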