Exemplo n.º 1
0
    def unit_tests_3(self):
        '''
        Run cluster with --SkipMash and check that its output tables differ
        from the reference ones.

        The Cdb, Ndb and Mdb tables written to self.working_wd_loc are each
        compared with the table of the same name in self.s_wd_loc; the test
        fails as soon as one pair compares as equal.
        '''
        cli = ['cluster', self.working_wd_loc, '-g'] + self.genomes
        cli = cli + ['--SkipMash']
        args = argumentParser.parse_args(cli)
        Controller().parseArguments(args)

        solution_wd = WorkDirectory(self.s_wd_loc)
        result_wd = WorkDirectory(self.working_wd_loc)

        # None of these tables may match the reference run
        for table in ('Cdb', 'Ndb', 'Mdb'):
            expected = solution_wd.get_db(table)
            produced = result_wd.get_db(table)
            identical = compare_dfs(expected, produced)
            assert not identical, \
                "{0} is the same! (and shouldn't be)".format(table)
Exemplo n.º 2
0
def test_dereplicate_5(self):
    '''
    Check that greedy secondary clustering agrees with the normal algorithm.

    compare is run twice on the first ten genomes of self.large_genome_set
    (fastANI, --SkipMash, -sa 0.95): once with --greedy_secondary_clustering
    into self.wd_loc2 and once without it into self.wd_loc. The two Cdb
    tables must have the same number of rows, the same cluster names, the
    same set of cluster sizes and the same genomes, and the only column the
    greedy table has that the normal one lacks is 'greedy_representative'.
    '''
    genomes = self.large_genome_set[:10]
    normal_loc = self.wd_loc
    greedy_loc = self.wd_loc2

    # Without the large genome set there is nothing to run
    if len(genomes) == 0:
        print("*** THIS TEST ONLY WORKS ON MO'S DEVELOPMENT MACHINE ***")
        return

    shared_opts = ['--S_algorithm', 'fastANI', '--SkipMash']

    # Greedy run
    greedy_cmd = ['compare', greedy_loc] + shared_opts
    greedy_cmd += ['--greedy_secondary_clustering', '-sa', '0.95', '-g']
    args = argumentParser.parse_args(greedy_cmd + genomes)
    Controller().parseArguments(args)
    greedy_cdb = WorkDirectory(greedy_loc).get_db('Cdb')

    # Normal run
    normal_cmd = ['compare', normal_loc] + shared_opts
    normal_cmd += ['-sa', '0.95', '-g']
    args = argumentParser.parse_args(normal_cmd + genomes)
    Controller().parseArguments(args)
    normal_cdb = WorkDirectory(normal_loc).get_db('Cdb')

    # Both runs must describe the same clustering
    assert len(greedy_cdb) == len(normal_cdb)
    for col in ('primary_cluster', 'secondary_cluster'):
        greedy_counts = greedy_cdb[col].value_counts().to_dict()
        normal_counts = normal_cdb[col].value_counts().to_dict()
        assert set(greedy_counts.values()) == set(normal_counts.values()), col
        assert set(greedy_counts.keys()) == set(normal_counts.keys()), col

    greedy_genomes = set(greedy_cdb['genome'].tolist())
    normal_genomes = set(normal_cdb['genome'].tolist())
    assert greedy_genomes == normal_genomes

    extra_columns = set(greedy_cdb.columns) - set(normal_cdb.columns)
    assert extra_columns == {'greedy_representative'}
Exemplo n.º 3
0
def test_dereplicate_4(self):
    '''
    Check that multiround primary clustering agrees with the normal
    primary clustering on a large genome set.

    compare is run twice on self.large_genome_set (fastANI, --SkipSecondary,
    --primary_chunksize 50): once without --multiround_primary_clustering
    into self.wd_loc2 and once with it into self.wd_loc. The two Cdb tables
    must have the same number of rows, the same cluster names, the same set
    of cluster sizes and the same genomes, and the columns present only in
    the multiround table must be exactly 'length', 'subcluster' and
    'primary_representitive'.
    '''
    genomes = self.large_genome_set
    chunked_loc = self.wd_loc
    plain_loc = self.wd_loc2

    # Without the large genome set there is nothing to run
    if len(genomes) == 0:
        print("*** THIS TEST ONLY WORKS ON MO'S DEVELOPMENT MACHINE ***")
        return

    shared_opts = ['--S_algorithm', 'fastANI', '--SkipSecondary']
    chunk_opts = ['--primary_chunksize', '50', '-g']

    # Reference run without multiround clustering
    plain_cmd = ['compare', plain_loc] + shared_opts + chunk_opts
    args = argumentParser.parse_args(plain_cmd + genomes)
    Controller().parseArguments(args)
    plain_cdb = WorkDirectory(plain_loc).get_db('Cdb')

    # Run with multiround (chunked) clustering
    chunked_cmd = ['compare', chunked_loc] + shared_opts
    chunked_cmd += ['--multiround_primary_clustering'] + chunk_opts
    args = argumentParser.parse_args(chunked_cmd + genomes)
    Controller().parseArguments(args)
    chunked_cdb = WorkDirectory(chunked_loc).get_db('Cdb')

    # Both runs must describe the same clustering
    assert len(plain_cdb) == len(chunked_cdb)
    for col in ('primary_cluster', 'secondary_cluster'):
        plain_counts = plain_cdb[col].value_counts().to_dict()
        chunked_counts = chunked_cdb[col].value_counts().to_dict()
        assert set(plain_counts.values()) == set(chunked_counts.values())
        assert set(plain_counts.keys()) == set(chunked_counts.keys())

    plain_genomes = set(plain_cdb['genome'].tolist())
    chunked_genomes = set(chunked_cdb['genome'].tolist())
    assert plain_genomes == chunked_genomes

    extra_columns = set(chunked_cdb.columns) - set(plain_cdb.columns)
    assert extra_columns == {'length', 'subcluster', 'primary_representitive'}
Exemplo n.º 4
0
    def functional_test_1(self):
        '''
        Cluster self.genomes with default settings and verify Cdb.

        The Cdb table written to self.wd_loc must compare equal to the Cdb
        table stored in the solutions work directory (self.s_wd_loc).
        '''
        target_loc = self.wd_loc
        solution_loc = self.s_wd_loc

        cli = ['cluster', target_loc, '-g'] + self.genomes
        args = argumentParser.parse_args(cli)
        Controller().parseArguments(args)

        solution_wd = WorkDirectory(solution_loc)
        result_wd = WorkDirectory(target_loc)

        # The produced Cdb has to match the stored solution
        expected = solution_wd.get_db('Cdb')
        produced = result_wd.get_db('Cdb')
        matches = compare_dfs(expected, produced)
        assert matches, "{0} is not the same!".format('Cdb')
Exemplo n.º 5
0
def test_choose_2(self):
    '''
    Run the choose step with --ignoreGenomeQuality.

    The Chdb, Sdb and Wdb tables are deleted from the working directory,
    then drep.d_choose.d_choose_wrapper is called with the options parsed
    from a `dereplicate ... --ignoreGenomeQuality` command line.

    Afterwards:
      * Sdb, Wdb and genomeInformation must NOT compare equal to the
        tables in the solutions work directory (self.s_wd_loc)
      * every value in the 'score' column of Sdb must lie strictly
        between 0 and 5
      * genomeInformation must contain a 'centrality' column
    '''
    wd_loc = self.working_wd_loc

    # Remove the tables that a previous run left behind
    for table in ('Chdb', 'Sdb', 'Wdb'):
        os.remove(os.path.join(wd_loc, 'data_tables', table + '.csv'))

    # Run choose with --ignoreGenomeQuality
    args = argumentParser.parse_args(
        ['dereplicate', wd_loc, '--ignoreGenomeQuality'])
    kwargs = vars(args)
    # 'genomes' is dropped from the parsed options before they are
    # forwarded to the wrapper
    del kwargs['genomes']
    drep.d_choose.d_choose_wrapper(wd_loc, **kwargs)

    Swd = WorkDirectory(self.s_wd_loc)
    wd = WorkDirectory(wd_loc)
    for db in ['Sdb', 'Wdb', 'genomeInformation']:
        db1 = Swd.get_db(db)
        db2 = wd.get_db(db)
        assert not test_utils.compare_dfs(
            db1, db2), "{0} is the same!".format(db)

    sdb = wd.get_db('Sdb')
    for s in sdb['score'].tolist():
        assert 0 < s < 5

    gdb = wd.get_db('genomeInformation')
    assert 'centrality' in gdb.columns
Exemplo n.º 6
0
    def skipsecondary_test(self):
        '''
        Run cluster with --SkipSecondary and check that Ndb comes back empty.

        Mdb is loaded from both the solutions and the working directory,
        but the comparison between them is currently disabled.
        '''
        target_loc = self.wd_loc
        solution_loc = self.s_wd_loc

        cli = ['cluster', target_loc, '-g'] + self.genomes
        cli = cli + ['--SkipSecondary']
        args = argumentParser.parse_args(cli)
        Controller().parseArguments(args)

        solution_wd = WorkDirectory(solution_loc)
        result_wd = WorkDirectory(target_loc)

        # Mdb is still loaded from both directories; the equality check
        # on it is switched off
        expected_mdb = solution_wd.get_db('Mdb')
        produced_mdb = result_wd.get_db('Mdb')

        # With secondary clustering skipped, Ndb must hold no rows
        produced_ndb = result_wd.get_db('Ndb')
        assert produced_ndb.empty, 'Ndb is not empty'
Exemplo n.º 7
0
def test_unit_3(self):
    '''
    Run the cluster wrapper with --SkipMash and check that its output
    differs from the reference tables.

    The options are parsed from a `dereplicate` command line and forwarded
    to drep.d_cluster.controller.d_cluster_wrapper. The resulting Cdb and
    Ndb tables in self.working_wd_loc must NOT compare equal to those in
    self.s_wd_loc.
    '''
    target_loc = self.working_wd_loc

    cli = ['dereplicate', target_loc, '-g'] + self.genomes + ['--SkipMash']
    options = vars(argumentParser.parse_args(cli))
    drep.d_cluster.controller.d_cluster_wrapper(target_loc, **options)

    solution_wd = WorkDirectory(self.s_wd_loc)
    result_wd = WorkDirectory(self.working_wd_loc)

    # Neither table may match the reference run (Mdb is not checked)
    for table in ('Cdb', 'Ndb'):
        expected = solution_wd.get_db(table)
        produced = result_wd.get_db(table)
        identical = test_utils.compare_dfs(expected, produced)
        assert not identical, \
            "{0} is the same! (and shouldn't be)".format(table)
Exemplo n.º 8
0
def test_taxonomy_4(self):
    '''
    Try actually running centrifuge without prodigal done

    The test is skipped (with a printed notice) when
    drep.d_bonus.find_program reports that centrifuge does not work.
    Otherwise the 'centrifuge' and 'prodigal' folders under
    <wd_loc>/data are deleted, `bonus --run_tax --tax_method percent` is
    run, and the resulting Bdb (without its 'location' column) and Tdb
    tables must compare equal to the 'BdbP' and 'TdbP' tables of the
    solutions work directory.

    NOTE: the --cent_index path is hard-coded to a location on the
    original author's machine.
    '''
    _loc, works = drep.d_bonus.find_program('centrifuge')
    if not works:
        print('Centrifuge not installed- skipping tests')
        return

    genomes = self.genomes
    wd_loc = self.wd_loc
    swd_loc = self.s_wd_loc

    # Remove previous data run
    for leftover in ('centrifuge', 'prodigal'):
        shutil.rmtree(os.path.join(wd_loc, 'data', leftover))

    # Call the command
    args = argumentParser.parse_args(
        ['bonus', wd_loc, '-g'] + genomes + [
            '--run_tax',
            '--cent_index', '/home/mattolm/download/centrifuge/indices/b+h+v',
            '--tax_method', 'percent',
        ])
    controller = Controller()
    controller.parseArguments(args)

    # Verify
    Swd = WorkDirectory(swd_loc)
    wd = WorkDirectory(wd_loc)

    # 'location' is dropped from both tables before they are compared
    tdbS = Swd.get_db('BdbP')
    tdb = wd.get_db('Bdb')
    del tdbS['location']
    del tdb['location']
    assert test_utils.compare_dfs(
        tdb, tdbS), "{0} is not the same!".format('Bdb')

    tdbS = Swd.get_db('TdbP')
    tdb = wd.get_db('Tdb')
    assert test_utils.compare_dfs(
        tdb, tdbS), "{0} is not the same!".format('Tdb')
Exemplo n.º 9
0
    def functional_test_3(self):
        '''
        Cluster self.genomes with the ANImf secondary algorithm and verify
        Cdb.

        The 'comparison_algorithm' column is removed from both the solution
        and the produced Cdb table before they are compared; the remaining
        columns must compare equal.
        '''
        target_loc = self.wd_loc
        solution_loc = self.s_wd_loc

        cli = ['cluster', target_loc, '--S_algorithm', 'ANImf', '-g']
        args = argumentParser.parse_args(cli + self.genomes)
        Controller().parseArguments(args)

        solution_wd = WorkDirectory(solution_loc)
        result_wd = WorkDirectory(target_loc)

        # Drop 'comparison_algorithm' from both tables, then compare
        expected = solution_wd.get_db('Cdb')
        del expected['comparison_algorithm']
        produced = result_wd.get_db('Cdb')
        del produced['comparison_algorithm']

        matches = compare_dfs(expected, produced)
        assert matches, "{0} is not the same!".format('Cdb')
Exemplo n.º 10
0
    def unit_tests_1(self):
        '''
        Test a normal run of cluster

        Runs `cluster` on self.genomes into self.working_wd_loc and requires
        the resulting Cdb and Ndb tables to compare equal to the ones in the
        solutions work directory (self.s_wd_loc).

        When a table differs, a side-by-side view of the ANI values is
        printed (if the table has 'reference', 'querry' and 'ani' columns)
        before the assertion fails. The diagnostics work on copies, so the
        assertion always reflects the comparison of the full tables.
        '''
        # normal complete run
        args = argumentParser.parse_args(
            ['cluster', self.working_wd_loc, '-g'] + self.genomes)
        controller = Controller()
        controller.parseArguments(args)

        # Verify
        Swd = WorkDirectory(self.s_wd_loc)
        wd = WorkDirectory(self.working_wd_loc)

        # Confirm the following are correct:
        for db in ['Cdb', 'Ndb']:
            db1 = Swd.get_db(db)
            db2 = wd.get_db(db)

            same = compare_dfs(db1, db2)
            if not same:
                print("{0} is not the same!".format(db))

                # Only tables carrying ANI values can be shown side by side
                keys = ['reference', 'querry']
                cols = keys + ['ani']
                if set(cols).issubset(db1.columns) \
                        and set(cols).issubset(db2.columns):
                    left = db1[cols].rename(columns={'ani': 'ani1'})
                    right = db2[cols].rename(columns={'ani': 'ani2'})
                    merged = pd.merge(left, right, on=keys)
                    print(merged.sort_values(keys))

            assert same, "{0} is not the same!".format(db)
Exemplo n.º 11
0
def test_unit_5(self):
    '''
    Run the cluster wrapper with --S_algorithm gANI and verify its tables.

    Returns without testing anything when drep.d_bonus.find_program reports
    that ANIcalculator does not work. Otherwise the options parsed from a
    `dereplicate` command line are forwarded to
    drep.d_cluster.controller.d_cluster_wrapper, and the resulting Cdb and
    Mdb tables must compare equal to those in self.s_wd_loc.
    '''
    _location, works = drep.d_bonus.find_program('ANIcalculator')
    if not works:
        return

    target_loc = self.working_wd_loc
    cli = ['dereplicate', target_loc, '-g'] + self.genomes
    cli = cli + ['--S_algorithm', 'gANI']
    options = vars(argumentParser.parse_args(cli))
    drep.d_cluster.controller.d_cluster_wrapper(target_loc, **options)

    solution_wd = WorkDirectory(self.s_wd_loc)
    result_wd = WorkDirectory(self.working_wd_loc)

    # Both tables have to match the stored solution
    for table in ('Cdb', 'Mdb'):
        expected = solution_wd.get_db(table)
        produced = result_wd.get_db(table)
        matches = test_utils.compare_dfs(expected, produced)
        assert matches, "{0} is not the same!".format(table)
Exemplo n.º 12
0
def test_taxonomy_1(self):
    '''
    Check the taxonomy call for max method

    Runs `bonus --run_tax --tax_method max` on self.genomes into
    self.wd_loc. The resulting Bdb table (without its 'location' column)
    must compare equal to the one in the solutions work directory. A
    difference in Tdb is only reported, not treated as a failure: both
    tables are printed so they can be inspected.

    NOTE: the --cent_index path is hard-coded to a location on the
    original author's machine.
    '''
    genomes = self.genomes
    wd_loc = self.wd_loc
    swd_loc = self.s_wd_loc

    # Call the command
    args = argumentParser.parse_args(
        ['bonus', wd_loc, '-g'] + genomes + [
            '--run_tax',
            '--cent_index', '/home/mattolm/download/centrifuge/indices/b+h+v',
            '--tax_method', 'max',
        ])
    controller = Controller()
    controller.parseArguments(args)

    # Verify
    Swd = WorkDirectory(swd_loc)
    wd = WorkDirectory(wd_loc)

    # 'location' is dropped from both tables before they are compared
    tdbS = Swd.get_db('Bdb')
    tdb = wd.get_db('Bdb')
    del tdbS['location']
    del tdb['location']
    assert test_utils.compare_dfs(tdb,
                                  tdbS), "{0} is not the same!".format('Bdb')

    tdbS = Swd.get_db('Tdb')
    tdb = wd.get_db('Tdb')

    # A Tdb mismatch is tolerated; print both tables instead of failing
    if not test_utils.compare_dfs(tdb, tdbS):
        print("{0} is not the same! May be due to centrifuge index issues".
              format('Tdb'))
        print(tdbS)
        print(tdb)
Exemplo n.º 13
0
def test_dereplicate_6(self):
    '''
    Run compare (fastANI) on the zipped genomes and sanity-check the ANIs.

    The solutions work directory is passed through test_utils.sanity_check
    first. After the run, the 'ani' column of Ndb in self.wd_loc must stay
    within [0, 1] and contain more than one distinct value.
    '''
    target_loc = self.wd_loc
    solution_loc = self.s_wd_loc

    test_utils.sanity_check(WorkDirectory(solution_loc))

    cli = ['compare', target_loc, '--S_algorithm', 'fastANI', '-g']
    args = argumentParser.parse_args(cli + self.zipped_genomes)
    Controller().parseArguments(args)

    # Inspect the ANI values that were produced
    ndb = WorkDirectory(target_loc).get_db('Ndb')
    ani_values = ndb['ani'].tolist()
    highest = max(ani_values)
    assert highest <= 1
    lowest = min(ani_values)
    assert lowest >= 0
    distinct = set(ani_values)
    assert len(distinct) > 1
Exemplo n.º 14
0
def test_dereplicate_3(self):
    '''
    Run compare with the goANI algorithm and check that Ndb is not empty.

    test_utils.sanity_check is applied to the solutions work directory
    both before and after the run, to make sure the run did not overwrite
    the solutions directory.
    '''
    target_loc = self.wd_loc
    solution_loc = self.s_wd_loc

    test_utils.sanity_check(WorkDirectory(solution_loc))

    cli = ['compare', target_loc, '--S_algorithm', 'goANI', '-g']
    args = argumentParser.parse_args(cli + self.genomes)
    Controller().parseArguments(args)

    # The run must have produced at least one Ndb row
    solution_wd = WorkDirectory(solution_loc)
    result_wd = WorkDirectory(target_loc)
    produced_ndb = result_wd.get_db('Ndb')
    assert len(produced_ndb) > 0

    # The solutions directory must have survived the run untouched
    test_utils.sanity_check(solution_wd)
Exemplo n.º 15
0
    def unit_tests_1(self):
        '''
        Test a normal run of cluster

        Runs `cluster` on self.genomes into self.working_wd_loc and requires
        the resulting Cdb and Ndb tables to compare equal to the ones in the
        solutions work directory (self.s_wd_loc). For Ndb the 'ani' column
        of both tables is first reduced to four decimal places.

        When a table differs, the ANI values of both tables are printed
        (if the table has 'reference', 'querry' and 'ani' columns) before
        the assertion fails. The diagnostics work on copies, so the
        assertion always reflects the comparison of the full tables.
        '''
        # normal complete run
        args = argumentParser.parse_args(
            ['cluster', self.working_wd_loc, '-g'] + self.genomes)
        controller = Controller()
        controller.parseArguments(args)

        # Verify
        Swd = WorkDirectory(self.s_wd_loc)
        wd = WorkDirectory(self.working_wd_loc)

        # Confirm the following are correct:
        for db in ['Cdb', 'Ndb']:
            db1 = Swd.get_db(db)
            db2 = wd.get_db(db)

            # get rid of some precision on the ANI
            if db == 'Ndb':
                db1['ani'] = [float("{0:.4f}".format(x)) for x in db1['ani']]
                db2['ani'] = [float("{0:.4f}".format(x)) for x in db2['ani']]

            same = compare_dfs(db1, db2)
            if not same:
                print("{0} is not the same!".format(db))

                # Only tables carrying ANI values can be shown side by side
                keys = ['reference', 'querry']
                cols = keys + ['ani']
                if set(cols).issubset(db1.columns) \
                        and set(cols).issubset(db2.columns):
                    left = db1[cols].sort_values(keys)
                    right = db2[cols].sort_values(keys)
                    print(left)
                    print(right)

                    # Pairs present in both tables whose ANI disagrees
                    merged = pd.merge(
                        left.rename(columns={'ani': 'ani1'}),
                        right.rename(columns={'ani': 'ani2'}),
                        on=keys)
                    print('diff:')
                    print(merged[merged['ani1'] != merged['ani2']])

            assert same, "{0} is not the same!".format(db)