Пример #1
0
    def parseArguments(self, args):
        '''
        Dispatch parsed command-line options to the matching workflow.

        Opens the work directory named by ``args.work_directory``, hands the
        location returned by ``get_loc('log')`` to ``setup_logger``, records
        the parsed options at debug level, and then calls
        ``self.<operation>_operation`` with every parsed option as a keyword
        argument. An operation that is not listed below results in no call.
        '''
        # Resolve and open the work directory
        work_dir = WorkDirectory(str(os.path.abspath(args.work_directory)))

        # Configure logging from the work directory's log location
        self.setup_logger(work_dir.get_loc('log'))
        logging.debug(str(args))

        # Invoke the handler whose name matches the requested operation
        known_operations = ("dereplicate", "compare", "filter", "cluster",
                            "analyze", "choose", "adjust", "bonus",
                            "evaluate")
        for operation in known_operations:
            if args.operation == operation:
                getattr(self, operation + "_operation")(**vars(args))
Пример #2
0
    def parseArguments(self, args):
        '''
        Dispatch parsed command-line options to the matching workflow.

        ``check_dependencies`` is handled first and returns before any work
        directory is opened. Otherwise the work directory is opened, the
        logger is set up, and ``dereplicate`` or ``compare`` is run with all
        parsed options passed as keyword arguments.

        Raises:
            ValueError: if ``run_tertiary_clustering`` is set for an
                operation other than ``dereplicate``.
        '''
        # Dependency check is served before the work directory is touched
        if args.operation == 'check_dependencies':
            drep.d_bonus.check_dependencies(print_out=True)
            return

        # Resolve and open the work directory
        work_dir = WorkDirectory(str(os.path.abspath(args.work_directory)))

        # Configure logging from the work directory's log location
        self.setup_logger(work_dir.get_loc('log'))
        logging.debug(str(args))

        # Reject option combinations that make no sense
        if args.run_tertiary_clustering and args.operation != "dereplicate":
            raise ValueError(
                "Can only run tertiary clustering with dereplicate")

        # Invoke the handler whose name matches the requested operation
        for operation in ("dereplicate", "compare"):
            if args.operation == operation:
                getattr(self, operation + "_operation")(**vars(args))
Пример #3
0
def test_choose_2(self):
    '''
    Try out the --ignoreGenomeQuality argument for choose

    Deletes the Chdb, Sdb and Wdb tables from the working directory, calls
    ``d_choose_wrapper`` with options parsed from a ``dereplicate`` command
    line, and then checks that Sdb, Wdb and genomeInformation differ from the
    solution directory, that every Sdb score lies strictly between 0 and 5,
    and that genomeInformation has a ``centrality`` column.
    '''
    # Delete the existing tables before re-running choose
    wd_loc = self.working_wd_loc
    os.remove(wd_loc + '/data_tables/Chdb.csv')
    os.remove(wd_loc + '/data_tables/Sdb.csv')
    os.remove(wd_loc + '/data_tables/Wdb.csv')

    # Run choose with --ignoreGenomeQuality
    args = argumentParser.parse_args(
        ['dereplicate', wd_loc, '--ignoreGenomeQuality'])
    kwargs = vars(args)
    # NOTE(review): 'genomes' is dropped before calling the wrapper,
    # presumably because choose works from the existing work directory
    # - confirm against d_choose_wrapper
    del kwargs['genomes']
    drep.d_choose.d_choose_wrapper(wd_loc, **kwargs)

    # The regenerated tables must differ from the stored solution
    Swd = WorkDirectory(self.s_wd_loc)
    wd = WorkDirectory(self.working_wd_loc)
    for db in ['Sdb', 'Wdb', 'genomeInformation']:
        db1 = Swd.get_db(db)
        db2 = wd.get_db(db)
        assert not test_utils.compare_dfs(db1,
                                          db2), "{0} is the same!".format(db)

    # Every score must lie strictly between 0 and 5
    sdb = wd.get_db('Sdb')
    for s in sdb['score'].tolist():
        assert 0 < s < 5

    gdb = wd.get_db('genomeInformation')
    assert 'centrality' in gdb.columns
Пример #4
0
    def taxTest1(self):
        '''
        Check the taxonomy call for max method

        Runs the ``bonus`` operation with ``--run_tax`` and
        ``--tax_method max``, then requires Bdb (with the ``location`` column
        removed) to match the solution directory. A Tdb mismatch is only
        reported on stdout and does not fail the test.
        '''
        genomes = self.genomes
        wd_loc = self.wd_loc
        swd_loc = self.s_wd_loc

        # Call the command
        args = argumentParser.parse_args(
            ['bonus', wd_loc, '-g'] + genomes
            + ['--run_tax', '--cent_index',
               '/home/mattolm/download/centrifuge/indices/b+h+v',
               '--tax_method', 'max'])
        controller = Controller()
        controller.parseArguments(args)

        # Verify
        Swd = WorkDirectory(swd_loc)
        wd = WorkDirectory(wd_loc)

        tdbS = Swd.get_db('Bdb')
        tdb = wd.get_db('Bdb')
        del tdbS['location']
        del tdb['location']
        assert compare_dfs(tdb, tdbS), "{0} is not the same!".format('Bdb')

        tdbS = Swd.get_db('Tdb')
        tdb = wd.get_db('Tdb')

        # A Tdb difference is deliberately non-fatal: it is printed for
        # inspection only.
        if not compare_dfs(tdb, tdbS):
            print("{0} is not the same! May be due to centrifuge index issues".
                  format('Tdb'))
            # pd.Panel was removed from pandas, so the two tables are shown
            # side by side under 'df1' / 'df2' column groups instead.
            print(pd.concat({'df1': tdbS, 'df2': tdb}, axis=1))
Пример #5
0
    def taxTest2(self):
        '''
        Check the taxonomy call for percent method

        Runs the ``bonus`` operation with ``--tax_method percent`` and
        compares the produced Bdb / Tdb tables with the solution tables
        BdbP / TdbP.
        '''
        genomes = self.genomes
        work_loc = self.wd_loc
        solution_loc = self.s_wd_loc

        # Assemble the command line and run it through the controller
        cli = ['bonus', work_loc, '-g'] + genomes
        cli = cli + [
            '--run_tax',
            '--cent_index',
            '/home/mattolm/download/centrifuge/indices/b+h+v',
            '--tax_method',
            'percent',
        ]
        parsed = argumentParser.parse_args(cli)
        runner = Controller()
        runner.parseArguments(parsed)

        # Open both directories for comparison
        solution_wd = WorkDirectory(solution_loc)
        work_wd = WorkDirectory(work_loc)

        # Bdb is compared with the 'location' column removed from both sides
        expected = solution_wd.get_db('BdbP')
        produced = work_wd.get_db('Bdb')
        del expected['location']
        del produced['location']
        assert compare_dfs(produced, expected), \
            "{0} is not the same!".format('Bdb')

        # Tdb is compared as loaded
        expected = solution_wd.get_db('TdbP')
        produced = work_wd.get_db('Tdb')
        assert compare_dfs(produced, expected), \
            "{0} is not the same!".format('Tdb')
Пример #6
0
def test_skipsecondary(self):
    '''
    Run the cluster wrapper with --SkipSecondary and confirm that the
    resulting Ndb table is empty.
    '''
    genomes = self.genomes
    work_loc = self.wd_loc
    solution_loc = self.s_wd_loc

    # Parse a dereplicate command line and feed it to the cluster wrapper
    cli = ['dereplicate', work_loc, '-g'] + genomes + ['--SkipSecondary']
    options = vars(argumentParser.parse_args(cli))
    drep.d_cluster.controller.d_cluster_wrapper(work_loc, **options)

    # Open both directories
    solution_wd = WorkDirectory(solution_loc)
    work_wd = WorkDirectory(work_loc)

    # Mdb is still loaded from both directories, but its comparison
    # against the solution is currently disabled
    solution_wd.get_db('Mdb')
    work_wd.get_db('Mdb')

    # No secondary comparisons should have been recorded
    ndb = work_wd.get_db('Ndb')
    assert ndb.empty, 'Ndb is not empty'
Пример #7
0
    def unit_test_2(self):
        '''
        Run choose with --noQualityFiltering after deleting Chdb, Sdb and Wdb

        The Sdb, Wdb and genomeInformation tables produced by the run must
        differ from the solution, and every Sdb score must lie strictly
        between 0 and 5.
        '''
        work_loc = self.working_wd_loc

        # Remove the tables ahead of the run
        for table_file in ('Chdb.csv', 'Sdb.csv', 'Wdb.csv'):
            os.remove(work_loc + '/data_tables/' + table_file)

        # Run choose with --noQualityFiltering
        parsed = argumentParser.parse_args(
            ['choose', work_loc, '--noQualityFiltering'])
        runner = Controller()
        runner.parseArguments(parsed)

        # None of these tables may equal the stored solution
        solution_wd = WorkDirectory(self.s_wd_loc)
        work_wd = WorkDirectory(self.working_wd_loc)
        for name in ('Sdb', 'Wdb', 'genomeInformation'):
            expected = solution_wd.get_db(name)
            produced = work_wd.get_db(name)
            assert not compare_dfs(expected, produced), \
                "{0} is the same!".format(name)

        # Scores must fall strictly inside (0, 5)
        for score in work_wd.get_db('Sdb')['score'].tolist():
            assert 0 < score < 5
Пример #8
0
    def unit_tests_6(self):
        '''
        Test drep call commands

        Runs a single ``mash dist`` shell command through ``drep.run_cmd``
        and checks that exactly three entries match the command-log
        directory prefix afterwards.
        '''
        work_dir = WorkDirectory(self.working_wd_loc)
        mash_dir = work_dir.get_dir('MASH')
        log_dir = work_dir.get_dir('cmd_logs')

        # Shell line: mash dist <sketch> <sketch> > <mash_dir>MASH_table.tsv
        sketch = mash_dir + 'ALL.msh'
        shell_line = '{0} dist {1} {1} > {2}'.format(
            'mash', sketch, mash_dir + 'MASH_table.tsv')
        drep.run_cmd(shell_line, shell=True, logdir=log_dir)

        logged = glob.glob(log_dir + '*')
        assert len(logged) == 3
Пример #9
0
    def functional_test_2(self):
        '''
        Run the compare operation on the test genomes and check the work
        directory against the solution directory, skipping the tables
        listed below.
        '''
        genomes = self.genomes
        work_loc = self.wd_loc
        solution_loc = self.s_wd_loc

        # Check the solution directory before anything runs
        sanity_check(WorkDirectory(solution_loc))

        parsed = argumentParser.parse_args(['compare', work_loc, '-g'] + genomes)
        runner = Controller()
        runner.parseArguments(parsed)

        # Compare everything except the skipped tables
        solution_wd = WorkDirectory(solution_loc)
        work_wd = WorkDirectory(work_loc)
        not_compared = ['Bdb', 'Chdb', 'Sdb', 'Wdb', 'Widb',
                        'genomeInformation', 'Mdb']
        ensure_identicle(solution_wd, work_wd, skip=not_compared)

        # Re-check the solution directory to make sure the run did not
        # overwrite it
        sanity_check(solution_wd)
Пример #10
0
    def functional_test_1(self):
        '''
        Run dereplicate with the taxonomy_wf checkM method and check the
        work directory against the solution directory (Bdb and Mdb skipped).
        '''
        genomes = self.genomes
        work_loc = self.wd_loc
        solution_loc = self.s_wd_loc

        # Check the solution directory before anything runs
        sanity_check(WorkDirectory(solution_loc))

        cli = ['dereplicate', work_loc, '-g'] + genomes
        cli = cli + ['--checkM_method', 'taxonomy_wf']
        parsed = argumentParser.parse_args(cli)
        runner = Controller()
        runner.parseArguments(parsed)

        # Compare the two directories
        solution_wd = WorkDirectory(solution_loc)
        work_wd = WorkDirectory(work_loc)
        ensure_identicle(solution_wd, work_wd, skip=['Bdb', 'Mdb'])

        # Re-check the solution directory to make sure the run did not
        # overwrite it
        sanity_check(solution_wd)
Пример #11
0
def test_dereplicate_6(self):
    '''
    Test zipped genomes

    Runs compare with fastANI on ``self.zipped_genomes`` and checks that
    the ANI values in Ndb lie within [0, 1] and are not all identical.
    '''
    genomes = self.zipped_genomes
    work_loc = self.wd_loc
    solution_loc = self.s_wd_loc

    test_utils.sanity_check(WorkDirectory(solution_loc))

    cli = ['compare', work_loc, '--S_algorithm', 'fastANI', '-g'] + genomes
    parsed = argumentParser.parse_args(cli)
    runner = Controller()
    runner.parseArguments(parsed)

    # Inspect the ANI column of the produced Ndb table
    ani_values = WorkDirectory(work_loc).get_db('Ndb')['ani'].tolist()
    assert max(ani_values) <= 1
    assert min(ani_values) >= 0
    assert len(set(ani_values)) > 1
Пример #12
0
    def unit_tests_4(self):
        '''
        Test changing cluster -pa

        Runs cluster with ``-pa 0.10`` and requires the produced Ndb and
        Cdb tables to differ from the stored solution.
        '''
        cli = ['cluster', self.working_wd_loc, '-g'] + self.genomes
        cli = cli + ['-pa', '0.10']
        parsed = argumentParser.parse_args(cli)
        runner = Controller()
        runner.parseArguments(parsed)

        # Open both directories
        solution_wd = WorkDirectory(self.s_wd_loc)
        work_wd = WorkDirectory(self.working_wd_loc)

        # These tables must not match the solution (Mdb is not checked)
        for name in ('Ndb', 'Cdb'):
            expected = solution_wd.get_db(name)
            produced = work_wd.get_db(name)
            assert not compare_dfs(expected, produced), \
                "{0} is the same! (and shouldn't be)".format(name)
Пример #13
0
def test_cluster_functional_4(self):
    '''
    Cluster the test genomes using fastANI

    Calls the cluster wrapper with options parsed from a ``dereplicate``
    command line and compares Cdb with the solution, ignoring the
    ``comparison_algorithm`` column.
    '''
    genomes = self.genomes
    work_loc = self.wd_loc
    solution_loc = self.s_wd_loc

    cli = ['dereplicate', work_loc, '--S_algorithm', 'fastANI', '-g']
    options = vars(argumentParser.parse_args(cli + genomes))
    drep.d_cluster.controller.d_cluster_wrapper(work_loc, **options)

    # Open both directories
    solution_wd = WorkDirectory(solution_loc)
    work_wd = WorkDirectory(work_loc)

    # Compare Cdb after removing 'comparison_algorithm' from both sides
    expected = solution_wd.get_db('Cdb')
    del expected['comparison_algorithm']
    produced = work_wd.get_db('Cdb')
    del produced['comparison_algorithm']
    assert test_utils.compare_dfs(expected, produced), \
        "{0} is not the same!".format('Cdb')
Пример #14
0
    def unit_test_1(self):
        '''
        Ensure choose can handle when Chdb is not present, running checkM
        automatically

        Chdb is deleted and the ``location`` column of Bdb is rewritten from
        the paths returned by ``load_test_genomes`` before choose is run;
        Chdb and genomeInformation must then match the solution.
        '''
        work_loc = self.working_wd_loc

        # Delete Chdb
        os.remove(work_loc + '/data_tables/Chdb.csv')

        # Map each genome file name to its path, then rewrite Bdb locations
        location_by_name = {}
        for genome_path in load_test_genomes():
            location_by_name[os.path.basename(genome_path)] = genome_path

        bdb_path = work_loc + '/data_tables/Bdb.csv'
        bdb = pd.read_csv(bdb_path)
        bdb['location'] = bdb['genome'].map(location_by_name)
        bdb.to_csv(bdb_path, index=False)

        # Run choose - this should re-run checkM and re-generate chdb
        parsed = argumentParser.parse_args(
            ['choose', work_loc, '--checkM_method', 'taxonomy_wf'])
        runner = Controller()
        runner.parseArguments(parsed)

        # Both tables must match the stored solution
        solution_wd = WorkDirectory(self.s_wd_loc)
        work_wd = WorkDirectory(self.working_wd_loc)
        for name in ('Chdb', 'genomeInformation'):
            expected = solution_wd.get_db(name)
            produced = work_wd.get_db(name)
            assert compare_dfs(expected, produced), \
                "{0} is not the same!".format(name)
Пример #15
0
    def functional_test_2(self):
        '''
        Cluster the test genomes using gANI

        Returns early (after printing a notice) when ``find_program`` cannot
        locate a working ANIcalculator. Otherwise Cdb is compared with the
        solution, ignoring the ``comparison_algorithm`` column.
        '''
        genomes = self.genomes
        work_loc = self.wd_loc
        solution_loc = self.s_wd_loc

        # Skip when gANI is not available
        loc, works = find_program('ANIcalculator')
        if loc == None or works == False:
            notice = 'Cannot locate the program {0}- skipping related tests'
            print(notice.format('ANIcalculator (for gANI)'))
            return

        cli = ['cluster', work_loc, '--S_algorithm', 'gANI', '-g'] + genomes
        parsed = argumentParser.parse_args(cli)
        runner = Controller()
        runner.parseArguments(parsed)

        # Open both directories
        solution_wd = WorkDirectory(solution_loc)
        work_wd = WorkDirectory(work_loc)

        # Compare Cdb after removing 'comparison_algorithm' from both sides
        expected = solution_wd.get_db('Cdb')
        del expected['comparison_algorithm']
        produced = work_wd.get_db('Cdb')
        del produced['comparison_algorithm']
        assert compare_dfs(expected, produced), \
            "{0} is not the same!".format('Cdb')
Пример #16
0
def test_unit_7(self):
    '''
    Test cluster with --SkipSecondary

    The Cdb and Ndb tables produced by the run must differ from the stored
    solution.
    '''
    cli = ['dereplicate', self.working_wd_loc, '-g'] + self.genomes
    cli = cli + ['--SkipSecondary']
    options = vars(argumentParser.parse_args(cli))
    drep.d_cluster.controller.d_cluster_wrapper(self.working_wd_loc,
                                                **options)

    # Open both directories
    solution_wd = WorkDirectory(self.s_wd_loc)
    work_wd = WorkDirectory(self.working_wd_loc)

    # These tables must not match the solution (Mdb is not checked)
    for name in ('Cdb', 'Ndb'):
        expected = solution_wd.get_db(name)
        produced = work_wd.get_db(name)
        assert not test_utils.compare_dfs(expected, produced), \
            "{0} is the same! (and shouldn't be)".format(name)
Пример #17
0
def test_cluster_functional_1(self):
    '''
    Cluster the test genomes using default settings and require the
    produced Cdb table to match the solution.
    '''
    genomes = self.genomes
    work_loc = self.wd_loc
    solution_loc = self.s_wd_loc

    # Parse a dereplicate command line and feed it to the cluster wrapper
    parsed = argumentParser.parse_args(['dereplicate', work_loc, '-g'] + genomes)
    options = vars(parsed)
    drep.d_cluster.controller.d_cluster_wrapper(work_loc, **options)

    # Open both directories
    solution_wd = WorkDirectory(solution_loc)
    work_wd = WorkDirectory(work_loc)

    # Cdb must be identical to the solution
    expected = solution_wd.get_db('Cdb')
    produced = work_wd.get_db('Cdb')
    assert test_utils.compare_dfs(expected, produced), \
        "{0} is not the same!".format('Cdb')
Пример #18
0
def test_dereplicate_1(self):
    '''
    Run dereplicate with the taxonomy_wf checkM method, --debug and the
    ANImf algorithm, then check the work directory against the solution
    directory, skipping the tables listed below.
    '''
    genomes = self.genomes
    work_loc = self.wd_loc
    solution_loc = self.s_wd_loc

    # Check the solution directory before anything runs
    test_utils.sanity_check(WorkDirectory(solution_loc))

    cli = ['dereplicate', work_loc, '-g'] + genomes
    cli = cli + ['--checkM_method', 'taxonomy_wf', '--debug',
                 '--S_algorithm', 'ANImf']
    parsed = argumentParser.parse_args(cli)
    runner = Controller()
    runner.parseArguments(parsed)

    # Compare everything except the skipped tables
    solution_wd = WorkDirectory(solution_loc)
    work_wd = WorkDirectory(work_loc)
    not_compared = ['Bdb', 'Mdb', 'Sdb', 'Wdb', 'genomeInformation', 'Widb']
    test_utils.ensure_identicle(solution_wd, work_wd, skip=not_compared)

    # Re-check the solution directory to make sure the run did not
    # overwrite it
    test_utils.sanity_check(solution_wd)
Пример #19
0
def test_dereplicate_3(self):
    '''
    Use goANI

    Runs compare with the goANI algorithm and requires the produced Ndb
    table to contain at least one row.
    '''
    genomes = self.genomes
    work_loc = self.wd_loc
    solution_loc = self.s_wd_loc

    # Check the solution directory before anything runs
    test_utils.sanity_check(WorkDirectory(solution_loc))

    cli = ['compare', work_loc, '--S_algorithm', 'goANI', '-g'] + genomes
    parsed = argumentParser.parse_args(cli)
    runner = Controller()
    runner.parseArguments(parsed)

    # Ndb must not be empty
    solution_wd = WorkDirectory(solution_loc)
    work_wd = WorkDirectory(work_loc)
    ndb = work_wd.get_db('Ndb')
    assert len(ndb) > 0

    # Re-check the solution directory to make sure the run did not
    # overwrite it
    test_utils.sanity_check(solution_wd)
Пример #20
0
def test_taxonomy_4(self):
    '''
    Try actually running centrifuge without prodigal done

    Returns after printing a notice when ``find_program`` reports that
    centrifuge does not work. Otherwise the existing centrifuge and
    prodigal data directories are removed, the bonus operation is run with
    ``--tax_method percent``, and Bdb / Tdb are compared with the solution
    tables BdbP / TdbP.
    '''
    loc, works = drep.d_bonus.find_program('centrifuge')
    if works == False:
        print('Centrifuge not installed- skipping tests')
        return

    genomes = self.genomes
    work_loc = self.wd_loc
    solution_loc = self.s_wd_loc

    # Remove previous data run
    for stale in ('centrifuge', 'prodigal'):
        shutil.rmtree(os.path.join(self.wd_loc, 'data', stale))

    # Assemble the command line and run it through the controller
    cli = ['bonus', work_loc, '-g'] + genomes
    cli = cli + [
        '--run_tax',
        '--cent_index',
        '/home/mattolm/download/centrifuge/indices/b+h+v',
        '--tax_method',
        'percent',
    ]
    parsed = argumentParser.parse_args(cli)
    runner = Controller()
    runner.parseArguments(parsed)

    # Open both directories
    solution_wd = WorkDirectory(solution_loc)
    work_wd = WorkDirectory(work_loc)

    # Bdb is compared with the 'location' column removed from both sides
    expected = solution_wd.get_db('BdbP')
    produced = work_wd.get_db('Bdb')
    del expected['location']
    del produced['location']
    assert test_utils.compare_dfs(produced, expected), \
        "{0} is not the same!".format('Bdb')

    # Tdb is compared as loaded
    expected = solution_wd.get_db('TdbP')
    produced = work_wd.get_db('Tdb')
    assert test_utils.compare_dfs(produced, expected), \
        "{0} is not the same!".format('Tdb')
Пример #21
0
def test_list_genome_load(self):
    '''
    Test inputting a list of genomes via a text file

    Writes the ``location`` of every loaded genome to ``genomes.txt``, passes
    that file to ``-g``, and compares the resulting Cdb with the solution
    (ignoring ``comparison_algorithm``). Finally runs a direct fastANI
    comparison and checks one specific reference/query pair's ANI.
    '''
    bdb = drep.d_cluster.utils.load_genomes(self.genomes)
    scratch_dir = self.test_dir

    # Write one genome location per line
    if not os.path.exists(scratch_dir):
        os.mkdir(scratch_dir)
    list_path = os.path.join(scratch_dir, 'genomes.txt')
    with open(list_path, 'w') as handle:
        handle.writelines(
            row['location'] + '\n' for _, row in bdb.iterrows())

    # Run the cluster wrapper with the list file as the genome input
    work_loc = self.wd_loc
    solution_loc = self.s_wd_loc
    parsed = argumentParser.parse_args(
        ['dereplicate', work_loc, '--S_algorithm', 'fastANI', '-g', list_path])
    options = vars(parsed)
    drep.d_cluster.controller.d_cluster_wrapper(work_loc, **options)

    # Open both directories
    solution_wd = WorkDirectory(solution_loc)
    work_wd = WorkDirectory(work_loc)

    # Compare Cdb after removing 'comparison_algorithm' from both sides
    expected = solution_wd.get_db('Cdb')
    del expected['comparison_algorithm']
    produced = work_wd.get_db('Cdb')
    del produced['comparison_algorithm']
    assert test_utils.compare_dfs(expected, produced), \
        "{0} is not the same!".format('Cdb')

    # Direct comparison: pick one reference/query pair and check its ANI
    ndb = drep.d_cluster.compare_utils.compare_genomes(bdb, 'fastANI',
                                                       scratch_dir)
    is_reference = ndb['reference'] == 'Enterococcus_faecalis_T2.fna'
    is_query = ndb['querry'] == 'Enterococcus_casseliflavus_EC20.fasta'
    pair = ndb[is_reference & is_query]

    ani = pair['ani'].tolist()[0]
    assert 0.7 < ani < 0.8
Пример #22
0
def test_unit_1(self):
    '''
    Test a normal run of cluster

    Runs the cluster wrapper with ANImf and compares Cdb and Ndb with the
    solution after reducing both tables to a comparable form (rounded and
    sorted Ndb values; genome / secondary_cluster columns of Cdb).
    '''

    def _coarse_ndb(frame):
        # Drop precision on the ANI; fastANI is being compared with ANImf
        frame['ani'] = [round(value, 3) for value in frame['ani']]
        frame['alignment_length'] = [
            round(value, -6) for value in frame['alignment_length']
        ]
        frame = frame[['ani', 'alignment_length', 'querry', 'reference']]
        ordered = frame.sort_values(['querry', 'reference'])
        return ordered.reset_index(drop=True)

    def _cluster_view(frame):
        # Keep only the genome -> secondary cluster assignment
        subset = frame[['genome', 'secondary_cluster']]
        return subset.sort_values('genome').reset_index(drop=True)

    # normal complete run
    cli = ['dereplicate', self.working_wd_loc, '--S_algorithm', 'ANImf', '-g']
    options = vars(argumentParser.parse_args(cli + self.genomes))
    drep.d_cluster.controller.d_cluster_wrapper(self.working_wd_loc,
                                                **options)

    # Open both directories
    solution_wd = WorkDirectory(self.s_wd_loc)
    work_wd = WorkDirectory(self.working_wd_loc)

    # Both tables must match once normalised (Mdb is not checked)
    for name in ('Cdb', 'Ndb'):
        expected = solution_wd.get_db(name)
        produced = work_wd.get_db(name)

        if name == 'Ndb':
            expected = _coarse_ndb(expected)
            produced = _coarse_ndb(produced)
        elif name == 'Cdb':
            expected = _cluster_view(expected)
            produced = _cluster_view(produced)

        assert test_utils.compare_dfs2(expected, produced, verbose=True), \
            "{0} is not the same!".format(name)
Пример #23
0
def test_dereplicate_8(self):
    '''
    Test greedy clustering with some primary clusters only having a single
    member

    Runs compare twice on two genomes from ``self.large_genome_set`` - once
    with ``--greedy_secondary_clustering`` and once without - and checks
    that the two Cdb tables agree. Returns early when the large genome set
    is empty.
    '''
    if len(self.large_genome_set) == 0:
        print("*** THIS TEST ONLY WORKS ON MO'S DEVELOPMENT MACHINE ***")
        return

    genomes = [self.large_genome_set[0], self.large_genome_set[20]]
    normal_loc = self.wd_loc
    greedy_loc = self.wd_loc2

    # Get greedy results
    greedy_cli = [
        'compare', greedy_loc,
        '--S_algorithm', 'fastANI',
        '--multiround_primary_clustering',
        '--primary_chunksize', '50',
        '--greedy_secondary_clustering',
        '-sa', '0.95',
        '-pa', '0.99',
        '-g',
    ]
    parsed = argumentParser.parse_args(greedy_cli + genomes)
    Controller().parseArguments(parsed)
    greedy_cdb = WorkDirectory(greedy_loc).get_db('Cdb')

    # Run normal
    normal_cli = [
        'compare', normal_loc,
        '--S_algorithm', 'fastANI',
        '--multiround_primary_clustering',
        '--primary_chunksize', '50',
        '-sa', '0.95',
        '-pa', '0.99',
        '-g',
    ]
    parsed = argumentParser.parse_args(normal_cli + genomes)
    Controller().parseArguments(parsed)
    normal_cdb = WorkDirectory(normal_loc).get_db('Cdb')

    # Verify they're the same
    assert len(greedy_cdb) == len(normal_cdb)
    for column in ('primary_cluster', 'secondary_cluster'):
        greedy_counts = greedy_cdb[column].value_counts().to_dict()
        normal_counts = normal_cdb[column].value_counts().to_dict()

        # Cluster sizes must agree for both columns
        assert set(greedy_counts.values()) == set(
            normal_counts.values()), column
        # Cluster names are only required to agree for primary clusters
        if column != 'secondary_cluster':
            assert set(greedy_counts) == set(normal_counts)

    assert set(greedy_cdb['genome'].tolist()) == set(
        normal_cdb['genome'].tolist())
    only_in_greedy = set(greedy_cdb.columns) - set(normal_cdb.columns)
    assert only_in_greedy == {'greedy_representative'}
Пример #24
0
    def unit_tests_5(self):
        '''
        Test changing cluster --S_algorithm gANI

        The Cdb and Mdb tables produced by the run must match the stored
        solution.
        '''
        cli = ['cluster', self.working_wd_loc, '-g'] + self.genomes
        cli = cli + ['--S_algorithm', 'gANI']
        parsed = argumentParser.parse_args(cli)
        runner = Controller()
        runner.parseArguments(parsed)

        # Open both directories
        solution_wd = WorkDirectory(self.s_wd_loc)
        work_wd = WorkDirectory(self.working_wd_loc)

        # Both tables must match the solution
        for name in ('Cdb', 'Mdb'):
            expected = solution_wd.get_db(name)
            produced = work_wd.get_db(name)
            assert compare_dfs(expected, produced), \
                "{0} is not the same!".format(name)
Пример #25
0
    def unit_tests_3(self):
        '''
        Test cluster with --SkipMash

        The Cdb, Ndb and Mdb tables produced by the run must differ from
        the stored solution.
        '''
        cli = ['cluster', self.working_wd_loc, '-g'] + self.genomes
        cli = cli + ['--SkipMash']
        parsed = argumentParser.parse_args(cli)
        runner = Controller()
        runner.parseArguments(parsed)

        # Open both directories
        solution_wd = WorkDirectory(self.s_wd_loc)
        work_wd = WorkDirectory(self.working_wd_loc)

        # None of these tables may match the solution
        for name in ('Cdb', 'Ndb', 'Mdb'):
            expected = solution_wd.get_db(name)
            produced = work_wd.get_db(name)
            assert not compare_dfs(expected, produced), \
                "{0} is the same! (and shouldn't be)".format(name)
Пример #26
0
def test_dereplicate_4(self):
    '''
    Test the ability of primary clustering to take a large genome set and
    break it into chunks

    Runs compare twice on ``self.large_genome_set`` - once without and once
    with ``--multiround_primary_clustering`` - and checks that the two Cdb
    tables agree. Returns early when the genome set is empty.
    '''
    genomes = self.large_genome_set
    chunked_loc = self.wd_loc
    plain_loc = self.wd_loc2

    if len(genomes) == 0:
        print("*** THIS TEST ONLY WORKS ON MO'S DEVELOPMENT MACHINE ***")
        return

    # Get normal results
    plain_cli = [
        'compare', plain_loc,
        '--S_algorithm', 'fastANI',
        '--SkipSecondary',
        '--primary_chunksize', '50',
        '-g',
    ]
    parsed = argumentParser.parse_args(plain_cli + genomes)
    Controller().parseArguments(parsed)
    plain_cdb = WorkDirectory(plain_loc).get_db('Cdb')

    # Run with chunking
    chunked_cli = [
        'compare', chunked_loc,
        '--S_algorithm', 'fastANI',
        '--SkipSecondary',
        '--multiround_primary_clustering',
        '--primary_chunksize', '50',
        '-g',
    ]
    parsed = argumentParser.parse_args(chunked_cli + genomes)
    Controller().parseArguments(parsed)
    chunked_cdb = WorkDirectory(chunked_loc).get_db('Cdb')

    # Verify they're the same
    assert len(plain_cdb) == len(chunked_cdb)
    for column in ('primary_cluster', 'secondary_cluster'):
        plain_counts = plain_cdb[column].value_counts().to_dict()
        chunked_counts = chunked_cdb[column].value_counts().to_dict()

        # Both the cluster sizes and the cluster names must agree
        assert set(plain_counts.values()) == set(chunked_counts.values())
        assert set(plain_counts) == set(chunked_counts)

    assert set(plain_cdb['genome'].tolist()) == set(
        chunked_cdb['genome'].tolist())
    only_in_chunked = set(chunked_cdb.columns) - set(plain_cdb.columns)
    assert only_in_chunked == {
        'length', 'subcluster', 'primary_representitive'}
Пример #27
0
def test_dereplicate_5(self):
    '''
    Test greedy clustering

    Runs compare twice on the first ten entries of
    ``self.large_genome_set`` - once with ``--greedy_secondary_clustering``
    and once without - and checks that the two Cdb tables agree. Returns
    early when no genomes are available.
    '''
    genomes = self.large_genome_set[:10]
    normal_loc = self.wd_loc
    greedy_loc = self.wd_loc2

    if len(genomes) == 0:
        print("*** THIS TEST ONLY WORKS ON MO'S DEVELOPMENT MACHINE ***")
        return

    # Get greedy results
    greedy_cli = [
        'compare', greedy_loc,
        '--S_algorithm', 'fastANI',
        '--SkipMash',
        '--greedy_secondary_clustering',
        '-sa', '0.95',
        '-g',
    ]
    parsed = argumentParser.parse_args(greedy_cli + genomes)
    Controller().parseArguments(parsed)
    greedy_cdb = WorkDirectory(greedy_loc).get_db('Cdb')

    # Run normal
    normal_cli = [
        'compare', normal_loc,
        '--S_algorithm', 'fastANI',
        '--SkipMash',
        '-sa', '0.95',
        '-g',
    ]
    parsed = argumentParser.parse_args(normal_cli + genomes)
    Controller().parseArguments(parsed)
    normal_cdb = WorkDirectory(normal_loc).get_db('Cdb')

    # Verify they're the same
    assert len(greedy_cdb) == len(normal_cdb)
    for column in ('primary_cluster', 'secondary_cluster'):
        greedy_counts = greedy_cdb[column].value_counts().to_dict()
        normal_counts = normal_cdb[column].value_counts().to_dict()

        # Both the cluster sizes and the cluster names must agree
        assert set(greedy_counts.values()) == set(
            normal_counts.values()), column
        assert set(greedy_counts) == set(normal_counts), column

    assert set(greedy_cdb['genome'].tolist()) == set(
        normal_cdb['genome'].tolist())
    only_in_greedy = set(greedy_cdb.columns) - set(normal_cdb.columns)
    assert only_in_greedy == {'greedy_representative'}
Пример #28
0
    def functional_test_1(self):
        '''
        Cluster the test genomes using default settings and require the
        produced Cdb table to match the solution.
        '''
        genomes = self.genomes
        work_loc = self.wd_loc
        solution_loc = self.s_wd_loc

        parsed = argumentParser.parse_args(['cluster', work_loc, '-g'] + genomes)
        runner = Controller()
        runner.parseArguments(parsed)

        # Open both directories
        solution_wd = WorkDirectory(solution_loc)
        work_wd = WorkDirectory(work_loc)

        # Cdb must be identical to the solution
        expected = solution_wd.get_db('Cdb')
        produced = work_wd.get_db('Cdb')
        assert compare_dfs(expected, produced), \
            "{0} is not the same!".format('Cdb')
Пример #29
0
    def unit_tests_1(self):
        '''
        Test a normal run of cluster

        Runs the ``cluster`` operation on the working directory and requires
        the produced Cdb and Ndb tables to match the solution. When a table
        differs, a notice is printed and - if both tables carry the
        reference / querry / ani columns - a merged view of the ANI values
        from both sides is printed before the assertion fails.
        '''
        # normal complete run
        args = argumentParser.parse_args(
            ['cluster', self.working_wd_loc, '-g'] + self.genomes)
        controller = Controller()
        controller.parseArguments(args)

        # Verify
        Swd = WorkDirectory(self.s_wd_loc)
        wd = WorkDirectory(self.working_wd_loc)

        # Confirm the following are correct (Mdb is not checked):
        ani_columns = ['reference', 'querry', 'ani']
        for db in ['Cdb', 'Ndb']:
            db1 = Swd.get_db(db)
            db2 = wd.get_db(db)
            same = compare_dfs(db1, db2)

            if not same:
                print("{0} is not the same!".format(db))

                # The pairwise view only makes sense for tables that hold
                # ANI values; guard so a missing column cannot mask the
                # assertion below with a KeyError.
                if all(c in db1.columns and c in db2.columns
                       for c in ani_columns):
                    # Work on renamed copies so db1 / db2 stay untouched
                    # (pd.Panel, used here previously, no longer exists).
                    ani1 = db1[ani_columns].rename(columns={'ani': 'ani1'})
                    ani2 = db2[ani_columns].rename(columns={'ani': 'ani2'})
                    ani1 = ani1.sort_values(['reference', 'querry'])
                    ani2 = ani2.sort_values(['reference', 'querry'])
                    print(pd.merge(ani1, ani2, on=['reference', 'querry']))

            assert same, "{0} is not the same!".format(db)
Пример #30
0
    def functional_test_3(self):
        '''
        Cluster the test genomes using ANImf

        Cdb is compared with the solution, ignoring the
        ``comparison_algorithm`` column.
        '''
        genomes = self.genomes
        work_loc = self.wd_loc
        solution_loc = self.s_wd_loc

        cli = ['cluster', work_loc, '--S_algorithm', 'ANImf', '-g'] + genomes
        parsed = argumentParser.parse_args(cli)
        runner = Controller()
        runner.parseArguments(parsed)

        # Open both directories
        solution_wd = WorkDirectory(solution_loc)
        work_wd = WorkDirectory(work_loc)

        # Compare Cdb after removing 'comparison_algorithm' from both sides
        expected = solution_wd.get_db('Cdb')
        del expected['comparison_algorithm']
        produced = work_wd.get_db('Cdb')
        del produced['comparison_algorithm']
        assert compare_dfs(expected, produced), \
            "{0} is not the same!".format('Cdb')