Example #1
def diff_collections2(b1, b2, result_dir, step=10000):
    '''
    b2 is new collection, b1 is old collection
    '''
    DIFFFILE_PATH = '/home/kevinxin/diff_result/'
    DATA_FOLDER = os.path.join(DIFFFILE_PATH, result_dir)
    if not os.path.exists(DATA_FOLDER):
        os.mkdir(DATA_FOLDER)
    data_new = doc_feeder(b2.target_collection, step=step, inbatch=True, fields=[])
    data_old = doc_feeder(b1.target_collection, step=step, inbatch=True, fields=[])
    cnt = 0
    cnt_update = 0
    cnt_add = 0
    cnt_delete = 0

    for _batch in data_new:
        cnt += 1
        id_list_new = [_doc['_id'] for _doc in _batch]
        docs_common = b1.target_collection.find({'_id': {'$in': id_list_new}}, projection=[])
        ids_common = [_doc['_id'] for _doc in docs_common]
        id_in_new = list(set(id_list_new) - set(ids_common))
        _updates = []
        if len(ids_common) > 0:
            _updates = _diff_doc_inner_worker2(b1, b2, list(ids_common), fastdiff=True)
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        _result = {'add': id_in_new,
                   'update': _updates,
                   'delete': [],
                   'source': b2.target_collection.name,
                   'timestamp': get_timestamp()}
        if len(_updates) != 0 or len(id_in_new) != 0:
            dump(_result, file_name)
            print("(Updated: {}, Added: {})".format(len(_updates), len(id_in_new)), end='')
            cnt_update += len(_updates)
            cnt_add += len(id_in_new)
    print("Finished calculating diff for the new collection. Total number of docs updated: {}, added: {}".format(cnt_update, cnt_add))
    print("="*100)
    for _batch in data_old:
        cnt += 1
        id_list_old = [_doc['_id'] for _doc in _batch]
        docs_common = b2.target_collection.find({'_id': {'$in': id_list_old}}, projection=[])
        ids_common = [_doc['_id'] for _doc in docs_common]
        id_in_old = list(set(id_list_old)-set(ids_common))
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        _result = {'delete': id_in_old,
                   'add': [],
                   'update': [],
                   'source': b2.target_collection.name,
                   'timestamp': get_timestamp()}
        if len(id_in_old) != 0:
            dump(_result, file_name)
            print("(Deleted: {})".format(len(id_in_old)), end='')
            cnt_delete += len(id_in_old)
    print("Finished calculating diff for the old collection. Total number of docs deleted: {}".format(cnt_delete))
    print("="*100)
    print("Summary: (Updated: {}, Added: {}, Deleted: {})".format(cnt_update, cnt_add, cnt_delete))
Example #2
    def test_0_registerdid_batch(self):
        # create 2000 accounts (batch_amount_i * batch_amount_j)
        # send 1 etp to each account
        # issue a did for each account
        now = common.get_timestamp()

        def get_account_name(i, j):
            return "Account_%s_%s" % (i, j)
        def get_did_symbol(i, j):
            return "tempdid_%s_%s.%s" % (i, j, now)

        lastwords = []
        addresses = []

        batch_amount_i, batch_amount_j = 200, 10 # total 2000

        def get_lastword(i, j):
            return lastwords[i * batch_amount_j + j]

        def get_address(i, j):
            return addresses[i * batch_amount_j + j]

        def get_did_count():
            ec, message = mvs_rpc.list_dids()
            self.assertEqual(ec, 0)
            return len(message["dids"])

        try:
            for i in xrange(batch_amount_i):
                receivers = {}
                for j in xrange(batch_amount_j):
                    ec, (lastword, address) = mvs_rpc.easy_new_account(get_account_name(i, j), "123456")
                    self.assertEqual(ec, 0)
                    lastwords.append(lastword)
                    addresses.append(address)

                    receivers[address] = 10**8 # 1 etp for each
                Alice.sendmore_etp(receivers)
                Alice.mining()

            previous = get_did_count()
            for i in xrange(batch_amount_i):
                for j in xrange(batch_amount_j):
                    ec, message = mvs_rpc.register_did(get_account_name(i, j), "123456", get_address(i, j), get_did_symbol(i, j))
                    self.assertEqual(ec, 0)

            current = 0
            mine_round = 0
            while current < previous + batch_amount_i*batch_amount_j:
                Alice.mining()
                current = get_did_count()
                mine_round += 1

            self.assertEqual(mine_round, 2)

        finally:
            for i in xrange(batch_amount_i):
                for j in xrange(batch_amount_j):
                    ec, message = mvs_rpc.delete_account(get_account_name(i, j), "123456", get_lastword(i, j))
                    self.assertEqual(ec, 0, message)
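get_lastword and get_address rely on the accounts being appended in row-major order, so the 2-D index (i, j) flattens to i * batch_amount_j + j. A quick self-contained check of that mapping:

# sanity check of the flat-index mapping used by get_lastword/get_address
batch_amount_j = 10
order = [(i, j) for i in range(4) for j in range(batch_amount_j)]
assert order[3 * batch_amount_j + 7] == (3, 7)  # account (i=3, j=7) sits at index 37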
Example #3
def make_plots(y, min_n_of_epochs):
    import matplotlib.pyplot as plt
    from os.path import join, exists
    from os import makedirs
    import sys
    sys.path.append('..')
    import configuration.model as config
    from utils.common import get_timestamp
    timestamp = get_timestamp()
    # create the directory in which all the plots will be stored
    storing_directory = join(config.path_for_storing, str(timestamp)+"_iter_middle_misclass")
    # if the directory doesn't exist (which should always be the case), create it
    if not exists(storing_directory):
        makedirs(storing_directory)
    # for each experiment that has been run
    for current_idx, experiment in enumerate(y):
        # if it has seen more than min_n_of_epochs epochs
        if len(experiment) > min_n_of_epochs:
            # save plot of the misclass error w.r.t number of epochs seen
            plt.plot(range(1, len(experiment)+1), experiment, 'r-')
            plt.xlabel("number of epochs seen")
            plt.ylabel("misclass error")
            plt.savefig(join(storing_directory, 'epochs_misclass_model'+str(current_idx)+".png"),
                        bbox_inches='tight')
            plt.clf()
Example #4
def diff_collections(b1, b2, use_parallel=True, step=10000):
    """
    b1, b2 are one of supported backend class in databuild.backend.
    e.g.,
        b1 = GeneDocMongoDBBackend(c1)
        b2 = GeneDocMongoDBBackend(c2)
    """

    id_s1 = set(b1.get_id_list())
    id_s2 = set(b2.get_id_list())
    print("Size of collection 1:\t", len(id_s1))
    print("Size of collection 2:\t", len(id_s2))

    id_in_1 = id_s1 - id_s2
    id_in_2 = id_s2 - id_s1
    id_common = id_s1 & id_s2
    print("# of docs found only in collection 1:\t", len(id_in_1))
    print("# of docs found only in collection 2:\t", len(id_in_2))
    print("# of docs found in both collections:\t", len(id_common))

    print("Comparing matching docs...")
    _updates = []
    if len(id_common) > 0:
        if not use_parallel:
            _updates = _diff_doc_inner_worker2(b1, b2, list(id_common))
        else:
            from utils.parallel import run_jobs_on_ipythoncluster
            _path = os.path.split(os.path.split(os.path.abspath(__file__))[0])[0]
            id_common = list(id_common)
            # b1_target_collection = b1.target_collection.name
            # b2_es_index = b2.target_esidxer.ES_INDEX_NAME
            _b1 = (b1.target_name, b1.name)
            _b2 = (b2.target_name, b2.name)
            task_li = [(_b1, _b2, id_common[i: i + step], _path) for i in range(0, len(id_common), step)]
            job_results = run_jobs_on_ipythoncluster(_diff_doc_inner_worker2, task_li)
            _updates = []
            if job_results:
                for res in job_results:
                    _updates.extend(res)
            else:
                print("Parallel jobs failed or were interrupted.")
                return None

        print("Done. [{} docs changed]".format(len(_updates)))

    _deletes = []
    if len(id_in_1) > 0:
        _deletes = sorted(id_in_1)

    _adds = []
    if len(id_in_2) > 0:
        _adds = sorted(id_in_2)

    changes = {'update': _updates,
               'delete': _deletes,
               'add': _adds,
               'source': b2.target_collection.name,
               'timestamp': get_timestamp()}
    return changes
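Neither this example nor Example #1 shows _diff_doc_inner_worker2. A plausible sketch, assuming it walks the shared ids and reports the documents whose content differs between the two backends; get_from_id is a hypothetical accessor, and the real worker is not part of this listing:

# Hypothetical sketch of the per-document diff worker; the real
# _diff_doc_inner_worker2 is not shown anywhere on this page.
def _diff_doc_inner_worker2(b1, b2, id_list, fastdiff=False):
    updates = []
    for _id in id_list:
        doc_old = b1.get_from_id(_id)   # assumption: backends expose get_from_id
        doc_new = b2.get_from_id(_id)
        if doc_old != doc_new:
            updates.append(_id)         # sketch: record only the changed id
    return updates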
Example #5
def _download(__metadata__):
    from utils.dataload import download as _download

    output_folder = os.path.join(os.path.split(DATA_FOLDER)[0], get_timestamp())
    for species in ['human', 'mouse', 'yeast']:
        url = __metadata__['__url_{}__'.format(species)]
        output_file = 'CPDB_pathways_genes_{}.tab'.format(species)
        _download(url, output_folder, output_file)
Example #6
def objective_function(samp):
    current_time = get_timestamp()
    k = config.number_of_cross_validation_parts
    yaml_scheme_path = config.yaml_skelton_path
    data_scheme_yaml = config.data_yaml_scheme
    dataset_dict = config.data_dict
    seed = config.seed
    return CrossValidator.run(k=k, model_dictionary=samp, data_yaml_scheme_path=data_scheme_yaml,
                              dataset_files=dataset_dict, seed=seed)
Example #7
 def switch_collection(self):
     '''after a successful loading, rename temp_collection to the regular collection name,
        and rename the existing collection to a temp name for archiving purposes.
     '''
     if self.temp_collection and self.temp_collection.count() > 0:
         if self.src_db[self.src_name].count() > 0:
             new_name = '_'.join([self.src_name, 'archive', get_timestamp(), get_random_string()])
             self.src_db[self.src_name].rename(new_name, dropTarget=True)
         self.temp_collection.rename(self.src_name)
     else:
         return None
Example #8
 def test_2_didsend_more(self):
     receivers = {
         Bob.mainaddress(): 100000,
         'Invalid_address_' + common.get_timestamp(): 100001,
         Dale.mainaddress(): 100002,
         Eric.did_symbol: 100003,
     }
     # one of the receivers is an invalid address / did
     ec, message = mvs_rpc.didsendmore(Alice.name, Alice.password,
                                       receivers, Alice.did_symbol)
     self.assertEqual(ec, 7006, message)
Example #9
 def switch_collection(self):
     '''after a successful loading, rename temp_collection to the regular collection name,
        and rename the existing collection to a temp name for archiving purposes.
     '''
     if self.temp_collection and self.temp_collection.count() > 0:
         if self.collection.count() > 0:
             # renaming existing collections
             new_name = '_'.join([self.__collection__, 'archive', get_timestamp(), get_random_string()])
             self.collection.rename(new_name, dropTarget=True)
         self.temp_collection.rename(self.__collection__)
     else:
         print("Error: load data first.")
Example #10
def make_plots(y):
    import matplotlib.pyplot as plt
    from os.path import join
    import sys
    sys.path.append('..')
    import configuration.model as config
    from utils.common import get_timestamp
    timestamp = get_timestamp()
    plt.plot(range(1, len(y)+1), y, 'r-')
    plt.xlabel("number of model seen by hyperopt")
    plt.ylabel("misclass error")
    plt.savefig(join(config.path_for_storing, timestamp+'hyp_model_misclass.png'), bbox_inches='tight')
    plt.clf()
Example #11
def backup_src_configs():
    import json
    import os
    from utils.common import send_s3_file, get_timestamp, DateTimeJSONEncoder

    db = get_src_db()
    for cfg in ['src_dump', 'src_master', 'src_build']:
        xli = list(db[cfg].find())
        bakfile = '/tmp/{}_{}.json'.format(cfg, get_timestamp())
        with open(bakfile, 'w') as bak_f:
            json.dump(xli, bak_f, cls=DateTimeJSONEncoder, indent=2)
        bakfile_key = 'genedoc_src_config_bk/' + os.path.split(bakfile)[1]
        print('Saving to S3: "{}"... '.format(bakfile_key), end='')
        send_s3_file(bakfile, bakfile_key, overwrite=True)
        os.remove(bakfile)
        print('Done.')
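json.dump needs the custom DateTimeJSONEncoder here because documents pulled from MongoDB may contain datetime values, which the stock encoder rejects. A plausible sketch of such an encoder, assuming ISO-8601 output; the real utils.common class is not shown in this listing:

# Hypothetical sketch of DateTimeJSONEncoder; the actual utils.common
# class may differ in its output format.
import json
from datetime import datetime

class DateTimeJSONEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, datetime):
            return o.isoformat()        # serialize datetimes as ISO-8601 strings
        return super().default(o)       # defer everything else to the base class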
Example #12
def backup_src_configs():
    import json
    import os
    from utils.common import send_s3_file, get_timestamp, DateTimeJSONEncoder

    db = get_src_db()
    for cfg in ['src_dump', 'src_master', 'src_build']:
        xli = list(db[cfg].find())
        bakfile = '/tmp/{}_{}.json'.format(cfg, get_timestamp())
        with open(bakfile, 'w') as bak_f:
            json.dump(xli, bak_f, cls=DateTimeJSONEncoder, indent=2)
        bakfile_key = 'genedoc_src_config_bk/' + os.path.split(bakfile)[1]
        print('Saving to S3: "{}"... '.format(bakfile_key), end='')
        send_s3_file(bakfile, bakfile_key, overwrite=True)
        os.remove(bakfile)
        print('Done.')
Example #13
    def test_7_change_did(self):
        '''modify did between Zac's addresses'''
        temp_did = "ZAC.DIID@" + common.get_timestamp()
        Alice.send_etp(Zac.mainaddress(), 10**8)
        Alice.mining()
        ec, message = mvs_rpc.register_did(Zac.name, Zac.password,
                                           Zac.mainaddress(), temp_did)
        self.assertEqual(ec, 0, message)
        Alice.mining()

        # not enough balance: unspent = 0, payment = 10000
        ec, message = mvs_rpc.change_did(Zac.name, Zac.password,
                                         Zac.addresslist[1], temp_did)
        self.assertEqual(ec, 3302, message)

        Alice.send_etp(Zac.addresslist[1], 10**4)
        Alice.mining()

        ec, message = mvs_rpc.change_did(Zac.name, Zac.password,
                                         Zac.addresslist[1], temp_did)
        self.assertEqual(ec, 0, message)
        Alice.mining()

        ec, message = mvs_rpc.list_dids(Zac.name, Zac.password)
        self.assertEqual(ec, 0, message)

        self.assertEqual(message['dids'][0]['symbol'], temp_did, message)
        self.assertEqual(message['dids'][0]['address'], Zac.addresslist[1],
                         message)

        # confirm the modification procedure by list_didaddresses
        ec, message = mvs_rpc.list_didaddresses(temp_did)
        self.assertEqual(ec, 0, message)

        self.assertEqual(message['addresses'][0]["address"],
                         Zac.addresslist[1])
        self.assertEqual(message['addresses'][0]["status"], "current")

        self.assertEqual(message['addresses'][1]["address"],
                         Zac.addresslist[0])
        self.assertEqual(message['addresses'][1]["status"], "history")
Example #14
def get_sample_experiment():
    from hyperopt.pyll.stochastic import sample
    from pylearn2.config import yaml_parse
    from os.path import join
    import sys
    sys.path.append('..')
    from hyperopt_api.parser import build
    from yaml_parser import yaml_parser as yp
    from hyperopt_api.search_space import get_search_space
    import configuration.model as config
    from utils.common import get_timestamp

    # prepare all variables that don't need to be updated with each iteration
    spa = get_search_space()    # define search space over possible models

    path = config.data_path

    # obtain the yaml skeleton
    with open(config.yaml_skelton_path) as f:
        default_string = f.read()

    samp = sample(spa)  # generate sample (will give a description of a model)
    mod = build(samp)   # based on the generated description, build an object that fits the yaml parser

    # define weight decay parameters. They depend on the number of layers (there is one parameter for each layer)
    weight_decay_coeffs = yp.parse_weight_decay(mod)

    # generate a filename to store the best model
    pkl_filename = join(config.path_for_storing, get_timestamp() + "_best.pkl")

    # create dictionary with hyper parameters
    hyper_params = {'model': yp.parse_to_yaml(mod), 'path': yp.parse_to_yaml(path),
                    'weight_decay_coeffs': weight_decay_coeffs, 'pkl_filename': pkl_filename}
    # fill the yaml skeleton with hyperparameters
    yaml_string = default_string % hyper_params

    network = yaml_parse.load(yaml_string)

    return network
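The skeleton is filled via plain %-style string interpolation: the file on disk holds named %(...)s placeholders that are substituted before the YAML is parsed. A minimal illustration of the mechanism, with placeholder names invented for the example:

# %-style templating as used above, with made-up placeholder names
default_string = "model: %(model)s\nsave_path: %(pkl_filename)s\n"
yaml_string = default_string % {'model': 'mlp', 'pkl_filename': '/tmp/best.pkl'}
print(yaml_string)  # model: mlp
                    # save_path: /tmp/best.pkl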
Example #15
__author__ = 'agnieszka'

from os.path import join
from pylearn2.config import yaml_parse
import sys
sys.path.append('..')
from utils.common import notify
import configuration.model as config
import models
import yaml_parser as yp
from utils.common import get_timestamp

current_time = get_timestamp()

# setting parameters of convolutional layer
con = models.ConvElemwise()
con.layer_name = "h0"
con.output_channels = 1
con.kernel_shape = [2, 2]
con.nonlinearity = models.TanhConvNonlinearity()
con.irange = 0.1
con.pool_shape = [2, 3]

# setting parameters of softmax layer
sof = models.Softmax()
sof.n_classes = 2
sof.layer_name = "softmax"
sof.irange = 0.1

# creating list of layers
layers = [con, sof]
Example #16
# imports reconstructed from Example #14 (this snippet is an excerpt from the same project);
# Terminal is assumed to come from the blessings package, matching the t.bold_* usage below
from os.path import join
import sys
sys.path.append('..')
from hyperopt.pyll.stochastic import sample
from blessings import Terminal
from hyperopt_api.parser import build
from yaml_parser import yaml_parser as yp
from hyperopt_api.search_space import get_search_space
import configuration.model as config
from utils.common import get_timestamp

t = Terminal()

# prepare all variables that don't need to be updated with each iteration
spa = get_search_space()    # define search space over possible models

# define data paths
path = config.data_path

# obtain the yaml skeleton
with open(config.yaml_skelton_path) as f:
    default_string = f.read()

# for each sample that will be generated from the search space
for i in xrange(20):
    timestamp = get_timestamp()

    print t.bold_red('ITERATION:'), t.bold_red(str(i)), "started at: ", timestamp

    samp = sample(spa)  # generate sample (will give a description of a model)
    print t.bold_cyan('SAMP'), samp

    mod = build(samp)   # based on the generated description, build an object that fits the yaml parser
    print t.bold_blue('MODEL'), mod

    # define weight decay parameters. They depend on the number of layers (there is one parameter for each layer)
    weight_decay_coeffs = yp.parse_weight_decay(mod)

    # generate a filename to store the best model
    pkl_filename = join(config.path_for_storing, timestamp+"best_"+str(i)+'_'+".pkl")
Example #17
 def _get_target_name(self):
     return 'genedoc_{}_{}_{}'.format(self._build_config['name'],
                                      get_timestamp(), get_random_string()).lower()
Example #18
    def test_9_change_did_multisig(self):
        did_normal_symbal = "Zac@" + common.get_timestamp()
        Alice.send_etp(Zac.mainaddress(), 10**8)
        Alice.mining()

        ec, message = mvs_rpc.register_did(Zac.name, Zac.password,
                                           Zac.mainaddress(),
                                           did_normal_symbal)
        self.assertEqual(ec, 0, message)
        Alice.mining()

        group = [Alice, Cindy, Dale, Frank, Zac]

        did_symbol = '@'.join(r.name for r in group) + common.get_timestamp()
        for i, role in enumerate(group):
            addr = role.new_multisigaddress(
                "Alice & Cindy & Zac's Multisig-DID",
                group[:i] + group[i + 1:], 3)

        Alice.send_etp(addr, (10**9))
        Alice.mining()

        ec, message = mvs_rpc.register_did(group[0].name, group[0].password,
                                           addr, did_symbol)
        self.assertEqual(ec, 0, message)

        ec, message = mvs_rpc.sign_multisigtx(group[1].name, group[1].password,
                                              message)
        self.assertEqual(ec, 0, message)

        ec, message = mvs_rpc.sign_multisigtx(group[2].name, group[2].password,
                                              message, True)
        self.assertEqual(ec, 0, message)
        Alice.mining()

        # did does not exist
        ec, message = mvs_rpc.change_did(Zac.name, Zac.password,
                                         Zac.mainaddress(),
                                         common.get_timestamp())
        self.assertEqual(ec, 7006, message)

        # did is not owned by the account
        ec, message = mvs_rpc.change_did(Bob.name, Bob.password,
                                         Bob.mainaddress(), did_symbol)
        self.assertEqual(ec, 7009, message)

        # did address invalid
        ec, message = mvs_rpc.change_did(Zac.name, Zac.password, "Test" * 20,
                                         did_symbol)
        self.assertEqual(ec, 4012, message)

        # address not owned by the account
        ec, message = mvs_rpc.change_did(Zac.name, Zac.password,
                                         Bob.mainaddress(), did_symbol)
        self.assertEqual(ec, 4003, message)

        # address is already bound to a did
        ec, message = mvs_rpc.change_did(Zac.name, Zac.password,
                                         Zac.mainaddress(), did_symbol)
        self.assertEqual(ec, 7002, message)

        # not enough balance: unspent = 0, payment = 10000
        ec, message = mvs_rpc.change_did(Zac.name, Zac.password,
                                         Zac.addresslist[1], did_symbol)
        self.assertEqual(ec, 3302, message)

        Alice.send_etp(Zac.addresslist[1], 10**5)
        Alice.mining()

        # signature count must be larger than 3
        ec, message = mvs_rpc.change_did(Zac.name, Zac.password,
                                         Zac.addresslist[1], did_symbol)
        self.assertEqual(ec, 0, message)

        # cannot transfer to another multi-signature address
        ec, message = mvs_rpc.sign_multisigtx(group[0].name, group[0].password,
                                              message, True)
        self.assertEqual(ec, 5304, message)

        group_new = [Bob, Dale, Zac]
        for i, role in enumerate(group_new):
            addr_new = role.new_multisigaddress(
                "Bob & Dale & Zac's Multisig-DID",
                group_new[:i] + group_new[i + 1:], 2)

        Alice.send_etp(addr_new, (10**6))
        Alice.mining()
        ec, message = mvs_rpc.change_did(Zac.name, Zac.password, addr_new,
                                         did_symbol)
        self.assertEqual(ec, 7010, message)

        ec, message = mvs_rpc.change_did(Zac.name, Zac.password,
                                         Zac.addresslist[1], did_symbol)
        self.assertEqual(ec, 0, message)

        ec, message = mvs_rpc.sign_multisigtx(group[0].name, group[0].password,
                                              message)
        self.assertEqual(ec, 0, message)

        ec, message = mvs_rpc.sign_multisigtx(group[1].name, group[1].password,
                                              message, True)
        self.assertEqual(ec, 0, message)
        self.assertNotEqual(
            Zac.get_didaddress(symbol=did_symbol), Zac.addresslist[1],
            "Failed when modifying did address from a multi-signature to a normal address"
        )
Example #19
    def run(k, model_dictionary, data_yaml_scheme_path, dataset_files, seed=1337):
        print "CV_ENTER"
        assert k >= 3  # we need to have at least 3 sets: train, validation, test

        # obtain the yaml skeleton
        with open(config.yaml_skelton_path) as f:
            default_string = f.read()

        with open(data_yaml_scheme_path) as f:
            data_yaml_scheme = f.read()

        list_of_scores = []
        for i in xrange(k):
            current_time = get_timestamp()

            # calculate which parts to choose
            validation_part = [i]
            test_part = [0]
            if i < k - 1:
                test_part = [i + 1]
            train_parts = [x for x in xrange(k) if x not in validation_part and x not in test_part]

            train_data_string = data_yaml_scheme % {
                "path": dataset_files["labeled_paths"],
                "y_val": dataset_files["labeled_values"],
                "cv": [k, train_parts],
                "seed": seed,
                "middle_path": dataset_files["middle_paths"],
                "middle_val": dataset_files["middle_values"],
            }

            # we don't want any unlabelled examples in the validation or test data
            validation_data_string = data_yaml_scheme % {
                "path": dataset_files["labeled_paths"],
                "y_val": dataset_files["labeled_values"],
                "cv": [k, validation_part],
                "seed": seed,
                "middle_path": [],
                "middle_val": [],
            }

            test_data_string = data_yaml_scheme % {
                "path": dataset_files["labeled_paths"],
                "y_val": dataset_files["labeled_values"],
                "cv": [k, test_part],
                "seed": seed,
                "middle_path": [],
                "middle_val": [],
            }

            mod = build(model_dictionary)  # based on the generated description, build an object that fits the yaml parser
            print t.bold_cyan("SAMP"), model_dictionary
            print t.bold_blue("MODEL"), mod

            # define weight decay parameters. They depend on the number of layers (there is one parameter for each layer)
            weight_decay_coeffs = yp.parse_weight_decay(mod)

            # generate a filename to store the best model
            pkl_filename = join(config.path_for_storing, current_time + "_best.pkl")

            # create dictionary with hyper parameters
            hyper_params = {
                "model": yp.parse_to_yaml(mod),
                "train_data": train_data_string,
                "validation_data": validation_data_string,
                "weight_decay_coeffs": weight_decay_coeffs,
                "pkl_filename": pkl_filename,
            }
            yaml_string = default_string % hyper_params

            network = None
            # misclass_error = 1
            # f1_score_error = 1
            roc_score = 0

            try:
                # create the model based on a yaml
                network = yaml_parse.load(yaml_string)
                print t.bold_magenta("NETWORK"), type(network)
                # train the model
                network.main_loop()

            except BaseException:  # TODO: this exception is too broad
                # if exception was thrown save yaml of a model that generated that exception
                with open(current_time + ".yaml", "w") as YAML_FILE:
                    YAML_FILE.write(yaml_string)
                # write down errors description to a file
                with open(current_time + ".error", "w") as ERROR_FILE:
                    ERROR_FILE.write(traceback.format_exc())

            finally:
                if network is not None:
                    try:
                        from numpy import argmax

                        # run predictions to obtain score for this model
                        test_data = yaml_parse.load(test_data_string)
                        best_model = serial.load("best_model_roc_youden.model")

                        roc_score_, threshold = roc_score_threshold_getter(network)

                        predictor = Predictor(best_model)
                        predictions = predictor.get_predictions(test_data.X)

                        fp = 0
                        fn = 0
                        tp = 0
                        tn = 0

                        for tr, pre in zip(test_data.y, predictions):
                            if pre[0][1] > threshold:
                                if tr[0] == 1:
                                    tp += 1
                                else:
                                    fp += 1
                            else:
                                if tr[0] == 0:
                                    tn += 1
                                else:
                                    fn += 1
                        roc_score = (float(tp) / (float(tp) + fn)) - (float(fp) / (float(fp) + tn))
                        list_of_scores.append(1 - roc_score)  # we want to maximise this score, hyperopt minimises

                        print t.bold_red("_ROC: Best roc score for this model: " + str(roc_score))
                        print t.bold_red("_ROC: Obtained for threshold: " + str(threshold))
                        precision = float(tp) / (tp + fp)
                        recall = float(tp) / (tp + fn)
                        f1score = 0
                        if precision + recall != 0:
                            f1score = 2 * precision * recall / (precision + recall)
                        print "precision:", precision
                        print "recall:", recall
                        print "f1score", f1score

                    except BaseException:  # TODO: this exception is too broad
                        with open(current_time + "_ROC_error", "w") as ERROR_FILE:
                            ERROR_FILE.write(traceback.format_exc())

                # print t.bold_red("M_01: misclass_error for this model: "+str(misclass_error))
                # return misclass_error
                print t.bold_red("M_02: roc score error for this model: " + str(roc_score))
                # list_of_scores.append(f1_score_error)
        m = mean(list_of_scores)
        print "CV_MEAN mean(1-ROC) on this architecture:", m
        return m
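The score computed in the finally block is Youden's J statistic, J = tp/(tp+fn) - fp/(fp+tn), i.e. true-positive rate minus false-positive rate; since hyperopt minimizes its objective, the code stores 1 - J. A small numeric check of the formula:

# Youden's J on a toy confusion matrix
tp, fn, fp, tn = 8, 2, 1, 9
j = tp / (tp + fn) - fp / (fp + tn)   # 0.8 - 0.1 = 0.7
assert abs(j - 0.7) < 1e-9            # hyperopt would then minimize 1 - 0.7 = 0.3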
Example #20
def diff_collections2(b1, b2, result_dir, use_parallel=True, step=10000):
    '''
    b2 is new collection, b1 is old collection
    '''
    if use_parallel:
        import multiprocessing
        from functools import partial
    DATA_FOLDER = result_dir
    data_new = doc_feeder(b2.target_collection,
                          step=step,
                          inbatch=True,
                          fields={'_id': 1})
    data_old = doc_feeder(b1.target_collection,
                          step=step,
                          inbatch=True,
                          fields={'_id': 1})
    cnt = 0
    cnt_update = 0
    cnt_add = 0
    cnt_delete = 0
    _timestamp = get_timestamp()
    if not os.path.exists(DATA_FOLDER):
        os.mkdir(DATA_FOLDER)
    for batch in data_new:
        cnt += 1
        id_list_new = [doc['_id'] for doc in batch]
        ids_common = [
            doc['_id']
            for doc in b1.target_collection.find({'_id': {
                '$in': id_list_new
            }}, {'_id': 1})
        ]
        id_in_new = list(set(id_list_new) - set(ids_common))
        _updates = []
        if len(ids_common) > 0:
            if use_parallel:
                # guard against step == 0 when there are fewer ids than CPU cores
                step = max(1, len(ids_common) // multiprocessing.cpu_count())
                task_list = [
                    ids_common[i:i + step]
                    for i in range(0, len(ids_common), step)
                ]
                pool = multiprocessing.Pool()
                partial_worker = partial(_diff_parallel_worker,
                                         b1.target_collection.name,
                                         b2.target_collection.name)
                results = pool.map(partial_worker, task_list)
                pool.close()
                pool.join()
                for result in results:
                    _updates += result
            else:
                _updates = _diff_doc_inner_worker2(b1, b2, list(ids_common))
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        _result = {
            'add': id_in_new,
            'update': _updates,
            'delete': [],
            'source': b2.target_collection.name,
            'timestamp': _timestamp
        }
        if len(_updates) != 0 or len(id_in_new) != 0:
            dump(_result, file_name)
            print("(Updated: {}, Added: {})".format(len(_updates),
                                                    len(id_in_new)),
                  end='')
            cnt_update += len(_updates)
            cnt_add += len(id_in_new)
    print(
        "Finished calculating diff for the new collection. Total number of docs updated: {}, added: {}"
        .format(cnt_update, cnt_add))
    print("=" * 100)
    for _batch in data_old:
        cnt += 1
        id_list_old = [_doc['_id'] for _doc in _batch]
        ids_common = [
            doc['_id']
            for doc in b2.target_collection.find({'_id': {
                '$in': id_list_old
            }}, {'_id': 1})
        ]
        id_in_old = list(set(id_list_old) - set(ids_common))
        _result = {
            'delete': id_in_old,
            'add': [],
            'update': [],
            'source': b2.target_collection.name,
            'timestamp': _timestamp
        }
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        if len(id_in_old) != 0:
            dump(_result, file_name)
            print("(Deleted: {})".format(len(id_in_old)), end='')
            cnt_delete += len(id_in_old)
    print(
        "Finished calculating diff for the old collection. Total number of docs deleted: {}"
        .format(cnt_delete))
    print("=" * 100)
    print("Summary: (Updated: {}, Added: {}, Deleted: {})".format(
        cnt_update, cnt_add, cnt_delete))
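One pitfall in the parallel branch, fixed above: when a batch holds fewer common ids than there are CPU cores, integer division yields step == 0 and range(0, n, 0) raises ValueError, hence the max(1, ...) clamp. A standalone illustration of the chunking:

# chunking a short id list across workers without a zero step
import multiprocessing

ids = list(range(7))                  # fewer ids than cores on most machines
step = max(1, len(ids) // multiprocessing.cpu_count())
chunks = [ids[i:i + step] for i in range(0, len(ids), step)]
assert sum(chunks, []) == ids         # chunks cover every id exactly once, in order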