Example #1
def student2enrollid_time():

    '''initial'''
    student2enrollid_time = {}
    enrollid_second = cPickle.load(open('enrollid_second.cPickle'))
    student_enrollment = cPickle.load(open('student_enrollment.cPickle'))

    for key,values in student_enrollment.items():
        # print key,values
        max_time = 0
        for enrollid in values:
            seconds = enrollid_second.get(enrollid)
            print seconds
            if seconds > max_time:
                max_time = seconds

        for enrollid in values:
            student2enrollid_time[enrollid] = max_time
        # raw_input(student2enrollid_time)

    w = open("student2enrollid_time.cPickle",'w')
    cPickle.dump(student2enrollid_time,w)
    w.close()

    w = open('student2enrollid_time.txt','w')
    w.write('enrollment_id,student2enrollid_time\n')
    for key in sorted(student2enrollid_time.iterkeys()):
        w.write(str(key))
        w.write(',')
        w.write(str(student2enrollid_time[key]))
        w.write('\n')
    w.close()
Example #2
def loadArray(dirpath):
    # regex matching the pickled label files
    pattern = '.+\.label'
    # suffix of the corresponding spectrogram array files
    another = 'array'
    names = os.listdir(dirpath)
    random.shuffle(names)
    for name in names:
        if re.match(pattern,name) != None:
            #print name
            folder,prename,num,suffix = name.split('.')
            target = folder + '.' + prename + '.' + num + '.' + another
            targetpath = dirpath + '/' + target
            # find another suffix data file
            # meanwhile examine the num, length of spectrogram = length of label
            if os.path.exists(targetpath):
                # extract objects from the files (use the full paths, not the bare names)
                with open(targetpath,'rb') as f:
                    spectroArray = cPickle.load(f)
                    # GPU default type is float32
                    spectroArray = np.float32(spectroArray)
                with open(dirpath + '/' + name,'rb') as f:
                    labelArray = cPickle.load(f)
                    # label should be int type
                    labelArray = np.int32(labelArray)
                yield spectroArray,labelArray,int(num)
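
A minimal usage sketch for the generator above (not part of the original snippet); the directory name is hypothetical and assumes matching '.label'/'.array' pickle pairs exist in it:

# iterate over matched spectrogram/label pairs; 'features/train' is an illustrative path
for spectroArray, labelArray, num in loadArray('features/train'):
    print('spectrogram %s, labels %s, frames %d' % (spectroArray.shape, labelArray.shape, num))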
Example #3
def load_knowledge(knowledge):
    #existing naive_bayes object and keyword list
    nb=None
    kw=list()
    if knowledge is not None:
        if not os.path.isdir(knowledge):
            print("Knowledge bust be a directory")
            exit()

    else:
        knowledge = os.path.expanduser('~/.shakespeare')
        #make this directory if it doesn't already exist
        if not (os.path.exists(knowledge)):
            print('Creating directory: {}'.format(knowledge))
            os.mkdir(knowledge)

    kfiles = glob.glob(knowledge+'/*')
    if os.path.join(knowledge,'nb.p') in kfiles:
        nb=pickle.load(open(os.path.join(knowledge,'nb.p')))
    else:
        print("Warning: knowledge dir {} does not contain nb.p (pickled naive bayes object)".format(knowledge))

    if os.path.join(knowledge,'kw.p') in kfiles:
        kw=pickle.load(open(os.path.join(knowledge,'kw.p')))
    else:
        print("Warning: knowledge dir {} does not contain kw.p (pickled keyword list)".format(knowledge))

    return(nb,kw, knowledge)
def load_vocabulary():
    """
    Unpickle and return the content of the file
    `free_associations_vocabulary` generated by `process_data.py`.

    Output
    ------
        W:          association matrix
        id2voc:     dictionary, keys are word ids and values words
        voc2id:     dictionary, keys are words and values word ids
    """

    # path to the free association norms data (relative to this script)
    path = '../../data/associationmatrices/'
    filename = 'association_norms_symm'

    try:
        with open(path+filename, 'rb') as f:
            id2voc = pickle.load(f)
            voc2id = pickle.load(f)
            Wsparse = pickle.load(f)
    except IOError:
        raise IOError('Association matrix "' + filename + '" not found' +
                      ' in ' + path + '. To generate the matrix run ' +
                      'generate_association_matrix.py')

    # convert to dense matrix (stored as sparse for memory reasons)
    W = np.asarray(Wsparse.todense())

    # normalize weights to [0-1] interval
    W /= 2
    np.fill_diagonal(W, 1.)

    return W, id2voc, voc2id
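
A brief, illustrative use of the loader above; it assumes the pickled matrix file exists, that the chosen word is actually in the vocabulary, and that word ids index the rows of W:

W, id2voc, voc2id = load_vocabulary()
word = 'dog'  # hypothetical query word
if word in voc2id:
    idx = voc2id[word]
    # indices of the five strongest associations (the diagonal holds the self-association of 1.0)
    top = W[idx].argsort()[::-1][:5]
    print([id2voc[i] for i in top])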
Example #5
    def __init__(self, descrs, aggFunc='mean', caching=True):
        self.reportMissing = True
        self.caching = caching
        self.cached_file_name = None

        if isinstance(descrs, str):
            self.descrs_file = descrs
            self.descrs = pickle.load(open(self.descrs_file, 'rb'))
            self.cached_file_name = '%s-%s.pkl' % (self.descrs_file, aggFunc)
        elif isinstance(descrs, dict):
            self.descrs = descrs

        if self.caching and self.cached_file_name is not None and os.path.exists(self.cached_file_name):
            self.space = pickle.load(open(self.cached_file_name, 'rb'))
        elif aggFunc in ['mean', 'max']:
            if aggFunc == 'mean':
                f = self.aggMean
            elif aggFunc == 'max':
                f = self.aggMax

            self.space = {}
            for k in self.descrs.keys():
                vecs = self.descrs[k].values()
                if len(vecs) < 2:
                    if self.reportMissing:
                        print('Warning: Not enough vectors for key %s - skipping' % k)
                    continue
                self.space[k] = f(vecs)

            if self.caching and self.cached_file_name is not None:
                pickle.dump(self.space, open(self.cached_file_name, 'wb'))
def readTurbostatDataFile(filename,newFile=None,verbose=False,needsReload=False):
	if newFile == None:
		newFile = filename+"_tsdata.gz"

	if not needsReload:
		try:
			if verbose:
				print "Loading data from power file..."
			fp = gzip.open(newFile,"rb")
			data = pickle.load(fp)
			colHeaders = pickle.load(fp)
			fp.close()

		except IOError as err:
			if verbose:
				print "Does not exist (%s). Attempting to create..."%(err)
			needsReload = True

	if needsReload:
		data,colHeaders = generateTurbostatDataFile(filename, newFile, verbose)

	if verbose:
		print "Got %d blocks."%(data.shape[0])

	return (data,colHeaders)
Example #7
def loadState():
    import gzip
    global outputFile
    #f = gzip.GzipFile(outputFile,'r')
    
    try:
        f = open(outputFile, 'r')
    except:
        return "Humane Document file does not exist."

    try:
        state = cPickle.load(f)

        changeList = []
        while 1:
            try:
                changes = cPickle.load(f)
                changeList.extend(changes)
            except EOFError:
                break
    except:
        return "Error loading Humane Document."

    f.close()

    state.restore()
    _applyChanges(changeList)
    return "Loaded file correctly."
Example #8
 def open(self, dirname, filename):
     self.filename = filename
     self.dirname = dirname
     self.status.SetStatusText("Opening: {0}".format(filename), 0)
     try:
         handle = open(os.path.join(dirname, filename), 'rb')
         header = cPickle.load(handle)
         if header != FILE_HEADER:
             wx.MessageBox('Invalid or corrupted file', 'Warning',
                       wx.OK | wx.ICON_WARNING)
             self.status.SetStatusText("Open failed", 0)
             return
         _version = cPickle.load(handle)
         self.settings.start = cPickle.load(handle)
         self.settings.stop = cPickle.load(handle)
         self.spectrum = cPickle.load(handle)
     except:
         wx.MessageBox('File could not be opened', 'Warning',
                       wx.OK | wx.ICON_WARNING)
         self.status.SetStatusText("Open failed", 0)
         return
     self.isSaved = True
     self.set_range()
     self.draw_plot()
     handle.close()
     self.status.SetStatusText("Finished", 0)
Example #9
def getTrainTest(flag , date = '2016-01-20'):
    if configs['save_fea'] == True:
        fea = pickle.load(open(configs['train_fea'] , 'r'))
        #print fea.dtypes
        test_fea = pickle.load(open(configs['test_fea'] , 'r'))
        #print test_fea.dtypes
    else:
        fea = Fea.getTrainFea()
        test_fea = Fea.getTestFea()
    if flag == 'online':
        train_x = fea[fea_names].values
        train_y = fea[y_fea_names]
        test_x = test_fea[fea_names].values
        test_y = test_fea[['poi' , 'key']]
        train = [train_x , train_y]
        test = [test_x , test_y]
        return train , test
    else:
        train_x = fea[fea.date < date][fea_names].values
        train_y = fea[fea.date < date][y_fea_names]
        test_x = fea[fea.date >= date][fea_names].values
        test_y = fea[fea.date >= date][y_fea_names]
        train = [train_x , train_y]
        test = [test_x , test_y]
        return train , test
Example #10
 def plugin_init(self,mainframe,app_init):
     self.mainframe=mainframe
     self.worker=mainframe.tm
     panel=mainframe.float_mgr.add_panel('Map','Show or hide the map panel (use it to view or set the location of your images)','picty-map')
     self.mapframe=MapFrame(self)
     panel.vbox.pack_start(self.mapframe)
     places = {'Home':(0.0,0.0,1)}
     latlon = None
     place = None
     source = None
     data = settings.load_addon_prefs('map_plugin_settings')
     if data:
         places = data['places']
         source = data['source']
         place = data['place']
     else:
         try:
             f=open(os.path.join(settings.data_dir,'map-places'),'rb')
             version=cPickle.load(f)
             places=cPickle.load(f)
             if version>='0.1.1':
                 source=cPickle.load(f)
             f.close()
         except:
             log_err('No map-places file found')
     self.mapframe.set_places(places)
     self.mapframe.set_place(place)
     if source is not None:
         self.mapframe.set_preferred_source(source)
     ##TODO: should update map images whenever there are relevant collection changes (will need to maintain list of displayed images) -- may be enough to trap view add/remove and GPS metadata changes
     self.mainframe.connect("view-rebuild-complete",self.view_rebuild_complete)
Example #11
def main():
    args = parse_args()
    option = json.load(open(args.option))
    prefix = args.prefix
    sufix = args.sufix
    data_type = args.data_type
    event_fn = args.event_fn
    word2vec_file = args.word2vec
    exp_name = args.exp_name

    max_sens = option["max_sens"]
    max_words = option["max_words"]
    padding = option["padding"]

    class2id = {k.strip():i for i,k in enumerate(open(event_fn))}

    dataset = nn.load_event_dataset(prefix, sufix)
    wf = open(word2vec_file)
    embedding = cPickle.load(wf)
    word2id = cPickle.load(wf)

    digit_dataset = nn.transform_event_dataset(dataset, word2id, class2id, data_type, max_sens, max_words, padding)

    model = GICF(option)
    model.run_experiment(digit_dataset, embedding, exp_name)
Example #12
 def copy_flow(self, key):
     trained_flow_path = "%s/%s.pickle" % (self.directory , "train_flow_"+ key)
     prewindowing_flow_path = "%s/%s.pickle" % (self.directory , "prewindowing_flow_"+ key)
     prewindowing_offline_flow_path = "%s/%s.pickle" % (self.directory , "prewindowing_offline_flow_"+ key)
     prewindowed_train_flow_path = "%s/%s.pickle" % (self.directory , "prewindowed_train_flow_"+ key)
     # using the trained flow for adaptation
     if os.path.exists(trained_flow_path):
         shutil.copyfile(trained_flow_path, "%s/%s.pickle" % (self.directory, "abri_flow_" + key + "_unadapted"))
     # using the prewindowing flow and prewindowed-trained flow for adaptation
     else:
         flh_1 = {}
         flh_2 = {}
         prewindowing_flow = {}
         postprocessing_flow = {}
         unadapted_flow = {}
         if os.path.exists(prewindowing_flow_path):
             flh_1[key] = open(prewindowing_flow_path, 'r')
         elif os.path.exists(prewindowing_offline_flow_path):
             flh_1[key] = open(prewindowing_offline_flow_path, 'r')
         flh_2[key] = open(prewindowed_train_flow_path, 'r')
         prewindowing_flow[key] = cPickle.load(flh_1[key])
         prewindowing_flow[key].pop(-1)
         prewindowing_flow[key].pop(-1)
         postprocessing_flow[key] = cPickle.load(flh_2[key])
         postprocessing_flow[key].pop(0)
         postprocessing_flow[key].pop(0)
         unadapted_flow[key] = prewindowing_flow[key] + postprocessing_flow[key]
         flh_1[key].close()
         flh_2[key].close()
         unadapted_file = open("%s/%s.pickle" % (self.directory, "abri_flow_" + key + "_unadapted"), 'w+')
         cPickle.dump(unadapted_flow[key], unadapted_file)
         unadapted_file.close()
 def loadTree(filename):
   # returns a tuple of the root item and tree object
   f = open(filename, "rb")
   root_item = pickle.load(f)
   tree_object = pickle.load(f)
   f.close()
   return (root_item, tree_object)
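
A complementary save sketch (assumed, not from the source) that writes the two objects in the same order loadTree reads them back:

 def saveTree(filename, root_item, tree_object):
   # dump the root item first, then the tree object, matching loadTree's read order
   f = open(filename, "wb")
   pickle.dump(root_item, f)
   pickle.dump(tree_object, f)
   f.close()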
Example #14
def searchNearestNeighborsOPQ(codeFilename, codebooksFilename, queriesFilename, \
                                     queriesCount, k=10000, threadsCount=30):
    model = pickle.load(open(codebooksFilename, 'r'))
    codebooks = model[0]
    R = model[1]
    M = codebooks.shape[0]
    codebookDim = codebooks.shape[2]
    dim = codebookDim * M
    codebookSize = codebooks.shape[1]
    codes = pickle.load(open(codeFilename, 'r'))
    queries = readXvecs(queriesFilename, dim, queriesCount)
    queries = np.dot(queries, R.T).astype('float32')
    result = np.zeros((queriesCount, k), dtype='int32')
    codeDistances = np.zeros((M, queriesCount, codebookSize),dtype='float32')
    for m in xrange(M):
        subqueries = queries[:,m*codebookDim:(m+1)*codebookDim].copy()
        codeDistances[m,:,:] = ynumpy.cross_distances(codebooks[m], subqueries)
    nearest = np.zeros((queriesCount, k), dtype='int32')
    qidRangeSize = 1
    rangesCount = int(math.ceil(float(queriesCount) / qidRangeSize))
    pool = Pool(threadsCount)
    ans = pool.map(partial(findNearestForRangePQ, \
                           rangeSize=qidRangeSize, codebookDistances=codeDistances, pointsCodes=codes, listLength=k), \
                           range(0, rangesCount))
    pool.close()
    pool.join()
    for i in xrange(len(ans)):
        if ans[i] == None:
            pass
        else:
            qidsCount = ans[i].shape[0]
            nearest[i*qidRangeSize:i*qidRangeSize+qidsCount,:] = ans[i]
    return nearest
Example #15
def load_infnet_from_file(f_name=None, rng=None, Xd=None, \
                          new_params=None):
    """
    Load a clone of some previously trained model.
    """
    assert(not (f_name is None))
    pickle_file = open(f_name)
    self_dot_params = cPickle.load(pickle_file)
    if not (new_params is None):
        for k in new_params:
            self_dot_params[k] = new_params[k]
    self_dot_numpy_param_dicts = cPickle.load(pickle_file)
    self_dot_shared_param_dicts = {'shared': [], 'mu': [], 'sigma': []}
    for layer_group in ['shared', 'mu', 'sigma']:
        for numpy_dict in self_dot_numpy_param_dicts[layer_group]:
            shared_dict = {}
            for key in numpy_dict:
                val = to_fX(numpy_dict[key])
                shared_dict[key] = theano.shared(val)
            self_dot_shared_param_dicts[layer_group].append(shared_dict)
    # now, create a PeaNet with the configuration we just unpickled
    clone_net = InfNet(rng=rng, Xd=Xd, params=self_dot_params, \
            shared_param_dicts=self_dot_shared_param_dicts)
    # helpful output
    print("==================================================")
    print("LOADED InfNet WITH PARAMS:")
    for k in self_dot_params:
        print("    {0:s}: {1:s}".format(str(k), str(self_dot_params[k])))
    print("==================================================")
    return clone_net
def mnist(datasets_dir='/Tmp/kastner'):
    try:
        import urllib
        urllib.urlretrieve('http://google.com')
    except AttributeError:
        import urllib.request as urllib
    url = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
    data_file = os.path.join(datasets_dir, 'mnist.pkl.gz')
    if not os.path.exists(data_file):
        urllib.urlretrieve(url, data_file)

    print('... loading data')
    # Load the dataset
    f = gzip.open(data_file, 'rb')
    try:
        train_set, valid_set, test_set = cPickle.load(f, encoding="latin1")
    except TypeError:
        train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    test_x, test_y = test_set
    test_x = test_x.astype('float32')
    test_x = test_x.astype('float32').reshape(test_x.shape[0], 1, 28, 28)
    test_y = test_y.astype('int32')
    valid_x, valid_y = valid_set
    valid_x = valid_x.astype('float32')
    valid_x = valid_x.astype('float32').reshape(valid_x.shape[0], 1, 28, 28)
    valid_y = valid_y.astype('int32')
    train_x, train_y = train_set
    train_x = train_x.astype('float32').reshape(train_x.shape[0], 1, 28, 28)
    train_y = train_y.astype('int32')
    rval = [(train_x, train_y), (valid_x, valid_y), (test_x, test_y)]
    return rval
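
A short usage sketch (assumes the default dataset directory exists and is writable so the pickle can be downloaded there):

(train_x, train_y), (valid_x, valid_y), (test_x, test_y) = mnist()
print('train %s, valid %s, test %s' % (train_x.shape, valid_x.shape, test_x.shape))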
Example #17
def main():
    args = parse_arguments()
    print "hi from making files", args.make
    if 'graph' in args.make:
        print args.pickle_file
        pham = cPickle.load(open(args.pickle_file.strip('"'), 'rb'))
        graph_start_sites(args, pham, args.dir)

    if 'starts' in args.make:
        phage_genes = cPickle.load(open(args.pickle_file.strip('"'), 'rb'))

        make_suggested_starts(phage_genes, args.phage, args.dir)

    if 'genome' in args.make:
        phage = cPickle.load(open(args.pickle_file.strip('"'), 'rb'))
        make_pham_genome(phage, args.phage, args.phage_length, args.dir)
        make_suggested_starts(phage, args.phage, args.dir)

    if 'text' in args.make:
        print args.pickle_file
        pham = cPickle.load(open(args.pickle_file.strip('"'), 'rb'))
        graph_start_sites(args, pham, args.dir)
        print "phage", args.phage
        if not args.phage:
            print "hello no phage"
            make_pham_text(args, pham, args.pham_no, args.dir, only_pham=True)
        else:
            make_pham_text(args, pham, args.pham_no, args.dir)

    if 'fasta' in args.make:
        pass
Example #18
 def __init__(self,
              b=None,
              eta=1.,
              pa=None,
              q=None,
              x=None,
              y=None,
              load=False):
     self.b = b
     self.eta = eta
     self.pa = pa
     self.q = q
     self.x = x
     self.y = y
     self.nsteps = 1e3
     if load:
         import cPickle
         f = open('powerlaw.alphax', 'rb')
         self.xmodel = cPickle.load(f)
         f.close()
         f = open('powerlaw.alphay', 'rb')
         self.ymodel = cPickle.load(f)
         f.close()
     else:
         self.xmodel = None
         self.ymodel = None
Example #19
def one_model():
    # load feat names
    #feat_names = config.feat_names
    feat_names = ['label']
    model_type = "extratree"
    model_param = config.param_spaces[model_type]

    ## load best params for each model (feat, model)
    #with open("%s/model_best_params" %config.data_folder) as f:
    #    param_best_dic = pickle.load(f)

    ## supply the extra parameter from config.param_spaces
    #for feat in config.feat_names:
    #    for model in config.model_list:
    #        if param_best_dic.has_key("%s_%s"%(feat, model)):
    #            param_space = config.param_spaces[model]
    #            for key in param_space.keys():
    #                if param_best_dic["%s_%s"%(feat, model)].has_key(key) is False:
    #                    param_best_dic["%s_%s"%(feat, model)][key] = param_space[key]
    #print param_best_dic

    # load feat, cross validation
    for iter in range(config.kiter):
        for fold in range(config.kfold):
            for feat in feat_names:
                print "Gen pred for (iter%d, fold%d, %s) cross validation" %(iter, fold, feat)
                with open("%s/iter%d/fold%d/train.%s.feat.pkl" %(config.data_folder, iter, fold, feat), 'rb') as f:
                    [x_train, y_train] = pickle.load(f)
                with open("%s/iter%d/fold%d/valid.%s.feat.pkl" %(config.data_folder, iter, fold, feat), 'rb') as f:
                    [x_test, y_test] = pickle.load(f)
                path = "%s/iter%d/fold%d" %(config.data_folder, iter, fold)
                #train_model(path, x_train, y_train, x_val, y_val, feat, param_best_dic)
                pred_val = hyperopt_library(model_type, model_param, x_train, y_train, x_test, y_test)
                print "ml score is %f" %ml_score(y_test, pred_val)
            break
Example #20
    def load(self, load_dir, load_filename = 'model.pkl'):
        """ load the model """
        print '... loading model'
        save_file = open(os.path.join(load_dir, load_filename),'r')
        args = cPickle.load(save_file)
        self.__init__(
                 seed_params = args['seed_params'],
                 seed_noise = args['seed_noise'],
                 input = args['input'],
                 n_visible= args['n_visible'],
                 n_hidden= args['n_hidden'],
                 #tied_weights = args['tied_weights'],
                 act_enc = args['act_enc'],
                 act_dec = args['act_dec'],
                 W = args['W'],
                 W_prime = args['W_prime'],
                 b = args['b'],
                 b_prime = args['b_prime'])

        self.W.value = cPickle.load(save_file)
        if not self.tied_weights:
            self.W_prime.value = cPickle.load(save_file)
        self.b.value = cPickle.load(save_file)
        self.b_prime.value = cPickle.load(save_file)
        save_file.close()
Example #21
    def test_parser_1(self):
        """Tests the XMLDocParser and SentenceParser subclasses"""

        # Load correct parses
        with open(ROOT + '/test/data/CDR_TestSet_docs.pkl', 'rb') as f:
            gold_docs = cPickle.load(f)

        with open(ROOT + '/test/data/CDR_TestSet_sents.pkl', 'rb') as f:
            gold_sents = cPickle.load(f)

        # Set up the doc parser
        xml_parser = XMLDocParser(
            path=ROOT + '/test/data/CDR_TestSet.xml',
            doc='.//document',
            text='.//passage/text/text()',
            id='.//id/text()',
            keep_xml_tree=False)

        sent_parser = SentenceParser()

        corpus = Corpus(xml_parser, sent_parser, max_docs=20)

        print len(corpus.get_docs())
        print len(corpus.get_contexts())

        self.assertEqual(corpus.get_docs(), gold_docs)
        self.assertEqual(corpus.get_contexts(), gold_sents)
Example #22
 def __init__(self, discretized=False, discretizer=Orange.feature.discretization.ThresholdDiscretizer(threshold=0.)):
     self.ml_data = cPickle.load(file(orange_disc_filename if discretized else orange_data_filename))
     self.test_data = cPickle.load(file(orange_disc_test_filename if discretized else orange_test_filename))
     print "Check train vs. test:", self.ml_data.domain.features[0] == self.test_data.domain.features[0]
     print "Loaded:", len(self.ml_data)
     self.classes = {v.name:v for v in self.ml_data.domain.class_vars}
     self.disc_features = [discretizer.constructVariable(x) for x in self.ml_data.domain.features]
Example #23
def load_nets(myglob):
    """
    Load a dictionary of trained neural networks with filenames
    matching a unix-style pattern.  Filenames of networks trained for
    individual output measures should be named,
    `final-OutputCol-NameOfOutputCol_net.pkl` and filenames for a network
    trained for the complete set of output measures should be `full_net.pkl`.

    Parameters
    ----------
    myglob  : str
              the unix-style pattern (including abs or rel path) for the
              filenames you want to load.  Wildcards accepted.

    Returns
    -------
    nets  : dict
            a dictionary with the trained networks.  Keys are the columns
            from y_train that the network is trained on.
    """
    files_list = glob.glob(myglob)
    nets = {}

    full_files = [x for x in files_list if 'full' in x]
    if full_files:
        with open(full_files[0], 'rb') as pkl:
            nets['all'] = pickle.load(pkl)[0]
    else:
        for filename in files_list:
            output_col = int(filename.split('/')[-1].split('-')[1])
            with open(filename, 'rb') as pkl:
                nets[output_col] = pickle.load(pkl)[0]

    return nets
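
An illustrative call, assuming the pickled networks live in a hypothetical trained_nets/ directory and follow the naming scheme described in the docstring:

# load every per-column network matching the documented file name pattern
nets = load_nets('trained_nets/final-*_net.pkl')
for col, net in nets.items():
    print('loaded network for output column %s' % col)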
Example #24
def load_db(db_file, dump_names, dump_info):
    global var_records, merge_graph, tag_source

    print "Loading " + db_file

    input = open(db_file, "rb")
    var_records = cPickle.load(input)

    if dump_names or dump_info:        
        if dump_names:
            for name in var_records.keys():
                print name
        if dump_info:
            pp = pprint.PrettyPrinter(indent=4)                
            pp.pprint(var_records)
        
        for v in var_records.keys():
            if var_records[v].is_global:
                assert(len(var_records[v].functions) == len(var_records[v].entry_tags))
                sys.exit(0)
        sys.exit(0)
                
    merge_graph = cPickle.load(input)
    tag_source = cPickle.load(input)

    print "Loaded " + db_file
Example #25
def read_index(index_file):
    global global_dict, k_gram, id_name
    fh = open(index_file, 'rb')
    global_dict = cPickle.load(fh)
    k_gram = cPickle.load(fh)
    id_name = cPickle.load(fh)
    fh.close()
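
A matching writer sketch (assumed, not from the source) that dumps the three structures in the order read_index loads them:

def write_index(index_file):
    # persist the posting dictionary, k-gram index and id->name map sequentially
    fh = open(index_file, 'wb')
    cPickle.dump(global_dict, fh)
    cPickle.dump(k_gram, fh)
    cPickle.dump(id_name, fh)
    fh.close()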
Example #26
def get_final_amar(experiment_path, member, member_path, min_samples = 0):
    #print "Called with (", experiment_path, ",", member, ",", member_path, ")"
    gen = 0
    # Start searching from first generation, ignore given member_path
    member_path = experiment_path + "generation" + str(gen) + "\\" + member + "\\"
    
    # First, loop past the generation BEFORE the member was born
    while not os.path.isdir(member_path):
        gen += 1
        member_path = experiment_path + "generation" + str(gen) + "\\" + member + "\\"
        # Security check
        if gen >= 2000:
            print "[ERROR] Looking for member on path", member_path, "in generation 2000, member will probably not be found!"
            return 42.0
    
    # Then loop just past the final generation of the member
    while os.path.isdir(member_path):
        gen += 1
        member_path = experiment_path + "generation" + str(gen) + "\\" + member + "\\"
    # Then go back 1
    member_path = experiment_path + "generation" + str(gen-1) + "\\" + member + "\\"
    
    # Check if the member has enough samples to actually count
    samples = float( cPickle.load(open(member_path + "eval\\mar_samples.p", "rb")) )
    if samples >= min_samples:
        # Retrieve AMAR
        return float( cPickle.load(open(member_path + "eval\\average_mar.p", "rb")) )
    else:
        # Return high dummy value
        return 999999.9
Example #27
def getData_Voxforge():
    pickle_directory = prm.params["pickle_directory"].get()
    voxforge_directory = prm.params["voxforge_directory"].get()

    try:
        print "Loading the Data... (may take a while)"
        data = pickle.load(open(pickle_directory + "data_vf.p", "rb"))
        labels = pickle.load(open(pickle_directory + "labels_vf.p", "rb"))
        rawData = None  # raw data is only produced when the features are re-extracted below
        print "Data Loaded, Good to Go!"
    except Exception as excp:  # data_vf.p doesn't exist
        print "Exception:", excp
        data, labels, rawData = extract_Data(voxforge_directory)
        print "Flattening ze Data"
        # flatten the data (for svm)
        data_flat = []
        data = np.array(data)
        for elem in data:
            data_flat.append(elem.flatten())
        data = data_flat
        data_male = []
        data_female = []
        
        for elem in zip(data, labels):
            if elem[1] == 1:  # male
                data_male.append(elem[0])
            else:
                data_female.append(elem[0])
            
        print "Select Data Subset... (may take a while)"
        labels_male = np.ones(len(data_male))
        labels_female = np.zeros(len(data_female))
        data_male, labels_male = select_Data_Subset(data_male, labels_male, 0.1)
        data_female, labels_female = select_Data_Subset(data_female, labels_female, 1)
        
        print "Shapes of Data (male, female) and sum of labels", np.shape(data_male), np.shape(data_female), sum(labels)
    
        data = np.concatenate((data_male, data_female))
        labels = np.concatenate((labels_male, labels_female))
        print "Data Subset Selected!"
    
        dataComplete = zip(data, labels)
        
        # SHUFFLE THE IMAGES
        random.shuffle(dataComplete)
        
        data = []
        labels = []
        for elem in dataComplete:
            data.append(elem[0])
            labels.append(elem[1])
        
        print "Saving the Data... (may take a while)"
        pickle.dump(data, open(pickle_directory + "data_vf.p", "wb"))        
        pickle.dump(labels, open(pickle_directory + "labels_vf.p", "wb"))        
        print "Data Saved, Good to Go!"
    
        print "Shapes of Data (male, female) and Labels", np.shape(data_male), np.shape(data_female), np.shape(labels)
        print "Sum of labels:", sum(labels)
        
    return data, labels, rawData
Example #28
 def loadModel(self):
     print "Loading the unigram&bigram infomation......"
     inputs = open(self.path + r"bigram_feat_id.pkl", "rb")
     self.bigram_feat_id = load(inputs)
     self.bigram_feat_num = len(self.bigram_feat_id)
     inputs.close()
     inputs1 = open(self.path + r"unigram_feat_id.pkl", "rb")
     self.unigram_feat_id = load(inputs1)
     self.unigram_feat_num = len(self.unigram_feat_id)
     inputs1.close()
     inputs2 = open(self.path + r"dict_feat_id.pkl", "rb")
     self.dict_feat_id = load(inputs2)
     self.dict_feat_num = len(self.dict_feat_id)
     inputs2.close()
     # print "Loading process done."
     print "Loading the prb information......"
     inputs = open(self.path + r"init_prb.pkl", "rb")
     self.init_prb = load(inputs)
     inputs.close()
     inputs1 = open(self.path + r"trans_prb.pkl", "rb")
     self.trans_prb = load(inputs1)
     inputs1.close()
     print "Loading process done."
     self.dimension = (
         self.unigram_feat_num * 5 + self.bigram_feat_num * 5 + self.dict_feat_num * 4 + self.type_feat_num
     )
Example #29
def main():

	parser = argparse.ArgumentParser()
	parser.add_argument('in_abs', help='input abstracts file path: "../stemmed_abstracts.pickle" ')
	parser.add_argument('in_wb', help='input abstracts file path: "../word_base.pickle" ')
	parser.add_argument('out_abs', help='file path of abstracts output file: "vector_abstracts.pickle"')

	args = parser.parse_args()
	
	print 'loading abstracts...'
	abs_file = open(args.in_abs)
	abstracts = cPickle.load(abs_file)
	abs_file.close()

	print 'loading word_base'
	wb_file = open(args.in_wb)
	word_base = cPickle.load(wb_file)
	wb_file.close()

	vector_abstracts = abstracts_to_vector( abstracts, word_base)
	
	print 'persist vector abstracts'
	output_file = open(args.out_abs,'w')
	cPickle.dump( vector_abstracts, output_file, -1 )
	output_file.close()
Example #30
    def __init__(self, dbFilePath=None, revDbFilePath=None, pos=False):
        self.dbFilePath = dbFilePath
        self.revDbFilePath = revDbFilePath
        if pos:
            self.tagger = PerceptronTagger()
        self.pos = pos

        # try to open forward database
        if not dbFilePath:
            self.dbFilePath = os.path.join(os.path.dirname(__file__), "markovdb")
        try:
            with open(self.dbFilePath, 'rb') as dbfile:
                self.db = pickle.load(dbfile)
        except (IOError, ValueError):
            logging.warn('Database file corrupt or not found, using empty database')
            self.db = _db_factory()

        # try to open backwards database
        if not revDbFilePath:
            self.revDbFilePath = os.path.join(os.path.dirname(__file__), "revmarkovdb")
        try:
            with open(self.revDbFilePath, 'rb') as dbfile:
                self.rev_db = pickle.load(dbfile)
        except (IOError, ValueError):
            logging.warn('Database file corrupt or not found, using empty database')
            self.rev_db = _db_factory()
Example #31
# -*- coding: utf-8 -*-
__author__ = 'Shane_Kao'
import cx_Oracle
import cPickle as pickle
import os
os.environ["NLS_LANG"]=".AL32UTF8"
result = pickle.load(open("result", "r"))
dbname = 'PLMD3'
username = "******"
pwd = "lsrm"
dsn=cx_Oracle.makedsn('172.21.130.250','1533','PLMD3')
db=cx_Oracle.connect(username,pwd,dsn)
cursor = db.cursor()
for i in result:
    i['text']=i['text'].encode('utf-8')
    cursor.execute("INSERT INTO test1 VALUES (:post_id,:url,:text)" ,i)
db.commit()


Example #32
def validate_model(mpath, rpath, data, steps=None):
    model = cPickle.load(open(mpath, 'rb'))
    predictions = model.align(data['images'], num_steps=steps, save_all=True)
    cPickle.dump(predictions, open(rpath, 'wb'), cPickle.HIGHEST_PROTOCOL)
    return predictions
Example #33
 def load(self, data_file):
     f = open(data_file, 'rb')
     tmp_dict = pickle.load(f)
     f.close()
     self.__dict__.update(tmp_dict)
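
A complementary save method sketch, assuming the same convention that every attribute in __dict__ is picklable:

 def save(self, data_file):
     # persist all instance attributes so load() can restore them via __dict__.update
     f = open(data_file, 'wb')
     pickle.dump(self.__dict__, f)
     f.close()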
Example #34
def load_data(filename):
    with open(filename, 'rb') as f:
        data = cPickle.load(f)
    return data
import pandas as pd
import cPickle as pickle
import patsy
import unidecode
import numpy as np
import datetime
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer

# loading data from the pickled data frame

with open('../jNotebooks/master_total_df.p','rb') as f:
    master_total_df = pickle.load(f)

global_start = datetime.datetime.now()
local_start = datetime.datetime.now()

def printProgress(messages):
	global global_start
	global local_start
	current = datetime.datetime.now()
	print 'task time: ', current-local_start, 'overall', current-global_start
	print '='*100
	print messages
	local_start = current


def uncode(x):
Example #36
    eigs_S, U_S = np.linalg.eig(S)
    eigs_S = np.real(eigs_S)
    
    tmp = np.dot(np.concatenate((A, B.T), axis = 0), A_neg_half)
    V = np.dot(np.dot(tmp, U_S), np.diag((eigs_S + 1e-8)**(-0.5)))
    
    return V
    
if __name__ == '__main__':
    M = 3000;
    K = 4
    
    dataset = 'data-0.pkl.gz'
    path = '/home/bo/Data/RCV1/Processed/'
    f = gzip.open(path + dataset, 'rb')
    data = cPickle.load(f)
    f.close()
    
    train_x = data[0].toarray()
    train_x = train_x.astype(np.float32)
    train_y = np.asarray(data[1], dtype = np.int32)
    train_y = np.reshape(train_y, (train_y.shape[0], 1))
    
    dim = train_x.shape[1]
    data = np.concatenate((train_x, train_y), axis = 1)
    np.random.shuffle(data) 
    
    train_x = data[:][:, 0:dim]
    train_y = np.int32(np.squeeze(data[:][:, -1]))  
    
    V = nystrom(train_x, M)
Example #37
def load_pkl_object(filename):
	pkl_file = open(filename, 'rb')
	data = pickle.load(pkl_file)
	pkl_file.close()
	return data
Example #38
                break
            except:
                print "Failed user " + user + ", retrying"
                time.sleep(4)
        for post in posts:
            url = post['href']
            user_dict[user][url] = 1.0
            all_items[url] = 1

    # Fill in missing items with 0
    for ratings in user_dict.values():
        for item in all_items:
            if item not in ratings:
                ratings[item] = 0.0


if __name__ == "__main__":

    if len(sys.argv) == 1:

        delusers = initializeUserDict('programming')
        delusers['josephmisiti'] = {}
        fillItems(delusers)

        f = open('data_top_5.dat', 'wb')
        pickle.dump(delusers, f)
        f.close()
    else:
        f = open('data_top_5.dat', 'rb')
        data = pickle.load(f)
Example #39
def pickle_load(pickle_fn):
    with open(pickle_fn, 'rb') as input_file:
        A = cPickle.load(input_file)
    # print A 
    return A 
Example #40
                    required=True,
                    dest='digitization',
                    help='digitization for gTower Et (MeV)')

# parse the arguments, throw errors if missing any
args = parser.parse_args()

startTime_wall = time.time()
startTime_processor = time.clock()

filename_id = "seed{:0.0f}_noise{:0.0f}_signal{:0.0f}_digitization{:0.0f}".format(
    args.seedEt_thresh, args.noise_filter, args.tower_thresh,
    args.digitization)
filename = "data/seed{:0.0f}/leading_jets_{}.pkl".format(
    args.seedEt_thresh, filename_id)
data = pickle.load(file(filename))

endTime_wall = time.time()
endTime_processor = time.clock()
print "Finished reading in data:\n\t Wall time: %0.2f s \n\t Clock Time: %0.2f s" % (
    (endTime_wall - startTime_wall), (endTime_processor - startTime_processor))

dataSetStr = plotConfigs.dataSetStr
seedCutStr = '$E_T^\mathrm{seed} >\ %d\ \mathrm{GeV}$' % args.seedEt_thresh
noiseCutStr = '$E_T^\mathrm{tower} >\ %d\ \mathrm{GeV}$' % args.noise_filter
towerThrStr = '$\\rho\left(E_T^\mathrm{tower} <\ %d\ \mathrm{GeV}\\right)$' % args.tower_thresh

helpers = PlotHelpers(dataSetStr=dataSetStr,
                      seedCutStr=seedCutStr,
                      noiseCutStr=noiseCutStr,
                      towerThrStr=towerThrStr)
Example #41
import cPickle as pickle
import os
import numpy
current_data = os.listdir('data')
current_data = [data[0 : -4] for data in current_data]
total = []
families = []
for family in current_data:
	file = open('data/{fam}.txt'.format(fam = family), 'rb')
	data_array = pickle.load(file)
	file.close()
	if 'eae' in family:
		families.append(data_array[0])
	total.append(data_array[0])
print(numpy.mean(total))
print(numpy.std(total))
print(numpy.mean(families))
print(numpy.std(families))
Example #42
		print review
		for name, model in models.iteritems():

			print predict(model, encoding, review)[-1][0][0]
			results[name].append(predict(model, encoding, review)[-1][0][0]) 

	return results





if __name__ == '__main__':
	logging.debug('Loading encoding...')
	with open('data/charnet-encoding.pkl', 'rb') as fp:
		text_encoding_D = pickle.load(fp)
		text_encoding_D.include_stop_token  = False
		text_encoding_D.include_start_token = False

	discriminator_0 = Sequence(Vector(len(text_encoding_D))) >> (Repeat(LSTM(1024), 2) >> Softmax(2))
	discriminator_1 = Sequence(Vector(len(text_encoding_D))) >> (Repeat(LSTM(1024), 2) >> Softmax(2))
	discriminator_2 = Sequence(Vector(len(text_encoding_D))) >> Repeat(LSTM(1024) >> Dropout(0.5), 2) >> Softmax(2)
	discriminator_3 = Sequence(Vector(len(text_encoding_D))) >> (Repeat(LSTM(1024), 2) >> Softmax(2))
	discriminator_4 = Sequence(Vector(len(text_encoding_D))) >> Repeat(LSTM(1024) >> Dropout(0.5), 2) >> Softmax(2)
	discriminator_5 = Sequence(Vector(len(text_encoding_D))) >> Repeat(LSTM(1024) >> Dropout(0.5), 2) >> Softmax(2)

	logging.debug('Loading discriminators...')
	with open('models/discriminative/discriminative-model-0.0.0.pkl', 'rb') as fp:
		state = pickle.load(fp)
		state = (state[0][0], (state[0][1], state[1]))
		discriminator_0.set_state(state)
Example #43
    def parse_results(openvas_results, ip=None):
        """
        Convert the OpenVAS scan results to the GoLismero data model.

        :param openvas_results: OpenVAS scan results.
        :type openvas_results: list(OpenVASResult)

        :param ip: (Optional) IP address to link the vulnerabilities to.
        :type ip: IP | None

        :returns: Scan results converted to the GoLismero data model.
        :rtype: list(Data)
        """

        # This is where we'll store the results.
        results = []

        # Remember the hosts we've seen so we don't create them twice.
        hosts_seen = {}

        # Maps of OpenVAS levels to GoLismero levels.
        LEVELS = {
            'debug': 'informational',
            'log': 'informational',
            'low': "low",
            'medium': 'middle',
            'high': "high",
        }
        RISKS = {
            'none': 0,
            'debug': 0,
            'log': 0,
            'low': 1,
            'medium': 2,
            'high': 3,
            'critical': 4
        }

        # Do we have the OpenVAS plugin database?
        if not os.path.exists(openvas_db):
            Logger.log_error(
                "OpenVAS plugin not initialized, please run setup.py")
            return

        # Load the database.
        with open(openvas_db, "rb") as f:
            use_openvas_db = Pickler.load(f)

        # Get the configuration.
        import_log = Config.audit_config.boolean(
            Config.plugin_args.get("import_log", "no"))
        import_debug = Config.audit_config.boolean(
            Config.plugin_args.get("import_debug", "no"))

        # For each OpenVAS result...
        for opv in openvas_results:
            try:

                # Get the host.
                host = opv.host

                # Skip if we don't have a target host.
                if host is None:
                    continue

                # Get the threat level.
                threat = getattr(opv, "threat", "log").lower()

                # Discard log and debug entries, keep only the vulnerabilities.
                if threat == "log" and not import_log:
                    continue
                if threat == "debug" and not import_debug:
                    continue

                # Get or create the vulnerable resource.
                target = ip
                if host in hosts_seen:
                    target = hosts_seen[host]
                elif not ip or ip.address != host:
                    try:
                        target = IP(host)
                    except ValueError:
                        target = Domain(host)
                    hosts_seen[host] = target
                    results.append(target)

                # Extract the relevant information from the results.
                nvt       = opv.nvt
                vid       = opv.id
                oid       = int(nvt.oid.split(".")[-1])
                name      = getattr(nvt, "name", None)
                cvss_base = getattr(nvt, "cvss_base", None)
                level     = LEVELS.get(threat, "informational")
                risk      = RISKS.get(
                    getattr(opv.nvt, "risk_factor", "none").lower(), 0)

                # Get the vulnerability description.
                description = opv.raw_description
                if not description:
                    description = nvt.description
                    if not description:
                        description = nvt.summary
                        if not description:
                            description = None

                # Extract the CVEs and Bugtraq IDs.
                cve = nvt.cve if nvt.cve else []
                if "NOCVE" in cve:
                    cve.remove("NOCVE")
                bid = []
                if nvt.bid:
                    bid.extend("BID-" + x for x in nvt.bid)
                if nvt.bugtraq:
                    bid.extend("BID-" + x for x in nvt.bugtraq)
                if "NOBID" in bid:
                    cve.remove("NOBID")

                # Extract the notes and add them to the description text.
                if opv.notes and description is not None:
                    description += "\n" + "\n".join(
                        " - " + note.text
                        for note in opv.notes
                    )

                # Extract the reference URLs from the description text.
                references = []
                if description is not None:
                    p = description.find("URL:")
                    while p >= 0:
                        p += 4
                        q2 = description.find("\n", p)
                        q1 = description.find(",", p, q2)
                        if q1 > p:
                            q = q1
                        else:
                            q = q2
                        if q < p:
                            q = len(description)
                        url = description[p:q].strip()
                        try:
                            url = parse_url(url).url
                            references.append(url)
                        except Exception:
                            Logger.log_error(format_exc())
                            pass
                        p = description.find("URL:", q)

                # Prepare the vulnerability properties.
                kwargs = {
                    "title":        "%s;;;%s" % (name, str(opv.port.port_name)),
                    "description":  description,
                    "references":   references,
                    "level":        level,
                    "risk":         risk,
                    "severity":     risk,
                    "impact":       risk,
                    "cvss_base":    cvss_base,
                    "cve":          cve,
                    "bid":          bid,
                    "tool_id":      "openvas_plugin_%s" % oid,
                    "custom_id":    vid,
                }

                # If we have the OpenVAS plugin database, look up the plugin ID
                # that reported this vulnerability and create the vulnerability
                # using a specific class. Otherwise use the vulnerability class
                # for uncategorized vulnerabilities.
                classname = "UncategorizedVulnerability"
                if oid in use_openvas_db:
                    classname = use_openvas_db[oid][0][0]

                # Create the Vulnerability object.
                try:
                    clazz = globals()[classname]
                    vuln  = clazz(target, **kwargs)
                except Exception, e:
                    t = format_exc()
                    Logger.log_error_more_verbose(
                        "Could not load vulnerability of type: %s" % classname)
                    Logger.log_error_more_verbose(t)
                    vuln = UncategorizedVulnerability(target, **kwargs)
                results.append(vuln)

            # Skip this result on error.
            except Exception, e:
                t = format_exc()
                Logger.log_error_verbose(
                    "Error parsing OpenVAS results: %s" % str(e))
                Logger.log_error_more_verbose(t)
Example #44
def test_SRNN(finetune_lr=0.01, pretraining_epochs=0,
             pretrain_lr=0.01, k=1, training_epochs=1000, # TODO 100+
             dataset=DATASET, batch_size=100):
    """

    :type learning_rate: float
    :param learning_rate: learning rate used in the finetune stage
    :type pretraining_epochs: int
    :param pretraining_epochs: number of epoch to do pretraining
    :type pretrain_lr: float
    :param pretrain_lr: learning rate to be used during pre-training
    :type k: int
    :param k: number of Gibbs steps in CD/PCD
    :type training_epochs: int
    :param training_epochs: maximal number of iterations ot run the optimizer
    :type dataset: string
    :param dataset: path the the pickled dataset
    :type batch_size: int
    :param batch_size: the size of a minibatch
    """

    print "loading dataset from", dataset
    #datasets = load_data(dataset, nframes=N_FRAMES, features='fbank', scaling='normalize', cv_frac=0.2, speakers=False, numpy_array_only=True) 
    #datasets = load_data(dataset, nframes=N_FRAMES, features='fbank', scaling='student', cv_frac='fixed', speakers=False, numpy_array_only=True) 
    datasets = load_data(dataset, nframes=1, features='fbank', scaling='student', cv_frac='fixed', speakers=False, numpy_array_only=True) 
    #datasets = load_data(dataset, nframes=1, features='fbank', scaling='student', cv_frac=0.2, speakers=False, numpy_array_only=True) 

    train_set_x, train_set_y = datasets[0]  # if speakers, do test/test/test
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    print "dataset loaded!"
    print "train set size", train_set_x.shape[0]
    print "validation set size", valid_set_x.shape[0]
    print "test set size", test_set_x.shape[0]
    print "phones in train", len(set(train_set_y))
    print "phones in valid", len(set(valid_set_y))
    print "phones in test", len(set(test_set_y))

    to_int = {}
    with open('timit_to_int_and_to_state_dicts_tuple.pickle') as f:  # TODO
        to_int, _ = cPickle.load(f)
    train_set_iterator = DatasetSentencesIterator(train_set_x, train_set_y,
            to_int, N_FRAMES)
    valid_set_iterator = DatasetSentencesIterator(valid_set_x, valid_set_y,
            to_int, N_FRAMES)
    test_set_iterator = DatasetSentencesIterator(test_set_x, test_set_y,
            to_int, N_FRAMES)

    # numpy random generator
    numpy_rng = numpy.random.RandomState(123)
    print '... building the model'

    n_outs = len(set(train_set_y))
    dbn = SRNN(numpy_rng=numpy_rng, n_ins=N_FRAMES * N_FEATURES,
              relu_layers_sizes=[1024, 1024, 1024],
              n_outs=n_outs)

    # get the training, validation and testing function for the model
    print '... getting the finetuning functions'
    first_pass, train_fn = dbn.get_stacked_adadelta_trainer()
    train_scoref = dbn.score_stacked_classif(train_set_iterator)
    valid_scoref = dbn.score_stacked_classif(valid_set_iterator)
    test_scoref = dbn.score_stacked_classif(test_set_iterator)

    print '... finetuning the model'
    # early-stopping parameters
    patience = 1000  # look as this many examples regardless TODO
    patience_increase = 2.  # wait this much longer when a new best is
                            # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant

    best_validation_error = numpy.inf
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0

    while (epoch < training_epochs) and (not done_looping):
        epoch = epoch + 1
        avg_costs = []
        for iteration, (x, y) in enumerate(train_set_iterator):

            #if best_validation_error < 0.5:  # this is a hack:
            if epoch > 1:  # this is a hack
                # TODO normally wait for total convergence and redo the 
                # training this way (because doing 2 trainings would not
                # badly learn Ws and would reset Adadelta):
                p_y_init = numpy.zeros((x.shape[0], n_outs), dtype='float32') + 1./n_outs
                p_y = first_pass(x, p_y_init)
                if N_FRAMES_WINDOW > 0:
                    p_y = numpy.concatenate([p_y_init[:N_FRAMES_WINDOW],
                        p_y[:-N_FRAMES_WINDOW]])
                avg_cost = train_fn(x, p_y, y)
            else:
                p_y_init = numpy.zeros((x.shape[0], n_outs), dtype='float32')
                avg_cost = train_fn(x, p_y_init, y)

            avg_costs.append(avg_cost)
            #print('  epoch %i, sentence %i, '
            #'avg cost for this sentence %f' % \
            #      (epoch, iteration, avg_cost))
        print('  epoch %i, avg costs %f' % \
              (epoch, numpy.mean(avg_costs)))
        print('  epoch %i, training error %f %%' % \
              (epoch, numpy.mean(train_scoref()) * 100.))

        # we check the validation error on every epoch
        validation_errors = valid_scoref()
        this_validation_error = numpy.mean(validation_errors)  # TODO this is a mean of means (with different lengths)
        print('  epoch %i, validation error %f %%' % \
              (epoch, this_validation_error * 100.))
        # if we got the best validation score until now
        if this_validation_error < best_validation_error:
            with open(output_file_name + '.pickle', 'w') as f:
                cPickle.dump(dbn, f)
            # improve patience if error improvement is good enough
            if (this_validation_error < best_validation_error *
                improvement_threshold):
                patience = max(patience, iteration * patience_increase)
            # save best validation score and iteration number
            best_validation_error = this_validation_error
            # test it on the test set
            test_errors = test_scoref()
            test_score = numpy.mean(test_errors)  # TODO this is a mean of means (with different lengths)
            print(('  epoch %i, test error of '
                   'best model %f %%') %
                  (epoch, test_score * 100.))
        if patience <= iteration:  # TODO correct that
            done_looping = True
            break

    end_time = time.clock()
    print(('Optimization complete with best validation score of %f %%, '
           'with test performance %f %%') %
                 (best_validation_error * 100., test_score * 100.))
    print >> sys.stderr, ('The fine tuning code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time)
                                              / 60.))
    with open(output_file_name + '.pickle', 'w') as f:
        cPickle.dump(dbn, f)
Example #45
def main(args):
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    state = eval(args.prototype)()
    timings = init_timings()

    # Load dictionary
    raw_dict = cPickle.load(open(state['dictionary'], 'r'))
    # Dictionaries to convert str to idx and vice-versa
    str_to_idx = dict([(tok, tok_id) for tok, tok_id, _, _ in raw_dict
                       ])  # each entry in the dictionary has four fields: (token, token id, word frequency, document frequency)
    idx_to_str = dict([(tok_id, tok) for tok, tok_id, freq, _ in raw_dict])

    category = cPickle.load(open(state['category'], 'r'))
    assert (len(category) == state['cnum'])

    model = DocumentEncoder(state)
    rng = model.rng

    model.state['run_id'] = RUN_ID

    logger.debug("Training using exact log-likelihood")

    train_batch = model.build_train_function()  # training function; returns three values, the first being training_cost

    eval_batch = model.build_eval_function()  # evaluation (validation) function

    logger.debug("Load data")
    train_data, \
    valid_data, = get_train_iterator(state)
    train_data.start()

    # Start looping through the dataset
    step = 0
    patience = state['patience']
    start_time = time.time()

    train_cost = 0
    train_variational_cost = 0
    train_posterior_mean_variance = 0
    train_misclass = 0
    train_done = 0
    train_dialogues_done = 0.0

    prev_train_cost = 0
    prev_train_done = 0

    ex_done = 0
    is_end_of_batch = True
    start_validation = False

    batch = None

    while (step < state['loop_iters']
           and (time.time() - start_time) / 60. < state['time_stop']
           and patience >= 0):

        # Training phase

        # If we are training on a primary and secondary dataset, sample at random from either of them

        batch = train_data.next()

        # Train finished
        if not batch:
            # Restart training
            logger.debug("Got None...")
            break

        logger.debug("[TRAIN_%d] - Got batch %d,%d" %
                     (step, batch['x'].shape[1], batch['max_length']))

        if batch['max_length'] == state['max_grad_steps']:
            continue

        x_data = batch['x']

        #print 'x_data:\t',x_data

        x_data_reversed = batch['x_reversed']
        max_length = batch['max_length']
        x_cost_mask = batch['x_mask']
        x_semantic = batch['x_semantic']
        x_reset = batch['x_reset']
        ran_cost_utterance = batch['ran_var_constutterance']

        is_end_of_batch = False
        if numpy.sum(numpy.abs(x_reset)) < 1:
            #print 'END-OF-BATCH EXAMPLE!'
            is_end_of_batch = True

        idx_s = (x_data == 2).nonzero()[0][0]

        if x_data[1:idx_s].shape[0] < 2:
            continue

        c, variational_cost, posterior_mean_variance = train_batch(
            x_data, max_length)

        if numpy.isinf(c) or numpy.isnan(c):
            logger.warn("Got NaN cost .. skipping")
            gc.collect()
            continue

        train_cost += c
        train_variational_cost += variational_cost
        train_posterior_mean_variance += posterior_mean_variance

        train_done += batch['num_dialogues']
        train_dialogues_done += batch['num_dialogues']

        this_time = time.time()
        if step % state['train_freq'] == 0:
            elapsed = this_time - start_time

            # Keep track of training cost for the last 'train_freq' batches.
            current_train_cost = train_cost / train_done
            if prev_train_done >= 1:
                current_train_cost = float(
                    train_cost - prev_train_cost) / float(train_done -
                                                          prev_train_done)

            prev_train_cost = train_cost
            prev_train_done = train_done

            h, m, s = ConvertTimedelta(this_time - start_time)
            print ".. %.2d:%.2d:%.2d %4d mb # %d bs %d maxl %d acc_cost = %.4f" % (h, m, s,\
                             state['time_stop'] - (time.time() - start_time)/60.,\
                             step, \
                             batch['x'].shape[1], \
                             batch['max_length'], \
                             float(train_cost/train_done))


        if valid_data is not None and\
            step % state['valid_freq'] == 0 and step > 1:
            start_validation = True

        if start_validation and is_end_of_batch:
            start_validation = False
            valid_data.start()
            valid_cost = 0
            valid_variational_cost = 0
            valid_posterior_mean_variance = 0

            valid_wordpreds_done = 0
            valid_dialogues_done = 0

            logger.debug("[VALIDATION START]")

            fw_valid = open('_VALID__%d.txt' % step, 'w')

            while True:
                batch = valid_data.next()

                # Train finished
                if not batch:
                    break

                logger.debug("[VALID] - Got batch %d,%d" %
                             (batch['x'].shape[1], batch['max_length']))

                if batch['max_length'] == state['max_grad_steps']:
                    continue

                x_data = batch['x']
                x_data_reversed = batch['x_reversed']
                max_length = batch['max_length']
                x_cost_mask = batch['x_mask']
                x_semantic = batch['x_semantic']
                x_semantic_nonempty_indices = numpy.where(x_semantic >= 0)

                x_reset = batch['x_reset']
                ran_cost_utterance = batch['ran_var_constutterance']

                #print ' '.join([idx_to_str[id_of_w] for id_of_w in x_data.T.tolist()[0]])
                idx_s = (x_data == 2).nonzero()[0][0]
                if x_data[1:idx_s].shape[0] < 2:
                    continue

                c, c_list, variational_cost, posterior_mean_variance, Gen_pro, Tar_Y = eval_batch(
                    x_data, max_length)

                if numpy.isinf(c) or numpy.isnan(c):
                    continue

                valid_cost += c
                valid_variational_cost += variational_cost
                valid_posterior_mean_variance += posterior_mean_variance

                print 'valid_cost', valid_cost
                #print 'Original: ', ' '.join([idx_to_str[id_of_w] for id_of_w in list(Tar_Y.T)[0]]) #'',join([idx_to_str[id_of_w] for id_of_w in Tar_Y])
                fw_valid.write('Label: ' + ' '.join(
                    [category[id_of_w]
                     for id_of_w in list(Tar_Y.T)[0]]) + '\r\n')
                Gen_pro = Gen_pro.tolist()[0]
                enum_ = enumerate(Gen_pro)
                Gen_sort = sorted(enum_, key=lambda x: x[1], reverse=True)[:30]
                Gen_tar = [i[0] for i in Gen_sort]

                #print 'Generations: ', ' '.join([idx_to_str[id_of_w] for id_of_w in Gen_tar])
                fw_valid.write(
                    'Predict: ' +
                    ' '.join([category[id_of_w]
                              for id_of_w in Gen_tar]) + '\r\n')
                #print 'valid_variational_cost', valid_variational_cost
                #print 'posterior_mean_variance', posterior_mean_variance

                valid_wordpreds_done += batch['num_preds']
                valid_dialogues_done += batch['num_dialogues']

            logger.debug("[VALIDATION END]")
            fw_valid.close()
            valid_cost /= valid_wordpreds_done
            valid_variational_cost /= valid_wordpreds_done
            valid_posterior_mean_variance /= valid_dialogues_done

            if len(timings["valid_cost"]) == 0 or valid_cost < numpy.min(
                    timings["valid_cost"]):
                patience = state['patience']
                # Saving model if decrease in validation cost
                save(model, timings)
                print 'best valid_cost', valid_cost
            elif valid_cost >= timings["valid_cost"][-1] * state[
                    'cost_threshold']:
                patience -= 1

            save(model, timings, '_' + str(step) + '_')

            print "** valid cost (NLL) = %.4f, valid word-perplexity = %.4f, valid variational cost (per word) = %.8f, valid mean posterior variance (per word) = %.8f, patience = %d" % (
                float(valid_cost), float(
                    math.exp(valid_cost)), float(valid_variational_cost),
                float(valid_posterior_mean_variance), patience)

            timings["train_cost"].append(train_cost / train_done)
            timings["train_variational_cost"].append(train_variational_cost /
                                                     train_done)
            timings["train_posterior_mean_variance"].append(
                train_posterior_mean_variance / train_dialogues_done)
            timings["valid_cost"].append(valid_cost)
            timings["valid_variational_cost"].append(valid_variational_cost)
            timings["valid_posterior_mean_variance"].append(
                valid_posterior_mean_variance)

            # Reset train cost, train misclass and train done
            train_cost = 0
            train_done = 0
            prev_train_cost = 0
            prev_train_done = 0

        step += 1

    logger.debug("All done, exiting...")
 def do_load(self, arg):
   arg = arg.strip()
   if not len(arg):
     self.help_load()
     return
   self.db = pickle.load(file(arg, 'rb'))
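
The training loop above implements patience-based early stopping: patience is reset to state['patience'] whenever the validation cost reaches a new minimum (and the model is saved), and decremented whenever the new cost fails to beat the previous one by the state['cost_threshold'] factor. A minimal, self-contained sketch of that policy, with hypothetical values standing in for the state entries:

def patience_step(valid_cost, history, patience, initial_patience, cost_threshold):
    """Update early-stopping patience after one validation pass.

    history holds the previous validation costs; returns (new_patience, improved).
    """
    if len(history) == 0 or valid_cost < min(history):
        # New best model: reset patience (the loop above also saves the model here).
        return initial_patience, True
    if valid_cost >= history[-1] * cost_threshold:
        # No sufficient improvement over the last validation: lose one unit of patience.
        return patience - 1, False
    return patience, False

# Usage: stop training once patience drops below zero.
history = []
patience, initial_patience, cost_threshold = 5, 5, 1.003  # assumed values
for valid_cost in [3.20, 3.10, 3.15, 3.16, 3.17, 3.18, 3.19, 3.21, 3.22]:
    patience, improved = patience_step(valid_cost, history, patience,
                                       initial_patience, cost_threshold)
    history.append(valid_cost)
    if patience < 0:
        break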
Пример #47
0
		cPickle.dump((mesh_terms_test, mesh_terms_pred), open('predictions_decoder.pkl','w'))
	return metrics

X_train, Y_train, X_test, Y_test, X_val, Y_val, num_english_words, word2index = get_data()

X_train = [X_train[i] for i in xrange(20000)]
Y_train = [Y_train[i] for i in xrange(20000)]

word_embeddings = re.read_word_embeddings(word2index)
node_embeddings = re.read_node_embeddings()

# # code_2_index, index_2_code = compute_vocab_target(Y_train+Y_test+Y_val)
# code_2_index['SSOS'] = len(code_2_index.keys())
# index_2_code = index_2_code + ['SSOS']

code_2_index = cPickle.load(open('code_2_index.pkl', 'r'))
index_2_code = cPickle.load(open('index_2_code.pkl', 'r'))
seq2mesh = cPickle.load(open('seq_to_mesh.pkl','r'))

Y_train = get_y_index_sequences(Y_train)
Y_test = get_y_index_sequences(Y_test)
Y_val = get_y_index_sequences(Y_val)


# X_test = X_val[0:]
# Y_test = Y_val[0:]


# #reduce size of validation set
X_val = [X_val[i] for i in xrange(2000)]
Y_val = [Y_val[i] for i in xrange(2000)]
                  action='store_true',
                  help='Get theoretical PSDs')

parser.add_option('--do_expected_CSDs',
                  action='store_true',
                  help='Get theoretical CSDs')

(options, args) = parser.parse_args()

if len(args) < 1:
    parser.error(
        'You must specify at least SETUP_SIMULATE_EXPECTED_SIGNAL_PSD_OR_CSD.PKL!'
    )

file = open(args[0], 'rb')
setup = cpkl.load(file)
file.close()

workdir = os.getcwd() + '/'

Nb = setup['number of batches']

Ndays = len(setup['days'])
if Ndays % Nb == 0:
    Ndb = Ndays / Nb
    days_batches = [setup['days'][b * Ndb:(b + 1) * Ndb] for b in range(Nb)]
elif Ndays % Nb > 0:
    Ndb = Ndays / (Nb - 1)
    Ndbl = Ndays % (Nb - 1)
    days_batches = [
        setup['days'][b * Ndb:(b + 1) * Ndb] for b in range(Nb - 1)
Пример #49
0
# Copyright (C) 2002-2017 CERN for the benefit of the ATLAS collaboration
#
#  genMetadataXML.py
#  
#
#  Created by Alvin on 10/05/2010.
#
from __future__ import with_statement
import sys
import cPickle as pickle

usage = "genMetadataXML.py JOB_REPORT_PICKLE [--new|--old]"
if len( sys.argv ) < 2:
    print usage
    sys.exit( 1 )
with open( sys.argv[ 1 ] ) as f:
    r = pickle.load( f )
try:
    optParam = sys.argv[ 2 ]
except IndexError:
    optParam = '--old'
if optParam == '--new':
    r.writeMetaDataXML_new()
elif optParam == '--old':
    r.writeMetaDataXML_old()
else:
    print usage
    sys.exit(1)
sys.exit(0)
Пример #50
0
def load_model():
	encoder, decoder = cPickle.load(open('trained_model.pkl','r'))
	encoder = encoder.cuda()
	decoder = decoder.cuda()
	return encoder, decoder
Пример #51
0
def voc_eval(detpath,
             annopath,
             imagesetfile,
             classname,
             cachedir,
             ovthresh=0.5,
             use_07_metric=False):
    """rec, prec, ap = voc_eval(detpath,
                                annopath,
                                imagesetfile,
                                classname,
                                [ovthresh],
                                [use_07_metric])

    Top level function that does the PASCAL VOC evaluation.

    detpath: Path to detections
        detpath.format(classname) should produce the detection results file.
    annopath: Path to annotations
        annopath.format(imagename) should be the xml annotations file.
    imagesetfile: Text file containing the list of images, one image per line.
    classname: Category name (duh)
    cachedir: Directory for caching the annotations
    [ovthresh]: Overlap threshold (default = 0.5)
    [use_07_metric]: Whether to use VOC07's 11 point AP computation
        (default False)
    """
    # assumes detections are in detpath.format(classname)
    # assumes annotations are in annopath.format(imagename)
    # assumes imagesetfile is a text file with each line an image name
    # cachedir caches the annotations in a pickle file

    # first load gt
    if not os.path.isdir(cachedir):
        os.mkdir(cachedir)
    cachefile = os.path.join(cachedir, 'annots.pkl')
    # read list of images
    with open(imagesetfile, 'r') as f:
        lines = f.readlines()
    imagenames = [x.strip() for x in lines]

    if not os.path.isfile(cachefile):
        # load annots
        recs = {}
        for i, imagename in enumerate(imagenames):
            recs[imagename] = parse_rec(annopath.format(imagename))
            if i % 100 == 0:
                print 'Reading annotation for {:d}/{:d}'.format(
                    i + 1, len(imagenames))
        # save
        print 'Saving cached annotations to {:s}'.format(cachefile)
        with open(cachefile, 'w') as f:
            cPickle.dump(recs, f)
    else:
        # load
        with open(cachefile, 'r') as f:
            recs = cPickle.load(f)

    # extract gt objects for this class
    class_recs = {}
    npos = 0
    for imagename in imagenames:
        R = [obj for obj in recs[imagename] if obj['name'] == classname]
        bbox = np.array([x['bbox'] for x in R])
        difficult = np.array([x['difficult'] for x in R]).astype(np.bool)
        det = [False] * len(R)
        npos = npos + sum(~difficult)
        class_recs[imagename] = {
            'bbox': bbox,
            'difficult': difficult,
            'det': det
        }

    # read dets
    detfile = detpath.format(classname)
    with open(detfile, 'r') as f:
        lines = f.readlines()
    if any(lines):

        splitlines = [x.strip().split(' ') for x in lines]
        image_ids = [x[0] for x in splitlines]
        confidence = np.array([float(x[1]) for x in splitlines])
        BB = np.array([[float(z) for z in x[2:]] for x in splitlines])

        # sort by confidence
        sorted_ind = np.argsort(-confidence)
        sorted_scores = np.sort(-confidence)
        BB = BB[sorted_ind, :]
        image_ids = [image_ids[x] for x in sorted_ind]

        # go down dets and mark TPs and FPs
        nd = len(image_ids)
        tp = np.zeros(nd)
        fp = np.zeros(nd)
        for d in range(nd):
            R = class_recs[image_ids[d]]
            bb = BB[d, :].astype(float)
            ovmax = -np.inf
            BBGT = R['bbox'].astype(float)

            if BBGT.size > 0:
                # compute overlaps
                # intersection
                ixmin = np.maximum(BBGT[:, 0], bb[0])
                iymin = np.maximum(BBGT[:, 1], bb[1])
                ixmax = np.minimum(BBGT[:, 2], bb[2])
                iymax = np.minimum(BBGT[:, 3], bb[3])
                iw = np.maximum(ixmax - ixmin + 1., 0.)
                ih = np.maximum(iymax - iymin + 1., 0.)
                inters = iw * ih

                # union
                uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
                       (BBGT[:, 2] - BBGT[:, 0] + 1.) *
                       (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)

                overlaps = inters / uni
                ovmax = np.max(overlaps)
                jmax = np.argmax(overlaps)

            if ovmax > ovthresh:
                if not R['difficult'][jmax]:
                    if not R['det'][jmax]:
                        tp[d] = 1.
                        R['det'][jmax] = 1
                    else:
                        fp[d] = 1.
            else:
                fp[d] = 1.

        # compute precision recall
        fp = np.cumsum(fp)
        tp = np.cumsum(tp)
        rec = tp / float(npos)
        # avoid divide by zero in case the first detection matches a difficult
        # ground truth
        prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
        ap = voc_ap(rec, prec, use_07_metric)
    else:
        #          rec = -1
        #          prec = -1
        #          ap = -1
        rec = 0
        prec = 0
        ap = 0
    return rec, prec, ap
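
The final AP value is computed by voc_ap, which is referenced but not shown in this snippet. A sketch of the two standard variants, assuming rec and prec are the cumulative recall/precision arrays built above (use_07_metric selects the 11-point VOC2007 interpolation):

import numpy as np

def voc_ap(rec, prec, use_07_metric=False):
    """Average precision from recall/precision arrays (sketch)."""
    if use_07_metric:
        # 11-point interpolation: average the max precision at recall >= t.
        ap = 0.
        for t in np.arange(0., 1.1, 0.1):
            if np.sum(rec >= t) == 0:
                p = 0
            else:
                p = np.max(prec[rec >= t])
            ap += p / 11.
    else:
        # Append sentinels, make precision monotonically decreasing,
        # then integrate over the points where recall changes.
        mrec = np.concatenate(([0.], rec, [1.]))
        mpre = np.concatenate(([0.], prec, [0.]))
        for i in range(mpre.size - 1, 0, -1):
            mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
        i = np.where(mrec[1:] != mrec[:-1])[0]
        ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
    return ap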
Пример #52
0
def unpickle(file):
    with open(file, 'rb') as fo:
        dict = cPickle.load(fo)
    return dict
Пример #53
0
 def load_param(self, fname):
     with open(fname, "r") as fid:
         params = pickle.load(fid)
     return params
def query_page_folder_phoc(queries, folder, threshold, dont_load=False, use_gt_phoc=False, filter_small=False):
    """
    Evaluate folder of predictions given queries and threshold
    """

    # Dicts of query words
    all_dists = {}
    all_matches = {}
    all_relevants = {}

    cache_name = 'queries_%d.pkl' % (threshold*100)
    c = 0
    load_time = Timer()
    qtime = Timer()
    all_phocs, _ = phoc_letters_and_digits(queries)
    if (Path(folder) / cache_name).exists() and not dont_load:
        all_dists, all_matches = pklRick.load((Path(folder) / cache_name).open('rb'))
    else:
        print('No cache found, caching to %s' % cache_name)
        # Run over all predicted pages
        for page in Path(folder).glob('**/*.json'):
            load_time.tic()
            try:
                page_dict = json.load(page.open('rb'))
            except ValueError:
                print("Something wrong in %s, let's see if we can go on" % page.stem)
            c += 1
            qtime.tic()
            # Run all queries per page
            for p, query in enumerate(queries):
                dists, matches, word_idx, words_in_page = query_page_phoc(query, page_dict, threshold=threshold,
                                                                          phoc=all_phocs[p, :], use_gt_phoc=use_gt_phoc, filter_small=filter_small)
                tmp_dist = all_dists.get(query, [])
                tmp_dist.extend(dists)
                all_dists[query] = tmp_dist

                tmp_match = all_matches.get(query, [])
                tmp_match.extend(matches)
                all_matches[query] = tmp_match

                tmp_match = all_relevants.get(query, [])
                tmp_match.append(words_in_page)
                all_relevants[query] = tmp_match

    # Cache mAP base data for fast reproduction of evaluation
    pklRick.dump((all_dists, all_matches), (Path(folder) / cache_name).open('wb'))
    mAP = 0
    recall = 0
    accuracy = 0
    n = 0
    for query in queries:
        # Per query evaluation
        AP, rec, acc = _map_and_recall(all_dists[query], all_matches[query], all_relevants[query])
        if AP is None or rec is None or acc is None:
            continue
        # Running means
        mAP = (1 / float(n+1))*AP + (float(n) / (n+1))*mAP
        recall = (1 / float(n+1))*rec + (float(n) / (n+1))*recall
        accuracy = (1 / float(n+1))*acc + (float(n) / (n+1))*accuracy
        n += 1
    return mAP, recall, accuracy
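
The per-query aggregation above maintains incremental (running) means: mAP, recall and accuracy are updated in place as new_mean = value/(n+1) + n*old_mean/(n+1), which equals averaging all values at the end without having to store them. A tiny equivalent helper for reference:

def running_mean_update(mean, new_value, n):
    """Fold new_value into a mean over n previous items; returns the mean over n + 1 items."""
    return (float(n) / (n + 1)) * mean + (1.0 / (n + 1)) * new_value

# Example: averaging [0.5, 0.7, 0.9] one value at a time gives 0.7.
m, n = 0.0, 0
for ap in [0.5, 0.7, 0.9]:
    m = running_mean_update(m, ap, n)
    n += 1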
Пример #55
0
import numpy as np
from scipy.io import wavfile
import os
import sys  # needed for sys.argv and sys.stderr below
import cPickle as pickle
from lasagne.updates import *

if __name__ == "__main__":

    # e.g. 1000_60sec.pkl
    in_pkl = sys.argv[1]
    out_pkl = sys.argv[2]

    if ".pkl" in in_pkl:
        with open(in_pkl) as f:
            dat = pickle.load(f)
        X_train, X_valid, X_test = dat[0]
    else:
        ctr = np.load(in_pkl)
        X_train, X_valid, X_test = ctr["arr_0"], ctr["arr_1"], ctr["arr_2"]

    sys.stderr.write("X_train shape = %s\n" % str(X_train.shape))
    sys.stderr.write("X_valid shape = %s\n" % str(X_valid.shape))
    sys.stderr.write("X_test shape = %s\n" % str(X_test.shape))

    args = dict()
    args["seed"] = 0
    args["batch_size"] = 128
    args["learning_rate"] = 0.01
    args["momentum"] = 0.9
    args["num_epochs"] = 4000
						bestmatch[1] = len(ref)
				r += bestmatch[1]
				c += len(hyp)

		# computing bleu score
		p0 = 1e-7
		bp = 1 if c > r else math.exp(1 - float(r) / float(c))
		p_ns = [float(clip_count[i]) / float(count[i] + p0) + p0 \
				for i in range(4)]
		s = math.fsum(w * math.log(p_n) \
					  for w, p_n in zip(weights, p_ns) if p_n)
		bleu = bp * math.exp(s)
		return bleu
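
The fragment above finishes a corpus-level BLEU computation: modified n-gram precisions p_n are smoothed with p0, combined as a weighted geometric mean, and scaled by the brevity penalty bp = exp(1 - r/c) when the total hypothesis length c is not longer than the reference length r. Restated as a self-contained helper, assuming clip_count, count, r and c were accumulated exactly as above:

import math

def combine_bleu(clip_count, count, r, c, weights=(0.25, 0.25, 0.25, 0.25)):
    """Combine accumulated 1- to 4-gram statistics into a BLEU score (sketch)."""
    p0 = 1e-7  # smoothing constant, as in the snippet above
    bp = 1.0 if c > r else math.exp(1 - float(r) / float(c))
    p_ns = [float(clip_count[i]) / float(count[i] + p0) + p0 for i in range(4)]
    s = math.fsum(w * math.log(p_n) for w, p_n in zip(weights, p_ns) if p_n)
    return bp * math.exp(s)

# Example: perfect matches over 10/9/8/7 n-grams with equal lengths give BLEU close to 1.
# combine_bleu([10, 9, 8, 7], [10, 9, 8, 7], r=10, c=10)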


data = pickle.load(open("needed.p"))
vocab = json.load(open("./vocab.json"))
outs = []
golds = []
domain_wise = {}
for domain in ['schedule','navigate','weather']:
	domain_wise[domain] = {}
	domain_wise[domain]['tp_prec'] = 0.0
	domain_wise[domain]['tp_recall'] = 0.0
	domain_wise[domain]['total_prec'] = 0.0
	domain_wise[domain]['total_recall'] = 0.0
	domain_wise[domain]['gold'] = []
	domain_wise[domain]['output'] = []

tp_prec = 0.0
tp_recall = 0.0
Пример #57
0
 def __init__(self):
         with open('drugbank.pck', 'rb') as f:
             self.data = pickle.load(f)
             self.description = pickle.load(f)
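
The __init__ above reads two objects back-to-back from the same file handle; this works because every pickle.dump call writes a self-delimiting record, so successive pickle.load calls return the records in order. A minimal sketch of the writer side that would produce such a file (file name and payloads are placeholders):

import cPickle as pickle  # plain `pickle` on Python 3

data = {"aspirin": {"targets": ["COX-1", "COX-2"]}}  # placeholder payload
description = "toy drug database"                    # placeholder payload

with open('drugbank.pck', 'wb') as f:
    pickle.dump(data, f)         # read back first by the __init__ above
    pickle.dump(description, f)  # read back second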
Пример #58
0

class LossHistory(keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.losses = []

    def on_batch_end(self, batch, logs={}):
        self.losses.append(logs.get('loss'))


rnn_size = 256

# load data from pickle
f = open('data.pkl', 'r')

classes = cPickle.load(f)
chars = cPickle.load(f)
char_indices = cPickle.load(f)
indices_char = cPickle.load(f)

maxlen = cPickle.load(f)
step = cPickle.load(f)

X_ind = cPickle.load(f)
y_ind = cPickle.load(f)

f.close()

[s1, s2] = X_ind.shape

X = np.zeros((s1, s2, len(chars)), dtype=np.bool)
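
The snippet stops right after allocating the boolean tensor X; presumably X_ind holds integer character indices of shape (s1, s2) and X is meant to become its one-hot encoding over len(chars) classes. A standalone sketch of that conversion with placeholder sizes (only the allocation is shown in the original):

import numpy as np

s1, s2, n_chars = 4, 10, 57  # placeholder sizes
X_ind = np.random.randint(0, n_chars, size=(s1, s2))

X = np.zeros((s1, s2, n_chars), dtype=bool)
# Advanced indexing sets exactly one True per (sample, position) pair.
X[np.arange(s1)[:, None], np.arange(s2)[None, :], X_ind] = True

assert X.sum() == s1 * s2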
Пример #59
0
 def load_dict_data(self):
     """
         加载前缀数字典
     """
     if os.path.exists(self.data_path):
         self.root = cPickle.load(open(self.data_path, "r"))
def process_uniprot_ids(comm, uniprot_processor_num, num_tree_servers,
                        num_uniprot_processors, all_uniprot_ids):
  trees_of_uniprot_id = {}
  uniprot_ids_in_tree = {}

  uniprot_row_info = numpy.zeros(3, dtype='i')
  status = MPI.Status()

  while True:
    comm.Recv([uniprot_row_info, MPI.INT], source=0, tag=MPI.ANY_TAG,
              status = status)
    if status.Get_tag() == TAG_DATABASE_DONE:
      break
    else:
      tree_id = uniprot_row_info[0]
      left_id = uniprot_row_info[1]
      uniprot_id = uniprot_row_info[2]
      try:
        trees_of_uniprot_id[uniprot_id].add(tree_id)
      except KeyError:
        trees_of_uniprot_id[uniprot_id] = set([tree_id])
      try:
        uniprot_ids_in_tree[tree_id].add(uniprot_id)
      except KeyError:
        uniprot_ids_in_tree[tree_id] = set([uniprot_id])

  comm.Barrier()

  ortholog_request = numpy.zeros(2, dtype='i')
  uniprot_id_array = numpy.zeros(5000, dtype='i')
  phogs_supporting_orthology = numpy.zeros(5000, dtype='i')
  thresholds_of_orthology = numpy.zeros(5000, dtype='d')
  dir = '/clusterfs/ohana/external/genomes/QuestForOrthologs/Release5/'
  f = open(os.path.join(dir, "info_of_uniprot_accession.pkl"))
  info_of_uniprot_accession = cPickle.load(f)
  f.close()
  f = open(os.path.join(dir, "uniprot_accessions_of_uniprot_id.pkl"))
  uniprot_accessions_of_uniprot_id = cPickle.load(f)
  f.close()
  f = open(os.path.join('/clusterfs/vasudha/bpg/OrthologsForQuest/',
                        'OrthologsIn13ReferenceProteomes_%d_of_%d'
                        % (uniprot_processor_num, num_uniprot_processors)), "w")

  base_tree_server_id = 1
  def write_uniprot_id(uniprot_id):
    uniprot_accessions = uniprot_accessions_of_uniprot_id[uniprot_id]
    f.write("%d (%s)" % (uniprot_id, 
                        ','.join(["%s:%s" % (accession,
                        info_of_uniprot_accession[accession]['taxon'])
                        for accession in
                        uniprot_accessions_of_uniprot_id[uniprot_id]])))

  print "UniProt processor %d writing %d uniprot_ids" \
        % (uniprot_processor_num, len(trees_of_uniprot_id))
  t1 = MPI.Wtime()
  for uniprot_id in trees_of_uniprot_id:
    orthologs = {}
    for tree_id in trees_of_uniprot_id[uniprot_id]:
      tree_server_num = base_tree_server_id + tree_id % num_tree_servers
      ortholog_request[0] = tree_id
      ortholog_request[1] = uniprot_id
      comm.Send([ortholog_request, MPI.INT], dest = tree_server_num, 
                tag = TAG_ORTHOLOG_REQUEST)
      comm.Recv([uniprot_id_array, MPI.INT], source = tree_server_num,
                tag = TAG_ORTHOLOG_RESPONSE, status = status)
      num_uniprot_ids = status.Get_count(datatype = MPI.INT)
      comm.Recv([phogs_supporting_orthology, MPI.INT],
                source = tree_server_num, tag = TAG_ORTHOLOG_RESPONSE)
      comm.Recv([thresholds_of_orthology, MPI.DOUBLE_PRECISION], 
                source = tree_server_num, tag = TAG_ORTHOLOG_RESPONSE)
      for i in range(num_uniprot_ids):
        try:
          orthologs[uniprot_id_array[i]].add(
            (tree_id, phogs_supporting_orthology[i],
              thresholds_of_orthology[i]))
        except KeyError:
          orthologs[uniprot_id_array[i]] = set([
            (tree_id, phogs_supporting_orthology[i],
              thresholds_of_orthology[i])])
    write_uniprot_id(uniprot_id)
    f.write(": ")
    for ortholog in orthologs.keys():
      f.write("{")
      write_uniprot_id(ortholog)
      f.write(" <= ")
      for tree_id, left_id, threshold in orthologs[ortholog]:
        f.write("(PHOG%07d_%05d, %f)," % (tree_id, left_id, threshold))
      f.write("};")
    f.write("\n")
  f.close()
  t2 = MPI.Wtime()
  print "UniProt processor %d wrote all uniprot_ids in %g secs" \
        % (uniprot_processor_num, t2-t1)

  comm.Send([MPI.BOTTOM, MPI.INT], dest = 0, tag = TAG_ORTHOLOG_DONE)
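
process_uniprot_ids talks to the tree servers through a simple tag-based request/response protocol: a two-integer request (tree_id, uniprot_id) is sent with TAG_ORTHOLOG_REQUEST, and three fixed-size arrays come back with TAG_ORTHOLOG_RESPONSE. The serving side is not shown here; a hedged sketch of what such a loop might look like (the tag values, the lookup function and the termination condition are assumptions, not code from the original project):

import numpy
from mpi4py import MPI

TAG_ORTHOLOG_REQUEST = 1   # assumed values; the real constants are defined elsewhere
TAG_ORTHOLOG_RESPONSE = 2
TAG_ORTHOLOG_DONE = 3

def serve_ortholog_requests(comm, lookup_orthologs):
    """Answer (tree_id, uniprot_id) requests until a done message arrives."""
    request = numpy.zeros(2, dtype='i')
    status = MPI.Status()
    while True:
        comm.Recv([request, MPI.INT], source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG,
                  status=status)
        if status.Get_tag() == TAG_ORTHOLOG_DONE:
            break
        ids, phogs, thresholds = lookup_orthologs(int(request[0]), int(request[1]))
        # The three replies are sent in the same order the requester Recvs them.
        comm.Send([numpy.asarray(ids, dtype='i'), MPI.INT],
                  dest=status.Get_source(), tag=TAG_ORTHOLOG_RESPONSE)
        comm.Send([numpy.asarray(phogs, dtype='i'), MPI.INT],
                  dest=status.Get_source(), tag=TAG_ORTHOLOG_RESPONSE)
        comm.Send([numpy.asarray(thresholds, dtype='d'), MPI.DOUBLE_PRECISION],
                  dest=status.Get_source(), tag=TAG_ORTHOLOG_RESPONSE)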