Example #1
def test__mapping_repr(display_max_rows, n_vars, n_attr) -> None:
    long_name = "long_name"
    a = defchararray.add(long_name, np.arange(0, n_vars).astype(str))
    b = defchararray.add("attr_", np.arange(0, n_attr).astype(str))
    c = defchararray.add("coord", np.arange(0, n_vars).astype(str))
    attrs = {k: 2 for k in b}
    coords = {_c: np.array([0, 1]) for _c in c}
    data_vars = dict()
    for (v, _c) in zip(a, coords.items()):
        data_vars[v] = xr.DataArray(
            name=v,
            data=np.array([3, 4]),
            dims=[_c[0]],
            coords=dict([_c]),
        )
    ds = xr.Dataset(data_vars)
    ds.attrs = attrs

    with xr.set_options(display_max_rows=display_max_rows):

        # Parse the data_vars print and show only data_vars rows:
        summary = formatting.dataset_repr(ds).split("\n")
        summary = [v for v in summary if long_name in v]
        # The length should be less than or equal to display_max_rows:
        len_summary = len(summary)
        data_vars_print_size = min(display_max_rows, len_summary)
        assert len_summary == data_vars_print_size

        summary = formatting.data_vars_repr(ds.data_vars).split("\n")
        summary = [v for v in summary if long_name in v]
        # The length should be equal to the number of data variables
        len_summary = len(summary)
        assert len_summary == n_vars

        summary = formatting.coords_repr(ds.coords).split("\n")
        summary = [v for v in summary if "coord" in v]
        # The length should be equal to the number of coordinates (n_vars here)
        len_summary = len(summary)
        assert len_summary == n_vars

    with xr.set_options(
            display_max_rows=display_max_rows,
            display_expand_coords=False,
            display_expand_data_vars=False,
            display_expand_attrs=False,
    ):
        actual = formatting.dataset_repr(ds)
        col_width = formatting._calculate_col_width(ds.variables)
        dims_start = formatting.pretty_print("Dimensions:", col_width)
        dims_values = formatting.dim_summary_limited(ds,
                                                     col_width=col_width + 1,
                                                     max_rows=display_max_rows)
        expected = f"""\
<xarray.Dataset>
{dims_start}({dims_values})
Coordinates: ({n_vars})
Data variables: ({n_vars})
Attributes: ({n_attr})"""
        expected = dedent(expected)
        assert actual == expected
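
For reference, a minimal standalone sketch (array size invented, not from the test) of the defchararray.add pattern used above to build the names: a scalar prefix is concatenated element-wise with stringified indices.

import numpy as np
from numpy.core import defchararray

# scalar prefix broadcast against an array of stringified integers
names = defchararray.add("long_name", np.arange(3).astype(str))
# ['long_name0', 'long_name1', 'long_name2']
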
Example #2
def render(player, comp):
    c_coor = np.chararray((3, 3))
    c_coor[:] = "X"
    p_coor = np.chararray((3, 3))
    p_coor[:] = "O"
    print(char.add(char.multiply(c_coor, comp.astype(int)),
                   char.multiply(p_coor, player.astype(int))))
Example #3
def TAD_bins(arr):
    """
    Returns TADs as objects from their coordinates.
    """
    if arr.shape[0]:
        _vector_str = np.vectorize(str)
        return npchar.add(_vector_str(arr[:, 0]),
                          npchar.add(",", _vector_str(arr[:, 1])))
    return arr
Example #4
    def set_given_hopping(self, n, size, dic, mask, upper_part):
        '''
        Private method.
        Fill self.hop. 

        :param n: Integer. Hopping type.
        :param size: Integer. Number of hoppings.
        :param dic: Dictionary. Hopping dictionary.
        :param mask: np.ndarray. Mask.
        :param upper_part: Boolean. If True, self.hop['i'] < self.hop['j'].
        '''
        hop = np.empty(size,
                       dtype=[('n', 'u2'), ('i', 'u4'), ('j', 'u4'),
                              ('ang', 'f8'), ('tag', 'S2'), ('t', 'c16')])
        hop['n'] = dic['n']
        hop['t'] = dic['t']
        if upper_part:
            hop['i'] = self.store_hop[n]['i'][mask]
            hop['j'] = self.store_hop[n]['j'][mask]
            hop['ang'] = self.store_hop[n]['ang'][mask]
            hop['tag'] = self.store_hop[n]['tag'][mask]
        else:
            hop['i'] = self.store_hop[n]['j'][mask]
            hop['j'] = self.store_hop[n]['i'][mask]
            hop['ang'] = self.store_hop[n]['ang'][mask] - 180
            hop['tag'] = npc.add(self.lat.coor['tag'][hop['i']],
                                 self.lat.coor['tag'][hop['j']])
        return hop
Example #5
    def set_hopping_manual(self, dict_hop, upper_part=True):
        '''
        Set hoppings manually.

        :param dict_hop: Dictionary of hoppings.
            key: hopping indices, val: hopping values.

        :param upper_part: Boolean.

            * True, fill the Hamiltonian upper part.
            * False, fill the Hamiltonian lower part.  
        '''
        hop = np.zeros(len(dict_hop),
                       dtype=[('n', 'u2'), ('i', 'u4'), ('j', 'u4'),
                              ('ang', 'f8'), ('tag', 'S2'), ('t', 'c16')])
        i = [h[0] for h in dict_hop.keys()]
        j = [h[1] for h in dict_hop.keys()]
        t = [val for val in dict_hop.values()]
        hop['i'], hop['j'] = i, j
        hop['t'] = t
        hop['tag'] = npc.add(self.lat.coor['tag'][i], self.lat.coor['tag'][j])
        ang = 180 / PI * np.arctan2(
            self.lat.coor['y'][j] - self.lat.coor['y'][i],
            self.lat.coor['x'][j] - self.lat.coor['x'][i])
        if upper_part:
            ang[ang < 0] += 180
        else:
            ang[ang >= 0] -= 180
        hop['ang'] = ang
        self.hop = np.concatenate([self.hop, hop])
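
A small self-contained sketch (tag values invented) of the tag concatenation used in set_hopping_manual: npc.add joins the two one-character 'S2' tags into a two-site tag pair.

import numpy as np
import numpy.core.defchararray as npc

tag_i = np.array([b'a', b'b'], dtype='S2')
tag_j = np.array([b'c', b'a'], dtype='S2')
pair = npc.add(tag_i, tag_j)
# array([b'ac', b'ba'], dtype='|S4')
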
Example #6
    def set_given_hopping(self, n, size, dic, mask, upper_part):
        '''
        Private method.
        Fill self.hop. 

        :param n: Integer. Hopping type.
        :param size: Integer. Number of hoppings.
        :param dic: Dictionary. Hopping dictionary.
        :param mask: np.ndarray. Mask.
        :param upper_part: Boolean. If True, self.hop['i'] < self.hop['j'].
        '''
        hop = np.empty(size, dtype=[('n', 'u2'), ('i', 'u4'), ('j', 'u4'),
                                    ('ang', 'f8'), ('tag', 'S2'), ('t', 'c16')])
        hop['n'] = dic['n']
        hop['t'] = dic['t']
        if upper_part:
            hop['i'] = self.store_hop[n]['i'][mask]
            hop['j'] = self.store_hop[n]['j'][mask]
            hop['ang'] = self.store_hop[n]['ang'][mask]
            hop['tag'] = self.store_hop[n]['tag'][mask]
        else:
            hop['i'] = self.store_hop[n]['j'][mask]
            hop['j'] = self.store_hop[n]['i'][mask]
            hop['ang'] = self.store_hop[n]['ang'][mask] - 180
            hop['tag'] = npc.add(self.lat.coor['tag'][hop['i']],
                                 self.lat.coor['tag'][hop['j']])
        return hop
Example #7
    def set_hopping_manual(self, dict_hop, upper_part=True):
        '''
        Set hoppings manually.

        :param dict_hop: Dictionary of hoppings.
            key: hopping indices, val: hopping values.

        :param upper_part: Boolean.

            * True, fill the Hamiltonian upper part.
            * False, fill the Hamiltonian lower part.  
        '''
        hop = np.zeros(len(dict_hop), dtype=[('n', 'u2'), ('i', 'u4'), ('j', 'u4'),
                                             ('ang', 'f8'), ('tag', 'S2'), ('t', 'c16')])
        i = [h[0] for h in dict_hop.keys()]
        j = [h[1] for h in dict_hop.keys()]
        t = [val for val in dict_hop.values()]
        hop['i'], hop['j'] = i, j
        hop['t'] = t
        hop['tag'] = npc.add(self.lat.coor['tag'][i],
                             self.lat.coor['tag'][j])
        ang = 180 / PI * np.arctan2(self.lat.coor['y'][j] - self.lat.coor['y'][i],
                                    self.lat.coor['x'][j] - self.lat.coor['x'][i])
        if upper_part:
            ang[ang < 0] += 180
        else:
            ang[ang >= 0] -= 180
        hop['ang'] = ang
        self.hop = np.concatenate([self.hop, hop])
Example #8
def paral(path, d):
    probs = pd.read_csv("%s/%s.csv" % (direct, d), header=None).as_matrix()[:, 1].astype(float)
    # probs = np.loadtxt("%s/%s.csv" % (direct, d))[:, 1].astype(float)
    repeat = np.loadtxt("rep_trips/%s.txt" % d)[:, 1].astype(int)
    new_p = probs
    new_p[np.bitwise_and(repeat == 1, 1 > 0.5)] = 1.0
    indexes = np.arange(1, 201).astype(str)
    d_id = (np.ones(200)*int(d)).astype(int).astype(str)
    und = np.ones(200).astype(str)
    und[:] = "_"
    first_column = ncd.add(d_id, und)
    first_column = ncd.add(first_column, indexes)
    second_column = np.array(["%.8f" % p for p in new_p])

    outp = np.vstack((first_column, second_column)).T
    np.savetxt("%s/%s.csv" % (path, d), outp, fmt="%s", delimiter=",")
    print(d)
Example #9
def paral(path, d):
    probs = pd.read_csv("%s/%s.csv" % (direct, d), header=None).as_matrix()[:, 1].astype(float)
    # probs = np.loadtxt("%s/%s.csv" % (direct, d))[:, 1].astype(float)
    repeat = np.loadtxt("rep_trips/%s.txt" % d)[:, 1].astype(int)
    new_p = probs
    new_p[np.bitwise_and(repeat == 1, 1 > 0.5)] = 1.0
    indexes = np.arange(1, 201).astype(str)
    d_id = (np.ones(200)*int(d)).astype(int).astype(str)
    und = np.ones(200).astype(str)
    und[:] = "_"
    first_column = ncd.add(d_id, und)
    first_column = ncd.add(first_column, indexes)
    second_column = np.array(["%.8f" % p for p in new_p])

    outp = np.vstack((first_column, second_column)).T
    np.savetxt("%s/%s.csv" % (path, d), outp, fmt="%s", delimiter=",")
    print(d)
Example #10
def dbscan_sub_clus(df, i):
    '''
    Does another level of clustering on the basis of DBZ values.
    PARAMETERS: eps2, min_pts2 are the parameters of the DBZ level of clustering
                i is the cluster label
                df is a dataframe of only those points belonging to cluster i
    '''
    db = DBSCAN(min_samples=min_pts2, eps=eps2)
    db.fit(df[['dbz']])
    lab = add(db.labels_.astype(str), '_' + str(i))  # new label is formed as '<DBZ-level label>_<cluster label i>'
    return lab  # returns strings
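
A self-contained sketch of the label construction above (DBSCAN labels and the cluster index i are invented): each point's DBZ-level label gets the parent cluster id appended.

import numpy as np
from numpy.core.defchararray import add

labels = np.array([0, 0, 1, -1])   # stands in for db.labels_
i = 3                              # parent cluster label
lab = add(labels.astype(str), '_' + str(i))
# ['0_3', '0_3', '1_3', '-1_3']
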
Example #11
def p_ill_do_it_faggot(d, X_out, y_out):
    Xt, yt = utils.load_driver_pca(d)
    X_out, y_out = utils.load_outliers_pca(d, _samples-200)
    yt[:] = 1
    # rs = utils.clean_noise(Xt, _n_clean)
    # Xt = np.vstack((rs["X_clean"], rs["X_noise"]))
    # yt[_n_clean:] = 0
    X = np.vstack((Xt, X_out))
    y = np.hstack((yt, y_out))
    k_fold = cross_validation.KFold(len(X), n_folds=5)
    X, y = shuffle(X, y, random_state=13)
    d_p = np.zeros((200))
    for j, (train, test) in enumerate(k_fold):
        # probas_ = gbrt.fit(X[train], y[train]).predict_proba(X[test])
        gbrt.fit(X[train], y[train])
        regr.fit(X[train], y[train])
        my_p = gbrt.predict_proba(Xt)[:, 1] + regr.predict_proba(Xt)[:, 1]
        d_p += my_p
    d_p /= float(len(k_fold) * 2)

    d_p = (d_p - d_p.min())/(d_p.max()-d_p.min())

    d_p[d_p > 0.9] = 1.0
    d_p[d_p < 0.1] = 0.0

    indexes = np.arange(1, 201).astype(str)
    d_id = (np.ones(200)*int(d)).astype(int).astype(str)
    und = np.ones(200).astype(str)
    und[:] = "_"
    first_column = ncd.add(d_id, und)
    first_column = ncd.add(first_column, indexes)

    second_column = np.array(["%.8f" % p for p in d_p])

    outp = np.vstack((first_column, second_column)).T

    np.savetxt("subm_%s/%s.csv" % (_hash_id, d), outp, fmt="%s", delimiter=",")
    print(d)
Example #12
def trace_export(traces, time, x, y):
    index = np.arange(traces.shape[0])
    to_add = [
        ' ',
        x.astype(int).astype('str'), ' ',
        y.astype(int).astype('str')
    ]
    header = index.astype('str')
    for entry in to_add:
        header = np_str.add(header, entry)

    header = np.insert(header, 0, 'time (s)')
    export = np.vstack((header, np.vstack((time, traces)).T))

    return export
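
A standalone sketch (coordinates invented, np_str assumed to be numpy.char as the calls suggest) of the header loop in trace_export: scalar separators and stringified coordinate arrays are folded into one label per trace.

import numpy as np
import numpy.char as np_str

index = np.arange(2)
x = np.array([10.0, 20.0])
y = np.array([5.0, 6.0])
header = index.astype('str')
for entry in [' ', x.astype(int).astype('str'), ' ', y.astype(int).astype('str')]:
    header = np_str.add(header, entry)
# ['0 10 5', '1 20 6']
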
Example #13
def createtable(M):
    s = np.shape(M)
    res = '<table border = "2">'
    for i in range(s[0]):
        res = add(res, ' <tr>')
        for j in range(s[1]):
            res = add(res, add('  <td>', add(str(M[i, j]), '</td>')))
        res = add(res, '</tr>')
    res = add(res, '</table>')
    return res
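
A usage sketch for the function above (matrix values invented); note that add also accepts plain Python strings, so the whole table is built by repeated scalar concatenation.

import numpy as np

M = np.array([[1, 2], [3, 4]])
print(createtable(M))
# <table border = "2"> <tr>  <td>1</td>  <td>2</td></tr> <tr>  <td>3</td>  <td>4</td></tr></table>
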
Example #14
    def fill_store_hop(self, n):
        '''
        Private method.

        Store in *store_hop* indices (with :math:`i < j`), positive angles, and tags
        of a given type of hopping.
        '''
        ind = np.argwhere(np.isclose(self.dist_uni[n], self.vec_hop['dis'], atol=ATOL))
        ind_up = ind[ind[:, 1] > ind[:, 0]]
        hop = np.zeros(len(ind_up), dtype=[('n', 'u2'), ('i', 'u4'), ('j', 'u4'),
                                           ('ang', 'f8'), ('tag', 'S2')])
        hop['i'] = ind_up[:, 0]
        hop['j'] = ind_up[:, 1]
        hop['ang'] = self.vec_hop['ang'][ind_up[:, 0], ind_up[:, 1]]
        hop['tag'] = npc.add(self.lat.coor['tag'][ind_up[:, 0]],
                             self.lat.coor['tag'][ind_up[:, 1]])
        self.store_hop[n] = hop
Example #15
    def set_hopping_def(self, hopping_def):
        '''
        Set specific hoppings. 

        :param hopping_def:  Dictionary of hoppings. 
            key: hopping indices, val: hopping values. 

        Example usage::

            sys.set_hopping_def({(0, 1): 1., (1, 2): -1j})
        '''
        error_handling.empty_hop(self.hop)
        error_handling.set_hopping_def(self.hop, hopping_def, self.lat.sites)
        for key, val in hopping_def.items():
            cond = (self.hop['i'] == key[0]) & (self.hop['j'] == key[1])
            self.hop['t'][cond] = val
            self.hop['ang'] = self.vec_hop['ang'][key[0], key[1]]
            self.hop['tag'] = npc.add(self.lat.coor['tag'][key[0]],
                                      self.lat.coor['tag'][key[1]])
Example #16
    def set_hopping_def(self, hopping_def):
        '''
        Set specific hoppings. 

        :param hopping_def:  Dictionary of hoppings. 
            key: hopping indices, val: hopping values. 

        Example usage::

            sys.set_hopping_def({(0, 1): 1., (1, 2): -1j})
        '''
        error_handling.empty_hop(self.hop)
        error_handling.set_hopping_def(self.hop, hopping_def, self.lat.sites)
        for key, val in hopping_def.items():
            cond = (self.hop['i'] == key[0]) & (self.hop['j'] == key[1])
            self.hop['t'][cond] = val
            self.hop['ang'] = self.vec_hop['ang'][key[0], key[1]]
            self.hop['tag'] = npc.add(self.lat.coor['tag'][key[0]],
                                      self.lat.coor['tag'][key[1]])
Example #17
    def fill_store_hop(self, n):
        '''
        Private method.

        Store in *store_hop* indices (with :math:`i < j`), positive angles, and tags
        of a given type of hopping.
        '''
        ind = np.argwhere(
            np.isclose(self.dist_uni[n], self.vec_hop['dis'], atol=ATOL))
        ind_up = ind[ind[:, 1] > ind[:, 0]]
        hop = np.zeros(len(ind_up),
                       dtype=[('n', 'u2'), ('i', 'u4'), ('j', 'u4'),
                              ('ang', 'f8'), ('tag', 'S2')])
        hop['i'] = ind_up[:, 0]
        hop['j'] = ind_up[:, 1]
        hop['ang'] = self.vec_hop['ang'][ind_up[:, 0], ind_up[:, 1]]
        hop['tag'] = npc.add(self.lat.coor['tag'][ind_up[:, 0]],
                             self.lat.coor['tag'][ind_up[:, 1]])
        self.store_hop[n] = hop
Example #18
#         mmdbins = np.percentile(mmd, np.arange(0,100.1,25))
        
        for j in range(1, len(mmdbins)):
            tmpind = np.in1d(mmdind, j)
            if np.sum(tmpind) == 0:
                plot([], [], c=colors[j - 1])
                continue
            x1 = updf.values[tmpind, :].transpose()
            y1 = (tmp[tmpind, :] - tmp[tmpind, 249][:, np.newaxis]).transpose()
            x0 = np.nanmean(x1, axis=1)
            y0 = np.nanmean(y1, axis=1)
#             plot(x0, y0, c=colormapping.to_rgba(tmpmmd[j-1]))
            plot(x0, y0, c=colors[j - 1])

        ax = plt.gca()
        ax.legend(npch.add(npch.add(mmdbins[:-1].astype(str), ' - '), mmdbins[1:].astype(str)), loc='best')
        plt.ylabel('Altitude (km)')
        plt.xlabel('Updraft (m/s)')
        plt.title('Flight '+str(szi)+' updraft MMD cases '+str(len(np.squeeze(mmd))))
        plt.plot(plt.xlim(),np.array([0,0]),'k--')
        plt.plot(np.array([0,0]),plt.ylim(),'k--')
        plt.show()
#     except:
#         pass


#%%

# grouped w wind profile by IWC cats
import pandas as pd
Example #19
    def set(self, *args, merge=False):
        """Main entry point to assign value on plate 
           
        Parameters 
        ----------
          well : dict or str
              - if dict, well must contain well identifier as key and value to assign as value.eg : {"A2" : "value", "A[3-6]" : 42} 
              - if string, well is only a well identifier eg : "G5" 

         value : list or str or int or float 
             - if list, value should be presented with multiple well identifer "B-D[2-5]", ["value1", "value2", "value3"]

        merge : bool (by default False) 
            Value on well are not overide but added
        Returns
        -------
         BioPlate : BioPlate
             return instance of plate

         Exemples
         --------    
         see :ref:`Set-values-on-plate`
                 
        """
        well, value = self._args_analyse(*args)
        if not isinstance(well, str) and isinstance(well, Iterable):
            generator = well.items() if isinstance(well, dict) else well
            for key, val in generator:
                if merge:
                    self.set(key, val, merge=True)
                else:
                    self.set(key, val)
            return self
        well = BioPlateMatrix(str(well))
        if isinstance(value, list):
            plate_shape = self[well.row, well.column].shape
            len_plate_shape = len(plate_shape)
            if len_plate_shape > 1:
                if well.pos == "R":
                    resh_val = np.reshape(value, (plate_shape[0], 1))
                else:
                    resh_val = value
                if merge:
                    self[well.row,
                         well.column] = ncd.add(self[well.row, well.column],
                                                resh_val)
                    return self
                self[well.row, well.column] = resh_val
                return self
            else:
                if merge:
                    self[well.row, well.column][:len(value)] = ncd.add(
                        self[well.row, well.column][:len(value)], value)
                    return self
                self[well.row, well.column][:len(value)] = value
                return self
        if merge:
            self[well.row, well.column] = ncd.add(self[well.row, well.column],
                                                  value)
            return self
        self[well.row, well.column] = value
        return self
Example #20
train_labels = all_labels[3000:, :]

# Plot and save distribution of test data
classes, counts = np.unique(test_labels[:,1], return_counts=True)
plt.figure()
plt.bar(classes, counts)
plt.title('Distribution of retinopathy severity grades in test data')
plt.xlabel('Grade')
plt.ylabel('Count')
plt.savefig('../results/class_distribution_test.png')

class_dist = np.asarray((classes, counts), dtype=np.int).T
np.savetxt(fname='../results/class_distribution_test.csv', X=class_dist, delimiter=',')

# Plot and save distribution of train data
classes, counts = np.unique(train_labels[:,1], return_counts=True)
plt.figure()
plt.bar(classes, counts)
plt.title('Distribution of retinopathy severity grades in train data')
plt.xlabel('Grade')
plt.ylabel('Count')
plt.savefig('../results/class_distribution_train.png')

class_dist = np.asarray((classes, counts), dtype=np.int).T
np.savetxt(fname='../results/class_distribution_train.csv', X=class_dist, delimiter=',')

# Save filenames separately
test_filenames = add(test_labels[:,0], np.full(shape=test_labels[:,0].shape, fill_value='.jpeg'))
np.savetxt(fname='../data/test_filenames.txt', X=test_filenames, delimiter='', fmt='%s')
train_filenames = add(train_labels[:,0], np.full(shape=train_labels[:,0].shape, fill_value='.jpeg'))
np.savetxt(fname='../data/train_filenames.txt', X=train_filenames, delimiter='', fmt='%s')
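
A minimal sketch (file stems invented) of the filename construction above: np.full builds a same-shaped array of '.jpeg' suffixes that add concatenates element-wise.

import numpy as np
from numpy.core.defchararray import add

stems = np.array(['10_left', '10_right'])
filenames = add(stems, np.full(shape=stems.shape, fill_value='.jpeg'))
# ['10_left.jpeg', '10_right.jpeg']
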
Example #21
    def fit(self):

        # Results
        _cols_x = ['x%d' % i for i in range(self.n_parameters)]
        self.hist_ = pd.DataFrame(index=range((self.iters + 1) * self.n_chromosomes),
                                  columns=['iter', ] + _cols_x + ['cost', 'orig', ])
        self.hist_[['orig', ]] = '-1'

        #Initial random population
        self.pop_ = self._random(self.n_chromosomes)
        self.cost_ = self._fitness_function()

        filter_iter = range(0, self.n_chromosomes)
        self.hist_.loc[filter_iter, 'iter'] = 0
        self.hist_.loc[filter_iter, 'cost'] = self.cost_
        self.hist_.loc[filter_iter, _cols_x] = self.pop_

        for i in range(self.iters):

            if self.verbose > 0:
                print('Iteration ' + str(i) + ' of ' + str(self.iters))

            orig = np.empty(self.n_chromosomes, dtype='S10')
            cost_sort = np.argsort(self.cost_)

            #Elitism
            new_pop = np.empty_like(self.pop_)
            new_pop[0:self.n_elite] = self.pop_[cost_sort[0:self.n_elite]]
            orig[0:self.n_elite] = (cost_sort[0:self.n_elite] + i * self.n_chromosomes).astype(np.str)

            #Cumulative probability of selection as parent
            zcost = (self.cost_ - np.average(self.cost_)) / np.std(self.cost_)
            pzcost = 1 - norm.cdf(zcost)
            pcost = np.cumsum(pzcost / sum(pzcost))

            #Select parents & match
            numparents = self.n_chromosomes - self.n_elite
            #TODO: Add random state
            rand_parents = np.random.rand(numparents, 2)
            parents = np.zeros(rand_parents.shape, dtype=np.int)
            for parent1 in range(numparents):
                for parent2 in range(2):
                    parents[parent1, parent2] = np.searchsorted(pcost, rand_parents[parent1, parent2])

                if self.type_ == 'binary':
                    #Binary
                    #random single point matching
                    rand_match = int(np.random.rand() * self.n_parameters)
                    child = self.pop_[parents[parent1, 0]]
                    child[rand_match:] = self.pop_[parents[parent1, 1], rand_match:]
                else:
                    #Continuous
                    rand_match = np.random.rand(self.n_parameters)
                    child = self.pop_[parents[parent1, 0]] * rand_match
                    child += (1 - rand_match) * self.pop_[parents[parent1, 1]]

                new_pop[self.n_elite + parent1] = child

            orig[self.n_elite:] = [','.join(row.astype(np.str)) for row in (parents + i * self.n_chromosomes)]

            #Mutate
            m_rand = np.random.rand(self.n_chromosomes, self.n_parameters)
            m_rand[0:self.n_elite] = 1.0
            mutations = m_rand <= self.per_mutations
            num_mutations = np.count_nonzero(mutations)

            if self.type_ == 'binary':
                new_pop[mutations] = (new_pop[mutations] == 0).astype(np.int)
            else:
                new_pop[mutations] = self._random(num_mutations)[:, 0]

            rows_mutations = np.any(mutations, axis=1)
            orig[rows_mutations] = add(orig[rows_mutations],
                                       np.array(['_M'] * np.count_nonzero(rows_mutations), dtype='S10'))

            # Replace replicates with random
            temp_unique = np.ascontiguousarray(new_pop).view(np.dtype((np.void,
                                                                       new_pop.dtype.itemsize * new_pop.shape[1])))
            _, temp_unique_idx = np.unique(temp_unique, return_index=True)
            n_replace = self.n_chromosomes - temp_unique_idx.shape[0]
            if n_replace > 0:
                temp_unique_replace = np.ones(self.n_chromosomes, dtype=np.bool)
                temp_unique_replace[:] = True
                temp_unique_replace[temp_unique_idx] = False
                new_pop[temp_unique_replace] = self._random(n_replace)
                orig[temp_unique_replace] = '-1'

            self.pop_ = new_pop
            self.cost_ = self._fitness_function()

            filter_iter = range((i + 1) * self.n_chromosomes, (i + 2) * self.n_chromosomes)
            self.hist_.loc[filter_iter, 'iter'] = i + 1
            self.hist_.loc[filter_iter, 'cost'] = self.cost_
            self.hist_.loc[filter_iter, _cols_x] = self.pop_
            self.hist_.loc[filter_iter, 'orig'] = orig

        best = np.argmin(self.cost_)
        self.x = self.pop_[best]
        self.x_cost = self.cost_[best]
Example #22
DIR = os.path.dirname(os.path.realpath(__file__))
ID_DIR = os.path.join(DIR, 'shapenetcore_ids')
DATASET_DIR = os.path.join(DIR, 'ShapeNetCore.v2/{}'.format(class_id))

if not os.path.exists(DATASET_DIR):
    print(
        "please download the ShapeNetCore v.2 dataset, and place it into the same directory as this file"
    )
    sys.exit(0)

if not os.path.exists(ID_DIR):
    os.mkdir(ID_DIR, 0o777)

obj_ids = np.array(next(os.walk(DATASET_DIR))[1])
obj_ids = add(class_id + '/', obj_ids)
np.random.shuffle(obj_ids)

a = int(float(ratio1) * 0.01 * len(obj_ids))
b = int(float(ratio1 + ratio2) * 0.01 * len(obj_ids))

train, validate, test = obj_ids[:a], obj_ids[a:b], obj_ids[b:]

print('Total: %d' % len(obj_ids))
print('Train: %d' % len(train))
print('Validate: %d' % len(validate))
print('Test: %d' % len(test))

np.savetxt(os.path.join(ID_DIR, '{}_trainids.txt'.format(class_id)),
           train,
           fmt='%s')
Example #23
def main():
    parser = argparse.ArgumentParser(description="run motif clustering")
    parser.add_argument('input_dir', help="The location of tomtom results, bzip'd")
    parser.add_argument('features', help='path to RSAT features file')
    parser.add_argument('--filter_motifs', default=True, action='store_true',
                        help="Pre-filter motifs to include; see comments")
    parser.add_argument('--plot_motifs', default=False, action='store_true',
                        help="Plot motif clusters in separate PDFs")
    parser.add_argument('--option', type=int, default=1,
                        help="Filtering option, 1 or 2; see comments")
    parser.add_argument('--mcl_I', type=float, default=2.4,
                        help='mcl I parameter value')
    args = parser.parse_args()

    # if option == 1, then we SUM all the log-pvals for multiple occurrences
    # of the same motif pair, then filter the sums to only those that
    # are < -10 (same as done for original Halo ensemble)
    # if option == 2, then we take the LOWEST log-pval over multiple occurrences
    # of the same motif pair, with no additional filtering.
    # Note, option (2) was used for Eco and (1) for Halo in MSB EGRIN2 paper.
    option = args.option
    print('OPTION:', option)

    # pre-filter motifs, not implemented yet. (1) remove motifs that are in coding
    # regions (from fimo table);
    # (2) filter by motif E-value (3) filter by bicluster residual?
    pre_filter = False
    if args.filter_motifs:
        pre_filter = args.filter_motifs

    coding_fracs = total_frac = None

    if pre_filter:
        """
        # necessary only because egrin2-tools has hyphen and can't have hyphens in
        # python module paths...
        try:
            os.symlink('egrin2-tools/src/postproc/coding_fracs.py', 'coding_fracs.py')
        except:
            None"""

        total_frac = cf.get_total_coding_rgn(args.features)

        cf_files = np.sort(np.array(glob.glob(os.path.join('*/coding_fracs.tsv.bz2'))))
        coding_fracs = []
        for f in cf_files:
            print(f)
            cff = pd.read_table(bz2.BZ2File(f), sep='\t')
            cm_run = os.path.dirname(f)  # .split('-')[2]
            cff['cm_run'] = cm_run
            if cff.shape[0] > 1:
                coding_fracs.append(cff)  # [f] = cff
        coding_fracs = pd.concat(coding_fracs, keys=None, ignore_index=True)

        # this has a hack - for some reason cluster_id in coding_fracs is %04d,
        # trim first zero to make it %03d ...
        splitted = npstr.split(coding_fracs.motif.values.astype(str), '_')

        # see https://stackoverflow.com/a/28286749
        clust_id = np.char.mod('_%03d_', np.array([int(i[0]) for i in splitted]))
        mot_id = np.char.mod('%02d', np.array([int(i[1]) for i in splitted]))
        mot_id = npstr.add(clust_id, mot_id)
        coding_fracs['motif_id'] = npstr.add(coding_fracs.cm_run.values.astype(str), mot_id)

    input_dir = 'tomtom_out'
    input_dir = args.input_dir

    # folder with the tomtom files bzip'd
    files = np.sort(np.array(glob.glob(input_dir + "/*tomtom.tsv.bz2")))
    dfs = {}
    #  can pd.concat work on shelved dataframes? YES. Note protocol=2 is faster and smaller.
    dfs = shelve.open('tomtom_shelf.db', protocol=2, writeback=False)

    # if using a shelf, once this is done once, you don't have to do it again.
    if len(dfs) != len(files):
        for f in files:
            gene = os.path.basename(f).split('.')[0]
            print(f, gene)
            if gene in dfs.keys():
                continue
            try:
                df = pd.read_table(bz2.BZ2File(f), sep='\t')
                print(df.shape)
                if df.shape[0] <= 0:
                    continue
                df = df.ix[df['p-value'] <= 0.01]  # 0.1]
                print(df.shape)
                df = df.ix[df['#Query ID'] != df['Target ID']]
                print(df.shape)
                df = df.ix[df.Overlap >= 6]  # same setting as Halo run
                df = df.drop(['Query consensus', 'Target consensus'], axis=1)

                if pre_filter:
                    # add the coding fracs to the df:
                    tmp = pd.merge(df, coding_fracs, how='left', left_on='#Query ID',
                                   right_on='motif_id')
                    tmp = pd.merge(tmp, coding_fracs, how='left', left_on='Target ID',
                                   right_on='motif_id')
                    tmp = tmp.drop(['motif_x', 'cm_run_x', 'motif_id_x', 'motif_y',
                                    'cm_run_y', 'motif_id_y'], axis=1)

                    # drop the motifs with coding fracs greater than
                    # (expected value) + (obs. stddev) / 2
                    cutoff = total_frac  # + coding_fracs.coding_frac.mad() / 2
                    tmp = tmp.ix[np.logical_or(tmp.coding_frac_x.values < cutoff,
                                               tmp.coding_frac_y.values < cutoff)]
                    print(tmp.shape)
                    df = tmp; del tmp

                dfs[gene] = df

            except:
                continue

    if not os.path.isfile('motifs_tomtom.tsv.bz2'):
        if type(dfs) == dict:
            dfs2 = pd.concat(dfs, axis=0)
        else:
            dfs2 = pd.concat(dfs.values(), axis=0)
        print(dfs2.shape)

        # in case we fail on steps below...
        dfs2.to_csv(bz2.BZ2File('motifs_tomtom.tsv.bz2', 'w'), sep='\t', index=False, header=True)

    else:
        dfs2 = pd.read_table(bz2.BZ2File('motifs_tomtom.tsv.bz2', 'r'))

    if option == 2:
        # sort so lower p-values come first (these are kept by drop_duplicates)
        dfs2.sort('p-value', inplace=True)

        # no, we sum up the duplicate weights below
        dfs2.drop_duplicates(['#Query ID', 'Target ID'], inplace=True)
        print(dfs2.shape)

    gr = pd.DataFrame({'query': dfs2['#Query ID'].values, 'target': dfs2['Target ID'].values,
                       'weight': np.round_(-np.log10(dfs2['p-value'].values + 1e-99), 4)})

    # igraph cannot read bzipped files (streams)
    gr.to_csv('motifs_graph.tsv', sep=' ', index=False, header=False)
    del gr

    gr2 = ig.Graph.Read_Ncol('motifs_graph.tsv', names=True, weights=True, directed=False)
    system('bzip2 -fv9 motifs_graph.tsv &')

    print(gr2.ecount(), gr2.vcount())

    # cool! see http://igraph.org/python/doc/igraph.GraphBase-class.html#simplify
    # add up the weights for duplicated edges into a single edge weight
    if option == 1:
        gr2a = gr2.simplify(multiple=True, loops=False, combine_edges=sum)
        print(gr2a.ecount(), gr2a.vcount())

        # see http://igraph.org/python/doc/tutorial/tutorial.html#selecting-vertices-and-edges
        # used 10 for Halo; use less for fewer runs. returns an EdgeList
        gr2b = gr2a.es.select(weight_gt=10)
        gr2b = gr2b.subgraph()   # convert to a graph
        print(gr2b.ecount(), gr2b.vcount())
    elif option == 2:
        gr2a = gr2.simplify(multiple=True, loops=False, combine_edges=max)
        print(gr2a.ecount(), gr2a.vcount())
        gr2b = gr2a

    del gr2

    # no weights used - same as Halo analysis which was best!
    gr2b.write_ncol("mot_metaclustering.txt", weights=None)

    # now run mcl, latest version from http://www.micans.org/mcl/src/mcl-latest.tar.gz
    param_I = args.mcl_I
    cmd = 'mcl mot_metaclustering.txt --abc -I %.1f -v all -te 3 -S 200000' % (param_I)
    system(cmd)

    param_I_str = str(param_I).replace('.','')
    fo = open('out.mot_metaclustering.txt.I%s'%(param_I_str), 'r')
    lines = fo.readlines()
    fo.close()
    lines = [np.array(line.split()) for line in lines]

    # file contains actual motif ids rather than numbers
    clusters = lines
    del lines

    clust_lens = np.array([len(i) for i in clusters])
    print('Clusters with >= 10 motifs:', np.sum(clust_lens >= 10))
    print('Total number of motifs:', gr2a.vcount())
    print('Number of motifs in >= 10-size clusters:',
          np.sum(np.array([len(clusters[i]) for i in np.where(clust_lens >= 10)[0]])))
    print('Fraction of motifs in >= 10-size clusters:',
          float(np.sum(np.array([len(clusters[i]) for i in np.where(clust_lens >= 10)[0]]))) /
          float(gr2a.vcount()))
    del gr2a

    # Get info on alignments for each motif cluster
    dfs2.set_index('#Query ID', drop=False, inplace=True)
    clust_dfs = {}
    for i in range(len(clusters)):
        clust = clusters[i]
        print(i, len(clust))
        if i in clust_dfs.keys() or len(clust) < 10 or i > 500:
            continue
        df = dfs2.ix[clust]
        df = df.iloc[np.in1d(df['Target ID'].values, clust)]
        df = df.sort(['p-value'])
        df = df.ix[~df.duplicated(['#Query ID', 'Target ID'])]  # remove dupes
        df = df.reset_index(drop=True)
        df['motif_clust'] = i
        print(df.shape)
        clust_dfs[i] = df

    del dfs2
    clust_dfs = pd.concat(clust_dfs, axis=0)
    print(clust_dfs.shape)

    # get coding fracs per motif cluster via:
    # clust_dfs.groupby('motif_clust').mean().coding_frac_x
    clust_dfs.to_csv(bz2.BZ2File('motif_clusts_%s.tsv.bz2'%(param_I_str), 'w'),
                     sep='\t', index=False, header=True)
Example #24
            if FOLDERNAME == "/home/ole/windows/all_data/emb217/deployments/moorings/TC_Flach/RBR/data" or FOLDERNAME == "/home/ole/windows/all_data/emb217/deployments/moorings/TC_Tief/RBR/data":
                skip_header = 28

            temporary_data = np.genfromtxt(datafile_path,
                                           skip_header=skip_header,
                                           usecols=(0, 1, 2),
                                           encoding="iso8859_15",
                                           dtype="str")

        locale.setlocale(locale.LC_TIME, "C")
        print(locale.getlocale())
        #convert the strings in the data to datetime objects
        days = temporary_data[:, 0]
        hours = temporary_data[:, 1]
        string_time = np.asarray(add(days, add("-", hours)))
        full_utc = np.asarray([
            dt.datetime.strptime(string_time[i], "%d-%b-%Y-%X.%f")
            for i in np.arange(string_time.size)
        ])

        #Error in the time log of one sensor. Wrong about one hour
        #if datafile_path[25:] == "/emb217/deployments/moorings/TC_Flach/RBR/data/EMB217_TC-Chain-flach_016172_eng.txt":
        #    full_utc = full_utc - dt.timedelta(hours=1)

        full_temperature = temporary_data[:, 2].astype("float")
        #temperature can't be reasonable negative
        full_temperature[full_temperature < 0] = np.nan

        #search for measurement properties in the file
        for i in np.arange(np.shape(sensor_positions)[0]):
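
A compact sketch (sample day and hour strings invented) of the timestamp assembly above: the two text columns are joined with a '-' separator and then parsed.

import datetime as dt
import numpy as np
from numpy.core.defchararray import add

days = np.array(['01-Jan-2019', '01-Jan-2019'])
hours = np.array(['12:00:00.000', '12:00:30.000'])
string_time = np.asarray(add(days, add("-", hours)))
# the original sets locale.LC_TIME to "C" so that %X parses as HH:MM:SS
full_utc = np.asarray([dt.datetime.strptime(s, "%d-%b-%Y-%X.%f") for s in string_time])
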
Example #25
def run():
    parser = argparse.ArgumentParser(description="Examples: \n" +\
     "calc_spectra data/vega.pkl data/vega/ -i 0.000 1.5707963267948966 150; " +\
     "calc_spectra data/vega.pkl data/vega/ -i 0.088418; " +\
     "calc_spectra data/altair.pkl data/altair/ -i 0.8840; " +\
     "calc_spectra data/achernar.pkl data/achernar/ -i 1.0577")
    parser.add_argument("pkl_sfile", help="the pickled star file")
    parser.add_argument("output", help="the output directory")
    parser.add_argument(
        '-i',
        type=float,
        nargs='+',
        help='either a single inclination in radians ' +
        'or equally spaced values specified by minimum, maximum and number',
        required=True)
    parser.add_argument("-m", help="longitudinal integration method: 0=cubic(default), 1=trapezoidal", type=int, \
      default=0)
    args = parser.parse_args()

    ## inputs
    pkl_sfile = args.pkl_sfile  # pickled star file
    output = args.output  # output location

    # integration method
    if args.m == 0:
        m = 'cubic'
    elif args.m == 1:
        m = 'trapezoid'
    else:
        sys.exit(
            "Longitudinal integration method should be either 0 (cubic) or 1 (trapezoidal)."
        )

    # inclinations
    i = args.i
    li = len(i)
    if li not in [1, 3]:
        sys.exit("Please specify either a single inclination in radians (one number) " +\
         "or a range specified by minimum, maximum and step (three numbers).")
    elif li == 1:
        inclinations = np.array(i)
        # decimal precision of inclination for printout
        prec = 6
    elif li == 3:
        mi, ma, num = i
        inclinations = np.linspace(mi, ma, num=int(num))
        # decimal precision of inclination for printout
        prec = np.int(np.ceil(-np.log10((ma - mi) / num)))
    leni = len(inclinations)

    # unpickle the star
    with open(pkl_sfile, 'rb') as f:
        st = pickle.load(f)
    # get the wavelengths at which we see light from this star
    wl = st.wavelengths

    ## write the spectra of the star in text format
    # create the directory if it doesn't exist
    if not os.path.exists(output):
        os.mkdir(output)
    # filenames
    if not output.endswith('/'):
        output += '/'
    filename = os.path.splitext(os.path.basename(pkl_sfile))[0]
    inc_str = np.array([("%." + str(prec) + "f") % x
                        for x in np.round(inclinations, decimals=prec)])
    ofiles = ch.add(output + filename, inc_str)
    ofiles = ch.replace(ofiles, '.', '_')
    ofiles = ch.add(ofiles, '.txt')

    for i, ofile in np.ndenumerate(ofiles):
        # message
        if i[0] % 10 == 0:
            print(
                str(i[0]) + " out of " + str(leni) +
                " inclinations calculated.")
            sys.stdout.flush()
        # current inclination
        inc = inclinations[i]
        # calculate the spectrum or the magnitudes
        light = st.integrate(inc, method=m)

        # create this file if it doesn't exist, open it for writing
        f = open(ofile, 'w+')
        # write the header
        f.write('# luminosity: ' + str(st.luminosity) + '\n')
        f.write('# omega: ' + str(st.surface.omega) + '\n')
        f.write('# inclination(rad): ' + str(inclinations[i]) + '\n')
        f.write('# mass: ' + str(st.mass) + '\n')
        f.write('# Req: ' + str(st.Req) + '\n')
        f.write('# distance: ' + format(st.distance, '.2e') + ' cm\n')
        f.write('# A_V: ' + format(*(st.a_v), '.2f') + '\n')
        f.write('# number of upper half z values: ' + str(st.map.nz) + '\n')
        # write the spectrum to the file
        f.write('\n')
        if st.bands is None:  # spectrum mode
            f.write('# wavelength(nm)\tflux(ergs/s/Hz/ster)\n')
            for j, w in np.ndenumerate(wl):
                f.write(str(w))
                f.write('\t %.5E' % light[j])
                f.write('\n')
        else:  # photometry mode
            f.write('# filter\twavelength(nm)\tmagnitude\n')
            for j, w in enumerate(wl):
                f.write(st.bands[j])
                f.write('\t %.6g' % w)
                f.write('\t %.8f' % light[j])
                f.write('\n')
        f.close()
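
A minimal sketch (output path, filename, and inclinations invented; ch assumed to be numpy.char) of the output-filename construction above: ch.add appends the inclination strings, ch.replace swaps the decimal point for an underscore, and a final ch.add adds the extension.

import numpy as np
import numpy.char as ch

output, filename = 'data/vega/', 'vega'
inc_str = np.array(['0.088418', '1.570796'])
ofiles = ch.add(output + filename, inc_str)
ofiles = ch.replace(ofiles, '.', '_')
ofiles = ch.add(ofiles, '.txt')
# ['data/vega/vega0_088418.txt', 'data/vega/vega1_570796.txt']
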
Example #26
        res = add(res, '</tr>')
    res = add(res, '</table>')
    return res


FORREPLACE = createtable(rho)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for z, height in enumerate(abs(rho)):

    ax.bar(np.arange(4), height, zs=z, zdir='y', color='b', alpha=0.8)

plt.savefig('rhobar3')

FORREPLACE = add(FORREPLACE,
                 '<img src="../rhobar3.png" width="500" height="500"><br>')

f = add(add(add(add('<br>Tangle ', str(mean[0])), ' +/- '), str(errs[0])),
        '\n')
f = add(
    f,
    add(
        add(add(add('<br>Linear Entropy ', str(mean[1])), ' +/- '),
            str(errs[1])), '\n'))
f = add(
    f,
    add(add(add(add('<br>Entropy ', str(mean[2])), ' +/- '), str(errs[2])),
        '\n'))
f = add(f, add(add('<br>Intensity ', str(intensity)), '\n'))
f = add(f, add(add('<br>fval ', str(fval)), '\n'))
Example #27
    cf_files = np.sort(np.array(glob.glob(os.path.join('*/coding_fracs.tsv.bz2'))))
    coding_fracs = []  # {}
    for f in cf_files:
        print(f)
        cff = pd.read_table(bz2.BZ2File(f), sep='\t')
        cm_run = os.path.dirname(f)  # .split('-')[2]
        cff['cm_run'] = cm_run
        if cff.shape[0] > 1:
            coding_fracs.append(cff)  # [f] = cff
    coding_fracs = pd.concat(coding_fracs, keys=None, ignore_index=True)
    # this has a hack - for some reason cluster_id in coding_fracs is %04d, trim first zero to make it %03d ...
    splitted = npstr.split(coding_fracs.motif.values.astype(str), '_')
    clust_id = np.char.mod('_%03d_', np.array([int(i[0]) for i in splitted]))  # see https://stackoverflow.com/a/28286749
    mot_id = np.char.mod('%02d', np.array([int(i[1]) for i in splitted]))
    mot_id = npstr.add(clust_id, mot_id)
    coding_fracs['motif_id'] = npstr.add(coding_fracs.cm_run.values.astype(str), mot_id)

input_dir = 'tomtom_out'
input_dir = opt.input_dir
files = np.sort(np.array(glob.glob(input_dir + "/*tomtom.tsv.bz2")))  # folder with the tomtom files bzip'd
dfs = {}
#  can pd.concat work on shelved dataframes? YES. Note protocol=2 is faster and smaller.
dfs = shelve.open('tomtom_shelf.db', protocol=2, writeback=False)

if len(dfs) != len(files):  # if using a shelf, once this is done once, you don't have to do it again.
    for f in files:
        gene = os.path.basename(f).split('.')[0]
        print(f, gene)
        if gene in dfs.keys():
            continue
Example #28
    def fit(self):

        # Results
        _cols_x = ['x%d' % i for i in range(self.n_parameters)]
        self.hist_ = pd.DataFrame(index=range(
            (self.iters + 1) * self.n_chromosomes),
                                  columns=[
                                      'iter',
                                  ] + _cols_x + [
                                      'cost',
                                      'orig',
                                  ])
        self.hist_[[
            'orig',
        ]] = '-1'

        #Initial random population
        self.pop_ = self._random(self.n_chromosomes)
        self.cost_ = self._fitness_function()

        filter_iter = range(0, self.n_chromosomes)
        self.hist_.loc[filter_iter, 'iter'] = 0
        self.hist_.loc[filter_iter, 'cost'] = self.cost_
        self.hist_.loc[filter_iter, _cols_x] = self.pop_

        for i in range(self.iters):

            if self.verbose > 0:
                print('Iteration ' + str(i) + ' of ' + str(self.iters))

            orig = np.empty(self.n_chromosomes, dtype='S10')
            cost_sort = np.argsort(self.cost_)

            #Elitism
            new_pop = np.empty_like(self.pop_)
            new_pop[0:self.n_elite] = self.pop_[cost_sort[0:self.n_elite]]
            orig[0:self.n_elite] = (cost_sort[0:self.n_elite] +
                                    i * self.n_chromosomes).astype(np.str)

            #Cumulative probability of selection as parent
            zcost = (self.cost_ - np.average(self.cost_)) / np.std(self.cost_)
            pzcost = 1 - norm.cdf(zcost)
            pcost = np.cumsum(pzcost / sum(pzcost))

            #Select parents & match
            numparents = self.n_chromosomes - self.n_elite
            #TODO: Add random state
            rand_parents = np.random.rand(numparents, 2)
            parents = np.zeros(rand_parents.shape, dtype=np.int)
            for parent1 in range(numparents):
                for parent2 in range(2):
                    parents[parent1, parent2] = np.searchsorted(
                        pcost, rand_parents[parent1, parent2])

                if self.type_ == 'binary':
                    #Binary
                    #random single point matching
                    rand_match = int(np.random.rand() * self.n_parameters)
                    child = self.pop_[parents[parent1, 0]]
                    child[rand_match:] = self.pop_[parents[parent1, 1],
                                                   rand_match:]
                else:
                    #Continuous
                    rand_match = np.random.rand(self.n_parameters)
                    child = self.pop_[parents[parent1, 0]] * rand_match
                    child += (1 - rand_match) * self.pop_[parents[parent1, 1]]

                new_pop[self.n_elite + parent1] = child

            orig[self.n_elite:] = [
                ','.join(row.astype(np.str))
                for row in (parents + i * self.n_chromosomes)
            ]

            #Mutate
            m_rand = np.random.rand(self.n_chromosomes, self.n_parameters)
            m_rand[0:self.n_elite] = 1.0
            mutations = m_rand <= self.per_mutations
            num_mutations = np.count_nonzero(mutations)

            if self.type_ == 'binary':
                new_pop[mutations] = (new_pop[mutations] == 0).astype(np.int)
            else:
                new_pop[mutations] = self._random(num_mutations)[:, 0]

            rows_mutations = np.any(mutations, axis=1)
            orig[rows_mutations] = add(
                orig[rows_mutations],
                np.array(['_M'] * np.count_nonzero(rows_mutations),
                         dtype='S10'))

            # Replace replicates with random
            temp_unique = np.ascontiguousarray(new_pop).view(
                np.dtype((np.void, new_pop.dtype.itemsize * new_pop.shape[1])))
            _, temp_unique_idx = np.unique(temp_unique, return_index=True)
            n_replace = self.n_chromosomes - temp_unique_idx.shape[0]
            if n_replace > 0:
                temp_unique_replace = np.ones(self.n_chromosomes,
                                              dtype=np.bool)
                temp_unique_replace[:] = True
                temp_unique_replace[temp_unique_idx] = False
                new_pop[temp_unique_replace] = self._random(n_replace)
                orig[temp_unique_replace] = '-1'

            self.pop_ = new_pop
            self.cost_ = self._fitness_function()

            filter_iter = range((i + 1) * self.n_chromosomes,
                                (i + 2) * self.n_chromosomes)
            self.hist_.loc[filter_iter, 'iter'] = i + 1
            self.hist_.loc[filter_iter, 'cost'] = self.cost_
            self.hist_.loc[filter_iter, _cols_x] = self.pop_
            self.hist_.loc[filter_iter, 'orig'] = orig

        best = np.argmin(self.cost_)
        self.x = self.pop_[best]
        self.x_cost = self.cost_[best]
Example #29
def get_generators(n_total,
                   batch_size,
                   image_shape=None,
                   type='array',
                   zeros_left=5000):
    '''
    Construct generators for training and validation data
    Zero grade images are downsampled
    :param n_total: number of total images to use (training plus validation)
    :param batch_size: batch size used in training
    :param image_shape: image size used in training
    :param zeros_left: how many images of grade zero should be left in the pool
                       use a negative value to keep all the zeros
    :return: train_gen: generator of training data
             test_gen: generator of validation data
    '''
    # Set the number of training samples
    n_train = int(np.ceil(n_total * 0.8))
    n_test = int(np.floor(n_total * 0.2))

    # Read filenames from a text file listing all the images
    full_filenames = np.genfromtxt('../data/train_filenames.txt', dtype=str)
    # Read the labels file
    full_labels = np.genfromtxt('../data/trainLabels.csv',
                                skip_header=1,
                                dtype=str,
                                delimiter=',')
    # Keep only labels of data that can be used in training
    full_samples = replace(full_filenames, ".jpeg", "")
    full_mask = np.isin(full_labels[:, 0], full_samples)
    trainable_labels = np.copy(full_labels[full_mask, :])

    # Downsample the zero grade, keeping only the first 5000
    # Randomize order
    np.random.seed(1234)
    np.random.shuffle(trainable_labels)
    # Arrange by a stable sort (mergesort)
    trainable_labels = np.copy(
        trainable_labels[trainable_labels[:, 1].argsort(kind='mergesort')])
    # Remove extra zeros
    if zeros_left > 0:
        _, counts = np.unique(trainable_labels[:, 1], return_counts=True)
        n_zeros = counts[0]
        downsampled_labels = np.copy(trainable_labels[(n_zeros -
                                                       zeros_left):, :])
    else:
        downsampled_labels = np.copy(trainable_labels)

    # Randomize and choose training data
    np.random.shuffle(downsampled_labels)
    train_labels = downsampled_labels[:n_train, :]
    #test_labels = downsampled_labels[n_train:(n_train + n_test)]
    # Exclude training samples from the original data and choose test data among them
    np.random.shuffle(trainable_labels)
    exclusion = np.isin(trainable_labels[:, 0],
                        train_labels[:, 0],
                        invert=True)
    valid_labels = np.copy(trainable_labels[exclusion, :])
    test_labels = np.copy(valid_labels[:n_test, :])

    # Print the counts of each class in test and train data
    _, train_counts = np.unique(train_labels[:, 1], return_counts=True)
    print("\nTrain distribution:")
    print(train_counts / np.sum(train_counts))
    _, test_counts = np.unique(test_labels[:, 1], return_counts=True)
    print("\nTest distribution:")
    print(test_counts / np.sum(test_counts))
    print("\n")

    if type == 'array':
        # Add .npy file ending
        train_filenames = add(train_labels[:, 0],
                              np.full(shape=n_train, fill_value='.npy'))
        test_filenames = add(test_labels[:, 0],
                             np.full(shape=n_test, fill_value='.npy'))
        # Add path of the data folder to the files
        train_filepaths = add(
            np.full(shape=train_filenames.shape, fill_value='../data/arrays/'),
            train_filenames)
        test_filepaths = add(
            np.full(shape=test_filenames.shape, fill_value='../data/arrays/'),
            test_filenames)

        # Create an instance of the image generator
        train_gen = ArrayGenerator(train_filepaths, train_labels[:, 1],
                                   batch_size)
        test_gen = ArrayGenerator(test_filepaths, test_labels[:, 1],
                                  batch_size)

    elif type == 'image':
        if image_shape is None:
            raise ValueError
        # Add .jpeg file ending
        train_filenames = add(train_labels[:, 0],
                              np.full(shape=n_train, fill_value='.jpeg'))
        test_filenames = add(test_labels[:, 0],
                             np.full(shape=n_test, fill_value='.jpeg'))
        # Add path of the data folder to the files
        train_filepaths = add(
            np.full(shape=train_filenames.shape, fill_value='../data/train/'),
            train_filenames)
        test_filepaths = add(
            np.full(shape=test_filenames.shape, fill_value='../data/train/'),
            test_filenames)

        # Create an instance of the image generator
        train_gen = ImageGenerator(train_filepaths, train_labels[:, 1],
                                   batch_size, image_shape)
        test_gen = ImageGenerator(test_filepaths, test_labels[:, 1],
                                  batch_size, image_shape)

    return train_gen, test_gen
Example #30
 locUK608 = astropy.coordinates.EarthLocation.from_geodetic(
     lat=51.143833512, lon=-1.433500703, height=176.028)  # UK608 LBA
 locIE613 = astropy.coordinates.EarthLocation.from_geocentric(
     3801633.528060000, -529021.899396000, 5076997.185,
     unit='m')  # IE613 LBA
 if args.observatory.startswith('UK608'): tstart.location = locUK608
 elif args.observatory.startswith('IE613'): tstart.location = locIE613
 if args.observatory.startswith('UK608'): tend.location = locUK608
 elif args.observatory.startswith('IE613'): tend.location = locIE613
 filename = args.filename
 sources, durations = sourcelist(filename)
 numberofsources = len(sources)
 print('Designing observations starting at ', tstart)
 print(numberofsources, ' Sources')
 #    print (sources, durations)
 psrnames = add('PSR ', sources)
 #    print (psrnames, durations)
 lst = getlst(tstart, psrnames[0])
 # Find first source
 if args.strictorder:
     index = 0
 else:
     index = findfirstsource(
         psrnames, lst, 3
     )  # the last argument is in hours, to be subtracted from the LST to find the first source
 rotated_psrnames = np.roll(psrnames, -index)
 rotated_durations = np.roll(durations, -index)
 #    print (rotated_psrnames)
 currenttime = tstart
 deadtime = astropy.time.TimeDelta(60, format='sec')
 stepwait = astropy.time.TimeDelta(600, format='sec')
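
A short sketch (source names invented) of the prefixing step above: the scalar 'PSR ' prefix broadcasts across the whole array of source names.

import numpy as np
from numpy.core.defchararray import add

sources = np.array(['B0329+54', 'B0950+08'])
psrnames = add('PSR ', sources)
# ['PSR B0329+54', 'PSR B0950+08']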