Example #1
    def unique_configurations( self, site_distribution, verbose=False, show_progress=False ):
        """
        Find the symmetry inequivalent configurations for a given population of objects.

        Args:
            site_distribution (dict): A dictionary that defines the number of each object 
                                      to be arranged in this system.

                                      e.g. for a system with four sites, with two occupied (denoted `1`)
                                      and two unoccupied (denoted `0`)::

                                          { 1: 2, 0: 2 }
            verbose (opt:default=False): Print verbose output.
            show_progress (opt:default=False): Show a progress bar.
                                      Setting to `True` gives a simple progress bar.
                                      Setting to `"notebook"` gives a Jupyter notebook compatible progress bar.

        Returns:
            unique_configurations (list): A list of :any:`Configuration` objects, for each symmetry 
                                          inequivalent configuration. 
        """
        s = flatten_list( [ [ key ] * site_distribution[ key ] for key in site_distribution ] )
        total_permutations = number_of_unique_permutations( s )
        if verbose:
            print( 'total number of sites: ' + str( sum( site_distribution.values() ) ) )
            print( 'using {:d} symmetry operations.'.format( len( self.symmetry_group.symmetry_operations ) ) )
            print( 'evaluating {:d} unique permutations.'.format( total_permutations ) )
        generator = unique_permutations( s )
        if show_progress:
            if show_progress=='notebook':
                generator = tqdm_notebook( generator, total=total_permutations, unit=' permutations' )
            else:
                generator = tqdm( generator, total=total_permutations, unit=' permutations' )
        return self.enumerate_configurations( generator, verbose=verbose )
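A minimal usage sketch for the method above (hedged: `space` is a hypothetical object exposing unique_configurations; the dictionary format follows the docstring):

site_distribution = {1: 2, 0: 2}  # two occupied sites, two vacant sites
configs = space.unique_configurations(site_distribution,
                                      verbose=True,
                                      show_progress='notebook')  # Jupyter-style bar
print('found %d symmetry inequivalent configurations' % len(configs))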
Example #2
    def __new__(cls, iterable=None, desc=None, total=None, leave=True,
                backend=None, verbose=True):
        if backend is None:
            backend = Progressbar.backend

        if not verbose:
            backend = "hide"

        if backend == "tqdm":
            from tqdm import tqdm
            return tqdm(iterable=iterable, desc=desc, total=total, leave=leave,
                        ascii=True, ncols=80, file=sys.stdout,
                        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed"
                                   "}<{remaining}{postfix}]") # remove rate_fmt
        elif backend == "tqdm_notebook":
            from tqdm import tqdm_notebook
            return tqdm_notebook(iterable=iterable, desc=desc, total=total,
                                 leave=leave)
        elif backend == "pyprind":
            from pyprind import ProgBar, prog_bar
            ProgBar._adjust_width = lambda self: None  # keep constant width
            if iterable is None:
                return ProgBar(total, title=desc, stream=1)
            else:
                return prog_bar(iterable, title=desc, stream=1,
                                iterations=total)
        elif backend == "hide":
            return NoProgressbar(iterable=iterable)
        else:
            raise NotImplementedError("unknown backend")
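A hedged usage sketch for the dispatcher above: Progressbar.backend is assumed to be a class-level default (e.g. "tqdm"), and the returned object is whatever bar the chosen backend builds, so it can be iterated directly:

for item in Progressbar(range(1000), desc='processing', backend='tqdm'):
    pass  # each iteration advances the fixed-width tqdm bar on stdout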
Example #3
def progbar(it=None, nb=False, **tqdm_settings):
    """ Turn any iterable into a progress bar, with notebook version. """
    defaults = {'ascii': True, 'smoothing': 0}
    # Override defaults with custom tqdm_settings
    settings = {**defaults, **tqdm_settings}
    if nb:  # pragma: no cover
        return tqdm.tqdm_notebook(it, **settings)
    return tqdm.tqdm(it, **settings)
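A usage sketch for the wrapper above (it assumes the enclosing module has done `import tqdm`); any extra keyword such as `desc` is forwarded untouched to tqdm:

import time

for i in progbar(range(100), nb=False, desc='working'):
    time.sleep(0.01)  # stand-in for real work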
Example #4
def load_data(last_n_days=30):

    def load_gz(file):
        try:
            if file.split('.')[-1] == 'gz':
                with gzip.open(file) as f:
                    data = ujson.loads(f.read().decode('utf-8'))
            else:
                with open(file, encoding='utf-8') as f:
                    data = ujson.load(f)
        except Exception:
            print(f'Error loading file: {file}')
            return []
        return [Window(*v) for v in data]

    files = {file: os.path.getctime(os.path.join(LOGS, file)) for file in os.listdir(LOGS)}
    split_date = (dt.fromtimestamp(files[sorted(files.keys())[-1]]) -
                  pd.Timedelta(str(last_n_days) + 'days')).date()
    data = None
    days = []
    for file in tqdm_notebook(files):
        if dt.fromtimestamp(files[file]).date() > split_date:
            day = load_gz(os.path.join(LOGS, file))
            day = pd.DataFrame.from_records(day, columns=Window._fields)
            day['boot'] = pd.Timestamp(day['start_time'].min())
            days.append(day)

    data = pd.concat([*days])
    data['start_time'] = data['start_time'].apply(lambda x: pd.Timestamp(x))
    data['last_update'] = data['last_update'].apply(lambda x: pd.Timestamp(x))
    data['focus_time'] = data['focus_time'].apply(lambda x: pd.Timedelta(x))
    data['start_time'] = data['last_update'] - data['focus_time']

    def categorize(x, dictionary):
        for k, v in dictionary.items():
            if k.lower() in x.lower():
                return v

    def merge(*lists):
        ret = lists[0]
        for l in lists[:-1]:
            assert len(l) == len(lists[-1])
        for i in range(len(lists[0])):
            for l in lists:
                if l[i]:
                    ret[i] = l[i]
                    break
        return ret

    if data is not None:
        data['category'] = merge(
            data['name'].apply(lambda x: categorize(x, categories_name)).values,
            data['exe'].apply(lambda x: categorize(x, categories_exe)).values,
            data['exe'].str.split('\\').apply(lambda x: x[-1]).values)

    # Delete unused columns
    del data['pid']
    del data['cmd']
    return data
Example #5
    def on_epoch_begin(self, epoch, logs=None):
        print('Epoch %d/%d' % (epoch + 1, self.epochs))
        if "steps" in self.params:
            self.use_steps = True
            self.target = self.params['steps']
        else:
            self.use_steps = False
            self.target = self.params['samples']
        self.prog_bar = tqdm.tqdm_notebook(total=self.target)
        self.log_values_by_metric = defaultdict(list)
Example #6
def progressbar(*args, **kwargs):
    """Uses tqdm progressbar. This function exists for wrapping purposes only.
    Original docstring follows:
    ----------------------------------------
    %s
    %s
    """
    try:
        return tqdm_notebook(*args, **kwargs)
    except Exception:
        return tqdm(*args, **kwargs)
Example #7
def test_validation_loss(decoder, s, generate_batch, val_img_embeds, val_captions_indexed):
    np.random.seed(300)
    random.seed(300)
    val_loss = 0
    for _ in tqdm.tqdm_notebook(range(1000)):
        val_loss += s.run(decoder.loss, generate_batch(val_img_embeds,
                                                       val_captions_indexed,
                                                       32,
                                                       20))
    val_loss /= 1000.
    return val_loss
Example #8
    def on_epoch_begin(self, net, X=None, X_valid=None, **kwargs):
        # Assume it is a number until proven otherwise.
        batches_per_epoch = self.batches_per_epoch

        if self.batches_per_epoch == 'auto':
            batches_per_epoch = self._get_batches_per_epoch(net, X, X_valid)
        elif self.batches_per_epoch == 'count':
            # No limit is known until the end of the first epoch.
            batches_per_epoch = None

        if self._use_notebook():
            self.pbar = tqdm.tqdm_notebook(total=batches_per_epoch)
        else:
            self.pbar = tqdm.tqdm(total=batches_per_epoch)
Example #9
    def __iter__(self):
        state = self.prepareState(self._endpoint, self._filters, **self._prepareStateParams)
        entries = self._endpoint(sort= self._sort, n= self._n, **self._filters)

        if self._progbar:
            try:
                get_ipython
                inNotebook = True
            except NameError:
                inNotebook = False

            if not inNotebook:
                sys.stderr.write("Locating data...")

        entries = list(entries)

        if self._progbar and not inNotebook:
            sys.stderr.write("\r")

        if self._progbar:
            try:
                get_ipython # will fail faster and more reliably than tqdm_notebook
                entriesIterable = tqdm_notebook(entries, unit= "entries")
            except (NameError, AttributeError, TypeError):
                entriesIterable = tqdm(entries, unit= "entries")
        else:
            entriesIterable = entries

        def iterate():
            for entry in entriesIterable:
                try:
                    data = self.parse(entry, state= state) if state is not None else self.parse(entry)
                    yield entry, data
                except KeyboardInterrupt:
                    self._write('Interrupted while parsing "{}"'.format(entry.path))
                    break
                except GeneratorExit:
                    raise GeneratorExit
                except:
                    self._write('Error while parsing "{}":'.format(entry.path))
                    self._write( traceback.format_exc() )

        # chain the operations together
        # each function in self._chain is a generator which takes an iterator
        # (remember that you call a generator to "activate" it: calling a generator returns an iterator)
        # so end condition for the loop is that `iterate` refers to an iterator
        iterate = iterate()
        for do in self._chain:
            iterate = do(iterate)
        return iterate
Example #10
File: simlib.py Project: pelegs/msm
def simulate_notebook(params):
    num_steps = params['num_steps']
    num_dim = params['num_dim']
    num_particles = params['num_particles']
    A = params['Ddt'] / params['KBT']
    B = np.sqrt(2*params['Ddt'])
    U = params['potential']
    Xs = np.zeros(shape=(num_steps, num_dim, num_particles))
    Xs[0,:,:] = params['x0']
    for t in tqdm_notebook(range(1, num_steps)):
        drift = np.zeros(shape=(num_dim, num_particles))
        for i in range(num_particles):
            drift[:,i] = A * U.get_force(Xs[t-1,:,i])
        noise = B * np.random.normal(size=(num_dim, num_particles))
        Xs[t,:,:] = Xs[t-1,:,:] + drift + noise
    return Xs
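A hedged sketch of the `params` dict the function above expects, reconstructed from the keys it reads; HarmonicPotential is hypothetical and stands for any object with a get_force(x) method:

params = {
    'num_steps': 10000,
    'num_dim': 2,
    'num_particles': 50,
    'Ddt': 0.01,                             # D * dt
    'KBT': 1.0,                              # k_B * T
    'potential': HarmonicPotential(k=1.0),   # hypothetical; must provide get_force()
    'x0': np.zeros((2, 50)),                 # shape (num_dim, num_particles)
}
Xs = simulate_notebook(params)               # -> (num_steps, num_dim, num_particles) array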
Example #11
def challenge_evaluate_performance(fn):
    score = 0    
    for i in tnrange(8, desc="Total"):
        wave = load_wave("data/secret_tests/challenge_valid_%d"%i)    
        labels = true_labels[i]
        pred_labels = fn(wave)
        for j in range(3):
            # best of 3!
            score += test_classification_score(wave, labels, pred_labels)
        
        for j in tqdm_notebook(range(40), desc='Test case %d' % i):
            sleep(0.1)
    print("*** Total score: %.2f ***" % score)
    return score
        
      
def download_file(url, file_path):
    r = requests.get(url, stream=True)
    total_size = int(r.headers.get('content-length'))
    try:
        with open(file_path, 'wb', buffering=16*1024*1024) as f:
            bar = tqdm.tqdm_notebook(total=total_size, unit='B', unit_scale=True)
            bar.set_description(os.path.split(file_path)[-1])
            for chunk in r.iter_content(32 * 1024):
                f.write(chunk)
                bar.update(len(chunk))
            bar.close()
    except Exception:
        print("Download failed")
    finally:
        if os.path.exists(file_path) and os.path.getsize(file_path) != total_size:
            os.remove(file_path)
            print("Removed incomplete download")
Example #13
def progbar(it=None, nb=False, **kwargs):
    """Turn any iterable into a progress bar, with notebook option

    Parameters
    ----------
        it: iterable
            Iterable to wrap with progress bar
        nb: bool
            Whether to display the notebook progress bar
        **kwargs: dict-like
            additional options to send to tqdm
    """
    defaults = {'ascii': True, 'smoothing': 0.0}
    # Override defaults with custom kwargs
    settings = {**defaults, **kwargs}
    if nb:  # pragma: no cover
        return tqdm.tqdm_notebook(it, **settings)
    return tqdm.tqdm(it, **settings)
Example #14
def download_file(url, file_path):
    r = requests.get(url, stream=True)
    total_size = int(r.headers.get('content-length'))
    bar = tqdm.tqdm_notebook(total=total_size, unit='B', unit_scale=True)
    bar.set_description(os.path.split(file_path)[-1])
    incomplete_download = False
    try:
        with open(file_path, 'wb', buffering=16 * 1024 * 1024) as f:
            for chunk in r.iter_content(1 * 1024 * 1024):
                f.write(chunk)
                bar.update(len(chunk))
    finally:
        bar.close()
        if os.path.exists(file_path) and os.path.getsize(file_path) != total_size:
            incomplete_download = True
            os.remove(file_path)
    if incomplete_download:
        raise Exception("Incomplete download")
Example #15
	def calculate(self, raw, window=1., step=0.25, minmax=False, variance=False,
				  reduction='mean', progressbar=True):
		if not minmax and not variance:
			import warnings
			warnings.warn('Nothing computed. To compute variance you need to'
			              ' pass `variance=True`, to compute range you need to'
			              ' pass `minmax=True`.')
			return self

		data = raw._data
		window = int(round(window * raw.info['sfreq']))
		step = int(round(step * raw.info['sfreq']))
		self.window = window
		self.step = step
		self.sfreq = raw.info['sfreq']

		n_samples = data.shape[1]
		n_windows = int(np.floor((n_samples - window) / step))
		self.ranges = np.zeros(n_windows) if minmax else None
		self.variances = np.zeros(n_windows) if variance else None

		reduction = dict(mean=np.mean, max=np.max)[reduction]

		if progressbar:
			from tqdm import tqdm_notebook
			pbar = tqdm_notebook(total=n_windows)

		# step through data
		for window_idx in range(n_windows):
			first = window_idx * step
			last = first + window
			data_buffer = data[:, first:last]

			if minmax:
				self.ranges[window_idx] = reduction(
					data_buffer.max(axis=1) - data_buffer.min(axis=1))
			if variance:
				self.variances[window_idx] = reduction(data_buffer.var(axis=1))
			if progressbar:
				pbar.update(1)
		return self
Example #16
def _pbar(iterable, desc, leave=True, position=None, verbose='progressbar'):

    if verbose is not False and \
            verbose not in ['progressbar', 'tqdm', 'tqdm_notebook']:
        raise ValueError('verbose must be one of {progressbar,'
                         'tqdm, tqdm_notebook, False}. Got %s' % verbose)

    if verbose == 'progressbar':
        from mne.utils import ProgressBar
        pbar = ProgressBar(iterable, mesg=desc, spinner=True)
        print('')
    elif verbose == 'tqdm':
        from tqdm import tqdm
        pbar = tqdm(iterable, desc=desc, leave=leave, position=position,
                    dynamic_ncols=True)
    elif verbose == 'tqdm_notebook':
        from tqdm import tqdm_notebook
        pbar = tqdm_notebook(iterable, desc=desc, leave=leave,
                             position=position, dynamic_ncols=True)
    elif verbose is False:
        pbar = iterable
    return pbar
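A usage sketch for _pbar above: `verbose` selects the backend ('progressbar', 'tqdm', 'tqdm_notebook'), and verbose=False returns the iterable unchanged:

for subject in _pbar(['s01', 's02', 's03'], desc='subjects', verbose='tqdm'):
    pass  # process one subject per iteration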
Example #17
    def train(self, n_epochs, K=5):
        [obs_ph, act_ph, new_obs_ph, rew_ph, terminal_ph, policy_network,
         old_policy_network, actions, train_policy, train_state_value] = self._graph
        data_collector = A2CDataCollector(self._sess, actions, obs_ph, 20, 20)
        for i in tqdm_notebook(range(n_epochs)):
            self._update_old_network()
            obs, acts, new_obs, rews, terminal = data_collector.collect_data()
            feed_dict = {
                obs_ph: np.array(obs).reshape(-1, self._obs_dim),
                act_ph: np.array(acts).reshape(-1),
                new_obs_ph: np.array(new_obs).reshape(-1, self._obs_dim),
                rew_ph: np.array(rews).reshape(-1, 1),
                terminal_ph: np.array(terminal).reshape(-1, 1),
            }
            for j in range(K):
                self._sess.run([train_policy], feed_dict=feed_dict)
            for j in range(30):
                self._sess.run([train_state_value], feed_dict=feed_dict)
        return data_collector.get_episode_statistics()
Example #18
def load_lfw_dataset(
        use_raw=False,
        dx=80, dy=80,
        dimx=45, dimy=45):

    # read attrs
    df_attrs = pd.read_csv(ATTRS_NAME, sep='\t', skiprows=1)
    df_attrs = pd.DataFrame(df_attrs.iloc[:, :-1].values, columns=df_attrs.columns[1:])
    imgs_with_attrs = set(map(tuple, df_attrs[["person", "imagenum"]].values))

    # read photos
    all_photos = []
    photo_ids = []

    with tarfile.open(RAW_IMAGES_NAME if use_raw else IMAGES_NAME) as f:
        for m in tqdm.tqdm_notebook(f.getmembers()):
            if m.isfile() and m.name.endswith(".jpg"):
                # prepare image
                img = decode_image_from_raw_bytes(f.extractfile(m).read())
                img = img[dy:-dy, dx:-dx]
                img = cv2.resize(img, (dimx, dimy))
                # parse person
                fname = os.path.split(m.name)[-1]
                fname_splitted = fname[:-4].replace('_', ' ').split()
                person_id = ' '.join(fname_splitted[:-1])
                photo_number = int(fname_splitted[-1])
                if (person_id, photo_number) in imgs_with_attrs:
                    all_photos.append(img)
                    photo_ids.append({'person': person_id, 'imagenum': photo_number})

    photo_ids = pd.DataFrame(photo_ids)
    all_photos = np.stack(all_photos).astype('uint8')

    # preserve photo_ids order!
    all_attrs = photo_ids.merge(df_attrs, on=('person', 'imagenum')).drop(["person", "imagenum"], axis=1)

    return all_photos, all_attrs
Example #19
    def eval(self,**kwargs):
        """ evaluate the link


        Parameters
        ----------

        applywav : boolean
            Apply waveform to H
        force : list
            Force the computation (['sig','ray','Ct','H']) AND save (replace previous computations)
        alg : 1|'old'|'exp'|'exp2'
            version of run for signature
        si_progress : boolean (False)
            display a progress bar for signatures
        diffraction : boolean (False)
            take diffraction points into consideration
        ra_number_mirror_cf : int
            rays.to3D number of ceil/floor reflections
        ra_ceil_H : float (default [])
            ceil height.
                If [] : Layout max ceil height
                If 0 : only floor reflection (outdoor case)
                If -1 : neither ceil nor floor reflection (2D case)
        ra_vectorized : boolean (True)
            if True, use the (2015, new) vectorized approach to determine 2D rays
        progressbar : str
            None : no progress bar
            'python' : progress bar in ipython
            'notebook' : Jupyter notebook progress bar


        Returns
        -------

        ak : ndarray
            alpha_k
        tk : ndarray
            tau_k

        Notes
        -----

        update self.ak and self.tk

        self.ak : ndarray
            alpha_k
        self.tk : ndarray
            tau_k


        Examples
        --------

        .. plot::
            :include-source:

            >>> from pylayers.simul.link import *
            >>> L=DLink(verbose=False)
            >>> aktk = L.eval()


        See Also
        --------

        pylayers.antprop.signature
        pylayers.antprop.rays

        Experimental
        ------------

        alg = 2015 | 20152 (best)
            vectorized signature research
        si_reverb : number of reverb in source/target cycle if alg=2015

        """



        defaults={ 'applywav':True,
                   'si_progress':False,
                   'diffraction':True,
                   'ra_vectorized':True,
                   'ra_ceil_H':[],
                   'ra_number_mirror_cf':1,
                   'force':[],
                   'alg':1,
                   'si_reverb':4,
                   'threshold':0.1,
                   'verbose':[],
                   'progressbar':None,
                   }

        for key, value in defaults.items():
            if key not in kwargs:
                kwargs[key]=value

        if 'cutoff' not in kwargs:
            kwargs['cutoff']=self.cutoff
        else:
            self.cutoff=kwargs['cutoff']

        if 'force' in kwargs:
            if not isinstance(kwargs['force'],list):
                if kwargs['force'] == True :
                    kwargs['force'] = ['sig','ray','Ct','H']
                else :
                    kwargs['force'] = []

        if kwargs['verbose'] != []:
            self.verbose=kwargs['verbose']


        #pdb.set_trace()
        # must be placed after all the init !!!!
        if self.verbose :
            print("checkh5")
        self.checkh5()

        if isinstance(kwargs['progressbar'],str):
            if kwargs['progressbar'] =='notebook':
                pbar = tqdm.tqdm_notebook(total=100)
            elif kwargs['progressbar']=='python':
                pbar = tqdm.tqdm(total=100)
        elif isinstance(kwargs['progressbar'],tqdm.tqdm):
            pbar = kwargs['progressbar']



        ############
        # Signatures
        ############
        if self.verbose :
            print("Start Signatures")
        tic = time.time()
        Si = Signatures(self.L,self.ca,self.cb,cutoff=kwargs['cutoff'])

        if (self.dexist['sig']['exist'] and not ('sig' in kwargs['force'])):
            self.load(Si,self.dexist['sig']['grpname'])
            if self.verbose :
                print("load signature")
        else :
            if kwargs['alg']==1:
                Si.run(cutoff=kwargs['cutoff'],
                        diffraction=kwargs['diffraction'],
                        threshold=kwargs['threshold'],
                        progress=kwargs['si_progress'])
                if self.verbose :
                    print("default algorithm")

            if kwargs['alg']=='exp':
                TMP=Si.run_exp(cutoff=kwargs['cutoff'],
                        cutoffbound=kwargs['si_reverb'])
                if self.verbose :
                    print("experimental (ex 2015)")

            if kwargs['alg']=='exp2':
                TMP=Si.run_exp2(cutoff=kwargs['cutoff'],
                        cutoffbound=kwargs['si_reverb'])
                if self.verbose :
                    print("algo exp2 (ex 20152)")

        #Si.run6(diffraction=kwargs['diffraction'])
        # save sig
            
            self.save(Si,'sig',self.dexist['sig']['grpname'],force = kwargs['force'])

        self.Si = Si
        toc = time.time()
        if self.verbose :
            print("Stop signature", toc - tic)
        try:
            pbar.update(20)
        except: 
            pass



        ############
        # Rays
        ############
        if self.verbose :
            print("Start Rays")
        tic = time.time()
        R = Rays(self.a,self.b)

        if self.dexist['ray']['exist'] and not ('ray' in kwargs['force']):
            self.load(R,self.dexist['ray']['grpname'])

        else :

            # perform computation ...
            # ... with vetorized ray evaluation approach
            if kwargs['ra_vectorized']:
                r2d = Si.raysv(self.a,self.b)
            # ... or with original and slow approach ( to be removed in a near future)
            else :
                r2d = Si.rays(self.a,self.b)

            if kwargs['ra_ceil_H'] == []:
                ceilheight = self.L.maxheight
            else:
                ceilheight = kwargs['ra_ceil_H']

            R = r2d.to3D(self.L,H=ceilheight, N=kwargs['ra_number_mirror_cf'])
            R.locbas(self.L)
            # ...and save
            R.fillinter(self.L)

            C = Ctilde()
            C = R.eval(self.fGHz)
            self.save(R,'ray',self.dexist['ray']['grpname'],force = kwargs['force'])

        self.R = R
        toc = time.time()
        if self.verbose :
            print("Stop rays", toc - tic)
        
        if self.R.nray == 0:
            raise NameError('No rays have been found. Try to re-run the simulation with a higher S.cutoff ')
        try:
            pbar.update(20)
        except: 
            pass
        ############
        # Ctilde
        ############
        
        if self.dexist['Ct']['exist'] and not ('Ct' in kwargs['force']):
            C=Ctilde()
            self.load(C,self.dexist['Ct']['grpname'])

        else :
            #if not hasattr(R,'I'):
            # Ctilde...
            # Find an other criteria in order to decide whether the R has
            # already been evaluated
            #pdb.set_trace()
            C = R.eval(self.fGHz)
            # ...save Ct
            self.save(C,'Ct',self.dexist['Ct']['grpname'],force = kwargs['force'])

        self.C = C

        try:
            pbar.update(20)
        except: 
            pass
        ############
        # H
        ############

        H = Tchannel()

        if self.dexist['H']['exist'] and not ('H' in kwargs['force']):
            self.load(H,self.dexist['H']['grpname'])
        else :
            # Ctilde antenna
            Cl=C.locbas(Tt=self.Ta, Tr=self.Tb)
            #T channel
            H = C.prop2tran(a=self.Aa,b=self.Ab,Friis=True,debug=True)
            self.save(H,'H',self.dexist['H']['grpname'],force = kwargs['force'])
        self.H = H
        try:
            pbar.update(20)
        except: 
            pass

        if kwargs['applywav']:
            if self.H.isFriis:
                self.ir = self.H.get_cir(self.wav.sf)
            else:
                self.ir = self.H.get_cir(self.wav.sfg)
        try:
            pbar.update(20)
        except: 
            pass
        return self.H.ak, self.H.tk
Example #20
# In[ ]:
print('Setting up our DataLoader for training..')
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                           all_label_ids)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data,
                              sampler=train_sampler,
                              batch_size=TRAIN_BATCH_SIZE)

print('Model.train!!!!')
model.train()
for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(
            tqdm_notebook(train_dataloader, desc="Iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        logits = model(input_ids, segment_ids, input_mask, labels=None)
        if OUTPUT_MODE == "classification":
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
        elif OUTPUT_MODE == "regression":
            loss_fct = MSELoss()
            loss = loss_fct(logits.view(-1), label_ids.view(-1))
        if GRADIENT_ACCUMULATION_STEPS > 1:
            loss = loss / GRADIENT_ACCUMULATION_STEPS
        loss.backward()
        print("\r%f" % loss, end='')
        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
clf6 = make_pipeline(
    SMOTE(random_state=0),
    KerasClassifier(build_fn=dnn_models1,
                    epochs=25,
                    batch_size=1000,
                    verbose=1))

total_scores_1 = []
total_scores_2 = []
total_scores_3 = []
total_scores_4 = []
total_scores_5 = []
total_scores_6 = []

#hitseq_num = 1
for hitseq_num in tqdm_notebook(range(1, 11)):
    온라인_x1 = []
    for idx_index, idx_value in enumerate(idx):
        # Keep only sessions that have at least hitseq_num click logs
        if idx_value >= hitseq_num:
            # From the 구매여부 (purchase flag) table, take the unique user id and session id
            구매여부_idx = 구매여부.iloc[idx_index, :-1]
            구매여부_idx = str(구매여부_idx[0]) + '_' + str(구매여부_idx[1])

            # Keep only the rows whose user id and session id match the ones taken above
            온라인_x_partial = 온라인[온라인['unique_id'] == 구매여부_idx].iloc[:, 3:-1]

            # From sessions with at least hitseq_num click logs, use only the first hitseq_num click logs
            # Click logs beyond hitseq_num are discarded
            온라인_x_partial = np.array(
                온라인_x_partial[온라인_x_partial['hit_seq'] <= hitseq_num])
Example #22
def get_feature_matrix(sales, test, items, list_lags, date_block_threshold):

    """This function creates the model feature table."""
  
    # Create "grid" with columns
    index_cols = ['shop_id', 'item_id', 'date_block_num']

    # For every month we create a grid from all shops/items combinations from that month
    grid = [] 
    new_items = pd.DataFrame()
    cur_items_aux=np.array([])
    for block_num in sales['date_block_num'].unique():
        cur_shops = sales.loc[sales['date_block_num'] == block_num, 'shop_id'].unique()
        cur_items = sales.loc[sales['date_block_num'] == block_num, 'item_id'].append(pd.Series(cur_items_aux)).unique()
        cur_items_aux = cur_items[pd.Series(cur_items).isin(test.item_id)]
        grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

    # Turn the grid into a dataframe
    grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

    # Add submission shop_id-item_id in order to test predictions
    test['date_block_num'] = 34
    grid = grid.append(test[['shop_id', 'item_id', 'date_block_num']])

    # Groupby data to get shop-item-month aggregates
    gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}})
    # Fix column names
    gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values] 
    # Join it to the grid
    all_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

    # Same as above but with shop-month aggregates
    gb = sales.groupby(['shop_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_shop':'sum'}})
    gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
    all_data = pd.merge(all_data, gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)

    # Same as above but with item-month aggregates
    gb = sales.groupby(['item_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_item':'sum'}})
    gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
    all_data = pd.merge(all_data, gb, how='left', on=['item_id', 'date_block_num']).fillna(0)

    # Downcast dtypes from 64 to 32 bit to save memory
    all_data = downcast_dtypes(all_data)
    del grid, gb 
    gc.collect()
    # List of columns that we will use to create lags
    cols_to_rename = list(all_data.columns.difference(index_cols)) 

    shift_range = list_lags

    for month_shift in tqdm_notebook(shift_range):
        train_shift = all_data[index_cols + cols_to_rename].copy()
    
        train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift
    
        foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
        train_shift = train_shift.rename(columns=foo)

        all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

    del train_shift

    # Don't use old data from year 2013
    all_data = all_data[all_data['date_block_num'] >= date_block_threshold] 

    # List of all lagged features
    fit_cols = [col for col in all_data.columns if col[-1] in [str(item) for item in shift_range]] 
    # We will drop these at fitting stage
    to_drop_cols = list(set(list(all_data.columns)) - (set(fit_cols)|set(index_cols))) + ['date_block_num'] 

    # Category for each item
    item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()

    all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
    all_data = downcast_dtypes(all_data)
    gc.collect();
    
    return [all_data, to_drop_cols]
Example #23
    ax1.set_title('Image')
    ax2.imshow(first_seg[0, :, :, 0], vmin = 0, vmax = 1)
    ax2.set_title('Prediction')
fig.savefig('test_predictions.png')


from keras import models, layers
fullres_model = models.load_model('fullres_model.h5', compile=False)
seg_in_shape = fullres_model.get_input_shape_at(0)[1:3]
seg_out_shape = fullres_model.get_output_shape_at(0)[1:3]
print(seg_in_shape, '->', seg_out_shape)

from tqdm import tqdm_notebook
from skimage.morphology import binary_opening, disk
out_pred_rows = []
for c_img_name in tqdm_notebook(test_paths):
    c_path = os.path.join(test_image_dir, c_img_name)
    c_img = imread(c_path)
    c_img = np.expand_dims(c_img, 0)/255.0
    cur_seg = fullres_model.predict(c_img)[0]
    cur_seg = binary_opening(cur_seg>0.5, np.expand_dims(disk(2), -1))
    cur_rles = multi_rle_encode(cur_seg)
    if len(cur_rles)>0:
        for c_rle in cur_rles:
            out_pred_rows += [{'ImageId': c_img_name, 'EncodedPixels': c_rle}]
    else:
        out_pred_rows += [{'ImageId': c_img_name, 'EncodedPixels': None}]
    gc.collect()

c_path = os.path.join(test_image_dir, '000155de5.jpg')
c_img = imread(c_path) # 768,768,3
Example #24
        discriminator_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(
        zip(generator_gradient, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(
        zip(discriminator_gradient, discriminator.trainable_variables))


def play(sample_noise):
    result = generator(sample_noise)
    result = tf.reshape(result, (8, 28, 28))
    c = itertools.count(1)
    plt.figure(figsize=(18, 8))
    for image in result:
        plt.subplot(2, 4, next(c))
        plt.imshow(image)
    plt.show()


play(tf.random.normal((8, NOISE_DIM)))

sample_noise = tf.random.normal((8, NOISE_DIM))
play(sample_noise)
for _ in range(50):
    for images in tqdm.tqdm_notebook(dataset, total=len(list(dataset))):
        train_step(images)
    play(sample_noise)

# generator.save('generator.h5')
# discriminator.save('discriminator.h5')
Example #25
    def predict(self, BATCH_SIZE=2, CONF_THRESH=0.005, NMS_THRESH=0.45):
        # CONF_THRESH=0.25,NMS_THRESH=0.45, IOU_THRESH    = 0.5

        # Step1 - Get Model
        if (1):
            if self.MODEL == '' or self.MODEL == None:
                print(' - 1. Loading model : ', self.MODEL_WEIGHTFILE)
                self.MODEL = getYOLOv2(self.MODEL_CFGFILE,
                                       self.MODEL_WEIGHTFILE)
            self.MODEL.eval()

        # Step2 - Get Dataset
        if (1):
            with open(self.EVAL_IMAGELIST) as fp:
                tmp_files = fp.readlines()
                valid_files = [item.rstrip() for item in tmp_files]

            eval_dataset = VOCDatasetv2(self.EVAL_IMAGELIST,
                                        shape=(self.MODEL.width,
                                               self.MODEL.height),
                                        shuffle=False,
                                        transform=transforms.Compose([
                                            transforms.ToTensor(),
                                        ]))
            kwargs = {'num_workers': 1, 'pin_memory': True}
            eval_loader = torch.utils.data.DataLoader(eval_dataset,
                                                      batch_size=BATCH_SIZE,
                                                      shuffle=False,
                                                      **kwargs)

        # Step3 - Create File pointers for prediction storage (after removing the older files)
        if (1):
            fps = [0] * self.MODEL.num_classes
            if not os.path.exists(self.EVAL_OUTPUTDIR):
                os.mkdir(self.EVAL_OUTPUTDIR)
            else:
                for i in range(self.MODEL.num_classes):
                    buf = '%s/%s%s.txt' % (self.EVAL_OUTPUTDIR,
                                           self.EVAL_PREFIX,
                                           self.VOC_CLASSES[i])
                    if os.path.exists(buf):
                        os.remove(buf)
                # Should I delete folder and remake??
            for i in range(self.MODEL.num_classes):
                buf = '%s/%s%s.txt' % (self.EVAL_OUTPUTDIR, self.EVAL_PREFIX,
                                       self.VOC_CLASSES[i])
                fps[i] = open(buf, 'w')

        lineId = -1
        verbose = 0

        with torch.no_grad():
            val_loss_total = 0.0
            with tqdm.tqdm_notebook(total=len(eval_loader) *
                                    BATCH_SIZE) as pbar:

                for batch_idx, (data, target) in enumerate(eval_loader):
                    pbar.update(BATCH_SIZE)

                    t1 = time.time()
                    if self.USE_GPU:
                        data = data.cuda()
                        # target = target.cuda()
                    data, target = Variable(data), Variable(target)
                    output = self.MODEL(data).data
                    t2 = time.time()

                    if self.LOGGER != '':
                        if self.MODEL_LOSS != None:
                            # print (' - [DEBUG] target[target != 0.0]) : ', target[target != 0.0], ' || ', target.dtype)
                            if (len(target[target != 0.0])):
                                try:
                                    # print (' - [DEBUG] region_loss : ', self.MODEL_LOSS)
                                    val_loss = self.MODEL_LOSS(output, target)
                                    val_loss_total += val_loss.data
                                    if self.verbose:
                                        print(' - loss : ', val_loss)
                                except:
                                    traceback.print_exc()
                                    pdb.set_trace()
                            else:
                                print(' - No annotations : ',
                                      valid_files[lineId])

                    batch_boxes = get_region_boxes(output, CONF_THRESH,
                                                   self.MODEL.num_classes,
                                                   self.MODEL.anchors,
                                                   self.MODEL.num_anchors, 0,
                                                   1)
                    t3 = time.time()

                    for i in range(
                            output.size(0)):  # output.size(0) = batch_size
                        t31 = time.time()
                        lineId = lineId + 1
                        fileId = os.path.basename(
                            valid_files[lineId]).split('.')[0]
                        width, height = get_image_size(valid_files[lineId])
                        t32 = time.time()
                        # print(valid_files[lineId])
                        boxes = batch_boxes[i]
                        boxes = nms(boxes, NMS_THRESH)
                        for box in boxes:  # box = [x,y,w,h, box_conf, class_conf, cls_id]
                            # Top-Left Corner (xmin, xmax)
                            x1 = (box[0] - box[2] /
                                  2.0) * width  # x - w/2 (x = centre of BBox)
                            y1 = (box[1] - box[3] / 2.0) * height  # y - h/2
                            # Top-Right Corner (ymin, ymax)
                            x2 = (box[0] + box[2] / 2.0) * width  # x + h/2
                            y2 = (box[1] + box[3] / 2.0) * height  # y + h/2

                            box_conf = box[4]
                            for j in range(int((len(box) - 5) / 2)):
                                cls_conf = box[5 + 2 * j]
                                cls_id = box[6 + 2 * j]
                                prob = box_conf * cls_conf
                                fps[cls_id].write(
                                    '%s %f %f %f %f %f\n' %
                                    (fileId, prob, x1, y1, x2, y2)
                                )  # for each class_id, write down [prob, x1,y1,x2,y2]

                        t33 = time.time()
                        if (verbose):
                            print('    -- Time : imread : ',
                                  round(t32 - t31, 4), ' || boxes loop : ',
                                  round(t33 - t32, 4))

                    t4 = time.time()
                    # pdb.set_trace()
                    if (0):
                        print('  -- [DEBUG][PASCALVOCEval] Total time  : ',
                              round(t4 - t1, 2))
                        print('  -- [DEBUG][PASCALVOCEval] output time :  ',
                              round(t2 - t1, 2))
                        print('  -- [DEBUG][PASCALVOCEval] boxes time  :  ',
                              round(t3 - t2, 2))
                        print('  -- [DEBUG][PASCALVOCEval] file write  :  ',
                              round(t4 - t3, 2))

        if self.LOGGER != '':
            if self.MODEL_LOSS != None:
                self.LOGGER.save_value('Total Loss', 'Val Loss',
                                       self.LOGGER_EPOCH + 1,
                                       val_loss_total / len(eval_loader))

        for i in range(self.MODEL.num_classes):
            fps[i].close()

        self._do_python_eval()
def GMM_prediction(train, test, target_magic=None, seed=42, trained_parameter_file=None):
    if target_magic is not None:
        train = train[train[magic] == target_magic]
        test = test[test[magic] == target_magic]
        train.reset_index(drop=True,inplace=True)
        test.reset_index(drop=True,inplace=True)
    
    if trained_parameter_file is not None:
        trained_parameter = dict(np.load(trained_parameter_file))
        # trained_parameter = np.load(trained_parameter_file)
    else:
        trained_parameter = {}
    
    def get_mean_cov(x,y):
        max_label = y.astype(int).max()
        
        ps = []
        ms = []
        
        for i in range(max_label + 1):
        
            model = GraphicalLasso()
            label_i = (y==i).astype(bool)
            x2 = x[label_i]
            
            model.fit(x2)
            ps.append(model.precision_)
            ms.append(model.location_)

        ms = np.stack(ms)
        ps = np.stack(ps)
        
        return ms,ps
    
    # INITIALIZE VARIABLES
    cols = [c for c in train.columns if c not in ['id', 'target']]
    cols.remove('wheezy-copper-turtle-magic')

    
    # BUILD 512 SEPARATE MODELS
    random_seed_num = 8
    GMM_array = []
    for r in range(random_seed_num):
        GMM_array.append([np.zeros(len(train)), np.zeros(len(test))])
        
    for i in tqdm_notebook(range(512) if target_magic is None else [target_magic]):
        # ONLY TRAIN WITH DATA WHERE WHEEZY EQUALS I
        train2 = train[train['wheezy-copper-turtle-magic']==i]
        test2 = test[test['wheezy-copper-turtle-magic']==i]
        
        idx1 = train2.index; idx2 = test2.index
        train2.reset_index(drop=True,inplace=True)

        # FEATURE SELECTION 
        sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
        train3 = sel.transform(train2[cols])
        test3 = sel.transform(test2[cols])

        k = 3 # cluster_per_class
        
        for r in range(random_seed_num):
            # Initialize

            # STRATIFIED K-FOLD
            skf = StratifiedKFold(n_splits=11, random_state=seed+r, shuffle=True)
            for j, (train_index, test_index) in enumerate(skf.split(train3, train2['target'])):

                ms_key = "ms_{}_{}_{}".format(i, r, j)
                ps_key = "ps_{}_{}_{}".format(i, r, j)
                
                if ms_key in trained_parameter and ps_key in trained_parameter:
                    ms = trained_parameter[ms_key]
                    ps = trained_parameter[ps_key]
                else:
                    # MODEL AND PREDICT WITH GMM
                    new_label = np.zeros(len(train_index))
                    try_cnt = 0
                    while True:            
                        gm = GaussianMixture(random_state=seed+try_cnt+r, n_components=k).fit(train3[train_index,:][train2.loc[train_index]['target'] == 0])
                        new_label[train2.loc[train_index]['target'] == 0] = gm.predict(train3[train_index,:][train2.loc[train_index]['target'] == 0, :])
                        gm = GaussianMixture(random_state=seed+try_cnt+r, n_components=k).fit(train3[train_index,:][train2.loc[train_index]['target'] == 1])
                        new_label[train2.loc[train_index]['target'] == 1] = k + gm.predict(train3[train_index,:][train2.loc[train_index]['target'] == 1, :])

                        try:
                            ms, ps = get_mean_cov(train3[train_index,:], new_label)
                        except (FloatingPointError,ValueError) as e:
                            try_cnt += 1
                            continue
                        else:
                            break

                gm = GaussianMixture(random_state=seed, n_components=2*k, init_params='random', covariance_type='full', tol=0.001,reg_covar=0.001, max_iter=100, n_init=1,means_init=ms, precisions_init=ps)
                gm.fit(np.concatenate([train3[train_index,:], test3, train3[test_index, :]],axis = 0))
                
                # GMM_array[r][0]: oof
                # GMM_array[r][1]: preds
                GMM_array[r][0][idx1[test_index]] += np.sum(gm.predict_proba(train3[test_index,:])[:,k:], axis=1) 
                GMM_array[r][1][idx2] += np.sum(gm.predict_proba(test3)[:,k:], axis=1) / skf.n_splits
#                 oof[idx1[test_index]] += np.sum(gm.predict_proba(train3[test_index,:])[:,k:], axis=1) #/ random_seed_num
#                 preds[idx2] += np.sum(gm.predict_proba(test3)[:,k:], axis=1) / skf.n_splits #/ random_seed_num
#             GMM_array.append([oof, preds])

    # Print cv GMM
    averaging_oof = np.zeros(len(train))
    for array in GMM_array:
        averaging_oof += (array[0] / random_seed_num)
    auc = roc_auc_score(train['target'],averaging_oof)
    print('GMM_random_seed_averaging CV =',round(auc,5))
    
    return GMM_array
Example #27
    def _epoch(self, loader, criterion, optimizer=None, train=False):
        if train and not optimizer:
            raise AttributeError("Optimizer should be given for training")

        if train:
            self.base_model.train()
            mode = 'Train'
        else:
            self.base_model.eval()
            mode = 'Eval'

        losses = AverageMeter()
        labels = []
        outputs = []

        for bi, batch in enumerate(
                tqdm_notebook(loader,
                              desc="{} batches".format(mode),
                              leave=False)):
            inputs, targets = batch
            lengths = torch.randint(low=4,
                                    high=inputs.shape[2],
                                    size=(len(inputs), ))
            lengths, _ = torch.sort(lengths, descending=True)
            lengths[0] = inputs.shape[-1]
            inputs = inputs.permute(0, 2,
                                    1)  # Shape: (batch, length, features)
            if self.data == 'mimic_int':
                #this is multilabel with labels over time
                targets = targets[torch.range(0,
                                              len(inputs) - 1).long(), :,
                                  lengths - 1]
                targets = torch.argmax(targets, dim=1)
            elif self.data == 'simulation' or self.data == 'simulation_spike' or self.data == 'simulation_l2x':
                targets = targets[torch.range(0,
                                              len(inputs) - 1).long(),
                                  lengths - 1]
            elif self.data == 'mimic':  #does not have labels over time
                targets = targets[torch.range(0, len(inputs) - 1).long()]

            input_var = torch.autograd.Variable(inputs)
            target_var = torch.autograd.Variable(targets)
            input_var = input_var.to(self.device)
            target_var = target_var.to(self.device)

            output, alpha, beta = self.base_model(input_var, lengths)
            loss = criterion(output, target_var.long())

            labels.append(targets)

            # since the outputs are logit, not probabilities
            outputs.append(torch.nn.functional.softmax(output).data)

            # record loss
            losses.update(loss.item(), inputs.size(0))

            # compute gradient and do update step
            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        return torch.cat(labels, 0), torch.cat(outputs, 0), losses.avg
Example #28
    unbinned_A_mle      = [[] for i in range(len(sig_params))]
    binned_A_mle        = [[] for i in range(len(sig_params))]
    binned_A_hybrid_mle = [[] for i in range(len(sig_params))]
    binned_A_50_mle     = [[] for i in range(len(sig_params))]
    binned_A_100_mle    = [[] for i in range(len(sig_params))]
    binned_A_200_mle    = [[] for i in range(len(sig_params))]
    binned_A_400_mle    = [[] for i in range(len(sig_params))]
    binned_A_1000_mle   = [[] for i in range(len(sig_params))]
    binned_A_2000_mle   = [[] for i in range(len(sig_params))]
    cnc_A_mle           = [[] for i in range(len(sig_params))]

    sig_pdf_ROOT = functools.partial(sig_pdf, doROOT=True)
    tf1_sig_pdf = TF1("tf1_sig_pdf", sig_pdf_ROOT, 2800, 13000, 2)

    for i, sig_p in enumerate(tqdm_notebook(sig_params, desc='Signal Model')):

        n_sig = n_bg
        tf1_sig_pdf.SetParameters(*sig_p)
        mc_sig = [tf1_sig_pdf.GetRandom() for ns in range(n_sig)]
        be_sig = bayesian_blocks(mc_sig, p0=0.02)

        true_sig_bc_bb      = get_true_bin_content(be_bg, sig_pdf, sig_p)
        true_sig_bc_50GeV   = get_true_bin_content(be_50GeV, sig_pdf, sig_p)
        true_sig_bc_100GeV  = get_true_bin_content(be_100GeV, sig_pdf, sig_p)
        true_sig_bc_200GeV  = get_true_bin_content(be_200GeV, sig_pdf, sig_p)
        true_sig_bc_400GeV  = get_true_bin_content(be_400GeV, sig_pdf, sig_p)
        true_sig_bc_1000GeV = get_true_bin_content(be_1000GeV, sig_pdf, sig_p)
        true_sig_bc_2000GeV = get_true_bin_content(be_2000GeV, sig_pdf, sig_p)

        be_hybrid = np.sort(np.unique(np.concatenate([be_bg, be_sig])))
def QDA_prediction(train, test, seed=42):
    cols = [c for c in train.columns if c not in ['id', 'target']]
    cols.remove('wheezy-copper-turtle-magic')
    oof = np.zeros(len(train))
    preds = np.zeros(len(test))

    for i in tqdm_notebook(range(512)):

        train2 = train[train['wheezy-copper-turtle-magic']==i]
        test2 = test[test['wheezy-copper-turtle-magic']==i]
        idx1 = train2.index; idx2 = test2.index
        train2.reset_index(drop=True,inplace=True)

        data = pd.concat([pd.DataFrame(train2[cols]), pd.DataFrame(test2[cols])])
        pipe = Pipeline([('vt', VarianceThreshold(threshold=2)), ('scaler', StandardScaler())])
        data2 = pipe.fit_transform(data[cols])
        train3 = data2[:train2.shape[0]]; test3 = data2[train2.shape[0]:]

        for r in range(30):
            skf = StratifiedKFold(n_splits=10, random_state=42+r, shuffle=True)
            for train_index, test_index in skf.split(train2, train2['target']):

                clf = QuadraticDiscriminantAnalysis(0.5)
                clf.fit(train3[train_index,:],train2.loc[train_index]['target'])
                oof[idx1[test_index]] += clf.predict_proba(train3[test_index,:])[:,1] / 30.0
                preds[idx2] += clf.predict_proba(test3)[:,1] / skf.n_splits / 30.0

    auc = roc_auc_score(train['target'], oof)
    print(f'AUC: {auc:.5}')
    result_array = []
    for itr in range(4):
        test['target'] = preds
        test.loc[test['target'] > 0.955, 'target'] = 1
        test.loc[test['target'] < 0.045, 'target'] = 0
        usefull_test = test[(test['target'] == 1) | (test['target'] == 0)]
        new_train = pd.concat([train, usefull_test]).reset_index(drop=True)
        print(usefull_test.shape[0], "Test Records added for iteration : ", itr)
        new_train.loc[oof > 0.995, 'target'] = 1
        new_train.loc[oof < 0.005, 'target'] = 0
        oof2 = np.zeros(len(train))
        preds = np.zeros(len(test))
        for i in tqdm_notebook(range(512)):

            train2 = new_train[new_train['wheezy-copper-turtle-magic']==i]
            test2 = test[test['wheezy-copper-turtle-magic']==i]
            idx1 = train[train['wheezy-copper-turtle-magic']==i].index
            idx2 = test2.index
            train2.reset_index(drop=True,inplace=True)

            data = pd.concat([pd.DataFrame(train2[cols]), pd.DataFrame(test2[cols])])
            pipe = Pipeline([('vt', VarianceThreshold(threshold=2)), ('scaler', StandardScaler())])
            data2 = pipe.fit_transform(data[cols])
            train3 = data2[:train2.shape[0]]
            test3 = data2[train2.shape[0]:]


            random_seed_num = 30
            for r in range(random_seed_num):
                skf = StratifiedKFold(n_splits=10, random_state=seed+r, shuffle=True)
                for train_index, test_index in skf.split(train2, train2['target']):
                    oof_test_index = [t for t in test_index if t < len(idx1)]

                    clf = QuadraticDiscriminantAnalysis(0.5)
                    clf.fit(train3[train_index,:],train2.loc[train_index]['target'])
                    if len(oof_test_index) > 0:
                        oof2[idx1[oof_test_index]] += clf.predict_proba(train3[oof_test_index,:])[:,1] / random_seed_num
                    preds[idx2] += clf.predict_proba(test3)[:,1] / skf.n_splits / random_seed_num

        
        result_array.append([oof2, preds])
        auc = roc_auc_score(train['target'], oof2)
        print(f'AUC: {auc:.5}')
    return result_array
Example #30
def todays_scrape(thedir, item, city, cityname, now):
    zips = pd.read_csv(thedir + 'zipcodes.csv', index_col=0)
    thezip = zips.loc[zips['City'] == cityname, 'Zipcode'].iloc[0]

    if cityname == 'Baltimore':
        thezip = 21211
    hdrs = {
        'User-Agent':
        ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 ' +
         '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'),
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset':
        'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding':
        'none',
        'Accept-Language':
        'en-US,en;q=0.8',
        'Connection':
        'keep-alive'
    }
    proxies = setup_proxy_rotation()
    proxy_index = random_proxy(proxies)
    proxy = proxies[proxy_index]

    first_url = ('https://' + cityname.lower().replace(' ', '') +
                 '.craigslist.org/search/lac/fuo?postal=' + str(thezip) +
                 '&query=' + item + '&s=' + '0' + '&search_distance=30')

    # create a new Firefox session
    #driver = webdriver.Chrome()
    #driver.implicitly_wait(30)
    #driver.get(first_url)
    page = requests.get(first_url, headers=hdrs, proxies=proxy)
    soup = BeautifulSoup(page.content, 'html.parser')
    #soup = BeautifulSoup(driver.page_source,'html.parser')

    # Get total number of couches
    totalcount = int(
        str(soup.find('span',
                      class_='totalcount')).split(">")[1].split("<")[0])

    badid = []
    theid = []
    theurl = []
    theprice = []
    theimgurl = []
    time_since_posting = []

    # This cycles through the Craigslist search result pages
    for ipage in tqdm_notebook(range(0, math.floor(totalcount / 120))):
        #ipage=1
        #if 1:
        next_url = ('https://' + cityname.lower().replace(' ', '') +
                    '.craigslist.org/search/lac/fuo?postal=' + str(thezip) +
                    '&query=' + item + '&s=' + str(120 * ipage) +
                    '&search_distance=30')

        proxies = setup_proxy_rotation()
        proxy_index = random_proxy(proxies)
        proxy = proxies[proxy_index]
        page = requests.get(next_url, headers=hdrs, proxies=proxy)
        soup = BeautifulSoup(page.content, 'html.parser')

        for i in soup.find_all('a', class_='result-image gallery empty'):
            badid.append(int(str(i).split('/')[-2].split('.')[0]))

        badcounter = 0
        for i in range(len(soup.find_all('a', class_="result-title"))):
            #i=116
            tit = str(soup.find_all('a', class_="result-title")[i])
            theid.append(int(tit.split(' ')[3].replace('data-id="', '')[0:-2]))
            theurl.append(tit.split(' ')[4].split('"')[1])

            trow = str(soup.find_all('li', class_='result-row')[i])
            theprice.append(
                int(
                    trow.split('result-meta')[1].split(">")[2].split("<")
                    [0].replace('$', '')))

            if ('result-image gallery empty'
                    in str(soup.find_all('li', class_='result-row')[i])):
                theimgurl.append('bad')
                badcounter += -1
            else:
                imgid = str(
                    soup.find_all('a', class_='result-image gallery')[
                        i + badcounter]).split('"')[3].split(',')[0][2:]
                tturl = (theurl[i].replace(theurl[i].split('/')[-2],
                                           imgid + '_300x300'))
                theimgurl.append('https://images.craigslist.org/' +
                                 tturl.split('/')[-2] + '.jpg')

            # Save image to disk
            outfile = thedir + city + '/' + item + '_images/' + str(
                theid[i]) + '.jpg'
            if theimgurl[i] != 'bad' and not os.path.exists(outfile):
                urllib.request.urlretrieve(theimgurl[i], outfile)

            timepost = str(soup.find_all(
                'time', class_='result-date')[i]).split('"')[3]
            mydelta = (now - datetime.strptime(timepost, '%Y-%m-%d %H:%M'))
            time_since_posting.append(mydelta.days +
                                      mydelta.seconds / 60 / 60 / 24)

    # Get rid of shitty posts
    boolcompare = [True] * len(theid)
    for i in range(len(boolcompare)):
        if theid[i] in badid: boolcompare[i] = False
    theid = list(np.array(theid)[boolcompare])
    theprice = list(np.array(theprice)[boolcompare])
    theurl = list(np.array(theurl)[boolcompare])
    time_since_posting = list(np.array(time_since_posting)[boolcompare])
    theimgurl = list(np.array(theimgurl)[boolcompare])

    todays_scrape_df = pd.DataFrame(
        list(zip(theprice, time_since_posting, theimgurl, theurl)),
        columns=['price', 'time_since_posting', 'imgurl', 'url'],
        index=theid)
    return todays_scrape_df
Exemplo n.º 31
0
def first_scrape(thedir,
                 item,
                 city,
                 cityname,
                 modify_id=False,
                 modify_url=False,
                 modify_price=False):
    # NOTE: `now` (the datetime of the scrape) is expected to be defined globally.

    if not modify_id:
        (theid, theurl, theprice) = gather_ids(thedir, item, city, cityname)
    else:
        theid = modify_id
        theurl = modify_url
        theprice = modify_price

    badid = []
    imgurl = [''] * len(np.array(theid))
    postdate = [''] * len(np.array(theid))
    time_since_posting = [0] * len(np.array(theid))

    proxies = setup_proxy_rotation()

    for i in tqdm_notebook(range(len(theurl))):
        #if not os.path.exists(thedir+city+'/'+item+'_images/'+str(theid[i])+'.jpg'):
        #headers = requests.utils.default_headers()
        #headers['User-Agent'] = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'+
        #                         ' (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
        hdrs = {
            'User-Agent':
            ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 ' +
             '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'),
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset':
            'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding':
            'none',
            'Accept-Language':
            'en-US,en;q=0.8',
            'Connection':
            'keep-alive'
        }

        if not (i % 10):
            # Renew the proxy list every 10 requests; doing it on every request is pretty slow
            proxies = setup_proxy_rotation()
            proxy_index = random_proxy(proxies)
            proxy = proxies[proxy_index]
        page = requests.get(theurl[i], proxies=proxy, headers=hdrs)
        singlesoup = BeautifulSoup(page.content, 'html.parser')

        if len(singlesoup.find_all('meta', property="og:image")) == 0:
            print('bad ID')
            badid.append(theid[i])
        else:
            tmp_image_url = str(
                singlesoup.find_all('meta',
                                    property="og:image")[0]).split('"')[1]
            #if not (i % 50):
            #   print('changing proxy')
            #   proxy_index = random_proxy(proxies)
            #   proxy = proxies[proxy_index]
            #    request = requests.get(tmp_image_url, proxies=proxy, headers={'Connection':'close'})
            #else:

            check_if_exists = None
            while check_if_exists is None:
                try:
                    check_if_exists = requests.get(tmp_image_url,
                                                   proxies=proxy,
                                                   headers=hdrs)
                except:
                    print("%% Taking a nap, page check didn't like me")
                    time.sleep(5)

            if check_if_exists.status_code == 200:
                # Save the image URL path
                imgurl[i] = tmp_image_url

                # Save the post image
                outfile = thedir + city + '/' + item + '_images/' + str(
                    theid[i]) + '.jpg'
                if not os.path.exists(outfile):
                    urllib.request.urlretrieve(tmp_image_url, outfile)

                # Save the post date information
                adate = str(singlesoup.find('time')).split('"')[3]
                adate = adate.replace('T', ' ')
                adate = adate.replace('-', ' ')
                adate = adate[0:-5]
                tpostdate = datetime.strptime(adate, '%Y %m %d %H:%M:%S')
                postdate[i] = (tpostdate.strftime("%d-%m-%Y"))

                # And time since posting
                datetime_object = datetime.strptime(adate, '%Y %m %d %H:%M:%S')
                time_since_posting[i] = ((now - datetime_object).days)
            else:
                badid.append(theid[i])

    # Get rid of shitty posts
    boolcompare = [True] * len(theid)
    for i in range(len(boolcompare)):
        if theid[i] in badid: boolcompare[i] = False
    theid = list(np.array(theid)[boolcompare])
    theprice = list(np.array(theprice)[boolcompare])
    theurl = list(np.array(theurl)[boolcompare])

    todays_scrape_df = pd.DataFrame(
        list(zip(theprice, time_since_posting, imgurl, theurl)),
        columns=['price', 'time_since_posting', 'imgurl', 'url'],
        index=theid)
    return todays_scrape_df
Exemplo n.º 32
0
#=============================JUPYTER NOTEBOOK==================================
## Progress bar
from tqdm import tqdm_notebook
from time import sleep

for i in tqdm_notebook(range(100)):
    sleep(0.01)
#-------------------------------------------------------------------------------
## matplotlib inline
%matplotlib inline

#-------------------------------------------------------------------------------
## change from scientific notation to decimal point in pandas
pd.set_option('display.float_format', lambda x: '%.0f' % x)

#-------------------------------------------------------------------------------
## Limiting floats output to 3 decimal points
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x))
#-------------------------------------------------------------------------------
##Ignore warnings
import warnings; warnings.filterwarnings('ignore')

#-------------------------------------------------------------------------------
## Run SQL queries in Pandas
from pysqldf import SQLDF; sqldf = SQLDF(globals()); q = getattr(sqldf, 'execute')
import warnings; warnings.filterwarnings('ignore')

#-------------------------------------------------------------------------------
## Styling tables in Notebook
from IPython.display import HTML
Exemplo n.º 33
0
    def train(self):
        for epoch in tqdm_notebook(range(self.epochs), desc='epochs'):
            # for each epochs, we shuffle the list of all the datasets
            c = list(zip(self.sample_train, self.sample_target))
            shuffle(c)
            self.sample_train, self.sample_target = zip(*c)
            loss_total = 0
            steps = 0
            steps_nnet = 0
            print(self.sample_train[0])
            # Iterate all songs by the length of sample input (total_songs) and batches (batch_song)
            for i in tqdm_notebook(range(0, self.total_songs, self.batch_song),
                                   desc='MUSIC'):
                # EXAMPLE: [0,5,10,15,20] FOR TOTAL_SONGS = 20 AND BATCH_SONG = 5
                steps += 1
                #inputs_nnet_large, outputs_nnet_large = generate_batch_song(
                #   self.sample_input, self.batch_song, start_index=i, fs=self.frame_per_second,
                #  seq_len=seq_len, use_tqdm=False) # We use the function that have been defined here
                #inputs_nnet_large = np.array(self.note_tokenizer.transform(inputs_nnet_large), dtype=np.int32)
                #outputs_nnet_large = np.array(self.note_tokenizer.transform(outputs_nnet_large), dtype=np.int32)

                # EXAMPLE LARGE INPUTS = ARRAY([1,2,3,4],[2,3,4,5],[2,3,4,5],[2,3,4,5],[1,2,3,4])
                input_batch = [
                    y for x in self.sample_train[i:i + self.batch_song]
                    for y in x
                ]
                output_batch = [
                    y for x in self.sample_target[i:i + self.batch_song]
                    for y in x
                ]
                c = list(zip(input_batch, output_batch))
                print(c)
                sample_in = sample(c, 10000)
                input_batch, output_batch = zip(*sample_in)
                print(len(input_batch))
                inputs_nnet_large = np.array(input_batch)
                outputs_nnet_large = np.array(output_batch)

                # Get an index of all windows in a song
                index_shuffled = np.arange(start=0,
                                           stop=len(inputs_nnet_large))
                np.random.shuffle(index_shuffled)

                for nnet_steps in tqdm_notebook(
                        range(0, len(index_shuffled), self.batch_nnet_size)):
                    steps_nnet += 1
                    current_index = index_shuffled[nnet_steps:nnet_steps +
                                                   self.batch_nnet_size]

                    inputs_nnet, outputs_nnet = inputs_nnet_large[
                        current_index], outputs_nnet_large[current_index]

                    # To make sure no exception thrown by tensorflow on autograph
                    if len(inputs_nnet) // self.batch_nnet_size != 1:
                        break
                    loss = self.train_step(inputs_nnet, outputs_nnet)
                    loss_total += tf.math.reduce_sum(loss)
                    if steps_nnet % 20 == 0:
                        print("epochs {} | Steps {} | total loss : {}".format(
                            epoch + 1, steps_nnet, loss_total))

                    checkpoint.save(file_prefix=self.checkpoint_prefix)
Exemplo n.º 34
0

#%%
bpr_related_subreddits('dogs')

#%%
users = data['user'].cat.categories.array.to_numpy()

#%%
write_bpr_recommendations = False

#%%
user_comments = comments.T.tocsr()
if write_bpr_recommendations:
    # generate recommendations for each user and write out to a file
    with tqdm.tqdm_notebook(total=len(users)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for userid, username in enumerate(users):
                for subredditid, score in model.recommend(
                        userid, user_comments):
                    o.write("%s\t%s\t%s\n" %
                            (username, subreddits[subredditid], score))
                progress.update(1)

#%% [markdown]
# ### Sample user recommendations
#
# We went through the list of subreddits in which the user 'xkcd_transciber' commented. Looking at the kinds of subreddits the user follows, the predictions appear reasonable. This is just one sample; we save the recommendations for all users to a file and will also write an AUC score function to quantify how good the generated recommendations are (see the sketch in the cell below).


#%%
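# A minimal sketch of the AUC score function mentioned above. It assumes the
# `implicit` BPR model exposes `user_factors` / `item_factors` as numpy arrays,
# and that `train_user_items` / `test_user_items` are hypothetical user-by-subreddit
# CSR matrices holding the training and held-out interactions; those split
# matrices are not defined in this notebook.
import numpy as np
from sklearn.metrics import roc_auc_score

def mean_user_auc(model, train_user_items, test_user_items):
    """Average per-user AUC, scoring held-out subreddits against unseen ones."""
    n_users, n_items = train_user_items.shape
    aucs = []
    for u in range(n_users):
        held_out = set(test_user_items[u].indices)
        if not held_out:
            continue
        seen = set(train_user_items[u].indices)
        scores = model.item_factors @ model.user_factors[u]  # score every subreddit for user u
        candidates = [i for i in range(n_items) if i not in seen]
        labels = [1 if i in held_out else 0 for i in candidates]
        if 0 < sum(labels) < len(labels):  # AUC needs both classes present
            aucs.append(roc_auc_score(labels, scores[candidates]))
    return float(np.mean(aucs))

# e.g. mean_user_auc(model, train_user_items, test_user_items)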
Exemplo n.º 35
0
import subprocess
import numpy as np
import mdtraj as md
from tqdm import tqdm_notebook
import time

print "DSSP (Define Secondary Structure of Proteins) Hydrogen Bonding Algorithm for G protein\n"
print "‘H’ : Helix       ‘G’ : 3-helix (3/10 helix)\n'E’ : Beta ladder ‘B’ : Beta-bridge\n‘C’ : Random Coil ‘T’ : Hydrogen bonded turn\n"
gro_reference = '/Users/tue53144/Gprotein/gro/p8600/protein_only.gro'
traj_references = '/Volumes/Vav6/PROJ8600/RUN%d/CLONE%d/frame%d.xtc'

nruns = 1
nclones = 40
trajs = []
dssp = []
for run in tqdm_notebook(range(nruns), desc='Loading Trajs.'):
    time.sleep(0.01)
    for clone in range(nclones):
        nframes = int(
            subprocess.check_output(
                'ls /Volumes/Vav6/PROJ8600/RUN%d/CLONE%d/frame*.xtc | wc -l' %
                (run, clone),
                shell=True))
        frame = nframes - 1
        if frame >= 0:  #Doing all frames would take ~7 hours...
            for frm in range(frame):
                loadtrajs = md.load(traj_references % (run, clone, frm),
                                    top=gro_reference)
                if loadtrajs.time[-1] >= 0:
                    trajs.append(loadtrajs)
Exemplo n.º 36
0
def interpolate_catalog_sb(cat, bandname='r', radtype='eff',
                           sbname='sbeff_r', radname='rad_sb',
                           loopfunc=lambda x:x):
    """
    Takes a DECaLS tractor catalog and adds r-band half-light surface brightness
    to it.

    ``radtype`` can be "eff" for model-determined reff, or a angle-unit quantity
    for a fixed aperture SB

    For details/tests that this function works, see the
    "DECALS low-SB_completeness figures" notebook.
    """
    bandidx = decam_band_name_to_idx[bandname]

    if 'decam_apflux' in cat.colnames:
        r_ap_fluxes = cat['decam_apflux'][:, bandidx, :]
    elif 'apflux_' + bandname in cat.colnames:
        r_ap_fluxes = cat['apflux_' + bandname]
    else:
        raise ValueError('found no valid {}-band apflux column!'.format(bandname))
    assert r_ap_fluxes.shape[-1] == 8, 'Column does not have 8 apertures'

    expflux_r = np.empty_like(r_ap_fluxes[:, 0])
    rad = np.empty(len(r_ap_fluxes[:, 0]))
    ap_sizesv = DECALS_AP_SIZES.to(u.arcsec).value

    intr = interpolate.BarycentricInterpolator(ap_sizesv, [0]*len(ap_sizesv))

    if loopfunc == 'ProgressBar':
        from astropy.utils.console import ProgressBar
        loopfunc = lambda x: ProgressBar(x)
    elif loopfunc == 'NBProgressBar':
        from astropy.utils.console import ProgressBar
        loopfunc = lambda x: ProgressBar(x, ipython_widget=True)
    elif loopfunc == 'tqdm':
        import tqdm
        loopfunc = lambda x: tqdm.tqdm(x)
    elif loopfunc == 'tqdm_notebook':
        import tqdm
        loopfunc = lambda x: tqdm.tqdm_notebook(x)

    for i in loopfunc(range(len(r_ap_fluxes))):
        f = r_ap_fluxes[i]

        if radtype != 'eff':
            r = radtype
        elif cat['type'][i] == 'PSF ':
            if 'decam_psfsize' in cat.colnames:
                r = cat['decam_psfsize'][i, bandidx]
            else:
                r = cat['psfsize_' + bandname][i]
        elif cat['type'][i] == 'DEV ':
            if 'shapeDev_r' in cat.colnames:
                r = cat['shapeDev_r'][i]
            else:
                # DR4 changed to all lower-case... WWHHHHYYY!!?!?!??!?!?!?!?
                r = cat['shapedev_r'][i]
        else:
            if 'shapeExp_r' in cat.colnames:
                r = cat['shapeExp_r'][i]
            else:
                # DR4 changed to all lower-case... WWHHHHYYY!!?!?!??!?!?!?!?
                r = cat['shapeexp_r'][i]

        intr.set_yi(f)
        expflux_r[i] = intr(r)
        rad[i] = r

    cat[sbname] = compute_sb(rad*u.arcsec, np.array(expflux_r))
    cat[radname] = rad*u.arcsec
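# A minimal usage sketch under assumed inputs: 'tractor_example.fits' is a
# hypothetical DECaLS tractor catalog file, and the helpers referenced inside
# the function (decam_band_name_to_idx, DECALS_AP_SIZES, compute_sb) are assumed
# to be importable alongside it.
from astropy.table import Table

cat = Table.read('tractor_example.fits')
interpolate_catalog_sb(cat, bandname='r', loopfunc='tqdm_notebook')  # adds 'sbeff_r' and 'rad_sb' columns
# Per the docstring, radtype can also be an angle-unit quantity (e.g. astropy units) for a fixed-aperture SB.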
Exemplo n.º 37
0
def get_term_statistics(corpus_one,
                        corpus_two,
                        freq_num,
                        psudeocount=1,
                        disable_progressbar=False):
    """
    This function is designed to perform the following calculations:
        - log likelihood of contingency table
        - log odds ratio
    (A standalone numeric check of these formulas is sketched after this function.)
    keywords:
        corpus_one - a dataframe object with terms and counts
        corpus_two - a dataframe object with terms and counts
        freq_num - number of most common words to use from both corpora
        psudeocount - the pseudocount added to avoid division by zero
        disable_progressbar - if True, hide the progress bar
    """

    spacy_nlp = spacy.load("en_core_web_sm")
    stop_word_list = list(spacy_nlp.Defaults.stop_words)

    # Remove special characters here when calculating odds ratio
    term_list = set(
        corpus_one.query("lemma.str.len() > 1").query(
            "lemma.str.contains(r'[a-z]')").
        query(f"lemma not in {stop_word_list}").sort_values(
            "count", ascending=False).head(freq_num).lemma.values) | set(
                corpus_two.query("lemma.str.len() > 1").
                query("lemma.str.contains(r'[a-z]')").query(
                    f"lemma not in {stop_word_list}").sort_values(
                        "count", ascending=False).head(freq_num).lemma.values)

    corpus_one_total = corpus_one["count"].sum()
    corpus_two_total = corpus_two["count"].sum()

    term_data = []
    for term in tqdm_notebook(term_list, disable=disable_progressbar):

        corpus_one_term_count = (
            corpus_one.query(f"lemma=={repr(term)}")["count"].values[0]
            if term in corpus_one.lemma.tolist() else 0)

        corpus_two_term_count = (
            corpus_two.query(f"lemma=={repr(term)}")["count"].values[0]
            if term in corpus_two.lemma.tolist() else 0)

        observed_contingency_table = np.array([
            [corpus_one_term_count, corpus_two_term_count],
            [corpus_one_total, corpus_two_total],
        ])

        # Log Likelihood

        ## add psudeocount to prevent log(0)
        observed_contingency_table += psudeocount

        a, b, c, d = (
            observed_contingency_table[0][0],
            observed_contingency_table[0][1],
            observed_contingency_table[1][0],
            observed_contingency_table[1][1],
        )

        # Obtained from (Kilgarriff, 2001) - Comparing Corpora
        def LL(a, b, c, d):
            return 2 * (a * np.log(a) + b * np.log(b) + c * np.log(c) +
                        d * np.log(d) - (a + b) * np.log(a + b) -
                        (a + c) * np.log(a + c) - (b + d) * np.log(b + d) -
                        (c + d) * np.log(c + d) +
                        (a + b + c + d) * np.log(a + b + c + d))

        log_likelihood = LL(a, b, c, d)

        # Odds ratio (note: no log is taken here, despite the variable name)
        log_ratio = float((a * d) / (b * c))

        term_data.append({
            "lemma": term,
            "corpus_one_a": a,
            "corpus_two_b": b,
            "corpus_one_c": c,
            "corpus_two_d": d,
            "log_likelihood": log_likelihood,
            "odds_ratio": log_ratio,
        })

    return pd.DataFrame.from_records(term_data)
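# A small standalone check of the 2x2 contingency-table arithmetic performed above,
# using made-up counts: a term seen 120 times in a 50,000-token corpus one and
# 40 times in a 60,000-token corpus two, with the same pseudocount of 1 applied.
import numpy as np

a, b = 120 + 1, 40 + 1          # term count in corpus one / corpus two (+ pseudocount)
c, d = 50000 + 1, 60000 + 1     # total token count in corpus one / corpus two (+ pseudocount)

log_likelihood = 2 * (a * np.log(a) + b * np.log(b) + c * np.log(c) + d * np.log(d)
                      - (a + b) * np.log(a + b) - (a + c) * np.log(a + c)
                      - (b + d) * np.log(b + d) - (c + d) * np.log(c + d)
                      + (a + b + c + d) * np.log(a + b + c + d))
odds_ratio = (a * d) / (b * c)  # matches the code above: a plain odds ratio, not log-transformed

print(f"log-likelihood = {log_likelihood:.2f}, odds ratio = {odds_ratio:.2f}")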
Exemplo n.º 38
0
    def collect_comment(self, keyword):

        process = tqdm_notebook(self.href)

        for news in process:
            process.set_description("댓글 수집 중입니다.")
            self.__driver.implicitly_wait(3)
            self.__driver.get(news)

            try:
                # Click the initial "view comments" button
                self.__driver.find_element_by_css_selector(
                    ".u_cbox_btn_view_comment").click()
                self.__driver.implicitly_wait(3)

            # Happens when the button type is different
            except Exception as e:
                try:
                    self.__driver.find_element_by_css_selector(
                        ".simplecmt_link_text").click()
                    self.__driver.implicitly_wait(3)
                except:
                    continue

                # pass

            # Keep clicking the "more" button.

            # Get the news article and the company name.
            company = self.get_company_name()

            collect_text = ""
            company = 'C:/Users/khk37/뉴스기사/' + keyword + company.strip()

            try:

                if not os.path.exists(company.strip()):
                    os.mkdir(company)

            except Exception as e:
                print("os.mkdir 에러", e)

            try:

                collect_text = self.get_news_title(company, '.end_tit')
            except:
                try:
                    collect_text = self.get_news_title(company, '.tts_head')

                except:
                    collect_text = self.get_news_title(company,
                                                       '#articleTitle')

            try:
                while True:
                    self.__driver.execute_script(
                        "window.scrollTo(0,document.body.scrollHeight);")
                    self.__driver.find_element_by_css_selector(
                        ".u_cbox_btn_more").click()
                    self.__driver.execute_script(
                        "window.scrollTo(0,document.body.scrollHeight);")

            except exceptions.ElementNotVisibleException as e:  # end of the page
                pass
            except Exception as e:  # check when any other exception occurs
                self.page += 1
                print("에러 :  ", e)

            # document.body.scrollHeight
            # Scroll back to the top.
            # self.__driver.execute_script("window.scrollTo(0, 0);")
            soup = self.parsing_html(self.__driver.page_source)
            comment_list = soup.find_all("span", {"class": "u_cbox_contents"})
            # last_height = self.__driver.execute_script("return document.body.scrollHeight")
            # elem = self.__driver.find_element_by_tag_name("body")

            down = 0
            number = 1

            for comment in comment_list:
                try:
                    collect_text._write_text(
                        self.model.predict_pos_neg(comment.text))
                except:
                    continue

            self.page += 1

        process.set_description("댓글 수집 완료.")
        return self.model.bad_or_good()
Exemplo n.º 39
0
                              index=dataset.index)
dataset_scaled['return'] = dataset['return']
dataset_scaled.describe()

# In[7]:

import tqdm

n = 3

X = []
y = []
indexes = []
dataset_scaled_x = dataset_scaled[feature_names]

for i in tqdm.tqdm_notebook(range(0, len(dataset_scaled) - n)):
    X.append(dataset_scaled_x.iloc[i:i + n].values)
    y.append(dataset_scaled['return'].iloc[i + n - 1])
    indexes.append(dataset_scaled.index[i + n - 1])
#dataset_scaled.head()

# In[8]:

import numpy as np
X = np.array(X)
y = np.array(y)

# In[9]:

indexes = np.array(indexes)
Exemplo n.º 40
0
plt.hist(sampledat, 200, density=True);
plt.yscale('log');


# In[5]:

np.random.randint(0, len(sampledat), 10)


# In[39]:

# generate some data
bins = np.linspace(-4,4,100)
hists = {}
stats = {}
for npts in tqdm.tqdm_notebook(range(1,102,40)):
    d1 = sampledat[np.random.randint(0, len(sampledat), npts)]
    with pm.Model() as model:
        alpha = pm.Uniform('loc', -10, 10)
        #     beta = pm.Uniform('dist', 1, 1)
        x = pm.Cauchy(name='x', alpha=alpha, beta=1, observed=d1)
        trace = pm.sample(10000)
        hists[npts] = np.histogram(trace['loc'], bins)
        stats[npts] = np.percentile(trace['loc'], (1, 5, 25, 50, 75, 95, 99))


# In[40]:

keys = sorted(list(hists.keys()))
for k in keys:
    p = plt.plot(tb.bin_edges_to_center(bins), hists[k][0]/np.max(hists[k][0]), 
Exemplo n.º 41
0
    def train(self, target, source, gen_optimizer, disc_optimizer, num_epochs=10,
              disc_steps=1, gen_lr_schedule=None, disc_lr_schedule=None,
              model_dir=os.getcwd(), save_interval=100, notebook_mode=False,
              batch_log=True, save_logs=None, display_metrics=True, save_metrics=True):
        assert(len(target.dataset) == len(source.dataset))
        assert(disc_steps < len(target.dataset))

        if not os.path.exists(model_dir):
            os.mkdir(model_dir)

        self.model_dir = model_dir
        models_gen = os.path.join(model_dir, "gen_models")
        models_disc = os.path.join(model_dir, "disc_models")

        if not os.path.exists(models_gen):
            os.mkdir(models_gen)

        if not os.path.exists(models_disc):
            os.mkdir(models_disc)

        iterations = 0

        from tqdm import tqdm_notebook
        from tqdm import tqdm

        train_start_time = time()

        for e in tqdm(range(num_epochs)):

            self.gen_model.train()
            self.disc_model.train()
            self.on_epoch_start(e)

            running_gen_loss = torch.Tensor([0.0])
            running_disc_loss = torch.Tensor([0.0])
            gen_loss = 0.0
            disc_loss = 0.0
            gen_data_len = 0
            disc_data_len = 0

            if notebook_mode and batch_log:
                progress_ = tqdm_notebook(enumerate(zip(target,source)))
            elif batch_log:
                progress_ = tqdm(enumerate(zip(target,source)))
            else:
                progress_ = enumerate(zip(target,source))

            init_time = time()

            for i,(t,s) in progress_:

                if isinstance(t, list) or isinstance(t, tuple):
                    inputs = t[0]
                else:
                    inputs = t
                batch_size = inputs.size(0)
                disc_data_len += batch_size

                if len(self.__input_hooks) > 0:

                    for hook in self.__input_hooks:
                        inputs = hook(inputs)

                if isinstance(t, list):
                    t[0] = inputs
                elif isinstance(t, tuple):
                    t = (inputs,t[1])
                else:
                    t = inputs

                self.__disc_train_func__(t, s, disc_optimizer, running_disc_loss, e, i)

                disc_loss = running_disc_loss.data[0] / disc_data_len

                if (i+1) % disc_steps == 0:
                    self.__gen_train_func__(t, s, gen_optimizer, running_gen_loss, e, i)
                    gen_data_len += batch_size

                    gen_loss = running_gen_loss.data[0] / gen_data_len

                if batch_log:
                     progress_dict = {"Gen Loss": gen_loss,"Disc Loss":disc_loss}
                     progress_.set_postfix(progress_dict)

                iterations += 1

                if iterations % save_interval == 0:
                    self.save(s,iterations)
                    self.show(s,iterations)

                self.on_batch_end(e, i, gen_loss, disc_loss)
            if self.cuda:
                cuda.synchronize()
            duration = time() - init_time

            self.disc_loss_history.append(disc_loss)
            self.gen_loss_history.append(gen_loss)

            if gen_lr_schedule is not None:
                lr = gen_lr_schedule(e)
                adjust_learning_rate(lr,gen_optimizer)

            if disc_lr_schedule is not None:
                lr = disc_lr_schedule(e)
                adjust_learning_rate(lr, disc_optimizer)

            model_file = os.path.join(models_gen, "gen_model_{}.pth".format(e))
            self.save_generator(model_file)

            model_file = os.path.join(models_disc, "disc_model_{}.pth".format(e))
            self.save_discriminator(model_file)

            print("Epoch: {}, Duration: {} , Gen Loss: {} Disc Loss: {}".format(e, duration, gen_loss,disc_loss))

            if save_logs is not None:
                logfile = open(save_logs, "a")
                logfile.write("Epoch: {}, Duration: {} , Gen Loss: {} Disc Loss: {}".format(e, duration, gen_loss,disc_loss))
                logfile.close()

            epoch_arr = [x for x in range(e + 1)]

            if display_metrics or save_metrics:

                save_path = None

                if save_metrics:
                    save_path = os.path.join(model_dir, "epoch_{}_loss.png".format(e))

                visualize(epoch_arr, [PlotInput(value=self.gen_loss_history, name="Generator Loss", color="red"),
                                      PlotInput(value=self.disc_loss_history, name="Discriminator Loss", color="blue")],display=display_metrics,
                          save_path=save_path)

            self.on_epoch_end(e,gen_loss, disc_loss, duration)
        train_end_time = time() - train_start_time
        self.on_training_completed(train_end_time)
Exemplo n.º 42
0
    def train(self, train_loader, loss_fn, optimizer,train_metrics,test_loader=None,test_metrics=None, num_epochs=10, lr_schedule=None,
              save_models="all", model_dir=os.getcwd(),notebook_mode=False,batch_log=True,save_logs=None,display_metrics=True,save_metrics=True):


        if save_models not in ["all", "best"]:
            raise ValueError("save models must be 'all' or 'best' , {} is invalid".format(save_models))
        if save_models == "best" and test_loader is None:
            raise ValueError("save models can only be best when testloader is provided")

        if test_loader is not None:
            if test_metrics is None:
                raise ValueError("You must provide a metric for your test data")
            elif len(test_loader) == 0:
                raise ValueError("test metrics cannot be an empty list")

        if not os.path.exists(model_dir):
            os.mkdir(model_dir)


        models_all = os.path.join(model_dir, "all_models")
        models_best = os.path.join(model_dir, "best_models")


        if not os.path.exists(models_all):
            os.mkdir(models_all)

        if not os.path.exists(models_best) and test_loader is not None:
            os.mkdir(models_best)


        from tqdm import tqdm_notebook
        from tqdm import tqdm

        best_metric = 0.0
        train_start_time = time()
        for e in tqdm(range(num_epochs)):
            print("Epoch {} of {}".format(e,num_epochs))

            for metric in train_metrics:
                metric.reset()

            self.model.train()
            self.on_epoch_start(e)

            running_loss = torch.Tensor([0.0])
            train_loss = 0.0
            data_len = 0


            if notebook_mode and batch_log:
                progress_ = tqdm_notebook(enumerate(train_loader))
            elif batch_log:
                progress_ = tqdm(enumerate(train_loader))
            else:
                progress_ = enumerate(train_loader)

            main_batch_size = 0

            init_time = time()

            for i, data in progress_:
                self.on_batch_start(e, i)

                if isinstance(data, list) or isinstance(data, tuple):
                    inputs = data[0]
                else:
                    inputs = data
                batch_size = inputs.size(0)

                if main_batch_size < batch_size:
                    main_batch_size = batch_size
                if len(self.__input_hooks) > 0:

                    for hook in self.__input_hooks:
                        inputs = hook(inputs)

                if isinstance(data, list):
                    data[0] = inputs
                elif isinstance(data, tuple):
                    data = (inputs,data[1])
                else:
                    data = inputs

                self.__train_func__(data,optimizer,loss_fn,train_metrics,running_loss,e,i)

                data_len += batch_size
                train_loss = running_loss.item()/data_len

                if batch_log:
                    progress_message = ""
                    for metric in train_metrics:
                        progress_message += "Train {} : {}".format(metric.name, metric.getValue())
                    progress_.set_description("{}/{} batches ".format(int(ceil(data_len / main_batch_size)),
                                                              int(ceil(len(train_loader.dataset) / main_batch_size))))
                    progress_dict = {"Train Loss": train_loss}
                    for metric in train_metrics:
                        progress_dict["Train " + metric.name] = metric.getValue()

                    progress_.set_postfix(progress_dict)

                self.on_batch_end(e, i, train_metrics, train_loss)
            if self.cuda:
                cuda.synchronize()

            self.loss_history.append(train_loss)
            duration = time() - init_time

            if lr_schedule is not None:
                lr = lr_schedule(e)
                adjust_learning_rate(lr,optimizer)

            model_file = os.path.join(models_all, "model_{}.pth".format(e))
            self.save_model(model_file)

            logfile = None
            if save_logs is not None:
                logfile = open(save_logs,"a")


            print(os.linesep+"Epoch: {}, Duration: {} , Train Loss: {}".format(e, duration, train_loss))
            if logfile is not None:
                logfile.write(os.linesep+"Epoch: {}, Duration: {} , Train Loss: {}".format(e, duration, train_loss))

            if test_loader is not None:
                message = "Accuracy did not improve"
                current_best = best_metric
                self.evaluate(test_loader,test_metrics)
                result = test_metrics[0].getValue()
                if result > current_best:
                    best_metric = result
                    message = "{} improved from {} to {}".format(test_metrics[0].name,current_best, result)
                    model_file = os.path.join(models_best,"model_{}.pth".format(e))
                    self.save_model(model_file)

                    print(os.linesep+"{} New Best Model saved in {}".format(message,model_file))
                    if logfile is not None:
                        logfile.write(os.linesep+"{} New Best Model saved in {}".format(message,model_file))

                else:
                    print(os.linesep+message)
                    if logfile is not None:
                        logfile.write(os.linesep+message)

                for metric in test_metrics:
                    print("Test {} : {}".format(metric.name,metric.getValue()))
                    if logfile is not None:
                        logfile.write(os.linesep+"Test {} : {}".format(metric.name,metric.getValue()))


            for metric in train_metrics:
                print("Train {} : {}".format(metric.name, metric.getValue()))
                if logfile is not None:
                    logfile.write(os.linesep + "Train {} : {}".format(metric.name, metric.getValue()))

            if logfile is not None:
                logfile.close()

            for metric in train_metrics:
                metric.add_history()


            epoch_arr = [x for x in range(e+1)]

            if display_metrics or save_metrics:

                save_path = None

                if save_metrics:
                    save_path = os.path.join(model_dir, "epoch_{}_loss.png".format(e))
                visualize(epoch_arr, [PlotInput(value=self.loss_history, name="Train Loss", color="red")],display=display_metrics,
                          save_path=save_path)

            if test_loader is not None and (display_metrics or save_metrics):
                    for metric in test_metrics:

                        save_path = None

                        if save_metrics:
                            save_path = os.path.join(model_dir, "test_{}_epoch_{}.png".format(metric.name, e))
                        visualize(epoch_arr, [PlotInput(value=metric.history, name="Test "+metric.name, color="blue")],display=display_metrics,
                                      save_path=save_path)
            for metric in train_metrics:
                if save_metrics:
                    save_path = os.path.join(model_dir, "train_{}_epoch_{}.png".format(metric.name, e))
                visualize(epoch_arr, [PlotInput(value=metric.history, name="Train " + metric.name, color="blue")],display=display_metrics,
                          save_path=save_path)

            self.on_epoch_end(e, train_metrics, test_metrics, train_loss, duration)
        train_end_time = time() - train_start_time

        self.on_training_completed(train_metrics,test_metrics,train_end_time)
Exemplo n.º 43
0
        X.loc[seg_id, 'q95_roll_mean_' + str(windows)] = np.quantile(
            x_roll_mean, 0.95)
        X.loc[seg_id, 'q99_roll_mean_' + str(windows)] = np.quantile(
            x_roll_mean, 0.99)
        X.loc[seg_id, 'av_change_abs_roll_mean_' + str(windows)] = np.mean(
            np.diff(x_roll_mean))
        X.loc[seg_id, 'av_change_rate_roll_mean_' + str(windows)] = np.mean(
            np.nonzero((np.diff(x_roll_mean) / x_roll_mean[:-1]))[0])
        X.loc[seg_id,
              'abs_max_roll_mean_' + str(windows)] = np.abs(x_roll_mean).max()


# In[ ]:

# iterate over all segments
for seg_id in tqdm_notebook(range(segments)):
    seg = train_df.iloc[seg_id * rows:seg_id * rows + rows]
    create_features(seg_id, seg, train_X)
    train_y.loc[seg_id, 'time_to_failure'] = seg['time_to_failure'].values[-1]

# Let's check the result. We plot the shape and the head of train_X.

# In[ ]:

train_X.shape

# In[ ]:

train_X.head(10)

# We scale the data.
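# A minimal sketch of the scaling step announced above, assuming the usual
# scikit-learn StandardScaler approach; the original continuation of this
# notebook is not shown here, so the column handling is illustrative only.

# In[ ]:

import pandas as pd
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_X_scaled = pd.DataFrame(scaler.fit_transform(train_X),
                              columns=train_X.columns, index=train_X.index)
train_X_scaled.head()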
Exemplo n.º 44
0
pd.DataFrame(y_test).to_csv('./predictions/y_true.csv',
                            index=False,
                            encoding='utf-8')


def get_coefs(word, *arr):
    try:
        # print("word:",word)
        # print("arr:",arr)
        return word, np.asarray(arr, dtype='float32')
    except:
        return None, None


embeddings_index = dict(
    get_coefs(*o.strip().split()) for o in tqdm_notebook(
        open('./embeddings/glove.twitter.27B.50d.txt', encoding="utf8")))

#print(embeddings_index)

embed_size = 50

i = 0  # counts embeddings dropped for having an unexpected shape
for k in tqdm_notebook(list(embeddings_index.keys())):
    v = embeddings_index[k]

    try:
        if v.shape != (embed_size, ):
            embeddings_index.pop(k)
            i = i + 1
    except:
        pass
Exemplo n.º 45
0
    def iterate_file(self,
                     fname=DS_FILE_NAME,
                     top_n_train=100000,
                     total=125000,
                     learning_rate=0.1,
                     tolerance=1e-16,
                     lmbda=0.01):

        self._loss = []
        n = 0
        accurate_sample = []
        # open the file
        with open(fname, 'r') as f:

            # walk over the lines of the file
            for line in tqdm_notebook(f, total=total, mininterval=1):
                desired_tags = []
                pair = line.strip().split('\t')
                if len(pair) != 2:
                    continue
                sentence, tags = pair
                # the question words are exactly the features x
                sentence = sentence.split(' ')
                # the question tags are y
                tags = set(tags.split(' '))

                # loss value for the current example
                sample_loss = 0

                # push the gradients through for every tag
                for tag in self._tags:
                    # the target is 1 if the current example carries this tag
                    y = int(tag in tags)

                    # compute the linear combination of the weights and the object's features
                    # initialize z
                    # FILL IN THE GAPS IN THE CODE
                    z = self._b[tag]
                    for word in sentence:
                        # in test mode, ignore words that are not in the vocabulary
                        if n >= top_n_train and word not in self._vocab:
                            continue
                        if word not in self._vocab:
                            self._vocab[word] = len(self._vocab)
                        z += self._w[tag][self._vocab[word]]
                    # compute the probability that the tag is present
                    # FILL IN THE GAPS IN THE CODE
                    if z >= 0:
                        sigma = 1 / (1 + np.exp(-z))
                    else:
                        sigma = 1 - 1 / (1 + np.exp(z))

                    # update the loss value for the current example
                    # FILL IN THE GAPS IN THE CODE
                    if y == 1:
                        sample_loss += -y * np.log(np.max([tolerance, sigma]))

                    else:
                        sample_loss += -(1 - y) * np.log(1 - np.min([1 - tolerance, sigma]))

                    # if we are still in the training part, update the parameters
                    if n < top_n_train:
                        # compute the derivative of the log-likelihood with respect to the weight
                        # FILL IN THE GAPS IN THE CODE
                        dLdw = y - sigma

                        # take a gradient step
                        # we minimize the negative log-likelihood (the second minus sign)
                        # so we step against the gradient direction to minimize (the first minus sign)
                        for word in sentence:
                            self._w[tag][self._vocab[word]] -= -learning_rate * dLdw
                        self._b[tag] -= -learning_rate * dLdw
                    if sigma > 0.9:
                        desired_tags.append(tag)
                if (n > top_n_train):
                    accurate_sample.append(len(tags.intersection(desired_tags)) / len(tags.union(desired_tags)))
                n += 1

                self._loss.append(sample_loss)
            return (np.mean(accurate_sample))
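# A small standalone illustration of the numerically stable sigmoid branch used
# above: for z < 0 the code evaluates 1 - 1/(1 + exp(z)), which is algebraically
# equal to 1/(1 + exp(-z)) but never exponentiates a large positive number.
# The helper name below is illustrative only.
import numpy as np

def stable_sigmoid(z):
    if z >= 0:
        return 1.0 / (1.0 + np.exp(-z))
    return 1.0 - 1.0 / (1.0 + np.exp(z))

print(stable_sigmoid(1000.0), stable_sigmoid(-1000.0))  # 1.0 and 0.0, with no overflow warning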
Exemplo n.º 46
0
    g_optimizer = torch.optim.Adam(model.G.parameters(),
                                   lr=lr,
                                   betas=(0.5, 0.999))

    z_sample = Variable(torch.randn(64, z_dim))
    z_sample = z_sample.to(device)

    D_loss = []
    G_loss = []
    GP = []
    images = []
    lam = 10.

    try:
        for epoch in range(args.STARTING_EPOCH, args.STARTING_EPOCH + epochs):
            for i, (imgs, _) in enumerate(tqdm_notebook(trainloader)):
                step = epoch * len(trainloader) + i + 1

                # set train
                model.G.train()

                # leafs
                imgs = Variable(imgs)
                bs = imgs.size(0)
                z = Variable(torch.randn(bs, z_dim))
                imgs, z = imgs.to(device), z.to(device)

                f_imgs = model.G(z)

                r_logit = model.D(imgs)
                f_logit = model.D(f_imgs.detach())
Exemplo n.º 47
0
    cur_seg = final_model.predict(c_img)[0]
    cur_seg = binary_opening(cur_seg>0.5, np.expand_dims(disk(2), -1))
    return cur_seg, c_img

def pred_encode(img):
    cur_seg, _ = predict(img)
    cur_rles = rle_encode(cur_seg)
    return [img, cur_rles if len(cur_rles) > 0 else None]


from tqdm import tqdm_notebook

test_paths = np.array(os.listdir(test_image_dir))
#test_paths = test_paths[0:10]
out_pred_rows = []
for c_img_name, index in zip(tqdm_notebook(test_paths), range(len(test_paths))):
    out_pred_rows += [pred_encode(c_img_name)]
    if index % 200 == 0:
        print('Processed {} test images'.format(index))

sub = pd.DataFrame(out_pred_rows)
sub.columns = ['ImageId', 'EncodedPixels']
sub.to_csv('submission_003.csv', index=False)

test_masks = pd.read_csv('submission_002.csv')
test_masks['ships'] = test_masks['EncodedPixels'].map(lambda c_row: 1 if isinstance(c_row, str) else 0)
test_masks.count()
test_masks.query('ships>0').count()
train_set.shape
train_set.query('ship_num>0').shape
Exemplo n.º 48
0
def calibrate(arguments):
    calibration_data_filename, groups_to_calibrate, ids_in_subwatershed, parameter_realz, objective_function, minimize_objective_function, cpu = arguments
    
    # Load calibration data
    calibration_data = pickle.load(open(os.path.join(parent_dir, 'calibration_data', calibration_data_filename), 'rb'))
    calibration_data = calibration_data[spinup_date:stop_date]
    
    N = len(parameter_realz)

    # for each parameter realization
    objs = []
    best_fit = pd.DataFrame({'modeled':np.zeros(len(timestamps_hillslope))}, index=timestamps_hillslope).resample('D').mean()
    if minimize_objective_function: 
        objs_curr = np.inf
    else:
        objs_curr = -np.inf

    best_index = -1
    desc = "Core #%s"%(cpu)
    for i in tqdm_notebook(range(N), desc=desc):
        solved_groups = {}
        parameter_group_params = {}
        parameter_group_params = parameter_realz[i]

        solved_group_hillslopes_dict = {}
        for group_id in groups_to_calibrate:

            parameter_group_id = group_id[0]
            climate_group_id = group_id[1]

            vz = parameter_group_params[parameter_group_id]['vz'](**parameter_group_params[parameter_group_id])
            gz = parameter_group_params[parameter_group_id]['gz'](**parameter_group_params[parameter_group_id])    

            rew = REW(vz, gz,  **{'pet':climate_group_forcing[climate_group_id].pet, 'ppt':climate_group_forcing[climate_group_id].ppt, 'aspect':90})

            # storageVZ    = np.zeros(np.size(t))
            # storageGZ     = np.zeros(np.size(t))
            discharge       = np.zeros(np.size(t))
            leakage         = np.zeros(np.size(t))
            # ET              = np.zeros(np.size(t))

            # Resample pet and ppt to integration timestep
            ppt = np.array(rew.ppt[start_date:stop_date].resample(resample_freq_hillslope).ffill())
            pet = np.array(rew.pet[start_date:stop_date].resample(resample_freq_hillslope).ffill())

            # Solve group hillslope
            for l in range(len(t)):
                rew.vz.update(dt,**{'ppt':ppt[l],'pet':pet[l]})
                # storageVZ[l] = rew.vz.storageVZ
                leakage[l]      = rew.vz.leakage
                # ET[l]           = rew.vz.ET   
                rew.gz.update(dt,**{'leakage':leakage[l]})
                # storageGZ[l] = rew.gz.storageGZ
                discharge[l] = rew.gz.discharge

            # resample as daily data
            solved_groups[group_id] = pd.DataFrame({'discharge':discharge}, index=timestamps_hillslope).resample('D').mean()

        total_area = 0
        for rew_id in ids_in_subwatershed:
            total_area += rew_config[rew_id]['area_sqkm']

        name = str(i) + 'discharge'
        solved_subwatershed = pd.DataFrame({name:np.zeros(len(timestamps_hillslope))}, index=timestamps_hillslope).resample('D').mean()

        solved_subwatershed_array = np.zeros(int(len(solved_subwatershed)))
        for rew_id in ids_in_subwatershed:
            solved_subwatershed_array += rew_config[rew_id]['area_sqkm']/total_area*solved_groups[rew_config[rew_id]['group']]['discharge']

        solved_subwatershed[name] = solved_subwatershed_array
        objs.append(objective_function(solved_subwatershed[name][spinup_date:stop_date],calibration_data['runoff'][spinup_date:stop_date]))
        
        if minimize_objective_function:
            if objs[i]<objs_curr:
                objs_curr = objs[i]
                best_index = i
                best_fit = solved_subwatershed[name].copy()
                print('Min objective function value so far is: ' + str(objs_curr))
        else:
            if objs[i]>objs_curr:
                objs_curr = objs[i]
                best_index = i
                best_fit = solved_subwatershed[name].copy()
    
    return (best_fit, objs_curr, best_index)
Exemplo n.º 49
0
import time
import re
import pickle
import os
from collections import defaultdict,OrderedDict
import multiprocessing
import collections
import json

from tqdm import tnrange, tqdm_notebook
from tqdm import tqdm, tqdm_pandas
tqdm_notebook().pandas()

import numpy as np
import pandas as pd
pd.options.display.max_rows = 25
pd.options.display.max_columns = 999

#from datetime import datetime, timedelta, timezone
import keras

from IPython.display import display

from keras.models import Sequential, Model
from keras.layers import Input, Dense, LSTM, Dropout, Activation, GRU, Embedding
from keras.layers import concatenate as Concatenate
from keras.layers.core import Flatten, Reshape
from keras.layers.convolutional import *
from keras.layers.pooling import *
from keras.layers.normalization import BatchNormalization
from keras.layers.noise import GaussianDropout
Exemplo n.º 50
0
df_test = df_test.iloc[test_idx].copy()
del df_ratings

# In[14]:

df_test.iloc[1]["comment"], [
    mapping.get(x, UNK) for x in df_test.iloc[1]["comment"].split(" ")
]

# In[15]:

results = []
tokens_train, tokens_val, tokens_test = [], [], []
for df, tokens in zip((df_train, df_val, df_test),
                      (tokens_train, tokens_val, tokens_test)):
    for i, row in tqdm_notebook(df.iterrows(), total=df.shape[0]):
        tokens.append(
            np.array([BEG] +
                     [mapping.get(x, UNK) for x in row["comment"].split(" ")]))

# In[16]:

assert len(tokens_train) == df_train.shape[0]

# In[74]:

tokens_val[0]

# In[75]:

df_val.iloc[0]
Exemplo n.º 51
0
def run(data_dir: str = './env/data',
        vae_dir: str = './vae/model',
        mdnrnn_dir: str = './mdnrnn/model',
        epochs: int = 20) -> None:
    """
    Train mdnrnn using saved environment rollouts.

    Parameters
    ----------
    data_dir
        Directory with train and test data.
    vae_dir
        Directory to load VAE model from.
    mdnrnn_dir
        Directory to optionally load MDNRNN model from and save trained model to.
    epochs
        Number of training epochs.
    """
    # set random seed and deterministic backend
    SEED = 123
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

    # use GPU if available
    cuda = torch.cuda.is_available()
    device = torch.device("cuda" if cuda else "cpu")

    # define input transformations
    transform_train = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((H, W)),
        transforms.ToTensor(),
    ])

    transform_test = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((H, W)),
        transforms.ToTensor(),
    ])

    # define train and test datasets
    dir_train = os.path.join(data_dir, 'train/')
    dir_test = os.path.join(data_dir, 'test/')
    dataset_train = GymDataset(dir_train,
                               seq_len=SEQ_LEN,
                               transform=transform_train)
    dataset_test = GymDataset(dir_test,
                              seq_len=SEQ_LEN,
                              transform=transform_test)
    dataset_test.load_batch(0)  # 1 batch of data used for test set
    dataloader_test = torch.utils.data.DataLoader(dataset_test,
                                                  batch_size=BATCH_SIZE,
                                                  shuffle=False,
                                                  collate_fn=collate_fn)

    # define and load VAE model
    vae = VAE(CHANNELS, LATENT_SIZE)
    load_vae_file = os.path.join(vae_dir, 'best.tar')
    state_vae = torch.load(load_vae_file)
    vae.load_state_dict(state_vae['state_dict'])
    vae.to(device)

    # set save and optional load directories for the MDNRNN model
    load_mdnrnn_file = os.path.join(mdnrnn_dir, 'best.tar')
    try:
        state_mdnrnn = torch.load(load_mdnrnn_file)
    except FileNotFoundError:
        state_mdnrnn = None

    # define and load MDNRNN model
    mdnrnn = MDNRNN(LATENT_SIZE,
                    ACTION_SIZE,
                    HIDDEN_SIZE,
                    N_GAUSS,
                    rewards_terminal=False)
    if state_mdnrnn is not None:
        mdnrnn.load_state_dict(state_mdnrnn['state_dict'])
    mdnrnn.zero_grad()
    mdnrnn.to(device)

    # optimizer
    params = [p for p in mdnrnn.parameters() if p.requires_grad]
    optimizer = RMSprop(params, lr=LR, alpha=.9)
    if state_mdnrnn is not None:
        optimizer.load_state_dict(state_mdnrnn['optimizer'])

    # learning rate scheduling
    lr_scheduler = StepLR(optimizer, step_size=3, gamma=0.1)
    if state_mdnrnn is not None:
        lr_scheduler.load_state_dict(state_mdnrnn['scheduler'])

    # helper function
    def img2latent(obs, batch_size):
        """ Function to go from image to latent space. """
        with torch.no_grad():
            obs = obs.view(-1, CHANNELS, H, W)
            _, mu, logsigma = vae(obs)
            latent = (mu + logsigma.exp() * torch.randn_like(mu)).view(
                batch_size, SEQ_LEN, LATENT_SIZE)
        return latent

    # define test fn
    def test():
        """ One test epoch """
        mdnrnn.eval()
        test_loss = 0
        n_test = len(dataloader_test.dataset)
        with torch.no_grad():
            for (obs, action, next_obs) in generate_obs(dataloader_test):

                batch_size = len(obs)

                # place on device
                try:
                    obs = torch.stack(obs).to(device)
                    next_obs = torch.stack(next_obs).to(device)
                    action = torch.stack(action).to(device)
                except:
                    print(
                        'Did not manage to stack test observations and actions.'
                    )
                    n_test -= batch_size
                    continue

                # convert to latent space
                latent_obs = img2latent(obs, batch_size)
                next_latent_obs = img2latent(next_obs, batch_size)

                # need to flip dims to feed into LSTM from [batch, seq_len, dim] to [seq_len, batch, dim]
                latent_obs, action, next_latent_obs = [
                    arr.transpose(1, 0)
                    for arr in [latent_obs, action, next_latent_obs]
                ]

                # forward pass model
                mus, sigmas, logpi = mdnrnn(action, latent_obs)

                # compute loss
                loss = gmm_loss(next_latent_obs, mus, sigmas, logpi)
                test_loss += loss.item()

        test_loss /= n_test
        return test_loss

    # train
    n_batch_train = len(dataset_train.batch_list)
    optimizer.zero_grad()

    cur_best = None

    tq_episode = tqdm_notebook(range(epochs))
    for epoch in tq_episode:

        mdnrnn.train()
        loss_train = 0
        n_batch = 0

        tq_batch = tqdm_notebook(range(n_batch_train))
        for i in tq_batch:  # loop over training data for each epoch

            dataset_train.load_batch(i)
            dataloader_train = torch.utils.data.DataLoader(
                dataset_train,
                batch_size=BATCH_SIZE,
                shuffle=True,
                collate_fn=collate_fn)

            tq_minibatch = tqdm_notebook(generate_obs(dataloader_train),
                                         total=len(dataloader_train),
                                         leave=False)
            for j, (obs, action, next_obs) in enumerate(tq_minibatch):

                n_batch += 1

                # place on device
                batch_size = len(obs)
                try:
                    obs = torch.stack(obs).to(device)
                    next_obs = torch.stack(next_obs).to(device)
                    action = torch.stack(action).to(device)
                except:
                    print('Did not manage to stack observations and actions.')
                    continue

                # convert to latent space
                latent_obs = img2latent(obs, batch_size)
                next_latent_obs = img2latent(next_obs, batch_size)

                # need to flip dims to feed into LSTM from [batch, seq_len, dim] to [seq_len, batch, dim]
                latent_obs, action, next_latent_obs = [
                    arr.transpose(1, 0)
                    for arr in [latent_obs, action, next_latent_obs]
                ]

                # forward pass model
                mus, sigmas, logpi = mdnrnn(action, latent_obs)

                # compute loss
                loss = gmm_loss(next_latent_obs, mus, sigmas, logpi)

                # backward pass
                loss.backward()

                # store loss value
                loss_train += loss.item()
                loss_train_avg = loss_train / (n_batch * BATCH_SIZE)

                # apply the accumulated gradients every GRAD_ACCUMULATION_STEPS minibatches
                if (j + 1) % GRAD_ACCUMULATION_STEPS == 0:
                    optimizer.step()
                    optimizer.zero_grad()

                tq_minibatch.set_postfix(loss_train=loss_train_avg)

            tq_batch.set_postfix(loss_train=loss_train_avg)

        lr_scheduler.step()

        # evaluate on test set
        loss_test_avg = test()

        # checkpointing
        best_filename = os.path.join(mdnrnn_dir, 'best.tar')
        filename = os.path.join(mdnrnn_dir, 'checkpoint.tar')
        is_best = cur_best is None or loss_test_avg < cur_best
        if is_best:
            cur_best = loss_test_avg

        save_checkpoint(
            {
                'epoch': epoch,
                'state_dict': mdnrnn.state_dict(),
                'precision': loss_test_avg,
                'optimizer': optimizer.state_dict(),
                'scheduler': lr_scheduler.state_dict()
            }, is_best, filename, best_filename)

        tq_episode.set_postfix(loss_train=loss_train_avg,
                               loss_test=loss_test_avg)
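
The loop above only calls optimizer.step() every GRAD_ACCUMULATION_STEPS minibatches, letting gradients accumulate in between. Below is a minimal standalone sketch of that pattern, with a toy model and random data standing in for the MDN-RNN; scaling the loss by the accumulation count is standard practice and an addition here, not taken from the snippet.

# Minimal gradient-accumulation sketch (toy model and data; not the MDN-RNN above).
import torch

ACCUM_STEPS = 4                                    # assumed stand-in for GRAD_ACCUMULATION_STEPS
model = torch.nn.Linear(8, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = torch.nn.MSELoss()

optimizer.zero_grad()
for j in range(32):                                # stand-in for the minibatch loop
    x, y = torch.randn(16, 8), torch.randn(16, 1)
    loss = loss_fn(model(x), y) / ACCUM_STEPS      # scale so the accumulated gradient is an average
    loss.backward()                                # gradients add up across iterations
    if (j + 1) % ACCUM_STEPS == 0:
        optimizer.step()                           # apply the accumulated gradients
        optimizer.zero_grad()                      # reset for the next accumulation window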
Exemplo n.º 52
0
    def on_train_begin(self, logs):
        self.progbar = tqdm_notebook(desc='', total=self.params['nb_steps'], leave=True, mininterval=0.5)
        self.train_start = timeit.default_timer()
        self.metrics_names = self.model.metrics_names
        print('Training for {} steps ...'.format(self.params['nb_steps']))
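
The hook above only opens the notebook progress bar. A hedged sketch of companion hooks that would advance and close it in a keras-rl-style callback follows; the method names and bodies are assumptions, not part of the snippet.

    # Hypothetical companion hooks for the same callback class (assumed, not shown in the snippet).
    def on_step_end(self, step, logs):
        # advance the notebook bar by one environment step
        self.progbar.update(1)

    def on_train_end(self, logs):
        duration = timeit.default_timer() - self.train_start
        self.progbar.close()
        print('done, took {:.3f} seconds'.format(duration))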
Exemplo n.º 53
0
# ## BioRxiv -> Doc Embeddings

biorxiv_xpath_str = "//abstract/p|//abstract/title|//body/sec//p|//body/sec//title"
word_model = Word2Vec.load(
    str(
        Path(
            "../word_vector_experiment/output/word2vec_models/300/biorxiv_300.model"
        )))

biorxiv_document_map = {
    document: generate_doc_vector(
        word_model,
        document_path=str(Path("output/biorxiv_xml_files") / document),
        xpath=biorxiv_xpath_str,
    )
    for document in tqdm_notebook(biorxiv_documents)
}

# +
biorxiv_vec_df = (
    pd.DataFrame.from_dict(biorxiv_document_map, orient="index")
    .rename(columns={col: f"feat_{col}" for col in range(300)})
    .rename_axis("document")
    .reset_index()
)

biorxiv_vec_df.to_csv("output/polka_et_al_biorxiv_embeddings.tsv",
                      sep="\t",
                      index=False)

biorxiv_vec_df.head().T
# -
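
generate_doc_vector is not defined in this snippet. A plausible minimal implementation follows, assuming it averages the Word2Vec vectors of tokens extracted by the XPath query; lxml and a naive whitespace tokenizer are assumptions here, and the real pipeline likely tokenizes differently.

# Hypothetical sketch of generate_doc_vector: average word vectors over XPath-selected text.
import numpy as np
from lxml import etree

def generate_doc_vector(word_model, document_path, xpath):
    tree = etree.parse(document_path)
    tokens = []
    for node in tree.xpath(xpath):
        tokens.extend("".join(node.itertext()).lower().split())
    # keep only tokens the Word2Vec vocabulary knows about
    vectors = [word_model.wv[tok] for tok in tokens if tok in word_model.wv]
    if not vectors:
        return np.zeros(word_model.vector_size)
    return np.mean(vectors, axis=0)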
Exemplo n.º 54
0
from sentence 
where entity_types::text like '%%Gene%%' or entity_types::text like '%%Disease%%';
'''
sentence_df = pd.read_sql(sql, database_str)
sentence_df.head(2)


# In[8]:


entity_data = []
tagging_error_ids = set()

#skip tagging error
skip_tag_error = False
for index, row in tqdm_notebook(sentence_df.iterrows()):
    
    # create a dictionary for mapping entity types
    entity_mapper = {"sentence_id": row['sentence_id']}
    
    #Keep track of previous entity
    previous_entity = 'o'
    
    # For all entities in a given sentence, decide what is tagged
    for entity in row['entity_types']:
        entity = entity.lower()

        # Non-O tag
        if entity != 'o' and previous_entity == 'o':
            # If the entity has not been seen before, instantiate it
            if entity not in entity_mapper:
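
The snippet is cut off in the middle of the tagging loop. A self-contained sketch of the span-counting idea it appears to implement follows: each contiguous run of non-'o' tags is counted once for its entity type. The toy tag sequence and the continuation are assumptions, not the original code.

# Hypothetical continuation of the span-counting logic above, on a toy tag sequence.
def count_entity_spans(entity_types):
    counts = {}
    previous_entity = 'o'
    for entity in entity_types:
        entity = entity.lower()
        # only count the start of a contiguous run of non-'o' tags
        if entity != 'o' and previous_entity == 'o':
            counts[entity] = counts.get(entity, 0) + 1
        previous_entity = entity
    return counts

print(count_entity_spans(['o', 'gene', 'gene', 'o', 'disease', 'o', 'gene']))
# {'gene': 2, 'disease': 1}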
Exemplo n.º 55
0
    ]
    f3list = [
        'Census_ProcessorCoreCount', 'Census_OEMNameIdentifier',
        'CityIdentifier'
    ]
    f4list = [
        'GeoNameIdentifier', 'Census_OEMNameIdentifier',
        'Census_OSBuildRevision'
    ]
    f5list = [
        'Census_OEMModelIdentifier', 'CityIdentifier',
        'Census_FirmwareVersionIdentifier'
    ]

    flist = [f1list, f2list, f3list, f4list, f5list]
    for i in tqdm_notebook(range(5)):
        temp = all_data.groupby(flist[i]).size().reset_index(name='counts' +
                                                             str(i))
        all_data = pd.merge(all_data,
                            temp,
                            how='left',
                            left_on=flist[i],
                            right_on=flist[i])
        col = 'counts' + str(i)
        all_data[col] = all_data[col].astype('int32')
    train = all_data[:train_shape[0]]
    test = all_data[train_shape[0]:]
    del all_data, temp
    gc.collect()

cols = train.columns.tolist()
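
The same group-count feature can be seen on a toy frame; below is a minimal sketch of the groupby-size-merge pattern used above, with toy data and a single key list.

# Toy illustration of the group-count feature pattern used above.
import pandas as pd

toy = pd.DataFrame({
    'CityIdentifier': [1, 1, 2, 2, 2],
    'Census_OEMNameIdentifier': [10, 10, 10, 20, 20],
})
keys = ['CityIdentifier', 'Census_OEMNameIdentifier']

counts = toy.groupby(keys).size().reset_index(name='counts0')
toy = pd.merge(toy, counts, how='left', on=keys)
toy['counts0'] = toy['counts0'].astype('int32')
print(toy)
# each row now carries how many rows share its (CityIdentifier, Census_OEMNameIdentifier) pair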
Exemplo n.º 56
0
def worker(args):
    df, seg = args
    n_sample = len(df)

    output = []
    for i, sample in tqdm_notebook(df.iterrows()):
        filename, ebird_code, duration = sample[[
            'filename', 'ebird_code', 'duration'
        ]]
        path_folder = sample['folder']
        path_audio = os.path.join(path_folder, ebird_code, filename)

        try:
            signal, _ = librosa.load(path_audio,
                                     sr=sr,
                                     mono=True,
                                     res_type='kaiser_fast')
        except Exception:
            print('file {} corrupted.'.format(filename))
            continue
        signal = librosa.effects.trim(signal)[0]
        len_signal = len(signal)

        max_attemp = 100
        cnt_attemp = 0
        max_snr = -1
        tmp_chunk = None
        while cnt_attemp < max_attemp:
            cnt_attemp += 1

            chunk = np.zeros(len_frame)
            if len_signal > len_frame:
                i_start = np.random.randint(len_signal - len_frame)
                chunk[:] = signal[i_start:i_start + len_frame]
            elif len_signal < len_frame:
                i_start = np.random.randint(len_frame - len_signal)
                chunk[i_start:i_start + len_signal] = signal
            else:
                chunk[:] = signal

            mel_spec = melspectrogram(chunk, sr, mel_filterbank,
                                      **paras_mel_spectrogram)
            mel_spec = librosa.power_to_db(mel_spec)
            mel_spec = to_image(mel_spec)

            snr = signal_noise_ratio(mel_spec)
            if snr > snr_threshold and cnt_attemp < max_attemp:
                tmp_chunk = chunk
                break
            elif snr > max_snr:
                tmp_chunk = chunk
                max_snr = snr

        # use the best chunk found above (either the first one over the SNR threshold
        # or the highest-SNR attempt) and add noise before the final spectrogram
        chunk = add_noise(tmp_chunk)
        mel_spec = melspectrogram(chunk, sr, mel_filterbank,
                                  **paras_mel_spectrogram)
        mel_spec = librosa.power_to_db(mel_spec)
        mel_spec = to_image(mel_spec)
        output.append((mel_spec * 255).astype(np.uint8))

        gc.collect()

    output = np.array(output)
    np.save('spectrogram{}.npy'.format(seg), output)

    print('segment {} complete'.format(seg))
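
worker takes a (df, seg) tuple, which suggests it is dispatched over dataframe segments. A hedged usage sketch follows; the segment count, the train_df name, and the Pool-based dispatch are assumptions, not shown in the snippet, and a plain loop over the same tuples works as well.

# Hypothetical dispatch of worker over dataframe segments (assumed, not shown above).
import numpy as np
from multiprocessing import Pool

if __name__ == '__main__':
    n_seg = 4                                           # assumed number of segments
    segments = np.array_split(train_df, n_seg)          # train_df: full metadata frame (assumed name)
    args = [(seg_df, seg) for seg, seg_df in enumerate(segments)]
    with Pool(n_seg) as pool:
        pool.map(worker, args)                          # each call writes spectrogram{seg}.npy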
Exemplo n.º 57
0
    def __init__(self, **kwargs):
        self.bar = tqdm_notebook(**kwargs)
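
Only the constructor of this thin wrapper appears in the snippet; below is a sketch of the methods such a wrapper would typically forward to the underlying bar. These are assumptions, not part of the original class.

    # Hypothetical remainder of the wrapper class (only __init__ appears in the snippet).
    def update(self, n=1):
        self.bar.update(n)

    def set_description(self, desc):
        self.bar.set_description(desc)

    def close(self):
        self.bar.close()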
Exemplo n.º 58
0
# Preprocessing libraries
import nltk
import csv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

import pandas as pd
import os.path

from tqdm import tqdm_notebook
#Search Engine 1
dir_path = r"C:\Users\loren\Downloads\HW3\TSVFile"
 
# Step 1: build the path for each article file
for i in tqdm_notebook(range(len(os.listdir(dir_path)))):
    filename = os.path.join(dir_path, "article_{}.tsv".format(i))
        
    df = pd.read_csv(filename, sep='\t', encoding='utf-8') # Creating a dataframe for each article
    doc = 'article_{}.tsv'.format(i)
    
    col, message = information(df)
    if message == 'Continue':
        continue
    elif message == 'Pass':
        pass
     
    # Step 2: collect all the fields to tokenize
       
    to_tokenize = col[0]+col[1]+col[2]+col[3]+col[4]+col[5]+col[6]+col[7]+col[8]+col[9]+col[10]+col[11]+col[12]+col[13]
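
The snippet ends before the NLTK imports at the top are used. A hedged sketch of the tokenize / stop-word / stem step they suggest follows; the preprocess name is hypothetical, and the punkt and stopwords corpora must be downloaded first.

# Hypothetical preprocessing step using the imports above (tokenize, drop stop words, stem).
# Requires: nltk.download('punkt'); nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
    tokens = word_tokenize(text.lower())
    return [stemmer.stem(tok) for tok in tokens
            if tok.isalpha() and tok not in stop_words]

# e.g. tokens = preprocess(to_tokenize)  # assuming to_tokenize is a single string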