Example #1
    def _preprocess_episodes(self, episodes, dictionary, mode):
        """
        Tokenize all the fields in Wizard-of-Wikipedia
        """
        colorlog.info("Preprocess wizard of wikipedia dataset")
        def tokenize(x):
            return ' '.join(
                [str(data_vocab.BERT_CLS_ID)] +
                [str(y) for y in dictionary.convert_tokens_to_ids(dictionary.tokenize(x))] +
                [str(data_vocab.BERT_SEP_ID)])

        new_episodes = []
        for episode_num, episode in enumerate(tqdm(episodes, ncols=70)):
            new_examples = []
            for example_num, example in enumerate(episode):
                # Tokenize inputs and convert tokens to ids
                context = tokenize(example['text'])
                if mode == "train":
                    response = tokenize(example['labels'][0])
                else:
                    response = tokenize(example['eval_labels'][0])
                chosen_topic = tokenize(example['chosen_topic'])

                # Set up knowledge
                checked_knowledge = (example['title'] + ' __knowledge__ ' +
                                     example['checked_sentence'])
                knowledges = [checked_knowledge] + \
                    example['knowledge'].rstrip().split('\n')
                for idx, k in enumerate(knowledges[1:]):
                    if k == checked_knowledge:
                        break
                else:
                    # Sometimes the knowledge pool does not include the
                    # checked sentence.
                    idx = None
                    colorlog.warning(
                        "Knowledge does not include checked sentence.")
                if idx is not None:  # idx may legitimately be 0
                    del knowledges[idx + 1]

                # Tokenize knowledge
                knowledge_sentences = [tokenize(k) for k in knowledges]

                new_example = {
                    'context': context,
                    'response': response,
                    'chosen_topic': chosen_topic,
                    'knowledge_sentences': knowledge_sentences,
                    'episode_num': episode_num,
                    'example_num': example_num
                }
                new_examples.append(new_example)
            new_episodes.append(new_examples)

        if self._datapath:
            episodes_fname = self._get_preprocessed_fname(mode)
            colorlog.info(f"Cache preprocessed dataset to {episodes_fname}")
            with open(episodes_fname, 'w') as fp:
                for episode in new_episodes:
                    fp.write(json.dumps(episode) + '\n')

        return new_episodes, dictionary
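
A note on the search loop above: it relies on Python's for/else, where the else branch runs only when the loop finishes without hitting break. A minimal illustration with toy values:

    for idx, k in enumerate(["a", "b", "c"]):
        if k == "b":
            break
    else:
        idx = None  # "b" was never found
    print(idx)  # -> 1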
Example #2
def average_gradients(tower_grads, skip_none=False):
    """
      From tensorflow cifar 10 tutorial codes
      Calculate the average gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
      tower_grads: List of lists of (gradient, variable) tuples. The outer list
        is over individual gradients. The inner list is over the gradient
        calculation for each tower.
      skip_none: Boolean. Whether to throw away non gradient variable or raise exception.
    Returns:
       List of pairs of (gradient, variable) where the gradient has been averaged
       across all towers.
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Skip None gradients
        if grad_and_vars[0][0] is None and skip_none:
            colorlog.warning("%s has None gradient" % grad_and_vars[0][1].name)
            continue

        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        grads = []
        for g, _ in grad_and_vars:
            # Add 0 dimension to the gradients to represent the tower.
            expanded_g = tf.expand_dims(g, 0)

            # Append on a 'tower' dimension which we will average over below.
            grads.append(expanded_g)

        # Average over the 'tower' dimension.
        grad = tf.concat(grads, 0)
        grad = tf.reduce_mean(grad, 0)

        # Keep in mind that the Variables are redundant because they are shared
        # across towers. So .. we will just return the first tower's pointer to
        # the Variable.
        v = grad_and_vars[0][1]
        grad_and_var = (grad, v)
        average_grads.append(grad_and_var)
    return average_grads
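
A minimal usage sketch of average_gradients (graph-mode TF1 style, matching the tutorial-era code above; the variable and gradient values are made up):

    import tensorflow.compat.v1 as tf
    tf.disable_eager_execution()

    v = tf.get_variable("w", shape=[2], initializer=tf.zeros_initializer())
    tower_grads = [
        [(tf.constant([1.0, 2.0]), v)],  # (grad, var) from tower/GPU 0
        [(tf.constant([3.0, 4.0]), v)],  # (grad, var) from tower/GPU 1
    ]
    avg = average_gradients(tower_grads)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(avg[0][0]))  # -> [2. 3.], the element-wise mean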
Example #3
    def _create_report(self, onweb=True):
        section = """<div>
        <b>DrugID:</b> %(drugid)s</br>
        <b>Regression method:</b> %(method)s </br>
        <b>Regression, alpha parameter used:</b> %(alpha)s</br>
        <b>Bayes factor:</b> %(bayes)s</br>
        <b>Coefficient of regression (pearson):</b> %(Rp)s</br>
        </div>
        """ % self.params
        self.jinja['sections'].append(section)

        text = {}
        text['boxplot'] = ("This boxplot shows the %s most important features "
                           "(based on the weights of the regression).")
        text['boxplot'] %= self.caller.config["boxplot_n"]

        text['importance'] = ("Features with non-null weights. If empty, "
                              "no features of interest were found.")

        text['randomness'] = (
            "Here we run the regression analysis %s times and "
            "plot the regression value (x-axis) for the real data (blue) "
            "and for data where the variable to explain is randomised (red).")
        text['randomness'] %= self.caller.config['randomness']

        text['weights'] = ("Features with non-null weights. If empty, "
                           "no features of interest were found.")

        for this in ["boxplot", "randomness", "importance", "weights"]:
            self.params['name'] = this
            self.params['text'] = text[this]
            filename = self.caller.prefix_images + "%(name)s_%(drugid)s.png" % self.params
            self.params["filename"] = filename
            self.params['title'] = this.title()
            if os.path.exists(filename):
                section = """<div>
                <h2>%(title)s results</h2>
                <p>%(text)s</p>
                <img src="%(filename)s">
                """ % self.params
                self.jinja['sections'].append(section)
            else:
                logger.warning("%s not found. Skipped" % filename)
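
For reference, the report builder above leans on Python's dict-based "%" substitution throughout; a tiny illustration with made-up values:

    params = {"name": "boxplot", "drugid": 1047}
    print("%(name)s_%(drugid)s.png" % params)  # -> boxplot_1047.png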
Example #4
def count_reps(pose_estimator: PoseEstimator, tts: TTS, exercises: dict, exercise: str,
               count_to: int):
  joints = exercises[exercise]['joints']
  mse_threshold = exercises[exercise]['mse_threshold']
  time_based = exercises[exercise]['time_based']
  joint_angle_estimator = JointAngleEstimator(joints)
  counter = 0

  countdown_thread = Thread(target=countdown, args=(3, tts))
  countdown_thread.start()

  while countdown_thread.is_alive():
    joint_angle_estimator.init(pose_estimator.get_keypoints())

  if not joint_angle_estimator.is_init:
    logging.warning('Couldn\'t capture initial joint angles. Cancelling exercise.')
    return
  else:
    logging.info(f'Initial joint angles captured as {joint_angle_estimator.get_init_angles()}.')

  if time_based:
    raise NotImplementedError
  else:
    prev_mse = curr_mse = 0
    thresholded_hist = deque([1] * 3, maxlen=3)
    thresholded_hist_lp = deque([1] * 2, maxlen=2)
    while counter < count_to:
      if pose_estimator.keypoints_available:
        curr_mse = joint_angle_estimator.mse(pose_estimator.get_keypoints())
        curr_mse = curr_mse if curr_mse is not None else prev_mse
        prev_mse = curr_mse
        thresholded = int(curr_mse < mse_threshold)
        thresholded_hist.append(thresholded)
        if thresholded == 0:  # Filtering step: accept a 0 only after 3 consecutive raw 0s.
          thresholded = 0 if sum(thresholded_hist) == 0 else 1
        thresholded_hist_lp.append(thresholded)
        rising_edge = thresholded_hist_lp[-2] == 0 and thresholded_hist_lp[-1] == 1
        if rising_edge:  # Pose returned to the initial position: one full rep.
          counter += 1
          tts.say(f'{counter}.')
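
A self-contained sketch of the same debounce + rising-edge counting, run on a synthetic MSE stream (the threshold and values are invented for illustration):

    from collections import deque

    mse_stream = [5, 120, 130, 125, 140, 8, 6, 150, 4]  # hypothetical MSE values
    mse_threshold = 100
    counter = 0
    hist = deque([1] * 3, maxlen=3)     # raw thresholded history (debounce window)
    hist_lp = deque([1] * 2, maxlen=2)  # filtered history (edge detector)
    for mse in mse_stream:
        thresholded = int(mse < mse_threshold)
        hist.append(thresholded)
        if thresholded == 0:
            # a 0 passes the filter only after three consecutive raw 0s
            thresholded = 0 if sum(hist) == 0 else 1
        hist_lp.append(thresholded)
        if hist_lp[-2] == 0 and hist_lp[-1] == 1:
            counter += 1  # rising edge: pose returned to the initial position
    print(counter)  # -> 1 (one full rep in this stream)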
Example #5
    def boxplot_pancan(self, mode, fignum=1, title_prefix=''):
        """Create boxplot related to the MSI factor or Tissue factor

        :param mode: either set to **msi** or **tissue**

        """
        assert mode in ['tissue', 'msi', "media"]

        results = self._get_boxplot_data(mode)
        if results is None:
            logger.warning(
                "No tissue with at least 2 pos and 2 neg found (no image created)."
            )
            return

        fig = pylab.figure(fignum)
        oldsize = fig.get_size_inches()

        pylab.clf()  # or close ?
        data, names, significance = results
        N = len(names)
        if N <= 2:  # msi or 2 tissues
            fontsize = self.fontsize
        elif N <= 14:
            fontsize = max(
                4, int(self.fontsize - (N - 2.) / (self.fontsize - 4.)))
        else:
            fontsize = max(4, int(self.fontsize / 1.4))

        bb = boxswarm.BoxSwarm(data, names, fontsize=fontsize)

        bb.xlabel = r'%s log(IC50)' % self.drug
        if mode == 'tissue':
            bb.title = 'FEATURE/Cancer-type interactions'
        elif mode == 'msi':
            bb.title = 'FEATURE/MS-instability interactions'
        elif mode == "media":
            bb.title = 'FEATURE/Media interactions'
        ax = bb.plot(vert=False)
        # get info from left axis
        common_ylim = ax.get_ylim()
        common_ticks = ax.get_yticks()

        self.ax = ax.twinx()
        self.ax.set_ylim(common_ylim)
        self.ax.set_yticks(common_ticks)
        self.ax.set_yticklabels([str(len(this)) + " " for this in data],
                                fontsize=fontsize / 1.4)
        try:
            pylab.tight_layout()
        except Exception:
            pass

        if self.savefig is True:
            filename = self.directory + os.sep
            filename += 'ODOF_{}_DRUG_{}____{}'.format(mode, self.drug,
                                                       self.feature)
            fig.set_size_inches(14, 16)
            pylab.savefig(filename + '.png', bbox_inches='tight')
            fig.set_size_inches(oldsize)
            fig.canvas.draw()
Example #6
.. code-block:: python

  failed_testcases_list = swilog.get_error_list()
  if failed_testcases_list != []:
      assert 0, "Some tests failed:/\n%s"% "/\n".join(failed_testcases_list)

"""
import sys

try:
    import logging
    import colorlog
except ImportError:
    import logging as colorlog

    colorlog.warning("install colorlog to have colors: pip install colorlog")

__copyright__ = "Copyright (C) Sierra Wireless Inc."

DELIMITER = "===================="

LOG_FORMAT = "\n%(log_color)s%(asctime)s %(levelname)s %(message)s"
DATE_FORMAT = "%H:%M:%S"
default_log_colors = {
    "NOTSET": "white",
    "DEBUG": "blue",
    "INFO": "green",
    "STEP": "bold_blank",
    "WARNING": "bold_yellow",
    "ERROR": "bold_red",
    "CRITICAL": "bold_red",
    def copy_decode(self, mixed_inputs, encoder_outputs, decoder_outputs, attention_bias, training):
        """ Generate softmax values of logits in the target sequence.

        Args: Same as the decode function's arguments.
            - mixed_inputs: input values of context and chosen knowledge. Int tensor with shape
              [batch_size, mixed_input_length]
            - encoder_outputs: continuous representation of input sequence. Float tensor
              with shape [batch_size, sentence_max_length, word_embed_size]
            - decoder_outputs: continuous representation of output sequence. Float tensor
              with shape [batch_size, target_length - 1, word_embed_size]
            - attention_bias: float tensor with shape [batch_size, 1, 1, sentence_max_length]
            - training: boolean, whether in training mode or not.
        Returns:
            float32 tensor with shape [batch_size, target_length, vocab_size]
        """
        with tf.name_scope("copy_decode"):
            colorlog.warning("Use pointer-generator mechanism. \
                             Note that output is not logit but softmax.")
            if training:
                batch_size = tf.shape(mixed_inputs)[0]
                # batch_size = self.hparams.batch_size
            else:
                batch_size = tf.shape(mixed_inputs)[0] * self.hparams.beam_size
                # batch_size = self.hparams.batch_size * self.hparams.beam_size

            w_q = self._copy_q_layer
            w_k = self._copy_k_layer
            w_v = self._copy_v_layer

            q = w_q(decoder_outputs)
            k = w_k(encoder_outputs)
            v = w_v(encoder_outputs)

            # Code for multi-head attention; not strictly necessary.

            q = self.decoder_stack.layers[-1][1].layer.split_heads(q)
            k = self.decoder_stack.layers[-1][1].layer.split_heads(k)
            v = self.decoder_stack.layers[-1][1].layer.split_heads(v)

            depth = (self.hparams.word_embed_size // self.hparams.num_heads)
            q *= depth ** -0.5

            a_t = tf.matmul(q, k, transpose_b=True)
            a_t += attention_bias
            # [batch_size, num_heads, target_length - 1, mixed_input_length]
            p_att = _float32_softmax(a_t, name="p_copy")
            if training:
                p_att = tf.nn.dropout(p_att,
                                      noise_shape=[tf.shape(p_att)[0], tf.shape(p_att)[1], 1, 1],
                                      rate=self.hparams.attention_dropout)

            # [batch_size, num_heads, target_length - 1, depth]
            hidden = tf.matmul(p_att, v)
            # Keep only the first attention head as the copy distribution:
            # [batch_size, target_length - 1, mixed_input_length]
            p_att = p_att[:, 0]
            # [batch_size, target_length - 1, word_embed_size]
            hidden = self.decoder_stack.layers[-1][1].layer.combine_heads(hidden)
            hidden = self.decoder_stack.layers[-1][1].layer.output_dense_layer(hidden)
            # feed forward network
            hidden = self.decoder_stack.layers[-1][2](hidden, training=training)
            hidden = self.decoder_stack.output_normalization(hidden)
            # [batch_size, target_length - 1, vocab_size]
            p_vocab = _float32_softmax(self._output_embedding(decoder_outputs, mode="linear"))
            # p_vocab = _float32_softmax(self._embedding(decoder_outputs, mode="linear"))

            # Scatter p_att (over input positions) into the vocabulary axis
            # so its shape matches p_vocab.
            initial_indices = tf.tile(mixed_inputs[:, tf.newaxis, :], [1, tf.shape(p_vocab)[1], 1])
            i1, i2 = tf.meshgrid(tf.range(batch_size),
                                 tf.range(tf.shape(p_vocab)[1]), indexing="ij")
            i1 = tf.tile(i1[:, :, tf.newaxis], [1, 1, tf.shape(p_att)[2]])
            i2 = tf.tile(i2[:, :, tf.newaxis], [1, 1, tf.shape(p_att)[2]])
            # [batch_size, target_length - 1, mixed_input_length, 3]
            indices = tf.stack([i1, i2, initial_indices], axis=-1)
            # [batch_size, target_length - 1, vocab_size]
            p_att = tf.scatter_nd(indices, p_att, shape=tf.shape(p_vocab))

            p_gen = self._copy_layer(hidden)
            # [batch_size, target_length - 1, vocab_size]
            p_gen = tf.tile(p_gen, [1, 1, self.hparams.vocab_size])
            # [batch_size, target_length - 1, vocab_size]
            p_word = (1 - p_gen) * p_vocab + p_gen * p_att

            return p_word
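
To make the scatter step concrete, a toy illustration with hypothetical sizes (note that tf.scatter_nd sums the updates for duplicate indices, which is exactly what a copy distribution needs when a token appears several times in the input):

    import tensorflow as tf

    vocab_size = 6
    token_ids = tf.constant([[[2, 4, 4]]])    # [batch=1, target_len=1, input_len=3]
    p_att = tf.constant([[[0.5, 0.3, 0.2]]])  # attention over the 3 input tokens
    b, t = tf.meshgrid(tf.range(1), tf.range(1), indexing="ij")
    b = tf.tile(b[:, :, tf.newaxis], [1, 1, 3])  # batch index per position
    t = tf.tile(t[:, :, tf.newaxis], [1, 1, 3])  # target-step index per position
    indices = tf.stack([b, t, token_ids], axis=-1)
    p_copy = tf.scatter_nd(indices, p_att, shape=[1, 1, vocab_size])
    print(p_copy.numpy())  # [[[0. 0. 0.5 0. 0.5 0.]]] -- token 4 gets 0.3 + 0.2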
Example #8
    def __init__(self, filename=None, empty_tissue_name="UNDEFINED"):
        """.. rubric:: Constructor

        If no file is provided, using the default file provided in the
        package that is made of 1001 cell lines times 680 features.

        :param str empty_tissue_name: if a tissue name is left empty, replace
            it with this string.

        """
        # first reset the filename to the shared data (if not provided)
        if filename is None:
            from gdsctools.datasets import genomic_features
            filename = genomic_features
        # used in the header, so it should be set before the call to super()

        super(GenomicFeatures, self).__init__(filename)

        # FIXME Remove columns related to Drug if any. Can be removed in
        # the future
        self.df = self.df[[
            x for x in self.df.columns if not x.startswith('Drug_')
        ]]

        for this in ['Sample Name', 'SAMPLE_NAME', 'Sample_Name', 'CELL_LINE']:
            if this in self.df.columns:
                self.df.drop(this, axis=1, inplace=True)

        # Rename deprecated column names (e.g. "COSMIC ID" -> "COSMIC_ID") if needed
        for old, new in {
                'Tissue Factor Value': 'TISSUE_FACTOR',
                'MS-instability Factor Value': 'MSI_FACTOR',
                'COSMIC ID': 'COSMIC_ID'
        }.items():
            if old in self.df.columns:
                colorlog.warning(
                    "'%s' column name is deprecated since 0.9.10. "
                    "Please replace with '%s'" % (old, new))
                self.df.columns = [
                    x.replace(old, new) for x in self.df.columns
                ]
        if "CL" in self.df.columns and "COSMID_ID" not in self.df.columns:
            self.df.columns = [
                x.replace("CL", "COSMIC_ID") for x in self.df.columns
            ]

        # There are 3 special columns to hold the factors
        self._special_names = []

        # If tissue factor is not provided, we create and fill it with dummies.
        # Otherwise, we would need to change a lot in the original ANOVA code.
        if self.colnames.tissue not in self.df.columns:
            colorlog.warning(
                "column named '%s' not found" % self.colnames.tissue)
            self.df[self.colnames.tissue] = ['UNDEFINED'] * len(self.df)
            self._special_names.append(self.colnames.tissue)
        else:
            self._special_names.append(self.colnames.tissue)

        self.found_msi = self.colnames.msi in self.df.columns
        if self.found_msi is False:
            colorlog.warning("column named '%s' not found" % self.colnames.msi)
        else:
            self._special_names.append(self.colnames.msi)

        self.found_media = self.colnames.media in self.df.columns
        if self.found_media:
            self._special_names.append(self.colnames.media)

        # order columns and index
        self._order()

        #
        self._interpret_cosmic()

        #
        self.check()

        self._fix_empty_tissues(empty_tissue_name)
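
A side note on the renaming strategy above: rewriting every column with str.replace can rename substrings by accident (e.g. "CL" inside a longer column name), whereas pandas' DataFrame.rename matches whole labels only. A small sketch with illustrative column names:

    import pandas as pd

    df = pd.DataFrame(columns=["COSMIC ID", "Tissue Factor Value", "CLASS"])
    df = df.rename(columns={"COSMIC ID": "COSMIC_ID",
                            "Tissue Factor Value": "TISSUE_FACTOR",
                            "CL": "COSMIC_ID"})  # "CLASS" is left untouched
    print(list(df.columns))  # ['COSMIC_ID', 'TISSUE_FACTOR', 'CLASS']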
Example #9
    def _interpret(self):
        # If there is at least one column that starts with "Drug_", all other
        # columns are dropped except the COSMIC ID column.

        # For backward compatibility with data that mixes drug identifiers and
        # genomic features:
        _cols = [str(x) for x in self.df.columns]
        drug_prefix = None
        for this in _cols:
            if this.startswith("Drug_"):
                drug_prefix = "Drug"
                break

        _cols = [str(x) for x in self.df.columns]
        if "COSMIC ID" in _cols and self.cosmic_name not in _cols:
            colorlog.warning(
                "'COSMIC ID' column name is deprecated since " +
                "0.9.10. Please replace with 'COSMIC_ID'", DeprecationWarning)
            self.df.columns = [
                x.replace("COSMIC ID", "COSMIC_ID") for x in self.df.columns
            ]
        if "CL" in _cols and "COSMID_ID" not in self.df.columns:
            colorlog.warning(
                "'CL column name is deprecated since " +
                "0.9.10. Please replace with 'COSMIC_ID'", DeprecationWarning)
            self.df.columns = [
                x.replace("CL", "COSMIC_ID") for x in self.df.columns
            ]

        # If the data has not been interpreted yet, the COSMIC column should be
        # found among the columns and set as the index.
        _cols = [str(x) for x in self.df.columns]
        if self.cosmic_name in self.df.columns:
            self.df.set_index(self.cosmic_name, inplace=True)
            _cols = [str(x) for x in self.df.columns]
            if drug_prefix:
                columns = [x for x in _cols if x.startswith(drug_prefix)]
                self.df = self.df[columns]

        # If already interpreted, the COSMIC name should already be the index
        # and entries should be integers, so let us cast to integer.
        elif self.df.index.name == self.cosmic_name:
            _cols = [str(x) for x in self.df.columns]
            if drug_prefix:
                columns = [x for x in _cols if x.startswith(drug_prefix)]
                assert len(columns) == len(set(columns))
                self.df = self.df[columns]
        # Otherwise, raise an error
        else:
            raise ValueError(
                "{0} column could not be found in the header".format(
                    self.cosmic_name))

        # In v18, the drug ids may be duplicated
        if self._v18 is True:
            return

        self.df.columns = [drug_name_to_int(x) for x in self.df.columns]
        self.df.columns = self.df.columns.astype(int)

        self.df.index = [int(x) for x in self.df.index]
        self.df.index = self.df.index.astype(int)
        self.df.index.name = "COSMIC_ID"

        # Check uniqueness
        self._check_uniqueness(self.df.index)