Пример #1
0
def _determine_unit(tokens, units, abbrev_units):
    """
    Resolve the unit named by the last token of a tokenized cell.

    Args:
        tokens: A sequence of tokens; only the last one is inspected.
        units: A mapping looked up by the lowercased unit name.
        abbrev_units: A mapping from abbreviated unit names to full unit
            names, looked up by the lowercased unit name.

    Returns:
        The entry stored in units under the resolved unit name.
    Raises:
        TableException if the last token is not a NAME token, or if the
        resolved unit name is not a key of units.
    """
    last_token = tokens[-1]
    if _get_token_type(last_token) != 'NAME':
        raise TableException('%s does not contain a unit' %
                             _get_token_value(last_token))

    unit_name = _get_token_value(last_token).lower()
    # Expand an abbreviation to its full, lowercased unit name.
    if unit_name in abbrev_units:
        unit_name = abbrev_units[unit_name].lower()

    if unit_name in units:
        return units[unit_name]
    raise TableException('%s is an invalid unit' % unit_name)
    def process_value_unit_without_validation(self, text):
        """Extract the first value and the unit found in text.

        The unit is returned exactly as self._get_values_unit reports it;
        it is not checked against any list of supported units.

        Returns:
            A (value, unit) pair holding the first parsed value and the unit.
        Raises:
            TableException if text is not a valued cell or yields no tokens.
        """
        tokens = self._cell_tokenizer.tokenize(
            text, keep_separator=False, keep_skip=False)
        if not self.is_valued_cell(text):
            raise TableException('%s does not contain a unit' % text)
        if len(tokens) == 0:
            raise TableException(
                'Invalid value: %s does not contain a numerical value' % text)

        parsed_values, parsed_unit = self._get_values_unit(tokens)
        first_value = parsed_values[0]
        return first_value, parsed_unit
    def parse_content_item(self,
                           text: str,
                           text_with_uri: Dict,
                           fluid_units: Tuple = {},
                           timepoint_units: Tuple = {}):
        """Parse the text of a content cell into a list of content objects.

        Args:
            text: the cell text to parse.
            text_with_uri: passed to self.create_name_with_uri together with
                each label to build the named link.
            fluid_units: forwarded to self.process_content_item_unit.
            timepoint_units: forwarded to self.process_content_item_unit.

        Returns:
            For a NAME_VALUE_UNIT cell, a list holding one ReagentIntent that
            carries the measured value. For any other cell type, a list with
            one NamedStringValue per label extracted from text.
        Raises:
            TableException if text yields no tokens.
        """
        tokens = self._cell_tokenizer.tokenize(text, keep_skip=False)
        if len(tokens) < 1:
            raise TableException('Invalid value: %s does not contain a name' %
                                 text)

        cell_type = self._get_token_type(self._cell_parser.parse(tokens))
        if cell_type != 'NAME_VALUE_UNIT':
            # NAME cells and unrecognized cell types are handled the same
            # way: every extracted label becomes a named string value.
            return [
                NamedStringValue(
                    self.create_name_with_uri(label, text_with_uri))
                for label in self.extract_name_value(text)
            ]

        label, value, unit = self._get_name_values_unit(tokens)
        named_link = self.create_name_with_uri(label, text_with_uri)
        resolved_unit = self.process_content_item_unit(unit, fluid_units,
                                                       timepoint_units)
        measured_unit = MeasuredUnit(float(value), resolved_unit)
        reagent = ReagentIntent(named_link)
        reagent.add_reagent_value(measured_unit)
        return [reagent]
 def _map_control_with_captions(self, control_tables):
     control_map = {}
     for table_caption, control_data in control_tables.items():
         if table_caption:
             control_map[table_caption] = control_data
     if not control_map:
         raise TableException('No reference to a Control table.')
     return control_map
Пример #5
0
    def process_values_unit(self, cell, units={}, unit_type=None):
        """
        Parse the content of a cell into a list of value/unit dictionaries.

        Args:
            cell: the cell to parse; its text is read through cell.get_text().
            units: the supported units. A parsed entry is only reported when
                its validated unit is a member of this collection. The
                default is never mutated by this method.
            unit_type: an optional key into self._abbreviated_unit_dict that
                selects which abbreviations may be expanded. Default to None,
                in which case no abbreviations are expanded.

        Return:
            a list of dictionaries, each holding a float 'value' and the
            validated 'unit'. The list is empty when the cell is neither a
            VALUES_UNIT nor a VALUE_UNIT_PAIRS cell.
        Raises:
            A TableException is thrown for a cell that is not a valued cell
            or that yields no tokens. Anything raised by
            self._determine_unit propagates to the caller.
        """
        result = []
        tokens = self._cell_tokenizer.tokenize(cell.get_text(),
                                               keep_space=False,
                                               keep_skip=False)
        if not self.is_valued_cell(cell):
            raise TableException('%s does not contain a unit' % cell)
        if len(tokens) < 1:
            raise TableException(
                'Invalid value: %s does not contain a numerical value' %
                cell.get_text())
        cell_type = self._get_token_type(self._cell_parser.parse(tokens))

        abbrev_units = self._abbreviated_unit_dict[
            unit_type] if unit_type is not None else {}
        if cell_type == 'VALUES_UNIT':
            # Several values share one trailing unit.
            values, unit = self._get_values_unit(tokens)
            validated_unit = self._determine_unit(unit, units, abbrev_units)
            if units and validated_unit in units:
                for value in values:
                    result.append({
                        'value': float(value),
                        'unit': validated_unit
                    })
        elif cell_type == 'VALUE_UNIT_PAIRS':
            # Every value carries its own unit.
            for value, unit in self._get_values_unit_pairs(
                    tokens, units, unit_type):
                validated_unit = self._determine_unit(unit, units,
                                                      abbrev_units)
                if units and validated_unit in units:
                    # Report the validated unit (not the raw one) so both
                    # cell types return units in the same form.
                    result.append({
                        'value': float(value),
                        'unit': validated_unit
                    })
        return result
Пример #6
0
 def parse_content_item(self, cell, fluid_units={}, timepoint_units={}):
     """Parse a content cell into a list of content dictionaries.

     Args:
         cell: the cell to parse; read through cell.get_text() and
             cell.get_text_with_url().
         fluid_units: forwarded to self.process_content_item_unit.
         timepoint_units: forwarded to self.process_content_item_unit and
             self.process_timepoint.

     Returns:
         A list of dictionaries. Each has a 'name'; NAME_VALUE_UNIT cells
         add 'value' and 'unit', and NAME_VALUE_UNIT_TIMEPOINT cells also
         add 'timepoints'. A NAME cell produces one dictionary per label.
     Raises:
         TableException if the cell yields no tokens or has any other
         cell type.
     """
     tokens = self._cell_tokenizer.tokenize(cell.get_text(),
                                            keep_skip=False)
     if len(tokens) < 1:
         raise TableException('Invalid value: %s does not contain a name' %
                              cell.get_text())
     cell_type = self._get_token_type(self._cell_parser.parse(tokens))

     if cell_type == 'NAME_VALUE_UNIT_TIMEPOINT':
         (label, value, unit, timepoint_value,
          timepoint_unit) = self._get_name_values_unit_timepoint(tokens)
         return [{
             'name': self.process_name_with_uri(
                 label, cell.get_text_with_url()),
             'value': value,
             'unit': self.process_content_item_unit(
                 unit, fluid_units, timepoint_units),
             'timepoints': self.process_timepoint(
                 timepoint_value, timepoint_unit, timepoint_units),
         }]

     if cell_type == 'NAME_VALUE_UNIT':
         label, value, unit = self._get_name_values_unit(tokens)
         return [{
             'name': self.process_name_with_uri(
                 label, cell.get_text_with_url()),
             'value': value,
             'unit': self.process_content_item_unit(
                 unit, fluid_units, timepoint_units),
         }]

     if cell_type == 'NAME':
         return [
             {'name': self.process_name_with_uri(
                 label, cell.get_text_with_url())}
             for label in table_utils.extract_name_value(cell.get_text())
         ]

     raise TableException('Unable to parse %s' % cell.get_text())
    def process_boolean_flag(self, text: str) -> List[bool]:
        """Convert the boolean tokens found in text into a list of bools.

        The text is lowercased before it is tokenized.

        Returns:
            One bool per token, in token order.
        Raises:
            TableException if any token is neither a BOOLEAN_TRUE nor a
            BOOLEAN_FALSE token.
        """
        tokens = self._cell_tokenizer.tokenize(
            text.lower(), keep_separator=False, keep_skip=False)

        flags = []
        for token in tokens:
            kind = self._get_token_type(token)
            if kind == 'BOOLEAN_FALSE':
                flags.append(False)
                continue
            if kind == 'BOOLEAN_TRUE':
                flags.append(True)
                continue
            raise TableException('%s is not a boolean value' % text)
        return flags
Пример #8
0
 def _parse_parameter_field_value(self, parameter_field, parameter_value):
     if parameter_field in self.FIELD_WITH_FLOAT_VALUE: 
         values = table_utils.extract_number_value(parameter_value)
         return parameter_field, [float(float_val) for float_val in values]
     elif parameter_field in self.FIELD_WITH_BOOLEAN_VALUE:
         parameter_value = parameter_value.lower().strip()
         if parameter_value == 'false':
             return parameter_field, [False]
         elif parameter_value == 'true':
             return parameter_field, [True]
         else:
             raise TableException('Parameter table has invalid %s value: %s should be a boolean value' % (parameter_field, parameter_value))
     elif parameter_field in self.FIELD_WITH_LIST_OF_STRING:
         return parameter_field, [parameter_value] 
     elif parameter_field in self.FIELD_WITH_NESTED_STRUCTURE:
         json_parameter_value = json.loads(parameter_value)
         return parameter_field, [json_parameter_value] 
     
     return parameter_field, table_utils.transform_strateos_string(parameter_value)
Пример #9
0
 def _process_row(self, row_index):
     """Read the parameter field and value from one table row.

     Each cell is classified by the header cell of its column: a PARAMETER
     column supplies the field and a PARAMETER_VALUE column supplies the
     raw value.

     Returns:
         (field, []) when the row has no value, otherwise the result of
         self._parse_parameter_field_value for the field and raw value.
     Raises:
         TableException if the row yields no parameter field.
     """
     table = self._intent_parser_table
     field_name = ''
     raw_value = ''
     for column in range(len(table.get_row(row_index))):
         cell = table.get_cell(row_index, column)
         # The column header decides how this cell is interpreted.
         header_cell = table.get_cell(table.header_row_index(), column)
         header_type = cell_parser.PARSER.get_header_type(header_cell)
         if header_type == 'PARAMETER':
             field_name = self._get_parameter_field(cell.get_text().strip())
         elif header_type == 'PARAMETER_VALUE':
             raw_value = cell.get_text()

     if not field_name:
         raise TableException('Parameter field should not be empty')
     if raw_value:
         return self._parse_parameter_field_value(field_name, raw_value)
     return field_name, []
Пример #10
0
    def process_reagent_or_media_header(self, text, text_with_uri, units,
                                        unit_type):
        """Parse a header into a name and an optional timepoint.

        Args:
            text: the header text.
            text_with_uri: passed to self.create_name_with_uri with the label.
            units: the supported units handed to self._determine_unit.
            unit_type: key into self._abbreviated_unit_dict, or None to use
                no abbreviations.

        Returns:
            (name, None) for a NAME header, or (name, TimepointIntent) for a
            NAME_SEPARATOR_VALUE_UNIT header.
        Raises:
            TableException for any other header type.
        """
        tokens = self._cell_tokenizer.tokenize(text, keep_skip=False)
        cell_type = self._get_token_type(self._cell_parser.parse(tokens))

        if cell_type == 'NAME':
            return self.create_name_with_uri(text.strip(), text_with_uri), None
        if cell_type != 'NAME_SEPARATOR_VALUE_UNIT':
            raise TableException('%s cannot be parsed as a reagent' % text)

        label, timepoint_value, timepoint_unit = self._get_name_timepoint(
            tokens)
        name = self.create_name_with_uri(label, text_with_uri)
        if unit_type is None:
            abbrev_units = {}
        else:
            abbrev_units = self._abbreviated_unit_dict[unit_type]
        unit = self._determine_unit(timepoint_unit, units, abbrev_units)
        return name, TimepointIntent(float(timepoint_value), unit)
Пример #11
0
def transform_cell(cell, units, cell_type=None):
    """
    Lazily parse the content of a cell into (value, unit) pairs.

    Args:
        cell: the content of a cell
        units: the units the cell may use; canonicalized before the unit is determined.
        cell_type: an optional key into _abbreviated_unit_dict selecting the abbreviations to use. Default to None.

    Return:
        Yield pairs of two variables.
        The first variable is a value token's content.
        The second variable is the unit determined for the whole cell.

    Raises:
        A TableException is thrown, on first iteration, for a cell that is not a valued cell.
    """
    tokens = _tokenize(cell)
    if not _is_valued_cells(tokens):
        raise TableException('%s does not contain a unit' % cell)

    # Separators and skipped tokens carry no value or unit.
    tokens = [
        token for token in tokens
        if _get_token_type(token) not in ['SEPARATOR', 'SKIP']
    ]
    if cell_type is None:
        abbrev_units = {}
    else:
        abbrev_units = _abbreviated_unit_dict[cell_type]
    unit = _determine_unit(tokens, _canonicalize_units(units), abbrev_units)

    position = 0
    last_position = len(tokens) - 1
    while position < last_position:
        value = tokens[position][1]
        # Step over a NAME token that directly follows the value.
        if _get_token_type(tokens[position + 1]) == 'NAME':
            position += 2
        else:
            position += 1
        yield value, unit

    # A single token left at the end is emitted as a value as well.
    if position == last_position:
        yield _get_token_value(tokens[position]), unit
Пример #12
0
    def _determine_unit(self, unit, units, abbrev_units):
        """
        Identify the unit assigned to an array of tokens.
        
        Args:
            unit: a unit
            units: A list of supported units.
            abbrev_units: A list of abbreviated units. 
            
        Returns:
            A unit
        Raises:
            TableException for invalid units
        """
        determined_unit = unit
        if unit in abbrev_units:
            determined_unit = abbrev_units[unit].lower()

        if determined_unit not in units:
            raise TableException('%s is an invalid unit' % unit)
        return determined_unit
Пример #13
0
 def process_numbers(self, text: str) -> List[str]:
     """
     Extract the number tokens of a string as a list of token values.
     Args:
         text: a string holding one number or a list of numbers
     Returns:
         A list with the value of every token for a NUMBER cell, or of every
         NUMBER token for a NUMBER_LIST cell
     Raises:
         TableException if the text is neither a NUMBER nor a NUMBER_LIST
     """
     tokens = self._cell_tokenizer.tokenize(
         text, keep_separator=True, keep_skip=False)
     cell_type = self._get_token_type(self._cell_parser.parse(tokens))

     if cell_type == 'NUMBER':
         selected = tokens
     elif cell_type == 'NUMBER_LIST':
         # Separators were kept by the tokenizer, so drop them here.
         selected = (token for token in tokens
                     if self._get_token_type(token) == 'NUMBER')
     else:
         raise TableException(
             '%s does not follow correct format to specify a number or a list of number'
             % text)
     return [self._get_token_value(token) for token in selected]
Пример #14
0
 def _get_parameter_field(self, cell_txt):
     if not self._parameter_fields:
         raise DictionaryMaintainerException('There are no parameters that could map to a Strateos protocol')
     if cell_txt not in self._parameter_fields:
         raise TableException('%s does not map to a Strateos UID' % cell_txt)
     return self._parameter_fields[cell_txt]
Пример #15
0
 def process_table(self):
     """Store the table caption and process every data row of the table.

     Rows from data_row_start_index() up to, but not including,
     number_of_rows() are each handed to self._process_row.

     Raises:
         TableException if the table has no caption.
     """
     table = self._intent_parser_table
     caption = table.caption()
     self._table_caption = caption
     if not caption:
         raise TableException('Control Table must have a caption but none was found.')

     first_data_row = table.data_row_start_index()
     row_count = table.number_of_rows()
     for row_index in range(first_data_row, row_count):
         self._process_row(row_index)