Python IntentParserTableFactory 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: intent_parser.table.intent_parser_table_factory

hotexamples.com에서의 예제들: 3

Python IntentParserTableFactory - 3개의 예제가 발견되었습니다. 아래는 오픈소스 프로젝트에서 추출한 Python intent_parser.table.intent_parser_table_factory.IntentParserTableFactory의 실제 사용 예제 중 가장 높은 평가를 받은 것들입니다. 예제를 평가하면 예제의 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

IntentParserTableFactory(7)

from_google_doc(6)

get_caption_row_index(1)

get_header_row_index(1)

get_table_type(1)

예제 #1

파일 보기

파일: test_table_parser.py 프로젝트: SD2E/experimental-intent-parser

class TableParserTest(unittest.TestCase):
    """
    Checks that cell text is returned verbatim after a Google Docs table
    is converted with IntentParserTableFactory.from_google_doc.
    """

    def setUp(self):
        self.ip_table_factory = IntentParserTableFactory()

    def tearDown(self):
        pass

    @staticmethod
    def _row_with_text(text):
        """Build a one-cell table row whose single text run holds `text`."""
        text_run = {'textRun': {'content': text}}
        paragraph = {'paragraph': {'elements': [text_run]}}
        return {'tableCells': [{'content': [paragraph]}]}

    def _assert_cells_unchanged(self, header, value):
        """
        Convert a two-row, one-column table and assert that both cells
        report exactly the text that was put in.
        """
        table_rows = [self._row_with_text(header), self._row_with_text(value)]
        ip_table = self.ip_table_factory.from_google_doc({'table': {'tableRows': table_rows},
                                                          'startIndex': 0,
                                                          'endIndex': 100})

        self.assertEqual(header, ip_table.get_cell(0, 0).get_text())
        self.assertEqual(value, ip_table.get_cell(1, 0).get_text())

    def test_table_with_whitespace(self):
        self._assert_cells_unchanged('notes', 'a note')

    def test_table_with_trailing_whitespace(self):
        # Leading, inner and trailing spaces are all expected to be kept.
        self._assert_cells_unchanged('notes', '  a  note  ')

    def test_table_with_commas(self):
        self._assert_cells_unchanged('strains', 'AND_00, AND_01, AND_10')

    def test_table_with_commas_and_newline(self):
        self._assert_cells_unchanged('strains', 'AND_00, AND_01, AND_10\n')

    def test_table_with_newline_before_commas(self):
        self._assert_cells_unchanged('strains', 'AND_00, \nAND_01,\n AND_10\n')

예제 #2

파일 보기

파일: test_table_parser.py 프로젝트: SD2E/experimental-intent-parser

 def setUp(self):
     """Create the IntentParserTableFactory instance stored on the test case."""
     self.ip_table_factory = IntentParserTableFactory()

예제 #3

파일 보기

파일: intent_parser.py 프로젝트: SD2E/experimental-intent-parser-mw

class IntentParser(object):
    """
    Processes information from a lab experiment to:
        - link information to/from a SynBioHub data repository
        - generate and validate a structure request
    """
    
    # Used for inserting experiment result data
    # Since the experiment result data is uploaded with the requesting document id
    # and the test documents are copies of those, the ids won't match
    # In order to test this, if we receive a document Id in the key of this map, we will instead query for the value
    _test_doc_id_map = {'1xMqOx9zZ7h2BIxSdWp2Vwi672iZ30N_2oPs8rwGUoT' : '10HqgtfVCtYhk3kxIvQcwljIUonSNlSiLBC8UFmlwm1s',
                       '1RenmUdhsXMgk4OUWReI2oS6iF5R5rfWU5t7vJ0NZOHw': '1g0SjxU2Y5aOhUbM63r8lqV50vnwzFDpJg4eLXNllut4',
                       '1_I4pxB26zOLb209Xlv8QDJuxiPWGDafrejRDKvZtEl8': '1K5IzBAIkXqJ7iPF4OZYJR7xgSts1PUtWWM2F0DKhct0',
                       '1zf9l0K4rj7I08ZRpxV2ZY54RMMQc15Rlg7ULviJ7SBQ': '1uXqsmRLeVYkYJHqgdaecmN_sQZ2Tj4Ck1SZKcp55yEQ' }

    logger = logging.getLogger('intent_parser')
    
    def __init__(self, lab_experiment, datacatalog_config, sbh_instance, sbol_dictionary):
        """
        Set up a parser for one lab experiment document.

        lab_experiment: the document being processed; its tables, paragraphs,
            title, links and ids are read by the other methods.
        datacatalog_config: configuration mapping; ['mongodb']['authn'] is
            consulted when a request is generated.
        sbh_instance: SynBioHub accessor, stored as self.sbh.
        sbol_dictionary: source of the Strateos mappings used for parameter tables.
        """
        # Collaborators supplied by the caller.
        self.lab_experiment = lab_experiment
        self.datacatalog_config = datacatalog_config
        self.sbh = sbh_instance
        self.sbol_dictionary = sbol_dictionary

        # Helpers created by the parser itself.
        self.catalog_accessor = CatalogAccessor()
        self.ip_table_factory = IntentParserTableFactory()

        # Outputs filled in by process().
        self.request = {}
        self.validation_errors = []
        self.validation_warnings = []
       
    def process(self):
        self._generate_request()
        self._validate_schema()
    
    def calculate_samples(self):
        doc_tables = self.lab_experiment.tables()
        
        table_ids = []
        sample_indices = []
        samples_values = []
        for tIdx in range(len(doc_tables)):
            table = doc_tables[tIdx]

            is_new_measurement_table = table_utils.detect_new_measurement_table(table)
            if not is_new_measurement_table:
                continue

            rows = table['tableRows']
            headerRow = rows[0]
            samples_col = -1
            for cell_idx in range(len(headerRow['tableCells'])):
                cellTxt = intent_parser_utils.get_paragraph_text(headerRow['tableCells'][cell_idx]['content'][0]['paragraph']).strip()
                if cellTxt == intent_parser_constants.COL_HEADER_SAMPLES:
                    samples_col = cell_idx

            samples = []
            numCols = len(headerRow['tableCells'])

            # Scrape data for each row
            for row in rows[1:]:
                comp_count = []
                is_type_col = False
                colIdx = 0
                # Process reagents
                while colIdx < numCols and not is_type_col:
                    paragraph_element = headerRow['tableCells'][colIdx]['content'][0]['paragraph']
                    headerTxt =  intent_parser_utils.get_paragraph_text(paragraph_element).strip()
                    if headerTxt == intent_parser_constants.COL_HEADER_MEASUREMENT_TYPE:
                        is_type_col = True
                    else:
                        cellContent = row['tableCells'][colIdx]['content']
                        cellTxt = ' '.join([intent_parser_utils.get_paragraph_text(c['paragraph']).strip() for c in cellContent]).strip()
                        comp_count.append(len(cellTxt.split(sep=',')))
                    colIdx += 1

                # Process the rest of the columns
                while colIdx < numCols:
                    paragraph_element = headerRow['tableCells'][colIdx]['content'][0]['paragraph']
                    headerTxt =  intent_parser_utils.get_paragraph_text(paragraph_element).strip()
                    # Certain columns don't contain info about samples
                    if headerTxt == intent_parser_constants.COL_HEADER_MEASUREMENT_TYPE or headerTxt == intent_parser_constants.COL_HEADER_NOTES or headerTxt == intent_parser_constants.COL_HEADER_SAMPLES:
                        colIdx += 1
                        continue

                    cellContent = row['tableCells'][colIdx]['content']
                    cellTxt = ' '.join([intent_parser_utils.get_paragraph_text(c['paragraph']).strip() for c in cellContent]).strip()

                    if headerTxt == intent_parser_constants.COL_HEADER_REPLICATE:
                        comp_count.append(int(cellTxt))
                    else:
                        comp_count.append(len(cellTxt.split(sep=',')))
                    colIdx += 1
                samples.append(int(np.prod(comp_count)))

            table_ids.append(tIdx)
            sample_indices.append(samples_col)
            samples_values.append(samples)

        samples = {}
        samples['action'] = 'calculateSamples'
        samples['tableIds'] = table_ids
        samples['sampleIndices'] = sample_indices
        samples['sampleValues'] = samples_values
        return samples

     
    def generate_report(self):
        links_info = self.lab_experiment.links_info() 
        mapped_names = []
        term_map = {}
        for link_info in links_info:
            try:
                term = link_info[0].strip()
                url = link_info[1]['url']
                if len(term) == 0:
                    continue

                if term in term_map:
                    if term_map[term] == url:
                        continue

                url_host = url.split('/')[2]
                if url_host not in self.sbh.get_sbh_link_host():
                    continue

                term_map[term] = url
                mapped_name = {}
                mapped_name['label'] = term
                mapped_name['sbh_url'] = url
                mapped_names.append(mapped_name)
            except:
                continue

        report = {}
        report['challenge_problem_id'] = 'undefined'
        report['experiment_reference_url'] = 'https://docs.google.com/document/d/' + self.lab_experiment.document_id()
        report['labs'] = []
        report['mapped_names'] = mapped_names
        return report
    
    def generate_displayId_from_selection(self, start_paragraph, start_offset, end_offset):
        """
        Extract the selected text of one paragraph and derive a display id from it.

        start_paragraph indexes into lab_experiment.paragraphs(); the selection
        runs from start_offset to end_offset, both inclusive.

        Returns a tuple of (selected text without surrounding whitespace,
        result of sbh.sanitize_name_to_display_id for that text).
        """
        selected_paragraph = self.lab_experiment.paragraphs()[start_paragraph]
        full_text = intent_parser_utils.get_paragraph_text(selected_paragraph)
        # end_offset is inclusive, hence the + 1 on the slice bound.
        selection = full_text[start_offset:end_offset + 1].strip()
        display_id = self.sbh.sanitize_name_to_display_id(selection)
        return selection, display_id
      
    def get_structured_request(self):
        """
        Return the structured request dict (the stored object itself, not a copy).
        """
        return self.request
    
    def get_validation_errors(self):
        """
        Return the list of validation errors (the stored list itself, not a copy).
        """
        return self.validation_errors
    
    def get_validation_warnings(self):
        """
        Return the list of validation warnings (the stored list itself, not a copy).
        """
        return self.validation_warnings
    
    def update_experimental_results(self):
        # For test documents, replace doc id with corresponding production doc
        if self.lab_experiment.document_id() in self._test_doc_id_map:
            source_doc_uri = 'https://docs.google.com/document/d/' + self._test_doc_id_map[self.lab_experiment.document_id()]
        else:
            source_doc_uri = 'https://docs.google.com/document/d/' + self.lab_experiment.document_id()

        # Search SBH to get data
        target_collection = '%s/user/%s/experiment_test/experiment_test_collection/1' % (self.sbh.get_sbh_url(), self.sbh.get_sbh_collection_user())
        exp_collection = self.sbh.query_experiments(self.sbh, target_collection)
        data = {}
        for exp in exp_collection:
            exp_uri = exp['uri']
            timestamp = exp['timestamp']
            title = exp['title']
            request_doc = self.sbh.query_experiment_request(exp_uri)
            if source_doc_uri == request_doc:
                source_uri = self.sbh.query_experiment_source(exp_uri)  # Get the reference to the source document with lab data
                data[exp_uri] = {'timestamp' : timestamp, 'agave' : source_uri[0], 'title' : title}

        exp_data = []
        exp_links = []
        for exp in data:
            exp_data.append((data[exp]['title'], ' updated on ', data[exp]['timestamp'], ', ', 'Agave link', '\n'))
            exp_links.append((exp, '', '', '',  data[exp]['agave'], ''))

        if exp_data == '':
            exp_data = ['No currently run experiments.']

        paragraphs = self.lab_experiment.paragraphs()

        headerIdx = -1
        contentIdx = -1
        for pIdx in range(len(paragraphs)):
            para_text = intent_parser_utils.get_paragraph_text(paragraphs[pIdx])
            if para_text == "Experiment Results\n":
                headerIdx = pIdx
            elif headerIdx >= 0 and not para_text == '\n':
                contentIdx = pIdx
                break

        if headerIdx >= 0 and contentIdx == -1:
            self.logger.error('ERROR: Couldn\'t find a content paragraph index for experiment results!')

        experimental_result = {}
        experimental_result['action'] = 'updateExperimentResults'
        experimental_result['headerIdx'] = headerIdx
        experimental_result['contentIdx'] = contentIdx
        experimental_result['expData'] = exp_data
        experimental_result['expLinks'] = exp_links
        return experimental_result
   
    def get_challenge_problem_id(self, text):
        """
        Find the closest matching measurement type to the given type, and return that as a string
        """
        # challenge problem ids have underscores, so replace spaces with underscores to make the inputs match better
        text = text.replace(' ', '_')
        best_match_type = None
        best_match_size = 0
        for cid in self.catalog_accessor.get_challenge_problem_ids():
            matches = intent_parser_utils.find_common_substrings(text.lower(), cid.lower(), 1, 0)
            for m in matches:
                if m.size > best_match_size and m.size > int(0.25 * len(cid)):
                    best_match_type = cid
                    best_match_size = m.size
        return best_match_type
     
    def _generate_request(self):
        """
        Generates a structured request for a given doc id
        """
        output_doc = { "experiment_reference_url" : "https://docs.google.com/document/d/%s" % self.lab_experiment.document_id() }
        if self.datacatalog_config['mongodb']['authn']:
            try:
                map_experiment_reference(self.datacatalog_config, output_doc)
            except:
                pass # We don't need to do anything, failure is handled later, but we don't want it to crash

        title = self.lab_experiment.title()[0]

        if 'challenge_problem' in output_doc and 'experiment_reference' in output_doc and 'experiment_reference_url' in output_doc:
            cp_id = output_doc['challenge_problem']
            experiment_reference = output_doc['experiment_reference']
            experiment_reference_url = output_doc['experiment_reference_url']
        else:
            self.logger.info('WARNING: Failed to map experiment reference for doc id %s!' % self.lab_experiment.document_id())
            titleToks = title.split(sep='-')
            if len(titleToks) > 1:
                experiment_reference = title.split(sep='-')[1].strip()
            else:
                experiment_reference = title
            experiment_reference_url = 'https://docs.google.com/document/d/' + self.lab_experiment.document_id()
            # This will return a parent list, which should have one or more Ids of parent directories
            # We want to navigate those and see if they are a close match to a challenge problem ID
            parent_list = self.lab_experiment.parents()
            cp_id = 'Unknown'
            if not parent_list['kind'] == 'drive#parentList':
                self.logger.info('ERROR: expected a drive#parent_list, received a %s' % parent_list['kind'])
            else:
                for parent_ref in parent_list['items']:
                    if not parent_ref['kind'] == 'drive#parentReference':
                        continue
                    parent_experiment = LabExperiment(parent_ref['id'])
                    parent_meta = parent_experiment.load_metadata_from_google_doc()
                    new_cp_id = self.get_challenge_problem_id(parent_meta['title'])
                    if new_cp_id is not None:
                        cp_id = new_cp_id

        control_tables, lab_tables, measurement_tables, parameter_tables = self._sort_tables()
        ref_controls = self._process_control_tables(control_tables)
        lab_content = self._process_lab_table(lab_tables)
        parameter = self._process_parameter_table(parameter_tables)
        measurements = self._process_measurement_table(measurement_tables, ref_controls)
        
        self.request['name'] = title
        self.request['experiment_id'] = lab_content['experiment_id']
        self.request['challenge_problem'] = cp_id
        self.request['experiment_reference'] = experiment_reference
        self.request['experiment_reference_url'] = experiment_reference_url
        self.request['experiment_version'] = 1
        self.request['lab'] = lab_content['lab']
        self.request['runs'] = measurements
        self.request['doc_revision_id'] = self.lab_experiment.head_revision()
        if parameter:
            self.request['parameters'] = parameter
    
    def _process_control_tables(self, control_tables):
        ref_controls = {}
        if not control_tables:
            return ref_controls
        
        for table in control_tables:
            controls_table = ControlsTable(table, 
                                           self.catalog_accessor.get_control_type(),
                                           self.catalog_accessor.get_fluid_units(),
                                           self.catalog_accessor.get_time_units()) 
            controls_data = controls_table.process_table()
            table_caption = controls_table.get_table_caption()
            if table_caption:
                ref_controls[table_caption] = controls_data
            self.validation_errors.extend(controls_table.get_validation_errors())
            self.validation_warnings.extend(controls_table.get_validation_warnings())
        return ref_controls
    
    def _process_lab_table(self, lab_tables):
        default_lab = 'tacc'
        lab_content = {'lab': default_lab, 
                       'experiment_id' : 'experiment.%s.TBD' % default_lab}
        if not lab_tables:
            message = ('There is no lab table specified in this experiment.')
            self.logger.warning(message)
        else:    
            if len(lab_tables) > 1: 
                message = ('There is more than one lab table specified in this experiment.' 
                           'Only the last lab table identified in the document will be used for generating a request.')
                self.logger.warning(message)
            table = lab_tables[-1]
            lab_table = LabTable(table)
            lab_content = lab_table.process_table()
            self.validation_errors.extend(lab_table.get_validation_errors())
            self.validation_warnings.extend(lab_table.get_validation_warnings())
        return lab_content 
    
    def _process_measurement_table(self, measurement_tables, ref_controls):
        measurements = []
        if not measurement_tables:
            return measurements 
        if len(measurement_tables) > 1: 
                message = ('There are more than one measurement table specified in this experiment.'
                       'Only the last measurement table identified in the document will be used for generating a request.')
                self.validation_warnings.extend(message)
        table = measurement_tables[-1]
        meas_table = MeasurementTable(table, 
                                      self.catalog_accessor.get_temperature_units(), 
                                      self.catalog_accessor.get_time_units(), 
                                      self.catalog_accessor.get_fluid_units(), 
                                      self.catalog_accessor.get_measurement_types(), 
                                      self.catalog_accessor.get_file_types())
        measurement_data = meas_table.process_table(control_tables=ref_controls, bookmarks=self.lab_experiment.bookmarks())
        measurements.append({ 'measurements' : measurement_data})
        self.validation_errors.extend(meas_table.get_validation_errors())
        self.validation_warnings.extend(meas_table.get_validation_warnings())
        return measurements
    
    def _process_parameter_table(self, parameter_tables):
        parameter_data = []
        if not parameter_tables:
            return parameter_data 
        
        if len(parameter_tables) > 1:
            message = ('There are more than one parameter table specified in this experiment.'
                       'Only the last parameter table identified in the document will be used for generating a request.')
            self.logger.warning(message)
        try:
            table = parameter_tables[-1]
            parameter_table = ParameterTable(table, self.sbol_dictionary.get_strateos_mappings())
            parameter = parameter_table.process_table()
            parameter_data.append(parameter)
            self.validation_errors.extend(parameter_table.get_validation_errors())
        except DictionaryMaintainerException as err:
            self.validation_errors.extend(err.get_message())
        return parameter_data 
                        
    def _sort_tables(self):
        list_of_tables = self.lab_experiment.tables()
        measurement_tables = []
        lab_tables = []
        parameter_tables = []
        control_tables = []
        for table in list_of_tables:
            ip_table = self.ip_table_factory.from_google_doc(table) 
            caption_index = self.ip_table_factory.get_caption_row_index(ip_table)
            header_index = self.ip_table_factory.get_header_row_index(ip_table)
            if caption_index is not None:
                ip_table.set_caption_row_index(caption_index)
            if header_index is not None:
                ip_table.set_header_row_index(header_index)
                    
            table_type = self.ip_table_factory.get_table_type(ip_table)
            if table_type == TableType.CONTROL:
                control_tables.append(ip_table)
            elif table_type == TableType.LAB:
                lab_tables.append(ip_table)
            elif table_type == TableType.MEASUREMENT:
                measurement_tables.append(ip_table)
            elif table_type == TableType.PARAMETER:
                parameter_tables.append(ip_table)
        return control_tables, lab_tables, measurement_tables, parameter_tables
    
    def _validate_schema(self):
        if self.request:
            try:
                schema = { '$ref' : 'https://schema.catalog.sd2e.org/schemas/structured_request.json' }
                validate(self.request, schema)
            except ValidationError as err:
                self.validation_errors.append(format(err).replace('\n', '&#13;&#10;'))