def parse(self, data_string):
    """Parse input data and return a dict of result data.

    Fans out one ParserThread per enabled module, joins them, collects
    their results, normalizes confidences, and sorts by confidence.

    :param data_string: the string we want to parse
    :type data_string: str
    :return: summary dict with query, date, timing, top match and all matches
    :rtype: dict
    """
    start_time = time.time()
    results = []
    threads = []

    # Creating/starting a thread for each parser module
    for module in self.config.enabled_modules:
        thread = ParserThread(self.config, module, data_string)
        thread.start()
        threads.append(thread)

    # Synchronizing/finishing parser threads
    for thr in threads:
        thr.join()

    # The threads are done, let's get the results out of them
    for thr in threads:
        results.extend(thr.results)

    # Unique list of all major types. NOTE: computed before normalization,
    # so it reflects every raw parser hit (original behavior preserved).
    types = list({result.type for result in results})

    if results:
        # Unique list of all types and subtypes for the normalizer chain
        all_types = {res.type for res in results} | {res.subtype for res in results}

        # Hierarchical Confidence Normalization
        normalizer_chain = HierarchicalNormalizerChain(
            self.config, types, list(all_types)
        )
        results = normalizer_chain.normalize(results)

        # Sorting our results by confidence value
        results = sorted(
            results, key=lambda result: result.confidence, reverse=True
        )

    return {
        'query': truncate_text(data_string),
        'date': datetime.datetime.now(),
        'execution_seconds': time.time() - start_time,
        'top': results[0] if results else None,
        'results': {
            'count': len(results),
            'types': types,
            'matches': results
        }
    }
def test_normalizer_normalizes(self):
    """A chain with NormalizerStub collapses two results into one ParseResult."""
    config = TestConfig()
    config.enabled_confidence_normalizers.append(NormalizerStub)

    input_results = [
        ParseResult('Test', 'Test', 100),
        ParseResult('Test', 'Test', 0),
    ]

    chain = HierarchicalNormalizerChain(config, [], [])
    normalized = chain.normalize(input_results)

    self.assertEqual(1, len(normalized))
    self.assertIsInstance(normalized[0], ParseResult)
def parse(self, data_string):
    """
    Parses input data and returns a dict of result data

    Runs every enabled parser module in its own thread, waits for all of
    them, merges their results, normalizes confidences hierarchically,
    and sorts the merged results by confidence (descending).

    :param data_string: the string we want to parse
    :type data_string: str
    :return: yields parse result data if there is any
    :rtype: dict
    """
    start_time = time.time()
    results = []
    threads = []

    # Creating/starting a thread for each parser module
    for module in self.config.enabled_modules:
        thread = ParserThread(self.config, module, data_string)
        thread.start()
        threads.append(thread)

    # Synchronizing/finishing parser threads
    for thr in threads:
        thr.join()

    # The threads are done, let's get the results out of them
    for thr in threads:
        results.extend(thr.results)

    # Unique list of all major types — taken before normalization so the
    # report covers every raw parser hit (original behavior preserved)
    types = list({result.type for result in results})

    if results:
        # Getting a unique list of result types and subtypes.
        all_types = {res.type for res in results} | {res.subtype for res in results}

        # Hierarchical Confidence Normalization
        normalizer_chain = HierarchicalNormalizerChain(
            self.config, types, list(all_types)
        )
        results = normalizer_chain.normalize(results)

        # Sorting our results by confidence value
        results = sorted(
            results, key=lambda result: result.confidence, reverse=True
        )

    return {
        'query': truncate_text(data_string),
        'date': datetime.datetime.now(),
        'execution_seconds': time.time() - start_time,
        'top': results[0] if results else None,
        'results': {
            'count': len(results),
            'types': types,
            'matches': results
        }
    }