def string_sentence_ruleapplication(sentence_segment, format='xml', idx=True, id=-1, filtering=None, timeout=None, **kwargs):
    """Deserialize a single-sentence segment and run rule application on it.

    The segment is parsed with `loads` (using *format*), its graphs are
    materialized via `read_graphs`, and the first (only) sentence is handed
    to `sentence_ruleapplication` together with *filtering*, *timeout* and
    any extra keyword arguments.

    Raises a generic Exception mentioning *id* when the XML cannot be parsed.
    """
    try:
        parsed = loads(sentence_segment, format=format)
    except xml.ParseError:
        raise Exception('Error parsing sentence XML for id %d' % id)
    read_graphs(parsed, idx=idx)
    first_sentence = parsed[0]
    return sentence_ruleapplication(
        first_sentence, filtering=filtering, timeout=timeout, **kwargs)
def dataset_translation_rulextraction(sentence_collection, idx=True, filtering=None, timeout=None, **kwargs):
    """Extract translation rules from every eligible sentence in the collection.

    Sentences rejected by *filtering*, or missing a source graph, target
    tokens or an sgtt alignment, are skipped.  When *timeout* is set, each
    per-sentence extraction runs under that limit; timed-out sentences are
    logged and skipped.  Returns the flat list of all extracted rules.
    """
    read_graphs(sentence_collection, idx=idx)
    rules = []
    for sentence in sentence_collection:
        # Choose index-based or plain-text target tokens.
        tok = sentence.target.tokenized_idx if idx else sentence.target.tokenized_text
        if filtering is not None and not filtering.filter(sentence):
            logging.info(
                'Skipping sentence with id %s (%s) due to filtering (source token number %d, graph size %d)'
                % (sentence.id, sentence.orig_id,
                   len(sentence.source.tokenized_text),
                   len(sentence.source.graph)))
            continue
        graph = sentence.source.graph
        alignment = sentence.alignment
        # Guard clause: extraction needs a graph, tokens and an sgtt alignment.
        if graph is None or tok is None or alignment is None or alignment.sgtt is None:
            continue
        logging.debug('Starting rule extraction for sentence %s' % sentence.id)
        # Each sgtt entry is a (plain, index) pair; only the index part feeds the dict.
        index_pairs = dict(index for plain, index in alignment.sgtt)
        alignment_dict = utility.create_alignment_dict(index_pairs, graph)
        tok = [unicode(token) for token in tok]
        try:
            if timeout:
                with to.timeout(seconds=timeout):
                    extracted = rulextraction(graph, tok, alignment_dict, **kwargs)
            else:
                extracted = rulextraction(graph, tok, alignment_dict, **kwargs)
            rules.extend(extracted)
            logging.info(
                'Extracted %d rules from sentence with id %s (%s)'
                % (len(extracted), sentence.id, sentence.orig_id))
        except to.TimeoutError:
            logging.warn(
                'Rule extraction for sentence with id %s (%s) failed due to timeout after %d seconds'
                % (sentence.id, sentence.orig_id, timeout))
            continue
    return rules
def dataset_ruleapplication(sentence_collection, idx=True, filtering=None, timeout=None, **kwargs):
    """Run rule application over a whole collection.

    After materializing graphs with `read_graphs`, each sentence is passed
    to `sentence_ruleapplication`, which yields a (sentence id, applied
    rules) pair.  Returns a dict mapping sentence id -> applied rules.
    """
    read_graphs(sentence_collection, idx=idx)
    # sentence_ruleapplication returns (sent_id, applied_rules) pairs,
    # which dict() consumes directly.
    return dict(
        sentence_ruleapplication(sentence, filtering=filtering, timeout=timeout, **kwargs)
        for sentence in sentence_collection)
def string_create_sentence_disc_rules(sentence_xml_string, _, disc_rule_id_offset=120000000, idx=True, filtering=None, format='xml'):
    """Deserialize a single-sentence string and build its disc rules.

    Parameters:
        sentence_xml_string: serialized sentence, parsed with `loads`.
        _: unused placeholder argument (kept for interface compatibility).
        disc_rule_id_offset: id offset forwarded to `create_sentence_disc_rules`.
        idx: forwarded to `read_graphs`.
        filtering: optional filter forwarded to `create_sentence_disc_rules`.
        format: serialization format understood by `loads`.

    Raises:
        Exception: if the input cannot be parsed.
    """
    try:
        sentence_collection = loads(sentence_xml_string, format=format)
    except xml.ParseError:
        # BUG FIX: the old message did "%d % id", but this function has no
        # `id` parameter, so `id` resolved to the builtin function and the
        # %d format raised a TypeError, masking the real parse error.
        raise Exception('Error parsing sentence XML')
    read_graphs(sentence_collection, idx=idx)
    sentence = sentence_collection[0]
    return create_sentence_disc_rules(sentence, disc_rule_id_offset, filtering)