def test_setting_spell_guess(self):
    """Verify spell_guess handling: True maps to the default of 7, an
    explicit integer is kept as-is, and False disables guessing (0).

    Skips when the library was built without a spell backend, in which
    case spell_guess is forced to 0 and the assertions below would fail
    spuriously (same guard as the sibling guarded variant of this test).
    """
    po = ParseOptions(spell_guess=True)
    if po.spell_guess == 0:
        # Library not configured with spell guess support.
        self.skipTest("Library is not configured with spell guess")
    self.assertEqual(po.spell_guess, 7)
    po = ParseOptions(spell_guess=5)
    self.assertEqual(po.spell_guess, 5)
    po = ParseOptions(spell_guess=False)
    self.assertEqual(po.spell_guess, 0)
def test_setting_verbosity(self):
    """Setting .verbosity must be visible both through the Python
    wrapper and through the underlying C parse-options object."""
    opts = ParseOptions()
    opts.verbosity = 2
    # Wrapper-level readback.
    self.assertEqual(opts.verbosity, 2)
    # Low-level readback straight from the C API.
    self.assertEqual(clg.parse_options_get_verbosity(opts._obj), 2)
def is_grammatical_sentence(sentence_text, language_dict, verbose=False):
    """Return True when the sentence parses with zero null-linked words.

    :param sentence_text: Sentence to check (converted to str).
    :param language_dict: An open LG Dictionary to parse against.
    :param verbose: When True, print a diagnostic on failure.
    :return: True if a full (null-free) linkage exists, False otherwise.
    """
    parse_options = ParseOptions(verbosity=0)
    parse_options.max_null_count = 999  # max number of words in single pass
    parse_options.linkage_limit = 100   # max number of linkages to generate
    parse_options.max_parse_time = 10   # in seconds

    sent = Sentence(str(sentence_text), language_dict, parse_options)
    try:
        linkages = sent.parse()
    except LG_TimerExhausted:
        if verbose:
            print('Sentence too complex for parsing in {} seconds.'.format(
                parse_options.max_parse_time))
        return False

    if not linkages:
        if verbose:
            print('Error occurred - sentence ignored.')
        # BUG FIX: previously this branch fell through to the null_count
        # check and could report an unparsed sentence as grammatical.
        return False

    # Grammatical iff the parse used every word (no null links).
    return sent.null_count() == 0
def Make_Random(sents):
    """ Make random parses (from LG-parser "any"), to use as baseline """
    any_dict = Dictionary('any')  # open the dictionary a single time
    po = ParseOptions(min_null_count=0, max_null_count=999)
    po.linkage_limit = 100

    options = BIT_STRIP | BIT_CAPS  # | BIT_ULL_IN

    random_parses = []
    for sent in sents:
        word_count = len(sent)
        parse_rows = []
        # Replace real words with placeholders; only the tree shape matters.
        fake_words = ["w{}".format(n) for n in range(1, word_count + 1)]
        # Restore a final dot to maintain --ignore functionality.
        if sent[-1] == ".":
            fake_words[-1] = "."
        sentence = Sentence(" ".join(fake_words), any_dict, po)
        linkages = sentence.parse()
        num_parses = len(linkages)  # number of linkages found
        if num_parses > 0:
            # Pick one linkage uniformly at random.
            idx = random.randint(0, num_parses - 1)
            linkage = Linkage(idx, sentence, po._obj)
            tokens, links = parse_postscript(
                linkage.postscript().replace("\n", ""), options)
            for link in links:
                left, right = link[0], link[1]
                parse_rows.append(
                    [str(left), tokens[left], str(right), tokens[right]])
        random_parses.append(parse_rows)
    return random_parses
def test_setting_all_short_connectors(self):
    """Toggle all_short_connectors both ways and verify the wrapper
    and the C-level option agree after each assignment."""
    opts = ParseOptions()
    for flag, raw in ((True, 1), (False, 0)):
        opts.all_short_connectors = flag
        self.assertEqual(opts.all_short_connectors, flag)
        self.assertEqual(
            clg.parse_options_get_all_short_connectors(opts._obj), raw)
def test_setting_islands_ok(self):
    """Toggle islands_ok both ways and verify the wrapper and the
    C-level option agree after each assignment."""
    opts = ParseOptions()
    for flag, raw in ((True, 1), (False, 0)):
        opts.islands_ok = flag
        self.assertEqual(opts.islands_ok, flag)
        self.assertEqual(clg.parse_options_get_islands_ok(opts._obj), raw)
def test_setting_display_morphology(self):
    """Toggle display_morphology both ways and verify the wrapper and
    the C-level option agree after each assignment."""
    opts = ParseOptions()
    for flag, raw in ((True, 1), (False, 0)):
        opts.display_morphology = flag
        self.assertEqual(opts.display_morphology, flag)
        self.assertEqual(
            clg.parse_options_get_display_morphology(opts._obj), raw)
def test_setting_spell_guess(self):
    """spell_guess semantics: True -> library default of 7, an explicit
    integer is kept, False -> 0. Skipped when the library was built
    without spell-guess support (spell_guess then reads back as 0)."""
    opts = ParseOptions(spell_guess=True)
    if opts.spell_guess == 0:
        raise unittest.SkipTest("Library is not configured with spell guess")
    self.assertEqual(opts.spell_guess, 7)
    self.assertEqual(ParseOptions(spell_guess=5).spell_guess, 5)
    self.assertEqual(ParseOptions(spell_guess=False).spell_guess, 0)
def make_random(sentences: Union[List[Tuple[str, set]], List[str]], options: int, **kwargs) \
        -> List[Tuple[str, set]]:
    """
    Make random parses (from LG-parser "any"), to use as baseline

    :param sentences: List of either tuples of sentence and set of links in case of .ull
                      input file format or strings in case of text input file format.
    :param options: Integer representing parse options bit masks.
    :return: List of parses (tuples of sentence and set of links)
    """
    any_dict = Dictionary('any')  # Opens dictionary only once
    po = ParseOptions(min_null_count=0, max_null_count=999)
    po.linkage_limit = int(kwargs.get("limit", 100))
    options |= BIT_STRIP
    options |= BIT_CAPS

    # Decide the input format from the first element.
    if isinstance(sentences[0], tuple):
        is_ull = True
    elif isinstance(sentences[0], str):
        is_ull = False
    else:
        raise ValueError(
            "The first argument should be either List[Tuple[str, set] or List[str]."
        )

    random_parses = []

    for sent in sentences:
        # For .ull input each element is (sentence, link_set); for text
        # input each element is the sentence string itself.
        sent_text = sent[0] if is_ull else sent
        words = tokenize_sentence(sent_text)
        num_words = len(words)

        # Substitute words with numbers, to avoid token-splitting by LG "any".
        # NOTE(review): range(1, num_words) yields num_words-1 fake words.
        # This is correct only if tokenize_sentence() includes a wall token
        # in `words`; otherwise the last word is dropped and the commented
        # line below is the fix — confirm against tokenize_sentence().
        fake_words = [f"w{x}" for x in range(1, num_words)]
        # fake_words = [f"w{x}" for x in range(1, num_words + 1)]
        sent_string = " ".join(fake_words)

        sentence = Sentence(sent_string, any_dict, po)
        linkages = sentence.parse()
        num_parses = len(linkages)  # check nbr of linkages in sentence

        links = []
        if num_parses > 0:
            idx = random.randint(0, num_parses - 1)  # choose a random linkage index
            linkage = Linkage(idx, sentence, po._obj)  # get the random linkage
            tokens, links = parse_postscript(
                linkage.postscript().replace("\n", ""), options)
            if num_words != len(tokens):
                # BUG FIX: previously logged sent[0] even for plain-text
                # input, i.e. only the first character of the sentence.
                logger.error(
                    f"Number of tokens mismatch:\n{words}\n{tokens}\nfor sentence:\n{sent_text}"
                )

        # BUG FIX: previously appended sent[0] unconditionally, which for
        # plain-text input stored the first character instead of the sentence.
        random_parses.append((sent_text, set(links)))

    return random_parses
def test_2_step_parsing_with_null_links(self):
    """Parse strictly (no null links) first, then reparse the same
    Sentence allowing nulls and check the relaxed results."""
    # Step 1: no nulls permitted — this sentence has no full linkage.
    self.po = ParseOptions(min_null_count=0, max_null_count=0)
    sent = Sentence('about people attended', self.d, self.po)
    self.assertEqual(len(sent.parse()), 0)
    # Step 2: permit null links and reparse with the new options.
    self.po = ParseOptions(min_null_count=1, max_null_count=999)
    linkages = sent.parse(self.po)
    self.assertEqual(len(linkages), 2)
    self.assertEqual(linkages.next().unused_word_cost(), 1)
def parse_text(dict_path, corpus_path, output_id=OUTPUT_DIAGRAM):
    """
    Link parser invocation routine

    :param dict_path: name or path to the dictionary
    :param corpus_path: path to the test text file
    :param output_id: numeric type of one of three possible output formats
    :return: None
    """
    def s(q):
        """ Helper routine: plural suffix ('' for one, 's' otherwise) """
        return '' if q == 1 else 's'

    def linkage_stat(psent, lang, lkgs, sent_po):
        """
        This function mimics the linkage status report style of link-parser
        """
        # Renamed from `random` to avoid shadowing the random module name.
        random_note = ' of {} random linkages'. \
            format(clg.sentence_num_linkages_post_processed(psent._obj)) \
            if clg.sentence_num_linkages_found(psent._obj) > sent_po.linkage_limit else ''

        print('`{}: Found {} linkage{} ({}{} had no P.P. violations)`'. \
              format(lang, clg.sentence_num_linkages_found(psent._obj),
                     s(clg.sentence_num_linkages_found(psent._obj)), len(lkgs), random_note))

    po = ParseOptions(min_null_count=0, max_null_count=999)
    # po.linkage_limit = 3

    # Renamed from `dict` to avoid shadowing the builtin; opened only once.
    lg_dict = Dictionary(dict_path)

    with open(corpus_path) as f:
        for line in f:
            print(line, end="")
            sent = Sentence(line, lg_dict, po)
            linkages = sent.parse()
            linkage_stat(sent, dict_path, linkages, po)

            if output_id == OUTPUT_POSTSCRIPT:
                for linkage in linkages:
                    print(linkage.postscript())
            elif output_id == OUTPUT_CONSTITUENT:
                for linkage in linkages:
                    print(linkage.constituent_tree())
            else:
                for linkage in linkages:
                    print(linkage.diagram())

    # Prevent interleaving "Dictionary close" messages
    po = ParseOptions(verbosity=0)
def Make_Random(sents, **kwargs):
    """ Make random parses (from LG-parser "any"), to use as baseline """
    output_path = kwargs.get("output_path", os.environ["PWD"])

    any_dict = Dictionary('any')  # open the dictionary a single time
    po = ParseOptions(min_null_count=0, max_null_count=999)
    po.linkage_limit = 100

    options = BIT_STRIP | BIT_CAPS  # | BIT_ULL_IN

    random_parses = []
    for sent in sents:
        word_count = len(sent)
        # Prepend the wall so link indices line up with the actual words.
        walled_sent = ["###LEFT-WALL###"] + sent[:]
        parse_rows = []
        # Placeholder tokens avoid token-splitting by the "any" dictionary.
        fake_words = ["w{}".format(n) for n in range(1, word_count + 1)]
        sentence = Sentence(" ".join(fake_words), any_dict, po)
        linkages = sentence.parse()
        num_parses = len(linkages)  # number of linkages found
        if num_parses > 0:
            idx = random.randint(0, num_parses - 1)  # random linkage index
            linkage = Linkage(idx, sentence, po._obj)
            tokens, links = parse_postscript(
                linkage.postscript().replace("\n", ""), options)
            for link in links:
                left, right = link[0], link[1]
                # Map the link endpoints back to the real words.
                parse_rows.append([
                    str(left), walled_sent[left],
                    str(right), walled_sent[right]
                ])
        random_parses.append(parse_rows)

    Print_parses(sents, random_parses, f"{output_path}/random_parses.ull")
    return random_parses
def test_that_parse_returns_empty_iterator_on_no_linkage_sat(self):
    """Parsing a bad sentence with no null-links shouldn't give any linkage (sat)"""
    self.po = ParseOptions(use_sat=True)
    if self.po.use_sat != True:
        # The SAT solver was not compiled into the library.
        raise unittest.SkipTest("Library not configured with SAT parser")

    result = self.parse_sent("This this doesn't parse", self.po)
    # The iterator must yield nothing at all.
    self.assertFalse(any(True for _ in result),
                     "SAT: Unparsable sentence has linkages.")
def test_that_sentence_can_be_destroyed_when_linkages_still_exist(self):
    """
    Deleting the Sentence while Python Linkage objects are still alive
    must be safe: if the parser were freed first, the swig objects would
    dereference freed memory. This only checks that create/parse/delete
    does not crash while linkages remain referenced.
    """
    sentence = Sentence('This is a sentence.', Dictionary(), ParseOptions())
    linkages = sentence.parse()  # keep the linkages alive past the deletion
    del sentence
def test_null_link_range_starting_with_zero(self):
    """Test parsing with a minimal number of null-links, including 0."""
    # 'about people attended' has no complete linkage. Validate that the
    # library doesn't mangle parsing with null-count>0 due to
    # power_prune()'s connector-discard optimization at null-count==0.
    # Without commit "Allow calling classic_parse() with and w/o nulls"
    # this yields 1 linkage (not 2) with unused_word_cost 5 (not 1).
    self.po = ParseOptions(min_null_count=0, max_null_count=999)
    sent = Sentence('about people attended', self.d, self.po)
    linkages = sent.parse()
    self.assertEqual(len(linkages), 2)
    self.assertEqual(linkages.next().unused_word_cost(), 1)
def __init__(self, lang, limit=None):
    """ Constructor for local LG use

    :param lang: Language (dictionary) name or path passed to Dictionary().
    :param limit: Optional linkage limit; when given, overrides the
                  library default on the created ParseOptions.
    """
    super().__init__()
    try:
        self._obj_dict = Dictionary(lang)
        self._dict = lang
    except LG_DictionaryError as err:
        # NOTE(review): the error is only printed and construction
        # continues — self._obj_dict / self._dict remain unset, so any
        # later access to them raises AttributeError. Confirm this
        # best-effort behavior is intended.
        print(str(err))

    # Allow null links up to the whole sentence length.
    self._parse_options = ParseOptions(min_null_count=0, max_null_count=999)

    if limit is not None:
        self._parse_options.linkage_limit = limit
def test_setting_linkage_limit(self):
    """Assigning linkage_limit must reach the underlying C object."""
    opts = ParseOptions()
    opts.linkage_limit = 3
    self.assertEqual(clg.parse_options_get_linkage_limit(opts._obj), 3)
def setUp(self):
    """Open the Lithuanian dictionary and default parse options."""
    self.d = Dictionary(lang='lt')
    self.po = ParseOptions()
def setUpClass(cls):
    """Open the Russian dictionary and default parse options once per class."""
    cls.d = Dictionary(lang='ru')
    cls.po = ParseOptions()
def test_setting_verbosity_to_not_allow_value_raises_value_error(self):
    """An out-of-range verbosity value (16) must raise ValueError."""
    opts = ParseOptions()
    with self.assertRaises(ValueError):
        opts.verbosity = 16
def test_setting_verbosity_to_non_integer_raises_type_error(self):
    """A non-integer verbosity value must raise TypeError."""
    opts = ParseOptions()
    with self.assertRaises(TypeError):
        opts.verbosity = "a"
#! /usr/bin/env python # -*- coding: utf8 -*- # ASuMa, Mar 2018 # Tokenizer that uses LG functionality # See main() documentation import getopt, sys import os from linkgrammar import Linkage, Sentence, ParseOptions, Dictionary, Clinkgrammar as clg any_dict = Dictionary('any') # Opens dictionary only once po = ParseOptions(min_null_count=0, max_null_count=999) def main(argv): """ Tokenizer procedure that uses LG tokenizer with python bindings Usage: tokenizer.py -i <inputdir> -o <outdir> inputdir Name of input directory outdir Name of ouput directory """ inputfile = '' outputfile = '' try: opts, args = getopt.getopt(argv, "hi:o:", ["inputdir=", "outdir="]) except getopt.GetoptError:
def test_setting_disjunct_cost(self):
    """Assigning disjunct_cost must reach the underlying C object."""
    opts = ParseOptions()
    opts.disjunct_cost = 3.0
    self.assertEqual(clg.parse_options_get_disjunct_cost(opts._obj), 3.0)
def setUp(self):
    """Open the English dictionary and require the SAT parser."""
    self.d = Dictionary(lang='en')
    self.po = ParseOptions()
    # Re-create the options with the SAT solver requested.
    self.po = ParseOptions(use_sat=True)
    if self.po.use_sat != True:
        # The SAT solver was not compiled into the library.
        raise unittest.SkipTest("Library not configured with SAT parser")
def setUpClass(cls):
    """Open the default dictionary and parse options once per class."""
    cls.d = Dictionary()
    cls.po = ParseOptions()
def parse_sent(self, text, po=None):
    """Parse *text* with this test's dictionary; return linkages as a list.

    :param text: Sentence to parse.
    :param po: Optional ParseOptions; when omitted a fresh default
        instance is created per call. (Previously ``po=ParseOptions()``
        was evaluated once at function-definition time and silently
        shared across every call — the mutable-default pitfall.)
    :return: list of Linkage objects.
    """
    if po is None:
        po = ParseOptions()
    return list(Sentence(text, self.d, po).parse())
def test_timer_exhausted_exception(self):
    """A tight max_parse_time must abort parsing with LG_TimerExhausted."""
    self.po = ParseOptions(max_parse_time=1)
    # A long repetitive sentence that cannot be parsed within one second.
    long_sentence = "This should take more than one second to parse! " * 20
    with self.assertRaises(LG_TimerExhausted):
        self.parse_sent(long_sentence, self.po)
def test_that_invalid_option_properties_cannot_be_used(self):
    """Setting an unknown ParseOptions attribute must raise TypeError
    with an "Unknown parse option" message."""
    po = ParseOptions()
    # assertRaisesRegexp is the deprecated Python 2 alias; it was renamed
    # assertRaisesRegex in Python 3.2 and removed in Python 3.12.
    self.assertRaisesRegex(TypeError, "Unknown parse option",
                           setattr, po, "invalid_option", 1)
def test_specifying_parse_options(self):
    """A constructor keyword (linkage_limit=99) must reach the C object."""
    opts = ParseOptions(linkage_limit=99)
    self.assertEqual(clg.parse_options_get_linkage_limit(opts._obj), 99)
help="show word sentence position") args.add_argument("-nm", "--no-morphology", dest='morphology', action='store_false', help="do not display morphology") args.add_argument("-i", "--interactive", action="store_true", help="interactive mode after each result") arg = args.parse_args() try: lgdict = Dictionary(arg.lang) except LG_Error: # The default error handler will print the error message args.print_usage() sys.exit(2) po = ParseOptions(verbosity=arg.verbosity) po.max_null_count = 999 # > allowed maximum number of words po.max_parse_time = 10 # actual parse timeout may be about twice bigger po.spell_guess = True if DISPLAY_GUESSES else False po.display_morphology = arg.morphology # iter(): avoid python2 input buffering while True: sentence_text = get_input("sentence-check: ") if sentence_text.strip() == '': continue sent = Sentence(str(sentence_text), lgdict, po) try: linkages = sent.parse()
def test_setting_max_parse_time(self):
    """Assigning max_parse_time must reach the underlying C object."""
    opts = ParseOptions()
    opts.max_parse_time = 3
    self.assertEqual(clg.parse_options_get_max_parse_time(opts._obj), 3)
def parse(self, dict_path: str, corpus_path: str, output_path: str, ref_path: str, options: int) \
        -> (ParseMetrics, ParseQuality):
    """
    Link Grammar API parser invokation routine.

    Parses every sentence of the corpus file, optionally compares the
    result against reference parses, and writes the selected output
    format to `output_path` (or stdout).

    :param dict_path: Dictionary file or directory path.
    :param corpus_path: Corpus file or directory path.
    :param output_path: Output file or directory path.
    :param ref_path: Reference file or directory path.
    :param options: Bit field. See `optconst.py` for details
    :return: Tuple (ParseMetrics, ParseQuality)
    """
    input_file_handle = None
    output_file_handle = None

    ref_parses = []  # reference parses loaded for quality estimation

    # Sentence statistics variables
    total_metrics, total_quality = ParseMetrics(), ParseQuality()

    sentence_count = 0  # number of sentences in the corpus

    print("Info: Parsing a corpus file: '" + corpus_path + "'")
    print("Info: Using dictionary: '" + dict_path + "'")

    if output_path is not None:
        print("Info: Parses are saved in: '" + output_path+get_output_suffix(options) + "'")
    else:
        print("Info: Output file name is not specified. Parses are redirected to 'stdout'.")

    try:
        # Load reference parses only when quality estimation is requested.
        if options & BIT_PARSE_QUALITY and ref_path is not None:
            try:
                data = load_ull_file(ref_path)
                ref_parses = get_parses(data, (options & BIT_NO_LWALL) == BIT_NO_LWALL, False)
            except Exception as err:
                # Best effort: quality estimation is skipped on failure.
                print("Exception: " + str(err))

        # Matches ULL link lines, which start with a digit.
        link_line = re.compile(r"\A[0-9].+")

        po = ParseOptions(min_null_count=0, max_null_count=999)
        po.linkage_limit = self._linkage_limit

        di = Dictionary(dict_path)

        input_file_handle = open(corpus_path)
        output_file_handle = sys.stdout if output_path is None \
            else open(output_path+get_output_suffix(options), "w")

        for line in input_file_handle:

            # Filter out links when ULL parses are used as input
            if options & BIT_ULL_IN > 0 and link_line.match(line):
                continue

            # Skip empty lines to get proper statistics estimation and skip commented lines
            if len(line.strip()) < 1:  # or line.startswith("#"):
                continue

            # Tokenize and parse the sentence
            sent = Sentence(line, di, po)
            linkages = sent.parse()

            sent_metrics, sent_quality = ParseMetrics(), ParseQuality()
            linkage_count = 0

            for linkage in linkages:

                # Only the first linkage is counted.
                if linkage_count == 1:
                    break

                # Emit the requested output format for this linkage.
                if (options & BIT_OUTPUT_DIAGRAM) == BIT_OUTPUT_DIAGRAM:
                    print(linkage.diagram(), file=output_file_handle)

                elif (options & BIT_OUTPUT_POSTSCRIPT) == BIT_OUTPUT_POSTSCRIPT:
                    print(linkage.postscript(), file=output_file_handle)

                elif (options & BIT_OUTPUT_CONST_TREE) == BIT_OUTPUT_CONST_TREE:
                    print(linkage.constituent_tree(), file=output_file_handle)

                elif not (options & BIT_OUTPUT):
                    # Default: ULL-formatted output plus statistics.
                    tokens, links = parse_postscript(linkage.postscript().replace("\n", ""),
                                                     options, output_file_handle)

                    # Print ULL formated parses
                    print_output(tokens, links, options, output_file_handle)

                    # Calculate parseability
                    sent_metrics += parse_metrics(prepare_tokens(tokens, options))

                    # Calculate parse quality if the option is set
                    if options & BIT_PARSE_QUALITY and len(ref_parses):
                        sent_quality += parse_quality(get_link_set(tokens, links, options),
                                                      ref_parses[sentence_count][1])

                linkage_count += 1

            assert sent_metrics.average_parsed_ratio <= 1.0, "sent_metrics.average_parsed_ratio > 1.0"
            assert sent_quality.quality <= 1.0, "sent_quality.quality > 1.0"

            total_metrics += sent_metrics
            total_quality += sent_quality

            # if not linkage_count:
            #     sent_metrics.completely_unparsed_ratio += 1

            sentence_count += 1

        total_metrics.sentences = sentence_count
        total_quality.sentences = sentence_count

        # Prevent interleaving "Dictionary close" messages
        ParseOptions(verbosity=0)

    except LG_DictionaryError as err:
        print("LG_DictionaryError: " + str(err))

    except LG_Error as err:
        print("LG_Error: " + str(err))

    except IOError as err:
        print("IOError: " + str(err))

    # NOTE(review): unreachable in Python 3 — FileNotFoundError is a
    # subclass of OSError (alias IOError), so the handler above catches
    # it first.
    except FileNotFoundError as err:
        print("FileNotFoundError: " + str(err))

    finally:
        # Close whatever was opened; never close the shared stdout.
        if input_file_handle is not None:
            input_file_handle.close()

        if output_file_handle is not None and output_file_handle != sys.stdout:
            output_file_handle.close()

    return total_metrics, total_quality
def test_setting_short_length(self):
    """Assigning short_length must reach the underlying C object."""
    opts = ParseOptions()
    opts.short_length = 3
    self.assertEqual(clg.parse_options_get_short_length(opts._obj), 3)
args.add_argument("-nm", "--no-morphology", dest='morphology', action='store_false', help="do not display morphology") arg = args.parse_args() try: lgdict = Dictionary(arg.lang) except LG_Error: # The default error handler will print the error message args.print_usage() sys.exit(2) po = ParseOptions(verbosity=arg.verbosity) po.max_null_count = 999 # > allowed maximum number of words po.max_parse_time = 10 # actual parse timeout may be about twice bigger po.spell_guess = True if DISPLAY_GUESSES else False po.display_morphology = arg.morphology print("Enter sentences:") # iter(): avoid python2 input buffering for sentence_text in iter(sys.stdin.readline, ''): if sentence_text.strip() == '': continue sent = Sentence(str(sentence_text), lgdict, po) try: linkages = sent.parse() except LG_TimerExhausted:
def test_setting_max_null_count(self):
    """Assigning max_null_count must reach the underlying C object."""
    opts = ParseOptions()
    opts.max_null_count = 3
    self.assertEqual(clg.parse_options_get_max_null_count(opts._obj), 3)