def Make_Random(sents):
    """ Make random parses (from LG-parser "any"), to use as baseline """
    any_dict = Dictionary('any')  # open the dictionary a single time
    po = ParseOptions(min_null_count=0, max_null_count=999)
    po.linkage_limit = 100
    options = 0x00000000 | BIT_STRIP  #| BIT_ULL_IN
    options |= BIT_CAPS

    random_parses = []
    for sent in sents:
        word_count = len(sent)
        parse_rows = []
        # Replace real words with placeholders: only the parse tree matters here.
        placeholders = [f"w{n}" for n in range(1, word_count + 1)]
        # Keep a trailing dot so the --ignore functionality still works.
        if sent[-1] == ".":
            placeholders[-1] = "."
        sentence = Sentence(" ".join(placeholders), any_dict, po)
        linkages = sentence.parse()
        num_parses = len(linkages)  # how many linkages this sentence produced
        if num_parses > 0:
            chosen = random.randint(0, num_parses - 1)  # pick a linkage at random
            linkage = Linkage(chosen, sentence, po._obj)  # materialize that linkage
            tokens, links = parse_postscript(linkage.postscript().replace("\n", ""), options)
            for left, right in ((lnk[0], lnk[1]) for lnk in links):
                parse_rows.append([str(left), tokens[left], str(right), tokens[right]])
        random_parses.append(parse_rows)
    return random_parses
def is_grammatical_sentence(sentence_text, language_dict, verbose=False):
    """
    Check whether a sentence is fully parsable by Link Grammar.

    :param sentence_text: Sentence string to check.
    :param language_dict: Link Grammar Dictionary object to parse against.
    :param verbose: If True, print diagnostic messages.
    :return: True if the sentence parses with zero null-linked words, False otherwise.
    """
    parse_options = ParseOptions(verbosity=0)
    parse_options.max_null_count = 999  # max number of words in single pass
    parse_options.linkage_limit = 100   # max number of linkages to generate
    parse_options.max_parse_time = 10   # in seconds

    sent = Sentence(str(sentence_text), language_dict, parse_options)
    try:
        linkages = sent.parse()
    except LG_TimerExhausted:
        if verbose:
            print('Sentence too complex for parsing in {} seconds.'.format(
                parse_options.max_parse_time))
        return False

    if not linkages or len(linkages) <= 0:
        if verbose:
            print('Error occurred - sentence ignored.')
        # BUGFIX: previously this branch fell through to sent.null_count(),
        # classifying an unparsed sentence as if it had been parsed.
        return False

    # A fully grammatical sentence links every word (no null links).
    return sent.null_count() == 0
def make_random(sentences: Union[List[Tuple[str, set]], List[str]], options: int, **kwargs) \
        -> List[Tuple[str, set]]:
    """
    Make random parses (from LG-parser "any"), to use as baseline

    :param sentences: List of either tuples of sentence and set of links in case of .ull input file
                      format or strings in case of text input file format.
    :param options: Integer representing parse options bit masks.
    :keyword limit: Maximum number of linkages LG is allowed to generate (default 100).
    :return: List of parses (tuples of sentence and set of links)
    """
    any_dict = Dictionary('any')  # Opens dictionary only once
    po = ParseOptions(min_null_count=0, max_null_count=999)
    po.linkage_limit = int(kwargs.get("limit", 100))
    options |= BIT_STRIP
    options |= BIT_CAPS

    # Input format is decided from the first element; the list is assumed homogeneous.
    if isinstance(sentences[0], tuple):
        is_ull = True
    elif isinstance(sentences[0], str):
        is_ull = False
    else:
        raise ValueError(
            "The first argument should be either List[Tuple[str, set]] or List[str]."
        )

    random_parses = []
    for sent in sentences:
        # Raw sentence text: first tuple element for .ull input, the string itself otherwise.
        # BUGFIX: the original used sent[0] unconditionally, which for plain-string
        # input yields the first *character* of the sentence.
        raw_sent = sent[0] if is_ull else sent
        words = tokenize_sentence(raw_sent)
        num_words = len(words)
        # substitute words with numbers, to avoid token-splitting by LG "any"
        # BUGFIX: range(1, num_words) produced only num_words-1 placeholders,
        # silently dropping the last word of every sentence.
        fake_words = [f"w{x}" for x in range(1, num_words + 1)]
        sent_string = " ".join(fake_words)
        sentence = Sentence(sent_string, any_dict, po)
        linkages = sentence.parse()
        num_parses = len(linkages)  # check nbr of linkages in sentence
        links = []
        if num_parses > 0:
            idx = random.randint(0, num_parses - 1)  # choose a random linkage index
            linkage = Linkage(idx, sentence, po._obj)  # get the random linkage
            tokens, links = parse_postscript(
                linkage.postscript().replace("\n", ""), options)
            if num_words != len(tokens):
                logger.error(
                    f"Number of tokens mismatch:\n{words}\n{tokens}\nfor sentence:\n{raw_sent}"
                )
        random_parses.append((raw_sent, set(links)))
    return random_parses
def Make_Random(sents, **kwargs):
    """
    Make random parses (from LG-parser "any"), to use as baseline.

    :param sents: List of sentences, each a list of word tokens.
    :keyword output_path: Directory the resulting random_parses.ull file is
                          written to (defaults to the current working directory).
    :return: List of random parses; each parse is a list of
             [left_index, left_word, right_index, right_word] records.
    """
    # BUGFIX: os.environ["PWD"] is only set by interactive shells and raises
    # KeyError elsewhere (cron, services); os.getcwd() always works.
    output_path = kwargs.get("output_path", os.getcwd())
    any_dict = Dictionary('any')  # Opens dictionary only once
    po = ParseOptions(min_null_count=0, max_null_count=999)
    po.linkage_limit = 100
    options = 0x00000000 | BIT_STRIP  #| BIT_ULL_IN
    options |= BIT_CAPS

    random_parses = []
    for sent in sents:
        num_words = len(sent)
        # Prepend LEFT-WALL so link indices line up with the parser's token numbering.
        curr_sent = sent[:]
        curr_sent.insert(0, "###LEFT-WALL###")
        curr_parse = []
        # substitute words with numbers, to avoid token-splitting by LG "any"
        fake_words = ["w{}".format(x) for x in range(1, num_words + 1)]
        sent_string = " ".join(fake_words)
        sentence = Sentence(sent_string, any_dict, po)
        linkages = sentence.parse()
        num_parses = len(linkages)  # check nbr of linkages in sentence
        if num_parses > 0:
            idx = random.randint(0, num_parses - 1)  # choose a random linkage index
            linkage = Linkage(idx, sentence, po._obj)  # get the random linkage
            tokens, links = parse_postscript(
                linkage.postscript().replace("\n", ""), options)
            for link in links:
                llink = link[0]
                rlink = link[1]
                # attach words from sent, which are the actual words
                curr_parse.append([
                    str(llink), curr_sent[llink], str(rlink), curr_sent[rlink]
                ])
        random_parses.append(curr_parse)

    Print_parses(sents, random_parses, f"{output_path}/random_parses.ull")
    return random_parses
def parse(self, dict_path: str, corpus_path: str, output_path: str, ref_path: str, options: int) \
        -> (ParseMetrics, ParseQuality):
    """
    Link Grammar API parser invocation routine.

    Parses every sentence of the corpus file, prints each parse in the format
    selected by `options`, and accumulates parseability (and optionally parse
    quality against a reference .ull file) statistics.

    :param dict_path: Dictionary file or directory path.
    :param corpus_path: Corpus file or directory path.
    :param output_path: Output file or directory path.
    :param ref_path: Reference file or directory path.
    :param options: Bit field. See `optconst.py` for details
    :return: Tuple (ParseMetrics, ParseQuality)
    """
    input_file_handle = None
    output_file_handle = None
    ref_parses = []

    # Sentence statistics variables
    total_metrics, total_quality = ParseMetrics(), ParseQuality()
    sentence_count = 0  # number of sentences in the corpus

    print("Info: Parsing a corpus file: '" + corpus_path + "'")
    print("Info: Using dictionary: '" + dict_path + "'")
    if output_path is not None:
        print("Info: Parses are saved in: '" + output_path+get_output_suffix(options) + "'")
    else:
        print("Info: Output file name is not specified. Parses are redirected to 'stdout'.")

    try:
        # Load reference parses only when quality estimation is requested.
        if options & BIT_PARSE_QUALITY and ref_path is not None:
            try:
                data = load_ull_file(ref_path)
                ref_parses = get_parses(data, (options & BIT_NO_LWALL) == BIT_NO_LWALL, False)
            except Exception as err:
                # Best effort: a missing/broken reference only disables quality stats.
                print("Exception: " + str(err))

        # Lines that start with a digit are ULL link lines, not sentences.
        link_line = re.compile(r"\A[0-9].+")

        po = ParseOptions(min_null_count=0, max_null_count=999)
        po.linkage_limit = self._linkage_limit
        di = Dictionary(dict_path)

        input_file_handle = open(corpus_path)
        output_file_handle = sys.stdout if output_path is None \
            else open(output_path+get_output_suffix(options), "w")

        for line in input_file_handle:
            # Filter out links when ULL parses are used as input
            if options & BIT_ULL_IN > 0 and link_line.match(line):
                continue
            # Skip empty lines to get proper statistics estimation and skip commented lines
            if len(line.strip()) < 1:  # or line.startswith("#"):
                continue
            # Tokenize and parse the sentence
            sent = Sentence(line, di, po)
            linkages = sent.parse()

            sent_metrics, sent_quality = ParseMetrics(), ParseQuality()
            linkage_count = 0
            for linkage in linkages:
                # Only the first linkage is counted.
                if linkage_count == 1:
                    break
                if (options & BIT_OUTPUT_DIAGRAM) == BIT_OUTPUT_DIAGRAM:
                    print(linkage.diagram(), file=output_file_handle)
                elif (options & BIT_OUTPUT_POSTSCRIPT) == BIT_OUTPUT_POSTSCRIPT:
                    print(linkage.postscript(), file=output_file_handle)
                elif (options & BIT_OUTPUT_CONST_TREE) == BIT_OUTPUT_CONST_TREE:
                    print(linkage.constituent_tree(), file=output_file_handle)
                elif not (options & BIT_OUTPUT):
                    tokens, links = parse_postscript(linkage.postscript().replace("\n", ""),
                                                     options, output_file_handle)
                    # Print ULL formated parses
                    print_output(tokens, links, options, output_file_handle)
                    # Calculate parseability
                    sent_metrics += parse_metrics(prepare_tokens(tokens, options))
                    # Calculate parse quality if the option is set
                    # NOTE(review): assumes the reference file has exactly one parse per
                    # corpus sentence, in the same order — confirm for mismatched files.
                    if options & BIT_PARSE_QUALITY and len(ref_parses):
                        sent_quality += parse_quality(get_link_set(tokens, links, options),
                                                      ref_parses[sentence_count][1])
                linkage_count += 1

            assert sent_metrics.average_parsed_ratio <= 1.0, "sent_metrics.average_parsed_ratio > 1.0"
            assert sent_quality.quality <= 1.0, "sent_quality.quality > 1.0"

            total_metrics += sent_metrics
            total_quality += sent_quality
            # if not linkage_count:
            #     sent_metrics.completely_unparsed_ratio += 1
            sentence_count += 1

        total_metrics.sentences = sentence_count
        total_quality.sentences = sentence_count

        # Prevent interleaving "Dictionary close" messages
        ParseOptions(verbosity=0)

    except LG_DictionaryError as err:
        print("LG_DictionaryError: " + str(err))
    except LG_Error as err:
        print("LG_Error: " + str(err))
    except IOError as err:
        print("IOError: " + str(err))
    except FileNotFoundError as err:
        print("FileNotFoundError: " + str(err))
    finally:
        if input_file_handle is not None:
            input_file_handle.close()
        if output_file_handle is not None and output_file_handle != sys.stdout:
            output_file_handle.close()

    return total_metrics, total_quality
def test_setting_linkage_limit(self):
    """A linkage_limit set through the Python wrapper must reach the C options."""
    opts = ParseOptions()
    opts.linkage_limit = 3
    limit_in_c = clg.parse_options_get_linkage_limit(opts._obj)
    self.assertEqual(limit_in_c, 3)
def parse_file_with_api(dict_path, corpus_path, output_path, linkage_limit, options) \
        -> (float, float, float):
    """
    Link parser invocation routine.

    :param dict_path: name or path to the dictionary
    :param corpus_path: path to the test text file
    :param output_path: output file path
    :param linkage_limit: maximum number of linkages LG may return when parsing a sentence
    :param options: bit field. Use bit mask constants to set or reset one or multiple bits:
                    BIT_CAPS  = 0x01 Keep capitalized letters in tokens untouched if set,
                                make all lowercase otherwise.
                    BIT_RWALL = 0x02 Keep all links with RIGHT-WALL if set, ignore them otherwise.
                    BIT_STRIP = 0x04 Strip off token suffixes if set, remove them otherwise.
    :return: tuple (float, float, float):
            - percentage of totally parsed sentences;
            - percentage of completely unparsed sentences;
            - percentage of parsed sentences;
    """
    input_file_handle = None
    output_file_handle = None

    # Sentence statistics variables
    sent_full = 0       # number of fully parsed sentences
    sent_none = 0       # number of completely unparsed sentences
    sent_stat = 0.0     # average value of parsed sentences (linkages)
    line_count = 0      # number of sentences in the corpus

    print("Info: Parsing a corpus file: '" + corpus_path + "'")
    print("Info: Using dictionary: '" + dict_path + "'")
    if output_path is not None:
        print("Info: Parses are saved in: '" + output_path + get_output_suffix(options) + "'")
    else:
        print(
            "Info: Output file name is not specified. Parses are redirected to 'stdout'."
        )

    try:
        # Lines that start with a digit are ULL link lines, not sentences.
        link_line = re.compile(r"\A[0-9].+")

        po = ParseOptions(min_null_count=0, max_null_count=999)
        po.linkage_limit = linkage_limit
        di = Dictionary(dict_path)

        input_file_handle = open(corpus_path)
        output_file_handle = sys.stdout if output_path is None else open(
            output_path + get_output_suffix(options), "w")

        for line in input_file_handle:
            # Filter out links when ULL parses are used as input
            if options & BIT_ULL_IN > 0 and link_line.match(line):
                continue
            # Skip empty lines to get proper statistics estimation and skip commented lines
            if len(line.strip()) < 1 or line.startswith("#"):
                continue

            sent = Sentence(line, di, po)
            linkages = sent.parse()

            # Number of linkages taken in statistics estimation:
            # the countdown admits only the first linkage into the stats below.
            linkage_countdown = 1
            temp_full = 0
            temp_none = 0
            temp_stat = 0.0

            for linkage in linkages:
                # Print the linkage in the format requested by `options`.
                if (options & BIT_OUTPUT_DIAGRAM) == BIT_OUTPUT_DIAGRAM:
                    print(linkage.diagram(), file=output_file_handle)
                elif (options & BIT_OUTPUT_POSTSCRIPT) == BIT_OUTPUT_POSTSCRIPT:
                    print(linkage.postscript(), file=output_file_handle)
                elif (options & BIT_OUTPUT_CONST_TREE) == BIT_OUTPUT_CONST_TREE:
                    print(linkage.constituent_tree(), file=output_file_handle)

                # Postscript is parsed for every linkage: the tokens feed the statistics.
                tokens, links = parse_postscript(
                    linkage.postscript().replace("\n", ""), options,
                    output_file_handle)

                if not (options & BIT_OUTPUT):
                    print_output(tokens, links, options, output_file_handle)

                (f, n, s) = calc_stat(tokens)

                if linkage_countdown:
                    temp_full += f
                    temp_none += n
                    temp_stat += s
                    linkage_countdown -= 1

            if len(linkages) > 0:
                sent_full += temp_full
                sent_none += temp_none
                # NOTE(review): temp_stat holds one linkage's value but is divided by
                # the total linkage count — confirm this averaging is intended.
                sent_stat += temp_stat / float(len(linkages))
            else:
                sent_none += 1

            line_count += 1

        # Prevent interleaving "Dictionary close" messages
        ParseOptions(verbosity=0)

    except LG_Error as err:
        print(str(err))
    except IOError as err:
        print(str(err))
    except FileNotFoundError as err:
        print(str(err))
    finally:
        if input_file_handle is not None:
            input_file_handle.close()
        if output_file_handle is not None and output_file_handle != sys.stdout:
            output_file_handle.close()

    return (0.0, 0.0, 0.0) if line_count == 0 else (float(sent_full) / float(line_count),
                                                    float(sent_none) / float(line_count),
                                                    sent_stat / float(line_count))
action="store_true", help="interactive mode after each result") arg = args.parse_args() try: lgdict = Dictionary(arg.lang) except LG_Error: # The default error handler will print the error message args.print_usage() sys.exit(2) po = ParseOptions(verbosity=arg.verbosity) po.max_null_count = 999 # > allowed maximum number of words po.linkage_limit = 10000 # maximum number of linkages to generate po.max_parse_time = 10 # actual parse timeout may be about twice bigger po.spell_guess = True if DISPLAY_GUESSES else False po.display_morphology = arg.morphology while True: try: sentence_text = input(PROMPT) except EOFError: print("EOF") exit(0) if not is_stdin_atty and sentence_text: if sentence_text[0] == '%': continue if sentence_text[0] == '!': # ignore user-settings for now
def on_get(self, req, resp):
    """
    Handle HTTP GET request.

    Query parameters:
        lang  - dictionary language (defaults to DEFAULT_LANGUAGE);
        text  - sentence to parse (required);
        mode  - output format selector, 0..MAX_MODE_VALUE (defaults to DEFAULT_MODE);
        limit - linkage limit, 1..MAX_LINKAGE_LIMIT (defaults to DEFAULT_LIMIT).

    The JSON response carries 'errors' and 'linkages' lists plus the effective
    input parameters; HTTP status is always 200 unless falcon raises earlier.
    """
    link_list = {}               # output dictionary
    link_list['errors'] = []     # list of errors if any
    link_list['linkages'] = []   # list of linkages in requested format

    try:
        # logging IPs just in case
        logging.info("Connection from: " + (", ".join(req.access_route)))

        # Get input parameters
        lang = req.get_param('lang')
        text = req.get_param('text')
        mode = req.get_param_as_int('mode')
        limit = req.get_param_as_int('limit')

        # If no sentence is specified, then nothing to do...
        # BUGFIX: compare to None with 'is', not '==' (PEP 8 / E711).
        # NOTE(review): this HTTPBadRequest is swallowed by 'except Exception'
        # below, so the client still gets HTTP 200 — confirm intent.
        if text is None:
            logging.debug(
                "Parameter 'text' is not specified. Nothing to parse.")
            raise falcon.HTTPBadRequest(
                "Parameter 'text' is not specified. Nothing to parse.")

        # Use default language if no language is specified
        if lang is None:
            lang = DEFAULT_LANGUAGE
            logging.info(
                "'lang' parameter is not specified in request. 'lang' is set to '"
                + DEFAULT_LANGUAGE + "'")

        # Use default mode if no or improper value is specified
        if mode is None or mode < 0 or mode > MAX_MODE_VALUE:
            mode = DEFAULT_MODE
            logging.info(
                "'mode' value is not properly specified in request. 'mode' is set to "
                + str(mode))

        # Use default limit if no value is specified
        # or value is not within the range [1, MAX_LINKAGE_LIMIT]
        if limit is None or limit < 1 or limit > MAX_LINKAGE_LIMIT:
            limit = DEFAULT_LIMIT
            logging.info(
                "'limit' value is not properly specified in request. 'limit' is set to "
                + str(limit))

        # Save input parameters to the output dictionary, just in case someone needs them
        link_list['lang'] = lang
        link_list['mode'] = mode
        link_list['text'] = text
        link_list['limit'] = limit

        # Use default dictionary if it was not explicitly specified
        dict_path = LG_DICT_DEFAULT_PATH + "/" + lang
        dict_path = lang if not os.path.isdir(dict_path) else dict_path
        logging.info("Dictionary path used: " + dict_path)

        # Invoke link-parser, if the parameters are correctly specified
        po = ParseOptions(verbosity=0, min_null_count=0, max_null_count=999)
        po.linkage_limit = limit
        sent = Sentence(text, Dictionary(dict_path), po)
        logging.debug("Sentence: '" + sent.text + "'")
        linkages = sent.parse()

        if mode == MOD_CONSTTREE:
            for linkage in linkages:
                link_list['linkages'].append(linkage.constituent_tree())
        elif mode == MOD_POSTSCRIPT:
            for linkage in linkages:
                link_list['linkages'].append(linkage.postscript())
        elif mode == MOD_ULL_SENT:
            for linkage in linkages:
                link_list['linkages'].append(
                    get_ull_sentence(linkage.postscript()))
        else:  # MOD_DIAGRAM is default mode
            for linkage in linkages:
                link_list['linkages'].append(linkage.diagram())

        # Prevent interleaving "Dictionary close" messages
        po = ParseOptions(verbosity=0)

    except LG_Error as err:
        error_msg = "LG_Error: " + str(err)
        link_list["errors"].append(error_msg)
        logging.error(error_msg)
    except Exception as err:
        error_msg = "Exception: " + str(err)
        link_list["errors"].append(error_msg)
        logging.error(error_msg)
    except BaseException as err:
        error_msg = "BaseException: " + str(err)
        link_list["errors"].append(error_msg)
        logging.error(error_msg)
    # BUGFIX: removed the trailing bare 'except:' — it was unreachable dead
    # code, since 'except BaseException' above already catches everything.

    # Return proper JSON output
    resp.body = json.dumps(link_list)
    resp.status = falcon.HTTP_200
def parse_text(dict_path, corpus_path, output_path, is_caps, is_rwall, is_strip, linkage_limit):
    """
    Link parser invocation routine.

    :param dict_path: name or path to the dictionary
    :param corpus_path: path to the test text file
    :param output_path: output file path
    :param is_caps: boolean value tells to leave CAPS in tokens if set to True,
            make all lowercase otherwise
    :param is_rwall: boolean value tells to leave RIGHT-WALL tokens if set to True,
            remove otherwise
    :param is_strip: boolean value tells to strip off token suffixes if set to True,
            remove otherwise
    :param linkage_limit: maximum number of linkages LG may return when parsing a sentence
    :return:

    NOTE(review): is_caps, is_rwall and is_strip are accepted but never forwarded
    to parse_tokens (which falls back to its defaults) — likely a bug; confirm intent.
    """
    def parse_postscript(text, ofile):
        """
        Parse postscript notation of the linkage and print it in ULL form.

        :param text: text string returned by Linkage.postscript() method.
        :param ofile: output file object refference
        :return:
        """
        def strip_token(token) -> str:
            """
            Strip off suffix substring

            :param token: token string
            :return: stripped token if a suffix found, the same token otherwise
            """
            # Tokens that ARE a suffix marker are returned untouched.
            if token.startswith(".") or token.startswith("["):
                return token
            pos = token.find("[")
            # If "[" is not found, fall back to looking for "."
            if pos < 0:
                pos = token.find(".")
            # If neither suffix marker is found (or it starts the token), return token as is.
            if pos <= 0:
                return token
            return token[:pos:]

        def parse_tokens(text, caps=False, rw=False, strip=True) -> list:
            """
            Parse string of tokens

            :param text: string token line extracted from postfix notation output
                    string returned by Linkage.postfix() method.
            :param caps: boolean value indicating whether or not CAPS should be
                    untouched or lowercased
            :param rw: boolean value indicating whether or not RIGHT-WALL should be
                    taken into account or ignored
            :param strip: boolean value indicating whether or not token suffixes
                    should be stripped off or left untouched
            :return: list of tokens
            """
            tokens = []
            start_pos = 1
            end_pos = text.find(")")
            # Each token is wrapped in parentheses: walk "(tok)(tok)..." pairs.
            while end_pos - start_pos > 0:
                token = text[start_pos:end_pos:]
                if strip:
                    token = strip_token(token)
                if token.find("-WALL") > 0:
                    # Wall tokens get the ULL ###...### decoration.
                    token = "###" + token + "###"
                else:
                    if not caps:
                        token = token.lower()
                if token.find("RIGHT-WALL") < 0:
                    tokens.append(token)
                elif rw:
                    # RIGHT-WALL is kept only on request.
                    tokens.append(token)
                start_pos = end_pos + 2
                end_pos = text.find(")", start_pos)
            return tokens

        def parse_links(text, tokens) -> list:
            """
            Parse links represented in postfix notation and prints them in OpenCog
            notation.

            :param text: link list in postfix notation
            :param tokens: list of tokens previously extracted from postfix notated output
            :return: list of (index1, token1, index2, token2) link tuples
            """
            links = []
            token_count = len(tokens)
            start_pos = 1
            end_pos = text.find("]")
            # Each link record looks like "[i j len (LABEL)]".
            p = re.compile('(\d+)\s(\d+)\s\d+\s\(.+\)')
            while end_pos - start_pos > 0:
                m = p.match(text[start_pos:end_pos:])
                if m is not None:
                    index1 = int(m.group(1))
                    index2 = int(m.group(2))
                    # Guard against indices beyond the (possibly filtered) token list.
                    if index2 < token_count:
                        links.append(
                            (index1, tokens[index1], index2, tokens[index2]))
                start_pos = end_pos + 2
                end_pos = text.find("]", start_pos)
            return links

        def print_output(tokens, links, ofile):
            # Sentence line first (LEFT-WALL token at index 0 is skipped)...
            for token in tokens[1:]:
                ofile.write(token + ' ')
            ofile.write('\n')
            # ...then one "i tok_i j tok_j" line per link, then a blank separator.
            for link in links:
                print(link[0], link[1], link[2], link[3], file=ofile)
            print('', file=ofile)

        # def parse_postscript(text, ofile):
        # The postscript blob is "[tokens][links][0]"; capture tokens and links.
        p = re.compile('\[(\(LEFT-WALL\).+\(RIGHT-WALL\))\]\[(.+)\]\[0\]')
        m = p.match(text)
        if m is not None:
            tokens = parse_tokens(m.group(1))
            sorted_links = sorted(parse_links(m.group(2), tokens),
                                  key=lambda x: (x[0], x[2]))
            print_output(tokens, sorted_links, ofile)

    f = None
    o = None
    try:
        po = ParseOptions(min_null_count=0, max_null_count=999)
        po.linkage_limit = linkage_limit
        di = Dictionary(dict_path)

        f = open(corpus_path)
        o = sys.stdout if output_path is None else open(output_path, "w")

        for line in f:
            sent = Sentence(line, di, po)
            linkages = sent.parse()
            # print("Linkages:", len(linkages))
            for linkage in linkages:
                # print(linkage.diagram())
                parse_postscript(linkage.postscript().replace("\n", ""), o)

        # Prevent interleaving "Dictionary close" messages
        po = ParseOptions(verbosity=0)

    except LG_Error as err:
        print(str(err))
    except IOError as err:
        print(str(err))
    except FileNotFoundError as err:
        print(str(err))
    finally:
        if f is not None:
            f.close()
        if o is not None and o != sys.stdout:
            o.close()