def check(self, sentence):
    """Tell whether parsing `sentence` reports a positive parse count.

    The text is encoded with the 'ascii' codec before it is handed to
    `Sentence`.

    :param sentence: text to be checked
    :return: True if `Sentence.parse()` returned a value greater than 0,
             False otherwise
    """
    parsed = Sentence(sentence.encode('ascii'))
    has_linkage = bool(parsed.parse() > 0)
    # Drop the local reference to the sentence object before returning.
    del parsed
    return has_linkage
def test_that_sentence_can_be_destroyed_when_linkages_still_exist(self):
    """
    Deleting a sentence while its linkages are still referenced must be safe.

    If the parser is deleted before the associated swig objects are, there
    will be bad pointer dereferences (the swig objects would point into freed
    memory). This test ensures that parsers can be created and deleted
    without regard for the existence of Python Linkage objects.
    """
    sentence = Sentence('This is a sentence.', Dictionary(), ParseOptions())
    # Hold on to the parse result on purpose: it must outlive the sentence name.
    surviving_linkages = sentence.parse()
    del sentence
def parseString(s, debug, linkNum=0, file=sys.stdout):
    """Parse `s` and return the linkage with index `linkNum`, if there is one.

    :param s: sentence text handed to `Sentence`
    :param debug: when truthy, the linkage diagram is printed to stderr
    :param linkNum: index of the linkage to build
    :param file: passed through to `findProblems` together with the linkage
    :return: the `Linkage` object, or None when the parse count is not
             greater than `linkNum`
    """
    sent = Sentence(s)
    available = sent.parse()
    if not available > linkNum:
        return None

    chosen = Linkage(linkNum, sent)
    if debug:
        chosen.print_diagram(sys.stderr)
    findProblems(chosen, sent, file)
    return chosen
def test_that_sentence_can_be_destroyed_when_linkages_still_exist(self):
    """
    Deleting a sentence while its linkages are still referenced must be safe.

    If the parser is deleted before the associated swig objects are, there
    will be bad pointer dereferences (the swig objects would point into freed
    memory). This test ensures that parsers can be created and deleted
    without regard for the existence of Python Linkage objects.
    """
    #pylint: disable=unused-variable
    sentence = Sentence('This is a sentence.', Dictionary(), ParseOptions())
    # Hold on to the parse result on purpose: it must outlive the sentence name.
    surviving_linkages = sentence.parse()
    del sentence
def is_grammar_OK(text):
    """Return True if any linkage of `text` has '(RIGHT-WALL)]' in its postscript.

    The sentence is parsed with the module-level `en_dir` dictionary and
    `po` parse options.

    :param text: sentence text to check
    :return: True when at least one linkage matches, False otherwise
    """
    marker = '(RIGHT-WALL)]'
    matched = False
    # Every linkage is visited, even after a match has been found.
    for candidate in Sentence(text, en_dir, po).parse():
        if marker in candidate.postscript():
            matched = True
    return matched
def parse_text(dict_path, corpus_path, output_id=OUTPUT_DIAGRAM):
    """
    Parse every line of a corpus file and print the linkages found.

    :param dict_path: name or path to the dictionary
    :param corpus_path: path to the test text file
    :param output_id: numeric id selecting one of three output formats
                      (postscript, constituent tree, or diagram by default)
    :return: None
    """

    def plural_suffix(count):
        """ Return '' for exactly one item and 's' otherwise. """
        return '' if count == 1 else 's'

    def report_linkages(parsed, lang, found_linkages, options):
        """ Print a status line in the style of the link-parser report. """
        handle = parsed._obj
        total = clg.sentence_num_linkages_found(handle)
        if total > options.linkage_limit:
            sampled = ' of {} random linkages'.format(
                clg.sentence_num_linkages_post_processed(handle))
        else:
            sampled = ''
        print('`{}: Found {} linkage{} ({}{} had no P.P. violations)`'.format(
            lang, total, plural_suffix(total), len(found_linkages), sampled))

    parse_options = ParseOptions(min_null_count=0, max_null_count=999)
    lg_dictionary = Dictionary(dict_path)  # open the dictionary only once

    with open(corpus_path) as corpus:
        for line in corpus:
            print(line, end="")

            sentence = Sentence(line, lg_dictionary, parse_options)
            linkages = sentence.parse()
            report_linkages(sentence, dict_path, linkages, parse_options)

            for linkage in linkages:
                if output_id == OUTPUT_POSTSCRIPT:
                    rendered = linkage.postscript()
                elif output_id == OUTPUT_CONSTITUENT:
                    rendered = linkage.constituent_tree()
                else:
                    rendered = linkage.diagram()
                print(rendered)

    # Prevent interleaving "Dictionary close" messages
    parse_options = ParseOptions(verbosity=0)
def checkSummary(self, sentence):
    """Return the diagram of the first linkage of `sentence`.

    The text is encoded with the 'ascii' codec before parsing. Progress is
    traced through `logging.debug`.

    :param sentence: text to be parsed
    :return: diagram text of linkage 0, or "" when the parse count is not
             greater than 0
    """
    logging.debug('checkSummary start')
    diagram = ""
    sent = Sentence(sentence.encode('ascii'))
    parse_count = sent.parse()
    logging.debug('checkSummary sent parsed')

    if parse_count > 0:
        first = Linkage(0, sent)
        diagram = first.get_diagram()
        logging.debug('checkSummary OK')
        # Drop the linkage reference before the sentence reference.
        del first

    del sent
    logging.debug('checkSummary end')
    return diagram
def Tokenize_Sentence(sentence, po):
    """
    Tokenize the given sentence using the LG grammar bindings.

    Only the first linkage returned by the parser is used. Word 0 is skipped
    (the index shift ignores ###LEFT-WALL###).

    :param sentence: sentence text (str) to tokenize
    :param po: parse options passed to `Sentence`
    :return: the tokens, each followed by a single space, terminated by a
             newline; just "\n" when the parser returns no linkage
    """
    sent = Sentence(sentence, any_dict, po)
    linkages = sent.parse()

    # word_byte_start()/word_byte_end() are *byte* offsets, so they must be
    # applied to the encoded sentence; slicing the str directly misplaces
    # every token that follows a multi-byte character.
    # NOTE(review): assumes the offsets refer to the UTF-8 encoding of
    # `sentence` -- confirm against the bindings.
    raw = sentence.encode("utf-8")

    tokens = []
    for linkage in linkages:
        for word_idx in range(1, linkage.num_of_words()):
            start = linkage.word_byte_start(word_idx)
            end = linkage.word_byte_end(word_idx)
            tokens.append(raw[start:end].decode("utf-8", errors="replace"))
        break  # only the first linkage is needed

    return "".join(token + " " for token in tokens) + "\n"
def Make_Random(sents, **kwargs):
    """
    Make random parses (from LG-parser "any"), to use as baseline.

    The parses are also written to "<output_path>/random_parses.ull" through
    `Print_parses`.

    :param sents: list of sentences, each one a list of word tokens
    :param kwargs: optional "output_path" -- directory for the output file.
                   Defaults to $PWD, or to the current working directory when
                   PWD is not set.
    :return: list with one entry per sentence; each entry is a list of
             [left index, left word, right index, right word] links
    """
    # Resolve the fallback lazily: looking up os.environ["PWD"] up front
    # raised KeyError wherever PWD is unset, even with output_path supplied.
    if "output_path" in kwargs:
        output_path = kwargs["output_path"]
    else:
        try:
            output_path = os.environ["PWD"]
        except KeyError:
            output_path = os.getcwd()

    any_dict = Dictionary('any')  # Opens dictionary only once
    po = ParseOptions(min_null_count=0, max_null_count=999)
    po.linkage_limit = 100
    options = BIT_STRIP | BIT_CAPS  # BIT_ULL_IN is deliberately left unset

    random_parses = []
    for sent in sents:
        num_words = len(sent)
        # Index 0 is the wall, so link indexes map straight onto real words.
        curr_sent = ["###LEFT-WALL###"] + list(sent)
        curr_parse = []

        # Substitute words with numbers, to avoid token-splitting by LG "any"
        sent_string = " ".join("w{}".format(x) for x in range(1, num_words + 1))
        sentence = Sentence(sent_string, any_dict, po)
        linkages = sentence.parse()
        num_parses = len(linkages)  # check nbr of linkages in sentence

        if num_parses > 0:
            idx = random.randint(0, num_parses - 1)  # choose a random linkage index
            linkage = Linkage(idx, sentence, po._obj)  # get the random linkage
            _tokens, links = parse_postscript(
                linkage.postscript().replace("\n", ""), options)
            for link in links:
                llink, rlink = link[0], link[1]
                # Attach words from sent, which are the actual words
                curr_parse.append(
                    [str(llink), curr_sent[llink], str(rlink), curr_sent[rlink]])

        # One entry per sentence (possibly empty) keeps the result aligned
        # with `sents`.
        random_parses.append(curr_parse)

    Print_parses(sents, random_parses, f"{output_path}/random_parses.ull")
    return random_parses
def Make_Random(sents):
    """
    Make random parses (from LG-parser "any"), to use as baseline.

    :param sents: list of sentences, each one a list of word tokens
    :return: list with one entry per sentence; each entry is a list of
             [left index, left token, right index, right token] links.
             An empty sentence yields an empty entry.
    """
    any_dict = Dictionary('any')  # Opens dictionary only once
    po = ParseOptions(min_null_count=0, max_null_count=999)
    po.linkage_limit = 100
    options = BIT_STRIP | BIT_CAPS  # BIT_ULL_IN is deliberately left unset

    random_parses = []
    for sent in sents:
        curr_parse = []

        # An empty sentence has nothing to parse; `sent[-1]` would raise
        # IndexError, so record an empty parse and move on.
        if not sent:
            random_parses.append(curr_parse)
            continue

        # Substitute words with numbers, as we only care about the parse tree
        fake_words = ["w{}".format(x) for x in range(1, len(sent) + 1)]
        # Restore final dot to maintain --ignore functionality
        if sent[-1] == ".":
            fake_words[-1] = "."

        sentence = Sentence(" ".join(fake_words), any_dict, po)
        linkages = sentence.parse()
        num_parses = len(linkages)  # check nbr of linkages in sentence

        if num_parses > 0:
            idx = random.randint(0, num_parses - 1)  # choose a random linkage index
            linkage = Linkage(idx, sentence, po._obj)  # get the random linkage
            tokens, links = parse_postscript(
                linkage.postscript().replace("\n", ""), options, "dummy")
            for link in links:
                llink, rlink = link[0], link[1]
                curr_parse.append(
                    [str(llink), tokens[llink], str(rlink), tokens[rlink]])

        random_parses.append(curr_parse)

    return random_parses
def parse_file_with_api(dict_path, corpus_path, output_path, linkage_limit, options) \
        -> (float, float, float):
    """
    Link parser invocation routine.

    Errors raised while parsing (LG_Error, IOError and its subclasses such as
    FileNotFoundError) are printed, not propagated; statistics gathered up to
    that point are still returned.

    :param dict_path: name or path to the dictionary
    :param corpus_path: path to the test text file
    :param output_path: output file path; None redirects output to stdout
    :param linkage_limit: maximum number of linkages LG may return when
                          parsing a sentence
    :param options: bit field. Use bit mask constants to set or reset one or
                    multiple bits:
                    BIT_CAPS  = 0x01  Keep capitalized letters in tokens
                                      untouched if set, make all lowercase
                                      otherwise.
                    BIT_RWALL = 0x02  Keep all links with RIGHT-WALL if set,
                                      ignore them otherwise.
                    BIT_STRIP = 0x04  Strip off token suffixes if set.
    :return: tuple (float, float, float), each value divided by the number
             of processed sentences:
             - fully parsed sentences;
             - completely unparsed sentences;
             - accumulated parse statistic.
             (0.0, 0.0, 0.0) when no sentence was processed.
    """
    input_file_handle = None
    output_file_handle = None

    # Sentence statistics variables
    sent_full = 0    # number of fully parsed sentences
    sent_none = 0    # number of completely unparsed sentences
    sent_stat = 0.0  # average value of parsed sentences (linkages)
    line_count = 0   # number of sentences in the corpus

    print("Info: Parsing a corpus file: '" + corpus_path + "'")
    print("Info: Using dictionary: '" + dict_path + "'")

    if output_path is not None:
        print("Info: Parses are saved in: '" + output_path + get_output_suffix(options) + "'")
    else:
        print("Info: Output file name is not specified. Parses are redirected to 'stdout'.")

    try:
        link_line = re.compile(r"\A[0-9].+")

        po = ParseOptions(min_null_count=0, max_null_count=999)
        po.linkage_limit = linkage_limit

        di = Dictionary(dict_path)

        input_file_handle = open(corpus_path)
        output_file_handle = sys.stdout if output_path is None else open(
            output_path + get_output_suffix(options), "w")

        for line in input_file_handle:
            # Filter out links when ULL parses are used as input
            if options & BIT_ULL_IN > 0 and link_line.match(line):
                continue

            # Skip empty lines to get proper statistics estimation and skip
            # commented lines
            if len(line.strip()) < 1 or line.startswith("#"):
                continue

            sent = Sentence(line, di, po)
            linkages = sent.parse()

            # Number of linkages taken in statistics estimation
            linkage_countdown = 1

            temp_full = 0
            temp_none = 0
            temp_stat = 0.0

            for linkage in linkages:
                if (options & BIT_OUTPUT_DIAGRAM) == BIT_OUTPUT_DIAGRAM:
                    print(linkage.diagram(), file=output_file_handle)
                elif (options & BIT_OUTPUT_POSTSCRIPT) == BIT_OUTPUT_POSTSCRIPT:
                    print(linkage.postscript(), file=output_file_handle)
                elif (options & BIT_OUTPUT_CONST_TREE) == BIT_OUTPUT_CONST_TREE:
                    print(linkage.constituent_tree(), file=output_file_handle)

                tokens, links = parse_postscript(
                    linkage.postscript().replace("\n", ""), options, output_file_handle)

                if not (options & BIT_OUTPUT):
                    print_output(tokens, links, options, output_file_handle)

                (f, n, s) = calc_stat(tokens)

                # Only the first `linkage_countdown` linkages feed the statistics.
                if linkage_countdown:
                    temp_full += f
                    temp_none += n
                    temp_stat += s
                    linkage_countdown -= 1

            if len(linkages) > 0:
                sent_full += temp_full
                sent_none += temp_none
                # NOTE(review): temp_stat holds a single linkage's value but is
                # divided by the total linkage count -- confirm this is intended.
                sent_stat += temp_stat / float(len(linkages))
            else:
                sent_none += 1

            line_count += 1

        # Prevent interleaving "Dictionary close" messages
        ParseOptions(verbosity=0)

    except (LG_Error, IOError) as err:
        # FileNotFoundError is a subclass of IOError (OSError), so a separate
        # handler placed after this one could never run.
        print(str(err))

    finally:
        if input_file_handle is not None:
            input_file_handle.close()

        if output_file_handle is not None and output_file_handle != sys.stdout:
            output_file_handle.close()

    if line_count == 0:
        return (0.0, 0.0, 0.0)

    return (float(sent_full) / float(line_count),
            float(sent_none) / float(line_count),
            sent_stat / float(line_count))
po = ParseOptions(verbosity=arg.verbosity) po.max_null_count = 999 # > allowed maximum number of words po.max_parse_time = 10 # actual parse timeout may be about twice bigger po.spell_guess = True if DISPLAY_GUESSES else False po.display_morphology = arg.morphology print("Enter sentences:") # iter(): avoid python2 input buffering for sentence_text in iter(sys.stdin.readline, ''): if sentence_text.strip() == '': continue sent = Sentence(str(sentence_text), lgdict, po) try: linkages = sent.parse() except LG_TimerExhausted: print('Sentence too complex for parsing in ~{} second{}.'.format( po.max_parse_time, nsuffix(po.max_parse_time))) continue if not linkages: print('Error occurred - sentence ignored.') continue if len(linkages) <= 0: print('Cannot parse the input sentence') continue null_count = sent.null_count() if null_count == 0: print("Sentence parsed OK") guess_found = False
This function mimics the linkage status report style of link-parser """ random = ' of {0} random linkages'. \ format(psent.num_linkages_post_processed()) \ if psent.num_valid_linkages() < psent.num_linkages_found() else '' print ('{0}: Found {1} linkage{2} ({3}{4} had no P.P. violations)'. \ format(lang, psent.num_linkages_found(), s(psent.num_linkages_found()), psent.num_valid_linkages(), random)) # English is the default language sent = Sentence("This is a test.", Dictionary(), po) linkages = sent.parse() linkage_stat(sent, 'English') for linkage in linkages: desc(linkage) # Russian sent = Sentence("Целью курса является обучение магистрантов основам построения и функционирования программного обеспечения сетей ЭВМ.", Dictionary('ru'), po) linkages = sent.parse() linkage_stat(sent, 'Russian') for linkage in linkages: desc(linkage) # Turkish po = ParseOptions(islands_ok=True, max_null_count=1, display_morphology=True) sent = Sentence("Senin ne istediğini bilmiyorum", Dictionary('tr'), po) linkages = sent.parse()
def parse(self, dict_path: str, corpus_path: str, output_path: str, ref_path: str, options: int) \
        -> (ParseMetrics, ParseQuality):
    """
    Link Grammar API parser invocation routine.

    Errors raised while parsing are printed with the name of their class and
    are not propagated; the metrics gathered so far are still returned.

    :param dict_path: Dictionary file or directory path.
    :param corpus_path: Corpus file or directory path.
    :param output_path: Output file or directory path; None redirects the
                        parses to stdout.
    :param ref_path: Reference file or directory path.
    :param options: Bit field. See `optconst.py` for details
    :return: Tuple (ParseMetrics, ParseQuality)
    """
    input_file_handle = None
    output_file_handle = None

    ref_parses = []

    # Sentence statistics variables
    total_metrics, total_quality = ParseMetrics(), ParseQuality()

    sentence_count = 0  # number of sentences in the corpus

    print("Info: Parsing a corpus file: '" + corpus_path + "'")
    print("Info: Using dictionary: '" + dict_path + "'")

    if output_path is not None:
        print("Info: Parses are saved in: '" + output_path+get_output_suffix(options) + "'")
    else:
        print("Info: Output file name is not specified. Parses are redirected to 'stdout'.")

    try:
        if options & BIT_PARSE_QUALITY and ref_path is not None:
            # A broken reference file only disables quality estimation.
            try:
                data = load_ull_file(ref_path)
                ref_parses = get_parses(data, (options & BIT_NO_LWALL) == BIT_NO_LWALL, False)
            except Exception as err:
                print("Exception: " + str(err))

        link_line = re.compile(r"\A[0-9].+")

        po = ParseOptions(min_null_count=0, max_null_count=999)
        po.linkage_limit = self._linkage_limit

        di = Dictionary(dict_path)

        input_file_handle = open(corpus_path)
        output_file_handle = sys.stdout if output_path is None \
            else open(output_path+get_output_suffix(options), "w")

        for line in input_file_handle:
            # Filter out links when ULL parses are used as input
            if options & BIT_ULL_IN > 0 and link_line.match(line):
                continue

            # Skip empty lines to get proper statistics estimation
            if len(line.strip()) < 1:
                continue

            # Tokenize and parse the sentence
            sent = Sentence(line, di, po)
            linkages = sent.parse()

            sent_metrics, sent_quality = ParseMetrics(), ParseQuality()
            linkage_count = 0

            for linkage in linkages:
                # Only the first linkage is counted.
                if linkage_count == 1:
                    break

                if (options & BIT_OUTPUT_DIAGRAM) == BIT_OUTPUT_DIAGRAM:
                    print(linkage.diagram(), file=output_file_handle)

                elif (options & BIT_OUTPUT_POSTSCRIPT) == BIT_OUTPUT_POSTSCRIPT:
                    print(linkage.postscript(), file=output_file_handle)

                elif (options & BIT_OUTPUT_CONST_TREE) == BIT_OUTPUT_CONST_TREE:
                    print(linkage.constituent_tree(), file=output_file_handle)

                elif not (options & BIT_OUTPUT):
                    tokens, links = parse_postscript(
                        linkage.postscript().replace("\n", ""), options, output_file_handle)

                    # Print ULL formatted parses
                    print_output(tokens, links, options, output_file_handle)

                    # Calculate parseability
                    sent_metrics += parse_metrics(prepare_tokens(tokens, options))

                    # Calculate parse quality if the option is set
                    if options & BIT_PARSE_QUALITY and len(ref_parses):
                        sent_quality += parse_quality(
                            get_link_set(tokens, links, options),
                            ref_parses[sentence_count][1])

                linkage_count += 1

            assert sent_metrics.average_parsed_ratio <= 1.0, "sent_metrics.average_parsed_ratio > 1.0"
            assert sent_quality.quality <= 1.0, "sent_quality.quality > 1.0"

            total_metrics += sent_metrics
            total_quality += sent_quality

            sentence_count += 1

        total_metrics.sentences = sentence_count
        total_quality.sentences = sentence_count

        # Prevent interleaving "Dictionary close" messages
        ParseOptions(verbosity=0)

    except LG_DictionaryError as err:
        print("LG_DictionaryError: " + str(err))

    except LG_Error as err:
        print("LG_Error: " + str(err))

    # FileNotFoundError is a subclass of IOError (OSError): it has to be
    # handled first, otherwise its handler can never run.
    except FileNotFoundError as err:
        print("FileNotFoundError: " + str(err))

    except IOError as err:
        print("IOError: " + str(err))

    finally:
        if input_file_handle is not None:
            input_file_handle.close()

        if output_file_handle is not None and output_file_handle != sys.stdout:
            output_file_handle.close()

    return total_metrics, total_quality
def parse_text(dict_path, corpus_path, output_path, is_caps, is_rwall, is_strip, linkage_limit):
    """
    Link parser invocation routine.

    Every line of the corpus is parsed and each linkage is written as a line
    of tokens followed by its links, one link per line. LG_Error and IOError
    (including FileNotFoundError) are printed, not propagated.

    :param dict_path: name or path to the dictionary
    :param corpus_path: path to the test text file
    :param output_path: output file path; None writes to stdout
    :param is_caps: leave CAPS in tokens if True, make all lowercase otherwise
    :param is_rwall: keep RIGHT-WALL tokens if True, remove them otherwise
    :param is_strip: strip off token suffixes if True, leave tokens untouched
                     otherwise
    :param linkage_limit: maximum number of linkages LG may return when
                          parsing a sentence
    :return: None
    """
    # Compiled once instead of on every helper call. Raw strings keep the
    # backslashes from being treated as (invalid) string escapes.
    postscript_re = re.compile(r'\[(\(LEFT-WALL\).+\(RIGHT-WALL\))\]\[(.+)\]\[0\]')
    link_re = re.compile(r'(\d+)\s(\d+)\s\d+\s\(.+\)')

    def parse_postscript(text, ofile):
        """
        Parse postscript notation of the linkage and print the result.

        :param text: text string returned by Linkage.postscript() method.
        :param ofile: output file object reference
        :return: None
        """

        def strip_token(token) -> str:
            """
            Strip off suffix substring.

            :param token: token string
            :return: token cut at the first "[" (or, when there is none, at
                     the first "."); the same token if it starts with either
                     character or contains neither
            """
            if token.startswith((".", "[")):
                return token

            pos = token.find("[")

            # No "[" in the token: fall back to the "." suffix separator
            if pos < 0:
                pos = token.find(".")

            # No separator found at all: return the token as is
            if pos <= 0:
                return token

            return token[:pos]

        def parse_tokens(text, caps=False, rw=False, strip=True) -> list:
            """
            Parse string of tokens.

            :param text: token line extracted from the postscript output
                         string returned by Linkage.postscript() method.
            :param caps: whether CAPS should be left untouched (True) or
                         lowercased (False)
            :param rw: whether RIGHT-WALL should be kept (True) or ignored
            :param strip: whether token suffixes should be stripped off
            :return: list of tokens
            """
            tokens = []
            start_pos = 1
            end_pos = text.find(")")

            while end_pos - start_pos > 0:
                token = text[start_pos:end_pos]

                if strip:
                    token = strip_token(token)

                if token.find("-WALL") > 0:
                    token = "###" + token + "###"
                elif not caps:
                    token = token.lower()

                # RIGHT-WALL is only kept on request
                if token.find("RIGHT-WALL") < 0 or rw:
                    tokens.append(token)

                # Skip over ")(" to the beginning of the next token
                start_pos = end_pos + 2
                end_pos = text.find(")", start_pos)

            return tokens

        def parse_links(text, tokens) -> list:
            """
            Parse links represented in postscript notation.

            :param text: link list in postscript notation
            :param tokens: list of tokens previously extracted from the
                           postscript output
            :return: list of (index1, token1, index2, token2) tuples
            """
            links = []
            token_count = len(tokens)
            start_pos = 1
            end_pos = text.find("]")

            while end_pos - start_pos > 0:
                m = link_re.match(text[start_pos:end_pos])

                if m is not None:
                    index1 = int(m.group(1))
                    index2 = int(m.group(2))

                    # Links to tokens that were dropped are ignored
                    if index2 < token_count:
                        links.append((index1, tokens[index1], index2, tokens[index2]))

                # Skip over "][" to the beginning of the next link
                start_pos = end_pos + 2
                end_pos = text.find("]", start_pos)

            return links

        def print_output(tokens, links, ofile):
            """ Write the tokens (without the first one) and then the links. """
            for token in tokens[1:]:
                ofile.write(token + ' ')

            ofile.write('\n')

            for link in links:
                print(link[0], link[1], link[2], link[3], file=ofile)

            print('', file=ofile)

        m = postscript_re.match(text)

        if m is not None:
            # Honour the caller's flags; they used to be silently ignored
            # because parse_tokens() was called with its defaults.
            tokens = parse_tokens(m.group(1), caps=is_caps, rw=is_rwall, strip=is_strip)
            sorted_links = sorted(parse_links(m.group(2), tokens), key=lambda x: (x[0], x[2]))
            print_output(tokens, sorted_links, ofile)

    f = None
    o = None

    try:
        po = ParseOptions(min_null_count=0, max_null_count=999)
        po.linkage_limit = linkage_limit

        di = Dictionary(dict_path)

        f = open(corpus_path)
        o = sys.stdout if output_path is None else open(output_path, "w")

        for line in f:
            sent = Sentence(line, di, po)
            linkages = sent.parse()

            for linkage in linkages:
                parse_postscript(linkage.postscript().replace("\n", ""), o)

        # Prevent interleaving "Dictionary close" messages
        po = ParseOptions(verbosity=0)

    except (LG_Error, IOError) as err:
        # FileNotFoundError is a subclass of IOError (OSError), so it is
        # covered here; a separate handler after this one could never run.
        print(str(err))

    finally:
        if f is not None:
            f.close()

        if o is not None and o != sys.stdout:
            o.close()
def on_get(self, req, resp):
    """
    Handle HTTP GET request.

    Reads the 'lang', 'text', 'mode' and 'limit' request parameters, parses
    the text and writes a JSON document to the response. The document holds
    the effective parameters, the list of 'linkages' in the requested format
    and the list of 'errors'. Any failure, including a missing 'text'
    parameter, is reported through the 'errors' list; the response status is
    always HTTP 200.

    :param req: request object
    :param resp: response object
    """
    link_list = {}               # output dictionary
    link_list['errors'] = []     # list of errors if any
    link_list['linkages'] = []   # list of linkages in requested format

    try:
        # Logging IPs just in case
        logging.info("Connection from: " + (", ".join(req.access_route)))

        # Get input parameters
        lang = req.get_param('lang')
        text = req.get_param('text')
        mode = req.get_param_as_int('mode')
        limit = req.get_param_as_int('limit')

        # If no sentence is specified, then nothing to do...
        if text is None:
            logging.debug("Parameter 'text' is not specified. Nothing to parse.")
            raise falcon.HTTPBadRequest("Parameter 'text' is not specified. Nothing to parse.")

        # Use default language if no language is specified
        if lang is None:
            lang = DEFAULT_LANGUAGE
            logging.info("'lang' parameter is not specified in request. 'lang' is set to '" +
                         DEFAULT_LANGUAGE + "'")

        # Use default mode if no or improper value is specified
        if mode is None or mode < 0 or mode > MAX_MODE_VALUE:
            mode = DEFAULT_MODE
            logging.info("'mode' value is not properly specified in request. 'mode' is set to " +
                         str(mode))

        # Use default limit if no value is specified
        # or value is not within the range [1, MAX_LINKAGE_LIMIT]
        if limit is None or limit < 1 or limit > MAX_LINKAGE_LIMIT:
            limit = DEFAULT_LIMIT
            logging.info("'limit' value is not properly specified in request. 'limit' is set to " +
                         str(limit))

        # Save input parameters to the output dictionary, just in case someone needs them
        link_list['lang'] = lang
        link_list['mode'] = mode
        link_list['text'] = text
        link_list['limit'] = limit

        # Prefer the dictionary directory under the default path; otherwise
        # hand the bare language name over to the Dictionary constructor.
        dict_path = LG_DICT_DEFAULT_PATH + "/" + lang
        dict_path = lang if not os.path.isdir(dict_path) else dict_path
        logging.info("Dictionary path used: " + dict_path)

        # Invoke link-parser, if the parameters are correctly specified
        po = ParseOptions(verbosity=0, min_null_count=0, max_null_count=999)
        po.linkage_limit = limit
        sent = Sentence(text, Dictionary(dict_path), po)
        logging.debug("Sentence: '" + sent.text + "'")
        linkages = sent.parse()

        if mode == MOD_CONSTTREE:
            for linkage in linkages:
                link_list['linkages'].append(linkage.constituent_tree())
        elif mode == MOD_POSTSCRIPT:
            for linkage in linkages:
                link_list['linkages'].append(linkage.postscript())
        elif mode == MOD_ULL_SENT:
            for linkage in linkages:
                link_list['linkages'].append(get_ull_sentence(linkage.postscript()))
        else:  # MOD_DIAGRAM is default mode
            for linkage in linkages:
                link_list['linkages'].append(linkage.diagram())

        # Prevent interleaving "Dictionary close" messages
        po = ParseOptions(verbosity=0)

    except LG_Error as err:
        error_msg = "LG_Error: " + str(err)
        link_list["errors"].append(error_msg)
        logging.error(error_msg)

    except Exception as err:
        error_msg = "Exception: " + str(err)
        link_list["errors"].append(error_msg)
        logging.error(error_msg)

    # Everything that can be raised derives from BaseException, so this is
    # the last reachable handler; a bare `except:` after it would be dead code.
    except BaseException as err:
        error_msg = "BaseException: " + str(err)
        link_list["errors"].append(error_msg)
        logging.error(error_msg)

    # Return proper JSON output
    resp.body = json.dumps(link_list)
    resp.status = falcon.HTTP_200