Example #1
def Make_Random(sents):
    """
        Make random parses (from LG-parser "any"), to use as baseline
    """
    any_dict = Dictionary('any') # Opens dictionary only once
    po = ParseOptions(min_null_count=0, max_null_count=999)
    po.linkage_limit = 100
    options = 0x00000000 | BIT_STRIP #| BIT_ULL_IN
    options |= BIT_CAPS

    random_parses = []
    for sent in sents:
        num_words = len(sent)
        curr_parse = []
        # substitute words with numbers, as we only care about the parse tree
        fake_words = ["w{}".format(x) for x in range(1, num_words + 1)]
        # restore final dot to maintain --ignore functionality
        if sent[-1] == ".": 
            fake_words[-1] = "."
        sent_string = " ".join(fake_words)
        sentence = Sentence(sent_string, any_dict, po)
        linkages = sentence.parse()
        num_parses = len(linkages)  # check number of linkages in the sentence
        if num_parses > 0:
            idx = random.randint(0, num_parses - 1) # choose a random linkage index
            linkage = Linkage(idx, sentence, po._obj) # get the random linkage
            tokens, links = parse_postscript(linkage.postscript().replace("\n", ""), options)
            for link in links:
                llink = link[0]
                rlink = link[1]
                curr_parse.append([str(llink), tokens[llink], str(rlink), tokens[rlink]])

            random_parses.append(curr_parse)

    return random_parses
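
A minimal usage sketch for the function above. It assumes the linkgrammar Python bindings and the project's parse_postscript helper (plus the BIT_* constants from optconst.py) are importable; the tokenized sentences are invented for illustration:

import random
from linkgrammar import Dictionary, ParseOptions, Sentence, Linkage

# Tokenized input sentences (made up for this sketch)
sents = [["this", "is", "a", "test", "."],
         ["another", "short", "example"]]

for parse in Make_Random(sents):
    for left_idx, left_word, right_idx, right_word in parse:
        print(left_idx, left_word, right_idx, right_word)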
Example #2
def is_grammatical_sentence(sentence_text, language_dict, verbose=False):
    parse_options = ParseOptions(verbosity=0)

    parse_options.max_null_count = 999  # allow up to 999 unlinked (null) words
    parse_options.linkage_limit = 100  # max number of linkages to generate
    parse_options.max_parse_time = 10  # in seconds

    sent = Sentence(str(sentence_text), language_dict, parse_options)
    wrong_sentences = []

    linkages = None
    try:
        linkages = sent.parse()
    except LG_TimerExhausted:
        wrong_sentences.append(sentence_text)
        if verbose:
            print('Sentence too complex for parsing in {} seconds.'.format(
                parse_options.max_parse_time))
        return False

    if not linkages:
        wrong_sentences.append(sentence_text)
        if verbose:
            print('Error occurred - sentence ignored.')
        return False

    null_count = sent.null_count()
    if null_count == 0:
        return True
    else:
        wrong_sentences.append(sentence_text)
        return False
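
A hedged example of calling the checker; the dictionary name 'en' and the sample sentence are assumptions:

from linkgrammar import Dictionary

en_dict = Dictionary('en')  # open the dictionary once and reuse it across calls
# Expected True for a well-formed English sentence, False otherwise
print(is_grammatical_sentence("This is a sentence.", en_dict, verbose=True))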
Example #3
def make_random(sentences: Union[List[Tuple[str, set]], List[str]],
                options: int, **kwargs) -> List[Tuple[str, set]]:
    """
    Make random parses (from LG-parser "any"), to use as baseline

    :param sentences:       List of either tuples of sentence and set of links in case of .ull input file format
                            or strings in case of text input file format.
    :param options:         Integer representing parse options bit masks.
    :return:                List of parses (tuples of sentence and set of links)
    """
    any_dict = Dictionary('any')  # Opens dictionary only once
    po = ParseOptions(min_null_count=0, max_null_count=999)
    po.linkage_limit = int(kwargs.get("limit", 100))
    options |= BIT_STRIP
    options |= BIT_CAPS

    if isinstance(sentences[0], tuple):
        is_ull = True
    elif isinstance(sentences[0], str):
        is_ull = False
    else:
        raise ValueError(
            "The first argument should be either List[Tuple[str, set] or List[str]."
        )

    random_parses = []

    for sent in sentences:
        words = tokenize_sentence(sent[0] if is_ull else sent)
        num_words = len(words)

        # substitute words with numbers, to avoid token-splitting by LG "any"
        fake_words = [f"w{x}" for x in range(1, num_words)]
        # fake_words = [f"w{x}" for x in range(1, num_words + 1)]
        sent_string = " ".join(fake_words)
        sentence = Sentence(sent_string, any_dict, po)
        linkages = sentence.parse()
        num_parses = len(linkages)  # check number of linkages in the sentence

        links = []

        if num_parses > 0:
            idx = random.randint(0, num_parses - 1)  # choose a random linkage index
            linkage = Linkage(idx, sentence, po._obj)  # get the random linkage
            tokens, links = parse_postscript(
                linkage.postscript().replace("\n", ""), options)

            if num_words != len(tokens):
                logger.error(
                    f"Number of tokens mismatch:\n{words}\n{tokens}\nfor sentence:\n{sent[0]}"
                )

        random_parses.append((sent[0], set(links)))

    return random_parses
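
A sketch of the two accepted input shapes for make_random; it assumes tokenize_sentence and the module-level logger used above are available, and the link set is illustrative only:

# .ull-style input: (sentence, set of links) tuples
ull_input = [("This is a test.", {(0, 1), (1, 2)})]
# plain-text input: bare sentence strings
text_input = ["This is a test."]

for sentence, links in make_random(text_input, options=0, limit=50):
    print(sentence, links)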
Example #4
def Make_Random(sents, **kwargs):
    """
        Make random parses (from LG-parser "any"), to use as baseline
    """
    output_path = kwargs.get("output_path", os.environ["PWD"])

    any_dict = Dictionary('any')  # Opens dictionary only once
    po = ParseOptions(min_null_count=0, max_null_count=999)
    po.linkage_limit = 100
    options = 0x00000000 | BIT_STRIP  #| BIT_ULL_IN
    options |= BIT_CAPS

    random_parses = []
    for sent in sents:
        num_words = len(sent)
        curr_sent = sent[:]
        curr_sent.insert(0, "###LEFT-WALL###")
        curr_parse = []
        # substitute words with numbers, to avoid token-splitting by LG "any"
        fake_words = ["w{}".format(x) for x in range(1, num_words + 1)]
        sent_string = " ".join(fake_words)
        sentence = Sentence(sent_string, any_dict, po)
        linkages = sentence.parse()
        num_parses = len(linkages)  # check number of linkages in the sentence
        if num_parses > 0:
            idx = random.randint(0, num_parses - 1)  # choose a random linkage index
            linkage = Linkage(idx, sentence, po._obj)  # get the random linkage
            tokens, links = parse_postscript(
                linkage.postscript().replace("\n", ""), options)
            for link in links:
                llink = link[0]
                rlink = link[1]
                # attach words from sent, which are the actual words
                curr_parse.append([
                    str(llink), curr_sent[llink],
                    str(rlink), curr_sent[rlink]
                ])

            random_parses.append(curr_parse)

    Print_parses(sents, random_parses, f"{output_path}/random_parses.ull")

    return random_parses
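
Usage differs from Example #1 only in the optional output_path keyword, which tells Print_parses where to write the .ull file; the path below is a placeholder:

baseline = Make_Random(sents, output_path="/tmp")
# parses are also written to /tmp/random_parses.ull by Print_parses()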
Example #5
    def parse(self, dict_path: str, corpus_path: str, output_path: str, ref_path: str, options: int) \
            -> (ParseMetrics, ParseQuality):
        """
        Link Grammar API parser invocation routine.

        :param dict_path:       Dictionary file or directory path.
        :param corpus_path:     Corpus file or directory path.
        :param output_path:     Output file or directory path.
        :param ref_path:        Reference file or directory path.
        :param options:         Bit field. See `optconst.py` for details
        :return:                Tuple (ParseMetrics, ParseQuality)
        """
        input_file_handle = None
        output_file_handle = None

        ref_parses = []

        # Sentence statistics variables
        total_metrics, total_quality = ParseMetrics(), ParseQuality()

        sentence_count = 0                  # number of sentences in the corpus

        print("Info: Parsing a corpus file: '" + corpus_path + "'")
        print("Info: Using dictionary: '" + dict_path + "'")

        if output_path is not None:
            print("Info: Parses are saved in: '" + output_path+get_output_suffix(options) + "'")
        else:
            print("Info: Output file name is not specified. Parses are redirected to 'stdout'.")

        try:
            if options & BIT_PARSE_QUALITY and ref_path is not None:
                try:
                    data = load_ull_file(ref_path)
                    ref_parses = get_parses(data, (options & BIT_NO_LWALL) == BIT_NO_LWALL, False)

                except Exception as err:
                    print("Exception: " + str(err))

            link_line = re.compile(r"\A[0-9].+")

            po = ParseOptions(min_null_count=0, max_null_count=999)
            po.linkage_limit = self._linkage_limit

            di = Dictionary(dict_path)

            input_file_handle = open(corpus_path)
            output_file_handle = sys.stdout if output_path is None \
                                            else open(output_path+get_output_suffix(options), "w")

            for line in input_file_handle:

                # Filter out links when ULL parses are used as input
                if options & BIT_ULL_IN > 0 and link_line.match(line):
                    continue

                # Skip empty lines to get proper statistics estimation
                if len(line.strip()) < 1:  # or line.startswith("#") to also skip comments
                    continue

                # Tokenize and parse the sentence
                sent = Sentence(line, di, po)
                linkages = sent.parse()

                sent_metrics, sent_quality = ParseMetrics(), ParseQuality()
                linkage_count = 0

                for linkage in linkages:

                    # Only the first linkage is counted.
                    if linkage_count == 1:
                        break

                    if (options & BIT_OUTPUT_DIAGRAM) == BIT_OUTPUT_DIAGRAM:
                        print(linkage.diagram(), file=output_file_handle)

                    elif (options & BIT_OUTPUT_POSTSCRIPT) == BIT_OUTPUT_POSTSCRIPT:
                        print(linkage.postscript(), file=output_file_handle)

                    elif (options & BIT_OUTPUT_CONST_TREE) == BIT_OUTPUT_CONST_TREE:
                        print(linkage.constituent_tree(), file=output_file_handle)

                    elif not (options & BIT_OUTPUT):

                        tokens, links = parse_postscript(linkage.postscript().replace("\n", ""), options,
                                                         output_file_handle)

                        # Print ULL-formatted parses
                        print_output(tokens, links, options, output_file_handle)

                        # Calculate parseability
                        sent_metrics += parse_metrics(prepare_tokens(tokens, options))

                        # Calculate parse quality if the option is set
                        if options & BIT_PARSE_QUALITY and len(ref_parses):
                            sent_quality += parse_quality(get_link_set(tokens, links, options),
                                                          ref_parses[sentence_count][1])

                    linkage_count += 1

                assert sent_metrics.average_parsed_ratio <= 1.0, "sent_metrics.average_parsed_ratio > 1.0"
                assert sent_quality.quality <= 1.0, "sent_quality.quality > 1.0"

                total_metrics += sent_metrics
                total_quality += sent_quality

                # if not linkage_count:
                #     sent_metrics.completely_unparsed_ratio += 1

                sentence_count += 1

            total_metrics.sentences = sentence_count
            total_quality.sentences = sentence_count

            # Prevent interleaving "Dictionary close" messages
            ParseOptions(verbosity=0)

        except LG_DictionaryError as err:
            print("LG_DictionaryError: " + str(err))

        except LG_Error as err:
            print("LG_Error: " + str(err))

        except FileNotFoundError as err:
            print("FileNotFoundError: " + str(err))

        except IOError as err:
            print("IOError: " + str(err))

        finally:
            if input_file_handle is not None:
                input_file_handle.close()

            if output_file_handle is not None and output_file_handle != sys.stdout:
                output_file_handle.close()

            return total_metrics, total_quality
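
A hedged invocation sketch. The surrounding class is not shown above, so the class name LGApiParser is hypothetical; the flag combination uses the BIT_* constants from optconst.py referenced in the docstring:

parser = LGApiParser()  # hypothetical name for the surrounding class
metrics, quality = parser.parse("en", "corpus.txt", "corpus-out", "corpus.ull",
                                BIT_STRIP | BIT_CAPS | BIT_PARSE_QUALITY)
print(metrics.sentences, quality.sentences)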
Example #6
    def test_setting_linkage_limit(self):
        po = ParseOptions()
        po.linkage_limit = 3
        self.assertEqual(clg.parse_options_get_linkage_limit(po._obj), 3)
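The same check can be run outside a test class; here clg is assumed to be the low-level SWIG module shipped with the bindings (the exact import path may vary by link-grammar version):

from linkgrammar import ParseOptions
import linkgrammar.clinkgrammar as clg  # low-level SWIG module (assumed path)

po = ParseOptions()
po.linkage_limit = 3
assert clg.parse_options_get_linkage_limit(po._obj) == 3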
Example #7
def parse_file_with_api(dict_path, corpus_path, output_path, linkage_limit, options) \
        -> (float, float, float):
    """
    Link parser invocation routine.

    :param dict_path: name or path to the dictionary
    :param corpus_path: path to the test text file
    :param output_path: output file path
    :param linkage_limit: maximum number of linkages LG may return when parsing a sentence
    :param options: bit field. Use bit mask constants to set or reset one or multiple bits:
                BIT_CAPS  = 0x01    Keep capitalized letters in tokens untouched if set,
                                    make all lowercase otherwise.
                BIT_RWALL = 0x02    Keep all links with RIGHT-WALL if set, ignore them otherwise.
                BIT_STRIP = 0x04    Strip off token suffixes if set, remove them otherwise.
    :return: tuple (float, float, float):
                - percentage of totally parsed sentences;
                - percentage of completely unparsed sentences;
                - percentage of parsed sentences;
    """

    input_file_handle = None
    output_file_handle = None

    # Sentence statistics variables
    sent_full = 0  # number of fully parsed sentences
    sent_none = 0  # number of completely unparsed sentences
    sent_stat = 0.0  # average value of parsed sentences (linkages)

    line_count = 0  # number of sentences in the corpus

    print("Info: Parsing a corpus file: '" + corpus_path + "'")
    print("Info: Using dictionary: '" + dict_path + "'")

    if output_path is not None:
        print("Info: Parses are saved in: '" + output_path +
              get_output_suffix(options) + "'")
    else:
        print(
            "Info: Output file name is not specified. Parses are redirected to 'stdout'."
        )

    try:
        link_line = re.compile(r"\A[0-9].+")

        po = ParseOptions(min_null_count=0, max_null_count=999)
        po.linkage_limit = linkage_limit

        di = Dictionary(dict_path)

        input_file_handle = open(corpus_path)
        output_file_handle = sys.stdout if output_path is None else open(
            output_path + get_output_suffix(options), "w")

        for line in input_file_handle:

            # Filter out links when ULL parses are used as input
            if options & BIT_ULL_IN > 0 and link_line.match(line):
                continue

            # Skip empty lines to get proper statistics estimation and skip commented lines
            if len(line.strip()) < 1 or line.startswith("#"):
                continue

            sent = Sentence(line, di, po)
            linkages = sent.parse()

            # Number of linkages taken in statistics estimation
            linkage_countdown = 1

            temp_full = 0
            temp_none = 0
            temp_stat = 0.0

            for linkage in linkages:
                # ============================================================
                if (options & BIT_OUTPUT_DIAGRAM) == BIT_OUTPUT_DIAGRAM:
                    print(linkage.diagram(), file=output_file_handle)

                elif (options & BIT_OUTPUT_POSTSCRIPT) == BIT_OUTPUT_POSTSCRIPT:
                    print(linkage.postscript(), file=output_file_handle)

                elif (options & BIT_OUTPUT_CONST_TREE) == BIT_OUTPUT_CONST_TREE:
                    print(linkage.constituent_tree(), file=output_file_handle)

                tokens, links = parse_postscript(
                    linkage.postscript().replace("\n", ""), options,
                    output_file_handle)

                if not (options & BIT_OUTPUT):
                    print_output(tokens, links, options, output_file_handle)

                (f, n, s) = calc_stat(tokens)

                if linkage_countdown:
                    temp_full += f
                    temp_none += n
                    temp_stat += s
                    linkage_countdown -= 1

            if len(linkages) > 0:
                sent_full += temp_full
                sent_none += temp_none
                sent_stat += temp_stat / float(len(linkages))
            else:
                sent_none += 1

            line_count += 1

        # Prevent interleaving "Dictionary close" messages
        ParseOptions(verbosity=0)

    except LG_Error as err:
        print(str(err))

    except FileNotFoundError as err:
        print(str(err))

    except IOError as err:
        print(str(err))

    finally:
        if input_file_handle is not None:
            input_file_handle.close()

        if output_file_handle is not None and output_file_handle != sys.stdout:
            output_file_handle.close()

        return (0.0, 0.0, 0.0) if line_count == 0 else (
            float(sent_full) / float(line_count),
            float(sent_none) / float(line_count),
            sent_stat / float(line_count))
Example #8
                  action="store_true",
                  help="interactive mode after each result")

arg = args.parse_args()

try:
    lgdict = Dictionary(arg.lang)
except LG_Error:
    # The default error handler will print the error message
    args.print_usage()
    sys.exit(2)

po = ParseOptions(verbosity=arg.verbosity)

po.max_null_count = 999  # greater than the maximum allowed number of words
po.linkage_limit = 10000  # maximum number of linkages to generate
po.max_parse_time = 10  # the actual parse timeout may be about twice as long
po.spell_guess = bool(DISPLAY_GUESSES)
po.display_morphology = arg.morphology

while True:
    try:
        sentence_text = input(PROMPT)
    except EOFError:
        print("EOF")
        exit(0)

    if not is_stdin_atty and sentence_text:
        if sentence_text[0] == '%':
            continue
        if sentence_text[0] == '!':  # ignore user-settings for now
            continue
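
The excerpt ends inside the input loop. A minimal sketch of how such a loop typically proceeds with the options configured above; this continuation is an assumption, not the original code:

# Hedged continuation sketch (not the original code):
sent = Sentence(sentence_text, lgdict, po)
linkages = sent.parse()
for linkage in linkages:
    print(linkage.diagram())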
Example #9
    def on_get(self, req, resp):
        """ Handle HTTP GET request """
        link_list = {}  # output dictionary
        link_list['errors'] = []  # list of errors if any
        link_list['linkages'] = []  # list of linkages in requested format

        try:
            # logging IPs just in case
            logging.info("Connection from: " + (", ".join(req.access_route)))

            # Get input parameters
            lang = req.get_param('lang')
            text = req.get_param('text')
            mode = req.get_param_as_int('mode')
            limit = req.get_param_as_int('limit')

            # If no sentence is specified, then nothing to do...
            if text is None:
                logging.debug(
                    "Parameter 'text' is not specified. Nothing to parse.")
                raise falcon.HTTPBadRequest(
                    "Parameter 'text' is not specified. Nothing to parse.")

            # Use default language if no language is specified
            if lang is None:
                lang = DEFAULT_LANGUAGE
                logging.info(
                    "'lang' parameter is not specified in request. 'lang' is set to '"
                    + DEFAULT_LANGUAGE + "'")

            # Use default mode if no or improper value is specified
            if mode is None or mode < 0 or mode > MAX_MODE_VALUE:
                mode = DEFAULT_MODE
                logging.info(
                    "'mode' value is not properly specified in request. 'mode' is set to "
                    + str(mode))

            # Use default limit if no value is specified
            #   or value is not within the range [1, MAX_LINKAGE_LIMIT]
            if limit is None or limit < 1 or limit > MAX_LINKAGE_LIMIT:
                limit = DEFAULT_LIMIT
                logging.info(
                    "'limit' value is not properly specified in request. 'limit' is set to "
                    + str(limit))

            # Save input parameters to the output dictionary, just in case someone needs them
            link_list['lang'] = lang
            link_list['mode'] = mode
            link_list['text'] = text
            link_list['limit'] = limit

            # Use default dictionary if it was not explicitly specified
            dict_path = LG_DICT_DEFAULT_PATH + "/" + lang
            dict_path = lang if not os.path.isdir(dict_path) else dict_path

            logging.info("Dictionary path used: " + dict_path)

            # Invoke link-parser, if the parameters are correctly specified
            po = ParseOptions(verbosity=0,
                              min_null_count=0,
                              max_null_count=999)
            po.linkage_limit = limit

            sent = Sentence(text, Dictionary(dict_path), po)
            logging.debug("Sentence: '" + sent.text + "'")

            linkages = sent.parse()

            if mode == MOD_CONSTTREE:
                for linkage in linkages:
                    link_list['linkages'].append(linkage.constituent_tree())

            elif mode == MOD_POSTSCRIPT:
                for linkage in linkages:
                    link_list['linkages'].append(linkage.postscript())

            elif mode == MOD_ULL_SENT:
                for linkage in linkages:
                    link_list['linkages'].append(
                        get_ull_sentence(linkage.postscript()))

            else:  # MOD_DIAGRAM is default mode
                for linkage in linkages:
                    link_list['linkages'].append(linkage.diagram())

            # Prevent interleaving "Dictionary close" messages
            po = ParseOptions(verbosity=0)

        except LG_Error as err:
            error_msg = "LG_Error: " + str(err)
            link_list["errors"].append(error_msg)
            logging.error(error_msg)

        except Exception as err:
            error_msg = "Exception: " + str(err)
            link_list["errors"].append(error_msg)
            logging.error(error_msg)

        except BaseException as err:
            error_msg = "BaseException: " + str(err)
            link_list["errors"].append(error_msg)
            logging.error(error_msg)

        # Return proper JSON output
        resp.body = json.dumps(link_list)
        resp.status = falcon.HTTP_200
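
A hedged client-side request against this handler; the host, port, and route are deployment-specific assumptions:

import requests  # third-party HTTP client

resp = requests.get("http://localhost:8000/linkparser",  # route is an assumption
                    params={"lang": "en", "text": "This is a test.",
                            "mode": 0, "limit": 10})
for linkage in resp.json()["linkages"]:
    print(linkage)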
Example #10
def parse_text(dict_path, corpus_path, output_path, is_caps, is_rwall,
               is_strip, linkage_limit):
    """
    Link parser invocation routine

    :param dict_path: name or path to the dictionary
    :param corpus_path: path to the test text file
    :param output_path: output file path
    :param is_caps: keep capital letters in tokens if set to True, make all lowercase otherwise
    :param is_rwall: keep RIGHT-WALL tokens if set to True, remove them otherwise
    :param is_strip: strip off token suffixes if set to True, leave them untouched otherwise
    :param linkage_limit: maximum number of linkages LG may return when parsing a sentence
    :return:
    """
    def parse_postscript(text, ofile):
        """
        Parse postscript notation of the linkage.

        :param text: text string returned by Linkage.postscript() method.
        :param ofile: output file object reference
        :return:
        """
        def strip_token(token) -> str:
            """
            Strip off suffix substring
            :param token: token string
            :return: stripped token if a suffix found, the same token otherwise
            """
            if token.startswith(".") or token.startswith("["):
                return token

            pos = token.find("[")

            # If "." is not found
            if pos < 0:
                pos = token.find(".")

                # If "[" is not found or token starts with "[" return token as is.
                if pos <= 0:
                    return token

            return token[:pos:]

        def parse_tokens(text, caps=False, rw=False, strip=True) -> list:
            """
            Parse string of tokens

            :param text: token line extracted from the postscript notation string returned by the
                    Linkage.postscript() method.
            :param caps: boolean value indicating whether CAPS should be left untouched or lowercased
            :param rw: boolean value indicating whether RIGHT-WALL should be taken into account or ignored
            :param strip: boolean value indicating whether token suffixes should be stripped off or left untouched
            :return: list of tokens
            """
            tokens = []
            start_pos = 1
            end_pos = text.find(")")

            while end_pos - start_pos > 0:
                token = text[start_pos:end_pos:]

                if strip:
                    token = strip_token(token)

                if token.find("-WALL") > 0:
                    token = "###" + token + "###"
                else:
                    if not caps:
                        token = token.lower()

                if token.find("RIGHT-WALL") < 0:
                    tokens.append(token)
                elif rw:
                    tokens.append(token)

                start_pos = end_pos + 2
                end_pos = text.find(")", start_pos)

            return tokens

        def parse_links(text, tokens) -> list:
            """
            Parse links represented in postscript notation.

            :param text: link list in postscript notation
            :param tokens: list of tokens previously extracted from the postscript output
            :return: list of (index1, token1, index2, token2) tuples
            """
            links = []
            token_count = len(tokens)
            start_pos = 1
            end_pos = text.find("]")

            p = re.compile(r'(\d+)\s(\d+)\s\d+\s\(.+\)')

            while end_pos - start_pos > 0:
                m = p.match(text[start_pos:end_pos:])

                if m is not None:
                    index1 = int(m.group(1))
                    index2 = int(m.group(2))

                    if index2 < token_count:
                        links.append(
                            (index1, tokens[index1], index2, tokens[index2]))

                start_pos = end_pos + 2
                end_pos = text.find("]", start_pos)

            return links

        def print_output(tokens, links, ofile):

            for token in tokens[1:]:
                ofile.write(token + ' ')

            ofile.write('\n')

            for link in links:
                print(link[0], link[1], link[2], link[3], file=ofile)

            print('', file=ofile)

        # def parse_postscript(text, ofile):

        p = re.compile(r'\[(\(LEFT-WALL\).+\(RIGHT-WALL\))\]\[(.+)\]\[0\]')
        m = p.match(text)

        if m is not None:
            tokens = parse_tokens(m.group(1), caps=is_caps, rw=is_rwall, strip=is_strip)
            sorted_links = sorted(parse_links(m.group(2), tokens),
                                  key=lambda x: (x[0], x[2]))
            print_output(tokens, sorted_links, ofile)

    f = None
    o = None

    try:
        po = ParseOptions(min_null_count=0, max_null_count=999)
        po.linkage_limit = linkage_limit

        di = Dictionary(dict_path)

        f = open(corpus_path)
        o = sys.stdout if output_path is None else open(output_path, "w")

        for line in f:
            sent = Sentence(line, di, po)
            linkages = sent.parse()

            # print("Linkages:", len(linkages))

            for linkage in linkages:
                # print(linkage.diagram())
                parse_postscript(linkage.postscript().replace("\n", ""), o)

        # Prevent interleaving "Dictionary close" messages
        po = ParseOptions(verbosity=0)

    except LG_Error as err:
        print(str(err))

    except FileNotFoundError as err:
        print(str(err))

    except IOError as err:
        print(str(err))

    finally:
        if f is not None:
            f.close()

        if o is not None and o != sys.stdout:
            o.close()
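
A hedged invocation of parse_text; the paths are placeholders, and parses go to stdout because output_path is None:

parse_text("en", "corpus.txt", None,
           is_caps=False, is_rwall=False, is_strip=True,
           linkage_limit=100)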