Example #1
def is_grammatical_sentence(sentence_text, language_dict, verbose=False):
    parse_options = ParseOptions(verbosity=0)

    parse_options.max_null_count = 999  # allow up to 999 null-linked words
    parse_options.linkage_limit = 100  # max number of linkages to generate
    parse_options.max_parse_time = 10  # in seconds

    sent = Sentence(str(sentence_text), language_dict, parse_options)
    wrong_sentences = []

    linkages = None
    try:
        linkages = sent.parse()
    except LG_TimerExhausted:
        wrong_sentences.append(sentence_text)
        if verbose:
            print('Sentence too complex for parsing in {} seconds.'.format(
                parse_options.max_parse_time))
        return False

    if not linkages:
        wrong_sentences.append(sentence_text)
        if verbose:
            print('Error occurred - sentence ignored.')
        return False

    null_count = sent.null_count()
    if null_count == 0:
        return True
    else:
        wrong_sentences.append(sentence_text)
        return False
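A minimal invocation sketch, assuming the linkgrammar Python bindings are installed; Dictionary() with no argument opens the default English dictionary (as in Example #27):

from linkgrammar import Dictionary, ParseOptions, Sentence, LG_TimerExhausted

en_dict = Dictionary()  # the default dictionary is English
print(is_grammatical_sentence("This is a test.", en_dict, verbose=True))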
Example #2
def Make_Random(sents):
    """
        Make random parses (from LG-parser "any"), to use as baseline
    """
    any_dict = Dictionary('any') # Opens dictionary only once
    po = ParseOptions(min_null_count=0, max_null_count=999)
    po.linkage_limit = 100
    options = 0x00000000 | BIT_STRIP #| BIT_ULL_IN
    options |= BIT_CAPS

    random_parses = []
    for sent in sents:
        num_words = len(sent)
        curr_parse = []
        # substitute words with numbers, as we only care about the parse tree
        fake_words = ["w{}".format(x) for x in range(1, num_words + 1)]
        # restore final dot to maintain --ignore functionality
        if sent[-1] == ".": 
            fake_words[-1] = "."
        sent_string = " ".join(fake_words)
        sentence = Sentence(sent_string, any_dict, po)
        linkages = sentence.parse()
        num_parses = len(linkages) # check nbr of linkages in sentence
        if num_parses > 0:
            idx = random.randint(0, num_parses - 1) # choose a random linkage index
            linkage = Linkage(idx, sentence, po._obj) # get the random linkage
            tokens, links = parse_postscript(linkage.postscript().replace("\n", ""), options)
            for link in links:
                llink = link[0]
                rlink = link[1]
                curr_parse.append([str(llink), tokens[llink], str(rlink), tokens[rlink]])

            random_parses.append(curr_parse)

    return random_parses
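The same random-linkage selection can be sketched with the public iteration API alone, avoiding the Linkage(idx, sentence, po._obj) internals; this assumes the 'any' dictionary that ships with Link Grammar:

import random
from linkgrammar import Dictionary, ParseOptions, Sentence

po = ParseOptions(min_null_count=0, max_null_count=999)
any_dict = Dictionary('any')

linkages = list(Sentence("w1 w2 w3 w4", any_dict, po).parse())
if linkages:
    print(random.choice(linkages).diagram())  # one linkage chosen at random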
Example #3
 def check(self, sentence):
     result = False
     s = sentence.encode('ascii')
     sent = Sentence(s)
     if sent.parse() > 0:
         result = True
     
     del sent
     return result
Example #5
def make_random(sentences: Union[List[Tuple[str, set]], List[str]],
                options: int, **kwargs) -> List[Tuple[str, set]]:
    """
    Make random parses (from LG-parser "any"), to use as baseline

    :param sentences:       List of either tuples of sentence and set of links in case of .ull input file format
                            or strings in case of text input file format.
    :param options:         Integer representing parse options bit masks.
    :return:                List of parses (tuples of sentence and set of links)
    """
    any_dict = Dictionary('any')  # Opens dictionary only once
    po = ParseOptions(min_null_count=0, max_null_count=999)
    po.linkage_limit = int(kwargs.get("limit", 100))
    options |= BIT_STRIP
    options |= BIT_CAPS

    if isinstance(sentences[0], tuple):
        is_ull = True
    elif isinstance(sentences[0], str):
        is_ull = False
    else:
        raise ValueError(
            "The first argument should be either List[Tuple[str, set] or List[str]."
        )

    random_parses = []

    for sent in sentences:
        words = tokenize_sentence(sent[0] if is_ull else sent)
        num_words = len(words)

        # substitute words with numbers, to avoid token-splitting by LG "any"
        # NOTE: range(1, num_words) yields only num_words - 1 fake words; the
        # commented variant keeps the original word count (the token-count
        # check below logs any mismatch).
        fake_words = [f"w{x}" for x in range(1, num_words)]
        # fake_words = [f"w{x}" for x in range(1, num_words + 1)]
        sent_string = " ".join(fake_words)
        sentence = Sentence(sent_string, any_dict, po)
        linkages = sentence.parse()
        num_parses = len(linkages)  # check nbr of linkages in sentence

        links = []

        if num_parses > 0:
            idx = random.randint(0, num_parses -
                                 1)  # choose a random linkage index
            linkage = Linkage(idx, sentence, po._obj)  # get the random linkage
            tokens, links = parse_postscript(
                linkage.postscript().replace("\n", ""), options)

            if num_words != len(tokens):
                logger.error(
                    f"Number of tokens mismatch:\n{words}\n{tokens}\nfor sentence:\n{sent[0]}"
                )

        random_parses.append((sent[0], set(links)))

    return random_parses
Example #6
    def test_2_step_parsing_with_null_links(self):
        self.po = ParseOptions(min_null_count=0, max_null_count=0)

        sent = Sentence('about people attended', self.d, self.po)
        linkages = sent.parse()
        self.assertEqual(len(linkages), 0)
        self.po = ParseOptions(min_null_count=1, max_null_count=999)
        linkages = sent.parse(self.po)
        self.assertEqual(len(linkages), 2)
        self.assertEqual(linkages.next().unused_word_cost(), 1)
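Outside a test class, the same two-step strategy looks roughly like this: parse with nulls disallowed first, and re-parse with null links enabled only on failure (a sketch assuming the default English dictionary):

from linkgrammar import Dictionary, ParseOptions, Sentence

sent = Sentence('about people attended', Dictionary(),
                ParseOptions(min_null_count=0, max_null_count=0))
if len(sent.parse()) == 0:
    # no complete linkage; retry while allowing null-linked words
    linkages = sent.parse(ParseOptions(min_null_count=1, max_null_count=999))
    print(len(linkages))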
Example #8
 def test_that_sentence_can_be_destroyed_when_linkages_still_exist(self):
     """
     If the parser is deleted before the associated swig objects
     are, there will be bad pointer dereferences (as the swig
     objects will be pointing into freed memory).  This test ensures
     that parsers can be created and deleted without regard for
     the existence of PYTHON Linkage objects
     """
     s = Sentence('This is a sentence.', Dictionary(), ParseOptions())
     linkages = s.parse()
     del s
Example #10
 def test_null_link_range_starting_with_zero(self):
     """Test parsing with a minimal number of null-links, including 0."""
     # This sentence has no complete linkage. Validate that the library
     # doesn't mangle parsing with null-count>0 due to power_prune()'s
     # connector-discard optimization at null-count==0.  Without commit
     # "Allow calling classic_parse() with and w/o nulls", the number of
     # linkages here is 1 instead of 2 and the unused_word_cost is 5.
     self.po = ParseOptions(min_null_count=0, max_null_count=999)
     linkages = Sentence('about people attended', self.d, self.po).parse()
     self.assertEqual(len(linkages), 2)
     self.assertEqual(linkages.next().unused_word_cost(), 1)
Example #11
def parseString(s, debug, linkNum=0, file=sys.stdout):
    sent = Sentence(s)
    num_links = sent.parse()
    if num_links > linkNum:
        linkage = Linkage(linkNum, sent)
        if debug:
            linkage.print_diagram(sys.stderr)
        findProblems(linkage, sent, file)
        return linkage
    else:
        return None
Example #12
def is_grammar_OK(text):
    flag = 0
    sent = Sentence(text, en_dir, po)
    linkages = sent.parse()
    #linkage_stat(sent, 'English')
    for linkage in linkages:
        if '(RIGHT-WALL)]' in linkage.postscript():
            flag = 1

    if flag == 1:
        return True

    return False
Example #13
def parse_text(dict_path, corpus_path, output_id=OUTPUT_DIAGRAM):
    """
    Link parser invocation routine

    :param dict_path: name or path to the dictionary
    :param corpus_path: path to the test text file
    :param output_id: numeric identifier of one of the three possible output formats
    :return:
    """
    def s(q):
        """ Helper routine """
        return '' if q == 1 else 's'

    def linkage_stat(psent, lang, lkgs, sent_po):
        """
        This function mimics the linkage status report style of link-parser
        """
        random = ' of {} random linkages'. \
            format(clg.sentence_num_linkages_post_processed(psent._obj)) \
            if clg.sentence_num_linkages_found(psent._obj) > sent_po.linkage_limit else ''

        print('`{}: Found {} linkage{} ({}{} had no P.P. violations)`'. \
              format(lang, clg.sentence_num_linkages_found(psent._obj),
                     s(clg.sentence_num_linkages_found(psent._obj)), len(lkgs), random))

    po = ParseOptions(min_null_count=0, max_null_count=999)
    #po.linkage_limit = 3

    dict = Dictionary(dict_path)  # open the dictionary only once

    with open(corpus_path) as f:
        for line in f:
            print(line, end="")

            sent = Sentence(line, dict, po)
            linkages = sent.parse()
            linkage_stat(sent, dict_path, linkages, po)

            if output_id == OUTPUT_POSTSCRIPT:
                for linkage in linkages:
                    print(linkage.postscript())
            elif output_id == OUTPUT_CONSTITUENT:
                for linkage in linkages:
                    print(linkage.constituent_tree())
            else:
                for linkage in linkages:
                    print(linkage.diagram())

    # Prevent interleaving "Dictionary close" messages
    po = ParseOptions(verbosity=0)
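A possible invocation of parse_text(), assuming the OUTPUT_* constants defined alongside it; both the dictionary name and the corpus path are illustrative:

parse_text('en', 'corpus/poc-english.txt', OUTPUT_POSTSCRIPT)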
Example #14
 def checkSummary(self, sentence):
     logging.debug('checkSummary start')
     result = ""
     s = sentence.encode('ascii')
     sent = Sentence(s)
     lc = sent.parse()
     logging.debug('checkSummary sent parsed')
     if lc > 0:
         linkage = Linkage(0, sent)
         result = linkage.get_diagram()
         logging.debug('checkSummary OK')
         del linkage
     del sent
     logging.debug('checkSummary end')
     return result
Example #16
def Tokenize_Sentence(sentence, po):
    """
        Tokenizes the given sentence using LG grammar bindings
    """
    tokenized_sentence = ""
    sent = Sentence(sentence, any_dict, po)
    linkages = sent.parse()
    #linkage = Linkage(0, sent, po)
    for linkage in linkages:
        num_words = linkage.num_of_words()
        for i in range(num_words - 1):  # index shift ignores ###LEFT-WALL
            word_start = linkage.word_byte_start(i + 1)
            word_end = linkage.word_byte_end(i + 1)
            tokenized_sentence += sentence[word_start:word_end] + " "
        break
    tokenized_sentence += "\n"
    return tokenized_sentence
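A possible call site, assuming Tokenize_Sentence() and a module-level any_dict = Dictionary('any') live in the same module, as in the other examples here:

any_dict = Dictionary('any')
po = ParseOptions(min_null_count=0, max_null_count=999)
print(Tokenize_Sentence("Tokenization, as LG sees it!", po))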
Example #17
 def test_a_constiuents_after_parse_list(self):
     """
     Validate that the post-processing data of the first linkage is not
     getting clobbered by later linkages.
     """
     linkages = list(Sentence("This is a test.", self.d, self.po).parse())
     self.assertEqual(linkages[0].constituent_tree(),
             "(S (NP this.p)\n   (VP is.v\n       (NP a test.n))\n   .)\n")
Example #18
def linkage_testfile(self, lgdict, popt, desc = ''):
    """
    Reads sentences and their corresponding
    linkage diagrams / constituent printings.
    """
    if '' != desc:
        desc = desc + '-'
    testfile = clg.test_data_srcdir + "parses-" + desc + clg.dictionary_get_lang(lgdict._obj) + ".txt"
    parses = open(testfile)
    diagram = None
    sent = None
    lineno = 0
    for line in parses:
        lineno += 1
        # Lines starting with I are the input sentences
        if 'I' == line[0]:
            sent = line[1:]
            diagram = ""
            constituents = ""
            linkages = Sentence(sent, lgdict, popt).parse()
            linkage = linkages.next()

        # Generate the next linkage of the last input sentence
        if 'N' == line[0]:
            diagram = ""
            constituents = ""
            linkage = next(linkages, None)
            if not linkage:
                self.assertTrue(linkage, "{}:{}: Sentence has too few linkages".format(testfile, lineno))

        # Lines starting with O are the parse diagram
        # It ends with an empty line
        if 'O' == line[0]:
            diagram += line[1:]
            if '\n' == line[1] and 1 < len(diagram):
                self.assertEqual(linkage.diagram(), diagram)

        # Lines starting with C are the constituent output (type 1)
        # It ends with an empty line
        if 'C' == line[0]:
            if '\n' == line[1] and 1 < len(constituents):
                self.assertEqual(linkage.constituent_tree(), constituents)
            constituents += line[1:]
    parses.close()
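From the reader's logic, the parses-* test-file format can be inferred: an I line carries an input sentence, an N line requests the next linkage, O lines accumulate the diagram (a bare O line closes it and triggers the comparison), and C lines do the same for the constituent tree. An illustrative (not verbatim) excerpt:

IThis is a test.
O<diagram row 1>
O<diagram row 2>
O
C<constituent tree text>
C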
Example #19
def Make_Random(sents, **kwargs):
    """
        Make random parses (from LG-parser "any"), to use as baseline
    """
    output_path = kwargs.get("output_path", os.environ["PWD"])

    any_dict = Dictionary('any')  # Opens dictionary only once
    po = ParseOptions(min_null_count=0, max_null_count=999)
    po.linkage_limit = 100
    options = 0x00000000 | BIT_STRIP  #| BIT_ULL_IN
    options |= BIT_CAPS

    random_parses = []
    for sent in sents:
        num_words = len(sent)
        curr_sent = sent[:]
        curr_sent.insert(0, "###LEFT-WALL###")
        curr_parse = []
        # substitute words with numbers, to avoid token-splitting by LG "any"
        fake_words = ["w{}".format(x) for x in range(1, num_words + 1)]
        sent_string = " ".join(fake_words)
        sentence = Sentence(sent_string, any_dict, po)
        linkages = sentence.parse()
        num_parses = len(linkages)  # check nbr of linkages in sentence
        if num_parses > 0:
            idx = random.randint(0, num_parses -
                                 1)  # choose a random linkage index
            linkage = Linkage(idx, sentence, po._obj)  # get the random linkage
            tokens, links = parse_postscript(
                linkage.postscript().replace("\n", ""), options)
            for link in links:
                llink = link[0]
                rlink = link[1]
                # attach words from sent, which are the actual words
                curr_parse.append([
                    str(llink), curr_sent[llink],
                    str(rlink), curr_sent[rlink]
                ])

            random_parses.append(curr_parse)

    Print_parses(sents, random_parses, f"{output_path}/random_parses.ull")

    return random_parses
Example #20
    def test_getting_links(self):
        parses = open(clg.test_data_srcdir + "parses-en.txt")
        diagram = None
        sent = None
        for line in parses :
            # Lines starting with I are the input sentences
            if 'I' == line[0] :
                sent = line[1:]
                diagram = ""

            # Lines starting with O are the parse diagrams
            if 'O' == line[0] :
                diagram += line[1:]

                # We have a complete diagram if it ends with an
                # empty line.
                if '\n' == line[1] and 1 < len(diagram) :
                    linkage = Sentence(sent, self.d, self.po).parse().next()
                    self.assertEqual(linkage.diagram(), diagram)
        parses.close()
Example #21
def linkage_testfile(self, lgdict, popt, desc = ''):
    """
    Reads sentences and their corresponding
    linkage diagrams / constituent printings.
    """
    self.__class__.longMessage = True
    if '' != desc:
        desc = desc + '-'
    testfile = clg.test_data_srcdir + "parses-" + desc + clg.dictionary_get_lang(lgdict._obj) + ".txt"
    parses = open(testfile, "rb")
    diagram = None
    sent = None
    lineno = 0
    opcode_detected = 0 # function sanity check
    for line in parses:
        lineno += 1
        if sys.version_info > (3, 0):
            line = line.decode('utf-8')
        # Lines starting with I are the input sentences
        if 'I' == line[0]:
            opcode_detected += 1
            sent = line[1:]
            diagram = ""
            constituents = ""
            linkages = Sentence(sent, lgdict, popt).parse()
            linkage = next(linkages, None)
            self.assertTrue(linkage, "at {}:{}: Sentence has no linkages".format(testfile, lineno))

        # Generate the next linkage of the last input sentence
        if 'N' == line[0]:
            opcode_detected += 1
            diagram = ""
            constituents = ""
            linkage = next(linkages, None)
            self.assertTrue(linkage, "at {}:{}: Sentence has too few linkages".format(testfile, lineno))

        # Lines starting with O are the parse diagram
        # It ends with an empty line
        if 'O' == line[0]:
            opcode_detected += 1
            diagram += line[1:]
            if '\n' == line[1] and 1 < len(diagram):
                self.assertEqual(linkage.diagram(), diagram, "at {}:{}".format(testfile, lineno))

        # Lines starting with C are the constituent output (type 1)
        # It ends with an empty line
        if 'C' == line[0]:
            opcode_detected += 1
            if '\n' == line[1] and 1 < len(constituents):
                self.assertEqual(linkage.constituent_tree(), constituents, "at {}:{}".format(testfile, lineno))
            constituents += line[1:]
    parses.close()

    self.assertGreaterEqual(opcode_detected, 2, "Nothing has been done for " + testfile)
Example #23
def linkage_testfile(self, dict, popt, desc=''):
    """
    Reads sentences and their corresponding
    linkage diagrams / constituent printings.
    """
    if '' != desc:
        desc = desc + '-'
    parses = open(clg.test_data_srcdir + "parses-" + desc +
                  clg.dictionary_get_lang(dict._obj) + ".txt")
    diagram = None
    sent = None
    for line in parses:
        # Lines starting with I are the input sentences
        if 'I' == line[0]:
            sent = line[1:]
            diagram = ""
            constituents = ""
            linkages = Sentence(sent, dict, popt).parse()
            linkage = linkages.next()

        # Generate the next linkage of the last input sentence
        if 'N' == line[0]:
            diagram = ""
            constituents = ""
            linkage = linkages.next()

        # Lines starting with O are the parse diagram
        # It ends with an empty line
        if 'O' == line[0]:
            diagram += line[1:]
            if '\n' == line[1] and 1 < len(diagram):
                self.assertEqual(linkage.diagram(), diagram)

        # Lines starting with C are the constituent output (type 1)
        # It ends with an empty line
        if 'C' == line[0]:
            if '\n' == line[1] and 1 < len(constituents):
                self.assertEqual(linkage.constituent_tree(), constituents)
            constituents += line[1:]
    parses.close()
Example #25
    def parse_cbf(self, line, callback, param1=None, param2=None):
        """ Parse sentence, using user defined callback function for processing linkages """

        if hasattr(self, '_obj_dict') and self._obj_dict is not None and \
                hasattr(self, '_parse_options') and self._parse_options is not None:

            linkages = Sentence(line, self._obj_dict,
                                self._parse_options).parse()

            if callback is not None:
                callback(linkages, param1, param2)
        else:
            raise LGClientError(
                "Sentence cannot be parsed because the Dictionary object was not created."
            )
Example #26
    def parse(self, line, callback):
        """
            Parse sentence, using user defined callback class
            for processing linkages
        """
        if hasattr(self, '_obj_dict') and self._obj_dict is not None and \
                hasattr(self, '_parse_options') and self._parse_options is not None:

            linkages = Sentence(line, self._obj_dict,
                                self._parse_options).parse()

            if callback is not None and isinstance(callback, LGClientCallback):
                callback.on_linkages(linkages)
            else:
                raise LGClientError(
                    "Error: 'callback' is not an instance of LGClientCallback")
Example #27
def linkage_stat(psent, lang):
    """
    This function mimics the linkage status report style of link-parser
    """
    random = ' of {0} random linkages'. \
             format(psent.num_linkages_post_processed()) \
             if psent.num_valid_linkages() < psent.num_linkages_found() else ''

    print '{0}: Found {1} linkage{2} ({3}{4} had no P.P. violations)'. \
          format(lang, psent.num_linkages_found(),
                 s(psent.num_linkages_found()),
                 psent.num_valid_linkages(), random)


# English is the default language
sent = Sentence("This is a test.", Dictionary(), po)
linkages = sent.parse()
linkage_stat(sent, 'English')
for linkage in linkages:
    desc(linkage)

# Russian
sent = Sentence("это большой тест.", Dictionary('ru'), po)
linkages = sent.parse()
linkage_stat(sent, 'Russian')
for linkage in linkages:
    desc(linkage)

# Turkish
sent = Sentence("çok şişman adam geldi", Dictionary('tr'), po)
linkages = sent.parse()
Example #28
    sys.exit(2)

po = ParseOptions(verbosity=arg.verbosity)

po.max_null_count = 999  # > allowed maximum number of words
po.max_parse_time = 10   # the actual parse timeout may be about twice as long
po.spell_guess = True if DISPLAY_GUESSES else False
po.display_morphology = arg.morphology

# read sentences in a loop
while True:
    sentence_text = get_input("sentence-check: ")

    if sentence_text.strip() == '':
        continue
    sent = Sentence(str(sentence_text), lgdict, po)
    try:
        linkages = sent.parse()
    except LG_TimerExhausted:
        print('Sentence too complex for parsing in ~{} second{}.'.format(
            po.max_parse_time,nsuffix(po.max_parse_time)))
        continue
    if not linkages:
        print('Error occurred - sentence ignored.')
        continue
    if len(linkages) <= 0:
        print('Cannot parse the input sentence')
        continue
    null_count = sent.null_count()
    if null_count == 0:
        print("Sentence parsed OK", end='')
Example #29
    args.print_usage()
    sys.exit(2)

po = ParseOptions(verbosity=arg.verbosity)

po.max_null_count = 999  # > allowed maximum number of words
po.max_parse_time = 10  # the actual parse timeout may be about twice as long
po.spell_guess = True if DISPLAY_GUESSES else False
po.display_morphology = arg.morphology

print("Enter sentences:")
# iter(): avoid python2 input buffering
for sentence_text in iter(sys.stdin.readline, ''):
    if sentence_text.strip() == '':
        continue
    sent = Sentence(str(sentence_text), lgdict, po)
    try:
        linkages = sent.parse()
    except LG_TimerExhausted:
        print('Sentence too complex for parsing in ~{} second{}.'.format(
            po.max_parse_time, nsuffix(po.max_parse_time)))
        continue
    if not linkages:
        print('Error occurred - sentence ignored.')
        continue
    if len(linkages) <= 0:
        print('Cannot parse the input sentence')
        continue
    null_count = sent.null_count()
    if null_count == 0:
        print("Sentence parsed OK")
Example #30
def linkage_stat(psent, lang):
    """
    This function mimics the linkage status report style of link-parser
    """
    random = ' of {0} random linkages'. \
             format(psent.num_linkages_post_processed()) \
             if psent.num_valid_linkages() < psent.num_linkages_found() else ''

    print '{0}: Found {1} linkage{2} ({3}{4} had no P.P. violations)'. \
          format(lang, psent.num_linkages_found(),
                 s(psent.num_linkages_found()),
                 psent.num_valid_linkages(), random)


# English is the default language
sent = Sentence("This is a test.", Dictionary(), po)
linkages = sent.parse()
linkage_stat(sent, 'English')
for linkage in linkages:
    desc(linkage)

# Russian
sent = Sentence("это большой тест.", Dictionary('ru'), po)
linkages = sent.parse()
linkage_stat(sent, 'Russian')
for linkage in linkages:
    desc(linkage)

# Turkish
po = ParseOptions(islands_ok=True, max_null_count=1, display_morphology=True)
sent = Sentence("Senin ne istediğini bilmiyorum", Dictionary('tr'), po)
Example #31
    """
    This function mimics the linkage status report style of link-parser
    """
    random = ' of {0} random linkages'. \
             format(psent.num_linkages_post_processed()) \
             if psent.num_valid_linkages() < psent.num_linkages_found() else ''

    print ('{0}: Found {1} linkage{2} ({3}{4} had no P.P. violations)'. \
          format(lang, psent.num_linkages_found(),
                 s(psent.num_linkages_found()),
                 psent.num_valid_linkages(), random))



# English is the default language
sent = Sentence("This is a test.", Dictionary(), po)
linkages = sent.parse()
linkage_stat(sent, 'English')
for linkage in linkages:
    desc(linkage)

# Russian
sent = Sentence("Целью курса является обучение магистрантов основам построения и функционирования программного обеспечения сетей ЭВМ.", Dictionary('ru'), po)
linkages = sent.parse()
linkage_stat(sent, 'Russian')
for linkage in linkages:
    desc(linkage)

# Turkish
po = ParseOptions(islands_ok=True, max_null_count=1, display_morphology=True)
sent = Sentence("Senin ne istediğini bilmiyorum", Dictionary('tr'), po)
Example #32
    def parse(self, dict_path: str, corpus_path: str, output_path: str, ref_path: str, options: int) \
            -> (ParseMetrics, ParseQuality):
        """
        Link Grammar API parser invocation routine.

        :param dict_path:       Dictionary file or directory path.
        :param corpus_path:     Corpus file or directory path.
        :param output_path:     Output file or directory path.
        :param ref_path:        Reference file or directory path.
        :param options:         Bit field. See `optconst.py` for details
        :return:                Tuple (ParseMetrics, ParseQuality)
        """
        input_file_handle = None
        output_file_handle = None

        ref_parses = []

        # Sentence statistics variables
        total_metrics, total_quality = ParseMetrics(), ParseQuality()

        sentence_count = 0                  # number of sentences in the corpus

        print("Info: Parsing a corpus file: '" + corpus_path + "'")
        print("Info: Using dictionary: '" + dict_path + "'")

        if output_path is not None:
            print("Info: Parses are saved in: '" + output_path+get_output_suffix(options) + "'")
        else:
            print("Info: Output file name is not specified. Parses are redirected to 'stdout'.")

        try:
            if options & BIT_PARSE_QUALITY and ref_path is not None:
                try:
                    data = load_ull_file(ref_path)
                    ref_parses = get_parses(data, (options & BIT_NO_LWALL) == BIT_NO_LWALL, False)

                except Exception as err:
                    print("Exception: " + str(err))

            link_line = re.compile(r"\A[0-9].+")

            po = ParseOptions(min_null_count=0, max_null_count=999)
            po.linkage_limit = self._linkage_limit

            di = Dictionary(dict_path)

            input_file_handle = open(corpus_path)
            output_file_handle = sys.stdout if output_path is None \
                                            else open(output_path+get_output_suffix(options), "w")

            for line in input_file_handle:

                # Filter out links when ULL parses are used as input
                if options & BIT_ULL_IN > 0 and link_line.match(line):
                    continue

                # Skip empty lines to get proper statistics estimation and skip commented lines
                if len(line.strip()) < 1:  # or line.startswith("#"):
                    continue

                # Tokenize and parse the sentence
                sent = Sentence(line, di, po)
                linkages = sent.parse()

                sent_metrics, sent_quality = ParseMetrics(), ParseQuality()
                linkage_count = 0

                for linkage in linkages:

                    # Only the first linkage is counted.
                    if linkage_count == 1:
                        break

                    if (options & BIT_OUTPUT_DIAGRAM) == BIT_OUTPUT_DIAGRAM:
                        print(linkage.diagram(), file=output_file_handle)

                    elif (options & BIT_OUTPUT_POSTSCRIPT) == BIT_OUTPUT_POSTSCRIPT:
                        print(linkage.postscript(), file=output_file_handle)

                    elif (options & BIT_OUTPUT_CONST_TREE) == BIT_OUTPUT_CONST_TREE:
                        print(linkage.constituent_tree(), file=output_file_handle)

                    elif not (options & BIT_OUTPUT):

                        tokens, links = parse_postscript(linkage.postscript().replace("\n", ""), options,
                                                         output_file_handle)

                        # Print ULL formated parses
                        print_output(tokens, links, options, output_file_handle)

                        # Calculate parseability
                        sent_metrics += parse_metrics(prepare_tokens(tokens, options))

                        # Calculate parse quality if the option is set
                        if options & BIT_PARSE_QUALITY and len(ref_parses):
                            sent_quality += parse_quality(get_link_set(tokens, links, options),
                                                          ref_parses[sentence_count][1])

                    linkage_count += 1

                assert sent_metrics.average_parsed_ratio <= 1.0, "sent_metrics.average_parsed_ratio > 1.0"
                assert sent_quality.quality <= 1.0, "sent_quality.quality > 1.0"

                total_metrics += sent_metrics
                total_quality += sent_quality

                # if not linkage_count:
                #     sent_metrics.completely_unparsed_ratio += 1

                sentence_count += 1

            total_metrics.sentences = sentence_count
            total_quality.sentences = sentence_count

            # Prevent interleaving "Dictionary close" messages
            ParseOptions(verbosity=0)

        except LG_DictionaryError as err:
            print("LG_DictionaryError: " + str(err))

        except LG_Error as err:
            print("LG_Error: " + str(err))

        # FileNotFoundError subclasses OSError/IOError, so it must be caught first
        except FileNotFoundError as err:
            print("FileNotFoundError: " + str(err))

        except IOError as err:
            print("IOError: " + str(err))

        finally:
            if input_file_handle is not None:
                input_file_handle.close()

            if output_file_handle is not None and output_file_handle != sys.stdout:
                output_file_handle.close()

            return total_metrics, total_quality
Example #33
    def on_get(self, req, resp):
        """ Handle HTTP GET request """
        link_list = {}  # output dictionary
        link_list['errors'] = []  # list of errors if any
        link_list['linkages'] = []  # list of linkages in requested format

        try:
            # logging IPs just in case
            logging.info("Connection from: " + (", ".join(req.access_route)))

            # Get input parameters
            lang = req.get_param('lang')
            text = req.get_param('text')
            mode = req.get_param_as_int('mode')
            limit = req.get_param_as_int('limit')

            # If no sentence is specified, then nothing to do...
            if text is None:
                logging.debug(
                    "Parameter 'text' is not specified. Nothing to parse.")
                raise falcon.HTTPBadRequest(
                    "Parameter 'text' is not specified. Nothing to parse.")

            # Use default language if no language is specified
            if lang is None:
                lang = DEFAULT_LANGUAGE
                logging.info(
                    "'lang' parameter is not specified in request. 'lang' is set to '"
                    + DEFAULT_LANGUAGE + "'")

            # Use default mode if no or improper value is specified
            if mode is None or mode < 0 or mode > MAX_MODE_VALUE:
                mode = DEFAULT_MODE
                logging.info(
                    "'mode' value is not properly specified in request. 'mode' is set to "
                    + str(mode))

            # Use default limit if no value is specified
            #   or value is not within the range [1, MAX_LINKAGE_LIMIT]
            if limit is None or limit < 1 or limit > MAX_LINKAGE_LIMIT:
                limit = DEFAULT_LIMIT
                logging.info(
                    "'limit' value is not properly specified in request. 'limit' is set to "
                    + str(limit))

            # Save input parameters to the output dictionary, just in case someone needs them
            link_list['lang'] = lang
            link_list['mode'] = mode
            link_list['text'] = text
            link_list['limit'] = limit

            # Use default dictionary if it was not explicitly specified
            dict_path = LG_DICT_DEFAULT_PATH + "/" + lang
            dict_path = lang if not os.path.isdir(dict_path) else dict_path

            logging.info("Dictionary path used: " + dict_path)

            # Invoke link-parser, if the parameters are correctly specified
            po = ParseOptions(verbosity=0,
                              min_null_count=0,
                              max_null_count=999)
            po.linkage_limit = limit

            sent = Sentence(text, Dictionary(dict_path), po)
            logging.debug("Sentence: '" + sent.text + "'")

            linkages = sent.parse()

            if mode == MOD_CONSTTREE:
                for linkage in linkages:
                    link_list['linkages'].append(linkage.constituent_tree())

            elif mode == MOD_POSTSCRIPT:
                for linkage in linkages:
                    link_list['linkages'].append(linkage.postscript())

            elif mode == MOD_ULL_SENT:
                for linkage in linkages:
                    link_list['linkages'].append(
                        get_ull_sentence(linkage.postscript()))

            else:  # MOD_DIAGRAM is default mode
                for linkage in linkages:
                    link_list['linkages'].append(linkage.diagram())

            # Prevent interleaving "Dictionary close" messages
            po = ParseOptions(verbosity=0)

        except LG_Error as err:
            error_msg = "LG_Error: " + str(err)
            link_list["errors"].append(error_msg)
            logging.error(error_msg)

        except Exception as err:
            error_msg = "Exception: " + str(err)
            link_list["errors"].append(error_msg)
            logging.error(error_msg)

        except BaseException as err:
            error_msg = "BaseException: " + str(err)
            link_list["errors"].append(error_msg)
            logging.error(error_msg)

        except:
            error_msg = "Unhandled exception."
            link_list["errors"].append(error_msg)
            logging.error(error_msg)

        # Return proper JSON output
        resp.body = json.dumps(link_list)
        resp.status = falcon.HTTP_200
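Once the Falcon resource is mounted, the endpoint can be exercised with a plain GET; the route and port below are illustrative assumptions:

import requests

resp = requests.get("http://localhost:8000/linkparser",  # route/port assumed
                    params={"lang": "en", "text": "This is a test.",
                            "mode": 0, "limit": 10})
print(resp.json()["linkages"])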
Example #34
 def parse_sent(self, text):
     return list(Sentence(text, self.d, self.po).parse())
Example #35
 def parse_sent(self, text, po=ParseOptions()):
     return list(Sentence(text, self.d, po).parse())
Example #36
def linkage_stat(psent, lang, lkgs, sent_po):
    """
    This function mimics the linkage status report style of link-parser
    """
    random = ' of {} random linkages'. \
        format(clg.sentence_num_linkages_post_processed(psent._obj)) \
        if clg.sentence_num_linkages_found(psent._obj) > sent_po.linkage_limit else ''

    print('{}: Found {} linkage{} ({}{} had no P.P. violations)'.
          format(lang, clg.sentence_num_linkages_found(psent._obj),
                 s(clg.sentence_num_linkages_found(psent._obj)), len(lkgs), random))


en_lines = [
    'This is a test.',
    'I feel is the exciter than other things', # from issue #303 (10 linkages)
]

po = ParseOptions(min_null_count=0, max_null_count=999)
#po.linkage_limit = 3

# English is the default language
en_dir = Dictionary() # open the dictionary only once
for text in en_lines:
    sent = Sentence(text, en_dir, po)
    linkages = sent.parse()
    linkage_stat(sent, 'English', linkages, po)
    for linkage in linkages:
        desc(linkage)

# # Russian
# sent = Sentence("Целью курса является обучение магистрантов основам построения и функционирования программного обеспечения сетей ЭВМ.", Dictionary('ru'), po)
# linkages = sent.parse()
# linkage_stat(sent, 'Russian', linkages, po)
# for linkage in linkages:
#     desc(linkage)
#
# # Turkish
# po = ParseOptions(islands_ok=True, max_null_count=1, display_morphology=True, verbosity=1)
# sent = Sentence("Senin ne istediğini bilmiyorum", Dictionary('tr'), po)
Example #37
def linkage_stat(psent, lang, lkgs, sent_po):
    """
    This function mimics the linkage status report style of link-parser
    """
    random = ' of {} random linkages'.format(
        clg.sentence_num_linkages_post_processed(psent._obj)) \
        if clg.sentence_num_linkages_found(psent._obj) > sent_po.linkage_limit else ''

    print('{}: Found {} linkage{} ({}{} had no P.P. violations)'.format(
        lang, clg.sentence_num_linkages_found(psent._obj),
        s(clg.sentence_num_linkages_found(psent._obj)), len(lkgs), random))


en_lines = [
    'This is a test.',
    'I feel is the exciter than other things',  # from issue #303 (10 linkages)
]

po = ParseOptions(min_null_count=0, max_null_count=999)
#po.linkage_limit = 3

# English is the default language
en_dir = Dictionary()  # open the dictionary only once
for text in en_lines:
    sent = Sentence(text, en_dir, po)
    linkages = sent.parse()
    linkage_stat(sent, 'English', linkages, po)
    for linkage in linkages:
        desc(linkage)

# Russian
sent = Sentence(
    "Целью курса является обучение магистрантов основам построения и функционирования программного обеспечения сетей ЭВМ.",
    Dictionary('ru'), po)
linkages = sent.parse()
linkage_stat(sent, 'Russian', linkages, po)
for linkage in linkages:
    desc(linkage)

# Turkish
Example #38
def parse_file_with_api(dict_path, corpus_path, output_path, linkage_limit, options) \
        -> (float, float, float):
    """
    Link parser invocation routine.

    :param dict_path: name or path to the dictionary
    :param corpus_path: path to the test text file
    :param output_path: output file path
    :param linkage_limit: maximum number of linkages LG may return when parsing a sentence
    :param options: bit field. Use bit mask constants to set or reset one or multiple bits:
                BIT_CAPS  = 0x01    Keep capitalized letters in tokens untouched if set,
                                    make all lowercase otherwise.
                BIT_RWALL = 0x02    Keep all links with RIGHT-WALL if set, ignore them otherwise.
                BIT_STRIP = 0x04    Strip off token suffixes if set, remove them otherwise.
    :return: tuple (float, float, float):
                - percentage of totally parsed sentences;
                - percentage of completely unparsed sentences;
                - percentage of parsed sentences;
    """

    input_file_handle = None
    output_file_handle = None

    # Sentence statistics variables
    sent_full = 0  # number of fully parsed sentences
    sent_none = 0  # number of completely unparsed sentences
    sent_stat = 0.0  # average value of parsed sentences (linkages)

    line_count = 0  # number of sentences in the corpus

    print("Info: Parsing a corpus file: '" + corpus_path + "'")
    print("Info: Using dictionary: '" + dict_path + "'")

    if output_path is not None:
        print("Info: Parses are saved in: '" + output_path +
              get_output_suffix(options) + "'")
    else:
        print(
            "Info: Output file name is not specified. Parses are redirected to 'stdout'."
        )

    try:
        link_line = re.compile(r"\A[0-9].+")

        po = ParseOptions(min_null_count=0, max_null_count=999)
        po.linkage_limit = linkage_limit

        di = Dictionary(dict_path)

        input_file_handle = open(corpus_path)
        output_file_handle = sys.stdout if output_path is None else open(
            output_path + get_output_suffix(options), "w")

        for line in input_file_handle:

            # Filter out links when ULL parses are used as input
            if options & BIT_ULL_IN > 0 and link_line.match(line):
                continue

            # Skip empty lines to get proper statistics estimation and skip commented lines
            if len(line.strip()) < 1 or line.startswith("#"):
                continue

            sent = Sentence(line, di, po)
            linkages = sent.parse()

            # Number of linkages taken in statistics estimation
            linkage_countdown = 1

            temp_full = 0
            temp_none = 0
            temp_stat = 0.0

            for linkage in linkages:
                #=============================================================================================================
                if (options & BIT_OUTPUT_DIAGRAM) == BIT_OUTPUT_DIAGRAM:
                    print(linkage.diagram(), file=output_file_handle)

                elif (options
                      & BIT_OUTPUT_POSTSCRIPT) == BIT_OUTPUT_POSTSCRIPT:
                    print(linkage.postscript(), file=output_file_handle)

                elif (options
                      & BIT_OUTPUT_CONST_TREE) == BIT_OUTPUT_CONST_TREE:
                    print(linkage.constituent_tree(), file=output_file_handle)

                tokens, links = parse_postscript(
                    linkage.postscript().replace("\n", ""), options,
                    output_file_handle)

                if not (options & BIT_OUTPUT):
                    print_output(tokens, links, options, output_file_handle)

                (f, n, s) = calc_stat(tokens)

                if linkage_countdown:
                    temp_full += f
                    temp_none += n
                    temp_stat += s
                    linkage_countdown -= 1

            if len(linkages) > 0:
                sent_full += temp_full
                sent_none += temp_none
                sent_stat += temp_stat / float(len(linkages))
            else:
                sent_none += 1

            line_count += 1

        # Prevent interleaving "Dictionary close" messages
        ParseOptions(verbosity=0)

    except LG_Error as err:
        print(str(err))

    # FileNotFoundError subclasses OSError/IOError, so it must be caught first
    except FileNotFoundError as err:
        print(str(err))

    except IOError as err:
        print(str(err))

    finally:
        if input_file_handle is not None:
            input_file_handle.close()

        if output_file_handle is not None and output_file_handle != sys.stdout:
            output_file_handle.close()

        return (0.0, 0.0,
                0.0) if line_count == 0 else (float(sent_full) /
                                              float(line_count),
                                              float(sent_none) /
                                              float(line_count),
                                              sent_stat / float(line_count))
Example #39
def parse_text(dict_path, corpus_path, output_path, is_caps, is_rwall,
               is_strip, linkage_limit):
    """
    Link parser invocation routine

    :param dict_path: name or path to the dictionary
    :param corpus_path: path to the test text file
    :param output_path: output file path
    :param is_caps: boolean value tells to leave CAPS in tokens if set to True, make all lowercase otherwise
    :param is_rwall: boolean value tells to leave RIGHT-WALL tokens if set to True, remove otherwise
    :param is_strip: boolean value tells to strip off token suffixes if set to True, remove otherwise
    :param linkage_limit: maximum number of linkages LG may return when parsing a sentence
    :return:
    """
    def parse_postscript(text, ofile):
        """
        Parse postscript notation of the linkage.

        :param text: text string returned by the Linkage.postscript() method.
        :param ofile: output file object reference
        :return:
        """
        def strip_token(token) -> str:
            """
            Strip off suffix substring
            :param token: token string
            :return: stripped token if a suffix found, the same token otherwise
            """
            if token.startswith(".") or token.startswith("["):
                return token

            pos = token.find("[")

            # If "." is not found
            if pos < 0:
                pos = token.find(".")

                # If "[" is not found or token starts with "[" return token as is.
                if pos <= 0:
                    return token

            return token[:pos:]

        def parse_tokens(text, caps=False, rw=False, strip=True) -> list:
            """
            Parse string of tokens
            :param text: token line extracted from the postscript notation string returned by the
                    Linkage.postscript() method.
            :param caps: boolean value indicating whether CAPS should be left untouched or lowercased
            :param rw: boolean value indicating whether RIGHT-WALL should be taken into account or ignored
            :param strip: boolean value indicating whether token suffixes should be stripped off or left
                    untouched
            :return: list of tokens
            """
            tokens = []
            start_pos = 1
            end_pos = text.find(")")

            while end_pos - start_pos > 0:
                token = text[start_pos:end_pos:]

                if strip:
                    token = strip_token(token)

                if token.find("-WALL") > 0:
                    token = "###" + token + "###"
                else:
                    if not caps:
                        token = token.lower()

                if token.find("RIGHT-WALL") < 0:
                    tokens.append(token)
                elif rw:
                    tokens.append(token)

                start_pos = end_pos + 2
                end_pos = text.find(")", start_pos)

            return tokens

        def parse_links(text, tokens) -> list:
            """
            Parse links represented in postscript notation.

            :param text: link list in postscript notation
            :param tokens: list of tokens previously extracted from the postscript output
            :return: list of link tuples (index1, token1, index2, token2)
            """
            links = []
            token_count = len(tokens)
            start_pos = 1
            end_pos = text.find("]")

            p = re.compile(r'(\d+)\s(\d+)\s\d+\s\(.+\)')

            while end_pos - start_pos > 0:
                m = p.match(text[start_pos:end_pos:])

                if m is not None:
                    index1 = int(m.group(1))
                    index2 = int(m.group(2))

                    if index2 < token_count:
                        links.append(
                            (index1, tokens[index1], index2, tokens[index2]))

                start_pos = end_pos + 2
                end_pos = text.find("]", start_pos)

            return links

        def print_output(tokens, links, ofile):

            for token in tokens[1:]:
                ofile.write(token + ' ')

            ofile.write('\n')

            for link in links:
                print(link[0], link[1], link[2], link[3], file=ofile)

            print('', file=ofile)

        # def parse_postscript(text, ofile):

        p = re.compile(r'\[(\(LEFT-WALL\).+\(RIGHT-WALL\))\]\[(.+)\]\[0\]')
        m = p.match(text)

        if m is not None:
            tokens = parse_tokens(m.group(1))
            sorted_links = sorted(parse_links(m.group(2), tokens),
                                  key=lambda x: (x[0], x[2]))
            print_output(tokens, sorted_links, ofile)

    f = None
    o = None

    try:
        po = ParseOptions(min_null_count=0, max_null_count=999)
        po.linkage_limit = linkage_limit

        di = Dictionary(dict_path)

        f = open(corpus_path)
        o = sys.stdout if output_path is None else open(output_path, "w")

        for line in f:
            sent = Sentence(line, di, po)
            linkages = sent.parse()

            # print("Linkages:", len(linkages))

            for linkage in linkages:
                # print(linkage.diagram())
                parse_postscript(linkage.postscript().replace("\n", ""), o)

        # Prevent interleaving "Dictionary close" messages
        po = ParseOptions(verbosity=0)

    except LG_Error as err:
        print(str(err))

    # FileNotFoundError subclasses OSError/IOError, so it must be caught first
    except FileNotFoundError as err:
        print(str(err))

    except IOError as err:
        print(str(err))

    finally:
        if f is not None:
            f.close()

        if o is not None and o != sys.stdout:
            o.close()