Example #1
def Make_Random(sents):
    """
        Make random parses (from LG-parser "any"), to use as baseline
    """
    any_dict = Dictionary('any') # Opens dictionary only once
    po = ParseOptions(min_null_count=0, max_null_count=999)
    po.linkage_limit = 100
    options = 0x00000000 | BIT_STRIP #| BIT_ULL_IN
    options |= BIT_CAPS

    random_parses = []
    for sent in sents:
        num_words = len(sent)
        curr_parse = []
        # substitute words with numbers, as we only care about the parse tree
        fake_words = ["w{}".format(x) for x in range(1, num_words + 1)]
        # restore final dot to maintain --ignore functionality
        if sent[-1] == ".": 
            fake_words[-1] = "."
        sent_string = " ".join(fake_words)
        sentence = Sentence(sent_string, any_dict, po)
        linkages = sentence.parse()
        num_parses = len(linkages) # check nbr of linkages in sentence
        if num_parses > 0:
            idx = random.randint(0, num_parses - 1) # choose a random linkage index
            linkage = Linkage(idx, sentence, po._obj) # get the random linkage
            tokens, links = parse_postscript(linkage.postscript().replace("\n", ""), options)
            for link in links:
                llink = link[0]
                rlink = link[1]
                curr_parse.append([str(llink), tokens[llink], str(rlink), tokens[rlink]])

            random_parses.append(curr_parse)

    return random_parses
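The example above assumes the link-grammar Python bindings (Dictionary, ParseOptions, Sentence, Linkage) plus a project-local parse_postscript helper and the BIT_* option masks, none of which are imported in the snippet. A minimal, hypothetical call might look like this; the helper's import path is left as a commented placeholder:

import random
from linkgrammar import Dictionary, Linkage, ParseOptions, Sentence
# parse_postscript, BIT_STRIP and BIT_CAPS are not part of the bindings;
# they come from the caller's own project (import path assumed):
# from psparse import parse_postscript, BIT_STRIP, BIT_CAPS

# Input: a list of tokenized sentences (lists of words).
sents = [
    ["the", "cat", "sat", "on", "the", "mat", "."],
    ["birds", "fly", "."],
]

baseline = Make_Random(sents)
for parse in baseline:
    # Each entry is [left_index, left_token, right_index, right_token]
    # over the fake "wN" tokens (plus the LEFT-WALL added by the parser).
    print(parse)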
Example #2
def make_random(sentences: Union[List[Tuple[str, set]], List[str]],
                options: int, **kwargs) -> List[Tuple[str, set]]:
    """
    Make random parses (from LG-parser "any"), to use as baseline

    :param sentences:       List of either tuples of sentence and set of links in case of .ull input file format
                            or strings in case of text input file format.
    :param options:         Integer representing parse options bit masks.
    :return:                List of parses (tuples of sentence and set of links)
    """
    any_dict = Dictionary('any')  # Opens dictionary only once
    po = ParseOptions(min_null_count=0, max_null_count=999)
    po.linkage_limit = int(kwargs.get("limit", 100))
    options |= BIT_STRIP
    options |= BIT_CAPS

    if isinstance(sentences[0], tuple):
        is_ull = True
    elif isinstance(sentences[0], str):
        is_ull = False
    else:
        raise ValueError(
            "The first argument should be either List[Tuple[str, set]] or List[str]."
        )

    random_parses = []

    for sent in sentences:
        words = tokenize_sentence(sent[0] if is_ull else sent)
        num_words = len(words)

        # substitute words with numbers, to avoid token-splitting by LG "any"
        fake_words = [f"w{x}" for x in range(1, num_words)]
        # fake_words = [f"w{x}" for x in range(1, num_words + 1)]
        sent_string = " ".join(fake_words)
        sentence = Sentence(sent_string, any_dict, po)
        linkages = sentence.parse()
        num_parses = len(linkages)  # check nbr of linkages in sentence

        links = []

        if num_parses > 0:
            idx = random.randint(0, num_parses - 1)  # choose a random linkage index
            linkage = Linkage(idx, sentence, po._obj)  # get the random linkage
            tokens, links = parse_postscript(
                linkage.postscript().replace("\n", ""), options)

            if num_words != len(tokens):
                logger.error(
                    f"Number of tokens mismatch:\n{words}\n{tokens}\nfor sentence:\n{sent[0]}"
                )

        random_parses.append((sent[0] if is_ull else sent, set(links)))

    return random_parses
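For comparison, a hypothetical invocation of the typed variant above. tokenize_sentence and parse_postscript are helpers from the surrounding project whose behaviour is assumed here; options is just a starting bit mask that the function extends with BIT_STRIP and BIT_CAPS:

# Plain-text input: one string per sentence.
text_input = ["The cat sat on the mat.", "Birds fly."]
parses = make_random(text_input, options=0, limit=100)
for sent_text, links in parses:
    print(sent_text, links)

# .ull-style input: (sentence, set-of-reference-links) tuples. Only the
# sentence text is reparsed; the reference link set is not used here.
ull_input = [("The cat sat on the mat.", set())]
parses = make_random(ull_input, options=0)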
Example #3
def parseString(s, debug, linkNum=0, file=sys.stdout):
    sent = Sentence(s)
    num_links = sent.parse()
    if num_links > linkNum:
        linkage = Linkage(linkNum, sent)
        if debug:
            linkage.print_diagram(sys.stderr)
        findProblems(linkage, sent, file)
        return linkage
    else:
        return None
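A hypothetical call for parseString; it exercises only the snippet itself, with findProblems assumed to be an external checker defined elsewhere:

import sys

linkage = parseString("This is a test sentence.", debug=False)
if linkage is None:
    print("no linkage found", file=sys.stderr)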
Example #4
def checkSummary(self, sentence):
    logging.debug('checkSummary start')
    result = ""
    s = sentence.encode('ascii')
    sent = Sentence(s)
    lc = sent.parse()
    logging.debug('checkSummary sent parsed')
    if lc > 0:
        linkage = Linkage(0, sent)
        result = linkage.get_diagram()
        logging.debug('checkSummary OK')
        del linkage
    del sent
    logging.debug('checkSummary end')
    return result
Example #5
def Make_Random(sents, **kwargs):
    """
        Make random parses (from LG-parser "any"), to use as baseline
    """
    output_path = kwargs.get("output_path", os.environ["PWD"])

    any_dict = Dictionary('any')  # Opens dictionary only once
    po = ParseOptions(min_null_count=0, max_null_count=999)
    po.linkage_limit = 100
    options = 0x00000000 | BIT_STRIP  #| BIT_ULL_IN
    options |= BIT_CAPS

    random_parses = []
    for sent in sents:
        num_words = len(sent)
        curr_sent = sent[:]
        curr_sent.insert(0, "###LEFT-WALL###")
        curr_parse = []
        # substitute words with numbers, to avoid token-splitting by LG "any"
        fake_words = ["w{}".format(x) for x in range(1, num_words + 1)]
        sent_string = " ".join(fake_words)
        sentence = Sentence(sent_string, any_dict, po)
        linkages = sentence.parse()
        num_parses = len(linkages)  # check nbr of linkages in sentence
        if num_parses > 0:
            idx = random.randint(0, num_parses - 1)  # choose a random linkage index
            linkage = Linkage(idx, sentence, po._obj)  # get the random linkage
            tokens, links = parse_postscript(
                linkage.postscript().replace("\n", ""), options)
            for link in links:
                llink = link[0]
                rlink = link[1]
                # attach words from sent, which are the actual words
                curr_parse.append([
                    str(llink), curr_sent[llink],
                    str(rlink), curr_sent[rlink]
                ])

            random_parses.append(curr_parse)

    Print_parses(sents, random_parses, f"{output_path}/random_parses.ull")

    return random_parses
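As with the first example, a hypothetical call for this last variant; Print_parses is a project-local helper whose signature is inferred only from the call above, and it is expected to write the parses to random_parses.ull under output_path as a side effect:

# Tokenized input; the returned links refer back to the real words,
# with ###LEFT-WALL### inserted at index 0 of each sentence.
sents = [["the", "dog", "barked", "."]]
parses = Make_Random(sents, output_path="/tmp")
print(parses)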