def Make_Random(sents):
    """
    Make random parses (from LG-parser "any"), to use as baseline.

    Each sentence is parsed with the Link Grammar "any" dictionary and one of
    the returned linkages is chosen with ``random.randint``.

    :param sents: Sequence of tokenized sentences (each a sequence of words).
    :return: List with one parse per input sentence, in input order. A parse
             is a list of ``[left_index, left_token, right_index,
             right_token]`` entries, with the indices converted to strings.
             The parse is empty when the sentence is empty or when the parser
             returns no linkage, so the result stays index-aligned with
             ``sents``.
    """
    if not sents:
        # Nothing to parse: skip opening the dictionary altogether.
        return []

    any_dict = Dictionary('any')  # Opens dictionary only once
    po = ParseOptions(min_null_count=0, max_null_count=999)
    po.linkage_limit = 100
    options = BIT_STRIP | BIT_CAPS

    random_parses = []
    for sent in sents:
        curr_parse = []
        if not sent:
            # An empty sentence has no links (and sent[-1] below would raise).
            random_parses.append(curr_parse)
            continue

        # substitute words with numbers, as we only care about the parse tree
        fake_words = ["w{}".format(x) for x in range(1, len(sent) + 1)]
        # restore final dot to maintain --ignore functionality
        if sent[-1] == ".":
            fake_words[-1] = "."

        sentence = Sentence(" ".join(fake_words), any_dict, po)
        linkages = sentence.parse()
        num_parses = len(linkages)  # check nbr of linkages in sentence
        if num_parses > 0:
            idx = random.randint(0, num_parses - 1)  # choose a random linkage index
            linkage = Linkage(idx, sentence, po._obj)  # get the random linkage
            tokens, links = parse_postscript(
                linkage.postscript().replace("\n", ""), options)
            curr_parse.extend(
                [str(link[0]), tokens[link[0]], str(link[1]), tokens[link[1]]]
                for link in links
            )

        random_parses.append(curr_parse)

    return random_parses
def make_random(sentences: Union[List[Tuple[str, set]], List[str]], options: int, **kwargs) -> List[Tuple[str, set]]:
    """
    Make random parses (from LG-parser "any"), to use as baseline

    :param sentences:   List of either tuples of sentence and set of links in case of .ull input file format
                        or strings in case of text input file format.
    :param options:     Integer representing parse options bit masks.
                        BIT_STRIP and BIT_CAPS are always added.
    :param kwargs:      Optional ``limit``: linkage limit passed to the parser (default 100).
    :return:            List of parses (tuples of sentence and set of links), one per input sentence.
                        The set of links is empty when the parser returns no linkage.
    :raises ValueError: If the first element of ``sentences`` is neither a tuple nor a string.
    """
    if not sentences:
        # Nothing to parse; also avoids IndexError on sentences[0] below.
        return []

    # Validate the input format before the (comparatively expensive) dictionary load.
    if isinstance(sentences[0], tuple):
        is_ull = True
    elif isinstance(sentences[0], str):
        is_ull = False
    else:
        raise ValueError(
            "The first argument should be either List[Tuple[str, set] or List[str]."
        )

    any_dict = Dictionary('any')  # Opens dictionary only once
    po = ParseOptions(min_null_count=0, max_null_count=999)
    po.linkage_limit = int(kwargs.get("limit", 100))
    options |= BIT_STRIP
    options |= BIT_CAPS

    random_parses = []
    for sent in sentences:
        # Full sentence text: first tuple element for .ull input, the string
        # itself for text input (sent[0] would be just its first character).
        text = sent[0] if is_ull else sent

        words = tokenize_sentence(text)
        num_words = len(words)

        # substitute words with numbers, to avoid token-splitting by LG "any".
        # Only num_words - 1 placeholders are generated; presumably
        # tokenize_sentence() already counts the LEFT-WALL token, which the
        # token-count check below relies on -- TODO confirm.
        fake_words = [f"w{x}" for x in range(1, num_words)]
        sent_string = " ".join(fake_words)

        sentence = Sentence(sent_string, any_dict, po)
        linkages = sentence.parse()
        num_parses = len(linkages)  # check nbr of linkages in sentence

        links = []
        if num_parses > 0:
            idx = random.randint(0, num_parses - 1)  # choose a random linkage index
            linkage = Linkage(idx, sentence, po._obj)  # get the random linkage
            tokens, links = parse_postscript(
                linkage.postscript().replace("\n", ""), options)

            if num_words != len(tokens):
                logger.error(
                    f"Number of tokens mismatch:\n{words}\n{tokens}\nfor sentence:\n{text}"
                )

        random_parses.append((text, set(links)))

    return random_parses
def Make_Random(sents, **kwargs):
    """
    Make random parses (from LG-parser "any"), to use as baseline.

    Each sentence is parsed with the Link Grammar "any" dictionary, one of the
    returned linkages is chosen with ``random.randint``, and the resulting
    parses are written to ``<output_path>/random_parses.ull`` via
    ``Print_parses``.

    :param sents:  List of tokenized sentences (each a list of words).
    :param kwargs: Optional ``output_path``: directory for the output file.
                   Defaults to the ``PWD`` environment variable, or to the
                   current working directory when ``PWD`` is not set.
    :return: List with one parse per input sentence, in input order. A parse
             is a list of ``[left_index, left_word, right_index, right_word]``
             entries, with the indices converted to strings. The parse is
             empty when the parser returns no linkage.
    """
    output_path = kwargs.get("output_path")
    if output_path is None:
        # Look PWD up lazily: it is not defined in every environment, and it
        # is not needed at all when the caller supplies output_path.
        output_path = os.environ.get("PWD") or os.getcwd()

    any_dict = Dictionary('any')  # Opens dictionary only once
    po = ParseOptions(min_null_count=0, max_null_count=999)
    po.linkage_limit = 100
    options = BIT_STRIP | BIT_CAPS

    random_parses = []
    for sent in sents:
        num_words = len(sent)

        # Copy the sentence and prepend the wall marker so that the parser's
        # link indices can be used to look up the actual words
        # (assumes index 0 refers to the left wall -- TODO confirm).
        curr_sent = sent[:]
        curr_sent.insert(0, "###LEFT-WALL###")
        curr_parse = []

        # substitute words with numbers, to avoid token-splitting by LG "any"
        fake_words = ["w{}".format(x) for x in range(1, num_words + 1)]
        sent_string = " ".join(fake_words)

        sentence = Sentence(sent_string, any_dict, po)
        linkages = sentence.parse()
        num_parses = len(linkages)  # check nbr of linkages in sentence
        if num_parses > 0:
            idx = random.randint(0, num_parses - 1)  # choose a random linkage index
            linkage = Linkage(idx, sentence, po._obj)  # get the random linkage
            tokens, links = parse_postscript(
                linkage.postscript().replace("\n", ""), options)
            for link in links:
                llink = link[0]
                rlink = link[1]
                # attach words from sent, which are the actual words
                curr_parse.append([
                    str(llink), curr_sent[llink],
                    str(rlink), curr_sent[rlink]
                ])

        # One entry per sentence keeps random_parses aligned with sents,
        # which are passed together to Print_parses below.
        random_parses.append(curr_parse)

    Print_parses(sents, random_parses, f"{output_path}/random_parses.ull")

    return random_parses