def test_read_conll_sentences_diff_comment_string():
    """Check reading with a custom ``comment_pattern`` of ``"# comment: "``.

    The first row starts with a bare ``#`` token.  The expected output keeps
    all three rows as a single sentence, i.e. that row must not be dropped
    as a comment when the comment pattern is the longer ``"# comment: "``.
    """
    text_tokens = [["#", "1", "2"], ["b", "3", "4"], ["c", "5", "6"]]
    gold = [text_tokens]
    text = StringIO("\n".join(" ".join(tt) for tt in text_tokens))
    # Compare the full lists instead of zipping them: zip() stops at the
    # shorter input, so a reader that yielded nothing would pass vacuously.
    assert list(read_conll_sentences(text, comment_pattern="# comment: ")) == gold
def test_read_conll_sentences_hash_token():
    """Check that a leading comment is skipped but a ``#`` token row is kept.

    The input starts with a comment line followed by three token rows, one
    of which has ``#`` as its first column.  The expected output is a single
    sentence holding every row except the leading comment line.
    """
    text_tokens = [
        ["# This is actually a comment"],
        ["a", "1", "2"],
        ["#", "3", "4"],
        ["c", "5", "6"],
    ]
    gold = [text_tokens[1:]]
    text = StringIO("\n".join(" ".join(tt) for tt in text_tokens))
    # Compare the full lists instead of zipping them: zip() stops at the
    # shorter input, so a reader that yielded nothing would pass vacuously.
    assert list(read_conll_sentences(text)) == gold
def test_read_conll_sentences_no_comments():
    """Check reading with ``allow_comments=False``.

    The first row starts with a ``#`` token.  With comments disabled the
    expected output keeps all three rows as a single sentence.
    """
    text_tokens = [
        ["#", "1", "2"],
        ["b", "3", "4"],
        ["c", "5", "6"],
    ]
    gold = [text_tokens]
    text = StringIO("\n".join(" ".join(tt) for tt in text_tokens))
    # Compare the full lists instead of zipping them: zip() stops at the
    # shorter input, so a reader that yielded nothing would pass vacuously.
    assert list(read_conll_sentences(text, allow_comments=False)) == gold
def _read_conll_file(f, delim):
    """Read the gold and predicted tags out of a conll file.

    :param f: `file` The open file object.
    :param delim: `str` The symbol that separates columns in the file; it is
        passed through to `read_conll_sentences`.

    :returns: `Tuple[List[List[str]], List[List[str]]]` The golds and the
        predictions. The two lists are aligned: each element is the list of
        tags for one sentence.

    Note:
        Each row must have at least two columns. The second-to-last column
        is taken as the gold tag and the last column as the predicted tag.
    """
    golds, preds = [], []
    for sentence in read_conll_sentences(f, delim=delim):
        gold_tags = []
        pred_tags = []
        for columns in sentence:
            gold_tags.append(columns[-2])
            pred_tags.append(columns[-1])
        golds.append(gold_tags)
        preds.append(pred_tags)
    return golds, preds
def test_read_conll_sentences():
    """Compare sentences read from ``TEST_FILE`` against ``gold_sentences``.

    Sentences are paired with ``zip``, so the comparison stops at the
    shorter of the two sequences.
    """
    parsed_sentences = read_conll_sentences(TEST_FILE)
    for parsed, expected in zip(parsed_sentences, gold_sentences):
        assert parsed == expected