Example #1
def prepare_one_ud_file(fname):
    """Prepare feature values from `fname` of Universal Dependencies data.

    We look at every token in this file. If the token's POS is one that we care
    about for this project, we extract its feature values.

    Parameters
    ----------
    fname : str

    Returns
    -------
    pd.DataFrame
        Contains columns for word form, pos, number, gender, case and person

    """
    conll = pyconll.iter_from_file(fname)
    result = []
    pos_of_interest = set(POS_MAPPING.values())
    for sentence in conll:
        for token in sentence:
            pos = token.upos
            if pos in pos_of_interest:
                data = {"word": token.form, "pos": pos, "lemma": token.lemma}
                for feature in ["number", "gender", "case", "person"]:
                    data[feature] = feature_value(token, feature)
                result.append(data)
    return pd.DataFrame(result)
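The example assumes a POS_MAPPING dict and a feature_value helper defined elsewhere in the project; a minimal sketch of what they might look like (the names, the NA placeholder, and the exact behaviour are assumptions, not taken from the original project):

# Hypothetical stand-ins for the project-level helpers used above.
POS_MAPPING = {"noun": "NOUN", "adjective": "ADJ", "determiner": "DET", "verb": "VERB"}
NA = "na"  # placeholder for a feature value the token does not carry (assumed)

def feature_value(token, feature):
    """Return the UD value of `feature` for `token`, or NA when it is absent."""
    # pyconll exposes morphological features as a dict keyed by the capitalized
    # feature name (e.g. "Number") whose values are sets (e.g. {"Sing"}).
    values = token.feats.get(feature.capitalize())
    if not values:
        return NA
    return next(iter(values)).lower()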
Example #2
def test_iter_from_file():
    """
    Test that CoNLL files can be iterated over without memory given the
    filename.
    """
    expected_ids = ['fr-ud-dev_0000{}'.format(i) for i in range(1, 5)]
    actual_ids = [
        sent.id for sent in iter_from_file(fixture_location('basic.conll'))
    ]

    assert expected_ids == actual_ids
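The test relies on a fixture_location helper from the test suite to resolve fixture paths; a plausible stand-in (the fixtures/ layout is an assumption):

import os

def fixture_location(name):
    """Resolve `name` against a local fixtures/ directory (layout assumed)."""
    return os.path.join(os.path.dirname(__file__), 'fixtures', name)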
Example #3
File: utils.py  Project: yyht/cat
def conll2text(paths, outpath):
    """Write a conll file to a text file."""
    with open(outpath, 'w') as f:
        for path in paths:
            for sent in pyconll.iter_from_file(path):
                txt = []
                for x in sent:
                    txt.append(x.form)
                if txt:
                    txt = " ".join(txt).lower()
                    txt = "".join([x for x in txt if x.isprintable()])
                    f.write(f"{txt}\n")
Example #4
    def _get_conllu_iter(self, fileid):
        """
        Parameters
        ----------
        fileid: str
            The file identifier of the file to read

        Returns
        -------
        generator : use it to iterate over sentences

        """

        return pyconll.iter_from_file(self._root.join(fileid))
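This is a method of a corpus-reader class whose _root object joins file identifiers into full paths; a hypothetical call might look like this (the reader instance and the file id are assumptions):

# reader is an instance of the corpus-reader class this method belongs to.
for sentence in reader._get_conllu_iter('en_ewt-ud-dev.conllu'):
    print(sentence.id, len(sentence))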
Example #5
    def iter_from_dir(directory, recursive=False):
        """
        Iterate over the sentences of all conll files in a directory.
        @args
            directory: str: The directory to search in. Probably needs to be fullpath.
            recursive: bool: If true, search subdirectories recursively.
        @output
            generator object that yields a sentence at a time
        """
        if recursive:
            for root, dirs, files in os.walk(directory):
                for file in files:
                    _, ext = os.path.splitext(file)
                    if ext == '.conllu':
                        fullpath = os.path.join(root, file)
                        yield from pyconll.iter_from_file(fullpath)

        else:
            files = os.listdir(directory)
            for file in files:
                _, ext = os.path.splitext(file)
                if ext == '.conllu':
                    fullpath = os.path.join(directory, file)
                    yield from pyconll.iter_from_file(fullpath)
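A usage sketch, calling it as a plain function on a placeholder directory of .conllu files:

# Count tokens across every .conllu file under the (hypothetical) treebanks/ tree.
total_tokens = sum(len(sentence) for sentence in iter_from_dir("treebanks/", recursive=True))
print(total_tokens)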
Example #6
def get_error_sents(path):
    error_sent_ids = []
    for file in os.listdir(path):
        conll = pyconll.iter_from_file(os.path.join(path, file))
        for sentence in conll:
            count_roots = 0
            for token in sentence:
                if token.id == token.head:
                    # A token that heads itself is invalid; flag the sentence once.
                    if sentence.id not in error_sent_ids:
                        error_sent_ids.append(sentence.id)
                    continue
                if token.deprel == 'root' or token.head == '0':
                    # print(token.deprel)
                    count_roots += 1
            if count_roots != 1:
                if sentence.id not in error_sent_ids:
                    error_sent_ids.append(sentence.id)
    return error_sent_ids
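A usage sketch; the directory path is a placeholder:

# Print every sentence id flagged as structurally invalid.
for sent_id in get_error_sents("CoNLLU/"):
    print(sent_id)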
Example #7
def get_info(split, langs):
    count, empty = 0, 0
    paths = []
    for filename in glob.iglob(UD_DIR + "**/**", recursive=True):
        if ".conllu" in filename:
            lang = filename.split("/")[-1].split("_")[0]
            if lang in langs and split in filename:
                count += 1
                for ts in pyconll.iter_from_file(filename):
                    if len(ts) < 2:
                        continue
                    # if form == '_', it means that text is missing
                    if ts[0].form == ts[1].form == "_":
                        empty += 1
                        print(f"Text missing in {filename}...")
                    else:
                        paths.append(filename)
                    break
    return count, empty, paths
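A usage sketch, assuming UD_DIR points at a local Universal Dependencies checkout; the split name and language codes are placeholders:

count, empty, paths = get_info("train", {"en", "fr", "de"})
print(count, empty, len(paths))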
Example #8
def _test_treebank(treebank_path):
    """
    Test that the provided treebank can be parsed and written without error.

    Args:
        treebank_path: The path to the treebank file that is to be parsed and written.
    """
    TMP_OUTPUT_FILE = '__tmp__ud.conllu'

    logging.info('Starting to parse %s', treebank_path)

    treebank = pyconll.iter_from_file(treebank_path)

    # For each sentence write back and make sure to include the proper
    # newlines between sentences.
    with open(TMP_OUTPUT_FILE, 'w', encoding='utf-8') as f:
        for sentence in treebank:
            f.write(sentence.conll())
            f.write('\n\n')

    # Clean up afterwards.
    os.remove(TMP_OUTPUT_FILE)
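A usage sketch; the treebank path is a placeholder:

import logging

logging.basicConfig(level=logging.INFO)
# Parse a treebank and write it straight back out to check the round trip.
_test_treebank('UD_English-EWT/en_ewt-ud-dev.conllu')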
Example #9
def fix_sent_ids(conll, file_count, total_count):
    lines = conll.split('\n')
    s_id = lines[1]
    x_id = lines[0]
    new_s_id = s_id.split(',')[0]+f',{file_count}.{total_count}'
    new_lines = [new_s_id, x_id] + lines[2:]
    return '\n'.join(new_lines)

input_folder = sys.argv[1]

dev_test_file = '1928.ntacts.rel-bib.conllu'

output_file = f'fo_farpahc-ud-dev.conllu'
print(f'Writing to file: {output_file}')
with open(output_file, 'w+') as f:
    conll = pyconll.iter_from_file(os.path.join(input_folder, dev_test_file))
    sent_count = 0
    for sentence in conll:
        sent_count += 1
        output_conll = fix_sent_ids(sentence.conll(), '', sent_count)
        f.write(output_conll)
        f.write('\n\n')
        if sent_count == 300:
            break

output_file = f'fo_farpahc-ud-test.conllu'
print(f'Writing to file: {output_file}')
with open(output_file, 'w+') as f:
    conll = pyconll.iter_from_file(os.path.join(input_folder, dev_test_file))
    sent_count = 0
    out_sent_count = 0 
Example #10
# TEST_PATH = '../../CoNLLU'
TEST_PATH = sys.argv[1]
ERROR_SENTS = get_error_sents(TEST_PATH)

print(f'\nNo. of non-valid sentences:\t{len(ERROR_SENTS)}\n')

for prefix in PREFIXES:
    total_sentences = 0
    output_file = f'is_icepahc-ud-{prefix}.conllu'
    print(f'Writing to file: {output_file}')
    with open(output_file, 'a+') as f, open('error_sents.conllu',
                                            'a+') as err_f:
        for file in SPLITS[prefix]:

            file_sentences = 0

            file += '.conllu'
            print(f'\t{file}')
            conll = pyconll.iter_from_file(os.path.join(TEST_PATH, file))
            for sentence in conll:
                file_sentences += 1
                total_sentences += 1
                # if sentence.id in ERROR_SENTS:
                #     err_f.write(sentence.conll())
                #     err_f.write('\n\n')
                # else:
                output_conll = fix_sent_ids(sentence.conll(), file_sentences,
                                            total_sentences)
                f.write(output_conll)
                f.write('\n\n')
Example #11
if not args.edition:
    e = args.conll.replace("/annotation/", "/texts/data/")
    e = e.replace("_tb-grc1.conllu", "-grc1.xml")
else:
    e = args.edition
if not os.path.isfile(e):
    raise FileNotFoundError(
        "File {} not found. Try to pass the path with the -e option".format(e))

r, f = os.path.split(e)

# TEI file
txt = CapitainCorpusReader(r, f)
tei_sents = [" ".join(s).replace("ʼ", "'") for s in txt.sents(f)]

# Treebank
cpath = args.conll
tb = pyconll.iter_from_file(cpath)
tb_sents = []
for s in tb:
    tks = [t.form for t in s if not t.misc.get("type")]
    tb_sents.append(" ".join(tks))

# now we glue them
# assert len(tei_sents) == len(tb_sents), "Nr. of sents does not correspond ({} tb vs {} tei)".format(len(tb_sents),
#                                                                                                    len(tei_sents))

if len(tei_sents) != len(tb_sents):
    print("Nr. of sents does not correspond ({} tb vs {} tei)".format(
        len(tb_sents), len(tei_sents)))
#    with open("error_sents.txt", "w") as out:
#        for s in tei_sents:
#            out.write(s + "\n")
Example #12
import pyconll
import os
import re

dir = '../../CoNLLU_bkp'

error_sent_ids = []

for file in os.listdir(dir):
    conll = pyconll.iter_from_file(os.path.join(dir, file))
    for sentence in conll:
        count_roots = 0
        for token in sentence:
            if token.id == token.head:
                # A token that heads itself is invalid; flag the sentence once.
                if sentence.id not in (i[0] for i in error_sent_ids):
                    error_sent_ids.append((sentence.id, len(sentence)))
                continue
            if token.deprel == 'root' or token.head == '0':
                # print(token.deprel)
                count_roots += 1
        if count_roots != 1:
            # error_sent_ids holds (id, length) tuples, so compare against the ids only.
            if sentence.id not in (i[0] for i in error_sent_ids):
                error_sent_ids.append((sentence.id, len(sentence)))

for id in error_sent_ids:
    print(f'{id[0]}\tTokens: {id[1]}')

print(f'No. of non-valid sentences:\t{len(error_sent_ids)}')
print(f'No. of non-valid tokens:\t{sum(i[1] for i in error_sent_ids)}')
Example #13
def collect_agreement_relations(fname):
    """Prepare cloze examples from `fname`.

    Agreement relations are an overt morphophonological co-variance of feature
    values between two tokens. We say that one of the tokens (the target)
    agrees with the other (the controller) in a set of features. In this work,
    we are interested in four features that are commonly involved in agreement,
    and in four types of cross-linguistically common agreement relations. In
    the list below, the target comes first and the controller second:

        * determiner ~ noun
        * (modifying) adjective ~ noun
        * (predicated) adjective ~ (subject) noun
        * verb(-like) ~ (subject) noun

    Not all languages will exhibit agreement relations in all four types, and
    even when they do the tokens may not agree in all four features (and indeed
    may agree in other features that we're not looking at).

    To collect agreement relations, we loop over the sentences in `fname`
    looking for instances of the four types listed above (e.g. we look for a
    determiner and its head noun, a predicated adjective and its subject).
    Each instance we find is a potential agreement relation. For every instance
    we find, we extract the agreement values of the two tokens (see the
    agreement_value function for exactly what I mean by this). Provided the
    instance has at least one genuine agreement value then we will keep it.

    Parameters
    ----------
    fname : str

    Returns
    -------
    pd.DataFrame
        Contains the type of agreement relation, POS of the masked word, the
        agreement values for the four features, and the masked sentence

    """
    conll = pyconll.iter_from_file(fname)
    result = []
    for sentence in conll:
        for token in sentence:
            try:
                head = sentence[token.head]
            except (KeyError, ValueError):
                # problem with the underlying file or annotation
                continue
            if is_determiner_relation(token, head):
                instance = extract(sentence, token, head, "determiner")
                result.append(instance)
                instance = extract(
                    sentence, token, head, "determiner", reverse=True
                )  # quick fix to get examples with both maskings
                result.append(instance)
            elif is_modifying_adjective_relation(token, head):
                instance = extract(sentence, token, head, "modifying")
                result.append(instance)
                instance = extract(sentence,
                                   token,
                                   head,
                                   "modifying",
                                   reverse=True)
                result.append(instance)
            # The Universal Dependency schema annotates a predicated adjective
            # or a verb as the head of a nominal. However, syntactically the
            # adjective/verb is the target of agreement with the nominal. To
            # account for this, if we find one of the next two functions, we
            # pass in `head` as the `token1` and `token` as `token2`.
            elif is_predicated_adjective_relation(head, token):
                instance = extract(sentence, head, token, "predicated")
                result.append(instance)
                instance = extract(sentence,
                                   head,
                                   token,
                                   "predicated",
                                   reverse=True)
                result.append(instance)
            elif is_verb_relation(head, token):
                instance = extract(sentence, head, token, "verb")
                result.append(instance)
                instance = extract(sentence, head, token, "verb", reverse=True)
                result.append(instance)
            # The Universal Dependencies schema annotates copulas as dependents
            # of the predicate, and auxiliaries as dependents of the main verb.
            # However, we want to extract the subjects in these cases, so once
            # we find a copula or auxiliary, we have to go looking for the
            # subject too. The subject is the controller of the agreement,
            # while the copula/auxiliary is the target.
            elif is_copula_relation(token, head):
                subject = find_subject(token, sentence)
                if subject:  # maybe we didn't find a subject
                    instance = extract(sentence, token, subject, "verb")
                    result.append(instance)
                    instance = extract(sentence,
                                       token,
                                       subject,
                                       "verb",
                                       reverse=True)
                    result.append(instance)
            elif is_auxiliary_relation(token, head):
                subject = find_subject(token, sentence)
                if subject:
                    instance = extract(sentence, token, subject, "verb")
                    result.append(instance)
                    instance = extract(sentence,
                                       token,
                                       subject,
                                       "verb",
                                       reverse=True)
                    result.append(instance)
    result = pd.DataFrame(result)
    # remove instances with tokens that disagree or have no values for all
    # four features.
    features = ["number", "gender", "case", "person"]
    agree = result["agree"]
    has_no_values = (result[features] == NA).all(axis=1)
    result = result[agree & ~has_no_values]
    # order columns
    cols = [
        "uid",
        "lemma",
        "type",
        "pos",
        "number",
        "gender",
        "case",
        "person",
        "masked",
        "other_masked",
        "other_lemma",
        "intervening_noun",
        "num_distractors",
        "correct_form",
        "other_correct_form",
    ]
    return result[cols]
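The relation checks (is_determiner_relation and friends) are defined elsewhere in the project; a minimal sketch of the first one under the UD annotation scheme (the exact conditions are an assumption):

def is_determiner_relation(token, head):
    """Rough check for a determiner ~ noun pair: a DET attached to a NOUN head via `det`."""
    return (
        token.upos == "DET"
        and token.deprel == "det"
        and head is not None
        and head.upos == "NOUN"
    )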
Example #14
def iter_from_file(filename):
    """Simple port of pyconll iter_from_file function"""
    yield from pyconll.iter_from_file(filename)
Example #15
File: fragments.py  Project: yyht/cat
def trees_from_conll(path):
    """Get all trees for every sentence in a conll file."""
    for x in pyconll.iter_from_file(path):
        yield tree(x)
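The tree helper comes from the same project; as a rough stand-in it could map each head id to its dependents, using only token attributes shown elsewhere on this page (the original helper may build a richer structure):

from collections import defaultdict

def tree(sentence):
    """Stand-in (assumed): map each head id in `sentence` to the ids of its dependents."""
    children = defaultdict(list)
    for token in sentence:
        children[token.head].append(token.id)
    return dict(children)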
Example #16
File: fragments.py  Project: yyht/cat
def nouns_from_conll(path):
    """Get all nouns, regardless of adjectival modification."""
    for sent in pyconll.iter_from_file(path):
        for token in sent:
            if token.upos == "NOUN":
                yield token.form.lower()
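A usage sketch, counting noun frequencies in a placeholder file:

from collections import Counter

noun_counts = Counter(nouns_from_conll("train.conllu"))
print(noun_counts.most_common(10))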