Example #1
from conllu import parse_tree_incr


def beforeTurbo(data):
    print('parsing data as trees...')
    trees = [tree for tree in parse_tree_incr(data)]
    # clear_tree() is defined elsewhere in the project
    cleared_trees = [clear_tree(tree) for tree in trees]
    texts = [tree.serialize() for tree in cleared_trees]
    # write the cleared sentences back out as CoNLL-U
    with open("tmp/cleared.conll", 'w') as file:
        for text in texts:
            file.write(text)
    print("ready for TurboTesting")
Example #2
from conllu import parse_tree_incr


def main():
    print("opening training data...")
    with open("../PerDT/Data/train.conll", 'r') as data:
        print('parsing data as trees...')
        trees = [tree for tree in parse_tree_incr(data)]
    print("clearing the trees of data")
    for tree in trees:
        # clear_tree() (defined elsewhere) mutates each tree in place
        clear_tree(tree)
    print("serializing the trees")
    texts = [tree.serialize() for tree in trees]
    with open("tmp/cleared-train.conll", 'w') as file:
        for text in texts:
            file.write(text)
    print("finished")
Example #3
import conllu


def parse_all_sentences():
    # data_file, list_of_relevant_roots, BFS and check_if_relevant_noun
    # are defined at module level elsewhere in the project
    for tokentree in conllu.parse_tree_incr(data_file):

        relevance_flag = BFS(tokentree, check_if_relevant_noun)

        if relevance_flag:
            list_of_relevant_roots.append(tokentree)

    with open('relevant sentences.txt', 'w', encoding="utf-8") as filehandle:
        for listitem in list_of_relevant_roots:
            back_to_conll = listitem.serialize()
            filehandle.write('-----  new sentence  -----\n')
            filehandle.write('%s\n' % back_to_conll)
            filehandle.write('-----  sentence end  -----\n')
    print("relevants = ", list_of_relevant_roots)
    print("num of relevants = ", len(list_of_relevant_roots))
Example #4
    # requires "import conllu" and "import logging" at module level
    def generate(self):
        """
        Generates the dataset and writes it to the output file.
        Output format: full_sentence \t split1 <::::> split2
        """
        with open(self.input_, 'r') as input_file, \
                open(self.output_path_, 'wb') as output_file:

            for i, line in enumerate(conllu.parse_tree_incr(input_file)):
                chunks = []

                # project the whole tree to text, anchored at the root token
                full_sentence = self.to_string(line.children, [
                    (line.token['form'], line.token['id'])
                ]).strip()

                for child in line.children:
                    # skip dependency relations configured to be ignored
                    if child.token['deprel'].lower() not in self.ignore_projections_:
                        chunks.append(
                            self.to_string(child.children,
                                           [(line.token['form'], line.token['id']),
                                            (child.token['form'], child.token['id'])]))

                splits = self.split_(chunks)

                for split in splits:
                    output_file.write(
                        " {0} \t {1} \n".format(full_sentence, split).encode())

                logging.info("[WROTE] : {0}th sentence".format(i))
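Given the docstring's format, each output line looks roughly like the following; the sentence is invented for illustration, and "<::::>" is assumed to be the separator split_() places between chunk projections:

# hypothetical output line (tab-separated, per the format string above):
" the quick fox jumps \t the quick fox <::::> jumps \n"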
Example #5
    # requires "from collections import defaultdict" and
    # "from conllu import parse_incr, parse_tree_incr" at module level
    def __init__(self,
                 fname,
                 embed=None,
                 device=None,
                 max_len=1e3,
                 pos_to_id_dict=None,
                 read_tree=False):
        super(ConlluData, self).__init__()
        self.device = device

        if pos_to_id_dict is None:
            # hand out a fresh integer id the first time each POS tag is seen
            pos_to_id = defaultdict(lambda: len(pos_to_id))
        else:
            pos_to_id = pos_to_id_dict

        text = []
        tags = []
        trees = []
        heads = []
        embedding = []
        right_num_deps = []
        left_num_deps = []
        deps = []
        # open the file twice so it can be traversed in parallel:
        # once as flat token lists, once as dependency trees
        fin = open(fname, "r", encoding="utf-8")
        fin_tree = open(fname, "r", encoding="utf-8")
        data_file_tree = parse_tree_incr(fin_tree)
        data_file = parse_incr(fin)
        for id_, (sent, tree) in enumerate(zip(data_file, data_file_tree)):
            sent_list = []
            tag_list = []
            head_list = []
            sent_n = []
            deps_list = []

            # skip multi-word tokens, whose "id" is a range tuple rather than an int
            for token in sent:
                if isinstance(token["id"], int):
                    sent_n += [token]

            for token in sent_n:
                sent_list.append(token["form"])
                pos_id = pos_to_id[token[
                    "upostag"]] if token["upostag"] != '_' else pos_to_id["X"]
                tag_list.append(pos_id)
                # CoNLL-U heads are 1-based with 0 for root, so after this
                # shift a head value of -1 means the token attaches to root
                head_list.append(token["head"] - 1)
                deps_list.append(token["deprel"])

            if len(tag_list) > max_len:
                continue

            # count how many dependents each head has on its left and right
            right_num_deps_ = [0] * len(head_list)
            left_num_deps_ = [0] * len(head_list)

            for i, head_id in enumerate(head_list):
                if head_id != -1:
                    if i < head_id:
                        left_num_deps_[head_id] += 1
                    elif i > head_id:
                        right_num_deps_[head_id] += 1
                    else:
                        raise ValueError("head is itself !")

            text.append(sent_list)
            if embed is not None:
                embedding.append(self.text_to_embed(id_, sent_list, embed))
            tags.append(tag_list)
            heads.append(head_list)
            right_num_deps.append(right_num_deps_)
            left_num_deps.append(left_num_deps_)
            trees.append(tree)
            deps.append(deps_list)

        self.trees = trees
        self.text = text
        self.embed = embedding
        self.postags = tags
        self.heads = heads
        self.deps = deps
        self.right_num_deps = right_num_deps
        self.left_num_deps = left_num_deps
        self.pos_to_id = pos_to_id
        self.id_to_pos = {v: k for (k, v) in pos_to_id.items()}
        self.length = len(self.text)

        fin.close()
        fin_tree.close()
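A hypothetical instantiation, assuming this __init__ belongs to a class named ConlluData (as the super() call suggests); the file path and max_len value are illustrative only:

# sketch only: the path and max_len here are assumptions
dataset = ConlluData("ud-train.conllu", max_len=100)
print(dataset.length, "sentences loaded")
print(dataset.id_to_pos)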
Example #6
### Access the metadata of a sentence
sentence.metadata

### Turn a TokenList back into CoNLL-U
sentence.serialize()  # note: the raw serialized format may not be desirable

### Turn a TokenList into a TokenTree
sentence.to_tree()

### Use parse_tree() to parse into a list of dependency trees
from conllu import parse_tree
sentences = parse_tree(data)
sentences

from conllu import parse_tree_incr
for tokentree in parse_tree_incr(data_file):
    print(tokentree)

root = sentences[0]
root

root.print_tree()

root.token

children = root.children
children

root.metadata
root.serialize()
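Taken together, these calls fit into the following self-contained sketch; the inline CoNLL-U sample is a made-up two-token sentence used purely for illustration:

from io import StringIO

from conllu import parse_tree, parse_tree_incr

# made-up minimal CoNLL-U data: one sentence, two tokens
data = (
    "# text = Hello world\n"
    "1\tHello\thello\tINTJ\t_\t_\t0\troot\t_\t_\n"
    "2\tworld\tworld\tNOUN\t_\t_\t1\tvocative\t_\t_\n"
    "\n"
)

sentences = parse_tree(data)      # a list of TokenTree roots
root = sentences[0]
root.print_tree()                 # pretty-prints the dependency tree
print(root.token["form"])         # the root token: "Hello"
print([child.token["form"] for child in root.children])  # ["world"]
print(root.metadata["text"])      # sentence-level metadata
print(root.serialize())           # back to CoNLL-U text

# parse_tree_incr() does the same thing lazily over any file-like object
for tokentree in parse_tree_incr(StringIO(data)):
    print(tokentree)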
Example #7
    def test_parse_tree_incr(self):
        # incremental parsing of a file-like object must match parse_tree() on the string
        self.assertEqual(parse_tree(data), list(parse_tree_incr(StringIO(data))))
Example #8
    def test_parse_tree_incr(self):
        # string_to_file() is a helper defined elsewhere in the test suite
        self.assertEqual(parse_tree(data), list(parse_tree_incr(string_to_file(data))))
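string_to_file is not shown in the snippet. One plausible implementation, assuming it only needs to turn a string into a readable file object (a tempfile-based sketch, not necessarily the project's actual helper):

import tempfile


def string_to_file(string):
    # write the string to a real temporary file and hand it back, rewound for reading
    f = tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8")
    f.write(string)
    f.seek(0)
    return f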
Example #9
import os
import sys

from conllu import parse_tree_incr

# exit early unless both expected arguments were given
if len(sys.argv) != 3:
    print(
        "Usage: python conllu_to_docs.py UD-conllu-file.conllu output_folder/")
    sys.exit(1)

path_to_udfile = sys.argv[1]

outpath = sys.argv[2]
if not os.path.exists(outpath):
    os.mkdir(outpath)

outfile = None

numdocs = 0

with open(path_to_udfile, "r", encoding="utf-8") as data_file:

    for sentence in parse_tree_incr(data_file):
        md = sentence.metadata
        if "newdoc id" in md:
            # close the last one
            if outfile is not None:
                outfile.close()

            # open a new one
            docid = md["newdoc id"]
            outfile = open(os.path.join(outpath, docid), 'w', encoding="utf-8")
            numdocs += 1

        # write the current sentence; assumes the file begins with a "newdoc id"
        outfile.write(sentence.serialize())

# close the last open document
if outfile is not None:
    outfile.close()

print(f"Wrote out {numdocs} docs to {outpath}")
Example #10
from conllu import parse_tree_incr


def readConllu(filename):
    # print every sentence in the file as a dependency tree
    with open(filename, "r", encoding="utf-8") as data_file:
        for tokentree in parse_tree_incr(data_file):
            print(tokentree)
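Example call (the path is an illustrative assumption):

readConllu("ud-train.conllu")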