# Example 1
    def print_tree(self,
                   depth=0,
                   indent=4,
                   exclude_fields=DEFAULT_EXCLUDE_FIELDS):
        """Print this node and, recursively, its whole subtree.

        Each node is printed on its own line, indented ``indent * depth``
        spaces, as ``(deprel:X) key:value ... [id]`` with the keys in
        ``exclude_fields`` filtered out.

        Raises ParseException if the token is missing or lacks the
        ``id``/``deprel`` fields.
        """
        if not self.token:
            raise ParseException("Can't print, token is None.")

        if "deprel" not in self.token or "id" not in self.token:
            raise ParseException(
                "Can't print, token is missing either the id or deprel fields."
            )

        # Build a filtered view so the token itself is never mutated.
        shown = {
            field: value
            for field, value in self.token.items()
            if field not in exclude_fields
        }
        node_repr = ' '.join(
            '{key}:{value}'.format(key=field, value=value)
            for field, value in shown.items()
        )

        line = '(deprel:{deprel}) {node_repr} [{idx}]'.format(
            deprel=self.token['deprel'],
            node_repr=node_repr,
            idx=self.token['id'],
        )
        print(' ' * indent * depth + line)

        for child in self.children:
            child.print_tree(depth + 1, indent, exclude_fields)
# Example 2
    def __init__(self, tokens, metadata=None):
        """Wrap a list of token dicts plus optional sentence-level metadata.

        Raises ParseException when ``tokens`` is not a list.
        """
        if isinstance(tokens, list):
            self.tokens = tokens
            self.metadata = metadata
        else:
            raise ParseException(
                "Can't create TokenList, tokens is not a list.")
# Example 3
def parse_token_and_metadata(data, fields=None, field_parsers=None):
    """Split raw CoNLL-U ``data`` into parsed token lines and ``text`` comments.

    Blank lines are skipped; ``#`` lines go through ``parse_comment_line``
    and only their ``text`` values are collected; every other line is parsed
    as a token. Returns ``(tokens, texts)``.

    Raises ParseException when ``data`` is empty/falsy.
    """
    if not data:
        raise ParseException(
            "Can't create TokenList, no data sent to constructor.")

    # Fall back to module defaults when the caller passed nothing (or
    # anything falsy), matching the original `x or DEFAULT` semantics.
    chosen_fields = fields or DEFAULT_FIELDS
    chosen_parsers = field_parsers or DEFAULT_FIELD_PARSERS

    tokens, texts = [], []

    for raw_line in data.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            continue

        if stripped.startswith('#'):
            var_name, var_value = parse_comment_line(stripped)
            if var_name == "text":
                texts.append(var_value)
            continue

        tokens.append(parse_line(stripped, chosen_fields, chosen_parsers))

    return tokens, texts
# Example 4
def print_tree_b(tokentree, s="", exclude_fields=DEFAULT_EXCLUDE_FIELDS):
    """Print each node's form while building a bracketed string of leaves.

    Walks ``tokentree`` depth-first, printing ``form`` for every node.
    A leaf returns ``"form/upostag "``; an internal node wraps each child's
    result in parentheses, accumulated into ``s``, and returns ``s``.

    Raises ParseException if the token is missing or lacks the
    ``id``/``deprel`` fields.
    """
    if not tokentree.token:
        raise ParseException("Can't print, token is None.")

    if "deprel" not in tokentree.token or "id" not in tokentree.token:
        raise ParseException(
            "Can't print, token is missing either the id or deprel fields.")

    # NOTE(review): this filtered copy is computed but never used below;
    # kept for parity with print_tree — candidate for removal.
    relevant_data = tokentree.token.copy()
    for key in exclude_fields:
        if key in relevant_data:
            del relevant_data[key]

    print(tokentree.token['form'], s)

    # BUG FIX: the original tested ``len(tokentree.children) == 0`` BEFORE
    # ``tokentree.children is None``, so a None children raised TypeError
    # instead of being treated as a leaf. ``not`` covers both None and empty.
    if not tokentree.children:
        return "{}/{} ".format(tokentree.token['form'],
                               tokentree.token['upostag'])

    for child in tokentree.children:
        s += "(" + print_tree_b(child, s=s,
                                exclude_fields=exclude_fields) + ")"

    return s
# Example 5
def parse_comment_line(line):
    """Parse a CoNLL-U comment line into a ``(name, value)`` pair.

    ``"# key = value"`` yields ``("key", "value")``; the bare markers
    ``# newdoc`` / ``# newpar`` yield ``(name, None)``; any other
    ``=``-less comment yields ``(None, None)``.

    Raises ParseException when the line does not start with ``#``.
    """
    line = line.strip()

    # ROBUSTNESS FIX: an empty string used to raise IndexError on
    # ``line[0]``; treat it like any other malformed comment instead.
    if not line or line[0] != '#':
        raise ParseException(
            "Invalid comment format, comment must start with '#'")

    stripped = line[1:].strip()
    if '=' not in line and stripped != 'newdoc' and stripped != 'newpar':
        return None, None

    # Split on the first '=' only, so values may themselves contain '='.
    name_value = line[1:].split('=', 1)
    var_name = name_value[0].strip()
    var_value = None if len(name_value) == 1 else name_value[1].strip()

    return var_name, var_value
# Example 6
    def serialize(self):
        """Serialize the tree rooted at this node back to CoNLL-U text.

        Flattens the tree depth-first, re-sorts the collected tokens by
        ``id`` and delegates to the module-level ``serialize`` on a fresh
        TokenList built with this node's metadata.

        Raises ParseException when the root token is missing or has no
        ``id`` field.
        """
        if not self.token or "id" not in self.token:
            raise ParseException("Could not serialize tree, missing 'id' field.")

        def flatten_tree(root_token, token_list=None):
            # FIX: the original used a mutable default (``token_list=[]``),
            # a classic Python pitfall. It was harmless here only because
            # the closure is re-defined on every serialize() call; use the
            # None-sentinel idiom regardless.
            if token_list is None:
                token_list = []
            token_list.append(root_token.token)

            for child_token in root_token.children:
                flatten_tree(child_token, token_list)

            return token_list

        tokens = flatten_tree(self)
        tokens = sorted(tokens, key=lambda t: t['id'])
        tokenlist = TokenList(tokens, self.metadata)

        return serialize(tokenlist)
# Example 7
    def get_noun_chunks(self, subject):
        """Collapse this annotated sentence into a string of noun chunks.

        Expects ``self`` to be a sequence of OrderedDict tokens carrying
        ``xpostag`` and ``form``. Consecutive nouns are merged into one
        ``NP_w1_w2...`` chunk; adjectives become ``AJ_word``; pronouns are
        replaced by ``NP_<subject>``; sentence punctuation is kept.

        Raises ParseException when ``self`` is empty or its first element
        is not an OrderedDict (i.e. a group of sentences, not one).
        """
        NP = ['NN', 'NNS', 'NNP', 'NNPS']  # noun POS tags
        AJ = ['JJ', 'JJR', 'JJS']          # adjective POS tags
        if len(self) > 0 and isinstance(self[0], OrderedDict):
            annotated_sentence = ["{}_{}".format(word['xpostag'], word['form'])
                                  for word in self]
            # Hoisted out of the loop: compile the tag_form pattern once.
            tag_form = re.compile(r'([\w.,]+)_([\w.,]+)')
            flag = 0          # 1 while a noun chunk is being collected
            noun_chunks = []
            stack = []        # words of the chunk currently being built
            for item in annotated_sentence:
                matches = tag_form.search(item)
                if matches and matches.group(1) in NP:
                    if flag == 0:
                        flag = 1
                    stack.append(matches.group(2))

                    # A trailing '.' ends both the word and the chunk.
                    if stack[-1][-1] == '.':
                        stack[-1] = stack[-1][:-1]
                        noun_chunks.append('_'.join(['NP'] + stack))
                        noun_chunks.append('.')
                        flag = 0
                        stack = []
                else:
                    if flag == 1:
                        # Non-noun token closes the open chunk.
                        noun_chunks.append('_'.join(['NP'] + stack))
                        stack = []
                        flag = 0
                    try:
                        if matches.group(2) in ['.', ';', ':', ',']:
                            noun_chunks.append(matches.group(1))
                        elif matches.group(1) == 'CD':
                            noun_chunks.append(item)
                        elif matches.group(1) in AJ:
                            noun_chunks.append('AJ_' + matches.group(2))
                        elif matches.group(1) == 'PRP':
                            noun_chunks.append('NP_' + subject)
                        else:
                            noun_chunks.append(matches.group(2))
                    except (AttributeError, TypeError):
                        # BUG FIX: was a bare ``except:`` that swallowed
                        # everything (even KeyboardInterrupt). Only two
                        # failures are possible here: ``matches`` is None
                        # (AttributeError) or ``subject`` is not a str
                        # (TypeError on the concatenation).
                        noun_chunks.append(',')

            return ' '.join(noun_chunks)
        else:
            raise ParseException("Can't create noun chuncks for a group of sentences")