Пример #1
0
def parse_file(f):
    """
    Collects the metadata of every ``opus`` element found in a corpus
    vertical file.

    Each line is handed to ``vp.parse_line``, which is unpacked as
    ``(tag, start, attrs)``. Judging by how the result is used here,
    ``start`` is True for an opening tag, False for a closing tag and
    something else for a non-tag line (presumably a token line - confirm
    against ``vp``).

    arguments:
    f -- an iterable of lines (e.g. an opened file)

    returns:
    a tuple of dicts, one per closed ``opus`` element, holding the
    attributes of its opening tag plus:
      'wordcount' -- number of distinct values returned by
                     ``vp.parse_word`` since the previous ``</opus>``
      'poscount'  -- total number of such values
    """
    finished = []
    current = {}
    tokens = []

    for raw_line in f:
        tag, start, attrs = vp.parse_line(raw_line)

        # Anything that is neither an opening nor a closing tag is
        # treated as a word line.
        if start is not True and start is not False:
            tokens.append(vp.parse_word(raw_line))
            continue

        # Only <opus> elements matter; other structures are ignored.
        is_opus = tag == 'opus'
        if not is_opus:
            continue

        if start is True:
            # NOTE(review): looks like leftover debug output - confirm
            # whether callers rely on it before removing.
            print('t = %s, start: %s, attrs: %s' % (tag, start, attrs))
            current.update(attrs)
        else:
            # Closing tag: finalise the record and start fresh accumulators.
            current['wordcount'] = len(set(tokens))
            current['poscount'] = len(tokens)
            finished.append(current)
            current = {}
            tokens = []

    return tuple(finished)
Пример #2
0
def parse_file(f):
    """
    Collects the metadata of every ``div`` element found in a corpus
    vertical file (or its stripped version containing only tags).

    Each line is handed to ``vp.parse_line``, which is unpacked as
    ``(tag, start, attrs)``. Judging by how the result is used here,
    ``start`` is True for an opening tag, False for a closing tag and
    something else for a non-tag line.

    arguments:
    f -- an iterable of lines (e.g. an opened file)

    returns:
    a tuple of dicts, one per closed ``div`` element, holding the
    attributes of its opening tag plus:
      '__doc__'  -- the dict of attributes collected from the enclosing
                    ``doc`` element
      'group'    -- the 'group' attribute of that ``doc`` (None if absent)
      'id'       -- the div id passed through ``normalize_div_id``
      'poscount' -- number of non-tag lines seen since the previous
                    ``</div>`` (or since the start of the input)
    """
    doc_attrs = {}
    finished = []
    current = {}
    positions = 0

    for raw_line in f:
        tag, start, attrs = vp.parse_line(raw_line)

        # Non-tag line: just count the position.
        # NOTE(review): lines outside any div are counted too and end up
        # in the next div's 'poscount' - confirm this is intended.
        if start is not True and start is not False:
            positions += 1
            continue

        if start is True:
            if tag == 'doc':
                doc_attrs.update(attrs)
            elif tag == 'div':
                # Attach document-level context to the div record.
                attrs['__doc__'] = doc_attrs
                attrs['group'] = doc_attrs.get('group')
                attrs['id'] = normalize_div_id(attrs['id'])
                current.update(attrs)
            continue

        # From here on we are handling a closing tag.
        if tag == 'div':
            current['poscount'] = positions
            finished.append(current)
            current = {}
            positions = 0
        elif tag == 'doc':
            # Rebind rather than clear, so divs already emitted keep
            # their reference to the old document dict.
            doc_attrs = {}

    return tuple(finished)