def _split_token_tag(token_tag):
    r"""Split one ``token/tag`` item into a ``(token, tag)`` pair.

    A slash or pipe preceded by a backslash (``\/``, ``\|``) is treated as
    literal text rather than as a separator. When several tags are joined
    with an unescaped ``|``, only the first one is kept.

    Returns None when the item does not contain exactly one unescaped
    slash (e.g. a stray ``[`` or ``]`` bracket).
    """
    # Mask the escaped separators so the plain splits below ignore them.
    masked = token_tag.replace('\\/', ';;;').replace('\\|', ':::')
    parts = masked.split('/')
    if len(parts) != 2:  # Handles unfiltered brackets
        return None
    token, tag = parts

    # Pick the first tag while the escaped pipes are still masked, so an
    # escaped pipe inside a tag is not mistaken for a tag separator.
    tag = tag.split('|')[0]

    token = token.replace(';;;', '\\/').replace(':::', '\\|')
    tag = tag.replace(';;;', '\\/').replace(':::', '\\|')
    return token, tag


def convert_file(filename):
    """Read a tagged corpus file and return its tags grouped by sentence.

    Blank lines are ignored. A line consisting only of
    '======================================' closes the current group of
    tokens (the "sentence"). Every other line holds space-separated
    ``token/tag`` items; a line fully wrapped in ``[...]`` (an NP chunk) has
    the outer brackets removed first. NP chunk membership is deliberately
    not recorded, because the corpus documentation says NP chunking is not
    reliable.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one entry per non-empty sentence. Each entry is a list
        of tuples ``(token, <extra features...>, tag)``; the extra features
        are ``feature_extraction.token_features(token)`` and are included
        only when the module-level ``USE_ALL_FEATURES`` flag is true.
    """
    tag_list = []
    sent_tags = []

    # The context manager guarantees the file is closed, even on error.
    with open(filename) as corpus:
        for line in corpus:
            line = line.strip()
            if not line:
                continue

            if line == '======================================':
                # Sentence boundary: record the collected tags, if any.
                if sent_tags:
                    tag_list.append(sent_tags)
                    sent_tags = []
                continue

            # Check if the chunk is a NP; if so, drop the outer brackets.
            np_match = re.match(r'^\[(.*)\]$', line)
            tag_string = np_match.group(1).strip() if np_match else line

            for token_tag in tag_string.split(' '):
                # Watch out for cases where there are two+ spaces...
                if not token_tag:
                    continue

                parsed = _split_token_tag(token_tag)
                if parsed is None:
                    continue
                token, tag = parsed

                # Check whether to include extra features
                features_list = [token]
                if USE_ALL_FEATURES:
                    features_list.extend(
                        feature_extraction.token_features(token))

                # Add tag and append tuple
                features_list.append(tag)
                sent_tags.append(tuple(features_list))

    # The file may end without a trailing separator line; without this
    # flush the final sentence would be silently dropped.
    if sent_tags:
        tag_list.append(sent_tags)

    return tag_list
예제 #2
0
#!/usr/bin/python

#####################################################################
# tweets2yamcha.py
#
# Converts tokenized twitter posts to yamcha format
#####################################################################

import sys
import feature_extraction

# For every line read from stdin, emit one output line per space-separated
# word ("word feature feature ..."), followed by an empty line that ends the
# record for that input line.
#
# sys.stdout.write is used instead of the print statement because the print
# statement is a syntax error on Python 3; the bytes written are the same.
for line in sys.stdin:
    # Strip only the trailing newline so the split below sees the raw tokens.
    words = line.rstrip("\n").split(' ')
    for word in words:
        # NOTE(review): consecutive spaces (or an empty input line) produce
        # empty words that are still passed to token_features -- confirm
        # this is intended.
        features = feature_extraction.token_features(word)
        sys.stdout.write(word + " " + " ".join(features) + "\n")
    sys.stdout.write("\n")