def convert_file(filename):
    """Parse a chunk-annotated corpus file into tagged sentences.

    Sentences are delimited by a line of '=' characters. Bracketed NP
    chunks ('[ ... ]') are unwrapped, but the chunk label itself is
    discarded (per the corpus documentation, NP chunking isn't reliable).
    Tokens are 'token/tag' pairs; escaped '\\/' and '\\|' are preserved,
    and when a token carries multiple '|'-separated tags only the first
    is kept.

    Args:
        filename: path to the annotated input file.

    Returns:
        A list of sentences; each sentence is a list of tuples of the
        form (token, [extra features...], tag) -- the tag is always the
        last element.
    """
    tag_list = []
    sent_tags = []
    # Context manager closes the handle deterministically (the original
    # relied on the garbage collector).
    with open(filename) as infile:
        for line in infile:
            line = line.strip()
            if not line:
                continue
            elif line == '======================================':
                # Sentence boundary: flush accumulated tags, if any.
                if sent_tags:
                    tag_list.append(sent_tags)
                    sent_tags = []
            else:
                # Unwrap an NP chunk: '[ ... ]' -> inner text.
                np_match = re.match(r'^\[(.*)\]$', line)
                if np_match:
                    tag_string = np_match.group(1).strip()
                else:
                    tag_string = line
                # NOTE: a B/I/O chunk target was deliberately dropped
                # here -- per the documentation NP chunking is unreliable.
                for token_tag in tag_string.split(' '):
                    # Guard against runs of two or more spaces.
                    if not token_tag:
                        continue
                    # Temporarily mask escaped '/' and '|' so the split
                    # below only sees the real separator. (Raw strings:
                    # the originals were invalid escape sequences.)
                    token_tag = token_tag.replace(r'\/', ';;;')
                    token_tag = token_tag.replace(r'\|', ':::')
                    tp = token_tag.split('/')
                    if len(tp) != 2:
                        # Handles unfiltered brackets / malformed pairs.
                        continue
                    token, tag = tp
                    token = token.replace(';;;', r'\/')
                    token = token.replace(':::', r'\|')
                    tag = tag.replace(';;;', r'\/')
                    tag = tag.replace(':::', r'\|')
                    # If multiple tags are given, take the first.
                    tag = tag.split('|')[0]
                    features_list = [token]
                    if USE_ALL_FEATURES:
                        features_list.extend(
                            feature_extraction.token_features(token))
                    # The gold tag is the final tuple element.
                    features_list.append(tag)
                    sent_tags.append(tuple(features_list))
    # Bug fix: flush the last sentence when the file does not end with a
    # separator line (the original silently dropped it).
    if sent_tags:
        tag_list.append(sent_tags)
    return tag_list
#!/usr/bin/python ##################################################################### # tweets2yamcha.py # # Converts tokenized twitter posts to yamcha format ##################################################################### import sys import feature_extraction for line in sys.stdin: line = line.rstrip("\n") words = line.split(' ') for word in words: print word + " " + " ".join(feature_extraction.token_features(word)) print