def parse_test(self):
    """Check the label counts gathered from a sample WSJ ``.mrg`` file.

    A callback that tallies ``token.label`` into a ``CountDict`` is passed to
    ``process_wsj_file``; afterwards the overall total and the count stored
    under ``'.'`` are verified.

    The treebank file sits at a machine-specific absolute path, so the test
    is skipped (instead of erroring out) when that file is not present.
    """
    # Local import keeps this method independent of the file-level imports.
    import os

    path = '/Users/rgeorgi/Documents/treebanks/LDC95T07/RAW/combined/wsj/00/wsj_0001.mrg'

    # Guard first: without the fixture nothing below can be meaningful.
    if not os.path.isfile(path):
        self.skipTest('WSJ sample file not available: {}'.format(path))

    tc = CountDict()

    def count_tokens(tokens):
        # Tally every label seen in the batch of tokens.
        for token in tokens:
            tc.add(token.label)

    process_wsj_file(path, count_tokens)

    # There should be 31 total tokens in this file.
    self.assertEqual(31, tc.total())
    self.assertEqual(tc['.'], 2)
def process_file(path, tm, delimeter='/'):
    """Read one tagged file and collect its (word, tag) pairs.

    Args:
        path: Path of the file to read. A ``.mrg`` extension routes the file
            to ``process_wsj_file``; any other extension is handled by
            ``process_slashtag_file``.
        tm: Optional tag map. When not ``None``, each token label found in
            it is replaced by ``tm[label]``; labels missing from the map are
            kept unchanged and a warning is logged.
        delimeter: Forwarded to ``process_slashtag_file`` only; it is not
            used for ``.mrg`` files. (The parameter's spelling is kept
            as-is for backward compatibility with existing callers.)

    Returns:
        A ``(c, cur_token_count, cur_linecount)`` tuple: the populated
        ``POSEvalDict`` followed by the two values returned by whichever
        reader processed the file.
    """
    c = POSEvalDict()

    def add_to_dict(tokens):
        # Callback handed to the reader: record each lower-cased token
        # sequence together with its (possibly remapped) label.
        for token in tokens:
            label = token.label

            # Do the tagset remapping.
            if tm is not None:
                if label in tm:
                    label = tm[label]
                else:
                    # Logger.warn() is a deprecated alias that was removed in
                    # Python 3.13; warning() is the supported spelling.
                    # NOTE(review): assumes DICT_LOG is a standard
                    # logging.Logger -- confirm.
                    DICT_LOG.warning('Tagmap defined, but "{}" not found.'.format(label))

            c.add(token.seq.lower(), label)

    print('Reading file "{}"'.format(os.path.basename(path)))

    ext = os.path.splitext(path)[1]

    if ext == '.mrg':
        # If the specified file extension is ".mrg", treat it as a WSJ file.
        cur_token_count, cur_linecount = process_wsj_file(path, add_to_dict)
    else:
        # Otherwise, assume it is a slashtag file.
        cur_token_count, cur_linecount = process_slashtag_file(
            path, add_to_dict, delimeter=delimeter)

    return c, cur_token_count, cur_linecount