def __init__(self, n_left=2, n_right=2):
    """Initialize encoder with a symmetric context window.

    Inputs
    ------
    n_left : int, optional
        Number of tokens of left context to include.
        (Default: 2)

    n_right : int, optional
        Number of tokens of right context to include.
        (Default: 2)
    """
    self.n_left = n_left
    self.n_right = n_right
    # Tag generation is delegated to a BILOU chunk encoder.
    self.chunker = BILOUChunkEncoder()
class Encoder(object):
    """Abstract base class for feature encoders.

    Subclasses must implement ``get_feats_for_token``.

    Inputs
    ------
    n_left : int, optional
        Number of tokens of left context to include.
        (Default: 2)

    n_right : int, optional
        Number of tokens of right context to include.
        (Default: 2)

    Attributes
    ----------
    chunker : chunk.ChunkEncoder
        ChunkEncoder instance used to generate tags.
    """
    def __init__(self, n_left=2, n_right=2):
        self.chunker = BILOUChunkEncoder()
        self.n_left = n_left
        self.n_right = n_right

    def get_feats_for_token(self, token):
        """Return features for token.

        Inputs
        ------
        token : str
            Token.

        Outputs
        -------
        feats : tuple of str
            Features vector.
        """
        raise NotImplementedError

    def get_feats(self, tokens):
        """Return features corresponding to token sequence.

        Each per-token feature is replicated once per context offset in
        ``[-n_left, n_right]`` as a string of the form ``'F<i>[<pos>]=<val>'``.

        Inputs
        ------
        tokens : list of str
            Token sequence.

        Outputs
        -------
        feats : list of tuples
            Feature vector sequence.
        """
        feats = [self.get_feats_for_token(token) for token in tokens]
        # Transpose so each row holds one feature index across all tokens.
        # (list(...) keeps this working under both Python 2 and 3.)
        feats = list(zip(*feats))
        new_feats = []
        for ii, feats_ in enumerate(feats):
            for pos in range(-self.n_left, self.n_right + 1):
                feat_id = 'F%d[%d]' % (ii, pos)
                # Rolling by -pos aligns token t with the value at t + pos.
                k = -pos
                new_feats.append(['%s=%s' % (feat_id, val) if val is not None else val
                                  for val in roll(feats_, k)])
        # Transpose back so each row corresponds to one token again.
        new_feats = list(zip(*new_feats))
        # Filter out None vals (boundary padding from roll) in rows where
        # they occur.
        for ii, row in enumerate(new_feats):
            new_feats[ii] = [v for v in row if v is not None]
        return new_feats

    def get_targets(self, tokens, mentions):
        """Return tag sequence to train against.

        Inputs
        ------
        tokens : list of str
            Token sequence.

        mentions : list of list
            List of mention tuples, each of the form
            (tag, start_token_index, end_token_index); indices are inclusive.

        Outputs
        -------
        targets : list of str
            Target label sequence.
        """
        # Default every token to outside-any-mention.
        tags = ['O' for token in tokens]
        for tag, bi, ei in mentions:
            chunk = tokens[bi:ei + 1]
            tags[bi:ei + 1] = self.chunker.chunk_to_tags(chunk, tag)
        return tags

    def get_feats_targets(self, tokens, mentions):
        """Return features/tag sequence to train against.

        Inputs
        ------
        tokens : list of str
            Token sequence.

        mentions : list of list
            List of mention tuples, each of the form
            (tag, start_token_index, end_token_index); indices are inclusive.

        Outputs
        -------
        feats : list of tuples
            Feature vector sequence.

        targets : list of str
            Target label sequence.
        """
        feats = self.get_feats(tokens)
        targets = self.get_targets(tokens, mentions)
        return feats, targets
default=1, type=int, metavar='n', dest='n_jobs',
                    help='Set num threads to use (default: 1)')
args = parser.parse_args()
# No arguments at all: print usage and exit with an error status.
if len(sys.argv) == 1:
    parser.print_help()
    sys.exit(1)

# Determine ltfs to process.  A script file of paths (one per line)
# overrides any LTF files given directly on the command line.
if not args.scpf is None:
    with open(args.scpf, 'r') as f:
        args.ltfs = [l.strip() for l in f.readlines()]

# Initialize chunker, aligner, and encoder.
chunker = BILOUChunkEncoder()
aligner = Aligner()
encf = os.path.join(args.model_dir, 'tagger.enc')
with open(encf, 'r') as f:
    # NOTE(review): pickle loaded in text mode ('r'); works on POSIX
    # Python 2 but 'rb' would be more portable -- confirm before changing.
    enc = cPickle.load(f)

# Perform tagging in parallel, dumping results to args.tagged_dir.
# Cap the worker count at the number of files so no worker sits idle.
n_jobs = min(len(args.ltfs), args.n_jobs)
modelf = os.path.join(args.model_dir, 'tagger.crf')
f = delayed(tag_file)
Parallel(n_jobs=n_jobs, verbose=0)(f(ltf, aligner, enc, chunker, modelf,
                                     args.tagged_dir, args.ext)
                                   for ltf in args.ltfs)
class Encoder(object):
    """Abstract base class for feature encoders.

    Subclasses must implement ``get_feats_for_token``.

    Inputs
    ------
    n_left : int, optional
        Number of tokens of left context to include.
        (Default: 2)

    n_right : int, optional
        Number of tokens of right context to include.
        (Default: 2)

    Attributes
    ----------
    chunker : chunk.ChunkEncoder
        ChunkEncoder instance used to generate tags.
    """
    def __init__(self, n_left=2, n_right=2):
        self.chunker = BILOUChunkEncoder()
        self.n_left = n_left
        self.n_right = n_right

    def get_feats_for_token(self, token):
        """Return features for token.

        Inputs
        ------
        token : str
            Token.

        Outputs
        -------
        feats : tuple of str
            Features vector.
        """
        raise NotImplementedError

    def get_feats(self, tokens, token_nums, token_As=None, token_Bs=None,
                  token_Gs=None, token_Fs=None, token_Js=None, A_vals=None,
                  B_vals=None, G_vals=None):
        """Return features corresponding to token sequence.

        Inputs
        ------
        tokens : list of str
            Token sequence.

        token_nums : list of int
            Position index of each token within its sequence; used to emit
            is-first-token / is-second-token indicator features.

        token_As, token_Bs, token_Gs : list, optional
            Per-token A/B/G values, encoded as one True/False indicator per
            value in the corresponding ``*_vals`` inventory.
            (Default: None)

        token_Fs, token_Js : list, optional
            Per-token F/J values, encoded as one True/False indicator per
            value in {-1, 0, 1}.
            (Default: None)

        A_vals, B_vals, G_vals : iterable, optional
            Inventories of possible A/B/G values.
            (Default: None)

        Outputs
        -------
        feats : list of tuples
            Feature vector sequence.
        """
        feats = []
        for ii, token in enumerate(tokens):
            ##################################################################
            # Changes to inclusion of features in feature sets are made here.
            # Earlier experimental variants (prefix/suffix feats, word-type
            # feats, raw A/B/G values, randomized chance-level controls) have
            # been removed; recover them from version control if needed.
            ##################################################################
            token_feats = []
            # A/B/G as binary features (True or False for each possible value).
            if token_As is not None and A_vals is not None:
                for v in A_vals:
                    token_feats.append(token_As[ii] == v)
            if token_Bs is not None and B_vals is not None:
                for v in B_vals:
                    token_feats.append(token_Bs[ii] == v)
            if token_Gs is not None and G_vals is not None:
                for v in G_vals:
                    token_feats.append(token_Gs[ii] == v)
            # F/J as binary features over their fixed {-1, 0, 1} range.
            if token_Fs is not None:
                for v in [-1, 0, 1]:
                    token_feats.append(token_Fs[ii] == v)
            if token_Js is not None:
                for v in [-1, 0, 1]:
                    token_feats.append(token_Js[ii] == v)
            # Whether token is the first/second token (may be useful for the
            # case where f = j = -1).
            token_feats.append(token_nums[ii] == 0)
            token_feats.append(token_nums[ii] == 1)
            feats.append(token_feats)
        # Transpose so each row holds one feature index across all tokens.
        # (list(...) keeps this working under both Python 2 and 3.)
        feats = list(zip(*feats))
        new_feats = []
        for ii, feats_ in enumerate(feats):
            for pos in range(-self.n_left, self.n_right + 1):
                feat_id = 'F%d[%d]' % (ii, pos)
                # Rolling by -pos aligns token t with the value at t + pos.
                k = -pos
                new_feats.append(['%s=%s' % (feat_id, val) if val is not None else val
                                  for val in roll(feats_, k)])
        # Transpose back so each row corresponds to one token again.
        new_feats = list(zip(*new_feats))
        # Filter out None vals (boundary padding from roll) in rows where
        # they occur.
        for ii, row in enumerate(new_feats):
            new_feats[ii] = [v for v in row if v is not None]
        return new_feats

    def get_targets(self, tokens, mentions):
        """Return tag sequence to train against.

        Inputs
        ------
        tokens : list of str
            Token sequence.

        mentions : list of list
            List of mention tuples, each of the form
            (tag, start_token_index, end_token_index); indices are inclusive.

        Outputs
        -------
        targets : list of str
            Target label sequence.
        """
        # Default every token to outside-any-mention.
        tags = ['O' for token in tokens]
        for tag, bi, ei in mentions:
            chunk = tokens[bi:ei + 1]
            tags[bi:ei + 1] = self.chunker.chunk_to_tags(chunk, tag)
        return tags

    def get_feats_targets(self, tokens, mentions, token_nums, token_As=None,
                          token_Bs=None, token_Gs=None, token_Fs=None,
                          token_Js=None, A_vals=None, B_vals=None,
                          G_vals=None):
        """Return features/tag sequence to train against.

        Inputs
        ------
        tokens : list of str
            Token sequence.

        mentions : list of list
            List of mention tuples, each of the form
            (tag, start_token_index, end_token_index); indices are inclusive.

        token_nums, token_As, token_Bs, token_Gs, token_Fs, token_Js,
        A_vals, B_vals, G_vals
            Passed through to ``get_feats``; see its docstring.

        Outputs
        -------
        feats : list of tuples
            Feature vector sequence.

        targets : list of str
            Target label sequence.
        """
        feats = self.get_feats(tokens, token_nums, token_As, token_Bs,
                               token_Gs, token_Fs, token_Js, A_vals, B_vals,
                               G_vals)
        targets = self.get_targets(tokens, mentions)
        return feats, targets
class Encoder(object):
    """Abstract base class for feature encoders.

    Subclasses must implement ``get_feats_for_token``.

    Inputs
    ------
    n_left : int, optional
        Number of tokens of left context to include.
        (Default: 2)

    n_right : int, optional
        Number of tokens of right context to include.
        (Default: 2)

    Attributes
    ----------
    chunker : chunk.ChunkEncoder
        ChunkEncoder instance used to generate tags.
    """
    def __init__(self, n_left=2, n_right=2):
        self.chunker = BILOUChunkEncoder()
        self.n_left = n_left
        self.n_right = n_right

    def get_feats_for_token(self, token):
        """Return features for token.

        Inputs
        ------
        token : str
            Token.

        Outputs
        -------
        feats : tuple of str
            Features vector.
        """
        raise NotImplementedError

    def get_feats(self, tokens):
        """Return features corresponding to token sequence.

        Each per-token feature is replicated once per context offset in
        ``[-n_left, n_right]`` as a string of the form ``'F<i>[<pos>]=<val>'``.

        Inputs
        ------
        tokens : list of str
            Token sequence.

        Outputs
        -------
        feats : list of tuples
            Feature vector sequence.
        """
        feats = [self.get_feats_for_token(token) for token in tokens]
        # Transpose so each row holds one feature index across all tokens.
        # (list(...) keeps this working under both Python 2 and 3.)
        feats = list(zip(*feats))
        new_feats = []
        for ii, feats_ in enumerate(feats):
            for pos in range(-self.n_left, self.n_right + 1):
                feat_id = 'F%d[%d]' % (ii, pos)
                # Rolling by -pos aligns token t with the value at t + pos.
                k = -pos
                new_feats.append(['%s=%s' % (feat_id, val) if val is not None else val
                                  for val in roll(feats_, k)])
        # Transpose back so each row corresponds to one token again.
        new_feats = list(zip(*new_feats))
        # Filter out None vals (boundary padding from roll) in rows where
        # they occur.
        for ii, row in enumerate(new_feats):
            new_feats[ii] = [v for v in row if v is not None]
        return new_feats

    def get_targets(self, tokens, mentions):
        """Return tag sequence to train against.

        Inputs
        ------
        tokens : list of str
            Token sequence.

        mentions : list of list
            List of mention tuples, each of the form
            (tag, start_token_index, end_token_index); indices are inclusive.

        Outputs
        -------
        targets : list of str
            Target label sequence.
        """
        # Default every token to outside-any-mention.
        tags = ['O' for token in tokens]
        for tag, bi, ei in mentions:
            chunk = tokens[bi:ei+1]
            tags[bi:ei+1] = self.chunker.chunk_to_tags(chunk, tag)
        return tags

    def get_feats_targets(self, tokens, mentions):
        """Return features/tag sequence to train against.

        Inputs
        ------
        tokens : list of str
            Token sequence.

        mentions : list of list
            List of mention tuples, each of the form
            (tag, start_token_index, end_token_index); indices are inclusive.

        Outputs
        -------
        feats : list of tuples
            Feature vector sequence.

        targets : list of str
            Target label sequence.
        """
        feats = self.get_feats(tokens)
        targets = self.get_targets(tokens, mentions)
        return feats, targets