Python LabelDictionary.add示例

编程语言: Python

命名空间/包名称: sequences.label_dictionary

类/类型: LabelDictionary

方法/功能: add

hotexamples.com的示例: 3

Python LabelDictionary.add - 已找到3个示例。这些是从开源项目中提取的最受好评的sequences.label_dictionary.LabelDictionary.add现实Python示例。您可以评价示例，以帮助我们提高示例质量。

常用方法

显示隐藏

LabelDictionary(13)

get_label_id(5)

add(2)

示例#1

显示文件

文件： Conll2002NerCorpus.py 项目： leondz/hmm-reps

class Conll2002NerCorpus():
    """
    Optionally reads text to which we want to apply a wordrep such as hmm.
    - no update of the wordrep_dict; every word not in it (from x_dict),
    gets *unk* id needed for successful decoding
    "
    """

    def __init__(self, wordrep_dict=None, eval_spec_rel=False, dirname=None, lr=False, use_wordrep_tree=False):
        """
        :param wordrep_dict: x_dictionary from training of word representations
        :param use_wordrep_tree: use parse tree representations
        """

        self.wordrep_dict = wordrep_dict
        if self.wordrep_dict is not None:
            self.word_dict = self.wordrep_dict.copy()
        else:
            self.word_dict = LabelDictionary()
        self.tag_dict = LabelDictionary()  # ner tag
        self.use_wordrep_tree = use_wordrep_tree
        self.sequence_list = None  # SequenceListLabel(self.word_dict, self.tag_dict, self.wordrep_dict)
        self.eval_spec_rel = eval_spec_rel
        self.dirname = dirname
        self.lr = lr
        # for conll2002 lemma format preparation:
        self.tree_vocab = None

    def read_sequence_list_conll(self, train_file, train_file_parsed=None, train_files_parsed_path=None,
                                 max_sent_len=100000, max_nr_sent=100000):
        """
        Read a conll2002 or conll2003 file into a sequence list.
        Optionally add a sequence list/tree with *unk* for decoding in wordrep.
        """
        instance_list = self.read_conll_instances(train_file, train_file_parsed, train_files_parsed_path, max_sent_len,
                                                  max_nr_sent)

        if self.wordrep_dict is not None:

            seq_list = SequenceListLabel(self.word_dict, self.tag_dict, self.wordrep_dict)  # for indices
            for sent_x, sent_y, sent_ in instance_list:
                # sent_ is a normalized tree
                if self.use_wordrep_tree:
                    seq_list.add_sequence(sent_x, sent_y, None, sent_)
                # sent is a normalized chain
                else:
                    seq_list.add_sequence(sent_x, sent_y, sent_)
        else:
            seq_list = SequenceListLabel(self.word_dict, self.tag_dict)  # for indices
            for sent_x, sent_y in instance_list:
                seq_list.add_sequence(sent_x, sent_y)

        return seq_list

    def read_conll_instances(self, file, file_parsed, files_parsed_path, max_sent_len, max_nr_sent):
        """
        TODO: refactor the entire method, lots of overlap chain/tree/token/lemma
        """

        def get_tree(n_inst):
            trees = ConllCorpus(file_parsed, howbig=1000000, lemmas=True, eval_spec_rels=self.eval_spec_rel,
                                dirname=self.dirname, lr=self.lr)
            trees.prepare_trees()
            # not every instance has a corresponding tree due to errors in parsing
            conll_idx = ConllFilesIndex(files_parsed_path)
            conll_idx.create_ids_set()
            # extend instances with trees
            c_append = 0
            for i in range(n_inst):
                # we have a parse:
                if i + 1 in conll_idx.fileids:
                    inst = self.normalize_tree(trees.train[c_append], trees.x_dict, c_append)
                    c_append += 1
                # we don't have a parse:
                else:
                    inst = None
                yield inst

        if self.use_wordrep_tree:
            if file_parsed is None or files_parsed_path is None:
                sys.exit("Missing parsed file.")

        contents = open(file, encoding="iso-8859-1")
        nr_sent = 0
        instances = []
        ex_x = []
        ex_y = []
        include_ex_z = (self.wordrep_dict is not None and not self.use_wordrep_tree)
        if include_ex_z:
            ex_z = []

        for line in contents:
            if line.startswith("-DOCSTART"):
                continue
            toks = line.split()
            if len(toks) < 3:
                if 0 < len(ex_x) < max_sent_len:  # len(ex_x) > 1 # escape one-word sentences
                    nr_sent += 1
                    instances.append([ex_x, ex_y, ex_z] if include_ex_z else [ex_x, ex_y])
                if nr_sent >= max_nr_sent:
                    break
                ex_x = []
                ex_y = []
            else:
                tag = toks[2]
                word = toks[0]
                if word not in self.word_dict:
                    self.word_dict.add(word)
                if tag not in self.tag_dict:
                    self.tag_dict.add(tag)
                ex_x.append(word)
                ex_y.append(tag)
                if include_ex_z:
                    ex_z.append(self.normalize_word(word))

        # add parsed data to use tree wordreps
        if self.use_wordrep_tree:
            for c, instance in enumerate(get_tree(len(instances))):
                # get parsed data
                inst = instance
                instances[c].append(inst)

        return instances  # try generator

    def prepare_lemmatized_conll2002(self, train_file, train_file_parsed=None, train_files_parsed_path=None,
                                     output_f=None):
        self.use_wordrep_tree = True  # need parsed data
        docstarts, instances = self.prepare_conll_instances(train_file, train_file_parsed, train_files_parsed_path)
        if output_f is None:
            return instances
        else:
            header = "-DOCSTART- -DOCSTART- O"
            with open(output_f, "w") as outfile:
                for n, instance in enumerate(instances):
                    # doc headers
                    if n in docstarts:
                        outfile.write("{}\n".format(header))
                    if isinstance(instance, list):
                        for _, postag, tag, lemma in zip(*instance):
                            outfile.write("{} {} {}\n".format(lemma, postag, tag))
                        outfile.write("\n")
                    else:
                        sys.exit("invalid instance")

    def prepare_conll_instances(self, file, file_parsed, files_parsed_path):
        def get_tree(n_inst):
            trees = ConllCorpus(file_parsed, howbig=1000000, lemmas=True, eval_spec_rels=self.eval_spec_rel,
                                dirname=self.dirname, lr=self.lr)
            trees.prepare_trees()
            self.tree_vocab = trees.x_dict
            # not every instance has a corresponding tree due to errors in parsing
            conll_idx = ConllFilesIndex(files_parsed_path)
            conll_idx.create_ids_set()
            # extend instances with trees
            c_append = 0
            for i in range(n_inst):
                # we have a parse:
                if i + 1 in conll_idx.fileids:
                    inst = trees.train[c_append]
                    c_append += 1
                # we don't have a parse:
                else:
                    inst = None
                yield inst

        max_sent_len = 1000000
        max_nr_sent = 1000000
        if file_parsed is None or files_parsed_path is None:
            sys.exit("Missing parsed file.")

        contents = open(file, encoding="iso-8859-1")
        nr_sent = 0
        instances = []
        ex_x = []
        ex_x_pos = []
        ex_y = []
        docstarts = set()  # track docstarts header

        for line in contents:
            if line.startswith("-DOCSTART"):
                docstarts.add(nr_sent)
                continue
            toks = line.split()
            if len(toks) < 3:
                if 0 < len(ex_x) < max_sent_len:  # len(ex_x) > 1 # escape one-word sentences
                    nr_sent += 1
                    instance = [ex_x, ex_x_pos, ex_y]
                    instances.append(instance)
                if nr_sent >= max_nr_sent:
                    break
                ex_x = []
                ex_x_pos = []
                ex_y = []
            else:
                tag = toks[2]
                postag = toks[1]
                word = toks[0]
                ex_x.append(word)
                ex_x_pos.append(postag)
                ex_y.append(tag)

        for c, instance in enumerate(get_tree(len(instances))):
            ex_z = self.get_words(instance, self.tree_vocab)  # should get lemmas (from ConllCorpus)
            if ex_z is None:
                inst = [i for i in instances[c][0]]
                print("None instance")
            else:
                assert len(ex_z) == len(instances[c][0])
                inst = ex_z
            instances[c].append(inst)

        return docstarts, instances  # try generator

    def normalize_word(self, word):
        if word not in self.wordrep_dict:
            return "*unk*" if word.lower() not in self.wordrep_dict else word.lower()
        else:
            return word

    def normalize_tree(self, tree, trees_vocab, c):
        """
        Recode the name index based on wordrep_dict.
        Modify tree.name such that *unk* or lowercase words are included.
        """
        for node in tree:
            w = trees_vocab.get_label_name(node.name)
            # if c==0:
            #    print("{}\t{}".format(w, self.normalize_word(w)))
            new_name = self.wordrep_dict.get_label_id(self.normalize_word(w))
            node.set_name(new_name)
        return tree

    def get_words(self, instance, vocab):
        if isinstance(instance, Tree):
            return get_words_from_tree(instance, vocab)
        print("None instance in Conll2002NerCorpus")
        return None

    def write_conll_instances(self, gold, predictions, file, sep=" "):
        """
        Create dataset with appended predictions as the last column.
        """
        assert len(gold) == len(predictions)
        contents = open(file, "w", encoding="iso-8859-1")
        for gold_seq, pred_seq in zip(gold.seq_list, predictions):
            for x, y, y_hat in zip(gold_seq.x, gold_seq.y, pred_seq.y):
                contents.write("{}{sep}{}{sep}{}\n".format(gold_seq.sequence_list.x_dict.get_label_name(x),
                                                           gold_seq.sequence_list.y_dict.get_label_name(y),
                                                           pred_seq.sequence_list.y_dict.get_label_name(y_hat),
                                                           sep=sep))
            contents.write("\n")

    # # Dumps a corpus into a file
    def save_corpus(self, dirname):
        if not os.path.isdir(dirname + "/"):
            os.mkdir(dirname + "/")
        #word_fn = open(dir+"word.dic","w")
        #for word_id,word in enumerate(self.int_to_word):
        #    word_fn.write("{}\t{}\n".format(word_id, word))
        #word_fn.close()
        #tag_fn = open(dir+"tag.dic","w")
        #for tag_id,tag in enumerate(self.int_to_tag):
        #    tag_fn.write("{}\t{}\n".format(tag_id, tag))
        #tag_fn.close()
        #word_count_fn = open(dir+"word.count","w")
        #for word_id,counts in self.word_counts.iteritems():
        #    word_count_fn.write("{}\t{}\n".format(word_id,counts))
        #word_count_fn.close()
        self.sequence_list.save(dirname + "sequence_list")

    ## Loads a corpus from a file
    def load_corpus(self, dirname):
        word_fn = open(dirname + "word.dic")
        for line in word_fn:
            word_nr, word = line.strip().split("\t")
            self.int_to_word.append(word)
            self.word_dict[word] = int(word_nr)
        word_fn.close()
        tag_fn = open(dirname + "tag.dic")
        for line in tag_fn:
            tag_nr, tag = line.strip().split("\t")
            if tag not in self.tag_dict:
                self.int_to_tag.append(tag)
                self.tag_dict[tag] = int(tag_nr)
        tag_fn.close()
        word_count_fn = open(dirname + "word.count")
        for line in word_count_fn:
            word_nr, word_count = line.strip().split("\t")
            self.word_counts[int(word_nr)] = int(word_count)
        word_count_fn.close()
        self.sequence_list.load(dirname + "sequence_list")

示例#2

显示文件

class Conll2002NerCorpus():
    """
    Optionally reads text to which we want to apply a wordrep such as hmm.
    - no update of the wordrep_dict; every word not in it (from x_dict),
    gets *unk* id needed for successful decoding
    "
    """
    def __init__(self,
                 wordrep_dict=None,
                 eval_spec_rel=False,
                 dirname=None,
                 lr=False,
                 use_wordrep_tree=False):
        """
        :param wordrep_dict: x_dictionary from training of word representations
        :param use_wordrep_tree: use parse tree representations
        """

        self.wordrep_dict = wordrep_dict
        if self.wordrep_dict is not None:
            self.word_dict = self.wordrep_dict.copy()
        else:
            self.word_dict = LabelDictionary()
        self.tag_dict = LabelDictionary()  # ner tag
        self.use_wordrep_tree = use_wordrep_tree
        self.sequence_list = None  # SequenceListLabel(self.word_dict, self.tag_dict, self.wordrep_dict)
        self.eval_spec_rel = eval_spec_rel
        self.dirname = dirname
        self.lr = lr
        # for conll2002 lemma format preparation:
        self.tree_vocab = None

    def read_sequence_list_conll(self,
                                 train_file,
                                 train_file_parsed=None,
                                 train_files_parsed_path=None,
                                 max_sent_len=100000,
                                 max_nr_sent=100000):
        """
        Read a conll2002 or conll2003 file into a sequence list.
        Optionally add a sequence list/tree with *unk* for decoding in wordrep.
        """
        instance_list = self.read_conll_instances(train_file,
                                                  train_file_parsed,
                                                  train_files_parsed_path,
                                                  max_sent_len, max_nr_sent)

        if self.wordrep_dict is not None:

            seq_list = SequenceListLabel(self.word_dict, self.tag_dict,
                                         self.wordrep_dict)  # for indices
            for sent_x, sent_y, sent_ in instance_list:
                # sent_ is a normalized tree
                if self.use_wordrep_tree:
                    seq_list.add_sequence(sent_x, sent_y, None, sent_)
                # sent is a normalized chain
                else:
                    seq_list.add_sequence(sent_x, sent_y, sent_)
        else:
            seq_list = SequenceListLabel(self.word_dict,
                                         self.tag_dict)  # for indices
            for sent_x, sent_y in instance_list:
                seq_list.add_sequence(sent_x, sent_y)

        return seq_list

    def read_conll_instances(self, file, file_parsed, files_parsed_path,
                             max_sent_len, max_nr_sent):
        """
        TODO: refactor the entire method, lots of overlap chain/tree/token/lemma
        """
        def get_tree(n_inst):
            trees = ConllCorpus(file_parsed,
                                howbig=1000000,
                                lemmas=True,
                                eval_spec_rels=self.eval_spec_rel,
                                dirname=self.dirname,
                                lr=self.lr)
            trees.prepare_trees()
            # not every instance has a corresponding tree due to errors in parsing
            conll_idx = ConllFilesIndex(files_parsed_path)
            conll_idx.create_ids_set()
            # extend instances with trees
            c_append = 0
            for i in range(n_inst):
                # we have a parse:
                if i + 1 in conll_idx.fileids:
                    inst = self.normalize_tree(trees.train[c_append],
                                               trees.x_dict, c_append)
                    c_append += 1
                # we don't have a parse:
                else:
                    inst = None
                yield inst

        if self.use_wordrep_tree:
            if file_parsed is None or files_parsed_path is None:
                sys.exit("Missing parsed file.")

        contents = open(file, encoding="iso-8859-1")
        nr_sent = 0
        instances = []
        ex_x = []
        ex_y = []
        include_ex_z = (self.wordrep_dict is not None
                        and not self.use_wordrep_tree)
        if include_ex_z:
            ex_z = []

        for line in contents:
            if line.startswith("-DOCSTART"):
                continue
            toks = line.split()
            if len(toks) < 3:
                if 0 < len(
                        ex_x
                ) < max_sent_len:  # len(ex_x) > 1 # escape one-word sentences
                    nr_sent += 1
                    instances.append(
                        [ex_x, ex_y, ex_z] if include_ex_z else [ex_x, ex_y])
                if nr_sent >= max_nr_sent:
                    break
                ex_x = []
                ex_y = []
            else:
                tag = toks[2]
                word = toks[0]
                if word not in self.word_dict:
                    self.word_dict.add(word)
                if tag not in self.tag_dict:
                    self.tag_dict.add(tag)
                ex_x.append(word)
                ex_y.append(tag)
                if include_ex_z:
                    ex_z.append(self.normalize_word(word))

        # add parsed data to use tree wordreps
        if self.use_wordrep_tree:
            for c, instance in enumerate(get_tree(len(instances))):
                # get parsed data
                inst = instance
                instances[c].append(inst)

        return instances  # try generator

    def prepare_lemmatized_conll2002(self,
                                     train_file,
                                     train_file_parsed=None,
                                     train_files_parsed_path=None,
                                     output_f=None):
        self.use_wordrep_tree = True  # need parsed data
        docstarts, instances = self.prepare_conll_instances(
            train_file, train_file_parsed, train_files_parsed_path)
        if output_f is None:
            return instances
        else:
            header = "-DOCSTART- -DOCSTART- O"
            with open(output_f, "w") as outfile:
                for n, instance in enumerate(instances):
                    # doc headers
                    if n in docstarts:
                        outfile.write("{}\n".format(header))
                    if isinstance(instance, list):
                        for _, postag, tag, lemma in zip(*instance):
                            outfile.write("{} {} {}\n".format(
                                lemma, postag, tag))
                        outfile.write("\n")
                    else:
                        sys.exit("invalid instance")

    def prepare_conll_instances(self, file, file_parsed, files_parsed_path):
        def get_tree(n_inst):
            trees = ConllCorpus(file_parsed,
                                howbig=1000000,
                                lemmas=True,
                                eval_spec_rels=self.eval_spec_rel,
                                dirname=self.dirname,
                                lr=self.lr)
            trees.prepare_trees()
            self.tree_vocab = trees.x_dict
            # not every instance has a corresponding tree due to errors in parsing
            conll_idx = ConllFilesIndex(files_parsed_path)
            conll_idx.create_ids_set()
            # extend instances with trees
            c_append = 0
            for i in range(n_inst):
                # we have a parse:
                if i + 1 in conll_idx.fileids:
                    inst = trees.train[c_append]
                    c_append += 1
                # we don't have a parse:
                else:
                    inst = None
                yield inst

        max_sent_len = 1000000
        max_nr_sent = 1000000
        if file_parsed is None or files_parsed_path is None:
            sys.exit("Missing parsed file.")

        contents = open(file, encoding="iso-8859-1")
        nr_sent = 0
        instances = []
        ex_x = []
        ex_x_pos = []
        ex_y = []
        docstarts = set()  # track docstarts header

        for line in contents:
            if line.startswith("-DOCSTART"):
                docstarts.add(nr_sent)
                continue
            toks = line.split()
            if len(toks) < 3:
                if 0 < len(
                        ex_x
                ) < max_sent_len:  # len(ex_x) > 1 # escape one-word sentences
                    nr_sent += 1
                    instance = [ex_x, ex_x_pos, ex_y]
                    instances.append(instance)
                if nr_sent >= max_nr_sent:
                    break
                ex_x = []
                ex_x_pos = []
                ex_y = []
            else:
                tag = toks[2]
                postag = toks[1]
                word = toks[0]
                ex_x.append(word)
                ex_x_pos.append(postag)
                ex_y.append(tag)

        for c, instance in enumerate(get_tree(len(instances))):
            ex_z = self.get_words(
                instance,
                self.tree_vocab)  # should get lemmas (from ConllCorpus)
            if ex_z is None:
                inst = [i for i in instances[c][0]]
                print("None instance")
            else:
                assert len(ex_z) == len(instances[c][0])
                inst = ex_z
            instances[c].append(inst)

        return docstarts, instances  # try generator

    def normalize_word(self, word):
        if word not in self.wordrep_dict:
            return "*unk*" if word.lower(
            ) not in self.wordrep_dict else word.lower()
        else:
            return word

    def normalize_tree(self, tree, trees_vocab, c):
        """
        Recode the name index based on wordrep_dict.
        Modify tree.name such that *unk* or lowercase words are included.
        """
        for node in tree:
            w = trees_vocab.get_label_name(node.name)
            # if c==0:
            #    print("{}\t{}".format(w, self.normalize_word(w)))
            new_name = self.wordrep_dict.get_label_id(self.normalize_word(w))
            node.set_name(new_name)
        return tree

    def get_words(self, instance, vocab):
        if isinstance(instance, Tree):
            return get_words_from_tree(instance, vocab)
        print("None instance in Conll2002NerCorpus")
        return None

    def write_conll_instances(self, gold, predictions, file, sep=" "):
        """
        Create dataset with appended predictions as the last column.
        """
        assert len(gold) == len(predictions)
        contents = open(file, "w", encoding="iso-8859-1")
        for gold_seq, pred_seq in zip(gold.seq_list, predictions):
            for x, y, y_hat in zip(gold_seq.x, gold_seq.y, pred_seq.y):
                contents.write("{}{sep}{}{sep}{}\n".format(
                    gold_seq.sequence_list.x_dict.get_label_name(x),
                    gold_seq.sequence_list.y_dict.get_label_name(y),
                    pred_seq.sequence_list.y_dict.get_label_name(y_hat),
                    sep=sep))
            contents.write("\n")

    # # Dumps a corpus into a file
    def save_corpus(self, dirname):
        if not os.path.isdir(dirname + "/"):
            os.mkdir(dirname + "/")
        #word_fn = open(dir+"word.dic","w")
        #for word_id,word in enumerate(self.int_to_word):
        #    word_fn.write("{}\t{}\n".format(word_id, word))
        #word_fn.close()
        #tag_fn = open(dir+"tag.dic","w")
        #for tag_id,tag in enumerate(self.int_to_tag):
        #    tag_fn.write("{}\t{}\n".format(tag_id, tag))
        #tag_fn.close()
        #word_count_fn = open(dir+"word.count","w")
        #for word_id,counts in self.word_counts.iteritems():
        #    word_count_fn.write("{}\t{}\n".format(word_id,counts))
        #word_count_fn.close()
        self.sequence_list.save(dirname + "sequence_list")

    ## Loads a corpus from a file
    def load_corpus(self, dirname):
        word_fn = open(dirname + "word.dic")
        for line in word_fn:
            word_nr, word = line.strip().split("\t")
            self.int_to_word.append(word)
            self.word_dict[word] = int(word_nr)
        word_fn.close()
        tag_fn = open(dirname + "tag.dic")
        for line in tag_fn:
            tag_nr, tag = line.strip().split("\t")
            if tag not in self.tag_dict:
                self.int_to_tag.append(tag)
                self.tag_dict[tag] = int(tag_nr)
        tag_fn.close()
        word_count_fn = open(dirname + "word.count")
        for line in word_count_fn:
            word_nr, word_count = line.strip().split("\t")
            self.word_counts[int(word_nr)] = int(word_count)
        word_count_fn.close()
        self.sequence_list.load(dirname + "sequence_list")

示例#3

显示文件

class IDFeatures:
    '''
        Base class to extract features from a particular dataset.

        feature_dic --> Dictionary of all existing features maps feature_name (string) --> feature_id (int) 
        feture_names --> List of feature names. Each position is the feature_id and contains the feature name
        nr_feats --> Total number of features
        feature_list --> For each sentence in the corpus contains a pair of node feature and edge features
        dataset --> The original dataset for which the features were extracted

        Caches (for speedup):
        initial_state_feature_cache -->
        node_feature_cache -->
        edge_feature_cache -->
        final_state_feature_cache -->
    '''

    def __init__(self, dataset):
        '''dataset is a sequence list.'''
        self.feature_dict = LabelDictionary()
        self.feature_list = []

        self.add_features = False
        self.dataset = dataset

        #Speed up
        self.node_feature_cache = {}
        self.initial_state_feature_cache = {}
        self.final_state_feature_cache = {}
        self.edge_feature_cache = {}

        self.features_used = set()

    def get_num_features(self):
        return len(self.feature_dict)

    def build_features(self):
        '''
        Generic function to build features for a given dataset.
        Iterates through all sentences in the dataset and extracts its features,
        saving the node/edge features in feature list.
        '''
        self.add_features = True
        for sequence in self.dataset.seq_list:
           initial_features, transition_features, final_features, emission_features = \
               self.get_sequence_features(sequence)
           self.feature_list.append([initial_features, transition_features, final_features, emission_features])
        self.add_features = False

    def get_sequence_features(self, sequence):
        '''
        Returns the features for a given sequence.
        For a sequence of size N returns:
        Node_feature a list of size N. Each entry contains the node potentials for that position.
        Edge_features a list of size N+1.
        - Entry 0 contains the initial features
        - Entry N contains the final features
        - Entry i contains entries mapping the transition from i-1 to i.
        '''
        emission_features = []
        initial_features = []
        transition_features = []
        final_features = []

        ## Take care of first position
        features = []
        features = self.add_initial_features(sequence, sequence.y[0], features)
        initial_features.append(features)


        ## Take care of middle positions
        for pos, tag in enumerate(sequence.y):
            features = {}
            features = self.add_emission_features(sequence, pos, sequence.y[pos], features)
            emission_features.append(features)
            if pos > 0:
                prev_tag = sequence.y[pos-1]
                features = []
                features = self.add_transition_features(sequence, pos-1, tag, prev_tag, features)            
                transition_features.append(features)
            """
            if pos > 1:
                prev_tag = sequence.y[pos-1]
                prev_prev_tag = sequence.y[pos-2]
                features = []
                features = self.add_transition_features(sequence, pos-1, tag, prev_tag, prev_prev_tag, features)
                transition_features.append(features)
            """
        ## Take care of final position
        features = []
        features = self.add_final_features(sequence, sequence.y[-1], features)
        final_features.append(features)

        return initial_features, transition_features, final_features, emission_features

    #f(t,y_t,X)
    # Add the word identity and if position is
    # the first also adds the tag position
    def get_emission_features(self, sequence, pos, y):
        all_feat = []
        x = sequence.x[pos]
        if x not in self.node_feature_cache:
            self.node_feature_cache[x] = {}
        if y not in self.node_feature_cache[x]:
            node_idx = []
            node_idx = self.add_emission_features(sequence, pos, y, node_idx)
            self.node_feature_cache[x][y] = node_idx
        idx = self.node_feature_cache[x][y]
        all_feat = idx[:]
        return all_feat

    #f(t,y_t,y_(t-1),X)
    ##Speed up of code
    def get_transition_features(self, sequence, pos, y, y_prev):
        assert (pos >= 0 and pos < len(sequence.x))

        if y not in self.edge_feature_cache:
            self.edge_feature_cache[y] = {}
        if y_prev not in self.edge_feature_cache[y]:
            edge_idx = []
            edge_idx = self.add_transition_features(sequence, pos, y, y_prev, edge_idx)            
            self.edge_feature_cache[y][y_prev] = edge_idx
        return self.edge_feature_cache[y][y_prev]

    def get_initial_features(self, sequence, y):
       if y not in self.initial_state_feature_cache:
           edge_idx = []
           edge_idx = self.add_initial_features(sequence, y, edge_idx)
           self.initial_state_feature_cache[y] = edge_idx
       return self.initial_state_feature_cache[y]

    def get_final_features(self, sequence, y_prev):
        if y_prev not in self.final_state_feature_cache:
            edge_idx = []
            edge_idx = self.add_final_features(sequence, y_prev, edge_idx)            
            self.final_state_feature_cache[y_prev] = edge_idx
        return self.final_state_feature_cache[y_prev]

    def add_initial_features(self, sequence, y, features):
        # Get label name from ID.
        y_name = self.dataset.y_dict.get_label_name(y)
        # Generate feature name.
        feat_name = "init_tag:{}".format(y_name)
        self.features_used.add("init_tag")
        # Get feature ID from name.
        feat_id = self.add_feature(feat_name)
        # Append feature.
        if feat_id != -1:
            features.append(feat_id)
        return features

    def add_final_features(self, sequence, y_prev, features):
        # Get label name from ID.
        y_name = self.dataset.y_dict.get_label_name(y_prev)
        # Generate feature name.
        feat_name = "final_prev_tag:{}".format(y_name)
        self.features_used.add("final_prev_tag")
        # Get feature ID from name.
        feat_id = self.add_feature(feat_name)
        # Append feature.
        if(feat_id != -1):
            features.append(feat_id)
        return features

    def add_emission_features(self, sequence, pos, y, features):
        '''Add word-tag pair feature.'''
        x = sequence.x[pos]
        # Get tag name from ID.
        y_name = self.dataset.y_dict.get_label_name(y)
        # Get word name from ID.
        x_name = self.dataset.x_dict.get_label_name(x)
        # Generate feature name.
        feat_name = "id:{}::{}".format(x_name,y_name)
        self.features_used.add("id")
        # Get feature ID from name.
        feat_id = self.add_feature(feat_name)
        # Append feature.
        if feat_id != -1:
            features.append(feat_id)
        return features

    def add_transition_features(self, sequence, pos, y, y_prev, features):
        """ Adds a feature to the edge feature list.
        Creates a unique id if its the first time the feature is visited
        or returns the existing id otherwise
        """
        assert pos < len(sequence.x)-1
        # Get label name from ID.
        y_name = self.dataset.y_dict.get_label_name(y)
        # Get previous label name from ID.
        y_prev_name = self.dataset.y_dict.get_label_name(y_prev)
        # Generate feature name.
        feat_name = "prev_tag:{}::{}".format(y_prev_name, y_name)
        self.features_used.add("prev_tag")
        # Get feature ID from name.
        feat_id = self.add_feature(feat_name)
        # Append feature.
        if feat_id != -1:
            features.append(feat_id)
        return features

    def add_feature(self, feat_name):
        """
        Builds a dictionary of feature name to feature id
        If we are at test time and we don't have the feature
        we return -1.
        """
        # Check if feature exists and if so, return the feature ID. 
        if feat_name in self.feature_dict:
            return self.feature_dict[feat_name]
        # If 'add_features' is True, add the feature to the feature 
        # dictionary and return the feature ID. Otherwise return -1.
        if not self.add_features:
            return -1
        return self.feature_dict.add(feat_name)