Example No. 1
    def train_markov_model_from_constraint_matrix(self,
                                                  csv_path,
                                                  mm_path,
                                                  delim="\t"):
        """Trains the cfd and cpd from a constraint-matrix CSV: the
        header row lists the range ("to") states and each remaining row
        gives a domain ("from") state followed by the count for each
        transition.
        """
        with open(csv_path) as csv_file:
            table = [line.split(delim) for line in csv_file]
        tags = []
        range_states = table.pop(0)[1:]  # header row minus its corner cell
        for row in table:
            domain = row[0]
            for i, r in enumerate(row[1:]):
                s = r.replace(" ", "").strip("\n")
                if s == '':
                    continue
                # a count of n expands to n (domain, range) observations
                for _ in range(int(s)):
                    tags.append((domain, range_states[i]))
        self.cfd_tags = nltk.ConditionalFreqDist(tags)
        print "cfd trained, counts:"
        self.cfd_tags.tabulate()
        print "test:"
        print tabulate_cfd(self.cfd_tags)
        # save this new cfd for later use
        with open(mm_path, "wb") as mm_file:
            pickle.dump(self.cfd_tags, mm_file)
        # initialize the cpd
        self.cpd_tags = nltk.ConditionalProbDist(self.cfd_tags,
                                                 nltk.MLEProbDist)
        # print "cpd summary:"
        # print self.cpd_tags.viewitems()
        print tabulate_cfd(self.cpd_tags)
        # the tag set is every condition plus every observed outcome
        all_outcomes = [v.keys() for v in self.cfd_tags.values()]
        self.tag_set = set(self.cfd_tags.keys() +
                           [y for x in all_outcomes for y in x])
        self.viterbi_init()  # initialize viterbi
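
The constraint-matrix layout is easiest to see on a toy input. Below is a minimal sketch with hypothetical tags and counts (not the project's real tag set): the header row lists the range states, each remaining row a domain state and its transition counts, and a count of n expands into n (domain, range) pairs before the ConditionalFreqDist is built.

import nltk

# hypothetical two-state constraint matrix: header row holds the "to"
# states, each remaining row a "from" state followed by transition counts
csv_text = ("\tse\t<f/>\n"
            "s\t0\t3\n"      # 3 observed s -> <f/> transitions
            "<f/>\t2\t1\n")  # 2 <f/> -> se, 1 <f/> -> <f/>
table = [line.split("\t") for line in csv_text.splitlines()]
range_states = table.pop(0)[1:]
pairs = []
for row in table:
    for i, count in enumerate(row[1:]):
        # a count of n expands to n (from, to) observations
        pairs.extend([(row[0], range_states[i])] * int(count))
cfd = nltk.ConditionalFreqDist(pairs)
cfd.tabulate()  # prints the counts per (from, to) pair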
Example No. 2
    def train_markov_model_from_file(self,
                                     corpus_path,
                                     mm_path,
                                     update=False,
                                     non_sparse=False):
        """Adds to the self.cfd_tags conditional frequency distribution
        loaded, if there is one, else starts afresh.
        Recalculate the conditional prob distribution afresh.

        args:
        --filepath : filepath to newline separated file to learn sequence
        probabilities from.
        --mm_path : filepath to markov model distribution path to write to.
        --update : whether to update the current cfd, if not start anew.
        --non_sparse : whether to omit lines in the corpus without repairs,
        gives higher prob to repairs
        """
        tags = []
        # expects line separated sequences
        corpus_file = open(corpus_path)
        print "training decoder from", corpus_path
        for line in corpus_file:
            if line.strip("\n") == "":
                continue
            if non_sparse and ("<r" not in line):
                continue
            labels_data = line.strip("\n").split(",")
            if "<r" in labels_data[0]:
                continue  # TODO error with corpus creation
            previous = "s"
            # print "length sequence", len(labels_data)
            for i in range(len(labels_data)):
                if labels_data[i] not in self.observation_tags:
                    print labels_data[i], "not in obs tags"
                    continue
                if any(["<i" in t for t in self.observation_tags]):
                    if "<e" in labels_data[i] and i < len(labels_data) - 1:
                        rps_onset = None
                        for j in range(i, len(labels_data)):
                            if "<rm" in labels_data[j]:
                                rps_onset = j
                                break
                            if "<e" not in labels_data[j]:
                                break
                        if rps_onset is not None:
                            for k in range(i, rps_onset):
                                labels_data[k] = labels_data[k].replace(
                                    "<e", "<i")
                # print labels_data[i]
                # adjust interregna
#                 if any(["<i" in t for t in self.observation_tags]):
#                     if "<rm-" in labels_data[i]:
#                         b = len(tags)-1
#                         while ("e" in tags[b][1] and (not tags[b][1]=="se")\
#                                 and b > 0):
#                             if "i" not in tags[b][1]:
#                                 new_1 = tags[b][1].replace('eR', 'i').\
#                                     replace('e', 'i')
#                                 tags[b] = (tags[b][0], new_1)
#                             if "e" in tags[b][0] and "i" not in tags[b][0]:
#                                 new_0 = tags[b][0].replace('eR', 'i').\
#                                     replace('e', 'i')
#                                 tags[b] = (new_0, tags[b][1])
#                             b -= 1
#                         previous = tags[-1][1]
                tag = self.convert_tag(previous, labels_data[i])
                tags.append((previous, tag))
                previous = tag

            if "se" in self.observation_tags:
                # add end tag
                tags.append((previous, 'se'))
        # print "If we have just seen 'DET', \
        # the probability of 'N' is", cpd_tags["DET"].prob("N")
        # assumes these are added to exisiting one
        if update:
            self.cfd_tags += nltk.ConditionalFreqDist(tags)
        else:
            self.cfd_tags = nltk.ConditionalFreqDist(tags)
        print "cfd trained, counts:"
        self.cfd_tags.tabulate()
        print "test:"
        print tabulate_cfd(self.cfd_tags)
        # save this new cfd for later use
        with open(mm_path, "wb") as mm_file:
            pickle.dump(self.cfd_tags, mm_file)
        # initialize the cpd
        self.cpd_tags = nltk.ConditionalProbDist(self.cfd_tags,
                                                 nltk.MLEProbDist)
        # print "cpd summary:"
        # print self.cpd_tags.viewitems()
        print tabulate_cfd(self.cpd_tags)
        # the tag set is every condition plus every observed outcome
        all_outcomes = [v.keys() for v in self.cfd_tags.values()]
        self.tag_set = set(self.cfd_tags.keys() +
                           [y for x in all_outcomes for y in x])
        self.viterbi_init()  # initialize viterbi
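
On a single made-up corpus line, the bigram collection above reduces to the following sketch (tags hypothetical): nltk.ConditionalFreqDist counts the (previous, current) pairs and nltk.MLEProbDist normalises each condition into transition probabilities.

import nltk

line = "<f/>,<e/>,<f/>"  # hypothetical comma-separated label sequence
previous = "s"           # start-of-sequence tag, as in the trainer
pairs = []
for tag in line.split(","):
    pairs.append((previous, tag))
    previous = tag
pairs.append((previous, "se"))  # end-of-sequence tag
cpd = nltk.ConditionalProbDist(nltk.ConditionalFreqDist(pairs),
                               nltk.MLEProbDist)
print(cpd["<f/>"].prob("<e/>"))  # P(<e/> | <f/>) = 0.5 here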
Example No. 3
    def load_tags(filepath):
        """Returns a dictionary mapping each tag in the file to the index
        it is represented by, an integer.
        """
        tag_dictionary = defaultdict(int)
        f = open(filepath)
        for line in f:
            l = line.strip('\n').split(",")
            tag_dictionary[l[1]] = int(l[0])
        f.close()
        return tag_dictionary

    tags_name = "swbd_disf1_uttseg_simple_033"
    tags = load_tags(
        "../data/tag_representations/{}_tags.csv".format(tags_name))
    if "disf" in tags_name:
        interreg_ind = len(tags.keys())
        interreg_tag = "<i/><cc/>" if "uttseg" in tags_name else "<i/>"
        tags[interreg_tag] = interreg_ind  # add the interregnum tag
    print tags

    h = FirstOrderHMM(tags, markov_model_file=None)
    mm_path = "models/{}_tags.pkl".format(tags_name)
    # corpus_path = "../data/tag_representations/{}_tag_corpus.csv".format(
    #    tags_name).replace("_021", "")
    # h.train_markov_model_from_file(corpus_path, mm_path, non_sparse=True)
    csv_file = "models/{}.csv".format(tags_name)
    h.train_markov_model_from_constraint_matrix(csv_file, mm_path, delim=",")
    table = tabulate_cfd(h.cpd_tags)
    test_f = open("models/{}_tags_table.csv".format(tags_name), "w")
    test_f.write(table)
    test_f.close()
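
For reference, load_tags above expects one index,tag pair per line. A minimal self-contained sketch, with made-up file contents:

from collections import defaultdict

sample = "0,<f/>\n1,<e/>\n2,se\n"  # hypothetical tag-representation CSV
tag_dictionary = defaultdict(int)
for line in sample.splitlines():
    l = line.strip("\n").split(",")
    tag_dictionary[l[1]] = int(l[0])
print(dict(tag_dictionary))  # maps each tag to its integer index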