示例#1
0
def main():
    hmm = HMM(*train(sys.argv[1]))
    
    with open(sys.argv[2]) as f:
        correct = 0
        wrong = 0
        
        correct_sents = 0
        wrong_sents = 0
        
        correct_known = 0
        wrong_known = 0
        
        for i, sent in enumerate(Reader(f)):
            prob, path = hmm.decode([word for (word, pos) in sent])
            correct1 = 0
            wrong1 = 0
            for (gold, predicted) in zip(sent, path):
                if gold == predicted:
                    correct1 += 1
                else:
                    wrong1 += 1
            print('%e\t%.3f\t%s' % (prob, correct1 / (correct1 + wrong1), ' '.join('%s/%s' % pair for pair in path)))
            if prob > 0:
                correct_sents += 1
                correct_known += correct1
                wrong_known += wrong1
            else:
                wrong_sents += 1
            correct += correct1
            wrong += wrong1
    
    print("Correctly tagged words: %s" % (correct / (correct + wrong)))
    print("Sentences with non-zero probability: %s" % (correct_sents / (correct_sents + wrong_sents)))
    print("Correctly tagged words when only considering sentences with non-zero probability: %s" % (correct_known / (correct_known + wrong_known)))
示例#2
0
def main():
    hmm = HMM(3, ('up', 'down', 'unchanged'),
              initial_probability=[0.5, 0.2, 0.3],
              transition_probability=[[0.6, 0.2, 0.2], [0.5, 0.3, 0.2],
                                      [0.4, 0.1, 0.5]],
              observation_probability=[[0.7, 0.1, 0.2], [0.1, 0.6, 0.3],
                                       [0.3, 0.3, 0.4]])

    observation = ("up", "up", "unchanged", "down", "unchanged", "down", "up")
    ob_length = len(observation)
    p, _ = hmm.forward(observation, ob_length)
    path = hmm.decode(observation, ob_length)
    print("P{} = {:.13f}".format(tuple(observation), p))
    print("Observation sequence =", tuple(i + 1 for i in path))
示例#3
0
def main():
    hmm = HMM(3, ('up', 'down', 'unchanged'),
              initial_probability=[0.5, 0.2, 0.3],
              transition_probability=[[0.6, 0.2, 0.2],
                                      [0.5, 0.3, 0.2],
                                      [0.4, 0.1, 0.5]],
              observation_probability=[[0.7, 0.1, 0.2],
                                       [0.1, 0.6, 0.3],
                                       [0.3, 0.3, 0.4]])

    observation = ("up", "up", "unchanged", "down", "unchanged", "down", "up")
    ob_length = len(observation)
    p, _ = hmm.forward(observation, ob_length)
    path = hmm.decode(observation, ob_length)
    print("P{} = {:.13f}".format(tuple(observation), p))
    print("Observation sequence =", tuple(i+1 for i in path))
示例#4
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('path', type=str, help='the path to testing data')
    args = parser.parse_args()

    test__data_path = args.path
    test_words = load_raw_data(test__data_path)

    hmm = HMM()
    hmm.load_model('./hmmmodel.txt')

    tag_result = []

    for sentence in test_words:
        re = hmm.decode(sentence)
        tag_result.append(re)

    save_result(test_words, tag_result)
示例#5
0
class HMMClassifier(object):
    def __init__(self, **kwarg):    # lname, url, other prior knowledge
        super(HMMClassifier, self).__init__()
        self.HMMauthor = HMM('author', 2)
        self.HMMvenue = HMM('venue', 2)     # Not important
        self.HMMentire = HMM('entire', 6)   # Set empirically
        self.observations_raw = []
        self.observation_sequences = []
        self.labels = []


    def predict(self, segment):
        author_likelihood = self.HMMauthor.evaluate(segment)
        venue_likelihood = self.HMMvenue.evaluate(segment)
        print segment
        print 'author likelihood:\t' , author_likelihood
        print 'venue likelihood:\t' , venue_likelihood

    def decode(self, segment):
        # print segment
        observation_sequence, decoded_sequence = self.HMMentire.decode(segment)
        
        self.observations_raw.append(segment)
        self.observation_sequences.append(observation_sequence)
        self.labels.append(decoded_sequence)


        # segment the labeling into parts
        author_field = []
        title_field = []
        venue_field = []
        year_field = []
        raw_tokens = Tokens(segment).tokens
        for i in range(len(decoded_sequence)):
            token_i = raw_tokens[i]
            label_i = decoded_sequence[i]
            if label_i in [0,1]:
                author_field.append(token_i)
            if label_i == 2:
                continue
            if label_i == 3:
                title_field.append(token_i)
            if label_i == 4:
                venue_field.append(token_i)
            if label_i == 5:
                year_field.append(token_i)

        return ' '.join(author_field), ' '.join(title_field), ' '.join(venue_field), list(set(year_field))
        # Additional step: to calculate the overall sum of P(X1|FN,LN,DL...) + P(X2|TI,TI,TI...) + P(X3|VN,VN,VN...) + P(X4|DT)
        # 1. Find boundaries: 
        # boundaries = [[], [], []]
        # label_ranges = [[0,1,2], [3], [2,4,5]]
        # for i in range(len(label_ranges)):
        #     label_range = label_ranges[i]
        #     for j in range(len(decoded_sequence)):
        #         if decoded_sequence[j] in label_range:
        #             boundaries[i].append()

    
    def decode_without_constraints(self, segment):
        print segment
        observation_sequence, decoded_sequence = self.HMMentire.decode_without_constraints(segment)
        
        self.observations_raw.append(segment)
        self.observation_sequences.append(observation_sequence)
        self.labels.append(decoded_sequence)

        for vector, decoding, token in zip(observation_sequence, decoded_sequence, Tokens(segment).tokens):
            if decoding == 0:
                label = 'FN'
            elif decoding == 1:
                label = 'LN'
            elif decoding == 2:
                label = 'DL'
            elif decoding == 3:
                label = 'TI'
            elif decoding == 4:
                label = 'VN'
            elif decoding == 5:
                label = 'YR'
            else:
                label = str(decoding) + ', PROBLEM'
            print vector, '\t', label, '\t', token
        print '\n\n'

    def cross_correct(self):
        absolute_correct = []
        absolute_wrong = []

        # 1. Confirm what's the big structure of the publication inside this specific domain
        counter = {}
        for l in self.labels:
            first_label = str(l[0])
            if counter.has_key(first_label):
                counter[first_label] += 1
            else:
                counter[first_label] = 1
        sorted_counter = sorted(counter.iteritems(), key=operator.itemgetter(1), reverse=True)
        print 'First labels distribution: ', sorted_counter


    def serialize(self):
        fp = open('hmmc.pkl', 'wb')
        pickle.dump(self, fp, -1)
        fp.close()
示例#6
0
class HMMClassifier(object):
    def __init__(self, **kwarg):  # lname, url, other prior knowledge
        super(HMMClassifier, self).__init__()
        self.HMMauthor = HMM('author', 2)
        self.HMMvenue = HMM('venue', 2)  # Not important
        self.HMMentire = HMM('entire', 6)  # Set empirically
        self.observations_raw = []
        self.observation_sequences = []
        self.labels = []

    def predict(self, segment):
        author_likelihood = self.HMMauthor.evaluate(segment)
        venue_likelihood = self.HMMvenue.evaluate(segment)
        print segment
        print 'author likelihood:\t', author_likelihood
        print 'venue likelihood:\t', venue_likelihood

    def decode(self, segment):
        # print segment
        observation_sequence, decoded_sequence = self.HMMentire.decode(segment)

        self.observations_raw.append(segment)
        self.observation_sequences.append(observation_sequence)
        self.labels.append(decoded_sequence)

        # segment the labeling into parts
        author_field = []
        title_field = []
        venue_field = []
        year_field = []
        raw_tokens = Tokens(segment).tokens
        for i in range(len(decoded_sequence)):
            token_i = raw_tokens[i]
            label_i = decoded_sequence[i]
            if label_i in [0, 1]:
                author_field.append(token_i)
            if label_i == 2:
                continue
            if label_i == 3:
                title_field.append(token_i)
            if label_i == 4:
                venue_field.append(token_i)
            if label_i == 5:
                year_field.append(token_i)

        return ' '.join(author_field), ' '.join(title_field), ' '.join(
            venue_field), list(set(year_field))
        # Additional step: to calculate the overall sum of P(X1|FN,LN,DL...) + P(X2|TI,TI,TI...) + P(X3|VN,VN,VN...) + P(X4|DT)
        # 1. Find boundaries:
        # boundaries = [[], [], []]
        # label_ranges = [[0,1,2], [3], [2,4,5]]
        # for i in range(len(label_ranges)):
        #     label_range = label_ranges[i]
        #     for j in range(len(decoded_sequence)):
        #         if decoded_sequence[j] in label_range:
        #             boundaries[i].append()

    def decode_without_constraints(self, segment):
        print segment
        observation_sequence, decoded_sequence = self.HMMentire.decode_without_constraints(
            segment)

        self.observations_raw.append(segment)
        self.observation_sequences.append(observation_sequence)
        self.labels.append(decoded_sequence)

        for vector, decoding, token in zip(observation_sequence,
                                           decoded_sequence,
                                           Tokens(segment).tokens):
            if decoding == 0:
                label = 'FN'
            elif decoding == 1:
                label = 'LN'
            elif decoding == 2:
                label = 'DL'
            elif decoding == 3:
                label = 'TI'
            elif decoding == 4:
                label = 'VN'
            elif decoding == 5:
                label = 'YR'
            else:
                label = str(decoding) + ', PROBLEM'
            print vector, '\t', label, '\t', token
        print '\n\n'

    def cross_correct(self):
        absolute_correct = []
        absolute_wrong = []

        # 1. Confirm what's the big structure of the publication inside this specific domain
        counter = {}
        for l in self.labels:
            first_label = str(l[0])
            if counter.has_key(first_label):
                counter[first_label] += 1
            else:
                counter[first_label] = 1
        sorted_counter = sorted(counter.iteritems(),
                                key=operator.itemgetter(1),
                                reverse=True)
        print 'First labels distribution: ', sorted_counter

    def serialize(self):
        fp = open('hmmc.pkl', 'wb')
        pickle.dump(self, fp, -1)
        fp.close()
示例#7
0
class Retrainer(object):
    def __init__(self, raw_segments, observation_sequences, label_sequences):
        super(Retrainer, self).__init__()
        self.raw_segments = raw_segments
        self.observation_sequences = observation_sequences
        self.label_sequences = label_sequences
        self.hmm_new = None
        self.feature_entity_list = FeatureEntityList()
        self.lm = LanguageModel()
        self.boosting_feature_generator = BoostingFeatureGenerator()

        self.DOMINANT_RATIO = 0.85  # dominant label ratio: set empirically

        self.retrain_with_boosting_features()

    def retrain(self):
        self.hmm_new = HMM('retrainer', 6)
        self.hmm_new.train(
            self.observation_sequences,
            self.label_sequences,
            useLaplaceRule=False)  #important to set laplace to be no

    # With new features
    def retrain_with_boosting_features(self):
        # Build language model
        for raw_segment, label_sequence in zip(self.raw_segments,
                                               self.label_sequences):
            for token, label in zip(
                    Tokens(raw_segment).tokens, label_sequence):
                self.lm.add(token, label)
        self.lm.prettify()
        self.token_BGM = self.lm.prettify_model
        self.pattern_BGM = None

        # Retrain
        self.hmm_new = HMM('retrainer', 6)
        partial_features = []
        for raw_segment in self.raw_segments:
            partial_features.append(
                BoostingFeatureGenerator(raw_segment, self.token_BGM,
                                         self.pattern_BGM).features)
        self.hmm_new.train(partial_features,
                           self.label_sequences,
                           useLaplaceRule=False)
        self.observation_sequences = partial_features

    def run(self):
        i = 0
        self.new_labels = []
        for raw_segment, label_sequence in zip(self.raw_segments,
                                               self.label_sequences):
            new_labels = self.hmm_new.decode(raw_segment)[1]
            self.new_labels.append(new_labels)
            tokens = Tokens(raw_segment).tokens
            feature_vectors = FeatureGenerator(raw_segment).features
            print i, ':  ', raw_segment
            for token, old_label, new_label, feature_vector in zip(
                    tokens, label_sequence, new_labels, feature_vectors):
                print to_label(old_label), '\t', to_label(
                    new_label), '\t', token
                self.feature_entity_list.add_entity(
                    feature_vector, old_label, token)  #???? Old label first
            print '\n'
            i += 1

    def find_pattern(self):
        self.hmm_new.feature_entity_list.print_all_entity()

    # Find the first tokens at VN boundaries
    def find_venue_boundary_tokens(self):
        recorder = {}
        for raw_segment, observation_sequence, label_sequence in zip(
                self.raw_segments, self.observation_sequences,
                self.label_sequences):
            first_target_label_flag = True
            tokens = Tokens(raw_segment).tokens
            for token, feature_vector, label in zip(tokens,
                                                    observation_sequence,
                                                    label_sequence):
                # First meet a VN label
                if label == 4 and first_target_label_flag:
                    key = token.lower()
                    if not key.islower():
                        continue
                    if recorder.has_key(key):
                        recorder[key] += 1
                    else:
                        recorder[key] = 1
                    first_target_label_flag = False

                elif (first_target_label_flag is False) and label in [0, 1, 3]:
                    first_target_label_flag = True

        for k, v in recorder.iteritems():
            print k, '\t', v
        return recorder

    # Learn the general order of structure of publications before moving forward
    def find_majority_structure(self):
        first_bit_counter = {'0': 0, '3': 0, '4': 0, '5': 0}
        overall_pattern_counter = {}
        for label_sequence in self.label_sequences:
            label = label_sequence[0]
            if label == 2:
                continue
            elif label == 5:
                continue
            elif label in [0, 1]:
                first_bit_counter['0'] += 1
            else:
                first_bit_counter[str(label)] += 1

            pattern = []
            for label in label_sequence:
                if label in [2, 5]:
                    continue
                elif label in [0, 1]:
                    if 0 in pattern:
                        continue
                    else:
                        pattern.append(0)
                elif label == 3:
                    if 3 in pattern:
                        continue
                    else:
                        pattern.append(3)
                elif label == 4:
                    if 4 in pattern:
                        continue
                    else:
                        pattern.append(4)
            key = str(pattern)
            if overall_pattern_counter.has_key(key):
                overall_pattern_counter[key] += 1
            else:
                overall_pattern_counter[key] = 1

        # Inducing the structure
        sorted_firstbit_counter = sorted(first_bit_counter.iteritems(),
                                         key=operator.itemgetter(1),
                                         reverse=True)
        sorted_pattern_counter = sorted(overall_pattern_counter.iteritems(),
                                        key=operator.itemgetter(1),
                                        reverse=True)
        print '===========================================', sorted_pattern_counter
        return int(sorted_firstbit_counter[0][0]), ast.literal_eval(
            sorted_pattern_counter[0][0]), (float(
                sorted_pattern_counter[0][1])) / len(self.label_sequences)

    def run_with_boosting_features(self):
        i = 0
        self.new_labels = []
        self.combined_labels = []

        for raw_segment, label_sequence in zip(self.raw_segments,
                                               self.label_sequences):
            feature_vectors, new_labels = self.hmm_new.decode(
                raw_segment, True, True, self.token_BGM, self.pattern_BGM)
            self.new_labels.append(new_labels)
            tokens = Tokens(raw_segment).tokens
            print i, ':  ', raw_segment

            # Combination step:
            tmp_combined_labels = []  # the decided combined labels so far
            for token, old_label, new_label, feature_vector in zip(
                    tokens, label_sequence, new_labels, feature_vectors):

                # Combine old and new labels to come out a combined label, and deciding...
                combined_label = -1

                if old_label == new_label:
                    combined_label = new_label
                    tmp_combined_labels.append(new_label)

                # Combine compatible labels: FN and LN
                elif old_label in [0, 1] and new_label in [0, 1]:
                    combined_label = old_label
                    tmp_combined_labels.append(new_label)

                # Combine labels that are not compatible
                else:
                    tmp_feature_entity = self.hmm_new.feature_entity_list.lookup(
                        feature_vector
                    )  # Get the Background knowledge provided the feature vector: the language feature model
                    sorted_label_distribution = sorted(
                        tmp_feature_entity.label_distribution.iteritems(),
                        key=operator.itemgetter(1),
                        reverse=True)
                    total_label_occurence = float(
                        sum(tmp[1] for tmp in sorted_label_distribution))

                    # ============================================================================================
                    # ============================================================================================
                    # ???? Experimenting: removing the low prob label distribution; FAILURE; ARCHIVED HERE AND DEPRECATED
                    # sorted_label_distribution = []
                    # sum_prob = 0.0
                    # for pair in tmp_sorted_label_distribution:
                    #     sorted_label_distribution.append(pair)
                    #     sum_prob += pair[1]
                    #     if sum_prob/total_label_occurence >= 0.90:
                    #         break
                    # ============================================================================================
                    # ============================================================================================

                    # Dominant label case: Iterate from the highest label stats according to this feature vector:
                    for label_frequency in sorted_label_distribution:
                        if int(label_frequency[0]) in [
                                old_label, new_label
                        ] and (label_frequency[1] /
                               total_label_occurence) >= self.DOMINANT_RATIO:
                            print 'Dominant labels'
                            # Check for constraint:
                            tmp_label_to_check = int(label_frequency[0])

                            # Find last occurence position of this label
                            if tmp_label_to_check not in [0, 1]:
                                last_occurence = ''.join([
                                    str(c) for c in tmp_combined_labels
                                ]).rfind(str(tmp_label_to_check))
                            elif tmp_label_to_check in [0, 1]:
                                last_occurence_0 = ''.join([
                                    str(c) for c in tmp_combined_labels
                                ]).rfind('0')
                                last_occurence_1 = ''.join([
                                    str(c) for c in tmp_combined_labels
                                ]).rfind('1')
                                last_occurence = max(last_occurence_0,
                                                     last_occurence_1)

                            # Checking constraints by simplifying what we did in viterbi
                            if last_occurence == -1 or last_occurence == (
                                    len(tmp_combined_labels) - 1
                            ):  # Never occurred, or last occurence is the last label
                                # When we are deciding the first label
                                if len(tmp_combined_labels) == 0:
                                    first_bit = self.find_majority_structure(
                                    )[0]
                                    if first_bit == 0 and tmp_label_to_check not in [
                                            0, 1
                                    ]:
                                        continue
                                    if first_bit == 3 and tmp_label_to_check != 3:
                                        continue

                                # VN CANNOT FOLLOW TI W/O DL constraint
                                if tmp_label_to_check == 4 and tmp_combined_labels[
                                        -1] == 3:
                                    continue
                            elif tmp_label_to_check in [0, 1]:
                                flag = False
                                for j in range(last_occurence,
                                               len(tmp_combined_labels)):
                                    if tmp_combined_labels[j] not in [0, 1, 2]:
                                        flag = True
                                        break
                                if flag:
                                    continue
                            elif tmp_label_to_check == 3:
                                continue
                            elif tmp_label_to_check == 4:
                                if tmp_combined_labels[-1] == 3:  #????
                                    continue

                            combined_label = tmp_label_to_check
                            tmp_combined_labels.append(tmp_label_to_check)
                            break

                    # No dominance case OR Dominance-fail-due-to-constraint case: Find relatively if the label with higher possibility follow the constraint of publication order
                    if combined_label == -1:
                        # Iterate from the highest label stats according to this feature vector:

                        for label_frequency in sorted_label_distribution:
                            breakout_flag = False
                            #Test against constraints
                            # 1. DL separate labels principle
                            # 2. AU-TI-VN Order
                            if int(label_frequency[0]) in [
                                    old_label, new_label
                            ]:
                                tmp_label_to_check = int(label_frequency[0])

                                # find structure of the order, and find what have appeared, and so predict what to be appear next
                                structure_overview = [
                                ]  #will record the order in big sense: 0,3,4/4,0,3
                                for tmp_combined_label in tmp_combined_labels:
                                    if tmp_combined_label in [2, 5]:
                                        continue
                                    elif tmp_combined_label in [0, 1]:
                                        if 0 in structure_overview:
                                            continue
                                        else:
                                            structure_overview.append(0)
                                    elif tmp_combined_label == 3:
                                        if 3 in structure_overview:
                                            continue
                                        else:
                                            structure_overview.append(3)
                                    elif tmp_combined_label == 4:
                                        if 4 in structure_overview:
                                            continue
                                        else:
                                            structure_overview.append(4)
                                # Based on the structure overview, find what should appear next
                                appear_next = []
                                if structure_overview == [0]:
                                    appear_next = [0, 1, 3, 2, 5]
                                elif structure_overview == [3]:
                                    appear_next = [3, 0, 1, 2, 5]
                                elif structure_overview == [0, 3]:
                                    appear_next = [3, 4, 2, 5]
                                elif structure_overview == [3, 0]:
                                    appear_next = [0, 1, 4, 2, 5]
                                elif structure_overview == [0, 3, 4]:
                                    appear_next = [4, 2, 5]
                                elif structure_overview == [3, 0, 4]:
                                    appear_next = [4, 2, 5]
                                else:  #weird case
                                    print 'Weird structure! Weird case!'
                                    if tmp_feature_entity.label_distribution[str(
                                            old_label
                                    )] > tmp_feature_entity.label_distribution[
                                            str(new_label)]:
                                        tmp_label_to_check_list = [
                                            old_label, new_label
                                        ]
                                    else:
                                        tmp_label_to_check_list = [
                                            new_label, old_label
                                        ]
                                    # Apply constraints here too
                                    for tmp_label_to_check in tmp_label_to_check_list:
                                        if tmp_label_to_check not in [0, 1]:
                                            last_occurence = ''.join([
                                                str(c)
                                                for c in tmp_combined_labels
                                            ]).rfind(str(tmp_label_to_check))
                                        elif tmp_label_to_check in [0, 1]:
                                            last_occurence_0 = ''.join([
                                                str(c)
                                                for c in tmp_combined_labels
                                            ]).rfind('0')
                                            last_occurence_1 = ''.join([
                                                str(c)
                                                for c in tmp_combined_labels
                                            ]).rfind('1')
                                            last_occurence = max(
                                                last_occurence_0,
                                                last_occurence_1)

                                        # Checking constraints by simplifying what we did in viterbi
                                        if last_occurence == -1 or last_occurence == (
                                                len(tmp_combined_labels) - 1):
                                            # When we are deciding the first label
                                            if len(tmp_combined_labels) == 0:
                                                first_bit = self.find_majority_structure(
                                                )[0]
                                                if first_bit == 0 and tmp_label_to_check not in [
                                                        0, 1
                                                ]:
                                                    continue
                                                if first_bit == 3 and tmp_label_to_check != 3:
                                                    continue
                                            try:
                                                if tmp_label_to_check == 4 and tmp_combined_labels[
                                                        -1] == 3:
                                                    continue
                                            except:
                                                continue
                                        elif tmp_label_to_check in [0, 1]:
                                            flag = False
                                            for j in range(
                                                    last_occurence,
                                                    len(tmp_combined_labels)):
                                                if tmp_combined_labels[
                                                        j] not in [0, 1, 2]:
                                                    flag = True
                                                    break
                                            if flag:
                                                continue
                                        elif tmp_label_to_check == 3:
                                            continue
                                        elif tmp_label_to_check == 4:
                                            if tmp_combined_labels[-1] == 3:
                                                continue

                                        combined_label = tmp_label_to_check
                                        tmp_combined_labels.append(
                                            combined_label)
                                        breakout_flag = True
                                        break

                                if breakout_flag:
                                    break
                                if tmp_label_to_check in appear_next:
                                    # Then check constraint. find last occurence, DL constraints
                                    # Just need to check DL constraints, no need to verify more on tokens, assume token verification is done in the first iteration
                                    if tmp_label_to_check not in [0, 1]:
                                        last_occurence = ''.join([
                                            str(c) for c in tmp_combined_labels
                                        ]).rfind(str(tmp_label_to_check))
                                    elif tmp_label_to_check in [0, 1]:
                                        last_occurence_0 = ''.join([
                                            str(c) for c in tmp_combined_labels
                                        ]).rfind('0')
                                        last_occurence_1 = ''.join([
                                            str(c) for c in tmp_combined_labels
                                        ]).rfind('1')
                                        last_occurence = max(
                                            last_occurence_0, last_occurence_1)

                                    # Checking constraints by simplifying what we did in viterbi
                                    if last_occurence == -1 or last_occurence == (
                                            len(tmp_combined_labels) - 1):
                                        if tmp_label_to_check == 4 and tmp_combined_labels[
                                                -1] == 3:  #Hardcode rule [2013/07/23]: For VN, cannot directly follow a TI without DL???? may remove on real effect
                                            continue
                                    elif tmp_label_to_check in [0, 1]:
                                        flag = False
                                        for j in range(
                                                last_occurence,
                                                len(tmp_combined_labels)):
                                            if tmp_combined_labels[j] not in [
                                                    0, 1, 2
                                            ]:
                                                flag = True
                                                break
                                        if flag:
                                            continue

                                    elif tmp_label_to_check == 3:
                                        continue
                                        # flag = False
                                        # for j in range(last_occurence, len(tmp_combined_labels)):
                                        #     if tmp_combined_labels[j] not in [3,2]:
                                        #         flag = True
                                        #         break
                                        # if flag:
                                        #     continue

                                    elif tmp_label_to_check == 4:
                                        if tmp_combined_labels[-1] == 3:  #????
                                            continue

                                    # elif tmp_label_to_check == 2:
                                    # elif tmp_label_to_check == 5:

                                    # Otherwise, pass
                                    log_err('\t\t' + str(i) +
                                            'Should combine this one')
                                    combined_label = tmp_label_to_check
                                    tmp_combined_labels.append(
                                        tmp_label_to_check)
                                    # combined_label = (tmp_label_to_check, sorted_label_distribution)
                                    break

                                else:
                                    continue

                        # Debug
                        if combined_label == -1:
                            log_err(str(i) + 'problem')
                            combined_label = (appear_next,
                                              sorted_label_distribution)
                            tmp_combined_labels.append(-1)

            # Final check the accordance with the major order, ideally, all records under one domain should have the same order... PS very ugly code I admit
            print '==========================tmp_combined_labels', tmp_combined_labels
            majority_order_structure = self.find_majority_structure()[1]
            majority_rate = self.find_majority_structure()[2]
            tmp_combined_labels_length = len(tmp_combined_labels)
            if majority_rate > 0.80 and majority_order_structure == [0, 3, 4]:
                # p1(phase1): author segments
                for p1 in range(tmp_combined_labels_length):
                    if tmp_combined_labels[p1] in [0, 1, 2, 5]:
                        continue
                    else:
                        break

                # p2(phase2): title segments
                for p2 in range(p1, tmp_combined_labels_length):
                    if tmp_combined_labels[p2] == 3:
                        continue
                    else:
                        break

                #p3(phase3): venue segments
                for p3 in range(p2, tmp_combined_labels_length):
                    if tmp_combined_labels[p3] in [2, 5, 4]:
                        continue
                    else:
                        break

                # Decision
                if p1 == 0:
                    print 'Houston we got a SERIOUS problem!'
                    log_err('Houston we got a SERIOUS problem!!!!!!!!')

                if p2 == p1:
                    print 'Houston we got a problem!'
                    for sp2 in range(p2, tmp_combined_labels_length):
                        if tmp_combined_labels[sp2] != 2:
                            tmp_combined_labels[sp2] = 3
                        else:
                            break  # should fix common mislabeling at this point now??????????

            # elif majority_rate > 0.80 and majority_order_structure == [3,0,4]:    # ???? not sure if this is normal
            #     # p1(phase1): title segments
            #     for p1 in range(tmp_combined_labels_length):
            #         if tmp_combined_labels[p1] in [3]:
            #             continue
            #         else:
            #             break

            #     # p2(phase2): author segments
            #     for p2 in range(p1, tmp_combined_labels_length):
            #         if tmp_combined_labels[p2] == 3:
            #             continue
            #         else:
            #             break

            #     #p3(phase3): venue segments
            #     for p3 in range(p2, tmp_combined_labels_length):
            #         if tmp_combined_labels[p3] in [2,5,4]:
            #             continue
            #         else:
            #             break

            #     # Decision
            #     if p1 == 0:
            #         print 'Houston we got a SERIOUS problem!'
            #         log_err('Houston we got a SERIOUS problem!!!!!!!!')

            #     if p2 == p1:
            #         print 'Houston we got a problem!'
            #         for sp2 in range(p2, tmp_combined_labels_length):
            #             if tmp_combined_labels[sp2] != 2:
            #                 tmp_combined_labels[sp2] = 3
            #             else:
            #                 break
            for old_label, new_label, tmp_combined_label, token, feature_vector in zip(
                    label_sequence, new_labels, tmp_combined_labels, tokens,
                    feature_vectors):
                print to_label(old_label), '\t', to_label(
                    new_label), '\t', to_label(
                        tmp_combined_label), '\t', token, '\t', feature_vector
            print '\n'
            i += 1
示例#8
0
class Retrainer(object):
    def __init__(self, raw_segments, observation_sequences, label_sequences):
        super(Retrainer, self).__init__()
        self.raw_segments = raw_segments
        self.observation_sequences = observation_sequences
        self.label_sequences = label_sequences
        self.hmm_new = None
        self.feature_entity_list = FeatureEntityList()
        self.lm = LanguageModel()
        self.boosting_feature_generator = BoostingFeatureGenerator()

        self.DOMINANT_RATIO = 0.85  # dominant label ratio: set empirically

        self.retrain_with_boosting_features()
    
    def retrain(self):
        self.hmm_new = HMM('retrainer', 6)
        self.hmm_new.train(self.observation_sequences, self.label_sequences, useLaplaceRule=False)  #important to set laplace to be no
    
    # With new features
    def retrain_with_boosting_features(self):
        # Build language model
        for raw_segment, label_sequence in zip(self.raw_segments, self.label_sequences):
            for token, label in zip(Tokens(raw_segment).tokens, label_sequence):
                self.lm.add(token, label)
        self.lm.prettify()
        self.token_BGM = self.lm.prettify_model
        self.pattern_BGM = None

        # Retrain
        self.hmm_new = HMM('retrainer', 6)
        partial_features = []
        for raw_segment in self.raw_segments:
            partial_features.append(BoostingFeatureGenerator(raw_segment, self.token_BGM, self.pattern_BGM).features)
        self.hmm_new.train(partial_features, self.label_sequences, useLaplaceRule=False)
        self.observation_sequences = partial_features


    def run(self):
        i = 0
        self.new_labels = []
        for raw_segment, label_sequence in zip(self.raw_segments, self.label_sequences):
            new_labels = self.hmm_new.decode(raw_segment)[1]
            self.new_labels.append(new_labels)
            tokens = Tokens(raw_segment).tokens
            feature_vectors = FeatureGenerator(raw_segment).features
            print i, ':  ', raw_segment
            for token, old_label, new_label, feature_vector in zip(tokens, label_sequence, new_labels, feature_vectors):
                print to_label(old_label), '\t', to_label(new_label), '\t', token
                self.feature_entity_list.add_entity(feature_vector, old_label, token)   #???? Old label first
            print '\n'
            i+=1

    def find_pattern(self):        
        self.hmm_new.feature_entity_list.print_all_entity()


    # Find the first tokens at VN boundaries
    def find_venue_boundary_tokens(self):
        recorder = {}
        for raw_segment, observation_sequence, label_sequence in zip(self.raw_segments, self.observation_sequences, self.label_sequences):
            first_target_label_flag = True
            tokens = Tokens(raw_segment).tokens
            for token, feature_vector, label in zip(tokens, observation_sequence, label_sequence):
                # First meet a VN label
                if label == 4 and first_target_label_flag:
                    key = token.lower()
                    if not key.islower():
                        continue
                    if recorder.has_key(key):
                        recorder[key] += 1
                    else:
                        recorder[key] = 1
                    first_target_label_flag = False

                elif (first_target_label_flag is False) and label in [0,1,3]:
                    first_target_label_flag = True

        for k,v in recorder.iteritems():
            print k, '\t', v
        return recorder


    # Learn the general order of structure of publications before moving forward
    def find_majority_structure(self):
        first_bit_counter = {'0': 0, '3': 0, '4':0, '5':0}
        overall_pattern_counter = {}
        for label_sequence in self.label_sequences:
            label = label_sequence[0]
            if label == 2:
                continue
            elif label == 5:
                continue
            elif label in [0,1]:
                first_bit_counter['0'] += 1
            else:
                first_bit_counter[str(label)] += 1

            pattern = []
            for label in label_sequence:
                if label in [2,5]:
                    continue
                elif label in [0,1]:
                    if 0 in pattern:
                        continue
                    else:
                        pattern.append(0)
                elif label == 3:
                    if 3 in pattern:
                        continue
                    else:
                        pattern.append(3)
                elif label == 4:
                    if 4 in pattern:
                        continue
                    else:
                        pattern.append(4)
            key = str(pattern)
            if overall_pattern_counter.has_key(key):
                overall_pattern_counter[key] += 1
            else:
                overall_pattern_counter[key] = 1

        # Inducing the structure
        sorted_firstbit_counter = sorted(first_bit_counter.iteritems(), key=operator.itemgetter(1), reverse=True)
        sorted_pattern_counter = sorted(overall_pattern_counter.iteritems(), key=operator.itemgetter(1), reverse=True)
        print '===========================================', sorted_pattern_counter
        return int(sorted_firstbit_counter[0][0]), ast.literal_eval(sorted_pattern_counter[0][0]), (float(sorted_pattern_counter[0][1]))/len(self.label_sequences)

    def run_with_boosting_features(self):
        i = 0
        self.new_labels = []
        self.combined_labels = []

        for raw_segment, label_sequence in zip(self.raw_segments, self.label_sequences):
            feature_vectors, new_labels = self.hmm_new.decode(raw_segment, True, True, self.token_BGM, self.pattern_BGM)
            self.new_labels.append(new_labels)
            tokens = Tokens(raw_segment).tokens
            print i, ':  ', raw_segment

            # Combination step: 
            tmp_combined_labels = []    # the decided combined labels so far
            for token, old_label, new_label, feature_vector in zip(tokens, label_sequence, new_labels, feature_vectors):

                # Combine old and new labels to come out a combined label, and deciding...
                combined_label = -1
                
                if old_label == new_label:
                    combined_label = new_label
                    tmp_combined_labels.append(new_label)
                
                # Combine compatible labels: FN and LN
                elif old_label in [0,1] and new_label in [0,1]:
                    combined_label = old_label
                    tmp_combined_labels.append(new_label)
                
                # Combine labels that are not compatible
                else:   
                    tmp_feature_entity = self.hmm_new.feature_entity_list.lookup(feature_vector)    # Get the Background knowledge provided the feature vector: the language feature model
                    sorted_label_distribution = sorted(tmp_feature_entity.label_distribution.iteritems(), key=operator.itemgetter(1), reverse=True)
                    total_label_occurence = float(sum(tmp[1] for tmp in sorted_label_distribution))

                    

                    # ============================================================================================
                    # ============================================================================================
                    # ???? Experimenting: removing the low prob label distribution; FAILURE; ARCHIVED HERE AND DEPRECATED 
                    # sorted_label_distribution = []
                    # sum_prob = 0.0
                    # for pair in tmp_sorted_label_distribution:
                    #     sorted_label_distribution.append(pair)
                    #     sum_prob += pair[1]
                    #     if sum_prob/total_label_occurence >= 0.90:
                    #         break
                    # ============================================================================================
                    # ============================================================================================



                    # Dominant label case: Iterate from the highest label stats according to this feature vector:
                    for label_frequency in sorted_label_distribution:
                        if int(label_frequency[0]) in [old_label, new_label] and (label_frequency[1]/total_label_occurence)>=self.DOMINANT_RATIO:
                            print 'Dominant labels'
                            # Check for constraint:
                            tmp_label_to_check = int(label_frequency[0])
                            
                            # Find last occurence position of this label
                            if tmp_label_to_check not in [0,1]:
                                last_occurence = ''.join([str(c) for c in tmp_combined_labels]).rfind(str(tmp_label_to_check))
                            elif tmp_label_to_check in [0,1]:
                                last_occurence_0 = ''.join([str(c) for c in tmp_combined_labels]).rfind('0')
                                last_occurence_1 = ''.join([str(c) for c in tmp_combined_labels]).rfind('1')
                                last_occurence = max(last_occurence_0, last_occurence_1)

                            # Checking constraints by simplifying what we did in viterbi
                            if last_occurence == -1 or last_occurence == (len(tmp_combined_labels)-1):  # Never occurred, or last occurence is the last label
                                # When we are deciding the first label
                                if len(tmp_combined_labels) == 0:
                                    first_bit = self.find_majority_structure()[0]
                                    if first_bit == 0 and tmp_label_to_check not in [0,1]:
                                        continue
                                    if first_bit == 3 and tmp_label_to_check != 3:
                                        continue

                                # VN CANNOT FOLLOW TI W/O DL constraint
                                if tmp_label_to_check == 4 and tmp_combined_labels[-1] == 3:
                                    continue
                            elif tmp_label_to_check in [0,1]:
                                flag = False
                                for j in range(last_occurence, len(tmp_combined_labels)):
                                    if tmp_combined_labels[j] not in [0,1,2]:
                                        flag = True
                                        break
                                if flag:
                                    continue
                            elif tmp_label_to_check == 3:
                                continue
                            elif tmp_label_to_check == 4:
                                if tmp_combined_labels[-1] == 3:    #????
                                    continue

                            combined_label = tmp_label_to_check
                            tmp_combined_labels.append(tmp_label_to_check)
                            break
                    
                    # No dominance case OR Dominance-fail-due-to-constraint case: Find relatively if the label with higher possibility follow the constraint of publication order
                    if combined_label == -1:
                        # Iterate from the highest label stats according to this feature vector:

                        for label_frequency in sorted_label_distribution:
                            breakout_flag = False
                            #Test against constraints
                            # 1. DL separate labels principle
                            # 2. AU-TI-VN Order 
                            if int(label_frequency[0]) in [old_label, new_label]:
                                tmp_label_to_check = int(label_frequency[0])
                                
                                # find structure of the order, and find what have appeared, and so predict what to be appear next
                                structure_overview = []     #will record the order in big sense: 0,3,4/4,0,3
                                for tmp_combined_label in tmp_combined_labels:
                                    if tmp_combined_label in [2,5]:
                                        continue                                            
                                    elif tmp_combined_label in [0,1]:
                                        if 0 in structure_overview:
                                            continue
                                        else:
                                            structure_overview.append(0)
                                    elif tmp_combined_label == 3:
                                        if 3 in structure_overview:
                                            continue
                                        else:
                                            structure_overview.append(3)
                                    elif tmp_combined_label == 4:
                                        if 4 in structure_overview:
                                            continue
                                        else:
                                            structure_overview.append(4)
                                # Based on the structure overview, find what should appear next
                                appear_next = []
                                if structure_overview == [0]:
                                    appear_next = [0,1,3,2,5]
                                elif structure_overview == [3]:
                                    appear_next = [3,0,1,2,5]
                                elif structure_overview == [0,3]:
                                    appear_next = [3,4,2,5]
                                elif structure_overview == [3,0]:
                                    appear_next = [0,1,4,2,5]
                                elif structure_overview == [0,3,4]:
                                    appear_next = [4,2,5]
                                elif structure_overview == [3,0,4]:
                                    appear_next = [4,2,5]
                                else:   #weird case
                                    print 'Weird structure! Weird case!'
                                    if tmp_feature_entity.label_distribution[str(old_label)] > tmp_feature_entity.label_distribution[str(new_label)]:
                                        tmp_label_to_check_list = [old_label, new_label]
                                    else:
                                        tmp_label_to_check_list = [new_label, old_label]
                                    # Apply constraints here too
                                    for tmp_label_to_check in tmp_label_to_check_list:
                                        if tmp_label_to_check not in [0,1]:
                                            last_occurence = ''.join([str(c) for c in tmp_combined_labels]).rfind(str(tmp_label_to_check))
                                        elif tmp_label_to_check in [0,1]:
                                            last_occurence_0 = ''.join([str(c) for c in tmp_combined_labels]).rfind('0')
                                            last_occurence_1 = ''.join([str(c) for c in tmp_combined_labels]).rfind('1')
                                            last_occurence = max(last_occurence_0, last_occurence_1)

                                        # Checking constraints by simplifying what we did in viterbi
                                        if last_occurence == -1 or last_occurence == (len(tmp_combined_labels)-1):
                                            # When we are deciding the first label
                                            if len(tmp_combined_labels) == 0:
                                                first_bit = self.find_majority_structure()[0]
                                                if first_bit == 0 and tmp_label_to_check not in [0,1]:
                                                    continue
                                                if first_bit == 3 and tmp_label_to_check != 3:
                                                    continue
                                            try:
                                                if tmp_label_to_check == 4 and tmp_combined_labels[-1] == 3:
                                                    continue
                                            except:
                                                continue
                                        elif tmp_label_to_check in [0,1]:
                                            flag = False
                                            for j in range(last_occurence, len(tmp_combined_labels)):
                                                if tmp_combined_labels[j] not in [0,1,2]:
                                                    flag = True
                                                    break
                                            if flag:
                                                continue
                                        elif tmp_label_to_check == 3:
                                            continue
                                        elif tmp_label_to_check == 4:
                                            if tmp_combined_labels[-1] == 3:
                                                continue

                                        combined_label = tmp_label_to_check
                                        tmp_combined_labels.append(combined_label)
                                        breakout_flag = True
                                        break

                                if breakout_flag:
                                    break
                                if tmp_label_to_check in appear_next:
                                    # Then check constraint. find last occurence, DL constraints
                                    # Just need to check DL constraints, no need to verify more on tokens, assume token verification is done in the first iteration
                                    if tmp_label_to_check not in [0,1]:
                                        last_occurence = ''.join([str(c) for c in tmp_combined_labels]).rfind(str(tmp_label_to_check))
                                    elif tmp_label_to_check in [0,1]:
                                        last_occurence_0 = ''.join([str(c) for c in tmp_combined_labels]).rfind('0')
                                        last_occurence_1 = ''.join([str(c) for c in tmp_combined_labels]).rfind('1')
                                        last_occurence = max(last_occurence_0, last_occurence_1)

                                    # Checking constraints by simplifying what we did in viterbi
                                    if last_occurence == -1 or last_occurence == (len(tmp_combined_labels)-1):
                                        if tmp_label_to_check == 4 and tmp_combined_labels[-1] == 3: #Hardcode rule [2013/07/23]: For VN, cannot directly follow a TI without DL???? may remove on real effect
                                            continue
                                    elif tmp_label_to_check in [0,1]:
                                        flag = False
                                        for j in range(last_occurence, len(tmp_combined_labels)):
                                            if tmp_combined_labels[j] not in [0,1,2]:
                                                flag = True
                                                break
                                        if flag:
                                            continue

                                    elif tmp_label_to_check == 3:
                                        continue
                                        # flag = False
                                        # for j in range(last_occurence, len(tmp_combined_labels)):
                                        #     if tmp_combined_labels[j] not in [3,2]:
                                        #         flag = True
                                        #         break
                                        # if flag:
                                        #     continue

                                    elif tmp_label_to_check == 4:
                                        if tmp_combined_labels[-1] == 3:    #????
                                            continue

                                    # elif tmp_label_to_check == 2:
                                    # elif tmp_label_to_check == 5:
                                    
                                    # Otherwise, pass
                                    log_err('\t\t' + str(i) + 'Should combine this one')
                                    combined_label = tmp_label_to_check
                                    tmp_combined_labels.append(tmp_label_to_check)
                                    # combined_label = (tmp_label_to_check, sorted_label_distribution)
                                    break
                                    
                                else:
                                    continue

                        # Debug
                        if combined_label == -1:
                            log_err(str(i) + 'problem')
                            combined_label = (appear_next, sorted_label_distribution)
                            tmp_combined_labels.append(-1)


            # Final check the accordance with the major order, ideally, all records under one domain should have the same order... PS very ugly code I admit
            print '==========================tmp_combined_labels', tmp_combined_labels
            majority_order_structure = self.find_majority_structure()[1]
            majority_rate = self.find_majority_structure()[2]
            tmp_combined_labels_length = len(tmp_combined_labels)
            if majority_rate > 0.80 and majority_order_structure == [0,3,4]:
                # p1(phase1): author segments
                for p1 in range(tmp_combined_labels_length):
                    if tmp_combined_labels[p1] in [0,1,2,5]:
                        continue
                    else:
                        break

                # p2(phase2): title segments
                for p2 in range(p1, tmp_combined_labels_length):
                    if tmp_combined_labels[p2] == 3:
                        continue
                    else:
                        break

                #p3(phase3): venue segments
                for p3 in range(p2, tmp_combined_labels_length):
                    if tmp_combined_labels[p3] in [2,5,4]:
                        continue
                    else:
                        break

                # Decision
                if p1 == 0:
                    print 'Houston we got a SERIOUS problem!'
                    log_err('Houston we got a SERIOUS problem!!!!!!!!')

                if p2 == p1:
                    print 'Houston we got a problem!'
                    for sp2 in range(p2, tmp_combined_labels_length):
                        if tmp_combined_labels[sp2] != 2:
                            tmp_combined_labels[sp2] = 3
                        else:
                            break   # should fix common mislabeling at this point now??????????


            # elif majority_rate > 0.80 and majority_order_structure == [3,0,4]:    # ???? not sure if this is normal
            #     # p1(phase1): title segments
            #     for p1 in range(tmp_combined_labels_length):
            #         if tmp_combined_labels[p1] in [3]:
            #             continue
            #         else:
            #             break

            #     # p2(phase2): author segments
            #     for p2 in range(p1, tmp_combined_labels_length):
            #         if tmp_combined_labels[p2] == 3:
            #             continue
            #         else:
            #             break

            #     #p3(phase3): venue segments
            #     for p3 in range(p2, tmp_combined_labels_length):
            #         if tmp_combined_labels[p3] in [2,5,4]:
            #             continue
            #         else:
            #             break

            #     # Decision
            #     if p1 == 0:
            #         print 'Houston we got a SERIOUS problem!'
            #         log_err('Houston we got a SERIOUS problem!!!!!!!!')

            #     if p2 == p1:
            #         print 'Houston we got a problem!'
            #         for sp2 in range(p2, tmp_combined_labels_length):
            #             if tmp_combined_labels[sp2] != 2:
            #                 tmp_combined_labels[sp2] = 3
            #             else:
            #                 break
            for old_label, new_label, tmp_combined_label, token, feature_vector in zip(label_sequence, new_labels, tmp_combined_labels, tokens, feature_vectors):
                print to_label(old_label), '\t', to_label(new_label), '\t', to_label(tmp_combined_label), '\t', token, '\t', feature_vector
            print '\n'
            i+=1
示例#9
0
simulated data
"""
import numpy as np
from hmm import HMM


def get_input_data(N, T, V):
    return np.random.randint(0, V, size=(N, T))


def get_state(H):
    return np.array(range(0, H))


# simulation
V = [0, 1, 2, 3, 4]
X = get_input_data(20, 10, len(V))
Y = get_state(3)

# training
model = HMM()
model.fit(X, Y, V)

t = model.Transition
e = model.Emission
p = model.Pi

# decoding
model = HMM(t, e, p)
res = model.decode(np.random.randint(0, len(V), size=(2, 10)))
print(res)