def main():
    """Train the suffix tagger, save its model, and tag the development data.

    Reads ``tag_train.dat`` and ``tag_dev.dat``, writes ``suffix_tagger.model``
    and ``q5_output``, and deletes the intermediate ``q5_*`` files it creates.
    Also reads ``q4_histories``, which this function does not create, so it
    must already exist when this runs.
    """
    # Step 1: Read in training data, initialize dictionary of features with
    # weight 0.
    weights = init_suffix_weights("tag_train.dat")

    # Step 2: Get the gold tag histories using tagger_history_generator.py.
    # The handles are closed as soon as the child process exits so the files
    # can safely be re-read by name and removed later.
    with open("tag_train.dat", "r") as train_data, \
            open("q5_gold", "w") as gold:
        call(["python", "tagger_history_generator.py", "GOLD"],
             stdout=gold, stdin=train_data)

    # Step 3: Enumerate all possible histories.
    with open("tag_train.dat", "r") as train_data, \
            open("q5_histories", "w") as histories:
        call(["python", "tagger_history_generator.py", "ENUM"],
             stdout=histories, stdin=train_data)

    # Step 4: Run the perceptron.
    # NOTE(review): the previous comment said "k=4 times" but the call passes
    # 5 -- confirm which is intended. The return value is discarded, so
    # perceptron() presumably updates `weights` in place; verify.
    perceptron(weights, "tag_train.dat", "q5_histories", "q5_gold", 5)

    # Step 5: Write the final model out to suffix_tagger.model, one
    # "<feature> <weight>" pair per line. open() replaces the Python-2-only
    # file() builtin, and the with-block guarantees the model is flushed.
    with open("suffix_tagger.model", "w") as final_model:
        for key in weights:
            final_model.write(key + " " + str(weights[key]) + "\n")

    # Run model with suffix, tag and bigram features on development data.
    tag_weights = tagmodel_weights()
    weights.update(tag_weights)
    weighted_history("tag_dev.dat", "q4_histories", weights, "q5_weighted",
                     True, True)
    with open("q5_weighted", "r") as weighted, \
            open("q5_best", "w") as best_tag:
        call(["python", "tagger_decoder.py", "HISTORY"],
             stdout=best_tag, stdin=weighted)

    # Save file with word-tag combos.
    set_tags("tag_dev.dat", "q5_best", "q5_output")

    # Clean up intermediate files.
    remove("q5_best")
    remove("q5_histories")
    remove("q5_weighted")
    remove("q5_gold")
def decode():
    """Tag the development data using the weights from tagmodel_weights().

    Reads ``tag_dev.dat`` and writes the tagged result to ``q4_output``.
    ``q4_histories`` is left on disk (not removed here); the intermediate
    ``q4_best`` and ``q4_weighted`` files are deleted.
    """
    # Read tag.model into a map from feature strings to weights.
    weights = tagmodel_weights()

    # 1. Enumerate all possible histories. The input is opened first so a
    #    missing tag_dev.dat does not leave an empty q4_histories behind, and
    #    both handles are closed before the file is re-read by name.
    with open("tag_dev.dat", "r") as data, \
            open("q4_histories", "w") as histories:
        call(["python", "tagger_history_generator.py", "ENUM"],
             stdout=histories, stdin=data)

    # 2. Compute the features for each history and use tag.model to assign a
    #    weight to each history.
    weighted_history("tag_dev.dat", "q4_histories", weights, "q4_weighted",
                     True, False)

    # 3. Call tagger_decoder.py HISTORY and pipe in the weighted histories to
    #    compute the highest scoring tagging.
    with open("q4_weighted", "r") as weighted, \
            open("q4_best", "w") as best_tag:
        call(["python", "tagger_decoder.py", "HISTORY"],
             stdout=best_tag, stdin=weighted)

    # 4. Save file with word-tag combos.
    set_tags("tag_dev.dat", "q4_best", "q4_output")

    # Both files are closed by now, so removal is safe on every platform.
    remove("q4_best")
    remove("q4_weighted")
def _decode_dev_with(weights, score_histories, output_path):
    """Score the dev histories with *weights*, decode, and save the tagging.

    score_histories is the scoring function to use (weighted_history or
    weighted_history2); output_path is the file set_tags() writes to.
    Overwrites the scratch files ``q6_weighted`` and ``q6_best``.
    """
    score_histories("tag_dev.dat", "q4_histories", weights, "q6_weighted",
                    True, True)
    # Handles are closed before q6_best is re-read by name in set_tags().
    with open("q6_weighted", "r") as weighted, \
            open("q6_best", "w") as best_tag:
        call(["python", "tagger_decoder.py", "HISTORY"],
             stdout=best_tag, stdin=weighted)
    # Save file with word-tag combos.
    set_tags("tag_dev.dat", "q6_best", output_path)


def main():
    """Run four hand-tuned weight experiments on the development data.

    Loads the base weights, then writes ``q6_output_combo1`` ..
    ``q6_output_combo4``. Each combo starts from its own copy of the base
    weights so that the experiments do not contaminate one another.
    Removes ``q6_best``, ``q6_weighted`` and ``q4_histories`` when done.
    """
    # Get suffix, tag and bigram feature vectors generated in question4.py
    # and question5.py.
    weights = tagmodel_weights()
    weights.update(suffix_weights("suffix_tagger.model"))

    # Combo 1: Modify certain suffix rules ===================================
    # .copy() rather than plain assignment: assignment only aliases the dict,
    # which made every later combo inherit these changes.
    weights_1 = weights.copy()
    # suffix "ly" is usually ADV; increase weight
    weights_1["SUFFIX:ly:ADV"] = float(weights["SUFFIX:ly:ADV"]) + 3
    # suffix "ed" is usually VERB; increase weight
    weights_1["SUFFIX:ed:VERB"] = float(weights["SUFFIX:ed:VERB"]) + 3
    # suffix "ing" is usually VERB; increase weight
    weights_1["SUFFIX:ing:VERB"] = float(weights["SUFFIX:ing:VERB"]) + 0.05
    _decode_dev_with(weights_1, weighted_history, "q6_output_combo1")

    # Combo 2: Modify one bigram rule ========================================
    weights_2 = weights.copy()
    # bigram "VERB VERB" is often wrong; decrease weight
    weights_2["BIGRAM:VERB:VERB"] = -0.5
    _decode_dev_with(weights_2, weighted_history, "q6_output_combo2")

    # Combo 3: Add content rules =============================================
    weights_3 = weights.copy()
    # If a word has a hyphen, tag as ADJ
    weights_3["CONTAINS:HYPHEN:ADJ"] = 5
    # If word has digits, tag as NUM
    weights_3["CONTAINS:DIGIT:NUM"] = 5
    _decode_dev_with(weights_3, weighted_history2, "q6_output_combo3")

    # Combo 4: All together now ==============================================
    weights_4 = weights.copy()
    # suffix "ly" is usually ADV; increase weight
    weights_4["SUFFIX:ly:ADV"] = float(weights["SUFFIX:ly:ADV"]) + 3
    # suffix "ed" is usually VERB; increase weight
    weights_4["SUFFIX:ed:VERB"] = float(weights["SUFFIX:ed:VERB"]) + 3
    # Deliberately no change to "SUFFIX:ing:VERB" or "BIGRAM:VERB:VERB" here.
    # If a word has a hyphen, tag as ADJ
    weights_4["CONTAINS:HYPHEN:ADJ"] = 5
    # If word has digits, tag as NUM
    weights_4["CONTAINS:DIGIT:NUM"] = 5
    _decode_dev_with(weights_4, weighted_history2, "q6_output_combo4")

    # Clean up scratch files (q6_best is rewritten by every combo, so a
    # single removal at the end is sufficient).
    remove("q6_best")
    remove("q6_weighted")
    remove("q4_histories")