def createModelFilesFromInput(input_filename, q_mle_filename, e_mle_filename):
    """Build Q and E MLE model files from a tagged training corpus.

    Reads training sentences from `input_filename`, converts words and tags
    to integer ids, accumulates tag trigram/bigram/unigram counts (the Q
    model) and word-tag emission counts (the E model), and writes them to
    `q_mle_filename` and `e_mle_filename` respectively.
    """
    logging.basicConfig()
    log = logging.getLogger()
    log.setLevel(logging.DEBUG)

    log.debug("Reading input file")
    train_data = utils.read_input_file(input_filename, replace_numbers=False)

    log.debug("- Converting words\\tags to ids")
    # Words below COMMON_WORD_MIN_COUNT are excluded from the vocabulary.
    W2I = list_to_ids(flatten(reduce_tuple_list(train_data, dim=0)),
                      MIN_COUNT=COMMON_WORD_MIN_COUNT)
    # Add *UNK* pseudo-words with fresh ids after the known vocabulary so
    # rare/unseen words can still be mapped at prediction time.
    unk_words = MLETrain.__generate_unk_words()
    for i, w_unk in enumerate(unk_words, start=max(W2I.values()) + 1):
        W2I[w_unk] = i
    T2I = list_to_ids(flatten(reduce_tuple_list(train_data, dim=1)))
    train_data_ids = MLETrain.__sentences_to_ids(train_data, W2I, T2I)
    # Inverse dictionaries (id -> symbol) used when writing the model files.
    I2T = utils.inverse_dict(T2I)
    I2W = utils.inverse_dict(W2I)

    log.debug("- Counting:")
    count_tag_triplets = Counter()
    count_tag_pairs = Counter()
    count_tag_single = Counter()
    count_word_tags = Counter()
    # NOTE(review): the original called count_word_tags.update() with no
    # arguments here — a no-op on a Counter — so that dead call was removed.
    for sentence in train_data_ids:
        words_ids = sentence[0]
        tags_ids = sentence[1]
        # Q counts: tag trigrams, bigrams and unigrams.
        count_tag_triplets.update(utils.count_triplets(tags_ids))
        count_tag_pairs.update(utils.count_pairs(tags_ids))
        count_tag_single.update(utils.count_single(tags_ids))
        # E counts: (word, tag) emissions.
        count_word_tags.update(utils.count_word_tags(words_ids, tags_ids))

    log.debug("Writing to file {}".format(q_mle_filename))
    utils.write_q_mle_file(count_tag_triplets, count_tag_pairs,
                           count_tag_single, I2T, q_mle_filename)
    log.debug("Writing to file {}".format(e_mle_filename))
    utils.write_e_mle_file(count_word_tags, I2T, I2W, e_mle_filename)
    log.debug("Done")
def getPrediction(self, sentence_words):
    """Predict a tag sequence for `sentence_words`.

    Runs second-order Viterbi decoding (log-space, with beam search) over
    the tag set of the trained MLE model and returns the predicted tags as
    a list of strings, one per input word.
    """
    tags = self.__mletrain.getTags()
    T2I = utils.list_to_ids(tags, ID_SHIFT=0)
    I2T = utils.inverse_dict(T2I)

    def getLogScore(wi, t_id, t_prev_id, t_prev_prev_id):
        # log q(t | t-2, t-1) + log e(w | t)
        q = self.__mletrain.getQ(I2T[t_id], I2T[t_prev_id], I2T[t_prev_prev_id])
        e = self.__mletrain.getE(wi, I2T[t_id])
        return np.log(q) + np.log(e)

    prediction_ids = viterbi.run_viterbi_2nd_order_log_with_beam_search(
        sentence_words,
        len(sentence_words),
        len(tags),
        T2I[utils.START_TAG],
        getLogScore)
    return [I2T[p_id] for p_id in prediction_ids]
def __init__(self, model, feature_map_dict_vect, T2I, common_words):
    """Keep the trained model, its feature vectorizer, the common-word
    vocabulary, and the inverse tag mapping (id -> tag) for prediction."""
    self.__I2T = utils.inverse_dict(T2I)
    self.__common_words = common_words
    self.__feature_map_dict_vect = feature_map_dict_vect
    self.__model = model
def get_histo_type(self, histo_key, data):
    """Return the histogram type for `histo_key`.

    Looks the key up in the inverted WEIRD_HT override table; when no
    override exists, falls back to data['type'] (which must be present —
    it is evaluated unconditionally, as in the original).
    """
    fallback_type = data['type']
    overrides = inverse_dict(FeatureExtractor.WEIRD_HT)
    return overrides.get(histo_key, fallback_type)
if __name__ == '__main__': arguments = docopt(__doc__, version='Naval Fate 2.0') words_filename = arguments['<WORDS_FILE>'] contexts_filename = arguments['<CONTEXTS_FILE>'] print("Reading input files...") W2I, C2I, words, contexts = load_from_files(words_filename, contexts_filename) #W2I, C2I, words, contexts = load_from_files("../data/word2vec/bow5/bow5.words","../data/word2vec/bow5/bow5.contexts") #W2I, C2I, words, contexts = load_from_files("../data/word2vec/deps/deps.words","../data/word2vec/deps/deps.contexts") k = 20 I2W = utils.inverse_dict(W2I.S2I) I2C = utils.inverse_dict(C2I.S2I) target_words = [ "car", "bus", "hospital", "hotel", "gun", "bomb", "horse", "fox", "table", "bowl", "guitar", "piano" ] # First order # Find top k context features for each target word # top features is highest dot product (word, context_word) print("\n1st order") dwc = DotWithCache() for from_w in target_words: from_w_id = W2I.get_id(from_w) from_vec = words[from_w_id, :]
if save_to_file: preprocess.save_to_file(out_dir + "/preprocess.pickle") time_e = time.time() print("Done. time: %.2f secs" % (time_e - time_s)) else: preprocess = Preprocess.load_from_file(out_dir + "/preprocess.pickle") target_words = [ "car", "bus", "hospital", "hotel", "gun", "bomb", "horse", "fox", "table", "bowl", "guitar", "piano" ] target_words_ids = [preprocess.W2I.get_id(w) for w in target_words] W2I_TREE, contexts = preprocess.contexts I2W = utils.inverse_dict(preprocess.W2I.S2I) if mod == "tree": target_words_ids = [ W2I_TREE.get_id(str(id)) for id in target_words_ids ] I2W_TREE = utils.inverse_dict(W2I_TREE.S2I) inv_func = lambda u: " ".join( [I2W[int(s)] if s.isdigit() else s for s in I2W_TREE[u].split()]) else: inv_func = lambda u: I2W[u] print( "Converting frequencies to pmis, calculating cosine distances for target words" ) if calc_sim: pmi_contexts = contexts_to_pmi_contexts(contexts)