Пример #1
0
    def createModelFilesFromInput(input_filename, q_mle_filename,
                                  e_mle_filename):
        """Train an HMM MLE model from a tagged corpus and write it to disk.

        Reads (word, tag) sentences from ``input_filename``, counts tag
        unigrams/bigrams/trigrams (for Q, the transition probabilities) and
        word-tag pairs (for E, the emission probabilities), then writes the
        counts to ``q_mle_filename`` and ``e_mle_filename``.

        :param input_filename: path to the training file of tagged sentences.
        :param q_mle_filename: output path for the transition (Q) counts.
        :param e_mle_filename: output path for the emission (E) counts.
        """
        logging.basicConfig()
        log = logging.getLogger()
        log.setLevel(logging.DEBUG)

        log.debug("Reading input file")
        train_data = utils.read_input_file(input_filename,
                                           replace_numbers=False)

        log.debug("- Converting words\\tags to ids")
        # Only words seen at least COMMON_WORD_MIN_COUNT times get their own
        # id; rarer words will fall back to the *UNK* pseudo-words below.
        W2I = list_to_ids(flatten(reduce_tuple_list(train_data, dim=0)),
                          MIN_COUNT=COMMON_WORD_MIN_COUNT)
        # Append the unknown-word signatures after all known-word ids.
        unk_words = MLETrain.__generate_unk_words()
        next_id = max(W2I.values()) + 1
        for offset, w_unk in enumerate(unk_words):
            W2I[w_unk] = next_id + offset
        T2I = list_to_ids(flatten(reduce_tuple_list(train_data, dim=1)))
        train_data_ids = MLETrain.__sentences_to_ids(train_data, W2I, T2I)
        # Inverse dictionaries (id -> tag / id -> word) for file output.
        I2T = utils.inverse_dict(T2I)
        I2W = utils.inverse_dict(W2I)

        log.debug("- Counting:")
        count_tag_triplets = Counter()
        count_tag_pairs = Counter()
        count_tag_single = Counter()
        count_word_tags = Counter()
        for sentence in train_data_ids:
            words_ids = sentence[0]
            tags_ids = sentence[1]
            # Q: tag n-gram statistics for the transition model.
            count_tag_triplets.update(utils.count_triplets(tags_ids))
            count_tag_pairs.update(utils.count_pairs(tags_ids))
            count_tag_single.update(utils.count_single(tags_ids))
            # E: (word, tag) co-occurrence statistics for the emission model.
            count_word_tags.update(utils.count_word_tags(words_ids, tags_ids))

        log.debug("Writing to file {}".format(q_mle_filename))
        utils.write_q_mle_file(count_tag_triplets, count_tag_pairs,
                               count_tag_single, I2T, q_mle_filename)

        log.debug("Writing to file {}".format(e_mle_filename))
        utils.write_e_mle_file(count_word_tags, I2T, I2W, e_mle_filename)

        log.debug("Done")
Пример #2
0
    def getPrediction(self, sentence_words):
        """Predict a tag sequence for *sentence_words* using second-order
        Viterbi decoding (log-space, with beam search)."""
        all_tags = self.__mletrain.getTags()
        tag_to_id = utils.list_to_ids(all_tags, ID_SHIFT=0)
        id_to_tag = utils.inverse_dict(tag_to_id)

        n_words = len(sentence_words)
        n_tags = len(all_tags)
        start_id = tag_to_id[utils.START_TAG]

        def log_score(wi, t_id, t_prev_id, t_prev_prev_id):
            # log Q(t | t_prev, t_prev_prev) + log E(word | t)
            q = self.__mletrain.getQ(id_to_tag[t_id], id_to_tag[t_prev_id],
                                     id_to_tag[t_prev_prev_id])
            e = self.__mletrain.getE(wi, id_to_tag[t_id])
            return np.log(q) + np.log(e)

        best_path_ids = viterbi.run_viterbi_2nd_order_log_with_beam_search(
            sentence_words, n_words, n_tags, start_id, log_score)

        return [id_to_tag[tag_id] for tag_id in best_path_ids]
Пример #3
0
 def __init__(self, model, feature_map_dict_vect, T2I, common_words):
     """Keep the trained model, its feature vectorizer and the common-word
     set; invert the tag->id map once so predictions can be decoded."""
     self.__I2T = utils.inverse_dict(T2I)
     self.__model = model
     self.__common_words = common_words
     self.__feature_map_dict_vect = feature_map_dict_vect
Пример #4
0
    def get_histo_type(self, histo_key, data):
        """Resolve the histogram type for *histo_key*.

        Looks the key up in the inverted WEIRD_HT table of special cases;
        when the key is not special-cased, falls back to the entry's own
        'type' field from *data*.
        """
        special_cases = inverse_dict(FeatureExtractor.WEIRD_HT)
        default_type = data['type']
        return special_cases.get(histo_key, default_type)
Пример #5
0
if __name__ == '__main__':
    # Parse CLI arguments; docopt builds the parser from this module's
    # __doc__ usage string.
    arguments = docopt(__doc__, version='Naval Fate 2.0')
    words_filename = arguments['<WORDS_FILE>']
    contexts_filename = arguments['<CONTEXTS_FILE>']

    print("Reading input files...")
    # W2I/C2I map words/contexts to ids; words/contexts appear to be the
    # corresponding embedding matrices (rows indexed by id) — confirm in
    # load_from_files.
    W2I, C2I, words, contexts = load_from_files(words_filename,
                                                contexts_filename)

    #W2I, C2I, words, contexts = load_from_files("../data/word2vec/bow5/bow5.words","../data/word2vec/bow5/bow5.contexts")
    #W2I, C2I, words, contexts = load_from_files("../data/word2vec/deps/deps.words","../data/word2vec/deps/deps.contexts")

    # Number of top similar items to report per target word.
    k = 20

    # Inverse maps (id -> word / id -> context) for printing results.
    I2W = utils.inverse_dict(W2I.S2I)
    I2C = utils.inverse_dict(C2I.S2I)

    # Query words whose nearest neighbours / top contexts will be reported.
    target_words = [
        "car", "bus", "hospital", "hotel", "gun", "bomb", "horse", "fox",
        "table", "bowl", "guitar", "piano"
    ]

    # First order
    # Find top k context features for each target word
    # top features is highest dot product (word, context_word)
    print("\n1st order")
    dwc = DotWithCache()
    for from_w in target_words:
        from_w_id = W2I.get_id(from_w)
        # Row of the word-embedding matrix for this target word.
        from_vec = words[from_w_id, :]
Пример #6
0
        if save_to_file:
            # Persist the preprocessing result so later runs can skip it.
            preprocess.save_to_file(out_dir + "/preprocess.pickle")
        time_e = time.time()
        print("Done. time: %.2f secs" % (time_e - time_s))
    else:
        # Reuse a previously pickled preprocessing result.
        preprocess = Preprocess.load_from_file(out_dir + "/preprocess.pickle")

    # Query words whose similarity neighbours will be computed.
    target_words = [
        "car", "bus", "hospital", "hotel", "gun", "bomb", "horse", "fox",
        "table", "bowl", "guitar", "piano"
    ]
    target_words_ids = [preprocess.W2I.get_id(w) for w in target_words]

    W2I_TREE, contexts = preprocess.contexts

    # id -> word map for decoding results back to strings.
    I2W = utils.inverse_dict(preprocess.W2I.S2I)
    if mod == "tree":
        # In tree mode ids are re-keyed through W2I_TREE, whose keys are the
        # stringified plain word ids.
        target_words_ids = [
            W2I_TREE.get_id(str(id)) for id in target_words_ids
        ]
        I2W_TREE = utils.inverse_dict(W2I_TREE.S2I)
        # Tree context strings mix word-ids and literal tokens: map the
        # digit tokens back to words, keep the rest as-is.
        inv_func = lambda u: " ".join(
            [I2W[int(s)] if s.isdigit() else s for s in I2W_TREE[u].split()])
    else:
        inv_func = lambda u: I2W[u]

    print(
        "Converting frequencies to pmis, calculating cosine distances for target words"
    )
    if calc_sim:
        pmi_contexts = contexts_to_pmi_contexts(contexts)