示例#1
0
def extract_positional_bigram_features(window, mid_ix, feature_val=1):
    """
        window      :   sequence of str
                            words in window
        mid_ix      :   int
                            index subtracted from each bi-gram's position
        feature_val :   Any
                            value stored for every feature
        returns     :   dict
                            dict[str]:val

        Builds one feature per bi-gram of the window. Keys look like
        "BI:<position - mid_ix> <first> | <second>".
    """
    return {
        "BI:" + str(position - mid_ix) + " " + pair[0] + " | " + pair[1]: feature_val
        for position, pair in enumerate(
            compute_ngrams(window, max_len=2, min_len=2))
    }
def extend_chains(chains):
    """
        chains      :   iterable of sequences of str
                            token chains
        returns     :   set of str
                            comma-joined chains

        Every chain is added in full, together with each n-gram that
        compute_ngrams(tokens, max_len=None, min_len=3) returns for it.
    """
    extended = set()
    for chain in chains:
        extended.add(",".join(chain))
        sub_chains = compute_ngrams(chain, max_len=None, min_len=3)
        extended.update(",".join(sub_chain) for sub_chain in sub_chains)
    return extended
示例#3
0
    def get_vector_space(self, tokenized_docs):
        """
            tokenized_docs  :   iterable of list
                                    one token list per document; each document must
                                    be a list because it is concatenated with the
                                    list of its collapsed bi-grams
            returns         :   tuple
                                    (collapsed, dict()) where collapsed holds one
                                    self.func(vectors) result per document

            Extends each document with "ng:"-tagged bi-grams, drops terms whose
            document frequency is below self.min_word_count, builds a model with
            self.vector_space_func and projects every remaining term through it.
            Side effect: self.df is replaced by the per-term weight table.
        """
        def tag_ngram(tag, ngram):
            # e.g. ("ng", ["a", "b"]) -> "ng:a-b"
            return tag + ":" + "-".join(ngram)

        data = []
        df_tally = defaultdict(int)

        for doc in tokenized_docs:
            ngrams = compute_ngrams(doc, 2, 2)
            # Skip-grams are currently disabled, so no "sg:" terms are produced.
            skip_grams = []

            # List comprehensions (rather than map) so the concatenation also
            # works on Python 3, where map() returns an iterator.
            example = (doc
                       + [tag_ngram("ng", ngram) for ngram in ngrams]
                       + [tag_ngram("sg", ngram) for ngram in skip_grams])
            data.append(example)
            # Document frequency: each distinct term counts once per document.
            for item in set(example):
                df_tally[item] += 1

        # Remove terms with a low document frequency.
        processed_data = [
            [term for term in example
             if df_tally[term] >= self.min_word_count]
            for example in data
        ]
        del data  # prevent bugs due to later access

        lat_vector_model = self.vector_space_func(processed_data)

        # Every term weight defaults to 1.0 when idf is switched off.
        self.df = defaultdict(lambda: 1.0)
        if self.use_idf:
            for term, doc_freq in df_tally.items():
                self.df[term] = np.log(doc_freq + 1)

        collapsed = []
        for example in processed_data:
            vectors = []
            for token in example:
                v = lat_vector_model.project(token)
                # Identity test: "!= None" on a numpy vector compares
                # element-wise and has no single truth value.
                if v is not None:
                    vectors.append(
                        np.array(v) * self.num_topics / self.df[token])

            collapsed.append(self.func(vectors))

        print("Constructed Vector Space")
        return (collapsed, dict())
    def get_vector_space(self, tokenized_docs):
        """
            tokenized_docs  :   iterable of list
                                    one token list per document; each document must
                                    be a list because it is concatenated with the
                                    list of its collapsed bi-grams
            returns         :   tuple
                                    (collapsed, dict()) where collapsed holds one
                                    self.func(vectors) result per document

            Adds "ng:"-tagged bi-grams to each document, filters out terms seen
            in fewer than self.min_word_count documents, fits a model through
            self.vector_space_func and projects the surviving terms.
            Side effect: self.df is replaced by the per-term weight table.
        """
        def tag_ngram(tag, ngram):
            # e.g. ("ng", ["a", "b"]) -> "ng:a-b"
            return tag + ":" + "-".join(ngram)

        data = []
        df_tally = defaultdict(int)

        for doc in tokenized_docs:
            ngrams = compute_ngrams(doc, 2, 2)
            # Skip-grams are currently disabled, so no "sg:" terms are produced.
            skip_grams = []

            # Build real lists: on Python 3 map() is lazy and cannot be added
            # to a list.
            tagged_ngrams = [tag_ngram("ng", ngram) for ngram in ngrams]
            tagged_skip_grams = [tag_ngram("sg", ngram) for ngram in skip_grams]
            example = doc + tagged_ngrams + tagged_skip_grams
            data.append(example)
            # Document frequency: each distinct term counts once per document.
            for item in set(example):
                df_tally[item] += 1

        # Remove terms with a low document frequency.
        processed_data = []
        for example in data:
            processed_data.append(
                [term for term in example
                 if df_tally[term] >= self.min_word_count])
        del data  # prevent bugs due to later access

        lat_vector_model = self.vector_space_func(processed_data)

        # Every term weight defaults to 1.0 when idf is switched off.
        self.df = defaultdict(lambda: 1.0)
        if self.use_idf:
            for term, doc_freq in df_tally.items():
                self.df[term] = np.log(doc_freq + 1)

        collapsed = []
        for example in processed_data:
            vectors = []
            for token in example:
                v = lat_vector_model.project(token)
                # "is not None" instead of "!= None": the latter is an
                # element-wise comparison when v is a numpy array.
                if v is not None:
                    vectors.append(np.array(v) * self.num_topics / self.df[token])

            collapsed.append(self.func(vectors))

        print("Constructed Vector Space")
        return (collapsed, dict())
def bigram_features(window, mid_ix=None, feature_val = 1):
    """
        window      :   list of str
                            words in window
        mid_ix      :   int
                            accepted for signature compatibility; not read here
        feature_val :   Any
                            value stored for every feature
        returns     :   dict
                            dict[str]:val

        Produces one "BI: <first> | <second>" key per bi-gram of the
        window; the bi-gram's position is not encoded in the key.
    """
    features = {}
    for pair in compute_ngrams(window, max_len=2, min_len=2):
        first, second = pair[0], pair[1]
        features["BI: " + first + " | " + second] = feature_val
    return features
示例#6
0
def tagged_sents_to_word_windows(tagged_sents, window_size):
    """
        tagged_sents    :   iterable of sentences
                                each sentence is an iterable of (word, tag) pairs
        window_size     :   int
                                number of words per window
        returns         :   list of (window, tag)
                                windows from all sentences, in order

        Pads each sentence with int((window_size - 1) / 2) SENT_START markers
        on the left and as many SENT_END markers on the right, slides a window
        of window_size words over it and pairs the windows with the tags.
        Empty sentences contribute no windows.
    """
    offset = int((window_size - 1) / 2)
    tagged_windows = []
    for sent in tagged_sents:
        pairs = list(sent)
        if not pairs:
            # zip(*[]) yields nothing, so unpacking into (wds, tags) would
            # raise ValueError; an empty sentence simply has no windows.
            continue
        wds, tags = zip(*pairs)
        # pad sentence
        padded = [SENT_START] * offset + list(wds) + [SENT_END] * offset

        windows = compute_ngrams(padded, max_len=window_size, min_len=window_size)
        # zip stops at the shorter of the two sequences.
        tagged_windows.extend(zip(windows, tags))

    return tagged_windows
def trigram_features(window, mid_ix=None, feature_val = 1):
    """
        window      :   list of str
                            words in window
        mid_ix      :   int
                            accepted for signature compatibility; not read here
        feature_val :   Any
                            value stored for every feature
        returns     :   dict
                            dict[str]:val

        Produces one "TRI: <w0> | <w1>|<w2>" key per tri-gram of the
        window; the tri-gram's position is not encoded in the key.
    """
    features = {}
    for gram in compute_ngrams(window, max_len=3, min_len=3):
        # The two separators differ (" | " then "|"); the keys are kept as-is.
        joined = gram[0] + " | " + gram[1] + "|" + gram[2]
        features["TRI: " + joined] = feature_val
    return features
def positional_bigram_features(window, mid_ix=None, feature_val = 1):
    """
        window      :   list of str
                            words in window
        mid_ix      :   int
                            index subtracted from each bi-gram's position;
                            compute_middle_index(window) is used when None
        feature_val :   Any
                            value stored for every feature
        returns     :   dict
                            dict[str]:val

        Produces one "P_BI:<position - mid_ix> <first> | <second>" key per
        bi-gram of the window.
    """
    centre = compute_middle_index(window) if mid_ix is None else mid_ix
    features = {}
    pairs = compute_ngrams(window, max_len=2, min_len=2)
    for position, pair in enumerate(pairs):
        key = "P_BI:" + str(position - centre) + " " + pair[0] + " | " + pair[1]
        features[key] = feature_val
    return features
def extract_ngram_features_stemmed(offset, ngram_size, input, val=1):
    """ offset      :   int
                           number of words on either side of input.wordix to include
        ngram_size  :   int
                            the size of the ngrams
        input       :   object exposing .sentence and .wordix
                            input to the feature extractor
        val         :   Any
                            value stored for every feature
        returns     :   dict
                            dictionary of features

        Stems the words around input.wordix, pads the window with __START__ /
        __END__ where it runs past the sentence, and emits one
        "POS_<n>GRAMS:<relative offset>-><w1,w2,...>" key per n-gram.
    """
    word_ix = input.wordix
    last_ix = len(input.sentence) - 1

    # Clip the window to the sentence bounds, then stem what is inside it.
    lo = max(0, word_ix - offset)
    hi = min(last_ix, word_ix + offset)
    window = [stem(word) for word in input.sentence[lo:hi + 1]]

    # Pad whatever was clipped so the window always spans 2 * offset + 1 slots.
    if word_ix < offset:
        window = [__START__] * (offset - word_ix) + window
    if word_ix + offset > last_ix:
        window = window + [__END__] * (word_ix + offset - last_ix)

    key_prefix = "POS_" + str(ngram_size) + "GRAMS:"
    feats = {}
    for position, ngram in enumerate(
            compute_ngrams(window, ngram_size, ngram_size)):
        feats[key_prefix + str(position - offset) + "->" + ",".join(ngram)] = val
    return feats
def extract_ngram_features_stemmed(offset, ngram_size, input, val = 1):
    """ offset      :   int
                           number of words on either side of input.wordix to include
        ngram_size  :   int
                            the size of the ngrams
        input       :   object exposing .sentence and .wordix
                            input to the feature extractor
        val         :   Any
                            value stored for every feature
        returns     :   dict
                            dictionary of features

        Takes the stemmed words within `offset` of input.wordix, fills the
        part of the window that falls outside the sentence with __START__ /
        __END__, and maps "POS_<n>GRAMS:<relative offset>-><w1,w2,...>" to val
        for every n-gram of that window.
    """
    sentence = input.sentence
    centre = input.wordix
    last = len(sentence) - 1

    # Only the in-bounds part of the window is sliced and stemmed.
    first_ix = max(0, centre - offset)
    final_ix = min(last, centre + offset)
    window = list(map(stem, list(sentence[first_ix:final_ix + 1])))

    if centre < offset:
        # Window starts before the sentence: prepend the missing markers.
        window[:0] = [__START__] * (offset - centre)
    if centre + offset > last:
        # Window ends after the sentence: append the missing markers.
        window.extend([__END__] * (centre + offset - last))

    size_label = str(ngram_size)
    return {
        "POS_" + size_label + "GRAMS:" + str(i - offset) + "->" + ",".join(ngram): val
        for i, ngram in enumerate(compute_ngrams(window, ngram_size, ngram_size))
    }
示例#11
0
    def extract_ngram_features(tokens, idx):
        """
            tokens  :   sequence of str
                            the sentence
            idx     :   int
                            index of the word the window is centred on
            returns :   list of str
                            one feature name per n-gram in the window

            Reads offset, ngram_size, positional and stem_words from outside
            this function. The window covers `offset` words either side of idx
            and is padded with __START__ / __END__ where it leaves the sentence.
            Feature names are "[STEM_]POS_<n>_GRAMS:<pos>-><w1,w2,...>", where
            <pos> is the relative offset when `positional` is set, else "BOW".
        """
        feats = []
        end = len(tokens) - 1

        # fix to within bounds only
        start = max(0, idx - offset)
        stop = min(end, idx + offset)

        prefix = "STEM_" if stem_words else ""

        window = list(tokens[start:stop + 1])
        if stem_words:
            # Must be a real list: on Python 3 map() returns an iterator,
            # which has no insert()/append() for the padding below.
            window = [stem(word) for word in window]

        if idx < offset:
            window = [__START__] * (offset - idx) + window
        if idx + offset > end:
            window = window + [__END__] * (idx + offset - end)

        ngrams = compute_ngrams(window, ngram_size, ngram_size)
        str_num_ngrams = str(ngram_size)

        for i, offset_ngram in enumerate(ngrams):
            relative_offset = str(i - offset) if positional else "BOW"
            str_ngram = ",".join(offset_ngram)
            feats.append(prefix + "POS_" + str_num_ngrams + "_GRAMS:" +
                         relative_offset + "->" + str_ngram)
        return feats
    def extract_ngram_features(tokens, idx):
        """
            tokens  :   sequence of str
                            the sentence
            idx     :   int
                            index of the word the window is centred on
            returns :   list of str
                            one feature name per n-gram in the window

            Reads offset, ngram_size, positional and stem_words from outside
            this function. Builds the window of `offset` words either side of
            idx, padding with __START__ / __END__ past the sentence bounds, and
            names each n-gram "[STEM_]POS_<n>_GRAMS:<pos>-><w1,w2,...>" with
            <pos> the relative offset if `positional`, otherwise "BOW".
        """
        feats = []
        end = len(tokens) - 1

        # fix to within bounds only
        start = max(0, idx - offset)
        stop = min(end, idx + offset)

        prefix = ""
        if stem_words:
            prefix = "STEM_"

        window = list(tokens[start:stop + 1])
        if stem_words:
            # list(...) keeps the window mutable on Python 3, where a bare
            # map() is a lazy iterator without insert()/append().
            window = list(map(stem, window))

        if idx < offset:
            for _ in range(offset - idx):
                window.insert(0, __START__)
        if idx + offset > end:
            for _ in range(idx + offset - end):
                window.append(__END__)

        ngrams = compute_ngrams(window, ngram_size, ngram_size)
        str_num_ngrams = str(ngram_size)

        for i, offset_ngram in enumerate(ngrams):
            if positional:
                relative_offset = str(i - offset)
            else:
                relative_offset = "BOW"
            str_ngram = ",".join(offset_ngram)
            feats.append(prefix + "POS_" + str_num_ngrams + "_GRAMS:" + relative_offset + "->" + str_ngram)
        return feats
def extract_positional_bigram_features(window, mid_ix, feature_val = 1):
    """
        window      :   sequence of str
                            words in window
        mid_ix      :   int
                            index subtracted from each bi-gram's position
        feature_val :   Any
                            value stored for every feature
        returns     :   dict
                            dict[str]:val

        Maps "BI:<position - mid_ix> <first> | <second>" to feature_val for
        every bi-gram of the window.
    """
    features = {}
    pairs = compute_ngrams(window, max_len=2, min_len=2)
    for position, pair in enumerate(pairs):
        relative_position = str(position - mid_ix)
        features["BI:" + relative_position + " " + pair[0] + " | " + pair[1]] = feature_val
    return features
示例#14
0
 def extract(self, words: List[str]) -> List[str]:
     """Return lower-cased, "--"-joined n-grams of the stemmed words.

     Each word is passed through self.stem; n-grams of length 1 up to
     self.max_ngram_len are then requested from compute_ngrams.
     """
     stems = list(map(self.stem, words))
     features = []  # type: List[str]
     for ngram in compute_ngrams(tokens=stems,
                                 max_len=self.max_ngram_len,
                                 min_len=1):
         features.append("--".join(ngram).lower())
     return features
示例#15
0
 def extract(self, words: List[str]) -> List[str]:
     """Return lower-cased, "--"-joined n-grams of the given words.

     N-grams of length 1 up to self.max_ngram_len are requested from
     compute_ngrams.
     """
     features = []  # type: List[str]
     for ngram in compute_ngrams(tokens=words,
                                 max_len=self.max_ngram_len,
                                 min_len=1):
         joined = "--".join(ngram)
         features.append(joined.lower())
     return features
    def get_conditional_feats(self, action_history, action_tag_pair_history, tos, buffer, previous_tags,
                              subsequent_tags):
        """
            action_history          :   sequence
                                            actions taken so far; the last element is
                                            treated as the most recent one
            action_tag_pair_history :   sequence of (action, tos, buffer)
                                            earlier states; last element is most recent
            tos                     :   str
                                            current tos value (presumably "top of
                                            stack" - confirm with caller)
            buffer                  :   str
                                            current buffer value
            previous_tags           :   sequence
                                            tags before the current one, last is nearest
            subsequent_tags         :   sequence
                                            tags after the current one, first is nearest
            returns                 :   dict
                                            feature name -> value

            Indicator features carry self.positive_val; the "num_*" features
            carry raw counts.
        """
        positive = self.positive_val
        feats = {}
        if len(action_history) == 0:
            feats["first_action"] = positive
        if len(subsequent_tags) == 0:
            # Indicator feature, so it uses the same value as all the others.
            feats["last_tag"] = positive

        feats["num_actions"] = len(action_history)
        feats["num_prev_tags"] = len(previous_tags)
        feats["num_subsequent_tags"] = len(subsequent_tags)

        feats["num_tags"] = 1 + len(previous_tags) + len(subsequent_tags)

        feats["tos:" + tos] = positive
        feats["buffer:" + buffer] = positive
        feats["tos_buffer:" + tos + "|" + buffer] = positive
        # Order-insensitive pairing of the two values.
        feats["tos_buffer_combo:" + ",".join(sorted([tos, buffer]))] = positive

        ### PREVIOUS TAGS
        # i counts backwards from the nearest previous tag (i == 0).
        for i, tag in enumerate(previous_tags[::-1]):
            feats["prev_tag-{i}:{tag}".format(i=i, tag=tag)] = positive
            feats["prev_tag:{tag}".format(tag=tag)] = positive

        if len(previous_tags) > 0:
            prev_tag = previous_tags[-1]
            feats["prev-tag-tos-buffer:{tag}_{tos}_{buffer}".format(tag=prev_tag, tos=tos,
                                                                    buffer=buffer)] = positive
            feats["prev-tag-buffer:{tag}_{buffer}".format(tag=prev_tag, buffer=buffer)] = positive
            feats["prev-tag-tos:{tag}_{tos}".format(tag=prev_tag, tos=tos)] = positive
            bigrams = compute_ngrams(previous_tags, 2, 2)
            for i, bigram in enumerate(bigrams[::-1]):
                feats["prev_bigram-tag-{i}:{tag}".format(i=i, tag=str(bigram))] = positive
                feats["prev_bigram-tag:{tag}".format(tag=str(bigram))] = positive

        ### REMAINING TAGS
        for i, tag in enumerate(subsequent_tags):
            feats["subseq_tag-{i}:{tag}".format(i=i, tag=tag)] = positive
            feats["subseq_tag:{tag}".format(tag=tag)] = positive

        if len(subsequent_tags) > 0:
            next_tag = subsequent_tags[0]
            # The template now includes {tos}; without it this feature only
            # repeated the information in "subseq-tag-buffer".
            feats["subseq-tag-tos-buffer:{tag}_{tos}_{buffer}".format(tag=next_tag, tos=tos,
                                                                      buffer=buffer)] = positive
            feats["subseq-tag-buffer:{tag}_{buffer}".format(tag=next_tag, buffer=buffer)] = positive
            feats["subseq-tag-tos:{tag}_{tos}".format(tag=next_tag, tos=tos)] = positive
            bigrams = compute_ngrams(subsequent_tags, 2, 2)
            for i, bigram in enumerate(bigrams):
                feats["subseq_bigram-tag-{i}:{tag}".format(i=i, tag=str(bigram))] = positive
                feats["subseq_bigram-tag:{tag}".format(tag=str(bigram))] = positive

        # features for each previous action
        action_tally = defaultdict(int)
        for i, action in enumerate(action_history[::-1]):
            feats["action-{i}:{action}".format(i=i, action=action)] = positive
            feats["action:{action}".format(action=action)] = positive
            action_tally[action] += 1

        # Features for the number of times each action has been performed
        for action, count in action_tally.items():
            feats["action-tally:{action}_{count}".format(action=action, count=count)] = positive

        if len(action_history) > 0:
            prev_action = action_history[-1]
            feats["prev_action-tos-buffer:{action}_{tos}_{buffer}".format(action=prev_action, tos=tos,
                                                                          buffer=buffer)] = positive
            feats["prev_action-buffer:{action}_{buffer}".format(action=prev_action,
                                                                buffer=buffer)] = positive
            feats["prev_action-tos:{action}_{tos}".format(action=prev_action, tos=tos)] = positive
            bigrams = compute_ngrams(action_history, 2, 2)
            for i, bigram in enumerate(bigrams[::-1]):
                feats["prev_bigram_action-{i}:{tag}".format(i=i, tag=str(bigram))] = positive
                feats["prev_bigram_action:{tag}".format(tag=str(bigram))] = positive

        # One positional and one position-free feature per earlier state.
        for i, (action, prev_tos, prev_buffer) in enumerate(action_tag_pair_history[::-1]):
            feats["actiontag-{i}:{action}_{tos}_{buffer}".format(i=i, action=action, tos=prev_tos,
                                                                 buffer=prev_buffer)] = positive
            feats["actiontag:{action}_{tos}_{buffer}".format(action=action, tos=prev_tos,
                                                             buffer=prev_buffer)] = positive

            feats["actiontos-{i}:{action}_{tos}".format(i=i, action=action, tos=prev_tos)] = positive
            feats["actiontos:{action}_{tos}".format(action=action, tos=prev_tos)] = positive

            feats["actionbuffer-{i}:{action}_{buffer}".format(i=i, action=action,
                                                              buffer=prev_buffer)] = positive
            feats["actionbuffer:{action}_{buffer}".format(action=action, buffer=prev_buffer)] = positive

        if len(action_tag_pair_history) > 0:
            # Combine the most recent earlier state with the current tos/buffer.
            action, prev_tos, prev_buffer = action_tag_pair_history[-1]
            feats[
                "prev_actiontag_tos_buffer_currnet_tos_current_buffer:{action}_{prev_tos}_{prev_buffer}_{tos}_{buffer}".format(
                    action=action, prev_tos=prev_tos, prev_buffer=prev_buffer, tos=tos,
                    buffer=buffer)] = positive
            feats["prev_actiontag_tos_buffer_current_buffer:{action}_{prev_tos}_{prev_buffer}_{buffer}".format(
                action=action, prev_tos=prev_tos, prev_buffer=prev_buffer, buffer=buffer)] = positive
            feats["prev_actiontag_tos_buffer_current_tos:{action}_{prev_tos}_{prev_buffer}_{tos}".format(
                action=action, prev_tos=prev_tos, prev_buffer=prev_buffer, tos=tos)] = positive

        return feats
    def get_conditional_feats(self, action_history, action_tag_pair_history, tos, buffer, previous_tags,
                              subsequent_tags):
        """
            action_history          :   sequence
                                            actions taken so far; the last element is
                                            treated as the most recent one
            action_tag_pair_history :   sequence of (action, tos, buffer)
                                            earlier states; last element is most recent
            tos                     :   str
                                            current tos value (presumably "top of
                                            stack" - confirm with caller)
            buffer                  :   str
                                            current buffer value
            previous_tags           :   sequence
                                            tags before the current one, last is nearest
            subsequent_tags         :   sequence
                                            tags after the current one, first is nearest
            returns                 :   dict
                                            feature name -> value

            Indicator features carry self.positive_val; the "num_*" features
            carry raw counts.
        """
        positive = self.positive_val
        feats = {}
        if len(action_history) == 0:
            feats["first_action"] = positive
        if len(subsequent_tags) == 0:
            # Indicator feature, so it uses the same value as all the others.
            feats["last_tag"] = positive

        feats["num_actions"] = len(action_history)
        feats["num_prev_tags"] = len(previous_tags)
        feats["num_subsequent_tags"] = len(subsequent_tags)

        feats["num_tags"] = 1 + len(previous_tags) + len(subsequent_tags)

        feats["tos:" + tos] = positive
        feats["buffer:" + buffer] = positive
        feats["tos_buffer:" + tos + "|" + buffer] = positive
        # Order-insensitive pairing of the two values.
        feats["tos_buffer_combo:" + ",".join(sorted([tos, buffer]))] = positive

        ### PREVIOUS TAGS
        # i counts backwards from the nearest previous tag (i == 0).
        for i, tag in enumerate(previous_tags[::-1]):
            feats["prev_tag-{i}:{tag}".format(i=i, tag=tag)] = positive
            feats["prev_tag:{tag}".format(tag=tag)] = positive

        if len(previous_tags) > 0:
            prev_tag = previous_tags[-1]
            feats["prev-tag-tos-buffer:{tag}_{tos}_{buffer}".format(tag=prev_tag, tos=tos,
                                                                    buffer=buffer)] = positive
            feats["prev-tag-buffer:{tag}_{buffer}".format(tag=prev_tag, buffer=buffer)] = positive
            feats["prev-tag-tos:{tag}_{tos}".format(tag=prev_tag, tos=tos)] = positive
            bigrams = compute_ngrams(previous_tags, 2, 2)
            for i, bigram in enumerate(bigrams[::-1]):
                feats["prev_bigram-tag-{i}:{tag}".format(i=i, tag=str(bigram))] = positive
                feats["prev_bigram-tag:{tag}".format(tag=str(bigram))] = positive

        ### REMAINING TAGS
        for i, tag in enumerate(subsequent_tags):
            feats["subseq_tag-{i}:{tag}".format(i=i, tag=tag)] = positive
            feats["subseq_tag:{tag}".format(tag=tag)] = positive

        if len(subsequent_tags) > 0:
            next_tag = subsequent_tags[0]
            # The template now includes {tos}; without it this feature only
            # repeated the information in "subseq-tag-buffer".
            feats["subseq-tag-tos-buffer:{tag}_{tos}_{buffer}".format(tag=next_tag, tos=tos,
                                                                      buffer=buffer)] = positive
            feats["subseq-tag-buffer:{tag}_{buffer}".format(tag=next_tag, buffer=buffer)] = positive
            feats["subseq-tag-tos:{tag}_{tos}".format(tag=next_tag, tos=tos)] = positive
            bigrams = compute_ngrams(subsequent_tags, 2, 2)
            for i, bigram in enumerate(bigrams):
                feats["subseq_bigram-tag-{i}:{tag}".format(i=i, tag=str(bigram))] = positive
                feats["subseq_bigram-tag:{tag}".format(tag=str(bigram))] = positive

        # features for each previous action
        action_tally = defaultdict(int)
        for i, action in enumerate(action_history[::-1]):
            feats["action-{i}:{action}".format(i=i, action=action)] = positive
            feats["action:{action}".format(action=action)] = positive
            action_tally[action] += 1

        # Features for the number of times each action has been performed
        for action, count in action_tally.items():
            feats["action-tally:{action}_{count}".format(action=action, count=count)] = positive

        if len(action_history) > 0:
            prev_action = action_history[-1]
            feats["prev_action-tos-buffer:{action}_{tos}_{buffer}".format(action=prev_action, tos=tos,
                                                                          buffer=buffer)] = positive
            feats["prev_action-buffer:{action}_{buffer}".format(action=prev_action,
                                                                buffer=buffer)] = positive
            feats["prev_action-tos:{action}_{tos}".format(action=prev_action, tos=tos)] = positive
            bigrams = compute_ngrams(action_history, 2, 2)
            for i, bigram in enumerate(bigrams[::-1]):
                feats["prev_bigram_action-{i}:{tag}".format(i=i, tag=str(bigram))] = positive
                feats["prev_bigram_action:{tag}".format(tag=str(bigram))] = positive

        # One positional and one position-free feature per earlier state.
        for i, (action, prev_tos, prev_buffer) in enumerate(action_tag_pair_history[::-1]):
            feats["actiontag-{i}:{action}_{tos}_{buffer}".format(i=i, action=action, tos=prev_tos,
                                                                 buffer=prev_buffer)] = positive
            feats["actiontag:{action}_{tos}_{buffer}".format(action=action, tos=prev_tos,
                                                             buffer=prev_buffer)] = positive

            feats["actiontos-{i}:{action}_{tos}".format(i=i, action=action, tos=prev_tos)] = positive
            feats["actiontos:{action}_{tos}".format(action=action, tos=prev_tos)] = positive

            feats["actionbuffer-{i}:{action}_{buffer}".format(i=i, action=action,
                                                              buffer=prev_buffer)] = positive
            feats["actionbuffer:{action}_{buffer}".format(action=action, buffer=prev_buffer)] = positive

        if len(action_tag_pair_history) > 0:
            # Combine the most recent earlier state with the current tos/buffer.
            action, prev_tos, prev_buffer = action_tag_pair_history[-1]
            feats[
                "prev_actiontag_tos_buffer_currnet_tos_current_buffer:{action}_{prev_tos}_{prev_buffer}_{tos}_{buffer}".format(
                    action=action, prev_tos=prev_tos, prev_buffer=prev_buffer, tos=tos,
                    buffer=buffer)] = positive
            feats["prev_actiontag_tos_buffer_current_buffer:{action}_{prev_tos}_{prev_buffer}_{buffer}".format(
                action=action, prev_tos=prev_tos, prev_buffer=prev_buffer, buffer=buffer)] = positive
            feats["prev_actiontag_tos_buffer_current_tos:{action}_{prev_tos}_{prev_buffer}_{tos}".format(
                action=action, prev_tos=prev_tos, prev_buffer=prev_buffer, tos=tos)] = positive

        return feats