Пример #1
0
 def __init__(self, k=1):
     """Set up the phrase detector, tokenizer and word2vec model.

     k -- how many expanded query variants to produce (default 1).
     """
     # Expansion parameters: similarity threshold and variant count.
     self.p = 0.8
     self.k = k
     # Multi-word phrase detection via pointwise mutual information.
     self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
     # Word2vec model trained on the phrase-merged sentence stream.
     self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
     # Raw tokenizer used to split incoming query strings.
     self.tokenizer = RawTokenizer()
Пример #2
0
 def __init__(self, alpha=6.0):
     """Set up phrase detection, tokenization, word2vec and tf-idf models.

     alpha -- idf threshold above which a phrase is considered for expansion.
     """
     # Expansion parameters.
     self.alpha = alpha
     self.k = 3
     self.p = 0.80
     # Multi-word phrase detection via pointwise mutual information.
     self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
     # Word2vec model over the phrase-merged sentence stream.
     self.w2v = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
     # Tf-idf model used to weigh candidate phrases.
     self.tfidf = TFIDFmodel()
     # Raw tokenizer used to split incoming query strings.
     self.tokenizer = RawTokenizer()
Пример #3
0
class TermWindowW2VExpansion(QueryExpansion):
    """Query expansion using word2vec neighbours of a sliding window of
    adjacent phrases (previous, current, next)."""

    def __init__(self, k=1):
        """k -- number of expanded query variants to generate."""
        # phrase detector (PMI-based, over the raw sentence stream)
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
        # tokenizer for incoming query strings
        self.tokenizer = RawTokenizer()
        # word2vec model over the phrase-merged sentence stream
        self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
        # parameters: similarity threshold and variant count
        self.p = 0.8
        self.k = k

    def expand(self, query):
        """Return `query` plus self.k expanded variants, '.'-separated.

        Variant r collects, for every query phrase, the r-th term most
        similar to the window of adjacent phrases around it.
        """
        phrases = self.phrase_detector.detect(self.tokenizer.tokenize(query.lower()))
        # keep only phrases present in the word2vec vocabulary
        w2v_phrases = [phrase for phrase in phrases if phrase in self.model]
        translated_queries = [[] for _ in range(self.k)]
        last = len(w2v_phrases) - 1
        for idx, phrase in enumerate(w2v_phrases):
            # window of up to three adjacent in-vocabulary phrases
            prev_phrase = w2v_phrases[idx - 1] if idx != 0 else u""
            next_phrase = w2v_phrases[idx + 1] if idx != last else u""
            window = [e for e in (prev_phrase, phrase, next_phrase) if len(e) > 0]
            similar_phrases = self.model.inner_model.most_similar(window, topn=self.k)
            # The original indexed [i - 1] on BOTH sides, which is the same
            # rank->list mapping as [i][i]; enumerate makes that explicit and
            # also tolerates most_similar returning fewer than k items.
            for rank, (term, _score) in enumerate(similar_phrases):
                translated_queries[rank].append(term)
        query_strings = [" ".join(q) for q in translated_queries]

        # original query first, then each expansion, '.'-separated
        combined_query = query + "." + ".".join(query_strings)
        return combined_query

    def __str__(self):
        return self.__class__.__name__
Пример #4
0
class TermwiseW2VExpansion(QueryExpansion):
    """Query expansion translating each phrase independently to its k most
    similar word2vec terms (cosmul similarity), one variant per rank."""

    def __init__(self, k=1):
        """k -- number of expanded query variants to generate."""
        # phrase detector (PMI-based, over the raw sentence stream)
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
        # tokenizer for incoming query strings
        self.tokenizer = RawTokenizer()
        # word2vec model over the phrase-merged sentence stream
        self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
        # parameters: similarity threshold and variant count
        self.p = 0.7
        self.k = k

    def expand(self, query):
        """Return self.k translated queries joined by '.' (the original
        query is NOT included, unlike TermWindowW2VExpansion)."""
        phrases = self.phrase_detector.detect(self.tokenizer.tokenize(query.lower()))
        # keep only phrases present in the word2vec vocabulary
        w2v_phrases = [phrase for phrase in phrases if phrase in self.model]
        translated_queries = [[] for _ in range(self.k)]
        for phrase in w2v_phrases:
            similar_phrases = self.model.inner_model.most_similar_cosmul(phrase, topn=self.k)
            # Original indexed [i - 1] on both sides — an identity mapping
            # written confusingly; enumerate is equivalent and also tolerates
            # most_similar_cosmul returning fewer than k items.
            for rank, (term, _score) in enumerate(similar_phrases):
                translated_queries[rank].append(term)
        # renamed loop variable: the original shadowed the `query` parameter
        query_strings = [" ".join(terms) for terms in translated_queries]
        combined_query = ".".join(query_strings)
        return combined_query

    def __str__(self):
        return self.__class__.__name__
Пример #5
0
class WeightedW2VExpansion(QueryExpansion):
    """Query expansion that adds word2vec neighbours only for phrases whose
    idf exceeds `alpha` — i.e. only rare (informative) phrases are expanded."""

    def __init__(self, alpha=6.0):
        """alpha -- idf threshold above which a phrase gets expanded."""
        # phrase detector (PMI-based, over the raw sentence stream)
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
        # tokenizer for incoming query strings
        self.tokenizer = RawTokenizer()
        # word2vec model over the phrase-merged sentence stream
        self.w2v = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
        # tf-idf model supplying the idf weights
        self.tfidf = TFIDFmodel()
        # parameters
        self.alpha = alpha
        self.k = 3
        self.p = 0.80

    def expand(self, query):
        """Return `query` followed by the expansion terms, space-separated."""
        phrases = self.phrase_detector.detect(self.tokenizer.tokenize(query.lower()))
        # keep only phrases present in the word2vec vocabulary
        w2v_phrases = [phrase for phrase in phrases if phrase in self.w2v]
        extra_terms = []
        # hoist loop-invariant attribute lookups
        token2id = self.tfidf.dictionary.token2id
        idfs = self.tfidf.inner_model.idfs
        for phrase in w2v_phrases:
            # single .get() instead of membership test + second lookup
            token_id = token2id.get(phrase)
            idf = idfs[token_id] if token_id is not None else 0.0
            if idf > self.alpha:
                expansion = self.w2v.inner_model.most_similar_cosmul(
                    positive=[phrase], topn=self.k)
                extra_terms.extend(term for term, _score in expansion)
        new_query = query + " " + " ".join(extra_terms)
        return new_query

    def __str__(self):
        return self.__class__.__name__
Пример #6
0
class TermwiseW2VExpansion(QueryExpansion):
    """Query expansion translating each phrase independently to its k most
    similar word2vec terms (cosmul similarity), one variant per rank."""

    def __init__(self, k=1):
        """k -- number of expanded query variants to generate."""
        # phrase detector (PMI-based, over the raw sentence stream)
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
        # tokenizer for incoming query strings
        self.tokenizer = RawTokenizer()
        # word2vec model over the phrase-merged sentence stream
        self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
        # parameters: similarity threshold and variant count
        self.p = 0.7
        self.k = k

    def expand(self, query):
        """Return self.k translated queries joined by '.' (the original
        query string itself is not included in the result)."""
        phrases = self.phrase_detector.detect(
            self.tokenizer.tokenize(query.lower()))
        # keep only phrases present in the word2vec vocabulary
        w2v_phrases = [phrase for phrase in phrases if phrase in self.model]
        translated_queries = [[] for _ in range(self.k)]
        for phrase in w2v_phrases:
            similar_phrases = self.model.inner_model.most_similar_cosmul(
                phrase, topn=self.k)
            # Original indexed [i - 1] on both sides — equivalent to [i][i];
            # enumerate states the rank->variant mapping plainly and also
            # tolerates fewer than k returned neighbours.
            for rank, (term, _score) in enumerate(similar_phrases):
                translated_queries[rank].append(term)
        # renamed loop variable: the original shadowed the `query` parameter
        query_strings = [" ".join(terms) for terms in translated_queries]
        combined_query = ".".join(query_strings)
        return combined_query

    def __str__(self):
        return self.__class__.__name__
Пример #7
0
class AverageW2VExpansion(QueryExpansion):
    """Query expansion using neighbours of the averaged word2vec vector of
    all in-vocabulary query phrases."""

    def __init__(self, p=0.7):
        """p -- minimum similarity a neighbour must reach to be added."""
        # phrase detector (PMI-based, over the raw sentence stream)
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
        # tokenizer for incoming query strings
        self.tokenizer = RawTokenizer()
        # word2vec model over the phrase-merged sentence stream
        self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
        # parameters: similarity threshold and neighbour count
        self.p = p
        self.n = 10

    def expand(self, query):
        """Return `query` plus sufficiently similar neighbour terms."""
        phrases = self.phrase_detector.detect(
            self.tokenizer.tokenize(query.lower()))
        # keep only phrases present in the word2vec vocabulary
        w2v_phrases = [phrase for phrase in phrases if phrase in self.model]
        # guard: most_similar raises on an empty positive list, which the
        # original did not handle — return the query unexpanded instead
        if not w2v_phrases:
            return query
        similar_phrases = self.model.inner_model.most_similar(w2v_phrases, [],
                                                              topn=self.n)
        # keep neighbours above the similarity threshold; un-merge phrases
        extra_terms = " ".join(
            term.replace('_', ' ')
            for term, score in similar_phrases
            if score > self.p)
        return "%s %s" % (query, extra_terms)

    def __str__(self):
        return self.__class__.__name__
Пример #8
0
class WeightedW2VExpansion(QueryExpansion):
    """Query expansion that adds word2vec neighbours only for phrases whose
    idf exceeds `alpha` — rare (informative) phrases only."""

    def __init__(self, alpha=6.0):
        """alpha -- idf threshold above which a phrase gets expanded."""
        # phrase detector (PMI-based, over the raw sentence stream)
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
        # tokenizer for incoming query strings
        self.tokenizer = RawTokenizer()
        # word2vec model over the phrase-merged sentence stream
        self.w2v = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
        # tf-idf model supplying the idf weights
        self.tfidf = TFIDFmodel()
        # parameters
        self.alpha = alpha
        self.k = 3
        self.p = 0.80

    def expand(self, query):
        """Return `query` followed by the expansion terms, space-separated."""
        phrases = self.phrase_detector.detect(
            self.tokenizer.tokenize(query.lower()))
        # keep only phrases present in the word2vec vocabulary
        w2v_phrases = [phrase for phrase in phrases if phrase in self.w2v]
        extra_terms = []
        # hoist loop-invariant attribute lookups out of the loop
        token2id = self.tfidf.dictionary.token2id
        idfs = self.tfidf.inner_model.idfs
        for phrase in w2v_phrases:
            # single .get() instead of membership test + second lookup
            token_id = token2id.get(phrase)
            idf = idfs[token_id] if token_id is not None else 0.0
            if idf > self.alpha:
                expansion = self.w2v.inner_model.most_similar_cosmul(
                    positive=[phrase], topn=self.k)
                extra_terms.extend(term for term, _score in expansion)
        new_query = query + " " + " ".join(extra_terms)
        return new_query

    def __str__(self):
        return self.__class__.__name__
Пример #9
0
 def __init__(self, k=1):
     """Initialize the expansion pipeline.

     k -- number of expanded query variants to generate (default 1).
     """
     # Expansion parameters first: threshold and variant count.
     self.p = 0.8
     self.k = k
     # PMI-based multi-word phrase detection.
     self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
     # Word2vec model built from the phrase-merged sentence stream.
     self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
     # Tokenizer applied to incoming query strings.
     self.tokenizer = RawTokenizer()
Пример #10
0
 def __init__(self, alpha=6.0):
     """Initialize the weighted expansion pipeline.

     alpha -- idf threshold above which a phrase is considered rare
     enough to expand.
     """
     # Expansion parameters first.
     self.alpha = alpha
     self.k = 3
     self.p = 0.80
     # PMI-based multi-word phrase detection.
     self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
     # Word2vec model built from the phrase-merged sentence stream.
     self.w2v = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
     # Tf-idf model providing the idf weights.
     self.tfidf = TFIDFmodel()
     # Tokenizer applied to incoming query strings.
     self.tokenizer = RawTokenizer()
Пример #11
0
class AverageW2VExpansion(QueryExpansion):
    """Query expansion using neighbours of the averaged word2vec vector of
    all in-vocabulary query phrases."""

    def __init__(self, p=0.7):
        """p -- minimum similarity a neighbour must reach to be added."""
        # phrase detector (PMI-based, over the raw sentence stream)
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
        # tokenizer for incoming query strings
        self.tokenizer = RawTokenizer()
        # word2vec model over the phrase-merged sentence stream
        self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
        # parameters: similarity threshold and neighbour count
        self.p = p
        self.n = 10

    def expand(self, query):
        """Return `query` plus sufficiently similar neighbour terms."""
        phrases = self.phrase_detector.detect(self.tokenizer.tokenize(query.lower()))
        # keep only phrases present in the word2vec vocabulary
        w2v_phrases = [phrase for phrase in phrases if phrase in self.model]
        # guard: most_similar raises on an empty positive list, which the
        # original did not handle — return the query unexpanded instead
        if not w2v_phrases:
            return query
        similar_phrases = self.model.inner_model.most_similar(w2v_phrases, [], topn=self.n)
        # keep neighbours above the similarity threshold; un-merge phrases
        extra_terms = " ".join(term.replace('_', ' ')
                               for term, score in similar_phrases
                               if score > self.p)
        return "%s %s" % (query, extra_terms)

    def __str__(self):
        return self.__class__.__name__
Пример #12
0
class TermWindowW2VExpansion(QueryExpansion):
    """Query expansion using word2vec neighbours of a sliding window of
    adjacent phrases (previous, current, next)."""

    def __init__(self, k=1):
        """k -- number of expanded query variants to generate."""
        # phrase detector (PMI-based, over the raw sentence stream)
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
        # tokenizer for incoming query strings
        self.tokenizer = RawTokenizer()
        # word2vec model over the phrase-merged sentence stream
        self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
        # parameters: similarity threshold and variant count
        self.p = 0.8
        self.k = k

    def expand(self, query):
        """Return `query` plus self.k expanded variants, '.'-separated.

        Variant r collects, for every query phrase, the r-th term most
        similar to the window of adjacent phrases around it.
        """
        phrases = self.phrase_detector.detect(
            self.tokenizer.tokenize(query.lower()))
        # keep only phrases present in the word2vec vocabulary
        w2v_phrases = [phrase for phrase in phrases if phrase in self.model]
        translated_queries = [[] for _ in range(self.k)]
        last = len(w2v_phrases) - 1
        for idx, phrase in enumerate(w2v_phrases):
            # window of up to three adjacent in-vocabulary phrases
            prev_phrase = w2v_phrases[idx - 1] if idx != 0 else u""
            next_phrase = w2v_phrases[idx + 1] if idx != last else u""
            window = [e for e in (prev_phrase, phrase, next_phrase)
                      if len(e) > 0]
            similar_phrases = self.model.inner_model.most_similar(window,
                                                                  topn=self.k)
            # Original indexed [i - 1] on BOTH sides — the same rank->list
            # mapping as [i][i]; enumerate makes that explicit and tolerates
            # most_similar returning fewer than k items.
            for rank, (term, _score) in enumerate(similar_phrases):
                translated_queries[rank].append(term)
        query_strings = [" ".join(q) for q in translated_queries]

        # original query first, then each expansion, '.'-separated
        combined_query = query + "." + ".".join(query_strings)
        return combined_query

    def __str__(self):
        return self.__class__.__name__