Example #1
from dawg import RecordDAWG


def load_dict(path):
    format = ">2I"
    try:
        d = RecordDAWG(format)
        d.load(path)
        return d
    except Exception as e:
        print("load dict error:", e)
        return None
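
For context, the dictionary file that load_dict reads would have been built and saved with the same record format. A minimal sketch using the dawg package (the keys, values and path below are made up for illustration):

from dawg import RecordDAWG

data = [(u'beijing', (86, 10)), (u'shanghai', (86, 21))]  # key -> two unsigned ints
RecordDAWG(">2I", data).save('/tmp/area_codes.dawg')

d = load_dict('/tmp/area_codes.dawg')
print(d[u'beijing'])  # [(86, 10)]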
Example #2
from dawg import RecordDAWG


class AreaCode(object):
    def __init__(self, dict_path):
        self.dict_path = dict_path
        self.format = ">2I"
        try:
            self.dict = RecordDAWG(self.format)
            self.dict.load(dict_path)
        except Exception as e:
            print("load dict error:", dict_path, e)
Example #3
import csv
import heapq
import math
import pickle

import nltk
import numpy
import Stemmer
from dawg import RecordDAWG

# Comment, Huffman, Report, binary_read_line_generator, build_query_tree and
# posting_list_separator are assumed to be provided by this project's own
# modules.


class SearchEngine:
    def __init__(self):
        self.seek_list = None
        self.comment_file = None
        self.index_file = None
        self.symbol_to_encoding_dict = None
        self.cids = None
        self.comment_offsets_cid = None
        self.comment_offsets = None
        self.comment_term_counts = None
        self.comment_csv_reader = None
        self.authors_list = None
        self.articles_list = None
        self.reply_to_index = None
        self.collection_term_count = 0
        self.stemmer = Stemmer.Stemmer('english')
        self.tokenizer = nltk.tokenize.ToktokTokenizer()
        self.report = Report()

    def load_index(self, directory):
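        # seek_list maps each stem to (offset, size): the byte offset and
        # length of its compressed posting list inside compressed_index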
        self.seek_list = RecordDAWG('>QQ')
        self.seek_list.load(f'{directory}/compressed_seek_list.dawg')
        self.index_file = open(f'{directory}/compressed_index', mode='rb')
        with open(f'{directory}/symbol_to_encoding_dict.pickle',
                  mode='rb') as f:
            self.symbol_to_encoding_dict = pickle.load(f)
        self.comment_offsets = numpy.load(
            f'{directory}/comment_offsets.npy', mmap_mode=None)
        self.comment_term_counts = numpy.load(
            f'{directory}/comment_term_counts.npy', mmap_mode=None)
        with open(f'{directory}/collection_term_count.pickle', mode='rb') as f:
            self.collection_term_count = pickle.load(f)
        self.comment_file = open(f'{directory}/comments.csv', mode='rb')
        self.comment_csv_reader = csv.reader(
            binary_read_line_generator(self.comment_file))
        with open(f'{directory}/authors_list.pickle', mode='rb') as f:
            self.authors_list = pickle.load(f)
        with open(f'{directory}/articles_list.pickle', mode='rb') as f:
            self.articles_list = pickle.load(f)
        with open(f'{directory}/reply_to_index.pickle', mode='rb') as f:
            self.reply_to_index = pickle.load(f)
        self.cids = numpy.load(f'{directory}/cids.npy', mmap_mode='r')
        self.comment_offsets_cid = numpy.load(
            f'{directory}/comment_offsets_cid.npy', mmap_mode='r')

    def load_posting_list_parts(self, stem):
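        # decoded posting list layout, as consumed by get_offsets_for_stem and
        # get_dirichlet_smoothed_score:
        #   parts[1]  -> collection frequency of the stem
        #   parts[2:] -> one entry per comment, starting with that comment's
        #                byte offset, followed by one value per occurrence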
        offset, size = self.seek_list[stem][0]
        self.index_file.seek(offset)
        binary_data = self.index_file.read(size)
        decoded_posting_list = Huffman.decode(
            binary_data, self.symbol_to_encoding_dict)
        return [stem] + decoded_posting_list.split(posting_list_separator)

    def get_comment_term_count(self, comment_offset):
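        # comment_offsets is assumed sorted and to contain comment_offset, so
        # searchsorted yields its index into the parallel comment_term_counts
        # array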
        return self.comment_term_counts[numpy.searchsorted(
            self.comment_offsets, comment_offset)]

    def get_cid_to_offset(self, cid):
        return self.comment_offsets_cid[numpy.searchsorted(self.cids, cid)]

    # returns scores based on a query-likelihood language model with
    # Dirichlet smoothing
    # query_terms: list of query terms, stemmed and filtered
    # comment_offsets: list of offsets of comments into the comment file
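    # each candidate comment D accumulates, per query term q:
    #     log((f(q, D) + mu * cf(q) / |C|) / (|D| + mu))
    # where f(q, D) is the term frequency of q in D, cf(q) the collection
    # frequency of q, |C| the collection term count and |D| the comment's
    # term count.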
    def get_dirichlet_smoothed_score(self, query_terms, comment_offsets,
                                     mu=1500):
        ranked_comments = [[0, offset] for offset in comment_offsets]
        for query_term in query_terms:
            query_stem = self.stemmer.stemWord(query_term)
            if query_stem not in self.seek_list or \
                    self.seek_list[query_stem][0][1] > \
                    self.collection_term_count / 100:
                continue
            posting_list_parts = self.load_posting_list_parts(query_stem)
            query_term_count = int(posting_list_parts[1])
            comment_offsets_index = 0
            for comment_list in posting_list_parts[2:]:
                if comment_offsets_index >= len(comment_offsets):
                    break
                first_occurrence = int(comment_list.partition(',')[0])
                len_occurrences = comment_list.count(',') + 1
                while (comment_offsets_index < len(comment_offsets)
                        and first_occurrence >
                        comment_offsets[comment_offsets_index]):
                    # term not found -> 0 occurrences in comment
                    ranked_comments[comment_offsets_index][0] += math.log(
                        (mu * query_term_count / self.collection_term_count)
                        / (self.get_comment_term_count(comment_offsets[
                            comment_offsets_index]) + mu))
                    comment_offsets_index += 1

                if(comment_offsets_index < len(comment_offsets)
                        and first_occurrence ==
                        comment_offsets[comment_offsets_index]):
                    fD_query_term = len_occurrences - 1
                    ranked_comments[comment_offsets_index][0] += math.log(
                        (fD_query_term + (mu * query_term_count
                                          / self.collection_term_count))
                        / (self.get_comment_term_count(comment_offsets[
                            comment_offsets_index]) + mu))
                    comment_offsets_index += 1
            while comment_offsets_index < len(comment_offsets):
                # no matches found
                ranked_comments[comment_offsets_index][0] += math.log(
                    (mu * query_term_count / self.collection_term_count)
                    / (self.get_comment_term_count(comment_offsets[
                        comment_offsets_index]) + mu))
                comment_offsets_index += 1

        return ranked_comments

    # load comment from given offset into comment file
    def load_comment(self, offset):
        self.comment_file.seek(offset)
        comment_as_list = next(self.comment_csv_reader)
        comment = Comment()
        comment.cid = int(comment_as_list[0])
        # comment.article_url = self.articles_list[int(comment_as_list[1])]
        # comment.author = self.authors_list[int(comment_as_list[2])]
        comment.text = comment_as_list[3]
        # comment.timestamp = comment_as_list[4]
        # comment.parent_cid = int(comment_as_list[5]) \
        #    if comment_as_list[5] != '' else -1
        comment.upvotes = int(comment_as_list[6]) \
            if len(comment_as_list) >= 7 else 0
        comment.downvotes = int(comment_as_list[7]) \
            if len(comment_as_list) >= 8 else 0

        return comment

    def load_comment_from_cid(self, cid):
        return self.load_comment(self.get_cid_to_offset(cid))

    def load_cid_only(self, offset):
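        # read just enough bytes to recover the cid (the first CSV column)
        # without parsing the whole comment row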
        self.comment_file.seek(offset)
        csv_line_start = self.comment_file.read(8)
        comma_position = csv_line_start.find(b',')
        while comma_position == -1:
            csv_line_start += self.comment_file.read(8)
            comma_position = csv_line_start.find(b',')
        return csv_line_start[:comma_position].decode()

    # returns offsets into comment file for all comments containing stem in
    # ascending order
    def get_offsets_for_stem(self, stem):
        if stem not in self.seek_list:
            return []
        posting_list_parts = self.load_posting_list_parts(stem)
        return [int(x.partition(',')[0]) for x in posting_list_parts[2:]]

    def phrase_query(self, phrase, suffix=''):
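        # candidate generation: use the stem with the smallest posting list
        # (optionally intersected with the second smallest), then verify the
        # full phrase by substring search over the candidates' text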
        if phrase == '' and suffix != '':
            # suffix of the phrase now becomes prefix for a prefix query
            return self.prefix_query(suffix)

        if ' ' not in phrase:
            offsets = self.keyword_query(phrase)
        else:
            stem_offset_size_list = []  # may contain duplicates!
            for sentence in nltk.tokenize.sent_tokenize(phrase):
                for token in self.tokenizer.tokenize(sentence):
                    stem = self.stemmer.stemWord(token)
                    if stem not in self.seek_list:
                        continue
                    stem_offset_size_list.append((stem, self.seek_list[stem]))

            if len(stem_offset_size_list) == 0:
                return []

            # sort by posting_list size
            stem_offset_size_list.sort(key=lambda t: t[1][0][1])
            smallest_stem = stem_offset_size_list[0][0]
            second_smallest_stem = stem_offset_size_list[1][0] \
                if len(stem_offset_size_list) >= 2 and \
                stem_offset_size_list[1][1][0][1] < \
                self.collection_term_count / 100 else ''
            offsets = self.get_offsets_for_stem(smallest_stem)
            if second_smallest_stem != '':
                offsets = set(offsets)
                offsets.intersection_update(
                    self.get_offsets_for_stem(second_smallest_stem))

        result = []
        phrase_to_check = phrase if suffix == '' else f'{phrase} {suffix}'
        for offset in offsets:
            comment = self.load_comment(offset)
            if phrase_to_check in comment.text.lower():
                result.append(offset)
        return result

    def prefix_query(self, prefix):
        stems_with_prefix = self.seek_list.keys(prefix)
        result = []
        for stem in stems_with_prefix:
            result.extend(self.get_offsets_for_stem(stem))
        return result

    def keyword_query(self, keyword):
        return self.get_offsets_for_stem(
            self.stemmer.stemWord(keyword))

    def reply_to_query(self, target_cid):
        return [self.get_cid_to_offset(cid)
                for cid in self.reply_to_index.get(target_cid, ())]

    def basic_search(self, token_node):
        # search for a single query token

        if token_node.kind == 'phrase_prefix':  # phrase prefix query: 'hi ye'*
            return self.phrase_query(
                token_node.phrase_start, token_node.prefix)
        elif token_node.kind == 'phrase':  # phrase query: 'european union'
            return self.phrase_query(token_node.phrase)
        elif token_node.kind == 'prefix':  # prefix query: isra*
            return self.prefix_query(token_node.prefix)
        elif token_node.kind == 'reply_to':  # ReplyTo query: ReplyTo:12345
            return self.reply_to_query(token_node.target_cid)
        elif token_node.kind == 'keyword':  # keyword query: merkel
            return self.keyword_query(token_node.keyword)
        else:
            raise RuntimeError(f'unknown token_node.kind: {token_node.kind}')

    def print_comments(self, offset_iterable, printIdsOnly=True):
        if printIdsOnly:
            print(','.join((self.load_cid_only(offset)
                            for offset in offset_iterable)))
        else:
            for offset in offset_iterable:
                comment = self.load_comment(offset)
                print(f'{comment.cid},{comment.text}')

    def search(self, query, top_k=None, printIdsOnly=True):
        print(f'\nsearching for "{query}":')

        query_tree_root = build_query_tree(query)
        if query_tree_root.is_boolean_query:
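            # boolean queries are evaluated as an OR over AND groups: each
            # group intersects its children's results, subtracts results of
            # negated children, and the groups' results are unioned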
            or_result = set()
            with self.report.measure('searching'):
                for and_node in query_tree_root.children:
                    and_result = None
                    to_be_removed = []
                    for child in and_node.children:
                        child_result = self.basic_search(child)
                        if child.is_negated:
                            to_be_removed.append(child_result)
                        elif and_result is None:
                            and_result = set(child_result)
                        else:
                            and_result.intersection_update(child_result)
                    and_result.difference_update(*to_be_removed)
                    or_result.update(and_result)

            self.print_comments(or_result, printIdsOnly)
        else:  # non-boolean query
            with self.report.measure('searching'):
                children_results = (self.basic_search(child)
                                    for child in query_tree_root.children)
                comment_offsets = list(frozenset().union(*children_results))

            with self.report.measure('calculating scores'):
                # rated_comment is a tuple of (score, offset)
                rated_comments = self.get_dirichlet_smoothed_score(
                    query_tree_root.query_terms, comment_offsets)
                if top_k is not None and len(rated_comments) > top_k:
                    top_k_rated_comments = \
                        rated_comments[:top_k]
                    heapq.heapify(top_k_rated_comments)
                    for rated_comment in rated_comments[top_k:]:
                        heapq.heappushpop(top_k_rated_comments, rated_comment)
                    result = top_k_rated_comments
                else:
                    result = rated_comments

                result.sort(key=lambda x: x[0], reverse=True)

            self.print_comments(
                (offset for score, offset in result), printIdsOnly)
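
A minimal usage sketch (not part of the original example; the index directory and queries are hypothetical, and the index must already have been built by the project's indexing step):

engine = SearchEngine()
engine.load_index('data/index')
engine.search('merkel', top_k=10)                       # keyword query
engine.search("'european union'", printIdsOnly=False)   # phrase query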