def test_setitem():
    od = OrderedDict()
    od['alice'] = 0
    od['bob'] = 1
    od['carol'] = 2
    assert len(od) == 3
    od._check()
def test_delitem():
    od = OrderedDict(pairs)
    assert len(od) == 10
    for value in range(10):
        del od[value]
    assert len(od) == 0
    od._check()
def test_iloc():
    od = OrderedDict(enumerate(range(10)))
    for num in range(10):
        assert od.iloc[num] == num
    assert od.iloc[-1] == 9
    assert len(od.iloc) == 10
    od._check()
def __init__(self, topic_formatter=None, topic_filter=None, reducer=None):
    self.handlers = OrderedDict()  # {topic: handler}
    self.topic_formatter = topic_formatter
    self.topic_filter = topic_filter or (
        lambda topic, received_topic: topic == received_topic
    )
    self.reducer = reducer or delistify
def test_iloc():
    od = OrderedDict(enumerate(range(10)))
    iloc = od.keys()
    for num in range(10):
        assert iloc[num] == num
    assert iloc[-1] == 9
    assert len(iloc) == 10
    od._check()
def test_pop():
    od = OrderedDict(enumerate(range(10)))
    for num in range(10):
        assert od.pop(num) == num
    od._check()
    assert od.pop(0, 'thing') == 'thing'
    assert od.pop(1, default='thing') == 'thing'
    od._check()
def test_popitem():
    od = OrderedDict(enumerate(range(10)))
    for num in reversed(range(10)):
        key, value = od.popitem()
        assert num == key == value
    od._check()
    od = OrderedDict(enumerate(range(10)))
    for num in range(10):
        key, value = od.popitem(last=False)
        assert num == key == value
    od._check()
class TopicHandler(object):
    """
    Run handlers on events.

    Each event has a topic as its id, which is passed as an argument to the
    handler functions.
    """

    def __init__(self, topic_formatter=None, topic_filter=None, reducer=None):
        self.handlers = OrderedDict()  # {topic: handler}
        self.topic_formatter = topic_formatter
        self.topic_filter = topic_filter or (
            lambda topic, received_topic: topic == received_topic
        )
        self.reducer = reducer or delistify

    def add(self, topic, callback=None, formatter=None, filter=None,
            reducer=None):
        handler = Handler(reducer=reducer)
        handler.add(callback=callback, filter=filter, formatter=formatter)
        self.handlers.setdefault(topic, Handler()).add(callback=handler)

    def has_coroutine(self, topic):
        return self.handlers[topic].has_coroutine()

    def set_topic_formatter(self, formatter):
        self.topic_formatter = formatter

    def __call__(self, topic, *args, **kwargs):
        args = [topic] + list(args)
        if self.topic_formatter:
            topic = self.topic_formatter(topic)
        res = []
        for _topic, handler in self.handlers.items():
            if self.topic_filter(_topic, topic):
                res.append(handler(*args, **kwargs))
        return self.reducer(res)

    def call(self, topic, *args, **kwargs):
        return self.__call__(topic, *args, **kwargs)

    async def call_async(self, topic, *args, **kwargs):
        args = [topic] + list(args)
        if self.topic_formatter:
            topic = self.topic_formatter(topic)
        res = []
        for _topic, handler in self.handlers.items():
            if self.topic_filter(_topic, topic):
                res.append(await handler.call_async(*args, **kwargs))
        return self.reducer(res)

    def has_topic(self, topic):
        return topic in self.handlers

    def __getitem__(self, item):
        return self.handlers[item]
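A minimal usage sketch for TopicHandler, assuming the Handler and delistify helpers it references (not shown in this snippet) register callbacks and flatten single-item result lists; the topics and callback signature below are illustrative only.

# Hypothetical usage: TopicHandler forwards the original topic plus any extra
# arguments to every Handler whose registered topic passes topic_filter
# (exact match by default), then combines the results with the reducer.
events = TopicHandler()
events.add('user.created', callback=lambda topic, payload: payload)
events.add('user.deleted', callback=lambda topic, payload: payload['id'])

events.set_topic_formatter(lambda t: t.lower())  # normalise incoming topics
# Filtering uses the formatted topic; the callbacks still receive the raw one.
result = events('USER.CREATED', {'name': 'alice'})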
def test_setdefault():
    od = OrderedDict()
    od.setdefault(0, False)
    assert od[0] == False
    od.setdefault(1, default=True)
    assert od[1] == True
    od.setdefault(2)
    assert od[2] == None
    assert od.setdefault(0) == False
    assert od.setdefault(1) == True
def start_posting_procedure(self):
    # Get new file descriptors for writing the buffer
    if self.term_posting_fd is None:
        self.get_new_term_posting_file()
    if self.doc_posting_fd is None:
        self.get_new_doc_posting_file()
    if self.stemmer is not None:
        self.stemmer.reset_dictionary()
    # Pass the callables themselves (not their results) so the work actually
    # runs on the worker threads
    docs_thread = Thread(name='post_docs', target=self.post_docs)
    docs_thread.start()
    main_dictionary_thread = Thread(
        name='store_terms',
        target=self.dictionary.add_terms,
        args=(self.global_term_dict.values(),))
    main_dictionary_thread.start()
    # Sort the term buffer before writing to storage
    self.global_term_dict = OrderedDict(
        sorted(self.global_term_dict.items()))
    # Post the data to storage
    self.post_terms_and_reset_posting(docs_thread, main_dictionary_thread)
def gen_search_gzh_url(keyword, page=1):
    """Build the search URL for WeChat official accounts (公众号).

    Parameters
    ----------
    keyword : str or unicode
        Search text
    page : int, optional
        Page number, the default is 1

    Returns
    -------
    str
        search_gzh_url
    """
    assert isinstance(page, int) and page > 0
    qs_dict = OrderedDict()
    qs_dict['type'] = _search_type_gzh
    qs_dict['page'] = page
    qs_dict['ie'] = 'utf8'
    qs_dict['query'] = keyword
    return 'http://weixin.sogou.com/weixin?{}'.format(urlencode(qs_dict))
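A quick illustrative call; the concrete value bound to 'type' comes from the module constant _search_type_gzh, which is not shown here.

# Illustrative only: _search_type_gzh is defined elsewhere in the module.
url = gen_search_gzh_url(u'python', page=2)
# Because the query string is built from an OrderedDict, the parameters keep
# their insertion order (type, page, ie, query), e.g.
# http://weixin.sogou.com/weixin?type=...&page=2&ie=utf8&query=python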
def test_clear():
    od = OrderedDict(pairs)
    assert len(od) == 10
    od.clear()
    assert len(od) == 0
    od._check()
def test_iter_reversed():
    od = OrderedDict([('b', 0), ('a', 1), ('c', 2)])
    assert list(od) == ['b', 'a', 'c']
    assert list(reversed(od)) == ['c', 'a', 'b']
    od._check()
def test_equality():
    od = OrderedDict.fromkeys('abc')
    assert od == {'a': None, 'b': None, 'c': None}
    assert od != {}
    assert od != OrderedDict()
    od._check()
def test_pop_error():
    od = OrderedDict()
    with pytest.raises(KeyError):
        od.pop(0)
def test_values():
    od = OrderedDict(enumerate(range(10)))
    assert list(reversed(od.values())) == list(reversed(range(10)))
    assert od.values()[:3] == [0, 1, 2]
    od._check()
def test_init():
    od = OrderedDict()
    assert len(od) == 0
    od._check()
    od = OrderedDict(enumerate(range(10)))
    assert len(od) == 10
    od._check()
    od = OrderedDict(a=0, b=1, c=2)
    assert len(od) == 3
    od._check()
    od = OrderedDict(pairs)
    assert len(od) == 10
    od._check()
def test_pop_error():
    od = OrderedDict()
    with pytest.raises(KeyError):
        od.pop(0)
def test_items():
    od = OrderedDict(enumerate(range(10)))
    assert od.items() == list(enumerate(range(10)))
    od._check()
def test_viewkeys():
    od = OrderedDict(enumerate(range(10)))
    view = od.viewkeys()
    assert list(reversed(view)) == list(reversed(range(10)))
    od._check()
def test_itervalues():
    od = OrderedDict(enumerate(range(10)))
    assert list(od.itervalues()) == list(range(10))
    od._check()
def test_copy():
    od = OrderedDict(enumerate(range(10)))
    copy = od.copy()
    assert od == copy
def test_items():
    items = list(enumerate(range(10)))
    od = OrderedDict(enumerate(range(10)))
    assert list(reversed(od.items())) == list(reversed(items))
    assert od.items()[:3] == [(0, 0), (1, 1), (2, 2)]
    od._check()
def test_fromkeys():
    od = OrderedDict.fromkeys('abc')
    assert od == {'a': None, 'b': None, 'c': None}
    od._check()
def test_values():
    od = OrderedDict(enumerate(range(10)))
    assert od.values() == list(range(10))
    od._check()
def test_repr():
    od = OrderedDict()
    assert repr(od) == 'OrderedDict([])'
    assert str(od) == 'OrderedDict([])'
def test_reduce():
    od = OrderedDict(enumerate(range(10)))
    data = pickle.dumps(od)
    copy = pickle.loads(data)
    assert od == copy
class Indexer(object):
    """
    Inverted index data structure
    """

    def __init__(self, corpus_folder_path, stop_words_file_path, to_stem,
                 path_for_posting_and_dictionary):
        self.stop_words_container = StopWordsContainer(stop_words_file_path)
        self.stemmer = Stemmer() if to_stem else None
        self.parser = Parse()
        self.doc_posting_fd = None
        self.doc_counter = 0  # Will be used to count the number of iterations
        self.doc_repr_list = []
        self.term_posting_fd = None
        self.global_term_dict = {}
        self.number_of_docs_processed = 0
        self.corpus_folder = corpus_folder_path
        self.stop_words_file_path = stop_words_file_path
        self.path_for_posting_and_dictionary = path_for_posting_and_dictionary
        self.stem_doc_posting_folder = os.path.join(
            path_for_posting_and_dictionary,
            IR_CONFIG["storage"]["stem_doc_posting_folder"])
        self.doc_posting_folder = os.path.join(
            path_for_posting_and_dictionary,
            IR_CONFIG["storage"]["doc_posting_folder"])
        self.stem_term_posting_folder = os.path.join(
            path_for_posting_and_dictionary,
            IR_CONFIG["storage"]["stem_term_posting_folder"])
        self.term_posting_folder = os.path.join(
            path_for_posting_and_dictionary,
            IR_CONFIG["storage"]["term_posting_folder"])
        if self.stemmer is not None:
            self.doc_posting_file_path = self.stem_doc_posting_folder
            self.term_posting_file_target = os.path.join(
                self.stem_term_posting_folder,
                IR_CONFIG["storage"]["stem_term_posting_file_name"])
            self.doc_posting_file_target = os.path.join(
                self.stem_doc_posting_folder,
                IR_CONFIG["storage"]["stem_doc_posting_file_name"])
            self.cache_file_path = os.path.join(
                path_for_posting_and_dictionary,
                IR_CONFIG["storage"]["cache_file_name_stem"])
            self.dictionary_file_path = os.path.join(
                path_for_posting_and_dictionary,
                IR_CONFIG["storage"]["dictionary_file_name_stem"])
        else:
            self.doc_posting_file_path = self.doc_posting_folder
            self.term_posting_file_target = os.path.join(
                self.term_posting_folder,
                IR_CONFIG["storage"]["term_posting_file_name"])
            self.doc_posting_file_target = os.path.join(
                self.doc_posting_folder,
                IR_CONFIG["storage"]["doc_posting_file_name"])
            self.cache_file_path = os.path.join(
                path_for_posting_and_dictionary,
                IR_CONFIG["storage"]["cache_file_name"])
            self.dictionary_file_path = os.path.join(
                path_for_posting_and_dictionary,
                IR_CONFIG["storage"]["dictionary_file_name"])
        self.dictionary = MainDictionary(self.term_posting_file_target)

    def build_index(self):
        """
        The main function for building the index.
""" read_file_obj = ReadFile(self.corpus_folder) for data in read_file_obj.extract_from_tags(): # Get handler corpus_handler = self.get_handler(data) # Get doc object to work on doc_obj = self.get_doc_obj(corpus_handler) # Append the doc obj to the list self.doc_repr_list.append(doc_obj.__repr__()) # Update counter self.doc_counter += 1 # In case we got to counter limit if IR_CONFIG["indexer"]["doc_to_process"] == self.doc_counter: self.start_posting_procedure() # Still having data to process if 0 < len(self.global_term_dict.values()): self.start_posting_procedure() # Merge the results to one single file self.merge_terms_posting() if self.doc_posting_fd is not None: self.doc_posting_fd.close() self.dictionary.update_term_data() # self.dictionary.initialize_terms_pointer() # self.dictionary.update_term_tf() self.dictionary.filter_low_frequency_term() self.dictionary.init_cache() self.dictionary.initialize_document_dictionary( self.doc_posting_file_target) self.add_document_data() try: self.save_dict_and_cache(self.path_for_posting_and_dictionary) except: pass def update_term_data(self): self.dictionary.update_term_data() def add_document_data(self): if self.stemmer is not None: path = os.path.join( (os.path.join( self.path_for_posting_and_dictionary, IR_CONFIG["storage"]["stem_doc_posting_folder"])), IR_CONFIG["storage"]["stem_doc_posting_file_name"]) else: path = os.path.join( (os.path.join(self.path_for_posting_and_dictionary, IR_CONFIG["storage"]["doc_posting_folder"])), IR_CONFIG["storage"]["doc_posting_file_name"]) with open(path + "2", 'w') as out_file: with open(path, 'r') as in_file: for line in in_file: out_file.write( self.calc_document_wight(line.rstrip('\n')) + '\n') # os.remove(path) # os.rename(path + "2", path.replace("2", "")) def calc_document_wight(self, line): splitted_line = line.split(",") # doc id, doc file name, header, date, document length, term with max tf, *term#tf*.... 
        document_weight = 0
        max_tf_freq = int(splitted_line[5])
        for data in splitted_line[6].split("*"):
            try:
                splitted_data = data.split("#")
                term_tf = int(splitted_data[1])
                term = splitted_data[0]  # the term itself; index 1 is its tf
                term_df = self.dictionary.data_dict[term][0]
                num_of_docs = len(self.dictionary.documents_dictionary)
                # float() keeps the divisions true divisions under Python 2
                term_idf = math.log(float(num_of_docs) / term_df, 2)
                document_weight += ((float(term_tf) / max_tf_freq) *
                                    term_idf) ** 2
            except Exception:
                pass  # couldn't find the term in the dictionary
        return ",".join([
            splitted_line[0], splitted_line[1], splitted_line[2],
            splitted_line[3], splitted_line[4], splitted_line[5],
            str(document_weight ** 0.5)
        ])

    def merge_terms_posting(self):
        if self.stemmer is None:
            merge_sorter = MergeFiles(
                TermsMergerStrategy(),
                (os.path.join(self.path_for_posting_and_dictionary,
                              IR_CONFIG["storage"]["term_posting_folder"])),
                IR_CONFIG["storage"]["term_posting_file_name"])
        else:
            merge_sorter = MergeFiles(
                TermsMergerStrategy(),
                (os.path.join(
                    self.path_for_posting_and_dictionary,
                    IR_CONFIG["storage"]["stem_term_posting_folder"])),
                IR_CONFIG["storage"]["stem_term_posting_file_name"])
        merge_sorter.merge_sort()

    def start_posting_procedure(self):
        # Get new file descriptors for writing the buffer
        if self.term_posting_fd is None:
            self.get_new_term_posting_file()
        if self.doc_posting_fd is None:
            self.get_new_doc_posting_file()
        if self.stemmer is not None:
            self.stemmer.reset_dictionary()
        # Pass the callables themselves (not their results) so the work
        # actually runs on the worker threads
        docs_thread = Thread(name='post_docs', target=self.post_docs)
        docs_thread.start()
        main_dictionary_thread = Thread(
            name='store_terms',
            target=self.dictionary.add_terms,
            args=(self.global_term_dict.values(),))
        main_dictionary_thread.start()
        # Sort the term buffer before writing to storage
        self.global_term_dict = OrderedDict(
            sorted(self.global_term_dict.items()))
        # Post the data to storage
        self.post_terms_and_reset_posting(docs_thread, main_dictionary_thread)

    def reset_posting(self):
        """
        Resets the data for a new group of documents to process
        :return:
        """
        if self.term_posting_fd is not None:
            self.term_posting_fd.close()
        # self.doc_posting_fd = None
        self.doc_counter = 0  # Will be used to count the number of iterations
        self.doc_repr_list = []
        self.term_posting_fd = None
        self.global_term_dict = {}

    def post_terms_and_reset_posting(self, docs_thread_in,
                                     main_dictionary_thread_in):
        """
        :return:
        """
        term_thread = Thread(name='post_term', target=self.post_terms)
        term_thread.start()
        main_dictionary_thread_in.join()
        term_thread.join()
        docs_thread_in.join()
        self.reset_posting()

    def post_terms(self):
        for term_str, term_obj in self.global_term_dict.iteritems():
            self.term_posting_fd.write(term_obj.__repr__() + "\n")

    def post_docs(self):
        index = 0
        for doc_repr in self.doc_repr_list:
            self.number_of_docs_processed += 1
            self.doc_posting_fd.write(doc_repr + "\n")
            index += 1

    def get_doc_obj(self, corpus_handler):
        """
        Creates a new Document object from the given handler.
        :param corpus_handler: A handler of the current document
        :return: A new Document to process
        """
        return Document.Document(self.global_term_dict,
                                 self.stop_words_container, self.stemmer,
                                 corpus_handler)

    def get_handler(self, data):
        """
        Gets the proper handler for the given document
        :param data: a document
        :return: A handler for the specific type of document
        """
        try:
            prefix = " ".join(data[1].split())[:2]
            if "FB" == prefix:
                corpus_handler = FBCorpusHandler(data, self.parser)
            elif "FT" == prefix:
                corpus_handler = FTCorpusHandler(data, self.parser)
            elif "LA" == prefix:
                corpus_handler = LACorpusHandler(data, self.parser)
            else:
                raise ValueError("Prefix '%s' is not supported" % (prefix, ))
            return corpus_handler
        except Exception:
            # Print the full traceback; traceback.print_exc() does not take
            # the exception message as an argument
            traceback.print_exc()
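A hypothetical driver for the Indexer above; the folder paths are placeholders, and the IR_CONFIG entries, corpus files, and stop-words file it relies on come from the surrounding project.

# Sketch of how the class might be driven; all paths here are assumptions.
indexer = Indexer(corpus_folder_path='corpus/',
                  stop_words_file_path='stop_words.txt',
                  to_stem=False,
                  path_for_posting_and_dictionary='postings/')
indexer.build_index()  # parse the corpus, post terms/docs, merge posting files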