예제 #1
0
 def __iter__(self):
     """Yield the parsed info record of every comment that passes the filters.

     Each line of the ``<path>.info`` file is handed to ``_parse_info``.
     A record is dropped when ``clean_deleted`` is set and its author is
     ``"[deleted]"``, or when ``clean_bots`` is set and its author is either
     flagged by ``is_bot`` or listed in ``FILTERED_USERS``.
     """
     with open(self.path + ".info") as info_file:
         for raw_line in info_file:
             record = self._parse_info(raw_line)
             # The author is only looked up when a filter that needs it is on.
             drop = self.clean_deleted and record["author"] == "[deleted]"
             if not drop and self.clean_bots:
                 author = record["author"]
                 drop = is_bot(author) or author in FILTERED_USERS
             if not drop:
                 yield record
예제 #2
0
 def __iter__(self, week=None):
     """Yield info records with the deserialized title ``Doc`` attached.

     The ``<path>.info`` file is read one line per serialized document in
     ``<path>.title.bin``; each line is parsed with ``_parse_info`` and the
     matching document is stored under the ``"doc"`` key of the record.

     Args:
         week: When not ``None``, records for which
             ``get_week(record["timestamp"])`` differs from this value are
             skipped.

     Yields:
         The parsed record of every comment that passes the week filter,
         the ``clean_deleted`` filter (author ``"[deleted]"``) and the
         ``clean_bots`` filter (``is_bot`` or ``FILTERED_USERS``).
     """
     info_path = self.path + ".info"
     titles_path = self.path + ".title.bin"
     # The serialized documents are raw bytes, so the file has to be opened
     # in binary mode: text mode yields str on Python 3 and may translate
     # newlines on Windows, corrupting the byte stream.
     with open(info_path) as info, open(titles_path, "rb") as title_bin:
         for byte_string in Doc.read_bytes(title_bin):
             # Always consume the info line, even for skipped records, so
             # the two files stay aligned.
             info_line = info.readline()
             comment_info = self._parse_info(info_line)
             if week is not None and get_week(comment_info["timestamp"]) != week:
                 continue
             if self.clean_deleted and comment_info["author"] == "[deleted]":
                 continue
             if self.clean_bots and (
                 is_bot(comment_info["author"])
                 or comment_info["author"] in FILTERED_USERS
             ):
                 continue
             # Deserialize only the records that survived the filters.
             comment_info["doc"] = Doc(self._vocab).from_bytes(byte_string)
             yield comment_info
예제 #3
0
 def __iter__(self, week=None):
     """Yield info records with the deserialized ``Doc`` and its text attached.

     Serialized documents are read from ``<path>.bin`` and paired, in order,
     with the lines of ``<path>.info``. Each line is parsed with
     ``_parse_info``; surviving records get the document under ``"doc"`` and
     the result of ``_text_from_doc`` under ``"text"``.

     Args:
         week: When not ``None``, records for which
             ``get_week(record["timestamp"])`` differs from this value are
             skipped.

     Yields:
         The parsed record of every comment that passes the week filter,
         the ``clean_deleted`` filter (author ``"[deleted]"``) and the
         ``clean_bots`` filter (``is_bot`` or ``FILTERED_USERS``).
     """
     docs_path = self.path + ".bin"
     info_path = self.path + ".info"
     with open(docs_path, "rb") as doc_file, open(info_path) as info:
         for byte_string in Doc.read_bytes(doc_file):
             # next() with a default works on Python 2 and 3, unlike the
             # Python-2-only file.next() method.
             info_line = next(info, None)
             if info_line is None:
                 # The info file ran out before the documents did. End the
                 # iteration quietly rather than letting StopIteration
                 # escape the generator (a RuntimeError under PEP 479).
                 break
             comment_info = self._parse_info(info_line)
             if week is not None and get_week(comment_info["timestamp"]) != week:
                 continue
             if self.clean_deleted and comment_info["author"] == "[deleted]":
                 continue
             if self.clean_bots and (
                 is_bot(comment_info["author"])
                 or comment_info["author"] in FILTERED_USERS
             ):
                 continue
             # Deserialize only the records that survived the filters.
             doc = Doc(self._vocab).from_bytes(byte_string)
             comment_info["doc"] = doc
             comment_info["text"] = self._text_from_doc(doc)
             yield comment_info