def get_iterator(self):
    """Yield tweets (as dicts) from a CSV file, optionally bz2/gzip compressed.

    Each CSV row is read with ``csv.DictReader``, passed through the
    configured filter and custom-filter list, optionally stripped down to
    ``self.keep_fields``, and yielded.  ``self.limit`` caps the number of
    rows examined; ``0`` means "no limit".

    Yields:
        dict: one tweet per matching CSV row.
    """
    tweet_parser = TweetParser()

    # Open the right kind of handle for the configured compression.
    # NOTE(review): binary_mode() presumably converts self.mode to a binary
    # mode, but CPython's bz2.open/gzip.open reject encoding= in binary
    # mode (ValueError) — confirm binary_mode's actual return value.
    if self.compression == 'bz2':
        self.mode = binary_mode(self.mode)
        csv_handle = bz2.open(self.filepath, self.mode, encoding=self.encoding)
    elif self.compression == 'gzip':
        self.mode = binary_mode(self.mode)
        csv_handle = gzip.open(self.filepath, self.mode, encoding=self.encoding)
    else:
        csv_handle = open(self.filepath, self.mode, encoding=self.encoding)

    try:
        for count, tweet in enumerate(csv.DictReader(csv_handle)):
            # Stop once `limit` rows have been seen (limit == 0 disables the cap).
            if self.limit < count + 1 and self.limit != 0:
                return
            if tweet_parser.tweet_passes_filter(self.filter, tweet) \
                    and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
                if self.should_strip:
                    yield tweet_parser.strip_tweet(self.keep_fields, tweet)
                else:
                    # DictReader rows are OrderedDict-like; hand back a plain dict.
                    yield dict(tweet)
    finally:
        # Close even when the consumer abandons the generator or a filter
        # raises — the original leaked the handle on those paths.
        csv_handle.close()
def get_iterator(self):
    """Yield tweets decoded from a BSON file.

    Documents are streamed with ``bson.decode_file_iter``, run through the
    configured filter and custom-filter list, optionally stripped to
    ``self.keep_fields``, and yielded.  ``self.limit`` caps the number of
    documents examined; ``0`` means "no limit".

    Yields:
        dict-like BSON document: one per matching tweet.
    """
    tweet_parser = TweetParser()
    bson_handle = open(self.filepath, 'rb')
    try:
        for count, tweet in enumerate(bson.decode_file_iter(bson_handle)):
            # Stop once `limit` documents have been seen (limit == 0 disables the cap).
            if self.limit < count + 1 and self.limit != 0:
                return
            if tweet_parser.tweet_passes_filter(self.filter, tweet) \
                    and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
                if self.should_strip:
                    yield tweet_parser.strip_tweet(self.keep_fields, tweet)
                else:
                    yield tweet
    finally:
        # Close even when the consumer abandons the generator or a filter
        # raises — the original only closed on the two happy paths.
        bson_handle.close()
def get_iterator(self):
    """Yield tweets decoded from a BSON file.

    Streams documents via ``bson.decode_file_iter``, applies the configured
    filter plus the custom-filter list, optionally strips each document to
    ``self.keep_fields``, and yields the result.  ``self.limit`` caps how
    many documents are examined; ``0`` means "no limit".

    Yields:
        dict-like BSON document: one per matching tweet.
    """
    tweet_parser = TweetParser()
    bson_handle = open(self.filepath, 'rb')
    try:
        for count, tweet in enumerate(bson.decode_file_iter(bson_handle)):
            # Stop once `limit` documents have been seen (limit == 0 disables the cap).
            if self.limit < count + 1 and self.limit != 0:
                return
            if tweet_parser.tweet_passes_filter(self.filter, tweet) \
                    and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
                if self.should_strip:
                    yield tweet_parser.strip_tweet(self.keep_fields, tweet)
                else:
                    yield tweet
    finally:
        # Close even when the consumer abandons the generator or a filter
        # raises — the original only closed on the two happy paths.
        bson_handle.close()
def get_iterator(self):
    """Yield tweets from a line-delimited JSON file.

    Each line is parsed with ``json_util.loads``, run through the configured
    filter and custom-filter list, optionally stripped to
    ``self.keep_fields``, and yielded.  ``self.limit`` caps the number of
    lines examined; ``0`` means "no limit".

    Yields:
        dict: one parsed tweet per matching line.
    """
    tweet_parser = TweetParser()
    json_handle = open(self.filepath, 'r')
    try:
        for count, tweet in enumerate(json_handle):
            tweet = json_util.loads(tweet)
            # Stop once `limit` lines have been seen (limit == 0 disables the cap).
            if self.limit != 0 and self.limit <= count:
                return
            if tweet_parser.tweet_passes_filter(self.filter, tweet) \
                    and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
                if self.should_strip:
                    yield tweet_parser.strip_tweet(self.keep_fields, tweet)
                else:
                    yield tweet
    finally:
        # BUG FIX: the original `return` on the limit path skipped
        # json_handle.close() entirely, leaking the file descriptor; the
        # finally clause now closes on every exit path (limit hit,
        # exhaustion, exception, or generator abandonment).
        json_handle.close()
def get_iterator(self):
    """Yield tweets from a CSV file read via ``unicodecsv.DictReader``.

    The file is opened in binary mode (as unicodecsv expects); each row is
    run through the configured filter and custom-filter list, optionally
    stripped to ``self.keep_fields``, and yielded.  ``self.limit`` caps the
    number of rows examined; ``0`` means "no limit".

    Yields:
        dict: one tweet per matching CSV row.
    """
    tweet_parser = TweetParser()
    csv_handle = open(self.filepath, "rb")
    try:
        for count, tweet in enumerate(unicodecsv.DictReader(csv_handle)):
            # Stop once `limit` rows have been seen (limit == 0 disables the cap).
            if self.limit < count + 1 and self.limit != 0:
                return
            if tweet_parser.tweet_passes_filter(self.filter, tweet) and tweet_parser.tweet_passes_custom_filter_list(
                self.custom_filters, tweet
            ):
                if self.should_strip:
                    yield tweet_parser.strip_tweet(self.keep_fields, tweet)
                else:
                    yield tweet
    finally:
        # Close even when the consumer abandons the generator or a filter
        # raises — the original only closed on the two happy paths.
        csv_handle.close()
def get_iterator(self):
    """Yield tweets from a line-delimited JSON file, optionally compressed.

    Each line is parsed with ``json_util.loads``.  When ``self.throw_error``
    is falsy, corrupt lines are counted and skipped instead of raising.
    Parsed tweets pass through the configured filter and custom-filter
    list, are optionally stripped to ``self.keep_fields``, and yielded.
    ``self.limit`` caps the number of lines examined (``0`` = no limit).
    With ``self.verbose`` set, a summary of good/corrupt line counts is
    printed after the file is exhausted.

    Yields:
        dict: one parsed tweet per matching line.
    """
    tweet_parser = TweetParser()

    # Open the right kind of handle for the configured compression.
    # NOTE(review): bz2.open/gzip.open reject encoding= in binary mode
    # (ValueError) — confirm what binary_mode() actually returns.
    if self.compression == 'bz2':
        self.mode = binary_mode(self.mode)
        json_handle = bz2.open(self.filepath, self.mode, encoding=self.encoding)
    elif self.compression == 'gzip':
        self.mode = binary_mode(self.mode)
        json_handle = gzip.open(self.filepath, self.mode, encoding=self.encoding)
    else:
        json_handle = open(self.filepath, self.mode, encoding=self.encoding)

    bad_lines = 0
    count = -1  # stays -1 for an empty file so the verbose totals are correct
    try:
        for count, line in enumerate(json_handle):
            # Check the limit before spending work on parsing
            # (limit == 0 disables the cap).
            if self.limit != 0 and self.limit <= count:
                return
            if self.throw_error:
                tweet = json_util.loads(line)
            else:
                try:
                    tweet = json_util.loads(line)
                except Exception:
                    # BUG FIX: was a bare `except:` (which also traps
                    # KeyboardInterrupt/SystemExit) and did NOT skip the
                    # line, so the raw unparsed string fell through into
                    # the filters and could be yielded as a "tweet".
                    bad_lines += 1
                    continue
            if tweet_parser.tweet_passes_filter(self.filter, tweet) \
                    and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
                if self.should_strip:
                    yield tweet_parser.strip_tweet(self.keep_fields, tweet)
                else:
                    yield tweet
        if self.verbose:
            # `count` is the last 0-based index, so count + 1 lines were
            # read (the original printed `count - bad_lines`, off by one,
            # and raised NameError on an empty file).
            print("{} rows are ok.".format(count + 1 - bad_lines))
            print("{} rows are corrupt.".format(bad_lines))
    finally:
        # Close on every exit path — the original's limit `return` skipped
        # json_handle.close() entirely.
        json_handle.close()