def collect_feedback(cls):
    seen_posts = set()
    try:
        data = urllib2.urlopen('https://forum.newsblur.com/posts.json').read()
    except (urllib2.HTTPError), e:
        logging.debug(" ***> Failed to collect feedback: %s" % e)
        return
def test_dont_query_myself(self):
    log.debug('test start')
    self.lookup.start()
    # Ongoing queries to (sorted: oldest first):
    # 155-4, 157-3,
    # Queued nodes to query (sorted by log_distance to info_hash):
    # 158-1, 159-0
    # Notice 159-2 is kicked out from the queue
    eq_(self.lookup.num_parallel_queries, 2)
    nodes = [Node(tc.CLIENT_ADDR, self.lookup._my_id)]
    self.lookup._on_response(*_gen_nodes_args(
        tc.NODES_LD_IH[157][3],
        nodes))
    eq_(self.lookup._get_announce_candidates(),
        [tc.NODES_LD_IH[157][3], ])
    # This response triggers a new query to 158-1 (ignoring myself)
    eq_(self.lookup.num_parallel_queries, 2)
    # Ongoing queries to (sorted: oldest first):
    # 155-4, 158-1
    # Queued nodes to query (sorted by log_distance to info_hash):
    # 159-0
    self.lookup._on_timeout(tc.NODES_LD_IH[155][4])
    # This timeout triggers a new query (to 159-0)
    eq_(self.lookup.num_parallel_queries, 2)
    self.lookup._on_timeout(tc.NODES_LD_IH[158][1])
    # No more nodes to send queries to
    eq_(self.lookup.num_parallel_queries, 1)
    ok_(not self.lookup.is_done)
    self.lookup._on_timeout(tc.NODES_LD_IH[159][0])
    # No more nodes to send queries to
    eq_(self.lookup.num_parallel_queries, 0)
    ok_(self.lookup.is_done)
def test_different_delay(self):
    # NOTICE: this test might fail if your configuration
    # (interpreter/processor) is too slow
    task_delays = (1, 1, 1, .5, 1, 1, 2, 1, 1, 1, 1, 1.5, 1, 1, 1, 1, .3)
    expected_list = ([],
                     ['a', 16, 3, 'b'],  # 9 is cancelled
                     ['a', 0, 1, 2, 4, 5, 7, 8, 10, 12, 13, 15, 'c', 'b'],
                     ['a', 11, 'c', 'b'],
                     ['a', 6, 'c', 'b'],
                     )
    tasks = [Task(delay, self.callback_f, i) \
             for i, delay in enumerate(task_delays)]
    for task in tasks:
        self.task_m.add(task)

    for i, expected in enumerate(expected_list):
        while True:
            task = self.task_m.consume_task()
            if task is None:
                break
            task.fire_callbacks()
        log.debug('#: %d, result: %s, expected: %s' % (
            i, self.callback_order, expected))
        assert self.callback_order == expected
        self.callback_order = []
        self.task_m.add(Task(0, self.callback_f, 'a'))
        self.task_m.add(Task(.5, self.callback_f, 'b'))
        self.task_m.add(Task(1, self.callback_f, 'c'))
        time.sleep(.5)
        tasks[9].cancel()   # too late (already fired)
        tasks[14].cancel()  # should be cancelled
def test_cancel(self):
    for i in xrange(5):
        self.task_m.add(Task(.1, self.callback_f, i))
    c_task = Task(.1, self.callback_f, 5)
    self.task_m.add(c_task)
    for i in xrange(6, 10):
        self.task_m.add(Task(.1, self.callback_f, i))
    while True:
        task = self.task_m.consume_task()
        if task is None:
            break
        task.fire_callbacks()
    log.debug('%s' % self.callback_order)
    assert self.callback_order == []
    ok_(not c_task.cancelled)
    c_task.cancel()
    ok_(c_task.cancelled)
    time.sleep(.1)
    while True:
        task = self.task_m.consume_task()
        if task is None:
            break
        task.fire_callbacks()
    log.debug('%s' % self.callback_order)
    assert self.callback_order == [0, 1, 2, 3, 4, 6, 7, 8, 9]
def query(cls, feed_ids, query, order, offset, limit, strip=False):
    cls.create_elasticsearch_mapping()
    cls.ES.indices.refresh()

    if strip:
        query = re.sub(r'([^\s\w_\-])+', ' ', query)  # Strip non-alphanumeric
    sort = "date:desc" if order == "newest" else "date:asc"
    string_q = pyes.query.QueryStringQuery(query, default_operator="AND")
    feed_q = pyes.query.TermsQuery('feed_id', feed_ids[:1000])
    q = pyes.query.BoolQuery(must=[string_q, feed_q])
    try:
        results = cls.ES.search(q, indices=cls.index_name(),
                                doc_types=[cls.type_name()],
                                partial_fields={}, sort=sort,
                                start=offset, size=limit)
    except pyes.exceptions.NoServerAvailable:
        logging.debug(" ***> ~FRNo search server available.")
        return []
    logging.info(" ---> ~FG~SNSearch ~FCstories~FG for: ~SB%s~SN (across %s feed%s)" % (
                 query, len(feed_ids), 's' if len(feed_ids) != 1 else ''))

    try:
        result_ids = [r.get_id() for r in results]
    except pyes.InvalidQuery, e:
        logging.info(" ---> ~FRInvalid search query \"%s\": %s" % (query, e))
        return []
def _test_error(self):
    outgoing_error_msg = OutgoingErrorMsg(tc.TID, GENERIC_E)
    data = outgoing_error_msg.encode()
    tid, msg_type, msg_dict = decode(data)
    incoming_error_msg = IncomingErrorMsg(msg_dict)
    log.debug(incoming_error_msg.error)
    assert incoming_error_msg.error == GENERIC_E
def count_unreads_for_subscribers(self, feed):
    UNREAD_CUTOFF = datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
    user_subs = UserSubscription.objects.filter(
        feed=feed, active=True, user__profile__last_seen_on__gte=UNREAD_CUTOFF
    ).order_by("-last_read_date")

    for sub in user_subs:
        if not sub.needs_unread_recalc:
            sub.needs_unread_recalc = True
            sub.save()

    if self.options["compute_scores"]:
        stories_db = MStory.objects(story_feed_id=feed.pk, story_date__gte=UNREAD_CUTOFF)
        logging.debug(
            u" ---> [%-30s] ~FYComputing scores: ~SB%s stories~SN with ~SB%s subscribers ~SN(%s/%s/%s)"
            % (
                feed.title[:30],
                stories_db.count(),
                user_subs.count(),
                feed.num_subscribers,
                feed.active_subscribers,
                feed.premium_subscribers,
            )
        )
        self.calculate_feed_scores_with_stories(user_subs, stories_db)
    elif self.options.get("mongodb_replication_lag"):
        logging.debug(
            u" ---> [%-30s] ~BR~FYSkipping computing scores: ~SB%s seconds~SN of mongodb lag"
            % (feed.title[:30], self.options.get("mongodb_replication_lag"))
        )
def on_response_received(self, response_msg, addr):
    # TYPE and TID already sanitized by rpc_manager
    log.debug('response received: %s' % repr(response_msg))
    try:
        addr_query_list = self.pending[addr]
    except (KeyError):
        log.warning('No pending queries for %s', addr)
        return  # Ignore response
    # There are pending queries from node (let's find the right one (TID)
    query_found = False
    for query_index, query in enumerate(addr_query_list):
        log.debug('response node: %s, query:\n(%s, %s)' % (
            `addr`, `query.tid`, `query.query`))
        if query.matching_tid(response_msg.tid):
            query_found = True
            break
    if not query_found:
        log.warning('No query for this response\n%s\nsource: %s' % (
            response_msg, addr))
        return  # ignore response
    # This response matches query. Trigger query's callback
    response_is_ok = query.on_response_received(response_msg)
    if response_is_ok:
        # Remove this query from pending
        if len(addr_query_list) == 1:
            # There is one item in the list. Remove the whole list.
            del self.pending[addr]
        else:
            del addr_query_list[query_index]
    else:
        log.warning('Bad response from %r\n%r' % (addr, response_msg))
def main():
    lang = 'zh'
    if len(sys.argv) == 2:
        lang = sys.argv[1]

    cd = sys.path[0]
    translation_path = os.path.join(cd, '../translation')

    # load lua
    pregame_file = os.path.join(translation_path, 'en_pregame.lua')
    client_file = os.path.join(translation_path, 'en_client.lua')
    ui_mgr = UiMgr()
    log.debug('loading lua file %s' % pregame_file)
    ui_mgr.load_lua_file(pregame_file)
    log.debug('loading lua file %s' % client_file)
    ui_mgr.load_lua_file(client_file)
    log.info('read %d lines.' % len(ui_mgr.ui_lines))

    # save merged lines
    translate_file = os.path.join(translation_path, '%s_translate.txt' % lang)
    if os.path.exists(translate_file):
        choose = input('%s_translate.txt file exists, merge? [y/N]' % lang)
        choose = choose.lower().strip()
        if choose != '' and choose[0] == 'y':
            log.info('merging to translate file.')
            ui_mgr.apply_translate_from_txt_file(translate_file)
        else:
            log.info('skipped.')
            return
    with open(translate_file, 'wt', encoding='utf-8') as fp:
        fp.writelines(ui_mgr.get_txt_lines(replace=True))
    log.info('save translate file succeed.')
def _2(*args, **kw):
    class Dispatch(threading.Thread):
        def __init__(self):
            threading.Thread.__init__(self)
            self.result = None
            self.error = None

            self.setDaemon(True)
            self.start()

        def run(self):
            try:
                self.result = function(*args, **kw)
            except:
                self.error = sys.exc_info()

    c = Dispatch()
    c.join(timeout)
    if c.isAlive():
        raise TimeoutError, 'took too long'
    if c.error:
        tb = ''.join(traceback.format_exception(c.error[0], c.error[1], c.error[2]))
        logging.debug(tb)
        mail_admins('Error in timeout: %s' % c.error[0], tb)
        raise c.error[0], c.error[1], c.error[2]
    return c.result
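# A minimal sketch (an assumption, not the original source) of the enclosing
# decorator that produces a wrapper like `_2` above: `timelimit(timeout)` closes
# over `function` and `timeout`, runs the call in a daemon thread, and raises if
# the thread is still alive once the deadline passes. `timelimit` and this
# `TimeoutError` are hypothetical names used only for illustration.
import sys
import threading


class TimeoutError(Exception):
    pass


def timelimit(timeout):
    def _1(function):
        def _2(*args, **kw):
            result = {}

            def runner():
                # Capture either the return value or the exception info.
                try:
                    result['value'] = function(*args, **kw)
                except Exception:
                    result['error'] = sys.exc_info()

            t = threading.Thread(target=runner)
            t.daemon = True
            t.start()
            t.join(timeout)
            if t.is_alive():
                raise TimeoutError('took too long')
            if 'error' in result:
                raise result['error'][1]
            return result['value']
        return _2
    return _1


# Usage sketch: give a slow call at most 2 seconds.
# @timelimit(2)
# def fetch(url):
#     return requests.get(url)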
def count_unreads_for_subscribers(self, feed):
    UNREAD_CUTOFF = datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
    user_subs = UserSubscription.objects.filter(feed=feed,
                                                active=True,
                                                user__profile__last_seen_on__gte=UNREAD_CUTOFF)\
                                        .order_by('-last_read_date')
    if not user_subs.count():
        return

    for sub in user_subs:
        if not sub.needs_unread_recalc:
            sub.needs_unread_recalc = True
            sub.save()

    if self.options['compute_scores']:
        stories = MStory.objects(story_feed_id=feed.pk,
                                 story_date__gte=UNREAD_CUTOFF)\
                        .read_preference(pymongo.ReadPreference.PRIMARY)
        stories = Feed.format_stories(stories, feed.pk)
        logging.debug(u' ---> [%-30s] ~FYComputing scores: ~SB%s stories~SN with ~SB%s subscribers ~SN(%s/%s/%s)' % (
                      feed.title[:30], len(stories), user_subs.count(),
                      feed.num_subscribers, feed.active_subscribers,
                      feed.premium_subscribers))
        self.calculate_feed_scores_with_stories(user_subs, stories)
    elif self.options.get('mongodb_replication_lag'):
        logging.debug(u' ---> [%-30s] ~BR~FYSkipping computing scores: ~SB%s seconds~SN of mongodb lag' % (
                      feed.title[:30], self.options.get('mongodb_replication_lag')))
def add_missing_feeds(self):
    all_feeds = self.flat()
    subs = [us.feed_id for us in
            UserSubscription.objects.filter(user=self.user).only('feed')]

    missing_subs = set(all_feeds) - set(subs)
    if missing_subs:
        logging.debug(" ---> %s is missing %s subs. Adding %s..." % (
                      self.user, len(missing_subs), missing_subs))
        for feed_id in missing_subs:
            feed = Feed.get_by_id(feed_id)
            if feed:
                us, _ = UserSubscription.objects.get_or_create(user=self.user, feed=feed, defaults={
                    'needs_unread_recalc': True
                })
                if not us.needs_unread_recalc:
                    us.needs_unread_recalc = True
                    us.save()

    missing_folder_feeds = set(subs) - set(all_feeds)
    if missing_folder_feeds:
        user_sub_folders = json.decode(self.folders)
        logging.debug(" ---> %s is missing %s folder feeds. Adding %s..." % (
                      self.user, len(missing_folder_feeds), missing_folder_feeds))
        for feed_id in missing_folder_feeds:
            feed = Feed.get_by_id(feed_id)
            if feed and feed.pk == feed_id:
                user_sub_folders = add_object_to_folder(feed_id, "", user_sub_folders)
        self.folders = json.encode(user_sub_folders)
        self.save()
def fetch_image_from_page_data(self):
    image = None
    image_file = None
    if self.page_data:
        content = self.page_data
    elif settings.BACKED_BY_AWS.get('pages_on_s3') and self.feed.s3_page:
        key = settings.S3_PAGES_BUCKET.get_key(self.feed.s3_pages_key)
        compressed_content = key.get_contents_as_string()
        stream = StringIO(compressed_content)
        gz = gzip.GzipFile(fileobj=stream)
        try:
            content = gz.read()
        except IOError:
            content = None
    else:
        content = MFeedPage.get_data(feed_id=self.feed.pk)
    url = self._url_from_html(content)
    if not url:
        try:
            content = requests.get(self.feed.feed_link).content
            url = self._url_from_html(content)
        except (AttributeError, SocketError, requests.ConnectionError,
                requests.models.MissingSchema, requests.sessions.InvalidSchema,
                requests.sessions.TooManyRedirects,
                requests.models.InvalidURL,
                requests.models.ChunkedEncodingError,
                requests.models.ContentDecodingError,
                LocationParseError, OpenSSLError, PyAsn1Error), e:
            logging.debug(" ---> ~SN~FRFailed~FY to fetch ~FGfeed icon~FY: %s" % e)
def process_response(self, request, response):
    if not self.activated(request):
        return response
    if connection.queries:
        time_elapsed = sum([float(q["time"]) for q in connection.queries])
        queries = connection.queries
        for query in queries:
            if query.get("mongo"):
                query["sql"] = "~FM%s: %s" % (query["mongo"]["collection"], query["mongo"]["query"])
            elif query.get("redis"):
                query["sql"] = "~FC%s" % (query["redis"]["query"])
            else:
                query["sql"] = re.sub(r"SELECT (.*?) FROM", "SELECT * FROM", query["sql"])
                query["sql"] = re.sub(r"SELECT", "~FYSELECT", query["sql"])
                query["sql"] = re.sub(r"INSERT", "~FGINSERT", query["sql"])
                query["sql"] = re.sub(r"UPDATE", "~FY~SBUPDATE", query["sql"])
                query["sql"] = re.sub(r"DELETE", "~FR~SBDELETE", query["sql"])
        t = Template(
            "{% for sql in sqllog %}{% if not forloop.first %} {% endif %}[{{forloop.counter}}] ~FC{{sql.time}}s~FW: {{sql.sql|safe}}{% if not forloop.last %}\n{% endif %}{% endfor %}"
        )
        if settings.DEBUG:
            logging.debug(t.render(Context({"sqllog": queries, "count": len(queries), "time": time_elapsed})))
        times_elapsed = {
            "sql": sum([float(q["time"]) for q in queries if not q.get("mongo") and not q.get("redis")]),
            "mongo": sum([float(q["time"]) for q in queries if q.get("mongo")]),
            "redis": sum([float(q["time"]) for q in queries if q.get("redis")]),
        }
        setattr(request, "sql_times_elapsed", times_elapsed)
    return response
def save_page(self, html):
    if html and len(html) > 100:
        if settings.BACKED_BY_AWS.get('pages_on_s3'):
            k = Key(settings.S3_PAGES_BUCKET)
            k.key = self.feed.s3_pages_key
            k.set_metadata('Content-Encoding', 'gzip')
            k.set_metadata('Content-Type', 'text/html')
            k.set_metadata('Access-Control-Allow-Origin', '*')
            out = StringIO.StringIO()
            f = gzip.GzipFile(fileobj=out, mode='w')
            f.write(html)
            f.close()
            compressed_html = out.getvalue()
            k.set_contents_from_string(compressed_html)
            k.set_acl('public-read')

            try:
                feed_page = MFeedPage.objects.get(feed_id=self.feed.pk)
                feed_page.delete()
                logging.debug(' --->> [%-30s] ~FYTransfering page data to S3...' % (self.feed))
            except MFeedPage.DoesNotExist:
                pass

            self.feed.s3_page = True
            self.feed.save()
        else:
            try:
                feed_page = MFeedPage.objects.get(feed_id=self.feed.pk)
                feed_page.page_data = html
                feed_page.save()
            except MFeedPage.DoesNotExist:
                feed_page = MFeedPage.objects.create(feed_id=self.feed.pk,
                                                     page_data=html)
            return feed_page
def check_urls_against_pushed_data(self, parsed):
    if hasattr(parsed.feed, 'links'):  # single notification
        hub_url = self.hub
        self_url = self.topic
        for link in parsed.feed.links:
            href = link.get('href', '')
            if any(w in href for w in ['wp-admin', 'wp-cron']):
                continue
            if link['rel'] == 'hub':
                hub_url = link['href']
            elif link['rel'] == 'self':
                self_url = link['href']

        needs_update = False
        if hub_url and self.hub != hub_url:
            # hub URL has changed; let's update our subscription
            needs_update = True
        elif self_url != self.topic:
            # topic URL has changed
            needs_update = True

        if needs_update:
            logging.debug(u' ---> [%-30s] ~FR~BKUpdating PuSH hub/topic: %s / %s' % (
                          unicode(self.feed)[:30], hub_url, self_url))
            expiration_time = self.lease_expires - datetime.now()
            seconds = expiration_time.days * 86400 + expiration_time.seconds
            PushSubscription.objects.subscribe(
                self_url, feed=self.feed, hub=hub_url,
                lease_seconds=seconds)
def fetch(self):
    """ Uses feedparser to download the feed. Will be parsed later. """
    identity = self.get_identity()
    log_msg = u'%2s ---> [%-30s] Fetching feed (%d)' % (identity,
                                                        unicode(self.feed)[:30],
                                                        self.feed.id)
    logging.debug(log_msg)

    self.feed.set_next_scheduled_update()
    etag = self.feed.etag
    modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None

    if self.options.get('force') or not self.feed.fetched_once:
        modified = None
        etag = None

    USER_AGENT = 'NewsBlur Feed Fetcher (%s subscriber%s) - %s' % (
        self.feed.num_subscribers,
        's' if self.feed.num_subscribers != 1 else '',
        URL
    )
    self.fpf = feedparser.parse(self.feed.feed_address,
                                agent=USER_AGENT,
                                etag=etag,
                                modified=modified)

    return FEED_OK, self.fpf
def do_backup(schedule, follow_links):
    '''Handles the backup.'''
    from shutil import rmtree
    import utils.filesystem

    if schedule == 'daily':
        backup_list = config.daily_backup_list
    elif schedule == 'weekly':
        backup_list = config.weekly_backup_list
    else:
        backup_list = config.monthly_backup_list

    try:
        files = utils.filesystem.read_file_list(backup_list)
        archive_path, tar_type = create_archive(files, follow_links)
        if config.enc_backup == True:
            # We don't add the enc extension to the key - the metadata
            # will tell us whether the archive is encrypted.
            enc_file = utils.encrypt.encrypt_file(config.enc_key,
                                                  archive_path,
                                                  config.enc_piece_size)
            send_backup(enc_file, tar_type, schedule)
            # Delete the plaintext local version
            os.remove(archive_path)
        else:
            # Not encrypting
            send_backup(archive_path, tar_type, schedule)
        if config.delete_archive_when_finished == True:
            log.debug('Deleting archive.')
            rmtree(config.dest_location)
    except IOError:
        log.critical('Cannot open file: %s' % backup_list)
        sys.exit(1)
def fetch(self):
    """ Downloads and parses a feed. """
    socket.setdefaulttimeout(30)
    identity = self.get_identity()
    log_msg = u'%2s ---> [%-30s] Fetching feed (%d)' % (identity,
                                                        unicode(self.feed)[:30],
                                                        self.feed.id)
    logging.debug(log_msg)

    # Check if feed still needs to be updated
    # feed = Feed.objects.get(pk=self.feed.pk)
    # if feed.next_scheduled_update > datetime.datetime.now() and not self.options.get('force'):
    #     log_msg = u' ---> Already fetched %s (%d)' % (self.feed.feed_title,
    #                                                   self.feed.id)
    #     logging.debug(log_msg)
    #     feed.save_feed_history(303, "Already fetched")
    #     return FEED_SAME, None
    # else:
    self.feed.set_next_scheduled_update()
    etag = self.feed.etag
    modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None

    if self.options.get('force'):
        modified = None
        etag = None

    self.fpf = feedparser.parse(self.feed.feed_address,
                                agent=USER_AGENT,
                                etag=etag,
                                modified=modified)

    return FEED_OK, self.fpf
def get(self, *args, **kwargs):
    try:
        return super(UserSubscriptionManager, self).get(*args, **kwargs)
    except self.model.DoesNotExist:
        if isinstance(kwargs.get('feed'), int):
            feed_id = kwargs.get('feed')
        elif 'feed' in kwargs:
            feed_id = kwargs['feed'].pk
        elif 'feed__pk' in kwargs:
            feed_id = kwargs['feed__pk']
        elif 'feed_id' in kwargs:
            feed_id = kwargs['feed_id']
        dupe_feed = DuplicateFeed.objects.filter(duplicate_feed_id=feed_id)
        if dupe_feed:
            feed = dupe_feed[0].feed
            if 'feed' in kwargs:
                kwargs['feed'] = feed
            elif 'feed__pk' in kwargs:
                kwargs['feed__pk'] = feed.pk
            elif 'feed_id' in kwargs:
                kwargs['feed_id'] = feed.pk
            user = kwargs.get('user')
            if isinstance(user, int):
                user = User.objects.get(pk=user)
            logging.debug(" ---> [%s] ~BRFound dupe UserSubscription: ~SB%s (%s)" % (
                          user and user.username, feed, feed_id))
            return super(UserSubscriptionManager, self).get(*args, **kwargs)
        else:
            exc_info = sys.exc_info()
            raise exc_info[0], None, exc_info[2]
def create_zip(archive, files):
    '''Creates a zip file containing the files being backed up.'''
    import zipfile
    from utils.misc import add_file_hash

    try:
        # zipfile always follows links
        with zipfile.ZipFile(archive, 'w') as zipf:
            zipf.comment = 'Created by s3-backup'
            for f in files:
                f = f.strip()
                if os.path.exists(f):
                    zipf.write(f)
                    add_file_hash(archive, f)
                    log.debug('Added %s.' % f)
                else:
                    log.error('%s does not exist.' % f)
            if zipf.testzip() != None:
                log.error('An error occurred creating the zip archive.')
    except zipfile.BadZipfile:
        # I assume this only happens on reads? Just in case...
        log.critical('The zip file is corrupt.')
    except zipfile.LargeZipFile:
        log.critical('The zip file is greater than 2 GB.'
                     ' Enable zip64 functionality.')
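# Usage sketch (an assumption, not part of the original tool): `backup.list` is a
# hypothetical text file with one path per line, which is why create_zip() strips
# each entry before adding it to the archive.
with open('backup.list') as listing:
    create_zip('/tmp/daily-backup.zip', listing.readlines())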
def save_page(self, html):
    saved = False
    if not html or len(html) < 100:
        return

    if settings.BACKED_BY_AWS.get('pages_on_node'):
        saved = self.save_page_node(html)
        if saved and self.feed.s3_page and settings.BACKED_BY_AWS.get('pages_on_s3'):
            self.delete_page_s3()

    if settings.BACKED_BY_AWS.get('pages_on_s3') and not saved:
        saved = self.save_page_s3(html)

    if not saved:
        try:
            feed_page = MFeedPage.objects.get(feed_id=self.feed.pk)
            # feed_page.page_data = html.encode('utf-8')
            if feed_page.page() == html:
                logging.debug(' ---> [%-30s] ~FYNo change in page data: %s' % (
                              self.feed.log_title[:30], self.feed.feed_link))
            else:
                feed_page.page_data = html
                feed_page.save()
        except MFeedPage.DoesNotExist:
            feed_page = MFeedPage.objects.create(feed_id=self.feed.pk,
                                                 page_data=html)
        return feed_page
def fetch(self):
    """ Uses feedparser to download the feed. Will be parsed later. """
    identity = self.get_identity()
    log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (
        identity,
        unicode(self.feed)[:30],
        self.feed.id,
        datetime.datetime.now() - self.feed.last_update)
    logging.debug(log_msg)

    self.feed.set_next_scheduled_update()
    etag = self.feed.etag
    modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None

    if self.options.get('force') or not self.feed.fetched_once:
        modified = None
        etag = None

    USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3 (NewsBlur Feed Fetcher - %s subscriber%s - %s)' % (
        self.feed.num_subscribers,
        's' if self.feed.num_subscribers != 1 else '',
        settings.NEWSBLUR_URL
    )
    self.fpf = feedparser.parse(self.feed.feed_address,
                                agent=USER_AGENT,
                                etag=etag,
                                modified=modified)

    return FEED_OK, self.fpf
def count_unreads_for_subscribers(self, feed):
    UNREAD_CUTOFF = datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
    user_subs = UserSubscription.objects.filter(
        feed=feed, active=True, user__profile__last_seen_on__gte=UNREAD_CUTOFF
    ).order_by("-last_read_date")
    logging.debug(
        u" ---> [%-30s] Computing scores: %s (%s/%s/%s) subscribers"
        % (
            unicode(feed)[:30],
            user_subs.count(),
            feed.num_subscribers,
            feed.active_subscribers,
            feed.premium_subscribers,
        )
    )

    stories_db = MStory.objects(story_feed_id=feed.pk, story_date__gte=UNREAD_CUTOFF)
    for sub in user_subs:
        cache.delete("usersub:%s" % sub.user_id)
        sub.needs_unread_recalc = True
        sub.save()

    if self.options["compute_scores"]:
        for sub in user_subs:
            silent = False if self.options["verbose"] >= 2 else True
            sub.calculate_feed_scores(silent=silent, stories_db=stories_db)
def mark_story_as_read(request):
    story_ids = request.REQUEST.getlist('story_id')
    feed_id = int(request.REQUEST['feed_id'])

    usersub = UserSubscription.objects.select_related('feed').get(user=request.user, feed=feed_id)
    if not usersub.needs_unread_recalc:
        usersub.needs_unread_recalc = True
        usersub.save()

    data = dict(code=0, payload=story_ids)

    if len(story_ids) > 1:
        logging.debug(" ---> [%s] Read %s stories in feed: %s" % (request.user, len(story_ids), usersub.feed))
    else:
        logging.debug(" ---> [%s] Read story in feed: %s" % (request.user, usersub.feed))

    for story_id in story_ids:
        story = MStory.objects(story_feed_id=feed_id, story_guid=story_id)[0]
        now = datetime.datetime.utcnow()
        m = MUserStory(story=story, user_id=request.user.pk, feed_id=feed_id, read_date=now)
        try:
            m.save()
        except OperationError:
            logging.info(' ---> [%s] *** Marked story as read: Duplicate Story -> %s' % (request.user, story_id))

    return data
def process_response(self, request, response):
    if not self.activated(request):
        return response
    if connection.queries:
        time_elapsed = sum([float(q['time']) for q in connection.queries])
        queries = connection.queries
        for query in queries:
            if query.get('mongo'):
                query['sql'] = "~FM%s: %s" % (query['mongo']['collection'], query['mongo']['query'])
            elif query.get('redis'):
                query['sql'] = "~FC%s" % (query['redis']['query'])
            else:
                query['sql'] = re.sub(r'SELECT (.*?) FROM', 'SELECT * FROM', query['sql'])
                query['sql'] = re.sub(r'SELECT', '~FYSELECT', query['sql'])
                query['sql'] = re.sub(r'INSERT', '~FGINSERT', query['sql'])
                query['sql'] = re.sub(r'UPDATE', '~FY~SBUPDATE', query['sql'])
                query['sql'] = re.sub(r'DELETE', '~FR~SBDELETE', query['sql'])
        t = Template("{% for sql in sqllog %}{% if not forloop.first %} {% endif %}[{{forloop.counter}}] ~FC{{sql.time}}s~FW: {{sql.sql|safe}}{% if not forloop.last %}\n{% endif %}{% endfor %}")
        if settings.DEBUG:
            logging.debug(t.render(Context({
                'sqllog': queries,
                'count': len(queries),
                'time': time_elapsed,
            })))
        times_elapsed = {
            'sql': sum([float(q['time']) for q in queries
                        if not q.get('mongo') and not q.get('redis')]),
            'mongo': sum([float(q['time']) for q in queries if q.get('mongo')]),
            'redis': sum([float(q['time']) for q in queries if q.get('redis')]),
        }
        setattr(request, 'sql_times_elapsed', times_elapsed)
    return response
def query(cls, text):
    try:
        cls.ES.default_indices = cls.index_name()
        cls.ES.indices.refresh()
    except pyes.exceptions.NoServerAvailable:
        logging.debug(" ***> ~FRNo search server available.")
        return []

    logging.info("~FGSearch ~FCfeeds~FG by address: ~SB%s" % text)
    q = MatchQuery('address', text, operator="and", type="phrase")
    results = cls.ES.search(query=q, sort="num_subscribers:desc", size=5,
                            doc_types=[cls.type_name()])

    if not results.total:
        logging.info("~FGSearch ~FCfeeds~FG by title: ~SB%s" % text)
        q = MatchQuery('title', text, operator="and")
        results = cls.ES.search(query=q, sort="num_subscribers:desc", size=5,
                                doc_types=[cls.type_name()])

    if not results.total:
        logging.info("~FGSearch ~FCfeeds~FG by link: ~SB%s" % text)
        q = MatchQuery('link', text, operator="and")
        results = cls.ES.search(query=q, sort="num_subscribers:desc", size=5,
                                doc_types=[cls.type_name()])

    return results
def add_file_by_id(self, translation_path, file_id_str):
    """Add a single file.

    Args:
        translation_path (str): path to the translation files
        file_id_str (str): file id
    """
    lang_groups = {}

    # English
    file_path = os.path.join(translation_path, 'en.%s.lang.csv' % file_id_str)
    if os.path.isfile(file_path):
        for line in load_lang_csv(file_path, skip_header=False):
            file_id, unknown, index, offset = [int(v) for v in line[0:4]]
            origin = line[4]
            if index not in lang_groups.keys():
                # a new index: create a new group
                lang_groups[index] = LangGroup(index)
            lang_groups[index].add(file_id, unknown, index, offset, origin)

    # Japanese
    file_path_jp = os.path.join(translation_path, 'jp.%s.lang.csv' % file_id_str)
    if os.path.isfile(file_path_jp):
        for line in load_lang_csv(file_path_jp, skip_header=False):
            file_id, unknown, index, offset = [int(v) for v in line[0:4]]
            origin_jp = line[4]
            if index not in lang_groups.keys():
                # an index not seen in the English file: discard it
                log.debug('new index from jp: %s' % str(line[0:4]))
                continue
            lang_groups[index].add_jp(file_id, unknown, index, offset, origin_jp)

    # register the groups
    self.all_lang_groups[file_id_str] = lang_groups
def count_unreads_for_subscribers(self, feed):
    UNREAD_CUTOFF = datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
    user_subs = UserSubscription.objects.filter(feed=feed,
                                                active=True,
                                                user__profile__last_seen_on__gte=UNREAD_CUTOFF)\
                                        .order_by('-last_read_date')
    logging.debug(u' ---> [%-30s] Computing scores: %s (%s/%s/%s) subscribers' % (
                  unicode(feed)[:30], user_subs.count(),
                  feed.num_subscribers, feed.active_subscribers,
                  feed.premium_subscribers))

    if self.options['slave_db']:
        slave_db = self.options['slave_db']
        stories_db_orig = slave_db.stories.find({
            "story_feed_id": feed.pk,
            "story_date": {
                "$gte": UNREAD_CUTOFF,
            },
        })
        stories_db = []
        for story in stories_db_orig:
            stories_db.append(bunch(story))
    else:
        stories_db = MStory.objects(story_feed_id=feed.pk,
                                    story_date__gte=UNREAD_CUTOFF)

    for sub in user_subs:
        cache.delete('usersub:%s' % sub.user_id)
        sub.needs_unread_recalc = True
        sub.save()

    if self.options['compute_scores']:
        for sub in user_subs:
            silent = False if self.options['verbose'] >= 2 else True
            sub.calculate_feed_scores(silent=silent, stories_db=stories_db)
def collect_files(self, task_id=None):
    t1 = time.clock()
    self.files(self.path)
    self.result['no_extension'] = {'file_count': 0, 'file_list': []}
    for extension, values in self.type_nums.iteritems():
        extension = extension.strip()
        self.result[extension] = {'file_count': len(values), 'file_list': []}
        # .php : 123
        log.debug('{0} : {1}'.format(extension, len(values)))
        if task_id is not None:
            # Store
            ext = CobraExt(task_id, extension, len(values))
            db.session.add(ext)
        for f in self.file:
            es = f.split(os.extsep)
            if len(es) >= 2:
                # Exists Extension
                # os.extsep + es[len(es) - 1]
                if f.endswith(extension):
                    self.result[extension]['file_list'].append(f)
            else:
                # Didn't have extension
                self.result['no_extension']['file_count'] = int(self.result['no_extension']['file_count']) + 1
                self.result['no_extension']['file_list'].append(f)
    if task_id is not None:
        db.session.commit()
    t2 = time.clock()
    self.result['file_nums'] = self.file_id
    self.result['collect_time'] = t2 - t1
    return self.result
                    ' ---> [%-30s] ~FRFeed throws HTTP error: ~SB%s' % (
                        unicode(feed_id)[:30], e.fp.read()))
                feed.save_feed_history(e.code, e.msg, e.fp.read())
                fetched_feed = None
            except Feed.DoesNotExist, e:
                logging.debug(' ---> [%-30s] ~FRFeed is now gone...' % (unicode(feed_id)[:30]))
                continue
            except TimeoutError, e:
                logging.debug(' ---> [%-30s] ~FRFeed fetch timed out...' % (feed.title[:30]))
                feed.save_feed_history(505, 'Timeout', e)
                feed_code = 505
                fetched_feed = None
            except Exception, e:
                logging.debug('[%d] ! -------------------------' % (feed_id, ))
                tb = traceback.format_exc()
                logging.error(tb)
                logging.debug('[%d] ! -------------------------' % (feed_id, ))
                ret_feed = FEED_ERREXC
                feed = Feed.get_by_id(getattr(feed, 'pk', feed_id))
                if not feed:
                    continue
                feed.save_feed_history(500, "Error", tb)
                feed_code = 500
                fetched_feed = None
                # mail_feed_error_to_admin(feed, e, local_vars=locals())
                if (not settings.DEBUG and hasattr(settings, 'RAVEN_CLIENT') and
                        settings.RAVEN_CLIENT):
                    settings.RAVEN_CLIENT.captureException()

            if not feed_code:
def __init__(self, client):
    super().__init__()
    log.debug('init recv thread...')
    self.client = client
def run(self):
    local_ip, local_port = get_ip_address()
    log.info(f'local IP Address: {local_ip}, {local_port}')
    message = local_ip + SEPERATOR \
        + str(local_port) + SEPERATOR \
        + self.client.serial

    TIMEOUT_S = 60
    PING_INTERVAL_S = 30
    LOOP_INTERVAL_S = 5

    conn_flag = False
    server_peer = Peer(self.client.server_ip, self.client.server_port)

    while not self.is_interrupted():
        current_time = get_current_time_sec()

        with self.client.mutex_for_kcp_peer_map:
            for kcp_peer in list(self.kcp_peer_map.values()):
                if kcp_peer.last_ping + TIMEOUT_S < current_time:
                    if kcp_peer.peer == server_peer:
                        continue
                    log.info(f'removed kcp_peer: {kcp_peer.peer.key}')
                    del self.kcp_peer_map[kcp_peer.peer.key]
                elif kcp_peer.last_ping + PING_INTERVAL_S < current_time:
                    message_wrapper = MessageWrapper(
                        registered_types=self.client.registered_types,
                        message=None,
                        message_type=MessageType.RAWBYTE,
                        packet_type=RM.PING_REQUEST,
                        connection_id=0)
                    kcp_peer.send(message_wrapper)

        with self.client.mutex_for_rendvs_sess_map:
            for key, rendvs_sess in list(self.rendvs_sess_map.items()):
                if rendvs_sess is None:
                    continue
                if rendvs_sess.relay_kcp_peer is not None:
                    last_ping = rendvs_sess.relay_kcp_peer.last_ping
                    if last_ping + TIMEOUT_S < current_time:
                        log.warning(f'relay peer removed, {last_ping}')
                        rendvs_sess.relay_kcp_peer = None
                if rendvs_sess.public_kcp_peer is not None:
                    last_ping = rendvs_sess.public_kcp_peer.last_ping
                    if last_ping + TIMEOUT_S < current_time:
                        log.warning(f'public peer removed, {last_ping}')
                        rendvs_sess.public_kcp_peer = None
                if rendvs_sess.private_kcp_peer is not None:
                    last_ping = rendvs_sess.private_kcp_peer.last_ping
                    if last_ping + TIMEOUT_S < current_time:
                        log.warning(f'private peer removed, {last_ping}')
                        rendvs_sess.private_kcp_peer = None
                if not rendvs_sess.is_connected():
                    del self.rendvs_sess_map[key]
                    if self.client.on_disconnected is not None:
                        self.client.on_disconnected(rendvs_sess)
                    log.info(f'Disconnected, connectionID='
                             f'{rendvs_sess.connection_id}')

        if not self.client.is_connected:
            if conn_flag:
                with self.client.mutex_for_kcp_peer_map:
                    del self.client.kcp_peer_map[server_peer.key]
                self.client.sock = create_udp_socket()
                self.client.on_server_connect_failed()
            else:
                conn_flag = True
                self.client.on_server_connecting()
            message_wrapper = MessageWrapper(
                registered_types=self.client.registered_types,
                message=message,
                message_type=MessageType.RAWBYTE,
                packet_type=RM.REGISTRATION_RENDEZVOUS_CLIENT_REQUEST,
                connection_id=0)
            self.client.get_kcp_peer(server_peer).send(message_wrapper)
        elif self.client.get_kcp_peer(server_peer).last_ping \
                + TIMEOUT_S < current_time:
            with self.client.mutex_for_kcp_peer_map:
                del self.client.kcp_peer_map[server_peer.key]
            conn_flag = False
            self.client.is_connected = False
            self.client.sock = create_udp_socket()
            self.client.on_server_disconnected()
        else:
            message_wrapper = MessageWrapper(
                registered_types=self.client.registered_types,
                message=message,
                message_type=MessageType.RAWBYTE,
                packet_type=RM.REGISTRATION_RENDEZVOUS_CLIENT_REQUEST,
                connection_id=0)
            self.client.get_kcp_peer(server_peer).send(message_wrapper)

        time.sleep(LOOP_INTERVAL_S)

    log.debug('registerThread finished')
    return
async def post(self, *args, **kwargs):
    """ create proxies """
    datas = self.get_body()
    logger.debug('datas:', datas, caller=self)
    self.do_success({'ok': 1}, 'todo')
def check_string(text_to_check):
    """Check whether a string follows the markup rules and log any errors.

    Args:
        text_to_check (str): the string to check

    Returns:
        return (bool): True if the string is well formed
    """
    if text_to_check == '':
        return True

    stack = {'<>': 0, 'c': 0, 't': 0}
    i = 0
    len_text = len(text_to_check)
    while i < len_text:
        curr_char = text_to_check[i]
        # matching of <<>>
        if curr_char == '<':
            if i + 1 < len_text and text_to_check[i + 1] == '<':
                # might just be a lone < character
                stack['<>'] += 1
                i += 1
        elif curr_char == '>':
            if i + 1 < len_text and text_to_check[i + 1] == '>':
                stack['<>'] -= 1
                i += 1
        elif curr_char == '|':
            # color
            if text_to_check[i + 1] == 'c':
                search_not_match = re.compile(r'[^0-9a-fA-F]').search
                if search_not_match(text_to_check[i + 2:i + 2 + 6]):
                    # contains characters other than a color code
                    log.debug('find error: color |c')
                    return False
                stack['c'] += 1
                i += 7
            # end of color
            elif text_to_check[i + 1] == 'r':
                stack['c'] -= 1
                i += 1
            # invocation
            elif text_to_check[i + 1] == 't':
                if stack['t'] == 0:
                    stack['t'] += 1
                    i += 1
                else:
                    stack['t'] -= 1
                    i += 1
        elif curr_char == '\\':
            if text_to_check[i + 1] not in r'\n"':
                log.debug(r'find error: usage of \: is it \\ or \n?')
                return False
            i += 1
        # contents between <<>> are not checked yet

        # count checks
        # |c |r usage seems to be fairly loose
        if stack['<>'] < 0 or stack['<>'] > 1 or stack['c'] < -1 or stack[
                'c'] > 1 or stack['t'] > 1:
            log.debug('find error: <<>>, |c|r not match')
            return False
        # iterate
        i += 1

    # final matching check
    if stack['<>'] != 0 or stack['c'] < -1 or stack['c'] > 1 or stack['t'] != 0:
        log.debug('find error: <<>>, |c|r not match')
        return False
    return True
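# Usage sketch (an assumption, not from the original tool): a well-formed string
# with a |cFFFFFF...|r color span and a <<1>> placeholder passes, while a bad
# color code is rejected.
assert check_string('|cFFFFFF<<1>>|r has joined the group.')
assert not check_string('|cZZZZZZbad color|r')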
def get_driver(self):
    debug('webdriver -> getting and returning the current driver instance.')
    return self.driver
def ReimportStripeHistory():
    logging.debug(" ---> Reimporting Stripe history...")
    Profile.reimport_stripe_history(limit=10, days=1)
def CleanSpam():
    logging.debug(" ---> Finding spammers...")
    Profile.clear_dead_spammers(confirm=True)
class Dispatcher:
    def __init__(self, options, num_threads):
        self.options = options
        self.feed_stats = {
            FEED_OK: 0,
            FEED_SAME: 0,
            FEED_ERRPARSE: 0,
            FEED_ERRHTTP: 0,
            FEED_ERREXC: 0
        }
        self.feed_trans = {
            FEED_OK: 'ok',
            FEED_SAME: 'unchanged',
            FEED_ERRPARSE: 'cant_parse',
            FEED_ERRHTTP: 'http_error',
            FEED_ERREXC: 'exception'
        }
        self.feed_keys = sorted(self.feed_trans.keys())
        self.num_threads = num_threads
        self.time_start = datetime.datetime.utcnow()
        self.workers = []

    def refresh_feed(self, feed_id):
        """Update feed, since it may have changed"""
        return Feed.objects.using('default').get(pk=feed_id)

    def process_feed_wrapper(self, feed_queue):
        delta = None
        current_process = multiprocessing.current_process()
        identity = "X"
        feed = None
        if current_process._identity:
            identity = current_process._identity[0]

        for feed_id in feed_queue:
            start_duration = time.time()
            feed_fetch_duration = None
            feed_process_duration = None
            page_duration = None
            icon_duration = None
            feed_code = None
            ret_entries = None
            start_time = time.time()
            ret_feed = FEED_ERREXC
            try:
                feed = self.refresh_feed(feed_id)

                skip = False
                if self.options.get('fake'):
                    skip = True
                    weight = "-"
                    quick = "-"
                    rand = "-"
                elif (self.options.get('quick') and not self.options['force'] and
                      feed.known_good and feed.fetched_once and not feed.is_push):
                    weight = feed.stories_last_month * feed.num_subscribers
                    random_weight = random.randint(1, max(weight, 1))
                    quick = float(self.options.get('quick', 0))
                    rand = random.random()
                    if random_weight < 100 and rand < quick:
                        skip = True
                if skip:
                    logging.debug(' ---> [%-30s] ~BGFaking fetch, skipping (%s/month, %s subs, %s < %s)...' % (
                                  feed.title[:30], weight,
                                  feed.num_subscribers, rand, quick))
                    continue

                ffeed = FetchFeed(feed_id, self.options)
                ret_feed, fetched_feed = ffeed.fetch()
                feed_fetch_duration = time.time() - start_duration

                if ((fetched_feed and ret_feed == FEED_OK) or self.options['force']):
                    pfeed = ProcessFeed(feed_id, fetched_feed, self.options)
                    ret_feed, ret_entries = pfeed.process()
                    feed = pfeed.feed
                    feed_process_duration = time.time() - start_duration

                    if (ret_entries and ret_entries['new']) or self.options['force']:
                        start = time.time()
                        if not feed.known_good or not feed.fetched_once:
                            feed.known_good = True
                            feed.fetched_once = True
                            feed = feed.save()
                        if self.options['force'] or random.random() <= 0.02:
                            logging.debug(' ---> [%-30s] ~FBPerforming feed cleanup...' % (feed.title[:30],))
                            start_cleanup = time.time()
                            feed.sync_redis()
                            logging.debug(' ---> [%-30s] ~FBDone with feed cleanup. Took ~SB%.4s~SN sec.' % (
                                          feed.title[:30], time.time() - start_cleanup))
                        try:
                            self.count_unreads_for_subscribers(feed)
                        except TimeoutError:
                            logging.debug(' ---> [%-30s] Unread count took too long...' % (feed.title[:30],))
                        if self.options['verbose']:
                            logging.debug(u' ---> [%-30s] ~FBTIME: unread count in ~FM%.4ss' % (
                                          feed.title[:30], time.time() - start))
            except urllib2.HTTPError, e:
                logging.debug(' ---> [%-30s] ~FRFeed throws HTTP error: ~SB%s' % (
                              unicode(feed_id)[:30], e.fp.read()))
                feed.save_feed_history(e.code, e.msg, e.fp.read())
                fetched_feed = None
            except Feed.DoesNotExist, e:
                logging.debug(' ---> [%-30s] ~FRFeed is now gone...' % (unicode(feed_id)[:30]))
                continue
            except TimeoutError, e:
                logging.debug(' ---> [%-30s] ~FRFeed fetch timed out...' % (feed.title[:30]))
                feed.save_feed_history(505, 'Timeout', e)
                feed_code = 505
                fetched_feed = None
def process(self):
    """ Downloads and parses a feed. """
    start = time.time()
    self.refresh_feed()

    ret_values = dict(new=0, updated=0, same=0, error=0)

    # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title))

    if hasattr(self.fpf, 'status'):
        if self.options['verbose']:
            if self.fpf.bozo and self.fpf.status != 304:
                logging.debug(u' ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % (
                              self.feed.title[:30],
                              self.fpf.bozo_exception,
                              len(self.fpf.entries)))
        if self.fpf.status == 304:
            self.feed = self.feed.save()
            self.feed.save_feed_history(304, "Not modified")
            return FEED_SAME, ret_values

        # 302: Temporary redirect: ignore
        # 301: Permanent redirect: save it
        if self.fpf.status == 301:
            if not self.fpf.href.endswith('feedburner.com/atom.xml'):
                self.feed.feed_address = self.fpf.href
            if not self.feed.known_good:
                self.feed.fetched_once = True
                logging.debug(" ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (
                              self.feed.title[:30], self.fpf.status))
                self.feed = self.feed.schedule_feed_fetch_immediately()
            if not self.fpf.entries:
                self.feed = self.feed.save()
                self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
                return FEED_ERRHTTP, ret_values
        if self.fpf.status >= 400:
            logging.debug(" ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (
                          self.feed.title[:30], self.fpf.status))
            fixed_feed = None
            if not self.feed.known_good:
                fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
            if not fixed_feed:
                self.feed.save_feed_history(self.fpf.status, "HTTP Error")
            else:
                self.feed = feed
            self.feed = self.feed.save()
            return FEED_ERRHTTP, ret_values

    if not self.fpf.entries:
        if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
            logging.debug(" ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (
                          self.feed.title[:30], len(self.fpf.entries)))
            fixed_feed = None
            if not self.feed.known_good:
                fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
            if not fixed_feed:
                self.feed.save_feed_history(552, 'Non-xml feed', self.fpf.bozo_exception)
            else:
                self.feed = feed
            self.feed = self.feed.save()
            return FEED_ERRPARSE, ret_values
        elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
            logging.debug(" ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." % (
                          self.feed.title[:30], len(self.fpf.entries)))
            fixed_feed = None
            if not self.feed.known_good:
                fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
            if not fixed_feed:
                self.feed.save_feed_history(553, 'SAX Exception', self.fpf.bozo_exception)
            else:
                self.feed = feed
            self.feed = self.feed.save()
            return FEED_ERRPARSE, ret_values

    # the feed has changed (or it is the first time we parse it)
    # saving the etag and last_modified fields
    self.feed.etag = self.fpf.get('etag')
    if self.feed.etag:
        self.feed.etag = self.feed.etag[:255]
    # some times this is None (it never should) *sigh*
    if self.feed.etag is None:
        self.feed.etag = ''

    try:
        self.feed.last_modified = mtime(self.fpf.modified)
    except:
        self.feed.last_modified = None
        pass

    self.fpf.entries = self.fpf.entries[:100]

    if self.fpf.feed.get('title'):
        self.feed.feed_title = strip_tags(self.fpf.feed.get('title'))
    # Deleted by Xinyan Lu : No this table
    # tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
    # if tagline:
    #     self.feed.data.feed_tagline = utf8encode(tagline)
    #     self.feed.data.save()
    if not self.feed.feed_link_locked:
        self.feed.feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link

    self.feed = self.feed.save()

    # Compare new stories to existing stories, adding and updating
    start_date = datetime.datetime.utcnow()
    story_guids = []
    stories = []
    for entry in self.fpf.entries:
        story = pre_process_story(entry)
        if story.get('published') < start_date:
            start_date = story.get('published')
        stories.append(story)
        story_guids.append(story.get('guid'))

    existing_stories = dict((s.story_guid, s) for s in MStory.objects(
        # story_guid__in=story_guids,
        story_date__gte=start_date,
        story_feed_id=self.feed.pk
    ).limit(max(int(len(story_guids) * 1.5), 10)))

    ret_values = self.feed.add_update_stories(stories, existing_stories,
                                              verbose=self.options['verbose'])

    if (hasattr(self.fpf, 'feed') and
            hasattr(self.fpf.feed, 'links') and self.fpf.feed.links):
        hub_url = None
        self_url = self.feed.feed_address
        for link in self.fpf.feed.links:
            if link['rel'] == 'hub' and not hub_url:
                hub_url = link['href']
            elif link['rel'] == 'self':
                self_url = link['href']
        push_expired = False
        if self.feed.is_push:
            try:
                push_expired = self.feed.push.lease_expires < datetime.datetime.now()
            except PushSubscription.DoesNotExist:
                self.feed.is_push = False
        if (hub_url and self_url and not settings.DEBUG and
                self.feed.active_subscribers > 0 and
                (push_expired or not self.feed.is_push or self.options.get('force'))):
            logging.debug(u' ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s' % (
                          self.feed.title[:30],
                          "~SKRe-~SN" if push_expired else "", hub_url))
            try:
                PushSubscription.objects.subscribe(self_url,
                                                   feed=self.feed,
                                                   hub=hub_url)
            except TimeoutError:
                logging.debug(u' ---> [%-30s] ~BB~FW~FRTimed out~FW subscribing to PuSH hub: %s' % (
                              self.feed.title[:30], hub_url))
        elif (self.feed.is_push and
              (self.feed.active_subscribers <= 0 or not hub_url)):
            logging.debug(u' ---> [%-30s] ~BB~FWTurning off PuSH, no hub found' % (
                          self.feed.title[:30]))
            self.feed.is_push = False
            self.feed = self.feed.save()

    logging.debug(u' ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s' % (
                  self.feed.title[:30],
                  '~FG~SB' if ret_values['new'] else '', ret_values['new'],
                  '~FY~SB' if ret_values['updated'] else '', ret_values['updated'],
                  '~SB' if ret_values['same'] else '', ret_values['same'],
                  '~FR~SB' if ret_values['error'] else '', ret_values['error'],
                  len(self.fpf.entries)))
    self.feed.update_all_statistics(full=bool(ret_values['new']),
                                    force=self.options['force'])
    if ret_values['new']:
        self.feed.trim_feed()
        self.feed.expire_redis()
    self.feed.save_feed_history(200, "OK")

    if self.options['verbose']:
        logging.debug(u' ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' % (
                      self.feed.title[:30], time.time() - start))

    return FEED_OK, ret_values
def refresh_feed(self):
    self.feed = Feed.get_by_id(self.feed_id)
    if self.feed_id != self.feed.pk:
        logging.debug(" ***> Feed has changed: from %s to %s" % (self.feed_id, self.feed.pk))
        self.feed_id = self.feed.pk
def process(self):
    """ Downloads and parses a feed. """
    start = time.time()
    self.refresh_feed()

    ret_values = dict(new=0, updated=0, same=0, error=0)

    if hasattr(self.fpf, 'status'):
        if self.options['verbose']:
            if self.fpf.bozo and self.fpf.status != 304:
                logging.debug(u' ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % (
                              self.feed.log_title[:30],
                              self.fpf.bozo_exception,
                              len(self.fpf.entries)))
        if self.fpf.status == 304:
            self.feed = self.feed.save()
            self.feed.save_feed_history(304, "Not modified")
            return FEED_SAME, ret_values

        # 302 and 307: Temporary redirect: ignore
        # 301 and 308: Permanent redirect: save it (after 10 tries)
        if self.fpf.status == 301 or self.fpf.status == 308:
            if self.fpf.href.endswith('feedburner.com/atom.xml'):
                return FEED_ERRHTTP, ret_values
            redirects, non_redirects = self.feed.count_redirects_in_history('feed')
            self.feed.save_feed_history(self.fpf.status,
                                        "HTTP Redirect (%d to go)" % (10 - len(redirects)))
            if len(redirects) >= 10 or len(non_redirects) == 0:
                address = self.fpf.href
                if self.options['force'] and address:
                    address = qurl(address, remove=['_'])
                self.feed.feed_address = address
            if not self.feed.known_good:
                self.feed.fetched_once = True
                logging.debug(" ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (
                              self.feed.log_title[:30], self.fpf.status))
                self.feed = self.feed.schedule_feed_fetch_immediately()
            if not self.fpf.entries:
                self.feed = self.feed.save()
                self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
                return FEED_ERRHTTP, ret_values
        if self.fpf.status >= 400:
            logging.debug(" ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (
                          self.feed.log_title[:30], self.fpf.status))
            fixed_feed = None
            if not self.feed.known_good:
                fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
            if not fixed_feed:
                self.feed.save_feed_history(self.fpf.status, "HTTP Error")
            else:
                self.feed = feed
            self.feed = self.feed.save()
            return FEED_ERRHTTP, ret_values

    if not self.fpf:
        logging.debug(" ---> [%-30s] ~SB~FRFeed is Non-XML. No feedparser feed either!" % (
                      self.feed.log_title[:30]))
        self.feed.save_feed_history(551, "Broken feed")
        return FEED_ERRHTTP, ret_values

    if self.fpf and not self.fpf.entries:
        if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
            logging.debug(" ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (
                          self.feed.log_title[:30], len(self.fpf.entries)))
            fixed_feed = None
            if not self.feed.known_good:
                fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
            if not fixed_feed:
                self.feed.save_feed_history(552, 'Non-xml feed', self.fpf.bozo_exception)
            else:
                self.feed = feed
            self.feed = self.feed.save()
            return FEED_ERRPARSE, ret_values
        elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
            logging.debug(" ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." % (
                          self.feed.log_title[:30], len(self.fpf.entries)))
            fixed_feed = None
            if not self.feed.known_good:
                fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
            if not fixed_feed:
                self.feed.save_feed_history(553, 'Not an RSS feed', self.fpf.bozo_exception)
            else:
                self.feed = feed
            self.feed = self.feed.save()
            return FEED_ERRPARSE, ret_values

    # the feed has changed (or it is the first time we parse it)
    # saving the etag and last_modified fields
    original_etag = self.feed.etag
    self.feed.etag = self.fpf.get('etag')
    if self.feed.etag:
        self.feed.etag = self.feed.etag[:255]
    # some times this is None (it never should) *sigh*
    if self.feed.etag is None:
        self.feed.etag = ''
    if self.feed.etag != original_etag:
        self.feed.save(update_fields=['etag'])

    original_last_modified = self.feed.last_modified
    if hasattr(self.fpf, 'modified') and self.fpf.modified:
        try:
            self.feed.last_modified = datetime.datetime.strptime(self.fpf.modified,
                                                                 '%a, %d %b %Y %H:%M:%S %Z')
        except Exception, e:
            self.feed.last_modified = None
            logging.debug("Broken mtime %s: %s" % (self.feed.last_modified, e))
            pass
# -*- encoding:utf-8 -*-
from utils.log import debug
from utils.abs import Singleton
from config import DRIVER
from selenium import webdriver


@Singleton
class WebDriver():
    driver = None

    def __init__(self):
        if self.driver == None:
            debug('webdriver -> initializing driver: %s' % DRIVER)
            if DRIVER == 'Chrome':
                self.driver = webdriver.Chrome()
            else:
                self.driver = webdriver.Firefox()
        else:
            debug('webdriver -> %s driver is already instantiated.' % DRIVER)

    def get_driver(self):
        debug('webdriver -> getting and returning the current driver instance.')
        return self.driver


driver = WebDriver().get_driver()
debug('webdriver.py -> instantiated and returned the current driver instance: %s' % str(driver))
def fetch_youtube(self, address):
    username = None
    channel_id = None
    list_id = None

    if 'gdata.youtube.com' in address:
        try:
            username_groups = re.search('gdata.youtube.com/feeds/\w+/users/(\w+)/', address)
            if not username_groups:
                return
            username = username_groups.group(1)
        except IndexError:
            return
    elif 'youtube.com/feeds/videos.xml?user=' in address:
        try:
            username = urlparse.parse_qs(urlparse.urlparse(address).query)['user'][0]
        except IndexError:
            return
    elif 'youtube.com/feeds/videos.xml?channel_id=' in address:
        try:
            channel_id = urlparse.parse_qs(urlparse.urlparse(address).query)['channel_id'][0]
        except (IndexError, KeyError):
            return
    elif 'youtube.com/playlist' in address:
        try:
            list_id = urlparse.parse_qs(urlparse.urlparse(address).query)['list'][0]
        except IndexError:
            return
    elif 'youtube.com/feeds/videos.xml?playlist_id' in address:
        try:
            list_id = urlparse.parse_qs(urlparse.urlparse(address).query)['playlist_id'][0]
        except IndexError:
            return

    if channel_id:
        video_ids_xml = requests.get("https://www.youtube.com/feeds/videos.xml?channel_id=%s" % channel_id,
                                     verify=False)
        channel_json = requests.get("https://www.googleapis.com/youtube/v3/channels?part=snippet&id=%s&key=%s" %
                                    (channel_id, settings.YOUTUBE_API_KEY))
        channel = json.decode(channel_json.content)
        try:
            username = channel['items'][0]['snippet']['title']
            description = channel['items'][0]['snippet']['description']
        except (IndexError, KeyError):
            return
    elif list_id:
        playlist_json = requests.get("https://www.googleapis.com/youtube/v3/playlists?part=snippet&id=%s&key=%s" %
                                     (list_id, settings.YOUTUBE_API_KEY))
        playlist = json.decode(playlist_json.content)
        try:
            username = playlist['items'][0]['snippet']['title']
            description = playlist['items'][0]['snippet']['description']
        except (IndexError, KeyError):
            return
        channel_url = "https://www.youtube.com/playlist?list=%s" % list_id
    elif username:
        video_ids_xml = requests.get("https://www.youtube.com/feeds/videos.xml?user=%s" % username,
                                     verify=False)
        description = "YouTube videos uploaded by %s" % username
    else:
        return

    if list_id:
        playlist_json = requests.get("https://www.googleapis.com/youtube/v3/playlistItems?part=snippet&playlistId=%s&key=%s" %
                                     (list_id, settings.YOUTUBE_API_KEY))
        playlist = json.decode(playlist_json.content)
        try:
            video_ids = [video['snippet']['resourceId']['videoId']
                         for video in playlist['items']]
        except (IndexError, KeyError):
            return
    else:
        if video_ids_xml.status_code != 200:
            return
        video_ids_soup = BeautifulSoup(video_ids_xml.content)
        channel_url = video_ids_soup.find('author').find('uri').getText()
        video_ids = []
        for video_id in video_ids_soup.findAll('yt:videoid'):
            video_ids.append(video_id.getText())

    videos_json = requests.get("https://www.googleapis.com/youtube/v3/videos?part=contentDetails%%2Csnippet&id=%s&key=%s" %
                               (','.join(video_ids), settings.YOUTUBE_API_KEY))
    videos = json.decode(videos_json.content)
    if 'error' in videos:
        logging.debug(" ***> ~FRYoutube returned an error: ~FM~SB%s" % (videos))
        return

    data = {}
    data['title'] = ("%s's YouTube Videos" % username if 'Uploads' not in username else username)
    data['link'] = channel_url
    data['description'] = description
    data['lastBuildDate'] = datetime.datetime.utcnow()
    data['generator'] = 'NewsBlur YouTube API v3 Decrapifier - %s' % settings.NEWSBLUR_URL
    data['docs'] = None
    data['feed_url'] = address
    rss = feedgenerator.Atom1Feed(**data)

    for video in videos['items']:
        thumbnail = video['snippet']['thumbnails'].get('maxres')
        if not thumbnail:
            thumbnail = video['snippet']['thumbnails'].get('high')
        if not thumbnail:
            thumbnail = video['snippet']['thumbnails'].get('medium')
        duration_sec = isodate.parse_duration(video['contentDetails']['duration']).seconds
        if duration_sec >= 3600:
            hours = (duration_sec / 3600)
            minutes = (duration_sec - (hours * 3600)) / 60
            seconds = duration_sec - (hours * 3600) - (minutes * 60)
            duration = "%s:%s:%s" % (hours,
                                     '{0:02d}'.format(minutes),
                                     '{0:02d}'.format(seconds))
        else:
            minutes = duration_sec / 60
            seconds = duration_sec - (minutes * 60)
            duration = "%s:%s" % ('{0:02d}'.format(minutes),
                                  '{0:02d}'.format(seconds))
        content = """<div class="NB-youtube-player"><iframe allowfullscreen="true" src="%s?iv_load_policy=3"></iframe></div>
                     <div class="NB-youtube-stats"><small>
                         <b>From:</b> <a href="%s">%s</a><br />
                         <b>Duration:</b> %s<br />
                     </small></div><hr>
                     <div class="NB-youtube-description">%s</div>
                     <img src="%s" style="display:none" />""" % (
            ("https://www.youtube.com/embed/" + video['id']),
            channel_url, username, duration,
            linkify(linebreaks(video['snippet']['description'])),
            thumbnail['url'] if thumbnail else "",
        )
        link = "http://www.youtube.com/watch?v=%s" % video['id']
        story_data = {
            'title': video['snippet']['title'],
            'link': link,
            'description': content,
            'author_name': username,
            'categories': [],
            'unique_id': "tag:youtube.com,2008:video:%s" % video['id'],
            'pubdate': dateutil.parser.parse(video['snippet']['publishedAt']),
        }
        rss.add_item(**story_data)

    return rss.writeString('utf-8')
def fetch(self): """ Uses requests to download the feed, parsing it in feedparser. Will be storified later. """ start = time.time() identity = self.get_identity() log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % ( identity, self.feed.log_title[:30], self.feed.id, datetime.datetime.now() - self.feed.last_update) logging.debug(log_msg) etag = self.feed.etag modified = self.feed.last_modified.utctimetuple( )[:7] if self.feed.last_modified else None address = self.feed.feed_address if (self.options.get('force') or random.random() <= .01): self.options['force'] = True modified = None etag = None address = qurl(address, add={"_": random.randint(0, 10000)}) logging.debug(u' ---> [%-30s] ~FBForcing fetch: %s' % (self.feed.log_title[:30], address)) elif (not self.feed.fetched_once or not self.feed.known_good): modified = None etag = None if self.options.get('feed_xml'): logging.debug( u' ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % (self.feed.log_title[:30], len(self.options.get('feed_xml')))) if self.options.get('fpf'): self.fpf = self.options.get('fpf') logging.debug( u' ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (self.feed.log_title[:30])) return FEED_OK, self.fpf if 'youtube.com' in address: try: youtube_feed = self.fetch_youtube(address) except (requests.adapters.ConnectionError): youtube_feed = None if not youtube_feed: logging.debug(u' ***> [%-30s] ~FRYouTube fetch failed: %s.' % (self.feed.log_title[:30], address)) return FEED_ERRHTTP, None self.fpf = feedparser.parse(youtube_feed) elif re.match('(https?)?://twitter.com/\w+/?$', qurl(address, remove=['_'])): twitter_feed = self.fetch_twitter(address) if not twitter_feed: logging.debug(u' ***> [%-30s] ~FRTwitter fetch failed: %s' % (self.feed.log_title[:30], address)) return FEED_ERRHTTP, None self.fpf = feedparser.parse(twitter_feed) if not self.fpf: try: headers = self.feed.fetch_headers() if etag: headers['If-None-Match'] = etag if modified: # format into an RFC 1123-compliant timestamp. We can't use # time.strftime() since the %a and %b directives can be affected # by the current locale, but RFC 2616 states that dates must be # in English. 
short_weekdays = [ 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun' ] months = [ 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec' ] modified_header = '%s, %02d %s %04d %02d:%02d:%02d GMT' % ( short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]) headers['If-Modified-Since'] = modified_header if etag or modified: headers['A-IM'] = 'feed' raw_feed = requests.get(address, headers=headers) if raw_feed.status_code >= 400: logging.debug( " ***> [%-30s] ~FRFeed fetch was %s status code, trying fake user agent: %s" % (self.feed.log_title[:30], raw_feed.status_code, raw_feed.headers)) raw_feed = requests.get( address, headers=self.feed.fetch_headers(fake=True)) if raw_feed.content and 'application/json' in raw_feed.headers.get( 'Content-Type', ""): # JSON Feed json_feed = self.fetch_json_feed(address, raw_feed) if not json_feed: logging.debug( u' ***> [%-30s] ~FRJSON fetch failed: %s' % (self.feed.log_title[:30], address)) return FEED_ERRHTTP, None self.fpf = feedparser.parse(json_feed) elif raw_feed.content and raw_feed.status_code < 400: response_headers = raw_feed.headers response_headers['Content-Location'] = raw_feed.url self.raw_feed = smart_unicode(raw_feed.content) self.fpf = feedparser.parse( self.raw_feed, response_headers=response_headers) if self.options.get('debug', False): logging.debug( " ---> [%-30s] ~FBFeed fetch status %s: %s length / %s" % (self.feed.log_title[:30], raw_feed.status_code, len(smart_unicode( raw_feed.content)), raw_feed.headers)) except Exception, e: logging.debug( " ***> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s" % (self.feed.log_title[:30], unicode(e)[:100])) if not self.fpf or self.options.get('force_fp', False): try: self.fpf = feedparser.parse(address, agent=self.feed.user_agent, etag=etag, modified=modified) except (TypeError, ValueError, KeyError, EOFError, MemoryError), e: logging.debug(u' ***> [%-30s] ~FRFeed fetch error: %s' % (self.feed.log_title[:30], e)) pass
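# The locale issue noted in the comment above (strftime's %a/%b follow the current locale,
# while HTTP dates must be in English) can also be sidestepped with the standard library.
# A minimal standalone sketch, assuming `modified` is the utctimetuple()[:7] value built earlier:
import calendar
import email.utils

def http_date(modified):
    # calendar.timegm only reads the first six fields (year, month, day, hour, minute, second)
    timestamp = calendar.timegm(modified)
    # usegmt=True yields e.g. 'Sun, 06 Nov 1994 08:49:37 GMT', always with English day/month names
    return email.utils.formatdate(timestamp, usegmt=True)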
def adjust_crush_tunables(self): log.info("Adjust Crush Tunables") self.adjust_crush = "ceph osd crush tunables optimal" log.debug(self.adjust_crush) os.system(self.adjust_crush)
class ProcessFeed: def __init__(self, feed_id, fpf, options, raw_feed=None): self.feed_id = feed_id self.options = options self.fpf = fpf self.raw_feed = raw_feed def refresh_feed(self): self.feed = Feed.get_by_id(self.feed_id) if self.feed_id != self.feed.pk: logging.debug(" ***> Feed has changed: from %s to %s" % (self.feed_id, self.feed.pk)) self.feed_id = self.feed.pk def process(self): """ Downloads and parses a feed. """ start = time.time() self.refresh_feed() ret_values = dict(new=0, updated=0, same=0, error=0) if hasattr(self.fpf, 'status'): if self.options['verbose']: if self.fpf.bozo and self.fpf.status != 304: logging.debug( u' ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % (self.feed.log_title[:30], self.fpf.bozo_exception, len(self.fpf.entries))) if self.fpf.status == 304: self.feed = self.feed.save() self.feed.save_feed_history(304, "Not modified") return FEED_SAME, ret_values # 302 and 307: Temporary redirect: ignore # 301 and 308: Permanent redirect: save it (after 10 tries) if self.fpf.status == 301 or self.fpf.status == 308: if self.fpf.href.endswith('feedburner.com/atom.xml'): return FEED_ERRHTTP, ret_values redirects, non_redirects = self.feed.count_redirects_in_history( 'feed') self.feed.save_feed_history( self.fpf.status, "HTTP Redirect (%d to go)" % (10 - len(redirects))) if len(redirects) >= 10 or len(non_redirects) == 0: address = self.fpf.href if self.options['force'] and address: address = qurl(address, remove=['_']) self.feed.feed_address = address if not self.feed.known_good: self.feed.fetched_once = True logging.debug( " ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (self.feed.log_title[:30], self.fpf.status)) self.feed = self.feed.schedule_feed_fetch_immediately() if not self.fpf.entries: self.feed = self.feed.save() self.feed.save_feed_history(self.fpf.status, "HTTP Redirect") return FEED_ERRHTTP, ret_values if self.fpf.status >= 400: logging.debug( " ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (self.feed.log_title[:30], self.fpf.status)) fixed_feed = None if not self.feed.known_good: fixed_feed, feed = self.feed.check_feed_link_for_feed_address( ) if not fixed_feed: self.feed.save_feed_history(self.fpf.status, "HTTP Error") else: self.feed = feed self.feed = self.feed.save() return FEED_ERRHTTP, ret_values if not self.fpf: logging.debug( " ---> [%-30s] ~SB~FRFeed is Non-XML. No feedparser feed either!" % (self.feed.log_title[:30])) self.feed.save_feed_history(551, "Broken feed") return FEED_ERRHTTP, ret_values if self.fpf and not self.fpf.entries: if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType): logging.debug( " ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (self.feed.log_title[:30], len(self.fpf.entries))) fixed_feed = None if not self.feed.known_good: fixed_feed, feed = self.feed.check_feed_link_for_feed_address( ) if not fixed_feed: self.feed.save_feed_history(552, 'Non-xml feed', self.fpf.bozo_exception) else: self.feed = feed self.feed = self.feed.save() return FEED_ERRPARSE, ret_values elif self.fpf.bozo and isinstance( self.fpf.bozo_exception, xml.sax._exceptions.SAXException): logging.debug( " ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." 
% (self.feed.log_title[:30], len(self.fpf.entries))) fixed_feed = None if not self.feed.known_good: fixed_feed, feed = self.feed.check_feed_link_for_feed_address( ) if not fixed_feed: self.feed.save_feed_history(553, 'Not an RSS feed', self.fpf.bozo_exception) else: self.feed = feed self.feed = self.feed.save() return FEED_ERRPARSE, ret_values # the feed has changed (or it is the first time we parse it) # saving the etag and last_modified fields original_etag = self.feed.etag self.feed.etag = self.fpf.get('etag') if self.feed.etag: self.feed.etag = self.feed.etag[:255] # some times this is None (it never should) *sigh* if self.feed.etag is None: self.feed.etag = '' if self.feed.etag != original_etag: self.feed.save(update_fields=['etag']) original_last_modified = self.feed.last_modified if hasattr(self.fpf, 'modified') and self.fpf.modified: try: self.feed.last_modified = datetime.datetime.strptime( self.fpf.modified, '%a, %d %b %Y %H:%M:%S %Z') except Exception, e: self.feed.last_modified = None logging.debug("Broken mtime %s: %s" % (self.feed.last_modified, e)) pass if self.feed.last_modified != original_last_modified: self.feed.save(update_fields=['last_modified']) self.fpf.entries = self.fpf.entries[:100] original_title = self.feed.feed_title if self.fpf.feed.get('title'): self.feed.feed_title = strip_tags(self.fpf.feed.get('title')) if self.feed.feed_title != original_title: self.feed.save(update_fields=['feed_title']) tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline) if tagline: original_tagline = self.feed.data.feed_tagline self.feed.data.feed_tagline = smart_unicode(tagline) if self.feed.data.feed_tagline != original_tagline: self.feed.data.save(update_fields=['feed_tagline']) if not self.feed.feed_link_locked: new_feed_link = self.fpf.feed.get('link') or self.fpf.feed.get( 'id') or self.feed.feed_link if self.options['force'] and new_feed_link: new_feed_link = qurl(new_feed_link, remove=['_']) if new_feed_link != self.feed.feed_link: logging.debug( " ---> [%-30s] ~SB~FRFeed's page is different: %s to %s" % (self.feed.log_title[:30], self.feed.feed_link, new_feed_link)) redirects, non_redirects = self.feed.count_redirects_in_history( 'page') self.feed.save_page_history( 301, "HTTP Redirect (%s to go)" % (10 - len(redirects))) if len(redirects) >= 10 or len(non_redirects) == 0: self.feed.feed_link = new_feed_link self.feed.save(update_fields=['feed_link']) # Determine if stories aren't valid and replace broken guids guids_seen = set() permalinks_seen = set() for entry in self.fpf.entries: guids_seen.add(entry.get('guid')) permalinks_seen.add(Feed.get_permalink(entry)) guid_difference = len(guids_seen) != len(self.fpf.entries) single_guid = len(guids_seen) == 1 replace_guids = single_guid and guid_difference permalink_difference = len(permalinks_seen) != len(self.fpf.entries) single_permalink = len(permalinks_seen) == 1 replace_permalinks = single_permalink and permalink_difference # Compare new stories to existing stories, adding and updating start_date = datetime.datetime.utcnow() story_hashes = [] stories = [] for entry in self.fpf.entries: story = pre_process_story(entry, self.fpf.encoding) if story.get('published') < start_date: start_date = story.get('published') if replace_guids: if replace_permalinks: new_story_guid = unicode(story.get('published')) if self.options['verbose']: logging.debug( u' ---> [%-30s] ~FBReplacing guid (%s) with timestamp: %s' % (self.feed.log_title[:30], story.get('guid'), new_story_guid)) story['guid'] = new_story_guid else: 
new_story_guid = Feed.get_permalink(story) if self.options['verbose']: logging.debug( u' ---> [%-30s] ~FBReplacing guid (%s) with permalink: %s' % (self.feed.log_title[:30], story.get('guid'), new_story_guid)) story['guid'] = new_story_guid story['story_hash'] = MStory.feed_guid_hash_unsaved( self.feed.pk, story.get('guid')) stories.append(story) story_hashes.append(story.get('story_hash')) original_story_hash_count = len(story_hashes) story_hashes_in_unread_cutoff = self.feed.story_hashes_in_unread_cutoff[: original_story_hash_count] story_hashes.extend(story_hashes_in_unread_cutoff) story_hashes = list(set(story_hashes)) if self.options['verbose'] or settings.DEBUG: logging.debug( u' ---> [%-30s] ~FBFound ~SB%s~SN guids, adding ~SB%s~SN/%s guids from db' % (self.feed.log_title[:30], original_story_hash_count, len(story_hashes) - original_story_hash_count, len(story_hashes_in_unread_cutoff))) existing_stories = dict((s.story_hash, s) for s in MStory.objects( story_hash__in=story_hashes, # story_date__gte=start_date, # story_feed_id=self.feed.pk )) # if len(existing_stories) == 0: # existing_stories = dict((s.story_hash, s) for s in MStory.objects( # story_date__gte=start_date, # story_feed_id=self.feed.pk # )) ret_values = self.feed.add_update_stories( stories, existing_stories, verbose=self.options['verbose'], updates_off=self.options['updates_off']) # PubSubHubbub if (hasattr(self.fpf, 'feed') and hasattr(self.fpf.feed, 'links') and self.fpf.feed.links): hub_url = None self_url = self.feed.feed_address for link in self.fpf.feed.links: if link['rel'] == 'hub' and not hub_url: hub_url = link['href'] elif link['rel'] == 'self': self_url = link['href'] push_expired = False if self.feed.is_push: try: push_expired = self.feed.push.lease_expires < datetime.datetime.now( ) except PushSubscription.DoesNotExist: self.feed.is_push = False if (hub_url and self_url and not settings.DEBUG and self.feed.active_subscribers > 0 and (push_expired or not self.feed.is_push or self.options.get('force'))): logging.debug( u' ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s' % (self.feed.log_title[:30], "~SKRe-~SN" if push_expired else "", hub_url)) try: PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url) except TimeoutError: logging.debug( u' ---> [%-30s] ~BB~FW~FRTimed out~FW subscribing to PuSH hub: %s' % (self.feed.log_title[:30], hub_url)) elif (self.feed.is_push and (self.feed.active_subscribers <= 0 or not hub_url)): logging.debug( u' ---> [%-30s] ~BB~FWTurning off PuSH, no hub found' % (self.feed.log_title[:30])) self.feed.is_push = False self.feed = self.feed.save() # Push notifications if ret_values['new'] > 0 and MUserFeedNotification.feed_has_users( self.feed.pk) > 0: QueueNotifications.delay(self.feed.pk, ret_values['new']) # All Done logging.debug( u' ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s' % (self.feed.log_title[:30], '~FG~SB' if ret_values['new'] else '', ret_values['new'], '~FY~SB' if ret_values['updated'] else '', ret_values['updated'], '~SB' if ret_values['same'] else '', ret_values['same'], '~FR~SB' if ret_values['error'] else '', ret_values['error'], len(self.fpf.entries))) self.feed.update_all_statistics(has_new_stories=bool( ret_values['new']), force=self.options['force']) fetch_date = datetime.datetime.now() if ret_values['new']: if not getattr(settings, 'TEST_DEBUG', False): self.feed.trim_feed() self.feed.expire_redis() if MStatistics.get('raw_feed', None) == self.feed.pk: 
self.feed.save_raw_feed(self.raw_feed, fetch_date) self.feed.save_feed_history(200, "OK", date=fetch_date) if self.options['verbose']: logging.debug(u' ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' % (self.feed.log_title[:30], time.time() - start)) return FEED_OK, ret_values
async def new_proxy(self, item): key = build_key(item) logger.debug('Got proxy: %s' % item) return await self.cli.hmset_dict(key, item)
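# Usage sketch for the coroutine above, assuming the aioredis 1.x API (create_redis_pool,
# hmset_dict storing a dict as a redis hash); the proxy record and key below are illustrative,
# standing in for whatever build_key(item) produces:
import asyncio
import aioredis

async def demo():
    cli = await aioredis.create_redis_pool('redis://localhost')  # assumed aioredis 1.x API
    item = {'host': '127.0.0.1', 'port': '8080'}                 # illustrative proxy record
    await cli.hmset_dict('proxy:127.0.0.1:8080', item)           # store the dict as a hash
    cli.close()
    await cli.wait_closed()

asyncio.get_event_loop().run_until_complete(demo())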
if not self.fpf or self.options.get('force_fp', False): try: self.fpf = feedparser.parse(address, agent=self.feed.user_agent, etag=etag, modified=modified) except (TypeError, ValueError, KeyError, EOFError, MemoryError), e: logging.debug(u' ***> [%-30s] ~FRFeed fetch error: %s' % (self.feed.log_title[:30], e)) pass if not self.fpf: try: logging.debug(u' ***> [%-30s] ~FRTurning off headers...' % (self.feed.log_title[:30])) self.fpf = feedparser.parse(address, agent=self.feed.user_agent) except (TypeError, ValueError, KeyError, EOFError, MemoryError), e: logging.debug(u' ***> [%-30s] ~FRFetch failed: %s.' % (self.feed.log_title[:30], e)) return FEED_ERRHTTP, None logging.debug(u' ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' % (self.feed.log_title[:30], time.time() - start)) return FEED_OK, self.fpf def get_identity(self): identity = "X"
def get_crushtype_id(self): log.debug('api testing with each type in the crush type') log.debug('****************************************') for each_id in self.json_crush_node: api = self.construct_api() + '/' + str(each_id['id']) log.debug('config with id %s' % str(each_id['id'])) log.debug('api: %s' % api) response = self.auth.request('GET', api, verify=False) response.raise_for_status() log.debug('response: \n %s' % response.json()) pretty_response = json.dumps(response.json(), indent=2) log.debug('pretty json response \n %s' % pretty_response)
def run(self, **kwargs): logging.debug(" ---> Sharing popular stories...") MSharedStory.share_popular_stories(interactive=False)
social_services = None if self.options.get('requesting_user_id', None): social_services = MSocialServices.get_user( self.options.get('requesting_user_id')) try: twitter_api = social_services.twitter_api() except tweepy.error.TweepError, e: logging.debug( u' ***> [%-30s] ~FRTwitter fetch failed: %s: %s' % (self.feed.log_title[:30], self.address, e)) return else: usersubs = UserSubscription.objects.filter(feed=self.feed) if not usersubs: logging.debug( u' ***> [%-30s] ~FRTwitter fetch failed: %s: No subscriptions' % (self.feed.log_title[:30], self.address)) return for sub in usersubs: social_services = MSocialServices.get_user(sub.user_id) if not social_services.twitter_uid: continue try: twitter_api = social_services.twitter_api() if not twitter_api: continue else: break except tweepy.error.TweepError, e: logging.debug( u' ***> [%-30s] ~FRTwitter fetch failed: %s: %s' % (self.feed.log_title[:30], self.address, e))
def run(self, **kwargs): from apps.rss_feeds.models import Feed settings.LOG_TO_STREAM = True now = datetime.datetime.utcnow() start = time.time() r = redis.Redis(connection_pool=settings.REDIS_FEED_UPDATE_POOL) logging.debug(" ---> ~SN~FBQueuing broken feeds...") # Force refresh feeds refresh_feeds = Feed.objects.filter( active=True, fetched_once=False, active_subscribers__gte=1).order_by('?')[:100] refresh_count = refresh_feeds.count() cp1 = time.time() logging.debug(" ---> ~SN~FBFound %s active, unfetched broken feeds" % refresh_count) # Mistakenly inactive feeds hours_ago = (now - datetime.timedelta(minutes=10)).strftime('%s') old_tasked_feeds = r.zrangebyscore('tasked_feeds', 0, hours_ago) inactive_count = len(old_tasked_feeds) if inactive_count: r.zremrangebyscore('tasked_feeds', 0, hours_ago) # r.sadd('queued_feeds', *old_tasked_feeds) for feed_id in old_tasked_feeds: r.zincrby('error_feeds', feed_id, 1) feed = Feed.get_by_id(feed_id) feed.set_next_scheduled_update() logging.debug( " ---> ~SN~FBRe-queuing ~SB%s~SN dropped/broken feeds (~SB%s/%s~SN queued/tasked)" % (inactive_count, r.scard('queued_feeds'), r.zcard('tasked_feeds'))) cp2 = time.time() old = now - datetime.timedelta(days=1) old_feeds = Feed.objects.filter( next_scheduled_update__lte=old, active_subscribers__gte=1).order_by('?')[:500] old_count = old_feeds.count() cp3 = time.time() logging.debug( " ---> ~SN~FBTasking ~SBrefresh:~FC%s~FB inactive:~FC%s~FB old:~FC%s~SN~FB broken feeds... (%.4s/%.4s/%.4s)" % ( refresh_count, inactive_count, old_count, cp1 - start, cp2 - cp1, cp3 - cp2, )) Feed.task_feeds(refresh_feeds, verbose=False) Feed.task_feeds(old_feeds, verbose=False) logging.debug( " ---> ~SN~FBTasking broken feeds took ~SB%s~SN seconds (~SB%s~SN/~FG%s~FB~SN/%s tasked/queued/scheduled)" % (int((time.time() - start)), r.zcard('tasked_feeds'), r.scard('queued_feeds'), r.zcard('scheduled_updates')))
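# Minimal standalone sketch of the "stale tasked feeds" pattern used above: feed ids sit in a
# sorted set scored by the unix time they were tasked, so anything scored below the cutoff has
# been stuck too long and gets re-counted and re-queued. Key names match the snippet; the
# connection is a placeholder for settings.REDIS_FEED_UPDATE_POOL.
import time
import redis

r = redis.Redis()
cutoff = time.time() - 10 * 60  # tasked more than ten minutes ago
stale_feed_ids = r.zrangebyscore('tasked_feeds', 0, cutoff)
if stale_feed_ids:
    r.zremrangebyscore('tasked_feeds', 0, cutoff)
    for feed_id in stale_feed_ids:
        r.zincrby('error_feeds', feed_id, 1)  # redis-py 2.x argument order, matching the snippet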
def construct_api(self): self.api = self.base_api + self.fsid + "/" + "crush_map" log.debug(self.api) return self.api
def construct_api(self): self.api = self.base_api + self.fsid + '/' + 'crush_type' log.debug(self.api) return self.api
'q': 0, '$d': 0 } len_text = len(text_to_check) for i in range(0, len_text - 1): if text_to_check[i] == '<' and text_to_check[i + 1] == '<': count['<>'] += 1 elif text_to_check[i] == '>' and text_to_check[i + 1] == '>': count['<>'] += 1 elif text_to_check[i] == '|' and text_to_check[i + 1] == 'c': count['c'] += 1 elif text_to_check[i] == '|' and text_to_check[i + 1] == 't': count['t'] += 1 elif text_to_check[i] == '\\' and text_to_check[i + 1] == '\\': count['bs'] += 1 # elif text_to_check[i] == '\\' and text_to_check[i+1] == 'n': # count['n'] += 1 elif text_to_check[i] == '\\' and text_to_check[i + 1] == '"': count['q'] += 1 elif text_to_check[i] == '$' and text_to_check[i + 1] == 'd': count['$d'] += 1 return count if __name__ == '__main__': log.debug('main() with args: %s' % str(sys.argv)) if os.name == 'nt': sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) # windows main()
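# The pairwise scan above can also be written with zip over adjacent characters. A minimal
# standalone sketch counting the same two-character markers (the function name is illustrative,
# not from the original):
from collections import Counter

def count_markers(text):
    pairs = Counter(a + b for a, b in zip(text, text[1:]))
    return {
        '<>': pairs['<<'] + pairs['>>'],
        'c': pairs['|c'],
        't': pairs['|t'],
        'bs': pairs['\\\\'],
        'q': pairs['\\"'],
        '$d': pairs['$d'],
    }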
def _run_process(self, args): log.debug('Running command: \'{}\''.format(' '.join(args))) process = Popen(args, stdout=PIPE) (output, err) = process.communicate() return process.wait(), output, err
class FetchFeed: def __init__(self, feed_id, options): self.feed = Feed.get_by_id(feed_id) self.options = options self.fpf = None @timelimit(150) def fetch(self): """ Uses feedparser to download the feed. Will be parsed later. """ start = time.time() identity = self.get_identity() log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % ( identity, self.feed.title[:30], self.feed.id, datetime.datetime.now() - self.feed.last_update) logging.debug(log_msg) etag = self.feed.etag modified = self.feed.last_modified.utctimetuple( )[:7] if self.feed.last_modified else None address = self.feed.feed_address if (self.options.get('force') or random.random() <= .01): modified = None etag = None address = cache_bust_url(address) logging.debug(u' ---> [%-30s] ~FBForcing fetch: %s' % (self.feed.title[:30], address)) elif (not self.feed.fetched_once or not self.feed.known_good): modified = None etag = None USER_AGENT = ('NewsBlur Feed Fetcher - %s subscriber%s - %s ' '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) ' 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 ' 'Safari/534.48.3)' % ( self.feed.num_subscribers, 's' if self.feed.num_subscribers != 1 else '', self.feed.permalink, )) if self.options.get('feed_xml'): logging.debug( u' ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % (self.feed.title[:30], len(self.options.get('feed_xml')))) if self.options.get('fpf'): self.fpf = self.options.get('fpf') logging.debug( u' ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (self.feed.title[:30])) return FEED_OK, self.fpf try: self.fpf = feedparser.parse(address, agent=USER_AGENT, etag=etag, modified=modified) except (TypeError, ValueError, KeyError), e: logging.debug(u' ***> [%-30s] ~FR%s, turning off headers.' % (self.feed.title[:30], e)) self.fpf = feedparser.parse(address, agent=USER_AGENT) logging.debug(u' ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' % (self.feed.title[:30], time.time() - start)) return FEED_OK, self.fpf
def test_order_closest(self): id0 = Id(BIN_ID0) ordered_list = [ Id('\x00' * ID_SIZE_BYTES), Id(BIN_ID0[:-1] + '\x06'), Id(BIN_ID0[:9] + '\x01' * (ID_SIZE_BYTES - 9)), Id(BIN_ID0[:7] + '\xff' * (ID_SIZE_BYTES - 7)), Id(BIN_ID0[:7] + '\xff' * (ID_SIZE_BYTES - 7)), Id('\x00' + '\xff' * (ID_SIZE_BYTES - 1)), Id('\x53' * ID_SIZE_BYTES), Id('\xff' * ID_SIZE_BYTES), ] random_list = random.sample(ordered_list, len(ordered_list)) random_list_copy = random_list[:] log.debug('ordered list') for e in ordered_list: log.debug('%s' % e) log.debug('random order') for e in random_list: log.debug('%s' % e) result_list = id0.order_closest(random_list) log.debug('order_closest result') for e in result_list: log.debug('%s' % e) log.debug('random order (it should not change)') for e in random_list: log.debug('%s' % e) # make sure order_closest does not modify random_list assert random_list == random_list_copy for i, ordered_id in enumerate(ordered_list): log.debug('%d, %s, %s' % (i, ordered_id, result_list[i])) assert ordered_id.bin_id == result_list[i].bin_id
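# order_closest() above is expected to sort ids by closeness to id0, presumably the XOR metric
# used by Kademlia-style DHTs. A minimal standalone sketch on raw byte-strings, independent of
# the Id class (names here are illustrative, not the library's implementation):
import binascii

def xor_distance(a, b):
    # a and b are equal-length byte-strings
    return int(binascii.hexlify(a), 16) ^ int(binascii.hexlify(b), 16)

def order_closest(target, ids):
    # sorted() is stable, so ids at equal distance keep their input order
    return sorted(ids, key=lambda node_id: xor_distance(target, node_id))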