def test_no_accept_header(self):
    self.assertEqual({}, util.request_headers(url='http://foo/bar'))
    self.assertEqual({}, util.request_headers(source=Twitter(id='not-rhiaro')))

    self.expect_requests_get('http://foo/bar', '')
    self.mox.ReplayAll()
    util.requests_get('http://foo/bar')
def test_no_accept_header(self):
    self.assertEquals(util.REQUEST_HEADERS,
                      util.request_headers(url='http://foo/bar'))
    self.assertEquals(util.REQUEST_HEADERS,
                      util.request_headers(source=Twitter(id='not-rhiaro')))

    self.expect_requests_get('http://foo/bar', '', headers=util.REQUEST_HEADERS)
    self.mox.ReplayAll()
    util.requests_get('http://foo/bar')
def test_rhiaro_accept_header(self): """Only send Accept header to rhiaro.co.uk right now. https://github.com/snarfed/bridgy/issues/713 """ self.assertEquals(util.REQUEST_HEADERS_CONNEG, util.request_headers(url='http://rhiaro.co.uk/')) self.assertEquals(util.REQUEST_HEADERS_CONNEG, util.request_headers(source=Twitter(id='rhiaro'))) self.expect_requests_get('http://rhiaro.co.uk/', '', headers=util.REQUEST_HEADERS_CONNEG) self.mox.ReplayAll() util.requests_get('http://rhiaro.co.uk/')
def test_rhiaro_accept_header(self): """Only send Accept header to rhiaro.co.uk right now. https://github.com/snarfed/bridgy/issues/713 """ self.assertEqual(util.REQUEST_HEADERS_CONNEG, util.request_headers(url='http://rhiaro.co.uk/')) self.assertEqual(util.REQUEST_HEADERS_CONNEG, util.request_headers(source=Twitter(id='rhiaro'))) self.expect_requests_get('http://rhiaro.co.uk/', '', headers=util.REQUEST_HEADERS_CONNEG) self.mox.ReplayAll() util.requests_get('http://rhiaro.co.uk/')
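# The header tests above imply that util.request_headers() picks between a
# default header dict and a content-negotiation variant based on the request's
# domain (or source user). A minimal sketch of that selection, assuming the
# constant names from the tests; the concrete header values and the
# source.key.id() check are assumptions, and newer versions (see the first
# test) return {} instead of REQUEST_HEADERS by default.
from urllib.parse import urlparse

REQUEST_HEADERS = {'User-Agent': 'Bridgy (https://brid.gy/about)'}  # assumed value
REQUEST_HEADERS_CONNEG = dict(REQUEST_HEADERS,
                              Accept='text/mf2+html, text/html;q=0.9')  # assumed value

def request_headers(url=None, source=None):
    # only send the Accept header to rhiaro.co.uk right now (issue #713)
    if ((url and urlparse(url).netloc.endswith('rhiaro.co.uk'))
            or (source and source.key.id() == 'rhiaro')):
        return REQUEST_HEADERS_CONNEG
    return REQUEST_HEADERS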
def test_blocklist_localhost_when_deployed(self):
    self.mox.StubOutWithMock(util, 'LOCAL')
    util.LOCAL = False
    for bad in 'http://localhost:8080/', 'http://127.0.0.1/':
        resp = util.requests_get(bad)
        self.assertEqual(util.HTTP_REQUEST_REFUSED_STATUS_CODE, resp.status_code)
        self.assertEqual('Sorry, Bridgy has blocklisted this URL.', resp.text)
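# The blocklist test expects a synthetic refusal response rather than an
# exception, which suggests the requests_get wrapper short-circuits before
# making a network call. A sketch of that check, assuming the module-level
# LOCAL flag and HTTP_REQUEST_REFUSED_STATUS_CODE used in the test; building
# the fake requests.Response this way is an assumption, not the project's
# actual code.
import requests
from urllib.parse import urlparse

def requests_get_sketch(url, **kwargs):
    if not LOCAL and urlparse(url).hostname in ('localhost', '127.0.0.1'):
        refused = requests.Response()
        refused.status_code = HTTP_REQUEST_REFUSED_STATUS_CODE
        refused._content = b'Sorry, Bridgy has blocklisted this URL.'
        return refused
    return requests.get(url, **kwargs)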
def test_requests_get_content_length_not_int(self): self.expect_requests_get("http://foo/bar", "xyz", response_headers={"Content-Length": "foo"}) self.mox.ReplayAll() resp = util.requests_get("http://foo/bar") self.assertEquals(200, resp.status_code) self.assertEquals("xyz", resp.content)
def query_live_status(uid=None):
    if uid is None:
        return
    uid = str(uid)
    query_url = 'https://api.bilibili.com/x/space/acc/info?mid={}&my_ts={}'.format(
        uid, int(time.time()))
    response = util.requests_get(query_url, '查询直播状态')
    if util.check_response_is_ok(response):
        result = json.loads(str(response.content, 'utf-8'))
        if result['code'] != 0:
            logger.error('【查询直播状态】请求返回数据code错误:{code}'.format(code=result['code']))
        else:
            name = result['data']['name']
            live_status = result['data']['live_room']['liveStatus']

            if LIVING_STATUS_DICT.get(uid, None) is None:
                LIVING_STATUS_DICT[uid] = live_status
                logger.info('【查询直播状态】【{uname}】初始化'.format(uname=name))
                return

            if LIVING_STATUS_DICT.get(uid, None) != live_status:
                LIVING_STATUS_DICT[uid] = live_status
                room_id = result['data']['live_room']['roomid']
                room_title = result['data']['live_room']['title']
                room_cover_url = result['data']['live_room']['cover']
                if live_status == 1:
                    logger.info('【查询直播状态】【{name}】开播了,准备推送:{room_title}'.format(
                        name=name, room_title=room_title))
                    push.push_for_bili_live(name, room_id, room_title, room_cover_url)
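# query_live_status only pushes when the cached status in LIVING_STATUS_DICT
# changes, so it is safe to call repeatedly. A usage sketch of polling it on a
# fixed interval; the interval and the plain while-loop scheduler are
# assumptions, not part of the original project.
import time

POLL_INTERVAL_SECONDS = 60  # assumed interval

def poll_live_status_forever(uids):
    while True:
        for uid in uids:
            query_live_status(uid)
        time.sleep(POLL_INTERVAL_SECONDS)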
def resolve_profile_url(url, resolve=True):
    """Resolves a profile URL to be added to a source.

    Args:
      url: string
      resolve: boolean, whether to make HTTP requests to follow redirects, etc.

    Returns:
      string, resolved URL, or None
    """
    final, _, ok = util.get_webmention_target(url, resolve=resolve)
    if not ok:
        return None

    final = final.lower()
    if util.schemeless(final).startswith(util.schemeless(url.lower())):
        # redirected to a deeper path. use the original higher level URL. #652
        final = url

    # If final has a path segment check if root has a matching rel=me.
    match = re.match(r'^(https?://[^/]+)/.+', final)
    if match and resolve:
        root = match.group(1)
        try:
            resp = util.requests_get(root)
            resp.raise_for_status()
            data = util.mf2py_parse(resp.text, root)
            me_urls = data.get('rels', {}).get('me', [])
            if final in me_urls:
                final = root
        except requests.RequestException:
            logging.warning("Couldn't fetch %s, preserving path in %s",
                            root, final, exc_info=True)

    return final
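# Hypothetical usage of resolve_profile_url: a profile link whose site root
# advertises it via rel="me" collapses to the root, while a redirect to a
# deeper path keeps the original URL. The example.com URL is illustrative only.
resolved = resolve_profile_url('https://example.com/about')
# resolved == 'https://example.com' if the root page lists /about as rel="me";
# otherwise the (lowercased) resolved URL, or None if it isn't a valid target.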
def fetch_mf2(self, url):
    """Fetches a URL and extracts its mf2 data.

    Side effects: sets self.entity.html on success, calls self.error() on
    errors.

    Args:
      url: string

    Returns:
      (requests.Response, mf2 data dict) on success, None on failure
    """
    try:
        fetched = util.requests_get(url)
        fetched.raise_for_status()
    except BaseException as e:
        util.interpret_http_exception(e)  # log exception
        return self.error('Could not fetch source URL %s' % url)

    if self.entity:
        self.entity.html = fetched.text

    # .text is decoded unicode string, .content is raw bytes. if the HTTP
    # headers didn't specify a charset, pass raw bytes to BeautifulSoup so it
    # can look for a <meta> tag with a charset and decode.
    text = (fetched.text if 'charset' in fetched.headers.get('content-type', '')
            else fetched.content)
    doc = BeautifulSoup(text)

    # special case tumblr's markup: div#content > div.post > div.copy
    # convert to mf2.
    contents = doc.find_all(id='content')
    if contents:
        post = contents[0].find_next(class_='post')
        if post:
            post['class'] = 'h-entry'
            copy = post.find_next(class_='copy')
            if copy:
                copy['class'] = 'e-content'
            photo = post.find_next(class_='photo-wrapper')
            if photo:
                img = photo.find_next('img')
                if img:
                    img['class'] = 'u-photo'
            doc = unicode(post)

    # parse microformats, convert to ActivityStreams
    data = parser.Parser(doc=doc, url=fetched.url).to_dict()
    logging.debug('Parsed microformats2: %s', json.dumps(data, indent=2))
    items = data.get('items', [])
    if not items or not items[0]:
        return self.error('No microformats2 data found in ' + fetched.url,
                          data=data, html="""
No <a href="http://microformats.org/get-started">microformats</a> or
<a href="http://microformats.org/wiki/microformats2">microformats2</a> found in
<a href="%s">%s</a>! See <a href="http://indiewebify.me/">indiewebify.me</a>
for details (skip to level 2, <em>Publishing on the IndieWeb</em>).
""" % (fetched.url, util.pretty_link(fetched.url)))

    return fetched, data
def create_comment(self, post_url, author_name, author_url, content):
    """Creates a new comment in the source silo.

    Must be implemented by subclasses.

    Args:
      post_url: string
      author_name: string
      author_url: string
      content: string

    Returns:
      JSON response dict with 'id' and other fields
    """
    if not self.disqus_shortname:
        resp = util.requests_get(post_url)
        resp.raise_for_status()
        self.discover_disqus_shortname(resp.text)
        if not self.disqus_shortname:
            raise exc.HTTPBadRequest(
                "Your Bridgy account isn't fully set up yet: "
                "we haven't found your Disqus account."
            )

    # strip slug, query and fragment from post url
    parsed = urlparse.urlparse(post_url)
    path = parsed.path.split("/")
    try:
        tumblr_post_id = int(path[-1])
    except ValueError:
        path.pop(-1)
    post_url = urlparse.urlunparse(parsed[:2] + ("/".join(path), "", "", ""))

    # get the disqus thread id. details on thread queries:
    # http://stackoverflow.com/questions/4549282/disqus-api-adding-comment
    # https://disqus.com/api/docs/threads/details/
    resp = self.disqus_call(
        util.requests_get,
        DISQUS_API_THREAD_DETAILS_URL,
        {
            "forum": self.disqus_shortname,
            # ident:[tumblr_post_id] should work, but doesn't :/
            "thread": "link:%s" % post_url,
        },
    )
    thread_id = resp["id"]

    # create the comment
    message = u'<a href="%s">%s</a>: %s' % (author_url, author_name, content)
    resp = self.disqus_call(
        util.requests_post,
        DISQUS_API_CREATE_POST_URL,
        {
            "thread": thread_id,
            "message": message.encode("utf-8"),
            # only allowed when authed as moderator/owner
            # 'state': 'approved',
        },
    )
    return resp
def create_comment(self, post_url, author_name, author_url, content):
    """Creates a new comment in the source silo.

    Must be implemented by subclasses.

    Args:
      post_url: string
      author_name: string
      author_url: string
      content: string

    Returns:
      JSON response dict with 'id' and other fields
    """
    if not self.disqus_shortname:
        resp = util.requests_get(post_url)
        resp.raise_for_status()
        self.discover_disqus_shortname(resp.text)
        if not self.disqus_shortname:
            raise exc.HTTPBadRequest(
                "Your Bridgy account isn't fully set up yet: "
                "we haven't found your Disqus account.")

    # strip slug, query and fragment from post url
    parsed = urlparse.urlparse(post_url)
    path = parsed.path.split('/')
    try:
        tumblr_post_id = int(path[-1])
    except ValueError:
        path.pop(-1)
    post_url = urlparse.urlunparse(parsed[:2] + ('/'.join(path), '', '', ''))

    # get the disqus thread id. details on thread queries:
    # http://stackoverflow.com/questions/4549282/disqus-api-adding-comment
    # https://disqus.com/api/docs/threads/details/
    resp = self.disqus_call(util.requests_get, DISQUS_API_THREAD_DETAILS_URL, {
        'forum': self.disqus_shortname,
        # ident:[tumblr_post_id] should work, but doesn't :/
        'thread': 'link:%s' % post_url,
    })
    thread_id = resp['id']

    # create the comment
    message = u'<a href="%s">%s</a>: %s' % (author_url, author_name, content)
    resp = self.disqus_call(util.requests_post, DISQUS_API_CREATE_POST_URL, {
        'thread': thread_id,
        'message': message.encode('utf-8'),
        # only allowed when authed as moderator/owner
        # 'state': 'approved',
    })
    return resp
def test_requests_get_content_length_not_int(self):
    self.expect_requests_get('http://foo/bar', 'xyz',
                             response_headers={'Content-Length': 'foo'})
    self.mox.ReplayAll()

    resp = util.requests_get('http://foo/bar')
    self.assertEquals(200, resp.status_code)
    self.assertEquals('xyz', resp.content)
def test_requests_get_content_length_not_int(self):
    self.expect_requests_get('http://foo/bar', 'xyz',
                             response_headers={'Content-Length': 'foo'})
    self.mox.ReplayAll()

    resp = util.requests_get('http://foo/bar')
    self.assertEqual(200, resp.status_code)
    self.assertEqual('xyz', resp.text)
def test_requests_get_too_big(self):
    self.expect_requests_get(
        'http://foo/bar', '',
        response_headers={'Content-Length': str(util.MAX_HTTP_RESPONSE_SIZE + 1)})
    self.mox.ReplayAll()

    resp = util.requests_get('http://foo/bar')
    self.assertEquals(util.HTTP_REQUEST_REFUSED_STATUS_CODE, resp.status_code)
    self.assertIn(' larger than our limit ', resp.content)
def test_requests_get_too_big(self):
    self.expect_requests_get(
        'http://foo/bar', '',
        response_headers={'Content-Length': str(util.MAX_HTTP_RESPONSE_SIZE + 1)})
    self.mox.ReplayAll()

    resp = util.requests_get('http://foo/bar')
    self.assertEqual(util.HTTP_RESPONSE_TOO_BIG_STATUS_CODE, resp.status_code)
    self.assertIn(' larger than our limit ', resp.text)
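# Together, the two Content-Length tests imply the wrapper parses the header
# defensively: a missing or non-numeric value is ignored, while a value above
# MAX_HTTP_RESPONSE_SIZE is turned into a refusal response. A sketch of that
# check, assuming the constant names from the tests; mutating the response in
# place is an assumption about how the wrapper reports the refusal.
def check_content_length(resp):
    length = resp.headers.get('Content-Length')
    try:
        too_big = int(length) > MAX_HTTP_RESPONSE_SIZE
    except (TypeError, ValueError):
        too_big = False  # tolerate missing or non-numeric Content-Length
    if too_big:
        resp.status_code = HTTP_RESPONSE_TOO_BIG_STATUS_CODE
        resp._content = ('Content-Length %s is larger than our limit %s.'
                         % (length, MAX_HTTP_RESPONSE_SIZE)).encode()
    return resp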
def _get_wechat_access_token(self):
    access_token = None
    url = 'https://qyapi.weixin.qq.com/cgi-bin/gettoken?corpid={corpid}&corpsecret={corpsecret}'.format(
        corpid=self.wechat_corp_id, corpsecret=self.wechat_corp_secret)
    response = util.requests_get(url, '推送_wechat_获取access_tokon')
    if util.check_response_is_ok(response):
        result = json.loads(str(response.content, 'utf-8'))
        access_token = result['access_token']
    return access_token
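# The corporate WeChat access token from _get_wechat_access_token is only
# valid for a limited time, so callers usually cache it instead of fetching a
# new one per push. A caching sketch; the 7000-second lifetime is an assumed
# safety margin (the gettoken response also carries an 'expires_in' field that
# could be used instead).
import time

_wechat_token_cache = {'token': None, 'expires_at': 0.0}

def get_cached_wechat_access_token(pusher):
    if _wechat_token_cache['token'] and time.time() < _wechat_token_cache['expires_at']:
        return _wechat_token_cache['token']
    token = pusher._get_wechat_access_token()
    _wechat_token_cache['token'] = token
    _wechat_token_cache['expires_at'] = time.time() + 7000  # assumed lifetime
    return token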
def query_live_status(room_id=None):
    if room_id is None:
        return
    query_url = 'https://webcast.amemv.com/webcast/reflow/{}?my_ts={}'.format(
        room_id, int(time.time()))
    headers = get_headers_for_live()
    response = util.requests_get(query_url, '查询直播状态', headers=headers, use_proxy=True)
    if util.check_response_is_ok(response):
        html_text = response.text
        soup = BeautifulSoup(html_text, "html.parser")
        result = None
        scripts = soup.findAll('script')
        for script in scripts:
            script_string = script.string
            if script_string is None:
                continue
            if 'window.__INIT_PROPS__ = ' in script_string:
                result_str = script.string.replace('window.__INIT_PROPS__ = ', '')
                try:
                    result = json.loads(result_str).get('/webcast/reflow/:id', None)
                except TypeError:
                    logger.error('【查询直播状态】json解析错误,room_id:{}'.format(room_id))
                    return None
                break
        if result is None:
            logger.error('【查询直播状态】请求返回数据为空,room_id:{}'.format(room_id))
        else:
            if result.get('room', None) is None:
                logger.error('【查询直播状态】请求返回数据中room为空,room_id:{}'.format(room_id))
                return
            name = result['room']['owner']['nickname']
            live_status = result['room']['status']

            if LIVING_STATUS_DICT.get(room_id, None) is None:
                LIVING_STATUS_DICT[room_id] = live_status
                logger.info('【查询直播状态】【{uname}】初始化'.format(uname=name))
                return

            if LIVING_STATUS_DICT.get(room_id, None) != live_status:
                LIVING_STATUS_DICT[room_id] = live_status
                room_title = result['room']['title']
                room_cover_url = result['room']['cover']['url_list'][0]
                room_stream_url = result['room']['stream_url']['hls_pull_url']
                if live_status == 2:
                    logger.info('【查询直播状态】【{name}】开播了,准备推送:{room_title}'.format(
                        name=name, room_title=room_title))
                    push.push_for_douyin_live(name, room_stream_url, room_title, room_cover_url)
def _urls_and_domains(self, auth_entity, user_url):
    """Returns this user's valid (not webmention-blacklisted) URLs and domains.

    Converts the auth entity's user_json to an ActivityStreams actor and uses
    its 'urls' and 'url' fields. May be overridden by subclasses.

    Args:
      auth_entity: :class:`oauth_dropins.models.BaseAuth`
      user_url: string, optional URL passed in when authorizing

    Returns:
      ([string url, ...], [string domain, ...])
    """
    actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
    logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

    candidates = util.trim_nulls(
        util.uniquify([user_url] + microformats2.object_urls(actor)))

    if len(candidates) > MAX_AUTHOR_URLS:
        logging.info('Too many profile links! Only resolving the first %s: %s',
                     MAX_AUTHOR_URLS, candidates)

    urls = []
    for i, url in enumerate(candidates):
        final, domain, ok = util.get_webmention_target(
            url, resolve=i < MAX_AUTHOR_URLS)
        if ok:
            final = final.lower()
            if util.schemeless(final).startswith(util.schemeless(url.lower())):
                # redirected to a deeper path. use the original higher level URL. #652
                final = url
            # If final has a path segment check if root has a matching rel=me.
            match = re.match(r'^(https?://[^/]+)/.+', final)
            if match and i < MAX_AUTHOR_URLS:
                root = match.group(1)
                resp = util.requests_get(root)
                resp.raise_for_status()
                data = util.mf2py_parse(resp.text, root)
                me_urls = data.get('rels', {}).get('me', [])
                if final in me_urls:
                    final = root
            urls.append(final)

    urls = util.dedupe_urls(urls)  # normalizes domains to lower case
    domains = [util.domain_from_link(url) for url in urls]
    return urls, domains
def _urls_and_domains(self, auth_entity, user_url):
    """Returns this user's valid (not webmention-blacklisted) URLs and domains.

    Converts the auth entity's user_json to an ActivityStreams actor and uses
    its 'urls' and 'url' fields. May be overridden by subclasses.

    Args:
      auth_entity: :class:`oauth_dropins.models.BaseAuth`
      user_url: string, optional URL passed in when authorizing

    Returns:
      ([string url, ...], [string domain, ...])
    """
    actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
    logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

    candidates = util.trim_nulls(util.uniquify(
        [user_url] + microformats2.object_urls(actor)))

    if len(candidates) > MAX_AUTHOR_URLS:
        logging.info('Too many profile links! Only resolving the first %s: %s',
                     MAX_AUTHOR_URLS, candidates)

    urls = []
    for i, url in enumerate(candidates):
        final, domain, ok = util.get_webmention_target(url, resolve=i < MAX_AUTHOR_URLS)
        if ok:
            final = final.lower()
            if util.schemeless(final).startswith(util.schemeless(url.lower())):
                # redirected to a deeper path. use the original higher level URL. #652
                final = url
            # If final has a path segment check if root has a matching rel=me.
            match = re.match(r'^(https?://[^/]+)/.+', final)
            if match and i < MAX_AUTHOR_URLS:
                root = match.group(1)
                resp = util.requests_get(root)
                resp.raise_for_status()
                data = util.mf2py_parse(resp.text, root)
                me_urls = data.get('rels', {}).get('me', [])
                if final in me_urls:
                    final = root
            urls.append(final)

    urls = util.dedupe_urls(urls)  # normalizes domains to lower case
    domains = [util.domain_from_link(url) for url in urls]
    return urls, domains
def expand_target_urls(self, activity):
    """Expand the inReplyTo or object fields of an ActivityStreams object
    by fetching the original and looking for rel=syndication URLs.

    This method modifies the dict in place.

    Args:
      activity: an ActivityStreams dict of the activity being published
    """
    for field in ('inReplyTo', 'object'):
        # microformats2.json_to_object de-dupes, no need to do it here
        objs = activity.get(field)
        if not objs:
            continue

        if isinstance(objs, dict):
            objs = [objs]

        augmented = list(objs)
        for obj in objs:
            url = obj.get('url')
            if not url:
                continue

            # get_webmention_target weeds out silos and non-HTML targets
            # that we wouldn't want to download and parse
            url, _, ok = util.get_webmention_target(url)
            if not ok:
                continue

            # fetch_mf2 raises a fuss if it can't fetch a mf2 document;
            # easier to just grab this ourselves than add a bunch of
            # special-cases to that method
            logging.debug('expand_target_urls fetching field=%s, url=%s', field, url)
            try:
                resp = util.requests_get(url)
                resp.raise_for_status()
                data = util.mf2py_parse(resp.text, url)
            except AssertionError:
                raise  # for unit tests
            except BaseException:
                # it's not a big deal if we can't fetch an in-reply-to url
                logging.warning('expand_target_urls could not fetch field=%s, url=%s',
                                field, url, exc_info=True)
                continue

            synd_urls = data.get('rels', {}).get('syndication', [])

            # look for syndication urls in the first h-entry
            queue = collections.deque(data.get('items', []))
            while queue:
                item = queue.popleft()
                item_types = set(item.get('type', []))
                if 'h-feed' in item_types and 'h-entry' not in item_types:
                    queue.extend(item.get('children', []))
                    continue

                # these can be urls or h-cites
                synd_urls += microformats2.get_string_urls(
                    item.get('properties', {}).get('syndication', []))

            logging.debug('expand_target_urls found rel=syndication for url=%s: %r',
                          url, synd_urls)
            augmented += [{'url': u} for u in synd_urls]

        activity[field] = augmented
def expand_target_urls(self, activity):
    """Expand the inReplyTo or object fields of an ActivityStreams object
    by fetching the original and looking for rel=syndication URLs.

    This method modifies the dict in place.

    Args:
      activity: an ActivityStreams dict of the activity being published
    """
    for field in ('inReplyTo', 'object'):
        # microformats2.json_to_object de-dupes, no need to do it here
        objs = activity.get(field)
        if not objs:
            continue

        if isinstance(objs, dict):
            objs = [objs]

        augmented = list(objs)
        for obj in objs:
            url = obj.get('url')
            if not url:
                continue

            # get_webmention_target weeds out silos and non-HTML targets
            # that we wouldn't want to download and parse
            url, _, ok = util.get_webmention_target(url)
            if not ok:
                continue

            # fetch_mf2 raises a fuss if it can't fetch a mf2 document;
            # easier to just grab this ourselves than add a bunch of
            # special-cases to that method
            logging.debug('expand_target_urls fetching field=%s, url=%s', field, url)
            try:
                resp = util.requests_get(url)
                resp.raise_for_status()
                data = mf2py.Parser(url=url, doc=resp.text).to_dict()
            except AssertionError:
                raise  # for unit tests
            except BaseException:
                # it's not a big deal if we can't fetch an in-reply-to url
                logging.warning('expand_target_urls could not fetch field=%s, url=%s',
                                field, url, exc_info=True)
                continue

            synd_urls = data.get('rels', {}).get('syndication', [])

            # look for syndication urls in the first h-entry
            queue = collections.deque(data.get('items', []))
            while queue:
                item = queue.popleft()
                item_types = set(item.get('type', []))
                if 'h-feed' in item_types and 'h-entry' not in item_types:
                    queue.extend(item.get('children', []))
                    continue

                # these can be urls or h-cites
                synd_urls += microformats2.get_string_urls(
                    item.get('properties', {}).get('syndication', []))

            logging.debug('expand_target_urls found rel=syndication for url=%s: %r',
                          url, synd_urls)
            augmented += [{'url': u} for u in synd_urls]

        activity[field] = augmented
def test_requests_get_url_blacklist(self):
    resp = util.requests_get(next(iter(util.URL_BLACKLIST)))
    self.assertEquals(util.HTTP_REQUEST_REFUSED_STATUS_CODE, resp.status_code)
    self.assertEquals('Sorry, Bridgy has blacklisted this URL.', resp.content)
def fetch_mf2(self, url, id=None, require_mf2=True, raise_errors=False):
    """Fetches a URL and extracts its mf2 data.

    Side effects: sets :attr:`entity`\ .html on success, calls :attr:`error()`
    on errors.

    Args:
      url: string
      id: string, optional id of specific element to extract and parse. defaults
        to the whole page.
      require_mf2: boolean, whether to return error if no mf2 are found
      raise_errors: boolean, whether to let error exceptions propagate up or
        handle them

    Returns:
      (:class:`requests.Response`, mf2 data dict) on success, None on failure
    """
    try:
        resp = util.requests_get(url)
        resp.raise_for_status()
    except werkzeug.exceptions.HTTPException:
        # raised by us, probably via self.error()
        raise
    except BaseException as e:
        if raise_errors:
            raise
        util.interpret_http_exception(e)  # log exception
        self.error(f'Could not fetch source URL {url}')

    if self.entity:
        self.entity.html = resp.text

    # parse microformats
    soup = util.parse_html(resp)
    mf2 = util.parse_mf2(soup, url=resp.url, id=id)
    if id and not mf2:
        self.error(f'Got fragment {id} but no element found with that id.')

    # special case tumblr's markup: div#content > div.post > div.copy
    # convert to mf2 and re-parse
    if not mf2.get('items'):
        contents = soup.find_all(id='content')
        if contents:
            post = contents[0].find_next(class_='post')
            if post:
                post['class'] = 'h-entry'
                copy = post.find_next(class_='copy')
                if copy:
                    copy['class'] = 'e-content'
                photo = post.find_next(class_='photo-wrapper')
                if photo:
                    img = photo.find_next('img')
                    if img:
                        img['class'] = 'u-photo'
                # TODO: i should be able to pass post or contents[0] to mf2py instead
                # here, but it returns no items. mf2py bug?
                doc = str(post)
                mf2 = util.parse_mf2(doc, resp.url)

    logger.debug(f'Parsed microformats2: {json_dumps(mf2, indent=2)}')

    items = mf2.get('items', [])
    if require_mf2 and (not items or not items[0]):
        self.error('No microformats2 data found in ' + resp.url, data=mf2, html=f"""
No <a href="http://microformats.org/get-started">microformats</a> or
<a href="http://microformats.org/wiki/microformats2">microformats2</a> found in
<a href="{resp.url}">{util.pretty_link(resp.url)}</a>! See
<a href="http://indiewebify.me/">indiewebify.me</a> for details (skip to level 2,
<em>Publishing on the IndieWeb</em>).
""")

    return resp, mf2
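# Hypothetical caller of the fetch_mf2 variant above, inside a handler method,
# showing how the returned (response, mf2) tuple is typically unpacked; the
# URL and the surrounding handler are illustrative only.
fetched = self.fetch_mf2('https://example.com/post', require_mf2=True)
if fetched:
    resp, mf2 = fetched
    entry = mf2['items'][0]               # first top-level microformats2 item
    props = entry.get('properties', {})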
def query_dynamic(uid=None):
    if uid is None:
        return
    uid = str(uid)
    query_url = 'http://api.vc.bilibili.com/dynamic_svr/v1/dynamic_svr/space_history' \
                '?host_uid={uid}&offset_dynamic_id=0&need_top=0&platform=web&my_ts={my_ts}'.format(
                    uid=uid, my_ts=int(time.time()))
    headers = get_headers(uid)
    response = util.requests_get(query_url, '查询动态状态', headers=headers, use_proxy=True)
    if util.check_response_is_ok(response):
        result = json.loads(str(response.content, 'utf-8'))
        if result['code'] != 0:
            logger.error('【查询动态状态】请求返回数据code错误:{code}'.format(code=result['code']))
        else:
            data = result['data']
            if len(data['cards']) == 0:
                logger.info('【查询动态状态】【{uid}】动态列表为空'.format(uid=uid))
                return

            item = data['cards'][0]
            dynamic_id = item['desc']['dynamic_id']
            try:
                uname = item['desc']['user_profile']['info']['uname']
            except KeyError:
                logger.error('【查询动态状态】【{uid}】获取不到uname'.format(uid=uid))
                return

            if DYNAMIC_DICT.get(uid, None) is None:
                DYNAMIC_DICT[uid] = deque(maxlen=LEN_OF_DEQUE)
                cards = data['cards']
                for index in range(LEN_OF_DEQUE):
                    if index < len(cards):
                        DYNAMIC_DICT[uid].appendleft(cards[index]['desc']['dynamic_id'])
                logger.info('【查询动态状态】【{uname}】动态初始化:{queue}'.format(
                    uname=uname, queue=DYNAMIC_DICT[uid]))
                return

            if dynamic_id not in DYNAMIC_DICT[uid]:
                previous_dynamic_id = DYNAMIC_DICT[uid].pop()
                DYNAMIC_DICT[uid].append(previous_dynamic_id)
                logger.info('【查询动态状态】【{}】上一条动态id[{}],本条动态id[{}]'.format(
                    uname, previous_dynamic_id, dynamic_id))
                DYNAMIC_DICT[uid].append(dynamic_id)
                logger.info(DYNAMIC_DICT[uid])

                dynamic_type = item['desc']['type']
                if dynamic_type not in [2, 4, 8, 64]:
                    logger.info('【查询动态状态】【{uname}】动态有更新,但不在需要推送的动态类型列表中'.format(uname=uname))
                    return

                timestamp = item['desc']['timestamp']
                dynamic_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))
                card_str = item['card']
                card = json.loads(card_str)

                content = None
                pic_url = None
                if dynamic_type == 1:
                    # repost
                    content = card['item']['content']
                elif dynamic_type == 2:
                    # post with images
                    content = card['item']['description']
                    pic_url = card['item']['pictures'][0]['img_src']
                elif dynamic_type == 4:
                    # text-only post
                    content = card['item']['content']
                elif dynamic_type == 8:
                    # video submission
                    content = card['title']
                    pic_url = card['pic']
                elif dynamic_type == 64:
                    # article post
                    content = card['title']
                    pic_url = card['image_urls'][0]
                logger.info('【查询动态状态】【{uname}】动态有更新,准备推送:{content}'.format(
                    uname=uname, content=content[:30]))
                push.push_for_bili_dynamic(uname, dynamic_id, content, pic_url,
                                           dynamic_type, dynamic_time)
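# The deque with maxlen=LEN_OF_DEQUE above is what prevents the same dynamic
# from being pushed twice: only ids that have never been seen enter the queue,
# and the oldest ids fall off automatically. A stripped-down sketch of that
# dedupe pattern; the capacity of 10 is an assumed value for LEN_OF_DEQUE.
from collections import deque

LEN_OF_DEQUE = 10  # assumed capacity
_seen = {}  # uid -> deque of recently seen dynamic ids

def is_new_dynamic(uid, dynamic_id):
    queue = _seen.setdefault(uid, deque(maxlen=LEN_OF_DEQUE))
    if dynamic_id in queue:
        return False
    queue.append(dynamic_id)
    return True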
def fetch_mf2(self, url, require_mf2=True, raise_errors=False):
    """Fetches a URL and extracts its mf2 data.

    Side effects: sets :attr:`entity`\ .html on success, calls :attr:`error()`
    on errors.

    Args:
      url: string
      require_mf2: boolean, whether to return error if no mf2 are found
      raise_errors: boolean, whether to let error exceptions propagate up or
        handle them

    Returns:
      (:class:`requests.Response`, mf2 data dict) on success, None on failure
    """
    try:
        fetched = util.requests_get(url)
        fetched.raise_for_status()
    except BaseException as e:
        if raise_errors:
            raise
        util.interpret_http_exception(e)  # log exception
        return self.error('Could not fetch source URL %s' % url)

    if self.entity:
        self.entity.html = fetched.text

    # .text is decoded unicode string, .content is raw bytes. if the HTTP
    # headers didn't specify a charset, pass raw bytes to BeautifulSoup so it
    # can look for a <meta> tag with a charset and decode.
    text = (fetched.text if 'charset' in fetched.headers.get('content-type', '')
            else fetched.content)
    doc = util.beautifulsoup_parse(text)

    # parse microformats
    data = util.mf2py_parse(doc, fetched.url)

    # special case tumblr's markup: div#content > div.post > div.copy
    # convert to mf2 and re-parse
    if not data.get('items'):
        contents = doc.find_all(id='content')
        if contents:
            post = contents[0].find_next(class_='post')
            if post:
                post['class'] = 'h-entry'
                copy = post.find_next(class_='copy')
                if copy:
                    copy['class'] = 'e-content'
                photo = post.find_next(class_='photo-wrapper')
                if photo:
                    img = photo.find_next('img')
                    if img:
                        img['class'] = 'u-photo'
                doc = unicode(post)
                data = util.mf2py_parse(doc, fetched.url)

    logging.debug('Parsed microformats2: %s', json.dumps(data, indent=2))

    items = data.get('items', [])
    if require_mf2 and (not items or not items[0]):
        return self.error('No microformats2 data found in ' + fetched.url,
                          data=data, html="""
No <a href="http://microformats.org/get-started">microformats</a> or
<a href="http://microformats.org/wiki/microformats2">microformats2</a> found in
<a href="%s">%s</a>! See <a href="http://indiewebify.me/">indiewebify.me</a>
for details (skip to level 2, <em>Publishing on the IndieWeb</em>).
""" % (fetched.url, util.pretty_link(fetched.url)))

    return fetched, data
def query_dynamic(uid=None, sec_uid=None):
    if uid is None or sec_uid is None:
        return
    signature = sign.get_signature()
    query_url = 'http://www.iesdouyin.com/web/api/v2/aweme/post?sec_uid={}&count=21&max_cursor=0&aid=1128&_signature={}'.format(
        sec_uid, signature)
    headers = get_headers(uid, sec_uid)
    response = util.requests_get(query_url, '查询动态状态', headers=headers, use_proxy=True)
    if util.check_response_is_ok(response):
        result = json.loads(str(response.content, 'utf-8'))
        if result['status_code'] != 0:
            logger.error('【查询动态状态】请求返回数据code错误:{code}'.format(code=result['status_code']))
        else:
            aweme_list = result['aweme_list']
            if len(aweme_list) == 0:
                logger.info('【查询动态状态】【{sec_uid}】动态列表为空'.format(sec_uid=sec_uid))
                return

            aweme = aweme_list[0]
            aweme_id = aweme['aweme_id']
            uid = aweme['author']['uid']
            nickname = aweme['author']['nickname']

            if DYNAMIC_DICT.get(uid, None) is None:
                DYNAMIC_DICT[uid] = deque(maxlen=LEN_OF_DEQUE)
                for index in range(LEN_OF_DEQUE):
                    if index < len(aweme_list):
                        DYNAMIC_DICT[uid].appendleft(aweme_list[index]['aweme_id'])
                logger.info('【查询动态状态】【{nickname}】动态初始化:{queue}'.format(
                    nickname=nickname, queue=DYNAMIC_DICT[uid]))
                return

            if aweme_id not in DYNAMIC_DICT[uid]:
                previous_aweme_id = DYNAMIC_DICT[uid].pop()
                DYNAMIC_DICT[uid].append(previous_aweme_id)
                logger.info('【查询动态状态】【{}】上一条动态id[{}],本条动态id[{}]'.format(
                    nickname, previous_aweme_id, aweme_id))
                DYNAMIC_DICT[uid].append(aweme_id)
                logger.info(DYNAMIC_DICT[uid])

                aweme_type = aweme['aweme_type']
                if aweme_type not in [4]:
                    logger.info('【查询动态状态】【{nickname}】动态有更新,但不在需要推送的动态类型列表中'.format(nickname=nickname))
                    return

                content = None
                pic_url = None
                video_url = None
                if aweme_type == 4:
                    content = aweme['desc']
                    pic_url = aweme['video']['origin_cover']['url_list'][0]
                    video_url_list = aweme['video']['play_addr']['url_list']
                    for temp in video_url_list:
                        if 'ixigua.com' in temp or 'api.amemv.com' in temp:
                            continue
                        if 'aweme.snssdk.com' in temp or 'douyinvod.com' in temp:
                            video_url = temp
                            break
                logger.info('【查询动态状态】【{nickname}】动态有更新,准备推送:{content}'.format(
                    nickname=nickname, content=content[:30]))
                push.push_for_douyin_dynamic(nickname, aweme_id, content, pic_url, video_url)
def _process_author(source, author_url, refetch=False, store_blanks=True):
    """Fetch the author's domain URL, and look for syndicated posts.

    Args:
      source: a subclass of models.Source
      author_url: the author's homepage URL
      refetch: boolean, whether to refetch and process entries we've seen before
      store_blanks: boolean, whether we should store blank SyndicatedPosts when
        we don't find a relationship

    Return:
      a dict of syndicated_url to a list of new models.SyndicatedPost
    """
    # for now use whether the url is a valid webmention target
    # as a proxy for whether it's worth searching it.
    # TODO skip sites we know don't have microformats2 markup
    author_url, _, ok = util.get_webmention_target(author_url)
    if not ok:
        return {}

    try:
        logging.debug('fetching author url %s', author_url)
        author_resp = util.requests_get(author_url)
        # TODO for error codes that indicate a temporary error, should we make
        # a certain number of retries before giving up forever?
        author_resp.raise_for_status()
        author_dom = BeautifulSoup(author_resp.text)
    except AssertionError:
        raise  # for unit tests
    except BaseException:
        # TODO limit allowed failures, cache the author's h-feed url
        # or the # of times we've failed to fetch it
        logging.warning('Could not fetch author url %s', author_url, exc_info=True)
        return {}

    feeditems = _find_feed_items(author_url, author_dom)

    # look for all other feed urls using rel='feed', type='text/html'
    feed_urls = set()
    for rel_feed_node in (author_dom.find_all('link', rel='feed')
                          + author_dom.find_all('a', rel='feed')):
        feed_url = rel_feed_node.get('href')
        if not feed_url:
            continue

        feed_url = urlparse.urljoin(author_url, feed_url)
        feed_type = rel_feed_node.get('type')
        if not feed_type:
            # type is not specified, use this to confirm that it's text/html
            feed_url, _, feed_type_ok = util.get_webmention_target(feed_url)
        else:
            feed_type_ok = feed_type == 'text/html'

        if feed_url == author_url:
            logging.debug('author url is the feed url, ignoring')
        elif not feed_type_ok:
            logging.debug('skipping feed of type %s', feed_type)
        else:
            feed_urls.add(feed_url)

    for feed_url in feed_urls:
        try:
            logging.debug("fetching author's rel-feed %s", feed_url)
            feed_resp = util.requests_get(feed_url)
            feed_resp.raise_for_status()
            logging.debug("author's rel-feed fetched successfully %s", feed_url)
            feeditems = _merge_hfeeds(feeditems,
                                      _find_feed_items(feed_url, feed_resp.text))

            domain = util.domain_from_link(feed_url)
            if source.updates is not None and domain not in source.domains:
                domains = source.updates.setdefault('domains', source.domains)
                if domain not in domains:
                    logging.info('rel-feed found new domain %s! adding to source', domain)
                    domains.append(domain)
        except AssertionError:
            raise  # reraise assertions for unit tests
        except BaseException:
            logging.warning('Could not fetch h-feed url %s.', feed_url, exc_info=True)

    permalink_to_entry = {}
    for child in feeditems:
        if 'h-entry' in child['type']:
            # TODO maybe limit to first ~30 entries? (do that here rather than
            # below because we want the *first* n entries)
            for permalink in child['properties'].get('url', []):
                if isinstance(permalink, basestring):
                    permalink_to_entry[permalink] = child
                else:
                    logging.warn('unexpected non-string "url" property: %s', permalink)

    # query all preexisting permalinks at once, instead of once per link
    permalinks_list = list(permalink_to_entry.keys())
    # fetch the maximum allowed entries (currently 30) at a time
    preexisting_list = itertools.chain.from_iterable(
        SyndicatedPost.query(
            SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
            ancestor=source.key)
        for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
    preexisting = {}
    for r in preexisting_list:
        preexisting.setdefault(r.original, []).append(r)

    results = {}
    for permalink, entry in permalink_to_entry.iteritems():
        logging.debug('processing permalink: %s', permalink)
        new_results = _process_entry(source, permalink, entry, refetch,
                                     preexisting.get(permalink, []),
                                     store_blanks=store_blanks)
        for key, value in new_results.iteritems():
            results.setdefault(key, []).extend(value)

    if source.updates is not None and results:
        # keep track of the last time we've seen rel=syndication urls for
        # this author. this helps us decide whether to refetch periodically
        # and look for updates.
        # Source will be saved at the end of each round of polling
        now = util.now_fn()
        logging.debug('updating source last_syndication_url %s', now)
        source.updates['last_syndication_url'] = now

    return results
def process_entry(source, permalink, feed_entry, refetch, preexisting,
                  store_blanks=True):
    """Fetch and process an h-entry and save a new :class:`models.SyndicatedPost`.

    Args:
      source:
      permalink: url of the unprocessed post
      feed_entry: the h-feed version of the h-entry dict, often contains
        a partial version of the h-entry at the permalink
      refetch: boolean, whether to refetch and process entries we've seen before
      preexisting: list of previously discovered :class:`models.SyndicatedPost`\ s
        for this permalink
      store_blanks: boolean, whether we should store blank
        :class:`models.SyndicatedPost`\ s when we don't find a relationship

    Returns:
      a dict from syndicated url to a list of new :class:`models.SyndicatedPost`\ s
    """
    # if the post has already been processed, do not add to the results
    # since this method only returns *newly* discovered relationships.
    if preexisting:
        # if we're refetching and this one is blank, do not return.
        # if there is a blank entry, it should be the one and only entry,
        # but go ahead and check 'all' of them to be safe.
        if not refetch:
            return {}
        synds = [s.syndication for s in preexisting if s.syndication]
        if synds:
            logging.debug('previously found relationship(s) for original %s: %s',
                          permalink, synds)

    # first try with the h-entry from the h-feed. if we find the syndication url
    # we're looking for, we don't have to fetch the permalink
    permalink, _, type_ok = util.get_webmention_target(permalink)
    usynd = feed_entry.get('properties', {}).get('syndication', [])
    if usynd:
        logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)
    results = _process_syndication_urls(
        source, permalink,
        set(url for url in usynd if isinstance(url, basestring)), preexisting)
    success = True

    if results:
        source.updates['last_feed_syndication_url'] = util.now_fn()
    elif not source.last_feed_syndication_url or not feed_entry:
        # fetch the full permalink page if we think it might have more details
        parsed = None
        try:
            logging.debug('fetching post permalink %s', permalink)
            if type_ok:
                resp = util.requests_get(permalink)
                resp.raise_for_status()
                parsed = util.mf2py_parse(resp.text, permalink)
        except AssertionError:
            raise  # for unit tests
        except BaseException:
            # TODO limit the number of allowed failures
            logging.info('Could not fetch permalink %s', permalink, exc_info=True)
            success = False

        if parsed:
            syndication_urls = set()
            relsynd = parsed.get('rels').get('syndication', [])
            if relsynd:
                logging.debug('rel-syndication links: %s', relsynd)
            syndication_urls.update(url for url in relsynd
                                    if isinstance(url, basestring))
            # there should only be one h-entry on a permalink page, but
            # we'll check all of them just in case.
            for hentry in (item for item in parsed['items']
                           if 'h-entry' in item['type']):
                usynd = hentry.get('properties', {}).get('syndication', [])
                if usynd:
                    logging.debug('u-syndication links: %s', usynd)
                syndication_urls.update(url for url in usynd
                                        if isinstance(url, basestring))
            results = _process_syndication_urls(source, permalink,
                                                syndication_urls, preexisting)

    # detect and delete SyndicatedPosts that were removed from the site
    if success:
        result_syndposts = itertools.chain(*results.values())
        for syndpost in list(preexisting):
            if syndpost.syndication and syndpost not in result_syndposts:
                logging.info('deleting relationship that disappeared: %s', syndpost)
                syndpost.key.delete()
                preexisting.remove(syndpost)

    if not results:
        logging.debug('no syndication links from %s to current source %s.',
                      permalink, source.label())
        results = {}
        if store_blanks and not preexisting:
            # remember that this post doesn't have syndication links for this
            # particular source
            logging.debug('saving empty relationship so that %s will not be '
                          'searched again', permalink)
            SyndicatedPost.insert_original_blank(source, permalink)

    # only return results that are not in the preexisting list
    new_results = {}
    for syndurl, syndposts_for_url in results.iteritems():
        for syndpost in syndposts_for_url:
            if syndpost not in preexisting:
                new_results.setdefault(syndurl, []).append(syndpost)

    if new_results:
        logging.debug('discovered relationships %s', new_results)
    return new_results
def _process_author(source, author_url, refetch=False, store_blanks=True):
    """Fetch the author's domain URL, and look for syndicated posts.

    Args:
      source: a subclass of :class:`models.Source`
      author_url: the author's homepage URL
      refetch: boolean, whether to refetch and process entries we've seen before
      store_blanks: boolean, whether we should store blank
        :class:`models.SyndicatedPost`\ s when we don't find a relationship

    Return:
      a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
    """
    # for now use whether the url is a valid webmention target
    # as a proxy for whether it's worth searching it.
    author_url, _, ok = util.get_webmention_target(author_url)
    if not ok:
        return {}

    try:
        logging.debug('fetching author url %s', author_url)
        author_resp = util.requests_get(author_url)
        # TODO for error codes that indicate a temporary error, should we make
        # a certain number of retries before giving up forever?
        author_resp.raise_for_status()
        author_dom = util.beautifulsoup_parse(author_resp.text)
    except AssertionError:
        raise  # for unit tests
    except BaseException:
        # TODO limit allowed failures, cache the author's h-feed url
        # or the # of times we've failed to fetch it
        logging.info('Could not fetch author url %s', author_url, exc_info=True)
        return {}

    feeditems = _find_feed_items(author_url, author_dom)

    # look for all other feed urls using rel='feed', type='text/html'
    feed_urls = set()
    for rel_feed_node in (author_dom.find_all('link', rel='feed')
                          + author_dom.find_all('a', rel='feed')):
        feed_url = rel_feed_node.get('href')
        if not feed_url:
            continue

        feed_url = urlparse.urljoin(author_url, feed_url)
        feed_type = rel_feed_node.get('type')
        if feed_type and feed_type != 'text/html':
            feed_ok = False
        else:
            # double check that it's text/html, not too big, etc
            feed_url, _, feed_ok = util.get_webmention_target(feed_url)

        if feed_url == author_url:
            logging.debug('author url is the feed url, ignoring')
        elif not feed_ok:
            logging.debug('skipping feed of type %s', feed_type)
        else:
            feed_urls.add(feed_url)

    for feed_url in feed_urls:
        try:
            logging.debug("fetching author's rel-feed %s", feed_url)
            feed_resp = util.requests_get(feed_url)
            feed_resp.raise_for_status()
            logging.debug("author's rel-feed fetched successfully %s", feed_url)
            feeditems = _merge_hfeeds(feeditems,
                                      _find_feed_items(feed_url, feed_resp.text))

            domain = util.domain_from_link(feed_url)
            if source.updates is not None and domain not in source.domains:
                domains = source.updates.setdefault('domains', source.domains)
                if domain not in domains:
                    logging.info('rel-feed found new domain %s! adding to source', domain)
                    domains.append(domain)
        except AssertionError:
            raise  # reraise assertions for unit tests
        except BaseException:
            logging.info('Could not fetch h-feed url %s.', feed_url, exc_info=True)

    # sort by dt-updated/dt-published
    def updated_or_published(item):
        props = microformats2.first_props(item.get('properties'))
        return props.get('updated') or props.get('published')

    feeditems.sort(key=updated_or_published, reverse=True)

    permalink_to_entry = collections.OrderedDict()
    for child in feeditems:
        if 'h-entry' in child['type']:
            permalinks = child['properties'].get('url', [])
            if not permalinks:
                logging.debug('ignoring h-entry with no u-url!')

            for permalink in permalinks:
                if isinstance(permalink, basestring):
                    permalink_to_entry[permalink] = child
                else:
                    logging.warn('unexpected non-string "url" property: %s', permalink)

            max = (MAX_PERMALINK_FETCHES_BETA if source.is_beta_user()
                   else MAX_PERMALINK_FETCHES)
            if len(permalink_to_entry) >= max:
                logging.info('Hit cap of %d permalinks. Stopping.', max)
                break

    # query all preexisting permalinks at once, instead of once per link
    permalinks_list = list(permalink_to_entry.keys())
    # fetch the maximum allowed entries (currently 30) at a time
    preexisting_list = itertools.chain.from_iterable(
        SyndicatedPost.query(
            SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
            ancestor=source.key)
        for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
    preexisting = {}
    for r in preexisting_list:
        preexisting.setdefault(r.original, []).append(r)

    results = {}
    for permalink, entry in permalink_to_entry.iteritems():
        logging.debug('processing permalink: %s', permalink)
        new_results = process_entry(source, permalink, entry, refetch,
                                    preexisting.get(permalink, []),
                                    store_blanks=store_blanks)
        for key, value in new_results.iteritems():
            results.setdefault(key, []).extend(value)

    if source.updates is not None and results:
        # keep track of the last time we've seen rel=syndication urls for
        # this author. this helps us decide whether to refetch periodically
        # and look for updates.
        # Source will be saved at the end of each round of polling
        source.updates['last_syndication_url'] = util.now_fn()

    return results
def _process_entry(source, permalink, feed_entry, refetch, preexisting,
                   store_blanks=True):
    """Fetch and process an h-entry, saving a new SyndicatedPost to the DB
    if successful.

    Args:
      source:
      permalink: url of the unprocessed post
      feed_entry: the h-feed version of the h-entry dict, often contains
        a partial version of the h-entry at the permalink
      refetch: boolean, whether to refetch and process entries we've seen before
      preexisting: a list of previously discovered models.SyndicatedPosts
        for this permalink
      store_blanks: boolean, whether we should store blank SyndicatedPosts when
        we don't find a relationship

    Returns:
      a dict from syndicated url to a list of new models.SyndicatedPosts
    """
    # if the post has already been processed, do not add to the results
    # since this method only returns *newly* discovered relationships.
    if preexisting:
        # if we're refetching and this one is blank, do not return.
        # if there is a blank entry, it should be the one and only entry,
        # but go ahead and check 'all' of them to be safe.
        if not refetch:
            return {}
        synds = [s.syndication for s in preexisting if s.syndication]
        if synds:
            logging.debug('previously found relationship(s) for original %s: %s',
                          permalink, synds)

    # first try with the h-entry from the h-feed. if we find the syndication url
    # we're looking for, we don't have to fetch the permalink
    permalink, _, type_ok = util.get_webmention_target(permalink)
    usynd = feed_entry.get('properties', {}).get('syndication', [])
    if usynd:
        logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)
    results = _process_syndication_urls(source, permalink, set(
        url for url in usynd if isinstance(url, basestring)), preexisting)
    success = True

    # fetch the full permalink page, which often has more detailed information
    if not results:
        parsed = None
        try:
            logging.debug('fetching post permalink %s', permalink)
            if type_ok:
                resp = util.requests_get(permalink)
                resp.raise_for_status()
                parsed = mf2py.Parser(url=permalink, doc=resp.text).to_dict()
        except AssertionError:
            raise  # for unit tests
        except BaseException:
            # TODO limit the number of allowed failures
            logging.warning('Could not fetch permalink %s', permalink, exc_info=True)
            success = False

        if parsed:
            syndication_urls = set()
            relsynd = parsed.get('rels').get('syndication', [])
            if relsynd:
                logging.debug('rel-syndication links: %s', relsynd)
            syndication_urls.update(url for url in relsynd
                                    if isinstance(url, basestring))
            # there should only be one h-entry on a permalink page, but
            # we'll check all of them just in case.
            for hentry in (item for item in parsed['items']
                           if 'h-entry' in item['type']):
                usynd = hentry.get('properties', {}).get('syndication', [])
                if usynd:
                    logging.debug('u-syndication links: %s', usynd)
                syndication_urls.update(url for url in usynd
                                        if isinstance(url, basestring))
            results = _process_syndication_urls(
                source, permalink, syndication_urls, preexisting)

    # detect and delete SyndicatedPosts that were removed from the site
    if success:
        result_syndposts = itertools.chain(*results.values())
        for syndpost in list(preexisting):
            if syndpost.syndication and syndpost not in result_syndposts:
                logging.info('deleting relationship that disappeared: %s', syndpost)
                syndpost.key.delete()
                preexisting.remove(syndpost)

    if not results:
        logging.debug('no syndication links from %s to current source %s.',
                      permalink, source.label())
        results = {}
        if store_blanks and not preexisting:
            # remember that this post doesn't have syndication links for this
            # particular source
            logging.debug('saving empty relationship so that %s will not be '
                          'searched again', permalink)
            SyndicatedPost.insert_original_blank(source, permalink)

    # only return results that are not in the preexisting list
    new_results = {}
    for syndurl, syndposts_for_url in results.iteritems():
        for syndpost in syndposts_for_url:
            if syndpost not in preexisting:
                new_results.setdefault(syndurl, []).append(syndpost)

    if new_results:
        logging.debug('discovered relationships %s', new_results)
    return new_results
def _process_author(source, author_url, refetch=False, store_blanks=True):
    """Fetch the author's domain URL, and look for syndicated posts.

    Args:
      source: a subclass of :class:`models.Source`
      author_url: the author's homepage URL
      refetch: boolean, whether to refetch and process entries we've seen before
      store_blanks: boolean, whether we should store blank
        :class:`models.SyndicatedPost`\ s when we don't find a relationship

    Return:
      a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
    """
    # for now use whether the url is a valid webmention target
    # as a proxy for whether it's worth searching it.
    author_url, _, ok = util.get_webmention_target(author_url)
    if not ok:
        return {}

    try:
        logging.debug('fetching author url %s', author_url)
        author_resp = util.requests_get(author_url)
        # TODO for error codes that indicate a temporary error, should we make
        # a certain number of retries before giving up forever?
        author_resp.raise_for_status()
        author_dom = util.beautifulsoup_parse(author_resp.text)
    except AssertionError:
        raise  # for unit tests
    except BaseException:
        # TODO limit allowed failures, cache the author's h-feed url
        # or the # of times we've failed to fetch it
        logging.info('Could not fetch author url %s', author_url, exc_info=True)
        return {}

    feeditems = _find_feed_items(author_url, author_dom)

    # look for all other feed urls using rel='feed', type='text/html'
    feed_urls = set()
    for rel_feed_node in (author_dom.find_all('link', rel='feed')
                          + author_dom.find_all('a', rel='feed')):
        feed_url = rel_feed_node.get('href')
        if not feed_url:
            continue

        feed_url = urlparse.urljoin(author_url, feed_url)
        feed_type = rel_feed_node.get('type')
        if feed_type and feed_type != 'text/html':
            feed_ok = False
        else:
            # double check that it's text/html, not too big, etc
            feed_url, _, feed_ok = util.get_webmention_target(feed_url)

        if feed_url == author_url:
            logging.debug('author url is the feed url, ignoring')
        elif not feed_ok:
            logging.debug('skipping feed of type %s', feed_type)
        else:
            feed_urls.add(feed_url)

    for feed_url in feed_urls:
        try:
            logging.debug("fetching author's rel-feed %s", feed_url)
            feed_resp = util.requests_get(feed_url)
            feed_resp.raise_for_status()
            logging.debug("author's rel-feed fetched successfully %s", feed_url)
            feeditems = _merge_hfeeds(feeditems,
                                      _find_feed_items(feed_url, feed_resp.text))

            domain = util.domain_from_link(feed_url)
            if source.updates is not None and domain not in source.domains:
                domains = source.updates.setdefault('domains', source.domains)
                if domain not in domains:
                    logging.info('rel-feed found new domain %s! adding to source', domain)
                    domains.append(domain)
        except AssertionError:
            raise  # reraise assertions for unit tests
        except BaseException:
            logging.info('Could not fetch h-feed url %s.', feed_url, exc_info=True)

    # sort by dt-updated/dt-published
    def updated_or_published(item):
        props = microformats2.first_props(item.get('properties'))
        return props.get('updated') or props.get('published')

    feeditems.sort(key=updated_or_published, reverse=True)

    permalink_to_entry = collections.OrderedDict()
    for child in feeditems:
        if 'h-entry' in child['type']:
            permalinks = child['properties'].get('url', [])
            if not permalinks:
                logging.debug('ignoring h-entry with no u-url!')

            for permalink in permalinks:
                if isinstance(permalink, basestring):
                    permalink_to_entry[permalink] = child
                else:
                    logging.warn('unexpected non-string "url" property: %s', permalink)

            max = (MAX_PERMALINK_FETCHES_BETA if source.is_beta_user()
                   else MAX_PERMALINK_FETCHES)
            if len(permalink_to_entry) >= max:
                logging.info('Hit cap of %d permalinks. Stopping.', max)
                break

    # query all preexisting permalinks at once, instead of once per link
    permalinks_list = list(permalink_to_entry.keys())
    # fetch the maximum allowed entries (currently 30) at a time
    preexisting_list = itertools.chain.from_iterable(
        SyndicatedPost.query(
            SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
            ancestor=source.key)
        for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
    preexisting = {}
    for r in preexisting_list:
        preexisting.setdefault(r.original, []).append(r)

    results = {}
    for permalink, entry in permalink_to_entry.iteritems():
        logging.debug('processing permalink: %s', permalink)
        new_results = process_entry(source, permalink, entry, refetch,
                                    preexisting.get(permalink, []),
                                    store_blanks=store_blanks)
        for key, value in new_results.iteritems():
            results.setdefault(key, []).extend(value)

    if source.updates is not None and results:
        # keep track of the last time we've seen rel=syndication urls for
        # this author. this helps us decide whether to refetch periodically
        # and look for updates.
        # Source will be saved at the end of each round of polling
        source.updates['last_syndication_url'] = util.now_fn()

    return results