import datetime
import re
from random import shuffle
from time import mktime
from typing import Dict

import feedparser

# Project-level imports (models and helpers defined elsewhere in the repo):
# AutoImportResource, Item, Tag, apply_parsing_rules, apply_video_rules,
# diff, load_pickle_file, save_pickle_file, save_item, _get_tags_for_item,
# _get_http_data_of_url.


def get_data_for_rss_item(rss_item: Dict) -> Dict:
    # Fetch the page behind the RSS item and attach the HTTP status code,
    # the cleaned content and the raw body to the item dict.
    http_code, content, raw_content = _get_http_data_of_url(rss_item['link'])
    rss_item.update({
        'raw_content': raw_content,
        'http_code': http_code,
        'content': content,
    })
    return rss_item
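# The helper `_get_http_data_of_url` is used throughout this module but not
# defined here. Below is a minimal sketch of what it could look like,
# assuming a plain HTTP fetch via `requests`; the name suffix `_sketch`, the
# timeout value and the error handling are assumptions, not the project's
# actual implementation.


def _get_http_data_of_url_sketch(url: str):
    """Hypothetical stand-in: return (http_code, content, raw_content)."""
    import requests  # assumption: the real helper may use another client

    try:
        response = requests.get(url, timeout=10)
        http_code = str(response.status_code)
        raw_content = response.text
        # Assumption: 'content' is the body stripped of markup, mirroring
        # the re.sub('<.*?>', ...) cleanup used in import_rss below.
        content = re.sub('<.*?>', '', raw_content)
    except requests.RequestException:
        http_code, content, raw_content = '404', '', ''
    return http_code, content, raw_content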
def update_news():
    items_on_once = 10
    filepath = './pk_list.pickle'
    # Work through the queue in small batches and persist the remainder to
    # disk, so a broken news item cannot stall the whole run.
    pk_list = load_pickle_file(filepath)
    if pk_list is None:
        return
    shuffle(pk_list)

    list_tags = list(Tag.objects.values_list('name', flat=True))
    while pk_list:
        print('Parse: (left - %s)' % len(pk_list))
        success_pks = []
        for item in Item.objects.filter(pk__in=pk_list[:items_on_once]):
            try:
                http_code, content, _ = _get_http_data_of_url(item.link)
                assert http_code != '404', 'Not found page'
                item_data = {
                    'title': item.title,
                    'content': content,
                    'description': item.description,
                }
                tags_for_item = _get_tags_for_item(item_data, list_tags)
                if tags_for_item:
                    # TODO: should we detect which tags are missing and add
                    # only those, or write them all and let the database
                    # sort it out? The difference is the number of queries.
                    tags_for_insert = diff(
                        tags_for_item,
                        item.tags.values_list('name', flat=True))
                    tags_objects = Tag.objects.filter(name__in=tags_for_insert)
                    item.tags.add(*tags_objects)
                    item.save()
            except Exception:
                pass
            # Mark the item as processed even on failure, so one bad page
            # does not keep the loop stuck retrying it.
            success_pks.append(item.pk)
        Item.objects.filter(pk__in=success_pks).update(to_update=False)
        pk_list = diff(pk_list, success_pks)
        save_pickle_file(filepath, pk_list)
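# update_news leans on three small helpers defined elsewhere in the project.
# Hedged sketches follow, under the assumption that `diff` is an
# order-preserving list difference and the pickle helpers tolerate a missing
# file; the names are real, the bodies and `_sketch` suffixes are guesses.


def _load_pickle_file_sketch(filepath):
    """Hypothetical stand-in for load_pickle_file: None if no file."""
    import os
    import pickle

    if not os.path.exists(filepath):
        return None
    with open(filepath, 'rb') as fio:
        return pickle.load(fio)


def _save_pickle_file_sketch(filepath, data):
    """Hypothetical stand-in for save_pickle_file."""
    import pickle

    with open(filepath, 'wb') as fio:
        pickle.dump(data, fio)


def _diff_sketch(xs, ys):
    """Hypothetical stand-in for diff: items of xs not present in ys."""
    ys = set(ys)
    return [x for x in xs if x not in ys]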
def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res='rss',
                                                 in_edit=False):
        rssnews = feedparser.parse(src.link)
        today = datetime.date.today()
        week_before = today - datetime.timedelta(weeks=1)
        for n in rssnews.entries:
            # Skip entries that are already stored.
            if Item.objects.filter(link=n.link).exists():
                continue

            # Skip entries older than a week (when a publish date is given).
            time_struct = getattr(n, 'published_parsed', None)
            if time_struct:
                _timestamp = mktime(time_struct)
                dt = datetime.datetime.fromtimestamp(_timestamp)
                if dt.date() < week_before:
                    continue

            title = n.title
            # title = u'[!] %s' % n.title if fresh_google_check(n.title) else n.title

            http_code, content, raw_content = _get_http_data_of_url(n.link)
            item_data = {
                'title': title,
                'link': n.link,
                'raw_content': raw_content,
                'http_code': http_code,
                'content': content,
                'description': re.sub('<.*?>', '', n.summary),
                'resource': src.resource,
                'language': src.language,
            }
            item_data.update(
                apply_parsing_rules(item_data, **kwargs)
                if kwargs.get('query_rules') else {})
            item_data = apply_video_rules(item_data.copy())
            save_item(item_data)
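# These functions read like the body of a Django management command. A
# minimal wiring sketch, assuming a standard BaseCommand entry point; the
# project's actual command name, options and call order may differ.

from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = 'Import RSS entries and re-tag stored news items (sketch).'

    def handle(self, *args, **options):
        import_rss(**options)
        update_news()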