import datetime
import re
from time import mktime

import feedparser

# NOTE: the project-level helpers used below (Item, AutoImportResource,
# get_tweets, fresh_google_check, apply_parsing_rules, save_item and
# _get_http_data_of_url) are assumed to be defined in, or imported from,
# the surrounding package.


def import_tweets(**kwargs):
    for i in get_tweets():
        # Skip links that are already stored, so we do not parse them twice.
        if Item.objects.filter(link=i[1]).exists():
            continue
        title = u'[!] %s' % i[0] if fresh_google_check(i[1]) else i[0]
        item_data = {
            'title': title,
            'link': i[1],
            'http_code': i[3],
            'resource': i[2],
        }
        data = (apply_parsing_rules(item_data, **kwargs)
                if kwargs.get('query_rules') else {})
        item_data.update(data)
        save_item(item_data)
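
# Hypothetical illustration (not part of the original module): judging by
# the indexing in import_tweets() above, get_tweets() is expected to yield
# 4-tuples of the form
#
#     (title, link, resource, http_code)
#
# e.g. ('Fresh Django release', 'https://example.com/post', resource, 200).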


def get_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res='rss',
                                                 in_edit=False):
        print('\n\n' + '=' * 25)
        print(' ' + src.name)
        print('=' * 25 + '\n')
        num = 0
        rssnews = feedparser.parse(src.link)
        for n in rssnews.entries:
            title = (u'[!] %s' % n.title
                     if fresh_google_check(n.title, debug=True) else n.title)
            http_code, content, _ = _get_http_data_of_url(n.link)
            item_data = {
                'title': title,
                'link': n.link,
                'http_code': http_code,
                'content': content,
                'description': n.summary,
                'resource': src.resource,
            }
            data = (apply_parsing_rules(item_data, **kwargs)
                    if kwargs.get('query_rules') else {})
            item_data.update(data)

            # Print whichever parsed attributes are present.
            parts = ['%s: %s' % (key, item_data[key])
                     for key in ('status', 'tags', 'section')
                     if key in item_data]
            print(' '.join(parts))

            # Report only entries that are not stored yet.
            try:
                Item.objects.get(link=item_data.get('link'))
            except Item.DoesNotExist:
                num += 1
                print('%d: Title: %s (%s)' %
                      (num, item_data.get('title'), item_data.get('link')))
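
# A hedged usage sketch (model and field names hypothetical): get_rss() and
# the import_* functions in this module only run apply_parsing_rules() when
# a `query_rules` kwarg is supplied, so a caller would look roughly like:
#
#     get_rss(query_rules=ParsingRules.objects.filter(is_activated=True))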


def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res='rss',
                                                 in_edit=False):
        rssnews = feedparser.parse(src.link)
        today = datetime.date.today()
        week_before = today - datetime.timedelta(weeks=1)
        for n in rssnews.entries:
            # Skip entries whose link is already stored.
            if Item.objects.filter(link=n.link).exists():
                continue

            # Skip entries published more than a week ago; entries without
            # a publication date pass through.
            time_struct = getattr(n, 'published_parsed', None)
            if time_struct:
                _timestamp = mktime(time_struct)
                dt = datetime.datetime.fromtimestamp(_timestamp)
                if dt.date() < week_before:
                    continue

            title = (u'[!] %s' % n.title
                     if fresh_google_check(n.title) else n.title)
            http_code, content = _get_http_data_of_url(n.link)
            item_data = {
                'title': title,
                'link': n.link,
                'http_code': http_code,
                'content': content,
                # Strip HTML tags from the feed summary.
                'description': re.sub('<.*?>', '', n.summary),
                'resource': src.resource,
                'language': src.language,
            }
            data = (apply_parsing_rules(item_data, **kwargs)
                    if kwargs.get('query_rules') else {})
            item_data.update(data)
            save_item(item_data)
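

# A standalone sketch (helper name hypothetical) of the one-week freshness
# window applied in import_rss() above, assuming feedparser's
# `published_parsed` field, a time.struct_time:

def _is_within_last_week(time_struct):
    """Return True if the struct_time is missing or at most a week old."""
    if time_struct is None:
        # Entries without a publication date are treated as fresh.
        return True
    published = datetime.datetime.fromtimestamp(mktime(time_struct)).date()
    return published >= datetime.date.today() - datetime.timedelta(weeks=1)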