def parse(self, response):
    url = response.url
    links = self.extract_urls(response)
    article_urls = []
    subindexes = []
    subindex_urls = SiteRule.get_subindex_urls_by_url(url)
    patterns = Site.get_article_patterns(url)
    # Classify every extracted link as either a sub-index page or an article URL.
    for link in links:
        if link.url in subindex_urls:
            subindexes.append(link.url)
        else:
            for pattern in patterns:
                if re.match(pattern, link.url.strip()):
                    article_urls.append(link.url)
    if subindexes:
        self.server.rpush(SUB_INDEX_WORKER_KEY, *set(subindexes))
    if article_urls:
        # Queue and persist only the article URLs we have not seen before.
        articles = Article.objects(source_url__in=article_urls)
        existing_article_urls = [article.source_url for article in articles]
        new_urls = set(article_urls) - set(existing_article_urls)
        if new_urls:
            self.server.rpush(TOP_LEVEL_ARTICLES_WORKER_KEY, *new_urls)
            name = Site.get_name_by_url(url)
            Article.objects.insert([
                Article(source_url=article_url, source='Home', site_name=name,
                        site_url=url, category=['Top Stories'])
                for article_url in new_urls
            ])
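# Illustration only (not part of the spider): a minimal sketch of how
# re.match with a stored article pattern classifies a link. The pattern is
# the FitNHit rule set up by the migration script below; the sample URLs
# are hypothetical.
import re

FITNHIT_ARTICLE_PATTERN = r'^http://fitnhit.com/.*/\d{5}'

assert re.match(FITNHIT_ARTICLE_PATTERN, 'http://fitnhit.com/health/12345-sample-story')
assert re.match(FITNHIT_ARTICLE_PATTERN, 'http://fitnhit.com/health/') is None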
def main():
    sub_index_urls = set([
        'http://fitnhit.com/education/',
        'http://fitnhit.com/entertainment/',
        'http://fitnhit.com/health/',
        'http://fitnhit.com/news/',
        'http://fitnhit.com/technology/',
    ])
    site_url = 'http://fitnhit.com/'
    article_patterns = [r'^http://fitnhit.com/.*/\d{5}']
    title = 'FitNHit'

    for sub_index_url in sub_index_urls:
        siterule = SiteRule.objects.get(sub_index_url=sub_index_url)
        print siterule.title
        siterule.title = title
        siterule.site_url = site_url
        siterule.article_patterns = article_patterns
        siterule.save()

    rules = Site()
    rules.title = title
    rules.url = site_url
    rules.article_patterns = article_patterns
    rules.save()

    sites = Site.objects(title='Tribune')
    if sites:
        site = sites[0]
        site.title = 'The Express Tribune'
        site.save()

    sites = Site.objects(title='Tribune India')
    if sites:
        site = sites[0]
        site.title = 'The Tribune'
        site.save()

    articles = Article.objects(title='Tribune')
    for article in articles:
        article.title = 'The Express Tribune'
        article.save()

    articles = Article.objects(title='Tribune India')
    for article in articles:
        article.title = 'The Tribune'
        article.save()
def main():
    Site.drop_collection()

    url_pattern = {}
    fin = open('scripts/article_pattern_precise.txt', 'r')
    for line in fin:
        title, pattern = line.strip().split('###')
        url_pattern[title] = pattern
    fin.close()

    fin = open('scripts/content_or_article.csv', 'r')
    for line in fin:
        title, url, _ = line.strip().split(',')
        if "http" not in url:
            url = 'http://%s' % url
        url = re.sub('/$', '', url)
        if title in url_pattern:
            article_patterns = url_pattern[title]
            site = Site()
            site.title = title
            site.url = url
            site.article_patterns = [article_patterns]
            site.save()
            print title, url
    fin.close()
import os

import django

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "KNIGHT.settings")
django.setup()

from crawler.models import Site, Content, Image, Url, Html
from django.utils import timezone
import json

if __name__ == '__main__':
    with open(
            '/workspace/django_darkknight/json_datas/highkorea/2018-10-20/1_hk_181020.json',
            'r') as f:
        data = json.loads(f.read())

    # Register the crawl run for the highkorea site.
    instance = Site(name='highkorea', stem='http://highkorea5ou4wcy.onion',
                    on_off=True, crawl_end=timezone.localtime())
    instance.save()

    for forum in data:
        url = list(forum.keys())[0]
        uinstance = Url(url=url, site=instance.id)
        uinstance.save()
        data = forum[url]
        htmls = data['html']
        images = data['image']
        articles = data['content']
        for article in articles:
            author = article['author']
            content = article['content']
            cinstance = Content(author=author,
def parse(self, response):
    # Seed the Redis queue with every configured homepage, then re-issue the
    # same request so the homepages keep getting re-queued on each pass.
    home_entries = Site.get_urls()
    self.server.rpush(HOMEPAGE_WORKER_KEY, *home_entries)
    yield Request(response.url, callback=self.parse, dont_filter=True)
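# Illustration only (an assumption, not code from this repo): a separate
# worker process could drain the list pushed above with redis-py's blocking
# pop. The key value below is hypothetical; the real HOMEPAGE_WORKER_KEY is
# defined in the crawler's settings.
import redis

HOMEPAGE_WORKER_KEY = 'homepage_worker:start_urls'  # hypothetical value for this sketch

server = redis.StrictRedis()
_key, homepage_url = server.blpop(HOMEPAGE_WORKER_KEY)  # blocks until a URL is queued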