示例#1
0
文件: home.py 项目: ICCV/chaos
 def parse(self, response):
     """Classify the links found on a crawled page and queue them for workers.

     Links whose URL is a known sub-index for this site are pushed onto the
     sub-index Redis queue; links matching one of the site's article URL
     patterns are pushed onto the top-level-articles queue and persisted as
     new Article documents (URLs already stored are skipped).
     """
     url = response.url
     links = self.extract_urls(response)
     subindex_urls = SiteRule.get_subindex_urls_by_url(url)
     patterns = Site.get_article_patterns(url)
     article_urls = []
     subindexes = []
     for link in links:
         if link.url in subindex_urls:
             subindexes.append(link.url)
         # any() stops at the first matching pattern, so a URL matching
         # several patterns is appended once (the original inner loop
         # appended it once per matching pattern).
         elif any(re.match(pattern, link.url.strip()) for pattern in patterns):
             article_urls.append(link.url)
     if subindexes:
         self.server.rpush(SUB_INDEX_WORKER_KEY, *set(subindexes))
     if article_urls:
         # Only queue/insert URLs that are not already stored as Articles.
         existing_urls = set(a.source_url
                             for a in Article.objects(source_url__in=article_urls))
         new_urls = set(article_urls) - existing_urls
         if new_urls:
             # new_urls is already a set; no need to re-wrap it.
             self.server.rpush(TOP_LEVEL_ARTICLES_WORKER_KEY, *new_urls)
             name = Site.get_name_by_url(url)
             Article.objects.insert([
                 Article(source_url=article_url, source='Home',
                         site_name=name, site_url=url,
                         category=['Top Stories'])
                 for article_url in new_urls])
示例#2
0
def main():
    sub_index_urls = set(['http://fitnhit.com/education/','http://fitnhit.com/entertainment/','http://fitnhit.com/health/','http://fitnhit.com/news/','http://fitnhit.com/technology/'])
    site_url = 'http://fitnhit.com/'
    article_patterns = ['^http://fitnhit.com/.*/\d{5}']
    title = 'FitNHit'
    for sub_index_url in sub_index_urls:
        siterule = SiteRule.objects.get(sub_index_url=sub_index_url)
        print siterule.title
        siterule.title = title
        siterule.site_url = site_url
        siterule.article_patterns = article_patterns
        siterule.save()
    rules = Site()
    rules.title = title
    rules.url = site_url
    rules.article_patterns = article_patterns
    rules.save()
    sites = Site.objects(title='Tribune')
    if sites:
        site = sites[0]
        site.title = 'The Express Tribune'
        site.save()
    sites = Site.objects(title='Tribune India')
    if sites:
        site = sites[0]
        site.title = 'The Tribune'
        site.save()
    articles = Article.objects(title='Tribune')
    for article in articles:
        article.title = 'The Express Tribune'
        article.save()
    articles = Article.objects(title='Tribune India')
    for article in articles:
        article.title = 'The Tribune'
        article.save()
示例#3
0
def main():
    Site.drop_collection()
    url_pattern = {}
    fin = open('scripts/article_pattern_precise.txt','r')
    for line in fin:
        title, pattern = line.strip().split('###')
        url_pattern[title] = pattern
    fin.close()
    fin = open('scripts/content_or_article.csv','r')
    for line in fin:
        title, url, _ = line.strip().split(',')
        if "http" not in url:
            url = 'http://%s' % url
        url = re.sub('/$', '', url)

        if title in url_pattern:
            article_patterns = url_pattern[title]
            site = Site()
            site.title = title
            site.url = url
            site.article_patterns = [article_patterns]
            site.save()
            print title, url
    fin.close()
示例#4
0
import os

import django

# Django settings must be configured (and setup() run) BEFORE importing
# any model module below. 'os' was used here without a visible import.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "KNIGHT.settings")
django.setup()
from crawler.models import Site, Content, Image, Url, Html
from django.utils import timezone
import json

if __name__ == '__main__':

    with open(
            '/workspace/django_darkknight/json_datas/highkorea/2018-10-20/1_hk_181020.json',
            'r') as f:
        data = json.loads(f.read())
        instance = Site(name='highkorea',
                        stem='http://highkorea5ou4wcy.onion',
                        on_off=True,
                        crawl_end=timezone.localtime())
        instance.save()

        for forum in data:
            url = list(forum.keys())[0]
            uinstance = Url(url=url, site=instance.id)
            uinstance.save()
            data = forum[url]
            htmls = data['html']
            images = data['image']
            articles = data['content']
            for article in articles:
                author = article['author']
                content = article['content']
                cinstance = Content(author=author,
示例#5
0
文件: home.py 项目: ICCV/chaos
 def parse(self, response):
     """Seed the homepage worker queue, then re-schedule this same page."""
     # One RPUSH call enqueues every known site homepage URL.
     entries = Site.get_urls()
     self.server.rpush(HOMEPAGE_WORKER_KEY, *entries)
     # Re-request the same URL with dont_filter=True so Scrapy's duplicate
     # filter does not stop the spider from polling the homepages forever.
     yield Request(response.url, callback=self.parse, dont_filter=True)