示例#1
0
def main():
    """Register the FitNHit site and normalise the Tribune titles.

    Steps, in order:
      1. For every FitNHit section index URL, fetch its SiteRule, print the
         rule's current title, then overwrite its title, site URL and
         article patterns and save it.
      2. Create and save a new Site record for FitNHit.
      3. Retitle the first Site named 'Tribune' / 'Tribune India'.
      4. Retitle every Article named 'Tribune' / 'Tribune India'.
    """
    title = 'FitNHit'
    site_url = 'http://fitnhit.com/'
    article_patterns = [r'^http://fitnhit.com/.*/\d{5}']
    sub_index_urls = set([
        'http://fitnhit.com/education/',
        'http://fitnhit.com/entertainment/',
        'http://fitnhit.com/health/',
        'http://fitnhit.com/news/',
        'http://fitnhit.com/technology/',
    ])

    # Point each existing section rule at the shared FitNHit settings.
    for index_url in sub_index_urls:
        rule = SiteRule.objects.get(sub_index_url=index_url)
        print(rule.title)
        rule.title = title
        rule.site_url = site_url
        rule.article_patterns = article_patterns
        rule.save()

    # Add the site-level record.
    fitnhit = Site()
    fitnhit.title = title
    fitnhit.url = site_url
    fitnhit.article_patterns = article_patterns
    fitnhit.save()

    # (old title, new title) pairs, applied to sites first and then articles.
    renames = (
        ('Tribune', 'The Express Tribune'),
        ('Tribune India', 'The Tribune'),
    )

    # Only the first matching Site is renamed for each old title.
    for old_title, new_title in renames:
        matches = Site.objects(title=old_title)
        if matches:
            first = matches[0]
            first.title = new_title
            first.save()

    # Every matching Article is renamed.
    for old_title, new_title in renames:
        for article in Article.objects(title=old_title):
            article.title = new_title
            article.save()
示例#2
0
def main():
    """Rebuild the Site collection from the two script data files.

    Reads 'scripts/article_pattern_precise.txt' (lines of
    ``title###pattern``) into a title -> pattern map, then reads
    'scripts/content_or_article.csv' (lines of ``title,url,<ignored>``).
    For every CSV row whose title has a known pattern, a Site is saved with
    that title, the normalised URL and a one-element article pattern list,
    and ``title url`` is printed.

    Both input files are parsed completely before the existing collection
    is dropped, so an unreadable or malformed file no longer leaves the
    collection empty or half-filled.

    Raises:
        IOError: if either input file cannot be opened.
        ValueError: if a non-blank line does not have the expected number
            of fields.
    """
    # title -> article URL pattern
    url_pattern = {}
    with open('scripts/article_pattern_precise.txt', 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            title, pattern = line.split('###')
            url_pattern[title] = pattern

    # (title, url) pairs to insert, in file order.
    pending = []
    with open('scripts/content_or_article.csv', 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            title, url, _ = line.split(',')
            if "http" not in url:
                url = 'http://%s' % url
            # Drop a single trailing slash so URLs are stored uniformly.
            url = re.sub('/$', '', url)

            if title in url_pattern:
                pending.append((title, url))

    # Destructive step happens only once all input has parsed cleanly.
    Site.drop_collection()
    for title, url in pending:
        site = Site()
        site.title = title
        site.url = url
        site.article_patterns = [url_pattern[title]]
        site.save()
        print('%s %s' % (title, url))