def main():
    """Apply one-off title/URL fixes to stored site rules, sites and articles.

    Steps, in order:

    1. For each FitNHit sub-index URL, fetch the matching ``SiteRule``, print
       its current title, then overwrite its ``title``, ``site_url`` and
       ``article_patterns`` and save it.
    2. Create and save a new ``Site`` record for FitNHit.
    3. Rename the first ``Site`` titled 'Tribune' to 'The Express Tribune' and
       the first one titled 'Tribune India' to 'The Tribune'.
    4. Rename every ``Article`` carrying those titles the same way.

    NOTE(review): ``SiteRule``, ``Site`` and ``Article`` are defined elsewhere
    in the project (presumably ORM/ODM document classes -- confirm).
    ``SiteRule.objects.get`` presumably raises when no rule matches, which
    would abort the script part-way through -- verify against the data.
    NOTE(review): statement nesting was reconstructed from a source whose
    indentation had been flattened; confirm it matches the intended script.
    """
    sub_index_urls = {
        'http://fitnhit.com/education/',
        'http://fitnhit.com/entertainment/',
        'http://fitnhit.com/health/',
        'http://fitnhit.com/news/',
        'http://fitnhit.com/technology/',
    }
    site_url = 'http://fitnhit.com/'
    # Raw string: same runtime value as before, but '\d' is no longer an
    # invalid escape sequence inside a normal string literal.
    article_patterns = [r'^http://fitnhit.com/.*/\d{5}']
    title = 'FitNHit'

    for sub_index_url in sub_index_urls:
        siterule = SiteRule.objects.get(sub_index_url=sub_index_url)
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print(siterule.title)
        siterule.title = title
        siterule.site_url = site_url
        siterule.article_patterns = article_patterns
        siterule.save()

    rules = Site()
    rules.title = title
    rules.url = site_url
    rules.article_patterns = article_patterns
    rules.save()

    # (old title, new title) pairs, applied in this order.
    renames = (
        ('Tribune', 'The Express Tribune'),
        ('Tribune India', 'The Tribune'),
    )

    # Sites first: only the first match for each old title is renamed.
    for old_title, new_title in renames:
        sites = Site.objects(title=old_title)
        if sites:
            site = sites[0]
            site.title = new_title
            site.save()

    # Then articles: every match for each old title is renamed.
    for old_title, new_title in renames:
        for article in Article.objects(title=old_title):
            article.title = new_title
            article.save()
def main():
    """Rebuild the ``Site`` collection from the two files under ``scripts/``.

    The existing ``Site`` collection is dropped first. Then:

    * ``scripts/article_pattern_precise.txt`` is read as ``title###pattern``
      lines into a title -> pattern mapping.
    * ``scripts/content_or_article.csv`` is read as three comma-separated
      fields per line (title, url, and a third field that is ignored). A URL
      that does not contain "http" gets an ``http://`` prefix, and a single
      trailing slash is removed. A ``Site`` is saved (and its title and URL
      printed) only for titles that have an entry in the pattern mapping.

    Blank lines in either file are skipped. Any other line that does not have
    the expected number of fields still raises ``ValueError``.

    NOTE(review): ``Site`` is defined elsewhere in the project (presumably an
    ODM document class -- confirm). The collection is dropped before the
    input files are opened, so a missing file leaves it empty.
    NOTE(review): statement nesting was reconstructed from a source whose
    indentation had been flattened; in particular, whether the final print
    belongs inside the ``if title in url_pattern`` branch should be confirmed.
    """
    Site.drop_collection()

    url_pattern = {}
    # 'with' guarantees the handle is closed even if a line fails to parse.
    with open('scripts/article_pattern_precise.txt', 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            title, pattern = line.split('###')
            url_pattern[title] = pattern

    with open('scripts/content_or_article.csv', 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            title, url, _ = line.split(',')
            # Substring test kept as-is: any URL containing "http" anywhere
            # is treated as already having a scheme.
            if "http" not in url:
                url = 'http://%s' % url
            # Drop one trailing slash so stored URLs have a uniform form.
            url = re.sub('/$', '', url)
            if title in url_pattern:
                site = Site()
                site.title = title
                site.url = url
                site.article_patterns = [url_pattern[title]]
                site.save()
                # Single pre-formatted string prints the same on Python 2
                # and 3 as the original "print title, url".
                print('%s %s' % (title, url))