def _get_zgqnb_article(self, url, date, page_name):
    """Scrape every article linked from a ZGQNB page-index URL and upsert it.

    :param url: page-index URL whose ``div#titleList`` holds the article links
    :param date: publication date stored on each article
    :param page_name: page label stored on each article
    """
    print(url)
    medium = Medium.objects.get(pk=1836)
    links = bs4.BeautifulSoup(requests.get(url).content).find(
        'div', {'id': 'titleList'}).find_all('a')
    for a in links:
        article_url = urljoin(url, a.get('href'))
        soup = bs4.BeautifulSoup(requests.get(article_url).content)
        title = soup.find('h1').text
        print(title)
        # Upsert keyed on (medium, url): previously a fully-populated Article
        # was built and then thrown away whenever a duplicate existed, and the
        # existing row only had its content refreshed (title/date/page went
        # stale). Fetch-or-create first, then set every field, matching the
        # pattern used by the other paper scrapers in this module.
        try:
            article = Article.objects.filter(medium=medium).get(url=article_url)
        except Article.DoesNotExist:
            article = Article()
            article.medium = medium
            article.url = article_url
        article.title = title
        article.publication_date = date
        article.page = page_name
        # Body paragraphs live under div#ozoom.
        article.content = '\n'.join(
            p.text for p in soup.find('div', {'id': 'ozoom'}).find_all('p'))
        article.save()
def _get_nfzm_article(self, url, date, issue):
    """Scrape one Southern Weekly (nfzm) article and return it unsaved.

    :param url: article URL
    :param date: publication date stored on the article
    :param issue: issue identifier stored on the article
    :returns: unsaved :class:`Article` with title, content and author set
    """
    medium = Medium.objects.get(pk=951)
    article = Article()
    article.medium = medium
    article.issue = issue
    article.url = url
    article.publication_date = date
    # NOTE(review): hard-coded session cookie — it will expire; move it to
    # settings or authenticate per run instead.
    r = requests.get(url, cookies={'PHPSESSID': 'l19dgbf6ticijmo9ka9osvufk0'})
    content = bs4.BeautifulSoup(r.content)
    article.title = content.title.string.split('-')[-1].strip()
    article.content = content.find('section', {'id': 'articleContent'}).text
    # The byline <span class="author"> carries <em> tags: index 1 is the role
    # label, index 2 the person's name. Renamed from ``author`` to avoid
    # shadowing the Journalist instance below.
    byline = content.find('span', {'class': 'author'}).find_all('em')
    if byline[1].text.find(u'南方周末记者') != -1:
        # Staff reporter: link (or create) a Journalist record.
        journalist, created = Journalist.objects.get_or_create(
            medium=medium, name=byline[2].text.strip())
        # BUG FIX: the author used to be attached only ``if not created``,
        # so a freshly created Journalist was never linked. Attach it
        # unconditionally.
        article.author = journalist
    elif byline[1].text.find(u'南方周末特约撰稿') != -1:
        # Contributing writer: store the free-text name only.
        article.author_name = byline[2].text.strip()
    elif byline[1].text.find(u'南方周末编辑部') != -1:
        article.author_name = u'南方周末编辑部'
    print(article.author or article.author_name)
    return article
def _get_whb_article(self, url, date, issue, page):
    """Scrape every Wenhui Bao article linked from a page-index URL.

    Articles are keyed by (medium, url); title and body are refreshed on
    each run, dead links (HTTP 404) are skipped.
    """
    medium = Medium.objects.get(pk=1399)
    index_soup = bs4.BeautifulSoup(requests.get(url).content)
    for link in index_soup.find('div', {'id': 'BT'}).find_all('a'):
        article_page_url = urljoin(url, link.get('href'))
        response = requests.get(article_page_url)
        if response.status_code == 404:
            # Broken link on the index page — skip it.
            continue
        article_page = bs4.BeautifulSoup(response.content)
        existing = Article.objects.filter(medium=medium).filter(url=article_page_url)
        if existing.count():
            article = existing.get(url=article_page_url)
        else:
            article = Article()
            article.medium = medium
            article.url = article_page_url
            article.publication_date = date
            article.page = page.text.strip()
            article.issue = issue
        print(article_page_url)
        # Strip the site prefix from the <title> to get the headline.
        article.title = article_page.title.text.strip().replace(u'文汇报 - ', '')
        article.content = article_page.find(
            'div', {'id': 'articleText'}).text.strip().replace(u' ', '\n ')
        article.save()
def addArticle(request):
    """Create an Article from POST data and redirect to the article list.

    On any save failure the user is redirected to '/' instead.
    """
    article = Article()
    article.title = request.POST.get('title', '')
    article.abstract = request.POST.get('abstract', '')
    # No default here: a missing 'categorie' key yields None and save()
    # will fail below — TODO confirm the form always posts this field.
    article.category_id = request.POST.get('categorie')
    article.content = request.POST.get('content', '')
    article.author = User.objects.get(id=request.user.id)
    try:
        article.save()
        return HttpResponseRedirect('/articles/read')
    except Exception:
        # Was a bare ``except:`` — narrowed so SystemExit/KeyboardInterrupt
        # are no longer swallowed; any save error still sends the user home.
        return HttpResponseRedirect('/')
def add_article(request):
    """Show the article form, or create an article from a valid POST.

    Newlines in the submitted content become ``<br>``; on success the
    index view is returned directly.
    """
    if request.method != 'POST':
        # Plain GET: render an unbound form.
        return render(request, 'add_article.html', {'form': ArticleForm()})
    form = ArticleForm(request.POST)
    if form.is_valid():
        article = Article()
        article.create_user = request.user
        article.title = form.cleaned_data.get('title')
        article.content = form.cleaned_data.get('content').replace('\n', '<br>')
        article.add_node(form.cleaned_data.get('node'))
        article.save()
        return index(request)
    # Invalid POST: re-render with the bound form so errors are shown.
    return render(request, 'add_article.html', {'form': form})
def add_article(request):
    """Handle the add-article form: GET shows it, a valid POST creates one."""
    is_post = request.method == 'POST'
    form = ArticleForm(request.POST) if is_post else ArticleForm()
    if is_post and form.is_valid():
        cleaned = form.cleaned_data
        new_post = Article()
        new_post.create_user = request.user
        new_post.title = cleaned.get('title')
        # Preserve line breaks when the content is rendered as HTML.
        new_post.content = cleaned.get('content').replace('\n', '<br>')
        new_post.add_node(cleaned.get('node'))
        new_post.save()
        return index(request)
    # GET, or a POST that failed validation (form then carries the errors).
    return render(request, 'add_article.html', {'form': form})
def GenerateOneFakeData():
    """Build (without saving) one Article populated with random fake data.

    Relies on module-level ``users``/``categories`` querysets and the
    ``*_str`` sample-text globals.
    """
    art = Article()
    art.title = get_random_str(name_str, 12)
    art.brief = get_random_str(brief_str)
    art.content = desc_str
    # Same randint call order as before: click, favor, comment counters.
    for counter_field in ('click_number', 'favor_number', 'comment_number'):
        setattr(art, counter_field, randint(10, 100))
    art.word_count = randint(100, 1000)
    art.front_image = "articles/images/art.png"
    # Attribute the article to a random existing user ...
    art.user_id = users[randint(0, users.count() - 1)].id
    # ... and to a random existing category.
    art.category_id = categories[randint(0, categories.count() - 1)].id
    # Tag assignment intentionally left disabled:
    # art.tags.set(ArticleTag(name=tags[randint(0,tags.count()-1)].name))
    return art
def write(request):
    """Render the write form, or create an article from a valid POST.

    The posted status is honoured only when it is PUBLISHED or DRAFT;
    tags are attached after the initial save (the article needs a pk).
    """
    if request.method != 'POST':
        return render(request, 'articles/write.html', {'form': ArticleForm()})
    form = ArticleForm(request.POST)
    if not form.is_valid():
        # Bound form re-rendered so validation errors are visible.
        return render(request, 'articles/write.html', {'form': form})
    data = form.cleaned_data
    article = Article()
    article.create_user = request.user
    article.title = data.get('title')
    article.content = data.get('content')
    status = data.get('status')
    if status in (Article.PUBLISHED, Article.DRAFT):
        article.status = data.get('status')
    article.save()
    article.create_tags(data.get('tags'))
    return redirect('/articles/')
def _get_qlwb_article(self, url, date, issue, page):
    """Scrape one Qilu Evening News article page and upsert it by URL."""
    print(page)
    medium = Medium.objects.get(pk=1025)
    soup = bs4.BeautifulSoup(requests.get(url).content)
    matches = Article.objects.filter(medium=medium).filter(url=url)
    if matches.count():
        article = matches.get(url=url)
    else:
        article = Article()
        article.medium = medium
    # Headline sits in the td.font01 cell.
    article.title = soup.find('td', {'class': 'font01'}).text.strip().replace(u' ', '\n ')
    article.url = url
    article.publication_date = date
    article.page = page
    article.issue = issue
    article.content = soup.find('span', {'id': 'contenttext'}).text.strip().replace(u' ', '\n ')
    article.save()
def handle(self, *args, **options):
    """Import published WordPress posts from an XML export into Articles.

    Parses the file named by ``filename``, keeps every <item> whose
    wp:status is "publish", then creates one Article (status_id=2) per
    post and attaches its categories as Tags. Titles of posts that fail
    to save are collected and printed at the end.
    """
    filename = options.get('filename')
    dom = minidom.parse(filename)
    blog = []  # list of post dicts parsed from the export
    for node in dom.getElementsByTagName('item'):
        status = node.getElementsByTagName('wp:status')[0].firstChild.data
        if status != "publish":
            continue  # skip drafts, pending, trashed posts
        post = dict()
        post["title"] = node.getElementsByTagName('title')[0].firstChild.data
        post["date"] = node.getElementsByTagName('pubDate')[0].firstChild.data
        post["author"] = node.getElementsByTagName(
            'dc:creator')[0].firstChild.data
        post["id"] = node.getElementsByTagName('wp:post_id')[0].firstChild.data
        # Empty posts have no text node under content:encoded.
        encoded = node.getElementsByTagName('content:encoded')[0].firstChild
        post["text"] = encoded.data if encoded is not None else ""
        # wp:attachment_url could be used to download attachments
        post['author_id'] = AUTHORS[post['author']]
        print(post['author_id'])
        nicenames = [subnode.getAttribute('nicename')
                     for subnode in node.getElementsByTagName('category')]
        # Compute the non-empty category list once (was filtered twice).
        categories = [x for x in nicenames if x != '']
        post['tags'] = " ".join(categories)
        post["categories"] = categories
        blog.append(post)

    from articles.models import Article, Tag
    import datetime

    errors = []
    for entry in blog:
        article = Article()
        article.title = entry['title']
        article.content = entry['text']
        # pubDate looks like "Mon, 15 Aug 2011 ..." — slice day/month/year;
        # MESES maps the English month abbreviation to its number.
        dia = entry['date'][4:7]
        mes = MESES[entry['date'][8:11]]
        anio = entry['date'][12:16]
        print(entry['date'])
        print(anio)
        article.publish_date = datetime.datetime(int(anio), mes, int(dia))
        # author_id used to be assigned twice; once is enough.
        article.author_id = entry['author_id']
        article.status_id = 2
        try:
            article.save()
        except Exception:
            # Best-effort import (was a bare ``except:``): remember the
            # failing title and keep going.
            errors.append(entry['title'])
        else:
            new_tags = [Tag.objects.get_or_create(name=tag)[0]
                        for tag in entry['categories']]
            article.tags.add(*new_tags)
    print(len(errors))
    print(errors)
def publish(f, draft, by=None, publish=None, is_active=True, login_required=False, debug=False):
    """
    Publishes an article.

    :param f: The file to parse
    :type f: file
    :param draft: Save as draft?
    :type draft: bool
    :param by: Author username
    :type by: str
    :param publish: When to publish
    :type publish: datetime fmt=YYYY-MM-DD HH:MM
    :param is_active: Article active?
    :type is_active: bool
    :param login_required: Require login?
    :type login_required: bool
    :param debug: Print debug data
    :type debug: bool
    :returns: Saved :class:`Article`
    """
    meta, content = parse_meta_and_article(f.read())
    meta.update({
        'is_active': is_active,
        'login_required': login_required,
    })
    if draft:
        meta['status'] = 'Draft'
    if publish:
        meta['publish'] = publish
    if by:
        meta['by'] = by

    slug = slugify(meta['title'])
    # New or updated?
    articles = Article.objects.using(DB).filter(slug=slug)
    if len(articles) > 1:
        if not publish:
            raise ConfigurationError('Title ambiguous; supply publish date')
        # BUG FIX: QuerySet.filter returns a *new* queryset; the filtered
        # result used to be discarded, so articles[0] came from the
        # unfiltered set and the publish-date disambiguation did nothing.
        articles = articles.filter(publish_date=meta['publish'])
        article = articles[0]
    elif len(articles) == 1:
        article = articles[0]
    else:
        article = Article()

    article.content = content
    article.markup = 'h'
    todo = []  # (key, value) pairs that need the article saved first
    keys = list(meta.keys())
    # Supplied keys first, then any required/defaulted key not supplied.
    # list(...) around .keys() keeps the concatenation valid on Python 3.
    for key in keys + [k for k in REQUIRED_FIELDS + list(FIELD_DEFAULTS.keys())
                       if k not in keys]:
        value = filter_field(key, meta)
        if key not in SAVE_NEEDED:
            setfield(article, key, value, debug)
        else:
            todo.append((key, value))
    article.save(using=DB)
    # Second pass for fields (e.g. relations) that require a primary key.
    for key, value in todo:
        setfield(article, key, value, debug)
    article.save(using=DB)
    return article