def test_keywordless_articles(self):
    """Tests that keywordless articles are put into a separate grouping."""
    articles = [models.Article(url="example.com", keywords=[]),
                models.Article(url="test.com", keywords=[])]
    articles.extend(test_utils.SIMILAR_ARTICLES)
    groups = classifier.group_articles(articles)
    self.assertEqual(2, len(groups))
    for group in groups:
        if len(group.get_articles()[0].get_keywords()) == 0:
            self.assertTrue(group.in_database())
def create_article(article_text):
    try:
        temp_article = models.Article(article_text['record_lens_id'],
                                      article_text['title'],
                                      article_text['authors'],
                                      str(int(article_text['volume'])))
    except Exception:
        # Fall back to a sentinel volume of '-1' when the volume is missing
        # or cannot be parsed as an integer.
        temp_article = models.Article(article_text['record_lens_id'],
                                      article_text['title'],
                                      article_text['authors'],
                                      '-1')
    return temp_article
def find_unprocessed_articles(self, cap=10000):
    """Obtains a list of unprocessed articles and determines their author data."""
    done = []
    with self.connection.db.cursor() as cursor:
        cursor.execute(
            'SELECT DISTINCT(article) FROM prod.article_authors ORDER BY article;')
        for record in cursor:
            if len(record) > 0:
                done.append(record[0])
    self.log.record(f'Found {len(done)} done already')

    todo = []
    with self.PROD.db.cursor() as cursor:
        cursor.execute(
            'SELECT id, url FROM prod.articles WHERE url IS NOT NULL ORDER BY id;')
        for record in cursor:
            if len(record) > 0:
                if record[0] not in done:
                    todo.append(models.Article(record[0], record[1]))
                    if len(todo) >= cap:
                        self.log.record(
                            f'Found max of {cap} entries to do; returning')
                        return todo
            else:
                self.log.record('Empty entry', 'error')
    self.log.record(f'GOT {len(todo)} TO DO')
    return todo
def create_article(db: Session, minio: Minio, article: schemas.ArticleCreate):
    minio_file = '%s/%s.txt' % (str(article.news_source_id), str(uuid.uuid4()))

    # store article body in s3
    minio_service.store_string(minio, body=article.article_body,
                               minio_file=minio_file)

    # store article in db
    db_article = models.Article(
        site_article_id=article.site_article_id,
        title=article.title,
        body_file_path=minio_file,
        url=article.url,
        category=article.category,
        published=article.published,
        article_last_updated=article.article_last_updated,
        news_source_id=article.news_source_id)
    db.add(db_article)
    db.commit()
    db.refresh(db_article)

    db_article.article_body = minio_service.get_string(
        minio, minio_file=db_article.body_file_path)
    return db_article
def new_article(request):
    categorys = models.Category.objects.all()
    if request.method == 'POST':
        print(request.POST)
        form = ArticleForm(request.POST, request.FILES)
        if form.is_valid():
            print("form is valid")
            print(request.FILES)
            data = form.cleaned_data
            del data['head_img']
            uploaded_filename = handle_uploaded_file(request, request.FILES['head_img'])
            data['author_id'] = request.user.userprofile.id
            try:
                new_article_obj = models.Article(**data)
                new_article_obj.head_img = uploaded_filename
                new_article_obj.save()
            except Exception as e:
                return HttpResponse(e)
            return render(request, 'create_article.html',
                          {'new_article_obj': new_article_obj})
        else:
            print(form.errors)
            return render(request, 'create_article.html',
                          {'categorys': categorys, 'form': form})
    return render(request, 'create_article.html', {'categorys': categorys})
def post_article():
    """
    URL - /api/v1.0/article
    Method - POST
    Creates a new article from a URL and returns a dictionary that represents it.
    """
    post_json = request.get_json()
    if not post_json or 'url' not in post_json:
        abort(400)

    url = post_json['url']

    # Check if the article is already in the database.
    query = models.Article.query.filter_by(url=url).first()
    if query:
        return jsonify(query.dictionary()), 201

    # If not in the DB, fetch and parse the article from the web.
    parsedArticle = ParsedArticle(url)
    article = models.Article(
        url=url,
        title=parsedArticle.get_title(),
        content=parsedArticle.get_content(),
        author=parsedArticle.get_author(),
        excerpt=parsedArticle.get_excerpt(),
        date=parsedArticle.get_date(),
        dek=parsedArticle.get_dek(),
        lead_image=parsedArticle.get_lead_image(),
    )
    db.session.add(article)
    db.session.commit()
    return jsonify(article.dictionary()), 201
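# Hedged usage sketch (not part of the original code): one way to exercise the
# post_article endpoint above with the `requests` library, assuming the Flask app
# is running locally on port 5000 and that /api/v1.0/article routes to post_article().
# The host, port, and example URL are illustrative assumptions.
import requests

response = requests.post(
    "http://localhost:5000/api/v1.0/article",
    json={"url": "https://example.com/some-story"},
)
print(response.status_code)  # 201 whether the article was just created or already stored
print(response.json())       # dictionary representation of the Article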
def _add_standalone_article(raw_url, added_by):
    url = raw_url.split('?')[0]  # For if user copy-pastes from news site
    url = prepend_http(url)
    url = url.strip('/')
    url = url.strip('<>')
    url = url.strip()
    # This is a hack to deal with unicode passed in the URL.
    # Otherwise gives an error, since our table character set is latin1.
    # (Why not encode the table as unicode?)
    url = url.encode('ascii', 'ignore')
    decoded_url = decode_scheme_colon(url)
    try:
        try:
            article = StandaloneArticle.objects.get(url=decoded_url)
        except StandaloneArticle.DoesNotExist:
            article = StandaloneArticle.objects.get(url=swap_http_https(decoded_url))
    except StandaloneArticle.DoesNotExist:
        article = StandaloneArticle(url=decoded_url, added_by=added_by)
        article.save()
    # Trigger a scraper call
    try:
        a = models.Article.objects.get(url=article.url)
    except models.Article.DoesNotExist:
        a = models.Article(url=decoded_url, git_dir=get_and_make_git_repo())
        a.save()
    return a
async def createNewArticle(name: str,
                           project: models.Project = Depends(getProject),
                           db: Session = Depends(get_db)):
    new_article = models.Article(name=name)
    db.add(new_article)
    project.articles.append(new_article)
    db.commit()
    return new_article
def create(self):
    self.data = self.parse_data()
    bbs_obj = models.Article(**self.data)
    bbs_obj.save()
    file_name = handle_upload_file(self.request, self.request.FILES["head_img"])
    bbs_obj.head_img = "imgs/upload/%s" % file_name
    bbs_obj.save()
    return bbs_obj
def add_art(req):
    errs = ''
    if req.method == "POST":
        # print(req.POST)
        form = ArticleForm(req.POST, req.FILES)
        if form.is_valid():
            # print("--form data:", form.cleaned_data)
            form_data = form.cleaned_data
            form_data['author_id'] = req.user.userprofile.id

            # jieba: automatically extract keywords from the title.
            textrank = analyse.textrank
            keywords = textrank(form_data['title'])

            # Join the first three keywords into a comma-separated string.
            arr = []
            n = 0
            for s in keywords:
                arr.append(s)
                strs = ','.join(arr)
                form_data['keywords'] = strs
                # Save each keyword to the tags table:
                # check whether the tag already exists in the database.
                try:
                    have_tag = models.Tags.objects.get(tagname=s)
                    num = int(have_tag.num) + 1
                    models.Tags.objects.filter(tagname=s).update(num=num)
                except models.Tags.DoesNotExist:
                    b = models.Tags(tagname=s, num=1)
                    b.save()
                n = n + 1
                if n == 3:
                    break

            # Build the article description from the first 200 characters of content.
            description = form_data['content']
            form_data['description'] = mvhtml.strip_tags(description[0:200])

            new_article_obj = models.Article(**form_data)
            new_article_obj.save()
            return render(req, 'addarticle.html')
        else:
            # print('err:', form.errors)
            errs = form.errors
    if req.user.userprofile.id:
        parent_category = models.Category.objects.filter(parent_category_id=None)
        category = models.Category.objects.all()
        return render(req, 'addarticle.html', {
            'parent_category': parent_category,
            'category': category,
            'errs': errs
        })
def create(self):
    self.data = self.parse_data()
    bbs_obj = models.Article(**self.data)
    # print bbs_obj
    # bbs_obj.save()
    filename = handle_upload_file(self.request, self.request.FILES['head_img'])
    bbs_obj.head_img = 'static/imgs/upload/%s' % filename
    bbs_obj.save()
    return bbs_obj
def get_ungrouped_articles():
    """Gets the items in the database and puts them into Article objects."""
    with database_utils.DatabaseConnection() as (connection, cursor):
        cursor.execute("SELECT name, link, article_text FROM article "
                       "WHERE article_text != '' AND topic_id IS NULL;")
        articles = []
        for item in cursor.fetchall():
            name, url, article_text = item
            articles.append(
                models.Article(url=url, title=name, text=article_text))
        return articles
def add_article():
    """Add a new post to the database."""
    new_article = models.Article(request.form['name'], request.form['body'])
    db.session.add(new_article)
    db.session.commit()
    flash('New entry was successfully created')
    return redirect(url_for('index'))
def get_articles(source=None, distance=0):
    articles = []
    rx = re.compile((r'^https?://(?:[^/]*\.)%s/' % source) if source else '')
    pagelength = datetime.timedelta(days=1)
    end_date = datetime.datetime.now() - distance * pagelength
    start_date = end_date - pagelength
    print 'Asking query'
    version_query = '''SELECT version.id, version.article_id, version.v,
           version.title, version.byline, version.date, version.boring,
           version.diff_json, T.age as age,
           Articles.url as a_url, Articles.initial_date as a_initial_date,
           Articles.last_update as a_last_update, Articles.last_check as a_last_check
    FROM version,
         (SELECT Articles.id as article_id, MAX(T3.date) AS age, COUNT(T3.id) AS num_vs
          FROM Articles
          LEFT OUTER JOIN version T3 ON (Articles.id = T3.article_id)
          WHERE (T3.boring = 0)
          GROUP BY Articles.id
          HAVING (age > %s AND age < %s AND num_vs > 1)) T,
         Articles
    WHERE (version.article_id = Articles.id)
      AND (version.article_id = T.article_id)
      AND NOT version.boring
    ORDER BY date'''
    all_versions = models.Version.objects.raw(version_query, (start_date, end_date))
    article_dict = {}
    for v in all_versions:
        a = models.Article(id=v.article_id, url=v.a_url,
                           initial_date=v.a_initial_date,
                           last_update=v.a_last_update,
                           last_check=v.a_last_check)
        v.article = a
        article_dict.setdefault(v.article, []).append(v)
    for article, versions in article_dict.items():
        url = article.url
        if not rx.match(url):
            print 'REJECTING', url
            continue
        if 'blogs.nytimes.com' in url:  # XXX temporary
            continue
        if len(versions) < 2:
            continue
        rowinfo = get_rowinfo(article, versions)
        articles.append((article, versions[-1], rowinfo))
    print 'Queries:', len(django.db.connection.queries), django.db.connection.queries
    articles.sort(key=lambda x: x[-1][0][1].date, reverse=True)
    return articles
def post(self, user):
    tags = self.request.get_all('tags')
    title = self.request.get('title')
    body = self.request.get('body')
    article = models.Article(title=title, body=body)
    for t in tags:
        article.tags.append(db.Key(encoded=t))
    article.put()
    time.sleep(0.10)
    return self.redirect('/admin/article')
def setUp(self):
    """Set up the class for the tests."""
    self._database_name_mock = mock.patch(
        "server.database_utils.database_name",
        return_value="mudima_test.db")
    self._database_name_mock.start()
    self._database_location = database_utils.database_path(
        database_utils.database_name())
    self._delete_database()
    self.article = models.Article("example.com",
                                  title="Example",
                                  keywords=["0", "1"])
    self.grouping = models.Grouping(self.article)
def add(self, message):
    # if not source_parser:
    #     # TODO factory or something based on source
    #     source_parser = HTMLSource(self.source)
    source_parser = HTMLSource(message.url)
    source_parser.fetch()

    article = models.Article()
    article.source = source_parser.source
    article.author = source_parser.author
    article.published = source_parser.published
    article.publisher = source_parser.publisher
    if message.timestamp:
        article.posted = message.timestamp
    if message.author:
        article.posted_by = message.author

    # Count token occurrences from the parsed source.
    tokens = article.tokens
    count = 0
    tokenizer = source_parser.tokenizer()
    for t in iter(tokenizer):
        token = t.lower()
        if token not in tokens:
            tokens[token] = 0
        tokens[token] += 1
        count += 1
    # while True:
    #     try:
    #         token = tokenizer.next().lower()
    #     except StopIteration:
    #         break
    #     if token not in tokens:
    #         tokens[token] = 0
    #     tokens[token] += 1
    #     count += 1

    article.save(context)

    if DEBUG:
        self.__stats.update({
            'tokenizer': tokenizer.stats(),
            'count': count,
        })
    return article
def search_name(self, value):
    for article in self.articles:
        if value in article.get('Source Title'):
            article_object = models.Article(article.get('Lens ID'),
                                            article.get('Title'),
                                            article.get('Source Title'),
                                            article.get('Date Published'),
                                            article.get('Author'),
                                            article.get('Publisher'))
            # TODO: article_object should be appended to a list so the matches accumulate.
            magazine_object = models.Magazine(article.get('Source Title'),
                                              article.get('ISSNs'),
                                              article_object)
            self.list_magazine.append(magazine_object)
def Submitarticle(request):
    if request.method == 'POST':
        form = article(request.POST)
        cur = models.Article()
        description = form['description'].value()
        name = form['name'].value()
        art = form['article'].value()
        link = form['link'].value()
        if str(link) == "":
            link = "None"
        cur.__addarticle__(name, description, art, link)
        return HttpResponseRedirect('/L/#article')
def test_clean_database(self):
    """Test clean database."""
    database_writer.write_groups([self.grouping])
    self.assertEqual(1, len(database_reader.get_urls()))
    database_writer.clean_database()
    self.assertEqual(1, len(database_reader.get_urls()))
    grouping = models.Grouping(
        models.Article(url="google.com",
                       publishedAt="2016-10-11T23:41:34Z",
                       keywords=["a"]))
    database_writer.write_groups([grouping])
    self.assertEqual(2, len(database_reader.get_urls()))
    database_writer.clean_database()
    self.assertEqual(1, len(database_reader.get_urls()))
def group_articles(article_list=None, debug=False):
    """Group good articles in the database."""
    if article_list is None:
        article_list = database_reader.get_ungrouped_articles()
    else:
        article_list = [
            models.Article(url=a) if isinstance(a, (str, unicode)) else a
            for a in article_list
        ]
    groupings = database_reader.get_grouped_articles()
    no_keyword_grouping = None
    for index, article in enumerate(article_list):
        if debug:
            print "Grouping", index, "out of", len(article_list)
        if not article.get_keywords():
            if no_keyword_grouping is None:
                # in_database is set to True here because we do not want a
                # no-keyword grouping in the database.
                no_keyword_grouping = models.Grouping(article, in_database=True)
            else:
                no_keyword_grouping.add_article(article)
            continue  # Skip the article if the keywords cannot be gotten from it.
        best_grouping, best_grouping_similarity = None, 0
        # Need to make a shallow copy of the list for the possibility of
        # combining two of the items in the list.
        for grouping in groupings[:]:
            similarity = grouping.best_similarity(article)
            if similarity > best_grouping_similarity:
                # If this article has a high similarity with two separate
                # groups, then combine the groups.
                if best_grouping_similarity > constants.MIN_COMBINE_GROUP_PERCENTAGE:
                    if best_grouping.in_database():
                        if grouping.in_database():
                            database_writer.remove_grouping_from_database(grouping)
                        best_grouping.combine_group(grouping)
                        groupings.remove(grouping)
                    else:
                        grouping.combine_group(best_grouping)
                        groupings.remove(best_grouping)
                        best_grouping = grouping
                else:
                    best_grouping = grouping
                best_grouping_similarity = similarity
        if best_grouping is not None and best_grouping_similarity > constants.MIN_GROUPING_PERCENTAGE:
            best_grouping.add_article(article)
        else:
            groupings.append(models.Grouping(article))
    if no_keyword_grouping:
        groupings.append(no_keyword_grouping)
    return groupings
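# Hedged usage sketch (not part of the original code): group_articles accepts
# either Article objects or plain URL strings, which it wraps in models.Article
# before grouping. The URLs below are illustrative; keyword extraction, similarity
# thresholds, and the existing groupings come from the surrounding project
# (constants, database_reader).
groups = group_articles([
    "https://example.com/senate-health-bill-vote",
    "https://example.org/gop-repeal-effort-stalls",
], debug=True)
for grouping in groups:
    print len(grouping.get_articles()), "articles in this grouping"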
def new_article(request):
    if request.method == 'POST':
        print request.POST
        form = ArticleForm(request.POST)
        if form.is_valid():
            print "form data: ", form.cleaned_data
            form_data = form.cleaned_data
            form_data['author_id'] = request.user.userprofile.id
            new_article_obj = models.Article(**form_data)
            new_article_obj.save()
            return render(request, 'web/new_article.html',
                          {'title': 'New Article Published',
                           'new_article_obj': new_article_obj})
        else:
            print "Error: ", form.errors
    category_list = models.Category.objects.all()
    return render(request, 'web/new_article.html',
                  {'title': 'New Article', 'category_list': category_list})
def paper_downloads(a_id, connection):
    """Returns time-series data from bioRxiv about how many times a paper's
    webpage and PDF have been downloaded.

    Arguments:
      - a_id: the Rxivist-issued ID given to the paper being queried.
      - connection: a database Connection object.
    Returns:
      - A list of months and the download stats for each month.
    """
    result = models.Article(a_id)
    result.GetTraffic(connection)
    return {
        "query": {"id": a_id},
        "results": [
            {"month": x.month, "year": x.year,
             "downloads": x.downloads, "views": x.views}
            for x in result.traffic
        ]
    }
def new_article(request):
    if request.method == "POST":
        form = ArticleForm(request.POST, request.FILES)
        if form.is_valid():
            print "--form data:", form.cleaned_data
            form_data = form.cleaned_data
            form_data['author_id'] = request.user.userprofiles.id
            new_img_path = handle_uploaded_file(request, request.FILES['head_img'])
            form_data['head_img'] = new_img_path
            new_article_obj = models.Article(**form_data)
            new_article_obj.save()
            return render(request, 'new_article.html',
                          {'new_article_obj': new_article_obj})
        else:
            print "err:", form.errors
    category_list = models.Category.objects.all()
    return render(request, 'new_article.html', {'category_list': category_list})
def create_article(user, ffile, title, abstract=None, language='en'):
    data = {
        'owner': user,
        'title': title,
        'slug': slugify(title),
        'abstract': abstract,
        'language': language
    }
    article = models.Article(**data)
    if not isinstance(ffile, File):
        # http://stackoverflow.com/questions/3501588/how-to-assign-a-local-file-to-the-filefield-in-django
        # handles regular files
        ffile = File(ffile)
        article.article.save('untitled', ffile)
    else:
        # handles uploaded files
        article.article = ffile
    article.save()
    article.authors.add(user.get_profile())
    article.save()  # necessary?
    return article
def editor_action(db):
    auth = check_session()
    if auth:
        title = request.forms.title
        subtitle = request.forms.subtitle
        img_url = request.forms.imgurl
        article = request.forms.article
        draft = int(request.forms.btnval)
        mode = request.query.m
        if mode == "new":
            new_post = models.Article(
                title=title,
                subtitle=subtitle,
                article=article,
                header_image=img_url,
                draft=draft,
                author_id=auth[0]
            )
            db.add(new_post)
        elif mode == "edit":
            id = request.forms.id
            if len(id) == 0:
                redirect("/admin/editor")
            post = db.query(models.Article).filter(
                and_(models.Article.id == id,
                     models.Article.author_id == auth[0]))
            post = post.first()
            if post.draft == True and post.draft != draft:
                post.created_on = datetime.now()
            post.title = title
            post.subtitle = subtitle
            post.header_image = img_url
            post.article = article
            post.draft = draft
        db.commit()
        redirect("/admin/view?mode=post")
    else:
        redirect("/admin/login")
def get_grouped_articles():
    """Gets the items in the database and puts them into Article and Grouping objects."""
    with database_utils.DatabaseConnection() as (connection, cursor):
        cursor.execute(
            "SELECT name, topic_id, link, article_text, image_url FROM article "
            "WHERE article_text != '' AND topic_id IS NOT NULL;")
        groups = {}
        for item in cursor.fetchall():
            name, id, url, article_text, image_url = item
            article = models.Article(url=url,
                                     title=name,
                                     text=article_text,
                                     urlToImage=image_url,
                                     in_database=True)
            article.set_keywords(_get_article_keywords(url, cursor))
            if id in groups:
                groups.get(id).add_article(article, new_article=False)
            else:
                groups[id] = models.Grouping(article,
                                             uuid=id,
                                             in_database=True,
                                             has_new_articles=False)
        return list(groups.values())
def article():
    updform = UpdateArticle()
    form = UploadArticle()
    fromDate = datetime.now() - timedelta(days=365)

    # Count this user's articles from the last year, broken down by status.
    na_rass = 0
    rrr = models.Article.query.filter(
        models.Article.timestamp >= fromDate).filter_by(stat=1).all()
    for r in rrr:
        f = models.File.query.filter_by(id=r.file).first()
        if f.owner == current_user.id:
            na_rass += 1

    otclon = 0
    rrr = models.Article.query.filter(
        models.Article.timestamp >= fromDate).filter_by(stat=2).all()
    for r in rrr:
        f = models.File.query.filter_by(id=r.file).first()
        if f.owner == current_user.id:
            otclon += 1

    prin = 0
    rrr = models.Article.query.filter(
        models.Article.timestamp >= fromDate).filter_by(stat=3).all()
    for r in rrr:
        f = models.File.query.filter_by(id=r.file).first()
        if f.owner == current_user.id:
            prin += 1

    alll = 0
    rrr = models.Article.query.filter(models.Article.timestamp >= fromDate).all()
    for r in rrr:
        f = models.File.query.filter_by(id=r.file).first()
        if f.owner == current_user.id:
            alll += 1

    if form.submit.data:
        block = models.BlockUser.query.filter_by(id_user=current_user.id).first()
        if block is None or not block.block_article:
            if block is not None and block.block_file:
                return 'File uploads are blocked'
            current_file = form.file.data
            file = models.File.upload(current_file)
            db.session.add(file)
            db.session.commit()
            idfile = models.File.query.filter_by(
                drive_file_id=file.drive_file_id).first_or_404()
            article = models.Article(file=idfile.id, name=form.name.data, stat=1)
            db.session.add(article)
            db.session.commit()
            return redirect(url_for('article'))
        else:
            return 'Adding articles is blocked'
    else:
        articles = models.Article.query.join(
            models.File, (models.File.id == models.Article.file)).all()
        files = models.File.query.filter(models.File.owner == current_user.id).all()
        statuses = models.Status.query.all()
        st = [(i.id, i.name) for i in statuses]
        ChangeArticleStatus.setStatuses(st)
        forms = {}
        articlesss = []
        st = models.Status.query.all()
        statuses = {}
        for s in st:
            statuses[s.id] = s.name
        for art in articles:
            forms[art.id] = ChangeArticleStatus(id=art.id, stat=art.stat)
            for f in files:
                if art.file == f.id:
                    articlesss.append({
                        'article': art,
                        'file': f,
                        'owner': current_user.username,
                        'owner_id': current_user.id,
                        'id': art.id,
                        'stat_id': art.stat,
                        'stat': statuses[art.stat],
                        'timestamp': art.timestamp.strftime("%d.%m.%Y %H:%M:%S")
                    })
        return render_template('articles.html', form=form, updform=updform,
                               forms=forms, na_rass=na_rass, otclon=otclon,
                               prin=prin, all=alll, articles=articlesss)
def test_get_keywords_bad_url(self):
    """Check that it does not error out when the url is bad."""
    article = models.Article("")
    with mock.patch("traceback.print_exc"):
        self.assertEqual(0, len(article.get_keywords()))
        self.assertEqual("", article.get_text())
"""Various utilities for tests.""" import database_utils import mock import models import os import unittest SIMILAR_ARTICLES = ( models.Article( "https://www.nytimes.com/2017/09/25/us/politics/obamacare-repeal-susan-collins-dead.html", keywords={ u'senators', u'repeal', u'support', u'bill', u'dead', u'gop', u'pivotal', u'health', u'declares', u'opposition', u'mr', u'vote', u'senator', u'republicans', u'republican', u'appears', u'care' }), models.Article( "http://thehill.com/policy/healthcare/352342-third-gop-senator-opposes-new-obamacare-" "repeal-killing-bill-ahead-of", keywords={ u'bill', u'trump', u'republicans', u'obamacare', u'dead', u'hearing', u'appears', u'lastditch', u'vote', u'collins', u'repeal', u'effort', u'gop' })) DISSIMILAR_ARTICLES = ( models.Article( "https://www.washingtonpost.com/opinions/cassidy-is-sorry-about-the-cassidy-graham-" "process-he-should-be/2017/09/25/0cd234f0-a243-11e7-ade1-76d061d56efa_story.html", keywords={ u'cassidygraham', u'votes', u'republicans', u'room', u'process',