Пример #1
0
def article_to_model(article_dict):
    """
    Convert a raw article dict into a model object.

    :type article_dict: dict
    :rtype: model.article.Article | model.article.TrainingArticle | None

    Returns a TrainingArticle when the dict carries a "specialCoverage"
    id, a plain Article otherwise, or None when a required field is
    missing or malformed (the failure is logged, never raised).
    """
    try:
        article_id = int(article_dict.get("id")[0])
        article_headline = (
            article_dict.get("headline")[0]).encode(TARGET_ENCODING)
        # Bind the value list once instead of calling .get("text") twice.
        text_values = article_dict.get("text")
        if text_values:
            article_text = text_values[0].encode(TARGET_ENCODING)
        else:
            article_text = ''
        article_model = Article(id=article_id,
                                categories=[],
                                headline=article_headline,
                                keywords=[],
                                lead=u'',
                                text=article_text)
        special_coverage_id = article_dict.get("specialCoverage")
        if special_coverage_id:
            # Promote to a training sample, reusing the plain model's fields.
            return TrainingArticle(specialCoverage=int(special_coverage_id[0]),
                                   **article_model.__dict__)
        return article_model
    except Exception as e:
        # Best-effort parsing boundary: log context and signal failure
        # with None instead of propagating.
        log("Exception on parsing article: {}, could not create model. Context: {}"
            .format(e, article_dict.keys()))
        return None
Пример #2
0
    def test_call_switch_display(self, mock_main_view):
        """Verify call_switch_display forwards the article to the view."""
        feed = [Article('test title', "test url", "date")]

        call_switch_display(mock_main_view, feed)

        mock_main_view.display_entry.assert_called_with(
            'test title', "test url")
Пример #3
0
def rss_author(name):
    """Render an RSS feed with up to 100 articles by the given author."""
    author_articles = Article().get_author_paged(name, start=0, amount=100)
    xml = render_template('rss.xml',
                          articles=author_articles,
                          version=settings.corres_version,
                          author=name)
    return Response(xml, mimetype='text/xml')
Пример #4
0
 def save(cls, user_name, id, title, content):
     """Create the article when ``id`` is None, otherwise update it.

     Returns the id of the stored article. Raises ServerException when
     the caller is not the article's owner.
     """
     summary = QiniuService.get_summary(content)
     if title is None:
         title = 'untitled'
     if id is None:
         # New article: derive a unique file key from content + timestamp.
         file_name = md5(title + content + str(datetime.now().timestamp()))
         url = QiniuService.upload_doc(content, file_name)
         catalogue_index = cls.get_max_catalogue_index(user_name, 1)
         new_article = Article(title=title,
                               file_key=file_name,
                               user_name=user_name,
                               url=url,
                               summary=summary,
                               catalogue_id=1,
                               catalogue_index=catalogue_index)
         new_article.insert()
         return Article.select().filter(
             Article.user_name == user_name,
             Article.file_key == file_name).one().id
     # Existing article: only the owner may modify it.
     article = Article.select().get(id)
     if user_name != article.user_name:
         raise ServerException(msg=f'您没有权限修改{article.user_name}的文章')
     file_name = md5(title + content)
     if article.file_key == file_name:
         # Content hash unchanged — nothing to upload.
         return id
     article.url = QiniuService.upload_doc(content, article.file_key, file_name)
     article.file_key = file_name
     article.summary = summary
     Article.update(article)
     return id
Пример #5
0
    def post(self):
        """Handle the new-post form: create an Article or re-render with an error."""
        if not self.user:
            self.redirect('/blog/login')
            return

        if 'main' in self.request.POST:
            self.redirect('/blog')
        elif 'sub' in self.request.POST:
            subject = self.request.get('subject')
            content = self.request.get('content')
            # The creator's uid comes from the signed cookie; assumed unique.
            uid = self.read_secure_cookie('user_id')

            if not (subject and content):
                # Missing subject or content — show the form again.
                self.render("new_post.html",
                            title=subject,
                            text=content,
                            error="Subject or Content is missing",
                            likes=0,
                            who_liked=[],
                            created_by=uid)
                return

            entry = Article(title=subject,
                            text=content,
                            likes=0,
                            who_liked=[],
                            created_by=uid)
            # Persist, then redirect to the new post's permalink.
            entry.put()
            self.redirect('/blog/%s' % str(entry.key().id()))
Пример #6
0
def save():
    """Persist a new article plus its draft and tags, then open the draft."""
    userid = session.get('userid')
    username = session.get('username')

    # Only a logged-in admin may publish.
    if not userid or not username or not session.get('is_admin'):
        return redirect(url_for('admin.login'))

    title = request.form.get('title')
    content = request.form.get('content')

    if not title or not content:
        # Incomplete form: show the publish page again with the tag choices.
        return render_template('admin/publish.html',
                               active='publish',
                               article=None,
                               tag_list=get_article_taglist(0))

    # Strip HTML background colors so they don't clash with the site CSS.
    content = htmlHelper.purge_background(content)

    # Insert a placeholder article row to obtain its id.
    article = Article(userid, username, '', '')
    db_session.add(article)
    db_session.flush()

    # Store the actual text as a draft linked to the article.
    draft = Draft(article.id, userid, username, title, content)
    db_session.add(draft)
    db_session.flush()

    # Attach the selected tags to the article.
    dao.save_tags(article.id, request.form.getlist('tags'))
    return redirect('/admin/draft?draftId=%d' % draft.id)
Пример #7
0
def publish():
    """Promote an article's draft to the published article, then show it."""
    if not session.get('is_admin'):
        return redirect(url_for('admin.login'))

    article_id = int(request.args.get('articleId', 0))
    if not article_id:
        return abort(404)

    draft = db_session.query(Draft).filter(
        Draft.article_id == article_id).first()
    article = db_session.query(Article).filter(
        Article.id == article_id).first()

    if draft:
        if article:
            # Already published once: overwrite with the draft content.
            article.title = draft.title
            article.content = draft.content
        else:
            # First publication: materialize the article from the draft.
            article = Article(draft.user_id, draft.user_name, draft.title,
                              draft.content)
            article.id = draft.article_id
            db_session.add(article)

        # The draft is consumed once published.
        db_session.delete(draft)
        db_session.flush()

    return redirect('/detail?articleId=' + str(article_id))
Пример #8
0
    def extract(self):
        """Crawl the source, wrap the JSON in an Article, store and return it."""
        raw = self.__crawler()
        result = Article(raw)
        self.__storage(result)
        return result
Пример #9
0
 def process_item(self, item, spider):
     """Persist one scraped item as an Article row (utf-8 encoded fields)."""
     record = Article(title=item["title"].encode("utf-8"),
                      url=item["url"],
                      body=item["body"].encode("utf-8"),
                      publish_time=item["publish_time"].encode("utf-8"),
                      source_site=item["source_site"].encode("utf-8"))
     self.session.add(record)
     self.session.commit()
Пример #10
0
def create_article(request, board, subject, content):
    """Build a new Article on ``board`` for the requesting user.

    Also increments the board's article counter. The article is returned
    unsaved — no persistence call is made here.
    """
    board.article_count += 1
    article = Article()
    article.board = board
    article.user = request.user
    article.subject = subject
    article.change_content(content)
    return article
Пример #11
0
    def test_ten_second_loop_calls_its_self(self, mock_timer, mock_main_view,
                                            mock_switch_display):
        """Verify ten_second_loop schedules the timer and switches the display."""
        feed = [Article('test title', "test url", "date")]

        ten_second_loop(mock_main_view, 7, feed)

        self.assertTrue(mock_timer.called)
        self.assertTrue(mock_switch_display.called)
Пример #12
0
 def process_item(self, item, spider):
     """Persist one scraped item as an Article in folder 2 (utf-8 fields)."""
     record = Article(title=item["title"].encode("utf-8"),
                      url=item["url"],
                      content=item["content"].encode("utf-8"),
                      publish_time=item["publish_time"].encode("utf-8"),
                      publish_user=item["publish_user"].encode("utf-8"),
                      folder_id=2)
     self.session.add(record)
     self.session.commit()
Пример #13
0
def home():
    """Render the paged home page with article and tweet counts."""
    start, amount = pager_args()
    stylesheet = os.path.join(os.path.dirname(__file__),
                              'static/thcrrspndnt.css')
    return render_template(
        'home.html',
        articles=Article().get_paged(start=start, amount=amount),
        start=int(start),
        amount=int(amount),
        tot_count=Article().get_count_filtered(),
        tweet_count=Tweet().get_tweetcount_filtered(),
        # The stylesheet mtime doubles as a cache-busting version tag.
        cssver=os.path.getmtime(stylesheet),
        site=settings.CONFIG.get('site', 'thecorrespondent.com'),
        version=settings.corres_version,
    )
Пример #14
0
    def extract(self, count):
        """Collect up to ``count`` links and return them as Article objects."""
        return [
            Article(link.title, link.url,
                    self.content_extractor.extract(link.url).snippet)
            for link in self.link_collector.collect(count)
        ]
 def post(self):
     """Store the submitted article under the cookie-identified user."""
     user_id = int(self.request.cookies.get('user_id'))
     user_info = ndb.Key('User', user_id).get()
     article = Article(username=user_info.username,
                       title=self.request.get('title'),
                       text=self.request.get('text'))
     article.put()
     return webapp2.redirect('/')
Пример #16
0
    def test_call_new_feed(self, mock_feedmanager, mock_getfeedcontents,
                           mock_getfeedname):
        """Verify call_new_feed updates the manager with the fetched feed."""
        test_url = "https://www.theguardian.com/us/rss"
        # Three articles, newest first (1, 2 and 3 days old).
        newest = Article("Article 1", "Link 1",
                         (datetime.now() - timedelta(days=1)))
        middle = Article("Article 2", "Link 2",
                         (datetime.now() - timedelta(days=2)))
        oldest = Article("Article 3", "Link 3",
                         (datetime.now() - timedelta(days=3)))

        mock_getfeedcontents.return_value = [newest, middle, oldest]
        mock_getfeedname.return_value = "Test Feed Name"

        call_new_feed(mock_feedmanager, test_url)

        mock_feedmanager.update.assert_called_once()
Пример #17
0
    def test_call_switch_display(self, mock_main_view):
        """Verify call_switch_display shows the managed feed's entry."""
        manager = FeedManager()
        entry = Article(self.test_title, self.test_url, datetime.now())
        manager.update("Test Feed Title", "Test Feed Url", [entry])

        call_switch_display(mock_main_view, manager)

        mock_main_view.display_entry.assert_called_with(
            self.test_title, self.test_url)
Пример #18
0
    def test_ten_second_loop_calls_its_self(self, mock_timer, mock_main_view,
                                            mock_switch_display):
        """Verify ten_second_loop schedules the timer and switches the display."""
        manager = FeedManager()
        entry = Article(self.test_title, self.test_url, datetime.now())
        manager.update("Test Feed Title", "Test Feed Url", [entry])

        ten_second_loop(mock_main_view, 7, manager)

        self.assertTrue(mock_timer.called)
        self.assertTrue(mock_switch_display.called)
Пример #19
0
def search():
    """Full-text search over articles and tweets; renders paged results.

    Falls back to the home page when no query is supplied.
    """
    query = request.args.get('query')
    if not query:
        return home()
    start, amount = pager_args()
    # Split the query on whitespace, but keep quoted strings together.
    # Raw string: '\w'/'\s' in a plain string are invalid escape
    # sequences (SyntaxWarning on modern Python).
    tokens = re.findall(r'\w+|"[\w\s]*"', query)
    articles = Article().get_search(tokens, start=start, amount=amount)
    tot_count = Article().get_searchcount(tokens)
    tweet_count = Tweet().get_tweetcount_searchquery(tokens)
    payload = render_template('search.html',
                              articles=articles,
                              start=int(start),
                              amount=int(amount),
                              version=settings.corres_version,
                              tot_count=tot_count,
                              tweet_count=tweet_count,
                              tokens=tokens,
                              # Stylesheet mtime doubles as a cache buster.
                              cssver=os.path.getmtime(
                                  os.path.join(os.path.dirname(__file__),
                                               'static/thcrrspndnt.css')))
    return payload
Пример #20
0
def cluster_article():
    '''
        Cluster news article

        Reads a JSON body with "title" and "timestamp", runs the NER and
        categorisation models over the title, builds an Article record and
        hands it to the online clusterer. Responds 400 when "title" is
        missing, otherwise {"success": true}.
    '''
    if not "title" in request.json:
        return json.loads("{}"), 400

    title = utils.preprocess_title(request.json["title"])
    sentence = tokenizer.tokenize(title)
    # Encode the title once; both models consume the same input ids.
    input_tokens = tokenizer.encode_plus(
        title,
        max_length=SEQUENCE_LENGTH,
        pad_to_max_length=SEQUENCE_LENGTH,
        add_special_tokens=True,
        return_tensors='tf',
        return_token_type_ids=True,
        return_attention_mask=True,
    )

    inputs = {
        "attention_mask": input_tokens["attention_mask"],
        "token_type_ids": input_tokens["token_type_ids"],
        "training": False
    }

    # Get entities
    ner_output = models["ner"](input_tokens["input_ids"], **inputs)[0]
    # argmax over the label axis -> one predicted label id per token.
    ner_output = np.argmax(ner_output, axis=2)[0]
    entities = utils.post_token_classification(sentence,
                                               ner_output,
                                               ner_labels,
                                               normalize=True)

    # Get categorisation
    cat_predictions = models["categorisation"](input_tokens["input_ids"])

    # Get CLS token
    # NOTE(review): [0][0] appears to select the first sequence of the
    # first output tensor — confirm against the transformer's output shape.
    transformer_outputs = models["categorisation"].transformer(
        input_tokens["input_ids"])[0][0]

    article = Article(db,
                      raw=title,
                      token_ids=input_tokens["input_ids"].numpy().tolist(),
                      cls_token=transformer_outputs[0].numpy().tolist(),
                      created_at=request.json["timestamp"],
                      # sigmoid turns raw logits into per-category scores.
                      categories=utils.sigmoid(
                          cat_predictions[0][0].numpy()).tolist(),
                      entities=entities)
    clusters.fit_article(article)
    return jsonify({"success": True})
Пример #21
0
def add_article():
    """GET: render the creation form. POST: add one unit of a new article."""
    if request.method != 'GET':
        new_article = Article(name=request.form['name'],
                              description=request.form['description'],
                              price=int(request.form['price']))
        # A single unit of the new article goes into stock.
        stock.addArticleQuantity(new_article, 1)
        return redirect(url_for('index'))
    return render_template('add_article.html')
Пример #22
0
def read_file(file_dir_path):
    """Read every article file in ``file_dir_path`` and parse it.

    Each file yields one Article built from: a title derived from the file
    name, a write time (Chinese-format date line), a background line
    (prefixed with ">"), the body, and a trailing annotation separated
    from the body by a division line.

    :param file_dir_path: directory containing the article text files
    :return: list of Article objects, one per parsed file
    """
    articles = []
    for file in os.listdir(file_dir_path):
        # NOTE(review): file.title() title-cases the raw file name; the
        # original relied on it both to skip ".DS_Store"-like files and to
        # derive the article title — preserved as-is.
        print(file.title())
        if "Store" in file.title():
            continue
        title = get_article_title(file.title())
        write_time = None
        background = ""
        back_line = 0
        anno_line = 0
        # Close the handle deterministically (previously it was leaked).
        with open(file_dir_path + "/" + file, "r",
                  encoding="utf-8") as fh:
            lines = fh.readlines()
        for i in range(len(lines)):
            lines[i] = lines[i].strip()
            line = lines[i]
            # NOTE(review): after strip() a blank line becomes "" and
            # "".isspace() is False, so this guard never fires; kept
            # verbatim to preserve behavior.
            if line.isspace():
                continue
            # A Chinese-format date line marks the write time.
            if match_chinese_data(line):
                write_time = ChineseToDate(line.strip().replace(" ", ""))
                continue
            # Background / context line.
            if line.startswith(">"):
                background = line.replace(">", "").strip()
                back_line = i
                continue
            # The annotation follows the body after a division line.
            if match_division(line):
                anno_line = i
                continue
        if anno_line == 0:
            anno_line = len(lines)
        content = "".join(lines[back_line + 1:anno_line])
        annotation = "".join(lines[anno_line + 1:len(lines)])

        articles.append(
            Article(title, write_time, background, content, annotation))

    return articles
Пример #23
0
def parse(feed_link: str) -> list:
    """Fetch an atom/rss feed and return its entries as Articles (unsorted).

    :param feed_link: URL of the feed to fetch.
    :return: list of Article objects, one per feed entry.
    """
    fm_logger.debug('parse')

    feed = feedparser.parse(feed_link)
    article_list = []

    for entry in feed.entries:
        # entry.published_parsed is a time.struct_time built by feedparser.
        # (Renamed from "datetime" to avoid shadowing the module name.)
        published = entry.published_parsed
        article_list.append(Article(entry.title, entry.link, published))

    return article_list
Пример #24
0
def add_article():
    """GET: render the creation form. POST: create an article with quantity.

    Form fields: name, description, price, quantity.
    """
    if request.method == 'GET':
        # Show the creation form.
        return render_template('add_article.html')
    else:
        # Create the article from the POST body.
        articleName = request.form['name']
        articleDescription = request.form['description']
        articlePrice = request.form['price']
        articleQuantity = request.form['quantity']

        # Map form fields onto the model's column keyword arguments.
        article = Article(name=articleName,
                          description=articleDescription,
                          price=int(articlePrice))
        # Form values arrive as strings; convert quantity to int to match
        # the integer quantities used elsewhere with addArticleQuantity.
        stock.addArticleQuantity(article, int(articleQuantity))

        return redirect(url_for('index'))
Пример #25
0
def deliver_article():
    """Publish an article for the logged-in user and award effect points.

    Expects a JSON body with reward_status, title, content, tag_id,
    category_id, category_name and tag_name. Returns a JSON status dict;
    208 when the login has expired, 203 on bad parameters or DB failure.
    """
    data = request.get_json()
    if is_user_login(request):
        user_id = get_user_id_by_cookie(request)
    else:
        return jsonify({'code': 208, 'msg': "登录信息已经过期"})
    reward_status = int(data['reward_status'])
    title = data['title']
    content = data['content']
    tag_id = data['tag_id']
    category_id = data['category_id']
    category_name = data['category_name']
    tag_name = data['tag_name']
    if exist_user(user_id) and exist_category(category_id):
        send_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        # Normalize blank title/content to None.
        if title == '' or title.isspace():
            title = None
        if content == '' or content.isspace():
            content = None
        r1 = Article(user_id, title, content, tag_id, category_id, tag_name, send_time, send_time,
                     category_name, reward_status)
        try:
            db.session.add(r1)
            # Award the author points for publishing.
            # NOTE(review): the original comment said +3 but the code adds
            # 4 — confirm which is intended.
            user = User.query.filter(User.id == user_id).first()
            user.effect += 4
            # Bump the heat of every tag attached to the article.
            tag_id = str(tag_id)
            for i in tag_id.split(","):
                result = Tag.query.filter(Tag.tag_id == i).first()
                result.tag_heat = int(result.tag_heat) + 1
            db.session.commit()
            return jsonify({"status": "200", "msg": "发布成功"})
        except Exception:
            # Was a bare except: catch only real errors (not SystemExit/
            # KeyboardInterrupt) and undo the partial transaction.
            db.session.rollback()
            return jsonify({"status": "203", "msg": "发布失败"})
    else:
        return jsonify({"status": "203", "msg": "参数有误"})
Пример #26
0
def parse_article() -> Article:
    """
    Fetch the current top article from 'https://www.theguardian.com/international' and parse it into the model.
    :return: Model of type Article
    """
    url = get_link()
    # For example:
    # url = 'https://www.theguardian.com/world/2020/jan/05/donald-trump-vows-to-hit-52-sites-very-hard-if-iran-retaliates-for-suleimani-killing'

    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    headline = soup.find('h1', class_='content__headline').text
    date = soup.find('time', class_='content__dateline-wpd')["datetime"]

    # Join every body paragraph, blank-line separated (str.join avoids
    # quadratic += concatenation on long articles).
    paragraphs = soup.find(class_='content__article-body').find_all('p')
    article_body = ''.join(p.text + '\n\n' for p in paragraphs)

    return Article(headline, article_body, "Guardian", url, date)
Пример #27
0
def parse_article() -> Article:
    """
    Fetch the current top article from https://edition.cnn.com/world and parse it into the model.
    :return: Model of type Article
    """
    url = get_link()
    # For example:
    # url = 'https://edition.cnn.com/2020/01/03/business/carlos-ghosn-escape-jet/index.html'

    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    headline = soup.find('h1', class_='pg-headline').text
    date = soup.find('p', class_='update-time').text

    # Join every body paragraph, blank-line separated (str.join avoids
    # quadratic += concatenation on long articles).
    paragraphs = soup.find_all(class_='zn-body__paragraph')
    article_body = ''.join(p.text + '\n\n' for p in paragraphs)

    return Article(headline, article_body, "CNN", url, date)
Пример #28
0
def rss():
    """Render the site-wide RSS feed with the latest 100 articles."""
    latest = Article().get_paged(start=0, amount=100)
    xml = render_template('rss.xml',
                          articles=latest,
                          version=settings.corres_version)
    return Response(xml, mimetype='text/xml')
Пример #29
0
    def get_article_list(self):
        """Return a list containing a single, default-constructed Article."""
        return [Article()]
def _parse_rss(bs_feed: BeautifulSoup) -> List[Article]:
    """
    model.parser._parse_rss

    Parses the data within BeautifulSoup into a list of Articles.

    Raises exceptions if the rss file is not properly formatted to be parsed. Could also raise an exception if the rss
    feed has no entries.

    Returns the contents of the feed.

    Arguments:
        bs_feed -- the BeautifulSoup object which contains data on this feed.
    """

    p_logger.debug('_parse_rss')
    # Get the relevant meta about the feed itself (name & link)

    if bs_feed.rss.channel is None:
        raise InvalidRssException("By RSS V2.0 specifications the rss element must have a single, subordinate"
                                  + "<channel> element which contains metadata on the feed.")

    if bs_feed.rss.channel.title is None:
        raise InvalidRssException("By RSS V2.0 specifications the channel element must have a single, subordinate"
                                  + "<title> element which is the name of the feed itself.")

    # NOTE(review): validation checks bs_feed.rss.channel but the values
    # are read via bs_feed.channel — same first <channel> for well-formed
    # feeds; kept as-is.
    feed_name = bs_feed.channel.title

    if bs_feed.rss.channel.link is None:
        raise InvalidRssException("By RSS V2.0 specifications the channel element must have a single, subordinate"
                                  + "<link> element which is the URL to the HTML website corresponding to the channel.")

    feed_link = bs_feed.channel.link

    # Get the items within the feed and parse them as Articles

    feed_contents = []
    items = bs_feed.find_all("item")

    if len(items) == 0:
        raise InvalidRssException("This rss feed has no entries.")

    for item in items:
        title = item.title.string
        link = item.link.string
        date = item.pubDate.string

        # Make sure the required data was parsed in order to create an Article
        if title is None or link is None or date is None:
            # Build the diagnostic once; the original duplicated it and
            # used the typo "/n/t" where "\n\t" (newline + tab) was meant.
            message = ("The Article contains blank information and cannot be parsed:"
                       "\n\t" + "title == %s" % title +
                       "\n\t" + "link == %s" % link +
                       "\n\t" + "date == %s" % str(date))
            print(message)
            p_logger.info(message)
            continue

        # Convert the date from rfc822 (rss std format) to datetime
        # The one line of code comes from:
        # https://stackoverflow.com/questions/1568856/how-do-i-convert-rfc822-to-a-python-datetime-object
        date = datetime.utcfromtimestamp(utils.mktime_tz(utils.parsedate_tz(date)))

        feed_contents.append(Article(title, link, date))

    if len(feed_contents) == 0:
        raise InvalidRssException("The feed was found to be blank. Could not parse the feed.")

    return feed_contents