Пример #1
0
def history(site, start_page_id):
    """
    Render one page of the archive (past log) listing of a site.

    :param site: site whose pages are listed; ``site.id`` is read here
    :param start_page_id: id to list from, converted with ``int()``;
        the value 100000000 selects the newest pages instead
    :return: the rendered ``dat/history.html`` template
    """
    _limit = 20
    svm = generate_index_contents(site)

    # Parameter check: 100000000 is treated as "start from the newest page".
    start_page_id = int(start_page_id)
    if start_page_id != 100000000:
        pages = Page.get_history(site_id=site.id, pk_until=start_page_id, _limit=_limit)
    else:
        pages = Page.get_new_history(site_id=site.id, _limit=_limit)

    # Rank the pages.
    pages = page_rank(pages)

    # Link target of the next listing page: only offered when this page is
    # completely filled, and it points just below the last id shown.
    is_next = pages[-1].id - 1 if pages and len(pages) == _limit else None

    # NOTE(review): ``keyword`` is not defined inside this function; it has to
    # exist at module level or this line raises NameError -- confirm.
    return render_template(
        'dat/history.html',
        site=site,
        keyword=keyword,
        list_pages=pages,
        svm=svm,
        is_next=is_next,
    )
Пример #2
0
def index(site, page_id):
    """
    Render a single page of a site.

    :param site: site the page belongs to; ``site.id`` is used for the lookup
        and in log messages
    :param page_id: id of the page to show, converted with ``int()``
    :return: the rendered ``dat/page.html`` template, or the result of
        ``error_page`` when the page does not exist, is not enabled, or the
        site has no content to show
    :raises ValueError: when ``page_id`` cannot be converted to an int
    """
    # Parameter check and main content lookup.
    page_id = int(page_id)
    try:
        contents = Page.get_by_site(page_id, site.id)
    except Page.DoesNotExist:
        app_log(logging.ERROR, "Page does not exist site_id:{} page_id:{}".format(site.id, page_id))
        return error_page(site, ErrorPageCategory.DoesNotExist)

    # A page outside its enabled period is an error.
    if not contents.is_enable():
        app_log(logging.ERROR, "Page is not open site_id:{} page_id:{}".format(site.id, page_id))
        return error_page(site, ErrorPageCategory.NotOpen)

    # Extra pages to show, and the ids that must not be repeated in the lists:
    # the page itself and, when present, its previous page.
    extend_page = contents.get_history_from_myself()
    ignore_ids = [page_id]
    prev_page = contents.prev_page
    if prev_page:
        ignore_ids.append(prev_page.id)

    try:
        svm = generate_index_contents(site, extend_page=extend_page, ignore_ids=ignore_ids)
    except SiteEmptyError:
        app_log(logging.WARNING, "site is empty site_id:{} page_id:{}".format(site.id, page_id))
        return error_page(site, ErrorPageCategory.SiteIsEmpty)

    # Record page views by sampling: one request in ``pv_sampling`` adds
    # ``pv_sampling`` views. ``randrange(n)`` yields n equally likely values,
    # so the expected increment is exactly one view per request
    # (``randint(0, 20)`` has 21 outcomes and under-counted).
    pv_sampling = 20
    if random.randrange(pv_sampling) == 0:
        contents.count_up(pv_sampling)

    return render_template('dat/page.html',
                           contents=contents,
                           site=site,
                           svm=svm)
Пример #3
0
 def update_at(self):
     """
     Return ``open_at`` of the highest-id page that is currently enabled.

     Pages come from ``Page.get_new_history(self.id)`` and are checked from
     the highest id downwards against the current UTC time.

     :raises ValueError: when none of the fetched pages is enabled
     """
     from module.site.page import Page
     import datetime
     import pytz
     candidates = sorted(Page.get_new_history(self.id),
                         key=lambda p: p.id,
                         reverse=True)
     current = datetime.datetime.now(pytz.utc)
     # Lazily evaluated, so checking stops at the first enabled page.
     first_enabled = next((p for p in candidates if p.is_enable(current)), None)
     if first_enabled is None:
         raise ValueError
     return first_enabled.open_at
Пример #4
0
 def update_at(self):
     """
     Return ``open_at`` of the highest-id page that is currently enabled.

     Pages come from ``Page.get_new_history(self.id)``; they are visited in
     descending id order and tested with ``is_enable`` against the current
     UTC time.

     :raises ValueError: when none of the fetched pages is enabled
     """
     from module.site.page import Page
     import datetime
     import pytz
     history = Page.get_new_history(self.id)
     newest_first = sorted(history, key=lambda entry: entry.id, reverse=True)
     utc_now = datetime.datetime.now(pytz.utc)
     for entry in newest_first:
         if not entry.is_enable(utc_now):
             continue
         return entry.open_at
     raise ValueError
Пример #5
0
def main(subject):
    """
    Analyse the posts of one subject and store the resulting pages.

    The steps run in this order: read posts with ``dat_reader``, analyse
    posts and keywords, register keywords, collect page outputs, drop
    overlapping ones, bulk insert the pages, register page/keyword relations
    and finally set the start times.

    :param subject: object providing ``dat_url`` and ``site.id``; it is also
        handed to ``output_for_page``
    """
    # Load every post, keyed by its number.
    posts = {posted.num: posted for posted in dat_reader(subject.dat_url)}

    # Parse the loaded posts, drop errors and weight them by replies.
    posts = analyze_post(posts)

    # Keyword analysis.
    r_indexes = analyze_keyword(posts)

    # Reflect the keyword analysis in the posts, then bulk-register keywords.
    insert_keyword(posts, r_indexes, subject.site.id)

    # Output the highly rated posts.
    pages = []
    for key in posts:
        output = PageRepository(_type=PageType.POST_RANK.value)
        post = posts[key]
        if not post.priority > 300:
            continue
        print("++++++++++++++++++++")
        print(post.priority)
        print("++++++++++++++++++++")
        post.printer(posts=posts, output=output)
        # Remember the output for the DB write below.
        pages.append(output)

    # Output the posts that score highly on keywords.
    pages.extend(printer_res(r_index, posts) for r_index in r_indexes)

    # Build the records that go into the DB.
    pages = filter_overlap(pages)
    keyword_record_dict = {
        entry.keyword: entry.keyword_record for entry in r_indexes
    }
    # NOTE(review): ``is_enable`` is read here without being called; if it is
    # a method rather than a property this filter is always true -- confirm.
    bulk_pages = []
    for candidate in pages:
        if candidate.is_enable:
            bulk_pages.append(
                candidate.output_for_page(subject, keyword_record_dict))

    # Bulk insert, then register the page/keyword relations.
    pages = Page.bulk_insert(bulk_pages)
    PageKeywordRelation.register(pages)

    # Choose the publication times.
    set_start_at(pages)
Пример #6
0
def get_pr_page(site, _limit=3):
    """
    Fetch pages of another site, used for crawler optimisation.

    The 30 newest-history pages of the other site are ordered by
    ``view_count`` (highest first) and the enabled ones are collected until
    ``_limit`` pages have been gathered.

    :param site: site
    :param _limit: number of pages at which collecting stops
    :return: list[Page]
    """
    other_site = _get_other_site(site)
    candidates = Page.get_new_history(other_site.id, _limit=30)
    ranked = sorted(candidates, key=lambda p: p.view_count, reverse=True)

    picked = []
    current = datetime.datetime.now(pytz.utc)
    for candidate in ranked:
        if candidate.is_enable(current):
            picked.append(candidate)
        # Checked on every iteration, whether or not a page was just added.
        if len(picked) >= _limit:
            break
    return picked
Пример #7
0
def get_pr_page(site, _limit=3):
    """
    Fetch pages of another site, used for crawler optimisation.

    Looks at the 30 newest-history pages of the site returned by
    ``_get_other_site``, most viewed first, and gathers enabled pages until
    ``_limit`` of them have been found.

    :param site: site
    :param _limit: number of pages at which gathering stops
    :return: list[Page]
    """
    partner = _get_other_site(site)
    by_views = sorted(
        Page.get_new_history(partner.id, _limit=30),
        key=lambda entry: entry.view_count,
        reverse=True,
    )
    result = []
    utc_now = datetime.datetime.now(pytz.utc)
    for entry in by_views:
        if entry.is_enable(utc_now):
            result.append(entry)
        # The size check runs after every page, enabled or not.
        if not len(result) < _limit:
            return result
    return result
Пример #8
0
def main(subject):
    """
    Turn the posts of one subject into stored pages.

    Order of work: load posts through ``dat_reader``, run ``analyze_post``
    and ``analyze_keyword``, register keywords, build the page outputs,
    filter overlaps, bulk insert, register page/keyword relations, then set
    the start times.

    :param subject: object providing ``dat_url`` and ``site.id``; it is also
        handed to ``output_for_page``
    """
    # Loading: index each post by its number.
    posts = {}
    for item in dat_reader(subject.dat_url):
        posts[item.num] = item

    # Post-load parsing, error removal and reply-based weighting.
    posts = analyze_post(posts)

    # Keyword analysis.
    r_indexes = analyze_keyword(posts)

    # Apply keyword results to the posts and bulk-register the keywords.
    insert_keyword(posts, r_indexes, subject.site.id)

    # Highly rated posts.
    separator = "++++++++++++++++++++"
    pages = []
    for num in posts:
        output = PageRepository(_type=PageType.POST_RANK.value)
        current = posts[num]
        if current.priority > 300:
            print(separator)
            print(current.priority)
            print(separator)
            current.printer(posts=posts, output=output)
            # Kept for the DB write below.
            pages.append(output)

    # Posts with a high keyword score.
    for r_index in r_indexes:
        keyword_page = printer_res(r_index, posts)
        pages.append(keyword_page)

    # Records to be written to the DB.
    pages = filter_overlap(pages)
    keyword_record_dict = {idx.keyword: idx.keyword_record for idx in r_indexes}
    # NOTE(review): ``is_enable`` is not called here; if it is a method and
    # not a property, every page passes this filter -- confirm.
    bulk_pages = [
        repo.output_for_page(subject, keyword_record_dict)
        for repo in pages
        if repo.is_enable
    ]

    # Bulk insert and relation registration.
    pages = Page.bulk_insert(bulk_pages)
    PageKeywordRelation.register(pages)

    # Publication times.
    set_start_at(pages)
Пример #9
0
def sitemap():
    """
    Render the sitemap.xml served to the Google crawler.

    :return: the rendered ``sitemap/sitemap.html`` template
    """
    all_sites = Site.get_all()
    new_pages = Page.gets_new(10000)
    new_keywords = PageKeywordRelation.gets_new(10000)

    # Despite its name this is the UTC time one hour ago; it is the reference
    # time for the ``is_enable`` check below.
    now = datetime.datetime.now(pytz.utc) - datetime.timedelta(seconds=3600)

    # Keep only relations that have a page and whose page is enabled.
    new_keyword_pages = []
    for relation in new_keywords:
        if relation.page and relation.page.is_enable(now):
            new_keyword_pages.append(relation)

    newest_site_date = max(site.created_at for site in all_sites)
    # NOTE(review): these two timestamps are naive local times, unlike the
    # timezone-aware ``now`` above -- confirm the template expects that.
    one_days_ago = datetime.datetime.now() - datetime.timedelta(days=1)
    three_days_ago = datetime.datetime.now() - datetime.timedelta(days=3)

    return render_template(
        'sitemap/sitemap.html',
        url_base='http://www.niku.tokyo/',
        new_site_date=newest_site_date,
        all_sites=all_sites,
        new_pages=new_pages,
        new_keyword_pages=new_keyword_pages,
        one_days_ago=one_days_ago,
        three_days_ago=three_days_ago,
    )
    def run(self):
        """
        Register PageKeywordRelation rows for every existing page.

        Pages are fetched with ``Page.get_all()``, ordered by id and handed
        to ``PageKeywordRelation.register`` in batches of 10. Progress is
        printed as ``done/total`` after each batch.

        :raises AssertionError: when PageKeywordRelation already has at least
            one row, so existing data is never processed twice
        """
        # Do not run if PageKeywordRelation holds even a single record.
        if PageKeywordRelation.objects().filter().count() > 0:
            raise AssertionError("PageKeywordRelation data is exist")

        # Fetch all pages, ordered by id.
        page_all = sorted(Page.get_all(), key=lambda x: x.id)
        total = len(page_all)

        # Register 10 pages at a time. Slicing gives batches of exactly
        # ``batch_size`` (the last one may be shorter) and never produces an
        # empty batch, so ``register`` is not called with an empty list.
        batch_size = 10
        for start in range(0, total, batch_size):
            batch = page_all[start:start + batch_size]
            PageKeywordRelation.register(batch)
            print("{}/{}".format(start + len(batch), total))
Пример #11
0
def tests_page_models():
    """
    Check that every stored page gets a background image id between 1 and 5.

    Each id is printed after it has been checked.
    """
    # Disabled insert/update scenario, kept for reference:
    # # insert
    # page = Page(site_id=1,
    #             dat_id=12345,
    #             page="agraeg43g34qhg43qh43qh34")
    # page2 = Page(site_id=1,
    #              dat_id=112345,
    #              page="agraeg43g34qhg43qh43qh34")
    #
    # Page.bulk_insert([page, page2])
    #
    # # update
    # page2.dat_id = 22222
    # page2.save()

    for stored_page in Page.objects().all():
        image_id = stored_page.site.get_background_image_id(stored_page.id)
        assert 1 <= image_id <= 5
        print(image_id)

    # NOTE(review): a bare ``raise`` with no active exception raises
    # RuntimeError, so this test fails even when all assertions pass --
    # presumably a trick to make the runner show the printed ids; confirm.
    raise
Пример #12
0
 def output_for_page(self, subject, keyword_record_dict):
     """
     Build the Page object that is written to the DB.

     The page body is the concatenation of every post in ``self.output``;
     the page top is the message of the first post joined with ``<br/>``.

     :param subject: Subject; ``site.id`` and ``dat_id`` are read
     :param keyword_record_dict: mapping handed to
         ``self.get_keyword_record_ids``
     :rtype : Page
     """
     body = ''.join(
         post.generate_post_message_for_db() for post in self.output)

     # Despite its name, ``get_keyword_record_ids`` yields objects whose
     # ``id`` attribute is read here.
     record_ids = []
     for record in self.get_keyword_record_ids(keyword_record_dict):
         record_ids.append(record.id)

     first_post = self.output[0]
     top_html = '<br/>'.join(first_post.post_message_for_output)

     return Page(
         site_id=subject.site.id,
         dat_id=subject.dat_id,
         page=body,
         page_top=top_html,
         type=self.matome_type,
         _keywords=','.join(str(record_id) for record_id in record_ids),
     )
Пример #13
0
def set_start_at(pages):
    """
    Distribute start times over freshly created pages.

    Nothing is done for three pages or fewer, or when
    ``Page.get_feature_page`` already returns 30 or more pages for the site.
    Otherwise every page from the third onward is passed to ``_set_start_at``
    unshifted, and the first two are passed with a 24 hour shift.

    :param pages: list(Page)
    :return: None
    """
    # Three pages or fewer: nothing to schedule.
    if not len(pages) > 3:
        return

    # Per the original comment: skip when 30 or more pages are already
    # reserved within the coming 48 hours.
    feature_page = Page.get_feature_page(pages[0].site_id)
    print(feature_page)
    if not len(feature_page) < 30:
        return

    # Split: everything after the first two goes out today, the first two
    # are pushed back by a day.
    for_today = pages[2:]
    for_tomorrow = pages[:2]

    one_day = datetime.timedelta(hours=24)
    _set_start_at(for_today)
    _set_start_at(for_tomorrow, time_shift=one_day)
Пример #14
0
def set_start_at(pages):
    """
    Assign start times to newly created pages.

    Returns without doing anything when there are three pages or fewer, or
    when ``Page.get_feature_page`` reports 30 or more pages for the site.
    Otherwise the pages after the first two go to ``_set_start_at`` as they
    are and the first two go with a ``time_shift`` of 24 hours.

    :param pages: list(Page)
    :return: None
    """
    if len(pages) > 3:
        # Per the original comment: pages already reserved within the coming
        # 48 hours; with 30 or more of them nothing new is scheduled.
        feature_page = Page.get_feature_page(pages[0].site_id)
        print(feature_page)
        if len(feature_page) < 30:
            # Reorder: the tail is scheduled first, the two leading pages
            # are delayed by one day.
            tail = pages[2:]
            head = pages[:2]
            _set_start_at(tail)
            _set_start_at(head, time_shift=datetime.timedelta(hours=24))
Пример #15
0
# -*- coding: utf-8 -*-
from module.scraping.search import SearchManager
from module.site.page import Keyword, Page
from module.site.site import Site


# Print keywords 1-9, two passes in a row.
# NOTE(review): the repetition presumably exercises a cache -- confirm.
for _pass in range(2):
    for i in range(1, 10):
        print(Keyword.get(i))

# Read the same attribute of page 1 four times, then its tile label.
page = Page.get(1)
for _pass in range(4):
    print(page.keywords)

print(page.tile_label)
Пример #16
0
def generate_index_contents(site, _limit=30, extend_page=None, ignore_ids=()):
    """
    Build the data shown on the top page of a site.

    :param site: Site
    :param _limit: int, number of newest-history pages to fetch
    :param extend_page: list(Page), extra pages merged into the candidates
    :param ignore_ids: list(int), ids of pages that must not be shown
    :return: SiteViewModel
    :raises SiteEmptyError: when no enabled, non-ignored page is left
    """
    # Work on a copy so that extending does not mutate the list object
    # returned by Page.get_new_history.
    pages = list(Page.get_new_history(site.id, _limit=_limit))
    if extend_page:
        pages += extend_page

    # Pages whose publication lies in the future are not shown.
    now = datetime.datetime.now(pytz.utc)
    pages = [page for page in pages if page.is_enable(now=now)]

    # De-duplicate by id and drop ignored ids (a falsy ignore_ids means
    # "ignore nothing").
    ignore_ids = ignore_ids or ()
    pages_repository = {page.id: page for page in pages if page.id not in ignore_ids}
    pages = list(pages_repository.values())

    # No page at all.
    if not pages:
        raise SiteEmptyError

    # Ten pages or fewer: pick randomly instead of ranking.
    if len(pages) <= 10:
        pr_pages = get_pr_page(site)
        return SiteViewModel(site=site,
                             contents=random.choice(pages),
                             panels=[random.choice(pages) for _ in range(6)],
                             page_list=pages + pr_pages)

    # From the 10 newest pages take the main content: ordered by view count,
    # or by start_at when every one of them has a start_at.
    new_list = sorted(pages, key=lambda x: x.id, reverse=True)[:10]
    new_list = sorted(new_list, key=lambda x: x.view_count, reverse=True)
    if all(page.start_at for page in new_list):
        new_list = sorted(new_list, key=lambda x: x.start_at, reverse=True)
    contents = new_list[0]
    new_list = new_list[1:]
    pages_repository.pop(contents.id)

    # Take 3 panel pages at random from the remaining 9 newest pages.
    random.shuffle(new_list)
    panels = new_list[1:4]
    for panel_page in panels:
        panel_page.set_favorite(False)
        pages_repository.pop(panel_page.id)

    # From the remaining pages take the 3 with the most views as favorites.
    # The list is sorted most-viewed first, so they are at the front
    # (popping from the end would pick the least viewed ones).
    left_pages = sorted(pages_repository.values(), key=lambda x: x.view_count, reverse=True)
    favorites, left_pages = left_pages[:3], left_pages[3:]
    for panel_page in favorites:
        panel_page.set_favorite(True)
        panels.append(panel_page)
    random.shuffle(panels)

    # Grade the remaining pages by view count.
    for page in left_pages[0:3]:  # 3 pages
        page.set_view_level(PageViewLevel.SUPERNOVA)

    for page in left_pages[3:5]:  # 2 pages
        page.set_view_level(PageViewLevel.HOT)

    for page in left_pages[5:7]:  # 2 pages
        page.set_view_level(PageViewLevel.WARM)

    # Order the rest by id, newest first.
    left_pages = sorted(left_pages, key=lambda x: x.id, reverse=True)

    # Append the PR pages meant for crawlers.
    left_pages = left_pages[:20]
    left_pages += get_pr_page(site)
    return SiteViewModel(site=site,
                         contents=contents,
                         panels=panels,
                         page_list=left_pages)
Пример #17
0
 def page(self):
     """Return the result of ``Page.get`` for ``self.page_id``."""
     page_id = self.page_id
     return Page.get(page_id)
Пример #18
0
def generate_index_contents(site, _limit=30, extend_page=None, ignore_ids=()):
    """
    Build the data shown on the top page of a site.

    :param site: Site
    :param _limit: int, number of newest-history pages to fetch
    :param extend_page: list(Page), extra pages merged into the candidates
    :param ignore_ids: list(int), ids of pages that must not be shown
    :return: SiteViewModel
    :raises SiteEmptyError: when no enabled, non-ignored page is left
    """
    # Work on a copy so that extending does not mutate the list object
    # returned by Page.get_new_history.
    pages = list(Page.get_new_history(site.id, _limit=_limit))
    if extend_page:
        pages += extend_page

    # Pages whose publication lies in the future are not shown.
    now = datetime.datetime.now(pytz.utc)
    pages = [page for page in pages if page.is_enable(now=now)]

    # De-duplicate by id and drop ignored ids (a falsy ignore_ids means
    # "ignore nothing").
    ignore_ids = ignore_ids or ()
    pages_repository = {
        page.id: page
        for page in pages if page.id not in ignore_ids
    }
    pages = list(pages_repository.values())

    # No page at all.
    if not pages:
        raise SiteEmptyError

    # Ten pages or fewer: pick randomly instead of ranking.
    if len(pages) <= 10:
        pr_pages = get_pr_page(site)
        return SiteViewModel(site=site,
                             contents=random.choice(pages),
                             panels=[random.choice(pages) for _ in range(6)],
                             page_list=pages + pr_pages)

    # From the 10 newest pages take the main content: ordered by view count,
    # or by start_at when every one of them has a start_at.
    new_list = sorted(pages, key=lambda x: x.id, reverse=True)[:10]
    new_list = sorted(new_list, key=lambda x: x.view_count, reverse=True)
    if all(page.start_at for page in new_list):
        new_list = sorted(new_list, key=lambda x: x.start_at, reverse=True)
    contents = new_list[0]
    new_list = new_list[1:]
    pages_repository.pop(contents.id)

    # Take 3 panel pages at random from the remaining 9 newest pages.
    random.shuffle(new_list)
    panels = new_list[1:4]
    for panel_page in panels:
        panel_page.set_favorite(False)
        pages_repository.pop(panel_page.id)

    # From the remaining pages take the 3 with the most views as favorites.
    # The list is sorted most-viewed first, so they are at the front
    # (popping from the end would pick the least viewed ones).
    left_pages = sorted(pages_repository.values(),
                        key=lambda x: x.view_count,
                        reverse=True)
    favorites, left_pages = left_pages[:3], left_pages[3:]
    for panel_page in favorites:
        panel_page.set_favorite(True)
        panels.append(panel_page)
    random.shuffle(panels)

    # Grade the remaining pages by view count.
    for page in left_pages[0:3]:  # 3 pages
        page.set_view_level(PageViewLevel.SUPERNOVA)

    for page in left_pages[3:5]:  # 2 pages
        page.set_view_level(PageViewLevel.HOT)

    for page in left_pages[5:7]:  # 2 pages
        page.set_view_level(PageViewLevel.WARM)

    # Order the rest by id, newest first.
    left_pages = sorted(left_pages, key=lambda x: x.id, reverse=True)

    # Append the PR pages meant for crawlers.
    left_pages = left_pages[:20]
    left_pages += get_pr_page(site)
    return SiteViewModel(site=site,
                         contents=contents,
                         panels=panels,
                         page_list=left_pages)
Пример #19
0
 def page(self):
     """Fetch the page referenced by ``self.page_id`` through ``Page.get``."""
     referenced = Page.get(self.page_id)
     return referenced