def parse_rss():

    url = 'https://allmychanges.com/rss/03afbe621916b2f2145f111075db0759/'

    today = datetime.date.today()
    week_before = today - datetime.timedelta(weeks=1)
    try:
        packages = {
            x.get('name').strip(): x
            for x in list(Package.objects.all()
                          .values('name', 'description', 'link'))
        }
        _start_week, _end_week = get_start_end_of_week(today)
        issues = Issue.objects.filter(date_from=_start_week,
                                      date_to=_end_week)

        assert issues.count() <= 1, 'More than one Issue exists for this week'
        issue = issues.first()
        news = Item.objects.filter(issue=issue,
                                   status='active') if issue is not None else []

        section = Section.objects.get(title='Релизы')
        resource = Resource.objects.get(link='http://allmychanges.com/')
    except Exception as e:
        print(e)
        return

    saved_packages = []
    for n in feedparser.parse(url).entries:
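        # Entry titles are expected to look like "python/<package> <version>".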
        package_name, package_version = n.title.split()
        package_name = package_name.replace('python/', '')

        already_imported = Item.objects.filter(
            link=n.link, status='active').exists()
        if already_imported or 'python' not in n.title:
            saved_packages.append(package_name)
            continue

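        # Skip entries published more than a week before today.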
        time_struct = getattr(n, 'published_parsed', None)
        if time_struct:
            _timestamp = mktime(time_struct)
            dt = datetime.datetime.fromtimestamp(_timestamp)
            if dt.date() < week_before:
                continue

        try:
            if package_name not in packages or package_name in saved_packages:
                continue

            if news and check_previous_news_of_package(
                    news, packages.get(package_name)):
                off_other_release_news(news, packages.get(package_name))

            item_data = _generate_release_item(package_version,
                                               n.link, resource, section,
                                               packages.get(package_name))
            saved_packages.append(package_name)
            save_item(item_data)
        except Exception as e:
            print(e)
            continue

def import_python_weekly(issue_url, **kwargs):
    resource = Resource.objects.get(title='PythonWeekly')

    page = html.parse(issue_url)

    # a = requests.get(url).content
    blocks = page.getroot().find_class('bodyTable')[0].xpath('//span[@style="font-size:14px"]')

    for x in blocks:
        link = x.cssselect('a')[0]
        url = link.attrib['href']
        title = link.text
        _text = x.getnext()
        if _text is None:
            continue
        text = etree.tostring(_text).decode('utf-8').replace('<br/>', '').strip()

        item_data = {
            'title': title,
            'link': url,
            'raw_content': text,
            'http_code': 200,
            'content': text,
            'description': text,
            'resource': resource,
            'language': 'en',
        }
        item_data.update(
            apply_parsing_rules(item_data, **kwargs)
            if kwargs.get('query_rules') else {})
        item_data = apply_video_rules(item_data.copy())
        save_item(item_data)
Example #3
def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res='rss',
                                                 in_edit=False):
        try:
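            # Filter feed entries (by age and by whether they already exist),
            # then fetch item data for the remaining ones.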
            rss_items = map(
                get_data_for_rss_item,
                filter(is_not_exists_rss_item,
                       filter(_is_old_rss_news, get_items_from_rss(src.link))))

            # parse weekly digests
            digests_items = list(rss_items)
            list(
                map(parse_weekly_digest, filter(is_weekly_digest,
                                                digests_items)))

            list(
                map(parse_django_weekly_digest,
                    filter(is_django_weekly_digest, digests_items)))

            resource = src.resource
            language = src.language
            for rss_item in digests_items:
                rss_item.update({
                    'resource': resource,
                    'language': language,
                })
                rss_item.update(
                    apply_parsing_rules(rss_item, **kwargs)
                    if kwargs.get('query_rules') else {})
                rss_item.update(apply_video_rules(rss_item.copy()))
                save_item(rss_item)
        except (URLError, TooManyRedirects, socket.timeout):
            print(src)
Example #4
def parse_rss():
    url = 'https://allmychanges.com/rss/03afbe621916b2f2145f111075db0759/'

    today = datetime.date.today()
    week_before = today - datetime.timedelta(weeks=1)
    try:
        packages = {
            x.get('name').strip(): x
            for x in list(Package.objects.all().values('name', 'description',
                                                       'link'))
        }
        _start_week, _end_week = get_start_end_of_week(today)
        issues = Issue.objects.filter(date_from=_start_week,
                                      date_to=_end_week)

        assert issues.count() <= 1, 'More than one Issue exists for this week'
        issue = issues.first()
        news = Item.objects.filter(issue=issue,
                                   status='active') if issue is not None else []

        section = Section.objects.get(title='Релизы')
        resource = Resource.objects.get(link='http://allmychanges.com/')
    except Exception as e:
        print(e)
        return

    saved_packages = []
    for n in feedparser.parse(url).entries:
        package_name, package_version = n.title.split()
        package_name = package_name.replace('python/', '')

        already_imported = Item.objects.filter(
            link=n.link, status='active').exists()
        if already_imported or 'python' not in n.title:
            saved_packages.append(package_name)
            continue

        time_struct = getattr(n, 'published_parsed', None)
        if time_struct:
            _timestamp = mktime(time_struct)
            dt = datetime.datetime.fromtimestamp(_timestamp)
            if dt.date() < week_before:
                continue

        try:
            if package_name not in packages or package_name in saved_packages:
                continue

            if news and check_previous_news_of_package(
                    news, packages.get(package_name)):
                off_other_release_news(news, packages.get(package_name))

            item_data = _generate_release_item(package_version, n.link,
                                               resource, section,
                                               packages.get(package_name))
            saved_packages.append(package_name)
            save_item(item_data)
        except Exception as e:
            print(e)
            continue
Example #5
def main():
    url = 'http://feed.exileed.com/vk/feed/pynsk'

    _section_title = 'Колонка автора'
    _res_title = 'Александр Сапронов (PyNSK)'

    resource = Resource.objects.filter(title=_res_title)
    assert resource.count() == 1, "Resource not found: %s" % _res_title
    resource = resource[0]

    section = Section.objects.filter(title=_section_title)
    assert section.count() == 1, "Section not found: %s" % _section_title
    section = section[0]

    r = re.compile(r"(htt(p|ps)://[^ ]+)")

    today = datetime.date.today()
    week_before = today - datetime.timedelta(weeks=1)
    rssnews = feedparser.parse(url)
    for n in reversed(rssnews.entries):
        if Item.objects.filter(link=n.link).exists():
            continue

        # print("Parse: %s" % n.link)
        title = None
        content = None

        time_struct = getattr(n, 'published_parsed', None)
        if time_struct:
            _timestamp = mktime(time_struct)
            dt = datetime.datetime.fromtimestamp(_timestamp)
            if dt.date() < week_before:
                continue

        text = n.summary
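        # NOTE: 'l' is assumed to be a module-level list of known post-title
        # prefixes; it is not defined in this snippet.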
        for x in l:
            if x in text and '<br><br>' in text.split(x)[1]:
                parts = text.split(x)[1].split('<br>')
                title = x + parts[0]
                content = ' </br>\n'.join(filter(None, parts[1:]))

                content = r.sub(r'<a href="\1">\1</a>', content)
                break

        if title is not None and content is not None:
            content_link = "<a href='%s' target='_blank'>[Продолжение]</a>" % n.link
            content = textwrap.shorten(content, width=300, placeholder="...%s" % content_link)\
                .replace('<a...', '...')
            _ = {
                'link': n.link,
                'description': content,
                'title': title,
                'resource': resource,
                'language': 'ru',
                'section': section,
                'status': 'active',
            }
            save_item(_)

def parse_rss():
    # TODO: hardcoded URL.
    # This is the moderator's personal feed; if possible, replace it with
    # a feed created specifically for pydigest.
    url = 'https://allmychanges.com/rss/05a5ec600331b03741bd08244afa11cb/'

    try:
        packages = {x.get('name'): x for x in
                    list(Package.objects.all()
                         .values('name', 'description', 'url'))}
        section = Section.objects.get(title=u'Релизы')
        resource = Resource.objects.get(link='http://allmychanges.com/')
    except Exception:
        return

    today = datetime.date.today()
    week_before = today - datetime.timedelta(weeks=1)
    saved_packages = []
    for n in feedparser.parse(url).entries:
        package_name, package_version = n.title.split()
        package_name = package_name.replace('python/', '')

        already_imported = Item.objects.filter(link=n.link).exists()
        if already_imported or 'python' not in n.title:
            saved_packages.append(package_name)
            continue

        time_struct = getattr(n, 'published_parsed', None)
        if time_struct:
            _timestamp = mktime(time_struct)
            dt = datetime.datetime.fromtimestamp(_timestamp)
            if dt.date() < week_before:
                continue

        try:
            if package_name not in packages or package_name in saved_packages:
                continue

            item_data = _generate_release_item(
                package_name,
                package_version,
                n.link,
                resource,
                section,
                packages.get(package_name)
            )
            saved_packages.append(package_name)
            save_item(item_data)
        except Exception:
            continue

def parse_rss():

    url = 'https://allmychanges.com/rss/03afbe621916b2f2145f111075db0759/'

    try:
        packages = {x.get('name').strip(): x for x in
                    list(Package.objects.all()
                         .values('name', 'description', 'url'))}
        section = Section.objects.get(title=u'Релизы')
        resource = Resource.objects.get(link='http://allmychanges.com/')
    except Exception:
        return

    today = datetime.date.today()
    week_before = today - datetime.timedelta(weeks=1)
    saved_packages = []
    for n in feedparser.parse(url).entries:
        package_name, package_version = n.title.split()
        package_name = package_name.replace('python/', '')

        already_imported = Item.objects.filter(link=n.link).exists()
        if already_imported or 'python' not in n.title:
            saved_packages.append(package_name)
            continue

        time_struct = getattr(n, 'published_parsed', None)
        if time_struct:
            _timestamp = mktime(time_struct)
            dt = datetime.datetime.fromtimestamp(_timestamp)
            if dt.date() < week_before:
                continue

        try:
            if package_name not in packages or package_name in saved_packages:
                continue

            item_data = _generate_release_item(
                package_name,
                package_version,
                n.link,
                resource,
                section,
                packages.get(package_name)
            )
            saved_packages.append(package_name)
            save_item(item_data)
        except Exception:
            continue

def parse_rss():

    url = 'https://allmychanges.com/rss/03afbe621916b2f2145f111075db0759/'

    try:
        packages = {
            x.get('name').strip(): x
            for x in list(Package.objects.all().values('name', 'description',
                                                       'url'))
        }
        section = Section.objects.get(title=u'Релизы')
        resource = Resource.objects.get(link='http://allmychanges.com/')
    except Exception:
        return

    today = datetime.date.today()
    week_before = today - datetime.timedelta(weeks=1)
    saved_packages = []
    for n in feedparser.parse(url).entries:
        package_name, package_version = n.title.split()
        package_name = package_name.replace('python/', '')

        already_imported = Item.objects.filter(link=n.link).exists()
        if already_imported or 'python' not in n.title:
            saved_packages.append(package_name)
            continue

        time_struct = getattr(n, 'published_parsed', None)
        if time_struct:
            _timestamp = mktime(time_struct)
            dt = datetime.datetime.fromtimestamp(_timestamp)
            if dt.date() < week_before:
                continue

        try:
            if package_name not in packages or package_name in saved_packages:
                continue

            item_data = _generate_release_item(package_name, package_version,
                                               n.link, resource, section,
                                               packages.get(package_name))
            saved_packages.append(package_name)
            save_item(item_data)
        except Exception:
            continue
Example #9
def main(url: str = "", number: int = 0) -> None:
    data = {
        'query_rules': ParsingRules.objects.filter(is_activated=True).all(),
        'query_sections': Section.objects.all(),
        'query_statuses': [x[0] for x in ITEM_STATUS_CHOICES],
    }
    _apply_rules = _apply_rules_wrap(**data)

    parser = ImportPythonParser()
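    # If no URL is given, derive it from the issue number, or fall back to
    # the latest issue.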
    if number and not url:
        url = parser.get_issue_url(number)
    if not number and not url:
        url = parser.get_latest_issue_url()
    blocks = parser.get_blocks(url)
    with_rules_applied = map(_apply_rules, blocks)
    for block in with_rules_applied:
        save_item(block)
Example #10
def import_tweets(**kwargs):
    for i in get_tweets():
        # Avoid re-parsing a link that has already been saved.
        if Item.objects.filter(link=i[1]).exists():
            continue

        # title = u'[!] %s' % i[0] if fresh_google_check(i[1]) else i[0]
        title = i[0]
        item_data = {
            'title': title,
            'link': i[1],
            'http_code': i[3],
            'resource': i[2]
        }
        data = apply_parsing_rules(
            item_data, **kwargs) if kwargs.get('query_rules') else {}
        item_data.update(data)
        save_item(item_data)
Example #11
def import_tweets(**kwargs):
    for i in get_tweets():
        try:
            # Avoid re-parsing a link that has already been saved.
            if Item.objects.filter(link=i[1]).exists():
                continue

            # title = '[!] %s' % i[0] if fresh_google_check(i[1]) else i[0]
            title = i[0]
            item_data = {"title": title, "link": i[1], "http_code": i[3], "resource": i[2]}
            if is_weekly_digest(item_data):
                parse_weekly_digest(item_data)
            else:
                data = apply_parsing_rules(item_data, **kwargs) if kwargs.get("query_rules") else {}
                item_data.update(data)
            save_item(item_data)
        except (URLError, TooManyRedirects, socket.timeout):
            print(i)
Example #12
def import_tweets(**kwargs):
    for i in get_tweets():
        # Avoid re-parsing a link that has already been saved.
        if Item.objects.filter(link=i[1]).exists():
            continue

        # title = u'[!] %s' % i[0] if fresh_google_check(i[1]) else i[0]
        title = i[0]
        item_data = {
            'title': title,
            'link': i[1],
            'http_code': i[3],
            'resource': i[2]
        }
        data = apply_parsing_rules(
            item_data, **kwargs) if kwargs.get('query_rules') else {}
        item_data.update(data)
        save_item(item_data)

def parse():
    base_url = 'https://twitter.com/NewReleaseNotes/'
    packages = list(Package.objects.all().values('name', 'description', 'url'))

    if packages:
        try:
            section = Section.objects.get(title=u'Релизы')
            resource = Resource.objects.get(link='http://allmychanges.com/')
        except Exception:
            return

        tweets_data = get_tweets_by_url(base_url)

        for text, link, http_code in tweets_data:
            for x in packages:
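                # A release tweet is expected to mention "python/<package name>".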
                package_slug = "python/%s" % x.get('name').lower()
                if 'python' in text and package_slug in text:
                    name = u"{} - {}".format(
                        x.get('name'),
                        text.split(' of')[0]
                    )
                    description = u"Вышла новая версия пакета {0} - {1}." \
                                  u" {2}." \
                                  u" Изменения описаны по ссылке <a href='{3}'>{3}</a>. " \
                                  u"Скачать можно по ссылке: <a href='{4}'>{4}</a>".format(
                        x.get('name'),
                        text.split(' of')[0],
                        x.get('description'),
                        link,
                        x.get('url')
                    )

                    save_item({
                        'title': name,
                        'link': link,
                        'resource': resource,
                        'status': 'active',
                        'section': section,
                        'language': 'en',
                        'description': description,
                    })
Example #14
def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res="rss", in_edit=False):
        try:
            rss_items = map(
                get_data_for_rss_item,
                filter(is_not_exists_rss_item, filter(_is_old_rss_news, get_items_from_rss(src.link))),
            )

            # parse weekly digests
            digests_items = list(rss_items)
            list(map(parse_weekly_digest, filter(is_weekly_digest, digests_items)))

            resource = src.resource
            language = src.language
            for rss_item in digests_items:
                rss_item.update({"resource": resource, "language": language})
                rss_item.update(apply_parsing_rules(rss_item, **kwargs) if kwargs.get("query_rules") else {})
                rss_item.update(apply_video_rules(rss_item.copy()))
                save_item(rss_item)
        except (URLError, TooManyRedirects, socket.timeout):
            print(src)
Example #15
def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res='rss',
                                                 in_edit=False):

        rssnews = feedparser.parse(src.link)
        today = datetime.date.today()
        week_before = today - datetime.timedelta(weeks=1)
        for n in rssnews.entries:
            if Item.objects.filter(link=n.link).exists():
                continue

            time_struct = getattr(n, 'published_parsed', None)
            if time_struct:
                _timestamp = mktime(time_struct)
                dt = datetime.datetime.fromtimestamp(_timestamp)
                if dt.date() < week_before:
                    continue

            title = n.title
            # title = u'[!] %s' % n.title if fresh_google_check(
            #    n.title) else n.title

            http_code, content, raw_content = _get_http_data_of_url(n.link)

            item_data = {
                'title': title,
                'link': n.link,
                'raw_content': raw_content,
                'http_code': http_code,
                'content': content,
                'description': re.sub('<.*?>', '', n.summary),
                'resource': src.resource,
                'language': src.language,
            }
            item_data.update(
                apply_parsing_rules(item_data, **kwargs)
                if kwargs.get('query_rules') else {})
            item_data = apply_video_rules(item_data.copy())
            save_item(item_data)
Example #16
def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res='rss',
                                                 in_edit=False):

        rssnews = feedparser.parse(src.link)
        today = datetime.date.today()
        week_before = today - datetime.timedelta(weeks=1)
        for n in rssnews.entries:
            if Item.objects.filter(link=n.link).exists():
                continue

            time_struct = getattr(n, 'published_parsed', None)
            if time_struct:
                _timestamp = mktime(time_struct)
                dt = datetime.datetime.fromtimestamp(_timestamp)
                if dt.date() < week_before:
                    continue

            title = n.title
            # title = u'[!] %s' % n.title if fresh_google_check(
            #    n.title) else n.title

            http_code, content, raw_content = _get_http_data_of_url(n.link)

            item_data = {
                'title': title,
                'link': n.link,
                'raw_content': raw_content,
                'http_code': http_code,
                'content': content,
                'description': re.sub('<.*?>', '', n.summary),
                'resource': src.resource,
                'language': src.language,
            }
            item_data.update(
                apply_parsing_rules(item_data, **kwargs)
                if kwargs.get('query_rules') else {})
            item_data = apply_video_rules(item_data.copy())
            save_item(item_data)
Example #17
def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res='rss',
                                                 in_edit=False):
        rss_items = map(get_data_for_rss_item,
                        filter(is_not_exists_rss_item,
                               filter(_is_old_rss_news,
                                      get_items_from_rss(src.link))))

        # parse weekly digests
        digests_items = list(rss_items)
        list(map(parse_weekly_digest, filter(is_weekly_digest, digests_items)))

        resource = src.resource
        language = src.language
        for rss_item in digests_items:
            rss_item.update({
                'resource': resource,
                'language': language,
            })
            rss_item.update(
                apply_parsing_rules(rss_item, **kwargs)
                if kwargs.get('query_rules') else {})
            rss_item.update(apply_video_rules(rss_item.copy()))
            save_item(rss_item)
Example #18
def import_tweets(**kwargs):
    for i in get_tweets():
        try:
            # Avoid re-parsing a link that has already been saved.
            if Item.objects.filter(link=i[1]).exists():
                continue

            # title = '[!] %s' % i[0] if fresh_google_check(i[1]) else i[0]
            title = i[0]
            item_data = {
                'title': title,
                'link': i[1],
                'http_code': i[3],
                'resource': i[2]
            }
            if is_weekly_digest(item_data):
                parse_weekly_digest(item_data)
            else:
                data = apply_parsing_rules(
                    item_data, **kwargs) if kwargs.get('query_rules') else {}
                item_data.update(data)
            save_item(item_data)
        except (URLError, TooManyRedirects, socket.timeout):
            print(i)