Example No. 1
def import_python_weekly(issue_url, **kwargs):
    resource = Resource.objects.get(title='PythonWeekly')

    page = html.parse(issue_url)

    # a = requests.get(url).content
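    # digest entries are the 14px <span> elements inside the 'bodyTable'
    # container; the relative './/' keeps the XPath scoped to that container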
    blocks = page.getroot().find_class('bodyTable')[0].xpath(
        './/span[@style="font-size:14px"]')

    for x in blocks:
        link = x.cssselect('a')[0]
        url = link.attrib['href']
        title = link.text
        _text = x.getnext()
        if _text is None:
            continue
        text = etree.tostring(_text).decode('utf-8').replace('<br/>', '').strip()

        item_data = {
            'title': title,
            'link': url,
            'raw_content': text,
            'http_code': 200,
            'content': text,
            'description': text,
            'resource': resource,
            'language': 'en',
        }
        item_data.update(
            apply_parsing_rules(item_data, **kwargs)
            if kwargs.get('query_rules') else {})
        item_data = apply_video_rules(item_data.copy())
        save_item(item_data)
Example No. 2
def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res='rss',
                                                 in_edit=False):
        try:
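            # lazy pipeline over the feed: filter entries by age and by whether
            # they already exist, then fetch the data for each remaining item
            # (nothing runs until the list() call below consumes it)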
            rss_items = map(
                get_data_for_rss_item,
                filter(is_not_exists_rss_item,
                       filter(_is_old_rss_news, get_items_from_rss(src.link))))

            # parse weekly digests
            digests_items = list(rss_items)
            list(
                map(parse_weekly_digest, filter(is_weekly_digest,
                                                digests_items)))

            list(
                map(parse_django_weekly_digest,
                    filter(is_django_weekly_digest, digests_items)))

            resource = src.resource
            language = src.language
            for i, rss_item in enumerate(digests_items):
                rss_item.update({
                    'resource': resource,
                    'language': language,
                })
                rss_item.update(
                    apply_parsing_rules(rss_item, **kwargs)
                    if kwargs.get('query_rules') else {})
                rss_item.update(apply_video_rules(rss_item.copy()))
                save_item(rss_item)
        except (URLError, TooManyRedirects, socket.timeout):
            print(src)
Example No. 3
def import_tweets(**kwargs):
    for i in get_tweets():
        try:
            # avoids re-parsing a link that is already stored
            if Item.objects.filter(link=i[1]).exists():
                continue

            # title = '[!] %s' % i[0] if fresh_google_check(i[1]) else i[0]
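            # each tweet is a tuple: (title, link, resource, http_code)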
            title = i[0]
            item_data = {"title": title, "link": i[1], "http_code": i[3], "resource": i[2]}
            if is_weekly_digest(item_data):
                parse_weekly_digest(item_data)
            else:
                data = apply_parsing_rules(item_data, **kwargs) if kwargs.get("query_rules") else {}
                item_data.update(data)
            save_item(item_data)
        except (URLError, TooManyRedirects, socket.timeout):
            print(i)
Example No. 4
def import_tweets(**kwargs):
    for i in get_tweets():
        # avoids re-parsing a link that is already stored
        if Item.objects.filter(link=i[1]).exists():
            continue

        # title = u'[!] %s' % i[0] if fresh_google_check(i[1]) else i[0]
        title = i[0]
        item_data = {
            'title': title,
            'link': i[1],
            'http_code': i[3],
            'resource': i[2]
        }
        data = (apply_parsing_rules(item_data, **kwargs)
                if kwargs.get('query_rules') else {})
        item_data.update(data)
        save_item(item_data)
Example No. 5
def import_tweets(**kwargs):
    for i in get_tweets():
        # avoids re-parsing a link that is already stored
        if Item.objects.filter(link=i[1]).exists():
            continue

        # title = u'[!] %s' % i[0] if fresh_google_check(i[1]) else i[0]
        title = i[0]
        item_data = {
            'title': title,
            'link': i[1],
            'http_code': i[3],
            'resource': i[2]
        }
        data = apply_parsing_rules(
            item_data, **kwargs) if kwargs.get('query_rules') else {}
        item_data.update(data)
        save_item(item_data)
Example No. 6
def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res="rss", in_edit=False):
        try:
            rss_items = map(
                get_data_for_rss_item,
                filter(is_not_exists_rss_item, filter(_is_old_rss_news, get_items_from_rss(src.link))),
            )

            # parse weekly digests
            digests_items = list(rss_items)
            list(map(parse_weekly_digest, filter(is_weekly_digest, digests_items)))

            resource = src.resource
            language = src.language
            for i, rss_item in enumerate(digests_items):
                rss_item.update({"resource": resource, "language": language})
                rss_item.update(apply_parsing_rules(rss_item, **kwargs) if kwargs.get("query_rules") else {})
                rss_item.update(apply_video_rules(rss_item.copy()))
                save_item(rss_item)
        except (URLError, TooManyRedirects, socket.timeout):
            print(src)
Example No. 7
def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res='rss',
                                                 in_edit=False):

        rssnews = feedparser.parse(src.link)
        today = datetime.date.today()
        week_before = today - datetime.timedelta(weeks=1)
        for n in rssnews.entries:
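            # skip entries whose link is already stored; the [0:1] slice keeps
            # the existence query cheap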
            ct = len(Item.objects.filter(link=n.link)[0:1])
            if ct:
                continue

            time_struct = getattr(n, 'published_parsed', None)
            if time_struct:
                _timestamp = mktime(time_struct)
                dt = datetime.datetime.fromtimestamp(_timestamp)
                if dt.date() < week_before:
                    continue

            title = n.title
            # title = u'[!] %s' % n.title if fresh_google_check(
            #    n.title) else n.title

            http_code, content, raw_content = _get_http_data_of_url(n.link)

            item_data = {
                'title': title,
                'link': n.link,
                'raw_content': raw_content,
                'http_code': http_code,
                'content': content,
                'description': re.sub('<.*?>', '', n.summary),
                'resource': src.resource,
                'language': src.language,
            }
            item_data.update(
                apply_parsing_rules(item_data, **kwargs)
                if kwargs.get('query_rules') else {})
            item_data = apply_video_rules(item_data.copy())
            save_item(item_data)
Example No. 8
def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res='rss',
                                                 in_edit=False):

        rssnews = feedparser.parse(src.link)
        today = datetime.date.today()
        week_before = today - datetime.timedelta(weeks=1)
        for n in rssnews.entries:
            ct = len(Item.objects.filter(link=n.link)[0:1])
            if ct:
                continue

            time_struct = getattr(n, 'published_parsed', None)
            if time_struct:
                _timestamp = mktime(time_struct)
                dt = datetime.datetime.fromtimestamp(_timestamp)
                if dt.date() < week_before:
                    continue

            title = n.title
            # title = u'[!] %s' % n.title if fresh_google_check(
            #    n.title) else n.title

            http_code, content, raw_content = _get_http_data_of_url(n.link)

            item_data = {
                'title': title,
                'link': n.link,
                'raw_content': raw_content,
                'http_code': http_code,
                'content': content,
                'description': re.sub('<.*?>', '', n.summary),
                'resource': src.resource,
                'language': src.language,
            }
            item_data.update(
                apply_parsing_rules(item_data, **kwargs)
                if kwargs.get('query_rules') else {})
            item_data = apply_video_rules(item_data.copy())
            save_item(item_data)
Example No. 9
def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res='rss',
                                                 in_edit=False):
        rss_items = map(get_data_for_rss_item,
                        filter(is_not_exists_rss_item,
                               filter(_is_old_rss_news,
                                      get_items_from_rss(src.link))))

        # parse weekly digests
        digests_items = list(rss_items)
        list(map(parse_weekly_digest, filter(is_weekly_digest, digests_items)))

        resource = src.resource
        language = src.language
        for i, rss_item in enumerate(digests_items):
            rss_item.update({
                'resource': resource,
                'language': language,
            })
            rss_item.update(apply_parsing_rules(rss_item, **kwargs) if kwargs.get('query_rules') else {})
            rss_item.update(apply_video_rules(rss_item.copy()))
            save_item(rss_item)
Example No. 10
def import_tweets(**kwargs):
    for i in get_tweets():
        try:
            # avoids re-parsing a link that is already stored
            if Item.objects.filter(link=i[1]).exists():
                continue

            # title = '[!] %s' % i[0] if fresh_google_check(i[1]) else i[0]
            title = i[0]
            item_data = {
                'title': title,
                'link': i[1],
                'http_code': i[3],
                'resource': i[2]
            }
            if is_weekly_digest(item_data):
                parse_weekly_digest(item_data)
            else:
                data = apply_parsing_rules(
                    item_data, **kwargs) if kwargs.get('query_rules') else {}
                item_data.update(data)
            save_item(item_data)
        except (URLError, TooManyRedirects, socket.timeout):
            print(i)
Example No. 11
 def _apply_rules(item: dict) -> dict:
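     # closure helper: `rules` and `kwargs` are free variables taken from the
     # enclosing import function (inferred from this snippet)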
     item.update(
         apply_parsing_rules(item, **rules)
         if kwargs.get('query_rules') else {})
     item.update(apply_video_rules(item))
     return item
Example No. 12
 def _apply_rules(item: dict) -> dict:
     item.update(
         apply_parsing_rules(item, **rules)
         if kwargs.get('query_rules') else {})
     item.update(apply_video_rules(item))
     return item