Example #1
async def process_season(season):
    text2 = get_text(
        'https://www.championat.com/biathlon/_biathlonworldcup/tournament/{}/players/'
        .format(season['cc_id']))
    if text2:
        soup = BeautifulSoup(text2, 'html.parser')
        nodes = soup.select('a[href]')
        for node in nodes:
            if '/biathlon/_biathlonworldcup/' in node.get(
                    'href') and '/players/' in node.get('href'):
                player = get_player(node)
                # upsert the racer keyed by Russian Wikipedia name,
                # recording the championat.com player id
                client.ibustats.racers.update_one(
                    {'wiki.ru': player['name']},
                    {'$set': {
                        'champ.cc_id': player['cc_id']
                    }},
                    upsert=True)
                # append the tournament only to racers that already exist
                client.ibustats.racers.update_one(
                    {'wiki.ru': player['name']},
                    {'$addToSet': {
                        'champ.tournaments': player['tournament']
                    }},
                    upsert=False)
    await asyncio.sleep(4 + random.randint(4, 12))  # randomized delay between requests
    text = get_text(
        'https://www.championat.com/biathlon/_biathlonworldcup/tournament/{}/teams/'
        .format(season['cc_id']))
    if text:
        soup = BeautifulSoup(text, 'html.parser')
        nodes = soup.select('a[href]')
        for node in nodes:
            if '/biathlon/_biathlonworldcup/' in node.get(
                    'href') and '/teams/' in node.get('href'):
                country = get_country(node)
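Every snippet on this page calls a shared get_text helper that the source never shows. A minimal sketch of what it might look like, assuming a blocking requests wrapper that returns the page body or None on failure (the timeout value is an assumption):

import requests

def get_text(url):
    # hypothetical helper: fetch a page and return its body, or None on any error
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        return None

Example #6 below appears to use a different variant that returns an (html, error) pair.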
Example #2
async def get_infobox(lang, title):
    print('--get_infobox--{}--{}'.format(lang, title))
    wikis = {'lang': lang, 'name': title, 'infobox': {}}
    text = get_text('https://{}.wikipedia.org/wiki/{}'.format(lang, title))
    if text:
        soup = BeautifulSoup(text, 'html.parser')
        nodes = soup.select(
            'li.interlanguage-link a.interlanguage-link-target')
        for _ in nodes:
            pass  # placeholder loop: interlanguage links are located but not processed yet
    return wikis
Example #3
async def get_links(lang, title):
    text = get_text('https://{}.wikipedia.org/wiki/{}'.format(lang, title))
    if text is None:
        return {'title': title, 'links': []}
    category = {'title': title, 'links': []}
    if text:
        soup = BeautifulSoup(text, 'html.parser')
        cat_nodes = soup.select('div#mw-content-text a[title]')
        if cat_nodes:
            for cat_node in cat_nodes:
                category['links'].append(cat_node.get('title'))
    return category
Example #4
async def get_ci(lang, title):
    print('--get_ci--{}--{}'.format(lang, title))
    text = get_text('https://{}.wikipedia.org/wiki/{}'.format(lang, title))
    if text is None:
        return {'name': title}
    info = {'name': title}
    if text:
        soup = BeautifulSoup(text, 'html.parser')
        info['flag'] = get_flag_info(soup)
        info['emblem'] = get_emblem_info(soup)
    print('INFO\tget_ci({}, {})\r\n\t{}'.format(lang, title, info))
    return info
Example #5
async def get_pi(lang, title):
    print('--get_pi--{}--{}'.format(lang, title))
    text = get_text(
        'https://{}.wikipedia.org/w/index.php?title={}&action=info'.format(
            lang, title))
    if text is None:
        return {'name': title}
    category = {'name': title}
    if text:
        soup = BeautifulSoup(text, 'html.parser')
        category['pvi_month'] = get_pvi_month(soup)
        category['lasttime'] = get_lasttime(soup)
    return category
Example #6
def get_articles(url):
    base_url = "{0.scheme}://{0.netloc}".format(urlsplit(url))
    # this variant of get_text returns an (html, error) pair; assuming html is
    # raw markup, it must be parsed before CSS selectors can be applied to it
    html, error = get_text(url)
    urls = set()
    if error or not html:
        return urls
    soup = BeautifulSoup(html, 'html.parser')
    for pol_node in soup.select('a[href]'):
        ref = pol_node.get('href')
        if '#comments' in ref:
            continue  # skip comment anchors
        if '/2021/' in ref or '/2020/' in ref:
            if 'https://' in ref:
                urls.add(ref)
            else:
                urls.add('{}{}'.format(base_url, ref))
    return urls
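A hypothetical call site (the URL is illustrative); note that urlsplit comes from urllib.parse, which the snippet assumes is already imported:

from urllib.parse import urlsplit

article_urls = get_articles('https://news.example.com/politics/')
print(len(article_urls), 'articles found')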
Example #7
async def get_interwikis(lang, title):
    print('--get_interwikis--{}--{}'.format(lang, title))
    wikis = {'lang': lang, 'name': title, 'interwikis': {}}
    text = get_text('https://{}.wikipedia.org/wiki/{}'.format(lang, title))
    if text:
        soup = BeautifulSoup(text, 'html.parser')
        nodes = soup.select(
            'li.interlanguage-link a.interlanguage-link-target')
        for node in nodes:
            lang_title = node.get('title')
            # interwiki titles look like 'Article title – language name';
            # strip everything after the last dash to keep only the title
            if '–' in lang_title:
                lang_title = lang_title[:lang_title.rfind('–')].strip()
            elif '—' in lang_title:
                lang_title = lang_title[:lang_title.rfind('—')].strip()
            wikis['interwikis'][node.get('lang')] = lang_title
            print('\t--interwiki--{}--{}'.format(node.get('lang'), lang_title))
    return wikis
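These scrapers are coroutines, so a caller has to drive them through an event loop. A hypothetical invocation (the article title is illustrative):

import asyncio

wikis = asyncio.run(get_interwikis('ru', 'Биатлон'))
print(wikis['interwikis'].get('en'))  # the English title, if an interwiki link exists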
Example #8
async def get_info(lang, title):
    print('INFO\tget_info({}, {})'.format(lang, title))
    text = get_text('https://{}.wikipedia.org/wiki/{}'.format(lang, title))
    if text is None:
        return {'name': title, 'countries': []}
    category = {'name': title, 'countries': []}
    if text:
        soup = BeautifulSoup(text, 'html.parser')
        category['countries'] = get_country_info(soup)
        category['image'] = get_image_info(soup)
        category['desc'] = get_desc(soup)
        category['name'] = get_name_info(soup)
        if category['name'] is None:
            category['name'] = title
        category['bday'] = get_bday_info(soup)
    print('INFO\tget_info({}, {})\r\n\t{}'.format(lang, title, category))
    return category
Example #9
async def _get_pages(url):
    text = get_text(url)
    await asyncio.sleep(4)
    if text is None:
        return {'pages': [], 'next': None}
    category = {'pages': [], 'next': None}
    if text:
        soup = BeautifulSoup(text, 'html.parser')
        next_nodes = soup.select('div#mw-pages a[title]')
        if next_nodes:
            for next_node in next_nodes:
                if 'Следующая страница' in next_node.text:  # "Next page" link
                    category['next'] = next_node.get('href')
        cat_nodes = soup.select('div#mw-pages div.mw-category li a[title]')
        if cat_nodes:
            for cat_node in cat_nodes:
                category['pages'].append(cat_node.get('title'))
    return category
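_get_pages returns a single listing page plus the href of the "Next page" link, so walking an entire category is left to the caller. A minimal driver sketch, assuming the relative href only needs the wiki host prefixed (the default base URL is an assumption):

async def get_all_pages(start_url, base='https://ru.wikipedia.org'):
    # follow the 'next' href until the category runs out of listing pages
    pages = []
    url = start_url
    while url:
        chunk = await _get_pages(url)
        pages.extend(chunk['pages'])
        url = base + chunk['next'] if chunk['next'] else None
    return pages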
Example #10
async def process_player(player):
    # clear any previously collected photos before re-scraping
    client.ibustats.racers.update_one(
        {'champ.cc_id': player['champ']['cc_id']}, {'$set': {
            'images': []
        }},
        upsert=False)
    text2 = get_text(
        'https://www.championat.com/biathlon/_biathlonworldcup/tournament/{}/players/{}/'
        .format(player['champ']['tournaments'][0], player['champ']['cc_id']))
    if text2:
        soup = BeautifulSoup(text2, 'html.parser')
        images = soup.select(
            'div._player div.entity-header__info div.entity-header__img img')
        for image in images:
            update_image(player, image)
        nodes = soup.select('div._player.entity-header > div > ul > li')
        for node in nodes:
            if 'Команда:' in node.text:  # "Team:" row
                update_team(player, node)
            if 'Дата рождения:' in node.text:  # "Date of birth:" row
                update_bday(player, node)
    await asyncio.sleep(16 + random.randint(8, 16))
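update_image, update_team and update_bday are not shown in the source. A plausible sketch of update_image, assuming it stores the photo URL with $addToSet so the 'images' list reset at the top stays deduplicated:

def update_image(player, image):
    # hypothetical helper: record the player's photo URL without duplicates
    src = image.get('src')
    if src:
        client.ibustats.racers.update_one(
            {'champ.cc_id': player['champ']['cc_id']},
            {'$addToSet': {'images': src}})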
Example #11
async def get_externals(lang, title):
    print('--get_externals--{}--{}'.format(lang, title))
    wikis = {'lang': lang, 'name': title, 'externals': []}
    text = get_text('https://{}.wikipedia.org/wiki/{}'.format(lang, title))
    if text:
        soup = BeautifulSoup(text, 'html.parser')
        nodes = soup.select('a.external.text')
        # collect official links whose anchor text matches known labels;
        # 'Твиттер' and 'ВКонтакте' are the Russian link texts for Twitter and VK
        known_labels = {'Facebook', 'Instagram', 'Твиттер', 'ВКонтакте',
                        'biathlon.com.ua', 'IBU'}
        for node in nodes:
            if node.text in known_labels:
                wikis['externals'].append(node.get('href'))
            print('\t--externals--{}'.format(wikis['externals']))
    await asyncio.sleep(10 + random.randint(4, 8))
    return wikis
Example #12
    async def get(self, request):
        # Starlette-style endpoint; 'feeds' is a MongoDB collection defined elsewhere
        feed_id = request.path_params['feed_id']
        feed = feeds.find_one({"_id": ObjectId(feed_id)})
        text = get_text(feed['link'])
        return XmlResponse(text)
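XmlResponse is not a stock Starlette class, so this endpoint presumably defines its own. Starlette's documented pattern for a custom response type is a one-line subclass (a sketch, not necessarily the author's code):

from starlette.responses import Response

class XmlResponse(Response):
    media_type = 'application/xml'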
Example #13
threads4 = threads.find({}).sort([("thread_id", 1)]).limit(32)
for thread in threads4:
    thread_url = 'https://talks.by/showthread.php?t={}'.format(
        thread['thread_id'])
    urls.add(thread_url)


def parse_user(node):
    return None  # stub: user parsing is not implemented in this snippet


while len(urls) > 0:
    url = urls.pop()
    print(url)
    text = get_text(url)
    if text:
        soup = BeautifulSoup(text, 'html.parser')
        ref_nodes = soup.select('a[href]')
        if ref_nodes:
            for ref_node in ref_nodes:
                if '>>' in ref_node.text:  # '>>' marks next-page links in the forum pager
                    urls.add('https://talks.by/{}'.format(
                        ref_node.get('href')))
        user_nodes = soup.select('div.row-user a.username')
        if user_nodes:
            for user_node in user_nodes:
                query = parse.urlsplit(user_node.get('href')).query
                params = parse.parse_qs(query)
                op_result = users.update_one(
                    {'u': params['u'][0]}, {'$set': {
Example #14
client = MongoClient()
news = client.news
users = news.users

urls = set()

turls = {
    'https://talks.by/forumdisplay.php?f=45&page={}&order=desc'.format(i)
    for i in range(64)
}

while turls:
    turl = turls.pop()
    print(turl)
    threads_page = get_text(turl)
    if threads_page:
        soup = BeautifulSoup(threads_page, 'html.parser')
        ref_nodes = soup.select('a[href]')
        if ref_nodes:
            for ref_node in ref_nodes:
                query = parse.urlsplit(ref_node.get('href')).query
                params = parse.parse_qs(query)
                if 't' in params:
                    urls.add('https://talks.by/showthread.php?t={}'.format(
                        params['t'][0]))
    print(len(urls))

while urls:
    url = urls.pop()
    print(url)