# Example #1
def get_weapons():
    """Scrape weapon names and categories from the huanqiu aircraft list.

    Walks pages 0..111 of the aircraft weapon list, parses each page's
    ``picList`` items, and collects one ';;;;;'-separated record per
    weapon: the name, the literal tag ``category``, and the category
    text ('-1' when the item has no category span).

    Returns:
        list[str]: one joined record per weapon found.
    """
    base_url = "http://weapon.huanqiu.com/weaponlist/aircraft/list_0_0_0_0_"
    separator = ';;;;;'
    records = []
    for page_no in range(112):
        page_html = html_paser(base_url + str(page_no), 'utf-8')
        if page_html == 0:
            # html_paser signals failure with 0; abort the whole run
            print('页面出错')
            exit()
        page_soup = bs(page_html, 'html.parser')
        pic_list = page_soup.find(attrs={"class": 'picList'})
        if pic_list is None:
            continue
        for entry in pic_list.findAll('li'):
            name_node = entry.find(attrs={"class": 'name'})
            if name_node is None:
                continue
            category_node = entry.find(attrs={"class": 'category'})
            category_text = '-1' if category_node is None else category_node.text
            records.append(separator.join([name_node.text, 'category', category_text]))
    return records
# Example #2
def crawler(concept):
    """Crawl one zh.wikipedia category page and record its members.

    Fetches ``https://zh.wikipedia.org/wiki/Category:<concept>`` and
    - adds every article link under ``#mw-pages`` to ``page_set[concept]``
      and to the module-level ``entity_set``;
    - adds every subcategory link under ``#mw-subcategories`` to
      ``subcat_set[concept]``; subcategories not yet in ``con_set`` are
      also queued in ``cur_level_entity`` for the next crawl level.

    Args:
        concept: category title appended to the Category: URL.

    Returns:
        None; all results accumulate in module-level sets.
    """
    global cur_level_entity
    url_pre = "https://zh.wikipedia.org/wiki/Category:"
    url = url_pre + concept
    html = html_paser(url, 'utf-8')
    if html == 0:  # html_paser signals fetch failure with 0
        return

    soup = bs(html, 'html.parser')

    pages = soup.find(attrs={'id': 'mw-pages'})
    if pages is not None:
        for link in pages.findAll('a'):
            page_set[concept].add(link.text)
            # set.add is idempotent, so no membership pre-check is needed
            entity_set.add(link.text)

    subconcepts = soup.find(attrs={'id': 'mw-subcategories'})
    if subconcepts is not None:
        for link in subconcepts.findAll('a'):
            subcat_set[concept].add(link.text)
            if link.text not in con_set:
                # First time this subcategory is seen: schedule it for the
                # next crawl level and mark it as known.
                cur_level_entity.add(link.text)
                con_set.add(link.text)
# Example #3
def turn_page_thread(submission):
    """Fetch a Baidu Baike page and push its basic-info pairs to the queue.

    Each ``<dt class="basicInfo-item name">``/``<dd class="basicInfo-item
    value">`` pair found on the page is produced into ``entity_queue`` as
    a ``[submission, name, value]`` triple.

    Args:
        submission: item title appended to the Baike /item/ URL.
    """
    url_pre = "https://baike.baidu.com/item/"
    url = url_pre + submission
    html = html_paser(url, 'utf-8')
    if html == 0:  # html_paser signals fetch failure with 0
        return
    # FIX: raw string — '\s' in a plain string literal is an invalid
    # escape sequence (DeprecationWarning since Python 3.6). The pattern
    # text itself is unchanged.
    pattern = re.compile(
        r'<dt class="basicInfo-item name">\s*([^<]+)\s*</dt>\s*'
        r'<dd class="basicInfo-item value">\s*([^<]+)\s+</dd>'
    )
    # A loop over an empty match list is a no-op, so no explicit
    # empty-check is required.
    for name, value in pattern.findall(html):
        Producer.producer(entity_queue, [submission, name, value])
# Example #4
def turn_page_thread(submission):
    """Fetch a Baidu Baike page and push its internal /item/ links.

    Every ``<a target=_blank href="/item/...">`` anchor on the page is
    produced into ``entity_queue`` as a ``[submission, anchor_text,
    href]`` triple.

    Args:
        submission: item title appended to the Baike /item/ URL.
    """
    page = html_paser("https://baike.baidu.com/item/" + submission, 'utf-8')
    if page == 0:
        # html_paser signals fetch failure with 0
        return
    # Bail out unless the page has the expected content container.
    marker = re.compile('main-content')
    if re.search(marker, page) is None:
        return
    link_re = re.compile(u'<a target=_blank href="(/item/[^"]+)">([^<]+)</a>')
    matches = re.findall(link_re, str(page))
    if not matches:
        return
    for href, text in matches:
        Producer.producer(entity_queue, [submission, text, href])
# Example #5
def turn_page_thread(submission):
    """Fetch the article at *submission* and queue its title and body.

    Parses the page, extracts the ``main-title`` text and the
    concatenated text of every ``<p>`` inside the ``article`` box, and
    produces a ``(title, article)`` tuple into ``entity_queue``.
    Returns silently if any required element is missing.

    Args:
        submission: full URL of the article page.
    """
    page = html_paser(submission, 'utf-8')
    if page == 0:
        # html_paser signals fetch failure with 0
        return
    document = bs(page, 'html.parser')
    title_node = document.find(attrs={"class": 'main-title'})
    if title_node is None:
        return
    body_node = document.find(attrs={'class': 'article'})
    if body_node is None:
        return
    paragraphs = body_node.findAll('p')
    if not paragraphs:
        return
    body_text = ''.join(p.text for p in paragraphs)
    Producer.producer(entity_queue, (title_node.text, body_text))
# Example #6
def turn_page_thread(submission):
    """Fetch a Baidu Baike page and queue its lemma summary (abstract).

    Concatenates the text of every ``para``-classed element inside the
    ``lemma-summary`` box and produces a ``[submission, 'abstract',
    text]`` triple into ``entity_queue``. Returns silently on fetch
    failure or when the expected elements are missing.

    Args:
        submission: item title appended to the Baike /item/ URL.
    """
    url_pre = "https://baike.baidu.com/item/"
    url = url_pre + submission
    html = html_paser(url, 'utf-8')
    if html == 0:  # html_paser signals fetch failure with 0
        return
    pattern = re.compile('main-content')
    if re.search(pattern, html) is None:
        # Page lacks the expected content container.
        return
    soup = bs(html, 'html.parser')
    box = soup.find(attrs={'class': 'lemma-summary'})
    if box is None:
        return
    info = box.findAll(attrs={'class': 'para'})
    if len(info) == 0:
        return
    # FIX: the original accumulated into a local named `str`, shadowing
    # the builtin; renamed and switched to ''.join, which also avoids
    # quadratic += concatenation.
    abstract = ''.join(item.text for item in info)
    triple = [submission, 'abstract', abstract]
    Producer.producer(entity_queue, triple)