示例#1
0
def paper_parser(url):
    """docstring for content_parser"""
    page_content = requests.get(url).text
    paths = PAPER_PATH_RE.findall(page_content)
    paper_list = []
    for path in paths:
        if urlset.has_url('paper', path):
            pass
            #print path, 'old'    # @todo logs
        else:
            p = pq(PAPER_URL.format(path=path))
            paper_list.append(Paper(id=PAPER_ID_RE.search(path).group(1),
                                    path=path,
                                    title=p('h1').text() or 'null',
                                    author=(p('.author a').text() or '').split(),
                                    abstract=p('.abstrack').remove('strong').text() or '',
                                    keywords=(p('.keywords a').text() or '').split(),
                                    classification=p('#wxClass').attr.value or 'null',
                                    category=u'默认',
                                    update_time=time.strftime('%Y-%m-%dT%XZ', time.gmtime())))
            print path, 'new'    # @todo logs
    try:
        solr.add('paper', paper_list)
    except:
        'err adding paper'
示例#2
0
def paper_parser(url):
    """docstring for content_parser"""
    page_content = requests.get(url).text
    paths = PAPER_PATH_RE.findall(page_content)
    paper_list = []
    for path in paths:
        if urlset.has_url('paper', path):
            pass
            #print path, 'old'    # @todo logs
        else:
            p = pq(PAPER_URL.format(path=path))
            paper_list.append(
                Paper(id=PAPER_ID_RE.search(path).group(1),
                      path=path,
                      title=p('h1').text() or 'null',
                      author=(p('.author a').text() or '').split(),
                      abstract=p('.abstrack').remove('strong').text() or '',
                      keywords=(p('.keywords a').text() or '').split(),
                      classification=p('#wxClass').attr.value or 'null',
                      category=u'默认',
                      update_time=time.strftime('%Y-%m-%dT%XZ',
                                                time.gmtime())))
            print path, 'new'  # @todo logs
    try:
        solr.add('paper', paper_list)
    except:
        'err adding paper'
示例#3
0
def news_parser(url):
    """新闻解析器,从指定Url中获取新闻存入Solr"""
    links = common.get_links(url)
    news_list = []
    for link in reversed(links):
        if not urlset.has_url('news', link.url):
            try:
                news_list.append(News(url=link.url,
                                 title=link.title,
                                 content=extract(requests.get(link.url).text),
                                 category=u'默认',
                                 update_time=time.strftime('%Y-%m-%dT%XZ', time.gmtime())))
            except:
                print 'error adding', link.url
            print 'content', link.url
    try:
        solr.add('news', news_list)
    except:
        print 'error adding news'
示例#4
0
def patent_parser(search_exp):
    """@todo: Docstring for patent_parser.
    """
    patent_list = []
    b = Browser("phantomjs")
    b.reload()
    b.visit(
        'http://www.pss-system.gov.cn/sipopublicsearch/search/searchHome-searchIndex.shtml'
    )
    b.fill('searchInfo', search_exp)
    b.click_link_by_text(u'检索')
    b.is_element_not_present_by_css('.s_c_conter', wait_time=8)
    for _ in xrange(10):
        item_list = b.find_by_css('.s_c_conter')
        for item in item_list:
            info_list = item.find_by_tag('td')
            if not urlset.has_url('patent', info_list[0].text[6:]):
                try:
                    patent = Patent(
                        id=info_list[0].text[6:],
                        path='~',
                        title=info_list[4].text[6:],
                        abstract='~',
                        inventor=info_list[7].text[5:].split(';')[:-1],
                        applicant=info_list[6].text[10:].split(';')[:-1],
                        category=info_list[5].text[8:].split('; '),
                        update_time=time.strftime('%Y-%m-%dT%XZ',
                                                  time.gmtime()))
                    patent_list.append(patent)
                    print patent.id, 'new'  # @todo logs
                except:
                    print 'error patent'
        if b.is_text_present(u'下一页'):
            b.click_link_by_text(u'下一页')
            b.is_element_not_present_by_css('.s_c_conter', wait_time=8)
        else:
            break
    try:
        solr.add('patent', patent_list)
    except:
        'err adding patent'
    finally:
        b.quit()
示例#5
0
def news_parser(url):
    """新闻解析器,从指定Url中获取新闻存入Solr"""
    links = common.get_links(url)
    news_list = []
    for link in reversed(links):
        if not urlset.has_url('news', link.url):
            try:
                news_list.append(
                    News(url=link.url,
                         title=link.title,
                         content=extract(requests.get(link.url).text),
                         category=u'默认',
                         update_time=time.strftime('%Y-%m-%dT%XZ',
                                                   time.gmtime())))
            except:
                print 'error adding', link.url
            print 'content', link.url
    try:
        solr.add('news', news_list)
    except:
        print 'error adding news'
示例#6
0
def patent_parser(search_exp):
    """@todo: Docstring for patent_parser.
    """
    patent_list = []
    b = Browser("phantomjs")
    b.reload()
    b.visit('http://www.pss-system.gov.cn/sipopublicsearch/search/searchHome-searchIndex.shtml')
    b.fill('searchInfo', search_exp)
    b.click_link_by_text(u'检索')
    b.is_element_not_present_by_css('.s_c_conter', wait_time=8)
    for _ in xrange(10):
        item_list = b.find_by_css('.s_c_conter')
        for item in item_list:
            info_list = item.find_by_tag('td')
            if not urlset.has_url('patent', info_list[0].text[6:]):
                try:
                    patent = Patent(id=info_list[0].text[6:],
                                    path='~',
                                    title=info_list[4].text[6:],
                                    abstract='~',
                                    inventor=info_list[7].text[5:].split(';')[:-1],
                                    applicant=info_list[6].text[10:].split(';')[:-1],
                                    category=info_list[5].text[8:].split('; '),
                                    update_time=time.strftime('%Y-%m-%dT%XZ', time.gmtime()))
                    patent_list.append(patent)
                    print patent.id, 'new'    # @todo logs
                except:
                    print 'error patent'
        if b.is_text_present(u'下一页'):
            b.click_link_by_text(u'下一页')
            b.is_element_not_present_by_css('.s_c_conter', wait_time=8)
        else:
            break
    try:
        solr.add('patent', patent_list)
    except:
        'err adding patent'
    finally:
        b.quit()