def test_keyword(self):
    solr = pysolr.Solr('http://localhost:8983/solr/unittest')
    solr.add([
        {"id": "Belfast", "description": "This is about Belfast"},
        {"id": "Lisburn", "description": "This is about Lisburn"},
        {"id": "Titanic", "description": "This is about Titanic"},
    ])
    query_generic = solr.search("about")
    query_titanic = solr.search("Titanic")
    generic_result_list = []
    titanic_result_list = []
    generic_test = ['Belfast', 'Lisburn', 'Titanic']
    titanic_test = ['Titanic']
    for result in query_titanic:
        titanic_result_list.append(result['id'])
    for result in query_generic:
        generic_result_list.append(result['id'])
    self.assertEqual(generic_result_list, generic_test)
    self.assertEqual(titanic_result_list, titanic_test)
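# Not in the original: a minimal teardown sketch for the test above, assuming
# the same 'unittest' core, so repeated runs start from an empty index.
# pysolr's delete() supports delete-by-query and commits by default.
def tearDown(self):
    solr = pysolr.Solr('http://localhost:8983/solr/unittest')
    solr.delete(q='*:*')  # wipe every document added by the test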
def paper_parser(url):
    """Paper parser: fetch the paper pages linked from ``url`` and index new ones into Solr."""
    page_content = requests.get(url).text
    paths = PAPER_PATH_RE.findall(page_content)
    paper_list = []
    for path in paths:
        if urlset.has_url('paper', path):
            pass  # print path, 'old'  # @todo logs
        else:
            p = pq(PAPER_URL.format(path=path))
            paper_list.append(Paper(
                id=PAPER_ID_RE.search(path).group(1),
                path=path,
                title=p('h1').text() or 'null',
                author=(p('.author a').text() or '').split(),
                abstract=p('.abstrack').remove('strong').text() or '',
                keywords=(p('.keywords a').text() or '').split(),
                classification=p('#wxClass').attr.value or 'null',
                category=u'默认',  # u'默认' = "default"
                update_time=time.strftime('%Y-%m-%dT%XZ', time.gmtime())))
            print path, 'new'  # @todo logs
    try:
        solr.add('paper', paper_list)
    except:
        print 'err adding paper'
def test_solr_return_100(self):
    solr = pysolr.Solr('http://localhost:8983/solr/keyword')
    root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    path_to_files = root + '/python/data/image_json.json'
    with open(path_to_files) as data_file:
        data = json.load(data_file)
    solr.add(data)
    results = solr.search("Belfast", **{
        'hl': 'true',
        'hl.fragsize': 100,
        'rows': 100,
    })
    self.assertEqual(len(results), 100)
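# Not in the original: a short sketch of how the highlighted fragments requested
# by the hl parameters in the test above could be read back; pysolr exposes them
# on Results.highlighting, keyed by document id. Assumes the same 'keyword' core.
solr = pysolr.Solr('http://localhost:8983/solr/keyword')
results = solr.search("Belfast", **{'hl': 'true', 'hl.fragsize': 100, 'rows': 100})
for doc in results:
    snippets = results.highlighting.get(doc['id'], {})
    print doc['id'], snippets  # each value maps field name -> list of fragments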
def news_parser(url):
    """News parser: fetch the links found at ``url`` and index the new articles into Solr."""
    links = common.get_links(url)
    news_list = []
    for link in reversed(links):
        if not urlset.has_url('news', link.url):
            try:
                news_list.append(News(
                    url=link.url,
                    title=link.title,
                    content=extract(requests.get(link.url).text),
                    category=u'默认',  # u'默认' = "default"
                    update_time=time.strftime('%Y-%m-%dT%XZ', time.gmtime())))
            except:
                print 'error adding', link.url
            print 'content', link.url
    try:
        solr.add('news', news_list)
    except:
        print 'error adding news'
def patent_parser(search_exp):
    """Patent parser: search the SIPO public search site for ``search_exp`` and index the results into Solr."""
    patent_list = []
    b = Browser("phantomjs")
    b.reload()
    b.visit('http://www.pss-system.gov.cn/sipopublicsearch/search/searchHome-searchIndex.shtml')
    b.fill('searchInfo', search_exp)
    b.click_link_by_text(u'检索')  # u'检索' = "search"
    b.is_element_not_present_by_css('.s_c_conter', wait_time=8)
    for _ in xrange(10):  # walk at most 10 result pages
        item_list = b.find_by_css('.s_c_conter')
        for item in item_list:
            info_list = item.find_by_tag('td')
            if not urlset.has_url('patent', info_list[0].text[6:]):
                try:
                    patent = Patent(
                        id=info_list[0].text[6:],
                        path='~',
                        title=info_list[4].text[6:],
                        abstract='~',
                        inventor=info_list[7].text[5:].split(';')[:-1],
                        applicant=info_list[6].text[10:].split(';')[:-1],
                        category=info_list[5].text[8:].split('; '),
                        update_time=time.strftime('%Y-%m-%dT%XZ', time.gmtime()))
                    patent_list.append(patent)
                    print patent.id, 'new'  # @todo logs
                except:
                    print 'error patent'
        if b.is_text_present(u'下一页'):  # u'下一页' = "next page"
            b.click_link_by_text(u'下一页')
            b.is_element_not_present_by_css('.s_c_conter', wait_time=8)
        else:
            break
    try:
        solr.add('patent', patent_list)
    except:
        print 'err adding patent'
    finally:
        b.quit()
import os
import re
import urllib
import zipfile

import solr

# grab the lcsh file if necessary
if not os.path.isfile("lcsh.nt.zip"):
    os.system("wget http://id.loc.gov/static/data/lcsh.nt.zip")

# connect to solr (the connection URL and credentials are redacted in the source)
solr = solr.SolrConnection("http://*****:*****")

# the lines naming the n-triples file inside the zip were lost in the source;
# these names are reconstructed from the zipfile import and files.open(nt_file) below
nt_file = "lcsh.nt"
files = zipfile.ZipFile("lcsh.nt.zip")

# match English skos:prefLabel triples; only the trailing @en.$ of this pattern
# survived in the source, the rest is a plausible reconstruction that captures
# (uri, label)
pref_label_pattern = r'<(.+)> <http://www.w3.org/2004/02/skos/core#prefLabel> "(.+)"@en.$'

# send the data off to solr
count = 0
for line in files.open(nt_file):
    match = re.match(pref_label_pattern, line)
    if match:
        count += 1
        uri, label = match.groups()
        solr.add(id=uri, label=label)
        print uri, label
        if count % 1000 == 0:
            solr.commit()
solr.commit()
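# Not in the original: a quick verification sketch, assuming the same solrpy
# connection as above, that queries one of the loaded labels back out once the
# final commit has run.
response = solr.query('label:Music')
for hit in response.results:
    print hit['id'], hit['label']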