Example #1
def test_keyword(self):
    solr = pysolr.Solr('http://localhost:8983/solr/unittest')
    solr.add([
        {"id": "Belfast", "description": "This is about Belfast"},
        {"id": "Lisburn", "description": "This is about Lisburn"},
        {"id": "Titanic", "description": "This is about Titanic"},
    ])

    query_generic = solr.search("about")
    query_titanic = solr.search("Titanic")
    generic_result_list = []
    titanic_result_list = []
    generic_test = ['Belfast', 'Lisburn', 'Titanic']
    titanic_test = ['Titanic']

    for result in query_titanic:
        titanic_result_list.append(result['id'])

    for result in query_generic:
        generic_result_list.append(result['id'])

    self.assertEqual(generic_result_list, generic_test)
    self.assertEqual(titanic_result_list, titanic_test)
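One caveat: depending on the pysolr version, add() does not necessarily hard-commit by default, so the search() calls above may run before the documents are visible. pysolr's add() accepts a commit keyword for exactly this:

    # Force a hard commit so the documents are immediately searchable.
    solr.add(docs, commit=True)  # docs = the list of dicts passed above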
Example #2
def paper_parser(url):
    """Parse the paper links found at *url* and add any new papers to Solr."""
    page_content = requests.get(url).text
    paths = PAPER_PATH_RE.findall(page_content)
    paper_list = []
    for path in paths:
        if urlset.has_url('paper', path):
            pass
            # print path, 'old'    # @todo logs
        else:
            p = pq(PAPER_URL.format(path=path))
            paper_list.append(
                Paper(id=PAPER_ID_RE.search(path).group(1),
                      path=path,
                      title=p('h1').text() or 'null',
                      author=(p('.author a').text() or '').split(),
                      abstract=p('.abstrack').remove('strong').text() or '',
                      keywords=(p('.keywords a').text() or '').split(),
                      classification=p('#wxClass').attr.value or 'null',
                      category=u'默认',  # u'默认' means "default"
                      update_time=time.strftime('%Y-%m-%dT%XZ',
                                                time.gmtime())))
            print path, 'new'  # @todo logs
    try:
        solr.add('paper', paper_list)
    except:
        print 'err adding paper'  # the bare string here did nothing; log it
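The module-level names here (PAPER_PATH_RE, PAPER_ID_RE, PAPER_URL, Paper, urlset, solr, pq from pyquery) come from the surrounding project and are not shown. A purely hypothetical sketch of what the regexes and URL template could look like, for orientation only:

    import re

    # Hypothetical values; the real project's definitions are not in the snippet.
    PAPER_PATH_RE = re.compile(r'href="(/paper/[^"]+)"')  # links scraped from the page
    PAPER_ID_RE = re.compile(r'/paper/(\d+)')             # numeric id inside the path
    PAPER_URL = 'http://example.com{path}'                # expanded with .format(path=path)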
Example #3
def test_solr_return_100(self):
    solr = pysolr.Solr('http://localhost:8983/solr/keyword')
    root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    path_to_files = root + '/python/data/image_json.json'

    with open(path_to_files) as data_file:
        data = json.load(data_file)

    solr.add(data)

    results = solr.search("Belfast", **{
        'hl': 'true',
        'hl.fragsize': 100,
        'rows': 100,
    })

    self.assertEqual(len(results), 100)
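The hl parameters ask Solr for highlighting, and pysolr surfaces that section of the response on the result object. A short sketch of how the fragments could be inspected (the document ids depend on image_json.json, so the key below is illustrative):

    # pysolr.Results exposes Solr's highlighting section as a dict keyed by
    # each document's uniqueKey, mapping field names to highlighted fragments.
    fragments = results.highlighting.get('some-doc-id', {})  # illustrative key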
Example #4
def news_parser(url):
    """News parser: fetch the news pages linked from *url* and store them in Solr."""
    links = common.get_links(url)
    news_list = []
    for link in reversed(links):
        if not urlset.has_url('news', link.url):
            try:
                news_list.append(
                    News(url=link.url,
                         title=link.title,
                         content=extract(requests.get(link.url).text),
                         category=u'默认',  # u'默认' means "default"
                         update_time=time.strftime('%Y-%m-%dT%XZ',
                                                   time.gmtime())))
            except:
                print 'error adding', link.url
            print 'content', link.url
    try:
        solr.add('news', news_list)
    except:
        print 'error adding news'
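A subtlety shared by these parsers: %X in the strftime format is the locale's time representation, so the timestamp only matches Solr's yyyy-MM-dd'T'HH:mm:ss'Z' date format under the default C locale. Spelling the fields out removes that assumption:

    import time

    # Locale-independent equivalent of '%Y-%m-%dT%XZ' for Solr date fields.
    update_time = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())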
Example #5
def patent_parser(search_exp):
    """Search pss-system.gov.cn for *search_exp* and add new patents to Solr."""
    patent_list = []
    b = Browser("phantomjs")
    b.reload()
    b.visit(
        'http://www.pss-system.gov.cn/sipopublicsearch/search/searchHome-searchIndex.shtml'
    )
    b.fill('searchInfo', search_exp)
    b.click_link_by_text(u'检索')  # u'检索' is the site's "Search" button
    b.is_element_not_present_by_css('.s_c_conter', wait_time=8)
    for _ in xrange(10):  # scrape at most 10 result pages
        item_list = b.find_by_css('.s_c_conter')
        for item in item_list:
            info_list = item.find_by_tag('td')
            if not urlset.has_url('patent', info_list[0].text[6:]):
                try:
                    patent = Patent(
                        id=info_list[0].text[6:],
                        path='~',
                        title=info_list[4].text[6:],
                        abstract='~',
                        inventor=info_list[7].text[5:].split(';')[:-1],
                        applicant=info_list[6].text[10:].split(';')[:-1],
                        category=info_list[5].text[8:].split('; '),
                        update_time=time.strftime('%Y-%m-%dT%XZ',
                                                  time.gmtime()))
                    patent_list.append(patent)
                    print patent.id, 'new'  # @todo logs
                except:
                    print 'error patent'
        if b.is_text_present(u'下一页'):  # u'下一页' means "next page"
            b.click_link_by_text(u'下一页')
            b.is_element_not_present_by_css('.s_c_conter', wait_time=8)
        else:
            break
    try:
        solr.add('patent', patent_list)
    except:
        print 'err adding patent'  # the bare string here did nothing; log it
    finally:
        b.quit()
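An aside on the waiting idiom: splinter's is_element_not_present_by_css() returns a boolean after polling for up to wait_time seconds, and this snippet uses it only as a crude delay. Waiting for the results to appear reads more naturally with the positive check, which is also a real splinter method:

    # Wait up to 8 seconds for the result rows to render before scraping.
    b.is_element_present_by_css('.s_c_conter', wait_time=8)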
Example #6
import os
import re
import urllib
import zipfile

import solr  # solrpy, which provides SolrConnection

# grab the lcsh file if necessary
if not os.path.isfile("lcsh.nt.zip"):
    os.system("wget http://id.loc.gov/static/data/lcsh.nt.zip")

# connect to solr (the server URL is masked in the original snippet)
solr = solr.SolrConnection("http://*****:*****")

# the label-matching regex is mostly lost to the masking that mangled the
# line above; only its tail survives, and match.groups() below implies it
# had two capture groups (uri, label)
pref_label_pattern = r'...@en.$'

# send the data off to solr
# assumption: the original definition of nt_file fell inside the masked
# region; reading the .nt member out of the downloaded zip seems implied
nt_file = zipfile.ZipFile("lcsh.nt.zip").open("lcsh.nt")
count = 0
for line in nt_file:
    match = re.match(pref_label_pattern, line)
    if match:
        count += 1
        uri, label = match.groups()
        solr.add(id=uri, label=label)
        print uri, label
        if count % 1000 == 0:  # commit in batches of 1000; the original had
            solr.commit()      # this check outside `if match`, committing on
                               # every line until the first hit

solr.commit()
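For context, the masked pattern matches skos:prefLabel triples in the LCSH N-Triples dump. A purely hypothetical reconstruction with the two capture groups the loop expects (the real pattern is not recoverable from the snippet):

    # Hypothetical pattern: capture the subject URI and the English prefLabel.
    pref_label_pattern = r'<(.+)> <http://www.w3.org/2004/02/skos/core#prefLabel> "(.+)"@en.$'

A line of lcsh.nt would then look roughly like this (identifier made up for illustration):

    <http://id.loc.gov/authorities/subjects/sh00000000> <http://www.w3.org/2004/02/skos/core#prefLabel> "Example heading"@en .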