Пример #1
0
def getInfoForBibcode(bibcode):
    c = adsrdf.ADSConnection(SESAME, REPOSITORY)
    bibcodeuri = 'uri_bib:' + bibcode
    result = {}
    iduri = c.getDataBySP(bibcodeuri, 'fabio:isRealizationOf')
    print "returned", iduri
    iduri = iduri[0]

    result['id'] = iduri.split('#')[1]
    iduri = 'uri_bib:' + result['id']
    print "IDURI", iduri, result['id']

    result['bibcode'] = bibcode
    result['keywords'] = [
        e.split('#')[1].replace('_', ' ')
        for e in c.getDataBySP(iduri, 'adsbib:keywordConcept')
    ]
    result['title'] = c.getDataBySP(iduri, 'adsbase:title')
    result['author'] = [
        unquote(e.split('#')[1]).replace('_', ' ')
        for e in c.getDataBySP(iduri, 'pav:authoredBy')
    ]
    result['keywords_s'] = result['keywords']
    result['author_s'] = result['author']
    return result
Пример #2
0
 def setUp(self):
     print "Setting up"
     
     initialize_logging("rdf2solr5")
     debug("Starting:", time.asctime())
     confname="./default.conf"
     PORT=8984
     SESAME='http://localhost:'+str(PORT)+'/openrdf-sesame/'
     REPOSITORY='testads8'
     self.sesame = adsrdf.ADSConnection(SESAME, REPOSITORY)
Пример #3
0
def getInfoForBibcode(bibcode):
    c = adsrdf.ADSConnection(SESAME, REPOSITORY)
    bibcodeuri = 'uri_bib:' + bibcode
    result = {}
    iduri = c.getDataBySP(bibcodeuri, 'fabio:isRealizationOf')
    print "returned", iduri
    iduri = iduri[0]

    result['id'] = iduri.split('#')[1]
    iduri = 'uri_bib:' + result['id']
    print "IDURI", iduri, result['id']

    result['bibcode'] = bibcode
    result['keywords'] = [
        e.split('#')[1].replace('_', ' ')
        for e in c.getDataBySP(iduri, 'adsbib:keywordConcept')
    ]
    result['title'] = c.getDataBySP(iduri, 'adsbase:title')
    result['author'] = [
        unquote(e.split('#')[1]).replace('_', ' ')
        for e in c.getDataBySP(iduri, 'pav:authoredBy')
    ]
    result['keywords_s'] = result['keywords']
    result['author_s'] = result['author']
    #get the publication uri
    result['pubyear'] = int(
        c.getDataBySP(bibcodeuri, 'adsbib:pubDate')[0].split()[1])
    theobjects = c.getDataBySP(bibcodeuri, 'adsbase:hasAstronomicalSource')
    objectlist = []
    for theobj in theobjects:
        odata = c.getDataBySP('uri_base:' + theobj.split('#')[1],
                              'adsbase:hasMetadataString')
        odict = eval(odata[0])
        oid = odict['id']
        otype = odict['otype']
        ouri = theobj
        objectlist.append({'oid': oid, 'otype': otype, 'ouri': ouri})
    result['objectnames'] = [e['oid'] for e in objectlist]
    result['objecttypes'] = [e['otype'] for e in objectlist]
    result['objectnames_s'] = result['objectnames']
    result['objecttypes_s'] = result['objecttypes']
    return result
Пример #4
0
def getInfoForBibcode(bibcode):
    c=adsrdf.ADSConnection(SESAME, REPOSITORY)
    bibcodeuri='uri_bib:'+bibcode
    result={}
    iduri=c.getDataBySP(bibcodeuri, 'fabio:isRealizationOf')
    print "returned", iduri, bibcodeuri
    iduri=iduri[0]

    result['id']=iduri.split('#')[1]
    iduri='uri_bib:'+result['id']
    print "IDURI", iduri, result['id']
        
    result['bibcode']=bibcode
    result['keywords']=[e.split('#')[1].replace('_',' ') for e in c.getDataBySP(iduri, 'adsbib:keywordConcept')]
    result['title']=c.getDataBySP(iduri, 'adsbase:title')[0]
    pquery0="""
        SELECT ?atext WHERE {
            uri_bib:%s adsbib:hasAbstract ?anode.
            ?anode adsbib:abstractText ?atext.            
        }
     """ % (result['id'])
        
    #print pquery0
    res1=c.makeQuery(pquery0)
    #print res1[0]
    result['abstract']=res1[0]['atext']['value']
    print "TITLE", result['title']
    citationcount=len(c.getDataBySP(iduri, 'cito:cites'))
    result['citationcount_i']=citationcount
    #this is the first thing that can have multiple stuff from chandra, hut and other
    #we still dont handle this
    ptray=c.getDataBySP(bibcodeuri, 'adsbib:paperType')
    if len(ptray)>0:
        result['papertype_s']=ptray
        print "PTYPE", bibcode, ptray
    else:
        result['papertype_s']=["None"]
        print "PTYPE", bibcode, "NONE"
    
    #Above is only accurate when we dont do overlaps. For HUT/Chandra overlap, we should
    #be doing None/Something overlap but i do this as just Something should be fine
    #Itake the position that "None", if you want to institutionalize it, should be put in the rdf    
    #print "PAPERTYPE", result['papertype_s']
    authoren=c.getDataBySP(iduri, 'pav:authoredBy')
    #print authoren
    #BUG: one slash too many in authors you think?
    result['author']=[unquote(e.split('/')[-2]).replace('_',' ') for e in authoren]
    #print result['author']
    result['keywords_s']=result['keywords']
    result['author_s']=result['author']
    #get the publication uri
    result['pubyear_i']=int(c.getDataBySP(bibcodeuri, 'adsbib:pubDate')[0].split()[1])
    theobjects=c.getDataBySP(bibcodeuri, 'adsbase:hasAstronomicalSource')
    objectlist=[]
    for theobj in theobjects:
        #print "theobj", theobj
        odata=c.getDataBySP('uri_source:'+theobj.split('/')[-1], 'adsbase:hasMetadataString')
        odict=eval(odata[0])
        oid=odict['id']
        otype=odict['otype']
        ouri=theobj
        objectlist.append({'oid':oid, 'otype':otype, 'ouri':ouri})
    result['objectnames']=[e['oid'] for e in objectlist]
    result['objecttypes']=[e['otype'] for e in objectlist]
    result['objectnames_s']=result['objectnames']
    result['objecttypes_s']=result['objecttypes']
    print result['objectnames']
    #theobsids=[rinitem(splitns(e)) for e in c.getDataBySP(bibcodeuri, 'adsbase:aboutScienceProduct')]
    theobsiduris=c.getDataBySP(bibcodeuri, 'adsbase:aboutScienceProcess')
    #print "OBSIDS", bibcodeuri, theobsiduris
    obsray=[]
    #TESTnotice by this we dont uniq telescopes or data types...what does this mean for the numbers, if anything?
    daprops=['obsids_s','obsvtypes_s','exptime_f','obsvtime_d','instruments_s', 'telescopes_s', 'emdomains_s', 'missions_s', 'targets_s', 'ra_f','dec_f', 'datatypes_s','propids_s', 'proposaltitle', 'proposalpi', 'proposalpi_s', 'proposaltype_s']
    print "THEOBSIDURIS", theobsiduris
    datatypes=[]
    for theuri in theobsiduris:
        thedict={}
        #BUG: make this polymorphic
        themission, thevariable, theobsid=splitns(theuri)
        uritail=themission+"/"+thevariable+"/"+theobsid
        print "URITAIL", uritail
        thedict['missions_s']=themission # this should be in RDF!!
        

        if theuri.find('MAST')!=-1:
            pquery0="""
            SELECT ?tname WHERE {
            %s adsbase:target ?tnode.
            ?tnode adsbase:name ?tname.            
            }
            """ % (n3encode('uri_obs:'+uritail))
            #Sprint pquery0
            res1=c.makeQuery(pquery0)
            target=res1[0]['tname']['value']
            thetarget=themission+"/"+target
        elif theuri.find('CHANDRA'):
            titleray=c.getDataBySP('uri_obs:'+uritail, 'adsbase:title')
            if len(titleray)==0:
                title="Unspecified"
            else:
                title=titleray[0]
            thetarget=themission+"/"+title
        else:
            thetarget="None"
        print "The target", thetarget
        thedict['targets_s']=thetarget
        #print "::::::::::::::::", theobsid, theuri, themission, thevariable
        #thedict['obsids_s']=rinitem(theobsid)
        thedict['obsids_s']=themission+"/"+theobsid

        #print theobsid, c.getDataBySP('uri_obs:'+uritail, 'adsobsv:observationType')
        obstypes=c.getDataBySP('uri_obs:'+uritail, 'adsobsv:observationType')
        if len(obstypes)>0:
            thedict['obsvtypes_s']=obstypes[0]
        else:
            thedict['obsvtypes_s']=themission+"/None"
        #Hut dosent have obsvtypes. Caal it MAST_HUT/None
        print "???", c.getDataBySP('uri_obs:'+uritail, 'adsobsv:tExptime'), c.getDataBySP('uri_obs:'+uritail, 'adsobsv:tExpTime')
        try:
            thedict['exptime_f']=float(c.getDataBySP('uri_obs:'+uritail, 'adsobsv:tExpTime')[0])
        except:
            thedict['exptime_f']=float(c.getDataBySP('uri_obs:'+uritail, 'adsobsv:tExptime')[0])
        tdt=c.getDataBySP('uri_obs:'+uritail, 'adsbase:atTime')[0]
        #print "TDT", tdt
        obsvtime=datetime.datetime.strptime(tdt,"%Y-%m-%dT%H:%M:%S")
        #month, day, year, thehour=tdt.split()

        #th, tmin=thehour[:-2].split(':')
        #th=int(th)
        #tmin=int(tmin)
        #if thehour[-2:]=='PM' and th < 12:
        #    th=int(th)+12
        #    print "TDT", tdt
        #obsvtime=datetime.datetime(int(year), list(calendar.month_abbr).index(month), int(day), th, tmin)
        thedict['obsvtime_d']=obsvtime.isoformat()+"Z"
        
        #hasDatum is a subset of hasDataProduct. How do we get sparql to fo up inhertitance hierarchy
        #Currently we have no way of knowing as the owl file hasnt been loaded in
        pquery="""
            SELECT ?dtype WHERE {
            {%s adsobsv:hasDataProduct ?daturi.} UNION {%s adsobsv:hasDatum ?daturi.}
            ?daturi adsbase:dataType ?dtype.
            }
        """ % (n3encode('uri_obs:'+uritail),n3encode('uri_obs:'+uritail) )
        res=c.makeQuery(pquery)
        #print "RES", res, pquery
        tempdt={}
        if len(res)>0:
            for ele in res:
                tkey=ele['dtype']['value']
                if tempdt.has_key('tkey'):
                    tempdt[tkey]+=1
                else:
                    tempdt[tkey]=1
            thedict['datatypes_s']=tempdt.keys()
        else:
            thedict['datatypes_s']=[]
        #BUG: Still assume one istrument. This will change, point is how? There will be both
        #multiple stuff for non-simple obs and hierarchical stuff for simple obs like gratings
        #how will we model this?
        print "DATATYPES", thedict['datatypes_s']
        theinstrument=c.getDataBySP('uri_obs:'+uritail, 'adsbase:usingInstrument')[0]
        theinstrumentname=theinstrument.split('/')[-1]
        thedict['instruments_s']="/".join(theinstrumentname.split('_'))
        #BUG: Still assume one telescope, this will change
        thetelescope=c.getDataBySP('uri_obs:'+uritail, 'adsobsv:atTelescope')[0]
        thetelescopename=thetelescope.split('/')[-1]
        thedict['telescopes_s']="/".join(thetelescopename.split('_'))
        #print thedict['instruments_s']
        #pointing=c.getDataBySP('uri_obs:'+theobsid, 'adsobsv:associatedPosition')[0]
        #FAIL dune to bnode crapola ra=c.getDataBySP(pointing, 'adsobsv:ra')
        #BUG we should first even see if Pointing exists before going for ra or dec
        
        #This will need special handling as it is multivalued array even within obsv.
        #So it will need flattening within publications
        theemdomains=c.getDataBySP('uri_obs:'+uritail, 'adsobsv:wavelengthDomain')
        #BUG:Note that by doing this emdomains is optional...Not sure we want that
        if len(theemdomains) > 0:
            thedict['emdomains_s']=[]
            for domain in theemdomains:
                thedict['emdomains_s'].append(domain.split('_')[-1])
            
        thepointings=c.getDataBySP('uri_obs:'+uritail, 'adsobsv:associatedPosition')
        
        if len(thepointings) > 0:
            pquery="""
            SELECT ?ra ?dec WHERE {
            %s adsobsv:associatedPosition ?position.
            ?position adsobsv:ra ?ra.
            ?position adsobsv:dec ?dec.
                
            }
            """ % (n3encode('uri_obs:'+uritail))
        
            #print pquery
            res=c.makeQuery(pquery)
            #print "POINTING", res
            ra=None
            dec=None
            if len(res)!=0:
                ra=res[0]['ra']['value']
                dec=res[0]['dec']['value']
                #print "RADEC", ra, dec
            if ra!='None' and dec!='None':
                thedict['ra_f']=float(ra)
                thedict['dec_f']=float(dec)
        else:
            print "******************************************No pointings for ", uritail
            
        #proposal stuff...not searching abstracts yet
        props=c.getDataBySP('uri_obs:'+uritail, 'adsbase:asAResultOfProposal')
        #BUG: again assuming only one proposal here. When we get paper proposals this will
        #Not be true any more. We should also disambiguate observational from paper proposals.
        #though paper proposals will be assoced with papers, not here, so this should be obsvprop
        #only
        if len(props)>0:
            propuri=props[0]
            #print "PROPURI", propuri
            themission, thevariable, thepropid=splitns(propuri)
            proptail=themission+"/"+thevariable+"/"+thepropid
            thedict['propids_s']=themission+"/"+thepropid
            #print proptail, n3encode('uri_prop:'+proptail), c.getDataBySP('uri_prop:'+proptail, 'adsbase:title')
            thedict['proposaltitle']=c.getDataBySP('uri_prop:'+proptail, 'adsbase:title')[0]
            thedict['proposaltype_s']=c.getDataBySP('uri_prop:'+proptail, 'adsobsv:observationProposalType')[0]
            e=c.getDataBySP('uri_prop:'+proptail, 'adsbase:principalInvestigator')[0]
            #print "PI", e
            thedict['proposalpi']=unquote(e.split('/')[-2]).replace('_',' ')
            thedict['proposalpi_s']=thedict['proposalpi']
        
        #BUG: SHOULD we have something like this associating None's where there is no proposal?????'    
        #else:
        #    thedict['propids_s']=themission+"/None"
            
            #print thedict
        obsray.append(thedict)
    
    #print "OBSRAY", obsray
    if len(obsray)>0:
        for tkey in daprops:
            #print "tkey is ", tkey
            temptkey=[e[tkey] for e in obsray if e.has_key(tkey)]
            #print "temptkey", temptkey
            temp2=[item if hasattr(item,'__iter__') else [item] for item in temptkey]
            #print "temp2", temp2
	    if len(temp2) >0:
            	result[tkey]=reduce(lambda x,y: x+y, temp2)
	    else:
		result[tkey]=[]
    return result
Пример #5
0
    #print bibdir
    print '===================================='
    solrinstance.add([bibdir], commit=False)
    
    
    
    #Issue with loading into sh obsids.sh and all wont we duplicate them if we do stuff separateky for overlaps and stuff. Should we do it just once or check whats been loaded to protect against this BUG
if __name__=="__main__":
    if len(sys.argv)==2:
        execfile("./default.conf")
    elif len(sys.argv)==3:
        execfile(sys.argv[2])
    else:
        print "Usage: python rdf2solr3.py biblistfile [conffile]"
        sys.exit(-1)
    c=adsrdf.ADSConnection(SESAME, REPOSITORY)
    print "cccccccccccccccccccccccccccccccccccc",c
    #researchpapers=[unquote(e.split('#')[1]) for e in c.getDataByType('cito:ResearchPaper')]
    #h= HTMLParser.HTMLParser()
    researchpapers=[ele.strip() for ele in open(sys.argv[1]).readlines()]
    print researchpapers
    #researchpapers=['2000A&A...359..489C', '2000ApJ...534L..47G', '2000ApJ...536L..27W', '2000ApJ...540L..69S', '2000ApJ...541...49H']
    solr=pysolr.Solr(SOLR)
    #solr=None
    #researchpapers=['2000ApJ...534L..47G', '2009ApJ...692.1143K']
    for ele in researchpapers:
        print "Indexing: ",ele
        putIntoSolr(solr, ele)
        print "-------------"
    solr.commit()