Example #1
def getObsCoreFile(fname):
    """
    Takes a psv-format file and returns a dict whose keys are the filenames to
    use and whose values are arrays of tuples for that filename. Each tuple is
    (vals, anbool), where anbool says whether the info is taken from the
    corresponding vals; vals is the psv row converted into a dict.
    """
    
    (rdr, fh) = open_obscore(fname)

    rnum = 0
    rpass = 0

    idx = 1
    
    globalrowdict={}
    h_at={}
    for row in rdr:
        vals=row2dict(row)
        obs_id = vals['obs_id']
        h_at[obs_id]=[(vals, 1)]

    fh.close()
    return h_at
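All of the examples here lean on two helpers defined elsewhere in the module,
open_obscore and row2dict. A minimal sketch of what they are assumed to do
(the psv layout and the exact signatures are assumptions, not the project's
code):

import csv

def open_obscore(fname):
    # Open a pipe-separated-value file and return (reader, filehandle);
    # the caller is responsible for closing the handle.
    fh = open(fname)
    rdr = csv.DictReader(fh, delimiter='|')
    return (rdr, fh)

def row2dict(row):
    # With csv.DictReader each row already behaves like a dict; strip
    # whitespace around the pipe separators so the '' checks in the
    # callers work as expected.
    return dict((k.strip(), (v or '').strip()) for k, v in row.items())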
Example #2
def getObsCoreFile(fname):
    """
    Takes a psv-format file and returns a dict whose keys are the filenames to
    use and whose values are arrays of tuples for that filename. Each tuple is
    (vals, anbool), where anbool says whether the info is taken from the
    corresponding vals; vals is the psv row converted into a dict.
    """
    
    (rdr, fh) = open_obscore(fname)

    rnum = 0
    rpass = 0

    idx = 1
    
    globalrowdict={}
    h_at={}
    for row in rdr:
        vals=row2dict(row)
        obs_id = vals['obs_id']
        h_at[obs_id]=[(vals, 1)]

    fh.close()
    return h_at
Example #3
def getObsCoreFile(odhfname, fname, ohead, nsplit=10000, format="n3"):
    """Convert the given obscore file from MAST (in psv format) into
    RDF.

    Rows that cannot be converted are ignored (an error message is
    displayed on STDERR in this case).

    Since the input file is large, the output is split into multiple
    files; each grouped observation is written to

        ohead.<oid>.<format>

    where oid is the grouped observation/time key. (The nsplit
    argument is currently unused.)
    """
    obsdatahash={}
    (rdr, fh) = open_obscore(fname)

    rnum = 0
    rpass = 0

    idx = 1
    graph = makeGraph()
    globalrowdict={}
    h_at={}
    for row in rdr:
        vals=row2dict(row)
        obs_id = vals['obs_id']
        at_time="_".join(vals['date_obs'].split())
        if obs_id == '':
            raise ValueError("No obs_id value in this row!")
        access_url = vals['access_url']
        access_name=access_url.split('/')[-1].split('_ph_')[0]
        if access_name.find('_sum') !=-1:
            access_name=access_name.split('_sum')[0]
        anbool=1
        if access_name.find('_imcscor')!=-1:
            access_name=access_name.split('_imcscor')[0]
            anbool=0
        #print "access url", access_url
        if access_url.strip() == '':
            raise ValueError("Empty access_url for row")
        dayfind=access_url.find(obs_id+"_d")
        nightfind=access_url.find(obs_id+"_n")
        afind=access_url.find(obs_id+"_a")
        if dayfind!=-1:
            d2key=obs_id+"_d"
            #dkey=obs_id  # let's not do day separately
        elif afind!=-1:
            d2key=obs_id+"_a"
        elif nightfind!=-1:
            d2key=obs_id+"_a"
        else:
            d2key=obs_id
            
        #dkey=obs_id+"--"+access_name
        dkey=obs_id
        if not globalrowdict.has_key(dkey):
            globalrowdict[dkey]=[]
        globalrowdict[dkey].append((vals, at_time, access_name, d2key, anbool))
    
    #print "LLLLL"    
    for dkey in globalrowdict.keys():
        print "grd", dkey, len(globalrowdict[dkey])
        dalen=len(globalrowdict[dkey])
        h_an={}
        for ele in globalrowdict[dkey]:
            vals, at_time, access_name, d2key, anbool=ele
            print "time",at_time, dkey, access_name, anbool
            if not h_an.has_key(access_name):
                h_an[access_name]=[]
            if anbool==1 or dalen==1:
                h_an[access_name].append((ele, at_time))
            else:
                h_an[access_name].append((ele, None))
        #print "han", h_an
        h_an2={}        
        for item in h_an.keys():
            #print "hanitem", h_an[item]
            thetimelist=[e[1] for e in h_an[item] if e[1]!=None]
            if len(thetimelist)>=1:
                thetime=thetimelist[0]
            else:
                #This happens, e.g., in pupaeast when there is only imcscor
                print "OOOOOOOOOOOOOOOPS", len(thetimelist)
                thetime = None  # no usable time for this group; skipped below
            h_an2[item]=[(e[0],thetime) for e in h_an[item]]
        print "deekee",dkey
        for k in h_an2.keys():
            for item in h_an2[k]:
                #print "<<<",item[0][0],">>>"
                if item[1] is None:
                    continue  # no time was recovered for this group
                key = dkey + "=" + item[1]
                if not h_at.has_key(key):
                    h_at[key] = []
                h_at[key].append(item[0])

    for oid in h_at.keys():
        print "OID",oid
        #print "<<<",h_at[oid],">>>"
        graph=addObsCoreObs(oid,h_at[oid], obsdatahash)
        writeGraph(graph,
                   "{0}.{1}.{2}".format(ohead, oid, format),
                   format=format)

    fh.close()
    fd=open(odhfname,"w")
    fd.write(str(obsdatahash))
    fd.close()
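A hypothetical invocation of the converter above (the filenames are
illustrative only):

# Writes one RDF file per grouped observation key, named
# mast_rdf.<oid>.n3, and then dumps the obsdatahash dict (via str())
# to obsdata.hash. Note that nsplit is accepted but never consulted.
getObsCoreFile("obsdata.hash", "mast_obscore.psv", "mast_rdf", format="n3")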
Example #4
def addObsCoreRow(row):
    """Returns a Graph representing the given row. We do not add it to the
    main graph here in case there is invalid data for this row. Perhaps
    it would be better to have all validity checks first and then add
    directly to the graph, since it may be faster once the main graph
    starts getting large (unlikely).

    Errors may be thrown if the input is invalid (e.g. unable to coerce
    a cell into the correct type).
    """

    vals = row2dict(row)

    # We use this as a hash and assume it is a unique value
    # (could check this assumption as we process the files, but
    # it was true in the original dataset).
    #
    # Originally I had used the obs_id cell as a unique identifier for
    # both observation and dataset, but it turned out not to be unique
    # enough. It may be that this is down to the modelling, where we try
    # to associate as much information as possible with the observation,
    # rather than the data product. If the observation were very light-weight
    # then we could keep this as an identifier for the observation, but
    # would still need unique identifiers for the data values.
    #
    # We reverse the access URL before hashing to try and reduce collisions.
    # This is a bit silly and needs replacing; for instance
    # we do not need this many characters and the current "hash" scheme isn't
    # very unique.
    #
    access_url = vals['access_url']
    if access_url.strip() == '':
        raise ValueError("Empty access_url for row")
    
    obs_id = vals['obs_id']
    if obs_id == '':
        raise ValueError("No obs_id value in this row!")
    
    # We use a scheme based on the path
    #    
    #    xxx/data/MAST/obsid/<obs_id>/<hash>
    #    xxx/observation/MAST/obsid/<obs_id>/<hash>
    #
    # where <hash> is a "hash" of the access_url value.
    # This is intended to
    #   - reduce file sizes (e.g. use of slash rather than hash URI)
    #   - be more REST-ful in that we can define properties for parents
    #     of these URIs to manage and merge data
    #   - allow somewhat easier updates in case of changes - e.g. to
    #     the data location because a server changes so access_url
    #     changes but nothing else does
    #
    uri_hash = base64.urlsafe_b64encode(access_url[::-1])
    daturi = mkURI("/obsv/data/MAST/obsid/{0}/".format(obs_id), uri_hash)
    #obsuri = mkURI("/obsv/observation/MAST/obsid/{0}/".format(obs_id), uri_hash)
    obsuri = mkURI("/obsv/observation/MAST/obsid/{0}/".format(obs_id))
    graph = Graph()

    # Can we assume this is a SimpleObservation or could it be a
    # ComplexObservation? Not convinced we can tell, so
    # use the parent Observation class for now.
    #
    #gadd(graph, obsuri, a, adsobsv.SimpleObservation)
    gadd(graph, obsuri, a, adsobsv.Observation)

    # For now assuming we have a Datum rather than a DataSet;
    # we could use the parent SingularDataProduct but try
    # this.
    #
    gadd(graph, daturi, a, adsobsv.Datum)
    #gadd(graph, daturi, a, adsobsv.SingularDataProduct)

    #gadd(graph, obsuri, adsobsv.hasDatum, daturi)
    gadd(graph, obsuri, adsobsv.hasDataProduct, daturi)

    #gadd(graph, daturi, adsobsv.forSimpleObservation, obsuri)
    gadd(graph, daturi, adsobsv.forObservation, obsuri)

    # Question: should we use obs_id for both here?
    gadd(graph, obsuri, adsobsv.observationId, Literal(obs_id))
    gadd(graph, daturi, adsobsv.dataProductId, Literal(obs_id))

    ### Observational properties
    #
    emmin = vals['em_min']
    emmax = vals['em_max']

    addVals(graph, obsuri,
            [
                adsbase.atTime, vals['date_obs'], asDateTime(),
                # not convinced that observedTime is worth it, as an xsd:duration
                adsobsv.observedTime, vals['t_exptime'], asDuration,
                adsobsv.tExptime, vals['t_exptime'], asDouble,

                adsobsv.resolution, vals['s_resolution'], asDouble,
                adsobsv.tResolution, vals['t_resolution'], asDouble,

                adsobsv.wavelengthStart, emmin, asDouble,
                adsobsv.wavelengthEnd, emmax, asDouble,

                adsbase.title, vals['title'], Literal,
                
                adsobsv.fov, vals['s_fov'], asDouble,

            ])

    # For now we create a URI for each target_name and make
    # it an AstronomicalSourceName. We know that this is not
    # always "sensible", in that some names are not sources as
    # such but calibration values (e.g. '20% UV FLOOD' or
    # 'NULL SAFETY RD') or some scheme of the observer
    # which may be positional or something else (e.g. '+014381').
    #
    tname = vals['target_name'].strip()
    if tname != '':
        tnameuri = mkURI("/obsv/target/MAST/", tname)

        gadd(graph, obsuri, adsbase.target, tnameuri)
        addVals(graph, tnameuri,
                [
                    a, adsobsv.AstronomicalSourceName, None,
                    adsbase.name, tname, Literal,
                    ])

    # We do not use the em_domain field since the values found in
    # the MAST table do not appear to match the ObsCore/VODataService
    # enumerations. Instead we create values based on the em_min/max
    # fields. These could be inferred but worth being explicit here.
    #
    for domain in getEMDomains(float(emmin), float(emmax)):
        addVal(graph, obsuri, adsobsv.wavelengthDomain, domain)

    sra = vals['s_ra']
    sdec = vals['s_dec']
    if sra != '' and sdec != '':
        gdbnadd(graph, obsuri, adsobsv.associatedPosition,
                [
                    a, adsobsv.Pointing,
                    adsobsv.ra, asDouble(sra),
                    adsobsv.dec, asDouble(sdec),
                ])

    sregion = vals['s_region']
    if sregion != '':
        predList = [
                a, adsobsv.FootPrint,
                adsobsv.s_region, Literal(sregion),
            ]
            
        gdbnadd(graph, obsuri, adsobsv.associatedFootprint, predList)

    # TODO:
    #   - work out what prefix to use; for now guessing uri_conf is okay,
    #     since that's what the Chandra pipeline uses, but would uri_obsv be
    #     better? Alternatively, move to a scheme more like the other URIs
    #     we create here
    #
    tname = vals['telescope_name']
    iname = vals['instrument']
    oname="MAST"
    gadd(graph, obsuri, adsobsv.atObservatory,
             addFragment(uri_infra, 'observatory/' + oname))
    if tname != '':
        gadd(graph, obsuri, adsobsv.atTelescope,
             addFragment(uri_infra, 'telescope/MAST_' + tname))

    if iname != '':
        gadd(graph, obsuri, adsbase.usingInstrument,
             addFragment(uri_infra, 'instrument/MAST_' + iname))

    ### Data set properties
    #
    gadd(graph, daturi, adsobsv.dataURL, URIRef(access_url))
    #BUG: fix this to use a mapper
    dprodtype="image"#DEFAULT
    if vals['dataproduct_type'].find("Spectrum.") != -1:
        dprodtype="spectra"
    elif vals['dataproduct_type'].find("Image.") != -1:
        dprodtype="image"
    addVals(graph, daturi,
            [
                pav.createdOn, vals['creation_date'], asDateTime(),
                adsobsv.calibLevel, vals['calib_level'], asInt,

                adsbase.dataType, dprodtype, Literal, # could be a URI; how standardised are the values?
                adsobsv.dataFormat, vals['access_format'], Literal, # could be a URI; how standardised are the values?
            ])

    # Adding a link to the IVOA identifier for completeness.
    # Since this is the dataset identifier, we link it to the
    # dataset rather than the observation.
    #
    gadd(graph, daturi, adsbase.hasIVOAIdentifier,
         URIRef(vals['obs_publisher_did']))
        
    # The scheme for creator and collection URI is
    #
    #    xxx/creator/MAST/<obs_creator_name>
    #    xxx/collection/MAST/<obs_collection>
    #
    # although <obs_collection> can be an IVOA identifier, which means
    # we use that instead; this breaks the linked-data approach, so perhaps
    # need a predicate to say "this represents this IVOA id" (could
    # use owl:sameAs but not convinced we want this).
    #
    #   - should I replace / by some other character since it could
    #     confuse some parsers? Replace space with ?
    #
    #   - upper case all characters under the assumption that case
    #     is not important and that there may be differences in case
    #
    cname = vals['obs_creator_name']
    if cname != '':
        # Is this correct; ie is the obs_creator_name really
        # the same as observationMadeBy?
        #
        #cnameuri = mkURI("/obsv/creator/MAST/", cname)
        cnameuri=addFragment(uri_conf, 'project/MAST_' + cname)
        gadd(graph, obsuri, adsobsv.observationMadeBy, cnameuri)
        #gdadd(graph, cnameuri, [
        #    a, agent.PersonName,
        #    agent.fullName, Literal(cname)
        #    ])

    ocoll = vals['obs_collection']
    if ocoll != '':
        if is_ivoa_uri(ocoll):
            colluri = URIRef(ocoll)
        else:
            colluri = mkURI("/obsv/collection/MAST/", ocoll)

        addVal(graph, daturi, adsobsv.fromDataCollection, colluri)
        gdadd(graph, colluri, [
            a, adsobsv.DataCollection,
            adsbase.name, Literal(ocoll)
            ])

    return graph
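To make the URI scheme described in the comments above concrete, here is the
derivation of the "hash" component in isolation (the URL is made up):

import base64

access_url = "http://archive.example/mast/i9zf01010_d_ph_x.fits"  # hypothetical
uri_hash = base64.urlsafe_b64encode(access_url[::-1])
# uri_hash is a deterministic, URL-safe token built from the reversed
# URL; it becomes the final path segment under /obsv/data/MAST/obsid/.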
Example #5
def getObsCoreFile(fname):
    """
    Takes a psv-format file and returns a dict whose keys are the filenames to
    use and whose values are arrays of tuples for that filename. Each tuple is
    (vals, anbool), where anbool says whether the info is taken from the
    corresponding vals; vals is the psv row converted into a dict.
    """

    (rdr, fh) = open_obscore(fname)

    rnum = 0
    rpass = 0

    idx = 1

    globalrowdict = {}
    h_at = {}
    for row in rdr:
        vals = row2dict(row)
        obs_id = vals['obs_id']
        at_time = "_".join(vals['date_obs'].split())
        if obs_id == '':
            raise ValueError("No obs_id value in this row!")
        access_url = vals['access_url']
        access_name = access_url.split('/')[-1].split('_ph_')[0]
        if access_name.find('_sum') != -1:
            access_name = access_name.split('_sum')[0]
        anbool = 1
        if access_name.find('_imcscor') != -1:
            access_name = access_name.split('_imcscor')[0]
            anbool = 0
        #print "access url", access_url
        if access_url.strip() == '':
            raise ValueError("Empty access_url for row")
        dayfind = access_url.find(obs_id + "_d")
        nightfind = access_url.find(obs_id + "_n")
        afind = access_url.find(obs_id + "_a")
        if dayfind != -1:
            d2key = obs_id + "_d"
            #dkey=obs_id  # let's not do day separately
        elif afind != -1:
            d2key = obs_id + "_a"
        elif nightfind != -1:
            d2key = obs_id + "_a"
        else:
            d2key = obs_id

        #dkey=obs_id+"--"+access_name
        dkey = obs_id
        if not globalrowdict.has_key(dkey):
            globalrowdict[dkey] = []
        globalrowdict[dkey].append((vals, at_time, access_name, d2key, anbool))

    #print "LLLLL"
    for dkey in globalrowdict.keys():
        print "grd", dkey, len(globalrowdict[dkey])
        dalen = len(globalrowdict[dkey])
        h_an = {}
        for ele in globalrowdict[dkey]:
            vals, at_time, access_name, d2key, anbool = ele
            print "time", at_time, dkey, access_name, anbool
            if not h_an.has_key(access_name):
                h_an[access_name] = []
            if anbool == 1 or dalen == 1:
                h_an[access_name].append((ele, at_time))
            else:
                h_an[access_name].append((ele, None))
        #print "han", h_an
        h_an2 = {}
        for item in h_an.keys():
            #print "hanitem", h_an[item]
            thetimelist = [e[1] for e in h_an[item] if e[1] != None]
            if len(thetimelist) >= 1:
                thetime = thetimelist[0]
            else:
                #This happens, e.g., in pupaeast when there is only imcscor
                print "OOOOOOOOOOOOOOOPS", len(thetimelist)
                thetime = None  # no usable time for this group; skipped below
            h_an2[item] = [(e[0], thetime) for e in h_an[item]]
        print "deekee", dkey
        for k in h_an2.keys():
            for item in h_an2[k]:
                #print "<<<",item[0][0],">>>"
                if item[1] is None:
                    continue  # no time was recovered for this group
                key = dkey + "=" + item[1]
                if not h_at.has_key(key):
                    h_at[key] = []
                h_at[key].append((item[0][0], item[0][4]))
                #add the anbool and vals in here:
                #anbool tells you which row contains the information we ought to use
                #(in this case not imcscor); in the default case anbool=1 for everything.

    fh.close()
    return h_at
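A sketch of consuming the mapping this version returns (the input filename is
hypothetical): keys look like "<obs_id>=<date>_<time>" and each value is a
list of (vals, anbool) tuples:

h_at = getObsCoreFile("mast_obscore.psv")
for key, rows in h_at.items():
    # anbool == 1 marks rows whose metadata should be used (everything
    # except the _imcscor entries); anbool == 0 rows still carry their
    # own data product in vals.
    usable = [vals for (vals, anbool) in rows if anbool == 1]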
Example #6
def getObsCoreFile(odhfname, fname, ohead, nsplit=10000, format="n3"):
    """Convert the given obscore file from MAST (in psv format) into
    RDF.

    Rows that cannot be converted are ignored (an error message is
    displayed on STDERR in this case).

    Since the input file is large, the output is split into multiple
    files; each grouped observation is written to

        ohead.<oid>.<format>

    where oid is the grouped observation/time key. (The nsplit
    argument is currently unused.)
    """
    obsdatahash = {}
    (rdr, fh) = open_obscore(fname)

    rnum = 0
    rpass = 0

    idx = 1
    graph = makeGraph()
    globalrowdict = {}
    h_at = {}
    for row in rdr:
        vals = row2dict(row)
        obs_id = vals['obs_id']
        at_time = "_".join(vals['date_obs'].split())
        if obs_id == '':
            raise ValueError("No obs_id value in this row!")
        access_url = vals['access_url']
        access_name = access_url.split('/')[-1].split('_ph_')[0]
        if access_name.find('_sum') != -1:
            access_name = access_name.split('_sum')[0]
        anbool = 1
        if access_name.find('_imcscor') != -1:
            access_name = access_name.split('_imcscor')[0]
            anbool = 0
        #print "access url", access_url
        if access_url.strip() == '':
            raise ValueError("Empty access_url for row")
        dayfind = access_url.find(obs_id + "_d")
        nightfind = access_url.find(obs_id + "_n")
        afind = access_url.find(obs_id + "_a")
        if dayfind != -1:
            d2key = obs_id + "_d"
            #dkey=obs_id  # let's not do day separately
        elif afind != -1:
            d2key = obs_id + "_a"
        elif nightfind != -1:
            d2key = obs_id + "_a"
        else:
            d2key = obs_id

        #dkey=obs_id+"--"+access_name
        dkey = obs_id
        if not globalrowdict.has_key(dkey):
            globalrowdict[dkey] = []
        globalrowdict[dkey].append((vals, at_time, access_name, d2key, anbool))

    #print "LLLLL"
    for dkey in globalrowdict.keys():
        print "grd", dkey, len(globalrowdict[dkey])
        dalen = len(globalrowdict[dkey])
        h_an = {}
        for ele in globalrowdict[dkey]:
            vals, at_time, access_name, d2key, anbool = ele
            print "time", at_time, dkey, access_name, anbool
            if not h_an.has_key(access_name):
                h_an[access_name] = []
            if anbool == 1 or dalen == 1:
                h_an[access_name].append((ele, at_time))
            else:
                h_an[access_name].append((ele, None))
        #print "han", h_an
        h_an2 = {}
        for item in h_an.keys():
            #print "hanitem", h_an[item]
            thetimelist = [e[1] for e in h_an[item] if e[1] != None]
            if len(thetimelist) >= 1:
                thetime = thetimelist[0]
            else:
                #This happens, e.g., in pupaeast when there is only imcscor
                print "OOOOOOOOOOOOOOOPS", len(thetimelist)
                thetime = None  # no usable time for this group; skipped below
            h_an2[item] = [(e[0], thetime) for e in h_an[item]]
        print "deekee", dkey
        for k in h_an2.keys():
            for item in h_an2[k]:
                #print "<<<",item[0][0],">>>"
                if item[1] is None:
                    continue  # no time was recovered for this group
                key = dkey + "=" + item[1]
                if not h_at.has_key(key):
                    h_at[key] = []
                h_at[key].append(item[0])

    for oid in h_at.keys():
        print "OID", oid
        #print "<<<",h_at[oid],">>>"
        graph = addObsCoreObs(oid, h_at[oid], obsdatahash)
        writeGraph(graph,
                   "{0}.{1}.{2}".format(ohead, oid, format),
                   format=format)

    fh.close()
    fd = open(odhfname, "w")
    fd.write(str(obsdatahash))
    fd.close()
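The access_name normalization that both versions repeat inline could be pulled
out into a helper; this sketch follows the code above exactly (only the helper
name is new):

def normalize_access_name(access_url):
    # Take the last path component up to '_ph_', then strip any '_sum'
    # or '_imcscor' suffix. anbool is 0 only for the astrometry-corrected
    # (_imcscor) copies, whose metadata the callers do not want to prefer.
    name = access_url.split('/')[-1].split('_ph_')[0]
    if name.find('_sum') != -1:
        name = name.split('_sum')[0]
    anbool = 1
    if name.find('_imcscor') != -1:
        name = name.split('_imcscor')[0]
        anbool = 0
    return name, anbool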
Example #7
def addObsCoreRow(row):
    """Returns a Graph representing the given row. We do not add it to the
    main graph here in case there is invalid data for this row. Perhaps
    it would be better to have all validity checks first and then add
    directly to the graph, since it may be faster once the main graph
    starts getting large (unlikely).

    Errors may be thrown if the input is invalid (e.g. unable to coerce
    a cell into the correct type).
    """

    vals = row2dict(row)

    # We use this as a hash and assume it is a unique value
    # (could check this assumption as we process the files, but
    # it was true in the original dataset).
    #
    # Originally I had used the obs_id cell as a unique identifier for
    # both observation and dataset, but it turned out not to be unique
    # enough. It may be that this is down to the modelling, where we try
    # to associate as much information as possible with the observation,
    # rather than the data product. If the observation were very light-weight
    # then we could keep this as an identifier for the observation, but
    # would still need unique identifiers for the data values.
    #
    # We reverse the access URL before hashing to try and reduce collisions.
    # This is a bit silly and needs replacing; for instance
    # we do not need this many characters and the current "hash" scheme isn't
    # very unique.
    #
    access_url = vals['access_url']
    if access_url.strip() == '':
        raise ValueError("Empty access_url for row")

    obs_id = vals['obs_id']
    if obs_id == '':
        raise ValueError("No obs_id value in this row!")

    # We use a scheme based on the path
    #
    #    xxx/data/MAST/obsid/<obs_id>/<hash>
    #    xxx/observation/MAST/obsid/<obs_id>/<hash>
    #
    # where <hash> is a "hash" of the access_url value.
    # This is intended to
    #   - reduce file sizes (e.g. use of slash rather than hash URI)
    #   - be more REST-ful in that we can define properties for parents
    #     of these URIs to manage and merge data
    #   - allow somewhat easier updates in case of changes - e.g. to
    #     the data location because a server changes so access_url
    #     changes but nothing else does
    #
    uri_hash = base64.urlsafe_b64encode(access_url[::-1])
    daturi = mkURI("/obsv/data/MAST/obsid/{0}/".format(obs_id), uri_hash)
    #obsuri = mkURI("/obsv/observation/MAST/obsid/{0}/".format(obs_id), uri_hash)
    obsuri = mkURI("/obsv/observation/MAST/obsid/{0}/".format(obs_id))
    graph = Graph()

    # Can we assume this is a SimpleObservation or could it be a
    # ComplexObservation? Not convinced we can tell, so
    # use the parent Observation class for now.
    #
    #gadd(graph, obsuri, a, adsobsv.SimpleObservation)
    gadd(graph, obsuri, a, adsobsv.Observation)

    # For now assuming we have a Datum rather than a DataSet;
    # we could use the parent SingularDataProduct but try
    # this.
    #
    gadd(graph, daturi, a, adsobsv.Datum)
    #gadd(graph, daturi, a, adsobsv.SingularDataProduct)

    #gadd(graph, obsuri, adsobsv.hasDatum, daturi)
    gadd(graph, obsuri, adsobsv.hasDataProduct, daturi)

    #gadd(graph, daturi, adsobsv.forSimpleObservation, obsuri)
    gadd(graph, daturi, adsobsv.forObservation, obsuri)

    # Question: should we use obs_id for both here?
    gadd(graph, obsuri, adsobsv.observationId, Literal(obs_id))
    gadd(graph, daturi, adsobsv.dataProductId, Literal(obs_id))

    ### Observational properties
    #
    emmin = vals['em_min']
    emmax = vals['em_max']

    addVals(
        graph,
        obsuri,
        [
            adsbase.atTime,
            vals['date_obs'],
            asDateTime(),
            # not convinced that observedTime is worth it, as an xsd:duration
            adsobsv.observedTime,
            vals['t_exptime'],
            asDuration,
            adsobsv.tExptime,
            vals['t_exptime'],
            asDouble,
            adsobsv.resolution,
            vals['s_resolution'],
            asDouble,
            adsobsv.tResolution,
            vals['t_resolution'],
            asDouble,
            adsobsv.wavelengthStart,
            emmin,
            asDouble,
            adsobsv.wavelengthEnd,
            emmax,
            asDouble,
            adsbase.title,
            vals['title'],
            Literal,
            adsobsv.fov,
            vals['s_fov'],
            asDouble,
        ])

    # For now we create a URI for each target_name and make
    # it an AstronomicalSourceName. We know that this is not
    # always "sensible", in that some names are not sources as
    # such but calibration values (e.g. '20% UV FLOOD' or
    # 'NULL SAFETY RD') or some scheme of the observer
    # which may be positional or something else (e.g. '+014381').
    #
    tname = vals['target_name'].strip()
    if tname != '':
        tnameuri = mkURI("/obsv/target/MAST/", tname)

        gadd(graph, obsuri, adsbase.target, tnameuri)
        addVals(graph, tnameuri, [
            a,
            adsobsv.AstronomicalSourceName,
            None,
            adsbase.name,
            tname,
            Literal,
        ])

    # We do not use the em_domain field since the values found in
    # the MAST table do not appear to match the ObsCore/VODataService
    # enumerations. Instead we create values based on the em_min/max
    # fields. These could be inferred but worth being explicit here.
    #
    for domain in getEMDomains(float(emmin), float(emmax)):
        addVal(graph, obsuri, adsobsv.wavelengthDomain, domain)

    sra = vals['s_ra']
    sdec = vals['s_dec']
    if sra != '' and sdec != '':
        gdbnadd(graph, obsuri, adsobsv.associatedPosition, [
            a,
            adsobsv.Pointing,
            adsobsv.ra,
            asDouble(sra),
            adsobsv.dec,
            asDouble(sdec),
        ])

    sregion = vals['s_region']
    if sregion != '':
        predList = [
            a,
            adsobsv.FootPrint,
            adsobsv.s_region,
            Literal(sregion),
        ]

        gdbnadd(graph, obsuri, adsobsv.associatedFootprint, predList)

    # TODO:
    #   - work out what prefix to use; for now guessing uri_conf is okay,
    #     since that's what the Chandra pipeline uses, but would uri_obsv be
    #     better? Alternatively, move to a scheme more like the other URIs
    #     we create here
    #
    tname = vals['telescope_name']
    iname = vals['instrument']
    oname = "MAST"
    gadd(graph, obsuri, adsobsv.atObservatory,
         addFragment(uri_infra, 'observatory/' + oname))
    if tname != '':
        gadd(graph, obsuri, adsobsv.atTelescope,
             addFragment(uri_infra, 'telescope/MAST_' + tname))

    if iname != '':
        gadd(graph, obsuri, adsbase.usingInstrument,
             addFragment(uri_infra, 'instrument/MAST_' + iname))

    ### Data set properties
    #
    gadd(graph, daturi, adsobsv.dataURL, URIRef(access_url))
    #BUG: fix this to use a mapper
    dprodtype = "image"  #DEFAULT
    if vals['dataproduct_type'].find("Spectrum.") != -1:
        dprodtype = "spectra"
    elif vals['dataproduct_type'].find("Image.") != -1:
        dprodtype = "image"
    addVals(
        graph,
        daturi,
        [
            pav.createdOn,
            vals['creation_date'],
            asDateTime(),
            adsobsv.calibLevel,
            vals['calib_level'],
            asInt,
            adsbase.dataType,
            dprodtype,
            Literal,  # could be a URI; how standardised are the values?
            adsobsv.dataFormat,
            vals['access_format'],
            Literal,  # could be a URI; how standardised are the values?
        ])

    # Adding a link to the IVOA identifier for completeness.
    # Since this is the dataset identifier, we link it to the
    # dataset rather than the observation.
    #
    gadd(graph, daturi, adsbase.hasIVOAIdentifier,
         URIRef(vals['obs_publisher_did']))

    # The scheme for creator and collection URI is
    #
    #    xxx/creator/MAST/<obs_creator_name>
    #    xxx/collection/MAST/<obs_collection>
    #
    # although <obs_collection> can be an IVOA identifier, which means
    # we use that instead; this breaks the linked-data approach, so perhaps
    # need a predicate to say "this represents this IVOA id" (could
    # use owl:sameAs but not convinced we want this).
    #
    #   - should I replace / by some other character since it could
    #     confuse some parsers? Replace space with ?
    #
    #   - upper case all characters under the assumption that case
    #     is not important and that there may be differences in case
    #
    cname = vals['obs_creator_name']
    if cname != '':
        # Is this correct; ie is the obs_creator_name really
        # the same as observationMadeBy?
        #
        #cnameuri = mkURI("/obsv/creator/MAST/", cname)
        cnameuri = addFragment(uri_conf, 'project/MAST_' + cname)
        gadd(graph, obsuri, adsobsv.observationMadeBy, cnameuri)
        #gdadd(graph, cnameuri, [
        #    a, agent.PersonName,
        #    agent.fullName, Literal(cname)
        #    ])

    ocoll = vals['obs_collection']
    if ocoll != '':
        if is_ivoa_uri(ocoll):
            colluri = URIRef(ocoll)
        else:
            colluri = mkURI("/obsv/collection/MAST/", ocoll)

        addVal(graph, daturi, adsobsv.fromDataCollection, colluri)
        gdadd(graph, colluri,
              [a, adsobsv.DataCollection, adsbase.name,
               Literal(ocoll)])

    return graph
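addVals, addVal, gadd, gdadd and gdbnadd are helpers from the surrounding
module. From the call sites above, addVals evidently reads its list in strides
of three (predicate, raw cell value, converter, where None means "use as-is").
A compatible sketch, under those assumptions rather than the project's actual
implementation:

def addVals(graph, subj, items):
    # items is a flat list: pred1, raw1, conv1, pred2, raw2, conv2, ...
    for i in range(0, len(items), 3):
        pred, raw, conv = items[i:i + 3]
        if raw is None or raw == '':
            continue  # skip empty cells rather than emit bad triples
        obj = raw if conv is None else conv(raw)
        gadd(graph, subj, pred, obj)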
Example #8
def getObsCoreFile(fname):
    """
    Takes a psv-format file and returns a dict whose keys are the filenames to
    use and whose values are arrays of tuples for that filename. Each tuple is
    (vals, anbool), where anbool says whether the info is taken from the
    corresponding vals; vals is the psv row converted into a dict.
    """
    
    (rdr, fh) = open_obscore(fname)

    rnum = 0
    rpass = 0

    idx = 1
    
    globalrowdict={}
    h_at={}
    for row in rdr:
        vals=row2dict(row)
        obs_id = vals['obs_id']
        at_time="_".join(vals['date_obs'].split())
        if obs_id == '':
            raise ValueError("No obs_id value in this row!")
        access_url = vals['access_url']
        access_name=access_url.split('/')[-1].split('_ph_')[0]
        if access_name.find('_sum') !=-1:
            access_name=access_name.split('_sum')[0]
        anbool=1
        if access_name.find('_imcscor')!=-1:
            access_name=access_name.split('_imcscor')[0]
            anbool=0
        #print "access url", access_url
        if access_url.strip() == '':
            raise ValueError("Empty access_url for row")
        dayfind=access_url.find(obs_id+"_d")
        nightfind=access_url.find(obs_id+"_n")
        afind=access_url.find(obs_id+"_a")
        if dayfind!=-1:
            d2key=obs_id+"_d"
            #dkey=obs_id  # let's not do day separately
        elif afind!=-1:
            d2key=obs_id+"_a"
        elif nightfind!=-1:
            d2key=obs_id+"_a"
        else:
            d2key=obs_id
            
        #dkey=obs_id+"--"+access_name
        dkey=obs_id
        if not globalrowdict.has_key(dkey):
            globalrowdict[dkey]=[]
        globalrowdict[dkey].append((vals, at_time, access_name, d2key, anbool))
    
    #print "LLLLL"    
    for dkey in globalrowdict.keys():
        print "grd", dkey, len(globalrowdict[dkey])
        dalen=len(globalrowdict[dkey])
        h_an={}
        for ele in globalrowdict[dkey]:
            vals, at_time, access_name, d2key, anbool=ele
            print "time",at_time, dkey, access_name, anbool
            if not h_an.has_key(access_name):
                h_an[access_name]=[]
            if anbool==1 or dalen==1:
                h_an[access_name].append((ele, at_time))
            else:
                h_an[access_name].append((ele, None))
        #print "han", h_an
        h_an2={}        
        for item in h_an.keys():
            #print "hanitem", h_an[item]
            thetimelist=[e[1] for e in h_an[item] if e[1]!=None]
            if len(thetimelist)>=1:
                thetime=thetimelist[0]
            else:
                #This happens, e.g., in pupaeast when there is only imcscor
                print "OOOOOOOOOOOOOOOPS", len(thetimelist)
                thetime = None  # no usable time for this group; skipped below
            h_an2[item]=[(e[0],thetime) for e in h_an[item]]
        print "deekee",dkey
        for k in h_an2.keys():
            for item in h_an2[k]:
                #print "<<<",item[0][0],">>>"
                if item[1] is None:
                    continue  # no time was recovered for this group
                key = dkey + "=" + item[1]
                if not h_at.has_key(key):
                    h_at[key] = []
                h_at[key].append((item[0][0], item[0][4]))
                #add the anbool and vals in here:
                #anbool tells you which row contains the information we ought to use
                #(in this case not imcscor); in the default case anbool=1 for everything.

    fh.close()
    return h_at