Example #1
    def find_data_entries(self,
                          dataset,
                          offset,
                          limit,
                          start_time=None,
                          end_time=None):
        try:
            with self.connection() as repo:
                start_time = dam.format_time(
                    start_time) if start_time is not None else None
                end_time = dam.format_time(
                    end_time) if end_time is not None else None
                dam_objs = repo.retrieve_tuples("data",
                                                dataset=dataset.repository_id,
                                                offset=offset,
                                                limit=limit,
                                                startTime=start_time,
                                                endTime=end_time)
        except dam.DAMException as e:
            logger.exception("Exception while getting data entries")
            raise PersistenceError("Error getting data entries: %s" % (str(e)))

        ret = []
        for dam_obj in dam_objs["results"]:
            data_entry = DataEntry()
            data_entry.id = dam_obj["metadata"]["id"]
            data_entry.dataset = dataset.id
            data_entry.timestamp = parse_timestamp(dam_obj["metadata"]["time"])
            self._copy_attrs(dam_obj["data"], data_entry)
            ret.append(data_entry)
        return SearchResults(ret, dam_objs["offset"], dam_objs["limit"],
                             dam_objs["count"])
Example #2
    def _create_data_entry(self, obs, schema):
        """Internal method for creating the DataEntry domain object from a
        database observation.
        """
        entry = DataEntry()
        entry.dataset = obs.dataset
        entry.id = obs.id
        entry.timestamp = obs.timestamp
        for attr in obs.attrs:
            if isinstance(schema.attrs[attr.name], FileDataType):
                entry[attr.name] = FileObject(f_path=attr.value)
            else:
                entry[attr.name] = attr.value
        return entry
Example #3
    def fetch_observations(self, sos, caps, cwd, ret):
        insert_dir = os.path.join(cwd, "observations")
        if not os.path.exists(insert_dir):
            os.makedirs(insert_dir)

        for observationID in caps.createRangeGenerator():
            if observationID not in self.state['observations']:
                logger.debug("GetObservationByID for %s" % observationID)
                sos_obs = sos.getObservationByID(observationID,
                                                 "om:Observation")
                obs_path = os.path.join(insert_dir, "%s.xml" % observationID)
                with open(obs_path, "wb") as output:
                    output.write(sos_obs.getXMLString())
                    timestamp = sos_obs.getTimestamp()
                    new_data_entry = DataEntry(timestamp=timestamp)
                    new_data_entry[self.field] = FileObject(
                        f_path=obs_path, mime_type=SOSMimeTypes.om_1_0_0)
                    ret.append(new_data_entry)
                self.state['observations'].append(observationID)
                self.state['observation_map'][sos_obs.getSensorID()].append(
                    observationID)
            else:
                logger.debug(
                    "GetObservationByID for %s already retrieved, ignoring." %
                    observationID)
Example #4
    def get_data_entry(self, dataset_id, data_entry_id):
        try:
            with self.connection() as repo:
                dam_obj = repo.getTuples(data_entry_id)
        except dam.DAMException as e:
            logger.exception("Exception while getting data entry")
            raise PersistenceError("Error getting data entry: %s" % (str(e)))

        # No matching tuple was returned for this id.
        if dam_obj is None or len(dam_obj) == 0: return None
        dam_obj = dam_obj[0]
        data_entry = DataEntry()
        data_entry.id = data_entry_id
        data_entry.dataset = dataset_id
        data_entry.timestamp = parse_timestamp(dam_obj["metadata"]["time"])
        self._copy_attrs(dam_obj["data"], data_entry)
        return data_entry
Example #5
    def fetch_single(self, cwd):
        """Fetch a single resource from a URL"""
        req = urllib2.Request(self.url)
        f_out_name = os.path.join(cwd, "outputfile")
        f_in = None
        try:
            f_in = urllib2.urlopen(req)
            timestamp = parse_timestamp_rfc_2822(f_in.headers["Last-Modified"]) if "Last-Modified" in f_in.headers \
                else datetime.datetime.now()
            with open(f_out_name, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)

            self.state["lasttime"] = format_timestamp(timestamp)
        finally:
            if f_in is not None: f_in.close()
        new_data_entry = DataEntry(timestamp=timestamp)

        # Derive a download file name from the last URL path segment, if possible.
        file_name = None
        try:
            file_name = self.url.split("/")[-1]
        except Exception:
            pass

        new_data_entry[self.field] = FileObject(f_path="outputfile",
                                                mime_type="",
                                                file_name=file_name)

        return [new_data_entry]
Example #6
    def get_data_entry(self, dataset_id, data_entry_id):
        try:
            with self.connection() as repo:
                dam_obj = repo.getTuples(data_entry_id)
        except dam.DAMException as e:
            logger.exception("Exception while getting data entry")
            raise PersistenceError("Error getting data entry: %s" % (str(e)))

        if dam_obj is None or len(dam_obj) == 0: return None
        dam_obj = dam_obj[0]
        data_entry = DataEntry()
        data_entry.id = data_entry_id
        data_entry.dataset = dataset_id
        data_entry.timestamp = parse_timestamp(dam_obj["metadata"]["time"])
        self._copy_attrs(dam_obj["data"], data_entry)
        return data_entry
Example #7
    def fetch(self, cwd, service=None):
        with open(os.path.join(cwd, "file1"), "w") as f:
            f.write("2,55\n3,2\n")

        data_entry = DataEntry(timestamp=datetime.datetime.now())
        data_entry["file1"] = FileObject("file1")

        return [data_entry]
Example #8
    def find_data_entries(self, dataset, offset, limit, start_time=None, end_time=None):
        try:
            with self.connection() as repo:
                start_time = dam.format_time(start_time) if start_time is not None else None
                end_time = dam.format_time(end_time) if end_time is not None else None
                dam_objs = repo.retrieve_tuples("data", dataset=dataset.repository_id,
                                                offset=offset, limit=limit,
                                                startTime=start_time, endTime=end_time)
        except dam.DAMException as e:
            logger.exception("Exception while getting data entries")
            raise PersistenceError("Error getting data entries: %s" % (str(e)))

        ret = []
        for dam_obj in dam_objs["results"]:
            data_entry = DataEntry()
            data_entry.id = dam_obj["metadata"]["id"]
            data_entry.dataset = dataset.id
            data_entry.timestamp = parse_timestamp(dam_obj["metadata"]["time"])
            self._copy_attrs(dam_obj["data"], data_entry)
            ret.append(data_entry)
        return SearchResults(ret, dam_objs["offset"], dam_objs["limit"], dam_objs["count"])
Example #9
    def testScript(self):
        file1 = "1\n2\n"
        with open(os.path.join(self.cwd, "file1"), "w") as f:
            f.write(file1)
        data_entry = DataEntry(timestamp=datetime.datetime.now())
        data_entry["file1"] = FileObject("file1")

        script = """def process(cwd, data_entry):
    return [data_entry, None, None]
"""
        new_entries = run_script(script, self.cwd, data_entry)

        self.assertEquals(3, len(new_entries))
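The embedded script above shows the minimal process(cwd, data_entry) contract: the processing script receives the working directory and the incoming DataEntry, and returns a list of entries. A slightly more substantial, purely illustrative script under the same contract might filter out entries that carry no attributes (data is the attribute dictionary seen in Example #12):

    script = """def process(cwd, data_entry):
        # Pass through entries that carry at least one attribute; drop the rest.
        if data_entry is not None and data_entry.data:
            return [data_entry]
        return []
    """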
Example #10
    def test_file_object_roundtrip(self):
        """The file object should marshall everything but the file stream"""
        data_entry = DataEntry(1)
        data_entry["temp"] = FileObject(f_path=os.path.join(
            os.path.dirname(jcudc24ingesterapi.__file__),
            "tests/test_ingest.xml"),
                                        mime_type="text/xml")

        data_entry_dto = self.marshaller.obj_to_dict(data_entry)
        self.assertEqual("text/xml",
                         data_entry_dto["data"]["temp"]["mime_type"])

        data_entry_domain = self.marshaller.dict_to_obj(data_entry_dto)
        self.assertEqual("text/xml", data_entry_domain["temp"].mime_type)
Example #11
    def fetch(self, cwd, service=None):
        """Scans a folder to find new files. The filenames are UTC timestamps that used
        as the timestamp for these samples.
        
        :param cwd: working directory to place binary data
        :returns: dict containing the data to be ingested
        """
        if not hasattr(self, "path"):
            raise DataSourceError("Path not set")
        if not os.path.exists(self.path):
            raise DataSourceError("Could not find the staging path")

        start_time = datetime.datetime.utcnow()

        # Time of the previous run, parsed from the saved state (if any).
        since = None
        if "lasttime" in self.state and self.state["lasttime"] is not None and len(
                self.state["lasttime"]) > 0:
            since = calendar.timegm(
                parse_timestamp(self.state["lasttime"]).timetuple())

        ret = []
        for f_name in os.listdir(self.path):
            timestamp = self.match_filename(f_name)
            if timestamp is None: continue

            logger.debug("%s %s" % (str(timestamp), f_name))

            new_filename = "file-" + f_name
            if self.archive is not None:
                shutil.copyfile(os.path.join(self.path, f_name),
                                os.path.join(self.archive, f_name))
            shutil.move(os.path.join(self.path, f_name),
                        os.path.join(cwd, new_filename))
            #timestamp = datetime.datetime.utcfromtimestamp(int(m.group(1)))
            new_data_entry = DataEntry(timestamp=timestamp)
            new_data_entry[self.field] = FileObject(f_path=new_filename,
                                                    file_name=f_name,
                                                    mime_type="")
            ret.append(new_data_entry)

        self.state["lasttime"] = format_timestamp(
            since) if since != None else None

        return ret
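Example #11 relies on a match_filename helper that is not shown. A minimal sketch of what it might do, assuming the file names are plain UTC epoch seconds as the commented-out utcfromtimestamp line above suggests (the regex and method body are illustrative assumptions, not the real implementation):

    _RE_EPOCH_NAME = re.compile(r"^(\d+)$")  # assumed: file name is epoch seconds

    def match_filename(self, f_name):
        """Return the UTC timestamp encoded in the file name, or None to skip it."""
        m = _RE_EPOCH_NAME.match(f_name)
        if m is None:
            return None
        return datetime.datetime.utcfromtimestamp(int(m.group(1)))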
Example #12
    def test_data_entry(self):
        dt = datetime.datetime.utcfromtimestamp(1357788112)
        dt = dt.replace(tzinfo=jcudc24ingesterapi.UTC)

        data_entry = DataEntry(1, dt)
        data_entry["temp"] = 1.2

        data_entry_dto = self.marshaller.obj_to_dict(data_entry)
        self.assertEquals("2013-01-10T03:21:52.000Z",
                          data_entry_dto["timestamp"])
        self.assertEquals(1, data_entry_dto["dataset"])
        self.assertEquals(1.2, data_entry_dto["data"]["temp"])

        data_entry_return = self.marshaller.dict_to_obj(data_entry_dto)
        self.assertEquals(data_entry.timestamp, data_entry_return.timestamp)
        self.assertEquals(data_entry.dataset, data_entry_return.dataset)
        self.assertEquals(data_entry.data["temp"],
                          data_entry_return.data["temp"])
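Read together, the assertions above pin down the marshalled form of this simple DataEntry. As a rough sketch covering only the fields exercised by this test (the real DTO may carry additional keys):

    expected_dto = {
        "dataset": 1,                             # dataset id passed to DataEntry(1, dt)
        "timestamp": "2013-01-10T03:21:52.000Z",  # UTC timestamp as an ISO-8601 string
        "data": {"temp": 1.2},                    # attribute values keyed by name
    }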
Example #13
    def test_parent_schemas(self):
        """This test creates a nested schema with attributes provided at 2
        different levels. A data entry is saved, and then retrieved, and the
        values tested.
        """
        loc1 = self.ingester_platform.post(
            Location(11.0, 11.0, "Test Site", 100))

        temp_work = self.ingester_platform.createUnitOfWork()
        temperature_schema = DataEntrySchema("Test Temp Schema")
        temperature_schema.addAttr(Double("temperature"))
        temp_work.post(temperature_schema)
        temp_work.commit()

        air_temperature_schema = DataEntrySchema("Air Temp Schema")
        air_temperature_schema.extends = [temperature_schema.id]
        air_temperature_schema = self.ingester_platform.post(
            air_temperature_schema)

        instrument_schema = DataEntrySchema("Instrument Schema")
        instrument_schema.extends = [air_temperature_schema.id]
        instrument_schema.addAttr(Double("var2"))
        instrument_schema = self.ingester_platform.post(instrument_schema)

        dataset = Dataset(location=loc1.id, schema=instrument_schema.id)
        dataset = self.ingester_platform.post(dataset)

        work = self.ingester_platform.createUnitOfWork()
        data_entry = DataEntry(dataset.id, datetime.datetime.now())
        data_entry["temperature"] = 10
        data_entry["var2"] = 11
        work.post(data_entry)
        work.commit()

        data_entry_ret = self.ingester_platform.getDataEntry(
            dataset.id, data_entry.id)

        self.assertEquals(data_entry["temperature"],
                          data_entry_ret["temperature"])
        self.assertEquals(data_entry["var2"], data_entry_ret["var2"])
Example #14
    def fetch_sensorml(self, sos, caps, cwd, ret):
        sensorIDS = caps.getSensorIDs()
        sensorml_dir = os.path.join(cwd, "sensorml")
        if not os.path.exists(sensorml_dir):
            os.makedirs(sensorml_dir)

        for sensorID in sensorIDS:
            if sensorID not in self.state['sensorml']:
                logger.debug("Getting SensorML for %s" % sensorID)
                sml = sos.describeSensor(sensorID)
                sml_path = os.path.join(sensorml_dir, sensorID)
                with open(sml_path, "wb") as sensorml:
                    sensorml.write(sml.getXMLString())
                    timestamp = datetime.datetime.now()
                    new_data_entry = DataEntry(timestamp=timestamp)
                    new_data_entry[self.field] = FileObject(
                        f_path=sml_path, mime_type=SOSMimeTypes.sensorML_1_0_1)
                    ret.append(new_data_entry)
                self.state['sensorml'].append(sensorID)
            else:
                logger.debug("SensorML for %s already exists, ignoring." %
                             sensorID)
Example #15
    def fetch(self, cwd, service=None):
        time.sleep(20)
        return [DataEntry(timestamp=datetime.datetime.now())]
Example #16
    def test_data_types(self):
        schema1 = DatasetMetadataSchema("schema1")
        schema1.addAttr(FileDataType("file"))
        schema1a = self.service.persist(schema1)

        self.assertEquals(1, len(schema1a.attrs))

        schema2 = DataEntrySchema("schema2")
        schema2.addAttr(FileDataType("file"))
        schema2.addAttr(Double("x"))
        schema2a = self.service.persist(schema2)

        loc = Location(10.0, 11.0)
        loca = self.service.persist(loc)

        dataset = Dataset()
        dataset.schema = schema1a.id
        dataset.location = loca.id
        # We're trying to use a dataset_metadata schema, so this should fail
        self.assertRaises(ValueError, self.service.persist, dataset)

        dataset.schema = schema2a.id
        # Now we're using the correct type of schema
        dataset1a = self.service.persist(dataset)

        dataset1b = self.service.get_dataset(dataset1a.id)
        self.assertEquals(dataset1a.id, dataset1b.id)
        self.assertDictEqual(dataset1a.__dict__, dataset1b.__dict__)

        # Update and add a data source
        dataset1b.data_source = PullDataSource(
            "http://www.abc.net.au",
            None,
            recursive=False,
            field="file",
            processing_script="TEST",
            sampling=PeriodicSampling(10000))
        dataset1b.enabled = True
        dataset1c = self.service.persist(dataset1b)
        self.assertNotEqual(None, dataset1c.data_source)
        self.assertEqual("TEST", dataset1c.data_source.processing_script)
        self.assertNotEqual(None, dataset1c.data_source.sampling)

        datasets = self.service.get_active_datasets()
        self.assertEquals(1, len(datasets))
        self.assertNotEqual(None, datasets[0].data_source)
        self.assertEqual("TEST", datasets[0].data_source.processing_script)
        self.assertNotEqual(None, datasets[0].data_source.sampling)

        # Test with criteria
        datasets = self.service.get_active_datasets(kind="pull_data_source")
        self.assertEquals(1, len(datasets))

        datasets = self.service.get_active_datasets(kind="push_data_source")
        self.assertEquals(0, len(datasets))

        schema1b = self.service.get_schema(schema1a.id)
        self.assertEquals(schema1a.id, schema1b.id)

        datasets = self.service.search("dataset")
        self.assertEquals(1, len(datasets))

        schemas = self.service.search("data_entry_schema")
        self.assertEquals(1, len(schemas))

        schemas = self.service.search("dataset_metadata_schema")
        self.assertEquals(1, len(schemas))

        locs = self.service.search("location")
        self.assertEquals(1, len(locs))

        # Test ingest
        data_entry_1 = DataEntry(dataset1b.id, datetime.datetime.now())
        data_entry_1['x'] = 27.8
        data_entry_1 = self.service.persist(data_entry_1)
        self.assertIsNotNone(data_entry_1.id)
Example #17
    def test_api_usage(self):
        #       User data that is created by filling out the provisioning interface workflow steps.
        #   General
        title = "Test project"
        data_manager = "A Person"
        project_lead = "Another Person"

        #   Metadata
        project_region = Region("Test Region",
                                ((1, 1), (2, 2), (2, 1), (1, 1)))

        #   Methods & Datasets
        loc1 = Location(11.0, 11.0, "Test Site", 100)
        loc2 = Location(11.0, 11.0, "Test Site", 100)
        loc3 = Location(12.0, 11.0, "Test Site", 100)

        temp_work = self.ingester_platform.createUnitOfWork()
        temperature_schema = DataEntrySchema("Test Temp Schema")
        temperature_schema.addAttr(Double("temperature"))
        temp_work.post(temperature_schema)
        temp_work.commit()

        air_temperature_schema = DataEntrySchema("Air Temp Schema")
        air_temperature_schema.extends = [temperature_schema.id]
        air_temperature_schema = self.ingester_platform.post(
            air_temperature_schema)

        second_level_inheritence_schema = DataEntrySchema("Second Inheritence")
        second_level_inheritence_schema.extends = [air_temperature_schema.id]
        second_level_inheritence_schema = self.ingester_platform.post(
            second_level_inheritence_schema)

        # Check the name is set
        temperature_schema_1 = self.ingester_platform.getSchema(
            temperature_schema.id)
        self.assertIsNotNone(temperature_schema.name)
        self.assertEquals(temperature_schema.name, temperature_schema_1.name)

        file_schema = DataEntrySchema()
        file_schema.addAttr(FileDataType("file"))
        file_schema = self.ingester_platform.post(file_schema)

        dataset1 = Dataset(location=None, schema=temperature_schema.id)
        dataset2 = Dataset(
            location=None,
            schema=file_schema.id,
            data_source=PullDataSource(
                "http://test.com",
                "file_handle",
                processing_script="file://d:/processing_scripts/awsome_processing.py"))

        #        dataset3 = Dataset(None, file_schema, PullDataSource("http://test.com", "file_handle"), CustomSampling("file://d:/sampling_scripts/awsome_sampling.py"), "file://d:/processing_scripts/awsome_processing.py")

        self.cleanup_files.append(dataset2.data_source.processing_script)
        #        self.cleanup_files.push(dataset3.sampling.script)
        #        self.cleanup_files.push(dataset3.processing_script)

        #       Provisioning admin accepts the submitted project
        work = self.ingester_platform.createUnitOfWork()

        work.post(project_region)  # Save the region

        loc1.region = project_region.id  # Set the datasets location to use the projects region
        work.post(loc1)  # Save the location
        dataset1.location = loc1.id  # Set the datasets location
        work.post(dataset1)  # Save the dataset

        loc2.region = project_region.id
        work.post(loc2)
        dataset2.location = loc2.id
        work.post(dataset2)

        work.commit()

        # Region, location and dataset id's will be saved to the project within the provisioning system in some way

        #       User searches for datasets

        # TODO: Nigel? - Define searching api
        found_dataset_id = dataset1.id  # The dataset that has an extended file schema

        #       User manually enters data
        timestamp = datetime.datetime.now()
        data_entry_1 = DataEntry(found_dataset_id, timestamp)
        data_entry_1['temperature'] = 27.8  # Add the extended schema items
        data_entry_1 = self.ingester_platform.post(data_entry_1)
        self.assertIsNotNone(data_entry_1.id)

        timestamp2 = timestamp + datetime.timedelta(seconds=1)
        data_entry_2 = DataEntry(found_dataset_id, timestamp2)
        data_entry_2['temperature'] = 27.8  # Add the extended schema items
        data_entry_2 = self.ingester_platform.post(data_entry_2)

        self.assertEquals(
            2,
            len(
                self.ingester_platform.search(
                    DataEntrySearchCriteria(found_dataset_id), 0, 10).results))
        result = self.ingester_platform.search(
            DataEntrySearchCriteria(found_dataset_id), 0, 1)
        self.assertEquals(2, result.count)
        self.assertEquals(1, len(result.results))
        self.assertEquals(
            1,
            len(
                self.ingester_platform.search(
                    DataEntrySearchCriteria(found_dataset_id), 1, 1).results))

        result = self.ingester_platform.search(
            DataEntrySearchCriteria(found_dataset_id), 2, 1)
        self.assertEquals(0, len(result.results))

        self.assertEquals(
            0,
            len(
                self.ingester_platform.search(
                    DataEntrySearchCriteria(found_dataset_id,
                                            end_time=timestamp -
                                            datetime.timedelta(seconds=60)), 0,
                    10).results))
        self.assertEquals(
            0,
            len(
                self.ingester_platform.search(
                    DataEntrySearchCriteria(found_dataset_id,
                                            start_time=timestamp +
                                            datetime.timedelta(seconds=60)), 0,
                    10).results))
        self.assertEquals(
            2,
            len(
                self.ingester_platform.search(
                    DataEntrySearchCriteria(
                        found_dataset_id,
                        start_time=timestamp - datetime.timedelta(seconds=60),
                        end_time=timestamp + datetime.timedelta(seconds=60)),
                    0, 10).results))

        work = self.ingester_platform.createUnitOfWork()
        data_entry_3 = DataEntry(dataset2.id, datetime.datetime.now())
        data_entry_3['file'] = FileObject(
            f_handle=open(os.path.join(os.path.dirname(jcudc24ingesterapi.__file__),
                                       "tests/test_ingest.xml")),
            mime_type="text/xml")
        work.post(data_entry_3)
        work.commit()
        self.assertIsNotNone(data_entry_3.id)

        f_in = self.ingester_platform.getDataEntryStream(
            dataset2.id, data_entry_3.id, "file")
        self.assertIsNotNone(f_in)
        data = f_in.read()
        f_in.close()
        self.assertLess(0, len(data), "Expected data in file")

        #       User enters quality assurance metadata
        quality_metadata_schema = DatasetMetadataSchema()
        quality_metadata_schema.addAttr(String("unit"))
        quality_metadata_schema.addAttr(String("description"))
        quality_metadata_schema.addAttr(Double("value"))
        quality_metadata_schema = self.ingester_platform.post(
            quality_metadata_schema)

        entered_metadata = DatasetMetadataEntry(data_entry_1.dataset,
                                                quality_metadata_schema.id)
        entered_metadata['unit'] = "%"
        entered_metadata['description'] = "Percent error"
        entered_metadata['value'] = 0.98

        entered_metadata = self.ingester_platform.post(entered_metadata)

        # Now find that metadata
        results = self.ingester_platform.search(
            DatasetMetadataSearchCriteria(data_entry_1.dataset), 0, 10).results
        self.assertEqual(1, len(results))

        data_entry_md_schema = DataEntryMetadataSchema("test")
        data_entry_md_schema.addAttr(String("description"))
        data_entry_md_schema.addAttr(Double("value"))
        data_entry_md_schema = self.ingester_platform.post(
            data_entry_md_schema)
        calibration = DataEntryMetadataEntry(
            metadata_schema_id=int(data_entry_md_schema.id),
            dataset_id=dataset2.id,
            object_id=data_entry_3.id)
        calibration["description"] = "Test"
        calibration["value"] = 1.2

        calibration2 = DataEntryMetadataEntry(
            metadata_schema_id=int(data_entry_md_schema.id),
            dataset_id=dataset2.id,
            object_id=data_entry_3.id)
        calibration2["description"] = "Test2"
        calibration2["value"] = 2.3
        calibration2 = self.ingester_platform.post(calibration2)

        calibrations = self.ingester_platform.search(
            DataEntryMetadataSearchCriteria(int(81), int(3648)),
            offset=0,
            limit=1000)
        self.assertEquals(1, len(calibrations.results))
        self.assertEquals(calibrations.results[0].schema_id,
                          data_entry_md_schema.id)

        self.ingester_platform.delete(calibration2)
        self.ingester_platform.delete(calibration)
        self.ingester_platform.delete(data_entry_md_schema)

        #       User changes sampling rate
        # FIXME: This test is going to be changed to be done by editing the dataset
        #        sampling_rate_changed = Metadata(dataset1.id, type(dataset1), SampleRateMetadataSchema())
        #        sampling_rate_changed.change_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
        #        sampling_rate_changed.sampling = CustomSampling("file://d:/sampling_scripts/awsome_sampling.py")
        #
        #        try:
        #            sampling_rate_changed = self.ingester_platform.post(sampling_rate_changed)
        #            assert(sampling_rate_changed.metadata_id is None, "Sampling rate change failed")
        #        except:
        #            assert(True, "Sampling rate change failed")

        #       User wants some random metadata specific to their project
        # FIXME: Not sure what use case this is trying to demonstrate
        #        random_metadata_schema =  DataEntryMetadataSchema()
        #        random_metadata_schema.addAttr('random_field', Double())

        #        random_metadata = Metadata(data_entry.data_entry_id, type(data_entry), random_metadata_schema)
        #        random_metadata.random_field = 1.5

        #        try:
        #            random_metadata = self.ingester_platform.post(random_metadata)
        #            assert(random_metadata.metadata_id is None, "random_metadata failed")
        #        except:
        #            assert(True, "random_metadata failed")

        #       User changes the data source of the dataset
        new_data_source = PullDataSource("http://test.com/new_data",
                                         "file_handle")
        dataset1.data_source = new_data_source
        dataset1 = self.ingester_platform.post(dataset1)
        self.assertNotEqual(None, dataset1)

        #       External, 3rd party searches for data
        # TODO: external 3rd parties should be able to use the api to get data without authentication
        # TODO: I'm not sure exactly how this should work, but the search api could be open access (need spam limitations or something?)

        #       Project is disabled/finished
        work = self.ingester_platform.createUnitOfWork()
        work.disable(dataset1.id)
        work.disable(dataset2.id)
        work.commit()

        #       Project is obsolete and data should be deleted
        work = self.ingester_platform.createUnitOfWork()
        work.delete(dataset1.id)
        work.delete(dataset2.id)
        work.commit()
Example #18
    def fetch_http(self, cwd):
        """Recursively fetch from an HTTP server.
        """
        RE_A = re.compile("href=\"(\./){0,1}([0-9A-Za-z\-_\.\:]+)\"")
        req = urllib2.Request(self.url)
        ret = []

        since = None
        if "lasttime" in self.state and self.state["lasttime"] is not None and len(
                self.state["lasttime"]) > 0:
            since = eut.formatdate(calendar.timegm(
                parse_timestamp(self.state["lasttime"]).timetuple()),
                                   usegmt=True)

        # Track the newest Last-Modified value seen this run separately from the
        # If-Modified-Since header string built above.
        latest = None
        f_in = None
        try:
            f_index = urllib2.urlopen(req)
            index_page = f_index.read()
            f_index.close()
            urls = RE_A.findall(index_page)
            found = 0

            RE_FILENAME = None if self.pattern is None else re.compile(
                self.pattern)
            for url_part in urls:
                if RE_FILENAME is not None and RE_FILENAME.match(
                        url_part[1]) is None:
                    continue

                url = urlparse.urljoin(self.url, url_part[0] + url_part[1])
                req = urllib2.Request(url)
                if since is not None: req.add_header("If-Modified-Since", since)
                try:
                    f_in = urllib2.urlopen(req)
                    f_out_name = os.path.join(cwd, "outputfile%d" % found)
                    timestamp = parse_timestamp_rfc_2822(
                        f_in.headers["Last-Modified"])
                    with open(f_out_name, "wb") as f_out:
                        shutil.copyfileobj(f_in, f_out)
                    new_data_entry = DataEntry(timestamp=timestamp)
                    # Derive a file name from the last URL path segment, if possible.
                    file_name = None
                    try:
                        file_name = url_part[1].split("/")[-1]
                    except Exception:
                        pass
                    new_data_entry[self.field] = FileObject(
                        f_path="outputfile%d" % found,
                        mime_type="",
                        file_name=file_name)
                    ret.append(new_data_entry)
                    found += 1

                    if latest is None or timestamp > latest:
                        latest = timestamp

                except urllib2.HTTPError as e:
                    if e.code == 304:
                        continue
        finally:
            if f_in is not None: f_in.close()

        # Persist the newest timestamp seen; keep the previous value if nothing
        # new was fetched.
        self.state["lasttime"] = format_timestamp(
            latest) if latest is not None else self.state.get("lasttime")
        return ret