Example No. 1
    def preprocess_memex_data_sources(self, folder_path):
        source_map = OrderedDict()
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            print(file_path)
            with open(file_path, "r") as f:
                for json_line in f.readlines():
                    json_obj = json.loads(json_line)
                    source_name = json_obj["tld"]

                    if source_name not in source_map:
                        source_map[source_name] = Source(source_name)

                    source = source_map[source_name]

                    for attr in json_obj:
                        if attr.startswith("inferlink"):
                            attr_name = attr.split("_")[1]
                            if attr_name not in source.column_map:
                                source.column_map[attr_name] = Column(
                                    attr_name, source.name)
                                source.column_map[
                                    attr_name].semantic_type = attr_name
                            for ele1 in json_obj[attr]:
                                if isinstance(ele1["result"], dict):
                                    source.column_map[attr_name].add_value(
                                        ele1["result"]["value"])
                                else:
                                    for ele2 in ele1["result"]:
                                        source.column_map[attr_name].add_value(
                                            ele2["value"])

        for source in source_map.values():
            if source.column_map:
                source.write_csv_file("data/datasets/memex/%s" % source.name)
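The snippet above relies on Source and Column classes that are not shown. A minimal sketch of the interface it assumes, limited to the attributes and methods actually called above (the CSV layout in write_csv_file is a guess):

from collections import OrderedDict
import csv

class Column(object):
    # One attribute of a source: a name, an optional semantic type, and its values.
    def __init__(self, name, source_name):
        self.name = name
        self.source_name = source_name
        self.semantic_type = None
        self.value_list = []

    def add_value(self, value):
        self.value_list.append(value)

class Source(object):
    # A named collection of columns that can be dumped to a CSV file.
    def __init__(self, name):
        self.name = name
        self.column_map = OrderedDict()

    def write_csv_file(self, path):
        # Hypothetical layout: one CSV column per Column, shorter columns padded with "".
        columns = list(self.column_map.values())
        length = max(len(c.value_list) for c in columns)
        with open(path + ".csv", "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow([c.name for c in columns])
            for i in range(length):
                writer.writerow([c.value_list[i] if i < len(c.value_list) else "" for c in columns])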
Example No. 2
def add_semantic_type(column=None, semantic_type=None):
    if not (column and semantic_type):
        column = request.json["column"]
        semantic_type = request.json["semantic_type"]

    column_name = list(column.keys())[0]  # list() needed on Python 3, where dict.keys() is not indexable

    if column and semantic_type and column_name:
        source = Source(column_name)
        source.read_data_from_dict(column)
        source.set_semantic_type(semantic_type, column_name)
        _id = get_new_index_name(semantic_type, column_name)
        source.save(index_config={"name": _id, "size": 0})
        return str(_id)
        """
    def read_data_sources(self, folder_path):
        data_folder_path = os.path.join(folder_path, "data")
        model_folder_path = os.path.join(folder_path, "model")

        for filename in os.listdir(data_folder_path):
            extension = os.path.splitext(filename)[1]

            source = Source(os.path.splitext(filename)[0], self.sc)
            file_path = os.path.join(data_folder_path, filename)

            if extension == ".csv":
                source.read_data_from_csv(file_path)
            elif extension == ".json":
                source.read_data_from_json(file_path)
            elif extension == ".xml":
                source.read_data_from_xml(file_path)
            self.source_map[filename] = source

        for filename in os.listdir(model_folder_path):
            source = self.source_map[os.path.splitext(os.path.splitext(filename)[0])[0]]

            source.read_semantic_type_json(os.path.join(model_folder_path, filename))
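read_data_sources above expects each dataset folder to hold its raw files in a data/ subfolder and its semantic-type descriptions in a model/ subfolder, with each model file named after the data file it describes (the double splitext strips two extensions to recover the data file name used as the source_map key). A hypothetical layout; only the data/model split and the handled extensions come from the code, the file names are made up:

folder_path/
    data/
        museum.csv
        events.json
        players.xml
    model/
        museum.csv.model.json
        events.json.model.json
        players.xml.model.json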
Example No. 4
def add_semantic_type(column=None, semantic_type=None):
    try:
        if not (column and semantic_type):
            column = request.json["column"]
            semantic_type = request.json["semantic_type"]
        logging.info("Adding semantic type: {}".format(semantic_type))
        column_name = list(column.keys())[0]

        if column and semantic_type and column_name:
            source = Source(column_name)
            source.read_data_from_dict(column)
            source.set_semantic_type(semantic_type, column_name)
            _id = get_new_index_name(semantic_type, column_name)
            source.save(index_config={"name": _id, "size": 0})
            resp = jsonify({"index_name": _id})
            resp.status_code = 200
            return resp
    except Exception as e:
        return error("Semantic type adding failed: {}".format(e.args))
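The add_semantic_type views in Examples No. 2 and 4 read their arguments from request.json and, in Example No. 4, answer with jsonify. A hedged client-side sketch, assuming the view is registered under a hypothetical /semantic_type route on a local development server; only the shape of the request body and of the response comes from the code above:

import requests

payload = {
    # A single column keyed by its name (the view takes the first key as column_name).
    "column": {"city_name": ["Los Angeles", "San Diego", "Seattle"]},
    "semantic_type": "City",
}
resp = requests.post("http://localhost:5000/semantic_type", json=payload)
print(resp.status_code)   # 200 on success
print(resp.json())        # e.g. {"index_name": "..."}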
Example No. 5
def add_semantic_type(column=None, semantic_type=None):
    if not (column and semantic_type):
        column = request.json["column"]
        semantic_type = request.json["semantic_type"]


    column_name = list(column.keys())[0]

    if column and semantic_type and column_name:
        source = Source(column_name)
        source.read_data_from_dict(column)
        source.set_semantic_type(semantic_type, column_name)
        _id = get_new_index_name(semantic_type, column_name)
        source.save(index_config={"name": _id, "size": 0})
        return str(_id)
        """
Example No. 6
	def configure(self):
		sources = Config.options('sources')
		if len(sources) < 1:
			raise RuntimeError('At least one Source must be configured!')

		self.mixer = Mixer()

		self.sources = []
		for name, url in Config.items('sources'):
			source = Source(name, url)
			self.mixer.append(source)
			self.sources.append(source)

		self.mixer.configure()

		if Config.has_option('output', 'rtmp_push_url'):
			rtmp_push_url = Config.get('output', 'rtmp_push_url')
			self.sink = RtmpSink(rtmp_push_url, self.mixer.output_width, self.mixer.output_height)

		else:
			self.sink = LocalSink(self.mixer.output_width, self.mixer.output_height)
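configure above pulls a [sources] section of name = url pairs and an optional rtmp_push_url from an [output] section. A minimal sketch of such a configuration, assuming Config behaves like a standard ConfigParser instance (the section and option names come from the calls above; the URLs are hypothetical):

from configparser import ConfigParser

EXAMPLE_CONFIG = """
[sources]
camera1 = rtmp://example.org/live/camera1
camera2 = rtmp://example.org/live/camera2

[output]
rtmp_push_url = rtmp://example.org/live/mixed
"""

Config = ConfigParser()
Config.read_string(EXAMPLE_CONFIG)

print(Config.options('sources'))                      # ['camera1', 'camera2']
print(Config.items('sources'))                        # [('camera1', 'rtmp://...'), ...]
print(Config.has_option('output', 'rtmp_push_url'))   # True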
Example No. 7
    def setUp(self):
        self.fd, self.data_file = tempfile.mkstemp()
        os.unlink(self.data_file)
        self.source = Source(self.data_file)
Example No. 8
class Test_Source(unittest.TestCase):
    def setUp(self):
        self.fd, self.data_file = tempfile.mkstemp()
        os.unlink(self.data_file)
        self.source = Source(self.data_file)

    def tearDown(self):
        os.close(self.fd)

    def test__read_from_file(self):
        expected = {'test_key': 'test_value'}
        args = ['level1', 'level2']
        with open(self.data_file, 'w') as f:
            f.write(json.dumps({args[0]: {args[1]: expected}}))

        actual = self.source._read_from_file(*args)
        self.assertEquals(expected, actual)

    def test__read_from_file_no_args(self):
        expected = {'test_key': 'test_value'}
        with open(self.data_file, 'w') as f:
            f.write(json.dumps(expected))

        actual = self.source._read_from_file()
        self.assertEquals(expected, actual)

    def test__read_from_file_file_does_not_exist(self):
        with self.assertRaises(IOError):
            self.source._read_from_file()

    def test__read_from_file_file_corrupt(self):
        with open(self.data_file, 'w') as f:
            f.write('corrupt file')

        with self.assertRaises(ValueError):
            self.source._read_from_file()

    def test__read_from_file_bad_key(self):
        with open(self.data_file, 'w') as f:
            f.write(json.dumps({}))

        with self.assertRaises(KeyError):
            self.source._read_from_file('key')

    def test__write_to_file(self):
        key = 'test_key'
        value = 'test_value'
        args = ['test_arg1', 'test_arg2']
        expected = {args[0]: {args[1]: {key: value}}}

        self.source._write_to_file(key, value, *args)
        with open(self.data_file) as f:
            actual = json.loads(f.read())
        self.assertEquals(expected, actual)

    def test__write_to_file_no_args(self):
        key = 'test_key'
        value = 'test_value'
        expected = {key: value}

        self.source._write_to_file(key, value)
        with open(self.data_file) as f:
            actual = json.loads(f.read())
        self.assertEquals(expected, actual)

    def test__write_to_file_file_exists(self):
        new_key = 'test_key'
        new_value = 'test_value'
        old_key = 'old_key'
        old_value = 'old_value'
        expected = {new_key: new_value, old_key: old_value}

        with open(self.data_file, 'w') as f:
            f.write(json.dumps({old_key: old_value}))

        self.source._write_to_file(new_key, new_value)
        with open(self.data_file) as f:
            actual = json.loads(f.read())
        self.assertEquals(expected, actual)

    def test__write_to_file_file_corrupt(self):
        key = 'test_key'
        value = 'test_value'
        expected = {key: value}

        with open(self.data_file, 'w') as f:
            f.write('corrupted_data')

        self.source._write_to_file(key, value)
        with open(self.data_file) as f:
            actual = json.loads(f.read())
        self.assertEquals(expected, actual)
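The tests above pin down the behaviour expected from Source._read_from_file and Source._write_to_file (this Source is constructed from a data-file path, so it is clearly a different class from the one in Example No. 1). A minimal sketch that would satisfy these tests, written only to illustrate what they require:

import json

class Source(object):
    def __init__(self, data_file):
        self.data_file = data_file

    def _read_from_file(self, *keys):
        # Missing file -> IOError from open(); corrupt JSON -> ValueError from json.loads().
        with open(self.data_file) as f:
            data = json.loads(f.read())
        for key in keys:
            data = data[key]        # a bad key raises KeyError, as the tests expect
        return data

    def _write_to_file(self, key, value, *keys):
        # Start from the existing document when it parses, otherwise from an empty one.
        try:
            with open(self.data_file) as f:
                data = json.loads(f.read())
        except (IOError, ValueError):
            data = {}
        node = data
        for k in keys:              # build the nested path, e.g. {'a': {'b': {...}}}
            node = node.setdefault(k, {})
        node[key] = value
        with open(self.data_file, 'w') as f:
            f.write(json.dumps(data))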
Example No. 9
    def read_data_sources(self, folder_paths):
        logging.info("Reading data sources...")
        for folder_name in folder_paths:
            folder_path = os.path.join(self.data_folder, folder_name)
            logging.info("-->folder: {}".format(folder_path))
            source_map = OrderedDict()
            data_folder_path = os.path.join(folder_path, "data")
            model_folder_path = os.path.join(folder_path, "model")

            for filename in os.listdir(data_folder_path):
                extension = os.path.splitext(filename)[1]

                if ".DS" in filename:
                    continue
                logging.info("   ...file: {}".format(filename))
                print(filename)

                source = Source(os.path.splitext(filename)[0])
                file_path = os.path.join(data_folder_path, filename)

                if "full" in data_folder_path:
                    source.read_data_from_wc_csv(file_path)
                elif extension == ".csv":
                    source.read_data_from_csv(file_path)
                elif extension == ".json":
                    source.read_data_from_json(file_path)
                elif extension == ".xml":
                    source.read_data_from_xml(file_path)
                else:
                    source.read_data_from_text_file(file_path)
                source_map[filename] = source
            if os.path.exists(model_folder_path):
                for filename in os.listdir(model_folder_path):
                    if ".DS" in filename:
                        continue

                    try:
                        source = source_map[os.path.splitext(
                            os.path.splitext(filename)[0])[0]]
                    except:
                        source = source_map[filename]

                    extension = os.path.splitext(filename)[1]
                    if extension == ".json":
                        source.read_semantic_type_json(
                            os.path.join(model_folder_path, filename))
                    else:
                        print(source)
                        source.read_semantic_type_from_gold(
                            os.path.join(model_folder_path, filename))

            self.dataset_map[folder_name] = source_map
Example No. 10
    def read_data_sources(self, folder_paths):
        semantic_type_set = set()
        attr_count = 0
        for folder_name in folder_paths:
            self.logger.debug("Read dataset: %s", folder_name)

            folder_path = "data/datasets/%s" % folder_name
            source_map = OrderedDict()
            data_folder_path = os.path.join(folder_path, "data")
            model_folder_path = os.path.join(folder_path, "model")

            for filename in sorted(os.listdir(data_folder_path)):
                extension = os.path.splitext(filename)[1]

                if ".DS" in filename:
                    continue

                self.logger.debug("    -> read: %s", filename)

                source = Source(os.path.splitext(filename)[0])
                file_path = os.path.join(data_folder_path, filename)

                if "full" in data_folder_path:
                    source.read_data_from_wc_csv(file_path)
                elif extension == ".csv":
                    source.read_data_from_csv(file_path)
                elif extension == ".json":
                    source.read_data_from_json(file_path)
                elif extension == ".xml":
                    source.read_data_from_xml(file_path)
                else:
                    source.read_data_from_text_file(file_path)
                source_map[filename] = source

                # NOTE: BINH delete empty columns here!!!, blindly follows the code in indexer:36
                for key in list(source.column_map.keys()):
                    column = source.column_map[key]
                    if column.semantic_type:
                        if len(column.value_list) == 0:
                            del source.column_map[key]
                            source.empty_val_columns[key] = column
                            logging.warning("Indexer: IGNORE COLUMN `%s` in source `%s` because of empty values",
                                            column.name, source.name)

                for column in source.column_map.values():
                    semantic_type_set.add(column.semantic_type)
                attr_count += len(source.column_map.values())
            if os.path.exists(model_folder_path):
                for filename in os.listdir(model_folder_path):
                    if ".DS" in filename:
                        continue

                    try:
                        source = source_map[os.path.splitext(os.path.splitext(filename)[0])[0]]
                    except:
                        source = source_map[filename]

                    extension = os.path.splitext(filename)[1]
                    if extension == ".json":
                        source.read_semantic_type_json(os.path.join(model_folder_path, filename))
                    else:
                        print(source)
                        source.read_semantic_type_from_gold(os.path.join(model_folder_path, filename))

            self.dataset_map[folder_name] = source_map
            # print(semantic_type_set)
            print(len(semantic_type_set))
            print(attr_count)
Example No. 11
    def read_data_sources(self, folder_paths):
        semantic_type_set = set()
        attr_count = 0
        for folder_name in folder_paths:
            self.logger.debug("Read dataset: %s", folder_name)

            folder_path = "data/datasets/%s" % folder_name
            source_map = OrderedDict()
            data_folder_path = os.path.join(folder_path, "tables")
            model_folder_path = os.path.join(folder_path, "models")

            for filename in sorted(os.listdir(data_folder_path)):
                extension = os.path.splitext(filename)[1]

                if ".DS" in filename:
                    continue

                self.logger.debug("    -> read: %s", filename)

                source = Source(os.path.splitext(filename)[0])
                file_path = os.path.join(data_folder_path, filename)

                if "full" in data_folder_path:
                    source.read_data_from_wc_csv(file_path)
                elif extension == ".csv":
                    source.read_data_from_csv(file_path)
                elif extension == ".json":
                    source.read_data_from_json(file_path)
                elif extension == ".xml":
                    source.read_data_from_xml(file_path)
                else:
                    source.read_data_from_text_file(file_path)
                source_map[filename] = source

                if ('rowNumber' in source.column_map):
                    del source.column_map['rowNumber']

                # NOTE: BINH delete empty columns here!!!, blindly follows the code in indexer:36
                for key in list(source.column_map.keys()):
                    column = source.column_map[key]
                    if column.semantic_type:
                        if len(column.value_list) == 0:
                            del source.column_map[key]
                            source.empty_val_columns[key] = column
                            logging.warning(
                                "Indexer: IGNORE COLUMN `%s` in source `%s` because of empty values",
                                column.name, source.name)

                for column in source.column_map.values():
                    semantic_type_set.add(column.semantic_type)
                attr_count += len(source.column_map.values())
            if os.path.exists(model_folder_path):
                for filename in os.listdir(model_folder_path):
                    if ".DS" in filename:
                        continue

                    try:
                        source = source_map[os.path.splitext(
                            os.path.splitext(filename)[0])[0]]
                    except:
                        source = source_map[filename]

                    extension = os.path.splitext(filename)[1]
                    if extension == ".json":
                        source.read_semantic_type_json(
                            os.path.join(model_folder_path, filename))
                    else:
                        print(source)
                        source.read_semantic_type_from_gold(
                            os.path.join(model_folder_path, filename))

            self.dataset_map[folder_name] = source_map
            # print semantic_type_set
            print(len(semantic_type_set))
            print(attr_count)
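Examples No. 3 and 9 to 11 all match a model file back to its data source with a double os.path.splitext on the model file name. A small runnable check of the lookup key this produces (the file name is hypothetical):

import os

model_filename = "museum.csv.model.json"
key = os.path.splitext(os.path.splitext(model_filename)[0])[0]
print(key)   # "museum.csv" -- the same key under which the data file was stored in source_map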