示例#1
0
def extract(data_store, record_selector, field_name, output_dir_path=None):
    """Extract specified field from selected records

    Args:
        data_store: a DataStore object
        record_selector: a RecordSelector object. It defines which records
            should be processed.
        field_name: name of the field to be extracted. Can contain nested names,
            e.g. 'field1.field2' .
        output_dir_path: a FileSystemPath object. This is where the extracted
            fields will be saved. If this is not given, the extracted fields
            are printed to stdout.

    Raises:
        KeyError: when `field_name` is not present in a record.
    """
    if output_dir_path is not None:
        output_dir_path.make_dirs()
    for record in record_selector.get_records(data_store):
        datum = record.content
        if not datum:  #no data to be written for this record
            continue
        ## Navigate to the (possibly nested) field. Report which field was
        ## missing instead of propagating a bare, message-less KeyError
        ## (matches the diagnostics of the write path below).
        try:
            for part in field_name.split('.'):
                datum = datum[part]
        except KeyError:
            error("Field '{}' is not defined in the data".format(field_name))
            raise
        if output_dir_path is not None:
            try:
                ## One output file per record, named after the record index.
                with output_dir_path.append(str(record.index)).open('w') \
                        as output:
                    output.write(to_byte_string(datum))
            except Exception:
                error("while processing record with index {}"\
                    .format(record.index))
                raise
        else:
            print(to_byte_string(datum))
示例#2
0
    def __record_fulfills_condition(record, selection):
        """Checks whether a field in Avro records fulfills a condition

        Args:
            record: an Avro record as a Python dictionary
            selection: an EqualitySelection object (optional)
        Returns:
            True if a record fulfill the condition
            False otherwise
        """
        ## No selection given - every record qualifies.
        if selection is None:
            return True

        key_parts = selection.get_key_parts()
        value = selection.get_value()

        ## Walk down the nested keys to reach the compared field.
        field = record.content
        try:
            for part in key_parts:
                field = field[part]
        except KeyError:
            error("Specified key not found in the schema: {}"\
                    .format(".".join(key_parts)))
            raise

        ## A missing (None) field matches the literal string "null".
        if field is None and value == "null":
            return True
        ## Otherwise compare the text representations of both values.
        return unicode(field) == unicode(value)
示例#3
0
    def get_schema(self):
        """Lazy accessor for data store schema

        If schema is given as a run parameter, then returns this schema.
        Otherwise extracts the schema from the Avro data store files.

        Returns:
            parsed Avro schema object (cached in `self._schema`).
        Raises:
            TypeError: if the supplied schema file cannot be parsed.
        """
        if self._schema:
            return self._schema
        if not self._schema_path:  #if there is no schema
            paths = self.__get_paths_to_avro_files()
            ## Every Avro file carries the schema in its metadata, so
            ## reading it from the first file is sufficient.
            with DataFileReader(
                    paths[0].open("r"),
                    _FieldsOrderPreservingDatumReader()) as reader:
                self._schema = avro.schema.parse(
                    reader.get_meta('avro.schema'))
        else:  #a schema is given
            try:
                ## Close the schema file after reading - the original
                ## `open(...).read()` leaked the file handle.
                with self._schema_path.open("r") as schema_file:
                    self._schema = avro.schema.parse(schema_file.read())
            except TypeError:
                error("supplied schema cannot be parsed!")
                raise
        return self._schema
示例#4
0
def to_json(data_store, record_selector, printer, pretty=False):
    """Converts selected records to JSON.

    Args:
        data_store: a DataStore object
        record_selector: a RecordSelector object that defines which records
            should be processed.
        printer: a Printer object that is used to print the JSON.
        pretty: True if the output should be a valid, pretty-printed JSON.
    """
    any_record_seen = False
    if pretty:
        printer.print("[", end="")
    for record in record_selector.get_records(data_store):
        try:
            if any_record_seen:
                ## Separate this record from the previous one: a comma in
                ## pretty mode, a plain newline otherwise.
                printer.print("," if pretty else "")
            any_record_seen = True
            content = encapsulate_strings(record.content)
            printer.print(dict_to_json(content, pretty), end="")
        except Exception:
            error("while processing record with index {}".format(record.index))
            raise
    if pretty:
        printer.print("]")
    elif any_record_seen:
        ## Terminate the last record with a newline - but only if at
        ## least one record was actually printed.
        printer.print("")
示例#5
0
 def __record_fulfills_condition(record, selection):
     """Checks whether a field in Avro records fulfills a condition

     Args:
         record: an Avro record as a Python dictionary
         selection: an EqualitySelection object (optional)
     Returns:
         True if a record fulfill the condition
         False otherwise
     """
     if selection is None:
         ## No condition to check - everything matches.
         return True

     path = selection.get_key_parts()
     wanted = selection.get_value()

     ## Follow the key path into the (possibly nested) record content.
     content = record.content
     try:
         for step in path:
             content = content[step]
     except KeyError:
         error("Specified key not found in the schema: {}"\
                 .format(".".join(path)))
         raise

     ## A None field matches the literal "null"; otherwise compare the
     ## text representations of the two values.
     matches_null = content is None and wanted == "null"
     return matches_null or unicode(content) == unicode(wanted)
示例#6
0
def __get_value(datum, field_name):
    """Return the value of a (possibly nested) field of `datum`.

    `field_name` uses dots to separate nesting levels, e.g. 'a.b'.
    Logs and re-raises KeyError when the field is missing.
    """
    value = datum
    try:
        for key in field_name.split('.'):
            value = value[key]
    except KeyError:
        error("Field '{}' is not defined in the data".format(field_name))
        raise
    return value
示例#7
0
def extract(data_store,
            record_selector,
            value_field,
            name_field=None,
            create_dirs=None,
            output_dir_path=None):
    """Extract a field from the selected records and save or print it.

    Args:
        data_store: a DataStore object
        record_selector: a RecordSelector object. It defines which records
            should be processed.
        value_field: name of the field to extract. May contain nested
            names, e.g. 'field1.field2'.
        name_field: value of this field is used as the name of the file
            holding the extracted data of a given row. When not given,
            the index of the corresponding record is used instead.
        create_dirs: when set, extracted fields sharing the same name are
            placed in a common directory named after `name_field`; the
            files inside get consecutive numbers as names.
        output_dir_path: a FileSystemPath object. This is where the
            extracted fields are saved. When not given, the extracted
            fields are printed to stdout.
    """
    if output_dir_path is not None:
        output_dir_path.make_dirs()
    for record in record_selector.get_records(data_store):
        datum = record.content
        if not datum:  #skip records with no data to be written
            continue
        field_value = __get_value(datum, value_field)
        if output_dir_path is None:
            ## No output directory - dump the value to stdout.
            print(to_byte_string(field_value))
            continue
        try:
            output_name = __get_output_name(record.index, datum, name_field)
            output_path = output_dir_path.append(output_name)
            ## Either a per-name directory with numbered files inside,
            ## or a single file per record.
            if create_dirs:
                file_path = __prepare_dir(output_path)
            else:
                file_path = __prepare_file(output_path, name_field)
            with file_path.open("w") as output:
                output.write(to_byte_string(field_value))
        except Exception:
            error("while processing record with index {}"\
                .format(record.index))
            raise
示例#8
0
def __prepare_dir(output_dir):
    """Ensure `output_dir` exists and return a path for the next file in it.

    Files inside the directory are named with consecutive integers; the
    returned path uses the smallest number greater than all existing ones.
    """
    if output_dir.exists():
        if not output_dir.is_dir():
            error("File with name '{}' already exists. "\
                "Unable to create a directory with the same name".\
                format(output_dir))
            raise Exception()
    else:
        output_dir.make_dirs()
    existing_names = output_dir.ls()
    next_number = 0
    if existing_names:
        ## Continue numbering after the largest existing file name.
        next_number = max(int(name) for name in existing_names) + 1
    return output_dir.append(str(next_number))
示例#9
0
def __prepare_file(output_file, name_field):
    """Return `output_file` after verifying that no such file exists yet.

    Raises:
        Exception: when a file with this name already exists.
    """
    if not output_file.exists():
        return output_file
    ## There is a race window between this existence check and the later
    ## open-for-writing, but there is no easy way to close it that works
    ## both for HDFS and for local file system paths; it does not seem
    ## important enough to handle anyway.
    message = "The file with name '{}' "\
        "already exists.".format(output_file)
    if name_field is not None:
        message = message + " This is probably "\
            "because the selected values of given field "\
            "'{}' are not unique".format(name_field)
    error(message)
    raise Exception()
示例#10
0
 def __iter__(self):
     """Yield every record of every Avro file in the data store.

     Files are visited in the sorted order produced by
     `__get_paths_to_avro_files`. On any failure while reading, logs
     which file and which record indices (global and file-local) were
     being processed, then re-raises the exception.

     NOTE(review): the counters are incremented only after a successful
     yield, so the `+1` in the message makes the reported indices look
     1-based - confirm this agrees with the 0-based `record.index` used
     elsewhere in the file.
     """
     paths = self.__get_paths_to_avro_files()
     prev_global_record_index = 0
     for path in paths:
         with DataFileReader(path.open("r"), _FieldsOrderPreservingDatumReader(
                 readers_schema=self.get_schema())) as reader:
             prev_local_record_index = 0
             try:
                 for record in reader:
                     yield record
                     prev_global_record_index = prev_global_record_index + 1
                     prev_local_record_index = prev_local_record_index + 1
             except Exception:
                 error("processing record with index {} failed. "\
                     "This record comes from \"{}\" Avro file and in this "\
                     "file it has local index equal {}.".format(
                         prev_global_record_index+1, path,
                         prev_local_record_index+1))
                 raise
示例#11
0
    def get_schema(self):
        """Lazy accessor for data store schema

        If schema is given as a run parameter, then returns this schema.
        Otherwise extracts the schema from the Avro data store files.

        Returns:
            parsed Avro schema object (cached in `self._schema`).
        Raises:
            TypeError: if the supplied schema file cannot be parsed.
        """
        if self._schema:
            return self._schema
        if not self._schema_path:  #if there is no schema
            paths = self.__get_paths_to_avro_files()
            ## Every Avro file carries the schema in its metadata, so
            ## reading it from the first file is sufficient.
            with DataFileReader(
                    paths[0].open("r"),
                    _FieldsOrderPreservingDatumReader()) as reader:
                self._schema = avro.schema.parse(
                    reader.get_meta('avro.schema'))
        else:  #a schema is given
            try:
                ## Close the schema file after reading - the original
                ## `open(...).read()` leaked the file handle.
                with self._schema_path.open("r") as schema_file:
                    self._schema = avro.schema.parse(schema_file.read())
            except TypeError:
                error("supplied schema cannot be parsed!")
                raise
        return self._schema
示例#12
0
 def __iter__(self):
     """Yield every record of every Avro file in the data store.

     Files are visited in the sorted order produced by
     `__get_paths_to_avro_files`. On any failure while reading, logs
     which file and which record indices (global and file-local) were
     being processed, then re-raises the exception.

     NOTE(review): the counters are incremented only after a successful
     yield, so the `+1` in the message makes the reported indices look
     1-based - confirm this agrees with the 0-based `record.index` used
     elsewhere in the file.
     """
     paths = self.__get_paths_to_avro_files()
     prev_global_record_index = 0
     for path in paths:
         with DataFileReader(
                 path.open("r"),
                 _FieldsOrderPreservingDatumReader(
                     readers_schema=self.get_schema())) as reader:
             prev_local_record_index = 0
             try:
                 for record in reader:
                     yield record
                     prev_global_record_index = prev_global_record_index + 1
                     prev_local_record_index = prev_local_record_index + 1
             except Exception:
                 error("processing record with index {} failed. "\
                     "This record comes from \"{}\" Avro file and in this "\
                     "file it has local index equal {}.".format(
                         prev_global_record_index+1, path,
                         prev_local_record_index+1))
                 raise
示例#13
0
 def __get_paths_to_avro_files(self):
     """Return the sorted paths of the Avro files in the data store.

     Raises:
         Exception: when the data store path is empty or not valid.
     """
     paths = []
     for file_name in self._datastore_path.ls():
         ## Ignore files starting with underscore.
         ## Such files are also ignored by default by map-reduce jobs.
         ## We're also ignoring files starting with dot to ignore
         ## ".svn" directories.
         if not (fnmatch.fnmatch(file_name, "_*") or
                 fnmatch.fnmatch(file_name, ".*")):
             ## We're not checking whether the path is a directory because
             ## such testing takes too much time
             paths.append(self._datastore_path.append(file_name))
     if len(paths) == 0:
         ## BUG FIX: `error()` is a logging helper that returns None, so
         ## the original `raise error(...)` raised a TypeError instead of
         ## a meaningful exception. Log first, then raise explicitly
         ## (the error-handling style used elsewhere in this file).
         error("Specified data store path is empty or is not valid")
         raise Exception()
     paths.sort()
     return paths
示例#14
0
 def __get_paths_to_avro_files(self):
     """Return the sorted paths of the Avro files in the data store.

     Raises:
         Exception: when the data store path is empty or not valid.
     """
     paths = []
     for file_name in self._datastore_path.ls():
         ## Ignore files starting with underscore.
         ## Such files are also ignored by default by map-reduce jobs.
         ## We're also ignoring files starting with dot to ignore
         ## ".svn" directories.
         if not (fnmatch.fnmatch(file_name, "_*")
                 or fnmatch.fnmatch(file_name, ".*")):
             ## We're not checking whether the path is a directory because
             ## such testing takes too much time
             paths.append(self._datastore_path.append(file_name))
     if len(paths) == 0:
         ## BUG FIX: `error()` is a logging helper that returns None, so
         ## the original `raise error(...)` raised a TypeError instead of
         ## a meaningful exception. Log first, then raise explicitly
         ## (the error-handling style used elsewhere in this file).
         error("Specified data store path is empty or is not valid")
         raise Exception()
     paths.sort()
     return paths