Пример #1
0
 def _build_query_result_lists(self):
     """Read every open query result file into the per-file line id lists.

     For each file object in ``self.query_output_file_objects`` the first
     line is stripped of its trailing newline and appended to
     ``self.source_file_paths``; every following line is passed through
     ``resolve_type`` and the result is appended to the list stored at the
     same index in ``self.line_id_sets``.

     Each file object is closed once it has been processed, including when
     reading or resolving one of its lines raises.
     """
     for file_obj_index, f_obj in enumerate(self.query_output_file_objects):
         try:
             # the first line of each query result is the source column
             # path. we need this to get the values for fetching the joined
             # rows.
             self.source_file_paths.append(f_obj.readline().rstrip("\n"))
             for line in f_obj:
                 # SHOULD RESOLVE TO INT
                 line_id = resolve_type(line, VALID_TYPES)
                 self.line_id_sets[file_obj_index].append(line_id)
         finally:
             # we are done with this file object forever now. closing in a
             # finally block keeps the handle from leaking when a line
             # fails to resolve.
             f_obj.close()
Пример #2
0
def main(query_output_file_directory, new_column_output_file_directory):
    """Join the query results in a directory and write out the joined rows.

    Every file found under ``query_output_file_directory`` (searched
    recursively) is read as one query result: its first line is the path of
    the source column file, and each following line is resolved to a line id
    with ``resolve_type(line, VALID_TYPES)``.

    The joined rows are the line ids present in *every* query result. For
    each distinct source column file, the lines whose 0-based position is a
    joined line id are copied unchanged (binary mode) to a file with the same
    base name inside ``new_column_output_file_directory``. An output file is
    created for every referenced column even when no row joins.

    Args:
        query_output_file_directory: directory holding the query result
            files.
        new_column_output_file_directory: existing directory that receives
            the new column files.

    Returns:
        None. If no query result file is found, nothing is written.
    """
    source_file_paths = []
    # one set of line ids per query result file
    line_id_sets = []
    for root, _dirs, files in os.walk(query_output_file_directory):
        for f_name in files:
            # read each result as soon as it is opened; the context manager
            # closes the handle even if resolving a line raises, and we never
            # hold more than one query file open at a time.
            with io.open(os.path.join(root, f_name)) as f_obj:
                # the first line of each query result is the source column
                # path. we need this to get the values for fetching the
                # joined rows.
                source_file_paths.append(f_obj.readline().rstrip("\n"))
                # SHOULD RESOLVE TO INT
                line_id_sets.append(
                    set(resolve_type(line, VALID_TYPES) for line in f_obj)
                )

    # nothing to join and no column is referenced
    if not line_id_sets:
        return

    # base the join off of the smallest result: the intersection can never
    # hold more ids than it does, so every later step stays as small as
    # possible.
    line_id_sets.sort(key=len)
    joined_line_ids = line_id_sets[0].intersection(*line_id_sets[1:])

    # we only want unique filenames for columns, it wont matter what file is
    # associated to what once we join the rows
    referenced_column_paths = set(source_file_paths)

    # write out each joined line to the respective new columns.
    for column in referenced_column_paths:
        column_name = os.path.basename(column)
        out_column_path = os.path.join(
            new_column_output_file_directory, column_name
        )
        with io.open(column, "rb") as in_column_obj:
            with io.open(out_column_path, "wb") as out_column_obj:
                # once every joined id has been written there is nothing
                # left to find, so we can stop reading the column early.
                ids_remaining = len(joined_line_ids)
                for line_counter, line in enumerate(in_column_obj):
                    if not ids_remaining:
                        break
                    if line_counter in joined_line_ids:
                        out_column_obj.write(line)
                        ids_remaining -= 1