示例#1
0
    def process_qnode(self, kw: KgtkWriter, current_process_node_id: str,
                      each_node_attributes: EACH_NODE_ATTRIBUTES) -> bool:
        """Write one sentence row for a node that has at least one non-empty attribute.

        Args:
            kw: Writer that receives the generated row.
            current_process_node_id: Identifier written as the first column.
            each_node_attributes: Mapping of attribute name to the values
                collected for the node.

        Returns:
            False (nothing written) when the mapping is empty or every value
            in it is falsy; True after a row has been written.
        """
        # Only nodes with some attribute content are worth a sentence.
        has_content: bool = bool(each_node_attributes) and any(
            each_node_attributes[name] for name in each_node_attributes)
        if not has_content:
            return False

        sentence: str
        reason: str
        sentence, reason = self.attribute_to_sentence(
            each_node_attributes, current_process_node_id)

        # The explanation column is appended only in explain mode.
        include_explanation = self.explain
        output_row = [
            current_process_node_id,
            self.sentence_label,
            KgtkFormat.stringify(sentence),
        ]
        if include_explanation:
            output_row.append(KgtkFormat.stringify(reason))
        kw.write(output_row)
        return True
示例#2
0
 def write_row(self, ew: KgtkWriter, node1: str, label: str, node2: str):
     """Write one (node1, label, node2) edge to ``ew`` and count it.

     When ``self.idbuilder`` is set, the row is first passed through its
     ``build`` method together with the current ``self.output_line_count``.
     The counter is incremented after the write.
     """
     edge: typing.List[str] = [node1, label, node2]
     id_builder = self.idbuilder
     if id_builder is not None:
         edge = id_builder.build(edge, self.output_line_count)
     ew.write(edge)
     self.output_line_count += 1
示例#3
0
文件: filter.py 项目: yyht/kgtk
    def single_object_filter_inverted(
        kr: KgtkReader,
        kw: KgtkWriter,
        rw: typing.Optional[KgtkWriter],
        obj_idx: int,
        obj_filter: typing.Set[str],
    ):
        """Keep rows whose object column differs from the single filter value.

        Rows read from ``kr`` whose value at ``obj_idx`` is NOT equal to the
        filter value are written to ``kw``; all other rows are counted as
        rejected and, when ``rw`` is not None, written to ``rw``.

        ``verbose`` and ``error_file`` are not parameters; they are resolved
        from the surrounding scope.

        Args:
            kr: Source of input rows.
            kw: Writer for accepted rows.
            rw: Optional writer for rejected rows.
            obj_idx: Index of the column compared against the filter value.
            obj_filter: Set holding the filter value; one element is used
                (an empty set raises IndexError).
        """
        if verbose:
            print("Applying a single object filter inverted",
                  file=error_file,
                  flush=True)

        obj_filter_value: str = list(obj_filter)[0]

        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            if row[obj_idx] != obj_filter_value:
                kw.write(row)
                output_line_count += 1

            else:
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1

        if verbose:
            # Send the summary to the same diagnostic stream as the opening
            # message, so it does not land on stdout.
            print("Read %d rows, rejected %d rows, wrote %d rows." %
                  (input_line_count, reject_line_count, output_line_count),
                  file=error_file,
                  flush=True)
示例#4
0
文件: filter.py 项目: yyht/kgtk
    def single_predicate_filter(
        kr: KgtkReader,
        kw: KgtkWriter,
        rw: typing.Optional[KgtkWriter],
        pred_idx: int,
        pred_filter: typing.Set[str],
    ):
        """Keep rows whose predicate column equals the single filter value.

        Rows read from ``kr`` whose value at ``pred_idx`` equals the filter
        value are written to ``kw``; all other rows are counted as rejected
        and, when ``rw`` is not None, written to ``rw``.

        ``verbose`` and ``error_file`` are not parameters; they are resolved
        from the surrounding scope.

        Args:
            kr: Source of input rows.
            kw: Writer for accepted rows.
            rw: Optional writer for rejected rows.
            pred_idx: Index of the column compared against the filter value.
            pred_filter: Set holding the filter value; one element is used
                (an empty set raises IndexError).
        """
        if verbose:
            print("Applying a single predicate filter",
                  file=error_file,
                  flush=True)

        pred_filter_value: str = list(pred_filter)[0]

        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            if row[pred_idx] == pred_filter_value:
                kw.write(row)
                output_line_count += 1

            else:
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1

        if verbose:
            # Send the summary to the same diagnostic stream as the opening
            # message, so it does not land on stdout.
            print("Read %d rows, rejected %d rows, wrote %d rows." %
                  (input_line_count, reject_line_count, output_line_count),
                  file=error_file,
                  flush=True)
示例#5
0
文件: kgtkcompact.py 项目: yyht/kgtk
    def process_row(self,
                    input_key: str,
                    row: typing.List[str],
                    line_number: int,
                    idb: typing.Optional[KgtkIdBuilder],
                    ew: KgtkWriter,
                    flush: bool = False):
        """Feed one input row into the row being compacted, writing it on key change.

        When a record is in progress and either ``flush`` is set or
        ``input_key`` differs from ``self.current_key``, the record is
        compacted, written to ``ew`` (through ``idb.build`` when ``idb`` is
        given) and counted in ``self.output_line_count``.  Unless this is a
        flush request, ``row`` then either starts a new record or is merged
        into the current one.

        Note: this code assumes that row lengths do not vary.
        """
        record_in_progress: bool = self.current_key is not None
        if record_in_progress and (flush or self.current_key != input_key):
            # Flush requested or the key is changing: emit the finished record.
            self.compact_row()
            if self.current_row is not None:
                finished_row = self.current_row
                if idb is not None:
                    finished_row = idb.build(finished_row, line_number)
                ew.write(finished_row)
                self.output_line_count += 1
            self.current_key = None
            self.current_row = None

        if flush:
            # A flush request carries no row to process.
            return

        if self.current_key is not None:
            # Same key as the record in progress: merge into it.
            self.merge_row(row)
            return

        # Start a new record for this key.
        self.current_key = input_key
        self.expand_row(row)
示例#6
0
    def process_cacheing_filter(self, input_kr: KgtkReader,
                                filter_kr: KgtkReader,
                                input_key_columns: typing.List[int],
                                filter_key_columns: typing.List[int],
                                ew: KgtkWriter):
        """Filter the input rows against a key set cached from the filter file.

        The key set is built once with ``self.extract_key_set``.  Each input
        row's key, built by ``self.build_key``, is then tested for membership.
        Rows whose key is in the set are written to ``ew``; when
        ``self.invert`` is truthy, rows whose key is NOT in the set are
        written instead.

        Args:
            input_kr: Reader supplying the rows to filter.
            filter_kr: Reader supplying the rows the key set is built from.
            input_key_columns: Column indexes forming the input key.
            filter_key_columns: Column indexes forming the filter key.
            ew: Writer receiving the rows that pass the filter.
        """
        if self.verbose:
            # All diagnostics go to the error stream, never to stdout.
            print("Processing by cacheing the filter file's key set..",
                  file=self.error_file,
                  flush=True)
            print("Building the filter key set from %s" %
                  self.filter_file_path,
                  file=self.error_file,
                  flush=True)
        key_set: typing.Set[str] = self.extract_key_set(
            filter_kr, "filter", filter_key_columns)
        if self.verbose or self.very_verbose:
            print("There are %d entries in the filter key set." % len(key_set),
                  file=self.error_file,
                  flush=True)
            if self.very_verbose:
                print("Keys: %s" % " ".join(key_set),
                      file=self.error_file,
                      flush=True)

        if self.verbose:
            print("Filtering records from %s" % self.input_file_path,
                  file=self.error_file,
                  flush=True)
        input_line_count: int = 0
        output_line_count: int = 0

        # One loop serves both modes: a row is kept when its membership in
        # the key set matches the wanted outcome (present normally, absent
        # when inverted).
        keep_when_present: bool = not self.invert
        row: typing.List[str]
        input_key: str
        for row in input_kr:
            input_line_count += 1
            input_key = self.build_key(row, input_key_columns)
            if (input_key in key_set) == keep_when_present:
                ew.write(row)
                output_line_count += 1

        if self.verbose:
            print("Read %d records, wrote %d records." %
                  (input_line_count, output_line_count),
                  file=self.error_file,
                  flush=True)
示例#7
0
    def process_row(self,
                    input_key: str,
                    row: typing.List[str],
                    line_number: int,
                    idb: typing.Optional[KgtkIdBuilder],
                    ew: KgtkWriter,
                    flush: bool = False):
        """Feed one input row into the row being compacted, writing it on key change.

        When a record is in progress and either ``flush`` is set or
        ``input_key`` differs from ``self.current_key``, the record is
        compacted, written to ``ew`` (through ``idb.build`` when ``idb`` is
        given) and counted in ``self.output_line_count``.  Unless this is a
        flush request, ``row`` then either starts a new record or is merged
        into the current one.  With ``self.very_verbose`` set, each step is
        traced to ``self.error_file``.

        Note: this code assumes that row lengths do not vary.
        """
        if self.very_verbose:
            print("Input key %s" % repr(input_key), file=self.error_file, flush=True)
        if self.current_key is not None:
            if self.very_verbose:
                # This branch runs when a key IS present, so report that key.
                print("Have current key %s" % repr(self.current_key), file=self.error_file, flush=True)
            # We have a record being built.  Write it?
            if flush or self.current_key != input_key:
                if self.very_verbose:
                    if flush:
                        print("flush", file=self.error_file, flush=True)
                    else:
                        print("current_key %s != input_key %s" % (repr(self.current_key), repr(input_key)), file=self.error_file, flush=True)
                # self.current_key != input_key means that the key is changing.
                self.compact_row()
                if self.current_row is not None:
                    if self.very_verbose:
                        print("writing %s" % repr(self.field_separator.join(self.current_row)), file=self.error_file, flush=True)
                    if idb is None:
                        ew.write(self.current_row)
                    else:
                        ew.write(idb.build(self.current_row, line_number))
                    self.output_line_count += 1
                self.current_key = None
                self.current_row = None

        if flush:
            # This was a flush request.  We're done.
            return

        # Are we starting a new key?
        if self.current_key is None:
            # Save the new row.  Assign the key first so the trace reports
            # the new key, not the stale None.
            self.current_key = input_key
            if self.very_verbose:
                print("New current_key %s" % repr(self.current_key), file=self.error_file, flush=True)
                print("Expand row %s" % self.field_separator.join(row), file=self.error_file, flush=True)
            self.expand_row(row)
        else:
            # Merge into an existing row.
            if self.very_verbose:
                print("Merge row", file=self.error_file, flush=True)
            self.merge_row(row)
示例#8
0
def generate_kgtk_output(entities_output,output_kgtk_file,output_no_header,verbose,very_verbose):
    """Convert a tab-separated entity/vector file into node1/label/node2 rows.

    Each line of ``entities_output`` is split on tabs: the first field becomes
    node1, the remaining fields are joined with commas into node2, and the
    label is the fixed string 'graph_embeddings'.

    Args:
        entities_output: Path of the tab-separated input file.
        output_kgtk_file: Output path handed to KgtkWriter.open().
        output_no_header: Passed to the writer as ``no_header``.
        verbose: When true, progress messages are logged via ``logging.info``
            and the flag is forwarded to the writer.
        very_verbose: Forwarded to the writer.
    """
    # Open the output file.
    kw: KgtkWriter = KgtkWriter.open(['node1', 'label', 'node2'],
                                     output_kgtk_file,
                                     mode=KgtkWriter.Mode.AUTO,
                                     require_all_columns=False,
                                     prohibit_extra_columns=False,
                                     fill_missing_columns=False,
                                     gzip_in_parallel=False,
                                     no_header=output_no_header,
                                     verbose=verbose,
                                     very_verbose=very_verbose)

    input_line_count: int = 0
    MODULE_NAME = 'graph_embeddings'
    try:
        if verbose:
            # logging.info() takes no file/flush keywords and there is no
            # `self` in this plain function; log the message by itself.
            logging.info("Processing the input records.")

        with open(entities_output) as wv_file:
            for line in wv_file:
                fields = line.replace('\n', '').split('\t')
                input_line_count += 1
                kw.write([fields[0], MODULE_NAME, ','.join(fields[1:])])

        if verbose:
            logging.info("Processed %d records.", input_line_count)
    finally:
        # Close the writer even when reading or writing fails.
        kw.close()
示例#9
0
    def write_updated_namespace_file(self):
        """Write the entries of ``self.namespace_ids`` to the updated namespaces file.

        Does nothing when ``self.updated_namespace_file_path`` is None.
        Otherwise one row is written per entry, in sorted key order: the key,
        ``self.prefix_expansion_label`` and the mapped value wrapped in
        double quotes.
        """
        target_path = self.updated_namespace_file_path
        if target_path is None:
            # No updated namespaces file was requested.
            return

        if self.verbose:
            print("Opening updated namespaces file %s" % str(target_path),
                  file=self.error_file,
                  flush=True)

        writer: KgtkWriter = KgtkWriter.open(self.COLUMN_NAMES,
                                             target_path,
                                             mode=KgtkWriter.Mode.EDGE,
                                             require_all_columns=True,
                                             prohibit_extra_columns=True,
                                             fill_missing_columns=False,
                                             gzip_in_parallel=False,
                                             verbose=self.verbose,
                                             very_verbose=self.very_verbose)
        for prefix_id in sorted(self.namespace_ids.keys()):
            expansion_label = self.prefix_expansion_label
            quoted_expansion: str = '"' + self.namespace_ids[prefix_id] + '"'
            writer.write([prefix_id, expansion_label, quoted_expansion])
        writer.close()
示例#10
0
    def process(self):
        """Export the node1, relation and node2 values of every input row, without a header.

        The input is opened with KgtkReader and the output with KgtkWriter
        using the reader's column names and mode.  Before any data row is
        written, the output stream is rewound and truncated to drop the line
        the writer has already produced (per the original comments, the
        header).  Each input row is then reduced to three values.
        """
        if self.verbose:
            print("Opening the input file: %s" % str(self.input_file_path),
                  file=self.error_file,
                  flush=True)
        reader: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            options=self.reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        if self.verbose:
            print("Opening the output file: %s" % str(self.output_file_path),
                  file=self.error_file,
                  flush=True)
        writer: KgtkWriter = KgtkWriter.open(
            reader.column_names,
            self.output_file_path,
            mode=KgtkWriter.Mode[reader.mode.name],
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=False,
            gzip_in_parallel=False,
            verbose=self.verbose,
            very_verbose=self.very_verbose)

        if self.verbose:
            print("Processing the input records.",
                  file=self.error_file,
                  flush=True)

        # Column positions, emitted in node1 / relation / node2 order.
        node1_pos = reader.get_node1_column_index()
        node2_pos = reader.get_node2_column_index()
        relation_pos = reader.get_id_column_index('relation')
        emit_order = (node1_pos, relation_pos, node2_pos)

        # Drop what the writer has already emitted: rewind, then truncate.
        raw_out = writer.file_out
        raw_out.seek(0)
        raw_out.truncate()

        records_seen: int = 0
        record: typing.List[str]
        for record in reader:
            records_seen += 1
            writer.write([record[pos] for pos in emit_order])

        if self.verbose:
            print("Processed %d records." % (records_seen),
                  file=self.error_file,
                  flush=True)

        writer.close()
示例#11
0
文件: kgtklift.py 项目: usbader/kgtk
    def write_output_row(
        self,
        ew: KgtkWriter,
        row: typing.List[str],
        new_columns: int,
        input_select_column_idx: int,
        label_select_column_idx: int,
        labels: typing.Mapping[str, str],
        lifted_column_idxs: typing.List[int],
        lifted_output_column_idxs: typing.List[int],
    ) -> bool:
        """Lift label values into a copy of ``row`` and write it to ``ew``.

        Args:
            ew: Writer receiving the output row.
            row: Input row; it is copied, never modified.
            new_columns: Number of empty columns appended to the copy.
            input_select_column_idx: Column checked against
                ``self.input_select_column_value`` to decide whether the row
                is lifted into (negative disables the check).  The same
                column receives ``self.output_select_column_value`` after a
                successful lift.
            label_select_column_idx: Column identifying label records
                (negative disables the check).
            labels: Mapping from a column value to the label to lift.
            lifted_column_idxs: Input columns whose values are looked up.
            lifted_output_column_idxs: Output columns receiving the labels,
                parallel to ``lifted_column_idxs``.

        Returns:
            True when the row was written; False when it was a label record
            suppressed because ``self.remove_label_records`` is set.
        """
        output_row: typing.List[str] = row.copy()
        if new_columns > 0:
            output_row.extend([""] * new_columns)
        output_select_column_idx: int = input_select_column_idx

        do_write: bool = True
        do_lift: bool = True
        # The per-row debug print to stdout that used to sit here was removed.
        if label_select_column_idx >= 0:
            if row[label_select_column_idx] == self.label_select_column_value:
                # Don't lift label columns, if we have stored labels in the input records.
                do_lift = False
                if self.remove_label_records:
                    do_write = False
        if input_select_column_idx >= 0:
            if self.input_select_column_value is not None and row[
                    input_select_column_idx] != self.input_select_column_value:
                # Not selected for lifting into.
                do_lift = False
        if do_lift:
            # Lift the specified columns in this row.
            did_lift: bool = False
            lifted_column_idx: int
            for idx, lifted_column_idx in enumerate(lifted_column_idxs):
                label_key: str = row[lifted_column_idx]
                if label_key in labels:
                    output_row[
                        lifted_output_column_idxs[idx]] = labels[label_key]
                    did_lift = True  # What if we want to note if we lifted all columns?
            if did_lift and output_select_column_idx >= 0 and self.output_select_column_value is not None:
                # Mark the row as lifted in the select column.
                output_row[
                    output_select_column_idx] = self.output_select_column_value

        if do_write:
            ew.write(output_row)
        return do_write
示例#12
0
    def write_new_edge(
        self,
        kw: KgtkWriter,
        unreifiedw: typing.Optional[KgtkWriter],
        potential_edge_attributes: typing.List[typing.List[str]],
        edge_id: str,
        rdf_subject_value: str,
        rdf_predicate_value: str,
        rdf_object_value: str,
        label_column_idx: int,
        node2_column_idx: int,
        node1_column_name: str,
        label_column_name: str,
        node2_column_name: str,
        id_column_name: str,
    ):
        """Write the (subject, predicate, object) edge and then its attribute edges.

        The edge is written to ``kw`` (counted in ``self.output_line_count``)
        and, when ``unreifiedw`` is not None, to ``unreifiedw`` as well.  The
        remaining rows are then handed to ``self.write_edge_attributes``.
        """
        def new_edge_record():
            # A fresh mapping per writer, so the writers never share one dict.
            return {
                node1_column_name: rdf_subject_value,
                label_column_name: rdf_predicate_value,
                node2_column_name: rdf_object_value,
                id_column_name: edge_id,
            }

        kw.writemap(new_edge_record())
        self.output_line_count += 1

        if unreifiedw is not None:
            unreifiedw.writemap(new_edge_record())

        self.write_edge_attributes(
            kw, unreifiedw, potential_edge_attributes, edge_id,
            label_column_idx, node2_column_idx,
            node1_column_name, label_column_name, node2_column_name,
            id_column_name,
        )
示例#13
0
    def write_new_edge(
        self,
        kw: KgtkWriter,
        unreifiedw: typing.Optional[KgtkWriter],
        potential_edge_attributes: typing.List[typing.List[str]],
        node1_value: str,
        node2_value: str,
        edge_id: str,
        label_column_idx: int,
        node2_column_idx: int,
        node1_column_name: str,
        label_column_name: str,
        node2_column_name: str,
        id_column_name: str,
    ):
        """Write the new (node1, label, node2) edge and then its attribute edges.

        The label is ``self.new_label_value`` when that is not None, otherwise
        ``self.value_label_value``.  The edge is written to ``kw`` (counted in
        ``self.output_line_count``) and, when ``unreifiedw`` is not None, to
        ``unreifiedw`` as well.  The remaining rows are then handed to
        ``self.write_edge_attributes``.
        """
        edge_label: str
        if self.new_label_value is not None:
            edge_label = self.new_label_value
        else:
            edge_label = self.value_label_value

        def new_edge_record():
            # A fresh mapping per writer, so the writers never share one dict.
            return {
                node1_column_name: node1_value,
                label_column_name: edge_label,
                node2_column_name: node2_value,
                id_column_name: edge_id,
            }

        kw.writemap(new_edge_record())
        self.output_line_count += 1

        if unreifiedw is not None:
            unreifiedw.writemap(new_edge_record())

        self.write_edge_attributes(
            kw, unreifiedw, potential_edge_attributes, edge_id,
            label_column_idx, node2_column_idx,
            node1_column_name, label_column_name, node2_column_name,
            id_column_name,
        )
示例#14
0
    def pass_group_through(self, kw: KgtkWriter,
                           uninvolvedw: typing.Optional[KgtkWriter],
                           node1_group: typing.List[typing.List[str]],
                           new_id_column: bool):
        """Pass a group of rows through unchanged when unreification was not triggered.

        Every row is written to ``uninvolvedw`` (when not None) exactly as
        received, and to ``kw`` either as-is or, when ``new_id_column`` is
        true, as a copy with one empty trailing value for the ID column.
        ``self.output_line_count`` is incremented per row written to ``kw``.

        TODO: Perhaps we'd like to build an ID value at the same time?
        """
        source_row: typing.List[str]
        for source_row in node1_group:
            if uninvolvedw is not None:
                uninvolvedw.write(source_row)

            # Extend a copy so the caller's row is left untouched.
            outgoing_row = [*source_row, ""] if new_id_column else source_row
            kw.write(outgoing_row)
            self.output_line_count += 1
示例#15
0
    def write_edge_attributes(
        self,
        kw: KgtkWriter,
        unreifiedw: typing.Optional[KgtkWriter],
        potential_edge_attributes: typing.List[typing.List[str]],
        edge_id: str,
        label_column_idx: int,
        node2_column_idx: int,
        node1_column_name: str,
        label_column_name: str,
        node2_column_name: str,
        id_column_name: str,
    ):
        """Write one secondary edge per attribute row, hanging off ``edge_id``.

        For each row in ``potential_edge_attributes`` (numbered from 1) an
        edge is written whose node1 is ``edge_id``, whose label and node2 are
        taken from the row, and whose ID comes from ``self.make_new_id``.
        Each edge goes to ``kw`` (counted in ``self.output_line_count``) and,
        when ``unreifiedw`` is not None, to ``unreifiedw`` as well.
        """
        id_width: int = self.get_width(len(potential_edge_attributes))
        attribute_row: typing.List[str]
        for ordinal, attribute_row in enumerate(potential_edge_attributes,
                                                start=1):
            attribute_id: str = self.make_new_id(edge_id, ordinal, id_width)
            attribute_label = attribute_row[label_column_idx]
            attribute_value = attribute_row[node2_column_idx]

            kw.writemap({
                node1_column_name: edge_id,
                label_column_name: attribute_label,
                node2_column_name: attribute_value,
                id_column_name: attribute_id,
            })
            self.output_line_count += 1

            if unreifiedw is None:
                continue
            unreifiedw.writemap({
                node1_column_name: edge_id,
                label_column_name: attribute_label,
                node2_column_name: attribute_value,
                id_column_name: attribute_id,
            })
示例#16
0
    def process_row(self,
                    input_key: str,
                    row: typing.List[str],
                    line_number: int,
                    idb: typing.Optional[KgtkIdBuilder],
                    ew: KgtkWriter,
                    flush: bool = False):
        """Feed one input row into the row being compacted, writing it on key change.

        When a record is in progress and either ``flush`` is set or
        ``input_key`` differs from ``self.current_key``, the record is
        compacted and written to ``ew`` (through ``idb.build`` when ``idb``
        is given).  Unless this is a flush request, ``row`` then either
        becomes the new current row or is merged into the current one.  The
        current row is expanded only when a second row with the same key
        arrives.

        Note: this code assumes that row lengths do not vary.
        """
        active_key = self.current_key
        if active_key is not None and (flush or active_key != input_key):
            # Flush requested or the key is changing: emit the finished record.
            self.compact_row()
            if self.current_row is not None:
                finished_row = self.current_row
                if idb is not None:
                    finished_row = idb.build(finished_row, line_number)
                ew.write(finished_row)
            self.current_key = None
            self.current_row = None

        if flush:
            # A flush request carries no row to process.
            return

        if self.current_key is None:
            # First row for this key: keep it as-is.  If the next row has a
            # different key it is written out with minimal handling.
            self.current_key = input_key
            self.current_row = row
            return

        # Another row for the same key: expand on first use, then merge.
        if self.current_row_lists is None:
            self.expand_row()
        self.merge_row(row)
示例#17
0
    def write_output_row(self, ew: KgtkWriter, row: typing.List[str],
                         new_columns: int, label_column_idx: int,
                         labels: typing.Mapping[str, str],
                         lifted_column_idxs: typing.List[int],
                         lifted_output_column_idxs: typing.List[int]):
        """Write a copy of ``row`` with label values lifted into their output columns.

        The copy is extended by ``new_columns`` empty values.  Unless the row
        is itself a label record (its ``label_column_idx`` column equals
        ``self.label_column_value``), each column in ``lifted_column_idxs``
        whose value is a key of ``labels`` has the mapped label stored at the
        matching position of ``lifted_output_column_idxs``.
        """
        output_row: typing.List[str] = row.copy()
        if new_columns > 0:
            output_row += [""] * new_columns

        # Label records stored in the input are written without lifting.
        is_label_record: bool = (label_column_idx >= 0 and
                                 row[label_column_idx] == self.label_column_value)
        if not is_label_record:
            for position, source_idx in enumerate(lifted_column_idxs):
                label_key: str = row[source_idx]
                if label_key in labels:
                    output_row[lifted_output_column_idxs[position]] = labels[label_key]

        ew.write(output_row)
示例#18
0
    def process(self):
        """Label connected components of the input graph and write one row per vertex.

        The input file is opened with KgtkReader to resolve the key columns,
        then loaded again as a graph from its path.  When ``self.properties``
        is set (a comma-separated string), the graph is reduced to the edges
        whose label matches one of those values.  Each vertex is then written
        as (vertex name, 'connected_component', component number).
        """
        input_kr: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            who="input",
            options=self.input_reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        input_key_columns: typing.List[int] = self.get_key_columns(
            input_kr, "input")
        label_col_idx = input_key_columns[1]
        # Name of the edge property holding the label column.
        # NOTE(review): presumably matches how load_graph_from_csv names
        # columns ('c' + index) -- confirm against its documentation.
        label = '{}{}'.format('c', label_col_idx)

        g = load_graph_from_csv(str(input_kr.file_path),
                                not (self.undirected),
                                skip_first=not (self.no_header),
                                hashed=True,
                                csv_options={'delimiter': '\t'},
                                ecols=(input_key_columns[0],
                                       input_key_columns[2]))

        es = []
        header = ['node1', 'label', 'node2']
        if self.properties:
            # Keep only the edges whose label is one of the requested values.
            properties = self.properties.split(',')
            for e in properties:
                es += (find_edge(g, g.edge_properties[label], e))
            g.clear_edges()
            g.add_edge_list(list(set(es)))
        comp, _hist = label_components(g, directed=self.strong)

        ew: KgtkWriter = KgtkWriter.open(header,
                                         self.output_file_path,
                                         mode=input_kr.mode,
                                         require_all_columns=False,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=True,
                                         gzip_in_parallel=False,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose)
        try:
            for v, c in enumerate(comp):
                ew.write([
                    g.vertex_properties['name'][v], 'connected_component',
                    str(c)
                ])
        finally:
            # The writer was previously never closed; close it even when a
            # write fails.
            ew.close()
示例#19
0
 def write_files(error_file, file_number, file_prefix, kr, lines_to_write, output_path, Qnode, reader_options,
                 split_by_qnode, suffix):
     """Write ``lines_to_write`` to a new KGTK file under ``output_path``.

     The file is named ``{Qnode}{suffix}`` when ``split_by_qnode`` is true,
     otherwise ``{file_prefix}{file_number}{suffix}``.  Column names and mode
     are taken from the reader ``kr``; the mgzip settings are borrowed from
     ``reader_options``.
     """
     file_stem = Qnode if split_by_qnode else f'{file_prefix}{file_number}'
     output_kgtk_file = Path(f'{output_path}/{file_stem}{suffix}')

     writer = KgtkWriter.open(kr.column_names,
                              output_kgtk_file,
                              mode=KgtkWriter.Mode[kr.mode.name],
                              use_mgzip=reader_options.use_mgzip,  # Hack!
                              mgzip_threads=reader_options.mgzip_threads,  # Hack!
                              error_file=error_file,
                              verbose=False,
                              very_verbose=False)
     for output_line in lines_to_write:
         writer.write(output_line)
     writer.close()
示例#20
0
def generate_kgtk_output(entities_output, output_kgtk_file, verbose,
                         very_verbose):
    """Convert a tab-separated entity/vector file into headerless KGTK rows.

    Each line of ``entities_output`` is split on tabs: the first field, the
    fixed string 'graph_embeddings', and the remaining fields joined with
    commas are written as one row.  The line the writer emits on opening
    (per the original comments, the header) is removed by rewinding and
    truncating the output stream before any row is written.

    Args:
        entities_output: Path of the tab-separated input file.
        output_kgtk_file: Output path handed to KgtkWriter.open().
        verbose: When true, progress messages are logged via ``logging.info``
            and the flag is forwarded to the writer.
        very_verbose: Forwarded to the writer.
    """
    # Open the output file.
    # NOTE(review): four column names are declared but each row written
    # below has three values -- confirm this is intended.
    kw: KgtkWriter = KgtkWriter.open(
        ['id', 'node1', 'node2', 'relation'
         ],  # in order to obey the kgtk rules
        output_kgtk_file,
        mode=KgtkWriter.Mode.AUTO,
        require_all_columns=False,
        prohibit_extra_columns=False,
        fill_missing_columns=False,
        gzip_in_parallel=False,
        verbose=verbose,
        very_verbose=very_verbose)

    input_line_count: int = 0
    MODULE_NAME = 'graph_embeddings'
    try:
        if verbose:
            # logging.info() takes no file/flush keywords and there is no
            # `self` in this plain function; log the message by itself.
            logging.info("Processing the input records.")

        # Delete the header: rewind, then truncate everything after.
        kw.file_out.seek(0)
        kw.file_out.truncate()

        with open(entities_output) as wv_file:
            for line in wv_file:
                fields = line.replace('\n', '').split('\t')
                input_line_count += 1
                kw.write([fields[0], MODULE_NAME, ','.join(fields[1:])])

        if verbose:
            logging.info("Processed %d records.", input_line_count)
    finally:
        # Close the writer even when reading or writing fails.
        kw.close()
示例#21
0
    def process(self):
        """Copy every row of the input file to the output file unchanged.

        The output is opened with the reader's column names and mode.  With
        ``self.verbose`` set, progress messages go to ``self.error_file``.
        """
        if self.verbose:
            print("Opening the input file: %s" % str(self.input_file_path), file=self.error_file, flush=True)
        reader: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            options=self.reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        if self.verbose:
            print("Opening the output file: %s" % str(self.output_file_path), file=self.error_file, flush=True)
        writer: KgtkWriter = KgtkWriter.open(
            reader.column_names,
            self.output_file_path,
            mode=KgtkWriter.Mode[reader.mode.name],
            require_all_columns=True,
            prohibit_extra_columns=True,
            fill_missing_columns=False,
            gzip_in_parallel=False,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        if self.verbose:
            print("Processing the input records.", file=self.error_file, flush=True)

        # Stays 0 when the reader yields no rows.
        records_copied: int = 0
        record: typing.List[str]
        for records_copied, record in enumerate(reader, start=1):
            writer.write(record)

        if self.verbose:
            print("Processed %d records." % (records_copied), file=self.error_file, flush=True)

        writer.close()
示例#22
0
    def open_output_writer(
        self, ikr: KgtkReader, lifted_column_idxs: typing.List[int]
    ) -> typing.Tuple[KgtkWriter, typing.List[int]]:
        """Open the output writer with the column layout from build_output_column_names().

        Args:
            ikr: Input reader; passed to ``self.build_output_column_names``
                and the source of the output mode.
            lifted_column_idxs: Passed to ``self.build_output_column_names``.

        Returns:
            A tuple of the opened writer and the second value returned by
            ``self.build_output_column_names`` (the lifted output column
            indexes).
        """
        column_layout = self.build_output_column_names(ikr, lifted_column_idxs)
        names: typing.List[str]
        lifted_output_idxs: typing.List[int]
        names, lifted_output_idxs = column_layout

        if self.verbose:
            print("Opening the output file: %s" % self.output_file_path,
                  file=self.error_file,
                  flush=True)

        writer: KgtkWriter = KgtkWriter.open(
            names,
            self.output_file_path,
            mode=KgtkWriter.Mode[ikr.mode.name],
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            gzip_in_parallel=False,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )
        return writer, lifted_output_idxs
示例#23
0
文件: kgtkifexists.py 项目: yyht/kgtk
    def process(self):
        """Filter the input file against the keys of the filter file.

        Opens the input and filter readers, resolves the key columns on
        both sides, opens the optional output and reject writers, and then
        dispatches to one of three processing methods chosen by
        ``self.cache_input`` and ``self.preserve_order``.

        If the number of input key columns differs from the number of
        filter key columns, a message is printed to ``self.error_file`` and
        the method returns without processing any records.

        Every reader and writer opened here is closed when the method
        exits, whether normally, through the early return, or because an
        exception propagated.
        """
        # Imported locally so this method does not depend on the module's import block.
        from contextlib import ExitStack

        UPDATE_VERSION: str = "2020-08-24T21:47:20.256050+00:00#mr0wtMHlN/QaplDsGc/ylG3Hw5stsjziykzuGlSHBSion4xoW/Bec0sn55IQ7wFWBUClRS7g1tbAuaqEduhUVA=="
        if self.show_version or self.verbose:
            print("KgtkIfEfexists version: %s" % UPDATE_VERSION, file=self.error_file, flush=True)

        # Open the input files once.
        if self.verbose:
            if self.input_file_path is not None:
                print("Opening the input file: %s" % self.input_file_path, file=self.error_file, flush=True)
            else:
                print("Reading the input data from stdin", file=self.error_file, flush=True)

        # Each handle is registered with the stack as soon as it is opened,
        # so it is released on every exit path.
        with ExitStack() as cleanup:
            input_kr: KgtkReader = KgtkReader.open(self.input_file_path,
                                                   error_file=self.error_file,
                                                   who="input",
                                                   options=self.input_reader_options,
                                                   value_options=self.value_options,
                                                   verbose=self.verbose,
                                                   very_verbose=self.very_verbose,
            )
            cleanup.callback(input_kr.close)

            if self.verbose:
                print("Opening the filter input file: %s" % self.filter_file_path, file=self.error_file, flush=True)
            filter_kr: KgtkReader = KgtkReader.open(self.filter_file_path,
                                                    who="filter",
                                                    error_file=self.error_file,
                                                    options=self.filter_reader_options,
                                                    value_options=self.value_options,
                                                    verbose=self.verbose,
                                                    very_verbose=self.very_verbose,
            )
            cleanup.callback(filter_kr.close)

            input_key_columns: typing.List[int] = self.get_key_columns(self.input_keys, input_kr, filter_kr, "input")
            filter_key_columns: typing.List[int] = self.get_key_columns(self.filter_keys, filter_kr, input_kr, "filter")

            if len(input_key_columns) != len(filter_key_columns):
                print("There are %d input key columns but %d filter key columns.  Exiting." % (len(input_key_columns), len(filter_key_columns)),
                      file=self.error_file, flush=True)
                # The readers are closed by the ExitStack on the way out.
                return

            ew: typing.Optional[KgtkWriter] = None
            if self.output_file_path is not None:
                if self.verbose:
                    print("Opening the output file: %s" % self.output_file_path, file=self.error_file, flush=True)
                ew = KgtkWriter.open(input_kr.column_names,
                                     self.output_file_path,
                                     mode=input_kr.mode,
                                     require_all_columns=False,
                                     prohibit_extra_columns=True,
                                     fill_missing_columns=True,
                                     gzip_in_parallel=False,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose)
                cleanup.callback(ew.close)

            rew: typing.Optional[KgtkWriter] = None
            if self.reject_file_path is not None:
                if self.verbose:
                    print("Opening the reject file: %s" % self.reject_file_path, file=self.error_file, flush=True)
                rew = KgtkWriter.open(input_kr.column_names,
                                      self.reject_file_path,
                                      mode=input_kr.mode,
                                      require_all_columns=False,
                                      prohibit_extra_columns=True,
                                      fill_missing_columns=True,
                                      gzip_in_parallel=False,
                                      verbose=self.verbose,
                                      very_verbose=self.very_verbose)
                cleanup.callback(rew.close)

            if self.cache_input:
                if self.preserve_order:
                    self.process_cacheing_input_preserving_order(input_kr=input_kr,
                                                                 filter_kr=filter_kr,
                                                                 input_key_columns=input_key_columns,
                                                                 filter_key_columns=filter_key_columns,
                                                                 ew=ew,
                                                                 rew=rew)
                else:
                    self.process_cacheing_input(input_kr=input_kr,
                                                filter_kr=filter_kr,
                                                input_key_columns=input_key_columns,
                                                filter_key_columns=filter_key_columns,
                                                ew=ew,
                                                rew=rew)
            else:
                self.process_cacheing_filter(input_kr=input_kr,
                                             filter_kr=filter_kr,
                                             input_key_columns=input_key_columns,
                                             filter_key_columns=filter_key_columns,
                                             ew=ew,
                                             rew=rew)
示例#24
0
def run(
        input_file: KGTKFiles,
        path_file: KGTKFiles,
        output_file: KGTKFiles,
        statistics_only: bool,
        undirected: bool,
        max_hops: int,
        source_column_name: typing.Optional[str],
        target_column_name: typing.Optional[str],
        shortest_path: bool,
        errors_to_stdout: bool,
        errors_to_stderr: bool,
        show_options: bool,
        verbose: bool,
        very_verbose: bool,
        **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    """Find paths between pairs of nodes and write them as KGTK edges.

    The path file supplies (source, target) pairs.  The input file is
    loaded into a graph-tool graph.  Unless ``statistics_only`` is set,
    every edge of the graph is written first.  Then, for each pair whose
    source and target each match exactly one vertex, the edges of every
    path found are written as rows of the form
    ``(p<path number>, <edge position in path>, <edge id>, <row id>)``.
    Pairs that do not match exactly one vertex on each side are skipped.

    Args:
        input_file: The KGTK edge file to load into the graph.
        path_file: The KGTK file holding the source/target pairs.
        output_file: The KGTK file to write.
        statistics_only: When true, the graph's edges are not written.
        undirected: When true, the graph is loaded as undirected.
        max_hops: Cutoff passed to ``all_paths`` (unused for shortest paths).
        source_column_name: Optional name of the source column in the path file.
        target_column_name: Optional name of the target column in the path file.
        shortest_path: When true, use ``all_shortest_paths`` instead of ``all_paths``.
        errors_to_stdout: When true, messages go to stdout instead of stderr.
        errors_to_stderr: Not used in this function.
        show_options: Not used in this function.
        verbose: Print progress messages.
        very_verbose: Passed through to the readers and the writer.
        **kwargs: Reader and value options.

    Raises:
        KGTKException: On missing columns, malformed path rows, or any
            other failure; the original exception is attached as the cause.
    """
    # import modules locally
    from contextlib import ExitStack, closing
    from pathlib import Path
    import sys

    from graph_tool.all import find_vertex
    from graph_tool.topology import all_paths
    from graph_tool.topology import all_shortest_paths

    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    from kgtk.exceptions import KGTKException
    try:

        # Select where to send error messages, defaulting to stderr.
        error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

        # Build the option structures.
        input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
            kwargs, who="input", fallback=True)
        path_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
            kwargs, who="path", fallback=True)
        value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

        input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
        path_kgtk_file: Path = KGTKArgumentParser.get_input_file(path_file)
        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)

        # Name of the vertex property that holds the node identifiers.
        id_col = 'name'

        if verbose:
            print("Reading the path file: %s" % str(path_kgtk_file),
                  file=error_file,
                  flush=True)
        pairs: typing.List[typing.Tuple[str, str]] = []
        paths_read: int = 0
        # closing() guarantees the path reader is released even when a
        # malformed row or a missing column aborts the read.
        with closing(
                KgtkReader.open(
                    path_kgtk_file,
                    error_file=error_file,
                    options=path_reader_options,
                    value_options=value_options,
                    verbose=verbose,
                    very_verbose=very_verbose,
                )) as pkr:
            path_source_idx: int = pkr.get_node1_column_index(
                source_column_name)
            if path_source_idx < 0:
                print("Missing node1 (source) column name in the path file.",
                      file=error_file,
                      flush=True)

            path_target_idx: int = pkr.get_node2_column_index(
                target_column_name)
            if path_target_idx < 0:
                print("Missing node2 (target) column name in the path file.",
                      file=error_file,
                      flush=True)
            if path_source_idx < 0 or path_target_idx < 0:
                raise KGTKException("Exiting due to missing columns.")

            path_row: typing.List[str]
            for path_row in pkr:
                paths_read += 1
                if len(path_row) != pkr.column_count:
                    raise KGTKException(
                        "Exiting because line %d in the path file (%s) is the wrong length: %d columns expected, %d were read."
                        % (paths_read, str(path_kgtk_file), pkr.column_count,
                           len(path_row)))
                pairs.append(
                    (path_row[path_source_idx], path_row[path_target_idx]))

        if verbose:
            print("%d path rows read" % paths_read,
                  file=error_file,
                  flush=True)
        if len(pairs) == 0:
            print("No path pairs found, the output will be empty.",
                  file=error_file,
                  flush=True)
        elif verbose:
            print("%d path pairs found" % len(pairs),
                  file=error_file,
                  flush=True)

        if verbose:
            print("Reading the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)

        # The stack closes the writer first and the input reader second
        # (reverse registration order) on every exit path.
        with ExitStack() as cleanup:
            kr: KgtkReader = KgtkReader.open(
                input_kgtk_file,
                error_file=error_file,
                options=input_reader_options,
                value_options=value_options,
                verbose=verbose,
                very_verbose=very_verbose,
            )
            cleanup.callback(kr.close)

            sub_index: int = kr.get_node1_column_index()
            if sub_index < 0:
                print("Missing node1 (subject) column.",
                      file=error_file,
                      flush=True)
            pred_index: int = kr.get_label_column_index()
            if pred_index < 0:
                print("Missing label (predicate) column.",
                      file=error_file,
                      flush=True)
            obj_index: int = kr.get_node2_column_index()
            if obj_index < 0:
                print("Missing node2 (object) column",
                      file=error_file,
                      flush=True)
            id_index: int = kr.get_id_column_index()
            if id_index < 0:
                print("Missing id column", file=error_file, flush=True)
            if sub_index < 0 or pred_index < 0 or obj_index < 0 or id_index < 0:
                raise KGTKException("Exiting due to missing columns.")

            # The edge properties are looked up by the column names actually
            # present in the input file.
            predicate: str = kr.column_names[pred_index]
            id_col_name: str = kr.column_names[id_index]

            G = load_graph_from_kgtk(kr,
                                     directed=not undirected,
                                     ecols=(sub_index, obj_index),
                                     verbose=verbose,
                                     out=error_file)

            output_columns: typing.List[str] = [
                'node1', 'label', 'node2', 'id'
            ]
            kw: KgtkWriter = KgtkWriter.open(output_columns,
                                             output_kgtk_file,
                                             mode=KgtkWriter.Mode.EDGE,
                                             require_all_columns=True,
                                             prohibit_extra_columns=True,
                                             fill_missing_columns=False,
                                             verbose=verbose,
                                             very_verbose=very_verbose)
            cleanup.callback(kw.close)

            id_count = 0
            if not statistics_only:
                for e in G.edges():
                    sid, oid = e
                    lbl = G.ep[predicate][e]
                    kw.write([
                        G.vp[id_col][sid], lbl, G.vp[id_col][oid],
                        '{}-{}-{}'.format(G.vp[id_col][sid], lbl, id_count)
                    ])
                    id_count += 1
                if verbose:
                    print("%d edges found." % id_count,
                          file=error_file,
                          flush=True)

            id_count = 0
            path_id = 0
            for source_node, target_node in pairs:
                source_ids = find_vertex(G,
                                         prop=G.properties[('v', id_col)],
                                         match=source_node)
                target_ids = find_vertex(G,
                                         prop=G.properties[('v', id_col)],
                                         match=target_node)
                # Only unambiguous endpoints are searched.
                if len(source_ids) != 1 or len(target_ids) != 1:
                    continue

                source_id = source_ids[0]
                target_id = target_ids[0]
                if shortest_path:
                    _all_paths = all_shortest_paths(G,
                                                    source_id,
                                                    target_id,
                                                    edges=True)
                else:
                    _all_paths = all_paths(G,
                                           source_id,
                                           target_id,
                                           cutoff=max_hops,
                                           edges=True)

                for path in _all_paths:
                    node1: str = 'p%d' % path_id
                    for edge_num, an_edge in enumerate(path):
                        # Use the id column's real name rather than assuming 'id'.
                        edge_id = G.properties[('e', id_col_name)][an_edge]
                        kw.write([
                            node1,
                            str(edge_num), edge_id,
                            '{}-{}-{}'.format(node1, edge_num, id_count)
                        ])
                        id_count += 1
                    path_id += 1

            if verbose:
                print("%d paths contining %d edges found." %
                      (path_id, id_count),
                      file=error_file,
                      flush=True)

    except Exception as e:
        raise KGTKException('Error: ' + str(e)) from e
示例#25
0
    def process(self):
        """Concatenate the input files into one output file.

        Every input file is opened first so that the column names can be
        merged and so that edge files and node files are not mixed.  The
        output writer is then opened on the merged column names and the
        rows of each input are copied to it in order, each through a
        shuffle list built for that input's columns.

        All readers, and the writer once it is open, are closed when the
        method exits, whether it returns normally or raises.

        Raises:
            ValueError: If standard input ("-") is listed more than once,
                if an edge file and a node file are both supplied, if the
                number of replacement output column names does not match
                the number of merged columns, or if a reader has no file
                path.
        """
        kmc: KgtkMergeColumns = KgtkMergeColumns()

        # Is the output file an edge file, a node file, or unknown?
        is_edge_file: bool = False
        is_node_file: bool = False

        krs: typing.List[KgtkReader] = []
        kr: KgtkReader
        idx: int

        if self.verbose:
            print("Starting kgtkcat pid=%d" % (os.getpid()),
                  file=self.error_file,
                  flush=True)

        if self.verbose:
            print("Opening the %d input files." % len(self.input_file_paths),
                  file=self.error_file,
                  flush=True)

        try:
            saw_stdin: bool = False
            input_file_path: Path
            for idx, input_file_path in enumerate(self.input_file_paths):
                if str(input_file_path) == "-":
                    if saw_stdin:
                        raise ValueError("Duplicate standard input file %d" %
                                         (idx + 1))
                    # Remember that stdin is taken so a second "-" is rejected.
                    saw_stdin = True
                    if self.verbose:
                        print("Opening file %d: standard input" % (idx + 1),
                              file=self.error_file,
                              flush=True)
                else:
                    if self.verbose:
                        print("Opening file %d: %s" %
                              (idx + 1, str(input_file_path)),
                              file=self.error_file,
                              flush=True)

                kr = KgtkReader.open(
                    input_file_path,
                    who="input " + str(idx + 1),
                    options=self.reader_options,
                    value_options=self.value_options,
                    error_file=self.error_file,
                    verbose=self.verbose,
                    very_verbose=self.very_verbose,
                )
                krs.append(kr)

                # Unless directed otherwise, do not merge edge files with node
                # files.  If options.mode == KgtkReaderMode.NONE, then neither
                # kr.is_edge_file nor kr.is_node_file will be set and the
                # consistency check will be skipped.
                if kr.is_edge_file:
                    if is_node_file:
                        raise ValueError(
                            "Cannot merge an edge file to a node file: %s" %
                            input_file_path)
                    if not is_edge_file and self.verbose:
                        print("The output file will be an edge file.",
                              file=self.error_file,
                              flush=True)
                    is_edge_file = True
                elif kr.is_node_file:
                    if is_edge_file:
                        raise ValueError(
                            "Cannot merge a node file to an edge file: %s" %
                            input_file_path)
                    if not is_node_file and self.verbose:
                        print("The output file will be an node file.",
                              file=self.error_file,
                              flush=True)
                    is_node_file = True

                if self.verbose or self.very_verbose:
                    print("Mapping the %d column names in %s." %
                          (len(kr.column_names), input_file_path),
                          file=self.error_file,
                          flush=True)
                if self.very_verbose:
                    print(" ".join(kr.column_names),
                          file=self.error_file,
                          flush=True)
                new_column_names: typing.List[str] = kmc.merge(
                    kr.column_names)
                if self.very_verbose:
                    print(" ".join(new_column_names),
                          file=self.error_file,
                          flush=True)

            if self.verbose or self.very_verbose:
                print("There are %d merged columns." % len(kmc.column_names),
                      file=self.error_file,
                      flush=True)
            if self.very_verbose:
                print(" ".join(kmc.column_names),
                      file=self.error_file,
                      flush=True)

            if self.output_column_names is not None:
                if self.verbose:
                    print("There are %d new output column names." %
                          len(self.output_column_names),
                          file=self.error_file,
                          flush=True)
                if len(self.output_column_names) != len(kmc.column_names):
                    raise ValueError(
                        "There are %d merged columns, but %d output column names."
                        % (len(kmc.column_names),
                           len(self.output_column_names)))

            output_mode: KgtkWriter.Mode = KgtkWriter.Mode.NONE
            if is_edge_file:
                output_mode = KgtkWriter.Mode.EDGE
                if self.verbose:
                    print("Opening the output edge file: %s" %
                          str(self.output_path),
                          file=self.error_file,
                          flush=True)
            elif is_node_file:
                output_mode = KgtkWriter.Mode.NODE
                if self.verbose:
                    print("Opening the output node file: %s" %
                          str(self.output_path),
                          file=self.error_file,
                          flush=True)
            else:
                if self.verbose:
                    print("Opening the output file: %s" %
                          str(self.output_path),
                          file=self.error_file,
                          flush=True)

            ew: KgtkWriter = KgtkWriter.open(
                kmc.column_names,
                self.output_path,
                require_all_columns=False,
                prohibit_extra_columns=True,
                fill_missing_columns=True,
                use_mgzip=self.reader_options.use_mgzip,  # Hack!
                mgzip_threads=self.reader_options.mgzip_threads,  # Hack!
                gzip_in_parallel=False,
                mode=output_mode,
                output_format=self.output_format,
                output_column_names=self.output_column_names,
                old_column_names=self.old_column_names,
                new_column_names=self.new_column_names,
                verbose=self.verbose,
                very_verbose=self.very_verbose)

            try:
                output_data_lines: int = 0
                for idx, kr in enumerate(krs):
                    if kr.file_path is None:
                        # This shouldn't happen because we constrained all
                        # input_file_path elements to be not None.  However,
                        # checking here keeps mypy happy.
                        #
                        # TODO: throw a better exception.
                        raise ValueError("Missing file path.")
                    input_file_path = kr.file_path
                    if self.verbose:
                        print("Copying data from file %d: %s" %
                              (idx + 1, input_file_path),
                              file=self.error_file,
                              flush=True)

                    shuffle_list: typing.List[int] = ew.build_shuffle_list(
                        kmc.new_column_name_lists[idx])

                    input_data_lines: int = 0
                    row: typing.List[str]
                    for row in kr:
                        input_data_lines += 1
                        output_data_lines += 1
                        ew.write(row, shuffle_list=shuffle_list)

                    # Flush the output file so far:
                    ew.flush()

                    if self.verbose:
                        print("Read %d data lines from file %d: %s" %
                              (input_data_lines, idx + 1, input_file_path),
                              file=self.error_file,
                              flush=True)

                if self.verbose:
                    print("Wrote %d lines total from %d files" %
                          (output_data_lines, len(krs)),
                          file=self.error_file,
                          flush=True)
            finally:
                # The writer is closed before the readers, as on the
                # original success path.
                ew.close()
        finally:
            # Close every reader that was opened, on success and on failure.
            for kr2 in krs:
                kr2.close()
示例#26
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file to the output, building IDs with a KgtkIdBuilder.

    Args:
        input_file: The KGTK file to read.
        output_file: The KGTK file to write.
        errors_to_stdout: When true, messages go to stdout instead of stderr.
        errors_to_stderr: Not used in this function.
        show_options: When true, print the resolved options before processing.
        verbose: Passed through to the reader and the writer.
        very_verbose: Passed through to the reader and the writer.
        **kwargs: ID builder, reader, and value options.

    Returns:
        0 on success.

    Raises:
        KGTKException: If processing raises ``SystemExit`` or any other
            exception; the original exception is attached as the cause.
    """
    # import modules locally
    from contextlib import closing
    from pathlib import Path
    import sys
    import typing

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(
        kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:

        # First create the KgtkReader.  It provides parameters used by the ID
        # column builder. Next, create the ID column builder, which provides a
        # possibly revised list of column names for the KgtkWriter.  Create
        # the KgtkWriter.  Last, process the data stream.
        #
        # closing() releases the reader and the writer even when a later
        # step fails; the writer is closed before the reader.

        # Open the input file.
        with closing(
                KgtkReader.open(
                    input_kgtk_file,
                    error_file=error_file,
                    options=reader_options,
                    value_options=value_options,
                    verbose=verbose,
                    very_verbose=very_verbose,
                )) as kr:

            # Create the ID builder.
            idb: KgtkIdBuilder = KgtkIdBuilder.new(kr, idbuilder_options)

            # Open the output file.
            with closing(
                    KgtkWriter.open(idb.column_names,
                                    output_kgtk_file,
                                    mode=KgtkWriter.Mode[kr.mode.name],
                                    require_all_columns=True,
                                    prohibit_extra_columns=True,
                                    fill_missing_columns=False,
                                    gzip_in_parallel=False,
                                    verbose=verbose,
                                    very_verbose=very_verbose)) as ew:

                # Process the input file, building IDs.
                idb.process(kr, ew)

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e
示例#27
0
    def process(self):

        # Open the input file.
        if self.verbose:
            if self.input_file_path is not None:
                print("Opening the input file: %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            else:
                print("Reading the input data from stdin",
                      file=self.error_file,
                      flush=True)

        kr: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            options=self.reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )
        self.id_column_idx = kr.id_column_idx

        # If requested, create the ID column builder.
        # Assemble the list of output column names.
        output_column_names: typing.List[str]
        idb: typing.Optional[KgtkIdBuilder] = None
        if self.build_id:
            if self.idbuilder_options is None:
                raise ValueError(
                    "ID build requested but ID builder options are missing")
            idb = KgtkIdBuilder.new(kr, self.idbuilder_options)
            output_column_names = idb.column_names
        else:
            output_column_names = kr.column_names

        # Build the list of key column edges:
        key_idx_list: typing.List[int] = []

        if len(self.key_column_names) == 0:
            if kr.is_edge_file:
                # Add the KGTK edge file required columns.
                key_idx_list.append(kr.node1_column_idx)
                key_idx_list.append(kr.label_column_idx)
                key_idx_list.append(kr.node2_column_idx)
                if not self.compact_id and kr.id_column_idx >= 0:
                    key_idx_list.append(kr.id_column_idx)

            elif kr.is_node_file:
                # Add the KGTK node file required column:
                key_idx_list.append(kr.id_column_idx)

            else:
                raise ValueError(
                    "The input file is neither an edge nor a node file.  Key columns must be supplied."
                )

        else:
            # Append columns to the list of key column indices,
            # silently removing duplicates, but complaining about unknown names.
            #
            # TODO: warn about duplicates?
            column_name: str
            for column_name in self.key_column_names:
                if column_name not in kr.column_name_map:
                    raise ValueError("Column %s is not in the input file" %
                                     (repr(column_name)))
                key_idx: int = kr.column_name_map[column_name]
                if key_idx not in key_idx_list:
                    key_idx_list.append(key_idx)

        if self.verbose:
            print("key indexes: %s" %
                  " ".join([str(idx) for idx in key_idx_list]),
                  file=self.error_file,
                  flush=True)

        self.keep_first_idx_list.clear()
        if len(self.keep_first_names) > 0:
            keep_first_name: str
            for keep_first_name in self.keep_first_names:
                if keep_first_name not in kr.column_name_map:
                    raise ValueError(
                        "Keep first column %s is not in the input file" %
                        (repr(keep_first_name)))
                keep_first_idx: int = kr.column_name_map[keep_first_name]
                if keep_first_idx in key_idx_list:
                    raise ValueError(
                        "Keep first column %s may not be a key column" %
                        (repr(keep_first_name)))
                self.keep_first_idx_list.append(keep_first_idx)
            if self.verbose:
                print("keep first indexes: %s" %
                      " ".join([str(idx) for idx in self.keep_first_idx_list]),
                      file=self.error_file,
                      flush=True)

        if self.deduplicate:
            if self.compact_id and kr.id_column_idx >= 0 and kr.id_column_idx not in self.keep_first_idx_list:
                self.keep_first_idx_list.append(kr.id_column_idx)

            # Any columns that aren't in the keep_first list and aren't
            # already in key_idx_list will be appended to key_idx_list:
            idx: int
            for idx in range(kr.column_count):
                if idx not in self.keep_first_idx_list and idx not in key_idx_list:
                    key_idx_list.append(idx)

            if self.verbose:
                print("revised key indexes: %s" %
                      " ".join([str(idx) for idx in key_idx_list]),
                      file=self.error_file,
                      flush=True)

        if self.verbose:
            key_idx_list_str: typing.List[str] = []
            for key_idx in key_idx_list:
                key_idx_list_str.append(str(key_idx))
            print("key indexes: %s" % " ".join(key_idx_list_str),
                  file=self.error_file,
                  flush=True)

        # Open the output file.
        ew: KgtkWriter = KgtkWriter.open(
            output_column_names,
            self.output_file_path,
            mode=kr.mode,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            use_mgzip=self.reader_options.use_mgzip,  # Hack!
            mgzip_threads=self.reader_options.mgzip_threads,  # Hack!
            gzip_in_parallel=False,
            verbose=self.verbose,
            very_verbose=self.very_verbose)

        # Open the optional list output file.
        lew: typing.Optional[KgtkWriter] = None
        if self.list_output_file_path is not None:
            lew = KgtkWriter.open(
                output_column_names,
                self.list_output_file_path,
                mode=kr.mode,
                require_all_columns=False,
                prohibit_extra_columns=True,
                fill_missing_columns=True,
                use_mgzip=self.reader_options.use_mgzip,  # Hack!
                mgzip_threads=self.reader_options.mgzip_threads,  # Hack!
                gzip_in_parallel=False,
                verbose=self.verbose,
                very_verbose=self.very_verbose)

        input_line_count: int = 0
        row: typing.List[str] = []
        input_key: str
        prev_input_key: typing.Optional[str] = None
        going_up: typing.Optional[bool] = None
        if self.sorted_input:
            if self.verbose:
                print("Reading the input data from %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            for row in kr:
                input_line_count += 1
                input_key = self.build_key(row, key_idx_list)
                if self.verify_sort:
                    if prev_input_key is None:
                        prev_input_key = input_key
                    else:
                        if going_up is None:
                            if prev_input_key < input_key:
                                going_up = True
                                prev_input_key = input_key
                            elif prev_input_key > input_key:
                                going_up = False
                                prev_input_key = input_key
                            else:
                                pass  # No change in input key
                        elif going_up:
                            if prev_input_key < input_key:
                                prev_input_key = input_key
                            elif prev_input_key > input_key:
                                raise ValueError(
                                    "Line %d sort violation going up: prev='%s' curr='%s'"
                                    % (input_line_count,
                                       prev_input_key.replace(
                                           self.field_separator,
                                           KgtkFormat.LIST_SEPARATOR),
                                       input_key.replace(
                                           self.field_separator,
                                           KgtkFormat.LIST_SEPARATOR)))
                            else:
                                pass  # No change in input_key
                        else:
                            if prev_input_key > input_key:
                                prev_input_key = input_key
                            elif prev_input_key < input_key:
                                raise ValueError(
                                    "Line %d sort violation going down: prev='%s' curr='%s'"
                                    % (input_line_count,
                                       prev_input_key.replace(
                                           self.field_separator,
                                           KgtkFormat.LIST_SEPARATOR),
                                       input_key.replace(
                                           self.field_separator,
                                           KgtkFormat.LIST_SEPARATOR)))
                            else:
                                pass  # No change in input_key

                self.process_row(input_key, row, input_line_count, idb, ew,
                                 lew)

        else:
            if self.verbose:
                print("Sorting the input data from %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            # Map key values to lists of input and output data.
            input_map: typing.MutableMapping[
                str, typing.List[typing.List[str]]] = {}

            for row in kr:
                input_line_count += 1
                input_key = self.build_key(row, key_idx_list)
                if input_key in input_map:
                    # Append the row to an existing list for that key.
                    input_map[input_key].append(row)
                else:
                    # Create a new list of rows for this key.
                    input_map[input_key] = [row]

            if self.verbose:
                print("Processing the sorted input data",
                      file=self.error_file,
                      flush=True)

            for input_key in sorted(input_map.keys()):
                for row in input_map[input_key]:
                    self.process_row(input_key, row, input_line_count, idb, ew,
                                     lew)

        # Flush the final row, if any.  We pass the last row read for
        # feedback, such as an ID uniqueness violation.
        self.process_row("", row, input_line_count, idb, ew, lew, flush=True)

        if self.verbose:
            print("Read %d records, excluded %d records, wrote %d records." %
                  (input_line_count, self.excluded_row_count,
                   self.output_line_count),
                  file=self.error_file,
                  flush=True)
            if lew is not None:
                print("Wrote %d list ouput records." %
                      (self.list_output_line_count),
                      file=self.error_file,
                      flush=True)

        ew.close()
        if lew is not None:
            lew.close()
示例#28
0
    def process(self):
        """Convert every configured input file into one KGTK edge file.

        Each input line is handed to ``self.parse()``; the three fields of a
        valid row are then passed through ``self.convert_and_validate()``.
        Rows whose three fields all validate are written with
        ``self.write_row()``.  Every other line is counted as a reject and,
        when a reject file was requested, copied to it unchanged.

        ``self.save_namespaces()`` is called once after each input file,
        whether the data came from a named file or from standard input.

        A path of "-" selects standard input (input files) or standard
        output (reject file).  Those two streams are never closed here; any
        file this method opens itself is closed even if processing fails.
        """
        output_column_names: typing.List[str]
        if self.build_id and self.idbuilder_options is not None:
            self.idbuilder = KgtkIdBuilder.from_column_names(
                self.COLUMN_NAMES, self.idbuilder_options)
            output_column_names = self.idbuilder.column_names
        else:
            output_column_names = self.COLUMN_NAMES

        if self.verbose:
            print("Opening output file %s" % str(self.output_file_path),
                  file=self.error_file,
                  flush=True)
        # Open the output file.
        ew: KgtkWriter = KgtkWriter.open(output_column_names,
                                         self.output_file_path,
                                         mode=KgtkWriter.Mode.EDGE,
                                         require_all_columns=False,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=True,
                                         gzip_in_parallel=False,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose)

        rw: typing.Optional[typing.TextIO] = None
        # True only when the reject file was opened here (i.e. it is not
        # sys.stdout), so that we know whether we are allowed to close it.
        close_reject_file: bool = False
        try:
            if self.reject_file_path is not None:
                if self.verbose:
                    print("Opening reject file %s" %
                          str(self.reject_file_path),
                          file=self.error_file,
                          flush=True)
                # Open the reject file. Since the input data is not in KGTK
                # format, we use an ordinary file here.
                if str(self.reject_file_path) == "-":
                    rw = sys.stdout
                else:
                    rw = open(self.reject_file_path, "wt")
                    close_reject_file = True

            total_input_line_count: int = 0
            reject_line_count: int = 0

            namespace_line_count: int = self.get_initial_namespaces()

            input_file_path: str
            for input_file_path in self.input_file_paths:
                input_line_count: int = 0

                if self.local_namespace_use_uuid or self.namespace_id_use_uuid or self.newnode_use_uuid:
                    if self.override_uuid is not None:
                        self.local_namespace_uuid = self.override_uuid  # for debugging
                    else:
                        # Generate a new local namespace UUID.
                        self.local_namespace_uuid = shortuuid.uuid()

                # Open the input file.
                if self.verbose:
                    print("Opening the input file: %s" % input_file_path,
                          file=self.error_file,
                          flush=True)
                # Decide once, using the same str() test for opening and
                # closing, so a Path("-") is never mistaken for a real file.
                use_stdin: bool = str(input_file_path) == "-"
                infile: typing.TextIO
                if use_stdin:
                    infile = sys.stdin
                else:
                    infile = open(input_file_path, 'rt')

                try:
                    line: str
                    for line in infile:
                        input_line_count += 1
                        total_input_line_count += 1

                        row: typing.List[str]
                        valid: bool
                        row, valid = self.parse(line, input_line_count)
                        if not valid:
                            if rw is not None:
                                rw.write(line)
                            reject_line_count += 1
                            continue

                        node1: str
                        ok_1: bool
                        node1, ok_1 = self.convert_and_validate(
                            row[0], input_line_count, ew)

                        label: str
                        ok_2: bool
                        label, ok_2 = self.convert_and_validate(
                            row[1], input_line_count, ew)

                        node2: str
                        ok_3: bool
                        node2, ok_3 = self.convert_and_validate(
                            row[2], input_line_count, ew)

                        if ok_1 and ok_2 and ok_3:
                            self.write_row(ew, node1, label, node2)
                        else:
                            if rw is not None:
                                rw.write(line)
                            reject_line_count += 1
                finally:
                    # Never close sys.stdin; only files opened above.
                    if not use_stdin:
                        infile.close()

                # Save the namespaces regardless of where the input came
                # from (previously this was skipped for standard input).
                self.save_namespaces(ew)

            if self.verbose:
                print("Processed %d known namespaces." %
                      (namespace_line_count),
                      file=self.error_file,
                      flush=True)
                print("Processed %d records." % (total_input_line_count),
                      file=self.error_file,
                      flush=True)
                print("Rejected %d records." % (reject_line_count),
                      file=self.error_file,
                      flush=True)
                print("Wrote %d records." % (self.output_line_count),
                      file=self.error_file,
                      flush=True)
        finally:
            # Close the writer first, then the reject file, and make sure the
            # reject file is still closed if closing the writer fails.
            try:
                ew.close()
            finally:
                if rw is not None and close_reject_file:
                    rw.close()
示例#29
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        undirected: bool,
        compute_degrees: bool,
        compute_pagerank: bool,
        compute_hits: bool,
        log_file: str,
        statistics_only: bool,
        vertex_in_degree: str,
        vertex_out_degree: str,
        vertex_pagerank: str,
        vertex_auth: str,
        vertex_hubs: str,
        top_n: int,
        errors_to_stdout: bool,
        errors_to_stderr: bool,
        show_options: bool,
        verbose: bool,
        very_verbose: bool,
        **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    """Load a KGTK edge file as a graph, log statistics, and write results.

    The input file is read with ``KgtkReader`` and turned into a graph by
    ``load_graph_from_kgtk`` using the node1 and node2 columns as edge
    endpoints.  A summary is written to ``log_file`` and a KGTK edge file
    with the columns node1, label, node2 and id is written to
    ``output_file``.

    Args:
        input_file: Input file argument resolved through
            ``KGTKArgumentParser.get_input_file``.
        output_file: Output file argument resolved through
            ``KGTKArgumentParser.get_output_file``.
        undirected: When true the graph is loaded as undirected.
        compute_degrees: Log mean/std/max degree for the 'in', 'out' and
            'total' directions.
        compute_pagerank: Compute PageRank, store it as a vertex property
            and log the ``top_n`` entries.
        compute_hits: Compute HITS hubs/authorities, store them as vertex
            properties and log the ``top_n`` entries of each.
        log_file: Path of the text log, opened for writing.
        statistics_only: When true the graph's edges are not copied to the
            output; the per-vertex rows are still written.
        vertex_in_degree: Label used for the in-degree output rows.
        vertex_out_degree: Label used for the out-degree output rows.
        vertex_pagerank: Label used for the PageRank output rows.
        vertex_auth: Label used for the HITS authority output rows.
        vertex_hubs: Label used for the HITS hub output rows.
        top_n: Number of entries requested from ``get_topn_indices``.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Not referenced in this function.
        show_options: Not referenced in this function.
        verbose: Print progress messages.
        very_verbose: Passed through to the reader and writer.
        **kwargs: Source for ``KgtkReaderOptions`` and ``KgtkValueOptions``.

    Raises:
        KGTKException: Wraps any exception raised while processing,
            including the one raised when a required column is missing.
    """
    # import modules locally
    from contextlib import ExitStack
    from pathlib import Path
    import sys

    from graph_tool import centrality
    from kgtk.exceptions import KGTKException
    import kgtk.gt.analysis_utils as gtanalysis
    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Maps the internal vertex property names to the labels to emit.
    v_prop_dict = {
        'vertex_pagerank': vertex_pagerank,
        'vertex_hubs': vertex_hubs,
        'vertex_auth': vertex_auth
    }
    try:
        # The exit stack closes the reader and writer on every exit path.
        # Callbacks run in reverse registration order, so the writer is
        # closed before the reader, as on the original success path.
        with ExitStack() as cleanup:

            # Select where to send error messages, defaulting to stderr.
            error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

            # Build the option structures.
            reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
                kwargs)
            value_options: KgtkValueOptions = KgtkValueOptions.from_dict(
                kwargs)

            input_kgtk_file: Path = KGTKArgumentParser.get_input_file(
                input_file)
            output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
                output_file)

            # hardcoded values useful for the script. Perhaps some of them should be exposed as arguments later
            directions = ['in', 'out', 'total']
            id_col = 'name'
            output_columns = ["node1", "label", "node2", "id"]

            if verbose:
                print('loading the KGTK input file...\n',
                      file=error_file,
                      flush=True)
            kr: KgtkReader = KgtkReader.open(
                input_kgtk_file,
                error_file=error_file,
                options=reader_options,
                value_options=value_options,
                verbose=verbose,
                very_verbose=very_verbose,
            )
            cleanup.callback(kr.close)

            # Report every missing column before giving up.
            sub: int = kr.get_node1_column_index()
            if sub < 0:
                print("Missing node1 (subject) column.",
                      file=error_file,
                      flush=True)
            pred: int = kr.get_label_column_index()
            if pred < 0:
                print("Missing label (predicate) column.",
                      file=error_file,
                      flush=True)
            obj: int = kr.get_node2_column_index()
            if obj < 0:
                print("Missing node2 (object) column",
                      file=error_file,
                      flush=True)
            if sub < 0 or pred < 0 or obj < 0:
                # The reader is closed by the exit stack.
                raise KGTKException("Exiting due to missing columns.")

            predicate: str = kr.column_names[pred]

            G2 = load_graph_from_kgtk(kr,
                                      directed=not undirected,
                                      ecols=(sub, obj),
                                      verbose=verbose,
                                      out=error_file)
            if verbose:
                print('graph loaded! It has %d nodes and %d edges.' %
                      (G2.num_vertices(), G2.num_edges()),
                      file=error_file,
                      flush=True)

            kw: KgtkWriter = KgtkWriter.open(output_columns,
                                             output_kgtk_file,
                                             mode=KgtkWriter.Mode.EDGE,
                                             require_all_columns=True,
                                             prohibit_extra_columns=True,
                                             fill_missing_columns=False,
                                             verbose=verbose,
                                             very_verbose=very_verbose)
            cleanup.callback(kw.close)

            with open(log_file, 'w') as writer:
                writer.write('graph loaded! It has %d nodes and %d edges\n' %
                             (G2.num_vertices(), G2.num_edges()))
                writer.write('\n###Top relations:\n')
                for rel, freq in gtanalysis.get_topN_relations(
                        G2, pred_property=predicate):
                    writer.write('%s\t%d\n' % (rel, freq))

                if compute_degrees:
                    writer.write('\n###Degrees:\n')
                    for direction in directions:
                        degree_data = gtanalysis.compute_node_degree_hist(
                            G2, direction)
                        max_degree = len(degree_data) - 1
                        mean_degree, std_degree = gtanalysis.compute_avg_node_degree(
                            G2, direction)
                        writer.write(
                            '%s degree stats: mean=%f, std=%f, max=%d\n' %
                            (direction, mean_degree, std_degree, max_degree))

                if compute_pagerank:
                    writer.write('\n###PageRank\n')
                    v_pr = G2.new_vertex_property('float')
                    centrality.pagerank(G2, prop=v_pr)
                    G2.properties[('v', 'vertex_pagerank')] = v_pr
                    writer.write('Max pageranks\n')
                    result = gtanalysis.get_topn_indices(
                        G2, 'vertex_pagerank', top_n, id_col)
                    for n_id, n_label, pr in result:
                        writer.write('%s\t%s\t%f\n' % (n_id, n_label, pr))

                if compute_hits:
                    writer.write('\n###HITS\n')
                    # The first element of the result is not used here.
                    _, G2.vp['vertex_hubs'], G2.vp[
                        'vertex_auth'] = gtanalysis.compute_hits(G2)
                    writer.write('HITS hubs\n')
                    main_hubs = gtanalysis.get_topn_indices(
                        G2, 'vertex_hubs', top_n, id_col)
                    for n_id, n_label, hubness in main_hubs:
                        writer.write('%s\t%s\t%f\n' %
                                     (n_id, n_label, hubness))
                    writer.write('HITS auth\n')
                    main_auth = gtanalysis.get_topn_indices(
                        G2, 'vertex_auth', top_n, id_col)
                    for n_id, n_label, authority in main_auth:
                        writer.write('%s\t%s\t%f\n' %
                                     (n_id, n_label, authority))

            # Copy the graph's edges to the output unless only statistics
            # were requested.
            id_count = 0
            if not statistics_only:
                for e in G2.edges():
                    sid, oid = e
                    lbl = G2.ep[predicate][e]
                    kw.write([
                        G2.vp[id_col][sid], lbl, G2.vp[id_col][oid],
                        '{}-{}-{}'.format(G2.vp[id_col][sid], lbl, id_count)
                    ])
                    id_count += 1

            # Per-vertex rows: in-degree, out-degree, then one row for each
            # vertex property other than the identifier itself.
            id_count = 0
            for v in G2.vertices():
                v_id = G2.vp[id_col][v]
                kw.write([
                    v_id, vertex_in_degree,
                    str(v.in_degree()),
                    '{}-{}-{}'.format(v_id, vertex_in_degree, id_count)
                ])
                id_count += 1
                kw.write([
                    v_id, vertex_out_degree,
                    str(v.out_degree()),
                    '{}-{}-{}'.format(v_id, vertex_out_degree, id_count)
                ])
                id_count += 1

                for vprop in G2.vertex_properties.keys():
                    if vprop == id_col:
                        continue
                    kw.write([
                        v_id, v_prop_dict[vprop],
                        str(G2.vp[vprop][v]),
                        '{}-{}-{}'.format(v_id, v_prop_dict[vprop], id_count)
                    ])
                    id_count += 1

    except Exception as e:
        # Chain the original exception so its traceback is preserved.
        raise KGTKException('Error: ' + str(e)) from e
示例#30
0
文件: unique.py 项目: usbader/kgtk
    def process(self):
        """Count the distinct values of one input column and write the counts.

        Rows are read from ``self.input_file_path``.  When
        ``self.where_column_name`` is set, only rows whose value in that
        column is one of ``self.where_values`` are counted.  An empty value
        is replaced by ``self.empty_value``; values that are still empty are
        ignored, and every other value is counted under
        ``self.prefix + value``.

        Output, in sorted value order:
          * "edge" format: one row per value: (value, ``self.label_value``,
            count).
          * "node" format: a single row whose id is ``self.column_name`` and
            which has one column per value holding its count.

        Raises:
            ValueError: if the counted column or the where column is missing
                from the input, if a where column is given without values,
                if a value collides with a node1 column name in node format,
                or if ``self.output_format`` is unknown.
        """
        # Open the input file.
        if self.verbose:
            if self.input_file_path is not None:
                print("Opening the input file: %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            else:
                print("Reading the input data from stdin",
                      file=self.error_file,
                      flush=True)

        kr: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            options=self.reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        if self.column_name not in kr.column_name_map:
            raise ValueError("Column %s is not in the input file" %
                             (self.column_name))
        column_idx: int = kr.column_name_map[self.column_name]

        # where_column_idx stays negative when no where-filter is requested.
        where_column_idx: int = -1
        where_value_set: typing.Set[str] = set()
        if self.where_column_name is not None:
            if self.where_column_name not in kr.column_name_map:
                raise ValueError(
                    "Where column '%s' is not in the input file." %
                    (self.where_column_name))
            where_column_idx = kr.column_name_map[self.where_column_name]
            if self.where_values is None or len(self.where_values) == 0:
                raise ValueError("Where column '%s' but no values to test." %
                                 (self.where_column_name))
            else:
                where_value_set = set(self.where_values)

        if self.verbose:
            print("Counting unique values from the %s column in %s" %
                  (self.column_name, self.input_file_path),
                  file=self.error_file,
                  flush=True)
        input_line_count: int = 0
        skip_line_count: int = 0
        empty_value_count: int = 0

        value_counts: typing.MutableMapping[str, int] = {}

        row: typing.List[str]
        for row in kr:
            input_line_count += 1
            if where_column_idx >= 0:
                if row[where_column_idx] not in where_value_set:
                    skip_line_count += 1
                    continue
            value: str = row[column_idx]
            if len(value) == 0:
                value = self.empty_value
            if len(value) == 0:
                # Still empty after substitution: report it, don't count it.
                empty_value_count += 1
                continue
            value = self.prefix + value
            value_counts[value] = value_counts.get(value, 0) + 1

        if self.verbose:
            print(
                "Read %d records, skipped %d, found %d unique non-empty values, %d empty values."
                % (input_line_count, skip_line_count, len(value_counts),
                   empty_value_count),
                file=self.error_file,
                flush=True)

        # In node mode we can't open the output file until we are done
        # reading the input file, because we need the list of unique values
        # to build the column list.
        output_columns: typing.List[str]
        if self.output_format == "edge":
            output_columns = ["node1", "label", "node2"]
        elif self.output_format == "node":
            output_columns = ["id"]
            for value in sorted(value_counts.keys()):
                # TODO: provide a way to override this check.
                if value in KgtkFormat.NODE1_COLUMN_NAMES:
                    raise ValueError(
                        "Cannot write a KGTK node file with a column named '%s'."
                        % value)
                output_columns.append(value)
        else:
            raise ValueError("Unknown output format %s" %
                             str(self.output_format))

        if self.verbose:
            print("Opening the output file: %s" % self.output_file_path,
                  file=self.error_file,
                  flush=True)

        ew: KgtkWriter = KgtkWriter.open(output_columns,
                                         self.output_file_path,
                                         require_all_columns=False,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=True,
                                         gzip_in_parallel=False,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose)

        if self.output_format == "edge":
            for value in sorted(value_counts.keys()):
                ew.write([value, self.label_value, str(value_counts[value])])
        elif self.output_format == "node":
            # The counts are appended in the same sorted order that was used
            # to build the column list above.
            output_row: typing.List[str] = [self.column_name]
            for value in sorted(value_counts.keys()):
                output_row.append(str(value_counts[value]))
            ew.write(output_row)
        else:
            raise ValueError("Unknown output format %s" %
                             str(self.output_format))

        ew.close()