Пример #1
0
def run(filename, english_only, sort):
    """Write the edges of a tab-separated assertions file to stdout.

    Every input row is indexed as ``assertion, rel, subj, obj, metadata``
    and re-emitted as ``subj <TAB> rel <TAB> obj`` after a
    ``node1 / label / node2`` header line.

    Args:
        filename: Path of the tab-separated input file.
        english_only: When true, only rows whose subject and object both
            start with ``/c/en/`` are written.
        sort: Accepted for interface compatibility; this function does not
            use it.

    Any exception raised while reading or writing is handed to
    ``kgtk_exception_auto_handler``.
    """
    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import csv

    def row_to_edge(row):
        """Join the fields of one edge with tabs and terminate the line."""
        return '\t'.join(row) + '\n'

    try:
        out_columns = ['node1', 'label', 'node2']
        # newline='' leaves line-ending handling to the csv module, as its
        # documentation requires for file objects passed to csv.reader.
        with open(filename, 'r', newline='') as f:
            reader = csv.reader(f, delimiter='\t', quotechar='"')
            sys.stdout.write(row_to_edge(out_columns))
            for row in reader:
                # Reorder (rel, subj, obj) into (node1, label, node2).
                new_row = [row[2], row[1], row[3]]
                if not english_only or (new_row[0].startswith('/c/en/')
                                        and new_row[2].startswith('/c/en/')):
                    sys.stdout.write(row_to_edge(new_row))

    except Exception as e:
        kgtk_exception_auto_handler(e)
Пример #2
0
def run():
    """Dump WordNet relations as tab-separated edges on stdout.

    A header row is written first.  Then, for every synset returned by
    ``wn.all_synsets()``, one row is written per hypernym (``/r/IsA``),
    member holonym and part holonym (both ``/r/PartOf``) and substance
    meronym (``/r/MadeOf``).  Rows are grouped by relation kind in that
    order.  Any exception is handed to ``kgtk_exception_auto_handler``.
    """

    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import json
    import nltk
    from nltk.corpus import wordnet as wn

    def header_to_edge(row):
        # Header names use ';' wherever the column list uses '_'.
        return '\t'.join(name.replace('_', ';') for name in row) + '\n'

    def names_of(synsets):
        # Names of the given synsets, in the order supplied.
        return [related.name() for related in synsets]

    def obtain_wordnet_lemmas(syn):
        return [name.replace('_', ' ') for name in syn.lemma_names()]

    def obtain_hypernyms(syn):
        return names_of(syn.hypernyms())

    def obtain_member_holonyms(syn):
        return names_of(syn.member_holonyms())

    def obtain_part_holonyms(syn):
        return names_of(syn.part_holonyms())

    def obtain_substance_meronyms(syn):
        return names_of(syn.substance_meronyms())

    def get_wn_data():
        # Collect the '|'-joined lemma labels of every synset, plus one
        # mapping per relation holding only synsets with at least one link.
        all_labels = {}
        all_hyps = {}
        all_members = {}
        all_parts = {}
        all_subs = {}
        relation_tables = (
            (all_hyps, obtain_hypernyms),
            (all_members, obtain_member_holonyms),
            (all_parts, obtain_part_holonyms),
            (all_subs, obtain_substance_meronyms),
        )
        for syn in list(wn.all_synsets()):
            key = syn.name()
            all_labels[key] = '|'.join(obtain_wordnet_lemmas(syn))
            for table, extract in relation_tables:
                found = extract(syn)
                if found:
                    table[key] = found

        return all_labels, all_hyps, all_members, all_parts, all_subs

    def create_edges(data, labels, rel, rel_label):
        # Build one output row per (source, target) pair in ``data``.
        is_a = rel == '/r/IsA'
        edges = []
        for source, targets in data.items():
            for target in targets:
                source_labels = labels[source]
                target_labels = labels[target]
                # The first lemma serves as the preferred label.
                source_pref = source_labels.split('|')[0]
                target_pref = target_labels.split('|')[0]
                if is_a:
                    question = 'What is %s?' % source_pref
                else:
                    question = '%s %s what?' % (source_pref, rel_label)
                edges.append([
                    source, rel, target, source_labels, rel_label,
                    target_labels, "", "WN", "1.0", "",
                    ' '.join((source_pref, rel_label, target_pref)),
                    question.capitalize(),
                ])
        return edges

    try:
        sys.stdout.write(header_to_edge([
            'node1', 'label', 'node2', 'node1_label', 'label_label',
            'node2_label', 'label_dimension', 'source', 'weight', 'creator',
            'sentence', 'question'
        ]))

        all_labels, all_hyps, all_members, all_parts, all_subs = get_wn_data()
        relation_specs = (
            (all_hyps, '/r/IsA', 'is a'),
            (all_members, '/r/PartOf', 'is a part of'),
            (all_parts, '/r/PartOf', 'is a part of'),
            (all_subs, '/r/MadeOf', 'is made of'),
        )
        # All edges are built before any is written, so a failure while
        # building leaves only the header on stdout.
        all_edges = []
        for table, rel, rel_label in relation_specs:
            all_edges += create_edges(table, all_labels, rel, rel_label)

        emit = sys.stdout.write
        for edge in all_edges:
            emit('\t'.join(edge) + '\n')

    except Exception as e:
        kgtk_exception_auto_handler(e)
Пример #3
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        pattern: str,
        subj_col: typing.Optional[str],
        pred_col: typing.Optional[str],
        obj_col: typing.Optional[str],
        or_pattern: bool,
        invert: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Filter the rows of a KGTK file by subject, predicate and object values.

    ``pattern`` has three sections separated by semicolons (subject;
    predicate; object).  Each section is a comma-separated list of accepted
    values; an empty section applies no filter to that column.

    By default a row is kept only when every non-empty section matches.
    With ``or_pattern`` a row is kept when at least one non-empty section
    matches.  ``invert`` swaps the kept and rejected sets.  Kept rows go to
    ``output_file``; rejected rows go to ``reject_file`` when one is given
    and are otherwise dropped.

    Args:
        input_file: The KGTK input file selection.
        output_file: The KGTK output file selection.
        reject_file: Optional KGTK file selection for rejected rows.
        pattern: The three-section filter pattern described above.
        subj_col: Optional subject column name override.
        pred_col: Optional predicate column name override.
        obj_col: Optional object column name override.
        or_pattern: Combine the sections with OR instead of AND.
        invert: Invert the sense of the filter.
        errors_to_stdout: Send diagnostics to stdout instead of stderr.
        errors_to_stderr: Accepted for interface compatibility; unused here.
        show_options: Print the effective options to the diagnostic stream.
        verbose: Print progress and summary messages.
        very_verbose: Passed through to the reader and writers.
        **kwargs: Options consumed by KgtkReaderOptions and KgtkValueOptions.

    Returns:
        0 on success; 1 if ``kgtk_exception_auto_handler`` returns after an
        exception.
    """
    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    # The reject path must come from reject_file, not output_file; otherwise
    # rejected rows would be directed at the main output file.
    reject_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            reject_file, who="KGTK reject file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file)
        print("--pattern=%s" % str(pattern), file=error_file)
        if subj_col is not None:
            print("--subj=%s" % str(subj_col), file=error_file)
        if pred_col is not None:
            print("--pred=%s" % str(pred_col), file=error_file)
        if obj_col is not None:
            print("--obj=%s" % str(obj_col), file=error_file)
        print("--or=%s" % str(or_pattern), file=error_file)
        print("--invert=%s" % str(invert), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    def prepare_filter(pattern: str) -> typing.Set[str]:
        """Split one pattern section on commas into a set of non-empty, stripped values."""
        return {
            target
            for target in (piece.strip() for piece in pattern.split(","))
            if target
        }

    try:

        patterns: typing.List[str] = pattern.split(";")
        if len(patterns) != 3:
            print(
                "Error: The pattern must have three sections separated by semicolons (two semicolons total).",
                file=error_file,
                flush=True)
            raise KGTKException("Bad pattern")

        subj_filter: typing.Set[str] = prepare_filter(patterns[0])
        pred_filter: typing.Set[str] = prepare_filter(patterns[1])
        obj_filter: typing.Set[str] = prepare_filter(patterns[2])
        apply_subj_filter: bool = len(subj_filter) > 0
        apply_pred_filter: bool = len(pred_filter) > 0
        apply_obj_filter: bool = len(obj_filter) > 0

        if verbose and not (apply_subj_filter or apply_pred_filter
                            or apply_obj_filter):
            print("Warning: the filter is empty.", file=error_file, flush=True)

        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        subj_idx: int = kr.get_node1_column_index(subj_col)
        pred_idx: int = kr.get_label_column_index(pred_col)
        obj_idx: int = kr.get_node2_column_index(obj_col)

        # Complain about a missing column only when it is needed by the pattern.
        trouble: bool = False
        if subj_idx < 0 and apply_subj_filter:
            trouble = True
            print("Error: Cannot find the subject column '%s'." %
                  kr.get_node1_canonical_name(subj_col),
                  file=error_file,
                  flush=True)
        if pred_idx < 0 and apply_pred_filter:
            trouble = True
            print("Error: Cannot find the predicate column '%s'." %
                  kr.get_label_canonical_name(pred_col),
                  file=error_file,
                  flush=True)
        if obj_idx < 0 and apply_obj_filter:
            trouble = True
            print("Error: Cannot find the object column '%s'." %
                  kr.get_node2_canonical_name(obj_col),
                  file=error_file,
                  flush=True)
        if trouble:
            raise KGTKException("Missing columns.")

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(kr.column_names,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode[kr.mode.name],
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        rw: typing.Optional[KgtkWriter] = None
        if reject_kgtk_file is not None:
            if verbose:
                print("Opening the reject file: %s" % str(reject_kgtk_file),
                      file=error_file,
                      flush=True)
            rw = KgtkWriter.open(kr.column_names,
                                 reject_kgtk_file,
                                 mode=KgtkWriter.Mode[kr.mode.name],
                                 verbose=verbose,
                                 very_verbose=very_verbose)

        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0
        subj_filter_keep_count: int = 0
        pred_filter_keep_count: int = 0
        obj_filter_keep_count: int = 0
        subj_filter_reject_count: int = 0
        pred_filter_reject_count: int = 0
        obj_filter_reject_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            # keep: at least one active section matched.
            # reject: at least one active section failed to match.
            keep: bool = False
            reject: bool = False
            if apply_subj_filter:
                if row[subj_idx] in subj_filter:
                    keep = True
                    subj_filter_keep_count += 1
                else:
                    reject = True
                    subj_filter_reject_count += 1

            if apply_pred_filter:
                if row[pred_idx] in pred_filter:
                    keep = True
                    pred_filter_keep_count += 1
                else:
                    reject = True
                    pred_filter_reject_count += 1

            if apply_obj_filter:
                if row[obj_idx] in obj_filter:
                    keep = True
                    obj_filter_keep_count += 1
                else:
                    reject = True
                    obj_filter_reject_count += 1

            # `not` binds looser than `^`, so the OR branch reads
            # not (keep ^ invert): reject unless some section matched
            # (or, when inverted, unless none did).
            if (not (keep ^ invert)) if or_pattern else (reject ^ invert):
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1
            else:
                kw.write(row)
                output_line_count += 1

        if verbose:
            # Summary lines go to the diagnostic stream, like every other
            # message, so they cannot mix with data written to stdout.
            print("Read %d rows, rejected %d rows, wrote %d rows." %
                  (input_line_count, reject_line_count, output_line_count),
                  file=error_file,
                  flush=True)
            print("Keep counts: subject=%d, predicate=%d, object=%d." %
                  (subj_filter_keep_count, pred_filter_keep_count,
                   obj_filter_keep_count),
                  file=error_file,
                  flush=True)
            print("Reject counts: subject=%d, predicate=%d, object=%d." %
                  (subj_filter_reject_count, pred_filter_reject_count,
                   obj_filter_reject_count),
                  file=error_file,
                  flush=True)

        kw.close()
        if rw is not None:
            rw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
Пример #4
0
def run(input_file: KGTKFiles, english_only):
    """Convert a tab-separated assertions file into labelled edges on stdout.

    Every input row is indexed as ``assertion, rel, subj, obj, metadata``,
    where ``metadata`` is a JSON object.  A header line is written first,
    then one line per accepted row with the node/relation identifiers,
    their derived labels, a fixed source of ``CN`` and the optional
    ``surfaceText`` sentence taken from the metadata.

    Args:
        input_file: The KGTK input file selection.
        english_only: When true, only rows whose subject and object both
            start with ``/c/en/`` are written.

    Any exception is handed to ``kgtk_exception_auto_handler``.
    """

    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import csv
    import json
    import re
    from pathlib import Path
    from kgtk.kgtkformat import KgtkFormat

    def header_to_edge(row):
        """Build the header line; '_' in column names becomes ';'."""
        row = [r.replace('_', ';') for r in row]
        return '\t'.join(row) + '\n'

    def make_node_label(node):
        """Label a node with its fourth '/'-separated component, '_' -> ' '."""
        return KgtkFormat.stringify(node.strip().split('/')[3].replace(
            '_', ' '))

    def split_camel_case(name):
        """Split a CamelCase name into lower-case, space-separated words."""
        splitted = re.sub('([A-Z][a-z]+)', r' \1',
                          re.sub('([A-Z]+)', r' \1', name)).split()
        return ' '.join(splitted).lower()

    def make_rel_label(rel):
        """Label a relation with the split-up last component of its path."""
        return KgtkFormat.stringify(split_camel_case(rel.split('/')[-1]))

    def row_to_edge(row, cols):
        """Render one input row as an output line with the columns in ``cols``."""

        edge = {}
        edge['node1'] = row[2]
        edge['relation'] = row[1]
        edge['node2'] = row[3]
        edge['node1_label'] = make_node_label(row[2])
        edge['node2_label'] = make_node_label(row[3])
        edge['relation_label'] = make_rel_label(row[1])
        edge['relation_dimension'] = ''

        metadata = json.loads(row[4])
        edge['source'] = KgtkFormat.stringify('CN')
        if 'surfaceText' in metadata:
            # Backslashes are dropped from the sentence before stringifying.
            edge['sentence'] = KgtkFormat.stringify(
                metadata['surfaceText'].replace('\\', ''))
        else:
            edge['sentence'] = ''

        edge_list = [edge[col] for col in cols]
        return '\t'.join(edge_list) + '\n'

    try:
        filename: Path = KGTKArgumentParser.get_input_file(input_file)

        out_columns = [
            'node1', 'relation', 'node2', 'node1_label', 'node2_label',
            'relation_label', 'relation_dimension', 'source', 'sentence'
        ]

        # newline='' leaves line-ending handling to the csv module, as its
        # documentation requires for file objects passed to csv.reader.
        with open(filename, 'r', newline='') as f:
            reader = csv.reader(f, delimiter='\t', quotechar='"')
            sys.stdout.write(header_to_edge(out_columns))
            for row in reader:
                if not english_only or (row[2].startswith('/c/en/')
                                        and row[3].startswith('/c/en/')):
                    sys.stdout.write(row_to_edge(row, out_columns))

    except Exception as e:
        kgtk_exception_auto_handler(e)
Пример #5
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        new_edges_file: KGTKFiles,
        base_columns: typing.Optional[typing.List[str]] = None,
        columns_to_lower: typing.Optional[typing.List[str]] = None,
        label_value: str = KgtkLift.DEFAULT_LABEL_SELECT_COLUMN_VALUE,
        lift_separator: str = KgtkLift.DEFAULT_OUTPUT_LIFTED_COLUMN_SEPARATOR,
        lower: bool = False,
        normalize: bool = False,
        deduplicate_new_edges: bool = True,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Remove selected columns from a KGTK file and re-express them as edges.

    Each input row is copied to the output without the selected columns.
    For every selected column with a non-empty value, one new edge is
    produced per item of that value (the value is split as a KGTK list):
    node1 is the row's value in the associated base column, label is the
    label value chosen for the column, and node2 is the item.  New edges
    are written to ``new_edges_file`` when one is given, otherwise to the
    main output file.

    The columns to process are chosen by one of three option patterns:

    1. ``columns_to_lower`` and ``base_columns`` both given: they are
       paired by position.  The label is ``label_value``, except that a
       column paired with the id column while ``normalize`` is set uses
       its own column name as the label.
    2. Only ``columns_to_lower`` given: each name is split at
       ``lift_separator`` into base name and label (requires ``lower``);
       otherwise, with ``normalize``, the base is the id column and the
       label is the column name.
    3. ``columns_to_lower`` not given: every non-key column is examined
       using the same rules as pattern 2, restricted to ``base_columns``
       (defaulting to the key columns found in the file).

    Args:
        input_file: The KGTK input file selection.
        output_file: The KGTK output file selection.
        new_edges_file: Optional KGTK file selection for the new edges.
        base_columns: Optional base column names (see the patterns above).
        columns_to_lower: Optional names of the columns to process.
        label_value: Label used for new edges in pattern 1.
        lift_separator: Separator between base name and label in column names.
        lower: Enable lowering of columns.
        normalize: Enable normalizing of columns against the id column.
        deduplicate_new_edges: Emit each (node1, label, node2) triple once.
        errors_to_stdout: Send diagnostics to stdout instead of stderr.
        errors_to_stderr: Not referenced in this function.
        show_options: Print the effective options to the diagnostic stream.
        verbose: Print progress and summary messages.
        very_verbose: Passed through to the reader and writers.
        **kwargs: Options consumed by KgtkReaderOptions and KgtkValueOptions.

    Returns:
        0 on success; 1 if ``kgtk_exception_auto_handler`` returns after an
        exception.

    Raises:
        KGTKException: Raised directly (outside the handled region) when
            neither ``lower`` nor ``normalize`` is requested.
    """
    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    new_edges_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(new_edges_file,
                                                            who="Label file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if new_edges_kgtk_file is not None:
            print("--label-file=%s" % str(new_edges_kgtk_file),
                  file=error_file)

        if base_columns is not None:
            print("--base-columns=%s" % " ".join(base_columns),
                  file=error_file)
        if columns_to_lower is not None:
            print("--columns-to-lower=%s" % " ".join(columns_to_lower),
                  file=error_file)
        print("--label-value=%s" % label_value, file=error_file)
        print("--lift-separator=%s" % lift_separator, file=error_file)
        print("--lower=%s" % lower, file=error_file)
        print("--normalize=%s" % normalize, file=error_file)
        print("--deduplicate-labels=%s" % deduplicate_new_edges,
              file=error_file)

        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    # NOTE: this check sits outside the try block below, so the exception
    # is raised to the caller directly rather than via the auto handler.
    if not lower and not normalize:
        raise KGTKException(
            "One or both of --lower and --normalize must be requested.")

    try:
        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Map the index of a column being removed to a pair: the index of
        # the base column that supplies the new edge's node1 value, and the
        # label value to use for the new edge.
        lower_map: typing.MutableMapping[int, typing.Tuple[int, str]] = dict()

        node1_column_name: str = kr.get_node1_column_actual_name()
        label_column_name: str = kr.get_label_column_actual_name()
        node2_column_name: str = kr.get_node2_column_actual_name()
        id_column_name: str = kr.get_id_column_actual_name()

        # The key columns (node1, label, node2, id) that are present in the
        # input file.  Key columns are never lowered.
        key_column_names: typing.List[str] = list()
        key_column_idxs: typing.Set[int] = set()

        if node1_column_name != "":
            if verbose:
                print("Node1 column name: %s" % node1_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(node1_column_name)
            key_column_idxs.add(kr.node1_column_idx)

        if label_column_name != "":
            if verbose:
                print("Label column name: %s" % label_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(label_column_name)
            key_column_idxs.add(kr.label_column_idx)

        if node2_column_name != "":
            if verbose:
                print("Node2 column name: %s" % node2_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(node2_column_name)
            key_column_idxs.add(kr.node2_column_idx)

        if id_column_name != "":
            if verbose:
                print("Id column name: %s" % id_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(id_column_name)
            key_column_idxs.add(kr.id_column_idx)
        elif normalize:
            # Normalizing uses the id column as the base column.
            raise KGTKException(
                "--normalize was requested but the ID column was not found.")

        base_name: str
        new_label_value: str
        column_name: str
        idx: int
        # There are three option patterns.

        if columns_to_lower is not None and len(
                columns_to_lower) > 0 and base_columns is not None and len(
                    base_columns) > 0:
            # Pattern 1: len(columns_to_lower) > 0 and len(base_columns) == len(columns_to_lower)
            # columns_to_lower and base_columns are paired. New records use label_value,
            # except when normalizing against the id column, where the column name is used.
            if len(columns_to_lower) != len(base_columns):
                raise KGTKException(
                    "There are %d columns to remove but only %d base columns."
                    % (len(columns_to_lower), len(base_columns)))

            if len(label_value) == 0:
                raise KGTKException("The --label-value must not be empty.")

            for idx, column_name in enumerate(columns_to_lower):
                base_name = base_columns[idx]
                if column_name not in kr.column_names:
                    raise KGTKException(
                        "Column %s is an unknown column, cannot remove it." %
                        repr(column_name))

                if column_name in key_column_names:
                    raise KGTKException(
                        "Column %s is a key column, cannot remove it." %
                        repr(column_name))

                if base_name not in kr.column_names:
                    raise KGTKException(
                        "For column name %s, base name %s is unknown" %
                        (repr(column_name), repr(base_name)))

                if normalize and base_name == id_column_name:
                    # Normalizing: the new edge's label is the column name.
                    lower_map[kr.column_name_map[column_name]] = (
                        kr.column_name_map[base_name], column_name)
                else:
                    if not lower:
                        raise KGTKException(
                            "--lower is not enabled for column %s, base name %s"
                            % (repr(column_name), repr(base_name)))
                    lower_map[kr.column_name_map[column_name]] = (
                        kr.column_name_map[base_name], label_value)

        elif columns_to_lower is not None and len(columns_to_lower) > 0 and (
                base_columns is None or len(base_columns) == 0):
            # Pattern 2: len(columns_to_lower) > 0 and len(base_columns) == 0
            # Each column name is split at the lift separator to determine the base name and label value.
            if len(lift_separator) == 0:
                raise KGTKException("The --lift-separator must not be empty.")

            for idx, column_name in enumerate(columns_to_lower):
                if column_name not in kr.column_names:
                    raise KGTKException(
                        "Column %s is an unknown column, cannot remove it." %
                        repr(column_name))

                if column_name in key_column_names:
                    raise KGTKException(
                        "Column %s is a key column, cannot remove it." %
                        repr(column_name))

                if lower and lift_separator in column_name:
                    # Split only at the first separator; the remainder is the label.
                    base_name, new_label_value = column_name.split(
                        lift_separator, 1)
                    if base_name not in kr.column_names:
                        raise KGTKException(
                            "For column name %s, base name %s is not known" %
                            (repr(column_name), repr(base_name)))

                elif normalize:
                    base_name = id_column_name
                    new_label_value = column_name

                else:
                    raise KGTKException(
                        "Unable to parse column name %s, no separator (%s)." %
                        (repr(column_name), repr(lift_separator)))

                lower_map[kr.column_name_map[column_name]] = (
                    kr.column_name_map[base_name], new_label_value)

        elif columns_to_lower is None or len(columns_to_lower) == 0:
            # Pattern 3: len(columns_to_lower) == 0.
            # Any column that matches a lift pattern against one of the
            # key columns (node1, label, node2, id, or their aliases)
            # will be lowered.
            if len(lift_separator) == 0:
                raise KGTKException("The --lift-separator must not be empty.")

            if base_columns is None or len(base_columns) == 0:
                # The base name list wasn't supplied.  Use [node1, label, node2, id]
                base_columns = list(key_column_names)
                if verbose:
                    print("Using the default base columns: %s" %
                          " ".join(base_columns),
                          file=error_file,
                          flush=True)
            else:
                if verbose:
                    print("Using these base columns: %s" %
                          " ".join(base_columns),
                          file=error_file,
                          flush=True)

            for idx, column_name in enumerate(kr.column_names):
                # Skip the node1, label, node2, and id columns
                if idx in key_column_idxs:
                    if verbose:
                        print("column %s is a key column, skipping." %
                              repr(column_name),
                              file=error_file,
                              flush=True)
                    continue

                # Does this column match a lifting pattern?
                if lower and lift_separator in column_name:
                    base_name, new_label_value = column_name.split(
                        lift_separator, 1)

                    if base_name not in base_columns:
                        if verbose:
                            print(
                                "Column %s contains base name %s, which is not a base column."
                                % (repr(column_name), repr(base_name)),
                                file=error_file,
                                flush=True)
                        continue

                elif normalize:
                    base_name = id_column_name
                    new_label_value = column_name

                else:
                    if verbose:
                        print(
                            "Column %s does not contain the separator %s and not normalizing, skipping."
                            % (repr(column_name), repr(lift_separator)),
                            file=error_file,
                            flush=True)
                    continue

                # This test should be redundant.
                if base_name in kr.column_names:
                    lower_map[idx] = (kr.column_name_map[base_name],
                                      new_label_value)
                else:
                    raise KGTKException(
                        "Base name %s was unexpectedly not found." %
                        repr(base_name))

        if len(lower_map) == 0:
            raise KGTKException("There are no columns to lower or normalize.")

        if verbose:
            print("The following columns will be lowered or normalized",
                  file=error_file,
                  flush=True)
            for idx in sorted(lower_map.keys()):
                column_name = kr.column_names[idx]
                base_idx, new_label_value = lower_map[idx]
                base_name = kr.column_names[base_idx]
                print(" %s from %s (label %s)" %
                      (column_name, base_name, repr(new_label_value)),
                      file=error_file,
                      flush=True)

        # The output keeps every input column that is not being lowered.
        output_column_names: typing.List[str] = list()
        for idx, column_name in enumerate(kr.column_names):
            if idx not in lower_map:
                output_column_names.append(column_name)
        if verbose:
            print("The output columns are: %s" % " ".join(output_column_names),
                  file=error_file,
                  flush=True)

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(
            output_column_names,
            output_kgtk_file,
            mode=KgtkWriter.Mode.EDGE,
            require_all_columns=False,  # Simplifies writing the labels
            verbose=verbose,
            very_verbose=very_verbose)
        # Built from the input column names and passed to kw.write() for
        # every input row below.
        shuffle_list: typing.List[int] = kw.build_shuffle_list(kr.column_names)

        # Optional separate writer for the new edges.
        lkw: typing.Optional[KgtkWriter] = None
        if new_edges_kgtk_file is not None:
            if verbose:
                print("Opening the label output file: %s" %
                      str(new_edges_kgtk_file),
                      file=error_file,
                      flush=True)

            label_column_names = [
                node1_column_name, label_column_name, node2_column_name
            ]
            lkw = KgtkWriter.open(label_column_names,
                                  new_edges_kgtk_file,
                                  mode=KgtkWriter.Mode.EDGE,
                                  verbose=verbose,
                                  very_verbose=very_verbose)

        # Optionally deduplicate the new edges.  The set holds keys of the form
        #  node1_value + KEY_FIELD_SEPARATOR + label_value + KEY_FIELD_SEPARATOR + node2_value
        label_set: typing.Set[str] = set()
        label_key: str

        input_line_count: int = 0
        output_line_count: int = 0
        label_line_count: int = 0
        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            # Copy the row to the output, minus the lowered columns.
            kw.write(row, shuffle_list=shuffle_list)
            output_line_count += 1

            column_idx: int
            for column_idx in lower_map.keys():
                node1_idx: int
                node1_idx, new_label_value = lower_map[column_idx]
                node1_value: str
                node1_value = row[node1_idx]
                if len(node1_value) == 0:
                    continue  # TODO: raise an exception

                item: str = row[column_idx]
                if len(item) == 0:
                    continue  # Ignore empty node2 values.

                # This item might be a KGTK list.  Let's split it, because
                # lists aren't allowed in the node2 values we'll generate.
                node2_value: str
                for node2_value in KgtkValue.split_list(item):
                    if len(node2_value) == 0:
                        continue  # Ignore empty node2 values.

                    if deduplicate_new_edges:
                        label_key = node1_value + KgtkFormat.KEY_FIELD_SEPARATOR + new_label_value + KgtkFormat.KEY_FIELD_SEPARATOR + node2_value
                        if label_key in label_set:
                            continue
                        else:
                            label_set.add(label_key)

                    output_map: typing.Mapping[str, str] = {
                        node1_column_name: node1_value,
                        label_column_name: new_label_value,
                        node2_column_name: node2_value,
                    }
                    if lkw is None:
                        # No separate file: the new edge goes to the main
                        # output and counts as an output row as well.
                        kw.writemap(output_map)
                        label_line_count += 1
                        output_line_count += 1
                    else:
                        lkw.writemap(output_map)
                        label_line_count += 1

        if verbose:
            print("Read %d rows, wrote %d rows with %d labels." %
                  (input_line_count, output_line_count, label_line_count),
                  file=error_file,
                  flush=True)

        kw.close()
        if lkw is not None:
            lkw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
Пример #6
0
def run(input_file: KGTKFiles, attr_syn_file: KGTKFiles):
    """Turn a scene-graph JSON dump into tab-separated edges on stdout.

    Two JSON files are loaded: the scene graphs (a list of images, each
    with 'image_id', 'objects' and 'relationships') and a mapping from
    attribute strings to synset names.  For every image the function
    emits:

    * one edge per (object synset, attribute synset) pair, labelled
      '/r/CapableOf' when the attribute synset's part of speech is 'v'
      and '/r/HasProperty' for any other non-noun part of speech;
    * one '/r/LocatedNear' edge per (subject synset, object synset) pair
      of each relationship, carrying the predicate text as its label.

    Edges whose two synsets are identical are skipped, and duplicate
    edges are written only once per image, in first-seen order.

    Args:
        input_file: KGTK file argument naming the scene-graph JSON file.
        attr_syn_file: KGTK file argument naming the attribute-synset
            JSON file.

    Any exception is passed to kgtk_exception_auto_handler.
    """

    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import json
    from pathlib import Path
    from collections import defaultdict

    out_columns = [
        'node1', 'label', 'node2', 'node1_label', 'label_label', 'node2_label',
        'label_dimension', 'source', 'weight', 'creator', 'sentence',
        'question'
    ]

    proximity_relation = '/r/LocatedNear'
    property_relation = '/r/HasProperty'
    property_relation_label = 'has property'
    capableof_relation = '/r/CapableOf'
    capableof_relation_label = 'capable of'

    def create_edge(node1, node1_lbl, node2, node2_lbl, rel, rel_lbl,
                    image_id):
        """Format one output line; label lists are joined with '|'."""
        my_row = node1, rel, node2, '|'.join(node1_lbl), rel_lbl, '|'.join(
            node2_lbl), '', 'VG', '1.0', 'I' + image_id, '', ''
        return '\t'.join(my_row) + '\n'

    def header_to_edge(row):
        """Format the header line, turning '_' in column names into ';'."""
        row = [r.replace('_', ';') for r in row]
        return '\t'.join(row) + '\n'

    def add_unique(edge_row, rows, seen):
        """Append edge_row to rows unless it was already emitted.

        The companion set makes the duplicate check O(1) while the list
        keeps the original first-seen output order.
        """
        if edge_row not in seen:
            seen.add(edge_row)
            rows.append(edge_row)

    try:
        scene_graph_filename: Path = KGTKArgumentParser.get_input_file(
            input_file)
        attr_synsets_filename: Path = KGTKArgumentParser.get_input_file(
            attr_syn_file)

        with open(scene_graph_filename, 'r') as f:
            images_data = json.load(f)

        with open(attr_synsets_filename, 'r') as f:
            attr_synsets = json.load(f)

        sys.stdout.write(header_to_edge(out_columns))

        for an_image in images_data:

            image_id = str(an_image['image_id'])

            # OBJECTS
            objid2names = defaultdict(list)
            objid2syns = {}
            rows = []
            seen_rows = set()
            for o in an_image['objects']:
                obj_id = o['object_id']
                o_synset = o['synsets']
                objid2syns[obj_id] = o_synset
                for name in o['names']:
                    name = name.strip().lower().rstrip('.')
                    if not name:
                        continue
                    objid2names[obj_id].append(name)

                # ATTRIBUTES
                for attr in o.get('attributes', ()):
                    attr = attr.lower()
                    if attr not in attr_synsets:
                        continue
                    asyn = attr_synsets[attr]
                    # The second dot-separated field of the synset name is
                    # used as its part of speech.
                    apos = asyn.split('.')[1]
                    if apos == 'n':
                        continue
                    if apos == 'v':  # verb
                        rel, rel_lbl = (capableof_relation,
                                        capableof_relation_label)
                    else:  # adjective
                        rel, rel_lbl = (property_relation,
                                        property_relation_label)
                    for osyn in o_synset:
                        if osyn != asyn:
                            add_unique(
                                create_edge(osyn, objid2names[obj_id], asyn,
                                            [attr], rel, rel_lbl, image_id),
                                rows, seen_rows)

            # RELATIONS
            for rel in an_image['relationships']:
                relation_label = rel['predicate'].lower().strip().strip('.')
                sub_id = rel['subject_id']
                sub_names = objid2names[sub_id]
                sub_syns = objid2syns[sub_id]
                obj_id = rel['object_id']
                obj_names = objid2names[obj_id]
                obj_syns = objid2syns[obj_id]

                for ssyn in sub_syns:
                    for osyn in obj_syns:
                        if osyn != ssyn:
                            add_unique(
                                create_edge(ssyn, sub_names, osyn, obj_names,
                                            proximity_relation,
                                            relation_label, image_id),
                                rows, seen_rows)

            sys.stdout.writelines(rows)

    except Exception as e:
        kgtk_exception_auto_handler(e)
Пример #7
0
def run(input_file: KGTKFiles, relation, source, output_file: KGTKFiles):
    """Convert a space-delimited node-pair file into a KGTK edge file.

    Each input row supplies two node identifiers (fields 0 and 1).  Every
    row becomes one edge that uses the caller-supplied relation, with
    both node ids prefixed by the lower-cased source name.

    Args:
        input_file: KGTK file argument naming the space-delimited input.
        relation: Relation identifier written in the 'relation' column;
            its last '/'-separated segment is turned into the label.
        source: Source name; lower-cased for the node prefix and
            stringified for the 'source' column.
        output_file: KGTK file argument naming the output edge file.

    Any exception is passed to kgtk_exception_auto_handler.
    """

    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler
    import csv
    import re
    from pathlib import Path
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkwriter import KgtkWriter

    def make_node_label(node):
        """Stringify the node id with its first three characters dropped."""
        return KgtkFormat.stringify(node[3:])

    def split_camel_case(name):
        """Split a CamelCase name into lower-case, space-separated words."""
        splitted = re.sub('([A-Z][a-z]+)', r' \1',
                          re.sub('([A-Z]+)', r' \1', name)).split()
        return ' '.join(splitted).lower()

    def make_rel_label(rel):
        """Build the relation label from the last path segment of rel."""
        return KgtkFormat.stringify(split_camel_case(rel.split('/')[-1]))

    def row_to_edge(node1, rel, node2, source, cols):
        """Return the edge values as a list ordered like cols."""
        edge = {}
        prefix = source.lower()
        edge['node1'] = prefix + ':' + node1
        edge['relation'] = rel
        edge['node2'] = prefix + ':' + node2
        edge['node1;label'] = make_node_label(node1)
        edge['node2;label'] = make_node_label(node2)
        edge['relation;label'] = make_rel_label(rel)
        edge['relation;dimension'] = ''

        edge['source'] = KgtkFormat.stringify(source)
        edge['sentence'] = ''

        return [edge[col] for col in cols]

    try:
        filename: Path = KGTKArgumentParser.get_input_file(input_file)

        out_columns = [
            'node1', 'relation', 'node2', 'node1;label', 'node2;label',
            'relation;label', 'relation;dimension', 'source', 'sentence'
        ]

        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)
        ew: KgtkWriter = KgtkWriter.open(
            out_columns,
            output_kgtk_file,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            gzip_in_parallel=False,
        )

        try:
            with open(filename, 'r') as f:
                reader = csv.reader(f, delimiter=' ', quotechar='"')
                for row in reader:
                    ew.write(
                        row_to_edge(row[0], relation, row[1], source,
                                    out_columns))
        finally:
            # Clean up even when reading or writing fails, so the output
            # file handle is not leaked.  Errors raised by close() itself
            # still reach the handler below.
            ew.close()

    except Exception as e:
        kgtk_exception_auto_handler(e)
Пример #8
0
def run(input_file: KGTKFiles, relation, source):
    """Write edges built from a space-delimited node-pair file to stdout.

    A header line is written first, then one tab-separated line per
    input row.  Fields 0 and 1 of each row are the two node ids; both
    are prefixed with the lower-cased source name, and the supplied
    relation is used for every edge.

    Args:
        input_file: KGTK file argument naming the space-delimited input.
        relation: Relation identifier placed in the 'relation' column.
        source: Source name used for the node prefix and 'source' column.

    Any exception is passed to kgtk_exception_auto_handler.
    """

    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import csv
    import json
    import re
    from pathlib import Path
    from string import Template
    from kgtk.kgtkformat import KgtkFormat

    def header_to_edge(row):
        # Column names are declared with '_' but emitted with ';'.
        return '\t'.join(r.replace('_', ';') for r in row) + '\n'

    def make_node_label(node):
        # The label is the node id minus its first three characters.
        return KgtkFormat.stringify(node[3:])

    def split_camel_case(name):
        # First put a space before each run of capitals, then before each
        # capitalised word, and finally normalise the whitespace.
        spaced_caps = re.sub('([A-Z]+)', r' \1', name)
        spaced_words = re.sub('([A-Z][a-z]+)', r' \1', spaced_caps)
        return ' '.join(spaced_words.split()).lower()

    def make_rel_label(rel):
        last_segment = rel.split('/')[-1]
        return KgtkFormat.stringify(split_camel_case(last_segment))

    def row_to_edge(node1, rel, node2, source, cols):
        prefix = source.lower()
        edge = {
            'node1': prefix + ':' + node1,
            'relation': rel,
            'node2': prefix + ':' + node2,
            'node1_label': make_node_label(node1),
            'node2_label': make_node_label(node2),
            'relation_label': make_rel_label(rel),
            'relation_dimension': '',
            'source': KgtkFormat.stringify(source),
            'sentence': '',
        }
        return '\t'.join(edge[col] for col in cols) + '\n'

    try:
        filename: Path = KGTKArgumentParser.get_input_file(input_file)

        # Not referenced below; kept as it appeared in the original.
        in_columns = ['assertion', 'rel', 'subj', 'obj', 'metadata']
        out_columns = [
            'node1', 'relation', 'node2', 'node1_label', 'node2_label',
            'relation_label', 'relation_dimension', 'source', 'sentence'
        ]

        with open(filename, 'r') as f:
            pairs = csv.reader(f, delimiter=' ', quotechar='"')
            sys.stdout.write(header_to_edge(out_columns))
            for pair in pairs:
                line = row_to_edge(pair[0], relation, pair[1], source,
                                   out_columns)
                sys.stdout.write(line)

    except Exception as e:
        kgtk_exception_auto_handler(e)
Пример #9
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        pattern: str,
        subj_col: typing.Optional[str],
        pred_col: typing.Optional[str],
        obj_col: typing.Optional[str],
        or_pattern: bool,
        invert: bool,
        show_version: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Filter the rows of a KGTK file by subject, predicate and object.

    The pattern has three sections separated by semicolons
    ("subjects;predicates;objects"); each section is a comma-separated
    list of accepted values and may be empty.  Matching rows are written
    to the output file, and the remaining rows go to the reject file
    when one was requested.

    Args:
        input_file: KGTK file argument for the input.
        output_file: KGTK file argument for the rows that pass.
        reject_file: Optional KGTK file argument for the rejected rows.
        pattern: The three-section filter pattern.
        subj_col: Optional name of the subject column.
        pred_col: Optional name of the predicate column.
        obj_col: Optional name of the object column.
        or_pattern: When true, a row passes if any active section
            matches; otherwise every active section must match.
        invert: When true, the pass/reject decision is inverted.
        show_version: Print the version stamp.
        errors_to_stdout: Send diagnostics to stdout instead of stderr.
        errors_to_stderr: Not read by this function.
        show_options: Print the effective options.
        verbose: Print progress and summary messages.
        very_verbose: Passed through to the reader and the writers.
        **kwargs: Options consumed by KgtkReaderOptions and
            KgtkValueOptions.

    Returns:
        0 on success; 1 if kgtk_exception_auto_handler returns after an
        exception.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            reject_file, who="KGTK reject file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    UPDATE_VERSION: str = "2020-08-06T17:06:06.829542+00:00#Mu9vz3KEPh+beQeSwZ8qGMKrTJzHWZFfZFXY6UrYXJAnNpPSin+5NvkSfxKLMkyJtGyeavgGAz8+74bup7eYaQ=="
    if show_version or verbose:
        print("kgtk filter version: %s" % UPDATE_VERSION,
              file=error_file,
              flush=True)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file)
        print("--pattern=%s" % str(pattern), file=error_file)
        if subj_col is not None:
            print("--subj=%s" % str(subj_col), file=error_file)
        if pred_col is not None:
            print("--pred=%s" % str(pred_col), file=error_file)
        if obj_col is not None:
            print("--obj=%s" % str(obj_col), file=error_file)
        print("--or=%s" % str(or_pattern), file=error_file)
        print("--invert=%s" % str(invert), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    def prepare_filter(pattern: str) -> typing.Set[str]:
        """Parse one pattern section into a set of non-empty targets."""
        filt: typing.Set[str] = set()
        pattern = pattern.strip()
        if len(pattern) == 0:
            return filt

        target: str
        for target in pattern.split(","):
            target = target.strip()
            if len(target) > 0:
                filt.add(target)

        return filt

    # The four single-value filters below are specialised copies of the
    # general filter for the case of exactly one predicate or one object
    # value; each compares a single column against a single string.

    def single_predicate_filter(
        kr: KgtkReader,
        kw: KgtkWriter,
        rw: typing.Optional[KgtkWriter],
        pred_idx: int,
        pred_filter: typing.Set[str],
    ):
        """Keep rows whose predicate equals the single filter value."""
        if verbose:
            print("Applying a single predicate filter",
                  file=error_file,
                  flush=True)

        pred_filter_value: str = next(iter(pred_filter))

        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            if row[pred_idx] == pred_filter_value:
                kw.write(row)
                output_line_count += 1

            else:
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1

        if verbose:
            # Diagnostics go to error_file, like every other message here.
            print("Read %d rows, rejected %d rows, wrote %d rows." %
                  (input_line_count, reject_line_count, output_line_count),
                  file=error_file,
                  flush=True)

    def single_predicate_filter_inverted(
        kr: KgtkReader,
        kw: KgtkWriter,
        rw: typing.Optional[KgtkWriter],
        pred_idx: int,
        pred_filter: typing.Set[str],
    ):
        """Keep rows whose predicate differs from the single filter value."""
        if verbose:
            print("Applying a single predicate filter inverted",
                  file=error_file,
                  flush=True)

        pred_filter_value: str = next(iter(pred_filter))

        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            if row[pred_idx] != pred_filter_value:
                kw.write(row)
                output_line_count += 1

            else:
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1

        if verbose:
            print("Read %d rows, rejected %d rows, wrote %d rows." %
                  (input_line_count, reject_line_count, output_line_count),
                  file=error_file,
                  flush=True)

    def single_object_filter(
        kr: KgtkReader,
        kw: KgtkWriter,
        rw: typing.Optional[KgtkWriter],
        obj_idx: int,
        obj_filter: typing.Set[str],
    ):
        """Keep rows whose object equals the single filter value."""
        if verbose:
            print("Applying a single object filter",
                  file=error_file,
                  flush=True)

        obj_filter_value: str = next(iter(obj_filter))

        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            if row[obj_idx] == obj_filter_value:
                kw.write(row)
                output_line_count += 1

            else:
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1

        if verbose:
            print("Read %d rows, rejected %d rows, wrote %d rows." %
                  (input_line_count, reject_line_count, output_line_count),
                  file=error_file,
                  flush=True)

    def single_object_filter_inverted(
        kr: KgtkReader,
        kw: KgtkWriter,
        rw: typing.Optional[KgtkWriter],
        obj_idx: int,
        obj_filter: typing.Set[str],
    ):
        """Keep rows whose object differs from the single filter value."""
        if verbose:
            print("Applying a single object filter inverted",
                  file=error_file,
                  flush=True)

        obj_filter_value: str = next(iter(obj_filter))

        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            if row[obj_idx] != obj_filter_value:
                kw.write(row)
                output_line_count += 1

            else:
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1

        if verbose:
            print("Read %d rows, rejected %d rows, wrote %d rows." %
                  (input_line_count, reject_line_count, output_line_count),
                  file=error_file,
                  flush=True)

    def general_filter(kr: KgtkReader, kw: KgtkWriter,
                       rw: typing.Optional[KgtkWriter], subj_idx: int,
                       subj_filter: typing.Set[str], pred_idx: int,
                       pred_filter: typing.Set[str], obj_idx: int,
                       obj_filter: typing.Set[str]):
        """Apply any combination of subject, predicate and object filters.

        Only non-empty filters are applied.  Per row, `keep` records that
        at least one applied filter matched and `reject` that at least
        one did not; or_pattern and invert (from the enclosing scope)
        then decide where the row goes.
        """
        if verbose:
            print("Applying a general filter", file=error_file, flush=True)

        apply_subj_filter: bool = len(subj_filter) > 0
        apply_pred_filter: bool = len(pred_filter) > 0
        apply_obj_filter: bool = len(obj_filter) > 0
        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0
        subj_filter_keep_count: int = 0
        pred_filter_keep_count: int = 0
        obj_filter_keep_count: int = 0
        subj_filter_reject_count: int = 0
        pred_filter_reject_count: int = 0
        obj_filter_reject_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            keep: bool = False
            reject: bool = False
            if apply_subj_filter:
                if row[subj_idx] in subj_filter:
                    keep = True
                    subj_filter_keep_count += 1
                else:
                    reject = True
                    subj_filter_reject_count += 1

            if apply_pred_filter:
                if row[pred_idx] in pred_filter:
                    keep = True
                    pred_filter_keep_count += 1
                else:
                    reject = True
                    pred_filter_reject_count += 1

            if apply_obj_filter:
                if row[obj_idx] in obj_filter:
                    keep = True
                    obj_filter_keep_count += 1
                else:
                    reject = True
                    obj_filter_reject_count += 1

            # `not` binds looser than `^`; the parentheses only make the
            # existing evaluation order explicit.
            if (not (keep ^ invert)) if or_pattern else (reject ^ invert):
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1
            else:
                kw.write(row)
                output_line_count += 1

        if verbose:
            print("Read %d rows, rejected %d rows, wrote %d rows." %
                  (input_line_count, reject_line_count, output_line_count),
                  file=error_file,
                  flush=True)
            print("Keep counts: subject=%d, predicate=%d, object=%d." %
                  (subj_filter_keep_count, pred_filter_keep_count,
                   obj_filter_keep_count),
                  file=error_file,
                  flush=True)
            print("Reject counts: subject=%d, predicate=%d, object=%d." %
                  (subj_filter_reject_count, pred_filter_reject_count,
                   obj_filter_reject_count),
                  file=error_file,
                  flush=True)

    try:

        patterns: typing.List[str] = pattern.split(";")
        if len(patterns) != 3:
            print(
                "Error: The pattern must have three sections separated by semicolons (two semicolons total).",
                file=error_file,
                flush=True)
            raise KGTKException("Bad pattern")

        subj_filter: typing.Set[str] = prepare_filter(patterns[0])
        pred_filter: typing.Set[str] = prepare_filter(patterns[1])
        obj_filter: typing.Set[str] = prepare_filter(patterns[2])

        if verbose and len(subj_filter) == 0 and len(pred_filter) == 0 and len(
                obj_filter) == 0:
            print("Warning: the filter is empty.", file=error_file, flush=True)

        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        subj_idx: int = kr.get_node1_column_index(subj_col)
        pred_idx: int = kr.get_label_column_index(pred_col)
        obj_idx: int = kr.get_node2_column_index(obj_col)

        # Complain about a missing column only when it is needed by the pattern.
        trouble: bool = False
        if subj_idx < 0 and len(subj_filter) > 0:
            trouble = True
            print("Error: Cannot find the subject column '%s'." %
                  kr.get_node1_canonical_name(subj_col),
                  file=error_file,
                  flush=True)
        if pred_idx < 0 and len(pred_filter) > 0:
            trouble = True
            print("Error: Cannot find the predicate column '%s'." %
                  kr.get_label_canonical_name(pred_col),
                  file=error_file,
                  flush=True)
        if obj_idx < 0 and len(obj_filter) > 0:
            trouble = True
            print("Error: Cannot find the object column '%s'." %
                  kr.get_node2_canonical_name(obj_col),
                  file=error_file,
                  flush=True)
        if trouble:
            raise KGTKException("Missing columns.")

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(kr.column_names,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode[kr.mode.name],
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        rw: typing.Optional[KgtkWriter] = None
        if reject_kgtk_file is not None:
            if verbose:
                print("Opening the reject file: %s" % str(reject_kgtk_file),
                      file=error_file,
                      flush=True)
            rw = KgtkWriter.open(kr.column_names,
                                 reject_kgtk_file,
                                 mode=KgtkWriter.Mode[kr.mode.name],
                                 verbose=verbose,
                                 very_verbose=very_verbose)

        # Use a specialised loop when exactly one predicate or one object
        # value is being tested; otherwise fall back to the general filter.
        if len(subj_filter) == 0 and len(pred_filter) == 1 and len(
                obj_filter) == 0:
            if invert:
                single_predicate_filter_inverted(kr, kw, rw, pred_idx,
                                                 pred_filter)
            else:
                single_predicate_filter(kr, kw, rw, pred_idx, pred_filter)
        elif len(subj_filter) == 0 and len(pred_filter) == 0 and len(
                obj_filter) == 1:
            if invert:
                single_object_filter_inverted(kr, kw, rw, obj_idx, obj_filter)
            else:
                single_object_filter(kr, kw, rw, obj_idx, obj_filter)
        else:
            general_filter(kr, kw, rw, subj_idx, subj_filter, pred_idx,
                           pred_filter, obj_idx, obj_filter)

        kw.close()
        if rw is not None:
            rw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
Пример #10
0
def run(input_file: KGTKFiles, english_only, output_file: KGTKFiles,
        weights_file: KGTKFiles):
    """Convert a tab-separated assertion dump into a KGTK edge file.

    Each input row is read as (assertion, rel, subj, obj, metadata)
    where metadata is a JSON object.  Every accepted row becomes one
    edge in the output file; when a weights file is given, rows whose
    last field holds a JSON object with a 'weight' key also produce a
    'weight' edge in that file.

    Args:
        input_file: KGTK file argument naming the tab-separated input.
        english_only: When true, only rows whose subject and object both
            start with '/c/en/' are converted.
        output_file: KGTK file argument naming the edge output file.
        weights_file: KGTK file argument naming the weight-edge file;
            falsy to disable weight output.

    Any exception is passed to kgtk_exception_auto_handler.
    """

    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler
    import csv
    import json
    import re
    from pathlib import Path
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkwriter import KgtkWriter

    def make_node_label(node):
        """Stringify the fourth '/'-separated field, '_' shown as spaces."""
        return KgtkFormat.stringify(node.strip().split('/')[3].replace(
            '_', ' '))

    def split_camel_case(name):
        """Split a CamelCase name into lower-case, space-separated words."""
        splitted = re.sub('([A-Z][a-z]+)', r' \1',
                          re.sub('([A-Z]+)', r' \1', name)).split()
        return ' '.join(splitted).lower()

    def make_rel_label(rel):
        """Build the relation label from the last path segment of rel."""
        return KgtkFormat.stringify(split_camel_case(rel.split('/')[-1]))

    def make_weight_edge(row, weight_metadata):
        """Return the [node1, label, node2] weight edge for a row.

        weight_metadata is the already-parsed JSON of the row's last field.
        """
        node1 = '%s-%s-%s-0000' % (row[2], row[1], row[3])
        rel = 'weight'
        node2 = str(weight_metadata['weight'])
        return [node1, rel, node2]

    def row_to_edge(row, cols):
        """Return the edge values for a row as a list ordered like cols."""
        edge = {}
        edge['node1'] = row[2]
        edge['relation'] = row[1]
        edge['node2'] = row[3]
        edge['node1;label'] = make_node_label(row[2])
        edge['node2;label'] = make_node_label(row[3])
        edge['relation;label'] = make_rel_label(row[1])
        edge['relation;dimension'] = ''

        metadata = json.loads(row[4])
        edge['source'] = KgtkFormat.stringify('CN')
        if 'surfaceText' in metadata:
            edge['sentence'] = KgtkFormat.stringify(
                metadata['surfaceText'].replace('\\', ''))
        else:
            edge['sentence'] = ''

        return [edge[col] for col in cols]

    try:
        filename: Path = KGTKArgumentParser.get_input_file(input_file)

        out_columns = [
            'node1', 'relation', 'node2', 'node1;label', 'node2;label',
            'relation;label', 'relation;dimension', 'source', 'sentence'
        ]

        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)
        ew: KgtkWriter = KgtkWriter.open(
            out_columns,
            output_kgtk_file,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            gzip_in_parallel=False,
        )

        # Stays None when no weights file was requested, so the rest of
        # the function can test the writer itself.
        ew_aux = None
        try:
            if weights_file:
                info_kgtk_file: Path = KGTKArgumentParser.get_output_file(
                    weights_file)
                ew_aux = KgtkWriter.open(
                    out_columns[:3],
                    info_kgtk_file,
                    require_all_columns=False,
                    prohibit_extra_columns=True,
                    fill_missing_columns=True,
                    gzip_in_parallel=False,
                )

            with open(filename, 'r') as f:
                reader = csv.reader(f, delimiter='\t', quotechar='"')
                for row in reader:
                    if english_only and not (row[2].startswith('/c/en/') and
                                             row[3].startswith('/c/en/')):
                        continue
                    ew.write(row_to_edge(row, out_columns))
                    if ew_aux is not None:
                        # Parse the last field once for both the test and
                        # the weight edge.
                        weight_metadata = json.loads(row[-1])
                        if 'weight' in weight_metadata:
                            ew_aux.write(
                                make_weight_edge(row, weight_metadata))
        finally:
            # Clean up even when conversion fails, so neither output file
            # is left open.  Errors from close() still reach the handler.
            try:
                ew.close()
            finally:
                if ew_aux is not None:
                    ew_aux.close()

    except Exception as e:
        kgtk_exception_auto_handler(e)
Пример #11
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        into_file: KGTKFiles,
        enable: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file to the output and, optionally, to a second file.

    Every input row is written unchanged to the output file.  When
    `enable` is true the same rows are also written to the tee file.

    Args:
        input_file: KGTK file argument for the input.
        output_file: KGTK file argument for the primary output.
        into_file: KGTK file argument for the tee output.
        enable: Whether to write the tee output.
        errors_to_stdout: Send diagnostics to stdout instead of stderr.
        errors_to_stderr: Not read by this function.
        show_options: Print the effective options.
        verbose: Print progress messages.
        very_verbose: Passed through to the reader and the writers.
        **kwargs: Options consumed by KgtkReaderOptions and
            KgtkValueOptions.

    Returns:
        0 on success; 1 if kgtk_exception_auto_handler returns after an
        exception.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions, KgtkReaderMode
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    into_kgtk_file: Path = KGTKArgumentParser.get_output_file(
        into_file, who="The tee output file", default_stdout=False)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        print("--to-file=%s" % str(into_kgtk_file), file=error_file)
        print("--enable=%s" % str(enable), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:

        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(kr.column_names,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode[kr.mode.name],
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        tkw: typing.Optional[KgtkWriter] = None
        if enable:
            if verbose:
                # Report the tee file that is actually being opened.
                print("Opening the tee output file: %s" %
                      str(into_kgtk_file),
                      file=error_file,
                      flush=True)
            tkw = KgtkWriter.open(kr.column_names,
                                  into_kgtk_file,
                                  mode=KgtkWriter.Mode[kr.mode.name],
                                  verbose=verbose,
                                  very_verbose=very_verbose)

        input_line_count: int = 0
        row: typing.List[str]
        for row in kr:
            input_line_count += 1
            kw.write(row)
            if tkw is not None:
                tkw.write(row)

        if verbose:
            print("Processed %d rows." % (input_line_count),
                  file=error_file,
                  flush=True)

        kw.close()
        if tkw is not None:
            tkw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
Пример #12
0
def run(output_file: KGTKFiles):
    """Write WordNet relations to a KGTK edge file.

    Every synset yielded by NLTK's WordNet reader is visited.  Its hypernyms,
    member holonyms, part holonyms and substance meronyms are emitted as
    ``/r/IsA``, ``/r/PartOf``, ``/r/PartOf`` and ``/r/MadeOf`` edges
    respectively.  Node ids are synset names prefixed with ``wn:``; the label
    columns hold the synset's lemmas (stringified, joined with ``|``).

    Args:
        output_file: Output file specification, resolved through
            ``KGTKArgumentParser.get_output_file``.

    Exceptions raised while opening or writing the output are handed to
    ``kgtk_exception_auto_handler``.  ``nltk.download("wordnet")`` runs
    before that handler is in effect.
    """

    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler
    import nltk
    nltk.download("wordnet")
    from nltk.corpus import wordnet as wn
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkwriter import KgtkWriter

    def obtain_wordnet_lemmas(syn):
        """Return the synset's lemma names as KGTK strings, '_' shown as ' '."""
        return [
            KgtkFormat.stringify(lemma.replace('_', ' '))
            for lemma in syn.lemma_names()
        ]

    def synset_names(synsets):
        """Return the ``name()`` of every synset in ``synsets``."""
        return [related.name() for related in synsets]

    def get_wn_data():
        """Collect labels and the four relation tables, keyed by synset name.

        A synset only appears in a relation table when it has at least one
        related synset of that kind.
        """
        all_labels = {}
        all_hyps = {}
        all_members = {}
        all_parts = {}
        all_subs = {}
        # Iterate the reader's generator directly; no intermediate list needed.
        for syn in wn.all_synsets():
            syn_name = syn.name()
            all_labels[syn_name] = '|'.join(obtain_wordnet_lemmas(syn))

            related_by_table = (
                (all_hyps, syn.hypernyms()),
                (all_members, syn.member_holonyms()),
                (all_parts, syn.part_holonyms()),
                (all_subs, syn.substance_meronyms()),
            )
            for table, related in related_by_table:
                names = synset_names(related)
                if names:
                    table[syn_name] = names

        return all_labels, all_hyps, all_members, all_parts, all_subs

    def create_edges(data, labels, rel, rel_label):
        """Build one output row per (node1, node2) pair in ``data``."""
        all_rows = []
        source = KgtkFormat.stringify('WN')
        for node1, targets in data.items():
            for node2 in targets:
                # 'relation;dimension' and 'sentence' are left empty.
                all_rows.append([
                    'wn:' + node1, rel, 'wn:' + node2, labels[node1],
                    labels[node2], rel_label, "", source, ''
                ])
        return all_rows

    try:
        out_columns = [
            'node1', 'relation', 'node2', 'node1;label', 'node2;label',
            'relation;label', 'relation;dimension', 'source', 'sentence'
        ]

        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)
        ew: KgtkWriter = KgtkWriter.open(
            out_columns,
            output_kgtk_file,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            gzip_in_parallel=False,
        )

        # Close the writer even when edge generation or writing fails.
        try:
            all_labels, all_hyps, all_members, all_parts, all_subs = get_wn_data()
            relation_specs = (
                (all_hyps, '/r/IsA', 'is a'),
                (all_members, '/r/PartOf', 'is a part of'),
                (all_parts, '/r/PartOf', 'is a part of'),
                (all_subs, '/r/MadeOf', 'is made of'),
            )
            all_edges = []
            for table, rel, rel_text in relation_specs:
                all_edges.extend(
                    create_edges(table, all_labels, rel,
                                 KgtkFormat.stringify(rel_text)))

            for edge in all_edges:
                ew.write(edge)
        finally:
            # Clean up.
            ew.close()

    except Exception as e:
        kgtk_exception_auto_handler(e)
Пример #13
0
def run(input_file: KGTKFiles, english_only):
    """Convert a tab-separated assertions file into question-annotated edges.

    Each input row is read as ``assertion, rel, subj, obj, metadata`` (the
    last being a JSON object; presumably a ConceptNet dump, since the source
    column is hard-coded to ``CN`` - confirm against the caller).  A header
    line and one line per convertible row are written to standard output.
    Rows whose relation label has no question template produce no output.

    Args:
        input_file: Input file specification, resolved through
            ``KGTKArgumentParser.get_input_file``.
        english_only: When true, only rows whose subject and object both
            start with ``/c/en/`` are converted.

    Exceptions are handed to ``kgtk_exception_auto_handler``.
    """

    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import csv
    import json
    import re
    from pathlib import Path
    from string import Template

    # Question templates keyed by relation label (the output of
    # make_rel_label).  Built once here rather than on every lookup.
    question_templates = {
        'antonym':
        Template('What is the opposite from $node1?'),
        'at location':
        Template('At what location is $node1?'),
        'capable of':
        Template('What is $node1 capable of?'),
        'causes':
        Template('What is caused by $node1?'),
        'causes desire':
        Template('What desire is caused by $node1?'),
        'created by':
        Template('What can create $node1?'),
        'defined as':
        Template('How can $node1 be defined?'),
        'derived from':
        Template('From which word is $node1 derived?'),
        'desires':
        Template('What does $node1 desire?'),
        'distinct from':
        Template('What is $node1 distinct from?'),
        'etymologically derived from':
        Template('What is $node1 etymologically derived from?'),
        'symbol of':
        Template('What is $node1 symbol of?'),
        'synonym':
        Template('What is a synonym of $node1?'),
        'manner of':
        Template('$node1 is a manner of what?'),
        'located near':
        Template('What is $node1 located near to?'),
        'has context':
        Template('What is a context of $node1?'),
        'similar to':
        Template('What is $node1 similar to?'),
        'etymologically related to':
        Template("What is $node1 etymologically related to?"),
        'made of':
        Template('What is $node1 made of?'),
        'receives action':
        Template('What can be done to $node1?'),
        'obstructed by':
        Template('What is $node1 obstructed by?'),
        'motivated by goal':
        Template('What goal motivates $node1?'),
        'has property':
        Template('What is a property of $node1?'),
        'has prerequisite':
        Template('What is a prerequisite for $node1?'),
        'has first subevent':
        Template('What is the first subevent of $node1?'),
        'has last subevent':
        Template('What is the last subevent of $node1?'),
        'has subevent':
        Template('What is a subevent of $node1?'),
        'used for':
        Template('What is $node1 used for?'),
        'has a':
        Template('What belongs to $node1?'),
        'is a':
        Template('What is a $node1?'),
        'form of':
        Template('What is $node1 a form of?'),
        'related to':
        Template('What is $node1 related to?'),
    }

    def header_to_edge(row):
        """Render the header line, turning '_' in column names into ';'."""
        row = [r.replace('_', ';') for r in row]
        return '\t'.join(row) + '\n'

    def make_node_label(node):
        """Return the fourth '/'-separated component of ``node``, '_' as ' '."""
        return node.strip().split('/')[3].replace('_', ' ')

    def split_camel_case(name):
        """Split a CamelCase name into lower-case, space-separated words."""
        splitted = re.sub('([A-Z][a-z]+)', r' \1',
                          re.sub('([A-Z]+)', r' \1', name)).split()
        return ' '.join(splitted).lower()

    def make_rel_label(rel):
        """Return the readable label for the last path component of ``rel``."""
        return split_camel_case(rel.split('/')[-1])

    def get_template(label):
        """Return the question template for ``label``, or None if unknown."""
        return question_templates.get(label)

    def row_to_edge(row, cols):
        """Render one input row as an output line, or '' when it is skipped."""
        edge = {}
        edge['node1'] = row[2]
        edge['label'] = row[1]
        edge['node2'] = row[3]
        edge['node1_label'] = make_node_label(row[2])
        edge['node2_label'] = make_node_label(row[3])
        edge['label_label'] = make_rel_label(row[1])
        edge['label_dimension'] = ''

        metadata = json.loads(row[4])
        edge['weight'] = str(metadata['weight'])
        edge['source'] = 'CN'
        edge['creator'] = metadata['dataset']
        # A missing (or JSON-null) surface text becomes an empty sentence so
        # that the join below always receives strings.
        edge['sentence'] = metadata.get('surfaceText') or ''

        t = get_template(edge['label_label'])
        if t is None:
            # No question can be phrased for this relation: emit nothing.
            return ''
        edge['question'] = t.substitute(node1=edge['node1_label'])

        return '\t'.join(edge[col] for col in cols) + '\n'

    try:
        filename: Path = KGTKArgumentParser.get_input_file(input_file)

        # Input columns, by position: assertion, rel, subj, obj, metadata.
        out_columns = [
            'node1', 'label', 'node2', 'node1_label', 'label_label',
            'node2_label', 'label_dimension', 'source', 'weight', 'creator',
            'sentence', 'question'
        ]

        with open(filename, 'r') as f:
            reader = csv.reader(f, delimiter='\t', quotechar='"')
            sys.stdout.write(header_to_edge(out_columns))
            for row in reader:
                if not english_only or (row[2].startswith('/c/en/')
                                        and row[3].startswith('/c/en/')):
                    sys.stdout.write(row_to_edge(row, out_columns))

    except Exception as e:
        kgtk_exception_auto_handler(e)
Пример #14
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        columns: typing.Optional[typing.List[str]],
        split_on_commas: bool,
        split_on_spaces: bool,
        strip_spaces: bool,
        all_except: bool,
        ignore_missing_columns: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file, removing the named columns (or keeping only them).

    The column names in ``columns`` may additionally be split on spaces
    and/or commas and stripped of surrounding whitespace, as selected by the
    corresponding flags.  With ``all_except`` the named columns are the ones
    retained; otherwise they are the ones dropped.  Naming a column that the
    input does not have is an error unless ``ignore_missing_columns`` is set.

    Returns:
        0 on success, 1 after an exception has been passed to
        ``kgtk_exception_auto_handler``.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions, KgtkReaderMode
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Error messages go to stderr unless stdout was requested.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Echo the effective options for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if columns is not None:
            print("--columns=%s" % " ".join(columns), file=error_file)
        for option_format, option_value in (
                ("--split-on-commas=%s", split_on_commas),
                ("--split-on-spaces=%s", split_on_spaces),
                ("--strip-spaces=%s", strip_spaces),
                ("--all-except=%s", all_except),
                ("--ignore-missing-columns=%s", ignore_missing_columns),
        ):
            print(option_format % str(option_value), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        # Treat "no --columns" as an empty list from here on.
        requested: typing.List[str] = [] if columns is None else columns

        if split_on_spaces:
            # Leniently accept space-separated names inside one shell-quoted
            # argument, e.g.
            #
            # kgtk remove_columns -c 'name name2 name3'
            #
            # Do not enable this when column names may legally contain spaces.
            requested = " ".join(requested).split()

        remove_columns: typing.List[str] = []
        column_name: str
        for requested_arg in requested:
            pieces = requested_arg.split(",") if split_on_commas else [
                requested_arg
            ]
            for column_name in pieces:
                if strip_spaces:
                    column_name = column_name.strip()
                if len(column_name) > 0:
                    remove_columns.append(column_name)

        if verbose:
            summary_format = ("Removing all columns except %d columns: %s"
                              if all_except else "Removing %d columns: %s")
            print(summary_format %
                  (len(remove_columns), " ".join(remove_columns)),
                  file=error_file,
                  flush=True)
        if len(remove_columns) == 0:
            raise KGTKException("No columns to remove")

        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        output_column_names: typing.List[str]
        trouble_column_names: typing.List[str] = []

        if all_except:
            # Report requested columns that the input does not have.
            if not ignore_missing_columns:
                for column_name in remove_columns:
                    if column_name in kr.column_names:
                        continue
                    print("Error: cannot retain unknown column '%s'." %
                          column_name,
                          file=error_file,
                          flush=True)
                    trouble_column_names.append(column_name)

            # Keep the named columns, in input order.
            output_column_names = [
                name for name in kr.column_names if name in remove_columns
            ]
        else:
            output_column_names = kr.column_names.copy()
            for column_name in remove_columns:
                if column_name in output_column_names:
                    output_column_names.remove(column_name)
                    continue
                if ignore_missing_columns:
                    continue
                print("Error: cannot remove unknown column '%s'." %
                      column_name,
                      file=error_file,
                      flush=True)
                trouble_column_names.append(column_name)

        if len(trouble_column_names) > 0:
            raise KGTKException("Unknown columns %s" %
                                " ".join(trouble_column_names))

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(output_column_names,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode[kr.mode.name],
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        # The writer maps each input row onto the reduced output layout.
        shuffle_list: typing.List[int] = kw.build_shuffle_list(kr.column_names)

        input_line_count: int = 0
        row: typing.List[str]
        for input_line_count, row in enumerate(kr, start=1):
            kw.write(row, shuffle_list=shuffle_list)

        if verbose:
            print("Processed %d rows." % (input_line_count),
                  file=error_file,
                  flush=True)

        kw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
Пример #15
0
def run(input_file: KGTKFiles, attr_syn_file: KGTKFiles,
        output_file: KGTKFiles):
    """Convert a scene-graph JSON file into KGTK edges between WordNet synsets.

    For every image in the input (presumably a Visual Genome scene-graph
    dump, since the source column is hard-coded to ``VG`` - confirm against
    the caller):

    * each object attribute that maps to a non-noun synset yields an edge
      from every object synset to the attribute synset: ``/r/CapableOf`` for
      verbs (part of speech ``v``), ``mw:MayHaveProperty`` otherwise;
    * each relationship yields a ``/r/LocatedNear`` edge for every pair of
      distinct subject/object synsets, labelled with the predicate text.

    Duplicate rows within one image are written only once.

    Args:
        input_file: Scene-graph JSON file specification.
        attr_syn_file: JSON file mapping lower-cased attribute text to a
            synset name.
        output_file: Output file specification.

    Exceptions are handed to ``kgtk_exception_auto_handler``.
    """

    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler
    import json
    from pathlib import Path
    from collections import defaultdict
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkwriter import KgtkWriter

    def create_edge(node1, node1_lbl, node2, node2_lbl, rel, rel_lbl,
                    image_id):
        """Build one output row; ``image_id`` is accepted but not emitted."""
        return [
            node1, rel, node2, '|'.join(node1_lbl), '|'.join(node2_lbl),
            rel_lbl, '',
            KgtkFormat.stringify('VG'), ''
        ]

    def add_unique(rows, seen, row):
        """Append ``row`` to ``rows`` unless an identical row was added before."""
        key = tuple(row)
        if key not in seen:
            seen.add(key)
            rows.append(row)

    try:
        scene_graph_filename: Path = KGTKArgumentParser.get_input_file(
            input_file)
        attr_synsets_filename: Path = KGTKArgumentParser.get_input_file(
            attr_syn_file)

        out_columns = [
            'node1', 'relation', 'node2', 'node1;label', 'node2;label',
            'relation;label', 'relation;dimension', 'source', 'sentence'
        ]

        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)
        ew: KgtkWriter = KgtkWriter.open(
            out_columns,
            output_kgtk_file,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            gzip_in_parallel=False,
        )

        # Close the writer even when reading or converting the input fails.
        try:
            proximity_relation = '/r/LocatedNear'
            property_relation = 'mw:MayHaveProperty'
            property_relation_label = KgtkFormat.stringify('may have property')
            capableof_relation = '/r/CapableOf'
            capableof_relation_label = KgtkFormat.stringify('capable of')

            with open(scene_graph_filename, 'r') as f:
                images_data = json.load(f)

            with open(attr_synsets_filename, 'r') as f:
                attr_synsets = json.load(f)

            for an_image in images_data:

                image_id = str(an_image['image_id'])

                # OBJECTS
                objid2names = defaultdict(list)
                objid2syns = {}
                rows = []
                seen = set()  # tuples of the rows already in ``rows``
                for o in an_image['objects']:
                    obj_id = o['object_id']
                    o_synset = o['synsets']
                    objid2syns[obj_id] = o_synset
                    for name in o['names']:
                        name = name.strip().lower().rstrip('.')
                        if not name:
                            continue
                        objid2names[obj_id].append(KgtkFormat.stringify(name))

                    # ATTRIBUTES
                    for attr in o.get('attributes', ()):
                        attr = attr.lower()
                        if attr not in attr_synsets:
                            continue
                        asyn = attr_synsets[attr]
                        # Synset names look like "<lemma>.<pos>.<nn>".
                        apos = asyn.split('.')[1]
                        if apos == 'n':
                            continue  # noun attributes are not exported
                        if apos == 'v':  # verb
                            attr_relation = capableof_relation
                            attr_relation_label = capableof_relation_label
                        else:  # adjective
                            attr_relation = property_relation
                            attr_relation_label = property_relation_label
                        for osyn in o_synset:
                            if osyn != asyn:
                                add_unique(
                                    rows, seen,
                                    create_edge('wn:' + osyn,
                                                objid2names[obj_id],
                                                'wn:' + asyn,
                                                [KgtkFormat.stringify(attr)],
                                                attr_relation,
                                                attr_relation_label,
                                                image_id))

                # RELATIONS
                for rel in an_image['relationships']:
                    relation_label = KgtkFormat.stringify(
                        rel['predicate'].lower().strip().strip('.'))
                    sub_id = rel['subject_id']
                    sub_names = objid2names[sub_id]
                    sub_syns = objid2syns[sub_id]
                    obj_id = rel['object_id']
                    obj_names = objid2names[obj_id]
                    obj_syns = objid2syns[obj_id]

                    for ssyn in sub_syns:
                        for osyn in obj_syns:
                            if osyn != ssyn:
                                add_unique(
                                    rows, seen,
                                    create_edge('wn:' + ssyn, sub_names,
                                                'wn:' + osyn, obj_names,
                                                proximity_relation,
                                                relation_label, image_id))

                for a_row in rows:
                    ew.write(a_row)
        finally:
            # Clean up
            ew.close()

    except Exception as e:
        kgtk_exception_auto_handler(e)
Пример #16
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        columns: typing.Optional[typing.List[str]] = None,
        labels: typing.Optional[typing.List[str]] = None,
        id_column_name: typing.Optional[str] = None,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Turn the non-ID columns of a KGTK file into (node1, label, node2) edges.

    For every input row, each selected column with a non-empty value yields
    one edge per item of that value (the value is split with
    ``KgtkValue.split_list``): node1 is the row's ID column value, the label
    is the column name (or its replacement from ``labels``), and node2 is
    the item.

    Args:
        input_file: Input file specification.
        output_file: Output file specification.
        columns: Columns to convert; all non-ID columns when None.
        labels: Optional edge labels, positionally matching ``columns``.
        id_column_name: Name of the ID column, passed to
            ``KgtkReader.get_id_column_index``.

    Returns:
        0 on success, 1 after an exception has been passed to
        ``kgtk_exception_auto_handler``.

    Raises:
        KGTKException: If ``labels`` is given without ``columns`` or with a
            different length.  This check runs before the handled section.
    """
    # import modules locally
    import os

    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)

        if columns is not None:
            print("--columns=%s" % " ".join(columns), file=error_file)
        if labels is not None:
            print("--labels=%s" % " ".join(labels), file=error_file)
        if id_column_name is not None:
            print("--id-column=%s" % id_column_name, file=error_file)

        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if verbose:
        print("Starting normalize_nodes pid=%d" % (os.getpid()),
              file=error_file,
              flush=True)

    # Map column name -> edge label for the columns that were given one.
    label_map: typing.MutableMapping[str, str] = dict()
    if labels is not None and len(labels) > 0:
        if columns is None:
            raise KGTKException(
                "--columns must be supplied when --labels is used.")
        if len(columns) != len(labels):
            raise KGTKException("%d columns were supplied, but %d labels." %
                                (len(columns), len(labels)))
        label_map.update(zip(columns, labels))

    try:
        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        id_column_idx: int = kr.get_id_column_index(id_column_name)
        if id_column_idx < 0:
            raise KGTKException("Unknown ID column %s" % repr(id_column_name))

        # Which columns become edges, and under which label, does not depend
        # on the row: work it out once instead of per row and per column.
        edge_columns: typing.List[typing.Tuple[int, str]] = []
        column_idx: int
        column_name: str
        for column_idx, column_name in enumerate(kr.column_names):
            if column_idx == id_column_idx:
                continue
            if columns is not None and column_name not in columns:
                continue
            edge_columns.append(
                (column_idx, label_map.get(column_name, column_name)))

        output_column_names: typing.List[str] = [
            KgtkFormat.NODE1, KgtkFormat.LABEL, KgtkFormat.NODE2
        ]

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(output_column_names,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode.EDGE,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        input_line_count: int = 0
        output_line_count: int = 0
        row: typing.List[str]
        label_value: str
        for row in kr:
            input_line_count += 1

            node1_value: str = row[id_column_idx]

            for column_idx, label_value in edge_columns:
                new_value: str = row[column_idx]
                if len(new_value) == 0:
                    continue  # ignore empty values.

                # The column value might contain a KGTK list.  Since node2
                # isn't supposed to contain lists, we'll split it.
                node2_value: str
                for node2_value in KgtkValue.split_list(new_value):
                    if len(node2_value) == 0:
                        continue  # node2 shouldn't contain empty values

                    kw.write([node1_value, label_value, node2_value])
                    output_line_count += 1

        if verbose:
            print("Read %d node rows, wrote %d edge rows." %
                  (input_line_count, output_line_count),
                  file=error_file,
                  flush=True)

        kw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
Пример #17
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        label_file: KGTKFiles,
        base_columns: typing.Optional[typing.List[str]] = None,
        columns_to_remove: typing.Optional[typing.List[str]] = None,
        label_value: str = KgtkLift.DEFAULT_LABEL_SELECT_COLUMN_VALUE,
        lift_suffix: str = KgtkLift.DEFAULT_OUTPUT_LIFTED_COLUMN_SUFFIX,
        deduplicate_labels: bool = True,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Lower (remove) lifted columns from a KGTK file, emitting them as edges.

    Every input row is copied to the output without the lowered columns.
    For each lowered column with a non-empty value, one
    (node1=<base column value>, label=label_value, node2=<item>) edge is
    written per list item, either to the main output file or, when a label
    file is given, to that separate file.

    The columns to lower are selected by one of three option patterns:

    1. `columns_to_remove` and `base_columns` both given: they are paired
       positionally and must have the same length.
    2. Only `columns_to_remove` given: each base column name is the removed
       column's name with `lift_suffix` stripped.
    3. `columns_to_remove` not given: every non-key column named
       `<base><lift_suffix>` for some base column is lowered.  When
       `base_columns` is not given either, the key columns of the input
       file (node1, label, node2, id, where present) are used as the bases.

    Args:
        input_file: Input file specification, resolved by KGTKArgumentParser.
        output_file: Output file specification, resolved by KGTKArgumentParser.
        label_file: Optional specification of a separate file for the
            generated label edges.
        base_columns: Names of the columns supplying node1 values.
        columns_to_remove: Names of the columns to lower.
        label_value: The label column value for the generated edges.
        lift_suffix: The suffix that marks a lifted column name.
        deduplicate_labels: When true, write each (node1, node2) label pair
            at most once.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Not read by this function; messages go to stderr
            unless `errors_to_stdout` is set.
        show_options: Print the effective options before processing.
        verbose: Print progress messages.
        very_verbose: Passed through to the reader and writers.
        **kwargs: Source for the KgtkReaderOptions and KgtkValueOptions.

    Returns:
        0 on success.  Exceptions raised while processing are passed to
        kgtk_exception_auto_handler(); 1 is returned if that handler returns.
        Option resolution happens before the guarded section, so exceptions
        raised there propagate to the caller.
    """
    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    label_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(label_file,
                                                            who="Label file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if label_kgtk_file is not None:
            print("--label-file=%s" % str(label_kgtk_file), file=error_file)

        if base_columns is not None:
            print("--base-columns=%s" % " ".join(base_columns),
                  file=error_file)
        if columns_to_remove is not None:
            print("--columns-to-lower=%s" % " ".join(columns_to_remove),
                  file=error_file)
        print("--label-value=%s" % label_value, file=error_file)
        print("--lift-suffix=%s" % lift_suffix, file=error_file)
        print("--deduplicate-labels=%s" % deduplicate_labels, file=error_file)

        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # These columns will never be removed.  An index of -1 marks a key
        # column that is absent from the input file.
        key_column_idxs: typing.Set[int] = set(
            (kr.node1_column_idx, kr.label_column_idx, kr.node2_column_idx,
             kr.id_column_idx))
        key_column_idxs.discard(-1)
        key_column_names: typing.Set[str] = set(
            (kr.column_names[idx] for idx in key_column_idxs))

        def check_removable(name: str) -> None:
            """Raise KGTKException unless `name` is a known, non-key column."""
            if name not in kr.column_names:
                raise KGTKException(
                    "Column %s is an unknown column, cannot remove it." %
                    repr(name))
            if name in key_column_names:
                raise KGTKException(
                    "Column %s is a key column, cannot remove it." %
                    repr(name))

        # Map the index of a column being removed to the index of the base
        # column that supplies its node1 value.
        lower_map: typing.MutableMapping[int, int] = dict()

        base_name: str
        column_name: str
        idx: int

        # There are three option patterns.
        if columns_to_remove and base_columns:
            # Pattern 1: columns_to_remove and base_columns are paired.
            if len(columns_to_remove) != len(base_columns):
                raise KGTKException(
                    "There are %d columns to remove but only %d base columns."
                    % (len(columns_to_remove), len(base_columns)))

            for column_name, base_name in zip(columns_to_remove,
                                              base_columns):
                check_removable(column_name)

                if base_name not in kr.column_names:
                    raise KGTKException(
                        "For column name %s, base name %s is unknown" %
                        (repr(column_name), repr(base_name)))

                lower_map[kr.column_name_map[
                    column_name]] = kr.column_name_map[base_name]

        elif columns_to_remove:
            # Pattern 2: only columns_to_remove was supplied.  Each column
            # name is stripped of the lift suffix to determine the base name.
            if len(lift_suffix) == 0:
                raise KGTKException("The --lift-suffix must not be empty.")

            for column_name in columns_to_remove:
                check_removable(column_name)

                if not column_name.endswith(lift_suffix):
                    raise KGTKException("Unable to parse column name %s." %
                                        repr(column_name))

                base_name = column_name[:-len(lift_suffix)]

                if base_name not in kr.column_names:
                    raise KGTKException(
                        "For column name %s, base name %s is not known" %
                        (repr(column_name), repr(base_name)))

                lower_map[kr.column_name_map[
                    column_name]] = kr.column_name_map[base_name]

        else:
            # Pattern 3: columns_to_remove was not supplied.  Search the
            # input columns for names of the form <base><lift_suffix>.
            if len(lift_suffix) == 0:
                raise KGTKException("The --lift-suffix must not be empty.")

            if not base_columns:
                # The base name list wasn't supplied.  Use [node1, label, node2, id]
                base_columns = list(key_column_names)

            for idx, column_name in enumerate(kr.column_names):
                # Skip the node1, label, node2, and id columns
                if idx in key_column_idxs:
                    continue

                # Does this column match a lifting pattern?
                for base_name in base_columns:
                    if len(base_name) == 0:
                        continue
                    if column_name != base_name + lift_suffix:
                        continue

                    # A caller-supplied base column might not exist in the
                    # input file; report that clearly instead of failing
                    # with a bare KeyError on the name map lookup.
                    if base_name not in kr.column_names:
                        raise KGTKException(
                            "For column name %s, base name %s is unknown" %
                            (repr(column_name), repr(base_name)))

                    lower_map[idx] = kr.column_name_map[base_name]

        if len(lower_map) == 0:
            raise KGTKException("There are no columns to lower.")

        if verbose:
            print("The following columns will be lowered",
                  file=error_file,
                  flush=True)
            for idx in sorted(lower_map):
                column_name = kr.column_names[idx]
                base_name = kr.column_names[lower_map[idx]]
                print(" %s from %s" % (column_name, base_name),
                      file=error_file,
                      flush=True)

        # The output file keeps every input column that is not being lowered.
        output_column_names: typing.List[str] = [
            column_name for idx, column_name in enumerate(kr.column_names)
            if idx not in lower_map
        ]
        if verbose:
            print("The output columns are: %s" % " ".join(output_column_names),
                  file=error_file,
                  flush=True)
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(
            output_column_names,
            output_kgtk_file,
            mode=KgtkWriter.Mode.EDGE,
            require_all_columns=False,  # Simplifies writing the labels
            verbose=verbose,
            very_verbose=very_verbose)

        lkw: typing.Optional[KgtkWriter] = None
        try:
            shuffle_list: typing.List[int] = kw.build_shuffle_list(
                kr.column_names)

            if label_kgtk_file is not None:
                if verbose:
                    print("Opening the label output file: %s" %
                          str(label_kgtk_file),
                          file=error_file,
                          flush=True)

                label_column_names = [
                    KgtkFormat.NODE1, KgtkFormat.LABEL, KgtkFormat.NODE2
                ]
                lkw = KgtkWriter.open(label_column_names,
                                      label_kgtk_file,
                                      mode=KgtkWriter.Mode.EDGE,
                                      verbose=verbose,
                                      very_verbose=very_verbose)

            # Optionally deduplicate the labels
            #  set(node1_value + KgtkFormat.COLUMN_SEPARATOR + node2_value)
            label_set: typing.Set[str] = set()
            label_key: str

            # If labels will be written to the output file and deduplication
            # is enabled, label rows already present in the input must take
            # part in the deduplication, too.
            check_existing_labels: bool = \
                deduplicate_labels and \
                lkw is None and \
                kr.node1_column_idx >= 0 and \
                kr.label_column_idx >= 0 and \
                kr.node2_column_idx >= 0

            input_line_count: int = 0
            output_line_count: int = 0
            label_line_count: int = 0
            row: typing.List[str]
            for row in kr:
                input_line_count += 1

                if check_existing_labels and row[
                        kr.label_column_idx] == label_value:
                    label_key = row[
                        kr.node1_column_idx] + KgtkFormat.COLUMN_SEPARATOR + row[
                            kr.node2_column_idx]
                    if label_key in label_set:
                        continue  # Drop the duplicate label row.
                    label_set.add(label_key)

                kw.write(row, shuffle_list=shuffle_list)
                output_line_count += 1

                column_idx: int
                base_idx: int
                for column_idx, base_idx in lower_map.items():
                    node1_value: str = row[base_idx]
                    if len(node1_value) == 0:
                        continue  # TODO: raise an exception

                    item: str = row[column_idx]
                    if len(item) == 0:
                        continue  # Ignore empty node2 values.

                    # This item might be a KGTK list.  Let's split it, because
                    # lists aren't allowed in the node2 values we'll generate.
                    node2_value: str
                    for node2_value in KgtkValue.split_list(item):
                        if len(node2_value) == 0:
                            continue  # Ignore empty node2 values.

                        if deduplicate_labels:
                            label_key = node1_value + KgtkFormat.COLUMN_SEPARATOR + node2_value
                            if label_key in label_set:
                                continue
                            label_set.add(label_key)

                        output_map: typing.Mapping[str, str] = {
                            KgtkFormat.NODE1: node1_value,
                            KgtkFormat.LABEL: label_value,
                            KgtkFormat.NODE2: node2_value,
                        }
                        label_line_count += 1
                        if lkw is None:
                            # Labels share the main output file.
                            kw.writemap(output_map)
                            output_line_count += 1
                        else:
                            lkw.writemap(output_map)

            if verbose:
                print("Read %d rows, wrote %d rows with %d labels." %
                      (input_line_count, output_line_count, label_line_count),
                      file=error_file,
                      flush=True)

        finally:
            # Close the writers even when processing fails, so the output
            # files are not left open.  The nested finally ensures the label
            # writer is closed even if closing the main writer raises.
            try:
                kw.close()
            finally:
                if lkw is not None:
                    lkw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1