if os.path.samefile(dsFile1, dsFile2):
  # Both modes come from the same file; reuse the mapping.
  ds2_mapping = ds1_mapping
else:
  ds2_mapping = utils.read_mode_file(dsFile2)

add_header = True
if os.path.isfile(outFNm):
  add_header = False


with open(outFNm, 'a') as equivF:
  if add_header:
    equivF.write('# Equivalence table for mode %s\n' % mode_name)
    equivF.write('# File generated on: %s\n' % utils.get_current_date())
    equivF.write('# snap_nid_1\tsnap_nid_2\n')
  if inFNm is not None:
    with open(inFNm, 'r') as inF:
      for line in inF:
        if line[0] == '#' or line[0] == '\n':
          continue
        vals = utils.split_then_strip(line, '\t')
        id1 = vals[ds1Idx]
        id2 = vals[ds2Idx]
        if id1 == '' or id2 == '':
          continue
        if args.skip_missing_ids and (id1 not in ds1_mapping or id2 not in ds2_mapping):
          continue
        equivF.write('%d\t%d\n' % (ds1_mapping[id1], ds2_mapping[id2]))
  else:
    for id1 in ds1_mapping:
      if id1 in ds2_mapping:
        equivF.write('%d\t%d\n' % (ds1_mapping[id1], ds2_mapping[id1]))
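These snippets resolve dataset node ids through utils.read_mode_file, whose implementation is not shown here. Judging by how its result is indexed (string dataset id in, integer mambo nid out), a minimal sketch might look like the following; the real helper may differ:

def read_mode_file(file_name):
    # Assumed mode-file layout, matching the tables written further below:
    # mambo_nid<TAB>dataset_nid[<TAB>attrs...], with '#' comment lines.
    mapping = {}
    with open(file_name, 'r') as f:
        for line in f:
            if line[0] == '#':
                continue
            fields = line.strip().split('\t')
            mapping[fields[1]] = int(fields[0])
    return mapping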
Example #2
parser.add_argument('--src_node_name',
                    default='node_id1',
                    type=str,
                    help='how to identify the src nodes in the header for tsv')
parser.add_argument('--dst_node_name',
                    default='node_id2',
                    type=str,
                    help='how to identify the dst nodes in the header for tsv')

if __name__ == '__main__':
    args = parser.parse_args()
    with open(args.input_file, 'r') as inF:
        with open(args.output_file, 'w') as outF:
            outF.write('# Dataset: %s\n' % args.dataset_name)
            outF.write('# %s\t%s\n' % (args.src_node_name, args.dst_node_name))
            for i, line in enumerate(inF):
                if args.verbose and i % 1000000 == 0:
                    print('Finished processing line %d in the original input file' % i)
                if line[0] == '#' or line[0] == '!' or line[0] == '\n' or (
                        i == 0 and args.has_title):
                    continue
                vals = utils.split_then_strip(line, args.divider)
                src_nodes = [vals[args.src_node_col]]
                dst_nodes = [vals[args.dst_node_col]]
                if args.src_node_sep is not None:
                    src_nodes = src_nodes[0].split(args.src_node_sep)
                if args.dst_node_sep is not None:
                    dst_nodes = dst_nodes[0].split(args.dst_node_sep)
                for src_node in src_nodes:
                    if src_node == '':
                        continue
                    for dst_node in dst_nodes:
                        if dst_node != '':
                            outF.write('%s\t%s\n' % (src_node, dst_node))
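A worked illustration of the separator expansion above, using made-up values: when --src_node_sep is ';', a multi-valued source cell is split and paired with every destination node.

src_nodes = 'P1;P2'.split(';')   # the cell 'P1;P2' after splitting
dst_nodes = ['D7']
for src_node in src_nodes:
    for dst_node in dst_nodes:
        print('%s\t%s' % (src_node, dst_node))   # emits 'P1\tD7', then 'P2\tD7'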
parser.add_argument('input_file', help='input file name.')
parser.add_argument('output_file', help='output file name.')
parser.add_argument('dataset_name', help='Name of the dataset')
parser.add_argument('columns', metavar='N', type=int, nargs='+',
                    help='columns with node ids')
parser.add_argument('--has_title', action='store_true',
                    help='has a title line that is not prefixed with a #')
parser.add_argument('--verbose', action='store_true',
                    help='Print every 1,000,000 lines processed')
parser.add_argument('--divider', default='\t', type=str, help='column separator, by default a tab')
parser.add_argument('--node_name', default='node_id', type=str, help='how to identify the nodes in the header for tsv')

if __name__ == '__main__':
    args = parser.parse_args()
    with open(args.input_file, 'r') as inF:
        unique_ids = set()
        with open(args.output_file, 'w') as outF:
            outF.write('# Dataset: %s\n' % args.dataset_name)
            outF.write('# %s\n' % args.node_name)
            for i, line in enumerate(inF):
                if args.verbose and i % 1000000 == 0:
                    print('Finished processing line %d in the original input file' % i)
                if line[0] == '#' or line[0] == '!' or line[0] == '\n' or (i == 0 and args.has_title):
                    continue
                vals = utils.split_then_strip(line, args.divider)
                for column in args.columns:
                    if vals[column] not in unique_ids and len(vals[column]) > 0:
                        unique_ids.add(vals[column])
                        new_line = '%s\n' % vals[column]
                        outF.write(new_line)
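utils.split_then_strip is called by every script here but never defined. Based on how its result is consumed (a list of whitespace-stripped column values), a plausible sketch is the following; the actual implementation may differ:

def split_then_strip(line, delimiter):
    # Split a raw input line on the delimiter and strip surrounding
    # whitespace (including the trailing newline) from every field.
    return [field.strip() for field in line.strip('\n').split(delimiter)]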
def create_mambo_mode_table(input_file,
                            db_id,
                            mode_name,
                            dataset_name,
                            full_mode_file,
                            output_dir,
                            db_node_file,
                            mambo_id_counter_start,
                            node_index,
                            verbose=False,
                            delimiter=DELIMITER):
    # Resolve arguments and default path names.
    inFNm = input_file
    dataset = dataset_name
    outFNm = full_mode_file
    if outFNm is None:
        outFNm = os.path.join(output_dir,
                              utils.get_full_mode_file_name(mode_name))
    dbFNm = db_node_file
    if dbFNm is None:
        dbFNm = os.path.join(
            output_dir, utils.get_mode_file_name(mode_name, db_id, dataset))

    counter = mambo_id_counter_start
    if counter == -1:
        counter = utils.get_max_id(outFNm)

    # Read input file, create output files.
    seen = set()
    if verbose:
        print('Starting at mambo id: %d' % counter)
    with open(inFNm, 'r') as inF, \
            open(outFNm, 'a') as outF, \
            open(dbFNm, 'w') as dbF:
        if counter == 0:
            outF.write('# Full mode table for %s\n' % mode_name)
            outF.write('# File generated on: %s\n' % utils.get_current_date())
            outF.write('# mambo_nid%sdataset_id\n' % delimiter)
        dbF.write('# Mode table for dataset: %s\n' % dataset)
        dbF.write('# File generated on: %s\n' % utils.get_current_date())
        add_schema = True
        for line in inF:
            if line[0] in COMMENT:  # skip comments
                continue
            vals = utils.split_then_strip(line, delimiter)
            if add_schema:
                attrs_schema = '# mambo_nid%sdataset_nid' % delimiter
                for i in range(len(vals)):
                    if i != node_index:
                        attrs_schema += '%sC%d' % (delimiter, i)
                dbF.write('%s\n' % attrs_schema)
                add_schema = False
            node_id = vals[node_index]
            if node_id in seen or len(node_id) == 0:
                continue
            attrs_str = ''
            for i in range(len(vals)):
                if i != node_index:
                    attrs_str += delimiter + vals[i]
            outF.write('%d%s%d\n' % (counter, delimiter, db_id))
            dbF.write('%d%s%s%s\n' % (counter, delimiter, node_id, attrs_str))
            seen.add(node_id)
            counter += 1
    if verbose:
        print('Ending at mambo id: %d' % counter)
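A hypothetical invocation of create_mambo_mode_table; all file names and values below are illustrative, not taken from the source:

create_mambo_mode_table(input_file='genes.tsv',
                        db_id=0,
                        mode_name='gene',
                        dataset_name='ExampleDataset',
                        full_mode_file=None,        # default path under output_dir
                        output_dir='output/',
                        db_node_file=None,
                        mambo_id_counter_start=-1,  # resume from the max id on disk
                        node_index=0,
                        verbose=True)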
Example #5
def create_mapped_mode_table(mode_name,
                             input_file,
                             dataset_name,
                             db_id,
                             mapping_file,
                             skip,
                             map_index,
                             node_index,
                             output_dir,
                             full_mode_file,
                             db_node_file,
                             delimiter=DELIMITER):
    if full_mode_file is None:
        full_mode_file = os.path.join(output_dir,
                                      utils.get_full_mode_file_name(mode_name))
    full_mode_map = {}
    if os.path.isfile(full_mode_file):
        with open(full_mode_file, 'r') as fm_file:
            for line in fm_file:
                if line[0] in COMMENT:  # skip comments
                    continue
                split_line = line.strip().split(delimiter)
                full_mode_map[int(split_line[0])] = split_line[1]

    if db_node_file is None:
        db_node_file = os.path.join(
            output_dir, utils.get_mode_file_name(mode_name, db_id,
                                                 dataset_name))

    max_id = 0
    mapping = {}
    num_cols = 0
    with open(mapping_file, 'r') as mf:
        for line in mf:
            if line[0] in COMMENT:
                continue
            split_line = line.strip().split(delimiter)
            num_cols = len(split_line)
            mapping[split_line[map_index]] = split_line[0]
            max_id = int(split_line[0])

    has_header = True
    seen = set()
    seen_counter = set()
    with open(full_mode_file, 'w') as fm_file, \
            open(input_file, "r") as in_file, \
            open(db_node_file, 'w') as db_file, open(
            mapping_file, 'a') as mf:
        fm_file.write('# Full mode table for %s\n' % mode_name)
        fm_file.write('# File generated on: %s\n' % utils.get_current_date())
        fm_file.write('# mambo_nid%sdataset_ids\n' % delimiter)

        db_file.write('# Mode table for dataset: %s\n' % dataset_name)
        db_file.write('# File generated on: %s\n' % utils.get_current_date())

        add_schema = True
        for line in in_file:
            if line[0] in COMMENT:  # skip comments
                continue
            if has_header:  # skip the single header row of the input file
                has_header = False
                continue

            vals = utils.split_then_strip(line, delimiter)
            if add_schema:
                attrs_schema = '# mambo_nid%sdataset_nid' % delimiter
                for i in range(len(vals)):
                    if i != node_index:
                        attrs_schema += '%sC%d' % (delimiter, i)
                db_file.write('%s\n' % attrs_schema)
                add_schema = False

            # Strip a dotted prefix from the node id if one is present.
            node_id = vals[node_index].split('.')
            node_id = node_id[0] if len(node_id) == 1 else node_id[1]
            if node_id in seen or len(node_id) == 0:
                continue
            attrs_str = ''
            for i in range(len(vals)):
                if i != node_index:
                    attrs_str += delimiter + vals[i]
            counter = 0
            if node_id in mapping:
                counter = int(mapping[node_id])
            elif not skip:
                # Unseen id: assign a fresh mambo id and append a row for it to
                # the mapping file, with NONE in every column except map_index.
                max_id = max_id + 1
                counter = max_id
                result = "%d%s" % (counter, delimiter)
                for i in range(num_cols - 1):
                    label = NONE if i + 1 != map_index else node_id
                    result = result + label + delimiter
                result = result.strip(delimiter) + '\n'
                mf.write(result)
            db_ids = str(db_id)
            if counter in full_mode_map:
                db_ids = '%s,%s' % (full_mode_map[counter], db_id)
            fm_file.write('%d%s%s\n' % (counter, delimiter, db_ids))
            db_file.write('%d%s%s%s\n' %
                          (counter, delimiter, vals[node_index], attrs_str))
            seen.add(node_id)
            seen_counter.add(counter)
        for counter in full_mode_map:
            if counter not in seen_counter:
                fm_file.write('%d%s%s\n' %
                              (counter, delimiter, full_mode_map[counter]))
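A hypothetical call to create_mapped_mode_table (paths and indices are illustrative): node ids are first resolved through mapping_file so the same entity keeps one mambo nid across datasets, and with skip=False any unmapped ids are appended to the mapping.

create_mapped_mode_table(mode_name='protein',
                         input_file='proteins.tsv',
                         dataset_name='ExampleDataset',
                         db_id=1,
                         mapping_file='output/protein_mapping.tsv',
                         skip=False,   # assign fresh mambo ids to unmapped nodes
                         map_index=1,
                         node_index=0,
                         output_dir='output/',
                         full_mode_file=None,
                         db_node_file=None)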
def create_mambo_crossnet_table(input_file,
                                src_file,
                                dst_file,
                                dataset_name,
                                db_id,
                                src_node_index,
                                dst_node_index,
                                mode_name1,
                                mode_name2,
                                output_dir,
                                full_crossnet_file,
                                db_edge_file,
                                src_mode_filter,
                                dst_mode_filter,
                                mambo_id_counter_start,
                                skip_missing_ids,
                                verbose=False,
                                delimiter=DELIMITER):
    inFNm = input_file
    srcFile = src_file
    dstFile = dst_file
    dataset = dataset_name

    srcIdx = src_node_index
    dstIdx = dst_node_index

    src_db_id = utils.parse_dataset_id_from_name(os.path.basename(srcFile))
    dst_db_id = utils.parse_dataset_id_from_name(os.path.basename(dstFile))

    mode_name1 = utils.parse_mode_name_from_name(
        os.path.basename(srcFile)) if mode_name1 is None else mode_name1
    mode_name2 = utils.parse_mode_name_from_name(
        os.path.basename(dstFile)) if mode_name2 is None else mode_name2

    outFNm = full_crossnet_file
    if outFNm is None:
        outFNm = os.path.join(
            output_dir, utils.get_full_cross_file_name(mode_name1, mode_name2))
    outFNm2 = db_edge_file
    if outFNm2 is None:
        outFNm2 = os.path.join(
            output_dir,
            utils.get_cross_file_name(mode_name1, mode_name2, db_id, dataset))

    src_mapping = utils.read_mode_file(srcFile)
    if os.path.samefile(srcFile, dstFile):
        dst_mapping = src_mapping
    else:
        dst_mapping = utils.read_mode_file(dstFile)

    src_filter = utils.get_filter(src_mode_filter)
    dst_filter = utils.get_filter(dst_mode_filter)

    add_schema = True
    counter = mambo_id_counter_start
    if counter == -1:
        counter = utils.get_max_id(outFNm)
    if verbose:
        print('Starting at mambo id: %d' % counter)
    with open(inFNm, 'r') as inF, \
            open(outFNm, 'a') as fullF, \
            open(outFNm2, 'w') as dbF:
        # Add schema/metadata
        if counter == 0:
            fullF.write('# Full crossnet file for %s to %s\n' %
                        (mode_name1, mode_name2))
            fullF.write('# File generated on: %s\n' % utils.get_current_date())
            fullF.write(
                '# mambo_eid%sdataset_id%ssrc_mambo_nid%sdst_mambo_nid\n' %
                (delimiter, delimiter, delimiter))
        dbF.write('# Crossnet table for dataset: %s\n' % dataset)
        dbF.write('# File generated on: %s\n' % utils.get_current_date())
        # Process file
        for line in inF:
            if line[0] in COMMENT:
                continue
            vals = utils.split_then_strip(line, delimiter)
            if add_schema:
                attrs_schema = '# mambo_eid%ssrc_dataset_id%sdst_dataset_id' % (
                    delimiter, delimiter)
                for i in range(len(vals)):
                    if i != srcIdx and i != dstIdx:
                        attrs_schema += '%sC%d' % (delimiter, i)
                dbF.write('%s\n' % attrs_schema)
                add_schema = False
            id1 = vals[srcIdx]
            id2 = vals[dstIdx]
            if src_filter:
                id1 = src_filter(id1)
            if dst_filter:
                id2 = dst_filter(id2)
            if id1 == '' or id2 == '':
                continue
            if skip_missing_ids and (id1 not in src_mapping
                                     or id2 not in dst_mapping):
                continue
            attr_strs = ''
            for i in range(len(vals)):
                if i != srcIdx and i != dstIdx:
                    attr_strs += delimiter + vals[i]
            fullF.write('%d%s%d%s%d%s%d\n' %
                        (counter, delimiter, db_id, delimiter,
                         src_mapping[id1], delimiter, dst_mapping[id2]))
            dbF.write('%d%s%d%s%d%s\n' % (counter, delimiter, src_db_id,
                                          delimiter, dst_db_id, attr_strs))
            counter += 1
    if verbose:
        print('Ending at mambo id: %d' % counter)
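A hypothetical invocation of create_mambo_crossnet_table; the src_file and dst_file names below are placeholders for mode tables produced by utils.get_mode_file_name, whose naming scheme is not shown here, and utils.parse_dataset_id_from_name expects that scheme in the basenames:

create_mambo_crossnet_table(input_file='edges.tsv',
                            src_file='output/gene-mode.tsv',     # placeholder name
                            dst_file='output/protein-mode.tsv',  # placeholder name
                            dataset_name='ExampleDataset',
                            db_id=0,
                            src_node_index=0,
                            dst_node_index=1,
                            mode_name1='gene',   # pass None to parse from src_file
                            mode_name2='protein',
                            output_dir='output/',
                            full_crossnet_file=None,
                            db_edge_file=None,
                            src_mode_filter=None,
                            dst_mode_filter=None,
                            mambo_id_counter_start=-1,
                            skip_missing_ids=True,
                            verbose=True)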