else:
    ds2_mapping = utils.read_mode_file(dsFile2)

# Append to an existing equivalence table; write the header only for new files.
add_header = True
if os.path.isfile(outFNm):
    add_header = False

with open(outFNm, 'a') as equivF:
    if add_header:
        equivF.write('# Equivalence table for mode %s\n' % mode_name)
        equivF.write('# File generated on: %s\n' % utils.get_current_date())
        equivF.write('# snap_nid_1\tsnap_nid_2\n')
    if inFNm is not None:
        # An explicit input file supplies the id pairs to equate.
        with open(inFNm, 'r') as inF:
            for line in inF:
                if line[0] == '#' or line[0] == '\n':
                    continue
                vals = utils.split_then_strip(line, '\t')
                id1 = vals[ds1Idx]
                id2 = vals[ds2Idx]
                if id1 == '' or id2 == '':
                    continue
                if args.skip_missing_ids and (id1 not in ds1_mapping
                                              or id2 not in ds2_mapping):
                    continue
                equivF.write('%d\t%d\n' % (ds1_mapping[id1], ds2_mapping[id2]))
    else:
        # Without an input file, equate dataset ids shared by both modes.
        for id1 in ds1_mapping:
            if id1 in ds2_mapping:
                equivF.write('%d\t%d\n' % (ds1_mapping[id1], ds2_mapping[id1]))
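# Minimal sketch (illustrative helper, not part of the original script) of the
# join performed above when no input file is given: pair the snap ids of every
# dataset id that is present in both mode mappings.
def equiv_pairs(ds1_mapping, ds2_mapping):
    return [(ds1_mapping[k], ds2_mapping[k])
            for k in ds1_mapping if k in ds2_mapping]

# e.g. equiv_pairs({'P001': 0, 'P002': 1}, {'P001': 7}) == [(0, 7)]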
                    help='how to identify the src nodes in the header for tsv')
parser.add_argument('--dst_node_name', default='node_id2', type=str,
                    help='how to identify the dst nodes in the header for tsv')

if __name__ == '__main__':
    args = parser.parse_args()
    with open(args.input_file, 'r') as inF:
        with open(args.output_file, 'w') as outF:
            outF.write('# Dataset: %s\n' % args.dataset_name)
            outF.write('# %s\t%s\n' % (args.src_node_name, args.dst_node_name))
            for i, line in enumerate(inF):
                if args.verbose and i % 1000000 == 0:
                    print 'Finished processing line %d in the original input file' % i
                # Skip comments, blank lines, and an unprefixed title line.
                if line[0] == '#' or line[0] == '!' or line[0] == '\n' or (
                        i == 0 and args.has_title):
                    continue
                vals = utils.split_then_strip(line, args.divider)
                src_nodes = [vals[args.src_node_col]]
                dst_nodes = [vals[args.dst_node_col]]
                # A cell may hold several ids joined by a separator; fan them out.
                if args.src_node_sep is not None:
                    src_nodes = src_nodes[0].split(args.src_node_sep)
                if args.dst_node_sep is not None:
                    dst_nodes = dst_nodes[0].split(args.dst_node_sep)
                for src_node in src_nodes:
                    if src_node == '':
                        continue
                    for dst_node in dst_nodes:
                        if dst_node != '':
                            outF.write('%s\t%s\n' % (src_node, dst_node))
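# Standalone sketch (hypothetical helper, not used by the script above) of the
# fan-out logic: each non-empty combination of source and destination ids
# becomes one edge.
def expand_edges(src_cell, dst_cell, src_sep=None, dst_sep=None):
    src_nodes = src_cell.split(src_sep) if src_sep is not None else [src_cell]
    dst_nodes = dst_cell.split(dst_sep) if dst_sep is not None else [dst_cell]
    return [(s, d) for s in src_nodes if s != ''
            for d in dst_nodes if d != '']

# e.g. expand_edges('A|B', 'X', src_sep='|') == [('A', 'X'), ('B', 'X')]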
parser.add_argument('input_file', help='input file name.')
parser.add_argument('output_file', help='output file name.')
parser.add_argument('dataset_name', help='Name of the dataset')
parser.add_argument('columns', metavar='N', type=int, nargs='+',
                    help='columns with node ids')
parser.add_argument('--has_title', action='store_true',
                    help='has a title line that is not prefixed with a #')
parser.add_argument('--verbose', action='store_true',
                    help='Print every 1,000,000 lines processed')
parser.add_argument('--divider', default='\t', type=str,
                    help='column separator, by default a tab')
parser.add_argument('--node_name', default='node_id', type=str,
                    help='how to identify the nodes in the header for tsv')

if __name__ == '__main__':
    args = parser.parse_args()
    with open(args.input_file, 'r') as inF:
        unique_ids = set()
        with open(args.output_file, 'w') as outF:
            outF.write('# Dataset: %s\n' % args.dataset_name)
            outF.write('# %s\n' % args.node_name)
            for i, line in enumerate(inF):
                if args.verbose and i % 1000000 == 0:
                    print 'Finished processing line %d in the original input file' % i
                # Skip comments, blank lines, and an unprefixed title line.
                if line[0] == '#' or line[0] == '!' or line[0] == '\n' or (
                        i == 0 and args.has_title):
                    continue
                vals = utils.split_then_strip(line, args.divider)
                # Emit each id the first time it appears in any requested column.
                for column in args.columns:
                    if vals[column] not in unique_ids and len(vals[column]) > 0:
                        unique_ids.add(vals[column])
                        new_line = '%s\n' % vals[column]
                        outF.write(new_line)
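# Hypothetical invocation (the script and file names are made up; the
# positional arguments match the parser above): collect the unique ids found
# in columns 0 and 2 of a tab-separated file into a one-column node list.
#
#   python extract_unique_node_ids.py raw.tsv nodes.tsv STRING 0 2 --verbose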
def create_mambo_mode_table(input_file, db_id, mode_name, dataset_name,
                            full_mode_file, output_dir, db_node_file,
                            mambo_id_counter_start, node_index,
                            verbose=False, delimiter=DELIMITER):
    # Resolve default output path names.
    inFNm = input_file
    dataset = dataset_name
    outFNm = full_mode_file
    if outFNm is None:
        outFNm = os.path.join(output_dir,
                              utils.get_full_mode_file_name(mode_name))
    dbFNm = db_node_file
    if dbFNm is None:
        dbFNm = os.path.join(
            output_dir, utils.get_mode_file_name(mode_name, db_id, dataset))
    counter = mambo_id_counter_start
    if counter == -1:
        counter = utils.get_max_id(outFNm)

    # Read the input file; append to the full mode table and write the
    # per-dataset node table.
    seen = set()
    if verbose:
        print 'Starting at mambo id: %d' % counter
    with open(inFNm, 'r') as inF, open(outFNm, 'a') as outF, \
            open(dbFNm, 'w') as dbF:
        if counter == 0:
            outF.write('# Full mode table for %s\n' % mode_name)
            outF.write('# File generated on: %s\n' % utils.get_current_date())
            outF.write('# mambo_nid%sdataset id\n' % delimiter)
        dbF.write('# Mode table for dataset: %s\n' % dataset)
        dbF.write('# File generated on: %s\n' % utils.get_current_date())
        add_schema = True
        for line in inF:
            if line[0] in COMMENT:  # skip comments
                continue
            vals = utils.split_then_strip(line, delimiter)
            if add_schema:
                # Derive the attribute schema from the first data line.
                attrs_schema = '# mambo_nid%sdataset_nid' % delimiter
                for i in range(len(vals)):
                    if i != node_index:
                        attrs_schema += '%sC%d' % (delimiter, i)
                dbF.write('%s\n' % attrs_schema)
                add_schema = False
            node_id = vals[node_index]
            if node_id in seen or len(node_id) == 0:
                continue
            attrs_str = ''
            for i in range(len(vals)):
                if i != node_index:
                    attrs_str += delimiter + vals[i]
            outF.write('%d%s%d\n' % (counter, delimiter, db_id))
            dbF.write('%d%s%s%s\n' % (counter, delimiter, node_id, attrs_str))
            seen.add(node_id)
            counter += 1
    if verbose:
        print 'Ending at mambo id: %d' % counter
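# Hypothetical usage (the path, ids, and names below are made up for
# illustration; 'genes.tsv' would need to exist, and DELIMITER/COMMENT come
# from this module):
#
#   create_mambo_mode_table(
#       input_file='genes.tsv', db_id=0, mode_name='gene',
#       dataset_name='STRING', full_mode_file=None, output_dir='output',
#       db_node_file=None, mambo_id_counter_start=0, node_index=0,
#       verbose=True)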
def create_mapped_mode_table(mode_name, input_file, dataset_name, db_id,
                             mapping_file, skip, map_index, node_index,
                             output_dir, full_mode_file, db_node_file,
                             delimiter=DELIMITER):
    if full_mode_file is None:
        full_mode_file = os.path.join(output_dir,
                                      utils.get_full_mode_file_name(mode_name))
    # Load any existing full mode table so prior dataset ids are preserved.
    full_mode_map = {}
    if os.path.isfile(full_mode_file):
        with open(full_mode_file, 'r') as fm_file:
            for line in fm_file:
                if line[0] in COMMENT:  # skip comments
                    continue
                split_line = line.strip().split(delimiter)
                full_mode_map[int(split_line[0])] = split_line[1]
    if db_node_file is None:
        db_node_file = os.path.join(
            output_dir, utils.get_mode_file_name(mode_name, db_id, dataset_name))

    # Load the mapping file; this assumes its mambo ids appear in increasing
    # order, so max_id ends up as the largest id seen.
    max_id = 0
    mapping = {}
    num_cols = 0
    with open(mapping_file, 'r') as mf:
        for line in mf:
            if line[0] in COMMENT:
                continue
            split_line = line.strip().split(delimiter)
            num_cols = len(split_line)
            mapping[split_line[map_index]] = split_line[0]
            max_id = int(split_line[0])

    has_header = True
    seen = set()
    seen_counter = set()
    with open(full_mode_file, 'w') as fm_file, \
            open(input_file, 'r') as in_file, \
            open(db_node_file, 'w') as db_file, \
            open(mapping_file, 'a') as mf:
        fm_file.write('# Full mode table for %s\n' % mode_name)
        fm_file.write('# File generated on: %s\n' % utils.get_current_date())
        fm_file.write('# mambo_nid%sdataset_ids\n' % delimiter)
        db_file.write('# Mode table for dataset: %s\n' % dataset_name)
        db_file.write('# File generated on: %s\n' % utils.get_current_date())
        add_schema = True
        for line in in_file:
            if line[0] in COMMENT or has_header:  # skip comments and the header
                has_header = False
                continue
            vals = utils.split_then_strip(line, delimiter)
            if add_schema:
                attrs_schema = '# mambo_nid%sdataset_nid' % delimiter
                for i in range(len(vals)):
                    if i != node_index:
                        attrs_schema += '%sC%d' % (delimiter, i)
                db_file.write('%s\n' % attrs_schema)
                add_schema = False
            # Strip a dotted prefix, e.g. '9606.ENSP0001' -> 'ENSP0001'.
            node_id = vals[node_index].split('.')
            node_id = node_id[0] if len(node_id) == 1 else node_id[1]
            if node_id in seen or len(node_id) == 0:
                continue
            attrs_str = ''
            for i in range(len(vals)):
                if i != node_index:
                    attrs_str += delimiter + vals[i]
            counter = 0
            if node_id in mapping:
                counter = int(mapping[node_id])
            elif not skip:
                # Unmapped node: mint a new mambo id and extend the mapping file.
                max_id = max_id + 1
                counter = max_id
                result = '%d%s' % (counter, delimiter)
                for i in range(num_cols - 1):
                    label = NONE if i + 1 != map_index else node_id
                    result = result + label + delimiter
                result = result.strip(delimiter) + '\n'
                mf.write(result)
            db_ids = (full_mode_map[counter] + ',' + str(db_id)
                      if counter in full_mode_map else str(db_id))
            fm_file.write('%d%s%s\n' % (counter, delimiter, db_ids))
            db_file.write('%d%s%s%s\n'
                          % (counter, delimiter, vals[node_index], attrs_str))
            seen.add(node_id)
            seen_counter.add(counter)
        # Carry over entries from the old full mode table that this dataset
        # did not touch.
        for counter in full_mode_map:
            if counter not in seen_counter:
                fm_file.write('%d%s%s\n'
                              % (counter, delimiter, full_mode_map[counter]))
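# Minimal sketch (hypothetical helper, not called above) of the id resolution
# in create_mapped_mode_table: strip a dotted prefix such as '9606.ENSP0001'
# down to 'ENSP0001', then reuse the mapped mambo id or mint a new one.
def resolve_mambo_id(raw_id, mapping, max_id):
    parts = raw_id.split('.')
    node_id = parts[0] if len(parts) == 1 else parts[1]
    if node_id in mapping:
        return node_id, int(mapping[node_id]), max_id
    return node_id, max_id + 1, max_id + 1

# e.g. resolve_mambo_id('9606.ENSP0001', {'ENSP0001': '5'}, 10)
#      == ('ENSP0001', 5, 10)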
def create_mambo_crossnet_table(input_file, src_file, dst_file, dataset_name,
                                db_id, src_node_index, dst_node_index,
                                mode_name1, mode_name2, output_dir,
                                full_crossnet_file, db_edge_file,
                                src_mode_filter, dst_mode_filter,
                                mambo_id_counter_start, skip_missing_ids,
                                verbose=False, delimiter=DELIMITER):
    inFNm = input_file
    srcFile = src_file
    dstFile = dst_file
    dataset = dataset_name
    srcIdx = src_node_index
    dstIdx = dst_node_index
    # Dataset ids and mode names can be parsed from the mode file names.
    src_db_id = utils.parse_dataset_id_from_name(os.path.basename(srcFile))
    dst_db_id = utils.parse_dataset_id_from_name(os.path.basename(dstFile))
    mode_name1 = utils.parse_mode_name_from_name(
        os.path.basename(srcFile)) if mode_name1 is None else mode_name1
    mode_name2 = utils.parse_mode_name_from_name(
        os.path.basename(dstFile)) if mode_name2 is None else mode_name2
    outFNm = full_crossnet_file
    if outFNm is None:
        outFNm = os.path.join(
            output_dir, utils.get_full_cross_file_name(mode_name1, mode_name2))
    outFNm2 = db_edge_file
    if outFNm2 is None:
        outFNm2 = os.path.join(
            output_dir,
            utils.get_cross_file_name(mode_name1, mode_name2, db_id, dataset))

    src_mapping = utils.read_mode_file(srcFile)
    if os.path.samefile(srcFile, dstFile):
        dst_mapping = src_mapping
    else:
        dst_mapping = utils.read_mode_file(dstFile)

    src_filter = utils.get_filter(src_mode_filter)
    dst_filter = utils.get_filter(dst_mode_filter)

    add_schema = True
    counter = mambo_id_counter_start
    if counter == -1:
        counter = utils.get_max_id(outFNm)
    if verbose:
        print 'Starting at mambo id: %d' % counter
    with open(inFNm, 'r') as inF, open(outFNm, 'a') as fullF, \
            open(outFNm2, 'w') as dbF:
        # Add schema/metadata.
        if counter == 0:
            fullF.write('# Full crossnet file for %s to %s\n'
                        % (mode_name1, mode_name2))
            fullF.write('# File generated on: %s\n' % utils.get_current_date())
            fullF.write('# mambo_eid%sdataset_id%ssrc_mambo_nid%sdst_mambo_nid\n'
                        % (delimiter, delimiter, delimiter))
        dbF.write('# Crossnet table for dataset: %s\n' % dataset)
        dbF.write('# File generated on: %s\n' % utils.get_current_date())
        # Process the input file.
        for line in inF:
            if line[0] in COMMENT:
                continue
            vals = utils.split_then_strip(line, delimiter)
            if add_schema:
                attrs_schema = '# mambo_eid%ssrc_dataset_id%sdst_dataset_id' \
                    % (delimiter, delimiter)
                for i in range(len(vals)):
                    if i != srcIdx and i != dstIdx:
                        attrs_schema += '%sC%d' % (delimiter, i)
                dbF.write('%s\n' % attrs_schema)
                add_schema = False
            id1 = vals[srcIdx]
            id2 = vals[dstIdx]
            if src_filter:
                id1 = src_filter(id1)
            if dst_filter:
                id2 = dst_filter(id2)
            if id1 == '' or id2 == '':
                continue
            if skip_missing_ids and (id1 not in src_mapping
                                     or id2 not in dst_mapping):
                continue
            attr_strs = ''
            for i in range(len(vals)):
                if i != srcIdx and i != dstIdx:
                    attr_strs += delimiter + vals[i]
            fullF.write('%d%s%d%s%d%s%d\n'
                        % (counter, delimiter, db_id, delimiter,
                           src_mapping[id1], delimiter, dst_mapping[id2]))
            dbF.write('%d%s%d%s%d%s\n'
                      % (counter, delimiter, src_db_id, delimiter,
                         dst_db_id, attr_strs))
            counter += 1
    if verbose:
        print 'Ending at mambo id: %d' % counter
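# Hypothetical call (all file names and ids below are made up; the mode file
# names are assumed to follow whatever convention
# utils.parse_dataset_id_from_name and utils.parse_mode_name_from_name expect):
# link a gene mode to a function mode using columns 0 and 1 of edges.tsv,
# appending to the default full crossnet file under output/.
#
#   create_mambo_crossnet_table(
#       input_file='edges.tsv', src_file='gene-0-STRING.tsv',
#       dst_file='function-1-GO.tsv', dataset_name='GO', db_id=0,
#       src_node_index=0, dst_node_index=1, mode_name1=None, mode_name2=None,
#       output_dir='output', full_crossnet_file=None, db_edge_file=None,
#       src_mode_filter=None, dst_mode_filter=None,
#       mambo_id_counter_start=-1, skip_missing_ids=True, verbose=True)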