def find_ids(nodes, *args):
    """Return the indices of the node rows whose artist field matches a name.

    A row matches when it consists of an arbitrary prefix, a tab, the literal
    word ``artist``, a tab, one of the given names and a terminating newline.
    Matching is case-insensitive.  The names are inserted into the pattern
    verbatim, so regular-expression metacharacters in them are interpreted
    as such.

    :param nodes: sequence of node rows (strings); the position in the
                  sequence is the value reported for a match
    :param args:  names to look for
    :return: list of matching indices in ascending order, or ``[-1]`` when
             no names were given
    """
    if not args:
        return [-1]

    progress = StatusBar(len(nodes))
    alternatives = '|'.join(args)
    pattern = re.compile(r'^.*?\tartist\t(%s)\n' % alternatives, re.IGNORECASE)

    matches = []
    for position, entry in enumerate(nodes):
        if pattern.match(entry) is not None:
            matches.append(position)
        # refresh the progress display only every 10000 rows
        if position % 10000 == 0:
            progress.update(position)
    progress.close()
    return matches
def merge(nodes, edges, components):
    """Combine node and edge rows into one list tagged with their component.

    Every non-empty row becomes ``"<component>\\t<tag>\\t<row>"`` where the
    component is ``components[get_id(row)]`` and the tag is ``'n'`` for rows
    taken from *nodes* and ``'e'`` for rows taken from *edges*.  Falsy rows
    (e.g. ``None`` placeholders) are dropped.

    Both input lists are consumed from the back and are empty afterwards.

    :param nodes:      list of node rows, emptied by this call
    :param edges:      list of edge rows, emptied by this call
    :param components: lookup indexed by ``get_id(row)``
    :return: list of merged rows; all node rows precede all edge rows and
             each group is in reverse input order
    """
    merged = []
    for rows, tag in ((nodes, 'n'), (edges, 'e')):
        progress = StatusBar(len(rows))
        processed = 0
        while rows:
            # popping shrinks the caller's list as we go, which keeps the
            # peak memory consumption down
            entry = rows.pop()
            if entry:
                component = components[get_id(entry)]
                merged.append("%s\t%s\t%s" % (component, tag, entry))
            processed += 1
            if processed % 10000 == 0:
                progress.update(processed)
        progress.close()
    return merged
def delete(nodes, edges, *deletion_indices):
    """Blank out the given nodes and every edge adjacent to them, in place.

    Deleted entries are replaced by ``None`` rather than removed, so the
    positions of all remaining entries stay unchanged.

    An edge row is expected to start with two tab-separated integer node
    ids.  Rows that are already falsy (for example ``None`` left behind by
    an earlier call) are skipped instead of being parsed.

    :param nodes:            list of node rows, modified in place
    :param edges:            list of edge rows, modified in place
    :param deletion_indices: indices of the nodes to delete
    :raises IndexError: if a deletion index is outside *nodes*
    :raises ValueError: if an edge row does not start with two integers
    """
    for index in deletion_indices:
        nodes[index] = None

    bar = StatusBar(len(edges))
    doomed = set(deletion_indices)  # set for constant time 'in' check
    for i, row in enumerate(edges):
        # An already deleted edge has nothing left to parse.
        if row:
            data = row.split("\t", 2)
            left = int(data[0])
            right = int(data[1])
            if left in doomed or right in doomed:
                edges[i] = None
        if i % 10000 == 0:
            bar.update(i)
    bar.close()
def read_lines(path, approx=10000000):
    """Read every line after the header line of a text file into a list.

    The first line of the file is discarded.  The last line is given a
    trailing newline if the file does not end with one, so that every
    returned entry ends with ``"\\n"``.  A file that contains only the
    header (or nothing at all) yields an empty list.

    :param path:   path of the file to read
    :param approx: estimated number of lines, used only to size the
                   StatusBar
    :return: list of the lines following the header, newlines included
    """
    bar = StatusBar(approx)
    lines = []
    with open(path) as handle:
        handle.readline()  # drop header
        for counter, line in enumerate(handle, 1):
            lines.append(line)
            if counter % 10000 == 0:
                bar.update(counter)
    # Without the emptiness check a header-only file raises IndexError here.
    if lines and not lines[-1].endswith("\n"):
        lines[-1] += "\n"
    bar.close()
    return lines
def compute(nodes, edges):
    """Assign every node the representative of its connected component.

    One set per node is created with ``make_sets``; for every non-empty edge
    row the two endpoints are joined with ``union``.  Afterwards each slot
    of the resulting list is replaced by what ``find`` returns for it
    (presumably the root of its set -- confirm against ``find``).

    An edge row is expected to start with two tab-separated integer node
    ids.  Falsy rows (e.g. ``None`` placeholders) are ignored.

    :param nodes: sequence of node rows; only its length is used
    :param edges: iterable of edge rows
    :return: the list created by ``make_sets`` after resolving every slot
    """
    parents = make_sets(len(nodes))

    progress = StatusBar(len(edges))
    for seen, edge in enumerate(edges, 1):
        if edge:
            first_tab = edge.find("\t")
            second_tab = edge.find("\t", first_tab + 1)
            source = int(edge[:first_tab])
            # the slice still contains the leading tab; int() ignores
            # surrounding whitespace
            target = int(edge[first_tab:second_tab])
            union(parents, source, target)
        if seen % 5000 == 0:
            progress.update(seen)
    progress.close()

    progress = StatusBar(len(parents))
    for position, parent in enumerate(parents):
        parents[position] = find(parents, parent)
        if position % 10000 == 0:
            progress.update(position)
    progress.close()

    return parents
def process_data(path_to_nodes, path_to_edges, path_to_output, *deletion_names):
    """
    Process the given data to be able to use the graph structure with networkx
    while not allocating over 9000MB of RAM.

    The nodes of the input data must have continuous ids.
    Furthermore, artist entries are expected to not end with a tab character.
    Otherwise an entry which should be deleted might not be deleted.

    The data undergoes the following steps.
    Step 1: read data into memory
    Step 2: delete nodes that match a given name
    Step 3: delete edges adjacent to nodes deleted in Step 2
    Step 4: find connected components
    Step 5: merge nodes and edges
    Step 6: sort
    Step 7: output to file

    :param path_to_nodes:  path of the node file (first line is a header)
    :param path_to_edges:  path of the edge file (first line is a header)
    :param path_to_output: path of the file to write; it starts with HEADER
                           followed by the sorted, merged rows
    :param deletion_names: artist names whose nodes (and adjacent edges)
                           are removed; steps 2 and 3 are skipped when empty
    """
    begin = datetime.now()

    # Step 1
    def read_lines(path, approx=10000000):
        """Return all lines after the header, each ending with a newline."""
        bar = StatusBar(approx)
        lines = []
        with open(path) as source:
            source.readline()  # drop header
            for counter, line in enumerate(source, 1):
                lines.append(line)
                if counter % 10000 == 0:
                    bar.update(counter)
        # A header-only file has no last line to repair.
        if lines and not lines[-1].endswith("\n"):
            lines[-1] += "\n"
        bar.close()
        return lines

    # The single-argument, parenthesised print form produces the same output
    # as a Python 2 print statement and as the Python 3 print function.
    print(">>> Reading nodes and edges...")
    nodes = read_lines(path_to_nodes, approx=10000000)
    edges = read_lines(path_to_edges, approx=27000000)

    # Step 2 and 3
    if deletion_names:
        print(">>> Searching ids voted for deletion...")
        deletion_ids = various_artists.find_ids(nodes, *deletion_names)
        print(">>> Deleting nodes and edges...")
        various_artists.delete(nodes, edges, *deletion_ids)

    # Step 4
    print(">>> Searching for connected components...")
    components = connected_components.compute(nodes, edges)

    # Step 5
    print(">>> Merging nodes and edges...")
    merged = merge.merge(nodes, edges, components)
    del nodes
    del edges

    # Step 6
    print(">>> Sorting according to connected components...")
    merged.sort()

    # Step 7
    print(">>> Writing to file...")
    bar = StatusBar(len(merged))
    with open(path_to_output, "w") as sink:
        sink.write(HEADER)
        for counter, line in enumerate(merged, 1):
            sink.write(line)
            if counter % 10000 == 0:
                bar.update(counter)
    # the with-statement has already closed the output file
    bar.close()
    del merged

    # say goodbye
    diff = datetime.now() - begin
    elapsed = timedelta(seconds=int(diff.total_seconds()))
    print(">>> Jobs Done! [%s]" % str(elapsed))