def __init__(self, model: CommunitiesModel, batch_size: int, session: Session, table: str):
    self._log = logging.getLogger("BatchedCommunityResolver")
    self.resolver = progress_bar(
        BatchedHashResolver(self._gen_hashes(model), batch_size, session, table),
        self._log, expected_size=model.count_elements()
    )
    self._prev = None, None, None
def evaluate_communities(args):
    log = logging.getLogger("evalcc")
    model = CommunitiesModel().load(args.input)
    patch_tables(args)
    spark = create_spark("evalcc-%s" % uuid4(), **args.__dict__)
    log.info("Preparing the communities' RDD")
    items = []
    for i, c in progress_bar(enumerate(model.communities), log,
                             expected_size=len(model.communities)):
        for m in c:
            if m < len(model.id_to_element):
                items.append(Row(sha1=model.id_to_element[m], community=i))
    log.info("Running")
    items_in_spark = spark.sparkContext.parallelize(items).toDF()
    bags = spark \
        .read \
        .format("org.apache.spark.sql.cassandra") \
        .options(table=args.tables["bags"], keyspace=args.keyspace) \
        .load()
    log.info("Loaded the bags, calculating the vocabulary")
    vocabulary = bags.drop("sha1", "value").distinct().rdd.map(lambda x: x.item).collect()
    vocabulary = {v: i for i, v in enumerate(vocabulary)}
    log.info("Vocabulary size: %d", len(vocabulary))
    element_to_id = {e: i for i, e in enumerate(model.id_to_element)}
    metrics = items_in_spark.join(bags, "sha1").rdd \
        .map(lambda r: (r.community, (element_to_id[r.sha1], vocabulary[r.item], r.value))) \
        .groupByKey() \
        .map(CommunityEvaluator(args.threshold, len(vocabulary))) \
        .reduce(lambda v1, v2: [v1[i] + v2[i] for i in range(4)])
    log.info("Total misses: %d", metrics[0])
    log.info("Average normalized misses: %f", metrics[1] / len(model.communities))
    log.info("Total loss: %f", metrics[2])
    log.info("Average normalized loss: %f", numpy.sqrt(metrics[3] / len(model.communities)))
def convert(self, models_path: List[str], destdir: str) -> int:
    """
    Performs the model -> model conversion. Runs the conversions in a pool of processes.

    :param models_path: List of paths to the models.
    :param destdir: The directory where to store the models. The directory structure is \
                    preserved.
    :return: The number of converted files.
    """
    files = list(models_path)
    self._log.info("Found %d files", len(files))
    if not files:
        return 0
    queue_in = multiprocessing.Manager().Queue()
    queue_out = multiprocessing.Manager().Queue(1)
    processes = [multiprocessing.Process(target=self._process_entry,
                                         args=(i, destdir, queue_in, queue_out))
                 for i in range(self.num_processes)]
    for p in processes:
        p.start()
    for f in files:
        queue_in.put(f)
    for _ in processes:
        queue_in.put(None)
    failures = 0
    for _ in progress_bar(files, self._log, expected_size=len(files)):
        filename, ok = queue_out.get()
        if not ok:
            failures += 1
    for p in processes:
        p.join()
    self._log.info("Finished, %d failed files", failures)
    return len(files) - failures
def convert(self, srcdir: str, destdir: str, pattern: str = "**/*.asdf") -> int:
    """
    Performs the model -> model conversion. Runs the conversions in a pool of processes.

    :param srcdir: The directory to scan for the models.
    :param destdir: The directory where to store the models. The directory structure is \
                    preserved.
    :param pattern: Glob pattern for the files.
    :return: The number of converted files.
    """
    self._log.info("Scanning %s", srcdir)
    files = [str(p) for p in Path(srcdir).glob(pattern)]
    self._log.info("Found %d files", len(files))
    if not files:
        return 0
    queue_in = multiprocessing.Manager().Queue()
    queue_out = multiprocessing.Manager().Queue(1)
    processes = [multiprocessing.Process(target=self._process_entry,
                                         args=(i, destdir, srcdir, queue_in, queue_out))
                 for i in range(self.num_processes)]
    for p in processes:
        p.start()
    for f in files:
        queue_in.put(f)
    for _ in processes:
        queue_in.put(None)
    failures = 0
    for _ in progress_bar(files, self._log, expected_size=len(files)):
        filename, ok = queue_out.get()
        if not ok:
            failures += 1
    for p in processes:
        p.join()
    self._log.info("Finished, %d failed files", failures)
    return len(files) - failures
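# A minimal usage sketch for the directory-scanning convert() above. "ModelConverter" is a
# hypothetical name for the class that owns this method (its real name and constructor are
# not shown here); the paths and num_processes value are placeholders.
converter = ModelConverter(num_processes=4)
converted = converter.convert("models/", "converted/", pattern="**/*.asdf")
print("converted %d models" % converted)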
def fetch_model(self, source: str, file: Union[str, BinaryIO],
                chunk_size: int = DEFAULT_CHUNK_SIZE) -> None:
    self._log.info("Fetching %s...", source)
    r = requests.get(source, stream=True)
    if r.status_code != 200:
        self._log.error("An error occurred while fetching the model, with code %s",
                        r.status_code)
        raise ValueError
    if isinstance(file, str):
        os.makedirs(os.path.dirname(file), exist_ok=True)
        f = open(file, "wb")
    else:
        f = file
    try:
        total_length = int(r.headers.get("content-length"))
        num_chunks = math.ceil(total_length / chunk_size)
        if num_chunks == 1:
            f.write(r.content)
        else:
            for chunk in progress_bar(
                    r.iter_content(chunk_size=chunk_size), self._log,
                    expected_size=num_chunks):
                if chunk:
                    f.write(chunk)
    finally:
        if isinstance(file, str):
            f.close()
def test_progress_bar(self):
    logger = logging.getLogger("progress")
    logger.setLevel(logging.INFO)
    stream = io.StringIO()
    stream.isatty = lambda: True
    progress.STREAM = stream
    list(progress_bar.progress_bar(range(10), logger, expected_size=10))
    self.assertEqual(stream.getvalue().strip()[-51:],
                     "[################################] 10/10 - 00:00:00")
    progress.STREAM = sys.stderr
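# For reference, a minimal sketch of the progress_bar() helper used throughout these
# functions, assuming it is imported as a function (as in the other snippets here).
import logging

log = logging.getLogger("demo")
log.setLevel(logging.INFO)
# expected_size drives the bar length when the iterable does not expose len().
for _ in progress_bar(range(100), log, expected_size=100):
    pass  # per-item work goes here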
def transform(self, repos, output, num_processes=None):
    """
    Converts repositories to models and saves them to the output directory.

    :param repos: The list of repository URLs or paths, or files which list \
                  repository URLs or paths.
    :param output: The output directory where to store the results.
    :param num_processes: Number of processes to use; if negative, use all CPUs.
    :return: The number of successfully processed repositories.
    """
    self._args["log_level"] = self._log.level
    if num_processes is None:
        num_processes = self.num_processes
    if num_processes < 0:
        num_processes = multiprocessing.cpu_count()
    inputs = []
    if isinstance(repos, str):
        repos = [repos]
    for repo in repos:
        # check if it's a text file
        if os.path.isfile(repo):
            with open(repo) as f:
                inputs.extend(l.strip() for l in f)
        else:
            inputs.append(repo)
    os.makedirs(output, exist_ok=True)
    queue = multiprocessing.Manager().Queue(1)
    failures = 0
    with multiprocessing.Pool(processes=num_processes) as pool:
        pool.starmap_async(
            type(self).process_entry,
            zip(inputs, repeat(self._args), repeat(output), repeat(queue),
                repeat(self._organize_files)))
        for _ in progress_bar(inputs, self._log, expected_size=len(inputs)):
            repo, ok = queue.get()
            if not ok:
                failures += 1
    self._log.info("Finished, %d failed repos", failures)
    return len(inputs) - failures
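# A minimal usage sketch. "repo2model" stands in for an already-constructed instance of the
# class that owns transform(); the repository list and output directory are placeholders.
repos = ["https://github.com/src-d/ml", "more_repos.txt"]  # URLs, paths, or list files
succeeded = repo2model.transform(repos, output="models_out", num_processes=4)
print("%d repositories processed successfully" % succeeded)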
def convert_bow_to_vw(bow: BOW, output: str):
    log = logging.getLogger("bow2vw")
    log.info("Writing %s", output)
    with open(output, "w") as fout:
        for index in progress_bar(bow, log, expected_size=len(bow)):
            record = bow[index]
            fout.write(record[0].replace(":", "").replace(" ", "_") + " ")
            pairs = []
            for t, v in zip(*record[1:]):
                try:
                    word = bow.tokens[t]
                except (KeyError, IndexError):
                    log.warning("%d not found in the vocabulary", t)
                    continue
                pairs.append("%s:%s" % (word, v))
            fout.write(" ".join(pairs))
            fout.write("\n")
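# A minimal usage sketch, assuming BOW follows the same load() convention as the other
# models in this collection; the file names are placeholders.
bow = BOW().load("bow.asdf")
convert_bow_to_vw(bow, "dataset.vw")  # writes the Vowpal Wabbit text format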
def load_and_check(filepaths: list, log: logging.Logger):
    """
    Load Cooccurrences models from the list of file paths and run simple sanity checks:

    1. If a model contains negative values, it is considered corrupted, reported and skipped.
    2. If a model contains values greater than MAX_INT32, they are saturated to MAX_INT32.
    """
    for path in progress_bar(filepaths, log):
        coocc = Cooccurrences().load(path)
        negative_values = np.where(coocc.matrix.data < 0)
        if negative_values[0].size > 0:
            log.warning("Model %s is corrupted and will be skipped. "
                        "It contains negative elements.", path)
            continue
        too_big_values = np.where(coocc.matrix.data > MAX_INT32)
        if too_big_values[0].size > 0:
            log.warning("Model %s contains elements with values more than MAX_INT32. "
                        "They will be saturated to MAX_INT32", path)
            coocc.matrix.data[too_big_values] = MAX_INT32
        yield path, coocc
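# A minimal sketch of consuming the generator above; the model paths are placeholders.
import logging

log = logging.getLogger("coocc-check")
for path, coocc in load_and_check(["coocc_0.asdf", "coocc_1.asdf"], log):
    log.info("%s passed the checks, %d stored elements", path, coocc.matrix.data.size)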
def download(source: str, file: Union[str, BinaryIO], log: logging.Logger,
             chunk_size: int = -1) -> None:
    """
    Download a file from an HTTP source.

    :param source: URL to fetch.
    :param file: Where to store the downloaded data.
    :param log: Logger.
    :param chunk_size: Size of the download buffer.
    """
    log.info("Fetching %s...", source)
    if chunk_size < 0:
        chunk_size = DEFAULT_DOWNLOAD_CHUNK_SIZE
    r = requests.get(source, stream=True)
    if r.status_code != 200:
        log.error("An error occurred while fetching the model, with code %s", r.status_code)
        raise ValueError
    if isinstance(file, str):
        os.makedirs(os.path.dirname(file), exist_ok=True)
        f = open(file, "wb")
    else:
        f = file
    try:
        total_length = int(r.headers.get("content-length"))
        num_chunks = math.ceil(total_length / chunk_size)
        if num_chunks == 1:
            f.write(r.content)
        else:
            for chunk in progress_bar(r.iter_content(chunk_size=chunk_size), log,
                                      expected_size=num_chunks):
                if chunk:
                    f.write(chunk)
    finally:
        if isinstance(file, str):
            f.close()
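# A minimal usage sketch for download(); the URL and destination path are placeholders.
# Passing an open binary file object instead of a path works as well.
import logging

log = logging.getLogger("downloader")
download("https://example.com/models/id2vec.asdf", "/tmp/id2vec.asdf", log)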
def read_identifiers(csv_path: str, use_header: bool, max_identifier_len: int,
                     identifier_col: int, split_identifier_col: int,
                     shuffle: bool = True) -> List[str]:
    """
    Reads the identifiers from the CSV file and filters out the ones which are too long.

    :param csv_path: Path to the CSV file.
    :param use_header: Treat the first line as a regular data row (True) or skip it as a \
                       header with column names (False).
    :param max_identifier_len: Maximum length of raw identifiers. Skip identifiers that are \
                               longer.
    :param identifier_col: Column index in the CSV file for the raw identifier.
    :param split_identifier_col: Column index in the CSV file for the split lowercase \
                                 identifier.
    :param shuffle: Indicates whether to reorder the list of identifiers at random after \
                    reading it.
    :return: List of split identifiers.
    """
    log = logging.getLogger("read_identifiers")
    log.info("Reading data from the CSV file %s", csv_path)
    identifiers = []
    # TODO: Update dataset loading as soon as https://github.com/src-d/backlog/issues/1212 done
    # Think about dataset download step
    with tarfile.open(csv_path, encoding="utf-8") as f:
        assert len(f.members) == 1, "One archived file is expected, got: %s" % len(f.members)
        content = f.extractfile(f.members[0])
        if not use_header:
            content.readline()
        for line in progress_bar(content.readlines(), log):
            row = line.decode("utf-8").strip().split(",")
            if len(row[identifier_col]) <= max_identifier_len:
                identifiers.append(row[split_identifier_col])
    if shuffle:
        numpy.random.shuffle(identifiers)
    log.info("Number of identifiers after filtering: %d.", len(identifiers))
    return identifiers
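# A minimal usage sketch; the archive path and column indices are placeholders and must
# match the actual dataset layout.
identifiers = read_identifiers("identifiers.csv.tar.gz", use_header=True,
                               max_identifier_len=256, identifier_col=3,
                               split_identifier_col=4)
print(identifiers[:5])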
def _fetch(self, url, where, chunk_size=DEFAULT_CHUNK_SIZE):
    self._log.info("Fetching %s...", url)
    r = requests.get(url, stream=True)
    if isinstance(where, str):
        os.makedirs(os.path.dirname(where), exist_ok=True)
        f = open(where, "wb")
    else:
        f = where
    try:
        total_length = int(r.headers.get("content-length"))
        num_chunks = math.ceil(total_length / chunk_size)
        if num_chunks == 1:
            f.write(r.content)
        else:
            for chunk in progress_bar(
                    r.iter_content(chunk_size=chunk_size), self._log,
                    expected_size=num_chunks):
                if chunk:
                    f.write(chunk)
    finally:
        if isinstance(where, str):
            f.close()
def get_dependent_reps(self, libs_info, save_to=None):
    """
    Creates a pandas dataframe with all information about dependent repositories from \
    libraries.

    :param libs_info: Pandas dataframe with all information about libraries.
    :param save_to: Path where to save the resulting dataframe with all information about \
                    dependent repositories, if you want to save it.
    :return: Pandas dataframe with all information about dependent repositories.
    """
    self._log.info("Creating list of dependent repos...")
    if hasattr(libs_info["ID"], "tolist"):
        lib_id2name = dict(zip(libs_info["ID"].tolist(), libs_info["Name"].tolist()))
    else:
        lib_id2name = {libs_info["ID"]: libs_info["Name"]}
    pd_result = []
    dependencies_path = os.path.join(self._librariesio_path, dependencies_filename)
    for chunk in progress_bar(pd.read_csv(dependencies_path,
                                          chunksize=LibrariesIOFetcher.CHUNKSIZE,
                                          index_col=False),
                              self._log, expected_size=100):
        for lib_id in lib_id2name:
            res = chunk[chunk["Dependency Project ID"] == int(lib_id)]
            if len(res) > 0:
                pd_result.append(res)
    pd_result = pd.concat(pd_result)
    pd_result["url"] = "https://" + \
        pd_result["Host Type"].map(LibrariesIOFetcher.HOST2LINK) + \
        pd_result["Repository Name with Owner"]
    if save_to:
        pd_result.to_csv(save_to, index=False)
    return pd_result
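# A minimal usage sketch. "fetcher" stands in for an already-constructed LibrariesIOFetcher
# pointing at a local Libraries.io dump; libs_info would typically come from the companion
# method that collects the library metadata. The output path is a placeholder.
deps = fetcher.get_dependent_reps(libs_info, save_to="dependent_repos.csv")
print(deps["url"].head())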
def preprocess(args):
    log = logging.getLogger("preproc")
    inputs = []
    for i in args.input:
        if os.path.isdir(i):
            inputs.extend([os.path.join(i, f) for f in os.listdir(i)])
        else:
            inputs.append(i)
    all_words = defaultdict(int)
    skipped = 0
    for i, path in progress_bar(enumerate(inputs), log, expected_size=len(inputs)):
        try:
            model = Cooccurrences().load(source=path)
        except ValueError:
            skipped += 1
            continue
        for w in model.tokens:
            all_words[w] += 1
    vs = args.vocabulary_size
    if len(all_words) < vs:
        vs = len(all_words)
    sz = args.shard_size
    if vs < sz:
        raise ValueError("vocabulary_size={0} is less than shard_size={1}. "
                         "You should specify smaller shard_size "
                         "(pass shard_size={0} argument).".format(vs, sz))
    vs -= vs % sz
    words = numpy.array(list(all_words.keys()))
    freqs = numpy.array(list(all_words.values()), dtype=numpy.int64)
    del all_words
    chosen_indices = numpy.argpartition(freqs, len(freqs) - vs)[len(freqs) - vs:]
    chosen_freqs = freqs[chosen_indices]
    chosen_words = words[chosen_indices]
    border_freq = chosen_freqs.min()
    border_mask = chosen_freqs == border_freq
    border_num = border_mask.sum()
    border_words = words[freqs == border_freq]
    border_words = numpy.sort(border_words)
    chosen_words[border_mask] = border_words[:border_num]
    del words
    del freqs
    sorted_indices = numpy.argsort(chosen_words)
    chosen_freqs = chosen_freqs[sorted_indices]
    chosen_words = chosen_words[sorted_indices]
    word_indices = {w: i for i, w in enumerate(chosen_words)}
    if args.df is not None:
        model = DocumentFrequencies()
        model.construct(docs=len(inputs) - skipped, tokens=chosen_words, freqs=chosen_freqs)
        model.save(args.df)
    del chosen_freqs
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    with open(os.path.join(args.output, "row_vocab.txt"), "w") as out:
        out.write('\n'.join(chosen_words))
    shutil.copyfile(os.path.join(args.output, "row_vocab.txt"),
                    os.path.join(args.output, "col_vocab.txt"))
    del chosen_words
    ccmatrix = csr_matrix((vs, vs), dtype=numpy.int64)
    for i, path in progress_bar(enumerate(inputs), log, expected_size=len(inputs)):
        try:
            model = Cooccurrences().load(path)
        except ValueError:
            continue
        if len(model) == 0:
            continue
        matrix = _extract_coocc_matrix(ccmatrix.shape, word_indices, model)
        # Stage 5 - simply add this converted matrix to the global one
        ccmatrix += matrix
    bool_sums = ccmatrix.indptr[1:] - ccmatrix.indptr[:-1]
    with open(os.path.join(args.output, "row_sums.txt"), "w") as out:
        out.write('\n'.join(map(str, bool_sums.tolist())))
    shutil.copyfile(os.path.join(args.output, "row_sums.txt"),
                    os.path.join(args.output, "col_sums.txt"))
    reorder = numpy.argsort(-bool_sums)
    os.makedirs(args.output, exist_ok=True)
    nshards = vs // args.shard_size
    for row in progress_bar(range(nshards), log, expected_size=nshards):
        for col in range(nshards):
            def _int64s(xs):
                return tf.train.Feature(int64_list=tf.train.Int64List(value=list(xs)))

            def _floats(xs):
                return tf.train.Feature(float_list=tf.train.FloatList(value=list(xs)))

            indices_row = reorder[row::nshards]
            indices_col = reorder[col::nshards]
            shard = ccmatrix[indices_row][:, indices_col].tocoo()
            example = tf.train.Example(features=tf.train.Features(feature={
                "global_row": _int64s(indices_row),
                "global_col": _int64s(indices_col),
                "sparse_local_row": _int64s(shard.row),
                "sparse_local_col": _int64s(shard.col),
                "sparse_value": _floats(shard.data)}))
            with open(os.path.join(args.output, "shard-%03d-%03d.pb" % (row, col)),
                      "wb") as out:
                out.write(example.SerializeToString())
def detect_communities(args):
    log = logging.getLogger("cmd")
    ccsmodel = ConnectedComponentsModel().load(args.input)
    log.info("Building the connected components")
    ccs = defaultdict(list)
    for i, c in enumerate(ccsmodel.id_to_cc):
        ccs[c].append(i)
    buckmat = ccsmodel.id_to_buckets
    buckindices = buckmat.indices
    buckindptr = buckmat.indptr
    total_nvertices = buckmat.shape[0]
    linear = args.edges in ("linear", "1")
    graphs = []
    communities = []
    if not linear:
        log.info("Transposing the matrix")
        buckmat_csc = buckmat.T.tocsr()
    fat_ccs = []
    for vertices in ccs.values():
        if len(vertices) == 1:
            continue
        if len(vertices) == 2:
            communities.append(vertices)
            continue
        fat_ccs.append(vertices)
    log.info("Building %d graphs", len(fat_ccs))
    for vertices in progress_bar(fat_ccs, log, expected_size=len(fat_ccs)):
        if linear:
            edges = []
            weights = []
            bucket_weights = buckmat.sum(axis=0)
            buckets = set()
            for i in vertices:
                for j in range(buckindptr[i], buckindptr[i + 1]):
                    bucket = buckindices[j]
                    weights.append(bucket_weights[0, bucket])
                    bucket += total_nvertices
                    buckets.add(bucket)
                    edges.append((str(i), str(bucket)))
        else:
            edges = set()
            weights = None
            buckets = set()
            for i in vertices:
                for j in range(buckindptr[i], buckindptr[i + 1]):
                    buckets.add(buckindices[j])
            for bucket in buckets:
                buckverts = \
                    buckmat_csc.indices[buckmat_csc.indptr[bucket]:buckmat_csc.indptr[bucket + 1]]
                for i, x in enumerate(buckverts):
                    for y in buckverts:
                        if x < y:
                            edges.add((str(x), str(y)))
            buckets.clear()
            edges = list(edges)
        graph = Graph(directed=False)
        graph.add_vertices(list(map(str, vertices + list(buckets))))
        graph.add_edges(edges)
        graph.edge_weights = weights
        graphs.append(graph)
    log.info("Launching the community detection")
    detector = CommunityDetector(algorithm=args.algorithm, config=args.params)
    if not args.no_spark:
        spark = create_spark("cmd-%s" % uuid4(), **args.__dict__).sparkContext
        communities.extend(spark.parallelize(graphs).flatMap(detector).collect())
    else:
        communities.extend(chain.from_iterable(progress_bar(
            (detector(g) for g in graphs), log, expected_size=len(graphs))))
    log.info("Overall communities: %d", len(communities))
    log.info("Average community size: %.1f", numpy.mean([len(c) for c in communities]))
    log.info("Median community size: %.1f", numpy.median([len(c) for c in communities]))
    log.info("Max community size: %d", max(map(len, communities)))
    log.info("Writing %s", args.output)
    CommunitiesModel().construct(communities, ccsmodel.id_to_element).save(args.output)
def id2vec_preprocess(args):
    """
    Loads co-occurrence matrices for several repositories and generates the document
    frequencies and the Swivel protobuf dataset.

    :param args: :class:`argparse.Namespace` with "input", "docfreq_in", "vocabulary_size", \
                 "shard_size", "log_level" and "output".
    :return: None
    """
    log = logging.getLogger("preproc")
    log.info("Loading docfreq model from %s", args.docfreq_in)
    df_model = DocumentFrequencies(log_level=args.log_level).load(source=args.docfreq_in)
    coocc_model = Cooccurrences().load(args.input)
    if numpy.any(coocc_model.matrix.data < 0):
        raise ValueError(("Co-occurrence matrix %s contains negative elements. "
                          "Please check its correctness.") % args.input)
    if numpy.any(numpy.isnan(coocc_model.matrix.data)):
        raise ValueError(("Co-occurrence matrix %s contains nan elements. "
                          "Please check its correctness.") % args.input)
    try:
        df_meta = coocc_model.get_dep(DocumentFrequencies.NAME)
        if df_model.meta != df_meta:
            raise ValueError((
                "Document frequency model you provided does not match dependency inside "
                "Cooccurrences model:\nargs.docfreq.meta:\n%s\ncoocc_model.get_dep"
                "(\"docfreq\")\n%s\n") % (df_model.meta, df_meta))
    except KeyError:
        pass  # There is no docfreq dependency
    vs = args.vocabulary_size
    if len(df_model) < vs:
        vs = len(df_model)
    sz = args.shard_size
    if vs < sz:
        raise ValueError(
            "vocabulary_size=%s is less than shard_size=%s. You should specify a smaller "
            "shard_size (e.g. shard_size=%s)." % (vs, sz, vs))
    vs -= vs % sz
    log.info("Effective vocabulary size: %d", vs)
    df_model = df_model.greatest(vs)
    log.info("Sorting the vocabulary...")
    chosen_words = sorted(df_model.tokens())
    word_indices = {w: i for i, w in enumerate(chosen_words)}
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    with open(os.path.join(args.output, "row_vocab.txt"), "w") as out:
        out.write('\n'.join(chosen_words))
    log.info("Saved row_vocab.txt")
    shutil.copyfile(os.path.join(args.output, "row_vocab.txt"),
                    os.path.join(args.output, "col_vocab.txt"))
    log.info("Saved col_vocab.txt")
    del chosen_words
    ccmatrix = extract_coocc_matrix((vs, vs), word_indices, coocc_model)
    log.info("Planning the sharding...")
    bool_sums = ccmatrix.indptr[1:] - ccmatrix.indptr[:-1]
    reorder = numpy.argsort(-bool_sums)
    with open(os.path.join(args.output, "row_sums.txt"), "w") as out:
        out.write('\n'.join(map(str, bool_sums.tolist())))
    log.info("Saved row_sums.txt")
    shutil.copyfile(os.path.join(args.output, "row_sums.txt"),
                    os.path.join(args.output, "col_sums.txt"))
    log.info("Saved col_sums.txt")
    log.info("Writing the shards...")
    os.makedirs(args.output, exist_ok=True)
    nshards = vs // args.shard_size
    for row in progress_bar(range(nshards), log, expected_size=nshards):
        for col in range(nshards):
            indices_row = reorder[row::nshards]
            indices_col = reorder[col::nshards]
            shard = ccmatrix[indices_row][:, indices_col].tocoo()
            example = tf.train.Example(features=tf.train.Features(feature={
                "global_row": _int64s(indices_row),
                "global_col": _int64s(indices_col),
                "sparse_local_row": _int64s(shard.row),
                "sparse_local_col": _int64s(shard.col),
                "sparse_value": _floats(shard.data)}))
            with open(os.path.join(args.output, "shard-%03d-%03d.pb" % (row, col)),
                      "wb") as out:
                out.write(example.SerializeToString())
    log.info("Success")
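# id2vec_preprocess() above calls _int64s() and _floats() without defining them; presumably
# they are module-level helpers. A sketch mirroring the nested definitions in the preprocess
# variant below (assumes tensorflow is imported as tf, as elsewhere in this code).
def _int64s(xs):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(xs)))


def _floats(xs):
    return tf.train.Feature(float_list=tf.train.FloatList(value=list(xs)))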
def preprocess(args):
    """
    Loads co-occurrence matrices for several repositories and generates the document
    frequencies and the Swivel protobuf dataset.

    :param args: :class:`argparse.Namespace` with "input", "vocabulary_size", \
                 "shard_size", "df" and "output".
    :return: None
    """
    log = logging.getLogger("preproc")
    log.info("Scanning the inputs...")
    inputs = []
    for i in args.input:
        if os.path.isdir(i):
            inputs.extend([os.path.join(i, f) for f in os.listdir(i)])
        else:
            inputs.append(i)
    log.info("Reading word indices from %d files...", len(inputs))
    all_words = defaultdict(int)
    skipped = 0
    for i, path in progress_bar(enumerate(inputs), log, expected_size=len(inputs)):
        try:
            model = Cooccurrences().load(source=path)
        except ValueError:
            skipped += 1
            log.warning("Skipped %s", path)
            continue
        for w in model.tokens:
            all_words[w] += 1
    vs = args.vocabulary_size
    if len(all_words) < vs:
        vs = len(all_words)
    sz = args.shard_size
    if vs < sz:
        raise ValueError(
            "vocabulary_size={0} is less than shard_size={1}. "
            "You should specify smaller shard_size "
            "(pass shard_size={0} argument).".format(vs, sz))
    vs -= vs % sz
    log.info("Effective vocabulary size: %d", vs)
    log.info("Truncating the vocabulary...")
    words = numpy.array(list(all_words.keys()))
    freqs = numpy.array(list(all_words.values()), dtype=numpy.int64)
    del all_words
    chosen_indices = numpy.argpartition(freqs, len(freqs) - vs)[len(freqs) - vs:]
    chosen_freqs = freqs[chosen_indices]
    chosen_words = words[chosen_indices]
    border_freq = chosen_freqs.min()
    border_mask = chosen_freqs == border_freq
    border_num = border_mask.sum()
    border_words = words[freqs == border_freq]
    border_words = numpy.sort(border_words)
    chosen_words[border_mask] = border_words[:border_num]
    del words
    del freqs
    log.info("Sorting the vocabulary...")
    sorted_indices = numpy.argsort(chosen_words)
    chosen_freqs = chosen_freqs[sorted_indices]
    chosen_words = chosen_words[sorted_indices]
    word_indices = {w: i for i, w in enumerate(chosen_words)}
    if args.df is not None:
        log.info("Writing the document frequencies to %s...", args.df)
        model = DocumentFrequencies()
        model.construct(docs=len(inputs) - skipped, tokens=chosen_words, freqs=chosen_freqs)
        model.save(args.df)
    del chosen_freqs
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    with open(os.path.join(args.output, "row_vocab.txt"), "w") as out:
        out.write('\n'.join(chosen_words))
    log.info("Saved row_vocab.txt...")
    shutil.copyfile(os.path.join(args.output, "row_vocab.txt"),
                    os.path.join(args.output, "col_vocab.txt"))
    log.info("Saved col_vocab.txt...")
    del chosen_words
    log.info("Combining individual co-occurrence matrices...")
    ccmatrix = csr_matrix((vs, vs), dtype=numpy.int64)
    for i, path in progress_bar(enumerate(inputs), log, expected_size=len(inputs)):
        try:
            model = Cooccurrences().load(path)
        except ValueError:
            log.warning("Skipped %s", path)
            continue
        if len(model) == 0:
            log.warning("Skipped %s", path)
            continue
        matrix = _extract_coocc_matrix(ccmatrix.shape, word_indices, model)
        # Stage 5 - simply add this converted matrix to the global one
        ccmatrix += matrix
    log.info("Planning the sharding...")
    bool_sums = ccmatrix.indptr[1:] - ccmatrix.indptr[:-1]
    with open(os.path.join(args.output, "row_sums.txt"), "w") as out:
        out.write('\n'.join(map(str, bool_sums.tolist())))
    log.info("Saved row_sums.txt...")
    shutil.copyfile(os.path.join(args.output, "row_sums.txt"),
                    os.path.join(args.output, "col_sums.txt"))
    log.info("Saved col_sums.txt...")
    reorder = numpy.argsort(-bool_sums)
    log.info("Writing the shards...")
    os.makedirs(args.output, exist_ok=True)
    nshards = vs // args.shard_size
    for row in progress_bar(range(nshards), log, expected_size=nshards):
        for col in range(nshards):
            def _int64s(xs):
                return tf.train.Feature(int64_list=tf.train.Int64List(value=list(xs)))

            def _floats(xs):
                return tf.train.Feature(float_list=tf.train.FloatList(value=list(xs)))

            indices_row = reorder[row::nshards]
            indices_col = reorder[col::nshards]
            shard = ccmatrix[indices_row][:, indices_col].tocoo()
            example = tf.train.Example(features=tf.train.Features(feature={
                "global_row": _int64s(indices_row),
                "global_col": _int64s(indices_col),
                "sparse_local_row": _int64s(shard.row),
                "sparse_local_col": _int64s(shard.col),
                "sparse_value": _floats(shard.data)}))
            with open(os.path.join(args.output, "shard-%03d-%03d.pb" % (row, col)),
                      "wb") as out:
                out.write(example.SerializeToString())
    log.info("Success")