def test_partition_all():
    assert list(partition_all(2, [1, 2, 3, 4])) == [(1, 2), (3, 4)]
    assert list(partition_all(3, range(5))) == [(0, 1, 2), (3, 4)]
    assert list(partition_all(2, [])) == []

    # Regression test: https://github.com/pytoolz/toolz/issues/387
    class NoCompare(object):
        def __eq__(self, other):
            if self.__class__ == other.__class__:
                return True
            raise ValueError()

    obj = NoCompare()
    result = [(obj, obj, obj, obj), (obj, obj, obj)]
    assert list(partition_all(4, [obj] * 7)) == result
    assert list(partition_all(4, iter([obj] * 7))) == result
def fetch_quotes(stock_codes):
    """Fetch real-time (intraday) quotes for a list of stock codes.

    Parameters
    ----------
    stock_codes : list
        List of stock codes.

    Returns
    -------
    res : DataFrame
        Number of rows = len(stock_codes), 33 columns.

    Example
    -------
    >>> df = fetch_quotes(['000001', '000002'])
    >>> df.iloc[:, :8]
         股票代码  股票简称      开盘     前收盘      现价      最高      最低     竞买价
    0  000001  平安银行  11.040  11.050  10.900  11.050  10.880  10.900
    1  000002   万 科A  33.700  34.160  33.290  33.990  33.170  33.290
    """
    stock_codes = ensure_list(stock_codes)
    num = len(stock_codes)
    length = 800
    url_fmt = 'http://hq.sinajs.cn/list={}'
    dfs = []
    for p_codes in partition_all(length, stock_codes):
        # p_codes = stock_codes[i * length:(i + 1) * length]
        url = url_fmt.format(','.join(map(_add_prefix, p_codes)))
        content = get_page_response(url).text
        dfs.append(_to_dataframe(content, p_codes))
    return pd.concat(dfs).sort_values('股票代码')
def _(xs):
    try:
        _, xs = peek(xs)
        for chunk in partition_all(max_records_per_trie, xs):
            yield _dawg_with_bounds(chunk)
    except StopIteration:
        pass
def mapreduce(chunk_size):
    """ A long running task which splits up the input data to many workers """
    # create some sample data for our summation function
    data = []
    for i in range(10000):
        x = []
        for j in range(random.randrange(10) + 5):
            x.append(random.randrange(10000))
        data.append(x)

    # break up our data into chunks and create a dynamic list of workers
    print('preparing map')
    q = Queue('mapreduce_c', connection=StrictRedis())
    chunk_jobs = []
    for chunk in partition_all(chunk_size, data):
        chunk_jobs.append(
            q.enqueue_call(func=_map, args=(chunk,)))

    print('running map')
    while not all((job.is_finished for job in chunk_jobs)):
        pass

    print('preparing reduce')
    reduce_job = q.enqueue_call(
        func=_reduce, args=(tuple(job.result for job in chunk_jobs)))

    print('running reduce')
    while not reduce_job.is_finished:
        pass

    print('fin')
    return reduce_job.result
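For context, a minimal sketch of worker functions the RQ example above could enqueue. Only the names `_map` and `_reduce` come from the snippet; their bodies here are assumptions chosen to match a summation-style mapreduce.

# Hypothetical worker module (assumed bodies; only the names _map/_reduce
# appear in the snippet above). RQ workers import these by dotted path.
def _map(chunk):
    # chunk is a tuple of lists of ints; return one partial sum per list
    return [sum(x) for x in chunk]

def _reduce(*partials):
    # The reduce job receives each map result as a positional argument,
    # because args=tuple(job.result for job in chunk_jobs) is unpacked by RQ.
    return sum(sum(p) for p in partials)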
def fold(binop, seq, default=no_default, map=map, chunksize=128, combine=None):
    """
    Reduce without guarantee of ordered reduction.

    inputs:

    ``binop``     - associative operator. The associative property allows us to
                    leverage a parallel map to perform reductions in parallel.
    ``seq``       - a sequence to be aggregated
    ``default``   - an identity element like 0 for ``add`` or 1 for mul
    ``map``       - an implementation of ``map``. This may be parallel and
                    determines how work is distributed.
    ``chunksize`` - Number of elements of ``seq`` that should be handled
                    within a single function call
    ``combine``   - Binary operator to combine two intermediate results.
                    If ``binop`` is of type (total, item) -> total
                    then ``combine`` is of type (total, total) -> total
                    Defaults to ``binop`` for common case of operators like add

    Fold chunks up the collection into blocks of size ``chunksize`` and then
    feeds each of these to calls to ``reduce``. This work is distributed with a
    call to ``map``, gathered back and then refolded to finish the computation.
    In this way ``fold`` specifies only how to chunk up data but leaves the
    distribution of this work to an externally provided ``map`` function. This
    function can be sequential or rely on multithreading, multiprocessing, or
    even distributed solutions.

    If ``map`` intends to serialize functions it should be prepared to accept
    and serialize lambdas. Note that the standard ``pickle`` module fails here.

    Example
    -------

    >>> # Provide a parallel map to accomplish a parallel sum
    >>> from operator import add
    >>> fold(add, [1, 2, 3, 4], chunksize=2, map=map)
    10
    """
    assert chunksize > 1

    if combine is None:
        combine = binop

    chunks = partition_all(chunksize, seq)

    # Evaluate sequence in chunks via map
    if default == no_default:
        results = map(functools.partial(_reduce, binop), chunks)
    else:
        results = map(functools.partial(_reduce, binop, initial=default), chunks)

    results = list(results)  # TODO: Support complete laziness

    if len(results) == 1:    # Return completed result
        return results[0]
    else:                    # Recurse to reaggregate intermediate results
        return fold(combine, results, map=map, chunksize=chunksize)
def predict_files(files: List[Path], inference_session, bs, csv_fname) -> Tuple[set, int]:
    """Predict files"""
    with Progress() as progress:
        total_progress = progress.add_task("prediction progress", total=len(files))
        images_checked = 0
        bad_batch_files = []
        for i, batch in enumerate(itertoolz.partition_all(bs, files)):
            batch_predictions, bad_batch = try_predict_batch(
                batch, inference_session, bs)
            if bad_batch:
                bad_batch_files.append(batch)
            if i == 0 and not bad_batch:
                create_csv_header(batch_predictions, csv_fname)
            if not bad_batch:
                write_batch_preds_to_csv(batch_predictions, csv_fname)
            progress.update(total_progress, advance=len(batch))
            images_checked += len(batch)
    corrupt_images = set()
    if bad_batch_files:
        for batch in bad_batch_files:
            for file in batch:
                try:
                    batch_predictions = inference_session.predict_batch([file], bs)
                    write_batch_preds_to_csv(batch_predictions, csv_fname)
                except PIL.UnidentifiedImageError:
                    corrupt_images.add(file)
    return corrupt_images, images_checked
def _generate_batches(h, w, ps, patch_size, stride, batch_size=64):
    hdc_wdx_generator = itertools.product(
        range(0, h - patch_size + ps, stride),
        range(0, w - patch_size + ps, stride),
    )
    for batch_indexes in itertoolz.partition_all(batch_size, hdc_wdx_generator):
        yield batch_indexes
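A minimal sketch of how such index batches might be consumed, cropping patches out of an image array. The image array, the crop logic, and the concrete sizes are assumptions for illustration; only the batching generator comes from the function above.

import numpy as np

# Assumed inputs: a 512x512 RGB image and 64-pixel patches with no overlap.
image = np.zeros((512, 512, 3), dtype=np.uint8)
patch_size, stride, ps = 64, 64, 0

for batch_indexes in _generate_batches(512, 512, ps, patch_size, stride, batch_size=16):
    # Each batch is a tuple of (y, x) corner coordinates produced by itertools.product.
    patches = np.stack([image[y:y + patch_size, x:x + patch_size]
                        for y, x in batch_indexes])
    # ...run the model on `patches` (shape: [batch, patch_size, patch_size, 3])...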
def fold(binop, seq, default=no_default, map=map, chunksize=128, combine=None):
    """
    Reduce without guarantee of ordered reduction.

    inputs:

    ``binop``     - associative operator. The associative property allows us to
                    leverage a parallel map to perform reductions in parallel.
    ``seq``       - a sequence to be aggregated
    ``default``   - an identity element like 0 for ``add`` or 1 for mul
    ``map``       - an implementation of ``map``. This may be parallel and
                    determines how work is distributed.
    ``chunksize`` - Number of elements of ``seq`` that should be handled
                    within a single function call
    ``combine``   - Binary operator to combine two intermediate results.
                    If ``binop`` is of type (total, item) -> total
                    then ``combine`` is of type (total, total) -> total
                    Defaults to ``binop`` for common case of operators like add

    Fold chunks up the collection into blocks of size ``chunksize`` and then
    feeds each of these to calls to ``reduce``. This work is distributed with a
    call to ``map``, gathered back and then refolded to finish the computation.
    In this way ``fold`` specifies only how to chunk up data but leaves the
    distribution of this work to an externally provided ``map`` function. This
    function can be sequential or rely on multithreading, multiprocessing, or
    even distributed solutions.

    If ``map`` intends to serialize functions it should be prepared to accept
    and serialize lambdas. Note that the standard ``pickle`` module fails here.

    Example
    -------

    >>> # Provide a parallel map to accomplish a parallel sum
    >>> from operator import add
    >>> fold(add, [1, 2, 3, 4], chunksize=2, map=map)
    10
    """
    if combine is None:
        combine = binop

    chunks = partition_all(chunksize, seq)

    # Evaluate sequence in chunks via map
    if default is no_default:
        results = map(lambda chunk: reduce(binop, chunk), chunks)
    else:
        results = map(lambda chunk: reduce(binop, chunk, default), chunks)

    results = list(results)  # TODO: Support complete laziness

    if len(results) == 1:    # Return completed result
        return results[0]
    else:                    # Recurse to reaggregate intermediate results
        return fold(combine, results, map=map, chunksize=chunksize)
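A hedged usage sketch for the fold above: any map-like callable works, and a thread-pool map sidesteps the pickling caveat from the docstring (a process pool would choke on the lambda-based variant, since lambdas are not picklable).

from concurrent.futures import ThreadPoolExecutor
from operator import add

# Sum 0..9999 in ten chunks of 1000, evaluated by a thread pool.
with ThreadPoolExecutor(max_workers=4) as ex:
    total = fold(add, range(10000), default=0, map=ex.map, chunksize=1000)
print(total)  # 49995000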
async def fetch_all(batch_num=800):
    """Fetch raw real-time quote data for all stocks."""
    db = get_db()
    collection = db['指数列表']
    stock_codes = collection.find_one()['codes']
    b_codes = partition_all(batch_num, stock_codes)
    tasks = [to_dataframe(codes) for codes in b_codes]
    dfs = await asyncio.gather(*tasks)
    return pd.concat(dfs)
def test_merge(partition_size, num_items):
    """
    Verify that _merge correctly merges batches into one sorted iterable.
    """
    items = range(num_items)
    partitions = list(partition_all(partition_size, items))
    partition_ids = range(len(partitions))

    random.shuffle(partitions)

    merged = _merge(lambda x: partitions[x], partition_ids)
    assert list(merged) == list(items)
def predict_directory(
    directory: Path = typer.Argument(
        ...,
        readable=True,
        resolve_path=True,
        help="Directory to start searching for images from",
    ),
    csv_save_dir: Path = typer.Argument(
        ...,
        writable=True,
        resolve_path=True,
        help="Directory used to store the csv report",
    ),
    pattern: str = typer.Option("fs", help="Pattern used to filter image filenames"),
    bs: int = typer.Option(16, help="Batch Size"),
    image_format: str = typer.Option(
        ".tif",
        help="Image format for flyswot to use for predictions, defaults to `*.tif`",
    ),
    model_name: str = typer.Option(
        "latest", help="Which model flyswot should use for making predictions"
    ),
    model_path: str = None,
):
    """Predicts against all images stored under DIRECTORY which match PATTERN in the filename.

    By default searches for filenames containing 'fs'.

    Creates a CSV report saved to `csv_save_dir`
    """
    start_time = time.perf_counter()
    model_dir = models.ensure_model_dir()
    model = models.ensure_model(model_dir)
    # if model_name != "latest" and not model_path:
    #     model_parts = models._get_model_parts(Path(model_dir / Path(model_name)))
    # if model_name != "latest" and model_path:
    #     model_parts = models._get_model_parts(Path(model_path / Path(model_name)))
    onnxinference = OnnxInferenceSession(model.model, model.vocab)
    files = sorted(core.get_image_files_from_pattern(directory, pattern, image_format))
    check_files(files, pattern, directory)
    typer.echo(f"Found {len(files)} files matching {pattern} in {directory}")
    csv_fname = create_csv_fname(csv_save_dir)
    with typer.progressbar(length=len(files)) as progress:
        images_checked = 0
        for i, batch in enumerate(itertoolz.partition_all(bs, files)):
            batch_predictions = onnxinference.predict_batch(batch, bs)
            if i == 0:  # pragma: no cover
                create_csv_header(batch_predictions, csv_fname)
            write_batch_preds_to_csv(batch_predictions, csv_fname)
            progress.update(len(batch))
            images_checked += len(batch)
    delta = timedelta(seconds=time.perf_counter() - start_time)
    print_inference_summary(
        str(delta), pattern, directory, csv_fname, image_format, images_checked
    )
def mapreduce(chunk_size):
    """ A long running task which splits up the input data to many workers """
    # create some sample data for our summation function
    data = []
    for i in range(10000):
        x = []
        for j in range(random.randrange(10) + 5):
            x.append(random.randrange(10000))
        data.append(x)

    # break up our data into chunks and create a dynamic list of workers
    maps = (map.s(x) for x in partition_all(chunk_size, data))
    mapreducer = chord(maps)(reduce.s())

    return {'chord_id': mapreducer.id}
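A sketch of Celery task definitions that the chord above could be wiring together. The broker URL and the task bodies are assumptions; the snippet only shows that tasks named `map` and `reduce` exist, deliberately shadowing the builtins so that `map.s(x)` and `reduce.s()` resolve as written.

from celery import Celery

app = Celery('mapreduce', broker='redis://localhost:6379/0')  # broker URL is an assumption

@app.task
def map(chunk):
    # one partial sum per inner list in this chunk
    return [sum(x) for x in chunk]

@app.task
def reduce(partials):
    # the chord callback receives the list of all map results
    return sum(sum(p) for p in partials)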
def load_data(self, url: str) -> Iterable:
    data = []
    logger.debug('Getting hidden data from: {}'.format(url))
    d = pq(url=url)
    headers = (pq(e).text() for e in d('table.striped-table th'))
    headers = [slugify(h, separator="_") for h in headers]
    # table data
    cells = d('table.striped-table td')
    for grp in partition_all(len(headers), cells):
        values = [pq(i).text() for i in grp]
        data.append(dict(zip(headers, values)))
    return data
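The same regrouping idiom in isolation, with plain lists standing in for the scraped header and cell nodes: `partition_all(len(headers), cells)` turns a flat stream of table cells back into rows.

from toolz import partition_all

headers = ['name', 'price', 'qty']
cells = ['ACME', '9.99', '3', 'Globex', '4.50', '12']  # flat list of <td> texts

rows = [dict(zip(headers, grp)) for grp in partition_all(len(headers), cells)]
# [{'name': 'ACME', 'price': '9.99', 'qty': '3'},
#  {'name': 'Globex', 'price': '4.50', 'qty': '12'}]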
def fetch_quote(codes, is_index=False, n=800):
    """Fetch quotes for a list of stock or index codes.

    Args:
        codes (list-like): List of codes.
        is_index (bool, optional): Whether the codes are index codes. Defaults to False.
        n (int, optional): Number of codes per request batch. Defaults to 800.

    Returns:
        list of dictionary: List of quote dictionaries.
    """
    url_fmt = 'http://api.money.126.net/data/feed/{}'
    codes = ensure_list(codes)
    b_codes = partition_all(n, codes)
    urls = [url_fmt.format(','.join([_query_code(code, is_index) for code in batch]))
            for batch in b_codes]
    with ThreadPoolExecutor(MAX_WORKER) as excutor:
        docs = excutor.map(_fetch_quote, urls)
        return concat(docs)
def _split(dump, partition_size, iterable, key=None, reverse=False):
    """
    Split iterable into a number of sorted partitions of size partition_size
    (the last partition may have fewer items) and serialize using the dump
    callable.

    :param dump: Callable which takes an iterable and serializes to some
        external source, returning an iterable of ids which can be used to
        reload the externalized partitions.
    :param partition_size: The number of items to place in each partition.
    :param iterable: The iterable to split into sorted partitions.
    :param key: Callable which is used to retrieve the field to sort by.
    :param reverse: If set to ``True``, then the list elements are sorted as
        if each comparison were reversed.
    :return: iterable of the ids which can be used to reload the externalized
        partitions.
    """
    sort_by_key_and_maybe_reverse = partial(sorted, key=key, reverse=reverse)
    partitioned = partition_all(partition_size, iterable)
    dump_sorted = compose(dump, sort_by_key_and_maybe_reverse)
    return [dump_sorted(x) for x in partitioned]
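A sketch of how `_split` could pair with `_merge` (whose behavior is only inferred from the test further up: a loader callable plus partition ids) to do an external merge sort. The dump/load pair, the temp-file ids, and the sample data are assumptions, not part of the library.

import pickle
import tempfile

# Hypothetical dump/load pair: each sorted partition is pickled to its own
# temporary file and identified by its path.
def dump_to_tempfile(items):
    with tempfile.NamedTemporaryFile(delete=False) as f:
        pickle.dump(list(items), f)
        return f.name

def load_partition(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

data = [5, 3, 8, 1, 9, 2, 7]
ids = _split(dump_to_tempfile, partition_size=3, iterable=data)
merged = _merge(load_partition, ids)   # assumed to yield 1, 2, 3, 5, 7, 8, 9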
def main():
    # parameters
    data_dir = '../input/'
    bert_model = '../bert-large-wwm-uncased'  # replace this with the path to your own large WWM model
    # bert_model = 'bert-base-uncased'
    # bert_model = './oldtoxic'  # pretrained weights fine-tuned on the old Toxic data; download: https://www.kaggle.com/qinhui1999/old-toxic-bert-v2
    task_name = 'MyPro'
    output_dir = 'checkpoints/'
    model_save_pth = 'checkpoints/bert_large_wwm.pth'
    max_seq_length = 220
    do_train = True
    do_eval = True
    do_lower_case = True
    train_batch_size = 56
    eval_batch_size = 200
    learning_rate = 1e-5
    num_train_epochs = 1
    warmup_proportion = 0.05
    no_cuda = False
    local_rank = -1
    seed = 42
    gradient_accumulation_steps = 8
    optimize_on_cpu = False
    fp16 = False
    save_checkpoints_steps = 50000
    loss_scale = 128

    # Processor that prepares model inputs; the ones on GitHub are likely all for English.
    processors = {'mypro': MyPro}

    if local_rank == -1 or no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend='nccl')
        if fp16:
            logger.info(
                "16-bits training currently not supported in distributed training"
            )
            fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(local_rank != -1))

    if gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(gradient_accumulation_steps))

    train_batch_size = int(train_batch_size / gradient_accumulation_steps)

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    if not do_train and not do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(output_dir) and os.listdir(output_dir):
        # raise ValueError("Output directory ({}) already exists and is not empty.".format(output_dir))
        print('The checkpoint directory already exists...')
    else:
        os.makedirs(output_dir, exist_ok=True)

    task_name = task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(bert_model,
                                              do_lower_case=do_lower_case)
    # print("tokenizer", tokenizer)
    train_examples = None
    num_train_steps = None
    if do_train:
        train_examples = processor.get_train_examples(data_dir)
        num_train_steps = int(
            len(train_examples) / train_batch_size /
            gradient_accumulation_steps * num_train_epochs)

    # Prepare model
    # model = BertForSequenceClassification.from_pretrained(bert_model, num_labels=2,
    #     cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(local_rank))
    model = ToxicModel(bert_model, device)
    # You can unfreeze the last layer of bert by calling set_trainable(model.bert.encoder.layer[23], True)
    # set_trainable(model.bert, False)
    # Freeze the embedding layer:
    # set_trainable(model.bert.embeddings, False)
    # set_trainable(model.bert.encoder.layer[11], True)
    # set_trainable(model.head, True)
    # model.load_state_dict(torch.load('checkpoints/bert_classification_2epoch.pth')['state_dict'])
    if fp16:
        model.half()
    model.to(device)
    if local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if fp16:
        param_optimizer = [
            (n, param.clone().detach().to('cpu').float().requires_grad_())
            for n, param in model.named_parameters()
        ]
    elif optimize_on_cpu:
        param_optimizer = [(n,
                            param.clone().detach().to('cpu').requires_grad_())
                           for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0
    }]
    t_total = num_train_steps
    if local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=t_total)

    global_step = 0
    if do_train:
        if os.path.exists('train.token_new_cleaned_wwm.npy'):
            train_features = np.load('train.token_new_cleaned_wwm.npy',
                                     allow_pickle=True)
        else:
            parallel = Parallel(300, backend="multiprocessing", verbose=5)
            train_features = list(
                concatv(*parallel(
                    delayed(convert_examples_to_features)(
                        example, label_list, max_seq_length, tokenizer)
                    for example in list(partition_all(300, train_examples)))))
            train_features = np.asarray(train_features)
            np.save('train.token_new_cleaned_wwm', train_features)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        torch.cuda.empty_cache()
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        print('y_aux', np.asarray([f.y_aux for f in train_features]).shape)
        all_label_ids = torch.tensor(np.hstack([
            np.asarray([f.label_id for f in train_features]),
            np.asarray([f.y_aux for f in train_features])
        ]), dtype=torch.float32)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(
            train_data,
            sampler=train_sampler,
            batch_size=train_batch_size,
            num_workers=2,
            pin_memory=True,
        )
        # model.load_state_dict(torch.load('checkpoints/bert_large_wwm.pth')['state_dict'])
        # model.load_state_dict(torch.load('checkpoints/0_80000_iterations.pth')['state_dict'])
        model.train()
        best_score = 0
        flags = 0
        torch.cuda.empty_cache()
        '''
        model.load_state_dict(torch.load('checkpoints/0_20000_iterations.pth')['model'])
        optimizer.load_state_dict(torch.load('checkpoints/0_20000_iterations.pth')['optimizer'])
        old_iter = int(torch.load('checkpoints/0_20000_iterations.pth')['iteration'])
        '''
        old_iter = -1
        for i_epoch in trange(int(num_train_epochs), desc="Epoch"):
            torch.cuda.empty_cache()
            iteration = 0  # counter
            save_point = save_checkpoints_steps  # 10000
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                if iteration <= old_iter:
                    iteration += 1
                    continue
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                torch.cuda.empty_cache()
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if fp16 and loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * loss_scale
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps
                loss.backward()

                if (step + 1) % gradient_accumulation_steps == 0:
                    if fp16 or optimize_on_cpu:
                        if fp16 and loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                if param.grad is not None:
                                    param.grad.data = param.grad.data / loss_scale
                        is_nan = set_optimizer_params_grad(
                            param_optimizer,
                            model.named_parameters(),
                            test_nan=True)
                        if is_nan:
                            logger.info(
                                "FP16 TRAINING: Nan in gradients, reducing loss scaling"
                            )
                            loss_scale = loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(
                            model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()

                # Save model
                if iteration % save_point == 0 and iteration > 0:
                    checkpoint = {
                        'iteration': iteration,
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict()
                    }
                    checkpoint_path = os.path.join(
                        output_dir,
                        '{}_{}_iterations.pth'.format(i_epoch, iteration))
                    torch.save(checkpoint, checkpoint_path)
                    logging.info('Model saved to {}'.format(checkpoint_path))
                    val(model, processor, data_dir, max_seq_length,
                        eval_batch_size, label_list, tokenizer, device)
                iteration += 1

        checkpoint = {
            'state_dict': model.state_dict(),
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        torch.save(checkpoint, model_save_pth)

    val(model, processor, data_dir, max_seq_length, eval_batch_size,
        label_list, tokenizer, device)
    test(model, processor, data_dir, max_seq_length, eval_batch_size,
         label_list, tokenizer, device)
def train_model(revision_texts, matches_dict):
    """
    Apply the initial model to raw examples. You'll want to experiment with
    finding a good number of revision texts. It can also help to filter out
    some data.
    """
    revision_data = create_revision_data(revision_texts)
    nlp_training = spacy.load(
        "D:/Ananth/Allstate/spacy/models/en_core_web_sm-2.0.0/en_core_web_sm/en_core_web_sm-2.0.0",
        disable=['ner'])
    # nlp_training.entity.add_label(LABEL)
    for key, value in matches_dict.items():
        # disable ner for training data
        doc = nlp_training(key)
        n = len(doc)
        tags = [None] * n
        heads = [None] * n
        deps = [None] * n
        # tags = [w.tag_ for w in doc]
        # heads = [w.head.i for w in doc]
        # deps = [w.dep_ for w in doc]
        losses = {}
        entities = [(e[1], e[2], LABEL) for e in value]
        training_data.append((doc, GoldParse(doc, tags=tags, heads=heads,
                                             deps=deps, entities=entities)))

    # delete the training model that was loaded
    print('deleting nlp_training model....(2)')
    del nlp_training
    # print(revision_data)

    n_epoch = 5
    batch_size = 120
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for i in range(n_epoch):
            examples = revision_data + training_data
            # examples = training_data
            losses = {}
            random.shuffle(examples)
            for batch in itertoolz.partition_all(batch_size, examples):
                docs, golds = zip(*batch)
                print('progress... training batch:', i + 1 * batch_size)
                # print(batch)
                # Recreate the doc to avoid a bug in spaCy's training module; do this
                # only for custom NER Doc objects. For revision-text Docs keep the
                # original doc so the original NERs remain.
                docs_modified = []
                for doc in docs:
                    if doc.user_data != 'generic':
                        doc = nlp.make_doc(doc.text)
                    docs_modified.append(doc)
                nlp.update(docs_modified, golds, sgd=optimizer, drop=0.35,
                           losses=losses)
    print('training completed... losses:', losses)

    # test the trained model
    test_text = ('What are different Product Type that comes after Conviction date '
                 'or occurence data of an endorsement insurance? This is New york')
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to a directory
    output_dir = Path(OUT_DIR)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.meta['name'] = NEW_MODEL_NAME  # rename model
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # test the saved model
    print("Loading from", output_dir)
    nlp2 = spacy.load(output_dir)
    doc2 = nlp2(test_text)
    for ent in doc2.ents:
        print(ent.label_, ent.text)
def create_db(vcf_path, db_path, rsmerge_path, snphistory_path, dbsnp_build,
              chunksize=500000):
    print("Parsing histories...")
    history = MergeHistory(rsmerge_path, snphistory_path, dbsnp_build)

    print("Creating database...")
    con = sqlite3.connect(db_path)
    con.execute("PRAGMA SYNCHRONOUS=OFF")
    con.execute("PRAGMA TEMP_STORE=MEMORY")
    con.execute("PRAGMA PAGE_SIZE=4096")
    con.execute("PRAGMA CACHE_SIZE=500000")
    con.execute("DROP TABLE IF EXISTS dbsnp")
    con.execute(
        "CREATE TABLE dbsnp (rsid TEXT, chrpos TEXT, epacts TEXT, chrom TEXT, pos INTEGER, ref TEXT, alt TEXT)"
    )
    con.execute("DROP TABLE IF EXISTS trans")
    con.execute("CREATE TABLE trans (rs_orig TEXT, rs_current TEXT)")

    with gzip.open(vcf_path, "rt") as vcf, con:
        print("Parsing VCF: %s" % vcf_path)
        for chunk in partition_all(chunksize, vcf):
            processed = []
            for line in chunk:
                if line.startswith("#"):
                    continue

                chrom, pos, vid, ref, alt = line.split("\t")[0:5]
                rsid = vid.split(":")[0]
                epacts = "{}:{}_{}/{}".format(chrom, pos, ref, alt)

                if not rsid.startswith("rs"):
                    continue

                rsid_trans = history.find_current(rsid)
                chrpos = "{}:{}".format(chrom, pos)
                use_rsid = rsid if rsid_trans is None else rsid_trans

                data = (use_rsid, chrpos, epacts, chrom, pos, ref, alt)
                processed.append(data)

            con.executemany(
                "INSERT INTO dbsnp (rsid,chrpos,epacts,chrom,pos,ref,alt) VALUES (?,?,?,?,?,?,?)",
                processed)

        # Create indexes for important columns
        print("Creating indexes...")
        indexes = [
            "CREATE INDEX idx_snp ON dbsnp (rsid)",
            "CREATE INDEX idx_chrpos ON dbsnp (chrpos)",
            "CREATE INDEX idx_chrom_and_pos ON dbsnp (chrom,pos)",
            "CREATE INDEX idx_epacts ON dbsnp (epacts)"
        ]
        for idx in indexes:
            print(idx)
            con.execute(idx)

        # Create SNP translation table
        print("Creating SNP translation table..")
        for chunk in partition_all(chunksize, history.iter_nodes()):
            rows = []
            for node in chunk:
                # Only want the "sink" nodes - the most recent rsids at the bottom of the tree
                if node.child is None:
                    parents = history.find_all_parents(node)
                    for p in parents:
                        rows.append([p, node.rsid])

            con.executemany(
                "INSERT INTO trans (rs_orig,rs_current) VALUES (?,?)", rows)

        print("Creating indexes...")
        indexes = [
            "CREATE INDEX idx_rs_orig ON trans (rs_orig)",
            "CREATE INDEX idx_rs_current ON trans (rs_current)"
        ]
        for idx in indexes:
            print(idx)
            con.execute(idx)
#!/usr/bin/env python
import os
import os.path
import shutil

from toolz.itertoolz import partition_all

prefix = "sub"
size = 100

for idx, part in enumerate(partition_all(size, filter(os.path.isfile, os.listdir('.')))):
    target = "%s-%03d" % (prefix, idx)
    os.mkdir(target)
    for f in part:
        shutil.move(f, os.path.join(target, f))
import argparse
import os
import sys

from toolz import itertoolz

'''
Program #1

Input:  cat myfiles.txt | python split_stdin_to_files.py 1000
Output: Split into N files where each file is M filenames long, where M is the
        script parameter. The last file holds the remainder.
'''

parser = argparse.ArgumentParser(
    description='Split file lines into multiple files. The input argument sets '
                'the number of filenames written to each output file.')
parser.add_argument('num_filenames_per_file', metavar='num_filenames_per_file',
                    type=int, help="Number of file names per file.")
args = parser.parse_args()

filenames = sys.stdin.readlines()
files_list = itertoolz.partition_all(args.num_filenames_per_file, filenames)

os.makedirs('./output', exist_ok=True)  # make sure the output directory exists
for index, files in enumerate(files_list, start=0):
    outfilename = f"./output/{index}"
    with open(outfilename, 'w') as outfile:
        outfile.writelines(files)
def generate_data(elements_count, chunk_size):
    # Generate input data
    data = [(chunk, random.randrange(10000)) for chunk in range(elements_count)]
    data = partition_all(chunk_size, data)
    return data
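A quick look at what this returns, assuming the function above is in scope. `partition_all` is lazy, so the chunks only materialize when consumed.

chunks = list(generate_data(elements_count=5, chunk_size=2))
# Three chunks of (index, value) pairs, the last holding the remainder, e.g.
# [((0, v0), (1, v1)), ((2, v2), (3, v3)), ((4, v4),)]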