def test_partition_all():
    assert list(partition_all(2, [1, 2, 3, 4])) == [(1, 2), (3, 4)]
    assert list(partition_all(3, range(5))) == [(0, 1, 2), (3, 4)]
    assert list(partition_all(2, [])) == []

    # Regression test: https://github.com/pytoolz/toolz/issues/387
    class NoCompare(object):
        def __eq__(self, other):
            if self.__class__ == other.__class__:
                return True
            raise ValueError()
    obj = NoCompare()
    result = [(obj, obj, obj, obj), (obj, obj, obj)]
    assert list(partition_all(4, [obj]*7)) == result
    assert list(partition_all(4, iter([obj]*7))) == result
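The behaviour under test, in isolation (a quick sketch): partition_all chunks an iterable into tuples of at most n items, works lazily, and puts any remainder in a shorter final tuple.

from toolz import partition_all

it = partition_all(2, iter(range(5)))   # accepts plain iterators
assert next(it) == (0, 1)               # evaluated lazily, one chunk at a time
assert list(it) == [(2, 3), (4,)]       # the final tuple holds the remainder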
Example #2
def fetch_quotes(stock_codes):
    """
    获取股票列表的分时报价

    Parameters
    ----------
    stock_codes : list
        股票代码列表

    Returns
    -------
    res : DataFrame
        行数 = len(stock_codes)   
        33列   

    Example
    -------
    >>> df = fetch_quotes(['000001','000002'])
    >>> df.iloc[:,:8] 
        股票代码  股票简称      开盘     前收盘      现价      最高      最低     竞买价
    0  000001  平安银行  11.040  11.050  10.900  11.050  10.880  10.900
    1  000002  万 科A  33.700  34.160  33.290  33.990  33.170  33.290
    """
    stock_codes = ensure_list(stock_codes)
    length = 800  # request quotes in batches of 800 codes per URL
    url_fmt = 'http://hq.sinajs.cn/list={}'
    dfs = []
    for p_codes in partition_all(length, stock_codes):
        url = url_fmt.format(','.join(map(_add_prefix, p_codes)))
        content = get_page_response(url).text
        dfs.append(_to_dataframe(content, p_codes))
    return pd.concat(dfs).sort_values('股票代码')
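The batching pattern in isolation (a simplified sketch: requests.get and a plain join stand in for the module's get_page_response and _add_prefix helpers):

import requests
from toolz import partition_all

def fetch_raw_quotes(codes, batch_size=800):
    # one GET per batch keeps each URL's code list to at most batch_size entries
    for batch in partition_all(batch_size, codes):
        url = 'http://hq.sinajs.cn/list=' + ','.join(batch)
        yield requests.get(url).text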
Example #3
def _(xs):
    try:
        # peek raises StopIteration on an empty iterator; otherwise it returns the
        # first element plus an equivalent iterator with nothing consumed.
        _, xs = peek(xs)
        for chunk in partition_all(max_records_per_trie, xs):
            yield _dawg_with_bounds(chunk)
    except StopIteration:
        pass
Example #4
def mapreduce(chunk_size):
    """ A long running task which splits up the input data to many workers """
    # create some sample data for our summation function
    data = []
    for i in range(10000):
        x = []
        for j in range(random.randrange(10) + 5):
            x.append(random.randrange(10000))
        data.append(x)

    # break up our data into chunks and create a dynamic list of workers
    print('preparing map')
    q = Queue('mapreduce_c', connection=StrictRedis())
    chunk_jobs = []
    for chunk in partition_all(chunk_size, data):
        chunk_jobs.append(
            q.enqueue_call(func=_map, args=(chunk,)))
    print('running map')
    while not all(job.is_finished for job in chunk_jobs):
        pass  # busy-wait until every map job has finished
    print('preparing reduce')
    reduce_job = q.enqueue_call(
        func=_reduce, args=(tuple(job.result for job in chunk_jobs)))
    print('running reduce')
    while not reduce_job.is_finished:
        pass
    print('fin')
    return reduce_job.result
Example #5
def fold(binop, seq, default=no_default, map=map, chunksize=128, combine=None):
    """
    Reduce without guarantee of ordered reduction.

    inputs:

    ``binop``     - associative operator. The associative property allows us to
                    leverage a parallel map to perform reductions in parallel.
    ``seq``       - a sequence to be aggregated
    ``default``   - an identity element like 0 for ``add`` or 1 for ``mul``

    ``map``       - an implementation of ``map``. This may be parallel and
                    determines how work is distributed.
    ``chunksize`` - Number of elements of ``seq`` that should be handled
                    within a single function call
    ``combine``   - Binary operator to combine two intermediate results.
                    If ``binop`` is of type (total, item) -> total
                    then ``combine`` is of type (total, total) -> total
                    Defaults to ``binop`` for the common case of operators like ``add``

    Fold chunks up the collection into blocks of size ``chunksize`` and then
    feeds each of these to calls to ``reduce``. This work is distributed
    with a call to ``map``, gathered back and then refolded to finish the
    computation. In this way ``fold`` specifies only how to chunk up data but
    leaves the distribution of this work to an externally provided ``map``
    function. This function can be sequential or rely on multithreading,
    multiprocessing, or even distributed solutions.

    If ``map`` intends to serialize functions it should be prepared to accept
    and serialize lambdas. Note that the standard ``pickle`` module fails
    here.

    Example
    -------

    >>> # Provide a parallel map to accomplish a parallel sum
    >>> from operator import add
    >>> fold(add, [1, 2, 3, 4], chunksize=2, map=map)
    10
    """
    assert chunksize > 1

    if combine is None:
        combine = binop

    chunks = partition_all(chunksize, seq)

    # Evaluate sequence in chunks via map
    if default is no_default:
        results = map(functools.partial(_reduce, binop), chunks)
    else:
        results = map(functools.partial(_reduce, binop, initial=default),
                      chunks)

    results = list(results)  # TODO: Support complete laziness

    if len(results) == 1:  # Return completed result
        return results[0]
    else:  # Recurse to reaggregate intermediate results
        return fold(combine, results, map=map, chunksize=chunksize)
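For example, a pool's map can be dropped in to parallelize the chunk reductions (a minimal sketch assuming the fold above is importable; a thread pool sidesteps the pickling caveat mentioned in the docstring):

from multiprocessing.pool import ThreadPool
from operator import add

with ThreadPool(4) as pool:
    total = fold(add, range(10000), default=0, map=pool.map, chunksize=128)
assert total == sum(range(10000))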
Example #6
def predict_files(files: List[Path], inference_session, bs,
                  csv_fname) -> Tuple[set, int]:
    """Predict files"""
    with Progress() as progress:
        total_progress = progress.add_task("prediction progress",
                                           total=len(files))
        images_checked = 0
        bad_batch_files = []
        for i, batch in enumerate(itertoolz.partition_all(bs, files)):
            batch_predictions, bad_batch = try_predict_batch(
                batch, inference_session, bs)
            if bad_batch:
                bad_batch_files.append(batch)
            if i == 0 and not bad_batch:
                create_csv_header(batch_predictions, csv_fname)
            if not bad_batch:
                write_batch_preds_to_csv(batch_predictions, csv_fname)
            progress.update(total_progress, advance=len(batch))
            images_checked += len(batch)
        corrupt_images = set()
        if bad_batch_files:
            for batch in bad_batch_files:
                for file in batch:
                    try:
                        batch_predictions = inference_session.predict_batch(
                            [file], bs)
                        write_batch_preds_to_csv(batch_predictions, csv_fname)
                    except PIL.UnidentifiedImageError:
                        corrupt_images.add(file)
        return corrupt_images, images_checked
Example #7
File: test.py Project: keshava/dlsi
def _generate_batches(h, w, ps, patch_size, stride, batch_size=64):
    hdc_wdx_generator = itertools.product(
        range(0, h - patch_size + ps, stride),
        range(0, w - patch_size + ps, stride),
    )
    for batch_indexes in itertoolz.partition_all(batch_size,
                                                 hdc_wdx_generator):
        yield batch_indexes
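A quick usage sketch (the sizes below are illustrative, not taken from the project):

batches = list(_generate_batches(h=256, w=256, ps=0, patch_size=64, stride=32, batch_size=8))
# each element is a tuple of up to 8 (row_offset, col_offset) index pairs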
Example #8
def fold(binop, seq, default=no_default, map=map, chunksize=128, combine=None):
    """
    Reduce without guarantee of ordered reduction.

    inputs:

    ``binop``     - associative operator. The associative property allows us to
                    leverage a parallel map to perform reductions in parallel.
    ``seq``       - a sequence to be aggregated
    ``default``   - an identity element like 0 for ``add`` or 1 for ``mul``

    ``map``       - an implementation of ``map``. This may be parallel and
                    determines how work is distributed.
    ``chunksize`` - Number of elements of ``seq`` that should be handled
                    within a single function call
    ``combine``   - Binary operator to combine two intermediate results.
                    If ``binop`` is of type (total, item) -> total
                    then ``combine`` is of type (total, total) -> total
                    Defaults to ``binop`` for the common case of operators like ``add``

    Fold chunks up the collection into blocks of size ``chunksize`` and then
    feeds each of these to calls to ``reduce``. This work is distributed
    with a call to ``map``, gathered back and then refolded to finish the
    computation. In this way ``fold`` specifies only how to chunk up data but
    leaves the distribution of this work to an externally provided ``map``
    function. This function can be sequential or rely on multithreading,
    multiprocessing, or even distributed solutions.

    If ``map`` intends to serialize functions it should be prepared to accept
    and serialize lambdas. Note that the standard ``pickle`` module fails
    here.

    Example
    -------

    >>> # Provide a parallel map to accomplish a parallel sum
    >>> from operator import add
    >>> fold(add, [1, 2, 3, 4], chunksize=2, map=map)
    10
    """
    if combine is None:
        combine = binop

    chunks = partition_all(chunksize, seq)

    # Evaluate sequence in chunks via map
    if default is no_default:
        results = map(lambda chunk: reduce(binop, chunk), chunks)
    else:
        results = map(lambda chunk: reduce(binop, chunk, default), chunks)

    results = list(results)  # TODO: Support complete laziness

    if len(results) == 1:    # Return completed result
        return results[0]
    else:                    # Recurse to reaggregate intermediate results
        return fold(combine, results, map=map, chunksize=chunksize)
Example #9
async def fetch_all(batch_num=800):
    """获取所有股票实时报价原始数据"""
    db = get_db()
    collection = db['指数列表']
    stock_codes = collection.find_one()['codes']
    b_codes = partition_all(batch_num, stock_codes)
    tasks = [to_dataframe(codes) for codes in b_codes]
    dfs = await asyncio.gather(*tasks)
    return pd.concat(dfs)
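Driving the coroutine is then just the following (a sketch; it assumes the module's get_db and to_dataframe helpers are configured):

import asyncio

df = asyncio.run(fetch_all(batch_num=800))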
Example #10
def test_merge(partition_size, num_items):
    """
    Verify that _merge correctly merges batches into one sorted iterable.
    """
    items = range(num_items)
    partitions = list(partition_all(partition_size, items))
    partition_ids = range(len(partitions))
    random.shuffle(partitions)
    merged = _merge(lambda x: partitions[x], partition_ids)
    assert list(merged) == list(items)
Example #11
def predict_directory(
    directory: Path = typer.Argument(
        ...,
        readable=True,
        resolve_path=True,
        help="Directory to start searching for images from",
    ),
    csv_save_dir: Path = typer.Argument(
        ...,
        writable=True,
        resolve_path=True,
        help="Directory used to store the csv report",
    ),
    pattern: str = typer.Option("fs", help="Pattern used to filter image filenames"),
    bs: int = typer.Option(16, help="Batch Size"),
    image_format: str = typer.Option(
        ".tif",
        help="Image format for flyswot to use for predictions, defaults to `*.tif`",
    ),
    model_name: str = typer.Option(
        "latest", help="Which model flyswot should use for making predictions"
    ),
    model_path: str = None,
):
    """Predicts against all images stored under DIRECTORY which match PATTERN in the filename.

    By default searches for filenames containing 'fs'.

    Creates a CSV report saved to `csv_save_dir`
    """
    start_time = time.perf_counter()
    model_dir = models.ensure_model_dir()
    model = models.ensure_model(model_dir)
    # if model_name != "latest" and not model_path:
    #     model_parts = models._get_model_parts(Path(model_dir / Path(model_name)))
    # if model_name != "latest" and model_path:
    #     model_parts = models._get_model_parts(Path(model_path / Path(model_name)))
    onnxinference = OnnxInferenceSession(model.model, model.vocab)
    files = sorted(core.get_image_files_from_pattern(directory, pattern, image_format))
    check_files(files, pattern, directory)
    typer.echo(f"Found {len(files)} files matching {pattern} in {directory}")
    csv_fname = create_csv_fname(csv_save_dir)
    with typer.progressbar(length=len(files)) as progress:
        images_checked = 0
        for i, batch in enumerate(itertoolz.partition_all(bs, files)):
            batch_predictions = onnxinference.predict_batch(batch, bs)
            if i == 0:  # pragma: no cover
                create_csv_header(batch_predictions, csv_fname)
            write_batch_preds_to_csv(batch_predictions, csv_fname)
            progress.update(len(batch))
            images_checked += len(batch)
    delta = timedelta(seconds=time.perf_counter() - start_time)
    print_inference_summary(
        str(delta), pattern, directory, csv_fname, image_format, images_checked
    )
Example #12
def mapreduce(chunk_size):
    """ A long running task which splits up the input data to many workers """
    # create some sample data for our summation function
    data = []
    for i in range(10000):
        x = []
        for j in range(random.randrange(10) + 5):
            x.append(random.randrange(10000))
        data.append(x)

    # break up our data into chunks and create a dynamic list of workers
    maps = (map.s(x) for x in partition_all(chunk_size, data))
    # chord() runs the reduce callback once every map task in the header has finished
    mapreducer = chord(maps)(reduce.s())
    return {'chord_id': mapreducer.id}
Example #13
    def load_data(self, url: str) -> Iterable:
        data = []
        logger.debug('Getting hidden data from: {}'.format(url))

        d = pq(url=url)
        headers = (pq(e).text() for e in d('table.striped-table th'))
        headers = [slugify(h, separator="_") for h in headers]

        # table data
        cells = d('table.striped-table td')
        for grp in partition_all(len(headers), cells):
            values = [pq(i).text() for i in grp]
            data.append(dict(zip(headers, values)))

        return data
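The same row-grouping idea in isolation (illustrative data, not taken from the scraped page):

from toolz import partition_all

headers = ['name', 'qty']
cells = ['ants', '3', 'bees', '7']
rows = [dict(zip(headers, grp)) for grp in partition_all(len(headers), cells)]
assert rows == [{'name': 'ants', 'qty': '3'}, {'name': 'bees', 'qty': '7'}]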
Example #14
def fetch_quote(codes, is_index=False, n=800):
    """股票代码或指数列表报价.

    Args:
        codes (list-like): 代码列表
        is_index (bool, optional): 是否为指数代码. Defaults to False.
        n (int, optional): 每批请求代码数量. Defaults to 800.

    Returns:
        list of dictionary: 报价列表字典
    """
    url_fmt = 'http://api.money.126.net/data/feed/{}'
    codes = ensure_list(codes)
    b_codes = partition_all(n, codes)
    urls = [url_fmt.format(','.join([_query_code(code, is_index)
                                     for code in batch])) for batch in b_codes]
    with ThreadPoolExecutor(MAX_WORKER) as executor:
        docs = executor.map(_fetch_quote, urls)
        return concat(docs)
Example #15
def _split(dump, partition_size, iterable, key=None, reverse=False):
    """
    Split iterable into a number of sorted partitions of size partition_size (the last partition
    may have fewer items) and serialize using the dump callable.

    :param dump:            Callable which takes an iterable and serializes to some external
                            source, returning an iterable of ids which can be used to reload the
                            externalized partitions.

    :param partition_size:  The number of items to place in each partition.

    :param iterable:        The iterable to split into sorted partitions.

    :param key:             Callable which is used to retrieve the field to sort by.

    :param reverse:         If set to ``True``, then the list elements are sorted as if each
                            comparison were reversed.

    :return: iterable of the ids which can be used to reload the externalized partitions.
    """
    sort_by_key_and_maybe_reverse = partial(sorted, key=key, reverse=reverse)
    partitioned = partition_all(partition_size, iterable)
    dump_sorted = compose(dump, sort_by_key_and_maybe_reverse)
    return [dump_sorted(x) for x in partitioned]
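A minimal sketch of a dump callable that could be handed to _split (a hypothetical file-backed store, not the project's own serializer):

import json
import tempfile

def dump_to_tempfile(sorted_items):
    # serialize one sorted partition and return an id (here, the file path)
    # that can be used later to reload it
    with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as f:
        json.dump(list(sorted_items), f)
        return f.name

# ids = _split(dump_to_tempfile, 1000, records, key=lambda r: r['timestamp'])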
Example #16
def test_partition_all():
    assert list(partition_all(2, [1, 2, 3, 4])) == [(1, 2), (3, 4)]
    assert list(partition_all(3, range(5))) == [(0, 1, 2), (3, 4)]
    assert list(partition_all(2, [])) == []
Example #17
def main():
    # parameters
    data_dir = '../input/'
    bert_model = '../bert-large-wwm-uncased'  # replace this with the path to your own large-wwm model
    #     bert_model = 'bert-base-uncased'
    #     bert_model = './oldtoxic'  # weights pretrained on the old Toxic data; download: https://www.kaggle.com/qinhui1999/old-toxic-bert-v2
    task_name = 'MyPro'
    output_dir = 'checkpoints/'
    model_save_pth = 'checkpoints/bert_large_wwm.pth'
    max_seq_length = 220
    do_train = True
    do_eval = True
    do_lower_case = True
    train_batch_size = 56
    eval_batch_size = 200
    learning_rate = 1e-5
    num_train_epochs = 1
    warmup_proportion = 0.05
    no_cuda = False
    local_rank = -1
    seed = 42
    gradient_accumulation_steps = 8
    optimize_on_cpu = False
    fp16 = False
    save_checkpoints_steps = 50000
    loss_scale = 128

    # processor that prepares the model inputs; the processors found on GitHub mostly target English
    processors = {'mypro': MyPro}

    if local_rank == -1 or no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend='nccl')
        if fp16:
            logger.info(
                "16-bits training currently not supported in distributed training"
            )
            fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(local_rank != -1))

    if gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(gradient_accumulation_steps))

    train_batch_size = int(train_batch_size / gradient_accumulation_steps)

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    if not do_train and not do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(output_dir) and os.listdir(output_dir):
        # raise ValueError("Output directory ({}) already exists and is not empty.".format(output_dir))
        print('The checkpoint directory already exists...')
    else:
        os.makedirs(output_dir, exist_ok=True)

    task_name = task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(bert_model,
                                              do_lower_case=do_lower_case)
    # print("tokenizer",tokenizer)
    train_examples = None
    num_train_steps = None
    if do_train:
        train_examples = processor.get_train_examples(data_dir)
        num_train_steps = int(
            len(train_examples) / train_batch_size /
            gradient_accumulation_steps * num_train_epochs)

    # Prepare model
    # model = BertForSequenceClassification.from_pretrained(bert_model, num_labels=2,
    #             cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(local_rank))
    model = ToxicModel(bert_model, device)
    # You can unfreeze the last layer of bert by calling set_trainable(model.bert.encoder.layer[23], True)
    # set_trainable(model.bert, False)
    # freeze the embedding layer
    #     set_trainable(model.bert.embeddings, False)
    # set_trainable(model.bert.encoder.layer[11], True)
    # set_trainable(model.head, True)
    # model.load_state_dict(torch.load('checkpoints/bert_classification_2epoch.pth')['state_dict'])
    if fp16:
        model.half()
    model.to(device)
    if local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if fp16:
        param_optimizer = [
            (n, param.clone().detach().to('cpu').float().requires_grad_())
            for n, param in model.named_parameters()
        ]
    elif optimize_on_cpu:
        param_optimizer = [(n,
                            param.clone().detach().to('cpu').requires_grad_())
                           for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]
    t_total = num_train_steps
    if local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=t_total)

    global_step = 0
    if do_train:

        if os.path.exists('train.token_new_cleaned_wwm.npy'):
            train_features = np.load('train.token_new_cleaned_wwm.npy',
                                     allow_pickle=True)
        else:
            parallel = Parallel(300, backend="multiprocessing", verbose=5)
            # Convert examples to features in parallel: each worker gets a chunk
            # of up to 300 examples from partition_all, and concatv flattens the
            # per-chunk feature lists back into one sequence.
            train_features = list(
                concatv(*parallel(
                    delayed(convert_examples_to_features)(
                        example, label_list, max_seq_length, tokenizer)
                    for example in list(partition_all(300, train_examples)))))
            train_features = np.asarray(train_features)
            np.save('train.token_new_cleaned_wwm', train_features)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        torch.cuda.empty_cache()
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)

        print('y_aux', np.asarray([f.y_aux for f in train_features]).shape)
        all_label_ids = torch.tensor(np.hstack([
            np.asarray([f.label_id for f in train_features]),
            np.asarray([f.y_aux for f in train_features])
        ]),
                                     dtype=torch.float32)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(
            train_data,
            sampler=train_sampler,
            batch_size=train_batch_size,
            num_workers=2,
            pin_memory=True,
        )

        #model.load_state_dict(torch.load('checkpoints/bert_large_wwm.pth')['state_dict'])
        # model.load_state_dict(torch.load('checkpoints/0_80000_iterations.pth')['state_dict'])

        model.train()
        best_score = 0
        flags = 0
        torch.cuda.empty_cache()
        ''' 
        model.load_state_dict(torch.load('checkpoints/0_20000_iterations.pth')['model'])
        optimizer.load_state_dict(torch.load('checkpoints/0_20000_iterations.pth')['optimizer'])
        old_iter = int(torch.load('checkpoints/0_20000_iterations.pth')['iteration'])
        '''
        old_iter = -1

        for i_epoch in trange(int(num_train_epochs), desc="Epoch"):
            torch.cuda.empty_cache()
            iteration = 0  # counter
            save_point = save_checkpoints_steps  # save a checkpoint every save_checkpoints_steps iterations
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                if iteration <= old_iter:
                    iteration += 1
                    continue
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                torch.cuda.empty_cache()
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if fp16 and loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * loss_scale
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps
                loss.backward()

                if (step + 1) % gradient_accumulation_steps == 0:
                    if fp16 or optimize_on_cpu:
                        if fp16 and loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                if param.grad is not None:
                                    param.grad.data = param.grad.data / loss_scale
                        is_nan = set_optimizer_params_grad(
                            param_optimizer,
                            model.named_parameters(),
                            test_nan=True)
                        if is_nan:
                            logger.info(
                                "FP16 TRAINING: Nan in gradients, reducing loss scaling"
                            )
                            loss_scale = loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(
                            model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                #Save model
                if iteration % save_point == 0 and iteration > 0:
                    checkpoint = {
                        'iteration': iteration,
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict()
                    }

                    checkpoint_path = os.path.join(
                        output_dir,
                        '{}_{}_iterations.pth'.format(i_epoch, iteration))

                    torch.save(checkpoint, checkpoint_path)
                    logging.info('Model saved to {}'.format(checkpoint_path))
                    val(model, processor, data_dir, max_seq_length,
                        eval_batch_size, label_list, tokenizer, device)

                iteration += 1

    checkpoint = {
        'state_dict': model.state_dict(),
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }
    torch.save(checkpoint, model_save_pth)
    val(model, processor, data_dir, max_seq_length, eval_batch_size,
        label_list, tokenizer, device)

    test(model, processor, data_dir, max_seq_length, eval_batch_size,
         label_list, tokenizer, device)
Example #18
def train_model(revision_texts, matches_dict):
    """
    Apply the initial model to raw examples. You'll want to experiment
    with finding a good number of revision texts. It can also help to
    filter out some data.
    """
    revision_data = create_revision_data(revision_texts)
    nlp_training = spacy.load("D:/Ananth/Allstate/spacy/models/en_core_web_sm-2.0.0/en_core_web_sm/en_core_web_sm-2.0.0", disable=['ner'])
    #nlp_training.entity.add_label(LABEL)

    for key, value in matches_dict.items():
       # disable ner for training data
        doc = nlp_training(key)
        n = len(doc)
        tags = [None] * n
        heads = [None] * n
        deps = [None] * n
        # tags = [w.tag_ for w in doc]
        # heads = [w.head.i for w in doc]
        # deps = [w.dep_ for w in doc]
        losses = {}
        entities = [(e[1],e[2], LABEL) for e in value]
        training_data.append((doc, GoldParse(doc, tags=tags, heads=heads,
                                            deps=deps, entities=entities)))
    # delete the nlp model that was loaded only to build the training data
    print('deleting nlp_training model....(2)')
    del nlp_training
    #print(revision_data)
    n_epoch = 5
    batch_size = 120
    


    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()

        for i in range(n_epoch):
            examples = revision_data + training_data
            #examples = training_data
            losses = {}
            random.shuffle(examples)
            for batch_i, batch in enumerate(itertoolz.partition_all(batch_size, examples)):
                docs, golds = zip(*batch)
                print('progress... examples seen this epoch:', (batch_i + 1) * batch_size)
                #print(batch)
                # Recreate the doc to avoid a bug in spaCy's training module; do this
                # only for the custom NER Doc objects. Revision-text Doc objects keep
                # their original doc so the existing entities remain.
                docs_modified = []
                for doc in docs:
                    if doc.user_data != 'generic':
                        doc = nlp.make_doc(doc.text)
                    docs_modified.append(doc)
                nlp.update(docs_modified, golds, sgd=optimizer, drop=0.35, losses=losses)
    print('training completed... losses:', losses)

    # test the trained model
    test_text = 'What are different Product Type that comes after\
    Conviction date or occurence data of an endorsement insurance? This is New york'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to a directory
    output_dir = Path(OUT_DIR)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.meta['name'] = NEW_MODEL_NAME  # rename model
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # test the saved model
    print("Loading from", output_dir)
    nlp2 = spacy.load(output_dir)
    doc2 = nlp2(test_text)
    for ent in doc2.ents:
        print(ent.label_, ent.text)
Example #19
def create_db(vcf_path,
              db_path,
              rsmerge_path,
              snphistory_path,
              dbsnp_build,
              chunksize=500000):
    print("Parsing histories...")
    history = MergeHistory(rsmerge_path, snphistory_path, dbsnp_build)

    print("Creating database...")
    con = sqlite3.connect(db_path)

    con.execute("PRAGMA SYNCHRONOUS=OFF")
    con.execute("PRAGMA TEMP_STORE=MEMORY")
    con.execute("PRAGMA PAGE_SIZE=4096")
    con.execute("PRAGMA CACHE_SIZE=500000")

    con.execute("DROP TABLE IF EXISTS dbsnp")
    con.execute(
        "CREATE TABLE dbsnp (rsid TEXT, chrpos TEXT, epacts TEXT, chrom TEXT, pos INTEGER, ref TEXT, alt TEXT)"
    )
    con.execute("DROP TABLE IF EXISTS trans")
    con.execute("CREATE TABLE trans (rs_orig TEXT, rs_current TEXT)")

    with gzip.open(vcf_path, "rt") as vcf, con:
        print("Parsing VCF: %s" % vcf_path)
        for chunk in partition_all(chunksize, vcf):
            processed = []
            for line in chunk:
                if line.startswith("#"):
                    continue

                chrom, pos, vid, ref, alt = line.split("\t")[0:5]
                rsid = vid.split(":")[0]
                epacts = "{}:{}_{}/{}".format(chrom, pos, ref, alt)

                if not rsid.startswith("rs"):
                    continue

                rsid_trans = history.find_current(rsid)
                chrpos = "{}:{}".format(chrom, pos)

                use_rsid = rsid if rsid_trans is None else rsid_trans

                data = (use_rsid, chrpos, epacts, chrom, pos, ref, alt)

                processed.append(data)

            con.executemany(
                "INSERT INTO dbsnp (rsid,chrpos,epacts,chrom,pos,ref,alt) VALUES (?,?,?,?,?,?,?)",
                processed)

        # Create indexes for important columns
        print("Creating indexes...")
        indexes = [
            "CREATE INDEX idx_snp ON dbsnp (rsid)",
            "CREATE INDEX idx_chrpos ON dbsnp (chrpos)",
            "CREATE INDEX idx_chrom_and_pos ON dbsnp (chrom,pos)",
            "CREATE INDEX idx_epacts ON dbsnp (epacts)"
        ]

        for idx in indexes:
            print(idx)
            con.execute(idx)

        # Create SNP translation table
        print("Creating SNP translation table..")
        for chunk in partition_all(chunksize, history.iter_nodes()):
            rows = []
            for node in chunk:
                # Only want the "sink" nodes - the most recent rsids at the bottom of the tree
                if node.child is None:
                    parents = history.find_all_parents(node)
                    for p in parents:
                        rows.append([p, node.rsid])

            con.executemany(
                "INSERT INTO trans (rs_orig,rs_current) VALUES (?,?)", rows)

        print("Creating indexes...")
        indexes = [
            "CREATE INDEX idx_rs_orig ON trans (rs_orig)",
            "CREATE INDEX idx_rs_current ON trans (rs_current)"
        ]

        for idx in indexes:
            print(idx)
            con.execute(idx)
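The chunked-insert pattern on its own (a generic sketch with a stand-in table, not tied to the dbSNP schema above):

import sqlite3
from toolz import partition_all

def bulk_insert(con, rows, chunksize=500000):
    # one executemany per chunk keeps memory bounded without per-row overhead
    with con:
        for chunk in partition_all(chunksize, rows):
            con.executemany("INSERT INTO t (a, b) VALUES (?, ?)", chunk)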
Example #20
#!/usr/bin/env python

import os, os.path, shutil
from toolz.itertoolz import partition_all

prefix = "sub"
size = 100
for idx, part in enumerate(partition_all(size, filter(os.path.isfile, os.listdir('.')))):
    target = "sub-%03d" % idx
    os.mkdir(target)
    for f in part:
        shutil.move(f, os.path.join(target, f))

Example #21
import os
import sys
import argparse
from toolz import itertoolz

'''
Program #1
Input: cat myfiles.txt | python split_stdin_to_files.py 1000
Output: split into N files, each containing num_filenames_per_file filenames; the last file holds the remainder
'''

parser = argparse.ArgumentParser(description='Split file lines into multiple files. The input argument sets the number of filenames written to each output file.')
parser.add_argument('num_filenames_per_file', metavar='num_filenames_per_file', type=int,
                   help="Number of file names per file.")
args = parser.parse_args()

filenames = sys.stdin.readlines()
files_list = itertoolz.partition_all(args.num_filenames_per_file, filenames)

os.makedirs('./output', exist_ok=True)  # make sure the output directory exists
for index, files in enumerate(files_list, start=0):
    outfilename = f"./output/{index}"
    with open(outfilename, 'w') as outfile:
        outfile.writelines(files)
Example #22
def generate_data(elements_count, chunk_size):
    # Generate random input data and split it into chunks
    data = [(chunk, random.randrange(10000)) for chunk in range(elements_count)]
    data = partition_all(chunk_size, data)  # lazy iterator of tuples of length <= chunk_size
    return data
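A quick check of the remainder behaviour (a sketch that simply materializes the lazy chunks returned above):

chunks = list(generate_data(elements_count=10, chunk_size=4))
assert [len(c) for c in chunks] == [4, 4, 2]  # the last chunk holds the remainder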
Example #23
def create_db(vcf_path,db_path,rsmerge_path,snphistory_path,dbsnp_build,chunksize=500000):
  print("Parsing histories...")
  history = MergeHistory(rsmerge_path,snphistory_path,dbsnp_build)

  print("Creating database...")
  con = sqlite3.connect(db_path)

  con.execute("PRAGMA SYNCHRONOUS=OFF")
  con.execute("PRAGMA TEMP_STORE=MEMORY")
  con.execute("PRAGMA PAGE_SIZE=4096")
  con.execute("PRAGMA CACHE_SIZE=500000")

  con.execute("DROP TABLE IF EXISTS dbsnp")
  con.execute("CREATE TABLE dbsnp (rsid TEXT, chrpos TEXT, epacts TEXT, chrom TEXT, pos INTEGER, ref TEXT, alt TEXT)")
  con.execute("DROP TABLE IF EXISTS trans")
  con.execute("CREATE TABLE trans (rs_orig TEXT, rs_current TEXT)")

  with gzip.open(vcf_path,"rt") as vcf, con:
    print("Parsing VCF: %s" % vcf_path)
    for chunk in partition_all(chunksize,vcf):
      processed = []
      for line in chunk:
        if line.startswith("#"):
          continue

        chrom, pos, vid, ref, alt = line.split("\t")[0:5]
        rsid = vid.split(":")[0]
        epacts = "{}:{}_{}/{}".format(chrom,pos,ref,alt)

        if not rsid.startswith("rs"):
          continue

        rsid_trans = history.find_current(rsid)
        chrpos = "{}:{}".format(chrom,pos)

        use_rsid = rsid if rsid_trans is None else rsid_trans

        data = (use_rsid,chrpos,epacts,chrom,pos,ref,alt)

        processed.append(data)

      con.executemany("INSERT INTO dbsnp (rsid,chrpos,epacts,chrom,pos,ref,alt) VALUES (?,?,?,?,?,?,?)",processed)

    # Create indexes for important columns
    print("Creating indexes...")
    indexes = [
      "CREATE INDEX idx_snp ON dbsnp (rsid)",
      "CREATE INDEX idx_chrpos ON dbsnp (chrpos)",
      "CREATE INDEX idx_chrom_and_pos ON dbsnp (chrom,pos)",
      "CREATE INDEX idx_epacts ON dbsnp (epacts)"
    ]

    for idx in indexes:
      print(idx)
      con.execute(idx)

    # Create SNP translation table
    print("Creating SNP translation table..")
    for chunk in partition_all(chunksize,history.iter_nodes()):
      rows = []
      for node in chunk:
        # Only want the "sink" nodes - the most recent rsids at the bottom of the tree
        if node.child is None:
          parents = history.find_all_parents(node)
          for p in parents:
            rows.append([p,node.rsid])

      con.executemany("INSERT INTO trans (rs_orig,rs_current) VALUES (?,?)",rows)

    print("Creating indexes...")
    indexes = [
      "CREATE INDEX idx_rs_orig ON trans (rs_orig)",
      "CREATE INDEX idx_rs_current ON trans (rs_current)"
    ]

    for idx in indexes:
      print(idx)
      con.execute(idx)