Example #1
def enhance(pattern, rules, n):
    size = len(pattern)

    # square_rows becomes a list of lists of grids
    square_rows = []
    # split pattern into rows
    for rows in chunked_iter(pattern, n):
        squares = [[] for _ in range(size // n)]

        for row in rows:
            # split rows into columns, appending each row to the correct square
            # in the list we created above
            for i, c in enumerate(chunked_iter(row, n)):
                squares[i].append(c)

        square_rows.append(squares)

    # now enhance each square that we created above
    for y, squares in enumerate(square_rows):
        for x, square in enumerate(as_tuple_grid(s) for s in squares):
            square_rows[y][x] = rules[square]

    # convert square_rows back into a normal grid
    out = []

    for squares in square_rows:
        rows = ['' for _ in range(len(squares[0]))]
        for square in squares:
            for i, row in enumerate(square):
                rows[i] += ''.join(row)

        out.extend(tuple(r) for r in rows)

    return tuple(out)
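The carving logic above is two nested applications of chunked_iter: rows are taken n at a time, and each row is cut into n-wide pieces. Below is a minimal, self-contained sketch of that idea; it assumes chunked_iter comes from boltons.iterutils (the example's imports are not shown) and uses a hypothetical flat integer grid instead of the string pattern.

from boltons.iterutils import chunked_iter

grid = list(range(16))                # hypothetical flat 4x4 grid
rows = list(chunked_iter(grid, 4))    # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]

# Take the rows two at a time, then cut each row into 2-wide column chunks,
# mirroring the double chunking the example uses to carve the pattern into n x n squares.
for row_pair in chunked_iter(rows, 2):
    print([list(chunked_iter(r, 2)) for r in row_pair])
# [[[0, 1], [2, 3]], [[4, 5], [6, 7]]]
# [[[8, 9], [10, 11]], [[12, 13], [14, 15]]]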
Example #2
def tag(
    model: torch.nn.Module, data: ty.Iterable, batch_size: int = 128
) -> ty.List[ty.Tuple[int, ty.List[float]]]:
    """Tag a dataset

    Output: (tag, scores)
    """
    device = next(model.parameters()).device
    model.eval()
    sys_out = []  # type: ty.List[ty.Tuple[int, ty.List[float]]]
    if isinstance(data, ty.Sized):
        data_len = (len(data) - 1) // batch_size + 1  # type: ty.Optional[int]
    else:
        data_len = None
    data = map(datatools.FeaturefulSpan.collate, itu.chunked_iter(data, batch_size))
    pbar = tqdm.tqdm(
        data,
        total=data_len,
        unit="batch",
        desc="Tagging",
        mininterval=2,
        unit_scale=True,
        dynamic_ncols=True,
        disable=None,
        leave=False,
    )
    with torch.no_grad():
        for d in pbar:
            r = model(datatools.move(d, device=device))
            sys_tags = r.argmax(dim=-1).tolist()
            scores = r.exp().tolist()
            sys_out.extend(zip(sys_tags, scores))
    return sys_out
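The data_len computed above is the standard ceil-division idiom for counting batches; it only exists so tqdm can display a total when the input is sized. A quick standalone check of that arithmetic:

def n_batches(n_items: int, batch_size: int) -> int:
    # equivalent to math.ceil(n_items / batch_size) for n_items >= 1
    return (n_items - 1) // batch_size + 1

assert n_batches(1, 128) == 1
assert n_batches(128, 128) == 1
assert n_batches(129, 128) == 2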
Example #3
def invoke_semgrep(semgrep_args: List[str],
                   targets: List[str]) -> Dict[str, List[Any]]:
    """
    Call semgrep passing in semgrep_args + targets as the arguments

    Returns json output of semgrep as dict object
    """
    output: Dict[str, List[Any]] = {"results": [], "errors": []}

    for chunk in chunked_iter(targets, PATHS_CHUNK_SIZE):
        with tempfile.NamedTemporaryFile("w") as output_json_file:
            args = semgrep_args.copy()
            args.extend([
                "-o",
                output_json_file.name,  # nosem: python.lang.correctness.tempfile.flush.tempfile-without-flush
            ])
            for c in chunk:
                args.append(c)

            _ = semgrep_exec(*args)
            with open(
                    output_json_file.name  # nosem: python.lang.correctness.tempfile.flush.tempfile-without-flush
            ) as f:
                parsed_output = json.load(f)

            output["results"].extend(parsed_output["results"])
            output["errors"].extend(parsed_output["errors"])

    return output
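Chunking the targets with PATHS_CHUNK_SIZE keeps each invocation down to a bounded number of command-line arguments (very long argument lists can exceed OS limits), and the per-chunk JSON is then merged key by key. The sketch below shows the same chunk-and-merge pattern with a generic subprocess runner; the command, the chunk size, and the assumption that the tool prints JSON to stdout are placeholders rather than semgrep's actual interface.

import json
import subprocess
from boltons.iterutils import chunked_iter

CHUNK_SIZE = 500  # placeholder bound on arguments per invocation

def run_in_chunks(base_cmd, targets):
    merged = {"results": [], "errors": []}
    for chunk in chunked_iter(targets, CHUNK_SIZE):
        # Each invocation only ever sees a slice of the targets; the tool is
        # assumed to print a JSON document with "results" and "errors" keys.
        proc = subprocess.run([*base_cmd, *chunk], capture_output=True, text=True, check=True)
        parsed = json.loads(proc.stdout)
        merged["results"].extend(parsed.get("results", []))
        merged["errors"].extend(parsed.get("errors", []))
    return merged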
Example #4
        def create_graph(client=None):
            # In a cluster the motif annotations need to be broadcast to all nodes. Otherwise
            # the motif annotations need to be wrapped in a delayed() construct to avoid needless pickling and
            # unpickling between processes.
            delayed_or_future_annotations = client.scatter(motif_annotations, broadcast=True) if client \
                                                else delayed(motif_annotations, pure=True)

            # Chunking the gene signatures might not be necessary anymore because the overhead of the dask
            # scheduler is minimal (cf. blog http://matthewrocklin.com/blog/work/2016/05/05/performant-task-scheduling).
            # The original reasoning behind the decision to implement this was the since-refuted assumption that
            # fast-executing tasks would be greatly impacted by scheduler overhead. The chunking of signatures seemed
            # to corroborate this assumption. However, the actual benefit came from less pickling and unpickling of
            # the motif annotations dataframe, as it was not wrapped in a delayed() construct.

            # Remark on sharing ranking databases across a cluster. Because the frontnodes of the VSC for the LCB share
            # a file server and have a common home folder configured, these databases (stored on this shared drive)
            # can be accessed from all nodes in the cluster using the same path in the configuration file.

            # A potential improvement to reduce I/O contention for this shared drive (accessing the ranking
            # database) would be to load the database in memory (using the available decorator) for each task.
            # The penalty of loading the database in memory should be shared across multiple gene signatures, so
            # in this case chunking of gene signatures is mandatory to avoid severe performance penalties.
            # However, because the memory requirements of a node running pyscenic are already high (i.e. pre-allocation
            # of recovery curves - 20K features (max. enriched) * rank_threshold * 8 bytes (float) * num_cores),
            # this might not be a sound idea.

            return aggregate_func(
                (delayed(transform_func)(db, gs_chunk,
                                         delayed_or_future_annotations)
                 for db in rnkdbs
                 for gs_chunk in chunked_iter(modules, module_chunksize)))
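The comments above contrast two ways of handing a large object to dask tasks: Client.scatter(..., broadcast=True) places a copy on every worker and returns a future, while delayed(..., pure=True) embeds the object once in the task graph so it is not re-pickled for every call. Below is a toy sketch of both options, assuming dask and dask.distributed are installed; the local client and the stand-in object are illustrative only.

from dask import delayed
from dask.distributed import Client

big_object = {"toy": "stand-in for the motif annotations dataframe"}

def use(db, obj):
    return len(obj)

client = Client(processes=False)                  # small in-process cluster

# Option 1: ship one copy to every worker up front and pass the future around.
future_obj = client.scatter(big_object, broadcast=True)
# Option 2: wrap the object as a single delayed node, serialized once into the graph.
delayed_obj = delayed(big_object, pure=True)

# Either future_obj or delayed_obj could be passed to the per-chunk tasks below.
graph = delayed(sum)([delayed(use)(db, future_obj) for db in ["db1", "db2"]])
print(client.compute(graph, sync=True))           # 2
client.close()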
Example #5
    def load_db(self, n=1000):
        """Load database rows.
        """
        rows = tqdm(self.db_rows_iter())

        for chunk in chunked_iter(rows, n):
            session.bulk_save_objects(chunk)
            session.commit()
Example #6
 def _import_events(cls, f: BinaryIO, full_name: str, company_id: str, _):
     _, _, task_id = full_name[0 : -len(cls.events_file_suffix)].rpartition("_")
     print(f"Writing events for task {task_id} into database")
     for events_chunk in chunked_iter(cls.json_lines(f), 1000):
         events = [json.loads(item) for item in events_chunk]
         cls.event_bll.add_events(
             company_id, events=events, worker="", allow_locked_tasks=True
         )
Example #7
def parse(
    model_path: Union[str, pathlib.Path],
    in_file: Union[str, pathlib.Path, IO[str]],
    out_file: Union[str, pathlib.Path, IO[str]],
    batch_size: Optional[int] = None,
    overrides: Optional[Dict[str, str]] = None,
    raw: bool = False,
    strict: bool = True,
):
    parser = BiAffineParser.load(model_path, overrides)
    if batch_size is None:
        batch_size = parser.default_batch_size
    print("Encoding", file=sys.stderr)
    with smart_open(in_file) as in_stream:
        batches: Union[Iterable[DependencyBatch], Iterable[SentencesBatch]]
        if raw:
            sentences = (
                encoded for line in in_stream if line and not line.isspace()
                for encoded in
                [parser.encode_sentence(line.strip().split(), strict=strict)]
                if encoded is not None)
            batches = (parser.batch_sentences(sentences)
                       for sentences in itu.chunked_iter(
                           sentences,
                           size=batch_size,
                       ))
        else:
            test_set = DependencyDataset(
                DepGraph.read_conll(in_file),
                parser.lexer,
                parser.char_rnn,
                parser.ft_lexer,
                use_labels=parser.labels,
                use_tags=parser.tagset,
            )
            batches = (test_set.make_single_batch(sentences)
                       for sentences in itu.chunked_iter(
                           test_set.treelist, size=parser.default_batch_size))
        print("Parsing", file=sys.stderr)
        with smart_open(out_file, "w") as ostream:
            parser.batched_predict(
                batches,
                cast(IO[str], ostream),
                greedy=False,
            )
Example #8
def invoke_semgrep_sarif(
    semgrep_args: List[str],
    targets: List[str],
    *,
    timeout: Optional[int],
    explicit_semgrepignore_path: Optional[str] = None,
) -> Tuple[int, Dict[str, List[Any]]]:
    """
    Call semgrep passing in semgrep_args + targets as the arguments

    Returns sarif output of semgrep as dict object
    """
    output: Dict[str, List[Any]] = {}

    max_exit_code = 0
    _env = ({
        "SEMGREP_R2C_INTERNAL_EXPLICIT_SEMGREPIGNORE":
        explicit_semgrepignore_path,
        **os.environ,
    } if explicit_semgrepignore_path else os.environ)

    for chunk in chunked_iter(targets, PATHS_CHUNK_SIZE):
        with tempfile.NamedTemporaryFile("w") as output_json_file:
            args = semgrep_args.copy()
            args.extend(["--debug", "--sarif"])
            args.extend([
                "-o",
                output_json_file.name,  # nosem: python.lang.correctness.tempfile.flush.tempfile-without-flush
            ])
            for c in chunk:
                args.append(c)

            exit_code = semgrep_exec(*args,
                                     _timeout=timeout,
                                     _err=debug_echo,
                                     _env=_env).exit_code
            max_exit_code = max(max_exit_code, exit_code)

            with open(
                    output_json_file.name  # nosem: python.lang.correctness.tempfile.flush.tempfile-without-flush
            ) as f:
                parsed_output = json.load(f)

            if len(output) == 0:
                output = parsed_output
            else:
                output["runs"][0]["results"].extend(
                    parsed_output["runs"][0]["results"])
                output["runs"][0]["tool"]["driver"]["rules"].extend(
                    parsed_output["runs"][0]["tool"]["driver"]["rules"])

    return max_exit_code, output
Example #9
def get_by_filename_remote(filenames, chunk_size=200):
    file_infos = []
    warnings = []
    for filenames_chunk in chunked_iter(filenames, chunk_size):
        params = {'names': filenames_chunk}
        url = REMOTE_UTILS_URL + '/file'
        resp, no_infos = get_from_remote(url, params)
        if no_infos:
            # print '!! info missing for %s' % no_infos
            warnings += no_infos
        file_infos += resp
    return file_infos, warnings
Example #10
    def load_db(self, chunk_size=1000):
        """Write db rows.

        Args:
            chunk_size (int): Insert page size.
        """
        rows = self.db_rows()

        chunks = chunked_iter(rows, chunk_size)

        for i, chunk in enumerate(chunks):

            session.bulk_save_objects(chunk)
            session.commit()

            print(dt.now().isoformat(), i)
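Committing one chunk at a time bounds both the transaction size and the number of ORM objects held in memory at once. Below is a self-contained sketch of the same pattern, assuming SQLAlchemy with an in-memory SQLite database and a toy model; the original session and db_rows() are not shown in the example.

from boltons.iterutils import chunked_iter
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class Item(Base):
    __tablename__ = "items"
    id = Column(Integer, primary_key=True)
    name = Column(String)

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

rows = (Item(name=f"row-{i}") for i in range(10_000))  # stand-in for db_rows()

# Insert and commit 1000 rows at a time so neither the session nor the
# transaction ever holds all 10,000 objects.
for chunk in chunked_iter(rows, 1000):
    session.bulk_save_objects(chunk)
    session.commit()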
Example #11
    def wrapper(self, iterable: Iterable, **kwargs):
        assert iterutils.is_collection(
            iterable
        ), "The positional parameter should be an iterable for breaking into chunks"

        func_with_params = functools.partial(func, self, **kwargs)
        with ThreadPoolExecutor() as pool:
            return list(
                itertools.chain.from_iterable(
                    filter(
                        None,
                        pool.map(
                            func_with_params,
                            iterutils.chunked_iter(iterable, chunk_size),
                        ),
                    )
                ),
            )
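func and chunk_size are free variables in this wrapper, so it presumably lives inside a parametrized decorator. The sketch below is a hedged guess at what that enclosing decorator could look like; the name run_chunked and the usage shown in the trailing comment are hypothetical, not taken from the original code, and iterutils is assumed to be boltons.iterutils.

import functools
import itertools
from concurrent.futures import ThreadPoolExecutor

from boltons import iterutils

def run_chunked(chunk_size: int):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(self, iterable, **kwargs):
            assert iterutils.is_collection(
                iterable
            ), "The positional parameter should be an iterable for breaking into chunks"
            func_with_params = functools.partial(func, self, **kwargs)
            with ThreadPoolExecutor() as pool:
                # Each thread handles one chunk; empty/None results are dropped
                # and the remaining per-chunk lists are flattened.
                return list(
                    itertools.chain.from_iterable(
                        filter(
                            None,
                            pool.map(
                                func_with_params,
                                iterutils.chunked_iter(iterable, chunk_size),
                            ),
                        )
                    )
                )
        return wrapper
    return decorator

# Hypothetical usage: the decorated method receives one chunk per call and
# returns a list, which the wrapper flattens across all chunks.
# class Api:
#     @run_chunked(chunk_size=100)
#     def fetch(self, ids_chunk, **kwargs):
#         return [str(i) for i in ids_chunk]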
Example #12
 def main(self):
     a = self.args(self.text[2:])
     if a.count > 10 and self.mc2['vips'] == False:
         a.count = 10
     elif a.count > 30:
         a.count = 30
     image_url = []
     try:
         if a.count > 4:
             self.sendmsg("начинаю качать пикчи")
         for _ in range(a.count):
             image_url.append(nekos.img(self.text[1]))
         image = self.multithreadwoload(image_url)
         image = list(iterutils.chunked_iter(image.split(","), 10))
         for image in image:
             images = ",".join(image)
             self.sendmsg("Держи!", images)
     except:
         self.sendmsg("""Введи один из этих аргументов, с цифрой на конце или с -c 5 - скинет указанное количество пикч:
         feet, yuri, trap, futanari, hololewd, lewdkemo, solog, feetg, cum, erokemo, les, wallpaper, lewdk, ngif, tickle, lewd, feed, gecg, eroyuri, eron, cum_jpg, bj, nsfw_neko_gif, solo, kemonomimi, nsfw_avatar, gasm, poke, anal, slap, hentai, avatar, erofeet, holo, keta, b*****b, pussy, t**s, holoero, lizard, pussy_jpg, pwankg, classic, kuni, waifu, pat, 8ball, kiss, femdom, neko, spank, cuddle, erok, fox_girl, boobs, random_hentai_gif, smallboobs, hug, ero, smug, goose, baka""")
Example #13
    def chunked(self, size, fill=_MISSING):
        """Return a new :class:`Iter()` spec which groups elements in the iterable
        into lists of length *size*.

        If the optional *fill* argument is provided, iterables not
        evenly divisible by *size* will be padded out by the *fill*
        constant. Otherwise, the final chunk will be shorter than *size*.

        >>> list(glom(range(10), Iter().chunked(3)))
        [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
        >>> list(glom(range(10), Iter().chunked(3, fill=None)))
        [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, None, None]]
        """
        kw = {'size': size}
        args = size,
        if fill is not _MISSING:
            kw['fill'] = fill
            args += (fill, )
        return self._add_op('chunked', args,
                            lambda it, scope: chunked_iter(it, **kw))
Example #14
    def saveSingleRollout(i):

        subsampled_paths_per_thread = []
        # print("Rollout number is ", i)
        # obs_skip = 15  # np.random.randint(7, 28)
        # num_datapoints = int(2500 / (instancesNum * obs_skip))

        path = perform_rollout(policy,
                               environment,
                               debug=False,
                               animate=opt['animate'],
                               control_step_skip=opt['action_skip'])

        useful_path_data = path['observations'][::]
        # useful_path_data_reverse = useful_path_data[::-1]
        # print(len(useful_path_data))
        # useful_path_data_combined = useful_path_data + useful_path_data_reverse
        split_paths = list(
            iterutils.chunked_iter(useful_path_data, instancesNum * obs_skip))

        # Split a rollout into segments of 50 time steps
        # print(len(split_paths))
        # if (num_datapoints > len(split_paths) - 1):
        num_datapoints = len(split_paths) - 1
        # print("WHYWHY")

        for j in range(num_datapoints):

            observations = split_paths[j]
            obs_sample = []
            # From each 50 steps subsample instancesNum steps and store

            for k in range(instancesNum):
                obs_sample.append(
                    observations[int(len(observations) / instancesNum) *
                                 k][ignoreObs::])
            subsampled_paths_per_thread.append(obs_sample)
            # paths.extend(split_paths)
            # paths.append(path)
        # print(np.shape(subsampled_paths_per_thread))
        return subsampled_paths_per_thread
Example #15
    def _batch(
        self,
        entries: Any,
        key: str,
        operation: Callable[..., Dict[str, str]],
        raise_on_error: bool = False,
        apply: Callable[..., Any] = lambda x: x,
    ) -> Dict[str, List[bool]]:
        """[summary]

        Args:
            entries (Any): [description]
            key (str): [description]
            operation (Callable[..., Dict[str, str]]): [description]
            raise_on_error (bool): [description]. Defaults to False.
            apply (Callable[..., Any]): [description]. Defaults to lambdax:x.

        Returns:
            Dict[str, List[bool]]: [description]

        Raises:
            Exception
        """
        res_list = []
        for i_chunk, chunk in enumerate(chunked_iter(entries, 10)):
            payload = [{
                "Id": str(i_chunk * 10 + i),
                key: apply(m)
            } for i, m in enumerate(chunk)]
            res = operation(QueueUrl=self.queue_url, Entries=payload)
            print(res)
            if raise_on_error and res.get("Failed"):
                raise (Exception)
            res_list.append(res)
        return reduce(
            lambda c, r: {
                key: c.get(key, []) + r.get(key, [])
                for key in ["Successful", "Failed"]
            },
            res_list,  # type: ignore
        )
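The chunk size of 10 matches the SQS batch APIs (send_message_batch and its siblings accept at most 10 entries per call), which is presumably why the helper chunks by 10. The closing reduce just folds the per-chunk responses into one dict; here is a minimal offline sketch of that merge step.

from functools import reduce

# Hypothetical per-chunk responses of the shape returned by the batch operation.
chunk_responses = [
    {"Successful": [{"Id": "0"}, {"Id": "1"}], "Failed": []},
    {"Successful": [{"Id": "10"}], "Failed": [{"Id": "11"}]},
]

merged = reduce(
    lambda acc, r: {k: acc.get(k, []) + r.get(k, []) for k in ["Successful", "Failed"]},
    chunk_responses,
)
print(merged["Successful"])  # [{'Id': '0'}, {'Id': '1'}, {'Id': '10'}]
print(merged["Failed"])      # [{'Id': '11'}]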
Example #16
 def main(self):
     lhelp = []
     mhelp = "\n"
     allowedtype = ["command", "specialcommand"]
     for moduli in mods.modules:
         if moduli.types in allowedtype and moduli.available_for != "admins" and moduli.included:
             lhelp.append(dict(command=moduli.command, doc=moduli.doc))
     lhelp = list(iterutils.chunked_iter(lhelp, 11))
     lhelp = [dict(command="уходи от", doc="сюда мужик")] + lhelp
     try:
         number = int(self.text[1])
         lhelp2 = lhelp[number]
     except:
         number = 1
         lhelp2 = lhelp[1]
     for moduli in lhelp2:
         mhelp += f"• {', '.join(moduli['command'])} - {moduli['doc']} \n"
     mhelp += f"Страница: {number} \n"
     mhelp += f"Всего страниц: {len(lhelp)-1} \n"
     mhelp += "Пример переключения на другую страницу: /хелп 3"
     self.sendmsg(mhelp)
Example #17
def invoke_semgrep(ctx: click.Context) -> FindingSets:
    debug_echo("=== adding semgrep configuration")

    workdir = Path.cwd()
    targets = TargetFileManager(
        base_path=workdir,
        base_commit=ctx.obj.meta.base_commit_ref,
        paths=[workdir],
        ignore_rules_file=get_semgrepignore(ctx.obj.sapp.scan),
    )

    debug_echo("=== seeing if there are any findings")
    findings = FindingSets()

    with targets.current_paths() as paths, get_semgrep_config(
            ctx) as config_args:
        click.echo("=== looking for current issues in " +
                   unit_len(paths, "file"))
        for chunk in chunked_iter(paths, PATHS_CHUNK_SIZE):
            args = ["--json", *config_args]
            for path in chunk:
                args.extend(["--include", path])
            findings.current.update(
                Finding.from_semgrep_result(result, ctx)
                for result in json.loads(str(semgrep(*args)))["results"])
            click.echo(
                f"| {unit_len(findings.current, 'current issue')} found")

    if not findings.current:
        click.echo(
            "=== not looking at pre-existing issues since there are no current issues"
        )
    else:
        with targets.baseline_paths() as paths, get_semgrep_config(
                ctx) as config_args:
            if paths:
                paths_with_findings = {
                    finding.path
                    for finding in findings.current
                }
                paths_to_check = set(str(path)
                                     for path in paths) & paths_with_findings
                click.echo("=== looking for pre-existing issues in " +
                           unit_len(paths_to_check, "file"))
                for chunk in chunked_iter(paths_to_check, PATHS_CHUNK_SIZE):
                    args = ["--json", *config_args]
                    for path in chunk:
                        args.extend(["--include", path])
                    findings.baseline.update(
                        Finding.from_semgrep_result(result, ctx)
                        for result in json.loads(str(semgrep(
                            *args)))["results"])
                click.echo(
                    f"| {unit_len(findings.baseline, 'pre-existing issue')} found"
                )

    if os.getenv("INPUT_GENERATESARIF"):
        # FIXME: This will crash when running on thousands of files due to command length limit
        click.echo("=== re-running scan to generate a SARIF report")
        sarif_path = Path("semgrep.sarif")
        with targets.current_paths() as paths, sarif_path.open(
                "w") as sarif_file, get_semgrep_config(ctx) as config_args:
            args = ["--sarif", *config_args]
            for path in paths:
                args.extend(["--include", path])
            semgrep(*args, _out=sarif_file)
        rewrite_sarif_file(sarif_path)

    return findings
Example #18
def _distributed_calc(
        rnkdbs: Sequence[Type[RankingDatabase]],
        modules: Sequence[Type[GeneSignature]],
        motif_annotations_fname: str,
        transform_func: Callable[
            [Type[RankingDatabase], Sequence[Type[GeneSignature]], str], T],
        aggregate_func: Callable[[Sequence[T]], T],
        motif_similarity_fdr: float = 0.001,
        orthologuous_identity_threshold: float = 0.0,
        client_or_address='custom_multiprocessing',
        num_workers=None,
        module_chunksize=100) -> T:
    """
    Perform a parallelized or distributed calculation, either pruning targets or finding enriched motifs.

    :param rnkdbs: A sequence of ranking databases.
    :param modules: A sequence of gene signatures.
    :param motif_annotations_fname: The filename of the motif annotations to use.
    :param transform_func: A function having a signature (Type[RankingDatabase], Sequence[Type[GeneSignature]], str)
        that returns Union[Sequence[Regulon], pandas.DataFrame].
    :param aggregate_func: A function having a signature:
        - (Sequence[pandas.DataFrame]) => pandas.DataFrame
        - (Sequence[Sequence[Regulon]]) => Sequence[Regulon]
    :param motif_similarity_fdr: The maximum False Discovery Rate to find factor annotations for enriched motifs.
    :param orthologuous_identity_threshold: The minimum orthologuous identity to find factor annotations
        for enriched motifs.
    :param client_or_address: The client or IP address of the scheduler when working with dask. For local multi-core
        systems 'custom_multiprocessing' or 'dask_multiprocessing' can be supplied.
    :param num_workers: If not using a cluster, the number of workers to use for the calculation.
        None if all available CPUs need to be used.
    :param module_chunksize: The size of the chunk in signatures to use when using the dask framework.
    :return: A pandas dataframe or a sequence of regulons (depends on aggregate function supplied).
    """
    def is_valid(client_or_address):
        if isinstance(client_or_address, str) and ((client_or_address in {
                "custom_multiprocessing", "dask_multiprocessing", "local"
        }) or IP_PATTERN.fullmatch(client_or_address)):
            return True
        elif isinstance(client_or_address, Client):
            return True
        return False

    assert is_valid(client_or_address), \
        "\"{}\" is not valid for parameter client_or_address.".format(client_or_address)

    # Make sure warnings and info are being logged.
    if not len(LOGGER.handlers):
        LOGGER.addHandler(create_logging_handler(False))
        if LOGGER.getEffectiveLevel() > logging.INFO:
            LOGGER.setLevel(logging.INFO)

    if client_or_address == 'custom_multiprocessing':  # CUSTOM parallelized implementation.
        # This implementation overcomes the I/O-bounded performance. Each worker (subprocess) loads a dedicated ranking
        # database and motif annotation table into its own memory space before consuming modules. The implementation of
        # each worker uses the AUC-first numba JIT based implementation of the algorithm.
        assert len(rnkdbs) <= (num_workers if num_workers else cpu_count()), \
            "The number of databases is larger than the number of cores."
        amplifier = int(
            (num_workers if num_workers else cpu_count()) / len(rnkdbs))
        LOGGER.info("Using {} workers.".format(len(rnkdbs) * amplifier))
        receivers = []
        for db in rnkdbs:
            for idx, chunk in enumerate(
                    chunked_iter(modules,
                                 ceil(len(modules) / float(amplifier)))):
                sender, receiver = Pipe()
                receivers.append(receiver)
                Worker("{}({})".format(db.name, idx + 1), db, chunk,
                       motif_annotations_fname, sender, motif_similarity_fdr,
                       orthologuous_identity_threshold,
                       transform_func).start()
        # Retrieve the name of the temporary file to which the data is stored. This is a blocking operation.
        fnames = [recv.recv() for recv in receivers]

        # Load all data from disk and concatenate.
        def load(fname):
            with open(fname, 'rb') as f:
                return pickle.load(f)

        try:
            return aggregate_func(list(map(load, fnames)))
        finally:
            # Remove temporary files.
            for fname in fnames:
                os.remove(fname)
    else:  # DASK framework.
        # Load motif annotations.
        motif_annotations = load_motif_annotations(
            motif_annotations_fname,
            motif_similarity_fdr=motif_similarity_fdr,
            orthologous_identity_threshold=orthologuous_identity_threshold)

        # Create dask graph.
        def create_graph(client=None):
            # In a cluster the motif annotations need to be broadcast to all nodes. Otherwise
            # the motif annotations need to be wrapped in a delayed() construct to avoid needless pickling and
            # unpickling between processes.
            delayed_or_future_annotations = client.scatter(motif_annotations, broadcast=True) if client \
                                                else delayed(motif_annotations, pure=True)

            # Chunking the gene signatures might not be necessary anymore because the overhead of the dask
            # scheduler is minimal (cf. blog http://matthewrocklin.com/blog/work/2016/05/05/performant-task-scheduling).
            # The original reasoning behind the decision to implement this was the since-refuted assumption that
            # fast-executing tasks would be greatly impacted by scheduler overhead. The chunking of signatures seemed
            # to corroborate this assumption. However, the actual benefit came from less pickling and unpickling of
            # the motif annotations dataframe, as it was not wrapped in a delayed() construct.

            # Remark on sharing ranking databases across a cluster. Because the frontnodes of the VSC for the LCB share
            # a file server and have a common home folder configured, these databases (stored on this shared drive)
            # can be accessed from all nodes in the cluster using the same path in the configuration file.

            # A potential improvement to reduce I/O contention for this shared drive (accessing the ranking
            # database) would be to load the database in memory (using the available decorator) for each task.
            # The penalty of loading the database in memory should be shared across multiple gene signatures, so
            # in this case chunking of gene signatures is mandatory to avoid severe performance penalties.
            # However, because the memory requirements of a node running pyscenic are already high (i.e. pre-allocation
            # of recovery curves - 20K features (max. enriched) * rank_threshold * 8 bytes (float) * num_cores),
            # this might not be a sound idea.

            return aggregate_func(
                (delayed(transform_func)(db, gs_chunk,
                                         delayed_or_future_annotations)
                 for db in rnkdbs
                 for gs_chunk in chunked_iter(modules, module_chunksize)))

        # Compute dask graph ...
        if client_or_address == "dask_multiprocessing":
            # ... via multiprocessing.
            return create_graph().compute(
                get=get,
                num_workers=num_workers if num_workers else cpu_count())
        else:
            # ... via dask.distributed framework.
            client, shutdown_callback = _prepare_client(
                client_or_address,
                num_workers=num_workers if num_workers else cpu_count())
            try:
                return client.compute(create_graph(client), sync=True)
            finally:
                shutdown_callback(False)
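In the custom multiprocessing branch above, the chunk size ceil(len(modules) / amplifier) is what turns the core budget into one worker per (database, module-chunk) pair. A quick standalone check of that arithmetic with made-up numbers:

from math import ceil

num_workers, rnkdbs, modules = 8, ["db1", "db2"], list(range(100))

amplifier = int(num_workers / len(rnkdbs))           # 4 workers per database
chunk_size = ceil(len(modules) / float(amplifier))   # 25 modules per worker

assert amplifier == 4 and chunk_size == 25
assert len(rnkdbs) * amplifier == num_workers        # 8 worker processes in total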
Example #19
 def batches(self, size):
     """Iterate all batches.
     """
     for grafs in chunked_iter(self.grafs, size):
         yield Batch(grafs)
Example #20
    help=
    'Upper bound on the number of processes that can be launched in parallel. Default value is '
    'the number of cores on your machine.',
    nargs='?',
    default=mp.cpu_count(),
    type=int)
args = parser.parse_args()

list_bigfiles = [
    bigfile for bigfile in os.listdir(args.directory)
    if args.pattern in bigfile
]

process_dict = {}
if __name__ == '__main__':
    for bigfile in list_bigfiles:
        process_dict[bigfile] = mp.Process(name=bigfile,
                                           target=parallel_upload_gcs,
                                           args=(
                                               args.directory + bigfile,
                                               args.bucket,
                                           ))

# print(args)

for sublist_bigfiles in iterutils.chunked_iter(list_bigfiles, args.nb_process):
    for bigfile in sublist_bigfiles:
        process_dict[bigfile].start()
    for bigfile in sublist_bigfiles:
        process_dict[bigfile].join()
def main(start_date_, working_dir_, nblocks_, email_notification_, top_):
    """
    The parametrized main function for CLI in the cloud
    """
# use the following command:
# rm -r temp/*; python test.py --top 1000 -s 2018-01-01
#-dir ./temp/ -nblocks 100 --email-notification
# on Mac terminal from the dir where you have test.py
# command line arguments; use comments below as an example
#TOP = 10000000
# reduce TOP value to 10 for debugging; put it to inf for a full run
#DATE = '2017-01-01'
# 'from' parameter for historical pricing data
#WORKING_DIR = './refinitiv_qa_direct_qai_master_and_pricing_tables/'\
#    +str(time.strftime("%Y-%m-%d"))+'/'
# dir where all outputs go; it can be dated as above
#NBLOCKS = 100
# pricing data are very long queries; they need to be partitioned in blocks
# as a separate project, optimize queries
#
#
# pylint: disable=too-many-branches
# pylint: disable=too-many-statements
# pylint: disable=too-many-locals
#
    top = top_
    date_from = start_date_
    nblocks = nblocks_
    cwd = os.path.realpath(os.path.dirname(__file__)) #os.getcwd() # ./
    working_dir = working_dir_
    # empty the whole working dir
    for root, dirs, files in os.walk(working_dir):
        for f_f in files:
            os.unlink(os.path.join(root, f_f))
        for d_d in dirs:
            shutil.rmtree(os.path.join(root, d_d))
    shutil.copy(cwd+'/master_file_joe.csv', working_dir)
#
    database = 'qai'
    server = 'cd5m7wkqacpdeus2mia12301.public.dabc3424290b.database.windows.net,3342'
    username = '******'
    password = '******'
#Authentication: SQL Server Authentication
    # NOTE: The following works on a Mac with the MSSQL 13 driver installed - it is here as the
    # default because Art's Anaconda environment doesn't show a non-empty list of drivers from
    # pyodbc
    driver = '/usr/local/lib/libmsodbcsql.13.dylib' # '{ODBC Driver 13 for SQL Server}'
    drivers = [item for item in pyodbc.drivers()]
    if drivers:
        driver = drivers[0]
    #print('driver:{}'.format(driver))
    #
    cnxn = pyodbc.connect('DRIVER=' + driver +
                          ';SERVER=' + server +
                          ';PORT=1433;DATABASE=' + database +
                          ';UID=' + username +
                          ';PWD=' + password)
    cursor_ = cnxn.cursor()
    refinitiv_data_n_columns = 8
    s_s = ""
    if top is not None:
        s_s = ''' TOP '''+str(top)
    query = '''SELECT'''+s_s+'''
                        A.SecCode
                ,       MR1.ID,MR1.NAME     AS CURRNAME
                ,       G1.ISSUER           AS PITISSUER,G1.EXCHANGE
                ,       MR1.Country
                ,       G1.StartDate
                ,       G1.EndDate
                ,       K1.TICKER
                ,       G1.EXCHANGE
                ,       I.ISSUER            AS CURRENTISSUE
                ,       I.STATUS
                ,       I.SECTYPE           AS CURRSECTYPE

                FROM            SecMstrX                        A

                JOIN            SECMAPX             M
                                ON                  M.SECCODE = A.SECCODE
                                AND                 M.VenType = 1       -- IDC
                                AND                 TYPE_ = 1           -- NorthAmer Equity
                                AND                 M.EXCHANGE <> 2

                                -- AND     M.RANK = 1   -- VIEW ALL (commented out) OR CURRENT ONLY
                                -- AND     A.COUNTRY = 'USA' -- comment this out for ADR's

                JOIN            Prc.PrcTKChg                    K
                                ON                  M.VENCODE = K.Code

                JOIN            PRC.PRcsCCHG        G
                                ON                  G.CODE =    K.CODE
                                AND                 ISNULL(G.ENDDATE,'1/1/2059')
                                BETWEEN             K.STARTDATE AND ISNULL(K.ENDDATE,'1/1/2059')

                --JOIN PRCCODE2 Y
                --ON Y.TYPE_ = 2 AND ASCII(G.EXCHANGE) = Y.CODE

                JOIN            PRC.PRCINFO         I
                                ON                  I.CODE =    G.CODE
                                AND                 I.SECTYPE   NOT IN ('X','P','E','I','S','U','W','0','7','T','Q','R','V')

                JOIN            SECMAPX             MP1
                                ON                  MP1.VENCODE =   I.CODE
                                AND                 MP1.RANK =      M.RANK
                                AND                 MP1.VENTYPE =   1
                                AND                 MP1.EXCHANGE =  M.EXCHANGE

                JOIN            SECMSTRX            MR1
                                ON                  MR1.SECCODE =   MP1.SECCODE
                                AND                 MR1.TYPE_ =     1

                JOIN            SECMAPX             MP2
                                ON                  MP2.SECCODE =   MR1.SECCODE
                                AND                 MP2.VENTYPE = 1
                                AND                 MP2.RANK =      M.RANK
                JOIN            PRC.PRCTKCHG        K1
                                ON                  K1.CODE =       MP2.VENCODE
                                --AND ISNULL(K1.ENDDATE,'1/1/2059') BETWEEN K.STARTDATE AND ISNULL(K.ENDDATE,'1/1/2059')

                JOIN            PRC.PRCSCCHG        G1
                                ON                  G1.CODE =       K1.CODE
                                AND                 ISNULL(G1.ENDDATE,'1/1/2059')
                                BETWEEN             K1.STARTDATE    AND     ISNULL(K1.ENDDATE,'1/1/2059')

                 GROUP BY       A.SecCode
                 ,              MR1.ID
                 ,              MR1.NAME
                 ,              G1.ISSUER
                 ,              G1.EXCHANGE
                 ,              MR1.Country
                 ,              G1.StartDate
                 ,              G1.EndDate
                 ,              K1.TICKER
                 ,              G1.EXCHANGE
                 ,              I.ISSUER
                 ,              I.STATUS
                 ,              I.SECTYPE

                 ORDER BY       MR1.ID
                 ,              G1.STARTDATE
                 '''
    # output the query string to a file
    with open(working_dir+'query_master_table.txt', "w") as query_file:
        query_file.write(query)
    print('\n\nexecuting the query ... ', datetime.now())
    try:
        print('\n\ntrying to execute cursor_.execute(query) ...', datetime.now())
        cursor_.execute(query)
    except Exception as err:
        print('\n\nexception #1 for cursor_.execute(query)', err, datetime.now())
    print('\n\nfetching query result ... ', datetime.now())
    try:
        print('\n\ntrying to execute result = cursor_.fetchall()...', datetime.now())
        result = cursor_.fetchall()
    except Exception as err:
        print('\n\nexception #2 for result = cursor_.fetchall()', err, datetime.now())

    tickers = []
    print('\n\nwriting .csv file (master table) ... ', datetime.now())
    with tqdm(total=len(result), file=sys.stdout) as pbar:
        TABLE_MASTER = []
        TABLE_MERGED = []
        for row in result:
            pbar.set_description('progress at %s' % datetime.now())
            pbar.update(1)
            row1 = []
            row3 = []
            date_to = datetime.date(datetime.now())
            if row[7] is not None:                  # to
                date_to = datetime.date(row[7])
            else:
                date_to = datetime.date(datetime.now())
            if date_to > datetime.date(datetime.now()):
                date_to = datetime.date(datetime.now())
    #
            row1.append(str(row[8]))                # ticker
            tickers.append(row[8])
            row1.append(str(row[3]))                # point-in-time name
            row1.append(str(date_to))               # to
    #
            row1.append(str(row[0]))                # SecCode
            row3.append(int(row[0]))                # int for sorting
            row1.append(datetime.date(row[6]))      # from
            row3.append(datetime.date(row[6]))
            row1.append(date_to)                    # to
            row3.append(date_to)
            row1.append(str(row[3]))                # point-in-time name
            row3.append(str(row[3]))
            row1.append(str(row[8]))                # ticker
            row3.append(str(row[8]))
            row1.append(str(row[5]))                # country
            row3.append(str(row[5]))
            row1.append(str(row[2]))                # current name
            row3.append(str(row[2]))
            row1.append(str(row[12]))               # type
            row3.append(str(row[12]))
            if row1 not in TABLE_MERGED:
                TABLE_MERGED.append(row1)
            if row3 not in TABLE_MERGED:
                TABLE_MASTER.append(row3)

        with open(working_dir+'master_table.csv', 'w') as result_file:
            TABLE_MASTER1 = []
            TABLE_MASTER1.append(create_titles([
                'SecCode'
                , 'From'
                , 'To'
                , 'Point-in-time name'
                , 'Ticker'
                , 'Country'
                , 'Current name'
                , 'Type'
                ]))
            TABLE_MASTER = sorted(TABLE_MASTER, key=lambda item: item[0])
    #         sorted(TABLE_MASTER, key=operator.itemgetter(0))
            TABLE_MASTER1 += TABLE_MASTER
            WR = csv.writer(result_file, dialect='excel')
            print("HERE")
            WR.writerows(TABLE_MASTER1)

        print('\n\npost-processing 1 ... ', datetime.now())

        with open(working_dir+'master_file_joe.csv', 'r') as csv_file:
            CSV_READER = csv.reader(csv_file, delimiter=',')
            NROW = 0
            for row in CSV_READER:
                row1 = [] # change True to False to use the list
                if (str(row[3]) in ('C', 'BAC', 'AAPL') or True) and NROW != 0: # skip titles
                    row1.append(str(row[3]))
                    row1.append(str(row[4]))
                    row1.append(str(row[2]))
                    for _ in range(refinitiv_data_n_columns):
                        row1.append('') # fill in with blanks for merged .csv
                    for r in row:
                        row1.append(r)
                    TABLE_MERGED.append(row1)
                NROW += 1

        print('\n\npost-processing 2 ... ', datetime.now())

        with open(working_dir+'master_table_merged_art_vs_joe.csv', 'w') as result_file:
            WR = csv.writer(result_file, dialect='excel')
            TABLE_MERGED1 = sorted(TABLE_MERGED, key=operator.itemgetter(0, 1, 2))
            TABLE_MERGED2 = []
            TABLE_MERGED2.append(create_titles([
                ''
                , ''
                , ''
                , 'SecCode'
                , 'From'
                , 'To'
                , 'Point-in-time name'
                , 'Ticker'
                , 'Country'
                , 'Current name'
                , 'Type'
                , 'ID'
                , 'FROM'
                , 'TO'
                , 'TICKER'
                , 'NAME'
                , 'TYPE'
                ]))
            TABLE_MERGED2 += TABLE_MERGED1
            WR.writerows(TABLE_MERGED2)

        print('\n\npost-processing 3 ... ', datetime.now())

        TICKERS_JOE = [] # this should be an array of unique tickers
        i = 0
        with open(working_dir+'master_file_joe.csv', 'r') as csv_file:
            CSV_READER = csv.reader(csv_file, delimiter=',')
            for row in CSV_READER:
                if  i != 0: # skip titles at i = 0
                    if row[3] not in TICKERS_JOE: # unique tickers
                        TICKERS_JOE.append(row[3])
                i += 1

        TICKERS_ART = [] # this should be an array of unique tickers
        for t1 in tickers:
            if t1 not in TICKERS_ART:
                TICKERS_ART.append(t1)

        print('\n\nnumber of unique tickers in the master: ', len(TICKERS_ART), datetime.now())

        if top is None:
            print('\n\npost-processing 4 ... ', datetime.now())

            MISSING_TICKERS = []
            for tj in TICKERS_JOE:
                if tj not in TICKERS_ART: # unique tickers
                    MISSING_TICKERS.append(tj)

            MISSING_TICKERS1 = []
            for mt in MISSING_TICKERS:
                if mt not in MISSING_TICKERS1: # unique tickers
                    MISSING_TICKERS1.append(mt)

            print('\n\nnumber of missing tickers: ', len(MISSING_TICKERS1), datetime.now())

            TICKERS_WITHOUT_SUFFIX = []
            for mt in MISSING_TICKERS1:
                if mt.find('.') != -1:
                    mt = mt.split('.')[0]
                else:
                    mt = mt[:-1] # try to remove the fused suffix for missing tickers
                if mt not in TICKERS_WITHOUT_SUFFIX:
                    TICKERS_WITHOUT_SUFFIX.append(mt)
            print('\n\nnumber of missing tickers without suffix: ',
                  len(TICKERS_WITHOUT_SUFFIX), datetime.now())

            query = '''SELECT * FROM PRC.PRCSCCHG WHERE TICKER IN (\''''

            for tws in TICKERS_WITHOUT_SUFFIX:
                query += str(tws)+'''\', \''''
            query = query[:-3]
            query += ''')'''

            try:
                print('\n\ntrying to execute cursor_.execute(query)...', datetime.now())
                cursor_.execute(query)
            except Exception as err:
                print('\n\nexception #3 for cursor_.execute(query)', err, datetime.now())

            print('\n\nfetching second query result ... ', datetime.now())
            try:
                print('\n\ntrying to execute result = cursor_.fetchall()...', datetime.now())
                result = cursor_.fetchall()
            except Exception as err:
                print('\n\nexception #4 for result = cursor_.fetchall()', err, datetime.now())

            with open(working_dir+'addendum_master_table.csv', 'w') as result_file:
                TABLE_ADDENDUM = result
                TABLE_ADDENDUM = sorted(TABLE_ADDENDUM, key=operator.itemgetter(4))
                TABLE_ADDENDUM1 = []
                TABLE_ADDENDUM1.append(create_titles([
                    'SecCode'
                    , 'From'
                    , 'To'
                    , 'CUSIP'
                    , 'Ticker'
                    , 'SEDOL'
                    , 'Issuer'
                    , 'Full ticker'
                    , 'Base ticker'
                    , 'Group'
                    , 'Series'
                    , 'Exchange'
                    ]))
                TABLE_ADDENDUM1 += TABLE_ADDENDUM
                WR = csv.writer(result_file, dialect='excel')
                WR.writerows(TABLE_ADDENDUM1)

            FOUND_TICKERS = []
            for row in result:
                if str(row[4]) not in FOUND_TICKERS:
                    FOUND_TICKERS.append(str(row[4]))

            print('\n\nnumber of found tickers: ', len(FOUND_TICKERS), datetime.now())

            MISSING_TICKERS2 = []
            for mt in MISSING_TICKERS1:
                wosuffix = mt
                if wosuffix.find('.') != -1:
                    wosuffix = wosuffix.split('.')[0]
                else:
                    wosuffix = wosuffix[:-1] # try to remove the fused suffix
                if wosuffix not in FOUND_TICKERS and mt not in FOUND_TICKERS:
                    # tickers w/o and with suffix
                    MISSING_TICKERS2.append(mt)

            print('\n\nfinal number of missing tickers: ', len(MISSING_TICKERS2), datetime.now())
            print('\n\nwriting missing tickers ... ', datetime.now())

            with open(working_dir+'missing_tickers.csv', 'w') as result_file:
                WR = csv.writer(result_file, dialect='excel')
                MISSING_TICKERS2.sort()
                MISSING_TICKERS3 = []
                for row in MISSING_TICKERS2:
                    with open(working_dir+'master_file_joe.csv', 'r') as csv_file:
                        CSV_READER = csv.reader(csv_file, delimiter=',')
                        i = 0
                        for row2 in CSV_READER:
                            if row2[3] == row and i != 0: # skip titles at i = 0
                                row5 = []
                                row5.append(str(row2[3]))
                                row5.append(str(row2[4]))
                                if row5 not in MISSING_TICKERS3: # unique entries
                                    MISSING_TICKERS3.append(row5)
                            i += 1
                MISSING_TICKERS4 = []
                MISSING_TICKERS4.append(create_titles(['Tickers', 'Co. names']))
                MISSING_TICKERS4 += MISSING_TICKERS3
                WR.writerows(MISSING_TICKERS4)

        # build objects for missing ticker qqq
        #i = 0
        #for t in MISSING_TICKERS3:
        #    print(t)
        #    T = TickerNeighborhood(ticker=t[0])
        #    T.current_name = t[1]
        #    print(T)
        #    print(T.ticker)
        #    print(T.name)
        #    list_of_suggested_tickers_for_addendum=[]
        #    list_of_suggested_tickers_for_addendum
        #=T.analyze_the_neighborhood_of_T_while_keeping_in_mind_joes_master_table
        #('master_table_joe.csv')


    print('\n\ndownloading pricing data ... ', datetime.now())

    SECCODES = []
    with open(working_dir+'master_table.csv') as csv_file:
        CSV_READER = csv.reader(csv_file, delimiter=',')
        L = 0
        for row in CSV_READER:
            if row[0] not in SECCODES and L > 0: # skip titles, unique seccodes
                SECCODES.append(row[0])
            L += 1

    print('\n\ndistinct seccodes = ', len(SECCODES), datetime.now())
    print('\n\nprocessing ...', datetime.now())

    query = '''
    -- This query returns the fully adjusted Open, High, Low, and Close pricing data in local currency using the Ds2Primqtprc table for North American Equities

                    SELECT DISTINCT

                        A.SecCode
                ,       MR1.ID,MR1.NAME AS CURRNAME
                ,       G1.ISSUER AS PITISSUER,G1.EXCHANGE
                ,       MR1.Country
                ,       G1.StartDate
                ,       G1.EndDate
                ,       K1.TICKER
                ,       G1.EXCHANGE
                ,       I.ISSUER AS CURRENTISSUE
                ,       I.STATUS
                ,       I.SECTYPE AS CURRSECTYPE
                ,       C1.TotRet
                ,       C1.*

                FROM            SecMstrX                        A

                JOIN            SECMAPX             M

                                ON                  M.SECCODE = A.SECCODE
                                AND                 M.VenType = 1       -- IDC
                                AND                 TYPE_ = 1           -- NorthAmer Equity
                                AND                 M.EXCHANGE <> 2

                                -- AND M.EXCHANGE = 1 AND A.TYPE_ = 1
                                -- AND     M.RANK = 1   -- VIEW ALL OR CURRENT ONLY
                                -- AND     A.COUNTRY = 'USA' -- comment this out for ADR's

                JOIN            Prc.PrcTKChg                    K
                                ON                  M.VENCODE = K.Code

                JOIN            PRC.PRcsCCHG        G
                                ON                  G.CODE =    K.CODE
                                AND                 ISNULL(G.ENDDATE,'1/1/2059')
                                BETWEEN             K.STARTDATE AND ISNULL(K.ENDDATE,'1/1/2059')

                JOIN            PRC.PRCINFO         I
                                ON                  I.CODE =    G.CODE
                                AND                 I.SECTYPE   NOT IN ('X','P','E','I','S','U','W','0','7','T','Q','R','V')

                JOIN            SECMAPX             MP1
                                ON                  MP1.VENCODE =   I.CODE
                                AND                 MP1.RANK =      M.RANK
                                AND                 MP1.VENTYPE =   1
                                AND                 MP1.EXCHANGE =  M.EXCHANGE

                JOIN            SECMSTRX            MR1
                                ON                  MR1.SECCODE =   MP1.SECCODE
                                AND                 MR1.TYPE_ =     1

                JOIN            SECMAPX             MP2
                                ON                  MP2.SECCODE =   MR1.SECCODE
                                AND                 MP2.VENTYPE = 1
                                AND                 MP2.RANK =      M.RANK

                JOIN            PRC.PRCTKCHG        K1
                                ON                  K1.CODE =       MP2.VENCODE
                                --AND ISNULL(K1.ENDDATE,'1/1/2059') BETWEEN K.STARTDATE AND ISNULL(K.ENDDATE,'1/1/2059')

                JOIN            PRC.PRCSCCHG        G1
                                ON                  G1.CODE =       K1.CODE
                                AND                 ISNULL(G1.ENDDATE,'1/1/2059')
                                BETWEEN             K1.STARTDATE    AND     ISNULL(K1.ENDDATE,'1/1/2059')

                JOIN            PRC.PRCDLY          C1
                                ON                  C1.CODE =       G1.CODE

                WHERE

                                 A.SECCODE          IN ('''
#
    BLOCK_SIZE = int(len(SECCODES)/nblocks)+1
    with tqdm(total=nblocks, file=sys.stdout) as pbar:
        TABLE = []
        LIST = [[] for n in range(20750101)]  # one slot per int date YYYYMMDD; filled via idx = int(row[15].strftime('%Y%m%d')) below
        for seccodeblock in list(iterutils.chunked_iter(SECCODES, BLOCK_SIZE)):
            pbar.set_description('progress at %s' % time.strftime("%c"))
            pbar.update(1)
            query_SECCODES = ''
            print('\n\nseccodeblock = ', len(seccodeblock), datetime.now())
            for sc in seccodeblock:
                query_SECCODES += str(sc) + ''','''
            query_SECCODES = query_SECCODES[:-1]
            query_DATE = '''CAST(C1.Date_ AS DATETIME)>= \'''' + date_from + '''\''''
            COMPOSED_query = query +\
                            query_SECCODES + ''')\n\nAND\n\n''' +\
                            query_DATE + '''\n\nORDER BY C1.Date_'''
            with open(working_dir+'query_pricing_data.txt', 'w') as query_file:
                query_file.write(COMPOSED_query)
            keep_trying_to_query = True
            result = None
# the query might fail because the computer got moved to a different location,
# which resulted in IP change; in this case, try to re-open the connection, then re-do the query
            while keep_trying_to_query:
                try:
                    print('\n\ntrying to execute cursor_.execute(COMPOSED_query)...',
                          datetime.now())
                    cursor_.execute(COMPOSED_query)
                    try:
                        print('\n\ntrying to execute result = cursor_.fetchall()...',
                              datetime.now())
                        result = cursor_.fetchall()
                        keep_trying_to_query = False
                    except Exception as err:
                        try:
                            print('\n\nexception #5 for cursor_.execute(COMPOSED_query)',
                                  err, datetime.now())
                            print('\n\nexception #6 for result = cursor_.fetchall()',
                                  err, datetime.now())
                            cursor_.close()
                            cnxn.close()
                            print("\n\nre-opening server connection...", datetime.now())
                            cnxn = pyodbc.connect('DRIVER='+driver+
                                                  ';SERVER='+server+
                                                  ';PORT=1433;DATABASE='+database+
                                                  ';UID='+username+
                                                  ';PWD='+password)
                            cursor_ = cnxn.cursor()
                        except Exception as err:
                            print('\n\nexception #7 for reconnect', err, datetime.now())
                except Exception as err:
                    try:
                        print('\n\nexception #8 for cursor_.execute(COMPOSED_query)',
                              err, datetime.now())
                        print('\n\nexception #9 for result = cursor_.fetchall()',
                              err, datetime.now())
                        cursor_.close()
                        cnxn.close()
                        print("\n\nre-opening server connection...", datetime.now())
                        cnxn = pyodbc.connect('DRIVER='+driver+
                                              ';SERVER='+server+
                                              ';PORT=1433;DATABASE='+database+
                                              ';UID='+username+
                                              ';PWD='+password)
                        cursor_ = cnxn.cursor()
                    except Exception as err:
                        print('\n\nexception #10 for reconnect', err, datetime.now())
#
            if result is not None:
                print("\n\nquery produced %d rows" % len(result), datetime.now())
                for row in result:
                    row3 = []
                    row3.append(int(row[0]))            # SecCode
                    row3.append(row[8])                 # ticker
                    if row[15] is not None:
                        date1 = str(row[15])[:-9]       # market date
                        row3.append(date1)
                    else:
                        row3.append('-1.0')
                    if row[16] is not None:
                        row3.append(row[16])            # open
                    else:
                        row3.append('-1.0')
                    if row[17] is not None:
                        row3.append(row[17])            # high
                    else:
                        row3.append('-1.0')
                    if row[18] is not None:
                        row3.append(row[18])            # low
                    else:
                        row3.append('-1.0')
                    if row[19] is not None:
                        row3.append(row[19])            # unadjusted close
                    else:
                        row3.append('-1.0')
                    if row[20] is not None:
                        row3.append(row[20])            # volume
                    else:
                        row3.append('-1.0')
                    if row[21] is not None:
                        row3.append(row[21])            # TotRet
                    else:
                        row3.append('-1.0')
                    if row3 not in TABLE:
                        TABLE.append(row3)
                        idx = int(row[15].strftime('%Y%m%d'))
                        LIST[idx].append(row3)
#
    for i, it in enumerate(LIST):
        if it:
            s = str(i)
            year = s[:-4]
            month = s[4:-2]
            day = s[6:]
            date2 = year+'-'+month+'-'+day
            table1 = []
            table2 = []
            table2.append(create_titles([
                'SecCode'
                , 'Ticker'
                , 'Date'
                , 'Open'
                , 'High'
                , 'Low'
                , 'Close, unadjusted'
                , 'Volume'
                , 'Total return'
                ]))
            for _, item in enumerate(it):
                if item not in table1:
                    table1.append(item)
            table1 = sorted(table1, key=operator.itemgetter(0, 1))
            table2 += table1
            ofp = Path(dir_from_date(date2, 'y', working_dir)+'pricing_data_for_'+date2+'.csv')
            with open(ofp, 'a') as result_file:
                wr = csv.writer(result_file, dialect='excel')
                wr.writerows(table2)
#
#
    NOW = str(date.today())
    print('\n\ncompressing output and timestamping ... ', datetime.now())
    FILE_NAME = 'refinitiv_qa_direct_qai_master_and_pricing_tables_'+NOW
    print(FILE_NAME, datetime.now())
    shutil.make_archive(FILE_NAME, 'zip', working_dir)

    print('\n\nmoving the data to the timestamped repository ... ', datetime.now())
    SRC = cwd
    data_repo = os.path.join(SRC, 'RefinitivDataRepository')
    if not os.path.exists(data_repo):
        os.mkdir(data_repo)
    if not os.path.isdir(data_repo):
        raise Exception(f'Data repository is not a directory: {data_repo}')

    OUTPUT_FILE_STAGING_PATH = os.path.join(SRC, FILE_NAME+'.zip')
    OUTPUT_FILE_PATH = Path(os.path.join(data_repo, FILE_NAME+'.zip'))
    print('OUTPUT_FILE_STAGING_PATH = ', OUTPUT_FILE_STAGING_PATH,
          'OUTPUT_FILE_PATH', OUTPUT_FILE_PATH)
    if os.path.isfile(OUTPUT_FILE_STAGING_PATH):
        if os.path.isfile(OUTPUT_FILE_PATH):
            new_file_size = os.stat(OUTPUT_FILE_STAGING_PATH).st_size
            old_file_size = os.stat(OUTPUT_FILE_PATH).st_size
            print('\n\nnew zip size = ', new_file_size, '\told_file_size = ', old_file_size)
            if new_file_size > old_file_size:
                os.remove(OUTPUT_FILE_PATH)
                shutil.move(OUTPUT_FILE_STAGING_PATH, OUTPUT_FILE_PATH)
        else:
            shutil.move(OUTPUT_FILE_STAGING_PATH, OUTPUT_FILE_PATH)

    if email_notification_:
        print('\n\nemailing the confirmation and the link to compressed data to the author ... ',
              datetime.now())
        ALERT = '''This is to notify that a new compressed data set was
        uploaded to FORA google drive ...'''
        EMAIL = 'Alert time: ' + time.strftime("%c") +'\n' + ALERT
        CLIENT_EMAIL = ['*****@*****.**', '*****@*****.**']
        #    #{'*****@*****.**', '*****@*****.**', '*****@*****.**'}
#    MESSAGE = create_message('*****@*****.**',\
 #                            CLIENT_EMAIL, 'Completion alert', EMAIL)
        yagmail.SMTP('*****@*****.**').send(CLIENT_EMAIL, 'Completion alert', EMAIL)
        print('\n\nemailed to the user:\n'+ALERT, datetime.now())

    print('\n\nexiting ... ', datetime.now())
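
# The retry loop above re-opens the pyodbc connection whenever the query or the fetch
# fails (for example after an IP change) and then re-runs the same query. Below is a
# minimal, hedged sketch of that pattern factored into a helper; connect_string,
# max_attempts and wait_seconds are illustrative assumptions, not part of the original script.
import time
from datetime import datetime

import pyodbc


def fetch_with_reconnect(cnxn, cursor_, query, connect_string,
                         max_attempts=5, wait_seconds=10):
    """Run `query`, reconnecting and retrying on failure; return (cnxn, cursor_, rows)."""
    for attempt in range(1, max_attempts + 1):
        try:
            cursor_.execute(query)
            return cnxn, cursor_, cursor_.fetchall()
        except Exception as err:
            print('query attempt %d failed:' % attempt, err, datetime.now())
            try:
                cursor_.close()
                cnxn.close()
            except Exception:
                pass  # the old connection may already be unusable
            time.sleep(wait_seconds)
            print('re-opening server connection...', datetime.now())
            cnxn = pyodbc.connect(connect_string)
            cursor_ = cnxn.cursor()
    raise RuntimeError('query failed after %d attempts' % max_attempts)
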
Exemplo n.º 22
0
        def create_graph(client=None):
            # NOTE ON CHUNKING SIGNATURES:
            # Chunking the gene signatures might not be necessary anymore because the overhead of the dask
            # scheduler is minimal (cf. blog http://matthewrocklin.com/blog/work/2016/05/05/performant-task-scheduling).
            # The original reasoning behind the decision to implement this was the refuted assumption that fast-executing tasks
            # would greatly be impacted by scheduler overhead. The performance gain introduced by chunking of signatures
            # seemed to corroborate this assumption. However, the benefit was through less pickling and unpickling of
            # the motif annotations dataframe as this was not wrapped in a delayed() construct.
            # When using a distributed scheduler chunking even has a negative impact and is therefore overruled. The
            # negative impact is due to having these large chunks to be shipped to different workers across cluster nodes.

            # NOTE ON BROADCASTING DATASET:
            # There are three large pieces of data that need to be orchestrated between client/scheduler and workers:
            # 1. In a cluster the motif annotations need to be broadcasted to all nodes. Otherwise
            # the motif annotations need to be wrapped in a delayed() construct to avoid needless pickling and
            # unpickling between processes.
            def wrap(data):
                return client.scatter(data, broadcast=True) if client else delayed(data, pure=True)
            delayed_or_future_annotations = wrap(motif_annotations)
            # 2. The databases: these database objects are typically proxies to the data on disk. They only have
            # the name and location on shared storage as fields. For consistency reasons we do broadcast these database
            # objects to the workers. If we decide to have all information of a database loaded into memory we can still
            # safely use clusters.
            #def memoize(db: Type[RankingDatabase]) -> Type[RankingDatabase]:
            #    return MemoryDecorator(db)
            #delayed_or_future_dbs = list(map(wrap, map(memoize, rnkdbs)))
            # Check also latest Stackoverflow message: https://stackoverflow.com/questions/50795901/dask-scatter-broadcast-a-list
            delayed_or_future_dbs = list(map(wrap, rnkdbs))
            # 3. The gene signatures: these signatures become large when chunking them, therefore chunking is overruled
            # when using dask.distributed.
            # See earlier.

            # NOTE ON SHARING RANKING DATABASES ACROSS NODES:
            # Because the frontnodes of the VSC share the staging disk, these databases can be accessed from all nodes
            # in the cluster and can all use the same path in the configuration file. The RankingDatabase objects shared
            # from scheduler to workers can therefore just contain information on the database file location.
            # There might be a need to be able to run on clusters that do not share a network drive. This can be
            # achieved by loading all data in from the scheduler and using the broadcasting system to share data
            # across nodes. The only element that needs to be adapted to cater for this need is loading the databases
            # in memory on the scheduler via the already available MemoryDecorator for databases. But make sure to
            # adapt memory limits for workers to avoid "distributed.nanny - WARNING - Worker exceeded 95% memory budget.".

            # NOTE ON REMOVING I/O CONTENTION:
            # A potential improvement to reduce I/O contention for this shared drive (accessing the ranking
            # database) would be to load the database in memory (using the available decorator) for each task.
            # The penalty of loading the database in memory should be shared across multiple gene signatures, so
            # in this case chunking of gene signatures is mandatory to avoid severe performance penalties.
            # However, because the memory need of a node running pyscenic is already high (i.e. pre-allocation
            # of recovery curves - 20K features (max. enriched) * rank_threshold * 8 bytes (float) * num_cores),
            # this might not be a sound idea to do.
            # Another approach to overcome the I/O bottleneck in a clustered infrastructure is to assign each cluster
            # to a different database which is achievable in the dask framework. This approach has of course many
            # limitations: for 6 databases you need at least 6 cores and you cannot take advantage of more
            # (http://distributed.readthedocs.io/en/latest/locality.html)

            # NOTE ON REMAINING WARNINGS:
            # >> distributed.worker - WARNING - Memory use is high but worker has no data to store to disk.
            # >> Perhaps some other process is leaking memory?  Process memory: 1.51 GB -- Worker memory limit: 2.15 GB
            # My current idea is that this cannot be avoided: processing a single module can sometimes require a
            # substantial amount of memory because of pre-allocation of recovery curves (see code notes on how to
            # mitigate this problem). Setting module_chunksize=1 also limits this problem.
            #
            # >> distributed.utils_perf - WARNING - full garbage collections took 10% CPU time recently (threshold: 10%)
            # The current implementation of module2df frees substantial amounts of memory (i.e. the RCCs), so this might
            # again be unavoidable. TBI + See following stackoverflow question:
            # https://stackoverflow.com/questions/47776936/why-is-a-computation-much-slower-within-a-dask-distributed-worker

            return aggregate_func(
                        (delayed(transform_func)
                            (db, gs_chunk, delayed_or_future_annotations)
                                for db in delayed_or_future_dbs
                                    for gs_chunk in chunked_iter(modules, module_chunksize)))
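
# The wrap() helper above either scatters the motif annotations to all workers (when a
# distributed Client is available) or wraps them in delayed() so the local scheduler
# pickles them only once. A minimal sketch of that decision applied to a toy pandas
# DataFrame follows; the data and the commented-out scheduler address are assumptions
# used only for illustration.
import pandas as pd
from dask import delayed


def wrap_for_workers(data, client=None):
    # With a dask.distributed Client: scatter once and broadcast the Future to every worker.
    # Without one: wrap in delayed() so the data is not re-pickled for every task.
    return client.scatter(data, broadcast=True) if client else delayed(data, pure=True)


if __name__ == '__main__':
    annotations = pd.DataFrame({'motif': ['m1', 'm2'], 'tf': ['TF1', 'TF2']})
    local_handle = wrap_for_workers(annotations)          # a dask Delayed object
    # On a cluster (address is an assumption):
    # from distributed import Client
    # remote_handle = wrap_for_workers(annotations, Client('tcp://scheduler-host:8786'))
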
Exemplo n.º 23
0
def _distributed_calc(rnkdbs: Sequence[Type[RankingDatabase]], modules: Sequence[Type[GeneSignature]],
                      motif_annotations_fname: str,
                      transform_func: Callable[[Type[RankingDatabase], Sequence[Type[GeneSignature]], str], T],
                      aggregate_func: Callable[[Sequence[T]], T],
                      motif_similarity_fdr: float = 0.001, orthologuous_identity_threshold: float = 0.0,
                      client_or_address='dask_multiprocessing',
                      num_workers=None, module_chunksize=100) -> T:
    """
    Perform a parallelized or distributed calculation, either pruning targets or finding enriched motifs.

    :param rnkdbs: A sequence of ranking databases.
    :param modules: A sequence of gene signatures.
    :param motif_annotations_fname: The filename of the motif annotations to use.
    :param transform_func: A function having a signature (Type[RankingDatabase], Sequence[Type[GeneSignature]], str) and
        that returns Union[Sequence[Regulon], pandas.DataFrame].
    :param aggregate_func: A function having a signature:
        - (Sequence[pandas.DataFrame]) => pandas.DataFrame
        - (Sequence[Sequence[Regulon]]) => Sequence[Regulon]
    :param motif_similarity_fdr: The maximum False Discovery Rate to find factor annotations for enriched motifs.
    :param orthologuous_identity_threshold: The minimum orthologuous identity to find factor annotations
        for enriched motifs.
    :param client_or_address: The client or IP address of the scheduler when working with dask. For local multi-core
        systems 'custom_multiprocessing' or 'dask_multiprocessing' can be supplied.
    :param num_workers: If not using a cluster, the number of workers to use for the calculation.
        If None, all available CPUs will be used.
    :param module_chunksize: The size of the chunk in signatures to use when using the dask framework with the
        multiprocessing scheduler.
    :return: A pandas dataframe or a sequence of regulons (depends on aggregate function supplied).
    """
    def is_valid(client_or_address):
        if isinstance(client_or_address, str) and ((client_or_address in
                                                    {"custom_multiprocessing", "dask_multiprocessing", "local"})
                                                   or IP_PATTERN.fullmatch(client_or_address)):
            return True
        elif isinstance(client_or_address, Client):
            return True
        return False
    assert is_valid(client_or_address), "\"{}\" is not valid for parameter client_or_address.".format(client_or_address)

    if client_or_address not in {'custom_multiprocessing', 'dask_multiprocessing'}:
        module_chunksize = 1

    # Make sure warnings and info are being logged.
    if not len(LOGGER.handlers):
        LOGGER.addHandler(create_logging_handler(False))
        if LOGGER.getEffectiveLevel() > logging.INFO:
            LOGGER.setLevel(logging.INFO)

    if client_or_address == 'custom_multiprocessing': # CUSTOM parallelized implementation.
        # This implementation overcomes the I/O-bounded performance. Each worker (subprocess) loads a dedicated ranking
        # database and motif annotation table into its own memory space before consuming module. The implementation of
        # each worker uses the AUC-first numba JIT based implementation of the algorithm.
        assert len(rnkdbs) <= (num_workers if num_workers else cpu_count()), "The number of databases is larger than the number of cores."
        amplifier = int((num_workers if num_workers else cpu_count())/len(rnkdbs))
        LOGGER.info("Using {} workers.".format(len(rnkdbs) * amplifier))
        receivers = []
        for db in rnkdbs:
            for idx, chunk in enumerate(chunked_iter(modules, ceil(len(modules)/float(amplifier)))):
                sender, receiver = Pipe()
                receivers.append(receiver)
                Worker("{}({})".format(db.name, idx+1), db, chunk, motif_annotations_fname, sender,
                       motif_similarity_fdr, orthologuous_identity_threshold, transform_func).start()
        # Retrieve the name of the temporary file to which the data is stored. This is a blocking operation.
        fnames = [recv.recv() for recv in receivers]
        # Load all data from disk and concatenate.
        def load(fname):
            with open(fname, 'rb') as f:
                return pickle.load(f)
        try:
            return aggregate_func(list(map(load, fnames)))
        finally:
            # Remove temporary files.
            for fname in fnames:
                os.remove(fname)
    else: # DASK framework.
        # Load motif annotations.
        motif_annotations = load_motif_annotations(motif_annotations_fname,
                                                   motif_similarity_fdr=motif_similarity_fdr,
                                                   orthologous_identity_threshold=orthologuous_identity_threshold)

        # Create dask graph.
        def create_graph(client=None):
            # NOTE ON CHUNKING SIGNATURES:
            # Chunking the gene signatures might not be necessary anymore because the overhead of the dask
            # scheduler is minimal (cf. blog http://matthewrocklin.com/blog/work/2016/05/05/performant-task-scheduling).
            # The original reasoning behind the decision to implement this was the refuted assumption that fast-executing tasks
            # would greatly be impacted by scheduler overhead. The performance gain introduced by chunking of signatures
            # seemed to corroborate this assumption. However, the benefit was through less pickling and unpickling of
            # the motif annotations dataframe as this was not wrapped in a delayed() construct.
            # When using a distributed scheduler chunking even has a negative impact and is therefore overruled. The
            # negative impact is due to having these large chunks to be shipped to different workers across cluster nodes.

            # NOTE ON BROADCASTING DATASET:
            # There are three large pieces of data that need to be orchestrated between client/scheduler and workers:
            # 1. In a cluster the motif annotations need to be broadcasted to all nodes. Otherwise
            # the motif annotations need to be wrapped in a delayed() construct to avoid needless pickling and
            # unpickling between processes.
            def wrap(data):
                return client.scatter(data, broadcast=True) if client else delayed(data, pure=True)
            delayed_or_future_annotations = wrap(motif_annotations)
            # 2. The databases: these database objects are typically proxies to the data on disk. They only have
            # the name and location on shared storage as fields. For consistency reasons we do broadcast these database
            # objects to the workers. If we decide to have all information of a database loaded into memory we can still
            # safely use clusters.
            #def memoize(db: Type[RankingDatabase]) -> Type[RankingDatabase]:
            #    return MemoryDecorator(db)
            #delayed_or_future_dbs = list(map(wrap, map(memoize, rnkdbs)))
            # Check also latest Stackoverflow message: https://stackoverflow.com/questions/50795901/dask-scatter-broadcast-a-list
            delayed_or_future_dbs = list(map(wrap, rnkdbs))
            # 3. The gene signatures: these signatures become large when chunking them, therefore chunking is overruled
            # when using dask.distributed.
            # See earlier.

            # NOTE ON SHARING RANKING DATABASES ACROSS NODES:
            # Because the frontnodes of the VSC share the staging disk, these databases can be accessed from all nodes
            # in the cluster and can all use the same path in the configuration file. The RankingDatabase objects shared
            # from scheduler to workers can therefore just contain information on the database file location.
            # There might be a need to be able to run on clusters that do not share a network drive. This can be
            # achieved by loading all data in from the scheduler and using the broadcasting system to share data
            # across nodes. The only element that needs to be adapted to cater for this need is loading the databases
            # in memory on the scheduler via the already available MemoryDecorator for databases. But make sure to
            # adapt memory limits for workers to avoid "distributed.nanny - WARNING - Worker exceeded 95% memory budget.".

            # NOTE ON REMOVING I/O CONTENTION:
            # A potential improvement to reduce I/O contention for this shared drive (accessing the ranking
            # database) would be to load the database in memory (using the available decorator) for each task.
            # The penalty of loading the database in memory should be shared across multiple gene signatures, so
            # in this case chunking of gene signatures is mandatory to avoid severe performance penalties.
            # However, because the memory need of a node running pyscenic is already high (i.e. pre-allocation
            # of recovery curves - 20K features (max. enriched) * rank_threshold * 8 bytes (float) * num_cores),
            # this might not be a sound idea to do.
            # Another approach to overcome the I/O bottleneck in a clustered infrastructure is to assign each cluster
            # to a different database which is achievable in the dask framework. This approach has of course many
            # limitations: for 6 databases you need at least 6 cores and you cannot take advantage of more
            # (http://distributed.readthedocs.io/en/latest/locality.html)

            # NOTE ON REMAINING WARNINGS:
            # >> distributed.worker - WARNING - Memory use is high but worker has no data to store to disk.
            # >> Perhaps some other process is leaking memory?  Process memory: 1.51 GB -- Worker memory limit: 2.15 GB
            # My current idea is that this cannot be avoided: processing a single module can sometimes require a
            # substantial amount of memory because of pre-allocation of recovery curves (see code notes on how to
            # mitigate this problem). Setting module_chunksize=1 also limits this problem.
            #
            # >> distributed.utils_perf - WARNING - full garbage collections took 10% CPU time recently (threshold: 10%)
            # The current implementation of module2df frees substantial amounts of memory (i.e. the RCCs), so this might
            # again be unavoidable. TBI + See following stackoverflow question:
            # https://stackoverflow.com/questions/47776936/why-is-a-computation-much-slower-within-a-dask-distributed-worker

            return aggregate_func(
                        (delayed(transform_func)
                            (db, gs_chunk, delayed_or_future_annotations)
                                for db in delayed_or_future_dbs
                                    for gs_chunk in chunked_iter(modules, module_chunksize)))

        # Compute dask graph ...
        if client_or_address == "dask_multiprocessing":
            # ... via multiprocessing.
            return create_graph().compute(scheduler='processes', num_workers=num_workers if num_workers else cpu_count())
        else:
            # ... via dask.distributed framework.
            client, shutdown_callback = _prepare_client(client_or_address, num_workers=num_workers if num_workers else cpu_count())
            try:
                return client.compute(create_graph(client), sync=True)
            finally:
                shutdown_callback(False)
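
# In the 'custom_multiprocessing' branch above, the available cores are split evenly
# over the ranking databases and the modules are split into one chunk per extra worker.
# The small, self-contained sketch below walks through that arithmetic with made-up
# numbers; 'db1'/'db2' and the fake signature list stand in for the real RankingDatabase
# and GeneSignature objects.
from math import ceil

from boltons.iterutils import chunked_iter

num_workers = 8
rnkdbs = ['db1', 'db2']                       # stand-ins for RankingDatabase objects
modules = ['sig%d' % i for i in range(25)]    # stand-ins for gene signatures

amplifier = int(num_workers / len(rnkdbs))            # 4 workers per database
chunk_size = ceil(len(modules) / float(amplifier))    # 7 signatures per chunk

for db in rnkdbs:
    for idx, chunk in enumerate(chunked_iter(modules, chunk_size)):
        # In the real code each (db, chunk) pair is handed to a Worker subprocess
        # together with the motif annotations file name.
        print('{}({})'.format(db, idx + 1), len(chunk))
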
Exemplo n.º 24
0
def invoke_semgrep(
    config_specifier: str,
    committed_datetime: Optional[datetime],
    base_commit_ref: Optional[str],
    semgrep_ignore: TextIO,
) -> FindingSets:
    debug_echo("=== adding semgrep configuration")

    workdir = Path.cwd()
    targets = TargetFileManager(
        base_path=workdir,
        base_commit=base_commit_ref,
        paths=[workdir],
        ignore_rules_file=semgrep_ignore,
    )

    config_args = ["--config", config_specifier]

    debug_echo("=== seeing if there are any findings")
    finding_set = FindingSets()

    with targets.current_paths() as paths:
        click.echo("=== looking for current issues in " +
                   unit_len(paths, "file"),
                   err=True)
        for chunk in chunked_iter(paths, PATHS_CHUNK_SIZE):
            args = ["--skip-unknown-extensions", "--json", *config_args]
            for path in chunk:
                args.append(path)
            count = 0
            for result in json.loads(str(semgrep(*args)))["results"]:
                finding_set.update_current(result, committed_datetime)
                count += 1
            click.echo(
                f"| {count} {cardinalize('current issue', count)} found",
                err=True)

    if not finding_set.has_current_issues():
        click.echo(
            "=== not looking at pre-existing issues since there are no current issues",
            err=True,
        )
    else:
        with targets.baseline_paths() as paths:
            if paths:
                paths_with_findings = finding_set.paths_with_current_findings()
                paths_to_check = set(str(path)
                                     for path in paths) & paths_with_findings
                click.echo(
                    "=== looking for pre-existing issues in " +
                    unit_len(paths_to_check, "file"),
                    err=True,
                )
                count = 0
                for chunk in chunked_iter(paths_to_check, PATHS_CHUNK_SIZE):
                    args = [
                        "--skip-unknown-extensions", "--json", *config_args
                    ]
                    for path in chunk:
                        args.append(path)
                    for result in json.loads(str(semgrep(*args)))["results"]:
                        finding_set.update_baseline(result, committed_datetime)
                        count += 1
                click.echo(
                    f"| {count} {cardinalize('pre-existing issue', count)} found",
                    err=True,
                )

    if os.getenv("INPUT_GENERATESARIF"):
        # FIXME: This will crash when running on thousands of files due to command length limit
        click.echo("=== re-running scan to generate a SARIF report", err=True)
        sarif_path = Path("semgrep.sarif")
        with targets.current_paths() as paths, sarif_path.open(
                "w") as sarif_file:
            args = ["--sarif", *config_args]
            for path in paths:
                args.extend(["--include", path])
            semgrep(*args, _out=sarif_file)
        rewrite_sarif_file(sarif_path)

    return finding_set
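
# The two scanning loops above pass each chunk of paths to a separate semgrep
# invocation; chunking with PATHS_CHUNK_SIZE keeps every command line under the
# operating system's argument-length limit (the SARIF step skips this, hence the
# FIXME). A hedged sketch of the same chunking idea follows; run_semgrep and the
# chunk size are illustrative stand-ins, not the real helpers.
from boltons.iterutils import chunked_iter

PATHS_CHUNK_SIZE = 10_000  # assumed value for illustration


def run_semgrep(args):
    # Placeholder for the real invocation (e.g. via the sh-based semgrep wrapper or subprocess).
    print('semgrep called with %d arguments' % len(args))
    return []


def scan_in_chunks(config_args, paths):
    results = []
    for chunk in chunked_iter(paths, PATHS_CHUNK_SIZE):
        args = ['--skip-unknown-extensions', '--json', *config_args, *chunk]
        results.extend(run_semgrep(args))
    return results


scan_in_chunks(['--config', 'rules.yaml'], ['src/a.py', 'src/b.py'])
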
Exemplo n.º 25
0
 def batches(self, size):
     """Iterate all batches.
     """
     for abstracts in chunked_iter(self.abstracts, size):
         yield Batch(abstracts)
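
# For context, a minimal usage sketch of a batches() method like the one above; the
# Corpus and Batch classes here are toy stand-ins for the original ones, assumed
# purely for illustration.
from boltons.iterutils import chunked_iter


class Batch(list):
    """Toy stand-in for the real Batch class."""


class Corpus:
    def __init__(self, abstracts):
        self.abstracts = abstracts

    def batches(self, size):
        """Iterate all batches."""
        for abstracts in chunked_iter(self.abstracts, size):
            yield Batch(abstracts)


corpus = Corpus(['abstract %d' % i for i in range(10)])
print([len(b) for b in corpus.batches(4)])   # [4, 4, 2]
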
Exemplo n.º 26
0
def invoke_semgrep(
    semgrep_args: List[str],
    targets: List[str],
    *,
    timeout: Optional[int],
    baseline: bool = False,
    explicit_semgrepignore_path: Optional[str] = None,
) -> Tuple[int, SemgrepOutput]:
    """
    Call semgrep passing in semgrep_args + targets as the arguments
    Also, save semgrep output as a list of json blobs in SEMGREP_SAVE_FILE
    to help debugging. Baseline scan output will be saved separately with
    the "_baseline" suffix.

    Returns the maximum semgrep exit code together with the combined SemgrepOutput
    """
    max_exit_code = 0
    output = SemgrepOutput([], [], SemgrepTiming([], []))
    _env = ({
        "SEMGREP_R2C_INTERNAL_EXPLICIT_SEMGREPIGNORE":
        explicit_semgrepignore_path,
        **os.environ,
    } if explicit_semgrepignore_path else os.environ)

    semgrep_save_file_baseline = Path(SEMGREP_SAVE_FILE_BASELINE)
    if not baseline and semgrep_save_file_baseline.exists():
        semgrep_save_file_baseline.unlink()

    semgrep_save_file_path = (SEMGREP_SAVE_FILE_BASELINE
                              if baseline else SEMGREP_SAVE_FILE)
    semgrep_save_file = open(semgrep_save_file_path, "w+")
    semgrep_save_file.write("[")

    first_chunk = True

    for chunk in chunked_iter(targets, PATHS_CHUNK_SIZE):
        with tempfile.NamedTemporaryFile("w") as output_json_file:
            args = semgrep_args.copy()
            args.extend(["--debug"])
            args.extend([
                "-o",
                output_json_file.
                name,  # nosem: python.lang.correctness.tempfile.flush.tempfile-without-flush
            ])
            for c in chunk:
                args.append(c)

            debug_echo(f"== Invoking semgrep with { len(args) } args")

            exit_code = semgrep_exec(*args,
                                     _timeout=timeout,
                                     _err=debug_echo,
                                     _env=_env).exit_code
            max_exit_code = max(max_exit_code, exit_code)

            debug_echo(f"== Semgrep finished with exit code { exit_code }")

            with open(
                    output_json_file.
                    name  # nosem: python.lang.correctness.tempfile.flush.tempfile-without-flush
            ) as f:
                semgrep_output = f.read()
            parsed_output = json.loads(semgrep_output)
            if first_chunk:
                first_chunk = False
            else:
                semgrep_save_file.write(",")
            semgrep_save_file.write(semgrep_output)

            output.results = [*output.results, *parsed_output["results"]]
            output.errors = [*output.errors, *parsed_output["errors"]]
            parsed_timing = parsed_output.get("time", {})
            output.timing = SemgrepTiming(
                parsed_timing.get("rules", output.timing.rules),
                [*output.timing.targets, *parsed_timing.get("targets", [])],
            )

    semgrep_save_file.write("]")
    semgrep_save_file.close()

    return max_exit_code, output
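
# The save-file handling above writes one JSON array across all chunks by emitting
# '[' first, a ',' between each chunk's raw output, and ']' at the end, so the debug
# file stays valid JSON without holding every chunk in memory. A small sketch of that
# pattern follows; the file name and the canned chunk strings are assumptions.
import json

SAVE_FILE = 'semgrep_output_chunks.json'   # assumed name for illustration

raw_chunks = ['{"results": [], "errors": []}',
              '{"results": [{"check_id": "demo"}], "errors": []}']

with open(SAVE_FILE, 'w') as save_file:
    save_file.write('[')
    first_chunk = True
    for raw_json in raw_chunks:
        if first_chunk:
            first_chunk = False
        else:
            save_file.write(',')
        save_file.write(raw_json)
    save_file.write(']')

with open(SAVE_FILE) as f:
    print(len(json.load(f)))   # 2 -- the file parses as a list of per-chunk outputs
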
Exemplo n.º 27
0
 def chunks(self, size): return KpaIterable(iterutils.chunked_iter(self, size))
 def windows(self, size): return KpaIterable(iterutils.windowed_iter(self, size))
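
# The two wrappers above expose boltons' chunked_iter and windowed_iter; the short
# sketch below contrasts the two (non-overlapping chunks versus overlapping sliding
# windows) on a small list, purely for illustration.
from boltons import iterutils

data = [1, 2, 3, 4, 5]

# chunked_iter: consecutive, non-overlapping groups (the last one may be short).
print(list(iterutils.chunked_iter(data, 2)))    # [[1, 2], [3, 4], [5]]

# windowed_iter: overlapping sliding windows of exactly `size` items.
print(list(iterutils.windowed_iter(data, 2)))   # [(1, 2), (2, 3), (3, 4), (4, 5)]
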
def main(start_date_,
         working_dir_,
         nblocks_,
         email_notification_,
         top_,
         archive=False):
    """
    The parametrized main function for CLI in the cloud
    """
    # use the following commands on Mac:
    #  git status; git pull; git add test.py;
    # git commit -m "Art's SQL/Python script update"; git push
    # to update the script in the cloud
    # /anaconda3/bin/python test.py
    # --start-date 1990-01-01 --working-directory ./temp/ --nblocks 100 --archive
    # to launch the script (--email-notification -- another flag)
    # on Mac terminal from the dir where you have test.py
    # command line arguments; use the comments below as an example
    # TOP = 10000000
    # reduce the TOP value to 10 for debugging; set it to None for a full run
    # DATE = '2017-01-01' --  'from' parameter for historical pricing data
    # WORKING_DIR = './refinitiv_qa_direct_qai_master_and_pricing_tables/'\
    #    +str(time.strftime("%Y-%m-%d"))+'/'
    # dir where all outputs go; it can be dated as above
    # NBLOCKS = 100
    # pricing data queries are very long; they need to be partitioned into blocks
    # as a separate project, optimize queries
    #
    #
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-statements
    # pylint: disable=too-many-locals
    # pylint: disable=too-many-arguments
    #
    top = top_
    date_from = start_date_
    nblocks = nblocks_
    cwd = os.path.realpath(
        os.path.dirname(__file__))  # instead of os.getcwd(), which is './'
    working_dir = working_dir_
    # empty the whole working dir
    for root, dirs, files in os.walk(working_dir):
        for f_f in files:
            os.unlink(os.path.join(root, f_f))
        for d_d in dirs:
            shutil.rmtree(os.path.join(root, d_d))
    shutil.copy(os.path.join(cwd, 'master_file_joe.csv'), working_dir)
    #
    database = 'qai'
    server = 'cd5m7wkqacpdeus2mia12301.public.dabc3424290b.database.windows.net,3342'
    username = '******'
    password = '******'
    #Authentication: SQL Server Authentication
    # NOTE: The following works on a Mac with the MSSQL 13 driver installed - it is here as the
    # default because Art's Anaconda environment doesn't show a non-empty list of drivers from
    # pyodbc
    driver = '/usr/local/lib/libmsodbcsql.13.dylib'  # '{ODBC Driver 13 for SQL Server}'
    drivers = [item for item in pyodbc.drivers()]
    if drivers:
        driver = drivers[0]
#print('driver:{}'.format(driver))
#
    cnxn = pyodbc.connect('DRIVER=' + driver + ';SERVER=' + server +
                          ';PORT=1433;DATABASE=' + database + ';UID=' +
                          username + ';PWD=' + password)
    cursor_ = cnxn.cursor()
    refinitiv_data_n_columns = 8
    s_s = ""
    if top is not None:
        s_s = ''' TOP ''' + str(top)
    query = '''SELECT''' + s_s + '''
                        A.SecCode                           -- SecCode -- 0
              --  ,       MR1.ID
                ,       MR1.NAME            AS CURRNAME     -- current name -- 1
                ,       G1.ISSUER           AS PITISSUER    -- point-in-time name -- 2
               -- ,       G1.EXCHANGE
                ,       MR1.Country                         -- country -- 3
                ,       G1.StartDate                        -- from -- 4
                ,       G1.EndDate                          -- to -- 5
                ,       K1.TICKER                           -- ticker -- 6
             --   ,       G1.EXCHANGE
              --  ,       I.ISSUER            AS CURRENTISSUE
                --,       I.STATUS
                ,       I.SECTYPE           AS CURRSECTYPE  -- type --7

                FROM            SecMstrX                        A

                JOIN            SECMAPX             M
                                ON                  M.SECCODE = A.SECCODE
                                AND                 M.VenType = 1       -- IDC
                                AND                 TYPE_ = 1           -- NorthAmer Equity
                                AND                 M.EXCHANGE <> 2

                                -- AND     M.RANK = 1   -- VIEW ALL (commented out) OR CURRENT ONLY
                                -- AND     A.COUNTRY = 'USA' -- comment this out for ADR's

                JOIN            Prc.PrcTKChg                    K
                                ON                  M.VENCODE = K.Code

                JOIN            PRC.PRcsCCHG        G
                                ON                  G.CODE =    K.CODE
                                AND                 ISNULL(G.ENDDATE,'1/1/2059')
                                BETWEEN             K.STARTDATE AND ISNULL(K.ENDDATE,'1/1/2059')

                --JOIN PRCCODE2 Y
                --ON Y.TYPE_ = 2 AND ASCII(G.EXCHANGE) = Y.CODE

                JOIN            PRC.PRCINFO         I
                                ON                  I.CODE =    G.CODE
                                AND                 I.SECTYPE   NOT IN ('X','P','E','I','S','U','W','0','7','T','Q','R','V')

                JOIN            SECMAPX             MP1
                                ON                  MP1.VENCODE =   I.CODE
                                AND                 MP1.RANK =      M.RANK
                                AND                 MP1.VENTYPE =   1
                                AND                 MP1.EXCHANGE =  M.EXCHANGE

                JOIN            SECMSTRX            MR1
                                ON                  MR1.SECCODE =   MP1.SECCODE
                                AND                 MR1.TYPE_ =     1

                JOIN            SECMAPX             MP2
                                ON                  MP2.SECCODE =   MR1.SECCODE
                                AND                 MP2.VENTYPE = 1
                                AND                 MP2.RANK =      M.RANK
                JOIN            PRC.PRCTKCHG        K1
                                ON                  K1.CODE =       MP2.VENCODE
                                --AND ISNULL(K1.ENDDATE,'1/1/2059') BETWEEN K.STARTDATE AND ISNULL(K.ENDDATE,'1/1/2059')

                JOIN            PRC.PRCSCCHG        G1
                                ON                  G1.CODE =       K1.CODE
                                AND                 ISNULL(G1.ENDDATE,'1/1/2059')
                                BETWEEN             K1.STARTDATE    AND     ISNULL(K1.ENDDATE,'1/1/2059')

                 GROUP BY       A.SecCode
                 ,              MR1.ID
                 ,              MR1.NAME
                 ,              G1.ISSUER
                 ,              G1.EXCHANGE
                 ,              MR1.Country
                 ,              G1.StartDate
                 ,              G1.EndDate
                 ,              K1.TICKER
                 ,              G1.EXCHANGE
                 ,              I.ISSUER
                 ,              I.STATUS
                 ,              I.SECTYPE

                 ORDER BY       MR1.ID
                 ,              G1.STARTDATE
                 '''
    # output the query string to a file
    with open(os.path.join(working_dir, 'query_master_table.txt'),
              "w") as query_file:
        query_file.write(query)
    print('\n\nexecuting the query ... ', datetime.now())
    try:
        print('\n\ntrying to execute cursor_.execute(query) ...',
              datetime.now())
        cursor_.execute(query)
    except Exception as err:
        print('\n\nexception #1 for cursor_.execute(query)', err,
              datetime.now())
    print('\n\nfetching query result ... ', datetime.now())
    try:
        print('\n\ntrying to execute result = cursor_.fetchall()...',
              datetime.now())
        result = cursor_.fetchall()
    except Exception as err:
        print('\n\nexception #2 for result = cursor_.fetchall()', err,
              datetime.now())

    tickers = []
    print('\n\nwriting .csv file (master table) ... ', datetime.now())
    with tqdm(total=len(result), file=sys.stdout) as pbar:
        table_master = []
        table_merged = []
        for row in result:
            pbar.set_description('progress at %s' % datetime.now())
            pbar.update(1)
            row1 = []
            row3 = []
            # A.SecCode                           -- SecCode -- 0
            #--  ,       MR1.ID
            #    ,       MR1.NAME            AS CURRNAME     -- current name -- 1
            #    ,       G1.ISSUER           AS PITISSUER    -- point-in-time name -- 2
            #--  ,       G1.EXCHANGE
            #    ,       MR1.Country                         -- country -- 3
            #    ,       G1.StartDate                        -- from -- 4
            #    ,       G1.EndDate                          -- to -- 5
            #    ,       K1.TICKER                           -- ticker -- 6
            #--  ,       G1.EXCHANGE
            #--  ,       I.ISSUER            AS CURRENTISSUE
            #--  ,       I.STATUS
            #    ,       I.SECTYPE           AS CURRSECTYPE  -- type --7
            #
            date_to = datetime.date(datetime.now())
            if row[5] is not None:  # to
                date_to = datetime.date(row[5])
            else:
                date_to = datetime.date(datetime.now())
            if date_to > datetime.date(datetime.now()):
                date_to = datetime.date(datetime.now())
#
            row1.append(str(row[6]))  # ticker
            tickers.append(row[6])
            row1.append(str(row[2]))  # point-in-time name
            row1.append(str(date_to))  # to
            #
            row1.append(str(row[0]))  # SecCode
            row3.append(int(row[0]))  # int for sorting
            row1.append(datetime.date(row[4]))  # from
            row3.append(datetime.date(row[4]))
            row1.append(date_to)  # to
            row3.append(date_to)
            row1.append(str(row[2]))  # point-in-time name
            row3.append(str(row[2]))
            row1.append(str(row[6]))  # ticker
            row3.append(str(row[6]))
            row1.append(str(row[3]))  # country
            row3.append(str(row[3]))
            row1.append(str(row[1]))  # current name
            row3.append(str(row[1]))
            row1.append(str(row[7]))  # type
            row3.append(str(row[7]))
            if row1 not in table_merged:
                table_merged.append(row1)
            if row3 not in table_merged:
                table_master.append(row3)

        with open(os.path.join(working_dir, 'master_table.csv'),
                  'w') as result_file:
            table_master1 = []
            table_master1.append(
                create_titles([
                    'SecCode', 'From', 'To', 'Point-in-time name', 'Ticker',
                    'Country', 'Current name', 'Type'
                ]))
            table_master = sorted(table_master, key=lambda item: item[0])
            table_master1 += table_master
            w_r = csv.writer(result_file, dialect='excel')
            w_r.writerows(table_master1)

        print('\n\npost-processing 1 ... ', datetime.now())

        with open(os.path.join(working_dir, 'master_file_joe.csv'),
                  'r') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            nrow = 0
            for row in csv_reader:
                row1 = []  # change True to False to use the list
                if (str(row[3]) in ('C', 'BAC', 'AAPL')
                        or True) and nrow != 0:  # skip titles
                    row1.append(str(row[3]))
                    row1.append(str(row[4]))
                    row1.append(str(row[2]))
                    for _ in range(refinitiv_data_n_columns):
                        row1.append('')  # fill in with blanks for merged .csv
                    for r_r in row:
                        row1.append(r_r)
                    table_merged.append(row1)
                nrow += 1

        print('\n\npost-processing 2 ... ', datetime.now())

        with open(
                os.path.join(working_dir,
                             'master_table_merged_art_vs_joe.csv'),
                'w') as result_file:
            w_r = csv.writer(result_file, dialect='excel')
            table_merged1 = sorted(table_merged,
                                   key=operator.itemgetter(0, 1, 2))
            table_merged2 = []
            table_merged2.append(
                create_titles([
                    '', '', '', 'SecCode', 'From', 'To', 'Point-in-time name',
                    'Ticker', 'Country', 'Current name', 'Type', 'ID', 'FROM',
                    'TO', 'TICKER', 'NAME', 'TYPE'
                ]))
            table_merged2 += table_merged1
            w_r.writerows(table_merged2)

        print('\n\npost-processing 3 ... ', datetime.now())

        tickers_joe = []  # this should be an array of unique tickers
        i = 0
        with open(os.path.join(working_dir, 'master_file_joe.csv'),
                  'r') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            for row in csv_reader:
                if i != 0:  # skip titles at i = 0
                    if row[3] not in tickers_joe:  # unique tickers
                        tickers_joe.append(row[3])
                i += 1

        tikers_art = []  # this should be an array of unique tickers
        for t_t in tickers:
            if t_t not in tikers_art:
                tikers_art.append(t_t)

        print('\n\nnumber of unique tickers in the master: ', len(tikers_art),
              datetime.now())

        if top is None:
            print('\n\npost-processing 4 ... ', datetime.now())

            missing_tikers = []
            for t_j in tickers_joe:
                if t_j not in tikers_art:  # unique tickers
                    missing_tikers.append(t_j)

            missing_tikers1 = []
            for m_t in missing_tikers:
                if m_t not in missing_tikers1:  # unique tickers
                    missing_tikers1.append(m_t)

            print('\n\nnumber of missing tickers: ', len(missing_tikers1),
                  datetime.now())

            tickers_without_suffix = []
            for m_t in missing_tikers1:
                if m_t.find('.') != -1:
                    m_t = m_t.split('.')[0]
                else:
                    m_t = m_t[:-1]  # try to remove the fused suffix for missing tickers
                if m_t not in tickers_without_suffix:
                    tickers_without_suffix.append(m_t)
            print('\n\nnumber of missing tickers without suffix: ',
                  len(tickers_without_suffix), datetime.now())

            query = '''SELECT * FROM PRC.PRCSCCHG WHERE TICKER IN (\''''

            for tws in tickers_without_suffix:
                query += str(tws) + '''\', \''''
            query = query[:-3]
            query += ''')'''

            try:
                print('\n\ntrying to execute cursor_.execute(query)...',
                      datetime.now())
                cursor_.execute(query)
            except Exception as err:
                print('\n\nexception #3 for cursor_.execute(query)', err,
                      datetime.now())

            print('\n\nfetching second query result ... ', datetime.now())
            try:
                print('\n\ntrying to execute result = cursor_.fetchall()...',
                      datetime.now())
                result = cursor_.fetchall()
            except Exception as err:
                print('\n\nexception #4 for result = cursor_.fetchall()', err,
                      datetime.now())

            with open(os.path.join(working_dir, 'addendum_master_table.csv'),
                      'w') as result_file:
                table_addendum = result
                table_addendum = sorted(table_addendum,
                                        key=operator.itemgetter(4))
                table_addendum1 = []
                table_addendum1.append(
                    create_titles([
                        'SecCode', 'From', 'To', 'CUSIP', 'Ticker', 'SEDOL',
                        'Issuer', 'Full ticker', 'Base ticker', 'Group',
                        'Series', 'Exchange'
                    ]))
                table_addendum1 += table_addendum
                w_r = csv.writer(result_file, dialect='excel')
                w_r.writerows(table_addendum1)

            found_tickers = []
            for row in result:
                if str(row[4]) not in found_tickers:
                    found_tickers.append(str(row[4]))

            print('\n\nnumber of found tickers: ', len(found_tickers),
                  datetime.now())

            missing_tikers2 = []
            for m_t in missing_tikers1:
                wosuffix = m_t
                if wosuffix.find('.') != -1:
                    wosuffix = wosuffix.split('.')[0]
                else:
                    wosuffix = wosuffix[:-1]  # try to remove the fused suffix
                if wosuffix not in found_tickers and m_t not in found_tickers:
                    # tickers w/o and with suffix
                    missing_tikers2.append(m_t)

            print('\n\nfinal number of missing tickers: ',
                  len(missing_tikers2), datetime.now())
            print('\n\nwriting missing tickers ... ', datetime.now())

            with open(os.path.join(working_dir, 'missing_tickers.csv'),
                      'w') as result_file:
                w_r = csv.writer(result_file, dialect='excel')
                missing_tikers2.sort()
                missing_tikers3 = []
                for row in missing_tikers2:
                    with open(os.path.join(working_dir, 'master_file_joe.csv'),
                              'r') as csv_file:
                        csv_reader = csv.reader(csv_file, delimiter=',')
                        i = 0
                        for row2 in csv_reader:
                            if row2[3] == row and i != 0:  # skip titles at i = 0
                                row5 = []
                                row5.append(str(row2[3]))
                                row5.append(str(row2[4]))
                                if row5 not in missing_tikers3:  # unique entries
                                    missing_tikers3.append(row5)
                            i += 1
                missing_tikers4 = []
                missing_tikers4.append(create_titles(['Tickers', 'Co. names']))
                missing_tikers4 += missing_tikers3
                w_r.writerows(missing_tikers4)

    print('\n\ndownloading pricing data ... ', datetime.now())

    seccodes = []
    with open(os.path.join(working_dir, 'master_table.csv')) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        l_l = 0
        for row in csv_reader:
            if row[0] not in seccodes and l_l > 0:  # skip titles, unique seccodes
                seccodes.append(row[0])
            l_l += 1

    print('\n\ndistinct seccodes = ', len(seccodes), datetime.now())
    print('\n\nprocessing ...', datetime.now())

    query = '''
    -- This query returns the fully adjusted Open, High, Low, and Close Pricing data in Local Currency using the Ds2Primqtprc table for North American Equities

                    SELECT DISTINCT

                        A.SecCode                     -- seccode  new col=0
              --  ,       MR1.ID
             --   ,       MR1.NAME AS CURRNAME
             --   ,       G1.ISSUER AS PITISSUER
              --  ,       G1.EXCHANGE
              --  ,       MR1.Country
              --  ,       G1.StartDate
              --  ,       G1.EndDate
                  ,       K1.TICKER                     -- ticker new col=1
            --    ,       G1.EXCHANGE
             --   ,       I.ISSUER AS CURRENTISSUE
              --  ,       I.STATUS
              --  ,       I.SECTYPE AS CURRSECTYPE
              --  ,       C1.TotRet
             --   ,       C1.placeholder

                  ,       C1.Date_               --  market date col=15; new col=2
                  , C1.Open_                     --  col=16 open; new col=3
                  , C1.High                      --  col=17 high; new col=4
                  , C1.Low                       --  col=18 low; new col=5
                  , C1.Close_                    --  col=19 close; new col=6
                  ,  C1.Volume                   --  col=20 volume; new col=7
                  ,  C1.TotRet                   --  col=21 totret; new col=8

                FROM            SecMstrX                        A

                JOIN            SECMAPX             M

                                ON                  M.SECCODE = A.SECCODE
                                AND                 M.VenType = 1       -- IDC
                                AND                 TYPE_ = 1           -- NorthAmer Equity
                                AND                 M.EXCHANGE <> 2

                                -- AND M.EXCHANGE = 1 AND A.TYPE_ = 1
                                -- AND     M.RANK = 1   -- VIEW ALL OR CURRENT ONLY
                                -- AND     A.COUNTRY = 'USA' -- comment this out for ADR's

                JOIN            Prc.PrcTKChg                    K
                                ON                  M.VENCODE = K.Code

                JOIN            PRC.PRcsCCHG        G
                                ON                  G.CODE =    K.CODE
                                AND                 ISNULL(G.ENDDATE,'1/1/2059')
                                BETWEEN             K.STARTDATE AND ISNULL(K.ENDDATE,'1/1/2059')

                JOIN            PRC.PRCTKCHG        K1
                                ON                  K1.CODE =       K.CODE
                          
                JOIN            PRC.PRCDLY          C1
                                ON                  C1.CODE =       K1.CODE

                WHERE

                                 A.SECCODE          IN ('''
    #
    block_size = int(len(seccodes) / nblocks) + 1
    with tqdm(total=nblocks, file=sys.stdout) as pbar:
        list_ = [[] for n in range(20750101)]
        for seccodeblock in list(iterutils.chunked_iter(seccodes, block_size)):
            pbar.set_description('progress at %s' % time.strftime("%c"))
            pbar.update(1)
            query_seccodes = ''
            print('\n\nseccodeblock = ', len(seccodeblock), datetime.now())
            for s_c in seccodeblock:
                query_seccodes += str(s_c) + ''','''
            query_seccodes = query_seccodes[:-1]
            query_date = '''CAST(C1.Date_ AS DATETIME)>= \'''' + date_from + '''\''''
            composed_query = query +\
                            query_seccodes + ''')\n\nAND\n\n''' +\
                            query_date + '''\n\nORDER BY C1.Date_'''
            with open(os.path.join(working_dir, 'query_pricing_data.txt'),
                      'w') as query_file:
                query_file.write(composed_query)
            keep_trying_to_query = True
            result = None
            # the query might fail because the computer got moved to a different location,
            # which resulted in IP change; in this case, try to re-open the connection, then re-do the query
            while keep_trying_to_query:
                try:
                    print(
                        '\n\ntrying to execute cursor_.execute(COMPOSED_query)...',
                        datetime.now())
                    cursor_.execute(composed_query)
                    try:
                        print(
                            '\n\ntrying to execute result = cursor_.fetchall()...',
                            datetime.now())
                        result = cursor_.fetchall()
                        keep_trying_to_query = False
                    except Exception as err:
                        try:
                            print(
                                '\n\nexception #5 for cursor_.execute(COMPOSED_query)',
                                err, datetime.now())
                            print(
                                '\n\nexception #6 for result = cursor_.fetchall()',
                                err, datetime.now())
                            cursor_.close()
                            cnxn.close()
                            print("\n\nre-opening server connection...",
                                  datetime.now())
                            cnxn = pyodbc.connect('DRIVER=' + driver +
                                                  ';SERVER=' + server +
                                                  ';PORT=1433;DATABASE=' +
                                                  database + ';UID=' +
                                                  username + ';PWD=' +
                                                  password)
                            cursor_ = cnxn.cursor()
                        except Exception as err:
                            print('\n\nexception #7 for reconnect', err,
                                  datetime.now())
                except Exception as err:
                    try:
                        print(
                            '\n\nexception #8 for cursor_.execute(COMPOSED_query)',
                            err, datetime.now())
                        print(
                            '\n\nexception #9 for result = cursor_.fetchall()',
                            err, datetime.now())
                        cursor_.close()
                        cnxn.close()
                        print("\n\nre-opening server connection...",
                              datetime.now())
                        cnxn = pyodbc.connect('DRIVER=' + driver + ';SERVER=' +
                                              server + ';PORT=1433;DATABASE=' +
                                              database + ';UID=' + username +
                                              ';PWD=' + password)
                        cursor_ = cnxn.cursor()
                    except Exception as err:
                        print('\n\nexception #10 for reconnect', err,
                              datetime.now())
#
            if result is not None:
                print("\n\nquery produced %d rows" % len(result),
                      datetime.now())
                for row in result:
                    row3 = []
                    #                          A.SecCode                     -- seccode  new col=0
                    #              --  ,       MR1.ID
                    #             --   ,       MR1.NAME AS CURRNAME
                    #             --   ,       G1.ISSUER AS PITISSUER
                    #              --  ,       G1.EXCHANGE
                    #              --  ,       MR1.Country
                    #              --  ,       G1.StartDate
                    #              --  ,       G1.EndDate
                    #                ,       K1.TICKER                -- ticker new col=1
                    #            --    ,       G1.EXCHANGE
                    #             --   ,       I.ISSUER AS CURRENTISSUE
                    #              --  ,       I.STATUS
                    #              --  ,       I.SECTYPE AS CURRSECTYPE
                    #              --  ,       C1.TotRet
                    #             --   ,       C1.placeholder
                    #                ,       C1.MarketDate            --  market date col=15; new col=2
                    #                , C1.Open                        --  col=16 open; new col=3
                    #                , C1.High                        --  col=17 high; new col=4
                    #                , C1.Low                         --  col=18 low; new col=5
                    #                , C1.Close                       --  col=19 close; new col=6
                    #                ,  C1.Volume                     --  col=20 volume; new col=7
                    #                ,  C1.TotRet                     --  col=21 totret; new col=8
                    #
                    row3.append(int(row[0]))  # SecCode
                    row3.append(row[1])  # ticker
                    if row[2] is not None:
                        date1 = str(row[2])[:-9]  # market date
                        row3.append(date1)
                    else:
                        row3.append('-1.0')
                    if row[3] is not None:
                        row3.append(row[3])  # open
                    else:
                        row3.append('-1.0')
                    if row[4] is not None:
                        row3.append(row[4])  # high
                    else:
                        row3.append('-1.0')
                    if row[5] is not None:
                        row3.append(row[5])  # low
                    else:
                        row3.append('-1.0')
                    if row[6] is not None:
                        row3.append(row[6])  # unadjusted close
                    else:
                        row3.append('-1.0')
                    if row[7] is not None:
                        row3.append(row[7])  # volume
                    else:
                        row3.append('-1.0')
                    if row[8] is not None:
                        row3.append(row[8])  # TotRet
                    else:
                        row3.append('-1.0')
                    # MarketDate can be NULL; only index rows that carry a real date
                    if row[2] is not None:
                        idx = int(row[2].strftime('%Y%m%d'))
                        if row3 not in list_[idx]:
                            list_[idx].append(row3)
#
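    # write one CSV per trading day: de-duplicate rows, sort by SecCode then
    # Ticker, prepend a title row, and append to a per-date file under working_dir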
    for i, i_t in enumerate(list_):
        if i_t:
            s_s = str(i)
            year = s_s[:-4]
            month = s_s[4:-2]
            day = s_s[6:]
            date2 = year + '-' + month + '-' + day
            table1 = []
            table2 = []
            table2.append(
                create_titles([
                    'SecCode', 'Ticker', 'Date', 'Open', 'High', 'Low',
                    'Close, unadjusted', 'Volume', 'Total return'
                ]))
            for _, item in enumerate(i_t):
                if item not in table1:
                    table1.append(item)
            table1 = sorted(table1, key=operator.itemgetter(0, 1))
            table2 += table1
            ofp = os.path.join(dir_from_date(date2, 'ym', working_dir),
                               date2 + '.csv')
            # newline='' keeps csv.writer from inserting blank lines on Windows
            with open(ofp, 'a', newline='') as result_file:
                w_r = csv.writer(result_file, dialect='excel')
                w_r.writerows(table2)


#
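    # optionally zip the working directory with a date-stamped name and move the
    # archive into RefinitivDataRepository, replacing an existing archive only if
    # the new one is larger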
    if archive:
        now = str(date.today())
        print('\n\ncompressing output and timestamping ... ', datetime.now())
        file_name = 'refinitiv_qa_direct_qai_master_and_pricing_tables_' + now
        print(file_name, datetime.now())
        shutil.make_archive(file_name, 'zip', working_dir)

        print('\n\nmoving the data to the timestamped repository ... ',
              datetime.now())
        src = cwd
        data_repo = os.path.join(src, 'RefinitivDataRepository')
        if not os.path.exists(data_repo):
            os.mkdir(data_repo)
        if not os.path.isdir(data_repo):
            raise Exception(f'Data repository is not a directory: {data_repo}')

        output_file_staging_path = os.path.join(src, file_name + '.zip')
        output_file_path = Path(os.path.join(data_repo, file_name + '.zip'))
        print('OUTPUT_FILE_STAGING_PATH = ', output_file_staging_path,
              'OUTPUT_FILE_PATH', output_file_path)
        if os.path.isfile(output_file_staging_path):
            if os.path.isfile(output_file_path):
                new_file_size = os.stat(output_file_staging_path).st_size
                old_file_size = os.stat(output_file_path).st_size
                print('\n\nnew zip size = ', new_file_size,
                      '\told_file_size = ', old_file_size)
                if new_file_size > old_file_size:
                    os.remove(output_file_path)
                    shutil.move(output_file_staging_path, output_file_path)
            else:
                shutil.move(output_file_staging_path, output_file_path)

    if email_notification_:
        print(
            '\n\nemailing the confirmation and the link to compressed data to the author ... ',
            datetime.now())
        alert = ('This is to notify that a new compressed data set was '
                 'uploaded to FORA google drive ...')
        email = 'Alert time: ' + time.strftime("%c") + '\n' + alert
        client_email = [
            '*****@*****.**', '*****@*****.**'
        ]
        #    MESSAGE = create_message('*****@*****.**',\
        #                            CLIENT_EMAIL, 'Completion alert', EMAIL)
        yagmail.SMTP('*****@*****.**').send(
            client_email, 'Completion alert', email)
        print('\n\nemailed to the user:\n' + alert, datetime.now())

    print('\n\nexiting ... ', datetime.now())
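
The script above reopens its pyodbc connection inline each time a query or fetch fails. A minimal sketch of the same reconnect-and-retry idea factored into a helper follows; the connection string, retry count, and wait time are illustrative placeholders, not values from the example.

import time
from datetime import datetime

import pyodbc


def run_query_with_retry(conn_str, query, retries=3, wait_seconds=5):
    """Sketch: run a query, reopening the connection after each failed attempt."""
    cnxn = pyodbc.connect(conn_str)
    for attempt in range(1, retries + 1):
        try:
            cursor_ = cnxn.cursor()
            cursor_.execute(query)
            return cursor_.fetchall()
        except Exception as err:
            print('\n\nquery attempt %d failed' % attempt, err, datetime.now())
            try:
                cnxn.close()
            except Exception:
                pass
            time.sleep(wait_seconds)
            cnxn = pyodbc.connect(conn_str)
    raise RuntimeError('query failed after %d attempts' % retries)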
Exemplo n.º 29
0
def invoke_semgrep(
    config_specifier: str,
    committed_datetime: Optional[datetime],
    base_commit_ref: Optional[str],
    semgrep_ignore: TextIO,
    uses_managed_policy: bool,
) -> FindingSets:
    debug_echo("=== adding semgrep configuration")

    workdir = Path.cwd()
    targets = TargetFileManager(
        base_path=workdir,
        base_commit=base_commit_ref,
        paths=[workdir],
        ignore_rules_file=semgrep_ignore,
    )

    config_args = ["--config", config_specifier]
    rewrite_args = ["--no-rewrite-rule-ids"] if uses_managed_policy else []

    debug_echo("=== seeing if there are any findings")
    findings = FindingSets()

    with targets.current_paths() as paths:
        click.echo(
            "=== looking for current issues in " + unit_len(paths, "file"), err=True
        )
        for chunk in chunked_iter(paths, PATHS_CHUNK_SIZE):
            args = [
                "--skip-unknown-extensions",
                "--disable-nosem",
                "--json",
                *rewrite_args,
                *config_args,
            ]
            for path in chunk:
                args.append(path)
            semgrep_results = json.loads(str(semgrep(*args)))["results"]
            findings.current.update_findings(
                Finding.from_semgrep_result(result, committed_datetime)
                for result in semgrep_results
                if not result["extra"].get("is_ignored")
            )
            findings.ignored.update_findings(
                Finding.from_semgrep_result(result, committed_datetime)
                for result in semgrep_results
                if result["extra"].get("is_ignored")
            )
            click.echo(
                f"| {unit_len(findings.current, 'current issue')} found", err=True
            )
            click.echo(
                f"| {unit_len(findings.ignored, 'ignored issue')} found",
                err=True,
            )

    if not findings.current:
        click.echo(
            "=== not looking at pre-existing issues since there are no current issues",
            err=True,
        )
    else:
        with targets.baseline_paths() as paths:
            paths_with_findings = {finding.path for finding in findings.current}
            paths_to_check = set(str(path) for path in paths) & paths_with_findings
            if not paths_to_check:
                click.echo(
                    "=== not looking at pre-existing issues since all files with current issues are newly created",
                    err=True,
                )
            else:
                click.echo(
                    "=== looking for pre-existing issues in "
                    + unit_len(paths_to_check, "file"),
                    err=True,
                )
                for chunk in chunked_iter(paths_to_check, PATHS_CHUNK_SIZE):
                    args = [
                        "--skip-unknown-extensions",
                        "--json",
                        *rewrite_args,
                        *config_args,
                    ]
                    for path in chunk:
                        args.append(path)
                    semgrep_results = json.loads(str(semgrep(*args)))["results"]
                    findings.baseline.update_findings(
                        Finding.from_semgrep_result(result, committed_datetime)
                        for result in semgrep_results
                    )
                click.echo(
                    f"| {unit_len(findings.baseline, 'pre-existing issue')} found",
                    err=True,
                )

    if os.getenv("INPUT_GENERATESARIF"):
        # FIXME: This will crash when running on thousands of files due to command length limit
        click.echo("=== re-running scan to generate a SARIF report", err=True)
        sarif_path = Path("semgrep.sarif")
        with targets.current_paths() as paths, sarif_path.open("w") as sarif_file:
            args = ["--sarif", *rewrite_args, *config_args]
            for path in paths:
                args.extend(["--include", path])
            semgrep(*args, _out=sarif_file)
        rewrite_sarif_file(sarif_path)

    return findings
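
Both semgrep wrappers above feed file paths to the CLI in chunks of PATHS_CHUNK_SIZE so no single command line grows too long. A small illustrative sketch of that batching, assuming the boltons chunked_iter used throughout these examples; the paths and chunk size are made up:

from boltons.iterutils import chunked_iter

PATHS_CHUNK_SIZE = 3  # illustrative value, not the real constant
paths = ['src/a.py', 'src/b.py', 'src/c.py', 'src/d.py', 'src/e.py']

for chunk in chunked_iter(paths, PATHS_CHUNK_SIZE):
    args = ['--json', *chunk]
    print(args)
# ['--json', 'src/a.py', 'src/b.py', 'src/c.py']
# ['--json', 'src/d.py', 'src/e.py']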
Exemplo n.º 30
0
def batches_iter(self, batch_size):
    return chunked_iter(iter(self), batch_size)
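
A hedged usage sketch of the batches_iter pattern above; the Dataset container is made up for illustration, and only the method body comes from the example:

from boltons.iterutils import chunked_iter


class Dataset:
    """Illustrative container; only batches_iter mirrors the example above."""

    def __init__(self, items):
        self.items = list(items)

    def __iter__(self):
        return iter(self.items)

    def batches_iter(self, batch_size):
        return chunked_iter(iter(self), batch_size)


ds = Dataset(range(7))
print([batch for batch in ds.batches_iter(3)])
# [[0, 1, 2], [3, 4, 5], [6]]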