Example #1
 def chunks(n, join=None, *, seq):
     if join is None:
         yield from fy.chunks(n, seq)
     else:
         if isinstance(join, str):
             join = join.join
         for item in fy.chunks(n, seq):
             yield join(item)
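A minimal usage sketch (assuming funcy is installed and imported as fy): funcy.chunks lazily slices a sequence into pieces of at most n items, keeping the short tail, and the wrapper above optionally joins each piece.

import funcy as fy

print(list(fy.chunks(3, [1, 2, 3, 4, 5, 6, 7])))  # [[1, 2, 3], [4, 5, 6], [7]]
print(list(fy.chunks(2, "abcde")))                # ['ab', 'cd', 'e']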
Example #2
    def test_input_batch_generator(self):
        # Throw in a few empty sentences
        tokenized_sentences = [["abab", "aaaa", "b", "ab", "."], [], []]
        batch_size = 2

        # We're expecting batches of size batch_size, in the order in which sentences are provided
        sentence_index_batches = funcy.chunks(batch_size,
                                              range(len(tokenized_sentences)))
        expected_generated_inputs = []

        # Compute what the generated input should be
        for sentence_index_batch in sentence_index_batches:
            tokenized_sentence_batch = [
                tokenized_sentences[index] for index in sentence_index_batch
            ]
            inputs_batch = self.input_encoder.prepare_inputs_from_pretokenized(
                tokenized_sentence_batch)
            expected_generated_inputs.append(inputs_batch)

        predict_generator = self.input_encoder.input_batch_generator(
            tokenized_sentences, batch_size)
        # verify the output of the generator
        for expected_inputs, inputs in zip(expected_generated_inputs,
                                           predict_generator):
            for key in expected_inputs:
                np.testing.assert_equal(expected_inputs[key], inputs[key])

        # Now test with all empty sentences
        tokenized_sentences = [[]]
        batch_size = 4

        # We're expecting batches of size batch_size, in the order in which sentences are provided
        sentence_index_batches = funcy.chunks(batch_size,
                                              range(len(tokenized_sentences)))
        expected_generated_inputs = []

        # Compute what the generated input should be
        for sentence_index_batch in sentence_index_batches:
            tokenized_sentence_batch = [
                tokenized_sentences[index] for index in sentence_index_batch
            ]
            inputs_batch = self.input_encoder.prepare_inputs_from_pretokenized(
                tokenized_sentence_batch)
            expected_generated_inputs.append(inputs_batch)

        predict_generator = self.input_encoder.input_batch_generator(
            tokenized_sentences, batch_size)
        # verify the output of the generator
        for expected_inputs, inputs in zip(expected_generated_inputs,
                                           predict_generator):
            for key in expected_inputs:
                np.testing.assert_equal(expected_inputs[key], inputs[key])
Example #3
File: media.py Project: Viewly/alpha-2
def generate_random_images(
        tmp_directory,
        video_file,
        size=None,
        aspect_ratio=(16, 9), **kwargs):
    """ Generate uniformly distributed random snapshots from a video."""
    from moviepy import editor
    clip = editor.VideoFileClip(video_file)

    if int(clip.duration) < 1:
        return 0

    # hard-coded steps of snapshot smoothness
    if clip.duration < 30:
        num_of_chunks = 3
    elif clip.duration < 60:
        num_of_chunks = 5
    elif clip.duration < 60 * 5:
        num_of_chunks = 20
    elif clip.duration < 60 * 10:
        num_of_chunks = 30
    else:
        num_of_chunks = 50

    chunk_size = int(clip.duration // num_of_chunks)
    for i, video_chunk in enumerate(chunks(chunk_size, range(0, int(clip.duration)))):
        random_frame_time = random.uniform(first(video_chunk), last(video_chunk))
        img = Image.fromarray(clip.get_frame(random_frame_time))
        if size:
            img = img_resize(img, size=size, aspect_ratio=aspect_ratio)
        file_name = f'{i}.{kwargs.get("output_ext", "png")}'
        img_save(img, str(tmp_directory / 'random' / file_name))

    return num_of_chunks
Example #4
def list_pipelines(selection: List[str]) -> List[DataPipeline]:
    """
    Return list of pipelines related to this project (which must have the tag for our project set).

    The :selection should be a list of glob patterns to select specific pipelines by their ID.
    If the selection is an empty list, then all pipelines are used.
    """
    client = boto3.client("datapipeline")
    paginator = client.get_paginator("list_pipelines")
    response_iterator = paginator.paginate()
    all_pipeline_ids = response_iterator.search("pipelineIdList[].id")
    if selection:
        selected_pipeline_ids = [
            pipeline_id for pipeline_id in all_pipeline_ids
            for glob in selection if fnmatch.fnmatch(pipeline_id, glob)
        ]
    else:
        selected_pipeline_ids = list(all_pipeline_ids)

    dw_pipelines = []
    chunk_size = 25  # Per AWS documentation, need to go in pages of 25 pipelines
    for ids_chunk in funcy.chunks(chunk_size, selected_pipeline_ids):
        resp = client.describe_pipelines(pipelineIds=ids_chunk)
        for description in resp["pipelineDescriptionList"]:
            for tag in description["tags"]:
                if tag["key"] == "user:project" and tag[
                        "value"] == "data-warehouse":
                    dw_pipelines.append(DataPipeline(description))
    return sorted(dw_pipelines, key=attrgetter("name"))
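An illustrative sketch of the batching above (the pipeline ids here are made up): funcy.chunks also accepts plain iterators, so the paginator's id stream can be cut into groups of 25 without materializing it first.

import funcy

pipeline_ids = (f"df-{i:05d}" for i in range(60))  # stand-in for the boto3 paginator output
batches = list(funcy.chunks(25, pipeline_ids))
print([len(batch) for batch in batches])           # [25, 25, 10]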
Example #5
    def decryptFile(self, fileIn, chunksize):
        try:
            fileOut = fileIn + ".tmp"
            print(bcolors.WARNING + "[!] " + bcolors.ENDC +
                  "Decrypting file: " + fileIn)
            with open(fileIn, "rb") as encryptedFile:
                encrypted = base64.b64decode(encryptedFile.read(64))
                setup = encrypted[:48]  # read KEY[32] and IV[16] = 32 + 16 = 48 (fetch key and IV)
                if self.key == setup[:32]:
                    print(bcolors.OKGREEN + "[+] Password correct!" +
                          bcolors.ENDC)
                else:
                    print(bcolors.FAIL + "[!] WRONG PASSWORD" + bcolors.ENDC)
                    sys.exit(0)

                iv = setup[32:]
                cipher = AES.new(self.key, AES.MODE_CBC, iv)
                with open(fileOut, "wb") as decryptedFile:
                    encrypted = base64.b64decode(encryptedFile.read())
                    chunks = list(funcy.chunks(chunksize, encrypted))
                    for chunk in chunks:
                        decrypted_chunk = self._unpad(cipher.decrypt(chunk))
                        decryptedFile.write(decrypted_chunk)
            os.rename(fileOut, fileIn)
            print(bcolors.OKGREEN + "[+] " + bcolors.ENDC + "Decrypting done!")
        except Exception as e:
            raise e
Example #6
def get_batch_dataframe(df, batch_size=100):
    """
    Split a DataFrame into sub-DataFrames, each containing at most batch_size rows.

    Parameters
    ----------
    df : Pandas DataFrame
    batch_size : number of records in each sub-DataFrame (default: 100)

    Returns
    -------
    DataFrame generator

    Examples
    --------
    >>> import pandas as pd
    >>> from tidyframe import get_batch_dataframe
    >>> df = pd.DataFrame()
    >>> df['col_1'] = list("abcde")
    >>> df['col_2'] = [1, 2, 3, 4, 5]
    >>> dfs = [ x for x in get_batch_dataframe(df,2)]
    >>> dfs[-1]
        col_1  col_2
    4       e      5
    >>> [ x.shape[0] for x in dfs]
    [2, 2, 1]
    """
    for min_batch in chunks(batch_size, range(df.shape[0])):
        yield df.iloc[min_batch, :]
Example #7
def _series_chunks(s, n_jobs):
    if n_jobs < 0:
        # a negative n_jobs means use cpu_count() + 1 + n_jobs workers (so -1 uses every core)
        n_jobs = cpu_count() + 1 + n_jobs
    n = len(s)
    n_chunks = int(n / n_jobs)
    return (s.iloc[ilocs] for ilocs in fp.chunks(n_chunks, range(n)))
Example #8
    def decrypt_file(self):

        with open(self.input_file, "rb") as encryptedFile:
            chunk_size = 24 * 1024
            encrypted = base64.b64decode(encryptedFile.read(64))
            setup = encrypted[:48]
            # key_confirm = input("Please enter the key used to encrypt the file:- ")
            salt = b'\x9aX\x10\xa6^\x1fUVu\xc0\xa2\xc8\xff\xceOV'
            key_check = Crypto.Protocol.KDF.PBKDF2(password=self.secret_key,
                                                   salt=salt,
                                                   dkLen=32,
                                                   count=10000)

            def unpad(s):
                return s[:-ord(s[len(s) - 1:])]

            if key_check == setup[:32]:
                print("Password Correct!")
            else:
                print("Wrong Password!")
                sys.exit(0)

            iv = setup[32:]
            cipher = AES.new(key_check, AES.MODE_CBC, iv)
            with open(self.output_file, "wb") as decryptedFile:
                encrypted = base64.b64decode(encryptedFile.read())
                chunks = list(funcy.chunks(chunk_size, encrypted))
                for chunk in chunks:
                    decrypted_chunk = unpad(cipher.decrypt(chunk))
                    decryptedFile.write(decrypted_chunk)
Example #9
def tqdm_chunks(collection, chunk_size, enum=False):
    """Call funcy.chunks and return the resulting generator wrapped in a progress bar."""
    tqdm_nice = tqdm_notebook if in_notebook() else tqdm
    chunks = funcy.chunks(chunk_size, collection)
    if enum:
        chunks = enumerate(chunks)
    return tqdm_nice(chunks, total=int(np.ceil(len(collection) / chunk_size)))
Example #10
def split_into_groups(randomized_primary_team, randomized_guest_team):
    # TODO: this is f****d, make two lists and concat them together instead
    total_attendees = len(randomized_primary_team) + len(randomized_guest_team)

    if total_attendees % 5 == 0:
        group_size = 5
    elif total_attendees % 6 == 0:
        group_size = 6
    elif total_attendees % 7 == 0:
        group_size = 7
    elif total_attendees % 8 == 0:
        group_size = 8
    elif total_attendees % 9 == 0:
        group_size = 9
    else:
        group_size = 6

    # funcy.chunks is lazy, so materialize the groups before indexing into them
    chunked_list = list(fn.chunks(group_size, randomized_primary_team))

    for i in range(len(randomized_guest_team)):
        chunked_list[i % len(chunked_list)].append(randomized_guest_team.pop())

    chunked_list_last_child = chunked_list[-1]
    if len(chunked_list_last_child) < 5:
        for i in range(len(chunked_list_last_child)):
            chunked_list[i % len(chunked_list)].append(chunked_list_last_child.pop())
        del chunked_list[-1]

    return chunked_list
Example #11
def sendSNIPayload(cmd, argument):
    """
    we know which server command will be executed at this point
    so we call the 'execute cmd' and encode the command output
    """
    randy = randomString()
    print("(*) Executing: %s command" % cmd)

    if "CD" not in cmd and "EX" not in cmd:
        payload = (executeCmd(cmd, 0))
    else:
        payload = argument.encode('utf-8')

    encoded_payload = str(base64.b32encode(payload), "utf-8")

    if log_enabled:
        print(encoded_payload)

    encoded_payload = encoded_payload.replace("=", '')
    chunks = list(funcy.chunks(240, encoded_payload))
    finito = ("finito-%s" % randy)
    chunks.append(finito)

    if log_enabled:
        print(encoded_payload)
        print(chunks)

    sendSNIChunks(chunks)
Example #12
    def input_batches_from_raw_text(self, raw_text, batch_size=32):
        """Construct ContextualizedEmbedding inputs from a raw text string.

        Args:
            raw_text: str, the entire raw document
            batch_size: number of sentences per batch; limited by GPU memory

        Returns:
            generator: yields dicts of inputs to the ContextualizedEmbedding layer
                'forward_input': padded array of character codes corresponding to each sentence
                'backward_input': padded array of character codes in reverse order
                'forward_index_input': padded array of locations of token outputs in forward_input
                'backward_index_input': padded array of locations of token outputs in backward_input
                'forward_mask_input': mask of same shape as forward_index_input, with 0's where
                    padded and 1's where real tokens
                'backward_mask_input': mask of same shape as backward_index_input, with 0's where
                    padded and 1's where real tokens
            num_batches: int, necessary because after enforcing sentence limits we may end up with
                more sentences than we expect
            document_index_batches: list of lists of (start, end) tuples indicating spans where each
                token came from in raw_text. Outer list over batches, inner list over sentences
        """
        parsed_text = split_sentences_and_tokenize_raw_text(
            raw_text, self.max_token_sequence_len,
            self.max_char_sequence_len - 2)  # -2 for special start and end characters

        generator = self.input_batch_generator(parsed_text["tokens"],
                                               batch_size)
        num_batches = int(np.ceil(len(parsed_text["tokens"]) / batch_size))
        document_indices = create_document_indices_from_sentence_indices(
            parsed_text["spans"], parsed_text["tokens"], raw_text)
        # now group document_indices into batches
        document_index_batches = list(chunks(batch_size, document_indices))
        return generator, num_batches, document_index_batches
Example #13
def splitscrape(text):
    sent_corpus = [s for s in nltk.sent_tokenize(text)]
    sent_corpus = [
        "".join(chunk)
        for chunk in chunks(int(len(sent_corpus) / 100), sent_corpus)
    ]
    plotlist = [wordshifter(x) for x in sent_corpus]
    return (plotlist)
Example #14
 def simulate(self, seed: int = None, copolicy=False):
     """
     Generates tuples of (state, action, next_state) and the
     probability transitioning from state to next_state.
     """
     bits = self._simulate(seed, copolicy)
     chunks = fn.chunks(self.spec.order.total_bits, bits)
     return [self.spec.unflatten(c)[0] for c in chunks]
Example #15
 def decode_plain(self, text):
     if len(text) % 2:
         raise ValueError("Text is not playfair-encoded: Length is odd")
     for a, b in chunks(2, text):
         if a == b:
             raise ValueError("Text is not playfair-encoded:" +
                              " Invalid bigram {}".format(a + b))
         yield self._code_pair(a, b, -1)
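For reference, a tiny sketch of the bigram walk above (funcy assumed available): on a string, chunks(2, text) yields two-character slices.

from funcy import chunks

print(list(chunks(2, "HELLOX")))  # ['HE', 'LL', 'OX']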
Example #16
def _mygene_fetch(queries, scopes, specie):
    # To retry or ignore only one chunk on error
    @ignore(requests.HTTPError, default=[])
    @log_errors(lambda msg: cprint(msg, 'red'), stack=False)
    @retry(10, errors=requests.HTTPError, timeout=lambda n: 5 * 1.4**n)
    @log_errors(lambda msg: cprint(msg, 'yellow'), stack=False)
    def querymany(qs):
        try:
            return mg.querymany(qs,
                                scopes=scopes,
                                fields=['entrezgene', 'symbol'],
                                species=specie,
                                email='*****@*****.**',
                                verbose=False)
        except requests.HTTPError as e:
            # Do not retry on Bad Request
            if e.response.status_code == 400:
                return []
            raise

    cprint('> Going to query %d genes in %s...' % (len(queries), scopes),
           'cyan')
    cprint('>     sample queries: %s' % ', '.join(take(8, queries)), 'cyan')
    # Read cache
    prefix = '%s-%s:' % (SPECIE_PREFIXES[specie], PREFIXES[scopes])
    keys = [prefix + q for q in queries]
    res = {
        k: pickle.loads(v) if v else ''
        for k, v in zip(queries, mget(keys)) if v is not None
    }
    if res:
        queries = set(queries) - set(res)
        print('Got %d from cache, %d queries left' % (len(res), len(queries)))

    if queries:
        mg = mygene.MyGeneInfo()
        # Looks like sorting groups bad queries
        data = cat(
            querymany(qs)
            for qs in chunks(500, tqdm(sorted(queries), leave=False)))
        new = {
            str(item['query']): (item['entrezgene'], item['symbol'])
            for item in data if not item.get('notfound')
            and 'entrezgene' in item and 'symbol' in item
        }
        res.update(new)
        # Cache results and fails
        pipe = redis_client.pipeline(transaction=False)
        for k, v in new.items():
            pipe.setex(prefix + k, CACHE_TIMEOUT, pickle.dumps(v, -1))
        for k in queries - set(new):
            pipe.setex(prefix + k, CACHE_TIMEOUT, '')
        pipe.execute()

    res = {k: v for k, v in res.items() if v != ''}
    cprint('-> Got %d matches' % len(res), 'yellow')
    return res
Example #17
 def iter_by_ids(self, path, ids, page_size, **params):
     for ids_chunk in chunks(page_size, ids):
         data = self.get(
             path,
             ids=','.join(map(str, ids_chunk)),
             page_size=page_size,
             **params,
         )
         yield from data['results']
Example #18
 def iter_by_ids(self, path, ids, page_size, **params):
     for ids_chunk in chunks(page_size, ids):
         data = self.get(
             path,
             ids=','.join(map(str, ids_chunk)),
             page_size=page_size,
             **params,
         )
         yield from data['results']
Example #19
 def mget(self, keys: List[K]) -> List[Optional[V]]:
     """Get values for keys"""
     results = []
     for key_chunk in funcy.chunks(self._key_chunk_size, keys):
         encoded_keys = [self._key_encoder(key) for key in key_chunk]
         results.extend(
             self._value_decoder(encoded_data
                                 ) if encoded_data is not None else None
             for encoded_data in self._redis.mget(encoded_keys))
     return results
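A hedged sketch of the same chunked-MGET pattern with plain redis-py (the key names and the chunk size of 100 are illustrative, not taken from the source):

import funcy
import redis

r = redis.Redis()
keys = [f"user:{i}" for i in range(1000)]
values = []
for key_chunk in funcy.chunks(100, keys):
    # one bounded MGET per chunk keeps each round trip small
    values.extend(r.mget(key_chunk))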
Example #20
    def sync_data(self):
        table = self.TABLE

        url = self.get_url()

        for handle in self.config.get('handles'):
            params = {
                'user_id': handle,
                'count': 200,
                'include_rts': 1,
            }
            has_more = True
            max_id = None

            while has_more:
                last_max_id = max_id

                result = self.client.make_request(url,
                                                  self.API_METHOD,
                                                  params=params)

                data = self.get_stream_data(result)

                with singer.metrics.record_counter(endpoint=table) as counter:
                    for index, obj in enumerate(data):
                        LOGGER.debug("On {} of {}".format(index, len(data)))

                        processed = self.filter_keys(obj)

                        singer.write_records(table, [processed])

                        counter.increment()

                        if max_id is None:
                            max_id = obj.get('id')
                        else:
                            max_id = min(max_id, obj.get('id'))

                        params['max_id'] = max_id

                        self.state = incorporate(
                            self.state,
                            "tweet_engagements.{}".format(obj.get('id')),
                            'date', processed.get('created_at'))

                for substream in self.substreams:
                    substream.state = self.state
                    for tweets in funcy.chunks(25, data):
                        substream.sync_data(parent_ids=[
                            tweet.get('id_str') for tweet in tweets
                        ])

                if last_max_id == max_id:
                    has_more = False
Example #21
def map_backtranslate():
    """"""
    raise NotImplementedError()
    from multiprocessing import Pool

    import funcy
    path = untar_data(URLs.IMDB)
    txt_files = glob.glob(f'{path}/train/*/*.txt')
    pool = Pool(8)
    chunks = funcy.chunks(1000, txt_files)
    pool.map(save_backtranslations, list(chunks))
Example #22
    def _unflatten(self, bits: Bits):
        size = self.order.total_bits
        for i, chunk in enumerate(fn.chunks(self.order.total_bits, bits)):
            mapping = {}
            for j, bit in enumerate(chunk):
                lvl = i*size + j
                var = self.bexpr.bdd.var_at_level(lvl)
                name, _, idx = TIMED_INPUT_MATCHER.match(var).groups()
                mapping[f'{name}[{idx}]'] = bit

            yield self.dyn.imap.unblast(mapping)
Example #23
 def objects(self, sphere, evaluate_expressions=False):
     chunk_size = 25  # Per AWS documentation, need to go in pages of 25 objects
     object_ids = self.instance_ids(sphere)
     paginator = self.client.get_paginator("describe_objects")
     # Evaluation fails for components so block the flag here.
     evaluate_expressions = evaluate_expressions and sphere in ("ATTEMPT", "INSTANCE")
     for ids_chunk in funcy.chunks(chunk_size, object_ids):
         response_iterator = paginator.paginate(
             pipelineId=self.pipeline_id, objectIds=ids_chunk, evaluateExpressions=evaluate_expressions
         )
         for pipeline_object in response_iterator.search("pipelineObjects[]"):
             yield DataPipelineObject(pipeline_object)
Example #24
def splitscrape(text):
    sent_corpus = []
    for s in nltk.sent_tokenize(text):
        sent_corpus.append(s)
    sent_corpus = [
        "".join(chunk)
        for chunk in chunks(int(len(sent_corpus) / 100), sent_corpus)
    ]
    plotlist = []
    for x in sent_corpus:
        plotlist.append(wordshifter(x))
    return (plotlist)
Example #25
 def _get_features(self, texts, max_min_features=False, chunk_size=10, verbose=False):
     text_features = []
     _iter = funcy.chunks(chunk_size, texts)
     if verbose:
         _iter = tqdm.tqdm(_iter, total=int(np.ceil(len(texts) / chunk_size)))
     for t in _iter:
         features = np.array(self.feature_pipeline.transform(t))
         feature_types = [features.mean(axis=1)]
         if max_min_features:
             feature_types = feature_types + [features.max(axis=1), features.min(axis=1)]
         concat_features = np.hstack(feature_types)
         text_features.append(concat_features)
     return np.vstack(text_features)
Example #26
    def word_tokenize(self, text):
        """Get list of string tokens from input string.

        Args:
            text: input string for tokenization
        Yields:
            token: str, non-whitespace tokens
        """
        for token in split_possessive_markers(split_contractions(_html_tokenize(text))):
            if self._max_characters_per_token is not None:
                for token_chunk in funcy.chunks(self._max_characters_per_token, token):
                    yield token_chunk
            else:
                yield token
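An illustration of the character-chunking fallback above (funcy assumed installed): tokens longer than the limit come out as fixed-size pieces.

from funcy import chunks

print(list(chunks(5, "antidisestablishmentarianism")))
# ['antid', 'isest', 'ablis', 'hment', 'arian', 'ism']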
Example #27
def get_artist_info(df_master, username, token, sp):
    """
    Get artist info for every song in df_master

    Args:
        df_master (DataFrame): DF returned from get_album_info()
        username (str): string of spotify username
        token (str): Spotify api session token
        sp: (Spotify object): Spotify API session object

    Returns:
        artists_df (Pandas DataFrame): df with all artist metadata/features
        df_master (Pandas DataFrame): df_master merged with artists_df/features
    """
    if token:

        all_artists = dict()

        # iterate in batches

        for artist_id_batch in chunks(20, df_master["artist_id"].unique()):
            try:
                batch_artists = sp.artists(artist_id_batch)
                batch_artists = batch_artists["artists"]
                for i, artist_id in enumerate(artist_id_batch):
                    # get only attributes that are needed
                    all_artists[artist_id] = {
                        "artist_followers": batch_artists[i]["followers"]["total"],
                        "artist_genres": batch_artists[i]["genres"],
                        "artist_popularity": batch_artists[i]["popularity"],
                    }
            except AttributeError:
                logger.info("ERROR AT {}".format(artist_id_batch))

        # create df of artists data
        artists_df = pd.DataFrame(all_artists).T
        artists_df["artist_genres"] = artists_df.artist_genres.apply(
            lambda x: [i.replace(" ", "_") for i in x]
        )

        #######################################################################################################

        # merge master df with artists df
        df_master = df_master.join(artists_df, on="artist_id")

        return artists_df, df_master

    else:
        logging.info(f"Can't get token for {username}")
Example #28
def upgrade():
    ### commands auto generated by Alembic - please adjust! ###
    table = op.create_table('free_email_providers',
    sa.Column('id', sa.Integer(), nullable=False),
    sa.Column('domain', sa.Unicode(length=255), nullable=False),
    sa.PrimaryKeyConstraint('id')
    )
    op.create_index(op.f('ix_free_email_providers_domain'), 'free_email_providers', ['domain'], unique=False)
    ### end Alembic commands ###

    with open(os.path.join(PROJECT_ROOT, 'free.emails'), 'r') as f:
        FREE_EMAILS_SET = set(r.strip() for r in f.readlines())

    # Fill the data
    for domains in chunks(1000, FREE_EMAILS_SET):
        op.bulk_insert(table, [{'domain': d} for d in domains])
Example #29
def score_pat(sub, patient):
    sub_row = sub.loc[patient]
    truth_rows = st1_test_df[st1_test_df.patientId == patient].dropna()
    truth_arr = truth_rows[['x', 'y', 'width', 'height']].values
    if isinstance(sub_row, float):  # a missing prediction string comes through as NaN
        if truth_rows.empty:
            return np.nan
        else:
            return 0
    elif truth_rows.empty:
        return 0
    else:
        bboxes = list(funcy.chunks(5, lmap(float, sub_row.strip().split(' '))))
        scores = np.array(bboxes)[:, 0]
        arr = np.array(bboxes)[:, 1:]
        return map_iou(truth_arr, arr, scores)
Example #30
def get_1_or_2(x, cut1, cut2):
    """Unused"""
    if len(x) == 0:
        return np.nan
    new_dets = []
    x = lmap(float, x)
    chunks = list(funcy.chunks(5, x))
    for i, c in enumerate(chunks):
        if c[0] > cut1:
            new_dets.append(c)
        elif (i == 1) and c[0] > cut2:
            assert len(new_dets) == 1
            new_dets.append(c)
        else:
            break
    return ' '.join(str(v) for v in funcy.flatten(new_dets))
Example #31
def binary_write(differences, features_number, output_file):
    chunking_time_logger = TimeLogger(task_name='Chunking')
    differences = differences.flatten('F')
    differences = np.append(differences, features_number)
    differences = struct.pack('=%df' % differences.size, *differences)

    chunk_size = 10000000
    difference_chunks = funcy.chunks(chunk_size, differences)
    chunking_time_logger.finish()

    chunk_counter = 1
    for difference_chunk in difference_chunks:
        with open(output_file, 'ab') as f:
            difference_chunk_time_logger = TimeLogger(task_name='Write difference %d-th chunk' % chunk_counter)
            f.write(difference_chunk)
            difference_chunk_time_logger.finish()
            chunk_counter += 1
Example #32
def parse_page(text):
    tree = html.fromstring(text)
    for item in tree.xpath('//*[@itemtype="https://schema.org/Product"]'):
        cls = classer(item)
        link = item.cssselect('.eventlink a')[0].get('href')
        kwargs = {
            'vendor': Vendor.STRANAVETROV,
            'level': LEVELS_MAP[cls('stangeevent').lower()],
            'url': 'https://stranavetrov.ru' + link,
            'title': cls('eventname'),
            'length': int_or_none(cls('kmevent')),
            'price': cls('eventnewprice') or cls('eventprice', None),
        }

        dates = chunks(2, map(parse_date, find_dates(cls('dataevent'))))
        for start, end in dates:
            yield Item(start=start, end=end, **kwargs)
Example #33
    def url_info(self, domains, *categories):
        """ Wrapper over `AwisApi.url_info` for parallel processing.
        """
        if self.closed:
            raise AttributeError('Session is closed')

        CHUNK_SIZE = 5  # AWIS's limit

        def request(domains, categories):
            logger.info('AWIS request for %s', domains)
            return self.api.url_info(domains, *categories)

        with ThreadPoolExecutor(self.workers_count) as executor:
            tasks = [
                executor.submit(request, chunk, categories)
                for chunk in chunks(CHUNK_SIZE, self.handle_cache(domains))
            ]

            for future in as_completed(tasks):
                self.session_list_of_raw_result.append(future.result())
Example #34
    def update_users(self, users_data, prefix=None):
        """ Uses Intercom's bulk update.
        """
        CHUNK_SIZE = 50  # Intercom's limitation

        @requests_retry
        def request(chunk_of_users_data):
            url = '{0}/bulk/users'.format(self.base_url)
            response = session.post(
                url,
                json={'items': [
                    {
                        'method': 'post',
                        'data_type': 'user',
                        'data': apply_prefix_for_user_data(ch, prefix)
                    } for ch in chunk_of_users_data]
                },
                auth=self.auth,
                headers=self.get_headers(),
                timeout=TIMEOUT)
            # TODO: re-raise custom exception for 429 HTTP error
            # for further handling (e.g. retry celery task)
            response.raise_for_status()
            result = response.json()

            try:
                status_url = result['links']['self']
                logger.debug('Bulk update status: %s', status_url)
            except KeyError:
                logger.error('Weird response from Intercom: %r', result)

            return result

        with self.get_executor() as executor:
            for _ in executor.map(request, chunks(CHUNK_SIZE, users_data)):
                pass
Example #35
File: build.py Project: bozzmob/dxr
 def path_chunks(tree):
     """Return an iterable of worker-sized iterables of paths."""
     return chunks(500, unignored(tree.source_folder,
                                  tree.ignore_paths,
                                  tree.ignore_filenames))