Example #1
    def _merge(self, matches):
        # get matches up to and including first important_match
        #   but if no important_match, then all matches are important_matches
        relevant_matches = self._first_important_matches(matches)

        # get individual lines from important_matches that were marked important
        # these will be prepended to the final result
        def get_marked_lines(match, marker):
            return tuple(line
                         for line, flag in zip(match.value(self.__class__),
                                               match.valueflags(self.__class__))
                         if flag is marker)
        top_lines = concat(get_marked_lines(m, ParameterFlag.top) for m in relevant_matches)

        # also get lines that were marked as bottom, but reverse the match order so that lines
        # coming earlier will ultimately be last
        bottom_lines = concat(get_marked_lines(m, ParameterFlag.bottom) for m in
                              reversed(relevant_matches))

        # now, concat all lines, while reversing the matches
        #   reverse because elements closer to the end of search path take precedence
        all_lines = concat(m.value(self.__class__) for m in reversed(relevant_matches))

        # stack top_lines + all_lines, then de-dupe
        top_deduped = tuple(unique(concatv(top_lines, all_lines)))

        # take the top-deduped lines, reverse them, and concat with reversed bottom_lines
        # this gives us the reverse of the order we want, but almost there
        # NOTE: for a line value marked both top and bottom, the bottom marker will win out
        #       for the top marker to win out, we'd need one additional de-dupe step
        bottom_deduped = unique(concatv(reversed(tuple(bottom_lines)), reversed(top_deduped)))

        # just reverse, and we're good to go
        return tuple(reversed(tuple(bottom_deduped)))
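
A minimal sketch (not from conda itself) of the ordering trick above, using plain tuples in place of the real match objects and assuming toolz's concatv and unique:

from toolz import concatv, unique

top_lines = ("a",)                  # lines marked ParameterFlag.top
bottom_lines = ("c",)               # lines marked ParameterFlag.bottom
all_lines = ("b", "a", "c", "d")    # all lines, already in reversed match order

# unique() keeps the first occurrence, so stacking top_lines in front pins them to the top
top_deduped = tuple(unique(concatv(top_lines, all_lines)))          # ('a', 'b', 'c', 'd')

# reversing before the second pass lets bottom-marked lines win the last slots
bottom_deduped = unique(concatv(reversed(tuple(bottom_lines)), reversed(top_deduped)))
result = tuple(reversed(tuple(bottom_deduped)))                     # ('a', 'b', 'd', 'c')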
Example #2
def convert_cat_codes(s, fmt):
    unq_lvls = list(unique([fmt[k] for k in sorted(fmt.keys())]))
    c = (pd.to_numeric(s, downcast='integer').astype('category').pipe(
        lambda xf: xf.cat.rename_categories([
            fmt[k] for k in sorted(xf.unique().dropna())
        ])).cat.set_categories(unq_lvls))
    return c
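
A hypothetical usage (not from the original source), assuming fmt maps integer codes to labels and that the snippet's own imports (pandas as pd, toolz's unique) are in scope:

import pandas as pd

fmt = {1: 'low', 2: 'medium', 3: 'high'}
c = convert_cat_codes(pd.Series(['3', '1', '2', '1']), fmt)
assert list(c) == ['high', 'low', 'medium', 'low']
assert list(c.cat.categories) == ['low', 'medium', 'high']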
Example #3
    async def _process_headers(self, peer: ETHPeer, headers: List[BlockHeader]) -> int:
        await self._download_block_parts(
            [header for header in headers if not _is_body_empty(header)],
            self.request_bodies,
            self._downloaded_bodies,
            _body_key,
            'body')
        self.logger.info("Got block bodies for chain segment")

        missing_receipts = [header for header in headers if not _is_receipts_empty(header)]
        # Post-Byzantium blocks may have identical receipt roots (e.g. when they have the same
        # number of transactions and they all succeed or all fail: ropsten blocks 2503212 and 2503284),
        # so we do this to avoid requesting the same receipts multiple times.
        missing_receipts = list(unique(missing_receipts, key=_receipts_key))
        await self._download_block_parts(
            missing_receipts,
            self.request_receipts,
            self._downloaded_receipts,
            _receipts_key,
            'receipt')
        self.logger.info("Got block receipts for chain segment")

        # FIXME: Get the bodies returned by self._download_block_parts above and use persist_block
        # here.
        for header in headers:
            await self.chaindb.coro_persist_header(header)

        head = await self.chaindb.coro_get_canonical_head()
        self.logger.info("Imported chain segment, new head: #%d", head.block_number)
        if head.hash == peer.head_hash:
            self.logger.info("Fast sync with %s completed", peer)
            self._sync_complete.set()
        return head.block_number
Example #4
def terms(
    doclike: types.DocLike,
    *,
    ngs: Optional[int | Collection[int] | types.DocLikeToSpans] = None,
    ents: Optional[bool | types.DocLikeToSpans] = None,
    ncs: Optional[bool | types.DocLikeToSpans] = None,
    dedupe: bool = True,
) -> Iterable[Span]:
    """
    Extract one or multiple types of terms -- ngrams, entities, and/or noun chunks --
    from ``doclike`` as a single, concatenated collection, with optional deduplication
    of spans extracted by more than one type.

    .. code-block:: pycon

        >>> extract.terms(doc, ngs=2, ents=True, ncs=True)
        >>> extract.terms(doc, ngs=lambda doc: extract.ngrams(doc, n=2))
        >>> extract.terms(doc, ents=extract.entities)
        >>> extract.terms(doc, ents=partial(extract.entities, include_types="PERSON"))

    Args:
        doclike
        ngs: N-gram terms to be extracted.
            If one or multiple ints, :func:`textacy.extract.ngrams(doclike, n=ngs)` is
            used to extract terms; if a callable, ``ngs(doclike)`` is used to extract
            terms; if None, no n-gram terms are extracted.
        ents: Entity terms to be extracted.
            If True, :func:`textacy.extract.entities(doclike)` is used to extract terms;
            if a callable, ``ents(doclike)`` is used to extract terms;
            if None, no entity terms are extracted.
        ncs: Noun chunk terms to be extracted.
            If True, :func:`textacy.extract.noun_chunks(doclike)` is used to extract
            terms; if a callable, ``ncs(doclike)`` is used to extract terms;
            if None, no noun chunk terms are extracted.
        dedupe: If True, deduplicate terms whose spans are extracted by multiple types
            (e.g. a span that is both an n-gram and an entity), as identified by
            identical (start, stop) indexes in ``doclike``; otherwise, don't.

    Returns:
        Next term from ``doclike``, in order of n-grams then entities then noun chunks,
        with each collection's terms given in order of appearance.

    Note:
        This function is *not* to be confused with keyterm extraction, which leverages
        statistics and algorithms to quantify the "key"-ness of terms before returning
        the top-ranking terms. There is no such scoring or ranking here.

    See Also:
        - :func:`textacy.extract.ngrams()`
        - :func:`textacy.extract.entities()`
        - :func:`textacy.extract.noun_chunks()`
        - :mod:`textacy.extract.keyterms`
    """
    extractors = _get_extractors(ngs, ents, ncs)
    terms_ = itertoolz.concat(extractor(doclike) for extractor in extractors)
    if dedupe is True:
        terms_ = itertoolz.unique(terms_, lambda span: (span.start, span.end))
    for term in terms_:
        yield term
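
A plain-data sketch of the dedupe step, with tuples standing in for spaCy spans keyed by their (start, end) indexes; assumes cytoolz is available (plain toolz works identically):

from cytoolz import itertoolz

spans = [(0, 2, "ngram"), (3, 4, "ngram"), (0, 2, "entity")]
deduped = list(itertoolz.unique(spans, key=lambda s: (s[0], s[1])))
assert deduped == [(0, 2, "ngram"), (3, 4, "ngram")]  # the first extractor's span wins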
Example #5
 def conda_build_local_paths(self):
     # does file system reads to make sure paths actually exist
     return tuple(
         unique(full_path for full_path in (expand(d) for d in (
             self._croot,
             self.bld_path,
             self.conda_build.get('root-dir'),
             join(self.root_prefix, 'conda-bld'),
             '~/conda-bld',
         ) if d) if isdir(full_path)))
Example #6
 def conda_build_local_paths(self):
     # does file system reads to make sure paths actually exist
     return tuple(unique(full_path for full_path in (
         expand(d) for d in (
             self._croot,
             self.bld_path,
             self.conda_build.get('root-dir'),
             join(self.root_prefix, 'conda-bld'),
             '~/conda-bld',
         ) if d
     ) if isdir(full_path)))
Example #7
def validate_cats_for_fmt(x, fmtid, convfn):
    fmt = FORMATS[fmtid]
    fmt_lvls = list(unique([fmt[k] for k in sorted(fmt.keys())]))
    xs = pd.Series(x, name=fmtid)
    xc = convfn(xs, fmt)
    assert type(xc) == type(xs)
    assert xc.dtype.name == 'category'
    assert list(xc.cat.categories) == fmt_lvls
    vc = xc.value_counts().to_dict()
    assert set(vc.keys()) == set(fmt.values())
    return xc
Example #8
 def _skip_empty_and_duplicated_receipts(
         self,
         headers: List[BlockHeader]) -> Generator[BlockHeader, None, None]:
     # Post-Byzantium blocks may have identical receipt roots (e.g. when they have the same
      # number of transactions and they all succeed or all fail: ropsten blocks 2503212 and 2503284), so
     # we have an extra check here to avoid requesting those receipts multiple times.
     headers = list(unique(headers,
                           key=operator.attrgetter('receipt_root')))
     for header in headers:
         if (header.receipt_root != self.chaindb.empty_root_hash
                 and header.receipt_root not in self._pending_receipts):
             yield header
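
A plain-data sketch of deduping by attribute with unique() and attrgetter; Header here is a stand-in namedtuple, not the real BlockHeader:

import operator
from collections import namedtuple

from toolz import unique

Header = namedtuple('Header', ['block_number', 'receipt_root'])
headers = [Header(1, b'aa'), Header(2, b'bb'), Header(3, b'aa')]  # 1 and 3 share a receipt root
deduped = list(unique(headers, key=operator.attrgetter('receipt_root')))
assert [h.block_number for h in deduped] == [1, 2]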
Example #9
def n_unique_words(doc_or_words: Union[Doc, Iterable[Token]]) -> int:
    """
    Compute the number of *unique* words in a document.

    Args:
        doc_or_words: If a spaCy ``Doc``, non-punctuation tokens (words) are extracted;
            if an iterable of spaCy ``Token`` s, all are included as-is.
    """
    words = _get_words(doc_or_words)
    # NOTE: this stdlib solution is slower than itertoolz for docs with ~250+ words
    # so let's take a small hit on short docs for the sake of big wins on long docs
    # return len({word.lower for word in words})
    return itertoolz.count(itertoolz.unique(word.lower for word in words))
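
The same counting idiom on plain strings rather than spaCy tokens, assuming cytoolz (plain toolz behaves identically):

from cytoolz import itertoolz

words = ["The", "cat", "sat", "on", "the", "mat"]
n = itertoolz.count(itertoolz.unique(w.lower() for w in words))
assert n == len({w.lower() for w in words}) == 5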
Example #10
    async def _process_headers(self, peer: ETHPeer, headers: List[BlockHeader]) -> int:
        await self._download_block_parts(
            [header for header in headers if not _is_body_empty(header)],
            self.request_bodies,
            self._downloaded_bodies,
            _body_key,
            'body')
        self.logger.info("Got block bodies for chain segment")

        missing_receipts = [header for header in headers if not _is_receipts_empty(header)]
        # Post-Byzantium blocks may have identical receipt roots (e.g. when they have the same
        # number of transactions and they all succeed or all fail: ropsten blocks 2503212 and 2503284),
        # so we do this to avoid requesting the same receipts multiple times.
        missing_receipts = list(unique(missing_receipts, key=_receipts_key))
        await self._download_block_parts(
            missing_receipts,
            self.request_receipts,
            self._downloaded_receipts,
            _receipts_key,
            'receipt')
        self.logger.info("Got block receipts for chain segment")

        # FIXME: Get the bodies returned by self._download_block_parts above and use persist_block
        # here.
        for header in headers:
            await self.chaindb.coro_persist_header(header)

        head = await self.chaindb.coro_get_canonical_head()
        self.logger.info("Imported chain segment, new head: #%d", head.block_number)
        # Quite often the header batch we receive here includes headers past the peer's reported
        # head (via the NewBlock msg), so we can't compare our head's hash to the peer's in
        # order to see if the sync is completed. Instead we just check that we have the peer's
        # head_hash in our chain.
        try:
            await self.chaindb.coro_get_block_header_by_hash(peer.head_hash)
        except HeaderNotFound:
            pass
        else:
            self.logger.info("Fast sync with %s completed", peer)
            self._sync_complete.set()

        return head.block_number
Example #11
    def transform(
            self,
            doclikes: Iterable[types.DocLike]) -> Iterable[Tuple[str, ...]]:
        """
        Convert a sequence of spaCy Docs or Spans into an ordered, nested sequence
        of terms as strings.

        Args:
            doclikes

        Yields:
            Ordered sequence of terms as strings for next Doc or Span.
        """
        normalize_ = self.normalize
        for doclike in doclikes:
            terms = itertoolz.concat(
                tokenizer(doclike) for tokenizer in self.tokenizers)
            if self.dedupe is True:
                terms = itertoolz.unique(terms, lambda span:
                                         (span.start, span.end))
            yield tuple(normalize_(term) for term in terms)
Example #12
def test_unique():
    assert tuple(unique((1, 2, 3))) == (1, 2, 3)
    assert tuple(unique((1, 2, 1, 3))) == (1, 2, 3)
    assert tuple(unique((1, 2, 3), key=iseven)) == (1, 2)
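
For reference, a self-contained version of the key-based case; iseven is presumably a small parity helper defined alongside the test:

from toolz import unique

def iseven(n):
    return n % 2 == 0

assert tuple(unique((1, 2, 3), key=iseven)) == (1, 2)  # 3 is dropped: an odd value was already seen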
Example #13
    async def _sync(self, peer: ETHPeer) -> None:
        head = await self.chaindb.coro_get_canonical_head()
        head_td = await self.chaindb.coro_get_score(head.hash)
        if peer.head_td <= head_td:
            self.logger.info(
                "Head TD (%d) announced by %s not higher than ours (%d), not syncing",
                peer.head_td, peer, head_td)
            return

        self.logger.info("Starting sync with %s", peer)
        # FIXME: Fetch a batch of headers, in reverse order, starting from our current head, and
        # find the common ancestor between our chain and the peer's.
        start_at = max(0, head.block_number - eth.MAX_HEADERS_FETCH)
        while True:
            self.logger.info("Fetching chain segment starting at #%d",
                             start_at)
            peer.sub_proto.send_get_block_headers(start_at,
                                                  eth.MAX_HEADERS_FETCH,
                                                  reverse=False)
            try:
                headers = await wait_with_token(self._new_headers.get(),
                                                peer.wait_until_finished(),
                                                token=self.cancel_token,
                                                timeout=self._reply_timeout)
            except TimeoutError:
                self.logger.warn(
                    "Timeout waiting for header batch from %s, aborting sync",
                    peer)
                await peer.stop()
                break

            if peer.is_finished():
                self.logger.info("%s disconnected, aborting sync", peer)
                break

            self.logger.info("Got headers segment starting at #%d", start_at)

            # TODO: Process headers for consistency.

            await self._download_block_parts(
                [header for header in headers if not _is_body_empty(header)],
                self.request_bodies, self._downloaded_bodies, _body_key,
                'body')

            self.logger.info(
                "Got block bodies for chain segment starting at #%d", start_at)

            missing_receipts = [
                header for header in headers if not _is_receipts_empty(header)
            ]
            # Post-Byzantium blocks may have identical receipt roots (e.g. when they have the same
            # number of transactions and they all succeed or all fail: ropsten blocks 2503212 and 2503284),
            # so we do this to avoid requesting the same receipts multiple times.
            missing_receipts = list(unique(missing_receipts,
                                           key=_receipts_key))
            await self._download_block_parts(missing_receipts,
                                             self.request_receipts,
                                             self._downloaded_receipts,
                                             _receipts_key, 'receipt')

            self.logger.info(
                "Got block receipts for chain segment starting at #%d",
                start_at)

            for header in headers:
                await self.chaindb.coro_persist_header(header)
                start_at = header.block_number + 1

            self.logger.info("Imported chain segment, new head: #%d",
                             start_at - 1)
            head = await self.chaindb.coro_get_canonical_head()
            if head.hash == peer.head_hash:
                self.logger.info("Chain sync with %s completed", peer)
                self._sync_complete.set()
                break
Example #14
def convert_cat_force(s, fmt):
    unq_lvls = list(unique([fmt[k] for k in sorted(fmt.keys())]))
    c = (pd.to_numeric(s, downcast='integer').replace(
        to_replace=fmt).astype('category').cat.set_categories(unq_lvls))
    return c
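
A hypothetical usage (not from the original source) showing why unique() is applied to the level list: two codes map to the same label, so the label may appear only once among the categories. Assumes the snippet's own imports (pandas as pd, toolz's unique) are in scope.

import pandas as pd

fmt = {1: 'yes', 2: 'no', 3: 'yes'}
c = convert_cat_force(pd.Series(['1', '2', '3']), fmt)
assert list(c) == ['yes', 'no', 'yes']
assert list(c.cat.categories) == ['yes', 'no']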