def build(obj, *applicators):
    """
    Thread ``obj`` through each of the ``applicators`` in order.

    When ``obj`` is an instance of :class:`~eth.chains.base.BaseChain`, a
    ``copy()`` step is placed ahead of the applicators so that they operate
    on a copy and the provided chain instance is not mutated.
    """
    steps = applicators
    if isinstance(obj, BaseChain):
        # Prepend the copy step for chain instances.
        steps = (copy(),) + applicators
    return pipe(obj, *steps)
def test_chain_builder_initialize_chain_default(chain_class):
    """A genesis built without parameters carries the default header values."""
    chain = pipe(chain_class, genesis())

    header = chain.get_canonical_head()
    assert header == chain.get_canonical_block_by_number(0).header

    # Fields checked before the timestamp, in the original order.
    leading_expectations = (
        ('parent_hash', constants.GENESIS_PARENT_HASH),
        ('uncles_hash', constants.EMPTY_UNCLE_HASH),
        ('coinbase', constants.GENESIS_COINBASE),
        ('state_root', constants.BLANK_ROOT_HASH),
        ('transaction_root', constants.BLANK_ROOT_HASH),
        ('receipt_root', constants.BLANK_ROOT_HASH),
        ('bloom', 0),
        ('difficulty', 1),
        ('block_number', constants.GENESIS_BLOCK_NUMBER),
        ('gas_limit', constants.GENESIS_GAS_LIMIT),
        ('gas_used', 0),
    )
    for field_name, expected in leading_expectations:
        assert getattr(header, field_name) == expected

    # account for runtime. should run in less than few seconds and should be
    # effectively "now"
    assert abs(header.timestamp - time.time()) < 2

    trailing_expectations = (
        ('extra_data', constants.GENESIS_EXTRA_DATA),
        ('mix_hash', constants.GENESIS_MIX_HASH),
        ('nonce', constants.GENESIS_NONCE),
    )
    for field_name, expected in trailing_expectations:
        assert getattr(header, field_name) == expected
def guess_language(doc, output="best"):
    """Guess the language of a document.

    A statistical method (the langid library) is used to determine the
    language ``doc`` is written in. The ``output`` argument selects between
    a single best guess and a full ranking of candidate languages.

    Parameters
    ----------
    doc : document

    output : string
        Either "best" to get a pair (code, prob) giving the two-letter code
        of the most probable language and its probability, or "rank" for a
        list of such pairs for all languages in the model.

    Raises
    ------
    ValueError
        If ``output`` is neither "best" nor "rank".
    """
    from langid import classify, rank

    handlers = {"best": classify, "rank": rank}
    if output not in handlers:
        raise ValueError("invalid parameter value output=%r" % output)

    return pipe(doc, fetch, handlers[output])
def tokenize(doc):
    """Split a document into word tokens.

    The 'punkt' resource is requested through ``nltk_download`` first; the
    fetched document is then handed to NLTK's ``word_tokenize``.
    """
    nltk_download('punkt')
    text = fetch(doc)
    return nltk.word_tokenize(text)
def movie_review_emotions(doc, **kwargs):
    """Emotion (fine-grained sentiment) tagger for movie reviews.

    The training data for this function is that of Buitinck et al., with the
    training and test data concatenated. The algorithm is SVMs in a binary
    relevance (one-vs-rest) combination.

    The training data (and this function) may be used for academic/research
    purposes only. Pass ``for_academic_research=True`` to accept the license.

    Returns
    -------
    tagged : list of (string, list of string)
        A list of (sentence, labels) pairs. Each sentence may have zero or
        more labels.

    Raises
    ------
    RuntimeError
        If neither ``for_academic_research`` nor ``unittest`` is passed as a
        true value.

    References
    ----------
    L. Buitinck, J. van Amerongen, E. Tan and M. de Rijke (2015).
    Multi-emotion detection in user-generated reviews. Proc. ECIR.
    https://staff.fnwi.uva.nl/m.derijke/wp-content/papercite-data/pdf/buitinck-multi-emotion-2015.pdf
    """
    license_accepted = kwargs.get('for_academic_research', False)
    running_unittest = kwargs.get('unittest', False)
    if not license_accepted and not running_unittest:
        raise RuntimeError(
            "This functionality is only available for"
            " academic research. Please use movie_review_emotions(doc,"
            " for_academic_research=True) to use this function for"
            " that purpose.")

    from ._emotion import classify

    nltk_download('punkt')
    sentences = nltk.sent_tokenize(fetch(doc))
    labels = classify(sentences)
    return list(zip(sentences, labels))
def alpino(doc, output="raw"):
    """Wrapper around the Alpino (dependency) parser for Dutch.

    Expects an environment variable ALPINO_HOME to point at the Alpino
    installation dir.

    The script uses the 'dependencies' end_hook to generate lemmata and the
    dependency structure.

    Parameters
    ----------
    output : string
        If 'raw', returns the raw output from Alpino itself.
        If 'saf', returns a SAF dictionary.

    Raises
    ------
    ValueError
        If ``output`` is not one of the supported formats.

    References
    ----------
    `Alpino homepage <http://www.let.rug.nl/vannoord/alp/Alpino/>`_.
    """
    from ._alpino import tokenize, parse_raw, interpret_parse

    # Post-processing applied to the raw parse, keyed by output format.
    transformers = {"raw": identity, "saf": interpret_parse}
    if output not in transformers:
        raise ValueError("Unknown output format %r" % output)

    return pipe(doc, fetch, tokenize, parse_raw, transformers[output])
def test_chain_builder_initialize_chain_with_params(chain_class):
    """Genesis header parameters passed to ``genesis`` end up in the header."""
    genesis_params = {'difficulty': 12345}
    chain = pipe(chain_class, genesis(params=genesis_params))

    head = chain.get_canonical_head()
    genesis_header = chain.get_canonical_block_by_number(0).header

    assert head == genesis_header
    assert head.difficulty == 12345
def x_hashing_pre(self, line):
    """Clean a raw text line and return the lower-cased, stripped result.

    When ``self.type`` is "LID" the result is truncated to
    ``self.sample_size`` characters.
    """
    # Remove links, at-mentions, hashtags and mark-up (in that order)
    for pattern in (r"http\S+", r"@\S+", r"#\S+", "<[^>]*>"):
        line = re.sub(pattern, "", line)

    # Remove "RT" markers
    line = line.replace(" RT", "").replace("RT ", "")

    # Remove emojis
    line = re.sub(self.myre, "", line)

    # Remove punctuation and extra spaces
    cleaners = (
        preprocessing.strip_tags,
        preprocessing.strip_punctuation,
        preprocessing.strip_numeric,
        preprocessing.strip_non_alphanum,
        preprocessing.strip_multiple_whitespaces,
    )
    line = ct.pipe(line, *cleaners)

    # Lower-case and strip surrounding whitespace
    line = line.lower().strip().lstrip()

    # Truncate samples for LID
    if self.type == "LID":
        line = line[0:self.sample_size]

    return line
def find_matching_fn_abi(abi, fn_identifier=None, args=None, kwargs=None):
    """Return the single ABI entry matching the identifier and arguments.

    ``FallbackFn`` as identifier short-circuits to the fallback function
    ABI. Otherwise the ABI is narrowed by name (if given) and, when ``args``
    or ``kwargs`` is supplied, by argument count and encodability.

    Raises ``TypeError`` for an unsupported identifier and ``ValueError``
    when zero or more than one candidate remains.
    """
    filters = []

    if fn_identifier:
        if fn_identifier is FallbackFn:
            return get_fallback_func_abi(abi)
        if not is_text(fn_identifier):
            raise TypeError("Unsupported function identifier")
        filters.append(functools.partial(filter_by_name, fn_identifier))

    if not (args is None and kwargs is None):
        args = tuple() if args is None else args
        kwargs = {} if kwargs is None else kwargs

        num_arguments = len(args) + len(kwargs)
        filters.append(
            functools.partial(filter_by_argument_count, num_arguments)
        )
        filters.append(
            functools.partial(filter_by_encodability, args, kwargs)
        )

    candidates = pipe(abi, *filters)

    if len(candidates) == 1:
        return candidates[0]
    if not candidates:
        raise ValueError("No matching functions found")
    raise ValueError("Multiple functions found")
def find_matching_fn_abi(abi, fn_name=None, args=None, kwargs=None):
    """Return the single function ABI entry matching the name and arguments.

    The ABI is first restricted to entries of type 'function', then narrowed
    by name (if ``fn_name`` is given) and, when ``args`` or ``kwargs`` is
    supplied, by argument count and encodability.

    Raises ``ValueError`` when zero or more than one candidate remains.
    """
    filters = []

    if fn_name:
        filters.append(functools.partial(filter_by_name, fn_name))

    if args is not None or kwargs is not None:
        if args is None:
            args = tuple()
        if kwargs is None:
            kwargs = {}

        num_arguments = len(args) + len(kwargs)
        filters.extend([
            functools.partial(filter_by_argument_count, num_arguments),
            functools.partial(filter_by_encodability, args, kwargs),
        ])

    # Feed the type-filtered entries into the remaining filters. Previously
    # the pipeline restarted from the raw ``abi``, which silently discarded
    # the result of the 'function' type filter.
    function_abis = filter_by_type('function', abi)
    function_candidates = pipe(function_abis, *filters)

    if len(function_candidates) == 1:
        return function_candidates[0]
    if not function_candidates:
        raise ValueError("No matching functions found")
    else:
        raise ValueError("Multiple functions found")
def fetch_candidate_head(self):
    """Return the next candidate head log entry.

    A recorded-but-unchecked log whose score equals ``self.current_score``
    is popped and returned if one exists. Otherwise logs are pulled with
    ``self.get_next_log()`` until one flagged ``is_new_head`` is found; the
    logs skipped on the way are recorded as unchecked.

    Raises ``NoCandidateHead`` when ``get_next_log`` signals
    ``NextLogUnavailable``.
    """
    # Try to return a log that has the score that we are checking for,
    # checking in order of oldest to most recent.
    # Snapshot the (index, entry) pairs and walk them back to front.
    indexed_logs = tuple(reversed(tuple(enumerate(self.unchecked_logs))))
    wanted_score = self.current_score

    for position, entry in indexed_logs:
        if entry['score'] == wanted_score:
            return self.unchecked_logs.pop(position)

    # If no further recorded but unchecked logs exist, go to the next
    # is_new_head = true log
    while True:
        try:
            log_entry = self.get_next_log()
        # TODO: currently just raise when there is no log anymore
        except NextLogUnavailable:
            # TODO: should returns the genesis collation instead or just leave it?
            raise NoCandidateHead("No candidate head available")
        if log_entry['is_new_head']:
            break
        self.unchecked_logs.append(log_entry)

    self.current_score = log_entry['score']
    return log_entry
def map_abi_data(normalizers, types, data):
    '''
    Apply each of the normalizers to ``data``, in the context of the
    relevant ABI types.

    Each normalizer is in the format:

    def normalizer(datatype, data):
        # Conditionally modify data
        return (datatype, data)

    Where datatype is a valid ABI type string, like "uint".

    In case of an array, like "bool[2]", normalizer will receive `data`
    as an iterable of typed data, like `[("bool", True), ("bool", False)]`.

    Internals
    ---

    This is accomplished by:

    1. Decorating the data tree with types
    2. Recursively mapping each of the normalizers to the data
    3. Stripping the types back out of the tree
    '''
    # 1. decorate with types
    steps = [abi_data_tree(types)]
    # 2. one tree-mapping step per normalizer, in the given order
    steps.extend(data_tree_map(normalizer) for normalizer in normalizers)
    # 3. strip the types again
    steps.append(partial(recursive_map, strip_abi_type))

    return pipe(data, *steps)
def montage_stream(ims, montage_order=None, channel_order=[0, 1, 2],
                   clear_none=True):
    """Montage multichannel images from a stream of single-channel fields.

    Suppose the input is a list:

    ```
    ims = [green1a, blue1a, red1a, green1b, blue1b, red1b,
           green2a, blue2a, red2a, green2b, blue2b, red2b]
    ```

    with channel order ``[2, 0, 1]`` and montage order ``[1, 0]``, then
    the output will be:

    ```
    [rgb1_ba, rgb2_ba]
    ```

    Parameters
    ----------
    ims : iterator of array, shape (M, N)
        A list of images in which consecutive images represent single
        channels of the same image. (See example.)
    montage_order : array-like of int, optional
        The order of the montage images (in 1D or 2D). Defaults to
        ``cellomics.SPIRAL_CLOCKWISE_RIGHT_25``.
    channel_order : list of int, optional
        The order in which the channels appear.
    clear_none : bool, optional
        If true, ``None`` entries in `channel_order` are not counted when
        working out how many input images make up one multi-channel image.

    Returns
    -------
    montaged_stream : iterator of arrays
        An iterator of the images composed into multi-channel montages.

    Examples
    --------
    >>> images = (i * np.ones((4, 5), dtype=np.uint8) for i in range(24))
    >>> montaged = list(montage_stream(images, [[0, 1], [2, 3]], [2, 0, 1]))
    >>> len(montaged)
    2
    >>> montaged[0].shape
    (8, 10, 3)
    >>> montaged[0][0, 0, :]
    array([2, 0, 1], dtype=uint8)
    >>> montaged[0][4, 5, :]
    array([11,  9, 10], dtype=uint8)
    >>> montaged[1][4, 5, :]
    array([23, 21, 22], dtype=uint8)
    """
    if montage_order is None:
        montage_order = cellomics.SPIRAL_CLOCKWISE_RIGHT_25
    montage_order = np.array(montage_order)
    tiles_per_montage = montage_order.size

    if clear_none:
        images_per_tile = sum(1 for ch in channel_order if ch is not None)
    else:
        images_per_tile = len(channel_order)

    # group channels -> stack -> group tiles -> montage
    group_channels = c.partition(images_per_tile)
    to_multichannel = c.map(stack_channels(order=channel_order))
    group_tiles = c.partition(tiles_per_montage)
    to_montage = c.map(montage(order=montage_order))

    return tz.pipe(ims, group_channels, to_multichannel,
                   group_tiles, to_montage)
def movie_review_emotions(doc, **kwargs):
    """Emotion (fine-grained sentiment) tagger for movie reviews.

    The training data for this function is that of Buitinck et al., with the
    training and test data concatenated. The algorithm is SVMs in a binary
    relevance (one-vs-rest) combination.

    The training data (and this function) may be used for academic/research
    purposes only. Pass ``for_academic_research=True`` to accept the license.

    Returns
    -------
    tagged : list of (string, list of string)
        A list of (sentence, labels) pairs. Each sentence may have zero or
        more labels.

    Raises
    ------
    RuntimeError
        If neither ``for_academic_research`` nor ``unittest`` is passed as a
        true value.

    References
    ----------
    L. Buitinck, J. van Amerongen, E. Tan and M. de Rijke (2015).
    Multi-emotion detection in user-generated reviews. Proc. ECIR.
    https://staff.fnwi.uva.nl/m.derijke/wp-content/papercite-data/pdf/buitinck-multi-emotion-2015.pdf
    """
    allowed = any(
        kwargs.get(flag, False)
        for flag in ("for_academic_research", "unittest")
    )
    if not allowed:
        raise RuntimeError(
            "This functionality is only available for"
            " academic research. Please use movie_review_emotions(doc,"
            " for_academic_research=True) to use this function for"
            " that purpose."
        )

    from ._emotion import classify

    nltk_download("punkt")
    text = fetch(doc)
    sentences = nltk.sent_tokenize(text)
    return list(zip(sentences, classify(sentences)))
def get_text_from_xml_file(filename):
    """
    Extract post text from a Stackoverflow posts data dump stored as XML.

    The file is parsed with ``minidom``; every ``row`` element under the
    first ``posts`` element yields one string made of its "Title" attribute
    (empty string when absent) and its "Body" attribute, joined by three
    newlines.

    Returns a stream of those strings.
    """

    @tlz.curry
    def _attr(key, xml_element):
        return xml_element.attributes[key].value

    @tlz.curry
    def _attr_or_default(key, xml_element, default=''):
        try:
            return _attr(key, xml_element)
        except KeyError:
            return default

    def _first_posts_element(document):
        return document.getElementsByTagName("posts")[0]

    def _rows(posts_element):
        return posts_element.getElementsByTagName("row")

    # "Title" may be missing on a row; "Body" is required.
    title_and_body = tlz.juxt(_attr_or_default("Title"), _attr("Body"))

    return tlz.pipe(
        filename,
        minidom.parse,  # Not pure
        _first_posts_element,
        _rows,
        c_map(title_and_body),
        c_map('\n\n\n'.join),
    )
def tokenize(doc):
    """Tokenize text into words.

    Requests the "punkt" resource via ``nltk_download`` and then applies the
    NLTK function ``word_tokenize`` to the fetched document.
    """
    nltk_download("punkt")
    return nltk.word_tokenize(fetch(doc))
def token_vectors_pipeline(input_col: str, output_col: str, df: DataFrame, stemmer_func=None):
    """Convert a string column into an array of integer token ids.

    Intermediate columns derived from ``input_col`` are created along the
    way and dropped at the end. If ``stemmer_func`` is given it is applied
    to the tokens right after tokenisation.
    """
    filled_col = input_col + "_filled"
    tokenised_col = input_col + "_tokenised"
    tf_vectors = input_col + "_tf_vectors"

    # note that the tokenizer completely breaks given null input values,
    # hence the null-filling step in front of it
    prepare = [
        partial(fill_nulls_with_empty_string, input_col, filled_col),
        partial(tokenize_words, filled_col, tokenised_col),
    ]

    # optionally stem the tokens
    stem = []
    if stemmer_func:
        stem = [partial(stemmer_func, tokenised_col, tokenised_col)]

    vectorise = [
        partial(rm_empty_strings_from_tokens, tokenised_col, tokenised_col),
        partial(term_frequency_vectors, tokenised_col, tf_vectors),
        partial(sparse_vector_indices, tf_vectors, output_col),
        partial(drop_cols, [filled_col, tokenised_col, tf_vectors]),
    ]

    return pipe(df, *(prepare + stem + vectorise))
def get_aggregation_bitfield(attestation_participants, target_committee_size):
    """Return a bitfield of ``target_committee_size`` with a vote set for
    every committee index in ``attestation_participants``."""
    empty_bitfield = get_empty_bitfield(target_committee_size)
    vote_setters = [
        set_voted(index=committee_index)
        for committee_index in attestation_participants
    ]
    return pipe(empty_bitfield, *vote_setters)
def fetch_candidate_head(self):
    """Return the next candidate head log entry.

    A recorded-but-unchecked log whose score equals ``self.current_score``
    is popped and returned if one exists. Otherwise logs are pulled with
    ``self.get_next_log()`` until one flagged ``is_new_head`` is found; the
    logs skipped on the way are recorded as unchecked.
    """
    # Try to return a log that has the score that we are checking for,
    # checking in order of oldest to most recent.
    # Snapshot the (index, entry) pairs and walk them back to front.
    snapshot = tuple(enumerate(self.unchecked_logs))
    wanted_score = self.current_score

    for position, entry in tuple(reversed(snapshot)):
        if entry['score'] == wanted_score:
            return self.unchecked_logs.pop(position)

    # If no further recorded but unchecked logs exist, go to the next
    # is_new_head = true log
    # TODO: currently just raise when there is no log anymore
    log_entry = self.get_next_log()
    while not log_entry['is_new_head']:
        self.unchecked_logs.append(log_entry)
        log_entry = self.get_next_log()

    self.current_score = log_entry['score']
    return log_entry
def int_to_bytes32(value):
    """Encode a non-negative integer as big-endian bytes padded via ``pad32``.

    Raises ``ValueError`` if ``value`` is not an integer (booleans are
    rejected), is negative, or is greater than ``UINT_256_MAX``.
    """
    # bool is a subclass of int, so it has to be excluded explicitly
    is_plain_int = isinstance(value, int) and not isinstance(value, bool)
    if not is_plain_int:
        message = "Value must be an integer: Got: {0}".format(type(value))
        raise ValueError(message)

    if value < 0:
        message = "Value cannot be negative: Got: {0}".format(value)
        raise ValueError(message)

    if value > UINT_256_MAX:
        message = "Value exeeds maximum UINT256 size.  Got: {0}".format(value)
        raise ValueError(message)

    return pad32(int_to_big_endian(value))
def main():
    """Train a small convolutional network on the 'MNIST original' dataset."""
    mnist = fetch_mldata('MNIST original')

    # Reshape the data to (n_samples, 28, 28, 1) float32 and scale by 1/255.
    n_samples = len(mnist.data)
    X = mnist.data.astype(np.float32).reshape((n_samples, 28, 28, 1)) / 255.

    label_binarizer = LabelBinarizer()
    Y = label_binarizer.fit_transform(mnist.target)

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, train_size=60000)

    # The layers are applied in sequence to the input tensor.
    layers = [
        Conv2D(20, (3, 3), padding='same', activation='relu'),
        MaxPooling2D(),
        Conv2D(50, (3, 3), padding='same', activation='relu'),
        MaxPooling2D(),
        Flatten(),
        Dense(500, activation='relu'),
        Dense(10, activation='softmax')
    ]

    network_input = Input((28, 28, 1))
    network_output = toolz.pipe(network_input, *layers)

    model = Model(network_input, network_output)
    model.compile(
        optimizer='rmsprop',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    model.fit(
        X_train,
        Y_train,
        batch_size=128,
        epochs=10,
        validation_data=[X_test, Y_test],
    )
def nlner_conll(doc, **kwargs):
    """Baseline NER tagger for Dutch, based on the CoNLL'02 dataset.

    See http://www.clips.uantwerpen.be/conll2002/ner/ for the dataset and
    its license. Pass ``conll2002_project=True`` to accept the license.

    Raises
    ------
    RuntimeError
        If neither ``conll2002_project`` nor ``unittest`` is passed as a
        true value.

    See also
    --------
    frog: NER tagger and dependency parser for Dutch.

    stanford_ner_tag: NER tagger for English.
    """
    license_accepted = kwargs.get("conll2002_project", False)
    running_unittest = kwargs.get("unittest", False)
    if not license_accepted and not running_unittest:
        raise RuntimeError(
            "This functionality is only available to the"
            " CoNLL'02 project. Please use nlner_conll(doc,"
            " conll2002_project=True) if you are doing research"
            " in the context of the shared CoNLL-2002 shared task."
        )

    from ._nl_conll_ner import ner

    tokens = _tokenize_if_needed(fetch(doc))
    return ner(tokens)
def load_seg_models(models, loaderdict=_LOADERDICT):
    """
    Load every model described in ``models`` and return them as a tuple.

    models - [(kind1, filename1), (kind2, filename2), ...]
    loaderdict - passed through to ``load_seg_model`` for each entry
    """
    def _load(model):
        kind, filename = model[0], model[1]
        return load_seg_model(kind, filename, loaderdict=loaderdict)

    return tuple(_load(model) for model in models)
def decode_and_fix(text, encoding='utf-8'):
    """
    Decode ``text`` liberally and normalise the result.

    The text is passed through ``adv_decode`` (with the given ``encoding``),
    then ``clean_unicode`` and finally ``normize_text``.
    """
    decode = adv_decode(encoding=encoding)
    steps = (decode, clean_unicode, normize_text)
    return tlz.pipe(text, *steps)
def _ecpairing(data):
    """Run the pairing check over ``data`` and return the boolean result.

    ``data`` is consumed in consecutive 192-byte slices. Each slice is
    turned into a processing step by ``_process_point``, and the steps are
    folded over an accumulator that starts at ``bn128.FQ12.one()``. The
    check passes when the final exponentiation of the accumulator equals
    ``bn128.FQ12.one()``.
    """
    # One processing step per 192-byte slice; for empty ``data`` there are
    # no steps and the accumulator stays at one.
    processing_pipeline = (
        _process_point(data[start_idx:start_idx + 192])
        for start_idx in range(0, len(data), 192)
    )
    # The separate ``exponent = bn128.FQ12.one()`` pre-assignment was a dead
    # store (immediately overwritten here) and has been dropped.
    exponent = pipe(bn128.FQ12.one(), *processing_pipeline)

    return bn128.final_exponentiate(exponent) == bn128.FQ12.one()
def std_decode(text, encoding='utf-8', errors='strict'):
    """
    Standardized interface to the standard python string decode method.

    ``text`` is first passed through ``verify_bytestring`` (only byte
    strings are accepted) and the result is decoded with the given
    ``encoding`` and ``errors`` policy.
    """
    checked = verify_bytestring(text)
    return checked.decode(encoding=encoding, errors=errors)
def tags_at(run: int, *other_runs: int, beamline: int = None) -> Tuple[int, Sequence[int]]:
    """
    Return the common hightag and the concatenated tag list of the runs.

    Raises ValueError if ``beamline`` is not given, or if the runs do not
    all share the same hightag.

    Example:
        hightag, tags = tags_at(509700, beamline=3)  # from single run
        hightag, tags = tags_at(509700, 509701, 509702, beamline=3)  # from multiple runs
    """
    if beamline is None:
        raise ValueError("Keyword argument 'beamline' must be given!")

    runs = (run,) + other_runs

    # One hightag per run; they all have to agree.
    hightags: ndarray = fromiter(
        (hightag(beamline, each_run) for each_run in runs),
        dtype='int',
    )
    common = hightags[0]
    if not (hightags == common).all():
        raise ValueError('Not all the runs have a single hightag!')

    tags = tuple(concat(taglist(beamline, each_run) for each_run in runs))
    return hightags[0], tags
def segment_text(segmentfunc, txt, flatten=False):
    """
    Split the text into tokens, then segment each token with ``segmentfunc``.

    The segmented tokens are passed through ``mseg.should_flatten(flatten)``
    and returned as a list.

    Curried.
    """
    stages = (
        tpu.split_and_clean,
        tlzc.map(segmentfunc),
        mseg.should_flatten(flatten),
        list,
    )
    return tlz.pipe(txt, *stages)
def segment_text(model, txt, flatten=True):
    """
    Split the text into tokens, then segment each token with a segmenter
    built from ``model`` by ``mk_segmenter`` (a Flatcat model).

    The segmented tokens are passed through ``mseg.should_flatten(flatten)``
    and returned as a list.

    Curried.
    """
    segment_token = mk_segmenter(model)
    stages = (
        tpu.split_and_clean,
        tlzc.map(segment_token),
        mseg.should_flatten(flatten),
        list,
    )
    return tlz.pipe(txt, *stages)
def test_chain_builder_initialize_chain_with_state_simple(chain_class):
    """Genesis state given as (address, field, value) tuples is applied."""
    initial_state = ((ADDRESS_A, 'balance', 1),)
    chain = pipe(chain_class, genesis(state=initial_state))

    head = chain.get_canonical_head()
    genesis_header = chain.get_canonical_block_by_number(0).header
    assert head == genesis_header

    # A non-empty state must produce a non-blank state root.
    assert head.state_root != constants.BLANK_ROOT_HASH

    account_db = chain.get_vm().state.account_db
    assert account_db.get_balance(ADDRESS_A) == 1
def find_background_illumination(fns, radius=None, input_bitdepth=None,
                                 quantile=0.5, stretch_quantile=0.):
    """Use a set of related images to find uneven background illumination.

    Parameters
    ----------
    fns : list of string
        A list of image file names
    radius : int, optional
        The radius of the structuring element used to find background.
        default: The width or height of the input images divided by 4,
        whichever is smaller.
    input_bitdepth : int, optional
        The bit-depth of the input images. Should be specified if
        non-standard bitdepth images are used in a 16-bit image file, e.g.
        12-bit images. Default is the dtype of the input image.
    quantile : float in [0, 1], optional
        The desired quantile to find background. default: 0.5 (median)
    stretch_quantile : float in [0, 1], optional
        Stretch image to full dtype limit, saturating above this quantile.

    Returns
    -------
    illum : np.ndarray, float, shape (M, N)
        The estimated illumination over the image field.

    See Also
    --------
    `correct_image_illumination`, `correct_multiimage_illumination`.
    """
    # this function follows the "PyToolz" streaming data model to
    # obtain the illumination estimate.
    # first, define the functions for each individual step:
    if input_bitdepth is None:
        in_range = 'image'
    else:
        in_range = (0, 2**input_bitdepth - 1)
    rescale = tz.curry(exposure.rescale_intensity)

    if stretch_quantile > 0:
        normalize = tz.partial(stretchlim, bottom=stretch_quantile)
    else:
        normalize = skimage.img_as_float

    def _load_scaled(fn):
        return tz.pipe(fn, io.imread, rescale(in_range=in_range), normalize)

    # produce a (lazy) stream of properly-scaled images
    ims = (_load_scaled(fn) for fn in fns)

    # take the mean of that stream
    mean_image = mean(ims)

    # return the percentile filter of that mean
    if not radius:
        radius = min(mean_image.shape) // 4
    footprint = morphology.disk(radius)
    return ndi.percentile_filter(mean_image, percentile=(quantile * 100),
                                 footprint=footprint)
def apply_all_link_refs(bytecode: bytes, link_refs: List[Dict[str, Any]], attr_dict: Dict[str, str]) -> bytes:
    """
    Applies all link references corresponding to a valid attr_dict to the bytecode.

    ``bytecode`` is returned unchanged when ``link_refs`` is None. Otherwise
    one link step is built per offset of every reference (using the
    reference's "length" and the ``attr_dict`` value for its "name") and the
    steps are applied to the bytecode in order.
    """
    if link_refs is None:
        return bytecode

    link_fns = []
    for ref in link_refs:
        for offset in ref["offsets"]:
            link_fns.append(
                apply_link_ref(offset, ref["length"], attr_dict[ref["name"]])
            )

    return cytoolz.pipe(bytecode, *link_fns)
def middleware(method, params):
    """Fill default fields into the transaction before forwarding the request.

    For 'eth_call' both the default sender and default gas are filled in;
    for 'eth_estimateGas' and 'eth_sendTransaction' only the default sender.
    Every other method is forwarded untouched.
    """
    # TODO send call to eth-tester without gas, and remove guess_gas entirely
    if method == 'eth_call':
        fillers = (fill_default_from, fill_default_gas)
    elif method in ('eth_estimateGas', 'eth_sendTransaction'):
        fillers = (fill_default_from,)
    else:
        return make_request(method, params)

    # The transaction is the first positional parameter of the request.
    filled_transaction = pipe(params[0], *fillers)
    return make_request(method, [filled_transaction] + params[1:])
def hash(self):
    '''
    RLP-encode this object, hash the encoding with keccak and wrap the
    digest in ``HexBytes``.

    :returns: the hash of the encoded bytestring
    :rtype: ~hexbytes.main.HexBytes
    '''
    encoded = rlp.encode(self)
    digest = keccak(encoded)
    return HexBytes(digest)
def remove_punctuation(line):
    """
    Removes punctuation from corpus.

    The line is run, in order, through the ``preprocessing`` helpers that
    strip tags, punctuation, numerics, non-alphanumerics and repeated
    whitespace.

    :param line: the text to clean
    :return: the cleaned text
    """
    cleaners = (
        preprocessing.strip_tags,
        preprocessing.strip_punctuation,
        preprocessing.strip_numeric,
        preprocessing.strip_non_alphanum,
        preprocessing.strip_multiple_whitespaces,
    )
    return ct.pipe(line, *cleaners)
def get_interactions():
    """Build an attendance matrix from ``data['interactions']``.

    The first row is the header ``['student', <sorted dates>...]``. Every
    further row holds a student followed by one cell per date:
    ``[time_in, time_out]`` of the first interaction on that date,
    ``('', '')`` if the date is among the student's absences, or
    ``(None, None)`` otherwise.
    """
    interactions = data['interactions']
    dates = sorted(set(map(_g('date'), interactions)))

    def _first_times(day_entries):
        first = day_entries[0]
        return [first.time_in, first.time_out]

    def _times_by_date(student_entries):
        by_date = t.groupby(lambda i: i.date, student_entries)
        return t.pipe(by_date, tc.valmap(_first_times))

    # student -> {date -> [time_in, time_out]}
    attendance_by_student = t.pipe(
        interactions,
        tc.groupby(lambda i: i.student),
        tc.valmap(_times_by_date),
    )

    def _cell(student, attendance, dt):
        if dt in attendance:
            return attendance[dt]
        if dt in data['students'][student].absences:
            return ('', '')
        return (None, None)

    mat = [['student'] + dates]
    for student, attendance in attendance_by_student.items():
        row = [student]
        for dt in dates:
            row.append(_cell(student, attendance, dt))
        mat.append(row)

    return {'interactions': mat}
def normize_text(text):
    """
    Normalizes characters and converts all to best matching ASCII
    representation.

    Expects text to be Unicode (checked via ``verify_unicode``); the text
    returned after ``unidecode`` is converted back to Unicode.
    """
    checked = verify_unicode(text)
    ascii_like = unidecode(checked)
    return unicode(ascii_like)
def _ecpairing(data):
    """Run the pairing check over ``data`` and return the boolean result.

    ``data`` is consumed in consecutive 192-byte slices. Each slice is
    turned into a processing step by ``_process_point``, and the steps are
    folded over an accumulator that starts at ``bn128.FQ12.one()``. The
    check passes when the final exponentiation of the accumulator equals
    ``bn128.FQ12.one()``.
    """
    slice_starts = range(0, len(data), 192)
    # One processing step per 192-byte slice; for empty ``data`` there are
    # no steps and the accumulator stays at one.
    processing_pipeline = (
        _process_point(data[start_idx:start_idx + 192])
        for start_idx in slice_starts
    )

    # The earlier ``exponent = bn128.FQ12.one()`` pre-assignment was a dead
    # store (immediately overwritten here) and has been dropped.
    exponent = pipe(bn128.FQ12.one(), *processing_pipeline)

    result = bn128.final_exponentiate(exponent) == bn128.FQ12.one()
    return result
def find_background_illumination(fns, radius=None, input_bitdepth=None,
                                 quantile=0.5, stretch_quantile=0.):
    """Use a set of related images to find uneven background illumination.

    Parameters
    ----------
    fns : list of string
        A list of image file names
    radius : int, optional
        The radius of the structuring element used to find background.
        default: The width or height of the input images divided by 4,
        whichever is smaller.
    input_bitdepth : int, optional
        The bit-depth of the input images. Should be specified if
        non-standard bitdepth images are used in a 16-bit image file, e.g.
        12-bit images. Default is the dtype of the input image.
    quantile : float in [0, 1], optional
        The desired quantile to find background. default: 0.5 (median)
    stretch_quantile : float in [0, 1], optional
        Stretch image to full dtype limit, saturating above this quantile.

    Returns
    -------
    illum : np.ndarray, float, shape (M, N)
        The estimated illumination over the image field.

    See Also
    --------
    `correct_image_illumination`, `correct_multiimage_illumination`.
    """
    # this function follows the "PyToolz" streaming data model to
    # obtain the illumination estimate.
    # first, define the functions for each individual step:
    if input_bitdepth is None:
        in_range = 'image'
    else:
        in_range = (0, 2**input_bitdepth - 1)
    rescale = tz.curry(exposure.rescale_intensity)

    if stretch_quantile > 0:
        normalize = tz.partial(stretchlim, bottom=stretch_quantile)
    else:
        normalize = skimage.img_as_float

    def _load_scaled(fn):
        return tz.pipe(fn, io.imread, rescale(in_range=in_range), normalize)

    # produce a (lazy) stream of properly-scaled images
    ims = (_load_scaled(fn) for fn in fns)

    # take the mean of that stream
    mean_image = mean(ims)

    # return the median filter of that mean
    if not radius:
        radius = min(mean_image.shape) // 4
    # the mean image is stretched and converted with img_as_ubyte before
    # the rank filter is applied
    stretched = img_as_ubyte(stretchlim(mean_image))
    return imfilter.rank.median(stretched, selem=morphology.disk(radius))
def nlner_conll(doc):
    """Baseline NER tagger for Dutch, based on the CoNLL'02 dataset.

    See http://www.clips.uantwerpen.be/conll2002/ner/ for the dataset and
    its license.

    The fetched document is tokenized if needed and then handed to the
    ``ner`` function of the ``_nl_conll_ner`` module.

    See also
    --------
    frog: NER tagger and dependency parser for Dutch.

    stanford_ner_tag: NER tagger for English.
    """
    from ._nl_conll_ner import ner

    tokens = _tokenize_if_needed(fetch(doc))
    return ner(tokens)
def corenlp_lemmatize(doc, output="raw"):
    """Wrapper around the Stanford CoreNLP lemmatizer.

    CoreNLP is downloaded automatically.

    Parameters
    ----------
    output : string
        If 'raw', returns the raw output lines from CoreNLP.
        If 'saf', returns a SAF dictionary.
    """
    from ._corenlp import parse, stanford_to_saf

    # Final step that shapes the parse according to ``output``.
    format_result = _output_func(output, stanford_to_saf)
    return pipe(doc, fetch, parse, format_result)
def stem_snowball(doc, language):
    """Stem words in doc using the Snowball stemmer.

    Set the parameter ``language`` to a language code such as "de", "en",
    "nl", or the special string "porter" to get Porter's classic stemming
    algorithm for English.

    See also
    --------
    morphy: smarter approach to stemming (lemmatization), but only for English.
    """
    from Stemmer import Stemmer

    # Build the Stemmer before fetching to force an exception for invalid
    # languages.
    stem_words = Stemmer(language).stemWords

    tokens = _tokenize_if_needed(fetch(doc))
    return stem_words(tokens)
def serializable_unsigned_transaction_from_dict(web3, transaction_dict):
    '''
    Build a serializable transaction object from ``transaction_dict``.

    if web3 is None, fill out transaction as much as possible without calling client

    The result is a ``Transaction`` when the filled dict contains a 'v'
    key, and an ``UnsignedTransaction`` otherwise.
    '''
    steps = (
        dict,
        fill_transaction_defaults(web3),
        chain_id_to_v,
        apply_formatters_to_dict(TRANSACTION_FORMATTERS),
    )
    filled_transaction = pipe(transaction_dict, *steps)

    serializer = Transaction if 'v' in filled_transaction else UnsignedTransaction
    return serializer.from_dict(filled_transaction)
def corenlp(doc, output="raw"):
    """Wrapper around the Stanford CoreNLP parser.

    CoreNLP is downloaded automatically.

    If run with all annotators, it requires around 3G of memory,
    and it will keep the process in memory indefinitely.

    Parameters
    ----------
    output : string
        If 'raw', returns the raw output lines from CoreNLP.
        If 'saf', returns a SAF dictionary.
    """
    from ._corenlp import parse, stanford_to_saf

    # Final step that shapes the parse according to ``output``.
    format_result = _output_func(output, stanford_to_saf)
    return pipe(doc, fetch, parse, format_result)
def sentiwords_tag(doc, output="bag"):
    """Tag doc with SentiWords polarity priors.

    Performs left-to-right, longest-match annotation of token spans with
    polarities from SentiWords.

    Uses no part-of-speech information; when a span has multiple possible
    taggings in SentiWords, the mean is returned.

    Parameters
    ----------
    doc : document or list of strings

    output : string, optional
        Output format. Either "bag" for a histogram (dict) of annotated token
        span frequencies, or "tokens" a mixed list of strings and (list of
        strings, polarity) pairs.

    Raises
    ------
    ValueError
        If ``output`` is neither "bag" nor "tokens".

    See also
    --------
    movie_review_emotions: per-sentence fine-grained sentiment tagger

    movie_review_polarity: figure out if a movie review is positive or
                           negative
    """
    from ._sentiwords import tag

    tagged = pipe(doc, fetch, _tokenize_if_needed, tag)

    if output == "tokens":
        # Spans with zero polarity are emitted bare, the rest as pairs.
        return [ngram if polarity == 0 else (ngram, polarity)
                for ngram, polarity in tagged]

    if output != "bag":
        raise ValueError("unknown output format %r" % output)

    # ngram -> [polarity, occurrence count]; zero-polarity spans are skipped.
    counts = {}
    for ngram, polarity in tagged:
        if polarity != 0:
            entry = counts.setdefault(ngram, [polarity, 0])
            entry[1] += 1
    return counts
def frog(doc, output='raw'):
    """Wrapper around the Frog lemmatizer/POS tagger/NER/dependency parser.

    Expects Frog to be running in server mode, listening on
    ``localhost:${XTAS_FROG_PORT}`` or port 9987 if the environment variable
    ``XTAS_FROG_PORT`` is not set. It is *not* started for you.

    Currently, the module is only tested with all frog modules active except
    for the NER and parser.

    The following line starts Frog in the correct way:

    ``frog -S ${XTAS_FROG_PORT:-9887}``

    NOTE(review): the two default port numbers quoted above (9987 vs 9887)
    disagree -- confirm against the ``_frog`` module.

    Parameters
    ----------
    output : string
        If 'raw', returns the raw output lines from Frog itself.
        If 'tokens', returns dictionaries for the tokens.
        If 'saf', returns a SAF dictionary.

    Raises
    ------
    ValueError
        If ``output`` is not one of 'raw', 'tokens' or 'saf'.

    References
    ----------
    `Frog homepage <http://ilk.uvt.nl/frog/>`_

    See also
    --------
    nlner_conll: simple NER tagger for Dutch.
    """
    from ._frog import call_frog, parse_frog, frog_to_saf

    if output not in ('raw', 'tokens', 'saf'):
        raise ValueError("Unknown output: {output}, "
                         "please choose either raw, tokens, or saf"
                         .format(output=output))

    raw_lines = call_frog(fetch(doc))
    if output == 'raw':
        return list(raw_lines)

    # Both remaining formats are built from the parsed tokens.
    tokens = parse_frog(raw_lines)
    if output == 'tokens':
        return list(tokens)
    return frog_to_saf(tokens)
def movie_review_polarity(doc):
    """Movie review polarity classifier.

    Determines whether the film review ``doc`` is positive or negative. Might
    be applicable to other types of document as well, but uses a statistical
    model trained on a corpus of user reviews of movies, all in English.

    Returns
    -------
    p : float
        The probability that the movie review ``doc`` is positive.

    See also
    --------
    movie_review_emotions: per-sentence fine-grained sentiment tagger

    sentiwords_tag: more generic sentiment expression tagger
    """
    from ._polarity import classify

    text = fetch(doc)
    return classify(text)
def morphy(doc):
    """Lemmatize tokens using morphy, WordNet's lemmatizer.

    Finds the morphological root of all words in ``doc``, which is assumed to
    be written in English.

    Returns
    -------
    lemmas : list
        List of lemmas.

    See also
    --------
    stem_snowball: simpler approach to lemmatization (stemming).
    """
    # XXX Results will be better if we do POS tagging first, but then we
    # need to map Penn Treebank tags to WordNet tags.
    nltk_download("wordnet")
    tokens = pipe(doc, fetch, _tokenize_if_needed)

    lemmatize = nltk.WordNetLemmatizer().lemmatize
    # Build a real list: on Python 3 ``map`` returns a lazy iterator, which
    # contradicts the documented return type.
    return [lemmatize(token) for token in tokens]
def corenlp_lemmatize(doc, output='raw'):
    """Wrapper around the CoreNLP lemmatizer.

    Expects ``$CORENLP_HOME`` to point to the CoreNLP installation dir.

    Tested with `CoreNLP 2014-01-04
    <http://nlp.stanford.edu/software/stanford-corenlp-full-2014-01-04.zip>`_.

    Parameters
    ----------
    output : string
        If 'raw', returns the raw output lines from CoreNLP.
        If 'saf', returns a SAF dictionary.

    Raises
    ------
    ValueError
        If ``output`` is not one of the supported formats.
    """
    from ._corenlp import parse, stanford_to_saf

    # Post-processing applied to the parse, keyed by output format.
    transformers = {"raw": identity, "saf": stanford_to_saf}
    if output not in transformers:
        raise ValueError("Unknown output format %r" % output)

    return pipe(doc, fetch, parse, transformers[output])
def load(self, line, word_classes = False):
    """Clean a raw text line, tag it and return the result as a numpy array.

    With ``word_classes`` false the cleaned line is tagged through
    ``self.r.tagRawSentenceHash``; otherwise through
    ``self.r.tagRawSentenceGenSim``.
    """
    # Tokenize zho
    if self.language == "zho" and self.zho_split == True:
        pieces = self.tk.cut(line, cut_all = True, HMM = True)
        line = " ".join([piece for piece in pieces if piece != ""])

    # Remove links, at-mentions, hashtags and mark-up (in that order)
    for pattern in (r"http\S+", r"@\S+", r"#\S+", "<[^>]*>"):
        line = re.sub(pattern, "", line)

    # Remove "RT" markers
    line = line.replace(" RT", "").replace("RT ", "")

    # Remove emojis
    line = re.sub(self.myre, "", line)

    # Remove punctuation and extra spaces
    cleaners = (
        preprocessing.strip_tags,
        preprocessing.strip_punctuation,
        preprocessing.split_alphanum,
        preprocessing.strip_non_alphanum,
        preprocessing.strip_multiple_whitespaces,
    )
    line = ct.pipe(line, *cleaners)

    # Lower-case and strip surrounding whitespace
    line = line.lower().strip().lstrip()

    if word_classes == False:
        # Array of tuples (LEX, POS, CAT)
        tagged = self.r.tagRawSentenceHash(rawLine = line, DICT = self.DICT, word_dict = self.domain_dict)
    else:
        # For training word embeddings, just return the list
        tagged = self.r.tagRawSentenceGenSim(rawLine = line, DICT = self.DICT)

    return np.array(tagged)
def find_matching_event_abi(abi, event_name=None, argument_names=None):
    """Return the single event ABI entry matching the name and argument names.

    The ABI is restricted to entries of type 'event', then narrowed by
    ``event_name`` and ``argument_names`` when those are given.

    Raises ``ValueError`` when zero or more than one candidate remains.
    """
    filters = [functools.partial(filter_by_type, 'event')]

    if event_name is not None:
        filters.append(functools.partial(filter_by_name, event_name))

    if argument_names is not None:
        filters.append(
            functools.partial(filter_by_argument_name, argument_names)
        )

    candidates = pipe(abi, *filters)

    if len(candidates) == 1:
        return candidates[0]
    if not candidates:
        raise ValueError("No matching events found")
    raise ValueError("Multiple events found")
def movie_review_emotions(doc):
    """Emotion (fine-grained sentiment) tagger for movie reviews.

    The training data for this function is that of Buitinck et al., with the
    training and test data concatenated. The algorithm is SVMs in a binary
    relevance (one-vs-rest) combination.

    Returns
    -------
    tagged : list of (string, list of string)
        A list of (sentence, labels) pairs. Each sentence may have zero or
        more labels.

    References
    ----------
    L. Buitinck, J. van Amerongen, E. Tan and M. de Rijke (2015).
    Multi-emotion detection in user-generated reviews. Proc. ECIR.
    https://staff.fnwi.uva.nl/m.derijke/wp-content/papercite-data/pdf/buitinck-multi-emotion-2015.pdf
    """
    from ._emotion import classify

    nltk_download('punkt')
    text = fetch(doc)
    sentences = nltk.sent_tokenize(text)
    labels = classify(sentences)
    return list(zip(sentences, labels))