def estimate(self):
    file_hash = self.request.get('hash', None)
    paths = JobPaths(DST_PATH, file_hash)

    # extract metadata
    metadata = _delayed_read_metadata(paths.metadata)
    if not metadata:
        return json.dumps(dict(target=0, cur=0, proc=0))

    # build estimate
    _joiner = partial(os.path.join, SRC_PATH)
    # materialize the paths: they are consumed twice below (sum() and
    # _is_zip_outdated), and filter() is a one-shot iterator on Python 3
    src_paths = list(filter(os.path.isfile, map(_joiner, metadata.filepaths)))
    target = sum(map(os.path.getsize, src_paths))

    # Check if the file is outdated, report 0 in that case.
    # The worker will redo this check and rebuild the zip.
    # This avoids us serving an outdated zip, before the job has
    # a chance to rebuild it.
    size = 0
    if paths.has_zip():
        outdated = _is_zip_outdated(paths.zip, src_paths)
        size = 0 if outdated else os.path.getsize(paths.zip)

    result = dict(
        target=target,
        cur=size,
        proc=100 if paths.has_done() else (
            size * 100 / target if target else 0)
    )
    return json.dumps(result)
def tph_gf():
    t = map(lambda n: n * (n + 1) // 2, count(1))
    p = map(lambda n: n * (3 * n - 1) // 2, count(1))
    h = map(lambda n: n * (2 * n - 1), count(1))
    filter_tph = filter_equal(filter_equal(t, p), h)
    while True:
        yield next(filter_tph)
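# Usage sketch (not from the original source). `filter_equal` is not shown in
# the snippet above; a plausible implementation, assumed here, intersects two
# strictly increasing iterators, so tph_gf() yields numbers that are
# simultaneously triangular, pentagonal and hexagonal.
from itertools import count, islice

def filter_equal(a, b):
    # assumed helper: yield values present in both strictly increasing iterators
    x, y = next(a), next(b)
    while True:
        if x == y:
            yield x
            x, y = next(a), next(b)
        elif x < y:
            x = next(a)
        else:
            y = next(b)

# The first two triangular-pentagonal-hexagonal numbers:
print(list(islice(tph_gf(), 2)))  # [1, 40755]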
def fetch_generator(tabix, contig):
    fetch = tabix.fetch(contig)
    rows = map(lambda x: x.split('\t'), fetch)
    annos = (row for row in rows if "CodingTranscript" in row[9])
    json_rows = map(_map_line_to_json, annos)
    json_rows = (row for row in json_rows if row)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "cadd") for rg in row_groups)
def __iter__(self):
    lists = PodcastList.by_rating(endkey=self.min_list_rating)
    lists = islice(lists, 0, self.num_lists)
    lists = map(self._prepare_list, lists)

    categories = Category.top_categories(self.num_categories)
    categories = map(self._prepare_category, categories)

    return chain(lists, categories)
def bencode(value):
    if type(value) is tuple:
        value = list(value)
    switch = {
        # Flatten the list of pairs before bencoding each one. BT spec says sort them.
        dict: (b'd%se', lambda x: b''.join(
            map(bencode, chain.from_iterable(sorted(x.items()))))),
        list: (b'l%se', lambda x: b''.join(map(bencode, x))),
        int: (b'i%de', lambda x: x),
    }.get(type(value), (b'%d:%s', lambda x: (lambda y: (len(y), y))(str(x))))
    return switch[0] % switch[1](value)
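# Usage sketch (not from the original source). The fallback branch formats str
# objects with b'%d:%s', which relies on Python 2-style string semantics; under
# that assumption the expected encodings would be:
#
#   bencode({'spam': 1})   ->  b'd4:spami1ee'
#   bencode(['a', 'b'])    ->  b'l1:a1:be'
#   bencode(42)            ->  b'i42e'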
def load_data(input_file):
    open_file = open('%s.tsv' % input_file)
    open_file = csv.reader(open_file, delimiter="\t")
    next(open_file)  # skip the header row
    grasp = map(row_generator, open_file)
    grasp = ifilter(lambda row: row[58] != "", grasp)
    json_rows = map(_map_line_to_json, grasp)
    json_rows = (row for row in json_rows if row)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "grasp") for rg in row_groups)
def unique_justseen(iterable, key=None):
    """
    List unique elements, preserving order. Remember only the element just seen.

    >>> ''.join(unique_justseen('AAAABBBCCDAABBB'))
    'ABCDAB'
    >>> ''.join(unique_justseen('ABBCcAD', str.lower))
    'ABCAD'
    """
    return map(next, map(itemgetter(1), groupby(iterable, key)))
def remove_specs(self, *specs, **kwargs):
    assert all((s.concrete for s in specs))

    with_dependents = kwargs.get("with_dependents", True)
    with_dependencies = kwargs.get("with_dependencies", False)

    specs = set(specs)

    if with_dependencies:
        specs = get_dependencies(specs)

    if kwargs.get("exclude", None):
        specs = set(filter_exclude(specs, kwargs["exclude"]))

    all_specs = set(self.get_all_specs())

    to_deactivate = specs
    to_keep = all_specs - to_deactivate

    dependents = find_dependents(to_keep, to_deactivate)

    if with_dependents:
        # remove all packages depending on the ones to remove
        if len(dependents) > 0:
            tty.warn(self._croot +
                     "The following dependents will be removed: %s"
                     % ", ".join((s.name for s in dependents)))
            to_deactivate.update(dependents)
    elif len(dependents) > 0:
        tty.warn(self._croot +
                 "The following packages will be unusable: %s"
                 % ", ".join((s.name for s in dependents)))

    extensions = set(filter(lambda s: s.package.is_extension, to_deactivate))
    standalones = to_deactivate - extensions

    # Please note that a traversal of the DAG in post-order and then
    # forcibly removing each package should remove the need to specify
    # with_dependents for deactivating extensions/allow removal without
    # additional checks (force=True). If removal performance becomes
    # unbearable for whatever reason, this should be the first point of
    # attack.
    #
    # see: https://github.com/spack/spack/pull/3227#discussion_r117147475
    remove_extension = ft.partial(self.remove_extension,
                                  with_dependents=with_dependents)

    set(map(remove_extension, extensions))
    set(map(self.remove_standalone, standalones))

    self.purge_empty_directories()
def pgf_klinearize(args):
    grammar = pgf.readPGF(args.pgfgrammar)
    #if sys.version_info < (3, 0):
    #    args.inputstream = codecs.getreader('utf-8')(args.inputstream)
    inputSet = [(sentid, parsesBlock)
                for sentid, parsesBlock in readJohnsonRerankerTrees(args.inputstream)]
    outputPrinter = printMosesNbestFormat
    sentIdsList = map(itemgetter(0), inputSet)
    parsesBlocks = map(itemgetter(1), inputSet)

    for transBlock in getKLinearizations(grammar, args.tgtlang, parsesBlocks, args.K):
        strTrans = str(outputPrinter(transBlock, sentIdsList))
        if strTrans:
            print(strTrans, file=args.outputstream)
    return
def _gen_arch_segment(self, xmlgen, segment):
    """Generate a <segment> tag for the given ``segment``."""
    with xmlgen.element("segment", {
            "name": segment.name,
            "length": str(segment.length),
            "type": "unidir",
            "freq": str(segment.freq),
            "Rmetal": str(segment.Rmetal),
            "Cmetal": str(segment.Cmetal),
    }):
        xmlgen.element_leaf(
            "sb", {"type": "pattern"},
            " ".join(map(lambda x: "1" if x else "0",
                         segment.sb or ((True, ) * (segment.length + 1)))))
        xmlgen.element_leaf(
            "cb", {"type": "pattern"},
            " ".join(map(lambda x: "1" if x else "0",
                         segment.cb or ((True, ) * segment.length))))
        xmlgen.element_leaf("mux", {"name": segment.mux})
def update_contacts(contacts):
    contacts = map(_transform_contact_data, contacts)

    # Filter contact data using whitelist
    if settings.EMARSYS_RECIPIENT_WHITELIST is not None:
        contacts = filter(lambda contact: contact[3]  # 3=email
                          in settings.EMARSYS_RECIPIENT_WHITELIST,
                          contacts)

    contacts = list(contacts)

    assert len(contacts) <= BATCH_SIZE

    if not contacts:
        return 0, [], []

    num_successful, errors = _update_contacts(contacts)

    missing_contacts = [email for email, error_dict in errors.items()
                        if '2008' in error_dict]
    failed_contacts = [(email, error_dict)
                       for email, error_dict in errors.items()
                       if '2008' not in error_dict]

    return num_successful, missing_contacts, failed_contacts
def build_order_from_list(table, order_list):
    def get_column(key, direction):
        if direction is not None and direction not in ('desc', 'asc'):
            raise ValueError("Order direction must be 'desc' or 'asc'")
        if direction == 'desc':
            return getattr(table.columns, key).desc()
        else:
            return getattr(table.columns, key)

    def interpret_column(column):
        # inspect each individual order statement, not the whole order_list
        if isinstance(column, tuple):
            return get_column(column[1], column[0])
        if isinstance(column, str) or isinstance(column, unicode):
            return get_column(column, 'asc')
        else:
            raise ValueError('Can not interpret order statement. '
                             'Use list of strings or tuples.')

    if isinstance(order_list, list):
        return list(map(interpret_column, order_list))
    else:
        return [interpret_column(order_list)]
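# Usage sketch (not from the original source), assuming a SQLAlchemy Table
# named `users` with `name` and `age` columns; strings sort ascending, tuples
# are given as (direction, column_name):
#
#   order = build_order_from_list(users, ['name', ('desc', 'age')])
#   query = select([users]).order_by(*order)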
def _load_stream_without_unbatching(self, stream):
    """
    Return an iterator of deserialized batches (iterable) of objects from the
    input stream. If the serializer does not operate on batches the default
    implementation returns an iterator of single element lists.
    """
    return map(lambda x: [x], self.load_stream(stream))
def print_status(self, *specs, **kwargs):
    if kwargs.get("with_dependencies", False):
        specs = set(get_dependencies(specs))

    specs = sorted(specs, key=lambda s: s.name)
    in_view = list(map(self.get_spec, specs))

    for s, v in zip(specs, in_view):
        if not v:
            tty.error(self._croot + 'Package not linked: %s' % s.name)
        elif s != v:
            self.print_conflict(v, s, level="warn")

    in_view = list(filter(None, in_view))

    if len(specs) > 0:
        tty.msg("Packages linked in %s:" % self._croot[:-1])

        # avoid circular dependency
        import spack.cmd
        spack.cmd.display_specs(in_view, flags=True, variants=True,
                                long=self.verbose)
    else:
        tty.warn(self._croot + "No packages found.")
def main():
    global search_for
    if len(sys.argv) > 1:
        search_for = sys.argv[1]
    search_for = re.compile(r'\b%s\b' % re.escape(search_for), re.I)

    auth = tweepy.OAuthHandler(api_key, api_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)

    # friends = list(tweepy.Cursor(api.friends_ids).items())
    with open(follow_input_file) as f:
        friends = list(map(str.strip, f))
    random.shuffle(friends)

    total = 0
    for user in friends:
        # crs = tweepy.Cursor(api.user_timeline, user_id=user, count=200, trim_user="******", include_rts="false")
        crs = tweepy.Cursor(api.user_timeline, screen_name=user, count=200,
                            trim_user="******", include_rts="false")
        try:
            tweets = [tweet for tweet in crs.items(limit_last)
                      if search_for.search(tweet.text) and not tweet.retweeted]
            if tweets:
                tweet = random.choice(tweets)
                tweet.retweet()
                # print('User id %d: retweeted' % user)
                print('%s: %s' % (user, tweet.text.encode('ascii', 'replace')))
                total += 1
                if total >= limit_total:
                    break
            else:
                # print('User id %d: nothing found' % user)
                print('%s: nothing found' % user)
        except tweepy.error.TweepError as e:
            # print('Error, user id %d: %s' % (user, e))
            print('Error for user %s: %s' % (user, e))
def get_queryset(self, query_params=None, *args, **kwargs):
    if query_params is None:
        query_params = self.request.QUERY_PARAMS

    location = parse_location(self.kwargs.get('location_slug', None))
    time_range = parse_time_range(
        query_params.get('start', None),
        query_params.get('end', None)
    )

    try:
        min_tide_level = float(self.request.QUERY_PARAMS['tide_level'])
    except KeyError:
        raise MissingParameterException(
            'Missing required query parameter `tide_level`')

    extended_time_range = TimeRange(
        start=time_range.start - ONE_DAY,
        end=time_range.end + ONE_DAY)

    predictions = get_queryset(location, extended_time_range).filter(
        tide_level__gte=min_tide_level)

    return filter(None, map(
        partial(transform_time_window, time_range, extended_time_range),
        make_tide_time_windows(predictions)))
def parmap_dict(f, problems, leavefree=1, debug=False, verbose=False):
    global mypool
    problems = list(problems)
    njobs = len(problems)
    if njobs == 0:
        if verbose:
            print('NOTHING TO DO?')
        return []
    if not debug and (not 'mypool' in globals() or mypool is None):
        if verbose:
            print('NO POOL FOUND. RESTARTING.')
        mypool = Pool(cpu_count() - leavefree)
    enumerator = map(f, problems) if debug else mypool.imap(f, problems)
    results = {}
    sys.stdout.write('\n')
    for key, result in enumerator:
        if isinstance(result, tuple) and len(result) == 1:
            result = result[0]
        results[key] = result
        if verbose and type(result) is RuntimeError:
            # report the key of the failed problem (the enumerator yields
            # (key, result) pairs here, there is no positional index)
            print('ERROR PROCESSING', key)
    sys.stdout.write('\r \r')
    results = {key: results[key] for key in problems
               if key in results and results[key] is not None}
    return results
def parmap(f, problems, leavefree=1, debug=False, verbose=False):
    global mypool
    problems = list(problems)
    njobs = len(problems)
    if njobs == 0:
        if verbose:
            print('NOTHING TO DO?')
        return []
    if not debug and (not 'mypool' in globals() or mypool is None):
        if verbose:
            print('NO POOL FOUND. RESTARTING.')
        mypool = Pool(cpu_count() - leavefree)
    enumerator = map(f, problems) if debug else mypool.imap(f, problems)
    results = {}
    sys.stdout.write('\n')
    for i, result in enumerator:
        sys.stdout.write('\rdone %0.1f%% ' % ((i + 1) * 100. / njobs))
        sys.stdout.flush()
        if isinstance(result, tuple) and len(result) == 1:
            result = result[0]
        results[i] = result
        if verbose and type(result) is RuntimeError:
            print('ERROR PROCESSING', problems[i])
    sys.stdout.write('\r \r')
    return [results[i] if i in results else None
            for i, k in enumerate(problems)]
def get_api_docs(routes):
    """
    Generates GitHub Markdown formatted API documentation using
    provided schemas in RequestHandler methods and their docstrings.

    :type  routes: [(url, RequestHandler), ...]
    :param routes: List of routes (this is ideally all possible routes of the
        app)
    :rtype: str
    :returns: generated GFM-formatted documentation
    """
    routes = map(_get_tuple_from_route, routes)

    documentation = []
    for url, rh in sorted(routes, key=lambda a: a[0]):
        if issubclass(rh, APIHandler):
            documentation.append(_get_route_doc(url, rh))

    documentation = (
        "**This documentation is automatically generated.**\n\n" +
        "**Output schemas only represent `data` and not the full output; " +
        "see output examples and the JSend specification.**\n" +
        "\n<br>\n<br>\n".join(documentation)
    )
    return documentation
def __call__(self, *args, **kwargs):
    argvalues = [arg.value if isinstance(arg, DiffObject) else arg
                 for arg in args]
    kwargvalues = kwargs
    # TODO: for now can not diff wrt kwargs
    #? should I check if all derivatives are provided?
    #? provide option for numerically computed derivative if not defined?
    f = self.fun(*argvalues, **kwargvalues)

    if not any([isinstance(arg, DiffObject) for arg in args]):
        return f

    if self.dfun:
        # compute df_args
        df = [self.dfun[i](*argvalues, **kwargvalues)
              if isinstance(arg, DiffObject) else None
              for i, arg in enumerate(args)]
    else:
        # if self.dfun is empty assume fun returns a tuple of nominal
        # value and derivative list
        f, df = f

    # try to make DiffObject
    if type(f) in DiffObject._types:
        dlist = [arg.chain(dfi) for arg, dfi in zip(args, df)
                 if isinstance(arg, DiffObject)]
        d = sum_dicts(*dlist)
        return DiffObject(f, d)
    elif isinstance(f, Iterable):
        dlist = [[arg.chain(dfij) for dfij in dfi]
                 for arg, dfi in zip(args, df) if isinstance(arg, DiffObject)]
        d = [sum_dicts(*d) for d in zip(*dlist)]
        return type(f)(map(DiffObject, f, d))
    raise TypeError('DiffFunction output not implemented as a DiffObject')
def get_all_specs(self):
    dotspack = os.path.join(self.root,
                            spack.store.layout.metadata_dir)
    if os.path.exists(dotspack):
        return list(filter(None, map(self.get_spec, os.listdir(dotspack))))
    else:
        return []
def _sift_structure(func, structure, transform):
    prev_section, relative_path = None, None
    for section, hosts in structure:
        if not (section and hosts):
            continue
        if prev_section:
            common_path = tuple(_common_path(section, prev_section))
            relative_path = section[len(common_path):]
            if len(common_path) == 0:
                for s in prev_section[:-1]:
                    print(_close(s))
        else:
            relative_path = section
        for s in relative_path:
            print(_open(s))
        if func(section):
            hosts = map(transform, hosts)
        for h in hosts:
            print(h)
        print(_close(section[-1]))
        prev_section = section
    if prev_section:
        for s in reversed(prev_section[1:]):
            print(_close(s))
def internal_reader(self, input_stream):
    """
    Reader which uses python eval on each part of a tab separated string.
    Yields a tuple of python objects.
    """
    for input_line in input_stream:
        yield list(map(self.deserialize, input_line.split("\t")))
def __init__(self, lens_list):
    """Initialise the C array from lens_list.

    The parameter might be another Lenses instance, a NumPy array or a
    Python sequence of sequences.  If lens_list is a NumPy array, it must
    be C contiguous.  Keep a reference to the data to make sure it is not
    garbage collected.

    If lens_list is a Python sequence, each element shall be a sequence of
    three floats, containing the coordinates and mass of the respective
    lens.  For example

        Lenses([(0., 0., 1.), (1.2, 0., .0004)])

    will create an array of the two given lenses.
    """
    if isinstance(lens_list, Lenses):
        _c.Structure.__init__(self, lens_list.num_lenses, lens_list.lens)
    elif isinstance(lens_list, _np.ndarray):
        _c.Structure.__init__(self, len(lens_list),
                              lens_list.ctypes.data_as(_c.POINTER(Lens)))
    else:
        lens_list = list(map(tuple, lens_list))
        n = len(lens_list)
        _c.Structure.__init__(self, n, (Lens*n)(*lens_list))
def _createFromLocal(self, data, schema):
    """
    Create an RDD for DataFrame from a list or pandas.DataFrame, returns
    the RDD and schema.
    """
    # make sure data can be consumed multiple times
    if not isinstance(data, list):
        data = list(data)

    if schema is None or isinstance(schema, (list, tuple)):
        struct = self._inferSchemaFromList(data, names=schema)
        converter = _create_converter(struct)
        data = map(converter, data)
        if isinstance(schema, (list, tuple)):
            for i, name in enumerate(schema):
                struct.fields[i].name = name
                struct.names[i] = name
        schema = struct

    elif not isinstance(schema, StructType):
        raise TypeError("schema should be StructType or list or None, but got: %s" % schema)

    # convert python objects to sql data
    data = [schema.toInternal(row) for row in data]
    return self._sc.parallelize(data), schema
def podcast_lists(request, page_size=20):
    # Make sure page request is an int. If not, deliver first page.
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        page = 1

    lists = podcastlists_by_rating(skip=(page-1) * page_size, limit=page_size)

    def _prepare_list(l):
        user = get_user_by_id(l.user)
        l = proxy_object(l)
        l.username = user.username if user else ''
        return l

    lists = map(_prepare_list, lists)

    num_pages = int(ceil(podcastlist_count() / float(page_size)))
    page_list = get_page_list(1, num_pages, page, 15)

    return render(request, 'podcast_lists.html', {
        'lists': lists,
        'page_list': page_list,
    })
def _generate_plaintext_signature(client_shared_secret,
                                  token_shared_secret=None,
                                  _percent_encode=True):
    """
    Calculates the PLAINTEXT signature.

    :param client_shared_secret:
        Client (consumer) shared secret.
    :param token_shared_secret:
        Token/temporary credentials shared secret if available.
    :param _percent_encode:
        (DEBUG) Must be ``True`` to be compatible with OAuth 1.0 RFC5849 &
        OAuth 1.0a; We have added this parameter to enable better debugging
        by the signature verification routines. If this is set to ``False``,
        the signature elements will not be percent-encoded before the
        plaintext signature is generated.
    :returns:
        PLAINTEXT signature.
    """
    client_shared_secret = client_shared_secret or SYMBOL_EMPTY_BYTES
    token_shared_secret = token_shared_secret or SYMBOL_EMPTY_BYTES
    if _percent_encode:
        return SYMBOL_AMPERSAND.join(map(percent_encode,
                                         (client_shared_secret,
                                          token_shared_secret)))
    else:
        # User clients can forget to do this and this has been fixed
        # by OAuth 1.0a, so we use this piece of code to detect whether
        # the user's OAuth client library complies with the specification
        # when in debugging mode.
        return SYMBOL_AMPERSAND.join((client_shared_secret,
                                      token_shared_secret))
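# Usage sketch (not from the original source), assuming SYMBOL_AMPERSAND is
# b"&", SYMBOL_EMPTY_BYTES is b"" and percent_encode follows RFC 3986:
#
#   _generate_plaintext_signature(b"abcd", b"wxyz")  ->  b"abcd&wxyz"
#   _generate_plaintext_signature(b"ab cd")          ->  b"ab%20cd&"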
def episode_toplist(request, num=100):
    lang = process_lang_params(request)

    toplist = EpisodeToplist(language=lang)
    entries = list(map(proxy_object, toplist[:num]))

    # load podcast objects
    podcast_ids = [e.podcast for e in entries]
    podcasts = podcasts_to_dict(podcast_ids, True)
    for entry in entries:
        entry.podcast = podcasts.get(entry.podcast, None)

    current_site = RequestSite(request)

    # Determine maximum listener amount (or 0 if no entries exist)
    max_listeners = max([0] + [e.listeners for e in entries])

    languages = get_podcast_languages()
    all_langs = get_language_names(languages)

    return render(request, 'episode_toplist.html', {
        'entries': entries,
        'max_listeners': max_listeners,
        'url': current_site,
        'language': lang,
        'all_languages': all_langs,
    })
def roundrobin(*iterables):
    """roundrobin('ABC', 'D', 'EF') --> A D E B F C

    Recipe originally credited to George Sakkis.
    Reimplemented to work both in Python 2+ and 3+.

    http://docs.python.org/3.4/library/itertools.html#itertools-recipes
    """
    pending = len(iterables)
    next_attr = "next" if version_info[0] == 2 else "__next__"
    nexts = cycle(map(attrgetter(next_attr), map(iter, iterables)))
    while pending:
        try:
            for n in nexts:
                yield n()
        except StopIteration:
            pending -= 1
            nexts = cycle(islice(nexts, pending))
def disable(filename, sections=None):
    '''Disable the given paths in the specified hosts file.'''
    structure = _read_structure_lazy(filename, include_hosts=True)
    sections = list(map(_split, sections))
    sift = lambda h: _path_in_list(h, sections)
    _sift_structure(sift, structure, _add_comment)
def node_list():
    """Display a list of server nodes."""
    click.echo(
        tabulate(map(lambda s: s.as_dict(), list_nodes()), headers="keys"))
def _dashboard_from_dict(cls, data):
    params = {x: data.get(x, None) for x in cls._fields}
    # Parse the blocks if they're provided and generate block instances.
    params['blocks'] = tuple(map(Block.from_dict, data.get('blocks', [])))
    return cls(**params)
from itertools import imap as map, ifilter as filter
from os import path, listdir
from functools import partial
from ast import parse
from distutils.sysconfig import get_python_lib

if __name__ == "__main__":
    package_name = "offregister_python_venv"

    with open(path.join(package_name, "__init__.py")) as f:
        __author__, __version__ = map(
            lambda buf: next(map(lambda e: e.value.s, parse(buf).body)),
            filter(
                lambda line: line.startswith("__version__")
                or line.startswith("__author__"),
                f,
            ),
        )

    to_funcs = lambda *paths: (
        partial(path.join, path.dirname(__file__), package_name, *paths),
        partial(path.join, get_python_lib(prefix=""), package_name, *paths),
    )

    _data_join, _data_install_dir = to_funcs("_data")

    setup(
        name=package_name,
        author=__author__,
        version=__version__,
def filtered_jsonfinder(string, json_only=False):
    predicate = lambda start, end, json: (all(
        map(string[start:end].__contains__, filters)
    ) and check_min_elements(json, options.min_size))
    return jsonfinder(string, json_only=json_only, predicate=predicate)
def get_dependencies(specs):
    "Get set of dependencies (includes specs)"
    retval = set()
    set(map(retval.update, (set(s.traverse()) for s in specs)))
    return retval
def func(iterator):
    return map(f, iterator)
def _updateIndex(self): """Update the item list""" text_getter = itemgetter(0) path_getter = itemgetter(1) def _replace_htmltags(s): def opentag(m): return ''.join(('<span class="', m.group(1), '">')) s = MATCH_CLOSE_TAG.sub('</span>', s) s = MATCH_OPEN_TAG.sub(opentag, s) return ''.join(('<body>', s, '</body>')) lw = self._ui.listWidgetIndex incr_res = self._incr_results full_res = self._fts_results query = self._ui.lineEditSearch.text().strip() if incr_res is not None and full_res is not None\ and len(incr_res) == 0 and len(full_res) == 0\ and len(query.split()) == 1: self._timerSpellCorrection.start(200) # Escape the previous selection row_prev = lw.currentRow() selected_prev = None if row_prev != -1: selected_prev = self._found_items[row_prev] # Update Index if incr_res and full_res: closed = set(map(path_getter, incr_res)) self._found_items = incr_res + tuple( item for item in full_res if path_getter(item) not in closed) elif incr_res: self._found_items = tuple(incr_res) elif full_res: self._found_items = tuple(full_res) else: self._found_items = tuple() del incr_res del full_res # Create a new list items = tuple( _replace_htmltags(text_getter(item)) for item in self._found_items) lw.clear() lw.addItems(items) # Restore the previous selection if selected_prev: comparer = itemgetter(2, 3, 1) # (sortkey, prio, path) current = comparer(selected_prev) for row in range(len(self._found_items)): if comparer(self._found_items[row]) == current: lw.setCurrentRow(row) break url = self._ui.webView.url().toString() sel_row = -1 for (row, path) in enumerate(map(path_getter, self._found_items)): if 'dict:' + path == url: sel_row = row break if sel_row >= 0: lw.setCurrentRow(sel_row) lw.scrollToItem(lw.item(sel_row), QAbstractItemView.EnsureVisible) else: lw.scrollToTop() if self._selection_pending: self._selection_pending = False self.selectItemRelative() if self._loading_pending: self._loading_pending = False self._loadItem()
def process_special_casing(special_casing, table, index): # Unconditional special casing. unconditional_tolower = {} unconditional_toupper = {} # Conditional special casing, language independent. conditional_tolower = {} conditional_toupper = {} # Conditional special casing, language dependent. lang_conditional_tolower = {} lang_conditional_toupper = {} def caseInfo(code): (upper, lower, flags) = table[index[code]] return ((code + lower) & 0xffff, (code + upper) & 0xffff) for (code, lower, upper, languages, contexts) in read_special_casing(special_casing): assert code <= MAX_BMP, 'Unexpected character outside of BMP: %s' % code assert len(languages ) <= 1, 'Expected zero or one language ids: %s' % languages assert len( contexts ) <= 1, 'Expected zero or one casing contexts: %s' % languages (default_lower, default_upper) = caseInfo(code) special_lower = len(lower) != 1 or lower[0] != default_lower special_upper = len(upper) != 1 or upper[0] != default_upper # Invariant: If |code| has casing per UnicodeData.txt, then it also has # casing rules in SpecialCasing.txt. assert code == default_lower or len(lower) != 1 or code != lower[0] assert code == default_upper or len(upper) != 1 or code != upper[0] language = languages[0] if languages else None context = contexts[0] if contexts else None if not language and not context: if special_lower: unconditional_tolower[code] = lower if special_upper: unconditional_toupper[code] = upper elif not language and context: if special_lower: conditional_tolower[code] = (lower, context) if special_upper: conditional_toupper[code] = (upper, context) else: if language not in lang_conditional_tolower: lang_conditional_tolower[language] = {} lang_conditional_toupper[language] = {} if special_lower: lang_conditional_tolower[language][code] = (lower, context) if special_upper: lang_conditional_toupper[language][code] = (upper, context) # Certain special casing rules are inlined in jsstr.cpp, ensure these cases # still match the current SpecialCasing.txt file. def lowerCase(code): (lower, _) = caseInfo(code) return lower def upperCase(code): (_, upper) = caseInfo(code) return upper def ascii(char_dict): return (ch for ch in char_dict.keys() if ch <= 0x7f) def latin1(char_dict): return (ch for ch in char_dict.keys() if ch <= 0xff) def is_empty(iterable): return not any(True for _ in iterable) def is_equals(iter1, iter2): return all(x == y for (x, y) in zip_longest(iter1, iter2)) # Ensure no ASCII characters have special case mappings. assert is_empty(ascii(unconditional_tolower)) assert is_empty(ascii(unconditional_toupper)) assert is_empty(ascii(conditional_tolower)) assert is_empty(ascii(conditional_toupper)) # Ensure no Latin1 characters have special lower case mappings. assert is_empty(latin1(unconditional_tolower)) assert is_empty(latin1(conditional_tolower)) # Ensure no Latin1 characters have conditional special upper case mappings. assert is_empty(latin1(conditional_toupper)) # Ensure U+00DF is the only Latin1 character with a special upper case mapping. assert is_equals([0x00DF], latin1(unconditional_toupper)) # Ensure U+0130 is the only character with a special lower case mapping. assert is_equals([0x0130], unconditional_tolower) # Ensure no characters have language independent conditional upper case mappings. assert is_empty(conditional_toupper) # Ensure U+03A3 is the only character with language independent conditional lower case mapping. assert is_equals([0x03A3], conditional_tolower) # Verify U+0130 and U+03A3 have simple lower case mappings. 
assert all(ch != lowerCase(ch) for ch in [0x0130, 0x03A3]) # Ensure Azeri, Lithuanian, and Turkish are the only languages with conditional case mappings. assert is_equals(["az", "lt", "tr"], sorted(lang_conditional_tolower.keys())) assert is_equals(["az", "lt", "tr"], sorted(lang_conditional_toupper.keys())) # Maximum case mapping length is three characters. assert max( map( len, chain( unconditional_tolower.values(), unconditional_toupper.values(), map(itemgetter(0), conditional_tolower.values()), map(itemgetter(0), conditional_toupper.values()), map( itemgetter(0), chain.from_iterable( d.values() for d in lang_conditional_tolower.values())), map( itemgetter(0), chain.from_iterable( d.values() for d in lang_conditional_toupper.values())), ))) <= 3 # Ensure all case mapping contexts are known (see Unicode 9.0, §3.13 Default Case Algorithms). assert set([ 'After_I', 'After_Soft_Dotted', 'Final_Sigma', 'More_Above', 'Not_Before_Dot', ]).issuperset( set( filter( partial(is_not, None), chain( map(itemgetter(1), conditional_tolower.values()), map(itemgetter(1), conditional_toupper.values()), map( itemgetter(1), chain.from_iterable( d.values() for d in lang_conditional_tolower.values())), map( itemgetter(1), chain.from_iterable( d.values() for d in lang_conditional_toupper.values())), )))) # Special casing for U+00DF (LATIN SMALL LETTER SHARP S). assert upperCase(0x00DF) == 0x00DF and unconditional_toupper[0x00DF] == [ 0x0053, 0x0053 ] # Special casing for U+0130 (LATIN CAPITAL LETTER I WITH DOT ABOVE). assert unconditional_tolower[0x0130] == [0x0069, 0x0307] # Special casing for U+03A3 (GREEK CAPITAL LETTER SIGMA). assert lowerCase(0x03A3) == 0x03C3 and conditional_tolower[0x03A3] == ([ 0x03C2 ], 'Final_Sigma') return (unconditional_tolower, unconditional_toupper)
def applySchema(it):
    cls = _create_cls(schema)
    return map(cls, it)
def _decode_map(self, smap): sources = smap['sources'] sourceRoot = smap.get('sourceRoot') names = list(map(text_type, smap['names'])) mappings = smap['mappings'] lines = mappings.split(';') if sourceRoot is not None: sources = list(map(partial(os.path.join, sourceRoot), sources)) # List of all tokens tokens = [] # line_index is used to identify the closest column when looking up a token line_index = [] # Main index of all tokens # The index is keyed on (line, column) index = {} dst_col, src_id, src_line, src_col, name_id = 0, 0, 0, 0, 0 for dst_line, line in enumerate(lines): # Create list for columns in index line_index.append([]) segments = line.split(',') dst_col = 0 for segment in segments: if not segment: continue parse = self.parse_vlq(segment) dst_col += parse[0] src = None name = None if len(parse) > 1: try: src_id += parse[1] if not 0 <= src_id < len(sources): raise SourceMapDecodeError( "Segment %s references source %d; there are " "%d sources" % (segment, src_id, len(sources)) ) src = sources[src_id] src_line += parse[2] src_col += parse[3] if len(parse) > 4: name_id += parse[4] if not 0 <= name_id < len(names): raise SourceMapDecodeError( "Segment %s references name %d; there are " "%d names" % (segment, name_id, len(names)) ) name = names[name_id] except IndexError: raise SourceMapDecodeError( "Invalid segment %s, parsed as %r" % (segment, parse) ) try: assert dst_line >= 0, ('dst_line', dst_line) assert dst_col >= 0, ('dst_col', dst_col) assert src_line >= 0, ('src_line', src_line) assert src_col >= 0, ('src_col', src_col) except AssertionError as e: raise SourceMapDecodeError( "Segment %s has negative %s (%d), in file %s" % (segment, e.message[0], e.message[1], src) ) token = Token(dst_line, dst_col, src, src_line, src_col, name) tokens.append(token) # Insert into main index index[(dst_line, dst_col)] = token # Insert into specific line index line_index[dst_line].append(dst_col) return SourceMapIndex(smap, tokens, line_index, index, sources)
def sections(filename):
    '''Read and print paths from the given hosts file.'''
    structure = _read_structure_lazy(filename, include_hosts=False)
    for path in map(_join, structure):
        print(path)
class AutoSummDirective(AutodocDirective, Autosummary): """automodule directive that makes a summary at the beginning of the module This directive combines the :class:`sphinx.ext.autodoc.directives.AutodocDirective` and :class:`sphinx.ext.autosummary.Autosummary` directives to put a summary of the specified module at the beginning of the module documentation.""" if sphinx_version < [1, 7]: _default_flags = AutodocDirective._default_flags.union( {'autosummary'} | set(map('autosummary-{}'.format, member_options)) ) else: AUTODOC_DEFAULT_OPTIONS.append('autosummary') AUTODOC_DEFAULT_OPTIONS.extend( map('autosummary-{}'.format, member_options)) @property def autosummary_documenter(self): """Returns the AutosummaryDocumenter subclass that can be used""" try: return self._autosummary_documenter except AttributeError: pass objtype = self.name[4:] env = self.state.document.settings.env if sphinx_version < [1, 7]: doc_class = self._registry[objtype] params = self else: reporter = self.state.document.reporter try: lineno = reporter.get_source_and_line(self.lineno)[1] except AttributeError: lineno = None doc_class = get_documenters(self.env.app)[objtype] args = (self.state, ) if sphinx_version >= [2, 1] else () params = DocumenterBridge( env, reporter, process_documenter_options(doc_class, env.config, self.options), lineno, *args) documenter = doc_class(params, self.arguments[0]) if hasattr(documenter, 'get_grouped_documenters'): self._autosummary_documenter = documenter return documenter # in case the has been changed in the registry, we decide manually if objtype == 'module': documenter = AutoSummModuleDocumenter(params, self.arguments[0]) elif objtype == 'class': documenter = AutoSummClassDocumenter(params, self.arguments[0]) else: raise ValueError( "Could not find a valid documenter for the object type %s" % ( objtype)) self._autosummary_documenter = documenter return documenter def run(self): """Run method for the directive""" options_save = self.options.copy() doc_nodes = AutodocDirective.run(self) self.options.update(options_save) if 'autosummary' not in self.options: return doc_nodes try: self.env = self.state.document.settings.env except AttributeError: pass # is set automatically with sphinx >= 1.8.0 if sphinx_version < [2, 0]: self.warnings = [] self.result = ViewList() documenter = self.autosummary_documenter grouped_documenters = documenter.get_grouped_documenters() summ_nodes = self.autosumm_nodes(documenter, grouped_documenters) dn = summ_nodes.pop(documenter.fullname) if self.name == 'automodule': doc_nodes = self.inject_summ_nodes(doc_nodes, summ_nodes) # insert the nodes directly after the paragraphs if self.name == 'autoclass': for node in dn[::-1]: self._insert_after_paragraphs(doc_nodes[1], node) dn = [] elif self.name == 'automodule': # insert table before the documentation of the members istart = 2 if 'noindex' not in self.options else 0 # if we have a title in the module, we look for the section if (len(doc_nodes) >= istart + 1 and isinstance(doc_nodes[istart], nodes.section)): others = doc_nodes[istart] istart = 2 # skip the title else: others = doc_nodes found = False if len(others[istart:]) >= 2: for i in range(istart, len(others)): if isinstance(others[i], sphinx.addnodes.index): found = True break if found: for node in dn[::-1]: others.insert(i, node) dn = [] return self.warnings + dn + doc_nodes def _insert_after_paragraphs(self, node, insertion): """Inserts the given `insertion` node after the paragraphs in `node` This method inserts the `insertion` node after the 
instances of nodes.paragraph in the given `node`. Usually the node of one documented class is set up like Name of the documented item (allways) (nodes.Element) Summary (sometimes) (nodes.paragraph) description (sometimes) (nodes.paragraph) Parameters section (sometimes) (nodes.rubric) We want to be below the description, so we loop until we are below all the paragraphs. IF that does not work, we simply put it at the end""" found = False if len(node) >= 2: for i in range(len(node[1])): if not isinstance(node[1][i], nodes.paragraph): node[1].insert(i + 1, insertion) found = True break if not found: node.insert(1, insertion) def inject_summ_nodes(self, doc_nodes, summ_nodes): """Method to inject the autosummary nodes into the autodoc nodes Parameters ---------- doc_nodes: list The list of nodes as they are generated by the :meth:`sphinx.ext.autodoc.AutodocDirective.run` method summ_nodes: dict The generated autosummary nodes as they are generated by the :meth:`autosumm_nodes` method. Note that `summ_nodes` must only contain the members autosummary tables! Returns ------- doc_nodes: list The modified `doc_nodes` Notes ----- `doc_nodes` are modified in place and not copied!""" def inject_summary(node): if isinstance(node, nodes.section): for sub in node: inject_summary(sub) return if (len(node) and (isinstance(node, nodes.section) or ( isinstance(node[0], nodes.Element) and node[0].get('module') and node[0].get('fullname')))): node_summ_nodes = summ_nodes.get("%s.%s" % ( node[0]['module'], node[0]['fullname'])) if not node_summ_nodes: return for summ_node in node_summ_nodes[::-1]: self._insert_after_paragraphs(node, summ_node) for node in doc_nodes: inject_summary(node) return doc_nodes def autosumm_nodes(self, documenter, grouped_documenters): """Create the autosummary nodes based on the documenter content Parameters ---------- documenter: sphinx.ext.autodoc.Documenter The base (module or class) documenter for which to generate the autosummary tables of its members grouped_documenters: dict The dictionary as it is returned from the :meth:`AutosummaryDocumenter.get_grouped_documenters` method Returns ------- dict a mapping from the objects fullname to the corresponding autosummary tables of its members. The objects include the main object of the given `documenter` and the classes that are defined in it See Also -------- AutosummaryDocumenter.get_grouped_documenters, inject_summ_nodes""" summ_nodes = {} this_nodes = [] for section, documenters in six.iteritems(grouped_documenters): items = self.get_items_from_documenters(documenters) if not items: continue node = nodes.rubric() # create note for the section title (we could also use .. rubric # but that causes problems for latex documentations) self.state.nested_parse( ViewList(['**%s**' % section]), 0, node) this_nodes += node this_nodes += self.get_table(items) for mdocumenter, check_module in documenters: if (mdocumenter.objtype == 'class' and not (check_module and not mdocumenter.check_module())): if hasattr(mdocumenter, 'get_grouped_documenters'): summ_nodes.update(self.autosumm_nodes( mdocumenter, mdocumenter.get_grouped_documenters()) ) summ_nodes[documenter.fullname] = this_nodes return summ_nodes def get_items_from_documenters(self, documenters): """Return the items needed for creating the tables This method creates the items that are used by the :meth:`sphinx.ext.autosummary.Autosummary.get_table` method by what is taken from the values of the :meth:`AutoSummModuleDocumenter.get_grouped_documenters` method. 
Returns ------- list A list containing tuples like ``(name, signature, summary_string, real_name)`` that can be used for the :meth:`sphinx.ext.autosummary.Autosummary.get_table` method.""" items = [] max_item_chars = 50 base_documenter = self.autosummary_documenter try: base_documenter.analyzer = ModuleAnalyzer.for_module( base_documenter.real_modname) attr_docs = base_documenter.analyzer.find_attr_docs() except PycodeError as err: logger.debug('[autodocsumm] module analyzer failed: %s', err) # no source file -- e.g. for builtin and C modules base_documenter.analyzer = None attr_docs = {} # at least add the module.__file__ as a dependency if (hasattr(base_documenter.module, '__file__') and base_documenter.module.__file__): base_documenter.directive.filename_set.add( base_documenter.module.__file__) else: base_documenter.directive.filename_set.add( base_documenter.analyzer.srcname) for documenter, check_module in documenters: documenter.parse_name() documenter.import_object() documenter.real_modname = documenter.get_real_modname() real_name = documenter.fullname display_name = documenter.object_name if display_name is None: # for instance attributes display_name = documenter.objpath[-1] if check_module and not documenter.check_module(): continue # -- Grab the signature sig = documenter.format_signature() if not sig: sig = '' else: max_chars = max(10, max_item_chars - len(display_name)) sig = mangle_signature(sig, max_chars=max_chars) # sig = sig.replace('*', r'\*') # -- Grab the documentation no_docstring = False if documenter.objpath: key = ('.'.join(documenter.objpath[:-1]), documenter.objpath[-1]) try: doc = attr_docs[key] no_docstring = True except KeyError: pass if not no_docstring: documenter.add_content(None) doc = documenter.get_doc() if doc: doc = doc[0] else: continue while doc and not doc[0].strip(): doc.pop(0) # If there's a blank line, then we can assume the first sentence / # paragraph has ended, so anything after shouldn't be part of the # summary for i, piece in enumerate(doc): if not piece.strip(): doc = doc[:i] break # Try to find the "first sentence", which may span multiple lines m = re.search(r"^([A-Z].*?\.)(?:\s|$)", " ".join(doc).strip()) if m: summary = m.group(1).strip() elif doc: summary = doc[0].strip() else: summary = '' items.append((display_name, sig, summary, real_name)) return items
import re

import sphinx
import sphinx.ext.autodoc as ad
from sphinx.ext.autosummary import Autosummary, mangle_signature
from docutils import nodes
from docutils.statemachine import ViewList

if sphinx.__version__ >= '1.7':
    from sphinx.ext.autodoc import Signature, get_documenters
    from sphinx.ext.autodoc.directive import (
        AutodocDirective, AUTODOC_DEFAULT_OPTIONS, DocumenterBridge,
        process_documenter_options)
else:
    from sphinx.ext.autodoc import (
        getargspec, formatargspec, AutoDirective as AutodocDirective,
        AutoDirective as AutodocRegistry)

sphinx_version = list(map(float, re.findall(r'\d+', sphinx.__version__)[:3]))

if sphinx_version >= [2, 0]:
    from sphinx.util import force_decode
else:
    from sphinx.ext.autodoc import force_decode

try:
    from cyordereddict import OrderedDict
except ImportError:
    try:
        from collections import OrderedDict
    except ImportError:
        from ordereddict import OrderedDict
meta[ParentId].append((Id, IsAccepted, TimeToAnswer, Score)) else: continue Text, NumTextTokens, NumCodeLines, LinkCount, NumImages = filter_html( elem.get('Body')) values = (Id, ParentId, IsAccepted, TimeToAnswer, Score, Text.encode("utf-8"), NumTextTokens, NumCodeLines, LinkCount, NumImages) yield values root.clear() # preserve memory if counter >= 1000000: break with open(filename_filtered, "w") as f: for values in parsexml(filename): line = "\t".join(map(str, values)) f.write(line + "\n") with open(filename_filtered_meta, "w") as f: json.dump(meta, f) print("years:", years) print("#qestions: %i" % num_questions) print("#answers: %i" % num_answers)
def read_udfs(pickleSer, infile, eval_type): runner_conf = {} if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF, PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF, PythonEvalType.SQL_MAP_PANDAS_ITER_UDF, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF, PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF): # Load conf used for pandas_udf evaluation num_conf = read_int(infile) for i in range(num_conf): k = utf8_deserializer.loads(infile) v = utf8_deserializer.loads(infile) runner_conf[k] = v # NOTE: if timezone is set here, that implies respectSessionTimeZone is True timezone = runner_conf.get("spark.sql.session.timeZone", None) safecheck = runner_conf.get( "spark.sql.execution.pandas.arrowSafeTypeConversion", "false").lower() == 'true' # Used by SQL_GROUPED_MAP_PANDAS_UDF and SQL_SCALAR_PANDAS_UDF when returning StructType assign_cols_by_name = runner_conf.get( "spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName", "true")\ .lower() == "true" # Scalar Pandas UDF handles struct type arguments as pandas DataFrames instead of # pandas Series. See SPARK-27240. df_for_struct = ( eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF or eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF or eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF) ser = ArrowStreamPandasUDFSerializer(timezone, safecheck, assign_cols_by_name, df_for_struct) else: ser = BatchedSerializer(PickleSerializer(), 100) num_udfs = read_int(infile) is_scalar_iter = eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF is_map_iter = eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF if is_scalar_iter or is_map_iter: if is_scalar_iter: assert num_udfs == 1, "One SCALAR_ITER UDF expected here." if is_map_iter: assert num_udfs == 1, "One MAP_ITER UDF expected here." arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=0) def func(_, iterator): num_input_rows = [0] def map_batch(batch): udf_args = [batch[offset] for offset in arg_offsets] num_input_rows[0] += len(udf_args[0]) if len(udf_args) == 1: return udf_args[0] else: return tuple(udf_args) iterator = map(map_batch, iterator) result_iter = udf(iterator) num_output_rows = 0 for result_batch, result_type in result_iter: num_output_rows += len(result_batch) assert is_map_iter or num_output_rows <= num_input_rows[0], \ "Pandas MAP_ITER UDF outputted more rows than input rows." yield (result_batch, result_type) if is_scalar_iter: try: next(iterator) except StopIteration: pass else: raise RuntimeError( "SQL_SCALAR_PANDAS_ITER_UDF should exhaust the input " "iterator.") if is_scalar_iter and num_output_rows != num_input_rows[0]: raise RuntimeError( "The number of output rows of pandas iterator UDF should be " "the same with input rows. The input rows number is %d but the " "output rows number is %d." % (num_input_rows[0], num_output_rows)) # profiling is not supported for UDF return func, None, ser, ser udfs = {} call_udf = [] mapper_str = "" if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF: # Create function like this: # lambda a: f([a[0]], [a[0], a[1]]) # We assume there is only one UDF here because grouped map doesn't # support combining multiple UDFs. 
assert num_udfs == 1 # See FlatMapGroupsInPandasExec for how arg_offsets are used to # distinguish between grouping attributes and data attributes arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=0) udfs['f'] = udf split_offset = arg_offsets[0] + 1 arg0 = ["a[%d]" % o for o in arg_offsets[1:split_offset]] arg1 = ["a[%d]" % o for o in arg_offsets[split_offset:]] mapper_str = "lambda a: f([%s], [%s])" % (", ".join(arg0), ", ".join(arg1)) else: # Create function like this: # lambda a: (f0(a[0]), f1(a[1], a[2]), f2(a[3])) # In the special case of a single UDF this will return a single result rather # than a tuple of results; this is the format that the JVM side expects. for i in range(num_udfs): arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i) udfs['f%d' % i] = udf args = ["a[%d]" % o for o in arg_offsets] call_udf.append("f%d(%s)" % (i, ", ".join(args))) mapper_str = "lambda a: (%s)" % (", ".join(call_udf)) mapper = eval(mapper_str, udfs) func = lambda _, it: map(mapper, it) # profiling is not supported for UDF return func, None, ser, ser
def read_udfs(pickleSer, infile, eval_type): runner_conf = {} if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF, PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF, PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF, PythonEvalType.SQL_MAP_PANDAS_ITER_UDF, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF, PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF): # Load conf used for pandas_udf evaluation num_conf = read_int(infile) for i in range(num_conf): k = utf8_deserializer.loads(infile) v = utf8_deserializer.loads(infile) runner_conf[k] = v # NOTE: if timezone is set here, that implies respectSessionTimeZone is True timezone = runner_conf.get("spark.sql.session.timeZone", None) safecheck = runner_conf.get( "spark.sql.execution.pandas.convertToArrowArraySafely", "false").lower() == 'true' # Used by SQL_GROUPED_MAP_PANDAS_UDF and SQL_SCALAR_PANDAS_UDF when returning StructType assign_cols_by_name = runner_conf.get( "spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName", "true")\ .lower() == "true" if eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF: ser = CogroupUDFSerializer(timezone, safecheck, assign_cols_by_name) else: # Scalar Pandas UDF handles struct type arguments as pandas DataFrames instead of # pandas Series. See SPARK-27240. df_for_struct = ( eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF or eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF or eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF) ser = ArrowStreamPandasUDFSerializer(timezone, safecheck, assign_cols_by_name, df_for_struct) else: ser = BatchedSerializer(PickleSerializer(), 100) num_udfs = read_int(infile) is_scalar_iter = eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF is_map_iter = eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF if is_scalar_iter or is_map_iter: if is_scalar_iter: assert num_udfs == 1, "One SCALAR_ITER UDF expected here." if is_map_iter: assert num_udfs == 1, "One MAP_ITER UDF expected here." arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=0) def func(_, iterator): num_input_rows = [ 0 ] # TODO(SPARK-29909): Use nonlocal after we drop Python 2. def map_batch(batch): udf_args = [batch[offset] for offset in arg_offsets] num_input_rows[0] += len(udf_args[0]) if len(udf_args) == 1: return udf_args[0] else: return tuple(udf_args) iterator = map(map_batch, iterator) result_iter = udf(iterator) num_output_rows = 0 for result_batch, result_type in result_iter: num_output_rows += len(result_batch) # This assert is for Scalar Iterator UDF to fail fast. # The length of the entire input can only be explicitly known # by consuming the input iterator in user side. Therefore, # it's very unlikely the output length is higher than # input length. assert is_map_iter or num_output_rows <= num_input_rows[0], \ "Pandas SCALAR_ITER UDF outputted more rows than input rows." yield (result_batch, result_type) if is_scalar_iter: try: next(iterator) except StopIteration: pass else: raise RuntimeError( "pandas iterator UDF should exhaust the input " "iterator.") if num_output_rows != num_input_rows[0]: raise RuntimeError( "The length of output in Scalar iterator pandas UDF should be " "the same with the input's; however, the length of output was %d and the " "length of input was %d." 
% (num_output_rows, num_input_rows[0])) # profiling is not supported for UDF return func, None, ser, ser def extract_key_value_indexes(grouped_arg_offsets): """ Helper function to extract the key and value indexes from arg_offsets for the grouped and cogrouped pandas udfs. See BasePandasGroupExec.resolveArgOffsets for equivalent scala code. :param grouped_arg_offsets: List containing the key and value indexes of columns of the DataFrames to be passed to the udf. It consists of n repeating groups where n is the number of DataFrames. Each group has the following format: group[0]: length of group group[1]: length of key indexes group[2.. group[1] +2]: key attributes group[group[1] +3 group[0]]: value attributes """ parsed = [] idx = 0 while idx < len(grouped_arg_offsets): offsets_len = grouped_arg_offsets[idx] idx += 1 offsets = grouped_arg_offsets[idx:idx + offsets_len] split_index = offsets[0] + 1 offset_keys = offsets[1:split_index] offset_values = offsets[split_index:] parsed.append([offset_keys, offset_values]) idx += offsets_len return parsed if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF: # We assume there is only one UDF here because grouped map doesn't # support combining multiple UDFs. assert num_udfs == 1 # See FlatMapGroupsInPandasExec for how arg_offsets are used to # distinguish between grouping attributes and data attributes arg_offsets, f = read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=0) parsed_offsets = extract_key_value_indexes(arg_offsets) # Create function like this: # mapper a: f([a[0]], [a[0], a[1]]) def mapper(a): keys = [a[o] for o in parsed_offsets[0][0]] vals = [a[o] for o in parsed_offsets[0][1]] return f(keys, vals) elif eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF: # We assume there is only one UDF here because cogrouped map doesn't # support combining multiple UDFs. assert num_udfs == 1 arg_offsets, f = read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=0) parsed_offsets = extract_key_value_indexes(arg_offsets) def mapper(a): df1_keys = [a[0][o] for o in parsed_offsets[0][0]] df1_vals = [a[0][o] for o in parsed_offsets[0][1]] df2_keys = [a[1][o] for o in parsed_offsets[1][0]] df2_vals = [a[1][o] for o in parsed_offsets[1][1]] return f(df1_keys, df1_vals, df2_keys, df2_vals) else: udfs = [] for i in range(num_udfs): udfs.append( read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i)) def mapper(a): result = tuple( f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs) # In the special case of a single UDF this will return a single result rather # than a tuple of results; this is the format that the JVM side expects. if len(result) == 1: return result[0] else: return result func = lambda _, it: map(mapper, it) # profiling is not supported for UDF return func, None, ser, ser
def func(s, iterator):
    return chain.from_iterable(map(f, iterator))
def _some1(predicate, iterable):
    """Alternative implementation of :func:`some`."""
    return any(map(predicate, iterable))
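# Usage sketch (not from the original source): any() over a lazy map()
# short-circuits on the first truthy result under Python 3.
assert _some1(lambda x: x > 2, [1, 2, 3]) is True
assert _some1(str.isdigit, ["a", "b"]) is False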
def colorize_root(root):
    colorize = ft.partial(tty.color.colorize, color=sys.stdout.isatty())
    pre, post = map(colorize, "@M[@. @M]@.".split())
    return "".join([pre, root, post])
def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True): """ Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`. When ``schema`` is a list of column names, the type of each column will be inferred from ``data``. When ``schema`` is ``None``, it will try to infer the schema (column names and types) from ``data``, which should be an RDD of :class:`Row`, or :class:`namedtuple`, or :class:`dict`. When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match the real data, or an exception will be thrown at runtime. If the given schema is not :class:`pyspark.sql.types.StructType`, it will be wrapped into a :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value", each record will also be wrapped into a tuple, which can be converted to row later. If schema inference is needed, ``samplingRatio`` is used to determined the ratio of rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``. :param data: an RDD of any kind of SQL data representation(e.g. row, tuple, int, boolean, etc.), or :class:`list`, or :class:`pandas.DataFrame`. :param schema: a :class:`pyspark.sql.types.DataType` or a datatype string or a list of column names, default is ``None``. The data type string format equals to :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use ``int`` as a short name for ``IntegerType``. :param samplingRatio: the sample ratio of rows used for inferring :param verifySchema: verify data types of every row against schema. :return: :class:`DataFrame` .. versionchanged:: 2.1 Added verifySchema. >>> l = [('Alice', 1)] >>> spark.createDataFrame(l).collect() [Row(_1=u'Alice', _2=1)] >>> spark.createDataFrame(l, ['name', 'age']).collect() [Row(name=u'Alice', age=1)] >>> d = [{'name': 'Alice', 'age': 1}] >>> spark.createDataFrame(d).collect() [Row(age=1, name=u'Alice')] >>> rdd = sc.parallelize(l) >>> spark.createDataFrame(rdd).collect() [Row(_1=u'Alice', _2=1)] >>> df = spark.createDataFrame(rdd, ['name', 'age']) >>> df.collect() [Row(name=u'Alice', age=1)] >>> from pyspark.sql import Row >>> Person = Row('name', 'age') >>> person = rdd.map(lambda r: Person(*r)) >>> df2 = spark.createDataFrame(person) >>> df2.collect() [Row(name=u'Alice', age=1)] >>> from pyspark.sql.types import * >>> schema = StructType([ ... StructField("name", StringType(), True), ... StructField("age", IntegerType(), True)]) >>> df3 = spark.createDataFrame(rdd, schema) >>> df3.collect() [Row(name=u'Alice', age=1)] >>> spark.createDataFrame(df.toPandas()).collect() # doctest: +SKIP [Row(name=u'Alice', age=1)] >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP [Row(0=1, 1=2)] >>> spark.createDataFrame(rdd, "a: string, b: int").collect() [Row(a=u'Alice', b=1)] >>> rdd = rdd.map(lambda row: row[1]) >>> spark.createDataFrame(rdd, "int").collect() [Row(value=1)] >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... Py4JJavaError: ... 
""" if isinstance(data, DataFrame): raise TypeError("data is already a DataFrame") if isinstance(schema, basestring): schema = _parse_datatype_string(schema) elif isinstance(schema, (list, tuple)): # Must re-encode any unicode strings to be consistent with StructField names schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema] try: import pandas has_pandas = True except Exception: has_pandas = False if has_pandas and isinstance(data, pandas.DataFrame): from pyspark.sql.utils import require_minimum_pandas_version require_minimum_pandas_version() if self.conf.get("spark.sql.execution.pandas.respectSessionTimeZone").lower() \ == "true": timezone = self.conf.get("spark.sql.session.timeZone") else: timezone = None # If no schema supplied by user then get the names of columns only if schema is None: schema = [str(x) if not isinstance(x, basestring) else (x.encode('utf-8') if not isinstance(x, str) else x) for x in data.columns] if self.conf.get("spark.sql.execution.arrow.enabled", "false").lower() == "true" \ and len(data) > 0: try: return self._create_from_pandas_with_arrow(data, schema, timezone) except Exception as e: warnings.warn("Arrow will not be used in createDataFrame: %s" % str(e)) # Fallback to create DataFrame without arrow if raise some exception data = self._convert_from_pandas(data, schema, timezone) if isinstance(schema, StructType): verify_func = _make_type_verifier(schema) if verifySchema else lambda _: True def prepare(obj): verify_func(obj) return obj elif isinstance(schema, DataType): dataType = schema schema = StructType().add("value", schema) verify_func = _make_type_verifier( dataType, name="field value") if verifySchema else lambda _: True def prepare(obj): verify_func(obj) return obj, else: prepare = lambda obj: obj if isinstance(data, RDD): rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio) else: rdd, schema = self._createFromLocal(map(prepare, data), schema) jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd()) jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json()) df = DataFrame(jdf, self._wrapped) df._schema = schema return df
def search_winners(self, states):
    # A belief state is a list of tuples (prob, candidate,
    # previous candidate) describing the probability of each
    # candidate in the state at a certain time. The previous
    # candidates are used to reconstruct the most likely path.
    prev_belief_state = []
    scanned_candidates = {}

    for state in states:
        if not state:
            continue

        belief_state = [(0, c, None) for c in state]
        state_size = len(belief_state)

        for prev_prob, prev_candidate, _ in prev_belief_state:
            if prev_prob <= 0:
                continue
            transition_probs = self.calculate_transition_costs(
                prev_candidate.body, [c.body for c in state])
            assert len(transition_probs) == state_size
            emission_probs = list(
                map(self.calculate_emission_cost, [c.body for c in state]))
            # Update current belief state
            for idx in range(state_size):
                transition_prob = transition_probs[idx]
                emission_prob = emission_probs[idx]
                if emission_prob <= 0 or transition_prob <= 0:
                    continue
                new_prob = prev_prob * transition_prob * emission_prob
                prob, _, _ = belief_state[idx]
                if prob < new_prob:
                    belief_state[idx] = (new_prob, state[idx], prev_candidate)

        most_prob, winner, _ = max(belief_state, key=lambda c: c[0])

        # If no candidate at the previous state can reach the current state,
        # then it is a new start
        new_start = most_prob <= 0
        if new_start:
            # Update current belief state
            belief_state = [(self.calculate_emission_cost(c.body), c, None)
                            for c in state]
            scanned_candidates = {}
            most_prob, winner, _ = max(belief_state, key=lambda c: c[0])
            if most_prob <= 0:
                continue

        # Update scanned table for reconstructing path
        scanned_candidates.update({c.id: pc for _, c, pc in belief_state})

        yield winner, scanned_candidates, new_start

        # Avoid underflow: multiply all probability values by
        # an estimated scalar
        least_prob, _, _ = min(filter(lambda c: c[0] > 0, belief_state),
                               key=lambda c: c[0])
        scalar = 1
        prob = least_prob
        while prob < 1:
            scalar *= 10
            prob = least_prob * scalar
        if scalar > 1:
            belief_state = [(p * scalar, c, pc) for p, c, pc in belief_state]

        prev_belief_state = belief_state
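# The underflow guard at the end of search_winners rescales every probability by the
# smallest power of ten that lifts the least non-zero probability to at least 1, which
# preserves the relative ordering of candidates. A standalone sketch of that step, with
# made-up probability values for illustration:
belief_state = [(2.5e-7, "c1", None), (8.0e-9, "c2", None), (0.0, "c3", None)]

least_prob = min(p for p, _, _ in belief_state if p > 0)  # 8.0e-9

scalar = 1
prob = least_prob
while prob < 1:
    scalar *= 10
    prob = least_prob * scalar
# scalar is now 10**9

if scalar > 1:
    belief_state = [(p * scalar, c, pc) for p, c, pc in belief_state]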
def is_minimum_version(version, min_version):
    """Return True if version is greater than or equal to min_version"""
    return list(map(int, version.split('.'))) >= list(map(int, min_version.split('.')))
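# Quick usage sketch: mapping each component through int makes the comparison numeric
# rather than lexicographic, so "1.10.0" correctly ranks above "1.9.2".
assert is_minimum_version("1.10.0", "1.9.2")
assert not is_minimum_version("0.9.1", "1.0.0")
assert not ("1.10.0" >= "1.9.2")  # a plain string comparison gives the opposite answer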
def internal_writer(self, outputs, stdout): """ Writer which outputs the python repr for each item. """ for output in outputs: print("\t".join(map(self.internal_serialize, output)), file=stdout)
def is_less_version(version, max_version):
    """Return True if version is less than max_version"""
    return list(map(int, version.split('.'))) < list(map(int, max_version.split('.')))
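# Together the two helpers can express a half-open version range; a hypothetical
# compatibility check built on top of them:
def is_version_in_range(version, min_version, max_version):
    """Return True if version is in [min_version, max_version)."""
    return (is_minimum_version(version, min_version)
            and is_less_version(version, max_version))

assert is_version_in_range("2.3.1", "2.0", "3.0")
assert not is_version_in_range("3.0", "2.0", "3.0")  # upper bound is exclusive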
def _nginx_cerbot_setup(
    domains,
    https_cert_email,
    conf_dirs=("/etc/nginx/sites-enabled",),
    use_sudo=True,
    warn_only=True,
    quiet=True,
):
    if not cmd_avail("certbot"):
        install()
    if domains != "all":
        raise NotImplementedError("{} for domains".format(domains))
    run_cmd = partial(_run_command, sudo=use_sudo)
    if not run("ls -A '{conf_dir}'".format(conf_dir=conf_dirs[0]), shell_escape=False):
        return "{conf_dir} is empty; skipping".format(conf_dir=conf_dirs[0])
    server_names_t = tuple(
        chain(*(run_cmd("grep -RF server_name '{conf_dir}'".format(
            conf_dir=conf_dir)).split("\n") for conf_dir in conf_dirs)))
    hosts = tuple(
        l.partition("127.0.0.1")[2].strip()
        for l in run_cmd("grep -F 127.0.0.1 /etc/hosts").split("\n")
        if "localhost" not in l)
    server_names_d = dict(
        (lambda spl: (spl[1].lstrip().rstrip("; \t\r"),
                      spl[0][:spl[0].rfind(":")]))(l.split("server_name"))
        for l in server_names_t)
    if len(server_names_d) < len(server_names_t):
        raise NotImplementedError(
            "Same server_name in multiple files. We don't know what to stop!")
    hosts_d = {
        host: server_names_d[host]
        for host in hosts
        if host.count(".") > 1 and host in server_names_d
        and len(host.translate(None, "~^|()?*")) == len(host)
    }
    if not hosts_d:
        return "hosts_d is empty; skipping"

    run_cmd("mkdir -p /etc/nginx/sites-disabled")
    sites_avail_local_filepath = resource_filename(
        "offregister_app_push", path.join("conf", "nginx.sites-available.conf"))

    def certbot_prep(dns_name, conf_loc):
        run_cmd("mv '{}' '/etc/nginx/sites-disabled/{}'".format(
            conf_loc, path.split(conf_loc)[1]))
        wwwroot = "/var/www/static/{dns_name}".format(dns_name=dns_name)
        if exists(wwwroot):
            run_cmd("rm -r '{wwwroot}'".format(wwwroot=wwwroot))
        run_cmd("mkdir -p '{wwwroot}'".format(wwwroot=wwwroot))
        _send_nginx_conf(
            conf_remote_filename="/etc/nginx/sites-enabled/{dns_name}-certbot".format(
                dns_name=dns_name),
            sites_avail_local_filepath=sites_avail_local_filepath,
            proxy_block_local_filepath=None,
            conf_vars={
                "NGINX_PORT": 80,
                "DNS_NAMES": (dns_name,),
                "DESCRIPTION": "Temporary conf doing certbot for {}".format(dns_name),
                "WWWPATH": "/",
                "WWWROOT": wwwroot,
            },
        )
        print(
            'one("{}", "{}") ='.format(dns_name, conf_loc),
            "-w '{wwwroot}' -d '{dns_name}' ".format(dns_name=dns_name, wwwroot=wwwroot),
        )
        return "-w '{wwwroot}' -d '{dns_name}' ".format(dns_name=dns_name, wwwroot=wwwroot)

    secured_already = (frozenset(
        run_cmd("ls /etc/letsencrypt/live", warn_only=True).splitlines())
                       if exists("/etc/letsencrypt/live") else tuple())
    certbot_cmds = tuple(
        "certbot certonly --agree-tos -m {https_cert_email} --webroot {root}".format(
            https_cert_email=https_cert_email,
            root=certbot_prep(dns_name, conf_loc))
        for dns_name, conf_loc in iteritems(hosts_d)
        if dns_name not in secured_already)
    if not certbot_cmds:
        return ("You must've already secured all your domains. "
                "Otherwise clean: /etc/letsencrypt/live")
    service_name = "nginx"
    if sudo(
            "systemctl status -q {service_name} --no-pager --full".format(
                service_name=service_name),
            warn_only=True,
    ).failed:
        sudo("systemctl start -q {service_name} --no-pager --full".format(
            service_name=service_name))
    else:
        sudo("systemctl reload -q {service_name} --no-pager --full".format(
            service_name=service_name))
    print("certbot_cmds =", certbot_cmds)
    certbot_res = tuple(map(run_cmd, certbot_cmds))
    sudo("cp /etc/nginx/sites-disabled/* /etc/nginx/sites-enabled")
    # sudo('rm -r /etc/nginx/sites-disabled')

    def secure_conf(dns_name, conf_loc, https_header):
        # print 'secure_conf({!r}, {!r})'.format(dns_name, conf_loc)
        # Skip confs that already listen on 443 (grep -q succeeds when a match is found)
        if not run_cmd("grep -Fq 443 {conf_loc}".format(conf_loc=conf_loc),
                       warn_only=True).failed:
            logger.warning(
                "Skipping {conf_loc}; 443 already found within".format(
                    conf_loc=conf_loc))
            return None
        sio = StringIO()
        get(remote_path=conf_loc, use_sudo=use_sudo, local_path=sio)
        sio.seek(0)
        sio_s = sio.read()
        substr = sio_s[sio_s.find("{", sio_s.find("server")):sio_s.rfind("}") +
                       2].replace("listen 80", "listen 443", 1)
        https_header %= {
            "CA_CERT_PATH":
            "/etc/letsencrypt/live/{dns_name}/fullchain.pem".format(dns_name=dns_name),
            "PRIV_KEY_PATH":
            "/etc/letsencrypt/live/{dns_name}/privkey.pem".format(dns_name=dns_name),
        }
        """ # TODO: Address parsing, if not in `listen` keyword
        sni = substr.find('server_name')
        sni = substr[sni:substr.find(';', sni)]
        col = sni.rfind(':')
        col = col.format(':') if col > -1 else col"""
        return put(
            remote_path=conf_loc,
            use_sudo=use_sudo,
            local_path=StringIO("{orig}\n\nserver {substr}".format(
                orig=sio_s,
                substr=substr.replace(
                    "{dns_name};\n".format(dns_name=dns_name),
                    "{dns_name};\n{https_header}\n".format(
                        dns_name=dns_name,
                        https_header=_indent(https_header, 4)),
                    1,
                ),
            )),
        )

    with open(
            resource_filename("offregister_app_push",
                              path.join("conf", "nginx.https_header.conf")),
            "rt",
    ) as f:
        https_header = f.read()
    replaced_confs = tuple(
        secure_conf(dns_name, conf_loc, https_header)
        for dns_name, conf_loc in iteritems(hosts_d))
    sudo("systemctl reload -q {service_name} --no-pager --full".format(
        service_name=service_name))
    return {"certbot_res": certbot_res, "replaced_confs": replaced_confs}
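# The server_names_d construction above turns each `grep -RF server_name` hit into a
# (host, conf_path) pair. A standalone sketch of that parsing, using a made-up grep
# line and str.partition (equivalent to the split when server_name occurs once):
grep_line = "/etc/nginx/sites-enabled/app.conf:    server_name example.com;"
conf_part, _, name_part = grep_line.partition("server_name")
host = name_part.lstrip().rstrip("; \t\r")        # "example.com"
conf_path = conf_part[:conf_part.rfind(":")]      # "/etc/nginx/sites-enabled/app.conf"
assert (host, conf_path) == ("example.com", "/etc/nginx/sites-enabled/app.conf")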
def _group_from_dict(cls, data): params = {x: data.get(x, None) for x in cls._fields} # Parse the feeds if they're provided and generate feed instances. params['feeds'] = tuple(map(Feed.from_dict, data.get('feeds', []))) return cls(**params)
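# Hedged sketch of the kind of namedtuple this alternate constructor is written for;
# the Group fields and the Feed class below are assumptions for illustration only.
from collections import namedtuple


class Feed(namedtuple("Feed", "title url")):
    @classmethod
    def from_dict(cls, data):
        return cls(title=data.get("title"), url=data.get("url"))


class Group(namedtuple("Group", "name owner feeds")):
    @classmethod
    def from_dict(cls, data):
        # Mirrors _group_from_dict: missing fields default to None and raw
        # feed dicts are promoted to Feed instances.
        params = {x: data.get(x, None) for x in cls._fields}
        params['feeds'] = tuple(map(Feed.from_dict, data.get('feeds', [])))
        return cls(**params)


group = Group.from_dict({"name": "news",
                         "feeds": [{"title": "Example", "url": "https://example.com/feed"}]})
assert group.owner is None and group.feeds[0].url == "https://example.com/feed"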
def decode_predictions_beam_search(preds, index2word, glossary=None, alphas=None, heuristic=0, x_text=None, unk_symbol='<unk>', pad_sequences=False, mapping=None, verbose=0): """ Decodes predictions from the BeamSearch method. :param preds: Predictions codified as word indices. :param index2word: Mapping from word indices into word characters. :param alphas: Attention model weights: Float matrix with shape (I, J) (I: number of target items; J: number of source items). :param heuristic: Replace unknown words heuristic (0, 1 or 2) :param x_text: Source text (for unk replacement) :param unk_symbol: Unknown words symbol :param pad_sequences: Whether we should make a zero-pad on the input sequence. :param mapping: Source-target dictionary (for unk_replace heuristics 1 and 2) :param verbose: Verbosity level, by default 0. :return: List of decoded predictions """ if verbose > 0: logger.info('Decoding beam search prediction ...') if alphas is not None: if x_text is None: raise AssertionError( 'When using POS_UNK, you must provide the input ' 'text to decode_predictions_beam_search!') if verbose > 0: logger.info('Using heuristic %d' % heuristic) if pad_sequences: preds = [ pred[:sum([int(elem > 0) for elem in pred]) + 1] for pred in preds ] flattened_predictions = [ list(map(lambda x: index2word[x], pred)) for pred in preds ] final_predictions = [] if alphas is not None: x_text = list(map(lambda x: x.split(), x_text)) hard_alignments = list( map( lambda alignment, x_sentence: np.argmax( alignment[:, :max(1, len(x_sentence))], axis=1), alphas, x_text)) for i, a_no in list(enumerate(flattened_predictions)): if unk_symbol in a_no or glossary is not None: a_no = replace_unknown_words(x_text[i], a_no, hard_alignments[i], unk_symbol, glossary=glossary, heuristic=heuristic, mapping=mapping, verbose=verbose) a_no = [ a.decode('utf-8') if isinstance(a, str) and sys.version_info.major == 2 else a for a in a_no ] tmp = u' '.join(a_no[:-1]) final_predictions.append(tmp) else: for a_no in flattened_predictions: a_no = [ a.decode('utf-8') if isinstance(a, str) and sys.version_info.major == 2 else a for a in a_no ] tmp = u' '.join(a_no[:-1]) final_predictions.append(tmp) return final_predictions
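# Hedged usage sketch of decode_predictions_beam_search with toy data: a made-up
# index2word table and two beams of word indices. With alphas=None no unknown-word
# replacement is attempted, and the final token (assumed to be the end-of-sentence
# marker) is dropped by the a_no[:-1] join.
index2word = {1: 'hello', 2: 'world', 3: '<eos>'}
preds = [[1, 2, 3], [2, 3]]
sentences = decode_predictions_beam_search(preds, index2word)
# sentences == [u'hello world', u'world']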
def __compat_repr__(self): # pragma: nocover def make_param(name): value = getattr(self, name) return '{name}={value!r}'.format(**locals()) params = ', '.join(map(make_param, self._fields)) return 'EntryPoint({params})'.format(**locals())
def sample_ensemble(args, params):
    from data_engine.prepare_data import update_dataset_from_file
    from keras_wrapper.model_ensemble import BeamSearchEnsemble
    from keras_wrapper.cnn_model import loadModel
    from keras_wrapper.dataset import loadDataset
    from keras_wrapper.utils import decode_predictions_beam_search

    logging.info("Using an ensemble of %d models" % len(args.models))
    models = [loadModel(m, -1, full_path=True) for m in args.models]
    dataset = loadDataset(args.dataset)
    dataset = update_dataset_from_file(dataset, args.text, params,
                                       splits=args.splits,
                                       remove_outputs=True)
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['OUTPUTS_IDS_DATASET'][0]]
    # For converting predictions into sentences
    index2word_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words']

    if params.get('APPLY_DETOKENIZATION', False):
        detokenize_function = eval('dataset.' + params['DETOKENIZATION_METHOD'])

    params_prediction = dict()
    params_prediction['max_batch_size'] = params.get('BATCH_SIZE', 20)
    params_prediction['n_parallel_loaders'] = params.get('PARALLEL_LOADERS', 1)
    params_prediction['beam_size'] = params.get('BEAM_SIZE', 6)
    params_prediction['maxlen'] = params.get('MAX_OUTPUT_TEXT_LEN_TEST', 100)
    params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH']
    params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL']
    params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL']
    params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET']
    params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET']
    params_prediction['search_pruning'] = params.get('SEARCH_PRUNING', False)
    params_prediction['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False)
    params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0)
    params_prediction['coverage_penalty'] = params.get('COVERAGE_PENALTY', False)
    params_prediction['length_penalty'] = params.get('LENGTH_PENALTY', False)
    params_prediction['length_norm_factor'] = params.get('LENGTH_NORM_FACTOR', 0.0)
    params_prediction['coverage_norm_factor'] = params.get('COVERAGE_NORM_FACTOR', 0.0)
    params_prediction['pos_unk'] = params.get('POS_UNK', False)
    params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \
        else params.get('MAX_OUTPUT_TEXT_LEN', 50)
    params_prediction['output_max_length_depending_on_x'] = params.get(
        'MAXLEN_GIVEN_X', True)
    params_prediction['output_max_length_depending_on_x_factor'] = params.get(
        'MAXLEN_GIVEN_X_FACTOR', 3)
    params_prediction['output_min_length_depending_on_x'] = params.get(
        'MINLEN_GIVEN_X', True)
    params_prediction['output_min_length_depending_on_x_factor'] = params.get(
        'MINLEN_GIVEN_X_FACTOR', 2)
    params_prediction['attend_on_output'] = params.get(
        'ATTEND_ON_OUTPUT', 'transformer' in params['MODEL_TYPE'].lower())

    heuristic = params.get('HEURISTIC', 0)
    mapping = None if dataset.mapping == dict() else dataset.mapping
    model_weights = args.weights

    if model_weights is not None and model_weights != []:
        assert len(model_weights) == len(models), \
            'You should give a weight to each model. You gave %d models and %d weights.' % \
            (len(models), len(model_weights))
        model_weights = list(map(float, model_weights))
        if len(model_weights) > 1:
            logger.info('Giving the following weights to each model: %s' %
                        str(model_weights))

    for s in args.splits:
        # Apply model predictions
        params_prediction['predict_on_sets'] = [s]
        beam_searcher = BeamSearchEnsemble(models,
                                           dataset,
                                           params_prediction,
                                           model_weights=model_weights,
                                           n_best=args.n_best,
                                           verbose=args.verbose)
        if args.n_best:
            predictions, n_best = beam_searcher.predictBeamSearchNet()[s]
        else:
            predictions = beam_searcher.predictBeamSearchNet()[s]
            n_best = None
        if params_prediction['pos_unk']:
            samples = predictions[0]
            alphas = predictions[1]
            sources = [x.strip() for x in open(args.text, 'r').read().split('\n')]
            sources = sources[:-1] if len(sources[-1]) == 0 else sources
        else:
            samples = predictions
            alphas = None
            heuristic = None
            sources = None

        predictions = decode_predictions_beam_search(samples,
                                                     index2word_y,
                                                     alphas=alphas,
                                                     x_text=sources,
                                                     heuristic=heuristic,
                                                     mapping=mapping,
                                                     verbose=args.verbose)
        # Apply detokenization function if needed
        if params.get('APPLY_DETOKENIZATION', False):
            predictions = list(map(detokenize_function, predictions))

        if args.n_best:
            n_best_predictions = []
            for i, (n_best_preds, n_best_scores, n_best_alphas) in enumerate(n_best):
                n_best_sample_score = []
                for n_best_pred, n_best_score, n_best_alpha in zip(
                        n_best_preds, n_best_scores, n_best_alphas):
                    pred = decode_predictions_beam_search(
                        [n_best_pred],
                        index2word_y,
                        alphas=[n_best_alpha] if params_prediction['pos_unk'] else None,
                        x_text=[sources[i]] if params_prediction['pos_unk'] else None,
                        heuristic=heuristic,
                        mapping=mapping,
                        verbose=args.verbose)
                    # Apply detokenization function if needed
                    if params.get('APPLY_DETOKENIZATION', False):
                        pred = list(map(detokenize_function, pred))
                    n_best_sample_score.append([i, pred, n_best_score])
                n_best_predictions.append(n_best_sample_score)
        # Store result
        if args.dest is not None:
            filepath = args.dest  # results file
            if params.get('SAMPLING_SAVE_MODE', 'list') == 'list':
                list2file(filepath, predictions)
                if args.n_best:
                    nbest2file(filepath + '.nbest', n_best_predictions)
            else:
                raise Exception('Only "list" is allowed in "SAMPLING_SAVE_MODE"')
        else:
            list2stdout(predictions)
            if args.n_best:
                logging.info('Storing n-best sentences in ./' + s + '.nbest')
                nbest2file('./' + s + '.nbest', n_best_predictions)
        logging.info('Sampling finished')
def parsexml(filename): global num_questions, num_answers counter = 0 it = map(itemgetter(1), iter(etree.iterparse(filename, events=('start', )))) root = next(it) # get posts element for elem in it: if counter % 100000 == 0: print("Processed %i <row/> elements" % counter) counter += 1 if elem.tag == 'row': creation_date = dateparser.parse(elem.get('CreationDate')) Id = int(elem.get('Id')) PostTypeId = int(elem.get('PostTypeId')) Score = int(elem.get('Score')) if PostTypeId == 1: num_questions += 1 years[creation_date.year] += 1 ParentId = -1 TimeToAnswer = 0 q_creation[Id] = creation_date accepted = elem.get('AcceptedAnswerId') if accepted: q_accepted[Id] = int(accepted) IsAccepted = 0 elif PostTypeId == 2: num_answers += 1 ParentId = int(elem.get('ParentId')) if not ParentId in q_creation: # question was too far in the past continue TimeToAnswer = (creation_date - q_creation[ParentId]).seconds if ParentId in q_accepted: IsAccepted = int(q_accepted[ParentId] == Id) else: IsAccepted = 0 meta[ParentId].append((Id, IsAccepted, TimeToAnswer, Score)) else: continue Text, NumTextTokens, NumCodeLines, LinkCount, NumImages = filter_html( elem.get('Body')) values = (Id, ParentId, IsAccepted, TimeToAnswer, Score, Text.encode("utf-8"), NumTextTokens, NumCodeLines, LinkCount, NumImages) yield values root.clear() # preserve memory if counter >= 1000000: break
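# Hedged usage sketch: stream the generator into a TSV file. It assumes the
# module-level names the parser relies on (etree, dateparser, filter_html, and the
# counters/dicts initialised below) are available, and that "Posts.xml" is a
# Stack Exchange-style dump; both file names are placeholders.
from collections import defaultdict

num_questions = num_answers = 0
years = defaultdict(int)
q_creation, q_accepted = {}, {}
meta = defaultdict(list)

with open("posts.tsv", "w") as out:
    for values in parsexml("Posts.xml"):
        out.write("\t".join(map(str, values)) + "\n")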