Example #1
    def estimate(self):
        file_hash = self.request.get('hash', None)
        paths = JobPaths(DST_PATH, file_hash)

        # extract metadata
        metadata = _delayed_read_metadata(paths.metadata)
        if not metadata:
            return json.dumps(dict(target=0, cur=0, proc=0))

        # build estimate
        _joiner = partial(os.path.join, SRC_PATH)
        # materialise the paths: they are consumed twice (size sum and zip check)
        src_paths = list(filter(os.path.isfile, map(_joiner, metadata.filepaths)))
        target = sum(map(os.path.getsize, src_paths))

        # Check if the file is outdated, report 0 in that case.
        # The worker will redo this check and rebuild the zip.
        # This avoids us serving an outdated zip, before the job has
        # a chance to rebuild it.
        size = 0

        if paths.has_zip():
            outdated = _is_zip_outdated(paths.zip, src_paths)
            size = 0 if outdated else os.path.getsize(paths.zip)

        result = dict(
            target=target,
            cur=size,
            proc=100 if paths.has_done() else (
                size * 100 / target if target else 0)
        )

        return json.dumps(result)
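
For reference, a sketch of the JSON payload this endpoint produces, with field meanings inferred from the code above (target = total source bytes, cur = current zip size, proc = percent complete); the values are illustrative only:

# json.dumps(result) might yield, for a half-built zip:
#   {"target": 1048576, "cur": 524288, "proc": 50.0}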
Example #2
def tph_gf():
    t = map(lambda n: n * (n + 1) // 2, count(1))
    p = map(lambda n: n * (3 * n - 1) // 2, count(1))
    h = map(lambda n: n * (2 * n - 1), count(1))
    filter_tph = filter_equal(filter_equal(t, p), h)
    while True:
        yield next(filter_tph)
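
The generator above leans on a filter_equal helper that is not shown here; a minimal sketch of what it might look like, assuming both arguments are strictly increasing iterators:

def filter_equal(xs, ys):
    # Advance two increasing streams in lockstep and yield only the values
    # that appear in both of them.
    x, y = next(xs), next(ys)
    while True:
        if x == y:
            yield x
            x, y = next(xs), next(ys)
        elif x < y:
            x = next(xs)
        else:
            y = next(ys)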
Example #3
def fetch_generator(tabix, contig):
    fetch = tabix.fetch(contig)
    rows = map(lambda x: x.split('\t'), fetch)
    annos = (row for row in rows if "CodingTranscript" in row[9])
    json_rows = map(_map_line_to_json, annos)
    json_rows = (row for row in json_rows if row)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "cadd") for rg in row_groups)
Example #4
File: topics.py Project: Mic92/mygpo
    def __iter__(self):
        lists = PodcastList.by_rating(endkey=self.min_list_rating)
        lists = islice(lists, 0, self.num_lists)
        lists = map(self._prepare_list, lists)

        categories = Category.top_categories(self.num_categories)
        categories = map(self._prepare_category, categories)

        return chain(lists, categories)
Example #5
def bencode(value):
    if type(value) is tuple: value = list(value)
    switch = {
        # Flatten the list of pairs before bencoding each one.  BT spec says sort them.
        dict: (b'd%se', lambda x: b''.join(map(bencode, chain.from_iterable(sorted(x.items()))))),
        list: (b'l%se', lambda x: b''.join(map(bencode, x))),
        int:  (b'i%de', lambda x: x),
    }.get(type(value), (b'%d:%s', lambda x: (lambda y: (len(y), y))(str(x))))
    return switch[0] % switch[1](value)
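
A hedged usage sketch: the b'...' %-formatting in the fallback branch assumes Python 2, where bytes and str are the same type. Expected results under that assumption (shown as comments, not executed here):

# bencode(42)                      -> 'i42e'
# bencode('spam')                  -> '4:spam'
# bencode(['spam', 42])            -> 'l4:spami42ee'
# bencode({'cow': 'moo', 'a': 1})  -> 'd1:ai1e3:cow3:mooe'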
Example #6
def load_data(input_file):
    open_file = open('%s.tsv' % input_file)
    open_file = csv.reader(open_file, delimiter="\t")
    next(open_file)  # skip the header row
    grasp = map(row_generator, open_file)
    grasp = filter(lambda row: row[58] != "", grasp)
    json_rows = map(_map_line_to_json, grasp)
    json_rows = (row for row in json_rows if row)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "grasp") for rg in row_groups)
Example #7
def unique_justseen(iterable, key=None):
    """
    List unique elements, preserving order. Remember only the element just seen.

    >>> ''.join(unique_justseen('AAAABBBCCDAABBB'))
    'ABCDAB'
    >>> ''.join(unique_justseen('ABBCcAD', str.lower))
    'ABCAD'
    """
    return map(next, map(itemgetter(1), groupby(iterable, key)))
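
For comparison, a sketch of an equivalent formulation that replaces the nested map calls with a generator expression; behaviour is assumed identical (on Python 2 the original additionally relies on map being itertools.imap):

from itertools import groupby

def unique_justseen_gen(iterable, key=None):
    # Each groupby group is itself an iterator; taking its first element
    # gives one representative per run of equal keys.
    return (next(group) for _, group in groupby(iterable, key))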
Example #8
    def remove_specs(self, *specs, **kwargs):
        assert all((s.concrete for s in specs))
        with_dependents = kwargs.get("with_dependents", True)
        with_dependencies = kwargs.get("with_dependencies", False)

        specs = set(specs)

        if with_dependencies:
            specs = get_dependencies(specs)

        if kwargs.get("exclude", None):
            specs = set(filter_exclude(specs, kwargs["exclude"]))

        all_specs = set(self.get_all_specs())

        to_deactivate = specs
        to_keep = all_specs - to_deactivate

        dependents = find_dependents(to_keep, to_deactivate)

        if with_dependents:
            # remove all packages depending on the ones to remove
            if len(dependents) > 0:
                tty.warn(self._croot +
                         "The following dependents will be removed: %s"
                         % ", ".join((s.name for s in dependents)))
                to_deactivate.update(dependents)
        elif len(dependents) > 0:
            tty.warn(self._croot +
                     "The following packages will be unusable: %s"
                     % ", ".join((s.name for s in dependents)))

        extensions = set(filter(lambda s: s.package.is_extension,
                         to_deactivate))
        standalones = to_deactivate - extensions

        # Please note that a traversal of the DAG in post-order and then
        # forcibly removing each package should remove the need to specify
        # with_dependents for deactivating extensions/allow removal without
        # additional checks (force=True). If removal performance becomes
        # unbearable for whatever reason, this should be the first point of
        # attack.
        #
        # see: https://github.com/spack/spack/pull/3227#discussion_r117147475
        remove_extension = ft.partial(self.remove_extension,
                                      with_dependents=with_dependents)

        set(map(remove_extension, extensions))
        set(map(self.remove_standalone, standalones))

        self.purge_empty_directories()
Example #9
def pgf_klinearize(args):
  grammar = pgf.readPGF(args.pgfgrammar)
  #if sys.version_info < (3, 0):
  #  args.inputstream = codecs.getreader('utf-8')(args.inputstream);
  inputSet = [(sentid, parsesBlock)
              for sentid, parsesBlock in readJohnsonRerankerTrees(args.inputstream)]
  outputPrinter = printMosesNbestFormat
  # Materialise the ids so they can be re-read on every iteration of the loop below.
  sentIdsList = list(map(itemgetter(0), inputSet))
  parsesBlocks = map(itemgetter(1), inputSet)

  for transBlock in getKLinearizations(grammar, args.tgtlang, parsesBlocks, args.K):
    strTrans = str(outputPrinter(transBlock, sentIdsList))
    if strTrans:
      print(strTrans, file=args.outputstream)
  return
Example #10
 def _gen_arch_segment(self, xmlgen, segment):
     """Generate a <segment> tag for the given ``segment``."""
     with xmlgen.element("segment", {
         "name": segment.name,
         "length": str(segment.length),
         "type": "unidir",
         "freq": str(segment.freq),
         "Rmetal": str(segment.Rmetal),
         "Cmetal": str(segment.Cmetal),
         }):
         xmlgen.element_leaf("sb", {"type": "pattern"}, " ".join(map(lambda x: "1" if x else "0",
             segment.sb or ((True, ) * (segment.length + 1)))))
         xmlgen.element_leaf("cb", {"type": "pattern"}, " ".join(map(lambda x: "1" if x else "0",
             segment.cb or ((True, ) * segment.length))))
         xmlgen.element_leaf("mux", {"name": segment.mux})
Example #11
def update_contacts(contacts):
    contacts = map(_transform_contact_data, contacts)

    # Filter contact data using whitelist
    if settings.EMARSYS_RECIPIENT_WHITELIST is not None:
        contacts = filter(lambda contact: contact[3]  # 3=email
                          in settings.EMARSYS_RECIPIENT_WHITELIST, contacts)

    contacts = list(contacts)

    assert len(contacts) <= BATCH_SIZE

    if not contacts:
        return 0, [], []

    num_successful, errors = _update_contacts(contacts)

    missing_contacts = [email
                        for email, error_dict in errors.items()
                        if '2008' in error_dict]
    failed_contacts = [(email, error_dict)
                       for email, error_dict in errors.items()
                       if '2008' not in error_dict]

    return num_successful, missing_contacts, failed_contacts
Example #12
def build_order_from_list(table, order_list):

    def get_column(key, direction):

        if direction is not None and direction not in ('desc', 'asc'):
            raise ValueError("Order direction must be 'desc' or 'asc'")

        if direction == 'desc':
            return getattr(table.columns, key).desc()

        else:
            return getattr(table.columns, key)

    def interpret_column(column):

        if isinstance(column, tuple):
            return get_column(column[1], column[0])

        if isinstance(column, str) or isinstance(column, unicode):
            return get_column(column, 'asc')

        else:
            raise ValueError('Cannot interpret order statement. Use a list of strings or tuples.')

    if isinstance(order_list, list):
        return list(map(interpret_column, order_list))

    else:
        return [interpret_column(order_list)]
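
A hypothetical usage sketch with a SQLAlchemy-style table; note that tuples are given as (direction, column_name), with the direction first:

# order_clauses = build_order_from_list(users, [('desc', 'created_at'), 'name'])
# query = select([users]).order_by(*order_clauses)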
Example #13
 def _load_stream_without_unbatching(self, stream):
     """
     Return an iterator of deserialized batches (iterable) of objects from the input stream.
     If the serializer does not operate on batches, the default implementation returns an
     iterator of single element lists.
     """
     return map(lambda x: [x], self.load_stream(stream))
Example #14
    def print_status(self, *specs, **kwargs):
        if kwargs.get("with_dependencies", False):
            specs = set(get_dependencies(specs))

        specs = sorted(specs, key=lambda s: s.name)
        in_view = list(map(self.get_spec, specs))

        for s, v in zip(specs, in_view):
            if not v:
                tty.error(self._croot +
                          'Package not linked: %s' % s.name)
            elif s != v:
                self.print_conflict(v, s, level="warn")

        in_view = list(filter(None, in_view))

        if len(specs) > 0:
            tty.msg("Packages linked in %s:" % self._croot[:-1])

            # avoid circular dependency
            import spack.cmd
            spack.cmd.display_specs(in_view, flags=True, variants=True,
                                    long=self.verbose)
        else:
            tty.warn(self._croot + "No packages found.")
Example #15
File: rt_2.py Project: Ardi3613/tools
def main():
	global search_for
	if len(sys.argv) > 1:
		search_for = sys.argv[1]
	search_for = re.compile(r'\b%s\b' % re.escape(search_for), re.I)
	auth = tweepy.OAuthHandler(api_key, api_secret)
	auth.set_access_token(access_token, access_token_secret)
	api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
#	friends = list(tweepy.Cursor(api.friends_ids).items())
	with open(follow_input_file) as f:
		friends = list(map(str.strip, f))
	random.shuffle(friends)
	total = 0
	for user in friends:
#		crs = tweepy.Cursor(api.user_timeline, user_id=user, count=200, trim_user="******", include_rts="false")
		crs = tweepy.Cursor(api.user_timeline, screen_name=user, count=200, trim_user="******", include_rts="false")
		try:
			tweets = [tweet for tweet in crs.items(limit_last) if search_for.search(tweet.text) and not tweet.retweeted]
			if tweets:
				tweet = random.choice(tweets)
				tweet.retweet()
#				print('User id %d: retweeted' % user)
				print('%s: %s' % (user, tweet.text.encode('ascii', 'replace')))
				total += 1
				if total >= limit_total:
					break
			else:
#				print('User id %d: nothing found' % user)
				print('%s: nothing found' % user)
		except tweepy.error.TweepError as e:
#			print('Error, user id %d: %s' % (user, e))
			print('Error for user %s: %s' % (user, e))
Example #16
    def get_queryset(self, query_params=None, *args, **kwargs):
        if query_params is None:
            query_params = self.request.QUERY_PARAMS

        location = parse_location(self.kwargs.get('location_slug', None))
        time_range = parse_time_range(
            query_params.get('start', None),
            query_params.get('end', None)
        )

        try:
            min_tide_level = float(self.request.QUERY_PARAMS['tide_level'])
        except KeyError:
            raise MissingParameterException(
                'Missing required query parameter `tide_level`')

        extended_time_range = TimeRange(
            start=time_range.start - ONE_DAY,
            end=time_range.end + ONE_DAY)

        predictions = get_queryset(location, extended_time_range).filter(
            tide_level__gte=min_tide_level)

        return filter(None, map(
            partial(transform_time_window, time_range, extended_time_range),
            make_tide_time_windows(predictions)))
Example #17
def parmap_dict(f,problems,leavefree=1,debug=False,verbose=False):
    global mypool
    problems = list(problems)
    njobs    = len(problems)

    if njobs==0:
        if verbose: print('NOTHING TO DO?')
        return []

    if not debug and (not 'mypool' in globals() or mypool is None):
        if verbose: print('NO POOL FOUND. RESTARTING.')
        mypool = Pool(cpu_count()-leavefree)

    enumerator = map(f,problems) if debug else mypool.imap(f,problems)
    results = {}
    sys.stdout.write('\n')
    for key,result in enumerator:
        if isinstance(result,tuple) and len(result)==1:
            result=result[0]
        results[key]=result
        if verbose and type(result) is RuntimeError:
            print('ERROR PROCESSING', key)

    sys.stdout.write('\r            \r')
    
    results = {key:results[key] for key in problems if key in results and not results[key] is None}
    return results
Example #18
def parmap(f,problems,leavefree=1,debug=False,verbose=False):
    global mypool
    problems = list(problems)
    njobs    = len(problems)

    if njobs==0:
        if verbose: print('NOTHING TO DO?')
        return []

    if not debug and (not 'mypool' in globals() or mypool is None):
        if verbose: print('NO POOL FOUND. RESTARTING.')
        mypool = Pool(cpu_count()-leavefree)

    enumerator = map(f,problems) if debug else mypool.imap(f,problems)
    results = {}
    sys.stdout.write('\n')
    for i,result in enumerator:
        sys.stdout.write('\rdone %0.1f%% '%((i+1)*100./njobs))
        sys.stdout.flush()
        if isinstance(result,tuple) and len(result)==1:
            result=result[0]
        results[i]=result
        if verbose and type(result) is RuntimeError:
            print('ERROR PROCESSING',problems[i])

    sys.stdout.write('\r            \r')
    return [results[i] if i in results else None \
        for i,k in enumerate(problems)]
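
Both parmap variants assume the worker returns a (key, result) pair; for parmap the key must be the problem's index into the input list. A hypothetical worker sketch:

def _square_job(job):
    # The worker receives an (index, value) pair and must echo the index back.
    i, x = job
    return i, x * x

# parmap(_square_job, list(enumerate(range(10))))
# -> [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]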
Example #19
def get_api_docs(routes):
    """
    Generates GitHub Markdown formatted API documentation using
    provided schemas in RequestHandler methods and their docstrings.

    :type  routes: [(url, RequestHandler), ...]
    :param routes: List of routes (this is ideally all possible routes of the
        app)
    :rtype: str
    :returns: generated GFM-formatted documentation
    """
    routes = map(_get_tuple_from_route, routes)

    documentation = []
    for url, rh in sorted(routes, key=lambda a: a[0]):
        if issubclass(rh, APIHandler):
            documentation.append(_get_route_doc(url, rh))

    documentation = (
        "**This documentation is automatically generated.**\n\n" +
        "**Output schemas only represent `data` and not the full output; " +
        "see output examples and the JSend specification.**\n" +
        "\n<br>\n<br>\n".join(documentation)
    )
    return documentation
Example #20
 def __call__(self, *args, **kwargs):
     argvalues = [arg.value if isinstance(arg, DiffObject) else arg for arg in args]
     kwargvalues = kwargs #TODO: for now can not diff wrt kwargs       
     
     #? should I check if all derivatives are provided?
     #? provide option for numerically computed derivative if not defined?
     f = self.fun(*argvalues, **kwargvalues)
     
     if not any([isinstance(arg, DiffObject) for arg in args]):
         return f
     if self.dfun:
         #compute df_args
         df = [self.dfun[i](*argvalues, **kwargvalues) \
              if isinstance(arg, DiffObject) else None \
              for i, arg in enumerate(args)]
     else:
         #if self.dfun is empty assume fun returns a tuple of nominal 
         #value and derivative list
         f, df = f
         
     #try to make DiffObject
     if type(f) in DiffObject._types:
         dlist = [arg.chain(dfi) for arg, dfi in zip(args, df) if isinstance(arg, DiffObject)]
         d = sum_dicts(*dlist)
         return DiffObject(f, d)
     elif isinstance(f, Iterable):
         dlist = [[arg.chain(dfij) for dfij in dfi] for arg, dfi in zip(args, df) if isinstance(arg, DiffObject)]
         d = [sum_dicts(*d) for d in zip(*dlist)]
         return type(f)(map(DiffObject, f, d))
         
     raise TypeError('DiffFunction output not implemented as a DiffObject')
Example #21
 def get_all_specs(self):
     dotspack = os.path.join(self.root,
                             spack.store.layout.metadata_dir)
     if os.path.exists(dotspack):
         return list(filter(None, map(self.get_spec, os.listdir(dotspack))))
     else:
         return []
Example #22
File: hosts.py Project: dhaffner/hosts
def _sift_structure(func, structure, transform):
    prev_section, relative_path = None, None
    for section, hosts in structure:
        if not (section and hosts):
            continue

        if prev_section:
            common_path = tuple(_common_path(section, prev_section))
            relative_path = section[len(common_path):]
            if len(common_path) == 0:
                for s in prev_section[:-1]:
                    print(_close(s))
        else:
            relative_path = section

        for s in relative_path:
            print(_open(s))

        if func(section):
            hosts = map(transform, hosts)

        for h in hosts:
            print(h)

        print(_close(section[-1]))
        prev_section = section

    if prev_section:
        for s in reversed(prev_section[1:]):
            print(_close(s))
Example #23
 def internal_reader(self, input_stream):
     """
     Reader which uses python eval on each part of a tab separated string.
     Yields a tuple of python objects.
     """
     for input_line in input_stream:
         yield list(map(self.deserialize, input_line.split("\t")))
Example #24
    def __init__(self, lens_list):
        """Initialise the C array from lens_list.

        The parameter might be another Lenses instance, a NumPy array
        or a Python sequence of sequences.

        If lens_list is a NumPy array, it must be C contiguous.  Keep
        a reference to the data to make sure it is not garbage
        collected.

        If lens_list is a Python sequence, each element shall be a
        sequence of three floats, containing the coordinates and mass
        of the respective lens.  For example

        Lenses([(0., 0., 1.), (1.2, 0., .0004)])

        will create an array of the two given lenses.
        """
        if isinstance(lens_list, Lenses):
            _c.Structure.__init__(self, lens_list.num_lenses, lens_list.lens)
        elif isinstance(lens_list, _np.ndarray):
            _c.Structure.__init__(self, len(lens_list),
                                  lens_list.ctypes.data_as(_c.POINTER(Lens)))
        else:
            lens_list = list(map(tuple, lens_list))
            n = len(lens_list)
            _c.Structure.__init__(self, n, (Lens*n)(*lens_list))
Example #25
    def _createFromLocal(self, data, schema):
        """
        Create an RDD for DataFrame from a list or pandas.DataFrame, returns
        the RDD and schema.
        """
        # make sure data could consumed multiple times
        if not isinstance(data, list):
            data = list(data)

        if schema is None or isinstance(schema, (list, tuple)):
            struct = self._inferSchemaFromList(data, names=schema)
            converter = _create_converter(struct)
            data = map(converter, data)
            if isinstance(schema, (list, tuple)):
                for i, name in enumerate(schema):
                    struct.fields[i].name = name
                    struct.names[i] = name
            schema = struct

        elif not isinstance(schema, StructType):
            raise TypeError("schema should be StructType or list or None, but got: %s" % schema)

        # convert python objects to sql data
        data = [schema.toInternal(row) for row in data]
        return self._sc.parallelize(data), schema
Example #26
File: views.py Project: fk-lx/mygpo
def podcast_lists(request, page_size=20):

    # Make sure page request is an int. If not, deliver first page.
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        page = 1

    lists = podcastlists_by_rating(skip=(page-1) * page_size, limit=page_size)


    def _prepare_list(l):
        user = get_user_by_id(l.user)
        l = proxy_object(l)
        l.username = user.username if user else ''
        return l

    lists = map(_prepare_list, lists)

    num_pages = int(ceil(podcastlist_count() / float(page_size)))

    page_list = get_page_list(1, num_pages, page, 15)

    return render(request, 'podcast_lists.html', {
        'lists': lists,
        'page_list': page_list,
        })
Example #27
def _generate_plaintext_signature(client_shared_secret,
                                  token_shared_secret=None,
                                  _percent_encode=True):
    """
    Calculates the PLAINTEXT signature.

    :param client_shared_secret:
        Client (consumer) shared secret.
    :param token_shared_secret:
        Token/temporary credentials shared secret if available.
    :param _percent_encode:
        (DEBUG)

        Must be ``True`` to be compatible with OAuth 1.0 RFC5849 & OAuth 1.0a;
        We have added this parameter to enable better debugging by the
        signature verification routines. If this is set to ``False``, the
        signature elements will not be percent-encoded before the plaintext
        signature is generated.
    :returns:
        PLAINTEXT signature.
    """
    client_shared_secret = client_shared_secret or SYMBOL_EMPTY_BYTES
    token_shared_secret = token_shared_secret or SYMBOL_EMPTY_BYTES
    if _percent_encode:
        return SYMBOL_AMPERSAND.join(map(percent_encode,
                            (client_shared_secret, token_shared_secret)))
    else:
        # User clients can forget to do this and this has been fixed
        # by OAuth 1.0a, so we use this piece of code to detect whether
        # the user's OAuth client library complies with the specification
        # when in debugging mode.
        return SYMBOL_AMPERSAND.join((client_shared_secret,
                                      token_shared_secret))
Example #28
File: views.py Project: fk-lx/mygpo
def episode_toplist(request, num=100):
    lang = process_lang_params(request)

    toplist = EpisodeToplist(language=lang)
    entries = list(map(proxy_object, toplist[:num]))

    # load podcast objects
    podcast_ids = [e.podcast for e in entries]
    podcasts = podcasts_to_dict(podcast_ids, True)
    for entry in entries:
        entry.podcast = podcasts.get(entry.podcast, None)

    current_site = RequestSite(request)

    # Determine maximum listener amount (or 0 if no entries exist)
    max_listeners = max([0]+[e.listeners for e in entries])

    languages = get_podcast_languages()
    all_langs = get_language_names(languages)

    return render(request, 'episode_toplist.html', {
        'entries': entries,
        'max_listeners': max_listeners,
        'url': current_site,
        'language': lang,
        'all_languages': all_langs,
    })
Example #29
File: iters.py Project: mgill25/fn.py
def roundrobin(*iterables):
    """roundrobin('ABC', 'D', 'EF') --> A D E B F C
    Recipe originally credited to George Sakkis.
    Reimplemented to work both in Python 2+ and 3+. 

    http://docs.python.org/3.4/library/itertools.html#itertools-recipes
    """
    pending = len(iterables)
    next_attr = "next" if version_info[0] == 2 else "__next__"
    nexts = cycle(map(attrgetter(next_attr), map(iter, iterables)))
    while pending:
        try:
            for n in nexts:
                yield n()
        except StopIteration:
            pending -= 1
            nexts = cycle(islice(nexts, pending))
Example #30
File: hosts.py Project: dhaffner/hosts
def disable(filename, sections=None):
    '''Disable the given paths in the specified hosts file.'''

    structure = _read_structure_lazy(filename, include_hosts=True)

    sections = list(map(_split, sections))
    sift = lambda h: _path_in_list(h, sections)
    _sift_structure(sift, structure, _add_comment)
Example #31
def node_list():
    """Display a list of server nodes."""
    click.echo(
        tabulate(map(lambda s: s.as_dict(), list_nodes()), headers="keys"))
Example #32
def _dashboard_from_dict(cls, data):
    params = {x: data.get(x, None) for x in cls._fields}
    # Parse the blocks if they're provided and generate block instances.
    params['blocks'] = tuple(map(Block.from_dict, data.get('blocks', [])))
    return cls(**params)
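
A minimal sketch of how this factory might be wired up, assuming cls is a namedtuple-like class exposing _fields; Block and Dashboard below are illustrative stand-ins, not the project's real classes:

from collections import namedtuple

Block = namedtuple('Block', ['title', 'query'])
Block.from_dict = classmethod(lambda cls, d: cls(d.get('title'), d.get('query')))

Dashboard = namedtuple('Dashboard', ['name', 'owner', 'blocks'])
Dashboard.from_dict = classmethod(_dashboard_from_dict)

dash = Dashboard.from_dict(
    {'name': 'ops', 'blocks': [{'title': 'cpu', 'query': 'avg(cpu)'}]})
# dash.owner is None; dash.blocks is a tuple of Block instances.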
Example #33
    from itertools import imap as map, ifilter as filter

from os import path, listdir
from functools import partial
from ast import parse
from distutils.sysconfig import get_python_lib

if __name__ == "__main__":
    package_name = "offregister_python_venv"

    with open(path.join(package_name, "__init__.py")) as f:
        __author__, __version__ = map(
            lambda buf: next(map(lambda e: e.value.s,
                                 parse(buf).body)),
            filter(
                lambda line: line.startswith("__version__") or line.startswith(
                    "__author__"),
                f,
            ),
        )

    to_funcs = lambda *paths: (
        partial(path.join, path.dirname(__file__), package_name, *paths),
        partial(path.join, get_python_lib(prefix=""), package_name, *paths),
    )
    _data_join, _data_install_dir = to_funcs("_data")

    setup(
        name=package_name,
        author=__author__,
        version=__version__,
Example #34
 def filtered_jsonfinder(string, json_only=False):
     predicate = lambda start, end, json: (all(
         map(string[start:end].__contains__, filters)
     ) and check_min_elements(json, options.min_size))
     return jsonfinder(string, json_only=json_only, predicate=predicate)
Example #35
def get_dependencies(specs):
    "Get set of dependencies (includes specs)"
    retval = set()
    set(map(retval.update, (set(s.traverse()) for s in specs)))
    return retval
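
The set(map(...)) wrapper here only forces the lazy map for its side effect on retval; a sketch of an equivalent, more explicit formulation:

def get_dependencies_explicit(specs):
    # Same result: every given spec plus everything reachable from it.
    retval = set()
    for s in specs:
        retval.update(s.traverse())
    return retval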
Example #36
 def func(iterator):
     return map(f, iterator)
Example #37
    def _updateIndex(self):
        """Update the item list"""

        text_getter = itemgetter(0)
        path_getter = itemgetter(1)

        def _replace_htmltags(s):
            def opentag(m):
                return ''.join(('<span class="', m.group(1), '">'))

            s = MATCH_CLOSE_TAG.sub('</span>', s)
            s = MATCH_OPEN_TAG.sub(opentag, s)
            return ''.join(('<body>', s, '</body>'))

        lw = self._ui.listWidgetIndex

        incr_res = self._incr_results
        full_res = self._fts_results

        query = self._ui.lineEditSearch.text().strip()
        if incr_res is not None and full_res is not None\
                and len(incr_res) == 0 and len(full_res) == 0\
                and len(query.split()) == 1:
            self._timerSpellCorrection.start(200)

        # Escape the previous selection
        row_prev = lw.currentRow()
        selected_prev = None
        if row_prev != -1:
            selected_prev = self._found_items[row_prev]

        # Update Index
        if incr_res and full_res:
            closed = set(map(path_getter, incr_res))
            self._found_items = incr_res + tuple(
                item for item in full_res if path_getter(item) not in closed)
        elif incr_res:
            self._found_items = tuple(incr_res)
        elif full_res:
            self._found_items = tuple(full_res)
        else:
            self._found_items = tuple()

        del incr_res
        del full_res

        # Create a new list
        items = tuple(
            _replace_htmltags(text_getter(item)) for item in self._found_items)
        lw.clear()
        lw.addItems(items)

        # Restore the previous selection
        if selected_prev:
            comparer = itemgetter(2, 3, 1)  # (sortkey, prio, path)
            current = comparer(selected_prev)
            for row in range(len(self._found_items)):
                if comparer(self._found_items[row]) == current:
                    lw.setCurrentRow(row)
                    break

        url = self._ui.webView.url().toString()
        sel_row = -1
        for (row, path) in enumerate(map(path_getter, self._found_items)):
            if 'dict:' + path == url:
                sel_row = row
                break

        if sel_row >= 0:
            lw.setCurrentRow(sel_row)
            lw.scrollToItem(lw.item(sel_row), QAbstractItemView.EnsureVisible)
        else:
            lw.scrollToTop()

        if self._selection_pending:
            self._selection_pending = False
            self.selectItemRelative()

        if self._loading_pending:
            self._loading_pending = False
            self._loadItem()
Example #38
def process_special_casing(special_casing, table, index):
    # Unconditional special casing.
    unconditional_tolower = {}
    unconditional_toupper = {}

    # Conditional special casing, language independent.
    conditional_tolower = {}
    conditional_toupper = {}

    # Conditional special casing, language dependent.
    lang_conditional_tolower = {}
    lang_conditional_toupper = {}

    def caseInfo(code):
        (upper, lower, flags) = table[index[code]]
        return ((code + lower) & 0xffff, (code + upper) & 0xffff)

    for (code, lower, upper, languages,
         contexts) in read_special_casing(special_casing):
        assert code <= MAX_BMP, 'Unexpected character outside of BMP: %s' % code
        assert len(languages) <= 1, 'Expected zero or one language ids: %s' % languages
        assert len(contexts) <= 1, 'Expected zero or one casing contexts: %s' % contexts

        (default_lower, default_upper) = caseInfo(code)
        special_lower = len(lower) != 1 or lower[0] != default_lower
        special_upper = len(upper) != 1 or upper[0] != default_upper

        # Invariant: If |code| has casing per UnicodeData.txt, then it also has
        # casing rules in SpecialCasing.txt.
        assert code == default_lower or len(lower) != 1 or code != lower[0]
        assert code == default_upper or len(upper) != 1 or code != upper[0]

        language = languages[0] if languages else None
        context = contexts[0] if contexts else None

        if not language and not context:
            if special_lower:
                unconditional_tolower[code] = lower
            if special_upper:
                unconditional_toupper[code] = upper
        elif not language and context:
            if special_lower:
                conditional_tolower[code] = (lower, context)
            if special_upper:
                conditional_toupper[code] = (upper, context)
        else:
            if language not in lang_conditional_tolower:
                lang_conditional_tolower[language] = {}
                lang_conditional_toupper[language] = {}
            if special_lower:
                lang_conditional_tolower[language][code] = (lower, context)
            if special_upper:
                lang_conditional_toupper[language][code] = (upper, context)

    # Certain special casing rules are inlined in jsstr.cpp, ensure these cases
    # still match the current SpecialCasing.txt file.
    def lowerCase(code):
        (lower, _) = caseInfo(code)
        return lower

    def upperCase(code):
        (_, upper) = caseInfo(code)
        return upper

    def ascii(char_dict):
        return (ch for ch in char_dict.keys() if ch <= 0x7f)

    def latin1(char_dict):
        return (ch for ch in char_dict.keys() if ch <= 0xff)

    def is_empty(iterable):
        return not any(True for _ in iterable)

    def is_equals(iter1, iter2):
        return all(x == y for (x, y) in zip_longest(iter1, iter2))

    # Ensure no ASCII characters have special case mappings.
    assert is_empty(ascii(unconditional_tolower))
    assert is_empty(ascii(unconditional_toupper))
    assert is_empty(ascii(conditional_tolower))
    assert is_empty(ascii(conditional_toupper))

    # Ensure no Latin1 characters have special lower case mappings.
    assert is_empty(latin1(unconditional_tolower))
    assert is_empty(latin1(conditional_tolower))

    # Ensure no Latin1 characters have conditional special upper case mappings.
    assert is_empty(latin1(conditional_toupper))

    # Ensure U+00DF is the only Latin1 character with a special upper case mapping.
    assert is_equals([0x00DF], latin1(unconditional_toupper))

    # Ensure U+0130 is the only character with a special lower case mapping.
    assert is_equals([0x0130], unconditional_tolower)

    # Ensure no characters have language independent conditional upper case mappings.
    assert is_empty(conditional_toupper)

    # Ensure U+03A3 is the only character with language independent conditional lower case mapping.
    assert is_equals([0x03A3], conditional_tolower)

    # Verify U+0130 and U+03A3 have simple lower case mappings.
    assert all(ch != lowerCase(ch) for ch in [0x0130, 0x03A3])

    # Ensure Azeri, Lithuanian, and Turkish are the only languages with conditional case mappings.
    assert is_equals(["az", "lt", "tr"],
                     sorted(lang_conditional_tolower.keys()))
    assert is_equals(["az", "lt", "tr"],
                     sorted(lang_conditional_toupper.keys()))

    # Maximum case mapping length is three characters.
    assert max(
        map(
            len,
            chain(
                unconditional_tolower.values(),
                unconditional_toupper.values(),
                map(itemgetter(0), conditional_tolower.values()),
                map(itemgetter(0), conditional_toupper.values()),
                map(
                    itemgetter(0),
                    chain.from_iterable(
                        d.values()
                        for d in lang_conditional_tolower.values())),
                map(
                    itemgetter(0),
                    chain.from_iterable(
                        d.values()
                        for d in lang_conditional_toupper.values())),
            ))) <= 3

    # Ensure all case mapping contexts are known (see Unicode 9.0, §3.13 Default Case Algorithms).
    assert set([
        'After_I',
        'After_Soft_Dotted',
        'Final_Sigma',
        'More_Above',
        'Not_Before_Dot',
    ]).issuperset(
        set(
            filter(
                partial(is_not, None),
                chain(
                    map(itemgetter(1), conditional_tolower.values()),
                    map(itemgetter(1), conditional_toupper.values()),
                    map(
                        itemgetter(1),
                        chain.from_iterable(
                            d.values()
                            for d in lang_conditional_tolower.values())),
                    map(
                        itemgetter(1),
                        chain.from_iterable(
                            d.values()
                            for d in lang_conditional_toupper.values())),
                ))))

    # Special casing for U+00DF (LATIN SMALL LETTER SHARP S).
    assert upperCase(0x00DF) == 0x00DF and unconditional_toupper[0x00DF] == [
        0x0053, 0x0053
    ]

    # Special casing for U+0130 (LATIN CAPITAL LETTER I WITH DOT ABOVE).
    assert unconditional_tolower[0x0130] == [0x0069, 0x0307]

    # Special casing for U+03A3 (GREEK CAPITAL LETTER SIGMA).
    assert lowerCase(0x03A3) == 0x03C3 and conditional_tolower[0x03A3] == ([
        0x03C2
    ], 'Final_Sigma')

    return (unconditional_tolower, unconditional_toupper)
Example #39
 def applySchema(it):
     cls = _create_cls(schema)
     return map(cls, it)
Example #40
    def _decode_map(self, smap):
        sources = smap['sources']
        sourceRoot = smap.get('sourceRoot')
        names = list(map(text_type, smap['names']))
        mappings = smap['mappings']
        lines = mappings.split(';')

        if sourceRoot is not None:
            sources = list(map(partial(os.path.join, sourceRoot), sources))

        # List of all tokens
        tokens = []

        # line_index is used to identify the closest column when looking up a token
        line_index = []

        # Main index of all tokens
        # The index is keyed on (line, column)
        index = {}

        dst_col, src_id, src_line, src_col, name_id = 0, 0, 0, 0, 0
        for dst_line, line in enumerate(lines):
            # Create list for columns in index
            line_index.append([])

            segments = line.split(',')
            dst_col = 0
            for segment in segments:
                if not segment:
                    continue
                parse = self.parse_vlq(segment)
                dst_col += parse[0]

                src = None
                name = None
                if len(parse) > 1:
                    try:
                        src_id += parse[1]
                        if not 0 <= src_id < len(sources):
                            raise SourceMapDecodeError(
                                "Segment %s references source %d; there are "
                                "%d sources" % (segment, src_id, len(sources))
                            )

                        src = sources[src_id]
                        src_line += parse[2]
                        src_col += parse[3]

                        if len(parse) > 4:
                            name_id += parse[4]
                            if not 0 <= name_id < len(names):
                                raise SourceMapDecodeError(
                                    "Segment %s references name %d; there are "
                                    "%d names" % (segment, name_id, len(names))
                                )

                            name = names[name_id]
                    except IndexError:
                        raise SourceMapDecodeError(
                            "Invalid segment %s, parsed as %r"
                            % (segment, parse)
                        )

                try:
                    assert dst_line >= 0, ('dst_line', dst_line)
                    assert dst_col >= 0, ('dst_col', dst_col)
                    assert src_line >= 0, ('src_line', src_line)
                    assert src_col >= 0, ('src_col', src_col)
                except AssertionError as e:
                    # the assert above attached a ('field', value) tuple to the error
                    field, value = e.args[0]
                    raise SourceMapDecodeError(
                        "Segment %s has negative %s (%d), in file %s"
                        % (segment, field, value, src)
                    )

                token = Token(dst_line, dst_col, src, src_line, src_col, name)
                tokens.append(token)

                # Insert into main index
                index[(dst_line, dst_col)] = token

                # Insert into specific line index
                line_index[dst_line].append(dst_col)

        return SourceMapIndex(smap, tokens, line_index, index, sources)
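
The decoder above delegates to self.parse_vlq; a standalone sketch of base64 VLQ decoding as used by source maps (the method name is taken from the snippet, the implementation here is illustrative):

_B64_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'
_B64_INDEX = {c: i for i, c in enumerate(_B64_CHARS)}

def parse_vlq(segment):
    # Decode one comma-separated source-map segment into a list of integers.
    values = []
    cur, shift = 0, 0
    for char in segment:
        digit = _B64_INDEX[char]
        cont = digit & 0x20           # bit 5 marks a continuation digit
        cur += (digit & 0x1F) << shift
        shift += 5
        if not cont:
            sign = -1 if (cur & 1) else 1
            values.append(sign * (cur >> 1))
            cur, shift = 0, 0
    return values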
Example #41
def sections(filename):
    '''Read and print paths from the given hosts file.'''

    structure = _read_structure_lazy(filename, include_hosts=False)
    for path in map(_join, structure):
        print(path)
Example #42
class AutoSummDirective(AutodocDirective, Autosummary):
    """automodule directive that makes a summary at the beginning of the module

    This directive combines the
    :class:`sphinx.ext.autodoc.directives.AutodocDirective` and
    :class:`sphinx.ext.autosummary.Autosummary` directives to put a summary of
    the specified module at the beginning of the module documentation."""

    if sphinx_version < [1, 7]:
        _default_flags = AutodocDirective._default_flags.union(
            {'autosummary'} | set(map('autosummary-{}'.format, member_options))
            )
    else:
        AUTODOC_DEFAULT_OPTIONS.append('autosummary')
        AUTODOC_DEFAULT_OPTIONS.extend(
            map('autosummary-{}'.format, member_options))

    @property
    def autosummary_documenter(self):
        """Returns the AutosummaryDocumenter subclass that can be used"""
        try:
            return self._autosummary_documenter
        except AttributeError:
            pass
        objtype = self.name[4:]
        env = self.state.document.settings.env
        if sphinx_version < [1, 7]:
            doc_class = self._registry[objtype]
            params = self
        else:
            reporter = self.state.document.reporter
            try:
                lineno = reporter.get_source_and_line(self.lineno)[1]
            except AttributeError:
                lineno = None
            doc_class = get_documenters(self.env.app)[objtype]
            args = (self.state, ) if sphinx_version >= [2, 1] else ()
            params = DocumenterBridge(
                env, reporter,
                process_documenter_options(doc_class, env.config,
                                           self.options),
                lineno, *args)
        documenter = doc_class(params, self.arguments[0])
        if hasattr(documenter, 'get_grouped_documenters'):
            self._autosummary_documenter = documenter
            return documenter
        # in case the documenter has been changed in the registry, we decide manually
        if objtype == 'module':
            documenter = AutoSummModuleDocumenter(params, self.arguments[0])
        elif objtype == 'class':
            documenter = AutoSummClassDocumenter(params, self.arguments[0])
        else:
            raise ValueError(
                "Could not find a valid documenter for the object type %s" % (
                    objtype))
        self._autosummary_documenter = documenter
        return documenter

    def run(self):
        """Run method for the directive"""
        options_save = self.options.copy()
        doc_nodes = AutodocDirective.run(self)
        self.options.update(options_save)
        if 'autosummary' not in self.options:
            return doc_nodes
        try:
            self.env = self.state.document.settings.env
        except AttributeError:
            pass  # is set automatically with sphinx >= 1.8.0
        if sphinx_version < [2, 0]:
            self.warnings = []
            self.result = ViewList()
        documenter = self.autosummary_documenter
        grouped_documenters = documenter.get_grouped_documenters()
        summ_nodes = self.autosumm_nodes(documenter, grouped_documenters)

        dn = summ_nodes.pop(documenter.fullname)
        if self.name == 'automodule':
            doc_nodes = self.inject_summ_nodes(doc_nodes, summ_nodes)
        # insert the nodes directly after the paragraphs
        if self.name == 'autoclass':
            for node in dn[::-1]:
                self._insert_after_paragraphs(doc_nodes[1], node)
            dn = []
        elif self.name == 'automodule':
            # insert table before the documentation of the members
            istart = 2 if 'noindex' not in self.options else 0
            # if we have a title in the module, we look for the section
            if (len(doc_nodes) >= istart + 1 and
                    isinstance(doc_nodes[istart], nodes.section)):
                others = doc_nodes[istart]
                istart = 2  # skip the title
            else:
                others = doc_nodes
            found = False
            if len(others[istart:]) >= 2:
                for i in range(istart, len(others)):
                    if isinstance(others[i], sphinx.addnodes.index):
                        found = True
                        break
            if found:
                for node in dn[::-1]:
                    others.insert(i, node)
                dn = []
        return self.warnings + dn + doc_nodes

    def _insert_after_paragraphs(self, node, insertion):
        """Inserts the given `insertion` node after the paragraphs in `node`

        This method inserts the `insertion` node after the instances of
        nodes.paragraph in the given `node`.
        Usually the node of one documented class is set up like

        Name of the documented item (always) (nodes.Element)
        Summary (sometimes) (nodes.paragraph)
        description (sometimes) (nodes.paragraph)
        Parameters section (sometimes) (nodes.rubric)

        We want to be below the description, so we loop until we
        are below all the paragraphs. If that does not work,
        we simply put it at the end"""
        found = False
        if len(node) >= 2:
            for i in range(len(node[1])):
                if not isinstance(node[1][i], nodes.paragraph):
                    node[1].insert(i + 1, insertion)
                    found = True
                    break
        if not found:
            node.insert(1, insertion)

    def inject_summ_nodes(self, doc_nodes, summ_nodes):
        """Method to inject the autosummary nodes into the autodoc nodes

        Parameters
        ----------
        doc_nodes: list
            The list of nodes as they are generated by the
            :meth:`sphinx.ext.autodoc.AutodocDirective.run` method
        summ_nodes: dict
            The generated autosummary nodes as they are generated by the
            :meth:`autosumm_nodes` method. Note that `summ_nodes` must only
            contain the members autosummary tables!

        Returns
        -------
        doc_nodes: list
            The modified `doc_nodes`

        Notes
        -----
        `doc_nodes` are modified in place and not copied!"""
        def inject_summary(node):
            if isinstance(node, nodes.section):
                for sub in node:
                    inject_summary(sub)
                return
            if (len(node) and (isinstance(node, nodes.section) or (
                    isinstance(node[0], nodes.Element) and
                    node[0].get('module') and node[0].get('fullname')))):
                node_summ_nodes = summ_nodes.get("%s.%s" % (
                    node[0]['module'], node[0]['fullname']))
                if not node_summ_nodes:
                    return
                for summ_node in node_summ_nodes[::-1]:
                    self._insert_after_paragraphs(node, summ_node)
        for node in doc_nodes:
            inject_summary(node)
        return doc_nodes

    def autosumm_nodes(self, documenter, grouped_documenters):
        """Create the autosummary nodes based on the documenter content

        Parameters
        ----------
        documenter: sphinx.ext.autodoc.Documenter
            The base (module or class) documenter for which to generate the
            autosummary tables of its members
        grouped_documenters: dict
            The dictionary as it is returned from the
            :meth:`AutosummaryDocumenter.get_grouped_documenters` method

        Returns
        -------
        dict
            a mapping from the objects fullname to the corresponding
            autosummary tables of its members. The objects include the main
            object of the given `documenter` and the classes that are defined
            in it

        See Also
        --------
        AutosummaryDocumenter.get_grouped_documenters, inject_summ_nodes"""

        summ_nodes = {}
        this_nodes = []
        for section, documenters in six.iteritems(grouped_documenters):
            items = self.get_items_from_documenters(documenters)
            if not items:
                continue
            node = nodes.rubric()
            # create note for the section title (we could also use .. rubric
            # but that causes problems for latex documentations)
            self.state.nested_parse(
                ViewList(['**%s**' % section]), 0, node)
            this_nodes += node
            this_nodes += self.get_table(items)
            for mdocumenter, check_module in documenters:
                if (mdocumenter.objtype == 'class' and
                        not (check_module and not mdocumenter.check_module())):
                    if hasattr(mdocumenter, 'get_grouped_documenters'):
                        summ_nodes.update(self.autosumm_nodes(
                            mdocumenter, mdocumenter.get_grouped_documenters())
                            )
        summ_nodes[documenter.fullname] = this_nodes
        return summ_nodes

    def get_items_from_documenters(self, documenters):
        """Return the items needed for creating the tables

        This method creates the items that are used by the
        :meth:`sphinx.ext.autosummary.Autosummary.get_table` method by what is
        taken from the values of the
        :meth:`AutoSummModuleDocumenter.get_grouped_documenters` method.

        Returns
        -------
        list
            A list containing tuples like
            ``(name, signature, summary_string, real_name)`` that can be used
            for the :meth:`sphinx.ext.autosummary.Autosummary.get_table`
            method."""

        items = []

        max_item_chars = 50
        base_documenter = self.autosummary_documenter
        try:
            base_documenter.analyzer = ModuleAnalyzer.for_module(
                    base_documenter.real_modname)
            attr_docs = base_documenter.analyzer.find_attr_docs()
        except PycodeError as err:
            logger.debug('[autodocsumm] module analyzer failed: %s', err)
            # no source file -- e.g. for builtin and C modules
            base_documenter.analyzer = None
            attr_docs = {}
            # at least add the module.__file__ as a dependency
            if (hasattr(base_documenter.module, '__file__') and
                    base_documenter.module.__file__):
                base_documenter.directive.filename_set.add(
                    base_documenter.module.__file__)
        else:
            base_documenter.directive.filename_set.add(
                base_documenter.analyzer.srcname)

        for documenter, check_module in documenters:
            documenter.parse_name()
            documenter.import_object()
            documenter.real_modname = documenter.get_real_modname()
            real_name = documenter.fullname
            display_name = documenter.object_name
            if display_name is None:  # for instance attributes
                display_name = documenter.objpath[-1]
            if check_module and not documenter.check_module():
                continue

            # -- Grab the signature

            sig = documenter.format_signature()
            if not sig:
                sig = ''
            else:
                max_chars = max(10, max_item_chars - len(display_name))
                sig = mangle_signature(sig, max_chars=max_chars)
#                sig = sig.replace('*', r'\*')

            # -- Grab the documentation

            no_docstring = False
            if documenter.objpath:
                key = ('.'.join(documenter.objpath[:-1]),
                       documenter.objpath[-1])
                try:
                    doc = attr_docs[key]
                    no_docstring = True
                except KeyError:
                    pass
            if not no_docstring:
                documenter.add_content(None)
                doc = documenter.get_doc()
                if doc:
                    doc = doc[0]
                else:
                    continue

            while doc and not doc[0].strip():
                doc.pop(0)

            # If there's a blank line, then we can assume the first sentence /
            # paragraph has ended, so anything after shouldn't be part of the
            # summary
            for i, piece in enumerate(doc):
                if not piece.strip():
                    doc = doc[:i]
                    break

            # Try to find the "first sentence", which may span multiple lines
            m = re.search(r"^([A-Z].*?\.)(?:\s|$)", " ".join(doc).strip())
            if m:
                summary = m.group(1).strip()
            elif doc:
                summary = doc[0].strip()
            else:
                summary = ''

            items.append((display_name, sig, summary, real_name))
        return items
Example #43
import sphinx.ext.autodoc as ad
from sphinx.ext.autosummary import Autosummary, mangle_signature
from docutils import nodes
from docutils.statemachine import ViewList

if sphinx.__version__ >= '1.7':
    from sphinx.ext.autodoc import Signature, get_documenters
    from sphinx.ext.autodoc.directive import (
        AutodocDirective, AUTODOC_DEFAULT_OPTIONS, DocumenterBridge,
        process_documenter_options)
else:
    from sphinx.ext.autodoc import (
        getargspec, formatargspec, AutoDirective as AutodocDirective,
        AutoDirective as AutodocRegistry)

sphinx_version = list(map(float, re.findall(r'\d+', sphinx.__version__)[:3]))

if sphinx_version >= [2, 0]:
    from sphinx.util import force_decode
else:
    from sphinx.ext.autodoc import force_decode


try:
    from cyordereddict import OrderedDict
except ImportError:
    try:
        from collections import OrderedDict
    except ImportError:
        from ordereddict import OrderedDict
Example #44
                meta[ParentId].append((Id, IsAccepted, TimeToAnswer, Score))

            else:
                continue

            Text, NumTextTokens, NumCodeLines, LinkCount, NumImages = filter_html(
                elem.get('Body'))

            values = (Id, ParentId, IsAccepted, TimeToAnswer, Score,
                      Text.encode("utf-8"), NumTextTokens, NumCodeLines,
                      LinkCount, NumImages)

            yield values

            root.clear()  # preserve memory
        if counter >= 1000000:
            break


with open(filename_filtered, "w") as f:
    for values in parsexml(filename):
        line = "\t".join(map(str, values))
        f.write(line + "\n")

with open(filename_filtered_meta, "w") as f:
    json.dump(meta, f)

print("years:", years)
print("#qestions: %i" % num_questions)
print("#answers: %i" % num_answers)
Example #45
def read_udfs(pickleSer, infile, eval_type):
    runner_conf = {}

    if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                     PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
                     PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
                     PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                     PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF):

        # Load conf used for pandas_udf evaluation
        num_conf = read_int(infile)
        for i in range(num_conf):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            runner_conf[k] = v

        # NOTE: if timezone is set here, that implies respectSessionTimeZone is True
        timezone = runner_conf.get("spark.sql.session.timeZone", None)
        safecheck = runner_conf.get(
            "spark.sql.execution.pandas.arrowSafeTypeConversion",
            "false").lower() == 'true'
        # Used by SQL_GROUPED_MAP_PANDAS_UDF and SQL_SCALAR_PANDAS_UDF when returning StructType
        assign_cols_by_name = runner_conf.get(
            "spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName", "true")\
            .lower() == "true"

        # Scalar Pandas UDF handles struct type arguments as pandas DataFrames instead of
        # pandas Series. See SPARK-27240.
        df_for_struct = (
            eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF
            or eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF
            or eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF)
        ser = ArrowStreamPandasUDFSerializer(timezone, safecheck,
                                             assign_cols_by_name,
                                             df_for_struct)
    else:
        ser = BatchedSerializer(PickleSerializer(), 100)

    num_udfs = read_int(infile)

    is_scalar_iter = eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF
    is_map_iter = eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF

    if is_scalar_iter or is_map_iter:
        if is_scalar_iter:
            assert num_udfs == 1, "One SCALAR_ITER UDF expected here."
        if is_map_iter:
            assert num_udfs == 1, "One MAP_ITER UDF expected here."

        arg_offsets, udf = read_single_udf(pickleSer,
                                           infile,
                                           eval_type,
                                           runner_conf,
                                           udf_index=0)

        def func(_, iterator):
            num_input_rows = [0]

            def map_batch(batch):
                udf_args = [batch[offset] for offset in arg_offsets]
                num_input_rows[0] += len(udf_args[0])
                if len(udf_args) == 1:
                    return udf_args[0]
                else:
                    return tuple(udf_args)

            iterator = map(map_batch, iterator)
            result_iter = udf(iterator)

            num_output_rows = 0
            for result_batch, result_type in result_iter:
                num_output_rows += len(result_batch)
                assert is_map_iter or num_output_rows <= num_input_rows[0], \
                    "Pandas SCALAR_ITER UDF outputted more rows than input rows."
                yield (result_batch, result_type)

            if is_scalar_iter:
                try:
                    next(iterator)
                except StopIteration:
                    pass
                else:
                    raise RuntimeError(
                        "SQL_SCALAR_PANDAS_ITER_UDF should exhaust the input "
                        "iterator.")

            if is_scalar_iter and num_output_rows != num_input_rows[0]:
                raise RuntimeError(
                    "The number of output rows of pandas iterator UDF should be "
                    "the same with input rows. The input rows number is %d but the "
                    "output rows number is %d." %
                    (num_input_rows[0], num_output_rows))

        # profiling is not supported for UDF
        return func, None, ser, ser

    udfs = {}
    call_udf = []
    mapper_str = ""
    if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        # Create function like this:
        #   lambda a: f([a[0]], [a[0], a[1]])

        # We assume there is only one UDF here because grouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1

        # See FlatMapGroupsInPandasExec for how arg_offsets are used to
        # distinguish between grouping attributes and data attributes
        arg_offsets, udf = read_single_udf(pickleSer,
                                           infile,
                                           eval_type,
                                           runner_conf,
                                           udf_index=0)
        udfs['f'] = udf
        split_offset = arg_offsets[0] + 1
        arg0 = ["a[%d]" % o for o in arg_offsets[1:split_offset]]
        arg1 = ["a[%d]" % o for o in arg_offsets[split_offset:]]
        mapper_str = "lambda a: f([%s], [%s])" % (", ".join(arg0),
                                                  ", ".join(arg1))
    else:
        # Create function like this:
        #   lambda a: (f0(a[0]), f1(a[1], a[2]), f2(a[3]))
        # In the special case of a single UDF this will return a single result rather
        # than a tuple of results; this is the format that the JVM side expects.
        for i in range(num_udfs):
            arg_offsets, udf = read_single_udf(pickleSer,
                                               infile,
                                               eval_type,
                                               runner_conf,
                                               udf_index=i)
            udfs['f%d' % i] = udf
            args = ["a[%d]" % o for o in arg_offsets]
            call_udf.append("f%d(%s)" % (i, ", ".join(args)))
        mapper_str = "lambda a: (%s)" % (", ".join(call_udf))

    mapper = eval(mapper_str, udfs)
    func = lambda _, it: map(mapper, it)

    # profiling is not supported for UDF
    return func, None, ser, ser
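A minimal standalone sketch, not taken from Spark, of the string-built mapper pattern above; the UDFs, argument offsets and batches here are invented stand-ins:

# Build "lambda a: (f0(a[0]), f1(a[1], a[2]))" as source text, eval it against a
# namespace holding the functions, and apply it to every batch with map().
udfs = {'f0': lambda a: a * 2, 'f1': lambda a, b: a + b}
arg_offsets = [[0], [1, 2]]          # which batch columns feed each UDF

call_udf = ['f%d(%s)' % (i, ', '.join('a[%d]' % o for o in offs))
            for i, offs in enumerate(arg_offsets)]
mapper_str = 'lambda a: (%s)' % ', '.join(call_udf)
mapper = eval(mapper_str, udfs)

batches = [(1, 2, 3), (10, 20, 30)]  # each "batch" is a tuple of columns
print(list(map(mapper, batches)))    # [(2, 5), (20, 50)]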
示例#46
0
def read_udfs(pickleSer, infile, eval_type):
    runner_conf = {}

    if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                     PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
                     PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
                     PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                     PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF):

        # Load conf used for pandas_udf evaluation
        num_conf = read_int(infile)
        for i in range(num_conf):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            runner_conf[k] = v

        # NOTE: if timezone is set here, that implies respectSessionTimeZone is True
        timezone = runner_conf.get("spark.sql.session.timeZone", None)
        safecheck = runner_conf.get(
            "spark.sql.execution.pandas.convertToArrowArraySafely",
            "false").lower() == 'true'
        # Used by SQL_GROUPED_MAP_PANDAS_UDF and SQL_SCALAR_PANDAS_UDF when returning StructType
        assign_cols_by_name = runner_conf.get(
            "spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName", "true")\
            .lower() == "true"

        if eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
            ser = CogroupUDFSerializer(timezone, safecheck,
                                       assign_cols_by_name)
        else:
            # Scalar Pandas UDF handles struct type arguments as pandas DataFrames instead of
            # pandas Series. See SPARK-27240.
            df_for_struct = (
                eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF
                or eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF
                or eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF)
            ser = ArrowStreamPandasUDFSerializer(timezone, safecheck,
                                                 assign_cols_by_name,
                                                 df_for_struct)
    else:
        ser = BatchedSerializer(PickleSerializer(), 100)

    num_udfs = read_int(infile)

    is_scalar_iter = eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF
    is_map_iter = eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF

    if is_scalar_iter or is_map_iter:
        if is_scalar_iter:
            assert num_udfs == 1, "One SCALAR_ITER UDF expected here."
        if is_map_iter:
            assert num_udfs == 1, "One MAP_ITER UDF expected here."

        arg_offsets, udf = read_single_udf(pickleSer,
                                           infile,
                                           eval_type,
                                           runner_conf,
                                           udf_index=0)

        def func(_, iterator):
            num_input_rows = [
                0
            ]  # TODO(SPARK-29909): Use nonlocal after we drop Python 2.

            def map_batch(batch):
                udf_args = [batch[offset] for offset in arg_offsets]
                num_input_rows[0] += len(udf_args[0])
                if len(udf_args) == 1:
                    return udf_args[0]
                else:
                    return tuple(udf_args)

            iterator = map(map_batch, iterator)
            result_iter = udf(iterator)

            num_output_rows = 0
            for result_batch, result_type in result_iter:
                num_output_rows += len(result_batch)
                # This assert is for Scalar Iterator UDF to fail fast.
                # The length of the entire input can only be explicitly known
                # by consuming the input iterator in user side. Therefore,
                # it's very unlikely the output length is higher than
                # input length.
                assert is_map_iter or num_output_rows <= num_input_rows[0], \
                    "Pandas SCALAR_ITER UDF outputted more rows than input rows."
                yield (result_batch, result_type)

            if is_scalar_iter:
                try:
                    next(iterator)
                except StopIteration:
                    pass
                else:
                    raise RuntimeError(
                        "pandas iterator UDF should exhaust the input "
                        "iterator.")

                if num_output_rows != num_input_rows[0]:
                    raise RuntimeError(
                        "The length of output in Scalar iterator pandas UDF should be "
                        "the same with the input's; however, the length of output was %d and the "
                        "length of input was %d." %
                        (num_output_rows, num_input_rows[0]))

        # profiling is not supported for UDF
        return func, None, ser, ser

    def extract_key_value_indexes(grouped_arg_offsets):
        """
        Helper function to extract the key and value indexes from arg_offsets for the grouped and
        cogrouped pandas udfs. See BasePandasGroupExec.resolveArgOffsets for equivalent scala code.

        :param grouped_arg_offsets:  List containing the key and value indexes of columns of the
            DataFrames to be passed to the udf. It consists of n repeating groups where n is the
            number of DataFrames.  Each group has the following format:
                group[0]: length of group
                group[1]: length of key indexes
                group[2.. group[1] +2]: key attributes
                group[group[1] +3 .. group[0]]: value attributes
        """
        parsed = []
        idx = 0
        while idx < len(grouped_arg_offsets):
            offsets_len = grouped_arg_offsets[idx]
            idx += 1
            offsets = grouped_arg_offsets[idx:idx + offsets_len]
            split_index = offsets[0] + 1
            offset_keys = offsets[1:split_index]
            offset_values = offsets[split_index:]
            parsed.append([offset_keys, offset_values])
            idx += offsets_len
        return parsed

    if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        # We assume there is only one UDF here because grouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1

        # See FlatMapGroupsInPandasExec for how arg_offsets are used to
        # distinguish between grouping attributes and data attributes
        arg_offsets, f = read_single_udf(pickleSer,
                                         infile,
                                         eval_type,
                                         runner_conf,
                                         udf_index=0)
        parsed_offsets = extract_key_value_indexes(arg_offsets)

        # Create function like this:
        #   mapper a: f([a[0]], [a[0], a[1]])
        def mapper(a):
            keys = [a[o] for o in parsed_offsets[0][0]]
            vals = [a[o] for o in parsed_offsets[0][1]]
            return f(keys, vals)
    elif eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
        # We assume there is only one UDF here because cogrouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1
        arg_offsets, f = read_single_udf(pickleSer,
                                         infile,
                                         eval_type,
                                         runner_conf,
                                         udf_index=0)

        parsed_offsets = extract_key_value_indexes(arg_offsets)

        def mapper(a):
            df1_keys = [a[0][o] for o in parsed_offsets[0][0]]
            df1_vals = [a[0][o] for o in parsed_offsets[0][1]]
            df2_keys = [a[1][o] for o in parsed_offsets[1][0]]
            df2_vals = [a[1][o] for o in parsed_offsets[1][1]]
            return f(df1_keys, df1_vals, df2_keys, df2_vals)
    else:
        udfs = []
        for i in range(num_udfs):
            udfs.append(
                read_single_udf(pickleSer,
                                infile,
                                eval_type,
                                runner_conf,
                                udf_index=i))

        def mapper(a):
            result = tuple(
                f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
            # In the special case of a single UDF this will return a single result rather
            # than a tuple of results; this is the format that the JVM side expects.
            if len(result) == 1:
                return result[0]
            else:
                return result

    func = lambda _, it: map(mapper, it)

    # profiling is not supported for UDF
    return func, None, ser, ser
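A standalone worked example, with invented offsets, of how extract_key_value_indexes above splits a flat offsets list into per-DataFrame key/value index pairs:

# Standalone copy of extract_key_value_indexes from the snippet above,
# run on an invented grouped_arg_offsets list for illustration.
def extract_key_value_indexes(grouped_arg_offsets):
    parsed = []
    idx = 0
    while idx < len(grouped_arg_offsets):
        offsets_len = grouped_arg_offsets[idx]          # size of this group's offsets
        idx += 1
        offsets = grouped_arg_offsets[idx:idx + offsets_len]
        split_index = offsets[0] + 1                    # offsets[0] = number of key indexes
        parsed.append([offsets[1:split_index], offsets[split_index:]])
        idx += offsets_len
    return parsed

# One group of 5 offsets: 1 key column (index 0) and 3 value columns (1, 2, 3).
print(extract_key_value_indexes([5, 1, 0, 1, 2, 3]))   # [[[0], [1, 2, 3]]]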
示例#47
0
 def func(s, iterator):
     return chain.from_iterable(map(f, iterator))
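An illustrative sketch (values invented) of what this flat-map helper does:

from itertools import chain

# chain.from_iterable(map(f, iterator)) applies f to each element and flattens
# the returned iterables into one lazy stream, i.e. a flatMap.
f = lambda x: [x, x * 10]            # illustrative f: returns an iterable per element
iterator = iter([1, 2, 3])
print(list(chain.from_iterable(map(f, iterator))))   # [1, 10, 2, 20, 3, 30]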
示例#48
0
def _some1(predicate, iterable):
    """Alternative implementation of :func:`some`."""
    return any(map(predicate, iterable))
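Usage sketch with invented inputs; any(map(...)) short-circuits on the first truthy result:

# The predicate is only evaluated until the first match is found.
def _some1(predicate, iterable):
    return any(map(predicate, iterable))

print(_some1(lambda x: x > 2, [1, 2, 3, 4]))   # True
print(_some1(str.isdigit, ["a", "b"]))         # False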
示例#49
0
def colorize_root(root):
    colorize = ft.partial(tty.color.colorize, color=sys.stdout.isatty())
    pre, post = map(colorize, "@M[@. @M]@.".split())
    return "".join([pre, root, post])
示例#50
0
    def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True):
        """
        Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.

        When ``schema`` is a list of column names, the type of each column
        will be inferred from ``data``.

        When ``schema`` is ``None``, it will try to infer the schema (column names and types)
        from ``data``, which should be an RDD of :class:`Row`,
        or :class:`namedtuple`, or :class:`dict`.

        When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match
        the real data, or an exception will be thrown at runtime. If the given schema is not
        :class:`pyspark.sql.types.StructType`, it will be wrapped into a
        :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value";
        each record will also be wrapped into a tuple, which can be converted to a row later.

        If schema inference is needed, ``samplingRatio`` is used to determine the ratio of
        rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.

        :param data: an RDD of any kind of SQL data representation (e.g. row, tuple, int, boolean,
            etc.), or :class:`list`, or :class:`pandas.DataFrame`.
        :param schema: a :class:`pyspark.sql.types.DataType` or a datatype string or a list of
            column names, default is ``None``.  The data type string format equals to
            :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can
            omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use
            ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use
            ``int`` as a short name for ``IntegerType``.
        :param samplingRatio: the sample ratio of rows used for inferring
        :param verifySchema: verify data types of every row against schema.
        :return: :class:`DataFrame`

        .. versionchanged:: 2.1
           Added verifySchema.

        >>> l = [('Alice', 1)]
        >>> spark.createDataFrame(l).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> spark.createDataFrame(l, ['name', 'age']).collect()
        [Row(name=u'Alice', age=1)]

        >>> d = [{'name': 'Alice', 'age': 1}]
        >>> spark.createDataFrame(d).collect()
        [Row(age=1, name=u'Alice')]

        >>> rdd = sc.parallelize(l)
        >>> spark.createDataFrame(rdd).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> df = spark.createDataFrame(rdd, ['name', 'age'])
        >>> df.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql import Row
        >>> Person = Row('name', 'age')
        >>> person = rdd.map(lambda r: Person(*r))
        >>> df2 = spark.createDataFrame(person)
        >>> df2.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql.types import *
        >>> schema = StructType([
        ...    StructField("name", StringType(), True),
        ...    StructField("age", IntegerType(), True)])
        >>> df3 = spark.createDataFrame(rdd, schema)
        >>> df3.collect()
        [Row(name=u'Alice', age=1)]

        >>> spark.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
        [Row(name=u'Alice', age=1)]
        >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
        [Row(0=1, 1=2)]

        >>> spark.createDataFrame(rdd, "a: string, b: int").collect()
        [Row(a=u'Alice', b=1)]
        >>> rdd = rdd.map(lambda row: row[1])
        >>> spark.createDataFrame(rdd, "int").collect()
        [Row(value=1)]
        >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        Py4JJavaError: ...
        """
        if isinstance(data, DataFrame):
            raise TypeError("data is already a DataFrame")

        if isinstance(schema, basestring):
            schema = _parse_datatype_string(schema)
        elif isinstance(schema, (list, tuple)):
            # Must re-encode any unicode strings to be consistent with StructField names
            schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema]

        try:
            import pandas
            has_pandas = True
        except Exception:
            has_pandas = False
        if has_pandas and isinstance(data, pandas.DataFrame):
            from pyspark.sql.utils import require_minimum_pandas_version
            require_minimum_pandas_version()

            if self.conf.get("spark.sql.execution.pandas.respectSessionTimeZone").lower() \
               == "true":
                timezone = self.conf.get("spark.sql.session.timeZone")
            else:
                timezone = None

            # If no schema supplied by user then get the names of columns only
            if schema is None:
                schema = [str(x) if not isinstance(x, basestring) else
                          (x.encode('utf-8') if not isinstance(x, str) else x)
                          for x in data.columns]

            if self.conf.get("spark.sql.execution.arrow.enabled", "false").lower() == "true" \
                    and len(data) > 0:
                try:
                    return self._create_from_pandas_with_arrow(data, schema, timezone)
                except Exception as e:
                    warnings.warn("Arrow will not be used in createDataFrame: %s" % str(e))
                    # Fallback to create DataFrame without arrow if raise some exception
            data = self._convert_from_pandas(data, schema, timezone)

        if isinstance(schema, StructType):
            verify_func = _make_type_verifier(schema) if verifySchema else lambda _: True

            def prepare(obj):
                verify_func(obj)
                return obj
        elif isinstance(schema, DataType):
            dataType = schema
            schema = StructType().add("value", schema)

            verify_func = _make_type_verifier(
                dataType, name="field value") if verifySchema else lambda _: True

            def prepare(obj):
                verify_func(obj)
                return obj,
        else:
            prepare = lambda obj: obj

        if isinstance(data, RDD):
            rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
        else:
            rdd, schema = self._createFromLocal(map(prepare, data), schema)
        jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
        jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df
示例#51
0
    def search_winners(self, states):
        # A belief state is a list of tuples (prob, candidate,
        # previous candidate) to describe the probability of each
        # candidate in the state at a certain time. The previous
        # candidates are used to reconstruct the most likely path.
        prev_belief_state = []
        scanned_candidates = {}

        for state in states:
            if not state:
                continue

            belief_state = [(0, c, None) for c in state]
            state_size = len(belief_state)
            for prev_prob, prev_candidate, _ in prev_belief_state:
                if prev_prob <= 0:
                    continue
                transition_probs = self.calculate_transition_costs(
                    prev_candidate.body, [c.body for c in state])
                assert len(transition_probs) == state_size
                emission_probs = list(
                    map(self.calculate_emission_cost, [c.body for c in state]))

                # Update current belief state
                for idx in range(state_size):
                    transition_prob = transition_probs[idx]
                    emission_prob = emission_probs[idx]
                    if emission_prob <= 0 or transition_prob <= 0:
                        continue
                    new_prob = prev_prob * transition_prob * emission_prob
                    prob, _, _ = belief_state[idx]
                    if prob < new_prob:
                        belief_state[idx] = (new_prob, state[idx],
                                             prev_candidate)

            most_prob, winner, _ = max(belief_state, key=lambda c: c[0])

            # If no one at previous state can reach current state,
            # then it is a new start
            new_start = most_prob <= 0
            if new_start:
                # Update current belief state
                belief_state = [(self.calculate_emission_cost(c.body), c, None)
                                for c in state]
                scanned_candidates = {}
                most_prob, winner, _ = max(belief_state, key=lambda c: c[0])
            if most_prob <= 0:
                continue

            # Update scanned table for reconstructing path
            scanned_candidates.update({c.id: pc for _, c, pc in belief_state})

            yield winner, scanned_candidates, new_start

            # Avoid underflow: multiplying all probability values by
            # an estimated scalar
            least_prob, _, _ = min(filter(lambda c: c[0] > 0, belief_state),
                                   key=lambda c: c[0])
            scalar = 1
            prob = least_prob
            while prob < 1:
                scalar *= 10
                prob = least_prob * scalar
            if scalar > 1:
                belief_state = [(p * scalar, c, pc)
                                for p, c, pc in belief_state]

            prev_belief_state = belief_state
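A tiny sketch, with invented probabilities, of the underflow-rescaling step at the end of the loop above:

# Rescale every probability by the smallest power of ten that lifts the least
# positive probability to >= 1, so repeated multiplication in later steps does
# not underflow. The relative order of the candidates is unchanged.
belief_state = [(3e-7, "c1", None), (0.0, "c2", None), (8e-6, "c3", None)]

least_prob = min(p for p, _, _ in belief_state if p > 0)
scalar = 1
while least_prob * scalar < 1:
    scalar *= 10
belief_state = [(p * scalar, c, pc) for p, c, pc in belief_state]
print(belief_state)   # approx. [(3.0, 'c1', None), (0.0, 'c2', None), (80.0, 'c3', None)]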
示例#52
0
def is_minimum_version(version, min_version):
    """Return True if version is equal or greater to min_version"""
    return list(map(int, version.split('.'))) >= list(map(int, min_version.split('.')))
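Usage sketch for the helper above; the inputs are illustrative, and the map(int, ...) step is what avoids the lexicographic trap shown on the last line:

# Comparing the components as integers orders "2.10" after "2.9",
# which plain string comparison gets wrong.
def is_minimum_version(version, min_version):
    return list(map(int, version.split('.'))) >= list(map(int, min_version.split('.')))

print(is_minimum_version("2.10", "2.9"))   # True  (10 >= 9 numerically)
print("2.10" >= "2.9")                     # False (lexicographic comparison)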
示例#53
0
 def internal_writer(self, outputs, stdout):
     """
     Writer which outputs the python repr for each item.
     """
     for output in outputs:
         print("\t".join(map(self.internal_serialize, output)), file=stdout)
示例#54
0
def is_less_version(version, max_version):
    """Return True if version is less to max_version"""
    return list(map(int, version.split('.'))) < list(map(int, max_version.split('.')))
示例#55
0
def _nginx_cerbot_setup(
    domains,
    https_cert_email,
    conf_dirs=("/etc/nginx/sites-enabled", ),
    use_sudo=True,
    warn_only=True,
    quiet=True,
):
    if not cmd_avail("certbot"):
        install()

    if domains != "all":
        raise NotImplementedError("{} for domains".format(domains))

    run_cmd = partial(_run_command, sudo=use_sudo)

    if not run("ls -A '{conf_dir}'".format(conf_dir=conf_dirs[0]),
               shell_escape=False):
        return "hosts_d is empty empty; skipping"

    server_names_t = tuple(
        chain(*(run_cmd("grep -RF server_name '{conf_dir}'".format(
            conf_dir=conf_dir)).split("\n") for conf_dir in conf_dirs)))

    hosts = tuple(
        l.partition("127.0.0.1")[2].strip()
        for l in run_cmd("grep -F 127.0.0.1 /etc/hosts").split("\n")
        if "localhost" not in l)

    server_names_d = dict(
        (lambda spl: (spl[1].lstrip().rstrip("; \t\r"),
                      spl[0][:spl[0].rfind(":")]))(l.split("server_name"))
        for l in server_names_t)
    if len(server_names_d) < len(server_names_t):
        raise NotImplementedError(
            "Same server_name in multiple files. We don't know what to stop!")

    hosts_d = {
        host: server_names_d[host]
        for host in hosts if host.count(".") > 1 and host in server_names_d
        and len(host.translate(None, "~^|()?*")) == len(host)
    }

    if not hosts_d:
        return "hosts_d is empty empty; skipping"

    run_cmd("mkdir -p /etc/nginx/sites-disabled")
    sites_avail_local_filepath = resource_filename(
        "offregister_app_push", path.join("conf",
                                          "nginx.sites-available.conf"))

    def certbot_prep(dns_name, conf_loc):
        run_cmd("mv '{}' '/etc/nginx/sites-disabled/{}'".format(
            conf_loc,
            path.split(conf_loc)[1]))
        wwwroot = "/var/www/static/{dns_name}".format(dns_name=dns_name)
        if exists(wwwroot):
            run_cmd("rm -r '{wwwroot}'".format(wwwroot=wwwroot))
        run_cmd("mkdir -p '{wwwroot}'".format(wwwroot=wwwroot))
        _send_nginx_conf(
            conf_remote_filename="/etc/nginx/sites-enabled/{dns_name}-certbot".
            format(dns_name=dns_name),
            sites_avail_local_filepath=sites_avail_local_filepath,
            proxy_block_local_filepath=None,
            conf_vars={
                "NGINX_PORT":
                80,
                "DNS_NAMES": (dns_name, ),
                "DESCRIPTION":
                "Temporary conf doing certbot for {}".format(dns_name),
                "WWWPATH":
                "/",
                "WWWROOT":
                wwwroot,
            },
        )
        print(
            'one("{}", "{}") ='.format(dns_name, conf_loc),
            "-w '{wwwroot}' -d '{dns_name}' ".format(dns_name=dns_name,
                                                     wwwroot=wwwroot),
        )
        return "-w '{wwwroot}' -d '{dns_name}' ".format(dns_name=dns_name,
                                                        wwwroot=wwwroot)

    secured_already = (frozenset(
        run_cmd("ls /etc/letsencrypt/live", warn_only=True).splitlines())
                       if exists("/etc/letsencrypt/live") else tuple())
    cerbot_cmds = tuple(
        "certbot certonly --agree-tos -m {https_cert_email} --webroot {root}".
        format(https_cert_email=https_cert_email,
               root=certbot_prep(dns_name, conf_loc))
        for dns_name, conf_loc in iteritems(hosts_d)
        if dns_name not in secured_already)

    if not cerbot_cmds:
        return "You must've already secured all your domains. Otherwise clean: /etc/letsencrypt/live"

    service_name = "nginx"
    if sudo(
            "systemctl status -q {service_name} --no-pager --full".format(
                service_name=service_name),
            warn_only=True,
    ).failed:
        sudo("systemctl start -q {service_name} --no-pager --full".format(
            service_name=service_name))
    else:
        sudo("systemctl reload -q {service_name} --no-pager --full".format(
            service_name=service_name))
    print("cerbot_cmds =", cerbot_cmds)
    certbot_res = tuple(map(run_cmd, cerbot_cmds))
    sudo("cp /etc/nginx/sites-disabled/* /etc/nginx/sites-enabled")

    # sudo('rm -r /etc/nginx/sites-disabled')

    def secure_conf(dns_name, conf_loc, https_header):
        # print 'secure_conf({!r}, {!r})'.format(dns_name, conf_loc)
        if run_cmd("grep -Fq 443 {conf_loc}".format(conf_loc=conf_loc),
                   warn_only=True).failed:
            logger.warning(
                "Skipping {conf_loc}; 443 already found within".format(
                    conf_loc=conf_loc))
        sio = StringIO()
        get(remote_path=conf_loc, use_sudo=use_sudo, local_path=sio)
        sio.seek(0)
        sio_s = sio.read()
        substr = sio_s[sio_s.find("{", sio_s.find("server")):sio_s.rfind("}") +
                       2].replace("listen 80", "listen 443", 1)
        https_header %= {
            "CA_CERT_PATH":
            "/etc/letsencrypt/live/{dns_name}/fullchain.pem".format(
                dns_name=dns_name),
            "PRIV_KEY_PATH":
            "/etc/letsencrypt/live/{dns_name}/privkey.pem".format(
                dns_name=dns_name),
        }
        """ # TODO: Address parsing, if not in `listen` keyword
        sni = substr.find('server_name')
        sni = substr[sni:substr.find(';', sni)]
        col = sni.rfind(':')
        col = col.format(':') if col > -1 else col"""

        return put(
            remote_path=conf_loc,
            use_sudo=use_sudo,
            local_path=StringIO("{orig}\n\nserver {substr}".format(
                orig=sio_s,
                substr=substr.replace(
                    "{dns_name};\n".format(dns_name=dns_name),
                    "{dns_name};\n{https_header}\n".format(
                        dns_name=dns_name,
                        https_header=_indent(https_header, 4)),
                    1,
                ),
            )),
        )

    with open(
            resource_filename("offregister_app_push",
                              path.join("conf", "nginx.https_header.conf")),
            "rt",
    ) as f:
        https_header = f.read()
    replaced_confs = tuple(
        secure_conf(dns_name, conf_loc, https_header)
        for dns_name, conf_loc in iteritems(hosts_d))

    sudo("systemctl reload -q {service_name} --no-pager --full".format(
        service_name=service_name))
    return {"certbot_res": certbot_res, "replaced_confs": replaced_confs}
示例#56
0
def _group_from_dict(cls, data):
    params = {x: data.get(x, None) for x in cls._fields}
    # Parse the feeds if they're provided and generate feed instances.
    params['feeds'] = tuple(map(Feed.from_dict, data.get('feeds', [])))
    return cls(**params)
示例#57
0
def decode_predictions_beam_search(preds,
                                   index2word,
                                   glossary=None,
                                   alphas=None,
                                   heuristic=0,
                                   x_text=None,
                                   unk_symbol='<unk>',
                                   pad_sequences=False,
                                   mapping=None,
                                   verbose=0):
    """
    Decodes predictions from the BeamSearch method.

    :param preds: Predictions codified as word indices.
    :param index2word: Mapping from word indices into word characters.
    :param alphas: Attention model weights: Float matrix with shape (I, J) (I: number of target items; J: number of source items).
    :param heuristic: Replace unknown words heuristic (0, 1 or 2)
    :param x_text: Source text (for unk replacement)
    :param unk_symbol: Unknown words symbol
    :param pad_sequences: Whether we should make a zero-pad on the input sequence.
    :param mapping: Source-target dictionary (for unk_replace heuristics 1 and 2)
    :param verbose: Verbosity level, by default 0.
    :return: List of decoded predictions
    """
    if verbose > 0:
        logger.info('Decoding beam search prediction ...')

    if alphas is not None:
        if x_text is None:
            raise AssertionError(
                'When using POS_UNK, you must provide the input '
                'text to decode_predictions_beam_search!')
        if verbose > 0:
            logger.info('Using heuristic %d' % heuristic)
    if pad_sequences:
        preds = [
            pred[:sum([int(elem > 0) for elem in pred]) + 1] for pred in preds
        ]
    flattened_predictions = [
        list(map(lambda x: index2word[x], pred)) for pred in preds
    ]
    final_predictions = []

    if alphas is not None:
        x_text = list(map(lambda x: x.split(), x_text))
        hard_alignments = list(
            map(
                lambda alignment, x_sentence: np.argmax(
                    alignment[:, :max(1, len(x_sentence))], axis=1), alphas,
                x_text))

        for i, a_no in list(enumerate(flattened_predictions)):
            if unk_symbol in a_no or glossary is not None:
                a_no = replace_unknown_words(x_text[i],
                                             a_no,
                                             hard_alignments[i],
                                             unk_symbol,
                                             glossary=glossary,
                                             heuristic=heuristic,
                                             mapping=mapping,
                                             verbose=verbose)
            a_no = [
                a.decode('utf-8')
                if isinstance(a, str) and sys.version_info.major == 2 else a
                for a in a_no
            ]
            tmp = u' '.join(a_no[:-1])
            final_predictions.append(tmp)
    else:
        for a_no in flattened_predictions:
            a_no = [
                a.decode('utf-8')
                if isinstance(a, str) and sys.version_info.major == 2 else a
                for a in a_no
            ]
            tmp = u' '.join(a_no[:-1])
            final_predictions.append(tmp)
    return final_predictions
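A small numeric sketch, with made-up attention weights, of the hard_alignments step used above, which picks the most-attended source position for every target word:

import numpy as np

# Each row of `alphas` holds one target word's attention weights over the
# source positions; argmax along axis=1 turns the soft weights into a hard
# alignment (target position -> most attended source position).
x_text = ["le chat noir".split()]                    # one source sentence, 3 tokens
alphas = [np.array([[0.7, 0.2, 0.1],                 # target word 0 attends source 0
                    [0.1, 0.8, 0.1],                 # target word 1 attends source 1
                    [0.2, 0.1, 0.7]])]               # target word 2 attends source 2
hard_alignments = list(
    map(lambda alignment, x_sentence: np.argmax(
        alignment[:, :max(1, len(x_sentence))], axis=1), alphas, x_text))
print(hard_alignments)   # [array([0, 1, 2])]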
示例#58
0
 def __compat_repr__(self):  # pragma: nocover
     def make_param(name):
         value = getattr(self, name)
         return '{name}={value!r}'.format(**locals())
     params = ', '.join(map(make_param, self._fields))
     return 'EntryPoint({params})'.format(**locals())
示例#59
0
def sample_ensemble(args, params):

    from data_engine.prepare_data import update_dataset_from_file
    from keras_wrapper.model_ensemble import BeamSearchEnsemble
    from keras_wrapper.cnn_model import loadModel
    from keras_wrapper.dataset import loadDataset
    from keras_wrapper.utils import decode_predictions_beam_search

    logging.info("Using an ensemble of %d models" % len(args.models))
    models = [loadModel(m, -1, full_path=True) for m in args.models]
    dataset = loadDataset(args.dataset)
    dataset = update_dataset_from_file(dataset,
                                       args.text,
                                       params,
                                       splits=args.splits,
                                       remove_outputs=True)

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['OUTPUTS_IDS_DATASET'][0]]
    # For converting predictions into sentences
    index2word_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET']
                                      [0]]['idx2words']

    if params.get('APPLY_DETOKENIZATION', False):
        detokenize_function = eval('dataset.' +
                                   params['DETOKENIZATION_METHOD'])

    params_prediction = dict()
    params_prediction['max_batch_size'] = params.get('BATCH_SIZE', 20)
    params_prediction['n_parallel_loaders'] = params.get('PARALLEL_LOADERS', 1)
    params_prediction['beam_size'] = params.get('BEAM_SIZE', 6)
    params_prediction['maxlen'] = params.get('MAX_OUTPUT_TEXT_LEN_TEST', 100)
    params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH']
    params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL']
    params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL']
    params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET']
    params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET']
    params_prediction['search_pruning'] = params.get('SEARCH_PRUNING', False)
    params_prediction['normalize_probs'] = params.get('NORMALIZE_SAMPLING',
                                                      False)
    params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0)
    params_prediction['coverage_penalty'] = params.get('COVERAGE_PENALTY',
                                                       False)
    params_prediction['length_penalty'] = params.get('LENGTH_PENALTY', False)
    params_prediction['length_norm_factor'] = params.get(
        'LENGTH_NORM_FACTOR', 0.0)
    params_prediction['coverage_norm_factor'] = params.get(
        'COVERAGE_NORM_FACTOR', 0.0)
    params_prediction['pos_unk'] = params.get('POS_UNK', False)
    params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \
        else params.get('MAX_OUTPUT_TEXT_LEN', 50)
    params_prediction['output_max_length_depending_on_x'] = params.get(
        'MAXLEN_GIVEN_X', True)
    params_prediction['output_max_length_depending_on_x_factor'] = params.get(
        'MAXLEN_GIVEN_X_FACTOR', 3)
    params_prediction['output_min_length_depending_on_x'] = params.get(
        'MINLEN_GIVEN_X', True)
    params_prediction['output_min_length_depending_on_x_factor'] = params.get(
        'MINLEN_GIVEN_X_FACTOR', 2)
    params_prediction['attend_on_output'] = params.get(
        'ATTEND_ON_OUTPUT', 'transformer' in params['MODEL_TYPE'].lower())

    heuristic = params.get('HEURISTIC', 0)
    mapping = None if dataset.mapping == dict() else dataset.mapping
    model_weights = args.weights

    if model_weights is not None and model_weights != []:
        assert len(model_weights) == len(
            models
        ), 'You should give a weight to each model. You gave %d models and %d weights.' % (
            len(models), len(model_weights))
        model_weights = list(map(float, model_weights))  # materialize so len() works and the list can be reused
        if len(model_weights) > 1:
            logger.info('Giving the following weights to each model: %s' %
                        str(model_weights))
    for s in args.splits:
        # Apply model predictions
        params_prediction['predict_on_sets'] = [s]
        beam_searcher = BeamSearchEnsemble(models,
                                           dataset,
                                           params_prediction,
                                           model_weights=model_weights,
                                           n_best=args.n_best,
                                           verbose=args.verbose)
        if args.n_best:
            predictions, n_best = beam_searcher.predictBeamSearchNet()[s]
        else:
            predictions = beam_searcher.predictBeamSearchNet()[s]
            n_best = None
        if params_prediction['pos_unk']:
            samples = predictions[0]
            alphas = predictions[1]
            sources = [
                x.strip() for x in open(args.text, 'r').read().split('\n')
            ]
            sources = sources[:-1] if len(sources[-1]) == 0 else sources
        else:
            samples = predictions
            alphas = None
            heuristic = None
            sources = None

        predictions = decode_predictions_beam_search(samples,
                                                     index2word_y,
                                                     alphas=alphas,
                                                     x_text=sources,
                                                     heuristic=heuristic,
                                                     mapping=mapping,
                                                     verbose=args.verbose)
        # Apply detokenization function if needed
        if params.get('APPLY_DETOKENIZATION', False):
            predictions = list(map(detokenize_function, predictions))

        if args.n_best:
            n_best_predictions = []
            for i, (n_best_preds, n_best_scores,
                    n_best_alphas) in enumerate(n_best):
                n_best_sample_score = []
                for n_best_pred, n_best_score, n_best_alpha in zip(
                        n_best_preds, n_best_scores, n_best_alphas):
                    pred = decode_predictions_beam_search(
                        [n_best_pred],
                        index2word_y,
                        alphas=[n_best_alpha]
                        if params_prediction['pos_unk'] else None,
                        x_text=[sources[i]]
                        if params_prediction['pos_unk'] else None,
                        heuristic=heuristic,
                        mapping=mapping,
                        verbose=args.verbose)
                    # Apply detokenization function if needed
                    if params.get('APPLY_DETOKENIZATION', False):
                        pred = list(map(detokenize_function, pred))

                    n_best_sample_score.append([i, pred, n_best_score])
                n_best_predictions.append(n_best_sample_score)
        # Store result
        if args.dest is not None:
            filepath = args.dest  # results file
            if params.get('SAMPLING_SAVE_MODE', 'list') == 'list':
                list2file(filepath, predictions)
                if args.n_best:
                    nbest2file(filepath + '.nbest', n_best_predictions)
            else:
                raise Exception(
                    'Only "list" is allowed in "SAMPLING_SAVE_MODE"')
        else:
            list2stdout(predictions)
            if args.n_best:
                logging.info('Storing n-best sentences in ./' + s + '.nbest')
                nbest2file('./' + s + '.nbest', n_best_predictions)
        logging.info('Sampling finished')
示例#60
0
def parsexml(filename):
    global num_questions, num_answers

    counter = 0

    it = map(itemgetter(1), iter(etree.iterparse(filename,
                                                 events=('start', ))))

    root = next(it)  # get posts element

    for elem in it:
        if counter % 100000 == 0:
            print("Processed %i <row/> elements" % counter)

        counter += 1

        if elem.tag == 'row':
            creation_date = dateparser.parse(elem.get('CreationDate'))

            Id = int(elem.get('Id'))
            PostTypeId = int(elem.get('PostTypeId'))
            Score = int(elem.get('Score'))

            if PostTypeId == 1:
                num_questions += 1
                years[creation_date.year] += 1

                ParentId = -1
                TimeToAnswer = 0
                q_creation[Id] = creation_date
                accepted = elem.get('AcceptedAnswerId')
                if accepted:
                    q_accepted[Id] = int(accepted)
                IsAccepted = 0

            elif PostTypeId == 2:
                num_answers += 1

                ParentId = int(elem.get('ParentId'))
                if not ParentId in q_creation:
                    # question was too far in the past
                    continue

                TimeToAnswer = (creation_date - q_creation[ParentId]).seconds

                if ParentId in q_accepted:
                    IsAccepted = int(q_accepted[ParentId] == Id)
                else:
                    IsAccepted = 0

                meta[ParentId].append((Id, IsAccepted, TimeToAnswer, Score))

            else:
                continue

            Text, NumTextTokens, NumCodeLines, LinkCount, NumImages = filter_html(
                elem.get('Body'))

            values = (Id, ParentId, IsAccepted, TimeToAnswer, Score,
                      Text.encode("utf-8"), NumTextTokens, NumCodeLines,
                      LinkCount, NumImages)

            yield values

            root.clear()  # preserve memory
        if counter >= 1000000:
            break
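A self-contained sketch of the same iterparse idiom, run on a tiny in-memory XML document instead of the Stack Overflow dump:

from io import BytesIO
from operator import itemgetter
import xml.etree.ElementTree as etree

# iterparse streams (event, element) pairs; map(itemgetter(1), ...) keeps only
# the elements, so the document never has to fit in memory at once. Clearing
# the root after each row keeps memory usage flat.
xml = b'<posts><row Id="1" PostTypeId="1"/><row Id="2" PostTypeId="2"/></posts>'
it = map(itemgetter(1), etree.iterparse(BytesIO(xml), events=('start',)))
root = next(it)                      # the <posts> element arrives first
for elem in it:
    if elem.tag == 'row':
        print(elem.get('Id'), elem.get('PostTypeId'))
    root.clear()                     # free already-processed children
# 1 1
# 2 2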