Example #1
def convert_model(n_clicks, close, elements, layout, user_id, is_open):

    if user_id.startswith("python_generated_ssid"):
        # Trim id
        user_id = user_id.split("-")[-1]

    if n_clicks is None:
        return [False, [html.H5("No specs defined yet")]]

    else:
        # Keep elements that are either edges (have a source)
        # or elements that have a parent (nodes, not groups)
        elements = [
            elem for elem in elements
            if (("source" in elem["data"]) or ("parent" in elem["data"]))
        ]

        pipelines, classifiers = pipeline_creator.create_pipelines(
            elements, node_options)

        # Save pipelines to Redis (to be used in other modules)
        for pipe, clf in zip(pipelines, classifiers):
            r.set(f"{user_id}_pipeline_{clf}", dill.dumps(pipe))

        # TODO: Make this a modal
        #       https://dash-bootstrap-components.opensource.faculty.ai/l/components/modal
        return [
            not is_open,
            [
                html.P(f"{i+1}) {str(pipeline)}")
                for (i, pipeline) in enumerate(pipelines)
            ]
        ]
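Since each pipeline is serialized to Redis with dill, downstream modules can recover it by key. A minimal retrieval sketch, assuming the same `r` Redis client and a known classifier name `clf` (names here are illustrative):

import dill

# Keys follow the f"{user_id}_pipeline_{clf}" scheme used above
raw = r.get(f"{user_id}_pipeline_{clf}")
if raw is not None:
    pipeline = dill.loads(raw)  # back to a usable pipeline object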
Example #2
def show_schema(api_choice, user_id):

    if api_choice is None:
        return [html.H4("Nothing selected.")]

    else:
        df = get_data(api_choice, user_id)

    if df is None:
        return [html.H4("Nothing to display")]

    schema = r.get(f"{user_id}_{api_choice}_schema")
    if schema is None:
        sample = df.sample(n=50, replace=True).dropna()
        types, subtypes = infer_types(sample, is_sample=True)
        r.set(f"{user_id}_{api_choice}_schema", dill.dumps({
            "types": types,
            "subtypes": subtypes
        }))

    else:
        schema = dill.loads(schema)
        types, subtypes = schema["types"], schema["subtypes"]

    return [
        html.Br(),
        dcc.ConfirmDialog(id="schema_confirmation"),
        html.Button("Update schema", id="update_schema"),

        schema_table(df[:500], types, subtypes)
    ]
Example #3
def serve_layout():
    """
    The layout of our app needs to be inside a function \
    so that a new session_id is generated every time \
    a new session starts.
    """

    session_id = f"python_generated_ssid_{uuid.uuid4()}"

    # TODO: This should probably be moved to `utils.startup`
    # load some data for all users
    for file in os.listdir("../data"):
        if file.endswith("csv"):
            df = pd.read_csv("../data/" + file)
            r.set(f"{session_id}_user_data_example_{file[:-4]}",
                  pickle.dumps(df))

    return html.Div(
        children=[
            html.H2(session_id, id="user_id", style={"display": "none"}),
            html.Div(
                [
                    # Sidebar / menu
                    html.Div(children=SideBar,
                             className="col-sm-4 col-md-3 col-xl-2",
                             id="sidebar",
                             style={"display": "inline-block"}),

                    # main Div
                    html.Div(children=MainMenu,
                             className="col-sm-8  col-md-9 col-xl-10",
                             id="mainmenu",
                             style={"display": "inline-block"}),

                    # Second sidebar (material-ui drawer)
                    html.Div(children=[
                        sd_material_ui.Drawer(SideBar2,
                                              id="drawer",
                                              open=True,
                                              docked=True,
                                              openSecondary=True),
                    ],
                             className="",
                             id="sidebar2",
                             style={"display": "inline-block"}),
                ],
                className="row",
                id="main_content")
        ],
        className="container",
        style={"display": "inline"},
        id="main_page")
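Dash re-evaluates `app.layout` on every page load when it is given a function instead of a component tree, which is what makes the per-session `session_id` above work. A minimal sketch of the wiring, assuming a standard `dash.Dash` app object:

import dash

app = dash.Dash(__name__)

# Pass the function itself (no parentheses): Dash calls it on each
# page load, so every visit gets a freshly generated session_id.
app.layout = serve_layout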
Example #4
def update_schema(n_clicks, table_colnames, row_types, row_subtypes,
                  dataset_choice, user_id):
    """
    Update the dataset schema. This function takes the html elements \
    from the table head (containing column names) and its first two \
    rows (containing dropdowns with the data types/subtypes), parses \
    them and stores them in redis.

    Args:
        n_clicks (int): Number of button clicks.
        table_colnames (dict): The head (`html.Thead`) of the table, \
                               as a Dash dict.
        row_types (dict): The first table row (`html.Tr`) containing \
                          the Dash dropdown dict with the data types.
        row_subtypes (dict): The first table row (`html.Tr`) containing \
                             the Dash dropdown dict with the data subtypes.
        dataset_choice (str): Name of dataset.
        user_id (str): Session/user id.

    Returns:
        list(str, bool): A message and a boolean for a browser alert.
    """


    types = {}
    for col_name, col in zip(table_colnames, row_types):
        dropdown = col["props"]["children"]
        dropdown_value = dropdown["props"]["value"]
        col_name = col_name["props"]["children"]

        types[col_name] = dropdown_value

    subtypes = {}
    for col_name, col in zip(table_colnames, row_subtypes):
        dropdown = col["props"]["children"]
        dropdown_value = dropdown["props"]["value"]
        col_name = col_name["props"]["children"]

        subtypes[col_name] = dropdown_value

    r.set(f"{user_id}_{dataset_choice}_schema", dill.dumps({
        "types": types,
        "subtypes": subtypes
    }))

    return "Updated", True
Example #5
def compare_torrents(name, files, sites):
    torrents = r.get(name)
    if not torrents:
        torrents = db.select_torrent(name)
        r.set(name, json.dumps(torrents))
    else:
        torrents = json.loads(torrents.decode('utf-8'))

    cmp_success = []
    cmp_warning = []
    for t in torrents:
        success_count = failure_count = 0
        torrent_files = eval(t['files'])  # TODO: prefer ast.literal_eval for safety
        result_site = format_sites(t['sites_existed'], sites)
        if not result_site:
            continue
        if torrent_files:
            # caller passed a single total size; cannot compare against a file list
            if isinstance(files, int):
                continue

            keys = list(files.keys())
            for key in keys:
                # On Windows, replace '\\' with '/' to match the database paths
                files[key.replace('\\', '/')] = files.pop(key)

            for k, v in torrent_files.items():
                if v * 0.95 < files.get(k, -1) < v * 1.05:
                    success_count += 1
                else:
                    failure_count += 1
            if failure_count:
                if success_count > failure_count:
                    db.hit(t['id'])
                    cmp_warning.append({'id': t['id'], 'sites': result_site})
            else:
                db.hit(t['id'])
                cmp_success.append({'id': t['id'], 'sites': result_site})
        else:
            if not isinstance(files, int):
                continue
            if t['length'] * 0.95 < files < t['length'] * 1.05:
                db.hit(t['id'])
                cmp_success.append({'id': t['id'], 'sites': result_site})
    return {'name': name, 'cmp_success': cmp_success, 'cmp_warning': cmp_warning}
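The matching rule treats two sizes as equal when they differ by less than 5%. The same check, extracted as a helper for clarity (a sketch, not part of the original module):

def sizes_match(expected, actual, tolerance=0.05):
    """True if actual is within ±tolerance of expected."""
    return expected * (1 - tolerance) < actual < expected * (1 + tolerance)

# Equivalent to the inline checks above, e.g.:
# v * 0.95 < files.get(k, -1) < v * 1.05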
Example #6
def api_connect(api_choice, user_id, *args, **kwargs):
    """
    Connect to the selected API. A function that serves as the front \
    end to all others, abstracting them away. Also stores the API \
    handle in Redis for later usage.

    Args:
        api_choice (str): A key in `connectors_mapping`.
        user_id (str): Session/user id.
        *args: Arguments to be passed to the appropriate API connector.
        **kwargs: Keyword arguments to be passed to the appropriate \
                  API connector.

    Returns:
        bool: Whether everything succeeded or not (an exception was raised).
    """


    if any(x is None for x in args):
        return False

    func = connectors_mapping[api_choice]

    if api_choice == "ganalytics":
        # Google analytics needs the user_id too
        kwargs.update({"user_id": user_id})

    try:
        api_handle = func(*args, **kwargs)

        # TODO: Maybe add a timeout here as well?
        # Store in Redis that the API connected, and its handle(s)
        r.set(f"{user_id}_{api_choice}_api", "true")
        r.set(f"{user_id}_{api_choice}_api_handle", dill.dumps(api_handle))

        return True

    except Exception as e:
        print(e)
        return False
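A usage sketch, assuming `connectors_mapping` contains a "twitter" connector taking API credentials (argument names are illustrative; the real signature depends on the connector):

# Hypothetical call; returns False if any positional arg is None
# or if the connector raises.
ok = api_connect("twitter", user_id,
                 "consumer_key", "consumer_secret",
                 "access_token", "access_token_secret")
if not ok:
    print("API connection failed")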
Example #7
def get_users_ganalytics(n_clicks, metrics, user_id):

    if metrics is None:
        raise PreventUpdate()

    if n_clicks:
        # TODO: Why have this requester here if below you do your own request?!
        # Get the API handle
        requester = dill.loads(r.get(f"{user_id}_ganalytics_api_handle"))

        if not isinstance(metrics, list):
            metrics = [metrics]

        # GET the metrics from the local REST API (requests Response object)
        response = requests.get(
            f"http://127.0.0.1:5000/{user_id}/{','.join(metrics)}")

        # Parse the JSON response into a dict
        results = json.loads(response.text)

        for metric in metrics:

            data = results["data"][metric[3:]]  # strip the "ga:" prefix

            # TODO: This signifies duplication of storage. The other
            #       server already stores the results in a redis cache
            #       but we cannot remove this because other parts of the
            #       code depend on this storage. Consider reworking the
            #       REST API, but using the same database for 2 servers
            #       is an anti-pattern for micro-services architectures.
            r.set(f"{user_id}_ganalytics_data_{metric}",
                  pickle.dumps(data),
                  ex=3600)

        return [html.Br(), html.P(str(results))]

    else:
        raise PreventUpdate()
Example #8
def get_users_tweets(n_clicks, acc_name, user_id):

    if n_clicks:
        # Get the API handle
        api = pickle.loads(r.get(f"{user_id}_twitter_api_handle"))

        # TODO: This is a cache so consider a better implementation.
        query = r.get(f"{user_id}_twitter_data_{acc_name}")
        if query is None:
            # TODO: Consider saving for future use / as a dataset.
            query = api.GetUserTimeline(screen_name=acc_name)

            # Expire the retrieved tweets cache in one hour
            r.set(f"{user_id}_twitter_data_{acc_name}",
                  pickle.dumps(query),
                  ex=3600)
        else:
            query = pickle.loads(query)

        return [html.P(str(status.text)) for status in query]

    else:
        raise PreventUpdate()
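Examples #2 and #8 share the same get-or-compute caching shape around Redis. A generic sketch of that pattern, assuming a Redis client `r` and pickle-serializable values (the helper name is illustrative):

import pickle

def cached(key, compute, ttl=3600):
    """Return the cached value for key, computing and storing it on a miss."""
    raw = r.get(key)
    if raw is None:
        value = compute()
        r.set(key, pickle.dumps(value), ex=ttl)
        return value
    return pickle.loads(raw)

# e.g.:
# query = cached(f"{user_id}_twitter_data_{acc_name}",
#                lambda: api.GetUserTimeline(screen_name=acc_name))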
Example #9
def modify_graph(remove_clicked_time, connect_selected_time, modify_node_time,
                 load_prebuilt_time, *add_nodes):

    # This is necessary since Python cannot accept *args in the middle
    # of the function parameter list. The tapped node is used only for
    # altering parameters on the last-clicked node, while the selected
    # is used for connecting nodes. The modify_node_attribute refers to
    # the dropdown (sklearn kwarg) and modify_node_params is the value
    (elems, to_be_deleted, selected, modify_node_attribute, modify_node_params,
     tapped, user_text, mapping_store, pipeline_options,
     user_id) = add_nodes[-10:]

    add_nodes = add_nodes[:-10]

    if all(x is None for x in [
            remove_clicked_time, connect_selected_time, modify_node_time,
            *add_nodes
    ]):
        if elems is not None:
            return elems
        else:
            return []

    G = Graph(elems)

    # Create list of tuples, e.g.: (time_clicked, add_xgb)
    add_node_list = [(add_node, f"add_{model}")
                     for (add_node, model) in zip(add_nodes, node_options)]

    # Sort buttons based on clicked time (most recent first)
    buttons_and_clicks = sorted([(remove_clicked_time, "remove"),
                                 (connect_selected_time, "connect"),
                                 (modify_node_time, "modify"),
                                 (load_prebuilt_time, "prebuilt")] +
                                add_node_list,
                                reverse=True)

    # Graph operations
    if buttons_and_clicks[0][1] == "remove":
        G.node_collection.remove_node(to_be_deleted)

    elif buttons_and_clicks[0][1] == "connect":
        G.edge_collection.add_edges(selected)

    elif buttons_and_clicks[0][1].startswith("add_"):
        # e.g.: (time_clicked, add_xgb) --> xgb
        G.node_collection.add_node(buttons_and_clicks[0][1][4:])

    elif buttons_and_clicks[0][1] == "prebuilt":
        pipeline_steps = prebuilt_pipelines[pipeline_options]

        return GraphUtils(pipeline_steps).render_graph()

    elif buttons_and_clicks[0][1] == "modify":
        if tapped is not None:
            for node in G.node_collection.nodes:
                # iterate over all the nodes to find the appropriate one
                # TODO: The fact that this is necessary means that `Graph`
                #       should implement lookup by id (e.g. __getitem__)
                if node.id == tapped["id"]:

                    if node.node_type == "feat_maker":

                        try:
                            dataset_choice = pipeline_creator.find_input_node(
                                elems).dataset
                        except AttributeError:
                            raise PreventUpdate()

                        # Get the mapping symbols
                        # These are the same now but will be changed later
                        user_columns = list(mapping_store["selected_columns"])
                        user_symbols = list(
                            mapping_store["selected_columns"].values())

                        # left- and right-hand side
                        lhs = ','.join(user_symbols)
                        rhs = ' '.join(user_symbols)

                        # TODO: Make sure that these symbols are defined in the
                        #       correct order and that this order is preserved
                        #       when passed to the func inside the pipeline.
                        #       Line 183 probably fixes this but we need to
                        #       double check.
                        exec_commands = [
                            f"{lhs} = sympy.symbols('{rhs}')",
                            f"f = {user_text}",
                            f"lambdify( ({lhs}), f)",
                        ]

                        func_name = f"{user_id}_feat_{'-'.join(user_columns)}"
                        # Store the func to Redis, and save only the
                        # key. This is due to python functions not
                        # being JSON serializable.
                        r.set(func_name, dill.dumps(exec_commands))

                        # TODO: This needs improvement, e.g. with adding
                        #       variables in the edges and passing data
                        #       through there. The current implementation
                        #       is forced to load the dataset twice.
                        params = {
                            "func_name": func_name,
                            "cols": user_columns,
                            "dataset_choice": dataset_choice,
                            "user_id": user_id
                        }
                        node.options["data"]["func_params"].update(params)

                    else:
                        node.options["data"]["func_params"].update(
                            {modify_node_attribute: modify_node_params})

    return G.render_graph()
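The stored `exec_commands` are plain strings, so whatever consumes them has to replay them in order and capture the lambdified callable. A minimal replay sketch, assuming the same `r` client and `func_name` key stored above, and that the consumer eval's the last command (an expression with no assignment) in a namespace providing `sympy` and `lambdify`:

import dill
import sympy
from sympy import lambdify

commands = dill.loads(r.get(func_name))

namespace = {"sympy": sympy, "lambdify": lambdify}
exec(commands[0], namespace)          # define the sympy symbols
exec(commands[1], namespace)          # build the expression f from user_text
func = eval(commands[2], namespace)   # lambdify(...) returns the callable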