def preprocess_data(self, path, is_training_data=False):
        """Scan the dataset once and record size parameters in ``self.params``.

        Every graph ``g`` yielded by the reader opened on ``path`` is inspected
        to derive:

        * ``num_edge_types``: one more than the largest ``e[1]`` over all
          ``g['edges']`` (0 when no graph has edges), doubled unless
          ``self.params['tie_fwd_bkwd']`` is truthy.
        * ``annotation_size``: one more than the largest entry found in any
          ``g['node_features']`` (0 when every graph has none).
        * ``num_choices``: the common ``g['num_choices']`` value; it stays at
          ``-1`` when the reader yields nothing.

        Args:
            path: Location handed to ``IndexedFileReader``.
            is_training_data: Accepted but not used by this method.

        Raises:
            AutoPandasException: If two graphs disagree on ``num_choices``.
        """
        reader = IndexedFileReader(path)

        num_fwd_edge_types = 0
        annotation_size = 0
        num_choices = -1
        # try/finally guarantees the reader is released even when the pass is
        # aborted part-way through (e.g. by the num_choices mismatch below).
        try:
            for g in tqdm.tqdm(reader,
                               desc='Preliminary Data Pass',
                               dynamic_ncols=True):
                # default=-1 makes an empty collection contribute 0 after the
                # +1, instead of max() raising on an empty sequence.
                num_fwd_edge_types = max(
                    num_fwd_edge_types,
                    max((e[1] for e in g['edges']), default=-1) + 1)
                annotation_size = max(
                    annotation_size,
                    max(g['node_features'], default=-1) + 1)

                if num_choices == -1:
                    # First graph seen: adopt its value as the reference.
                    num_choices = g['num_choices']
                elif num_choices != g['num_choices']:
                    raise AutoPandasException(
                        "Number of choices differ across training points")

            # Without tied forward/backward edges each forward type gets a
            # separate counterpart, hence the factor of two.
            self.params['num_edge_types'] = num_fwd_edge_types * (
                1 if self.params['tie_fwd_bkwd'] else 2)
            self.params['annotation_size'] = annotation_size
            self.params['num_choices'] = num_choices
        finally:
            reader.close()
    def add_set(self,
                vals: List[Any],
                selected_indices: List[int] = None,
                query: bool = False):
        """Add one node collection per value in ``vals`` and wire up its edges.

        For each value a collection is obtained from
        ``self.get_element_node_collection(value, position)``. Its representor
        is appended to ``self.elements``, internal edges are added, external
        edges to every entry of ``self.node_collections`` are added, and its
        nodes are merged into ``self.nodes``.

        Args:
            vals: Values to add, in order.
            selected_indices: Indices into ``self.elements`` identifying the
                selected nodes; stored (resolved) as ``self.selected_nodes``.
            query: When truthy, ``selected_indices`` may be omitted.

        Raises:
            AutoPandasException: If ``selected_indices`` is ``None`` and
                ``query`` is falsy.
        """
        if selected_indices is None and not query:
            raise AutoPandasException(
                "One of query and selected_indices needs to be supplied to OrderedSubsets"
            )

        new_collections: List[GraphNodeCollection] = []
        for position, value in enumerate(vals):
            new_collections.append(
                self.get_element_node_collection(value, position))

        #  Representors first, for every new collection
        self.elements.extend(
            coll.setup_representor(self.edge_collection)
            for coll in new_collections)

        #  Then the edges internal to each collection
        for coll in new_collections:
            coll.add_internal_edges(self.edge_collection)

        #  Then edges from each new collection to the known collections
        #  (snapshot taken once, before any external edge is added)
        known_collections = tuple(self.node_collections)
        for coll in new_collections:
            for other in known_collections:
                coll.add_external_edges(other, collector=self.edge_collection)

        for coll in new_collections:
            self.nodes.update(coll.nodes)

        if selected_indices is None:
            return

        chosen = []
        for position in selected_indices:
            chosen.append(self.elements[position])
        self.selected_nodes = chosen
    def add_options(self,
                    num_options: int,
                    picked: int = None,
                    query: bool = False):
        """Store the number of available options and the one that was picked.

        Args:
            num_options: Saved as ``self.num_options``.
            picked: Saved as ``self.picked``; may only be left as ``None``
                when ``query`` is truthy.
            query: When truthy, ``picked`` is allowed to be ``None``.

        Raises:
            AutoPandasException: If ``picked`` is ``None`` and ``query`` is
                falsy.
        """
        label_missing = picked is None and not query
        if label_missing:
            raise AutoPandasException(
                "One of query and picked needs to be supplied to Chain")

        self.num_options, self.picked = num_options, picked
    def add_choices(self,
                    num_choices: int,
                    chosen: int = None,
                    query: bool = False):
        """Store the number of available choices and the one that was chosen.

        Args:
            num_choices: Saved as ``self.num_choices``.
            chosen: Saved as ``self.chosen``; may only be left as ``None``
                when ``query`` is truthy.
            query: When truthy, ``chosen`` is allowed to be ``None``.

        Raises:
            AutoPandasException: If ``chosen`` is ``None`` and ``query`` is
                falsy.
        """
        if chosen is None:
            # A missing label is only acceptable for query graphs.
            if not query:
                raise AutoPandasException(
                    "One of query and chosen needs to be supplied to Choice")

        setattr(self, 'num_choices', num_choices)
        setattr(self, 'chosen', chosen)
def RExt(dtype: DType,
         rgen=None,
         spec: SearchSpec = None,
         depth: int = 1,
         mode: str = None,
         tracker: OpTracker = None,
         arg_name: str = None,
         identifier: str = None,
         constraint: Callable[[Any], Any] = None,
         **kwargs):
    """Yield, in random order, candidate values for an externally-sourced argument.

    Candidates are every entry of ``spec.inputs`` and of
    ``spec.intermediates[:depth - 1]`` for which ``dtype.hasinstance(val)`` and
    ``constraint(val)`` both hold, each wrapped in a ``Fetcher``. When ``rgen``
    is given, one extra candidate stands for "draw a fresh value from
    ``rgen``" and is yielded as ``NewInp(val)``.

    Before each yield the choice is written to ``tracker.record`` under
    ``'ext_<arg_name>_<identifier>'`` (for a ``Fetcher``) or
    ``'rext_<arg_name>_<identifier>'`` (for a fresh value); any entry left
    under either key by the previous candidate is removed first.

    Args:
        dtype: Object whose ``hasinstance`` filters the candidates.
        rgen: Optional iterator producing fresh values.
        spec: Provides ``inputs`` and ``intermediates``.
        depth: Only the first ``depth - 1`` intermediates are considered.
        mode: Must be ``'training-data'``.
        tracker: Object whose ``record`` mapping receives the choice.
        arg_name: Used to build the record keys.
        identifier: Used to build the record keys.
        constraint: Optional predicate on candidate values; defaults to
            accepting everything.
        **kwargs: Accepted and ignored.

    Raises:
        AutoPandasException: If ``mode`` is not ``'training-data'``. Because
            this is a generator, the error surfaces on the first iteration.
    """
    if constraint is None:

        def constraint(x):
            return True

    if mode != 'training-data':
        raise AutoPandasException("Unrecognized mode {} in RExt".format(mode))

    pool: List[Optional[Value]] = []
    candidate_sources = (
        ('inps', spec.inputs),
        ('intermediates', spec.intermediates[:depth - 1]),
    )
    for source, values in candidate_sources:
        for idx, val in enumerate(values):
            if dtype.hasinstance(val) and constraint(val):
                pool.append(Fetcher(val=val, source=source, idx=idx))

    if rgen is not None:
        #  None is the placeholder for "create a new input altogether"
        pool.append(None)

    random.shuffle(pool)
    label = 'ext_' + arg_name + '_' + identifier
    rlabel = 'rext_' + arg_name + '_' + identifier
    for selection in pool:
        #  Drop whatever the previously yielded candidate recorded
        tracker.record.pop(label, None)
        tracker.record.pop(rlabel, None)
        if selection is None:
            try:
                val = next(rgen)
            except StopIteration:
                #  A StopIteration escaping a generator body is turned into a
                #  RuntimeError (PEP 479). An exhausted rgen simply means no
                #  fresh value is available, so skip this candidate.
                continue
            tracker.record[rlabel] = {'val': val, 'arg_name': arg_name}
            yield NewInp(val)

        else:
            tracker.record[label] = {
                'source': selection.source,
                'idx': selection.idx
            }
            yield selection
    def add_domain(self,
                   domain: List[Any],
                   selected_idx: int = None,
                   query: bool = False):
        """Add one node collection per candidate in ``domain`` and wire up its edges.

        For each candidate a collection is obtained from
        ``self.get_domain_node_collection(value, position)``. Its representor
        is appended to ``self.domain_nodes``, internal edges are added,
        external edges to every entry of ``self.node_collections`` are added,
        and its nodes are merged into ``self.nodes``.

        Args:
            domain: Candidate values, in order.
            selected_idx: Index into ``self.domain_nodes`` of the selected
                candidate; the node is stored as ``self.selected_node``.
            query: When truthy, ``selected_idx`` may be omitted.

        Raises:
            AutoPandasException: If ``selected_idx`` is ``None`` and ``query``
                is falsy.
        """
        if selected_idx is None and not query:
            raise AutoPandasException(
                "One of query and selected_idx needs to be supplied to Select")

        candidates: List[GraphNodeCollection] = []
        for position, value in enumerate(domain):
            candidates.append(self.get_domain_node_collection(value, position))

        self.domain_nodes.extend(
            coll.setup_representor(self.edge_collection)
            for coll in candidates)

        #  Adjacency edges amongst the candidates are deliberately NOT added:
        #  there is no implicit order in the set of values passed to Select at
        #  any point, it's just a pool of candidates to choose from. The
        #  earlier approach was:
        # for c1, c2 in zip(domain_node_collections, domain_node_collections[1:]):
        #     c1.add_custom_edge(c2, GraphEdgeType.ADJACENCY, self.edge_collection)

        #  Edges internal to each candidate's collection
        for coll in candidates:
            coll.add_internal_edges(self.edge_collection)

        #  Edges from each candidate to the known collections
        #  (snapshot taken once, before any external edge is added)
        known_collections = tuple(self.node_collections)
        for coll in candidates:
            for other in known_collections:
                coll.add_external_edges(other, collector=self.edge_collection)

        for coll in candidates:
            self.nodes.update(coll.nodes)

        if selected_idx is None:
            return
        self.selected_node = self.domain_nodes[selected_idx]
    def add_iterables(self,
                      iterables: List[List[Any]],
                      selected_indices: List[int] = None,
                      query: bool = False):
        """Add node collections for every element of every iterable.

        For each iterable a new list is appended to ``self.iterables`` and
        filled with the representors of its elements' collections (obtained
        from ``self.get_iterable_node_collection``); internal edges are added
        as each representor is set up. Afterwards external edges to every
        entry of ``self.node_collections`` are added and all nodes are merged
        into ``self.nodes``.

        Args:
            iterables: The iterables, each a list of values.
            selected_indices: One index per iterable; entry ``k`` indexes into
                ``self.iterables[k]``. The resolved nodes are stored as
                ``self.selected_nodes``.
            query: When truthy, ``selected_indices`` may be omitted.

        Raises:
            AutoPandasException: If ``selected_indices`` is ``None`` and
                ``query`` is falsy.
        """
        if selected_indices is None and not query:
            raise AutoPandasException(
                "One of query and selected_indices needs to be supplied to Product"
            )

        # NOTE(review): the per-iterable index offset is a multiple of the
        # NUMBER of iterables, so elements of different iterables receive the
        # same index whenever an iterable holds more than len(iterables)
        # items - confirm this is intended.
        stride = len(iterables)
        grouped_collections: List[List[GraphNodeCollection]] = []
        for iter_num, iterable in enumerate(iterables):
            offset = iter_num * stride
            group = [
                self.get_iterable_node_collection(value, position + offset)
                for position, value in enumerate(iterable)
            ]
            grouped_collections.append(group)

            representors = []
            self.iterables.append(representors)
            for coll in group:
                representors.append(
                    coll.setup_representor(self.edge_collection))
                coll.add_internal_edges(self.edge_collection)

        for group in grouped_collections:
            #  Snapshot of the known collections, taken once per iterable
            known_collections = tuple(self.node_collections)
            for coll in group:
                for other in known_collections:
                    coll.add_external_edges(other,
                                            collector=self.edge_collection)

        for coll in itertools.chain.from_iterable(grouped_collections):
            self.nodes.update(coll.nodes)

        if selected_indices is None:
            return

        chosen = []
        for iter_num, position in enumerate(selected_indices):
            chosen.append(self.iterables[iter_num][position])
        self.selected_nodes = chosen