예제 #1
0
def search(url: str,
           query: dict,
           data: "typing.Union[pd.DataFrame, str, d3m_ds.Dataset]" = None,
           send_data: bool = True,
           max_return_docs: int = 20,
           return_named_entity: bool = False) -> typing.List[Dataset]:
    """
    Follow the API defined by https://datadrivendiscovery.org/wiki/display/work/Python+API

    Args:
        url: str - the datamart server (for ISI's datamart it is meaningless, just a flag)
        query: JSON object describing the query(https://datadrivendiscovery.org/wiki/display/work/Query+results+schema)
        data: the data you are trying to augment. It can be provided as one of:
            - a pandas.DataFrame object
            - a D3M Dataset object
            - the path to a D3M datasetDoc.json file
            - the path to a CSV file
        send_data: (for ISI's datamart it is meaningless)
        max_return_docs: maximum number of documents requested per Elasticsearch query.
        return_named_entity: whether named-entity information is requested from the index.

    Returns: a list of datamart.Dataset objects

    """
    if not url.startswith(SEARCH_URL):
        return []

    loaded_data = DataLoader.load_data(data)
    augmenter = Augment(es_index=PRODUCTION_ES_INDEX)

    es_results = []
    if (query and ('required_variables' in query)) or (loaded_data is None):
        # "required_variables" was supplied explicitly, or there is no data
        # to derive one from: run the query as-is.
        es_results = augmenter.query_by_json(query, loaded_data,
                                             size=max_return_docs,
                                             return_named_entity=return_named_entity) or []
    else:
        # No "required_variables" in the query JSON, but the dataset exists:
        # try each queryable column as "required_variables" and concat the results.
        # Work on a copy so the caller's dict is not mutated.
        query = dict(query) if query else {}
        seen_ids = set()
        for col in loaded_data:
            if not Utils.is_column_able_to_query(loaded_data[col]):
                continue
            query['required_variables'] = [{
                "type": "dataframe_columns",
                "names": [col]
            }]
            cur_results = augmenter.query_by_json(query, loaded_data,
                                                  size=max_return_docs,
                                                  return_named_entity=return_named_entity)
            # De-duplicate hits that matched on more than one column.
            for res in cur_results or []:
                if res['_id'] not in seen_ids:
                    # TODO: how about the score ??
                    seen_ids.add(res['_id'])
                    es_results.append(res)
    return [Dataset(es_result, original_data=loaded_data, query_json=query)
            for es_result in es_results]
예제 #2
0
def search(
    query: dict,
    data: "typing.Union[pd.DataFrame, str, d3m_ds.Dataset]" = None
) -> typing.List[Dataset]:
    """
    Follow the API defined by https://datadrivendiscovery.org/wiki/display/work/Python+API

    Args:
        query: JSON object describing the query(https://datadrivendiscovery.org/wiki/display/work/Query+results+schema)
        data: the data you are trying to augment. It can be provided as one of:
            - a pandas.DataFrame object
            - a D3M Dataset object
            - the path to a D3M datasetDoc.json file
            - the path to a CSV file

    Returns: a list of datamart.Dataset objects.

    """
    loaded_data = DataLoader.load_data(data)
    augmenter = Augment(es_index=DEFAULT_ES)
    if not (query and
            ('required_variables' in query)) and (loaded_data is not None):
        # No "required_variables" supplied but data exists: derive it from
        # every queryable column. Copy first so the caller's dict is not mutated.
        query = dict(query) if query else {}
        query['required_variables'] = [
            {"type": "dataframe_columns", "names": [col]}
            for col in loaded_data
            if Utils.is_column_able_to_query(loaded_data[col])
        ]
    es_results = augmenter.query_by_json(query, loaded_data)
    if not es_results:
        return []
    return [
        Dataset(es_result, original_data=loaded_data, query_json=query)
        for es_result in es_results
    ]