示例#1
0
def get_most_recent_transcript_manifest(db: Database) -> pd.DataFrame:
    """
    Get a pandas dataframe that can act as a manifest of the most recent transcript available for each event stored in a
    CDP instance's database.

    Parameters
    ----------
    db: Database
        An already initialized database object connected to a CDP instance's database.

    Returns
    -------
    manifest: pandas.DataFrame
        A dataframe with transcript, event, body, and file details where each row is the most recent transcript for the
        event of that row. Rows are ordered by event id and keep the row labels of the merged dataset. An empty
        dataframe is returned when any of the underlying tables has no rows.

    Notes
    -----
    No `limit` is passed to `select_rows_as_list` here; this assumes its default returns every row -- TODO confirm.
    """
    # Pull each table; all joins below are done with pandas
    transcripts = pd.DataFrame(db.select_rows_as_list("transcript"))
    events = pd.DataFrame(db.select_rows_as_list("event"))
    bodies = pd.DataFrame(db.select_rows_as_list("body"))
    files = pd.DataFrame(db.select_rows_as_list("file"))

    # A table with no rows yields a dataframe without the join key columns, which would make `merge` raise a
    # KeyError, so there is nothing to build a manifest from.
    if any(table.empty for table in (transcripts, events, bodies, files)):
        return pd.DataFrame()

    # Attach body details to events and file details to transcripts, then join the two on the event
    events = events.merge(bodies,
                          on="body_id",
                          suffixes=("_event", "_body"))
    transcripts = transcripts.merge(files,
                                    on="file_id",
                                    suffixes=("_transcript", "_file"))
    manifest = transcripts.merge(events,
                                 on="event_id",
                                 suffixes=("_transcript", "_event"))

    # "created_transcript" is the transcript's own "created" column after the file merge applied its suffix.
    # `idxmax` is evaluated per event (ties resolve to the first row of the group); all winning rows are then selected
    # with a single `.loc` call, which keeps the column dtypes and keeps the columns even when no rows match.
    latest_labels = [
        group["created_transcript"].idxmax()
        for _, group in manifest.groupby("event_id")
    ]

    return manifest.loc[latest_labels]
示例#2
0
def _get_transcript_for_event(event_id: str, db: Database,
                              order_by_field: str) -> "TrancriptJoin":
    """
    Look up the top-ranked transcript for a single event.

    Parameters
    ----------
    event_id: str
        The id of the event to find a transcript for.
    db: Database
        An already initialized database object connected to a CDP instance's database.
    order_by_field: str
        The transcript field to sort by, descending, before taking the first row.

    Returns
    -------
    join: TrancriptJoin
        The event id paired with the selected transcript row, or with `None` as the transcript details when no
        transcript was found. A join object is always returned so callers can filter on `transcript_details`.
    """
    # Sort descending and keep a single row: the most recent or highest confidence transcript for the event
    results = db.select_rows_as_list(table="transcript",
                                     filters=[("event_id", event_id)],
                                     order_by=(order_by_field,
                                               OrderOperators.desc),
                                     limit=1)

    # Anything other than exactly one row is treated as "no transcript"
    transcript_details = results[0] if len(results) == 1 else None

    return TrancriptJoin(event_id=event_id,
                         transcript_details=transcript_details)
示例#3
0
def get_transcript_manifest(db: Database,
                            order_by_field: str = "confidence"
                            ) -> pd.DataFrame:
    """
    Get a pandas dataframe that can act as a manifest of a transcript available for each event stored in a CDP
    instance's database.

    Parameters
    ----------
    db: Database
        An already initialized database object connected to a CDP instance's database.
    order_by_field: str
        Which field to order the transcripts by to select the first (highest value) of.
        Matching is case-insensitive.
        Default: "confidence"
        Choices: ["created", "confidence"]

    Returns
    -------
    manifest: pandas.DataFrame
        A dataframe where each row has transcript, event, body, and file details for the event at that row.
        Events without a transcript are left out. An empty dataframe is returned when there are no events or when
        no event has a transcript.

    Raises
    ------
    ValueError
        The provided `order_by_field` is not one of the allowed choices.
    """
    # Enforce that the provided order by field is valid before touching the database, so a bad argument fails fast
    order_by_field = order_by_field.lower()
    if order_by_field not in ALLOWED_ORDER_BY_FIELDS:
        raise ValueError(
            f"Provided `order_by_field` value is not a valid selection for transcript ordering / selection. "
            f"Received: {order_by_field}. Possible choices: {ALLOWED_ORDER_BY_FIELDS}"
        )

    # Get the event dataset
    # (the limit is presumably meant to be large enough to cover every event -- TODO confirm)
    events = pd.DataFrame(db.select_rows_as_list("event", limit=1e6))

    # Without events there is no "event_id" column to work with
    if events.empty:
        return pd.DataFrame()

    # Create transcript get partial
    transcript_get = partial(_get_transcript_for_event,
                             db=db,
                             order_by_field=order_by_field)

    # Threaded request of one transcript per event
    with ThreadPoolExecutor() as exe:
        transcript_joins = list(
            exe.map(transcript_get, events["event_id"].tolist()))

    # Keep only the transcripts that were actually found
    transcript_rows = [
        join.transcript_details for join in transcript_joins
        if join.transcript_details is not None
    ]

    # No event has a transcript, so there is nothing to merge
    if not transcript_rows:
        return pd.DataFrame()

    # Merge transcript data with event data
    # (the default inner merge drops the events that have no transcript)
    events = events.merge(pd.DataFrame(transcript_rows),
                          on="event_id",
                          suffixes=("_event", "_transcript"))

    # Create file get partial
    file_get = partial(_get_file, db=db)

    # Get all the transcript files, requesting each file id once so the merge below cannot multiply rows
    with ThreadPoolExecutor() as exe:
        transcript_file_details = pd.DataFrame(
            list(exe.map(file_get, events["file_id"].drop_duplicates())))

    # Merge transcript file data with event transcript data
    events = events.merge(transcript_file_details,
                          on="file_id",
                          suffixes=("_transcript", "_file"))

    # Get body details and merge
    bodies = pd.DataFrame(db.select_rows_as_list(table="body", limit=1e4))
    events = events.merge(bodies,
                          on="body_id",
                          suffixes=("_event", "_body"))

    return events
示例#4
0
def _get_file(file_id: str, db: Database) -> Dict[str, Any]:
    """
    Fetch a single row from the "file" table by its id.

    Parameters
    ----------
    file_id: str
        The id of the file row to retrieve.
    db: Database
        An already initialized database object connected to a CDP instance's database.

    Returns
    -------
    file_details: Dict[str, Any]
        Whatever `db.select_row_by_id` returns for the requested file id.
    """
    file_details = db.select_row_by_id(table="file", id=file_id)
    return file_details