def get_most_recent_transcript_manifest(db: Database) -> pd.DataFrame:
    """
    Get a pandas dataframe that can act as a manifest of the most recent
    transcript available for each event stored in a CDP instance's database.

    Parameters
    ----------
    db: Database
        An already initialized database object connected to a CDP instance's
        database.

    Returns
    -------
    manifest: pandas.DataFrame
        A dataframe with transcript, event, body, and file details where each
        row is the most recent transcript for the event of that row. Rows are
        ordered by `event_id` and keep the index labels of the merged dataset.
        An empty dataframe (no columns) is returned when any of the source
        tables has no rows or when nothing survives the joins.
    """
    # Pull every table needed to assemble the manifest
    transcripts = pd.DataFrame(db.select_rows_as_list("transcript"))
    events = pd.DataFrame(db.select_rows_as_list("event"))
    bodies = pd.DataFrame(db.select_rows_as_list("body"))
    files = pd.DataFrame(db.select_rows_as_list("file"))

    # A table without rows becomes a dataframe without columns, which would
    # make the merges below fail with a KeyError on the join key.
    if transcripts.empty or events.empty or bodies.empty or files.empty:
        return pd.DataFrame()

    # Attach body details to events and file details to transcripts, then
    # join both halves on the event. Overlapping column names (for example
    # `created`) are disambiguated by the suffixes.
    events = events.merge(bodies, on="body_id", suffixes=("_event", "_body"))
    transcripts = transcripts.merge(
        files, on="file_id", suffixes=("_transcript", "_file")
    )
    transcripts = transcripts.merge(
        events, on="event_id", suffixes=("_transcript", "_event")
    )

    # Inner joins may leave nothing behind
    if transcripts.empty:
        return pd.DataFrame()

    # For every event take the index label of the row whose transcript was
    # created last, then select those rows in one go. This keeps the column
    # dtypes intact instead of rebuilding the frame from per-row Series.
    newest = transcripts.groupby("event_id")["created_transcript"].idxmax()
    return transcripts.loc[newest]
def _get_transcript_for_event(
    event_id: str, db: Database, order_by_field: str
) -> "TrancriptJoin":
    """
    Look up the top ranked transcript for a single event.

    Parameters
    ----------
    event_id: str
        The id of the event to find a transcript for.
    db: Database
        An already initialized database object connected to a CDP instance's
        database.
    order_by_field: str
        The transcript field to sort by, in descending order, before taking
        the first row.

    Returns
    -------
    join: TrancriptJoin
        The event id paired with the selected transcript row, or with `None`
        as `transcript_details` when the event has no transcript so that the
        caller can filter it out.
    """
    # Ask only for the single highest ranked transcript of this event
    results = db.select_rows_as_list(
        table="transcript",
        filters=[("event_id", event_id)],
        order_by=(order_by_field, OrderOperators.desc),
        limit=1,
    )

    # The query is ordered descending, so the first row is the one wanted;
    # no rows means there is no transcript for this event.
    details = results[0] if results else None
    return TrancriptJoin(event_id=event_id, transcript_details=details)
def get_transcript_manifest(db: Database, order_by_field: str = "confidence") -> pd.DataFrame:
    """
    Get a pandas dataframe that can act as a manifest of a transcript available
    for each event stored in a CDP instance's database.

    Parameters
    ----------
    db: Database
        An already initialized database object connected to a CDP instance's
        database.
    order_by_field: str
        Which field to order the transcripts by to select the first (highest
        value) of. Matching is case-insensitive.
        Default: "confidence"
        Choices: ["created", "confidence"]

    Returns
    -------
    manifest: pandas.DataFrame
        A dataframe where each row has transcript, event, body, and file
        details for the event at that row. Events without a transcript are
        left out. An empty dataframe (no columns) is returned when there are
        no events or none of them has a transcript.

    Raises
    ------
    ValueError
        If `order_by_field` is not one of the allowed ordering fields.
    """
    # Enforce that the provided order by field is valid before doing any
    # database work
    order_by_field = order_by_field.lower()
    if order_by_field not in ALLOWED_ORDER_BY_FIELDS:
        raise ValueError(
            "Provided `order_by_field` value is not a valid selection for transcript ordering / selection. "
            f"Received: {order_by_field}. "
            f"Possible choices: {ALLOWED_ORDER_BY_FIELDS}"
        )

    # Get the events to find transcripts for (integer row limit)
    events = pd.DataFrame(db.select_rows_as_list("event", limit=1_000_000))

    # No rows means no columns, so there is no `event_id` to iterate over
    if events.empty:
        return pd.DataFrame()

    # Create transcript get partial
    transcript_get = partial(
        _get_transcript_for_event, db=db, order_by_field=order_by_field
    )

    # Threaded request all transcripts, one lookup per event
    with ThreadPoolExecutor() as exe:
        transcript_joins = list(exe.map(transcript_get, events.event_id))

    # Keep only the events that actually have a transcript
    transcript_rows = [
        join.transcript_details
        for join in transcript_joins
        if join.transcript_details is not None
    ]
    if not transcript_rows:
        return pd.DataFrame()
    transcripts = pd.DataFrame(transcript_rows)

    # Merge transcript data with event data; the default inner join drops
    # every event that has no transcript row
    events = events.merge(
        transcripts, on="event_id", suffixes=("_event", "_transcript")
    )

    # Create file get partial
    file_get = partial(_get_file, db=db)

    # Get all the transcript files, requesting each distinct file only once
    # so that repeated ids cannot multiply rows in the merge below
    with ThreadPoolExecutor() as exe:
        transcript_file_details = pd.DataFrame(
            list(exe.map(file_get, events.file_id.unique()))
        )

    # Merge transcript file data with event transcript data
    events = events.merge(
        transcript_file_details, on="file_id", suffixes=("_transcript", "_file")
    )

    # Get body details and merge
    bodies = pd.DataFrame(db.select_rows_as_list(table="body", limit=10_000))
    events = events.merge(bodies, on="body_id", suffixes=("_event", "_body"))

    return events
def _get_file(file_id: str, db: Database) -> Dict[str, Any]:
    """
    Fetch a single row from the "file" table by its id.

    Parameters
    ----------
    file_id: str
        The id of the file row to retrieve.
    db: Database
        An already initialized database object connected to a CDP instance's
        database.

    Returns
    -------
    file_details: Dict[str, Any]
        Whatever `db.select_row_by_id` returns for the requested file.
    """
    file_details = db.select_row_by_id(table="file", id=file_id)
    return file_details