def _preprocessing_test(self, anns, new_anns, qid, vids):
    """ Preprocessing annotations
    Args:
        anns: annotations
        new_anns: dict to be filled with preprocessed annotations
        qid: start query id
        vids: list of video ids collected so far
    Returns:
        new_anns: preprocessed annotations
        qid: last query id
        vids: deduplicated list of video ids
    """
    translator = str.maketrans("", "", string.punctuation)
    for vid in anns.keys():
        ann = anns[vid]
        duration = ann["duration"]
        # One entry per (timestamp, sentence) pair, keyed by a running query id.
        for ts, q in zip(ann["timestamps"], ann["sentences"]):
            new_anns[str(qid)] = {
                "timestamps": ts,
                "query": q,
                "tokens": utils.tokenize(q.lower(), translator),
                "duration": duration,
                "video_id": vid
            }
            qid += 1
    vids.extend(list(anns.keys()))
    return new_anns, qid, list(set(vids))
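# Hedged usage sketch (not from the original repo): _preprocessing_test assumes
# annotations keyed by video id, each holding a duration plus parallel lists of
# timestamps and sentences. The video id, values, and sentences below are made
# up for illustration; only the field names read above are assumed.
example_anns = {
    "v_example_001": {
        "duration": 82.7,                                 # video length in seconds
        "timestamps": [[0.8, 19.9], [17.4, 60.8]],        # one [start, end] per query
        "sentences": ["A person walks into the room.",
                      "The person sits down at a desk."],
    }
}
# new_anns, qid, vids = self._preprocessing_test(example_anns, {}, 0, [])
# would produce entries such as:
# new_anns["0"] == {"timestamps": [0.8, 19.9], "query": "A person walks into the room.",
#                   "tokens": [...], "duration": 82.7, "video_id": "v_example_001"}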
def _preprocessing(self, anns, aux_ann_path):
    """ Preprocessing annotations
    Args:
        anns: annotations
        aux_ann_path: path to annotations with auxiliary information (e.g., duration)
    Returns:
        new_anns: preprocessed annotations
        vids: deduplicated list of video ids
    """
    aux_anns = io_utils.load_csv(aux_ann_path)
    vid2len = {ann["id"]: ann["length"] for ann in aux_anns}

    vids = []
    new_anns = dict()
    translator = str.maketrans("", "", string.punctuation)
    for qid, ann in enumerate(anns):
        # Each raw annotation line has the form "<video_id> <start> <end>##<query>".
        info, query = ann.split("##")
        vid, spos, epos = info.split(" ")
        duration = vid2len[vid]
        new_anns[str(qid)] = {
            "timestamps": [float(spos), float(epos)],
            "query": query,
            "tokens": utils.tokenize(query.lower(), translator),
            "duration": float(duration),
            "video_id": vid
        }
        vids.append(vid)
    return new_anns, list(set(vids))
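# Hedged usage sketch (illustrative only): _preprocessing expects raw annotation
# strings of the form "<video_id> <start> <end>##<query>" plus a CSV with "id"
# and "length" columns loaded via io_utils.load_csv. The file name, ids, and
# values below are hypothetical.
example_anns = [
    "VID01 0.0 6.9##a person puts a book on a shelf.",
    "VID01 7.2 20.1##a person turns off the light.",
]
# durations.csv (hypothetical) would contain rows like:
#   id,length
#   VID01,30.9
# new_anns, vids = self._preprocessing(example_anns, "durations.csv")
# new_anns["0"] == {"timestamps": [0.0, 6.9], "query": "a person puts a book on a shelf.",
#                   "tokens": [...], "duration": 30.9, "video_id": "VID01"}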
def split_in_paragraphs(document: WikipediaDocument, max_n_tokens=300) -> List[WikipediaDocument]:
    splitted_documents = []
    tokens = utils.tokenize(document.content)

    current_split_tokens = []
    for tok in tokens:
        current_split_tokens.append(tok)
        # Flush a paragraph once it reaches max_n_tokens tokens.
        if len(current_split_tokens) >= max_n_tokens:
            paragraph = ' '.join(current_split_tokens)
            splitted_documents.append(
                WikipediaDocument(id=document.id, title=document.title,
                                  content=paragraph, label=document.label))
            current_split_tokens = []

    # Flush any remaining tokens as a final, possibly shorter, paragraph.
    if current_split_tokens:
        paragraph = ' '.join(current_split_tokens)
        splitted_documents.append(
            WikipediaDocument(id=document.id, title=document.title,
                              content=paragraph, label=document.label))

    return splitted_documents
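# Hedged usage sketch: chunking one long WikipediaDocument into ~300-token
# paragraphs. The constructor arguments and content are hypothetical; only the
# fields accessed inside split_in_paragraphs (id, title, content, label) are
# assumed to exist on WikipediaDocument.
doc = WikipediaDocument(id="12345", title="Example Article",
                        content="Example Article is a long page whose text ...",
                        label=None)
chunks = split_in_paragraphs(doc, max_n_tokens=300)
# Each chunk keeps the original id/title/label and holds at most 300 tokens of
# the content; the final chunk may be shorter.
for chunk in chunks:
    print(chunk.title, len(utils.tokenize(chunk.content)))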