Exemplo n.º 1
0
def build_question_and_duration(split="tiny"):
    """Build a per-question dataset with question duration and response time.

    Args:
        split: corpus split name forwarded to ``data_readers.read_corpus``.

    Returns:
        ``pd.DataFrame`` with columns ``session_id``, ``question``,
        ``question_duration_sec``, ``response_time_sec`` — one row per
        (question, response) pair across all sessions.
    """
    corpus = data_readers.read_corpus(split)
    columns = {
        "session_id": [],
        "question": [],
        "question_duration_sec": [],
        "response_time_sec": [],
    }

    for session in progressbar.progressbar(data_util.get_sessions(corpus)):
        for question, response in session.iter_question_and_response():
            columns["session_id"].append(session.id)
            columns["question"].append(question.row.text)
            columns["question_duration_sec"].append(question.duration)
            # NOTE(review): timedelta.seconds wraps past 24h — confirm
            # .total_seconds() is not what was intended here.
            elapsed = (response.row.created_at - question.row.created_at).seconds
            columns["response_time_sec"].append(elapsed)

    return pd.DataFrame.from_dict(columns)
Exemplo n.º 2
0
def build_question_and_sentiment(split="tiny"):
    """Build a per-question dataset with sentiment, using a worker pool.

    Sessions are processed in parallel by ``process_session`` (12 workers);
    each worker returns a dict of lists which are merged key-by-key.

    Args:
        split: corpus split name forwarded to ``data_readers.read_corpus``.

    Returns:
        ``pd.DataFrame`` with columns ``session_id``, ``question``,
        ``question_sentiment``, ``response_time_sec``.
    """
    corpus = data_readers.read_corpus(split)

    # NOTE(review): the pool is never closed/joined — consider a
    # ``with Pool(12) as pool:`` block; left as-is to preserve behavior.
    worker_pool = Pool(12)
    sessions = data_util.get_sessions(corpus)

    merged = defaultdict(list)
    results = worker_pool.imap(process_session, sessions)
    for per_session in progressbar.progressbar(results, max_value=len(sessions)):
        for key, values in per_session.items():
            merged[key].extend(values)

    return pd.DataFrame.from_dict({
        "session_id": merged["session_ids"],
        "question": merged["questions"],
        "question_sentiment": merged["sentiments"],
        "response_time_sec": merged["response_times_sec"],
    })
Exemplo n.º 3
0
def build_question_only(split="tiny", concatenator=None):
    """Build a dataset of bare questions and their response times.

    Args:
        split: corpus split name forwarded to ``data_readers.read_corpus``.
        concatenator: optional strategy object forwarded to
            ``iter_question_and_response`` — presumably merges consecutive
            turns into one question; verify against the Session API.

    Returns:
        ``pd.DataFrame`` with columns ``session_id``, ``question``,
        ``response_time_sec``.
    """
    corpus = data_readers.read_corpus(split)
    columns = {"session_id": [], "question": [], "response_time_sec": []}

    sessions = data_util.get_sessions(corpus)
    # Manual progress bar: updated once per session, finished after the
    # DataFrame is assembled (matching the original ordering).
    progress = progressbar.ProgressBar(max_value=len(sessions)).start()
    for index, session in enumerate(sessions):
        for question, response in session.iter_question_and_response(concatenator=concatenator):
            columns["session_id"].append(session.id)
            columns["question"].append(question.row.text)
            elapsed = (response.row.created_at - question.row.created_at).seconds
            columns["response_time_sec"].append(elapsed)
        progress.update(index)

    dataset = pd.DataFrame.from_dict(columns)
    progress.finish()
    return dataset
Exemplo n.º 4
0
def build_label_counts(split="tiny"):
    """Build a dataset of running per-session label counts per question.

    For each question, records the counts of response-time labels observed
    so far in the same session (a tuple ordered by ``Config.LABELS``) —
    the snapshot is taken *before* the current question's own label is
    counted, so the first question of every session sees all zeros.

    Args:
        split: corpus split name forwarded to ``data_readers.read_corpus``.

    Returns:
        ``pd.DataFrame`` with columns ``session_id``, ``response_time_sec``,
        ``label_counts``.
    """
    corpus = data_readers.read_corpus(split)
    columns = {"session_id": [], "response_time_sec": [], "label_counts": []}

    for session in progressbar.progressbar(data_util.get_sessions(corpus)):
        running_counts = defaultdict(int)
        for question, response in session.iter_question_and_response():
            elapsed = (response.row.created_at - question.row.created_at).seconds

            # Snapshot the counts seen so far, then fold in this question.
            columns["session_id"].append(session.id)
            columns["response_time_sec"].append(elapsed)
            columns["label_counts"].append(
                tuple(running_counts[label] for label in Config.LABELS))

            running_counts[get_response_time_label(elapsed)] += 1

    return pd.DataFrame.from_dict(columns)
Exemplo n.º 5
0
def build_question_with_context_window(split="tiny", window_size=0):
    """Build a question dataset augmented with the preceding conversation turns.

    For each question, walks backwards up to ``window_size`` turns (offset 0
    is the question's own turn and is skipped) and records, per offset i in
    1..window_size, columns ``turn_text-i``, ``turn_speaker-i`` and
    ``turn_time-i`` (seconds elapsed between turn i and the turn after it).
    Missing turns fall back to defaults: an empty list for text,
    ``Config.EMPTY_TAG`` for speaker, and 0 for elapsed time.

    Args:
        split: corpus split name forwarded to ``data_readers.read_corpus``.
        window_size: number of preceding turns to attach per question.

    Returns:
        ``pd.DataFrame`` with ``session_id``, ``question``,
        ``response_time_sec`` plus the per-offset context columns.
    """
    corpus = data_readers.read_corpus(split)

    base = {"session_id": [], "question": [], "response_time_sec": []}
    ctx_texts = defaultdict(list)
    ctx_speakers = defaultdict(list)
    ctx_times = defaultdict(list)

    for session in progressbar.progressbar(data_util.get_sessions(corpus)):
        for question, response in session.iter_question_and_response():
            base["session_id"].append(session.id)
            base["question"].append(question.row.text)
            elapsed = (response.row.created_at - question.row.created_at).seconds
            base["response_time_sec"].append(elapsed)

            # Per-offset lookups with the same defaults the columns fall
            # back to when the window extends past the session start.
            text_at = defaultdict(list)
            speaker_at = defaultdict(lambda: Config.EMPTY_TAG)
            gap_at = defaultdict(int)

            previous_time = question.row.created_at
            turns = session.iter_turns(start_row=question.index,
                                       num_turns=window_size + 1,
                                       direction=-1)
            for offset, turn in enumerate(turns):
                text_at[offset] = turn.text
                speaker_at[offset] = turn.sent_from
                # Gap to the chronologically-next turn (walking backwards).
                gap_at[offset] = int((previous_time - turn.created_at).seconds)
                previous_time = turn.created_at

            # Offset 0 is the question itself — only 1..window_size become columns.
            for offset in range(1, window_size + 1):
                ctx_texts["turn_text-%d" % offset].append(text_at[offset])
                ctx_speakers["turn_speaker-%d" % offset].append(speaker_at[offset])
                ctx_times["turn_time-%d" % offset].append(gap_at[offset])

    columns = dict(base)
    columns.update(ctx_texts)
    columns.update(ctx_speakers)
    columns.update(ctx_times)
    return pd.DataFrame.from_dict(columns)
Exemplo n.º 6
0
def split_data(data, tiny_f=0.01, train_f=0.7, dev_f=0.15, test_f=0.15):
    """Split a DataFrame into tiny/train/dev/test partitions by session.

    Sessions are grouped by question count (stratification via
    ``get_stratified_session_ids``), each group is shuffled with a fixed
    seed, and session ids are divided by the given fractions. ``tiny`` is
    a prefix of ``train`` sized relative to the total number of sessions.

    Args:
        data: DataFrame with a ``session_id`` column.
        tiny_f: fraction of all sessions kept in the tiny split (must be
            smaller than ``train_f``).
        train_f, dev_f, test_f: split fractions; must sum to ~1.0.

    Returns:
        Tuple of four DataFrames: (tiny, train, dev, test) row subsets.
    """
    unique_ids = data.session_id.unique()
    # NOTE(review): asserts are stripped under -O; consider raising
    # ValueError instead (kept as-is to preserve the raised exception type).
    assert abs(train_f + dev_f + test_f - 1.0) * len(unique_ids) < 1
    assert tiny_f < train_f
    log_info("Splitting %d session_ids" % len(unique_ids))

    log_info("Extracting Sessions")
    sessions = data_util.get_sessions(data)
    print("\tExtracted %d sessions" % len(sessions))

    id_to_question_count = defaultdict(int)
    count_to_ids = defaultdict(list)
    for session in sessions:
        question_count = len(tuple(session.iter_question_and_response()))
        id_to_question_count[session.id] = question_count
        count_to_ids[question_count].append(session.id)

    groups = get_stratified_session_ids(count_to_ids, min([train_f, dev_f, test_f]))

    splits = defaultdict(list)
    for group in groups:
        # Reseed per group so each group's shuffle is reproducible in isolation.
        np.random.seed(seed=Config.SEED)
        np.random.shuffle(group)

        train_cut = int(np.round(train_f * len(group)))
        dev_cut = int(np.round((train_f + dev_f) * len(group)))
        train_part, dev_part, test_part = np.split(group, [train_cut, dev_cut])
        splits["train"].extend(train_part)
        splits["dev"].extend(dev_part)
        splits["test"].extend(test_part)

    # tiny is a subset of train
    splits["tiny"] = splits["train"][:int(np.round(tiny_f * len(unique_ids)))]
    for split_name, ids in splits.items():
        question_total = sum(id_to_question_count[sid] for sid in ids)
        print("\t%s: %d sessions, %d questions" % (split_name, len(ids), question_total))

    return (data[data.session_id.isin(splits["tiny"])],
            data[data.session_id.isin(splits["train"])],
            data[data.session_id.isin(splits["dev"])],
            data[data.session_id.isin(splits["test"])])