Example #1
def read_label_counts_data(split="tiny"):
    dtypes = {"response_time_sec": np.int32, "session_id": np.int32}
    converters = {"label_counts": ast.literal_eval}
    path = Config.LABEL_COUNTS_DATASET_FILE(split)
    data = pd.read_csv(path,
                       sep=",",
                       header=0,
                       dtype=dtypes,
                       converters=converters)
    log_info("Read %s data with %d rows" % (path.stem, data.shape[0]))
    return data
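A minimal, self-contained sketch of the converter pattern above, using toy in-memory data rather than the project's label-counts file: columns that store serialized Python literals are parsed back into objects by ast.literal_eval at read time.

import ast
import io

import pandas as pd

# Toy CSV where "label_counts" holds a serialized Python dict per row.
csv_text = (
    "session_id,response_time_sec,label_counts\n"
    "1,30,\"{'greeting': 2, 'question': 1}\"\n"
    "2,45,\"{'question': 3}\"\n"
)
toy = pd.read_csv(io.StringIO(csv_text),
                  converters={"label_counts": ast.literal_eval})
print(toy.label_counts[0]["greeting"])  # -> 2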
Example #2
def read_question_and_response_data(split="tiny"):
    dtypes = {"response_time_sec": np.int32, "session_id": np.int32}
    converters = {"question": ast.literal_eval, "response": ast.literal_eval}
    path = Config.QUESTION_TEXT_AND_RESPONSE_TEXT_DATASET_FILE(split)
    data = pd.read_csv(path,
                       sep=",",
                       header=0,
                       dtype=dtypes,
                       converters=converters)
    log_info("Read %s data with %d rows" % (path.stem, data.shape[0]))
    return data
Example #3
def read_question_and_context_data(split="tiny",
                                   window_size=1,
                                   include_question_text=True,
                                   include_context_text=True,
                                   include_context_speaker=True,
                                   include_context_times=False):
    assert window_size <= Config.MAX_CONTEXT_WINDOW_SIZE
    dtypes = {"response_time_sec": np.int32, "session_id": np.int32}
    converters = {}

    if include_context_speaker:
        for i in range(1, window_size + 1):
            dtypes["turn_speaker-%d" % i] = str

    def to_float(t):
        try:
            return np.float32(t)
        except (TypeError, ValueError):
            # Missing or malformed turn times fall back to -1.
            return np.float32(-1)

    if include_context_times:
        for i in range(1, window_size + 1):
            converters["turn_time-%d" % i] = to_float

    if include_question_text:
        converters["question"] = ast.literal_eval

    if include_context_text:
        for i in range(1, window_size + 1):
            converters["turn_text-%d" % i] = ast.literal_eval

    path = Config.QUESTION_AND_CONTEXT_WINDOW_DATASET_FILE(split)
    data = pd.read_csv(path,
                       sep=",",
                       header=0,
                       dtype=dtypes,
                       converters=converters)

    drop_columns = set(
        data.columns.values) - (set(dtypes.keys()) | set(converters.keys()))
    data.drop(labels=drop_columns, axis="columns", inplace=True)

    log_info("Read %s data with %d rows" % (path.stem, data.shape[0]))
    return data
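The set arithmetic before the return keeps only the columns that were explicitly typed or converted and drops everything else. A toy illustration of that pattern, with made-up column names:

import pandas as pd

df = pd.DataFrame({"question": [1], "turn_text-1": [2], "unused_column": [3]})
keep = {"question", "turn_text-1"}  # i.e. set(dtypes) | set(converters)
drop_columns = set(df.columns.values) - keep
df.drop(labels=list(drop_columns), axis="columns", inplace=True)
print(list(df.columns))  # -> ['question', 'turn_text-1']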
Example #4
def split_data(data, tiny_f=0.01, train_f=0.7, dev_f=0.15, test_f=0.15):
    session_ids = data.session_id.unique()
    assert abs(train_f + dev_f + test_f - 1.0) * len(session_ids) < 1
    assert tiny_f < train_f
    log_info("Splitting %d session_ids" % len(session_ids))

    log_info("Extracting Sessions")
    sessions = data_util.get_sessions(data)
    print("\tExtracted %d sessions" % len(sessions))

    session_id_to_num_questions = defaultdict(int)
    num_questions_to_session_ids = defaultdict(list)
    for session in sessions:
        num_questions = len(tuple(session.iter_question_and_response()))
        session_id_to_num_questions[session.id] = num_questions
        num_questions_to_session_ids[num_questions].append(session.id)

    groups = get_stratified_session_ids(num_questions_to_session_ids,
                                        min([train_f, dev_f, test_f]))

    session_id_splits = defaultdict(list)
    for group in groups:
        np.random.seed(seed=Config.SEED)
        np.random.shuffle(group)

        train_split, dev_split, test_split = np.split(
            group,
            [int(np.round(train_f * len(group))),
             int(np.round((train_f + dev_f) * len(group)))])
        session_id_splits["train"].extend(train_split)
        session_id_splits["dev"].extend(dev_split)
        session_id_splits["test"].extend(test_split)
    # tiny is a subset of train
    session_id_splits["tiny"] = session_id_splits["train"][:int(np.round(tiny_f * len(session_ids)))]
    for split, ids in session_id_splits.items():
        num_q = sum(session_id_to_num_questions[sid] for sid in ids)
        print("\t%s: %d sessions, %d questions" % (split, len(ids), num_q))

    return (data[data.session_id.isin(session_id_splits["tiny"])],
            data[data.session_id.isin(session_id_splits["train"])],
            data[data.session_id.isin(session_id_splits["dev"])],
            data[data.session_id.isin(session_id_splits["test"])])
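A minimal sketch of the np.split trick used for the train/dev/test partition, on a toy array of ids (the numbers are hypothetical):

import numpy as np

group = np.arange(20)           # toy session ids
train_f, dev_f = 0.7, 0.15

rng = np.random.RandomState(0)  # fixed seed for reproducibility
rng.shuffle(group)

# Cut at the cumulative fractions: first 70% train, next 15% dev, rest test.
train, dev, test = np.split(group,
                            [int(np.round(train_f * len(group))),
                             int(np.round((train_f + dev_f) * len(group)))])
print(len(train), len(dev), len(test))  # -> 14 3 3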
Example #5
def read_corpus(split=None):
    dtypes = {
        'session_id': np.int32,
        'created_at': object,
        'sent_from': str,
        'sent_to': str,
        'content_type': str
    }
    converters = {"text": ast.literal_eval}
    if split is None:
        path = Config.CORPUS_FILE
        split = "entire"
    else:
        path = Config.CORPUS_SPLIT_FILE(split)

    data = pd.read_csv(path,
                       sep=",",
                       header=0,
                       dtype=dtypes,
                       parse_dates=["created_at"],
                       converters=converters)

    log_info("Read %s corpus with %d rows" % (split, data.shape[0]))
    return data
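A self-contained sketch of the parse_dates behaviour relied on here, with toy rows: the created_at column comes back as datetime64 rather than a plain object/string column.

import io

import pandas as pd

csv_text = (
    "session_id,created_at,text\n"
    "1,2019-03-01 10:15:00,\"['hello']\"\n"
)
toy = pd.read_csv(io.StringIO(csv_text), parse_dates=["created_at"])
print(toy.created_at.dtype)  # -> datetime64[ns]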
Example #6
    dest = os.path.join(Config.DATA_DIR, destname)
    return dest

if __name__ == "__main__":
    assert Path(Config.CORPUS_FILE).exists(), "%s does not exist" % Config.CORPUS_FILE
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--dataset", dest="dataset", type=str, default=Dataset.QUESTION_ONLY.name,
            help="Which dataset to build. Defaults to QUESTION_ONLY")
    args = parser.parse_args()
    args.dataset = Dataset[args.dataset]

    builders = {Dataset.QUESTION_ONLY: build_question_only,
                Dataset.QUESTION_AND_INDEX: build_question_and_index,
                Dataset.QUESTION_AND_DURATION: build_question_and_duration,
                Dataset.QUESTION_AND_SENTIMENT: build_question_and_sentiment,
                Dataset.QUESTION_AND_NEWLINES: lambda split: build_question_only(split, concatenator="\n"),
                Dataset.QUESTION_AND_CONTEXT_WINDOW: lambda split: build_question_with_context_window(split, window_size=Config.MAX_CONTEXT_WINDOW_SIZE),
                Dataset.QUESTION_TEXT_AND_RESPONSE_TEXT: build_question_text_and_response_text,
                Dataset.LABEL_COUNTS: build_label_counts}

    log_info("Building the %s dataset" % args.dataset.name.lower())

    for split in Config.SPLITS:
        log_info("Building %s" % split)
        dataset = builders[args.dataset](split)
        print("\tExtracted %s samples" % dataset.shape[0])

        dest = get_dest_name(split)
        print("\tWriting dataset to %s" % dest)
        dataset.to_csv(dest, index=False)
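The dispatch above relies on two patterns: looking an Enum member up by name (Dataset[args.dataset]) and mapping members to builder callables. A toy version of that pattern, with made-up names:

from enum import Enum, auto

class Dataset(Enum):  # stand-in for the project's Dataset enum
    QUESTION_ONLY = auto()
    LABEL_COUNTS = auto()

def build_question_only(split):
    return "question_only/%s" % split

builders = {Dataset.QUESTION_ONLY: build_question_only,
            Dataset.LABEL_COUNTS: lambda split: "label_counts/%s" % split}

choice = Dataset["QUESTION_ONLY"]  # name -> member, as in Dataset[args.dataset]
print(builders[choice]("tiny"))    # -> question_only/tiny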
Example #7
    return (data[data.session_id.isin(session_id_splits["tiny"])],
            data[data.session_id.isin(session_id_splits["train"])],
            data[data.session_id.isin(session_id_splits["dev"])],
            data[data.session_id.isin(session_id_splits["test"])])

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--train" , dest="train", type=float,
            default = 0.7, help="Fraction of train data")
    parser.add_argument("-d", "--dev" , dest="dev", type=float,
            default = 0.15, help="Fraction of dev data")
    parser.add_argument("-e", "--test" , dest="test", type=float,
            default = 0.15, help="Fraction of test data")
    parser.add_argument("-y", "--tiny" , dest="tiny", type=float,
            default = 0.01, help="Fraction of tiny data")
    args = parser.parse_args()
    assert args.tiny < args.train

    log_info("Reading Corpus")
    data = data_readers.read_corpus()

    tiny, train, dev, test = split_data(data, tiny_f=args.tiny, train_f=args.train, dev_f=args.dev, test_f=args.test)
    splits = {"tiny": tiny, "train": train, "dev": dev, "test": test}

    for name, split in splits.items():
        dest = get_dest(split=name)
        log_info("Writing %d %s rows to %s" % (split.shape[0], name, dest))
        split.to_csv(dest, index=False)

Example #8
        "--dest",
        dest="dest",
        type=str,
        default=None,
        help="Path to destination file. Defaults to {datafile}_processed.csv")
    args = parser.parse_args()

    path = Path(args.datafile).resolve()
    assert path.exists() and path.is_file() and path.suffix == '.csv'

    if args.dest is None:
        args.dest = os.path.join(str(path.parent),
                                 path.stem + "_preprocessed" + path.suffix)

    if Path(Config.REMOVED_ROWS_FILE).exists():
        log_info("Deleting %s" % Config.REMOVED_ROWS_FILE)
        os.remove(Config.REMOVED_ROWS_FILE)

    log_info("Reading CSV file")
    data = read_csv(args.datafile)

    log_info("Parsing timestamps")
    data = parse_timestamps(data)

    log_info("Sorting data")
    data = data.sort_values(by=["session_id", "created_at"],
                            ascending=[True, True],
                            axis="index")

    log_info("Deduping utterances")
    data = dedupe_utterances(data)
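A small sketch of the default-destination logic near the top of this example, with a hypothetical input path: the output file name is derived from the input file's stem and suffix.

import os
from pathlib import Path

path = Path("data/corpus.csv")  # hypothetical input file
dest = os.path.join(str(path.parent), path.stem + "_preprocessed" + path.suffix)
print(dest)  # -> data/corpus_preprocessed.csv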