Example #1
def load_data_context(data_path='data/train.txt', is_train=True):
    """Load tab-separated conversation data; also return labels when is_train."""
    # CONV_PAD_LEN, EMOS_DIC and processing_pipeline are project-level
    # names assumed to be defined elsewhere in the module.
    data_list = []
    target_list = []
    with open(data_path, 'r') as f_data:
        data_lines = f_data.readlines()

    for i, text in enumerate(data_lines):
        # skip the header line
        if i == 0:
            continue

        tokens = text.split('\t')

        # columns 1..CONV_PAD_LEN hold the conversation turns
        convers = tokens[1:CONV_PAD_LEN + 1]

        # keep both the raw and the preprocessed form of each turn
        raw_a = convers[0]
        raw_b = convers[1]
        raw_c = convers[2]

        a = processing_pipeline(raw_a)
        b = processing_pipeline(raw_b)
        c = processing_pipeline(raw_c)

        data_list.append((a, b, c, raw_a, raw_b, raw_c))
        if is_train:
            # the emotion label follows the conversation columns
            emo = tokens[CONV_PAD_LEN + 1].strip()
            target_list.append(EMOS_DIC[emo])

    if is_train:
        return data_list, target_list
    else:
        return data_list
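
A minimal usage sketch, assuming the surrounding project defines CONV_PAD_LEN, EMOS_DIC and processing_pipeline and that the files have a header row followed by tab-separated lines; the paths are illustrative:

# hypothetical calls; file names are placeholders
train_data, train_targets = load_data_context('data/train.txt', is_train=True)
test_data = load_data_context('data/test.txt', is_train=False)
print(len(train_data), 'conversations,', len(train_targets), 'labels')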
Example #2
import re  # used for both the keep check and the de-duplication below


def clean_sentences(sent_text):
    """
    Flag whether a line should be kept (it must contain at least one
    alphabetic character) and return the preprocessed text with any
    immediately repeated substring collapsed to a single occurrence.
    """
    # processing_pipeline is assumed to be defined elsewhere in the module
    to_keep = False
    if re.match(".*[a-zA-Z]+.*", sent_text):
        to_keep = True
    # collapse consecutive repeats, e.g. "hahaha" -> "ha"
    remove_dup = re.sub(r'(.+?)\1+', r'\1', processing_pipeline(sent_text))
    return to_keep, remove_dup
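
A small illustration of the two return values, assuming for simplicity that processing_pipeline returns its input unchanged:

# hypothetical inputs under the simplifying assumption above
keep, cleaned = clean_sentences("hahaha")      # keep is True, cleaned is "ha"
keep, cleaned = clean_sentences("1234 5678")   # keep is False, text is left as-is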
Example #3
def load_data_context(data_path='data/train.txt', is_train=True):
    """Load tab-separated conversation data plus the word count of each turn."""
    # CONV_PAD_LEN, EMOS_DIC and processing_pipeline are project-level
    # names assumed to be defined elsewhere in the module.
    data_list = []
    target_list = []
    with open(data_path, 'r') as f_data:
        data_lines = f_data.readlines()

    for i, text in enumerate(data_lines):
        # skip the header line
        if i == 0:
            continue

        tokens = text.split('\t')

        # columns 1..CONV_PAD_LEN hold the conversation turns
        convers = tokens[1:CONV_PAD_LEN + 1]

        a = processing_pipeline(convers[0])
        b = processing_pipeline(convers[1])
        c = processing_pipeline(convers[2])

        # whitespace-token counts of each preprocessed turn
        a_len = len(a.split())
        b_len = len(b.split())
        c_len = len(c.split())

        data_list.append((a, a_len, b, b_len, c, c_len))
        if is_train:
            # the emotion label follows the conversation columns
            emo = tokens[CONV_PAD_LEN + 1].strip()
            target_list.append(EMOS_DIC[emo])

    if is_train:
        return data_list, target_list
    else:
        return data_list
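
The per-turn word counts returned above are the kind of information typically used to pad a batch to a common length; a hypothetical sketch, not part of the original code:

# hypothetical helper: pad whitespace-tokenized turns to the longest one
def pad_turns(turns, lengths, pad_token='<pad>'):
    max_len = max(lengths)
    return [t.split() + [pad_token] * (max_len - n)
            for t, n in zip(turns, lengths)]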
Example #4
import pandas as pd
from nltk.tokenize import sent_tokenize  # sentence splitter assumed to be NLTK's


def load_data_context(data_path='/data/SuperMod/test_data.txt', is_train=True):
    """Load a CSV of comments; also return toxicity labels when is_train."""
    data_list = []
    target_list = []

    df = pd.read_csv(data_path, encoding="utf8")

    # the two file formats name the label column differently
    if len(df.columns) > 4:
        data_list = df.comment_text.tolist()
        target_list = df.toxic.tolist()
    else:
        data_list = df.comment_text.tolist()
        target_list = df.toxicity.tolist()

    # preprocess each comment (processing_pipeline is assumed to be defined
    # elsewhere in the module) and split it into sentences
    clean_sent_list = [
        sent_tokenize(processing_pipeline(comment)) for comment in data_list
    ]

    if is_train:
        return clean_sent_list, target_list
    else:
        return clean_sent_list
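
A usage sketch under the same assumptions (pandas and NLTK available, and the CSV containing a comment_text column plus a toxic or toxicity label column):

# hypothetical call; path and column layout follow the snippet above
sentences, labels = load_data_context('/data/SuperMod/test_data.txt', is_train=True)
print(len(sentences), 'comments,', len(labels), 'labels')
print(sentences[0][:2])  # first two sentences of the first comment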