Example #1
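# These example functions are shown without their module header. A minimal
# preamble they appear to assume (names taken from the snippets; the corpus
# root value below is a placeholder, not the project's actual path):
import glob
import os
import re

import pandas as pd
import tqdm

# Root directory holding the Portuguese corpora (placeholder value).
DATASET_PT_CORPUS_PATH = "data/pt_corpus/"

# clear_text and clean_text_ret are project helpers used throughout;
# a sketch of their assumed behavior follows Example #1.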
def process_literature_files():
    print("---------------------------------------------------------------------------------------")
    print("BRAZILIAN LITERATURE")
    # walk and merge texts to one file
    count_prev = 0
    count_pos = 0

    lines = list()
    count_files = 0
    count_authors = 0
    for dir_path, dirs, files in os.walk(DATASET_PT_CORPUS_PATH + "corpus-of-brazilian-portuguese-literature/"):
        print(dir_path)
        count_authors += 1
        for filename in tqdm.tqdm([f for f in files if f.endswith(".txt")]):
            file_path = os.path.join(dir_path, filename)
            count_files += 1
            # print(file_path)
            # Most files are Windows-1252 encoded; fall back to UTF-8, then to
            # reading with undecodable bytes ignored.
            for encoding, errors in (("windows-1252", "strict"),
                                     ("utf-8", "strict"),
                                     (None, "ignore")):
                try:
                    with open(file_path, "r", encoding=encoding, errors=errors) as text_file:
                        line = text_file.read()
                    break
                except UnicodeDecodeError:
                    continue

            count_prev += len(line.split())
            line = clear_text(line)
            line = clean_text_ret(line)
            lines.append(line)
            count_pos += len(line.split())
    print("Writing to output")
    with open(DATASET_PT_CORPUS_PATH + "corpus-of-brazilian-portuguese-literature/literature_proc.txt", "w+") as out_file:
        for line in tqdm.tqdm(lines):
            out_file.write(line + "\n")

    print("Prev:    ", count_prev)
    print("Pos:     ", count_pos)
    print("Authors: ", count_authors)
    print("Files:   ", count_files)
Example #2
def process_hc():
    print("---------------------------------------------------------------------------------------")
    print("HC")

    hc_df = pd.read_csv(DATASET_PT_CORPUS_PATH + "hc_corpus/hc_corpus.tsv", sep="\t")
    texts = hc_df["content"].values

    count_prev = 0
    count_pos = 0
    count_posts = 0

    with open(DATASET_PT_CORPUS_PATH + "hc_corpus/hc_corpus_proc.txt", "w+") as output_file:
        for line in tqdm.tqdm(texts):
            line = str(line)

            count_prev += len(line.split())
            line = clear_text(line)
            line = clean_text_ret(line)
            splits = line.split()

            line_len = len(splits)
            if line_len >= 25:
                count_posts += 1
                output_file.write(line + "\n")
                count_pos += line_len
    print("Prev: ", str(format(count_prev, ',d')))
    print("Pos:  ", str(format(count_pos, ',d')))
    print("News: ", str(format(count_posts, ',d')))
Example #3
def process_old_news():
    print("---------------------------------------------------------------------------------------")
    print("OLD NEWS")
    # with open(DATASET_PT_CORPUS_PATH + "old-newspaper/old-newspaper.tsv") as f, \
    #         open(DATASET_PT_CORPUS_PATH + "old-newspaper/pt_old_news.tsv", "w+") as out:
    #     out.write("language\tnewspaper\tdate\ttext")
    #     for line in tqdm.tqdm(f):
    #         if line[0:25].find("Portuguese") != -1:
    #             out.write(line)

    dataset_df = pd.read_csv(DATASET_PT_CORPUS_PATH + "old-newspapers/pt_old_news.tsv", sep="\t")

    text_values = dataset_df["text"].values
    count_prev = 0
    count_pos = 0
    with open(DATASET_PT_CORPUS_PATH + "old-newspapers/pt_old_news_proc.txt", "w+") as f:
        for line in tqdm.tqdm(text_values):
            line = str(line)  # guard against missing/non-string cells
            count_prev += len(line.split())
            line = clear_text(line)
            line = clean_text_ret(line)

            f.write(line + "\n")

            count_pos += len(line.split())

    print("Prev: ", count_prev)
    print("Pos:  ", count_pos)
Example #4
def process_folha():
    print("---------------------------------------------------------------------------------------")
    print("FOLHA UOL")

    count_prev = 0
    count_pos = 0
    count_posts = 0

    folha_df = pd.read_csv(DATASET_PT_CORPUS_PATH + "news-of-the-site-folhauol/articles.csv")

    texts = folha_df["text"].values

    with open(DATASET_PT_CORPUS_PATH + "news-of-the-site-folhauol/folha_proc.txt", "w+") as output_file:
        for line in tqdm.tqdm(texts):
            line = str(line)

            count_prev += len(line.split())
            line = clear_text(line)
            line = clean_text_ret(line)
            splits = line.split()

            line_len = len(splits)
            if line_len >= 10:
                count_posts += 1
                output_file.write(line + "\n")
                count_pos += line_len

    print("Prev: ", str(format(count_prev, ',d')))
    print("Pos:  ", str(format(count_pos, ',d')))
    print("News: ", str(format(count_posts, ',d')))
Example #5
def process_instructions():
    print("---------------------------------------------------------------------------------------")
    print("INSTRUCTIONS")

    count_prev = 0
    count_pos = 0
    count_posts = 0

    files = glob.glob(DATASET_PT_CORPUS_PATH + "human-instructions-portuguese-wikihow/*.ttl")

    for i, f in enumerate(files):

        with open(f) as input_file, \
                open(DATASET_PT_CORPUS_PATH + "human-instructions-portuguese-wikihow/[email protected]".replace("@", str(i + 1).zfill(3)), "w+") as output_file:
            print(f)
            print("Reading file")
            text = input_file.read()

            print("Find patterns")
            lines = re.findall(r'"""(.*?)"""', text)

            print("Processing")
            for line in tqdm.tqdm(lines):
                try:
                    if line.find("Main Steps") != -1 or line.find("Requirements") != -1:
                        continue

                    count_prev += len(line.split())
                    line = clear_text(line)
                    line = clean_text_ret(line)
                    splits = line.split()

                    line_len = len(splits)
                    if line_len >= 10 and line.find("div class") == -1:
                        count_posts += 1
                        output_file.write(line + "\n")
                        count_pos += line_len
                except Exception:
                    # Skip entries that fail during cleaning.
                    continue

        print("Prev: ", str(format(count_prev, ',d')))
        print("Pos:  ", str(format(count_pos, ',d')))
        print("News: ", str(format(count_posts, ',d')))
Example #6
def process_wiki_files():
    print("---------------------------------------------------------------------------------------")
    print("WIKI PROCESS")
    with open(DATASET_PT_CORPUS_PATH + "wikipedia_ptbr/ptwiki.txt", "r") as wiki_file, \
            open(DATASET_PT_CORPUS_PATH + "wikipedia_ptbr/wiki_proc.txt", "w+") as wiki_proc_file:
        count_prev = 0
        count_pos = 0

        for line in tqdm.tqdm(wiki_file):
            stripped = line.rstrip("\n")
            # Skip document markup lines (e.g. <doc ...>, </doc>) and blank lines.
            if not stripped.startswith("<") and not stripped.endswith(">") and len(stripped.split()) >= 1:
                count_prev += len(line.split())
                line = line.replace("Section::::", "").replace("BULLET::::", "").replace("\n\n", "\n")
                line = clear_text(line)
                line = clean_text_ret(line)
                count_pos += len(line.split())

                wiki_proc_file.write(line + "\n")

        print("Prev: ", count_prev)
        print("Pos:  ", count_pos)

    return None
Example #7
def process_blog():
    print("---------------------------------------------------------------------------------------")
    print("BLOG POSTS")

    count_prev = 0
    count_pos = 0
    count_posts = 0

    i = 0
    for chunk in pd.read_csv(DATASET_PT_CORPUS_PATH + "blogspot-posts/blogposts.csv", chunksize=10 ** 6):
        i += 1
        with open(DATASET_PT_CORPUS_PATH + "blogspot-posts/[email protected]".replace("@", str(i).zfill(6)), "w+") as output_file:
            contents = chunk["content"].values
            print("Processing chunk ", i)

            for line in tqdm.tqdm(contents):

                line = str(line)
                splits = line.split()

                if len(splits) < 25:
                    continue

                count_prev += len(line.split())
                line = clear_text(line)
                line = clean_text_ret(line)

                splits = line.split()
                line_len = len(splits)
                if line_len >= 25 and line.find("if gte mso") == -1:
                    count_posts += 1
                    output_file.write(line + "\n")
                    count_pos += line_len

            print("Prev: ", str(format(count_prev, ',d')))
            print("Pos:  ", str(format(count_pos, ',d')))
            print("Posts:", str(format(count_posts, ',d')))
def setup_data():
    # Load "procedentes"
    print("Loading text files...")
    data = []

    dict_classes = {
        PATH_LAB_PROC: PROCEDENTE,
        PATH_LAB_INPROC: IMPROCEDENTE,
        PATH_LAB_EXT: EXTINCAO,
        PATH_LAB_PARC_PROC: PARCIALMENTE_PROCEDENTE
    }

    regression_df = pd.read_csv("data/attributes.csv")
    print(regression_df.describe())

    for path_class in dict_classes.keys():

        folder = JEC_DATASET_PATH + path_class
        file_paths = glob.glob(folder + "*.txt")

        for file_path in file_paths:
            with open(file_path) as f:
                raw_content = f.read()
                file_name = file_path.replace(folder, "").replace(".txt", "")

                found_df = np.array(regression_df.loc[regression_df['sentenca'] == int(file_name)])
                # print(np.array(found_df))

                if len(found_df) == 0:
                    arr = [file_name, raw_content, dict_classes[path_class], 0.0]
                    data.append(arr)
                else:
                    for fdf in found_df:
                        # Convert the award amount from Brazilian currency format ("R$ 1.234,56") to a float.
                        valor = float(str(fdf[1]).replace("R$ ", "").replace(".", "").replace(",", "."))
                        data.append([file_name, raw_content, dict_classes[path_class], valor])

    print("Pre-processing...")
    processed_data = []

    for file_name, content, label, valor in tqdm.tqdm(data):
        clean_text = text_preprocessing.clear_text(content)
        processed_text = text_preprocessing.pre_process(clean_text)
        processed_data.append([file_name, processed_text, label, valor])

    df = pd.DataFrame(data=processed_data, columns=["file_name", "content", "label", "indenizacao"])
    df.to_csv(PROJECT_PATH + JEC_DEST_PATH + JEC_FINAL_PATH + "jec_ufsc_dataset.csv")

    print(df.describe())

    x = df[["file_name", "content"]]
    y = df["label"]

    # Stratified 70/15/15 train/test/validation split; the time-based seed
    # makes the split non-reproducible across runs.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=(int(time.time()) % 2 ** 32), stratify=y)

    x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=(int(time.time()) % 2 ** 32), stratify=y_test)

    print(x_train.shape, x_test.shape, x_val.shape, y_train.shape, y_test.shape, y_val.shape)

    _path = PROJECT_PATH + JEC_DEST_PATH + JEC_FINAL_PATH + "jec_ufsc_train.csv"
    # save_data(x_train, y_train, _path)

    _path = PROJECT_PATH + JEC_DEST_PATH + JEC_FINAL_PATH + "jec_ufsc_test.csv"
    # save_data(x_test, y_test, _path)

    _path = PROJECT_PATH + JEC_DEST_PATH + JEC_FINAL_PATH + "jec_ufsc_val.csv"