Example #1
import os

import io_utils  # project-local helper module (assumed importable)


def fix_file_encodings(inmainfolder,
                       outmainfolder,
                       in_ext="html",
                       in_encoding="cp1256",
                       out_ext="txt",
                       out_encoding="utf8"):
    """Re-encode every file in the subfolders of `inmainfolder` from
    `in_encoding` to `out_encoding`, mirroring the folder layout under
    `outmainfolder`. Relies on a fix_file() helper defined elsewhere
    in the project."""

    subfolders = io_utils.getfoldernames_of_dir(inmainfolder)

    for subf in subfolders:

        p1 = os.path.join(inmainfolder, subf)
        fnames = io_utils.getfilenames_of_dir(p1, removeextension=True)

        o1 = io_utils.ensure_dir(os.path.join(outmainfolder, subf))
        print("In ", subf)
        for fname in fnames:

            p2 = os.path.join(p1, fname + "." + in_ext)
            o2 = os.path.join(o1, fname + "." + out_ext)
            fix_file(p2, o2, in_encoding, out_encoding)
            '''
            infile = codecs.open(p2, "r", encoding=in_encoding)
            text = infile.read()
            o2 = os.path.join(o1, fname+"."+out_ext)
            outfile = codecs.open(o2, "w", encoding=out_encoding)
            outfile.write(text)
            infile.close()
            outfile.close()
            '''

        print("Finished..\n")

    print("Done.")
Example #2
def run_csv():
    # Driver: dump the OCA movie-review corpus into a polarity-labelled CSV
    # via files_to_csv() (Example #4).
    '''
    mainfolder = "/home/dicle/Documents/arabic_nlp/datasets/sentiment/Twitter"
    outpath = "/home/dicle/Documents/arabic_nlp/datasets/sentiment/Twitter/ar_500polartweets.csv"
    '''
    mainfolder = "/home/dicle/Documents/arabic_nlp/datasets/OCA-corpus"
    outpath = os.path.join(mainfolder, "ar_polar-moviereviewsOCAcorpus.csv")
    fix_folder = io_utils.ensure_dir(os.path.join(mainfolder, "fix_files"))
    files_to_csv(mainfolder, outpath, fix_folder)
Example #3
    def _dump_classification_system(self, model, task_obj, picklefolder,
                                    modelname):
        """Persist the trained model and the task/classifier object with
        joblib under `<picklefolder>/<modelname>/` and return that folder."""

        recordfolder = io_utils.ensure_dir(
            os.path.join(picklefolder, modelname))

        modelpath = os.path.join(recordfolder, CLSF_CONSTANTS.MODEL_FILE_NAME)
        classifierpath = os.path.join(recordfolder,
                                      CLSF_CONSTANTS.CLASSIFIER_FILE_NAME)

        joblib.dump(model, modelpath)
        joblib.dump(task_obj, classifierpath)

        return recordfolder
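A matching loader is not shown. As a rough sketch, the two joblib dumps can be read back with joblib.load(); the default file names below are placeholders, since the real ones come from CLSF_CONSTANTS.

import os

import joblib  # older scikit-learn code may use `from sklearn.externals import joblib`


def load_classification_system(recordfolder,
                               model_file_name="model.pickle",
                               classifier_file_name="classifier.pickle"):
    # Counterpart of _dump_classification_system(): load both dumps back.
    model = joblib.load(os.path.join(recordfolder, model_file_name))
    task_obj = joblib.load(os.path.join(recordfolder, classifier_file_name))
    return model, task_obj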
Example #4
import codecs
import os

import pandas as pd

import io_utils  # project-local helper module (assumed importable)


def files_to_csv(mainfolder, outpath, fixfolder, in_encoding="utf-8"):
    """Collect the labelled text files under `mainfolder` (one subfolder per
    polarity label) into a shuffled DataFrame and, if `outpath` is given,
    dump it as a tab-separated CSV. Files that are not valid `in_encoding`
    are re-read as cp1256 and a UTF-8 copy is written under `fixfolder`."""

    textcol = "text"
    catcol = "polarity"
    other = "domain"

    labels = io_utils.getfoldernames_of_dir(mainfolder)

    rows = []

    for label in labels:
        p1 = os.path.join(mainfolder, label)
        fnames = io_utils.getfilenames_of_dir(p1, removeextension=False)

        print("Reading in ", label)

        for fname in fnames:

            print(" ", fname)
            p2 = os.path.join(p1, fname)

            try:
                f = codecs.open(p2, "r", encoding=in_encoding)
                text = f.read()

            except UnicodeDecodeError:
                # Fall back to cp1256 and keep a re-encoded UTF-8 copy
                # of the file under fixfolder/<label>/.
                f.close()
                f = codecs.open(p2, "r", encoding="cp1256")
                text = f.read()
                fixpath = os.path.join(
                    io_utils.ensure_dir(os.path.join(fixfolder, label)),
                    fname)
                f2 = codecs.open(fixpath, "w", encoding="utf-8")
                f2.write(text)
                f2.close()

            text = text.strip()

            row = {textcol: text, catcol: label, other: fname}
            rows.append(row)

            f.close()

    df = pd.DataFrame(rows)
    # Shuffle the rows so the labels are not grouped together in the output.
    df = df.sample(frac=1).reset_index(drop=True)

    if outpath:
        df.to_csv(outpath, sep="\t", index=False)
    return df
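A hedged usage sketch (the paths are hypothetical): call files_to_csv() on a folder with one subfolder per label, then read the tab-separated dump back with the same separator.

import os

import pandas as pd

mainfolder = "/path/to/labelled_texts"   # hypothetical: one subfolder per polarity label
outpath = os.path.join(mainfolder, "polar_texts.csv")
fixfolder = os.path.join(mainfolder, "fix_files")

df = files_to_csv(mainfolder, outpath, fixfolder)
print(df["polarity"].value_counts())

# The dump is tab-separated, so read it back with the same separator.
df2 = pd.read_csv(outpath, sep="\t")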
Example #5
import os

import io_utils  # project-local helper module (assumed importable)


def fix_texts_nested(infolder, outfolder):
    """Walk the immediate subfolders of `infolder`, clean every file with
    read_encoded_file() and fix_text() (defined elsewhere in the project),
    and write the result to the mirrored path under `outfolder`."""

    folders = io_utils.getfoldernames_of_dir(infolder)

    for folder in folders:
        inp1 = os.path.join(infolder, folder)
        outp1 = io_utils.ensure_dir(os.path.join(outfolder, folder))

        files = io_utils.getfilenames_of_dir(inp1, False)
        for file in files:

            inp2 = os.path.join(inp1, file)
            text = read_encoded_file(inp2)
            text = fix_text(text)
            outp2 = os.path.join(outp1, file)
            with open(outp2, "w") as f:
                f.write(text)
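read_encoded_file() is not shown above; a minimal sketch, assuming it uses the same UTF-8-then-cp1256 fallback seen in files_to_csv():

import codecs


def read_encoded_file(path, primary="utf-8", fallback="cp1256"):
    # Try the primary encoding first; fall back to cp1256 on decode errors.
    try:
        with codecs.open(path, "r", encoding=primary) as f:
            return f.read()
    except UnicodeDecodeError:
        with codecs.open(path, "r", encoding=fallback) as f:
            return f.read()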
Example #6
        # Dump the (word, count) pairs per category as tab-separated files.
        fpath = os.path.join(outfolder, cat + "_disintersecting_count.csv")
        content = "\n".join([str(i) + "\t" + str(j)
                             for (i, j) in dis_countwords[cat]])
        with open(fpath, "w", encoding="utf-8") as f:
            f.write(content)

        fpath = os.path.join(outfolder, cat + "_intersecting_count.csv")
        content = "\n".join([str(i) + "\t" + str(j)
                             for (i, j) in int_countwords[cat]])
        with open(fpath, "w", encoding="utf-8") as f:
            f.write(content)
    

    '''

    # en sentiment
    infolder = "/home/dicle/Documents/data/en_sentiment"
    fname = "en_polar_10Kreviews.csv"
    text_col = "text"
    cat_col = "category"
    sep = "\t"
    outrootfolder = "/home/dicle/Documents/data/en_sentiment/dataset_analysis"
    outfolder = io_utils.ensure_dir(os.path.join(outrootfolder, fname))
    lang = "en"
    analyse_datasets(infolder, fname, text_col, cat_col, sep, lang, outfolder)
    '''
    # Split the e-mail dataset vocabulary into per-category exclusive word
    # lists and the words shared by all categories.
    df = pd.read_csv(os.path.join(folderpath, fname), sep=";")
    disintersects, intersection = category_disintersection_words(
        df, textcol="MAIL", catcol="TIP")
    outfolder = "/home/dicle/Documents/data/emailset2/terms_features"
    with open(os.path.join(outfolder, "intersection.csv"), "w", encoding="utf-8") as f:
        f.write("\n".join(intersection))
    for i, words in enumerate(disintersects):
        outname = "disintersects_cat" + str(i) + ".csv"
        with open(os.path.join(outfolder, outname), "w", encoding="utf-8") as f:
            f.write("\n".join(words))
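The fragment calls category_disintersection_words(), which is not included in the snippet. A rough sketch of what it might do, inferred from how its return values are used above (per-category word lists plus their common intersection); the real implementation may tokenize differently.

import pandas as pd


def category_disintersection_words(df, textcol, catcol):
    # Hypothetical sketch: build one vocabulary per category, take the words
    # shared by all categories, and list each category's words outside it.
    vocabularies = []
    for _, group in df.groupby(catcol):
        words = set()
        for text in group[textcol].astype(str):
            words.update(text.split())
        vocabularies.append(words)
    intersection = set.intersection(*vocabularies) if vocabularies else set()
    disintersects = [sorted(v - intersection) for v in vocabularies]
    return disintersects, sorted(intersection)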