def fix_file_encodings(inmainfolder, outmainfolder, in_ext="html", in_encoding="cp1256", out_ext="txt", out_encoding="utf8"):
    """Re-encode the files of a two-level folder tree into a mirrored tree.

    For every subfolder name returned by ``io_utils.getfoldernames_of_dir``
    the file names are listed without their extension, and each
    ``<inmainfolder>/<subfolder>/<name>.<in_ext>`` is passed to ``fix_file``
    together with the target path
    ``<outmainfolder>/<subfolder>/<name>.<out_ext>`` and both encodings.
    Progress messages are printed per subfolder.

    Args:
        inmainfolder: Root folder whose subfolders hold the input files.
        outmainfolder: Root folder under which one output subfolder per
            input subfolder is prepared via ``io_utils.ensure_dir``.
        in_ext: Extension (without dot) appended to each listed name to
            build the input path.
        in_encoding: Encoding handed to ``fix_file`` for the input file.
        out_ext: Extension (without dot) used for the output path.
        out_encoding: Encoding handed to ``fix_file`` for the output file.
    """
    for subf in io_utils.getfoldernames_of_dir(inmainfolder):
        in_dir = os.path.join(inmainfolder, subf)
        # NOTE(review): names come back without extension and in_ext is
        # re-appended, so this presumably expects every file in the folder
        # to carry in_ext -- confirm against io_utils.
        fnames = io_utils.getfilenames_of_dir(in_dir, removeextension=True)
        out_dir = io_utils.ensure_dir(os.path.join(outmainfolder, subf))
        print("In ", subf)

        for fname in fnames:
            in_path = os.path.join(in_dir, fname + "." + in_ext)
            out_path = os.path.join(out_dir, fname + "." + out_ext)
            fix_file(in_path, out_path, in_encoding, out_encoding)

        print("Finished..\n")

    print("Done.")
def run_csv():
    """Run ``files_to_csv`` on the hard-coded OCA corpus folder.

    The CSV is written next to the corpus data, and a ``fix_files`` folder
    under the same root is passed to ``files_to_csv`` as its fix folder.
    """
    # Earlier configuration (Twitter polarity set), kept for reference:
    #   mainfolder = "/home/dicle/Documents/arabic_nlp/datasets/sentiment/Twitter"
    #   outpath = "/home/dicle/Documents/arabic_nlp/datasets/sentiment/Twitter/ar_500polartweets.csv"
    corpus_root = "/home/dicle/Documents/arabic_nlp/datasets/OCA-corpus"
    csv_target = os.path.join(corpus_root, "ar_polar-moviereviewsOCAcorpus.csv")

    # NOTE(review): "fix_files" is created inside corpus_root, whose
    # subfolders files_to_csv enumerates as labels -- confirm it is not
    # picked up as an extra label folder.
    repaired_dir = io_utils.ensure_dir(os.path.join(corpus_root, "fix_files"))

    files_to_csv(corpus_root, csv_target, repaired_dir)
def _dump_classification_system(self, model, task_obj, picklefolder, modelname):
    """Persist a model and its task object with ``joblib`` and return the folder.

    Both objects are dumped into ``<picklefolder>/<modelname>`` (the path
    returned by ``io_utils.ensure_dir``), using the file names defined in
    ``CLSF_CONSTANTS.MODEL_FILE_NAME`` and
    ``CLSF_CONSTANTS.CLASSIFIER_FILE_NAME`` respectively.

    Args:
        model: Object dumped under the model file name.
        task_obj: Object dumped under the classifier file name.
        picklefolder: Parent folder for all dumped systems.
        modelname: Name of the subfolder that receives the two files.

    Returns:
        The folder the two files were written to.
    """
    target_dir = io_utils.ensure_dir(os.path.join(picklefolder, modelname))

    # (object, file name) pairs, dumped in this order: model first.
    artifacts = (
        (model, CLSF_CONSTANTS.MODEL_FILE_NAME),
        (task_obj, CLSF_CONSTANTS.CLASSIFIER_FILE_NAME),
    )
    for obj, filename in artifacts:
        joblib.dump(obj, os.path.join(target_dir, filename))

    return target_dir
def files_to_csv(mainfolder, outpath, fixfolder, in_encoding="utf-8"):
    """Collect the text files of a labelled folder tree into a shuffled table.

    Each subfolder name of ``mainfolder`` is used as a label. Every file in
    it is read with ``in_encoding``; if that raises ``UnicodeDecodeError``
    the file is re-read as cp1256 and a UTF-8 copy of the decoded text is
    saved to ``<fixfolder>/<label>/<file name>``.

    Args:
        mainfolder: Root folder holding one subfolder per label.
        outpath: Target path of the tab-separated CSV; nothing is written
            when it is falsy.
        fixfolder: Root folder receiving UTF-8 copies of the files that
            needed the cp1256 fallback.
        in_encoding: Encoding tried first for every input file.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text`` (stripped file
        content), ``polarity`` (label) and ``domain`` (file name), with the
        rows in random order and a fresh 0..n-1 index.
    """
    textcol = "text"
    catcol = "polarity"
    other = "domain"
    fallback_encoding = "cp1256"

    rows = []
    for label in io_utils.getfoldernames_of_dir(mainfolder):
        label_dir = os.path.join(mainfolder, label)
        fnames = io_utils.getfilenames_of_dir(label_dir, removeextension=False)
        print("Reading in ", label)

        for fname in fnames:
            print(" ", fname)
            in_path = os.path.join(label_dir, fname)

            # Context managers guarantee each handle is closed, including
            # the one abandoned when decoding with in_encoding fails.
            try:
                with codecs.open(in_path, "r", encoding=in_encoding) as f:
                    text = f.read()
            except UnicodeDecodeError:
                with codecs.open(in_path, "r", encoding=fallback_encoding) as f:
                    text = f.read()

                # Keep a UTF-8 copy of the unstripped text so the file does
                # not need the fallback again.
                fixed_dir = io_utils.ensure_dir(os.path.join(fixfolder, label))
                fixed_path = os.path.join(fixed_dir, fname)
                with codecs.open(fixed_path, "w", encoding="utf-8") as f2:
                    f2.write(text)

            rows.append({textcol: text.strip(), catcol: label, other: fname})

    df = pd.DataFrame(rows)
    # sample(frac=1) returns all rows in random order, i.e. a shuffle.
    df = df.sample(frac=1).reset_index(drop=True)

    if outpath:
        df.to_csv(outpath, sep="\t", index=False)

    return df
def fix_texts_nested(infolder, outfolder, out_encoding="utf-8"):
    """Run ``fix_text`` over every file of a two-level folder tree.

    For each subfolder of ``infolder`` a subfolder of the same name is
    prepared under ``outfolder`` via ``io_utils.ensure_dir``. Every file is
    read with ``read_encoded_file``, passed through ``fix_text`` and written
    under the same file name in the mirrored subfolder.

    Args:
        infolder: Root folder whose subfolders hold the input files.
        outfolder: Root folder receiving the mirrored, fixed files.
        out_encoding: Encoding of the written files. Set explicitly so the
            output does not depend on the platform's locale encoding.
    """
    for folder in io_utils.getfoldernames_of_dir(infolder):
        in_dir = os.path.join(infolder, folder)
        out_dir = io_utils.ensure_dir(os.path.join(outfolder, folder))

        # Second positional argument kept as in the original call;
        # presumably it is `removeextension` -- confirm against io_utils.
        for fname in io_utils.getfilenames_of_dir(in_dir, False):
            text = fix_text(read_encoded_file(os.path.join(in_dir, fname)))

            out_path = os.path.join(out_dir, fname)
            with open(out_path, "w", encoding=out_encoding) as f:
                f.write(text)
fpath = os.path.join(outfolder, cat + "_disintersecting_count.csv") content = "\n".join([str(i) + "\t" + str(j) for (i, j) in dis_countwords[cat]]) open(fpath, "w").write(content) fpath = os.path.join(outfolder, cat + "_intersecting_count.csv") content = "\n".join([str(i) + "\t" + str(j) for (i, j) in int_countwords[cat]]) open(fpath, "w").write(content) ''' # en sentiment infolder = "/home/dicle/Documents/data/en_sentiment" fname = "en_polar_10Kreviews.csv" text_col = "text" cat_col = "category" sep = "\t" outrootfolder = "/home/dicle/Documents/data/en_sentiment/dataset_analysis" outfolder = io_utils.ensure_dir(os.path.join(outrootfolder, fname)) lang = "en" analyse_datasets(infolder, fname, text_col, cat_col, sep, lang, outfolder) ''' df = pd.read_csv(os.path.join(folderpath, fname), sep=";") disintersects, intersection = category_disintersection_words(df, textcol="MAIL", catcol="TIP") outfolder = "/home/dicle/Documents/data/emailset2/terms_features" open(os.path.join(outfolder, "intersection.csv"), "w").write("\n".join(intersection)) for i,words in enumerate(disintersects): open(os.path.join(outfolder, "disintersects_cat"+str(i)+".csv"), "w").write("\n".join(words)) '''