import codecs
import json
import os
import random
import re

import pandas as pd

import io_utils  # project-local I/O helpers used throughout this module

# fix_file, read_encoded_file, fix_text, lines2tweets, count_lang_tweets,
# count_nonreply_tweets, extract_structure, extract_metadata, get_email and
# the patterns out_header, DATE, TO, CC, SUBJECT are defined elsewhere in
# this module/package.


def fix_file_encodings(inmainfolder, outmainfolder, in_ext="html", in_encoding="cp1256",
                       out_ext="txt", out_encoding="utf8"):
    """Re-encode every <in_ext> file under inmainfolder's subfolders into <out_ext> files."""
    subfolders = io_utils.getfoldernames_of_dir(inmainfolder)
    for subf in subfolders:
        p1 = os.path.join(inmainfolder, subf)
        fnames = io_utils.getfilenames_of_dir(p1, removeextension=True)
        o1 = io_utils.ensure_dir(os.path.join(outmainfolder, subf))
        print("In ", subf)
        for fname in fnames:
            p2 = os.path.join(p1, fname + "." + in_ext)
            o2 = os.path.join(o1, fname + "." + out_ext)
            fix_file(p2, o2, in_encoding, out_encoding)
        print("Finished..\n")
    print("Done.")

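# fix_file() is defined elsewhere in this module. A minimal sketch of the
# per-file step, reconstructed from the commented-out block that used to sit
# inside fix_file_encodings (hypothetical name, for illustration only):
def _fix_file_sketch(inpath, outpath, in_encoding, out_encoding):
    # read with the source encoding, write back out in the target encoding
    with codecs.open(inpath, "r", encoding=in_encoding) as infile:
        text = infile.read()
    with codecs.open(outpath, "w", encoding=out_encoding) as outfile:
        outfile.write(text)
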
def read_and_merge_n_files(folderpath, N):
    """Read the first N files in folderpath; despite the name, returns a list of strings, not merged text."""
    filenames = io_utils.getfilenames_of_dir(folderpath, False)[:N]
    texts = []
    for fname in filenames:
        fpath = os.path.join(folderpath, fname)
        content = io_utils.readtxtfile2(fpath)
        texts.append(content)
    return texts

def fix_lexicon():
    infolder = "/home/dicle/Documents/lexicons/tr_sentiment_boun"
    outfolder = "/home/dicle/Documents/lexicons/"
    for fname in io_utils.getfilenames_of_dir(infolder, False):
        p1 = os.path.join(infolder, fname)
        text = read_encoded_file(p1, encoding="utf-8")
        text = fix_text(text)
        with open(os.path.join(outfolder, fname), "w") as f:
            f.write(text)

def count_tweets(folderpath, outfolder):
    """Count tweets per day/file and dump the Turkish and non-reply subsets to outfolder."""
    N = 0
    Nr = 0
    Ntr = 0
    days = io_utils.getfoldernames_of_dir(folderpath)
    print(folderpath)
    for day in days:
        p1 = os.path.join(folderpath, day)
        fnames = io_utils.getfilenames_of_dir(p1, removeextension=False)
        for fname in fnames:
            p2 = os.path.join(p1, fname)
            tweets = lines2tweets(p2)
            ntweets = len(tweets)
            tr_tweets = count_lang_tweets(tweets, lang="tr")
            ntrtweets = len(tr_tweets)
            plain_tweets = count_nonreply_tweets(tr_tweets)
            nptweets = len(plain_tweets)
            print(" ", day, " / ", fname,
                  " # lines: ", ntweets,
                  " # tr_tweets: ", ntrtweets,
                  " # non-reply tweets: ", nptweets)
            N += ntweets
            Nr += nptweets
            Ntr += ntrtweets
            if ntrtweets > 0:
                outpath_tr = os.path.join(outfolder, day + "_" + fname)
                with open(outpath_tr, "w") as f:
                    json.dump(tr_tweets, f)
            if nptweets > 0:
                outpath_nr = os.path.join(outfolder, day + "_" + fname + "-nonreply")
                with open(outpath_nr, "w") as f:
                    json.dump(plain_tweets, f)
    return N, Ntr, Nr

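# lines2tweets(), count_lang_tweets() and count_nonreply_tweets() are defined
# elsewhere. Minimal sketches of their likely behaviour, under the assumption
# that each line of a file holds one JSON-encoded, Twitter-API-style tweet
# (hypothetical names, for illustration only):
def _lines2tweets_sketch(path):
    with open(path, "r") as f:
        return [json.loads(line) for line in f if line.strip()]


def _filter_lang_sketch(tweets, lang="tr"):
    # keep tweets whose language tag matches; note that despite the "count_"
    # prefix, the originals return the filtered list, not a count
    return [t for t in tweets if t.get("lang") == lang]


def _filter_nonreply_sketch(tweets):
    return [t for t in tweets if t.get("in_reply_to_status_id") is None]
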
def files_to_csv(mainfolder, outpath, fixfolder, in_encoding="utf-8"):
    """Collect labeled text files into a shuffled DataFrame and optionally write it as TSV."""
    textcol = "text"
    catcol = "polarity"
    other = "domain"
    labels = io_utils.getfoldernames_of_dir(mainfolder)
    rows = []
    for label in labels:
        p1 = os.path.join(mainfolder, label)
        fnames = io_utils.getfilenames_of_dir(p1, removeextension=False)
        print("Reading in ", label)
        for fname in fnames:
            print("  ", fname)
            p2 = os.path.join(p1, fname)
            try:
                with codecs.open(p2, "r", encoding=in_encoding) as f:
                    text = f.read()
            except UnicodeDecodeError:
                # fall back to cp1256 and keep a re-encoded utf-8 copy aside
                with codecs.open(p2, "r", encoding="cp1256") as f:
                    text = f.read()
                fixdir = io_utils.ensure_dir(os.path.join(fixfolder, label))
                with codecs.open(os.path.join(fixdir, fname), "w", encoding="utf-8") as f2:
                    f2.write(text)
            text = text.strip()
            rows.append({textcol: text, catcol: label, other: fname})
    df = pd.DataFrame(rows)
    df = df.sample(frac=1).reset_index(drop=True)
    if outpath:
        df.to_csv(outpath, sep="\t", index=False)
    return df

def fix_texts_nested(infolder, outfolder):
    folders = io_utils.getfoldernames_of_dir(infolder)
    for folder in folders:
        inp1 = os.path.join(infolder, folder)
        outp1 = io_utils.ensure_dir(os.path.join(outfolder, folder))
        files = io_utils.getfilenames_of_dir(inp1, False)
        for file in files:
            inp2 = os.path.join(inp1, file)
            text = read_encoded_file(inp2)
            text = fix_text(text)
            outp2 = os.path.join(outp1, file)
            with open(outp2, "w") as f:
                f.write(text)

def _sample_N_tweets(folderpath, N, filtrate=None, keywords=None):
    print(folderpath)
    fnames = io_utils.getfilenames_of_dir(folderpath, removeextension=False)
    fnames = [i for i in fnames if i.endswith("-nonreply")]
    all_tweets = []
    for fname in fnames:
        p = os.path.join(folderpath, fname)
        with open(p, "r") as f:
            tweets = json.load(f)
        all_tweets.extend(tweets)
        # print(fname, len(tweets), len(all_tweets))
    if filtrate and keywords:
        all_tweets = filtrate(keywords, all_tweets)
    random.shuffle(all_tweets)
    print(len(all_tweets), N)
    selected_tweets = random.sample(all_tweets, min(len(all_tweets), N))
    return selected_tweets

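# A minimal sketch of a `filtrate` callable as _sample_N_tweets expects it:
# it receives (keywords, tweets) and returns the matching subset. Hypothetical
# name, assuming tweets are dicts carrying their text under a "text" key:
def _keyword_filtrate_sketch(keywords, tweets):
    keywords = [k.lower() for k in keywords]
    return [t for t in tweets
            if any(k in t.get("text", "").lower() for k in keywords)]
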
def spam_mails_to_csv(mainfolder, outpath):
    csv_rows = []
    cats = io_utils.getfoldernames_of_dir(mainfolder)
    for cat in cats:
        p1 = os.path.join(mainfolder, cat)
        fnames = io_utils.getfilenames_of_dir(p1, removeextension=False)
        for fname in fnames:
            p2 = os.path.join(p1, fname)
            with open(p2, "r") as f:
                lines = f.readlines()
            items = extract_structure(lines)
            items["category"] = cat
            csv_rows.append(items)
    random.shuffle(csv_rows)
    df = pd.DataFrame(csv_rows)
    if outpath:
        df.to_csv(outpath, index=False, sep="\t")
    return df

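# extract_structure() is defined elsewhere. A minimal sketch of its likely
# behaviour (hypothetical name, assuming each mail file opens with a
# "Subject:" line and the message body follows):
def _extract_structure_sketch(lines):
    subject = lines[0].replace("Subject:", "", 1).strip() if lines else ""
    body = "".join(lines[1:]).strip()
    return {"subject": subject, "body": body}
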
def detect_illstructured(mainfolder, outcsvpath, bodyfolder):
    """Scan a two-level mail corpus, extract header fields, and log them to a CSV."""
    fnames = []
    folders1 = io_utils.getfoldernames_of_dir(mainfolder)
    ngoodfiles = 0  # kept from a disabled validity check; never incremented here
    io_utils.initialize_csv_file(out_header, outcsvpath)
    for folder1 in folders1:
        # assuming the corpus has one more subfolder of hierarchy
        p1 = os.path.join(mainfolder, folder1)
        txtfiles = io_utils.getfilenames_of_dir(p1, removeextension=False)
        fnames.extend(txtfiles)
        for txtfile in txtfiles:
            fpath = os.path.join(p1, txtfile)
            with open(fpath) as f:
                lines = f.readlines()
            # a well-formed file has the date ("Sent:") on line 1,
            # followed by To, Cc and Subject
            date = lines[1]
            datep = re.match(r"\s*" + DATE, date)
            if datep:
                to = lines[2]
                cc = lines[3]
                subject = lines[4]
                date2 = extract_metadata(date, DATE)
                to2 = extract_metadata(to, TO)
                cc2 = extract_metadata(cc, CC)
                subject2 = extract_metadata(subject, SUBJECT)
                bodylines = [i for i in lines[5:] if not i.isspace()]
                body = "\n".join(bodylines).strip()
                # record the body aside; body is already str in Python 3, so
                # the old .decode("utf-8") call was unnecessary and would fail
                io_utils.todisc_txt(body, os.path.join(bodyfolder, txtfile))
                items = [txtfile, "", to2, cc2, date2, subject2, str(len(body))]
                io_utils.append_csv_cell_items(items, outcsvpath)
    print("nfiles: ", str(len(fnames)))
    print("ngoodfiles: ", str(ngoodfiles))

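# extract_metadata() (with the label patterns DATE, TO, CC, SUBJECT, e.g.
# r"Sent:") and get_email(), used by the __main__ block below, are defined
# elsewhere. Minimal sketches under stated assumptions, with hypothetical names:
def _extract_metadata_sketch(line, label_pattern):
    # assumes the field value follows the label on the same line
    m = re.match(r"\s*" + label_pattern, line)
    return line[m.end():].strip() if m else ""


def _get_email_sketch(path):
    # assumes the Enron files are RFC-822-style messages
    # (headers, blank line, body) and returns (body, sender)
    import email
    with open(path, "r", errors="replace") as f:
        msg = email.message_from_file(f)
    body = msg.get_payload() if not msg.is_multipart() else ""
    return body.strip(), msg.get("From", "")
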
if __name__ == '__main__':
    # Example single-file check:
    # p = '/home/dicle/Documents/data/email_datasets/enron/classified/enron_with_categories/2/1825.txt'
    # print(get_email(p))

    emails = []
    folder = "/home/dicle/Documents/data/email_datasets/enron/classified/enron_with_categories"
    subfolders = io_utils.getfoldernames_of_dir(folder)
    id_ = 0
    for subfolder in subfolders:
        p1 = os.path.join(folder, subfolder)
        fnames = io_utils.getfilenames_of_dir(p1, removeextension=False)
        txtfiles = [i for i in fnames if i.endswith(".txt")]
        print(subfolder)
        for txtfile in txtfiles:
            print("  Reading ", txtfile)
            p2 = os.path.join(p1, txtfile)
            text, from_ = get_email(p2)
            emails.append({"fname": txtfile,
                           "folder": subfolder,
                           "sender": from_,
                           "body": text})