import json
import os
import random
from time import time

import pandas as pd

# Project-local modules and helpers (io_utils, corpus_io, conf,
# read_encoded_file, fix_text, read_qa_pairs, lines2tweets, etc.) are
# assumed importable from the surrounding package.


def fix_lexicon():
    infolder = "/home/dicle/Documents/lexicons/tr_sentiment_boun"
    outfolder = "/home/dicle/Documents/lexicons/"
    for fname in io_utils.getfilenames_of_dir(infolder, False):
        p1 = os.path.join(infolder, fname)
        text = read_encoded_file(p1, encoding="utf-8")
        text = fix_text(text)
        with open(os.path.join(outfolder, fname), "w") as f:
            f.write(text)
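# `read_encoded_file` and `fix_text` are defined elsewhere in the project.
# For reference, a minimal sketch of what `read_encoded_file` presumably does,
# assuming it is a thin wrapper around open() with an explicit encoding (the
# actual helper may differ); `fix_text` matches the signature of ftfy.fix_text,
# which repairs mojibake in already-decoded text.
def _read_encoded_file_sketch(path, encoding="utf-8"):
    # Hypothetical stand-in, not the project's actual implementation.
    with open(path, "r", encoding=encoding) as f:
        return f.read()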
def run_for_datasets():
    folder = ("/home/dicle/Documents/arabic_nlp/datasets/sentiment/"
              "MASC Corpus/MASC Corpus/Excel Format")
    #fnames = ["MSAC corpus- Political.xlsx"]
    fnames = io_utils.getfilenames_of_dir(folder, removeextension=False)
    textcol = "Text"
    catcol = "Polarity"
    config_dict = conf.ar_sentiment_params
    picklefolder = "/home/dicle/Documents/experiments/ar_sentiment/models"

    # Train one model per dataset file.
    for fname in fnames:
        p = os.path.join(folder, fname)
        df = pd.read_excel(p)
        instances = df[textcol].tolist()
        labels = df[catcol].tolist()
        instances, labels = corpus_io.shuffle_dataset(instances, labels)
        modelname = ".".join(fname.split(".")[:-1])  # file name minus extension
        print("\n\n")
        print("Classify ", modelname)
        g = df.groupby(by=[catcol])
        print("Category counts:\n ", g.count()[textcol])
        run_ar_sentiment_analyser2(instances, labels, config_dict,
                                   picklefolder, modelname)

    # Train one model on the union of all dataset files.
    all_instances = []
    all_labels = []
    for fname in fnames:
        p = os.path.join(folder, fname)
        df = pd.read_excel(p)
        instances = df[textcol].tolist()
        labels = df[catcol].tolist()
        all_instances.extend(instances)
        all_labels.extend(labels)
    all_instances, all_labels = corpus_io.shuffle_dataset(all_instances,
                                                          all_labels)
    print("Classify ALL")
    modelname = "3sets"
    run_ar_sentiment_analyser2(all_instances, all_labels, config_dict,
                               picklefolder, modelname)
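# `corpus_io.shuffle_dataset` is a project helper; judging by its call sites,
# it shuffles the instances and labels in parallel so each text keeps its
# label. A minimal sketch of that behavior (hypothetical, not the actual code):
def _shuffle_dataset_sketch(instances, labels):
    pairs = list(zip(instances, labels))
    random.shuffle(pairs)
    shuffled_instances, shuffled_labels = zip(*pairs)
    return list(shuffled_instances), list(shuffled_labels)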
def get_thy_data(folderpath):
    fnames = io_utils.getfilenames_of_dir(folderpath, removeextension=False)
    fnames.sort()
    instances = []
    labels = []
    for fname in fnames:
        with open(os.path.join(folderpath, fname), "r") as f:
            content = f.read().strip()
        instances.append(content)
        labels.append(fname)
    return instances, labels
def count_tweets(folderpath, outfolder):
    N = 0    # total tweets
    Nr = 0   # non-reply Turkish tweets
    Ntr = 0  # Turkish tweets
    days = io_utils.getfoldernames_of_dir(folderpath)
    print(folderpath)
    for day in days:
        p1 = os.path.join(folderpath, day)
        fnames = io_utils.getfilenames_of_dir(p1, removeextension=False)
        for fname in fnames:
            p2 = os.path.join(p1, fname)
            tweets = lines2tweets(p2)
            ntweets = len(tweets)
            tr_tweets = count_lang_tweets(tweets, lang="tr")
            ntrtweets = len(tr_tweets)
            plain_tweets = count_nonreply_tweets(tr_tweets)
            nptweets = len(plain_tweets)
            print(" ", day, " / ", fname,
                  " # tweets: ", ntweets,
                  " # tr_tweets: ", ntrtweets,
                  " # non-reply tweets: ", nptweets)
            N += ntweets
            Nr += nptweets
            Ntr += ntrtweets
            # Dump the filtered tweet lists so later sampling steps can
            # read them back without re-filtering.
            if ntrtweets > 0:
                outpath_tr = os.path.join(outfolder, day + "_" + fname)
                with open(outpath_tr, "w") as f:
                    json.dump(tr_tweets, f)
            if nptweets > 0:
                outpath_nr = os.path.join(outfolder,
                                          day + "_" + fname + "-nonreply")
                with open(outpath_nr, "w") as f:
                    json.dump(plain_tweets, f)
    return N, Ntr, Nr
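# lines2tweets, count_lang_tweets and count_nonreply_tweets live elsewhere in
# the project. Minimal sketches of the behavior their call sites imply,
# assuming one JSON-encoded tweet per line and the standard Twitter API fields
# "lang" and "in_reply_to_status_id"; the actual helpers may differ.
def _lines2tweets_sketch(path):
    with open(path, "r") as f:
        return [json.loads(line) for line in f if line.strip()]

def _lang_tweets_sketch(tweets, lang="tr"):
    # Despite the "count_" prefix in the real name, the caller uses the
    # returned list itself, so the sketch returns the matching tweets.
    return [t for t in tweets if t.get("lang") == lang]

def _nonreply_tweets_sketch(tweets):
    return [t for t in tweets if t.get("in_reply_to_status_id") is None]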
def json_to_txt(jsonfolder, txtfolder):
    fnames = io_utils.getfilenames_of_dir(jsonfolder, removeextension=False)
    for fname in fnames:
        p = os.path.join(jsonfolder, fname)
        pairs = read_qa_pairs(p)
        for i, pair in enumerate(pairs):
            # One txt file per Q&A pair, named "<index>-<source name>.txt".
            txtfname = str(i) + "-" + fname.replace(".json", ".txt")
            content = pair["question"] + "\n\n" + pair["answer"]
            txtpath = os.path.join(txtfolder, txtfname)
            with open(txtpath, "w") as outf:
                outf.write(content)
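# read_qa_pairs is defined elsewhere; its call sites here and in get_database
# suggest each JSON file holds a list of {"question": ..., "answer": ...}
# dicts. A hypothetical sketch under that assumption:
def _read_qa_pairs_sketch(path):
    with open(path, "r") as f:
        # expected shape: [{"question": str, "answer": str}, ...]
        return json.load(f)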
def get_database(jsonfolder):
    fnames = io_utils.getfilenames_of_dir(jsonfolder, removeextension=False)
    all_pairs = []
    for fname in fnames:
        p = os.path.join(jsonfolder, fname)
        pairs = read_qa_pairs(p)
        all_pairs.extend(pairs)
    # One instance per pair: the answer and question concatenated.
    instances = [pair["answer"] + "\n" + pair["question"]
                 for pair in all_pairs]
    return instances
def get_news_dataset(folderpath):
    dataset = []
    # Each subfolder name is a category label; each file in it is one document.
    subfolders = io_utils.getfoldernames_of_dir(folderpath)
    for label in subfolders:
        p1 = os.path.join(folderpath, label)
        fnames = io_utils.getfilenames_of_dir(p1, removeextension=False)
        for fname in fnames:
            p2 = os.path.join(p1, fname)
            with open(p2, "r") as f:
                text = f.read().strip()
            dataset.append((text, label))
    random.shuffle(dataset)
    return dataset
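# A hypothetical usage sketch: split the shuffled (text, label) pairs back
# into the parallel lists most classifiers expect (the path is illustrative).
def _news_dataset_demo(folderpath):
    dataset = get_news_dataset(folderpath)
    texts, labels = zip(*dataset)
    return list(texts), list(labels)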
def fix_yildiz_tweets():
    infolder = "/home/dicle/Documents/data/tr_sentiment/sentiment-3000tweet/enhanced_data/orig"
    outfolder = "/home/dicle/Documents/data/tr_sentiment/sentiment-3000tweet/enhanced_data/fixed"
    folders = io_utils.getfoldernames_of_dir(infolder)
    for folder in folders:
        inp1 = os.path.join(infolder, folder)
        outp1 = io_utils.ensure_dir(os.path.join(outfolder, folder))
        files = io_utils.getfilenames_of_dir(inp1, False)
        for file in files:
            inp2 = os.path.join(inp1, file)
            text = read_encoded_file(inp2)
            text = fix_text(text)
            outp2 = os.path.join(outp1, file)
            with open(outp2, "w") as f:
                f.write(text)
def _sample_N_tweets(folderpath, N, filtrate=None, keywords=None):
    print(folderpath)
    fnames = io_utils.getfilenames_of_dir(folderpath, removeextension=False)
    # Only the non-reply dumps produced by count_tweets().
    fnames = [i for i in fnames if i.endswith("-nonreply")]
    all_tweets = []
    for fname in fnames:
        p = os.path.join(folderpath, fname)
        with open(p, "r") as f:
            all_tweets.extend(json.load(f))
    # Optionally narrow the pool to tweets matching the keywords.
    if filtrate and keywords:
        all_tweets = filtrate(keywords, all_tweets)
    random.shuffle(all_tweets)
    print(len(all_tweets), N)
    selected_tweets = random.sample(all_tweets, min(len(all_tweets), N))
    return selected_tweets
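# _sample_N_tweets accepts an optional filtrate(keywords, tweets) callable.
# A hypothetical keyword filter matching that signature, assuming each tweet
# dict carries its message under the standard Twitter "text" field:
def _keyword_filtrate_sketch(keywords, tweets):
    keywords = [k.lower() for k in keywords]
    return [t for t in tweets
            if any(k in t.get("text", "").lower() for k in keywords)]

# e.g. _sample_N_tweets(folder, 100, filtrate=_keyword_filtrate_sketch,
#                       keywords=["paket"])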
print(" related docs: ", ", ".join(cluster_members[cl])) print(" related words: ", ", ".join(neighbour_terms)) results.append((word, cl, occ)) ''' order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1] print(order_centroids) ''' if __name__ == "__main__": # build clustering model lang = "tr" # tcell folder = "/home/dicle/Documents/experiments/tcell_topics/docs_less" fnames = io_utils.getfilenames_of_dir(folder, removeextension=False) instances = [] labels = [] for fname in fnames: path = os.path.join(folder, fname) text = "" with open(path, "r") as f: text = f.read().strip() instances.append(text) labels.append(fname) sentence = "hattımdaki paket değişikliklerini ve paketleri nasıl öğrenebilirim?" #sentence = "bugün hava ne güzel" t0 = time() output = get_topics_kmeans(instances, labels, sentence) '''