Example #1
import os

# io_utils, read_encoded_file and fix_text are project-local helpers.
def fix_lexicon():

    infolder = "/home/dicle/Documents/lexicons/tr_sentiment_boun"
    outfolder = "/home/dicle/Documents/lexicons/"

    # Re-encode and repair every lexicon file, writing the fixed copy to outfolder.
    for fname in io_utils.getfilenames_of_dir(infolder, removeextension=False):
        p1 = os.path.join(infolder, fname)
        text = read_encoded_file(p1, encoding="utf-8")
        text = fix_text(text)
        with open(os.path.join(outfolder, fname), "w", encoding="utf-8") as f:
            f.write(text)
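
The read_encoded_file helper is not shown in the source; a minimal sketch of what it might look like, assuming it simply decodes the file with the given encoding (the project's actual implementation and error handling may differ):

def read_encoded_file(path, encoding="utf-8"):
    # Hypothetical sketch: decode with the given encoding, replacing undecodable bytes.
    with open(path, "r", encoding=encoding, errors="replace") as f:
        return f.read()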
Example #2
import os

import pandas as pd

# io_utils, corpus_io, conf and run_ar_sentiment_analyser2 are project-local helpers.
def run_for_datasets():

    folder = "/home/dicle/Documents/arabic_nlp/datasets/sentiment/MASC Corpus/MASC Corpus/Excel Format"
    # fnames = ["MSAC corpus- Political.xlsx"]  # uncomment to run a single dataset
    fnames = io_utils.getfilenames_of_dir(folder, removeextension=False)
    textcol = "Text"
    catcol = "Polarity"

    config_dict = conf.ar_sentiment_params

    picklefolder = "/home/dicle/Documents/experiments/ar_sentiment/models"

    # Train and evaluate one sentiment model per dataset file.
    for fname in fnames:

        p = os.path.join(folder, fname)
        df = pd.read_excel(p)

        instances = df[textcol].tolist()
        labels = df[catcol].tolist()
        instances, labels = corpus_io.shuffle_dataset(instances, labels)

        # Use the file name without its extension as the model name.
        modelname = ".".join(fname.split(".")[:-1])

        print("\n\n")
        print("Classify ", modelname)

        g = df.groupby(by=[catcol])
        print("Category counts:\n ", g.count()[textcol])

        run_ar_sentiment_analyser2(instances, labels, config_dict,
                                   picklefolder, modelname)

    # Pool all datasets and train one combined model.
    all_instances = []
    all_labels = []
    for fname in fnames:

        p = os.path.join(folder, fname)
        df = pd.read_excel(p)

        all_instances.extend(df[textcol].tolist())
        all_labels.extend(df[catcol].tolist())

    all_instances, all_labels = corpus_io.shuffle_dataset(
        all_instances, all_labels)

    print("Classify ALL")
    modelname = "3sets"
    run_ar_sentiment_analyser2(all_instances, all_labels, config_dict,
                               picklefolder, modelname)
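
corpus_io.shuffle_dataset is not shown in the source; a plausible minimal version, assuming it shuffles instances and labels in parallel (an assumption, not the project's code):

import random

def shuffle_dataset(instances, labels):
    # Shuffle both lists with the same permutation so pairs stay aligned.
    pairs = list(zip(instances, labels))
    random.shuffle(pairs)
    instances, labels = zip(*pairs)
    return list(instances), list(labels)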
Example #3
import os

# io_utils is a project-local helper module.
def get_thy_data(folderpath):

    fnames = io_utils.getfilenames_of_dir(folderpath, removeextension=False)
    fnames.sort()

    instances = []
    labels = []
    # Each file is one instance; its file name doubles as the label.
    for fname in fnames:

        with open(os.path.join(folderpath, fname), "r") as f:
            content = f.read().strip()
        instances.append(content)
        labels.append(fname)

    return instances, labels
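
A hypothetical usage sketch (the folder path below is made up):

instances, labels = get_thy_data("/path/to/thy_docs")  # hypothetical path
for label, text in zip(labels, instances):
    print(label, len(text.split()), "tokens")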
Example #4
import os
import json

# io_utils, lines2tweets, count_lang_tweets and count_nonreply_tweets are
# project-local helpers.
def count_tweets(folderpath, outfolder):

    N = 0    # total tweets
    Ntr = 0  # Turkish tweets
    Nr = 0   # non-reply Turkish tweets

    days = io_utils.getfoldernames_of_dir(folderpath)

    print(folderpath)
    for day in days:

        p1 = os.path.join(folderpath, day)
        fnames = io_utils.getfilenames_of_dir(p1, removeextension=False)

        for fname in fnames:

            p2 = os.path.join(p1, fname)

            tweets = lines2tweets(p2)
            ntweets = len(tweets)

            # Despite their names, these helpers return the filtered tweet lists.
            tr_tweets = count_lang_tweets(tweets, lang="tr")
            ntrtweets = len(tr_tweets)

            plain_tweets = count_nonreply_tweets(tr_tweets)
            nptweets = len(plain_tweets)

            print(" ", day, " / ", fname,
                  "  # lines: ", ntweets,
                  " # tr_tweets: ", ntrtweets,
                  " # non-reply tweets: ", nptweets)

            N += ntweets
            Ntr += ntrtweets
            Nr += nptweets

            # Dump the filtered subsets so later steps can sample from them.
            if ntrtweets > 0:
                outpath_tr = os.path.join(outfolder, day + "_" + fname)
                with open(outpath_tr, "w") as f:
                    json.dump(tr_tweets, f)

            if nptweets > 0:
                outpath_nr = os.path.join(outfolder, day + "_" + fname + "-nonreply")
                with open(outpath_nr, "w") as f:
                    json.dump(plain_tweets, f)

    return N, Ntr, Nr
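
lines2tweets is not shown here; given that the dumps above are JSON, a plausible reading is one JSON-encoded tweet per line (an assumption):

import json

def lines2tweets(path):
    # Parse one JSON tweet object per non-empty line.
    with open(path, "r") as f:
        return [json.loads(line) for line in f if line.strip()]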
Example #5
import os

# io_utils and read_qa_pairs are project-local helpers.
def json_to_txt(jsonfolder, txtfolder):

    fnames = io_utils.getfilenames_of_dir(jsonfolder, removeextension=False)

    for fname in fnames:

        p = os.path.join(jsonfolder, fname)
        pairs = read_qa_pairs(p)

        # Write each question-answer pair to its own numbered .txt file.
        for i, pair in enumerate(pairs):
            txtfname = str(i) + "-" + fname.replace(".json", ".txt")
            content = pair["question"] + "\n\n" + pair["answer"]

            txtpath = os.path.join(txtfolder, txtfname)
            with open(txtpath, "w") as outf:
                outf.write(content)
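
read_qa_pairs is not shown in the source; since each pair exposes "question" and "answer" keys, a minimal version might simply load a JSON list of such objects (an assumption):

import json

def read_qa_pairs(path):
    # Assumes the file holds a JSON list of {"question": ..., "answer": ...} dicts.
    with open(path, "r") as f:
        return json.load(f)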
Example #6
import os

# io_utils and read_qa_pairs are project-local helpers.
def get_database(jsonfolder):

    fnames = io_utils.getfilenames_of_dir(jsonfolder, removeextension=False)
    all_pairs = []

    # Collect the question-answer pairs from every JSON file in the folder.
    for fname in fnames:

        p = os.path.join(jsonfolder, fname)
        pairs = read_qa_pairs(p)
        all_pairs.extend(pairs)

    # One instance per pair: the answer followed by its question.
    instances = [
        pair["answer"] + "\n" + pair["question"] for pair in all_pairs
    ]
    return instances
Example #7
import os
import random

# io_utils is a project-local helper module.
def get_news_dataset(folderpath):

    dataset = []

    # Each subfolder name is the class label for the documents inside it.
    subfolders = io_utils.getfoldernames_of_dir(folderpath)
    for label in subfolders:
        p1 = os.path.join(folderpath, label)

        fnames = io_utils.getfilenames_of_dir(p1, removeextension=False)

        for fname in fnames:
            p2 = os.path.join(p1, fname)
            with open(p2, "r") as f:
                text = f.read().strip()

            dataset.append((text, label))

    random.shuffle(dataset)

    return dataset
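
A hypothetical usage sketch, splitting the shuffled (text, label) pairs into parallel lists:

dataset = get_news_dataset("/path/to/news_by_category")  # hypothetical path
texts, labels = map(list, zip(*dataset))
print(len(texts), "documents,", len(set(labels)), "classes")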
Example #8
import os

# io_utils, read_encoded_file and fix_text are project-local helpers.
def fix_yildiz_tweets():

    infolder = "/home/dicle/Documents/data/tr_sentiment/sentiment-3000tweet/enhanced_data/orig"
    outfolder = "/home/dicle/Documents/data/tr_sentiment/sentiment-3000tweet/enhanced_data/fixed"

    folders = io_utils.getfoldernames_of_dir(infolder)

    # Mirror the input folder structure, writing a fixed copy of every file.
    for folder in folders:
        inp1 = os.path.join(infolder, folder)
        outp1 = io_utils.ensure_dir(os.path.join(outfolder, folder))

        fnames = io_utils.getfilenames_of_dir(inp1, removeextension=False)
        for fname in fnames:

            inp2 = os.path.join(inp1, fname)
            text = read_encoded_file(inp2)
            text = fix_text(text)
            outp2 = os.path.join(outp1, fname)
            with open(outp2, "w") as f:
                f.write(text)
Example #9
import os
import json
import random

# io_utils is a project-local helper module.
def _sample_N_tweets(folderpath, N, filtrate=None, keywords=None):

    print(folderpath)
    fnames = io_utils.getfilenames_of_dir(folderpath, removeextension=False)

    # Only the "-nonreply" tweet dumps are sampled.
    fnames = [i for i in fnames if i.endswith("-nonreply")]

    all_tweets = []
    for fname in fnames:

        p = os.path.join(folderpath, fname)
        with open(p, "r") as f:
            all_tweets.extend(json.load(f))

    # Optionally keep only the tweets matching the given keywords.
    if filtrate and keywords:
        all_tweets = filtrate(keywords, all_tweets)

    random.shuffle(all_tweets)
    print(len(all_tweets), N)
    # Sample at most N tweets (fewer if the pool is smaller).
    selected_tweets = random.sample(all_tweets, min(len(all_tweets), N))
    return selected_tweets
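
The filtrate callable's contract is only implied by the call filtrate(keywords, all_tweets); a possible keyword filter, assuming each tweet is a dict with a "text" field (not shown in the source):

def keyword_filtrate(keywords, tweets):
    # Keep tweets whose text contains at least one keyword (case-insensitive).
    kws = [k.lower() for k in keywords]
    return [t for t in tweets
            if any(k in t.get("text", "").lower() for k in kws)]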
Example #10
        print(" related docs: ", ", ".join(cluster_members[cl]))
        print(" related words: ", ", ".join(neighbour_terms))
        results.append((word, cl, occ))
    '''
    order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
    print(order_centroids)
    '''


if __name__ == "__main__":

    # build clustering model
    lang = "tr"
    # tcell
    folder = "/home/dicle/Documents/experiments/tcell_topics/docs_less"
    fnames = io_utils.getfilenames_of_dir(folder, removeextension=False)
    instances = []
    labels = []
    for fname in fnames:
        path = os.path.join(folder, fname)
        text = ""
        with open(path, "r") as f:
            text = f.read().strip()
        instances.append(text)
        labels.append(fname)

    sentence = "hattımdaki paket değişikliklerini ve paketleri nasıl öğrenebilirim?"
    #sentence = "bugün hava ne güzel"
    t0 = time()
    output = get_topics_kmeans(instances, labels, sentence)
    '''
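
get_topics_kmeans is not shown in the source; a minimal sketch under the assumption that it clusters TF-IDF vectors with k-means and reports the query sentence's cluster (the real implementation likely differs):

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

def get_topics_kmeans(instances, labels, sentence, n_clusters=5):
    # Vectorize the corpus and fit k-means on it.
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(instances)
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    # Predict the query sentence's cluster and list the documents in it.
    cluster = kmeans.predict(vectorizer.transform([sentence]))[0]
    members = [lbl for lbl, c in zip(labels, kmeans.labels_) if c == cluster]
    return cluster, members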