Example #1
import os
import time
import datetime

import lib.utility as u
import config as c


def predict_word_dataset(word, dataset):
    """
    Predicts presence of word in each instance in dataset, and returns results array.
    """
    # Use cached predictions for this word if they exist; otherwise train and predict

    if os.path.isfile(c.rbf_data + word + '.txt'):
        results = u.get_lines(c.rbf_data + word + '.txt')
        return results

    else:
        start_time = time.clock()
        print 'starting training for', word, 'at:', datetime.datetime.now().time()
        clf = u.get_clf(word, 'rbf')
    
        # Predict presence of word in each sentence, saving results
        results = clf.predict(dataset)
        print len(results)
        print
    
        with open(c.rbf_data + word + '.txt', 'w') as file:
            file.write(word + ' ')
            for i in results:
                file.write(str(i))
                file.write('\n')
        
        time_taken = (time.clock()-start_time)/60
        print 'Time taken to train and write predictions to file for', word,
        print 'is:', time_taken, 'minutes'           
        return results


def get_word_dataset(word):
    """
    Reads the cached prediction results for word from file, and returns results array.
    """
    file_name = c.rbf_data + word + '.txt'

    # Get predicted classes for word
    results = u.get_lines(file_name)
    results = results[1:]

    return results
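
A hypothetical driver for the two helpers above, only to show how they fit together. The placeholder dataset and its shape are assumptions (the real feature matrix is built elsewhere in the repo); u.get_target_words and the c.rbf_data cache directory are the same utilities used in the other snippets.

import numpy

if __name__ == "__main__":
    # Placeholder dataset (assumed shape): in the real pipeline this would be
    # the binary feature vectors built from the test sentences.
    dataset = numpy.zeros((10, 700))
    for word in u.get_target_words():
        predict_word_dataset(word, dataset)   # trains the RBF clf and caches on first call
        results = get_word_dataset(word)      # later reads come from the cached file
        print word, len(results)
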
Example #3
# Simply combines Large Movie Review datasets (test and train) into one larger file.

import lib.utility as u

if __name__ == "__main__":

    sents1 = set(u.get_lines("imdb-sentences-neg.txt"))
    sents2 = set(u.get_lines("imdb-sentences-test-neg.txt"))
    l1 = len(sents1)
    l2 = len(sents2)

    all = sents1.union(sents2)
    l3 = len(all)

    #print l1, l2, l3

    #with open("imdb-all-sentences-neg.txt", "w") as file:
    #    for s in all:
    #        if len(s.split()) > 1:
    #            file.write(s + "\n")
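
For reference, lib.utility itself is not included in these snippets. A plausible minimal reading of the get_lines helper, based purely on how it is used above (read a text file and return its stripped lines), might look like the sketch below; this is an assumption, not the repo's actual code.

def get_lines(file_name):
    # Assumed behaviour: return the file's lines with surrounding whitespace
    # stripped, dropping empty lines.
    with open(file_name) as f:
        return [line.strip() for line in f if line.strip()]
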
Example #4
import lib.utility as u
import config as c
import random
import itertools
import os
from time import time

if __name__ == "__main__":
    # Load target words into memory
    target_words = u.get_target_words()

    # Get highly related pairs of words to get agreement data for
    #pairs = u.get_lines(c.output_dir + "target-word-high-pairs.txt")

    # Store all sentences in memory
    sentences_with = set(u.get_lines(c.output_dir + "sentences-with.txt"))
    sentences_without = set(u.get_lines(c.output_dir +
                                        "sentences-without.txt"))

    # Get all sentences used already - using a dict (so we only care about sentences used for specific word)
    used_sentences = {}
    for word in target_words:
        with open(c.train_data_dir + word + ".txt") as file:
            sents = set()
            for line in file:
                sents.add(line[3:].strip())
            used_sentences[word] = sents

    # Get sentences with that we haven't used, and sentences without that we haven't used, using set difference
    #sentences_with_available = list(sentences_with.difference(used_sentences))
    sentences_without_available = list(
        sentences_without.difference(used_sentences))

# Take in pairs of targets and their cosine scores and select the highest N pairs
# to then compute agreement in normal way


import lib.utility as u
import config as c


if __name__ == "__main__":
    # Read in all pairs with cosine scores
    pairs = u.get_lines(c.output_dir + "edges-cosine.txt")
    
    # Iterate each pair and add to dictionary, where key is tuple and
    # value is score - then we can sort by value
    pairs_dict = {}
    for p in pairs:
        p = p.split()
        t1 = p[0]
        t2 = p[1]
        score = float(p[2])
        
        # check reverse isn't in dict already before inserting
        if (t2, t1) not in pairs_dict:
            pairs_dict[(t1, t2)] = score
    
    #total = len(pairs_dict)
    #print "total number of pairs =", total
    
    # Set N (number of pairs to select)
    N = 5000
    print "chosen number of pairs =", N
        # Get individual words
        #pair = pair.split()[0:2]
        ti = pair[0]  # i
        tj = pair[1]  # j
        print ctr, "- i:", ti, "j:", tj

        # Get row and col for matrix
        row = target_words.index(ti)
        col = target_words.index(tj)
        #print row, col

        # Read pair test dataset into sents array - file could be with words in reversed order
        dir = c.output_dir + "agreement-data/"
        sents = []
        try:
            sents = u.get_lines(dir + ti + "-" + tj + ".txt")
        except:
            try:
                sents = u.get_lines(dir + tj + "-" + ti + ".txt")
            except:
                # exit program if no file found for these words
                print "No dataset for:", ti, tj
                #sys.exit("Can't find dataset file. Program terminating.")
                continue

        # Convert sents to binary feature vectors
        data = []
        for sentence in sents:
            vect = numpy.zeros(feat_size)
            for w in sentence.strip().split()[1:]:
                # ignore target words in feature vectors
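
The fragment above is cut off just as the binary feature vectors are built. A self-contained sketch of that step, assuming the feature space is a list of terms with an index lookup and that the two target words ti and tj are excluded from the vector; the helper and its names are illustrative, not the original code.

import numpy

def sentence_to_vector(sentence, feat_index, feat_size, skip_words):
    # feat_index: assumed dict mapping each feature term to its column index
    # skip_words: the pair (ti, tj) - target words are left out of the vector
    vect = numpy.zeros(feat_size)
    for w in sentence.strip().split()[1:]:   # [1:] skips the label token
        if w in skip_words:
            continue
        if w in feat_index:
            vect[feat_index[w]] = 1
    return vect
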
Example #7
# Gets highest related pairs and prints to new file.


import lib.utility as u
import numpy


if __name__ == "__main__":
    sim_a = u.get_lines("output-700/list-sim.txt")
    agr_a = u.get_lines("output-700/list-agr.txt")
    
    # create dict of tuples from two arrays (so we can sort together)
    d = {}
    for i in xrange(len(sim_a)):
        d[i] = (float(sim_a[i]), float(agr_a[i]))
    
    #print d
    
    # sort dict and take top half highest and put into new arrays
    sim_a_top = []
    agr_a_top = []
    half = len(d)/float(2)
    i=0
    for key in sorted(d, key=d.get, reverse=True):
        if i > half:
            break
        #print d[key]
        sim_a_top.append(d[key][0])
        agr_a_top.append(d[key][1])
        
        i+=1


import lib.utility as u
import config as c


def count_occurrences(f, words):
    """
    Counts how many lines of file f contain each word in words,
    and returns dictionary
    """
    counts = {}
    with open(f) as file:
        for line in file:
            for w in words:
                if w in line:
                    try:
                        counts[w] += 1
                    except:
                        counts[w] = 1
    return counts


if __name__ == "__main__":
    words = u.get_lines(c.output_dir + "frequent-words.txt")

    # Get count in positive sentences, and negative sentences to use to compute LLR
    pos_counts = count_occurrences("imdb-all-sentences-pos.txt", words)
    neg_counts = count_occurrences("imdb-all-sentences-neg.txt", words)

    # Compute log likelihood ratio for each word and store in dictionary
    llrs = {}
    for w in words:
        # Get each count from dicts
        pos = pos_counts[w]
        neg = neg_counts[w]
        # Compute each conditional prob, then take log as score
        # P(t = +1 | w)
        p1 = pos / float(pos + neg)
        # P(t = -1 | w)
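
The LLR computation is truncated above right after P(t = +1 | w). A minimal sketch of the remaining step, assuming the score is simply log(P(t = +1 | w) / P(t = -1 | w)) and that both counts are non-zero; this is an inferred continuation, not the original code.

import math

def llr_score(pos, neg):
    # P(t = +1 | w) and P(t = -1 | w) estimated from counts in pos/neg sentences
    p1 = pos / float(pos + neg)
    p2 = neg / float(pos + neg)
    # positive score -> word leans positive, negative -> word leans negative
    return math.log(p1 / p2)

# e.g. llrs[w] = llr_score(pos_counts[w], neg_counts[w])
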
Example #9
# Computes likelihoods of targets based on occurrences in corpus.


import lib.utility as u


if __name__ == "__main__":
    corpus_pos = u.get_lines("../data/imdb-sentences-pos.txt")
    corpus_neg = u.get_lines("../data/imdb-sentences-neg.txt")
    corpus = corpus_pos + corpus_neg
    
    # get total count of all words
    total_count = 0
    counts = {}
    for line in corpus:
        for word in set(line.strip().split()):
            if word.isalpha():
                if word in counts:
                    counts[word] += 1
                else:
                    counts[word] = 1
                total_count += 1
    
    targets = u.get_target_words()
    likelihoods = {}
    for t in targets:
        likelihoods[t] = counts[t] / float(total_count)
    
    #for key in sorted(likelihoods, key=likelihoods.get, reverse=False):
    #    print key, likelihoods[key]
    
import lib.utility as u
import config as c
import random
import itertools


if __name__ == "__main__":
    # Load target words into memory
    target_words = u.get_target_words()
    
    # Get highly related pairs of words to get agreement data for
    #pairs = u.get_lines(c.output_dir + "target-word-high-pairs.txt")
    
    # Store all sentences in memory
    sentences_with = set(u.get_lines(c.output_dir + "sentences-with.txt"))
    sentences_without = set(u.get_lines(c.output_dir + "sentences-without.txt"))
    
    # Get all sentences used already - using a dict (so we only care about sentences used for specific word)
    used_sentences = {}
    for word in target_words:
        with open(c.train_data_dir + word + ".txt") as file:
            sents = set()
            for line in file:
                sents.add(line[3:].strip())
            used_sentences[word] = sents
    
    # Get sentences with that we haven't used, and sentences without that we haven't used, using set difference
    #sentences_with_available = list(sentences_with.difference(used_sentences))
    sentences_without_available = list(sentences_without.difference(used_sentences))
    
        # Get individual words
        #pair = pair.split()[0:2]
        ti = pair[0]  # i
        tj = pair[1]  # j
        print ctr, "- i:", ti, "j:", tj

        # Get row and col for matrix
        row = target_words.index(ti)
        col = target_words.index(tj)
        #print row, col

        # Read pair test dataset into sents array - file could be with words in reversed order
        dir = c.output_dir + "agreement-data/"
        sents = []
        try:
            sents = u.get_lines(dir + ti + "-" + tj + ".txt")
        except:
            try:
                sents = u.get_lines(dir + tj + "-" + ti + ".txt")
            except:
                # exit program if no file found for these words
                print "No dataset for:", ti, tj
                #sys.exit("Can't find dataset file. Program terminating.")
                continue

        # Convert sents to binary feature vectors
        data = []
        for sentence in sents:
            vect = numpy.zeros(feat_size)
            for w in sentence.strip().split()[1:]:
                # ignore target words in feature vectors
# Take in pairs of targets and their cosine scores and select the highest N pairs
# to then compute agreement in normal way

import lib.utility as u
import config as c

if __name__ == "__main__":
    # Read in all pairs with cosine scores
    pairs = u.get_lines(c.output_dir + "edges-cosine.txt")

    # Iterate each pair and add to dictionary, where key is tuple and
    # value is score - then we can sort by value
    pairs_dict = {}
    for p in pairs:
        p = p.split()
        t1 = p[0]
        t2 = p[1]
        score = float(p[2])

        # check reverse isn't in dict already before inserting
        if (t2, t1) not in pairs_dict:
            pairs_dict[(t1, t2)] = score

    #total = len(pairs_dict)
    #print "total number of pairs =", total

    # Set N (number of pairs to select)
    N = 5000
    print "chosen number of pairs =", N

    # Iterate through sorted pairs, and take top N
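
The snippet stops at the comment about taking the top N pairs. A small sketch of that selection, reusing the sorted(d, key=d.get, reverse=True) idiom seen in the earlier snippets; the function name is illustrative.

def top_n_pairs(pairs_dict, n):
    # Rank (t1, t2) keys by their cosine score, highest first, and keep the first n
    ranked = sorted(pairs_dict, key=pairs_dict.get, reverse=True)
    return ranked[:n]

# e.g. for ctr, pair in enumerate(top_n_pairs(pairs_dict, N)): ...
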
Example #13
# Get test data for sample of target words to evaluate accuracy.
# Similar to get-data-train.py, but more to do here as we have to ensure not to select those already in train
# set, meaning there are fewer potential sentences.


import lib.utility as u
import config as c
import random


if __name__ == "__main__":
    # Hard coded list of words to test with
    target_words = ['wonderful', 'love', 'excellent', 'great', 'classic', 'terrible', 'boring', 'worst', 'stupid', 'crap']
    
    # Store all sentences in memory
    sentences_with = u.get_lines(c.output_dir + "sentences-with.txt")
    sentences_without = u.get_lines(c.output_dir + "sentences-without.txt")
    all_sentences = sentences_with + sentences_without
    
    # For each target word, find sample sentences (pos and neg)
    # N is number of pos/neg instances to select
    N = 500
    i = 0
    for word in target_words:
        #print "for:", word
        
        # Shuffle data first
        random.shuffle(sentences_with)
        random.shuffle(all_sentences)
        
        # Get train set in an array to check we don't get a sentence we already have
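
The test-data snippet is cut off where the existing train set is loaded. A sketch of that check, reusing the line[3:] label format and c.train_data_dir seen in the other scripts; the sampling helper and its name are assumptions.

def sample_unused(word, candidates, n):
    # Sentences already used in this word's train file (the label occupies the
    # first three characters of each line, as in the scripts above)
    used = set()
    with open(c.train_data_dir + word + ".txt") as f:
        for line in f:
            used.add(line[3:].strip())
    # Keep only candidate sentences not already in the train set
    fresh = [s for s in candidates if s.strip() not in used]
    return fresh[:n]

# e.g. test_pos = sample_unused(word, sentences_with, N)
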
Example #14
# Computes frequency of features (from feature space) in Large Movie Review dataset.

import lib.utility as u
import math

if __name__ == "__main__":
    corpus_neg = u.get_lines("imdb-all-sentences-neg.txt")
    corpus_pos = u.get_lines("imdb-all-sentences-pos.txt")
    corpus = corpus_neg + corpus_pos

    # get total count of all docs
    total_docs = len(corpus)
    print "total docs:", total_docs

    # get features from feat-space file
    features = {}
    with open("output-700/feat-space.txt") as file:
        line = file.readline()
        for term in line.strip().split():
            features[term] = 0

    # go through corpus and count in how many docs each term appears
    for line in corpus:
        # we need to remove all the labels and ":1" after each term on each line
        line = set(line.split()[1:])

        # go through each line as a set - so no duplicates
        for term in line:
            term = term.split(":")[0]
            try:
                features[term] += 1
            except:
                #print term, "not in feature space"
                continue
Example #15
# Computes idf of features to use for feature values for sentiment classifier dataset.

import lib.utility as u
import math

if __name__ == "__main__":
    corpus = u.get_lines("train-sample.txt")

    # get total count of all docs
    total_docs = len(corpus)

    # get features from feat-space file
    features = {}
    with open("feat-space-sent-prefix.txt") as file:
        line = file.readline()
        for term in line.strip().split():
            features[term] = 0

    # go through corpus and count in how many docs each term appears
    for line in corpus:
        # we need to remove all the labels and ":1" after each term on each line
        line = set(line.split()[1:])

        # go through each line as a set - so no duplicates
        for term in line:
            term = term.split(":")[0]
            try:
                features[term] += 1
            except:
                #print term, "not in feature space"
                continue
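
    # Sketch of the final step, which is not shown in the snippet: convert the
    # document frequencies to idf, assuming the plain idf = log(total_docs / df)
    # definition (the author's exact formula or smoothing is not visible here).
    idfs = {}
    for term, df in features.items():
        if df > 0:
            idfs[term] = math.log(total_docs / float(df))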