Example No. 1
def learn_distributions(file_lists_by_category):
    """
    Estimate the parameters p_d, and q_d from the training set
    
    Input
    -----
    file_lists_by_category: A two-element list. The first element is a list of 
    spam files, and the second element is a list of ham files.

    Output
    ------
    probabilities_by_category: A two-element tuple. The first element is a dict 
    whose keys are words, and whose values are the smoothed estimates of p_d;
    the second element is a dict whose keys are words, and whose values are the 
    smoothed estimates of q_d 
    """
    num_spam_emails = len(file_lists_by_category[0])
    num_ham_emails = len(file_lists_by_category[1])
    
    spam_word_to_count = util.get_counts(file_lists_by_category[0])
    ham_word_to_count = util.get_counts(file_lists_by_category[1])
    
    p_d = {k:((v+1.0)/(num_spam_emails+2)) for (k, v) in spam_word_to_count.items()}
    q_d = {k:((v+1.0)/(num_ham_emails+2)) for (k, v) in ham_word_to_count.items()}
        
    probabilities_by_category = (p_d, q_d)
    return probabilities_by_category
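
A minimal usage sketch for the function above (not part of the original listing; the file paths are placeholders, and util.get_counts is assumed to map each word to the number of files containing it):

# Hypothetical call site: train on two small file lists and inspect one word.
spam_files = ["spam/0001.txt", "spam/0002.txt"]   # placeholder paths
ham_files = ["ham/0001.txt", "ham/0002.txt"]      # placeholder paths
p_d, q_d = learn_distributions([spam_files, ham_files])
print(p_d.get("offer"), q_d.get("offer"))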
Example No. 2
def learn_distributions(file_lists_by_category):
    """
    Estimate the parameters p_d, and q_d from the training set
    
    Input
    -----
    file_lists_by_category: A two-element list. The first element is a list of 
    spam files, and the second element is a list of ham files.
    Output
    ------
    probabilities_by_category: A two-element tuple. The first element is a dict 
    whose keys are words, and whose values are the smoothed estimates of p_d;
    the second element is a dict whose keys are words, and whose values are the 
    smoothed estimates of q_d 
    """
    ### TODO: Write your code here

    spam = file_lists_by_category[0]
    ham = file_lists_by_category[1]

    vocab = util.get_counts(ham + spam).keys()
    qdvalues = {}
    pdvalues = {}
    spamcounts = util.get_counts(spam)
    hamcounts = util.get_counts(ham)

    Ns = len(spam)
    Nh = len(ham)

    for word in vocab:
        pdvalues[word] = (spamcounts[word] + 1) / (Ns + 2)
        qdvalues[word] = (hamcounts[word] + 1) / (Nh + 2)
    probabilities_by_category = (pdvalues, qdvalues)

    return probabilities_by_category
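
Both versions above implement the same add-one (Laplace) rule: with N_spam spam e-mails and count_d the number of spam e-mails containing word d, the smoothed estimate is

    p_d = (count_d + 1) / (N_spam + 2)

and symmetrically for q_d over the ham e-mails. The +2 in the denominator corresponds to the two possible outcomes per e-mail (the word appears or it does not); this reading assumes util.get_counts returns per-file document frequencies rather than raw token counts.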
Example No. 3
def learn_distributions(file_lists_by_category):
    """
    Estimate the parameters p_d, and q_d from the training set
    
    Input
    -----
    file_lists_by_category: A two-element list. The first element is a list of 
    spam files, and the second element is a list of ham files.

    Output
    ------
    probabilities_by_category: A two-element tuple. The first element is a dict 
    whose keys are words, and whose values are the smoothed estimates of p_d;
    the second element is a dict whose keys are words, and whose values are the 
    smoothed estimates of q_d 
    """
    # TODO: Write your code here
    # File lists
    spam_files = file_lists_by_category[0]
    ham_files = file_lists_by_category[1]
    # Target distributions
    pd = util.Counter()
    qd = util.Counter()
    # The number of times each word occurs in specific bag
    counts_in_spam = util.get_counts(spam_files)
    counts_in_ham = util.get_counts(ham_files)
    # SPAM bag size and HAM bag size
    spam_bag_size = sum(list(counts_in_spam.values()))
    ham_bag_size = sum(list(counts_in_ham.values()))
    # Dictionary
    dictionary = set(list(counts_in_spam.keys()) + list(counts_in_ham.keys()))
    # Assign distributions
    for word in dictionary:
        # Add-one (Laplace) smoothing: every word in the dictionary contributes one pseudo-count
        pd[word] = (counts_in_spam[word] + 1) / (spam_bag_size +
                                                 len(dictionary))
        qd[word] = (counts_in_ham[word] + 1) / (ham_bag_size + len(dictionary))
    """
    # Sanity Check
    
    s = 0
    for word in pd:
        s += pd[word]
    print("total pd: {}".format(s))

    s = 0
    for word in qd:
        s += qd[word]
    print("total qd: {}".format(s))
    """
    return pd, qd
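
The commented-out sanity check should print totals of exactly 1.0: the numerators sum to spam_bag_size + len(dictionary), which is precisely the denominator used above. A runnable version of that check (hypothetical; it assumes file_lists_by_category is already in scope):

# Both smoothed distributions should sum to 1 under bag-size smoothing.
pd, qd = learn_distributions(file_lists_by_category)
print("total pd: {:.6f}".format(sum(pd.values())))
print("total qd: {:.6f}".format(sum(qd.values())))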
Example No. 4
def generate_lsh_graph(data_set, num_hashes=3, num_bits=5, verbose=False):
    hashers = MultiLSHasher(num_hashes, num_bits)
    if verbose: print 'Hashers initialized'

    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]

    doc_features = {}
    word_counts = Counter()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0])
            word = int(row[1])
            count = float(row[2])
            word_counts[word] += 1
            if doc not in doc_features:
                doc_features[doc] = []
            doc_features[doc].append((word, count))
    if verbose: print 'Loaded doc features'

    for doc, features in doc_features.items():
        if type(features[0]) is float:
            break
        feature_tfidf = []
        for w, c in features:
            tfidf = math.log(c + 1) * math.log(
                num_docs / float(word_counts[w]))
            feature_tfidf.append((w, tfidf))
        doc_features[doc] = feature_tfidf

    hashers.compute_stream(doc_features)
    signatures = hashers.compute_signatures()
    if verbose: print 'Computed signatures'

    doc_features = {}
    words_doc_count = Counter()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0])
            count = float(row[2]) if '.' in row[2] else int(row[2])
            for hl, s in signatures.items():
                word = str(row[1]) + hl + s[doc]
                words_doc_count[word] += 1
                if doc not in doc_features:
                    doc_features[doc] = []
                doc_features[doc].append((word, count))
    if verbose: print 'Generated hashed doc features'

    filename = '%s-lsh-h%db%d' % (data_set, num_hashes, num_bits)
    with open_graph_file(filename) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for doc, feature_counts in doc_features.items():
            for feature, count in feature_counts:
                tfidf = math.log(count + 1) * math.log(
                    num_docs / float(words_doc_count[feature]))
                datawriter.writerow([doc, feature, tfidf])
    if verbose: print 'Wrote graph file %s' % filename
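
Both passes above weight a (document, feature) pair with the same rule: a log-scaled count times the inverse document frequency. A stand-alone helper makes the formula explicit (the name tfidf_weight is ours, not part of the original module; it runs under both Python 2 and 3):

import math

def tfidf_weight(count, num_docs, doc_frequency):
    # log(count + 1) damps raw counts; log(num_docs / doc_frequency)
    # down-weights features that appear in many documents.
    return math.log(count + 1) * math.log(num_docs / float(doc_frequency))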
Example No. 5
def generate_lsh_graph(data_set, num_hashes=3, num_bits=5, verbose=False):
    hashers = MultiLSHasher(num_hashes, num_bits)
    if verbose: print 'Hashers initialized'

    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]

    doc_features = {}
    word_counts = Counter()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0])
            word = int(row[1])
            count = float(row[2])
            word_counts[word] += 1
            if doc not in doc_features:
                doc_features[doc] = []
            doc_features[doc].append((word, count))
    if verbose: print 'Loaded doc features'

    for doc, features in doc_features.items():
        if type(features[0]) is float:
            break
        feature_tfidf = []
        for w, c in features:
            tfidf = math.log(c+1) * math.log(num_docs/float(word_counts[w]))
            feature_tfidf.append((w,tfidf))
        doc_features[doc] = feature_tfidf

    hashers.compute_stream(doc_features)
    signatures = hashers.compute_signatures()
    if verbose: print 'Computed signatures'

    doc_features = {}
    words_doc_count = Counter()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0])
            count = float(row[2]) if '.' in row[2] else int(row[2])
            for hl, s in signatures.items():
                word = str(row[1]) + hl + s[doc]
                words_doc_count[word] += 1
                if doc not in doc_features:
                    doc_features[doc] = []
                doc_features[doc].append((word, count))
    if verbose: print 'Generated hashed doc features'

    filename = '%s-lsh-h%db%d' % (data_set, num_hashes, num_bits)
    with open_graph_file(filename) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for doc, feature_counts in doc_features.items():
            for feature, count in feature_counts:
                tfidf = math.log(count+1) * math.log(num_docs/float(
                    words_doc_count[feature]))
                datawriter.writerow([doc, feature, tfidf])
    if verbose: print 'Wrote graph file %s' % filename
Example No. 6
def learn_distributions(file_lists_by_category):
    """
    Estimate the parameters p_d, and q_d from the training set
    
    Input
    -----
    file_lists_by_category: A two-element list. The first element is a list of 
    spam files, and the second element is a list of ham files.

    Output
    ------
    probabilities_by_category: A two-element tuple. The first element is a dict 
    whose keys are words, and whose values are the smoothed estimates of p_d;
    the second element is a dict whose keys are words, and whose values are the 
    smoothed estimates of q_d 
    """
    ### TODO: Write your code here

    spam_list = file_lists_by_category[0]
    ham_list = file_lists_by_category[1]

    spam_counts = util.get_counts(spam_list)
    num_spam_words = len(spam_counts)

    ham_counts = util.get_counts(ham_list)
    num_ham_words = len(ham_counts)

    D = len(spam_counts.keys() & ham_counts.keys())

    p_d = dict()
    q_d = dict()

    for word in spam_counts:
        p_d[word] = (spam_counts[word] + 1) / (num_spam_words + D)

    p_d["default val"] = 1 / (num_spam_words + D)

    for word in ham_counts:
        q_d[word] = (ham_counts[word] + 1) / (num_ham_words + D)

    q_d["default val"] = 1 / (num_ham_words + D)

    probabilities_by_category = (p_d, q_d)

    return probabilities_by_category
Example No. 7
def learn_distributions(file_lists_by_category):
    """
    Estimate the parameters p_d, and q_d from the training set
    
    Input
    -----
    file_lists_by_category: A two-element list. The first element is a list of 
    spam files, and the second element is a list of ham files.

    Output
    ------
    probabilities_by_category: A two-element tuple. The first element is a dict 
    whose keys are words, and whose values are the smoothed estimates of p_d;
    the second element is a dict whose keys are words, and whose values are the 
    smoothed estimates of q_d 
    """
    spamfiles = file_lists_by_category[0]
    hamfiles = file_lists_by_category[1]
    w = []
    for spamfile in spamfiles:
        w.extend(util.get_words_in_file(spamfile))
        
    for hamfile in hamfiles:
        w.extend(util.get_words_in_file(hamfile))
        
#    n_spam = len(spam_words)
#    n_ham = len(ham_words)
    spam_count = util.get_counts(spamfiles)
    ham_count = util.get_counts(hamfiles)
    
    n = len(w)
    dict_spam = {wi : 0 for wi in w}
    dict_ham = {wi : 0 for wi in w}
    for key in dict_spam:
        dict_spam[key] = (spam_count[key]+1)/(n+2) 
        dict_ham[key] = (ham_count[key]+1)/(n+2)
                  
    probabilities_by_category = (dict_spam,dict_ham)

    
    
    return probabilities_by_category
Example No. 8
def get_estimates(unique_words, files):
    ret = dict()
    num_files = len(files)
    counter = util.get_counts(files)
    # total_words = 0
    # for word in counter:
    #     total_words += counter[word]

    for word in unique_words:
        ret[word] = (counter[word] + 1) / (num_files + 2)

    return ret
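
Given the (num_files + 2) denominator, get_estimates presumably treats each word as a per-file presence event, with util.get_counts returning the number of files in which each word occurs. A hypothetical call site (the names spam_files and ham_files are placeholders):

# Build a shared vocabulary, then estimate each class separately.
vocabulary = set(util.get_counts(spam_files)) | set(util.get_counts(ham_files))
p_d = get_estimates(vocabulary, spam_files)
q_d = get_estimates(vocabulary, ham_files)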
Example No. 9
def learn_distributions(file_lists_by_category):
    """
    Estimate the parameters p_d, and q_d from the training set
    
    Input
    -----
    file_lists_by_category: A two-element list. The first element is a list of 
    spam files, and the second element is a list of ham files.

    Output
    ------
    probabilities_by_category: A two-element tuple. The first element is a dict 
    whose keys are words, and whose values are the smoothed estimates of p_d;
    the second element is a dict whose keys are words, and whose values are the 
    smoothed estimates of q_d 
    """
    ### TODO: Write your code here

    spam_emails = file_lists_by_category[0]
    ham_emails = file_lists_by_category[1]

    spam_email_word_counts = util.get_counts(spam_emails)
    ham_email_word_counts = util.get_counts(ham_emails)

    file_list_train = list(itertools.chain.from_iterable(file_lists_by_category))
    N = len(file_list_train)
    vocabulary = set(util.get_counts(file_list_train).keys())
    D = len(vocabulary)

    words_p_d = {}
    words_q_d = {}

    for word in vocabulary:
        words_p_d[word] = (spam_email_word_counts[word] + 1) / (len(spam_emails) + 2)
        words_q_d[word] = (ham_email_word_counts[word] + 1) / (len(ham_emails) + 2)

    probabilities_by_category = (words_p_d, words_q_d)

    return probabilities_by_category
Example No. 10
def generate_labeled_baseline_graph(output_file, percentile=95, verbose=False):
    data_set = output_file.split('-')[0]
    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]
    test_data = []

    words_doc_count = Counter()
    for doc, features in get_new_doc_features(data_set, output_file, percentile).items():
        for word, count in features:
            words_doc_count[word] += 1
            test_data.append([doc, word, count])
    if verbose: print 'Loaded doc features'

    with open_graph_file(output_file) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for d, features in get_new_doc_features(data_set, output_file, percentile).items():
            for w, c in features:
                tfidf = math.log(c+1) * math.log(num_docs/float(words_doc_count[w]))
                datawriter.writerow([d, w, tfidf])
        if verbose: print 'Wrote graph file %s' % output_file
Example No. 11
def generate_baseline_graph(data_set, filename=None, verbose=False):
    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]
    test_data = []

    words_doc_count = Counter()
    for doc, features in get_doc_features(data_set).items():
        for word, count in features:
            words_doc_count[word] += 1
            test_data.append([doc, word, count])
    if verbose: print 'Loaded doc features'

    if not filename: filename = data_set + '-baseline'
    with open_graph_file(filename) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for d,w,c in test_data:
            if type(c) is float:
                datawriter.writerow([str(d), str(w) + 'w', c])
            else:
                tfidf = math.log(c+1) * math.log(num_docs/float(words_doc_count[w]))
                datawriter.writerow([str(d), str(w) + 'w', tfidf])
        if verbose: print 'Wrote graph file %s' % filename
Example No. 12
def generate_baseline_graph(data_set, filename=None, verbose=False):
    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]
    test_data = []

    words_doc_count = Counter()
    for doc, features in get_doc_features(data_set).items():
        for word, count in features:
            words_doc_count[word] += 1
            test_data.append([doc, word, count])
    if verbose: print 'Loaded doc features'

    if not filename: filename = data_set + '-baseline'
    with open_graph_file(filename) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for d, w, c in test_data:
            if type(c) is float:
                datawriter.writerow([str(d), str(w) + 'w', c])
            else:
                tfidf = math.log(c + 1) * math.log(
                    num_docs / float(words_doc_count[w]))
                datawriter.writerow([str(d), str(w) + 'w', tfidf])
        if verbose: print 'Wrote graph file %s' % filename
Example No. 13
def generate_labeled_baseline_graph(output_file, percentile=95, verbose=False):
    data_set = output_file.split('-')[0]
    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]
    test_data = []

    words_doc_count = Counter()
    for doc, features in get_new_doc_features(data_set, output_file,
                                              percentile).items():
        for word, count in features:
            words_doc_count[word] += 1
            test_data.append([doc, word, count])
    if verbose: print 'Loaded doc features'

    with open_graph_file(output_file) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for d, features in get_new_doc_features(data_set, output_file,
                                                percentile).items():
            for w, c in features:
                tfidf = math.log(c + 1) * math.log(
                    num_docs / float(words_doc_count[w]))
                datawriter.writerow([d, w, tfidf])
        if verbose: print 'Wrote graph file %s' % output_file
Example No. 14
def generate_knn_graphs(data_set, ks=[5,10,20,30,50,100], verbose=False):
    '''
    since we get a list of *all* the neighbors ordered by "nearness",
    it makes more sense to iterate through the different k's within
    the function rather than outside it
    '''
    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]

    max_k = max(ks)

    assert max_k < num_docs

    feature_matrix = np.matrix(np.zeros((num_docs, num_features)))
    words_doc_count = np.zeros(num_features)
    is_tfidf = False
    docs = set()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0]) - 1
            word = int(row[1]) - 1
            if is_tfidf:
                count = float(row[2])
            elif '.' in row[2]:
                count = float(row[2])
                is_tfidf = True
            else:
                count = int(row[2])
            words_doc_count[word] += 1
            docs.add(doc)
            feature_matrix.itemset((doc, word), count)
    if verbose: print 'Loaded test data'

    if verbose: print 'Generating feature matrix'
    if not is_tfidf:
        for doc in xrange(num_docs):
            if doc in docs:
                for word in xrange(num_features):
                    if words_doc_count[word] != 0:
                        count = feature_matrix.item((doc,word))
                        tfidf = math.log(count+1) * math.log(num_docs/float(words_doc_count[word]))
                        feature_matrix.itemset((doc,word), tfidf)
            if doc % 10 == 9:
                if verbose: print 'Processed %d out of %d documents' % (doc+1, num_docs)
    if verbose: print 'Generated feature matrix'

    normalizing_matrix = np.matrix(np.zeros((num_docs, num_docs)))
    for i in xrange(num_docs):
        f = feature_matrix[i]
        fft = math.sqrt(f * f.transpose())
        if fft < 1e-9:
            normalizing_matrix.itemset((i,i), 0.0)
        else:
            normalizing_matrix.itemset((i,i), 1.0 / fft)
    if verbose: print 'Generated normalizing matrix'

    if verbose: print 'Generating folded graph'
    edges = []
    N = normalizing_matrix
    F = feature_matrix
    doc_neighbors = {}
    for doc in xrange(num_docs):
        Nv = np.matrix(np.zeros((num_docs,1)))
        Nv.itemset(doc, N.item((doc, doc)))
        FtNv = F[doc].transpose() * N.item((doc,doc))
        doc_weights = np.array(N * (F * FtNv)).transpose()
        neighbors = np.argsort(doc_weights)[0]
        doc_neighbors[doc] = [(neighbor, doc_weights.item(neighbor)) for neighbor in neighbors[-max_k:]]
        if doc % 10 == 9:
            if verbose: print 'Processed %d out of %d documents' % (doc+1, num_docs)
    if verbose: print 'Generated folded graph'

    for k in ks:
        filename = '%s-knn-k%d' % (data_set, k)
        with open_graph_file(filename) as graph:
            datawriter = csv.writer(graph, delimiter='\t')
            for doc in xrange(num_docs):
                for neighbor, weight in doc_neighbors[doc][-k:]:
                    if weight >= 1e-9:
                        datawriter.writerow([str(doc+1), str(neighbor+1), weight])
            if verbose: print 'Wrote graph file %s' % filename
Example No. 15
def generate_knn_graph(data_set, k, verbose=False):
    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]

    assert k < num_docs

    feature_matrix = np.matrix(np.zeros((num_docs, num_features)))
    words_doc_count = np.zeros(num_features)
    is_tfidf = False
    docs = set()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0]) - 1
            word = int(row[1]) - 1
            if is_tfidf:
                count = float(row[2])
            elif '.' in row[2]:
                count = float(row[2])
                is_tfidf = True
            else:
                count = int(row[2])
            words_doc_count[word] += 1
            docs.add(doc)
            feature_matrix.itemset((doc, word), count)
    if verbose: print 'Loaded test data'

    if verbose: print 'Generating feature matrix'
    if not is_tfidf:
        for doc in xrange(num_docs):
            if doc in docs:
                for word in xrange(num_features):
                    if words_doc_count[word] != 0:
                        count = feature_matrix.item((doc,word))
                        tfidf = math.log(count+1) * math.log(num_docs/float(words_doc_count[word]))
                        feature_matrix.itemset((doc,word), tfidf)
            if doc % 10 == 9:
                if verbose: print 'Processed %d out of %d documents' % (doc+1, num_docs)
    if verbose: print 'Generated feature matrix'

    normalizing_matrix = np.matrix(np.zeros((num_docs, num_docs)))
    for i in xrange(num_docs):
        f = feature_matrix[i]
        fft = math.sqrt(f * f.transpose())
        if fft < 1e-9:
            normalizing_matrix.itemset((i,i), 0.0)
        else:
            normalizing_matrix.itemset((i,i), 1.0 / fft)
    if verbose: print 'Generated normalizing matrix'

    if verbose: print 'Generating folded graph'
    edges = []
    N = normalizing_matrix
    F = feature_matrix
    for doc in xrange(num_docs):
        Nv = np.matrix(np.zeros((num_docs,1)))
        Nv.itemset(doc, N.item((doc, doc)))
        FtNv = F[doc].transpose() * N.item((doc,doc))
        doc_weights = np.array(N * (F * FtNv)).transpose()
        nearest_neighbors = np.argsort(doc_weights)
        for neighbor in nearest_neighbors[0][-k:]:
            if doc_weights.item(neighbor) < 1e-9:
                continue
            edges.append(((doc+1, int(neighbor)+1), doc_weights.item(neighbor)))
        if doc % 10 == 9:
            if verbose: print 'Processed %d out of %d documents' % (doc+1, num_docs)
    if verbose: print 'Generated folded graph'

    filename = '%s-knn-k%d' % (data_set, k)
    with open_graph_file(filename) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for edge, weight in edges:
            datawriter.writerow([edge[0], edge[1], weight])
    if verbose: print 'Wrote graph file %s' % filename
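
For each document, doc_weights is row doc of the matrix N F F^T N: with N holding 1/||f_i|| on its diagonal and f_i the i-th row of the tf-idf feature_matrix, entry j works out to

    w_{doc,j} = (f_doc . f_j) / (||f_doc|| * ||f_j||),

i.e. the cosine similarity between the two documents' tf-idf vectors. The argsort therefore selects the k most similar documents, and the 1e-9 threshold drops neighbors with essentially no feature overlap (including the all-zero rows zeroed out by the normalizing matrix).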
Example No. 16
                     "_all.npy")

# loading the halo mass and group identification
Group_M_Mean200_fp = np.load(root + 'Group_M_Mean200_fp' + snap_dir +
                             '.npy') * 1.e10
SubhaloGrNr_fp = np.load(root + 'SubhaloGrNr_fp' + snap_dir + '.npy')
SubhaloPos_fp = np.load(root + 'SubhaloPos_fp' + snap_dir + '.npy') / 1.e3
GroupPos_fp = np.load(root + 'GroupPos_fp' + snap_dir + '.npy') / 1.e3
N_halos_fp = GroupPos_fp.shape[0]
inds_halo_fp = np.arange(N_halos_fp, dtype=int)
GroupEnv_fp = np.load(root + 'GroupEnv_fp' + snap_dir + '.npy')

# get parent indices of the centrals and their subhalo indices in the original array
unique_sub_grnr, firsts = np.unique(SubhaloGrNr_fp, return_index=True)

count_halo_col_fp, count_halo_cents_col_fp, count_halo_sats_col_fp = get_counts(
    SubhaloGrNr_fp, firsts, N_halos_fp, sub_id_col)
count_halo_sfg_fp, count_halo_cents_sfg_fp, count_halo_sats_sfg_fp = get_counts(
    SubhaloGrNr_fp, firsts, N_halos_fp, sub_id_sfg)
count_halo_all_fp, count_halo_cents_all_fp, count_halo_sats_all_fp = get_counts(
    SubhaloGrNr_fp, firsts, N_halos_fp, sub_id_all)


def get_env_pos(gal_inds, sub_grnr, sub_pos, group_env, group_inds,
                group_mass):

    # define mass bins
    log_min = 11.
    log_max = 15.
    N_bins = 41
    bin_edges = np.linspace(log_min, log_max, N_bins)
    bin_cents = (bin_edges[1:] + bin_edges[:-1]) * .5
Example No. 17
def generate_knn_graphs(data_set, ks=[5, 10, 20, 30, 50, 100], verbose=False):
    '''
    since we get a list of *all* the neighbors ordered by "nearness",
    it makes more sense to iterate through the different k's within
    the function rather than outside it
    '''
    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]

    max_k = max(ks)

    assert max_k < num_docs

    feature_matrix = np.matrix(np.zeros((num_docs, num_features)))
    words_doc_count = np.zeros(num_features)
    is_tfidf = False
    docs = set()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0]) - 1
            word = int(row[1]) - 1
            if is_tfidf:
                count = float(row[2])
            elif '.' in row[2]:
                count = float(row[2])
                is_tfidf = True
            else:
                count = int(row[2])
            words_doc_count[word] += 1
            docs.add(doc)
            feature_matrix.itemset((doc, word), count)
    if verbose: print 'Loaded test data'

    if verbose: print 'Generating feature matrix'
    if not is_tfidf:
        for doc in xrange(num_docs):
            if doc in docs:
                for word in xrange(num_features):
                    if words_doc_count[word] != 0:
                        count = feature_matrix.item((doc, word))
                        tfidf = math.log(count + 1) * math.log(
                            num_docs / float(words_doc_count[word]))
                        feature_matrix.itemset((doc, word), tfidf)
            if doc % 10 == 9:
                if verbose:
                    print 'Processed %d out of %d documents' % (doc + 1,
                                                                num_docs)
    if verbose: print 'Generated feature matrix'

    normalizing_matrix = np.matrix(np.zeros((num_docs, num_docs)))
    for i in xrange(num_docs):
        f = feature_matrix[i]
        fft = math.sqrt(f * f.transpose())
        if fft < 1e-9:
            normalizing_matrix.itemset((i, i), 0.0)
        else:
            normalizing_matrix.itemset((i, i), 1.0 / fft)
    if verbose: print 'Generated normalizing matrix'

    if verbose: print 'Generating folded graph'
    edges = []
    N = normalizing_matrix
    F = feature_matrix
    doc_neighbors = {}
    for doc in xrange(num_docs):
        Nv = np.matrix(np.zeros((num_docs, 1)))
        Nv.itemset(doc, N.item((doc, doc)))
        FtNv = F[doc].transpose() * N.item((doc, doc))
        doc_weights = np.array(N * (F * FtNv)).transpose()
        neighbors = np.argsort(doc_weights)[0]
        doc_neighbors[doc] = [(neighbor, doc_weights.item(neighbor))
                              for neighbor in neighbors[-max_k:]]
        if doc % 10 == 9:
            if verbose:
                print 'Processed %d out of %d documents' % (doc + 1, num_docs)
    if verbose: print 'Generated folded graph'

    for k in ks:
        filename = '%s-knn-k%d' % (data_set, k)
        with open_graph_file(filename) as graph:
            datawriter = csv.writer(graph, delimiter='\t')
            for doc in xrange(num_docs):
                for neighbor, weight in doc_neighbors[doc][-k:]:
                    if weight >= 1e-9:
                        datawriter.writerow(
                            [str(doc + 1),
                             str(neighbor + 1), weight])
            if verbose: print 'Wrote graph file %s' % filename
Example No. 18
def generate_knn_graph(data_set, k, verbose=False):
    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]

    assert k < num_docs

    feature_matrix = np.matrix(np.zeros((num_docs, num_features)))
    words_doc_count = np.zeros(num_features)
    is_tfidf = False
    docs = set()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0]) - 1
            word = int(row[1]) - 1
            if is_tfidf:
                count = float(row[2])
            elif '.' in row[2]:
                count = float(row[2])
                is_tfidf = True
            else:
                count = int(row[2])
            words_doc_count[word] += 1
            docs.add(doc)
            feature_matrix.itemset((doc, word), count)
    if verbose: print 'Loaded test data'

    if verbose: print 'Generating feature matrix'
    if not is_tfidf:
        for doc in xrange(num_docs):
            if doc in docs:
                for word in xrange(num_features):
                    if words_doc_count[word] != 0:
                        count = feature_matrix.item((doc, word))
                        tfidf = math.log(count + 1) * math.log(
                            num_docs / float(words_doc_count[word]))
                        feature_matrix.itemset((doc, word), tfidf)
            if doc % 10 == 9:
                if verbose:
                    print 'Processed %d out of %d documents' % (doc + 1,
                                                                num_docs)
    if verbose: print 'Generated feature matrix'

    normalizing_matrix = np.matrix(np.zeros((num_docs, num_docs)))
    for i in xrange(num_docs):
        f = feature_matrix[i]
        fft = math.sqrt(f * f.transpose())
        if fft < 1e-9:
            normalizing_matrix.itemset((i, i), 0.0)
        else:
            normalizing_matrix.itemset((i, i), 1.0 / fft)
    if verbose: print 'Generated normalizing matrix'

    if verbose: print 'Generating folded graph'
    edges = []
    N = normalizing_matrix
    F = feature_matrix
    for doc in xrange(num_docs):
        Nv = np.matrix(np.zeros((num_docs, 1)))
        Nv.itemset(doc, N.item((doc, doc)))
        FtNv = F[doc].transpose() * N.item((doc, doc))
        doc_weights = np.array(N * (F * FtNv)).transpose()
        nearest_neighbors = np.argsort(doc_weights)
        for neighbor in nearest_neighbors[0][-k:]:
            if doc_weights.item(neighbor) < 1e-9:
                continue
            edges.append(
                ((doc + 1, int(neighbor) + 1), doc_weights.item(neighbor)))
        if doc % 10 == 9:
            if verbose:
                print 'Processed %d out of %d documents' % (doc + 1, num_docs)
    if verbose: print 'Generated folded graph'

    filename = '%s-knn-k%d' % (data_set, k)
    with open_graph_file(filename) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for edge, weight in edges:
            datawriter.writerow([edge[0], edge[1], weight])
    if verbose: print 'Wrote graph file %s' % filename