Exemplo n.º 1
0
def itertextindex(index_or_dirname, indexname, docnum_field):
    """Yield a header tuple, then one tuple of stored field values per
    document from a Whoosh text index.

    `index_or_dirname` may be either a directory path (the index is
    opened read-only here and closed again when iteration finishes) or an
    already open ``whoosh.index.Index`` (left open for the caller).
    When `docnum_field` is not None it is prepended to the header and
    each row carries the Whoosh document number as its first element.

    Raises ArgumentError if `index_or_dirname` is neither a string nor
    an Index instance.
    """
    import whoosh.index

    if isinstance(index_or_dirname, string_types):
        # a directory name was given: open (and later close) the index
        index = whoosh.index.open_dir(index_or_dirname,
                                      indexname=indexname,
                                      readonly=True)
        needs_closing = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
        needs_closing = False
    else:
        raise ArgumentError('expected string or index, found %r' %
                            index_or_dirname)

    try:
        # compute the stored field names once; both the header and the
        # per-row extractor use the same ordering
        stored_names = tuple(index.schema.stored_names())
        astuple = operator.itemgetter(*stored_names)

        if docnum_field is None:
            yield stored_names
            for _, stored_fields_dict in index.reader().iter_docs():
                yield astuple(stored_fields_dict)
        else:
            yield (docnum_field, ) + stored_names
            for docnum, stored_fields_dict in index.reader().iter_docs():
                yield (docnum, ) + astuple(stored_fields_dict)

    finally:
        if needs_closing:
            # close the index only if we're the ones who opened it
            index.close()
Exemplo n.º 2
0
def itertextindex(index_or_dirname, indexname, docnum_field):
    """Iterate a Whoosh text index as table rows.

    Yields a header tuple of stored field names followed by one tuple of
    stored values per document. Accepts either a directory name (the
    index is opened read-only and closed afterwards) or an open
    ``whoosh.index.Index`` (left open). If `docnum_field` is given, the
    Whoosh document number becomes the first column of every row.
    """
    import whoosh.index

    opened_here = False
    if isinstance(index_or_dirname, string_types):
        # caller passed a path: we own the index lifetime
        index = whoosh.index.open_dir(index_or_dirname, indexname=indexname,
                                      readonly=True)
        opened_here = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
    else:
        raise ArgumentError('expected string or index, found %r'
                            % index_or_dirname)

    try:
        if docnum_field is None:
            yield tuple(index.schema.stored_names())
            extract = operator.itemgetter(*index.schema.stored_names())
            for _, stored in index.reader().iter_docs():
                yield extract(stored)
        else:
            yield (docnum_field,) + tuple(index.schema.stored_names())
            extract = operator.itemgetter(*index.schema.stored_names())
            for docnum, stored in index.reader().iter_docs():
                yield (docnum,) + extract(stored)

    except:
        raise

    finally:
        # close the index only when this function opened it
        if opened_here:
            index.close()
Exemplo n.º 3
0
def key_terms(storage, schema):
    index = storage.open_index(schema=schema)
    ixreader = index.reader()
    searcher = index.searcher()
    docnums = []
    KEY_LEN = 500
    DOC_LEN = 1000
    for id in xrange(DOC_LEN):
        docnums.append(id)
    #for id in ixreader.all_doc_ids():
    #    print id,
    terms = {}
    i = 0
    for term, score in searcher.key_terms(docnums, content_field_name,
                                          KEY_LEN):
        terms[term] = i
        i += 1
    print 'key_terms finished'

    ar = np.zeros((len(docnums), KEY_LEN))
    for i in xrange(DOC_LEN):
        term_weights = ixreader.vector_as("weight", i, content_field_name)
        all_weight = 0
        n = 0
        for term, weight in term_weights:
            if term in terms:
                ar[i][terms[term]] = weight
                all_weight += weight
                n += 1
        for j in xrange(KEY_LEN):
            ar[i][j] = ar[i][j] / weight

    u, s, v = lin.svd(ar, full_matrices=False)
    data = u[:, 0:100]
    print 'svd finished'

    k = KMeans(init='k-means++', n_init=10)
    k.fit(data)
    #centroids = k.cluster_centers_
    labels = k.labels_
    print 'kmeans finished'

    #af = AffinityPropagation(affinity="euclidean").fit(data)
    #cluster_centers_indices = af.cluster_centers_indices_
    #labels = af.labels_

    doc_arr = np.array(range(DOC_LEN))
    for i in range(np.max(labels)):
        print 'group:', (i + 1)
        for doc_num in doc_arr[labels == i]:
            print ixreader.stored_fields(doc_num).get(
                'id'), ixreader.stored_fields(doc_num).get('title').split(
                    '|')[0] + '/',
        print '\n'
Exemplo n.º 4
0
def key_terms(storage, schema):
    index = storage.open_index(schema=schema)
    ixreader = index.reader()
    searcher = index.searcher()
    docnums = []
    KEY_LEN = 500
    DOC_LEN = 1000
    for id in xrange(DOC_LEN):
        docnums.append(id)
    #for id in ixreader.all_doc_ids():
    #    print id,
    terms = {}
    i = 0
    for term,score in searcher.key_terms(docnums, content_field_name, KEY_LEN):
        terms[term] = i
        i += 1
    print 'key_terms finished'

    ar = np.zeros( (len(docnums), KEY_LEN) )
    for i in xrange(DOC_LEN):
        term_weights = ixreader.vector_as("weight", i, content_field_name)
        all_weight = 0
        n = 0
        for term,weight in term_weights:
            if term in terms:
                ar[i][terms[term]] = weight
                all_weight += weight
                n += 1
        for j in xrange(KEY_LEN):
            ar[i][j] = ar[i][j]/weight
    
    u,s,v = lin.svd(ar, full_matrices=False)
    data = u[:,0:100]
    print 'svd finished'

    k = KMeans(init='k-means++', n_init=10)
    k.fit(data)
    #centroids = k.cluster_centers_
    labels = k.labels_
    print 'kmeans finished'

    #af = AffinityPropagation(affinity="euclidean").fit(data)
    #cluster_centers_indices = af.cluster_centers_indices_
    #labels = af.labels_
    
    doc_arr = np.array(range(DOC_LEN))
    for i in range(np.max(labels)):
        print 'group:', (i+1)
        for doc_num in doc_arr[labels==i]:
            print ixreader.stored_fields(doc_num).get('id'), ixreader.stored_fields(doc_num).get('title').split('|')[0]+ '/',
        print '\n'