示例#1
0
def write(plist, url, title, config):
    '''
    RQ worker function which adds the given document posting list data to the
    inverted index.

    The document is recorded only when the document list currently holds
    fewer than the configured maximum number of entries and the document id
    (the hex MD5 digest of ``url``) is not already in the document list.
    Otherwise the call does nothing.

    plist  -- posting list data for the document; handed unchanged to
              ``InvertedIndex.append``.
    url    -- document URL, as text or as a byte string. Text is encoded as
              UTF-8 before hashing.
    title  -- document title, stored next to the URL in the document list.
    config -- object exposing ``get(section, option)``; the options read are
              ``crawler/max_docs``, ``indexer/term_dict_file`` and
              ``indexer/doc_list_file``.

    Returns None.
    '''
    max_docs = int(config.get('crawler', 'max_docs'))
    term_dict_file = config.get('indexer', 'term_dict_file')
    doc_list_file = config.get('indexer', 'doc_list_file')

    dl = DocList(doc_list_file)
    if len(dl) >= max_docs:
        # Document quota reached: nothing more gets indexed.
        return

    # md5() hashes bytes. Encoding text explicitly keeps the digest of ASCII
    # URLs unchanged while also accepting non-ASCII text URLs, which would
    # otherwise raise (implicit ASCII encoding on Python 2, TypeError for str
    # on Python 3).
    url_bytes = url if isinstance(url, bytes) else url.encode('utf-8')
    did = md5(url_bytes).hexdigest()

    if did in dl:
        # Already recorded: avoid indexing the same document twice.
        return

    dl.append(url, title)

    iidx = InvertedIndex(term_dict_file, doc_list_file)
    iidx.append(plist, did)
    iidx.update()
示例#2
0
def write(plist, url, title, config):
    '''
    RQ worker function which adds the given document posting list data to the
    inverted index.

    Nothing is written when the document list already contains at least
    ``crawler/max_docs`` entries, or when the document id (hex MD5 digest of
    ``url``) is already present in the document list.

    plist  -- posting list data for the document; passed as-is to
              ``InvertedIndex.append``.
    url    -- document URL (text or byte string); text is UTF-8 encoded
              before it is hashed into the document id.
    title  -- document title, appended to the document list with the URL.
    config -- object exposing ``get(section, option)``; reads
              ``crawler/max_docs``, ``indexer/term_dict_file`` and
              ``indexer/doc_list_file``.

    Returns None.
    '''
    doc_limit = int(config.get('crawler', 'max_docs'))
    term_dict_path = config.get('indexer', 'term_dict_file')
    doc_list_path = config.get('indexer', 'doc_list_file')

    doc_list = DocList(doc_list_path)
    if len(doc_list) >= doc_limit:
        # The configured document cap has been reached.
        return

    # Hash functions operate on bytes, so text URLs are encoded explicitly.
    # ASCII URLs keep the digest they had before; non-ASCII text URLs no
    # longer fail inside md5().
    if isinstance(url, bytes):
        raw_url = url
    else:
        raw_url = url.encode('utf-8')
    did = md5(raw_url).hexdigest()

    if did in doc_list:
        # This document is already known; skip it.
        return

    doc_list.append(url, title)

    iidx = InvertedIndex(
        term_dict_path,
        doc_list_path
    )
    iidx.append(plist, did)
    iidx.update()
示例#3
0
from tokenizer import DocProcessor
from indexer import InvertedIndex

# Document id -> path of the HTML file to index.
docs = {
    1: '/home/ubuntu/eecs767/var/docs/doc1.html',
    2: '/home/ubuntu/eecs767/var/docs/doc2.html',
    3: '/home/ubuntu/eecs767/var/docs/doc3.html',
    4: '/home/ubuntu/eecs767/var/docs/doc4.html',
    5: '/home/ubuntu/eecs767/var/docs/doc5.html',
}

dproc = DocProcessor()
iidx = InvertedIndex()

# Parse each document, build its posting list and push it into the index.
# items() and print() with a single pre-formatted string behave the same on
# Python 2 and Python 3, unlike iteritems() and the print statement.
for did, doc in docs.items():
    print('-- Processing Doc #%s: %s' % (did, doc))
    dproc.parse(doc)
    plist = dproc.gen_posting_list()

    iidx.append(plist, did)
    iidx.update()
    # NOTE(review): presumably drops the in-memory state so the next
    # document starts clean -- confirm against InvertedIndex.clear().
    iidx.clear()