Example No. 1
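# Stream the documents, map every distinct word to the ordinal ids of the
# documents that contain it, and collect the document URLs in id order
# (docreader, doc2words and collections.defaultdict are assumed to be imported).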
def get_data():
    reader = DocumentStreamReader(docreader.parse_command_line().files)
    terms = defaultdict(list)
    ind = 0
    urls = []
    for doc in reader:
        for word in set(doc2words.extract_words(doc.text)):
            terms[word].append(ind)
        ind += 1
        urls.append(doc.url)
    return terms, urls
Example No. 2
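# Build two dictionaries in one pass: id_url maps the stringified doc id to its
# URL, and term_doc maps each word to the increasing list of doc ids containing it.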
def make_dictionary_urlid():
    id_url = {}
    term_doc = {}
    reader = DocumentStreamReader(parse_command_line().files)
    i = 0
    for doc in reader:
        id_url[str(i)] = doc.url
        for word in extract_words(doc.text):
            if word not in term_doc:
                term_doc[word] = [i]
            elif term_doc[word][-1] != i:
                term_doc[word].append(i)
        i += 1
    return term_doc, id_url
Example No. 3
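# Index the collection in max_chunk_num passes to bound memory use: each pass
# keeps posting lists only for its share of the vocabulary, encodes them with
# the chosen Coder, and pickles the chunk to index<N>.pkl (docreader,
# doc2words, pickle, array and Coder are assumed to be imported elsewhere).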
def main(variant):
    with open('variant', 'w') as f:
        f.write(variant)

    encoder = Coder(variant)
    paths = []
    chunk_num = 0
    max_chunk_num = 2

    while True:
        tokens = {}
        i = 1
        if chunk_num == max_chunk_num:
            break

        documents = docreader.DocumentStreamReader(
            docreader.parse_command_line().files)
        for doc in documents:
            if chunk_num == 0:
                paths.append(doc.url)

            words = doc2words.extract_words(doc.text)

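            # Append to posting lists of words already tracked; start a new list
            # only for words assigned to this chunk by len(word) % max_chunk_num.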
            for word in set(words):
                if word in tokens:
                    tokens[word].append(i)
                elif len(word) % max_chunk_num == chunk_num:
                    tokens[word] = array('l', [i])

            i += 1

        for token in tokens:
            tokens[token] = encoder.encode(tokens[token])

        with open('index{}.pkl'.format(chunk_num), 'wb') as f:
            pickle.dump(tokens, f)

        chunk_num += 1

    with open('paths.pkl', 'wb') as f:
        pickle.dump(paths, f)
Example No. 4
import sys
import codecs
import docreader
import pickle
from doc2words import extract_words
from collections import defaultdict


def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


sys.stdout = codecs.getwriter('utf8')(sys.stdout)
sys.stderr = codecs.getwriter('utf8')(sys.stderr)
reader = docreader.DocumentStreamReader(docreader.parse_command_line().files)
encoder_type = docreader.parse_command_line().encoder
fd = open("encoder.txt", "w")
fd.write(encoder_type)
fd.close()

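# Enumerate the documents, remember each doc id -> URL, and append the doc id
# to the posting list of every distinct term in the document.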
URLs = {}
InvIndex = defaultdict(list)
for idx, doc in enumerate(reader):
    URLs[idx] = doc.url
    Terms = list(sorted(set(extract_words(doc.text))))
    for term in Terms:
        InvIndex[term].append(idx)

save_obj(InvIndex, "index")
save_obj(URLs, "urls")
Example No. 5
# -*- coding: utf-8 -*-

import docreader
import doc2words
import varbyte
import simple9
import pickle
import mmh3
import json

#urls = docreader.DocumentStreamReader(["../dataset/lenta.ru_4deb864d-3c46-45e6-85f4-a7ff7544a3fb_01.gz"])
arg = docreader.parse_command_line().files
reader = docreader.DocumentStreamReader(arg[1:])
encoder_str = arg[0]

if encoder_str == 'varbyte':
    encoder = varbyte
elif encoder_str == 'simple9':
    encoder = simple9

#for i in urls:
#    print i.text.encode("utf-8")
#    break

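# Number the documents consecutively and collect their URLs in order.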
term_dictionary = {}
url_list = []

doc_id = 0
for doc in reader:
    doc_id += 1
    url_list.append(doc.url)
Example No. 6
import docreader
from docreader import DocumentStreamReader
import index_creation
import bitstream
import cPickle
import mmhash
import dict_hash

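# Scan every document, varbyte-compress each term's posting list into a single
# byte blob, and store the (offset, length) of each list keyed by the murmur
# hash of the term; the URLs are pickled separately.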
if __name__ == '__main__':
    reader = DocumentStreamReader(docreader.parse_command_line().files)
    index = index_creation.Url_Index()
    for doc in reader:
        index.scan_text(doc)
    blob = []
    term = dict()
    for k, v in index.terms.iteritems():
        prev_len = len(blob)
        compr = bitstream.compress_varbyte(v)
        blob.extend(compr)
        term[mmhash.get_unsigned_hash(
            k.encode('utf8'))] = [prev_len, len(compr)]

    with open("index.txt", "wb") as index_file:
        index_file.write(bytearray(blob))

    with open("url_file.txt", "wb") as url_file:
        cPickle.dump(index.url, url_file)

    dict_hash.store_dict(term)
Example No. 7
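# add_doc puts the document URL into the urls string store and records the
# value returned by urls.add_string in url_index.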
def add_doc(url):
    url_index.add_int(urls.add_string(url))


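# Return the id already assigned to a term, or allocate the next id for a term
# seen for the first time.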
def get_wordid(term):
    res = words.get_from_dict(term)
    if res is None:
        global word_count
        word_count += 1
        words.add(term, word_count)
        return word_count
    return res


if __name__ == '__main__':
    cmd = parse_command_line()
    reader = DocumentStreamReader(cmd.files)
    if cmd.code[0] == "varbyte":
        index = VarByte("docindex")
    else:
        index = Simple9("docindex")

    doc_count = -1

    for doc in reader:
        doc_count += 1
        add_doc(doc.url)

        terms = set(extract_words(doc.text))

        for term in terms:
Example No. 8

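# Read the pickled index and the id mapping back from disk.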
def load():
    fin = open('back_index.bin', 'r')
    data = pickle.load(fin)
    fin.close()

    fin = open('ids.bin', 'r')
    ids = pickle.load(fin)
    fin.close()

    return data, ids


if __name__ == '__main__':
    parsed_line = parse_command_line().files

    try:
        os.remove('varbyte.bin')
    except OSError:
        pass

    try:
        os.remove('simple9.bin')
    except OSError:
        pass

    if parsed_line[0] == 'varbyte':
        with open('varbyte.bin', 'wb') as f:
            f.write('a')
    else: