def create_model(f, type='bigram'):
    res = {}
    for line in f:
        words = None
        parts = line.split('\t')
        if len(parts) == 1:
            continue
        elif len(parts) != 2:
            raise ValueError
        if type == 'unigram':
            add_uni_query(
                res,
                '='.join(doc2words.extract_words(parts[1].decode('utf8'))),
                '='.join(doc2words.extract_words(parts[0].decode('utf8'))))
        elif type == 'bigram':
            add_bi_query(
                res,
                '='.join(doc2words.extract_words(parts[1].decode('utf8'))),
                '='.join(doc2words.extract_words(parts[0].decode('utf8'))))
        else:
            raise ValueError
    if type == 'unigram':
        for k in res.keys():
            res[k] = np.log(res[k])
    elif type == 'bigram':
        for k in res.keys():
            for k1 in res[k].keys():
                for k2 in res[k][k1].keys():
                    # N is assumed to be a module-level total count of observations
                    res[k][k1][k2] = np.log(float(res[k][k1][k2]) / N)
    return res
def TakeShingles(text):
    words = doc2words.extract_words(text)
    shingles = []
    # len(words) - 4 so that the last full 5-word shingle is included
    for i in xrange(len(words) - 4):
        shingle = ' '.join(words[i:i + 5])
        shingles.append(mmh3.hash(shingle.encode('utf-8')))
    return shingles
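# A hypothetical usage sketch (not in the original code): shingles like these are
# typically used for near-duplicate detection; the Jaccard similarity below is an
# illustration only.
def shingle_similarity(text_a, text_b):
    a, b = set(TakeShingles(text_a)), set(TakeShingles(text_b))
    if not a or not b:
        return 0.0
    return float(len(a & b)) / len(a | b)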
def scan_text(self, doc):
    for word in set(doc2words.extract_words(doc.text)):
        self.terms[word].append(self.ind)
    self.ind += 1
    self.url.append(doc.url)
def create_dict_varbyte(dict_file_name, support_file_name, info_file_name,
                        url_file_name, reader, if_continue):
    url_file = open(url_file_name, 'a')
    if not if_continue:
        # hash of term -> string of bytes in the chosen encoding
        # (the string represents the list of delta-gaps between docids)
        dict_ = defaultdict(str)
        # hash of term -> last docid in its list of deltas
        last_docid = defaultdict(int)
        # maximum docid seen so far
        max_docid = 0
        # total size of all posting lists
        lists_size = 0
        max_docid_prev = 0
    else:
        dict_pickle_file = open(dict_file_name + 'pickle', 'rb')
        dict_ = pickle.load(dict_pickle_file)
        dict_pickle_file.close()
        last_docid_pickle_file = open(support_file_name + '_last_pickle', 'rb')
        last_docid = pickle.load(last_docid_pickle_file)
        last_docid_pickle_file.close()
        max_docid_pickle_file = open(support_file_name + '_max_pickle', 'rb')
        max_docid_prev = pickle.load(max_docid_pickle_file)
        max_docid = max_docid_prev
        max_docid_pickle_file.close()
        lists_size_pickle_file = open(support_file_name + '_size_pickle', 'rb')
        lists_size = pickle.load(lists_size_pickle_file)
        lists_size_pickle_file.close()
    for docid, doc in enumerate(reader):
        if if_continue:
            docid += max_docid_prev
        max_docid = docid + 1
        url_file.write(doc.url + '\n')
        lists_size += create_dict_part_varbyte(dict_, last_docid,
                                               extract_words(doc.text), docid + 1)
    if not if_continue:
        dict_pickle_file = open(dict_file_name + 'pickle', 'wb')
        pickle.dump(dict_, dict_pickle_file)
        dict_pickle_file.close()
        last_docid_pickle_file = open(support_file_name + '_last_pickle', 'wb')
        pickle.dump(last_docid, last_docid_pickle_file)
        last_docid_pickle_file.close()
        max_docid_pickle_file = open(support_file_name + '_max_pickle', 'wb')
        pickle.dump(max_docid, max_docid_pickle_file)
        max_docid_pickle_file.close()
        lists_size_pickle_file = open(support_file_name + '_size_pickle', 'wb')
        pickle.dump(lists_size, lists_size_pickle_file)
        lists_size_pickle_file.close()
        quit()
    write_stuff(dict_file_name, support_file_name, info_file_name,
                dict_, lists_size, max_docid, encoding_varbyte_code)
def handle_doc(self, doc, doc_id):
    self.links.append(doc.url)
    words = set(extract_words(doc.text))
    for word in words:
        word_hash = get_hash(word.encode("UTF-8"))
        last_id, arr = self.index.get(word_hash, (0, list()))
        arr.append(doc_id - last_id)  # store the gap from the previous docid
        self.index[word_hash] = (doc_id, arr)
def query_model(f):
    res = Counter()
    N = 0
    for line in f:
        line = line.decode('utf8')
        words = None
        parts = line.split('\t')
        if len(parts) == 1:
            words = doc2words.extract_words(parts[0])
        elif len(parts) == 2:
            words = doc2words.extract_words(parts[1])
        else:
            raise ValueError
        query = ' '.join(words)
        res[query] += 1
        N += 1
    for k in res.keys():
        res[k] = np.log(float(res[k]) / N)
    return res
def unigram_model(f):
    res = Counter()
    N = 0
    for line in f:
        line = line.decode('utf8')
        words = None
        parts = line.split('\t')
        if len(parts) == 1:
            words = doc2words.extract_words(parts[0])
        elif len(parts) == 2:
            words = doc2words.extract_words(parts[1])
        else:
            raise ValueError
        N += len(words)
        for word in words:
            res[word] += 1
    for k in res.keys():
        res[k] = np.log(float(res[k]) / N)
    return res
def get_data():
    reader = DocumentStreamReader(docreader.parse_command_line().files)
    terms = defaultdict(list)
    ind = 0
    urls = []
    for doc in reader:
        for word in set(doc2words.extract_words(doc.text)):
            terms[word].append(ind)
        ind += 1
        urls.append(doc.url)
    return terms, urls
def create_index(args):
    reader = DocumentStreamReader(args[2:])
    if args[1] == 'varbyte':
        # both branches currently construct Simple9; a varbyte codec would be expected here
        vocabulary = Vocabulary(Simple9)
    elif args[1] == 'simple9':
        vocabulary = Vocabulary(Simple9)
    else:
        raise AssertionError('Expected varbyte|simple9 as a compressor')
    for doc in reader:
        for word in extract_words(doc.text):
            vocabulary.append(word, doc.url)
    dump(args[0], vocabulary)
def bigram_model(f):
    res = {'': Counter()}
    N = 0
    for line in f:
        line = line.decode('utf8')
        words = None
        parts = line.split('\t')
        if len(parts) == 1:
            words = doc2words.extract_words(parts[0])
        elif len(parts) == 2:
            words = doc2words.extract_words(parts[1])
        else:
            raise ValueError
        N += len(words)
        for i in range(len(words) - 1):
            if words[i] not in res:
                res[words[i]] = Counter()
            res[words[i]][words[i + 1]] += 1
        if len(words) > 0:
            res[''][words[0]] += 1
    for k in res.keys():
        for k1 in res[k].keys():
            res[k][k1] = np.log(float(res[k][k1]) / N)
    return res
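# Hypothetical usage sketch (not in the original code): each input line is either a
# bare query or "<something>\t<query>"; this assumes extract_words simply lowercases
# and splits the text into word tokens.
if __name__ == '__main__':
    sample = [u'buy fridge\n'.encode('utf8'), u'42\tbuy fridge cheap\n'.encode('utf8')]
    model = bigram_model(sample)
    print model['buy']['fridge']  # log(2 / 5): "buy" -> "fridge" seen twice, 5 tokens total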
def make_dictionary_urlid():
    id_url = {}
    term_doc = {}
    reader = DocumentStreamReader(parse_command_line().files)
    i = 0
    for doc in reader:
        id_url[str(i)] = doc.url
        for word in extract_words(doc.text):
            if word not in term_doc:
                term_doc[word] = []
                term_doc[word].append(i)
            elif term_doc[word][-1] != i:
                term_doc[word].append(i)
        i += 1
    return term_doc, id_url
def expand_back_index(doc, id):
    global ids
    ids[id] = doc.url
    words = set(extract_words(doc.text))
    for word in words:
        h = mmh3.hash(word.encode('utf-8'))
        if h not in res:
            res[h] = bytearray()
        res[h].extend(struct.pack('I', id))
def estimate_query(query):
    words = doc2words.extract_words(query)
    if len(words) == 0:
        return 0
    w = 0
    # -10000 acts as a large penalty for words and bigrams unseen in the language model
    if words[0].encode('utf8') not in l_model['']:
        w -= 10000
    else:
        w += l_model[''][words[0].encode('utf8')]
    for i in range(1, len(words)):
        if words[i - 1].encode('utf8') not in l_model:
            w -= 10000
        elif words[i].encode('utf8') not in l_model[words[i - 1].encode('utf8')]:
            w -= 10000
        else:
            w += l_model[words[i - 1].encode('utf8')][words[i].encode('utf8')]
    return w
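# Hypothetical usage sketch (not in the original code): l_model is assumed to be a
# module-level dict mapping a utf-8 encoded word to the log-probabilities of the next
# word, with the empty string holding the distribution of query-initial words; the
# numbers below are illustrative, and extract_words is assumed to lowercase and split.
if __name__ == '__main__':
    l_model = {
        '': {'buy': -1.2},
        'buy': {'fridge': -2.3},
    }
    print estimate_query(u'buy fridge')  # expected: -1.2 + (-2.3) = -3.5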
def main(encoding, paths):
    reader = DocumentStreamReader(paths)
    if encoding == 'varbyte':
        encoder = VarbyteEncoder()
    elif encoding == 'simple9':
        encoder = Simple9Encoder()
    else:
        raise Exception("Unsupported encoding!")
    ct = time.clock()
    for doc in reader:
        url = doc.url
        words = set([w.encode('utf-8') for w in extract_words(doc.text)])
        encoder.add_document(url, words)
    encoder.write_to_file("index.txt")
    print "Time for index creation: {}".format(1000 * (time.clock() - ct))
def index_data(reader, archive_type):
    global doc_id, dic
    for doc in reader:
        text = extract_words(doc.text)
        index_doc(doc.url, text)
    code_data(dic, archive_type)
    save_obj([archive_type, dic], 'Data/compressed_dict.pckl')
    save_obj([archive_type, doc_id], 'Data/compressed_id.pckl')
def main(variant):
    with open('variant', 'w') as f:
        f.write(variant)
    encoder = Coder(variant)
    paths = []
    chunk_num = 0
    max_chunk_num = 2
    while True:
        tokens = {}
        i = 1
        if chunk_num == max_chunk_num:
            break
        documents = docreader.DocumentStreamReader(
            docreader.parse_command_line().files)
        for doc in documents:
            if chunk_num == 0:
                paths.append(doc.url)
            words = doc2words.extract_words(doc.text)
            for word in set(words):
                if word in tokens:
                    tokens[word].append(i)
                elif len(word) % max_chunk_num == chunk_num:
                    tokens[word] = array('l', [i])
            i += 1
        for token in tokens:
            tokens[token] = encoder.encode(tokens[token])
        with open('index{}.pkl'.format(chunk_num), 'wb') as f:
            pickle.dump(tokens, f)
        chunk_num += 1
    with open('paths.pkl', 'wb') as f:
        pickle.dump(paths, f)
def make_dict(self, doc_reader):
    for docID, doc in enumerate(doc_reader):
        url = doc.url  # not going to build a DocID <=> doc.url mapping
        body = doc.body  # none of the documents actually has a body
        text = doc.text
        words = doc2words.extract_words(text)
        self.urls[docID] = url
        for word in words:
            termID = self.hash(word)
            if termID in self.dict:
                if docID not in self.dict[termID]:
                    self.dict[termID] = np.concatenate((self.dict[termID], [docID]))
            else:
                self.dict[termID] = np.asarray([docID])
def build_index_files(paths, max_doc_ids_per_index_file=10 ** 6):
    reader = docreader.DocumentStreamReader(paths)
    doc_urls = dict()
    next_file_id = 0
    last_doc_ids = 0
    file_names = []
    index = dict()
    for doc_id, doc in enumerate(reader):
        doc_urls[doc_id] = doc.url
        words = doc2words.extract_words(doc.text)
        for word in words:
            doc_ids = index.get(word, set())
            index[word] = doc_ids
            if doc_id not in doc_ids:
                last_doc_ids += 1
                doc_ids.add(doc_id)
        if last_doc_ids >= max_doc_ids_per_index_file:
            index_file_name = __get_index_file_name(next_file_id)
            file_names.append(index_file_name)
            dump_index_to_file(index, index_file_name)
            next_file_id += 1
            last_doc_ids = 0
            index.clear()
    if index:
        index_file_name = __get_index_file_name(next_file_id)
        file_names.append(index_file_name)
        dump_index_to_file(index, index_file_name)
        next_file_id += 1
        index.clear()
    return doc_urls, file_names
def create_dict_simple9(dict_file_name, support_file_name, info_file_name,
                        url_file_name, reader):
    url_file = open(url_file_name, 'a')
    # hash of term -> uncompressed list of delta-gaps between docids
    dict_ = defaultdict(list)
    # hash of term -> last docid in its list of deltas
    last_docid = defaultdict(int)
    # maximum docid seen so far
    max_docid = 0
    # total size of all posting lists
    lists_size = 0
    for docid, doc in enumerate(reader):
        max_docid = docid + 1
        url_file.write(doc.url + '\n')
        create_dict_part_simple9(dict_, last_docid, extract_words(doc.text), docid + 1)
    for term_hash, list_ in dict_.iteritems():
        dict_[term_hash] = compress_list_simple9(list_)
        lists_size += len(dict_[term_hash])
    write_stuff(dict_file_name, support_file_name, info_file_name,
                dict_, lists_size, max_docid, encoding_simple9_code)
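# compress_list_simple9 is not shown above; the sketch below is an assumed minimal
# implementation of the Simple9 idea it relies on: each 32-bit word carries a 4-bit
# selector and 28 data bits holding as many equally sized deltas as fit.
import struct

SIMPLE9_CONFIGS = [(28, 1), (14, 2), (9, 3), (7, 4), (5, 5), (4, 7), (3, 9), (2, 14), (1, 28)]


def simple9_compress(numbers):
    words = []
    i = 0
    while i < len(numbers):
        for selector, (count, bits) in enumerate(SIMPLE9_CONFIGS):
            chunk = numbers[i:i + count]
            if len(chunk) == count and all(n < (1 << bits) for n in chunk):
                word = selector << 28
                for j, n in enumerate(chunk):
                    word |= n << (j * bits)
                words.append(word)
                i += count
                break
        else:
            raise ValueError('value does not fit into 28 bits')
    return struct.pack('%dI' % len(words), *words)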
import codecs
import sys

import docreader
import pickle
from doc2words import extract_words
from collections import defaultdict


def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


sys.stdout = codecs.getwriter('utf8')(sys.stdout)
sys.stderr = codecs.getwriter('utf8')(sys.stderr)

reader = docreader.DocumentStreamReader(docreader.parse_command_line().files)
encoder_type = docreader.parse_command_line().encoder

fd = open("encoder.txt", "w")
fd.write(encoder_type)
fd.close()

URLs = {}
InvIndex = defaultdict(list)
for idx, doc in enumerate(reader):
    URLs[idx] = doc.url
    Terms = list(sorted(set(extract_words(doc.text))))
    for term in Terms:
        InvIndex[term].append(idx)

save_obj(InvIndex, "index")
save_obj(URLs, "urls")
def code_varbyte(docids):
    res = bytearray()
    for id in docids:
        res += code_to_byte(id)
    res = struct.pack('I', len(res)) + res
    return res


if __name__ == '__main__':
    reader = docreader.DocumentStreamReader(
        docreader.parse_command_line().files)
    mdict = defaultdict(lambda: [])
    urls = []
    for doc in reader:
        urls.append(doc.url)
        for word in extract_words(doc.text):
            mdict[word].append(len(urls))
    for term in mdict.keys():
        docids = mdict[term]
        for i in reversed(range(1, len(docids))):
            docids[i] = docids[i] - docids[i - 1]
        docids = filter(lambda x: x != 0, docids)
        mdict[term] = docids
    for word in mdict.keys():
        mdict[word] = code_varbyte(mdict[word])
    id_url = {}
    term_position = {}
    with open('index', 'wb') as f:
        for term, coded_ids in mdict.iteritems():
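# code_to_byte is not shown in the snippet above; the function below is an assumed
# minimal varbyte sketch of the same idea: 7 payload bits per byte, with the high bit
# set on the terminating byte of each integer.
def code_to_byte(number):
    out = bytearray()
    while number >= 128:
        out.append(number & 127)
        number >>= 7
    out.append(number | 128)
    return bytes(out)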
if encoder_str == 'varbyte':
    encoder = varbyte
elif encoder_str == 'simple9':
    encoder = simple9

term_dictionary = {}
url_list = []
doc_id = 0
for url in reader:
    doc_id += 1
    url_list.append(url.url)
    words = doc2words.extract_words(url.text)
    uniq_words = list(set(words))
    for word in uniq_words:
        hash = abs(mmh3.hash(word.encode("utf-8")))
        if term_dictionary.get(hash):
            term_dictionary[hash].append(doc_id)
        else:
            term_dictionary[hash] = []
            term_dictionary[hash].append(doc_id)

for key in term_dictionary:
    term_dictionary[key] = encoder.compress(term_dictionary[key])
if encoder_arg == 'varbyte':
    encoder = varbyte
else:
    print "Unsupported encoder"
    exit()

dictionary = {}
urls = []

""" Reading dataset file """
counter = 0
for entry in docreader.DocumentStreamReader(archive_args):
    urls.append(entry.url)
    counter += 1
    for word in set(doc2words.extract_words(entry.text)):
        hash = abs(mmh3.hash(word.encode("utf-8").lower()))
        if not dictionary.get(hash):
            dictionary[hash] = []
        dictionary[hash].append(counter)

""" Compressing dictionary """
dictionary = {
    entry: [encoder.encode(id) for id in dictionary[entry]]
    for entry in dictionary
}

""" Storing index in memory """
desc = open("./index_encoder", "w")
desc.write(encoder_arg)
desc.close()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import doc2words
import operator

image = {}
web = {}

with open('image_queries.txt', 'r') as f:
    for line in f:
        words = doc2words.extract_words(line)
        for word in words:
            if word in image:
                image[word] += 1
            else:
                image[word] = 1

with open('web_queries.txt', 'r') as f:
    for line in f:
        words = doc2words.extract_words(line)
        for word in words:
            if word in web:
                web[word] += 1
            else:
                web[word] = 1

proba = {}
for word in image:
    if word in web:
    pickle.dump('varbyte', open('coding_type.txt', 'w'))
elif coding == 'simple9':
    '''
    simple9 is too slow here (at indexing, not at search), so test 25 cannot be
    passed with it; varbyte is used below instead
    coding = simple9
    encoding = simple9_decode
    pickle.dump('simple9', open('coding_type.txt', 'w'))
    #'''
    coding = varbyte
    encoding = varbyte_decode
    pickle.dump('varbyte', open('coding_type.txt', 'w'))
    #'''
else:
    raise NotImplementedError

reader = DocumentStreamReader(files)
d = defaultdict(list)
docid = -1
docids = dict()
for doc in reader:
    docid += 1
    words = extract_words(doc.text)
    for word in words:
        d[word].append(docid)
    docids[docid] = doc.url

Pack(d, docids, coding)
compression_type = argv[1]
assert (compression_type == 'varbyte' or compression_type == 'simple9')
files_gz = argv[2:]
docReader = docreader.DocumentStreamReader(files_gz)

# parse texts and create index
print 'parse text'
index_dict = {}
index_url = []
for idx, doc in enumerate(docReader):
    words = doc2words.extract_words(doc.text)
    index_url.append(doc.url)
    for word in words:
        if word in index_dict:
            if index_dict[word][-1] != idx:
                index_dict[word].append(idx)
        else:
            index_dict[word] = [idx]

# save to pickle
cPickle.dump(index_dict, open('./pickle/index_dict.p', 'w'))

# write urls
path = './index/'
if not os.path.exists(path):
    os.makedirs(path)

index, urls = {}, []
reader = DocumentStreamReader(files)
n_files, batch_size = 0, 5e4
need_to_dump = False
for doc_i, doc in enumerate(reader):
    need_to_dump = True
    urls.append(doc.url + '\n')
    terms = set(extract_words(doc.text))
    for term in terms:
        try:
            index[hash(term)].append(doc_i)
        except KeyError:
            # 0 is a fake document; it lets the index be split across multiple files
            index[hash(term)] = [0, doc_i]
    if (doc_i + 1) % batch_size == 0:
        dump_index_part(path + 'part_{0:03d}'.format(n_files), index, encoding)
        for key in index.keys():
            # keep only the last docid so the next document's delta is encoded correctly
            index[key] = [index[key][-1]]
        n_files += 1
        need_to_dump = False
def __init__(self, query):
    self.query = query
    self.words = doc2words.extract_words(query)
#!/usr/bin/env python
from docindex import Docindex
from doc2words import extract_words

doc_texts = [
    {'url': '/get-set-update', 'text': 'update get set'},
    {'url': '/get-set', 'text': 'get set'},
    {'url': '/set', 'text': 'set'},
]

if __name__ == '__main__':
    di = Docindex()
    for doc in doc_texts:
        print "%s\t%d bytes" % (doc['url'], len(doc['text']))
        words = extract_words(doc['text'])
        di.add_doc(doc['url'], words)
    di.to_file('index.pickle')