# NOTE(review): this chunk of the file was flattened onto a single line and,
# as written, is not valid Python. It also starts mid-function: the
# `def finalize():` header is restored here from the duplicate copy of the
# same script that appears later in this file.
def finalize():
    """Flush any buffered metadata entries to a content-addressed output file.

    Uses module globals: `buff_meta` (list of messages exposing
    `SerializeToString()` — presumably protobufs, TODO confirm), `size_meta`
    (running byte counter), `args` (parsed CLI options) and `decoder`
    (provides `encodeEntry`). Produces no return value.
    """
    # BUG FIX: the original declared only `buff_meta` global, so the trailing
    # `size_meta = 0` bound a useless function-local and the module-level
    # counter was never actually reset.
    global buff_meta, size_meta
    if len(buff_meta) > 0:
        print('writing last set of meta')
        # The output file is named after the MD5 digest of the first entry.
        m = hashlib.md5()
        m.update(buff_meta[0].SerializeToString())
        curr_hash = m.hexdigest()
        out_path = os.path.join(args.output, curr_hash)
        # BUG FIX: `with` guarantees the handle is closed even if encoding
        # raises (the original leaked it on error). 'wb' == original 'bw'.
        with open(out_path, 'wb') as writer:
            decoder.encodeEntry(writer.write, buff_meta)
        if args.mzip:
            # Compress at maximum level, then drop the uncompressed original.
            subprocess.call(["7z", 'a', '-mx9', out_path + ".7z", out_path])
            os.remove(out_path)
        buff_meta = []
        size_meta = 0


if __name__ == "__main__":
    args = parser.parse_args()
    print("start")
    production = buffer_utils.extract_files(args.input, read_file, args.match,
                                            True, args.after, args.before,
                                            args.temp)
    # Iterate purely for the consumer's side effects (buffering + flushing);
    # the yielded values themselves are discarded.
    for item in map(consumer_metadata, production):
        pass
    finalize()
# NOTE(review): this chunk was flattened onto a single line and starts
# mid-function — the statements below are the tail of a generator whose `def`
# lies outside this view (`yield` is illegal at module level), so they are
# preserved only as a reference for whoever restores the full function:
#     if open_[item]['count'] > 0:
#         yield (item, open_[item]['date'], -1, id_, doc.title)


def get_items(doc_index, sites, link, tld):
    """Resolve one link tuple to ``(doc_index, site_index, *link[1:5])``.

    `sites` is an OrderedDict used both as a per-site hit counter and — via
    its insertion order — as a stable site -> index mapping.  Returns None
    when the URL's domain cannot be parsed (``ValueError`` from
    ``utils.get_domain``); any other exception propagates to the caller.
    """
    url = link[0]
    try:
        site = utils.get_domain(url, tld)
    except ValueError:
        # Unparseable URL: signal the caller to skip this link.
        return None
    # (Removed the original's `except Exception as inst: raise inst` — a
    # no-op re-raise with identical observable behavior.)
    if site in sites:
        sites[site] += 1
    else:
        sites[site] = 1
    # PERF NOTE: rebuilding the key list makes this O(len(sites)) per call;
    # kept as-is to preserve the exact index semantics.
    return (doc_index, list(sites.keys()).index(site),
            link[1], link[2], link[3], link[4])


if __name__ == '__main__':
    args = parser.parse_args()
    docs = buffer_utils.extract_files(args.input, read_file, args.match,
                                      False, args.after, args.before)
    docs_items = map(doc_extract_links, docs)
    # NOTE(review): `ltd_names` looks like a typo for `tld_names`, but it must
    # match the argparse option declared elsewhere in this file — confirm
    # before renaming.
    tld = utils.get_tld_list(args.ltd_names)
    sites = OrderedDict()
    final_items = []
    # `enumerate` replaces the original hand-maintained doc_index counter.
    for doc_index, items in enumerate(docs_items):
        mapped = (get_items(doc_index, sites, link, tld) for link in items)
        final_items.extend(t for t in mapped if t is not None)
    # BUG FIX: the file handle was not closed if pickling raised; `with`
    # guarantees closure.
    with open(args.output, 'wb') as fi:
        pickle.dump({'data': final_items, 'sites': sites}, fi)
# NOTE(review): flattened single-line chunk; it begins mid-function. The
# statements below are the tail of the metadata consumer (presumably
# `consumer_metadata` — it is the callable driven in `__main__` below) whose
# `def` is outside this view. They reference its locals (`item_`,
# `file_name`, `curr_file_index`), so they are preserved only as a reference
# for whoever restores the full function (indentation is a best guess):
#     size_meta = 0
#     if curr_file_index != file_name:
#         curr_file_index = file_name
#         print("%d elements in buffer" % len(buff_meta))
#     size_meta += len(item_.SerializeToString())
#     buff_meta.append(item_)


def finalize():
    """Flush any buffered metadata entries to a content-addressed output file.

    Uses module globals: `buff_meta` (list of messages exposing
    `SerializeToString()` — presumably protobufs, TODO confirm), `size_meta`
    (running byte counter), `args` (parsed CLI options) and `decoder`
    (provides `encodeEntry`). Produces no return value.
    """
    # BUG FIX: the original declared only `buff_meta` global, so the trailing
    # `size_meta = 0` bound a useless function-local and the module-level
    # counter was never actually reset.
    global buff_meta, size_meta
    if len(buff_meta) > 0:
        print('writing last set of meta')
        # The output file is named after the MD5 digest of the first entry.
        m = hashlib.md5()
        m.update(buff_meta[0].SerializeToString())
        curr_hash = m.hexdigest()
        out_path = os.path.join(args.output, curr_hash)
        # BUG FIX: `with` guarantees the handle is closed even if encoding
        # raises (the original leaked it on error). 'wb' == original 'bw'.
        with open(out_path, 'wb') as writer:
            decoder.encodeEntry(writer.write, buff_meta)
        if args.mzip:
            # Compress at maximum level, then drop the uncompressed original.
            subprocess.call(["7z", 'a', '-mx9', out_path + ".7z", out_path])
            os.remove(out_path)
        buff_meta = []
        size_meta = 0


if __name__ == "__main__":
    args = parser.parse_args()
    print("start")
    production = buffer_utils.extract_files(args.input, read_file, args.match,
                                            True, args.after, args.before,
                                            args.temp)
    # Iterate purely for the consumer's side effects (buffering + flushing);
    # the yielded values themselves are discarded.
    for item in map(consumer_metadata, production):
        pass
    finalize()