import gzip
import sys

from textblob import TextBlob
from nltk.tokenize import RegexpTokenizer
from pubsublogger import publisher

# clean(), set_listof_pid(), update_listof_pid() and flush_list_of_pid() are
# assumed to be provided by other modules of this package.


def redis_zincr_words(pipe, filename, minlength, maxlength):
    """Create the per-day word sorted sets in redis.

    :param pipe: -- Redis pipeline
    :param filename: -- (str) Absolute path to the file.gz to process.
    :param minlength: -- (int) Minimum length of the words inserted
    :param maxlength: -- (int) Maximum length of the words inserted

    Representation of the set in redis:

    +------------+------------+-----------+
    | Keys       | Members    | Scores    |
    +============+============+===========+
    | 20131001   | word1      | 142       |
    +------------+------------+-----------+
    | ...        | word2      | 120       |
    +------------+------------+-----------+
    | 20131002   | ...        | ...       |
    +------------+------------+-----------+

    This function stores in redis every word whose length is between
    minlength and maxlength. Redis also counts how many times each word
    appears per day: the member's score.

    """
    tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
                                gaps=True, discard_empty=True)

    with gzip.open(filename, 'rb') as F:
        blob = TextBlob(clean(F.read()), tokenizer=tokenizer)

        for word in blob.tokens:
            if minlength <= len(word) <= maxlength:
                # The key is the date taken from the path (e.g. 2013/10/01 -> 20131001).
                pipe.zincrby(filename[-22:-12].replace('/', ''), word, 1)
            elif len(word) > maxlength:
                publisher.info("word bigger than {0} detected at {1}".format(maxlength, filename))
                publisher.info(word)

    pipe.execute()
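# A minimal usage sketch for redis_zincr_words(), kept in a helper that the
# module never calls. The redis connection parameters and the paste path are
# hypothetical. Note that the zincrby() calls in this module follow the
# redis-py 2.x argument order (name, value, amount); redis-py 3.x changed it
# to (name, amount, value).
def _example_redis_zincr_words():
    import redis

    r_serv = redis.StrictRedis(host='localhost', port=6379, db=0)
    pipe = r_serv.pipeline(False)
    # The basename is 11 characters long on purpose, so that
    # filename[-22:-12] lands exactly on the "2013/10/01" part of the path.
    redis_zincr_words(pipe, '/opt/pastes/2013/10/01/paste001.gz', 3, 30)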
def classify_token_paste(r_serv, listname, choicedatastruct, nb, r_set):
    """Tokenize pastes and classify their tokens by word category.

    :param r_serv: -- Redis database connection
    :param listname: -- (str) Path to the file listing the paths of the category files
    :param choicedatastruct: -- (bool) Changes the layout of the data structure (see below)
    :param nb: -- (int) Number of pastes processed by the function
    :param r_set: -- (str) Name of the redis list holding the filenames of the pastes to process

    The redis data structure can be chosen as follows:

    +---------------+------------+-----------+
    | Keys          | Members    | Scores    |
    +===============+============+===========+
    | mails_categ   | filename   | 25000     |
    +---------------+------------+-----------+
    | ...           | filename2  | 2400      |
    +---------------+------------+-----------+
    | web_categ     | ...        | ...       |
    +---------------+------------+-----------+

    Or:

    +--------------+-------------+-----------+
    | Keys         | Members     | Scores    |
    +==============+=============+===========+
    | filename     | mails_categ | 100000    |
    +--------------+-------------+-----------+
    | ...          | web_categ   | 24050     |
    +--------------+-------------+-----------+
    | filename2    | ...         | ...       |
    +--------------+-------------+-----------+

    This function tokenizes on all special characters like: @^\|[{#~}]!:;$^=
    and inserts the data in redis when a token matches one of the keywords of
    a list created beforehand. These keyword lists can contain anything you
    want, but it is better to build them as "categories" of keywords.

    """
    try:
        for n in xrange(0, nb):
            filename = r_serv.lpop(r_set)

            if filename is not None:
                tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
                                            gaps=True, discard_empty=True)

                set_listof_pid(r_serv, filename, sys.argv[0])

                with open(listname, 'rb') as L:
                    # for each "categ" listed in the file
                    for fname in L:
                        # tmp_list contains the keywords of the current categ
                        tmp_list = []

                        # for each keyword of the categ (strip the trailing newline)
                        with open(fname[:-1], 'rb') as LS:
                            for kword in LS:
                                tmp_list.append(kword[:-1])

                        # for each token of the paste
                        with gzip.open(filename, 'rb') as F:
                            blob = TextBlob(clean(F.read()), tokenizer=tokenizer)

                            for word in blob.tokens.lower():
                                if word in tmp_list:
                                    # choosing between the two data structures
                                    if choicedatastruct:
                                        r_serv.zincrby(filename, fname.split('/')[-1][:-1], 1)
                                    else:
                                        r_serv.zincrby(fname.split('/')[-1][:-1], filename, 1)

                update_listof_pid(r_serv)
            else:
                publisher.debug("Empty list")
                #r_serv.save()
                break

    except (KeyboardInterrupt, SystemExit):
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")
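# A minimal usage sketch for classify_token_paste(), again as an uncalled
# helper. Every path and key name below is hypothetical: categ_list.txt is
# assumed to hold one category file path per line, each category file one
# keyword per line, and the 'filelist' redis list the paste filenames pushed
# there by a feeder.
def _example_classify_token_paste():
    import redis

    r_serv = redis.StrictRedis(host='localhost', port=6379, db=0)
    # With choicedatastruct=True the paste filename becomes the key and the
    # categories become the members (the second table in the docstring).
    classify_token_paste(r_serv, '/opt/lists/categ_list.txt', True, 10, 'filelist')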