def splitWord(line):
    """Tokenize a (vran, title) record, prefixing each token with its vran id.

    :param line: sequence whose [0] is a vran id string and [1] is a raw title
    :return: list of "<vran>-<token>" strings; empty tokens are dropped
    """
    vran = line[0]
    title = line[1]
    # clean_title output is space-delimited; falsy ("") tokens are discarded.
    tokens = ttp.clean_title(title).split(" ")
    return [vran + "-" + tok for tok in tokens if tok]
def parseTitle(line):
    """Keep katakana tokens from a (vran, title) record, tagged with the vran id.

    :param line: sequence whose [0] is a vran id string and [1] is a raw title
    :return: list of "<vran>-<token>" strings for each katakana token
    """
    vran = line[0]
    title = line[1]
    words = ttp.clean_title(title).split(" ")
    # Fix: guard against empty tokens BEFORE calling is_katakana(), matching
    # the sibling parseTitle variants that test `word and is_katakana(word)`;
    # splitting clean_title output on spaces commonly yields "" entries.
    words = [word for word in words if word and is_katakana(word)]
    return [vran + "-" + word for word in words]
def parseTitle(line):
    """Return katakana tokens from a UTF-8 title, each tagged as "<token>|<title>".

    :param line: sequence whose [0] is a vran id and [1] is a UTF-8 byte string
    :return: list of "<token>|<title>" unicode strings
    """
    # vran is read for parity with the other variants but not used below.
    vran = line[0]
    title = unicode(line[1], 'utf-8')
    cleaned = ttp.clean_title(title).split(" ")
    return [tok + "|" + title for tok in cleaned if tok and is_katakana(tok)]
def parseTitle(line):
    """Keep katakana tokens from a (vran, title) record, tagged with the vran id.

    :param line: sequence whose [0] is a vran id string and [1] is a raw title
    :return: list of "<vran>-<token>" strings for each katakana token
    """
    vran = line[0]
    title = line[1]
    words = ttp.clean_title(title).split(" ")
    # Fix: test `word` FIRST so is_katakana() is never invoked on the empty
    # tokens that space-splitting produces; the original order
    # `is_katakana(word) and word` defeats the short-circuit guard and is
    # inconsistent with the other parseTitle variants in this file.
    words = [word for word in words if word and is_katakana(word)]
    return [vran + "-" + word for word in words]
def parseTitle(line):
    """Return (katakana_token, title) pairs from a (vran, utf-8 title) record.

    :param line: sequence whose [0] is a vran id and [1] is a UTF-8 byte string
    :return: list of (token, decoded_title) tuples, one per katakana token
    """
    # vran is read for parity with the other variants but not used below.
    vran = line[0]
    title = unicode(line[1], 'utf-8')
    tokens = ttp.clean_title(title).split(" ")
    return [(tok, title) for tok in tokens if tok and is_katakana(tok)]
def preprocess(title, srm=None):
    """Clean a product title and strip spam tokens from it.

    :param title: raw product title string
    :param srm: object exposing remove_spam2(tokens); when omitted a fresh
        NLPSpamRemover is created. (The old signature used the mutable-default
        pitfall `srm=NLPSpamRemover()`, which built one hidden shared instance
        at import time; callers passing srm explicitly are unaffected.)
    :return: (tokenized_title, clean_tokens) tuple
    """
    if srm is None:
        srm = NLPSpamRemover()
    tokenized_title = ttp.clean_title(title)
    tokens = []
    for t in tokenized_title.split(' '):
        # Strip ASCII spaces and ideographic spaces (U+3000) inside the token.
        adj = t.replace(' ', '').replace(u'\u3000', '')
        # Skip short fragments that are not purely alphanumeric.
        if len(adj) < 4 and not adj.isalnum():
            continue
        # Skip price-like tokens containing U+5186 ('yen' sign).
        if u'\u5186' in t:
            continue
        tokens.append(adj)
    clean_tokens = srm.remove_spam2(tokens)
    return tokenized_title, clean_tokens
def main(spam_remover, logger):
    """Smoke-test a spam remover: dump its spam_map, run one sample title,
    and optionally benchmark a genre given on the command line.

    :param spam_remover: object with a spam_map dict and remove_spam(tokens)
    :param logger: logging.Logger-like object used for all output
    """
    import TitlePreprocessing as ttp
    import time
    # Dump basic diagnostics about the spam map (Python 2: .keys()[0] picks
    # an arbitrary first key).
    logger.debug('#keys:' + str(len(spam_remover.spam_map)))
    logger.debug('key1:' + str(type(spam_remover.spam_map.keys()[0])) + ' ' + str(spam_remover.spam_map.keys()[0]))
    logger.debug('val1:' + str(spam_remover.spam_map[spam_remover.spam_map.keys()[0]]))
    # TODO move these outputs into each spam remover's class and change it according to datatype
    # nlp_spam map is hash->dict and entropy_spam is hash->set
    try:
        # Best-effort: only works when the first value is a dict of iterables
        # (the bare except deliberately swallows the entropy-spam set case).
        for k, vals in spam_remover.spam_map[spam_remover.spam_map.keys()[0]].iteritems():
            logger.debug('elem k: ' + str(type(k)) + ' ' + str(k))
            for v in vals:
                logger.debug('elem v:' + str(type(v)) + ' ' + str(v))
    except:
        pass
    stime = time.time()  # NOTE(review): assigned but never reported here
    # test1: run one hard-coded Japanese product title through the pipeline.
    logger.debug('')
    logger.debug('test1 --------------------------------')
    title = '【エントリーでポイント10倍】セクシーワンピ/ワンピース/レディースファッション/海外人気モデル【10500円以上で送料無料】'
    logger.debug('title : ' + str(type(title)) + ' ' + title)
    tkns = ttp.clean_title(title).split(' ')
    ptitle1 = ' '.join(tkns)
    ret = spam_remover.remove_spam(tkns)
    ptitle2 = ' '.join(ret)
    # Only log when the remover actually changed the title.
    if ptitle1 != ptitle2:
        logger.debug('in :' + ptitle1.encode('utf8'))
        logger.debug('out:' + ptitle2.encode('utf8'))
    #"""
    #"""
    # test2: benchmark against a genre id passed as the first CLI argument.
    if len(sys.argv) > 1:
        logger.debug('')
        logger.debug('test2 --------------------------------')
        # removeSpam_test('507745')
        removeSpam_benchmark(spam_remover, sys.argv[1], logger)
def main(spam_remover, logger):
    """Smoke-test a spam remover: dump its spam_map, run one sample title,
    and optionally benchmark a genre given on the command line.

    :param spam_remover: object with a spam_map dict and remove_spam(tokens)
    :param logger: logging.Logger-like object used for all output
    """
    import TitlePreprocessing as ttp
    import time
    # Dump basic diagnostics about the spam map (Python 2: .keys()[0] picks
    # an arbitrary first key).
    logger.debug('#keys:' + str(len(spam_remover.spam_map)))
    logger.debug('key1:' + str(type(spam_remover.spam_map.keys()[0])) + ' ' + str(spam_remover.spam_map.keys()[0]))
    logger.debug(
        'val1:' + str(spam_remover.spam_map[spam_remover.spam_map.keys()[0]]))
    # TODO move these outputs into each spam remover's class and change it according to datatype
    # nlp_spam map is hash->dict and entropy_spam is hash->set
    try:
        # Best-effort: only works when the first value is a dict of iterables
        # (the bare except deliberately swallows the entropy-spam set case).
        for k, vals in spam_remover.spam_map[spam_remover.spam_map.keys()[0]].iteritems():
            logger.debug('elem k: ' + str(type(k)) + ' ' + str(k))
            for v in vals:
                logger.debug('elem v:' + str(type(v)) + ' ' + str(v))
    except:
        pass
    stime = time.time()  # NOTE(review): assigned but never reported here
    # test1: run one hard-coded Japanese product title through the pipeline.
    logger.debug('')
    logger.debug('test1 --------------------------------')
    title = '【エントリーでポイント10倍】セクシーワンピ/ワンピース/レディースファッション/海外人気モデル【10500円以上で送料無料】'
    logger.debug('title : ' + str(type(title)) + ' ' + title)
    tkns = ttp.clean_title(title).split(' ')
    ptitle1 = ' '.join(tkns)
    ret = spam_remover.remove_spam(tkns)
    ptitle2 = ' '.join(ret)
    # Only log when the remover actually changed the title.
    if ptitle1 != ptitle2:
        logger.debug('in :' + ptitle1.encode('utf8'))
        logger.debug('out:' + ptitle2.encode('utf8'))
    #"""
    #"""
    # test2: benchmark against a genre id passed as the first CLI argument.
    if len(sys.argv) > 1:
        logger.debug('')
        logger.debug('test2 --------------------------------')
        # removeSpam_test('507745')
        removeSpam_benchmark(spam_remover, sys.argv[1], logger)
def removeSpam_benchmark(spam_remover, genreID, logger):
    """Run the spam remover over every product title in one genre.

    Fetches product keys from the "G_<genreID>" Cassandra index row, reads
    titles in batches of 500 from column V1, and passes each tokenized title
    through spam_remover.remove_spam (results are discarded; per-title
    logging is commented out).

    :param spam_remover: object exposing remove_spam(tokens)
    :param genreID: genre identifier appended to the "G_" row key
    :param logger: logging.Logger-like object
    """
    import CassConn.CassConn as CC
    import TitlePreprocessing as ttp
    # NOTE(review): relies on module-level `time`, `traceback` and `sys`
    # imports — confirm they exist at the top of this file.
    stime = time.time()
    cc = CC.CassConn(env='INS')
    batchSize = 500
    rowKey = "G_" + str(genreID)
    # First element of each xget tuple is the product key.
    items = [x[0] for x in cc.cfProductMaster.xget(rowKey)]
    logger.debug(' '.join(['genre ', genreID, ' received ', str(len(items)), ' from G_ index']))
    # Skip genres too large to benchmark in reasonable time.
    if len(items) > 60000:
        print 'skip big genre'
        return
    for indx in range(0, len(items), batchSize):
        pdata = cc.cfProductMaster.multiget(items[indx:indx + batchSize], columns=['V1'])
        for prodKey, pvals in pdata.iteritems():
            try:
                # V1 is a tab-separated record; field 8 is the title.
                parts = pvals['V1'].split('\t')
                title = parts[8]
                # tokenize and remove spam
                tkns = ttp.clean_title(title).split(' ')  # .decode('utf8'))
                ptitle1 = ' '.join(tkns)
                ret = spam_remover.remove_spam(tkns)
                ptitle2 = ' '.join(ret)
                #logger.debug(' '.join([prodKey,'in :',ptitle1.encode('utf8')]))
                #logger.debug(' '.join([prodKey,'out:',ptitle2.encode('utf8')]))
                #logger.debug(' -------------------------- ')
            except Exception, e:
                # Any failure aborts the whole benchmark; the `continue`
                # after sys.exit() is unreachable dead code.
                print traceback.format_exc()
                sys.exit()
                continue
def removeSpam_benchmark(spam_remover, genreID, logger):
    """Run the spam remover over every product title in one genre.

    Fetches product keys from the "G_<genreID>" Cassandra index row, reads
    titles in batches of 500 from column V1, and passes each tokenized title
    through spam_remover.remove_spam (results are discarded; per-title
    logging is commented out).

    :param spam_remover: object exposing remove_spam(tokens)
    :param genreID: genre identifier appended to the "G_" row key
    :param logger: logging.Logger-like object
    """
    import CassConn.CassConn as CC
    import TitlePreprocessing as ttp
    # NOTE(review): relies on module-level `time`, `traceback` and `sys`
    # imports — confirm they exist at the top of this file.
    stime = time.time()
    cc = CC.CassConn(env='INS')
    batchSize = 500
    rowKey = "G_" + str(genreID)
    # First element of each xget tuple is the product key.
    items = [x[0] for x in cc.cfProductMaster.xget(rowKey)]
    logger.debug(
        ' '.join(['genre ', genreID, ' received ', str(len(items)), ' from G_ index']))
    # Skip genres too large to benchmark in reasonable time.
    if len(items) > 60000:
        print 'skip big genre'
        return
    for indx in range(0, len(items), batchSize):
        pdata = cc.cfProductMaster.multiget(
            items[indx:indx + batchSize], columns=['V1'])
        for prodKey, pvals in pdata.iteritems():
            try:
                # V1 is a tab-separated record; field 8 is the title.
                parts = pvals['V1'].split('\t')
                title = parts[8]
                # tokenize and remove spam
                tkns = ttp.clean_title(title).split(' ')  # .decode('utf8'))
                ptitle1 = ' '.join(tkns)
                ret = spam_remover.remove_spam(tkns)
                ptitle2 = ' '.join(ret)
                #logger.debug(' '.join([prodKey,'in :',ptitle1.encode('utf8')]))
                #logger.debug(' '.join([prodKey,'out:',ptitle2.encode('utf8')]))
                #logger.debug(' -------------------------- ')
            except Exception, e:
                # Any failure aborts the whole benchmark; the `continue`
                # after sys.exit() is unreachable dead code.
                print traceback.format_exc()
                sys.exit()
                continue
def containsAlphabets(line):
    """Return the cleaned title when it contains an ASCII letter, else 0.

    NOTE(review): mixed return types (str or int 0) are preserved — callers
    appear to use 0 as a "no match" flag.

    :param line: raw title string
    :return: cleaned title string, or 0 when no [a-zA-Z] character occurs
    """
    title = ttp.clean_title(line)
    if re.search('[a-zA-Z]', title) is None:
        return 0
    return title
def splitWord(title):
    """Tokenize a cleaned title, dropping empty tokens and stray brackets.

    :param title: raw title string
    :return: list of tokens with '', '[' and ']' removed
    """
    # A set gives O(1) membership tests; `word not in` is the idiomatic
    # spelling of the original `not word in`.
    drop = {'', '[', ']'}
    words = ttp.clean_title(title).split(' ')
    return [word for word in words if word not in drop]
def splitWord(title):
    """Split a cleaned title on single spaces, discarding empty tokens.

    :param title: raw title string
    :return: list of non-empty tokens
    """
    # Comprehension with a truthiness guard is equivalent to filter(None, ...)
    # here: both drop only the "" tokens that splitting produces.
    return [tok for tok in ttp.clean_title(title).split(' ') if tok]
def containsAlphabets(line):
    """Return the cleaned title if any ASCII letter occurs in it; 0 otherwise.

    NOTE(review): mixed return types (str or int 0) are preserved — callers
    appear to use 0 as a "no match" flag.

    :param line: raw title string
    :return: cleaned title string, or 0 when no [a-zA-Z] character occurs
    """
    title = ttp.clean_title(line)
    match = re.search('[a-zA-Z]', title)
    return title if match else 0