def BadAnchor(anchor, url=None, tl=titlelength, debug=False):
    """Heuristic filter: True when an anchor text is unusable as a quicklink
    title (URL-like, numeric, spammy, malformed, or too long)."""
    anchor = anchor.strip('\r\t\n ')
    anchor_alnum = re.sub('[^a-zA-Z0-9]', '', anchor)
    # every whitespace-delimited token is a known spam word
    spamwords = bool(
        set([t for t in anchor.lower().split(' ') if t]).difference(badwordset) == set())
    # anchor is just a restatement of the target url
    repeaturl = bool(normalize(anchor).lower() == normalize(url).lower()) if url else False
    urlchars = '[a-zA-Z0-9\-_\?\=\@]'
    # anchor contains a host.tld/path-looking substring
    isurl = bool(
        not re.findall('%s+\.%s+/%s+' % (urlchars, urlchars, urlchars), anchor) == [])
    # pure number, unless it looks like a 4-digit year (19xx/20xx)
    number = bool(anchor_alnum.isdigit() and
                  not (anchor_alnum[:2] in ['19', '20'] and len(anchor_alnum) == 4))
    containsLongNumbers = not re.findall('[0-9]{5,6}', anchor_alnum) == []  # at least 5 digits
    # starts with a non-alphanumeric, or ends with one other than . ! ?
    badchar = not (re.findall('((?:^[^a-zA-Z0-9]|[^a-zA-Z0-9\.\!\?]$))', anchor) == [])
    ret = ((badchar or isurl or number or containsLongNumbers or spamwords or repeaturl
            or len(anchor) > tl or not anchor)
           or re.findall('\?[a-zA-Z0-9]', anchor)                          # query-string fragment
           or not re.findall('[a-zA-Z0-9]', anchor)                        # no alphanumerics at all
           or re.findall('[a-zA-Z0-9]+/[a-zA-Z0-9]+\.[a-z]+', anchor)      # path/file.ext
           or re.findall('(?i) (the|a)$', anchor))                         # dangling article
    if debug:
        logging.warning('\t'.join([
            'url', 'anchor', 'badchar', 'isurl', 'number', 'containsLongNumbers',
            'spamwords', 'repeaturl', 'anclength', 'anchor', 'qmark string',
            'no alphanum', 'url string', 'end with articles']))
        logging.warning('\t'.join(map(str, [
            url, anchor, badchar, isurl, number, containsLongNumbers, spamwords,
            repeaturl, len(anchor), anchor,
            re.findall('\?[a-zA-Z0-9]', anchor),
            not re.findall('[a-zA-Z0-9]', anchor),
            re.findall('[a-zA-Z0-9]+/[a-zA-Z0-9]+\.[a-z]+', anchor),
            re.findall('(?i) (the|a)$', anchor)])))
    return ret
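# Hedged sketch (illustration only, not pipeline code): exercises the
# URL-in-anchor regex above on hypothetical anchors; only the second one
# contains a host.tld/path substring and should match.
def _demo_isurl_regex():
    import re
    urlchars = '[a-zA-Z0-9\-_\?\=\@]'
    pat = '%s+\.%s+/%s+' % (urlchars, urlchars, urlchars)
    return [(a, bool(re.findall(pat, a)))
            for a in ['Contact Us', 'visit example.com/page', 'Annual Report 2014']]
    # -> [('Contact Us', False), ('visit example.com/page', True), ('Annual Report 2014', False)]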
def IsSrCandidate(sr, url):
    """True when sr looks like a site root for url.

    examples:
      1. url = maps.google.com, sr = www.google.com
      2. url = maps.google.com/help/en, sr = maps.google.com/help
      3. url = yahoo.com, sr = www.yahoo.com
      4. url = news.yahoo.com, sr = finance.yahoo.com
    """
    try:
        nsr = normalize(sr)
        nurl = normalize(url)
        if nsr == nurl:
            return False
        elif nurl[0:len(nsr)] == nsr:
            # nsr is a prefix of nurl (path containment)
            return True
        elif not ('/' in nsr or '/' in nurl) and nurl.endswith(nsr):
            # host-only case: nurl is a subdomain of nsr
            return True
        else:
            return False
    except:
        return False
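# Hedged sketch of the two acceptance rules above, using a simplified local
# stand-in for genLibs.normalize (assumed here to strip the scheme and a
# leading 'www.'); hypothetical inputs.
def _demo_sr_rules():
    import re
    def norm(u):  # stand-in, not the real normalize
        return re.sub('^www\.', '', re.sub('^https?://', '', u.lower())).strip('/')
    # path case: normalized sr is a prefix of the normalized url
    path_case = norm('http://maps.google.com/help/en').startswith(norm('maps.google.com/help'))
    # host case: neither side has a path and the url host ends with the sr host
    host_case = norm('http://news.yahoo.com').endswith(norm('yahoo.com'))
    return path_case, host_case  # -> (True, True)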
def RemoveCompanyNames(domain, pagetitles):
    """Strip tokens derived from the domain name out of each page title."""
    companyNames = [t for t in re.split('[/.]', normalize(domain)) if t.isalnum()]
    ret = []
    for pt in pagetitles:
        try:
            ret.append(re.sub('\s+', ' ',
                              MatchWords('((' + '|'.join(companyNames) + '))').sub(' ', pt)))
        except:
            raise ValueError(str([domain, pt]))
    return ret
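# Hedged sketch: MatchWords (defined elsewhere) is assumed to compile a
# word-boundary pattern around its argument; the stand-in below shows the
# intended effect on a hypothetical domain/title pair.
def _demo_remove_company_names():
    import re
    def match_words(pat):  # stand-in for MatchWords, an assumption
        return re.compile(r'\b%s\b' % pat, re.I)
    names = [t for t in re.split('[/.]', 'www.acme.com') if t.isalnum()]
    cleaned = match_words('((' + '|'.join(names) + '))').sub(' ', 'Acme Widgets | About Acme')
    return re.sub('\s+', ' ', cleaned).strip()  # -> 'Widgets | About'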
def findroot(url):
    """Walk up the host, stripping the leftmost label, until a known root
    from rdict is hit; None when nothing matches."""
    try:
        root = normalize(url).split('/')[0]
        while root.count('.') > 0:
            if root in rdict:
                return root
            root = re.sub('^.*?\.', '', root)
        return None
    except:
        return None
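# Worked example of the root walk above on a hypothetical root set: the
# leftmost host label is stripped until a known root is found.
def _demo_findroot_walk():
    import re
    roots = set(['yahoo.com', 'co.monterey.ca.us'])  # hypothetical rdict keys
    host = 'news.finance.yahoo.com'
    while host.count('.') > 0:
        if host in roots:
            return host  # -> 'yahoo.com'
        host = re.sub('^.*?\.', '', host)
    return None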
word_delimiter = ['\+', '-', '_']
old_domain = ""
total = 0
for line in sys.stdin:
    line = line.strip()
    domain = line.split('\t')[0]
    if domain != old_domain:
        old_domain = domain
        parent = {}
        total = 0
    ql = line.split('\t')[1]
    if total == MAX_Domain_Num:
        continue
    if domain not in ql or any(ext in ql for ext in exclude) or \
            normalize(domain) == normalize(ql):
        continue
    #or findroot(domain)!=findroot(ql): continue
    # note: escaped entries like '\?' are two characters, so they never match
    # a single trailing character here
    if ql[-1] in url_delimiter:
        ql = ql.strip(ql[-1])
    fields = re.split('|'.join(url_delimiter), ql)
    litem = fields[-1]
    pstr = ql[0:len(ql) - len(litem)]
    fields2 = ql.split('/')
    fields2 = re.split('|'.join(word_delimiter), fields2[-1])
    if len(fields2) >= MAX_Token_Num:
        continue
    #print "==== ", ql
    # remove long final node
    fields2 = re.split('|'.join(word_delimiter), fields2[len(fields2) - 2])
    if len(fields2) >= MAX_Token_Num:
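# Worked example of the alternation split used above: the delimiter lists hold
# pre-escaped regex fragments, so joining them with '|' yields one split pattern.
def _demo_delimiter_split():
    import re
    url_delimiter = ['/', '\?', '&', '=', '\$', '@', ';', ':', ',', '\+']
    return re.split('|'.join(url_delimiter), 'example.com/shop?item=red+hat')
    # -> ['example.com', 'shop', 'item', 'red', 'hat']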
        root = nurl.split('/')[0]
        while root.count('.') > 0:
            if root in rset:
                rr = rootdict.setdefault(root, {})
                rr[nurl] = url
                break
            root = re.sub('^.*?\.', '', root)
    print json.dumps(rootdict)
elif mode == 'test':
    # sanity test to make sure certain domains are in the file
    rootdictfile = sys.argv[2]
    with open(rootdictfile) as f:
        rootdict = json.loads(f.readline())
    testfile = sys.argv[3]
    testdomains = {}
    with open(testfile) as f:
        for i, line in enumerate(f.readlines()):
            testdomains[normalize(line.strip('\r\t\n '))] = i
    for k, v in rootdict.items():
        for k2, v2 in v.items():
            if k2 in testdomains:
                testdomains.pop(k2)
    # missing domains from rootdict
    for domain, i in sorted(testdomains.items(), key=lambda t: float(t[1]), reverse=False):
        print i, domain
MIN_DIF_PARENT = 100
MAX_Single_Parent = float(0.8 * MAX_Top_Num)  # can't have too many urls under the same parent
parent = {}
url_delimiter = ['/', '\?', '&', '=', '\$', '@', ';', ':', ',', '\+']
word_delimiter = ['\+', '-', '_']
old_domain = ""
total = 0
questioncount = 0
for line in sys.stdin:
    line = line.strip('\r\n ')
    if '\t' not in line:
        continue
    domain, ql = line.split('\t')
    if not domain or not ql:
        continue
    ndom = normalize(domain)
    if not ndom:
        continue
    if domain != old_domain:
        maxvalue = 0
        maxkey = ""
        if len(parent):
            maxkey = max(parent, key=lambda a: parent.get(a))
            maxvalue = parent[maxkey]
        # print parent
        # print old_domain, maxvalue
        # print old_domain, len(parent)
        if (maxvalue >= MAX_Single_Parent and len(parent) < MIN_DIF_PARENT) \
                or questioncount >= MIN_QUESTION_COUNT_TOP_6:
            try:
                sys.stdout.write('%s\n' % (old_domain))
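# Hedged sketch of the dominant-parent test above on hypothetical counts, with
# MAX_Top_Num = 20 assumed (so MAX_Single_Parent = 16.0): one parent holding 17
# of the domain's urls, with far fewer than MIN_DIF_PARENT distinct parents,
# trips the filter.
def _demo_dominant_parent():
    parent = {'example.com/archive': 17, 'example.com/news': 2, 'example.com': 1}
    maxkey = max(parent, key=lambda a: parent.get(a))  # 'example.com/archive'
    return parent[maxkey] >= 16.0 and len(parent) < 100  # -> True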
def GetFeatures(clicks, domain, url, bt, title, intanc, extanc, newtitle,
                newintanc, newextanc):
    # clicks, domain, ql, bingtitle, pagetitle, IntAnc, ExtAnc, newpagetitle, NewIntAnc, NewExtAnc
    urlroot = re.sub('^https?://', '', url.lower()).split('/')[0]
    urlrootset = set(urlroot.split('.'))
    urldomsuffix = re.sub('^www[0-9]?\.', '',
                          re.sub('^https?://', '', domain.lower()).split('/')[0])
    urldomsuffixset = set(urldomsuffix.split('.'))
    subdomain = urlroot[:-(len(urldomsuffix) + 1)] if urldomsuffix in urlroot else ''
    #rawIntAnc = [t for t in intanc if not BadAnchor(t[0])]
    rawIntAncNew = [t for t in newintanc if not BadAnchor(t[0])]
    #rawExtAnc = [t for t in extanc if not BadAnchor(t[0])]
    rawExtAncNew = [t for t in newextanc if not BadAnchor(t[0])]
    title2 = re.sub('\s+', ' ', re.sub('<.*?>', '', newtitle.lower()))
    titlewords = re.split('[^a-zA-Z0-9]+', title2)
    titleacronym = ''.join(t[0] for t in titlewords if len(t) > 0)
    if not titleacronym:
        titleacronym = None
    nurl = normalize(url)
    urltip = re.sub('\..*$', '', nurl.split('/')[-1]).lower() if '/' in nurl \
        else nurl.split('.')[0]
    urlset = set(map(RemovePlural, re.split('(?:%[0-9][0-9]|[^a-zA-Z0-9]+)', urltip)))
    titleset = set(map(RemovePlural, titlewords))
    topanchorwords = None if not rawIntAncNew else re.split(
        '[^a-zA-Z0-9]+', rawIntAncNew[0][0].lower())
    topanchoracronym = None if not (rawIntAncNew and topanchorwords) else ''.join(
        t[0] for t in topanchorwords if len(t) > 0)
    if not topanchoracronym:
        topanchoracronym = None
    IntAnc = [set(map(RemovePlural, re.split('[^a-zA-Z0-9]+', s[0].lower())))
              for s in rawIntAncNew]
    ExtAnc = [set(map(RemovePlural, re.split('[^a-zA-Z0-9]+', s[0].lower())))
              for s in rawExtAncNew]
    if bt is not None:  # signals train or test mode
        bingtitle = set(map(RemovePlural,
                            re.split('[^a-zA-Z0-9]+', re.sub('<.*?>', '', bt.lower()))))
        intitle = int(bingtitle.issubset(titleset.union(urldomsuffixset)))
        inurl = int(bingtitle.issubset(urlset.union(urldomsuffixset)))
        # stop words take care of "Terms & Conditions" versus "Terms and Conditions",
        # however this may be dangerous
        intvec = [int(bingtitle.issubset(t.union(urldomsuffixset).union(stopwords)))
                  for t in IntAnc]
        extvec = [int(bingtitle.issubset(t.union(urldomsuffixset).union(stopwords)))
                  for t in ExtAnc]
        # target
        if intitle:
            if intvec and intvec[0]:
                target = 0
            else:
                target = 1
        elif intvec and intvec[0]:
            target = -1
        else:
            target = -2
    else:  # test mode
        target = -3
    # features
    titlelength = len(title2)
    titlewordcnt = len(titleset)
    topanchorwordcnt = -1 if not IntAnc else len(IntAnc[0])
    topanchorlength = -1 if not rawIntAncNew else len(rawIntAncNew[0][0])
    totalwt = sum(s[1] for s in rawIntAncNew)
    topanchorweight = -1 if not rawIntAncNew else rawIntAncNew[0][1]
    topanchorwtratio = -1 if not rawIntAncNew else topanchorweight * 1.0 / totalwt
    top2ndwtratio = -1 if len(rawIntAncNew) < 2 \
        else rawIntAncNew[0][1] * 1.0 / rawIntAncNew[1][1]
    urltitleoverlap = len([t for t in titleset if t in urltip]) * 1.0 / len(titleset)
    urltopanchoroverlap = -1 if not IntAnc \
        else len([t for t in IntAnc[0] if t in urltip]) * 1.0 / len(IntAnc[0])
    topanchortitleoverlap = -1 if not IntAnc \
        else len(IntAnc[0].intersection(titleset)) * 1.0 / len(IntAnc[0])
    anchorweightentropy = entropy([t[1] for t in rawIntAncNew])
    titlebars = title2.count('|')
    # feature values are pulled by name: the v*features lists hold the names of
    # the local variables computed above, resolved here via eval
    return '\t'.join(
        map(UtfFix, [clicks, domain, url, bt, title, newtitle,
                     '' if not rawIntAncNew else rawIntAncNew[0][0], target]
            + map(eval, v1features + v2features + v3features))
        + [json.dumps(rawIntAncNew), json.dumps(rawExtAncNew),
           json.dumps(intanc), json.dumps(extanc)])
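# entropy() is defined elsewhere; a standard Shannon entropy over the
# normalized anchor-weight distribution would look like the sketch below
# (an assumption about its behavior, not the pipeline's own definition).
def _entropy_sketch(weights):
    import math
    total = float(sum(weights))
    if total <= 0:
        return 0.0
    ps = [w / total for w in weights if w > 0]
    return -sum(p * math.log(p, 2) for p in ps)
# _entropy_sketch([1, 1, 1, 1]) -> 2.0; a single dominant anchor gives ~0.0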
import sys, re, math, json
sys.path.insert(0, '/'.join(__file__.split('/')[:-1]))
from genLibs import normalize, fixstr, stdgen

# assuming ndom are unique
with open(sys.argv[1]) as f:
    aliaspair = [(normalize(k), set(v)) for k, v in json.loads(f.readline()).items()]
aliasdict = {}
for k, v in aliaspair:
    if not k in aliasdict:
        aliasdict[k] = set()
    aliasdict[k] = aliasdict[k].union(v)

qlsPerDomain = 6 if len(sys.argv) < 3 else int(sys.argv[2])
keepFewQls = False if len(sys.argv) < 4 else eval(str(sys.argv[3]))
pd = None
printed_domain = set()
for line in stdgen('lastdomain\tlastql\tlasttitle\t0.0'):
    tmp = line.strip('\r\t\n ').split('\t')
    domain, ql, title, score = tmp
    if not domain == pd:
        if not pd == None:
            ret = sorted(ret, key=lambda t: float(t[3]), reverse=True)[:qlsPerDomain]
            if keepFewQls or len(ret) == qlsPerDomain:
                aliases = aliasdict.get(ndom, [pd])
                for alias in aliases:
                    if alias not in printed_domain:
                        print '\n'.join('\t'.join(map(fixstr, [alias] + r[1:4])) for r in ret)
                        printed_domain.add(alias)
def process_url(url, donormalize=1):
    if donormalize:
        return normalize(url)
    else:
        return url
def QlPostProcess(ndom, inputbag, strong=True, nitems=10, sortby=3,
                  dedup=True, beautify=True):
    # default sortby is ctr
    # inputbag has the same format as the output schema
    # processes:
    #   1. title dedupping
    #   2. remove long urls
    #   3. remove .xyz where xyz not in ognc|aspx|html|htm|jsp|gsp|tmpl
    #   4. only show one mail, login, locator, menu
    ret = []
    strong = eval(str(strong))
    if not inputbag:
        return ret
    inputbag = sorted(inputbag, key=lambda t: float(t[sortby]), reverse=True)
    titleset = set()
    keydict = {}
    parentdict = {}
    urlset = set()
    #mail, login, locator, menu = False, False, False, False
    domain = inputbag[0][0].lower()
    domain2 = re.sub('[^a-zA-Z0-9]', '', domain)
    #ndom = normalize(domain)
    ndomroot = ndom.split('/')[0]
    backups = []
    for i, rec in enumerate(inputbag):
        if not rec[2] and dedup:
            continue
        if beautify:
            rec[2] = TitleBeautify(rec[2], ndomroot)
        if not dedup:
            ret.append(rec)
            continue
        title = rec[2].lower().strip('\t\r\n ')
        if BadAnchor(title, rec[1], debug=True):
            continue
        # dedup key: strip <b> tags and all non-alphanumerics
        tmptitle = ''.join([t for t in re.split('[^a-zA-Z0-9]+', re.sub('</?b>', '', title)) if t])
        if tmptitle in titleset:
            continue
        titletokens = re.split('\s+', title)
        if max(int(t in titletokens) for t in
               ['invalid', 'click', 'next', 'back', 'index', 'default', 'page']) == 1:
            continue
        cont = False
        for suffix in ['htm', 'jpg', 'jpeg', 'pdf', 'xml', 'doc', 'mp3'] + tlds:
            if '.' + suffix in title:
                backups.append(rec)
                titleset.add(tmptitle)
                titleset.add(tmptitle.strip('s'))
                cont = True
        if cont:
            continue
        if title.startswith('?'):
            continue
        if title.isalnum() and not title.isalpha():
            backups.append(rec)
            titleset.add(tmptitle)
            titleset.add(tmptitle.strip('s'))
            continue
        if strong:
            title = ''.join([t for t in re.split('[^a-zA-Z0-9]+', re.sub('</?b>', '', title)) if t])
        #if max([int(title in t or t in title) for t in titleset] + [0]) == 1: continue
        #if max([int(title == t) for t in titleset] + [0]) == 1: continue
        ql = rec[1].lower()
        nql = normalize(ql)
        if nql in urlset:
            continue
        if '/' in nql:
            qltip = nql.split('/')[-1]
            #if not '?' in qltip and '.' in qltip and not qltip.split('.')[-1] in goodsuffixes: continue
        if hasManyTokenNode(ql):
            backups.append(rec)
            titleset.add(tmptitle)
            titleset.add(tmptitle.strip('s'))
            continue
        title2 = re.sub('[^a-zA-Z0-9]', '', title)
        if len(title2) < 2:
            backups.append(rec)
            titleset.add(tmptitle)
            titleset.add(tmptitle.strip('s'))
            continue
        cont = False
        # keep at most one ql per special category, unless the keyword is part
        # of the domain name itself
        for k in ['mail', '(login|logon|signin|signon)', 'signup', 'menu', 'about',
                  'contact', '(map|direction|locator|location)']:
            if not re.findall(k, domain2) and re.findall(k, title2):
                if k in keydict:
                    cont = True
                keydict[k] = nql
        if cont:
            continue
        #if re.findall('(login|logon|signin|signon)', title2): rec[2] = 'Log In'
        # allow at most 3 qls under the same parent path (except the domain root)
        parent = re.sub('/[^\/]*$', '', nql)
        if parentdict.setdefault(parent, 0) > 2 and not parent == ndom:
            backups.append(rec)
            titleset.add(tmptitle)
            titleset.add(tmptitle.strip('s'))
            continue
        parentdict[parent] += 1
        ret.append(rec)
        if not nitems == -1 and len(ret) >= nitems:
            break
        titleset.add(tmptitle)
        titleset.add(tmptitle.strip('s'))
        urlset.add(nql)
    if nitems == -1:
        return ret
    return ret + backups[:nitems - len(ret)]
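# Sketch of the dedup key used above: titles are compared after dropping <b>
# tags and all non-alphanumerics, so markup and punctuation variants collapse
# to the same key (hypothetical inputs).
def _demo_dedup_key(title):
    import re
    return ''.join(t for t in re.split('[^a-zA-Z0-9]+', re.sub('</?b>', '', title.lower())) if t)
# _demo_dedup_key('Contact <b>Us</b>') == _demo_dedup_key('contact-us') == 'contactus'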
scoreidx = int(sys.argv[4])
if len(sys.argv) > 5:  # for debugging purposes
    mode = sys.argv[5]
swapDict = {scoreidx: 3, 3: scoreidx}
revSwapDict = reverseDict(swapDict)
if mode == 'debug':
    pd = None
    for line in stdgen('None2\tNone2\tNone2\t0.0'):  # domain ql title score
        tmp = line.strip('\n').split('\t')
        domain, ql, title = tmp[:3]
        score = float(tmp[3])
        ndom = normalize(domain)
        if not pd == ndom:
            if not pd == None:
                print '\n'.join('\t'.join(map(str, t)) for t in QlPostProcess(
                    ndom, inputbag, strong=True, nitems=10, sortby=3,
                    dedup=True, beautify=True))
            inputbag = []
            pd = ndom
        inputbag.append([domain, ql, title, score])
else:
    pd = None
    for line in stdgen(('None\t' * 11)[:-1]):
        tmp = line.strip('\n').split('\t')
        domain = tmp[0]
        if domain == 'unknown':
            continue
        if not pd == domain:
import sys, re, math
import simplejson as json
sys.path.insert(0, '/'.join(__file__.split('/')[:-1]))
from genLibs import normalize

rootfile = sys.argv[1]
# input must be a unique-root file, so news.yahoo.com and yahoo.com cannot both be in it
with open(rootfile) as f:
    rset = set(json.loads(f.readline()))
ret = {}
for line in sys.stdin:
    url = line.strip('\r\t\n ')
    root = normalize(url).split('/')[0]
    while root.count('.') > 0:
        if root in rset:
            if not root in ret:
                ret[root] = []
            ret[root].append(url)
            break
        root = re.sub('^.*?\.', '', root)
print json.dumps(ret)
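# Hedged end-to-end sketch of the grouping above on in-memory data: the root
# set and urls are hypothetical, and the scheme is stripped by hand in place
# of normalize.
def _demo_group_by_root():
    import re
    rset = set(['yahoo.com'])
    ret = {}
    for url in ['http://news.yahoo.com/world', 'http://finance.yahoo.com/q']:
        root = re.sub('^https?://', '', url).split('/')[0]
        while root.count('.') > 0:
            if root in rset:
                ret.setdefault(root, []).append(url)
                break
            root = re.sub('^.*?\.', '', root)
    return ret  # -> {'yahoo.com': [both urls]}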
""" http://000sweb.co.monterey.ca.us/cob/ http://000sweb.co.monterey.ca.us/cob/supervisor.htm 1.0 http://000sweb.co.monterey.ca.us/cob/ http://000sweb.co.monterey.ca.us/cob/minutes/2001/010403M.htm 1.0 http://002salvage.com/ http://www.002salvage.com/product/default.asp 1.0 http://002salvage.com/ http://www.002salvage.com/about/news.asp 1.0 http://005.housedems.com/ http://005.housedems.com/contact-me 2.0 http://005.housedems.com/ http://005.housedems.com/biography 1.0 """ if __name__ == "__main__": pd = None top10kfile = 'seed_urls.top10k' if len(sys.argv) < 2 else sys.argv[1] top10kset= set() with open(top10kfile) as f: for line in f.readlines(): top10kset.add(normalize(line.strip('\r\t\n '))) mode = 'Missing' if len(sys.argv) < 3 else sys.argv[2] if mode == 'Found': ret = set() doNorm = True if len(sys.argv) < 4 else eval(str(sys.argv[3])) for line in sys.stdin: domain = line.strip('\r\t\n ').split('\t')[0] if not domain == pd: pd = domain ndom = normalize(domain) if doNorm else domain if ndom in top10kset: if mode == 'Found': ret.add(ndom) elif mode == 'Missing': top10kset.remove(ndom) if mode == 'Found': print '\n'.join(ret) elif mode == 'Missing': print '\n'.join(top10kset)
import sys, re, math, json
sys.path.insert(0, '/'.join(__file__.split('/')[:-1]))
from genLibs import normalize

# alias.dict.onejson
# {"http://www.state.in.us/": ["http://www.state.in.us/"], "http://www.hennepin.us/": ["http://www.hennepin.us/"],
topk = 6
with open(sys.argv[1]) as f:
    aliaspair = [(normalize(k), set(v)) for k, v in json.loads(f.readline()).items()]
aliasdict = {}
for k, v in aliaspair:
    if not k in aliasdict:
        aliasdict[k] = set()
    aliasdict[k] = aliasdict[k].union(v)

def fixstr(s):
    try:
        return str(s)
    except:
        return s.decode('utf-8', 'ignore')

ret = {}
for line in sys.stdin:
    domain, ql, title, score = line.strip('\r\t\n ').split('\t')
    ndom = normalize(domain)
    if not ndom in ret:
        ret[ndom] = {}
    if not domain in ret[ndom]:
        ret[ndom][domain] = []
    ret[ndom][domain].append((domain, ql, title, score))
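# Hedged sketch of the alias merge above: raw keys that normalize to the same
# ndom pool their alias sets (hypothetical data).
def _demo_alias_merge():
    pairs = [('acme.com', set(['http://acme.com/'])),
             ('acme.com', set(['http://www.acme.com/']))]
    merged = {}
    for k, v in pairs:
        if not k in merged:
            merged[k] = set()
        merged[k] = merged[k].union(v)
    return merged  # -> {'acme.com': set with both raw aliases}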
import re, math, sys
sys.path.insert(0, '/'.join(__file__.split('/')[:-1]))
from genLibs import normalize
import xml.etree.ElementTree as et
from xml.sax.saxutils import escape

for line in sys.stdin:
    dom, outlinks = line.strip('\n').split('\t')
    ndom = normalize(dom)
    #try:
    #print escape(outlinks)
    #root = et.fromstring('<root>' + outlinks + '</root>')
    for link in re.findall('<L>(.*?)</L>', outlinks):
        #for link in root.iter('L'):
        # <L>13da8a85e450 84d103dc8a0a9980 http://www.gobluehose.com/ en canonical 12 1419811200</L>
        #tmp = link.text.split(' ')
        tmp = link.split(' ')
        ol = tmp[2]
        nol = normalize(ol)
        anchor = ' '.join(tmp[4:-2])
        print '\t'.join([dom, ndom, ol, nol, anchor])
    #except:
    #    continue
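# Worked example of the <L> record split, using the sample record from the
# comment above: field 2 is the outlink url and fields 4..-3 form the anchor.
def _demo_outlink_fields():
    rec = '13da8a85e450 84d103dc8a0a9980 http://www.gobluehose.com/ en canonical 12 1419811200'
    tmp = rec.split(' ')
    return tmp[2], ' '.join(tmp[4:-2])  # -> ('http://www.gobluehose.com/', 'canonical')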
]

if __name__ == '__main__':
    pd = None
    mode = 'ql'
    if len(sys.argv) > 1:
        mode = sys.argv[1]
    nQlAncs = 3 if len(sys.argv) <= 2 else int(sys.argv[2])
    if mode == 'domain':
        for line in stdgen('\t'.join(['None'] * 7)):
            domain, url, redit, title, wmdata, dom_title, dom_wmdata = line.strip(
                '\n').split('\t')[:7]
            if not domain == pd:
                if not pd == None:
                    print '%s\t%s' % (normalize(pd), json.dumps(ret))
                ret = {}
                pd = domain
                domaintokens = set(
                    tokenize(dom_title) +
                    [t for x in parseIntAnc(dom_wmdata, 1) for t in tokenize(x)])
            for s in tokenize(title) + [
                    t for x in parseIntAnc(wmdata, nQlAncs) for t in tokenize(x)]:
                if not s in domaintokens:
                    ret[s] = ret.get(s, 0) + 1
    elif mode == 'ql':
import sys, re, math
sys.path.insert(0, '/'.join(__file__.split('/')[:-1]))
from genLibs import normalize, stdgen

domainfile = sys.argv[1]
nvcut = 0 if len(sys.argv) < 3 else int(sys.argv[2])
domdict = {}
with open(domainfile) as f:
    for line in f.readlines():
        tmp = line.strip('\r\t\n ').split('\t')
        tmp[0] = normalize(tmp[0])
        domdict[tmp[0]] = tmp
pd = None
for line in stdgen('lastdomain\tlastql\tlasttitle\t0.0'):
    tmp = line.strip('\n')
    domain = tmp.split('\t')[0]
    if not domain == pd:
        if not pd == None:
            tmp0 = domdict.get(ndom, None)
            if tmp0 and float(tmp0[2]) >= nvcut:
                assert len(ret) == 6
                print '\n'.join(ret)
        pd = domain
        ndom = normalize(pd)
        ret = []
    ret.append(tmp)
def UrlTip(url):
    """First dot-delimited token of the last path node (or of the host when
    the normalized url has no path)."""
    nurl = normalize(url)
    if not '/' in nurl:
        return nurl.split('.')[0]
    else:
        return nurl.split('/')[-1].split('.')[0]
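# Quick sketch of the tip extraction on hypothetical already-normalized urls
# (bypassing normalize itself):
def _demo_urltip():
    with_path = 'example.com/docs/intro.html'.split('/')[-1].split('.')[0]  # 'intro'
    host_only = 'news.example.com'.split('.')[0]                            # 'news'
    return with_path, host_only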
import sys
from GenTrainToken import BadAnchor
from genLibs import normalize
try:
    import simplejson as json
except ImportError:
    import json
from DomainTokens import ud, parseIntAnc, stdgen, stopwords, tokenize

# get rid of non-ascii
# input = hcat pipeline.1/QuicklinkTitles/combined.redir.title.wmdata/par*
if __name__ == '__main__':
    nQlAncs = 3
    if len(sys.argv) > 1:
        nQlAncs = int(sys.argv[1])
    pd = None
    for line in sys.stdin:
        domain, url, redit, title, wmdata, dom_title, dom_wmdata = line.strip(
            '\n').split('\t')[:7]
        if not domain == pd:
            ndom = normalize(domain)
            pd = domain
            domaintokens = set(
                tokenize(dom_title) +
                [t for x in parseIntAnc(dom_wmdata, 1) for t in tokenize(x)])
        ret = {}
        for s in tokenize(title) + [
                t for x in parseIntAnc(wmdata, nQlAncs) for t in tokenize(x)]:
            if not s in domaintokens:
                ret[s] = ret.get(s, 0) + 1
        print '\t'.join([ndom, url, json.dumps(ret)])
    'clicks', 'domain', 'ql', 'bingtitle', 'pagetitle', 'intanc', 'extanc',
    'newpagetitle', 'newintancs', 'newextancs'
])
pts = None
for line in stdgen('domain\tql\tredt\tpagetitle\twmdata\tbingtitle'):
    line = line.strip('\n')
    if line.count('\t') > 5:
        line = line.strip('\t')
    tmp = line.split('\t')
    domain, ql, redt, pagetitle, wmdata = tmp[:5]
    if len(tmp) > 5:
        bingtitle = tmp[5]
    else:
        bingtitle = None
    nurl = normalize(ql)
    clicks = str(clickdict.get(nurl, 0.0))
    bingtitle = ud(bingtitle)
    pagetitle = ud(pagetitle)
    if not pd == domain:
        if not pd == None:
            ts = float(time.time())
            if not pts == None:
                deltaT = ts - pts
                if deltaT > 1000:
                    logging.warning('long duration domain: %s; duration = %f'
                                    % (pd, deltaT))
            pts = ts
            newpagetitles = RemoveRepeatedPhrase(pagetitles)
if len(sys.argv) > 6:
    rankbyctr = eval(str(sys.argv[6]))
rankbyscore = True
if len(sys.argv) > 7:
    rankbyscore = eval(str(sys.argv[7]))
nitems = 10
if len(sys.argv) > 8:
    nitems = eval(str(sys.argv[8]))
bl = set()
if not blf == 'None':
    with open(blf) as f:
        for line in f.readlines():
            tmp = line.strip('\r\t\n ').split('\t')
            ndom = normalize(tmp[0])
            nql = normalize(tmp[1])
            bl.add((ndom, nql))
titlebank = {}
if not titlefile == 'None':
    with open(titlefile) as f:
        titlebank = dict(((ndom, nql), v[-1][0])
                         for ndom, v0 in json.loads(f.readline()).items()
                         for nql, v in v0.items())
wl = {}
for wlf in wlfs.split(','):
    with open(wlf) as f:
        for line in f.readlines():
            tmp = line.strip('\r\t\n ').split('\t')[:4]
if mode == 'JoinSr':
    for line in sys.stdin:
        (domain, nurl, selfclicks, totalclicks, ourl, c1, c2, n1, n2, l0, l1, l2,
         la, na, selfviews, totalviews, v1, v2, depth, potential, selfctr,
         totalctr, ctr1, ctr2) = line.strip('\r\t\n ').split('\t')
        srdict, root = FindSrsFromUrl(nurl)
        #srdict = rdict.get(root, {})
        assert domain == root
        if not srdict:
            continue
        SrCandidates = FilterSr(nurl, srdict)
        for sr in SrCandidates:
            try:
                print '\t'.join([
                    sr, nurl, selfclicks, totalclicks, ourl, c1, c2, n1, n2,
                    l0, l1, l2, la, na, selfviews, totalviews, v1, v2, depth,
                    potential, selfctr, totalctr, ctr1, ctr2])
            except:
                pass
elif mode == 'FindDomain':
    # streaming in WebmapAllUrls-views.pig
    for line in sys.stdin:
        tmp = line.strip('\n').split('\t')
        url = tmp[0]
        nurl = normalize(url)
        srdict, root = FindSrsFromUrl(nurl)
        if not srdict:
            continue
        print '\t'.join(tmp + [root])
else:
    raise ValueError('Unknown mode in JoinSr.py!')
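# Assumed contract of FindSrsFromUrl (defined elsewhere), inferred from its
# use above: it returns (srdict, root), where root is the matched entry from
# the root dictionary and srdict maps candidate site roots under it, e.g.
#   srdict, root = FindSrsFromUrl('news.yahoo.com/world')
#   # hypothetically: root == 'yahoo.com', srdict == {candidate sr -> raw sr url}
# FilterSr is then expected to keep only the srs for which IsSrCandidate holds.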
mode = 'update'
if len(sys.argv) > 3:
    mode = sys.argv[3]
try:
    with open(jsonfile) as f:
        titledict = json.loads(f.readline())
    # keep a timestamped backup before rewriting the file
    with open('backlog/%s.bak.%s' % (jsonfile, str(timestamp)), 'w') as f:
        f.write(json.dumps(titledict))
except:
    titledict = {}
if mode == 'update':
    for line in sys.stdin:
        domain, ql, title = line.strip('\n').split('\t')
        ndom = normalize(domain)
        nql = normalize(ql)
        titledictndom = titledict.setdefault(ndom, {})
        titledictndomnql = titledictndom.setdefault(nql, [])
        titledictndomnql.append((title, label))
    with open(jsonfile, 'w') as f:
        f.write(json.dumps(titledict))
elif mode == 'dedup':
    ret = {}
    for k, v in titledict.items():
        ret[k] = {}
        for k2, v2 in v.items():
            ret[k][k2] = Dedup(v2)
    with open(jsonfile, 'w') as f:
        f.write(json.dumps(ret))