Example #1
def BadAnchor(anchor, url=None, tl=titlelength, debug=False):
    # titlelength, badwordset and normalize are module-level globals.
    anchor = anchor.strip('\r\t\n ')
    anchor_alnum = re.sub('[^a-zA-Z0-9]', '', anchor)
    # True when every whitespace token of the anchor is a known spam word.
    spamwords = set(t for t in anchor.lower().split(' ') if t).issubset(badwordset)
    # True when the anchor text merely repeats the target url.
    repeaturl = normalize(anchor).lower() == normalize(url).lower() if url else False
    urlchars = r'[a-zA-Z0-9\-_?=@]'
    # Anchor looks like a bare url, e.g. "www.foo.com/bar".
    isurl = bool(re.search(r'%s+\.%s+/%s+' % (urlchars, urlchars, urlchars), anchor))
    # Pure number, unless it looks like a 4-digit year (19xx / 20xx).
    number = (anchor_alnum.isdigit()
              and not (anchor_alnum[:2] in ['19', '20'] and len(anchor_alnum) == 4))
    containsLongNumbers = bool(re.search('[0-9]{5,}', anchor_alnum))  # at least 5 digits
    # Starts with a non-alphanumeric char, or ends with one other than . ! ?
    badchar = bool(re.search(r'^[^a-zA-Z0-9]|[^a-zA-Z0-9.!?]$', anchor))
    ret = bool(badchar or isurl or number or containsLongNumbers or spamwords
               or repeaturl or len(anchor) > tl or not anchor
               or re.search(r'\?[a-zA-Z0-9]', anchor)                       # query-string fragment
               or not re.search('[a-zA-Z0-9]', anchor)                     # no alphanumerics at all
               or re.search(r'[a-zA-Z0-9]+/[a-zA-Z0-9]+\.[a-z]+', anchor)  # path-like string
               or re.search(r'(?i) (the|a)$', anchor))                     # ends with an article

    if debug:
        logging.warning('\t'.join([
            'url', 'anchor', 'badchar', 'isurl', 'number', 'containsLongNumbers',
            'spamwords', 'repeaturl', 'anclength', 'anchor', 'qmark string',
            'no alphanum', 'url string', 'end with articles']))
        logging.warning('\t'.join(map(str, [
            url, anchor, badchar, isurl, number, containsLongNumbers,
            spamwords, repeaturl, len(anchor), anchor,
            re.findall(r'\?[a-zA-Z0-9]', anchor),
            not re.findall('[a-zA-Z0-9]', anchor),
            re.findall(r'[a-zA-Z0-9]+/[a-zA-Z0-9]+\.[a-z]+', anchor),
            re.findall(r'(?i) (the|a)$', anchor)])))

    return ret
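# A minimal driver for BadAnchor, with the module globals it relies on stubbed
# out. The real normalize lives in genLibs; badwordset and titlelength are not
# shown in this snippet, so the values below are invented and would need to be
# defined before the function (tl=titlelength is bound at def time).
import re, logging
titlelength = 70                     # assumed cutoff on anchor length
badwordset = set(['click', 'here'])  # assumed spam-word list
normalize = lambda u: re.sub(r'^(https?://)?(www\.)?', '', u.lower()).rstrip('/')

print BadAnchor('Contact Us')             # False: clean anchor text
print BadAnchor('click here')             # True: every token is a spam word
print BadAnchor('www.example.com/about')  # True: anchor is a bare url
print BadAnchor('order #1234567')         # True: contains a long digit run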
Example #2
def IsSrCandidate(sr, url):
  """
  examples:
  1. url = maps.google.com, sr = www.google.com
  2. url = maps.google.com/help/en, sr = maps.google.com/help
  3. url = yahoo.com, sr = www.yahoo.com
  4. url = news.yahoo.com, sr = finance.yahoo.com
  """
  try:
    nsr = normalize(sr)
    nurl = normalize(url)
    if nsr == nurl: return False
    elif nurl.startswith(nsr): return True
    elif '/' not in nsr and '/' not in nurl and nurl.endswith(nsr): return True
    else: return False
  except Exception:
    return False
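# A quick driver over the four documented cases. normalize here is a
# hypothetical stand-in for the genLibs version, so the exact True/False
# outcomes depend on how the real one canonicalizes "www.".
import re
normalize = lambda u: re.sub(r'^(https?://)?(www\.)?', '', u.lower()).rstrip('/')

for sr, url in [('www.google.com', 'maps.google.com'),
                ('maps.google.com/help', 'maps.google.com/help/en'),
                ('www.yahoo.com', 'yahoo.com'),
                ('finance.yahoo.com', 'news.yahoo.com')]:
    print sr, url, IsSrCandidate(sr, url)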
Example #3
def RemoveCompanyNames(domain, pagetitles):
    # Treat the tokens of the normalized domain as company names and blank
    # them out of every page title; MatchWords (defined elsewhere) returns a
    # compiled regex.
    companyNames = [t for t in re.split('[/.]', normalize(domain)) if t.isalnum()]
    ret = []
    for pt in pagetitles:
        try:
            pattern = MatchWords('((' + '|'.join(companyNames) + '))')
            ret.append(re.sub(r'\s+', ' ', pattern.sub(' ', pt)))
        except Exception:
            raise ValueError(str([domain, pt]))
    return ret
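# MatchWords is not defined in this snippet; judging from its use it returns
# a compiled regex. A hypothetical word-boundary version plus a small driver,
# with normalize stubbed as before:
import re
normalize = lambda u: re.sub(r'^(https?://)?(www\.)?', '', u.lower()).rstrip('/')

def MatchWords(pattern):
    # hypothetical reconstruction: match the pattern only as whole words
    return re.compile(r'\b%s\b' % pattern, re.IGNORECASE)

print RemoveCompanyNames('http://www.acme.com',
                         ['Acme Support Center', 'Contact Acme Sales'])
# -> [' Support Center', 'Contact Sales']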
Example #4
def findroot(url):
    # Peel subdomains off one at a time until the remainder appears in the
    # module-level rdict of known roots, e.g. a.b.example.com -> b.example.com.
    try:
        root = normalize(url).split('/')[0]
        while root.count('.') > 0:
            if root in rdict:
                return root
            root = re.sub(r'^.*?\.', '', root)
        return None
    except Exception:
        return None
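# With a stub normalize and a toy rdict the subdomain-peeling is easy to
# trace; both names are stand-ins for module globals not shown here.
import re
normalize = lambda u: re.sub(r'^(https?://)?(www\.)?', '', u.lower()).rstrip('/')
rdict = {'yahoo.com': 1, 'news.yahoo.com': 1}  # hypothetical known roots

print findroot('http://finance.yahoo.com/q')   # -> 'yahoo.com'
print findroot('http://news.yahoo.com/world')  # -> 'news.yahoo.com'
print findroot('http://example.org/')          # -> None: no known root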
Example #5
word_delimiter = [r'\+', '-', '_']
old_domain = ""
total = 0
for line in sys.stdin:
    line = line.strip()
    domain = line.split('\t')[0]

    if domain != old_domain:
        old_domain = domain
        parent = {}
        total = 0
    ql = line.split('\t')[1]
    if total == MAX_Domain_Num: continue
    if domain not in ql or any(
            ext in ql
            for ext in exclude) or normalize(domain) == normalize(ql):
        continue
    #or findroot(domain)!=findroot(ql): continue

    # url_delimiter is defined at module level (not shown in this snippet)
    if ql[-1] in url_delimiter:
        ql = ql.strip(ql[-1])
    fields = re.split('|'.join(url_delimiter), ql)
    litem = fields[-1]
    pstr = ql[:len(ql) - len(litem)]

    nodes = ql.split('/')
    fields2 = re.split('|'.join(word_delimiter), nodes[-1])
    if len(fields2) >= MAX_Token_Num:
        continue  # remove long final node
    if len(nodes) > 1:
        # apply the same token cap to the second-to-last path node
        fields2 = re.split('|'.join(word_delimiter), nodes[-2])
        if len(fields2) >= MAX_Token_Num:
Example #6
        root = nurl.split('/')[0]

        while root.count('.') > 0:
            if root in rset:
                rr = rootdict.setdefault(root, {})
                rr[nurl] = url
                break
            root = re.sub(r'^.*?\.', '', root)
    print json.dumps(rootdict)

elif mode == 'test':
    # sanity test to make sure certain domains are in the file
    rootdictfile = sys.argv[2]
    with open(rootdictfile) as f:
        rootdict = json.loads(f.readline())
    testfile = sys.argv[3]
    testdomains = {}
    with open(testfile) as f:
        for i, line in enumerate(f.readlines()):
            testdomains[normalize(line.strip('\r\t\n '))] = i

    for k, v in rootdict.items():
        for k2, v2 in v.items():
            if k2 in testdomains:
                testdomains.pop(k2)
    # missing domains from rootdict
    for domain, i in sorted(testdomains.items(), key=lambda t: t[1]):
        print i, domain
Example #7
MIN_DIF_PARENT = 100
MAX_Single_Parent = float(0.8 * MAX_Top_Num)  # cap on urls sharing one parent
parent = {}
url_delimiter = ['/', r'\?', '&', '=', r'\$', '@', ';', ':', ',', r'\+']
word_delimiter = [r'\+', '-', '_']
old_domain = ""
total = 0
questioncount = 0

for line in sys.stdin:
    line = line.strip('\r\n ')
    if '\t' not in line: continue
    domain, ql = line.split('\t')
    if not domain or not ql: continue
    ndom = normalize(domain)
    if not ndom: continue
    if domain != old_domain:
        maxvalue = 0
        maxkey = ""
        if len(parent):
            maxkey = max(parent, key=parent.get)
            maxvalue = parent[maxkey]
        #   print parent
        #   print old_domain, maxvalue
        #   print old_domain, len(parent)
        if (maxvalue >= MAX_Single_Parent and len(parent) < MIN_DIF_PARENT) \
                or questioncount >= MIN_QUESTION_COUNT_TOP_6:
            try:
                sys.stdout.write('%s\n' % (old_domain))
Example #8
def GetFeatures(clicks, domain, url, bt, title, intanc, extanc, newtitle,
                newintanc, newextanc):
    #clicks, domain, ql, bingtitle, pagetitle, IntAnc, ExtAnc, newpagetitle, NewIntAnc, NewExtAnc

    urlroot = re.sub('^https?://', '', url.lower()).split('/')[0]
    urlrootset = set(urlroot.split('.'))
    urldomsuffix = re.sub(
        r'^www[0-9]?\.', '',
        re.sub('^https?://', '', domain.lower()).split('/')[0])
    urldomsuffixset = set(urldomsuffix.split('.'))
    subdomain = urlroot[:-(len(urldomsuffix) +
                           1)] if urldomsuffix in urlroot else ''

    #rawIntAnc = [t for t in intanc if not BadAnchor(t[0])]
    rawIntAncNew = [t for t in newintanc if not BadAnchor(t[0])]

    #rawExtAnc = [t for t in extanc if not BadAnchor(t[0])]
    rawExtAncNew = [t for t in newextanc if not BadAnchor(t[0])]

    title2 = re.sub(r'\s+', ' ', re.sub('<.*?>', '', newtitle.lower()))
    titlewords = re.split('[^a-zA-Z0-9]+', title2)
    titleacronym = ''.join(t[0] for t in titlewords if len(t) > 0)
    if not titleacronym: titleacronym = None
    nurl = normalize(url)
    urltip = re.sub(
        r'\..*$', '',
        nurl.split('/')[-1]).lower() if '/' in nurl else nurl.split('.')[0]

    urlset = set(
        map(RemovePlural, re.split('(?:%[0-9][0-9]|[^a-zA-Z0-9]+)', urltip)))

    titleset = set(map(RemovePlural, titlewords))
    topanchorwords = None if not rawIntAncNew else re.split(
        '[^a-zA-Z0-9]+', rawIntAncNew[0][0].lower())

    topanchoracronym = None if not (rawIntAncNew and topanchorwords) else \
        ''.join(t[0] for t in topanchorwords if len(t) > 0)
    if not topanchoracronym: topanchoracronym = None

    IntAnc = [
        set(map(RemovePlural, re.split('[^a-zA-Z0-9]+', s[0].lower())))
        for s in rawIntAncNew
    ]

    ExtAnc = [
        set(map(RemovePlural, re.split('[^a-zA-Z0-9]+', s[0].lower())))
        for s in rawExtAncNew
    ]

    if bt is not None:  # bing title present signals train mode

        bingtitle = set(
            map(RemovePlural,
                re.split('[^a-zA-Z0-9]+', re.sub('<.*?>', '', bt.lower()))))

        intitle = int(bingtitle.issubset(titleset.union(urldomsuffixset)))
        inurl = int(bingtitle.issubset(urlset.union(urldomsuffixset)))

        # stopwords let 'Terms & Conditions' match 'Terms and Conditions',
        # though the loosening may be risky
        intvec = [
            int(bingtitle.issubset(t.union(urldomsuffixset).union(stopwords)))
            for t in IntAnc
        ]

        extvec = [
            int(bingtitle.issubset(t.union(urldomsuffixset).union(stopwords)))
            for t in ExtAnc
        ]

        #target
        if intitle:
            if intvec and intvec[0]: target = 0
            else: target = 1
        elif intvec and intvec[0]:
            target = -1
        else:
            target = -2

    else:  # test mode
        target = -3

    # features
    titlelength = len(title2)
    titlewordcnt = len(titleset)
    topanchorwordcnt = -1 if not IntAnc else len(IntAnc[0])
    topanchorlength = -1 if not rawIntAncNew else len(rawIntAncNew[0][0])
    totalwt = sum(s[1] for s in rawIntAncNew)
    topanchorweight = -1 if not rawIntAncNew else rawIntAncNew[0][1]
    topanchorwtratio = -1 if not rawIntAncNew else topanchorweight * 1.0 / totalwt
    top2ndwtratio = -1 if len(
        rawIntAncNew) < 2 else rawIntAncNew[0][1] * 1.0 / rawIntAncNew[1][1]
    urltitleoverlap = len([t for t in titleset if t in urltip
                           ]) * 1.0 / len(titleset)

    urltopanchoroverlap = -1 if not IntAnc else len(
        [t for t in IntAnc[0] if t in urltip]) * 1.0 / len(IntAnc[0])

    topanchortitleoverlap = -1 if not IntAnc else len(
        IntAnc[0].intersection(titleset)) * 1.0 / len(IntAnc[0])

    anchorweightentropy = entropy([t[1] for t in rawIntAncNew])
    titlebars = title2.count('|')

    # v1features/v2features/v3features are module-level lists of feature
    # names; eval resolves each name against the locals computed above
    return '\t'.join(
        map(UtfFix, [
            clicks, domain, url, bt, title, newtitle,
            '' if not rawIntAncNew else rawIntAncNew[0][0], target
        ] + map(eval, v1features + v2features + v3features)) + [
            json.dumps(rawIntAncNew),
            json.dumps(rawExtAncNew),
            json.dumps(intanc),
            json.dumps(extanc)
        ])
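# The map(eval, ...) line above works because each feature list holds the
# *names* of locals computed earlier, and eval resolves a name against the
# enclosing function's scope. A self-contained sketch of the same trick
# (this function and its feature names are invented):
def tiny_features(title):
    titlelength = len(title)
    titlebars = title.count('|')
    v1features = ['titlelength', 'titlebars']  # names of locals to emit
    return '\t'.join(map(str, map(eval, v1features)))

print tiny_features('Home | Acme')  # -> '11\t1'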
Example #9
import sys, re, math, json
sys.path.insert(0, '/'.join(__file__.split('/')[:-1]))
from genLibs import normalize, fixstr, stdgen

# assuming ndom are unique

with open(sys.argv[1]) as f:
    aliaspair = [(normalize(k), set(v)) for k, v in json.loads(f.readline()).items()]
aliasdict = {}
for k, v in aliaspair:
    aliasdict.setdefault(k, set()).update(v)

qlsPerDomain = 6 if len(sys.argv) < 3 else int(sys.argv[2])
# the optional third argument arrives as the string 'True' or 'False'
keepFewQls = False if len(sys.argv) < 4 else eval(str(sys.argv[3]))

pd = None
printed_domain = set()
for line in stdgen('lastdomain\tlastql\tlasttitle\t0.0'):
    tmp = line.strip('\r\t\n ').split('\t')
    domain, ql, title, score = tmp
    if domain != pd:
        if pd is not None:
            ret = sorted(ret, key=lambda t: float(t[3]), reverse=True)[:qlsPerDomain]
            if keepFewQls or len(ret) == qlsPerDomain:
                aliases = aliasdict.get(ndom, [pd])
                for alias in aliases:
                    if alias not in printed_domain:
                        print '\n'.join('\t'.join(map(fixstr, [alias] + r[1:4])) for r in ret)
                        printed_domain.add(alias)
Example #10
def process_url(url, donormalize=1):
    if donormalize: return normalize(url)
    else: return url
Example #11
def QlPostProcess(ndom, inputbag, strong=True, nitems=10, sortby=3, dedup=True, beautify=True):
    # default sortby is ctr
    # inputbag has the same format as the output schema
    # processing steps:
    # 1. title deduping
    # 2. remove long urls
    # 3. remove .xyz where xyz not in ognc|aspx|html|htm|jsp|gsp|tmpl
    # 4. only show one mail, login, locator, menu
    ret = []
    strong = eval(str(strong))  # accept a bool or the string 'True'/'False'
    if not inputbag: return ret

    inputbag = sorted(inputbag, key=lambda t: float(t[sortby]), reverse=True)
    titleset = set()
    keydict = {}
    parentdict = {}
    urlset = set()
    #mail, login, locator, menu = False, False, False, False
    domain = inputbag[0][0].lower()
    domain2 = re.sub('[^a-zA-Z0-9]', '', domain)
    #ndom = normalize(domain)
    ndomroot = ndom.split('/')[0]
    backups = []
    for i, rec in enumerate(inputbag):

        if not rec[2] and dedup: continue
        if beautify:
            rec[2] = TitleBeautify(rec[2], ndomroot)
        if not dedup:
            ret.append(rec)
            continue

        title = rec[2].lower().strip('\t\r\n ')
        if BadAnchor(title, rec[1], debug=True): continue

        # dedup key: title with <b> markup and non-alphanumerics stripped
        tmptitle = ''.join([t for t in re.split('[^a-zA-Z0-9]+', re.sub('</?b>', '', title)) if t])
        if tmptitle in titleset:
            continue

        titletokens = re.split(r'\s+', title)
        if any(t in titletokens for t in ['invalid', 'click', 'next', 'back', 'index', 'default', 'page']): continue
        cont = False
        for suffix in ['htm', 'jpg', 'jpeg', 'pdf', 'xml', 'doc', 'mp3'] + tlds:
            if '.' + suffix in title:
                backups.append(rec)
                titleset.add(tmptitle)
                titleset.add(tmptitle.strip('s'))
                cont = True
        if cont: continue
        if title.startswith('?'): continue
        if title.isalnum() and not title.isalpha():  # alphanumeric with digits
            backups.append(rec)
            titleset.add(tmptitle)
            titleset.add(tmptitle.strip('s'))
            continue
        if strong: title = ''.join([t for t in re.split('[^a-zA-Z0-9]+', re.sub('</?b>', '', title)) if t])
        #if max([ int(title in t or t in title) for t in titleset ] + [0] ) == 1: continue
        #if max([ int(title == t) for t in titleset ] + [0] ) == 1: continue

        ql = rec[1].lower()
        nql = normalize(ql)
        if nql in urlset: continue
        if '/' in nql:
            qltip = nql.split('/')[-1]
            #if not '?' in qltip and '.' in qltip and not qltip.split('.')[-1] in goodsuffixes: continue
        if hasManyTokenNode(ql): 
            backups.append(rec)
            titleset.add(tmptitle)
            titleset.add(tmptitle.strip('s'))
            continue
        title2 = re.sub('[^a-zA-Z0-9]', '', title)
        if len(title2) < 2:
            backups.append(rec)
            titleset.add(tmptitle)
            titleset.add(tmptitle.strip('s'))
            continue

        cont = False
        for k in ['mail', '(login|logon|signin|signon)', 'signup', 'menu', 'about',
                  'contact', '(map|direction|locator|location)']:
            # keep each key page only once, and only when the keyword is not
            # already part of the domain name itself
            if not re.findall(k, domain2) and re.findall(k, title2):
                if k in keydict: cont = True
                keydict[k] = nql
        if cont: continue
        #if re.findall('(login|logon|signin|signon)',title2): rec[2] = 'Log In'
        parent = re.sub(r'/[^/]*$', '', nql)

        # allow at most 3 quicklinks under the same parent path
        if parentdict.setdefault(parent, 0) > 2 and parent != ndom:
            backups.append(rec)
            titleset.add(tmptitle)
            titleset.add(tmptitle.strip('s'))
            continue
        parentdict[parent] += 1
        ret.append(rec)
        if nitems != -1 and len(ret) >= nitems: break
        titleset.add(tmptitle)
        titleset.add(tmptitle.strip('s'))
        urlset.add(nql)

    if nitems == -1: return ret
    return ret + backups[:nitems - len(ret)]
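# hasManyTokenNode is defined elsewhere; by analogy with the token checks in
# Example #5, a plausible (purely hypothetical) reconstruction flags urls
# where some path node splits into too many words:
import re
MAX_Token_Num = 5  # assumed cutoff, mirroring Example #5

def hasManyTokenNode(url):
    # hypothetical reconstruction, not the original implementation
    for node in url.split('/'):
        if len(re.split(r'[+\-_]', node)) >= MAX_Token_Num:
            return True
    return False

print hasManyTokenNode('example.com/some-very-long-seo-landing-page')  # True
print hasManyTokenNode('example.com/contact')                          # False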
Example #12
        scoreidx = int(sys.argv[4])

    if len(sys.argv) > 5:   # for debugging purpose
        mode = sys.argv[5]

    swapDict = {scoreidx: 3, 3: scoreidx}
    revSwapDict = reverseDict(swapDict)
    
    if mode == 'debug':
        pd = None
        for line in stdgen('None2\tNone2\tNone2\t0.0'):
            # domain ql title score
            tmp = line.strip('\n').split('\t')
            domain, ql, title = tmp[:3]
            score = float(tmp[3])
            ndom = normalize(domain)
            if pd != ndom:
                if pd is not None:
                    # flush the previous domain's accumulated bag
                    print '\n'.join('\t'.join(map(str, t)) for t in QlPostProcess(
                        pd, inputbag, strong=True, nitems=10, sortby=3, dedup=True, beautify=True))
                inputbag = []
                pd = ndom
            inputbag.append([domain, ql, title, score])

    else:
        pd = None
        for line in stdgen(('None\t' * 11)[:-1]):
            tmp = line.strip('\n').split('\t')
            domain = tmp[0]
            if domain == 'unknown': continue
            if pd != domain:
Example #13
import sys, re, math
import simplejson as json

sys.path.insert(0, '/'.join(__file__.split('/')[:-1]))
from genLibs import normalize
rootfile = sys.argv[1]
# input must be unique root file, so news.yahoo.com and yahoo.com cannot both be in
with open(rootfile) as f:
    rset = set(json.loads(f.readline()))
ret = {}
for line in sys.stdin:
    url = line.strip('\r\t\n ')
    root = normalize(url).split('/')[0]

    # same subdomain-peeling walk as findroot above
    while root.count('.') > 0:
        if root in rset:
            ret.setdefault(root, []).append(url)
            break
        root = re.sub(r'^.*?\.', '', root)
print json.dumps(ret)
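# To see the output shape, the same walk over an in-memory list instead of
# sys.stdin; the root set is invented and the lambda stands in for
# genLibs.normalize.
import re, json
normalize = lambda u: re.sub(r'^(https?://)?(www\.)?', '', u.lower()).rstrip('/')
rset = set(['yahoo.com'])  # hypothetical unique-root set

ret = {}
for url in ['http://news.yahoo.com/world', 'http://finance.yahoo.com/q']:
    root = normalize(url).split('/')[0]
    while root.count('.') > 0:
        if root in rset:
            ret.setdefault(root, []).append(url)
            break
        root = re.sub(r'^.*?\.', '', root)
print json.dumps(ret)
# -> {"yahoo.com": ["http://news.yahoo.com/world", "http://finance.yahoo.com/q"]}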
Example #14
"""
http://000sweb.co.monterey.ca.us/cob/   http://000sweb.co.monterey.ca.us/cob/supervisor.htm     1.0
http://000sweb.co.monterey.ca.us/cob/   http://000sweb.co.monterey.ca.us/cob/minutes/2001/010403M.htm   1.0
http://002salvage.com/  http://www.002salvage.com/product/default.asp   1.0
http://002salvage.com/  http://www.002salvage.com/about/news.asp        1.0
http://005.housedems.com/       http://005.housedems.com/contact-me     2.0
http://005.housedems.com/       http://005.housedems.com/biography      1.0
"""
if __name__ == "__main__":
    pd = None
    top10kfile = 'seed_urls.top10k' if len(sys.argv) < 2 else sys.argv[1]
    top10kset = set()
    with open(top10kfile) as f:
        for line in f.readlines():
            top10kset.add(normalize(line.strip('\r\t\n ')))
    mode = 'Missing' if len(sys.argv) < 3 else sys.argv[2]
    if mode == 'Found': ret = set()
    # the optional flag arrives as the string 'True' or 'False'
    doNorm = True if len(sys.argv) < 4 else eval(str(sys.argv[3]))
    for line in sys.stdin:
        domain = line.strip('\r\t\n ').split('\t')[0]
        if domain != pd:
            pd = domain
            ndom = normalize(domain) if doNorm else domain
            if ndom in top10kset:
                if mode == 'Found': ret.add(ndom)
                elif mode == 'Missing': top10kset.remove(ndom)
    if mode == 'Found':
        print '\n'.join(ret)
    elif mode == 'Missing':
        print '\n'.join(top10kset)
Example #15
import sys, re, math, json
sys.path.insert(0, '/'.join(__file__.split('/')[:-1]))
from genLibs import normalize
# alias.dict.onejson
# {"http://www.state.in.us/": ["http://www.state.in.us/"], "http://www.hennepin.us/": ["http://www.hennepin.us/"],

topk = 6

with open(sys.argv[1]) as f:
    aliaspair = [(normalize(k), set(v))
                 for k, v in json.loads(f.readline()).items()]
aliasdict = {}
for k, v in aliaspair:
    aliasdict.setdefault(k, set()).update(v)


def fixstr(s):
    try:
        return str(s)
    except Exception:
        # str() fails on non-ascii input; fall back to lenient decoding
        return s.decode('utf-8', 'ignore')


ret = {}
for line in sys.stdin:
    domain, ql, title, score = line.strip('\r\t\n ').split('\t')
    ndom = normalize(domain)
    ret.setdefault(ndom, {}).setdefault(domain, []).append((domain, ql, title, score))
Example #16
import re, math, sys

sys.path.insert(0, '/'.join(__file__.split('/')[:-1]))
from genLibs import normalize
import xml.etree.ElementTree as et
from xml.sax.saxutils import escape

for line in sys.stdin:
    dom, outlinks = line.strip('\n').split('\t')
    ndom = normalize(dom)
    # earlier xml.etree-based parse, kept commented for reference:
    #try:
    #    root = et.fromstring('<root>' + outlinks + '</root>')
    #    for link in root.iter('L'):
    #        tmp = link.text.split(' ')
    #except:
    #    continue
    for link in re.findall('<L>(.*?)</L>', outlinks):
        # <L>13da8a85e450 84d103dc8a0a9980 http://www.gobluehose.com/ en canonical 12 1419811200</L>
        tmp = link.split(' ')
        ol = tmp[2]
        nol = normalize(ol)
        anchor = ' '.join(tmp[4:-2])
        print '\t'.join([dom, ndom, ol, nol, anchor])
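# The slicing is easiest to see on the sample record from the comment:
# tmp[2] is the outlink url and tmp[4:-2] is everything between the language
# code and the two trailing numeric fields.
rec = '13da8a85e450 84d103dc8a0a9980 http://www.gobluehose.com/ en canonical 12 1419811200'
tmp = rec.split(' ')
print tmp[2]               # -> http://www.gobluehose.com/
print ' '.join(tmp[4:-2])  # -> canonical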
Example #17
    ]


if __name__ == '__main__':
    pd = None
    mode = 'ql'
    if len(sys.argv) > 1:
        mode = sys.argv[1]
    nQlAncs = 3 if len(sys.argv) <= 2 else int(sys.argv[2])
    if mode == 'domain':
        for line in stdgen('\t'.join(['None'] * 7)):
            domain, url, redit, title, wmdata, dom_title, dom_wmdata = line.strip(
                '\n').split('\t')[:7]
            if domain != pd:
                if pd is not None:
                    print '%s\t%s' % (normalize(pd), json.dumps(ret))
                ret = {}
                pd = domain
                domaintokens = set(
                    tokenize(dom_title) + [
                        t for x in parseIntAnc(dom_wmdata, 1)
                        for t in tokenize(x)
                    ])

            for s in tokenize(title) + [
                    t for x in parseIntAnc(wmdata, nQlAncs)
                    for t in tokenize(x)
            ]:
                if s not in domaintokens: ret[s] = ret.get(s, 0) + 1

    elif mode == 'ql':
Example #18
import sys, re, math
sys.path.insert(0, '/'.join(__file__.split('/')[:-1]))
from genLibs import normalize, stdgen

domainfile = sys.argv[1]
nvcut = 0 if len(sys.argv) < 3 else int(sys.argv[2])
domdict = {}
with open(domainfile) as f:
    for line in f.readlines():
        tmp = line.strip('\r\t\n ').split('\t')
        tmp[0] = normalize(tmp[0])
        domdict[tmp[0]] = tmp

pd = None
for line in stdgen('lastdomain\tlastql\tlasttitle\t0.0'):
    tmp = line.strip('\n')
    domain = tmp.split('\t')[0]
    if domain != pd:
        if pd is not None:
            tmp0 = domdict.get(ndom)
            if tmp0 and float(tmp0[2]) >= nvcut:
                assert len(ret) == 6
                print '\n'.join(ret)
        pd = domain
        ndom = normalize(pd)
        ret = []
    ret.append(tmp)

Example #19
def UrlTip(url):
    # "tip": first hostname label for bare domains, otherwise the last
    # path segment with its extension stripped
    nurl = normalize(url)
    if '/' not in nurl: return nurl.split('.')[0]
    else: return nurl.split('/')[-1].split('.')[0]
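# A quick check of both branches, with the usual hypothetical stub standing
# in for genLibs.normalize:
import re
normalize = lambda u: re.sub(r'^(https?://)?(www\.)?', '', u.lower()).rstrip('/')

print UrlTip('http://www.acme.com')               # -> 'acme'
print UrlTip('http://acme.com/docs/manual.html')  # -> 'manual'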
Example #20
import sys
from GenTrainToken import BadAnchor
from genLibs import normalize
try:
    import simplejson as json
except ImportError:
    import json
from DomainTokens import ud, parseIntAnc, stdgen, stopwords, tokenize

# get rid of non-ascii
# input = hcat pipeline.1/QuicklinkTitles/combined.redir.title.wmdata/par*
if __name__ == '__main__':
    nQlAncs = 3
    if len(sys.argv) > 1: nQlAncs = int(sys.argv[1])

    pd = None
    for line in sys.stdin:
        domain, url, redit, title, wmdata, dom_title, dom_wmdata = line.strip(
            '\n').split('\t')[:7]
        if domain != pd:
            ndom = normalize(domain)
            pd = domain
        domaintokens = set(
            tokenize(dom_title) +
            [t for x in parseIntAnc(dom_wmdata, 1) for t in tokenize(x)])
        ret = {}
        for s in tokenize(title) + [
                t for x in parseIntAnc(wmdata, nQlAncs) for t in tokenize(x)
        ]:
            if s not in domaintokens: ret[s] = ret.get(s, 0) + 1
        print '\t'.join([ndom, url, json.dumps(ret)])
Example #21
        'clicks', 'domain', 'ql', 'bingtitle', 'pagetitle', 'intanc', 'extanc',
        'newpagetitle', 'newintancs', 'newextancs'
    ])

pts = None
for line in stdgen('domain\tql\tredt\tpagetitle\twmdata\tbingtitle'):

    line = line.strip('\n')
    if line.count('\t') > 5: line = line.strip('\t')
    tmp = line.split('\t')
    domain, ql, redt, pagetitle, wmdata = tmp[:5]
    if len(tmp) > 5:
        bingtitle = tmp[5]
    else:
        bingtitle = None
    nurl = normalize(ql)
    clicks = str(clickdict.get(nurl, 0.0))
    bingtitle = ud(bingtitle)
    pagetitle = ud(pagetitle)

    if pd != domain:
        if pd is not None:
            ts = time.time()
            if pts is not None:
                deltaT = ts - pts
                if deltaT > 1000:
                    logging.warning('long duration domain: %s; duration = %f' %
                                    (pd, deltaT))
            pts = ts

            newpagetitles = RemoveRepeatedPhrase(pagetitles)
Example #22
    if len(sys.argv) > 6:
        # boolean flags arrive as the strings 'True'/'False'
        rankbyctr = eval(str(sys.argv[6]))

    rankbyscore = True
    if len(sys.argv) > 7:
        rankbyscore = eval(str(sys.argv[7]))
    nitems = 10
    if len(sys.argv) > 8:
        nitems = eval(str(sys.argv[8]))

    bl = set()
    if blf != 'None':
        with open(blf) as f:
            for line in f.readlines():
                tmp = line.strip('\r\t\n ').split('\t')
                ndom = normalize(tmp[0])
                nql = normalize(tmp[1])
                bl.add((ndom, nql))

    titlebank = {}
    if titlefile != 'None':
        with open(titlefile) as f:
            titlebank = dict(((ndom, nql), v[-1][0])
                             for ndom, v0 in json.loads(f.readline()).items()
                             for nql, v in v0.items())

    wl = {}
    for wlf in wlfs.split(','):
        with open(wlf) as f:
            for line in f.readlines():
                tmp = line.strip('\r\t\n ').split('\t')[:4]
Example #23
    if mode == 'JoinSr':
        for line in sys.stdin:
            domain, nurl, selfclicks, totalclicks, ourl, c1, c2, n1, n2, l0, l1, l2, la, na, selfviews, totalviews, v1, v2, depth, potential, selfctr, totalctr, ctr1, ctr2 = line.strip(
                '\r\t\n ').split('\t')
            srdict, root = FindSrsFromUrl(nurl)
            #srdict = rdict.get(root,{})
            assert domain == root
            if not srdict: continue
            SrCandidates = FilterSr(nurl, srdict)
            for sr in SrCandidates:
                try:
                    print '\t'.join([
                        sr, nurl, selfclicks, totalclicks, ourl, c1, c2, n1,
                        n2, l0, l1, l2, la, na, selfviews, totalviews, v1, v2,
                        depth, potential, selfctr, totalctr, ctr1, ctr2
                    ])
                except Exception:
                    pass  # skip rows that cannot be printed

    elif mode == 'FindDomain':  # streaming in WebmapAllUrls-views.pig
        for line in sys.stdin:
            tmp = line.strip('\n').split('\t')
            url = tmp[0]
            nurl = normalize(url)
            srdict, root = FindSrsFromUrl(nurl)
            if not srdict: continue
            print '\t'.join(tmp + [root])

    else:
        raise ValueError('Unknown mode in JoinSr.py!')
Example #24
    mode = 'update'
    if len(sys.argv) > 3:
        mode = sys.argv[3]
    try:
        with open(jsonfile) as f:
            titledict = json.loads(f.readline())

        # keep a timestamped backup of the current dictionary
        with open('backlog/%s.bak.%s' % (jsonfile, str(timestamp)), 'w') as f:
            f.write(json.dumps(titledict))
    except Exception:
        titledict = {}  # no usable existing file: start fresh

    if mode == 'update':
        for line in sys.stdin:
            domain, ql, title = line.strip('\n').split('\t')
            ndom = normalize(domain)
            nql = normalize(ql)
            titledictndom = titledict.setdefault(ndom, {})
            titledictndomnql = titledictndom.setdefault(nql, [])
            titledictndomnql.append((title, label))

        with open(jsonfile, 'w') as f:
            f.write(json.dumps(titledict))

    elif mode == 'dedup':
        ret = {}
        for k, v in titledict.items():
            ret[k] = {}
            for k2, v2 in v.items():
                ret[k][k2] = Dedup(v2)
        with open(jsonfile, 'w') as f: