Example #1
For example:
  http://www.cis.udel.edu/home/main.html is saved to the /home directory
  http://www.cis.udel.edu/home/nextdir/text.html is saved to /home/nextdir
"""
from googlemaps import GoogleMaps
import saxparser
import json
import sys
import time
from searchpl import yelpsearch as yp
contextlist = list()
chandler = saxparser.ContextFileHandler(contextlist)
gkey = sys.argv[1]
print 'google key', ' ', gkey
gmap = GoogleMaps(gkey)
saxparser.parse('/usa/arao/trec/contexttrec12/contexts.txt', chandler)

for context in contextlist:
    js = json.load(
        open('/usa/arao/trec/contexttrec12/yelpplaces/' +
             context.attribute['number']))
    businesslist = list()
    print 'num businesses in this context', ' ', len(js['businesses'])
    for business in js['businesses']:
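        # Businesses that already carry a 'gmaptime' entry (presumably filled in by an
        # earlier run) are kept as-is and skipped.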
        if business.get('gmaptime') is not None:
            businesslist.append(business)
            continue

        destlat = float(str(business['location']['coordinate']['latitude']))
        destlong = float(str(business['location']['coordinate']['longitude']))
        dest = gmap.latlng_to_address(destlat, destlong)
Example #2
def main(argv=None):
    stprofile = int(argv[0])
    eprofile = int(argv[1])
    ofile = argv[2]
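    # Note: pfile, cfile, exdir, catfile, ypdir, suggdir, urlfile, temporal and ranklambda,
    # as well as the helpers makeexampledocs, getcategories, getyelpsuggestions, suggestion,
    # result, gettemporaldocs and outputrun, are not defined in this snippet; presumably
    # they come from elsewhere in the original script.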
    profiles = saxparser.profilefilehandler(None).lineparse(pfile)
    contexts = list()
    saxparser.parse(cfile, saxparser.ContextFileHandler(contexts))
    exampledocs = makeexampledocs(exdir)
    categories = getcategories(catfile)
    
    suggestions = dict()
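    # Build a suggestion model (document bag) for each context from its Yelp candidate documents.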
    for context in contexts:
        docs = getyelpsuggestions(
            context.attribute['number'], ypdir, suggdir,
            urlfile, categories).values()
        suggestions[context.attribute['number']] = suggestion(docs)
        
    temporaldoccache = dict()


    for profile in profiles[stprofile:eprofile]:
        examples = profile.getgoodexamples()
        negexamples = profile.getnegexamples()                   
        
        for context in contexts[40:]:

            docbag = suggestions[context.attribute['number']].bag
            docscore = dict()
            langresult = list()
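            # With temporal evidence enabled, lazily build the context's temporal score:
            # temporal documents are cached across profiles in temporaldoccache.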
            
            if temporal:
                if suggestions[context.attribute['number']].rank is None:
                    temporaldocs = gettemporaldocs(context, '/usa/arao/trec/contexttrec12')
                    doclist = list()
                    for doc in temporaldocs:
                        if temporaldoccache.get(doc.fid) is not None:
                            doclist.append(temporaldoccache.get(doc.fid))
                        else:
                            doclist.append(doc)
                            temporaldoccache[doc.fid] = doc         
                    suggestions[context.attribute['number']].makescore(doclist)
        
          
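            # Average the smoothed multinomial scores of the candidate documents over the
            # profile's positive example documents.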
            for example in examples:
                if exampledocs.get(example) is None:
                    continue
                cdocscore = docbag.multinomial(exampledocs[example], 0.2)               
                for key, value in cdocscore.items():
                    if docscore.get(key) is not None:
                        docscore[key] = docscore.get(key) + (value/ len(examples))
                    else:
                        docscore[key] = value / len(examples)
            
            negdocscore = dict()
            
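            # Score against the profile's negative example documents the same way
            # (note: the original still divides by len(examples), the positive-example count).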
            for example in negexamples:
                if exampledocs.get(example) is None:
                    continue
                cdocscore = docbag.multinomial(exampledocs[example], 0.2)
                for key, value in cdocscore.items():
                    if negdocscore.get(key) is not None:
                        negdocscore[key] = (value/ len(examples)) + negdocscore[key]
                    else:
                        negdocscore[key] = value / len(examples)

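            # Final score per document: positive-example evidence minus negative-example evidence.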
            score = dict() 
            for key in docscore.keys():
                negative = negdocscore.get(key)
                if negative is None:
                    negative = 0
                positive = docscore.get(key)
                score[key] = positive - negative

            docscore = score
            
            for key,value in docscore.items():
                langresult.append(result(key, value, docbag.getdoc(key)))

            finalresult = sorted(langresult, key = lambda item : item.score, reverse=True)
        
            if temporal:
                temprank = suggestions[context.attribute['number']].rank
                combinedresult = list()
                count = 0.0
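                # Linearly blend the language-model rank (count) with the temporal rank via
                # ranklambda; a lower combined value is better, hence the ascending sort below.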
                for item in finalresult:
                    newrank = ranklambda * count + (1-ranklambda) * temprank.index(item.idt)
                    combinedresult.append(result(item.idt, newrank, item.doc))
                    count = count + 1
                    print count, ' new rank ', newrank, ' temporal rank ', temprank.index(item.idt)
                finalresult = sorted(combinedresult, key = lambda item : item.score)

            docbag.flushmemory()

            finalind = len(finalresult)
            if finalind > 50:
                finalind = 50         
            outputrun(ofile, profile, finalresult[0:finalind], context.attribute['number'])
Example #3
#!/usr/bin/python
"""
This script crawls each of the sites mentioned in the examples.txt file
of the TREC 2012 contextual track.
For each example it creates a directory named after that example's number
and stores the crawled pages of that site in that directory.
The pages are stored according to the path in the URL of the crawled page.
For example:
  http://www.cis.udel.edu/home/main.html is saved to the /home directory
  http://www.cis.udel.edu/home/nextdir/text.html is saved to /home/nextdir
"""

import saxparser

from crawler import curlcrawl
examplelist = list()
exhandler = saxparser.ExampleFileHandler(examplelist)
saxparser.parse('/usa/arao/trec/contexttrec12/examples.txt',exhandler)
for ex in examplelist[0:3]:
    fetched = curlcrawl.curlcrawl([ex.attribute['url']],maxlink=10,dumpdir='/usa/arao/trec/contexttrec12/texamplesites/'+ex.attribute['number'], mode = 0750)
    print ex.attribute['number'], ' ', fetched
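
The docstring above describes how crawled pages are laid out on disk: each page is written under the example's dump directory at the directory path taken from its URL. The helper below is a minimal sketch of that mapping using only the standard library (urlparse, os.path); the names localpath and dumpdir are illustrative, and this is not the actual logic inside crawler.curlcrawl.

import os
from urlparse import urlparse  # Python 2, matching the scripts in these examples

def localpath(dumpdir, url):
    """Map a crawled URL to a file path under dumpdir, mirroring the URL's directory structure."""
    path = urlparse(url).path            # e.g. '/home/nextdir/text.html'
    relative = path.lstrip('/')          # drop the leading slash
    if not relative or relative.endswith('/'):
        relative += 'index.html'         # give directory-style URLs a file name
    return os.path.join(dumpdir, relative)

# Example (hypothetical dump directory):
#   localpath('/tmp/examplesites/1', 'http://www.cis.udel.edu/home/nextdir/text.html')
#   -> '/tmp/examplesites/1/home/nextdir/text.html'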


Example #4
import sys
import saxparser
import json
import time
from searchpl import yelpsearch as yp
contextlist = list()
chandler = saxparser.ContextFileHandler(contextlist)
ckey = sys.argv[1]
csecret = sys.argv[2]
token = sys.argv[3]
tokensecret = sys.argv[4]
idir = sys.argv[5]
odir = sys.argv[6]
startcontext = int(sys.argv[7])
togetcount = int(sys.argv[8])
saxparser.parse('/usa/arao/trec/contexttrec12/contexts.txt',chandler)

for context in contextlist[startcontext - 1:]:
    exbusiness = json.load(open(idir+'/'+context.attribute['number']))
    bid = dict()
    for business in exbusiness['businesses']:
        bid[business['id']] = 1
    count = len(exbusiness['businesses'])
    if togetcount < count:
        count = togetcount
    args = []
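    # Assemble OAuth credentials as command-line style flags, presumably in the format
    # expected by the yelpsearch helper (yp) imported above.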
    args.append('--consumer_key='+ckey)
    args.append('--consumer_secret='+csecret)
    args.append('--token='+token)
    args.append('--token_secret='+tokensecret)
Example #5
#!/usr/bin/python
"""
This script crawls each of the sites mentioned in the examples.txt file
of the TREC 2012 contextual track.
For each example it creates a directory named after that example's number
and stores the crawled pages of that site in that directory.
The pages are stored according to the path in the URL of the crawled page.
For example:
  http://www.cis.udel.edu/home/main.html is saved to the /home directory
  http://www.cis.udel.edu/home/nextdir/text.html is saved to /home/nextdir
"""

import saxparser

from crawler import curlcrawl
examplelist = list()
exhandler = saxparser.ExampleFileHandler(examplelist)
saxparser.parse('/usa/arao/trec/contexttrec12/examples.txt', exhandler)
for ex in examplelist[0:3]:
    fetched = curlcrawl.curlcrawl(
        [ex.attribute['url']],
        maxlink=10,
        dumpdir='/usa/arao/trec/contexttrec12/texamplesites/' +
        ex.attribute['number'],
        mode=0750)
    print ex.attribute['number'], ' ', fetched