"""
For example http://www.cis.udel.edu/home/main.html : saved to /home dir
http://www.cis.udel.edu/home/nextdir/text.html will be saved to /home/nextdir
"""
from googlemaps import GoogleMaps
import saxparser
import json
import sys
import time
from searchpl import yelpsearch as yp

contextlist = list()
chandler = saxparser.ContextFileHandler(contextlist)
gkey = sys.argv[1]
print 'google key', ' ', gkey
gmap = GoogleMaps(gkey)
saxparser.parse('/usa/arao/trec/contexttrec12/contexts.txt', chandler)
for context in contextlist:
    js = json.load(
        open('/usa/arao/trec/contexttrec12/yelpplaces/' + context.attribute['number']))
    businesslist = list()
    print 'num businesses in this context', ' ', len(js['businesses'])
    for business in js['businesses']:
        # Businesses that already carry a gmaptime entry are kept as-is.
        if business.get('gmaptime') is not None:
            businesslist.append(business)
            continue
        destlat = float(str(business['location']['coordinate']['latitude']))
        destlong = float(str(business['location']['coordinate']['longitude']))
        dest = gmap.latlng_to_address(destlat, destlong)
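# A minimal, self-contained sketch of the throttled reverse-geocoding step
# used above. latlng_to_address comes from the old googlemaps 1.x client the
# script imports; the helper name and the delay value are illustrative
# assumptions, not part of the original.
import time
from googlemaps import GoogleMaps

def reverse_geocode(gmap, lat, lng, delay=1.0):
    # One reverse-geocoding request per business, then sleep so the script
    # stays under the Google Maps API rate limit.
    address = gmap.latlng_to_address(lat, lng)
    time.sleep(delay)
    return address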
def main(argv=None):
    stprofile = int(argv[0])
    eprofile = int(argv[1])
    ofile = argv[2]
    # pfile, cfile, exdir, catfile, ypdir, suggdir, urlfile, temporal and
    # ranklambda are assumed to be module-level globals defined elsewhere.
    profiles = saxparser.profilefilehandler(None).lineparse(pfile)
    contexts = list()
    saxparser.parse(cfile, saxparser.ContextFileHandler(contexts))
    exampledocs = makeexampledocs(exdir)
    categories = getcategories(catfile)
    suggestions = dict()
    for context in contexts:
        docs = getyelpsuggestions(context.attribute['number'], ypdir, suggdir,
                                  urlfile, categories).values()
        suggestions[context.attribute['number']] = suggestion(docs)
    temporaldoccache = dict()
    for profile in profiles[stprofile:eprofile]:
        examples = profile.getgoodexamples()
        negexamples = profile.getnegexamples()
        for context in contexts[40:]:
            docbag = suggestions[context.attribute['number']].bag
            docscore = dict()
            langresult = list()
            if temporal:
                if suggestions[context.attribute['number']].rank is None:
                    temporaldocs = gettemporaldocs(context, '/usa/arao/trec/contexttrec12')
                    doclist = list()
                    for doc in temporaldocs:
                        # Reuse cached temporal documents instead of re-reading them.
                        if temporaldoccache.get(doc.fid) is not None:
                            doclist.append(temporaldoccache.get(doc.fid))
                        else:
                            doclist.append(doc)
                            temporaldoccache[doc.fid] = doc
                    suggestions[context.attribute['number']].makescore(doclist)
            # Score each candidate document against the positive examples,
            # averaging the multinomial language-model scores.
            for example in examples:
                if exampledocs.get(example) is None:
                    continue
                cdocscore = docbag.multinomial(exampledocs[example], 0.2)
                for key, value in cdocscore.items():
                    if docscore.get(key) is not None:
                        docscore[key] = docscore.get(key) + (value / len(examples))
                    else:
                        docscore[key] = value / len(examples)
            # Average the negative-example scores, normalizing by the number
            # of negative examples.
            negdocscore = dict()
            for example in negexamples:
                if exampledocs.get(example) is None:
                    continue
                cdocscore = docbag.multinomial(exampledocs[example], 0.2)
                for key, value in cdocscore.items():
                    if negdocscore.get(key) is not None:
                        negdocscore[key] = (value / len(negexamples)) + negdocscore[key]
                    else:
                        negdocscore[key] = value / len(negexamples)
            # Final score is the positive-example score minus the
            # negative-example score.
            score = dict()
            for key in docscore.keys():
                negative = negdocscore.get(key)
                if negative is None:
                    negative = 0
                positive = docscore.get(key)
                score[key] = positive - negative
            docscore = score
            for key, value in docscore.items():
                langresult.append(result(key, value, docbag.getdoc(key)))
            finalresult = sorted(langresult, key=lambda item: item.score, reverse=True)
            if temporal:
                # Interpolate the language-model rank with the temporal rank.
                temprank = suggestions[context.attribute['number']].rank
                combinedresult = list()
                count = 0.0
                for item in finalresult:
                    newrank = ranklambda * count + (1 - ranklambda) * temprank.index(item.idt)
                    combinedresult.append(result(item.idt, newrank, item.doc))
                    count = count + 1
                    print count, ' new rank ', newrank, ' temporal rank ', temprank.index(item.idt)
                # Lower combined rank is better, so sort ascending.
                finalresult = sorted(combinedresult, key=lambda item: item.score)
            docbag.flushmemory()
            # Output at most the top 50 suggestions per profile/context pair.
            finalind = len(finalresult)
            if finalind > 50:
                finalind = 50
            outputrun(ofile, profile, finalresult[0:finalind], context.attribute['number'])
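# A standalone sketch of the rank-interpolation step in main() above: a
# document's language-model rank (its position in finalresult) is mixed with
# its temporal rank via ranklambda, and a lower combined rank wins. The
# sample values below are made-up for illustration.
def interpolate_ranks(lmrank, temprank, ranklambda):
    # Same formula as in main(): ranklambda weights the language-model rank,
    # (1 - ranklambda) weights the temporal rank.
    return ranklambda * lmrank + (1 - ranklambda) * temprank

# A document ranked 2nd by the language model and 10th temporally, with
# ranklambda = 0.7, lands at combined rank 0.7*2 + 0.3*10 = 4.4:
print interpolate_ranks(2, 10, 0.7)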
#!/usr/bin/python
"""
This script crawls each of the sites mentioned in the examples.txt file of
the context TREC 2012 track. For each of the examples it creates a directory
with that example number and stores the crawled pages of that site into that
directory. The pages are stored as per the path in the url of the crawled
page.
For example http://www.cis.udel.edu/home/main.html : saved to /home dir
http://www.cis.udel.edu/home/nextdir/text.html will be saved to /home/nextdir
"""
import saxparser
from crawler import curlcrawl

examplelist = list()
exhandler = saxparser.ExampleFileHandler(examplelist)
saxparser.parse('/usa/arao/trec/contexttrec12/examples.txt', exhandler)
for ex in examplelist[0:3]:
    fetched = curlcrawl.curlcrawl(
        [ex.attribute['url']], maxlink=10,
        dumpdir='/usa/arao/trec/contexttrec12/texamplesites/' + ex.attribute['number'],
        mode=0750)
    print ex.attribute['number'], ' ', fetched
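# A minimal sketch of the url-path-to-directory mapping the docstring above
# describes. curlcrawl's real implementation may differ; this helper and its
# names are illustrative assumptions only.
import os
import urlparse

def localpath(url, dumpdir):
    # http://www.cis.udel.edu/home/nextdir/text.html
    #   -> <dumpdir>/home/nextdir/text.html
    path = urlparse.urlparse(url).path.lstrip('/')
    return os.path.join(dumpdir, path)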
import sys
import saxparser
import json
import time
from searchpl import yelpsearch as yp

contextlist = list()
chandler = saxparser.ContextFileHandler(contextlist)
ckey = sys.argv[1]
csecret = sys.argv[2]
token = sys.argv[3]
tokensecret = sys.argv[4]
idir = sys.argv[5]
odir = sys.argv[6]
startcontext = int(sys.argv[7])
togetcount = int(sys.argv[8])
saxparser.parse('/usa/arao/trec/contexttrec12/contexts.txt', chandler)
for context in contextlist[startcontext - 1:]:
    exbusiness = json.load(open(idir + '/' + context.attribute['number']))
    # Track ids of businesses already fetched so they are not requested again.
    bid = dict()
    for business in exbusiness['businesses']:
        bid[business['id']] = 1
    count = len(exbusiness['businesses'])
    if togetcount < count:
        count = togetcount
    # Build the OAuth arguments passed on to the Yelp search module.
    args = []
    args.append('--consumer_key=' + ckey)
    args.append('--consumer_secret=' + csecret)
    args.append('--token=' + token)
    args.append('--token_secret=' + tokensecret)
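# Expected invocation, matching the eight positional arguments read above.
# The script name, key/token values and output directory are placeholders:
#   python fetchmoreyelp.py CONSUMER_KEY CONSUMER_SECRET TOKEN TOKEN_SECRET \
#       /usa/arao/trec/contexttrec12/yelpplaces OUTPUT_DIR 1 50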