def analyze(s):
    # The record is a repr'd dict; pull out the candidate fields.
    d = eval(s)
    special_keys = []
    name = d['name']
    electoral_district_type = d['electoral_district_type']
    electoral_district_name = d['electoral_district_name']
    state = d['state']
    link = d['link']
    text = d['sitetext'].lower().decode('utf-8')
    name, last, first = conv.clean_name(name)
    # Count each vocabulary term, then strip it so it is not counted again below.
    for v in vocabulary:
        special_keys += [conv.search_to_feature_key(v)] * text.count(v.lower())
        text = text.replace(v.lower(), '')
    # Candidate-name mentions.
    special_keys += [conv.search_to_feature_key('name')] * text.count(name.lower())
    special_keys += [conv.search_to_feature_key('last')] * text.count(last.lower())
    special_keys += [conv.search_to_feature_key('first')] * text.count(first.lower())
    # Slogan patterns: "<last> for", "vote <last>", "for <state>".
    special_keys += [conv.search_to_feature_key('lastfor')] * text.count(last.lower() + ' for')
    special_keys += [conv.search_to_feature_key('lastfor')] * text.count(last.lower() + 'for')
    special_keys += [conv.search_to_feature_key('lastfor')] * text.count(last.lower() + '4')
    special_keys += [conv.search_to_feature_key('votelast')] * text.count('vote' + last.lower())
    special_keys += [conv.search_to_feature_key('forstate')] * text.count('for ' + state.lower())
    # "(re)elect <name>" variants.
    special_keys += [conv.search_to_feature_key('reelectlast')] * text.count('reelect ' + name.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')] * text.count('reelect ' + last.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')] * text.count('re elect ' + name.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')] * text.count('re elect ' + last.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')] * text.count('re-elect ' + name.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')] * text.count('re-elect ' + last.lower())
    special_keys += [conv.search_to_feature_key('electlast')] * text.count('elect ' + name.lower())
    special_keys += [conv.search_to_feature_key('electlast')] * text.count('elect ' + last.lower())
    # "vote (for) <last>" variants.
    special_keys += [conv.search_to_feature_key('votelast')] * text.count('vote ' + last.lower())
    special_keys += [conv.search_to_feature_key('voteforlast')] * text.count('vote for ' + last.lower())
    special_keys += [conv.search_to_feature_key('voteforlast')] * text.count('votefor' + last.lower())
    special_keys += [conv.search_to_feature_key('voteforlast')] * text.count('vote4' + last.lower())
    # Strip the name tokens before counting district/office/state mentions.
    text = text.replace(name.lower(), '')
    text = text.replace(last.lower(), '')
    text = text.replace(first.lower(), '')
    special_keys += [conv.search_to_feature_key('electoral_district_type')] * sum(
        text.count(edt.lower())
        for edt in conv.district_type_dict[electoral_district_type])
    special_keys += [conv.search_to_feature_key('officename')] * sum(
        text.count(on.lower()) for on in conv.office_names)
    special_keys += [conv.search_to_feature_key('electoral_district_name')] * text.count(
        electoral_district_name.lower())
    special_keys += [conv.search_to_feature_key('state')] * text.count(state.lower())
    name_key = conv.search_to_feature_key('name')
    last_key = conv.search_to_feature_key('last')
    first_key = conv.search_to_feature_key('first')
    #print 'name keys ', special_keys.count(name_key), 'last keys ', special_keys.count(last_key), 'first keys ', special_keys.count(first_key)
    return basic_analyze(text) + special_keys
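# Note (explanatory only): the `[feature_key] * text.count(...)` pattern above emits
# one copy of the feature key per occurrence, so a site text that mentions the last
# name three times contributes three copies of conv.search_to_feature_key('last')
# to special_keys before basic_analyze(text) is appended.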
def analyze(s):
    # The record is a repr'd dict; pull out the candidate fields.
    d = eval(s)
    special_keys = []
    name = d['name']
    electoral_district_type = d['electoral_district_type']
    electoral_district_name = d['electoral_district_name']
    state = d['state']
    link = d['link']
    text = d['sitetext'].lower().decode('utf-8')
    name, last, first = conv.clean_name(name)
    # Count each vocabulary term, then strip it so it is not counted again below.
    for v in vocabulary:
        special_keys += [conv.search_to_feature_key(v)] * text.count(v.lower())
        text = text.replace(v.lower(), '')
    # Candidate-name mentions.
    special_keys += [conv.search_to_feature_key('name')] * text.count(name.lower())
    special_keys += [conv.search_to_feature_key('last')] * text.count(last.lower())
    special_keys += [conv.search_to_feature_key('first')] * text.count(first.lower())
    # Slogan patterns: "<last> for", "vote <last>", "for <state>".
    special_keys += [conv.search_to_feature_key('lastfor')] * text.count(last.lower() + ' for')
    special_keys += [conv.search_to_feature_key('lastfor')] * text.count(last.lower() + 'for')
    special_keys += [conv.search_to_feature_key('lastfor')] * text.count(last.lower() + '4')
    special_keys += [conv.search_to_feature_key('votelast')] * text.count('vote' + last.lower())
    special_keys += [conv.search_to_feature_key('forstate')] * text.count('for ' + state.lower())
    # "(re)elect <name>" variants.
    special_keys += [conv.search_to_feature_key('reelectlast')] * text.count('reelect ' + name.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')] * text.count('reelect ' + last.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')] * text.count('re elect ' + name.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')] * text.count('re elect ' + last.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')] * text.count('re-elect ' + name.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')] * text.count('re-elect ' + last.lower())
    special_keys += [conv.search_to_feature_key('electlast')] * text.count('elect ' + name.lower())
    special_keys += [conv.search_to_feature_key('electlast')] * text.count('elect ' + last.lower())
    # "vote (for) <last>" variants.
    special_keys += [conv.search_to_feature_key('votelast')] * text.count('vote ' + last.lower())
    special_keys += [conv.search_to_feature_key('voteforlast')] * text.count('vote for ' + last.lower())
    special_keys += [conv.search_to_feature_key('voteforlast')] * text.count('votefor' + last.lower())
    special_keys += [conv.search_to_feature_key('voteforlast')] * text.count('vote4' + last.lower())
    # Count "<last> ... public figure/politician" within 50 characters.
    try:
        special_keys += [conv.search_to_feature_key('politicianpublicfigure')] * len(
            re.findall(
                r'{last}.{{1,50}}(?:public figure|politician)'.format(
                    last=re.escape(last.encode('utf-8'))), text))
    except:
        import pdb
        pdb.set_trace()
    # Strip the name tokens before counting district/office/state mentions.
    text = text.replace(name.lower(), '')
    text = text.replace(last.lower(), '')
    text = text.replace(first.lower(), '')
    special_keys += [conv.search_to_feature_key('electoral_district_type')] * sum(
        text.count(edt.lower())
        for edt in conv.district_type_dict[electoral_district_type])
    special_keys += [conv.search_to_feature_key('officename')] * sum(
        text.count(on.lower()) for on in conv.office_names)
    special_keys += [conv.search_to_feature_key('electoral_district_name')] * text.count(
        electoral_district_name.lower())
    special_keys += [conv.search_to_feature_key('state')] * text.count(state.lower())
    # Facebook page metadata, if we have it for this link.
    if fb_page_data.has_key(conv.strip_and_std(link)):
        fb_page_dict = fb_page_data[conv.strip_and_std(link)]
        special_keys.append(conv.search_to_feature_key('fbdata'))
        fans = int(math.log(int(fb_page_dict['fans'])))
        special_keys += [conv.search_to_feature_key('fbdata')] * fans
        if fb_page_dict['authentic'] == 'Authentic':
            special_keys.append(conv.search_to_feature_key('fbauthentic'))
    name_key = conv.search_to_feature_key('name')
    last_key = conv.search_to_feature_key('last')
    first_key = conv.search_to_feature_key('first')
    #print 'name keys ', special_keys.count(name_key), 'last keys ', special_keys.count(last_key), 'first keys ', special_keys.count(first_key)
    return basic_analyze(text) + special_keys
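# Illustrative sketch only: the shape of the serialized record that analyze()
# expects, inferred from the keys it reads above. All field values here are made-up
# placeholders (e.g. the district type must be a key of conv.district_type_dict),
# and example_record is not used elsewhere in this module.
example_record = repr({
    'name': 'Jane Q. Candidate',
    'electoral_district_type': 'state_senate',
    'electoral_district_name': '12',
    'state': 'CA',
    'link': 'http://www.janeforsenate.example.com',
    'sitetext': 'Vote for Jane Q. Candidate. Re-elect Candidate for California.',
})
# analyze(example_record) would then emit one feature key per matched pattern,
# assuming the module globals (conv, vocabulary, basic_analyze, fb_page_data) are loaded.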
"fb/Gubernatorial Races.csv" ) as gr, open("fb/Presidential Race.csv") as pr, open("fb/fbcands.csv") as fc, open("fb/morefbcands.csv", "w") as mfc: csvhr = csv.DictReader(hr) csvsr = csv.DictReader(sr) csvgr = csv.DictReader(gr) csvpr = csv.DictReader(pr) csvfc = csv.DictReader(fc) csvmfc = csv.DictWriter(mfc, csvfc.fieldnames) hrdict = {} for l in csvhr: hrdict.update( { ( re.match(r"(?P<state>\w{2})-(?P<number>\d+)", l["DISTRICT"]).groupdict()["state"], int(re.match(r"(?P<state>\w{2})-(?P<number>\d+)", l["DISTRICT"]).groupdict()["number"]), conversions.clean_name(l["CANDIDATE"]), ): l["URL"].replace("?ref=ts", "") } ) srdict = dict( ((l["STATE"], conversions.clean_name(l["CANDIDATE"])), l["URL"].replace("?ref=ts", "")) for l in csvsr ) grdict = dict( ((l["STATE"], conversions.clean_name(l["CANDIDATE"])), l["URL"].replace("?ref=ts", "")) for l in csvgr ) csvmfc.writeheader() for l in csvfc: try: hrkey = (l["state"], int(l["electoral_district_name"]), conversions.clean_name(l["name"])) except: hrkey = (l["state"], l["electoral_district_name"], conversions.clean_name(l["name"]))
def getlinks(candidate, webpage, state, district_type, district_name):
    """ Gets all the facebook links found via the Google Search API """
    # ### Clean up input variables
    # District
    district_type = district_type.replace('_', ' ').strip()
    district_type = '+'.join(district_type.split(' '))
    district_name = '+'.join(district_name.strip().split(' '))
    # State
    state = state_map[state.strip()]
    state = '+'.join(state.split(' '))
    # Candidate name
    candidate, last, first = conversions.clean_name(candidate)
    candidate = '+'.join(candidate.split(' '))
    #print 'CANDIDATE: {}'.format(candidate)

    # Set up search URLs
    search_urls = []
    extra_children_searches = []
    precise_searches = []
    # Common values
    url = "https://www.googleapis.com/customsearch/v1"
    cx = "011743744063680272768:cp4-iesopjm"
    key = "AIzaSyCdHlGJuMzGBH9hNsEMObffDIkzJ44EQhA"
    search_urls.append(
        u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}'.format(
            url=url, cx=cx, key=key, name=candidate, state=state))
    # Just searches for general about pages
    extra_children_searches.append(
        u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}+info'.format(
            url=url, cx=cx, key=key, name=candidate, state=state))
    # sk=info specifies Facebook's about page
    extra_children_searches.append(
        u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}+sk=info'.format(
            url=url, cx=cx, key=key, name=candidate, state=state))
    precise_searches.append(
        u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}+campaign'.format(
            url=url, cx=cx, key=key, name=candidate, state=state))
    precise_searches.append(
        u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}+elect'.format(
            url=url, cx=cx, key=key, name=candidate, state=state))

    # Clean up encoding of the URLs
    search_urls = [
        s.encode(chardet.detect(s.encode('utf-8'))['encoding'])
        for s in search_urls
    ]
    extra_children_searches = [
        s.encode(chardet.detect(s.encode('utf-8'))['encoding'])
        for s in extra_children_searches
    ]
    #print 'SEARCH_URLS: {}'.format(search_urls)
    precise_searches = [
        s.encode(chardet.detect(s.encode('utf-8'))['encoding'])
        for s in precise_searches
    ]

    # This must be a test for a dummy website used for testing.
    # get_redirect simply gets the final page that returns a 200.
    old_webpage = webpage
    if webpage != 'www.gernensamples.com':
        webpage = conversions.get_redirect(webpage)
    #print 'WEBPAGES: {}'.format(webpage)
    has_webpage = True
    # raise Exception  # why do we need this exception??
    # print 'ok?'

    # Clean up web pages by removing protocol, subdomain, and trailing '/'
    if has_webpage:
        #print has_webpage
        webpage_stripped = re.match(
            r'(?:https?://)?(?:www\.)?(?P<content>.+)',
            webpage).groupdict()['content'].rstrip('/')
        old_webpage_stripped = re.match(
            r'(?:https?://)?(?:www\.)?(?P<content>.+)',
            old_webpage).groupdict()['content'].rstrip('/')
        # TODO strip queries
        webpage_no_queries = ul.urlparse.urlparse(webpage)
        webpage_no_queries = re.match(
            r'(?:www\.)?(?P<content>.+)',
            webpage_no_queries.netloc +
            webpage_no_queries.path).groupdict()['content'].rstrip('/')
        old_webpage_no_queries = ul.urlparse.urlparse(old_webpage)
        #print 'NO:{}'.format(old_webpage_no_queries)
        if old_webpage_no_queries is not None:
            old_webpage_no_queries = re.match(
                r'(?:www\.)?(?P<content>.+)',
                old_webpage_no_queries.netloc +
                old_webpage_no_queries.path).groupdict()['content'].rstrip('/')
        patt = re.compile(r'^https?://(?:www.)?{webpage}/?$'.format(
            webpage=webpage_stripped.lower()))
        old_patt = re.compile(r'^https?://(?:www.)?{webpage}/?$'.format(
            webpage=old_webpage_stripped.lower()))
        child_patt = re.compile(r'^https?://(?:www\.)?{webpage}.+'.format(
            webpage=webpage_no_queries.lower()))
        old_child_patt = re.compile(r'^https?://(?:www\.)?{webpage}.+'.format(
            webpage=old_webpage_no_queries.lower()))

    print 'starting'
    # Run each batch of searches, backing off exponentially on 403/503 errors.
    n = 4
    while True:
        results = map(lambda x: json.loads(requests.get(x).text), search_urls)
        #for r in results:
        #    print 'error' in r
        if any(map(
                lambda r: ('error' in r and
                           (r['error']['code'] == 403 or
                            r['error']['code'] == 503)),
                results)):
            print 'sleeping'
            time.sleep(n + random.randint(1, 1000) / 1000.)
            n = n * 2
        elif any(map(lambda r: 'error' in r, results)):
            raise Exception(', '.join(
                map(lambda r: r['error']['message'],
                    filter(lambda r: 'error' in r, results))))
        else:
            break
    n = 4
    while True:
        child_results = map(lambda x: json.loads(requests.get(x).text),
                            extra_children_searches)
        if any(map(
                lambda r: 'error' in r and
                (r['error']['code'] == 403 or r['error']['code'] == 503),
                child_results)):
            print 'sleeping'
            time.sleep(n + random.randint(1, 1000) / 1000.)
            n = n * 2
        elif any(map(lambda r: 'error' in r, child_results)):
            raise Exception(', '.join(
                map(lambda r: r['error']['message'],
                    filter(lambda r: 'error' in r, child_results))))
        else:
            break
    n = 4
    while True:
        precise_results = map(lambda x: json.loads(requests.get(x).text),
                              precise_searches)
        if any(map(
                lambda r: 'error' in r and
                (r['error']['code'] == 403 or r['error']['code'] == 503),
                precise_results)):
            print 'sleeping'
            time.sleep(n + random.randint(1, 1000) / 1000.)
            n = n * 2
        elif any(map(lambda r: 'error' in r, precise_results)):
            raise Exception(', '.join(
                map(lambda r: r['error']['message'],
                    filter(lambda r: 'error' in r, precise_results))))
        else:
            break

    if type(results) != list:
        results = [results]
    # Get results from the "items" key and store them in the results variable
    real_results = [(r if 'items' in r else {'items': []}) for r in results]
    results = real_results
    # print 'RESULTS:{}'.format(results)

    # Get the result URLs and extract searchable text from the pagemap
    search_links = [[i['link'].lower() for i in r['items']] for r in results]
    search_text = [
        [u'{title} {link} {pagemap} {snippet}'.format(
            **convert_pagemap_dict(i)).lower().encode('utf-8')
         for i in r['items']]
        for r in results
    ]
    # first loop may be unnecessary
    for ri in range(len(search_links)):  # for 1 to number of result objects
        for si in range(len(search_links[ri])):  # for 1 to number of links
            # For each "precise result" (name+state+'elect'), see if the link
            # is equivalent to, or a sub page of, the main results (name+state)
            for r in precise_results:
                if 'items' in r:
                    for i in r['items']:
                        if conversions.child_or_equal_page(
                                search_links[ri][si], i['link'].lower(), True):
                            search_text[ri][si] += ' bipspecialappearsinprecise'  # noqa

    # Same extraction for the child ("about page") searches
    child_links = [
        i['link'].lower()
        for r in child_results if 'items' in r for i in r['items']
    ]
    child_text = [
        u'{title} {link} {pagemap} {snippet}'.format(
            **convert_pagemap_dict(i)).lower().encode('utf-8')
        for r in child_results if 'items' in r for i in r['items']
    ]

    # Classify each search link based on its relationship to the provided
    # web page: PARENT, CHILD, TRUE (identity), or FALSE (no match)
    search_class = [
        map(lambda s: conversions.page_relation(s, True, webpage, old_webpage),
            sl) for sl in search_links
    ]

    # TODO Clean up ssv code
    # Matches each search link result against the webpage domain
    ssv = [
        any(map(patt.match, sl)) or any(map(old_patt.match, sl))
        for sl in search_links
    ]
    non_websites = [
        [i['link'] for i in r['items'] if webpage not in i['link']]
        for r in results
    ]
    cs, ct, cc = zip(*[
        combine_children(search_links[i], search_text[i], search_class[i],
                         child_links, child_text)
        for i in range(len(search_links))
    ])
    print 'got there', len(results[0]['items'])
    return (non_websites, ssv, webpage_stripped, search_links, search_text,
            [r['items'] for r in results], search_class, cs, ct, cc,
            child_links, child_text)
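# Possible refactor, sketched here only and not used by the functions above: the
# three retry loops in getlinks() share the same exponential-backoff shape, so they
# could be collapsed into one helper. The name search_with_backoff is hypothetical.
def search_with_backoff(urls):
    # Fetch every search URL; if the API returns a 403/503 error object, sleep and
    # double the delay, otherwise re-raise any remaining API error messages.
    n = 4
    while True:
        responses = map(lambda x: json.loads(requests.get(x).text), urls)
        if any('error' in r and r['error']['code'] in (403, 503) for r in responses):
            print 'sleeping'
            time.sleep(n + random.randint(1, 1000) / 1000.)
            n = n * 2
        elif any('error' in r for r in responses):
            raise Exception(', '.join(
                r['error']['message'] for r in responses if 'error' in r))
        else:
            return responses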
def getlinks(candidate, webpage, state, district_type, district_name):
    # District
    district_type = district_type.replace('_', ' ').strip()
    district_type = '+'.join(district_type.split(' '))
    district_name = '+'.join(district_name.strip().split(' '))
    # State
    state = state_map[state.strip()]
    state = '+'.join(state.split(' '))
    # Candidate Name
    candidate, last, first = conversions.clean_name(candidate)
    candidate = '+'.join(candidate.split(' '))
    #print candidate

    # Search URLs
    search_urls = []
    precise_searches = []
    url = 'https://www.googleapis.com/customsearch/v1'
    cx = '011743744063680272768:xcugk1a_1t0'
    #cx = '009761440872559920339:eqjjlrdgzma'
    key = 'AIzaSyCdHlGJuMzGBH9hNsEMObffDIkzJ44EQhA&hl'
    # Create search URLs
    search_urls.append(
        u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}'.format(
            url=url, cx=cx, key=key, name=candidate, state=state))
    precise_searches.append(
        u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}+campaign'.format(
            url=url, cx=cx, key=key, name=candidate, state=state))
    precise_searches.append(
        u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}+elect'.format(
            url=url, cx=cx, key=key, name=candidate, state=state))

    # URL encoding cleanup
    search_urls = [
        s.encode(chardet.detect(s.encode('utf-8'))['encoding'])
        for s in search_urls
    ]
    precise_searches = [
        s.encode(chardet.detect(s.encode('utf-8'))['encoding'])
        for s in precise_searches
    ]

    # ?? Some sort of test?
    webpage = conversions.twitter_handle_to_web(webpage)
    print webpage
    old_webpage = webpage
    if webpage != 'www.gernensamples.com':
        webpage = conversions.get_redirect(webpage)
    # if webpage == '404' or webpage == 'ERROR':
    #     raise Exception
    #print search_urls
    #print precise_searches

    webpage_stripped = re.match(
        r'(?:https?://)?(?:www\.)?(?P<content>.+)',
        webpage).groupdict()['content'].rstrip('/')
    old_webpage_stripped = re.match(
        r'(?:https?://)?(?:www\.)?(?P<content>.+)',
        old_webpage).groupdict()['content'].rstrip('/')
    # TODO strip queries
    webpage_no_queries = ul.urlparse.urlparse(webpage)
    webpage_no_queries = re.match(
        r'(?:www\.)?(?P<content>.+)',
        webpage_no_queries.netloc +
        webpage_no_queries.path).groupdict()['content'].rstrip('/')
    old_webpage_no_queries = ul.urlparse.urlparse(old_webpage)
    old_webpage_no_queries = re.match(
        r'(?:www\.)?(?P<content>.+)',
        old_webpage_no_queries.netloc +
        old_webpage_no_queries.path).groupdict()['content'].rstrip('/')
    patt = re.compile(r'^https?://(?:www.)?{webpage}/?$'.format(
        webpage=webpage_stripped.lower()))
    old_patt = re.compile(r'^https?://(?:www.)?{webpage}/?$'.format(
        webpage=old_webpage_stripped.lower()))

    print 'searching'
    # Timeout work: back off exponentially on 403/503 errors
    n = 4
    while True:
        results = map(lambda x: json.loads(requests.get(x).text), search_urls)
        if any(map(
                lambda r: 'error' in r and
                (r['error']['code'] == 403 or r['error']['code'] == 503),
                results)):
            print 'sleeping'
            time.sleep(n + random.randint(1, 1000) / 1000.)
            n = n * 2
        elif any(map(lambda r: 'error' in r, results)):
            raise Exception(', '.join(
                map(lambda r: r['error']['message'],
                    filter(lambda r: 'error' in r, results))))
        else:
            break
    n = 4
    while True:
        precise_results = map(lambda x: json.loads(requests.get(x).text),
                              precise_searches)
        if any(map(
                lambda r: 'error' in r and
                (r['error']['code'] == 403 or r['error']['code'] == 503),
                precise_results)):
            print 'sleeping'
            time.sleep(n + random.randint(1, 1000) / 1000.)
            n = n * 2
        elif any(map(lambda r: 'error' in r, precise_results)):
            raise Exception(', '.join(
                map(lambda r: r['error']['message'],
                    filter(lambda r: 'error' in r, precise_results))))
        else:
            break
    print 'done searching'

    if type(results) != list:
        results = [results]
    # Get results
    real_results = [(r if 'items' in r else {'items': []}) for r in results]
    results = real_results

    search_links = [
        [conversions.clean_twitter(i['link'].lower()) for i in r['items']]
        for r in results
    ]
    search_text = [
        [u'{title} {link} {pagemap} {snippet}'.format(
            **convert_pagemap_dict(i)).lower().encode('utf-8')
         for i in r['items']]
        for r in results
    ]
    # Tag links that also appear in the precise (campaign/elect) searches
    for ri in range(len(search_links)):
        for si in range(len(search_links[ri])):
            for r in precise_results:
                if 'items' in r:
                    for i in r['items']:
                        if conversions.child_or_equal_page(
                                search_links[ri][si],
                                conversions.clean_twitter(i['link'].lower()),
                                True):
                            search_text[ri][si] += ' bipspecialappearsinprecise'

    child_links = []
    child_text = []
    search_class = [
        map(lambda s: conversions.page_relation(s, False, webpage, old_webpage),
            sl) for sl in search_links
    ]
    # TODO Clean up ssv code
    ssv = [
        any(map(patt.match, sl)) or any(map(old_patt.match, sl))
        for sl in search_links
    ]
    non_websites = [
        [i['link'] for i in r['items'] if webpage not in i['link']]
        for r in results
    ]
    cs, ct, cc = zip(*[
        combine_children(search_links[i], search_text[i], search_class[i],
                         child_links, child_text)
        for i in range(len(search_links))
    ])
    print 'got there', len(results[0]['items'])
    return (non_websites, ssv, webpage_stripped, search_links, search_text,
            [r['items'] for r in results], search_class, cs, ct, cc,
            child_links, child_text)
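# Note (explanatory only): the backoff above starts at roughly 4 seconds plus a
# random fraction of a second and doubles n after every sleeping round, so repeated
# 403/503 responses lead to waits of about 4s, 8s, 16s, and so on.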
def getlinks(candidate, webpage, state, district_type, district_name):
    district_type = district_type.replace('_', ' ').strip()
    state = state_map[state.strip()]
    candidate, last, first = conversions.clean_name(candidate)
    candidate = '+'.join(candidate.split(' '))
    print candidate
    state = '+'.join(state.split(' '))
    district_type = '+'.join(district_type.split(' '))
    district_name = '+'.join(district_name.strip().split(' '))

    # Build the Custom Search queries: general (name+state), "about page"
    # children (info / sk=info), and precise (campaign / elect) searches.
    search_urls = []
    extra_children_searches = []
    precise_searches = []
    search_urls.append(
        u'https://www.googleapis.com/customsearch/v1?cx=011743744063680272768:cp4-iesopjm&key=AIzaSyCdHlGJuMzGBH9hNsEMObffDIkzJ44EQhA&hl=en&q={name}+{state}'
        .format(name=candidate, state=state))
    extra_children_searches.append(
        u'https://www.googleapis.com/customsearch/v1?cx=011743744063680272768:cp4-iesopjm&key=AIzaSyCdHlGJuMzGBH9hNsEMObffDIkzJ44EQhA&hl=en&q={name}+{state}+info'
        .format(name=candidate, state=state))
    extra_children_searches.append(
        u'https://www.googleapis.com/customsearch/v1?cx=011743744063680272768:cp4-iesopjm&key=AIzaSyCdHlGJuMzGBH9hNsEMObffDIkzJ44EQhA&hl=en&q={name}+{state}+sk=info'
        .format(name=candidate, state=state))
    precise_searches.append(
        u'https://www.googleapis.com/customsearch/v1?cx=011743744063680272768:cp4-iesopjm&key=AIzaSyCdHlGJuMzGBH9hNsEMObffDIkzJ44EQhA&hl=en&q={name}+{state}+campaign'
        .format(name=candidate, state=state))
    precise_searches.append(
        u'https://www.googleapis.com/customsearch/v1?cx=011743744063680272768:cp4-iesopjm&key=AIzaSyCdHlGJuMzGBH9hNsEMObffDIkzJ44EQhA&hl=en&q={name}+{state}+elect'
        .format(name=candidate, state=state))
    search_urls = [
        s.encode(chardet.detect(s.encode('utf-8'))['encoding'])
        for s in search_urls
    ]
    extra_children_searches = [
        s.encode(chardet.detect(s.encode('utf-8'))['encoding'])
        for s in extra_children_searches
    ]
    precise_searches = [
        s.encode(chardet.detect(s.encode('utf-8'))['encoding'])
        for s in precise_searches
    ]

    old_webpage = webpage
    if webpage != 'www.gernensamples.com':
        webpage = conversions.get_redirect(webpage)
    #if webpage == '404' or webpage == 'ERROR':
    #    raise Exception
    websites = []
    webpage_stripped = re.match(r'(?:https?://)?(?:www\.)?(?P<content>.+)',
                                webpage).groupdict()['content'].rstrip('/')
    old_webpage_stripped = re.match(
        r'(?:https?://)?(?:www\.)?(?P<content>.+)',
        old_webpage).groupdict()['content'].rstrip('/')
    #TODO strip queries
    webpage_no_queries = ul.urlparse.urlparse(webpage)
    webpage_no_queries = re.match(
        r'(?:www\.)?(?P<content>.+)',
        webpage_no_queries.netloc +
        webpage_no_queries.path).groupdict()['content'].rstrip('/')
    old_webpage_no_queries = ul.urlparse.urlparse(old_webpage)
    old_webpage_no_queries = re.match(
        r'(?:www\.)?(?P<content>.+)',
        old_webpage_no_queries.netloc +
        old_webpage_no_queries.path).groupdict()['content'].rstrip('/')
    patt = re.compile(r'^https?://(?:www.)?{webpage}/?$'.format(
        webpage=webpage_stripped.lower()))
    old_patt = re.compile(r'^https?://(?:www.)?{webpage}/?$'.format(
        webpage=old_webpage_stripped.lower()))
    child_patt = re.compile(r'^https?://(?:www\.)?{webpage}.+'.format(
        webpage=webpage_no_queries.lower()))
    old_child_patt = re.compile(r'^https?://(?:www\.)?{webpage}.+'.format(
        webpage=old_webpage_no_queries.lower()))

    # Run each batch of searches, backing off exponentially on 403/503 errors.
    n = 4
    while True:
        results = map(lambda x: json.loads(requests.get(x).text), search_urls)
        if any(
                map(
                    lambda r: r.has_key('error') and
                    (r['error']['code'] == 403 or r['error']['code'] == 503),
                    results)):
            print 'sleeping'
            time.sleep(n + random.randint(1, 1000) / 1000.)
            n = n * 2
        elif any(map(lambda r: r.has_key('error'), results)):
            raise Exception(', '.join(
                map(lambda r: r['error']['message'],
                    filter(lambda r: r.has_key('error'), results))))
        else:
            break
    n = 4
    while True:
        child_results = map(lambda x: json.loads(requests.get(x).text),
                            extra_children_searches)
        if any(
                map(
                    lambda r: r.has_key('error') and
                    (r['error']['code'] == 403 or r['error']['code'] == 503),
                    child_results)):
            print 'sleeping'
            time.sleep(n + random.randint(1, 1000) / 1000.)
            n = n * 2
        elif any(map(lambda r: r.has_key('error'), child_results)):
            raise Exception(', '.join(
                map(lambda r: r['error']['message'],
                    filter(lambda r: r.has_key('error'), child_results))))
        else:
            break
    n = 4
    while True:
        precise_results = map(lambda x: json.loads(requests.get(x).text),
                              precise_searches)
        if any(
                map(
                    lambda r: r.has_key('error') and
                    (r['error']['code'] == 403 or r['error']['code'] == 503),
                    precise_results)):
            print 'sleeping'
            time.sleep(n + random.randint(1, 1000) / 1000.)
            n = n * 2
        elif any(map(lambda r: r.has_key('error'), precise_results)):
            raise Exception(', '.join(
                map(lambda r: r['error']['message'],
                    filter(lambda r: r.has_key('error'), precise_results))))
        else:
            break

    if type(results) != list:
        print type(results)
        results = [results]
    real_results = [(r if r.has_key('items') else {'items': []})
                    for r in results]
    results = real_results
    search_links = [[i['link'].lower() for i in r['items']] for r in results]
    search_text = [[
        u'{title} {link} {pagemap} {snippet}'.format(
            **convert_pagemap_dict(i)).lower().encode('utf-8')
        for i in r['items']
    ] for r in results]
    # Tag links that also appear in the precise (campaign/elect) searches.
    for ri in range(len(search_links)):
        for si in range(len(search_links[ri])):
            for r in precise_results:
                if r.has_key('items'):
                    for i in r['items']:
                        if conversions.child_or_equal_page(
                                search_links[ri][si], i['link'].lower(), True):
                            search_text[ri][si] += ' bipspecialappearsinprecise'
    child_links = [
        i['link'].lower() for r in child_results if r.has_key('items')
        for i in r['items']
    ]
    child_text = [
        u'{title} {link} {pagemap} {snippet}'.format(
            **convert_pagemap_dict(i)).lower().encode('utf-8')
        for r in child_results if r.has_key('items') for i in r['items']
    ]
    #search_text = [[u'{title} {link} {pagemap} {snippet}'.format(**i).lower().encode('utf-8') for i in r['items']] for r in results]
    search_class = [
        map(lambda s: conversions.page_relation(s, True, webpage, old_webpage),
            sl) for sl in search_links
    ]
    #search_class = [map(lambda s: 'True' if patt.match(s) != None or old_patt.match(s) != None else ('Child' if child_patt.match(s) != None or old_child_patt.match(s) != None else 'False'),sl) for sl in search_links]
    #print search_text
    #TODO Clean up ssv code
    ssv = [
        any(map(patt.match, sl)) or any(map(old_patt.match, sl))
        for sl in search_links
    ]
    non_websites = [[
        i['link'] for i in r['items'] if webpage not in i['link']
    ] for r in results]
    cs, ct, cc = zip(*[
        combine_children(search_links[i], search_text[i], search_class[i],
                         child_links, child_text)
        for i in range(len(search_links))
    ])
    print 'got there', len(results[0]['items'])
    return non_websites, ssv, webpage_stripped, search_links, search_text, [
        r['items'] for r in results
    ], search_class, cs, ct, cc, child_links, child_text
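# Illustrative note with a made-up URL: if webpage_stripped were 'example.com', patt
# would match 'http://www.example.com/' but not 'http://www.example.com/about', so
# ssv is True only when some search result is the candidate page itself. child_patt
# and old_child_patt would also match sub pages such as 'http://example.com/about',
# but in this version they are referenced only by the commented-out search_class
# alternative above.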